[dev] [commit] r1027 - app log patches phplib

automailer at dexonline.ro automailer at dexonline.ro
Mon Oct 21 13:24:29 EEST 2013


Author: cata
Date: Mon Oct 21 13:24:29 2013
New Revision: 1027

Log:
Some crawler changes
* moved settings into dex.conf.sample
* replaced user_agent_location with just user_agent. It now contains the actual user agent (we can do that because dex.conf.sample is public, but dex.conf is not).
* simplified the URL white list. It is now crawler.whiteList in dex.conf.sample. Removed all the code that handled multi-line comments.
* removed references to new_line. I hope that's ok. We can always display the log in a browser window with <pre> if needed.
* added MySQL tables. I hope I got all the fields and indexes right. I couldn't find the original schema.

Added:
   patches/00102.sql
Deleted:
   app/WhiteList.txt
   app/app_dex.conf
Modified:
   app/AbstractCrawler.php
   app/Crawler.php
   app/clean.php
   dex.conf.sample
   log/   (props changed)
   phplib/AppLog.php

Modified: app/AbstractCrawler.php
==============================================================================
--- app/AbstractCrawler.php	Mon Oct 21 12:30:37 2013	(r1026)
+++ app/AbstractCrawler.php	Mon Oct 21 13:24:29 2013	(r1027)
@@ -47,10 +47,10 @@
 	function getPage($url) {
 
 		$this->ch = curl_init();
-		Applog::log(file_get_contents(Config::get('crawler.user_agent_location')));
+		Applog::log("User agent is: " . Config::get('crawler.user_agent'));
 		curl_setopt ($this->ch, CURLOPT_URL, $url);
 		curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
-		curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(Config::get('crawler.user_agent_location')));
+		curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
 		curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
 		curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
 		curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);

Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php	Mon Oct 21 12:30:37 2013	(r1026)
+++ app/Crawler.php	Mon Oct 21 13:24:29 2013	(r1027)
@@ -3,7 +3,7 @@
  * Alin Ungureanu, 2013
  * alyn.cti at gmail.com
  */
-require_once dirname(__FILE__) . '/AbstractCrawler.php';
+require_once __DIR__ . '/AbstractCrawler.php';
 
 class Crawler extends AbstractCrawler {
 
@@ -99,7 +99,7 @@
 	
 		Applog::log("Crawler started");
 
-		$this->domainsList = explode(PHP_EOL, file_get_contents("WhiteList.txt"));
+		$this->domainsList = Config::get('crawler.whiteList');
 
 		//start processing 
 		$this->processWhiteList();
@@ -109,26 +109,9 @@
 
 
 	function processWhiteList() {
-
-		$multipleLinesComment = false;
-
 		foreach($this->domainsList as $startUrl) {
-			
 			$startUrl = trim($startUrl);
 
-			//comentarii pe mai multe linii
-
-			if (substr($startUrl, 0, 3) == '###') {
-				//flip bit
-				$multipleLinesComment ^= 1;
-			}
-			
-			//comentarii sau linii goale
-			if ($multipleLinesComment || substr($startUrl,0,1) == '#'
-				|| !$startUrl)
-				continue;
-
-
 			//curatam url-ul
 			$this->currentUrl = $this->urlPadding($startUrl);
 			//impartim url-ul pe componente

Deleted: app/WhiteList.txt
==============================================================================
--- app/WhiteList.txt	Mon Oct 21 13:24:29 2013	(r1026)
+++ /dev/null	00:00:00 1970	(deleted)
@@ -1,18 +0,0 @@
-# câte un domeniu pe linie
-# comentarii pe o singură linie cu #
-
-
-###
-
-pe mai multe linii încadrați între ### și ###,
-dar ### de închidere să fie singur pe linie
-pentru că altfel ce urmează după aceasta pe
-aceeași linie va fi ignorat
-
-###
-
-
-http://wiki.dexonline.ro/
-#http://www.romlit.ro/
-#http://ro.wikipedia.org/
-

Deleted: app/app_dex.conf
==============================================================================
--- app/app_dex.conf	Mon Oct 21 13:24:29 2013	(r1026)
+++ /dev/null	00:00:00 1970	(deleted)
@@ -1,65 +0,0 @@
-[crawler]
-
-
-;the as downloaded html page location
-raw_page_path=RawPage/
-;the pure text page associated
-;with the html one location
-parsed_text_path=ParsedText/
-
-;waiting time between getting pages
-t_wait=15
-
-;if this is true, then the application will
-;exit if an exception occured
-exception_exit=true
-
-;the most probable directory index file
-dir_index_file=index
-;the most probable index extensions
-index_file_ext=html,php,aspx,asp,pl,py,jsp
-
-;this should be stored somewhere not on public directories
-user_agent_location=/var/www/CrawlerData/user_agent
-
-
-[diacritics]
-
-diacritics_buffer_limit=2048
-
-;diactritics list
-diacritics=ăâîșț
-
-;if word finished, this char
-;will be used in padding until
-;the reach of the padding number
-padding_char=*
-
-;non-diacritics list
-non_lower_diacritics=aist
-non_upper_diacritics=AIST
-
-;left and right length
-;e.g. for cireșarii cire-s-arii
-;the padding_length is 5
-diacritics_padding_length=5
-
-
-
-[app_log]
-
-log_detail_level=2
-
-;crawler log file path
-crawler_log=crawler_log
-
-;diacritics log file path
-diacritics_log=diacritics_log
-
-;outputs messages to the screen
-;values are true and false
-log2screen=true
-
-;outputs messages to the file
-;specified bycrawler_log
-log2file=true
\ No newline at end of file

Modified: app/clean.php
==============================================================================
--- app/clean.php	Mon Oct 21 12:30:37 2013	(r1026)
+++ app/clean.php	Mon Oct 21 13:24:29 2013	(r1027)
@@ -3,10 +3,10 @@
  * Alin Ungureanu, 2013
  * alyn.cti at gmail.com
  */
-require_once '../../phplib/util.php';
+require_once __DIR__ . '/../phplib/util.php';
 
 function printUsage() {
-	echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [ -d | --diacritics]".PHP_EOL;
+	echo "::Usage::" . PHP_EOL . "php clean.php [ -c | --crawler] [ -d | --diacritics]" . PHP_EOL;
 	flush();
 	exit();
 }
@@ -30,25 +30,25 @@
 
 	try {
 
-		//sterge toate fisierele salvate
+		// șterge toate fișierele salvate
 		removeFiles('ParsedText/*');
 		removeFiles('RawPage/*');
 
 
-		echo 'files deleted' . Config::get('crawler.new_line');
+		echo "files deleted\n";
 
-	    $db->exec('TRUNCATE Table CrawledPage;');
-	    $db->exec('TRUNCATE Table Link;');
-	    $db->commit();
+    $db->exec('TRUNCATE Table CrawledPage;');
+    $db->exec('TRUNCATE Table Link;');
+    $db->commit();
 
-		echo "tables 'Link' and 'CrawledPage' truncated" . Config::get('crawler.new_line');
+    echo "tables 'Link' and 'CrawledPage' truncated\n";
 
-		echo 'The cleaning process was successful' . Config::get('crawler.new_line');
+		echo "The cleaning process was successful\n";
 	}
 
 	catch(Exception $ex) {
 
-		echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
+		echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
 	}
 }
 else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
@@ -56,13 +56,13 @@
 	try{
 		$db->exec('TRUNCATE Table Diacritics;');
 		$db->exec('TRUNCATE Table FilesUsedInDiacritics;');
-	    $db->commit();
-		echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated" . Config::get('crawler.new_line');
-		echo 'The cleaning process was successful' . Config::get('crawler.new_line');
+    $db->commit();
+    echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated\n";
+		echo "The cleaning process was successful\n";
 	}
-	catch(Exception $e) {
+	catch(Exception $ex) {
 
-		echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
+		echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
 	}
 
 }

Modified: dex.conf.sample
==============================================================================
--- dex.conf.sample	Mon Oct 21 12:30:37 2013	(r1026)
+++ dex.conf.sample	Mon Oct 21 13:24:29 2013	(r1027)
@@ -1,4 +1,5 @@
 ; Configuration file for a DEX online installation
+; Some entire sections are optional. Please refer to each section's comments.
 
 [global]
 ; Modules that are allowed to run and possibly serve a banner instead of the bannerType choice below.
@@ -69,12 +70,14 @@
 skins[] = zepu
 skins[] = polar
 
+; This section handles donations through a third party.
 [euplatesc]
 ; Key for euplatesc.ro or 0 to disable the donation box.
 euPlatescKey = 0
 ; Merchant ID for euplatesc.ro or 0 to disable the donation box.
 euPlatescMid = 0
 
+; Functional testing. We don't do much of it as of 2013.
 [functest]
 ; URL that the functional test engine will exercise
 ; This is the URL you normally go to in development
@@ -91,6 +94,7 @@
 ; Comment this out in production.
 functestLockFile = /tmp/dex-functest
 
+; Skin-specific variables. Necessary if you plan to display banners.
 [skin-zepu]
 ; Display banners after the search box.
 adsense_mainPage = "id=1220723485&width=728&height=90"
@@ -108,3 +112,66 @@
 
 [limits]
 limitFulltextSearch = 1000
+
+; Configuration for the Romanian literature crawler
+[crawler]
+
+; Location of the raw HTML pages, as downloaded
+raw_page_path = RawPage/
+
+; Location of the plain-text pages associated with the HTML ones
+parsed_text_path = ParsedText/
+
+; Waiting time between getting pages (per-domain limit)
+t_wait = 15
+
+; If true, then the application will exit if an exception occurs
+exception_exit = true
+
+; The most probable directory index file
+dir_index_file = index
+
+; The most probable index extensions
+index_file_ext = html,php,aspx,asp,pl,py,jsp
+
+; Crawler signature
+user_agent = "DEX online crawler v1.0"
+
+; Pages to crawl
+; whiteList[] = http://example.com/
+; whiteList[] = http://example.com/
+
+; Configuration for the app that adds Romanian diacritics to a text that doesn't have them
+[diacritics]
+
+diacritics_buffer_limit = 2048
+
+; Diacritics list
+diacritics = ăâîșț
+
+; Padding char beyond the end of text
+padding_char = *
+
+; Non-diacritics list
+non_lower_diacritics = aist
+non_upper_diacritics = AIST
+
+; Left and right length; e.g. for cireșarii cire-s-arii the padding_length is 5
+diacritics_padding_length = 5
+
+; Logging settings for the crawler and any apps built on top of it
+[app_log]
+
+log_detail_level = 2
+
+; Crawler log file path, relative to the root installation directory
+crawler_log = log/crawler_log
+
+; Diacritics log file path, relative to the root installation directory
+diacritics_log = log/diacritics_log
+
+; When set, outputs messages to the screen
+log2screen = true
+
+; When set, outputs messages to the file specified by crawler_log
+log2file = true

Added: patches/00102.sql
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ patches/00102.sql	Mon Oct 21 13:24:29 2013	(r1027)
@@ -0,0 +1,22 @@
+create table if not exists CrawledPage (
+  id int not null auto_increment,
+  timestamp int not null,
+  url varchar(255) not null,
+  httpStatus int not null,
+  rawPagePath varchar(255) not null,
+  parsedTextPath varchar(255) not null,
+
+  primary key (id),
+  key(httpStatus)
+);
+
+create table if not exists Link(
+  id int not null auto_increment,
+  canonicalUrl varchar(255) not null,
+  domain varchar(255) not null,
+  crawledPageId int not null,
+
+  primary key(id),
+  key(domain),
+  key(crawledPageId)
+);

Modified: phplib/AppLog.php
==============================================================================
--- phplib/AppLog.php	Mon Oct 21 12:30:37 2013	(r1026)
+++ phplib/AppLog.php	Mon Oct 21 13:24:29 2013	(r1027)
@@ -71,7 +71,7 @@
 			fclose( $fd);
 		}
 		catch (Exception $ex) {
-			echo "LOG FILE PROBLEM" . Config::get('app_log.new_line');
+			echo "LOG FILE PROBLEM\n";
 		}
 		//log in stdout
 		if (Config::get('app_log.log2screen')) {


More information about the Dev mailing list