[dev] [commit] r1027 - app log patches phplib

Alin Ungureanu alyn.cti at gmail.com
Mon Oct 21 15:27:25 EEST 2013


Salut,

Am fost ocupat weekendul acesta, n-am apucat să verific partea de tabele 
și să fac o mică documentație. S-ar putea să verific tabelele în seara 
aceasta și să fac documentația vineri seara.

Alin

On 10/21/2013 1:24 PM, automailer at dexonline.ro wrote:
> Author: cata
> Date: Mon Oct 21 13:24:29 2013
> New Revision: 1027
>
> Log:
> Some crawler changes
> * moved settings into dex.conf.sample
> * replaced user_agent_location with just user_agent. It now contains the actual user agent (we can do that because dex.conf.sample is public, but dex.conf is not).
> * simplify the URL white list. It is now crawler.whiteList in dex.conf.sample. Removed all the multi-line comment handling code.
> * remove references to new_line. I hope that's ok. We can always display the log in a browser window with <pre> if needed.
> * add MySQL tables. I hope I got all the fields and indexes right. I couldn't find the original schema.
>
> Added:
>     patches/00102.sql
> Deleted:
>     app/WhiteList.txt
>     app/app_dex.conf
> Modified:
>     app/AbstractCrawler.php
>     app/Crawler.php
>     app/clean.php
>     dex.conf.sample
>     log/   (props changed)
>     phplib/AppLog.php
>
> Modified: app/AbstractCrawler.php
> ==============================================================================
> --- app/AbstractCrawler.php	Mon Oct 21 12:30:37 2013	(r1026)
> +++ app/AbstractCrawler.php	Mon Oct 21 13:24:29 2013	(r1027)
> @@ -47,10 +47,10 @@
>   	function getPage($url) {
>   
>   		$this->ch = curl_init();
> -		Applog::log(file_get_contents(Config::get('crawler.user_agent_location')));
> +		Applog::log("User agent is: " . Config::get('crawler.user_agent'));
>   		curl_setopt ($this->ch, CURLOPT_URL, $url);
>   		curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
> -		curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(Config::get('crawler.user_agent_location')));
> +		curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
>   		curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
>   		curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
>   		curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
>
> Modified: app/Crawler.php
> ==============================================================================
> --- app/Crawler.php	Mon Oct 21 12:30:37 2013	(r1026)
> +++ app/Crawler.php	Mon Oct 21 13:24:29 2013	(r1027)
> @@ -3,7 +3,7 @@
>    * Alin Ungureanu, 2013
>    * alyn.cti at gmail.com
>    */
> -require_once dirname(__FILE__) . '/AbstractCrawler.php';
> +require_once __DIR__ . '/AbstractCrawler.php';
>   
>   class Crawler extends AbstractCrawler {
>   
> @@ -99,7 +99,7 @@
>   	
>   		Applog::log("Crawler started");
>   
> -		$this->domainsList = explode(PHP_EOL, file_get_contents("WhiteList.txt"));
> +		$this->domainsList = Config::get('crawler.whiteList');
>   
>   		//start processing
>   		$this->processWhiteList();
> @@ -109,26 +109,9 @@
>   
>   
>   	function processWhiteList() {
> -
> -		$multipleLinesComment = false;
> -
>   		foreach($this->domainsList as $startUrl) {
> -			
>   			$startUrl = trim($startUrl);
>   
> -			//comentarii pe mai multe linii
> -
> -			if (substr($startUrl, 0, 3) == '###') {
> -				//flip bit
> -				$multipleLinesComment ^= 1;
> -			}
> -			
> -			//comentarii sau linii goale
> -			if ($multipleLinesComment || substr($startUrl,0,1) == '#'
> -				|| !$startUrl)
> -				continue;
> -
> -
>   			//curatam url-ul
>   			$this->currentUrl = $this->urlPadding($startUrl);
>   			//impartim url-ul pe componente
>
> Deleted: app/WhiteList.txt
> ==============================================================================
> --- app/WhiteList.txt	Mon Oct 21 13:24:29 2013	(r1026)
> +++ /dev/null	00:00:00 1970	(deleted)
> @@ -1,18 +0,0 @@
> -# câte un domeniu pe linie
> -# comentarii pe o singură linie cu #
> -
> -
> -###
> -
> -pe mai multe linii încadrați între ### și ###,
> -dar ### de închidere să fie singur pe linie
> -pentru că altfel ce urmează după aceasta pe
> -aceeași linie va fi ignorat
> -
> -###
> -
> -
> -http://wiki.dexonline.ro/
> -#http://www.romlit.ro/
> -#http://ro.wikipedia.org/
> -
>
> Deleted: app/app_dex.conf
> ==============================================================================
> --- app/app_dex.conf	Mon Oct 21 13:24:29 2013	(r1026)
> +++ /dev/null	00:00:00 1970	(deleted)
> @@ -1,65 +0,0 @@
> -[crawler]
> -
> -
> -;the as downloaded html page location
> -raw_page_path=RawPage/
> -;the pure text page associated
> -;with the html one location
> -parsed_text_path=ParsedText/
> -
> -;waiting time between getting pages
> -t_wait=15
> -
> -;if this is true, then the application will
> -;exit if an exception occured
> -exception_exit=true
> -
> -;the most probable directory index file
> -dir_index_file=index
> -;the most probable index extensions
> -index_file_ext=html,php,aspx,asp,pl,py,jsp
> -
> -;this should be stored somewhere not on public directories
> -user_agent_location=/var/www/CrawlerData/user_agent
> -
> -
> -[diacritics]
> -
> -diacritics_buffer_limit=2048
> -
> -;diactritics list
> -diacritics=ăâîșț
> -
> -;if word finished, this char
> -;will be used in padding until
> -;the reach of the padding number
> -padding_char=*
> -
> -;non-diacritics list
> -non_lower_diacritics=aist
> -non_upper_diacritics=AIST
> -
> -;left and right length
> -;e.g. for cireșarii cire-s-arii
> -;the padding_length is 5
> -diacritics_padding_length=5
> -
> -
> -
> -[app_log]
> -
> -log_detail_level=2
> -
> -;crawler log file path
> -crawler_log=crawler_log
> -
> -;diacritics log file path
> -diacritics_log=diacritics_log
> -
> -;outputs messages to the screen
> -;values are true and false
> -log2screen=true
> -
> -;outputs messages to the file
> -;specified bycrawler_log
> -log2file=true
> \ No newline at end of file
>
> Modified: app/clean.php
> ==============================================================================
> --- app/clean.php	Mon Oct 21 12:30:37 2013	(r1026)
> +++ app/clean.php	Mon Oct 21 13:24:29 2013	(r1027)
> @@ -3,10 +3,10 @@
>    * Alin Ungureanu, 2013
>    * alyn.cti at gmail.com
>    */
> -require_once '../../phplib/util.php';
> +require_once __DIR__ . '/../phplib/util.php';
>   
>   function printUsage() {
> -	echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [ -d | --diacritics]".PHP_EOL;
> +	echo "::Usage::" . PHP_EOL . "php clean.php [ -c | --crawler] [ -d | --diacritics]" . PHP_EOL;
>   	flush();
>   	exit();
>   }
> @@ -30,25 +30,25 @@
>   
>   	try {
>   
> -		//sterge toate fisierele salvate
> +		// șterge toate fișierele salvate
>   		removeFiles('ParsedText/*');
>   		removeFiles('RawPage/*');
>   
>   
> -		echo 'files deleted' . Config::get('crawler.new_line');
> +		echo "files deleted\n";
>   
> -	    $db->exec('TRUNCATE Table CrawledPage;');
> -	    $db->exec('TRUNCATE Table Link;');
> -	    $db->commit();
> +    $db->exec('TRUNCATE Table CrawledPage;');
> +    $db->exec('TRUNCATE Table Link;');
> +    $db->commit();
>   
> -		echo "tables 'Link' and 'CrawledPage' truncated" . Config::get('crawler.new_line');
> +    echo "tables 'Link' and 'CrawledPage' truncated\n";
>   
> -		echo 'The cleaning process was successful' . Config::get('crawler.new_line');
> +		echo "The cleaning process was successful\n";
>   	}
>   
>   	catch(Exception $ex) {
>   
> -		echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
> +		echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
>   	}
>   }
>   else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
> @@ -56,13 +56,13 @@
>   	try{
>   		$db->exec('TRUNCATE Table Diacritics;');
>   		$db->exec('TRUNCATE Table FilesUsedInDiacritics;');
> -	    $db->commit();
> -		echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated" . Config::get('crawler.new_line');
> -		echo 'The cleaning process was successful' . Config::get('crawler.new_line');
> +    $db->commit();
> +    echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated\n";
> +		echo "The cleaning process was successful\n";
>   	}
> -	catch(Exception $e) {
> +	catch(Exception $ex) {
>   
> -		echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
> +		echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
>   	}
>   
>   }
>
> Modified: dex.conf.sample
> ==============================================================================
> --- dex.conf.sample	Mon Oct 21 12:30:37 2013	(r1026)
> +++ dex.conf.sample	Mon Oct 21 13:24:29 2013	(r1027)
> @@ -1,4 +1,5 @@
>   ; Configuration file for a DEX online installation
> +; Some entire sections are optional. Please refer to each section's comments.
>   
>   [global]
>   ; Modules that are allowed to run and possibly serve a banner instead of the bannerType choice below.
> @@ -69,12 +70,14 @@
>   skins[] = zepu
>   skins[] = polar
>   
> +; This section handles donations through a third party.
>   [euplatesc]
>   ; Key for euplatesc.ro or 0 to disable the donation box.
>   euPlatescKey = 0
>   ; Merchant ID for euplatesc.ro or 0 to disable the donation box.
>   euPlatescMid = 0
>   
> +; Functional testing. We don't do much of it as of 2013.
>   [functest]
>   ; URL that the functional test engine will exercise
>   ; This is the URL you normally go to in development
> @@ -91,6 +94,7 @@
>   ; Comment this out in production.
>   functestLockFile = /tmp/dex-functest
>   
> +; Skin-specific variables. Necessary if you plan to display banners.
>   [skin-zepu]
>   ; Display banners after the search box.
>   adsense_mainPage = "id=1220723485&width=728&height=90"
> @@ -108,3 +112,66 @@
>   
>   [limits]
>   limitFulltextSearch = 1000
> +
> +; Configuration for the Romanian literature crawler
> +[crawler]
> +
> +; Location where HTML pages are stored as downloaded
> +raw_page_path = RawPage/
> +
> +; Location where the plain-text version of each HTML page is stored
> +parsed_text_path = ParsedText/
> +
> +; Waiting time between getting pages (per-domain limit)
> +t_wait = 15
> +
> +; If true, then the application will exit if an exception occurs
> +exception_exit = true
> +
> +; The most probable directory index file
> +dir_index_file = index
> +
> +; The most probable index extensions
> +index_file_ext = html,php,aspx,asp,pl,py,jsp
> +
> +; Crawler signature
> +user_agent = "DEX online crawler v1.0"
> +
> +; Pages to crawl
> +; whiteList[] = http://example.com/
> +; whiteList[] = http://example.com/
> +
> +; Configuration for the app that adds Romanian diacritics to a text that doesn't have them
> +[diacritics]
> +
> +diacritics_buffer_limit = 2048
> +
> +; Diacritics list
> +diacritics = ăâîșț
> +
> +; Padding char beyond the end of text
> +padding_char = *
> +
> +; Non-diacritics list
> +non_lower_diacritics = aist
> +non_upper_diacritics = AIST
> +
> +; Left and right length; e.g. for cireșarii cire-s-arii the padding_length is 5
> +diacritics_padding_length = 5
> +
> +; Logging settings for the crawler and any apps built on top of it
> +[app_log]
> +
> +log_detail_level = 2
> +
> +; Crawler log file path, relative to the root installation directory
> +crawler_log = log/crawler_log
> +
> +; Diacritics log file path, relative to the root installation directory
> +diacritics_log = log/diacritics_log
> +
> +; When set, outputs messages to the screen
> +log2screen = true
> +
> +; when set, outputs messages to the file specified by crawler_log
> +log2file = true
>
> Added: patches/00102.sql
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ patches/00102.sql	Mon Oct 21 13:24:29 2013	(r1027)
> @@ -0,0 +1,22 @@
> +create table if not exists CrawledPage (
> +  id int not null auto_increment,
> +  timestamp int not null,
> +  url varchar(255) not null,
> +  httpStatus int not null,
> +  rawPagePath varchar(255) not null,
> +  parsedTextPath varchar(255) not null,
> +
> +  primary key (id),
> +  key(httpStatus)
> +);
> +
> +create table if not exists Link(
> +  id int not null auto_increment,
> +  canonicalUrl varchar(255) not null,
> +  domain varchar(255) not null,
> +  crawledPageId int not null,
> +
> +  primary key(id),
> +  key(domain),
> +  key(crawledPageId)
> +);
>
> Modified: phplib/AppLog.php
> ==============================================================================
> --- phplib/AppLog.php	Mon Oct 21 12:30:37 2013	(r1026)
> +++ phplib/AppLog.php	Mon Oct 21 13:24:29 2013	(r1027)
> @@ -71,7 +71,7 @@
>   			fclose( $fd);
>   		}
>   		catch (Exception $ex) {
> -			echo "LOG FILE PROBLEM" . Config::get('app_log.new_line');
> +			echo "LOG FILE PROBLEM\n";
>   		}
>   		//log in stdout
>   		if (Config::get('app_log.log2screen')) {
> _______________________________________________
> Dev mailing list
> Dev at dexonline.ro
> http://list.dexonline.ro/listinfo/dev



More information about the Dev mailing list