[dev] [commit] r1027 - app log patches phplib

Cătălin Frâncu cata at francu.com
Mon Oct 21 17:00:33 EEST 2013


Hi Alin,

No problem. I should have done these integration steps a long time ago. :-)
I'd also like to put together a crawl manager today. That is, if we want to
index 10 sites and the limit is 10 seconds per site, we should be able to
crawl one page per second by rotating among the various sites.
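
Roughly something along these lines (just a sketch of the idea; the function
name and the $fetchPage callback are placeholders, not code from the repo):

<?php

// Sketch of a crawl manager: visit several domains round-robin, waiting
// $delayPerSite seconds between two fetches from the same domain, while
// the crawler as a whole stays close to one page per second overall.
function runCrawlManager(array $domains, $fetchPage, $delayPerSite = 10) {
  $nextAllowed = array();                    // domain => earliest time of the next fetch
  foreach ($domains as $domain) {
    $nextAllowed[$domain] = 0;
  }

  while (true) {
    $fetched = false;
    foreach ($domains as $domain) {
      if (time() >= $nextAllowed[$domain]) {
        call_user_func($fetchPage, $domain); // fetch one page from this domain
        $nextAllowed[$domain] = time() + $delayPerSite;
        $fetched = true;
        sleep(1);                            // global pace: about one page per second
        break;
      }
    }
    if (!$fetched) {
      sleep(1);                              // no domain is ready yet
    }
  }
}

// Example: 10 sites with a 10-second limit per site still gives us roughly
// one page per second overall:
// runCrawlManager($whiteList, function ($domain) { /* getPage(...) here */ });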

Cătălin

On 10/21/2013 03:27 PM, Alin Ungureanu wrote:
> Hi,
>
> I was busy this weekend, so I didn't get around to checking the table part
> or writing up a bit of documentation. I may check the tables this evening
> and write the documentation on Friday evening.
>
> Alin
>
> On 10/21/2013 1:24 PM, automailer at dexonline.ro wrote:
>> Author: cata
>> Date: Mon Oct 21 13:24:29 2013
>> New Revision: 1027
>>
>> Log:
>> Some crawler changes
>> * moved settings into dex.conf.sample
>> * replaced user_agent_location with just user_agent. It now contains
>> the actual user agent (we can do that because dex.conf.sample is
>> public, but dex.conf is not).
>> * simplify the URL white list. It is now crawler.whiteList in
>> dex.conf.sample. Removed all the multiple line comments code.
>> * remove references to new_line. I hope that's ok. We can always
>> display the log in a browser window with <pre> if needed.
>> * add MySQL tables. I hope I got all the fields and indexes right. I
>> couldn't find the original schema.
>>
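A note on the white-list and user-agent items in the log above: both now live
in the [crawler] section of dex.conf. A minimal sketch of how such ini-style
values come out on the PHP side (the crawler itself goes through our Config
class; parse_ini_file() is used here only to show the shape of the data):

<?php

// Sketch only: read the new [crawler] settings from an ini-style config file.
// In the crawler this is Config::get('crawler.user_agent') and
// Config::get('crawler.whiteList').
$cfg = parse_ini_file('dex.conf', true);        // true => keep [section] grouping

$userAgent = $cfg['crawler']['user_agent'];
$whiteList = isset($cfg['crawler']['whiteList'])
           ? $cfg['crawler']['whiteList']       // whiteList[] = ... lines become an array
           : array();

foreach ($whiteList as $startUrl) {
  echo "would crawl " . trim($startUrl) . " as " . $userAgent . "\n";
}
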
>> Added:
>>     patches/00102.sql
>> Deleted:
>>     app/WhiteList.txt
>>     app/app_dex.conf
>> Modified:
>>     app/AbstractCrawler.php
>>     app/Crawler.php
>>     app/clean.php
>>     dex.conf.sample
>>     log/   (props changed)
>>     phplib/AppLog.php
>>
>> Modified: app/AbstractCrawler.php
>> ==============================================================================
>>
>> --- app/AbstractCrawler.php    Mon Oct 21 12:30:37 2013    (r1026)
>> +++ app/AbstractCrawler.php    Mon Oct 21 13:24:29 2013    (r1027)
>> @@ -47,10 +47,10 @@
>>       function getPage($url) {
>>           $this->ch = curl_init();
>> -        Applog::log(file_get_contents(Config::get('crawler.user_agent_location')));
>>
>> +        Applog::log("User agent is: " . Config::get('crawler.user_agent'));
>>           curl_setopt ($this->ch, CURLOPT_URL, $url);
>>           curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
>> -        curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(Config::get('crawler.user_agent_location')));
>> +        curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
>>           curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
>>           curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
>>           curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
>>
>> Modified: app/Crawler.php
>> ==============================================================================
>>
>> --- app/Crawler.php    Mon Oct 21 12:30:37 2013    (r1026)
>> +++ app/Crawler.php    Mon Oct 21 13:24:29 2013    (r1027)
>> @@ -3,7 +3,7 @@
>>    * Alin Ungureanu, 2013
>>    * alyn.cti at gmail.com
>>    */
>> -require_once dirname(__FILE__) . '/AbstractCrawler.php';
>> +require_once __DIR__ . '/AbstractCrawler.php';
>>   class Crawler extends AbstractCrawler {
>> @@ -99,7 +99,7 @@
>>
>>           Applog::log("Crawler started");
>> -        $this->domainsList = explode(PHP_EOL, file_get_contents("WhiteList.txt"));
>> +        $this->domainsList = Config::get('crawler.whiteList');
>>           //start processing
>>           $this->processWhiteList();
>> @@ -109,26 +109,9 @@
>>       function processWhiteList() {
>> -
>> -        $multipleLinesComment = false;
>> -
>>           foreach($this->domainsList as $startUrl) {
>> -
>>               $startUrl = trim($startUrl);
>> -            //comentarii pe mai multe linii
>> -
>> -            if (substr($startUrl, 0, 3) == '###') {
>> -                //flip bit
>> -                $multipleLinesComment ^= 1;
>> -            }
>> -
>> -            //comentarii sau linii goale
>> -            if ($multipleLinesComment || substr($startUrl,0,1) == '#'
>> -                || !$startUrl)
>> -                continue;
>> -
>> -
>>               //curatam url-ul
>>               $this->currentUrl = $this->urlPadding($startUrl);
>>               //impartim url-ul pe componente
>>
>> Deleted: app/WhiteList.txt
>> ==============================================================================
>>
>> --- app/WhiteList.txt    Mon Oct 21 13:24:29 2013    (r1026)
>> +++ /dev/null    00:00:00 1970    (deleted)
>> @@ -1,18 +0,0 @@
>> -# câte un domeniu pe linie
>> -# comentarii pe o singură linie cu #
>> -
>> -
>> -###
>> -
>> -pe mai multe linii încadrați între ### și ###,
>> -dar ### de închidere să fie singur pe linie
>> -pentru că altfel ce urmează după aceasta pe
>> -aceeași linie va fi ignorat
>> -
>> -###
>> -
>> -
>> -http://wiki.dexonline.ro/
>> -#http://www.romlit.ro/
>> -#http://ro.wikipedia.org/
>> -
>>
>> Deleted: app/app_dex.conf
>> ==============================================================================
>>
>> --- app/app_dex.conf    Mon Oct 21 13:24:29 2013    (r1026)
>> +++ /dev/null    00:00:00 1970    (deleted)
>> @@ -1,65 +0,0 @@
>> -[crawler]
>> -
>> -
>> -;the as downloaded html page location
>> -raw_page_path=RawPage/
>> -;the pure text page associated
>> -;with the html one location
>> -parsed_text_path=ParsedText/
>> -
>> -;waiting time between getting pages
>> -t_wait=15
>> -
>> -;if this is true, then the application will
>> -;exit if an exception occured
>> -exception_exit=true
>> -
>> -;the most probable directory index file
>> -dir_index_file=index
>> -;the most probable index extensions
>> -index_file_ext=html,php,aspx,asp,pl,py,jsp
>> -
>> -;this should be stored somewhere not on public directories
>> -user_agent_location=/var/www/CrawlerData/user_agent
>> -
>> -
>> -[diacritics]
>> -
>> -diacritics_buffer_limit=2048
>> -
>> -;diactritics list
>> -diacritics=ăâîșț
>> -
>> -;if word finished, this char
>> -;will be used in padding until
>> -;the reach of the padding number
>> -padding_char=*
>> -
>> -;non-diacritics list
>> -non_lower_diacritics=aist
>> -non_upper_diacritics=AIST
>> -
>> -;left and right length
>> -;e.g. for cireșarii cire-s-arii
>> -;the padding_length is 5
>> -diacritics_padding_length=5
>> -
>> -
>> -
>> -[app_log]
>> -
>> -log_detail_level=2
>> -
>> -;crawler log file path
>> -crawler_log=crawler_log
>> -
>> -;diacritics log file path
>> -diacritics_log=diacritics_log
>> -
>> -;outputs messages to the screen
>> -;values are true and false
>> -log2screen=true
>> -
>> -;outputs messages to the file
>> -;specified bycrawler_log
>> -log2file=true
>> \ No newline at end of file
>>
>> Modified: app/clean.php
>> ==============================================================================
>>
>> --- app/clean.php    Mon Oct 21 12:30:37 2013    (r1026)
>> +++ app/clean.php    Mon Oct 21 13:24:29 2013    (r1027)
>> @@ -3,10 +3,10 @@
>>    * Alin Ungureanu, 2013
>>    * alyn.cti at gmail.com
>>    */
>> -require_once '../../phplib/util.php';
>> +require_once __DIR__ . '/../phplib/util.php';
>>   function printUsage() {
>> -    echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [ -d | --diacritics]".PHP_EOL;
>> +    echo "::Usage::" . PHP_EOL . "php clean.php [ -c | --crawler] [ -d | --diacritics]" . PHP_EOL;
>>       flush();
>>       exit();
>>   }
>> @@ -30,25 +30,25 @@
>>       try {
>> -        //sterge toate fisierele salvate
>> +        // șterge toate fișierele salvate
>>           removeFiles('ParsedText/*');
>>           removeFiles('RawPage/*');
>> -        echo 'files deleted' . Config::get('crawler.new_line');
>> +        echo "files deleted\n";
>> -        $db->exec('TRUNCATE Table CrawledPage;');
>> -        $db->exec('TRUNCATE Table Link;');
>> -        $db->commit();
>> +    $db->exec('TRUNCATE Table CrawledPage;');
>> +    $db->exec('TRUNCATE Table Link;');
>> +    $db->commit();
>> -        echo "tables 'Link' and 'CrawledPage' truncated" .
>> Config::get('crawler.new_line');
>> +    echo "tables 'Link' and 'CrawledPage' truncated\n";
>> -        echo 'The cleaning process was successful' . Config::get('crawler.new_line');
>> +        echo "The cleaning process was successful\n";
>>       }
>>       catch(Exception $ex) {
>> -        echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
>> +        echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
>>       }
>>   }
>>   else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
>> @@ -56,13 +56,13 @@
>>       try{
>>           $db->exec('TRUNCATE Table Diacritics;');
>>           $db->exec('TRUNCATE Table FilesUsedInDiacritics;');
>> -        $db->commit();
>> -        echo "tables 'Diacritics' and 'FilesUsedInDiacritics'
>> truncated" . Config::get('crawler.new_line');
>> -        echo 'The cleaning process was successful' .
>> Config::get('crawler.new_line');
>> +    $db->commit();
>> +    echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated\n";
>> +        echo "The cleaning process was successful\n";
>>       }
>> -    catch(Exception $e) {
>> +    catch(Exception $ex) {
>> -        echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
>> +        echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
>>       }
>>   }
>>
>> Modified: dex.conf.sample
>> ==============================================================================
>>
>> --- dex.conf.sample    Mon Oct 21 12:30:37 2013    (r1026)
>> +++ dex.conf.sample    Mon Oct 21 13:24:29 2013    (r1027)
>> @@ -1,4 +1,5 @@
>>   ; Configuration file for a DEX online installation
>> +; Some entire sections are optional. Please refer to each section's comments.
>>   [global]
>>   ; Modules that are allowed to run and possibly serve a banner instead of the bannerType choice below.
>> @@ -69,12 +70,14 @@
>>   skins[] = zepu
>>   skins[] = polar
>> +; This section handles donations through a third party.
>>   [euplatesc]
>>   ; Key for euplatesc.ro or 0 to disable the donation box.
>>   euPlatescKey = 0
>>   ; Merchant ID for euplatesc.ro or 0 to disable the donation box.
>>   euPlatescMid = 0
>> +; Functional testing. We don't do much of it as of 2013.
>>   [functest]
>>   ; URL that the functional test engine will exercise
>>   ; This is the URL you normally go to in development
>> @@ -91,6 +94,7 @@
>>   ; Comment this out in production.
>>   functestLockFile = /tmp/dex-functest
>> +; Skin-specific variables. Necessary if you plan to display banners.
>>   [skin-zepu]
>>   ; Display banners after the search box.
>>   adsense_mainPage = "id=1220723485&width=728&height=90"
>> @@ -108,3 +112,66 @@
>>   [limits]
>>   limitFulltextSearch = 1000
>> +
>> +; Configuration for the Romanian literature crawler
>> +[crawler]
>> +
>> +; The as downloaded html page location
>> +raw_page_path = RawPage/
>> +
>> +; The pure text page associated with the html one location
>> +parsed_text_path = ParsedText/
>> +
>> +; Waiting time between getting pages (per-domain limit)
>> +t_wait = 15
>> +
>> +; If true, then the application will exit if an exception occurs
>> +exception_exit = true
>> +
>> +; The most probable directory index file
>> +dir_index_file = index
>> +
>> +; The most probable index extensions
>> +index_file_ext = html,php,aspx,asp,pl,py,jsp
>> +
>> +; Crawler signature
>> +user_agent = "DEX online crawler v1.0"
>> +
>> +; Pages to crawl
>> +; whiteList[] = http://example.com/
>> +; whiteList[] = http://example.com/
>> +
>> +; Configuration for the app that adds Romanian diacritics to a text that doesn't have them
>> +[diacritics]
>> +
>> +diacritics_buffer_limit = 2048
>> +
>> +; Diacritics list
>> +diacritics = ăâîșț
>> +
>> +; Padding char beyond the end of text
>> +padding_char = *
>> +
>> +; Non-diacritics list
>> +non_lower_diacritics = aist
>> +non_upper_diacritics = AIST
>> +
>> +; Left and right length; e.g. for cireșarii cire-s-arii the padding_length is 5
>> +diacritics_padding_length = 5
>> +
>> +; Logging settings for the crawler and any apps built on top of it
>> +[app_log]
>> +
>> +log_detail_level = 2
>> +
>> +; Crawler log file path, relative to the root installation directory
>> +crawler_log = log/crawler_log
>> +
>> +; Diacritics log file path, relative to the root installation directory
>> +diacritics_log = log/diacritics_log
>> +
>> +; When set, outputs messages to the screen
>> +log2screen = true
>> +
>> +; when set, outputs messages to the file specified by crawler_log
>> +log2file = true
>>
>> Added: patches/00102.sql
>> ==============================================================================
>>
>> --- /dev/null    00:00:00 1970    (empty, because file is newly added)
>> +++ patches/00102.sql    Mon Oct 21 13:24:29 2013    (r1027)
>> @@ -0,0 +1,22 @@
>> +create table if not exists CrawledPage (
>> +  id int not null auto_increment,
>> +  timestamp int not null,
>> +  url varchar(255) not null,
>> +  httpStatus int not null,
>> +  rawPagePath varchar(255) not null,
>> +  parsedTextPath varchar(255) not null,
>> +
>> +  primary key (id),
>> +  key(httpStatus)
>> +);
>> +
>> +create table if not exists Link(
>> +  id int not null auto_increment,
>> +  canonicalUrl varchar(255) not null,
>> +  domain varchar(255) not null,
>> +  crawledPageId int not null,
>> +
>> +  primary key(id),
>> +  key(domain),
>> +  key(crawledPageId)
>> +);
>>
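On the new tables: the way I read the schema, every Link row points back at
the CrawledPage it was found on through crawledPageId. A rough PDO sketch of
how a fetched page and its outgoing links would be stored (the DSN, the
credentials and the example URLs below are placeholders, not the crawler's
actual code):

<?php

// Sketch of how the two tables fit together: one CrawledPage row per fetched
// page, one Link row per outgoing URL, tied together by crawledPageId.
$db = new PDO('mysql:host=localhost;dbname=dexonline', 'user', 'password');

$db->prepare('insert into CrawledPage (timestamp, url, httpStatus, rawPagePath, parsedTextPath)
              values (?, ?, ?, ?, ?)')
   ->execute(array(time(), 'http://wiki.dexonline.ro/', 200,
                   'RawPage/000001.html', 'ParsedText/000001.txt'));
$pageId = $db->lastInsertId();

$stmt = $db->prepare('insert into Link (canonicalUrl, domain, crawledPageId) values (?, ?, ?)');
foreach (array('http://wiki.dexonline.ro/index.php', 'http://dexonline.ro/') as $url) {
  $stmt->execute(array($url, parse_url($url, PHP_URL_HOST), $pageId));
}
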
>> Modified: phplib/AppLog.php
>> ==============================================================================
>>
>> --- phplib/AppLog.php    Mon Oct 21 12:30:37 2013    (r1026)
>> +++ phplib/AppLog.php    Mon Oct 21 13:24:29 2013    (r1027)
>> @@ -71,7 +71,7 @@
>>               fclose( $fd);
>>           }
>>           catch (Exception $ex) {
>> -            echo "LOG FILE PROBLEM" . Config::get('app_log.new_line');
>> +            echo "LOG FILE PROBLEM\n";
>>           }
>>           //log in stdout
>>           if (Config::get('app_log.log2screen')) {
>> _______________________________________________
>> Dev mailing list
>> Dev at dexonline.ro
>> http://list.dexonline.ro/listinfo/dev
>

