[dev] [commit] r1027 - app log patches phplib
Cătălin Frâncu
cata at francu.com
Mon Oct 21 17:00:33 EEST 2013
Salut Alin,
Nu-i problemă. Eu trebuia să fi făcut de mult pașii ăștia de
încorporare. :-) Aș vrea să mai fac un crawl manager astăzi. Adică, dacă
vrem să indexăm 10 site-uri și limita per site este de 10 secunde, să
putem crawla o pagină pe secundă de la diverse site-uri.
Cătălin
On 10/21/2013 03:27 PM, Alin Ungureanu wrote:
> Salut,
>
> Am fost ocupat weekendul acesta, n-am apucat să verific partea de tabele
> și să fac o mică documentație. S-ar putea să verific tabelele în seara
> aceasta și să fac documentația vineri seara.
>
> Alin
>
> On 10/21/2013 1:24 PM, automailer at dexonline.ro wrote:
>> Author: cata
>> Date: Mon Oct 21 13:24:29 2013
>> New Revision: 1027
>>
>> Log:
>> Some crawler changes
>> * moved settings into dex.conf.sample
>> * replaced user_agent_location with just user_agent. It now contains
>> the actual user agent (we can do that because dex.conf.sample is
>> public, but dex.conf is not).
>> * simplify the URL white list. It is now crawler.whiteList in
>> dex.conf.sample. Removed all the multiple line comments code.
>> * remove references to new_line. I hope that's ok. We can always
>> display the log in a browser window with <pre> if needed.
>> * add MySQL tables. I hope I got all the fields and indexes right. I
>> couldn't find the original schema.
>>
>> Added:
>> patches/00102.sql
>> Deleted:
>> app/WhiteList.txt
>> app/app_dex.conf
>> Modified:
>> app/AbstractCrawler.php
>> app/Crawler.php
>> app/clean.php
>> dex.conf.sample
>> log/ (props changed)
>> phplib/AppLog.php
>>
>> Modified: app/AbstractCrawler.php
>> ==============================================================================
>>
>> --- app/AbstractCrawler.php Mon Oct 21 12:30:37 2013 (r1026)
>> +++ app/AbstractCrawler.php Mon Oct 21 13:24:29 2013 (r1027)
>> @@ -47,10 +47,10 @@
>> function getPage($url) {
>> $this->ch = curl_init();
>> -
>> Applog::log(file_get_contents(Config::get('crawler.user_agent_location')));
>>
>> + Applog::log("User agent is: " .
>> Config::get('crawler.user_agent'));
>> curl_setopt ($this->ch, CURLOPT_URL, $url);
>> curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
>> - curl_setopt ($this->ch, CURLOPT_USERAGENT,
>> file_get_contents(Config::get('crawler.user_agent_location')));
>> + curl_setopt ($this->ch, CURLOPT_USERAGENT,
>> Config::get('crawler.user_agent'));
>> curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
>> curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
>> curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
>>
>> Modified: app/Crawler.php
>> ==============================================================================
>>
>> --- app/Crawler.php Mon Oct 21 12:30:37 2013 (r1026)
>> +++ app/Crawler.php Mon Oct 21 13:24:29 2013 (r1027)
>> @@ -3,7 +3,7 @@
>> * Alin Ungureanu, 2013
>> * alyn.cti at gmail.com
>> */
>> -require_once dirname(__FILE__) . '/AbstractCrawler.php';
>> +require_once __DIR__ . '/AbstractCrawler.php';
>> class Crawler extends AbstractCrawler {
>> @@ -99,7 +99,7 @@
>>
>> Applog::log("Crawler started");
>> - $this->domainsList = explode(PHP_EOL,
>> file_get_contents("WhiteList.txt"));
>> + $this->domainsList = Config::get('crawler.whiteList');
>> //start processing
>> $this->processWhiteList();
>> @@ -109,26 +109,9 @@
>> function processWhiteList() {
>> -
>> - $multipleLinesComment = false;
>> -
>> foreach($this->domainsList as $startUrl) {
>> -
>> $startUrl = trim($startUrl);
>> - //comentarii pe mai multe linii
>> -
>> - if (substr($startUrl, 0, 3) == '###') {
>> - //flip bit
>> - $multipleLinesComment ^= 1;
>> - }
>> -
>> - //comentarii sau linii goale
>> - if ($multipleLinesComment || substr($startUrl,0,1) == '#'
>> - || !$startUrl)
>> - continue;
>> -
>> -
>> //curatam url-ul
>> $this->currentUrl = $this->urlPadding($startUrl);
>> //impartim url-ul pe componente
>>
>> Deleted: app/WhiteList.txt
>> ==============================================================================
>>
>> --- app/WhiteList.txt Mon Oct 21 13:24:29 2013 (r1026)
>> +++ /dev/null 00:00:00 1970 (deleted)
>> @@ -1,18 +0,0 @@
>> -# câte un domeniu pe linie
>> -# comentarii pe o singură linie cu #
>> -
>> -
>> -###
>> -
>> -pe mai multe linii încadrați între ### și ###,
>> -dar ### de închidere să fie singur pe linie
>> -pentru că altfel ce urmează după aceasta pe
>> -aceeași linie va fi ignorat
>> -
>> -###
>> -
>> -
>> -http://wiki.dexonline.ro/
>> -#http://www.romlit.ro/
>> -#http://ro.wikipedia.org/
>> -
>>
>> Deleted: app/app_dex.conf
>> ==============================================================================
>>
>> --- app/app_dex.conf Mon Oct 21 13:24:29 2013 (r1026)
>> +++ /dev/null 00:00:00 1970 (deleted)
>> @@ -1,65 +0,0 @@
>> -[crawler]
>> -
>> -
>> -;the as downloaded html page location
>> -raw_page_path=RawPage/
>> -;the pure text page associated
>> -;with the html one location
>> -parsed_text_path=ParsedText/
>> -
>> -;waiting time between getting pages
>> -t_wait=15
>> -
>> -;if this is true, then the application will
>> -;exit if an exception occurred
>> -exception_exit=true
>> -
>> -;the most probable directory index file
>> -dir_index_file=index
>> -;the most probable index extensions
>> -index_file_ext=html,php,aspx,asp,pl,py,jsp
>> -
>> -;this should be stored somewhere not on public directories
>> -user_agent_location=/var/www/CrawlerData/user_agent
>> -
>> -
>> -[diacritics]
>> -
>> -diacritics_buffer_limit=2048
>> -
>> -;diacritics list
>> -diacritics=ăâîșț
>> -
>> -;if word finished, this char
>> -;will be used in padding until
>> -;the reach of the padding number
>> -padding_char=*
>> -
>> -;non-diacritics list
>> -non_lower_diacritics=aist
>> -non_upper_diacritics=AIST
>> -
>> -;left and right length
>> -;e.g. for cireșarii cire-s-arii
>> -;the padding_length is 5
>> -diacritics_padding_length=5
>> -
>> -
>> -
>> -[app_log]
>> -
>> -log_detail_level=2
>> -
>> -;crawler log file path
>> -crawler_log=crawler_log
>> -
>> -;diacritics log file path
>> -diacritics_log=diacritics_log
>> -
>> -;outputs messages to the screen
>> -;values are true and false
>> -log2screen=true
>> -
>> -;outputs messages to the file
>> -;specified by crawler_log
>> -log2file=true
>> \ No newline at end of file
>>
>> Modified: app/clean.php
>> ==============================================================================
>>
>> --- app/clean.php Mon Oct 21 12:30:37 2013 (r1026)
>> +++ app/clean.php Mon Oct 21 13:24:29 2013 (r1027)
>> @@ -3,10 +3,10 @@
>> * Alin Ungureanu, 2013
>> * alyn.cti at gmail.com
>> */
>> -require_once '../../phplib/util.php';
>> +require_once __DIR__ . '/../phplib/util.php';
>> function printUsage() {
>> - echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [
>> -d | --diacritics]".PHP_EOL;
>> + echo "::Usage::" . PHP_EOL . "php clean.php [ -c | --crawler] [
>> -d | --diacritics]" . PHP_EOL;
>> flush();
>> exit();
>> }
>> @@ -30,25 +30,25 @@
>> try {
>> - //sterge toate fisierele salvate
>> + // șterge toate fișierele salvate
>> removeFiles('ParsedText/*');
>> removeFiles('RawPage/*');
>> - echo 'files deleted' . Config::get('crawler.new_line');
>> + echo "files deleted\n";
>> - $db->exec('TRUNCATE Table CrawledPage;');
>> - $db->exec('TRUNCATE Table Link;');
>> - $db->commit();
>> + $db->exec('TRUNCATE Table CrawledPage;');
>> + $db->exec('TRUNCATE Table Link;');
>> + $db->commit();
>> - echo "tables 'Link' and 'CrawledPage' truncated" .
>> Config::get('crawler.new_line');
>> + echo "tables 'Link' and 'CrawledPage' truncated\n";
>> - echo 'The cleaning process was successful' .
>> Config::get('crawler.new_line');
>> + echo "The cleaning process was successful\n";
>> }
>> catch(Exception $ex) {
>> - echo 'The cleaning process encountered a problem ' .
>> Config::get('crawler.new_line').$ex->getMessage();
>> + echo "The cleaning process encountered a problem: " .
>> $ex->getMessage() . "\n";
>> }
>> }
>> else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
>> @@ -56,13 +56,13 @@
>> try{
>> $db->exec('TRUNCATE Table Diacritics;');
>> $db->exec('TRUNCATE Table FilesUsedInDiacritics;');
>> - $db->commit();
>> - echo "tables 'Diacritics' and 'FilesUsedInDiacritics'
>> truncated" . Config::get('crawler.new_line');
>> - echo 'The cleaning process was successful' .
>> Config::get('crawler.new_line');
>> + $db->commit();
>> + echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated\n";
>> + echo "The cleaning process was successful\n";
>> }
>> - catch(Exception $e) {
>> + catch(Exception $ex) {
>> - echo 'The cleaning process encountered a problem ' .
>> Config::get('crawler.new_line').$ex->getMessage();
>> + echo "The cleaning process encountered a problem: " .
>> $ex->getMessage() . "\n";
>> }
>> }
>>
>> Modified: dex.conf.sample
>> ==============================================================================
>>
>> --- dex.conf.sample Mon Oct 21 12:30:37 2013 (r1026)
>> +++ dex.conf.sample Mon Oct 21 13:24:29 2013 (r1027)
>> @@ -1,4 +1,5 @@
>> ; Configuration file for a DEX online installation
>> +; Some entire sections are optional. Please refer to each section's
>> comments.
>> [global]
>> ; Modules that are allowed to run and possibly serve a banner
>> instead of the bannerType choice below.
>> @@ -69,12 +70,14 @@
>> skins[] = zepu
>> skins[] = polar
>> +; This section handles donations through a third party.
>> [euplatesc]
>> ; Key for euplatesc.ro or 0 to disable the donation box.
>> euPlatescKey = 0
>> ; Merchant ID for euplatesc.ro or 0 to disable the donation box.
>> euPlatescMid = 0
>> +; Functional testing. We don't do much of it as of 2013.
>> [functest]
>> ; URL that the functional test engine will exercise
>> ; This is the URL you normally go to in development
>> @@ -91,6 +94,7 @@
>> ; Comment this out in production.
>> functestLockFile = /tmp/dex-functest
>> +; Skin-specific variables. Necessary if you plan to display banners.
>> [skin-zepu]
>> ; Display banners after the search box.
>> adsense_mainPage = "id=1220723485&width=728&height=90"
>> @@ -108,3 +112,66 @@
>> [limits]
>> limitFulltextSearch = 1000
>> +
>> +; Configuration for the Romanian literature crawler
>> +[crawler]
>> +
>> +; The as downloaded html page location
>> +raw_page_path = RawPage/
>> +
>> +; The pure text page associated with the html one location
>> +parsed_text_path = ParsedText/
>> +
>> +; Waiting time between getting pages (per-domain limit)
>> +t_wait = 15
>> +
>> +; If true, then the application will exit if an exception occurs
>> +exception_exit = true
>> +
>> +; The most probable directory index file
>> +dir_index_file = index
>> +
>> +; The most probable index extensions
>> +index_file_ext = html,php,aspx,asp,pl,py,jsp
>> +
>> +; Crawler signature
>> +user_agent = "DEX online crawler v1.0"
>> +
>> +; Pages to crawl
>> +; whiteList[] = http://example.com/
>> +; whiteList[] = http://example.com/
>> +
>> +; Configuration for the app that adds Romanian diacritics to a text
>> that doesn't have them
>> +[diacritics]
>> +
>> +diacritics_buffer_limit = 2048
>> +
>> +; Diacritics list
>> +diacritics = ăâîșț
>> +
>> +; Padding char beyond the end of text
>> +padding_char = *
>> +
>> +; Non-diacritics list
>> +non_lower_diacritics = aist
>> +non_upper_diacritics = AIST
>> +
>> +; Left and right length; e.g. for cireșarii cire-s-arii the
>> padding_length is 5
>> +diacritics_padding_length = 5
>> +
>> +; Logging settings for the crawler and any apps built on top of it
>> +[app_log]
>> +
>> +log_detail_level = 2
>> +
>> +; Crawler log file path, relative to the root installation directory
>> +crawler_log = log/crawler_log
>> +
>> +; Diacritics log file path, relative to the root installation directory
>> +diacritics_log = log/diacritics_log
>> +
>> +; When set, outputs messages to the screen
>> +log2screen = true
>> +
>> +; When set, outputs messages to the file specified by crawler_log
>> +log2file = true
>>
>> Added: patches/00102.sql
>> ==============================================================================
>>
>> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
>> +++ patches/00102.sql Mon Oct 21 13:24:29 2013 (r1027)
>> @@ -0,0 +1,22 @@
>> +create table if not exists CrawledPage (
>> + id int not null auto_increment,
>> + timestamp int not null,
>> + url varchar(255) not null,
>> + httpStatus int not null,
>> + rawPagePath varchar(255) not null,
>> + parsedTextPath varchar(255) not null,
>> +
>> + primary key (id),
>> + key(httpStatus)
>> +);
>> +
>> +create table if not exists Link(
>> + id int not null auto_increment,
>> + canonicalUrl varchar(255) not null,
>> + domain varchar(255) not null,
>> + crawledPageId int not null,
>> +
>> + primary key(id),
>> + key(domain),
>> + key(crawledPageId)
>> +);
>>
>> Modified: phplib/AppLog.php
>> ==============================================================================
>>
>> --- phplib/AppLog.php Mon Oct 21 12:30:37 2013 (r1026)
>> +++ phplib/AppLog.php Mon Oct 21 13:24:29 2013 (r1027)
>> @@ -71,7 +71,7 @@
>> fclose( $fd);
>> }
>> catch (Exception $ex) {
>> - echo "LOG FILE PROBLEM" . Config::get('app_log.new_line');
>> + echo "LOG FILE PROBLEM\n";
>> }
>> //log in stdout
>> if (Config::get('app_log.log2screen')) {
>> _______________________________________________
>> Dev mailing list
>> Dev at dexonline.ro
>> http://list.dexonline.ro/listinfo/dev
>
> _______________________________________________
> Dev mailing list
> Dev at dexonline.ro
> http://list.dexonline.ro/listinfo/dev
More information about the Dev
mailing list