[dev] [commit] r1027 - app log patches phplib
Alin Ungureanu
alyn.cti at gmail.com
Mon Oct 21 15:27:25 EEST 2013
Salut,
Am fost ocupat weekendul acesta, n-am apucat să verific partea de tabele
și să fac o mică documentație. S-ar putea să verific tabelele în seara
aceasta și să fac documentația vineri seara.
Alin
On 10/21/2013 1:24 PM, automailer at dexonline.ro wrote:
> Author: cata
> Date: Mon Oct 21 13:24:29 2013
> New Revision: 1027
>
> Log:
> Some crawler changes
> * moved settings into dex.conf.sample
> * replaced user_agent_location with just user_agent. It now contains the actual user agent (we can do that because dex.conf.sample is public, but dex.conf is not).
> * simplify the URL white list. It is now crawler.whiteList in dex.conf.sample. Removed all the multi-line comment handling code.
> * remove references to new_line. I hope that's ok. We can always display the log in a browser window with <pre> if needed.
> * add MySQL tables. I hope I got all the fields and indexes right. I couldn't find the original schema.
>
> Added:
> patches/00102.sql
> Deleted:
> app/WhiteList.txt
> app/app_dex.conf
> Modified:
> app/AbstractCrawler.php
> app/Crawler.php
> app/clean.php
> dex.conf.sample
> log/ (props changed)
> phplib/AppLog.php
>
> Modified: app/AbstractCrawler.php
> ==============================================================================
> --- app/AbstractCrawler.php Mon Oct 21 12:30:37 2013 (r1026)
> +++ app/AbstractCrawler.php Mon Oct 21 13:24:29 2013 (r1027)
> @@ -47,10 +47,10 @@
> function getPage($url) {
>
> $this->ch = curl_init();
> - Applog::log(file_get_contents(Config::get('crawler.user_agent_location')));
> + Applog::log("User agent is: " . Config::get('crawler.user_agent'));
> curl_setopt ($this->ch, CURLOPT_URL, $url);
> curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
> - curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(Config::get('crawler.user_agent_location')));
> + curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
> curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
> curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
> curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
>
> Modified: app/Crawler.php
> ==============================================================================
> --- app/Crawler.php Mon Oct 21 12:30:37 2013 (r1026)
> +++ app/Crawler.php Mon Oct 21 13:24:29 2013 (r1027)
> @@ -3,7 +3,7 @@
> * Alin Ungureanu, 2013
> * alyn.cti at gmail.com
> */
> -require_once dirname(__FILE__) . '/AbstractCrawler.php';
> +require_once __DIR__ . '/AbstractCrawler.php';
>
> class Crawler extends AbstractCrawler {
>
> @@ -99,7 +99,7 @@
>
> Applog::log("Crawler started");
>
> - $this->domainsList = explode(PHP_EOL, file_get_contents("WhiteList.txt"));
> + $this->domainsList = Config::get('crawler.whiteList');
>
> //start processing
> $this->processWhiteList();
> @@ -109,26 +109,9 @@
>
>
> function processWhiteList() {
> -
> - $multipleLinesComment = false;
> -
> foreach($this->domainsList as $startUrl) {
> -
> $startUrl = trim($startUrl);
>
> - //comentarii pe mai multe linii
> -
> - if (substr($startUrl, 0, 3) == '###') {
> - //flip bit
> - $multipleLinesComment ^= 1;
> - }
> -
> - //comentarii sau linii goale
> - if ($multipleLinesComment || substr($startUrl,0,1) == '#'
> - || !$startUrl)
> - continue;
> -
> -
> //curatam url-ul
> $this->currentUrl = $this->urlPadding($startUrl);
> //impartim url-ul pe componente
>
> Deleted: app/WhiteList.txt
> ==============================================================================
> --- app/WhiteList.txt Mon Oct 21 13:24:29 2013 (r1026)
> +++ /dev/null 00:00:00 1970 (deleted)
> @@ -1,18 +0,0 @@
> -# câte un domeniu pe linie
> -# comentarii pe o singură linie cu #
> -
> -
> -###
> -
> -pe mai multe linii încadrați între ### și ###,
> -dar ### de închidere să fie singur pe linie
> -pentru că altfel ce urmează după aceasta pe
> -aceeași linie va fi ignorat
> -
> -###
> -
> -
> -http://wiki.dexonline.ro/
> -#http://www.romlit.ro/
> -#http://ro.wikipedia.org/
> -
>
> Deleted: app/app_dex.conf
> ==============================================================================
> --- app/app_dex.conf Mon Oct 21 13:24:29 2013 (r1026)
> +++ /dev/null 00:00:00 1970 (deleted)
> @@ -1,65 +0,0 @@
> -[crawler]
> -
> -
> -;the as downloaded html page location
> -raw_page_path=RawPage/
> -;the pure text page associated
> -;with the html one location
> -parsed_text_path=ParsedText/
> -
> -;waiting time between getting pages
> -t_wait=15
> -
> -;if this is true, then the application will
> -;exit if an exception occured
> -exception_exit=true
> -
> -;the most probable directory index file
> -dir_index_file=index
> -;the most probable index extensions
> -index_file_ext=html,php,aspx,asp,pl,py,jsp
> -
> -;this should be stored somewhere not on public directories
> -user_agent_location=/var/www/CrawlerData/user_agent
> -
> -
> -[diacritics]
> -
> -diacritics_buffer_limit=2048
> -
> -;diactritics list
> -diacritics=ăâîșț
> -
> -;if word finished, this char
> -;will be used in padding until
> -;the reach of the padding number
> -padding_char=*
> -
> -;non-diacritics list
> -non_lower_diacritics=aist
> -non_upper_diacritics=AIST
> -
> -;left and right length
> -;e.g. for cireșarii cire-s-arii
> -;the padding_length is 5
> -diacritics_padding_length=5
> -
> -
> -
> -[app_log]
> -
> -log_detail_level=2
> -
> -;crawler log file path
> -crawler_log=crawler_log
> -
> -;diacritics log file path
> -diacritics_log=diacritics_log
> -
> -;outputs messages to the screen
> -;values are true and false
> -log2screen=true
> -
> -;outputs messages to the file
> -;specified bycrawler_log
> -log2file=true
> \ No newline at end of file
>
> Modified: app/clean.php
> ==============================================================================
> --- app/clean.php Mon Oct 21 12:30:37 2013 (r1026)
> +++ app/clean.php Mon Oct 21 13:24:29 2013 (r1027)
> @@ -3,10 +3,10 @@
> * Alin Ungureanu, 2013
> * alyn.cti at gmail.com
> */
> -require_once '../../phplib/util.php';
> +require_once __DIR__ . '/../phplib/util.php';
>
> function printUsage() {
> - echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [ -d | --diacritics]".PHP_EOL;
> + echo "::Usage::" . PHP_EOL . "php clean.php [ -c | --crawler] [ -d | --diacritics]" . PHP_EOL;
> flush();
> exit();
> }
> @@ -30,25 +30,25 @@
>
> try {
>
> - //sterge toate fisierele salvate
> + // șterge toate fișierele salvate
> removeFiles('ParsedText/*');
> removeFiles('RawPage/*');
>
>
> - echo 'files deleted' . Config::get('crawler.new_line');
> + echo "files deleted\n";
>
> - $db->exec('TRUNCATE Table CrawledPage;');
> - $db->exec('TRUNCATE Table Link;');
> - $db->commit();
> + $db->exec('TRUNCATE Table CrawledPage;');
> + $db->exec('TRUNCATE Table Link;');
> + $db->commit();
>
> - echo "tables 'Link' and 'CrawledPage' truncated" . Config::get('crawler.new_line');
> + echo "tables 'Link' and 'CrawledPage' truncated\n";
>
> - echo 'The cleaning process was successful' . Config::get('crawler.new_line');
> + echo "The cleaning process was successful\n";
> }
>
> catch(Exception $ex) {
>
> - echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
> + echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
> }
> }
> else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
> @@ -56,13 +56,13 @@
> try{
> $db->exec('TRUNCATE Table Diacritics;');
> $db->exec('TRUNCATE Table FilesUsedInDiacritics;');
> - $db->commit();
> - echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated" . Config::get('crawler.new_line');
> - echo 'The cleaning process was successful' . Config::get('crawler.new_line');
> + $db->commit();
> + echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated\n";
> + echo "The cleaning process was successful\n";
> }
> - catch(Exception $e) {
> + catch(Exception $ex) {
>
> - echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
> + echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
> }
>
> }
>
> Modified: dex.conf.sample
> ==============================================================================
> --- dex.conf.sample Mon Oct 21 12:30:37 2013 (r1026)
> +++ dex.conf.sample Mon Oct 21 13:24:29 2013 (r1027)
> @@ -1,4 +1,5 @@
> ; Configuration file for a DEX online installation
> +; Some entire sections are optional. Please refer to each section's comments.
>
> [global]
> ; Modules that are allowed to run and possibly serve a banner instead of the bannerType choice below.
> @@ -69,12 +70,14 @@
> skins[] = zepu
> skins[] = polar
>
> +; This section handles donations through a third party.
> [euplatesc]
> ; Key for euplatesc.ro or 0 to disable the donation box.
> euPlatescKey = 0
> ; Merchant ID for euplatesc.ro or 0 to disable the donation box.
> euPlatescMid = 0
>
> +; Functional testing. We don't do much of it as of 2013.
> [functest]
> ; URL that the functional test engine will exercise
> ; This is the URL you normally go to in development
> @@ -91,6 +94,7 @@
> ; Comment this out in production.
> functestLockFile = /tmp/dex-functest
>
> +; Skin-specific variables. Necessary if you plan to display banners.
> [skin-zepu]
> ; Display banners after the search box.
> adsense_mainPage = "id=1220723485&width=728&height=90"
> @@ -108,3 +112,66 @@
>
> [limits]
> limitFulltextSearch = 1000
> +
> +; Configuration for the Romanian literature crawler
> +[crawler]
> +
> +; The as downloaded html page location
> +raw_page_path = RawPage/
> +
> +; The pure text page associated with the html one location
> +parsed_text_path = ParsedText/
> +
> +; Waiting time between getting pages (per-domain limit)
> +t_wait = 15
> +
> +; If true, then the application will exit if an exception occurs
> +exception_exit = true
> +
> +; The most probable directory index file
> +dir_index_file = index
> +
> +; The most probable index extensions
> +index_file_ext = html,php,aspx,asp,pl,py,jsp
> +
> +; Crawler signature
> +user_agent = "DEX online crawler v1.0"
> +
> +; Pages to crawl
> +; whiteList[] = http://example.com/
> +; whiteList[] = http://example.com/
> +
> +; Configuration for the app that adds Romanian diacritics to a text that doesn't have them
> +[diacritics]
> +
> +diacritics_buffer_limit = 2048
> +
> +; Diacritics list
> +diacritics = ăâîșț
> +
> +; Padding char beyond the end of text
> +padding_char = *
> +
> +; Non-diacritics list
> +non_lower_diacritics = aist
> +non_upper_diacritics = AIST
> +
> +; Left and right length; e.g. for cireșarii cire-s-arii the padding_length is 5
> +diacritics_padding_length = 5
> +
> +; Logging settings for the crawler and any apps built on top of it
> +[app_log]
> +
> +log_detail_level = 2
> +
> +; Crawler log file path, relative to the root installation directory
> +crawler_log = log/crawler_log
> +
> +; Diacritics log file path, relative to the root installation directory
> +diacritics_log = log/diacritics_log
> +
> +; When set, outputs messages to the screen
> +log2screen = true
> +
> +; when set, outputs messages to the file specified by crawler_log
> +log2file = true
>
> Added: patches/00102.sql
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ patches/00102.sql Mon Oct 21 13:24:29 2013 (r1027)
> @@ -0,0 +1,22 @@
> +create table if not exists CrawledPage (
> + id int not null auto_increment,
> + timestamp int not null,
> + url varchar(255) not null,
> + httpStatus int not null,
> + rawPagePath varchar(255) not null,
> + parsedTextPath varchar(255) not null,
> +
> + primary key (id),
> + key(httpStatus)
> +);
> +
> +create table if not exists Link(
> + id int not null auto_increment,
> + canonicalUrl varchar(255) not null,
> + domain varchar(255) not null,
> + crawledPageId int not null,
> +
> + primary key(id),
> + key(domain),
> + key(crawledPageId)
> +);
>
> Modified: phplib/AppLog.php
> ==============================================================================
> --- phplib/AppLog.php Mon Oct 21 12:30:37 2013 (r1026)
> +++ phplib/AppLog.php Mon Oct 21 13:24:29 2013 (r1027)
> @@ -71,7 +71,7 @@
> fclose( $fd);
> }
> catch (Exception $ex) {
> - echo "LOG FILE PROBLEM" . Config::get('app_log.new_line');
> + echo "LOG FILE PROBLEM\n";
> }
> //log in stdout
> if (Config::get('app_log.log2screen')) {
> _______________________________________________
> Dev mailing list
> Dev at dexonline.ro
> http://list.dexonline.ro/listinfo/dev
More information about the Dev mailing list