[dev] [commit] r1027 - app log patches phplib
automailer at dexonline.ro
automailer at dexonline.ro
Mon Oct 21 13:24:29 EEST 2013
Author: cata
Date: Mon Oct 21 13:24:29 2013
New Revision: 1027
Log:
Some crawler changes
* moved settings into dex.conf.sample
* replaced user_agent_location with just user_agent. It now contains the actual user agent (we can do that because dex.conf.sample is public, but dex.conf is not).
* simplified the URL white list. It is now crawler.whiteList in dex.conf.sample. Removed all the code that handled multi-line comments.
* removed references to new_line. I hope that's ok. We can always display the log in a browser window with <pre> if needed.
* added the MySQL tables CrawledPage and Link (patches/00102.sql). I hope I got all the fields and indexes right. I couldn't find the original schema.
Added:
patches/00102.sql
Deleted:
app/WhiteList.txt
app/app_dex.conf
Modified:
app/AbstractCrawler.php
app/Crawler.php
app/clean.php
dex.conf.sample
log/ (props changed)
phplib/AppLog.php
Modified: app/AbstractCrawler.php
==============================================================================
--- app/AbstractCrawler.php Mon Oct 21 12:30:37 2013 (r1026)
+++ app/AbstractCrawler.php Mon Oct 21 13:24:29 2013 (r1027)
@@ -47,10 +47,10 @@
function getPage($url) {
$this->ch = curl_init();
- Applog::log(file_get_contents(Config::get('crawler.user_agent_location')));
+ Applog::log("User agent is: " . Config::get('crawler.user_agent'));
curl_setopt ($this->ch, CURLOPT_URL, $url);
curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
- curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(Config::get('crawler.user_agent_location')));
+ curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php Mon Oct 21 12:30:37 2013 (r1026)
+++ app/Crawler.php Mon Oct 21 13:24:29 2013 (r1027)
@@ -3,7 +3,7 @@
* Alin Ungureanu, 2013
* alyn.cti at gmail.com
*/
-require_once dirname(__FILE__) . '/AbstractCrawler.php';
+require_once __DIR__ . '/AbstractCrawler.php';
class Crawler extends AbstractCrawler {
@@ -99,7 +99,7 @@
Applog::log("Crawler started");
- $this->domainsList = explode(PHP_EOL, file_get_contents("WhiteList.txt"));
+ $this->domainsList = Config::get('crawler.whiteList');
//start processing
$this->processWhiteList();
@@ -109,26 +109,9 @@
function processWhiteList() {
-
- $multipleLinesComment = false;
-
foreach($this->domainsList as $startUrl) {
-
$startUrl = trim($startUrl);
- //comentarii pe mai multe linii
-
- if (substr($startUrl, 0, 3) == '###') {
- //flip bit
- $multipleLinesComment ^= 1;
- }
-
- //comentarii sau linii goale
- if ($multipleLinesComment || substr($startUrl,0,1) == '#'
- || !$startUrl)
- continue;
-
-
//curatam url-ul
$this->currentUrl = $this->urlPadding($startUrl);
//impartim url-ul pe componente
Deleted: app/WhiteList.txt
==============================================================================
--- app/WhiteList.txt Mon Oct 21 13:24:29 2013 (r1026)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,18 +0,0 @@
-# câte un domeniu pe linie
-# comentarii pe o singură linie cu #
-
-
-###
-
-pe mai multe linii încadrați între ### și ###,
-dar ### de închidere să fie singur pe linie
-pentru că altfel ce urmează după aceasta pe
-aceeași linie va fi ignorat
-
-###
-
-
-http://wiki.dexonline.ro/
-#http://www.romlit.ro/
-#http://ro.wikipedia.org/
-
Deleted: app/app_dex.conf
==============================================================================
--- app/app_dex.conf Mon Oct 21 13:24:29 2013 (r1026)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,65 +0,0 @@
-[crawler]
-
-
-;the as downloaded html page location
-raw_page_path=RawPage/
-;the pure text page associated
-;with the html one location
-parsed_text_path=ParsedText/
-
-;waiting time between getting pages
-t_wait=15
-
-;if this is true, then the application will
-;exit if an exception occured
-exception_exit=true
-
-;the most probable directory index file
-dir_index_file=index
-;the most probable index extensions
-index_file_ext=html,php,aspx,asp,pl,py,jsp
-
-;this should be stored somewhere not on public directories
-user_agent_location=/var/www/CrawlerData/user_agent
-
-
-[diacritics]
-
-diacritics_buffer_limit=2048
-
-;diactritics list
-diacritics=ăâîșț
-
-;if word finished, this char
-;will be used in padding until
-;the reach of the padding number
-padding_char=*
-
-;non-diacritics list
-non_lower_diacritics=aist
-non_upper_diacritics=AIST
-
-;left and right length
-;e.g. for cireșarii cire-s-arii
-;the padding_length is 5
-diacritics_padding_length=5
-
-
-
-[app_log]
-
-log_detail_level=2
-
-;crawler log file path
-crawler_log=crawler_log
-
-;diacritics log file path
-diacritics_log=diacritics_log
-
-;outputs messages to the screen
-;values are true and false
-log2screen=true
-
-;outputs messages to the file
-;specified bycrawler_log
-log2file=true
\ No newline at end of file
Modified: app/clean.php
==============================================================================
--- app/clean.php Mon Oct 21 12:30:37 2013 (r1026)
+++ app/clean.php Mon Oct 21 13:24:29 2013 (r1027)
@@ -3,10 +3,10 @@
* Alin Ungureanu, 2013
* alyn.cti at gmail.com
*/
-require_once '../../phplib/util.php';
+require_once __DIR__ . '/../phplib/util.php';
function printUsage() {
- echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [ -d | --diacritics]".PHP_EOL;
+ echo "::Usage::" . PHP_EOL . "php clean.php [ -c | --crawler] [ -d | --diacritics]" . PHP_EOL;
flush();
exit();
}
@@ -30,25 +30,25 @@
try {
- //sterge toate fisierele salvate
+ // șterge toate fișierele salvate
removeFiles('ParsedText/*');
removeFiles('RawPage/*');
- echo 'files deleted' . Config::get('crawler.new_line');
+ echo "files deleted\n";
- $db->exec('TRUNCATE Table CrawledPage;');
- $db->exec('TRUNCATE Table Link;');
- $db->commit();
+ $db->exec('TRUNCATE Table CrawledPage;');
+ $db->exec('TRUNCATE Table Link;');
+ $db->commit();
- echo "tables 'Link' and 'CrawledPage' truncated" . Config::get('crawler.new_line');
+ echo "tables 'Link' and 'CrawledPage' truncated\n";
- echo 'The cleaning process was successful' . Config::get('crawler.new_line');
+ echo "The cleaning process was successful\n";
}
catch(Exception $ex) {
- echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
+ echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
}
}
else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
@@ -56,13 +56,13 @@
try{
$db->exec('TRUNCATE Table Diacritics;');
$db->exec('TRUNCATE Table FilesUsedInDiacritics;');
- $db->commit();
- echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated" . Config::get('crawler.new_line');
- echo 'The cleaning process was successful' . Config::get('crawler.new_line');
+ $db->commit();
+ echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated\n";
+ echo "The cleaning process was successful\n";
}
- catch(Exception $e) {
+ catch(Exception $ex) {
- echo 'The cleaning process encountered a problem ' . Config::get('crawler.new_line').$ex->getMessage();
+ echo "The cleaning process encountered a problem: " . $ex->getMessage() . "\n";
}
}
Modified: dex.conf.sample
==============================================================================
--- dex.conf.sample Mon Oct 21 12:30:37 2013 (r1026)
+++ dex.conf.sample Mon Oct 21 13:24:29 2013 (r1027)
@@ -1,4 +1,5 @@
; Configuration file for a DEX online installation
+; Some entire sections are optional. Please refer to each section's comments.
[global]
; Modules that are allowed to run and possibly serve a banner instead of the bannerType choice below.
@@ -69,12 +70,14 @@
skins[] = zepu
skins[] = polar
+; This section handles donations through a third party.
[euplatesc]
; Key for euplatesc.ro or 0 to disable the donation box.
euPlatescKey = 0
; Merchant ID for euplatesc.ro or 0 to disable the donation box.
euPlatescMid = 0
+; Functional testing. We don't do much of it as of 2013.
[functest]
; URL that the functional test engine will exercise
; This is the URL you normally go to in development
@@ -91,6 +94,7 @@
; Comment this out in production.
functestLockFile = /tmp/dex-functest
+; Skin-specific variables. Necessary if you plan to display banners.
[skin-zepu]
; Display banners after the search box.
adsense_mainPage = "id=1220723485&width=728&height=90"
@@ -108,3 +112,66 @@
[limits]
limitFulltextSearch = 1000
+
+; Configuration for the Romanian literature crawler
+[crawler]
+
+; The as downloaded html page location
+raw_page_path = RawPage/
+
+; The pure text page associated with the html one location
+parsed_text_path = ParsedText/
+
+; Waiting time between getting pages (per-domain limit)
+t_wait = 15
+
+; If true, then the application will exit if an exception occurs
+exception_exit = true
+
+; The most probable directory index file
+dir_index_file = index
+
+; The most probable index extensions
+index_file_ext = html,php,aspx,asp,pl,py,jsp
+
+; Crawler signature
+user_agent = "DEX online crawler v1.0"
+
+; Pages to crawl
+; whiteList[] = http://example.com/
+; whiteList[] = http://example.com/
+
+; Configuration for the app that adds Romanian diacritics to a text that doesn't have them
+[diacritics]
+
+diacritics_buffer_limit = 2048
+
+; Diacritics list
+diacritics = ăâîșț
+
+; Padding char beyond the end of text
+padding_char = *
+
+; Non-diacritics list
+non_lower_diacritics = aist
+non_upper_diacritics = AIST
+
+; Left and right length; e.g. for cireșarii cire-s-arii the padding_length is 5
+diacritics_padding_length = 5
+
+; Logging settings for the crawler and any apps built on top of it
+[app_log]
+
+log_detail_level = 2
+
+; Crawler log file path, relative to the root installation directory
+crawler_log = log/crawler_log
+
+; Diacritics log file path, relative to the root installation directory
+diacritics_log = log/diacritics_log
+
+; When set, outputs messages to the screen
+log2screen = true
+
+; when set, outputs messages to the file specified by crawler_log
+log2file = true
Added: patches/00102.sql
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ patches/00102.sql Mon Oct 21 13:24:29 2013 (r1027)
@@ -0,0 +1,22 @@
+create table if not exists CrawledPage (
+ id int not null auto_increment,
+ timestamp int not null,
+ url varchar(255) not null,
+ httpStatus int not null,
+ rawPagePath varchar(255) not null,
+ parsedTextPath varchar(255) not null,
+
+ primary key (id),
+ key(httpStatus)
+);
+
+create table if not exists Link(
+ id int not null auto_increment,
+ canonicalUrl varchar(255) not null,
+ domain varchar(255) not null,
+ crawledPageId int not null,
+
+ primary key(id),
+ key(domain),
+ key(crawledPageId)
+);
Modified: phplib/AppLog.php
==============================================================================
--- phplib/AppLog.php Mon Oct 21 12:30:37 2013 (r1026)
+++ phplib/AppLog.php Mon Oct 21 13:24:29 2013 (r1027)
@@ -71,7 +71,7 @@
fclose( $fd);
}
catch (Exception $ex) {
- echo "LOG FILE PROBLEM" . Config::get('app_log.new_line');
+ echo "LOG FILE PROBLEM\n";
}
//log in stdout
if (Config::get('app_log.log2screen')) {
More information about the Dev
mailing list