[dev] [commit] r979 - phplib/models wwwbase/Crawler wwwbase/styles
automailer at dexonline.ro
automailer at dexonline.ro
Sun Sep 8 23:31:54 EEST 2013
Author: alinu
Date: Sun Sep 8 23:31:54 2013
New Revision: 979
Log:
Added:
wwwbase/Crawler/MemoryManagement.php
wwwbase/Crawler/WhiteList.txt
wwwbase/Crawler/crawler_dex.conf
wwwbase/Crawler/database_tables.sql
Deleted:
wwwbase/Crawler/sectiunea_de_crawler_dex.conf
Modified:
phplib/models/CrawledPage.php
phplib/models/Link.php
wwwbase/Crawler/.htaccess
wwwbase/Crawler/AbstractCrawler.php
wwwbase/Crawler/Crawler.php
wwwbase/Crawler/clean_all.php
wwwbase/styles/crawler.css
Modified: phplib/models/CrawledPage.php
==============================================================================
--- phplib/models/CrawledPage.php Fri Sep 6 17:49:26 2013 (r978)
+++ phplib/models/CrawledPage.php Sun Sep 8 23:31:54 2013 (r979)
@@ -2,7 +2,6 @@
class CrawledPage extends BaseObject implements DatedObject {
- //implements DatedObject {
public static $_table = 'CrawledPage';
@@ -36,6 +35,11 @@
return Model::factory(self::$_table)->raw_query("select id, domain from
(select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage order by id desc) alias1 group by domain order by id asc;")->find_many();
}
+
+ function getNextDiacriticsFile() {
+
+ return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);");
+ }
}
Modified: phplib/models/Link.php
==============================================================================
--- phplib/models/Link.php Fri Sep 6 17:49:26 2013 (r978)
+++ phplib/models/Link.php Sun Sep 8 23:31:54 2013 (r979)
@@ -7,7 +7,7 @@
public static $_table = 'Link';
//adauga o intrare nou in tabelul Link
- public static function saveLink2DB($canonicalUrl, $domain, $urlHash, $crawledPageId) {
+ public static function saveLink2DB($canonicalUrl, $domain, $crawledPageId) {
//nu inseram acelasi link de 2 ori
if (Model::factory(self::$_table)->where('canonicalUrl', $canonicalUrl)->find_one()) {
@@ -20,7 +20,6 @@
$tableObj->create();
$tableObj->canonicalUrl = $canonicalUrl;
$tableObj->domain = $domain;
- $tableObj->urlHash = $urlHash;
$tableObj->crawledPageId = $crawledPageId;
$tableObj->save();
@@ -33,8 +32,6 @@
return null;
}
-
-
}
-?>
\ No newline at end of file
+?>
Modified: wwwbase/Crawler/.htaccess
==============================================================================
--- wwwbase/Crawler/.htaccess Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/.htaccess Sun Sep 8 23:31:54 2013 (r979)
@@ -1,8 +1,13 @@
+AuthType Basic
+AuthName "Password Protected Area"
+AuthUserFile /etc/php5/apache2/.htpasswd
+
Order Deny,Allow
Deny from all
Allow from 127.0.0.1
-<Files index.php>
- Order Allow,Deny
- Allow from all
-</Files>
\ No newline at end of file
+<Files "index.php">
+ Order Allow,Deny
+ Allow from all
+ Require valid-user
+</Files>
Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/AbstractCrawler.php Sun Sep 8 23:31:54 2013 (r979)
@@ -10,6 +10,7 @@
require_once '../../phplib/idiorm/paris.php';
require_once 'AppLog.php';
+require_once 'MemoryManagement.php';
db_init();
@@ -32,7 +33,7 @@
protected $directoryIndexFile;
protected $IndexFileExt;
- private $justStarted;
+ protected $domainsList;
function __construct() {
@@ -51,7 +52,7 @@
curl_setopt ($this->ch, CURLOPT_URL, $url);
curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
- curl_setopt ($this->ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0");
+ curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(pref_getSectionPreference('crawler', 'user_agent_location')));
curl_setopt ($this->ch, CURLOPT_TIMEOUT, 60);
curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
@@ -89,15 +90,7 @@
return strstr($this->getUrlMimeType($buffer), 'html');
}
- //elibereaza memoria ale carei referinte s-au pierdut
- function manageMemory() {
-
- crawlerLog('MEM USAGE BEFORE GC - ' . memory_get_usage());
- gc_enable(); // Enable Garbage Collector
- crawlerLog(gc_collect_cycles() . " garbage cycles cleaned"); // # of elements cleaned up
- gc_disable(); // Disable Garbage Collector
- crawlerLog('MEM USAGE After GC - ' . memory_get_usage());
- }
+
//seteaza locatia unde vor fi salvate fisierele html raw si clean text
function setStorePageParams() {
@@ -153,9 +146,96 @@
}
}
+ //returneaza codul HTTP
+ function httpResponse() {
+
+ return $this->info['http_code'];
+ }
+
+ //returneaza urmatorul URL ne crawl-at din baza de date sau null daca nu exista
+ function getNextLink() {
+
+
+ //$nextLink = null;
+ try {
+ //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
+ $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
+
+ if ($nextLink != null) {
+
+ return $nextLink->canonicalUrl;
+ }
+ }
+ catch(Exception $ex) {
+
+ logException($ex);
+ }
+
+ return null;
+ }
+
+ //repara HTML-ul stricat intr-un mod minimal astfel incat
+ //sa poata fi interpretat de biblioteca simple_html_dom
+ function fixHtml($html) {
+
+ foreach($html->find('head') as $script) {
+
+ $script->outertext = '';
+ }
+ foreach($html->find('script') as $script) {
+
+ $script->outertext = '';
+ }
+
+ foreach($html->find('style') as $style) {
+ $style->outertext = '';
+ }
+
+ $html->load($html->save());
+
+ //transforma pagina raw in simple_html_dom_node
+ //$this->dom = str_get_html($pageContent);
+
+ $buffer = '<html><body>';
+ $nodes = $html->childNodes();
+ foreach($nodes as $node) {
+
+ $buffer .= $node->innertext();
+ }
+
+ $buffer .= '</body></html>';
+
+ return str_get_html($buffer);
+ }
+
+
+ //metode pentru prelucrarea linkurilor
//sterge directory index file si elimina slash-urile in plus
+ //gaseste toate linkurile
+ //le transforma in absolute daca sunt relative
+ function processLink($url) {
+
+ crawlerLog('Processing link: '.$url);
+ $canonicalUrl = null;
+ if ($this->isRelativeLink($url)) {
+
+ $url = $this->makeAbsoluteLink($url);
+ }
+ //daca ultimul caracter este '/', il eliminam
+ //exemplu wiki.dexonline.ro nu wiki.dexonline.ro/
+ if (substr($url, -1) == "/") $url = substr($url, 0, -1);
+
+ //sterge slash-uri in plus si directory index file
+ $canonicalUrl = $this->urlPadding($url);
+
+ if (!strstr($url, $this->currentLocation)) return;
+
+ Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+ }
+
+
function urlPadding($url) {
return $this->delDuplicateSlashes($this->delDirIndexFile($url));
@@ -190,10 +270,7 @@
$parsedUrl = parse_url($url);
- /*if (substr($parsedUrl['host'], 0, 4) != 'www.') {
-
- $parsedUrl['host'] = 'www.'.$parsedUrl['host'];
- }*/
+
if (substr_count($parsedUrl['host'], '.') < 2) {
$parsedUrl['host'] = 'www.'.$parsedUrl['host'];
@@ -220,57 +297,25 @@
}
//eliminarea slash-ului final
- //$retUrl = substr($retUrl, 0, -1);
+
if (substr($retUrl, -1) == "/") $retUrl = substr($retUrl, 0, -1);
- //crawlerLog("DelDuplicateSlashes ". $retUrl);
-
return $retUrl;
}
- //gaseste toate linkurile
- //le transforma in absolute daca sunt relative
- function processLink($url) {
-
- crawlerLog('Processing link: '.$url);
- $canonicalUrl = null;
- if ($this->isRelativeLink($url)) {
-
- $url = $this->makeAbsoluteLink($url);
- }
- //daca ultimul caracter este '/', il eliminam
- //exemplu wiki.dexonline.ro nu wiki.dexonline.ro/
- if (substr($url, -1) == "/") $url = substr($url, 0, -1);
-
- //sterge slash-uri in plus si directory index file
- $canonicalUrl = $this->urlPadding($url);
-
- //$this->urlResource = parse_url($url);
-
-
-
- if (!strstr($url, $this->currentLocation)) return;
-
-
- $urlHash = $this->getLinkHash($url);
-
- $domain = $this->getDomain($url);
-
- Link::saveLink2DB($canonicalUrl, $domain, $urlHash, $this->currentPageId);
- }
-
function isRelativeLink($url) {
return !strstr($url, "http");
}
-
+ //cauta directorul link-ului curent si returneaza
+ //url-ul spre acel director
function getDeepestDir($url) {
try {
$retVal = substr($url, 0, strrpos($url,'/'));
- //crawlerLog("GetDeepestDir: " . $retVal);
+
if (strstr($retVal, $this->currentLocation))
return $retVal;
else return $url;
@@ -287,100 +332,19 @@
return $this->getDeepestDir($this->currentUrl) .'/'. $url;
}
-
- function getLinkHash($url) {
-
- $liteURL = substr($url, strpos($url, "//") + 2);
- if (strstr($liteURL, "index.php") || strstr($liteURL, "index.asp") ||
- strstr($liteURL, "index.htm"))
- $liteURL = substr($liteURL, 0, strrpos($liteURL, "//"));
- return md5($liteURL);
- }
-
-
function getDomain($url) {
return $this->urlResource['host'];
}
- //returneaza codul HTTP
- function httpResponse() {
-
- return $this->info['http_code'];
- }
-
- //returneaza urmatorul URL ne crawl-at din baza de date sau null daca nu exista
- function getNextLink() {
-
-
- if (!isset($this->justStarted)) {
- $this->justStarted = true;
- return $this->currentUrl;
- }
-
-
- //$nextLink = null;
- try {
- //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
- $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl not in (Select url from CrawledPage);")->find_one();
-
- if ($nextLink != null) {
-
- return $nextLink->canonicalUrl;
- }
- }
- catch(Exception $ex) {
-
- logException($ex);
- }
-
- return null;
- }
-
-
- function fixHtml($html) {
-
- foreach($html->find('head') as $script) {
-
- $script->outertext = '';
- }
-
-
- foreach($html->find('script') as $script) {
-
- $script->outertext = '';
- }
-
- foreach($html->find('style') as $style) {
-
- $style->outertext = '';
- }
-
- $html->load($html->save());
-
- //transforma pagina raw in simple_html_dom_node
- //$this->dom = str_get_html($pageContent);
-
- $buffer = '<html><body>';
- $nodes = $html->childNodes();
- foreach($nodes as $node) {
-
- $buffer .= $node->innertext();
- }
-
- $buffer .= '</body></html>';
-
- return str_get_html($buffer);
- }
-
//Clasele care deriva aceasta clasa vor trebui
//sa implementeze metodele de mai jos
-
abstract function extractText($domNode);
- abstract function startCrawling($startUrl);
-}
+ abstract function crawlDomain();
+ abstract function start();
+}
?>
\ No newline at end of file
Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/Crawler.php Sun Sep 8 23:31:54 2013 (r979)
@@ -11,7 +11,7 @@
//extrage textul fara cod html
function getText($domNode) {
- $this->plainText = strip_tags($domNode->text());
+ $this->plainText = html_entity_decode(strip_tags($domNode->text()));
//$this->plainText = str_replace(array('\t','\n',' ', ' '), array('','.','',''),strip_tags($domNode->text()));
}
//extrage textul cu cod html din nodul respectiv
@@ -26,26 +26,44 @@
}
}
+ function processPage($pageContent) {
- function startCrawling($startUrl) {
-
- crawlerLog("Started");
+ try {
+
+ $html = str_get_html($pageContent);
+ //reparam html stricat
+ if (!$html->find('body', 0, true)) {
- $this->currentUrl = $this->urlPadding($startUrl);
+ $html = $this->fixHtml($html);
+ }
+
+
+ $this->extractText($html->find('body', 0, true));
+ $this->saveCurrentPage();
+
+ //cata memorie consuma
+ //si eliberare referinte pierdute
+
+ $html->clear();
- crawlerLog('FIRST START URL: '.$this->currentUrl);
+ MemoryManagement::showUsage('before cleaning', true, 'KB');
+
+ MemoryManagement::clean(true);
- $this->urlResource = parse_url($this->currentUrl);
+ MemoryManagement::showUsage('after cleaning', true, 'KB');
+ //niceness
+ sleep(pref_getSectionPreference('crawler', 't_wait'));
+ }
+ catch (Exception $ex) {
- //locatia curenta, va fi folosita pentru a nu depasi sfera
- //de exemplu vrem sa crawlam doar o anumita zona a site-ului
- $this->currentLocation = substr($startUrl, strpos($startUrl, ':') + 3);
- crawlerLog('domain start location: '.$this->currentLocation);
+ logException($ex);
+ }
+ }
- $url = $startUrl;
+ function crawlDomain() {
- $justStarted = true;
+ crawlerLog("Crawling: " . $this->getDomain($this->currentUrl) . " started");
while(1) {
@@ -71,36 +89,61 @@
continue;
}
- try {
+ $this->processPage($pageContent);
+ }
+
+ crawlerLog("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
+ }
- $html = str_get_html($pageContent);
+ function start() {
+
+ crawlerLog("Crawler started");
- //reparam html stricat
- if (!$html->find('body', 0, true)) {
+ $this->domainsList = explode(PHP_EOL, file_get_contents("WhiteList.txt"));
- $html = $this->fixHtml($html);
- }
+ //start processing
+ $this->processWhiteList();
+ crawlerLog('Crawler finished');
+ }
- $this->extractText($html->find('body', 0, true));
- $this->saveCurrentPage();
-
- //cata memorie consuma
- //si eliberare referinte pierdute
- $this->manageMemory();
- //niceness
- sleep(pref_getSectionPreference('crawler', 't_wait'));
- }
- catch (Exception $ex) {
+ function processWhiteList() {
- logException($ex);
- }
- }
+ $multipleLinesComment = false;
+
+ foreach($this->domainsList as $startUrl) {
+ //comentarii pe mai multe linii
+ if (substr($startUrl, 0, 3) == '###')
+ $multipleLinesComment ^= $multipleLinesComment;
+
+ if ($multipleLinesComment)
+ continue;
+ //comentarii pe o singura linie
+ if (substr($startUrl,0,1) == '#')
+ continue;
- crawlerLog('Finished');
+ //curatam url-ul
+ $this->currentUrl = $this->urlPadding($startUrl);
+ //impartim url-ul pe componente
+ $this->urlResource = parse_url($this->currentUrl);
+
+ //salvam startUrl in tabelul Link pentru a incepe extragerea,
+ //startUrl nu va avea o pagina din care este descoperit
+ //asa ca pagina crawledPageId va avea valoarea 0.
+ Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
+
+ //locatia curenta, va fi folosita pentru a nu depasi sfera
+ //de exemplu vrem sa crawlam doar o anumita zona a site-ului
+ $this->currentLocation = substr($this->currentUrl, 0);
+ crawlerLog('domain start location: '.$this->currentLocation);
+
+ $this->crawlDomain();
+ }
+
}
+
}
/*
@@ -109,8 +152,8 @@
if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
$obj = new Crawler();
- //$obj->startCrawling("http://wiki.dexonline.ro/");
- $obj->startCrawling("http://www.romlit.ro");
+
+ $obj->start();
}
?>
\ No newline at end of file
Added: wwwbase/Crawler/MemoryManagement.php
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/MemoryManagement.php Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,45 @@
+<?php
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+
+require_once 'AppLog.php';
+
+class MemoryManagement {
+
+ /*
+ * cleans lost memory refferences
+ */
+ public static function clean($print = false) {
+
+ gc_enable(); // Enable Garbage Collector
+ if ($print) {
+
+ crawlerLog(gc_collect_cycles() . " garbage cycles cleaned"); // # of elements cleaned up
+ }
+ gc_disable(); // Disable Garbage Collector
+ }
+
+ public static function showUsage($message = '', $realUsage = false, $units = "B") {
+
+ $truncate = 1;
+ switch($units) {
+
+ case 'KB':
+ $truncate = pow(10,3);
+ break;
+ case 'MB':
+ $truncate = pow(10,6);
+ break;
+ case 'GB':
+ $truncate = pow(10,9);
+ break;
+ default: //Bytes
+
+ break;
+ }
+
+ crawlerLog("Memory Usage $message: " . sprintf("%.0f", memory_get_usage($realUsage) / $truncate) . ' ' . $units);
+ }
+}
+
+?>
\ No newline at end of file
Added: wwwbase/Crawler/WhiteList.txt
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/WhiteList.txt Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,15 @@
+# câte un domeniu pe linie
+# comentarii pe o singură linie cu #
+
+###
+
+pe mai multe linii încadrați între ### și ###,
+dar ### de închidere să fie singur pe linie
+pentru că altfel ce urmează după aceasta pe
+aceeași linie va fi ignorat
+
+###
+
+#http://wiki.dexonline.ro/
+#http://www.romlit.ro
+http://ro.wikipedia.org/
Modified: wwwbase/Crawler/clean_all.php
==============================================================================
--- wwwbase/Crawler/clean_all.php Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/clean_all.php Sun Sep 8 23:31:54 2013 (r979)
@@ -9,6 +9,8 @@
require_once '../../phplib/idiorm/idiorm.php';
+
+
function removeFiles($regexPath) {
exec("rm -rf $regexPath");
Added: wwwbase/Crawler/crawler_dex.conf
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/crawler_dex.conf Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,42 @@
+[crawler]
+
+
+;the location of the raw html page as downloaded
+raw_page_path=RawPage/
+;the pure text page associated
+;with the html one location
+parsed_text_path=ParsedText/
+
+;waiting time between getting pages
+t_wait=10
+
+
+;if this is true, then the application will
+;exit if an exception occured
+exception_exit=true
+
+
+;crawler log
+crawler_log=crawler_log
+diacritics_log=diacritics_log
+;outputs messages to the screen
+;values are true and false
+log2screen=true
+;outputs messages to the file
+;specified by crawler_log
+log2file=true
+new_line=PHP_EOL
+
+
+;the most probable directory index file
+dir_index_file=index
+;the most probable index extensions
+index_file_ext=html,php,aspx,asp,pl,py,jsp
+
+
+;this should be stored somewhere not on public directories
+user_agent_location=/var/www/CrawlerData/user_agent
+
+
+;diacritics list
+diacritics=ăâîșț
Added: wwwbase/Crawler/database_tables.sql
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/database_tables.sql Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,25 @@
+CREATE TABLE IF NOT EXISTS `CrawledPage` (
+ `id` bigint(20) NOT NULL AUTO_INCREMENT,
+ `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+ `url` varchar(256) NOT NULL,
+ `httpStatus` int(11) NOT NULL,
+ `rawPagePath` varchar(128) NOT NULL,
+ `parsedTextPath` varchar(128) NOT NULL,
+ `createDate` int(4) DEFAULT NULL,
+ `modDate` int(4) DEFAULT NULL,
+ PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;
+
+
+
+CREATE TABLE IF NOT EXISTS `Link` (
+ `id` bigint(20) NOT NULL AUTO_INCREMENT,
+ `canonicalUrl` varchar(256) NOT NULL,
+ `domain` varchar(128) NOT NULL,
+ `crawledPageId` bigint(20) NOT NULL,
+ `createDate` int(4) DEFAULT NULL,
+ `modDate` int(4) DEFAULT NULL,
+ PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;
+
+
Deleted: wwwbase/Crawler/sectiunea_de_crawler_dex.conf
==============================================================================
--- wwwbase/Crawler/sectiunea_de_crawler_dex.conf Sun Sep 8 23:31:54 2013 (r978)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,9 +0,0 @@
-[crawler]
-raw_page_path=RawPage/
-parsed_page_path=ParsedText/
-t_wait=30
-exception_exit=true
-crawler_log=crawler_log
-new_line=PHP_EOL
-log2screen=true
-log2file=true
\ No newline at end of file
Modified: wwwbase/styles/crawler.css
==============================================================================
--- wwwbase/styles/crawler.css Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/styles/crawler.css Sun Sep 8 23:31:54 2013 (r979)
@@ -86,7 +86,7 @@
#crawlerTitle img {
- width: 310px !important;
+ width: 330px !important;
height: 50px !important;
margin: 0 auto !important;
}
More information about the Dev
mailing list