[dev] [commit] r1034 - app phplib phplib/models
automailer at dexonline.ro
Fri Nov 1 17:03:05 EET 2013
Author: cata
Date: Fri Nov 1 17:03:05 2013
New Revision: 1034
Log:
Make the crawler crawl the entire whitelist at once.
Use the $accessTimes map to keep track of the last access time for each site on the whitelist.
Modified:
app/AbstractCrawler.php
app/Crawler.php
phplib/StringUtil.php
phplib/models/CrawledPage.php
phplib/models/Link.php
phplib/util.php
Modified: app/AbstractCrawler.php
==============================================================================
--- app/AbstractCrawler.php Sat Oct 26 10:30:33 2013 (r1033)
+++ app/AbstractCrawler.php Fri Nov 1 17:03:05 2013 (r1034)
@@ -18,10 +18,10 @@
protected $currentPageId;
protected $rawPagePath;
protected $parsedTextPath;
-
protected $urlResource;
protected $directoryIndexFile;
protected $indexFileExt;
+ protected $accessTimes;
function __construct() {
$this->plainText = '';
@@ -29,6 +29,12 @@
$this->directoryIndexFile = Config::get('crawler.dir_index_file');
$this->indexFileExt = explode(',', Config::get('crawler.index_file_ext'));
$this->fileExt = explode(',', Config::get('crawler.index_file_ext').',txt');
+
+ $this->accessTimes = array();
+ foreach (Config::get('crawler.whiteList') as $startUrl) {
+ $rec = StringUtil::parseUtf8Url($startUrl);
+ $this->accessTimes[$rec['host']] = 0;
+ }
}
@@ -49,15 +55,16 @@
$this->info = curl_getinfo($this->ch);
if(!curl_errno($this->ch)) {
-
$this->info = curl_getinfo($this->ch);
- }
- else{
-
+ } else{
$this->info = array('http_code' => 404);
}
- curl_close( $this->ch);
+ curl_close($this->ch);
+
+ // Update access time for this page's host
+ $rec = StringUtil::parseUtf8Url($url);
+ $this->accessTimes[$rec['host']] = time();
return $this->pageContent;
}
@@ -80,12 +87,10 @@
// sets the location where the raw HTML and clean-text files will be saved
function setStorePageParams() {
-
- $this->currentTimestamp = date("Y-m-d H:i:s");
- $this->rawPagePath = Config::get('crawler.raw_page_path')
- .$this->urlResource['host'] .'/'. $this->currentTimestamp;
- $this->parsedTextPath = Config::get('crawler.parsed_text_path')
- .$this->urlResource['host'] .'/'. $this->currentTimestamp;
+ $date = date("Y-m-d H:i:s");
+ $this->currentTimestamp = time();
+ $this->rawPagePath = Config::get('crawler.raw_page_path') . $this->urlResource['host'] . '/' . $date;
+ $this->parsedTextPath = Config::get('crawler.parsed_text_path') . $this->urlResource['host'] . '/' . $date;
}
// checks whether the page can be downloaded and whether it is HTML
@@ -113,25 +118,20 @@
return $this->info['http_code'];
}
- // returns the next uncrawled URL from the database, or null if none exists
+ // Returns the next uncrawled URL from the database.
+ // Returns null if no URL can be crawled yet.
function getNextLink() {
-
-
- //$nextLink = null;
- try {
- //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
- $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl not in (Select url from CrawledPage);")->find_one();
-
- if ($nextLink != null) {
-
- return $nextLink->canonicalUrl;
+ $delay = Config::get('crawler.t_wait');
+ foreach (Config::get('crawler.whiteList') as $startUrl) {
+ $rec = StringUtil::parseUtf8Url($startUrl);
+ if ($this->accessTimes[$rec['host']] < time() - $delay) {
+ $query = sprintf("select canonicalUrl from Link where domain = '%s' and canonicalUrl not in (select url from CrawledPage)", $rec['host']);
+ $link = Model::factory('Link')->raw_query($query)->find_one();
+ if ($link) {
+ return $link;
+ }
}
}
- catch(Exception $ex) {
-
- Applog::exceptionLog($ex);
- }
-
return null;
}
@@ -172,17 +172,12 @@
}
function eligibleUrl($url) {
-
- $resource = util_parseUtf8Url($url);
+ $resource = StringUtil::parseUtf8Url($url);
$pathInfo = pathinfo($resource['path']);
if (isset($pathInfo['extension'])) {
-
$ext = $pathInfo['extension'];
-
-
if (array_search($ext, $this->fileExt) === false) {
-
return false;
}
}
@@ -195,27 +190,20 @@
// finds all the links
// converts them to absolute links if they are relative
function processLink($url) {
-
-
- if (!$this->eligibleUrl($url)) {
-
- return;
- }
-
Applog::log('Processing link: '.$url);
$canonicalUrl = null;
if ($this->isRelativeLink($url)) {
-
$url = $this->makeAbsoluteLink($url);
}
- // if the last character is '/', remove it
- // e.g. wiki.dexonline.ro, not wiki.dexonline.ro/
- if (substr($url, -1) == "/") $url = substr($url, 0, -1);
// removes extra slashes and the directory index file
$canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);
- $rec = util_parseUtf8Url($canonicalUrl);
+ if (!$this->eligibleUrl($url)) {
+ return;
+ }
+
+ $rec = StringUtil::parseUtf8Url($canonicalUrl);
if ($rec['host'] == $this->getDomain($url)) {
Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
}
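For illustration, here is a minimal standalone sketch of the politeness gate that the new $accessTimes map implements in getNextLink(): a host is only eligible when its last access is older than the configured delay. The hosts, timestamps, and $delay value below are made up; $delay stands in for Config::get('crawler.t_wait').

  <?php
  // Minimal sketch of the per-host politeness gate (hypothetical values).
  $delay = 60; // stands in for Config::get('crawler.t_wait'), in seconds
  $accessTimes = array(
    'wiki.dexonline.ro' => time() - 120, // last fetched 2 minutes ago
    'example.com'       => time() - 5,   // last fetched 5 seconds ago
  );

  foreach ($accessTimes as $host => $lastAccess) {
    if ($lastAccess < time() - $delay) {
      echo "$host: eligible, query the Link table for an uncrawled URL\n";
    } else {
      echo "$host: accessed too recently, skip for now\n";
    }
  }
  // Output: wiki.dexonline.ro is eligible; example.com is skipped.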
Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php Sat Oct 26 10:30:33 2013 (r1033)
+++ app/Crawler.php Fri Nov 1 17:03:05 2013 (r1034)
@@ -55,40 +55,39 @@
Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
while (1) {
-
// fetch the next unprocessed link from the database
- $url = $this->getNextLink();
- Applog::log('current URL: ' . $url);
- // if crawling has finished
- if ($url == null || $url == '') break;
-
- // download the page
- $pageContent = $this->getPage($url);
- // set the current URL for storing in the database
- $this->currentUrl = $url;
- $this->urlResource = util_parseUtf8Url($url);
- $links = $this->processPage($pageContent);
-
- $this->setStorePageParams();
-
- // save an entry for the current page to the database
- $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->pageContent, $this->plainText, $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
-
- // if the page is not in HTML format (it is an image or another file),
- // or if we received an HTTP error code, skip this page
- if (!$this->pageOk()) {
- continue;
- }
+ $link = $this->getNextLink();
+ if ($link) {
+ Applog::log('current URL: ' . $link->canonicalUrl);
+
+ // download the page
+ $pageContent = $this->getPage($link->canonicalUrl);
+ // set the current URL for storing in the database
+ $this->currentUrl = $link->canonicalUrl;
+ $this->urlResource = StringUtil::parseUtf8Url($link->canonicalUrl);
+ $links = $this->processPage($pageContent);
+
+ $this->setStorePageParams();
+
+ // save an entry for the current page to the database
+ $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->pageContent, $this->plainText, $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
+
+ // if the page is not in HTML format (it is an image or another file),
+ // or if we received an HTTP error code, skip this page
+ if (!$this->pageOk()) {
+ continue;
+ }
- foreach($links as $link) {
- $this->processLink($link);
+ foreach($links as $link) {
+ $this->processLink($link);
+ }
}
- //niceness
- sleep(Config::get('crawler.t_wait'));
+ // Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
+ $sleepTime = 1 + max(0, min($this->accessTimes) + Config::get('crawler.t_wait') - time());
+ Applog::log("Sleeping for $sleepTime seconds");
+ sleep($sleepTime);
}
-
- Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
}
@@ -99,7 +98,7 @@
// These URLs will not have a page from which they were discovered, so crawledPageId will be 0.
foreach (Config::get('crawler.whiteList') as $startUrl) {
$startUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
- $rec = util_parseUtf8Url($startUrl);
+ $rec = StringUtil::parseUtf8Url($startUrl);
Link::saveLink2DB($startUrl, $rec['host'], 0);
}
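The new sleep computation above waits exactly until the least recently accessed host leaves its delay window, with a 1-second floor, so after sleeping at least one host is eligible again. A worked sketch with made-up timestamps; $tWait stands in for Config::get('crawler.t_wait'):

  <?php
  // Worked example of the sleep computation (all values hypothetical).
  $tWait = 60;          // stands in for Config::get('crawler.t_wait')
  $now = 1383318185;    // pretend time() returned this
  $accessTimes = array(
    'wiki.dexonline.ro' => $now - 45, // eligible again in 15 seconds
    'example.com'       => $now - 10, // eligible again in 50 seconds
  );

  // min() picks the oldest access; that host becomes eligible first.
  $sleepTime = 1 + max(0, min($accessTimes) + $tWait - $now);
  echo $sleepTime; // 16: 15 seconds until wiki.dexonline.ro is eligible, plus the 1-second floor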
Modified: phplib/StringUtil.php
==============================================================================
--- phplib/StringUtil.php Sat Oct 26 10:30:33 2013 (r1033)
+++ phplib/StringUtil.php Fri Nov 1 17:03:05 2013 (r1034)
@@ -301,6 +301,17 @@
return array_values(array_filter(explode($delimiter, $s), 'strlen'));
}
+
+ /** Kudos http://www.php.net/manual/pt_BR/function.parse-url.php#107291 **/
+ static function parseUtf8Url($url) {
+ static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
+ if (is_string($url) && preg_match('~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
+ '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
+ return $matches;
+ }
+ return false;
+ }
+
/**
* Cleans up a URL in various ways:
* - trims any known index files and extensions (passed as arguments)
@@ -311,6 +322,12 @@
* @param $indexExt Array of index file extensions
**/
static function urlCleanup($url, $indexFile, $indexExt) {
+ // Delete any fragment
+ $pos = strrpos($url, '#');
+ if ($pos !== false) {
+ $url = substr($url, 0, $pos);
+ }
+
// Scroll through the extension list until we find one that matches
$i = 0;
$found = false;
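For reference, a hedged example of what the relocated helper returns; the URL below is made up, and since this version no longer filters the match array, $matches carries both numeric and named keys, the named ones being what callers use:

  <?php
  // Example of StringUtil::parseUtf8Url() output (hypothetical URL).
  $rec = StringUtil::parseUtf8Url('http://wiki.dexonline.ro/index.php?title=Mură#istoric');
  echo $rec['scheme'];   // http
  echo $rec['host'];     // wiki.dexonline.ro
  echo $rec['path'];     // /index.php
  echo $rec['query'];    // title=Mură
  echo $rec['fragment']; // istoric

  // The new fragment-stripping step in urlCleanup() would first reduce the
  // same URL to http://wiki.dexonline.ro/index.php?title=Mură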
Modified: phplib/models/CrawledPage.php
==============================================================================
--- phplib/models/CrawledPage.php Sat Oct 26 10:30:33 2013 (r1033)
+++ phplib/models/CrawledPage.php Fri Nov 1 17:03:05 2013 (r1034)
@@ -1,6 +1,5 @@
<?php
-
class CrawledPage extends BaseObject implements DatedObject {
public static $_table = 'CrawledPage';
@@ -13,8 +12,7 @@
file_put_contents($parsedTextPath, $parsedText);
try {
- $tableObj = Model::factory(self::$_table);
- $tableObj->create();
+ $tableObj = Model::factory(self::$_table)->create();
$tableObj->timestamp = $timestamp;
$tableObj->url = $url;
$tableObj->httpStatus = $httpStatus;
Modified: phplib/models/Link.php
==============================================================================
--- phplib/models/Link.php Sat Oct 26 10:30:33 2013 (r1033)
+++ phplib/models/Link.php Fri Nov 1 17:03:05 2013 (r1034)
@@ -1,7 +1,6 @@
<?php
-
-class Link extends BaseObject {
+class Link extends BaseObject implements DatedObject {
//implements DatedObject {
public static $_table = 'Link';
@@ -16,8 +15,7 @@
try {
- $tableObj = Model::factory(self::$_table);
- $tableObj->create();
+ $tableObj = Model::factory(self::$_table)->create();
$tableObj->canonicalUrl = $canonicalUrl;
$tableObj->domain = $domain;
$tableObj->crawledPageId = $crawledPageId;
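Both model classes now chain the factory call in the Paris/Idiorm style. A minimal sketch of the resulting create-and-save pattern, assuming the usual Paris API; the field values are made up:

  <?php
  // Sketch of the chained pattern used by Link::saveLink2DB() (hypothetical values).
  $link = Model::factory('Link')->create(); // new, unsaved Link row
  $link->canonicalUrl = 'http://wiki.dexonline.ro/Page';
  $link->domain = 'wiki.dexonline.ro';
  $link->crawledPageId = 0;
  $link->save();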
Modified: phplib/util.php
==============================================================================
--- phplib/util.php Sat Oct 26 10:30:33 2013 (r1033)
+++ phplib/util.php Fri Nov 1 17:03:05 2013 (r1034)
@@ -491,20 +491,4 @@
return $result;
}
-/** Kudos http://www.php.net/manual/pt_BR/function.parse-url.php#107291 **/
-function util_parseUtf8Url($url) {
- static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
- if (is_string($url) && preg_match(
- '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
- '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
- foreach ($matches as $key => $value) {
- if (!isset($keys[$key]) || empty($value)) {
- unset($matches[$key]);
- }
- }
- return $matches;
- }
- return false;
-}
-
?>