[dev] [commit] r1034 - app phplib phplib/models

automailer at dexonline.ro
Fri Nov 1 17:03:05 EET 2013


Author: cata
Date: Fri Nov  1 17:03:05 2013
New Revision: 1034

Log:
Make the crawler crawl the entire whitelist at once.
Use the $accessTimes map to keep track of the last access time for each site on the whitelist.
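
The gist of the change, as a standalone sketch: a map from host name to the
Unix timestamp of the last fetch, consulted before every download. The host
names and the 60-second delay below are illustrative placeholders, not values
taken from the actual crawler config.

<?php
// Hypothetical per-host politeness map in the spirit of $accessTimes.
$accessTimes = array('dexonline.ro' => 0, 'wiki.dexonline.ro' => 0);
$delay = 60; // stand-in for Config::get('crawler.t_wait')

// Returns a host that was not fetched in the last $delay seconds,
// or null if every host is still cooling down.
function nextCrawlableHost(array $accessTimes, $delay) {
  foreach ($accessTimes as $host => $lastAccess) {
    if ($lastAccess < time() - $delay) {
      return $host;
    }
  }
  return null;
}

$host = nextCrawlableHost($accessTimes, $delay);
if ($host !== null) {
  $accessTimes[$host] = time(); // stamp the fetch, as getPage() now does
}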

Modified:
   app/AbstractCrawler.php
   app/Crawler.php
   phplib/StringUtil.php
   phplib/models/CrawledPage.php
   phplib/models/Link.php
   phplib/util.php

Modified: app/AbstractCrawler.php
==============================================================================
--- app/AbstractCrawler.php	Sat Oct 26 10:30:33 2013	(r1033)
+++ app/AbstractCrawler.php	Fri Nov  1 17:03:05 2013	(r1034)
@@ -18,10 +18,10 @@
   protected $currentPageId;
   protected $rawPagePath;
   protected $parsedTextPath;
-
   protected $urlResource;
   protected $directoryIndexFile;
   protected $indexFileExt;
+  protected $accessTimes;
 
   function __construct() {
     $this->plainText = '';
@@ -29,6 +29,12 @@
     $this->directoryIndexFile = Config::get('crawler.dir_index_file');
     $this->indexFileExt = explode(',', Config::get('crawler.index_file_ext'));
     $this->fileExt = explode(',', Config::get('crawler.index_file_ext').',txt');
+    
+    $this->accessTimes = array();
+    foreach (Config::get('crawler.whiteList') as $startUrl) {
+      $rec = StringUtil::parseUtf8Url($startUrl);
+      $this->accessTimes[$rec['host']] = 0;
+    }
   }
 
 
@@ -49,15 +55,16 @@
     $this->info = curl_getinfo($this->ch);
 
     if(!curl_errno($this->ch)) {
-       
       $this->info = curl_getinfo($this->ch);
-    }
-    else{
-
+    } else {
       $this->info = array('http_code' => 404);
     }
 
-    curl_close( $this->ch);
+    curl_close($this->ch);
+
+    // Update access time for this page's host
+    $rec = StringUtil::parseUtf8Url($url);
+    $this->accessTimes[$rec['host']] = time();
 
     return $this->pageContent;
   }
@@ -80,12 +87,10 @@
   
  // sets the location where the raw HTML and clean-text files will be saved
   function setStorePageParams() {
-
-    $this->currentTimestamp = date("Y-m-d H:i:s");
-    $this->rawPagePath = Config::get('crawler.raw_page_path')
-      .$this->urlResource['host'] .'/'. $this->currentTimestamp;
-    $this->parsedTextPath = Config::get('crawler.parsed_text_path')
-      .$this->urlResource['host'] .'/'. $this->currentTimestamp;
+    $date = date("Y-m-d H:i:s");
+    $this->currentTimestamp = time();
+    $this->rawPagePath = Config::get('crawler.raw_page_path') . $this->urlResource['host'] . '/' . $date;
+    $this->parsedTextPath = Config::get('crawler.parsed_text_path') . $this->urlResource['host'] . '/' . $date;
   }
 
  // checks whether the page can be downloaded and whether it is HTML
@@ -113,25 +118,20 @@
     return $this->info['http_code'];
   }
 
-  // returns the next uncrawled URL from the database, or null if none exists
+  // Returns the next uncrawled URL from the database.
+  // Returns null if no URL can be crawled yet.
   function getNextLink() {
-
-
-    //$nextLink = null;
-    try {
-      //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
-      $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl not in (Select url from CrawledPage);")->find_one();
-        
-      if ($nextLink != null) {
-        
-        return $nextLink->canonicalUrl;
+    $delay = Config::get('crawler.t_wait');
+    foreach (Config::get('crawler.whiteList') as $startUrl) {
+      $rec = StringUtil::parseUtf8Url($startUrl);
+      if ($this->accessTimes[$rec['host']] < time() - $delay) {
+        $query = sprintf("select canonicalUrl from Link where domain = '%s' and canonicalUrl not in (select url from CrawledPage)", $rec['host']);
+        $link = Model::factory('Link')->raw_query($query)->find_one();
+        if ($link) {
+          return $link;
+        }
       }
     }
-    catch(Exception $ex) {
-
-      Applog::exceptionLog($ex);
-    }
-
     return null;
   }
 
@@ -172,17 +172,12 @@
   }
 
   function eligibleUrl($url) {
-
-    $resource = util_parseUtf8Url($url);
+    $resource = StringUtil::parseUtf8Url($url);
     $pathInfo = pathinfo($resource['path']);
 
     if (isset($pathInfo['extension'])) {
-
       $ext = $pathInfo['extension'];
-
-
       if (array_search($ext, $this->fileExt) === false) {
-
         return false;
       }
     }
@@ -195,27 +190,20 @@
  // finds all the links
  // converts relative links to absolute ones
   function processLink($url) {
-
-
-    if (!$this->eligibleUrl($url)) {
-
-      return;
-    }
-
     Applog::log('Processing link: '.$url);
     $canonicalUrl = null;
     if ($this->isRelativeLink($url)) {
-
       $url = $this->makeAbsoluteLink($url);
     }
-    // if the last character is '/', remove it
-    // e.g. wiki.dexonline.ro, not wiki.dexonline.ro/
-    if (substr($url, -1) == "/") $url = substr($url, 0, -1);
 
    // removes extra slashes and the directory index file
     $canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);
 
-    $rec = util_parseUtf8Url($canonicalUrl);
+    if (!$this->eligibleUrl($url)) {
+      return;
+    }
+
+    $rec = StringUtil::parseUtf8Url($canonicalUrl);
     if ($rec['host'] == $this->getDomain($url)) {
       Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
     }

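For context, a minimal standalone sketch of the fetch-then-stamp flow that
getPage() now follows. The URL is a placeholder and the cURL options are cut
down to the bare minimum; this is not the crawler's actual configuration
(the real code parses hosts with StringUtil::parseUtf8Url).

<?php
$url = 'http://wiki.dexonline.ro/'; // placeholder URL
$accessTimes = array();             // normally built in the constructor

$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$pageContent = curl_exec($ch);
// Mirror the error handling above: fall back to a 404 pseudo-response.
$info = curl_errno($ch) ? array('http_code' => 404) : curl_getinfo($ch);
curl_close($ch);

// Record the access time for this page's host.
$rec = parse_url($url);
$accessTimes[$rec['host']] = time(); // host is now off-limits for t_wait seconds
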
Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php	Sat Oct 26 10:30:33 2013	(r1033)
+++ app/Crawler.php	Fri Nov  1 17:03:05 2013	(r1034)
@@ -55,40 +55,39 @@
     Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
 
     while (1) {
-
      // fetch the next unprocessed link from the database
-      $url = $this->getNextLink();
-      Applog::log('current URL: ' . $url);
-      // if crawling has finished
-      if ($url == null || $url == '') break;
-
-      // download the page
-      $pageContent = $this->getPage($url);
-      // set the current URL for storing in the database
-      $this->currentUrl = $url;
-      $this->urlResource = util_parseUtf8Url($url);
-      $links = $this->processPage($pageContent);
-
-      $this->setStorePageParams();
-
-      // save an entry for the current page in the database
-      $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->pageContent, $this->plainText, $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
-
-      // if the page is not in HTML format (it is an image or some other file),
-      // or if we received an HTTP error code, skip this page
-      if (!$this->pageOk()) {
-        continue;
-      }
+      $link = $this->getNextLink();
+      if ($link) {
+        Applog::log('current URL: ' . $link->canonicalUrl);
+
+        // download the page
+        $pageContent = $this->getPage($link->canonicalUrl);
+        // set the current URL for storing in the database
+        $this->currentUrl = $link->canonicalUrl;
+        $this->urlResource = StringUtil::parseUtf8Url($link->canonicalUrl);
+        $links = $this->processPage($pageContent);
+
+        $this->setStorePageParams();
+
+        // save an entry for the current page in the database
+        $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->pageContent, $this->plainText, $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
+
+        // if the page is not in HTML format (it is an image or some other file),
+        // or if we received an HTTP error code, skip this page
+        if (!$this->pageOk()) {
+          continue;
+        }
       
-      foreach($links as $link) {
-        $this->processLink($link);
+        foreach($links as $link) {
+          $this->processLink($link);
+        }
       }
 
-      //niceness
-      sleep(Config::get('crawler.t_wait'));
+      // Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
+      $sleepTime = 1 + max(0, min($this->accessTimes) + Config::get('crawler.t_wait') - time());
+      Applog::log("Sleeping for $sleepTime seconds");
+      sleep($sleepTime);
     }
-
-    Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
   }
 
 
@@ -99,7 +98,7 @@
     // These URLs will not have a page they were discovered from, so crawledPageId will be 0.
     foreach (Config::get('crawler.whiteList') as $startUrl) {
       $startUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
-      $rec = util_parseUtf8Url($startUrl);
+      $rec = StringUtil::parseUtf8Url($startUrl);
       Link::saveLink2DB($startUrl, $rec['host'], 0);
     }
 

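A worked example of the new sleep computation in run(): if the least recently
visited host was fetched 40 seconds ago and crawler.t_wait is 60 seconds, the
loop sleeps 1 + (60 - 40) = 21 seconds, after which that host is guaranteed
to be crawlable again. As a sketch with made-up values:

<?php
$tWait = 60; // stand-in for Config::get('crawler.t_wait')
$accessTimes = array(
  'dexonline.ro'      => time() - 40, // fetched 40 s ago, oldest entry
  'wiki.dexonline.ro' => time() - 10, // fetched 10 s ago
);
// min() picks the oldest timestamp: 1 + max(0, 60 - 40) = 21 seconds.
$sleepTime = 1 + max(0, min($accessTimes) + $tWait - time());
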
Modified: phplib/StringUtil.php
==============================================================================
--- phplib/StringUtil.php	Sat Oct 26 10:30:33 2013	(r1033)
+++ phplib/StringUtil.php	Fri Nov  1 17:03:05 2013	(r1034)
@@ -301,6 +301,17 @@
     return array_values(array_filter(explode($delimiter, $s), 'strlen'));
   }
 
+
+  /** Kudos http://www.php.net/manual/pt_BR/function.parse-url.php#107291 **/
+  static function parseUtf8Url($url) {
+    static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
+    if (is_string($url) && preg_match('~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
+                                      '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
+      return $matches;
+    }
+    return false;
+  }
+
 	/**
    * Cleans up a URL in various ways:
    * - trims any known index files and extensions (passed as arguments)
@@ -311,6 +322,12 @@
    * @param $indexExt Array of index file extensions
    **/
 	static function urlCleanup($url, $indexFile, $indexExt) {
+    // Delete any fragment
+    $pos = strrpos($url, '#');
+    if ($pos !== false) {
+      $url = substr($url, 0, $pos);
+    }
+
     // Scroll through the extension list until we find one that matches
     $i = 0;
     $found = false;

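To make the relocated helper concrete, here is what its named capture groups
yield for a typical URL; the keys follow the (?P<...>) groups in the regex
above, and the example URL is invented.

<?php
$rec = StringUtil::parseUtf8Url('http://wiki.dexonline.ro/index.php?title=Cuvânt#top');
echo $rec['scheme'];   // http
echo $rec['host'];     // wiki.dexonline.ro
echo $rec['path'];     // /index.php
echo $rec['query'];    // title=Cuvânt
echo $rec['fragment']; // top
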
Modified: phplib/models/CrawledPage.php
==============================================================================
--- phplib/models/CrawledPage.php	Sat Oct 26 10:30:33 2013	(r1033)
+++ phplib/models/CrawledPage.php	Fri Nov  1 17:03:05 2013	(r1034)
@@ -1,6 +1,5 @@
 <?php
 
-
 class CrawledPage  extends BaseObject implements DatedObject {
   
   public static $_table = 'CrawledPage';
@@ -13,8 +12,7 @@
     file_put_contents($parsedTextPath, $parsedText);
 
     try {
-      $tableObj = Model::factory(self::$_table);
-      $tableObj->create();
+      $tableObj = Model::factory(self::$_table)->create();
       $tableObj->timestamp = $timestamp;
       $tableObj->url = $url;
       $tableObj->httpStatus = $httpStatus;

Modified: phplib/models/Link.php
==============================================================================
--- phplib/models/Link.php	Sat Oct 26 10:30:33 2013	(r1033)
+++ phplib/models/Link.php	Fri Nov  1 17:03:05 2013	(r1034)
@@ -1,7 +1,6 @@
 <?php
 
-
-class Link extends BaseObject {
+class Link extends BaseObject implements DatedObject {
 	//implements DatedObject {
 
 	public static $_table = 'Link';
@@ -16,8 +15,7 @@
 
 		try {
 
-			$tableObj = Model::factory(self::$_table);
-			$tableObj->create();
+			$tableObj = Model::factory(self::$_table)->create();
 			$tableObj->canonicalUrl = $canonicalUrl;
 			$tableObj->domain = $domain;
 			$tableObj->crawledPageId = $crawledPageId;

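Both models now use the chained Paris-style create-and-save idiom shown in the
hunk above; a minimal sketch with placeholder values (the URL is invented, and
save() is assumed to come from the ORM base class):

<?php
$link = Model::factory('Link')->create(); // factory and create in one chain
$link->canonicalUrl  = 'http://dexonline.ro/definitie/cuvant'; // placeholder
$link->domain        = 'dexonline.ro';
$link->crawledPageId = 0; // whitelist seeds have no discovering page
$link->save();
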
Modified: phplib/util.php
==============================================================================
--- phplib/util.php	Sat Oct 26 10:30:33 2013	(r1033)
+++ phplib/util.php	Fri Nov  1 17:03:05 2013	(r1034)
@@ -491,20 +491,4 @@
   return $result;
 }
 
-/** Kudos http://www.php.net/manual/pt_BR/function.parse-url.php#107291 **/
-function util_parseUtf8Url($url) {
-  static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
-  if (is_string($url) && preg_match(
-        '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
-        '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
-    foreach ($matches as $key => $value) {
-      if (!isset($keys[$key]) || empty($value)) {
-        unset($matches[$key]);
-      }
-    }
-    return $matches;
-  }
-  return false;
-}
-
 ?>

