[dev] [commit] r1029 - app

Mon Oct 21 21:20:24 EEST 2013

Author: cata
Date: Mon Oct 21 21:20:24 2013
New Revision: 1029

Log:
* Allow the crawler to mix domains.
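* Removed the $currentLocation field, which restricted crawling to one area of a site. This restriction should be reimplemented with a per-domain regexp.
* Removing $currentLocation required a rewrite of makeAbsoluteLink() (specifically of its helper, getDeepestDir()).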

Modified:
   app/AbstractCrawler.php
   app/Crawler.php

Modified: app/AbstractCrawler.php
==============================================================================

--- app/AbstractCrawler.php	Mon Oct 21 20:35:08 2013	(r1028)
+++ app/AbstractCrawler.php	Mon Oct 21 21:20:24 2013	(r1029)
@@ -19,15 +19,10 @@
   protected $rawPagePath;
   protected $parsedTextPath;
 
-  protected $currentLocation;
-
   protected $urlResource;
   protected $directoryIndexFile;
   protected $indexFileExt;
 
-  protected $domainsList;
-
-
   function __construct() {
     $this->plainText = '';
     $this->pageContent = '';
@@ -125,7 +120,7 @@
     //$nextLink = null;
     try {
       //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
-      $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
+      $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl not in (Select url from CrawledPage);")->find_one();
         
       if ($nextLink != null) {
         
@@ -219,37 +214,30 @@
 
     //sterge slash-uri in plus si directory index file
     $canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);
-    
-    if (!strstr($url, $this->currentLocation)) return;    
 
-    Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+    $rec = util_parseUtf8Url($canonicalUrl);
+    if ($rec['host'] == $this->getDomain($url)) {
+      Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+    }
   }
 
   function isRelativeLink($url) {
     return !strstr($url, "http");
   }
 
-  //cauta directorul link-ului curent si returneaza
-  //url-ul spre acel director
+  // Cauta directorul link-ului curent si returneaza url-ul spre acel director
+  // Returnează întregul URL dacă nu există un director.
   function getDeepestDir($url) {
-
-    try {
-      $retVal = substr($url, 0, strrpos($url,'/'));
-
-      if (strstr($retVal, $this->currentLocation))
-        return $retVal;
-      else return $url;
-    }
-    catch(Exception $ex) {
-
-      exceptionLog($ex);
+    $parts = explode('//', $url, 2); // Salvează protocolul
+    $pos = strrpos($parts[1], '/');
+    if ($pos !== false) {
+      $parts[1] = substr($parts[1], 0, $pos);
     }
-    return $url;
+		return implode('//', $parts);    
   }
 
   function makeAbsoluteLink($url) {
-
-    return $this->getDeepestDir($this->currentUrl) .'/'. $url;
+    return $this->getDeepestDir($this->currentUrl) . '/' . $url;
   }
 
   function getDomain($url) {
@@ -260,9 +248,6 @@
 
   // Clasele care deriva aceasta clasa vor trebui sa implementeze metodele de mai jos
   abstract function extractText($domNode);
-
-  abstract function crawlDomain();
-
   abstract function start();
 }
 

Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php	Mon Oct 21 20:35:08 2013	(r1028)
+++ app/Crawler.php	Mon Oct 21 21:20:24 2013	(r1029)
@@ -50,7 +50,7 @@
     }
   }
 
-  function crawlDomain() {
+  function crawlLoop() {
 
     Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
 
@@ -66,6 +66,7 @@
       $pageContent = $this->getPage($url);
       //setam url-ul curent pentru store in Database
       $this->currentUrl = $url;
+      $this->urlResource = util_parseUtf8Url($url);
       $links = $this->processPage($pageContent);
 
       $this->setStorePageParams();
@@ -92,40 +93,17 @@
 
 
   function start() {
-  
     Applog::log("Crawler started");
 
-    $this->domainsList = Config::get('crawler.whiteList');
-
-    //start processing 
-    $this->processWhiteList();
-
-    Applog::log('Crawler finished');
-  }
-
-
-  function processWhiteList() {
-    foreach($this->domainsList as $startUrl) {
-      $startUrl = trim($startUrl);
-
-      //curatam url-ul
-      $this->currentUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
-      //impartim url-ul pe componente
-      $this->urlResource = util_parseUtf8Url($this->currentUrl);
-
-      //salvam startUrl in tabelul Link pentru a incepe extragerea,
-      //startUrl nu va avea o pagina din care este descoperit
-      //asa ca pagina crawledPageId va avea valoarea 0.
-      Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
-
-      //locatia curenta, va fi folosita pentru a nu depasi sfera
-      //de exemplu vrem sa crawlam doar o anumita zona a site-ului
-      $this->currentLocation = substr($this->currentUrl, 0);
-      Applog::log('domain start location: '.$this->currentLocation);
-
-      $this->crawlDomain();
+    // Salvam întregul whiteList in tabelul Link pentru a incepe extragerea.
+    // Aceste URL-uri nu vor avea o pagina din care sunt descoperite, deci crawledPageId va avea valoarea 0.
+    foreach (Config::get('crawler.whiteList') as $startUrl) {
+      $startUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
+      $rec = util_parseUtf8Url($startUrl);
+      Link::saveLink2DB($startUrl, $rec['host'], 0);
     }
-    
+
+    $this->crawlLoop();
   }
 
 }