[dev] [commit] r1028 - app patches phplib phplib/models

automailer at dexonline.ro
Mon Oct 21 20:35:08 EEST 2013


Author: cata
Date: Mon Oct 21 20:35:08 2013
New Revision: 1028

Log:
* CrawledPage now handles both saving to the database and saving to disk, which makes save operations atomic
* This required some reordering in Crawler.php so that plainText is available when savePage2DB is invoked
* Moved the URL cleanup code to StringUtil and rewrote the duplicate-slash removal code
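
For reference, the new call flow in Crawler.php (condensed from the diff
below): processPage() now computes plainText and collects the links before the
page is saved, so both the raw page and the extracted text can be handed to
savePage2DB, which writes the files and the database row in one place.

  $links = $this->processPage($pageContent);   // fills $this->plainText
  $this->setStorePageParams();
  $this->currentPageId = CrawledPage::savePage2DB(
    $this->currentUrl,      // url
    $this->httpResponse(),  // HTTP status
    $this->pageContent,     // raw HTML, written to $rawPagePath
    $this->plainText,       // extracted text, written to $parsedTextPath
    $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);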

Added:
   patches/00103.sql
Modified:
   app/AbstractCrawler.php
   app/Crawler.php
   phplib/StringUtil.php
   phplib/models/CrawledPage.php
   phplib/util.php

Modified: app/AbstractCrawler.php
==============================================================================
--- app/AbstractCrawler.php	Mon Oct 21 13:24:29 2013	(r1027)
+++ app/AbstractCrawler.php	Mon Oct 21 20:35:08 2013	(r1028)
@@ -1,372 +1,269 @@
-<?php
-/*
- * Alin Ungureanu, 2013
- * alyn.cti at gmail.com
- */
-require_once __DIR__ . '/../phplib/util.php';
-require_once util_getRootPath() . 'phplib/simple_html_dom.php';
-
-require_once util_getRootPath() . 'phplib/AppLog.php';
-require_once util_getRootPath() . 'phplib/MemoryManagement.php';
-
-
-db_init();
-
-abstract class AbstractCrawler {
-
-	protected $ch;
-	protected $pageContent;
-	protected $plainText;
-	protected $info;
-	protected $currentUrl;
-	protected $currentTimestamp;
-	protected $currentPageId;
-	protected $rawPagePath;
-	protected $parsedTextPath;
-
-	protected $currentLocation;
-
-	protected $urlResource;
-	protected $directoryIndexFile;
-	protected $indexFileExt;
-
-	protected $domainsList;
-
-
-	function __construct() {
-
-		$this->plainText = '';
-		$this->pageContent = '';
-		$this->directoryIndexFile = Config::get('crawler.dir_index_file');
-		$this->indexFileExt = explode(',', Config::get('crawler.index_file_ext'));
-		$this->fileExt = explode(',', Config::get('crawler.index_file_ext').',txt');
-	}
-
-
-	//descarca pagina de la $url
-	function getPage($url) {
-
-		$this->ch = curl_init();
-		Applog::log("User agent is: " . Config::get('crawler.user_agent'));
-		curl_setopt ($this->ch, CURLOPT_URL, $url);
-		curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
-		curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
-		curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
-		curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
-		curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
-		curl_setopt($this->ch, CURLOPT_COOKIEFILE, 'cookie_jar');
-		curl_setopt ($this->ch, CURLOPT_REFERER, $url);
-		$this->pageContent = curl_exec($this->ch);
-		$this->info = curl_getinfo($this->ch);
-
-		if(!curl_errno($this->ch)) {
- 			
- 			$this->info = curl_getinfo($this->ch);
-		}
-		else{
-
-			$this->info = array('http_code' => 404);
-		}
-
-		curl_close( $this->ch);
-
-		return $this->pageContent;
-	}
-
-
-    //returneaza tipul continutului paginii
-    function getUrlMimeType($buffer) {
-
-	    $finfo = new finfo(FILEINFO_MIME_TYPE);
-	    return $finfo->buffer($buffer);
-	}
-	//verifica daca continutul paginii e html, nu alt fisier
-	function isHtml($buffer) {
-
-		Applog::log("PAGE TYPE=".$this->getUrlMimeType($buffer));
-
-		return strstr($this->getUrlMimeType($buffer), 'html');
-	}
-
-	
-	//seteaza locatia unde vor fi salvate fisierele html raw si clean text
-	function setStorePageParams() {
-
-		$this->currentTimestamp = date("Y-m-d H:i:s");
-		$this->rawPagePath = Config::get('crawler.raw_page_path')
-			.$this->urlResource['host'] .'/'. $this->currentTimestamp;
-		$this->parsedTextPath = Config::get('crawler.parsed_text_path')
-			.$this->urlResource['host'] .'/'. $this->currentTimestamp;
-	}
-
-	//verifica daca pagina poate fi descarcata si daca e HTML
-	function pageOk() {
-
-		Applog::log("HTTP CODE " .$this->httpResponse());
-		//verifica codul HTTP
-		if ($this->httpResponse() >= 400) {
-				Applog::log("HTTP Error, URL Skipped");
-				return false;
-		}
-		//verifica daca pagina e HTML
-		if (!$this->isHtml($this->pageContent)) {
-
-				Applog::log("Page not HTML, URL Skipped");
-				return false;
-		}
-
-		return true;
-	}
-	
-	/*
-	 * Salveaza pagina in format raw si clean text in fisiere 
-	 */
-	function saveCurrentPage() {
-
-
-		try {
-			if (!file_exists(Config::get('crawler.raw_page_path').$this->urlResource['host'])) {
-				mkdir(Config::get('crawler.raw_page_path').$this->urlResource['host'], 0777, true);
-			}
-			if (!file_exists(Config::get('crawler.parsed_text_path').$this->urlResource['host'])) {
-				mkdir(Config::get('crawler.parsed_text_path').$this->urlResource['host'], 0777, true);
-			}
-			//salveaza pagina raw pe disk
-			file_put_contents($this->rawPagePath, $this->pageContent);
-			//converteste simbolurile HTML in format text si elimina din spatii.
-			$this->plainText = preg_replace("/  /", "", html_entity_decode($this->plainText));
-			//salveaza textul extras pe disk
-			file_put_contents($this->parsedTextPath, $this->plainText);
-		}
-		catch(Exception $ex) {
-
-			Applog::exceptionLog($ex);
-		}
-	}
-
-	//returneaza codul HTTP
-	function httpResponse() {
-
-		return $this->info['http_code'];
-	}
-
-	//returneaza urmatorul URL ne crawl-at din baza de date sau null daca nu exista
-    function getNextLink() {
-
-
-    	//$nextLink = null;
-    	try {
-	    	//$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
-	    	$nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
-	    	
-	    	if ($nextLink != null) {
-	    	
-	    		return $nextLink->canonicalUrl;
-	    	}
-	    }
-	    catch(Exception $ex) {
-
-	    	Applog::exceptionLog($ex);
-	    }
-
-	    return null;
-    }
-
-    //repara HTML-ul stricat intr-un mod minimal astfel incat
-    //sa poata fi interpretat de biblioteca simple_html_dom
-    function fixHtml($html) {
-
-    	foreach($html->find('head') as $script) {
-
-			$script->outertext = '';
-		}
-
-    	foreach($html->find('script') as $script) {
-
-			$script->outertext = '';
-		}
-
-		foreach($html->find('style') as $style) {
-
-			$style->outertext = '';
-		}
-
-		$html->load($html->save());
-		
-		//transforma pagina raw in simple_html_dom_node
-		//$this->dom = str_get_html($pageContent);
-		
-		$buffer = '<html><body>';
-		$nodes = $html->childNodes();
-		foreach($nodes as $node) {
-
-			$buffer .= $node->innertext();
-		}
-
-		$buffer .= '</body></html>';
-
-		return str_get_html($buffer);
-    }
-
-    function eligibleUrl($url) {
-
-    	$resource = parse_utf8_url($url);
-    	$pathInfo = pathinfo($resource['path']);
-
-    	if (isset($pathInfo['extension'])) {
-
-    		$ext = $pathInfo['extension'];
-
-
-    		if (array_search($ext, $this->fileExt) === false) {
-
-    			return false;
-    		}
-    	}
-
-    	return true;
-    }
-
-    //metode pentru prelucrarea linkurilor
-	//sterge directory index file si elimina slash-urile in plus
-	//gaseste toate linkurile
-	//le transforma in absolute daca sunt relative
-	function processLink($url) {
-
-
-		if (!$this->eligibleUrl($url)) {
-
-			return;
-		}
-
-		Applog::log('Processing link: '.$url);
-		$canonicalUrl = null;
-		if ($this->isRelativeLink($url)) {
-
-			$url = $this->makeAbsoluteLink($url);
-		}
-		//daca ultimul caracter este '/', il eliminam
-		//exemplu wiki.dexonline.ro nu wiki.dexonline.ro/
-		if (substr($url, -1) == "/") $url = substr($url, 0, -1);
-
-		//sterge slash-uri in plus si directory index file
-		$canonicalUrl = $this->urlPadding($url);
-		
-		if (!strstr($url, $this->currentLocation)) return;		
-
-		Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
-	}
-
-
-	function urlPadding($url) {
-
-		return $this->delDuplicateSlashes($this->delDirIndexFile($url));
-	}
-
-
-	//delestes index.php/html/pl/py/jsp  etc
-	function delDirIndexFile($url) {
-
-		//Applog::log('delDirIndexFile  '.$url);
-
-		foreach($this->indexFileExt as $ext) {
-
-			$target = $this->directoryIndexFile .'.'. $ext;
-
-			if (strstr($url, $target))
-				return str_replace($target, "", $url);
-		}
-
-		return $url;
-	}
-
-	//deletes slashes when not needed
-	function delDuplicateSlashes($url) {
-
-		if (strlen($url) < 5) {
-
-			Applog::log("whatup with delDuplicateSlashes: $url");
-			return $this->currentUrl;
-		}
-		
-
-		$parsedUrl = parse_utf8_url($url);
-		
-
-		if (substr_count($parsedUrl['host'], '.') < 2) {
-
-			$parsedUrl['host'] = 'www.'.$parsedUrl['host'];
-		}
-
-		$retUrl = $parsedUrl['scheme'].'://'.$parsedUrl['host'];
-		$consecutiveSlash = false;
-
-		$url = substr($url, strlen($retUrl));
-
-		for ($i = 0; $i < strlen($url); ++$i) {
-			$nextCh = substr($url, $i, 1);
-
-			if ($nextCh == '/' && !$consecutiveSlash) {
-
-				$retUrl .= $nextCh;
-				$consecutiveSlash = true;
-			}
-			else if ($nextCh == '/') {}
-			else {
-				$retUrl .= $nextCh;
-				$consecutiveSlash = false;
-			}
-		}
-
-		//eliminarea slash-ului final
-	
-		if (substr($retUrl, -1) == "/") $retUrl = substr($retUrl, 0, -1);
-
-		return $retUrl;
-	}
-
-
-	function isRelativeLink($url) {
-
-		return !strstr($url, "http");
-	}
-
-	//cauta directorul link-ului curent si returneaza
-	//url-ul spre acel director
-	function getDeepestDir($url) {
-
-		try {
-			$retVal = substr($url, 0, strrpos($url,'/'));
-
-			if (strstr($retVal, $this->currentLocation))
-				return $retVal;
-			else return $url;
-		}
-		catch(Exception $ex) {
-
-			exceptionLog($ex);
-		}
-		return $url;
-	}
-
-	function makeAbsoluteLink($url) {
-
-		return $this->getDeepestDir($this->currentUrl) .'/'. $url;
-	}
-
-	function getDomain($url) {
-
-		return $this->urlResource['host'];
-	}
-
-
-	//Clasele care deriva aceasta clasa vor trebui
-	//sa implementeze metodele de mai jos
-	abstract function extractText($domNode);
-
-	abstract function crawlDomain();
-
-	abstract function start();
-}
-
-?>1
\ No newline at end of file
+<?php
+/*
+ * Alin Ungureanu, 2013
+ * alyn.cti at gmail.com
+ */
+require_once __DIR__ . '/../phplib/util.php';
+require_once util_getRootPath() . 'phplib/simple_html_dom.php';
+require_once util_getRootPath() . 'phplib/AppLog.php';
+require_once util_getRootPath() . 'phplib/MemoryManagement.php';
+
+abstract class AbstractCrawler {
+  protected $ch;
+  protected $pageContent;
+  protected $plainText;
+  protected $info;
+  protected $currentUrl;
+  protected $currentTimestamp;
+  protected $currentPageId;
+  protected $rawPagePath;
+  protected $parsedTextPath;
+
+  protected $currentLocation;
+
+  protected $urlResource;
+  protected $directoryIndexFile;
+  protected $indexFileExt;
+
+  protected $domainsList;
+
+
+  function __construct() {
+    $this->plainText = '';
+    $this->pageContent = '';
+    $this->directoryIndexFile = Config::get('crawler.dir_index_file');
+    $this->indexFileExt = explode(',', Config::get('crawler.index_file_ext'));
+    $this->fileExt = explode(',', Config::get('crawler.index_file_ext').',txt');
+  }
+
+
+  // downloads the page at $url
+  function getPage($url) {
+
+    $this->ch = curl_init();
+    Applog::log("User agent is: " . Config::get('crawler.user_agent'));
+    curl_setopt ($this->ch, CURLOPT_URL, $url);
+    curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
+    curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
+    curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
+    curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
+    curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
+    curl_setopt($this->ch, CURLOPT_COOKIEFILE, 'cookie_jar');
+    curl_setopt ($this->ch, CURLOPT_REFERER, $url);
+    $this->pageContent = curl_exec($this->ch);
+
+    if (!curl_errno($this->ch)) {
+      $this->info = curl_getinfo($this->ch);
+    } else {
+      // on a curl error, report a 404 so pageOk() skips this URL
+      $this->info = array('http_code' => 404);
+    }
+
+    curl_close($this->ch);
+
+    return $this->pageContent;
+  }
+
+
+  // returns the MIME type of the page content
+  function getUrlMimeType($buffer) {
+
+    $finfo = new finfo(FILEINFO_MIME_TYPE);
+    return $finfo->buffer($buffer);
+  }
+  // checks that the page content is HTML, not some other file type
+  function isHtml($buffer) {
+
+    Applog::log("PAGE TYPE=".$this->getUrlMimeType($buffer));
+
+    return strstr($this->getUrlMimeType($buffer), 'html');
+  }
+
+  
+  // sets the paths where the raw HTML and the clean text will be saved
+  function setStorePageParams() {
+
+    $this->currentTimestamp = date("Y-m-d H:i:s");
+    $this->rawPagePath = Config::get('crawler.raw_page_path')
+      .$this->urlResource['host'] .'/'. $this->currentTimestamp;
+    $this->parsedTextPath = Config::get('crawler.parsed_text_path')
+      .$this->urlResource['host'] .'/'. $this->currentTimestamp;
+  }
+
+  // checks that the page was downloaded successfully and is HTML
+  function pageOk() {
+
+    Applog::log("HTTP CODE " .$this->httpResponse());
+    // check the HTTP status code
+    if ($this->httpResponse() >= 400) {
+      Applog::log("HTTP Error, URL Skipped");
+      return false;
+    }
+    // check that the page is HTML
+    if (!$this->isHtml($this->pageContent)) {
+
+      Applog::log("Page not HTML, URL Skipped");
+      return false;
+    }
+
+    return true;
+  }
+  
+  // returns the HTTP status code
+  function httpResponse() {
+
+    return $this->info['http_code'];
+  }
+
+  // returns the next uncrawled URL from the database, or null if there is none
+  function getNextLink() {
+
+
+    //$nextLink = null;
+    try {
+      //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
+      $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
+        
+      if ($nextLink != null) {
+        
+        return $nextLink->canonicalUrl;
+      }
+    }
+    catch(Exception $ex) {
+
+      Applog::exceptionLog($ex);
+    }
+
+    return null;
+  }
+
+  // minimally repairs broken HTML so that the simple_html_dom library can parse it
+  function fixHtml($html) {
+
+    foreach($html->find('head') as $script) {
+
+      $script->outertext = '';
+    }
+
+    foreach($html->find('script') as $script) {
+
+      $script->outertext = '';
+    }
+
+    foreach($html->find('style') as $style) {
+
+      $style->outertext = '';
+    }
+
+    $html->load($html->save());
+    
+    // turn the raw page into a simple_html_dom_node
+    //$this->dom = str_get_html($pageContent);
+    
+    $buffer = '<html><body>';
+    $nodes = $html->childNodes();
+    foreach($nodes as $node) {
+
+      $buffer .= $node->innertext();
+    }
+
+    $buffer .= '</body></html>';
+
+    return str_get_html($buffer);
+  }
+
+  function eligibleUrl($url) {
+
+    $resource = util_parseUtf8Url($url);
+    $pathInfo = pathinfo($resource['path']);
+
+    if (isset($pathInfo['extension'])) {
+
+      $ext = $pathInfo['extension'];
+
+
+      if (array_search($ext, $this->fileExt) === false) {
+
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  // link processing: delete the directory index file, remove extra
+  // slashes, and convert relative links to absolute ones
+  function processLink($url) {
+
+
+    if (!$this->eligibleUrl($url)) {
+
+      return;
+    }
+
+    Applog::log('Processing link: '.$url);
+    $canonicalUrl = null;
+    if ($this->isRelativeLink($url)) {
+
+      $url = $this->makeAbsoluteLink($url);
+    }
+    // if the last character is '/', remove it
+    // e.g. wiki.dexonline.ro, not wiki.dexonline.ro/
+    if (substr($url, -1) == "/") $url = substr($url, 0, -1);
+
+    // remove extra slashes and the directory index file
+    $canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);
+    
+    if (!strstr($url, $this->currentLocation)) return;    
+
+    Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+  }
+
+  function isRelativeLink($url) {
+    return !strstr($url, "http");
+  }
+
+  // finds the directory of the current link and returns the URL of that directory
+  function getDeepestDir($url) {
+
+    try {
+      $retVal = substr($url, 0, strrpos($url,'/'));
+
+      if (strstr($retVal, $this->currentLocation))
+        return $retVal;
+      else return $url;
+    }
+    catch(Exception $ex) {
+
+      exceptionLog($ex);
+    }
+    return $url;
+  }
+
+  function makeAbsoluteLink($url) {
+
+    return $this->getDeepestDir($this->currentUrl) .'/'. $url;
+  }
+
+  function getDomain($url) {
+
+    return $this->urlResource['host'];
+  }
+
+
+  // Derived classes must implement the methods below
+  abstract function extractText($domNode);
+
+  abstract function crawlDomain();
+
+  abstract function start();
+}
+
+?>

Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php	Mon Oct 21 13:24:29 2013	(r1027)
+++ app/Crawler.php	Mon Oct 21 20:35:08 2013	(r1028)
@@ -7,130 +7,126 @@
 
 class Crawler extends AbstractCrawler {
 
-	//extrage textul fara cod html
-	function getText($domNode) {
-		
-		$this->plainText = html_entity_decode(strip_tags($domNode->text()));
-		//$this->plainText = str_replace(array('\t','\n',' ', ' '), array('','.','',''),strip_tags($domNode->text()));
-	}
-	//extrage textul cu cod html din nodul respectiv
-	function extractText($domNode) {
-
-		Applog::log("extracting text");
-		$this->getText($domNode);
-
-		foreach($domNode->find("a") as $link) {
-
-			$this->processLink($link->href);
-		}
-	}
-
-	function processPage($pageContent) {
-
-		try {
-			
-			$html = str_get_html($pageContent);
-
-			//reparam html stricat
-			if (!$html->find('body', 0, true)) {
-
-				$html = $this->fixHtml($html);
-			}
-			
-
-			$this->extractText($html->find('body', 0, true));
-			$this->saveCurrentPage();
-			
-			//cata memorie consuma
-			//si eliberare referinte pierdute
-			
-			$html->clear();
-
-			MemoryManagement::showUsage('before cleaning', true, 'KB');
-			
-			MemoryManagement::clean(true);
-
-			MemoryManagement::showUsage('after cleaning', true, 'KB');
-			//niceness
-			sleep(Config::get('crawler.t_wait'));
-		}
-		catch (Exception $ex) {
-
-			Applog::exceptionLog($ex);
-		}
-	}
-
-	function crawlDomain() {
-
-		Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
-
-		while(1) {
-
-			//extrage urmatorul link neprelucrat din baza de date
-			$url = $this->getNextLink();
-			Applog::log('current URL: ' . $url);
-			//daca s-a terminat crawling-ul
-			if ($url == null || $url == '') break;
-
-			//download pagina
-			$pageContent = $this->getPage($url);
-			//setam url-ul curent pentru store in Database
-			$this->currentUrl = $url;
-
-			$this->setStorePageParams();
-
-			//salveaza o intrare despre pagina curenta in baza de date
-			$this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
-			
-			//daca pagina nu e in format html (e imagine sau alt fisier)
-			//sau daca am primit un cod HTTP de eroare, sarim peste pagina acesta
-			if (!$this->pageOk()) {
-				continue;
-			}
-			
-			$this->processPage($pageContent);
-		}
-
-		Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
-	}
-
-
-	function start() {
-	
-		Applog::log("Crawler started");
-
-		$this->domainsList = Config::get('crawler.whiteList');
-
-		//start processing 
-		$this->processWhiteList();
-
-		Applog::log('Crawler finished');
-	}
-
-
-	function processWhiteList() {
-		foreach($this->domainsList as $startUrl) {
-			$startUrl = trim($startUrl);
-
-			//curatam url-ul
-			$this->currentUrl = $this->urlPadding($startUrl);
-			//impartim url-ul pe componente
-			$this->urlResource = parse_utf8_url($this->currentUrl);
-
-			//salvam startUrl in tabelul Link pentru a incepe extragerea,
-			//startUrl nu va avea o pagina din care este descoperit
-			//asa ca pagina crawledPageId va avea valoarea 0.
-			Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
-
-			//locatia curenta, va fi folosita pentru a nu depasi sfera
-			//de exemplu vrem sa crawlam doar o anumita zona a site-ului
-			$this->currentLocation = substr($this->currentUrl, 0);
-			Applog::log('domain start location: '.$this->currentLocation);
-
-			$this->crawlDomain();
-		}
-		
-	}
+  // extracts the plain text (stripped of HTML) from the given DOM node
+  function extractText($domNode) {
+    Applog::log("extracting text");
+    $this->plainText = html_entity_decode(strip_tags($domNode->text()));
+    $this->plainText = preg_replace("/  +/", " ", $this->plainText);
+  }
+
+  /* Returns an array of links */
+  function processPage($pageContent) {
+    try {
+      $links = array();
+      $html = str_get_html($pageContent);
+
+      // repair broken HTML
+      if (!$html->find('body', 0, true)) {
+
+        $html = $this->fixHtml($html);
+      }
+      
+
+      $body = $html->find('body', 0, true);
+      $this->extractText($body);
+      foreach ($body->find("a") as $link) {
+        $links[] = $link->href;
+      }
+      // report memory usage and free lost references
+      
+      $html->clear();
+
+      MemoryManagement::showUsage('before cleaning', true, 'KB');
+      
+      MemoryManagement::clean(true);
+
+      MemoryManagement::showUsage('after cleaning', true, 'KB');
+      return $links;
+    }
+    catch (Exception $ex) {
+      Applog::exceptionLog($ex);
+    }
+    // if an exception was caught above, return an empty list rather than null
+    return array();
+  }
+
+  function crawlDomain() {
+
+    Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
+
+    while (1) {
+
+      // fetch the next unprocessed link from the database
+      $url = $this->getNextLink();
+      Applog::log('current URL: ' . $url);
+      // stop when crawling is done
+      if ($url == null || $url == '') break;
+
+      // download the page
+      $pageContent = $this->getPage($url);
+      // set the current url so it can be stored in the database
+      $this->currentUrl = $url;
+      $links = $this->processPage($pageContent);
+
+      $this->setStorePageParams();
+
+      // save the current page to the database and its contents to disk
+      $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->pageContent, $this->plainText, $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
+
+      // skip this page if it is not HTML (an image or some other file)
+      // or if we received an HTTP error code
+      if (!$this->pageOk()) {
+        continue;
+      }
+      
+      foreach($links as $link) {
+        $this->processLink($link);
+      }
+
+      //niceness
+      sleep(Config::get('crawler.t_wait'));
+    }
+
+    Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
+  }
+
+
+  function start() {
+  
+    Applog::log("Crawler started");
+
+    $this->domainsList = Config::get('crawler.whiteList');
+
+    //start processing 
+    $this->processWhiteList();
+
+    Applog::log('Crawler finished');
+  }
+
+
+  function processWhiteList() {
+    foreach($this->domainsList as $startUrl) {
+      $startUrl = trim($startUrl);
+
+      // clean up the url
+      $this->currentUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
+      // split the url into components
+      $this->urlResource = util_parseUtf8Url($this->currentUrl);
+
+      // save startUrl in the Link table so extraction can start;
+      // startUrl has no page it was discovered from,
+      // so its crawledPageId will be 0.
+      Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
+
+      // the current location keeps the crawl in scope, e.g. when we
+      // only want to crawl a certain area of the site
+      $this->currentLocation = $this->currentUrl;
+      Applog::log('domain start location: '.$this->currentLocation);
+
+      $this->crawlDomain();
+    }
+    
+  }
 
 }
 
@@ -139,9 +135,9 @@
  */
 if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
 
-	$obj = new Crawler();
+  $obj = new Crawler();
 
-	$obj->start();
+  $obj->start();
 }
 
-?>
\ No newline at end of file
+?>

Added: patches/00103.sql
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ patches/00103.sql	Mon Oct 21 20:35:08 2013	(r1028)
@@ -0,0 +1,3 @@
+alter table CrawledPage add key(url);
+alter table CrawledPage add createDate int not null, add modDate int not null;
+alter table Link add createDate int not null, add modDate int not null;
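
(The new url key presumably speeds up the "url not in (select url from
CrawledPage)" subquery in getNextLink(); createDate and modDate are presumably
the integer timestamps maintained for DatedObject models.)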

Modified: phplib/StringUtil.php
==============================================================================
--- phplib/StringUtil.php	Mon Oct 21 13:24:29 2013	(r1027)
+++ phplib/StringUtil.php	Mon Oct 21 20:35:08 2013	(r1028)
@@ -300,6 +300,44 @@
   static function explode($delimiter, $s) {
     return array_values(array_filter(explode($delimiter, $s), 'strlen'));
   }
+
+  /**
+   * Cleans up a URL in various ways:
+   * - trims any known index files and extensions (passed as arguments);
+   * - replaces consecutive slashes with a single slash;
+   * - trims any final slashes.
+   * Assumes the URL includes a protocol.
+   * @param $url URL to clean up
+   * @param $indexFile Index file name (without extension)
+   * @param $indexExt Array of index file extensions
+   **/
+  static function urlCleanup($url, $indexFile, $indexExt) {
+    // scroll through the extension list until we find one that matches
+    foreach ($indexExt as $ext) {
+      $target = $indexFile . '.' . $ext;
+      if (self::endsWith($url, $target)) {
+        $url = substr($url, 0, -strlen($target));
+        break;
+      }
+    }
+
+    // save the protocol first
+    $parts = explode('//', $url, 2);
+
+    // replace //+ by /
+    $parts[1] = preg_replace('#//+#', '/', $parts[1]);
+
+    // delete any trailing slashes
+    $parts[1] = rtrim($parts[1], '/');
+
+    // reassemble and return the URL
+    return implode('//', $parts);
+  }
+
 }
 
 ?>
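
A quick usage sketch of the new helper (the argument values are hypothetical,
mirroring the crawler.dir_index_file and crawler.index_file_ext config
entries):

  $url = StringUtil::urlCleanup('http://wiki.dexonline.ro//a//index.php',
                                'index', array('php', 'html'));
  // $url is now 'http://wiki.dexonline.ro/a'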

Modified: phplib/models/CrawledPage.php
==============================================================================
--- phplib/models/CrawledPage.php	Mon Oct 21 13:24:29 2013	(r1027)
+++ phplib/models/CrawledPage.php	Mon Oct 21 20:35:08 2013	(r1028)
@@ -2,44 +2,45 @@
 
 
 class CrawledPage  extends BaseObject implements DatedObject {
-	
-	public static $_table = 'CrawledPage';
+  
+  public static $_table = 'CrawledPage';
 
-	//salveaza informatiile despre pagina curent crawl-ata in tabelul CrawledPage
-	public static function savePage2DB($url, $httpStatus, $rawPagePath, $parsedTextPath, $timestamp) {
-
-		try {
-			$tableObj = Model::factory(self::$_table);
-			$tableObj->create();
-			$tableObj->timestamp = $timestamp;
-			$tableObj->url = $url;
-			$tableObj->httpStatus = $httpStatus;
-			$tableObj->rawPagePath = $rawPagePath;
-			$tableObj->parsedTextPath = $parsedTextPath;
-			$tableObj->save();
-
-			return $tableObj->id;
-		}
-		catch(Exception $ex) {
-
-			AppLog::exceptionLog($ex);
-		}
-		return null;
-	}
-
-	//intoarce o lista cu domeniile parsate
-	public static function getListOfDomains() {
-
-		//return Model::factory(self::$_table)->raw_query("select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage group by domain order by id asc;")->find_many();
-		return Model::factory(self::$_table)->raw_query("select id, domain from
-			 (select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage order by id desc) alias1 group by domain order by id asc;")->find_many();
-	}
-
-	 function getNextDiacriticsFile() {
-
-	 	return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);")->find_one();
-	 }
-	
+  // Saves the record of the currently crawled page to the CrawledPage table and its contents to disk
+  public static function savePage2DB($url, $httpStatus, $rawPage, $parsedText, $rawPagePath, $parsedTextPath, $timestamp) {
+    // create the target directories if needed; @ silences warnings when they already exist
+    @mkdir(dirname($rawPagePath), 0777, true);
+    @mkdir(dirname($parsedTextPath), 0777, true);
+    file_put_contents($rawPagePath, $rawPage);
+    file_put_contents($parsedTextPath, $parsedText);
+
+    try {
+      $tableObj = Model::factory(self::$_table);
+      $tableObj->create();
+      $tableObj->timestamp = $timestamp;
+      $tableObj->url = $url;
+      $tableObj->httpStatus = $httpStatus;
+      $tableObj->rawPagePath = $rawPagePath;
+      $tableObj->parsedTextPath = $parsedTextPath;
+      $tableObj->save();
+
+      return $tableObj->id;
+    } catch(Exception $ex) {
+      AppLog::exceptionLog($ex);
+    }
+    return null;
+  }
+
+  // returns the list of parsed domains
+  public static function getListOfDomains() {
+
+    //return Model::factory(self::$_table)->raw_query("select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage group by domain order by id asc;")->find_many();
+    return Model::factory(self::$_table)->raw_query("select id, domain from
+       (select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage order by id desc) alias1 group by domain order by id asc;")->find_many();
+  }
+
+  function getNextDiacriticsFile() {
+    return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);")->find_one();
+  }
+  
 
 }
 

Modified: phplib/util.php
==============================================================================
--- phplib/util.php	Mon Oct 21 13:24:29 2013	(r1027)
+++ phplib/util.php	Mon Oct 21 20:35:08 2013	(r1028)
@@ -491,18 +491,20 @@
   return $result;
 }
 
-function parse_utf8_url($url)
-{
-    static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
-    if (is_string($url) && preg_match(
-            '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
-            '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
-        foreach ($matches as $key => $value)
-            if (!isset($keys[$key]) || empty($value))
-                unset($matches[$key]);
-        return $matches;
+/** Kudos http://www.php.net/manual/pt_BR/function.parse-url.php#107291 **/
+function util_parseUtf8Url($url) {
+  static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
+  if (is_string($url) && preg_match(
+        '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
+        '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
+    foreach ($matches as $key => $value) {
+      if (!isset($keys[$key]) || empty($value)) {
+        unset($matches[$key]);
+      }
     }
-    return false;
+    return $matches;
+  }
+  return false;
 }
 
 ?>
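
Apart from the rename, behavior is unchanged; a minimal sketch (the example
URL is hypothetical):

  $parts = util_parseUtf8Url('http://www.example.ro/o/cale/cu/diacritice?q=ă');
  // $parts['scheme'] == 'http', $parts['host'] == 'www.example.ro',
  // $parts['path'] == '/o/cale/cu/diacritice', $parts['query'] == 'q=ă'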

