[dev] [commit] r937 - wwwbase/Crawler

Sun Aug 18 22:48:13 EEST 2013

Author: alinu
Date: Sun Aug 18 22:48:13 2013
New Revision: 937

Log:


Modified:
   wwwbase/Crawler/AbstractCrawler.php
   wwwbase/Crawler/Crawler.php
   wwwbase/Crawler/simple_html_dom.php

Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================

--- wwwbase/Crawler/AbstractCrawler.php	Sun Aug 18 21:38:55 2013	(r936)
+++ wwwbase/Crawler/AbstractCrawler.php	Sun Aug 18 22:48:13 2013	(r937)
@@ -18,8 +18,6 @@
 
 	protected $ch;
 	protected $pageContent;
-	protected $dom;
-	protected $body;
 	protected $plainText;
 	protected $info;
 	protected $currentUrl;
@@ -339,6 +337,43 @@
 	    return null;
     }
 
+
+    function fixHtml($html) {
+
+    	foreach($html->find('head') as $script) {
+
+			$script->outertext = '';
+		}
+
+
+    	foreach($html->find('script') as $script) {
+
+			$script->outertext = '';
+		}
+
+		foreach($html->find('style') as $style) {
+
+			$style->outertext = '';
+		}
+
+		$html->load($html->save());
+		
+		//transforma pagina raw in simple_html_dom_node
+		//$this->dom = str_get_html($pageContent);
+		
+		$buffer = '<html><body>';
+		$nodes = $html->childNodes();
+		foreach($nodes as $node) {
+
+			$buffer .= $node->innertext();
+		}
+
+		$buffer .= '</body></html>';
+
+		return str_get_html($buffer);
+    }
+
+
 	//Clasele care deriva aceasta clasa vor trebui
 	//sa implementeze metodele de mai jos
 

Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php	Sun Aug 18 21:38:55 2013	(r936)
+++ wwwbase/Crawler/Crawler.php	Sun Aug 18 22:48:13 2013	(r937)
@@ -10,8 +10,9 @@
 
 	//extrage textul fara cod html
 	function getText($domNode) {
-		//dump_html_tree($domNode);
-		$this->plainText = $domNode->text();
+		
+		$this->plainText = strip_tags($domNode->text());
+		//$this->plainText = str_replace(array('\t','\n',' ', ' '), array('','.','',''),strip_tags($domNode->text()));
 	}
 	//extrage textul cu cod html din nodul respectiv
 	function extractText($domNode) {
@@ -71,16 +72,23 @@
 			}
 			
 			try {
-				//transforma pagina raw in simple_html_dom_node
-				$this->dom = str_get_html($pageContent);
-				//extrage continutul dintre tagurile <BODY> si </BODY>
-				$this->body = $this->dom->find('body', 0, true);
-				//extrage recursiv linkurile si textul din body
-				$this->extractText($this->body);
-				//salveaza pagina in 2 formate: raw html si clean text
-				$this->saveCurrentPage();
 
+
+				$html = str_get_html($pageContent);
+
+				//reparam html stricat
+				if (!$html->find('body', 0, true)) {
+
+					$html = $this->fixHtml($html);
+				}
+
+
+
+				$this->extractText($html->find('body', 0, true));
+				$this->saveCurrentPage();
+				
 				//cata memorie consuma
+				//si eliberare referinte pierdute
 				$this->manageMemory();
 				//niceness
 				sleep(pref_getSectionPreference('crawler', 't_wait'));
@@ -89,7 +97,6 @@
 
 				logException($ex);
 			}
-
 		}
 
 		crawlerLog('Finished');
@@ -105,4 +112,5 @@
 	//$obj->startCrawling("http://wiki.dexonline.ro/");
 	$obj->startCrawling("http://www.romlit.ro");
 }
+
 ?>
\ No newline at end of file

Modified: wwwbase/Crawler/simple_html_dom.php
==============================================================================
--- wwwbase/Crawler/simple_html_dom.php	Sun Aug 18 21:38:55 2013	(r936)
+++ wwwbase/Crawler/simple_html_dom.php	Sun Aug 18 22:48:13 2013	(r937)
@@ -450,7 +450,7 @@
 			// If this node is a span... add a space at the end of it so multiple spans don't run into each other.  This is plaintext after all.
 			if ($this->tag == "span")
 			{
-				$ret .= $this->dom->default_span_text;
+				$ret .= $this->dom->default_span_text.' ';
 			}