[dev] [commit] r937 - wwwbase/Crawler
automailer at dexonline.ro
automailer at dexonline.ro
Sun Aug 18 22:48:13 EEST 2013
Author: alinu
Date: Sun Aug 18 22:48:13 2013
New Revision: 937
Log:
Modified:
wwwbase/Crawler/AbstractCrawler.php
wwwbase/Crawler/Crawler.php
wwwbase/Crawler/simple_html_dom.php
Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php Sun Aug 18 21:38:55 2013 (r936)
+++ wwwbase/Crawler/AbstractCrawler.php Sun Aug 18 22:48:13 2013 (r937)
@@ -18,8 +18,6 @@
protected $ch;
protected $pageContent;
- protected $dom;
- protected $body;
protected $plainText;
protected $info;
protected $currentUrl;
@@ -339,6 +337,43 @@
return null;
}
+
+ function fixHtml($html) {
+ // Strips head/script/style from $html and returns a new DOM wrapped in <html><body>; Crawler.php calls this when find('body') finds nothing.
+ foreach($html->find('head') as $script) {
+ // Note: the loop variable is named $script here, but it holds each <head> element.
+ $script->outertext = '';
+ }
+
+ // Blank out every <script> element.
+ foreach($html->find('script') as $script) {
+
+ $script->outertext = '';
+ }
+ // Blank out every <style> element.
+ foreach($html->find('style') as $style) {
+
+ $style->outertext = '';
+ }
+ // Re-parse the saved markup; presumably so the blanked elements drop out of the tree (confirm against simple_html_dom).
+ $html->load($html->save());
+
+ //turns the raw page into a simple_html_dom_node
+ //$this->dom = str_get_html($pageContent);
+ // Concatenate the innertext of every node returned by $html->childNodes() between the wrapper tags.
+ $buffer = '<html><body>';
+ $nodes = $html->childNodes();
+ foreach($nodes as $node) {
+
+ $buffer .= $node->innertext();
+ }
+
+ $buffer .= '</body></html>';
+ // Parse the rebuilt document and return whatever str_get_html() produces.
+ return str_get_html($buffer);
+ }
+
+
//Clasele care deriva aceasta clasa vor trebui
//sa implementeze metodele de mai jos
Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php Sun Aug 18 21:38:55 2013 (r936)
+++ wwwbase/Crawler/Crawler.php Sun Aug 18 22:48:13 2013 (r937)
@@ -10,8 +10,9 @@
//extrage textul fara cod html
function getText($domNode) {
- //dump_html_tree($domNode);
- $this->plainText = $domNode->text();
+
+ $this->plainText = strip_tags($domNode->text());
+ //$this->plainText = str_replace(array('\t','\n',' ', ' '), array('','.','',''),strip_tags($domNode->text()));
}
//extrage textul cu cod html din nodul respectiv
function extractText($domNode) {
@@ -71,16 +72,23 @@
}
try {
- //transforma pagina raw in simple_html_dom_node
- $this->dom = str_get_html($pageContent);
- //extrage continutul dintre tagurile <BODY> si </BODY>
- $this->body = $this->dom->find('body', 0, true);
- //extrage recursiv linkurile si textul din body
- $this->extractText($this->body);
- //salveaza pagina in 2 formate: raw html si clean text
- $this->saveCurrentPage();
+
+ $html = str_get_html($pageContent);
+
+ //reparam html stricat
+ if (!$html->find('body', 0, true)) {
+
+ $html = $this->fixHtml($html);
+ }
+
+
+
+ $this->extractText($html->find('body', 0, true));
+ $this->saveCurrentPage();
+
//cata memorie consuma
+ //si eliberare referinte pierdute
$this->manageMemory();
//niceness
sleep(pref_getSectionPreference('crawler', 't_wait'));
@@ -89,7 +97,6 @@
logException($ex);
}
-
}
crawlerLog('Finished');
@@ -105,4 +112,5 @@
//$obj->startCrawling("http://wiki.dexonline.ro/");
$obj->startCrawling("http://www.romlit.ro");
}
+
?>
\ No newline at end of file
Modified: wwwbase/Crawler/simple_html_dom.php
==============================================================================
--- wwwbase/Crawler/simple_html_dom.php Sun Aug 18 21:38:55 2013 (r936)
+++ wwwbase/Crawler/simple_html_dom.php Sun Aug 18 22:48:13 2013 (r937)
@@ -450,7 +450,7 @@
// If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
if ($this->tag == "span")
{
- $ret .= $this->dom->default_span_text;
+ $ret .= $this->dom->default_span_text.' ';
}
More information about the Dev
mailing list