[dev] [commit] r1028 - app patches phplib phplib/models
automailer at dexonline.ro
automailer at dexonline.ro
Mon Oct 21 20:35:08 EEST 2013
Author: cata
Date: Mon Oct 21 20:35:08 2013
New Revision: 1028
Log:
* CrawledPage now handles saving a page both to the database and to disk, to make the save operation atomic
* This required some reordering of crawlDomain() in Crawler.php so that plainText is already available when savePage2DB is invoked.
* Moved the URL cleanup code to StringUtil::urlCleanup() and rewrote the duplicate-slash removal code
Added:
patches/00103.sql
Modified:
app/AbstractCrawler.php
app/Crawler.php
phplib/StringUtil.php
phplib/models/CrawledPage.php
phplib/util.php
Modified: app/AbstractCrawler.php
==============================================================================
--- app/AbstractCrawler.php Mon Oct 21 13:24:29 2013 (r1027)
+++ app/AbstractCrawler.php Mon Oct 21 20:35:08 2013 (r1028)
@@ -1,372 +1,269 @@
-<?php
-/*
- * Alin Ungureanu, 2013
- * alyn.cti at gmail.com
- */
-require_once __DIR__ . '/../phplib/util.php';
-require_once util_getRootPath() . 'phplib/simple_html_dom.php';
-
-require_once util_getRootPath() . 'phplib/AppLog.php';
-require_once util_getRootPath() . 'phplib/MemoryManagement.php';
-
-
-db_init();
-
-abstract class AbstractCrawler {
-
- protected $ch;
- protected $pageContent;
- protected $plainText;
- protected $info;
- protected $currentUrl;
- protected $currentTimestamp;
- protected $currentPageId;
- protected $rawPagePath;
- protected $parsedTextPath;
-
- protected $currentLocation;
-
- protected $urlResource;
- protected $directoryIndexFile;
- protected $indexFileExt;
-
- protected $domainsList;
-
-
- function __construct() {
-
- $this->plainText = '';
- $this->pageContent = '';
- $this->directoryIndexFile = Config::get('crawler.dir_index_file');
- $this->indexFileExt = explode(',', Config::get('crawler.index_file_ext'));
- $this->fileExt = explode(',', Config::get('crawler.index_file_ext').',txt');
- }
-
-
- //descarca pagina de la $url
- function getPage($url) {
-
- $this->ch = curl_init();
- Applog::log("User agent is: " . Config::get('crawler.user_agent'));
- curl_setopt ($this->ch, CURLOPT_URL, $url);
- curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
- curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
- curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
- curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
- curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
- curl_setopt($this->ch, CURLOPT_COOKIEFILE, 'cookie_jar');
- curl_setopt ($this->ch, CURLOPT_REFERER, $url);
- $this->pageContent = curl_exec($this->ch);
- $this->info = curl_getinfo($this->ch);
-
- if(!curl_errno($this->ch)) {
-
- $this->info = curl_getinfo($this->ch);
- }
- else{
-
- $this->info = array('http_code' => 404);
- }
-
- curl_close( $this->ch);
-
- return $this->pageContent;
- }
-
-
- //returneaza tipul continutului paginii
- function getUrlMimeType($buffer) {
-
- $finfo = new finfo(FILEINFO_MIME_TYPE);
- return $finfo->buffer($buffer);
- }
- //verifica daca continutul paginii e html, nu alt fisier
- function isHtml($buffer) {
-
- Applog::log("PAGE TYPE=".$this->getUrlMimeType($buffer));
-
- return strstr($this->getUrlMimeType($buffer), 'html');
- }
-
-
- //seteaza locatia unde vor fi salvate fisierele html raw si clean text
- function setStorePageParams() {
-
- $this->currentTimestamp = date("Y-m-d H:i:s");
- $this->rawPagePath = Config::get('crawler.raw_page_path')
- .$this->urlResource['host'] .'/'. $this->currentTimestamp;
- $this->parsedTextPath = Config::get('crawler.parsed_text_path')
- .$this->urlResource['host'] .'/'. $this->currentTimestamp;
- }
-
- //verifica daca pagina poate fi descarcata si daca e HTML
- function pageOk() {
-
- Applog::log("HTTP CODE " .$this->httpResponse());
- //verifica codul HTTP
- if ($this->httpResponse() >= 400) {
- Applog::log("HTTP Error, URL Skipped");
- return false;
- }
- //verifica daca pagina e HTML
- if (!$this->isHtml($this->pageContent)) {
-
- Applog::log("Page not HTML, URL Skipped");
- return false;
- }
-
- return true;
- }
-
- /*
- * Salveaza pagina in format raw si clean text in fisiere
- */
- function saveCurrentPage() {
-
-
- try {
- if (!file_exists(Config::get('crawler.raw_page_path').$this->urlResource['host'])) {
- mkdir(Config::get('crawler.raw_page_path').$this->urlResource['host'], 0777, true);
- }
- if (!file_exists(Config::get('crawler.parsed_text_path').$this->urlResource['host'])) {
- mkdir(Config::get('crawler.parsed_text_path').$this->urlResource['host'], 0777, true);
- }
- //salveaza pagina raw pe disk
- file_put_contents($this->rawPagePath, $this->pageContent);
- //converteste simbolurile HTML in format text si elimina din spatii.
- $this->plainText = preg_replace("/ /", "", html_entity_decode($this->plainText));
- //salveaza textul extras pe disk
- file_put_contents($this->parsedTextPath, $this->plainText);
- }
- catch(Exception $ex) {
-
- Applog::exceptionLog($ex);
- }
- }
-
- //returneaza codul HTTP
- function httpResponse() {
-
- return $this->info['http_code'];
- }
-
- //returneaza urmatorul URL ne crawl-at din baza de date sau null daca nu exista
- function getNextLink() {
-
-
- //$nextLink = null;
- try {
- //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
- $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
-
- if ($nextLink != null) {
-
- return $nextLink->canonicalUrl;
- }
- }
- catch(Exception $ex) {
-
- Applog::exceptionLog($ex);
- }
-
- return null;
- }
-
- //repara HTML-ul stricat intr-un mod minimal astfel incat
- //sa poata fi interpretat de biblioteca simple_html_dom
- function fixHtml($html) {
-
- foreach($html->find('head') as $script) {
-
- $script->outertext = '';
- }
-
- foreach($html->find('script') as $script) {
-
- $script->outertext = '';
- }
-
- foreach($html->find('style') as $style) {
-
- $style->outertext = '';
- }
-
- $html->load($html->save());
-
- //transforma pagina raw in simple_html_dom_node
- //$this->dom = str_get_html($pageContent);
-
- $buffer = '<html><body>';
- $nodes = $html->childNodes();
- foreach($nodes as $node) {
-
- $buffer .= $node->innertext();
- }
-
- $buffer .= '</body></html>';
-
- return str_get_html($buffer);
- }
-
- function eligibleUrl($url) {
-
- $resource = parse_utf8_url($url);
- $pathInfo = pathinfo($resource['path']);
-
- if (isset($pathInfo['extension'])) {
-
- $ext = $pathInfo['extension'];
-
-
- if (array_search($ext, $this->fileExt) === false) {
-
- return false;
- }
- }
-
- return true;
- }
-
- //metode pentru prelucrarea linkurilor
- //sterge directory index file si elimina slash-urile in plus
- //gaseste toate linkurile
- //le transforma in absolute daca sunt relative
- function processLink($url) {
-
-
- if (!$this->eligibleUrl($url)) {
-
- return;
- }
-
- Applog::log('Processing link: '.$url);
- $canonicalUrl = null;
- if ($this->isRelativeLink($url)) {
-
- $url = $this->makeAbsoluteLink($url);
- }
- //daca ultimul caracter este '/', il eliminam
- //exemplu wiki.dexonline.ro nu wiki.dexonline.ro/
- if (substr($url, -1) == "/") $url = substr($url, 0, -1);
-
- //sterge slash-uri in plus si directory index file
- $canonicalUrl = $this->urlPadding($url);
-
- if (!strstr($url, $this->currentLocation)) return;
-
- Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
- }
-
-
- function urlPadding($url) {
-
- return $this->delDuplicateSlashes($this->delDirIndexFile($url));
- }
-
-
- //delestes index.php/html/pl/py/jsp etc
- function delDirIndexFile($url) {
-
- //Applog::log('delDirIndexFile '.$url);
-
- foreach($this->indexFileExt as $ext) {
-
- $target = $this->directoryIndexFile .'.'. $ext;
-
- if (strstr($url, $target))
- return str_replace($target, "", $url);
- }
-
- return $url;
- }
-
- //deletes slashes when not needed
- function delDuplicateSlashes($url) {
-
- if (strlen($url) < 5) {
-
- Applog::log("whatup with delDuplicateSlashes: $url");
- return $this->currentUrl;
- }
-
-
- $parsedUrl = parse_utf8_url($url);
-
-
- if (substr_count($parsedUrl['host'], '.') < 2) {
-
- $parsedUrl['host'] = 'www.'.$parsedUrl['host'];
- }
-
- $retUrl = $parsedUrl['scheme'].'://'.$parsedUrl['host'];
- $consecutiveSlash = false;
-
- $url = substr($url, strlen($retUrl));
-
- for ($i = 0; $i < strlen($url); ++$i) {
- $nextCh = substr($url, $i, 1);
-
- if ($nextCh == '/' && !$consecutiveSlash) {
-
- $retUrl .= $nextCh;
- $consecutiveSlash = true;
- }
- else if ($nextCh == '/') {}
- else {
- $retUrl .= $nextCh;
- $consecutiveSlash = false;
- }
- }
-
- //eliminarea slash-ului final
-
- if (substr($retUrl, -1) == "/") $retUrl = substr($retUrl, 0, -1);
-
- return $retUrl;
- }
-
-
- function isRelativeLink($url) {
-
- return !strstr($url, "http");
- }
-
- //cauta directorul link-ului curent si returneaza
- //url-ul spre acel director
- function getDeepestDir($url) {
-
- try {
- $retVal = substr($url, 0, strrpos($url,'/'));
-
- if (strstr($retVal, $this->currentLocation))
- return $retVal;
- else return $url;
- }
- catch(Exception $ex) {
-
- exceptionLog($ex);
- }
- return $url;
- }
-
- function makeAbsoluteLink($url) {
-
- return $this->getDeepestDir($this->currentUrl) .'/'. $url;
- }
-
- function getDomain($url) {
-
- return $this->urlResource['host'];
- }
-
-
- //Clasele care deriva aceasta clasa vor trebui
- //sa implementeze metodele de mai jos
- abstract function extractText($domNode);
-
- abstract function crawlDomain();
-
- abstract function start();
-}
-
-?>1
\ No newline at end of file
+<?php
+/*
+ * Alin Ungureanu, 2013
+ * alyn.cti at gmail.com
+ */
+require_once __DIR__ . '/../phplib/util.php';
+require_once util_getRootPath() . 'phplib/simple_html_dom.php';
+require_once util_getRootPath() . 'phplib/AppLog.php';
+require_once util_getRootPath() . 'phplib/MemoryManagement.php';
+
+abstract class AbstractCrawler {
+ protected $ch;
+ protected $pageContent;
+ protected $plainText;
+ protected $info;
+ protected $currentUrl;
+ protected $currentTimestamp;
+ protected $currentPageId;
+ protected $rawPagePath;
+ protected $parsedTextPath;
+
+ protected $currentLocation;
+
+ protected $urlResource;
+ protected $directoryIndexFile;
+ protected $indexFileExt;
+
+ protected $domainsList;
+
+
+ function __construct() {
+ $this->plainText = '';
+ $this->pageContent = '';
+ $this->directoryIndexFile = Config::get('crawler.dir_index_file');
+ $this->indexFileExt = explode(',', Config::get('crawler.index_file_ext'));
+ $this->fileExt = explode(',', Config::get('crawler.index_file_ext').',txt');
+ }
+
+
+ //descarca pagina de la $url
+ function getPage($url) {
+
+ $this->ch = curl_init();
+ Applog::log("User agent is: " . Config::get('crawler.user_agent'));
+ curl_setopt ($this->ch, CURLOPT_URL, $url);
+ curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
+ curl_setopt ($this->ch, CURLOPT_USERAGENT, Config::get('crawler.user_agent'));
+ curl_setopt ($this->ch, CURLOPT_TIMEOUT, 20);
+ curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
+ curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
+ curl_setopt($this->ch, CURLOPT_COOKIEFILE, 'cookie_jar');
+ curl_setopt ($this->ch, CURLOPT_REFERER, $url);
+ $this->pageContent = curl_exec($this->ch);
+ $this->info = curl_getinfo($this->ch);
+
+ if(!curl_errno($this->ch)) {
+
+ $this->info = curl_getinfo($this->ch);
+ }
+ else{
+
+ $this->info = array('http_code' => 404);
+ }
+
+ curl_close( $this->ch);
+
+ return $this->pageContent;
+ }
+
+
+ //returneaza tipul continutului paginii
+ function getUrlMimeType($buffer) {
+
+ $finfo = new finfo(FILEINFO_MIME_TYPE);
+ return $finfo->buffer($buffer);
+ }
+ //verifica daca continutul paginii e html, nu alt fisier
+ function isHtml($buffer) {
+
+ Applog::log("PAGE TYPE=".$this->getUrlMimeType($buffer));
+
+ return strstr($this->getUrlMimeType($buffer), 'html');
+ }
+
+
+ //seteaza locatia unde vor fi salvate fisierele html raw si clean text
+ function setStorePageParams() {
+
+ $this->currentTimestamp = date("Y-m-d H:i:s");
+ $this->rawPagePath = Config::get('crawler.raw_page_path')
+ .$this->urlResource['host'] .'/'. $this->currentTimestamp;
+ $this->parsedTextPath = Config::get('crawler.parsed_text_path')
+ .$this->urlResource['host'] .'/'. $this->currentTimestamp;
+ }
+
+ //verifica daca pagina poate fi descarcata si daca e HTML
+ function pageOk() {
+
+ Applog::log("HTTP CODE " .$this->httpResponse());
+ //verifica codul HTTP
+ if ($this->httpResponse() >= 400) {
+ Applog::log("HTTP Error, URL Skipped");
+ return false;
+ }
+ //verifica daca pagina e HTML
+ if (!$this->isHtml($this->pageContent)) {
+
+ Applog::log("Page not HTML, URL Skipped");
+ return false;
+ }
+
+ return true;
+ }
+
+ //returneaza codul HTTP
+ function httpResponse() {
+
+ return $this->info['http_code'];
+ }
+
+ //returneaza urmatorul URL ne crawl-at din baza de date sau null daca nu exista
+ function getNextLink() {
+
+
+ //$nextLink = null;
+ try {
+ //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
+ $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
+
+ if ($nextLink != null) {
+
+ return $nextLink->canonicalUrl;
+ }
+ }
+ catch(Exception $ex) {
+
+ Applog::exceptionLog($ex);
+ }
+
+ return null;
+ }
+
+ //repara HTML-ul stricat intr-un mod minimal astfel incat
+ //sa poata fi interpretat de biblioteca simple_html_dom
+ function fixHtml($html) {
+
+ foreach($html->find('head') as $script) {
+
+ $script->outertext = '';
+ }
+
+ foreach($html->find('script') as $script) {
+
+ $script->outertext = '';
+ }
+
+ foreach($html->find('style') as $style) {
+
+ $style->outertext = '';
+ }
+
+ $html->load($html->save());
+
+ //transforma pagina raw in simple_html_dom_node
+ //$this->dom = str_get_html($pageContent);
+
+ $buffer = '<html><body>';
+ $nodes = $html->childNodes();
+ foreach($nodes as $node) {
+
+ $buffer .= $node->innertext();
+ }
+
+ $buffer .= '</body></html>';
+
+ return str_get_html($buffer);
+ }
+
+ function eligibleUrl($url) {
+
+ $resource = util_parseUtf8Url($url);
+ $pathInfo = pathinfo($resource['path']);
+
+ if (isset($pathInfo['extension'])) {
+
+ $ext = $pathInfo['extension'];
+
+
+ if (array_search($ext, $this->fileExt) === false) {
+
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ //metode pentru prelucrarea linkurilor
+ //sterge directory index file si elimina slash-urile in plus
+ //gaseste toate linkurile
+ //le transforma in absolute daca sunt relative
+ function processLink($url) {
+
+
+ if (!$this->eligibleUrl($url)) {
+
+ return;
+ }
+
+ Applog::log('Processing link: '.$url);
+ $canonicalUrl = null;
+ if ($this->isRelativeLink($url)) {
+
+ $url = $this->makeAbsoluteLink($url);
+ }
+ //daca ultimul caracter este '/', il eliminam
+ //exemplu wiki.dexonline.ro nu wiki.dexonline.ro/
+ if (substr($url, -1) == "/") $url = substr($url, 0, -1);
+
+ //sterge slash-uri in plus si directory index file
+ $canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);
+
+ if (!strstr($url, $this->currentLocation)) return;
+
+ Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+ }
+
+ function isRelativeLink($url) {
+ return !strstr($url, "http");
+ }
+
+ //cauta directorul link-ului curent si returneaza
+ //url-ul spre acel director
+ function getDeepestDir($url) {
+
+ try {
+ $retVal = substr($url, 0, strrpos($url,'/'));
+
+ if (strstr($retVal, $this->currentLocation))
+ return $retVal;
+ else return $url;
+ }
+ catch(Exception $ex) {
+
+ exceptionLog($ex);
+ }
+ return $url;
+ }
+
+ function makeAbsoluteLink($url) {
+
+ return $this->getDeepestDir($this->currentUrl) .'/'. $url;
+ }
+
+ function getDomain($url) {
+
+ return $this->urlResource['host'];
+ }
+
+
+ // Clasele care deriva aceasta clasa vor trebui sa implementeze metodele de mai jos
+ abstract function extractText($domNode);
+
+ abstract function crawlDomain();
+
+ abstract function start();
+}
+
+?>
Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php Mon Oct 21 13:24:29 2013 (r1027)
+++ app/Crawler.php Mon Oct 21 20:35:08 2013 (r1028)
@@ -7,130 +7,126 @@
class Crawler extends AbstractCrawler {
- //extrage textul fara cod html
- function getText($domNode) {
-
- $this->plainText = html_entity_decode(strip_tags($domNode->text()));
- //$this->plainText = str_replace(array('\t','\n',' ', ' '), array('','.','',''),strip_tags($domNode->text()));
- }
- //extrage textul cu cod html din nodul respectiv
- function extractText($domNode) {
-
- Applog::log("extracting text");
- $this->getText($domNode);
-
- foreach($domNode->find("a") as $link) {
-
- $this->processLink($link->href);
- }
- }
-
- function processPage($pageContent) {
-
- try {
-
- $html = str_get_html($pageContent);
-
- //reparam html stricat
- if (!$html->find('body', 0, true)) {
-
- $html = $this->fixHtml($html);
- }
-
-
- $this->extractText($html->find('body', 0, true));
- $this->saveCurrentPage();
-
- //cata memorie consuma
- //si eliberare referinte pierdute
-
- $html->clear();
-
- MemoryManagement::showUsage('before cleaning', true, 'KB');
-
- MemoryManagement::clean(true);
-
- MemoryManagement::showUsage('after cleaning', true, 'KB');
- //niceness
- sleep(Config::get('crawler.t_wait'));
- }
- catch (Exception $ex) {
-
- Applog::exceptionLog($ex);
- }
- }
-
- function crawlDomain() {
-
- Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
-
- while(1) {
-
- //extrage urmatorul link neprelucrat din baza de date
- $url = $this->getNextLink();
- Applog::log('current URL: ' . $url);
- //daca s-a terminat crawling-ul
- if ($url == null || $url == '') break;
-
- //download pagina
- $pageContent = $this->getPage($url);
- //setam url-ul curent pentru store in Database
- $this->currentUrl = $url;
-
- $this->setStorePageParams();
-
- //salveaza o intrare despre pagina curenta in baza de date
- $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
-
- //daca pagina nu e in format html (e imagine sau alt fisier)
- //sau daca am primit un cod HTTP de eroare, sarim peste pagina acesta
- if (!$this->pageOk()) {
- continue;
- }
-
- $this->processPage($pageContent);
- }
-
- Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
- }
-
-
- function start() {
-
- Applog::log("Crawler started");
-
- $this->domainsList = Config::get('crawler.whiteList');
-
- //start processing
- $this->processWhiteList();
-
- Applog::log('Crawler finished');
- }
-
-
- function processWhiteList() {
- foreach($this->domainsList as $startUrl) {
- $startUrl = trim($startUrl);
-
- //curatam url-ul
- $this->currentUrl = $this->urlPadding($startUrl);
- //impartim url-ul pe componente
- $this->urlResource = parse_utf8_url($this->currentUrl);
-
- //salvam startUrl in tabelul Link pentru a incepe extragerea,
- //startUrl nu va avea o pagina din care este descoperit
- //asa ca pagina crawledPageId va avea valoarea 0.
- Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
-
- //locatia curenta, va fi folosita pentru a nu depasi sfera
- //de exemplu vrem sa crawlam doar o anumita zona a site-ului
- $this->currentLocation = substr($this->currentUrl, 0);
- Applog::log('domain start location: '.$this->currentLocation);
-
- $this->crawlDomain();
- }
-
- }
+ //extrage textul cu cod html din nodul respectiv
+ function extractText($domNode) {
+ Applog::log("extracting text");
+ $this->plainText = html_entity_decode(strip_tags($domNode->text()));
+ $this->plainText = preg_replace("/ +/", " ", $this->plainText);
+ }
+
+ /* Returns an array of links */
+ function processPage($pageContent) {
+ try {
+ $links = array();
+ $html = str_get_html($pageContent);
+
+ //reparam html stricat
+ if (!$html->find('body', 0, true)) {
+
+ $html = $this->fixHtml($html);
+ }
+
+
+ $body = $html->find('body', 0, true);
+ $this->extractText($body);
+ foreach ($body->find("a") as $link) {
+ $links[] = $link->href;
+ }
+ //cata memorie consuma
+ //si eliberare referinte pierdute
+
+ $html->clear();
+
+ MemoryManagement::showUsage('before cleaning', true, 'KB');
+
+ MemoryManagement::clean(true);
+
+ MemoryManagement::showUsage('after cleaning', true, 'KB');
+ return $links;
+ }
+ catch (Exception $ex) {
+
+ Applog::exceptionLog($ex);
+ }
+ }
+
+ function crawlDomain() {
+
+ Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
+
+ while (1) {
+
+ //extrage urmatorul link neprelucrat din baza de date
+ $url = $this->getNextLink();
+ Applog::log('current URL: ' . $url);
+ //daca s-a terminat crawling-ul
+ if ($url == null || $url == '') break;
+
+ //download pagina
+ $pageContent = $this->getPage($url);
+ //setam url-ul curent pentru store in Database
+ $this->currentUrl = $url;
+ $links = $this->processPage($pageContent);
+
+ $this->setStorePageParams();
+
+ //salveaza o intrare despre pagina curenta in baza de date
+ $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->pageContent, $this->plainText, $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
+
+ //daca pagina nu e in format html (e imagine sau alt fisier)
+ //sau daca am primit un cod HTTP de eroare, sarim peste pagina acesta
+ if (!$this->pageOk()) {
+ continue;
+ }
+
+ foreach($links as $link) {
+ $this->processLink($link);
+ }
+
+ //niceness
+ sleep(Config::get('crawler.t_wait'));
+ }
+
+ Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
+ }
+
+
+ function start() {
+
+ Applog::log("Crawler started");
+
+ $this->domainsList = Config::get('crawler.whiteList');
+
+ //start processing
+ $this->processWhiteList();
+
+ Applog::log('Crawler finished');
+ }
+
+
+ function processWhiteList() {
+ foreach($this->domainsList as $startUrl) {
+ $startUrl = trim($startUrl);
+
+ //curatam url-ul
+ $this->currentUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
+ //impartim url-ul pe componente
+ $this->urlResource = util_parseUtf8Url($this->currentUrl);
+
+ //salvam startUrl in tabelul Link pentru a incepe extragerea,
+ //startUrl nu va avea o pagina din care este descoperit
+ //asa ca pagina crawledPageId va avea valoarea 0.
+ Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
+
+ //locatia curenta, va fi folosita pentru a nu depasi sfera
+ //de exemplu vrem sa crawlam doar o anumita zona a site-ului
+ $this->currentLocation = substr($this->currentUrl, 0);
+ Applog::log('domain start location: '.$this->currentLocation);
+
+ $this->crawlDomain();
+ }
+
+ }
}
@@ -139,9 +135,9 @@
*/
if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
- $obj = new Crawler();
+ $obj = new Crawler();
- $obj->start();
+ $obj->start();
}
-?>
\ No newline at end of file
+?>
Added: patches/00103.sql
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ patches/00103.sql Mon Oct 21 20:35:08 2013 (r1028)
@@ -0,0 +1,3 @@
+alter table CrawledPage add key(url);
+alter table CrawledPage add createDate int not null, add modDate int not null;
+alter table Link add createDate int not null, add modDate int not null;
Modified: phplib/StringUtil.php
==============================================================================
--- phplib/StringUtil.php Mon Oct 21 13:24:29 2013 (r1027)
+++ phplib/StringUtil.php Mon Oct 21 20:35:08 2013 (r1028)
@@ -300,6 +300,44 @@
static function explode($delimiter, $s) {
return array_values(array_filter(explode($delimiter, $s), 'strlen'));
}
+
+ /**
+ * Cleans up a URL in various ways:
+ * - trims any known index files and extensions (passed as arguments)
+ * - replaces consecutive slashes with a single slash;
+ * - trims any final slashes
+ * Assumes the URL includes a protocol.
+ * @param $indexFile Index file name (without extension)
+ * @param $indexExt Array of index file extensions
+ **/
+ static function urlCleanup($url, $indexFile, $indexExt) {
+ // Scroll through the extension list until we find one that matches
+ $i = 0;
+ $found = false;
+ do {
+ $target = $indexFile . '.' . $indexExt[$i];
+ if (self::endsWith($url, $target)) {
+ $url = substr($url, 0, -strlen($target));
+ $found = true;
+ }
+ $i++;
+ } while (($i < count($indexExt)) && !$found);
+
+ // Save the protocol first
+ $parts = explode('//', $url, 2);
+
+ // Replace //+ by /
+ $parts[1] = preg_replace('#//+#', '/', $parts[1]);
+
+ // Delete any trailing slashes
+ $parts[1] = rtrim($parts[1], '/');
+
+ // Reassemble and return the URL
+ return implode('//', $parts);
+ }
+
+
+
}
?>
Modified: phplib/models/CrawledPage.php
==============================================================================
--- phplib/models/CrawledPage.php Mon Oct 21 13:24:29 2013 (r1027)
+++ phplib/models/CrawledPage.php Mon Oct 21 20:35:08 2013 (r1028)
@@ -2,44 +2,45 @@
class CrawledPage extends BaseObject implements DatedObject {
-
- public static $_table = 'CrawledPage';
+
+ public static $_table = 'CrawledPage';
- //salveaza informatiile despre pagina curent crawl-ata in tabelul CrawledPage
- public static function savePage2DB($url, $httpStatus, $rawPagePath, $parsedTextPath, $timestamp) {
-
- try {
- $tableObj = Model::factory(self::$_table);
- $tableObj->create();
- $tableObj->timestamp = $timestamp;
- $tableObj->url = $url;
- $tableObj->httpStatus = $httpStatus;
- $tableObj->rawPagePath = $rawPagePath;
- $tableObj->parsedTextPath = $parsedTextPath;
- $tableObj->save();
-
- return $tableObj->id;
- }
- catch(Exception $ex) {
-
- AppLog::exceptionLog($ex);
- }
- return null;
- }
-
- //intoarce o lista cu domeniile parsate
- public static function getListOfDomains() {
-
- //return Model::factory(self::$_table)->raw_query("select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage group by domain order by id asc;")->find_many();
- return Model::factory(self::$_table)->raw_query("select id, domain from
- (select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage order by id desc) alias1 group by domain order by id asc;")->find_many();
- }
-
- function getNextDiacriticsFile() {
-
- return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);")->find_one();
- }
-
+ // Salveaza informatiile despre pagina curent crawl-ata in tabelul CrawledPage
+ public static function savePage2DB($url, $httpStatus, $rawPage, $parsedText, $rawPagePath, $parsedTextPath, $timestamp) {
+ @mkdir(dirname($rawPagePath), 0777, true);
+ @mkdir(dirname($parsedTextPath), 0777, true);
+ file_put_contents($rawPagePath, $rawPage);
+ file_put_contents($parsedTextPath, $parsedText);
+
+ try {
+ $tableObj = Model::factory(self::$_table);
+ $tableObj->create();
+ $tableObj->timestamp = $timestamp;
+ $tableObj->url = $url;
+ $tableObj->httpStatus = $httpStatus;
+ $tableObj->rawPagePath = $rawPagePath;
+ $tableObj->parsedTextPath = $parsedTextPath;
+ $tableObj->save();
+
+ return $tableObj->id;
+ } catch(Exception $ex) {
+ AppLog::exceptionLog($ex);
+ }
+ return null;
+ }
+
+ //intoarce o lista cu domeniile parsate
+ public static function getListOfDomains() {
+
+ //return Model::factory(self::$_table)->raw_query("select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage group by domain order by id asc;")->find_many();
+ return Model::factory(self::$_table)->raw_query("select id, domain from
+ (select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage order by id desc) alias1 group by domain order by id asc;")->find_many();
+ }
+
+ function getNextDiacriticsFile() {
+ return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);")->find_one();
+ }
+
}
Modified: phplib/util.php
==============================================================================
--- phplib/util.php Mon Oct 21 13:24:29 2013 (r1027)
+++ phplib/util.php Mon Oct 21 20:35:08 2013 (r1028)
@@ -491,18 +491,20 @@
return $result;
}
-function parse_utf8_url($url)
-{
- static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
- if (is_string($url) && preg_match(
- '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
- '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
- foreach ($matches as $key => $value)
- if (!isset($keys[$key]) || empty($value))
- unset($matches[$key]);
- return $matches;
+/** Kudos http://www.php.net/manual/pt_BR/function.parse-url.php#107291 **/
+function util_parseUtf8Url($url) {
+ static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
+ if (is_string($url) && preg_match(
+ '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
+ '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) {
+ foreach ($matches as $key => $value) {
+ if (!isset($keys[$key]) || empty($value)) {
+ unset($matches[$key]);
+ }
}
- return false;
+ return $matches;
+ }
+ return false;
}
?>
More information about the Dev mailing list