[dev] [commit] r1029 - app
automailer at dexonline.ro
automailer at dexonline.ro
Mon Oct 21 21:20:24 EEST 2013
Author: cata
Date: Mon Oct 21 21:20:24 2013
New Revision: 1029
Log:
* Allow the crawler to mix domains.
* Removed the $currentLocation field. This should be reimplemented with a per-domain regexp.
* This required a rewrite of getDeepestDir(), which makeAbsoluteLink() relies on.
Modified:
app/AbstractCrawler.php
app/Crawler.php
Modified: app/AbstractCrawler.php
==============================================================================
--- app/AbstractCrawler.php Mon Oct 21 20:35:08 2013 (r1028)
+++ app/AbstractCrawler.php Mon Oct 21 21:20:24 2013 (r1029)
@@ -19,15 +19,10 @@
protected $rawPagePath;
protected $parsedTextPath;
- protected $currentLocation;
-
protected $urlResource;
protected $directoryIndexFile;
protected $indexFileExt;
- protected $domainsList;
-
-
function __construct() {
$this->plainText = '';
$this->pageContent = '';
@@ -125,7 +120,7 @@
//$nextLink = null;
try {
//$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
- $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
+ $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl not in (Select url from CrawledPage);")->find_one();
if ($nextLink != null) {
@@ -219,37 +214,30 @@
//sterge slash-uri in plus si directory index file
$canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);
-
- if (!strstr($url, $this->currentLocation)) return;
- Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+ $rec = util_parseUtf8Url($canonicalUrl);
+ if ($rec['host'] == $this->getDomain($url)) {
+ Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+ }
}
function isRelativeLink($url) {
return !strstr($url, "http");
}
- //cauta directorul link-ului curent si returneaza
- //url-ul spre acel director
+ // Cauta directorul link-ului curent si returneaza url-ul spre acel director
+ // Returnează întregul URL dacă nu există un director.
function getDeepestDir($url) {
-
- try {
- $retVal = substr($url, 0, strrpos($url,'/'));
-
- if (strstr($retVal, $this->currentLocation))
- return $retVal;
- else return $url;
- }
- catch(Exception $ex) {
-
- exceptionLog($ex);
+ $parts = explode('//', $url, 2); // Salvează protocolul
+ $pos = strrpos($parts[1], '/');
+ if ($pos !== false) {
+ $parts[1] = substr($parts[1], 0, $pos);
}
- return $url;
+ return implode('//', $parts);
}
function makeAbsoluteLink($url) {
-
- return $this->getDeepestDir($this->currentUrl) .'/'. $url;
+ return $this->getDeepestDir($this->currentUrl) . '/' . $url;
}
function getDomain($url) {
@@ -260,9 +248,6 @@
// Clasele care deriva aceasta clasa vor trebui sa implementeze metodele de mai jos
abstract function extractText($domNode);
-
- abstract function crawlDomain();
-
abstract function start();
}
Modified: app/Crawler.php
==============================================================================
--- app/Crawler.php Mon Oct 21 20:35:08 2013 (r1028)
+++ app/Crawler.php Mon Oct 21 21:20:24 2013 (r1029)
@@ -50,7 +50,7 @@
}
}
- function crawlDomain() {
+ function crawlLoop() {
Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
@@ -66,6 +66,7 @@
$pageContent = $this->getPage($url);
//setam url-ul curent pentru store in Database
$this->currentUrl = $url;
+ $this->urlResource = util_parseUtf8Url($url);
$links = $this->processPage($pageContent);
$this->setStorePageParams();
@@ -92,40 +93,17 @@
function start() {
-
Applog::log("Crawler started");
- $this->domainsList = Config::get('crawler.whiteList');
-
- //start processing
- $this->processWhiteList();
-
- Applog::log('Crawler finished');
- }
-
-
- function processWhiteList() {
- foreach($this->domainsList as $startUrl) {
- $startUrl = trim($startUrl);
-
- //curatam url-ul
- $this->currentUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
- //impartim url-ul pe componente
- $this->urlResource = util_parseUtf8Url($this->currentUrl);
-
- //salvam startUrl in tabelul Link pentru a incepe extragerea,
- //startUrl nu va avea o pagina din care este descoperit
- //asa ca pagina crawledPageId va avea valoarea 0.
- Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
-
- //locatia curenta, va fi folosita pentru a nu depasi sfera
- //de exemplu vrem sa crawlam doar o anumita zona a site-ului
- $this->currentLocation = substr($this->currentUrl, 0);
- Applog::log('domain start location: '.$this->currentLocation);
-
- $this->crawlDomain();
+ // Salvam întregul whiteList in tabelul Link pentru a incepe extragerea.
+ // Aceste URL-uri nu vor avea o pagina din care sunt descoperite, deci crawledPageId va avea valoarea 0.
+ foreach (Config::get('crawler.whiteList') as $startUrl) {
+ $startUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
+ $rec = util_parseUtf8Url($startUrl);
+ Link::saveLink2DB($startUrl, $rec['host'], 0);
}
-
+
+ $this->crawlLoop();
}
}
More information about the Dev mailing list