[dev] [commit] r979 - phplib/models wwwbase/Crawler wwwbase/styles
automailer at dexonline.ro
automailer at dexonline.ro
Sun Sep 8 23:31:54 EEST 2013
Author: alinu
Date: Sun Sep 8 23:31:54 2013
New Revision: 979
Log:
Added:
wwwbase/Crawler/MemoryManagement.php
wwwbase/Crawler/WhiteList.txt
wwwbase/Crawler/crawler_dex.conf
wwwbase/Crawler/database_tables.sql
Deleted:
wwwbase/Crawler/sectiunea_de_crawler_dex.conf
Modified:
phplib/models/CrawledPage.php
phplib/models/Link.php
wwwbase/Crawler/.htaccess
wwwbase/Crawler/AbstractCrawler.php
wwwbase/Crawler/Crawler.php
wwwbase/Crawler/clean_all.php
wwwbase/styles/crawler.css
Modified: phplib/models/CrawledPage.php
==============================================================================
--- phplib/models/CrawledPage.php Fri Sep 6 17:49:26 2013 (r978)
+++ phplib/models/CrawledPage.php Sun Sep 8 23:31:54 2013 (r979)
@@ -2,7 +2,6 @@
class CrawledPage extends BaseObject implements DatedObject {
- //implements DatedObject {
public static $_table = 'CrawledPage';
@@ -36,6 +35,11 @@
return Model::factory(self::$_table)->raw_query("select id, domain from
(select id, substr(substring_index(url, '/', 3),8) as domain from CrawledPage order by id desc) alias1 group by domain order by id asc;")->find_many();
}
+
+ function getNextDiacriticsFile() {
+
+ return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);");
+ }
}
Modified: phplib/models/Link.php
==============================================================================
--- phplib/models/Link.php Fri Sep 6 17:49:26 2013 (r978)
+++ phplib/models/Link.php Sun Sep 8 23:31:54 2013 (r979)
@@ -7,7 +7,7 @@
public static $_table = 'Link';
//adauga o intrare nou in tabelul Link
- public static function saveLink2DB($canonicalUrl, $domain, $urlHash, $crawledPageId) {
+ public static function saveLink2DB($canonicalUrl, $domain, $crawledPageId) {
//nu inseram acelasi link de 2 ori
if (Model::factory(self::$_table)->where('canonicalUrl', $canonicalUrl)->find_one()) {
@@ -20,7 +20,6 @@
$tableObj->create();
$tableObj->canonicalUrl = $canonicalUrl;
$tableObj->domain = $domain;
- $tableObj->urlHash = $urlHash;
$tableObj->crawledPageId = $crawledPageId;
$tableObj->save();
@@ -33,8 +32,6 @@
return null;
}
-
-
}
-?>
\ No newline at end of file
+?>
Modified: wwwbase/Crawler/.htaccess
==============================================================================
--- wwwbase/Crawler/.htaccess Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/.htaccess Sun Sep 8 23:31:54 2013 (r979)
@@ -1,8 +1,13 @@
+AuthType Basic
+AuthName "Password Protected Area"
+AuthUserFile /etc/php5/apache2/.htpasswd
+
Order Deny,Allow
Deny from all
Allow from 127.0.0.1
-<Files index.php>
- Order Allow,Deny
- Allow from all
-</Files>
\ No newline at end of file
+<Files "index.php">
+ Order Allow,Deny
+ Allow from all
+ Require valid-user
+</Files>
Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/AbstractCrawler.php Sun Sep 8 23:31:54 2013 (r979)
@@ -10,6 +10,7 @@
require_once '../../phplib/idiorm/paris.php';
require_once 'AppLog.php';
+require_once 'MemoryManagement.php';
db_init();
@@ -32,7 +33,7 @@
protected $directoryIndexFile;
protected $IndexFileExt;
- private $justStarted;
+ protected $domainsList;
function __construct() {
@@ -51,7 +52,7 @@
curl_setopt ($this->ch, CURLOPT_URL, $url);
curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
- curl_setopt ($this->ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0");
+ curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(pref_getSectionPreference('crawler', 'user_agent_location')));
curl_setopt ($this->ch, CURLOPT_TIMEOUT, 60);
curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
@@ -89,15 +90,7 @@
return strstr($this->getUrlMimeType($buffer), 'html');
}
- //elibereaza memoria ale carei referinte s-au pierdut
- function manageMemory() {
-
- crawlerLog('MEM USAGE BEFORE GC - ' . memory_get_usage());
- gc_enable(); // Enable Garbage Collector
- crawlerLog(gc_collect_cycles() . " garbage cycles cleaned"); // # of elements cleaned up
- gc_disable(); // Disable Garbage Collector
- crawlerLog('MEM USAGE After GC - ' . memory_get_usage());
- }
+
//seteaza locatia unde vor fi salvate fisierele html raw si clean text
function setStorePageParams() {
@@ -153,9 +146,96 @@
}
}
+ //returneaza codul HTTP
+ function httpResponse() {
+
+ return $this->info['http_code'];
+ }
+
+ //returneaza urmatorul URL ne crawl-at din baza de date sau null daca nu exista
+ function getNextLink() {
+
+
+ //$nextLink = null;
+ try {
+ //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
+ $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl LIKE '$this->currentLocation%' and canonicalUrl not in (Select url from CrawledPage);")->find_one();
+
+ if ($nextLink != null) {
+
+ return $nextLink->canonicalUrl;
+ }
+ }
+ catch(Exception $ex) {
+
+ logException($ex);
+ }
+
+ return null;
+ }
+
+ //repara HTML-ul stricat intr-un mod minimal astfel incat
+ //sa poata fi interpretat de biblioteca simple_html_dom
+ function fixHtml($html) {
+
+ foreach($html->find('head') as $script) {
+
+ $script->outertext = '';
+ }
+ foreach($html->find('script') as $script) {
+
+ $script->outertext = '';
+ }
+
+ foreach($html->find('style') as $style) {
+ $style->outertext = '';
+ }
+
+ $html->load($html->save());
+
+ //transforma pagina raw in simple_html_dom_node
+ //$this->dom = str_get_html($pageContent);
+
+ $buffer = '<html><body>';
+ $nodes = $html->childNodes();
+ foreach($nodes as $node) {
+
+ $buffer .= $node->innertext();
+ }
+
+ $buffer .= '</body></html>';
+
+ return str_get_html($buffer);
+ }
+
+
+ //metode pentru prelucrarea linkurilor
//sterge directory index file si elimina slash-urile in plus
+ //gaseste toate linkurile
+ //le transforma in absolute daca sunt relative
+ function processLink($url) {
+
+ crawlerLog('Processing link: '.$url);
+ $canonicalUrl = null;
+ if ($this->isRelativeLink($url)) {
+
+ $url = $this->makeAbsoluteLink($url);
+ }
+ //daca ultimul caracter este '/', il eliminam
+ //exemplu wiki.dexonline.ro nu wiki.dexonline.ro/
+ if (substr($url, -1) == "/") $url = substr($url, 0, -1);
+
+ //sterge slash-uri in plus si directory index file
+ $canonicalUrl = $this->urlPadding($url);
+
+ if (!strstr($url, $this->currentLocation)) return;
+
+ Link::saveLink2DB($canonicalUrl, $this->getDomain($url), $this->currentPageId);
+ }
+
+
function urlPadding($url) {
return $this->delDuplicateSlashes($this->delDirIndexFile($url));
@@ -190,10 +270,7 @@
$parsedUrl = parse_url($url);
- /*if (substr($parsedUrl['host'], 0, 4) != 'www.') {
-
- $parsedUrl['host'] = 'www.'.$parsedUrl['host'];
- }*/
+
if (substr_count($parsedUrl['host'], '.') < 2) {
$parsedUrl['host'] = 'www.'.$parsedUrl['host'];
@@ -220,57 +297,25 @@
}
//eliminarea slash-ului final
- //$retUrl = substr($retUrl, 0, -1);
+
if (substr($retUrl, -1) == "/") $retUrl = substr($retUrl, 0, -1);
- //crawlerLog("DelDuplicateSlashes ". $retUrl);
-
return $retUrl;
}
- //gaseste toate linkurile
- //le transforma in absolute daca sunt relative
- function processLink($url) {
-
- crawlerLog('Processing link: '.$url);
- $canonicalUrl = null;
- if ($this->isRelativeLink($url)) {
-
- $url = $this->makeAbsoluteLink($url);
- }
- //daca ultimul caracter este '/', il eliminam
- //exemplu wiki.dexonline.ro nu wiki.dexonline.ro/
- if (substr($url, -1) == "/") $url = substr($url, 0, -1);
-
- //sterge slash-uri in plus si directory index file
- $canonicalUrl = $this->urlPadding($url);
-
- //$this->urlResource = parse_url($url);
-
-
-
- if (!strstr($url, $this->currentLocation)) return;
-
-
- $urlHash = $this->getLinkHash($url);
-
- $domain = $this->getDomain($url);
-
- Link::saveLink2DB($canonicalUrl, $domain, $urlHash, $this->currentPageId);
- }
-
function isRelativeLink($url) {
return !strstr($url, "http");
}
-
+ //cauta directorul link-ului curent si returneaza
+ //url-ul spre acel director
function getDeepestDir($url) {
try {
$retVal = substr($url, 0, strrpos($url,'/'));
- //crawlerLog("GetDeepestDir: " . $retVal);
+
if (strstr($retVal, $this->currentLocation))
return $retVal;
else return $url;
@@ -287,100 +332,19 @@
return $this->getDeepestDir($this->currentUrl) .'/'. $url;
}
-
- function getLinkHash($url) {
-
- $liteURL = substr($url, strpos($url, "//") + 2);
- if (strstr($liteURL, "index.php") || strstr($liteURL, "index.asp") ||
- strstr($liteURL, "index.htm"))
- $liteURL = substr($liteURL, 0, strrpos($liteURL, "//"));
- return md5($liteURL);
- }
-
-
function getDomain($url) {
return $this->urlResource['host'];
}
- //returneaza codul HTTP
- function httpResponse() {
-
- return $this->info['http_code'];
- }
-
- //returneaza urmatorul URL ne crawl-at din baza de date sau null daca nu exista
- function getNextLink() {
-
-
- if (!isset($this->justStarted)) {
- $this->justStarted = true;
- return $this->currentUrl;
- }
-
-
- //$nextLink = null;
- try {
- //$nextLink = (string)ORM::for_table('Link')->raw_query("Select concat(domain,canonicalUrl) as concat_link from Link where concat(domain,canonicalUrl) not in (Select url from CrawledPage);")->find_one()->concat_link;
- $nextLink = ORM::for_table('Link')->raw_query("Select canonicalUrl from Link where canonicalUrl not in (Select url from CrawledPage);")->find_one();
-
- if ($nextLink != null) {
-
- return $nextLink->canonicalUrl;
- }
- }
- catch(Exception $ex) {
-
- logException($ex);
- }
-
- return null;
- }
-
-
- function fixHtml($html) {
-
- foreach($html->find('head') as $script) {
-
- $script->outertext = '';
- }
-
-
- foreach($html->find('script') as $script) {
-
- $script->outertext = '';
- }
-
- foreach($html->find('style') as $style) {
-
- $style->outertext = '';
- }
-
- $html->load($html->save());
-
- //transforma pagina raw in simple_html_dom_node
- //$this->dom = str_get_html($pageContent);
-
- $buffer = '<html><body>';
- $nodes = $html->childNodes();
- foreach($nodes as $node) {
-
- $buffer .= $node->innertext();
- }
-
- $buffer .= '</body></html>';
-
- return str_get_html($buffer);
- }
-
//Clasele care deriva aceasta clasa vor trebui
//sa implementeze metodele de mai jos
-
abstract function extractText($domNode);
- abstract function startCrawling($startUrl);
-}
+ abstract function crawlDomain();
+ abstract function start();
+}
?>
\ No newline at end of file
Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/Crawler.php Sun Sep 8 23:31:54 2013 (r979)
@@ -11,7 +11,7 @@
//extrage textul fara cod html
function getText($domNode) {
- $this->plainText = strip_tags($domNode->text());
+ $this->plainText = html_entity_decode(strip_tags($domNode->text()));
//$this->plainText = str_replace(array('\t','\n',' ', ' '), array('','.','',''),strip_tags($domNode->text()));
}
//extrage textul cu cod html din nodul respectiv
@@ -26,26 +26,44 @@
}
}
+ function processPage($pageContent) {
- function startCrawling($startUrl) {
-
- crawlerLog("Started");
+ try {
+
+ $html = str_get_html($pageContent);
+ //reparam html stricat
+ if (!$html->find('body', 0, true)) {
- $this->currentUrl = $this->urlPadding($startUrl);
+ $html = $this->fixHtml($html);
+ }
+
+
+ $this->extractText($html->find('body', 0, true));
+ $this->saveCurrentPage();
+
+ //cata memorie consuma
+ //si eliberare referinte pierdute
+
+ $html->clear();
- crawlerLog('FIRST START URL: '.$this->currentUrl);
+ MemoryManagement::showUsage('before cleaning', true, 'KB');
+
+ MemoryManagement::clean(true);
- $this->urlResource = parse_url($this->currentUrl);
+ MemoryManagement::showUsage('after cleaning', true, 'KB');
+ //niceness
+ sleep(pref_getSectionPreference('crawler', 't_wait'));
+ }
+ catch (Exception $ex) {
- //locatia curenta, va fi folosita pentru a nu depasi sfera
- //de exemplu vrem sa crawlam doar o anumita zona a site-ului
- $this->currentLocation = substr($startUrl, strpos($startUrl, ':') + 3);
- crawlerLog('domain start location: '.$this->currentLocation);
+ logException($ex);
+ }
+ }
- $url = $startUrl;
+ function crawlDomain() {
- $justStarted = true;
+ crawlerLog("Crawling: " . $this->getDomain($this->currentUrl) . " started");
while(1) {
@@ -71,36 +89,61 @@
continue;
}
- try {
+ $this->processPage($pageContent);
+ }
+
+ crawlerLog("Crawling: " . $this->getDomain($this->currentUrl) . " finished");
+ }
- $html = str_get_html($pageContent);
+ function start() {
+
+ crawlerLog("Crawler started");
- //reparam html stricat
- if (!$html->find('body', 0, true)) {
+ $this->domainsList = explode(PHP_EOL, file_get_contents("WhiteList.txt"));
- $html = $this->fixHtml($html);
- }
+ //start processing
+ $this->processWhiteList();
+ crawlerLog('Crawler finished');
+ }
- $this->extractText($html->find('body', 0, true));
- $this->saveCurrentPage();
-
- //cata memorie consuma
- //si eliberare referinte pierdute
- $this->manageMemory();
- //niceness
- sleep(pref_getSectionPreference('crawler', 't_wait'));
- }
- catch (Exception $ex) {
+ function processWhiteList() {
- logException($ex);
- }
- }
+ $multipleLinesComment = false;
+
+ foreach($this->domainsList as $startUrl) {
+ //comentarii pe mai multe linii
+ if (substr($startUrl, 0, 3) == '###')
+ $multipleLinesComment ^= $multipleLinesComment;
+
+ if ($multipleLinesComment)
+ continue;
+ //comentarii pe o singura linie
+ if (substr($startUrl,0,1) == '#')
+ continue;
- crawlerLog('Finished');
+ //curatam url-ul
+ $this->currentUrl = $this->urlPadding($startUrl);
+ //impartim url-ul pe componente
+ $this->urlResource = parse_url($this->currentUrl);
+
+ //salvam startUrl in tabelul Link pentru a incepe extragerea,
+ //startUrl nu va avea o pagina din care este descoperit
+ //asa ca pagina crawledPageId va avea valoarea 0.
+ Link::saveLink2DB($this->currentUrl, $this->getDomain($this->currentUrl), '0');
+
+ //locatia curenta, va fi folosita pentru a nu depasi sfera
+ //de exemplu vrem sa crawlam doar o anumita zona a site-ului
+ $this->currentLocation = substr($this->currentUrl, 0);
+ crawlerLog('domain start location: '.$this->currentLocation);
+
+ $this->crawlDomain();
+ }
+
}
+
}
/*
@@ -109,8 +152,8 @@
if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
$obj = new Crawler();
- //$obj->startCrawling("http://wiki.dexonline.ro/");
- $obj->startCrawling("http://www.romlit.ro");
+
+ $obj->start();
}
?>
\ No newline at end of file
Added: wwwbase/Crawler/MemoryManagement.php
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/MemoryManagement.php Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,45 @@
+<?php
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+
+require_once 'AppLog.php';
+
+class MemoryManagement {
+
+ /*
+ * cleans lost memory refferences
+ */
+ public static function clean($print = false) {
+
+ gc_enable(); // Enable Garbage Collector
+ if ($print) {
+
+ crawlerLog(gc_collect_cycles() . " garbage cycles cleaned"); // # of elements cleaned up
+ }
+ gc_disable(); // Disable Garbage Collector
+ }
+
+ public static function showUsage($message = '', $realUsage = false, $units = "B") {
+
+ $truncate = 1;
+ switch($units) {
+
+ case 'KB':
+ $truncate = pow(10,3);
+ break;
+ case 'MB':
+ $truncate = pow(10,6);
+ break;
+ case 'GB':
+ $truncate = pow(10,9);
+ break;
+ default: //Bytes
+
+ break;
+ }
+
+ crawlerLog("Memory Usage $message: " . sprintf("%.0f", memory_get_usage($realUsage) / $truncate) . ' ' . $units);
+ }
+}
+
+?>
\ No newline at end of file
Added: wwwbase/Crawler/WhiteList.txt
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/WhiteList.txt Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,15 @@
+# câte un domeniu pe linie
+# comentarii pe o singură linie cu #
+
+###
+
+pe mai multe linii încadrați între ### și ###,
+dar ### de închidere să fie singur pe linie
+pentru că altfel ce urmează după aceasta pe
+aceeași linie va fi ignorat
+
+###
+
+#http://wiki.dexonline.ro/
+#http://www.romlit.ro
+http://ro.wikipedia.org/
Modified: wwwbase/Crawler/clean_all.php
==============================================================================
--- wwwbase/Crawler/clean_all.php Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/Crawler/clean_all.php Sun Sep 8 23:31:54 2013 (r979)
@@ -9,6 +9,8 @@
require_once '../../phplib/idiorm/idiorm.php';
+
+
function removeFiles($regexPath) {
exec("rm -rf $regexPath");
Added: wwwbase/Crawler/crawler_dex.conf
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/crawler_dex.conf Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,42 @@
+[crawler]
+
+
+;the location of the raw html page as downloaded
+raw_page_path=RawPage/
+;the pure text page associated
+;with the html one location
+parsed_text_path=ParsedText/
+
+;waiting time between getting pages
+t_wait=10
+
+
+;if this is true, then the application will
+;exit if an exception occured
+exception_exit=true
+
+
+;crawler log
+crawler_log=crawler_log
+diacritics_log=diacritics_log
+;outputs messages to the screen
+;values are true and false
+log2screen=true
+;outputs messages to the file
+;specified by crawler_log
+log2file=true
+new_line=PHP_EOL
+
+
+;the most probable directory index file
+dir_index_file=index
+;the most probable index extensions
+index_file_ext=html,php,aspx,asp,pl,py,jsp
+
+
+;this should be stored somewhere not on public directories
+user_agent_location=/var/www/CrawlerData/user_agent
+
+
+;diacritics list
+diacritics=ăâîșț
Added: wwwbase/Crawler/database_tables.sql
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/database_tables.sql Sun Sep 8 23:31:54 2013 (r979)
@@ -0,0 +1,25 @@
+CREATE TABLE IF NOT EXISTS `CrawledPage` (
+ `id` bigint(20) NOT NULL AUTO_INCREMENT,
+ `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+ `url` varchar(256) NOT NULL,
+ `httpStatus` int(11) NOT NULL,
+ `rawPagePath` varchar(128) NOT NULL,
+ `parsedTextPath` varchar(128) NOT NULL,
+ `createDate` int(4) DEFAULT NULL,
+ `modDate` int(4) DEFAULT NULL,
+ PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;
+
+
+
+CREATE TABLE IF NOT EXISTS `Link` (
+ `id` bigint(20) NOT NULL AUTO_INCREMENT,
+ `canonicalUrl` varchar(256) NOT NULL,
+ `domain` varchar(128) NOT NULL,
+ `crawledPageId` bigint(20) NOT NULL,
+ `createDate` int(4) DEFAULT NULL,
+ `modDate` int(4) DEFAULT NULL,
+ PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;
+
+
Deleted: wwwbase/Crawler/sectiunea_de_crawler_dex.conf
==============================================================================
--- wwwbase/Crawler/sectiunea_de_crawler_dex.conf Sun Sep 8 23:31:54 2013 (r978)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,9 +0,0 @@
-[crawler]
-raw_page_path=RawPage/
-parsed_page_path=ParsedText/
-t_wait=30
-exception_exit=true
-crawler_log=crawler_log
-new_line=PHP_EOL
-log2screen=true
-log2file=true
\ No newline at end of file
Modified: wwwbase/styles/crawler.css
==============================================================================
--- wwwbase/styles/crawler.css Fri Sep 6 17:49:26 2013 (r978)
+++ wwwbase/styles/crawler.css Sun Sep 8 23:31:54 2013 (r979)
@@ -86,7 +86,7 @@
#crawlerTitle img {
- width: 310px !important;
+ width: 330px !important;
height: 50px !important;
margin: 0 auto !important;
}
More information about the Dev
mailing list