[dev] [commit] r1003 - in wwwbase: . Crawler styles

automailer at dexonline.ro automailer at dexonline.ro
Mon Sep 23 09:15:45 EEST 2013


Author: alinu
Date: Mon Sep 23 09:15:44 2013
New Revision: 1003

Log:
AbstractCrawler.php - nu stocheaza linkuri inutile in baza de date (img, pdf, etc).
diacritics.php - pastreaza formatul textului initial (tab si \n), rezolvat eroare cu inceput de text (offset 0 vs null) la cuvantul 'introducere' spre exemplu 
clean.php - curata baza de date si fisierele produse de Crawler sau de DiacriticsBuilder
diacritics_fix.css - adaugat stil la div id="text_input" + aliniere text stanga

Added:
   wwwbase/Crawler/clean.php
Modified:
   wwwbase/Crawler/AbstractCrawler.php
   wwwbase/Crawler/AppLog.php
   wwwbase/Crawler/Crawler.php
   wwwbase/Crawler/DiacriticsBuilder.php
   wwwbase/Crawler/MemoryManagement.php
   wwwbase/Crawler/index.php
   wwwbase/diacritice.php
   wwwbase/styles/diacritics_fix.css

Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/Crawler/AbstractCrawler.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -31,7 +31,7 @@
 
 	protected $urlResource;
 	protected $directoryIndexFile;
-	protected $IndexFileExt;
+	protected $indexFileExt;
 
 	protected $domainsList;
 
@@ -41,7 +41,8 @@
 		$this->plainText = '';
 		$this->pageContent = '';
 		$this->directoryIndexFile = pref_getSectionPreference('crawler', 'dir_index_file');
-		$this->IndexFileExt = explode(',', pref_getSectionPreference('crawler', 'index_file_ext'));
+		$this->indexFileExt = explode(',', pref_getSectionPreference('crawler', 'index_file_ext'));
+		$this->fileExt = explode(',', pref_getSectionPreference('crawler', 'index_file_ext').',txt');
 	}
 
 
@@ -49,11 +50,11 @@
 	function getPage($url) {
 
 		$this->ch = curl_init();
-
+		crawlerLog(file_get_contents(pref_getSectionPreference('crawler', 'user_agent_location')));
 		curl_setopt ($this->ch, CURLOPT_URL, $url);
 		curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
 		curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(pref_getSectionPreference('crawler', 'user_agent_location')));
-		curl_setopt ($this->ch, CURLOPT_TIMEOUT, 60);
+		curl_setopt ($this->ch, CURLOPT_TIMEOUT, 30);
 		curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
 		curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
 		curl_setopt($this->ch, CURLOPT_COOKIEFILE, 'cookie_jar');
@@ -210,6 +211,24 @@
 		return str_get_html($buffer);
     }
 
+    function eligeableUrl($url) {
+
+    	$resource = parse_url($url);
+    	$pathInfo = pathinfo($resource['path']);
+
+    	if (isset($pathInfo['extension'])) {
+
+    		$ext = $pathInfo['extension'];
+
+
+    		if (array_search($ext, $this->fileExt) === false) {
+
+    			return false;
+    		}
+    	}
+
+    	return true;
+    }
 
     //metode pentru prelucrarea linkurilor
 	//sterge directory index file si elimina slash-urile in plus
@@ -217,6 +236,12 @@
 	//le transforma in absolute daca sunt relative
 	function processLink($url) {
 
+
+		if (!$this->eligeableUrl($url)) {
+
+			return;
+		}
+
 		crawlerLog('Processing link: '.$url);
 		$canonicalUrl = null;
 		if ($this->isRelativeLink($url)) {
@@ -247,7 +272,7 @@
 
 		//crawlerLog('delDirIndexFile  '.$url);
 
-		foreach($this->IndexFileExt as $ext) {
+		foreach($this->indexFileExt as $ext) {
 
 			$target = $this->directoryIndexFile .'.'. $ext;
 
@@ -347,4 +372,4 @@
 	abstract function start();
 }
 
-?>
\ No newline at end of file
+?>1
\ No newline at end of file

Modified: wwwbase/Crawler/AppLog.php
==============================================================================
--- wwwbase/Crawler/AppLog.php	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/Crawler/AppLog.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -3,8 +3,6 @@
  * Alin Ungureanu, 2013
  * alyn.cti at gmail.com
  */
-require_once '../../phplib/util.php';
-require_once '../../phplib/serverPreferences.php';
 
 $exceptionExit = pref_getSectionPreference('crawler', 'exception_exit');
 $logFile = pref_getSectionPreference('crawler', 'crawler_log');
@@ -13,9 +11,29 @@
  * $level poate fi de forma :  __FILE__.' - '.__CLASS__.'::'.__FUNCTION__.' line '.__LINE__
  * sau mai simplu
  */
+function getCorrespondentNewLine() {
+
+	//daca este terminal
+	if (PHP_SAPI == 'cli') {
+		return PHP_EOL;
+	}
+	//altfel este browser
+	else return '<br>';
+}
+
 function crawlerLog($message, $level = '') {
 
 	global $logFile;
+
+	//afisaza sau nu in log "INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - '
+	// . 'line '.__LINE__ acolo unde exista
+	if (!pref_getSectionPreference('crawler', 'function_trace')) {
+
+		if (substr($message, 0, 6) == 'INSIDE')
+			return;
+	}
+
+
 	//log in fisier
 	if (pref_getSectionPreference('crawler', 'log2file'))
 	try{
@@ -30,7 +48,7 @@
 	//log in stdout
 	if(pref_getSectionPreference('crawler', 'log2screen')) {
 
-		echo date("Y-m-d H:i:s") . '::' . $level . '::' . $message.pref_getSectionPreference('crawler', 'new_line');
+		echo date("Y-m-d H:i:s") . '::' . $level . '::' . $message.getCorrespondentNewLine();
 		flush();
 	}
 

Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/Crawler/Crawler.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -130,8 +130,6 @@
 				continue;
 
 
-			
-
 			//curatam url-ul
 			$this->currentUrl = $this->urlPadding($startUrl);
 			//impartim url-ul pe componente

Modified: wwwbase/Crawler/DiacriticsBuilder.php
==============================================================================
--- wwwbase/Crawler/DiacriticsBuilder.php	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/Crawler/DiacriticsBuilder.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -22,14 +22,14 @@
 class DiacriticsBuilder {
 
 
-	private $currOffset;
+	protected $currOffset;
 	protected $file;
-	private $fileEndOffset;
+	protected $fileEndOffset;
 
-	private static $diacritics;
-	private static $nonDiacritics;
-	private static $paddingNumber;
-	private static $paddingChar;
+	protected static $diacritics;
+	protected static $nonDiacritics;
+	protected static $paddingNumber;
+	protected static $paddingChar;
 	private $globalCount;
 	private $localCount;
 	private $currentFolder;
@@ -41,7 +41,7 @@
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
 
 		self::$diacritics = pref_getSectionPreference("crawler", "diacritics");
-		self::$nonDiacritics = pref_getSectionPreference("crawler", "non_diacritics");
+		self::$nonDiacritics = pref_getSectionPreference("crawler", "non_lower_diacritics");
 		self::$paddingNumber = pref_getSectionPreference('crawler', 'diacritics_padding_length');
 		self::$paddingChar = pref_getSectionPreference('crawler', 'padding_char');
 
@@ -77,15 +77,16 @@
 
 			$crawledPage = CrawledPage::getNextDiacriticsFile();
 			
-			$this->showProcessingFileStatus($crawledPage);
-			
 			if ($crawledPage == null) {
 
 				return null;
 			}
+
+			$this->showProcessingFileStatus($crawledPage);
+
 			FilesUsedInDiacritics::save2Db($crawledPage->id);
 
-			if (is_file($crawledPage->parsedTextPath)) {
+			if (is_file($crawledPage->parsedTextPath) || $crawledPage->httpStatus < 400) {
 				return $this->toLower(file_get_contents($crawledPage->parsedTextPath));
 			}
 		}
@@ -141,32 +142,37 @@
 		for ($i = 0; $i < self::$paddingNumber; $i++) {
 			
 			if ($infOffset < 0) {
-				$infPadding = true;
-			}
-			$infCh = StringUtil::getCharAt($this->file, $infOffset);
-			$infPadding = self::isSeparator($infCh);
-
-			if ($infPadding) {
 				$before = self::$paddingChar . $before;
 			}
 			else {
-				$before = $infCh . $before;
-				$infOffset --;
-			}
-
-			if ($supOffset > $this->fileEndOffset) {
-				$supPadding = true;
+				if (!$infPadding) {
+					$infCh = StringUtil::getCharAt($this->text, $infOffset);
+					$infPadding = self::isSeparator($infCh);
+				}
+				if ($infPadding) {
+					$before = self::$paddingChar . $before;
+				}
+				else {
+					$before = $infCh . $before;
+					$infOffset --;
+				}
 			}
-
-			$supCh = StringUtil::getCharAt($this->file, $supOffset);
-			$supPadding = self::isSeparator($supCh);
-
-			if ($supPadding) {
+			
+			if ($supOffset > $this->textEndOffset) {
 				$after = $after . self::$paddingChar;
 			}
 			else {
-				$after = $after . $supCh;
-				$supOffset ++;
+				if (!$supPadding) {
+					$supCh = StringUtil::getCharAt($this->text, $supOffset);
+					$supPadding = self::isSeparator($supCh);
+				}
+				if ($supPadding) {
+					$after = $after . self::$paddingChar;
+				}
+				else {
+					$after = $after . $supCh;
+					$supOffset ++;
+				}
 			}
 		}
 
@@ -181,8 +187,8 @@
 		$this->currOffset = 0;
 		$this->fileEndOffset = mb_strlen($file) - 1;
 
-		while(($offset = $this->getNextOffset()) != '') {
-
+		while(($offset = $this->getNextOffset()) !== null) {
+			
 			$this->leftAndRightPadding($offset);
 		}
 	}
@@ -190,9 +196,12 @@
 	function start() {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
 		while(($file = $this->getNextFile()) != null) {
+			
 			$this->processFile($file);
 			MemoryManagement::clean();
 		}
+
+		crawlerLog("Finished");
 	}
 }
 

Modified: wwwbase/Crawler/MemoryManagement.php
==============================================================================
--- wwwbase/Crawler/MemoryManagement.php	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/Crawler/MemoryManagement.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -1,8 +1,4 @@
 <?php
-require_once '../../phplib/util.php';
-require_once '../../phplib/serverPreferences.php';
-
-require_once 'AppLog.php';
 
 class MemoryManagement {
 
@@ -42,4 +38,4 @@
 	}
 }
 
-?>
\ No newline at end of file
+?>

Added: wwwbase/Crawler/clean.php
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ wwwbase/Crawler/clean.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -0,0 +1,75 @@
+<?php
+/*
+ * Alin Ungureanu, 2013
+ * alyn.cti at gmail.com
+ */
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+require_once '../../phplib/db.php';
+require_once '../../phplib/idiorm/idiorm.php';
+
+
+function printUsage() {
+	echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [ -d | --diacritics]".PHP_EOL;
+	flush();
+	exit();
+}
+
+if (count($argv) == 1) printUsage();
+
+db_init();
+
+$db = ORM::get_db();
+$db->beginTransaction();
+
+
+
+if ($argv[1] == '--crawler' || $argv[1] == '-c') {
+
+
+	function removeFiles($regexPath) {
+
+		exec("rm -rf $regexPath");
+	}
+
+	try {
+
+		//sterge toate fisierele salvate
+		removeFiles('ParsedText/*');
+		removeFiles('RawPage/*');
+
+
+		echo 'files deleted'.pref_getSectionPreference('crawler', 'new_line');
+
+	    $db->exec('TRUNCATE Table CrawledPage;');
+	    $db->exec('TRUNCATE Table Link;');
+	    $db->commit();
+
+		echo "tables 'Link' and 'CrawledPage' truncated".pref_getSectionPreference('crawler', 'new_line');
+
+		echo 'The cleaning process was successful'.pref_getSectionPreference('crawler', 'new_line');
+	}
+
+	catch(Exception $ex) {
+
+		echo 'The cleaning process encountered a problem '.pref_getSectionPreference('crawler', 'new_line').$ex->getMessage();
+	}
+}
+else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
+
+	try{
+		$db->exec('TRUNCATE Table Diacritics;');
+		$db->exec('TRUNCATE Table FilesUsedInDiacritics;');
+	    $db->commit();
+		echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated".pref_getSectionPreference('crawler', 'new_line');
+		echo 'The cleaning process was successful'.pref_getSectionPreference('crawler', 'new_line');
+	}
+	catch(Exception $e) {
+
+		echo 'The cleaning process encountered a problem '.pref_getSectionPreference('crawler', 'new_line').$ex->getMessage();
+	}
+
+}
+else printUsage(); 
+/**/
+?>
\ No newline at end of file

Modified: wwwbase/Crawler/index.php
==============================================================================
--- wwwbase/Crawler/index.php	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/Crawler/index.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -30,4 +30,4 @@
 
 SmartyWrap::smartyDisplay('crawler/crawler.ihtml');
 
-?>
+?>
\ No newline at end of file

Modified: wwwbase/diacritice.php
==============================================================================
--- wwwbase/diacritice.php	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/diacritice.php	Mon Sep 23 09:15:44 2013	(r1003)
@@ -57,8 +57,11 @@
 	 */
 	function getNextOffset() {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		
 		while($this->currOffset <= $this->textEndOffset) {
 			//daca urmatorul offset e a,i,s,t sau ă,â,î,ș,ț
+			crawlerLog(StringUtil::getCharAt($this->text, $this->currOffset) . ' - offset ' .$this->currOffset);
+
 			if (self::isPossibleDiacritic(StringUtil::getCharAt($this->text, $this->currOffset))) {
 				return $this->currOffset ++;
 			}
@@ -86,21 +89,26 @@
 
 		$this->textEndOffset = mb_strlen($text) - 1;
 		$offset = 0;
-		while(($offset = $this->getNextOffset()) != null) {
-
+		while(($offset = $this->getNextOffset()) !== null ) {
+			
 			$this->leftAndRightPadding($offset);
 		}
+
 		//copiem de la ultimul posibil diacritic pana la final
-		$this->resultText .= mb_substr($this->text, $this->lastOffset, $this->textEndOffset - $this->lastOffset + 1);
-		$this->hiddenText .= mb_substr($this->text, $this->lastOffset, $this->textEndOffset - $this->lastOffset + 1);
+		$lastChunk = mb_substr($this->text, $this->lastOffset, $this->textEndOffset - $this->lastOffset + 1);
+		$this->resultText .= $lastChunk;
+		$this->hiddenText .= $lastChunk;
 	}
 
 
 	public function fix($text) {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-
+		if (mb_strlen($text) > pref_getSectionPreference('crawler', 'diacritics_buffer_limit')) {
+			return "Dimensiune text prea mare, limita este de " .
+			pref_getSectionPreference('crawler', 'diacritics_buffer_limit') . ' de caractere';
+		}
 		$this->processText($text);
-		return $this->resultText;
+		return $this->text2Html($this->resultText);
 	}
 
 	static function toLower($content) {
@@ -130,35 +138,39 @@
 		for ($i = 0; $i < self::$paddingNumber; $i++) {
 			
 			if ($infOffset < 0) {
-				$infPadding = true;
-			}
-			else {
-				
-				$infCh = StringUtil::getCharAt($this->text, $infOffset);
-				$infPadding = self::isSeparator($infCh);
-			}
-			
-			if ($infPadding) {
 				$before = self::$paddingChar . $before;
 			}
 			else {
-				$before = $infCh . $before;
-				$infOffset --;
+				if (!$infPadding) {
+					$infCh = StringUtil::getCharAt($this->text, $infOffset);
+					$infPadding = self::isSeparator($infCh);
+				}
+				if ($infPadding) {
+					$before = self::$paddingChar . $before;
+				}
+				else {
+					$before = $infCh . $before;
+					$infOffset --;
+				}
 			}
+			
+			
 
 			if ($supOffset > $this->textEndOffset) {
-				$supPadding = true;
-			}
-			else {
-				$supCh = StringUtil::getCharAt($this->text, $supOffset);
-				$supPadding = self::isSeparator($supCh);
-			}
-			if ($supPadding) {
 				$after = $after . self::$paddingChar;
 			}
 			else {
-				$after = $after . $supCh;
-				$supOffset ++;
+				if (!$supPadding) {
+					$supCh = StringUtil::getCharAt($this->text, $supOffset);
+					$supPadding = self::isSeparator($supCh);
+				}
+				if ($supPadding) {
+					$after = $after . self::$paddingChar;
+				}
+				else {
+					$after = $after . $supCh;
+					$supOffset ++;
+				}
 			}
 		}
 
@@ -281,6 +293,12 @@
 		return $this->hiddenText;
 	}
 
+	function text2Html($content) {
+
+		//new line to <br> si tab to space( )
+		return preg_replace('/[\t]/', '    ', nl2br($content));
+	}
+
 	function replaceDiacritics() {
 
 		if (isset($_POST['hiddenText'])) {

Modified: wwwbase/styles/diacritics_fix.css
==============================================================================
--- wwwbase/styles/diacritics_fix.css	Sun Sep 22 16:51:22 2013	(r1002)
+++ wwwbase/styles/diacritics_fix.css	Mon Sep 23 09:15:44 2013	(r1003)
@@ -34,4 +34,22 @@
 
 	border:1px solid gray;
 	background-color:#EEEEF6;
+}
+
+#text_input {
+
+	position: relative;
+	text-align:left;
+
+	overflow-y: scroll;
+
+	width: 580px;
+	height: 250px;
+
+	resize: none;
+
+	margin: 10px;
+
+	border:1px solid gray;
+	background-color:#EEEEF6;
 }
\ No newline at end of file


More information about the Dev mailing list