[dev] [commit] r985 - phplib/models wwwbase/Crawler

automailer at dexonline.ro automailer at dexonline.ro
Sun Sep 15 20:21:16 EEST 2013


Author: alinu
Date: Sun Sep 15 20:21:16 2013
New Revision: 985

Log:
DiacriticsBuilder.php - am folosit functiile mb_ si StringUtil
Diacritics.php - am implementat update

Modified:
   phplib/models/Diacritics.php
   wwwbase/Crawler/DiacriticsBuilder.php

Modified: phplib/models/Diacritics.php
==============================================================================
--- phplib/models/Diacritics.php	Fri Sep 13 19:06:02 2013	(r984)
+++ phplib/models/Diacritics.php	Sun Sep 15 20:21:16 2013	(r985)
@@ -5,103 +5,60 @@
 	
 	public static $_table = 'Diacritics';
 
-	
-	private static function replaceDiacritic($ch) {
-
-		switch($ch) {
-
-			case 'ă':
-			case 'â':
-				
-				return 'a';
-			
-			case 'î':
-
-				return 'i';
-
-			case 'ș':
+	//inlocuieste diactriticele
+	private static function stripDiacritics($str) {
 
-				return 's';
+		return str_replace(array('ă','â','î','ș','ț'), array('a','a','i','s','t'), $str);
+	}
 
-			case 'ț':
+	/*
+	 * Am definit propria forma de diacritice
+	 * â,î		- circumflex
+	 * ă,ș,ț	- curbat (caciulita si virgulita) 
+	 * a,i,s 	- default
+	 */
 
-				return 't';
+	private static function getDiacriticForm($diacritic, $update = array()) {
 
-			default:
+		$defaultForm	= '0';
+		$curvedForm 	= '0';
+		$circumflexForm	= '0';
 
-				return $ch;
+		if (!empty($update)) {
+			list($defaultForm, $curvedForm, $circumflexForm) = $update;
 		}
-	}
-
-	//inlocuieste diactriticele
-	private static function stripDiacritics($str) {
-
-		$strippedStr = '';
-		
-		$currOffset = 0;
-		$finalOffset = strlen($str) - 1;
-		
-		while($currOffset <= $finalOffset) {
 
-			$ch = '';
-			if ($currOffset == $finalOffset) {
+		if (strstr("âî", $diacritic)) {
 
-				$ch = substr($str, $currOffset, 1);
-				$currOffset ++;
-			}
-			else {
-
-				$ch = substr($str, $currOffset, 2);
-				if (strstr('ăâîșț', $ch)) {
+			$circumflexForm = intval($circumflexForm) + 1;
+		}
+		else if (strstr("ășț", $diacritic)) {
 
-					$currOffset += 2;
-				}
-				else {
-
-					$ch = substr($str, $currOffset, 1);
-					$currOffset ++;
-				}
-			}
+			$curvedForm = intval($curvedForm) + 1;
+		}
+		else { // daca strstr("aist", $diacritic)
 
-			$strippedStr .= self::replaceDiacritic($ch);
+			$defaultForm = intval($defaultForm) + 1;
 		}
+		return array($defaultForm, $curvedForm, $circumflexForm);
 
-		return $strippedStr;
 	}
 
 
-
-	public function insertRow($before, $middle, $after, $diacritic) {
+	public static function insertRow($before, $middle, $after, $diacritic) {
 
 		try {
 			
 			$tableObj = Model::factory(self::$_table);
 			$tableObj->create();
-
-
-			$tableObj->before = $before;
-			$tableObj->middle = $middle;
-			$tableObj->after = $after;
 			
-			
-			$tableObj->defaultForm = '0';
-			$tableObj->curvedForm = '0';
-			$tableObj->circumflexForm = '0';
-
-			if (strstr("âî", $middle)) {
-
-				$tableObj->circumflexForm = '1';
-			}
-			else if (strstr("ășț", $middle)) {
-
-				$tableObj->curvedForm = '1';
-			}
-			else { // if (strstr("aist", $middle))
-
-				$tableObj->defaultForm = '1';
-			}
-
+			$tableObj->before	= $before;
+			$tableObj->middle	= $middle;
+			$tableObj->after	= $after;
 
+			list($tableObj->defaultForm, $tableObj->curvedForm,
+				$tableObj->circumflexForm) = self::getDiacriticForm($diacritic);
+			
 			$tableObj->save();
 		}
 		catch(Exception $ex) {
@@ -110,44 +67,43 @@
 		}
 	}
 
-	
-
-
-	public static function updateRow($before, $middle, $after) {
-
-		return false;
-	}
-
-
-	public static function entryExists($before, $middle, $after) {
-		
-		return false;
-		$foundEntry = Model::factory(self::$_table)->raw_query("Select id from self::$_table where
-				 before = '$before' and middle = '$middle' and after = '$after';")->find_one();
-		if ($foundEntry) {
 
+	
+	public static function updateRow($before, $middle, $after, $diacritic) {
+	
+		try {	
+			$tableObj = Model::factory(self::$_table)->raw_query("Select * from Diacritics where
+				 `before` = '$before' and `middle` = '$middle' and `after` = '$after';")->find_one();
+			if (!$tableObj) {
+				return false;
+			}
+			else {
+				list($tableObj->defaultForm, $tableObj->curvedForm,
+					$tableObj->circumflexForm) = self::getDiacriticForm($diacritic,
+					array($tableObj->defaultForm, $tableObj->curvedForm,
+						$tableObj->circumflexForm));
+				$tableObj->save();
+			}
 			return true;
 		}
+		catch(Exception $e) {
 
+			echo $e;
+		}
+		
 		return false;
 	}
 
 
 	public static function save2Db($before, $middle, $after) {
 
-		$diacritic = substr($middle, 0);
+		$diacritic = mb_substr($middle, 0, 1);
 
 		$before = self::stripDiacritics($before);
 		$middle = self::stripDiacritics($middle);
 		$after = self::stripDiacritics($after);
 			
-
-			
-		if (self::entryExists($before, $middle, $after)) {
-
-			self::updateRow($before, $middle, $after, $diacritic);
-		}
-		else {
+		if (!self::updateRow($before, $middle, $after, $diacritic)) {
 
 			self::insertRow($before, $middle, $after, $diacritic);
 		}

Modified: wwwbase/Crawler/DiacriticsBuilder.php
==============================================================================
--- wwwbase/Crawler/DiacriticsBuilder.php	Fri Sep 13 19:06:02 2013	(r984)
+++ wwwbase/Crawler/DiacriticsBuilder.php	Sun Sep 15 20:21:16 2013	(r985)
@@ -32,7 +32,8 @@
 	private static $paddingChar;
 	private $globalCount;
 	private $localCount;
-	private $currentDir;
+	private $currentFolder;
+	private $folderCount;
 	/*
 	 * initialises instance variables
 	 */
@@ -47,273 +48,129 @@
 		$this->globalCount = 0;
  	}
 
+
+ 	function showProcessingFileStatus($crawledPage) {
+ 		$start  = strpos($crawledPage->parsedTextPath, '/') + 1;
+ 		$length = strrpos($crawledPage->parsedTextPath, '/') - $start; 
+ 		$folder = substr($crawledPage->parsedTextPath, $start, $length);
+	
+		if ($folder != $this->currentFolder) {
+
+			$this->currentFolder = $folder;
+			$this->localCount = 0;
+			$this->folderCount = iterator_count(new DirectoryIterator(substr($crawledPage->parsedTextPath,0,strrpos($crawledPage->parsedTextPath, '/'))));
+		}
+
+		$this->localCount ++;		
+		$this->globalCount ++;
+
+ 		crawlerLog("Total(this run)::$this->globalCount, now processing $folder $this->localCount/".$this->folderCount);
+ 	}
+
 	/* 
 	 * gets the next unprocessed file for diacritics
 	 */
 	function getNextFile() {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
 
-		$crawledPage = CrawledPage::getNextDiacriticsFile();
-		
-		if ($crawledPage == null) {
+		while(1) {
 
-			return null;
+			$crawledPage = CrawledPage::getNextDiacriticsFile();
+			
+			$this->showProcessingFileStatus($crawledPage);
+			
+			if ($crawledPage == null) {
+
+				return null;
+			}
+			FilesUsedInDiacritics::save2Db($crawledPage->id);
+
+			if (is_file($crawledPage->parsedTextPath)) {
+				return $this->toLower(file_get_contents($crawledPage->parsedTextPath));
+			}
 		}
-		FilesUsedInDiacritics::save2Db($crawledPage->id);
 
-		return $this->toLower(file_get_contents($crawledPage->parsedTextPath));
+		return null;
 	}
 
 
 	function toLower($content) {
-
-		$content = str_replace(array('Ă','Â','Î','Ș','Ț'), array('ă', 'â', 'î', 'ș', 'ț'), $content);
-
-		return strtolower($content);
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		return mb_strtolower($content);
 	}
+	/* verifica daca $ch este un caracter din lista
+	 * [a,i,s,t,ă,â,î,ș,ț]
+	 */
+	static function isPossibleDiacritic($ch) {
 
-
+		return strstr(self::$diacritics, $ch) || strstr(self::$nonDiacritics, $ch);
+	}
+	/* returneaza urmatorul index in fisier care contine
+	 * un caracter din lista [a,i,s,t,ă,â,î,ș,ț]
+	 */
 	function getNextOffset() {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-		
-
 		while($this->currOffset <= $this->fileEndOffset) {
 			//daca urmatorul offset e a,i,s,t sau ă,â,î,ș,ț
-			$ch = substr($this->file, $this->currOffset, 1);
-			if (strstr(self::$nonDiacritics, $ch)) {
-				
+			if (self::isPossibleDiacritic(StringUtil::getCharAt($this->file, $this->currOffset))) {
 				return $this->currOffset ++;
 			}
-			else {
-
-				$ch = substr($this->file, $this->currOffset, 2);
-
-				if (strstr(self::$diacritics, $ch)) {
-				
-					$this->currOffset += 2;
-
-					return $this->currOffset - 2;
-				}
-			}
-
-
-
-			//trecem la urmatorul caracter
 			$this->currOffset ++;
 		}
-
 		return null;
 	}
 
-	function isSeparator($ch) {
+	static function isSeparator($ch) {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-		
 		return !(ctype_lower($ch) || strstr(self::$diacritics, $ch) || $ch == '-');
 	}
-
-	function pointOfInterestPadding($offset) {
+	/*
+	 * in the word arhivare, the 'i' padding is *arh  i  vare
+	 */
+	function leftAndRightPadding($offset) {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-		
 		$before = '';
-		if (strstr(self::$diacritics, substr($this->file, $offset, 2)))
-			$middle = substr($this->file, $offset, 2);
-		else
-			$middle = substr($this->file, $offset, 1);
-
+		$middle = StringUtil::getCharAt($this->file, $offset);
 		$after = '';
-
+		$infOffset = $offset - 1;
+		$supOffset = $offset + 1;
 		$infPadding = false;
 		$supPadding = false;
-
-
-		//echo "OFFSET ".$offset. '  char '.substr($this->file, $offset, 1).PHP_EOL; 
-
-
-		$infOffset = $offset - 2;
-
-		$supOffset = $offset + strlen($middle);
-
-		$firstLetter = false;
-		$lastLetter = false;
+		
 
 		for ($i = 0; $i < self::$paddingNumber; $i++) {
-
-			//before
-
+			
 			if ($infOffset < 0) {
-				//daca e primul caracter
-				if ($infOffset + 1 == 0) {
-
-					$firstLetter = true;
-				}
-				else {
-
-					$infPadding = true;
-				}
+				$infPadding = true;
 			}
-
-
+			$infCh = StringUtil::getCharAt($this->file, $infOffset);
+			$infPadding = self::isSeparator($infCh);
 
 			if ($infPadding) {
-
 				$before = self::$paddingChar . $before;
 			}
 			else {
-
-				if ($firstLetter) {
-
-					$infCh = substr($this->file, $infOffset, 1);
-
-					if ($this->isSeparator($infCh)) {
-
-						$infPadding = true;
-						$before = self::$paddingChar . $before;
-					}
-					else {
-
-						$before = $infCh . $before;
-					}
-
-				}
-
-				else {
-
-					$infCh = substr($this->file, $infOffset, 2);
-
-					if (!strstr(self::$diacritics, $infCh)) {
-						$infOffset ++;
-						$infCh = substr($this->file, $infOffset, 1);
-					}
-
-					if ($this->isSeparator($infCh)) {
-
-						$infPadding = true;
-						$before = self::$paddingChar . $before;
-					}
-					else {
-
-						$before = $infCh . $before;
-
-						$infOffset -= 2;
-					}
-				}
+				$before = $infCh . $before;
+				$infOffset --;
 			}
 
-			//after
-
 			if ($supOffset > $this->fileEndOffset) {
-				//daca e ultimul caracter
-				if ($supOffset - 1 == $this->fileEndOffset) {
-
-					$lastLetter = true;
-				}
-				else {
-
-					$supPadding = true;
-				}
+				$supPadding = true;
 			}
 
-
+			$supCh = StringUtil::getCharAt($this->file, $supOffset);
+			$supPadding = self::isSeparator($supCh);
 
 			if ($supPadding) {
-
-				$after .= self::$paddingChar;
-			}
-			else {
-
-				if ($lastLetter) {
-
-					$supCh = substr($this->file, $supOffset, 1);
-
-					if ($this->isSeparator($infCh)) {
-
-						$supPadding = true;
-						$after = self::$paddingChar . $after;
-					}
-					else {
-
-						$after .= $supCh;
-					}
-
-				}
-
-				else {
-
-					$supCh = substr($this->file, $supOffset, 2);
-
-					if (!strstr(self::$diacritics, $supCh)) {
-						
-						$supCh = substr($this->file, $supOffset, 1);
-					}
-
-					if ($this->isSeparator($supCh)) {
-
-						$supPadding = true;
-						$after .= self::$paddingChar;
-					}
-					else {
-
-						$after .= $supCh;
-
-						$supOffset += strlen($supCh);
-					}
-				}
-			}
-
-
-		}
-/*
-			$supCh = substr($this->file, $superiorOffset, 2);
-
-			if (!strstr(self::$diacritics, $supCh)) {
-
-				$supCh = substr($this->file, $superiorOffset, 1);
-			}
-
-			
-
-
-
-			if ($inferiorOffset < 0) {
-				
-				$before = self::$paddingChar . $before;
+				$after = $after . self::$paddingChar;
 			}
 			else {
-				
-				$ch = substr($this->file, $inferiorOffset, 1);
-			
-				if (!$inferiorSeparator) {
-					
-					$inferiorSeparator = $this->isSeparator($ch);
-				}
-
-				$before = ($inferiorSeparator ? self::$paddingChar : $ch) . $before;
+				$after = $after . $supCh;
+				$supOffset ++;
 			}
-
-			if ($superiorOffset > $this->fileEndOffset) {
-				
-				$after .= self::$paddingChar;
-			}
-			else {
-
-			
-				$ch = substr($this->file, $superiorOffset, 1);
-			
-				if (!$superiorSeparator) {
-					
-					$superiorSeparator = $this->isSeparator($ch);
-				}
-
-				$after .= ($superiorSeparator ? self::$paddingChar : $ch);
-			}
-
 		}
 
-		//echo "RESULT   $before|$middle|$after".PHP_EOL;
-*/
-
 		Diacritics::save2Db($before, $middle, $after);
-
 	}
 
 
@@ -322,20 +179,17 @@
 
 		$this->file = $file;
 		$this->currOffset = 0;
-		$this->fileEndOffset = strlen($file) - 1;
+		$this->fileEndOffset = mb_strlen($file) - 1;
 
-		while(($offset = $this->getNextOffset()) != null) {
+		while(($offset = $this->getNextOffset()) != '') {
 
-			$this->pointOfInterestPadding($offset);
+			$this->leftAndRightPadding($offset);
 		}
 	}
 
 	function start() {
 		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-
 		while(($file = $this->getNextFile()) != null) {
-
-
 			$this->processFile($file);
 			MemoryManagement::clean();
 		}


More information about the Dev mailing list