[dev] [commit] r985 - phplib/models wwwbase/Crawler
automailer at dexonline.ro
automailer at dexonline.ro
Sun Sep 15 20:21:16 EEST 2013
Author: alinu
Date: Sun Sep 15 20:21:16 2013
New Revision: 985
Log:
DiacriticsBuilder.php - am folosit functiile mb_ si StringUtil
Diacritics.php - am implementat update
Modified:
phplib/models/Diacritics.php
wwwbase/Crawler/DiacriticsBuilder.php
Modified: phplib/models/Diacritics.php
==============================================================================
--- phplib/models/Diacritics.php Fri Sep 13 19:06:02 2013 (r984)
+++ phplib/models/Diacritics.php Sun Sep 15 20:21:16 2013 (r985)
@@ -5,103 +5,60 @@
public static $_table = 'Diacritics';
-
- private static function replaceDiacritic($ch) {
-
- switch($ch) {
-
- case 'ă':
- case 'â':
-
- return 'a';
-
- case 'î':
-
- return 'i';
-
- case 'ș':
+ //inlocuieste diacriticele
+ private static function stripDiacritics($str) {
- return 's';
+ return str_replace(array('ă','â','î','ș','ț'), array('a','a','i','s','t'), $str);
+ }
- case 'ț':
+ /*
+ * Am definit propria forma de diacritice
+ * â,î - circumflex
+ * ă,ș,ț - curbat (caciulita si virgulita)
+ * a,i,s,t - default
+ */
- return 't';
+ private static function getDiacriticForm($diacritic, $update = array()) {
- default:
+ $defaultForm = '0';
+ $curvedForm = '0';
+ $circumflexForm = '0';
- return $ch;
+ if (!empty($update)) {
+ list($defaultForm, $curvedForm, $circumflexForm) = $update;
}
- }
-
- //inlocuieste diactriticele
- private static function stripDiacritics($str) {
-
- $strippedStr = '';
-
- $currOffset = 0;
- $finalOffset = strlen($str) - 1;
-
- while($currOffset <= $finalOffset) {
- $ch = '';
- if ($currOffset == $finalOffset) {
+ if (strstr("âî", $diacritic)) {
- $ch = substr($str, $currOffset, 1);
- $currOffset ++;
- }
- else {
-
- $ch = substr($str, $currOffset, 2);
- if (strstr('ăâîșț', $ch)) {
+ $circumflexForm = intval($circumflexForm) + 1;
+ }
+ else if (strstr("ășț", $diacritic)) {
- $currOffset += 2;
- }
- else {
-
- $ch = substr($str, $currOffset, 1);
- $currOffset ++;
- }
- }
+ $curvedForm = intval($curvedForm) + 1;
+ }
+ else { // daca strstr("aist", $diacritic)
- $strippedStr .= self::replaceDiacritic($ch);
+ $defaultForm = intval($defaultForm) + 1;
}
+ return array($defaultForm, $curvedForm, $circumflexForm);
- return $strippedStr;
}
-
- public function insertRow($before, $middle, $after, $diacritic) {
+ public static function insertRow($before, $middle, $after, $diacritic) {
try {
$tableObj = Model::factory(self::$_table);
$tableObj->create();
-
-
- $tableObj->before = $before;
- $tableObj->middle = $middle;
- $tableObj->after = $after;
-
- $tableObj->defaultForm = '0';
- $tableObj->curvedForm = '0';
- $tableObj->circumflexForm = '0';
-
- if (strstr("âî", $middle)) {
-
- $tableObj->circumflexForm = '1';
- }
- else if (strstr("ășț", $middle)) {
-
- $tableObj->curvedForm = '1';
- }
- else { // if (strstr("aist", $middle))
-
- $tableObj->defaultForm = '1';
- }
-
+ $tableObj->before = $before;
+ $tableObj->middle = $middle;
+ $tableObj->after = $after;
+ list($tableObj->defaultForm, $tableObj->curvedForm,
+ $tableObj->circumflexForm) = self::getDiacriticForm($diacritic);
+
$tableObj->save();
}
catch(Exception $ex) {
@@ -110,44 +67,43 @@
}
}
-
-
-
- public static function updateRow($before, $middle, $after) {
-
- return false;
- }
-
-
- public static function entryExists($before, $middle, $after) {
-
- return false;
- $foundEntry = Model::factory(self::$_table)->raw_query("Select id from self::$_table where
- before = '$before' and middle = '$middle' and after = '$after';")->find_one();
- if ($foundEntry) {
+
+ public static function updateRow($before, $middle, $after, $diacritic) {
+
+ try {
+ $tableObj = Model::factory(self::$_table)->raw_query("Select * from Diacritics where
+ `before` = '$before' and `middle` = '$middle' and `after` = '$after';")->find_one();
+ if (!$tableObj) {
+ return false;
+ }
+ else {
+ list($tableObj->defaultForm, $tableObj->curvedForm,
+ $tableObj->circumflexForm) = self::getDiacriticForm($diacritic,
+ array($tableObj->defaultForm, $tableObj->curvedForm,
+ $tableObj->circumflexForm));
+ $tableObj->save();
+ }
return true;
}
+ catch(Exception $e) {
+ echo $e;
+ }
+
return false;
}
public static function save2Db($before, $middle, $after) {
- $diacritic = substr($middle, 0);
+ $diacritic = mb_substr($middle, 0, 1);
$before = self::stripDiacritics($before);
$middle = self::stripDiacritics($middle);
$after = self::stripDiacritics($after);
-
-
- if (self::entryExists($before, $middle, $after)) {
-
- self::updateRow($before, $middle, $after, $diacritic);
- }
- else {
+ if (!self::updateRow($before, $middle, $after, $diacritic)) {
self::insertRow($before, $middle, $after, $diacritic);
}
Modified: wwwbase/Crawler/DiacriticsBuilder.php
==============================================================================
--- wwwbase/Crawler/DiacriticsBuilder.php Fri Sep 13 19:06:02 2013 (r984)
+++ wwwbase/Crawler/DiacriticsBuilder.php Sun Sep 15 20:21:16 2013 (r985)
@@ -32,7 +32,8 @@
private static $paddingChar;
private $globalCount;
private $localCount;
- private $currentDir;
+ private $currentFolder;
+ private $folderCount;
/*
* initialises instance variables
*/
@@ -47,273 +48,129 @@
$this->globalCount = 0;
}
+
+ function showProcessingFileStatus($crawledPage) {
+ $start = strpos($crawledPage->parsedTextPath, '/') + 1;
+ $length = strrpos($crawledPage->parsedTextPath, '/') - $start;
+ $folder = substr($crawledPage->parsedTextPath, $start, $length);
+
+ if ($folder != $this->currentFolder) {
+
+ $this->currentFolder = $folder;
+ $this->localCount = 0;
+ $this->folderCount = iterator_count(new DirectoryIterator(substr($crawledPage->parsedTextPath,0,strrpos($crawledPage->parsedTextPath, '/'))));
+ }
+
+ $this->localCount ++;
+ $this->globalCount ++;
+
+ crawlerLog("Total(this run)::$this->globalCount, now processing $folder $this->localCount/".$this->folderCount);
+ }
+
/*
* gets the next unprocessed file for diacritics
*/
function getNextFile() {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
- $crawledPage = CrawledPage::getNextDiacriticsFile();
-
- if ($crawledPage == null) {
+ while(1) {
- return null;
+ $crawledPage = CrawledPage::getNextDiacriticsFile();
+
+ $this->showProcessingFileStatus($crawledPage);
+
+ if ($crawledPage == null) {
+
+ return null;
+ }
+ FilesUsedInDiacritics::save2Db($crawledPage->id);
+
+ if (is_file($crawledPage->parsedTextPath)) {
+ return $this->toLower(file_get_contents($crawledPage->parsedTextPath));
+ }
}
- FilesUsedInDiacritics::save2Db($crawledPage->id);
- return $this->toLower(file_get_contents($crawledPage->parsedTextPath));
+ return null;
}
function toLower($content) {
-
- $content = str_replace(array('Ă','Â','Î','Ș','Ț'), array('ă', 'â', 'î', 'ș', 'ț'), $content);
-
- return strtolower($content);
+ crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+ return mb_strtolower($content);
}
+ /* verifica daca $ch este un caracter din lista
+ * [a,i,s,t,ă,â,î,ș,ț]
+ */
+ static function isPossibleDiacritic($ch) {
-
+ return strstr(self::$diacritics, $ch) || strstr(self::$nonDiacritics, $ch);
+ }
+ /* returneaza urmatorul index in fisier care contine
+ * un caracter din lista [a,i,s,t,ă,â,î,ș,ț]
+ */
function getNextOffset() {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-
-
while($this->currOffset <= $this->fileEndOffset) {
//daca urmatorul offset e a,i,s,t sau ă,â,î,ș,ț
- $ch = substr($this->file, $this->currOffset, 1);
- if (strstr(self::$nonDiacritics, $ch)) {
-
+ if (self::isPossibleDiacritic(StringUtil::getCharAt($this->file, $this->currOffset))) {
return $this->currOffset ++;
}
- else {
-
- $ch = substr($this->file, $this->currOffset, 2);
-
- if (strstr(self::$diacritics, $ch)) {
-
- $this->currOffset += 2;
-
- return $this->currOffset - 2;
- }
- }
-
-
-
- //trecem la urmatorul caracter
$this->currOffset ++;
}
-
return null;
}
- function isSeparator($ch) {
+ static function isSeparator($ch) {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-
return !(ctype_lower($ch) || strstr(self::$diacritics, $ch) || $ch == '-');
}
-
- function pointOfInterestPadding($offset) {
+ /*
+ * e.g. for the 'i' in the word "arhivare" the padding is: before = "*arh", middle = "i", after = "vare"
+ */
+ function leftAndRightPadding($offset) {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-
$before = '';
- if (strstr(self::$diacritics, substr($this->file, $offset, 2)))
- $middle = substr($this->file, $offset, 2);
- else
- $middle = substr($this->file, $offset, 1);
-
+ $middle = StringUtil::getCharAt($this->file, $offset);
$after = '';
-
+ $infOffset = $offset - 1;
+ $supOffset = $offset + 1;
$infPadding = false;
$supPadding = false;
-
-
- //echo "OFFSET ".$offset. ' char '.substr($this->file, $offset, 1).PHP_EOL;
-
-
- $infOffset = $offset - 2;
-
- $supOffset = $offset + strlen($middle);
-
- $firstLetter = false;
- $lastLetter = false;
+
for ($i = 0; $i < self::$paddingNumber; $i++) {
-
- //before
-
+
if ($infOffset < 0) {
- //daca e primul caracter
- if ($infOffset + 1 == 0) {
-
- $firstLetter = true;
- }
- else {
-
- $infPadding = true;
- }
+ $infPadding = true;
}
-
-
+ $infCh = StringUtil::getCharAt($this->file, $infOffset);
+ $infPadding = self::isSeparator($infCh);
if ($infPadding) {
-
$before = self::$paddingChar . $before;
}
else {
-
- if ($firstLetter) {
-
- $infCh = substr($this->file, $infOffset, 1);
-
- if ($this->isSeparator($infCh)) {
-
- $infPadding = true;
- $before = self::$paddingChar . $before;
- }
- else {
-
- $before = $infCh . $before;
- }
-
- }
-
- else {
-
- $infCh = substr($this->file, $infOffset, 2);
-
- if (!strstr(self::$diacritics, $infCh)) {
- $infOffset ++;
- $infCh = substr($this->file, $infOffset, 1);
- }
-
- if ($this->isSeparator($infCh)) {
-
- $infPadding = true;
- $before = self::$paddingChar . $before;
- }
- else {
-
- $before = $infCh . $before;
-
- $infOffset -= 2;
- }
- }
+ $before = $infCh . $before;
+ $infOffset --;
}
- //after
-
if ($supOffset > $this->fileEndOffset) {
- //daca e ultimul caracter
- if ($supOffset - 1 == $this->fileEndOffset) {
-
- $lastLetter = true;
- }
- else {
-
- $supPadding = true;
- }
+ $supPadding = true;
}
-
+ $supCh = StringUtil::getCharAt($this->file, $supOffset);
+ $supPadding = self::isSeparator($supCh);
if ($supPadding) {
-
- $after .= self::$paddingChar;
- }
- else {
-
- if ($lastLetter) {
-
- $supCh = substr($this->file, $supOffset, 1);
-
- if ($this->isSeparator($infCh)) {
-
- $supPadding = true;
- $after = self::$paddingChar . $after;
- }
- else {
-
- $after .= $supCh;
- }
-
- }
-
- else {
-
- $supCh = substr($this->file, $supOffset, 2);
-
- if (!strstr(self::$diacritics, $supCh)) {
-
- $supCh = substr($this->file, $supOffset, 1);
- }
-
- if ($this->isSeparator($supCh)) {
-
- $supPadding = true;
- $after .= self::$paddingChar;
- }
- else {
-
- $after .= $supCh;
-
- $supOffset += strlen($supCh);
- }
- }
- }
-
-
- }
-/*
- $supCh = substr($this->file, $superiorOffset, 2);
-
- if (!strstr(self::$diacritics, $supCh)) {
-
- $supCh = substr($this->file, $superiorOffset, 1);
- }
-
-
-
-
-
- if ($inferiorOffset < 0) {
-
- $before = self::$paddingChar . $before;
+ $after = $after . self::$paddingChar;
}
else {
-
- $ch = substr($this->file, $inferiorOffset, 1);
-
- if (!$inferiorSeparator) {
-
- $inferiorSeparator = $this->isSeparator($ch);
- }
-
- $before = ($inferiorSeparator ? self::$paddingChar : $ch) . $before;
+ $after = $after . $supCh;
+ $supOffset ++;
}
-
- if ($superiorOffset > $this->fileEndOffset) {
-
- $after .= self::$paddingChar;
- }
- else {
-
-
- $ch = substr($this->file, $superiorOffset, 1);
-
- if (!$superiorSeparator) {
-
- $superiorSeparator = $this->isSeparator($ch);
- }
-
- $after .= ($superiorSeparator ? self::$paddingChar : $ch);
- }
-
}
- //echo "RESULT $before|$middle|$after".PHP_EOL;
-*/
-
Diacritics::save2Db($before, $middle, $after);
-
}
@@ -322,20 +179,17 @@
$this->file = $file;
$this->currOffset = 0;
- $this->fileEndOffset = strlen($file) - 1;
+ $this->fileEndOffset = mb_strlen($file) - 1;
- while(($offset = $this->getNextOffset()) != null) {
+ while(($offset = $this->getNextOffset()) != '') {
- $this->pointOfInterestPadding($offset);
+ $this->leftAndRightPadding($offset);
}
}
function start() {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-
while(($file = $this->getNextFile()) != null) {
-
-
$this->processFile($file);
MemoryManagement::clean();
}
More information about the Dev
mailing list