[dev] [commit] r1007 - phplib/models wwwbase wwwbase/Crawler
automailer at dexonline.ro
automailer at dexonline.ro
Wed Sep 25 21:31:23 EEST 2013
Author: alinu
Date: Wed Sep 25 21:31:23 2013
New Revision: 1007
Log:
CrawledPage, uitasem sa aduc ultima versiune, uitasem un ->find_one() la ultima functie
AbstractCrawler.php - eligeableUrl este eligibleUrl
DiacriticsBuilder.php si diacritice.php - schimbat modul de adaugare in baza de date pentru imbunatatirea vitezei: reverse(before)
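exemplu: "sumar" era stocat ca before|middle|after = **sum|a|r****, acum este mus**|a|r**** (before este inversat, iar padding-ul se adauga la sfarsit)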
Modified:
phplib/models/CrawledPage.php
wwwbase/Crawler/AbstractCrawler.php
wwwbase/Crawler/DiacriticsBuilder.php
wwwbase/diacritice.php
Modified: phplib/models/CrawledPage.php
==============================================================================
--- phplib/models/CrawledPage.php Wed Sep 25 09:58:45 2013 (r1006)
+++ phplib/models/CrawledPage.php Wed Sep 25 21:31:23 2013 (r1007)
@@ -38,7 +38,7 @@
function getNextDiacriticsFile() {
- return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);");
+ return Model::factory(self::$_table)->raw_query("select id, parsedTextPath from CrawledPage where id not in (select fileId from FilesUsedInDiacritics);")->find_one();
}
Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php Wed Sep 25 09:58:45 2013 (r1006)
+++ wwwbase/Crawler/AbstractCrawler.php Wed Sep 25 21:31:23 2013 (r1007)
@@ -211,7 +211,7 @@
return str_get_html($buffer);
}
- function eligeableUrl($url) {
+ function eligibleUrl($url) {
$resource = parse_utf8_url($url);
$pathInfo = pathinfo($resource['path']);
@@ -237,7 +237,7 @@
function processLink($url) {
- if (!$this->eligeableUrl($url)) {
+ if (!$this->eligibleUrl($url)) {
return;
}
Modified: wwwbase/Crawler/DiacriticsBuilder.php
==============================================================================
--- wwwbase/Crawler/DiacriticsBuilder.php Wed Sep 25 09:58:45 2013 (r1006)
+++ wwwbase/Crawler/DiacriticsBuilder.php Wed Sep 25 21:31:23 2013 (r1007)
@@ -58,13 +58,12 @@
$this->currentFolder = $folder;
$this->localCount = 0;
- $this->folderCount = iterator_count(new DirectoryIterator(substr($crawledPage->parsedTextPath,0,strrpos($crawledPage->parsedTextPath, '/'))));
}
$this->localCount ++;
$this->globalCount ++;
- crawlerLog("Total(this run)::$this->globalCount, now processing $folder $this->localCount/".$this->folderCount);
+ crawlerLog("Total(this run)::$this->globalCount, now processing $folder");
}
/*
@@ -77,6 +76,7 @@
$crawledPage = CrawledPage::getNextDiacriticsFile();
+
if ($crawledPage == null) {
return null;
@@ -142,28 +142,31 @@
for ($i = 0; $i < self::$paddingNumber; $i++) {
if ($infOffset < 0) {
- $before = self::$paddingChar . $before;
+ //$before = self::$paddingChar . $before;
+ $before = $before . self::$paddingChar;
}
else {
if (!$infPadding) {
- $infCh = StringUtil::getCharAt($this->text, $infOffset);
+ $infCh = StringUtil::getCharAt($this->file, $infOffset);
$infPadding = self::isSeparator($infCh);
}
if ($infPadding) {
- $before = self::$paddingChar . $before;
+ //$before = self::$paddingChar . $before;
+ $before = $before . self::$paddingChar;
}
else {
- $before = $infCh . $before;
+ //$before = $infCh . $before;
+ $before = $before . $infCh;
$infOffset --;
}
- }
-
- if ($supOffset > $this->textEndOffset) {
+ }
+
+ if ($supOffset > $this->fileEndOffset) {
$after = $after . self::$paddingChar;
}
else {
if (!$supPadding) {
- $supCh = StringUtil::getCharAt($this->text, $supOffset);
+ $supCh = StringUtil::getCharAt($this->file, $supOffset);
$supPadding = self::isSeparator($supCh);
}
if ($supPadding) {
@@ -188,7 +191,7 @@
$this->fileEndOffset = mb_strlen($file) - 1;
while(($offset = $this->getNextOffset()) !== null) {
-
+
$this->leftAndRightPadding($offset);
}
}
Modified: wwwbase/diacritice.php
==============================================================================
--- wwwbase/diacritice.php Wed Sep 25 09:58:45 2013 (r1006)
+++ wwwbase/diacritice.php Wed Sep 25 21:31:23 2013 (r1007)
@@ -138,7 +138,8 @@
for ($i = 0; $i < self::$paddingNumber; $i++) {
if ($infOffset < 0) {
- $before = self::$paddingChar . $before;
+ //$before = self::$paddingChar . $before;
+ $before = $before . self::$paddingChar;
}
else {
if (!$infPadding) {
@@ -146,15 +147,15 @@
$infPadding = self::isSeparator($infCh);
}
if ($infPadding) {
- $before = self::$paddingChar . $before;
+ //$before = self::$paddingChar . $before;
+ $before = $before . self::$paddingChar;
}
else {
- $before = $infCh . $before;
+ //$before = $infCh . $before;
+ $before = $before . $infCh;
$infOffset --;
}
- }
-
-
+ }
if ($supOffset > $this->textEndOffset) {
$after = $after . self::$paddingChar;
@@ -174,6 +175,7 @@
}
}
+
crawlerLog("IN TEXT " . $before .'|' . $middle . '|' . $after);
$tableObj = Diacritics::entryExists($before, $middle, $after);
More information about the Dev mailing list