[dev] [commit] r1003 - in wwwbase: . Crawler styles
automailer at dexonline.ro
automailer at dexonline.ro
Mon Sep 23 09:15:45 EEST 2013
Author: alinu
Date: Mon Sep 23 09:15:44 2013
New Revision: 1003
Log:
AbstractCrawler.php - nu stocheaza linkuri inutile in baza de date (img, pdf, etc).
diacritics.php - pastreaza formatul textului initial (tab si \n), rezolvat eroare cu inceput de text (offset 0 vs null) la cuvantul 'introducere' spre exemplu
clean.php - curata baza de date si fisierele produse de Crawler sau de DiacriticsBuilder
diacritics_fix.css - adaugat stil la div id="text_input" + aliniere text stanga
Added:
wwwbase/Crawler/clean.php
Modified:
wwwbase/Crawler/AbstractCrawler.php
wwwbase/Crawler/AppLog.php
wwwbase/Crawler/Crawler.php
wwwbase/Crawler/DiacriticsBuilder.php
wwwbase/Crawler/MemoryManagement.php
wwwbase/Crawler/index.php
wwwbase/diacritice.php
wwwbase/styles/diacritics_fix.css
Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/Crawler/AbstractCrawler.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -31,7 +31,7 @@
protected $urlResource;
protected $directoryIndexFile;
- protected $IndexFileExt;
+ protected $indexFileExt;
protected $domainsList;
@@ -41,7 +41,8 @@
$this->plainText = '';
$this->pageContent = '';
$this->directoryIndexFile = pref_getSectionPreference('crawler', 'dir_index_file');
- $this->IndexFileExt = explode(',', pref_getSectionPreference('crawler', 'index_file_ext'));
+ $this->indexFileExt = explode(',', pref_getSectionPreference('crawler', 'index_file_ext'));
+ $this->fileExt = explode(',', pref_getSectionPreference('crawler', 'index_file_ext').',txt');
}
@@ -49,11 +50,11 @@
function getPage($url) {
$this->ch = curl_init();
-
+ crawlerLog(file_get_contents(pref_getSectionPreference('crawler', 'user_agent_location')));
curl_setopt ($this->ch, CURLOPT_URL, $url);
curl_setopt ($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt ($this->ch, CURLOPT_USERAGENT, file_get_contents(pref_getSectionPreference('crawler', 'user_agent_location')));
- curl_setopt ($this->ch, CURLOPT_TIMEOUT, 60);
+ curl_setopt ($this->ch, CURLOPT_TIMEOUT, 30);
curl_setopt ($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt ($this->ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($this->ch, CURLOPT_COOKIEFILE, 'cookie_jar');
@@ -210,6 +211,24 @@
return str_get_html($buffer);
}
+ function eligeableUrl($url) {
+
+ $resource = parse_url($url);
+ $pathInfo = pathinfo($resource['path']);
+
+ if (isset($pathInfo['extension'])) {
+
+ $ext = $pathInfo['extension'];
+
+
+ if (array_search($ext, $this->fileExt) === false) {
+
+ return false;
+ }
+ }
+
+ return true;
+ }
//metode pentru prelucrarea linkurilor
//sterge directory index file si elimina slash-urile in plus
@@ -217,6 +236,12 @@
//le transforma in absolute daca sunt relative
function processLink($url) {
+
+ if (!$this->eligeableUrl($url)) {
+
+ return;
+ }
+
crawlerLog('Processing link: '.$url);
$canonicalUrl = null;
if ($this->isRelativeLink($url)) {
@@ -247,7 +272,7 @@
//crawlerLog('delDirIndexFile '.$url);
- foreach($this->IndexFileExt as $ext) {
+ foreach($this->indexFileExt as $ext) {
$target = $this->directoryIndexFile .'.'. $ext;
@@ -347,4 +372,4 @@
abstract function start();
}
-?>
\ No newline at end of file
+?>1
\ No newline at end of file
Modified: wwwbase/Crawler/AppLog.php
==============================================================================
--- wwwbase/Crawler/AppLog.php Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/Crawler/AppLog.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -3,8 +3,6 @@
* Alin Ungureanu, 2013
* alyn.cti at gmail.com
*/
-require_once '../../phplib/util.php';
-require_once '../../phplib/serverPreferences.php';
$exceptionExit = pref_getSectionPreference('crawler', 'exception_exit');
$logFile = pref_getSectionPreference('crawler', 'crawler_log');
@@ -13,9 +11,29 @@
* $level poate fi de forma : __FILE__.' - '.__CLASS__.'::'.__FUNCTION__.' line '.__LINE__
* sau mai simplu
*/
+function getCorrespondentNewLine() {
+
+ //daca este terminal
+ if (PHP_SAPI == 'cli') {
+ return PHP_EOL;
+ }
+ //altfel este browser
+ else return '<br>';
+}
+
function crawlerLog($message, $level = '') {
global $logFile;
+
+ //afisaza sau nu in log "INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - '
+ // . 'line '.__LINE__ acolo unde exista
+ if (!pref_getSectionPreference('crawler', 'function_trace')) {
+
+ if (substr($message, 0, 6) == 'INSIDE')
+ return;
+ }
+
+
//log in fisier
if (pref_getSectionPreference('crawler', 'log2file'))
try{
@@ -30,7 +48,7 @@
//log in stdout
if(pref_getSectionPreference('crawler', 'log2screen')) {
- echo date("Y-m-d H:i:s") . '::' . $level . '::' . $message.pref_getSectionPreference('crawler', 'new_line');
+ echo date("Y-m-d H:i:s") . '::' . $level . '::' . $message.getCorrespondentNewLine();
flush();
}
Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/Crawler/Crawler.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -130,8 +130,6 @@
continue;
-
-
//curatam url-ul
$this->currentUrl = $this->urlPadding($startUrl);
//impartim url-ul pe componente
Modified: wwwbase/Crawler/DiacriticsBuilder.php
==============================================================================
--- wwwbase/Crawler/DiacriticsBuilder.php Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/Crawler/DiacriticsBuilder.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -22,14 +22,14 @@
class DiacriticsBuilder {
- private $currOffset;
+ protected $currOffset;
protected $file;
- private $fileEndOffset;
+ protected $fileEndOffset;
- private static $diacritics;
- private static $nonDiacritics;
- private static $paddingNumber;
- private static $paddingChar;
+ protected static $diacritics;
+ protected static $nonDiacritics;
+ protected static $paddingNumber;
+ protected static $paddingChar;
private $globalCount;
private $localCount;
private $currentFolder;
@@ -41,7 +41,7 @@
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
self::$diacritics = pref_getSectionPreference("crawler", "diacritics");
- self::$nonDiacritics = pref_getSectionPreference("crawler", "non_diacritics");
+ self::$nonDiacritics = pref_getSectionPreference("crawler", "non_lower_diacritics");
self::$paddingNumber = pref_getSectionPreference('crawler', 'diacritics_padding_length');
self::$paddingChar = pref_getSectionPreference('crawler', 'padding_char');
@@ -77,15 +77,16 @@
$crawledPage = CrawledPage::getNextDiacriticsFile();
- $this->showProcessingFileStatus($crawledPage);
-
if ($crawledPage == null) {
return null;
}
+
+ $this->showProcessingFileStatus($crawledPage);
+
FilesUsedInDiacritics::save2Db($crawledPage->id);
- if (is_file($crawledPage->parsedTextPath)) {
+ if (is_file($crawledPage->parsedTextPath) || $crawledPage->httpStatus < 400) {
return $this->toLower(file_get_contents($crawledPage->parsedTextPath));
}
}
@@ -141,32 +142,37 @@
for ($i = 0; $i < self::$paddingNumber; $i++) {
if ($infOffset < 0) {
- $infPadding = true;
- }
- $infCh = StringUtil::getCharAt($this->file, $infOffset);
- $infPadding = self::isSeparator($infCh);
-
- if ($infPadding) {
$before = self::$paddingChar . $before;
}
else {
- $before = $infCh . $before;
- $infOffset --;
- }
-
- if ($supOffset > $this->fileEndOffset) {
- $supPadding = true;
+ if (!$infPadding) {
+ $infCh = StringUtil::getCharAt($this->text, $infOffset);
+ $infPadding = self::isSeparator($infCh);
+ }
+ if ($infPadding) {
+ $before = self::$paddingChar . $before;
+ }
+ else {
+ $before = $infCh . $before;
+ $infOffset --;
+ }
}
-
- $supCh = StringUtil::getCharAt($this->file, $supOffset);
- $supPadding = self::isSeparator($supCh);
-
- if ($supPadding) {
+
+ if ($supOffset > $this->textEndOffset) {
$after = $after . self::$paddingChar;
}
else {
- $after = $after . $supCh;
- $supOffset ++;
+ if (!$supPadding) {
+ $supCh = StringUtil::getCharAt($this->text, $supOffset);
+ $supPadding = self::isSeparator($supCh);
+ }
+ if ($supPadding) {
+ $after = $after . self::$paddingChar;
+ }
+ else {
+ $after = $after . $supCh;
+ $supOffset ++;
+ }
}
}
@@ -181,8 +187,8 @@
$this->currOffset = 0;
$this->fileEndOffset = mb_strlen($file) - 1;
- while(($offset = $this->getNextOffset()) != '') {
-
+ while(($offset = $this->getNextOffset()) !== null) {
+
$this->leftAndRightPadding($offset);
}
}
@@ -190,9 +196,12 @@
function start() {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
while(($file = $this->getNextFile()) != null) {
+
$this->processFile($file);
MemoryManagement::clean();
}
+
+ crawlerLog("Finished");
}
}
Modified: wwwbase/Crawler/MemoryManagement.php
==============================================================================
--- wwwbase/Crawler/MemoryManagement.php Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/Crawler/MemoryManagement.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -1,8 +1,4 @@
<?php
-require_once '../../phplib/util.php';
-require_once '../../phplib/serverPreferences.php';
-
-require_once 'AppLog.php';
class MemoryManagement {
@@ -42,4 +38,4 @@
}
}
-?>
\ No newline at end of file
+?>
Added: wwwbase/Crawler/clean.php
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/clean.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -0,0 +1,75 @@
+<?php
+/*
+ * Alin Ungureanu, 2013
+ * alyn.cti at gmail.com
+ */
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+require_once '../../phplib/db.php';
+require_once '../../phplib/idiorm/idiorm.php';
+
+
+function printUsage() {
+ echo "::Usage::".PHP_EOL."php clean_all.php [ -c | --crawler] [ -d | --diacritics]".PHP_EOL;
+ flush();
+ exit();
+}
+
+if (count($argv) == 1) printUsage();
+
+db_init();
+
+$db = ORM::get_db();
+$db->beginTransaction();
+
+
+
+if ($argv[1] == '--crawler' || $argv[1] == '-c') {
+
+
+ function removeFiles($regexPath) {
+
+ exec("rm -rf $regexPath");
+ }
+
+ try {
+
+ //sterge toate fisierele salvate
+ removeFiles('ParsedText/*');
+ removeFiles('RawPage/*');
+
+
+ echo 'files deleted'.pref_getSectionPreference('crawler', 'new_line');
+
+ $db->exec('TRUNCATE Table CrawledPage;');
+ $db->exec('TRUNCATE Table Link;');
+ $db->commit();
+
+ echo "tables 'Link' and 'CrawledPage' truncated".pref_getSectionPreference('crawler', 'new_line');
+
+ echo 'The cleaning process was successful'.pref_getSectionPreference('crawler', 'new_line');
+ }
+
+ catch(Exception $ex) {
+
+ echo 'The cleaning process encountered a problem '.pref_getSectionPreference('crawler', 'new_line').$ex->getMessage();
+ }
+}
+else if ($argv[1] == '--diacritics' || $argv[1] == '-d') {
+
+ try{
+ $db->exec('TRUNCATE Table Diacritics;');
+ $db->exec('TRUNCATE Table FilesUsedInDiacritics;');
+ $db->commit();
+ echo "tables 'Diacritics' and 'FilesUsedInDiacritics' truncated".pref_getSectionPreference('crawler', 'new_line');
+ echo 'The cleaning process was successful'.pref_getSectionPreference('crawler', 'new_line');
+ }
+ catch(Exception $e) {
+
+ echo 'The cleaning process encountered a problem '.pref_getSectionPreference('crawler', 'new_line').$ex->getMessage();
+ }
+
+}
+else printUsage();
+/**/
+?>
\ No newline at end of file
Modified: wwwbase/Crawler/index.php
==============================================================================
--- wwwbase/Crawler/index.php Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/Crawler/index.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -30,4 +30,4 @@
SmartyWrap::smartyDisplay('crawler/crawler.ihtml');
-?>
+?>
\ No newline at end of file
Modified: wwwbase/diacritice.php
==============================================================================
--- wwwbase/diacritice.php Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/diacritice.php Mon Sep 23 09:15:44 2013 (r1003)
@@ -57,8 +57,11 @@
*/
function getNextOffset() {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+
while($this->currOffset <= $this->textEndOffset) {
//daca urmatorul offset e a,i,s,t sau ă,â,î,ș,ț
+ crawlerLog(StringUtil::getCharAt($this->text, $this->currOffset) . ' - offset ' .$this->currOffset);
+
if (self::isPossibleDiacritic(StringUtil::getCharAt($this->text, $this->currOffset))) {
return $this->currOffset ++;
}
@@ -86,21 +89,26 @@
$this->textEndOffset = mb_strlen($text) - 1;
$offset = 0;
- while(($offset = $this->getNextOffset()) != null) {
-
+ while(($offset = $this->getNextOffset()) !== null ) {
+
$this->leftAndRightPadding($offset);
}
+
//copiem de la ultimul posibil diacritic pana la final
- $this->resultText .= mb_substr($this->text, $this->lastOffset, $this->textEndOffset - $this->lastOffset + 1);
- $this->hiddenText .= mb_substr($this->text, $this->lastOffset, $this->textEndOffset - $this->lastOffset + 1);
+ $lastChunk = mb_substr($this->text, $this->lastOffset, $this->textEndOffset - $this->lastOffset + 1);
+ $this->resultText .= $lastChunk;
+ $this->hiddenText .= $lastChunk;
}
public function fix($text) {
crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
-
+ if (mb_strlen($text) > pref_getSectionPreference('crawler', 'diacritics_buffer_limit')) {
+ return "Dimensiune text prea mare, limita este de " .
+ pref_getSectionPreference('crawler', 'diacritics_buffer_limit') . ' de caractere';
+ }
$this->processText($text);
- return $this->resultText;
+ return $this->text2Html($this->resultText);
}
static function toLower($content) {
@@ -130,35 +138,39 @@
for ($i = 0; $i < self::$paddingNumber; $i++) {
if ($infOffset < 0) {
- $infPadding = true;
- }
- else {
-
- $infCh = StringUtil::getCharAt($this->text, $infOffset);
- $infPadding = self::isSeparator($infCh);
- }
-
- if ($infPadding) {
$before = self::$paddingChar . $before;
}
else {
- $before = $infCh . $before;
- $infOffset --;
+ if (!$infPadding) {
+ $infCh = StringUtil::getCharAt($this->text, $infOffset);
+ $infPadding = self::isSeparator($infCh);
+ }
+ if ($infPadding) {
+ $before = self::$paddingChar . $before;
+ }
+ else {
+ $before = $infCh . $before;
+ $infOffset --;
+ }
}
+
+
if ($supOffset > $this->textEndOffset) {
- $supPadding = true;
- }
- else {
- $supCh = StringUtil::getCharAt($this->text, $supOffset);
- $supPadding = self::isSeparator($supCh);
- }
- if ($supPadding) {
$after = $after . self::$paddingChar;
}
else {
- $after = $after . $supCh;
- $supOffset ++;
+ if (!$supPadding) {
+ $supCh = StringUtil::getCharAt($this->text, $supOffset);
+ $supPadding = self::isSeparator($supCh);
+ }
+ if ($supPadding) {
+ $after = $after . self::$paddingChar;
+ }
+ else {
+ $after = $after . $supCh;
+ $supOffset ++;
+ }
}
}
@@ -281,6 +293,12 @@
return $this->hiddenText;
}
+ function text2Html($content) {
+
+ //new line to <br> si tab to space( )
+ return preg_replace('/[\t]/', ' ', nl2br($content));
+ }
+
function replaceDiacritics() {
if (isset($_POST['hiddenText'])) {
Modified: wwwbase/styles/diacritics_fix.css
==============================================================================
--- wwwbase/styles/diacritics_fix.css Sun Sep 22 16:51:22 2013 (r1002)
+++ wwwbase/styles/diacritics_fix.css Mon Sep 23 09:15:44 2013 (r1003)
@@ -34,4 +34,22 @@
border:1px solid gray;
background-color:#EEEEF6;
+}
+
+#text_input {
+
+ position: relative;
+ text-align:left;
+
+ overflow-y: scroll;
+
+ width: 580px;
+ height: 250px;
+
+ resize: none;
+
+ margin: 10px;
+
+ border:1px solid gray;
+ background-color:#EEEEF6;
}
\ No newline at end of file
More information about the Dev mailing list