[dev] [commit] r932 - phplib/models templates/crawler wwwbase/Crawler wwwbase/ajax wwwbase/js wwwbase/styles
automailer at dexonline.ro
automailer at dexonline.ro
Sun Aug 18 00:16:16 EEST 2013
Author: alinu
Date: Sun Aug 18 00:16:16 2013
New Revision: 932
Log:
Added:
phplib/models/CrawledPage.php
phplib/models/Link.php
templates/crawler/
templates/crawler/crawler.ihtml
wwwbase/Crawler/index.php
wwwbase/ajax/fetchCrawlerStatus.php
wwwbase/js/crawler_ajax.js
wwwbase/styles/crawler.css
Modified:
wwwbase/Crawler/AbstractCrawler.php
wwwbase/Crawler/Crawler.php
wwwbase/Crawler/clean_all.php
Added: phplib/models/CrawledPage.php
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ phplib/models/CrawledPage.php Sun Aug 18 00:16:16 2013 (r932)
@@ -0,0 +1,41 @@
+<?php
+
+
+/** Model for the CrawledPage table. */
+class CrawledPage extends BaseObject implements DatedObject {
+
+    public static $_table = 'CrawledPage';
+
+    /**
+     * Stores the details of the page that was just crawled.
+     * @return mixed id of the saved row, or null if saving threw (the exception is logged)
+     */
+    public static function savePage2DB($url, $httpStatus, $rawPagePath, $parsedTextPath, $timestamp) {
+        try {
+            $tableObj = Model::factory(self::$_table);
+            $tableObj->create();
+            $tableObj->timestamp = $timestamp;
+            $tableObj->url = $url;
+            $tableObj->httpStatus = $httpStatus;
+            $tableObj->rawPagePath = $rawPagePath;
+            $tableObj->parsedTextPath = $parsedTextPath;
+            $tableObj->save();
+
+            return $tableObj->id;
+        } catch (Exception $ex) {
+            logException($ex);
+        }
+
+        return null;
+    }
+
+    /**
+     * Lists each crawled host once (columns: id, domain), ordered by the lowest page id of the host.
+     * The host is the text between "//" and the next "/", so http and https URLs are both handled.
+     */
+    public static function getListOfDomains() {
+        return Model::factory(self::$_table)->raw_query("select min(id) as id, substring_index(substring_index(url, '/', 3), '/', -1) as domain from CrawledPage group by domain order by min(id) asc;")->find_many();
+    }
+}
+
+?>
\ No newline at end of file
Added: phplib/models/Link.php
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ phplib/models/Link.php Sun Aug 18 00:16:16 2013 (r932)
@@ -0,0 +1,40 @@
+<?php
+
+
+/** Model for the Link table. */
+class Link extends BaseObject {
+
+    public static $_table = 'Link';
+
+    /**
+     * Adds a new row to the Link table unless the canonical URL is already stored.
+     *
+     * @return mixed id of the new row; null when the link already exists or when
+     *               saving threw (the exception is logged)
+     */
+    public static function saveLink2DB($canonicalUrl, $domain, $urlHash, $crawledPageId) {
+        // never insert the same link twice
+        $alreadyStored = Model::factory(self::$_table)->where('canonicalUrl', $canonicalUrl)->find_one();
+        if ($alreadyStored) {
+            return;
+        }
+
+        try {
+            $row = Model::factory(self::$_table);
+            $row->create();
+            $row->canonicalUrl = $canonicalUrl;
+            $row->domain = $domain;
+            $row->urlHash = $urlHash;
+            $row->crawledPageId = $crawledPageId;
+            $row->save();
+
+            return $row->id;
+        } catch (Exception $ex) {
+            logException($ex);
+        }
+
+        return null;
+    }
+}
+
+?>
\ No newline at end of file
Added: templates/crawler/crawler.ihtml
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ templates/crawler/crawler.ihtml Sun Aug 18 00:16:16 2013 (r932)
@@ -0,0 +1,53 @@
+<html>
+ <head>
+
+ <title>{$page_title}</title>
+
+ <script type="text/javascript" src='http://code.jquery.com/jquery-2.0.3.js'></script>
+
+ <script type="text/javascript" src='../js/crawler_ajax.js'></script>
+
+ <link rel="StyleSheet" type="text/css" href="../styles/crawler.css"/>
+
+ </head>
+ <body>
+
+
+ <div id="crawlerTitle">
+ <img src="../img/crawler/romanian_crawler_log.png">
+ </div>
+
+
+ <div id="selectDomain">
+ <br>
+ <span class="selectDomain">Select domain: </span>
+
+ <select name="dropDown">
+ {html_options values=$values output=$options}
+ </select>
+
+ <br>
+ </div>
+
+ <div id="info">
+ <span class="domain">
+ <center>Showing: <span class="inSelection">all</span></center>
+ <br>
+ </span>
+ <span class="infoTitle"><center>General Stats</center></span>
+
+ <div class="infoPanel">
+ <span class="total"></span>
+ </div>
+
+ <span class="infoTitle"><center>HTTP Code Stats</center></span>
+
+ <div class="infoPanel">
+ <span class="perHttpCode"></span>
+ </div>
+ </div>
+
+ <img id="logo" src="../img/logo-dexonline-2.png"/>
+
+ </body>
+</html>
Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php Sat Aug 17 19:55:07 2013 (r931)
+++ wwwbase/Crawler/AbstractCrawler.php Sun Aug 18 00:16:16 2013 (r932)
@@ -7,7 +7,7 @@
require_once '../../phplib/serverPreferences.php';
require_once '../../phplib/db.php';
require_once '../../phplib/idiorm/idiorm.php';
-
+require_once '../../phplib/idiorm/paris.php';
require_once 'AppLog.php';
@@ -259,53 +259,8 @@
$domain = $this->getDomain($url);
- $this->saveLink2DB($canonicalUrl, $domain, $urlHash, $this->currentPageId);
- }
-
- //adauga o intrare nou in tabelul Link
- function saveLink2DB($canonicalUrl, $domain, $urlHash, $crawledPageId) {
-
- //nu inseram acelasi link de 2 ori
- if (ORM::for_table('Link')->where('canonicalUrl', $canonicalUrl)->find_one()) {
- return;
- }
-
- try {
- $tableObj = ORM::for_table("Link");
- $tableObj->create();
- $tableObj->canonicalUrl = $canonicalUrl;
- $tableObj->domain = $domain;
- $tableObj->urlHash = $urlHash;
- $tableObj->crawledPageId = $crawledPageId;
- $tableObj->save();
- }
- catch(Exception $ex) {
-
- logException($ex);
- }
+ Link::saveLink2DB($canonicalUrl, $domain, $urlHash, $this->currentPageId);
}
- //salveaza informatiile despre pagina curent crawl-ata in tabelul CrawledPage
- function savePage2DB($url, $httpStatus, $rawPagePath, $parsedTextPath) {
-
- try {
- $tableObj = ORM::for_table("CrawledPage");
- $tableObj->create();
- $tableObj->timestamp = $this->currentTimestamp;
- $tableObj->url = $url;
- $tableObj->httpStatus = ''.$this->info["http_code"];
- $tableObj->rawPagePath = $rawPagePath;
- $tableObj->parsedTextPath = $parsedTextPath;
- $tableObj->save();
-
- $this->currentPageId = ORM::for_table('CrawledPage')->order_by_desc('id')->find_one()->id;
-
- }
- catch(Exception $ex) {
-
- logException($ex);
- }
- }
-
function isRelativeLink($url) {
Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php Sat Aug 17 19:55:07 2013 (r931)
+++ wwwbase/Crawler/Crawler.php Sun Aug 18 00:16:16 2013 (r932)
@@ -62,7 +62,7 @@
$this->setStorePageParams();
//salveaza o intrare despre pagina curenta in baza de date
- $this->savePage2DB($this->currentUrl, $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath);
+ $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
//daca pagina nu e in format html (e imagine sau alt fisier)
//sau daca am primit un cod HTTP de eroare, sarim peste pagina acesta
@@ -102,7 +102,7 @@
if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
$obj = new Crawler();
- $obj->startCrawling("http://wiki.dexonline.ro/");
- //$obj->startCrawling("http://www.romlit.ro");
+ //$obj->startCrawling("http://wiki.dexonline.ro/");
+ $obj->startCrawling("http://www.romlit.ro");
}
?>
\ No newline at end of file
Modified: wwwbase/Crawler/clean_all.php
==============================================================================
--- wwwbase/Crawler/clean_all.php Sat Aug 17 19:55:07 2013 (r931)
+++ wwwbase/Crawler/clean_all.php Sun Aug 18 00:16:16 2013 (r932)
@@ -13,10 +13,7 @@
exec("rm -rf $regexPath");
}
-$user_ag = pref_getSectionPreference('crawler', 'user_agent_location').pref_getSectionPreference('crawler', 'new_line');
-echo file_get_contents($user_ag);
-/*
try {
//sterge toate fisierele salvate
Added: wwwbase/Crawler/index.php
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/Crawler/index.php Sun Aug 18 00:16:16 2013 (r932)
@@ -0,0 +1,33 @@
+<?php
+
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+require_once '../../phplib/db.php';
+require_once '../../phplib/idiorm/idiorm.php';
+require_once '../../phplib/idiorm/paris.php';
+
+
+// Build the parallel label/value lists for the domain drop-down.
+$rows = CrawledPage::getListOfDomains();
+$options = array('all');
+$values = array('all');
+
+// end() returns false for an empty list, so the "most recent domain"
+// entry is only offered once at least one domain has been crawled.
+$last = end($rows);
+if ($last !== false) {
+    $options[] = 'most recent domain';
+    $values[] = $last->domain;
+}
+
+foreach ($rows as $obj) {
+    $options[] = $obj->domain;
+    $values[] = $obj->domain;
+}
+
+SmartyWrap::assign('page_title', 'Romanian Crawler Log');
+SmartyWrap::assign('values', $values);
+SmartyWrap::assign('options', $options);
+SmartyWrap::smartyDisplay('crawler/crawler.ihtml');
+
+?>
Added: wwwbase/ajax/fetchCrawlerStatus.php
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/ajax/fetchCrawlerStatus.php Sun Aug 18 00:16:16 2013 (r932)
@@ -0,0 +1,103 @@
+<?php
+/*
+ * Alin Ungureanu, 2013
+ * alyn.cti at gmail.com
+ */
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+require_once '../../phplib/db.php';
+require_once '../../phplib/idiorm/idiorm.php';
+require_once '../../phplib/idiorm/paris.php';
+
+
+/** Builds the HTML fragments served to the crawler status page. */
+class FetchCrawlerStatus {
+
+    /**
+     * Counts crawled pages per HTTP status code.
+     *
+     * @param string|null $domain only count URLs starting with http://$domain; null counts all
+     * @return string one 'status: <span>count</span><br>' fragment per status code
+     */
+    function getStatusPerHttpCode($domain = null) {
+        $filter = '';
+        $params = array();
+        if (isset($domain)) {
+            // The domain is caller-supplied, so bind it as a parameter instead of splicing it into the SQL.
+            $filter = ' where url like ?';
+            $params[] = "http://$domain%";
+        }
+
+        $sql = "Select httpStatus, count(httpStatus) as cnt from CrawledPage $filter Group by httpStatus;";
+        $rows = Model::factory('CrawledPage')->raw_query($sql, $params)->find_many();
+
+        $data = '';
+        foreach ($rows as $row) {
+            $data .= $row->httpStatus . ': <span class="httpStatusResult">' . $row->cnt . '</span><br>';
+        }
+
+        return $data;
+    }
+
+    /**
+     * Counts processed pages and discovered links.
+     *
+     * @param string|null $domain restrict both counts to this domain; null counts everything
+     * @return string two 'label: <span>count</span><br>' fragments
+     */
+    function getTotalNumber($domain = null) {
+        $pageFilter = '';
+        $linkFilter = '';
+        $pageParams = array();
+        $linkParams = array();
+        if (isset($domain)) {
+            $pageFilter = ' where url like ?';
+            $pageParams[] = "http://$domain%";
+            $linkFilter = ' where domain like ?';
+            $linkParams[] = $domain;
+        }
+
+        $pages = Model::factory('CrawledPage')->raw_query("Select count(*) as cnt from CrawledPage $pageFilter;", $pageParams)->find_one()->cnt;
+        $links = Model::factory('Link')->raw_query("Select count(*) as cnt from Link $linkFilter;", $linkParams)->find_one()->cnt;
+        return 'Total processed pages: <span class="totalProcessed">' . $pages . '</span><br>'
+            . 'Total links discovered: <span class="totalLinks">' . $links . '</span><br>';
+    }
+}
+
+
+// AJAX entry point. It reads two POST fields:
+//   method - 'fetch_total' or 'fetch_per_http_code'
+//   domain - a domain name, or 'all' to disable filtering
+// and prints nothing when either field is missing or the method is unknown.
+$hasMethod = isset($_POST['method']) && !empty($_POST['method']);
+$hasDomain = isset($_POST['domain']);
+
+if ($hasMethod && $hasDomain) {
+
+    $fetch = new FetchCrawlerStatus();
+
+    // 'all' becomes null, which both fetch methods take to mean
+    // "no domain filter"
+    $domain = null;
+    if ($_POST['domain'] != 'all') {
+        $domain = $_POST['domain'];
+    }
+
+    switch ($_POST['method']) {
+
+        case 'fetch_total':
+            echo $fetch->getTotalNumber($domain);
+            break;
+
+        case 'fetch_per_http_code':
+            echo $fetch->getStatusPerHttpCode($domain);
+            break;
+
+        default:
+            // unrecognised method: no output
+            break;
+    }
+
+}
+
+?>
\ No newline at end of file
Added: wwwbase/js/crawler_ajax.js
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/js/crawler_ajax.js Sun Aug 18 00:16:16 2013 (r932)
@@ -0,0 +1,60 @@
+var crawlerStatus = {}
+
+
+// Refreshes the "General Stats" panel with totals for the selected domain.
+crawlerStatus.fetchStatus = function() {
+
+    var selectedDomain = $('select[name=dropDown]').val();
+
+    $.ajax({
+        url: '../ajax/fetchCrawlerStatus.php',
+        type: 'post',
+        data: { method: 'fetch_total', domain: selectedDomain },
+        success: function(data) {
+            // the response is an HTML fragment; a valid "<br>" is appended
+            // after it ("</br>" is not a legal tag)
+            $('.total').html(data + '<br>');
+        }
+    });
+}
+
+// Refreshes the "HTTP Code Stats" panel for the selected domain and
+// rescales the logo to follow the height of the #info box.
+crawlerStatus.fetchHttpStatus = function() {
+    var request = {
+        method: 'fetch_per_http_code',
+        domain: $('select[name=dropDown]').val()
+    };
+
+    $.ajax({
+        url: '../ajax/fetchCrawlerStatus.php',
+        type: 'post',
+        data: request,
+        success: function(html) {
+            var $info = $('#info'), $logo = $('#logo');
+            $('.perHttpCode').html(html);
+            $logo.height($info.height() * 2/3);
+            $logo.css('margin-top', $info.height() * 2/7);
+            $logo.css('margin-left', $info.height() * 1/4);
+        }
+    });
+}
+
+
+// On page load: fetch both panels, poll every five seconds, and refetch
+// immediately whenever another domain is picked in the drop-down.
+$(document).ready(function() {
+
+    crawlerStatus.fetchStatus();
+    crawlerStatus.fetchHttpStatus();
+
+    setInterval(crawlerStatus.fetchStatus, 5000);
+    setInterval(crawlerStatus.fetchHttpStatus, 5000);
+
+    $('select[name=dropDown]').change(function() {
+        // the selected value is plain text, so it must not be parsed as HTML
+        $('.inSelection').text($(this).val());
+        crawlerStatus.fetchStatus();
+        crawlerStatus.fetchHttpStatus();
+    });
+});
\ No newline at end of file
Added: wwwbase/styles/crawler.css
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ wwwbase/styles/crawler.css Sun Aug 18 00:16:16 2013 (r932)
@@ -0,0 +1,101 @@
+/* Stats box holding the general and per-HTTP-code panels */
+#info {
+
+    position: relative;
+    display: inline-block;
+    float: left;
+    overflow: hidden;
+    margin-top: 30px;
+    margin-left: 30px;
+    padding: 10px;
+    width: 310px;
+    background-color: #ffe075;
+}
+
+/* Containers filled in by crawler_ajax.js */
+.total, .perHttpCode {
+
+    font-weight: 600;
+    font-size: 24px;
+    font-family: serif;
+    color: #3399FF;
+}
+
+.infoPanel {
+
+    position: relative;
+    display: inline-block;
+    margin-left: 10px;
+}
+
+/* Numbers inside the fragments returned by fetchCrawlerStatus.php */
+.totalProcessed, .totalLinks, .httpStatusResult {
+
+    color: #FF3300;
+    font-family: "Times New Roman";
+    font-weight: 900;
+    font-size: 26px;
+}
+
+.infoTitle, .domain {
+
+    position: relative;
+    text-align: center;
+    font-weight: 700;
+    font-family: arial;
+    font-size: 24px;
+    padding-top: 20px;
+    color: #8F0000;
+}
+
+.inSelection {
+
+    color: #6B4724;
+}
+
+.selectDomain {
+
+    text-align: left;
+    font-weight: 900;
+    font-family: serif;
+    font-size: 21px;
+    color: #8F0000;
+}
+
+#selectDomain {
+
+    margin-left: 30px;
+}
+
+#selectDomain select {
+
+    width: 172px;
+}
+
+#logo {
+
+    position: relative;
+    display: inline-block;
+    margin-top: 60px;
+    margin-left: 40px;
+}
+
+/* Devices up to 640px wide: fixed-size title image, auto side margins */
+@media (max-device-width: 640px) {
+
+
+    #crawlerTitle img {
+
+        width: 310px !important;
+        height: 50px !important;
+        margin: 0 auto !important;
+    }
+
+    #selectDomain, #info, #logo {
+
+        margin: 0 auto !important;
+        margin-top: 20px !important;
+    }
+}
+
+
More information about the Dev
mailing list