[dev] [commit] r932 - phplib/models templates/crawler wwwbase/Crawler wwwbase/ajax wwwbase/js wwwbase/styles
Cătălin Frâncu
cata at francu.com
Mon Aug 19 09:52:33 EEST 2013
Arată bine.
- În AbstractCrawler, nu ai nevoie să incluzi decât util.php, care la
rândul lui le include pe toate celelalte. Am avut o perioadă când eram
tare grijuliu cu volumul de cod executat, dar mi-a trecut de când
folosim Varnish. :-)
- Nu știam de html_options în Smarty. Foarte tare!
- ajax/fetchCrawlerStatus.php: nu mă omor după toate liniile alea goale.
Aerisirea e bună, dar prefer să încapă tot codul pe un ecran, ca să-l
pot vedea în ansamblu.
- tot acolo: poți folosi $domain = util_getRequestParameter('domain');
urmat de if ($domain == 'all'). E o mică rutină care face testul de
isset(), ca să facă codul mai ușor de înțeles. Și caută direct în
$_REQUEST, nu doar în $_POST, ca să nu mai trebuiască să ții minte dacă
folosești POST sau GET în formular.
Cătălin
On 2013-08-18 00:16, automailer at dexonline.ro wrote:
> Author: alinu
> Date: Sun Aug 18 00:16:16 2013
> New Revision: 932
>
> Log:
>
>
> Added:
> phplib/models/CrawledPage.php
> phplib/models/Link.php
> templates/crawler/
> templates/crawler/crawler.ihtml
> wwwbase/Crawler/index.php
> wwwbase/ajax/fetchCrawlerStatus.php
> wwwbase/js/crawler_ajax.js
> wwwbase/styles/crawler.css
> Modified:
> wwwbase/Crawler/AbstractCrawler.php
> wwwbase/Crawler/Crawler.php
> wwwbase/Crawler/clean_all.php
>
> Added: phplib/models/CrawledPage.php
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ phplib/models/CrawledPage.php Sun Aug 18 00:16:16 2013 (r932)
> @@ -0,0 +1,41 @@
> +<?php
> +
> +
> +class CrawledPage extends BaseObject implements DatedObject {
> + //implements DatedObject {
> +
> + public static $_table = 'CrawledPage';
> +
> + //salveaza informatiile despre pagina curent crawl-ata in tabelul
> CrawledPage
> + public static function savePage2DB($url, $httpStatus, $rawPagePath,
> $parsedTextPath, $timestamp) {
> +
> + try {
> + $tableObj = Model::factory(self::$_table);
> + $tableObj->create();
> + $tableObj->timestamp = $timestamp;
> + $tableObj->url = $url;
> + $tableObj->httpStatus = $httpStatus;
> + $tableObj->rawPagePath = $rawPagePath;
> + $tableObj->parsedTextPath = $parsedTextPath;
> + $tableObj->save();
> +
> + return $tableObj->id;
> + }
> + catch(Exception $ex) {
> +
> + logException($ex);
> + }
> +
> + return null;
> + }
> +
> + //intoarce o lista cu domeniile parsate
> + public static function getListOfDomains() {
> +
> + return Model::factory(self::$_table)->raw_query("select id,
> substr(substring_index(url, '/', 3),8) as domain from CrawledPage
> group by domain order by id asc;")->find_many();
> + }
> +
> +
> +}
> +
> +?>
> \ No newline at end of file
>
> Added: phplib/models/Link.php
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ phplib/models/Link.php Sun Aug 18 00:16:16 2013 (r932)
> @@ -0,0 +1,40 @@
> +<?php
> +
> +
> +class Link extends BaseObject {
> + //implements DatedObject {
> +
> + public static $_table = 'Link';
> +
> + //adauga o intrare nou in tabelul Link
> + public static function saveLink2DB($canonicalUrl, $domain, $urlHash,
> $crawledPageId) {
> +
> + //nu inseram acelasi link de 2 ori
> + if (Model::factory(self::$_table)->where('canonicalUrl',
> $canonicalUrl)->find_one()) {
> + return;
> + }
> +
> + try {
> +
> + $tableObj = Model::factory(self::$_table);
> + $tableObj->create();
> + $tableObj->canonicalUrl = $canonicalUrl;
> + $tableObj->domain = $domain;
> + $tableObj->urlHash = $urlHash;
> + $tableObj->crawledPageId = $crawledPageId;
> + $tableObj->save();
> +
> + return $tableObj->id;
> + }
> + catch(Exception $ex) {
> +
> + logException($ex);
> + }
> +
> + return null;
> + }
> +
> +
> +}
> +
> +?>
> \ No newline at end of file
>
> Added: templates/crawler/crawler.ihtml
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ templates/crawler/crawler.ihtml Sun Aug 18 00:16:16 2013 (r932)
> @@ -0,0 +1,53 @@
> +<html>
> + <head>
> +
> + <title>{$page_title}</title>
> +
> + <script type="text/javascript"
> src='http://code.jquery.com/jquery-2.0.3.js'></script>
> +
> + <script type="text/javascript" src='../js/crawler_ajax.js'></script>
> +
> + <link rel="StyleSheet" type="text/css"
> href="../styles/crawler.css"/>
> +
> + </head>
> + <body>
> +
> +
> + <div id="crawlerTitle">
> + <img src="../img/crawler/romanian_crawler_log.png">
> + </div>
> +
> +
> + <div id="selectDomain">
> + <br>
> + <span class="selectDomain">Select domain: </span>
> +
> + <select name="dropDown">
> + {html_options values=$values output=$options}
> + </select>
> +
> + <br>
> + </div>
> +
> + <div id="info">
> + <span class="domain">
> + <center>Showing: <span class="inSelection">all</span></center>
> + <br>
> + </span>
> + <span class="infoTitle"><center>General Stats</center></span>
> +
> + <div class="infoPanel">
> + <span class="total"></span>
> + </div>
> +
> + <span class="infoTitle"><center>HTTP Code Stats</center></span>
> +
> + <div class="infoPanel">
> + <span class="perHttpCode"></span>
> + </div>
> + </div>
> +
> + <img id="logo" src="../img/logo-dexonline-2.png"/>
> +
> + </body>
> +</html>
>
> Modified: wwwbase/Crawler/AbstractCrawler.php
> ==============================================================================
> --- wwwbase/Crawler/AbstractCrawler.php Sat Aug 17 19:55:07 2013 (r931)
> +++ wwwbase/Crawler/AbstractCrawler.php Sun Aug 18 00:16:16 2013 (r932)
> @@ -7,7 +7,7 @@
> require_once '../../phplib/serverPreferences.php';
> require_once '../../phplib/db.php';
> require_once '../../phplib/idiorm/idiorm.php';
> -
> +require_once '../../phplib/idiorm/paris.php';
>
> require_once 'AppLog.php';
>
> @@ -259,53 +259,8 @@
>
> $domain = $this->getDomain($url);
>
> - $this->saveLink2DB($canonicalUrl, $domain, $urlHash,
> $this->currentPageId);
> - }
> -
> - //adauga o intrare nou in tabelul Link
> - function saveLink2DB($canonicalUrl, $domain, $urlHash,
> $crawledPageId) {
> -
> - //nu inseram acelasi link de 2 ori
> - if (ORM::for_table('Link')->where('canonicalUrl',
> $canonicalUrl)->find_one()) {
> - return;
> - }
> -
> - try {
> - $tableObj = ORM::for_table("Link");
> - $tableObj->create();
> - $tableObj->canonicalUrl = $canonicalUrl;
> - $tableObj->domain = $domain;
> - $tableObj->urlHash = $urlHash;
> - $tableObj->crawledPageId = $crawledPageId;
> - $tableObj->save();
> - }
> - catch(Exception $ex) {
> -
> - logException($ex);
> - }
> + Link::saveLink2DB($canonicalUrl, $domain, $urlHash,
> $this->currentPageId);
> }
> - //salveaza informatiile despre pagina curent crawl-ata in tabelul
> CrawledPage
> - function savePage2DB($url, $httpStatus, $rawPagePath,
> $parsedTextPath) {
> -
> - try {
> - $tableObj = ORM::for_table("CrawledPage");
> - $tableObj->create();
> - $tableObj->timestamp = $this->currentTimestamp;
> - $tableObj->url = $url;
> - $tableObj->httpStatus = ''.$this->info["http_code"];
> - $tableObj->rawPagePath = $rawPagePath;
> - $tableObj->parsedTextPath = $parsedTextPath;
> - $tableObj->save();
> -
> - $this->currentPageId =
> ORM::for_table('CrawledPage')->order_by_desc('id')->find_one()->id;
> -
> - }
> - catch(Exception $ex) {
> -
> - logException($ex);
> - }
> - }
> -
>
> function isRelativeLink($url) {
>
>
> Modified: wwwbase/Crawler/Crawler.php
> ==============================================================================
> --- wwwbase/Crawler/Crawler.php Sat Aug 17 19:55:07 2013 (r931)
> +++ wwwbase/Crawler/Crawler.php Sun Aug 18 00:16:16 2013 (r932)
> @@ -62,7 +62,7 @@
> $this->setStorePageParams();
>
> //salveaza o intrare despre pagina curenta in baza de date
> - $this->savePage2DB($this->currentUrl, $this->httpResponse(),
> $this->rawPagePath, $this->parsedTextPath);
> + $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl,
> $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath,
> $this->currentTimestamp);
>
> //daca pagina nu e in format html (e imagine sau alt fisier)
> //sau daca am primit un cod HTTP de eroare, sarim peste pagina
> acesta
> @@ -102,7 +102,7 @@
> if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
>
> $obj = new Crawler();
> - $obj->startCrawling("http://wiki.dexonline.ro/");
> - //$obj->startCrawling("http://www.romlit.ro");
> + //$obj->startCrawling("http://wiki.dexonline.ro/");
> + $obj->startCrawling("http://www.romlit.ro");
> }
> ?>
> \ No newline at end of file
>
> Modified: wwwbase/Crawler/clean_all.php
> ==============================================================================
> --- wwwbase/Crawler/clean_all.php Sat Aug 17 19:55:07 2013 (r931)
> +++ wwwbase/Crawler/clean_all.php Sun Aug 18 00:16:16 2013 (r932)
> @@ -13,10 +13,7 @@
>
> exec("rm -rf $regexPath");
> }
> -$user_ag = pref_getSectionPreference('crawler',
> 'user_agent_location').pref_getSectionPreference('crawler',
> 'new_line');
> -echo file_get_contents($user_ag);
>
> -/*
> try {
>
> //sterge toate fisierele salvate
>
> Added: wwwbase/Crawler/index.php
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ wwwbase/Crawler/index.php Sun Aug 18 00:16:16 2013 (r932)
> @@ -0,0 +1,33 @@
> +<?php
> +
> +require_once '../../phplib/util.php';
> +require_once '../../phplib/serverPreferences.php';
> +require_once '../../phplib/db.php';
> +require_once '../../phplib/idiorm/idiorm.php';
> +require_once '../../phplib/idiorm/paris.php';
> +
> +
> +$rows = CrawledPage::getListOfDomains();
> +
> +
> +$options = array('all', 'most recent domain');
> +$last = end($rows);
> +$values = array('all', $last->domain);
> +
> +
> +foreach($rows as $obj) {
> +
> + array_push($options,$obj->domain);
> + array_push($values,$obj->domain);
> +}
> +
> +//var_dump($options);
> +
> +SmartyWrap::assign('page_title', 'Romanian Crawler Log');
> +
> +SmartyWrap::assign('values', $values);
> +SmartyWrap::assign('options', $options);
> +
> +SmartyWrap::smartyDisplay('crawler/crawler.ihtml');
> +
> +?>
>
> Added: wwwbase/ajax/fetchCrawlerStatus.php
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ wwwbase/ajax/fetchCrawlerStatus.php Sun Aug 18 00:16:16 2013 (r932)
> @@ -0,0 +1,103 @@
> +<?php
> +/*
> + * Alin Ungureanu, 2013
> + * alyn.cti at gmail.com
> + */
> +require_once '../../phplib/util.php';
> +require_once '../../phplib/serverPreferences.php';
> +require_once '../../phplib/db.php';
> +require_once '../../phplib/idiorm/idiorm.php';
> +require_once '../../phplib/idiorm/paris.php';
> +
> +
> +class FetchCrawlerStatus {
> +
> +
> + function getStatusPerHttpCode($domain = null) {
> +
> + $crawledPage = Model::factory('CrawledPage');
> + $link = Model::factory('Link')->create();
> +
> + $data = '';
> + $crawledPageFilter = '';
> +
> + if (isset($domain)) {
> +
> + $crawledPageFilter = " where url like 'http://$domain%'";
> + }
> +
> + $rows = $crawledPage->raw_query("Select httpStatus,
> count(httpStatus) as cnt from CrawledPage $crawledPageFilter Group by
> httpStatus;")->find_many();
> +
> + foreach($rows as $row) {
> +
> + $data .= $row->httpStatus . ': <span class="httpStatusResult">' .
> $row->cnt .'</span><br>';
> + }
> +
> + return $data;
> + }
> +
> + function getTotalNumber($domain = null) {
> +
> +
> + $crawledPage = Model::factory('CrawledPage');
> + $link = Model::factory('Link');
> +
> + $data = '';
> + $linkFilter = '';
> + $crawledPageFilter = '';
> +
> + if (isset($domain)) {
> +
> + $linkFilter = " where domain like '$domain'";
> + $crawledPageFilter = " where url like 'http://$domain%'";
> +
> + }
> +
> + $row = $crawledPage->raw_query("Select count(*) as cnt from
> CrawledPage $crawledPageFilter;")->find_one()->cnt;
> + $data .= 'Total processed pages: <span class="totalProcessed">'.
> $row . '</span><br>';
> +
> + $row = $link->raw_query("Select count(*) as cnt from Link
> $linkFilter;")->find_one()->cnt;
> + $data .= 'Total links discovered: <span class="totalLinks">'. $row
> . '</span><br>';
> +
> + return $data;
> + }
> +
> +}
> +
> +
> +if (isset($_POST['method']) && !empty($_POST['method'])) {
> +
> +
> + $fetch = new FetchCrawlerStatus();
> +
> +
> +
> +
> +
> + if (isset($_POST['domain']) && $_POST['domain'] != 'all') {
> +
> + if ($_POST['method'] == 'fetch_total') {
> +
> + echo $fetch->getTotalNumber($_POST['domain']);
> + }
> +
> + else if ($_POST['method'] == 'fetch_per_http_code') {
> +
> + echo $fetch->getStatusPerHttpCode($_POST['domain']);
> + }
> + }
> + else if (isset($_POST['domain']) && $_POST['domain'] == 'all') {
> +
> + if ($_POST['method'] == 'fetch_total') {
> +
> + echo $fetch->getTotalNumber();
> + }
> + else if ($_POST['method'] == 'fetch_per_http_code') {
> +
> + echo $fetch->getStatusPerHttpCode();
> +
> + }
> + }
> +}
> +
> +?>
> \ No newline at end of file
>
> Added: wwwbase/js/crawler_ajax.js
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ wwwbase/js/crawler_ajax.js Sun Aug 18 00:16:16 2013 (r932)
> @@ -0,0 +1,60 @@
> +var crawlerStatus = {}
> +
> +
> +crawlerStatus.fetchStatus = function() {
> +
> + var selectedDomain = $('select[name=dropDown]').val();
> +
> + $.ajax({
> +
> + url: '../ajax/fetchCrawlerStatus.php',
> + type: 'post',
> + data: { method: 'fetch_total', domain: selectedDomain },
> + success: function(data) {
> + //face refresh la informatiile generale
> + $('.total').html(data + '</br>');
> +
> + }
> + });
> +}
> +
> +crawlerStatus.fetchHttpStatus = function() {
> +
> + var selectedDomain = $('select[name=dropDown]').val();
> +
> + $.ajax({
> +
> + url: '../ajax/fetchCrawlerStatus.php',
> + type: 'post',
> + data: { method: 'fetch_per_http_code', domain: selectedDomain },
> + success: function(data) {
> + //face refresh la HTTP status
> + $('.perHttpCode').html(data);
> +
> + //schimba automat dimensiunea logo-ului
> + $('#logo').height($('#info').height() * 2/3);
> + $('#logo').css('margin-top', $('#info').height() * 2/7);
> + $('#logo').css('margin-left', $('#info').height() * 1/4);
> + }
> +
> + });
> +}
> +
> +
> +$(document).ready(function() {
> +
> +
> + crawlerStatus.fetchStatus();
> + crawlerStatus.fetchHttpStatus();
> +
> + setInterval(crawlerStatus.fetchStatus, 5000);
> + setInterval(crawlerStatus.fetchHttpStatus, 5000);
> +
> + $('select[name=dropDown]').change(function() {
> +
> +
> + $('.inSelection').html($('select[name=dropDown]').val());
> + crawlerStatus.fetchStatus();
> + crawlerStatus.fetchHttpStatus();
> + });
> +});
> \ No newline at end of file
>
> Added: wwwbase/styles/crawler.css
> ==============================================================================
> --- /dev/null 00:00:00 1970 (empty, because file is newly added)
> +++ wwwbase/styles/crawler.css Sun Aug 18 00:16:16 2013 (r932)
> @@ -0,0 +1,101 @@
> +#info {
> +
> + overflow: hidden;
> +}
> +
> +#info {
> + position: relative;
> + display: inline-block;
> + float: left;
> + margin-top: 30px;
> + background-color: #ffe075;
> + width: 310px;
> + margin-left: 30px;
> + padding: 10px;
> +}
> +.total, .perHttpCode {
> +
> + font-weight: 600;
> + font-size: 24;
> + font-family: serif;
> + color: #3399FF;
> +}
> +
> +.infoPanel {
> +
> + position: relative;
> + display: inline-block;
> + margin-left: 10px;
> +}
> +
> +.totalProcessed, .totalLinks, .httpStatusResult {
> +
> + color: #FF3300;
> + font-family: "Times New Roman";
> + font-weight: 900;
> + font-size: 26;
> +}
> +
> +.infoTitle, .domain {
> +
> + position: relative;
> + text-allign: center;
> + font-weight: 700;
> + font-family: arial;
> + font-size: 24;
> + padding-top: 20px;
> + color: #8F0000;
> +}
> +
> +.inSelection {
> +
> + color: #6B4724;
> +}
> +
> +
> +.selectDomain {
> +
> + text-allign: left;
> + font-weight: 900;
> + font-family: serif;
> + font-size: 21;
> + color: #8F0000;
> +}
> +
> +#selectDomain {
> +
> + margin-left: 30px;
> +}
> +
> +#selectDomain select {
> +
> + width: 172px;
> +}
> +
> +#logo {
> +
> + position: relative;
> + display: inline-block;
> + margin-top: 60px;
> + margin-left: 40px;
> +}
> +
> +
> +@media (max-device-width: 640px) {
> +
> +
> + #crawlerTitle img {
> +
> + width: 310px !important;
> + height: 50px !important;
> + margin: 0 auto !important;
> + }
> +
> + #selectDomain, #info, #logo {
> +
> + margin: 0 auto !important;
> + margin-top: 20px !important;
> + }
> +}
> +
> +
> _______________________________________________
> Dev mailing list
> Dev at dexonline.ro
> http://list.dexonline.ro/listinfo/dev
More information about the Dev
mailing list