[dev] [commit] r932 - phplib/models templates/crawler wwwbase/Crawler wwwbase/ajax wwwbase/js wwwbase/styles

Cătălin Frâncu cata at francu.com
Mon Aug 19 09:52:33 EEST 2013


Arată bine.

- În AbstractCrawler, nu ai nevoie să incluzi decât util.php, care la 
rândul lui le include pe toate celelalte. Am avut o perioadă când eram 
tare grijuliu cu volumul de cod executat, dar mi-a trecut de când 
folosim Varnish. :-)

- Nu știam de html_options în Smarty. Foarte tare!

- ajax/fetchCrawlerStatus.php: nu mă omor după toate liniile alea goale. 
Aerisirea e bună, dar prefer să încapă tot codul pe un ecran, ca să-l 
pot vedea în ansamblu.

- tot acolo: poți folosi $domain = util_getRequestParameter('domain'); 
urmat de if ($domain == 'all'). E o mică rutină care face testul de 
isset(), ca să facă codul mai ușor de înțeles. Și caută direct în 
$_REQUEST, nu doar în $_POST, ca să nu mai trebuiască să ții minte dacă 
folosești POST sau GET în formular.

Cătălin

On 2013-08-18 00:16, automailer at dexonline.ro wrote:
> Author: alinu
> Date: Sun Aug 18 00:16:16 2013
> New Revision: 932
> 
> Log:
> 
> 
> Added:
>    phplib/models/CrawledPage.php
>    phplib/models/Link.php
>    templates/crawler/
>    templates/crawler/crawler.ihtml
>    wwwbase/Crawler/index.php
>    wwwbase/ajax/fetchCrawlerStatus.php
>    wwwbase/js/crawler_ajax.js
>    wwwbase/styles/crawler.css
> Modified:
>    wwwbase/Crawler/AbstractCrawler.php
>    wwwbase/Crawler/Crawler.php
>    wwwbase/Crawler/clean_all.php
> 
> Added: phplib/models/CrawledPage.php
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ phplib/models/CrawledPage.php	Sun Aug 18 00:16:16 2013	(r932)
> @@ -0,0 +1,41 @@
> +<?php
> +
> +
> +class CrawledPage  extends BaseObject implements DatedObject {
> +	//implements DatedObject {
> +
> +	public static $_table = 'CrawledPage';
> +
> +	//salveaza informatiile despre pagina curent crawl-ata in tabelul 
> CrawledPage
> +	public static function savePage2DB($url, $httpStatus, $rawPagePath,
> $parsedTextPath, $timestamp) {
> +
> +		try {
> +			$tableObj = Model::factory(self::$_table);
> +			$tableObj->create();
> +			$tableObj->timestamp = $timestamp;
> +			$tableObj->url = $url;
> +			$tableObj->httpStatus = $httpStatus;
> +			$tableObj->rawPagePath = $rawPagePath;
> +			$tableObj->parsedTextPath = $parsedTextPath;
> +			$tableObj->save();
> +
> +			return $tableObj->id;
> +		}
> +		catch(Exception $ex) {
> +
> +			logException($ex);
> +		}
> +
> +		return null;
> +	}
> +
> +	//intoarce o lista cu domeniile parsate
> +	public static function getListOfDomains() {
> +
> +		return Model::factory(self::$_table)->raw_query("select id,
> substr(substring_index(url, '/', 3),8) as domain from CrawledPage
> group by domain order by id asc;")->find_many();
> +	}
> +
> +
> +}
> +
> +?>
> \ No newline at end of file
> 
> Added: phplib/models/Link.php
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ phplib/models/Link.php	Sun Aug 18 00:16:16 2013	(r932)
> @@ -0,0 +1,40 @@
> +<?php
> +
> +
> +class Link extends BaseObject {
> +	//implements DatedObject {
> +
> +	public static $_table = 'Link';
> +
> +	//adauga o intrare nou in tabelul Link
> +	public static function saveLink2DB($canonicalUrl, $domain, $urlHash,
> $crawledPageId) {
> +
> +		//nu inseram acelasi link de 2 ori
> +		if (Model::factory(self::$_table)->where('canonicalUrl',
> $canonicalUrl)->find_one()) {
> +			return;
> +		}
> +
> +		try {
> +
> +			$tableObj = Model::factory(self::$_table);
> +			$tableObj->create();
> +			$tableObj->canonicalUrl = $canonicalUrl;
> +			$tableObj->domain = $domain;
> +			$tableObj->urlHash = $urlHash;
> +			$tableObj->crawledPageId = $crawledPageId;
> +			$tableObj->save();
> +
> +			return $tableObj->id;
> +		}
> +		catch(Exception $ex) {
> +
> +			logException($ex);
> +		}
> +
> +		return null;
> +	}
> +
> +
> +}
> +
> +?>
> \ No newline at end of file
> 
> Added: templates/crawler/crawler.ihtml
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ templates/crawler/crawler.ihtml	Sun Aug 18 00:16:16 2013	(r932)
> @@ -0,0 +1,53 @@
> +<html>
> +	<head>
> +
> +		<title>{$page_title}</title>
> +
> +		<script type="text/javascript"
> src='http://code.jquery.com/jquery-2.0.3.js'></script>
> +
> +		<script type="text/javascript" src='../js/crawler_ajax.js'></script>
> +
> +		<link rel="StyleSheet" type="text/css" 
> href="../styles/crawler.css"/>
> +
> +	</head>
> +	<body>
> +
> +
> +		<div id="crawlerTitle">
> +			<img src="../img/crawler/romanian_crawler_log.png">
> +		</div>
> +
> +
> +		<div id="selectDomain">
> +			<br>
> +			<span class="selectDomain">Select domain: </span>
> +
> +			<select name="dropDown">
> +				{html_options values=$values output=$options}
> +			</select>
> +
> +			<br>
> +		</div>
> +
> +		<div id="info">
> +				<span class="domain">
> +					<center>Showing: <span class="inSelection">all</span></center>
> +				<br>
> +				</span>
> +				<span class="infoTitle"><center>General Stats</center></span>
> +
> +				<div class="infoPanel">
> +					<span class="total"></span>
> +				</div>
> +
> +				<span class="infoTitle"><center>HTTP Code Stats</center></span>
> +
> +			<div class="infoPanel">
> +				<span class="perHttpCode"></span>
> +			</div>
> +		</div>
> +
> +		<img id="logo" src="../img/logo-dexonline-2.png"/>
> +
> +	</body>
> +</html>
> 
> Modified: wwwbase/Crawler/AbstractCrawler.php
> ==============================================================================
> --- wwwbase/Crawler/AbstractCrawler.php	Sat Aug 17 19:55:07 2013	(r931)
> +++ wwwbase/Crawler/AbstractCrawler.php	Sun Aug 18 00:16:16 2013	(r932)
> @@ -7,7 +7,7 @@
>  require_once '../../phplib/serverPreferences.php';
>  require_once '../../phplib/db.php';
>  require_once '../../phplib/idiorm/idiorm.php';
> -
> +require_once '../../phplib/idiorm/paris.php';
> 
>  require_once 'AppLog.php';
> 
> @@ -259,53 +259,8 @@
> 
>  		$domain = $this->getDomain($url);
> 
> -		$this->saveLink2DB($canonicalUrl, $domain, $urlHash, 
> $this->currentPageId);
> -	}
> -
> -	//adauga o intrare nou in tabelul Link
> -	function saveLink2DB($canonicalUrl, $domain, $urlHash, 
> $crawledPageId) {
> -
> -		//nu inseram acelasi link de 2 ori
> -		if (ORM::for_table('Link')->where('canonicalUrl',
> $canonicalUrl)->find_one()) {
> -			return;
> -		}
> -
> -		try {
> -			$tableObj = ORM::for_table("Link");
> -			$tableObj->create();
> -			$tableObj->canonicalUrl = $canonicalUrl;
> -			$tableObj->domain = $domain;
> -			$tableObj->urlHash = $urlHash;
> -			$tableObj->crawledPageId = $crawledPageId;
> -			$tableObj->save();
> -		}
> -		catch(Exception $ex) {
> -
> -			logException($ex);
> -		}
> +		Link::saveLink2DB($canonicalUrl, $domain, $urlHash, 
> $this->currentPageId);
>  	}
> -	//salveaza informatiile despre pagina curent crawl-ata in tabelul 
> CrawledPage
> -	function savePage2DB($url, $httpStatus, $rawPagePath, 
> $parsedTextPath) {
> -
> -		try {
> -			$tableObj = ORM::for_table("CrawledPage");
> -			$tableObj->create();
> -			$tableObj->timestamp = $this->currentTimestamp;
> -			$tableObj->url = $url;
> -			$tableObj->httpStatus = ''.$this->info["http_code"];
> -			$tableObj->rawPagePath = $rawPagePath;
> -			$tableObj->parsedTextPath = $parsedTextPath;
> -			$tableObj->save();
> -
> -			$this->currentPageId =
> ORM::for_table('CrawledPage')->order_by_desc('id')->find_one()->id;
> -
> -		}
> -		catch(Exception $ex) {
> -
> -			logException($ex);
> -		}
> -	}
> -
> 
>  	function isRelativeLink($url) {
> 
> 
> Modified: wwwbase/Crawler/Crawler.php
> ==============================================================================
> --- wwwbase/Crawler/Crawler.php	Sat Aug 17 19:55:07 2013	(r931)
> +++ wwwbase/Crawler/Crawler.php	Sun Aug 18 00:16:16 2013	(r932)
> @@ -62,7 +62,7 @@
>  			$this->setStorePageParams();
> 
>  			//salveaza o intrare despre pagina curenta in baza de date
> -			$this->savePage2DB($this->currentUrl, $this->httpResponse(),
> $this->rawPagePath, $this->parsedTextPath);
> +			$this->currentPageId = CrawledPage::savePage2DB($this->currentUrl,
> $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath,
> $this->currentTimestamp);
> 
>  			//daca pagina nu e in format html (e imagine sau alt fisier)
>  			//sau daca am primit un cod HTTP de eroare, sarim peste pagina 
> acesta
> @@ -102,7 +102,7 @@
>  if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
> 
>  	$obj = new Crawler();
> -	$obj->startCrawling("http://wiki.dexonline.ro/");
> -	//$obj->startCrawling("http://www.romlit.ro");
> +	//$obj->startCrawling("http://wiki.dexonline.ro/");
> +	$obj->startCrawling("http://www.romlit.ro");
>  }
>  ?>
> \ No newline at end of file
> 
> Modified: wwwbase/Crawler/clean_all.php
> ==============================================================================
> --- wwwbase/Crawler/clean_all.php	Sat Aug 17 19:55:07 2013	(r931)
> +++ wwwbase/Crawler/clean_all.php	Sun Aug 18 00:16:16 2013	(r932)
> @@ -13,10 +13,7 @@
> 
>  	exec("rm -rf $regexPath");
>  }
> -$user_ag =  pref_getSectionPreference('crawler',
> 'user_agent_location').pref_getSectionPreference('crawler',
> 'new_line');
> -echo file_get_contents($user_ag);
> 
> -/*
>  try {
> 
>  	//sterge toate fisierele salvate
> 
> Added: wwwbase/Crawler/index.php
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ wwwbase/Crawler/index.php	Sun Aug 18 00:16:16 2013	(r932)
> @@ -0,0 +1,33 @@
> +<?php
> +
> +require_once '../../phplib/util.php';
> +require_once '../../phplib/serverPreferences.php';
> +require_once '../../phplib/db.php';
> +require_once '../../phplib/idiorm/idiorm.php';
> +require_once '../../phplib/idiorm/paris.php';
> +
> +
> +$rows = CrawledPage::getListOfDomains();
> +
> +
> +$options = array('all', 'most recent domain');
> +$last = end($rows);
> +$values = array('all', $last->domain);
> +
> +
> +foreach($rows as $obj) {
> +
> +	array_push($options,$obj->domain);
> +	array_push($values,$obj->domain);
> +}
> +
> +//var_dump($options);
> +
> +SmartyWrap::assign('page_title', 'Romanian Crawler Log');
> +
> +SmartyWrap::assign('values', $values);
> +SmartyWrap::assign('options', $options);
> +
> +SmartyWrap::smartyDisplay('crawler/crawler.ihtml');
> +
> +?>
> 
> Added: wwwbase/ajax/fetchCrawlerStatus.php
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ wwwbase/ajax/fetchCrawlerStatus.php	Sun Aug 18 00:16:16 2013	(r932)
> @@ -0,0 +1,103 @@
> +<?php
> +/*
> + * Alin Ungureanu, 2013
> + * alyn.cti at gmail.com
> + */
> +require_once '../../phplib/util.php';
> +require_once '../../phplib/serverPreferences.php';
> +require_once '../../phplib/db.php';
> +require_once '../../phplib/idiorm/idiorm.php';
> +require_once '../../phplib/idiorm/paris.php';
> +
> +
> +class FetchCrawlerStatus {
> +
> +
> +	function getStatusPerHttpCode($domain = null) {
> +
> +		$crawledPage = Model::factory('CrawledPage');
> +		$link = Model::factory('Link')->create();
> +
> +		$data = '';
> +		$crawledPageFilter = '';
> +
> +		if (isset($domain)) {
> +
> +			$crawledPageFilter = " where url like 'http://$domain%'";
> +		}
> +
> +		$rows = $crawledPage->raw_query("Select httpStatus,
> count(httpStatus) as cnt from CrawledPage $crawledPageFilter Group by
> httpStatus;")->find_many();
> +
> +		foreach($rows as $row) {
> +
> +			$data .= $row->httpStatus . ': <span class="httpStatusResult">' .
> $row->cnt .'</span><br>';
> +		}
> +
> +		return $data;
> +	}
> +
> +	function getTotalNumber($domain = null) {
> +
> +
> +		$crawledPage = Model::factory('CrawledPage');
> +		$link = Model::factory('Link');
> +
> +		$data = '';
> +		$linkFilter = '';
> +		$crawledPageFilter = '';
> +
> +		if (isset($domain)) {
> +
> +			$linkFilter = " where domain like '$domain'";
> +			$crawledPageFilter = " where url like 'http://$domain%'";
> +
> +		}
> +
> +		$row = $crawledPage->raw_query("Select count(*) as cnt from
> CrawledPage $crawledPageFilter;")->find_one()->cnt;
> +		$data .= 'Total processed pages: <span class="totalProcessed">'.
> $row . '</span><br>';
> +
> +		$row = $link->raw_query("Select count(*) as cnt from Link
> $linkFilter;")->find_one()->cnt;
> +		$data .= 'Total links discovered: <span class="totalLinks">'. $row
> . '</span><br>';
> +
> +		return $data;
> +	}
> +
> +}
> +
> +
> +if (isset($_POST['method']) && !empty($_POST['method'])) {
> +
> +
> +	$fetch = new FetchCrawlerStatus();
> +
> +
> +
> +
> +
> +	if (isset($_POST['domain']) && $_POST['domain'] != 'all') {
> +
> +		if ($_POST['method'] == 'fetch_total') {
> +
> +			echo $fetch->getTotalNumber($_POST['domain']);
> +		}
> +
> +		else if ($_POST['method'] == 'fetch_per_http_code') {
> +
> +			echo $fetch->getStatusPerHttpCode($_POST['domain']);
> +		}
> +	}
> +	else if (isset($_POST['domain']) && $_POST['domain'] == 'all') {
> +
> +		if ($_POST['method'] == 'fetch_total') {
> +
> +			echo $fetch->getTotalNumber();
> +		}
> +		else if ($_POST['method'] == 'fetch_per_http_code') {
> +
> +			echo $fetch->getStatusPerHttpCode();
> +
> +		}
> +	}
> +}
> +
> +?>
> \ No newline at end of file
> 
> Added: wwwbase/js/crawler_ajax.js
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ wwwbase/js/crawler_ajax.js	Sun Aug 18 00:16:16 2013	(r932)
> @@ -0,0 +1,60 @@
> +var crawlerStatus = {}
> +
> +
> +crawlerStatus.fetchStatus = function() {
> +
> +	var selectedDomain = $('select[name=dropDown]').val();
> +
> +	$.ajax({
> +
> +		url: '../ajax/fetchCrawlerStatus.php',
> +		type: 'post',
> +		data: { method: 'fetch_total', domain: selectedDomain },
> +		success: function(data) {
> +			//face refresh la informatiile generale
> +			$('.total').html(data + '</br>');
> +
> +		}
> +	});
> +}
> +
> +crawlerStatus.fetchHttpStatus = function() {
> +
> +	var selectedDomain = $('select[name=dropDown]').val();
> +
> +	$.ajax({
> +
> +		url: '../ajax/fetchCrawlerStatus.php',
> +		type: 'post',
> +		data: { method: 'fetch_per_http_code', domain: selectedDomain },
> +		success: function(data) {
> +			//face refresh la HTTP status
> +			$('.perHttpCode').html(data);
> +
> +			//schimba automat dimensiunea logo-ului
> +			$('#logo').height($('#info').height() * 2/3);
> +			$('#logo').css('margin-top', $('#info').height() * 2/7);
> +			$('#logo').css('margin-left', $('#info').height() * 1/4);
> +		}
> +
> +	});
> +}
> +
> +
> +$(document).ready(function() {
> +
> +
> +	crawlerStatus.fetchStatus();
> +	crawlerStatus.fetchHttpStatus();
> +
> +	setInterval(crawlerStatus.fetchStatus, 5000);
> +	setInterval(crawlerStatus.fetchHttpStatus, 5000);
> +
> +	$('select[name=dropDown]').change(function() {
> +
> +
> +		$('.inSelection').html($('select[name=dropDown]').val());
> +		crawlerStatus.fetchStatus();
> +		crawlerStatus.fetchHttpStatus();
> +	});
> +});
> \ No newline at end of file
> 
> Added: wwwbase/styles/crawler.css
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ wwwbase/styles/crawler.css	Sun Aug 18 00:16:16 2013	(r932)
> @@ -0,0 +1,101 @@
> +#info {
> +
> +	overflow: hidden;
> +}
> +
> +#info {
> +	position: relative;
> +	display: inline-block;
> +	float: left;
> +	margin-top: 30px;
> +	background-color: #ffe075;
> +	width: 310px;
> +	margin-left: 30px;
> +	padding: 10px;
> +}
> +.total, .perHttpCode {
> +
> +	font-weight: 600;
> +	font-size: 24;
> +	font-family: serif;
> +	color: #3399FF;
> +}
> +
> +.infoPanel {
> +
> +	position: relative;
> +	display: inline-block;
> +	margin-left: 10px;
> +}
> +
> +.totalProcessed, .totalLinks, .httpStatusResult {
> +
> +	color: #FF3300;
> +	font-family: "Times New Roman";
> +	font-weight: 900;
> +	font-size: 26;
> +}
> +
> +.infoTitle, .domain {
> +
> +	position: relative;
> +	text-allign: center;
> +	font-weight: 700;
> +	font-family: arial;
> +	font-size: 24;
> +	padding-top: 20px;
> +	color: #8F0000;
> +}
> +
> +.inSelection {
> +
> +	color: #6B4724;
> +}
> +
> +
> +.selectDomain {
> +
> +	text-allign: left;
> +	font-weight: 900;
> +	font-family: serif;
> +	font-size: 21;
> +	color: #8F0000;
> +}
> +
> +#selectDomain {
> +
> +	margin-left: 30px;
> +}
> +
> +#selectDomain select {
> +
> +	width: 172px;
> +}
> +
> +#logo {
> +
> +	position: relative;
> +	display: inline-block;
> +	margin-top: 60px;
> +	margin-left: 40px;
> +}
> +
> +
> +@media (max-device-width: 640px) {
> +
> +
> +	#crawlerTitle img {
> +
> +		width: 310px !important;
> +		height: 50px !important;
> +		margin: 0 auto !important;
> +	}
> +
> +	#selectDomain, #info, #logo {
> +
> +		margin: 0 auto !important;
> +		margin-top: 20px !important;
> +	}
> +}
> +
> +
> _______________________________________________
> Dev mailing list
> Dev at dexonline.ro
> http://list.dexonline.ro/listinfo/dev


More information about the Dev mailing list