[dev] [commit] r932 - phplib/models templates/crawler wwwbase/Crawler wwwbase/ajax wwwbase/js wwwbase/styles

automailer at dexonline.ro automailer at dexonline.ro
Sun Aug 18 00:16:16 EEST 2013


Author: alinu
Date: Sun Aug 18 00:16:16 2013
New Revision: 932

Log:


Added:
   phplib/models/CrawledPage.php
   phplib/models/Link.php
   templates/crawler/
   templates/crawler/crawler.ihtml
   wwwbase/Crawler/index.php
   wwwbase/ajax/fetchCrawlerStatus.php
   wwwbase/js/crawler_ajax.js
   wwwbase/styles/crawler.css
Modified:
   wwwbase/Crawler/AbstractCrawler.php
   wwwbase/Crawler/Crawler.php
   wwwbase/Crawler/clean_all.php

Added: phplib/models/CrawledPage.php
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ phplib/models/CrawledPage.php	Sun Aug 18 00:16:16 2013	(r932)
@@ -0,0 +1,41 @@
+<?php
+
+
+class CrawledPage extends BaseObject implements DatedObject {
+
+	public static $_table = 'CrawledPage';
+
+	/**
+	 * Stores one CrawledPage row describing the page that was just crawled.
+	 * Returns the new row's id, or null when saving threw (the exception is logged).
+	 */
+	public static function savePage2DB($url, $httpStatus, $rawPagePath, $parsedTextPath, $timestamp) {
+		try {
+			$tableObj = Model::factory(self::$_table);
+			$tableObj->create();
+			$tableObj->timestamp = $timestamp;
+			$tableObj->url = $url;
+			$tableObj->httpStatus = $httpStatus;
+			$tableObj->rawPagePath = $rawPagePath;
+			$tableObj->parsedTextPath = $parsedTextPath;
+			$tableObj->save();
+
+			return $tableObj->id;
+		}
+		catch(Exception $ex) {
+			logException($ex);
+		}
+		return null;
+	}
+
+	/**
+	 * Lists every crawled domain once, ordered by the id of its first page.
+	 * The host is the text between "//" and the next "/", so http and https
+	 * URLs are both handled; min(id) keeps the query valid under ONLY_FULL_GROUP_BY.
+	 */
+	public static function getListOfDomains() {
+		return Model::factory(self::$_table)->raw_query("select min(id) as id, substring_index(substring_index(url, '/', 3), '/', -1) as domain from CrawledPage group by domain order by min(id) asc;")->find_many();
+	}
+}
+
+?>
\ No newline at end of file

Added: phplib/models/Link.php
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ phplib/models/Link.php	Sun Aug 18 00:16:16 2013	(r932)
@@ -0,0 +1,40 @@
+<?php
+
+
+class Link extends BaseObject {
+
+	public static $_table = 'Link';
+
+	/**
+	 * Adds a row to the Link table unless its canonicalUrl is already stored.
+	 * Returns the new id, or null for a duplicate or a logged insert failure.
+	 */
+	public static function saveLink2DB($canonicalUrl, $domain, $urlHash, $crawledPageId) {
+		$existing = Model::factory(self::$_table)->where('canonicalUrl', $canonicalUrl)->find_one();
+		if ($existing) {
+			return null;
+		}
+		$fields = array(
+			'canonicalUrl' => $canonicalUrl,
+			'domain' => $domain,
+			'urlHash' => $urlHash,
+			'crawledPageId' => $crawledPageId,
+		);
+
+		try {
+			$link = Model::factory(self::$_table);
+			$link->create();
+			foreach ($fields as $column => $value) {
+				$link->$column = $value;
+			}
+			$link->save();
+			return $link->id;
+		}
+		catch(Exception $ex) {
+			logException($ex);
+		}
+		return null;
+	}
+}
+
+?>
\ No newline at end of file

Added: templates/crawler/crawler.ihtml
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ templates/crawler/crawler.ihtml	Sun Aug 18 00:16:16 2013	(r932)
@@ -0,0 +1,53 @@
+<html>
+	<head>
+		<!-- page_title is assigned by wwwbase/Crawler/index.php. -->
+		<title>{$page_title}</title>
+
+		<script type="text/javascript" src='http://code.jquery.com/jquery-2.0.3.js'></script>
+		<!-- crawler_ajax.js fills the .total and .perHttpCode spans and resizes #logo. -->
+		<script type="text/javascript" src='../js/crawler_ajax.js'></script>
+
+		<link rel="StyleSheet" type="text/css" href="../styles/crawler.css"/>
+
+	</head>
+	<body>
+		
+
+		<div id="crawlerTitle">
+			<img src="../img/crawler/romanian_crawler_log.png">
+		</div>
+
+		<!-- Domain picker: the values and options lists are assigned by index.php. -->
+		<div id="selectDomain">
+			<br>
+			<span class="selectDomain">Select domain: </span>
+			
+			<select name="dropDown">
+				{html_options values=$values output=$options} 
+			</select>
+			
+			<br>
+		</div>
+		<!-- Stats panel; .inSelection starts as "all" and is updated by crawler_ajax.js. -->
+		<div id="info">
+				<span class="domain">
+					<center>Showing: <span class="inSelection">all</span></center>
+				<br>
+				</span>			
+				<span class="infoTitle"><center>General Stats</center></span>
+				
+				<div class="infoPanel">
+					<span class="total"></span>
+				</div>
+				
+				<span class="infoTitle"><center>HTTP Code Stats</center></span>
+			
+			<div class="infoPanel">
+				<span class="perHttpCode"></span>
+			</div>
+		</div>
+
+		<img id="logo" src="../img/logo-dexonline-2.png"/>
+
+	</body>
+</html>

Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php	Sat Aug 17 19:55:07 2013	(r931)
+++ wwwbase/Crawler/AbstractCrawler.php	Sun Aug 18 00:16:16 2013	(r932)
@@ -7,7 +7,7 @@
 require_once '../../phplib/serverPreferences.php';
 require_once '../../phplib/db.php';
 require_once '../../phplib/idiorm/idiorm.php';
-
+require_once '../../phplib/idiorm/paris.php';
 
 require_once 'AppLog.php';
 
@@ -259,53 +259,8 @@
 
 		$domain = $this->getDomain($url);
 
-		$this->saveLink2DB($canonicalUrl, $domain, $urlHash, $this->currentPageId);
-	}
-
-	//adauga o intrare nou in tabelul Link
-	function saveLink2DB($canonicalUrl, $domain, $urlHash, $crawledPageId) {
-
-		//nu inseram acelasi link de 2 ori
-		if (ORM::for_table('Link')->where('canonicalUrl', $canonicalUrl)->find_one()) {
-			return;
-		}
-
-		try {
-			$tableObj = ORM::for_table("Link");
-			$tableObj->create();
-			$tableObj->canonicalUrl = $canonicalUrl;
-			$tableObj->domain = $domain;
-			$tableObj->urlHash = $urlHash;
-			$tableObj->crawledPageId = $crawledPageId;
-			$tableObj->save();
-		}
-		catch(Exception $ex) {
-
-			logException($ex);
-		}
+		Link::saveLink2DB($canonicalUrl, $domain, $urlHash, $this->currentPageId);
 	}
-	//salveaza informatiile despre pagina curent crawl-ata in tabelul CrawledPage
-	function savePage2DB($url, $httpStatus, $rawPagePath, $parsedTextPath) {
-
-		try {
-			$tableObj = ORM::for_table("CrawledPage");
-			$tableObj->create();
-			$tableObj->timestamp = $this->currentTimestamp;
-			$tableObj->url = $url;
-			$tableObj->httpStatus = ''.$this->info["http_code"];
-			$tableObj->rawPagePath = $rawPagePath;
-			$tableObj->parsedTextPath = $parsedTextPath;
-			$tableObj->save();
-
-			$this->currentPageId = ORM::for_table('CrawledPage')->order_by_desc('id')->find_one()->id;
-
-		}
-		catch(Exception $ex) {
-
-			logException($ex);
-		}
-	}
-
 
 	function isRelativeLink($url) {
 

Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php	Sat Aug 17 19:55:07 2013	(r931)
+++ wwwbase/Crawler/Crawler.php	Sun Aug 18 00:16:16 2013	(r932)
@@ -62,7 +62,7 @@
 			$this->setStorePageParams();
 
 			//salveaza o intrare despre pagina curenta in baza de date
-			$this->savePage2DB($this->currentUrl, $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath);
+			$this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
 			
 			//daca pagina nu e in format html (e imagine sau alt fisier)
 			//sau daca am primit un cod HTTP de eroare, sarim peste pagina acesta
@@ -102,7 +102,7 @@
 if (strstr( $_SERVER['SCRIPT_NAME'], 'Crawler.php')) {
 
 	$obj = new Crawler();
-	$obj->startCrawling("http://wiki.dexonline.ro/");
-	//$obj->startCrawling("http://www.romlit.ro");
+	//$obj->startCrawling("http://wiki.dexonline.ro/");
+	$obj->startCrawling("http://www.romlit.ro");
 }
 ?>
\ No newline at end of file

Modified: wwwbase/Crawler/clean_all.php
==============================================================================
--- wwwbase/Crawler/clean_all.php	Sat Aug 17 19:55:07 2013	(r931)
+++ wwwbase/Crawler/clean_all.php	Sun Aug 18 00:16:16 2013	(r932)
@@ -13,10 +13,7 @@
 
 	exec("rm -rf $regexPath");
 }
-$user_ag =  pref_getSectionPreference('crawler', 'user_agent_location').pref_getSectionPreference('crawler', 'new_line');
-echo file_get_contents($user_ag);
 
-/*
 try {
 
 	//sterge toate fisierele salvate

Added: wwwbase/Crawler/index.php
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ wwwbase/Crawler/index.php	Sun Aug 18 00:16:16 2013	(r932)
@@ -0,0 +1,33 @@
+<?php
+
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+require_once '../../phplib/db.php';
+require_once '../../phplib/idiorm/idiorm.php';
+require_once '../../phplib/idiorm/paris.php';
+
+
+$rows = CrawledPage::getListOfDomains();
+
+// Parallel label/value lists for {html_options}; 'all' always comes first.
+$options = array('all');
+$values = array('all');
+
+// end() returns false on an empty list, so offer the "most recent domain"
+// shortcut only when at least one domain has been crawled.
+$last = end($rows);
+if ($last) {
+	$options[] = 'most recent domain';
+	$values[] = $last->domain;
+}
+foreach ($rows as $obj) {
+	$options[] = $obj->domain;
+	$values[] = $obj->domain;
+}
+
+SmartyWrap::assign('page_title', 'Romanian Crawler Log');
+SmartyWrap::assign('values', $values);
+SmartyWrap::assign('options', $options);
+SmartyWrap::smartyDisplay('crawler/crawler.ihtml');
+
+?>

Added: wwwbase/ajax/fetchCrawlerStatus.php
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ wwwbase/ajax/fetchCrawlerStatus.php	Sun Aug 18 00:16:16 2013	(r932)
@@ -0,0 +1,103 @@
+<?php
+/*
+ * Alin Ungureanu, 2013
+ * alyn.cti at gmail.com
+ */
+require_once '../../phplib/util.php';
+require_once '../../phplib/serverPreferences.php';
+require_once '../../phplib/db.php';
+require_once '../../phplib/idiorm/idiorm.php';
+require_once '../../phplib/idiorm/paris.php';
+
+
+class FetchCrawlerStatus {
+
+	/**
+	 * Builds the "where" fragment and bound values that limit CrawledPage
+	 * rows to one domain. The domain comes from the request, so it is bound
+	 * as a query parameter instead of being spliced into the SQL text.
+	 */
+	private function crawledPageFilter($domain) {
+		if (!isset($domain)) {
+			return array('', array());
+		}
+		return array(" where url like ?", array("http://$domain%"));
+	}
+
+	/**
+	 * Returns an HTML fragment with one "status: count" line per HTTP status
+	 * stored in CrawledPage, optionally restricted to $domain.
+	 */
+	function getStatusPerHttpCode($domain = null) {
+		list($filter, $params) = $this->crawledPageFilter($domain);
+
+		$rows = Model::factory('CrawledPage')
+			->raw_query("Select httpStatus, count(httpStatus) as cnt from CrawledPage $filter Group by httpStatus;", $params)
+			->find_many();
+
+		$data = '';
+		foreach ($rows as $row) {
+			$data .= $row->httpStatus . ': <span class="httpStatusResult">' . $row->cnt .'</span><br>';
+		}
+		return $data;
+	}
+
+	/**
+	 * Returns an HTML fragment with the number of crawled pages and of
+	 * discovered links, optionally restricted to $domain.
+	 */
+	function getTotalNumber($domain = null) {
+		list($pageFilter, $pageParams) = $this->crawledPageFilter($domain);
+		$linkFilter = isset($domain) ? " where domain like ?" : '';
+		$linkParams = isset($domain) ? array($domain) : array();
+
+		$pages = Model::factory('CrawledPage')
+			->raw_query("Select count(*) as cnt from CrawledPage $pageFilter;", $pageParams)
+			->find_one()->cnt;
+		$links = Model::factory('Link')
+			->raw_query("Select count(*) as cnt from Link $linkFilter;", $linkParams)
+			->find_one()->cnt;
+
+		$data = 'Total processed pages: <span class="totalProcessed">'. $pages . '</span><br>';
+		$data .= 'Total links discovered: <span class="totalLinks">'. $links . '</span><br>';
+		return $data;
+	}
+}
+
+
+// Entry point for the AJAX status requests.
+//
+// Expected POST fields:
+//   method - 'fetch_total' or 'fetch_per_http_code'
+//   domain - a domain name, or 'all' for no restriction
+//
+// Anything else (missing field, unknown method) produces no output.
+$requestedMethod = isset($_POST['method']) ? $_POST['method'] : null;
+
+if (!empty($requestedMethod) && isset($_POST['domain'])) {
+
+	$fetch = new FetchCrawlerStatus();
+
+	// 'all' maps to null, which the report methods take as "no domain filter".
+	$scope = ($_POST['domain'] == 'all') ? null : $_POST['domain'];
+
+	switch ($requestedMethod) {
+
+		case 'fetch_total':
+
+			echo $fetch->getTotalNumber($scope);
+			break;
+
+		case 'fetch_per_http_code':
+
+			echo $fetch->getStatusPerHttpCode($scope);
+			break;
+
+		default:
+
+			// Unknown method: stay silent.
+			break;
+	}
+}
+
+?>
\ No newline at end of file

Added: wwwbase/js/crawler_ajax.js
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ wwwbase/js/crawler_ajax.js	Sun Aug 18 00:16:16 2013	(r932)
@@ -0,0 +1,60 @@
+var crawlerStatus = {}
+
+
+crawlerStatus.fetchStatus = function() {
+
+	var selectedDomain = $('select[name=dropDown]').val();
+
+	$.ajax({
+
+		url: '../ajax/fetchCrawlerStatus.php',
+		type: 'post',
+		data: { method: 'fetch_total', domain: selectedDomain },
+		success: function(data) {
+			//face refresh la informatiile generale
+			$('.total').html(data + '</br>');
+
+		}
+	});
+}
+
+crawlerStatus.fetchHttpStatus = function() {
+
+	var selectedDomain = $('select[name=dropDown]').val();
+
+	$.ajax({
+
+		url: '../ajax/fetchCrawlerStatus.php',
+		type: 'post',
+		data: { method: 'fetch_per_http_code', domain: selectedDomain },
+		success: function(data) {
+			//face refresh la HTTP status
+			$('.perHttpCode').html(data);
+
+			//schimba automat dimensiunea logo-ului
+			$('#logo').height($('#info').height() * 2/3);
+			$('#logo').css('margin-top', $('#info').height() * 2/7);
+			$('#logo').css('margin-left', $('#info').height() * 1/4);
+		}
+
+	});
+}
+
+
+$(document).ready(function() {
+
+
+	crawlerStatus.fetchStatus();
+	crawlerStatus.fetchHttpStatus();
+
+	setInterval(crawlerStatus.fetchStatus, 5000);
+	setInterval(crawlerStatus.fetchHttpStatus, 5000);
+	
+	$('select[name=dropDown]').change(function() {
+
+
+		$('.inSelection').html($('select[name=dropDown]').val());
+		crawlerStatus.fetchStatus();
+		crawlerStatus.fetchHttpStatus();
+	});
+});
\ No newline at end of file

Added: wwwbase/styles/crawler.css
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ wwwbase/styles/crawler.css	Sun Aug 18 00:16:16 2013	(r932)
@@ -0,0 +1,101 @@
+/* Styles for the crawler status page (templates/crawler/crawler.ihtml). */
+
+/* Stats panel that holds the general and per-HTTP-code figures. */
+#info {
+	overflow: hidden;
+	position: relative;
+	display: inline-block;
+	float: left;
+	margin-top: 30px;
+	background-color: #ffe075;
+	width: 310px;
+	margin-left: 30px;
+	padding: 10px;
+}
+
+.total, .perHttpCode {
+
+	font-weight: 600;
+	font-size: 24px;
+	font-family: serif;
+	color: #3399FF;
+}
+
+.infoPanel {
+
+	position: relative;
+	display: inline-block;
+	margin-left: 10px;
+}
+
+.totalProcessed, .totalLinks, .httpStatusResult {
+
+	color: #FF3300;
+	font-family: "Times New Roman";
+	font-weight: 900;
+	font-size: 26px;
+}
+
+.infoTitle, .domain {
+
+	position: relative;
+	text-align: center;
+	font-weight: 700;
+	font-family: arial;
+	font-size: 24px;
+	padding-top: 20px;
+	color: #8F0000;
+}
+
+.inSelection {
+
+	color: #6B4724;
+}
+
+
+.selectDomain {
+
+	text-align: left;
+	font-weight: 900;
+	font-family: serif;
+	font-size: 21px;
+	color: #8F0000;
+}
+
+#selectDomain {
+
+	margin-left: 30px;
+}
+
+#selectDomain select {
+
+	width: 172px;
+}
+
+#logo {
+
+	position: relative;
+	display: inline-block;
+	margin-top: 60px;
+	margin-left: 40px;
+}
+
+
+@media (max-device-width: 640px) {
+
+
+	#crawlerTitle img {
+
+		width: 310px !important;
+		height: 50px !important;
+		margin: 0 auto !important;
+	}
+
+	#selectDomain, #info, #logo {
+
+		margin: 0 auto !important;
+		margin-top: 20px !important;
+	}
+}
+
+


More information about the Dev mailing list