[dev] [commit] r1005 - phplib wwwbase/Crawler

Wed Sep 25 09:49:22 EEST 2013

Author: alinu
Date: Wed Sep 25 09:49:22 2013
New Revision: 1005

Log:
Am găsit o funcție implementată foarte frumos pe internet, parse_utf8_url(), pe care am adăugat-o la sfârșitul phplib/util.php. Folosind această funcție în crawler, am rezolvat problema parsării linkurilor de forma en.wikipedia.org: înainte, en.wikipedia.org era 'path' (parse_url), acum este 'host'.

Modified:
   phplib/util.php
   wwwbase/Crawler/AbstractCrawler.php
   wwwbase/Crawler/Crawler.php

Modified: phplib/util.php
==============================================================================

--- phplib/util.php	Mon Sep 23 09:53:21 2013	(r1004)
+++ phplib/util.php	Wed Sep 25 09:49:22 2013	(r1005)
@@ -491,4 +491,18 @@
   return $result;
 }
 
+// UTF-8 aware parse_url() substitute: returns the non-empty named URL parts as strings, or false.
+// Userinfo may not span '/', '?' or '#' (RFC 3986); a literal '0' part is kept (empty() dropped it).
+function parse_utf8_url($url) {
+    static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0);
+    if (!is_string($url) || !preg_match(
+            '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:@/?#]+):(?P<pass>[^@/?#]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' .
+            '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches))
+        return false; // non-string input, or preg_match() reported no match / an error
+    foreach (array_keys($matches) as $key)
+        if (!isset($keys[$key]) || $matches[$key] === '') // numbered group, or named part that is empty
+            unset($matches[$key]);
+    return $matches;
+}
+
 ?>

Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php	Mon Sep 23 09:53:21 2013	(r1004)
+++ wwwbase/Crawler/AbstractCrawler.php	Wed Sep 25 09:49:22 2013	(r1005)
@@ -213,7 +213,7 @@
 
     function eligeableUrl($url) {
 
-    	$resource = parse_url($url);
+    	$resource = parse_utf8_url($url);
     	$pathInfo = pathinfo($resource['path']);
 
     	if (isset($pathInfo['extension'])) {
@@ -293,7 +293,7 @@
 		}
 		
 
-		$parsedUrl = parse_url($url);
+		$parsedUrl = parse_utf8_url($url);
 		
 
 		if (substr_count($parsedUrl['host'], '.') < 2) {

Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php	Mon Sep 23 09:53:21 2013	(r1004)
+++ wwwbase/Crawler/Crawler.php	Wed Sep 25 09:49:22 2013	(r1005)
@@ -133,7 +133,7 @@
 			//curatam url-ul
 			$this->currentUrl = $this->urlPadding($startUrl);
 			//impartim url-ul pe componente
-			$this->urlResource = parse_url($this->currentUrl);
+			$this->urlResource = parse_utf8_url($this->currentUrl);
 
 			//salvam startUrl in tabelul Link pentru a incepe extragerea,
 			//startUrl nu va avea o pagina din care este descoperit