[dev] [commit] r1005 - phplib wwwbase/Crawler
automailer at dexonline.ro
automailer at dexonline.ro
Wed Sep 25 09:49:22 EEST 2013
Author: alinu
Date: Wed Sep 25 09:49:22 2013
New Revision: 1005
Log:
Am găsit pe internet o funcție implementată foarte frumos, parse_utf8_url(), pe care am adăugat-o la sfârșitul fișierului phplib/util.php. Folosind această funcție în crawler, am rezolvat problema parsării linkurilor de forma en.wikipedia.org: înainte (cu parse_url), en.wikipedia.org era 'path'; acum este 'host'.
Modified:
phplib/util.php
wwwbase/Crawler/AbstractCrawler.php
wwwbase/Crawler/Crawler.php
Modified: phplib/util.php
==============================================================================
--- phplib/util.php Mon Sep 23 09:53:21 2013 (r1004)
+++ phplib/util.php Wed Sep 25 09:49:22 2013 (r1005)
@@ -491,4 +491,18 @@
return $result;
}
+function parse_utf8_url($url) // Splits a URL into named components with a UTF-8 regex; returns an array of the non-empty components, or false.
+{
+ static $keys = array('scheme'=>0,'user'=>0,'pass'=>0,'host'=>0,'port'=>0,'path'=>0,'query'=>0,'fragment'=>0); // whitelist of component names kept in the result; only the keys are used
+ if (is_string($url) && preg_match( // non-string input or a failed match falls through to `return false`
+ '~^((?P<scheme>[^:/?#]+):(//))?((\\3|//)?(?:(?P<user>[^:]+):(?P<pass>[^@]+)@)?(?P<host>[^/?:#]*))(:(?P<port>\\d+))?' . // scheme, optional user:pass@, host, port. NOTE(review): [^:]+ for user is not bounded by '/', '?' or '#', so an "x:y@" later in the path/query looks like it can be captured as credentials -- confirm
+ '(?P<path>[^?#]*)(\\?(?P<query>[^#]*))?(#(?P<fragment>.*))?~u', $url, $matches)) { // path, query, fragment; the `u` modifier makes PCRE treat pattern and subject as UTF-8
+ foreach ($matches as $key => $value) // $matches contains numbered groups as well as the named ones
+ if (!isset($keys[$key]) || empty($value)) // drop numbered groups and empty captures. NOTE(review): empty() is also true for the string "0", so a component equal to "0" is discarded -- confirm this is intended
+ unset($matches[$key]);
+ return $matches; // only whitelisted, non-empty named components remain
+ }
+ return false;
+}
+
?>
Modified: wwwbase/Crawler/AbstractCrawler.php
==============================================================================
--- wwwbase/Crawler/AbstractCrawler.php Mon Sep 23 09:53:21 2013 (r1004)
+++ wwwbase/Crawler/AbstractCrawler.php Wed Sep 25 09:49:22 2013 (r1005)
@@ -213,7 +213,7 @@
function eligeableUrl($url) {
- $resource = parse_url($url);
+ $resource = parse_utf8_url($url);
$pathInfo = pathinfo($resource['path']);
if (isset($pathInfo['extension'])) {
@@ -293,7 +293,7 @@
}
- $parsedUrl = parse_url($url);
+ $parsedUrl = parse_utf8_url($url);
if (substr_count($parsedUrl['host'], '.') < 2) {
Modified: wwwbase/Crawler/Crawler.php
==============================================================================
--- wwwbase/Crawler/Crawler.php Mon Sep 23 09:53:21 2013 (r1004)
+++ wwwbase/Crawler/Crawler.php Wed Sep 25 09:49:22 2013 (r1005)
@@ -133,7 +133,7 @@
//curatam url-ul
$this->currentUrl = $this->urlPadding($startUrl);
//impartim url-ul pe componente
- $this->urlResource = parse_url($this->currentUrl);
+ $this->urlResource = parse_utf8_url($this->currentUrl);
//salvam startUrl in tabelul Link pentru a incepe extragerea,
//startUrl nu va avea o pagina din care este descoperit
More information about the Dev
mailing list