[dev] [commit] r990 - wwwbase

automailer at dexonline.ro automailer at dexonline.ro
Thu Sep 19 08:40:51 EEST 2013


Author: alinu
Date: Thu Sep 19 08:40:51 2013
New Revision: 990

Log:


Added:
   wwwbase/diacritice.php

Added: wwwbase/diacritice.php
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ wwwbase/diacritice.php	Thu Sep 19 08:40:51 2013	(r990)
@@ -0,0 +1,234 @@
+<?php
+
+require_once '../phplib/util.php';
+require_once '../phplib/serverPreferences.php';
+require_once '../phplib/db.php';
+require_once '../phplib/idiorm/idiorm.php';
+require_once '../phplib/idiorm/paris.php';
+
+require_once 'Crawler/AppLog.php';
+require_once 'Crawler/MemoryManagement.php';
+
+
+db_init();
+// NOTE(review): $logFile is not read in this file; presumably crawlerLog() (Crawler/AppLog.php) uses it as a global - confirm.
+$logFile = pref_getSectionPreference('crawler', 'diacritics_log');
+
+
+class DiacriticsFixer {
+
+	// Letter tables: the plain form plus its diacritic variants (null = no such variant).
+	// getCharArray() reads them via self::$$ch, so the names $a, $i, $s, $t must not change.
+	private static $a = array('defaultForm' => 'a', 'curvedForm' => 'ă', 'circumflexForm' => 'â');
+	private static $i = array('defaultForm' => 'i', 'curvedForm' => null, 'circumflexForm' => 'î');
+	private static $s = array('defaultForm' => 's', 'curvedForm' => 'ș', 'circumflexForm' => null);
+	private static $t = array('defaultForm' => 't', 'curvedForm' => 'ț', 'circumflexForm' => null);
+
+	private $resultText;	// corrected text built so far
+	private $lastOffset;	// first offset of $text not yet copied to $resultText
+
+	protected $currOffset;	// scan position used by getNextOffset()
+	protected $text;	// text being processed
+	protected $fileEndOffset;	// not read or written anywhere in this class
+	protected $textEndOffset;	// offset of the last character of $text; set by processText()
+
+	protected static $diacritics;
+	protected static $nonDiacritics;
+	protected static $paddingNumber;
+	protected static $paddingChar;
+
+	/*
+	 * Copies the 'crawler' section preferences (diacritics, non_diacritics,
+	 * diacritics_padding_length, padding_char) into the static fields.
+	 */
+	function __construct() {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+
+		self::$diacritics = pref_getSectionPreference("crawler", "diacritics");
+		self::$nonDiacritics = pref_getSectionPreference("crawler", "non_diacritics");
+		self::$paddingNumber = pref_getSectionPreference('crawler', 'diacritics_padding_length');
+		self::$paddingChar = pref_getSectionPreference('crawler', 'padding_char');
+	}
+
+	/*
+	 * Returns the next offset whose character passes isPossibleDiacritic() and
+	 * moves the scan position just past it; returns null at the end of the text.
+	 */
+	function getNextOffset() {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		while($this->currOffset <= $this->textEndOffset) {
+			if (self::isPossibleDiacritic(StringUtil::getCharAt($this->text, $this->currOffset))) {
+				return $this->currOffset ++;
+			}
+			$this->currOffset ++;
+		}
+		return null;
+	}
+
+	/* A character is a separator unless ctype_lower() accepts it or it is a hyphen. */
+	static function isSeparator($ch) {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		return !(ctype_lower($ch) || $ch == '-');
+	}
+
+	/* Scans $text candidate by candidate and builds the corrected copy in $resultText. */
+	function processText($text) {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+
+		$this->currOffset = 0;
+		$this->lastOffset = 0;
+		$this->resultText = '';
+		$this->text = $text;
+		$this->textEndOffset = mb_strlen($text) - 1;
+
+		// Strict test: offset 0 is a valid hit, whereas a loose "!= null" treats it as
+		// "no more candidates" and leaves texts that start with a candidate unfixed.
+		while(($offset = $this->getNextOffset()) !== null) {
+			$this->leftAndRightPadding($offset);
+		}
+		// copy what follows the last candidate, up to the end of the text
+		$this->resultText .= mb_substr($this->text, $this->lastOffset, $this->textEndOffset - $this->lastOffset + 1);
+	}
+
+	/* Entry point: returns $text with the most probable diacritics applied. */
+	public function fix($text) {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+
+		$this->processText($text);
+		return $this->resultText;
+	}
+
+	/* Lowercases $content with mb_strtolower(). */
+	function toLower($content) {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		return mb_strtolower($content);
+	}
+
+	/* Truthy (strstr() result) when $ch occurs in the configured non_diacritics list. */
+	static function isPossibleDiacritic($ch) {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		return strstr(self::$nonDiacritics, $ch);
+	}
+
+	/*
+	 * Builds the context of the candidate at $offset: up to $paddingNumber characters
+	 * on each side, filled with $paddingChar once a separator or a text boundary is
+	 * hit. Looks the context up with Diacritics::entryExists() and appends either the
+	 * chosen form or the unchanged character to $resultText.
+	 */
+	function leftAndRightPadding($offset) {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		$before = '';
+		$middle = StringUtil::getCharAt($this->text, $offset);
+		$after = '';
+		$infOffset = $offset - 1;
+		$supOffset = $offset + 1;
+
+		for ($i = 0; $i < self::$paddingNumber; $i++) {
+			// The boundary test must short-circuit the read: StringUtil::getCharAt() is
+			// defined elsewhere, and if it wraps mb_substr() a negative offset would
+			// return characters from the END of the text instead of padding.
+			$infCh = ($infOffset < 0) ? null : StringUtil::getCharAt($this->text, $infOffset);
+			if ($infCh === null || self::isSeparator($infCh)) {
+				$before = self::$paddingChar . $before;
+			}
+			else {
+				$before = $infCh . $before;
+				$infOffset --;
+			}
+
+			// same rule on the right, past the last character
+			$supCh = ($supOffset > $this->textEndOffset) ? null : StringUtil::getCharAt($this->text, $supOffset);
+			if ($supCh === null || self::isSeparator($supCh)) {
+				$after = $after . self::$paddingChar;
+			}
+			else {
+				$after = $after . $supCh;
+				$supOffset ++;
+			}
+		}
+
+		crawlerLog("FOUND " . $before . '|' . $middle . '|' . $after);
+
+		$tableObj = Diacritics::entryExists($before, $middle, $after);
+		if ($tableObj != null) {
+			crawlerLog("Entry Exists");
+			$ch = $this->getMostProbableChar($tableObj);
+			// untouched text before the candidate, then the chosen form
+			$this->resultText .= mb_substr($this->text, $this->lastOffset, $offset - $this->lastOffset);
+			$this->resultText .= $ch;
+		}
+		else {
+			// unknown context: copy up to and including the candidate unchanged
+			$this->resultText .= mb_substr($this->text, $this->lastOffset, $offset - $this->lastOffset + 1);
+		}
+
+		// getNextOffset() has already moved currOffset past this candidate
+		$this->lastOffset = $this->currOffset;
+	}
+
+	/*
+	 * Returns the form with the highest value in the row that exists for the row's
+	 * letter. Falls back to the row's own 'middle' letter rather than returning null,
+	 * which would silently delete the character from the result.
+	 */
+	public function getMostProbableChar($tableObj) {
+		crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line '.__LINE__ );
+		$ch = $tableObj->middle;
+		$sortedSet = self::getCharProbabilityArray($tableObj);
+		$charArray = $this->getCharArray($ch);
+
+		crawlerLog("ARRAY ". print_r($sortedSet, true));
+		crawlerLog("WTF " . key($sortedSet));
+
+		if ($charArray !== null) {
+			foreach (array_keys($sortedSet) as $form) {
+				if (isset($charArray[$form])) {
+					return $charArray[$form];
+				}
+			}
+		}
+		return $ch;
+	}
+
+	/* Letter table for $ch, or null when $ch is not one of a, i, s, t. */
+	private function getCharArray($ch) {
+		return in_array($ch, array('a', 'i', 's', 't'), true) ? self::$$ch : null;
+	}
+
+	/* The row's three form values, sorted in descending order with keys preserved. */
+	private static function getCharProbabilityArray($tableObj) {
+		$array = array(
+			'defaultForm' => $tableObj->defaultForm,
+			'curvedForm' => $tableObj->curvedForm,
+			'circumflexForm' => $tableObj->circumflexForm
+			);
+		arsort($array);
+		return $array;
+	}
+
+}
+
+
+// Render the page only when this script itself is the requested entry point
+// (presumably so the class above can be included elsewhere without output).
+if (strstr( $_SERVER['SCRIPT_NAME'], 'diacritice.php')) {
+
+	SmartyWrap::assign('page_title', 'Corector diacritice');
+
+	// Only a non-empty string is processed: a request sending text[]=... makes
+	// $_POST['text'] an array, which must not reach the multibyte string calls.
+	$text = isset($_POST['text']) ? $_POST['text'] : '';
+	if (is_string($text) && $text !== '') {
+		$obj = new DiacriticsFixer();
+		SmartyWrap::assign('result', $obj->fix($text));
+	}
+	else {
+
+		SmartyWrap::assign('result', '');
+	}
+
+	SmartyWrap::displayPageWithSkin('../diacritics_fix/diacritics_fix.ihtml');
+}
+
+?>
\ No newline at end of file


More information about the Dev mailing list