<?php
/**
* PrettyLatin - helper for utf decoding (free of multibyte extension functions)
* Copyright (C) 2008 by Michal Amerek.
* $Id: PrettyLatin.class.php,v 1.1 2008/03/04 00:44 ameros Exp $
*
* License: GNU LGPL (http://www.opensource.org/licenses/lgpl-license.html)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/**
* PrettyLatin helper class.
* Helpful especially for translating national
* characters to latin ones. Useful for creating
* friendly URLs from a blog post title, for instance.
* Two ways of further custom development:
* - the entities table can be extended with new entity names
* (for non-latin alphabets such as cyrillic)
* - the mapping function can be extended with missing character codes
* (for particular characters that extend the latin base)
*
* @author Michal Amerek (ameros) <hide@address.com>
* @contributors
* @since 2008/03/04
* @version 1.1 - filling fully russian characters
* @version 1.0 - filling fully only polish characters (plus some others)
*/
class PrettyLatin{
    /**
    * Replacement emitted for a character that has no latin equivalent
    * (also used for bytes that are not part of a valid UTF-8 sequence).
    *
    * @type string
    * @public
    */
    public $rep = "_";

    /**
    * Translation table: UTF-8 character => named entity (e.g. "&eacute;").
    * Filled by the constructor. Declared public because it used to be
    * created as a dynamic (hence public) property.
    *
    * @type array
    * @public
    */
    public $arrEntities = array();

    /**
    * Patterns that capture the latin base letter(s) out of an entity name
    * by stripping the descriptor that follows them ("&eacute;" => "e").
    * Descriptors that are not standard HTML names (hachek, stroke, cyr, ...)
    * are presumably meant for names defined in entities.ini.
    *
    * @type array
    * @private
    * @TODO: should be refilled with possible descriptors
    * suffixing latin character equivalent for national one
    * @see http://webdesign.about.com/library/bl_htmlcodes.htm
    */
    private $arrPatterns = array(
        '/&(.)uml;/',
        '/&(.)acute;/',
        '/&(.)grave;/',
        '/&(.)circ;/',
        '/&(.)tilde;/',
        '/&(.)ring;/',
        '/&(..)lig;/',
        '/&(.)cedil;/',
        '/&(.)slash;/',
        '/&(.)macron;/',
        '/&(.)breve;/',
        '/&(.)dot;/',
        '/&(.)caron;/',
        '/&(.)hachek;/',
        '/&(..)hachek;/',
        '/&(.)stroke;/',
        '/&(.)cyr;/',
        '/&(..)cyr;/',
        '/&(....)cyr;/',
        '/&(.)ogonek;/',
        '/&(.)bar;/',
    );

    /**
    * Replacement applied for every pattern above: keep only the captured letters.
    *
    * @type string
    * @private
    */
    private $arrRepl = '$1';

    /**
    * Unicode code point => ASCII code of the latin base letter, used for
    * characters that have no entry in the entities table.
    *
    * @type array
    * @private
    */
    private static $latinMap = array(
        // 'A'
        256 => 65, 258 => 65, 260 => 65, 478 => 65, 506 => 65,
        // 'a'
        257 => 97, 259 => 97, 261 => 97, 479 => 97, 507 => 97,
        // 'C'
        262 => 67, 264 => 67, 266 => 67, 268 => 67,
        // 'c'
        263 => 99, 265 => 99, 267 => 99, 269 => 99,
        // 'L'
        313 => 76, 315 => 76, 317 => 76, 319 => 76, 321 => 76,
        // 'l'
        314 => 108, 316 => 108, 318 => 108, 320 => 108, 322 => 108,
        // 'N'
        323 => 78, 325 => 78, 327 => 78,
        // 'n'
        324 => 110, 326 => 110, 328 => 110,
        // 'S'
        346 => 83, 348 => 83, 350 => 83, 352 => 83, 7776 => 83,
        // 's'
        347 => 115, 349 => 115, 351 => 115, 353 => 115, 7777 => 115,
        // 'Z'
        377 => 90, 379 => 90, 381 => 90,
        // 'z'
        378 => 122, 380 => 122, 382 => 122,
    );

    /**
    * Maps a national character to the ASCII code of its latin base letter.
    *
    * @param string(1) c - one UTF-8 encoded character
    * @return int|null - ASCII code, or null when the character is not in the map
    */
    private function maptolatin($c){
        $ord = $this->uniord($c);
        if ($ord !== false && isset(self::$latinMap[$ord])) {
            return self::$latinMap[$ord];
        }
        return null;
    }

    /**
    * Builds the entities table from PHP's HTML entity table and extends it
    * with the names found in "entities.ini" (resolved against the current
    * working directory). Each ini key is used as a numeric character
    * reference and each value as the entity name. A missing or unparsable
    * ini file is skipped, leaving only the built-in table.
    *
    * @param mixed example - when truthy, example() is run after the table is built
    */
    public function __construct($example=0){
        // Flags and encoding are pinned so the table keys are UTF-8 characters
        // regardless of the PHP version's defaults.
        $this->arrEntities = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, "UTF-8");

        // this is the file where new entities name can be defined
        $arrEntitiesExt = is_readable("entities.ini") ? parse_ini_file("entities.ini") : false;
        if (is_array($arrEntitiesExt)) {
            foreach ($arrEntitiesExt as $key=>$value) {
                $this->arrEntities[html_entity_decode("&#".$key.";",ENT_QUOTES,"UTF-8")] = "&".$value.";";
            }
        }

        if ($example) {
            try {
                $this->example();
            }
            catch (Exception $e) {
                die($e->getMessage());
            }
        }
    }

    /**
    * proper function of interest - it replaces national characters by latin equivalents
    *
    * ASCII characters are passed through remove_diacritic(), so the ones
    * present in the entities table come back as entities ('&' => '&amp;').
    * A multi-byte character that remove_diacritic() leaves unchanged, and
    * any byte that is not part of a valid UTF-8 sequence, becomes $this->rep.
    *
    * @param string - text to be translated (UTF-8)
    * @return string - latin text
    * @example PrettyLatin->utfToLatin('Zażółć') gives 'Zazolc' @see PrettyLatin::example()
    */
    public function utfToLatin($tst) {
        $tst = (string)$tst;
        if ($tst === '') {
            return '';
        }
        // Byte-mode split into UTF-8 sequences. The last alternative catches
        // stray bytes, so malformed input and newlines cannot make the match fail.
        preg_match_all(
            '/[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3}|[\x80-\xFF]/',
            $tst,
            $match
        );
        $latin = "";
        foreach ($match[0] as $char) {
            if (strlen($char) === 1) {
                // single byte: plain ASCII, or a stray non-UTF-8 byte
                $latin .= (ord($char) < 128) ? $this->remove_diacritic($char) : $this->rep;
                continue;
            }
            $new_char = $this->remove_diacritic($char);
            // unchanged means no equivalent was found
            $latin .= ($new_char === $char) ? $this->rep : $new_char;
        }
        return $latin;
    }

    /**
    * function removing diacritic signs from character
    *
    * A character found in the entities table is turned into its entity and
    * reduced by the descriptor patterns; an entity matching no pattern is
    * returned as is. Otherwise the code point map is consulted, and the
    * character is returned unchanged when it is not mapped either.
    *
    * @param string - one UTF-8 encoded character
    * @return string
    */
    public function remove_diacritic($c){
        if (array_key_exists($c,$this->arrEntities)) {
            return preg_replace($this->arrPatterns,$this->arrRepl,$this->arrEntities[$c]);
        }
        $ascii = $this->maptolatin($c);
        return $ascii ? chr($ascii) : $c;
    }

    /**
    * Decodes the code point of the first UTF-8 sequence in the given string
    * (the historical 5 and 6 byte forms are still accepted).
    *
    * @param string(1) c
    * @return int|false - code point; 0 for an empty string or a string
    * starting with a continuation byte (0x80-0xBF); false for a 0xFE/0xFF
    * lead byte, a truncated sequence or an invalid continuation byte
    */
    public function uniord($c) {
        $c = (string)$c;
        if ($c === '') {
            return 0;
        }
        $lead = ord($c[0]);
        if ($lead <= 127) {
            return $lead;
        }
        if ($lead <= 191) {
            return 0;
        }
        if ($lead >= 254) {
            return false;
        }
        // number of continuation bytes and payload bits of the lead byte
        if ($lead <= 223) {
            $extra = 1;
            $uord = $lead - 192;
        } else if ($lead <= 239) {
            $extra = 2;
            $uord = $lead - 224;
        } else if ($lead <= 247) {
            $extra = 3;
            $uord = $lead - 240;
        } else if ($lead <= 251) {
            $extra = 4;
            $uord = $lead - 248;
        } else {
            $extra = 5;
            $uord = $lead - 252;
        }
        if (strlen($c) <= $extra) {
            return false;
        }
        for ($i = 1; $i <= $extra; $i++) {
            $byte = ord($c[$i]);
            if ($byte < 128 || $byte > 191) {
                return false;
            }
            // each continuation byte contributes 6 bits
            $uord = $uord * 64 + ($byte - 128);
        }
        return $uord;
    }

    /**
    * example test function - prints the given text and its latin translation
    *
    * @param string - UTF-8 sample text (polish and russian letters by default)
    * @run PrettyLatin->example()
    */
    public function example($string="ĄĆĘŁŃÓŚŹŻ ąćęłńóśźż ЩЯЫ"){
        print "UTF=>".$string."<br/>to latin=>".$this->utfToLatin($string);
    }
}
?>