Location: PHPKode > scripts > Pretty Latin > pretty-latin/PrettyLatin.class.php
<?php
/**
 * PrettyLatin - helper for utf decoding (free of multibyte extension functions)
 * Copyright (C) 2008 by Michal Amerek.
 * $Id: PrettyLatin.class.php,v 1.1 2008/03/04 00:44 ameros Exp $
 *
 * License: GNU LGPL (http://www.opensource.org/licenses/lgpl-license.html)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
 
/**
 * PrettyLatin helper class.
 * Helpful especially for transliterating national
 * characters to Latin ones. Useful, for instance, for creating
 * friendly URLs from blog post titles.
 * Two ways of further custom development:
 * - the entities table can be extended with new entity names
 * (for non-Latin alphabets such as Cyrillic)
 * - the mapping function can be extended with missing character codes
 * (for particular characters that extend the Latin base)
 *
 * @author Michal Amerek (ameros) <hide@address.com>
 * @contributors
 * @since 2008/03/04
 * @version 1.1 - filling fully russian characters
 * @version 1.0 - filling fully only polish characters (plus some others)
 */
class PrettyLatin{

    /**
     * Character substituted for a character that has no Latin equivalent.
     *
     * @type string
     * @public
     */
    public $rep = "_";

    /**
     * Lookup table "character => &entityname;" filled by the constructor.
     * Keys for the Latin-1 range are single ISO-8859-1 bytes (that is what
     * utfToLatin() looks up); keys added from entities.ini are UTF-8 strings.
     * Declared explicitly (and kept public, as the former dynamic property
     * was) so that no dynamic property is created at runtime.
     *
     * @type array
     * @public
     */
    public $arrEntities = array();

    /**
     * Patterns recognising "friendly" entity names: the captured group is the
     * Latin base letter(s), the suffix is the diacritic/alphabet descriptor.
     *
     * @type array
     * @private
     * @TODO: should be refilled with possible descriptors
     * suffixing latin character equivalent for national one
     * @see http://webdesign.about.com/library/bl_htmlcodes.htm
     */
    private $arrPatterns = array(
        '/&(.)uml;/',
        '/&(.)acute;/',
        '/&(.)grave;/',
        '/&(.)circ;/',
        '/&(.)tilde;/',
        '/&(.)ring;/',
        '/&(..)lig;/',
        '/&(.)cedil;/',
        '/&(.)slash;/',
        '/&(.)macron;/',
        '/&(.)breve;/',
        '/&(.)dot;/',
        '/&(.)hachek;/',
        '/&(..)hachek;/',
        '/&(.)stroke;/',
        '/&(.)cyr;/',
        '/&(..)cyr;/',
        '/&(....)cyr;/',
        '/&(.)ogonek;/',
        '/&(.)bar;/',
    );

    /**
     * Replacement applied for every pattern above: keep only the base letter(s).
     *
     * @type string
     * @private
     */
    private $arrRepl = '$1';

    /**
     * Unicode code point => Latin base letter, for characters that have no
     * named entity in the entities table.
     *
     * @type array
     * @private
     */
    private $arrLatinMap = array(
        256 => 'A', 258 => 'A', 260 => 'A', 478 => 'A', 506 => 'A',
        257 => 'a', 259 => 'a', 261 => 'a', 479 => 'a', 507 => 'a',
        262 => 'C', 264 => 'C', 266 => 'C', 268 => 'C',
        263 => 'c', 265 => 'c', 267 => 'c', 269 => 'c',
        313 => 'L', 315 => 'L', 317 => 'L', 319 => 'L', 321 => 'L',
        314 => 'l', 316 => 'l', 318 => 'l', 320 => 'l', 322 => 'l',
        323 => 'N', 325 => 'N', 327 => 'N',
        324 => 'n', 326 => 'n', 328 => 'n',
        346 => 'S', 348 => 'S', 350 => 'S', 352 => 'S', 7776 => 'S',
        347 => 's', 349 => 's', 351 => 's', 353 => 's', 7777 => 's',
        377 => 'Z', 379 => 'Z', 381 => 'Z',
        378 => 'z', 380 => 'z', 382 => 'z',
    );

    /**
     * Maps a national character to the ASCII code of its Latin base letter.
     *
     * @param string(1) c - one UTF-8 encoded character
     * @return int|null - ASCII value, or null when the character is not mapped
     */
    private function maptolatin($c){
        $ord = $this->uniord($c);
        if ($ord === false || !isset($this->arrLatinMap[$ord])) {
            return null;
        }
        return ord($this->arrLatinMap[$ord]);
    }

    /**
     * Constructor: builds the entities table and extends it from entities.ini.
     *
     * The ini file maps a decimal code point to an entity name (without the
     * surrounding "&" and ";"). It is looked up in the current working
     * directory first and next to this class file second; when it cannot be
     * found the built-in table is used on its own.
     *
     * @param mixed example - when truthy, example() is run first
     */
    public function __construct($example=0){
        if ($example) {
            try {
                $this->example();
            }
            catch (Exception $e) {
                die($e->getMessage());
            }
        }
        // The charset is pinned to ISO-8859-1 because utfToLatin() looks up
        // single-byte Latin-1 keys; the PHP >= 5.4 default (UTF-8) would
        // produce multi-byte keys that never match.
        $this->arrEntities = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');

        // this is the file where new entities name can be defined
        $iniFile = 'entities.ini';
        if (!is_file($iniFile)) {
            $iniFile = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'entities.ini';
        }
        $arrEntitiesExt = is_file($iniFile) ? parse_ini_file($iniFile) : false;
        if (is_array($arrEntitiesExt)) {
            foreach ($arrEntitiesExt as $key => $value) {
                $this->arrEntities[html_entity_decode("&#".$key.";", ENT_QUOTES, "UTF-8")] = "&".$value.";";
            }
        }
    }

    /**
     * proper function of interest - it replaces national characters by latin equivalents
     *
     * Characters in the Latin-1 range are translated through the entities
     * table; other characters go through the entities extension and the code
     * point map, and become $rep when nothing is known about them. A literal
     * "?" is dropped from the output (behaviour kept from earlier versions).
     * Input that is not valid UTF-8 yields an empty string.
     *
     * @param string - text to be translated (UTF-8)
     * @return string - latin text
     * @example PrettyLatin->utfToLatin('ÅÆČńąя꣹󜟿') @see PrettyLatin::example()
     */
    public function utfToLatin($tst) {
        $tst = (string)$tst;
        if ($tst === '') {
            return '';
        }
        // Split once into UTF-8 characters; unlike a "." based match this also
        // handles newlines and does not rescan the remainder for every character.
        $chars = preg_split('//u', $tst, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            return '';
        }
        $latin = '';
        foreach ($chars as $ch) {
            $code = $this->uniord($ch);
            if ($ch !== '?' && $code !== false && $code < 256) {
                // Latin-1 range: look the single-byte form up in the entities table
                $latin .= $this->remove_diacritic(chr($code));
            } else if (strlen($ch) > 1) {
                $new_char = $this->remove_diacritic($ch);
                if ($new_char === $ch) {
                    $new_char = $this->rep;
                }
                $latin .= $new_char;
            }
        }
        return $latin;
    }

    /**
     * function removing diacritic signs from character
     *
     * @param string - one character (a Latin-1 byte or a UTF-8 character)
     * @return string - base letter(s); the entity text itself when no pattern
     *                  matches; or the unchanged input when nothing is known
     */
    public function remove_diacritic($c){
        if (!array_key_exists($c, $this->arrEntities)) {
            $ascii = $this->maptolatin($c);
            if ($ascii) {
                return chr($ascii);
            }
            return $c;
        }
        return preg_replace($this->arrPatterns, $this->arrRepl, $this->arrEntities[$c]);
    }

    /**
     * Decodes the first UTF-8 sequence of a string into its code point.
     *
     * @param string(1) c
     * @return int|false - code point; 0 for an empty string or a stray
     *                     continuation byte (as before); false for the lead
     *                     bytes 0xFE/0xFF and for a truncated sequence
     */
    public function uniord($c) {
        $c = (string)$c;
        $len = strlen($c);
        if ($len === 0) {
            return 0;
        }
        $lead = ord($c[0]);
        if ($lead <= 127) {
            return $lead;
        }
        if ($lead >= 254) {
            return false;
        }
        if ($lead < 192) {
            return 0;
        }
        if ($lead <= 223) {
            $extra = 1; $uord = $lead - 192;
        } else if ($lead <= 239) {
            $extra = 2; $uord = $lead - 224;
        } else if ($lead <= 247) {
            $extra = 3; $uord = $lead - 240;
        } else if ($lead <= 251) {
            $extra = 4; $uord = $lead - 248;
        } else {
            $extra = 5; $uord = $lead - 252;
        }
        if ($len <= $extra) {
            // truncated sequence: the continuation bytes are not there
            return false;
        }
        // every continuation byte contributes its low 6 bits
        for ($i = 1; $i <= $extra; $i++) {
            $uord = $uord * 64 + (ord($c[$i]) - 128);
        }
        return $uord;
    }

    /**
     * example test function
     * @run PrettyLatin->example()
     *
     * @param string - sample text to translate and print
     */
    public function example($string="ÅÆČńąя꣹󜟿ЩЯч"){
        $c = __CLASS__;
        $PL = new $c;
        print "UTF=>".$string."<br/>to latin=>".$PL->utfToLatin($string);
    }
}
?>
Return current item: Pretty Latin