Location: PHPKode > scripts > convertor > convertor/convertor.class.php
<?php

/*******************************************************************************
________________________________________________________________________________
 PROJECT convertor
	version 2.1
	stream safe character set conversion class based on iconv
	http://www.phpclasses.org/browse/package/993.html
________________________________________________________________________________
 LICENCE
  
	PROJECT convertor - stream safe character set conversion class
	Copyright (C) 2002, 2004  Robert Sevcik
	
	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.
	
	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.
	
	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
	
	Robert Sevcik
	robert,sevcik(a)pavouk,net
________________________________________________________________________________
 CHANGE LOG
	convertor 2.1
		- remade UTF-8 stream preparation
		- added stream safety option
		- code cleanup, comments and error handling
		- excluded html unicode entities support (needs to be coded again)
	convertor 2.0
		- completely remade to implement iconv to be faster and wider
		- stream safety optimized to check only the significant tail
		- interface is the same implementation differs
		- new! convert() is interface too
	convertor 1.1
		- the conversion tabs are now stored in class members
			not as former globals
		- function conv_tab_set moved in class method
	convertor 1.0
		- first official version (replaces old translator)
		
********************************************************************************/

/**
	Convertor class - stream safe character set conversion
	
	uses iconv
	
	
	Character set conversion
	------------------------
	
	Supports whatever iconv supports :)
	
	Note:
		Do not use the old character set names (W1250, I88592, UTF8, UE)
		 - they are deprecated and may not be supported in the future.
		 Use the standard (iconv) names instead.
	
		 
	HTML unicode entity encoding/decoding
	-------------------------------------
	
	NOT IMPLEMENTED IN THIS VERSION.
	The next version is planned to convert unicode entities with stream safety. (&#222;&#x8e;)
	
	Stream safety
	-------------
	
	If a multibyte character is split across two parts of a stream,
	convertor remembers the incomplete part and prepends it to the next part.
	It returns only complete characters. You can find the incomplete part
	of the last character in the Data property after converting or
	pulling a split stream.
	
	Supports stream safety for:
		1. all single-byte encodings :)
		2. utf-8
		// not yet - 3. HTML unicode entities

 	Note:
 		To preserve stream safety for multibyte characterset encodings
 		one convertor instance per stream should be used.
	
 		
 	Examples
 	--------

 	/*************************************************************
 	
 	if(!function_exists('iconv')) die('You must load iconv extension to see this example working!') ;
 	
 	print "Conversion from iso-8859-2 to utf-8 and back again:\r\n\r\n" ;

 	$convISO2UTF = new convertor('iso-8859-2','utf-8') ;
 	$convUTF2ISO = new convertor('utf-8','iso-8859-2') ;

 	$strISO = "pøíli¹ ¾lu»ouèký kùò úpìl ïábelské kódy\r\nPØÍLI© ®LU«OUÈKÝ KÙÒ ÚPÌL ÏÁBELSKÉ KÓDY" ;
 	$strUTF = $convISO2UTF -> convert( $strISO ) ;
 	$strISO2 = $convUTF2ISO -> convert( $strUTF ) ;
 	
 	print "Original:\r\n$strISO\r\n\r\n" ;
 	print "UTF-8:\r\n$strUTF\r\n\r\n" ;
 	print "ISO-8859-2 again:\r\n$strISO2\r\n\r\n" ;
 	
 	//---------------

 	print "UTF-8 stream safety:\r\n\r\n" ;
 	
 	$convUTF2ISO = new convertor('utf-8','iso-8859-2') ;
 	
 	$strUTF = 'ří' ; // = "\xF8\xED" in iso-8859-2
 	
	print $convUTF2ISO -> convert( $strUTF[0] ) ;	// outputs nothing
	print $convUTF2ISO -> convert( $strUTF[1] ) ;	// outputs ř (as the iso-8859-2 byte 0xF8)
	print $convUTF2ISO -> convert( $strUTF[2] ) ;	// outputs nothing
	print $convUTF2ISO -> convert( $strUTF[2] ) ;	// outputs nothing (stream error)
	
	print "\r\n\r\n" ;
	
	print_r( $convUTF2ISO ) ;						// errors are tracked
 	
	/**************************************************************** 	
 	
 		
*/

class convertor
{


	// INTERNAL DATA *************************************************************
	// they should be read only and changed only via interface

	/// private iconv input encoding (lower-cased, legacy aliases expanded)
	var $InputEncoding = null ;
	/// private iconv output encoding (lower-cased, legacy aliases expanded)
	var $OutputEncoding = null ;
	/// private iconv internal encoding; stored for interface compatibility only,
	/// iconv() itself takes explicit source and target charsets
	var $InternalEncoding = null ;
	/// private data provided or incomplete stream tail
	var $Data = null ;

	/// perl regexp pattern indicating which encodings are treated as singlebyte
	/// NOTE(review): this is a loose substring match, so multibyte charsets whose
	/// names contain one of these words (e.g. ISO-2022-JP, CP932) are also treated
	/// as singlebyte and get no tail handling - adjust the pattern if you use them.
	var $SingleBytePattern = '/ASCII|ISO|CP|WINDOWS/i' ;



	// INTERFACE *****************************************************************

	/// specifies whether the stream safety should be preserved or not
	var $PreserveStreamSafety = true ;

	/// readonly error track (list of arrays with keys type, desc, file, line)
	var $Errors = array() ;
	/// readonly last error description
	var $Error = false ;


	/**
		legacy (PHP 4 style) constructor name

		PHP 8 no longer treats a method named like its class as a constructor,
		so the real work lives in __construct(). This method is kept so that
		code calling $obj->convertor(...) or parent::convertor(...) still works.
		It is declared before __construct() on purpose.
	*/
	function convertor( $sInputEncoding = null, $sOutputEncoding = null, $sData = null, $sInternalEncoding = null )
	{
		$this -> __construct( $sInputEncoding, $sOutputEncoding, $sData, $sInternalEncoding ) ;
	}

	/**
		convertor constructor
		does the basic settings and stores the initial data but nothing more

		sInputEncoding / sOutputEncoding / sInternalEncoding
			iconv charset names; an empty value falls back to the matching
			iconv_get_encoding() setting
		sData
			initial data, converted on the next convert() / pull() call

		raises E_USER_ERROR when the iconv extension is not loaded
	*/
	function __construct( $sInputEncoding = null, $sOutputEncoding = null, $sData = null, $sInternalEncoding = null )
	{
		if( !function_exists( 'iconv' ) ) trigger_error( 'ICONV extension required for convertor class!', E_USER_ERROR ) ;

		$sInputEncoding = $sInputEncoding ? strtolower( $sInputEncoding ) : iconv_get_encoding( 'input_encoding' ) ;
		$sOutputEncoding = $sOutputEncoding ? strtolower( $sOutputEncoding ) : iconv_get_encoding( 'output_encoding' ) ;
		$sInternalEncoding = $sInternalEncoding ? strtolower( $sInternalEncoding ) : iconv_get_encoding( 'internal_encoding' ) ;

		// deprecated short names accepted by convertor 1.x
		$aOld2Standard = array(	'w1250'		=> 'windows-1250'
								,'w1256'	=> 'windows-1256'
								,'i88592'	=> 'iso-8859-2'
								,'utf8'		=> 'utf-8'
								,'ue'		=> 'html-unicode-entities'
								);

		// strtr() replaces substrings, so suffixed names such as 'utf8//IGNORE'
		// are expanded as well
		$this -> InputEncoding = strtr( $sInputEncoding, $aOld2Standard ) ;
		$this -> OutputEncoding = strtr( $sOutputEncoding, $aOld2Standard ) ;
		$this -> InternalEncoding = $sInternalEncoding ;
		$this -> Data = $sData ;
	}

	/**
		does the whole thing - converts data from input to output encoding
		if there is internal data remaining, data and stream are concatenated

		sStream     next part of the input stream
		bStreamEnd  indicates that no data should remain after conversion;
		            a remaining incomplete character raises E_USER_WARNING

		returns the converted string, or whatever iconv() returns on failure (false)

		errors are tracked internally in Errors / Error
	*/
	function convert( $sStream, $bStreamEnd=false )
	{
		$this -> _eh( true ) ;

		$this -> Data .= $sStream ;
		$tail = $this -> PreserveStreamSafety ? $this -> _prepareStream() : '' ;
		$result = $this -> _iconv() ;

		// _prepareStream() returns false for a charset it cannot handle; nothing
		// was held back then, so the buffer must become empty, not false
		$this -> Data = is_string( $tail ) ? $tail : '' ;
		if( $this -> Data !== '' && $bStreamEnd ) trigger_error( 'Stream is not complete or is invalid', E_USER_WARNING ) ;

		$this -> _eh( false ) ;

		return $result ;
	}

	/**
		stores stream to internal data but nothing more
	*/
	function push( $sStream )
	{
		$this -> Data .= $sStream ;
	}

	/**
		calls $this->convert('',$bStreamEnd)
		converts internal data from input to output encoding
		bStreamEnd indicates that no data should remain after conversion
		errors are tracked internally

		back compat
	*/
	function pull( $bStreamEnd = false )
	{
		return $this -> convert( '', $bStreamEnd ) ;
	}

	// IMPLEMENTATION ************************************************************

	/**
		switches the internal error tracking on or off

		enabling raises error_reporting to E_ALL and installs _error() as the
		error handler; disabling restores the previous level and handler.
		Previous levels are kept on a stack so that calls can be nested.
	*/
	function _eh( $enable = true )
	{
		static $previousErrorReportings = array() ;

		if( $enable )
		{
			array_push( $previousErrorReportings, error_reporting( E_ALL ) ) ;
			set_error_handler( array( $this, '_error' ) ) ;
		}
		else
		{
			error_reporting( array_pop( $previousErrorReportings ) ) ;
			restore_error_handler() ;
		}
	}

	/**
		error handler callback - appends the error to Errors and
		remembers its description in Error
	*/
	function _error( $type, $desc, $file, $line )
	{
		array_push( $this -> Errors, compact( 'type', 'desc', 'file', 'line' ) ) ;
		$this -> Error = $desc ;
	}

	/**
		converts the current Data from InputEncoding to OutputEncoding

		iconv() takes both charsets explicitly, so the iconv internal_encoding
		setting is not switched here: iconv_set_encoding() is deprecated and its
		notice would be recorded in Errors on every conversion.
	*/
	function _iconv( )
	{
		//TODO: unicode entities

		return iconv( $this -> InputEncoding, $this -> OutputEncoding, (string) $this -> Data ) ;
	}

	/**
		separates the incomplete part of a trailing multibyte character

		the separated bytes are returned, the rest consisting of complete
		characters remains in the Data property

		returns '' when nothing has to be held back and false (with an
		E_USER_WARNING) when the input charset is not handled

		TOIMP: other character sets must be implemented here.
		Implementors can modify this method or override it and call it for
		the charsets they do not handle.
	*/
	function _prepareStream( )
	{
		//TODO: unicode entities

		if( strtolower( $this -> InputEncoding ) == 'utf-8' )
		{
			return $this -> _prepareStream_UTF8() ;
		}

		// if the input character set encoding is singlebyte,
		// no preparation needs to be done
		if( preg_match( $this -> SingleBytePattern, $this -> InputEncoding ) )
		{
			return '' ;
		}

		// else there is a problem with unhandled encoding
		trigger_error( 'Unable to prepare stream, input charset is not implemented ('.$this -> InputEncoding.')', E_USER_WARNING ) ;
		return false ;
	}

	/**
		UTF-8 implementation of _prepareStream()

		lead byte patterns:
			0vvv vvvv  single-byte char
			10vv vvvv  continuation of an nB char
			110v vvvv  first of 2B char
			1110 vvvv  first of 3B char
			1111 0vvv  first of 4B char

		Only checks whether something is missing at the end of the stream
		part. Character and sequence validity is left to iconv.

		returns the bytes of the incomplete trailing character ('' if none)
		and removes them from Data
	*/
	function _prepareStream_UTF8()
	{
		$data = (string) $this -> Data ;
		$length = strlen( $data ) ;
		$cnt = 0 ;

		// a character is at most 4 bytes long, so an unfinished one
		// must start within the last 3 bytes
		$limit = min( 3, $length ) ;

		for( $back = 1 ; $back <= $limit ; $back++ )
		{
			$byte = ord( $data[ $length - $back ] ) ;

			// continuation byte - keep looking backwards for its lead byte
			if( (0xC0 & $byte) == 0x80 ) continue ;

			if(     (0xE0 & $byte) == 0xC0 ) $need = 2 ;	// 110. ....
			elseif( (0xF0 & $byte) == 0xE0 ) $need = 3 ;	// 1110 ....
			elseif( (0xF8 & $byte) == 0xF0 ) $need = 4 ;	// 1111 0...
			else                             $need = 1 ;	// single-byte or invalid - handled by iconv

			// hold the sequence back only when bytes are still missing
			if( $need > $back ) $cnt = $back ;
			break ;
		}

		$tail = '' ;

		// if an incomplete nB char is present, separate it from the stream
		if( $cnt )
		{
			$tail = substr( $data, $length - $cnt ) ;
			$data = substr( $data, 0, $length - $cnt ) ;
		}

		$this -> Data = $data ;

		return $tail ;
	}
}

?>
Return current item: convertor