Location: PHPKode > scripts > Chinese Party > chinese-party/chinese_party.class.php
<?php
/**
 * chinese_party.class.php
 * Convert between T-Chinese and S-Chinese while handling
 * Cantonese characters correctly
 *
 * @author Mickey9801 <hide@address.com>
 * @copyright copyright ComicParty.com 2006
 * @version 1.0.0
 * @package chinese_party
 * 
 * DESCRIPTION :
 * Traditional Chinese <-> Simplified Chinese
 * Big5 <-> UTF-8 <-> GB2312
 *
 * Most classes, and even iconv, do not handle Traditional Chinese
 * and Simplified Chinese conversion correctly when they meet
 * specific Chinese characters, such as the Cantonese characters
 * that are commonly used in Hong Kong, even when the character is
 * included in the Big5 character set.
 *
 * This class is used to convert Traditional Chinese and Simplified
 * Chinese correctly. Original idea from MindForward
 * (http://www.mindforward.com/blog/?p=66)
 *
 * This class is also useful for converting UTF-8 Chinese mail content
 * to Big5 Chinese when sending email. Many webmail services nowadays
 * still cannot display UTF-8 Chinese correctly, such as Hotmail...
 *
 * Hope that China will be as one and we don't need to convert any
 * more...
 *
 * USAGE :
 * To convert UTF-8 Traditional Chinese to Simplified Chinese :
 * $cconv = new chinese_party();
 * $strResult = $cconv->utf8_tchi2schi($strInput);
 *
 * METHODS :
 * utf8_tchi2schi() - Convert UTF-8 Traditional Chinese character to
 *                    UTF-8 Simplified Chinese character
 * utf8_schi2tchi() - Convert UTF-8 Simplified Chinese character to
 *                    UTF-8 Traditional Chinese character
 * u82tchi()        - Convert UTF-8 Traditional Chinese to Big5 or
 *                    Big5-HKSCS Traditional Chinese
 * u82gb()          - Convert UTF-8 Traditional Chinese to GB2312
 *                    Simplified Chinese
 * b52gb()          - Convert Big5 Chinese to GB2312 Chinese
 * gb2b5()          - Convert GB2312 Chinese to Big5 Chinese
 * ascii_encode()   - Convert UTF-8 character to ASCII unicode string
 * ascii_decode()   - Convert ASCII unicode string to UTF-8 character
 * uniord()         - Convert UTF-8 character to unicode
 * unichr()         - Convert unicode to UTF-8 character
 *
 * DEPENDENCY :
 * mbstring and iconv are required
 * 
 * HISTORY :
 * Mickey Chan 2006-05-23 04:15
 *             First Release
 */

/**
 * Character-by-character converter between UTF-8, Big5 and GB2312 text.
 *
 * Every conversion is delegated to iconv one character at a time. When
 * iconv cannot produce the character in the target charset, the character
 * is kept as a decimal numeric entity ('&#nnnnn;') so that it can be
 * restored by ascii_decode() once the text is back in UTF-8.
 *
 * Requires the iconv and mbstring extensions.
 */
class chinese_party {

	/**
	 * Convert UTF-8 Traditional Chinese character to UTF-8 Simplified
	 * Chinese character
	 * Original idea from MindForward (http://www.mindforward.com/blog/?p=66)
	 *
	 * Pipeline: UTF-8 -> Big5 -> GB2312 -> UTF-8, then numeric entities
	 * left behind by the intermediate steps are decoded again.
	 *
	 * @param string $strUTF8TCInput original UTF-8 encoded Traditional Chinese string
	 * @return string UTF-8 encoded result; '' when the final iconv call fails
	 * @access public
	 */
	public function utf8_tchi2schi ($strUTF8TCInput) {
		$strBig5 = $this->u82tchi($strUTF8TCInput);
		$strGB = $this->b52gb($strBig5);
		// iconv reports failure as FALSE; ascii_decode() maps that to ''.
		$strResult = @iconv('gb2312','UTF-8',$strGB);
		// convert unicode number back to utf-8 character
		return $this->ascii_decode($strResult);
	}

	/**
	 * Convert UTF-8 Simplified Chinese character to UTF-8 Traditional
	 * Chinese character
	 * Original idea from MindForward (http://www.mindforward.com/blog/?p=66)
	 *
	 * Pipeline: UTF-8 -> GB2312 -> Big5 -> UTF-8, then numeric entities
	 * left behind by the intermediate steps are decoded again.
	 *
	 * @param string $strUTF8SCInput original UTF-8 encoded Simplified Chinese string
	 * @return string UTF-8 encoded result; '' when the final iconv call fails
	 * @access public
	 */
	public function utf8_schi2tchi ($strUTF8SCInput) {
		$strGB = $this->u82gb($strUTF8SCInput);
		$strBig5 = $this->gb2b5($strGB);
		// iconv reports failure as FALSE; ascii_decode() maps that to ''.
		$strResult = @iconv('big5', 'UTF-8', $strBig5);
		// convert unicode number back to utf-8 character
		return $this->ascii_decode($strResult);
	}

	/**
	 * Convert UTF-8 Traditional Chinese to Big5 or Big5-HKSCS T-Chinese
	 * Original idea from JRH
	 *
	 * @param string $strInput original UTF-8 encoded chinese string
	 * @param boolean $boolHK set to true for convert to Big5-HKSCS encode
	 * @param boolean $boolUnicode if true, any incompatible character will convert to unicode in '&#nnnnn;' format, otherwise will convert to '*'.
	 * @return string Big5 or Big5-HKSCS encoded string
	 * @access public
	 */
	public function u82tchi ($strInput, $boolHK=FALSE, $boolUnicode=TRUE) {
		$strTargetEncode = $boolHK ? 'big5-HKSCS' : 'big5';
		return $this->convert_utf8_chars($strInput, $strTargetEncode, $boolUnicode);
	}

	/**
	 * Convert UTF-8 Traditional Chinese to GB2312 Simplified Chinese
	 * Original idea from JRH
	 *
	 * @param string $strInput original UTF-8 encoded chinese string
	 * @param boolean $boolUnicode if true, any incompatible character will convert to unicode in '&#nnnnn;' format, otherwise will convert to '*'.
	 * @return string GB2312 encoded string
	 * @access public
	 */
	public function u82gb ($strInput, $boolUnicode=TRUE) {
		return $this->convert_utf8_chars($strInput, 'gb2312', $boolUnicode);
	}

	/**
	 * Convert Big5 Chinese to GB2312 Chinese
	 *
	 * Characters iconv cannot express in GB2312 are emitted as
	 * '&#nnnnn;' numeric entities.
	 *
	 * @param string $strB5Input original Big5 encoded string
	 * @return string GB2312 encoded string
	 * @access public
	 */
	public function b52gb ($strB5Input) {
		return $this->convert_charset_chars($strB5Input, 'big5', 'gb2312');
	}

	/**
	 * Convert GB2312 Chinese to Big5 Chinese
	 *
	 * Characters iconv cannot express in Big5 are emitted as
	 * '&#nnnnn;' numeric entities.
	 *
	 * @param string $strGBInput original GB2312 encoded string
	 * @return string Big5 encoded string
	 * @access public
	 */
	public function gb2b5 ($strGBInput) {
		return $this->convert_charset_chars($strGBInput, 'gb2312', 'big5');
	}

	/**
	 * Convert UTF-8 character to unicode string
	 *
	 * Each character becomes '&#<decimal code point>;'. Bytes that do not
	 * start a decodable sequence, and characters whose code point is 0,
	 * produce no output.
	 *
	 * @param string $string UTF-8 string
	 * @return string Unicode string
	 * @access public
	 */
	public function ascii_encode ($string)  {
		$string = (string) $string;
		$len = strlen($string);
		$encoded = "";
		for ($i = 0; $i < $len; $i++) {
			// uniord() never looks beyond 6 bytes, so a bounded slice is
			// enough and avoids copying the whole tail for every byte.
			$intUniCode = $this->uniord(substr($string, $i, 6));
			if ($intUniCode !== FALSE && $intUniCode !== 0)
				$encoded .= '&#'.$intUniCode.';';
		}
		return $encoded;
	}

	/**
	 * Convert UTF-8 character to unicode
	 * Original from php.net
	 *
	 * Only the first character of $c is examined. Lead bytes for 1 to 6
	 * byte sequences are accepted, as in the original implementation.
	 *
	 * @param string $c UTF-8 character
	 * @return int|false code point; 0 for an empty string or a stray
	 *                   continuation byte; FALSE for the invalid lead
	 *                   bytes 0xFE/0xFF and for truncated or malformed
	 *                   sequences
	 * @access public
	 */
	public function uniord ($c) {
		$c = (string) $c;
		if ($c === '') {
			return 0;
		}
		$lead = ord($c[0]);
		if ($lead <= 127) {
			return $lead;
		}
		if ($lead <= 191) {
			// continuation byte on its own: not the start of a character
			return 0;
		}
		if ($lead >= 254) { //error
			return FALSE;
		}
		if ($lead <= 223) {
			$extra = 1;
			$ud = $lead - 192;
		} elseif ($lead <= 239) {
			$extra = 2;
			$ud = $lead - 224;
		} elseif ($lead <= 247) {
			$extra = 3;
			$ud = $lead - 240;
		} elseif ($lead <= 251) {
			$extra = 4;
			$ud = $lead - 248;
		} else {
			$extra = 5;
			$ud = $lead - 252;
		}
		if (strlen($c) <= $extra) {
			// sequence is cut off before its last continuation byte
			return FALSE;
		}
		for ($k = 1; $k <= $extra; $k++) {
			$byte = ord($c[$k]);
			if (($byte & 0xC0) !== 0x80) {
				return FALSE;
			}
			// each continuation byte contributes its low 6 bits
			$ud = ($ud << 6) | ($byte & 0x3F);
		}
		return $ud;
	}

	/**
	 * Convert ASCII unicode string to UTF-8 character
	 * Original from php.net (grey - greywyvern - com)
	 *
	 * Replaces every '&#n;' whose number has 2 to 7 decimal digits with
	 * the UTF-8 encoding of that code point. Seven digits are needed to
	 * cover the entities ascii_encode() emits for 4-byte characters.
	 *
	 * @param string $string Unicode string
	 * @return string UTF-8 string
	 * @access public
	 */
	public function ascii_decode ($string) {
		// preg_replace_callback replaces the "/e" modifier, which no longer
		// exists in PHP 7+.
		return preg_replace_callback(
			"/&#(\d{2,7});/",
			array($this, 'decode_entity_match'),
			(string) $string
		);
	}

	/**
	 * Convert unicode to UTF-8 character
	 * Original from php.net
	 *
	 * @param string $dec Unicode string
	 * @return string UTF-8 character (1 to 4 bytes); '' when $dec is
	 *                negative or above U+10FFFF
	 * @access public
	 */
	public function unichr ($dec) {
		$dec = (int) $dec;
		if ($dec < 0 || $dec > 0x10FFFF) {
			return '';
		}
		if ($dec < 0x80) {
			return chr($dec);
		}
		if ($dec < 0x800) {
			return chr(0xC0 | ($dec >> 6))
				. chr(0x80 | ($dec & 0x3F));
		}
		if ($dec < 0x10000) {
			return chr(0xE0 | ($dec >> 12))
				. chr(0x80 | (($dec >> 6) & 0x3F))
				. chr(0x80 | ($dec & 0x3F));
		}
		return chr(0xF0 | ($dec >> 18))
			. chr(0x80 | (($dec >> 12) & 0x3F))
			. chr(0x80 | (($dec >> 6) & 0x3F))
			. chr(0x80 | ($dec & 0x3F));
	}

	/**
	 * Walk a UTF-8 string and convert it to $strTargetEncode one
	 * character at a time.
	 *
	 * ASCII bytes are copied unchanged. Bytes that cannot start a 2 to 4
	 * byte sequence (stray continuation bytes, 0xF8 and above) are dropped.
	 *
	 * @param string $strInput UTF-8 encoded string
	 * @param string $strTargetEncode iconv name of the target charset
	 * @param boolean $boolUnicode TRUE to keep unconvertible characters as '&#nnnnn;', FALSE to replace them with '*'
	 * @return string string encoded in $strTargetEncode
	 * @access private
	 */
	private function convert_utf8_chars ($strInput, $strTargetEncode, $boolUnicode) {
		$strInput = (string) $strInput;
		$len = strlen($strInput);
		$strOutput = "";
		$i = 0;
		while ($i < $len) {
			$lead = ord($strInput[$i]);
			if ($lead < 128) {
				$strOutput .= $strInput[$i];
				$i++;
				continue;
			}
			if ($lead > 191 && $lead < 224) {
				$width = 2;
			} elseif ($lead > 223 && $lead < 240) {
				$width = 3;
			} elseif ($lead > 239 && $lead < 248) {
				$width = 4;
			} else {
				// not a usable lead byte: skip it
				$i++;
				continue;
			}
			$chunk = substr($strInput, $i, $width);
			// A failed conversion is the expected signal here, so the
			// notice iconv raises for it is deliberately suppressed.
			$new_word = @iconv('UTF-8', $strTargetEncode, $chunk);
			if ($new_word === FALSE || $new_word === '') {
				$strOutput .= $boolUnicode ? $this->ascii_encode($chunk) : '*';
			} else {
				$strOutput .= $new_word;
			}
			$i += $width;
		}
		return $strOutput;
	}

	/**
	 * Convert a string between two multibyte charsets one character at a
	 * time, falling back to '&#nnnnn;' for characters the target charset
	 * cannot hold.
	 *
	 * @param string $strInput string encoded in $strFrom
	 * @param string $strFrom source charset name (used for both mbstring and iconv)
	 * @param string $strTo iconv name of the target charset
	 * @return string string encoded in $strTo
	 * @access private
	 */
	private function convert_charset_chars ($strInput, $strFrom, $strTo) {
		$strInput = (string) $strInput;
		$len = mb_strlen($strInput, $strFrom);
		$strOutput = "";
		for ($i = 0; $i < $len; $i++) {
			$original_word = mb_substr($strInput, $i, 1, $strFrom);
			$new_word = @iconv($strFrom, $strTo, $original_word);
			if ($new_word !== FALSE && $new_word !== '') {
				$strOutput .= $new_word;
				continue;
			}
			// Go through UTF-8 to obtain the code point for the entity.
			$utf8_word = @iconv($strFrom, 'UTF-8', $original_word);
			if ($utf8_word !== FALSE) {
				$strOutput .= $this->ascii_encode($utf8_word);
			}
		}
		return $strOutput;
	}

	/**
	 * preg_replace_callback handler for ascii_decode().
	 *
	 * @param array $matches [0] whole entity, [1] its decimal digits
	 * @return string UTF-8 character, or the entity unchanged when the
	 *                number is above U+10FFFF
	 * @access private
	 */
	private function decode_entity_match ($matches) {
		$code = (int) $matches[1];
		if ($code > 0x10FFFF) {
			return $matches[0];
		}
		return $this->unichr($code);
	}

}
?>
Return current item: Chinese Party