<?php
/**
* chinese_party.class.php
* Convert between Traditional and Simplified Chinese while handling
* Cantonese characters correctly
*
* @author Mickey9801 <hide@address.com>
* @copyright copyright ComicParty.com 2006
* @version 1.0.0
* @package chinese_party
*
* DESCRIPTION :
* Traditional Chinese <-> Simplified Chinese
* Big5 <-> UTF-8 <-> GB2312
*
* Most conversion classes, and even iconv itself, do not convert
* between Traditional Chinese and Simplified Chinese correctly when
* they meet certain characters, such as the Cantonese characters
* commonly used in Hong Kong, even when the character is included in
* the Big5 character set.
*
* This class is used to convert between Traditional Chinese and
* Simplified Chinese correctly. Original idea from MindForward
* (http://www.mindforward.com/blog/?p=66)
*
* This class is also useful for converting UTF-8 Chinese mail content
* to Big5 Chinese when sending email. Many webmail services nowadays
* still cannot display UTF-8 Chinese correctly, such as Hotmail...
*
* Hope that China will be as one and we don't need to convert any
* more...
*
* USAGE :
* To convert UTF-8 Traditional Chinese to Simplified Chinese :
* $cconv = new chinese_party();
* $strResult = $cconv->utf8_tchi2schi($strInput);
*
* METHODS :
* utf8_tchi2schi() - Convert UTF-8 Traditional Chinese character to
* UTF-8 Simplified Chinese character
* utf8_schi2tchi() - Convert UTF-8 Simplified Chinese character to
* UTF-8 Traditional Chinese character
* u82tchi() - Convert UTF-8 Traditional Chinese to Big5 or
* Big5-HKSCS Traditional Chinese
* u82gb() - Convert UTF-8 Traditional Chinese to GB2312
* Simplified Chinese
* b52gb() - Convert Big5 Chinese to GB2312 Chinese
* gb2b5() - Convert GB2312 Chinese to Big5 Chinese
* ascii_encode() - Convert UTF-8 character to ASCII unicode string
* ascii_decode() - Convert ASCII unicode string to UTF-8 character
* uniord() - Convert UTF-8 character to unicode
* unichr() - Convert unicode to UTF-8 character
*
* DEPENDENCY :
* mbstring and iconv are required
*
* HISTORY :
* Mickey Chan 2006-05-23 04:15
* First Release
*/
/**
 * Converts Chinese text between Traditional and Simplified forms and between
 * the UTF-8, Big5 / Big5-HKSCS and GB2312 encodings.
 *
 * Characters that the target encoding cannot represent are carried through
 * the conversion pipeline as decimal numeric references ('&#nnnnn;') and are
 * turned back into UTF-8 characters at the end of the utf8_* methods.
 *
 * Requires the iconv and mbstring extensions.
 */
class chinese_party
{
    /** Highest code point that unichr()/ascii_decode() will turn into UTF-8. */
    const MAX_CODE_POINT = 0x10FFFF;

    /**
     * Convert a UTF-8 Traditional Chinese string to a UTF-8 Simplified
     * Chinese string.
     * Original idea from MindForward (http://www.mindforward.com/blog/?p=66)
     *
     * Pipeline: UTF-8 -> Big5 -> GB2312 -> UTF-8, then numeric references
     * are decoded back into characters.
     *
     * NOTE: because the final step decodes every '&#nn;' reference in the
     * result, references that were already present in the input are decoded
     * as well.
     *
     * @param string $strUTF8TCInput original UTF-8 encoded Traditional Chinese string
     * @return string UTF-8 encoded Simplified Chinese string ('' if the final iconv step fails)
     * @access public
     */
    public function utf8_tchi2schi($strUTF8TCInput)
    {
        $strBig5 = $this->u82tchi($strUTF8TCInput);
        $strGb = $this->b52gb($strBig5);
        // '@': iconv raises a notice on unconvertible input; failure is handled below.
        $strUtf8 = @iconv('gb2312', 'UTF-8', $strGb);
        if ($strUtf8 === false) {
            return '';
        }
        // convert the numeric references back to UTF-8 characters
        return $this->ascii_decode($strUtf8);
    }

    /**
     * Convert a UTF-8 Simplified Chinese string to a UTF-8 Traditional
     * Chinese string.
     * Original idea from MindForward (http://www.mindforward.com/blog/?p=66)
     *
     * Pipeline: UTF-8 -> GB2312 -> Big5 -> UTF-8, then numeric references
     * are decoded back into characters.
     *
     * NOTE: because the final step decodes every '&#nn;' reference in the
     * result, references that were already present in the input are decoded
     * as well.
     *
     * @param string $strUTF8SCInput original UTF-8 encoded Simplified Chinese string
     * @return string UTF-8 encoded Traditional Chinese string ('' if the final iconv step fails)
     * @access public
     */
    public function utf8_schi2tchi($strUTF8SCInput)
    {
        $strGb = $this->u82gb($strUTF8SCInput);
        $strBig5 = $this->gb2b5($strGb);
        // '@': iconv raises a notice on unconvertible input; failure is handled below.
        $strUtf8 = @iconv('big5', 'UTF-8', $strBig5);
        if ($strUtf8 === false) {
            return '';
        }
        // convert the numeric references back to UTF-8 characters
        return $this->ascii_decode($strUtf8);
    }

    /**
     * Convert UTF-8 Traditional Chinese to Big5 or Big5-HKSCS.
     * Original idea from JRH
     *
     * @param string $strInput original UTF-8 encoded chinese string
     * @param boolean $boolHK set to true to convert to Big5-HKSCS instead of Big5
     * @param boolean $boolUnicode if true, any character the target encoding
     *        cannot represent becomes '&#nnnnn;'; otherwise it becomes '*'
     * @return string Big5 or Big5-HKSCS encoded string
     * @access public
     */
    public function u82tchi($strInput, $boolHK = FALSE, $boolUnicode = TRUE)
    {
        $strTargetEncode = $boolHK ? 'big5-HKSCS' : 'big5';

        return $this->convertFromUtf8($strInput, $strTargetEncode, $boolUnicode);
    }

    /**
     * Convert UTF-8 Chinese to GB2312.
     * Original idea from JRH
     *
     * @param string $strInput original UTF-8 encoded chinese string
     * @param boolean $boolUnicode if true, any character GB2312 cannot
     *        represent becomes '&#nnnnn;'; otherwise it becomes '*'
     * @return string GB2312 encoded string
     * @access public
     */
    public function u82gb($strInput, $boolUnicode = TRUE)
    {
        return $this->convertFromUtf8($strInput, 'gb2312', $boolUnicode);
    }

    /**
     * Convert Big5 Chinese to GB2312 Chinese.
     *
     * Characters that GB2312 cannot represent are emitted as '&#nnnnn;'.
     *
     * @param string $strB5Input Big5 encoded string
     * @return string GB2312 encoded string
     * @access public
     */
    public function b52gb($strB5Input)
    {
        return $this->convertCharByChar($strB5Input, 'big5', 'gb2312');
    }

    /**
     * Convert GB2312 Chinese to Big5 Chinese.
     *
     * Characters that Big5 cannot represent are emitted as '&#nnnnn;'.
     *
     * @param string $strGBInput GB2312 encoded string
     * @return string Big5 encoded string
     * @access public
     */
    public function gb2b5($strGBInput)
    {
        return $this->convertCharByChar($strGBInput, 'gb2312', 'big5');
    }

    /**
     * Convert a UTF-8 string to a run of decimal numeric references.
     *
     * Every byte position is probed with uniord(): lead bytes (and ASCII
     * bytes) yield '&#n;', while continuation bytes, NUL bytes and invalid
     * or truncated sequences yield nothing.
     *
     * @param string $string UTF-8 string
     * @return string concatenated '&#n;' references
     * @access public
     */
    public function ascii_encode($string)
    {
        $string = (string) $string;
        $len = strlen($string);
        $encoded = '';
        for ($i = 0; $i < $len; $i++) {
            // uniord() never looks at more than 6 bytes, so avoid copying the whole tail.
            $intUniCode = $this->uniord(substr($string, $i, 6));
            if ($intUniCode !== false && $intUniCode !== 0) {
                $encoded .= '&#' . $intUniCode . ';';
            }
        }

        return $encoded;
    }

    /**
     * Return the code point of the UTF-8 sequence at the start of a string.
     * Original from php.net
     *
     * The legacy 5- and 6-byte lead bytes (0xF8-0xFD) are still decoded, as
     * in the original implementation. Continuation bytes are not validated.
     *
     * @param string $c string starting with a UTF-8 character
     * @return int|false the code point; 0 for an empty string or a string
     *         starting with a continuation byte (0x80-0xBF); FALSE for the
     *         invalid lead bytes 0xFE/0xFF or a truncated sequence
     * @access public
     */
    public function uniord($c)
    {
        $c = (string) $c;
        if ($c === '') {
            return 0;
        }

        $lead = ord($c[0]);
        if ($lead <= 0x7F) {
            return $lead;
        }
        if ($lead <= 0xBF) {
            // continuation byte: not the start of a character
            return 0;
        }
        if ($lead >= 0xFE) {
            return FALSE;
        }

        // Number of continuation bytes and the payload bits of the lead byte.
        if ($lead <= 0xDF) {
            $extra = 1;
            $ud = $lead - 0xC0;
        } elseif ($lead <= 0xEF) {
            $extra = 2;
            $ud = $lead - 0xE0;
        } elseif ($lead <= 0xF7) {
            $extra = 3;
            $ud = $lead - 0xF0;
        } elseif ($lead <= 0xFB) {
            $extra = 4;
            $ud = $lead - 0xF8;
        } else {
            $extra = 5;
            $ud = $lead - 0xFC;
        }

        if (strlen($c) <= $extra) {
            // truncated sequence: do not read past the end of the string
            return FALSE;
        }

        for ($k = 1; $k <= $extra; $k++) {
            $ud = $ud * 64 + (ord($c[$k]) - 128);
        }

        return $ud;
    }

    /**
     * Convert decimal numeric references ('&#nnnnn;') to UTF-8 characters.
     * Original from php.net (grey - greywyvern - com)
     *
     * References of 2 to 7 digits are recognised; the digits are always read
     * as decimal. A reference above U+10FFFF is left untouched.
     *
     * @param string $string string containing numeric references
     * @return string UTF-8 string
     * @access public
     */
    public function ascii_decode($string)
    {
        return preg_replace_callback(
            '/&#(\d{2,7});/',
            function ($match) {
                $code = (int) $match[1];
                if ($code > self::MAX_CODE_POINT) {
                    return $match[0];
                }

                return $this->unichr($code);
            },
            (string) $string
        );
    }

    /**
     * Convert a code point to its UTF-8 encoding.
     * Original from php.net
     *
     * @param int|string $dec code point (decimal)
     * @return string UTF-8 character of 1 to 4 bytes, or '' when $dec is
     *         negative or above U+10FFFF
     * @access public
     */
    public function unichr($dec)
    {
        $dec = (int) $dec;
        if ($dec < 0 || $dec > self::MAX_CODE_POINT) {
            return '';
        }
        if ($dec < 0x80) {
            return chr($dec);
        }
        if ($dec < 0x800) {
            return chr(0xC0 | ($dec >> 6))
                . chr(0x80 | ($dec & 0x3F));
        }
        if ($dec < 0x10000) {
            return chr(0xE0 | ($dec >> 12))
                . chr(0x80 | (($dec >> 6) & 0x3F))
                . chr(0x80 | ($dec & 0x3F));
        }

        return chr(0xF0 | ($dec >> 18))
            . chr(0x80 | (($dec >> 12) & 0x3F))
            . chr(0x80 | (($dec >> 6) & 0x3F))
            . chr(0x80 | ($dec & 0x3F));
    }

    /**
     * Walk a UTF-8 string one character at a time and convert each character
     * to the target encoding (shared implementation of u82tchi and u82gb).
     *
     * ASCII bytes are copied as-is. Stray continuation bytes and lead bytes
     * >= 0xF8 are dropped.
     *
     * @param string $strInput UTF-8 encoded string
     * @param string $strTargetEncode iconv name of the target encoding
     * @param boolean $boolUnicode TRUE to emit '&#nnnnn;' for unconvertible
     *        characters, FALSE to emit '*'
     * @return string string in the target encoding
     * @access private
     */
    private function convertFromUtf8($strInput, $strTargetEncode, $boolUnicode)
    {
        $strInput = (string) $strInput;
        $len = strlen($strInput);
        $strOutput = '';

        for ($i = 0; $i < $len; $i++) {
            $sbit = ord($strInput[$i]);

            if ($sbit < 0x80) {
                $strOutput .= $strInput[$i];
                continue;
            }

            if ($sbit >= 0xC0 && $sbit < 0xE0) {
                $width = 2;
            } elseif ($sbit >= 0xE0 && $sbit < 0xF0) {
                $width = 3;
            } elseif ($sbit >= 0xF0 && $sbit < 0xF8) {
                $width = 4;
            } else {
                // not the start of a convertible character: skip this byte
                continue;
            }

            $strChar = substr($strInput, $i, $width);
            // '@': iconv raises a notice for characters missing from the
            // target encoding; that case is exactly what is handled below.
            $new_word = @iconv('UTF-8', $strTargetEncode, $strChar);
            if ($new_word === false || $new_word === '') {
                $strOutput .= $boolUnicode ? $this->ascii_encode($strChar) : '*';
            } else {
                $strOutput .= $new_word;
            }

            // the loop's own $i++ accounts for the first byte
            $i += $width - 1;
        }

        return $strOutput;
    }

    /**
     * Convert a string between two multibyte encodings one character at a
     * time (shared implementation of b52gb and gb2b5).
     *
     * A character the target encoding cannot represent is emitted as a
     * '&#nnnnn;' reference; a character that cannot even be converted to
     * UTF-8 is dropped.
     *
     * @param string $strInput string in the source encoding
     * @param string $strFrom source encoding name (used for mbstring and iconv)
     * @param string $strTo target encoding name (used for iconv)
     * @return string string in the target encoding
     * @access private
     */
    private function convertCharByChar($strInput, $strFrom, $strTo)
    {
        $strInput = (string) $strInput;
        $len = mb_strlen($strInput, $strFrom);
        $strOutput = '';

        for ($i = 0; $i < $len; $i++) {
            $original_word = mb_substr($strInput, $i, 1, $strFrom);
            // '@': a failed conversion is expected and handled below.
            $new_word = @iconv($strFrom, $strTo, $original_word);
            if ($new_word !== false && $new_word !== '') {
                $strOutput .= $new_word;
                continue;
            }

            $strUtf8 = @iconv($strFrom, 'UTF-8', $original_word);
            if ($strUtf8 !== false) {
                $strOutput .= $this->ascii_encode($strUtf8);
            }
        }

        return $strOutput;
    }
}
?>