<?php
/**
* Unicode manipulation class.
* It works with UTF-8, UTF-16 and UTF-32 (big-endian and little-endian)
*
* @see http://unicode.org/ Unicode Consortium
* @see http://www.rfc-editor.org/rfc/rfc3629.txt UTF-8 (RFC-3629)
* @see http://www.rfc-editor.org/rfc/rfc2781.txt UTF-16 (RFC-2781)
* @see http://en.wikipedia.org/wiki/UTF-32 UTF-32
* @author Rubens Takiguti Ribeiro
* @date 2008-07-29
* @version 1.1 2009-01-05
* @license http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3 (LICENSE.TXT)
* @copyright Copyright (C) 2008 Rubens Takiguti Ribeiro
*/
final class unicode {

    /**
     * Code point (U+FFFD REPLACEMENT CHARACTER) that the conversion methods
     * emit in place of every malformed unit found in their input.
     */
    const REPLACEMENT_CHAR = 0xFFFD;


/// # PRIVATE METHODS


    /**
     * Private constructor (use static methods only)
     *
     * @return void
     */
    private function __construct() {}


    /**
     * Tells whether a value is a Unicode scalar value, i.e. a number in
     * 0..0x10FFFF that is not a surrogate (0xD800..0xDFFF).
     *
     * @param mixed $ord Value to be checked.
     * @return bool
     */
    private static function is_scalar_value($ord) {
        return is_numeric($ord)
            && $ord >= 0
            && $ord <= 0x10FFFF
            && ($ord < 0xD800 || $ord > 0xDFFF);
    }


    /**
     * Raises the "invalid ord code" warning shared by the chr_* methods.
     *
     * @param mixed $ord Rejected value.
     * @param string $encoding Encoding name used in the message (e.g. "UTF-8").
     * @return bool Always false, so callers can "return self::reject_ord(...)".
     */
    private static function reject_ord($ord, $encoding) {
        // Non-scalars (arrays, objects) cannot be concatenated safely
        $shown = is_scalar($ord) ? (string)$ord : gettype($ord);
        trigger_error('Ord code "'.$shown.'" is not a valid '.$encoding.' character', E_USER_WARNING);
        return false;
    }


    /**
     * Advances a byte offset over a number of UTF-8 characters.
     * A character is one byte plus every continuation byte (10xxxxxx)
     * that immediately follows it.
     *
     * @param string $str Text in UTF-8 encoding.
     * @param int $byte_pos Byte offset where the walk starts.
     * @param int $count Number of characters to skip.
     * @return int Byte offset after the skipped characters (at most strlen($str)).
     */
    private static function skip_utf8($str, $byte_pos, $count) {
        $len = strlen($str);
        while ($count > 0 && $byte_pos < $len) {
            $byte_pos++;
            while ($byte_pos < $len && (ord($str[$byte_pos]) & 0xC0) == 0x80) {
                $byte_pos++;
            }
            $count--;
        }
        return $byte_pos;
    }


    /**
     * Decodes the UTF-8 character that starts at a byte offset.
     * Malformed input yields REPLACEMENT_CHAR and consumes a single byte
     * (the warning is raised by utf8_size/ord_utf8), so callers always advance.
     *
     * @param string $str Text in UTF-8 encoding.
     * @param int $pos Byte offset (must be lower than strlen($str)).
     * @return array Pair array(code point, number of bytes consumed).
     */
    private static function read_utf8($str, $pos) {
        $size = self::utf8_size($str[$pos]);
        if ($size !== false) {
            $ord = self::ord_utf8(substr($str, $pos, $size));
            if ($ord !== false) {
                return array($ord, $size);
            }
        }
        return array(self::REPLACEMENT_CHAR, 1);
    }


    /**
     * Decodes the UTF-16 character that starts at a byte offset.
     * Malformed input yields REPLACEMENT_CHAR and consumes one 16 bit unit
     * (the warning is raised by utf16_size/ord_utf16), so callers always advance.
     *
     * @param string $str Text in UTF-16 encoding.
     * @param int $pos Byte offset (must be lower than strlen($str)).
     * @param bool $big_endian Whether the input is in big or little endian.
     * @return array Pair array(code point, number of bytes consumed).
     */
    private static function read_utf16($str, $pos, $big_endian) {
        $size = self::utf16_size(substr($str, $pos, 2), $big_endian);
        if ($size !== false) {
            $ord = self::ord_utf16(substr($str, $pos, $size), $big_endian);
            if ($ord !== false) {
                return array($ord, $size);
            }
        }
        return array(self::REPLACEMENT_CHAR, 2);
    }


    /**
     * Decodes the UTF-32 character that starts at a byte offset.
     * Malformed input yields REPLACEMENT_CHAR (the warning is raised by ord_utf32).
     *
     * @param string $str Text in UTF-32 encoding.
     * @param int $pos Byte offset (must be lower than strlen($str)).
     * @param bool $big_endian Whether the input is in big or little endian.
     * @return array Pair array(code point, number of bytes consumed).
     */
    private static function read_utf32($str, $pos, $big_endian) {
        $size = self::utf32_size();
        $ord = self::ord_utf32(substr($str, $pos, $size), $big_endian);
        if ($ord === false) {
            $ord = self::REPLACEMENT_CHAR;
        }
        return array($ord, $size);
    }


/// # GENERAL METHODS


    /**
     * Returns the two byte BOM (Byte Order Mark) used by UTF-16.
     * It defines if a document is encoded with big or little endian,
     * and should be in beginning of document.
     *
     * @param bool $big_endian Whether the result is in big or little endian
     * @return string
     */
    static public function get_bom($big_endian = true) {
        return $big_endian ? chr(0xFE).chr(0xFF)
                           : chr(0xFF).chr(0xFE);
    }


/// # UTF-8 METHODS


    /**
     * Return ord value of an UTF-8 character.
     *
     * Multi-byte characters must have exactly the length announced by their
     * first byte. Overlong forms, surrogates (U+D800..U+DFFF) and values above
     * U+10FFFF are rejected. For an ASCII first byte the remaining bytes of
     * $c (if any) are ignored.
     *
     * @param string $c Unicode character.
     * @return int|bool Code point, or false (with an E_USER_WARNING) if $c is invalid.
     */
    static public function ord_utf8($c) {

        /*
         * UTF-8 characters have 8 to 32 bits (see table), where 7 to 21 are used
         *
         * Char. number range  |        UTF-8 octet sequence
         *    (hexadecimal)    |              (binary)
         * --------------------+---------------------------------------------
         * 0000 0000-0000 007F | 0xxxxxxx
         * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
         * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
         * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         */

        $real_len = strlen($c);

        // 1 - Get decimal value of each byte of character
        $vt_ord = array();
        for ($i = 0; $i < $real_len; $i++) {
            $vt_ord[$i] = ord($c[$i]);
        }

        if ($real_len > 0) {
            $lead = $vt_ord[0];

            // 2 - Check if the character is an ASCII (1 byte): has 7 used bits
            if ($lead <= 0x7F) {                        // byte1 <= 01111111
                return $lead;
            }

            // 3 - Get expected length, payload bits of byte1 and the lowest
            //     code point that legitimately needs that length
            if ($lead >= 0xC2 && $lead <= 0xDF) {       // 110xxxxx (C0/C1 are always overlong)
                $expected = 2;
                $ord = $lead & 0x1F;                    // byte1 & 00011111 (5 bits)
                $min = 0x80;
            } elseif ($lead >= 0xE0 && $lead <= 0xEF) { // 1110xxxx
                $expected = 3;
                $ord = $lead & 0x0F;                    // byte1 & 00001111 (4 bits)
                $min = 0x800;
            } elseif ($lead >= 0xF0 && $lead <= 0xF4) { // 11110xxx (F5..F7 exceed U+10FFFF)
                $expected = 4;
                $ord = $lead & 0x07;                    // byte1 & 00000111 (3 bits)
                $min = 0x10000;
            } else {                                    // continuation byte or invalid lead
                $expected = 0;
                $ord = 0;
                $min = 0;
            }

            // 4 - Append 6 bits of every continuation byte (10xxxxxx)
            $valid = ($expected > 0 && $real_len == $expected);
            for ($i = 1; $valid && $i < $expected; $i++) {
                if (($vt_ord[$i] & 0xC0) != 0x80) {
                    $valid = false;
                } else {
                    $ord = ($ord << 6) | ($vt_ord[$i] & 0x3F);
                }
            }

            // 5 - Reject overlong forms, surrogates and values above U+10FFFF
            if ($valid && $ord >= $min && self::is_scalar_value($ord)) {
                return $ord;
            }
        }

        // If it is an invalid UTF-8 character
        $vt_binary = array();
        foreach ($vt_ord as $byte) {
            $vt_binary[] = sprintf('%08b', $byte);
        }
        $binary = implode(' ', $vt_binary);
        trigger_error('Invalid UTF-8 character: '.$binary, E_USER_WARNING);
        return false;
    }


    /**
     * Returns an UTF-8 character from ord code.
     *
     * @param int $ord Ord code (Unicode scalar value).
     * @return string|bool Encoded character, or false (with an E_USER_WARNING)
     *                     if $ord is negative, a surrogate, above 0x10FFFF or not numeric.
     */
    static public function chr_utf8($ord) {
        if (!self::is_scalar_value($ord)) {
            return self::reject_ord($ord, 'UTF-8');
        }
        $ord = (int)$ord;

        // Have 1 byte (7 used bits)
        if ($ord <= 0x7F) {
            return chr($ord);
        }

        // Have 2 bytes (11 used bits = 5 + 6)
        if ($ord <= 0x7FF) {
            return chr((($ord >> 6) & 0x1F) | 0xC0).  // ((ord >> 6) & 00011111) | 11000000
                   chr(( $ord       & 0x3F) | 0x80);  // ( ord       & 00111111) | 10000000
        }

        // Have 3 bytes (16 used bits = 4 + 6 + 6)
        if ($ord <= 0xFFFF) {
            return chr((($ord >> 12) & 0x0F) | 0xE0). // ((ord >> 12) & 00001111) | 11100000
                   chr((($ord >> 6)  & 0x3F) | 0x80). // ((ord >> 6)  & 00111111) | 10000000
                   chr(( $ord        & 0x3F) | 0x80); // ( ord        & 00111111) | 10000000
        }

        // Have 4 bytes (21 used bits = 3 + 6 + 6 + 6)
        return chr((($ord >> 18) & 0x07) | 0xF0).     // ((ord >> 18) & 00000111) | 11110000
               chr((($ord >> 12) & 0x3F) | 0x80).     // ((ord >> 12) & 00111111) | 10000000
               chr((($ord >> 6)  & 0x3F) | 0x80).     // ((ord >> 6)  & 00111111) | 10000000
               chr(( $ord        & 0x3F) | 0x80);     // ( ord        & 00111111) | 10000000
    }


    /**
     * Returns the expected number of bytes that an UTF-8 character uses.
     * Only the first byte of $c is inspected.
     *
     * @param string $c UTF-8 character.
     * @return int|bool 1 to 4, or false (with an E_USER_WARNING) if $c is empty
     *                  or its first byte cannot start a character.
     */
    static public function utf8_size($c) {
        $c = (string)$c;
        if ($c !== '') {
            $ord = ord($c[0]);
            if ($ord <= 0x7F) {                        // 0xxxxxxx
                return 1;
            } elseif ($ord >= 0xC2 && $ord <= 0xDF) {  // 110xxxxx (80..BF are continuation bytes, C0/C1 overlong)
                return 2;
            } elseif ($ord >= 0xE0 && $ord <= 0xEF) {  // 1110xxxx
                return 3;
            } elseif ($ord >= 0xF0 && $ord <= 0xF4) {  // 11110xxx, limited to U+10FFFF
                return 4;
            }
        }
        trigger_error('Invalid UTF-8 character "'.$c.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Converts an UTF-8 string to an UTF-16 string.
     * Every malformed byte raises an E_USER_WARNING and is converted
     * to U+FFFD (REPLACEMENT_CHAR).
     *
     * @param string $str_utf8 String in UTF-8 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @return string String in UTF-16 encoding
     */
    static public function utf8_to_utf16($str_utf8, $big_endian_out = true) {
        $len = strlen($str_utf8);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf8($str_utf8, $i);
            $return .= self::chr_utf16($ord, $big_endian_out);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Converts an UTF-8 string to an UTF-32 string.
     * Every malformed byte raises an E_USER_WARNING and is converted
     * to U+FFFD (REPLACEMENT_CHAR).
     *
     * @param string $str_utf8 String in UTF-8 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @return string String in UTF-32 encoding
     */
    static public function utf8_to_utf32($str_utf8, $big_endian_out = true) {
        $len = strlen($str_utf8);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf8($str_utf8, $i);
            $return .= self::chr_utf32($ord, $big_endian_out);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Returns a substring of an UTF-8 string.
     * Positions and lengths are counted in characters, not bytes.
     *
     * @param string $str Original value.
     * @param int $from Beginning of substring (0 based, must not be negative).
     * @param int|bool $length Length of substring (must not be negative);
     *                         false or null means "until the end".
     * @return string|bool Substring, or false (with an E_USER_WARNING)
     *                     if $from or $length is negative.
     */
    static public function substr_utf8($str, $from, $length = false) {
        $str = (string)$str;
        $from = (int)$from;
        $to_end = ($length === false || $length === null);
        $length = $to_end ? 0 : (int)$length;

        if ($from < 0 || $length < 0) {
            trigger_error('Invalid range (from "'.$from.'", length "'.$length.'") of string "'.$str.'"', E_USER_WARNING);
            return false;
        }

        $start = self::skip_utf8($str, 0, $from);
        if ($to_end) {
            // Cast: substr() returns false instead of '' on old PHP versions
            return (string)substr($str, $start);
        }
        $end = self::skip_utf8($str, $start, $length);
        return (string)substr($str, $start, $end - $start);
    }


    /**
     * Returns the length (in characters) of an UTF-8 string.
     * Every byte that is not a continuation byte (10xxxxxx) starts one character.
     *
     * @param string $str String to be checked.
     * @return int
     */
    static public function strlen_utf8($str) {
        // Byte-wise pattern (no "u" modifier): drop continuation bytes, count the rest
        return strlen(preg_replace('/[\x80-\xBF]+/', '', (string)$str));
    }


    /**
     * Returns an UTF-8 character from a position of an UTF-8 string.
     *
     * @param string $str Text in UTF-8 encoding.
     * @param int $pos Position to be get (0 based, in characters).
     * @return string|bool Character, or false (with an E_USER_WARNING) if the
     *                     position does not exist or an invalid byte is found before it.
     */
    static public function get_char_utf8($str, $pos) {
        $len = strlen($str);
        $char_len = 0;
        for ($i = 0, $current = 0; $i < $len; $i += $char_len, $current++) {

            // Get UTF-8 character length
            $posible_char = substr($str, $i, 4);
            $char_len = self::utf8_size($posible_char);
            if ($char_len === false) {
                // Warning already raised by utf8_size; going on would never advance
                return false;
            }
            if ($current == $pos) {
                return substr($str, $i, $char_len);
            }
        }
        trigger_error('Invalid position "'.$pos.'" of string "'.$str.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Returns if a string is a valid UTF-8.
     * The empty string is considered valid.
     *
     * @param string $str String to be checked.
     * @return bool
     */
    static public function is_utf8($str) {
        // With the "u" modifier preg_match() fails on a malformed subject,
        // while the empty pattern matches any well-formed one
        return preg_match('//u', (string)$str) === 1;
    }


/// # UTF-16 METHODS


    /**
     * Return ord value of an UTF-16 character.
     * Bytes of $c after the decoded character are ignored.
     *
     * @param string $c Unicode character.
     * @param bool $big_endian Whether the input char is in big or little endian.
     * @return int|bool Code point, or false (with an E_USER_WARNING) if $c is invalid.
     */
    static public function ord_utf16($c, $big_endian = true) {
        $bits16 = $big_endian ? 'n' : 'v';

        if (strlen($c) < 2) {
            trigger_error('No valid character can be obtained using W1 in UTF-16 string', E_USER_WARNING);
            return false;
        }
        $unit = unpack($bits16, $c);
        $w1 = $unit[1];

        // The character value U is the value of W1
        if ($w1 < 0xD800 || $w1 > 0xDFFF) {
            return $w1;
        }

        // No valid character can be obtained using W1 (it is a low surrogate)
        if ($w1 > 0xDBFF) {
            trigger_error('No valid character can be obtained using W1 in UTF-16 string', E_USER_WARNING);
            return false;
        }

        // There is no W2
        if (strlen($c) < 4) {
            trigger_error('There is no W2 in UTF-16 string', E_USER_WARNING);
            return false;
        }

        $unit = unpack($bits16, substr($c, 2, 2));
        $w2 = $unit[1];

        if ($w2 < 0xDC00 || $w2 > 0xDFFF) {
            trigger_error('The sequence is an error', E_USER_WARNING);
            return false;
        }

        // Get 10 low-order bits of W1 and 10 low-order bits of W2
        $y = $w1 & 0x3FF;
        $x = $w2 & 0x3FF;

        // Create a 20bit unsigned integer U'
        $u_line = ($y << 10) | $x;

        // Add 0x10000 to U' and get U
        return $u_line + 0x10000;
    }


    /**
     * Returns an UTF-16 character from ord code.
     *
     * @param int $ord Ord code (Unicode scalar value)
     * @param bool $big_endian Whether the result is in big or little endian
     * @return string|bool Encoded character, or false (with an E_USER_WARNING)
     *                     if $ord is negative, a surrogate, above 0x10FFFF or not numeric.
     */
    static public function chr_utf16($ord, $big_endian = true) {
        if (!self::is_scalar_value($ord)) {
            return self::reject_ord($ord, 'UTF-16');
        }
        $ord = (int)$ord;
        $bits16 = $big_endian ? 'n' : 'v';

        if ($ord < 0x10000) {
            // Return 16bit unsigned integer
            return pack($bits16, $ord);
        }

        $u_line = $ord - 0x10000; // U' <= 0xFFFFF (20 bits)

        // U' = yyyyyyyyyyxxxxxxxxxx
        $y = $u_line >> 10;       // 10 high-order bits of U'
        $x = $u_line & 0x3FF;     // 10 low-order bits of U'

        // Return 16 bits of (W1 | Y) and 16 bits of (W2 | X)
        return pack($bits16.$bits16, 0xD800 | $y, 0xDC00 | $x);
    }


    /**
     * Converts an UTF-16 string to an UTF-8 string.
     * Every malformed 16 bit unit raises an E_USER_WARNING and is converted
     * to U+FFFD (REPLACEMENT_CHAR).
     *
     * @param string $str_utf16 String in UTF-16 encoding
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-8 encoding
     */
    static public function utf16_to_utf8($str_utf16, $big_endian_in = true) {
        $len = strlen($str_utf16);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf16($str_utf16, $i, $big_endian_in);
            $return .= self::chr_utf8($ord);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Converts an UTF-16 string to an UTF-32 string.
     * Every malformed 16 bit unit raises an E_USER_WARNING and is converted
     * to U+FFFD (REPLACEMENT_CHAR).
     *
     * @param string $str_utf16 String in UTF-16 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-32 encoding
     */
    static public function utf16_to_utf32($str_utf16, $big_endian_out = true, $big_endian_in = true) {
        $len = strlen($str_utf16);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf16($str_utf16, $i, $big_endian_in);
            $return .= self::chr_utf32($ord, $big_endian_out);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Returns the expected number of bytes that an UTF-16 character uses.
     * Only the first 16 bit unit of $c is inspected.
     *
     * @param string $c UTF-16 character.
     * @param bool $big_endian Whether the input char is in big or little endian.
     * @return int|bool 2 or 4, or false (with an E_USER_WARNING) if $c has less
     *                  than 2 bytes or starts with a low surrogate.
     */
    static public function utf16_size($c, $big_endian = true) {
        if (strlen($c) >= 2) {
            $unit = unpack($big_endian ? 'n' : 'v', $c);
            $w1 = $unit[1];
            if ($w1 < 0xD800 || $w1 > 0xDFFF) {
                return 2;   // W1 is the character itself
            } elseif ($w1 <= 0xDBFF) {
                return 4;   // High surrogate: a low surrogate (W2) must follow
            }
        }
        trigger_error('Invalid UTF-16 character "'.$c.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Returns an UTF-16 character from a position of an UTF-16 string.
     *
     * @param string $str Original string.
     * @param int $pos Position to be get (0 based, in characters).
     * @param bool $big_endian Whether the input is in big or little endian.
     * @return string|bool Character, or false (with an E_USER_WARNING) if the
     *                     position does not exist or an invalid unit is found before it.
     */
    static public function get_char_utf16($str, $pos, $big_endian = true) {
        $len = strlen($str);
        $char_len = 0;
        for ($i = 0, $current = 0; $i < $len; $i += $char_len, $current++) {

            // Get UTF-16 character length
            $posible_char = substr($str, $i, 4);
            $char_len = self::utf16_size($posible_char, $big_endian);
            if ($char_len === false) {
                // Warning already raised by utf16_size; going on would never advance
                return false;
            }
            if ($current == $pos) {
                return substr($str, $i, $char_len);
            }
        }
        trigger_error('Invalid position "'.$pos.'" of string "'.$str.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Returns the length (in characters) of an UTF-16 string.
     * A unit rejected by utf16_size (E_USER_WARNING) counts as one character.
     *
     * @param string $str String to be checked.
     * @param bool $big_endian Whether the input is in big or little endian.
     * @return int
     */
    static public function strlen_utf16($str, $big_endian = true) {
        $len = strlen($str);
        $size = 0;
        $i = 0;
        while ($i < $len) {
            $posible_char = substr($str, $i, 4);
            $char_len = self::utf16_size($posible_char, $big_endian);
            // Step over one 16 bit unit when the size is unknown
            $i += ($char_len === false) ? 2 : $char_len;
            $size++;
        }
        return $size;
    }


/// # UTF-32 METHODS


    /**
     * Returns an UTF-32 character from ord code.
     *
     * @param int $ord Ord code (Unicode scalar value)
     * @param bool $big_endian Whether the result is in big or little endian
     * @return string|bool Encoded character, or false (with an E_USER_WARNING)
     *                     if $ord is negative, a surrogate, above 0x10FFFF or not numeric.
     */
    static public function chr_utf32($ord, $big_endian = true) {
        if (!self::is_scalar_value($ord)) {
            return self::reject_ord($ord, 'UTF-32');
        }
        $bits32 = $big_endian ? 'N' : 'V';
        return pack($bits32, (int)$ord);
    }


    /**
     * Return ord value of an UTF-32 character.
     *
     * @param string $c Unicode character (exactly 4 bytes).
     * @param bool $big_endian Whether the input is in big or little endian
     * @return int|bool Code point, or false (with an E_USER_WARNING) if $c does not
     *                  have 4 bytes or does not hold a Unicode scalar value.
     */
    static public function ord_utf32($c, $big_endian = true) {
        $bits32 = $big_endian ? 'N' : 'V';
        if (strlen($c) == 4) {
            $unit = unpack($bits32, $c);
            if (self::is_scalar_value($unit[1])) {
                return $unit[1];
            }
        }
        trigger_error('Invalid UTF-32 character', E_USER_WARNING);
        return false;
    }


    /**
     * Converts an UTF-32 string to an UTF-8 string.
     * Every malformed 32 bit unit raises an E_USER_WARNING and is converted
     * to U+FFFD (REPLACEMENT_CHAR).
     *
     * @param string $str_utf32 String in UTF-32 encoding
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-8 encoding
     */
    static public function utf32_to_utf8($str_utf32, $big_endian_in = true) {
        $len = strlen($str_utf32);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf32($str_utf32, $i, $big_endian_in);
            $return .= self::chr_utf8($ord);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Converts an UTF-32 string to an UTF-16 string.
     * Every malformed 32 bit unit raises an E_USER_WARNING and is converted
     * to U+FFFD (REPLACEMENT_CHAR).
     *
     * @param string $str_utf32 String in UTF-32 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-16 encoding
     */
    static public function utf32_to_utf16($str_utf32, $big_endian_out = true, $big_endian_in = true) {
        $len = strlen($str_utf32);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf32($str_utf32, $i, $big_endian_in);
            $return .= self::chr_utf16($ord, $big_endian_out);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Returns the number of bytes that an UTF-32 character uses.
     *
     * @return int
     */
    static public function utf32_size() {
        return 4; // Fix length
    }


    /**
     * Returns an UTF-32 character from a position of an UTF-32 string.
     *
     * @param string $str Original string.
     * @param int $pos Position to be get (0 based, in characters).
     * @return string|bool Character, or false (with an E_USER_WARNING) if there
     *                     is no complete character at that position.
     */
    static public function get_char_utf32($str, $pos) {
        $size = self::utf32_size();
        $offset = (int)$pos * $size;
        if ($offset < 0 || $offset + $size > strlen($str)) {
            trigger_error('Invalid position "'.$pos.'" of string "'.$str.'"', E_USER_WARNING);
            return false;
        }
        return substr($str, $offset, $size);
    }


    /**
     * Returns the length (in characters) of an UTF-32 string.
     * A trailing group with less than 4 bytes counts as one character.
     *
     * @param string $str String to be checked.
     * @return int
     */
    static public function strlen_utf32($str) {
        return (int)ceil(strlen($str) / self::utf32_size());
    }
}