Location: PHPKode > scripts > Unicode Manipulation > unicode-manipulation/unicode.class.php
<?php
/**
 * Unicode manipulation class.
 * It works with UTF-8, UTF-16 and UTF-32 (big-endian and little-endian)
 *
 * @see http://unicode.org/ Unicode Consortium
 * @see http://www.rfc-editor.org/rfc/rfc3629.txt UTF-8 (RFC-3629)
 * @see http://www.rfc-editor.org/rfc/rfc2781.txt UTF-16 (RFC-2781)
 * @see http://en.wikipedia.org/wiki/UTF-32 UTF-32
 * @author Rubens Takiguti Ribeiro
 * @date 2008-07-29
 * @version 1.1 2009-01-05
 * @license http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3 (LICENSE.TXT)
 * @copyright Copyright (C) 2008  Rubens Takiguti Ribeiro
 */
final class unicode {


/// # PRIVATE METHODS


    /**
     * Private constructor (use static methods only)
     *
     * @return void
     */
    private function __construct() {}


    /**
     * Returns the given code point, or U+FFFD (REPLACEMENT CHARACTER) when it
     * is not an encodable Unicode scalar value (false, negative, a surrogate
     * or greater than 0x10FFFF).
     *
     * @param int|bool $ord Code point (or false from a failed decode).
     * @return int
     */
    static private function valid_ord_or_replacement($ord) {
        if ($ord === false || $ord < 0 || $ord > 0x10FFFF ||
            ($ord >= 0xD800 && $ord <= 0xDFFF)) {
            return 0xFFFD;
        }
        return $ord;
    }


    /**
     * Decodes the UTF-8 character that starts at byte $i of $str.
     * An undecodable sequence yields U+FFFD and consumes a single byte, so the
     * calling loop always advances.
     *
     * @param string $str String in UTF-8 encoding.
     * @param int $i Byte offset (must be lower than strlen($str)).
     * @return array Pair: array(code point, number of bytes consumed)
     */
    static private function read_utf8($str, $i) {
        $size = self::utf8_size(substr($str, $i, 4));
        if ($size !== false) {
            $ord = self::ord_utf8(substr($str, $i, $size));
            if ($ord !== false) {
                return array($ord, $size);
            }
        }
        return array(0xFFFD, 1);
    }


    /**
     * Decodes the UTF-16 character that starts at byte $i of $str.
     * An undecodable sequence yields U+FFFD and consumes one 16-bit unit, so
     * the calling loop always advances.
     *
     * @param string $str String in UTF-16 encoding.
     * @param int $i Byte offset (must be lower than strlen($str)).
     * @param bool $big_endian Whether the input is in big or little endian
     * @return array Pair: array(code point, number of bytes consumed)
     */
    static private function read_utf16($str, $i, $big_endian) {
        $unit = substr($str, $i, 4);
        $size = self::utf16_size($unit, $big_endian);
        if ($size !== false) {
            $ord = self::ord_utf16(substr($unit, 0, $size), $big_endian);
            if ($ord !== false) {
                return array($ord, $size);
            }
        }
        return array(0xFFFD, 2);
    }


    /**
     * Advances a byte offset over up to $chars UTF-8 characters.
     * A character is one byte plus every continuation byte (10xxxxxx) that
     * immediately follows it.
     *
     * @param string $str String in UTF-8 encoding.
     * @param int $offset Starting byte offset.
     * @param int $chars Number of characters to skip.
     * @return int Resulting byte offset (at most strlen($str))
     */
    static private function utf8_byte_offset($str, $offset, $chars) {
        $len = strlen($str);
        while ($chars > 0 && $offset < $len) {
            $offset++;
            while ($offset < $len && (ord($str[$offset]) & 0xC0) == 0x80) {
                $offset++;
            }
            $chars--;
        }
        return $offset;
    }


/// # GENERAL METHODS


    /**
     * Returns a BOM (Byte Order Mark) as two bytes (FE FF or FF FE).
     * It defines if a document is encoded with big or little endian,
     * and should be in beginning of document.
     *
     * @param bool $big_endian Whether the result is in big or little endian
     * @return string
     */
    static public function get_bom($big_endian = true) {
        return $big_endian ? chr(0xFE).chr(0xFF)
                           : chr(0xFF).chr(0xFE);
    }


/// # UTF-8 METHODS


    /**
     * Return ord value (code point) of an UTF-8 character.
     *
     * A single-byte (ASCII) lead returns its own value. Multi-byte sequences
     * must have exactly the length announced by the lead byte, valid
     * continuation bytes, and must not be overlong, a surrogate or greater
     * than 0x10FFFF; otherwise a warning is raised and false is returned.
     *
     * @param string $c Unicode character.
     * @return int|bool Code point, or false if $c is not a valid character
     */
    static public function ord_utf8($c) {

        /*
         * UTF-8 characters have 8 to 32 bits (see table), where 7 to 21 are used
         *
         *   Char. number range  |        UTF-8 octet sequence
         *      (hexadecimal)    |              (binary)
         *   --------------------+---------------------------------------------
         *   0000 0000-0000 007F | 0xxxxxxx
         *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
         *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
         *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         */

        $real_len = strlen($c);

        // 1 - Get decimal value of each byte of character
        $vt_ord = array();
        for ($i = 0; $i < $real_len; $i++) {
            $vt_ord[$i] = ord($c[$i]);
        }

        if ($real_len > 0) {
            $lead = $vt_ord[0];

            // 2 - Check if the character is an ASCII (1 byte): has 7 used bits
            if ($lead <= 0x7F) { // byte1 <= 01111111
                return $lead;
            }

            // 3 - Expected length, payload bits of the lead byte and smallest
            //     code point that legitimately needs that length
            $expected = 0;
            $ord = 0;
            $min = 0;
            if ($lead >= 0xC2 && $lead <= 0xDF) {        // 110xxxxx (C0/C1 are always overlong)
                $expected = 2;
                $ord = $lead & 0x1F;                     // 5 bits
                $min = 0x80;
            } elseif ($lead >= 0xE0 && $lead <= 0xEF) {  // 1110xxxx
                $expected = 3;
                $ord = $lead & 0x0F;                     // 4 bits
                $min = 0x800;
            } elseif ($lead >= 0xF0 && $lead <= 0xF4) {  // 11110xxx (up to U+10FFFF)
                $expected = 4;
                $ord = $lead & 0x07;                     // 3 bits
                $min = 0x10000;
            }

            // 4 - Validate the continuation bytes and append 6 bits from each
            if ($expected != 0 && $expected == $real_len) {
                $valid = true;
                for ($i = 1; $i < $expected; $i++) {
                    if (($vt_ord[$i] & 0xC0) != 0x80) {  // byte != 10xxxxxx
                        $valid = false;
                        break;
                    }
                    $ord = ($ord << 6) | ($vt_ord[$i] & 0x3F);
                }

                // Reject overlong forms, surrogates and values above U+10FFFF
                if ($valid && $ord >= $min && $ord <= 0x10FFFF &&
                    !($ord >= 0xD800 && $ord <= 0xDFFF)) {
                    return $ord;
                }
            }
        }

        // If it is an invalid UTF-8 character
        $vt_binary = array();
        for ($i = 0; $i < $real_len; $i++) {
            $vt_binary[] = str_pad(decbin($vt_ord[$i]), 8, '0', STR_PAD_LEFT);
        }
        $binary = implode(' ', $vt_binary);
        trigger_error('Invalid UTF-8 character: '.$binary, E_USER_WARNING);
        return false;
    }


    /**
     * Returns an UTF-8 character from ord code.
     *
     * Raises a warning and returns false when the code is negative, false,
     * a surrogate (0xD800-0xDFFF) or greater than 0x10FFFF.
     *
     * @param int $ord Ord code.
     * @return string|bool
     */
    static public function chr_utf8($ord) {

        $is_valid = $ord !== false && $ord >= 0 && $ord <= 0x10FFFF &&
                    !($ord >= 0xD800 && $ord <= 0xDFFF);

        if ($is_valid) {

            // Have 1 byte (7 used bits)
            if ($ord <= 0x7F) {
                return chr($ord);

            // Have 2 bytes (11 used bits = 5 + 6)
            } elseif ($ord <= 0x7FF) {
                return chr((($ord >> 6) & 0x1F) | 0xC0).   // ((ord >> 6) & 00011111) | 11000000
                       chr((   $ord     & 0x3F) | 0x80);   // (   ord     & 00111111) | 10000000

            // Have 3 bytes (16 used bits = 4 + 6 + 6)
            } elseif ($ord <= 0xFFFF) {
                return chr((($ord >> 12) & 0xF)  | 0xE0).  // ((ord >> 12) & 00001111) | 11100000
                       chr(( ($ord >> 6) & 0x3F) | 0x80).  // ( (ord >> 6) & 00111111) | 10000000
                       chr((    $ord     & 0x3F) | 0x80);  // (    ord     & 00111111) | 10000000
            }

            // Have 4 bytes (21 used bits = 3 + 6 + 6 + 6)
            return chr((($ord >> 18) & 0x7)  | 0xF0).  // ((ord >> 18) & 00000111) | 11110000
                   chr((($ord >> 12) & 0x3F) | 0x80).  // ((ord >> 12) & 00111111) | 10000000
                   chr((($ord >> 6)  & 0x3F) | 0x80).  // ( (ord >> 6) & 00111111) | 10000000
                   chr((    $ord     & 0x3F) | 0x80);  // (    ord     & 00111111) | 10000000
        }
        trigger_error('Ord code "'.$ord.'" is not a valid UTF-8 character', E_USER_WARNING);
        return false;
    }


    /**
     * Returns the expected number of bytes that an UTF-8 character uses,
     * judging only by its first byte.
     *
     * Raises a warning and returns false when the first byte cannot start a
     * character (empty input, continuation byte 0x80-0xBF, overlong lead
     * 0xC0/0xC1, or a byte above 0xF4).
     *
     * @param string $c UTF-8 character.
     * @return int|bool
     */
    static public function utf8_size($c) {
        if (strlen($c) > 0) {
            $ord = ord($c[0]);

            if ($ord <= 0x7F) {                        // 0xxxxxxx
                return 1;
            } elseif ($ord >= 0xC2 && $ord <= 0xDF) {  // 110xxxxx
                return 2;
            } elseif ($ord >= 0xE0 && $ord <= 0xEF) {  // 1110xxxx
                return 3;
            } elseif ($ord >= 0xF0 && $ord <= 0xF4) {  // 11110xxx (up to U+10FFFF)
                return 4;
            }
        }

        trigger_error('Invalid UTF-8 character "'.$c.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Converts an UTF-8 string to an UTF-16 string.
     * Each undecodable byte raises a warning and is converted to U+FFFD.
     *
     * @param string $str_utf8 String in UTF-8 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @return string String in UTF-16 encoding
     */
    static public function utf8_to_utf16($str_utf8, $big_endian_out = true) {
        $len = strlen($str_utf8);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf8($str_utf8, $i);
            $return .= self::chr_utf16($ord, $big_endian_out);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Converts an UTF-8 string to an UTF-32 string.
     * Each undecodable byte raises a warning and is converted to U+FFFD.
     *
     * @param string $str_utf8 String in UTF-8 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @return string String in UTF-32 encoding
     */
    static public function utf8_to_utf32($str_utf8, $big_endian_out = true) {
        $len = strlen($str_utf8);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf8($str_utf8, $i);
            $return .= self::chr_utf32($ord, $big_endian_out);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Returns a substring of an UTF-8 string, counted in characters.
     *
     * The string is walked byte by byte instead of using a counted regular
     * expression, because PCRE rejects {0,N} quantifiers above 65535.
     * A negative $from is treated as 0; a non-integer $length means
     * "up to the end of the string".
     *
     * @param string $str Original value.
     * @param int $from Beginning of substring (in characters).
     * @param int|bool $length Length of substring (in characters).
     * @return string
     */
    static public function substr_utf8($str, $from, $length = false) {
        $from = max(0, (int)$from);
        $start = self::utf8_byte_offset($str, 0, $from);

        if (!is_int($length)) {
            // Cast: substr() returns false instead of '' on older PHP versions
            return (string)substr($str, $start);
        }
        if ($length <= 0) {
            return '';
        }

        $end = self::utf8_byte_offset($str, $start, $length);
        return (string)substr($str, $start, $end - $start);
    }


    /**
     * Returns the length (in characters) of an UTF-8 string.
     *
     * Counts every byte that is not a continuation byte (10xxxxxx), which for
     * well-formed UTF-8 is the number of characters. This avoids
     * utf8_decode(), deprecated in recent PHP versions.
     *
     * @param string $str String to be checked.
     * @return int
     */
    static public function strlen_utf8($str) {
        return strlen(preg_replace('/[\x80-\xBF]/', '', $str));
    }


    /**
     * Returns an UTF-8 character from a position of an UTF-8 string.
     * A byte that cannot start a character counts as one position.
     *
     * @param string $str Text in UTF-8 encoding.
     * @param int $pos Position to be get.
     * @return string|bool The character, or false (with a warning) if the
     *                     position does not exist
     */
    static public function get_char_utf8($str, $pos) {
        $len = strlen($str);
        $i = 0;
        $current = 0;

        while ($i < $len) {

            // Get UTF-8 character length
            $char_len = self::utf8_size(substr($str, $i, 4));
            if ($char_len === false) {
                $char_len = 1; // keep advancing over an invalid byte
            }

            if ($current == $pos) {
                return substr($str, $i, $char_len);
            }

            $i += $char_len;
            $current++;
        }
        trigger_error('Invalid position "'.$pos.'" of string "'.$str.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Returns if a string is a valid UTF-8.
     *
     * With the "u" modifier PCRE checks the whole subject before matching, so
     * an empty pattern is enough. The empty string and strings made only of
     * line breaks are valid.
     *
     * @param string $str String to be checked.
     * @return bool
     */
    static public function is_utf8($str) {
        return preg_match('//u', $str) === 1;
    }


/// # UTF-16 METHODS


    /**
     * Return ord value (code point) of an UTF-16 character.
     *
     * @param string $c Unicode character (2 bytes, or 4 bytes for a surrogate pair).
     * @param bool $big_endian Whether the input char is in big or little endian.
     * @return int|bool Code point, or false (with a warning) if $c is invalid
     */
    static public function ord_utf16($c, $big_endian = true) {
        $bits16 = $big_endian ? 'n' : 'v';

        if (strlen($c) < 2) {
            trigger_error('No valid character can be obtained using W1 in UTF-16 string', E_USER_WARNING);
            return false;
        }

        // The character value U is the value of W1 when it is not a surrogate
        $unpacked = unpack($bits16, substr($c, 0, 2));
        $w1 = $unpacked[1];
        if ($w1 < 0xD800 || $w1 > 0xDFFF) {
            return $w1;
        }

        // No valid character can be obtained using W1 (low surrogate first)
        if ($w1 > 0xDBFF) {
            trigger_error('No valid character can be obtained using W1 in UTF-16 string', E_USER_WARNING);
            return false;
        }

        // There is no W2
        if (strlen($c) < 4) {
            trigger_error('There is no W2 in UTF-16 string', E_USER_WARNING);
            return false;
        }

        $unpacked = unpack($bits16, substr($c, 2, 2));
        $w2 = $unpacked[1];
        if ($w2 < 0xDC00 || $w2 > 0xDFFF) {
            trigger_error('The sequence is an error', E_USER_WARNING);
            return false;
        }

        // Get 10 low-order bits of W1 and 10 low-order bits of W2
        $y = $w1 & 0x3FF;
        $x = $w2 & 0x3FF;

        // Create a 20bit unsigned integer U'
        $u_line = ($y << 10) | $x;

        // Add 0x10000 to U' and get U
        return $u_line + 0x10000;
    }


    /**
     * Returns an UTF-16 character from ord code.
     *
     * Raises a warning and returns false when the code is false, negative or
     * greater than 0x10FFFF (it cannot be represented in UTF-16).
     *
     * @param int $ord Ord code
     * @param bool $big_endian Whether the result is in big or little endian
     * @return string|bool
     */
    static public function chr_utf16($ord, $big_endian = true) {
        $bits16 = $big_endian ? 'n' : 'v';

        if ($ord === false || $ord < 0 || $ord > 0x10FFFF) {
            trigger_error('Ord code "'.$ord.'" is not a valid UTF-16 character', E_USER_WARNING);
            return false;
        }

        if ($ord < 0x10000) {
            // Return 16bit unsigned integer
            return pack($bits16, $ord);
        }
        $u_line = $ord - 0x10000;  // U' <= 0xFFFFF (20 bits)

        // U' = yyyyyyyyyyxxxxxxxxxx

        // Get 10 high-order bits of U'
        $y = $u_line >> 10;   // U' >> 10

        // Get 10 low-order bits of U'
        $x = $u_line & 0x3FF; // U' & 00000000001111111111

        // Return 16 bits of (W1 | Y) and 16 bits of (W2 | X)
        return pack($bits16.$bits16, 0xD800 | $y, 0xDC00 | $x);
    }


    /**
     * Converts an UTF-16 string to an UTF-8 string.
     * Each undecodable 16-bit unit raises a warning and is converted to U+FFFD.
     *
     * @param string $str_utf16 String in UTF-16 encoding
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-8 encoding
     */
    static public function utf16_to_utf8($str_utf16, $big_endian_in = true) {
        $len = strlen($str_utf16);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf16($str_utf16, $i, $big_endian_in);
            $return .= self::chr_utf8($ord);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Converts an UTF-16 string to an UTF-32 string.
     * Each undecodable 16-bit unit raises a warning and is converted to U+FFFD.
     *
     * @param string $str_utf16 String in UTF-16 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-32 encoding
     */
    static public function utf16_to_utf32($str_utf16, $big_endian_out = true, $big_endian_in = true) {
        $len = strlen($str_utf16);
        $return = '';
        $i = 0;
        while ($i < $len) {
            list($ord, $char_len) = self::read_utf16($str_utf16, $i, $big_endian_in);
            $return .= self::chr_utf32($ord, $big_endian_out);
            $i += $char_len;
        }
        return $return;
    }


    /**
     * Returns the expected number of bytes that an UTF-16 character uses,
     * judging only by its first 16-bit unit (W1): 4 when W1 is a high
     * surrogate (0xD800-0xDBFF), 2 when W1 is not a surrogate.
     *
     * Raises a warning and returns false when fewer than 2 bytes are given or
     * when W1 is a low surrogate (0xDC00-0xDFFF), which cannot start a character.
     *
     * @param string $c UTF-16 character.
     * @param bool $big_endian Whether the input char is in big or little endian.
     * @return int|bool
     */
    static public function utf16_size($c, $big_endian = true) {
        if (strlen($c) >= 2) {
            $unpacked = unpack($big_endian ? 'n' : 'v', substr($c, 0, 2));
            $w1 = $unpacked[1];

            if ($w1 < 0xD800 || $w1 > 0xDFFF) {
                return 2;
            } elseif ($w1 <= 0xDBFF) {
                return 4;
            }
        }
        trigger_error('Invalid UTF-16 character "'.$c.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Returns an UTF-16 character from a position of an UTF-16 string.
     * A 16-bit unit that cannot start a character counts as one position.
     *
     * @param string $str Original string.
     * @param int $pos Position to be get.
     * @param bool $big_endian Whether the input is in big or little endian.
     * @return string|bool The character, or false (with a warning) if the
     *                     position does not exist
     */
    static public function get_char_utf16($str, $pos, $big_endian = true) {
        $len = strlen($str);
        $i = 0;
        $current = 0;

        while ($i < $len) {

            // Get UTF-16 character length
            $char_len = self::utf16_size(substr($str, $i, 4), $big_endian);
            if ($char_len === false) {
                $char_len = 2; // keep advancing over an invalid unit
            }

            if ($current == $pos) {
                return substr($str, $i, $char_len);
            }

            $i += $char_len;
            $current++;
        }
        trigger_error('Invalid position "'.$pos.'" of string "'.$str.'"', E_USER_WARNING);
        return false;
    }


    /**
     * Returns the length (in characters) of an UTF-16 string.
     * A 16-bit unit that cannot start a character counts as one character.
     *
     * @param string $str String to be checked.
     * @param bool $big_endian Whether the input is in big or little endian.
     * @return int
     */
    static public function strlen_utf16($str, $big_endian = true) {
        $len = strlen($str);
        $size = 0;
        $i = 0;
        while ($i < $len) {
            $char_len = self::utf16_size(substr($str, $i, 4), $big_endian);
            $i += ($char_len === false) ? 2 : $char_len;
            $size++;
        }
        return $size;
    }


/// # UTF-32 METHODS


    /**
     * Returns an UTF-32 character from ord code.
     * The value is packed as a 32-bit integer without further validation.
     *
     * @param int $ord Ord code
     * @param bool $big_endian Whether the result is in big or little endian
     * @return string
     */
    static public function chr_utf32($ord, $big_endian = true) {
        $bits32 = $big_endian ? 'N' : 'V';
        return pack($bits32, $ord);
    }


    /**
     * Return ord value of an UTF-32 character.
     *
     * @param string $c Unicode character (exactly 4 bytes).
     * @param bool $big_endian Whether the input is in big or little endian
     * @return int|bool Code point, or false (with a warning) if $c does not
     *                  have 4 bytes
     */
    static public function ord_utf32($c, $big_endian = true) {
        $bits32 = $big_endian ? 'N' : 'V';
        if (strlen($c) == 4) {
            $unpacked = unpack($bits32, $c);
            return $unpacked[1];
        }
        trigger_error('Invalid UTF-32 character', E_USER_WARNING);
        return false;
    }


    /**
     * Converts an UTF-32 string to an UTF-8 string.
     * Truncated units and values that are not valid code points are
     * converted to U+FFFD.
     *
     * @param string $str_utf32 String in UTF-32 encoding
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-8 encoding
     */
    static public function utf32_to_utf8($str_utf32, $big_endian_in = true) {
        $len = strlen($str_utf32);
        $return = '';
        $char_len = self::utf32_size();
        for ($i = 0; $i < $len; $i += $char_len) {
            $char = substr($str_utf32, $i, $char_len);
            $ord = self::valid_ord_or_replacement(self::ord_utf32($char, $big_endian_in));
            $return .= self::chr_utf8($ord);
        }
        return $return;
    }


    /**
     * Converts an UTF-32 string to an UTF-16 string.
     * Truncated units and values that are not valid code points are
     * converted to U+FFFD.
     *
     * @param string $str_utf32 String in UTF-32 encoding
     * @param bool $big_endian_out Whether the output is in big or little endian
     * @param bool $big_endian_in Whether the input is in big or little endian
     * @return string String in UTF-16 encoding
     */
    static public function utf32_to_utf16($str_utf32, $big_endian_out = true, $big_endian_in = true) {
        $len = strlen($str_utf32);
        $return = '';
        $char_len = self::utf32_size();
        for ($i = 0; $i < $len; $i += $char_len) {
            $char = substr($str_utf32, $i, $char_len);
            $ord = self::valid_ord_or_replacement(self::ord_utf32($char, $big_endian_in));
            $return .= self::chr_utf16($ord, $big_endian_out);
        }
        return $return;
    }


    /**
     * Returns the number of bytes that an UTF-32 character uses.
     *
     * @return int
     */
    static public function utf32_size() {
        return 4; // Fix length
    }


    /**
     * Returns an UTF-32 character from a position of an UTF-32 string.
     *
     * @param string $str Original string.
     * @param int $pos Position to be get.
     * @return string|bool The character, or false (with a warning) if the
     *                     position does not exist
     */
    static public function get_char_utf32($str, $pos) {
        $offset = $pos * self::utf32_size();
        if ($pos < 0 || $offset >= strlen($str)) {
            trigger_error('Invalid position "'.$pos.'" of string "'.$str.'"', E_USER_WARNING);
            return false;
        }
        return substr($str, $offset, self::utf32_size());
    }


    /**
     * Returns the length (in characters) of an UTF-32 string.
     * Trailing bytes that do not form a complete 4-byte unit are not counted.
     *
     * @param string $str String to be checked.
     * @return int
     */
    static public function strlen_utf32($str) {
        return (int)(strlen($str) / self::utf32_size());
    }

}
Return current item: Unicode Manipulation