Location: PHPKode > scripts > No404s > I18N_UnicodeString.php
<?php

// Turns off ALL PHP error reporting for the remainder of any request that
// includes this file (level 0 = report nothing).
// NOTE(review): this is a global side effect of merely loading a class file and
// it also hides errors raised by unrelated code; it presumably was added to
// silence notices from the class below -- confirm whether it is still wanted.
error_reporting(0);

/* vim: set expandtab tabstop=4 shiftwidth=4: */
/**
* Provides a method of storing and manipulating multibyte strings in PHP.
*
* PHP versions 4 and 5
*
* LICENSE: Copyright 2004-2006 John Downey. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* o Redistributions of source code must retain the above copyright notice, this
*   list of conditions and the following disclaimer.
* o Redistributions in binary form must reproduce the above copyright notice,
*   this list of conditions and the following disclaimer in the documentation
*   and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE FREEBSD PROJECT "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE FREEBSD PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are
* those of the authors and should not be interpreted as representing official
* policies, either expressed or implied, of The PEAR Group.
*
* @category  Internationalization
* @package   I18N_UnicodeString
* @author    John Downey <hide@address.com>
* @copyright 2004-2006 John Downey
* @license   http://www.freebsd.org/copyright/freebsd-license.html 2 Clause BSD License
* @version   CVS: $Id$
* @filesource
*/

/**
* Class provides a way to use and manipulate multibyte strings in PHP
*
* @package I18N_UnicodeString
* @author  John Downey <hide@address.com>
* @access  public
* @version Release: @package_version@
*/
class I18N_UnicodeString
{
    /**
    * The internal representation of the string: a zero-based list of integer
    * code points, one entry per character.
    *
    * Treat as private. It is left publicly accessible only because `var`
    * in the original made it so and existing callers may read it.
    *
    * @var array $_unicode
    */
    public $_unicode = array();

    /**
    * Converts every string value of an array into an I18N_UnicodeString.
    *
    * Useful for converting a GET/POST of all its Unicode values into a
    * workable and easily changed format. Non-string values (and the keys)
    * are left untouched; nested arrays are not descended into.
    *
    * @static
    * @access public
    * @param  array $array An array of PHP variables.
    * @param  string $encoding The encoding the string values are in; see
    *                          {@link setString()} for the accepted names.
    * @return array The array with all of its string values converted to
    *               I18N_UnicodeString objects. A non-array argument is
    *               returned unchanged.
    * @see    setString()
    */
    public static function convertArray($array, $encoding = 'HTML') {
        if (!is_array($array)) {
            return $array;
        }

        foreach ($array as $key => $value) {
            if (is_string($value)) {
                $array[$key] = new I18N_UnicodeString($value, $encoding);
            }
        }

        return $array;
    }

    /**
    * Creates a string from a value given in one of several encodings.
    *
    * The result of {@link setString()} is discarded here, so a value that
    * cannot be decoded leaves the new object as an empty string. Call
    * setString() directly when the error message is needed.
    *
    * @access public
    * @param  mixed $value A variable containing the Unicode string in one of
    *                      various encodings.
    * @param  string $encoding The encoding that the string is in.
    * @see    setString()
    */
    public function __construct($value = '', $encoding = 'UTF-8') {
        $this->setString($value, $encoding);
    }

    /**
    * Legacy (PHP 4 style) constructor name.
    *
    * PHP 8 no longer treats a method named after the class as the
    * constructor, so the real work lives in {@link __construct()}. This
    * method is kept so that code calling it explicitly (for example
    * parent::I18N_UnicodeString() in a subclass) keeps working.
    *
    * @access public
    * @param  mixed $value A variable containing the Unicode string in one of
    *                      various encodings.
    * @param  string $encoding The encoding that the string is in.
    * @see    __construct()
    */
    public function I18N_UnicodeString($value = '', $encoding = 'UTF-8') {
        $this->__construct($value, $encoding);
    }

    /**
    * Set the string to a value passed in one of many encodings.
    *
    * You may pass the encoding as an optional second parameter which defaults
    * to UTF-8 encoding. Encoding names are case-insensitive. Possible
    * encodings are:
    *
    * o <i>ASCII</i> - when you pass a normal 7 bit ASCII string
    * o <i>UTF-8</i> (or <i>UTF8</i>) - when you pass a UTF-8 encoded string
    * o <i>HTML</i> - when you pass a string encoded with numeric HTML
    *   entities, such as the kind received from a GET/POST
    * o <i>Unicode</i>, <i>UCS-4</i>, <i>UCS4</i>, <i>UTF-32</i> or
    *   <i>UTF32</i> - when passing an array of integer values representing
    *   each character
    *
    * On failure the previously stored string is left unchanged.
    *
    * @access public
    * @param  mixed $value A variable containing the Unicode string in one of
    *                      various encodings.
    * @param  string $encoding The encoding that the string is in.
    * @return mixed Returns true on success, or the error message produced by
    *               {@link raiseError()} (a string) otherwise.
    */
    public function setString($value, $encoding = 'UTF-8') {
        switch (strtoupper($encoding)) {
            case 'ASCII':
            case 'UTF8':
            case 'UTF-8':
                $unicode = $this->utf8ToUnicode($value);
                break;

            case 'HTML':
                $unicode = $this->_stringFromHtml($value);
                break;

            case 'UTF32':
            case 'UTF-32':
            case 'UCS4':
            case 'UCS-4':
            case 'UNICODE':
                if (is_array($value)) {
                    // Normalise to a zero-based list of ints so that
                    // indexing and strict comparisons elsewhere are safe.
                    $unicode = array_values(array_map('intval', $value));
                } else {
                    $unicode = $this->raiseError('Unicode input must be an array of code points');
                }
                break;

            default:
                return $this->raiseError('Unrecognized encoding');
        }

        // Every decoder returns an array on success; anything else is the
        // error value handed back by raiseError().
        if (!is_array($unicode)) {
            return $unicode;
        }

        $this->_unicode = $unicode;

        return true;
    }

    /**
    * Converts a string encoded with numeric HTML entities into our internal
    * representation of an array of integers.
    *
    * Recognised entities are &#NNNN; (decimal) and &#xHHHH; / &#XHHHH;
    * (hexadecimal, as suggested by Jonathan Yavner). Everything else,
    * including named entities such as &amp; and any "&#" that does not start
    * a well-formed numeric entity, is copied through byte by byte, each
    * byte becoming one code point.
    *
    * @access private
    * @param  string $string A string containing Unicode values encoded as HTML
    *                        entities.
    * @return array The array of Unicode values.
    */
    public function _stringFromHtml($string = '') {
        $parts   = explode('&#', (string) $string);
        $unicode = array();

        // The first chunk precedes any "&#" marker, so it is always literal
        // text even when it happens to contain a semicolon.
        self::_appendBytes($unicode, array_shift($parts));

        foreach ($parts as $part) {
            if (preg_match('/^(?:[xX]([0-9a-fA-F]+)|([0-9]+));/', $part, $match)) {
                if ($match[1] !== '') {
                    $unicode[] = intval($match[1], 16);
                } else {
                    $unicode[] = intval($match[2]);
                }

                $part = (string) substr($part, strlen($match[0]));
            } else {
                // Not a numeric entity after all: keep the "&#" that
                // explode() swallowed.
                $unicode[] = 38;
                $unicode[] = 35;
            }

            self::_appendBytes($unicode, $part);
        }

        return $unicode;
    }

    /**
    * Appends each byte of a string to a list of code points.
    *
    * @static
    * @access private
    * @param  array $codes The list to extend (modified in place).
    * @param  string $bytes The bytes to append, one code point per byte.
    * @return void
    */
    private static function _appendBytes(&$codes, $bytes) {
        $bytes = (string) $bytes;

        for ($i = 0, $max = strlen($bytes); $i < $max; $i++) {
            $codes[] = ord($bytes[$i]);
        }
    }

    /**
    * Converts a UTF-8 string into our representation of an array of integers.
    *
    * Method was made static by suggestion of Lukas Feiler (#7429)
    *
    * Sequences of up to six bytes (the original, pre-RFC 3629 form of UTF-8)
    * are accepted so that everything {@link unicodeCharToUtf8()} can emit
    * can be read back. Overlong encodings are not rejected. A stray
    * continuation byte, an invalid lead byte, a lead byte followed by
    * anything other than the required continuation bytes, or a sequence cut
    * short by the end of the string is reported as malformed.
    *
    * @static
    * @access public
    * @param  string $string A string containing Unicode values encoded in UTF-8
    * @return mixed The array of Unicode values, or the error message produced
    *               by {@link raiseError()} when the input is malformed.
    */
    public static function utf8ToUnicode($string = '') {
        $string  = (string) $string;
        $unicode = array();
        $length  = strlen($string);
        $pos     = 0;

        while ($pos < $length) {
            $lead = ord($string[$pos]);

            if ($lead < 0x80) {
                // plain ASCII, stored as is
                $unicode[] = $lead;
                $pos++;
                continue;
            }

            // The high bits of the lead byte say how many continuation
            // bytes follow; its remaining low bits start the code point.
            if (($lead & 0xE0) == 0xC0) {
                $extra = 1;
                $code  = $lead & 0x1F;
            } elseif (($lead & 0xF0) == 0xE0) {
                $extra = 2;
                $code  = $lead & 0x0F;
            } elseif (($lead & 0xF8) == 0xF0) {
                $extra = 3;
                $code  = $lead & 0x07;
            } elseif (($lead & 0xFC) == 0xF8) {
                $extra = 4;
                $code  = $lead & 0x03;
            } elseif (($lead & 0xFE) == 0xFC) {
                $extra = 5;
                $code  = $lead & 0x01;
            } else {
                return self::raiseError('Malformed UTF-8 string');
            }

            if ($pos + $extra >= $length) {
                // the string ends in the middle of a character
                return self::raiseError('Malformed UTF-8 string');
            }

            for ($k = 1; $k <= $extra; $k++) {
                $byte = ord($string[$pos + $k]);

                if (($byte & 0xC0) != 0x80) {
                    return self::raiseError('Malformed UTF-8 string');
                }

                // each continuation byte contributes six more bits
                $code = ($code << 6) | ($byte & 0x3F);
            }

            $unicode[] = $code;
            $pos += $extra + 1;
        }

        return $unicode;
    }

    /**
    * Transforms a single unicode character represented by an integer to a
    * UTF-8 string.
    *
    * Suggested by Lukas Feiler (#7429)
    *
    * Values of 1 << 21 and above are written using the historical five and
    * six byte forms, which are not valid in modern (RFC 3629) UTF-8.
    *
    * @static
    * @access public
    * @param  integer $char A unicode character as an integer
    * @return string The unicode character converted to a UTF-8 string.
    */
    public static function unicodeCharToUtf8($char) {
        if ($char < 128) {
            // its an ASCII char no encoding needed
            return chr($char);
        }

        if ($char < (1 << 11)) {
            // its a 2 byte UTF-8 char
            return chr(192 + ($char >> 6))
                 . chr(128 + ($char & 63));
        }

        if ($char < (1 << 16)) {
            // its a 3 byte UTF-8 char
            return chr(224 + ($char >> 12))
                 . chr(128 + (($char >> 6) & 63))
                 . chr(128 + ($char & 63));
        }

        if ($char < (1 << 21)) {
            // its a 4 byte UTF-8 char
            return chr(240 + ($char >> 18))
                 . chr(128 + (($char >> 12) & 63))
                 . chr(128 + (($char >> 6) & 63))
                 . chr(128 + ($char & 63));
        }

        if ($char < (1 << 26)) {
            // its a 5 byte UTF-8 char
            return chr(248 + ($char >> 24))
                 . chr(128 + (($char >> 18) & 63))
                 . chr(128 + (($char >> 12) & 63))
                 . chr(128 + (($char >> 6) & 63))
                 . chr(128 + ($char & 63));
        }

        // its a 6 byte UTF-8 char
        return chr(252 + ($char >> 30))
             . chr(128 + (($char >> 24) & 63))
             . chr(128 + (($char >> 18) & 63))
             . chr(128 + (($char >> 12) & 63))
             . chr(128 + (($char >> 6) & 63))
             . chr(128 + ($char & 63));
    }

    /**
    * Retrieves the string and returns it as a UTF-8 encoded string.
    *
    * @access public
    * @return string A string with the Unicode values encoded in UTF-8.
    */
    public function toUtf8String() {
        $string = '';

        foreach ($this->_unicode as $char) {
            $string .= $this->unicodeCharToUtf8($char);
        }

        return $string;
    }

    /**
    * Retrieves the string and returns it as a string encoded with HTML
    * entities.
    *
    * Only code points above 127 are written as decimal entities; ASCII
    * characters, including &, < and >, are emitted unescaped.
    *
    * @access public
    * @return string A string with the Unicode values encoded as HTML entities.
    */
    public function toHtmlEntitiesString() {
        $string = '';

        foreach ($this->_unicode as $char) {
            if ($char > 127) {
                $string .= '&#' . $char . ';';
            } else {
                $string .= chr($char);
            }
        }

        return $string;
    }

    /**
    * Retrieve the length of the string in characters.
    *
    * @access public
    * @return integer The length of the string.
    */
    public function length() {
        return count($this->_unicode);
    }

    /**
    * Resolves a (begin, length) pair the way PHP's substr does.
    *
    * A negative $begin counts back from the end of the string (clamped to
    * the start); a null $length means "up to the end"; a negative $length
    * stops that many characters before the end (clamped to zero). No range
    * checking is done here: $begin may still lie past the end, in which
    * case a null $length resolves to a negative number.
    *
    * @access private
    * @param  integer $begin The requested beginning.
    * @param  mixed $length The requested length or null.
    * @return array A two element list: the resolved beginning and length.
    */
    private function _normalizeRange($begin, $length) {
        $total = $this->length();
        $begin = (int) $begin;

        if ($begin < 0) {
            $begin = max(0, $total + $begin);
        }

        if (is_null($length)) {
            $length = $total - $begin;
        } else {
            $length = (int) $length;

            if ($length < 0) {
                $length = max(0, $total - $begin + $length);
            }
        }

        return array($begin, $length);
    }

    /**
    * Works like PHP's substr function only it works on Unicode strings.
    *
    * Negative values of $begin and $length are interpreted as substr does.
    * Unlike substr, asking for more characters than are available is an
    * error rather than being silently truncated.
    *
    * @access public
    * @param  integer $begin The beginning of the substring.
    * @param  integer $length The length to read. Defaults to the rest of the
    *                         string.
    * @return mixed A new I18N_UnicodeString containing the substring, or the
    *               error message produced by {@link raiseError()} when the
    *               requested range does not lie inside the string.
    */
    public function subString($begin, $length = null) {
        list($begin, $length) = $this->_normalizeRange($begin, $length);
        $total = $this->length();

        if (($begin + $length) > $total) {
            return $this->raiseError('Cannot read past end of string.');
        }

        if ($begin > $total) {
            return $this->raiseError('Beginning extends past end of string.');
        }

        return new I18N_UnicodeString(array_slice($this->_unicode, $begin, $length), 'Unicode');
    }

    /**
    * Replaces every occurrence of $find with $replace inside one section of
    * the string, leaving the text before and after that section untouched.
    *
    * @access public
    * @param  I18N_UnicodeString $find The string to replaced
    * @param  I18N_UnicodeString $replace The string to replace $find with
    * @param  integer $start The position in the string to start replacing at
    * @param  integer $length The length from the starting to position to stop
    *                         replacing at. Defaults to the rest of the string.
    * @return mixed The resulting I18N_UnicodeString, or the error message
    *               from {@link subString()} when $start / $length do not lie
    *               inside the string.
    * @see    stringReplace()
    */
    public function subStringReplace($find, $replace, $start, $length = null) {
        // subString() validates the range; bail out with its error value.
        $section = $this->subString($start, $length);
        if (!($section instanceof I18N_UnicodeString)) {
            return $section;
        }

        // The range is known to be valid now, so plain slices are safe.
        list($start, $length) = $this->_normalizeRange($start, $length);
        $before  = array_slice($this->_unicode, 0, $start);
        $after   = array_slice($this->_unicode, $start + $length);
        $section = $section->stringReplace($find, $replace);

        return new I18N_UnicodeString(array_merge($before, $section->_unicode, $after), 'Unicode');
    }

    /**
    * Works like PHP's str_replace function.
    *
    * The string is scanned once from left to right; text inserted by a
    * replacement is never searched again, so $replace may safely contain
    * $find. An empty $find replaces nothing.
    *
    * @access public
    * @param  I18N_UnicodeString $find The string to replaced
    * @param  I18N_UnicodeString $replace The string to replace $find with
    * @return I18N_UnicodeString The current string with all $find replaced by
    *                            $replace
    * @see    subStringReplace()
    */
    public function stringReplace($find, $replace) {
        $needle       = $find->_unicode;
        $needleLength = count($needle);

        if ($needleLength == 0) {
            return new I18N_UnicodeString($this->_unicode, 'Unicode');
        }

        $result = array();
        $total  = $this->length();
        $pos    = 0;

        while ($pos < $total) {
            if ($this->_matchesAt($needle, $pos)) {
                foreach ($replace->_unicode as $char) {
                    $result[] = $char;
                }
                $pos += $needleLength;
            } else {
                $result[] = $this->_unicode[$pos];
                $pos++;
            }
        }

        return new I18N_UnicodeString($result, 'Unicode');
    }

    /**
    * Tells whether a list of code points occurs at a given position.
    *
    * @access private
    * @param  array $needle The zero-based list of code points to look for.
    * @param  integer $offset The position in this string to compare at.
    * @return boolean True if every code point of $needle matches this string
    *                 starting at $offset, false otherwise (including when
    *                 $needle would run past the end).
    */
    private function _matchesAt($needle, $offset) {
        $needleLength = count($needle);

        if ($offset + $needleLength > count($this->_unicode)) {
            return false;
        }

        for ($k = 0; $k < $needleLength; $k++) {
            if ($this->_unicode[$offset + $k] !== $needle[$k]) {
                return false;
            }
        }

        return true;
    }

    /**
    * Works like PHP's strstr function by returning the string from $find on.
    *
    * @access public
    * @param  I18N_UnicodeString $find The string to found
    * @return mixed A new I18N_UnicodeString holding the current string from
    *               the first occurrence of $find to the end, or false when
    *               $find does not occur or is empty.
    */
    public function strStr($find) {
        $needle = $find->_unicode;

        if (count($needle) == 0) {
            return false;
        }

        // Every start position is tried in turn, so a failed partial match
        // never hides an overlapping real match ("ab" inside "aab").
        $lastStart = $this->length() - count($needle);
        for ($pos = 0; $pos <= $lastStart; $pos++) {
            if ($this->_matchesAt($needle, $pos)) {
                return new I18N_UnicodeString(array_slice($this->_unicode, $pos), 'Unicode');
            }
        }

        return false;
    }

    /**
    * Turns the $char argument of indexOf() / lastIndexOf() into a code point.
    *
    * Integers are used as they are. A one byte string yields its byte
    * value; a longer string is decoded as UTF-8 and its first character is
    * used.
    *
    * @access private
    * @param  mixed $char An integer code point or a string.
    * @return mixed The code point as an integer, or null when $char is an
    *               empty string or is not valid UTF-8.
    */
    private function _toCodePoint($char) {
        if (is_int($char)) {
            return $char;
        }

        $char = (string) $char;
        if ($char === '') {
            return null;
        }

        if (strlen($char) == 1) {
            return ord($char);
        }

        $codes = $this->utf8ToUnicode($char);
        if (!is_array($codes) || count($codes) == 0) {
            return null;
        }

        return $codes[0];
    }

    /**
    * Returns the position of a character much like PHP's strpos function.
    *
    * @access public
    * @param  mixed $char A Unicode char represented as either an integer or a
    *                     UTF-8 char.
    * @return integer The location of the first occurrence of the character
    *                 in the string, or -1 if it does not occur.
    */
    public function indexOf($char) {
        $code = $this->_toCodePoint($char);
        if (is_null($code)) {
            return -1;
        }

        for ($i = 0, $length = $this->length(); $i < $length; $i++) {
            if ($this->_unicode[$i] === $code) {
                return $i;
            }
        }

        return -1;
    }

    /**
    * Returns the last position of a character much like PHP's strrpos function.
    *
    * @access public
    * @param  mixed $char A Unicode char represented as either an integer or a
    *                     UTF-8 char.
    * @return integer The location of the last occurrence of the character
    *                 in the string, or -1 if it does not occur.
    */
    public function lastIndexOf($char) {
        $code = $this->_toCodePoint($char);
        if (is_null($code)) {
            return -1;
        }

        for ($i = $this->length() - 1; $i >= 0; $i--) {
            if ($this->_unicode[$i] === $code) {
                return $i;
            }
        }

        return -1;
    }

    /**
    * Determines if two Unicode strings are equal
    *
    * @access public
    * @param  I18N_UnicodeString $unicode The string to compare to.
    * @return boolean True if they are equal, false otherwise.
    */
    public function equals($unicode) {
        if ($this->length() != $unicode->length()) {
            // if they arn't even the same length no need to even check
            return false;
        }

        return ($this->_unicode === $unicode->_unicode);
    }

    /**
    * Appends a given Unicode string to the end of the current one.
    *
    * Neither operand is modified.
    *
    * @access public
    * @param  I18N_UnicodeString $unicode The string to append.
    * @return I18N_UnicodeString The new string created from the appension.
    */
    public function append($unicode) {
        return new I18N_UnicodeString(array_merge($this->_unicode, $unicode->_unicode), 'Unicode');
    }

    /**
    * Produces the value that methods of this class return on failure.
    *
    * This copy of the class does not load PEAR: the message is handed back
    * unchanged, so callers receive a plain string rather than a PEAR_Error
    * object.
    *
    * @static
    * @access public
    * @param string $message The error message to raise.
    * @return string The message that was passed in.
    */
    public static function raiseError($message) {
        return $message;
    }
}
?>
Return current item: No404s