<?php
/* vim: set expandtab tabstop=4 shiftwidth=4: */
/**
* PHPBiDi is an implementation of the Unicode Bidirectional Algorithm in PHP.
*
* This class is meant to support external applications that need to handle
* text whose characters are placed in logical order but some or all
* characters should be displayed or printed in a right-to-left direction.
* Plain UTF-8 text and HTML are supported.
*
* It parses the text in two stages. As soon as the object is constructed,
* the text is parsed and tagged in accordance to Unicode's Bidirectional
* Algorithm. Once the external application has decided where to break each
* line, the line details are passed to function getLine(), which returns the
* requested characters in the correct visual order.
*
* @package PHPBiDi
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @version $Id: phpbidi.php, v 1.00 Wed Jan 09 2008 21:31:36 GMT+0200 Efthimios Mavrogeorgiadis $
* @since Wed Jan 09 2008 01:32:38 GMT+0200
* @access public
* @uses unicode.php to parse UTF-8 characters
* @example example.php description
* Displays <a href="http://www.gnu.org/licenses/old-licenses/gpl-2.0.html">GNU Public License</a>
* @license http://www.gnu.org/licenses/old-licenses/gpl-2.0.html GNU Public License
* @copyright Copyright (c) 2008, Efthimios Mavrogeorgiadis
*/
// Load required file
require_once (dirname(__FILE__) . '/unicode.php');
class PHPBiDi {
/**
* The input as it is transformed by the functions of the class.
* @access private
* @var string
*/
private $text = '';
/**
* The offset of array $paragraphs, i.e. the number of the paragraph being processed.
* @access private
* @var integer
*/
private $par = 0;
/**
* How many characters have been processed?
* Resets to 0 whenever a new paragraph is loaded.
* @access private
* @var integer
*/
private $char_counter = 0;
/**
* The paragraph embedding level (can be either 0 or 1).
* @access private
* @var integer
*/
private $pel = 0;
/**
* Is the input HTML text?
* @access private
* @var boolean
*/
private $html = false;
/**
* Does the input contain right-to-left characters?
* @access private
* @var boolean
*/
private $rtl = false;
/**
* Does the input contain Arabic characters?
* @access private
* @var boolean
*/
private $arabic = false;
/**
* The paragraphs comprising the input.
* @access private
* @var array
*/
private $paragraphs = array();
/**
* The results of the first stage of parsing.
* @access private
* @var array
*/
private $result = array();
/**
* The embedding level of each paragraph.
* @access private
* @var array
*/
private $pels = array();
/**
* The tags that are stripped from each paragraph.
* @access private
* @var array
*/
private $tags = array();
/**
* The characters found in each line.
* @access private
* @var array
*/
private $line_chars = array();
/**
* The tags found in each line.
* @access private
* @var array
*/
private $line_tags = array();
/**
* The tags that are open in each line.
* @access private
* @var array
*/
private $open_tags = array();
/**
* The tags that split the paragraphs.
* @access private
* @var array
*/
private $split_tags = array();
/**
* The characters of each paragraph.
* @access private
* @var array
*/
private $text_array = array();
/**
* The embedded levels of each paragraph.
* @access private
* @var array
*/
private $new_levels = array();
/**
* Constructor function
*
* It feeds on text and HTML (true or false).
*
* @access public
* @param string [$text] The input
* @param boolean [$html] Is it HTML or not?
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
// Legacy PHP 4-style constructor name, kept so that existing explicit calls
// (e.g. parent::PHPBidi(...) from a subclass) keep working. It is declared
// before __construct() on purpose: old PHP 5 releases raise an E_STRICT
// "redefining constructor" notice for the opposite order.
public function PHPBidi($text, $html) {
    $this->__construct($text, $html);
}
/**
* Constructor: runs the whole first parsing stage.
*
* PHP 8 no longer treats a method named after the class as the constructor,
* so the work lives here; PHPBidi() merely forwards to it.
*
* The input is split into paragraphs and, for every paragraph, one entry is
* appended to $result with the keys 'text', 'newtext', 'rtl' and 'levels'.
* When everything has been parsed the paragraph pointer is reset to 0.
*
* @access public
* @param string [$text] The input
* @param boolean [$html] Is it HTML or not?
*/
public function __construct($text, $html) {
    $this->html = $html;
    $this->setText($text);
    $this->checkLanguages();
    $this->getParagraphs();
    // Iterate by value: the paragraphs are only read here, and a by-reference
    // loop variable would stay bound to the last element after the loop.
    foreach ($this->paragraphs as $paragraph) {
        $this->result[] = array();
        $last = count($this->result) - 1;
        $this->pel = 0;
        $this->new_levels = array();
        $this->setText($paragraph);
        $this->checkLanguages();
        if ($this->html) {
            // Strips the tags from $text and records them in $tags.
            $this->getTags();
        }
        $this->decodeUTF8();
        $this->getBiDiText();
        $this->parseBiDiText();
        $this->getText();
        if ($this->arabic) {
            $this->fixArabChars();
        }
        $this->getText();
        $this->pels[] = $this->pel;
        if ($this->html and count($this->tags[$this->par])) {
            $this->result[$last]['text'] = $this->fixTags();
        } else {
            $this->result[$last]['text'] = $this->text;
        }
        $this->result[$last]['newtext'] = $this->text;
        $this->result[$last]['rtl'] = $this->pel ? true : false;
        $this->result[$last]['levels'] = $this->new_levels;
        // Moves $par on, which is the index used for $tags above.
        $this->changeParagraph();
    }
    $this->resetParagraph();
}
/**
* Get your line in right-to-left order.
*
* @access public
* @param integer [$start] The position in the paragraph where the new line starts (first character is 0). Only positive values are supported.
* @param integer [$length] The length of the new line.
* @param boolean [$tags] True if you need tags returned.
* @param boolean [$input] True if $start and $length are based on getResultText().
* @return string The new line in right-to-left order.
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function getLine($start, $length, $tags = false, $input = false) {
    // Work out the tagless text of the requested line. When the caller's
    // coordinates refer to getResultText(), tags inside the slice are
    // removed and the length is re-measured on what is left.
    if (!$input) {
        $txt = substr($this->getResultNewText(), $start, $length);
    } else {
        $slice = substr($this->getResultText(), $start, $length);
        $txt = preg_replace('/<[^>]+>/', '', $slice);
        $length = strlen($txt);
    }
    $this->char_counter = $this->char_counter + $length;
    $this->getStrippedValues($txt, $start, $length);
    // Always build the reordered line first, even when the tagged variant
    // is what ends up being returned.
    $line = $this->getBiDiLine($start, $length);
    if (!$this->html or !$tags) {
        return $line;
    }
    return $this->restoreTags($start, $length);
}
/**
* Change paragraph.
*
* Use it to notify PHPBiDi that you've finished parsing a paragraph.
*
* @access public
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function changeParagraph() {
    // A new paragraph starts with no characters consumed.
    $this->char_counter = 0;
    ++$this->par;
}
/**
* Parse the first paragraph.
*
* Use it to notify PHPBiDi that you want to reset its paragraph counter
* and start from the beginning.
*
* @access public
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function resetParagraph() {
    // Back to the first paragraph, with nothing consumed yet.
    $this->par = $this->char_counter = 0;
}
/**
* Parse a specific paragraph.
*
* Use it to notify PHPBiDi that you want to move to a specific paragraph.
*
* @access public
* @param integer [$par] The paragraph number to move to.
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function goToParagraph($par) {
    // Jumping to another paragraph also restarts its character count.
    $this->char_counter = 0;
    $this->par = $par;
}
/**
* Get the number of the current paragraph.
*
* Use it to ask PHPBiDi which paragraph (zero-based offset) it is currently positioned on.
*
* @access public
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function getParagraphNum() {
    // Zero-based offset of the paragraph currently being processed.
    $current = $this->par;
    return $current;
}
/**
* How big is the $result array?
*
* Ask PHPBiDi to tell you how many paragraphs it has identified
* in your input.
*
* @access public
* @return integer The number of paragraphs in your text
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function getResultNum() {
    // One $result entry is stored per parsed paragraph.
    $paragraphCount = sizeof($this->result);
    return $paragraphCount;
}
/**
* Get an original paragraph.
*
* Get the text of the paragraph you are processing as it was
* in your original input.
*
* @access public
* @return string A paragraph of your original input
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function getResultText() {
    // 'text' is the paragraph with its tags put back (HTML input) or the
    // processed paragraph itself (plain input).
    $entry = $this->result[$this->par];
    return $entry['text'];
}
/**
* Get a tagless paragraph.
*
* Get the text of the paragraph you are processing.
* If it's HTML text, it's stripped of its tags. Otherwise, you get
* your original input.
*
* @access public
* @return string A tagless paragraph
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function getResultNewText() {
    // 'newtext' is the current paragraph as stored after the first stage.
    $entry = $this->result[$this->par];
    return $entry['newtext'];
}
/**
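* Is the base direction of the current paragraph right-to-left?
*
* The flag is derived from the paragraph embedding level computed in the first
* stage. If it is false you may be able to skip the second stage of processing,
* but note that a left-to-right paragraph can still contain right-to-left runs.
*
* @access public
* @return boolean True if the paragraph embedding level of the current paragraph is right-to-left.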
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
public function getResultRTL() {
    // 'rtl' is stored by the constructor from the paragraph embedding level.
    $entry = $this->result[$this->par];
    return $entry['rtl'];
}
/**
* Get all HTML tags found in input together with their offset
* and strip the input tagless.
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getTags() {
    // The same pattern is used to locate and to strip the tags, so the byte
    // offsets computed below always refer to the text that is actually left.
    $tagPattern = '/<[^<>]+>/';
    preg_match_all($tagPattern, $this->text, $matches, PREG_OFFSET_CAPTURE);
    $removed = 0; // bytes of markup seen so far
    $t = array();
    foreach ($matches[0] as $match) {
        list($tag, $bytePos) = $match;
        // Position the tag would have in the tagless text.
        $offset = $bytePos - $removed;
        $removed += strlen($tag);
        if (preg_match('/^<\s*\//', $tag)) {
            // A closing tag is attached to the character before it. Never go
            // below zero: a negative length would make substr() count from the
            // end of the string and produce a bogus offset.
            $offset = max(0, $offset - 1);
        }
        $t[] = array('tag' => $tag, 'offset' => $offset);
    }
    $this->text = preg_replace($tagPattern, '', $this->text);
    // Convert the byte offsets into character offsets by counting the UTF-8
    // characters that precede each tag.
    $pattern = constant('UTF8PATTERN');
    foreach ($t as $i => $entry) {
        $sub = substr($this->text, 0, $entry['offset']);
        preg_match_all($pattern, $sub, $chars);
        $t[$i]['offset'] = count($chars[0]);
    }
    // One list of tags per paragraph, in paragraph order.
    $this->tags[] = $t;
}
/**
* Wrapper function to identify characters.
*
* This function calls checkArabic() and checkText() to determine
* whether Arabic or other right-to-left characters appear in the input.
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function checkLanguages() {
    $hasArabic = $this->checkArabic();
    $this->arabic = $hasArabic;
    // Arabic text is right-to-left by definition; only look for the other
    // right-to-left characters when no Arabic was found.
    $this->rtl = $hasArabic ? true : $this->checkText();
}
/**
* Set the $text variable.
*
* @access private
* @param string [$text] The input as it is processed by the functions
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function setText($text) {
    // Replace the working text that the other methods read and transform.
    $this->text = $text;
    return;
}
/**
* Set the $text_array.
*
* The input split in characters with their Unicode characteristics
* and bidirectional information.
*
* @access private
* @param array [$text_array] The input split in characters
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function setTextArray($text_array) {
    // Store the per-character array of the paragraph being processed.
    $this->text_array = $text_array;
    return;
}
/**
* Get the input as an array of characters.
*
* @access private
* @return array The input as an array of characters
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getTextArray() {
    // Returned by value (the function is not declared to return a reference).
    $characters = $this->text_array;
    return $characters;
}
/**
* Decode the UTF-8 encoded input into an array of Unicode character code values
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function decodeUTF8() {
    // Turns the UTF-8 strings produced by UTF8Text2Array() into UniChar
    // objects (code point, original bytes, attached tags, joiner marks) and
    // stores the resulting array through setTextArray().
    $str = $this->UTF8Text2Array();
    $open = array(); // opening tags not yet closed, most recent first
    // Plain-text input never goes through getTags(), so this paragraph may
    // have no tag list at all; treat that as "no tags".
    $t = isset($this->tags[$this->par]) ? $this->tags[$this->par] : array();
    $tag = count($t) ? array_shift($t) : '';
    $total = count($str);
    for ($i = 0; $i < $total; $i++) {
        $char = new UniChar();
        // Attach tags to Unicode characters
        while ($this->html and is_array($tag) and $tag['offset'] == $i) {
            $closing = preg_match('/<\s*\//', $tag['tag']);
            $solo = preg_match('/<[^>]+\/\s*>/', $tag['tag']);
            if ($closing) {
                // Pair the closing tag with the most recent open tag of the
                // same name. The whole tag name is compared; comparing only
                // its first letter would leave multi-letter tags such as
                // </span> unmatched.
                if (count($open) and preg_match('/<\s*\/\s*([^\s>]+)/', $tag['tag'], $m)) {
                    $name = preg_quote($m[1], '/');
                    foreach ($open as $k => $candidate) {
                        if (preg_match('/<\s*' . $name . '(\s+[^>]+)*>/', $candidate)) {
                            $char->setTag($candidate);
                            array_splice($open, $k, 1);
                            break;
                        }
                    }
                }
            } elseif ($solo) {
                // Self-closing tag: attach it, nothing to remember.
                $char->setTag($tag['tag']);
            } else {
                // Opening tag: attach it and remember it until it is closed.
                $char->setTag($tag['tag']);
                array_unshift($open, $tag['tag']);
            }
            $tag = count($t) ? array_shift($t) : '';
        }
        $bytes = $str[$i];
        $length = strlen($bytes);
        if ($length > 6) {
            echo "\nCharacter out of range... Aborted!\n";
            exit;
        }
        if ($length == 1) {
            $char->setNumber(ord($bytes));
        } else {
            // The lead byte carries (7 - $length) payload bits, every
            // continuation byte carries 6.
            $mask = (64 / (pow(2, ($length - 1)))) - 1;
            for ($pos = 0; $pos < $length; $pos++) {
                $lshift = 6 * ($length - $pos - 1);
                $char_num = ord($bytes[$pos]);
                $char->setNumber($char->getNumber() | (($char_num & $mask) << $lshift));
                $mask = 63;
            }
        }
        $char->setLetter($bytes);
        $str[$i] = $char;
        // U+200C (zero width non-joiner) and U+200D (zero width joiner) are
        // recorded on the character that precedes them.
        if ($char->getNumber() == 8204 or $char->getNumber() == 8205) {
            if ($i) {
                $str[$i - 1]->setJoiner($char->getNumber());
            }
        }
    }
    $this->setTextArray($str);
}
/**
* Check whether input contains Arabic characters
*
* @access private
* @return boolean Returns true if input contains Arabic characters
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function checkArabic() {
    // UTF-8 byte sequences of the characters labelled AL and AN below.
    $arabicBytes = '/(
\xD8[\x80-\x83\x8B\x8D\x9B\x9E\x9F\xA1-\xBA] # AL
| \xD9[\x80-\x8A\xAD-\xAF\xB1-\xBF] # AL
| \xDA[\x80-\xBF] # AL
| \xDB[\x80-\x95\x9D\xA5\xA6\xAE\xAF\xBA-\xBF] # AL
| \xDC[\x80-\x8D\x90\x92-\xAF] # AL
| \xDD[\x8D-\xAD] # AL
| \xDE[\x80-\xA5\xB1] # AL
| \xEF\xAD[\x90-\xBF] # AL
| \xEF\xAE[\x80-\xB1] # AL
| \xEF\xAF[\x93-\xBF] # AL
| \xEF[\xB0-\xB3][\x80-\xBF] # AL
| \xEF\xB4[\x80-\xBD] # AL
| \xEF\xB5[\x90-\xBF] # AL
| \xEF\xB6[\x80-\x8F\x92-\xBF] # AL
| \xEF\xB7[\x80-\x87\xB0-\xBC] # AL
| \xEF\xB9[\xB0-\xB4\xB6-\xBF] # AL
| \xEF\xBA[\x80-\xBF] # AL
| \xEF\xBB[\x80-\xBC] # AL
| \xD9[\xA0-\xA9\xAB\xAC] # AN
)/x';
    // A single hit anywhere in the working text is enough.
    return preg_match($arabicBytes, $this->text) ? true : false;
}
/**
* Check whether input contains other right-to-left characters apart from Arabic
*
* @access private
* @return boolean Returns true if input contains right-to-left characters
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function checkText() {
    // UTF-8 byte sequences of the characters labelled R below, plus the
    // RLE and RLO control characters.
    $rtlBytes = '/(
\xD6\xBE # R
| \xD7[\x80\x83\x86\x90-\xAA\xB0-\xB4] # R
| \xDF[\x80-\xAA\xB4\xB5\xBA] # R
| \xE2\x80\x8F # R
| \xEF\xAC[\x9D\x9F\xA0-\xA8\xAA-\xB6\xB8-\xBC\xBE] # R
| \xEF\xAD[\x80\x81\x83\x84\x86-\x8F] # R
| \xF0\x90\xA0[\x80-\x85\x88\x8A-\xB5\xB7\xB8\xBC\xBF] # R
| \xF0\x90\xA4[\x80-\x99] # R
| \xF0\x90\xA8[\x80\x90-\x93\x95-\x97\x99-\xB3] # R
| \xF0\x90\xA9[\x80-\x87\x90-\x98] # R
| \xE2\x80[\xAB\xAE] # RLE & RLO
)/x';
    // A single hit anywhere in the working text is enough.
    return preg_match($rtlBytes, $this->text) ? true : false;
}
/**
* Check whether the current paragraph contains any right-to-left character
*
* @access private
* @return boolean Returns true if input contains any right-to-left character
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function checkFull() {
    // UTF-8 byte sequences of every character labelled AL, AN or R below,
    // plus the RLE and RLO control characters.
    $anyRtlBytes = '/(
\xD8[\x80-\x83\x8B\x8D\x9B\x9E\x9F\xA1-\xBA] # AL
| \xD9[\x80-\x8A\xAD-\xAF\xB1-\xBF] # AL
| \xDA[\x80-\xBF] # AL
| \xDB[\x80-\x95\x9D\xA5\xA6\xAE\xAF\xBA-\xBF] # AL
| \xDC[\x80-\x8D\x90\x92-\xAF] # AL
| \xDD[\x8D-\xAD] # AL
| \xDE[\x80-\xA5\xB1] # AL
| \xEF\xAD[\x90-\xBF] # AL
| \xEF\xAE[\x80-\xB1] # AL
| \xEF\xAF[\x93-\xBF] # AL
| \xEF[\xB0-\xB3][\x80-\xBF] # AL
| \xEF\xB4[\x80-\xBD] # AL
| \xEF\xB5[\x90-\xBF] # AL
| \xEF\xB6[\x80-\x8F\x92-\xBF] # AL
| \xEF\xB7[\x80-\x87\xB0-\xBC] # AL
| \xEF\xB9[\xB0-\xB4\xB6-\xBF] # AL
| \xEF\xBA[\x80-\xBF] # AL
| \xEF\xBB[\x80-\xBC] # AL
| \xD9[\xA0-\xA9\xAB\xAC] # AN
| \xD6\xBE # R
| \xD7[\x80\x83\x86\x90-\xAA\xB0-\xB4] # R
| \xDF[\x80-\xAA\xB4\xB5\xBA] # R
| \xE2\x80\x8F # R
| \xEF\xAC[\x9D\x9F\xA0-\xA8\xAA-\xB6\xB8-\xBC\xBE] # R
| \xEF\xAD[\x80\x81\x83\x84\x86-\x8F] # R
| \xF0\x90\xA0[\x80-\x85\x88\x8A-\xB5\xB7\xB8\xBC\xBF] # R
| \xF0\x90\xA4[\x80-\x99] # R
| \xF0\x90\xA8[\x80\x90-\x93\x95-\x97\x99-\xB3] # R
| \xF0\x90\xA9[\x80-\x87\x90-\x98] # R
| \xE2\x80[\xAB\xAE] # RLE & RLO
)/x';
    // Unlike checkArabic() and checkText(), this looks at the stored
    // result of the current paragraph rather than at the working text.
    $paragraph = $this->getResultNewText();
    return preg_match($anyRtlBytes, $paragraph) ? true : false;
}
/**
* Turns UTF-8 encoded input into an array of Unicode characters encoded in UTF-8
*
* @access private
* @return array Unicode characters encoded in UTF-8
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function UTF8Text2Array() {
    // Split the working text with the UTF8PATTERN constant; every full
    // match is one element of the returned array.
    $found = array();
    preg_match_all(constant('UTF8PATTERN'), $this->text, $found);
    $characters = $found[0];
    // Non-empty text that yields no match at all is treated as invalid
    // input and terminates the script.
    $nothingMatched = !count($characters);
    if ($nothingMatched and strlen($this->text)) {
        echo "\nInvalid text... Aborted!\n";
        exit;
    }
    return $characters;
}
// P1. Split the text into separate paragraphs. A paragraph separator is kept with the previous paragraph.
// Within each paragraph, apply all the other rules of this algorithm.
/**
* Split input into paragraphs
*
* Rule P1 of the Unicode Bidirectional Algorithm:
* Split the text into separate paragraphs.
* A paragraph separator is kept with the previous paragraph.
* Within each paragraph, apply all the other rules of this algorithm.
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getParagraphs() {
    // Entities are decoded first so that the rest works on real characters.
    $this->checkEntities();
    if ($this->html) {
        // Line breaks carry no meaning in HTML input.
        $this->text = preg_replace('/\r*\n/', ' ', $this->text);
        // Turn every dir="rtl|ltr" attribute into explicit directional
        // characters: the attribute is removed from its tag, the character
        // returned by changeDir() is placed right after the tag, and a
        // U+202C POP DIRECTIONAL FORMATTING is placed before the matching
        // closing tag.
        while (preg_match('/(<\s*(\w+?)\s*[^>]*(\s*dir\s*=\s*["\']\s*(rtl|ltr)\s*["\'])[^>]*>)/Uu', $this->text, $matches, PREG_OFFSET_CAPTURE)) {
            $tagStart = $matches[0][1];
            $fullTag = $matches[0][0];
            $tag = $matches[2][0];
            $newtag = str_replace($matches[3][0], '', $fullTag);
            $this->text = substr($this->text, 0, $tagStart) . $newtag . $this->changeDir($matches[4][0]) . substr($this->text, ($tagStart + strlen($fullTag)));
            $pat = '/<\s*\/*\s*' . $tag . '(\s*>|\s+[^>]*>)/Uu';
            preg_match_all($pat, $this->text, $sameTags, PREG_OFFSET_CAPTURE);
            // Walk over the tags of the same name, starting at the tag that
            // carried the attribute, and stop at the closing tag that
            // balances it. Tags located before it are irrelevant.
            $depth = 0;
            $closeAt = null;
            foreach ($sameTags[0] as $candidate) {
                if ($candidate[1] < $tagStart) {
                    continue;
                }
                if (preg_match('/^<\s*\//', $candidate[0])) {
                    $depth--;
                } else {
                    $depth++;
                }
                if ($depth <= 0) {
                    $closeAt = $candidate[1];
                    break;
                }
            }
            // Without a matching closing tag no terminator is inserted.
            // NOTE(review): this assumes parseBiDiText() starts every
            // paragraph with a fresh embedding state (rule X8) - confirm.
            if ($closeAt !== null) {
                $this->text = substr($this->text, 0, $closeAt) . chr(226) . chr(128) . chr(172) . substr($this->text, $closeAt);
            }
        }
        // Block-level tags delimit paragraphs; the tags themselves are kept
        // in $split_tags.
        preg_match_all('/(<\s*(p|div|br|td|th|hr|h\d|legend|input|ol|ul|pre)\s*[^>]*>)/Uu', $this->text, $this->split_tags);
        $this->paragraphs = preg_split('/(<\s*(p|div|br|td|th|hr|h\d|legend|input|ol|ul|pre)\s*[^>]*>)/Uu', $this->text);
    } else {
        // Collapse runs of three or more line feeds, then split on blank lines.
        while (preg_match('/\n\n\n/', $this->text)) {
            $this->text = preg_replace('/(\r*\n\r*\n)\r*\n/', '\\1', $this->text);
        }
        $this->paragraphs = preg_split('/\r*\n\r*\n/', $this->text);
    }
}
/**
* Turn HTML Entities into Unicode character code values
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function checkEntities() {
    // The /e (eval) modifier no longer exists in PHP 7+, where preg_replace()
    // would return NULL and wipe the text; callbacks do the same job without
    // evaluating strings as code.
    // Named entities first: &name;
    $decoded = preg_replace_callback('/&(\w+);/', function ($m) {
        return $this->decodeEntity($m[1]);
    }, $this->text);
    if ($decoded !== null) {
        $this->text = $decoded;
    }
    // Then decimal numeric entities: &#123;
    $decoded = preg_replace_callback('/&#(\d+);/', function ($m) {
        return UniChar::encodeUTF8Num($m[1]);
    }, $this->text);
    // NULL signals a PCRE failure; keep the text unchanged in that case.
    if ($decoded !== null) {
        $this->text = $decoded;
    }
    // NOTE(review): for HTML input this also decodes &lt; / &gt; / &amp;
    // before the tags are parsed, exactly as before - confirm this is wanted.
}
/**
* Decode an HTML Entity
*
* @access private
* @param string [$entity] The name of the HTML Entity
* @return string The character encoded in UTF-8
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function decodeEntity($entity) {
    // Decode straight to UTF-8. Taking ord() of the decoded string would keep
    // only its first byte, which is wrong for every character outside ASCII
    // when the decoder produces UTF-8. An unknown entity name is returned
    // untouched ("&name;") because html_entity_decode() leaves it as is.
    $source = '&' . $entity . ';';
    return html_entity_decode($source, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}
/**
* Set the paragraph embedding level
*
* Rule P2 of the Unicode Bidirectional Algorithm:
* In each paragraph, find the first character of type L, AL, or R.
*
* Rule P3 of the Unicode Bidirectional Algorithm:
* If a character is found in P2 and it is of type AL or R, then set
* the paragraph embedding level to one; otherwise, set it to zero.
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getBiDiText() {
    // getTextArray() returns by value, so binding its result by reference
    // only triggers a notice; a plain assignment is all that is needed for
    // this read-only scan.
    $ta = $this->getTextArray();
    // P2/P3: the first character typed L, AL or R decides the paragraph
    // embedding level. $pel is left untouched when none is found.
    foreach ($ta as $char) {
        $type = $char->getTypeChar();
        if ($type == 'L') {
            $this->pel = 0;
            break;
        }
        if ($type == 'AL' or $type == 'R') {
            $this->pel = 1;
            break;
        }
    }
}
/**
* Complete the first stage of parsing
*
* Apply rules X1-X10, W1-W7, N1-N2 and I1-I2
* of the Unicode Bidirectional Algorithm
*
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function parseBiDiText() {
$neutral = array('B', 'S', 'WS', 'ON');
$lrm = 8206;
$rlm = 8207;
$lre = 8234;
$rle = 8235;
$pdf = 8236;
$lro = 8237;
$rlo = 8238;
$ta = &$this->getTextArray();
// X1. Begin by setting the current embedding level to the paragraph embedding level.
// Set the directional override status to neutral. Process each character iteratively, applying rules X2 through X9.
// Only embedding levels from 0 to 61 are valid in this phase.
$cel = $this->pel;
$dos = 'N';
$remember = array();
$sor = $this->pel%2 ? 'R' : 'L';
$levels = array(array('level' => $cel, 'sor' => $sor, 'eor' => '', 'chars' => array()));
$current_level = &$levels[count($levels) -1];
$i = 0;
while ($i < count($ta)) {
$char = &$ta[$i];
// X2. With each RLE, compute the least greater odd embedding level.
// a. If this new level would be valid, then this embedding code is valid. Remember (push) the current embedding level and override status.
//Reset the current level to this new level, and reset the override status to neutral.
// b. If the new level would not be valid, then this code is invalid. Do not change the current level or override status.
if ($char->getNumber() == $rle) {
$next_level = $cel+($cel%2) +1;
if ($next_level < 62) {
$remember[] = array('num' => $rle, 'cel' => $cel, 'dos' => $dos);
$cel = $next_level;
$sor = $cel%2 ? 'R' : 'L';
$current_level['eor'] = $sor;
$levels[] = array('level' => $cel, 'sor' => '', 'eor' => '', 'chars' => array());
$current_level = &$levels[count($levels) -1];
$current_level['sor'] = $sor;
$dos = 'N';
}
}
// X3. With each LRE, compute the least greater even embedding level.
// a. If this new level would be valid, then this embedding code is valid. Remember (push) the current embedding level and override status.
// Reset the current level to this new level, and reset the override status to neutral.
// b. If the new level would not be valid, then this code is invalid. Do not change the current level or override status.
elseif ($char->getNumber() == $lre) {
$next_level = $cel+2-($cel%2);
if ($next_level < 62) {
$remember[] = array('num' => $lre, 'cel' => $cel, 'dos' => $dos);
$cel = $next_level;
$sor = $cel%2 ? 'R' : 'L';
$current_level['eor'] = $sor;
$levels[] = array('level' => $cel, 'sor' => '', 'eor' => '', 'chars' => array());
$current_level = &$levels[count($levels) -1];
$current_level['sor'] = $sor;
$dos = 'N';
}
}
// X4. With each RLO, compute the least greater odd embedding level.
// a. If this new level would be valid, then this embedding code is valid. Remember (push) the current embedding level and override status.
// Reset the current level to this new level, and reset the override status to right-to-left.
elseif ($char->getNumber() == $rlo) {
$next_level = $cel+($cel%2) +1;
if ($next_level < 62) {
$remember[] = array('num' => $rlo, 'cel' => $cel, 'dos' => $dos);
$cel = $next_level;
$sor = $cel%2 ? 'R' : 'L';
$current_level['eor'] = $sor;
$levels[] = array('level' => $cel, 'sor' => '', 'eor' => '', 'chars' => array());
$current_level = &$levels[count($levels) -1];
$current_level['sor'] = $sor;
$dos = 'R';
}
}
// X5. With each LRO, compute the least greater even embedding level.
// a. If this new level would be valid, then this embedding code is valid. Remember (push) the current embedding level and override status.
// Reset the current level to this new level, and reset the override status to left-to-right.
// b. If the new level would not be valid, then this code is invalid. Do not change the current level or override status.
elseif ($char->getNumber() == $lro) {
$next_level = $cel+2-($cel%2);
if ($next_level < 62) {
$remember[] = array('num' => $lro, 'cel' => $cel, 'dos' => $dos);
$cel = $next_level;
$sor = $cel%2 ? 'R' : 'L';
$current_level['eor'] = $sor;
$levels[] = array('level' => $cel, 'sor' => '', 'eor' => '', 'chars' => array());
$current_level = &$levels[count($levels) -1];
$current_level['sor'] = $sor;
$dos = 'L';
}
}
// X7. With each PDF, determine the matching embedding or override code.
// If there was a valid matching code, restore (pop) the last remembered (pushed) embedding level and directional override.
elseif ($char->getNumber() == $pdf) {
if (count($remember)) {
$last = count($remember) -1;
if ($remember[$last]['num'] == $rle or $remember[$last]['num'] == $lre or $remember[$last]['num'] == $rlo or $remember[$last]['num'] == $lro) {
$match = array_pop($remember);
$sor = ($cel > $match['cel'] ? $cel : $match['cel']) %2 ? 'R' : 'L';
$cel = $match['cel'];
$current_level['eor'] = $sor;
$levels[] = array('level' => $cel, 'sor' => '', 'eor' => '', 'chars' => array());
$current_level = &$levels[count($levels) -1];
$current_level['sor'] = $sor;
$dos = $match['dos'];
}
}
}
// X6. For all types besides RLE, LRE, RLO, LRO, and PDF:
// a. Set the level of the current character to the current embedding level.
// b. Whenever the directional override status is not neutral, reset the current character type to the directional override status.
elseif ($char->getTypeChar() != 'BN') {
$char->setLevel($cel);
$current_level['chars'][] = $char;
if ($dos != 'N') {
$char->setTypeBiDi($dos);
}
}
// X8. All explicit directional embeddings and overrides are completely terminated at the end of each paragraph.
// Paragraph separators are not included in the embedding.
$i++;
}
$last = &$levels[count($levels) -1];
$last['eor'] = ($last['level'] > $this->pel ? $last['level'] : $this->pel) %2 ? 'R' : 'L';
// X9. Remove all RLE, LRE, RLO, LRO, PDF, and BN codes.
// Note that an implementation does not have to actually remove the codes; it just has to behave as though the codes were not present for the remainder of the algorithm.
// Conformance does not require any particular placement of these codes as long as all other characters are ordered correctly.
// The zero width joiner and non-joiner affect the shaping of the adjacent characters - those that are adjacent in the original backing-store order,
// even though those characters may end up being rearranged to be non-adjacent by the Bidirectional Algorithm.
// X10. The remaining rules are applied to each run of characters at the same level.
// For each run, determine the start-of-level-run (sor) and end-of-level-run (eor) type, either L or R.
// This depends on the higher of the two levels on either side of the boundary
// (at the start or end of the paragraph, the level of the "other" run is the base embedding level).
// If the higher level is odd, the type is R; otherwise, it is L.
$i = 0;
while ($i < count($levels)) {
$current = &$levels[$i];
$chars = &$current['chars'];
$num_of_chars = count($chars);
// W1. Examine each nonspacing mark (NSM) in the level run, and change the type of the NSM to the type of the previous character.
// If the NSM is at the start of the level run, it will get the type of sor.
// W2. Search backward from each instance of a European number until the first strong type (R, L, AL, or sor) is found.
// If an AL is found, change the type of the European number to Arabic number.
$j = 0;
while ($j < $num_of_chars) {
if ($chars[$j]->getTypeBiDi() == 'NSM' and $j == 0) {
$chars[$j]->setTypeBiDi($current['sor']);
} elseif ($chars[$j]->getTypeBiDi() == 'NSM' and $j > 0) {
$chars[$j]->setTypeBiDi($chars[$j-1]->getTypeBiDi());
}
if ($chars[$j]->getTypeBiDi() == 'EN' and $j > 0) {
$k = $j;
while ($k) {
$k--;
if ($chars[$k]->getTypeBiDi() == 'R' or $chars[$k]->getTypeBiDi() == 'L') {
break;
} elseif ($chars[$k]->getTypeBiDi() == 'AL') {
$chars[$j]->setTypeBiDi('AN');
break;
}
}
}
$j++;
}
// W3. Change all ALs to R.
// W4. A single European separator between two European numbers changes to a European number.
// A single common separator between two numbers of the same type changes to that type.
// W5. A sequence of European terminators adjacent to European numbers changes to all European numbers.
// W6. Otherwise, separators and terminators change to Other Neutral.
$j = 0;
while ($j < $num_of_chars) {
if ($chars[$j]->getTypeBiDi() == 'AL') {
$chars[$j]->setTypeBiDi('R');
} elseif ($chars[$j]->getTypeBiDi() == 'EN' and $j < $num_of_chars-2) {
if (($chars[$j+1]->getTypeBiDi() == 'ES' or $chars[$j+1]->getTypeBiDi() == 'CS') and $chars[$j+2]->getTypeBiDi() == 'EN') {
$chars[$j+1]->setTypeBiDi('EN');
} elseif ($chars[$j+1]->getTypeBiDi() == 'ES' or $chars[$j+1]->getTypeBiDi() == 'CS') {
$chars[$j+1]->setTypeBiDi('ON');
}
} elseif ($chars[$j]->getTypeBiDi() == 'AN' and $j < $num_of_chars-2) {
if ($chars[$j+1]->getTypeBiDi() == 'CS' and $chars[$j+2]->getTypeBiDi() == 'AN') {
$chars[$j+1]->setTypeBiDi('AN');
} elseif ($chars[$j+1]->getTypeBiDi() == 'CS') {
$chars[$j+1]->setTypeBiDi('ON');
}
} elseif ($chars[$j]->getTypeBiDi() == 'ET' and $num_of_chars > 1) {
if ($j == $num_of_chars-1) {
if ($chars[$j-1]->getTypeBiDi() == 'EN') {
$chars[$j]->setTypeBiDi('EN');
} else {
$chars[$j]->setTypeBiDi('ON');
}
} elseif ($j == 0) {
if ($chars[$j+1]->getTypeBiDi() == 'EN') {
$chars[$j]->setTypeBiDi('EN');
} elseif ($chars[$j+1]->getTypeBiDi() == 'ET') {
$k = $j+1;
while ($chars[$k]->getTypeBiDi() == 'ET') {
$k++;
if ($k < $num_of_chars and $chars[$k]->getTypeBiDi() == 'EN') {
for ($m = $k-1;$m < $j;$m--) {
$chars[$m]->setTypeBiDi('EN');
}
} elseif ($k < $num_of_chars and $chars[$k]->getTypeBiDi() == 'ET') {
continue;
} else {
for ($m = $k-1;$m < $j;$m--) {
$chars[$m]->setTypeBiDi('ON');
}
}
}
} else {
$chars[$j]->setTypeBiDi('ON');
}
} else {
if ($chars[$j-1]->getTypeBiDi() == 'EN' or $chars[$j+1]->getTypeBiDi() == 'EN') {
$chars[$j]->setTypeBiDi('EN');
} elseif ($chars[$j+1]->getTypeBiDi() == 'ET') {
$k = $j+1;
while ($chars[$k]->getTypeBiDi() == 'ET') {
$k++;
if ($k < $num_of_chars and $chars[$k]->getTypeBiDi() == 'EN') {
for ($m = $k-1;$m < $j;$m--) {
$chars[$m]->setTypeBiDi('EN');
}
} elseif ($k < $num_of_chars and $chars[$k]->getTypeBiDi() == 'ET') {
continue;
} else {
for ($m = $k-1;$m < $j;$m--) {
$chars[$m]->setTypeBiDi('ON');
}
}
}
} else {
$chars[$j]->setTypeBiDi('ON');
}
}
} elseif ($chars[$j]->getTypeBiDi() == 'ES' or $chars[$j]->getTypeBiDi() == 'CS' or $chars[$j]->getTypeBiDi() == 'ET') {
$chars[$j]->setTypeBiDi('ON');
}
$j++;
}
// W7. Search backward from each instance of a European number until the first strong type (R, L, or sor) is found.
// If an L is found, then change the type of the European number to L.
$j = 0;
while ($j < $num_of_chars) {
if ($chars[$j]->getTypeBiDi() == 'EN' and $j > 0) {
$k = $j;
while ($k) {
$k--;
if ($chars[$k]->getTypeBiDi() == 'R') {
break;
} elseif ($chars[$k]->getTypeBiDi() == 'L' or ($k == 0 and $current['sor'] == 'L')) {
$chars[$j]->setTypeBiDi('L');
break;
}
}
} elseif ($chars[$j]->getTypeBiDi() == 'EN' and $current['sor'] == 'L') {
$chars[$j]->setTypeBiDi('L');
}
$j++;
}
// N1. A sequence of neutrals takes the direction of the surrounding strong text if the text on both sides has the same direction.
// European and Arabic numbers act as if they were R in terms of their influence on neutrals.
// Start-of-level-run (sor) and end-of-level-run (eor) are used at level run boundaries.
// N2. Any remaining neutrals take the embedding direction.
$j = 0;
while ($j < $num_of_chars) {
if (in_array($chars[$j]->getTypeBiDi(), $neutral)) {
if ($num_of_chars == 1) {
$chars[$j]->setTypeBiDi($current['level']%2 ? 'R' : 'L');
} else {
$start_char = $j;
if ($j == 0) {
$first_char = $current['sor'];
} else {
$first_char = $chars[$j-1]->getTypeBiDi();
}
while (1) {
$j++;
$end_char = $j;
if ($j == $num_of_chars) {
$last_char = $current['eor'];
break;
} elseif (in_array($chars[$j]->getTypeBiDi(), $neutral)) {
continue;
} else {
$last_char = $chars[$j]->getTypeBiDi();
break;
}
}
$right_context = array('AN', 'EN', 'R');
if ($first_char == 'L' and $last_char == 'L') {
for ($k = $start_char;$k < $end_char;$k++) {
$chars[$k]->setTypeBiDi('L');
}
} elseif (in_array($first_char, $right_context) and in_array($last_char, $right_context)) {
for ($k = $start_char;$k < $end_char;$k++) {
$chars[$k]->setTypeBiDi('R');
}
} else {
for ($k = $start_char;$k < $end_char;$k++) {
$chars[$k]->setTypeBiDi($current['level']%2 ? 'R' : 'L');
}
}
}
}
$j++;
}
$i++;
}
// I1. For all characters with an even (left-to-right) embedding direction, those of type R go up one level and those of type AN or EN go up two levels.
// I2. For all characters with an odd (right-to-left) embedding direction, those of type L, EN or AN go up one level.
$i = 0;
$j = 0;
$this->new_levels = array();
$x = 0;
while ($i < count($levels)) {
$current = &$levels[$i];
if (!count($current['chars'])) {
$i++;
continue;
}
$odd = $current['level']%2;
$this->new_levels[] = array('level' => '', 'chars' => array());
$last_level = &$this->new_levels[count($this->new_levels) -1];
$first_char = &$current['chars'][0];
$previous_bidi = $first_char->getTypeBiDi();
$previous_level = $first_char->getLevel();
if ($odd) {
if ($previous_bidi == 'L' or $previous_bidi == 'AN' or $previous_bidi == 'EN') {
$previous_level = $first_char->getLevel() +1;
$first_char->setLevel($previous_level);
}
} else {
if ($previous_bidi == 'R') {
$previous_level = $first_char->getLevel() +1;
$first_char->setLevel($previous_level);
} elseif ($previous_bidi == 'AN' or $previous_bidi == 'EN') {
$previous_level = $first_char->getLevel() +2;
$first_char->setLevel($previous_level);
}
}
$last_level['level'] = $first_char->getLevel();
$last_level['chars'][] = $first_char;
$k = 1;
while ($k < count($current['chars'])) {
if (isset($char)) {
unset($char);
}
$char = &$current['chars'][$k];
if ($previous_bidi == $char->getTypeBiDi()) {
$char->setLevel($previous_level);
$last_level['chars'][] = $char;
} else {
$this->new_levels[] = array('level' => '', 'chars' => array());
if (isset($last_level)) {
unset($last_level);
}
$last_level = &$this->new_levels[count($this->new_levels) -1];
$first_char = &$current['chars'][$k];
$previous_bidi = $char->getTypeBiDi();
$previous_level = $first_char->getLevel();
if ($odd) {
if ($previous_bidi == 'L' or $previous_bidi == 'AN' or $previous_bidi == 'EN') {
$previous_level = $char->getLevel() +1;
$char->setLevel($previous_level);
}
} else {
if ($previous_bidi == 'R') {
$previous_level = $char->getLevel() +1;
$char->setLevel($previous_level);
} elseif ($previous_bidi == 'AN' or $previous_bidi == 'EN') {
$previous_level = $char->getLevel() +2;
$char->setLevel($previous_level);
}
}
$last_level['level'] = $char->getLevel();
$last_level['chars'][] = $char;
}
$k++;
}
$i++;
}
if ($this->html) {
$open = array();
for ($i = 0;$i < count($this->new_levels);$i++) {
$current = &$this->new_levels[$i];
$chars = &$current['chars'];
$last = count($chars) - 1;
for ($j = 0;$j <= $last;$j++) {
$char = &$chars[$j];
$ts = $char->getTagSize();
for ($k = 0;$k < $ts;$k++) {
$cur_tag = $char->getTag($k);
$op = 1;
$l = 0;
while ($l < count($open)) {
if ($open[$l] == $cur_tag) {
$op = 0;
array_splice($open, $l, 1);
if (!$j) {
$char->setTag($cur_tag);
}
break;
}
$l++;
}
if ($op) {
array_unshift($open, $cur_tag);
}
}
if (!$j) {
$l = 0;
while ($l < count($open)) {
$char->setTag($open[$l]);
$l++;
}
}
}
$char = &$chars[$last];
$l = 0;
while ($l < count($open)) {
$char->setTag($open[$l]);
$l++;
}
}
}
}
/**
* Complete the second stage of parsing
*
* Apply rules L1-L4 of the Unicode Bidirectional Algorithm
*
* @access private
* @param integer [$start] Offset where line begins in paragraph
* @param integer [$length] Length of line
* @return string The line in visual order
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getBiDiLine($start, $length) {
    // Purpose: pick $length characters of the current paragraph starting at
    // offset $start, apply rules L1, L2 and L4 to them, remember the reordered
    // character objects in $this->line_chars and return their concatenated
    // (mirrored) glyphs. Tags still open at the end of the line are stored in
    // $this->line_tags so that the next call can re-open them.
    $runs = &$this->result[$this->par]['levels'];
    // A zero/empty $length means "no limit": a negative counter is decremented
    // but never becomes falsy, so every remaining character is taken.
    $remaining = $length ? $length : -1;
    $skip = $start;
    $line = array();
    $run_count = count($runs);
    for ($i = 0;$i < $run_count;$i++) {
        $run_chars = $runs[$i]['chars'];
        $run_size = count($run_chars);
        if ($skip >= $run_size) {
            // The line starts after this run.
            $skip-= $run_size;
            continue;
        }
        for ($j = $skip;$remaining and $j < $run_size;$j++) {
            $line[] = $run_chars[$j];
            $remaining--;
        }
        $skip = 0;
    }
    $size = count($line);
    if (!$size) {
        // Nothing falls inside the requested range. Tags carried over from the
        // previous line stay pending instead of being applied to a missing
        // character (which used to be a fatal error).
        $this->line_chars = array();
        return '';
    }
    // Re-open, on the first character, the tags left open by the previous line.
    if (count($this->line_tags)) {
        foreach ($this->line_tags as $tag) {
            $line[0]->setTag($tag);
        }
        $this->line_tags = array();
    }
    // L1. On each line, reset the embedding level of the following characters to the paragraph embedding level:
    // 1. Segment separators,
    // 2. Paragraph separators,
    // 3. Any sequence of whitespace characters preceding a segment separator or paragraph separator, and
    // 4. Any sequence of white space characters at the end of the line.
    // The types of characters used here are the original types, not those modified by the previous phase.
    $pel = $this->pels[$this->par];
    $open = array();
    for ($i = 0;$i < $size;$i++) {
        $char = $line[$i];
        $type = $char->getTypeChar();
        if ($type == 'B' or $type == 'S') {
            $char->setLevel($pel);
        } elseif ($type == 'WS') {
            // Find the end of the whitespace run that starts here.
            $j = $i + 1;
            while ($j < $size and $line[$j]->getTypeChar() == 'WS') {
                $j++;
            }
            // Only runs that reach the end of the line or a separator are reset.
            if ($j == $size or $line[$j]->getTypeChar() == 'B' or $line[$j]->getTypeChar() == 'S') {
                for ($k = $i;$k < $j;$k++) {
                    $line[$k]->setLevel($pel);
                }
            }
        }
        // Track which tags are open: a tag already on the stack is closed by
        // its second occurrence, any other tag is pushed on top of the stack.
        $ts = $char->getTagSize();
        if ($ts) {
            // $op deliberately keeps its original scope (one flag per
            // character): once a tag of this character closed something, its
            // remaining tags are never pushed as newly opened.
            // NOTE(review): the first-stage code resets this flag per tag -
            // confirm which of the two behaviours is intended.
            $op = 1;
            for ($j = 0;$j < $ts;$j++) {
                $cur_tag = $char->getTag($j);
                $pos = array_search($cur_tag, $open);
                if ($pos !== false) {
                    $op = 0;
                    array_splice($open, $pos, 1);
                } elseif ($op) {
                    array_unshift($open, $cur_tag);
                }
            }
        }
    }
    // Mark the tags that are still open on the last character of the line and
    // hand them over to the next line.
    foreach ($open as $tag) {
        $line[$size - 1]->setTag($tag);
    }
    $this->line_tags = $open;
    // L2. From the highest level found in the text to the lowest odd level on each line, including intermediate levels not actually present in the text,
    // reverse any contiguous sequence of characters that are at that level or higher.
    $highest = 0;
    $lowest = 1000; // sentinel: no reversal pass runs when the line has no odd level
    foreach ($line as $char) {
        $level = $char->getLevel();
        if ($level > $highest) {
            $highest = $level;
        }
        if ($level%2 and $level < $lowest) {
            $lowest = $level;
        }
    }
    while ($highest >= $lowest) {
        $reordered = array();
        $pending = array(); // current contiguous run at level >= $highest
        foreach ($line as $char) {
            if ($char->getLevel() >= $highest) {
                $pending[] = $char;
                continue;
            }
            if (count($pending)) {
                $reordered = array_merge($reordered, array_reverse($pending));
                $pending = array();
            }
            $reordered[] = $char;
        }
        if (count($pending)) {
            $reordered = array_merge($reordered, array_reverse($pending));
        }
        $line = $reordered;
        $highest--;
    }
    // L3 (combining marks) is not handled by any code in this method.
    // L4. A character is depicted by a mirrored glyph if and only if (a) the resolved directionality of that character is R,
    // and (b) the Bidi_Mirrored property value of that character is true.
    // That decision is delegated to getMirror() of each character object.
    $ret = '';
    foreach ($line as $char) {
        $ret.= $char->getMirror();
    }
    $this->line_chars = $line;
    return $ret;
}
/**
* Create Arabic ligatures
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function fixArabChars() {
// Purpose: walk every odd-level (right-to-left) run of $this->new_levels, group
// its Arabic characters into words, replace each character's number with its
// medial / initial / final / isolated value depending on whether it links to the
// neighbouring characters, and merge a lam followed by a final alef into one
// lam-alef code. Works in place on the character objects; returns nothing.
$levels = &$this->new_levels;
$i = 0;
while ($i < count($levels)) {
// Runs with an even level are left untouched.
if (!($levels[$i]['level']%2)) {
$i++;
continue;
}
$chars = &$levels[$i]['chars'];
// $arabic holds one array of character references per word; $chars_j holds,
// in parallel, the index of each of those characters inside $chars.
$arabic = array(array());
$chars_j = array(array());
$last_word = &$arabic[count($arabic) -1];
$last_j = &$chars_j[count($chars_j) -1];
$j = 0;
while ($j < count($chars)) {
// A character belongs to a word when its number lies in 1536..1791 with
// character type 'AL', or when its number is 8204 or 8205
// (U+0600..U+06FF, U+200C and U+200D if getNumber() is the Unicode code
// point - presumably it is; confirm in unicode.php).
if (($chars[$j]->getNumber() > 1535 and $chars[$j]->getNumber() < 1792 and $chars[$j]->getTypeChar() == 'AL') or $chars[$j]->getNumber() == 8204 or $chars[$j]->getNumber() == 8205) {
$last_word[] = &$chars[$j];
$last_j[] = $j;
} else {
// Any other character ends the current word; a new empty word is started
// only if the current one already has content.
if (count($last_word)) {
$arabic[] = array();
$chars_j[] = array();
$last_word = &$arabic[count($arabic) -1];
$last_j = &$chars_j[count($chars_j) -1];
}
}
$j++;
}
// Drop the trailing word if it stayed empty.
if (!count($last_word)) {
unset($last_word);
unset($last_j);
array_pop($arabic);
array_pop($chars_j);
}
// $spl collects the indexes (in $chars) of characters to delete afterwards.
$spl = array();
$j = 0;
while ($j < count($arabic)) {
// $link_prev: did the previous character of this word link forward?
$link_prev = 0;
// Words made of a single character are not modified at all.
if (count($arabic[$j]) > 1) {
for ($k = 0;$k < count($arabic[$j]);$k++) {
// The BiDi type is saved here and written back after the number changed
// (presumably setNumber() may alter it - confirm in unicode.php).
$oldtb = $arabic[$j][$k]->getTypeBiDi();
// $link_next: does this character link to the following one?
// A truthy getJoiner() decides on its own (only the value 4 links forward);
// NOTE(review): the meaning of the joiner values is not visible here.
if ($join = $arabic[$j][$k]->getJoiner()) {
if ($join == 4) {
$link_next = 1;
} else {
$link_next = 0;
}
} else {
// Otherwise the character links forward when it is not the last one of the
// word, its own getArLetSize() exceeds 2 and the next character's
// getArLetSize() is neither 0 nor 1 (presumably the number of available
// letter forms - confirm).
if ($k != count($arabic[$j]) -1 and $arabic[$j][$k]->getArLetSize() > 2 and $arabic[$j][$k+1]->getArLetSize() and $arabic[$j][$k+1]->getArLetSize() != 1) {
$link_next = 1;
} else {
$link_next = 0;
}
}
// Linked on both sides: medial value.
if ($link_prev and $link_next and $arabic[$j][$k]->getArLetSize() > 2) {
if ($arabic[$j][$k]->getArLetMedial()) {
$arabic[$j][$k]->setNumber($arabic[$j][$k]->getArLetMedial());
}
$link_prev = 1;
// Linked to the next character only: initial value.
} elseif (!$link_prev and $link_next and $arabic[$j][$k]->getArLetSize() > 2) {
if ($arabic[$j][$k]->getArLetInitial()) {
$arabic[$j][$k]->setNumber($arabic[$j][$k]->getArLetInitial());
}
$link_prev = 1;
// Linked to the previous character only: final value.
} elseif ($link_prev and !$link_next and $arabic[$j][$k]->getArLetSize() > 1) {
if ($arabic[$j][$k]->getArLetFinal()) {
$arabic[$j][$k]->setNumber($arabic[$j][$k]->getArLetFinal());
}
// Lam-alef: when this character became 65166 (U+FE8E, alef final form) and
// the previous one is 65247 / 65248 (U+FEDF / U+FEE0, lam initial / medial
// form), this character becomes 65275 / 65276 (U+FEFB / U+FEFC, lam-alef
// isolated / final form) and the previous character is queued for removal.
if ($k and $arabic[$j][$k]->getNumber() == 65166) {
switch ($arabic[$j][$k-1]->getNumber()) {
case "65247":
$arabic[$j][$k]->setNumber(65275);
$spl[] = $chars_j[$j][$k-1];
break;
case "65248":
$arabic[$j][$k]->setNumber(65276);
$spl[] = $chars_j[$j][$k-1];
break;
default:
break;
}
}
$link_prev = 0;
// Not linked (or no suitable form available): isolated value.
} else {
if ($arabic[$j][$k]->getArLetIsolated()) {
$arabic[$j][$k]->setNumber($arabic[$j][$k]->getArLetIsolated());
}
$link_prev = 0;
}
$arabic[$j][$k]->setTypeBiDi($oldtb);
// Pass the (possibly changed) number to encodeUTF8Num(), presumably to
// refresh the character's UTF-8 representation - confirm in unicode.php.
$arabic[$j][$k]->encodeUTF8Num($arabic[$j][$k]->getNumber());
}
}
$j++;
}
// Remove the queued characters from the run, last index first so that the
// remaining queued indexes stay valid while splicing.
for ($k = count($spl) -1;$k >= 0;$k--) {
array_splice($chars, $spl[$k], 1);
}
$i++;
}
}
/**
* Create 'newtext' at the end of the first stage of processing
*
* @access private
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getText() {
    // Join the letters of every character of every run in $this->new_levels,
    // in stored order, and hand the resulting string to setText().
    $joined = '';
    foreach ($this->new_levels as $run) {
        foreach ($run['chars'] as $character) {
            $joined.= $character->getLetter();
        }
    }
    $this->setText($joined);
}
/**
* Place the tags back where they were stripped from.
*
* @access private
* @return string Processed text with tags
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function fixTags() {
    // Purpose: rebuild the paragraph text from $this->new_levels and re-insert
    // the tags recorded for the current paragraph ($this->tags[$this->par]) at
    // their character offsets. The result is prefixed with the paragraph's
    // entry in $this->split_tags[0], when there is one.
    //
    // Each recorded tag is an array with the keys 'offset' and 'tag'. Tags are
    // consumed in stored order, one "current" tag at a time.
    $pending = array();
    if (isset($this->tags[$this->par]) and is_array($this->tags[$this->par])) {
        $pending = $this->tags[$this->par];
    }
    // null means "no current tag". It used to be an undefined variable at
    // first and the string '' later on, and indexing that string with
    // 'offset' is a fatal error on PHP 8.
    $tag = null;
    $ret = '';
    $char_counter = 0;
    foreach ($this->new_levels as $run) {
        foreach ($run['chars'] as $char) {
            // Fetch the next tag once the previous one has been written out.
            while (!is_array($tag) and count($pending)) {
                $tag = array_shift($pending);
                // Tags that share the same offset are emitted as one string.
                while (is_array($tag) and count($pending) and $tag['offset'] === $pending[0]['offset']) {
                    $tag['tag'].= $pending[0]['tag'];
                    array_shift($pending);
                }
            }
            $letter = $char->getLetter();
            if (is_array($tag) and $tag['offset'] == $char_counter) {
                if (preg_match('/<\s*\//', $tag['tag'])) {
                    // The tag text contains a closing tag: it follows the letter.
                    $ret.= $letter . $tag['tag'];
                } else {
                    $ret.= $tag['tag'] . $letter;
                }
                $tag = null;
            } else {
                $ret.= $letter;
            }
            $char_counter++;
        }
    }
    $prefix = isset($this->split_tags[0][$this->par]) ? $this->split_tags[0][$this->par] : '';
    return $prefix . $ret;
}
/**
* Create hexadecimal pattern for preg_match()
*
* @access private
* @param string [$str] String to be turned into pattern
* @return string Pattern of hexadecimal character representations
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getPattern($str) {
    // Turns every byte of $str into a '\x' escape followed by the byte value in
    // upper-case hexadecimal (no zero padding), so that the result can be used
    // inside a regular expression to match $str literally.
    $str = (string)$str;
    $ret = '';
    $len = strlen($str);
    for ($i = 0;$i < $len;$i++) {
        // Square brackets: the curly-brace string offset syntax ($str{$i}) was
        // removed in PHP 8.0 and is a parse error there.
        $ret.= '\x' . strtoupper(dechex(ord($str[$i])));
    }
    return $ret;
}
/**
* Get RLO or LRO
*
* @access private
* @param string [$dir] Tag attribute 'dir' value (either 'rtl' or 'ltr')
* @return string RLO or LRO encoded in UTF-8
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function changeDir($dir) {
    // UTF-8 byte sequences E2 80 AE (U+202E, right-to-left override) and
    // E2 80 AD (U+202D, left-to-right override). Anything other than 'rtl'
    // selects the left-to-right override.
    return ($dir == 'rtl') ? "\xE2\x80\xAE" : "\xE2\x80\xAD";
}
/**
* Restore tags in tagless text.
*
* @access private
* @param integer [$start] Offset where line begins in paragraph
* @param integer [$length] Length of line
* @return string The new line in right-to-left order with tags.
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function restoreTags($start, $length) {
    // Purpose: render the characters kept in $this->line_chars (filled by
    // getBiDiLine()) together with the tags attached to them. A tag seen for
    // the first time is written in front of its character and pushed on the
    // stack of open tags; a tag that is already open is closed after the
    // character. The paragraph's wrapper tag ($this->split_tags[0]) is added
    // in front of the first line and closed after the last one.
    //
    // Captures the element name of an opening tag in group 1.
    $tag_pattern = '/<\s*([^\s>]+)(\s+[^>]+)*>/';
    $ret = '';
    $open = array();
    $char_count = count($this->line_chars);
    for ($i = 0;$i < $char_count;$i++) {
        $char = $this->line_chars[$i];
        $ts = $char->getTagSize();
        $c = $char->getMirror();
        if (!$ts) {
            $ret.= $c;
            continue;
        }
        // $op keeps its original scope (one flag per character): after the
        // first tag that closes something, the remaining tags of this
        // character are no longer written as opening tags.
        $op = 1;
        $snap = $open; // tags that were open before this character
        $clt = array(); // closing tags produced by this character
        for ($j = 0;$j < $ts;$j++) {
            $cur_tag = $char->getTag($j);
            $pos = array_search($cur_tag, $open);
            if ($pos !== false) {
                $op = 0;
                // $m[1] is the complete element name. The former $m[1][0] was
                // only its first character, so <span> was closed with </s>.
                $name = preg_match($tag_pattern, $cur_tag, $m) ? $m[1] : '';
                $clt[] = '</' . $name . '>';
                array_splice($open, $pos, 1);
            } elseif ($op) {
                $c = $cur_tag . $c;
                array_unshift($open, $cur_tag);
            }
        }
        // Order the closing tags: those without a counterpart in $snap come
        // first, followed by the others in the order obtained by walking
        // $snap from its last entry to its first.
        $nclt = array();
        for ($j = count($snap) - 1;$j >= 0;$j--) {
            $name = preg_match($tag_pattern, $snap[$j], $m) ? $m[1] : '';
            $pos = array_search('</' . $name . '>', $clt);
            if ($pos !== false) {
                $nclt[] = $clt[$pos];
                array_splice($clt, $pos, 1);
            }
        }
        $nclt = array_merge($clt, $nclt);
        $ret.= $c . join('', $nclt);
    }
    if (!empty($this->split_tags[0])) {
        $wrapper = isset($this->split_tags[0][$this->par]) ? $this->split_tags[0][$this->par] : '';
        if (!$start) {
            // First line of the paragraph: prepend the wrapper tag.
            // NOTE(review): a line that is both first and last never gets the
            // closing wrapper tag - kept as in the original; confirm intent.
            $ret = $wrapper . $ret;
        } else {
            $total = 0;
            foreach ($this->result[$this->par]['levels'] as $level) {
                $total+= count($level['chars']);
            }
            // Last line of the paragraph: close the wrapper tag. Nothing is
            // appended when the wrapper is not a recognisable tag (this used
            // to produce a stray "</>").
            if ($start + $length == $total and preg_match($tag_pattern, $wrapper, $m)) {
                $ret.= '</' . $m[1] . '>';
            }
        }
    }
    // Collapse "<tag>text</tag><tag>" into "<tag>text" until nothing is left
    // to collapse, so that identical adjacent elements are merged.
    do {
        $collapsed = preg_replace('/(<\s*([^\s>]+)(\s+[^>]+)*>)([^<]+)<\s*\/\s*\2\s*>\1/Uu', '\\1\\4', $ret, -1, $replaced);
        if ($collapsed === null) {
            // PCRE error (e.g. invalid UTF-8): keep what we have.
            break;
        }
        $ret = $collapsed;
    } while ($replaced > 0);
    return $ret;
}
/**
* Get substr values for tagless text based on original HTML input.
*
* All three parameters are required. By providing the substring of the
* paragraph that is to be displayed as a new line, you may avoid the
* parsing process.
*
* @access private
* @param string [$txt] The new line.
* @param integer [$start] The position in the paragraph where the new line starts (first character is 0). Only positive values are supported.
* @param integer [$length] The length of the new line.
* @author Efthimios Mavrogeorgiadis <hide@address.com>
* @since Wed Jan 09 2008 02:35:24 GMT+0200
* @version v 1.00 Wed Jan 09 2008 02:35:24 GMT+0200
*/
private function getStrippedValues($txt, &$start, &$length) {
    // Purpose: locate $txt inside the tagless text returned by
    // getResultNewText() and report its position and size in characters.
    // On return $start holds the number of UTF8PATTERN matches (presumably
    // one per UTF-8 character - confirm in unicode.php) found before $txt,
    // and $length the number found in the slice of strlen($txt) bytes that
    // starts there. The incoming $length only serves to compute where the
    // search begins.
    $text = $this->getResultNewText();
    // NOTE(review): the search offset is derived from character counts but is
    // interpreted by preg_match() in bytes - confirm this is intended for
    // multi-byte text.
    $pattern = '/' . $this->getPattern($txt) . '/';
    preg_match($pattern, $text, $matches, PREG_OFFSET_CAPTURE, $this->char_counter - $length);
    // Fall back to offset 0 when $txt is not found (same result as before,
    // without reading an undefined index).
    $byte_start = isset($matches[0][1]) ? $matches[0][1] : 0;
    $utf8_pattern = constant('UTF8PATTERN');
    // Characters in front of the match. The test is on the byte offset rather
    // than on the truthiness of the prefix: a prefix of "0" is falsy in PHP
    // and was counted as empty, and the empty case used to call count() on an
    // undefined index, which is a TypeError on PHP 8.
    $start = 0;
    if ($byte_start > 0) {
        preg_match_all($utf8_pattern, substr($text, 0, $byte_start), $prev_matches);
        $start = count($prev_matches[0]);
    }
    preg_match_all($utf8_pattern, substr($text, $byte_start, strlen($txt)), $main_matches);
    $length = count($main_matches[0]);
}
}
?>