<?php
/*
* markup_parser.php
*
* @(#) $Id: markup_parser.php,v 1.64 2009/08/23 08:48:39 mlemos Exp $
*
*/
/*
 * Parser states. The class keeps the current one in its state variable
 * and ParseElement switches on it to decide what kind of construct to
 * process next.
 */
define('MARKUP_PARSER_START', 1);
define('MARKUP_PARSER_GET_ELEMENT', 2);
define('MARKUP_PARSER_GET_TAG', 3);
define('MARKUP_PARSER_GET_COMMENT', 4);
define('MARKUP_PARSER_GET_DOCTYPE', 5);
define('MARKUP_PARSER_GET_CDATA', 6);
define('MARKUP_PARSER_GET_ENTITY', 7);
define('MARKUP_PARSER_GET_DTD_ENTITY', 8);
define('MARKUP_PARSER_GET_DTD_ELEMENT', 9);
define('MARKUP_PARSER_GET_DTD_ATTLIST', 10);
define('MARKUP_PARSER_GET_DTD_CONDITIONAL', 11);
define('MARKUP_PARSER_END', 12);
/*
 * Error codes stored in the error_code class variable: no error, a
 * condition the class is not ready to handle, a document syntax error
 * and an invalid parameter or class variable value.
 */
define('MARKUP_PARSER_ERROR_NONE', 0);
define('MARKUP_PARSER_ERROR_UNEXPECTED', 1);
define('MARKUP_PARSER_ERROR_INVALID_SYNTAX', 2);
define('MARKUP_PARSER_ERROR_INVALID_USAGE', 3);
/*
{metadocument}<?xml version="1.0" encoding="ISO-8859-1" ?>
<class>
<package>net.manuellemos.markupparser</package>
<version>@(#) $Id: markup_parser.php,v 1.64 2009/08/23 08:48:39 mlemos Exp $</version>
<copyright>Copyright © (C) Manuel Lemos 2009</copyright>
<title>Markup parser</title>
<author>Manuel Lemos</author>
<authoraddress>mlemos-at-acm.org</authoraddress>
<documentation>
<idiom>en</idiom>
<purpose>Parse HTML and other markup based documents.</purpose>
<usage>Use the <functionlink>StartParsing</functionlink> function to
initialize the parser. Then use the
<functionlink>Parse</functionlink> function to make the class parse
markup data, eventually read from files. When you are done with
feeding the whole document data, call the
<functionlink>FinishParsing</functionlink> function.
<paragraphbreak />
The <functionlink>Parse</functionlink> function returns arrays of
tokens that describe each document element. The
<functionlink>RewriteElement</functionlink> function can be used to
convert the tokens back to markup document strings.
<paragraphbreak />
Element tokens are associated to the respective positions
in the document. Positions are numbers that represent their offsets
relative to beginning of the document. The
<functionlink>GetPositionLine</functionlink> function can return the
line and column number associated to a given document position if
the <variablelink>track_lines</variablelink> is set to
<booleanvalue>1</booleanvalue>.
<paragraphbreak />
The <functionlink>ParseDTDExpressionValue</functionlink> and
<functionlink>ParseAttributeList</functionlink> functions can be
used to parse expressions that may appear in DTD markup
elements.</usage>
</documentation>
{/metadocument}
*/
class markup_parser_class
{
/*
{metadocument}
<variable>
<name>error</name>
<type>STRING</type>
<value></value>
<documentation>
<purpose>Store the message that is returned when an error
occurs.</purpose>
<usage>Check this variable to understand what happened when a call to
any of the class functions has failed.<paragraphbreak />
This class uses cumulative error handling. This means that if one of
the class functions that may fail is called and this variable was
already set to an error message due to a failure in a previous call
to the same or another function, the function will also fail and
will not do anything.<paragraphbreak />
This allows programs using this class to safely call several
functions that may fail and only check the failure condition after
the last function call.<paragraphbreak />
Just set this variable to an empty string to clear the error
condition.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $error = '';
/*
{metadocument}
<variable>
<name>error_code</name>
<type>INTEGER</type>
<value>0</value>
<documentation>
<purpose>Store the code that is returned when an error
occurs.</purpose>
<usage>Check this variable to understand what happened when a call
to any of the class functions has failed. It may be set to several
possible error codes defined as constants:<paragraphbreak />
<tt>MARKUP_PARSER_ERROR_NONE</tt> - No error happened
<paragraphbreak />
<tt>MARKUP_PARSER_ERROR_UNEXPECTED</tt> - A condition was found
that the class is not yet ready to handle
<paragraphbreak />
<tt>MARKUP_PARSER_ERROR_INVALID_SYNTAX</tt> - A syntax error was
found
<paragraphbreak />
<tt>MARKUP_PARSER_ERROR_INVALID_USAGE</tt> - An invalid value was
passed to the class function parameters or set to the class
variables</usage>
</documentation>
</variable>
{/metadocument}
*/
var $error_code = MARKUP_PARSER_ERROR_NONE;
/*
{metadocument}
<variable>
<name>error_position</name>
<type>INTEGER</type>
<value>-1</value>
<documentation>
<purpose>Point to the position of the markup data or file that
refers to the last error that occurred.</purpose>
<usage>Check this variable to determine the relevant position of the
document when a parsing error occurs. A negative value indicates
that there was no error or the last error is not associated to a
specific document position.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $error_position = -1;
/*
{metadocument}
<variable>
<name>buffer_length</name>
<type>INTEGER</type>
<value>8000</value>
<documentation>
<purpose>Maximum length of the chunks of markup data read from files
that the class parses at one time.</purpose>
<usage>Adjust this value according to the available memory.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $buffer_length = 8000;
/*
{metadocument}
<variable>
<name>ignore_syntax_errors</name>
<type>BOOLEAN</type>
<value>1</value>
<documentation>
<purpose>Specify whether the class should ignore syntax errors in
malformed documents.</purpose>
<usage>Set this variable to <booleanvalue>0</booleanvalue> if it is
necessary to verify whether markup data may be corrupted due to
eventual bugs in the program that generated the
document.<paragraphbreak />
Currently the class only ignores some types of syntax errors.
Other syntax errors may still cause the
<functionlink>Parse</functionlink> to fail.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $ignore_syntax_errors=1;
/*
{metadocument}
<variable>
<name>warnings</name>
<type>HASH</type>
<value></value>
<documentation>
<purpose>Return a list of positions of the original document that
contain syntax errors.</purpose>
<usage>Check this variable to retrieve eventual document syntax
errors that were ignored when the
<variablelink>ignore_syntax_errors</variablelink> is set to
<booleanvalue>1</booleanvalue>.<paragraphbreak />
The indexes of this array are the positions of the errors. The
array values are the corresponding syntax error messages.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $warnings=array();
/*
{metadocument}
<variable>
<name>store_positions</name>
<type>BOOLEAN</type>
<value>1</value>
<documentation>
<purpose>Tell the class to return the position of each document
element token.</purpose>
<usage>Set this variable to <integervalue>0</integervalue> if you do
not need to know the position of each parsed markup element.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $store_positions = 1;
/*
{metadocument}
<variable>
<name>track_lines</name>
<type>BOOLEAN</type>
<value>0</value>
<documentation>
<purpose>Tell the class to keep track of the position of each document
line.</purpose>
<usage>Set this variable to <integervalue>1</integervalue> if you
need to determine the line and column number associated to a given
position of the parsed document.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $track_lines = 0;
/*
{metadocument}
<variable>
<name>tag_lower_case</name>
<type>BOOLEAN</type>
<value>1</value>
<documentation>
<purpose>Tell the class to lower the case of tag and attribute names
in the <functionlink>RewriteElement</functionlink> function.</purpose>
<usage>Set this variable to <booleanvalue>0</booleanvalue> when you
want to preserve the original case of the tags and attributes being
rewritten.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $tag_lower_case = 1;
/*
{metadocument}
<variable>
<name>quote_attribute_values</name>
<type>BOOLEAN</type>
<value>1</value>
<documentation>
<purpose>Tell the class to always quote the values of attributes in
the <functionlink>RewriteElement</functionlink> function.</purpose>
<usage>Set this variable to <booleanvalue>0</booleanvalue> when you
want that attribute values be quoted only when they have spaces,
tabs or line break characters.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $quote_attribute_values = 1;
/*
{metadocument}
<variable>
<name>decode_entities</name>
<type>BOOLEAN</type>
<value>0</value>
<documentation>
<purpose>Tell the class to decode all the character entities in
character data or tag attributes.</purpose>
<usage>Set this variable to <integervalue>1</integervalue> if you
need to get all the character data or tag attributes with
character entities already decoded.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $decode_entities = 0;
/*
{metadocument}
<variable>
<name>allow_grave_accent_quoting</name>
<type>BOOLEAN</type>
<value>1</value>
<documentation>
<purpose>Tell the class to allow grave accent characters as
delimiters for quoted tag attributes.</purpose>
<usage>Set this variable to <integervalue>0</integervalue> if you
want the class to be strict and not accept grave accent characters
to quote tag attribute values.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $allow_grave_accent_quoting = 1;
/* Private variables */
// NOTE(review): $file and $data are not referenced by the functions
// visible in this part of the file - presumably the input file handle
// and pending input data; confirm against the rest of the class.
var $file;
var $data;
// Current parser state, one of the MARKUP_PARSER_* state constants.
// ParseElement switches on this value.
var $state = MARKUP_PARSER_START;
// State that ParseElement restores after it finishes handling a
// construct that started with a < character.
var $last_state = MARKUP_PARSER_GET_ELEMENT;
// Stack of saved states. ParseElement pops the last entry when a "]]>"
// sequence closes a DTD conditional section.
var $state_stack = array();
// Markup data currently being scanned.
var $buffer = '';
// Position in $buffer of the next data to parse. It is the default
// start position of SkipCharacters and FindCharacters.
var $buffer_position = 0;
// Value added to buffer positions to compute the document positions
// that are reported in errors, warnings and element tokens.
var $offset = 0;
// NOTE(review): the next four variables are not used by the functions
// visible here - presumably line tracking state related to the
// track_lines variable; confirm.
var $lines = array();
var $line_offset = 0;
var $last_line = 1;
var $last_carriage_return = 0;
// When set, ParseEntity and ParseElement add a 'Code' entry with the
// character code to the entity tokens they return.
var $decoding_entities = 0;
// Named entities that the class knows how to convert to characters.
var $entities = array(
'amp'=>'&',
'quot'=>'"',
'apos'=>"'",
'lt'=>'<',
'gt'=>'>'
);
/* Private functions */
/*
 * Record an error message and the respective error code.
 *
 * $error - message to store in the error variable
 * $code  - one of the MARKUP_PARSER_ERROR_* constants
 *
 * Always returns 0, so callers can fail with return(SetError(...)).
 */
Function SetError($error, $code)
{
	$this->error_code = $code;
	$this->error = $error;
	return 0;
}
/*
 * Record an error, appending to the message a request to contact the
 * class author.
 *
 * Always returns 0, like SetError.
 */
Function SetErrorWithContact($error, $code)
{
	$contact_request = '. Please contact the author Manuel Lemos <hide@address.com> and send a copy of this message to let him add support for this kind of markup documents';
	return $this->SetError($error.$contact_request, $code);
}
/*
 * Record an error that refers to a specific document position.
 *
 * $position - stored in the error_position variable
 *
 * Always returns 0, like SetError.
 */
Function SetPositionedError($error, $code, $position)
{
	$this->error_position = $position;
	$result = $this->SetError($error, $code);
	return $result;
}
/*
 * Report a syntax problem found at a given document position.
 *
 * When ignore_syntax_errors is set, the message is stored in the
 * warnings array indexed by the position and 1 is returned so parsing
 * can go on. Otherwise the problem becomes a positioned error and 0 is
 * returned.
 */
Function SetPositionedWarning($error, $code, $position)
{
	if($this->ignore_syntax_errors)
	{
		$this->warnings[$position] = $error;
		return 1;
	}
	return $this->SetPositionedError($error, $code, $position);
}
/*
 * Record an error, appending the PHP error message when one is set
 * and is not empty.
 *
 * $php_error_message - passed by reference, so a variable that is not
 *                      set can be given without raising a notice
 *
 * Always returns 0, like SetError.
 */
Function SetPHPError($error, $code, &$php_error_message)
{
	$has_details = (IsSet($php_error_message) && strlen($php_error_message));
	if($has_details)
		$error = $error.': '.$php_error_message;
	return $this->SetError($error, $code);
}
/*
 * Handle a construct that was not terminated within the buffered data.
 *
 * $what           - description of the construct for the message
 * $end            - whether there is no more data to be parsed
 * $need_more_data - set to 1 unless the function fails
 *
 * At the end of the data a warning is issued at the current document
 * position. Returns 0 only when that warning turned into an error.
 */
Function SetUnterminatedWarning($what, $end, &$need_more_data)
{
	if($end)
	{
		$position = $this->offset + $this->buffer_position;
		if(!$this->SetPositionedWarning('unterminated '.$what, MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position))
			return 0;
	}
	$need_more_data = 1;
	return 1;
}
/*
 * Shortcut of SetUnterminatedWarning for unterminated tags.
 */
Function SetUnterminatedTagWarning($end, &$need_more_data)
{
	$what = 'tag';
	return $this->SetUnterminatedWarning($what, $end, $need_more_data);
}
/*
 * Handle a construct that must be terminated but was not within the
 * buffered data.
 *
 * $what           - description of the construct for the message
 * $end            - whether there is no more data to be parsed
 * $need_more_data - set to 1 when more data may still arrive
 *
 * Returns 1 asking for more data when not at the end of the data.
 * Otherwise sets an error at the current document position and
 * returns 0.
 */
Function SetUnterminatedError($what, $end, &$need_more_data)
{
	if(!$end)
	{
		$need_more_data = 1;
		return 1;
	}
	$position = $this->offset + $this->buffer_position;
	return $this->SetPositionedError('unterminated '.$what, MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position);
}
/*
 * Shortcut of SetUnterminatedError for unterminated DOCTYPE
 * declarations.
 */
Function SetUnterminatedDocTypeError($end, &$need_more_data)
{
	$what = 'DOCTYPE';
	return $this->SetUnterminatedError($what, $end, $need_more_data);
}
/*
 * Shortcut of SetUnterminatedError for unterminated DTD entity
 * declarations.
 */
Function SetUnterminatedDTDEntityError($end, &$need_more_data)
{
	$what = 'DTD ENTITY';
	return $this->SetUnterminatedError($what, $end, $need_more_data);
}
/*
 * Find the position of the first buffer character that is not in a
 * given set.
 *
 * $characters - set of characters to skip
 * $start      - position to start from; a negative value means the
 *               current buffer position
 *
 * Returns the position that follows the skipped characters.
 */
Function SkipCharacters($characters, $start = -1)
{
	$from = ($start < 0 ? $this->buffer_position : $start);
	$skipped = strspn($this->buffer, $characters, $from);
	return $skipped + $from;
}
/*
 * Find the first buffer character that is in a given set.
 *
 * $characters - set of characters to look for
 * $position   - returns the position of the character found, or the
 *               buffer length if none was found
 * $start      - position to start from; a negative value means the
 *               current buffer position
 *
 * Returns the character found or an empty string if none was found.
 */
Function FindCharacters($characters, &$position, $start = -1)
{
	$from = ($start < 0 ? $this->buffer_position : $start);
	$position = strcspn($this->buffer, $characters, $from) + $from;
	if($position >= strlen($this->buffer))
		return '';
	return $this->buffer[$position];
}
/*
 * Skip white space and an eventual comment delimited by a pair of --
 * sequences, like those that may appear inside DTD declarations.
 *
 * $s              - buffer position to start from
 * $e              - returns the position that follows the white space
 *                   and the comment. If no complete comment is found,
 *                   it returns the position that follows the initial
 *                   white space.
 * $end            - whether there is no more data to be parsed
 * $need_more_data - set to 1 when the buffer ends before it can be
 *                   determined where the comment finishes and more data
 *                   may still arrive
 *
 * Returns 0 only when a comment was started and the data ended before
 * the closing -- sequence. No error is set here; that is left to the
 * caller.
 */
Function SkipComment($s, &$e, $end, &$need_more_data)
{
	$e = $s = $this->SkipCharacters(" \t\r\n", $s);
	$l = strlen($this->buffer);
	if($s >= $l)
	{
		if(!$end)
			$need_more_data = 1;
		return(1);
	}
	if(strcmp($this->buffer[$s], '-'))
		return(1);
	++$s;
	if($s >= $l)
	{
		if(!$end)
			$need_more_data = 1;
		return(1);
	}
	if(strcmp($this->buffer[$s], '-'))
		return(1);
	++$s;
	do
	{
		$c = $this->FindCharacters('-', $dash, $s);
		$s = $dash;
		// The character after the dash must also be inside the buffer
		// before it is inspected. Comparing with > here would read the
		// offset that equals the buffer length, which does not exist.
		if(strlen($c) == 0
		|| ++$s >= $l)
		{
			if($end)
				return(0);
			$need_more_data = 1;
			return(1);
		}
	}
	while(strcmp($this->buffer[$s], '-'));
	$e = $this->SkipCharacters(" \t\r\n", $s + 1);
	return(1);
}
/*
 * Extract a quoted or unquoted tag attribute token from the buffer.
 *
 * $s              - position of the first character of the token. On
 *                   return it is moved past the token and any white
 *                   space that follows, except when the token was ended
 *                   by a < or > character, in which case it is left on
 *                   that character.
 * $stop           - additional characters that end an unquoted token
 * $value          - returns the text of the token. It is not changed
 *                   if the token starts with > or more data is needed.
 * $need_more_data - set to 1 when the buffer ends before the end of the
 *                   token or right after it
 * $allow_quoted   - when it is 0, a starting quote character raises a
 *                   warning and is skipped
 *
 * Returns 0 when a syntax warning turned into an error, or when an
 * unquoted token is ended by a < character (no error is set in that
 * case). Returns 1 otherwise.
 */
Function GetTagAttribute(&$s, $stop, &$value, &$need_more_data, $allow_quoted = 1)
{
$l = strlen($this->buffer);
switch($quote = $this->buffer[$s])
{
case '`':
// A grave accent is only a quote when that is allowed. Otherwise leave
// the switch and parse the token as unquoted text.
if(!$this->allow_grave_accent_quoting)
break;
// Intentional fall through to the handling of quoted values.
case "'":
case '"':
if(!$allow_quoted)
{
if(!$this->SetPositionedWarning('unexpected quoted attribute', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s))
return(0);
// Skip the quote and parse what follows as unquoted text.
++$s;
break;
}
++$s;
// The value goes up to the matching quote, but a < or > character also
// ends it to recover from missing closing quotes.
$c = $this->FindCharacters($quote.'><', $e, $s);
switch($c)
{
case '`':
if(!$this->SetPositionedWarning('attribute is quoted with grave accents', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s))
return(0);
break;
case '':
// No terminating character was found in the buffer.
if($e + 1 >= $l)
{
$need_more_data = 1;
return(1);
}
break;
}
$value = substr($this->buffer, $s, $e - $s);
if($c == '<'
|| $c == '>')
{
if(!$this->SetPositionedWarning('unfinished quoted attribute', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $e))
return(0);
// Leave the position on the < or > character for the caller to handle.
$s = $e;
return(1);
}
// Skip the closing quote and the white space after it.
$s = $this->SkipCharacters(" \t\r\n", $e + 1);
if($s >= $l)
$need_more_data = 1;
return(1);
case '>':
// The tag ends here, so there is no token to extract.
return(1);
}
// Unquoted token: it ends at any stop character, quote, tag delimiter
// or white space character.
if($this->allow_grave_accent_quoting)
$stop .= '`';
$c = $this->FindCharacters($stop."\"'>< \t\r\n", $e, $s);
if(strlen($c) == 0)
{
$need_more_data = 1;
return(1);
}
$value = substr($this->buffer, $s, $e - $s);
if($c == '<')
{
$s = $e;
return(0);
}
$s = $this->SkipCharacters(" \t\r\n", $e);
if($s >= $l)
$need_more_data = 1;
return(1);
}
/*
 * Parse the starting letters of a DTD name.
 *
 * $p     - buffer position; moved past the letters that were found
 * $start - returns the sequence of letters found at the position or
 *          null if there is none
 *
 * Always returns 1.
 */
Function ParseDTDNameStart(&$p, &$start)
{
	$start = null;
	$data = $this->buffer;
	if($p < strlen($data))
	{
		$letters = strspn($data, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $p);
		if($letters > 0)
		{
			$start = substr($data, $p, $letters);
			$p += $letters;
		}
	}
	return 1;
}
/*
 * Parse characters that may appear after the start of a DTD name:
 * letters, digits, dashes and dots.
 *
 * $p         - buffer position; moved past the characters found
 * $character - returns the sequence of name characters found at the
 *              position or null if there is none
 *
 * Always returns 1.
 */
Function ParseDTDNameCharacter(&$p, &$character)
{
	$character = null;
	$data = $this->buffer;
	if($p < strlen($data))
	{
		$count = strspn($data, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-.', $p);
		if($count > 0)
		{
			$character = substr($data, $p, $count);
			$p += $count;
		}
	}
	return 1;
}
/*
 * Parse a DTD name, eventually made of two parts separated by a colon.
 *
 * $p    - buffer position; moved past the name when one is found
 * $name - returns an array with the entries 'Type' set to 'name' and
 *         'Name' set to the name text, or null if there is no name at
 *         the position
 *
 * Returns 0 only if one of the helper parsing functions fails.
 *
 * A colon that is not followed by a valid name part is not consumed.
 * In that case the name ends before the colon and the position is
 * moved to the colon. The position used to be left at the start of the
 * name in that case, although the name was returned.
 */
Function ParseDTDName(&$p, &$name)
{
	$name = null;
	$s = $p;
	if(!$this->ParseDTDNameStart($s, $c))
		return(0);
	if(!IsSet($c))
		return(1);
	for(;;)
	{
		if(!$this->ParseDTDNameCharacter($s, $c))
			return(0);
		if(!IsSet($c))
			break;
	}
	$v = $this->buffer;
	$l = strlen($v);
	if($s < $l
	&& $v[$s] === ':')
	{
		// Try the part after the colon using a separate position, so
		// the colon is only consumed when a valid name part follows.
		$q = $s + 1;
		if($this->ParseDTDNameStart($q, $c)
		&& IsSet($c))
		{
			for(;;)
			{
				if(!$this->ParseDTDNameCharacter($q, $c))
					return(0);
				if(!IsSet($c))
					break;
			}
			$s = $q;
		}
	}
	$name = array(
		'Type'=>'name',
		'Name'=>substr($v, $p, $s - $p)
	);
	$p = $s;
	return(1);
}
/*
 * Parse a sequence of decimal digits.
 *
 * $p      - buffer position; moved past the digits that were found
 * $number - returns an array with the entries 'Type' set to 'number'
 *           and 'Name' set to the digits, or null if there are no
 *           digits at the position
 *
 * Always returns 1.
 */
Function ParseDTDNumber(&$p, &$number)
{
	$number = null;
	$data = $this->buffer;
	if($p < strlen($data))
	{
		$digits = strspn($data, '0123456789', $p);
		if($digits > 0)
		{
			$number = array(
				'Type'=>'number',
				'Name'=>substr($data, $p, $digits)
			);
			$p += $digits;
		}
	}
	return 1;
}
/*
 * Parse a DTD expression operator (| & or ,) that may be surrounded by
 * white space.
 *
 * $p        - buffer position; moved past the operator and the white
 *             space around it only when an operator is found
 * $operator - returns an array with the entries 'Type' set to
 *             'operator' and 'Operator' set to the operator character,
 *             or null if there is no operator
 *
 * Always returns 1.
 */
Function ParseDTDOperator(&$p, &$operator)
{
	$operator = null;
	$v = $this->buffer;
	$l = strlen($v);
	if($p >= $l)
		return(1);
	$o = $this->SkipCharacters(" \t\r\n", $p);
	// The buffer may end with white space, so the position has to be
	// checked again before the character at it is read.
	if($o >= $l)
		return(1);
	switch($c = $v[$o])
	{
		case '|':
		case '&':
		case ',':
			$operator = array(
				'Type'=>'operator',
				'Operator'=>$c
			);
			$p = $this->SkipCharacters(" \t\r\n", $o + 1);
			break;
	}
	return(1);
}
/*
 * Parse the #PCDATA keyword.
 *
 * $p      - buffer position; moved past the keyword when it is found
 * $pcdata - returns an array with the entry 'Type' set to 'pcdata', or
 *           null if the keyword is not at the position
 *
 * Always returns 1.
 */
Function ParseDTDPCData(&$p, &$pcdata)
{
	$pcdata = null;
	$keyword = '#PCDATA';
	$length = strlen($keyword);
	$data = $this->buffer;
	if($p + $length <= strlen($data)
	&& substr($data, $p, $length) === $keyword)
	{
		$pcdata = array(
			'Type'=>'pcdata',
		);
		$p += $length;
	}
	return 1;
}
/*
 * Parse the EMPTY keyword.
 *
 * $p     - buffer position; moved past the keyword when it is found
 * $empty - returns an array with the entry 'Type' set to 'empty', or
 *          null if the keyword is not at the position
 *
 * Always returns 1.
 */
Function ParseDTDEmpty(&$p, &$empty)
{
	$empty = null;
	$keyword = 'EMPTY';
	$length = strlen($keyword);
	$data = $this->buffer;
	if($p + $length <= strlen($data)
	&& substr($data, $p, $length) === $keyword)
	{
		$empty = array(
			'Type'=>'empty',
		);
		$p += $length;
	}
	return 1;
}
/*
 * Parse the CDATA keyword.
 *
 * $p     - buffer position; moved past the keyword when it is found
 * $cdata - returns an array with the entry 'Type' set to 'cdata', or
 *          null if the keyword is not at the position
 *
 * Always returns 1.
 */
Function ParseDTDCData(&$p, &$cdata)
{
	$cdata = null;
	$keyword = 'CDATA';
	$length = strlen($keyword);
	$data = $this->buffer;
	if($p + $length <= strlen($data)
	&& substr($data, $p, $length) === $keyword)
	{
		$cdata = array(
			'Type'=>'cdata',
		);
		$p += $length;
	}
	return 1;
}
/*
 * Parse a DTD expression between parentheses, eventually preceded by
 * a - or + character.
 *
 * $p     - buffer position; moved past the closing parenthesis when a
 *          complete group is found
 * $group - returns an array with the entries 'Type' set to 'group' and
 *          'Group' set to the parsed expression, or null if there is no
 *          complete group at the position. When the group is preceded
 *          by - or +, the entry 'Multiplicity' is set to that character
 *          preceded by a space.
 *
 * Returns 0 only if parsing the expression inside the group fails.
 */
Function ParseDTDGroup(&$p, &$group)
{
	$group = null;
	$data = $this->buffer;
	$length = strlen($data);
	$position = $p;
	$prefix = null;
	if($position < $length)
	{
		$first = $data[$position];
		if($first === '-'
		|| $first === '+')
		{
			$prefix = ' '.$first;
			++$position;
		}
	}
	if($position >= $length
	|| $data[$position] !== '(')
		return 1;
	++$position;
	if(!$this->ParseDTDExpression($position, $expression))
		return 0;
	$closed = (IsSet($expression)
		&& $position < $length
		&& $data[$position] === ')');
	if(!$closed)
		return 1;
	$group = array(
		'Type'=>'group',
		'Group'=>$expression,
	);
	if(IsSet($prefix))
		$group['Multiplicity'] = $prefix;
	$p = $position + 1;
	return 1;
}
/*
 * Parse a DTD parameter entity reference, which is a % character
 * followed by a name.
 *
 * The name must be followed by a semicolon, which is consumed, or by a
 * > or a white space character, which are not consumed.
 *
 * $p      - buffer position; moved past the reference when it is found
 * $entity - returns an array with the entries 'Type' set to 'entity'
 *           and 'Entity' set to the entity name, or null if there is no
 *           entity reference at the position
 *
 * Returns 0 only if parsing the name fails.
 */
Function ParseDTDEntity(&$p, &$entity)
{
	$entity = null;
	$data = $this->buffer;
	$length = strlen($data);
	$position = $p;
	if($position >= $length
	|| $data[$position] !== '%')
		return 1;
	++$position;
	if(!$this->ParseDTDName($position, $name))
		return 0;
	if(!IsSet($name)
	|| $position >= $length)
		return 1;
	$terminator = $data[$position];
	if($terminator === ';')
		++$position;
	elseif(strpos("> \t\r\n", $terminator) === false)
		return 1;
	$entity = array(
		'Type'=>'entity',
		'Entity'=>$name['Name']
	);
	$p = $position;
	return 1;
}
/*
 * Parse one element of a DTD expression.
 *
 * The element may be, in this order of precedence: a group, a
 * parameter entity reference, #PCDATA, EMPTY, CDATA, a name or a
 * number. An element that does not have a multiplicity yet may be
 * followed by one of the * ? or + characters.
 *
 * $p       - buffer position; moved past the element when one is found
 * $element - returns the array that describes the element, eventually
 *            with the entry 'Multiplicity' added, or null if no element
 *            was found at the position
 *
 * Returns 0 only if one of the specific parsing functions fails.
 */
Function ParseDTDExpressionElement(&$p, &$element)
{
	$element = null;
	$data = $this->buffer;
	$length = strlen($data);
	$position = $p;
	// Each function returns null in its second argument when what it
	// parses is not at the position, so the alternatives are tried in
	// order until one of them returns a value.
	$alternatives = array(
		'ParseDTDGroup',
		'ParseDTDEntity',
		'ParseDTDPCData',
		'ParseDTDEmpty',
		'ParseDTDCData',
		'ParseDTDName',
		'ParseDTDNumber'
	);
	foreach($alternatives as $function)
	{
		if(!$this->$function($position, $parsed))
			return 0;
		if(IsSet($parsed))
		{
			$element = $parsed;
			break;
		}
	}
	if(!IsSet($element))
		return 1;
	if(!IsSet($element['Multiplicity'])
	&& $position < $length)
	{
		$next = $data[$position];
		if($next === '*'
		|| $next === '?'
		|| $next === '+')
		{
			$element['Multiplicity'] = $next;
			++$position;
		}
	}
	$p = $position;
	return 1;
}
/*
 * Parse a DTD expression made of elements separated by operators.
 *
 * $p          - buffer position; moved past the expression and the
 *               white space that follows it
 * $expression - returns an array that alternates element and operator
 *               arrays, starting and ending with an element, or null if
 *               there is no element at the position
 *
 * Returns 0 only if one of the parsing functions fails.
 */
Function ParseDTDExpression(&$p, &$expression)
{
	$expression = null;
	if(!$this->ParseDTDExpressionElement($p, $element))
		return 0;
	if(!IsSet($element))
		return 1;
	$expression = array($element);
	while(true)
	{
		$p = $this->SkipCharacters(" \t\r\n", $p);
		// Parse ahead with a separate position, so an operator that is
		// not followed by an element is not consumed.
		$ahead = $p;
		if(!$this->ParseDTDOperator($ahead, $operator))
			return 0;
		if(!IsSet($operator))
			break;
		if(!$this->ParseDTDExpressionElement($ahead, $element))
			return 0;
		if(!IsSet($element))
			break;
		$expression[] = $operator;
		$expression[] = $element;
		$p = $ahead;
	}
	return 1;
}
/*
 * Parse a character entity reference.
 *
 * $entity       - text that starts with the & character of the entity.
 *                 Text after the end of the entity is ignored.
 * $position     - document position of the entity, used for warnings
 *                 and for the 'Position' entry of the returned element
 * $adjust_error - whether warning positions should be adjusted with the
 *                 offset of the problem within the entity text
 * $element      - returns an array with the entries 'Type' set to
 *                 'ENTITY' and 'Entity' set to the entity name or to
 *                 the numeric reference without the & and ; characters.
 *                 The entry 'Code' is set for numeric references, and
 *                 also for known named entities when entities are being
 *                 decoded. The entry 'Position' is set when positions
 *                 are being stored.
 * $advance      - returns the number of characters of $entity that the
 *                 returned element represents
 *
 * Malformed entities raise a syntax warning and are returned as if the
 * & character was the amp entity, with $advance set to 1.
 *
 * Returns 0 if the text does not start with & or if a warning turned
 * into an error.
 *
 * Entity names may contain digits after the first letter, as in frac12,
 * sup2 or there4.
 */
Function ParseEntity($entity, $position, $adjust_error, &$element, &$advance)
{
	$element = null;
	$advance = 0;
	$length = strlen($entity);
	// Check the length first so an empty string is reported as invalid
	// without reading a string offset that does not exist.
	if($length == 0
	|| $entity[0] != '&')
		return($this->SetPositionedError('invalid entity', MARKUP_PARSER_ERROR_UNEXPECTED, $position));
	$s = 1;
	$e = strcspn($entity, ';&', $s) + $s;
	if($s == $e)
	{
		if(!$this->SetPositionedWarning('it was found an entity without a name', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $s : 0)))
			return(0);
	}
	else
	{
		$end = ($e < $length && $entity[$e] == ';');
		if(!$end)
		{
			if(!$this->SetPositionedWarning('it was found an incorrectly ended entity', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $e : 0)))
				return(0);
		}
		if($entity[$s] == '#')
		{
			$c = $s + 1;
			// The text may end right after the # character.
			if($c < $length
			&& strtolower($entity[$c]) == 'x')
			{
				++$c;
				// Leading zeros do not count for the code size limit.
				$c += strspn($entity, '0', $c);
				$value = strtolower(substr($entity, $c, $e - $c));
				$d = strspn($value, '0123456789abcdef') + $c;
				if($d != $e)
				{
					if(!$this->SetPositionedWarning('it was found an entity with an invalid hexadecimal character code', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $d : 0)))
						return(0);
					// Use only the valid digits and do not consume the
					// terminator.
					$e = $d;
					$value = substr($value, 0, $e - $c);
					$end = 0;
				}
				if($d == $c)
				{
					if(!$this->SetPositionedWarning('it was found an entity with an empty hexadecimal code', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $c : 0)))
						return(0);
				}
				elseif(strlen($value) > 8)
				{
					if(!$this->SetPositionedWarning('it was found an entity with an hexadecimal code that is too large', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $d : 0)))
						return(0);
				}
				else
				{
					$element = array(
						'Type'=>'ENTITY',
						'Entity'=>substr($entity, $s, $e - $s),
						'Code'=>HexDec($value),
					);
					if($this->store_positions)
						$element['Position'] = $position;
					$advance = $e + ($end ? 1 : 0);
					return(1);
				}
			}
			else
			{
				$d = strspn($entity, '0123456789', $c) + $c;
				if($d != $e)
				{
					if(!$this->SetPositionedWarning('it was found an entity with an invalid character code', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $d : 0)))
						return(0);
					$e = $d;
					$end = 0;
				}
				if($d == $c)
				{
					if(!$this->SetPositionedWarning('it was found an entity with an empty code', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $s : 0)))
						return(0);
				}
				else
				{
					// Skip leading zeros but keep at least one digit.
					$c += strspn($entity, '0', $c);
					if($c == $e)
						--$c;
					for($code = intval($entity[$c]), ++$c; $c < $e; ++$c)
						$code = $code * 10 + intval($entity[$c]);
					if($code > 0xffffffff)
					{
						if(!$this->SetPositionedWarning('it was found an entity with a code that is too large', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $d : 0)))
							return(0);
					}
					else
					{
						$element = array(
							'Type'=>'ENTITY',
							'Entity'=>substr($entity, $s, $e - $s),
							'Code'=>$code,
						);
						if($this->store_positions)
							$element['Position'] = $position;
						$advance = $e + ($end ? 1 : 0);
						return(1);
					}
				}
			}
		}
		elseif($e - $s > 8)
		{
			if(!$this->SetPositionedWarning('it was found an entity name that is too long', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $s : 0)))
				return(0);
		}
		// Digits are accepted after the first letter because standard
		// entity names like frac12 or sup2 contain them.
		elseif(!preg_match('/^[a-z][-a-z0-9.]*$/i', $entity = substr($entity, $s, $e - $s)))
		{
			if(!$this->SetPositionedWarning('it was found an entity with an invalid name', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $position + ($adjust_error ? $s : 0)))
				return(0);
		}
		else
		{
			$element = array(
				'Type'=>'ENTITY',
				'Entity'=>$entity,
			);
			if($this->decoding_entities)
			{
				if(IsSet($this->entities[$entity]))
					$element['Code'] = Ord($this->entities[$entity]);
			}
			if($this->store_positions)
				$element['Position'] = $position;
			$advance = $e + ($end ? 1 : 0);
			return(1);
		}
	}
	// Malformed entity: return just the & character as the amp entity.
	$element = array(
		'Type'=>'ENTITY',
		'Entity'=>'amp',
	);
	if($this->decoding_entities)
		$element['Code'] = Ord('&');
	if($this->store_positions)
		$element['Position'] = $position;
	$advance = 1;
	return(1);
}
/*
 * Replace the character entities of a text value by the characters
 * that they represent.
 *
 * Entities with a code that fits in 8 bits and the named entities
 * known to the class are replaced by the respective characters. Codes
 * above 255 raise a warning and, like unknown named entities, are kept
 * in the entity form.
 *
 * $value    - text that may contain entities
 * $decoded  - returns the text with the entities decoded
 * $position - document position used for the warnings
 *
 * Returns 0 if parsing an entity fails or a warning turned into an
 * error.
 */
Function DecodeEntities($value, &$decoded, $position)
{
	$total = strlen($value);
	$decoded = '';
	$current = 0;
	while($current < $total)
	{
		// Copy the text up to the next entity as it is.
		$ampersand = $current + strcspn($value, '&', $current);
		if($ampersand != $current)
			$decoded .= substr($value, $current, $ampersand - $current);
		if($ampersand == $total)
			return 1;
		$terminator = $ampersand + 1 + strcspn($value, ';&', $ampersand + 1);
		$text = substr($value, $ampersand, $terminator - $ampersand + 2);
		if(!$this->ParseEntity($text, $position, 0, $entity, $length))
			return 0;
		$name = $entity['Entity'];
		if(!IsSet($entity['Code']))
		{
			if(IsSet($this->entities[$name]))
				$decoded .= $this->entities[$name];
			else
				$decoded .= '&'.$name.';';
		}
		elseif($entity['Code'] > 0xFF)
		{
			if(!$this->SetPositionedWarning('cannot decode entities with more than 8 bits', MARKUP_PARSER_ERROR_INVALID_USAGE, $position))
				return 0;
			$decoded .= '&'.$name.';';
		}
		else
			$decoded .= Chr($entity['Code']);
		$current = $ampersand + $length;
	}
	return 1;
}
/*
 * Parse the attribute definitions of a DTD attribute list declaration
 * up to the > character that ends it.
 *
 * $s              - buffer position to start from; moved as the
 *                   definitions are parsed
 * $attlist        - array to which the results are added. Each
 *                   attribute definition is stored in the 'Attributes'
 *                   entry indexed by attribute name, as an array with
 *                   the entries 'Value' (parsed DTD expression),
 *                   'Default' (default declaration text) and
 *                   eventually 'Fixed' (value that follows #FIXED).
 *                   The names of parameter entity references are
 *                   appended to the 'Entities' entry.
 * $end            - whether there is no more data to be parsed
 * $need_more_data - set to 1 when the declaration is not complete in
 *                   the buffered data and more data may still arrive
 *
 * Returns 0 when an error is set, namely when an attribute name is
 * duplicated or the declaration is not terminated at the end of the
 * data.
 */
Function ParseAttList(&$s, &$attlist, $end, &$need_more_data)
{
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetUnterminatedError('DTD attribute list name', $end, $need_more_data));
while(strcmp($this->buffer[$s], '>'))
{
// Keep the position of the name for an eventual duplicated name error.
$n = $s;
if(!$this->ParseDTDName($s, $value))
return(0);
if(IsSet($value))
{
// Attribute definition: name, value expression and default.
$name = $value['Name'];
if(IsSet($attlist['Attributes'][$name]))
return($this->SetPositionedError('DTD attribute name '.$name.' is duplicated', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $n));
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetUnterminatedError('DTD attribute list value', $end, $need_more_data));
if(!$this->ParseDTDExpression($s, $value))
return(0);
if(!IsSet($value))
return($this->SetUnterminatedError('DTD attribute list value', $end, $need_more_data));
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetUnterminatedError('DTD attribute list default', $end, $need_more_data));
// The default declaration goes up to the next white space or >.
$c = $this->FindCharacters("> \t\r\n", $e, $s);
if(strlen($c) == 0)
{
if($end)
return($this->SetUnterminatedError('DTD attribute list default', $end, $need_more_data));
$need_more_data = 1;
return(1);
}
$default = substr($this->buffer, $s, $e - $s);
$attlist['Attributes'][$name] = array(
'Value'=>$value,
'Default'=>$default,
);
$s = $e;
if(!strcmp($default, '#FIXED'))
{
// #FIXED is followed by the fixed value of the attribute.
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetUnterminatedError('DTD attribute list default value', $end, $need_more_data));
// The return value of GetTagAttribute is not checked here; only
// $need_more_data is examined below.
$this->GetTagAttribute($s, '', $attlist['Attributes'][$name]['Fixed'], $need_more_data);
if($need_more_data)
return($this->SetUnterminatedError('DTD attribute list default value', $end, $need_more_data));
}
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetUnterminatedError('DTD attribute list name', $end, $need_more_data));
}
else
{
// Not a name: only a parameter entity reference is accepted here.
if(!$this->ParseDTDEntity($s, $entity))
return(0);
if(IsSet($entity))
{
$attlist['Entities'][] = $entity['Entity'];
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetUnterminatedError('DTD attribute list', $end, $need_more_data));
}
else
return($this->SetUnterminatedError('DTD attribute list', $end, $need_more_data));
}
}
return(1);
}
Function ParseElement($end, &$element, &$need_more_data)
{
$need_more_data = 0;
$l = strlen($this->buffer);
switch($this->state)
{
case MARKUP_PARSER_START:
$this->state = MARKUP_PARSER_GET_ELEMENT;
return(1);
case MARKUP_PARSER_GET_DTD_CONDITIONAL;
$c = $this->FindCharacters(']<>&', $position);
if($c == ']')
{
if($position + 3 <= $l)
{
if(!strcmp(substr($this->buffer, $position, 3), ']]>'))
{
if($position != $this->buffer_position)
{
$element = array(
'Type'=>'DATA',
'Data'=>substr($this->buffer, $this->buffer_position, $position - $this->buffer_position),
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $position;
return(1);
}
$s = count($this->state_stack) - 1;
if(!IsSet($this->state_stack[$s]))
return($this->SetPositionedError('invalid conditional section close tag', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $position));
$element = array(
'Type'=>'END_CONDITIONAL',
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->last_state = $this->state = $this->state_stack[$s];
UnSet($this->state_stack[$s]);
$this->buffer_position = $position + 3;
return(1);
}
}
elseif(!$end)
{
$need_more_data = 1;
return(1);
}
}
case MARKUP_PARSER_GET_ELEMENT:
$c = $this->FindCharacters('<>&"', $position);
switch($c)
{
case '':
$need_more_data = !$end;
if($end
&& $position != $this->buffer_position)
{
$element = array(
'Type'=>'DATA',
'Data'=>substr($this->buffer, $this->buffer_position, $position - $this->buffer_position),
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $position;
}
if($end)
$this->state = MARKUP_PARSER_END;
return(1);
case '<':
break;
case '>':
case '"':
if($position > $this->buffer_position)
{
$element = array(
'Type'=>'DATA',
'Data'=>substr($this->buffer, $this->buffer_position, $position - $this->buffer_position),
);
}
else
{
if($c === '>'
&& !$this->SetPositionedWarning($c.' character is not encoded', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $this->buffer_position))
return(0);
$element = array(
'Type'=>'ENTITY',
'Entity'=>($c === '>' ? 'gt' : 'quot'),
);
if($this->decoding_entities)
$element['Code'] = Ord($c);
++$position;
}
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $position;
return(1);
case '&':
$this->state = MARKUP_PARSER_GET_ENTITY;
if($position != $this->buffer_position)
{
$element = array(
'Type'=>'DATA',
'Data'=>substr($this->buffer, $this->buffer_position, $position - $this->buffer_position),
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $position;
}
return(1);
}
$this->state = MARKUP_PARSER_GET_TAG;
if($position != $this->buffer_position)
{
$element = array(
'Type'=>'DATA',
'Data'=>substr($this->buffer, $this->buffer_position, $position - $this->buffer_position),
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $position;
return(1);
}
case MARKUP_PARSER_GET_TAG:
$s = $this->SkipCharacters(" \t\r\n", $this->buffer_position + 1);
if($s > $this->buffer_position + 1)
{
if(!$this->SetPositionedWarning('< character is not encoded', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $this->buffer_position))
return(0);
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
return(1);
}
$c = $this->FindCharacters("> \t\r\n<", $e, $s);
if(strlen($c) == 0)
{
if(!$this->SetUnterminatedTagWarning($end, $need_more_data))
return(0);
if($need_more_data
&& !$end)
return(1);
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
$need_more_data = 0;
return(1);
}
if($s == $e)
{
if(!$this->SetPositionedWarning('it was found an empty tag', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $this->buffer_position))
return(0);
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
$need_more_data = 0;
return(1);
}
$tag = substr($this->buffer, $s, $e - $s);
if($this->buffer[$s] === '!')
{
if(($hidden = !strcasecmp(substr($tag, 0, strlen($condition = '!--[if')), $condition))
|| !strcasecmp(substr($tag, 0, strlen($condition= '![if')), $condition))
{
$s += strlen($condition);
if(strlen($c = $this->FindCharacters(']>', $e, $s)))
{
$condition = trim(substr($this->buffer, $s, $e - $s));
if($c == ']')
$c = $this->FindCharacters('>', $e, $e + 1);
if(strlen($c))
{
$element = array(
'Type'=>'CONDITION',
'Condition'=>$condition,
'Hidden'=>$hidden
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
$this->buffer_position = $e + 1;
return(1);
}
}
return($this->SetUnterminatedError('condition', $end, $need_more_data));
}
if(!strcasecmp(substr($tag, 0, strlen('![endif]')), '![endif]'))
{
if(strlen($this->FindCharacters('>', $e, $s + strlen('![endif]'))))
{
$element = array(
'Type'=>'END_CONDITION',
'Hidden'=>!strcmp(substr($this->buffer, $e - 2, 2), '--')
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
$this->buffer_position = $e + 1;
return(1);
}
return($this->SetUnterminatedError('end condition', $end, $need_more_data));
}
if(!strcmp(substr($tag, 0, strlen('!--')), '!--'))
{
$this->state = MARKUP_PARSER_GET_COMMENT;
$need_more_data = 0;
return(1);
}
if(!strcasecmp(substr($tag, 0, strlen('![')), '!['))
{
$this->state = MARKUP_PARSER_GET_CDATA;
return(1);
}
switch($tag)
{
case '!DOCTYPE':
$this->state = MARKUP_PARSER_GET_DOCTYPE;
return(1);
case '!ENTITY':
$this->state = MARKUP_PARSER_GET_DTD_ENTITY;
return(1);
case '!ELEMENT':
$this->state = MARKUP_PARSER_GET_DTD_ELEMENT;
return(1);
case '!ATTLIST':
$this->state = MARKUP_PARSER_GET_DTD_ATTLIST;
return(1);
}
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
return(1);
}
$end_tag= ($this->buffer[$s] === '/');
if($end_tag)
++$s;
$e = $this->SkipCharacters('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $s);
$e = $this->SkipCharacters('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:.-_', $e);
$tag = substr($this->buffer, $s, $e - $s);
$s = $this->SkipCharacters(" \t\r\n", $e);
$attributes = $decoded_attributes = array();
for(;;)
{
if($s >= $l)
{
if(!$this->SetUnterminatedTagWarning($end, $need_more_data))
return(0);
if($need_more_data
&& !$end)
return(1);
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
$need_more_data = 0;
return(1);
}
switch($this->buffer[$s])
{
case '<':
if(!$this->SetPositionedWarning('it was found a tag start character in a tag attribute', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s))
return(0);
--$s;
case '>':
$element = array(
'Type'=>($end_tag ? 'ENDTAG' : 'TAG'),
'Name'=>$tag,
);
if($end_tag)
{
if(count($attributes)
&& !$this->SetPositionedWarning('it was defined an end tag with attributes', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $this->buffer_position))
return(0);
}
else
{
if(IsSet($attributes['/']))
{
$element['Close'] = 1;
UnSet($attributes['/']);
}
if(count($attributes))
{
$element['Attributes'] = $attributes;
if(count($decoded_attributes))
$element['DecodedAttributes'] = $decoded_attributes;
}
}
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $s + 1;
$this->state = $this->last_state;
return(1);
}
$a = $v = $s;
$this->GetTagAttribute($s, '=', $attribute, $need_more_data, 0);
if($need_more_data)
{
if(!$this->SetUnterminatedTagWarning($end, $need_more_data))
return(0);
if($need_more_data
&& !$end)
return(1);
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
$need_more_data = 0;
return(1);
}
if(substr($attribute, 0, 1) === '/')
{
$close = 1;
if(strlen($attribute) > 1)
$attribute = substr($attribute, 1);
}
else
$close = 0;
if(IsSet($attributes[$attribute])
&& !$this->SetPositionedWarning('attribute '.$attribute.' was defined more than once', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $a))
return(0);
$attributes[$attribute] = '';
switch($this->buffer[$s])
{
case '=':
$s = $this->SkipCharacters(" \t\r\n", $s + 1);
if($s >= $l)
{
if(!$this->SetUnterminatedTagWarning($end, $need_more_data))
return(0);
if($need_more_data
&& !$end)
return(1);
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
$need_more_data = 0;
return(1);
}
$v = $s;
$this->GetTagAttribute($s, '', $attributes[$attribute], $need_more_data);
if($need_more_data)
{
if(!$this->SetUnterminatedTagWarning($end, $need_more_data))
return(0);
if($need_more_data
&& !$end)
return(1);
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
$need_more_data = 0;
return(1);
}
if($this->decoding_entities)
{
$value = $attributes[$attribute];
if(!$this->DecodeEntities($value, $decoded, $this->offset + $this->buffer_position))
return(0);
if(strcmp($decoded, $value))
$decoded_attributes[$attribute] = $decoded;
}
break;
case '>':
break;
}
if($close
&& $this->buffer[$s] !== '>')
$close = 0;
if(!$close
&& !preg_match('/^[a-zA-Z][a-zA-Z0-9.-]*(:[a-zA-Z0-9.-]+)?$/', $attribute))
{
if(!$this->SetPositionedWarning('invalid tag attribute '.$attribute, MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $a))
return(0);
UnSet($attributes[$attribute]);
UnSet($decoded_attributes[$attribute]);
}
}
break;
case MARKUP_PARSER_GET_COMMENT:
$s = $this->buffer_position + strlen('<!--');
for($f = $s;;)
{
$c = $this->FindCharacters('->', $e, $f);
switch($c)
{
case '':
if(!$this->SetUnterminatedWarning('comment', $end, $need_more_data))
return(0);
if(!$need_more_data)
{
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
}
return(1);
case '>';
if($e == $s)
{
if(!$this->SetPositionedWarning('comment without end marks', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $this->buffer_position))
return(0);
$comment = substr($this->buffer, $s, $e - $s);
break 2;
}
$f = $e + 1;
continue 2;
}
if($e + 2 > $l
&& !$this->SetUnterminatedWarning('comment', $end, $need_more_data))
return(0);
if(substr($this->buffer, $e, 2) === '--')
{
$comment = substr($this->buffer, $s, $e - $s);
$e += 2;
break;
}
$f = $e + 1;
}
$c = $this->FindCharacters('><', $f, $e);
switch($c)
{
case '':
if(!$this->SetUnterminatedWarning('comment', $end, $need_more_data))
return(0);
if(!$need_more_data)
{
$element = array(
'Type'=>'ENTITY',
'Entity'=>'lt',
);
if($this->decoding_entities)
$element['Code'] = Ord('<');
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->state = $this->last_state;
++$this->buffer_position;
}
return(1);
case '>':
if($f != $e)
{
if(!$this->SetPositionedWarning('comment has garbage characters after end comment mark', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $e))
return(0);
$e = $f;
}
++$e;
break;
case '<':
if(!$this->SetPositionedWarning('unfinished comment', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $e))
return(0);
$e = $f;
break;
}
$element = array(
'Type'=>'COMMENT',
'Comment'=>$comment,
);
if($this->store_positions)
{
$element['Position'] = $this->offset + $this->buffer_position;
$element['CommentPosition'] = $this->offset + $s;
}
$this->buffer_position = $e;
$this->state = $this->last_state;
return(1);
case MARKUP_PARSER_GET_DOCTYPE:
$s = $this->SkipCharacters(" \t\r\n", $this->buffer_position + strlen('<!DOCTYPE'));
if($s >= $l)
return($this->SetUnterminatedDocTypeError($end, $need_more_data));
$doctype = array(
'Type'=>'DOCTYPE',
);
$this->GetTagAttribute($s, '', $doctype['Root'], $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDocTypeError($end, $need_more_data));
if($this->buffer[$s] != '>')
{
$e = $s;
$this->GetTagAttribute($e, '', $availability, $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDocTypeError($end, $need_more_data));
if($this->buffer[$e] != '>')
{
$s = $e;
switch($availability)
{
case 'PUBLIC':
$this->GetTagAttribute($s, '', $doctype['Identifier'], $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDocTypeError($end, $need_more_data));
case 'SYSTEM':
$doctype['Availability'] = $availability;
switch($this->buffer[$s])
{
case '>':
case '<':
case '[':
break 2;
}
$this->GetTagAttribute($s, '', $doctype['URL'], $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDocTypeError($end, $need_more_data));
break;
}
switch($this->buffer[$s])
{
case '>':
break;
case '[':
$c = $this->FindCharacters(']', $e, $s + 1);
if(strlen($c) == 0)
return($this->SetUnterminatedDocTypeError($end, $need_more_data));
$doctype['DTD'] = substr($this->buffer, $s + 1, $e - $s - 1);
$s = $this->SkipCharacters(" \t\r\n", $e + 1);
switch($this->buffer[$s])
{
case '>':
break 2;
case '<':
break;
default:
return($this->SetPositionedError('unknown type of parameters at the end of the DOCTYPE declaration', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
}
case '<':
if(!$this->SetPositionedWarning('it was found a tag start character in a DOCTYPE declaration', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s))
return(0);
--$s;
break;
default:
return($this->SetPositionedError('unknown type of parameters at the end of the DOCTYPE declaration', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
}
}
}
elseif(!$this->SetPositionedWarning('DOCTYPE declaration does not specify the root attribute', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $this->buffer_position))
return(0);
else
Unset($doctype['Root']);
$element = $doctype;
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $s + 1;
$this->state = $this->last_state;
return(1);
case MARKUP_PARSER_GET_CDATA:
if(!$this->SkipComment($this->buffer_position + strlen('<!['), $s, $end, $need_more_data))
return($this->SetPositionedError('unterminated tag section', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
$c = $this->FindCharacters('[', $e, $s);
if(strlen($c) == 0)
return($this->SetUnterminatedError('tag section', $end, $need_more_data));
if(!$this->ParseDTDName($s, $value))
return(0);
if(!IsSet($value))
{
if(!$this->ParseDTDEntity($s, $value))
return(0);
if(!IsSet($value))
return($this->SetPositionedError('invalid conditional value', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
}
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetPositionedError('unterminated tag section', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
if($s >= strlen($this->buffer))
return($this->SetUnterminatedError('CDATA', $end, $need_more_data));
if(strcmp($this->buffer[$s], '['))
return($this->SetPositionedError('invalid CDATA syntax', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
++$s;
if($value['Type'] != 'name'
|| $value['Name'] != 'CDATA')
{
$element = array(
'Type'=>'CONDITIONAL',
'Condition'=>$value
);
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$p = count($this->state_stack);
$this->state_stack[$p] = $this->last_state;
$this->last_state = $this->state = MARKUP_PARSER_GET_DTD_CONDITIONAL;
$this->buffer_position = $s;
return(1);
}
$f = $s;
do
{
$c = $this->FindCharacters(']', $e, $f);
if(strlen($c) == 0
|| $e + 3 > $l)
return($this->SetUnterminatedError('CDATA', $end, $need_more_data));
$f = $e + 1;
}
while(substr($this->buffer, $e, strlen(']]>')) != ']]>');
$element = array(
'Type'=>'CDATA',
'Data'=>substr($this->buffer, $s, $e - $s)
);
if($this->store_positions)
{
$element['Position'] = $this->offset + $this->buffer_position;
$element['CDATAPosition'] = $this->offset + $s;
}
$this->state = $this->last_state;
$this->buffer_position = $e + strlen(']]>');
return(1);
case MARKUP_PARSER_GET_ENTITY:
$s = $this->buffer_position + 1;
$c = $this->FindCharacters(';', $e, $s);
if(strlen($c) == 0)
{
if(!$this->SetUnterminatedWarning('entity', $end, $need_more_data))
return(0);
if($need_more_data
&& !$end)
return(1);
$need_more_data = 0;
$entity = substr($this->buffer, $this->buffer_position);
}
else
$entity = substr($this->buffer, $this->buffer_position, $e + 1 - $this->buffer_position);
if(!$this->ParseEntity($entity, $this->offset + $this->buffer_position, 1, $element, $advance))
return(0);
$this->buffer_position += $advance;
$this->state = $this->last_state;
return(1);
case MARKUP_PARSER_GET_DTD_ENTITY:
$s = $this->SkipCharacters(" \t\r\n", $this->buffer_position + strlen('<!ENTITY'));
if($s >= $l)
return($this->SetUnterminatedDTDEntityError($end, $need_more_data));
$entity = array(
'Type'=>'DTD_ENTITY',
'Internal'=>0,
);
$this->GetTagAttribute($s, '', $name, $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDTDEntityError($end, $need_more_data));
if(!strcmp($name, '%'))
{
$entity['Internal'] = 1;
$this->GetTagAttribute($s, '', $name, $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDTDEntityError($end, $need_more_data));
}
$entity['Name'] = $name;
$this->GetTagAttribute($s, '', $value, $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDTDEntityError($end, $need_more_data));
switch($value)
{
case 'PUBLIC':
$this->GetTagAttribute($s, '', $entity['Identifier'], $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDTDEntityError($end, $need_more_data));
case 'SYSTEM':
$this->GetTagAttribute($s, '', $entity['URI'], $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDTDEntityError($end, $need_more_data));
break;
case 'CDATA':
if(!$this->SkipComment($s, $s, $end, $need_more_data))
return($this->SetPositionedError('unterminated DTD entity comment', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
if($this->buffer[$s] != '>')
{
$this->GetTagAttribute($s, '', $entity['CDATA'], $need_more_data);
if($need_more_data)
return($this->SetUnterminatedDTDEntityError($end, $need_more_data));
}
default:
$entity['Value'] = $value;
break;
}
while($this->buffer[$s] != '>')
{
if(!$this->SkipComment($s, $e, $end, $need_more_data))
return($this->SetPositionedError('unterminated DTD entity comment', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
if($need_more_data)
return(1);
if($s == $e)
{
if(!$this->SetPositionedWarning('unknown type of parameters after the end of DTD entity declaration', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s))
return(0);
break;
}
$s = $e;
}
$element = $entity;
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $s + ($this->buffer[$s] == '>' ? 1 : 0);
$this->state = $this->last_state;
return(1);
case MARKUP_PARSER_GET_DTD_ELEMENT:
$s = $this->SkipCharacters(" \t\r\n", $this->buffer_position + strlen('<!ELEMENT'));
if($s >= $l)
return($this->SetUnterminatedError('DTD element', $end, $need_more_data));
$c = $this->FindCharacters('>', $e, $s);
if(strlen($c) == 0)
{
if($end)
return($this->SetUnterminatedError('DTD element', $end, $need_more_data));
$need_more_data = 1;
return(1);
}
$dtd_element = array(
'Type'=>'DTD_ELEMENT',
);
if(!$this->ParseDTDExpression($s, $dtd_element['Name']))
return(0);
if(!IsSet($dtd_element['Name']))
return($this->SetPositionedError('invalid DTD expression', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
$s = $this->SkipCharacters(" \t\r\n", $s);
$o = $s;
$this->GetTagAttribute($s, '', $start, $need_more_data);
if($need_more_data)
return($this->SetUnterminatedError('DTD element', $end, $need_more_data));
switch($start)
{
case '-':
case 'O':
$dtd_element['Start'] = ($start != 'O');
break;
default:
$dtd_element['Start'] = $dtd_element['End'] = 1;
$s = $o;
}
if($s > $o)
{
$s = $this->SkipCharacters(" \t\r\n", $s);
$this->GetTagAttribute($s, '', $end, $need_more_data);
if($need_more_data)
return($this->SetUnterminatedError('DTD element', $end, $need_more_data));
switch($start)
{
case '-':
case 'O':
break;
default:
return($this->SetPositionedError('invalid DTD end tag omission value for element '.$dtd_element['Name'], MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
}
$dtd_element['End'] = ($end != 'O');
$s = $this->SkipCharacters(" \t\r\n", $s);
}
if(!$this->ParseDTDExpression($s, $dtd_element['Content']))
return(0);
if(!IsSet($dtd_element['Content']))
return($this->SetPositionedError('invalid DTD expression', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
$s = $this->SkipCharacters(" \t\r\n", $s);
if(!$this->ParseDTDExpression($s, $additional))
return(0);
if(IsSet($additional))
$dtd_element['Additional'] = $additional;
while($this->buffer[$s] != '>')
{
$s = $this->SkipCharacters(" \t\r\n", $s);
if(!$this->SkipComment($s, $e, $end, $need_more_data))
return($this->SetPositionedError('unterminated DTD entity comment', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
if($need_more_data)
return(1);
if($s == $e)
{
if(!$this->SetPositionedWarning('unknown type of parameters after the end of DTD entity declaration', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s))
return(0);
break;
}
$s = $e;
}
$element = $dtd_element;
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $s + ($this->buffer[$s] == '>' ? 1 : 0);
$this->state = $this->last_state;
return(1);
case MARKUP_PARSER_GET_DTD_ATTLIST:
if(!$this->SkipComment($this->buffer_position + strlen('<!ATTLIST'), $s, $end, $need_more_data))
return($this->SetUnterminatedError('DTD attribute list', $end, $need_more_data));
$c = $this->FindCharacters('>', $e, $s);
if(strlen($c) == 0)
{
if($end)
return($this->SetUnterminatedError('DTD attribute list', $end, $need_more_data));
$need_more_data = 1;
return(1);
}
$attlist = array(
'Type'=>'DTD_ATTLIST',
'Attributes'=>array(),
'Entities'=>array()
);
if(!$this->ParseDTDExpression($s, $attlist['Name']))
return(0);
if(!IsSet($attlist['Name']))
return($this->SetPositionedError('invalid DTD expression', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $this->offset + $s));
if(!$this->ParseAttList($s, $attlist, $end, $need_more_data))
return(0);
$element = $attlist;
if($this->store_positions)
$element['Position'] = $this->offset + $this->buffer_position;
$this->buffer_position = $s + 1;
$this->state = $this->last_state;
return(1);
default:
return($this->SetPositionedError($this->state.' is not a valid parser state', MARKUP_PARSER_ERROR_UNEXPECTED, $this->offset + $this->buffer_position));
}
}
/*
 * Append a chunk of data to the parse buffer and extract the elements
 * that can be parsed from the buffer.
 *
 * data     - chunk of document data to append to the buffer.
 * end      - tells whether this is the last chunk of the document; it is
 *            forwarded to ParseElement.
 * elements - returns the array of elements parsed during this call.
 *
 * Returns 1 on success. Returns 0 when ParseElement or GetPositionLine
 * fail, or the value returned by SetError when the data ended before
 * the parser reached the MARKUP_PARSER_END state.
 */
Function ParseData($data, $end, &$elements)
{
	$length = strlen($data);
	if($this->track_lines
	&& $length)
	{
		$line = $this->last_line;
		$position = 0;
		if($this->last_carriage_return)
		{
			/* The previous chunk ended with a carriage return: a line
			   feed that starts this chunk is part of the same break. */
			if($data[0] == "\n")
				$position = 1;
			$this->lines[++$line] = $this->line_offset + $position;
			$this->last_carriage_return = 0;
		}
		while($position < $length)
		{
			/* Jump to the next line break character, if any. */
			$position += strcspn($data, "\r\n", $position);
			if($position >= $length)
				break;
			$carriage_return = ($data[$position] == "\r");
			++$position;
			if($carriage_return)
			{
				if($position >= $length)
				{
					/* The line feed may only arrive with the next chunk,
					   so the new line is recorded by the next call. */
					$this->last_carriage_return = 1;
					break;
				}
				if($data[$position] == "\n")
					++$position;
			}
			/* Record the document offset at which the new line starts. */
			$this->lines[++$line] = $this->line_offset + $position;
		}
		$this->last_line = $line;
		$this->line_offset += $length;
	}
	$elements = array();
	$this->buffer .= $data;
	for(;;)
	{
		if(!$this->ParseElement($end, $element, $need_more_data))
			return(0);
		if(IsSet($element))
		{
			if($this->track_lines
			&& $this->store_positions
			&& !$this->GetPositionLine($element['Position'], $element['Line'], $element['Column']))
				return(0);
			$elements[] = $element;
			UnSet($element);
		}
		if($need_more_data
		|| $this->state == MARKUP_PARSER_END)
			break;
	}
	if($end
	&& $this->state != MARKUP_PARSER_END)
		return($this->SetError('reached a premature end of data', MARKUP_PARSER_ERROR_INVALID_SYNTAX));
	if($this->buffer_position > 0)
	{
		/* Drop the consumed part of the buffer and account for it in
		   the document offset. */
		$this->buffer = substr($this->buffer, $this->buffer_position);
		$this->offset += $this->buffer_position;
		$this->buffer_position = 0;
	}
	return(1);
}
/*
 * Build the markup of one attribute, preceded by a space.
 *
 * attribute - attribute name; it is wrapped in double quotes when it
 *             contains whitespace.
 * value     - attribute value; when it is an empty string only the name
 *             is returned.
 *
 * The value is wrapped in double quotes when the quote_attribute_values
 * variable is set or when the value contains whitespace or a double
 * quote. Double quotes inside a quoted value are written as &quot; so
 * they do not terminate the attribute value.
 *
 * Returns the attribute markup string.
 */
Function GetAttributeValue($attribute, $value)
{
	$whitespace = " \t\r\n";
	$markup = ' '.(strcspn($attribute, $whitespace) < strlen($attribute) ? '"'.$attribute.'"' : $attribute);
	if(strlen($value))
	{
		/* A value with a double quote can only be written safely inside
		   quotes with the quote character escaped. */
		if($this->quote_attribute_values
		|| strcspn($value, $whitespace.'"') < strlen($value))
			$value = '"'.str_replace('"', '&quot;', $value).'"';
		$markup .= '='.$value;
	}
	return($markup);
}
/* Public functions */
/*
{metadocument}
<function>
<name>GetPositionLine</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Get the line number of the document that corresponds to a
given position.</purpose>
<usage>Pass the document offset number as the position to be
located. Make sure the <variablelink>track_lines</variablelink>
variable is set to <booleanvalue>1</booleanvalue> before parsing
the document.</usage>
<returnvalue>This function returns <booleanvalue>1</booleanvalue> if
the <variablelink>track_lines</variablelink> variable is set to
<booleanvalue>1</booleanvalue> and it was given a valid positive
position number that does not exceed the position of the last
parsed document line.</returnvalue>
</documentation>
<argument>
<name>position</name>
<type>INTEGER</type>
<documentation>
<purpose>Position of the line to be located.</purpose>
</documentation>
</argument>
<argument>
<name>line</name>
<type>INTEGER</type>
<out />
<documentation>
<purpose>Returns the number of the line that corresponds to the
given document position.</purpose>
</documentation>
</argument>
<argument>
<name>column</name>
<type>INTEGER</type>
<out />
<documentation>
<purpose>Returns the number of the column of the line that
corresponds to the given document position.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function GetPositionLine($position, &$line, &$column)
{
	if(!$this->track_lines)
		return($this->SetPositionedError('line positions are not being tracked', MARKUP_PARSER_ERROR_INVALID_USAGE, $position));
	if($position < 0)
		return($this->SetPositionedError('it was not specified a valid position', MARKUP_PARSER_ERROR_INVALID_USAGE, $position));
	/* Binary search over the recorded line start offsets. */
	$low = 0;
	$high = count($this->lines) - 1;
	$found = 0;
	do
	{
		$line = intval(($low + $high) / 2);
		$start = $this->lines[$line];
		if($start < $position)
			$low = $line + 1;
		elseif($start > $position)
			$high = $line - 1;
		else
			$found = 1;
	}
	while(!$found
	&& $high >= $low);
	/* Without an exact match, the position belongs to the line at the
	   upper bound left by the search. */
	if(!$found)
		$line = $high;
	/* Columns and lines are returned counting from 1. */
	$column = $position - $this->lines[$line] + 1;
	++$line;
	return(1);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>ParseDTDExpressionValue</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Parse the value of an element expression used in a
DTD.</purpose>
<usage>Use only if you need to expand entity values when parsing
DTDs.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if it is given a
valid DTD expression value.</returnvalue>
</documentation>
<argument>
<name>value</name>
<type>STRING</type>
<documentation>
<purpose>DTD expression value to be parsed.</purpose>
</documentation>
</argument>
<argument>
<name>expression</name>
<type>HASH</type>
<out />
<documentation>
<purpose>Array that defines the types and values of the parsed DTD
expression.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function ParseDTDExpressionValue($value, &$expression)
{
	/* Temporarily replace the parser buffer with the given value so that
	   ParseDTDExpression parses it; the buffer is restored on exit. */
	$saved_buffer = $this->buffer;
	$saved_position = $this->buffer_position;
	$this->buffer = $value;
	$this->buffer_position = 0;
	$parsed = 0;
	$success = $this->ParseDTDExpression($parsed, $expression);
	if($success)
	{
		/* Anything left after the parsed expression is a syntax error. */
		if(strlen($value) > $parsed)
			$success = $this->SetPositionedError('DTD expression syntax error', MARKUP_PARSER_ERROR_INVALID_SYNTAX, $parsed);
	}
	$this->buffer_position = $saved_position;
	$this->buffer = $saved_buffer;
	return($success);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>ParseAttributeList</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Parse the value of an attribute list expression used in a
DTD.</purpose>
<usage>Use only if you need to expand attribute list values when
parsing DTDs.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if it is given a
valid DTD attribute list expression value.</returnvalue>
</documentation>
<argument>
<name>value</name>
<type>STRING</type>
<documentation>
<purpose>Attribute list expression value to be parsed.</purpose>
</documentation>
</argument>
<argument>
<name>attlist</name>
<type>HASH</type>
<out />
<documentation>
<purpose>Array that defines the types and values of the parsed DTD
attribute list expression.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function ParseAttributeList($value, &$attlist)
{
	/* Temporarily replace the parser buffer with the given value followed
	   by a tag end character and let ParseAttList parse it as the last
	   chunk of data; the buffer is restored on exit. */
	$saved_buffer = $this->buffer;
	$saved_position = $this->buffer_position;
	$this->buffer_position = 0;
	$this->buffer = $value.'>';
	$start = 0;
	$success = $this->ParseAttList($start, $attlist, 1, $need_more_data);
	$this->buffer_position = $saved_position;
	$this->buffer = $saved_buffer;
	return($success);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>StartParsing</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Initialize the state of the markup parser.</purpose>
<usage>Call this function before starting to parse the markup document,
passing the file name or data to be parsed and eventually other
parsing option parameters.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if all
parameters are correctly defined.</returnvalue>
</documentation>
<argument>
<name>parameters</name>
<type>HASH</type>
<documentation>
<purpose>Specifies a list of options that define how to parse the
given document. Currently it has the following options:
<paragraphbreak />
<tt>Data</tt> - String with the markup data to be parsed
<paragraphbreak />
<tt>File</tt> - Name of the file from which the data to be parsed
should be read instead of a static string.
<paragraphbreak />
<tt>DecodeEntities</tt> - Alternative way to set the option for
determining whether the class should decode character entities,
as described for the
<variablelink>decode_entities</variablelink>.
</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function StartParsing($parameters)
{
	/* Release the file or data left by a previous parse that was not
	   finished. Parse reads from the file when one is set, so a stale
	   open file would otherwise take precedence over the data given
	   now. */
	$this->FinishParsing();
	$this->error = '';
	$this->warnings = array();
	$this->state = MARKUP_PARSER_START;
	/* NOTE(review): this self-assignment does not reset last_state, so
	   the value left by a previous parse is kept. The intended initial
	   value is not visible here - confirm and reset it explicitly. */
	$this->last_state = $this->last_state;
	$this->state_stack = array();
	$this->buffer = '';
	$this->buffer_position = 0;
	$this->offset = 0;
	/* The first line always starts at document offset 0. */
	$this->lines = ($this->track_lines ? array(0 => 0) : array());
	$this->line_offset = 0;
	$this->last_line = 0;
	$this->last_carriage_return = 0;
	/* The DecodeEntities parameter overrides the decode_entities
	   variable for this parse only. */
	$this->decoding_entities = (IsSet($parameters['DecodeEntities']) ? intval($parameters['DecodeEntities']) : $this->decode_entities);
	if(IsSet($parameters['File']))
	{
		if(!($this->file = @fopen($parameters['File'], 'r')))
		{
			UnSet($this->file);
			/* $php_errormsg is not available in current PHP versions, so
			   the failure reason is taken from error_get_last when that
			   function exists. The variable is left undefined when no
			   reason is known. */
			$last_error = (function_exists('error_get_last') ? error_get_last() : null);
			if(IsSet($last_error['message']))
				$php_error_message = $last_error['message'];
			elseif(IsSet($php_errormsg))
				$php_error_message = $php_errormsg;
			return($this->SetPHPError('Could not open the file '.$parameters['File'], MARKUP_PARSER_ERROR_INVALID_USAGE, $php_error_message));
		}
	}
	elseif(IsSet($parameters['Data']))
		$this->data = $parameters['Data'];
	else
		return($this->SetError('it was not specified the markup data to parse', MARKUP_PARSER_ERROR_INVALID_USAGE));
	return(1);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>Parse</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Parse the markup document.</purpose>
<usage>Call this function iteratively until the <argumentlink>
<function>Parse</function>
<argument>end</argument>
</argumentlink> argument is returned set to
<booleanvalue>1</booleanvalue>.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if there were no
fatal parsing errors.</returnvalue>
</documentation>
<argument>
<name>end</name>
<type>BOOLEAN</type>
<out />
<documentation>
<purpose>Determine when the parser reached the end of the
document.</purpose>
</documentation>
</argument>
<argument>
<name>elements</name>
<type>ARRAY</type>
<out />
<documentation>
<purpose>Return a sequence of associative arrays with entries that
describe each document element that was parsed.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function Parse(&$end, &$elements)
{
	if(IsSet($this->file))
	{
		/* Forget any earlier PHP error so that a read failure is not
		   reported with the message of an unrelated error. */
		if(function_exists('error_clear_last'))
			error_clear_last();
		$data = @fread($this->file, $this->buffer_length);
		if(GetType($data) != 'string')
		{
			/* $php_errormsg is not available in current PHP versions, so
			   the failure reason is taken from error_get_last when that
			   function exists. The variable is left undefined when no
			   reason is known. */
			$last_error = (function_exists('error_get_last') ? error_get_last() : null);
			if(IsSet($last_error['message']))
				$php_error_message = $last_error['message'];
			elseif(IsSet($php_errormsg))
				$php_error_message = $php_errormsg;
			$this->SetPHPError('Could not read from the file', MARKUP_PARSER_ERROR_INVALID_USAGE, $php_error_message);
			$this->FinishParsing();
			return(0);
		}
		$end = feof($this->file);
	}
	elseif(IsSet($this->data)
	&& strlen($this->data))
	{
		/* The length is tested instead of the truth value so that a
		   document made only of the character 0 is still parsed. The
		   whole data string is parsed in a single call. */
		$data = $this->data;
		$end = 1;
	}
	else
	{
		/* There is neither an open file nor data left to parse. The end
		   argument is not read here because it was not assigned. */
		$this->SetError('there is no data to parse', MARKUP_PARSER_ERROR_INVALID_USAGE);
		$this->FinishParsing();
		return(0);
	}
	/* The input is no longer needed once the last chunk was read. */
	if($end)
		$this->FinishParsing();
	if($this->ParseData($data, $end, $elements))
		return(1);
	$this->FinishParsing();
	return(0);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>FinishParsing</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Close any files and release any resources allocated while
the document was being parsed.</purpose>
<usage>Call this function after you are done with parsing the markup
document.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if all resources
were successfully released.</returnvalue>
</documentation>
<do>
{/metadocument}
*/
Function FinishParsing()
{
	/* Close the input file if one is still open. */
	$file_is_open = IsSet($this->file);
	if($file_is_open)
	{
		fclose($this->file);
		UnSet($this->file);
	}
	/* Discard the input data string if one is still set. */
	$has_data = IsSet($this->data);
	if($has_data)
		UnSet($this->data);
	return(1);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>RewriteElement</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Generate a string for a previously parsed document markup
element.</purpose>
<usage>Call this function for each markup element when you want to
regenerate an element that was just parsed and eventually
filtered.</usage>
<returnvalue>Returns <booleanvalue>0</booleanvalue> if it is passed an
invalid element definition.</returnvalue>
</documentation>
<argument>
<name>element</name>
<type>HASH</type>
<documentation>
<purpose>Associative array that defines the type and the values of
the document element to be rewritten.</purpose>
</documentation>
</argument>
<argument>
<name>markup</name>
<type>STRING</type>
<out />
<documentation>
<purpose>Return the string of the rewritten document element.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function RewriteElement($element, &$markup)
{
	if(!IsSet($element['Type']))
		return($this->SetError('it was not specified a valid element type', MARKUP_PARSER_ERROR_INVALID_USAGE));
	$type = $element['Type'];
	switch($type)
	{
		case 'TAG':
		case 'ENDTAG':
			$is_end_tag = ($type == 'ENDTAG');
			$name = ($this->tag_lower_case ? strtolower($element['Name']) : $element['Name']);
			$markup = ($is_end_tag ? '</' : '<').$name;
			/* Attributes are only written for start tags. */
			if(!$is_end_tag
			&& IsSet($element['Attributes']))
			{
				foreach($element['Attributes'] as $attribute => $value)
				{
					if($this->tag_lower_case)
						$attribute = strtolower($attribute);
					$markup .= $this->GetAttributeValue($attribute, $value);
				}
			}
			if(IsSet($element['Close']))
				$markup .= ' /';
			$markup .= '>';
			break;
		case 'DATA':
			$markup = $element['Data'];
			break;
		case 'DOCTYPE':
			/* Every DOCTYPE parameter is only written when the root
			   element name is set. */
			$markup = '<!DOCTYPE';
			if(IsSet($element['Root']))
			{
				$markup .= $this->GetAttributeValue($element['Root'], '');
				if(IsSet($element['Availability']))
				{
					$markup .= $this->GetAttributeValue($element['Availability'], '');
					if($element['Availability']=='PUBLIC'
					&& IsSet($element['Identifier']))
						$markup .= $this->GetAttributeValue($element['Identifier'], '');
					if(IsSet($element['URL']))
						$markup .= $this->GetAttributeValue($element['URL'], '');
				}
				if(IsSet($element['DTD']))
					$markup .= ' ['.$element['DTD'].']';
			}
			$markup .= '>';
			break;
		case 'COMMENT':
			$markup = '<!--'.$element['Comment'].'-->';
			break;
		case 'CDATA':
			$markup = '<![CDATA['.$element['Data'].']]>';
			break;
		case 'ENTITY':
			/* Named entities take precedence over the character code. */
			if(IsSet($element['Entity']))
				$markup = '&'.$element['Entity'].';';
			else
				$markup = '&#'.$element['Code'].';';
			break;
		case 'CONDITION':
			$markup = ($element['Hidden'] ? '<!--[if ' : '<![if ').$element['Condition'].']>';
			break;
		case 'END_CONDITION':
			$markup = ($element['Hidden'] ? '<![endif]-->' : '<![endif]>');
			break;
		default:
			return($this->SetPositionedError('rewriting elements of type '.$element['Type'].' is not yet supported', MARKUP_PARSER_ERROR_UNEXPECTED, IsSet($element['Position']) ? $element['Position'] : -1));
	}
	return(1);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
};
/*
{metadocument}
</class>
{/metadocument}
*/
?>