<?php
/*
* markup_filter_safe_html.php
*
* @(#) $Id: markup_filter_safe_html.php,v 1.42 2009/08/23 06:49:45 mlemos Exp $
*
*/
/* States of the filter state machine, stored in the state class variable. */
define('MARKUP_FILTER_SAFE_HTML_START', 0);
define('MARKUP_FILTER_SAFE_HTML_GET_ELEMENT', 1);
define('MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE', 2);
define('MARKUP_FILTER_SAFE_HTML_CHECK_STYLES', 3);
define('MARKUP_FILTER_SAFE_HTML_FLUSH_STYLES', 4);
define('MARKUP_FILTER_SAFE_HTML_END', 5);
/*
 * Error codes that alias the MARKUP_PARSER_ERROR_* constants. Those
 * constants are not defined in the visible part of this file; presumably
 * the markup parser class file has to be loaded first (TODO confirm).
 */
define('MARKUP_FILTER_SAFE_HTML_ERROR_NONE', MARKUP_PARSER_ERROR_NONE);
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED', MARKUP_PARSER_ERROR_UNEXPECTED);
define('MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX', MARKUP_PARSER_ERROR_INVALID_SYNTAX);
define('MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE', MARKUP_PARSER_ERROR_INVALID_USAGE);
/* Error codes specific to this filter class. */
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG', 201);
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE', 202);
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_CSS_STYLE', 203);
define('MARKUP_FILTER_SAFE_HTML_ERROR_SSI_COMMENT', 204);
/*
{metadocument}<?xml version="1.0" encoding="ISO-8859-1" ?>
<class>
<package>net.manuellemos.markupparser</package>
<version>@(#) $Id: markup_filter_safe_html.php,v 1.42 2009/08/23 06:49:45 mlemos Exp $</version>
<copyright>Copyright © (C) Manuel Lemos 2009</copyright>
<title>Markup filter safe HTML</title>
<author>Manuel Lemos</author>
<authoraddress>mlemos-at-acm.org</authoraddress>
<documentation>
<idiom>en</idiom>
<purpose>Parse an HTML document and remove all unsafe tags and CSS
styles that may contain Javascript code and other harmful HTML
structures.
<paragraphbreak />
Unsafe HTML is often submitted by untrusted users to sites that
accept user submitted content. Such HTML may contain Javascript that
could be used to perform <link>
<data>cross-site scripting</data>
<url>http://en.wikipedia.org/wiki/Cross-site_scripting</url>
</link> (XSS) or <link>
<data>cross-site request forgery</data>
<url>http://en.wikipedia.org/wiki/Cross-site_request_forgery</url>
</link> (CSRF) attacks.</purpose>
<usage>Use the <functionlink>StartParsing</functionlink> function to
initialize the parser. Then use the
<functionlink>Parse</functionlink> function to make the class parse
HTML data, eventually read from files. When you are done with
feeding the whole document data, call the
<functionlink>FinishParsing</functionlink> function.
<paragraphbreak />
The <functionlink>Parse</functionlink> function returns arrays of
tokens that describe each document element. The
<functionlink>RewriteElement</functionlink> function can be used to
convert the tokens back to HTML document strings.
<paragraphbreak />
By default, the class uses the markup validator class to parse the
HTML documents before it actually analyzes and filters unsafe tags
from the documents. Use the <functionlink>SetInput</functionlink>
function to set a different filter object as source of parsed
document elements.
<paragraphbreak />
Element tokens are associated to the respective positions
in the document. Positions are numbers that represent their offsets
relative to beginning of the document. The
<functionlink>GetPositionLine</functionlink> function can return the
line and column number associated to a given document position if
the <variablelink>track_lines</variablelink> is set to
<booleanvalue>1</booleanvalue>.
<paragraphbreak />
The class may also parse and filter individual CSS stylesheets using
the function <functionlink>FilterStylesheet</functionlink>. The
function <functionlink>GetStylesheetPositionLine</functionlink> may
be used to determine the line associated to the position of an
error.</usage>
</documentation>
{/metadocument}
*/
class markup_filter_safe_html_class
{
/*
{metadocument}
<variable>
<name>error</name>
<type>STRING</type>
<value></value>
<documentation>
<purpose>Store the message that is returned when an error
occurs.</purpose>
<usage>Check this variable to understand what happened when a call to
any of the class functions has failed.<paragraphbreak />
This class uses cumulative error handling. This means that if one
of the class functions that may fail is called and this variable was
already set to an error message due to a failure in a previous call
to the same or another function, the function will also fail and will
not do anything.<paragraphbreak />
This allows programs using this class to safely call several
functions that may fail and only check the failure condition after
the last function call.<paragraphbreak />
Just set this variable to an empty string to clear the error
condition.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $error = '';
/*
{metadocument}
<variable>
<name>error_code</name>
<type>INTEGER</type>
<value>0</value>
<documentation>
<purpose>Store the code that is returned when an error
occurs.</purpose>
<usage>Check this variable to understand what happened when a call
to any of the class functions has failed. It may be set to several
possible error codes defined as constants:<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_NONE</tt> - No error happened
<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED</tt> - It was found a
condition that the class is not yet ready to handle
<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX</tt> - A syntax
error was found
<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE</tt> - An invalid
value was passed to the class function parameters or set to the
class variables
<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG</tt> - A tag
considered unsafe was found
<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE</tt> - A tag
attribute considered unsafe was found
<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_CSS_STYLE</tt> - A CSS
style considered unsafe was found
<paragraphbreak />
<tt>MARKUP_FILTER_SAFE_HTML_ERROR_SSI_COMMENT</tt> - An HTML
comment with Server Side Include (SSI) commands was found</usage>
</documentation>
</variable>
{/metadocument}
*/
var $error_code = MARKUP_FILTER_SAFE_HTML_ERROR_NONE;
/*
{metadocument}
<variable>
<name>error_position</name>
<type>INTEGER</type>
<value>-1</value>
<documentation>
<purpose>Point to the position of the markup data or file that
refers to the last error that occurred.</purpose>
<usage>Check this variable to determine the relevant position of the
document when a parsing error occurs.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $error_position = -1;
/*
{metadocument}
<variable>
<name>buffer_length</name>
<type>INTEGER</type>
<value>8000</value>
<documentation>
<purpose>Maximum length of the chunks of markup data read from files
that the class parses at one time.</purpose>
<usage>Adjust this value according to the available memory.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $buffer_length = 8000;
/*
{metadocument}
<variable>
<name>ignore_syntax_errors</name>
<type>BOOLEAN</type>
<value>1</value>
<documentation>
<purpose>Specify whether the class should ignore syntax errors in
malformed documents.</purpose>
<usage>Set this variable to <booleanvalue>0</booleanvalue> if it is
necessary to verify whether markup data may be corrupted due to
possible bugs in the program that generated the
document.<paragraphbreak />
Currently the class only ignores some types of syntax errors.
Other syntax errors may still cause the
<functionlink>Parse</functionlink> to fail.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $ignore_syntax_errors=1;
/*
{metadocument}
<variable>
<name>warnings</name>
<type>HASH</type>
<value></value>
<documentation>
<purpose>Return a list of positions of the original document that
contain syntax errors.</purpose>
<usage>Check this variable to retrieve eventual document syntax
errors that were ignored when the
<variablelink>ignore_syntax_errors</variablelink> is set to
<booleanvalue>1</booleanvalue>.<paragraphbreak />
The indexes of this array are the positions of the errors. The
array values are the corresponding syntax error messages.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $warnings=array();
/*
{metadocument}
<variable>
<name>store_positions</name>
<type>BOOLEAN</type>
<value>1</value>
<documentation>
<purpose>Tell the class to return the position of each document
element token.</purpose>
<usage>Set this variable to <integervalue>0</integervalue> if you do
not need to know the position of each parsed markup element.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $store_positions = 1;
/*
{metadocument}
<variable>
<name>track_lines</name>
<type>BOOLEAN</type>
<value>0</value>
<documentation>
<purpose>Tell the class to keep track of the position of each document
line.</purpose>
<usage>Set this variable to <integervalue>1</integervalue> if you
need to determine the line and column number associated to a given
position of the parsed document.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $track_lines = 0;
/*
{metadocument}
<variable>
<name>unsafe_tags</name>
<type>HASH</type>
<value></value>
<documentation>
<purpose>List of tags that may be unsafe.</purpose>
<usage>Change the default list of unsafe tags only if you realize
there are more tags that should be considered unsafe.
<paragraphbreak />
Currently, the tags considered unsafe are: APPLET, IFRAME, OBJECT
and SCRIPT.
<paragraphbreak />
It is not necessary to add proprietary tags because those will be
discarded by another class that validates the HTML according to a
standard DTD.
<paragraphbreak />
All the entries in this array variable must be set with a key with
the name of the unsafe tag that the class should discard. The
entry values should be set to an empty array to allow eventual
parameters in future versions of this class.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $unsafe_tags = array(
'APPLET'=>array(),
'IFRAME'=>array(),
'OBJECT'=>array(),
'SCRIPT'=>array(),
);
/*
{metadocument}
<variable>
<name>safe_proprietary_css_properties</name>
<type>HASH</type>
<value></value>
<documentation>
<purpose>List of proprietary CSS properties that should be
considered safe to allow. Proprietary properties start with the -
character.</purpose>
<usage>Change the default list of safe CSS properties only if you
realize there are more properties that should be considered safe.
<paragraphbreak />
All the entries in this array variable must be set with a key with
the name of the safe property that the class should allow. The
entry values should be set to an empty array to allow eventual
parameters in future versions of this class.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $safe_proprietary_css_properties = array(
);
/*
{metadocument}
<variable>
<name>safe_css_property_types</name>
<type>HASH</type>
<value></value>
<documentation>
<purpose>List of types of expressions that should be considered safe
to allow in CSS style values.</purpose>
<usage>Change the default list of safe CSS property types only if
you realize there are more properties that should be considered
safe.
<paragraphbreak />
Currently, the types considered safe are: delimiter, dimension,
function, hash, identifier, number, percentage, string and uri.
<paragraphbreak />
All the entries in this array variable must be set with a key with
the name of the safe property type that the class should allow.
The entry values should be set to an empty array to allow eventual
parameters in future versions of this class.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $safe_css_property_types = array(
'delimiter'=>array(),
'dimension'=>array(),
'function'=>array(),
'hash'=>array(),
'identifier'=>array(),
'number'=>array(),
'percentage'=>array(),
'string'=>array(),
'uri'=>array(),
);
/*
{metadocument}
<variable>
<name>safe_css_property_functions</name>
<type>HASH</type>
<value></value>
<documentation>
<purpose>List of the names of functions that should be considered
safe to allow in CSS style values.</purpose>
<usage>Change the default list of safe CSS property functions only
if you realize there are more functions that should be considered
safe.
<paragraphbreak />
All the entries in this array variable must be set with a key with
the name of the safe function that the class should allow. The
entry values should be set to an empty array to allow eventual
parameters in future versions of this class.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $safe_css_property_functions = array();
/*
{metadocument}
<variable>
<name>safe_url_schemes</name>
<type>HASH</type>
<value></value>
<documentation>
<purpose>List of schemes that should be considered safe to allow in
URLs.</purpose>
<usage>Change the default list of safe URL schemes only if you
realize there are more schemes that should be considered safe.
<paragraphbreak />
All the entries in this array variable must be set with a key with
the name of the scheme that the class should allow. The entry
values should be set to an empty array to allow eventual
parameters in future versions of this class.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $safe_url_schemes = array(
'ftp' => array(),
'ftps' => array(),
'http' => array(),
'https' => array(),
'mailto' => array(),
'news' => array(),
'nntp' => array(),
);
/*
{metadocument}
<variable>
<name>allow_server_side_includes</name>
<type>BOOLEAN</type>
<value>0</value>
<documentation>
<purpose>Tell the class whether it should parse HTML comments to
determine whether they contain server side includes (SSI) that
should be filtered.</purpose>
<usage>Set this variable to <booleanvalue>1</booleanvalue> only if
you need to retrieve all comments unfiltered, even if they may
contain server side include commands.</usage>
</documentation>
</variable>
{/metadocument}
*/
var $allow_server_side_includes = 0;
/* Private variables */
/* Object that supplies the parsed document elements; set by SetInput or
   created by StartParsing when none was set. */
var $input;
/* Parsed elements handed to FilterElements that are still waiting to be
   filtered. */
var $buffer = array();
/* Index in the buffer of the next element to be examined. */
var $buffer_position = 0;
/* Current state of the filter state machine (one of the
   MARKUP_FILTER_SAFE_HTML_* state constants). */
var $state = MARKUP_FILTER_SAFE_HTML_START;
/* Upper case name of the tag whose contents are being skipped or, for
   STYLE tags, checked. */
var $unsafe = '';
/* Filtered contents of the current STYLE element waiting to be returned. */
var $safe_styles = array();
/* Index in safe_styles of the next entry to return. */
var $flush_style = 0;
/* CSS parser object created by the last call to FilterStylesheet. */
var $css;
/* Private functions */
/*
 * Record an error message together with its numeric error code.
 *
 * Always returns 0, which lets callers fail with a single
 * return($this->SetError(...)) statement.
 */
Function SetError($error, $code)
{
	$this->error_code = $code;
	$this->error = $error;
	return 0;
}
/*
 * Record an error and the document position it refers to.
 *
 * Returns whatever SetError returns, which is always 0.
 */
Function SetPositionedError($error, $code, $position)
{
	$result = $this->SetError($error, $code);
	$this->error_position = $position;
	return $result;
}
/*
 * Report a problem that may be tolerated.
 *
 * When ignore_syntax_errors is enabled the message is stored in the
 * warnings array, indexed by position, and 1 is returned so the caller
 * can carry on. Otherwise the problem becomes a positioned error and 0
 * is returned.
 */
Function SetPositionedWarning($error, $code, $position)
{
	if($this->ignore_syntax_errors)
	{
		$this->warnings[$position] = $error;
		return 1;
	}
	return $this->SetPositionedError($error, $code, $position);
}
/*
 * Decide whether a URL may be kept in the filtered document.
 *
 * $url is the decoded URL text taken from an attribute or a CSS uri()
 * expression.
 *
 * Returns 1 when the URL is acceptable and 0 when it must be discarded.
 * A URL is discarded when:
 * - the first delimiter found in it is a control character or a space
 *   (for instance a tab or space in the middle of the scheme name),
 * - parse_url can not parse it,
 * - it has a scheme that is not listed in safe_url_schemes.
 */
Function CheckUnsafeURL($url)
{
	$control = "\0\1\2\3\4\5\6\7\x8\x9\xa\xb\xc\xd\xe\xf\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ";
	/* Drop leading control characters and spaces. */
	$l = strlen($url);
	for($first = 0; $first < $l; ++$first)
	{
		if(ord($url[$first]) > 32)
		{
			if($first > 0)
			{
				$url = substr($url, $first);
				$l -= $first;
			}
			break;
		}
	}
	/* Reject the URL when the first delimiter is a control character or
	   a space rather than one of the : / . characters. */
	$delimiter = strcspn($url, $control.':/.');
	if($delimiter < $l
	&& strcspn($url[$delimiter], $control) < 1)
		return(0);
	$trimmed = trim($url);
	if(!($parsed = @parse_url($trimmed)))
		return(0);
	if(IsSet($parsed['scheme'])
	&& !IsSet($this->safe_url_schemes[strtolower($parsed['scheme'])]))
		return(0);
	/* parse_url treats "name:digits/..." as host:port and reports no
	   scheme, so a URL like javascript:1/alert(1) would pass the check
	   above. Extract the scheme using the RFC 3986 syntax as well, since
	   that is how anything before the first colon is interpreted when
	   the URL is followed. */
	if(preg_match('/^([a-z][a-z0-9+.-]*):/i', $trimmed, $match)
	&& !IsSet($this->safe_url_schemes[strtolower($match[1])]))
		return(0);
	return(1);
}
/*
 * Remove unsafe properties from a list of parsed CSS properties.
 *
 * $style is the list of properties, each with Property and Value
 * entries; unsafe entries are unset in place. $position is the document
 * position used for warnings. When $add_element_offset is set, the
 * Position entry of the offending property or value expression is added
 * to $position. $unsafe is set to 1 when at least one property was
 * removed, 0 otherwise.
 *
 * A property is removed when it is a proprietary property (name starting
 * with -) that is not listed in safe_proprietary_css_properties, or when
 * one of its value expressions has a type not listed in
 * safe_css_property_types, calls a function not listed in
 * safe_css_property_functions, or has a URL rejected by CheckUnsafeURL.
 *
 * Returns 1 on success, or 0 when a warning was turned into an error.
 */
Function FilterStyle(&$style, $position, $add_element_offset, &$unsafe)
{
	$unsafe = 0;
	$total_properties = count($style);
	for($index = 0; $index < $total_properties; ++$index)
	{
		$name = $style[$index]['Property'];
		/* An empty reason means that the property is considered safe. */
		$reason = '';
		$offset = 0;
		if($name[0] === '-'
		&& !IsSet($this->safe_proprietary_css_properties[strtolower($name)]))
		{
			$reason = 'proprietary style property '.$name.' is considered unsafe';
			if($add_element_offset)
				$offset = $style[$index]['Position'];
		}
		else
		{
			/* Stop at the first value expression found to be unsafe. */
			$total_values = count($style[$index]['Value']);
			for($v = 0; $v < $total_values && $reason === ''; ++$v)
			{
				$expression = $style[$index]['Value'][$v];
				$type = $expression['Type'];
				if(!IsSet($this->safe_css_property_types[$type]))
					$reason = 'style '.$name.' property has an expression of an unsafe type ('.$type.')';
				elseif($type == 'function')
				{
					$function = $expression['Function'];
					if(!IsSet($this->safe_css_property_functions[$function]))
						$reason = 'style '.$name.' property has an unsafe function ('.$function.')';
				}
				elseif($type == 'uri')
				{
					if(!$this->CheckUnsafeURL($expression['URI']))
						$reason = 'style '.$name.' property has an unsafe URL';
				}
				if($reason !== ''
				&& $add_element_offset)
					$offset = $expression['Position'];
			}
		}
		if($reason === '')
			continue;
		if(!$this->SetPositionedWarning($reason, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_CSS_STYLE, $position + $offset))
			return(0);
		$unsafe = 1;
		Unset($style[$index]);
	}
	return(1);
}
/*
 * Remove unsafe attributes from a tag element.
 *
 * $raw is the tag element; its Attributes and DecodedAttributes entries
 * are replaced by the filtered lists, or removed when nothing is left.
 * $is_style tells whether the element is a STYLE start tag, in which case
 * a TYPE attribute other than text/css makes the whole tag unsafe:
 * $unsafe is set to 1 and $raw is left untouched.
 *
 * Attributes are dropped when their name starts with ON (event
 * handlers), when they are of URI type and CheckUnsafeURL rejects the
 * value, or when they are of StyleSheet type and the value can not be
 * parsed or has no safe properties left. Style sheet attributes with
 * safe properties are rewritten from the filtered properties.
 *
 * Returns 1 on success, or 0 when a warning was turned into an error or
 * the style properties could not be rewritten.
 */
Function FilterAttributes(&$raw, $is_style, &$unsafe)
{
	$unsafe = 0;
	if(!IsSet($raw['Attributes']))
		return(1);
	$attributes = $raw['Attributes'];
	$decoded = (IsSet($raw['DecodedAttributes']) ? $raw['DecodedAttributes'] : array());
	$kept = array();
	$kept_decoded = array();
	foreach($attributes as $attribute => $value)
	{
		$upper = strtoupper($attribute);
		if(substr($upper, 0, 2) === 'ON')
		{
			if(!$this->SetPositionedWarning('unsafe event attribute '.$attribute, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE, $raw['Position']))
				return(0);
			continue;
		}
		/* Fall back to the raw value when no decoded value is available. */
		$decoded_value = (IsSet($decoded[$attribute]) ? $decoded[$attribute] : $value);
		if($is_style
		&& $upper === 'TYPE'
		&& trim(strtolower($decoded_value)) !== 'text/css')
		{
			if(!$this->SetPositionedWarning('unsafe '.($raw['Type'] == 'TAG' ? 'tag' : 'end tag').' <'.$raw['Name'].'> '.$attribute.' attribute value '.$decoded_value, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG, $raw['Position']))
				return(0);
			$unsafe = 1;
			return(1);
		}
		if(IsSet($raw['AttributeTypes']['URI'][$attribute])
		&& !$this->CheckUnsafeURL($decoded_value))
		{
			if(!$this->SetPositionedWarning('unsafe URL in attribute '.$attribute, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE, $raw['Position']))
				return(0);
			continue;
		}
		if(IsSet($raw['AttributeTypes']['StyleSheet'][$attribute]))
		{
			$css = new css_parser_class;
			$css->store_positions = 1;
			$css->track_lines = 0;
			if(!$css->ParseStyleProperties($decoded_value, $style))
			{
				if(!$this->SetPositionedWarning($css->error, MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $raw['Position']))
					return(0);
				continue;
			}
			if(!$this->FilterStyle($style, $raw['Position'], 0, $unsafe_style))
				return(0);
			/* Drop the attribute when filtering left no property at all. */
			if($unsafe_style
			&& count($style) == 0)
				continue;
			if(!$css->RewriteStyleProperties($style, $decoded_value))
				return(0);
			$value = HtmlSpecialChars($decoded_value);
		}
		$kept[$attribute] = $value;
		if(IsSet($decoded[$attribute]))
			$kept_decoded[$attribute] = $decoded_value;
	}
	if(count($kept) == 0)
	{
		UnSet($raw['Attributes']);
		Unset($raw['DecodedAttributes']);
		return(1);
	}
	$raw['Attributes'] = $kept;
	if(IsSet($raw['DecodedAttributes']))
		$raw['DecodedAttributes'] = $kept_decoded;
	return(1);
}
/*
 * Advance the filter state machine by one step.
 *
 * $end tells whether no more elements will be added to the buffer.
 * $element is assigned only when this step produced an element that
 * should be returned to the application. $need_more_data is set to 1
 * when the step can not proceed until more elements are buffered.
 *
 * Returns 1 on success, or 0 when an error was set.
 *
 * States:
 * - GET_ELEMENT examines the next buffered element: unsafe tags and SSI
 *   comments are dropped, attributes are filtered, STYLE tags switch to
 *   CHECK_STYLES.
 * - SKIP_UNSAFE discards everything up to the end tag of the unsafe
 *   element.
 * - CHECK_STYLES gathers the contents of a STYLE element up to its end
 *   tag and filters them with FilterStylesheet.
 * - FLUSH_STYLES returns the filtered style contents one by one and then
 *   the STYLE end tag.
 */
Function FilterElement($end, &$element, &$need_more_data)
{
	$need_more_data = 0;
	$l = count($this->buffer);
	switch($this->state)
	{
		case MARKUP_FILTER_SAFE_HTML_START:
			$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
			/* Fall through to get the first element. */
		case MARKUP_FILTER_SAFE_HTML_GET_ELEMENT:
			if($this->buffer_position >= $l)
			{
				if($end)
					$this->state = MARKUP_FILTER_SAFE_HTML_END;
				else
					$need_more_data = 1;
				return(1);
			}
			$raw = $this->buffer[$this->buffer_position];
			switch($raw['Type'])
			{
				case 'TAG':
				case 'ENDTAG':
					$tag = strtoupper($raw['Name']);
					if(IsSet($this->unsafe_tags[$tag]))
					{
						if(!$this->SetPositionedWarning('unsafe '.($raw['Type'] == 'TAG' ? 'tag' : 'end tag').' <'.$raw['Name'].'>', MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG, $raw['Position']))
							return(0);
						$this->unsafe = $tag;
						++$this->buffer_position;
						/* The contents of an unsafe element must be
						   skipped even when the start tag is the last
						   element buffered so far, otherwise they would
						   be returned as regular data once the next
						   chunk arrives. */
						$exhausted = ($this->buffer_position >= $l);
						if($exhausted
						&& $end)
							$this->state = MARKUP_FILTER_SAFE_HTML_END;
						else
						{
							if($raw['Type'] == 'TAG')
								$this->state = MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE;
							if($exhausted)
								$need_more_data = 1;
						}
						return(1);
					}
					$is_style = ($tag === 'STYLE' && $raw['Type'] == 'TAG');
					if(!$this->FilterAttributes($raw, $is_style, $unsafe))
						return(0);
					if($unsafe)
					{
						$this->unsafe = $tag;
						++$this->buffer_position;
						/* Same handling as for unsafe tags above. */
						$exhausted = ($this->buffer_position >= $l);
						if($exhausted
						&& $end)
							$this->state = MARKUP_FILTER_SAFE_HTML_END;
						else
						{
							if($raw['Type'] == 'TAG')
								$this->state = MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE;
							if($exhausted)
								$need_more_data = 1;
						}
						return(1);
					}
					if($is_style)
					{
						/* Keep the STYLE tag in the buffer until its
						   contents are checked. */
						$this->unsafe = $tag;
						$this->state = MARKUP_FILTER_SAFE_HTML_CHECK_STYLES;
						return(1);
					}
					++$this->buffer_position;
					break;
				case 'COMMENT':
					if(!$this->allow_server_side_includes
					&& !strcmp(substr(ltrim($raw['Comment']), 0, 1), '#'))
					{
						if(!$this->SetPositionedWarning('comment may contain server side include command', MARKUP_FILTER_SAFE_HTML_ERROR_SSI_COMMENT, $raw['Position']))
							return(0);
						++$this->buffer_position;
						return(1);
					}
					++$this->buffer_position;
					break;
				default:
					++$this->buffer_position;
					break;
			}
			$element = $raw;
			return(1);
		case MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE:
			for($position = $this->buffer_position; $position < $l; ++$position)
			{
				$raw = $this->buffer[$position];
				switch($raw['Type'])
				{
					case 'ENDTAG':
						if(!strcmp($this->unsafe, strtoupper($raw['Name'])))
						{
							$this->buffer_position = $position + 1;
							$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
							return(1);
						}
						break;
				}
			}
			if($end)
			{
				/* The buffer may be empty when the unsafe start tag was
				   the last element of a previous chunk. */
				$missing_position = (IsSet($this->buffer[$this->buffer_position]) ? $this->buffer[$this->buffer_position]['Position'] : -1);
				if(!$this->SetPositionedError('end tag '.$this->unsafe.' is missing', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $missing_position))
					return(0);
				$this->buffer_position = $position;
				$this->state = MARKUP_FILTER_SAFE_HTML_END;
			}
			else
				$need_more_data = 1;
			return(1);
		case MARKUP_FILTER_SAFE_HTML_CHECK_STYLES:
			/* Gather the style contents that follow the STYLE tag, merging
			   consecutive data and entity elements. */
			for($last_data = 0, $filter = array(), $position = $this->buffer_position + 1; $position < $l; ++$position)
			{
				$raw = $this->buffer[$position];
				switch($raw['Type'])
				{
					case 'DATA':
						if($last_data)
							$filter[count($filter) - 1]['Data'] .= $raw['Data'];
						else
						{
							$filter[] = $raw;
							$last_data = 1;
						}
						break;
					case 'ENTITY':
						$data = ((IsSet($raw['Code']) && ($code = $raw['Code']) < 255) ? Chr($code) : '&'.$raw['Entity'].';');
						if($last_data)
							$filter[count($filter) - 1]['Data'] .= $data;
						else
						{
							$filter[] = array(
								'Type'=>'DATA',
								'Data'=>$data,
								'Position'=>$raw['Position']
							);
							$last_data = 1;
						}
						break;
					case 'COMMENT':
						$filter[] = $raw;
						$last_data = 0;
						break;
					case 'ENDTAG':
						if(!strcmp($this->unsafe, strtoupper($raw['Name'])))
						{
							/* Filter the attributes of the STYLE start
							   tag that is still in the buffer. */
							$raw = $this->buffer[$this->buffer_position];
							if(!$this->FilterAttributes($raw, 1, $unsafe))
								return(0);
							if($unsafe)
							{
								$this->buffer_position = $position +1;
								$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
								return(1);
							}
							$this->safe_styles = array();
							$tf = count($filter);
							$total = 0;
							for($f = 0; $f < $tf; ++$f)
							{
								switch($filter[$f]['Type'])
								{
									case 'DATA':
										$stylesheet = $filter[$f]['Data'];
										$style_position = $filter[$f]['Position'];
										break;
									case 'COMMENT':
										$stylesheet = $filter[$f]['Comment'];
										$style_position = $filter[$f]['CommentPosition'];
										break;
								}
								if(!$this->FilterStylesheet($stylesheet, $filtered, $style_position))
									return(0);
								if(strlen($filtered))
								{
									if(strcmp($filtered, $stylesheet))
									{
										switch($filter[$f]['Type'])
										{
											case 'DATA':
												$filter[$f]['Data'] = $filtered;
												break;
											case 'COMMENT':
												$filter[$f]['Comment'] = "\n".$filtered;
												break;
										}
									}
									$this->safe_styles[] = $filter[$f];
									++$total;
								}
							}
							if($total > 0)
							{
								/* Return the STYLE start tag now and the
								   safe contents in the following steps. */
								$element = $raw;
								$this->flush_style = 0;
								$this->buffer_position = $position;
								$this->state = MARKUP_FILTER_SAFE_HTML_FLUSH_STYLES;
							}
							else
							{
								/* Nothing safe is left, so drop the whole
								   STYLE element. */
								$this->buffer_position = $position + 1;
								$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
							}
							return(1);
						}
						if(!$this->SetPositionedWarning('invalid end tag </'.$raw['Name'].'> in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$position]['Position']))
							return(0);
						break;
					case 'TAG':
						if(!$this->SetPositionedWarning('invalid tag <'.$raw['Name'].'> in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$position]['Position']))
							return(0);
						break;
					case 'CDATA':
						if(!$this->SetPositionedWarning('invalid CDATA section in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$position]['Position']))
							return(0);
						break;
					default:
						if(!$this->SetPositionedWarning('invalid '.$raw['Type'].' section in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $this->buffer[$position]['Position']))
							return(0);
						break;
				}
			}
			if($end)
			{
				if(!$this->SetPositionedError('end tag '.$this->unsafe.' is missing', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$this->buffer_position]['Position']))
					return(0);
				$this->buffer_position = $position;
				$this->state = MARKUP_FILTER_SAFE_HTML_END;
			}
			else
				$need_more_data = 1;
			return(1);
		case MARKUP_FILTER_SAFE_HTML_FLUSH_STYLES:
			if($this->flush_style < count($this->safe_styles))
			{
				$element = $this->safe_styles[$this->flush_style];
				++$this->flush_style;
			}
			else
			{
				/* All style contents were returned, so return the STYLE
				   end tag that buffer_position points to. */
				$this->safe_styles = array();
				$element = $this->buffer[$this->buffer_position];
				$this->buffer_position = $this->buffer_position + 1;
				$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
			}
			return(1);
		default:
			return($this->SetPositionedError($this->state.' is not a valid parser state', MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $this->buffer[$this->buffer_position]['Position']));
	}
}
/*
 * Filter a list of parsed document elements.
 *
 * $raw is the list of new elements, which is appended to the buffer of
 * elements still to be filtered. $end tells whether these are the last
 * elements of the document. $elements returns the elements that passed
 * the filter so far.
 *
 * Returns 1 on success, or 0 when an error was set, including when the
 * document ended before the filter reached its end state.
 */
Function FilterElements($raw, $end, &$elements)
{
	$elements = array();
	if(count($this->buffer) == 0)
		$this->buffer = $raw;
	else
	{
		$incoming = count($raw);
		for($i = 0; $i < $incoming; ++$i)
			$this->buffer[] = $raw[$i];
	}
	/* Run the state machine until it needs more data or reaches the end. */
	for(;;)
	{
		$element = null;
		if(!$this->FilterElement($end, $element, $need_more_data))
			return(0);
		if($element !== null)
			$elements[] = $element;
		if($need_more_data
		|| $this->state == MARKUP_FILTER_SAFE_HTML_END)
			break;
	}
	if($end
	&& $this->state != MARKUP_FILTER_SAFE_HTML_END)
		return($this->SetError('reached a premature end of data', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX));
	/* Release the buffer once every element in it was consumed. */
	if($this->buffer_position == count($this->buffer))
	{
		$this->buffer_position = 0;
		$this->buffer = array();
	}
	return(1);
}
/* Public functions */
/*
{metadocument}
<function>
<name>SetInput</name>
<type>VOID</type>
<documentation>
<purpose>Set the object of the class that will be used to parse HTML
document before it is filtered by this class.</purpose>
<usage>Use this function only if you need to override the HTML
parsing class, which by default is the markup filter validator
class.</usage>
</documentation>
<argument>
<name>input</name>
<type>OBJECT</type>
<documentation>
<purpose>Reference to the HTML parser input object.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function SetInput(&$input)
{
/* Keep a reference to the caller's object rather than a copy of it. */
$this->input = &$input;
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>GetPositionLine</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Get the line number of the document that corresponds to a
given position.</purpose>
<usage>Pass the document offset number as the position to be
located. Make sure the <variablelink>track_lines</variablelink>
variable is set to <booleanvalue>1</booleanvalue> before parsing
the document.</usage>
<returnvalue>This function returns <booleanvalue>1</booleanvalue> if
the <variablelink>track_lines</variablelink> variable is set to
<booleanvalue>1</booleanvalue> and it was given a valid positive
position number that does not exceed the position of the last
parsed document line.</returnvalue>
</documentation>
<argument>
<name>position</name>
<type>INTEGER</type>
<documentation>
<purpose>Position of the line to be located.</purpose>
</documentation>
</argument>
<argument>
<name>line</name>
<type>INTEGER</type>
<out />
<documentation>
<purpose>Returns the number of the line that corresponds to the
given document position.</purpose>
</documentation>
</argument>
<argument>
<name>column</name>
<type>INTEGER</type>
<out />
<documentation>
<purpose>Returns the number of the column of the line that
corresponds to the given document position.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
/*
 * Ask the input object for the line and column of a document position.
 * Fails when no input object is set, or with the error reported by the
 * input object when it can not locate the position.
 */
Function GetPositionLine($position, &$line, &$column)
{
	if(IsSet($this->input))
	{
		if($this->input->GetPositionLine($position, $line, $column))
			return(1);
		return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
	}
	return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>FilterStylesheet</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Filter a CSS stylesheet to discard unsafe style
definitions.</purpose>
<usage>Pass a string with the text of the stylesheet to filter.</usage>
<returnvalue>This function returns <booleanvalue>1</booleanvalue> if
the stylesheet string was parsed successfully.</returnvalue>
</documentation>
<argument>
<name>stylesheet</name>
<type>STRING</type>
<documentation>
<purpose>String of the stylesheet to parse.</purpose>
</documentation>
</argument>
<argument>
<name>filtered</name>
<type>STRING</type>
<out />
<documentation>
<purpose>Returns the filtered stylesheet without any unsafe CSS
style definitions.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
/*
 * Parse a stylesheet and rewrite it without the unsafe properties.
 *
 * $position is the offset of the stylesheet in the enclosing document;
 * it is added to the positions of reported warnings and errors. Lines
 * are only tracked by the CSS parser when $position is 0.
 *
 * $filtered returns the original text when nothing unsafe was found, the
 * rewritten rule sets otherwise, or an empty string when the stylesheet
 * could not be parsed and the parsing error was ignored.
 */
Function FilterStylesheet($stylesheet, &$filtered, $position = 0)
{
	$filtered = '';
	$this->css = new css_parser_class;
	$this->css->store_positions = 1;
	$this->css->track_lines = ($this->track_lines && $position == 0);
	if(!$this->css->ParseStylesheet($stylesheet, $styles))
	{
		$ignored = $this->SetPositionedWarning('stylesheet error: '.$this->css->error, MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $position + $this->css->error_position);
		return($ignored ? 1 : 0);
	}
	$unsafe_rulesets = 0;
	$total = count($styles);
	for($s = 0; $s < $total; ++$s)
	{
		/* Only rule sets can be handled. */
		if($styles[$s]['Type'] != 'ruleset')
			return($this->SetPositionedError('it is not possible to handle stylesheet elements of type '.$styles[$s]['Type'], MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $position + $styles[$s]['Position']));
		if(!$this->FilterStyle($styles[$s]['RuleSet']['Properties'], $position, 1, $unsafe))
			return(0);
		if($unsafe)
			++$unsafe_rulesets;
		if(!$this->css->RewriteStyle($styles[$s], $style))
			return($this->SetPositionedError('style rewrite error: '.$this->css->error, MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $position + $styles[$s]['Position']));
		$filtered .= $style;
	}
	/* Return the stylesheet exactly as given when nothing was removed. */
	if($unsafe_rulesets == 0)
		$filtered = $stylesheet;
	return(1);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>GetStylesheetPositionLine</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Get the line number of a given position of the original
stylesheet filtered with the
<functionlink>FilterStylesheet</functionlink> function.</purpose>
<usage>Pass the stylesheet offset number as the position to be
located. Make sure the <variablelink>track_lines</variablelink>
variable is set to <booleanvalue>1</booleanvalue> before parsing
the stylesheet.</usage>
<returnvalue>This function returns <booleanvalue>1</booleanvalue> if
the <variablelink>track_lines</variablelink> variable is set to
<booleanvalue>1</booleanvalue> and it was given a valid positive
position number that does not exceed the position of the last
parsed stylesheet line.</returnvalue>
</documentation>
<argument>
<name>position</name>
<type>INTEGER</type>
<documentation>
<purpose>Position of the line to be located.</purpose>
</documentation>
</argument>
<argument>
<name>line</name>
<type>INTEGER</type>
<out />
<documentation>
<purpose>Returns the number of the line that corresponds to the
given stylesheet position.</purpose>
</documentation>
</argument>
<argument>
<name>column</name>
<type>INTEGER</type>
<out />
<documentation>
<purpose>Returns the number of the column of the line that
corresponds to the given stylesheet position.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
/*
 * Ask the CSS parser of the last filtered stylesheet for the line and
 * column of a stylesheet position.
 *
 * Fails with an invalid usage error when FilterStylesheet was not called
 * before, or when the CSS parser can not locate the position.
 */
Function GetStylesheetPositionLine($position, &$line, &$column)
{
	/* SetPositionedError takes the error code before the position. */
	if(!IsSet($this->css))
		return($this->SetPositionedError('no stylesheet was previously filtered', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE, $position));
	if(!$this->css->GetPositionLine($position, $line, $column))
		return($this->SetPositionedError($this->css->error, MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE, $position));
	return(1);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>StartParsing</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Initialize the state of the markup parser.</purpose>
<usage>Call this function before starting to parse the markup document,
passing the file name or data to be parsed and optionally other
parsing option parameters.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if all
parameters are correctly defined.</returnvalue>
</documentation>
<argument>
<name>parameters</name>
<type>HASH</type>
<documentation>
<purpose>Specifies a list of options that define how to parse the
given document. Currently it has the following options:
<paragraphbreak />
<tt>Data</tt> - String with the markup data to be parsed
<paragraphbreak />
<tt>File</tt> - Name of the file from which the data to be parsed
should be read instead of a static string.
<paragraphbreak />
<tt>OnlyBody</tt> - Determine whether the HTML document should be
parsed just as the BODY section or as a complete HTML document.
<paragraphbreak />
<tt>DTDCachePath</tt> - Path of the directory where the cached DTD
files will be stored to prevent the overhead of fetching the DTD
files from the remote DTD sites every time an HTML document is
parsed. If this parameter is missing, the DTD will not be cached.
</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function StartParsing($parameters)
{
	/*
	 * Prepare this filter and its underlying validator parser for a new
	 * document. $parameters is the hash of parsing options; it is forwarded
	 * to the validator after the filter forces its own entries.
	 *
	 * Returns 1 when the validator accepts the parameters. Otherwise the
	 * validator error is recorded with SetPositionedError and its result is
	 * returned.
	 */

	// Create the validator only once, copying the filter options into it.
	if(!IsSet($this->input))
	{
		$validator = new markup_filter_validator_class;
		$validator->track_lines = $this->track_lines;
		$validator->buffer_length = $this->buffer_length;
		$validator->ignore_syntax_errors = $this->ignore_syntax_errors;
		$validator->store_positions = $this->store_positions;
		$this->input = $validator;
	}

	// Reset the filter state left over from any previous document.
	$this->state = MARKUP_FILTER_SAFE_HTML_START;
	$this->buffer = array();
	$this->buffer_position = 0;
	$this->warnings = array();

	// These entries are always set by the filter, replacing any values
	// given by the caller.
	$parameters['DecodeEntities'] = 1;
	$parameters['AttributeTypes'] = array(
		'URI'=>array(),
		'StyleSheet'=>array(),
	);

	if($this->input->StartParsing($parameters))
		return(1);
	return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>Parse</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Parse the markup document.</purpose>
<usage>Call this function iteratively until the <argumentlink>
<function>Parse</function>
<argument>end</argument>
</argumentlink> argument is returned set to
<booleanvalue>1</booleanvalue>.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if there were no
fatal parsing errors.</returnvalue>
</documentation>
<argument>
<name>end</name>
<type>BOOLEAN</type>
<out />
<documentation>
<purpose>Determine when the parser reached the end of the
document.</purpose>
</documentation>
</argument>
<argument>
<name>elements</name>
<type>ARRAY</type>
<out />
<documentation>
<purpose>Return a sequence of associative arrays with entries that
describe each document element that was parsed.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function Parse(&$end, &$elements)
{
	/*
	 * Run one parsing step: fetch the next elements from the underlying
	 * parser and pass them through FilterElements.
	 *
	 * $end      - set by the underlying parser when the document ends.
	 * $elements - receives the elements produced by FilterElements.
	 *
	 * Returns 1 on success and 0 when filtering fails. When the underlying
	 * parser fails, its error is recorded with SetPositionedError and that
	 * result is returned. FinishParsing is called on any failure and when
	 * the end of the document is reached.
	 */
	if(!IsSet($this->input))
		return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));

	$parsed = $this->input->Parse($end, $raw);
	if(!$parsed)
	{
		$this->FinishParsing();
		return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
	}

	$filtered = $this->FilterElements($raw, $end, $elements);

	// Release the parser resources when filtering failed or the document
	// is finished.
	if(!$filtered || $end)
		$this->FinishParsing();

	return($filtered ? 1 : 0);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>FinishParsing</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Close any files and release any resources allocated while
the document was being parsed.</purpose>
<usage>Call this function after you are done with parsing the markup
document.</usage>
<returnvalue>Returns <booleanvalue>1</booleanvalue> if all resources
were successfully released.</returnvalue>
</documentation>
<do>
{/metadocument}
*/
Function FinishParsing()
{
	/*
	 * Finish the underlying parser and merge its warnings into the warnings
	 * of this filter.
	 *
	 * Returns 1 on success. If there is no input object or the underlying
	 * parser fails to finish, an error is recorded and the result of the
	 * error setter is returned.
	 */
	if(!IsSet($this->input))
		return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));
	if(!$this->input->FinishParsing())
		return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));

	// Copy only the warnings whose key is not yet set in this filter, so
	// warnings already recorded here are kept.
	foreach($this->input->warnings as $key => $message)
	{
		if(IsSet($this->warnings[$key]))
			continue;
		$this->warnings[$key] = $message;
	}
	return(1);
}
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
<function>
<name>RewriteElement</name>
<type>BOOLEAN</type>
<documentation>
<purpose>Generate a string for a previously parsed document markup
element.</purpose>
<usage>Call this function for each markup element when you want to
regenerate an element that was just parsed and possibly
filtered.</usage>
<returnvalue>Returns <booleanvalue>0</booleanvalue> if it is passed an
invalid element definition.</returnvalue>
</documentation>
<argument>
<name>element</name>
<type>HASH</type>
<documentation>
<purpose>Associative array that defines the type and the values of
the document element to be rewritten.</purpose>
</documentation>
</argument>
<argument>
<name>markup</name>
<type>STRING</type>
<out />
<documentation>
<purpose>Return the string of the rewritten document element.</purpose>
</documentation>
</argument>
<do>
{/metadocument}
*/
Function RewriteElement($element, &$markup)
{
	/*
	 * Regenerate the markup string of a parsed element by delegating to the
	 * RewriteElement function of the underlying parser.
	 *
	 * $element - associative array that describes the element.
	 * $markup  - receives the rewritten markup string.
	 *
	 * Returns 1 on success. Otherwise an error is recorded and the result
	 * of the error setter is returned.
	 */
	if(!IsSet($this->input))
		return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));

	if($this->input->RewriteElement($element, $markup))
		return(1);

	// Forward the error of the underlying parser to this object.
	return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
}
};
/*
{metadocument}
</do>
</function>
{/metadocument}
*/
/*
{metadocument}
</class>
{/metadocument}
*/
?>