Location: PHPKode > scripts > Secure HTML parser and filter,XSS,CSRF > secure-html-parser-and-filter/markup_filter_safe_html.php
<?php
/*
 * markup_filter_safe_html.php
 *
 * @(#) $Id: markup_filter_safe_html.php,v 1.42 2009/08/23 06:49:45 mlemos Exp $
 *
 */

/* States of the filter state machine; the class keeps the current one in its state variable. */
define('MARKUP_FILTER_SAFE_HTML_START',        0);
define('MARKUP_FILTER_SAFE_HTML_GET_ELEMENT',  1);
define('MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE',  2);
define('MARKUP_FILTER_SAFE_HTML_CHECK_STYLES', 3);
define('MARKUP_FILTER_SAFE_HTML_FLUSH_STYLES', 4);
define('MARKUP_FILTER_SAFE_HTML_END',          5);

/* Generic error codes, aliased from the MARKUP_PARSER_ERROR_* constants (which must already be defined when this file is loaded). */
define('MARKUP_FILTER_SAFE_HTML_ERROR_NONE',            MARKUP_PARSER_ERROR_NONE);
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED',      MARKUP_PARSER_ERROR_UNEXPECTED);
define('MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX',  MARKUP_PARSER_ERROR_INVALID_SYNTAX);
define('MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE',   MARKUP_PARSER_ERROR_INVALID_USAGE);

/* Error codes specific to this filter: unsafe tags, attributes, CSS styles and SSI comments. */
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG',       201);
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE', 202);
define('MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_CSS_STYLE', 203);
define('MARKUP_FILTER_SAFE_HTML_ERROR_SSI_COMMENT',      204);


/*
{metadocument}<?xml version="1.0" encoding="ISO-8859-1" ?>
<class>

	<package>net.manuellemos.markupparser</package>

	<version>@(#) $Id: markup_filter_safe_html.php,v 1.42 2009/08/23 06:49:45 mlemos Exp $</version>
	<copyright>Copyright © (C) Manuel Lemos 2009</copyright>
	<title>Markup filter safe HTML</title>
	<author>Manuel Lemos</author>
	<authoraddress>mlemos-at-acm.org</authoraddress>

	<documentation>
		<idiom>en</idiom>
		<purpose>Parse an HTML document and remove all unsafe tags and CSS
			styles that may contain Javascript code and other harmful HTML
			structures.
			<paragraphbreak />
			Unsafe HTML is often submitted by untrusted users to sites that
			accept user submitted content. Such HTML may contain Javascript that
			could be used to perform <link>
				<data>cross-site scripting</data>
				<url>http://en.wikipedia.org/wiki/Cross-site_scripting</url>
			</link> (XSS) or <link>
				<data>cross-site request forgery</data>
				<url>http://en.wikipedia.org/wiki/Cross-site_request_forgery</url>
			</link> (CSRF) attacks.</purpose>
		<usage>Use the <functionlink>StartParsing</functionlink> function to
			initialize the parser. Then use the
			<functionlink>Parse</functionlink> function to make the class parse
			HTML data, eventually read from files. When you are done with
			feeding the whole document data, call the
			<functionlink>FinishParsing</functionlink> function.
			<paragraphbreak />
			The <functionlink>Parse</functionlink> function returns arrays of
			tokens that describe each document element. The
			<functionlink>RewriteElement</functionlink> function can be used to
			convert the tokens back to HTML document strings.
			<paragraphbreak />
			By default, the class uses the markup validator class to parse the
			HTML documents before it actually analyzes and filters unsafe tags
			from the documents. Use the <functionlink>SetInput</functionlink>
			function to set a different filter object as source of parsed
			document elements.
			<paragraphbreak />
			Element tokens are associated to the respective positions
			in the document. Positions are numbers that represent their offsets
			relative to beginning of the document. The
			<functionlink>GetPositionLine</functionlink> function can return the
			line and column number associated to a given document position if
			the <variablelink>track_lines</variablelink> is set to
			<booleanvalue>1</booleanvalue>.
			<paragraphbreak />
			The class may also parse and filter individual CSS stylesheets using
			the function <functionlink>FilterStylesheet</functionlink>. The
			function <functionlink>GetStylesheetPositionLine</functionlink> may
			be used to determine the line associated to the position of an
			error.</usage>
	</documentation>
{/metadocument}
*/

class markup_filter_safe_html_class
{
/*
{metadocument}
	<variable>
		<name>error</name>
		<type>STRING</type>
		<value></value>
		<documentation>
			<purpose>Store the message that is returned when an error
				occurs.</purpose>
			<usage>Check this variable to understand what happened when a call to
				any of the class functions has failed.<paragraphbreak />
				This class uses cumulative error handling. This means that if one
				of the class functions that may fail is called and this variable was
				already set to an error message due to a failure in a previous call
				to the same or other function, the function will also fail and does
				not do anything.<paragraphbreak />
				This allows programs using this class to safely call several
				functions that may fail and only check the failure condition after
				the last function call.<paragraphbreak />
				Just set this variable to an empty string to clear the error
				condition.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $error = '';

/*
{metadocument}
	<variable>
		<name>error_code</name>
		<type>INTEGER</type>
		<value>0</value>
		<documentation>
			<purpose>Store the code that is returned when an error
				occurs.</purpose>
			<usage>Check this variable to understand what happened when a call
				to any of the class functions has failed. It may be set to several
				possible error codes defined as constants:<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_NONE</tt> - No error happened
				<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED</tt> - It was found a
					condition that the class is not yet ready to handle
				<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX</tt> - A syntax
					error was found
				<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE</tt> - An invalid
					value was passed to the class function parameters or set to the
					class variables
				<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG</tt> - A tag
					considered unsafe was found
				<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE</tt> - A tag
					attribute considered unsafe was found
				<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_CSS_STYLE</tt> - A CSS
					style considered unsafe was found
				<paragraphbreak />
				<tt>MARKUP_FILTER_SAFE_HTML_ERROR_SSI_COMMENT</tt> - An HTML
					comment with Server Side Include (SSI) commands was found</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $error_code = MARKUP_FILTER_SAFE_HTML_ERROR_NONE;

/*
{metadocument}
	<variable>
		<name>error_position</name>
		<type>INTEGER</type>
		<value>-1</value>
		<documentation>
			<purpose>Point to the position of the markup data or file that
				refers to the last error that occurred.</purpose>
			<usage>Check this variable to determine the relevant position of the
				document when a parsing error occurs.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $error_position = -1;

/*
{metadocument}
	<variable>
		<name>buffer_length</name>
		<type>INTEGER</type>
		<value>8000</value>
		<documentation>
			<purpose>Maximum length of the chunks of markup data read from files
				that the class parses at one time.</purpose>
			<usage>Adjust this value according to the available memory.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $buffer_length = 8000;

/*
{metadocument}
	<variable>
		<name>ignore_syntax_errors</name>
		<type>BOOLEAN</type>
		<value>1</value>
		<documentation>
			<purpose>Specify whether the class should ignore syntax errors in
				malformed documents.</purpose>
			<usage>Set this variable to <booleanvalue>0</booleanvalue> if it is
				necessary to verify whether markup data may be corrupted due to
				possible bugs in the program that generated the
				document.<paragraphbreak />
				Currently the class only ignores some types of syntax errors.
				Other syntax errors may still cause the
				<functionlink>Parse</functionlink> to fail.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $ignore_syntax_errors=1;

/*
{metadocument}
	<variable>
		<name>warnings</name>
		<type>HASH</type>
		<value></value>
		<documentation>
			<purpose>Return a list of positions of the original document that
				contain syntax errors.</purpose>
			<usage>Check this variable to retrieve eventual document syntax
				errors that were ignored when the
				<variablelink>ignore_syntax_errors</variablelink> is set to
				<booleanvalue>1</booleanvalue>.<paragraphbreak />
				The indexes of this array are the positions of the errors. The
				array values are the corresponding syntax error messages.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $warnings=array();

/*
{metadocument}
	<variable>
		<name>store_positions</name>
		<type>BOOLEAN</type>
		<value>1</value>
		<documentation>
			<purpose>Tell the class to return the position of each document
				element token.</purpose>
			<usage>Set this variable to <integervalue>0</integervalue> if you do
				not need to know the position of each parsed markup element.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $store_positions = 1;

/*
{metadocument}
	<variable>
		<name>track_lines</name>
		<type>BOOLEAN</type>
		<value>0</value>
		<documentation>
			<purpose>Tell the class to keep track of the position of each
				document line.</purpose>
			<usage>Set this variable to <integervalue>1</integervalue> if you
				need to determine the line and column number associated to a given
				position of the parsed document.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $track_lines = 0;

/*
{metadocument}
	<variable>
		<name>unsafe_tags</name>
		<type>HASH</type>
		<value></value>
		<documentation>
			<purpose>List of tags that may be unsafe.</purpose>
			<usage>Change the default list of unsafe tags only if you realize
				there are more tags that should be considered unsafe.
				<paragraphbreak />
				Currently, the tags considered unsafe are: APPLET, IFRAME, OBJECT
				and SCRIPT.
				<paragraphbreak />
				It is not necessary to add proprietary tags because those will be
				discarded by another class that validates the HTML according to a
				standard DTD.
				<paragraphbreak />
				All the entries in this array variable must be set with a key with
				the name of the unsafe tag that the class should discard. The
				entry values should be set to an empty array to allow eventual
				parameters in future versions of this class.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $unsafe_tags = array(
		'APPLET'=>array(),
		'IFRAME'=>array(),
		'OBJECT'=>array(),
		'SCRIPT'=>array(),
	);

/*
{metadocument}
	<variable>
		<name>safe_proprietary_css_properties</name>
		<type>HASH</type>
		<value></value>
		<documentation>
			<purpose>List of proprietary CSS properties that should be
				considered safe to allow. Proprietary properties start with the -
				character.</purpose>
			<usage>Change the default list of safe CSS properties only if you
				realize there are more properties that should be considered safe.
				<paragraphbreak />
				All the entries in this array variable must be set with a key with
				the name of the safe property that the class should allow. The
				entry values should be set to an empty array to allow eventual
				parameters in future versions of this class.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $safe_proprietary_css_properties = array(
	);

/*
{metadocument}
	<variable>
		<name>safe_css_property_types</name>
		<type>HASH</type>
		<value></value>
		<documentation>
			<purpose>List of types of expressions that should be considered safe
				to allow in CSS style values.</purpose>
			<usage>Change the default list of safe CSS property types only if
				you realize there are more properties that should be considered
				safe.
				<paragraphbreak />
				Currently, the types considered safe are: delimiter, dimension,
				function, hash, identifier, number, percentage, string and uri.
				<paragraphbreak />
				All the entries in this array variable must be set with a key with
				the name of the safe property type that the class should allow.
				The entry values should be set to an empty array to allow eventual
				parameters in future versions of this class.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $safe_css_property_types = array(
		'delimiter'=>array(),
		'dimension'=>array(),
		'function'=>array(),
		'hash'=>array(),
		'identifier'=>array(),
		'number'=>array(),
		'percentage'=>array(),
		'string'=>array(),
		'uri'=>array(),
	);

/*
{metadocument}
	<variable>
		<name>safe_css_property_functions</name>
		<type>HASH</type>
		<value></value>
		<documentation>
			<purpose>List of the names of functions that should be considered
				safe to allow in CSS style values.</purpose>
			<usage>Change the default list of safe CSS property functions only
				if you realize there are more functions that should be considered
				safe.
				<paragraphbreak />
				All the entries in this array variable must be set with a key with
				the name of the safe function that the class should allow. The
				entry values should be set to an empty array to allow eventual
				parameters in future versions of this class.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $safe_css_property_functions = array();

/*
{metadocument}
	<variable>
		<name>safe_url_schemes</name>
		<type>HASH</type>
		<value></value>
		<documentation>
			<purpose>List of schemes that should be considered safe to allow in
				URLs.</purpose>
			<usage>Change the default list of safe URL schemes only if you
				realize there are more schemes that should be considered safe.
				<paragraphbreak />
				All the entries in this array variable must be set with a key with
				the name of the scheme that the class should allow. The entry
				values should be set to an empty array to allow eventual
				parameters in future versions of this class.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $safe_url_schemes = array(
		'ftp' => array(),
		'ftps' => array(),
		'http' => array(),
		'https' => array(),
		'mailto' => array(),
		'news' => array(),
		'nntp' => array(),
	);

/*
{metadocument}
	<variable>
		<name>allow_server_side_includes</name>
		<type>BOOLEAN</type>
		<value>0</value>
		<documentation>
			<purpose>Tell the class whether it should parse HTML comments to
				determine whether it contains server side includes (SSI) that
				should be filtered.</purpose>
			<usage>Set this variable to <booleanvalue>1</booleanvalue> only if
				you need to retrieve all comments unfiltered, even if they may
				contain server side include commands.</usage>
		</documentation>
	</variable>
{/metadocument}
*/
	var $allow_server_side_includes = 0;

	/* Private variables */

	/* Object that parses the document before filtering; assigned by SetInput and queried by GetPositionLine. */
	var $input;
	/* Parsed elements received by FilterElements that were not yet fully processed. */
	var $buffer = array();
	/* Index in the buffer of the next element to examine. */
	var $buffer_position = 0;
	/* Current state of the filter state machine (one of the MARKUP_FILTER_SAFE_HTML_* state constants). */
	var $state = MARKUP_FILTER_SAFE_HTML_START;
	/* Upper case name of the tag being skipped or of the style tag being checked. */
	var $unsafe = '';
	/* Filtered style data and comment elements waiting to be returned in the flush styles state. */
	var $safe_styles = array();
	/* Index of the next entry of safe_styles to return. */
	var $flush_style = 0;
	/* CSS parser object created by FilterStylesheet. */
	var $css;

	/* Private functions */

	/*
	 * Record an error condition in the class error variables.
	 *
	 * error - message describing the failure
	 * code  - numeric error code associated with the failure
	 *
	 * Always returns 0, so callers can write return($this->SetError(...)).
	 */
	Function SetError($error, $code)
	{
		$this->error_code = $code;
		$this->error = $error;
		return(0);
	}

	/*
	 * Record an error together with the document position it refers to.
	 *
	 * error    - message describing the failure
	 * code     - numeric error code associated with the failure
	 * position - document position stored in error_position
	 *
	 * Returns whatever SetError returns (0).
	 */
	Function SetPositionedError($error, $code, $position)
	{
		$failed = $this->SetError($error, $code);
		$this->error_position = $position;
		return($failed);
	}

	/*
	 * Report a problem that is tolerated when ignore_syntax_errors is set.
	 *
	 * When problems are being ignored, the message is stored in the warnings
	 * array under the given position and 1 is returned so processing can go
	 * on. Otherwise the problem is recorded as a positioned error and 0 is
	 * returned.
	 */
	Function SetPositionedWarning($error, $code, $position)
	{
		if($this->ignore_syntax_errors)
		{
			$this->warnings[$position]=$error;
			return(1);
		}
		return($this->SetPositionedError($error, $code, $position));
	}

	/*
	 * Tell whether a URL is safe to keep in the filtered document.
	 *
	 * url - decoded URL value taken from an attribute or a CSS expression
	 *
	 * Returns 1 when the URL is considered safe and 0 otherwise. A URL is
	 * rejected when:
	 *  - a control character or a space appears before the first : / or .
	 *    (typical obfuscation of a scheme name like "java\tscript:"),
	 *  - it starts with a scheme name that is not listed in
	 *    safe_url_schemes,
	 *  - parse_url can not parse it.
	 */
	Function CheckUnsafeURL($url)
	{
		$blanks = "\0\1\2\3\4\5\6\7\x8\x9\xa\xb\xc\xd\xe\xf\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ";
		/* Drop leading control characters and spaces. */
		$l = strlen($url);
		for($u = 0; $u < $l; ++$u)
		{
			if(ord($url[$u]) > 32)
			{
				if($u > 0)
				{
					$url = substr($url, $u);
					$l -= $u;
				}
				break;
			}
		}
		/* The first delimiter found must not be a control character or space. */
		$colon = strcspn($url, $blanks.':/.');
		if($colon < $l
		&& strcspn($url[$colon], $blanks) < 1)
			return(0);
		/*
		 * Check the scheme exactly as a browser reads it: a letter followed
		 * by letters, digits, + . or - up to the first colon. This can not
		 * be left to parse_url alone because it interprets values like
		 * "javascript:1/alert(1)" as host:port/path and reports no scheme.
		 */
		if(preg_match('/^[a-z][a-z0-9+.-]*(?=:)/i', $url, $scheme)
		&& !IsSet($this->safe_url_schemes[strtolower($scheme[0])]))
			return(0);
		if(!($parts = @parse_url(trim($url))))
			return(0);
		if(IsSet($parts['scheme'])
		&& !IsSet($this->safe_url_schemes[strtolower($parts['scheme'])]))
			return(0);
		return(1);
	}

	/*
	 * Remove unsafe properties from a list of parsed CSS properties.
	 *
	 * style              - list of properties, each with the entries
	 *                      Property, Position and Value, where Value is a
	 *                      list of expressions with the entries Type,
	 *                      Position and, depending on the type, Function or
	 *                      URI. Unsafe properties are unset from this list.
	 * position           - base document position used to report warnings
	 * add_element_offset - when set, the position of the offending property
	 *                      or expression is added to the base position
	 * unsafe             - set to 1 if at least one property was removed,
	 *                      0 otherwise
	 *
	 * A property is removed when its name starts with - and is not listed in
	 * safe_proprietary_css_properties, or when the first problematic
	 * expression of its value has a type not listed in
	 * safe_css_property_types, calls a function not listed in
	 * safe_css_property_functions or has a URL rejected by CheckUnsafeURL.
	 *
	 * Returns 1 on success or 0 if a problem had to be reported as an error.
	 */
	Function FilterStyle(&$style, $position,  $add_element_offset, &$unsafe)
	{
		$unsafe = 0;
		$properties = count($style);
		for($s = 0; $s < $properties; ++$s)
		{
			$name = $style[$s]['Property'];
			$problem = null;
			$offset = 0;
			if(!strcmp($name[0], '-')
			&& !IsSet($this->safe_proprietary_css_properties[strtolower($name)]))
			{
				$problem = 'proprietary style property '.$name.' is considered unsafe';
				if($add_element_offset)
					$offset = $style[$s]['Position'];
			}
			else
			{
				$expressions = count($style[$s]['Value']);
				for($x = 0; $x < $expressions; ++$x)
				{
					$kind = $style[$s]['Value'][$x]['Type'];
					if(!IsSet($this->safe_css_property_types[$kind]))
						$problem = 'style '.$name.' property has an expression of an unsafe type ('.$kind.')';
					elseif($kind == 'function')
					{
						$called = $style[$s]['Value'][$x]['Function'];
						if(!IsSet($this->safe_css_property_functions[$called]))
							$problem = 'style '.$name.' property has an unsafe function ('.$called.')';
					}
					elseif($kind == 'uri')
					{
						if(!$this->CheckUnsafeURL($style[$s]['Value'][$x]['URI']))
							$problem = 'style '.$name.' property has an unsafe URL';
					}
					if($problem !== null)
					{
						/* Only the first offending expression is reported. */
						if($add_element_offset)
							$offset = $style[$s]['Value'][$x]['Position'];
						break;
					}
				}
			}
			if($problem === null)
				continue;
			if(!$this->SetPositionedWarning($problem, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_CSS_STYLE, $position + $offset))
				return(0);
			$unsafe = 1;
			Unset($style[$s]);
		}
		return(1);
	}

	/*
	 * Remove unsafe attributes from a tag element.
	 *
	 * raw      - tag element; its Attributes and DecodedAttributes entries
	 *            are replaced by the filtered lists, or removed when no
	 *            attribute is left
	 * is_style - set when the element is an opening style tag, which makes
	 *            a type attribute other than text/css mark the whole
	 *            element as unsafe
	 * unsafe   - set to 1 when the whole element must be discarded,
	 *            0 otherwise
	 *
	 * Attributes are dropped when their name starts with "on", when they are
	 * of URI type and the URL is rejected by CheckUnsafeURL, or when they
	 * are of StyleSheet type and can not be parsed or have no safe property
	 * left after filtering. Style attributes that were filtered are
	 * rewritten from the remaining properties.
	 *
	 * Returns 1 on success or 0 if a problem had to be reported as an error.
	 */
	Function FilterAttributes(&$raw, $is_style, &$unsafe)
	{
		$unsafe = 0;
		if(!IsSet($raw['Attributes']))
			return(1);
		$decoded = (IsSet($raw['DecodedAttributes']) ? $raw['DecodedAttributes'] : array());
		$kept = array();
		$kept_decoded = array();
		foreach($raw['Attributes'] as $name => $value)
		{
			$upper = strtoupper($name);
			/* Event handler attributes are always dropped. */
			if(!strcmp(substr($upper, 0, 2), 'ON'))
			{
				if(!$this->SetPositionedWarning('unsafe event attribute '.$name, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE, $raw['Position']))
					return(0);
				continue;
			}
			$decoded_value = (IsSet($decoded[$name]) ? $decoded[$name] : $value);
			/* A style tag of a type other than CSS makes the element unsafe. */
			if($is_style
			&& $upper === 'TYPE'
			&& trim(strtolower($decoded_value)) !== 'text/css')
			{
				$kind = ($raw['Type'] == 'TAG' ? 'tag' : 'end tag');
				if(!$this->SetPositionedWarning('unsafe '.$kind.' <'.$raw['Name'].'> '.$name.' attribute value '.$decoded_value, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG, $raw['Position']))
					return(0);
				$unsafe = 1;
				return(1);
			}
			if(IsSet($raw['AttributeTypes']['URI'][$name])
			&& !$this->CheckUnsafeURL($decoded_value))
			{
				if(!$this->SetPositionedWarning('unsafe URL in attribute '.$name, MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_ATTRIBUTE, $raw['Position']))
					return(0);
				continue;
			}
			if(IsSet($raw['AttributeTypes']['StyleSheet'][$name]))
			{
				$css = new css_parser_class;
				$css->store_positions = 1;
				$css->track_lines = 0;
				if(!$css->ParseStyleProperties($decoded_value, $style))
				{
					if(!$this->SetPositionedWarning($css->error, MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $raw['Position']))
						return(0);
					continue;
				}
				if(!$this->FilterStyle($style, $raw['Position'], 0, $unsafe_style))
					return(0);
				/* Nothing safe is left, so the attribute goes away. */
				if($unsafe_style
				&& count($style) == 0)
					continue;
				if(!$css->RewriteStyleProperties($style, $decoded_value))
					return(0);
				$value = HtmlSpecialChars($decoded_value);
			}
			$kept[$name] = $value;
			if(IsSet($decoded[$name]))
				$kept_decoded[$name] = $decoded_value;
		}
		if(count($kept) == 0)
		{
			UnSet($raw['Attributes']);
			Unset($raw['DecodedAttributes']);
			return(1);
		}
		$raw['Attributes'] = $kept;
		if(IsSet($raw['DecodedAttributes']))
			$raw['DecodedAttributes'] = $kept_decoded;
		return(1);
	}

	/*
	 * Run one step of the filter state machine over the buffered elements.
	 *
	 * end            - set when no more elements will be added to the buffer
	 * element        - set to the next safe element to return, when the step
	 *                  produced one; left untouched otherwise
	 * need_more_data - set to 1 when the step can not proceed without more
	 *                  buffered elements, 0 otherwise
	 *
	 * Returns 1 on success or 0 when an error was set.
	 *
	 * States:
	 *  GET_ELEMENT  - examine the next buffered element: unsafe tags are
	 *                 dropped (and their contents skipped), attributes are
	 *                 filtered, style tags go to CHECK_STYLES, comments
	 *                 that look like server side includes are dropped
	 *  SKIP_UNSAFE  - discard everything up to the end tag of the unsafe
	 *                 element
	 *  CHECK_STYLES - gather the contents of a style element up to its end
	 *                 tag and filter them with FilterStylesheet
	 *  FLUSH_STYLES - return the filtered style contents one element at a
	 *                 time, followed by the style end tag
	 */
	Function FilterElement($end, &$element, &$need_more_data)
	{
		$need_more_data = 0;
		$l = count($this->buffer);
		switch($this->state)
		{
			case MARKUP_FILTER_SAFE_HTML_START:
				$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
				/* fall through to get the first element */

			case MARKUP_FILTER_SAFE_HTML_GET_ELEMENT:
				if($this->buffer_position >= $l)
				{
					if($end)
						$this->state = MARKUP_FILTER_SAFE_HTML_END;
					else
						$need_more_data = 1;
					return(1);
				}
				$raw = $this->buffer[$this->buffer_position];
				switch($raw['Type'])
				{
					case 'TAG':
					case 'ENDTAG':
						$tag = strtoupper($raw['Name']);
						if(IsSet($this->unsafe_tags[$tag]))
						{
							if(!$this->SetPositionedWarning('unsafe '.($raw['Type'] == 'TAG' ? 'tag' : 'end tag').' <'.$raw['Name'].'>', MARKUP_FILTER_SAFE_HTML_ERROR_UNSAFE_TAG, $raw['Position']))
								return(0);
							$this->unsafe = $tag;
							++$this->buffer_position;
							if($this->buffer_position >= $l
							&& $end)
								$this->state = MARKUP_FILTER_SAFE_HTML_END;
							else
							{
								/*
								 * The contents of an unsafe element must be skipped even
								 * when they only arrive with the next chunk of elements.
								 */
								if($raw['Type'] == 'TAG')
									$this->state = MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE;
								if($this->buffer_position >= $l)
									$need_more_data = 1;
							}
							return(1);
						}
						$is_style = ($tag === 'STYLE' && $raw['Type'] == 'TAG');
						if(!$this->FilterAttributes($raw, $is_style, $unsafe))
							return(0);
						if($unsafe)
						{
							$this->unsafe = $tag;
							++$this->buffer_position;
							if($this->buffer_position >= $l
							&& $end)
								$this->state = MARKUP_FILTER_SAFE_HTML_END;
							else
							{
								/* Same as above: keep skipping across chunk boundaries. */
								if($raw['Type'] == 'TAG')
									$this->state = MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE;
								if($this->buffer_position >= $l)
									$need_more_data = 1;
							}
							return(1);
						}
						if($is_style)
						{
							/* The style tag stays in the buffer until its end tag is found. */
							$this->unsafe = $tag;
							$this->state = MARKUP_FILTER_SAFE_HTML_CHECK_STYLES;
							return(1);
						}
						++$this->buffer_position;
						break;

					case 'COMMENT':
						if(!$this->allow_server_side_includes
						&& !strcmp(substr(ltrim($raw['Comment']), 0, 1), '#'))
						{
							if(!$this->SetPositionedWarning('comment may contain server side include command', MARKUP_FILTER_SAFE_HTML_ERROR_SSI_COMMENT, $raw['Position']))
								return(0);
							++$this->buffer_position;
							return(1);
						}
						++$this->buffer_position;
						break;

					default:
						++$this->buffer_position;
						break;
				}
				$element = $raw;
				return(1);

			case MARKUP_FILTER_SAFE_HTML_SKIP_UNSAFE:
				for($position = $this->buffer_position; $position < $l; ++$position)
				{
					$raw = $this->buffer[$position];
					switch($raw['Type'])
					{
						case 'ENDTAG':
							if(!strcmp($this->unsafe, strtoupper($raw['Name'])))
							{
								$this->buffer_position = $position + 1;
								$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
								return(1);
							}
							break;
					}
				}
				if($end)
				{
					/* The buffer may be empty when the unsafe tag ended the previous chunk. */
					$missing = (IsSet($this->buffer[$this->buffer_position]) ? $this->buffer[$this->buffer_position]['Position'] : -1);
					if(!$this->SetPositionedError('end tag '.$this->unsafe.' is missing', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $missing))
						return(0);
					$this->buffer_position = $position;
					$this->state = MARKUP_FILTER_SAFE_HTML_END;
				}
				else
					$need_more_data = 1;
				return(1);

			case MARKUP_FILTER_SAFE_HTML_CHECK_STYLES:
				/* Gather data and comments that follow the style tag, merging adjacent data. */
				for($last_data = 0, $filter = array(), $position = $this->buffer_position + 1; $position < $l; ++$position)
				{
					$raw = $this->buffer[$position];
					switch($raw['Type'])
					{
						case 'DATA':
							if($last_data)
								$filter[count($filter) - 1]['Data'] .= $raw['Data'];
							else
							{
								$filter[] = $raw;
								$last_data = 1;
							}
							break;
						case 'ENTITY':
							$data = ((IsSet($raw['Code']) && ($code = $raw['Code']) < 255) ? Chr($code) : '&'.$raw['Entity'].';');
							if($last_data)
								$filter[count($filter) - 1]['Data'] .= $data;
							else
							{
								$filter[] = array(
									'Type'=>'DATA',
									'Data'=>$data,
									'Position'=>$raw['Position']
								);
								$last_data = 1;
							}
							break;
						case 'COMMENT':
							$filter[] = $raw;
							$last_data = 0;
							break;
						case 'ENDTAG':
							if(!strcmp($this->unsafe, strtoupper($raw['Name'])))
							{
								/* Filter the attributes of the style tag that is still in the buffer. */
								$raw = $this->buffer[$this->buffer_position];
								if(!$this->FilterAttributes($raw, 1, $unsafe))
									return(0);
								if($unsafe)
								{
									$this->buffer_position = $position +1;
									$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
									return(1);
								}
								$this->safe_styles = array();
								$tf = count($filter);
								$total = 0;
								for($f = 0; $f < $tf; ++$f)
								{
									switch($filter[$f]['Type'])
									{
										case 'DATA':
											$stylesheet = $filter[$f]['Data'];
											$style_position = $filter[$f]['Position'];
											break;
										case 'COMMENT':
											$stylesheet = $filter[$f]['Comment'];
											$style_position = $filter[$f]['CommentPosition'];
											break;
									}
									if(!$this->FilterStylesheet($stylesheet, $filtered, $style_position))
										return(0);
									if(strlen($filtered))
									{
										if(strcmp($filtered, $stylesheet))
										{
											switch($filter[$f]['Type'])
											{
												case 'DATA':
													$filter[$f]['Data'] = $filtered;
													break;
												case 'COMMENT':
													$filter[$f]['Comment'] = "\n".$filtered;
													break;
											}
										}
										$this->safe_styles[] = $filter[$f];
										++$total;
									}
								}
								if($total > 0)
								{
									/* Return the style tag now and its contents in the next steps. */
									$element = $raw;
									$this->flush_style = 0;
									$this->buffer_position = $position;
									$this->state = MARKUP_FILTER_SAFE_HTML_FLUSH_STYLES;
								}
								else
								{
									/* No safe style is left, so the whole element is dropped. */
									$this->buffer_position = $position + 1;
									$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
								}
								return(1);
							}
							if(!$this->SetPositionedWarning('invalid end tag </'.$raw['Name'].'> in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$position]['Position']))
								return(0);
							break;
						case 'TAG':
							if(!$this->SetPositionedWarning('invalid tag <'.$raw['Name'].'> in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$position]['Position']))
								return(0);
							break;
						case 'CDATA':
							if(!$this->SetPositionedWarning('invalid CDATA section in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$position]['Position']))
								return(0);
							break;
						default:
							if(!$this->SetPositionedWarning('invalid '.$raw['Type'].' section in the middle of CSS styles', MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $this->buffer[$position]['Position']))
								return(0);
							break;
					}
				}
				if($end)
				{
					if(!$this->SetPositionedError('end tag '.$this->unsafe.' is missing', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $this->buffer[$this->buffer_position]['Position']))
						return(0);
					$this->buffer_position = $position;
					$this->state = MARKUP_FILTER_SAFE_HTML_END;
				}
				else
					$need_more_data = 1;
				return(1);

			case MARKUP_FILTER_SAFE_HTML_FLUSH_STYLES:
				if($this->flush_style < count($this->safe_styles))
				{
					$element = $this->safe_styles[$this->flush_style];
					++$this->flush_style;
				}
				else
				{
					/* All style contents were returned, now return the end tag. */
					$this->safe_styles = array();
					$element = $this->buffer[$this->buffer_position];
					$this->buffer_position = $this->buffer_position + 1;
					$this->state = MARKUP_FILTER_SAFE_HTML_GET_ELEMENT;
				}
				return(1);

			default:
				return($this->SetPositionedError($this->state.' is not a valid parser state', MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $this->buffer[$this->buffer_position]['Position']));
		}
	}

	/*
	 * Filter a list of parsed elements.
	 *
	 * raw      - list of elements to add to the buffer of pending elements
	 * end      - set when these are the last elements of the document
	 * elements - returns the list of elements considered safe
	 *
	 * FilterElement is called until it needs more data or reaches the end
	 * state. The buffer is emptied once all its elements were consumed.
	 *
	 * Returns 1 on success or 0 when an error was set.
	 */
	Function FilterElements($raw, $end, &$elements)
	{
		$incoming = count($raw);
		$elements = array();
		if(count($this->buffer) == 0)
			$this->buffer = $raw;
		else
		{
			$next = 0;
			while($next < $incoming)
			{
				$this->buffer[] = $raw[$next];
				++$next;
			}
		}
		for(;;)
		{
			if(!$this->FilterElement($end, $element, $need_more_data))
				return(0);
			if(IsSet($element))
			{
				$elements[] = $element;
				UnSet($element);
			}
			if($need_more_data
			|| $this->state == MARKUP_FILTER_SAFE_HTML_END)
				break;
		}
		if($end
		&& $this->state!=MARKUP_FILTER_SAFE_HTML_END)
			return($this->SetError('reached a premature end of data', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX));
		/* Release the buffer once every element in it was processed. */
		if($this->buffer_position == count($this->buffer))
		{
			$this->buffer_position = 0;
			$this->buffer = array();
		}
		return(1);
	}

	/* Public functions */

/*
{metadocument}
	<function>
		<name>SetInput</name>
		<type>VOID</type>
		<documentation>
			<purpose>Set the object of the class that will be used to parse HTML
				document before it is filtered by this class.</purpose>
			<usage>Use this function only if you need to override the HTML
				parsing class, which by default is the markup validator
				class.</usage>
		</documentation>
		<argument>
			<name>input</name>
			<type>OBJECT</type>
			<documentation>
				<purpose>Reference to the HTML parser input object.</purpose>
			</documentation>
		</argument>
		<do>
{/metadocument}
*/
	Function SetInput(&$input)
	{
		/* Keep a reference to the caller's object rather than a copy. */
		$this->input = &$input;
	}
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*
{metadocument}
	<function>
		<name>GetPositionLine</name>
		<type>BOOLEAN</type>
		<documentation>
			<purpose>Get the line number of the document that corresponds to a
				given position.</purpose>
			<usage>Pass the document offset number as the position to be
				located. Make sure the <variablelink>track_lines</variablelink>
				variable is set to <booleanvalue>1</booleanvalue> before parsing
				the document.</usage>
			<returnvalue>This function returns <booleanvalue>1</booleanvalue> if
				 the <variablelink>track_lines</variablelink> variable is set to
				<booleanvalue>1</booleanvalue> and it was given a valid positive
				position number that does not exceed the position of the last
				parsed document line.</returnvalue>
		</documentation>
		<argument>
			<name>position</name>
			<type>INTEGER</type>
			<documentation>
				<purpose>Position of the line to be located.</purpose>
			</documentation>
		</argument>
		<argument>
			<name>line</name>
			<type>INTEGER</type>
			<out />
			<documentation>
				<purpose>Returns the number of the line that corresponds to the
					given document position.</purpose>
			</documentation>
		</argument>
		<argument>
			<name>column</name>
			<type>INTEGER</type>
			<out />
			<documentation>
				<purpose>Returns the number of the column of the line that
					corresponds to the given document position.</purpose>
			</documentation>
		</argument>
		<do>
{/metadocument}
*/
	Function GetPositionLine($position, &$line, &$column)
	{
		if(IsSet($this->input))
		{
			/* The input object is the one that tracks the document lines. */
			if($this->input->GetPositionLine($position, $line, $column))
				return(1);
			/* Propagate the error reported by the input object. */
			return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
		}
		return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));
	}
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*
{metadocument}
	<function>
		<name>FilterStylesheet</name>
		<type>BOOLEAN</type>
		<documentation>
			<purpose>Filter a CSS stylesheet to discard unsafe style
				definitions.</purpose>
			<usage>Pass a string with the text of the stylesheet to filter.</usage>
			<returnvalue>This function returns <booleanvalue>1</booleanvalue> if
				 the stylesheet string was parsed successfully.</returnvalue>
		</documentation>
		<argument>
			<name>stylesheet</name>
			<type>STRING</type>
			<documentation>
				<purpose>String of the stylesheet to parse.</purpose>
			</documentation>
		</argument>
		<argument>
			<name>filtered</name>
			<type>STRING</type>
			<out />
			<documentation>
				<purpose>Returns the filtered stylesheet without any unsafe CSS
					style definitions.</purpose>
			</documentation>
		</argument>
		<do>
{/metadocument}
*/
	function FilterStylesheet($stylesheet, &$filtered, $position = 0)
	{
		/*
		 * Parses $stylesheet with a new css_parser_class object kept in
		 * $this->css, passes the properties of every ruleset to FilterStyle
		 * and returns the resulting text in $filtered.
		 *
		 * $position is added to every position reported in errors and
		 * warnings. Line tracking in the CSS parser is only enabled when
		 * $this->track_lines is set and $position is 0.
		 *
		 * Returns 1 on success and 0 (or the result of SetPositionedError)
		 * on failure.
		 */
		$this->css = new css_parser_class;
		$this->css->store_positions = 1;
		$this->css->track_lines = ($this->track_lines && $position == 0);
		$filtered = '';
		$parsed = $this->css->ParseStylesheet($stylesheet, $styles);
		if(!$parsed)
		{
			/*
			 * A stylesheet that cannot be parsed is only reported as a
			 * warning and $filtered is left empty. The call fails just when
			 * SetPositionedWarning itself reports failure.
			 */
			$recorded = $this->SetPositionedWarning('stylesheet error: '.$this->css->error, MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_SYNTAX, $position + $this->css->error_position);
			return($recorded ? 1 : 0);
		}
		$unsafe_rulesets = 0;
		$total = count($styles);
		for($s = 0; $s < $total; ++$s)
		{
			/* Only rulesets are supported; any other element type is an error. */
			if($styles[$s]['Type'] != 'ruleset')
			{
				return($this->SetPositionedError('it is not possible to handle stylesheet elements of type '.$styles[$s]['Type'], MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $position + $styles[$s]['Position']));
			}
			if(!$this->FilterStyle($styles[$s]['RuleSet']['Properties'], $position, 1, $unsafe))
			{
				return(0);
			}
			if($unsafe)
			{
				++$unsafe_rulesets;
			}
			if(!$this->css->RewriteStyle($styles[$s], $style))
			{
				return($this->SetPositionedError('style rewrite error: '.$this->css->error, MARKUP_FILTER_SAFE_HTML_ERROR_UNEXPECTED, $position + $styles[$s]['Position']));
			}
			$filtered .= $style;
		}
		/*
		 * When no ruleset was flagged as unsafe, the original text is
		 * returned untouched instead of the rewritten version.
		 */
		if($unsafe_rulesets == 0)
		{
			$filtered = $stylesheet;
		}
		return(1);
	}
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*
{metadocument}
	<function>
		<name>GetStylesheetPositionLine</name>
		<type>BOOLEAN</type>
		<documentation>
			<purpose>Get the line number of a given position of the original
				stylesheet filtered with the
				<functionlink>FilterStylesheet</functionlink> function.</purpose>
			<usage>Pass the stylesheet offset number as the position to be
				located. Make sure the <variablelink>track_lines</variablelink>
				variable is set to <booleanvalue>1</booleanvalue> before parsing
				the stylesheet.</usage>
			<returnvalue>This function returns <booleanvalue>1</booleanvalue> if
				 the <variablelink>track_lines</variablelink> variable is set to
				<booleanvalue>1</booleanvalue> and it was given a valid positive
				position number that does not exceed the position of the last
				parsed stylesheet line.</returnvalue>
		</documentation>
		<argument>
			<name>position</name>
			<type>INTEGER</type>
			<documentation>
				<purpose>Position of the line to be located.</purpose>
			</documentation>
		</argument>
		<argument>
			<name>line</name>
			<type>INTEGER</type>
			<out />
			<documentation>
				<purpose>Returns the number of the line that corresponds to the
					given stylesheet position.</purpose>
			</documentation>
		</argument>
		<argument>
			<name>column</name>
			<type>INTEGER</type>
			<out />
			<documentation>
				<purpose>Returns the number of the column of the line that
					corresponds to the given stylesheet position.</purpose>
			</documentation>
		</argument>
		<do>
{/metadocument}
*/
	Function GetStylesheetPositionLine($position, &$line, &$column)
	{
		/*
		 * Resolve an offset of the stylesheet into a line and column using
		 * the CSS parser object that FilterStylesheet stores in $this->css.
		 *
		 * Returns 1 when the CSS parser located the position; otherwise
		 * returns the result of SetPositionedError.
		 */
		if(!IsSet($this->css))
		{
			/*
			 * Asking for a position before any stylesheet was filtered is a
			 * usage error. SetPositionedError takes the error code as its
			 * second argument and the position as its third.
			 */
			return($this->SetPositionedError('no stylesheet was previously filtered', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE, $position));
		}
		if(!$this->css->GetPositionLine($position, $line, $column))
			return($this->SetPositionedError($this->css->error, MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE, $position));
		return(1);
	}
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*
{metadocument}
	<function>
		<name>StartParsing</name>
		<type>BOOLEAN</type>
		<documentation>
			<purpose>Initialize the state of the markup parser.</purpose>
			<usage>Call this function before starting to parse the markup
				document, passing the file name or data to be parsed and,
				optionally, other parsing option parameters.</usage>
			<returnvalue>Returns <booleanvalue>1</booleanvalue> if all
				parameters are correctly defined.</returnvalue>
		</documentation>
		<argument>
			<name>parameters</name>
			<type>HASH</type>
			<documentation>
				<purpose>Specifies a list of options that define how to parse the
					given document. Currently it has the following options:
				<paragraphbreak />
				<tt>Data</tt> - String with the markup data to be parsed
				<paragraphbreak />
				<tt>File</tt> - Name of the file from which the data to be parsed
					should be read instead of a static string.
				<paragraphbreak />
				<tt>OnlyBody</tt> - Determine whether the HTML document should be
					parsed just as the BODY section or as a complete HTML document.
				<paragraphbreak />
				<tt>DTDCachePath</tt> - Path of directory where the cached DTD
					files will be stored to prevent the overhead of fetching the DTD
					files from the remote DTD sites every time an HTML document is
					parsed. If this parameter is missing, the DTD will not be cached.
				</purpose>
			</documentation>
		</argument>
		<do>
{/metadocument}
*/
	function StartParsing($parameters)
	{
		/*
		 * Reset the filter state and start the input parser.
		 * Returns 1 when the input object accepted the parameters.
		 */
		$this->state = MARKUP_FILTER_SAFE_HTML_START;
		$this->buffer = array();
		$this->buffer_position = 0;
		$this->warnings = array();

		/*
		 * These two options are always imposed on the input parser,
		 * overriding whatever the caller may have passed for them.
		 */
		$parameters['DecodeEntities'] = 1;
		$parameters['AttributeTypes'] = array(
			'URI'=>array(),
			'StyleSheet'=>array(),
		);

		/*
		 * When no input object was attached with SetInput, create a
		 * validator filter configured with the options of this object.
		 */
		if(!isset($this->input))
		{
			$this->input = new markup_filter_validator_class;
			$this->input->track_lines = $this->track_lines;
			$this->input->buffer_length = $this->buffer_length;
			$this->input->ignore_syntax_errors = $this->ignore_syntax_errors;
			$this->input->store_positions = $this->store_positions;
		}

		if($this->input->StartParsing($parameters))
		{
			return(1);
		}
		return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
	}
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*
{metadocument}
	<function>
		<name>Parse</name>
		<type>BOOLEAN</type>
		<documentation>
			<purpose>Parse the markup document.</purpose>
			<usage>Call this function iteratively until the <argumentlink>
					<function>Parse</function>
					<argument>end</argument>
				</argumentlink> argument is returned set to
				<booleanvalue>1</booleanvalue>.</usage>
			<returnvalue>Returns <booleanvalue>1</booleanvalue> if there were no
				fatal parsing errors.</returnvalue>
		</documentation>
		<argument>
			<name>end</name>
			<type>BOOLEAN</type>
			<out />
			<documentation>
				<purpose>Determine when the parser reached the end of the
					document.</purpose>
			</documentation>
		</argument>
		<argument>
			<name>elements</name>
			<type>ARRAY</type>
			<out />
			<documentation>
				<purpose>Return a sequence of associative arrays with entries that
					describe each document element that was parsed.</purpose>
			</documentation>
		</argument>
		<do>
{/metadocument}
*/
	function Parse(&$end, &$elements)
	{
		/*
		 * Pull the next batch of elements from the input object and pass
		 * them through FilterElements. FinishParsing is called as soon as
		 * either step fails or the input reports the end of the document.
		 */
		if(!isset($this->input))
		{
			return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));
		}
		$parsed = $this->input->Parse($end, $raw);
		if(!$parsed)
		{
			$this->FinishParsing();
			return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
		}
		$filtered = $this->FilterElements($raw, $end, $elements);
		if(!$filtered || $end)
		{
			/* Either filtering failed or the document is finished. */
			$this->FinishParsing();
		}
		return($filtered ? 1 : 0);
	}
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*
{metadocument}
	<function>
		<name>FinishParsing</name>
		<type>BOOLEAN</type>
		<documentation>
			<purpose>Close any files and release any resources allocated while
				the document was being parsed.</purpose>
			<usage>Call this function after you are done with parsing the markup
				document.</usage>
			<returnvalue>Returns <booleanvalue>1</booleanvalue> if all resources
				were successfully released.</returnvalue>
		</documentation>
		<do>
{/metadocument}
*/
	function FinishParsing()
	{
		/*
		 * Finish the input parser and then copy its warnings into the
		 * warnings of this object. Returns 1 on success.
		 */
		if(!isset($this->input))
		{
			return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));
		}
		if(!$this->input->FinishParsing())
		{
			return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
		}
		foreach($this->input->warnings as $warning => $message)
		{
			/* A warning already recorded here under the same key is kept. */
			if(!isset($this->warnings[$warning]))
			{
				$this->warnings[$warning] = $message;
			}
		}
		return(1);
	}
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*
{metadocument}
	<function>
		<name>RewriteElement</name>
		<type>BOOLEAN</type>
		<documentation>
			<purpose>Generate a string for a previously parsed document markup
				element.</purpose>
			<usage>Call this function for each markup element when you want to
				regenerate an element that was just parsed and possibly
				filtered.</usage>
			<returnvalue>Returns <booleanvalue>0</booleanvalue> if it is passed an
				invalid element definition.</returnvalue>
		</documentation>
		<argument>
			<name>element</name>
			<type>HASH</type>
			<documentation>
				<purpose>Associative array that defines the type and the values of
					the document element to be rewritten.</purpose>
			</documentation>
		</argument>
		<argument>
			<name>markup</name>
			<type>STRING</type>
			<out />
			<documentation>
				<purpose>Return the string of the rewritten document element.</purpose>
			</documentation>
		</argument>
		<do>
{/metadocument}
*/
	function RewriteElement($element, &$markup)
	{
		/*
		 * Rewriting is delegated to the input object; this function only
		 * relays the resulting markup or the error that was reported.
		 */
		if(!isset($this->input))
		{
			return($this->SetError('it was not specified the input object', MARKUP_FILTER_SAFE_HTML_ERROR_INVALID_USAGE));
		}
		$rewritten = $this->input->RewriteElement($element, $markup);
		if($rewritten)
		{
			return(1);
		}
		return($this->SetPositionedError($this->input->error, $this->input->error_code, $this->input->error_position));
	}
};
/*
{metadocument}
		</do>
	</function>
{/metadocument}
*/

/*

{metadocument}
</class>
{/metadocument}

*/

?>
Return current item: Secure HTML parser and filter,XSS,CSRF