Location: PHPKode > projects > WebSecurityTester > php-gtk/security/lib/HtmlParser.cls.php
<?
/**
* Class HtmlParser
* 
* This class allows you to parse HTML pages
* 
* @Author: Simone Cosci <hide@address.com>
* @Version: 1.0
* 
* 
* */


class HtmlParser
{

	/**
	 * @access public
	 * @var string HTML to be parsed
	 */
	var $html;
	
	/**
	 * Forms Collection FOUND (return of HtmlFormParser->parseForms())
	 * @access public
	 * @var array Forms
	 */
	var $forms;
	
	/**
	 * Images Collection FOUND (values of the src attribute of IMG tags)
	 * @access public
	 * @var array Images
	 */
	var $images;
	
	/**
	 * CSS Collection FOUND
	 * Keys: 'inline' (one CssParser result per STYLE element),
	 * 'linked' (href of LINK tags, stored as array keys),
	 * 'url' (url(...) targets found inside STYLE elements, stored as array keys)
	 * @access public
	 * @var array CSS
	 */
	var $css = array();
	
	/**
	 * Scripts Collection FOUND
	 * Keys: 'inline' (script bodies) and 'linked' (src attribute values)
	 * @access public
	 * @var array Scripts
	 */
	var $scripts = array();
	
	/**
	 * HypertextReferers Collection FOUND
	 * @access public
	 * @var array HypertextReferers
	 */
	var $hrefs;
	
	/**
	 * META Collection FOUND, indexed as [attribute]['name'|'http-equiv' value] => content
	 * @access public
	 * @var array Meta
	 */
	var $meta;
	
	/**
	 * links collected
	 * @access public
	 * @var array
	 */
	var $links;

	/**
	 * HtmlParser Constructor
	 * 
	 * Stores the HTML (when not empty) and resets every collection to an empty array.
	 * 
	 * @access public
	 * @param string $html HTML to be parsed
	 * @return HtmlParser
	 */
	function __construct($html=''){
		
		if($html!='') $this->html = $html;
		
		$this->forms = array();
		$this->images = array();
		$this->hrefs = array();
		$this->meta = array();
		$this->links = array();
		
		$this->scripts['inline'] = array();
		$this->scripts['linked'] = array();
		
		$this->css['inline'] = array();
		$this->css['linked'] = array();
		$this->css['url'] = array();
		
	}
	
	/**
	 * Legacy (PHP 4 style) constructor name
	 * 
	 * PHP 8 no longer calls a method named after the class on "new", so the
	 * real work lives in __construct(); this method is kept for code that
	 * calls it explicitly.
	 * 
	 * @access public
	 * @param string $html HTML to be parsed
	 * @return void
	 */
	function HtmlParser($html=''){
		$this->__construct($html);
	}
	
	/**
	 * Parse Function
	 * 
	 * Runs every parser in turn on $this->html and fills the public collections.
	 * HtmlFormParser is not defined in this class and must be loaded by the caller.
	 * 
	 * @access public
	 * @return void
	 */
	function Parse(){
		
		/* Parse FORMS */
		$myHtmlFormParser = new HtmlFormParser($this->html);
		$this->forms = $myHtmlFormParser->parseForms();
		unset($myHtmlFormParser);
		
		/* Parse inline and linked CSS */
		$this->_parseCSS();

		/* Parse IMG Tags */
		$this->_parseImages();

		/* Parse SCRIPT Tags */
		$this->_parseScripts();		
		
		/* Parse <A HREF Tags */
		$this->_parseHrefs();		
		
		/* Parse META Tags */
		$this->_parseMetaTags();
		
		/* Parse EVERY LINK Tag */
		$this->_parseLinks();	
		
	}
	
	/**
	 * Find every opening tag whose markup starts with "<$tagName"
	 * @access private
	 * @param string $tagName Tag name prefix (used verbatim inside the regex)
	 * @return array Matched tags, from "<" up to the first following ">"
	 */
	function _findTags($tagName){
		if ( preg_match_all("/<".$tagName.".*>/isU", $this->html, $found)){
			return $found[0];
		}
		return array();
	}
	
	/**
	 * Extract one attribute value from the markup of a single tag
	 * @access private
	 * @param string $tag       Markup of one tag, e.g. <img src="a.png">
	 * @param string $tagName   Tag name prefix the markup must contain
	 * @param string $attribute Attribute name (used verbatim inside the regex)
	 * @return string|null Value without quotes and angle brackets, null when the attribute is not found
	 */
	function _attributeValue($tag, $tagName, $attribute){
		// The value may be double quoted, single quoted or bare (up to the next blank or ">").
		$pattern = "/<".$tagName.".*".$attribute."=(\"([^\"]*)\"|'([^']*)'|[^>\s]*)([^>]*)?>/is";
		if(!preg_match($pattern, $tag, $found)) return null;
		return preg_replace("/[\"'<>]/", "", $found[1]);
	}
	
	/**
	 * Parse Images
	 * 
	 * Appends the src of every IMG tag to $this->images and drops duplicates
	 * (array_unique keeps the original keys, so the keys may have gaps).
	 * 
	 * @access private
	 * @return void
	 */
	function _parseImages(){
		foreach ( $this->_findTags('img') as $image ) {
			$src = $this->_attributeValue($image, 'img', 'src');
			if($src !== null) $this->images[] = $src;
		}
		$this->images = array_unique($this->images);
	}
	
	/**
	 * Parse CSS
	 * 
	 * For every STYLE element: records the url(...) targets found in its body
	 * and hands the body to CssParser (not defined in this class).
	 * For every LINK tag carrying an href: records the href.
	 * 
	 * @access private
	 * @return void
	 */
	function _parseCSS(){
		// Capture the text BETWEEN the tags; matching only "<style...>" would never see the CSS itself.
		if ( preg_match_all("/<style[^>]*>(.*)<\/style>/isU", $this->html, $blocks)){
			foreach ( $blocks[1] as $style ) {
				// Literal "url(" ... ")" with an optionally quoted target.
				if(preg_match_all("/url\s*\(\s*(\"[^\"]*\"|'[^']*'|[^)\s]*)\s*\)/i", $style, $urls)){
					foreach ( $urls[1] as $url ) {
						$url = preg_replace("/[\"'<>]/", "", $url);
						if($url !== '') $this->css['url'][$url] = '';
					}
				}
				$myCssParser = new CssParser();
				$myCssParser->ParseStr($style);
				$this->css['inline'][] = $myCssParser->css;
			}
		}
		foreach ( $this->_findTags('link') as $link ) {
			$url = $this->_attributeValue($link, 'link', 'href');
			// A LINK tag without href must not create an empty key.
			if($url !== null) $this->css['linked'][$url] = '';
		}
	}

	/**
	 * Parse Scripts
	 * 
	 * Linked scripts: the src of every SCRIPT opening tag.
	 * Inline scripts: the chunks returned by stripfromtextarray() (a helper not
	 * defined in this class), each with everything up to and including the
	 * first ">" removed; empty results are skipped.
	 * 
	 * @access private
	 * @return void
	 */
	function _parseScripts(){
		// Inspect each opening tag on its own so one element can never hide the src of another.
		foreach ( $this->_findTags('script') as $tag ) {
			$src = $this->_attributeValue($tag, 'script', 'src');
			if($src !== null) $this->scripts['linked'][] = $src;
		}
		$bodies = stripfromtextarray($this->html,'<script','</script>');
		if(!is_array($bodies)) return;
		foreach($bodies as $body){
			// Drop the remainder of the opening tag (attributes and closing ">").
			$end = strpos($body, '>');
			$body = ($end === false) ? '' : (string)substr($body, $end + 1);
			if(!empty($body))
				$this->scripts['inline'][] = $body;
		}
	}
	
	/**
	 * Parse HREF
	 * 
	 * Appends the href of every tag whose markup starts with "<a" to $this->hrefs.
	 * 
	 * @access private
	 * @return void
	 */
	function _parseHrefs(){
		foreach ( $this->_findTags('a') as $anchor ) {
			$href = $this->_attributeValue($anchor, 'a', 'href');
			if($href !== null) $this->hrefs[] = $href;
		}
	}

	/**
	 * Parse META
	 * 
	 * For every META tag that has a content attribute together with a name or
	 * http-equiv attribute, stores $this->meta[attribute][attribute value] = content.
	 * 
	 * @access private
	 * @return void
	 */
	function _parseMetaTags() {
		$names = array('name','http-equiv');
		foreach ( $this->_findTags('meta') as $meta ) {
			foreach ($names as $name){
				$key = $this->_attributeValue($meta, 'meta', $name);
				if($key === null) continue;
				$content = $this->_attributeValue($meta, 'meta', 'content');
				if($content === null) continue;
				$this->meta[$name][$key] = $content;
			}
		}
	}
	
	/**
	 * Parse Links
	 * 
	 * Collects the targets of @import rules (capture group 3) and of
	 * src/href/url attributes (capture group 8) into $this->links.
	 * NOTE(review): the pattern also captures CSS url(...) targets (group 5)
	 * but they are not merged into the result - confirm whether that is intended.
	 * array_remove() is not defined in this class; presumably it drops the
	 * empty strings left by non participating groups - verify against its definition.
	 * 
	 * @access private
	 * @return void
	 */
	function _parseLinks(){
		//Pattern building across multiple lines to avoid page distortion.
		$pattern  = "/((@import\s+[\"'`]([\w:?=@&\/#._;-]+)[\"'`];)|";
		$pattern .= "(:\s*url\s*\([\s\"'`]*([\w:?=@&\/#._;-]+)";
		$pattern .= "([\s\"'`]*\))|<[^>]*\s+(src|href|url)\=[\s\"'`]*";
		$pattern .= "([\w:?=@&\/#._;-]+)[\s\"'`]*[^>]*>))/i";
		//End pattern building.
		$matches = array();
		$merged = array();
		// preg_match_all() returns false on a PCRE error; only touch the groups when they exist.
		if(preg_match_all($pattern, $this->html, $matches) !== false && isset($matches[3], $matches[8])){
			$merged = array_merge($matches[3], $matches[8]);
		}
		$this->links = array_unique($merged);
		$this->links = array_remove($this->links, '', false);
	}
}

?>
Return current item: WebSecurityTester