<?
/**
* Class HtmlParser
*
* This class allows you to parse HTML pages
*
* @Author: Simone Cosci <hide@address.com>
* @Version: 1.0
*
*
* */
class HtmlParser
{
    /**
     * @access public
     * @var string HTML to be parsed
     */
    public $html = '';
    /**
     * Forms Collection FOUND (return of HtmlFormParser->parseForms())
     * @access public
     * @var array Forms
     */
    public $forms = array();
    /**
     * Images Collection FOUND (values of IMG "src" attributes)
     * @access public
     * @var array Images
     */
    public $images = array();
    /**
     * CSS Collection FOUND.
     * Keys: 'inline' (one CssParser result per STYLE element),
     * 'linked' (LINK href => ''), 'url' (url(...) target => '').
     * @access public
     * @var array CSS
     */
    public $css = array();
    /**
     * Scripts Collection FOUND.
     * Keys: 'inline' (script bodies), 'linked' (SCRIPT "src" values).
     * @access public
     * @var array Scripts
     */
    public $scripts = array();
    /**
     * HypertextReferers Collection FOUND (values of A "href" attributes)
     * @access public
     * @var array HypertextReferers
     */
    public $hrefs = array();
    /**
     * META Collection FOUND.
     * Shape: $meta['name'|'http-equiv'][<attribute value>] = <content value>.
     * Declared explicitly; it used to be created on the fly by the constructor.
     * @access public
     * @var array Meta
     */
    public $meta = array();
    /**
     * links collected
     * @access public
     * @var array
     */
    public $links = array();

    /**
     * HtmlParser Constructor
     *
     * Stores the markup (when one is given) and resets every collection.
     *
     * @access public
     * @param string $html HTML to be parsed
     */
    public function __construct($html = '')
    {
        if ($html !== null && $html !== '') {
            $this->html = $html;
        }
        $this->forms = array();
        $this->images = array();
        $this->hrefs = array();
        $this->meta = array();
        $this->links = array();
        $this->scripts['inline'] = array();
        $this->scripts['linked'] = array();
        $this->css['inline'] = array();
        $this->css['linked'] = array();
        $this->css['url'] = array();
    }

    /**
     * Legacy PHP 4 style constructor name.
     *
     * Kept so existing code that calls it explicitly keeps working. Since
     * PHP 8 a method named after its class is no longer a constructor, which
     * is why the real initialisation lives in __construct().
     *
     * @access public
     * @deprecated use "new HtmlParser($html)"
     * @param string $html HTML to be parsed
     * @return void
     */
    public function HtmlParser($html = '')
    {
        $this->__construct($html);
    }

    /**
     * Parse Function
     *
     * Runs every extractor against $this->html and fills the public
     * collections. HtmlFormParser is declared outside this file.
     *
     * @access public
     * @return void
     */
    public function Parse()
    {
        /* Parse FORMS */
        $myHtmlFormParser = new HtmlFormParser($this->html);
        $this->forms = $myHtmlFormParser->parseForms();
        unset($myHtmlFormParser);
        /* Parse inline and linked CSS */
        $this->_parseCSS();
        /* Parse IMG Tags */
        $this->_parseImages();
        /* Parse SCRIPT Tags */
        $this->_parseScripts();
        /* Parse <A HREF Tags */
        $this->_parseHrefs();
        /* Parse META Tags */
        $this->_parseMetaTags();
        /* Parse EVERY LINK Tag */
        $this->_parseLinks();
    }

    /**
     * Read one attribute out of a single opening tag.
     *
     * The tag is tokenised attribute by attribute so that quoted values are
     * skipped as a whole: asking for "src" never returns "data-src", nor a
     * "src=" that merely appears inside another value. The first attribute
     * with the requested name wins; the name comparison is case-insensitive.
     *
     * @access private
     * @param string $tag       opening tag, e.g. <img src="a.png">
     * @param string $attribute attribute name to look for
     * @return string|null value without quotes and angle brackets, or null
     *                     when the attribute is absent or carries no value
     */
    private function _attributeValue($tag, $attribute)
    {
        $tokens = array();
        $pattern = '/([^\s"\'<>\/=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]*))?/';
        if (!preg_match_all($pattern, (string) $tag, $tokens, PREG_SET_ORDER)) {
            return null;
        }
        foreach ($tokens as $token) {
            // $token[2] is missing for the tag name and for valueless attributes.
            if (isset($token[2]) && strcasecmp($token[1], $attribute) === 0) {
                return preg_replace("/[\"'<>]/", "", $token[2]);
            }
        }
        return null;
    }

    /**
     * Parse Images
     *
     * Collects the "src" of every IMG tag into $this->images and drops
     * duplicates (array_unique keeps the original keys).
     *
     * @access private
     * @return void
     */
    public function _parseImages()
    {
        $tags = array();
        if (preg_match_all('/<img\s[^>]*>/i', (string) $this->html, $tags)) {
            foreach ($tags[0] as $tag) {
                $src = $this->_attributeValue($tag, 'src');
                if ($src !== null) {
                    $this->images[] = $src;
                }
            }
        }
        $this->images = array_unique($this->images);
    }

    /**
     * Parse CSS
     *
     * For every STYLE element: records each url(...) target as a key of
     * $this->css['url'] and appends the CssParser result for the element
     * body to $this->css['inline']. For every LINK tag that has an href:
     * records the href as a key of $this->css['linked'].
     *
     * CssParser is declared outside this file; it is assumed to expect the
     * stylesheet text itself rather than the surrounding tag - TODO confirm.
     *
     * @access private
     * @return void
     */
    public function _parseCSS()
    {
        $html = (string) $this->html;

        $styles = array();
        // Capture the element body; the opening tag alone carries no CSS.
        if (preg_match_all('/<style\b[^>]*>(.*?)<\/style\s*>/is', $html, $styles)) {
            foreach ($styles[1] as $styleSheet) {
                $urls = array();
                if (preg_match_all('/url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)/i', $styleSheet, $urls)) {
                    foreach ($urls[1] as $url) {
                        $this->css['url'][$url] = '';
                    }
                }
                $myCssParser = new CssParser();
                $myCssParser->ParseStr($styleSheet);
                $this->css['inline'][] = $myCssParser->css;
            }
        }

        $linkTags = array();
        if (preg_match_all('/<link\s[^>]*>/i', $html, $linkTags)) {
            foreach ($linkTags[0] as $linkTag) {
                $url = $this->_attributeValue($linkTag, 'href');
                // A LINK without href has nothing to record.
                if ($url !== null) {
                    $this->css['linked'][$url] = '';
                }
            }
        }
    }

    /**
     * Parse Scripts
     *
     * Linked scripts: the "src" of every opening SCRIPT tag goes to
     * $this->scripts['linked']. Inline scripts: each chunk returned by
     * stripfromtextarray() has the remainder of its opening tag (everything
     * up to and including the first ">") removed; non-empty results go to
     * $this->scripts['inline'].
     *
     * stripfromtextarray() is defined outside this file; presumably it
     * returns the text found between the two markers - verify.
     *
     * @access private
     * @return void
     */
    public function _parseScripts()
    {
        $html = (string) $this->html;

        $tags = array();
        if (preg_match_all('/<script\s[^>]*>/i', $html, $tags)) {
            foreach ($tags[0] as $tag) {
                $src = $this->_attributeValue($tag, 'src');
                if ($src !== null) {
                    $this->scripts['linked'][] = $src;
                }
            }
        }

        $chunks = stripfromtextarray($this->html, '<script', '</script>');
        foreach ($chunks as $chunk) {
            $tagEnd = strpos($chunk, '>');
            // Without a ">" the whole chunk is tag residue, i.e. no body.
            $body = ($tagEnd === false) ? '' : (string) substr($chunk, $tagEnd + 1);
            if (!empty($body)) {
                $this->scripts['inline'][] = $body;
            }
        }
    }

    /**
     * Parse HREF
     *
     * Appends the "href" of every A tag to $this->hrefs. Only real anchors
     * are considered: tags whose name merely starts with "a" are ignored.
     *
     * @access private
     * @return void
     */
    public function _parseHrefs()
    {
        $tags = array();
        if (preg_match_all('/<a\s[^>]*>/i', (string) $this->html, $tags)) {
            foreach ($tags[0] as $tag) {
                $href = $this->_attributeValue($tag, 'href');
                if ($href !== null) {
                    $this->hrefs[] = $href;
                }
            }
        }
    }

    /**
     * Parse META
     *
     * For every META tag carrying "content" together with "name" and/or
     * "http-equiv", stores $this->meta[<kind>][<kind value>] = <content>.
     *
     * @access private
     * @return void
     */
    public function _parseMetaTags()
    {
        $tags = array();
        if (!preg_match_all('/<meta\s[^>]*>/i', (string) $this->html, $tags)) {
            return;
        }
        $kinds = array('name', 'http-equiv');
        foreach ($tags[0] as $tag) {
            $content = $this->_attributeValue($tag, 'content');
            if ($content === null) {
                continue;
            }
            foreach ($kinds as $kind) {
                $key = $this->_attributeValue($tag, $kind);
                if ($key !== null) {
                    $this->meta[$kind][$key] = $content;
                }
            }
        }
    }

    /**
     * Parse Links
     *
     * Collects @import targets and the values of src/href/url attributes of
     * any tag into $this->links, removes duplicates and then hands the list
     * to array_remove() (defined outside this file; presumably it drops the
     * empty entries - verify).
     *
     * NOTE(review): the pattern also captures CSS url(...) targets (group 5)
     * but they are not merged into the result - confirm this is intended.
     *
     * @access private
     * @return void
     */
    public function _parseLinks()
    {
        //Pattern building across multiple lines to avoid page distortion.
        $pattern = "/((@import\s+[\"'`]([\w:?=@&\/#._;-]+)[\"'`];)|";
        $pattern .= "(:\s*url\s*\([\s\"'`]*([\w:?=@&\/#._;-]+)";
        $pattern .= "([\s\"'`]*\))|<[^>]*\s+(src|href|url)\=[\s\"'`]*";
        $pattern .= "([\w:?=@&\/#._;-]+)[\s\"'`]*[^>]*>))/i";
        //End pattern building.
        $matches = array();
        preg_match_all($pattern, (string) $this->html, $matches);
        // Group 3: @import targets, group 8: tag attribute values.
        $imported = isset($matches[3]) ? $matches[3] : array();
        $referenced = isset($matches[8]) ? $matches[8] : array();
        $this->links = array_unique(array_merge($imported, $referenced));
        $this->links = array_remove($this->links, '', false);
    }
}
?>