Location: PHPKode > scripts > BowML > BowHTMLParser.inc.php
<?php

/**
 * Class : BowHTMLParser
 * Developed by Jonas Eriksson
 * @copyright 2004 BICOM KB (JULI)
 **/
  
 class BowHTMLParser
 {
	/*Instance variables*/
	var $contents;                 /* Trimmed markup text that is scanned character by character */
	var $document;                 /* BowHTMLDocument instance that receives the parsed nodes */
	var $openedTags;               /* node id => tag name; the last entry is the parent of new nodes */
	var $openedAttributeTags;      /* node id => tag name; the last entry receives parsed attributes */
	var $currentCharacterPosition; /* Offset of the scan cursor inside $contents */
	var $lastCharacterPosition;    /* Offset of the final character of $contents */
	var $stopParseAt;              /* null, or array('name' => lower-cased tag name, 'id' => node id or null) */
	
	/*Setter and getter for $contents (the setter trims surrounding white space)*/
	function contents() { return $this->contents; }
	function contentsSet( $someContents ) { $this->contents = trim($someContents); }

	/*Setter and getter for $currentCharacterPosition*/
	function currentCharacterPosition() { return $this->currentCharacterPosition; }
	function currentCharacterPositionSet( $aCharacter ) { $this->currentCharacterPosition = $aCharacter; }

	/*Setter and getter for $lastCharacterPosition*/
	function lastCharacterPosition() { return $this->lastCharacterPosition; }
	function lastCharacterPositionSet( $aCharacter ) { $this->lastCharacterPosition = $aCharacter; }
	
	/*Setter and Getter for $document (the getter returns a reference)*/
	function &document() { return $this->document; }
	function documentSet( $aDocument ) { return $this->document =& $aDocument; }
	
	/*Setter and Getter for $openedTags (the getter returns a reference)*/
	function &openedTags() { return $this->openedTags; }
	function openedTagsSet( $someOpenedTags ) { return $this->openedTags =& $someOpenedTags; }

	/*Setter and Getter for $openedAttributeTags (the getter returns a reference)*/
	function &openedAttributeTags() { return $this->openedAttributeTags; }
	function openedAttributeTagsSet( $someOpenedAttributeTags ) { return $this->openedAttributeTags =& $someOpenedAttributeTags; }

	/*Setter and Getter for $stopParseAt*/
	function stopParseAt() { return $this->stopParseAt; }
	function stopParseAtSet( $p ) { return $this->stopParseAt = $p; }	
	
	/**
	* @return void
	* @param string $aHTMLFile Path (or URL wrapper) handed to file()
	* @param int $pStartAtPosition Offset to start scanning at; null means 0
	* @param string $pStopAtTagEnd Tag name; parsing ends when the first such tag is closed
	* @desc Reads the file, resets the parser state and parses the whole document.
	*       Declared before the PHP 4 style constructor below so that PHP 5+ picks
	*       this one (PHP 8 no longer treats same-name methods as constructors).
	*/
	function __construct($aHTMLFile, $pStartAtPosition = null, $pStopAtTagEnd = null)
	{
		/* file() rather than file_get_contents(): the latter wont work in PHP versions prior to 4.3 */
		$tLines = file($aHTMLFile);
		if(!is_array($tLines))
			$tLines = array(); /* unreadable file: behave as if it was empty */
		$this->contentsSet(implode('', $tLines));

		if(is_null($pStartAtPosition))
			$this->currentCharacterPositionSet(0);
		else 
			$this->currentCharacterPositionSet($pStartAtPosition);
		
		$this->lastCharacterPositionSet((strlen($this->contents())-1));
		
		if(!is_null($pStopAtTagEnd))
			$this->stopParseAtSet(array('name'=>strtolower($pStopAtTagEnd), 'id'=>null));
		else
			$this->stopParseAtSet(null);
			
		$this->documentSet(new BowHTMLDocument());
		$this->openedTagsSet(array());
		$this->openedAttributeTagsSet(array());
		set_time_limit(2460);
		$this->parseDocument();
		$this->openedAttributeTagsSet(array());
	}

	/**
	* @return void
	* @desc PHP 4 style constructor, kept so that PHP 4 and explicit
	*       parent::BowHTMLParser() calls keep working. Delegates to __construct().
	*/
	function BowHTMLParser($aHTMLFile, $pStartAtPosition = null, $pStopAtTagEnd = null)
	{
		$this->__construct($aHTMLFile, $pStartAtPosition, $pStopAtTagEnd);
	}

	/*Sentinel returned by the scanning methods when the end of the contents (or the stop tag) is reached*/
	function end()
	{
		return 'bowMessageEndOfParsing';
	}
	
	/*Sentinel returned by getStringNode() when no text precedes the next tag*/
	function noStringNode()
	{
		return 'bowMessageNoStringNode';
	}
	
	/*Sentinel returned when the scanned construct (script, style, xml descriptor, doctype) is skipped*/
	function notATag()
	{
		return 'bowMessageNotATag';
	}
	
	/**
	* @return void
	* @desc PRIVATE: Increment the currentCharacterPosition
	*/
	function incrementCharacter() 
	{ 
		$this->currentCharacterPositionSet($this->currentCharacterPosition + 1); 
	}

	/**
	* @return string
	* @desc PRIVATE: Retrieves the character at the current position
	*/
	function getCurrentCharacter() { return $this->contents[$this->currentCharacterPosition()]; }

	/**
	* @return string
	* @desc PRIVATE: Advances the cursor by one and returns the character there,
	*       or the end() sentinel when the cursor already is on the last character
	*/
	function getNextCharacter() 
	{
		if($this->currentCharacterPosition() < $this->lastCharacterPosition())
		{
			$this->incrementCharacter(); 
			return $this->contents[$this->currentCharacterPosition()]; 
		}
		else 
		{
			return $this->end();
		}
	}
	
	/*Returns the current time as a float of seconds, built from microtime()*/
	function getmicrotime()
	{ 
		list($usec, $sec) = explode(" ",microtime()); 
		return ((float)$usec + (float)$sec); 
	} 
	
	/**
	* @return string
	* @desc PRIVATE: Returns the character after the cursor without moving it;
	*       null when the cursor is on the last character
	*/
	function previewNextCharacter() 
	{
		if($this->currentCharacterPosition() < $this->lastCharacterPosition())
		{ 
			return $this->contents[$this->currentCharacterPosition()+1]; 
		}
	}
	
	/**
	* @return string
	* @param int $pInteger Distance from the cursor
	* @desc PRIVATE: Returns the character $pInteger positions after the cursor
	*       without moving it; null when that position lies outside the contents
	*/
	function previewFromCurrent($pInteger)
	{
		$tTarget = $this->currentCharacterPosition() + $pInteger;
		/* The target offset itself has to be checked, not only the cursor */
		if($tTarget >= 0 && $tTarget <= $this->lastCharacterPosition())
		{
			return $this->contents[$tTarget];
		}
	}
	
	/**
	* @return array
	* @param array $pTags node id => tag name
	* @desc PRIVATE: Returns the last entry of $pTags in the shape each() used to
	*       produce (keys 0/'key' hold the node id, 1/'value' the tag name), or
	*       array(0=>"Root") when $pTags is empty. each() itself is gone in PHP 8.
	*/
	function lastEntryOf($pTags)
	{
		if(empty($pTags))
			return array(0=>"Root");
		end($pTags);
		$tKey = key($pTags);
		$tValue = current($pTags);
		return array(1=>$tValue, 'value'=>$tValue, 0=>$tKey, 'key'=>$tKey);
	}
	
	/*Returns the most recently opened tag, see lastEntryOf() for the shape*/
	function lastOpenedTag()
	{
		return $this->lastEntryOf($this->openedTags());
	}
	
	/*Returns the most recently added attribute tag, see lastEntryOf() for the shape*/
	function lastOpenedAttributeTag()
	{
		return $this->lastEntryOf($this->openedAttributeTags());
	}
	
	/*Returns true if the passed node id is included in my $openedTags, false otherwise*/
	function isOpened($nodeId)
	{
		if(is_null($nodeId))
			return false;
		/* $openedTags maps node id => tag name, so the ids are the keys */
		$tOpenedTags = $this->openedTags();
		return is_array($tOpenedTags) && isset($tOpenedTags[$nodeId]);
	}

	/*Add the passed $nodeId to the collection of $openedTags.
	  The first opened tag that matches the stop tag name is remembered as the stop node.*/	
	function openTag($nodeId, $name)
	{
		if(is_array($this->stopParseAt()) && $this->stopParseAt['name'] == strtolower($name))
		{
			if(is_null($this->stopParseAt['id'])) 
				$this->stopParseAt['id'] = $nodeId;
		}	
		$myOpenedTags =& $this->openedTags();
		$myOpenedTags[$nodeId] = $name;
		$this->openedTagsSet($myOpenedTags);
	}
	
	/*Add the passed $nodeId to the collection of $openedAttributeTags*/
	function openAttributeTag($nodeId, $name)
	{
		$tMyOpenedAttributeTags =& $this->openedAttributeTags();
		$tMyOpenedAttributeTags[$nodeId] = $name;
		$this->openedAttributeTagsSet($tMyOpenedAttributeTags);
	}
	
	/*Remove an element from the collection of $openedTags, that matches the passed $nodeId.
	  Returns the end() sentinel when the closed node is the stop node, nothing otherwise.
	  NOTE(review): tags opened after $nodeId stay opened - confirm whether that is intended.*/	
	function closeTag($nodeId)
	{
		$myOpenedTags =& $this->openedTags();
		unset($myOpenedTags[$nodeId]);
		$myOpenedAttributeTags =& $this->openedAttributeTags();
		unset($myOpenedAttributeTags[$nodeId]);
		$this->openedTagsSet($myOpenedTags);
		$this->openedAttributeTagsSet($myOpenedAttributeTags);
		/* The stop id must be known: null == 0 is true and would end the parsing too early */
		if(
			(!is_null($nodeId))
			&& is_array($this->stopParseAt())
			&& (!is_null($this->stopParseAt['id']))
			&& $this->stopParseAt['id'] == $nodeId)
		{
			return $this->end();		
		}
	}
	
	/*Stores the tag start/end positions on the element with id $pId, derived from
	  the cursor position and the length of $pTagName*/
	function updateTagForIdAndName($pId, $pTagName)
	{
		$tElement =& $this->document->idToElement($pId);
		$tElement->tagEndPositionSet($this->currentCharacterPosition());
		$tElement->tagStartPositionSet($this->currentCharacterPosition() - strlen($pTagName));
		$this->document->nodes->elements[$pId] = $tElement;
	}
	
	/*Parses from the cursor to the end; returns the end() sentinel when anything was parsed*/
	function parseDocument( )
	{
		if($this->currentCharacterPosition() < $this->lastCharacterPosition())
		{
			if($this->addAllTags() == $this->end())
				return $this->end();
		}
	}

	/*Main loop: find the next tag, open or close it, then collect the text that follows it.
	  Always returns the end() sentinel.*/
	function addAllTags()
	{
		while($this->currentCharacterPosition() < $this->lastCharacterPosition())
		{
			if($this->advanceToStartTag() == $this->end())
			{
				return $this->end();
			}
			if($this->previewNextCharacter() != "/")
			{
				//This is a tag that we should open
				if($this->identifyAndAdd() == $this->end())
				{
					return $this->end();
				}
			}
			else 
			{
				//This is a tag that we should close
				if($this->identifyAndClose() == $this->end())
				{
					return $this->end();
				}
			}
			$tReturnString = $this->getStringNode("");
			if($tReturnString == $this->end())
			{
				return $this->end();
			}
			elseif($tReturnString != $this->noStringNode())
				$this->addString(trim($tReturnString));
		}
		return $this->end();
	}
	
	/**
	* @return void
	* @param string $tagName
	* @desc Add a new node to the documents last opened node.
	*/
	function addToDocumentList($tagName)
	{
		//If the document list is empty. This node is the root.
		if($this->document->nodes->isEmpty() && (!$this->isWhiteSpaceOnlyString($tagName)))
		{
			$tagID = $this->document->addRoot($tagName);
			$this->updateTagForIdAndName($tagID, $tagName);
			$this->openTag($tagID, $tagName);
		}
		else
		{
			$lastElement = $this->lastOpenedTag();
			$tagID = $this->document->addTagTo($tagName, $lastElement[0]);
			$this->updateTagForIdAndName($tagID, $tagName);
			//Only tags that can have children are opened, every tag can have attributes
			if ($this->shouldBeOpened($tagName))
				$this->openTag($tagID, $tagName);
			$this->openAttributeTag($tagID, $tagName);
		}
	}

	/**
	* @return void
	* @param string $pString
	* @desc Add $pString as a text node to the last opened node. Empty and
	*       white space only strings are ignored.
	*/
	function addString($pString)
	{
		if($pString == $this->noStringNode())
			return;
		/* ctype_space('') is false, so the empty string has to be rejected explicitly */
		if(trim($pString) === "" || $this->isWhiteSpaceOnlyString($pString))
			return;
		$lastElement = $this->lastOpenedTag();
		$tStringNodeId = $this->document->addTextTo($pString, $lastElement[0]);
		$this->updateTagForIdAndName($tStringNodeId, $pString);
	}
	
	/*Returns true if $pString consists of white space only (false for the empty string)*/
	function isWhiteSpaceOnlyString($pString)
	{
		return ctype_space($pString);
	}
	
	/*PRIVATE: Returns true if $pCharacter is a single space, tab, line feed or carriage return*/
	function isWhiteSpaceCharacter($pCharacter)
	{
		return $pCharacter === " " || $pCharacter === "\t" || $pCharacter === "\n" || $pCharacter === "\r";
	}
	
	/*Returns false for tags that are never opened (they get no children), true otherwise.
	  The list holds the HTML void elements plus "p".*/
	function shouldBeOpened($pTagName)
	{
		return (!in_array(strtolower($pTagName), array(
			"area",
			"base",
			"br", 
			"col",
			"embed",
			"img", 
			"meta", 
			"link",
			"hr",
			"p",
			"input",
			"param",
			"source",
			"track",
			"wbr")));
	}
		
	/**
 	* @return string
 	* @desc Identfy the tag and add it to the document. Returns the tag name,
	*       or the end()/notATag() sentinel passed on from identifyTag().
	*/
	function identifyAndAdd()
	{
		$tagName = $this->identifyTag();
		if($tagName == $this->end())
			return $this->end();
		if($tagName == $this->notATag())
			return $this->notATag();
		$this->addToDocumentList($tagName);
		return $tagName;
	}
	
	/**
 	* @return string
 	* @desc Identfy the closing tag under the cursor and close the most recently
	*       opened tag with that name. Returns the tag name, or the
	*       end()/notATag() sentinel.
	*/
	function identifyAndClose()
	{
		//Step onto the "/" so that identifyTag() starts reading at the name
		if($this->getNextCharacter() == $this->end())
			return $this->end();
		$tagName = $this->identifyTag();

		if($tagName == $this->end())
			return $this->end();
		if($tagName == $this->notATag())
			return $this->notATag();
		if(!in_array(strtolower($tagName), array('script', 'style', '?xml', '!doctype')))
		{
			$tagId = $this->getProbableIdForName($tagName);
			if($this->closeTag($tagId) == $this->end())
			{
				return $this->end();	
			}
			return $tagName;
		}
	}

	/*Returns the id of the most recently opened tag named $pTagName (case insensitive),
	  nothing when no such tag is opened*/
	function getProbableIdForName($pTagName)
	{
		foreach(array_reverse($this->openedTags(), true) as $iNodeId => $iTagName)
		{
			if(strtolower($pTagName) == strtolower($iTagName))
			{
				return $iNodeId;
			}
		}
	}
	
	/**
 	* @return string
 	* @desc Reads the tag name that starts after the cursor. The name ends at ">"
	*       or at white space; the "/" of a self closing tag ("<br/>") is not part
	*       of the name. script, style, ?xml and !doctype are skipped entirely and
	*       yield the notATag() sentinel.
	*/
	function identifyTag()
	{	
		$tagName = "";
		$cc = $this->getNextCharacter();
		if($cc == $this->end())
			return $this->end();
		while($cc != ">" && (!$this->isWhiteSpaceCharacter($cc)))
		{
			$tagName = "$tagName$cc";
			$cc = $this->getNextCharacter();
			if($cc == $this->end())
				return $this->end();
		}
		//"<br/>" would otherwise be the never closed tag "br/"
		if(strlen($tagName) > 1 && substr($tagName, -1) == "/")
			$tagName = substr($tagName, 0, -1);
		$tLowerName = strtolower($tagName);
		if($tLowerName == 'script')
			return $this->escapeScript();
		elseif($tLowerName == 'style')
			return $this->escapeStyle();
		elseif($tLowerName == '?xml')
			return $this->escapeXmlDescriptor();
		elseif($tLowerName == '!doctype')
			return $this->escapeDoctype();			
		return $tagName;
	}
	
	/**
 	* @return string
 	* @desc Moves the cursor to the "<" of the next tag, skipping comments and
	*       "<" characters that are followed by white space or a digit. Returns
	*       "<", or the end() sentinel when the contents are exhausted.
	*/
	function advanceToStartTag( )
	{
		$cc = "";
		if($this->currentCharacterPosition() == 0 || $this->getCurrentCharacter() == "<")
			$cc = $this->getCurrentCharacter();
		while($cc != "<")
		{
			$cc = $this->getNextCharacter();
			if($cc == $this->end())
				return $this->end();
		}

		$tNext = $this->previewNextCharacter();
		if($this->isWhiteSpaceCharacter($tNext) || is_numeric($tNext))
		{//This is probably some bad HTML or some loosley written JavaScript, 
		 //try to ignore it.
		 //Step past this "<" first, the search would otherwise find it again and never end.
			if($this->getNextCharacter() == $this->end())
				return $this->end();
			$tReturn = $this->advanceToStartTag();
			if($tReturn == $this->end())
				return $tReturn;
		}
		if($this->previewNextCharacter() == "!")
		{//This might be a comment, if it is; try to ignore all of its contents
			if($this->previewFromCurrent(2) == "-")
			{
				$this->escapeComment();
				$tReturn = $this->advanceToStartTag();
				if($tReturn == $this->end())
					return $tReturn;
			}
		}
		return $this->getCurrentCharacter();
	}
	
	/**
 	* @return string
	* @param string $pString Text to prepend to the collected text
 	* @desc Collects the text between the cursor and the next "<". Everything up
	*       to a ">" is treated as the attribute part of the current tag and handed
	*       to addAttribute(). Returns the text, the noStringNode() sentinel when a
	*       tag follows immediately, or the end() sentinel.
	*/
	function getStringNode($pString)
	{
		$tString = $pString;
		if($this->getCurrentCharacter() == ">")
		{
			if($this->end() == $this->getNextCharacter())
			{
				return $this->end();
			}
		}
		if($this->getCurrentCharacter() == "<")
			return $this->noStringNode();
		for ($nn = $this->getCurrentCharacter();$nn != "<" ; $nn = $this->getNextCharacter()) 
		{
			if($nn == $this->end())
			{
				return $this->end();
			}
			elseif($nn == ">")
			{
				//What was collected so far was the rest of the tag, not text
				$this->addAttribute($tString);
				return $this->getStringNode("");
			}
			else
				$tString = "$tString$nn";
		}
		return $tString;
	}
	
	/**
 	* @return void
	* @param string $pString Attribute part of a tag, e.g. ' width="120" alt="a b"'
 	* @desc Splits $pString into name/value pairs and adds them to the last opened
	*       attribute tag. A quoted value ends at its closing quote and may contain
	*       white space, an unquoted value ends at white space. Nothing is added
	*       when the number of names and values differ.
	*/
	function addAttribute($pString)
	{
		//width="120" or href="app.inc.php?id=12" or width=120 or href=app.inc.php?id=12
		$tInNameSearch = true;
		$tValue = "";
		$tQuote = ""; //The quote character that opened the current value, "" if none
		$tNameArray = array();
		$tValueArray = array();
		$tNameValueArray = array();
		$tLast = strlen($pString)-1;
		for($c=0;$c<=$tLast;$c++)
		{
			$tChar = $pString[$c];
			if($tInNameSearch)
			{ 
				if($tChar != "=")
				{
					$tValue = "$tValue".$tChar;
				}
				else 
				{
					$tNameArray[] = trim($tValue);
					$tInNameSearch = false;
					$tValue = "";
					$tQuote = "";
				}
			}
			else 
			{
				$tComplete = false;
				if($tQuote !== "")
				{
					//Inside a quoted value: only the matching quote ends it
					$tValue = "$tValue".$tChar;
					$tComplete = ($tChar == $tQuote);
				}
				elseif($this->isWhiteSpaceCharacter($tChar))
				{
					$tComplete = true;
				}
				else 
				{
					if($tValue === "" && ($tChar == "\"" || $tChar == "'"))
						$tQuote = $tChar;
					$tValue = "$tValue".$tChar;
				}
				if($tComplete || $c == $tLast)
				{
					$tValueArray[] = trim($this->unquoted($tValue));
					$tInNameSearch = true;
					$tValue = "";
					$tQuote = "";
				}
			}
		}
		if(sizeof($tValueArray) == sizeof($tNameArray))
		{
			foreach($tValueArray as $iIndex => $iValue)
				$tNameValueArray[$tNameArray[$iIndex]] = $iValue;
			$lastElement = $this->lastOpenedAttributeTag();
			$this->document->addAttributesTo($tNameValueArray, $lastElement[0]);
		}
	}
	
	/*Returns $pString without one leading and one trailing quote character (" or '), if present*/
	function unquoted($pString)
	{
		$tValue = (string) $pString;
		if($tValue !== "" && ($tValue[0] == "\"" || $tValue[0] == "'"))
			$tValue = (string) substr($tValue, 1); //substr() may return false before PHP 8
		$tLength = strlen($tValue);
		if($tLength > 0 && ($tValue[$tLength-1] == "\"" || $tValue[$tLength-1] == "'"))
			$tValue = (string) substr($tValue, 0, -1);
		return $tValue;
	}
	
	/**
 	* @return string
	* @param string $pTerminator Lower case text that ends the skipped section
 	* @desc PRIVATE: Moves the cursor forward until the characters read end with
	*       $pTerminator (compared case insensitively). Returns the notATag()
	*       sentinel, or the end() sentinel when the contents run out first.
	*/
	function escapeUntil($pTerminator)
	{
		$tLength = strlen($pTerminator);
		$tTail = "";
		while($tTail != $pTerminator)
		{
			$cc = $this->getNextCharacter();
			if($cc == $this->end())
				return $this->end();
			//Only the last characters can complete the terminator, keep nothing else
			$tTail = substr($tTail.strtolower($cc), -$tLength);
		}
		return $this->notATag();
	}
	
	/*Moves the cursor onto the ">" of the first "->" after the cursor.
	  Returns the end() sentinel when the contents run out first, nothing otherwise.*/
	function escapeComment()
	{
		$nn = $this->getCurrentCharacter();
		while(true)
		{
			while($nn != "-")
			{
				if($nn == $this->end())
					return $this->end();
				$nn = $this->getNextCharacter();
			}
			$cc = $this->getNextCharacter();
			if($cc == ">")
				return;
			if($cc == $this->end())
				return $this->end();
			//Not the end of the comment yet, continue from the character just read
			$nn = $cc;
		}
	}
	
	function escapeScript()
	{
		/*NOTE: The script-tag escape in this function will be handled
		differently in the future. This is a quick-fix in the last minute*/
		return $this->escapeUntil("</script>");
	}
	
	function escapeStyle()
	{
		/*NOTE: The style-tag escape in this function will be handled
		differently in the future. This is a quick-fix in the last minute*/
		return $this->escapeUntil("</style>");
	}
	
	function escapeXmlDescriptor()
	{
		/*NOTE: The xml-tag escape in this function will be handled
		differently in the future. This is a quick-fix in the last minute*/
		return $this->escapeUntil("?>");
	}
	
	function escapeDoctype()
	{
		/*NOTE: The doctype-tag escape in this function will be handled
		differently in the future. This is a quick-fix in the last minute*/
		for ($nn = $this->getCurrentCharacter();$nn != ">"; $nn = $this->getNextCharacter()) 
		{
			if($nn == $this->end())
				return $this->end();
		}
		return $this->notATag();
	}
}

?>
Return current item: BowML