<?php
/**
* Class : BowHTMLParser
* Developed by Jonas Eriksson
* @copyright 2004 BICOM KB (JULI)
**/
class BowHTMLParser
{
/*Instance variables*/
/* Text being parsed; contentsSet() stores it after trim(). */
var $contents;
/* Document object the parsed nodes are added to; the constructor assigns a new BowHTMLDocument. */
var $document;
/* Array of currently opened tags: node id => tag name (filled by openTag(), emptied by closeTag()). */
var $openedTags;
/* Array of tags that can still receive attributes: node id => tag name (filled by openAttributeTag()). */
var $openedAttributeTags;
/* Index into $contents of the character the parser is currently at. */
var $currentCharacterPosition;
/* Index of the last character to read; the constructor sets it to strlen($contents) - 1. */
var $lastCharacterPosition;
/* Either null, or array('name' => lower-cased tag name, 'id' => node id or null); see openTag()/closeTag(). */
var $stopParseAt;
/**
 * Accessors for $contents.
 * contents() returns the stored text; contentsSet() stores the passed
 * text with leading and trailing whitespace removed by trim().
 */
function contents()
{
    return $this->contents;
}
function contentsSet( $someContents )
{
    $tTrimmed = trim($someContents);
    $this->contents = $tTrimmed;
}
/**
 * Accessors for $currentCharacterPosition, the index into $contents
 * that the character-reading methods work from.
 */
function currentCharacterPosition()
{
    return $this->currentCharacterPosition;
}
function currentCharacterPositionSet( $aCharacter )
{
    /* Despite its name, $aCharacter is the position to store */
    $this->currentCharacterPosition = $aCharacter;
}
/**
 * Accessors for $lastCharacterPosition, the highest index the
 * character-reading methods will advance to.
 */
function lastCharacterPosition()
{
    return $this->lastCharacterPosition;
}
function lastCharacterPositionSet( $aCharacter )
{
    /* Despite its name, $aCharacter is the position to store */
    $this->lastCharacterPosition = $aCharacter;
}
/**
 * Accessors for $document.
 * document() returns the property by reference; documentSet() binds the
 * property to the passed value and returns that value.
 */
function &document()
{
    return $this->document;
}
function documentSet( $aDocument )
{
    $this->document =& $aDocument;
    return $this->document;
}
/**
 * Accessors for $openedTags.
 * openedTags() returns the property by reference; openedTagsSet() binds
 * the property to the passed array and returns that array.
 */
function &openedTags()
{
    return $this->openedTags;
}
function openedTagsSet( $someOpenedTags )
{
    $this->openedTags =& $someOpenedTags;
    return $this->openedTags;
}
/**
 * Accessors for $openedAttributeTags.
 * openedAttributeTags() returns the property by reference;
 * openedAttributeTagsSet() binds the property to the passed array and
 * returns that array.
 */
function &openedAttributeTags()
{
    return $this->openedAttributeTags;
}
function openedAttributeTagsSet( $someOpenedAttributeTags )
{
    $this->openedAttributeTags =& $someOpenedAttributeTags;
    return $this->openedAttributeTags;
}
/**
 * Accessors for $stopParseAt.
 * stopParseAtSet() stores the passed value and returns it.
 */
function stopParseAt()
{
    return $this->stopParseAt;
}
function stopParseAtSet( $p )
{
    $this->stopParseAt = $p;
    return $this->stopParseAt;
}
/**
 * @return void
 * @param string $aHTMLFile Path (or URL) handed to file()
 * @param int $pStartAtPosition Character position to start parsing at, 0 when null
 * @param string $pStopAtTagEnd Tag name; when given, parsing ends once the first tag with this name is closed
 * @desc Constructor. Reads the file, stores its trimmed contents, prepares the
 * parser state, creates a new BowHTMLDocument and parses straight away.
 * Declared before BowHTMLParser() so that PHP 5 and later use it as the
 * constructor; PHP 8 no longer treats a class-named method as a constructor.
 */
function __construct($aHTMLFile, $pStartAtPosition = null, $pStopAtTagEnd = null)
{
    /* file() + implode() rather than file_get_contents(), which is missing in PHP versions prior to 4.3 */
    $tLines = file($aHTMLFile);
    if(!is_array($tLines))
        $tLines = array(); /* unreadable file: parse an empty document instead of handing false to implode() */
    $this->contentsSet(implode('', $tLines));

    if(is_null($pStartAtPosition))
        $this->currentCharacterPositionSet(0);
    else
        $this->currentCharacterPositionSet($pStartAtPosition);
    $this->lastCharacterPositionSet(strlen($this->contents()) - 1);

    /* The node id of the stop tag is unknown until openTag() sees the tag */
    if(!is_null($pStopAtTagEnd))
        $this->stopParseAtSet(array('name'=>strtolower($pStopAtTagEnd), 'id'=>null));
    else
        $this->stopParseAtSet(null);

    $this->documentSet(new BowHTMLDocument());
    $this->openedTagsSet(array());
    $this->openedAttributeTagsSet(array());

    set_time_limit(2460);
    $this->parseDocument();
    /* Drop the attribute bookkeeping once parsing is done */
    $this->openedAttributeTagsSet(array());
}
/**
 * @return void
 * @desc PHP 4 style constructor, kept so that PHP 4 and code calling
 * BowHTMLParser() explicitly keep working. Takes the same arguments as
 * __construct() and forwards them.
 */
function BowHTMLParser($aHTMLFile, $pStartAtPosition = null, $pStopAtTagEnd = null)
{
    $this->__construct($aHTMLFile, $pStartAtPosition, $pStopAtTagEnd);
}
/**
 * @return string
 * @desc Marker value the parsing methods return when there is nothing more to read
 */
function end()
{
    $tMarker = 'bowMessageEndOfParsing';
    return $tMarker;
}
/**
 * @return string
 * @desc Marker value getStringNode() returns when the current character is already a "<"
 */
function noStringNode()
{
    $tMarker = 'bowMessageNoStringNode';
    return $tMarker;
}
/**
 * @return string
 * @desc Marker value returned by the escape*() methods after they have skipped their content
 */
function notATag()
{
    $tMarker = 'bowMessageNotATag';
    return $tMarker;
}
/**
 * @return void
 * @desc PRIVATE: Moves currentCharacterPosition one step forward (no bounds check)
 */
function incrementCharacter()
{
    $tNextPosition = $this->currentCharacterPosition + 1;
    $this->currentCharacterPositionSet($tNextPosition);
}
/**
 * @return string
 * @desc PRIVATE: Returns the character of $contents at currentCharacterPosition
 */
function getCurrentCharacter()
{
    $tPosition = $this->currentCharacterPosition();
    return $this->contents[$tPosition];
}
/**
 * @return string
 * @desc PRIVATE: Advances currentCharacterPosition by one and returns the
 * character there. Once the position has reached lastCharacterPosition the
 * position is left alone and end() is returned instead.
 */
function getNextCharacter()
{
    if(!($this->currentCharacterPosition() < $this->lastCharacterPosition()))
        return $this->end();

    $this->incrementCharacter();
    return $this->contents[$this->currentCharacterPosition()];
}
/**
 * @return float
 * @desc Current time as a float, built by adding the two space separated
 * parts of the microtime() string
 */
function getmicrotime()
{
    $tParts = explode(" ", microtime());
    $tFirst = (float)$tParts[0];
    $tSecond = (float)$tParts[1];
    return ($tFirst + $tSecond);
}
/**
 * @return string|null
 * @desc PRIVATE: Returns the character after the current one without moving
 * currentCharacterPosition; null when the current position is not below
 * lastCharacterPosition
 */
function previewNextCharacter()
{
    $tPosition = $this->currentCharacterPosition();
    if($tPosition < $this->lastCharacterPosition())
        return $this->contents[$tPosition + 1];
    return null;
}
/**
 * @return string|null
 * @param int $pInteger Offset from the current position
 * @desc PRIVATE: Returns the character $pInteger positions away from the
 * current one without moving currentCharacterPosition. Returns null when the
 * current position is not below lastCharacterPosition, or when the requested
 * position lies outside 0..lastCharacterPosition.
 */
function previewFromCurrent($pInteger)
{
    $tCurrent = $this->currentCharacterPosition();
    $tLast = $this->lastCharacterPosition();
    if(!($tCurrent < $tLast))
        return null;

    /* Check the position that is actually read, not only the current one:
       e.g. an input ending in "<!" asks for the character after the last one */
    $tTarget = $tCurrent + $pInteger;
    if($tTarget < 0 || $tTarget > $tLast)
        return null;

    return $this->contents[$tTarget];
}
/**
 * @return array
 * @desc Returns the most recently opened tag. With no opened tags the result
 * is array(0=>"Root"). Otherwise the result has the shape each() used to
 * produce: index 0 and 'key' hold the node id, index 1 and 'value' hold the
 * tag name. Callers in this class read index 0 only.
 */
function lastOpenedTag()
{
    $openedTags = $this->openedTags();
    if(empty($openedTags))
        return array(0=>"Root");

    /* each() is deprecated since PHP 7.2 and removed in PHP 8; end() + key()
       read the same last pair */
    $tName = end($openedTags);
    $tNodeId = key($openedTags);
    return array(1 => $tName, 'value' => $tName, 0 => $tNodeId, 'key' => $tNodeId);
}
/**
 * @return array
 * @desc Returns the most recent entry of $openedAttributeTags. With no
 * entries the result is array(0=>"Root"). Otherwise the result has the shape
 * each() used to produce: index 0 and 'key' hold the node id, index 1 and
 * 'value' hold the tag name. Callers in this class read index 0 only.
 */
function lastOpenedAttributeTag()
{
    $openedAttributeTags = $this->openedAttributeTags();
    if(empty($openedAttributeTags))
        return array(0=>"Root");

    /* each() is deprecated since PHP 7.2 and removed in PHP 8; end() + key()
       read the same last pair */
    $tName = end($openedAttributeTags);
    $tNodeId = key($openedAttributeTags);
    return array(1 => $tName, 'value' => $tName, 0 => $tNodeId, 'key' => $tNodeId);
}
/**
 * @return bool
 * @param mixed $nodeId Node id as passed to openTag()
 * @desc Returns true if the passed node id is one of the currently opened
 * tags, false otherwise. $openedTags is keyed by node id with the tag name as
 * value, so the key is tested; searching the values with in_array() compared
 * the id against tag names.
 */
function isOpened($nodeId)
{
    if(is_null($nodeId))
        return false;

    $tOpenedTags = $this->openedTags();
    return is_array($tOpenedTags) && isset($tOpenedTags[$nodeId]);
}
/**
 * @return void
 * @param mixed $nodeId Id of the node being opened
 * @param string $name Tag name of the node
 * @desc Registers $nodeId => $name in $openedTags. When a stop tag is
 * configured, its name matches and no id has been stored for it yet, $nodeId
 * is stored as the id at which parsing is to stop.
 */
function openTag($nodeId, $name)
{
    $tIsUnboundStopTag = is_array($this->stopParseAt())
        && $this->stopParseAt['name'] == strtolower($name)
        && is_null($this->stopParseAt['id']);
    if($tIsUnboundStopTag)
    {
        $this->stopParseAt['id'] = $nodeId;
    }

    $tTags =& $this->openedTags();
    $tTags[$nodeId] = $name;
    $this->openedTagsSet($tTags);
}
/**
 * @return void
 * @param mixed $nodeId Id of the node
 * @param string $name Tag name of the node
 * @desc Registers $nodeId => $name in $openedAttributeTags, the collection
 * addAttribute() takes its target node from
 */
function openAttributeTag($nodeId, $name)
{
    $tTags =& $this->openedAttributeTags();
    $tTags[$nodeId] = $name;
    $this->openedAttributeTagsSet($tTags);
}
/**
 * @return string|null
 * @param mixed $nodeId Id of the node to close; null when no opened tag matched
 * @desc Removes $nodeId from $openedTags and $openedAttributeTags. Returns
 * end() when the closed node is the one recorded in $stopParseAt, null
 * otherwise.
 */
function closeTag($nodeId)
{
    $myOpenedTags =& $this->openedTags();
    $myOpenedAttributeTags =& $this->openedAttributeTags();
    if(!is_null($nodeId))
    {
        /* A null id means "nothing to close"; do not use it as an array offset */
        unset($myOpenedTags[$nodeId]);
        unset($myOpenedAttributeTags[$nodeId]);
    }
    $this->openedTagsSet($myOpenedTags);
    $this->openedAttributeTagsSet($myOpenedAttributeTags);

    $tStop = $this->stopParseAt();
    if(is_null($nodeId) || !is_array($tStop) || is_null($tStop['id']))
        return null;

    /* The stop id stays null until the stop tag has been opened. Comparing
       that null loosely made closing a node with id 0 end the parsing.
       Ids are array keys (int or string), so compare their string forms. */
    if((string)$tStop['id'] === (string)$nodeId)
    {
        return $this->end();
    }
    return null;
}
/**
 * @return void
 * @param mixed $pId Id of the element in the document
 * @param string $pTagName Text whose length is used to derive the start position
 * @desc Stores the current character position as the element's tag end
 * position, and that position minus strlen($pTagName) as its tag start
 * position, then writes the element back into the document's node list.
 */
function updateTagForIdAndName($pId, $pTagName)
{
    $tNode =& $this->document->idToElement($pId);

    $tEndPosition = $this->currentCharacterPosition();
    $tNode->tagEndPositionSet($tEndPosition);

    $tStartPosition = $this->currentCharacterPosition() - strlen($pTagName);
    $tNode->tagStartPositionSet($tStartPosition);

    $this->document->nodes->elements[$pId] = $tNode;
}
/**
 * @return string|null
 * @desc Runs addAllTags() when the current position is below
 * lastCharacterPosition. Returns end() when addAllTags() reported the end of
 * parsing, null otherwise.
 */
function parseDocument( )
{
    if(!($this->currentCharacterPosition() < $this->lastCharacterPosition()))
        return null;

    if($this->addAllTags() == $this->end())
        return $this->end();
    return null;
}
/**
 * @return string
 * @desc Main loop. While the current position is below lastCharacterPosition:
 * move to the next "<", open or close the tag found there, then read the text
 * that follows and add it as a string node. Always returns end().
 */
function addAllTags()
{
    while($this->currentCharacterPosition() < $this->lastCharacterPosition())
    {
        if($this->advanceToStartTag() == $this->end())
            return $this->end();

        /* "</" starts a closing tag, anything else is treated as an opening tag */
        $tIsClosingTag = ($this->previewNextCharacter() == "/");
        $tOutcome = $tIsClosingTag ? $this->identifyAndClose() : $this->identifyAndAdd();
        if($tOutcome == $this->end())
            return $this->end();

        $tText = $this->getStringNode("");
        if($tText == $this->end())
            return $this->end();
        if($tText != $this->noStringNode())
            $this->addString(trim($tText));
    }
    return $this->end();
}
/**
 * @return void
 * @param string $tagName
 * @desc Adds a new node for $tagName. When the document has no nodes yet and
 * the name is not whitespace only, the node becomes the root and is opened.
 * Otherwise it is added to the last opened tag, opened only if
 * shouldBeOpened() says so, and registered as a target for attributes.
 */
function addToDocumentList($tagName)
{
    $tBecomesRoot = $this->document->nodes->isEmpty() && (!$this->isWhiteSpaceOnlyString($tagName));
    if($tBecomesRoot)
    {
        $tagID = $this->document->addRoot($tagName);
        $this->updateTagForIdAndName($tagID, $tagName);
        $this->openTag($tagID, $tagName);
        return;
    }

    $tParent = $this->lastOpenedTag();
    $tagID = $this->document->addTagTo($tagName, $tParent[0]);
    $this->updateTagForIdAndName($tagID, $tagName);
    if($this->shouldBeOpened($tagName))
        $this->openTag($tagID, $tagName);
    /* Tags that are not opened can still carry attributes */
    $this->openAttributeTag($tagID, $tagName);
}
/**
 * @return void
 * @param string $pString Text to add
 * @desc Adds $pString as a text node to the last opened tag and records its
 * positions. Nothing is added when $pString is the noStringNode() marker or
 * consists of whitespace only.
 */
function addString($pString)
{
    if($pString == $this->noStringNode() || $this->isWhiteSpaceOnlyString($pString))
        return;

    $tParent = $this->lastOpenedTag();
    $tTextNodeId = $this->document->addTextTo($pString, $tParent[0]);
    $this->updateTagForIdAndName($tTextNodeId, $pString);
}
/**
 * @return bool
 * @param string $pString
 * @desc Result of ctype_space() for $pString: true when every character is
 * whitespace, false otherwise (including for the empty string)
 */
function isWhiteSpaceOnlyString($pString)
{
    $tOnlyWhiteSpace = ctype_space($pString);
    return $tOnlyWhiteSpace;
}
/**
 * @return bool
 * @param string $pTagName Tag name as read from the input
 * @desc Returns false for tags that are added without being opened (br, img,
 * meta, link, hr, p, input), true for every other name. The comparison is
 * case insensitive and ignores a trailing "/", so the name "br/" read from
 * "<br/>" is recognised as "br".
 */
function shouldBeOpened($pTagName)
{
    /* A tag name is read up to ">" or a blank, so "<br/>" arrives as "br/" */
    $tName = rtrim(strtolower($pTagName), "/");
    $tNeverOpened = array(
        "br",
        "img",
        "meta",
        "link",
        "hr",
        "p",
        "input");
    return (!in_array($tName, $tNeverOpened));
}
/**
 * @return string
 * @desc Reads the tag name at the current position and adds the tag to the
 * document. Returns the tag name, or the end() / notATag() marker when
 * identifyTag() produced one (nothing is added in that case).
 */
function identifyAndAdd()
{
    $tIdentified = $this->identifyTag();

    if($tIdentified == $this->end())
        return $this->end();
    if($tIdentified == $this->notATag())
        return $this->notATag();

    $this->addToDocumentList($tIdentified);
    return $tIdentified;
}
/**
 * @return string
 * @desc Handles a closing tag: steps over the "/", reads the tag name and
 * closes the most recently opened tag with that name. Returns the tag name,
 * or the end() / notATag() marker when parsing ended or nothing was closed.
 */
function identifyAndClose()
{
    /* Step onto the "/" so identifyTag() starts reading after it */
    if($this->getNextCharacter() == $this->end())
        return $this->end();

    $tagName = $this->identifyTag();
    if($tagName == $this->end())
        return $this->end();
    if($tagName == $this->notATag())
        return $this->notATag();

    /* The names must be excluded with &&: "!= a || != b" is true for every name */
    $tLowerName = strtolower($tagName);
    if
    (
        $tLowerName != 'script'
        && $tLowerName != 'style'
        && $tLowerName != '?xml'
        && $tLowerName != '!doctype')
    {
        $tagId = $this->getProbableIdForName($tagName);
        if($this->closeTag($tagId) == $this->end())
        {
            return $this->end();
        }
        return $tagName;
    }
    return $this->notATag();
}
/**
 * @return mixed
 * @param string $pTagName
 * @desc Walks $openedTags from the most recently added entry backwards and
 * returns the node id of the first tag whose name equals $pTagName (case
 * insensitive); null when no opened tag matches.
 */
function getProbableIdForName($pTagName)
{
    $tWanted = strtolower($pTagName);
    $tNewestFirst = array_reverse($this->openedTags(), true);
    foreach($tNewestFirst as $tCandidateId => $tCandidateName)
    {
        if($tWanted == strtolower($tCandidateName))
            return $tCandidateId;
    }
    return null;
}
/**
 * @return string
 * @desc Reads a tag name, starting with the character after the current one
 * and stopping at ">" or at the first blank, tab or line break. For script,
 * style, ?xml and !doctype the matching escape method is run and its result
 * returned. Returns end() when the input ends while reading.
 */
function identifyTag()
{
    $tagName = "";
    $cc = $this->getNextCharacter();
    if($cc == $this->end())
        return $this->end();

    /* Tab and line breaks end a tag name just like a blank does; otherwise
       "<div\nclass=..." gives a name that never matches its closing tag */
    while($cc != ">" && $cc != " " && $cc != "\t" && $cc != "\n" && $cc != "\r")
    {
        $tagName = "$tagName$cc";
        $cc = $this->getNextCharacter();
        if($cc == $this->end())
            return $this->end();
    }

    $tLowerName = strtolower($tagName);
    if($tLowerName == 'script')
        return $this->escapeScript();
    elseif($tLowerName == 'style')
        return $this->escapeStyle();
    elseif($tLowerName == '?xml')
        return $this->escapeXmlDescriptor();
    elseif($tLowerName == '!doctype')
        return $this->escapeDoctype();
    return $tagName;
}
/**
 * @return string
 * @desc Moves the current position to the next "<" that starts a tag and
 * returns that character. A "<" followed by a blank or a digit is skipped,
 * and so is a comment ("<!-"). Returns end() when the input ends first.
 */
function advanceToStartTag( )
{
    while(true)
    {
        $cc = "";
        if($this->currentCharacterPosition() == 0 || $this->getCurrentCharacter() == "<")
            $cc = $this->getCurrentCharacter();
        while($cc != "<")
        {
            $cc = $this->getNextCharacter();
            if($cc == $this->end())
                return $this->end();
        }

        $tFollowing = $this->previewNextCharacter();
        if($tFollowing == " " || is_numeric($tFollowing))
        {//This is probably some bad HTML or some loosely written JavaScript,
         //try to ignore it. Step past this "<" first: retrying from the same
         //position would find the same "<" again and never terminate.
            if($this->getNextCharacter() == $this->end())
                return $this->end();
            continue;
        }

        if($tFollowing == "!" && $this->previewFromCurrent(2) == "-")
        {//This is a comment; skip its contents and look for the next "<".
         //If the comment ran into the end of the input, the next round
         //reports it.
            $this->escapeComment();
            continue;
        }

        return $this->getCurrentCharacter();
    }
}
/**
 * @return string
 * @param string $pString Text to put in front of what is read
 * @desc Collects characters from the current position up to the next "<".
 * Text that turns out to end with ">" is the rest of a tag: it is passed to
 * addAttribute() and reading starts over after it. Returns the collected
 * text, noStringNode() when the current character is already "<", or end()
 * when the input ends first.
 */
function getStringNode($pString)
{
    $tCollected = $pString;

    /* Standing on the ">" that closed a tag: step over it */
    if($this->getCurrentCharacter() == ">" && $this->end() == $this->getNextCharacter())
    {
        return $this->end();
    }
    if($this->getCurrentCharacter() == "<")
        return $this->noStringNode();

    $tChar = $this->getCurrentCharacter();
    while($tChar != "<")
    {
        if($tChar == $this->end())
        {
            return $this->end();
        }
        if($tChar == ">")
        {
            $this->addAttribute($tCollected);
            return $this->getStringNode("");
        }
        $tCollected = "$tCollected$tChar";
        $tChar = $this->getNextCharacter();
    }
    return $tCollected;
}
/**
 * @return void
 * @param string $pString Attribute part of a tag, e.g. width="120" href=app.inc.php?id=12
 * @desc Splits $pString into name/value pairs and adds them to the last tag
 * registered with openAttributeTag(). A name runs up to "=". A value that
 * starts with a quote runs up to the matching quote and may contain blanks;
 * any other value runs up to the next blank, tab or line break. The pairs are
 * only added when as many values as names were found.
 */
function addAttribute($pString)
{
    $tNameArray = array();
    $tValueArray = array();
    $tNameValueArray = array();
    $tInNameSearch = true;
    $tValue = "";
    $tQuote = null; /* quote character the current value was opened with, if any */
    $tLength = strlen($pString);

    for($c = 0; $c < $tLength; $c++)
    {
        $tChar = $pString[$c];
        if($tInNameSearch)
        {
            if($tChar != "=")
                $tValue = $tValue.$tChar;
            else
            {
                $tNameArray[] = trim($tValue);
                $tInNameSearch = false;
                $tValue = "";
                $tQuote = null;
            }
            continue;
        }

        if(!is_null($tQuote))
        {
            /* Inside a quoted value: blanks belong to it, the matching quote ends it */
            $tValue = $tValue.$tChar;
            if($tChar == $tQuote)
            {
                $tValueArray[] = $this->cleanAttributeValue($tValue);
                $tInNameSearch = true;
                $tValue = "";
                $tQuote = null;
            }
            continue;
        }

        if($tValue === "" && ($tChar == "\"" || $tChar == "'"))
        {
            $tQuote = $tChar;
            $tValue = $tChar;
            continue;
        }

        if($tChar == " " || $tChar == "\t" || $tChar == "\n" || $tChar == "\r")
        {
            $tValueArray[] = $this->cleanAttributeValue($tValue);
            $tInNameSearch = true;
            $tValue = "";
        }
        else
            $tValue = $tValue.$tChar;
    }

    /* A value that runs to the end of the string (also an unterminated quoted one) */
    if(!$tInNameSearch && $tValue !== "")
        $tValueArray[] = $this->cleanAttributeValue($tValue);

    if(sizeof($tValueArray) == sizeof($tNameArray))
    {
        foreach($tValueArray as $iIndex => $iValue)
            $tNameValueArray[$tNameArray[$iIndex]] = $iValue;
        $lastElement = $this->lastOpenedAttributeTag();
        $this->document->addAttributesTo($tNameValueArray, $lastElement[0]);
    }
}
/**
 * @return string
 * @param string $pValue Raw attribute value, possibly with surrounding quotes
 * @desc PRIVATE: Helper of addAttribute(). Removes surrounding quotes and
 * trims the value; an empty value or a lone quote gives "" without calling
 * unquoted() on it.
 */
function cleanAttributeValue($pValue)
{
    if(strlen($pValue) > 1)
        return trim($this->unquoted($pValue));
    if($pValue === "\"" || $pValue === "'")
        return "";
    return trim($pValue);
}
/**
 * @return string
 * @param string $pString
 * @desc Returns $pString without one leading and one trailing quote character
 * (" or '), each removed only if present. An empty string is returned
 * unchanged instead of being indexed.
 */
function unquoted($pString)
{
    $tValue = (string)$pString;

    $tLength = strlen($tValue);
    if($tLength > 0 && ($tValue[0] == "\"" || $tValue[0] == "'"))
    {
        /* The cast covers PHP versions where substr() returns false for an empty result */
        $tValue = (string)substr($tValue, 1);
    }

    $tLength = strlen($tValue);
    if($tLength > 0 && ($tValue[$tLength-1] == "\"" || $tValue[$tLength-1] == "'"))
    {
        $tValue = (string)substr($tValue, 0, -1);
    }
    return $tValue;
}
/**
 * @return string|null
 * @desc Skips a comment: moves the current position onto the ">" of the next
 * "-->" found at or after the current position and returns null. When no
 * "-->" is left within lastCharacterPosition, the position is moved to
 * lastCharacterPosition and end() is returned.
 */
function escapeComment()
{
    $tCurrent = $this->currentCharacterPosition();
    $tLast = $this->lastCharacterPosition();

    /* Look for the full "-->" terminator; stopping at the first "->" would
       cut off a comment that contains an arrow */
    $tFound = false;
    if($tCurrent >= 0 && $tCurrent <= $tLast)
        $tFound = strpos($this->contents, "-->", $tCurrent);

    if($tFound === false || $tFound + 2 > $tLast)
    {
        if($tCurrent < $tLast)
            $this->currentCharacterPositionSet($tLast);
        return $this->end();
    }

    $this->currentCharacterPositionSet($tFound + 2);
    return null;
}
/**
 * @return string
 * @desc Skips the contents of a script element: moves the current position
 * onto the ">" of the next "</script>" (matched case insensitively) after the
 * current position and returns notATag(). When none is left within
 * lastCharacterPosition, the position is moved to lastCharacterPosition and
 * end() is returned.
 * NOTE: The script-tag escape in this function will be handled differently in
 * the future. This is a quick-fix in the last minute.
 */
function escapeScript()
{
    $tClosing = '</script>';
    $tStart = $this->currentCharacterPosition() + 1;
    $tLast = $this->lastCharacterPosition();

    /* One search over the remaining input instead of re-matching a growing
       buffer after every single character */
    $tOffset = false;
    if($tStart >= 0 && $tStart <= $tLast)
    {
        $tRemainder = strtolower(substr($this->contents, $tStart, $tLast - $tStart + 1));
        $tOffset = strpos($tRemainder, $tClosing);
    }

    if($tOffset === false)
    {
        if($this->currentCharacterPosition() < $tLast)
            $this->currentCharacterPositionSet($tLast);
        return $this->end();
    }

    $this->currentCharacterPositionSet($tStart + $tOffset + strlen($tClosing) - 1);
    return $this->notATag();
}
/**
 * @return string
 * @desc Skips the contents of a style element: moves the current position
 * onto the ">" of the next "</style>" (matched case insensitively) after the
 * current position and returns notATag(). When none is left within
 * lastCharacterPosition, the position is moved to lastCharacterPosition and
 * end() is returned.
 * NOTE: The style-tag escape in this function will be handled differently in
 * the future. This is a quick-fix in the last minute.
 */
function escapeStyle()
{
    $tClosing = '</style>';
    $tStart = $this->currentCharacterPosition() + 1;
    $tLast = $this->lastCharacterPosition();

    /* One search over the remaining input instead of re-matching a growing
       buffer after every single character */
    $tOffset = false;
    if($tStart >= 0 && $tStart <= $tLast)
    {
        $tRemainder = strtolower(substr($this->contents, $tStart, $tLast - $tStart + 1));
        $tOffset = strpos($tRemainder, $tClosing);
    }

    if($tOffset === false)
    {
        if($this->currentCharacterPosition() < $tLast)
            $this->currentCharacterPositionSet($tLast);
        return $this->end();
    }

    $this->currentCharacterPositionSet($tStart + $tOffset + strlen($tClosing) - 1);
    return $this->notATag();
}
/**
 * @return string
 * @desc Skips an xml descriptor: moves the current position onto the ">" of
 * the next "?>" after the current position and returns notATag(). When none
 * is left within lastCharacterPosition, the position is moved to
 * lastCharacterPosition and end() is returned.
 * NOTE: The xml-tag escape in this function will be handled differently in
 * the future. This is a quick-fix in the last minute.
 */
function escapeXmlDescriptor()
{
    $tStart = $this->currentCharacterPosition() + 1;
    $tLast = $this->lastCharacterPosition();

    /* One search over the input instead of re-matching a growing buffer
       after every single character */
    $tFound = false;
    if($tStart >= 0 && $tStart <= $tLast)
        $tFound = strpos($this->contents, "?>", $tStart);

    if($tFound === false || $tFound + 1 > $tLast)
    {
        if($this->currentCharacterPosition() < $tLast)
            $this->currentCharacterPositionSet($tLast);
        return $this->end();
    }

    $this->currentCharacterPositionSet($tFound + 1);
    return $this->notATag();
}
/**
 * @return string
 * @desc Skips a doctype declaration: reads on from the current character
 * until a ">" is reached and returns notATag(); returns end() when the input
 * ends first.
 * NOTE: The doctype-tag escape in this function will be handled differently
 * in the future. This is a quick-fix in the last minute.
 */
function escapeDoctype()
{
    $tChar = $this->getCurrentCharacter();
    while($tChar != ">")
    {
        if($tChar == $this->end())
            return $this->end();
        $tChar = $this->getNextCharacter();
    }
    return $this->notATag();
}
}
?>