<?php
class FeedParser{
private $xmlParser = null; // Parser handle returned by xml_parser_create() in the constructor
private $insideItem = array(); // Stack of currently open tag names (pushed in startElement, popped in endElement)
private $currentTag = null; // Last entered tag name; reset to the enclosing tag when a tag closes
private $currentAttr = null; // Attributes array of last entered tag
// Maps a namespace URI to the version label stored in $version.
// findVersion() looks for these URIs among the attribute values of a tag.
private $namespaces = array(
'http://purl.org/rss/1.0/' => 'RSS 1.0',
'http://purl.org/rss/1.0/modules/content/' => 'RSS 2.0',
'http://www.w3.org/2005/Atom' => 'ATOM 1',
); // Namespaces used to detect the feed version
private $itemTags = array('ITEM','ENTRY'); // Tag names that hold a single feed item
private $channelTags = array('CHANNEL','FEED'); // Tag names that hold all channel elements
private $dateTags = array('UPDATED','PUBDATE','DC:DATE'); // Tags whose character data is passed through strtotime()
private $hasSubTags = array('IMAGE','AUTHOR'); // Tag names whose children are stored as a nested array
private $channels = array(); // Channel-level values keyed by tag name, filled by characterData()
private $items = array(); // Feed items keyed by item index, filled by characterData()
private $itemIndex = 0; // Index of the item being filled; incremented in endElement() when an item tag closes
private $url = null; // The parsed url
private $version = null; // Detected feed version (one of the labels in $namespaces)
/**
 * Constructor - create the XML parser and register this object's handlers.
 *
 * The handlers are registered as array($this, 'method') callables instead of
 * going through xml_set_object() plus bare method names: that older form is
 * deprecated as of PHP 8.4, while array callables behave the same on every
 * PHP version.
 */
function __construct()
{
    $this->xmlParser = xml_parser_create();

    xml_set_element_handler(
        $this->xmlParser,
        array($this, 'startElement'),
        array($this, 'endElement')
    );
    xml_set_character_data_handler(
        $this->xmlParser,
        array($this, 'characterData')
    );
}
/*-----------------------------------------------------------------------+
| Public functions. Use these to parse a feed and read its data. |
+-----------------------------------------------------------------------*/
/**
 * Return every channel-level element collected so far.
 *
 * @access public
 * @return array - All channel elements as an associative array keyed by tag name
 */
public function getChannels()
{
    $allChannels = $this->channels;

    return $allChannels;
}
/**
 * Return every feed item collected so far.
 *
 * @access public
 * @return array - All feed items; each item is an associative array of its elements
 */
public function getItems()
{
    $allItems = $this->items;

    return $allItems;
}
/**
 * Count the feed items collected so far.
 *
 * @access public
 * @return number - Number of entries in the items array
 */
public function getTotalItems()
{
    $collected = $this->items;

    return count($collected);
}
/**
 * Get a feed item by index
 *
 * The index is checked against the keys that actually exist rather than
 * against the item count: negative indexes must be rejected, and the items
 * array can have gaps (an item without character data advances the item
 * index without creating an entry).
 *
 * @access public
 * @param number index of feed item
 * @return array feed item as associative array of its elements
 * @throws Exception when no item is stored under the given index
 */
public function getItem($index)
{
    if (!isset($this->items[$index])) {
        throw new Exception("Item index is out of range.");
    }

    return $this->items[$index];
}
/**
 * Look up a single channel element.
 *
 * The lookup is case-insensitive: the requested name is upper-cased before
 * it is used as a key.
 *
 * @access public
 * @param string the name of channel tag
 * @return string
 * @throws Exception when the channel has no element with that name
 */
public function getChannel($tagName)
{
    $key = strtoupper($tagName);

    if (!array_key_exists($key, $this->channels)) {
        throw new Exception("Channel tag $tagName not found.");
    }

    return $this->channels[$key];
}
/**
 * Return the URL that was handed to parse().
 *
 * @access public
 * @return string
 * @throws Exception when no URL has been set yet
 */
public function getParsedUrl()
{
    if (!empty($this->url)) {
        return $this->url;
    }

    throw new Exception("Feed URL is not set yet.");
}
/**
 * Return the feed version label detected while parsing.
 *
 * @access public
 * @return string - Detected version label, or null when none was detected
 */
public function getFeedVersion()
{
    $detected = $this->version;

    return $detected;
}
/**
 * Parses a feed url
 *
 * Failures are reported by throwing an Exception, like the other methods of
 * this class, instead of terminating the whole script with die().
 *
 * The document is handed to the XML parser in a single call. Feeding it in
 * fixed 4096-byte slices made the parser deliver text that straddles a slice
 * boundary in two pieces, and characterData() trims (and, for date tags,
 * converts) every piece separately, which corrupted such values.
 *
 * @access public
 * @param string the feed url
 * @return void
 * @throws Exception when the url cannot be loaded, the XML is malformed,
 *                   or the feed version cannot be detected
 */
public function parse($url)
{
    $this->url = $url;
    $content = $this->getUrlContent();

    if (!$content) {
        throw new Exception('Sorry! cannot load the feed url.');
    }

    $errorMessage = null;
    if (!xml_parse($this->xmlParser, $content, true)) {
        // Collect the error details before the parser is released.
        $errorMessage = sprintf(
            "XML error: %s at line %d",
            xml_error_string(xml_get_error_code($this->xmlParser)),
            xml_get_current_line_number($this->xmlParser)
        );
    }

    // Since PHP 8 the parser is an object released automatically and
    // xml_parser_free() does nothing, so only call it where it has an effect.
    if (PHP_VERSION_ID < 80000) {
        xml_parser_free($this->xmlParser);
    }

    if ($errorMessage !== null) {
        throw new Exception($errorMessage);
    }

    if (empty($this->version)) {
        throw new Exception('Sorry! cannot detect the feed version.');
    }
}
// End public functions -------------------------------------------------
/*-----------------------------------------------------------------------+
| Private functions. Be careful when editing them. |
+-----------------------------------------------------------------------*/
/**
 * Load the whole contents of a RSS/ATOM page
 *
 * file_get_contents() is tried first; when it yields nothing the request is
 * repeated with cURL.
 *
 * @access private
 * @return string|false the document body; false when cURL returned no data
 *                      without reporting an error
 * @throws Exception when the url is empty, when the cURL extension is needed
 *                   but not loaded, or when cURL reports an error
 */
private function getUrlContent()
{
    if (empty($this->url)) {
        throw new Exception("URL to parse is empty!.");
    }

    // Warnings are suppressed on purpose: a failure here is not final, the
    // cURL fallback below gets a second chance.
    $content = @file_get_contents($this->url);
    if ($content) {
        return $content;
    }

    // Without this check a missing extension is a fatal "undefined function".
    if (!function_exists('curl_init')) {
        throw new Exception("Cannot load the url: file_get_contents() failed and the cURL extension is not available.");
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $this->url);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    $content = curl_exec($ch);
    $error = curl_error($ch);

    // curl_close() has no effect since PHP 8 (handles are objects there).
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    if (!empty($error)) {
        throw new Exception("Error occurred while loading url by cURL. <br />\n" . $error);
    }

    return $content;
}
/**
 * XML parser callback fired when an opening tag is read.
 *
 * Tries to detect the feed version while none is known yet, then records
 * the tag as the current one and pushes it onto the open-tag stack.
 *
 * @access private
 * @param object the xmlParser object
 * @param string name of currently entering tag
 * @param array array of attributes
 * @return void
 */
private function startElement($parser, $tagName, $attrs)
{
    $versionKnown = (bool) $this->version;
    if (!$versionKnown) {
        $this->findVersion($tagName, $attrs);
    }

    $this->insideItem[] = $tagName;
    $this->currentAttr = $attrs;
    $this->currentTag = $tagName;
}
/**
 * Handle the end event of a tag while parsing
 *
 * Advances the item index when an item tag closes, pops the tag from the
 * open-tag stack and makes the enclosing tag the current one. When the
 * outermost tag closes the stack is empty, so the current tag becomes null
 * instead of being read from a non-existent offset.
 *
 * @access private
 * @param object the xmlParser object
 * @param string name of currently ending tag
 * @return void
 */
private function endElement($parser, $tagName)
{
    if (in_array($tagName, $this->itemTags)) {
        $this->itemIndex++;
    }

    array_pop($this->insideItem);

    $depth = count($this->insideItem);
    $this->currentTag = ($depth > 0) ? $this->insideItem[$depth - 1] : null;
}
/**
 * Handle character data of a tag while parsing
 *
 * Data belonging to a date tag is converted with strtotime() first; note
 * that every call converts only the piece of text it receives. The value is
 * then stored either in the channel array or in the current item, depending
 * on the position in the tag tree.
 *
 * @access private
 * @param object the xmlParser object
 * @param string tag value
 * @return void
 */
private function characterData($parser, $data)
{
    // Converting all date formats to timestamp
    if (in_array($this->currentTag, $this->dateTags)) {
        $data = strtotime($data);
    }

    if ($this->inChannel()) {
        $this->storeCharacterData($this->channels, $data);
    } elseif ($this->inItem()) {
        // Create the item on first use so it can be filled by reference.
        if (!isset($this->items[$this->itemIndex]) || !is_array($this->items[$this->itemIndex])) {
            $this->items[$this->itemIndex] = array();
        }
        $this->storeCharacterData($this->items[$this->itemIndex], $data);
    }
}

/**
 * Store a piece of character data for the current tag in $target.
 *
 * Shared by the channel and the item branch of characterData(). Every slot
 * is initialised before it is appended to, so a value seen for the first
 * time does not read an undefined key, and a slot that currently holds an
 * attribute array is never string-appended or passed to strlen().
 *
 * @access private
 * @param array the channel array or a single item array, modified in place
 * @param mixed character data (already converted for date tags)
 * @return void
 */
private function storeCharacterData(array &$target, $data)
{
    $tag = $this->currentTag;
    $parent = $this->getParentTag();

    // Child of a tag that has sub tags: keep the value nested under the parent.
    if (in_array($parent, $this->hasSubTags)) {
        if (!isset($target[$parent]) || !is_array($target[$parent])) {
            $target[$parent] = array();
        }
        if (!isset($target[$parent][$tag]) || !is_string($target[$parent][$tag])) {
            $target[$parent][$tag] = '';
        }
        $target[$parent][$tag] .= strip_tags($this->html2bb(trim($data)));
        return;
    }

    // Tags that hold sub tags get no text value of their own.
    if (!in_array($tag, $this->hasSubTags)) {
        if (!isset($target[$tag]) || !is_string($target[$tag])) {
            $target[$tag] = '';
        }
        $target[$tag] .= strip_tags($this->html2bb(trim($data)));
    }

    if (!empty($this->currentAttr)) {
        $target[$tag . '_ATTRS'] = $this->currentAttr;

        // A tag counts as having a value only when it holds at least two
        // characters of text.
        $hasValue = isset($target[$tag])
            && is_string($target[$tag])
            && strlen($target[$tag]) >= 2;

        if (!$hasValue) {
            if (count($this->currentAttr) == 1) {
                // Only one attribute: use its value as the tag value
                foreach ($this->currentAttr as $attrVal) {
                    $target[$tag] = $attrVal;
                }
            } else {
                // Several attributes: use the whole attribute array
                $target[$tag] = $this->currentAttr;
            }
        }
    }
}
/**
 * Detect the feed version from a tag's attributes.
 *
 * The known namespace URIs are tried in their declared order; the first one
 * that appears among the attribute values decides the version. Nothing is
 * changed when no URI matches.
 *
 * @access private
 * @param string name of current tag
 * @param array array of attributes
 * @return void
 */
private function findVersion($tagName, $attrs)
{
    foreach ($this->namespaces as $uri => $label) {
        if (array_search($uri, $attrs) !== false) {
            $this->version = $label;
            break;
        }
    }
}
/**
 * Return the name of the tag that encloses the current one, i.e. the
 * second-to-last entry of the open-tag stack.
 *
 * @access private
 * @return string
 */
private function getParentTag()
{
    $parentPosition = count($this->insideItem) - 2;

    return $this->insideItem[$parentPosition];
}
/**
 * Tell whether the parser currently sits on a channel-level element.
 *
 * The answer depends on the detected version: the channel container must be
 * open and must not itself be the current tag; for RSS 2.0 and Atom the
 * position must also be outside any item/entry.
 *
 * @access private
 * @return bool
 */
private function inChannel()
{
    $openTags = $this->insideItem;

    switch ($this->version) {
        case 'RSS 1.0':
            return in_array('CHANNEL', $openTags)
                && $this->currentTag != 'CHANNEL';

        case 'RSS 2.0':
            return in_array('CHANNEL', $openTags)
                && !in_array('ITEM', $openTags)
                && $this->currentTag != 'CHANNEL';

        case 'ATOM 1':
            return in_array('FEED', $openTags)
                && !in_array('ENTRY', $openTags)
                && $this->currentTag != 'FEED';
    }

    return FALSE;
}
/**
 * Tell whether the parser currently sits on an element inside a feed item.
 *
 * The item container (ITEM for both RSS versions, ENTRY for Atom) must be
 * open and must not itself be the current tag.
 *
 * @access private
 * @return bool
 */
private function inItem()
{
    $openTags = $this->insideItem;

    switch ($this->version) {
        case 'RSS 1.0':
        case 'RSS 2.0':
            return in_array('ITEM', $openTags)
                && $this->currentTag != 'ITEM';

        case 'ATOM 1':
            return in_array('ENTRY', $openTags)
                && $this->currentTag != 'ENTRY';
    }

    return FALSE;
}
//This function is taken from lastRSS
/**
 * Replace HTML entities &something; by real characters
 *
 * The entity added by hand below must be spelled out as '&apos;'; a decoded
 * quote character in its place is a syntax error that keeps the whole file
 * from loading.
 *
 * @access private
 * @author Vojtech Semecky <hide@address.com>
 * @link http://lastrss.oslab.net/
 * @param string
 * @return string
 */
private function unhtmlentities($string)
{
    // Get HTML entities table and flip keys<==>values (entity => character)
    $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_QUOTES));
    // Add support for the &apos; entity (missing in HTML_ENTITIES)
    $trans_tbl += array('&apos;' => "'");
    // Replace entities by values
    return strtr($string, $trans_tbl);
}
/**
 * HTML conversion common to both bbcode and text result
 *
 * Reduces the input to the content of its body element (when it has one),
 * lower-cases tag names, flattens line breaks and runs of spaces, and keeps
 * the content of pre elements untouched by all of that.
 *
 * @param string HTML fragment
 * @return string normalised HTML
 */
function html_common($string)
{
    // First extract just the body of the message
    if (stripos($string, "<body") !== false) {
        if (preg_match('#<body.*?>(.*)</body>#is', $string, $matches)) {
            $string = $matches[1];
        } elseif (preg_match('#<body.*?>(.*)#is', $string, $matches)) {
            $string = $matches[1];
        }
    }

    // Convert all HTML tag names to lower case. A callback is used because
    // the /e modifier was removed in PHP 7, where preg_replace() returns
    // NULL for such a pattern and the whole string is lost.
    $string = preg_replace_callback(
        '#(</?)(\w+)([^>]*>)#',
        function ($m) {
            return $m[1] . strtolower($m[2]) . $m[3];
        },
        $string
    );

    // Save the pre-formatted text and leave a numbered placeholder behind
    $preformated_strings = array();
    $string = preg_replace_callback(
        '#<pre(| .*?)>(.*?)</pre>#s',
        function ($m) use (&$preformated_strings) {
            $preformated_strings[] = $m[2];
            return '***pre_string***' . (count($preformated_strings) - 1);
        },
        $string
    );

    //
    // Replace formatting elements
    //
    // Replace all CR LF with a single space, then CR or LF alone
    $string = str_replace(array("\r\n", "\r", "\n"), ' ', $string);
    // Replace line break with a CR LF
    $string = str_replace(array('<br>', '<br />'), "\r\n", $string);
    // Replace non-breaking space entities with plain spaces
    $string = str_replace('&nbsp;', ' ', $string);
    // Replace multiple spaces
    $string = preg_replace('/ {2,}/', ' ', $string);

    // Restore the preformatted text. Highest number first: placeholder 1 is
    // a prefix of placeholder 10, so restoring in ascending order would
    // damage the two-digit placeholders.
    for ($i = count($preformated_strings) - 1; $i >= 0; $i--) {
        $string = str_replace(
            "***pre_string***$i",
            '<pre>' . $preformated_strings[$i] . '</pre>',
            $string
        );
    }

    return $string;
}
/**
 * Converts an HTML email into bbcode
 * This function is loosely based on cbparser.php by corz.org
 *
 * Links and images are rewritten with preg_replace_callback(): the /e
 * modifier the patterns used before was removed in PHP 7.
 *
 * @param string HTML fragment
 * @return string bbcode text with all remaining HTML tags removed
 */
function html2bb($string)
{
    // Do common conversion stuff
    $string = $this->html_common($string);

    // font tags carry no meaning here: keep only their content
    $string = preg_replace('#<font color="(.*?)">(.*?)</font>#i', " $2 ", $string);
    $string = preg_replace('#<font size="(.*?)">(.*?)</font>#i', " $2 ", $string);
    $string = preg_replace('#<font face="(.*?)">(.*?)</font>#i', " $2 ", $string);
    // The alignment value becomes the bbcode tag name
    $string = preg_replace('#<p align="(.*?)">(.*?)</p>#i', "[$1] $2 [/$1]", $string);
    $string = preg_replace('#<div align="(.*?)">(.*?)</div>#i', "[$1] $2 [/$1]", $string);

    // Do simple string replacements (closing tags)
    $closingTags = array(
        '</b>' => '[/b]',
        '</i>' => '[/i]',
        '</u>' => '[/u]',
        '</ul>' => '[/list]',
        '</ol>' => '[/list]',
        '</em>' => '[/i]',
        '</strong>' => '[/b]',
        '</blockquote>' => '[/quote]',
        '</pre>' => '[/quote]',
    );
    $string = str_replace(array_keys($closingTags), array_values($closingTags), $string);

    // Do simple reg expr replacements (opening tags, with or without attributes)
    $string = preg_replace('#<b(| .*?)>#', '[b]', $string);
    $string = preg_replace('#<i(| .*?)>#', '[i]', $string);
    $string = preg_replace('#<u(| .*?)>#', '[u]', $string);
    $string = preg_replace('#<ul(| .*?)>#', '[list]', $string);
    $string = preg_replace('#<ol(| .*?)>#', '[list=1]', $string);
    $string = preg_replace('#<li(| .*?)>#', '[*]', $string);
    $string = preg_replace('#<em(| .*?)>#', '[i]', $string);
    $string = preg_replace('#<strong(| .*?)>#', '[b]', $string);
    $string = preg_replace('#<blockquote(| .*?)>#', '[quote]', $string);
    $string = preg_replace('#<pre(| .*?)>#', '[quote]', $string);

    // replace multiple instances of [b] or [i] with single tags
    $string = preg_replace('#(\[b\])+#', '[b]', $string);
    $string = preg_replace('#(\[i\])+#', '[i]', $string);
    $string = preg_replace('#(\[/b\])+#', '[/b]', $string);
    $string = preg_replace('#(\[/i\])+#', '[/i]', $string);

    // fix for thunderbird which chops up quotes into little chunks for some reason. Remove if necessary!
    $string = preg_replace('#\[\/quote\]\s*?\[quote\]#', '', $string);

    // Replace email address
    $string = preg_replace('#<a .*?href=.*?"mailto:(.*?)".*?>(.*?)</a>#i', "$2 ([email]$1[/email])", $string);

    // Replace links
    $string = preg_replace_callback(
        '#<a .*href=.*"(.*)".*>(.*)</a>#iU',
        function ($m) {
            $href = trim($m[1]);
            return '[url' . ($href !== '' ? '=' . $href : '') . ']' . trim($m[2]) . '[/url]';
        },
        $string
    );

    // Remove any image tags whose source starts with 'cid:' - this is an inline attachment, and will be added to the post as a normal attachment.
    $string = preg_replace('#<img[^>]+src="cid:[^>]+>#i', '', $string);

    // Replace image references
    $string = preg_replace_callback(
        '#<img .*src="(.*)".*>#iU',
        function ($m) {
            return '[img]' . trim($m[1]) . '[/img]';
        },
        $string
    );

    // Remove all remaining HTML tags
    $string = preg_replace('#<(/?\w+|!--)[^>]*>#', '', $string);

    // Convert HTML entities
    $string = html_entity_decode($string);

    // Convert quotes. Magic quotes only exist before PHP 5.4, and
    // get_magic_quotes_gpc() itself was removed in PHP 8, so the function is
    // only consulted on versions where it can return true.
    if (PHP_VERSION_ID < 50400 && get_magic_quotes_gpc()) {
        return stripslashes($string);
    }

    return $string;
}
}
?>