Location: PHPKode > projects > phlyMail Lite > phlymail/shared/lib/rss.read.php
<?php
/**
 * Offers functions for reading a given RSS file and returning the feed's
 * data as an array structure
 * By using more generalized RegEx for parsing RSS data this class should be
 * fairly resistant against unclean XML data. Additionally you can pass it more
 * tag names (case insensitive) for recognizing future enhancements of the
 * standards or private use tags.
 *
 * @package phlyMail Nahariya 4.0+
 * @author Matthias Sommerfeld, phlyLabs
 * @copyright 2005-2008 phlyLabs, Berlin http://phlylabs.de/
 * @uses phlyMail 3.5+ shared/lib/functions.php
 * @version 0.5.9
 */
class parse_rss
{
    // Encoding assumed when the feed does not declare one
    private $default_enc = 'UTF-8';
    // Encoding of the feed currently being parsed; set by parse(), read by rss_preg_match()
    private $rssenc = '';
    // Maximum number of items to return from the top of the feed, 0 for no limit
    private $items_limit = 0;
    // Strip HTML from channel fields, item titles and item descriptions
    private $strip_html   = true;
    // date() format applied to lastBuildDate / pubDate; empty string leaves dates untouched
    private $date_format = 'Y-m-d H:i:s';
    // Directory holding cached copies of parsed feeds; empty string disables caching
    private $cache_dir   = '';
    // Cache lifetime in seconds, 0 for no cache
    private $cache_time  = 0;
    private $channeltags = array
            ('title', 'link', 'description', 'language', 'copyright'
            ,'managingEditor', 'webMaster', 'lastBuildDate', 'rating', 'docs'
            );
    private $itemtags = array
            ('title', 'link', 'description', 'author', 'category', 'comments'
            ,'enclosure', 'guid', 'pubDate', 'source'
            );
    private $imagetags = array('title', 'url', 'link', 'width', 'height');
    private $textinputtags = array('title', 'description', 'name', 'link');

    /**
     * Constructor method. By passing an array with options you might influence
     * the behaviour of the class.
     *
     * Recognised keys: channeltags, itemtags, imagetags, textinputtags (arrays of
     * tag names), cache_time, items_limit (numeric), cache_dir, date_format
     * (strings) and strip_html (bool). Values of the wrong type are ignored.
     *
     *[@param array $options  Pass some options for influence on behaviour]
     * @since 0.5.8
     */
    public function __construct($options = null)
    {
        if (is_null($options)) {
            return;
        }
        foreach (array('channeltags', 'itemtags', 'imagetags', 'textinputtags') as $key) {
            if (isset($options[$key]) && is_array($options[$key])) {
                $this->$key = $options[$key];
            }
        }
        if (isset($options['cache_time']) && is_numeric($options['cache_time'])) {
            $this->cache_time = $options['cache_time'];
        }
        if (isset($options['cache_dir']) && is_string($options['cache_dir'])) {
            $this->cache_dir = $options['cache_dir'];
        }
        // Earlier versions looked for the misspelled key '$date_format' (and then
        // failed to store it). The legacy spelling is still honoured, but the
        // correctly named key is checked last and therefore wins.
        foreach (array('$date_format', 'date_format') as $key) {
            if (isset($options[$key]) && is_string($options[$key])) {
                $this->date_format = $options[$key];
            }
        }
        if (isset($options['strip_html']) && is_bool($options['strip_html'])) {
            $this->strip_html = $options['strip_html'];
        }
        if (isset($options['items_limit']) && is_numeric($options['items_limit'])) {
            $this->items_limit = $options['items_limit'];
        }
    }

    /**
     * Return the channel information for a given filename
     *
     * When a cache directory and a positive cache time are configured, a fresh
     * cached copy is returned instead of parsing the feed again.
     *
     * @param  string  $filename  File to read the information from
     * @return array|false  false on failure, structured associative array on success;
     *                      the key 'cached' is 1 when the data came from the cache, 0 otherwise
     * @access public
     * @since 0.0.1
     */
    public function read($filename = false)
    {
        // Caching needs both a directory and a positive lifetime
        if ($this->cache_dir == '' || $this->cache_time <= 0) {
            $result = $this->parse($filename);
            if ($result) $result['cached'] = 0;
            return $result;
        }

        $cache_file = $this->cache_dir . '/rsscache_' . md5((string) $filename);
        if (file_exists($cache_file) && is_readable($cache_file) && is_writeable($cache_file)
                && (time() - filemtime($cache_file)) < $this->cache_time) {
            // NOTE(review): unserialize() on a file; the cache directory must not be
            // writable by untrusted parties.
            $result = @unserialize((string) @file_get_contents($cache_file));
            if (is_array($result) && !empty($result)) {
                $result['cached'] = 1;
                return $result;
            }
            // Cached copy is unusable (truncated, corrupt): fall through and rebuild it
        }

        $result = $this->parse($filename);
        if ($result) {
            // Only successful parses are cached, so a temporary failure is retried next time
            file_put_contents($cache_file, serialize($result));
            $result['cached'] = 0;
        }
        return $result;
    }

    /**
     * Actual parsing method used by the public method read() to read and parse the given RSS file
     * @param  string  URL or filename of the RSS
     * @return  array  structured array with found info, false on failure
     * @access  private
     * @since  0.0.1
     */
    private function parse($filename)
    {
        // Opening the given file impossible
        // NOTE(review): is_readable() reports false for most URL stream wrappers - confirm
        // whether remote URLs are expected to be handled here.
        if (!@is_readable($filename)) return false;

        $rss_content = file_get_contents($filename);
        if ($rss_content === false) return false;

        $result = array();
        // Forget the encoding of a previously parsed feed before looking for this one's
        $this->rssenc = $this->default_enc;
        // Parse document encoding
        $result['encoding'] = $this->rss_preg_match('!encoding=[\'\"](.*?)[\'\"]!si', $rss_content);
        // This is used by $this->rss_preg_match
        $this->rssenc = ($result['encoding'] != '') ? $result['encoding'] : $this->default_enc;

        // Parse CHANNEL info; a document without <channel> simply yields no channel fields
        $channel = preg_match('!<channel.*?>(.*?)</channel>!si', $rss_content, $out_channel)
                ? $out_channel[1]
                : '';
        foreach ($this->channeltags as $channeltag) {
            $value = $this->extract_tag($channeltag, $channel);
            if ($value === '') continue; // Set only if not empty
            if ($this->strip_html) {
                $value = strip_tags($this->unhtmlentities(strip_tags($value)));
            }
            $result[$channeltag] = $value;
        }
        // If date_format is specified and lastBuildDate is present and valid,
        // convert lastBuildDate to the specified date format
        if (isset($result['lastBuildDate'])) {
            $result['lastBuildDate'] = $this->format_date($result['lastBuildDate']);
        }

        // Parse TEXTINPUT info
        // The pattern matches <textinput> with or without attributes, but skips the
        // self-closing form <textinput />, which is not an opening tag
        if (preg_match('!<textinput(|[^>]*[^/])>(.*?)</textinput>!si', $rss_content, $out_textinfo)
                && isset($out_textinfo[2])) {
            foreach ($this->textinputtags as $textinputtag) {
                $value = $this->extract_tag($textinputtag, $out_textinfo[2]);
                if ($value !== '') $result['textinput_'.$textinputtag] = $value; // Set only if not empty
            }
        }

        // Parse IMAGE info
        if (preg_match('!<image.*?>(.*?)</image>!si', $rss_content, $out_imageinfo)
                && isset($out_imageinfo[1])) {
            foreach ($this->imagetags as $imagetag) {
                $value = $this->extract_tag($imagetag, $out_imageinfo[1]);
                if ($value !== '') $result['image_'.$imagetag] = $value; // Set only if not empty
            }
        }

        // Parse ITEMS
        preg_match_all('!<item(| .*?)>(.*?)</item>!si', $rss_content, $items);
        $rss_items = isset($items[2]) ? $items[2] : array();
        $i = 0;
        $result['items'] = array(); // create array even if there are no items
        foreach ($rss_items as $rss_item) {
            // Stop as soon as the configured limit is reached (0 means unlimited)
            if ($this->items_limit != 0 && $i >= $this->items_limit) break;

            $item = array();
            foreach ($this->itemtags as $itemtag) {
                $value = $this->extract_tag($itemtag, $rss_item);
                if ($value !== '') $item[$itemtag] = $value; // Set only if not empty
            }
            // Strip HTML tags from DESCRIPTION (entities are left as they are)
            if ($this->strip_html && isset($item['description']) && $item['description']) {
                $item['description'] = strip_tags(strip_tags($item['description']));
            }
            // Strip HTML tags and decode entities in TITLE
            if ($this->strip_html && isset($item['title']) && $item['title']) {
                $item['title'] = strip_tags($this->unhtmlentities(strip_tags($item['title'])));
            }
            // If date_format is specified and pubDate is valid, convert it
            if (isset($item['pubDate'])) {
                $item['pubDate'] = $this->format_date($item['pubDate']);
            }
            // An item without any recognised tag leaves its index unset but is still counted
            if ($item) $result['items'][$i] = $item;
            // Item counter
            $i++;
        }
        $result['items_count'] = $i;
        return $result;
    }

    /**
     * Fetch the trimmed content of the first <tag>...</tag> pair found in $subject
     * @param  string  Tag name; regex meta characters in it are matched literally
     * @param  string  Markup to search in
     * @return string  Content of the tag, empty string if not found
     * @access private
     */
    private function extract_tag($tag, $subject)
    {
        // Quote into a separate variable so callers can keep using the raw tag name as array key
        $quoted = preg_quote($tag, '!');
        return $this->rss_preg_match('!<'.$quoted.'.*?>(.*?)</'.$quoted.'>!si', $subject);
    }

    /**
     * Convert a date string to the configured date format
     * @param  string  Date as found in the feed
     * @return string  Reformatted date; the input unchanged when no format is
     *                 configured or the date cannot be parsed
     * @access private
     */
    private function format_date($date)
    {
        if ($this->date_format == '') return $date;
        $timestamp = strtotime($date);
        // strtotime() signals failure with false (-1 only before PHP 5.1)
        if ($timestamp === false || $timestamp === -1) return $date;
        return date($this->date_format, $timestamp);
    }

    /**
     * Adjusts the return of preg_match to a more suitable method
     * @param  string  Pattern
     * @param  string  Subject
     * @return trimmed field with index 1 of the preg_match() array output
     * @since 0.0.1
     * @access private
     */
    private function rss_preg_match($pattern, $subject)
    {
        // no result -> empty string
        if (!preg_match($pattern, (string) $subject, $out) || !isset($out[1])) return '';
        // Drop CDATA markers, keep their content
        $content = str_replace(array('<![CDATA[', ']]>'), '', $out[1]);
        // Try to push the data to phlyMail's internal encoding (UTF-8)
        $enc = $this->rssenc ? $this->rssenc : $this->default_enc;
        // encode_utf8() is provided by phlyMail's shared/lib/functions.php
        $content = encode_utf8($content, $enc, true);
        // Return result
        return trim($content);
    }

    /**
     * Replace HTML entities &something; by real characters
     * @param  string  String to do the replacements in
     * @return  string  Replaced string
     * @access  private
     * @since 0.0.1
     */
    private function unhtmlentities($string)
    {
        // First let PHP do its job with UTF-8 and stuff
        $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
        // Now shooting at the rest, i.e. numeric entities PHP refused to decode.
        // Entities with 3 or 4 (hex) digits are dropped
        $string = preg_replace('!&#(x)?([a-f0-9]{3,4});!i', '', $string);
        // Remaining decimal entities become the byte with that value (low 8 bits).
        // Callbacks replace the /e modifier, which was removed in PHP 7.0
        $string = preg_replace_callback
                ('!&#(\d+);!'
                ,function ($match) { return chr(((int) $match[1]) & 0xFF); }
                ,$string
                );
        // Remaining hex entities likewise; the last two hex digits are the low byte
        $string = preg_replace_callback
                ('!&#x([a-f0-9]+);!i'
                ,function ($match) { return chr(hexdec(substr($match[1], -2))); }
                ,$string
                );
        // What's left, will be transferred on return
        $trans_tbl = array_merge
                (array_flip(get_html_translation_table(HTML_ENTITIES, ENT_QUOTES))
                ,array('&gt;' => '>', '&lt;' => '<', '&amp;' => '&', '&quot;' => '"', '&apos;' => "'")
                );
        return strtr((string) $string, $trans_tbl);
    }
}
?>
Return current item: phlyMail Lite