Location: PHPKode > projects > Yioop! > yioop-v0.78/lib/processors/epub_processor.php
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009, 2010, 2011  Chris Pollett hide@address.com
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Vijeth Patil hide@address.com
 * @package seek_quarry
 * @subpackage processor
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009, 2010, 2011
 * @filesource
 */

/**
 * Refuse to run unless BASE_DIR has already been defined by the including
 * script: print "BAD REQUEST" and stop.
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**
 * Load base class, if needed.
 */
require_once BASE_DIR."/lib/processors/text_processor.php";

/**
 * If XML turns out to be XHTML ...
 */
require_once BASE_DIR."/lib/processors/html_processor.php";

/**
 * Load so can parse urls
 */
require_once BASE_DIR."/lib/url_parser.php";


/**
 *  The maximum length, in bytes, of the description placed in a summary;
 *  longer descriptions are truncated to this length
 *
 *  @const integer MAX_DESCRIPTION_LEN
 */
const MAX_DESCRIPTION_LEN = 2000;

/**
 *  Highest child index, at each nesting level, at which metadata is
 *  expected to be present in the content.opf file.
 *
 *  @const integer MAX_DOM_LEVEL
 */
const MAX_DOM_LEVEL = 15;

/**
 * Used to create crawl summary information
 * for EPUB files (those served as application/epub+zip)
 *
 * @author Vijeth Patil
 * @package seek_quarry
 * @subpackage processor
 */
 
 

class EpubProcessor extends TextProcessor
{
    /**
     *  The name of the tag element in an xml document. Set on the node
     *  objects built by xmlToObject().
     *
     *  @var string name
     */
    var $name;

    /**
     *  The attributes of the tag element in an xml document. Only set
     *  when the parsed tag carries attributes.
     *
     *  @var array attributes
     */
    var $attributes;

    /**
     *  The content of the tag element, used to extract
     *  the fields like title, creator, language of the document.
     *  Stays NULL when the tag has no text value.
     *
     *  @var string content
     */
    var $content;

    /**
     *  The child tag elements of a tag element. An array of node objects
     *  for tags that were opened; left unset for self-contained tags.
     *
     *  @var array children
     */
    var $children;

    /**
     *  Used to extract the title, description and links from
     *  a string consisting of ebook publication data.
     *
     *  The string is written to a temporary file and opened as a zip
     *  archive. Every *.opf entry is parsed for its dc:language, dc:title,
     *  dc:creator and dc:identifier elements (looked up among the
     *  grandchildren of the OPF root; the last occurrence seen wins).
     *  Every *.html / *.xhtml entry is run through HtmlProcessor and the
     *  't' field of its summary is appended to the description.
     *  If the data cannot be stored or is not a readable zip archive, the
     *  summary is still returned, with empty title, description and
     *  language.
     *
     *  @param string $page epub contents
     *  @param string $url the url where the page contents came from,
     *     used to canonicalize relative links
     *
     *  @return array  a summary of the contents of the page
     *
     */
    function process($page, $url)
    {
        // the dot is escaped so that only genuine extensions match
        $opf_pattern = "/\.opf$/i";
        $html_pattern = "/\.x?html$/i";
        // metadata tags of interest mapped to the value found for them
        $metadata = array(
            "dc:language" => '',
            "dc:title" => '',
            "dc:creator" => '',
            "dc:identifier" => ''
        );
        $epub_url[0] = '';
        $htmlcontent = '';

        /* a uniquely named file is used so that processes working on
           epub documents at the same time cannot overwrite each other */
        $temp_filename = tempnam(CRAWL_DIR ."/temp", "epub");
        if($temp_filename !== FALSE) {
            if(file_put_contents($temp_filename, $page) !== FALSE) {
                $zip = new ZipArchive;
                /* open() returns TRUE on success and a non-zero integer
                   error code on failure, hence the strict comparison */
                if($zip->open($temp_filename) === TRUE) {
                    for($i = 0; $i < $zip->numFiles; $i++) {
                        // get the content file names of .epub document
                        $filename = $zip->getNameIndex($i);
                        if($filename === FALSE) {
                            continue;
                        }
                        if(preg_match($opf_pattern, $filename)) {
                            // Get the file data from zipped folder
                            $opf_data = $zip->getFromIndex($i);
                            if($opf_data === FALSE) {
                                continue;
                            }
                            $opf_summary = $this->xmlToObject($opf_data);
                            if(!is_object($opf_summary) ||
                                !is_array($opf_summary->children)) {
                                continue;
                            }
                            foreach($opf_summary->children as $section) {
                                if(!is_object($section) ||
                                    !is_array($section->children)) {
                                    continue;
                                }
                                foreach($section->children as $node) {
                                    if(isset($node->name) &&
                                        array_key_exists($node->name,
                                        $metadata)) {
                                        $metadata[$node->name] =
                                            $node->content;
                                    }
                                }
                            }
                        } else if(preg_match($html_pattern, $filename)) {
                            $html_data = $zip->getFromIndex($i);
                            if($html_data === FALSE) {
                                continue;
                            }
                            $html = new HtmlProcessor;
                            $html_summary = $html->process($html_data, $url);
                            /* NOTE(review): 't' is presumably the title
                               key of the summary -- confirm whether the
                               description field was intended here */
                            if(isset($html_summary['t'])) {
                                $htmlcontent .= $html_summary['t'];
                            }
                        }
                    }
                    // only an archive that was actually opened is closed
                    $zip->close();
                }
            }
            unlink($temp_filename);
        }

        $epub_title = $metadata["dc:title"];
        $epub_author = $metadata["dc:creator"];
        $epub_language = $metadata["dc:language"];
        $epub_unique_identifier = $metadata["dc:identifier"];

        $desc = '';
        if($epub_title != '') {
            $desc = " $epub_title .";
        }
        if($epub_author != '') {
            $desc .= " $epub_author ";
        }
        if($epub_language != '') {
            $desc .= " $epub_language ";
        }
        if($epub_unique_identifier != '') {
            $desc .= " URN-".$epub_unique_identifier.".";
        }
        $desc .= $htmlcontent;
        //restrict the length of the description to maximum description length
        if(strlen($desc) > MAX_DESCRIPTION_LEN) {
            $desc = substr($desc, 0, MAX_DESCRIPTION_LEN);
        }

        $summary = array();
        $summary[self::TITLE] = $epub_title;
        $summary[self::DESCRIPTION] = $desc;
        $summary[self::LANG] = $epub_language;
        $summary[self::LINKS] = $epub_url;
        $summary[self::PAGE] = $page;
        return $summary;
    }

    /**
     *  Used to extract the DOM tree containing the information
     *  about the epub file such as title, author, language, unique
     *  identifier of the book from a string consisting of ebook publication
     *  content OPF file.
     *
     *  Each tag becomes an EpubProcessor object whose name, attributes,
     *  content and children fields describe it.
     *
     *  @param string $xml xml contents
     *
     *  @return object  the top-level element of the document, or NULL if
     *      no element could be parsed
     *
     */
    function xmlToObject($xml)
    {
        $tags = array();
        $parser = xml_parser_create();
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
        xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 1);
        xml_parse_into_struct($parser, $xml, $tags);
        xml_parser_free($parser);

        $elements = array();  // the currently filling [child] element array
        $stack = array();  // references to the enclosing element arrays
        foreach ($tags as $tag) {
            $type = $tag['type'];
            if ($type == "complete" || $type == "open") {
                $index = count($elements);
                $elements[$index] = new EpubProcessor;
                $elements[$index]->name = $tag['tag'];
                if(isset($tag['attributes'])) {
                    $elements[$index]->attributes = $tag['attributes'];
                }
                if(isset($tag['value'])) {
                    $elements[$index]->content = $tag['value'];
                }
                if ($type == "open") {  // push
                    $elements[$index]->children = array();
                    /* references (not copies) are required so siblings
                       added after a nested element closes still end up
                       in the parent's children array */
                    $stack[count($stack)] = &$elements;
                    $elements = &$elements[$index]->children;
                }
            } else if ($type == "close" && count($stack) > 0) {  // pop
                $top = count($stack) - 1;
                $elements = &$stack[$top];
                unset($stack[$top]);
            }
        }
        /* if tags were left unclosed (truncated or malformed input) the
           top-level array is still at the bottom of the stack */
        $top_level = (count($stack) > 0) ? $stack[0] : $elements;
        // the single top-level element, if any was parsed
        return isset($top_level[0]) ? $top_level[0] : NULL;
    }
}
?>
Return current item: Yioop!