Location: PHPKode > projects > Yioop! > yioop-v0.78/lib/processors/doc_processor.php
<?php
/** 
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009, 2010, 2011  Chris Pollett hide@address.com
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett hide@address.com
 * @package seek_quarry
 * @subpackage processor
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009, 2010, 2011
 * @filesource
 */

// Stop immediately with "BAD REQUEST" unless BASE_DIR has already been
// defined by the time this file is loaded.
if (!defined('BASE_DIR')) {
    echo "BAD REQUEST";
    exit();
}

/**
 * Load the parent class
 */
require_once BASE_DIR."/lib/processors/text_processor.php";

/**
 * Used to create crawl summary information 
 * for binary DOC files
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage processor
 */
class DocProcessor extends TextProcessor
{
    /**
     * Step, in bytes, used when searching the document for the first page
     * that starts with text.
     */
    const DOC_PAGE_SIZE = 256;

    /**
     * Number of bytes examined at a time by the block helper methods.
     */
    const DOC_BLOCK_SIZE = 8;

    /**
     *  Used to extract the title, description and links from
     *  a string consisting of Word Doc data (2004 or earlier).
     *
     *  The ASCII text found in the document is handed to the parent
     *  processor. If $page is not a string, or no text could be extracted,
     *  the url itself is used as the text to summarize.
     *
     *  @param string $page  the web-page contents
     *  @param string $url  the url where the page contents came from,
     *     used to canonicalize relative links
     *
     *  @return array  the summary returned by parent::process()
     *
     */
    public function process($page, $url)
    {
        $text = "";
        if(is_string($page)) {
            $text = self::extractASCIIText($page);
        }
        if($text === "") {
            $text = $url;
        }

        return parent::process($text, $url);
    }

    /**
     * This is the main text from Word doc extractor.
     * A Word Doc consists of a FIB, Piece Table, and
     * DocumentStream. The last contains the text.
     * The piece table is supposed to be used to reconstruct
     * the order of the text from the DocumentStream, and the FIB (file
     * information block) is supposed to tell us where the piece table is.
     * None of this is used for now. Instead the document is scanned by
     * brute force: the first page (256 byte) boundary whose leading eight
     * bytes are all ASCII printable is taken as the start of the text, and
     * eight byte blocks are then read until a block of all \0 bytes (or the
     * end of the document) is reached. So the order of the text extracted
     * might not match the order in the original document.
     *
     * If no page boundary starts with text, the scan begins at offset 0.
     *
     * @param string $doc string data of a 2004 or earlier Word doc
     * @return string the ASCII printable characters that were found
     */
    public static function extractASCIIText($doc)
    {
        $len = strlen($doc);

        // find the first page boundary that looks like the start of text
        $start_text = 0;
        for($i = 0; $i < $len; $i += self::DOC_PAGE_SIZE) {
            if(self::checkPageForText($doc, $i)) {
                $start_text = $i;
                break;
            }
        }

        // collect printable characters block by block until a zero block
        $text = "";
        for($i = $start_text; $i < $len; $i += self::DOC_BLOCK_SIZE) {
            if(self::checkAllZeros($doc, $i)) {
                break;
            }
            $text .= self::cleanTextBlock($doc, $i);
        }

        return $text;
    }

    /**
     * Scans document starting at given position and looking forward eight
     * characters to see if these are ASCII printable or not. Positions
     * beyond the end of the document are treated as not printable, so a
     * block that runs off the end of the document is never reported as text.
     *
     * @param string $doc document to scan
     * @param int $pos position to start scanning
     * @return bool whether the eight next characters were ASCII printable
     */
    public static function checkPageForText($doc, $pos)
    {
        for($i = 0; $i < self::DOC_BLOCK_SIZE; $i++, $pos++) {
            // isset avoids an "Uninitialized string offset" error when the
            // block extends beyond the end of the document
            if(!isset($doc[$pos]) || !self::isDocTextByte(ord($doc[$pos]))) {
                return false;
            }
        }

        return true;
    }

    /**
     * Scans document starting at given position and looking forward eight
     * characters to see if these are all \0 or not. Positions beyond the
     * end of the document are treated as \0.
     *
     * @param string $doc document to scan
     * @param int $pos position to start scanning
     * @return bool whether the eight next characters were \0
     */
    public static function checkAllZeros($doc, $pos)
    {
        for($i = 0; $i < self::DOC_BLOCK_SIZE; $i++, $pos++) {
            // only a byte that exists and is not \0 makes the check fail
            if(isset($doc[$pos]) && $doc[$pos] !== "\0") {
                return false;
            }
        }

        return true;
    }

    /**
     * Scans document starting at given position forward eight
     * characters returning those characters which are ASCII printable.
     * Positions beyond the end of the document are skipped.
     *
     * @param string $doc document to scan
     * @param int $pos position to start scanning
     * @return string substring of ASCII printable characters
     */
    public static function cleanTextBlock($doc, $pos)
    {
        $text = "";
        for($i = 0; $i < self::DOC_BLOCK_SIZE; $i++, $pos++) {
            if(isset($doc[$pos]) && self::isDocTextByte(ord($doc[$pos]))) {
                $text .= $doc[$pos];
            }
        }

        return $text;
    }

    /**
     * Checks whether a byte value counts as text for the extractor: either
     * one of the control codes 9 to 13 (tab, line feed, vertical tab, form
     * feed, carriage return) or a character code from 32 (space) to 126 (~).
     *
     * @param int $ascii byte value to test
     * @return bool whether the byte value is in one of the accepted ranges
     */
    private static function isDocTextByte($ascii)
    {
        return (9 <= $ascii && $ascii <= 13) ||
            (32 <= $ascii && $ascii <= 126);
    }

}

?>
Return current item: Yioop!