Location: PHPKode > projects > Yioop! > yioop-v0.78/lib/indexing_plugins/indexing_plugin.php
<?php
/** 
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2011 Priya Gangaraju hide@address.com, Chris Pollett
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Priya Gangaraju hide@address.com, Chris Pollett
 * @package seek_quarry
 * @subpackage indexing_plugin
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2011
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 
/** Some models might interface with a DBMS so load the DBMS manager*/
require_once BASE_DIR."/models/datasources/".DBMS."_manager.php";

/**
 * Base indexing plugin Class. An indexing plugin allows a developer
 * to do additional processing on web pages during a crawl, then after
 * the web crawl is over do post processing on the additional data
 * that was collected. For example, during a crawl one might by analysing
 * web pages mark pages that have recipes on them with the meta word
 * recipe:all, then after the crawl is over do post processing such
 * as clustering the recipe's found and add additional meta words to
 * retrieve recipe's by principle ingredient. Subclasses of IndexingPlugin
 * do in crawl processing by overriding the pageProcessing method, they
 * do post crawl processing by overriding the postProcessing method. In 
 * addition a subclass should override the static functions
 * getProcessors() to say what PageProcessor's the plugin should be
 * associated with as well as getAdditionalMetaWords() to say what
 * additional meta words the plugin injects into the index.
 *
 * @author Priya Gangaraju, Chris Pollett
 * @package seek_quarry
 * @subpackage indexing_plugin
 */ 
abstract class IndexingPlugin
{

    /**
     * Array of the PageProcessor classes used by this IndexingPlugin 
     * (contructor loads these)
     * @var array
     */
    var $processors = array();
    /**
     * Array of the model classes used by this IndexingPlugin 
     * (contructor loads these)
     * @var array
     */
    var $models = array();
    /**
     * The IndexArchiveBundle object that this indexing plugin might
     * make changes to in its postProcessing method
     * @var object
     */
    var $index_archive;

    /**
     * Reference to a database object that might be used by models on this 
     * plugin
     * @var object
     */
    var $db;

    /**
     * Builds an IndexingPlugin object. Loads in the appropriate
     * models for the given plugin object
     */
    function __construct() 
    {
        $db_class = ucfirst(DBMS)."Manager";
        $this->db = new $db_class();
        
        require_once BASE_DIR."/models/model.php";

        foreach($this->models as $model) {
            require_once BASE_DIR."/models/".$model."_model.php";
             
            $model_name = ucfirst($model)."Model";
            $model_instance_name = lcfirst($model_name);

            $this->$model_instance_name = new $model_name();
        }
        
    }

    /**
     * This method is called by a PageProcessor in its handle() method
     * just after it has processed a web page. This method allows
     * an indexing plugin to do additional processing on the page
     * such as adding sub-documents, before the page summary is
     * handed back to the fetcher.
     *
     *  @param string $page web-page contents
     *  @param string $url the url where the page contents came from,
     *     used to canonicalize relative links
     *
     *  @return array consisting of a sequence of subdoc arrays found
     *      on the given page. Each subdoc array has a self::TITLE and
     *      a self::DESCRIPTION
     */
    abstract function pageProcessing($page, $url);

    /**
     * This method is called by the queue_server with the name of
     * a completed index. This allows the indexing plugin to 
     * perform searches on the index and using the results, inject
     * new page/index data into the index before it becomes available
     * for end use.
     *
     * @param string $index_name the name/timestamp of an IndexArchiveBundle
     *      to do post processing for
     */
    abstract function postProcessing($index_name);

    /**
     * @return array string names of page processors that this plugin
     *      associates with
     */
    static function getProcessors() {return NULL;}

    /**
     *  Returns an associative array of meta words => description length
     *  for each meta word injected by this plugin into an index. The
     *  description length is used to say how the maximum length of
     *  the web snippet show in search results for this meta owrd should be
     *  
     *  @return array meta words => description length pairs
     */
    static function getAdditionalMetaWords() {return array();}

}
?>
Return current item: Yioop!