Location: PHPKode > projects > Yioop! > yioop-v0.78/models/crawl_model.php
<?php
/** 
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009, 2010, 2011  Chris Pollett hide@address.com
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett hide@address.com
 * @package seek_quarry
 * @subpackage model
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009, 2010, 2011
 * @filesource
 */

// Refuse to run when BASE_DIR has not been defined: print "BAD REQUEST" and
// stop before any of the requires below are attempted.
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**  For crawlHash function  */
require_once BASE_DIR."/lib/utility.php";
/**
 * Loads common constants for web crawling, used for index_data_base_name and
 * schedule_data_base_name
 */
require_once BASE_DIR."/lib/crawl_constants.php";
/**
 * Crawl data is stored in an IndexArchiveBundle,
 * so load the definition of this class
 */
require_once BASE_DIR."/lib/index_archive_bundle.php";

/**
 * Used to prevent cache page requests from being logged.
 * NOTE(review): the code that reads NO_LOGGING is not in this file.
 */
define("NO_LOGGING", true);

/**
 * This class is used to handle access to crawl data: index archives and
 * their page summaries, cached pages, crawl mixes, and crawl seed settings
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage model
 */
class CrawlModel extends Model implements CrawlConstants
{
    /**
     * Stores the name of the current index archive to use to get search 
     * results from
     * @var string
     */
    var $index_name;


    /**
     * Creates the model by delegating to the parent Model constructor.
     *
     * @param string $db_name name of the database handed to the parent
     *      constructor; defaults to the DB_NAME constant
     */
    function __construct($db_name = DB_NAME) 
    {
        parent::__construct($db_name);
    }


    /**
     * Fetches one page summary from the index archive bundle selected by
     * $this->index_name.
     *
     * The bundle is opened from CRAWL_DIR/cache/ using the
     * index_data_base_name prefix followed by $this->index_name, and both
     * arguments are passed unchanged to IndexArchiveBundle::getPage().
     *
     * @param int $summary_offset offset in $generation WebArchive
     * @param int $generation the index of the WebArchive in the
     *      IndexArchiveBundle to find the item in.
     * @return array summary data of the matching document
     */
    function getCrawlItem($summary_offset, $generation)
    {
        $bundle_path = CRAWL_DIR.'/cache/'.self::index_data_base_name .
            $this->index_name;
        $bundle = new IndexArchiveBundle($bundle_path);

        return $bundle->getPage($summary_offset, $generation);
    }


    /**
     * Gets the cached version of a web page from the machine on which it was
     * fetched.
     *
     * Complete cached versions of web pages typically only live on a fetcher
     * machine. The queue server machine typically only maintains summaries.
     * This method makes a REST request (c=archive, a=cache) of a fetcher
     * machine for a cached page and gets the results back. The request is
     * signed with a session value computed as md5(time . AUTH_KEY).
     *
     * The response is expected to be a base64 encoded, serialized array.
     * If the fetch fails or the response cannot be decoded into an array,
     * an array holding only the REQUEST entry is returned.
     *
     * @param string $machine the ip address or domain name of the machine
     *      the cached page lives on
     * @param string $machine_uri the path from document root on $machine where
     *      the yioop scripts live
     * @param int $partition the partition in the WebArchiveBundle the page is
     *       in
     * @param int $offset the offset in bytes into the WebArchive partition in
     *      the WebArchiveBundle at which the cached page lives.
     * @param string $crawl_time the timestamp of the crawl the cache page is
     *      from
     * @return array page data of the cached page, plus a 'REQUEST' entry
     *      holding the url that was requested
     */
    function getCacheFile($machine, $machine_uri, $partition, 
        $offset, $crawl_time) 
    {
        $time = time();
        $session = md5($time . AUTH_KEY);
        if($machine == '::1') { //IPv6 :(
            $machine = "[::1]/"; 
            //used if the fetching and queue serving were on the same machine
        }

        // the caller supplied values are url-encoded so that an unexpected
        // character cannot alter the structure of the query string
        $request = "http://$machine$machine_uri?c=archive&a=cache&time=$time".
            "&session=$session&partition=".urlencode($partition).
            "&offset=".urlencode($offset).
            "&crawl_time=".urlencode($crawl_time);
        $tmp = FetchUrl::getPage($request);

        // NOTE(review): unserialize() is applied to data received over HTTP;
        // this is only safe if the fetcher machine is trusted.
        $page = @unserialize(base64_decode($tmp));
        if(!is_array($page)) {
            // failed fetch or undecodable payload: start from an empty
            // array rather than writing a key into false or a scalar
            $page = array();
        }
        $page['REQUEST'] = $request;

        return $page;
    }


    /**
     * Gets the name (aka timestamp) of the current index archive to be used to
     * handle search queries
     *
     * The value is read from the CURRENT_WEB_INDEX table. That table can be
     * empty (deleteCrawl() empties it when the current crawl is deleted), in
     * which case NULL is returned.
     *
     * @return mixed the timestamp of the archive, or NULL if no current
     *      index has been set
     */
    function getCurrentIndexDatabaseName()
    {
        $this->db->selectDB(DB_NAME);
        $sql = "SELECT CRAWL_TIME FROM CURRENT_WEB_INDEX";
        $result = $this->db->execute($sql);

        $row = $this->db->fetchArray($result);

        // do not index into $row when no row came back
        if(is_array($row) && isset($row['CRAWL_TIME'])) {
            return $row['CRAWL_TIME'];
        }
        return NULL;
    }


    /**
     * Records which index archive search results should come from.
     *
     * Any existing rows of CURRENT_WEB_INDEX are removed and a single row
     * holding $timestamp is inserted.
     *
     * NOTE(review): $timestamp is placed directly into the SQL text; callers
     * are presumably expected to pass an already sanitized value -- confirm.
     *
     * @param string $timestamp the timestamp of the index archive. The
     *      timestamp is when the crawl was started. Currently, the timestamp
     *      appears as a substring of the index archive's directory name
     */
    function setCurrentIndexDatabaseName($timestamp)
    {
        $db = $this->db;
        $db->selectDB(DB_NAME);
        $db->execute("DELETE FROM CURRENT_WEB_INDEX");
        $db->execute(
            "INSERT INTO CURRENT_WEB_INDEX VALUES ('".$timestamp."')");
    }


    /**
     * Gets a list of all index archives of crawls that have been conducted
     *
     * Every directory under CRAWL_DIR/cache/ whose path contains
     * index_data_base_name is treated as an index archive; the text after
     * that prefix is used as the crawl's timestamp.
     *
     * @param bool $return_arc_bundles whether index bundles used for indexing
     *      arc or other archive bundles should be included in the list
     * @param bool $return_recrawls whether index archive bundles generated as
     *      a result of recrawling should be included in the result
     *
     * @return array Available IndexArchiveBundle directories and
     *      their meta information. Each entry has the keys CRAWL_TIME,
     *      DESCRIPTION (prefixed with "ARCFILE::" or "RECRAWL::" for those
     *      kinds of bundles), RESUMABLE, VISITED_URLS_COUNT, COUNT,
     *      NUM_DOCS_PER_PARTITION, and WRITE_PARTITION
     */
    function getCrawlList($return_arc_bundles = false, $return_recrawls = false)
    {
        $list = array();
        $dirs = glob(CRAWL_DIR.'/cache/*', GLOB_ONLYDIR);
        if(!is_array($dirs)) {
            // glob() returns false on error; treat that as "no crawls"
            return $list;
        }

        foreach($dirs as $dir) {
            $pre_timestamp = strstr($dir, self::index_data_base_name);
            if($pre_timestamp === false || strlen($pre_timestamp) == 0) {
                continue; // not an index archive directory
            }
            $crawl = array();
            $crawl['CRAWL_TIME'] =
                substr($pre_timestamp, strlen(self::index_data_base_name));
            $info = IndexArchiveBundle::getArchiveInfo($dir);
            $index_info = unserialize($info['DESCRIPTION']);
            if(!is_array($index_info)) {
                // unreadable description: carry on with no extra meta data
                $index_info = array();
            }

            $crawl['DESCRIPTION'] = "";
            if(isset($index_info['ARCFILE'])) {
                if(!$return_arc_bundles) {
                    continue;
                }
                $crawl['DESCRIPTION'] = "ARCFILE::";
            }
            if(isset($index_info[self::CRAWL_TYPE]) &&
                $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
                if(!$return_recrawls) {
                    continue;
                }
                // a recrawl label replaces any ARCFILE label set above
                $crawl['DESCRIPTION'] = "RECRAWL::";
            }

            // a crawl is flagged resumable when schedule files matching
            // this pattern exist for its timestamp
            $schedules = glob(CRAWL_DIR.'/schedules/'.
                self::schedule_data_base_name.$crawl['CRAWL_TIME'].
                '/*/At*.txt');
            $crawl['RESUMABLE'] =
                (is_array($schedules) && count($schedules) > 0);

            if(isset($index_info['DESCRIPTION'])) {
                $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
            }
            $crawl['VISITED_URLS_COUNT'] =
                isset($info['VISITED_URLS_COUNT']) ?
                $info['VISITED_URLS_COUNT'] : 0;
            $crawl['COUNT'] = $info['COUNT'];
            $crawl['NUM_DOCS_PER_PARTITION'] =
                $info['NUM_DOCS_PER_PARTITION'];
            $crawl['WRITE_PARTITION'] = $info['WRITE_PARTITION'];
            $list[] = $crawl;
        }

        return $list;
    }

    /**
     * Deletes the crawl with the supplied timestamp if it exists. Also
     * deletes any crawl mixes making use of this crawl
     *
     * The crawl's cache directory and its schedule directories are removed
     * recursively, every mix that has a component referring to the crawl is
     * deleted, and CURRENT_WEB_INDEX is emptied if it pointed at this crawl.
     *
     * NOTE(review): $timestamp is placed directly into the SQL text; callers
     * are presumably expected to pass an already sanitized value -- confirm.
     *
     * @param string $timestamp a Unix timestamp
     */
    function deleteCrawl($timestamp)
    {
        $crawl_dirs = array(
            CRAWL_DIR.'/cache/'.self::index_data_base_name . $timestamp,
            CRAWL_DIR.'/schedules/'.self::index_data_base_name . $timestamp,
            CRAWL_DIR.'/schedules/'.self::schedule_data_base_name.$timestamp,
            CRAWL_DIR.'/schedules/'.self::robot_data_base_name.$timestamp
        );
        foreach($crawl_dirs as $crawl_dir) {
            $this->db->unlinkRecursive($crawl_dir, true);
        }

        $this->db->selectDB(DB_NAME);
        $sql = "SELECT DISTINCT MIX_TIMESTAMP FROM MIX_COMPONENTS WHERE ".
            " CRAWL_TIMESTAMP='$timestamp'";
        $result = $this->db->execute($sql);

        // Collect the rows before deleting anything, so the result set is
        // not being read while deleteCrawlMix() issues its own statements.
        // Only real rows are stored: the terminating false from
        // fetchArray() must not reach the loop below, otherwise a mix with
        // an empty timestamp would be "deleted" as well.
        $rows = array();
        while($row = $this->db->fetchArray($result)) {
            $rows[] = $row;
        }

        foreach($rows as $row) {
            $this->deleteCrawlMix($row['MIX_TIMESTAMP']);
        }
        $current_timestamp = $this->getCurrentIndexDatabaseName();
        if($current_timestamp == $timestamp) {
            $this->db->execute("DELETE FROM CURRENT_WEB_INDEX");
        }
    }

    /**
     * Lists the crawl mixes stored in the CRAWL_MIXES table
     *
     * @param bool $components when false each entry holds only
     *      MIX_TIMESTAMP and MIX_NAME; when true each entry additionally
     *      gets a GROUPS element loaded through getCrawlMix()
     * @return array list of available crawl mixes
     */
    function getMixList($components = false)
    {
        $this->db->selectDB(DB_NAME);
        $result = $this->db->execute(
            "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES");

        $mixes = array();
        while($mix_row = $this->db->fetchArray($result)) {
            if($components) {
                $full_mix =
                    $this->getCrawlMix($mix_row['MIX_TIMESTAMP'], true);
                $mix_row['GROUPS'] = $full_mix['GROUPS'];
            }
            $mixes[] = $mix_row;
        }
        return $mixes;
    }


    /**
     * Retrieves the weighting component of the requested crawl mix
     *
     * The result always has a GROUPS element, keyed by GROUP_ID. Each group
     * holds its RESULT_BOUND and a COMPONENTS list (possibly empty) of rows
     * with CRAWL_TIMESTAMP, WEIGHT and KEYWORDS. Unless $just_components is
     * true, MIX_TIMESTAMP and MIX_NAME are included when the mix exists.
     *
     * NOTE(review): $timestamp is placed directly into the SQL text; callers
     * are presumably expected to pass an already sanitized value -- confirm.
     *
     * @param string $timestamp of the requested crawl mix
     * @param bool $just_components says whether to find the mix name or
     *      just the components array.
     * @return array the crawls and their weights that make up the
     *      requested crawl mix.
     */
    function getCrawlMix($timestamp, $just_components = false)
    {
        $this->db->selectDB(DB_NAME);
        $mix = array();
        if(!$just_components) {
            $sql = "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES WHERE ".
                " MIX_TIMESTAMP='$timestamp'";
            $result = $this->db->execute($sql);
            $mix_row = $this->db->fetchArray($result);
            if(is_array($mix_row)) {
                // only adopt the row when the mix exists, so the array
                // writes below never operate on false
                $mix = $mix_row;
            }
        }

        $sql = "SELECT GROUP_ID, RESULT_BOUND".
            " FROM MIX_GROUPS WHERE ".
            " MIX_TIMESTAMP='$timestamp'";
        $result = $this->db->execute($sql);
        $mix['GROUPS'] = array();
        while($row = $this->db->fetchArray($result)) {
            $mix['GROUPS'][$row['GROUP_ID']]['RESULT_BOUND'] =
                $row['RESULT_BOUND'];
        }

        foreach(array_keys($mix['GROUPS']) as $group_id) {
            $sql = "SELECT CRAWL_TIMESTAMP, WEIGHT, KEYWORDS ".
                " FROM MIX_COMPONENTS WHERE ".
                " MIX_TIMESTAMP='$timestamp' AND GROUP_ID='$group_id'";
            $result = $this->db->execute($sql);

            // top-level key kept only because earlier versions of this
            // method set it whenever the mix had at least one group
            $mix['COMPONENTS'] = array();
            // every group gets a COMPONENTS list, even when it is empty
            $mix['GROUPS'][$group_id]['COMPONENTS'] = array();
            while($row = $this->db->fetchArray($result)) {
                $mix['GROUPS'][$group_id]['COMPONENTS'][] = $row;
            }
        }
        return $mix;
    }

    /**
     * Looks up the timestamp of the crawl mix that has the given name
     *
     * NOTE(review): $mix_name is placed directly into the SQL text; callers
     * are presumably expected to pass an already sanitized value -- confirm.
     *
     * @param string $mix_name name to lookup
     * @return mixed timestamp associated with name if exists false otherwise
     */
    function getCrawlMixTimestamp($mix_name)
    {
        $this->db->selectDB(DB_NAME);
        $sql = "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES WHERE ".
            " MIX_NAME='$mix_name'";
        $mix_row = $this->db->fetchArray($this->db->execute($sql));

        return isset($mix_row["MIX_TIMESTAMP"]) ?
            $mix_row["MIX_TIMESTAMP"] : false;
    }
    /**
     * Get a description associated with a Web Crawl or Crawl Mix
     *
     * For a mix the result holds TIMESTAMP, DESCRIPTION (the mix name) and
     * IS_MIX. For a crawl the result is the archive info of its index
     * bundle with DESCRIPTION replaced by the human readable description;
     * if the bundle directory does not exist an empty array is returned.
     *
     * @param int $timestamp of crawl or mix in question
     * @param bool $is_mix whether it is a mix or not; when NULL this is
     *      determined by calling isCrawlMix()
     * @return array associative array containing item DESCRIPTION
     */
    function getInfoTimestamp($timestamp, $is_mix = NULL)
    {
        if($is_mix === NULL) {
            $is_mix = $this->isCrawlMix($timestamp);
        }

        if($is_mix) {
            $this->db->selectDB(DB_NAME);

            $sql = "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES WHERE ".
                " MIX_TIMESTAMP='$timestamp'";
            $mix_row = $this->db->fetchArray($this->db->execute($sql));

            return array(
                'TIMESTAMP' => $timestamp,
                'DESCRIPTION' => $mix_row['MIX_NAME'],
                'IS_MIX' => true
            );
        }

        $bundle_dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.
            $timestamp;
        if(!file_exists($bundle_dir)) {
            return array();
        }
        $info = IndexArchiveBundle::getArchiveInfo($bundle_dir);
        $description_data = unserialize($info['DESCRIPTION']);
        $info['DESCRIPTION'] = $description_data['DESCRIPTION'];

        return $info;
    }

    /**
     * Returns whether the supplied timestamp corresponds to a crawl mix
     *
     * NOTE(review): $timestamp is placed directly into the SQL text; callers
     * are presumably expected to pass an already sanitized value -- confirm.
     *
     * @param string $timestamp of the requested crawl mix
     *
     * @return bool true if a row for the timestamp exists in CRAWL_MIXES;
     *      false otherwise, including when the query yields no result
     */
    function isCrawlMix($timestamp)
    {
        $this->db->selectDB(DB_NAME);

        $sql = "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES WHERE ".
            " MIX_TIMESTAMP='$timestamp'";
        $result = $this->db->execute($sql);
        if(!$result) {
            // return an explicit bool rather than falling off the end
            return false;
        }

        return ($this->db->fetchArray($result)) ? true : false;
    }

    /**
     * Stores in DB the supplied crawl mix object
     *
     * Any data previously stored for the mix's timestamp is deleted first.
     * Groups are then written with consecutive ids starting at 0, in the
     * order they appear in $mix['GROUPS']; the array keys of GROUPS are
     * not used. A mix without GROUPS, or a group without COMPONENTS, is
     * stored as having none.
     *
     * NOTE(review): the values of $mix are placed directly into the SQL
     * text, so e.g. a quote character in MIX_NAME or KEYWORDS would break
     * the statement; callers are presumably expected to pass already
     * sanitized values -- confirm.
     *
     * @param array $mix an associative array representing the crawl mix
     *      object with keys MIX_TIMESTAMP, MIX_NAME and GROUPS; each group
     *      has RESULT_BOUND and a COMPONENTS list of arrays with
     *      CRAWL_TIMESTAMP, WEIGHT and KEYWORDS
     */
    function setCrawlMix($mix)
    {
        $this->db->selectDB(DB_NAME);
        //although maybe slower, we first get rid of any old data
        $timestamp = $mix['MIX_TIMESTAMP'];

        $this->deleteCrawlMix($timestamp);

        //next we store the new data
        $sql = "INSERT INTO CRAWL_MIXES VALUES ('$timestamp', '".
            $mix['MIX_NAME']."')";
        $this->db->execute($sql);

        $groups = (isset($mix['GROUPS']) && is_array($mix['GROUPS'])) ?
            $mix['GROUPS'] : array();
        $gid = 0;
        foreach($groups as $group_data) {
            $sql = "INSERT INTO MIX_GROUPS VALUES ('$timestamp', '$gid', ".
                "'".$group_data['RESULT_BOUND']."')";
            $this->db->execute($sql);

            // a group may legitimately have no components yet
            $components = (isset($group_data['COMPONENTS']) &&
                is_array($group_data['COMPONENTS'])) ?
                $group_data['COMPONENTS'] : array();
            foreach($components as $component) {
                $sql = "INSERT INTO MIX_COMPONENTS VALUES ('$timestamp', '".
                    $gid."', '".$component['CRAWL_TIMESTAMP']."', '".
                    $component['WEIGHT']."', '" .
                    $component['KEYWORDS']."')";
                $this->db->execute($sql);
            }
            $gid++;
        }
    }

    /**
     * Deletes from the DB the crawl mix with the given timestamp
     *
     * Rows with a matching MIX_TIMESTAMP are removed from CRAWL_MIXES,
     * MIX_GROUPS and MIX_COMPONENTS, in that order.
     *
     * NOTE(review): $timestamp is placed directly into the SQL text; callers
     * are presumably expected to pass an already sanitized value -- confirm.
     *
     * @param string $timestamp timestamp of the crawl mix to delete
     */
    function deleteCrawlMix($timestamp)
    {
        $this->db->selectDB(DB_NAME);
        $mix_tables = array("CRAWL_MIXES", "MIX_GROUPS", "MIX_COMPONENTS");
        foreach($mix_tables as $table) {
            $this->db->execute(
                "DELETE FROM ".$table." WHERE MIX_TIMESTAMP='$timestamp'");
        }
    }

    /**
     * Returns the crawl parameters that were used during a given crawl
     *
     * The parameters are read from the serialized DESCRIPTION of the
     * crawl's index archive bundle and are arranged in the same layout
     * that getSeedInfo() and setSeedInfo() use.
     *
     * @param string $timestamp timestamp of the crawl to load the crawl
     *      parameters of
     * @return mixed NULL if the crawl's index bundle directory does not
     *      exist; otherwise an array with a 'general' element
     *      (restrict_sites_by_url, crawl_type, crawl_index, crawl_order),
     *      'allowed_sites', 'disallowed_sites' and 'seed_sites' elements
     *      (each holding a 'url' list), 'meta_words', and, if the crawl
     *      stored any, 'indexing_plugins'
     */
    function getCrawlSeedInfo($timestamp)
    {
        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
        if(!file_exists($dir)) {
            return NULL;
        }

        $info = IndexArchiveBundle::getArchiveInfo($dir);
        $index_info = unserialize($info['DESCRIPTION']);
        if(!is_array($index_info)) {
            // unreadable description: fall back to the defaults below
            $index_info = array();
        }

        $seed_info = array();
        $seed_info['general']["restrict_sites_by_url"] =
            (isset($index_info[self::RESTRICT_SITES_BY_URL])) ?
            $index_info[self::RESTRICT_SITES_BY_URL] : NULL;
        $seed_info['general']["crawl_type"] =
            (isset($index_info[self::CRAWL_TYPE])) ?
            $index_info[self::CRAWL_TYPE] : self::WEB_CRAWL;
        $seed_info['general']["crawl_index"] =
            (isset($index_info[self::CRAWL_INDEX])) ?
            $index_info[self::CRAWL_INDEX] : '';
        $seed_info['general']["crawl_order"] =
            (isset($index_info[self::CRAWL_ORDER])) ?
            $index_info[self::CRAWL_ORDER] : NULL;

        $site_types = array(
            "allowed_sites" => self::ALLOWED_SITES,
            "disallowed_sites" => self::DISALLOWED_SITES,
            "seed_sites" => self::TO_CRAWL
        );
        foreach($site_types as $type => $code) {
            // copied by value on purpose: a reference here would stay
            // bound across iterations and let a later assignment
            // overwrite the previous site list in $index_info
            $seed_info[$type]['url'] = (isset($index_info[$code])) ?
                $index_info[$code] : array();
        }

        $seed_info['meta_words'] = array();
        if(isset($index_info[self::META_WORDS])) {
            $seed_info['meta_words'] = $index_info[self::META_WORDS];
        }
        if(isset($index_info[self::INDEXING_PLUGINS])) {
            $seed_info['indexing_plugins']['plugins'] =
                $index_info[self::INDEXING_PLUGINS];
        }

        return $seed_info;
    }

    /**
     * Changes the crawl parameters of an existing crawl
     *
     * Only restrict_sites_by_url, the allowed and disallowed site lists,
     * the meta words and the indexing plugins can be changed; each is
     * updated only if present in $new_info. The result is written back
     * into the DESCRIPTION of the crawl's index archive bundle. Nothing
     * happens if the bundle directory does not exist.
     *
     * @param string $timestamp timestamp of the crawl to change
     * @param array $new_info the new parameters
     */
    function setCrawlSeedInfo($timestamp, $new_info)
    {
        $bundle_dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.
            $timestamp;
        if(!file_exists($bundle_dir)) {
            return;
        }

        $archive_info = IndexArchiveBundle::getArchiveInfo($bundle_dir);
        $index_info = unserialize($archive_info['DESCRIPTION']);

        if(isset($new_info['general']["restrict_sites_by_url"])) {
            $index_info[self::RESTRICT_SITES_BY_URL] =
                $new_info['general']["restrict_sites_by_url"];
        }

        $site_codes = array(
            "allowed_sites" => self::ALLOWED_SITES,
            "disallowed_sites" => self::DISALLOWED_SITES
        );
        foreach($site_codes as $section => $code) {
            if(!isset($new_info[$section])) {
                continue;
            }
            $index_info[$code] = $new_info[$section]['url'];
        }

        if(isset($new_info['meta_words'])) {
            $index_info[self::META_WORDS] = $new_info['meta_words'];
        }
        if(isset($new_info['indexing_plugins'])) {
            $index_info[self::INDEXING_PLUGINS] =
                $new_info['indexing_plugins']['plugins'];
        }

        $archive_info['DESCRIPTION'] = serialize($index_info);
        IndexArchiveBundle::setArchiveInfo($bundle_dir, $archive_info);
    }


    /**
     *  Reads the settings a new crawl will start with: the seed sites along
     *  with crawl parameters such as crawl order, allowed and disallowed
     *  crawl sites
     *
     *  The settings come from WORK_DIRECTORY/crawl.ini when that file exists
     *  and $use_default is false; otherwise from
     *  BASE_DIR/configs/default_crawl.ini. The file is parsed with sections.
     *
     *  @param bool $use_default whether or not to use the Yioop! default
     *      crawl.ini file rather than the one created by the user.
     *  @return array the parsed ini file, one element per section
     */
    function getSeedInfo($use_default = false)
    {
        $ini_file = BASE_DIR."/configs/default_crawl.ini";
        if(!$use_default && file_exists(WORK_DIRECTORY."/crawl.ini")) {
            $ini_file = WORK_DIRECTORY."/crawl.ini";
        }

        return parse_ini_file($ini_file, true);
    }

    /**
     * Writes a crawl.ini file with the provided data to the user's
     * WORK_DIRECTORY
     *
     * The file consists of a license header followed by the sections
     * [general], [allowed_sites], [disallowed_sites], [seed_sites],
     * [meta_words] and [indexing_plugins]. A site section that is missing
     * from $info, or has no 'url' list, is written as an empty section.
     * That is also how such a section comes back from getSeedInfo(), since
     * an empty ini section parses to an array without a 'url' key.
     *
     * NOTE(review): values are written between single quotes without any
     * escaping, so a value containing a quote would presumably produce an
     * ini file that does not parse back as intended -- confirm.
     *
     * @param array $info an array containing information about the crawl
     *      such as crawl_order, whether restricted_by_url, seed_sites,
     *      allowed_sites and disallowed_sites. If
     *      $info['general']['crawl_index'] is not set, '12345678' is
     *      written in its place
     */
    function setSeedInfo($info)
    {
        if(!isset($info['general']['crawl_index'])) {
            $info['general']['crawl_index']='12345678';
        }
        $n = array();
        $n[] = <<<EOT
; ***** BEGIN LICENSE BLOCK ***** 
;  SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
;  Copyright (C) 2009, 2010  Chris Pollett hide@address.com
;
;  This program is free software: you can redistribute it and/or modify
;  it under the terms of the GNU General Public License as published by
;  the Free Software Foundation, either version 3 of the License, or
;  (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program.  If not, see <http://www.gnu.org/licenses/>.
;  ***** END LICENSE BLOCK ***** 
;
; crawl.ini 
;
; crawl configuration file
;
EOT;
        $n[] = '[general]';
        $n[] = "crawl_order = '".$info['general']['crawl_order']."';";
        $n[] = "crawl_type = '".$info['general']['crawl_type']."';";
        $n[] = "crawl_index = '".$info['general']['crawl_index']."';";
        $bool_string =
            ($info['general']['restrict_sites_by_url']) ? "true" : "false";
        $n[] = "restrict_sites_by_url = $bool_string;";
        $n[] = "";

        $site_types = array('allowed_sites', 'disallowed_sites', 'seed_sites');
        foreach($site_types as $type) {
            $n[] = "[$type]";
            // an absent or empty site list still gets its section header
            $urls = (isset($info[$type]['url']) &&
                is_array($info[$type]['url'])) ?
                $info[$type]['url'] : array();
            foreach($urls as $url) {
                $n[] = "url[] = '$url';";
            }
            $n[]="";
        }
        $n[] = "[meta_words]";
        if(isset($info["meta_words"])) {
            foreach($info["meta_words"] as $word_pattern => $url_pattern) {
                $n[] = "$word_pattern = '$url_pattern';";
            }
            $n[]="";
        }
        $n[] = "[indexing_plugins]";
        if(isset($info["indexing_plugins"]['plugins']) &&
            is_array($info["indexing_plugins"]['plugins'])) {
            foreach($info["indexing_plugins"]['plugins'] as $plugin) {
                $n[] = "plugins[] = '$plugin';";
            }
        }

        $out = implode("\n", $n);
        file_put_contents(WORK_DIRECTORY."/crawl.ini", $out);
    }
}
?>
Return current item: Yioop!