<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009, 2010, 2011 Chris Pollett hide@address.com
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Chris Pollett hide@address.com
* @package seek_quarry
* @subpackage library
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009, 2010, 2011
* @filesource
*/
// Abort immediately unless the including script has defined BASE_DIR.
if(defined('BASE_DIR') === false) {
    echo "BAD REQUEST";
    exit();
}
/**
* Summaries and word document list stored in WebArchiveBundle's so load it
*/
require_once 'web_archive_bundle.php';
/**
* Used to store word index
*/
require_once 'index_shard.php';
/**
* Used to store word dictionary
*/
require_once 'index_dictionary.php';
/**
* Used for crawlLog and crawlHash
*/
require_once 'utility.php';
/**
* Loads common constants for web crawling
*/
require_once 'crawl_constants.php';
/**
* Encapsulates a set of web page summaries and an inverted word-index of terms
* from these summaries which allow one to search for summaries containing a
* particular word.
*
* The basic file structures for an IndexArchiveBundle are:
* <ol>
* <li>A WebArchiveBundle for web page summaries.</li>
* <li>A IndexDictionary containing all the words stored in the bundle.
* Each word entry in the dictionary contains starting and ending
* offsets for documents containing that word for some particular IndexShard
* generation.</li>
* <li>A set of index shard generations. These generations
* have names index0, index1,... A shard has word entries, word doc entries
* and document entries. For more information see the index shard
* documentation.
* </li>
* <li>
* The file generation.txt keeps track of what is the current generation.
* A given generation can hold up to the bundle's configured number of
* documents per generation, after which the next generation begins.
* </li>
* </ol>
*
*
* @author Chris Pollett
* @package seek_quarry
* @subpackage library
*/
class IndexArchiveBundle implements CrawlConstants
{
    /**
     * Folder name to use for this IndexArchiveBundle
     * @var string
     */
    var $dir_name;
    /**
     * A short text name for this IndexArchiveBundle (copied from the
     * summaries WebArchiveBundle in the constructor)
     * @var string
     */
    var $description;
    /**
     * Number of partitions in the summaries WebArchiveBundle
     * NOTE(review): nothing in this class assigns this field -- confirm
     * whether it is still used elsewhere.
     * @var int
     */
    var $num_partitions_summaries;
    /**
     * Structure with info about the generations of this bundle. The keys
     * used by this class are ACTIVE (index of the last, writable
     * generation), CURRENT (index of the generation the current shard
     * refers to) and DISK_BASED (flag passed to the last setCurrentShard
     * call). The array is serialized to generation.txt.
     * @var array
     */
    var $generation_info;
    /**
     * Number of docs before a new generation is started
     * @var int
     */
    var $num_docs_per_generation;
    /**
     * WebArchiveBundle for web page summaries
     * @var object
     */
    var $summaries;
    /**
     * IndexDictionary for all shards in the IndexArchiveBundle
     * This contains entries of the form (word, num_shards with word,
     * posting list info 0th shard containing the word,
     * posting list info 1st shard containing the word, ...)
     * @var object
     */
    var $dictionary;
    /**
     * Index Shard for current generation inverted word index
     * @var object
     */
    var $current_shard;

    /**
     * Makes or initializes an IndexArchiveBundle with the provided parameters
     *
     * Creates the bundle folder (and its posting_doc_shards subfolder) if
     * the folder does not exist, reads generation.txt if present or else
     * writes a fresh one with ACTIVE = 0, and then opens the summaries
     * WebArchiveBundle and the IndexDictionary stored under the folder.
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive forwarded as the second argument of
     *      the summaries WebArchiveBundle constructor
     * @param string $description a text name/serialized info about this
     *      IndexArchiveBundle; forwarded to the summaries WebArchiveBundle
     * @param int $num_docs_per_generation number of documents a generation
     *      may hold before a new generation is started
     */
    function __construct($dir_name, $read_only_archive = true,
        $description = NULL, $num_docs_per_generation = NUM_DOCS_PER_GENERATION)
    {
        $this->dir_name = $dir_name;
        if(!is_dir($this->dir_name)) {
            mkdir($this->dir_name);
            mkdir($this->dir_name."/posting_doc_shards");
        }
        if(file_exists($this->dir_name."/generation.txt")) {
            $this->generation_info = unserialize(
                file_get_contents($this->dir_name."/generation.txt"));
        } else {
            $this->generation_info['ACTIVE'] = 0;
            file_put_contents($this->dir_name."/generation.txt",
                serialize($this->generation_info));
        }
        $this->summaries = new WebArchiveBundle($dir_name."/summaries",
            $read_only_archive, -1, $description);
        $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");
        $this->description = $this->summaries->description;
        $this->num_docs_per_generation = $num_docs_per_generation;
        $this->dictionary = new IndexDictionary($this->dir_name."/dictionary");
    }

    /**
     * Add the array of $pages to the summaries WebArchiveBundle pages being
     * stored in the partition $generation and the field used
     * to store the resulting offsets given by $offset_field.
     *
     * @param int $generation field used to select partition
     * @param string $offset_field field used to record offsets after storing
     * @param array &$pages data to store
     * @param int $visited_urls_count number to add to the count of visited
     *      urls (visited urls is a smaller number than the total count of
     *      objects stored in the index).
     */
    function addPages($generation, $offset_field, &$pages,
        $visited_urls_count)
    {
        $this->summaries->setWritePartition($generation);
        $this->summaries->addPages($offset_field, $pages);
        $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
    }

    /**
     * Adds the provided mini inverted index data to the IndexArchiveBundle
     * Expects initGenerationToAdd to be called before, so generation is
     * correct. Logs memory usage and elapsed time of the append.
     *
     * @param object $index_shard a mini inverted index of word_key=>doc data
     *      to add to this IndexArchiveBundle
     */
    function addIndexData($index_shard)
    {
        crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
        $start_time = microtime();
        $this->getActiveShard()->appendIndexShard($index_shard);
        crawlLog("Append Index Shard: Memory usage:".memory_get_usage() .
            " Time: ".(changeInMicrotime($start_time)));
    }

    /**
     * Determines, based on document counts, whether $index_shard can be
     * added to the active generation or whether a new generation should be
     * started. If the active shard's documents plus the new shard's
     * documents would exceed num_docs_per_generation, the active shard is
     * saved and its words are added to the bundle's dictionary, ACTIVE is
     * incremented, a new empty shard becomes the current shard, and
     * generation.txt is rewritten.
     *
     * @param object $index_shard a mini inverted index of word_key=>doc data
     * @return int the active generation after the check and possible change
     *      has been performed
     */
    function initGenerationToAdd($index_shard)
    {
        $current_num_docs = $this->getActiveShard()->num_docs;
        $add_num_docs = $index_shard->num_docs;
        if($current_num_docs + $add_num_docs > $this->num_docs_per_generation){
            $switch_time = microtime();
            $this->saveAndAddCurrentShardDictionary();
            //Set up new shard
            $this->generation_info['ACTIVE']++;
            $this->generation_info['CURRENT'] =
                $this->generation_info['ACTIVE'];
            $current_index_shard_file = $this->dir_name.
                "/posting_doc_shards/index". $this->generation_info['ACTIVE'];
            $this->current_shard = new IndexShard(
                $current_index_shard_file, $this->generation_info['ACTIVE'],
                $this->num_docs_per_generation);
            file_put_contents($this->dir_name."/generation.txt",
                serialize($this->generation_info));
            crawlLog("Switch Shard time:".changeInMicrotime($switch_time));
        }
        return $this->generation_info['ACTIVE'];
    }

    /**
     * Saves the active index shard to disk, then adds the words from this
     * shard to the dictionary
     */
    function saveAndAddCurrentShardDictionary()
    {
        // Save current shard dictionary to main dictionary
        $this->forceSave();
        $current_index_shard_file = $this->dir_name.
            "/posting_doc_shards/index". $this->generation_info['ACTIVE'];
        /* want to do the copying of dictionary as files to conserve memory
           in case merge tiers after adding to dictionary. The shard is
           therefore re-created with the fourth constructor argument set
           (presumably its read-from-disk mode -- confirm in IndexShard).
         */
        $this->current_shard = new IndexShard(
            $current_index_shard_file, $this->generation_info['ACTIVE'],
            $this->num_docs_per_generation, true);
        $this->dictionary->addShardDictionary($this->current_shard);
    }

    /**
     * Sets the current shard to be the active shard (the active shard is
     * what we call the last (highest indexed) shard in the bundle). Then
     * returns a reference to this shard.
     *
     * The shard object is always obtained through getCurrentShard() so that
     * an active shard already saved on disk is loaded rather than replaced
     * by a new, empty shard when no shard object is cached yet.
     *
     * @return object last shard in the bundle
     */
    function getActiveShard()
    {
        /* Selects ACTIVE as CURRENT (dropping any cached shard of another
           generation) and marks the shard as not disk based; the return
           value is not needed because getCurrentShard() handles both the
           cached and the not-yet-loaded case.
         */
        $this->setCurrentShard($this->generation_info['ACTIVE']);
        return $this->getCurrentShard();
    }

    /**
     * Returns the shard which is currently being used to read word-document
     * data from the bundle. If one wants to write data to the bundle use
     * getActiveShard() instead. The point of this method is to allow
     * for lazy reading of the file associated with the shard.
     *
     * If no shard object is cached, CURRENT defaults to ACTIVE when unset.
     * An existing shard file is then either opened disk based (when the
     * DISK_BASED flag is set) or loaded with IndexShard::load; if the file
     * does not exist a new IndexShard is created.
     *
     * @return object the index shard currently being used
     */
    function getCurrentShard()
    {
        if(!isset($this->current_shard)) {
            if(!isset($this->generation_info['CURRENT'])) {
                $this->generation_info['CURRENT'] =
                    $this->generation_info['ACTIVE'];
            }
            $current_index_shard_file = $this->dir_name.
                "/posting_doc_shards/index". $this->generation_info['CURRENT'];
            if(file_exists($current_index_shard_file)) {
                if(isset($this->generation_info['DISK_BASED']) &&
                    $this->generation_info['DISK_BASED'] == true) {
                    $this->current_shard = new IndexShard(
                        $current_index_shard_file,
                        $this->generation_info['CURRENT'],
                        $this->num_docs_per_generation, true);
                    $this->current_shard->getShardHeader();
                    $this->current_shard->read_only_from_disk = true;
                } else {
                    $this->current_shard =
                        IndexShard::load($current_index_shard_file);
                }
            } else {
                $this->current_shard = new IndexShard(
                    $current_index_shard_file,
                    $this->generation_info['CURRENT'],
                    $this->num_docs_per_generation);
            }
        }
        return $this->current_shard;
    }

    /**
     * Sets the current shard to be the $i th shard in the index bundle.
     *
     * The DISK_BASED flag is always updated. The current generation is only
     * changed (and the cached shard object dropped) when CURRENT is not yet
     * set, or when $i differs from CURRENT and does not exceed ACTIVE.
     *
     * @param int $i which shard to set the current shard to be
     * @param bool $disk_based whether to read the whole shard in before
     *      using or leave it on disk except for pages need and use memcache
     * @return bool true if the current generation was changed, false if
     *      it was left as is
     */
    function setCurrentShard($i, $disk_based = false)
    {
        $this->generation_info['DISK_BASED'] = $disk_based;
        if(isset($this->generation_info['CURRENT']) &&
            ($i == $this->generation_info['CURRENT'] ||
            $i > $this->generation_info['ACTIVE'])) {
            return false;
        } else {
            $this->generation_info['CURRENT'] = $i;
            unset($this->current_shard);
            return true;
        }
    }

    /**
     * Gets the page out of the summaries WebArchiveBundle with the given
     * offset and generation
     *
     * @param int $offset byte offset in partition of desired page
     * @param int $generation which generation WebArchive to look up in
     *      defaults to the same number as the current shard (or the active
     *      generation if no current shard has been selected yet)
     * @return array desired page
     */
    function getPage($offset, $generation = -1)
    {
        if($generation == -1) {
            // CURRENT is unset until a shard has been selected
            $generation = isset($this->generation_info['CURRENT']) ?
                $this->generation_info['CURRENT'] :
                $this->generation_info['ACTIVE'];
        }
        return $this->summaries->getPage($offset, $generation);
    }

    /**
     * Forces the active shard to be saved
     */
    function forceSave()
    {
        $this->getActiveShard()->save(false, true);
    }

    /**
     * Computes the words which appear in the fewest or most documents
     *
     * For each word key the counts found at index 3 of each of its
     * dictionary entries are summed; words not in the dictionary count as 0.
     *
     * @param array $word_keys keys of words to select amongst
     * @param int $num number of words from the above set to return
     * @param string $comparison callback function name for how to compare
     *      words (the default lessThan is in utility.php)
     * @return array word_key => count pairs for the first $num words in
     *      $comparison order; NULL if $word_keys is not a non-empty array
     */
    function getSelectiveWords($word_keys, $num, $comparison="lessThan")
    {
        if(!is_array($word_keys) || count($word_keys) < 1) { return NULL;}
        $words_array = array();
        foreach($word_keys as $word_key) {
            $tmp = $this->dictionary->getWordInfo($word_key);
            $count = 0;
            if($tmp !== false) {
                foreach($tmp as $entry) {
                    $count += $entry[3];
                }
            }
            $words_array[$word_key] = $count;
        }
        uasort($words_array, $comparison);
        /* preserve_keys = true: keys PHP stores as integers would otherwise
           be renumbered and the word they belong to would be lost
         */
        return array_slice($words_array, 0, $num, true);
    }

    /**
     * Turns on memcache use for the dictionary and/or for the postings and
     * documents of every shard of the bundle.
     *
     * @param bool $dict whether to call setMemcache on the dictionary
     * @param bool $shards whether to call setMemcachePostingsDocs on each
     *      shard from generation 0 up to the active generation
     */
    function setMemcache($dict = true, $shards = false)
    {
        if($dict) {
            $this->dictionary->setMemcache();
        }
        if($shards) {
            $this->callOnEachShard("setMemcachePostingsDocs");
        }
    }

    /**
     * Undoes setMemcache: calls removeMemcache on the dictionary and
     * removeMemcachePostingsDocs on each shard from generation 0 up to the
     * active generation.
     */
    function removeMemcache()
    {
        $this->dictionary->removeMemcache();
        $this->callOnEachShard("removeMemcachePostingsDocs");
    }

    /**
     * Helper for the memcache methods: selects each generation 0..ACTIVE
     * in turn as the disk based current shard, calls the named no-argument
     * method on it, and finally re-selects the generation that was current
     * before the call.
     *
     * @param string $shard_method name of the IndexShard method to call
     */
    private function callOnEachShard($shard_method)
    {
        $active_index = $this->generation_info['ACTIVE'];
        /* Must be read before the loop changes CURRENT; an unset CURRENT
           means the active generation (see getCurrentShard)
         */
        $old_current_index = isset($this->generation_info['CURRENT']) ?
            $this->generation_info['CURRENT'] : $active_index;
        for($i = 0; $i <= $active_index; $i++) {
            $this->setCurrentShard($i, true);
            $this->getCurrentShard()->$shard_method();
        }
        $this->setCurrentShard($old_current_index, true);
    }

    /**
     * Gets the description, count of summaries, and number of partitions of
     * the summaries store in the supplied directory. If the file
     * arc_description.txt exists, this is viewed as a dummy index archive
     * for the sole purpose of allowing conversions of downloaded data such
     * as arc files into Yioop! format; in that case all counts are 0 and
     * DESCRIPTION is a serialized array holding the first 256 bytes of the
     * file (DESCRIPTION) and ARCFILE = true.
     *
     * @param string $dir_name path to a directory containing a summaries
     *      WebArchiveBundle
     * @return array summary of the given archive
     */
    static function getArchiveInfo($dir_name)
    {
        if(file_exists($dir_name."/arc_description.txt")) {
            $crawl = array();
            $info = array();
            $crawl['DESCRIPTION'] = substr(
                file_get_contents($dir_name."/arc_description.txt"), 0, 256);
            $crawl['ARCFILE'] = true;
            $info['VISITED_URLS_COUNT'] = 0;
            $info['COUNT'] = 0;
            $info['NUM_DOCS_PER_PARTITION'] = 0;
            $info['WRITE_PARTITION'] = 0;
            $info['DESCRIPTION'] = serialize($crawl);
            return $info;
        }
        return WebArchiveBundle::getArchiveInfo($dir_name."/summaries");
    }

    /**
     * Sets the archive info (DESCRIPTION, COUNT,
     * NUM_DOCS_PER_PARTITION) for the web archive bundle associated with
     * this bundle. As DESCRIPTION is used to store info about the info
     * bundle this sets the global properties of the info bundle as well.
     *
     * @param string $dir_name folder with archive bundle
     * @param array $info struct with above fields
     */
    static function setArchiveInfo($dir_name, $info)
    {
        WebArchiveBundle::setArchiveInfo($dir_name."/summaries", $info);
    }

    /**
     * Returns the last time the archive info of the bundle was modified,
     * as reported by the summaries WebArchiveBundle.
     *
     * @param string $dir_name folder with archive bundle
     * @return mixed whatever WebArchiveBundle::getParamModifiedTime returns
     *      for the summaries subfolder
     */
    static function getParamModifiedTime($dir_name)
    {
        return WebArchiveBundle::getParamModifiedTime($dir_name."/summaries");
    }
}
?>