<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009, 2010, 2011 Chris Pollett hide@address.com
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Chris Pollett hide@address.com
* @package seek_quarry
* @subpackage model
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009, 2010, 2011
* @filesource
*/
/**
* Guard against this file being requested directly: when BASE_DIR has not
* been defined by the including script, print BAD REQUEST and stop.
*/
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
* Logging is done during a crawl, not through the web interface,
* so it will not be used in the phrase model. Unless POST_PROCESSING
* has been defined, LOG_TO_FILES is therefore defined as false here.
*/
if(!defined("POST_PROCESSING")) {
define("LOG_TO_FILES", false);
}
/** For crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/** For extractPhrasesAndCount function */
require_once BASE_DIR."/lib/phrase_parser.php";
/**
* Used to look up words and phrases in the inverted index
* associated with a given crawl
*/
require_once BASE_DIR."/lib/index_archive_bundle.php";
/**
* Load FileCache class in case used
*/
require_once(BASE_DIR."/lib/file_cache.php");
/**
* Load iterators to get docs out of index archive
*/
foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php")
as $filename) {
require_once $filename;
}
/**
*
* This class is used to handle
* results for a given phrase search
*
* @author Chris Pollett
* @package seek_quarry
* @subpackage model
*/
class PhraseModel extends Model
{
/** used to hold the name of index archive to look summaries up in
* @var string
*/
var $index_name;
/** an associative array of additional meta words and
* the max description length of results if such a meta word is used
* this array is typically set in index.php
*
* @var array
*/
var $additional_meta_words;
/**
* Used to hold query statistics about the current query
* @var array
*/
var $query_info;
/**
* Number of pages to cache in one go in memcache or filecache
* Size chosen based on 1MB max object size for memcache or filecache
*/
const NUM_CACHE_PAGES = 10;
/**
* Sets up the phrase model by handing the database name straight to the
* parent Model constructor; no other initialization is done here.
*
* @param string $db_name name of the database to use; defaults to the
*      DB_NAME constant
*/
function __construct($db_name = DB_NAME)
{
parent::__construct($db_name);
}
/**
 * Checks whether an index with the provided timestamp is present on disk.
 *
 * The index is looked for at CRAWL_DIR/cache/IndexData<timestamp>.
 *
 * @param int $index_time_stamp timestamp of the index to look for
 * @return bool true if a file or folder exists at that path, false otherwise
 */
function indexExists($index_time_stamp)
{
    $index_path = CRAWL_DIR.'/cache/IndexData'.$index_time_stamp;
    $is_present = file_exists($index_path);
    return $is_present;
}
/**
 * Rewrites a mix query so that it maps directly to a query about crawls
 *
 * The query is split on "|" into disjuncts. For every group of the mix and
 * every disjunct:
 *  - a disjunct that already names an index (index: or i:) is copied
 *    through unchanged;
 *  - otherwise any weight: / w: meta word is removed from the disjunct and
 *    used as a base weight, and the disjunct is emitted once per component
 *    of the group with that component's keywords appended, its weight
 *    multiplied by the base weight, and its crawl timestamp as the index.
 * Each group is terminated by a " #n# " presentation marker where n is the
 * group's RESULT_BOUND (at least 1).
 *
 * Meta words are looked up in the disjunct being rewritten, so a weight
 * or index given in one disjunct does not leak into the others.
 *
 * @param string $query the original before a rewrite
 * @param array $mix a mix description saying how the mix is built out of
 *      crawls; the fields read are GROUPS, and per group RESULT_BOUND and
 *      COMPONENTS (each with WEIGHT, CRAWL_TIMESTAMP, optional KEYWORDS)
 *
 * @return string a rewritten query in terms of crawls; the empty string if
 *      the mix has no GROUPS
 */
function rewriteMixQuery($query, $mix)
{
    $rewrite = "";
    if(!isset($mix['GROUPS'])) {
        return $rewrite;
    }
    $disjunct_phrases = explode("|", $query);
    foreach($mix['GROUPS'] as $group) {
        $pipe = "";
        foreach($disjunct_phrases as $disjunct) {
            $rewrite .= $pipe;
            $pipe = ' | ';
            // a disjunct which already picks its index is left alone
            if(preg_match("/(\s)((index|i):(\S)+)/", $disjunct)) {
                $rewrite .= $disjunct;
                continue;
            }
            $disjunct_string = $disjunct;
            $base_weight = 1;
            // weight: is handled before w: so that w: wins if both occur
            foreach(array("weight:", "w:") as $weight_word) {
                $pattern = "/(\s)($weight_word(\S)+)/";
                if(!preg_match($pattern, $disjunct, $matches)) {
                    continue;
                }
                $weight_value = substr($matches[2], strlen($weight_word));
                /* only numeric weights can take part in the multiplication
                   below; anything else is dropped from the query and the
                   current base weight is kept
                 */
                if(is_numeric($weight_value)) {
                    $base_weight = $weight_value + 0;
                }
                $disjunct_string =
                    preg_replace($pattern, "", $disjunct_string);
            }
            if(!isset($group['COMPONENTS'])) {
                continue;
            }
            $pipe2 = "";
            foreach($group['COMPONENTS'] as $component) {
                $component_query = $disjunct_string;
                if(isset($component['KEYWORDS'])) {
                    $component_query .= " ".$component['KEYWORDS'];
                }
                $rewrite .= $pipe2.$component_query." w:".
                    ($component['WEIGHT']*$base_weight)." i:".
                    $component['CRAWL_TIMESTAMP'];
                $pipe2 = ' | ';
            }
        }
        $num_results = (isset($group['RESULT_BOUND']) &&
            $group['RESULT_BOUND'] > 1) ?
            $group['RESULT_BOUND'] : 1;
        $rewrite .= " #$num_results# ";
    }
    return $rewrite;
}
/**
 * Given a query phrase, returns formatted document summaries of the
 * documents that match the phrase.
 *
 * The phrase is first split on "#n#" presentation markers; the text before
 * each marker is a sub-query that should supply the next n result
 * positions (1 if no marker follows). Each distinct sub-query is split on
 * "|" into disjuncts, parsed with parseWordStructConjunctiveQuery and
 * evaluated with getSummariesByHash, and its pages are placed into the
 * result positions reserved for it. The last sub-query is stretched so
 * that it can fill the remainder of the requested page.
 *
 * @param string $input_phrase the phrase to try to match
 * @param int $low return results beginning with the $low document
 * @param int $results_per_page how many results to return
 * @param bool $format whether to highlight in the returned summaries the
 *      matched text
 * @param array $filter an array of hashes of domains to filter from
 *      results
 * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
 *      an attempt will be made to look up the results in either
 *      the file cache or memcache. Otherwise, items will be recomputed
 *      and then potentially restored in cache
 * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
 *      no grouping but page look-up for links, ($raw == 2)
 *      no grouping done on data
 *
 * @return array an array of summary data, as produced by
 *      formatPageResults
 */
function getPhrasePageResults(
    $input_phrase, $low = 0, $results_per_page = NUM_RESULTS_PER_PAGE,
    $format = true, $filter = NULL, $use_cache_if_allowed = true,
    $raw = 0)
{
    if(QUERY_STATISTICS) {
        $indent= " ";
        $in2 = $indent . $indent;
        $prs_cnt = 0;
        $dis_cnt = 0;
        $this->query_info = array();
        $this->query_info['QUERY'] =
            "<b>PHRASE QUERY</b>: ".$input_phrase."<br />";
        $start_time = microtime();
        // so the format timing below is defined even for an empty query
        $format_time = $start_time;
    }
    $results = NULL;
    /* NOTE(review): only the words of the last disjunct parsed end up
       being used for highlighting; kept as in the original.
     */
    $format_words = array();
    /*
        this is a quick and dirty parsing and will usually work,
        exceptions would be # or | in quotes or if someone tried
        to escape |.
        First we split into presentation elements then we split by
        disjuncts. The whole number is captured: with (\d)+ only the
        last digit of a bound such as #10# would be kept.
    */
    $presentation_parts = preg_split('/#(\d+)#/',
        $input_phrase, -1, PREG_SPLIT_DELIM_CAPTURE);
    // pairs of (sub-query text, number of results it should supply)
    $presentation_parts = array_chunk($presentation_parts, 2);
    $count = 0;
    $query_parts = array();
    $last_part = NULL;
    foreach($presentation_parts as $part) {
        $trimmed = isset($part[0]) ? trim($part[0]) : "";
        if($trimmed === "") {
            continue;
        }
        $to_return = isset($part[1]) ? $part[1] : 1;
        // each entry is (first result position, number of positions)
        $query_parts[$trimmed][] = array($count, $to_return);
        $last_part = $trimmed;
        $count += $to_return;
    }
    $results_high = $low + $results_per_page;
    if($last_part !== NULL) {
        /* if the presentation ends before the requested window starts,
           let the final sub-query supply results up to $results_high
         */
        $last_index = count($query_parts[$last_part]) - 1;
        $last_bound = $query_parts[$last_part][$last_index];
        if($last_bound[0] + $last_bound[1] < $low) {
            $query_parts[$last_part][$last_index][1] = $results_high;
        }
    }
    foreach($query_parts as $phrase => $pre_result_bounds) {
        $result_bounds = array();
        foreach($pre_result_bounds as $bound) {
            //rest of presentation after what we'll return so break
            if($bound[0] > $results_high) {
                break;
            }
            // this part of presentation is before what we'll return so skip
            if($bound[0] + $bound[1] < $low) {
                continue;
            }
            $result_bounds[] = $bound;
        }
        $num_bounds = count($result_bounds);
        if($num_bounds == 0) {
            continue;
        }
        if($phrase == $last_part &&
            $result_bounds[$num_bounds - 1][0] +
            $result_bounds[$num_bounds - 1][1] < $results_high) {
            $result_bounds[$num_bounds - 1][1] = $results_high -
                $result_bounds[$num_bounds - 1][0];
        }
        /* the original expression max(min($phrase_high, $results_high),
           $results_high) - $low always evaluates to this
         */
        $phrase_num = $results_high - $low;
        $disjunct_phrases = explode("|", $phrase);
        $word_structs = array();
        if(QUERY_STATISTICS) {
            $this->query_info['QUERY'] .= $indent .
                "<b>Presentation $prs_cnt:</b><br />";
            $this->query_info['QUERY'] .= "$in2<i>Low</i>:".
                $result_bounds[0][0]."<br />";
            $this->query_info['QUERY'] .= $in2 .
                "<i>High</i>: ".$result_bounds[0][1]."<br />";
            $prs_cnt++;
        }
        foreach($disjunct_phrases as $disjunct) {
            if(QUERY_STATISTICS) {
                $this->query_info['QUERY'] .= "$in2<b>Disjunct $dis_cnt:"
                    . "</b><br />";
                $dis_cnt++;
            }
            list($word_struct, $format_words) =
                $this->parseWordStructConjunctiveQuery($disjunct);
            if($word_struct != NULL) {
                $word_structs[] = $word_struct;
            }
        }
        if(QUERY_STATISTICS) {
            $this->query_info['QUERY'] .=
                "$in2<b>Presentation Parse time</b>: " .
                changeInMicrotime($start_time)."<br />";
            $summaries_time = microtime();
        }
        $out_results = $this->getSummariesByHash($word_structs,
            $low, $phrase_num, $filter, $use_cache_if_allowed, $raw);
        if(isset($out_results['PAGES']) &&
            count($out_results['PAGES']) != 0) {
            // deal the pages out into the positions reserved for them
            $out_count = 0;
            foreach($result_bounds as $bound) {
                $bound_end = min($bound[0] + $bound[1], $results_high);
                for($i = $bound[0]; $i < $bound_end; $i++) {
                    if(isset($out_results['PAGES'][$out_count])) {
                        $results['PAGES'][$i] =
                            $out_results['PAGES'][$out_count];
                        $out_count++;
                    }
                }
            }
            if($phrase == $last_part && isset($out_results['TOTAL_ROWS'])){
                $total_rows = $out_results['TOTAL_ROWS'];
            }
        }
        if(QUERY_STATISTICS) {
            $this->query_info['QUERY'] .= "$in2<b>Get Summaries time</b>: ".
                changeInMicrotime($summaries_time)."<br />";
            $format_time = microtime();
        }
    }
    if(isset($results['PAGES'])){
        ksort($results['PAGES']);
        $results["PAGES"] = array_values($results["PAGES"]);
    }
    if(empty($results)) {
        // nothing matched: report zero rows rather than counting NULL
        $results = array('TOTAL_ROWS' => 0);
        $total_rows = 0;
    }
    if(isset($total_rows)) {
        $results['TOTAL_ROWS'] = $total_rows;
    } else {
        $results['TOTAL_ROWS'] = count($results['PAGES']);
    }
    if(!$format || count($format_words) == 0) {
        $format_words = NULL;
    }
    $description_length = self::DEFAULT_DESCRIPTION_LENGTH;
    if(isset($this->additional_meta_words) &&
        is_array($this->additional_meta_words)) {
        foreach($this->additional_meta_words as $meta_word => $length){
            $pattern = "/$meta_word/";
            if(preg_match($pattern, $input_phrase)) {
                $description_length = $length;
                break; // only match the first found
            }
        }
    }
    $output = $this->formatPageResults($results, $format_words,
        $description_length);
    if(QUERY_STATISTICS) {
        $this->query_info['QUERY'] .= "<b>Format time</b>: ".
            changeInMicrotime($format_time)."<br />";
        $this->query_info['ELAPSED_TIME'] = changeInMicrotime($start_time);
        $this->db->total_time += $this->query_info['ELAPSED_TIME'];
        $this->db->query_log[] = $this->query_info;
    }
    return $output;
}
/**
 * Determines the offset into the summaries WebArchiveBundle of the
 * provided url so that the info:url summary can be retrieved.
 * This assumes of course that the info:url meta word has been stored.
 *
 * Only the first document returned for the info:url word is used.
 *
 * @param string $url what to lookup
 * @return mixed array (summary offset, generation, cache page partition)
 *      taken from the first matching document, or false if the word
 *      iterator returns no documents
 */
function lookupSummaryOffsetGeneration($url)
{
    $index_archive_name = self::index_data_base_name . $this->index_name;
    $index_archive = new IndexArchiveBundle(
        CRAWL_DIR.'/cache/'.$index_archive_name);
    $word_iterator =
        new WordIterator(crawlHash("info:$url"), $index_archive);
    $next_docs = $word_iterator->nextDocsWithWord();
    if(!is_array($next_docs) || count($next_docs) == 0) {
        return false;
    }
    $doc_info = reset($next_docs);
    return array(
        $doc_info[CrawlConstants::SUMMARY_OFFSET],
        $doc_info[CrawlConstants::GENERATION],
        $doc_info[CrawlConstants::SUMMARY][
            CrawlConstants::CACHE_PAGE_PARTITION]);
}
/**
 * Parses from a string phrase representing a conjunctive query, a struct
 * consisting of the words keys searched for, the allowed and disallowed
 * phrases, the weight that should be put on these query results, and
 * which archive to use.
 *
 * Meta words (link:, site:, ... and any additional meta words) are pulled
 * out of the phrase and searched for as-is; i:/index: selects the index,
 * w:/weight: sets the weight, and every term prefixed with - becomes a
 * disallowed term. What remains is stripped of punctuation and split into
 * (possibly stemmed) search words.
 *
 * @param string $phrase string to extract struct from
 * @return array pair (word_struct, format_words). word_struct is NULL if
 *      no search words were found, otherwise an associative array with
 *      fields KEYS, RESTRICT_PHRASES, DISALLOW_KEYS, WEIGHT and
 *      INDEX_ARCHIVE; format_words are the unstemmed query words
 *      followed by the stemmed ones
 */
function parseWordStructConjunctiveQuery($phrase)
{
    // patterns below need whitespace in front of each meta word
    $phrase = " ".$phrase;
    $phrase = $this->parseIfConditions($phrase);
    $phrase_string = $phrase;
    $meta_words = array('link:', 'site:', 'version:', 'modified:',
        'filetype:', 'info:', '\-', 'os:', 'server:', 'date:',
        'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:',
        'lang:', 'media:', 'elink:', 'location:');
    if(isset($this->additional_meta_words)) {
        $meta_words = array_merge($meta_words, array_keys(
            $this->additional_meta_words));
    }
    $index_name = $this->index_name;
    $weight = 1;
    $found_metas = array();
    $disallow_phrases = array();
    foreach($meta_words as $meta_word) {
        $pattern = "/(\s)($meta_word(\S)+)/";
        preg_match_all($pattern, $phrase, $matches);
        if($meta_word == '\-') {
            /* keep every negated term: all of them are stripped from
               the query string below, so taking only the first would
               silently drop the rest
             */
            foreach($matches[2] as $negated_term) {
                $disallow_phrases[] = substr($negated_term, 1);
            }
        } else if($meta_word == 'i:' || $meta_word == 'index:') {
            /* NOTE(review): the index name comes from the query text and
               is used below to build a path under CRAWL_DIR/cache --
               confirm callers restrict it to valid index names
             */
            if(isset($matches[2][0])) {
                $index_name = substr($matches[2][0], strlen($meta_word));
            }
        } else if($meta_word == 'w:' || $meta_word == 'weight:') {
            if(isset($matches[2][0])) {
                $weight = substr($matches[2][0], strlen($meta_word));
            }
        } else {
            $found_metas = array_merge($found_metas, $matches[2]);
        }
        $phrase_string = preg_replace($pattern, "", $phrase_string);
    }
    $index_archive_name = self::index_data_base_name . $index_name;
    $index_archive = new IndexArchiveBundle(
        CRAWL_DIR.'/cache/'.$index_archive_name);
    $phrase_string = mb_ereg_replace(PUNCT, " ", $phrase_string);
    $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string);
    /*
        we search using the stemmed/char-grammed words, but we format
        snippets in the results by bolding either
     */
    $query_words = explode(" ", $phrase_string); //not stemmed
    $base_words =
        PhraseParser::extractPhrases($phrase_string, MAX_PHRASE_LEN,
            getLocaleTag()); //stemmed, if have stemmer
    $words = array_merge($base_words, $found_metas);
    if(QUERY_STATISTICS) {
        $indent= " ";
        $in2 = $indent . $indent;
        $in3 = $in2 . $indent;
        $in4 = $in2. $in2;
        $this->query_info['QUERY'] .= "$in3<i>Index</i>: ".
            $index_archive_name."<br />";
        $this->query_info['QUERY'] .= "$in3<i>LocaleTag</i>: ".
            getLocaleTag()."<br />";
        $this->query_info['QUERY'] .=
            "$in3<i>Stemmed/Char-grammed Words</i>:<br />";
        foreach($base_words as $word){
            $this->query_info['QUERY'] .= "$in4$word<br />";
        }
        $this->query_info['QUERY'] .= "$in3<i>Meta Words</i>:<br />";
        foreach($found_metas as $word){
            $this->query_info['QUERY'] .= "$in4$word<br />";
        }
    }
    $word_struct = NULL;
    if(count($words) == 1 && count($disallow_phrases) < 1) {
        // single word, nothing to exclude: no phrase restrictions needed
        $word_struct = array("KEYS" => array(crawlHash($words[0])),
            "RESTRICT_PHRASES" => NULL, "DISALLOW_KEYS" => array(),
            "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive
        );
    } else {
        /*
            handle strings in quotes
            (we want an exact match on such quoted strings)
        */
        $quoteds = array();
        preg_match_all('/\"((?:[^\"\\\]|\\\\.)*)\"/', $phrase, $quoteds);
        if(isset($quoteds[1])) {
            $quoteds = $quoteds[1];
        }
        //get a raw list of words and their hashes
        $hashes = array();
        foreach($words as $word) {
            $hashes[] = crawlHash($word);
        }
        $word_keys = (count($hashes) > 0) ?
            array_slice($hashes, 0, MAX_QUERY_TERMS) : NULL;
        $restrict_phrases = array_filter(array_unique($quoteds));
        $index_archive->setCurrentShard(0, true);
        $disallow_keys = array();
        $num_disallow_keys = min(MAX_QUERY_TERMS, count($disallow_phrases));
        for($i = 0; $i < $num_disallow_keys; $i++) {
            $disallow_stem = array_keys(
                PhraseParser::extractPhrasesAndCount(
                    $disallow_phrases[$i], 2, getLocaleTag())); //stemmed
            // a term that yields no stem has nothing to exclude
            if(isset($disallow_stem[0])) {
                $disallow_keys[] = crawlHash($disallow_stem[0]);
            }
        }
        if($word_keys !== NULL) {
            $word_struct = array("KEYS" => $word_keys,
                "RESTRICT_PHRASES" => $restrict_phrases,
                "DISALLOW_KEYS" => $disallow_keys,
                "WEIGHT" => $weight,
                "INDEX_ARCHIVE" => $index_archive
            );
        }
    }
    $format_words = array_merge($query_words, $base_words);
    return array($word_struct, $format_words);
}
/**
 * Evaluates any if: conditional meta-words in the query string to
 * calculate a new query string.
 *
 * A conditional has the form if:needle!then or if:needle!then!else and
 * must be preceded by whitespace. All conditionals are first removed from
 * the phrase. Then, in order, if needle occurs (case-insensitively) in the
 * query built so far, " then" is appended to it, otherwise " else" is
 * appended when an else part was given. A + in the then/else part stands
 * for a space. Conditionals without a ! are ignored.
 *
 * @param string $phrase original query string
 * @return string query string after if: meta words have been evaluated
 */
function parseIfConditions($phrase)
{
    $cond_token = "if:";
    $pattern = "/(\s)($cond_token(\S)+)/";
    preg_match_all($pattern, $phrase, $found);
    $query = preg_replace($pattern, "", $phrase);
    $token_length = strlen($cond_token);
    foreach($found[2] as $conditional) {
        $parts = explode("!", substr($conditional, $token_length));
        if(count($parts) >= 2) {
            $addition = NULL;
            if(stripos($query, $parts[0]) !== false) {
                $addition = $parts[1];
            } else if(isset($parts[2])) {
                $addition = $parts[2];
            }
            if($addition !== NULL) {
                $query .= " ".str_replace("+", " ", $addition);
            }
        }
    }
    return $query;
}
/**
 * Given a page summary, extracts its words and asks the index archive of
 * the current index for the $num most selective of them. The algorithm
 * for "relevant" is pretty weak: selection is delegated to the archive's
 * getSelectiveWords method, called with the "greaterThan" comparison.
 *
 * @param string $crawl_item a page summary
 * @param int $num number of key phrase to return
 * @return array an array of most selective key phrases
 */
function getTopPhrases($crawl_item, $num)
{
    $archive_path = CRAWL_DIR.'/cache/'.
        self::index_data_base_name . $this->index_name;
    $index_archive = new IndexArchiveBundle($archive_path);
    $summary_text =
        PhraseParser::extractWordStringPageSummary($crawl_item);
    $words =
        array_keys(PhraseParser::extractPhrasesAndCount($summary_text));
    // hash every word and remember which word each hash came from
    $hashes = array_map('crawlHash', $words);
    $word_for_hash = array();
    foreach($hashes as $position => $hash) {
        $word_for_hash[$hash] = $words[$position];
    }
    $selective =
        $index_archive->getSelectiveWords($hashes, $num, "greaterThan");
    $phrases = array();
    foreach(array_keys($selective) as $hash) {
        $phrases[] = $word_for_hash[$hash];
    }
    return $phrases;
}
/**
 * Gets doc summaries of documents containing given words and meeting the
 * additional provided criteria
 *
 * Results are computed (and cached) in slices of NUM_CACHE_PAGES pages.
 * The cache key of a slice is built from the word structs, the $raw
 * mode, the $filter and the slice start, so queries that differ in
 * grouping mode or filter never share cached slices.
 *
 * @param array $word_structs an array of word_structs. Here a word_struct
 *      is an associative array with at least the following fields
 *      KEYS -- an array of word keys
 *      RESTRICT_PHRASES -- an array of phrases the document must contain
 *      DISALLOW_KEYS -- an array of word keys the document must not
 *          contain
 *      WEIGHT -- a weight to multiple scores returned from this iterator by
 *      INDEX_ARCHIVE -- an index_archive object to get results from
 * @param int $limit number of first document in order to return
 * @param int $num number of documents to return summaries of
 * @param array &$filter an array of hashes of domains to filter from
 *      results
 * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
 *      an attempt will be made to look up the results in either
 *      the file cache or memcache. Otherwise, items will be recomputed
 *      and then potentially restored in cache
 * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
 *      no grouping but page look-up for links, ($raw == 2)
 *      no grouping done on data
 *
 * @return array document summaries: PAGES holds up to $num pages starting
 *      at $limit, TOTAL_ROWS the (possibly approximate) number of results
 */
function getSummariesByHash($word_structs, $limit, $num, &$filter,
    $use_cache_if_allowed = true, $raw = 0)
{
    global $CACHE;
    // round the window out to whole cache slices
    $to_retrieve = ceil(($limit + $num)/self::NUM_CACHE_PAGES) *
        self::NUM_CACHE_PAGES;
    $start_slice = floor(($limit)/self::NUM_CACHE_PAGES) *
        self::NUM_CACHE_PAGES;
    $results = array();
    if(USE_CACHE) {
        $mem_tmp = "";
        foreach($word_structs as $word_struct) {
            $mem_tmp .= serialize($word_struct["KEYS"]).
                serialize($word_struct["RESTRICT_PHRASES"]) .
                serialize($word_struct["DISALLOW_KEYS"]) .
                $word_struct["WEIGHT"] .
                $word_struct["INDEX_ARCHIVE"]->dir_name;
        }
        // the result set also depends on grouping mode and domain filter
        $mem_tmp .= ":".intval($raw).":".serialize($filter);
        if($use_cache_if_allowed) {
            $cache_success = true;
            $results['PAGES'] = array();
            for($i = $start_slice; $i < $to_retrieve;
                $i += self::NUM_CACHE_PAGES) {
                $slice = $CACHE->get(crawlHash($mem_tmp.":".$i));
                if($slice === false) {
                    $cache_success = false;
                    break;
                }
                $results['PAGES'] = array_merge($results['PAGES'],
                    $slice['PAGES']);
                $results['TOTAL_ROWS'] = $slice['TOTAL_ROWS'];
            }
            if($cache_success) {
                $results['PAGES'] = array_slice($results['PAGES'],
                    $limit - $start_slice, $num);
                return $results;
            }
            // partial cache hit: discard and recompute everything
            $results = array();
        }
    }
    $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw);
    $num_retrieved = 0;
    $pages = array();
    /* the count is checked before asking for more documents so that no
       batch is fetched only to be thrown away
     */
    while(is_object($query_iterator) && $num_retrieved < $to_retrieve &&
        is_array($next_docs = $query_iterator->nextDocsWithWord())) {
        foreach($next_docs as $doc_key => $doc_info) {
            $summary = isset($doc_info[CrawlConstants::SUMMARY]) ?
                $doc_info[CrawlConstants::SUMMARY] : NULL;
            unset($doc_info[CrawlConstants::SUMMARY]);
            if(!is_array($summary)) {
                continue;
            }
            // the crawl time is read from the description of the index
            $tmp = unserialize($query_iterator->getIndex(
                $doc_key)->description);
            $doc_info[self::CRAWL_TIME] = $tmp[self::CRAWL_TIME];
            $pages[] = array_merge($doc_info, $summary);
            $num_retrieved++;
        }
    }
    usort($pages, "scoreOrderCallback");
    if(is_object($query_iterator) && $num_retrieved >= $to_retrieve) {
        $results['TOTAL_ROWS'] = $query_iterator->num_docs;
        //this is only an approximation
    } else {
        $results['TOTAL_ROWS'] = $num_retrieved;
    }
    if(USE_CACHE) {
        // links are not kept in cached (or, in this mode, returned) pages
        $result_count = count($pages);
        for($i = 0; $i < $result_count; $i++){
            unset($pages[$i][self::LINKS]);
        }
        for($i = 0; $i < $to_retrieve; $i += self::NUM_CACHE_PAGES){
            $slice = array(
                'PAGES' => array_slice($pages, $i, self::NUM_CACHE_PAGES),
                'TOTAL_ROWS' => $results['TOTAL_ROWS']);
            $CACHE->set(crawlHash($mem_tmp.":".$i), $slice);
        }
    }
    $results['PAGES'] = array_slice($pages, $limit, $num);
    return $results;
}
/**
 * Using the supplied $word_structs, constructs an iterator for getting
 * results to a query
 *
 * For each word struct one WordIterator is made per distinct word key
 * and one NegationIterator per disallowed key; these are intersected
 * (unless there is only a single one) and, if the struct has phrase
 * restrictions, disallowed keys or a weight other than 1, wrapped in a
 * PhraseFilterIterator. The iterators of all word structs are unioned and
 * finally, unless $raw is 2, wrapped in a GroupIterator.
 *
 * @param array $word_structs an array of word_structs. Here a word_struct
 *      is an associative array with at least the following fields
 *      KEYS -- an array of word keys
 *      RESTRICT_PHRASES -- an array of phrases the document must contain
 *      DISALLOW_KEYS -- an array of word keys the document must not
 *          contain
 *      WEIGHT -- a weight to multiple scores returned from this iterator by
 *      INDEX_ARCHIVE -- an index_archive object to get results from
 * @param array &$filter an array of hashes of domains to filter from
 *      results
 * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
 *      no grouping but page look-up for links, ($raw == 2)
 *      no grouping done on data
 *
 * @return object an iterator for iterating through results to the
 *      query, or NULL if no word struct produced an iterator
 */
function getQueryIterator($word_structs, &$filter, $raw = 0)
{
    $iterators = array();
    /* NOTE(review): this ends up holding the distinct key count of the
       last word struct looked at, and is what GroupIterator receives --
       kept as in the original, confirm that is the intended value
     */
    $total_iterators = 0;
    foreach($word_structs as $word_struct) {
        if(!is_array($word_struct)) {
            continue;
        }
        $word_keys = $word_struct["KEYS"];
        /* array_unique keeps the original keys; reindex so the distinct
           keys (and hence the iterators) are numbered 0, 1, 2, ... even
           when a word is repeated in the query
         */
        $distinct_word_keys = array_values(array_unique($word_keys));
        $restrict_phrases = $word_struct["RESTRICT_PHRASES"];
        $disallow_keys = $word_struct["DISALLOW_KEYS"];
        $index_archive = $word_struct["INDEX_ARCHIVE"];
        $weight = $word_struct["WEIGHT"];
        $num_word_keys = count($word_keys);
        $total_iterators = count($distinct_word_keys);
        if($num_word_keys < 1) {
            continue;
        }
        $word_iterators = array();
        // maps position of a word in the query to the iterator for it
        $word_iterator_map = array();
        foreach($distinct_word_keys as $i => $distinct_key) {
            $word_iterators[$i] =
                new WordIterator($distinct_key, $index_archive,
                    false, $filter);
            foreach($word_keys as $index => $key) {
                // same string comparison array_unique uses
                if((string)$key === (string)$distinct_key) {
                    $word_iterator_map[$index] = $i;
                }
            }
        }
        $num_disallow_keys = count($disallow_keys);
        foreach($disallow_keys as $disallow_key) {
            $disallow_iterator =
                new WordIterator($disallow_key, $index_archive,
                    false, $filter);
            // appended so the iterator list has no gaps in its indices
            $word_iterators[] = new NegationIterator($disallow_iterator);
        }
        $num_word_keys += $num_disallow_keys;
        if($num_word_keys == 1) {
            $base_iterator = $word_iterators[0];
        } else {
            $base_iterator = new IntersectIterator(
                $word_iterators, $word_iterator_map);
        }
        if($restrict_phrases == NULL && $disallow_keys == array() &&
            $weight == 1) {
            $iterators[] = $base_iterator;
        } else {
            $iterators[] = new PhraseFilterIterator($base_iterator,
                $restrict_phrases, $weight);
        }
    }
    $num_iterators = count($iterators);
    if($num_iterators < 1) {
        return NULL;
    } else if($num_iterators == 1) {
        $union_iterator = $iterators[0];
    } else {
        $union_iterator = new UnionIterator($iterators);
    }
    $raw = intval($raw);
    if($raw == 2) {
        $group_iterator = $union_iterator;
    } else if($raw == 1) {
        $group_iterator =
            new GroupIterator($union_iterator, $total_iterators, true);
    } else {
        $group_iterator =
            new GroupIterator($union_iterator, $total_iterators);
    }
    return $group_iterator;
}
}
?>