<?php
/** 
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009, 2010, 2011  Chris Pollett hide@address.com
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett hide@address.com
 * @package seek_quarry
 * @subpackage library
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009, 2010, 2011
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**
 * Read in base class, if necessary
 */
require_once "persistent_structure.php";

/**
 * Load charCopy
 */
require_once "utility.php";

/**
 * Loads common constants for web crawling
 */
require_once  BASE_DIR.'/lib/crawl_constants.php';

/**
 * Data structure used to store one generation worth of the word document
 * index (inverted index).
 * This data structure consists of three main components: word entries,
 * word_doc entries, and document entries.
 *
 * Word entries are described in the documentation for the words field.
 *
 * Word-doc entries are described in the documentation for the word_docs field.
 *
 * Document entries are described in the documentation for the doc_infos field.
 * 
 * IndexShards also have two access modes: a $read_only_from_disk mode and 
 * a loaded-in-memory mode. Loaded-in-memory mode is mainly for writing new
 * data to the shard. When in memory, data in the shard can also be in one of 
 * two states: packed or unpacked. Roughly, when it is in a packed state it is 
 * ready to be serialized to disk; when it is in an unpacked state its methods 
 * for adding data can be used.
 *
 * Serialized on disk, a shard has a header with document statistics, followed
 * by a prefix index into the words component, followed by the word
 * component itself, then the word-docs component, and finally the document
 * component.
 * 
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage library
 */ 
 
class IndexShard extends PersistentStructure implements 
    CrawlConstants
{
    /**
     * Stores document id's and links to documents id's together with
     * summary offset information, and number of words in the doc/link
     * The format for a record is 4 byte offset, followed by
     * 3 bytes for the document length, followed by 1 byte containing
     * the number of 8 byte doc key strings that make up the doc id (2 for
     * a doc, 3 for a link), followed by the doc key strings themselves.
     * In the case of a document the first doc key string has a hash of the
     * url, the second a hash of a tag-stripped version of the document. 
     * In the case of a link, the keys are a unique identifier for the link 
     * context, followed by 8 bytes for
     * the hash of the url being pointed to by the link, followed by 8
     * bytes for the hash of "info:url_pointed_to_by_link".
     * @var string
     */
    var $doc_infos;
    /**
     *  Length of $doc_infos as a string
     *  @var int
     */
    var $docids_len;

    /**
     * This string is non-empty when shard is loaded and in its packed state.
     * It consists of a sequence of posting records. Each posting
     * consists of an offset into the document entries structure
     * for a document containing the word this is the posting for,
     * as well as the positions at which that word occurs in that document.
     * @var string
     */
    var $word_docs;
    /**
     *  Length of $word_docs as a string
     *  @var int
     */
    var $word_docs_len;

    /**
     * Stores the array of word entries for this shard
     * In the packed state, word entries consist of the word id, 
     * a generation number, an offset into the word_docs structure 
     * where the posting list for that word begins,
     * and a length of this posting list. In the unpacked state
     * each entry is a string of all the posting items for that word.
     * Periodically, data in this words array is flattened to the word_postings
     * string, which is a more memory efficient way of storing data in PHP
     * @var array
     */
    var $words;

    /**
     * Stores length of the words array in the shard on disk. Only set if
     * we're in $read_only_from_disk mode
     *
     * @var int
     */
     var $words_len;

    /**
     * An index, keyed by the two byte prefix of a word_id, giving the offset
     * into the words dictionary of the first word_id with that prefix. Built
     * and stored as a string of 8 byte entries when the shard is serialized.
     *
     * @var string
     */
    var $prefixes;

    /**
     * Length of the prefix index into the dictionary of the shard
     *
     * @var int
     */
    var $prefixes_len;

    /**
     * This is supposed to hold the number of earlier shards, prior to the 
     * current shard.
     * @var int
     */
    var $generation;

    /**
     * This is supposed to hold the number of documents that a given shard can
     * hold.
     * @var int
     */
    var $num_docs_per_generation;

    /**
     * Number of documents (not links) stored in this shard
     * @var int
     */
    var $num_docs;
    /**
     * Number of links (not documents) stored in this shard
     * @var int
     */
    var $num_link_docs;
    /**
     * Number of words stored in total in all documents in this shard
     * @var int
     */
    var $len_all_docs;
    /**
     * Number of words stored in total in all links in this shard
     * @var int
     */
    var $len_all_link_docs;

    /**
     * File handle for a shard if we are going to use it in read mode
     * and not completely load it.
     *
     * @var resource
     */
    var $fh;

    /**
     * A cached array of disk blocks for an index shard that has not
     * been completely loaded into memory.
     * @var array
     */
    var $blocks;

    /**
     * Flag used to determine if this shard is going to be largely kept on
     * disk and in read only mode. Otherwise, the shard is assumed to
     * be completely held in memory and to be read/writable.
     * @var bool
     */
    var $read_only_from_disk;

    /**
     * Keeps track of the packed/unpacked state of the word_docs list
     *
     * @var bool
     */
    var $word_docs_packed;

    /**
     * Keeps track of the length of the shard as a file
     *
     * @var int
     */
    var $file_len;

    /**
     * Number of document inserts since the last time word data was flattened
     * to the word_postings string.
     * @var int
     */
    var $last_flattened_words_count;

    /**
     * Used to hold word_id, posting_len, posting triples as a memory efficient
     * string
     * @var string
     */
    var $word_postings;
     
    /**
     * Number of document inserts allowed before data
     * from the words array is flattened to word_postings. (It will
     * also be flattened during periodic index saves)
     */
    const FLATTEN_FREQUENCY = 10000;

    /**
     * Maximum number of bytes copied from the tmp string at a time
     * during flattening
     */
    const WORD_POSTING_COPY_LEN = 2000000;

    /**
     * Used to keep track of whether a record in document infos is for a
     * document or for a link
     */
    const LINK_FLAG =  0x800000;

    /**
     * Size in bytes of one block in IndexShard
     */
    const SHARD_BLOCK_SIZE = 4096;

    /**
     * Header Length of an IndexShard (sum of its non-variable length fields)
     */
    const HEADER_LENGTH = 40;

    /**
     * Length of a Word entry in bytes in the shard
     */
    const WORD_ITEM_LEN = 20;

    /**
     * Length of a word entry's key in bytes
     */
    const WORD_KEY_LEN = 8;

    /**
     * Length of a key in a DOC ID.
     */
    const DOC_KEY_LEN = 8;

    /**
     * Length of one posting (a doc offset, occurrences pair) in a posting list
     */
    const POSTING_LEN = 4;

    /**
     *  Represents an empty prefix item
     */
    const BLANK = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";

    /**
     * Flag used to indicate that a word item should not be packed or unpacked
     */
    const HALF_BLANK = "\xFF\xFF\xFF\xFF";

    /**
     * Flag bit used to mark a dictionary entry whose posting data is stored
     * inline in the entry itself rather than in the word_docs string
     */
    const STORE_FLAG = "\x80";

    /**
     * Makes an index shard with the given file name and generation offset
     *
     * @param string $fname filename to store the index shard with
     * @param int $generation when returning documents from the shard
     *      pretend there are this many earlier documents
     * @param int $num_docs_per_generation the number of documents that a
     *      shard of this generation can hold
     * @param bool $read_only_from_disk used to determine if this shard is 
     *      going to be largely kept on disk and to be in read only mode. 
     *      Otherwise, the shard is assumed to be completely held in memory 
     *      and to be read/writable.
     */
    function __construct($fname, $generation = 0, 
        $num_docs_per_generation = NUM_DOCS_PER_GENERATION,
        $read_only_from_disk = false)
    {
        parent::__construct($fname, -1);
        $this->hash_name = crawlHash($fname);
        $this->generation = $generation;
        $this->num_docs_per_generation = $num_docs_per_generation;
        $this->word_docs = "";
        $this->word_postings = "";
        $this->words_len = 0;
        $this->word_docs_len = 0;
        $this->last_flattened_words_count = 0;
        $this->words = array();
        $this->docids_len = 0;
        $this->doc_infos = "";
        $this->num_docs = 0;
        $this->num_link_docs = 0;
        $this->len_all_docs = 0;
        $this->len_all_link_docs = 0;
        $this->blocks = array();
        $this->fh = NULL;
        $this->read_only_from_disk = $read_only_from_disk;
        $this->word_docs_packed = false;
    }
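
    /*
       A minimal usage sketch (not part of the original class): making a
       writable, in-memory shard versus opening an already saved shard in
       read-only, on-disk mode. The file name is hypothetical:

           $shard = new IndexShard("shard0.txt");
           $disk_shard = new IndexShard("shard0.txt", 0,
               NUM_DOCS_PER_GENERATION, true); // read only from disk
    */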

    /**
     * Add a new document to the index shard with the given summary offset.
     * Associate with this document the supplied list of words and word counts.
     * Finally, associate the given meta words with this document.
     *
     * @param string $doc_keys a string of concatenated keys for a document 
     *      to insert. Each key is assumed to be a string of DOC_KEY_LEN many 
     *      bytes. This whole set of keys is viewed as fixing one document.
     * @param int $summary_offset its offset into the web archive in which the
     *      document's data is stored
     * @param array $word_lists (word => array of word positions in doc)
     * @param array $meta_ids meta words to be associated with the document;
     *      an example meta word would be filetype:pdf for a PDF document.
     * @param bool $is_doc flag used to indicate if what is being stored is
     *      a document or a link to a document
     * @param mixed $rank either false if not used, or a 4 bit estimate of the
     *      rank of this document item
     * @return bool success or failure of performing the add
     */
    function addDocumentWords($doc_keys, $summary_offset, $word_lists,
        $meta_ids = array(), $is_doc = false, $rank = false)
    {
        if($this->word_docs_packed == true) {
            $this->words = array();
            $this->word_docs = "";
            $this->word_docs_packed = false;
        }

        $doc_len = 0;
        $link_doc_len = 0;
        $len_key = strlen($doc_keys);
        $num_keys = floor($len_key/self::DOC_KEY_LEN);

        if($num_keys * self::DOC_KEY_LEN != $len_key) return false;

        if($num_keys % 2 == 0 ) {
            $doc_keys .= self::BLANK; //want to keep docids_len divisible by 16
        }

        $summary_offset_string = packInt($summary_offset);
        $added_len = strlen($summary_offset_string);
        $this->doc_infos .= $summary_offset_string;

        if($is_doc) { 
            $this->num_docs++;
        } else { //link item
            $this->num_link_docs++;
        }
        foreach($meta_ids as $meta_id) {
            $word_lists[$meta_id] = array();
        }

        // docids_len is kept divisible by 16, so this is a 16-byte row index
        $doc_offset = $this->docids_len >> 4;
        foreach($word_lists as $word => $position_list) {
            $occurrences = count($position_list);
            $word_id = crawlHash($word, true);
            $store = $this->packPosting($doc_offset, $position_list);

            if(!isset($this->words[$word_id])) {
                $this->words[$word_id] = $store;
            } else {
                $this->words[$word_id] .= $store;
            }
            if($occurrences > 0) {
                if($is_doc == true) {
                    $doc_len += $occurrences;
                } else {
                    $link_doc_len += $occurrences;
                }
            }
            $this->word_docs_len += strlen($store);
        }

        $this->len_all_docs += $doc_len;
        $this->len_all_link_docs += $link_doc_len;
        $flags = ($is_doc) ? 0 : self::LINK_FLAG;
        if($rank !== false) {
            $rank &= 0x0f;
            $rank <<= 19;
            $flags += $rank;
        }
        $item_len = ($is_doc) ? $doc_len: $link_doc_len;
        $len_num_keys = $this->packDoclenNum(($flags + $item_len), $num_keys);

        $this->doc_infos .=  $len_num_keys;
        $added_len += strlen($len_num_keys);
        $this->doc_infos .= $doc_keys;
        $added_len += strlen($doc_keys);
        $this->docids_len += $added_len;

        return true;
    }
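
    /*
       A sketch of adding one document (a hedged example, not from the
       original source; the url, offset, and word positions are made up):

           $doc_keys = crawlHash("http://example.com/", true) .
               crawlHash("page text with tags stripped", true);
           $word_lists = array("hello" => array(1), "world" => array(2));
           $shard->addDocumentWords($doc_keys, 5000, $word_lists,
               array("filetype:html"), true);
    */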

    /**
     * Returns the first offset, last offset, and number of documents the
     * word occurred in for this shard. The first offset (similarly, the last
     * offset) is the byte offset into the word_docs string of the first
     * (last) record involving that word.
     *
     * @param string $word_id id of the word one wants to look up
     * @param bool $raw whether the id is our version of base64 encoded or not
     * @return array first offset, last offset, count
     */
    function getWordInfo($word_id, $raw = false)
    {
        if($raw == false) {
            // get rid of our modified base64 encoding
            $word_id = unbase64Hash($word_id);
        }

        $is_disk = $this->read_only_from_disk;
        $word_item_len = self::WORD_ITEM_LEN;

        if($is_disk) {
            $this->getShardHeader();

            $prefix = (ord($word_id[0]) << 8) + ord($word_id[1]);
            $prefix_info = $this->getShardSubstring(
                self::HEADER_LENGTH + 8*$prefix, 8);
            if($prefix_info == self::BLANK) {
                return false;
            }
            $offset = unpackInt(substr($prefix_info, 0, 4));

            $high = unpackInt(substr($prefix_info, 4, 4)) - 1;

            $start = self::HEADER_LENGTH + $this->prefixes_len  + $offset;
        } else {
            if($this->word_docs_packed == false) {
                $this->mergeWordPostingsToString();
                $this->packWords(NULL);
                $this->outputPostingLists();
            }
            $start = 0;
            $high = (strlen($this->words) - $word_item_len)/$word_item_len;
        }
        $low = 0;
        $check_loc = (($low + $high) >> 1);
        do {
            $old_check_loc = $check_loc;
            if($is_disk) {
                $word_string = $this->getShardSubstring($start + 
                    $check_loc * $word_item_len, $word_item_len);
            } else {
                $word_string = substr($this->words, $start + 
                    $check_loc * $word_item_len, $word_item_len);
            }
            if($word_string == false) {return false;}
            $id = substr($word_string, 0, self::WORD_KEY_LEN);
            $cmp = strcmp($word_id, $id);
            if($cmp === 0) {
                return $this->getWordInfoFromString(
                    substr($word_string, self::WORD_KEY_LEN));
            } else if ($cmp < 0) {
                $high = $check_loc;
                $check_loc = (($low + $check_loc) >> 1);
            } else {
                if($check_loc + 1 == $high) {
                    $check_loc++;
                }
                $low = $check_loc;
                $check_loc = (($high + $check_loc) >> 1);
            }
        } while($old_check_loc != $check_loc);

        return false;

    }
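
    /*
       A lookup sketch: ids produced directly by crawlHash are raw, so $raw
       is set to true (the word and variable names are illustrative):

           $word_id = crawlHash("hello", true);
           $info = $shard->getWordInfo($word_id, true);
           if($info !== false) {
               list($first_offset, $last_offset, $count) = $info;
           }
    */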

    /**
     * Returns documents using the word_docs string (either as stored
     * on disk or completely read in) of records starting
     * at the given offset and using its link-list of records. Traversal of
     * the list stops if an offset larger than $last_offset is seen or
     * $len many doc's have been returned. Since $next_offset is passed by
     * reference the value of $next_offset will point to the next record in
     * the list (if it exists) after the function is called.
     *
     * @param int $start_offset offset of the start of the posting list for 
     *      the query term (used in calculating BM25F)
     * @param int &$next_offset where to start in word docs
     * @param int $last_offset offset at which to stop by
     * @param int $len number of documents desired
     * @return array desired list of doc's and their info
     */
    function getPostingsSlice($start_offset, &$next_offset, $last_offset, $len)
    {
        if(!$this->read_only_from_disk && !$this->word_docs_packed) {
            $this->mergeWordPostingsToString();
            $this->packWords(NULL);
            $this->outputPostingLists();
        }

        $num_docs_so_far = 0;
        $results = array();
        /* wd_len is a kludgy fix because word_docs_len can get out of sync
           when things are file-based and am still tracking down why
        */
        $wd_len = (isset($this->file_len )) ? 
            $this->file_len - $this->docids_len : $this->word_docs_len;
        $end = min($wd_len, $last_offset);

        $num_docs_or_links =  
            self::numDocsOrLinks($start_offset, $last_offset);

        do {
            if($next_offset > $end) {break;}
            $old_next_offset = $next_offset;

            $doc_id = 
                $this->makeItem( // this changes $next_offset
                    $item, $next_offset, $num_docs_or_links);
            $results[$doc_id] = $item;
            $num_docs_so_far += ($next_offset - $old_next_offset)
                / self::POSTING_LEN;
        } while ($next_offset <= $last_offset && $num_docs_so_far < $len
            && $next_offset > $old_next_offset);

        return $results;
    }
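
    /*
       A sketch of walking a posting list in slices of ten using the offsets
       returned by getWordInfo() (assumes $info above was not false):

           list($first, $next, $last) = array($info[0], $info[0], $info[1]);
           while($next <= $last) {
               $docs = $shard->getPostingsSlice($first, $next, $last, 10);
               // ... use $docs, an array of doc_id => statistics ...
           }
    */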

    /**
     *  An upper bound on the number of docs or links represented by
     *  the start and ending integer offsets into a posting list.
     *
     *  @param int $start_offset starting location in posting list
     *  @param int $last_offset ending location in posting list
     *  @return int number of docs or links
     */
    static function numDocsOrLinks($start_offset, $last_offset)
    {
        return floor(($last_offset - $start_offset) / self::POSTING_LEN);
    }

    /**
     * Stores in the supplied item document statistics (summary offset, 
     * relevance, doc rank, and score) for the document
     * pointed to by $current_offset, based on the posting list's number of 
     * docs with the word and the number of occurrences of the word in the doc.
     * Returns the doc_id of the document
     *
     * @param array &$item a reference to an array to store statistic in
     * @param int $current_offset offset into word_docs for the document to
     *      calculate statistics for
     * @param int $num_doc_or_links number of documents or links doc appears in
     * @param int $occurs number of occurrences of the current word in 
     *   the document
     *
     * @return string $doc_id of document pointed to by $current_offset
     */
    function makeItem(&$item, &$current_offset, $num_doc_or_links,
        $occurs = 0)
    {
        $current = ($current_offset/self::POSTING_LEN );
        $posting_start = $current;
        $posting_end = $current;
        $posting = $this->getPostingAtOffset(
                $current, $posting_start, $posting_end);
        $current_offset = ($posting_end + 1)* self::POSTING_LEN;
        $offset = 0;
        list($doc_index, $item[self::POSITION_LIST]) = 
            $this->unpackPosting($posting, $offset);

        $doc_depth = log(10*(($doc_index +1) + 
            $this->num_docs_per_generation*$this->generation), 10);
        $item[self::DOC_RANK] = number_format(11 - 
            $doc_depth, PRECISION);

        $doc_loc = $doc_index << 4;
        $doc_info_string = $this->getDocInfoSubstring($doc_loc, 
            self::DOC_KEY_LEN); 
        $item[self::SUMMARY_OFFSET] = unpackInt(
            substr($doc_info_string, 0, 4));
        list($doc_len, $num_keys) = 
            $this->unpackDoclenNum(substr($doc_info_string, 4));

        $item[self::GENERATION] = $this->generation;

        $is_doc = (($doc_len & self::LINK_FLAG) == 0) ? true : false;
        if(!$is_doc) {
            $doc_len &= (self::LINK_FLAG - 1);
        }
        $item[self::IS_DOC] = $is_doc;

        $item[self::PROXIMITY] = 
            $this->computeProximity($item[self::POSITION_LIST],$is_doc);
        $occurrences = $this->weightedCount($item[self::POSITION_LIST],$is_doc);

        if($occurs != 0) {
            $occurrences = array(
                self::TITLE => 0,
                self::DESCRIPTION => 0,
                self::LINKS => 0);
            if($is_doc) {
                $occurrences[self::DESCRIPTION] = $occurs;
            } else {
                $occurrences[self::LINKS] = $occurs;
            }
        }
        /* 
           for archive crawls we store rank as the 4 bits after the high order 
           bit
        */
        $rank_mask = (0x0f) << 19;
        $pre_rank = ($doc_len & $rank_mask);
        if( $pre_rank > 0) {
            $item[self::DOC_RANK] = $pre_rank >> 19;
            $doc_len &= ((1 << 19) - 1); // keep only the low 19 length bits
        }

        $skip_stats = false;

        if($item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) {
            $skip_stats = true;
            $item[self::RELEVANCE] = 1;
            $item[self::SCORE] = $item[self::DOC_RANK];
        } else if($is_doc) {
            $average_doc_len = $this->len_all_docs/$this->num_docs;
            $num_docs = $this->num_docs;
            $type_weight = 1;
        } else {
            $average_doc_len = ($this->num_link_docs != 0) ? 
                $this->len_all_link_docs/$this->num_link_docs : 0;
            $num_docs = $this->num_link_docs;
            $type_weight = LINK_WEIGHT;
        }
        if(!isset($item['KEY'])) {
            $doc_id = $this->getDocInfoSubstring(
                $doc_loc + self::DOC_KEY_LEN, $num_keys * self::DOC_KEY_LEN);
        } else {
            $doc_id = $item['KEY'];
        }
        if(!$skip_stats) {
            $item[self::RELEVANCE] = 0;
            if($occurrences[self::TITLE] > 0) {
                self::docStats($item, $occurrences[self::TITLE], 
                    AD_HOC_TITLE_LENGTH, 
                    $num_doc_or_links, AD_HOC_TITLE_LENGTH, $num_docs, 
                    $this->num_docs + $this->num_link_docs, TITLE_WEIGHT);
            }
            if($occurrences[self::DESCRIPTION] > 0) {
                $average_doc_len = 
                    max($average_doc_len - AD_HOC_TITLE_LENGTH, 1);
                $doc_len = max($doc_len - AD_HOC_TITLE_LENGTH, 1);
                self::docStats($item, $occurrences[self::DESCRIPTION], 
                    $doc_len, $num_doc_or_links, $average_doc_len , $num_docs, 
                    $this->num_docs + $this->num_link_docs, DESCRIPTION_WEIGHT);
            }
            if($occurrences[self::LINKS] > 0) {
                self::docStats($item, $occurrences[self::LINKS], 
                    $doc_len, $num_doc_or_links, $average_doc_len , $num_docs,
                    $this->num_docs + $this->num_link_docs, LINK_WEIGHT);
            }
            $item[self::SCORE] = $item[self::DOC_RANK]
                * $item[self::RELEVANCE];
        }

        return $doc_id;

    }
    /**
     * Used to sum over the occurrences in a position list, counting with
     * weight based on term location in the document
     *
     * @param array $position_list positions of term in item
     * @param bool $is_doc whether the item is a document or a link
     * @return array associative array of document_part => weighted count 
     *      of occurrences of the term in that part
     *
     */
    function weightedCount($position_list, $is_doc) {
        $count = array(
            self::TITLE => 0,
            self::DESCRIPTION => 0,
            self::LINKS => 0);
        foreach($position_list as $position) {
            if($is_doc) {
                if($position < AD_HOC_TITLE_LENGTH) {
                    $count[self::TITLE] ++;
                } else {
                    $count[self::DESCRIPTION]++;
                }
            } else {
                $count[self::LINKS]++;
            }
        }
        return $count;
    }

    /**
     * Returns a proximity score for a single term based on its location in
     * doc.
     *
     * @param array $position_list locations of term within item
     * @param bool $is_doc whether the item is a document or not
     * @return int a score for proximity
     */
    function computeProximity($position_list, $is_doc) {
        // parentheses needed: PHP's ternary operator is left associative
        return (!$is_doc) ? LINK_WEIGHT : ((isset($position_list[0]) && 
            $position_list[0] < AD_HOC_TITLE_LENGTH) ?
            TITLE_WEIGHT : DESCRIPTION_WEIGHT);
    }

    /**
     *  Computes BM25F relevance and a score for the supplied item based
     *  on the supplied parameters.
     *
     *  @param array &$item doc summary to compute a relevance and score for.
     *      Pass-by-ref so self::RELEVANCE and self::SCORE fields can be changed
     *  @param int $occurrences number of occurrences of the term in the item
     *  @param int $doc_len number of words in doc item represents
     *  @param int $num_doc_or_links number of links or docs containing 
     *      the term
     *  @param float $average_doc_len average length of items in corpus
     *  @param int $num_docs either number of links or number of docs depending
     *      if item represents a link or a doc.
     *  @param int $total_docs_or_links number of docs or links in corpus
     *  @param float $type_weight BM25F weight for this component (doc or link)
     *      of the score
     */
    static function docStats(&$item, $occurrences, $doc_len, $num_doc_or_links, 
        $average_doc_len, $num_docs, $total_docs_or_links, $type_weight)
    {

        $doc_ratio = ($average_doc_len > 0) ?
            $doc_len/$average_doc_len : 0;
        $pre_relevance = number_format(
                3 * $occurrences/
                ($occurrences + .5 + 1.5* $doc_ratio), 
                PRECISION);

        $num_term_occurrences = $num_doc_or_links *
            $num_docs/($total_docs_or_links);

        $IDF = log(($num_docs - $num_term_occurrences + 0.5) /
            ($num_term_occurrences + 0.5));

        $item[self::RELEVANCE] += 0.5 * $IDF * $pre_relevance * $type_weight;

    }
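
    /*
       A note on the formula above (an observation, not from the original
       source): $pre_relevance is the BM25 term saturation factor
       occ*(k1 + 1)/(occ + k1*((1 - b) + b*doc_ratio)) with k1 = 2 and
       b = 0.75, since then k1 + 1 = 3 and k1*((1 - b) + b*doc_ratio) =
       0.5 + 1.5*doc_ratio, matching the code.
    */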

    /**
     *  Gets the posting closest to index $current in the word_docs string
     *  modifies the passed-by-ref variables $posting_start and 
     *  $posting_end so they are the index of the start and end of the
     *  posting
     *
     *  @param int $current an index into the word_docs strings
     *      corresponds to a start search loc of $current * self::POSTING_LEN
     *  @param int &$posting_start after function call will be
     *      index of start of nearest posting to current
     *  @param int &$posting_end after function call will be
     *      index of end of nearest posting to current
     *  @param bool $just_start if true, return after scanning back to the
     *      start of the posting, without also finding its end
     *
     *  @return string the substring of word_docs corresponding to the posting
     */
    function getPostingAtOffset($current, &$posting_start, &$posting_end, 
        $just_start = false)
    {
        $posting = $this->getWordDocsSubstring($current * self::POSTING_LEN,
            self::POSTING_LEN);
        $posting_start = $current;
        $posting_end = $current;
        $end_word_start = 0;
        $chr = (ord($posting[0]) & 192);
        $first_time = ($chr == 64);
        while ($chr == 128 || $first_time) {
            $first_time = false;
            $posting_start--;
            $posting = $this->getWordDocsSubstring(
                $posting_start * self::POSTING_LEN, self::POSTING_LEN) . 
                $posting;
            $chr = (ord($posting[0]) & 192);
            $end_word_start += self::POSTING_LEN;
        }
        if($just_start) {
            return $posting;
        }
        $chr = ord($posting[$end_word_start]) & 192;
        while($chr > 64) {
            $posting_end++;
            $posting .= $this->getWordDocsSubstring(
                $posting_end * self::POSTING_LEN, self::POSTING_LEN);
            $end_word_start += self::POSTING_LEN;
            $chr = ord($posting[$end_word_start]) & 192;
        }

        return $posting;
    }

    /**
     * Finds the first posting offset between $start_offset and $end_offset
     * of a posting that has a doc_offset bigger than or equal to $doc_offset
     * This is implemented using a galloping search (repeatedly double the
     * stride until past the target, then finish with a binary search).
     *
     *  @param int $start_offset first posting to consider
     *  @param int $end_offset last posting before give up
     *  @param int $doc_offset document offset we want to be greater than or 
     *      equal to
     *
     *  @return int offset to next posting
     */
    function nextPostingOffsetDocOffset($start_offset, $end_offset,
        $doc_offset)
    {
        $doc_index = $doc_offset >> 4;
        $current = floor($start_offset/self::POSTING_LEN);
        $end = floor($end_offset/self::POSTING_LEN);
        $low = $current;
        $high = $end;
        $posting_start = $current;
        $posting_end = $current;
        $stride = 32;
        $gallop_phase = true;
        do {
            $offset = 0;
            $posting = $this->getPostingAtOffset(
                $current, $posting_start, $posting_end, true);
            $post_doc_index = $this->getDocIndexPosting($posting);
            if($doc_index > $post_doc_index) {
                $low = $current;
                if($gallop_phase) {
                    $current += $stride;
                    $stride <<= 1;
                    if($current > $end ) {
                        $current = $end;
                        $gallop_phase = false;
                    }
                } else if($current >= $end) {
                    return false;
                } else {
                    if($current + 1 == $high) {
                        $current++;
                        $low = $current;
                    }
                    $current = (($low + $high) >> 1);
                }
            } else if($doc_index < $post_doc_index) {
                if($low == $current) {
                    return $posting_start * self::POSTING_LEN;
                } else if($gallop_phase) {
                    $gallop_phase = false;
                }
                $high = $current;
                $current = (($low + $high) >> 1);
            } else  {
                return $posting_start * self::POSTING_LEN;
            }

        } while($current <= $end);

        return false;
    }

    /**
     * Given an offset of a posting into the word_docs string, looks up
     * the posting there and computes the doc_offset stored in it.
     *
     *  @param int $offset byte/char offset into the word_docs string
     *  @return int a document byte/char offset into the doc_infos string
     */
    function docOffsetFromPostingOffset($offset) {
        $current = $offset / self::POSTING_LEN;
        $posting = $this->getPostingAtOffset(
            $current, $posting_start, $posting_end, true);
        $doc_index = $this->getDocIndexPosting($posting);

        return ($doc_index << 4);
    }

    /**
     * Returns $len many documents which contained the word corresponding to
     * $word_id (only works for loaded shards)
     *
     * @param string $word_id key to look up documents for
     * @param int $len number of documents desired back (from start of word 
     *      linked list).
     * @return array desired list of doc's and their info
     */
    function getPostingsSliceById($word_id, $len)
    {
        $results = array();
        $info = $this->getWordInfo($word_id, true);
        if($info !== false) {
            list($first_offset, $last_offset,
                $num_docs_or_links) = $info;
            $results = $this->getPostingsSlice($first_offset, 
                $first_offset, $last_offset, $len);
        }
        return $results;
    }

    /**
     * Adds the contents of the supplied $index_shard to the current index
     * shard
     *
     * @param object $index_shard the shard to append to the current shard
     */
    function appendIndexShard($index_shard)
    {
        if($this->word_docs_packed == true) {
            $this->words = array();
            $this->word_docs = "";
            $this->word_docs_packed = false;
        }
        if($index_shard->word_docs_packed == true) {
            $index_shard->unpackWordDocs();
        }

        $this->doc_infos .= $index_shard->doc_infos;

        $two_doc_len = 2 * self::DOC_KEY_LEN;
        foreach($index_shard->words as $word_id => $postings) {
            $postings_len = strlen($postings);
            // update doc offsets for newly added docs
            $add_len_flag = false;
            if($postings_len !=  $two_doc_len || 
                substr($postings, 0, self::POSTING_LEN) != self::HALF_BLANK) {
                $offset = 0;
                $new_postings = "";
                $index_shard_len = ($this->docids_len >> 4);
                while($offset < $postings_len) {
                    list($doc_index, $posting_list) = // this changes $offset
                        $this->unpackPosting($postings, $offset);
                    $doc_index += $index_shard_len;
                    $new_postings .=
                        $this->packPosting($doc_index, $posting_list);
                }
                $add_len_flag = true;
            } else {
                $new_postings = $postings;
            }
            $new_postings_len = strlen($new_postings);
            if(!isset($this->words[$word_id])) {
                $this->words[$word_id] = $new_postings;
            } else  {
                $this->words[$word_id] .= $new_postings;
            }
            if($add_len_flag) {
                $this->word_docs_len += $new_postings_len;
            }
        }
        $this->docids_len += $index_shard->docids_len;
        $this->num_docs += $index_shard->num_docs;
        $this->num_link_docs += $index_shard->num_link_docs;
        $this->len_all_docs += $index_shard->len_all_docs;
        $this->len_all_link_docs += $index_shard->len_all_link_docs;
        crawlLog("Finishing append...mem:".memory_get_usage());
        if($this->num_docs - $this->last_flattened_words_count >
            self::FLATTEN_FREQUENCY) {
            $this->mergeWordPostingsToString();
            crawlLog("...Flattened Word Postings mem:".memory_get_usage());
        }
    }

    /**
     * Used to flatten the words associative array to a more memory 
     * efficient word_postings string.
     */
    function mergeWordPostingsToString()
    {
        if($this->word_docs_packed) {
            return;
        }
        ksort($this->words, SORT_STRING);
        $tmp_string = "";
        $offset = 0;
        $write_offset = 0;
        $len = strlen($this->word_postings);
        $key_len = self::WORD_KEY_LEN;
        $posting_len = self::POSTING_LEN;
        $item_len = $key_len + $posting_len;
        foreach($this->words as $word_id => $postings) {
            $cmp = -1;
            while($cmp < 0 && $offset + $item_len <= $len) {
                $key = substr($this->word_postings, $offset, $key_len);
                $key_posts_len = unpackInt(substr(
                    $this->word_postings, $offset + $key_len, $posting_len));
                $key_postings = substr($this->word_postings, 
                    $offset + $item_len, $key_posts_len);
                $word_id_posts_len = strlen($postings);
                $cmp = strcmp($key, $word_id);
                if($cmp == 0) {
                    $tmp_string .= $key . 
                        packInt($key_posts_len + $word_id_posts_len) .
                        $key_postings . $postings;
                    $offset += $item_len + $key_posts_len;
                } else if ($cmp < 0) {
                    $tmp_string .= $key .packInt($key_posts_len). $key_postings;
                    $offset += $item_len + $key_posts_len;
                } else {
                    $tmp_string .= $word_id . 
                        packInt($word_id_posts_len). $postings;
                }
                $tmp_len = strlen($tmp_string);
                $copy_data_len = min(self::WORD_POSTING_COPY_LEN, $tmp_len);
                $copy_to_len = min($offset - $write_offset, 
                    $len - $write_offset);
                if($copy_to_len > $copy_data_len) {
                    charCopy($tmp_string, $this->word_postings, $write_offset,
                        $copy_data_len);
                    $write_offset += $copy_data_len;
                    $tmp_string = substr($tmp_string, $copy_data_len);
                }
            }
            if($offset + $item_len > $len) {
                $word_id_posts_len = strlen($postings);
                if($write_offset < $len) {
                    $tmp_len = strlen($tmp_string);
                    $copy_data_len = $len - $write_offset;
                    if($tmp_len < $copy_data_len) { // this case shouldn't occur
                        $this->word_postings = 
                            substr($this->word_postings, 0, $write_offset);
                        $this->word_postings .= $tmp_string;
                    } else {
                        charCopy($tmp_string, $this->word_postings, 
                            $write_offset, $copy_data_len);
                        $this->word_postings .=
                             substr($tmp_string, $copy_data_len);
                        $tmp_string = "";
                    }
                    $tmp_string = "";
                    $write_offset = $len;
                }
                $this->word_postings .= 
                    $word_id . packInt($word_id_posts_len). $postings;
            }
        }
        if($tmp_string != "") {
            $tmp_len = strlen($tmp_string);
            $copy_data_len = $offset - $write_offset;
            $pad_len = $tmp_len - $copy_data_len;
            $pad = str_pad("", $pad_len, "@");
            $this->word_postings .= $pad;
            for($j = $len + $pad_len - 1, 
                $k = $len - 1; $k >= $offset; $j--, $k--) {
                $this->word_postings[$j] = "" . $this->word_postings[$k];
                    /*way slower if directly
                    assign!!! PHP is crazy*/
            }
            charCopy($tmp_string, $this->word_postings, 
                $write_offset, $tmp_len);
        }

        $this->words = array();
        $this->last_flattened_words_count = $this->num_docs;
    }

    /**
     * Changes the summary offsets associated with a set of doc_ids to new 
     * values. This is needed because the fetcher puts documents in a 
     * shard before sending them to a queue_server. It is on the queue_server
     * however where documents are stored in the IndexArchiveBundle and
     * summary offsets are obtained. Thus, the shard needs to be updated at
     * that point. This function should be called when the shard is unpacked 
     * (we check and unpack to be on the safe side).
     *
     * @param array $docid_offsets an array of doc_id => new_doc_offset
     *      associations
     */
    function changeDocumentOffsets($docid_offsets)
    {
        if($this->word_docs_packed == true) {
            $this->words = array();
            $this->word_docs = "";
            $this->word_docs_packed = false;
        }
        $docids_len = $this->docids_len;

        for($i = 0 ; $i < $docids_len; $i += $row_len) {
            $doc_info_string = $this->getDocInfoSubstring($i, 
                self::DOC_KEY_LEN);
            $offset = unpackInt(
                substr($doc_info_string, 0, self::POSTING_LEN));
            $doc_len_info = substr($doc_info_string, 
                    self::POSTING_LEN, self::POSTING_LEN);
            list($doc_len, $num_keys) = 
                $this->unpackDoclenNum($doc_len_info);
            $key_count = ($num_keys % 2 == 0) ? $num_keys + 2: $num_keys + 1;
            $row_len = self::DOC_KEY_LEN * ($key_count);

            $id = substr($this->doc_infos, $i + self::DOC_KEY_LEN, 
                $num_keys * self::DOC_KEY_LEN);

            $new_offset = (isset($docid_offsets[$id])) ? 
                packInt($docid_offsets[$id]) : 
                packInt($offset);

            charCopy($new_offset, $this->doc_infos, $i, self::POSTING_LEN);

        }
    }
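
    /*
       A sketch of the queue_server fix-up step described above ($doc_id and
       $new_summary_offset are illustrative; $doc_id would be the key string
       stored with the document when it was added):

           $shard->changeDocumentOffsets(array(
               $doc_id => $new_summary_offset));
    */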


    /**
     *  Save the IndexShard to its filename
     * 
     *  @param bool $to_string whether output should be written to a string
     *      rather than the default file location
     *  @param bool $with_logging whether log messages should be written
     *      as the shard save progresses
     *  @return string serialized shard if output was to string else empty 
     *      string
     */
    public function save($to_string = false, $with_logging = false)
    {
        $out = "";
        $this->mergeWordPostingsToString();
        if($with_logging) {
            crawlLog("Saving index shard .. done merge postings to string");
        }
        $this->prepareWordsAndPrefixes();
        if($with_logging) {
            crawlLog("Saving index shard .. make prefixes");
        }
        $header =  pack("N", $this->prefixes_len) .
            pack("N", $this->words_len) .
            pack("N", $this->word_docs_len) .
            pack("N", $this->docids_len) . 
            pack("N", $this->generation) .
            pack("N", $this->num_docs_per_generation) .
            pack("N", $this->num_docs) .
            pack("N", $this->num_link_docs) .
            pack("N", $this->len_all_docs) .
            pack("N", $this->len_all_link_docs);
        if($with_logging) {
            crawlLog("Saving index shard .. packed header");
        }
        if($to_string) {
            $out = $header;
            $this->packWords(NULL);
            $out .= $this->words;
            $this->outputPostingLists(NULL);
            $out .= $this->word_docs;
            $out .= $this->doc_infos;
        } else {
            $fh = fopen($this->filename, "wb");
            fwrite($fh, $header);
            fwrite($fh, $this->prefixes);
            $this->packWords($fh);
            if($with_logging) {
                crawlLog("Saving index shard .. wrote dictionary");
            }
            $this->outputPostingLists($fh);
            fwrite($fh, $this->doc_infos);
            fclose($fh);
        }
        if($with_logging) {
            crawlLog("Saving index shard .. done");
        }
        // clean up by returning to state where could add more docs
        $this->words = array();
        $this->word_docs = "";
        $this->prefixes = "";
        $this->word_docs_packed = false;
        return $out;
    }
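
    /*
       A save/reopen sketch (hedged example): after save() writes the header,
       prefix index, dictionary, postings, and doc_infos, the shard can be
       reopened in read-only mode so queries read blocks from disk on demand:

           $shard->save();
           $disk_shard = new IndexShard($shard->filename, 0,
               NUM_DOCS_PER_GENERATION, true);
           $info = $disk_shard->getWordInfo(crawlHash("hello", true), true);
    */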

    /**
     * Computes the prefix string index for the current words array.
     * This index gives offsets of the first occurrences of the lead two chars
     * of a word_id in the words dictionary. This method assumes that the word
     * data is already in $this->word_postings
     */
    function prepareWordsAndPrefixes()
    {
        $word_item_len = IndexShard::WORD_ITEM_LEN;
        $key_len = self::WORD_KEY_LEN;
        $posting_len = self::POSTING_LEN;
        $this->words_len = 0;
        $word_postings_len = strlen($this->word_postings);
        $pos = 0;
        $tmp = array();
        $offset = 0;
        $num_words = 0;
        $old_prefix = false;
        while($pos < $word_postings_len) {
            $this->words_len += $word_item_len;
            $first = substr($this->word_postings, $pos, $key_len);
            $post_len = unpackInt(substr($this->word_postings, 
                $pos + $key_len, $posting_len));
            $pos += $key_len + $posting_len + $post_len;
            $prefix = (ord($first[0]) << 8) + ord($first[1]);
            if($old_prefix === $prefix) {
                $num_words++;
            } else {
                if($old_prefix !== false) {
                    $tmp[$old_prefix] = packInt($offset) .
                        pack("N", $num_words);
                    $offset += $num_words * $word_item_len;
                }
                $old_prefix = $prefix;
                $num_words = 1;
            }
        }

        $tmp[$old_prefix] = packInt($offset) . packInt($num_words);
        $num_prefixes = 2 << 16;
        $this->prefixes = "";
        for($i = 0; $i < $num_prefixes; $i++) {
            if(isset($tmp[$i])) {
                $this->prefixes .= $tmp[$i];
            } else {
                $this->prefixes .= self::BLANK;
            }
        }
        $this->prefixes_len = strlen($this->prefixes);
    }
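
    /*
       Each prefix entry built above is 8 bytes: packInt(offset into the
       dictionary of the first word with that two byte prefix) followed by
       packInt(number of words with that prefix); prefixes with no words get
       self::BLANK. getWordInfo() reads the entry at
       HEADER_LENGTH + 8 * $prefix to confine its binary search to the words
       sharing that prefix.
    */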

    /**
     * Posting lists are initially stored associated with a word as a key
     * value pair. The merge operation then merges these into a string
     * held by word_postings. packWords separates words from postings.
     * After being applied, words is a string consisting of records
     * (as concatenated strings) word_id, generation, offset into word_docs,
     * and length of the posting list. Finally, if a file handle is given, 
     * it writes the word dictionary out to the file as a long string. 
     * This function assumes mergeWordPostingsToString has just been called.
     *
     * @param resource $fh a file handle to write the dictionary to, if desired
     */
    function packWords($fh = NULL)
    {
        if($this->word_docs_packed) {
            return;
        }
        $word_item_len = IndexShard::WORD_ITEM_LEN;
        $key_len = self::WORD_KEY_LEN;
        $posting_len = self::POSTING_LEN;
        $this->word_docs_len = 0;
        $this->words = "";
        $total_out = "";
        $word_postings_len = strlen($this->word_postings);
        $pos = 0;
        $two_doc_len = 2 * self::DOC_KEY_LEN;
        while($pos < $word_postings_len) {
            $word_id = substr($this->word_postings, $pos, $key_len);
            $len = unpackInt(substr($this->word_postings, 
                $pos + $key_len, $posting_len));
            $postings = substr($this->word_postings, 
                $pos + $key_len + $posting_len, $len);
            $pos += $key_len + $posting_len + $len;
            /* 
                we pack generation info to make it easier to build the global
                dictionary
            */
            if($len != $two_doc_len || 
                substr($postings, 0, self::POSTING_LEN) != self::HALF_BLANK) {
                $out = packInt($this->generation)
                    . packInt($this->word_docs_len)
                    . packInt($len);
                $this->word_docs_len += $len;
                $this->words .= $word_id . $out;
            } else {
                $out = substr($postings, 
                    self::POSTING_LEN, self::WORD_ITEM_LEN);
                $out[0] = chr((0x80 | ord($out[0])));
                $this->words .= $word_id . $out;
            }
        }
        if($fh != null) {
            fwrite($fh, $this->words);
        }
        $this->words_len = strlen($this->words);
        $this->word_docs_packed = true;
    }

    /**
     * Used to convert the word_postings string into a word_docs string
     * or if a file handle is provided write out the word_docs sequence
     * of postings to the provided file handle.
     *
     * @param resource $fh a filehandle to write to
     */
    function outputPostingLists($fh = NULL)
    {
        $word_item_len = IndexShard::WORD_ITEM_LEN;
        $key_len = self::WORD_KEY_LEN;
        $posting_len = self::POSTING_LEN;
        $this->word_docs = "";
        $total_out = "";
        $word_postings_len = strlen($this->word_postings);
        $pos = 0;
        $tmp_string = "";
        $tmp_len = 0;
        $two_doc_len = 2 * self::DOC_KEY_LEN;
        while($pos < $word_postings_len) {
            $word_id = substr($this->word_postings, $pos, $key_len);
            $len = unpackInt(substr($this->word_postings, 
                $pos + $key_len, $posting_len));
            $postings = substr($this->word_postings, 
                $pos + $key_len + $posting_len, $len);
            $pos += $key_len + $posting_len + $len;

            if($len != $two_doc_len || 
                substr($postings, 0, self::POSTING_LEN) != self::HALF_BLANK) {
                if($fh != NULL) {
                    if($tmp_len < self::SHARD_BLOCK_SIZE) {
                        $tmp_string .= $postings;
                        $tmp_len += $len;
                    } else {
                        fwrite($fh, $tmp_string);
                        $tmp_string = $postings;
                        $tmp_len = $len;
                    }
                } else {
                    $this->word_docs .= $postings;
                }
            }
        }
        if($tmp_len > 0) {
            if($fh == NULL ) {
                $this->word_docs .= $tmp_string;
            } else {
                fwrite($fh, $tmp_string);
            }
        }
    }

    /**
     * Takes the word docs string and splits it into posting lists which are
     * assigned to particular words in the words dictionary array.
     * This method is memory expensive as it briefly has essentially 
     * two copies of what's in word_docs.
     */
    function unpackWordDocs()
    {
        if(!$this->word_docs_packed) {
            return;
        }
        foreach($this->words as $word_id => $postings_info) {
            /* we are ignoring the first four bytes which contains 
               generation info
             */
            if((ord($postings_info[0]) & 0x80) > 0 ) {
                $postings_info[0] = chr(ord($postings_info[0]) - 0x80);
                $postings_info = self::HALF_BLANK . $postings_info;
                $this->words[$word_id] = $postings_info;
            } else {
                $offset = unpackInt(substr($postings_info, 4, 4));
                $len = unpackInt(substr($postings_info, 8, 4));
                $postings = substr($this->word_docs, $offset, $len);
                $this->words[$word_id] = $postings;
            }
        }
        unset($this->word_docs);
        $this->word_docs_packed = false;
    }


    /**
     * Used to store the length of a document as well as the number of
     * key components in its doc_id as a packed int (4 byte string)
     *
     * @param int $doc_len number of words in the document
     * @param int $num_keys number of keys that are used to make up its doc_id
     * @return string packed int string representing these two values
     */
    static function packDoclenNum($doc_len, $num_keys)
    {
        return packInt(($doc_len << 8) + $num_keys);
    }

    /**
     * Used to extract from a 4 byte string representing a packed int,
     * a pair which represents the length of a document together with the
     * number of keys in its doc_id
     *
     * @param string $doc_len_string string to unpack
     * @return array pair (number of words in the document,
     *      number of keys that are used to make up its doc_id)
     */
    static function unpackDoclenNum($doc_len_string)
    {
        $doc_int = unpackInt($doc_len_string);
        $num_keys = $doc_int & 255;
        $doc_len = ($doc_int >> 8);
        return array($doc_len, $num_keys);
    }
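
    /*
       A round-trip sketch: the length goes in the high 24 bits and the key
       count in the low 8 bits of a single 4 byte packed integer:

           $packed = IndexShard::packDoclenNum(1000, 2);
           list($doc_len, $num_keys) = IndexShard::unpackDoclenNum($packed);
           // $doc_len == 1000, $num_keys == 2
    */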

    /**
     * Makes a packed integer string from a docindex and the positions
     * at which a word occurred in the document with that docindex.
     *
     * @param int $doc_index index (i.e., a count of which document it
     *      is rather than a byte offset) of a document in the document string
     * @param array $position_list integer positions the word occurred at in 
     *      that doc
     * @return string a modified9 (our compression scheme) packed 
     *      string containing this info.
     */
    static function packPosting($doc_index, $position_list)
    {
        $delta_list = deltaList($position_list);
        if(isset($delta_list[0])){
            $delta_list[0]++;
        }

        if( $doc_index >= (2 << 14) && isset($delta_list[0]) 
            && $delta_list[0] < (2 << 9)  && $doc_index < (2 << 17)) {
            $delta_list[0] += (((2 << 17) + $doc_index) << 9);
        } else {
            // we add 1 to doc_index to make sure not 0 (modified9 needs > 0)
            array_unshift($delta_list, ($doc_index + 1));
        }
        $encoded_list = encodeModified9($delta_list);
        return $encoded_list;
    }

    /**
     * Given a string containing postings encoded with modified9, extracts
     * from the posting beginning at $offset the doc_index of the document
     * the posting is for, together with the list of positions at which the
     * word occurs in that document.
     *
     * @param string $posting a string containing 
     *      a doc index position list pair encoded using modified9
     * @param int &$offset an offset into the string where the modified9 
     *      posting is encoded
     * @return array consisting of integer doc_index and a subarray consisting
     *      of integer positions of word in doc.
     */
    static function unpackPosting($posting, &$offset)
    {
        $delta_list = decodeModified9($posting, $offset);
        $doc_index = array_shift($delta_list);

        if(($doc_index & (2 << 26)) > 0) {
            $delta0 = ($doc_index & ((2 << 9) - 1));
            array_unshift($delta_list, $delta0);
            $doc_index -= $delta0;
            $doc_index -= (2 << 26);
            $doc_index >>= 9;
        } else {
            $doc_index--;
        }
        if(isset($delta_list[0])){
            $delta_list[0]--;
        }

        $position_list = deDeltaList($delta_list);

        return array($doc_index, $position_list);
    }
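
    /*
       A round-trip sketch (assumes encodeModified9/decodeModified9 from
       utility.php behave as inverses):

           $posting = IndexShard::packPosting(17, array(1, 5, 9));
           $offset = 0;
           list($doc_index, $positions) =
               IndexShard::unpackPosting($posting, $offset);
           // $doc_index == 17, $positions == array(1, 5, 9)
    */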

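    /**
     * Extracts the doc_index from the first 4 byte word of a modified9
     * packed posting without decoding the posting's whole position list
     *
     * @param string $posting a posting as produced by packPosting
     * @return int doc_index of the document the posting is for
     */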
    static function getDocIndexPosting($posting)
    {
        $delta_list = unpackListModified9(substr($posting, 0, 4));
        $doc_index = array_shift($delta_list);

        if(($doc_index & (2 << 26)) > 0) {
            $delta0 = ($doc_index & ((2 << 9) - 1));
            array_unshift($delta_list, $delta0);
            $doc_index -= $delta0;
            $doc_index -= (2 << 26);
            $doc_index >>= 9;
        } else {
            $doc_index--;
        }
        return $doc_index;
    }

    /**
     * Converts $str into 3 ints for a first offset into word_docs,
     * a last offset into word_docs, and a count of number of docs
     * with that word.
     *
     * @param string $str a packed word entry, as stored after the word_id
     *      in the dictionary
     * @param bool $include_generation whether to also return the generation
     *      stored in the entry
     * @return array of these three or four int's
     */
    static function getWordInfoFromString($str, $include_generation = false)
    {
        $generation = unpackInt(substr($str, 0, 4));
        $first_offset = unpackInt(substr($str, 4, 4));
        $len = unpackInt(substr($str, 8, 4));
        $last_offset = $first_offset + $len - self::POSTING_LEN;
        $count = floor($len / self::POSTING_LEN);
        if( $include_generation) {
            return array($generation, $first_offset, $last_offset, $count);
        }
        return array($first_offset, $last_offset, $count);
    }

    /**
     * From disk gets $len many bytes starting from $offset in the word_docs
     * strings 
     *
     * @param int $offset byte offset to begin getting data out of disk-based
     *      word_docs
     * @param int $len number of bytes to get
     * @return string desired substring
     */
    function getWordDocsSubstring($offset, $len)
    {
        if($this->read_only_from_disk) {
            $base_offset = self::HEADER_LENGTH + 
                $this->prefixes_len + $this->words_len;
            return $this->getShardSubstring($base_offset + $offset, $len);
        }
        return substr($this->word_docs, $offset, $len);
    }

    /**
     * From disk gets $len many bytes starting from $offset in the doc_infos
     * strings 
     *
     * @param int $offset byte offset to begin getting data out of disk-based
     *      doc_infos
     * @param int $len number of bytes to get
     * @return string desired substring
     */
    function getDocInfoSubstring($offset, $len)
    {
        if($this->read_only_from_disk) {
            $base_offset = $this->file_len - $this->docids_len;
            return $this->getShardSubstring($base_offset + $offset, $len);
        }
        return substr($this->doc_infos, $offset, $len);
    }

    /**
     *  Gets $len many bytes of data beginning at $offset from the
     *  current IndexShard's file
     *
     * @param int $offset byte offset to start reading from
     * @param int $len number of bytes to read
     * @return string data from that location in the shard
     */
    function getShardSubstring($offset, $len)
    {
        $block_offset = (floor($offset/self::SHARD_BLOCK_SIZE) *
            self::SHARD_BLOCK_SIZE);
        $start_loc = $offset - $block_offset;
        $substring = "";
        do {
            $data = $this->readBlockShardAtOffset($block_offset);
            if($data === false) {return $substring;}
            $block_offset += self::SHARD_BLOCK_SIZE;
            $substring .= substr($data, $start_loc);
            $start_loc = 0;
        } while (strlen($substring) < $len);
        return substr($substring, 0, $len);
    }

    /**
     * Reads SHARD_BLOCK_SIZE from the current IndexShard's file beginning
     * at byte offset $bytes
     *
     * @param int $bytes byte offset to start reading from
     * @return string data from the IndexShard file, or false on failure
     */
    function &readBlockShardAtOffset($bytes)
    {
        $false = false;
        if(isset($this->blocks[$bytes])) {
            return $this->blocks[$bytes];
        } 
        if($this->fh === NULL) {
            $this->fh = fopen($this->filename, "rb");
            // return the $false variable: this method returns by reference
            if($this->fh === false) {return $false;}
            $this->file_len = filesize($this->filename);
        }
        if($bytes >= $this->file_len) {
            return $false;
        }
        $seek = fseek($this->fh, $bytes, SEEK_SET);
        if($seek < 0) {
            return $false;
        }
        $this->blocks[$bytes] = fread($this->fh, self::SHARD_BLOCK_SIZE);

        return $this->blocks[$bytes];
    }

    /**
     * If not already loaded, reads in from disk the fixed-length field 
     * variables of this IndexShard ($this->words_len, etc.)
     */
    function getShardHeader()
    {
        if(isset($this->num_docs) && $this->num_docs > 0) {
            return; // if $this->num_docs > 0 assume have read in
        }
        $info_block = & $this->readBlockShardAtOffset(0);
        $header = substr($info_block, 0, self::HEADER_LENGTH);
        self::headerToShardFields($header, $this);
    }



    /**
     *  Load an IndexShard from a file or string
     *
     *  @param string $fname the name of the file to load the IndexShard from
     *  @param string &$data stringified shard data to load the shard from. If 
     *      NULL, then the data is loaded from $fname if possible
     *  @return object the IndexShard loaded
     */
    static function load($fname, &$data = NULL)
    {
        $shard = new IndexShard($fname);
        if($data === NULL) {
            $fh = fopen($fname, "rb");
            $shard->file_len = filesize($fname);
            $header = fread($fh, self::HEADER_LENGTH);
        } else {
            $shard->file_len = strlen($data);
            $header = substr($data, 0, self::HEADER_LENGTH);
            $pos = self::HEADER_LENGTH;
        }
        self::headerToShardFields($header, $shard);

        if($data === NULL) {
            fread($fh, $shard->prefixes_len );
            $words = fread($fh, $shard->words_len);
            $shard->word_docs = fread($fh, $shard->word_docs_len);
            $shard->doc_infos = fread($fh, $shard->docids_len);
            fclose($fh);
        } else {
            $words = substr($data, $pos, $shard->words_len);
            $pos += $shard->words_len;
            $shard->word_docs = substr($data, $pos, $shard->word_docs_len);
            $pos += $shard->word_docs_len;
            $shard->doc_infos = substr($data, $pos, $shard->docids_len);
        }

        $pre_words_array = str_split($words, self::WORD_ITEM_LEN);
        unset($words);
        array_walk($pre_words_array, 'IndexShard::makeWords', $shard);
        $shard->word_docs_packed = true;
        $shard->unpackWordDocs();
        return $shard;
    }
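
    /*
       A sketch contrasting the two read paths (file name hypothetical):
       load() reads the whole shard into memory and unpacks its postings,
       while the read-only constructor flag leaves data on disk and caches
       SHARD_BLOCK_SIZE blocks as they are requested:

           $mem_shard = IndexShard::load("shard0.txt");
           $disk_shard = new IndexShard("shard0.txt", 0,
               NUM_DOCS_PER_GENERATION, true);
    */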


    /**
     *  Splits a header string into a shard's field variables
     *
     *  @param string $header a string with packed shard header data
     *  @param object $shard IndexShard to put data into
     */
    static function headerToShardFields($header, $shard)
    {
        $header_array = str_split($header, 4);
        $header_data = array_map('unpackInt', $header_array);
        $shard->prefixes_len = $header_data[0];
        $shard->words_len = $header_data[1];
        $shard->word_docs_len = $header_data[2];
        $shard->docids_len = $header_data[3];
        $shard->generation = $header_data[4];
        $shard->num_docs_per_generation = $header_data[5];
        $shard->num_docs = $header_data[6];
        $shard->num_link_docs = $header_data[7];
        $shard->len_all_docs = $header_data[8];
        $shard->len_all_link_docs = $header_data[9];
    }

    /**
     * Callback function for the load method. Splits a word_key . word_info 
     * string into an entry in the passed shard: 
     * $shard->words[word_key] = $word_info.
     *
     * @param string &$value the word_key . word_info string
     * @param int $key index in array (unused)
     * @param object $shard IndexShard to add the entry to word table for
     */
    static function makeWords(&$value, $key, $shard)
    {
        $shard->words[substr($value, 0, self::WORD_KEY_LEN)] = 
            substr($value, self::WORD_KEY_LEN, 
                self::WORD_ITEM_LEN - self::WORD_KEY_LEN);
    }

}