Location: PHPKode > projects > Yioop! > yioop-v0.78/lib/utility.php
<?php
/** 
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009, 2010, 2011  Chris Pollett hide@address.com
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * A library of string, log, hash, time, and conversion functions
 *
 * @author Chris Pollett hide@address.com
 * @package seek_quarry
 * @subpackage library
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009, 2010, 2011
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**
 * Copies from $source string beginning at position $start, $length many
 * bytes to destination string
 *
 * @param string $source  string to copy from
 * @param string &$destination string to copy to
 * @param int $start starting offset
 * @param int $length number of bytes to copy
 */
function charCopy($source, &$destination, $start, $length) 
{
    $endk = $length - 1;
    $end = $start + $endk;
    for($j = $end, $k = $endk; $j >= $start; $j--, $k--) {
        $destination[$j] = $source[$k]; 
    }
}

/**
 *  Encodes an integer using variable byte coding.
 *
 *  @param int $pos_int integer to encode
 *  @return string a string of 1-5 chars depending on how bit $pos_int was
 */
function vByteEncode($pos_int) 
{
    $result = chr($pos_int & 127);
    $pos_int >>= 7;
    while($pos_int > 0){
        $result .= chr(128 | ($pos_int & 127));
        $pos_int >>= 7;
    }
    return $result;
}

/**
 *  Decodes from a string using variable byte coding an integer.
 *
 *  @param string &$str string to use for decoding
 *  @param int $offset byte offset into string when var int stored
 *  @return int the decoded integer
 */
function vByteDecode(&$str, &$offset) 
{
    $pos_int = ord($str[$offset] & 127) ;
    $shift = 7;
    while (ord($str[$offset++]) & 128 > 0) {
        $pos_int += (ord($str[$offset] & 127) << $shift);
        $shift += 7;
    }

    return $pos_int;
}

/**
 * Computes the difference of a list of integers.
 * i.e., (a1, a2, a3, a4) becomes (a1, a2-a1, a3-a2, a4-a3) 
 *
 * @param array $list a nondecreasing list of integers
 * @return array the corresponding list of differences of adjacent
 *      integers
 */
function deltaList($list)
{
    $last = 0;
    $delta_list = array();
    foreach($list as $elt) {
        $delta_list[] = $elt - $last;
        $last = $elt;
    }
    return $delta_list;
}

/**
 * Given an array of differences of integers reconstructs the
 * original list. This computes the inverse of the deltaList function
 *
 * @see deltaList
 * @param array $delta_list a list of nonegative integers
 * @return array a nondecreasing list of integers
 */
function deDeltaList($delta_list)
{
    $last = 0;
    $list = array();
    foreach($delta_list as $delta) {
        $last += $delta;
        $list[] = $last;
    }
    return $list;
}

/**
 * Encodes a sequence of integers x, such that 1 <= x <= 2<<28-1
 * as a string.
 *
 * The encoded string is a sequence of 4 byte words (packed int's).
 * The high order 2 bits of a given word indicate whether or not
 * to look at the next word. The codes are as follows:
 * 11 start of encoded string, 10 continue four more bytes, 01 end of
 * encoded, and 00 indicates whole sequence encoded in one word.
 *
 * After the high order 2 bits, the next most significant bits indicate
 * the format of the current word. There are nine possibilities:
 * 00 - 1 28 bit number, 01 - 2 14 bit numbers, 10 - 3 9 bit numbers,
 * 1100 - 4 6 bit numbers, 1101 - 5 5 bit numbers, 1110 6 4 bit numbers,
 * 11110 - 7 3 bit numbers, 111110 - 12 2 bit numbers, 111111 - 24 1 bit 
 * numbers.
 *
 * @param array $list a list of positive integers satsfying above
 * @return string encoded string
 */
function encodeModified9($list)
{
    global $MOD9_PACK_POSSIBILITIES;
    $cnt = 0;
    $cur_size = 1;
    $cur_len = 1;
    $pack_list = array();
    $list_string = "";
    $continue_bits = 3;
    foreach($list as $elt) {
        $old_len = $cur_len;
        while( $elt > $cur_size )
        {
            $cur_len++;
            $cur_size = (1 << $cur_len) - 1;

        }

        if( $cnt < $MOD9_PACK_POSSIBILITIES[$cur_len] ) {
            $pack_list[] = $elt;
            $cnt++;
        } else {
            $list_string .= packListModified9($continue_bits, 
                $MOD9_PACK_POSSIBILITIES[$old_len], $pack_list);
            $continue_bits = 2;
            $pack_list = array($elt);
            $cur_size = 1;
            $cur_len = 1;
            $cnt = 1;
            while( $elt > $cur_size )
            {
                $cur_size = (1 << $cur_len) - 1;
                $cur_len++;
            }
        }
    }
    $continue_bits = ($continue_bits == 3) ? 0 : 1;
    $list_string .= packListModified9($continue_bits, 
        $MOD9_PACK_POSSIBILITIES[$cur_len], $pack_list);

    return $list_string;
}

/**
 * Packs the contents of a single word of a sequence being encoded
 * using Modified9.
 *
 * @param int $continue_bits the high order 2 bits of the word
 * @param int $cnt the number of element that will be packed in this word
 * @param array $list a list of positive integers to pack into word
 * @return string encoded 4 byte string
 * @see encodeModified9
 */
function packListModified9($continue_bits, $cnt, $pack_list)
{
    global $MOD9_NUM_ELTS_CODES, $MOD9_NUM_BITS_CODES;

    $out_int = 0;
    $code = $MOD9_NUM_ELTS_CODES[$cnt];
    $num_bits = $MOD9_NUM_BITS_CODES[$code];
    foreach($pack_list as $elt) {
        $out_int <<= $num_bits;
        $out_int += $elt;
    }
    $out_string = packInt($out_int);

    $out_string[0] = chr(($continue_bits << 6) + $code + ord($out_string[0]));
    return $out_string;
}

/**
 * Decoded a sequence of positive integers from a string that has been
 * encoded using Modified 9
 *
 * @param string $int_string string to decode from
 * @param int &$offset where to string in the string, after decode
 *      points to where one was after decoding.
 * @return array sequence of positive integers that were decoded
 * @see encodeModified9
 */
function decodeModified9($input_string, &$offset)
{
    $flag_mask = 192;
    $continue_threshold = 128;
    $first_time = true;
    $decode_list = array();
    if(($len = strlen($input_string) ) < 4) return array();
    do {
        $int_string = substr($input_string, $offset, 4);
        $ord_first = ord($int_string[0]);
        $flag_bits = ($ord_first & $flag_mask);
        if($first_time) {
            if($flag_bits != 0 && $flag_bits != $flag_mask) {
                return false;
            }
            $first_time = false;
        }
        $int_string[0] = chr($ord_first - $flag_bits);
        $decode_list = array_merge($decode_list, 
            unpackListModified9($int_string));
        $offset += 4;
        $len -= 4;
    } while($flag_bits >= $continue_threshold);

   return $decode_list;
}

/**
 * Decoded a single word with high two bits off according to modified 9
 *
 * @param string $int_string 4 byte string to decode
 * @return array sequence of integers that results from the decoding.
 */
function unpackListModified9($int_string) 
{
    global $MOD9_NUM_BITS_CODES, $MOD9_NUM_ELTS_DECODES;

    $first_char = ord($int_string[0]);
    foreach($MOD9_NUM_BITS_CODES as $code => $num_bits) {
        if(($first_char & $code) == $code) break;
    }
    $mask = (2 << ($num_bits - 1)) - 1;
    $num_elts = $MOD9_NUM_ELTS_DECODES[$code];
    $int_string[0] = chr($first_char - $code);

    $encoded_list = unpackInt($int_string);
    $decoded_list = array();
    for($i = 0; $i < $num_elts; $i++) {
        if(($pre_elt = $encoded_list & $mask) == 0) break;
        array_unshift($decoded_list, $pre_elt);
        $encoded_list >>= $num_bits;
    }

    return $decoded_list;
}


/**
 * Unpacks an int from a 4 char string
 *
 * @param string $str where to extract int from
 * @return int extracted integer
 */
function unpackInt($str)
{
    $tmp = unpack("N", $str);
    return $tmp[1];
}

/**
 * Packs an int into a 4 char string
 *
 * @param int $my_int the integer to pack
 * @return string the packed string
 */
function packInt($my_int)
{
    return pack("N", $my_int);
}

/**
 * Unpacks a float from a 4 char string
 *
 * @param string $str where to extract int from
 * @return float extracted float
 */
function unpackFloat($str)
{
    $tmp = unpack("f", $str);
    return $tmp[1];
}

/**
 * Packs an float into a 4 char string
 *
 * @param float $my_floatt the float to pack
 * @return string the packed string
 */
function packFloat($my_float)
{
    return pack("f", $my_float);
}

/**
 * Converts a string to string where each char has been replaced by its 
 * hexadecimal equivalent
 *
 * @param string $str what we want rewritten in hex
 * @return string the hexified string
 */
function toHexString($str)
{
    $out = "";
    for($i = 0; $i < strlen($str); $i++) {
        $out .= dechex(ord($str[$i]))." ";
    }
    return $out;
}

/**
 * Converts a string to string where each char has been replaced by its 
 * binary equivalent
 *
 * @param string $str what we want rewritten in hex
 * @return string the binary string
 */
function toBinString($str)
{
    $out = "";
    for($i = 0; $i < strlen($str); $i++) {
        $out .= substr(decbin(256+ord($str[$i])), 1)." ";
    }
    return $out;
}

/**
 *  Logs a message to a logfile or the screen
 *
 *  @param string $msg message to log
 *  @param string $lname name of log file in the LOG_DIR directory, rotated logs
 *      will also use this as their basename followed by a number followed by 
 *      bz2 (since they are bzipped).
 */

function crawlLog($msg, $lname = NULL)
{
    static $logname;

    if(defined("NO_LOGGING")) {
        return;
    }

    if($lname != NULL)
    {
        $logname = $lname;
    } else if(!isset($logname)) {
        $logname = "message";
    }

    $time_string = date("r", time());
    $out_msg = "[$time_string] $msg";
    if(LOG_TO_FILES) {
        $logfile = LOG_DIR."/$logname.log";

        clearstatcache(); //hopefully, this doesn't slow things too much

        if(file_exists($logfile) && filesize($logfile) > MAX_LOG_FILE_SIZE) {
            if(file_exists("$logfile.".NUMBER_OF_LOG_FILES.".bz2")) {
                unlink("$logfile.".NUMBER_OF_LOG_FILES.".bz2");
            }
            for($i = NUMBER_OF_LOG_FILES; $i > 0; $i--) {
                if(file_exists("$logfile.".($i-1).".bz2")) {
                    rename("$logfile.".($i-1).".bz2", "$logfile.$i.bz2");
                }
            }
            file_put_contents("$logfile.0.bz2", 
                bzcompress(file_get_contents($logfile)));
            unlink($logfile);
        }
        error_log($out_msg."\n", 3, $logfile);
    } else {
        error_log($out_msg);
    }
}

/**
 *  Computes an 8 byte hash of a string for use in storing documents.
 *
 *  An eight byte hash was chosen so that the odds of collision even for
 *  a few billion documents via the birthday problem are still reasonable.
 *  If the raw flag is set to false then an 11 byte base64 encoding of the
 *  8 byte hash is returned. The hash is calculated as the xor of the
 *  two halves of the 16 byte md5 of the string. (8 bytes takes less storage
 *  which is useful for keeping more doc info in memory)
 *
 *  @param string $string the string to hash
 *  @param bool $raw whether to leave raw or base 64 encode
 *  @return string the hash of $string
 */
function crawlHash($string, $raw = false) 
{
    $pre_hash = md5($string, true);

    $left = substr($pre_hash,0, 8) ;
    $right = substr($pre_hash,8, 8) ;

    $combine = $right ^ $left;

    if(!$raw) {
        $hash = base64Hash($combine);
            // common variant of base64 safe for urls and paths
    } else {
        $hash = $combine;
    }

    return $hash; 
}

/**
 * Converts a crawl hash number to something closer to base64 coded but
 * so doesn't get confused in urls or DBs
 *
 *  @param string $string a hash to base64 encode
 *  @return string the encoded hash 
 */
function base64Hash($string)
{
    $hash = rtrim(base64_encode($string), "=");
    $hash = str_replace("/", "_", $hash);
    $hash = str_replace("+", "-" , $hash); 

    return $hash;
}


/**
 * Decodes a crawl hash number from base64 to raw ASCII
 *
 *  @param string $base64 a hash to decode
 *  @return string the decoded hash 
 */
function unbase64Hash($base64)
{
    //get rid of out modified base64 encoding
    $hash = str_replace("_", "/", $base64);
    $hash = str_replace("-", "+" , $hash);
    $hash .= "=";
    $raw = base64_decode($hash);

    return $raw;
}

/**
 * Encodes a string in a format suitable for post data
 * (mainly, base64, but str_replace data that might mess up post in result) 
 *
 * @param string $str string to encode
 * @return string encoded string
 */
function webencode($str)
{
    $str = str_replace("/", "_", base64_encode($str));
    $str = str_replace("+", ".", $str);
    $str = str_replace("=", "~", $str);
    return $str;
}

/**
 * Decodes a string encoded by webencode
 *
 * @param string $str string to encode
 * @return string encoded string
 */
function webdecode($str)
{
    $str = str_replace("_", "/", $str);
    $str = str_replace(".", "+", $str);
    return base64_decode(str_replace("~", "=", $str));
}

/**
 * The search engine project's variation on the Unix crypt function using the 
 * crawlHash function instead of DES
 *
 * The crawlHash function is used to encrypt passwords stored in the database
 *
 * @param string $string the string to encrypt
 * @param int $salt salt value to be used (needed to verify if a password is 
 *      valid)
 * @return string the crypted string where crypting is done using crawlHash
 */
function crawlCrypt($string, $salt = NULL)
{
    if($salt == NULL) {
        $salt = rand(10000, 99999);
    } else {
        $len = strlen($salt);
        $salt = substr($salt, $len - 5, 5);
    }
    return crawlHash($string.$salt).$salt;
}




/**
 * Measures the change in time in seconds between two timestamps to microsecond
 * precision
 *
 * @param string $start starting time with microseconds
 * @param string $end ending time with microseconds, if null use current time
 * @return float time difference in seconds
 */
function changeInMicrotime( $start, $end=NULL ) 
{
    if( !$end ) {
    	    $end= microtime();
    }
    list($start_microseconds, $start_seconds) = explode(" ", $start);
    list($end_microseconds, $end_seconds) = explode(" ", $end);

    $change_in_seconds = intval($end_seconds) - intval($start_seconds);
    $change_in_microseconds = 
        floatval($end_microseconds) - floatval($start_microseconds);

    return floatval( $change_in_seconds ) + $change_in_microseconds;
} 


/**
 *  Converts a CSS unit string into its equivalent in pixels. This is
 *  used by @see SvgProcessor.
 *
 *  @param string $value  a number followed by a legal CSS unit
 *  @return int a number in pixels
 */
function convertPixels($value) 
{
    $len = strlen($value);
    if($len < 2) return intval($value);
    if($value[$len - 1] == "%") {
        $num = floatval(substr($value, 0, $len - 1));
        return ($num > 0) ? floor(8*min(100, $num)) : 0;
    }
    $num = floatval(substr($value, 0, $len - 2));
    $unit = substr($value, $len - 2);
    switch($unit)
    {

        case "cm":
        case "pt":
            return intval(28*$num);
        break;
        case "em":
        case "pc":
            return intval(6*$num);
        break;
        case "ex":
            return intval(12*$num);
        break;
        case "in":
            //assume screen 72 dpi as on mac
            return intval(72*$num);
        break;
        case "mm":
            return intval(2.8*$num);
        break;
        case "px":
            return intval($num);
        break;
        default:
            $num = $value;
    }
    return intval($num);
}


// callbacks for Model::traverseDirectory

/**
 * This is a callback function used in the process of recursively deleting a 
 * directory
 *
 * @param string $file_or_dir the filename or directory name to be deleted
 * @see DatasourceManager::unlinkRecursive()
 */
function deleteFileOrDir($file_or_dir)
{
    if(is_file($file_or_dir)) {
        unlink($file_or_dir);
    } else {
        rmdir($file_or_dir);
    }
}

/**
 * This is a callback function used in the process of recursively chmoding to 
 * 777 all files in a folder
 *
 * @param string $file the filename or directory name to be chmod
 * @see DatasourceManager::etWorldPermissionsRecursive()
 */
function setWorldPermissions($file)
{
    chmod($file, 0777);
}

//ordering functions used in sorting

/**
 *  Callback function used to sort documents by score
 *
 *  The function is used to sort documents being added to an IndexArchiveBundle
 *
 *  @param string $word_doc_a doc id of first document to compare
 *  @param string $word_doc_b doc id of second document to compare
 *  @return int -1 if first doc bigger 1 otherwise
 *  @see IndexArchiveBundle::addPartitionWordData()
 */
function scoreOrderCallback($word_doc_a, $word_doc_b)
{
    return ((float)$word_doc_a[CrawlConstants::SCORE] > 
        (float)$word_doc_b[CrawlConstants::SCORE]) ? -1 : 1;
}

/**
 *  Callback function used to sort documents by doc_rank
 *
 *  The function is used to sort documents being added to an IndexArchiveBundle
 *
 *  @param string $word_doc_a doc id of first document to compare
 *  @param string $word_doc_b doc id of second document to compare
 *  @return int -1 if first doc bigger 1 otherwise
 *  @see IndexArchiveBundle::addPartitionWordData()
 */
function docRankOrderCallback($word_doc_a, $word_doc_b)
{
    return ((float)$word_doc_a[CrawlConstants::DOC_RANK] > 
        (float)$word_doc_b[CrawlConstants::DOC_RANK]) ? -1 : 1;
}

/**
 * Callback to check if $a is less than $b
 *
 * Used to help sort document results returned in PhraseModel called 
 * in IndexArchiveBundle
 *
 * @param float $a first value to compare
 * @param float $b second value to compare
 * @return int -1 if $a is less than $b; 1 otherwise
 * @see IndexArchiveBundle::getSelectiveWords()
 * @see PhraseModel::getPhrasePageResults()
 */
function lessThan($a, $b) {
    if ($a == $b) {
        return 0;
    }
    return ($a < $b) ? -1 : 1;
}

/**
 *  Callback to check if $a is greater than $b
 *
 * Used to help sort document results returned in PhraseModel called in 
 * IndexArchiveBundle
 *
 * @param float $a first value to compare
 * @param float $b second value to compare
 * @return int -1 if $a is greater than $b; 1 otherwise
 * @see IndexArchiveBundle::getSelectiveWords()
 * @see PhraseModel::getTopPhrases()
 */
function greaterThan($a, $b) {
    if ($a == $b) {
        return 0;
    }
    return ($a > $b) ? -1 : 1;
}


?>
Return current item: Yioop!