<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009, 2010, 2011 Chris Pollett hide@address.com
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Chris Pollett hide@address.com
* @package seek_quarry
* @subpackage library
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009, 2010, 2011
* @filesource
*/
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
* Load in base class
*/
require_once "string_array.php";
/**
* A Notifier is called when data in the queue is move around
*/
require_once "notifier.php";
/**
* Loaded for crawlLog function
*/
require_once "utility.php";
/**
* Constants shared amoung classes involved in storing web crawl data
*/
require_once "crawl_constants.php";
/**
*
* Code used to manage a memory efficient priority queue.
* Weights for the queue must be flaots. The queue itself is
* implemented using heaps
*
* @author Chris Pollett
*
* @package seek_quarry
* @subpackage library
*/
class PriorityQueue extends StringArray implements CrawlConstants
{
/**
* Number of values that can be stored in the priority queue
* @var int
*/
var $num_values;
/**
* Number of bytes needed to store a value associated with a weight
* @var int
*/
var $value_size;
/**
* Number of bytes needed to store a weight in the queue
* @var int
*/
var $weight_size = 4; //size of a float
/**
* Number of items that are currently stored in the queue
* @var int
*/
var $count;
/**
* When the polling the queue returns the least or most weighted value
* @var string
*/
var $min_or_max;
/**
* An object that implements the Notifier interface (for instance,
* WebQueueArchive)
* @var object
*/
var $notifier; // who to call if move an item in queue
/**
* Makes a priority queue (implemented as an array heap) with the given
* operating parameters
*
* @param string $fname filename to store the data associated with the queue
* @param int $num_values number of values the queue can hold
* @param int $value_size the size in a bytes of a value
* @param string $min_or_max whether this priority queue return least or
* most weight values when polled
* @param object $notifier object to call when a value changes in the queue
* @param int $save_frequency how often the data in the queue should be
* save to disk. (It's default location is RAM)
*/
function __construct($fname, $num_values, $value_size,
$min_or_max, $notifier = NULL,
$save_frequency = self::DEFAULT_SAVE_FREQUENCY)
{
$this->num_values = $num_values;
$this->value_size = $value_size;
$this->min_or_max = $min_or_max;
$this->count = 0;
$this->notifier = $notifier;
parent::__construct($fname, $num_values,
$value_size + $this->weight_size, $save_frequency);
}
/**
* Gets the data stored at the ith location in the priority queue
*
* @param int $i location to return data from
* @return mixed array data if the value of $i is between 1 and count, false
* otherwise
*/
function peek($i = 1)
{
if($i < 1 || $i > $this->count) {
crawlLog("Peek Index $i not in Range [1, {$this->count}]");
return false;
}
return $this->getRow($i);
}
/**
* Removes and returns the ith element out of the Priority queue.
* Since this is a priority queue the first element in the queue
* will either be the min or max (depending on queue type) element
* stored. If $i is not in range an error message is written to the log.
* This operation also performs a check to see if the queue should be
* saved to disk
*
* @param int $i element to get out of the queue
* @return mixed array data if the value of $i is between 1 and count, false
* otherwise
*/
function poll($i = 1)
{
if($i < 1 || $i > $this->count) {
crawlLog("Index $i not in Range [1, {$this->count}]");
return false;
}
$extreme = $this->peek($i);
$last_entry = $this->getRow($this->count);
$this->putRow($i, $last_entry);
$this->count--;
$this->percolateDown($i);
$this->checkSave();
return $extreme;
}
/**
* Inserts a new item into the priority queue.
*
* @param string $data what to insert into the queue
* @param float $weight how much the new data should be weighted
* @return mixed index location in queue where item was stored if
* successful, otherwise false.
*/
function insert($data, $weight)
{
if($this->count == $this->num_values) {
return false;
}
$this->count++;
$cur = $this->count;
$this->putRow($cur, array($data, $weight));
$loc = $this->percolateUp($cur);
return $loc;
}
/**
* Add $delta to the $ith element in the priority queue and then adjusts
* the queue to store the heap property
*
* @param int $i element whose weight should be adjusted
* @param float $delta how much to change the weight by
*/
function adjustWeight($i, $delta)
{
if( ($tmp = $this->peek($i)) === false) {
crawlLog("Index $i not in queue adjust weight failed");
return false;
}
list($data, $old_weight) = $tmp;
$new_weight = $old_weight + $delta;
$this->putRow($i, array($data, $new_weight));
if($new_weight > $old_weight) {
if($this->min_or_max == self::MIN) {
$this->percolateDown($i);
} else {
$this->percolateUp($i);
}
} else {
if($this->min_or_max == self::MAX) {
$this->percolateDown($i);
} else {
$this->percolateUp($i);
}
}
}
/**
* Pretty prints the contents of the queue viewed as an array.
*
*/
function printContents()
{
for($i = 1; $i <= $this->count; $i++) {
$row = $this->peek($i);
print "Entry: $i Value: ".$row[0]." Weight: ".$row[1]."\n";
}
}
/**
* Return the contents of the priority queue as an array of
* value weight pairs.
*
* @return array contents of the queue
*/
function getContents()
{
$rows = array();
for($i = 1; $i <= $this->count; $i++) {
$rows[] = $this->peek($i);
}
return $rows;
}
/**
* Scaless the weights of elements in the queue so that the sum fo the new
* weights is $new_total
*
* This function is used periodically to prevent the queue from being
* gummed up because all of the weights stored in it are too small.
*
* @param int $new_total what the new sum of weights of elements in the
* queue will be after normalization
*/
function normalize($new_total = NUM_URLS_QUEUE_RAM)
{
$count = $this->count;
$total_weight = $this->totalWeight();
if($total_weight <= 0) {
crawlLog(
"Total queue weight was zero!! Doing uniform renormalization!");
}
for($i = 1; $i <= $count; $i++) {
$row = $this->getRow($i);
if($total_weight > 0) {
$row[1] = ($new_total*$row[1])/$total_weight;
} else {
$row[1] = $new_total/$count;
}
$this->putRow($i, $row);
}
}
/**
* If the $ith element in the PriorityQueue violates the heap
* property with its parent node (children should be of lower
* priority than the parent), this function
* tries modify the heap to restore the heap property.
*
* @param int $i node to consider in restoring the heap property
* @return int final position $ith node ends up at
*/
function percolateUp($i)
{
if($i <= 1) return $i;
$start_row = $this->getRow($i);
$parent = $i;
while ($parent > 1) {
$child = $parent;
$parent = floor($parent/2);
$row = $this->getRow($parent);
if($this->compare($row[1], $start_row[1]) < 0) {
$this->putRow($child, $row);
} else {
$this->putRow($child, $start_row);
return $child;
}
}
$this->putRow(1, $start_row);
return 1;
}
/**
* If the ith element in the PriorityQueue violates the heap
* property with some child node (children should be of lower
* priority than the parent), this function
* tries modify the heap to restore the heap property.
*
* @param int $i node to consider in restoring the heap property
*/
function percolateDown($i)
{
$start_row = $this->getRow($i);
$count = $this->count;
$parent = $i;
$child = 2*$parent;
while ($child <= $count) {
$left_child_row = $this->getRow($child);
if($child < $count) { // this 'if' checks if there is a right child
$right_child_row = $this->getRow($child + 1);
if($this->compare($left_child_row[1], $right_child_row[1]) <0) {
$child++;
}
}
$child_row = $this->getRow($child);
if($this->compare($start_row[1], $child_row[1]) < 0) {
$this->putRow($parent, $child_row);
} else {
$this->putRow($parent, $start_row);
return;
}
$parent = $child;
$child = 2*$parent;
}
$this->putRow($parent, $start_row);
}
/**
* Computes the difference of the two values $value1 and $value2
*
* Which is subtracted from which is determined by whether this is
* a min_or_max priority queue
*
* @param float $value1 a value to take the difference between
* @param float $value2 the other value
* @return float the differences
*/
function compare($value1, $value2)
{
if($this->min_or_max == self::MIN) {
return $value2 - $value1;
} else {
return $value1 - $value2;
}
}
/**
* Gets the ith element of the PriorityQueue viewed as an array
*
* @param int $i element to get
* @return array value stored in queue together with its weight as a two
* element array
*/
function getRow($i)
{
$value_size = $this->value_size;
$weight_size = $this->weight_size;
$row = $this->get($i);
$value = substr($row, 0, $value_size);
$pre_weight = substr($row, $value_size, $weight_size);
$weight_array = unpack("f", $pre_weight);
$weight = $weight_array[1];
return array($value, $weight);
}
/**
* Add data to the $i row of the priority queue viewed as an array
* Calls the notifier associated with this queue about the change
* in data's location
*
* @param int $i location to add data
* @param array $row data to add (a two element array in the form
* key, float value).
*/
function putRow($i, $row)
{
$raw_data = $row[0].pack("f", $row[1]);
$this->put($i, $raw_data);
if($this->notifier != NULL) {
$this->notifier->notify($i, $row);
}
}
/**
* Computes and returns the weight of all items in prority queue
*
* @return float weight of all items stored in the priority queue
*/
function totalWeight()
{
$count = $this->count;
$total_weight = 0;
for($i = 1; $i <= $count; $i++) {
$row = $this->getRow($i);
$total_weight += $row[1];
}
return $total_weight;
}
}
?>