<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009, 2010, 2011 Chris Pollett hide@address.com
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Chris Pollett hide@address.com
* @package seek_quarry
* @subpackage bin
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009, 2010, 2011
* @filesource
*/
if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/** Calculate base directory of script @ignore*/
define("BASE_DIR", substr(
dirname(realpath($_SERVER['PHP_SELF'])), 0,
-strlen("/bin")));
/** THis tool does not need logging*/
define("LOG_TO_FILES", false);
/** Load in global configuration settings */
require_once BASE_DIR.'/configs/config.php';
if(!PROFILE) {
echo "Please configure the search engine instance by visiting" .
"its web interface on localhost.\n";
exit();
}
/** NO_CACHE means don't try to use memcache*/
define("NO_CACHE", true);
/** USE_CACHE false rules out file cache as well*/
define("USE_CACHE", false);
/** Load the class that maintains our URL queue */
require_once BASE_DIR."/lib/web_queue_bundle.php";
/** Load word->{array of docs with word} index class */
require_once BASE_DIR."/lib/index_archive_bundle.php";
/** Used for manipulating urls*/
require_once BASE_DIR."/lib/url_parser.php";
/** For crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/** Get the database library based on the current database type */
require_once BASE_DIR."/models/datasources/".DBMS."_manager.php";
/** Loads common constants for web crawling*/
require_once BASE_DIR."/lib/crawl_constants.php";
/*
* We'll set up multi-byte string handling to use UTF-8
*/
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/**
* Command line program that allows one to examine the content of
* the WebArchiveBundles and IndexArchiveBundles of Yioop crawls.
* For now it supports returning header information about bundles,
* as well as pretty printing the page/summary contents of the bundle.
*
* The former can be gotten from a bundle by running arc_tool with a
* command like:
* php arc_tool.php info bundle_name
*
* The latter can be gotten from a bundle by running arc_tool with a
* command like:
* php arc_tool.php list bundle_name start_doc_num num_results
*
* @author Chris Pollett
* @package seek_quarry
*/
class ArcTool implements CrawlConstants
{
/**
* The maximum number of documents the arc_tool list function
* will read into memory in one go.
*/
const MAX_BUFFER_DOCS = 200;
/**
* Initializes the ArcTool, for now does nothing
*/
function __construct()
{
}
/**
* Runs the ArcTool on the supplied command line arguments
*/
function start()
{
global $argv;
if(!isset($argv[1]) || (!isset($argv[2]) && $argv[1] != "list")) {
$this->usageMessageAndExit();
}
if($argv[1] != "list") {
$path = $bundle_name = UrlParser::getDocumentFilename($argv[2]);
if($path == $argv[2] && !file_exists($path)) {
$path = CRAWL_DIR."/cache/".$path;
}
}
switch($argv[1])
{
case "list":
$this->outputArchiveList();
break;
case "info":
$this->outputInfo($path);
break;
case "reindex":
$this->reindexIndexArchive($path);
break;
case "show":
if(!isset($argv[3])) {
$this->usageMessageAndExit();
}
$this->outputShowPages($path, $argv[3], $argv[4]);
break;
default:
$this->usageMessageAndExit();
}
}
/**
* Lists the Web or IndexArchives in the crawl directory
*/
function outputArchiveList()
{
$pattern = CRAWL_DIR."/cache/{".self::archive_base_name.",".
self::index_data_base_name."}*";
$archives = glob($pattern, GLOB_BRACE);
if(is_array($archives)) {
foreach($archives as $archive_name) {
echo UrlParser::getDocumentFilename($archive_name)."\n";
}
} else {
echo "No archives currently in crawl directory \n";
}
}
/**
* Determines whether the supplied name is a WebArchiveBundle or
* an IndexArchiveBundle. Then outputsto stdout header information about the
* bundle by calling the appropriate sub-function.
*
* @param string $archive_name the name of a directory that holds
* WebArchiveBundle or IndexArchiveBundle data
*/
function outputInfo($archive_name)
{
$bundle_name = UrlParser::getDocumentFilename($archive_name);
echo "Bundle Name: ".$bundle_name."\n";
$archive_type = $this->getArchiveKind($archive_name);
echo "Bundle Type: ".$archive_type."\n";
if($archive_type === false) {
$this->badFormatMessageAndExit($archive_name);
}
$call = "outputInfo".$archive_type;
$info = $archive_type::getArchiveInfo($archive_name);
$this->$call($info, $archive_name);
}
/**
*
*/
function reindexIndexArchive($path)
{
if($this->getArchiveKind($path) != "IndexArchiveBundle") {
echo "\n$path ...\n".
" is not an IndexArchiveBundle so cannot be re-indexed\n\n";
exit();
}
$shards = glob($path."/posting_doc_shards/index*");
if(is_array($shards)) {
$dbms_manager = DBMS."Manager";
$db = new $dbms_manager();
$db->unlinkRecursive($path."/dictionary", false);
IndexDictionary::makePrefixLetters($path."/dictionary");
$dictionary = new IndexDictionary($path."/dictionary");
$max_generation = 0;
foreach($shards as $shard_name) {
$file_name = UrlParser::getDocumentFilename($shard_name);
$generation = (int)substr($file_name, strlen("index"));
$max_generation = max($max_generation, $generation);
}
for($i = 0; $i < $max_generation + 1; $i++) {
$shard_name = $path."/posting_doc_shards/index$i";
echo "\nShard $i\n";
$shard = new IndexShard($shard_name, $i,
NUM_DOCS_PER_GENERATION, true);
$dictionary->addShardDictionary($shard);
}
echo "\nFinal Merge Tiers\n";
$dictionary->mergeAllTiers();
echo "\nReindex complete!!\n";
} else {
echo "\n$path ...\n".
" does not contain posting shards so cannot be re-indexed\n\n";
}
}
/**
* Outputs to stdout header information for a IndexArchiveBundle
* bundle.
*
* @param array $info header info that has already been read from
* the description.txt file
* @param string $archive_name the name of the folder containing the bundle
*/
function outputInfoIndexArchiveBundle($info, $archive_name)
{
$more_info = unserialize($info['DESCRIPTION']);
unset($info['DESCRIPTION']);
$info = array_merge($info, $more_info);
echo "Description: ".$info['DESCRIPTION']."\n";
$generation_info = unserialize(
file_get_contents("$archive_name/generation.txt"));
$num_generations = $generation_info['ACTIVE']+1;
echo "Number of generations: ".$num_generations."\n";
echo "Number of stored links and documents: ".$info['COUNT']."\n";
echo "Number of stored documents: ".$info['VISITED_URLS_COUNT']."\n";
$crawl_order = ($info[self::CRAWL_ORDER] == self::BREADTH_FIRST) ?
"Bread First" : "Page Importance";
echo "Crawl order was: $crawl_order\n";
echo "Seed sites:\n";
foreach($info[self::TO_CRAWL] as $seed) {
echo " $seed\n";
}
if($info[self::RESTRICT_SITES_BY_URL]) {
echo "Sites allowed to crawl:\n";
foreach($info[self::ALLOWED_SITES] as $site) {
echo " $site\n";
}
}
echo "Sites not allowed to be crawled:\n";
if(is_array($info[self::DISALLOWED_SITES])) {
foreach($info[self::DISALLOWED_SITES] as $site) {
echo " $site\n";
}
}
echo "Meta Words:\n";
foreach($info[self::META_WORDS] as $word) {
echo " $word\n";
}
echo "\n";
}
/**
* Outputs to stdout header information for a WebArchiveBundle
* bundle.
*
* @param array $info header info that has already been read from
* the description.txt file
* @param string $archive_name the name of the folder containing the bundle
*/
function outputInfoWebArchiveBundle($info, $archive_name)
{
echo "Description: ".$info['DESCRIPTION']."\n";
echo "Number of stored documents: ".$info['COUNT']."\n";
echo "Maximum Number of documents per partition: ".
$info['NUM_DOCS_PER_PARTITION']."\n";
echo "Number of partitions: ".
($info['WRITE_PARTITION']+1)."\n";
echo "\n";
}
/**
* Used to list out the pages/summaries stored in a bundle
* $archive_name. It lists to stdout $num many documents starting at $start.
*
* @param string $archive_name name of bundle to list documents for
* @param int $start first document to list
* @param int $num number of documents to list
*/
function outputShowPages($archive_name, $start, $num)
{
$fields_to_print = array(
self::URL => "URL",
self::IP_ADDRESSES => "IP ADDRESSES",
self::TIMESTAMP => "DATE",
self::HTTP_CODE => "HTTP RESPONSE CODE",
self::TYPE => "MIMETYPE",
self::ENCODING => "CHARACTER ENCODING",
self::DESCRIPTION => "DESCRIPTION",
self::PAGE => "PAGE DATA");
$archive_type = $this->getArchiveKind($archive_name);
if($archive_type === false) {
$this->badFormatMessageAndExit($archive_name);
}
$info = $archive_type::getArchiveInfo($archive_name);
$num = min($num, $info["COUNT"] - $start);
if($archive_type == "IndexArchiveBundle") {
$generation_info = unserialize(
file_get_contents("$archive_name/generation.txt"));
$num_generations = $generation_info['ACTIVE']+1;
$archive = new WebArchiveBundle($archive_name."/summaries");
} else {
$num_generations = $info["WRITE_PARTITION"]+1;
$archive = new WebArchiveBundle($archive_name);
}
$num = max($num, 0);
$total = $start + $num;
$seen = 0;
$generation = 0;
while($seen < $total && $generation < $num_generations) {
$partition = $archive->getPartition($generation, false);
if($partition->count < $start && $seen < $start) {
$generation++;
$seen += $partition->count;
continue;
}
$seen_generation = 0;
while($seen < $total && $seen_generation < $partition->count) {
$num_to_get = min($total - $seen,
$partition->count - $seen_generation,
self::MAX_BUFFER_DOCS);
$objects = $partition->nextObjects($num_to_get);
$seen += $num_to_get;
$seen_generation += $num_to_get;
if($seen > $start) {
$num_to_show = min($seen - $start, $num_to_get);
$cnt = 0;
$first = $num_to_get - $num_to_show;
foreach($objects as $object) {
if($cnt >= $first) {
$out = "";
if(isset($object[1][self::TIMESTAMP])) {
$object[1][self::TIMESTAMP] =
date("r", $object[1][self::TIMESTAMP]);
}
foreach($fields_to_print as $key => $name) {
if(isset($object[1][$key])) {
$out .= "[$name]\n";
if($key != self::IP_ADDRESSES) {
$out .= $object[1][$key]."\n";
} else {
foreach($object[1][$key] as $address) {
$out .= $address."\n";
}
}
}
}
$out .= "==========\n\n";
echo "BEGIN ITEM, LENGTH:".strlen($out)."\n";
echo $out;
}
$cnt++;
}
}
}
$generation++;
}
}
/**
* Given a folder name, determines the kind of bundle (if any) it holds.
* It does this based on the expected location of the description.txt file.
*
* @param string $archive_name the name of folder
* @return string the archive bundle type, either: WebArchiveBundle or
* IndexArchiveBundle
*/
function getArchiveKind($archive_name)
{
if(file_exists("$archive_name/description.txt")) {
return "WebArchiveBundle";
}
if(file_exists("$archive_name/summaries/description.txt")) {
return "IndexArchiveBundle";
}
return false;
}
/**
* Outputs the "hey, this isn't a known bundle message" and then exit()'s.
*/
function badFormatMessageAndExit($archive_name)
{
echo "$archive_name does not appear to be a web or index ".
"archive bundle\n";
exit();
}
/**
* Outputs the "how to use this tool message" and then exit()'s.
*/
function usageMessageAndExit()
{
echo "\narc_tool is used to look at the contents of\n";
echo "WebArchiveBundles and IndexArchiveBundles.\n";
echo "It will look for these using the path provided or \n";
echo "will check in the Yioop! crawl directory as a fall back\n\n";
echo "The available commands for arc_tool are:\n\n";
echo "php arc_tool.php list //returns a list \n".
"//of all the archives in the Yioop! crawl directory.\n\n";
echo "php arc_tool.php info bundle_name //return info about\n".
"//documents stored in archive.\n\n";
echo "php arc_tool.php show bundle_name start num //outputs\n".
"//items start through num from bundle_name\n\n";
echo "php arc_tool.php reindex bundle_name \n".
"//reindex the word dictionary in bundle_name\n";
exit();
}
}
$arc_tool = new ArcTool();
$arc_tool->start();
?>