<?php
/*
$Id: 10_ows_bot.php 109 2007-09-27 04:49:13Z randomperson83 $
Obsessive Web Statistics
Copyright (C) 2007 Dustin Spicuzza <hide@address.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
TODO: Implement a filter that gives out information about bots
*/
/*
	Analysis plugin that tags log lines as coming from a bot.

	Detection is layered:
	  1. The user agent contains one of the substrings loaded from ows_bots.txt
	     (a "known" bot; the primary node is the bot's name from that file).
	  2. The user agent contains 'bot' or 'crawl' (an "unknown" bot; the primary
	     node is "Unknown: <agent>").
	  3. The client requested /robots.txt without matching 1 or 2. These hits are
	     queued and, in postAnalysis, every fact row from the same host within
	     +/- 10 minutes is re-pointed at the 'Unknown - robots.txt' bot row.
*/
class BotAnalysis implements iPlugin, iAnalysisPlugin {

	// queued robots.txt hits for the current round: array(Date, Time, Remote-Host)
	public $robot;

	// number of lines identified as bot traffic in the current round
	public $bot_visits;

	// map of user-agent substring => bot name; -1 until the list has been loaded
	public $bot_strings = -1;

	// bot_id of the 'Unknown - robots.txt' row for the current website, false if not yet known
	public $unknown_id = false;

	// whether the last line seen by getPrimaryNode was NOT a known bot; -1 before any line is seen
	public $last_is_unknown = -1;

	// website the plugin was last initialized for; -1 before the first initialization
	public $website = -1;

	// this should return a unique ID identifying the plugin, should start with an alpha,
	// should use basename instead of just __FILE__ otherwise it could expose path information
	public function getPluginId(){
		// __CLASS__ yields the same name as the argument-less get_class() call,
		// which is deprecated as of PHP 8.3
		return 'p' . md5(basename(__FILE__) . __CLASS__);
	}

	// returns an associative array describing the plugin
	public function getPluginInformation(){

		// automagically increment the revision number :)
		// (strips the '$' and 'Rev:' decoration from the SVN keyword, leaving the number)
		$revision = trim(str_replace('Rev:','',str_replace('$','','$Rev: 109 $')));

		return array(
			'author' => 'Dustin Spicuzza (OWS builtin)',
			'pluginName' => 'Bot Analysis',
			'version' => "1.0.$revision",
			'description' => 'Attempts to identify bots and tag them as such',
			'url' => 'http://obsessive.sourceforge.net/'
		);
	}

	// this function should return a set of arrays that define the dimensions
	// and attributes that this plugin defines. You should not specify an attribute
	// that another plugin defines. This is not website dependent.
	public function define_dimensions(){
		global $cfg;

		return array(
			'bot' => array(
				'pnode_is' => null,
				'bot' => attribute_defn('text',null,$cfg['indexsz_text']),
				'is_bot' => attribute_defn('boolean',null,true),
				'is_unknown' => attribute_defn('boolean',null,true)
			)
		);
	}

	/*
		Treat this like a constructor. This is called before all phases of
		analysis, and is only called once per website. It should be used to
		clean up website-specific variables. Like SQL id's.

		Returns true on success, false if the bot list could not be loaded.
	*/
	public function InitializeAnalysis($website){

		// this is quite simplistic, but it works -- load it only once
		// TODO: Incorporate list at robottxt.org... and better descriptions
		if (!is_array($this->bot_strings)){
			$bots = $this->get_bots();
			if (!is_array($bots)){
				// leave the plugin in its "not loaded" state so that a later call
				// retries instead of silently running without a bot list
				$this->bot_strings = -1;
				return false;
			}
			$this->bot_strings = $bots;
		}

		if ($this->website != $website){
			$this->website = $website;

			// this needs to be reset for each website
			$this->unknown_id = false;
		}

		return true;
	}

	/*
		This function is called before the round of analysis starts

		$ids		An array of all the current ID's for every dimension. If you
					insert new rows into the dimension table, you MUST use and increment
					the ID in the appropriate dimension.
	*/
	public function preAnalysis($website,&$ids){
		// reset all per-round state
		$this->robot = array();
		$this->bot_visits = 0;
		$this->last_is_unknown = -1;
		return true;
	}

	/*
		This function is called for each line grabbed from the logfile.

		$website	The current website being worked on
		$dimension	This parameter defines which dimension is being analyzed
					at the moment. The function may be called multiple times
					for a row with different dimensions as arguments.
		$line		This contains an array of data that was retrieved
					from the logfile.

		This function should return an item that is the 'primary node' of the
		dimension (defined as an attribute with the same name as the dimension).
		You should return false if you do not define a primary node for that dimension,
		or if there is an error.

		Here the primary node is the bot name, "Unknown: <agent>" for an
		unrecognized bot-like agent, or '' when the line is not considered a bot.
	*/
	public function getPrimaryNode($website, $dimension, $line){

		if (array_key_exists('User-Agent',$line))
			$agent = (string)$line['User-Agent'];
		else if (array_key_exists('UD-User-Agent',$line))
			$agent = urldecode((string)$line['UD-User-Agent']);
		else
			$agent = '';	// no agent recorded; only the robots.txt check can apply

		$this->last_is_unknown = false;

		// the detection will be twofold:
		//		ID any host addresses that visited robots.txt, and exclude them for a period of 10 minutes (that visit)
		//		Anything that has a 'bot' in its agent name
		//			-- specific strings will be flagged as particular types of bots

		// see if the string is in there?
		if (is_array($this->bot_strings)){
			foreach ($this->bot_strings as $needle => $bot_name){
				// numeric-looking keys are stored as ints by PHP; cast back so that
				// stripos never interprets the needle as a character code
				if (stripos($agent,(string)$needle) !== false){
					$this->bot_visits += 1;
					return $bot_name;
				}
			}
		}

		$this->last_is_unknown = true;

		// try unknowns
		if (stripos($agent,'bot') !== false || stripos($agent,'crawl') !== false){
			$this->bot_visits += 1;
			return "Unknown: " . $agent;
		}

		$hit_robots =
			(array_key_exists('Request',$line) && substr($line['Request'],0,11) == '/robots.txt') ||
			(array_key_exists('Request-Path',$line) && $line['Request-Path'] == '/robots.txt');

		if ($hit_robots){
			// don't know what this is, but we couldn't identify it and it hit
			// robots.txt, so deal with it later
			$this->robot[] = array($line['Date'],$line['Time'],$line['Remote-Host']);
		}

		return '';
	}

	/*
		This function is called for each line grabbed from the logfile.

		$website	The current website being worked on
		$dimension	This parameter defines which dimension is being analyzed
					at the moment. The function may be called multiple times
					for a row with different dimensions as arguments.
		$pnode		This contains the primary node of the dimension.

		The plugin should only return attributes that are defined in the
		define_dimensions function. This function should return an array
		in the form of

			array('attribute' => 'value', ...)

		which defines values to be uploaded to the SQL database. It should NOT return
		the primary node. Please note that the returned values can be cached, so this
		function may NOT always be called for each row.

		You should ALWAYS return an array with the same keys each time, in the same order
		that the keys were defined in define_dimensions. Otherwise, things will break.

		If you do not define any attributes in the current dimension, or if there is
		an error, then return false.
	*/
	public function getAttributes($website, $dimension, $pnode){

		// an empty primary node means "not a bot"
		if ($pnode == '')
			return array('is_bot' => false, 'is_unknown' => false);

		// check to see if we already analyzed the primary node.
		// The comparison must be strict: true != -1 is false under loose
		// comparison, which would skip this branch for every unknown bot.
		if ($this->last_is_unknown !== -1)
			return array('is_bot' => true, 'is_unknown' => $this->last_is_unknown);

		// no line analyzed yet this round: a bot is unknown exactly when its name
		// is not one of the names loaded from the bot list
		$known = is_array($this->bot_strings) && in_array($pnode,$this->bot_strings,true);
		return array('is_bot' => true, 'is_unknown' => !$known);
	}

	/*
		this function is called after the round of analysis is complete. Called many times.

		$ids		An array of all the current ID's for every dimension. If you
					insert new rows into the dimension table, you MUST use and increment
					the ID in the appropriate dimension. $ids[$dimensionname] is how it would
					be referenced.

		Re-tags fact rows belonging to hosts that fetched /robots.txt (queued by
		getPrimaryNode) as the 'Unknown - robots.txt' bot, for a window of
		10 minutes either side of the robots.txt request.

		Returns true on success; on failure returns false or the result of show_error().
	*/
	public function postAnalysis($website,&$ids){

		// table names are derived from the website name with dots replaced
		$table = str_replace('.','_',$website);

		$s_fact = db_escape_string($table);
		$s_bot = db_escape_string($table . '_bot');
		$s_date = db_escape_string($table . '_date');
		$s_time = db_escape_string($table . '_time');
		$s_host = db_escape_string($table . '_host');

		// get the ID of the unknown bot, or create it if it doesn't already exist
		if ($this->unknown_id === false){

			$result = db_query("SELECT bot_id FROM $s_bot WHERE bot = 'Unknown - robots.txt'");

			if (!db_is_valid_result($result))
				return show_error("Could not find Unknown bot ID!");

			if (db_has_rows($result)){
				$row = db_fetch_row($result);
				$this->unknown_id = $row[0];
			}else{
				// claim the next free ID in the bot dimension, as required by the $ids contract
				$id = $ids['bot']++;

				if (!db_is_valid_result(db_query("INSERT INTO $s_bot (bot_id,bot,is_bot,is_unknown) VALUES ($id,'Unknown - robots.txt',TRUE,TRUE)")))
					return show_error("Could not insert unknown bot!");

				if (($this->unknown_id = db_get_last_id($s_bot,'bot_id')) === false)
					return show_error("Could not successfully retrieve the ID of the unknown bot!");
			}
		}

		// common prefix of every update; the host value and the date/time window are appended per entry
		$sql = "UPDATE $s_fact INNER JOIN $s_date ON $s_date.date_id = $s_fact.date_id INNER JOIN $s_time ON $s_time.time_id = $s_fact.time_id INNER JOIN $s_host ON $s_host.host_id = $s_fact.host_id SET $s_fact.bot_id = $this->unknown_id WHERE $s_host.host =";

		//echo "\nFound $this->bot_visits bot hits. Now eliminating " . count($this->robot) . " hits on robots.txt...\n";
		//$this->bot_visits = 0;

		$max = count($this->robot);

		// done? ok, now do another query for the items in $this->robot
		foreach ($this->robot as $bot){

			// convert time to timestamp
			$ts = strtotime(str_replace('/','',"$bot[0] $bot[1]"));
			if ($ts === false || $ts == -1)
				return show_error("Could not convert timestamp of \"$bot[0] $bot[1]\"");

			// lets just assume these work for the sake of argument...
			$window_start = strtotime("-10 minutes", $ts);
			$window_end = strtotime("+10 minutes", $ts);

			$date1 = date("Y-m-d",$window_start);
			$date2 = date("Y-m-d",$window_end);
			$time1 = date("H:i:s",$window_start);
			$time2 = date("H:i:s",$window_end);

			if ($date1 == $date2){
				$range = "$s_date.date = '$date1' AND $s_time.time >= '$time1' AND $s_time.time <= '$time2'";
			}else{
				// the window crosses midnight: the time bounds apply to different days,
				// so a single "time >= t1 AND time <= t2" test could never match
				$range = "(($s_date.date = '$date1' AND $s_time.time >= '$time1') OR ($s_date.date = '$date2' AND $s_time.time <= '$time2'))";
			}

			// do the update on the selected range
			if (!db_is_valid_result(db_query("$sql '" . db_escape_string($bot[2]) . "' AND $range")))
				return false;

			// NOTE(review): show_progress is defined elsewhere; presumably it reports
			// progress in steps of 10000 -- confirm against its definition
			if ($max > 10000)
				show_progress('robots.txt Analysis',10000);
		}

		//echo "Done, found $this->bot_visits bot hits using that method.\n\n";

		return true;
	}

	/*
		grabs list of bots from a file

		Reads ows_bots.txt from the directory of this file. Lines whose first
		non-blank character is ';' are comments. Every other line is split on tabs;
		the first non-empty field is the user-agent substring and the second
		non-empty field is the bot name. Lines with fewer than two non-empty
		fields are ignored.

		Returns an array of substring => bot name, or the result of show_error()
		if the file is missing, unreadable or empty.
	*/
	public function get_bots(){

		$file = dirname(__FILE__) . '/ows_bots.txt';

		if (!file_exists($file) || !($lines = file($file)))
			return show_error("Bot list $file not found!");

		$bots = array();

		foreach ($lines as $line){

			$line = trim($line);

			if ($line === '' || $line[0] == ';')
				continue;

			// explode() replaces split(), which was removed in PHP 7
			$key = null;
			foreach (explode("\t",$line) as $field){

				$field = trim($field);
				if ($field === '')
					continue;

				if ($key === null){
					$key = $field;
				}else{
					$bots[$key] = $field;
					break;
				}
			}
		}

		return $bots;
	}
}
// Create the plugin instance at include time and pass it to register_plugin()
// under the 'analysis' type. register_plugin() is defined elsewhere;
// presumably it adds the instance to the host application's plugin list.
$bot_analysis = new BotAnalysis();
register_plugin('analysis',$bot_analysis);
?>