Location: PHPKode > projects > Obsessive Website Statistics > ows/plugins/10_ows_bot.php
<?php
/*
	$Id: 10_ows_bot.php 109 2007-09-27 04:49:13Z randomperson83 $

	Obsessive Web Statistics
    Copyright (C) 2007 Dustin Spicuzza <hide@address.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
	
	
	TODO: Implement a filter that gives out information about bots

*/

/*
	Analysis plugin that tags log lines coming from bots.

	A request is attributed to a bot when:
		- its user agent contains one of the signatures loaded from ows_bots.txt
		  (a "known" bot, reported under the name from that file), or
		- its user agent contains 'bot' or 'crawl' (reported as "Unknown: <agent>"), or
		- it could not be identified but requested /robots.txt; those hits are
		  remembered and, in postAnalysis, every fact row from the same host within
		  +/- 10 minutes is re-pointed at the 'Unknown - robots.txt' bot row.
*/
class BotAnalysis implements iPlugin, iAnalysisPlugin {

	// list of array(date, time, remote host) for unidentified clients that requested /robots.txt
	public $robot = array();
	// number of lines attributed to a bot by getPrimaryNode; only incremented, never read in this class
	public $bot_visits = 0;
	// map of user-agent signature => bot name; -1 until the list has been loaded
	public $bot_strings = -1;
	// bot_id of the 'Unknown - robots.txt' row for the current website; false until looked up
	public $unknown_id = false;
	// -1 until getPrimaryNode has run in the current round, afterwards a boolean
	public $last_is_unknown = -1;
	// website passed to the last successful InitializeAnalysis call; -1 if none yet
	public $website = -1;

	/*
		Returns a unique ID identifying the plugin. It starts with an alpha and is
		built from basename(__FILE__) rather than __FILE__, because the full path
		could expose path information.
	*/
	public function getPluginId(){
		// __CLASS__ yields the same name the argument-less get_class() did, without
		// relying on a call form that newer PHP versions deprecate
		return 'p'. md5(basename(__FILE__) . __CLASS__);
	}


	// returns an associative array describing the plugin
	public function getPluginInformation(){
		// the revision number comes from the version-control keyword, so it
		// increments automatically
		$revision = trim(str_replace('Rev:','',str_replace('$','','$Rev: 109 $')));
		return array(
			'author' => 'Dustin Spicuzza (OWS builtin)',
			'pluginName' => 'Bot Analysis',
			'version' => "1.0.$revision",
			'description' => 'Attempts to identify bots and tag them as such',
			'url' => 'http://obsessive.sourceforge.net/'
		);
	}
	
	/*
		Returns the dimensions and attributes this plugin defines. A plugin should
		not specify an attribute that another plugin defines. This is not website
		dependent.
		
		Defines one dimension, 'bot', with the attributes 'bot' (primary node),
		'is_bot' and 'is_unknown'.
	*/
	public function define_dimensions(){
	
		global $cfg;
	
		return array(
			'bot' => array(
				'pnode_is' => null,
				'bot' => attribute_defn('text',null,$cfg['indexsz_text']),
				'is_bot' => attribute_defn('boolean',null,true),
				'is_unknown' => attribute_defn('boolean',null,true)
			)
		);
	}
	
	/*
		Treat this like a constructor. This is called before all phases of
		analysis, and is only called once per website. It should be used to
		clean up website-specific variables. Like SQL id's. 
		
		Returns true on success, false if the bot list could not be loaded.
	*/
	public function InitializeAnalysis($website){
		if ($this->website !== $website){
			
			// this is quite simplistic, but it works -- load it only once
			// TODO: Incorporate list at robottxt.org... and better descriptions
			if (!is_array($this->bot_strings)){
				$bots = $this->get_bots();
				
				// leave the state untouched on failure, so a later call retries
				// instead of reporting success with an unusable bot list
				if (!is_array($bots))
					return false;
				
				$this->bot_strings = $bots;
			}
			
			$this->website = $website;
			
			// this needs to be reset for each website
			$this->unknown_id = false;
		}
		
		return true;
	}
	

	/* 
		This function is called before the round of analysis starts. It resets
		the per-round state (remembered robots.txt hits, hit counter and the
		known/unknown flag).
	
		$ids		An array of all the current ID's for every dimension. If you
					insert new rows into the dimension table, you MUST use and increment
					the ID in the appropriate dimension. Not used here.
	*/
	public function preAnalysis($website,&$ids){
	
		$this->robot = array();
		$this->bot_visits = 0;
		$this->last_is_unknown = -1;
	
		return true;
	}
	

	/*
		This function is called for each line grabbed from the logfile. 
		
		$website		The current website being worked on
		
		$dimension  	This parameter defines which dimension is being analyzed
						at the moment. The function may be called multiple times
						for a row with different dimensions as arguments.
		
		$line			This contains an array of data that was retrieved
						from the logfile.
		
		This function should return an item that is the 'primary node' of the
		dimension (defined as an attribute with the same name as the dimension). 
		
		Returns the bot name for a known signature, "Unknown: <agent>" when the
		agent contains 'bot' or 'crawl', and '' when the line is not attributed
		to a bot. Also records whether the match was a known one, for getAttributes.
	*/
	public function getPrimaryNode($website, $dimension, $line){

		// prefer the plain header, fall back to the url-encoded one; a line
		// carrying neither is treated as having an empty agent
		if (array_key_exists('User-Agent',$line))
			$agent = $line['User-Agent'];
		else if (array_key_exists('UD-User-Agent',$line))
			$agent = urldecode($line['UD-User-Agent']);
		else
			$agent = '';
	
		$this->last_is_unknown = false;
		
		// the detection will be twofold: 
		//		ID any host addresses that visited robots.txt, and exclude them for a period of 10 minutes (that visit)
		//		Anything that has a 'bot' in its agent name
		//			-- specific strings will be flagged as particular types of bots
	
		// first look for one of the known signatures (case-insensitive)
		foreach ($this->bot_strings as $signature => $name){
			// array keys that look numeric are stored as integers, so force a
			// string needle to get a textual match
			if (stripos($agent,(string)$signature) !== false){
				$this->bot_visits += 1;
				return $name;
			}
		}
	
		$this->last_is_unknown = true;
	
		// try unknowns
		if (stripos($agent,'bot') !== false || stripos($agent,'crawl') !== false){
			$this->bot_visits += 1;
			return "Unknown: " . $agent;
		}
		
		if ((array_key_exists('Request',$line) && substr($line['Request'],0,11) == '/robots.txt') ||
			(array_key_exists('Request-Path',$line) &&  $line['Request-Path'] == '/robots.txt')){
			
			// don't know what this is, but we couldn't identify it and it hit
			// robots.txt, so deal with it later
			$this->robot[] = array($line['Date'],$line['Time'],$line['Remote-Host']);
		}
		
		return '';
	}
	
	/*
		This function is called for each line grabbed from the logfile. 
		
		$website		The current website being worked on
		
		$dimension  	This parameter defines which dimension is being analyzed
						at the moment. The function may be called multiple times
						for a row with different dimensions as arguments.
		
		$pnode			This contains the primary node of the dimension.
	
		The plugin should only return attributes that are defined in the
		define_dimensions function. This function should return an array 
		in the form of
			
			array('attribute' => 'value', ...)
			
		which defines values to be uploaded to the SQL database. It should NOT return
		the primary node. Please note that the returned values can be cached, so this 
		function may NOT always be called for each row. 
		
		You should ALWAYS return an array with the same keys each time, in the same order
		that the keys were defined in define_dimensions. Otherwise, things will break.
		
		Returns array('is_bot' => ..., 'is_unknown' => ...).
	*/
	public function getAttributes($website, $dimension, $pnode){
	
		if ($pnode == '')
			return array('is_bot' => false, 'is_unknown' => false);
		
		// check to see if we already analyzed the primary node. The comparison has
		// to be strict: with a loose one, true compares equal to -1, so an unknown
		// bot would be mistaken for "not analyzed yet"
		if ($this->last_is_unknown !== -1)
			return array('is_bot' => true, 'is_unknown' => $this->last_is_unknown);
		
		// not analyzed in this round: the bot is known exactly when its name is
		// one of the names in the loaded list
		$known = is_array($this->bot_strings) && in_array($pnode,$this->bot_strings,true);
		
		return array('is_bot' => true, 'is_unknown' => !$known);
	}
	
	
	/* 
		this function is called after the round of analysis is complete. Called many times.
		
		Re-points fact rows at the 'Unknown - robots.txt' bot row (created on first
		use) for every host that requested /robots.txt without being identified,
		covering 10 minutes before and after that request.
	
		$ids		An array of all the current ID's for every dimension. If you
					insert new rows into the dimension table, you MUST use and increment
					the ID in the appropriate dimension. $ids[$dimensionname] is how it would
					be referenced.
					
		Returns true on success; on failure returns false or the result of show_error().
	*/
	public function postAnalysis($website,&$ids){
	
		// table names are derived from the website name with dots replaced
		$table = str_replace('.','_',$website);
		$s_fact = db_escape_string($table);
		$s_bot = db_escape_string($table . '_bot');
		$s_date = db_escape_string($table . '_date');
		$s_time = db_escape_string($table . '_time');
		$s_host = db_escape_string($table . '_host');
	
		// get the ID of the unknown bot, or create it if it doesn't already exist
		if ($this->unknown_id === false){
			$result = db_query("SELECT bot_id FROM $s_bot WHERE bot = 'Unknown - robots.txt'");
			
			if (!db_is_valid_result($result))
				return show_error("Could not find Unknown bot ID!");
			
			else if (db_has_rows($result)){
				$row = db_fetch_row($result);
				$this->unknown_id = $row[0];
			
			}else{
			
				// take the next free ID of the bot dimension, as required by the $ids contract
				$id = $ids['bot']++;
			
				if (!db_is_valid_result(db_query("INSERT INTO $s_bot (bot_id,bot,is_bot,is_unknown) VALUES ($id,'Unknown - robots.txt',TRUE,TRUE)")))
					return show_error("Could not insert unknown bot!");
					
				if (($this->unknown_id = db_get_last_id($s_bot,'bot_id')) === false)
					return show_error("Could not successfully retrieve the ID of the unknown bot!");
			}
		}
		
		// common part of the update; the host value and the time window are appended per hit
		$sql = "UPDATE $s_fact INNER JOIN $s_date ON $s_date.date_id = $s_fact.date_id INNER JOIN $s_time ON $s_time.time_id = $s_fact.time_id INNER JOIN $s_host ON $s_host.host_id = $s_fact.host_id SET $s_fact.bot_id = $this->unknown_id WHERE $s_host.host =";
	
		$max = count($this->robot);
		
		// now do another query for each of the items in $this->robot
		foreach ($this->robot as $bot){
		
			// convert time to timestamp
			$ts = strtotime(str_replace('/','',"$bot[0] $bot[1]"));
			if ($ts === false || $ts == -1)
				return show_error("Could not convert timestamp of \"$bot[0] $bot[1]\"");
			
			
			// lets just assume these work for the sake of argument...
			$from = strtotime("-10 minutes", $ts);
			$to = strtotime("+10 minutes", $ts);
			
			$date1 = date("Y-m-d",$from);
			$date2 = date("Y-m-d",$to);
			$time1 = date("H:i:s",$from);
			$time2 = date("H:i:s",$to);
			
			if ($date1 == $date2)
				$window = "$s_date.date = '$date1' AND $s_time.time >= '$time1' AND $s_time.time <= '$time2'";
			else
				// the window crosses midnight: the end time is smaller than the start
				// time, so a single time range would match nothing. Take the tail of
				// the first day and the head of the second one instead.
				$window = "(($s_date.date = '$date1' AND $s_time.time >= '$time1') OR ($s_date.date = '$date2' AND $s_time.time <= '$time2'))";
			
			// do the update on the selected range
			if (!db_is_valid_result(db_query("$sql '" . db_escape_string($bot[2]) . "' AND $window")))
				return false;
	
			if ($max > 10000)
				show_progress('robots.txt Analysis',10000);
		
		}
		
		return true;
	}

	
	/*
		Grabs the list of bots from ows_bots.txt, located next to this file.
		
		Lines starting with ';' are skipped. Every other line is split on tabs;
		the first non-empty field is the user-agent signature and the second 
		non-empty field is the bot name. Lines with fewer than two non-empty
		fields are ignored.
		
		Returns array(signature => name), or the result of show_error() if the
		file is missing, unreadable or empty.
	*/
	public function get_bots(){
	
		$file = dirname(__FILE__) . '/ows_bots.txt';
		if (!file_exists($file) || !($lines = file($file)))
			return show_error("Bot list $file not found!");
		
		$bots = array();
		
		foreach ($lines as $line){
			
			$line = trim($line);
			
			if ($line != '' && $line[0] == ';')
				continue;
			
			// explode() is used because the regex-based split() no longer exists
			// in current PHP; for a literal tab both split the same way
			$fields = array();
			foreach (explode("\t",$line) as $part){
				$part = trim($part);
				if ($part !== '')
					$fields[] = $part;
				
				// only the first two non-empty fields matter
				if (count($fields) == 2)
					break;
			}
			
			if (count($fields) == 2)
				$bots[$fields[0]] = $fields[1];
		}
		
		return $bots;
	}
}

// Create the plugin instance and pass it to register_plugin() under the
// 'analysis' category. The instance stays available as $bot_analysis.
$bot_analysis = new BotAnalysis();
register_plugin('analysis',$bot_analysis);

?>
Return current item: Obsessive Website Statistics