Location: PHPKode > scripts > Bot recognizer and dispatcher > bot-recognizer-and-dispatcher/bot_recognizer.php
<?PHP
/**
* @name bot_recognizer.php :
* Class for defining if request comes from search/indexing bot
* and performing some action depending on bot name.
* @Author Alexander Selifonov <alex (at) selifan [dot] ru>
* @Copyright 2009 Alexander Selifonov
* @link http://www.selifan.ru
* @link http://www.phpclasses.org/browse/author/267915.html
* @Version 1.00.001
* @license http://www.opensource.org/licenses/bsd-license.php
* PHP required version : 5.x
* Last modified: 28.08.2009
*/

/**
* Recognizes search/indexing bots by client IP address and/or User-Agent
* substring and dispatches registered callbacks for the recognized bot.
*
* Bot definitions live either in an SQL table (when a DB object is passed to
* the constructor - a "CDBEngine" wrapper or a Zend_Db-like adapter) or in
* memory (loaded from a '|'-delimited text file).
*/
class CBotRecognizer {

  const VERSION = '1.00';
  const SEARCH_IP_ONLY = 0;
  const SEARCH_IP_OR_AGENT = 1; # default search mode - both IP and user agent used
  const SEARCH_AGENT_ONLY = 2;

  const UNDEFINED_BOT = -1;
  const MALICIOUS_BOT = -100;

  private $table_prefix = 'botrec_';
  private $searchmode = 1; # search method
  private $_callbacks = array();
  private $_type_callbacks = array();
  private $_dbobject = false;
  private $_dbobjclass = '';
  private $_botdata = array(); # if engine is 'file', bot data will be loaded into this array
  private $_botdefs_file = '';
  private $_verbose = false;
  private $_debugip;
  private $_debugagent;
  private $_result = null; # cached GetBotId() verdict, null = not computed yet
  private $addedcnt = 0;
  private $errormessage = '';
  private $botwords = array('bot','crawl','spider'); # words for identifying bot by UA substring
  private $bot_malicious = false; # becomes true if found bot is checked as malicious in our database
  private $bottype = 0; # what kind is this bot of (0-unspecified, 1-indexing, 2-email harvesting, etc...
  private $malicious_handler = '';
  private $register_suspects = false; # for future functionality
  private $_worktime = array(); # time interval when dispatching is active, example : array('03:00', '05:00')

  /**
  * Constructor.
  *
  * @param array $params optional settings:
  *   'tableprefix' - prefix for SQL table names,
  *   'dbobject'    - DB access object (switches the class to the SQL engine),
  *   'searchmode'  - one of the SEARCH_* constants,
  *   'verbose'     - debug output level,
  *   'sourcefile'  - bot definitions file (used when no DB object is given),
  *   'worktime'    - "HH:MM-HH:MM" string (delimiters: - , ;) or array(from, to)
  */
  public function __construct($params=array()) {
    $this->_botdefs_file = dirname(__FILE__) .'/bot-defs.txt';
    if(is_array($params)) {
      # verbosity is read first so that the diagnostics below honour it
      if(isset($params['verbose'])) $this->_verbose = $params['verbose'];
      if(isset($params['tableprefix'])) $this->table_prefix = $params['tableprefix'];
      if(isset($params['dbobject']) && is_object($params['dbobject'])) {
        $this->_dbobject = $params['dbobject'];
        $this->_dbobjclass = strtolower( get_class($this->_dbobject) );
        if($this->_verbose) echo "passed to Constructor db object ".get_class($this->_dbobject).'<br />';
      }
      if(isset($params['searchmode'])) $this->searchmode = $params['searchmode'];

      if(!empty($params['sourcefile'])) {
        $this->_botdefs_file = $params['sourcefile'];
      }
      if(isset($params['worktime'])) {
        if(is_string($params['worktime'])) {
          # split() was removed in PHP 7, preg_split() is the replacement
          $this->_worktime = array_map('trim', preg_split('/[-,;]/', $params['worktime']));
        }
        elseif(is_array($params['worktime'])) {
          $this->_worktime = $params['worktime'];
        }
        else {
          $this->_worktime = array();
        }
      }
    }
    if(!is_object($this->_dbobject) && file_exists($this->_botdefs_file)) {
      $this->LoadBotDefinitionsFile();
    }
  }

  /**
  * Sets IP and/or user agent string for emulating specific bot
  *
  * @param mixed $ip IP-address
  * @param mixed $agent User-Agent string
  */
  public function EmulateBot($ip='', $agent='') {
    $this->_debugip = $ip;
    $this->_debugagent = $agent;
    $this->_result = null;
  }

  /**
  * Changes the search mode and drops the cached recognition result.
  *
  * @param int $mode one of the SEARCH_* constants
  */
  public function SetSearchMode($mode) {
    $this->searchmode = $mode;
    $this->_result = null; # retry recognition
  }

  /**
  * Tries to recognize search/spider bot.
  * The verdict is cached until the emulated IP/UA, the search mode or the
  * definition list changes.
  *
  * @param string $ua User-Agent to test (empty - take it from the request)
  * @param string $ip IP address to test (empty - take it from the request)
  * @return mixed bot name, UNDEFINED_BOT (looks like a bot, but unknown) or false (not a bot)
  */
  public function GetBotId($ua='',$ip='') {
    if(!empty($ua) || !empty($ip)) {
      $this->_result = null; # explicit UA/IP passed: forget the cached verdict
      $this->_debugagent = $ua;
      $this->_debugip = $ip;
    }
    if($this->_result!==null) return $this->_result;

    # new recognition: do not inherit attributes of the previously found bot
    $this->bottype = 0;
    $this->bot_malicious = false;
    $retcode = false;

    if($this->_debugip) $ipaddr = $this->_debugip;
    else                $ipaddr = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';
    if($this->_debugagent) $usragent = $this->_debugagent;
    else                   $usragent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

    if(empty($ipaddr)) return ($this->_result=false); # running from CRON or what else, not from client HTTP request
    if($this->_verbose) {
      $dbname = is_object($this->_dbobject) ? get_class($this->_dbobject) : 'none';
      # User-Agent is client-controlled, so it is HTML-escaped before printing
      echo "-- CBotRecognizer::GetBotId/KT1 for ip=",htmlspecialchars($ipaddr, ENT_QUOTES),
        ", user-agent: ",htmlspecialchars($usragent, ENT_QUOTES),", dbobject : ",$dbname,"<br />";
    }

    # get integer IP addr representation from "xxx.xxx.xxx.xxx" (always a digits-only string)
    $ip_x32 = self::FromIpToX32($ipaddr);

    if(is_object($this->_dbobject)) { #<3-engine>
      # using SQL engine:
      if($this->_verbose) echo "searching in DB,  by {$this->_dbobjclass}...<br />"; # debug printing
      $cond = array();
      if($this->searchmode <= self::SEARCH_IP_OR_AGENT) {
        $cond[] = "(($ip_x32) BETWEEN ipfrom AND ipto)";
      }
      if($this->searchmode >= self::SEARCH_IP_OR_AGENT) {
        # User-Agent is untrusted input and must be quoted before it reaches SQL
        $cond[] = "(useragent<>'' AND (INSTR(".$this->_sqlQuote($usragent).", useragent)>0))";
      }
      $strcond = implode(' OR ',$cond);
      $query = "SELECT botid,bottype,malicious FROM {$this->table_prefix}bot_definitions WHERE $strcond LIMIT 1";
      if($this->_dbobjclass=='cdbengine') {
        $result = $this->_dbobject->sql_query($query,1,0,0);
        if(!empty($result[0])) {
          $retcode = $result[0];
          $this->bottype = $result[1];
          $this->bot_malicious = $result[2];
        }
      }
      else { # any other object is treated as a Zend_Db adapter
        $this->_dbobject->setFetchMode(Zend_Db::FETCH_OBJ);
        # second argument is the bind list; the query has no placeholders
        $result = $this->_dbobject->fetchRow($query, array());
        if(isset($result->botid)) {
          $retcode = $result->botid;
          $this->bottype = $result->bottype;
          $this->bot_malicious = $result->malicious;
        }
      }
      if($this->_verbose>1) { echo "--GetBotId in SQL search result: $retcode, type:[{$this->bottype}], malicious: [{$this->bot_malicious}]<br />";}
    } #<3-engine>
    elseif(count($this->_botdata)) { #<3-engine>
      if($this->_verbose>1) {
        echo "using file engine<br />";
      }
      foreach($this->_botdata as $dta) {
        $b_bot = false;
        if($this->searchmode <= self::SEARCH_IP_OR_AGENT) {
          $b_bot = ($ip_x32>=$dta[1]) && ($ip_x32<=$dta[2]);
        }

        if(!$b_bot && $this->searchmode >= self::SEARCH_IP_OR_AGENT) {
          if($dta[3]!='') $b_bot = (stripos($usragent,$dta[3])!==false);
        }
        if($b_bot) {
          $retcode = $dta[0]; # short bot name
          $this->bottype  = isset($dta[4])? $dta[4] : 0; # bot type
          $this->bot_malicious = isset($dta[5])? $dta[5] : 0; # malicious or not
          break;
        }
      }
      if($this->_verbose>1) { echo "--GetBotId in filemode search result: ($retcode), type:[{$this->bottype}], malicious:[{$this->bot_malicious}]<br />";}
    } #<3-engine>

    # last resort: if one of special words found, return UNDEFINED_BOT (undefined bot):
    if(!$retcode) {
      foreach($this->botwords as $botword) {
        if(stripos($usragent,$botword)!==false) {
          $retcode = self::UNDEFINED_BOT;
          break;
        }
      }
    }

    $this->_result = $retcode; # remember the verdict for IsMaliciousBot()/GetBotType()/Dispatch()
    return $retcode;
  }

  /**
  * (Re)creates SQL table(s) for bot definitions. Existing data is dropped.
  *
  * @return bool false when no DB object is configured, true otherwise
  */
  public function CreateBotDefTable() {
    if(!is_object($this->_dbobject)) return false;
    # IF EXISTS: a plain DROP fails on the very first run, when there is no table yet
    $this->_execSql("DROP TABLE IF EXISTS {$this->table_prefix}bot_definitions");
    $this->_execSql("CREATE TABLE {$this->table_prefix}bot_definitions (
      recid INT(20) NOT NULL AUTO_INCREMENT,
      botid CHAR(60) NOT NULL DEFAULT '',
      ipfrom INT UNSIGNED NOT NULL DEFAULT 0,
      ipto   INT UNSIGNED NOT NULL DEFAULT 0,
      useragent CHAR(60) NOT NULL DEFAULT '',
      bottype INT(4) default 0,
      malicious INT(1) default 0,
      PRIMARY KEY(recid), KEY ix_ipfrom(ipfrom), KEY ix_ipto(ipto), KEY ix_useragent(useragent) )");
    if($this->register_suspects) {
      $this->_execSql("DROP TABLE IF EXISTS {$this->table_prefix}bot_logsuspect");
      $this->_execSql("CREATE TABLE {$this->table_prefix}bot_logsuspect (
      recid INT(20) NOT NULL AUTO_INCREMENT,
      botid CHAR(60) NOT NULL DEFAULT '',
      ipaddr INT UNSIGNED NOT NULL DEFAULT 0,
      hitcounter INT(10) DEFAULT 0,
      logstart DATETIME not null DEFAULT 0,
      PRIMARY KEY(recid), KEY ix_ipaddr(ipaddr) )");
    }
    if($this->_verbose) echo "CreateBotDefTable: table(s) for bots created<br />"; #debug
    return true;
  }

  /**
  * Loads bot definitions from delimited text file.
  * Line format: botid|ip_from|ip_to|useragent[|bottype[|malicious]]
  * Lines with fewer than 4 fields are skipped.
  *
  * @param string $srcfile file name (empty - use the file configured in constructor)
  * @param integer|boolean $clearexisting clean existing data or not (default-not)
  * @return int|false count of loaded definitions, false if nothing could be read
  */
  public function LoadBotDefinitionsFile($srcfile='', $clearexisting=false) {
    $mydir = dirname(__FILE__);
    if(empty($srcfile)) $srcfile = $this->_botdefs_file;
    if($clearexisting) {
      if(is_object($this->_dbobject)) { $this->CreateBotDefTable(); }
      else { $this->_botdata = array(); }
    }
    $lines = false;
    if(file_exists($srcfile)) $lines = @file($srcfile);
    elseif(file_exists("$mydir/$srcfile")) $lines = @file("$mydir/$srcfile"); # bot def's file may reside in this php class folder

    # file() returns false on failure, and $lines stays false when no file was found
    if(!is_array($lines) || count($lines)<1) return false;
    $this->addedcnt = 0;
    foreach($lines as $line) { #<2>
      $arr = explode('|', trim($line));
      if(count($arr)<4) continue;
      $botid = trim($arr[0]);
      $ip1 = self::FromIpToX32(trim($arr[1]));
      $ip2 = self::FromIpToX32(trim($arr[2]));
      $agent = trim($arr[3]);
      $bottype = isset($arr[4])? intval($arr[4]) : 0;
      $mal = isset($arr[5])? intval($arr[5]) : 0;
      $this->AddBotDefinition($botid,$ip1,$ip2,$agent, $bottype,$mal);
    } #<2>
    if($this->_verbose) echo "LoadBotDefinitionsFile($srcfile) loaded definitions : {$this->addedcnt}<br />";
    return $this->addedcnt;
  }

  /**
  * Imports bot definitions from a file or URL (iplists.com format).
  * Lines starting with "# UA " carry User-Agent strings, other "#" lines are
  * comments, remaining lines are IP addresses; a short address like "74.6.7"
  * stands for the whole range 74.6.7.0 - 74.6.7.255.
  * In DB mode the records previously stored for $bot_id are replaced.
  *
  * @param mixed $bot_id bot identifier
  * @param mixed $url source text file name or url (in iplists.com format)
  * @param mixed $file_type reserved
  * @param int $bottype bot type stored with every imported definition
  * @param int $malicious "malicious" flag stored with every imported definition
  * @return int|false count of added definitions, false if source can't be opened
  */
  public function ImportBotsFromUrl($bot_id, $url,$file_type=0,$bottype=0, $malicious=0) {
    $this->errormessage = '';
    $this->addedcnt = 0;
    $ipranges = $uas = array();

    # best effort only: on most installations this setting can't be changed at run time
    $canopen_url = ini_get('allow_url_fopen');
    if(!$canopen_url) @ini_set('allow_url_fopen','1');
    $fh = @fopen($url,'r');
    if(!$canopen_url) @ini_set('allow_url_fopen',(string)$canopen_url); # return to "fopen-no-url" mode
    if(!$fh) {
      $this->errormessage = 'Error opening URL or file : '.$url;
      return false;
    }

    # loop on fgets() result: feof() alone never ends the loop if a read fails
    while(($line = fgets($fh)) !== false) {
      $line = trim($line);
      if($line === '') continue;
      if(substr($line,0,1)=='#') { # comment or # "UA ..." - string with User Agent
        if(substr($line,0,5) == '# UA ') $uas[] = self::StrUndress(strtolower(substr($line,5)));
        continue;
      }
      if(!intval($line)) continue; # does not start with a number - not an IP line

      $missing = 4 - count(explode('.',$line));
      if($missing > 0) { # short ip like "74.6.7": make full IP range "74.6.7.0"-"74.6.7.255"
        $ipranges[] = array(
          self::FromIpToX32($line . str_repeat('.0',$missing)) + 0,
          self::FromIpToX32($line . str_repeat('.255',$missing)) + 0
        );
        continue;
      }

      # single address: try to find a range that can be "widened" by this ip
      $thisip = self::FromIpToX32($line) + 0;
      $widened = false;
      foreach($ipranges as $k_ip => $rng) {
        if($rng[1] == $thisip-1) {
          $ipranges[$k_ip][1] = $thisip;
          $widened = true;
          break;
        }
      }
      if(!$widened) $ipranges[] = array($thisip, $thisip);
    }
    fclose($fh);

    # merge overlapped, adjacent and nested IP ranges:
    $ip2 = self::_mergeRanges($ipranges);

    # in DB mode - clean from "old" records for this bot id before adding new list
    if(is_object($this->_dbobject) && $bot_id!='' && (count($ip2)>0 || count($uas)>0)) {
      $this->_execSql("DELETE FROM {$this->table_prefix}bot_definitions WHERE botid=".$this->_sqlQuote($bot_id));
    }

    # k-th IP range and k-th User-Agent share one record
    $total = max(count($ip2),count($uas));
    for($kk=0; $kk<$total; $kk++) {
      $ipfrom = isset($ip2[$kk][0])? $ip2[$kk][0]: 0;
      $ipto   = isset($ip2[$kk][1])? $ip2[$kk][1]: 0;
      $ua     = isset($uas[$kk])? $uas[$kk]: '';
      $this->AddBotDefinition($bot_id,$ipfrom,$ipto,$ua,$bottype,$malicious);
    }
    if($this->_verbose) {
      #debug:
      echo "<h4>$bot_id UA list from $url</h4>";
      foreach($uas as $oneua) echo "{$oneua}<br />";

      echo "<h4>$bot_id IP list from $url</h4>";
      foreach($ip2 as $ip) {
        $ip4 = self::FromX32ToIp($ip[0]);
        $ip4a = self::FromX32ToIp($ip[1]);
        echo "IP: $ip4 - $ip4a<br />";
      }
    }
    return $this->addedcnt;
  }

  /**
  * Adds one bot definition to the SQL table (DB mode) or to the internal array.
  *
  * @param mixed $botid bot id, or a ready array(botid, ipfrom, ipto, useragent, bottype, malicious)
  * @param int $ipfrom integer representation of "starting" IP address
  * @param int $ipto integer representation of "ending" IP address (empty - same as $ipfrom)
  * @param string $useragent User-Agent substring identifying the bot
  * @param int $bottype bot type
  * @param int $mal non-zero for malicious bot
  */
  public function AddBotDefinition($botid, $ipfrom, $ipto=0, $useragent='', $bottype=0, $mal=0) {
    if(is_array($botid)) {
      # unpack a ready-made definition, padding missing trailing fields with defaults
      $def = array_values($botid) + array('', 0, 0, '', 0, 0);
      list($botid, $ipfrom, $ipto, $useragent, $bottype, $mal) = $def;
    }
    if(empty($ipto)) $ipto = $ipfrom; # single address == range of one

    if(is_object($this->_dbobject)) { # bot defs in SQL table
      # every value is quoted or forced to a number: definitions come from external files/URLs
      $sql = "INSERT INTO {$this->table_prefix}bot_definitions (botid,ipfrom,ipto,useragent, bottype, malicious)".
        " VALUES (".$this->_sqlQuote($botid).",".self::_sqlUInt($ipfrom).",".self::_sqlUInt($ipto).",".
        $this->_sqlQuote($useragent).", ".intval($bottype).", ".intval($mal).")";
      $this->_execSql($sql);
    }
    else { # bot defs are in-memory
      $this->_botdata[] = array($botid, $ipfrom, $ipto, $useragent, $bottype, $mal);
    }
    $this->addedcnt++;
    $this->_result = null; # definition list changed - cached verdict may be stale
  }

  /**
  * Registers handler function that will be called if some specific bot(s)
  * recognized
  *
  * @param mixed $callbackfnc callback (empty value removes the handler)
  * @param mixed $botlist array or [|,;] delimited string with bot id list that will fire this func.
  */
  public function SetHandlerForBots($callbackfnc,$botlist) {
    if(!is_array($botlist)) $botlist = preg_split('/[|,;]/', (string)$botlist);
    foreach($botlist as $botid) {
      if(!empty($callbackfnc)) $this->_callbacks[$botid] = $callbackfnc;
      else unset($this->_callbacks[$botid]);
    }
  }

  /**
  * Sets handler function for some type(s) of bots
  *
  * @param mixed $callbackfnc callback (empty value removes the handler)
  * @param mixed $bottype - integer, [|,;] delimited string or array holding bot types that will be handled
  */
  public function SetHandlerForTypes($callbackfnc,$bottype) {
    if(!is_array($bottype)) $bottype = preg_split('/[|,;]/', (string)$bottype);
    foreach($bottype as $onetype) {
      if(!empty($callbackfnc)) $this->_type_callbacks[$onetype] = $callbackfnc;
      else unset($this->_type_callbacks[$onetype]);
    }
  }

  /**
  * Sets handler function for all "malicious" bots
  *
  * @param string $funcname existing function name
  */
  public function SetMaliciousHandler($funcname) {
    $this->malicious_handler = $funcname;
  }

  /**
  * Dispatch() method tries to recognize the bot and runs respective callback function.
  * Priority: malicious handler, then handler for the bot id, then handler for
  * the bot type. Nothing is called outside of the configured work time or
  * when the request does not come from a bot.
  */
  public function Dispatch() {
    if($this->_worktime && count($this->_worktime)>=2 && !empty($this->_worktime[1])) {
      $curtm = date('H:i'); # if current time out of working interval, don't dispatch
      $from = $this->_worktime[0];
      $to   = $this->_worktime[1];
      if($from < $to && ($curtm < $from || $curtm > $to)) return;
      if($from > $to && ($curtm < $from && $curtm > $to)) return; # interval passing over midnight
    }
    $botid = $this->GetBotId();
    if($botid === false) return; # not a bot: a handler for type 0 must not fire for ordinary visitors

    if(($this->bot_malicious) && !empty($this->malicious_handler)
      && is_callable($this->malicious_handler)) {
      call_user_func($this->malicious_handler);
    }
    elseif(isset($this->_callbacks[$botid]) && is_callable($this->_callbacks[$botid])) {
      call_user_func($this->_callbacks[$botid]);
    }
    elseif(isset($this->_type_callbacks[$this->bottype]) && is_callable($this->_type_callbacks[$this->bottype])) {
      call_user_func($this->_type_callbacks[$this->bottype]);
    }
  }

  /**
  * @return mixed "malicious" flag of the recognized bot (false/0 if none)
  */
  public function IsMaliciousBot() {
    if($this->_result===null) $this->GetBotId();
    return $this->bot_malicious;
  }

  /**
  * @return int type of the recognized bot (0 if unknown)
  */
  public function GetBotType() {
    if($this->_result===null) $this->GetBotId();
    return $this->bottype;
  }

  /**
  * @return string message of the last import error
  */
  public function GetErrorMessage() { return $this->errormessage; }

  /**
  * Converts octet-notation IP addr to integer
  *
  * @param mixed $ipaddr "a.b.c.d" string (missing octets count as 0) or already converted number
  * @return string unsigned integer as a string of digits
  */
  public static function FromIpToX32($ipaddr) {
    $iparr = explode('.', (string)$ipaddr);
    if(count($iparr)<2) return sprintf('%u', $ipaddr);
    $iparr = array_pad($iparr, 4, 0);
    # explicit int casts: non-numeric octets must not break the shift operator
    $ip_x32 = (((int)$iparr[0])<<24) + (((int)$iparr[1])<<16) + (((int)$iparr[2])<<8) + (int)$iparr[3];
    return sprintf('%u', $ip_x32);
  }

  /**
  * Converts integer IP representation back to octet notation
  *
  * @param mixed $ipx32 unsigned integer (number or numeric string)
  * @return string "a.b.c.d"
  */
  public static function FromX32ToIp($ipx32) {
    $ipVal = (float)$ipx32;
    # high octet is taken by division: the full value may not fit into a signed 32-bit int
    $high = (int)floor($ipVal/0x1000000);
    $rest = (int)($ipVal - $high*0x1000000);
    return implode('.', array($high, ($rest & 0xFF0000) >> 16, ($rest & 0xFF00) >> 8, $rest & 0xFF));
  }

  /**
  * "Undresses" string, deleting starting & ending quotes or apostrophes if both exist.
  * The value is returned unescaped; SQL quoting is applied where statements are built.
  *
  * @param string $par
  * @return string
  */
  public static function StrUndress($par) {
    $par = (string)$par;
    $len = strlen($par);
    if($len >= 2) {
      $first = $par[0];
      if(($first=='"' || $first=="'") && $par[$len-1]==$first) return substr($par,1,$len-2);
    }
    return $par;
  }

  /**
  * Runs a statement that returns no rows, on whichever DB wrapper is in use.
  *
  * @param string $sql
  * @return mixed result of the underlying call
  */
  private function _execSql($sql) {
    if($this->_dbobjclass == 'cdbengine') { # use CDBEngine wrapper
      return $this->_dbobject->sql_query($sql);
    }
    return $this->_dbobject->query($sql); # use Zend_Db...
  }

  /**
  * Returns value as a quoted SQL string literal.
  *
  * @param mixed $value
  * @return string literal including surrounding apostrophes
  */
  private function _sqlQuote($value) {
    $value = (string)$value;
    if(is_object($this->_dbobject) && $this->_dbobjclass != 'cdbengine'
      && method_exists($this->_dbobject, 'quote')) {
      return $this->_dbobject->quote($value); # adapter knows its own escaping rules
    }
    # NOTE(review): fallback uses MySQL backslash escaping - confirm for other servers / NO_BACKSLASH_ESCAPES mode
    $map = array("\\"=>"\\\\", "\0"=>"\\0", "\n"=>"\\n", "\r"=>"\\r", "'"=>"\\'", '"'=>'\\"', "\x1a"=>"\\Z");
    return "'" . strtr($value, $map) . "'";
  }

  /**
  * Returns value as a non-negative integer SQL literal (anything non-numeric becomes 0).
  *
  * @param mixed $value
  * @return string digits only
  */
  private static function _sqlUInt($value) {
    $num = is_numeric($value) ? (float)$value : 0.0;
    if($num < 0) $num = 0.0;
    return sprintf('%.0f', $num);
  }

  /**
  * Merges overlapped, adjacent and nested ranges.
  *
  * @param array $ranges list of array(from, to) pairs
  * @return array sorted list of non-touching array(from, to) pairs
  */
  private static function _mergeRanges($ranges) {
    sort($ranges); # orders by "from", then by "to"
    $merged = array();
    $last = -1;
    foreach($ranges as $rng) {
      # after sorting only the last merged range can touch the current one
      if($last >= 0 && $rng[0] <= $merged[$last][1] + 1) {
        if($rng[1] > $merged[$last][1]) $merged[$last][1] = $rng[1];
      }
      else {
        $merged[] = $rng;
        $last++;
      }
    }
    return $merged;
  }
} # CBotRecognizer definition end
# Return current item: Bot recognizer and dispatcher