<?php
//----------------------------------------------------------------------
// CrawlTrack 3.2.4
//----------------------------------------------------------------------
// Crawler Tracker for website
//----------------------------------------------------------------------
// Author: Jean-Denis Brun
//----------------------------------------------------------------------
// Website: www.crawltrack.net
//----------------------------------------------------------------------
// That script is distributed under GNU GPL license
//----------------------------------------------------------------------
// file: cleaning-crawler-entry.php
//----------------------------------------------------------------------
// Last update: 16/06/2010
//----------------------------------------------------------------------
if (!defined('IN_CRAWLT'))
{
echo"<h1>Hacking attempt !!!!</h1>";
exit();
}
//maximum number of element for query to avoid time out error; can be adapted according server
$maxlimit= 10000;
//initialize array
$testunique=array();
$table=array();
$date=array();
$idtosuppress=array();
$listbadip=array();
$timeperpage=array();
$nbvisits=array();
//date for the mysql query
if($period>=10)
{
$datetolookfor=" date >'".crawlt_sql_quote($daterequest)."'
AND date <'".crawlt_sql_quote($daterequest2)."'";
}
else
{
$datetolookfor=" date >'".crawlt_sql_quote($daterequest)."'";
}
//do the cleaning only for a one day period and one time per session
//------------------------------------------------------------------------------------------------------
if(!isset($_SESSION['flag']))
{
session_name('crawlt');
session_start();
$_SESSION['flag'] = true;
}
if(!isset($_SESSION['cleaning']))
{
$_SESSION['cleaning']=0;
}
if((($period==0) OR ($period>=1000)) && $_SESSION['cleaning']==0)
{
//database connection
$connexion = mysql_connect($crawlthost,$crawltuser,$crawltpassword) or die("MySQL connection to database problem");
$selection = mysql_select_db($crawltdb) or die("MySQL database selection problem");
/*cleaning of the crawlt_visits_human table
to suppress bot using IE6 user agent with several different IP, cleaning done per site
*/
$sqlcleaning = "SELECT id_visit,crawlt_site_id_site,crawlt_ip, crawlt_browser
FROM crawlt_visits_human
WHERE $datetolookfor
LIMIT 0,".crawlt_sql_quote($maxlimit)."";
$requetecleaning = db_query($sqlcleaning, $connexion);
$visitstotal=mysql_num_rows($requetecleaning);
if($visitstotal>=1)
{
while ($ligne = mysql_fetch_row($requetecleaning))
{
$listsiteidforcleaning[$ligne[1]]=$ligne[1];
${$ligne[1].'listip'}[$ligne[2]]= $ligne[2];
if($ligne[3]==4)
{
${$ligne[1].'listipie6'}[$ligne[2]]= $ligne[2];
${$ligne[1].'idtosuppress'}[]=$ligne[0];
}
}
foreach($listsiteidforcleaning as $value)
{
if(isset(${$value.'listipie6'}))
{
//suppress IE6 visit if it's more than 50% of the total and if there is more than 10 IP using IE6
if(count(${$value.'listipie6'})/count(${$value.'listip'}) >0.5 && count(${$value.'listipie6'})>10)
{
$listidtosuppress=implode("','",${$value.'idtosuppress'});
$sqlsuppress = "DELETE FROM crawlt_visits_human WHERE id_visit IN ('$listidtosuppress')";
$requetesuppress = db_query($sqlsuppress, $connexion);
}
}
}
}
/*cleaning of the crawlt_visits_human table
to suppress double entry (same search engine, same keyword, same site, same page view, with less than 5mn between visit)
since the last cleaning*/
$sqlcleaning = "SELECT id_visit,crawlt_site_id_site,keyword,crawlt_id_crawler, date, crawlt_id_page
FROM crawlt_visits_human
INNER JOIN crawlt_keyword
ON crawlt_visits_human.crawlt_keyword_id_keyword = crawlt_keyword.id_keyword
AND date >'".crawlt_sql_quote($datecleaning)."'
AND crawlt_id_crawler IN ('1,2,3,4')
LIMIT 0,".crawlt_sql_quote($maxlimit)."";
$requetecleaning = db_query($sqlcleaning, $connexion);
$visitstotal=mysql_num_rows($requetecleaning);
if($visitstotal>=1)
{
while ($ligne = mysql_fetch_row($requetecleaning))
{
$testunique[]=$ligne[1].urlencode($ligne[2]).$ligne[3].$ligne[5];
$table[]=$ligne[0];
$date[]= strtotime($ligne[4]);
}
$testnodouble = array_unique($testunique);
$testdouble= array_diff_assoc($testunique,$testnodouble);
$somethingtosuppress=0;
foreach($testdouble as $i=>$value)
{
foreach($testnodouble as $j=>$value2)
{
if($testunique[$i]==$testunique[$j] && abs($date[$i]-$date[$j])<300)
{
$idtosuppress[]=$table[$i];
$somethingtosuppress=1;
}
}
}
if($somethingtosuppress==1)
{
//request to suppress double entry in the visit table
$listidtosuppress=implode("','",$idtosuppress);
$sqlsuppress = "DELETE FROM crawlt_visits_human WHERE id_visit IN ('$listidtosuppress')";
$requetesuppress = db_query($sqlsuppress, $connexion);
}
}
//---------------------------------------------------------------------------------------------------
/*cleaning of the crawlt_visits_human table
to suppress double entry (same referer , same site, same page view, same IP with less than 60mn between visit)
since the last cleaning*/
//initialize array
$testunique=array();
$table=array();
$date=array();
$idtosuppress=array();
$sqlcleaning = "SELECT id_visit, crawlt_site_id_site, crawlt_id_referer, crawlt_id_crawler, date, crawlt_id_page,crawlt_ip, crawlt_browser
FROM crawlt_visits_human
INNER JOIN crawlt_keyword
ON crawlt_visits_human.crawlt_keyword_id_keyword = crawlt_keyword.id_keyword
AND date >'".crawlt_sql_quote($datecleaning)."'
AND crawlt_id_crawler ='0'
LIMIT 0,".crawlt_sql_quote($maxlimit)."";
$requetecleaning = db_query($sqlcleaning, $connexion);
$visitstotal=mysql_num_rows($requetecleaning);
if($visitstotal>=1)
{
while ($ligne = mysql_fetch_row($requetecleaning))
{
$testunique[]=$ligne[1].$ligne[2].$ligne[5].$ligne[6].$ligne[7];
$table[]=$ligne[0];
$date[]= strtotime($ligne[4]);
}
$testnodouble = array_unique($testunique);
$testdouble= array_diff_assoc($testunique,$testnodouble);
$somethingtosuppress=0;
foreach($testdouble as $i=>$value)
{
foreach($testnodouble as $j=>$value2)
{
if($testunique[$i]==$testunique[$j] && abs($date[$i]-$date[$j])<3600)
{
$idtosuppress[]=$table[$i];
$somethingtosuppress=1;
}
}
}
if($somethingtosuppress==1)
{
//request to suppress double entry in the visit table
$listidtosuppress=implode("','",$idtosuppress);
$sqlsuppress = "DELETE FROM crawlt_visits_human WHERE id_visit IN ('$listidtosuppress')";
$requetesuppress = db_query($sqlsuppress, $connexion);
}
}
//---------------------------------------------------------------------------------------------------
//query to detect visit coming from same IP range (good change to have an unknow crawler detected as a visitor)
//check done only on a one day period
$sql = "SELECT SUBSTRING_INDEX(crawlt_ip, '.', 3),crawlt_ip
FROM crawlt_visits_human
WHERE $datetolookfor
AND crawlt_ip !=''
LIMIT 0,".crawlt_sql_quote($maxlimit)."";
$requete= db_query($sql, $connexion);
$resultnumber=mysql_num_rows($requete);
if($resultnumber>=1)
{
while ($ligne = mysql_fetch_row($requete))
{
${'iprange'.$ligne[0]}[$ligne[1]] =$ligne[1];
$listiprange[$ligne[0]]=$ligne[0];
}
foreach($listiprange as $shortip)
{
if(count(${'iprange'.$shortip})>1)
{
$listbadip = $listbadip + ${'iprange'.$shortip};
}
}
}
//---------------------------------------------------------------------------------------------------
//query to detect IP with more than 3 pages viewed with less than 2 second per page or with more than 50 pages viewed (good change to have an unknow crawler detected as a visitor)
$sql = "SELECT crawlt_ip, COUNT(DISTINCT id_visit), MAX(date), MIN(date)
FROM crawlt_visits_human
WHERE $datetolookfor
AND crawlt_ip !=''
GROUP BY crawlt_ip
LIMIT 0,".crawlt_sql_quote($maxlimit)."";
$requete = db_query($sql, $connexion);
while ($ligne = mysql_fetch_row($requete))
{
$timeperpage[$ligne[0]]=(strtotime($ligne[2])-strtotime($ligne[3]))/$ligne[1];
$nbvisits[$ligne[0]]=$ligne[1];
if(($timeperpage[$ligne[0]]<2 && $nbvisits[$ligne[0]]>3) OR $nbvisits[$ligne[0]]>50)
{
$listbadip[$ligne[0]]=$ligne[0];
}
}
//---------------------------------------------------------------------------------------------------
//query to detect IP coming after a search engine query with more than 2 pages viewed with each time a new keyword (good change to have an unknow crawler detected as a visitor)
$sql = "SELECT crawlt_ip, COUNT(DISTINCT crawlt_keyword_id_keyword)
FROM crawlt_visits_human
WHERE $datetolookfor
AND crawlt_ip !=''
AND crawlt_id_crawler IN ('1,2,3,4,5')
GROUP BY crawlt_ip
LIMIT 0,".crawlt_sql_quote($maxlimit)."";
$requete = db_query($sql, $connexion);
while ($ligne = mysql_fetch_row($requete))
{
if($ligne[1]> 2)
{
$listbadip[$ligne[0]]=$ligne[0];
}
}
//---------------------------------------------------------------------------------------------------
//query to get the referer spammer site list
$sql = "SELECT referer FROM crawlt_badreferer";
$requete = mysql_query($sql, $connexion);
$nbrresult=mysql_num_rows($requete);
if($nbrresult>=1)
{
while ($ligne = mysql_fetch_row($requete))
{
$listspamreferer[]=$ligne[0];
}
}
else
{
$listspamreferer=array();
}
//include searchenginelist.php file to get the searchengine host to remove visit coming from them without query
if(!isset($crawltgooglelist))
{
include"include/searchenginelist.php";
}
$listspamreferer=array_merge($listspamreferer,$crawltgooglelist,$crawltmsnlist,$crawltyahoolist,$crawltasklist,$crawltexaleadlist);
//query to get visits coming from referer spammer sites and also to clean the referer table from internal referer
$visittosuppress=array();
$referertosuppress=array();
$sql = "SELECT referer, crawlt_id_referer,crawlt_ip
FROM crawlt_visits_human
INNER JOIN crawlt_referer
ON crawlt_visits_human.crawlt_id_referer=crawlt_referer.id_referer
AND $datetolookfor
AND crawlt_id_crawler=0
LIMIT 0,".crawlt_sql_quote($maxlimit)."";
$requete = db_query($sql, $connexion);
$nbrresult=mysql_num_rows($requete);
if($nbrresult>=1)
{
while ($ligne = mysql_fetch_row($requete))
{
$parseurl = @parse_url($ligne[0]);
if(isset($parseurl['host']))
{
if(in_array($parseurl['host'],$listspamreferer) && (!preg_match("/google/i", $parseurl['host']) OR (preg_match("/google/i", $parseurl['host']) && !preg_match("/imgres/i", $ligne[0]))))
{
$listbadip[$ligne[2]]=$ligne[2];
$referertosuppress[$ligne[1]]=$ligne[1];
}
elseif($parseurl['host']==$urlsite[$site]) //to remove from referer table all the internal referer
{
$referertosuppress[$ligne[1]]=$ligne[1];
}
}
}
}
//query to delete these referer from the crawlt_referer table
if(count($referertosuppress)>0)
{
$crawltreferertosuppress=implode("','",$referertosuppress);
$sql="DELETE FROM crawlt_referer
WHERE id_referer IN ('$crawltreferertosuppress')";
$requete= db_query($sql, $connexion);
}
//---------------------------------------------------------------------------------------------------
//query to delete these visits from the crawlt_human_visits table
if(count($listbadip)>0)
{
$crawltlistip=implode("','",$listbadip);
$sql="DELETE FROM crawlt_visits_human
WHERE crawlt_ip IN ('$crawltlistip')";
$requete= db_query($sql, $connexion);
}
//---------------------------------------------------------------------------------------------------
//update the crawlt_config table to enter the last cleaning date (now - 1.5 hour)
$datecleaning = date("Y-m-d H:i:s",(strtotime("now")-5400));
$sqlupdate ="UPDATE crawlt_config SET datelastcleaning='".crawlt_sql_quote($datecleaning)."'";
$requeteupdate = db_query($sqlupdate, $connexion);
$_SESSION['cleaning']=1;
}
?>