Location: PHPKode > scripts > Spider Engine > spider-engine/spiderengine.class.php
<?php
/**
* About author:
* Radu T.
* email: eaglebvro[@]yahoo[dot]com
*
* If you want to spider something, just email me and I'll help you.
*
* About class:
* SpiderEngine v.2 class for spidering any html page
*   -fetchData() - for reading the content of a html page
*   -processData() - for doing whatever you want to the results
* 
* 	-url - url to read from eg. http://www.home.com/page_no_<range[0]>.html
* 	-range - array for range of action on url eg. array(0=>array("start"=>1,"end"=>10,"step"=>1)) - that means: for(i=1;i<=10;i+=1)
* 	-pattern - the html text containing the pattern_definition and text		     
* 	-pattern_definition - array definition names eg. array("dummy","cat","subcat")
* 	-start - from where the spider reads the content of the page
* 	-end - array with "to_process" and "not_to_process" lists of marker texts: if a text from "to_process" is found in the page content, the data is spidered and processData() is called; if a text from "not_to_process" is found in the page content, only a message is shown
* 	
* 	-pattern definition example: {p[abc]}, {p[1]},{p[#]},{p[no.1]} etc.
*   -pattern can be found in the same page multiple times
*/

class SpiderEngine {

    // URL to fetch; may contain {range[N]} placeholders that fetchData() replaces.
    var $url="";
    // Map of range index => array("start"=>..,"end"=>..,"step"=>..) iterated by fetchData().
    var $range=array();
    // Pattern text mixing literal delimiters with {p[name]} placeholders.
    var $pattern="";
    // Extracted values, filled by fetchData() / fetchDataFromContent().
    var $pattern_matches=array();
    // Placeholder names that may appear inside $pattern.
    var $pattern_definition=array();
    // Left delimiter handed to textBetween() when cutting the page content.
    var $start="";
    // Array read by fetchData() under the keys "to_process" and "not_to_process".
    var $end=array();
    // Fetch method passed to open_external_url(): "curl", "fopen" or "fgc".
    var $openType;
    // Path of the log file written by appendToLog().
    var $pathToLogFile="";
    // Name of the class fetchData() instantiates to restart after a failed fetch.
    var $recursiveClassName="";
    // Truthy => fetched content is passed through utf8_encode(); set by the constructor.
    var $utf8_encode="";
    // Truthy => fetched content is passed through utf8_decode(); set by the constructor.
    var $utf8_decode="";
                	
/**
 * Constructor.
 *
 * Enables full error display, prints the opening HTML of the report page and
 * stores the configuration on the instance.
 *
 * @param string $openType      fetch method for open_external_url(): "curl", "fopen" or "fgc"
 * @param string $pathToLogFile path of the log file used by appendToLog()
 * @param bool   $utf8_encode   pass fetched content through utf8_encode()
 * @param bool   $utf8_decode   pass fetched content through utf8_decode()
 */
function __construct($openType="fgc",$pathToLogFile='./logfile.log',$utf8_encode=false,$utf8_decode=false) {

	error_reporting( E_ALL );
	ini_set( 'display_errors' , true );
	//set_time_limit( 3600 * 24 );

	print('<html><meta http-equiv="content-type" content="text/html; charset=iso-8859-1"><head><title>SPIDER ENGINE @eaglebvro</title></head><body>');

	$this->openType=$openType;
	$this->pathToLogFile=$pathToLogFile;
	$this->utf8_encode=$utf8_encode;
	$this->utf8_decode=$utf8_decode;

}

/**
 * Legacy PHP 4-style constructor name.
 *
 * A method named after its class is no longer treated as a constructor on
 * PHP 8, so the real work lives in __construct(). This method is kept so that
 * existing code calling parent::SpiderEngine(...) or $this->SpiderEngine(...)
 * keeps working.
 */
function SpiderEngine ($openType="fgc",$pathToLogFile='./logfile.log',$utf8_encode=false,$utf8_decode=false) {
	$this->__construct($openType,$pathToLogFile,$utf8_encode,$utf8_decode);
}

/**
 * Emits a small script that scrolls the browser window to the bottom, so the
 * most recent progress message stays visible.
 */
function scrollDown()
{
	$snippet = '<script> window.scrollBy(0,1000000); </script>';
	echo $snippet;
}
          
/**
 * Downloads the content of a URL (or any path PHP's stream wrappers can open).
 *
 * @param string $url    address to read
 * @param string $method "curl", "fopen" or "fgc" (file_get_contents); case-insensitive
 * @return string|false  the content; false when the resource could not be
 *                       opened/read (for "fgc" also when the content is empty
 *                       or "0"); '' for an unknown method
 */
function open_external_url($url, $method = "fgc")
{
	//sleep(1);
	$method = strtolower($method);

	if ($method === "curl")
	{
		$curl = curl_init();
		if ($curl === false)
		{
			return false;
		}

		// Setup headers - I used the same headers from Firefox version 2.0.0.6
		$header = array(
			"Accept: text/xml,application/xml,application/xhtml+xml,"
				. "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
			"Cache-Control: max-age=0",
			"Connection: keep-alive",
			"Keep-Alive: 300",
			"Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
			"Accept-Language: en-us,en;q=0.7,ro;q=0.3",
			"Pragma: ", // browsers keep this blank.
		);

		curl_setopt($curl, CURLOPT_URL, $url);
		curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
		curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
		curl_setopt($curl, CURLOPT_REFERER, 'http://www.google.com');
		curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
		curl_setopt($curl, CURLOPT_AUTOREFERER, true);
		curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($curl, CURLOPT_TIMEOUT, 10);

		$html = curl_exec($curl); // false on failure, body otherwise
		curl_close($curl);

		return $html;
	}

	if ($method === "fopen")
	{
		$file = fopen($url, "r");
		if (!$file)
		{
			return false;
		}

		$data = '';
		while (!feof($file))
		{
			$line = fgets($file, 4096);
			if ($line === false)
			{
				// Read error or timeout: feof() may never become true on a
				// broken stream, so stop instead of spinning forever.
				break;
			}
			$data .= $line;
		}
		fclose($file);

		return $data;
	}

	if ($method === "fgc")
	{
		$data = file_get_contents($url);

		return $data ? $data : false;
	}

	// Unknown method: nothing was fetched.
	return '';
}

/**
 * Returns the URL-encoded last path segment of $source, i.e. everything after
 * the final "/" passed through rawurlencode().
 */
function imageNameFromUrl($source)
{
	$segments = explode('/', $source);
	$lastSegment = end($segments);

	return rawurlencode($lastSegment);
}

/**
 * Downloads $source and writes it to $destination.
 *
 * The last path segment of $source is URL-encoded before fetching so that
 * file names containing spaces and similar characters can be requested.
 *
 * @param string $source      URL or path of the file to download
 * @param string $destination local path to write to
 * @return int|false number of bytes written, or false when the source could
 *                   not be read or the destination could not be opened
 */
function fetchImage($source,$destination)
{
	$name = basename($source);
	if ($name !== '')
	{
		// Encode the LAST occurrence of the file name; searching from the
		// front would hit an identical host or directory name first
		// (e.g. "http://img.com/img") and truncate the URL.
		$pos = strrpos($source, $name);
		if ($pos !== false)
		{
			$source = substr_replace($source, rawurlencode($name), $pos);
		}
	}

	$data = file_get_contents($source);
	if ($data === false)
	{
		// Do not create an empty destination file for a failed download.
		return false;
	}

	$file = fopen($destination, "w+");
	if (!$file)
	{
		return false;
	}

	$r = fputs($file, $data);
	fclose($file);

	return $r;
}

/**
 * Removes the given HTML tags from a string.
 *
 * @param string $str          markup to clean
 * @param string $tags         tags to remove, written as markup, e.g. "<b><script>"
 * @param bool   $stripContent when true, also remove everything between an
 *                             opening tag and its closing tag
 * @return string the cleaned markup
 */
function strip_selected_tags($str, $tags = "", $stripContent = false)
{
	preg_match_all("/<([^>]+)>/i",$tags,$allTags,PREG_PATTERN_ORDER);
	foreach ($allTags[1] as $tag){
		// Quote the name so regex metacharacters in it cannot alter the pattern.
		$name = preg_quote($tag, '/');
		// The tag name must be followed by whitespace, "/" or ">"; otherwise
		// stripping "<b>" would also remove "<br>" and "<body>".
		$boundary = '(?=[\s\/>])';

		if ($stripContent) {
			// "s" lets "." match newlines so multi-line content is removed too.
			$str = preg_replace('/<'.$name.$boundary.'[^>]*>.*?<\/'.$name.'>/is',"",$str);
		}
		$str = preg_replace('/<\/?'.$name.$boundary.'[^>]*>/i',"",$str);
	}
	return $str;
}

/**
 * Spiders the configured URL, or every URL produced by the configured ranges,
 * and stores the extracted values in $this->pattern_matches.
 *
 * With ranges, each range whose "{range[N]}" placeholder occurs exactly once
 * in $this->url is expanded from "start" to "end" in increments of "step".
 * When a page cannot be opened in range mode, the spider waits, then restarts
 * from that page in a new object and this call returns. Without ranges a
 * failed fetch is only reported.
 *
 * Progress messages are printed as HTML and appended to the log file.
 *
 * @param bool $process when true, processData() is called after each page
 * @return void
 */
function fetchData($process=true)
{
	$segments=$this->_fetchDataParsePattern();

	if(count($this->range)>0)
	{
		foreach (array_keys($this->range) as $k)
		{
			// A range is only expanded when its placeholder occurs exactly once in the URL.
			if(substr_count($this->url,'{range['.$k.']}')!=1)
			{
				continue;
			}

			for ($i=$this->range[$k]['start'];$i<=$this->range[$k]['end'];$i+=$this->range[$k]['step'])
			{
				//$i is the page
				$url=str_replace('{range['.$k.']}',$i,$this->url);

				if($this->_fetchDataPage($url,$segments,$process,true))
				{
					continue;
				}

				print('<span style="color:#ff6600;">Url: '.$url.' couldn\'t be opened!</span><br>');
				$this->appendToLog('Url: '.$url.' couldn\'t be opened!');
				sleep(20);
				print('<span style="color:#ff6600;">The spider will be restarted from url: '.$url.' !</span><br>');
				$this->appendToLog('The spider will be restarted from url: '.$url.' !');

				// setSpiderConfigRangeStart() is not defined in this class; it is
				// only called when a subclass provides it.
				if(method_exists($this,'setSpiderConfigRangeStart'))
				{
					$this->setSpiderConfigRangeStart($i);
				}

				if($this->_fetchDataRestart($k,$i,$process))
				{
					// The new object has taken over from page $i onwards (and
					// printed the page footer); carrying on here would
					// process the remaining pages a second time.
					return;
				}
			}
		}
	}
	else
	{
		$url=$this->url;

		if(!$this->_fetchDataPage($url,$segments,$process,false))
		{
			print('<span style="color:#ff6600;">Url: '.$url.' couldn\'t be opened!</span><br>');
			$this->appendToLog('Url: '.$url.' couldn\'t be opened!');
			sleep(5);
		}
	}

	print('</body></html>');
}

/**
 * Splits $this->pattern into segments for fetchData().
 *
 * @return array list of array('text' => literal delimiter, 'pd' => name of the
 *               placeholder that follows it, or "dummy" when there is none)
 */
function _fetchDataParsePattern()
{
	$segments=array();

	foreach (explode(']}',$this->pattern) as $idx=>$chunk)
	{
		// Strip the "{p[name" opener of every known placeholder, leaving the literal text.
		$text=$chunk;
		foreach ($this->pattern_definition as $pd)
		{
			$text=str_replace('{p['.$pd,'',$text);
		}

		$parts=explode('{p[',$chunk);

		$segments[$idx]=array(
			'text'=>$text,
			'pd'=>isset($parts[1])?$parts[1]:"dummy"
		);
	}

	return $segments;
}

/**
 * Fetches one page and, unless an exclusion marker is found in it, extracts
 * the pattern matches for every "to_process" marker that occurs exactly once.
 *
 * @param string $url      page to fetch
 * @param array  $segments result of _fetchDataParsePattern()
 * @param bool   $process  call processData() after extraction
 * @param bool   $keyByUrl true in range mode: matches are stored under the URL
 *                         and cleared after processData(); false: matches are
 *                         stored under a running index
 * @return bool false when the page could not be opened, true otherwise
 */
function _fetchDataPage($url,$segments,$process,$keyByUrl)
{
	$t=new Timer();	// starts the timer for this page

	print('<span style="color:#005fa2;">Url: '.$url.' processing!</span><br>');
	$this->scrollDown();
	$this->appendToLog('Url: '.$url.' processing!');

	$content_temp=$this->open_external_url($url,$this->openType);

	if(!$content_temp)
	{
		return false;
	}

	// The page is skipped when ANY exclusion marker is present. Starting from
	// "process" also covers an empty exclusion list.
	$next=true;
	$excluded=isset($this->end['not_to_process'])?(array)$this->end['not_to_process']:array();
	foreach ($excluded as $ntp)
	{
		if($ntp!='' && substr_count($content_temp,$ntp)>0)
		{
			print('<span style="color:#ff6600;">Url: '.$url.' wasn\'t processed because "'.$ntp.'" was found in content !</span><br>');
			$this->scrollDown();
			$this->appendToLog('Url: '.$url.' wasn\'t processed because "'.$ntp.'" was found in content !');
			$next=false;
		}
	}

	if(!$next)
	{
		return true;
	}

	$markers=isset($this->end['to_process'])?(array)$this->end['to_process']:array();
	foreach ($markers as $ta)
	{
		// An empty marker can never match (and substr_count() rejects it on PHP 8).
		if((string)$ta==='' || substr_count($content_temp,$ta)!=1)
		{
			continue;
		}

		// Keep only the content between start and this end marker. Later
		// markers are tested against the narrowed content.
		$content_temp=$this->textBetween($this->start,$ta,$content_temp);

		if($this->utf8_encode)
		{
			$content_temp=utf8_encode($content_temp);
		}

		if($this->utf8_decode)
		{
			$content_temp=utf8_decode($content_temp);
		}

		$this->_fetchDataExtract($content_temp,$segments,$url,$keyByUrl);

		if($process)
		{
			$this->processData();
			if($keyByUrl)
			{
				// Range mode starts every page with a clean result set.
				$this->pattern_matches=array();
			}
		}

		$this_time=$t->getTTMS();
		print('<span style="color:#ff6600;">Url: '.$url.' has been processed in '.$this_time.' !</span><br>');
		$this->scrollDown();
		$this->appendToLog('Url: '.$url.' has been processed in '.$this_time.' !');
	}

	return true;
}

/**
 * Repeatedly applies the pattern segments to $content and records every
 * value found between consecutive delimiters in $this->pattern_matches.
 *
 * NOTE(review): the scan offset is derived from the length of the text
 * consumed so far, not from the real match position, so content preceding
 * the first delimiter can make the same item match more than once - confirm
 * whether callers depend on this before changing it.
 */
function _fetchDataExtract($content,$segments,$url,$keyByUrl)
{
	$last=count($segments)-2;
	if($last<0)
	{
		// No placeholder in the pattern: nothing to extract, and the scan
		// below would never advance.
		return;
	}

	$rest='';
	$kpm=0;

	do
	{
		// Range mode keeps one entry per URL (later matches overwrite earlier
		// ones); single-page mode keeps one entry per match.
		$key=$keyByUrl?$url:$kpm;

		for($j=0;$j<=$last;$j++)
		{
			$match=$this->textBetween($segments[$j]['text'],$segments[$j+1]['text'],substr($content,strlen($rest)));
			$this->pattern_matches[$key][$segments[$j]['pd']]=$match;
			$rest.=$segments[$j]['text'].$match;
		}

		$kpm++;
	}
	while ($segments[0]['text']!=='' && substr_count(substr($content,strlen($rest)),$segments[0]['text'])>0);
}

/**
 * Restarts the spider from page $i of range $k in a fresh object.
 *
 * The class named by $this->recursiveClassName is used; when that is empty
 * the class of the current object is used instead.
 *
 * NOTE(review): every failed fetch nests one more fetchData() call, so a URL
 * that never recovers recurses without bound.
 *
 * @return bool true when the restart ran, false when the class does not exist
 */
function _fetchDataRestart($k,$i,$process)
{
	$class=($this->recursiveClassName!='')?$this->recursiveClassName:get_class($this);
	if(!class_exists($class))
	{
		return false;
	}

	// Instantiated by name instead of eval(), so quotes or "$" in the
	// arguments cannot break or inject code.
	$obj_new=new $class($this->openType,$this->pathToLogFile);

	$obj_new->url=$this->url;
	$obj_new->recursiveClassName=$this->recursiveClassName;
	$obj_new->start=$this->start;
	$obj_new->end=$this->end;
	$obj_new->pattern=$this->pattern;
	$obj_new->pattern_definition=$this->pattern_definition;
	$obj_new->utf8_encode=$this->utf8_encode;
	$obj_new->utf8_decode=$this->utf8_decode;
	$obj_new->range=$this->range;
	$obj_new->range[$k]['start']=$i;
	$obj_new->fetchData($process);

	return true;
}

/**
 * Applies the configured pattern to already-downloaded content.
 *
 * Matches are stored in $this->pattern_matches under a running index
 * (0, 1, 2, ...) and keyed by placeholder name. Entries left by earlier calls
 * are overwritten index by index, not cleared.
 *
 * @param string $content_temp content to scan
 * @return array|null $this->pattern_matches, or null when the content is empty
 */
function fetchDataFromContent($content_temp)
{
	//make an array of text and pattern def
	$array=array();
	foreach (explode(']}',$this->pattern) as $ak=>$av)
	{
		// Strip the "{p[name" opener of every known placeholder, leaving the literal text.
		$text=$av;
		foreach($this->pattern_definition as $pd)
		{
			$text=str_replace('{p['.$pd,'',$text);
		}

		$parts=explode('{p[',$av);

		$array[$ak]=array(
			'text'=>$text,
			'pd'=>isset($parts[1])?$parts[1]:"dummy"
		);
	}
	//end_make an array of text and pattern def

	if($this->utf8_encode)
	{
		$content_temp=utf8_encode($content_temp);
	}

	if(!$content_temp)
	{
		return null;
	}

	$last=count($array)-2;
	if($last<0)
	{
		// The pattern has no placeholder: nothing to extract, and the scan
		// below would never advance past the first delimiter.
		return $this->pattern_matches;
	}

	$kpm=0;
	$rest='';

	do
	{
		for($j=0;$j<=$last;$j++)
		{
			$match=$this->textBetween($array[$j]['text'],$array[$j+1]['text'],substr($content_temp,strlen($rest)));
			$this->pattern_matches[$kpm][$array[$j]['pd']]=$match;
			$rest.=$array[$j]['text'].$match;
		}

		$kpm++;
	}
	// An empty first delimiter cannot be searched for (substr_count() throws
	// on an empty needle in PHP 8), so the scan stops after one pass.
	while ($array[0]['text']!=='' && substr_count(substr($content_temp,strlen($rest)),$array[0]['text'])>0);

	return $this->pattern_matches;
}

/**
 * Renders an array as PHP source text, e.g. array("a"=>"1","b"=>array("c"=>"d")).
 *
 * Keys and scalar values are always wrapped in double quotes; nested arrays
 * are rendered recursively.
 *
 * NOTE(review): keys and values are inserted verbatim, so a '"', '\' or '$'
 * in the data yields text that is not valid or safe PHP source - confirm how
 * the result is consumed before adding escaping.
 *
 * @param array $array array to render
 * @return string
 */
function arrayToString($array)
{
	$items=array();

	foreach ($array as $key=>$value)
	{
		if (is_array($value))
		{
			// Recurse through the method; a bare arraytostring() call looks
			// for a global function that does not exist.
			$items[]='"'.$key.'"'."=>".$this->arrayToString($value);
		}
		else
		{
			$items[]="\"$key\"=>\"$value\"";
		}
	}

	// Joining the items puts a comma between every pair of entries, including
	// a scalar that follows a nested array.
	return "array(".implode(",",$items).")";
}

/**
 * Hook called by fetchData() once matches have been collected. This default
 * implementation just dumps $this->pattern_matches; override it to do
 * whatever you want with the matches.
 */
function processData()
{
	$dump = print_r($this->pattern_matches, true);
	echo $dump;
}

/**
 * Returns the part of $s located between the first occurrence of $s1 and the
 * next occurrence of $s2. Delimiters are matched case-insensitively; the
 * returned text keeps its original case.
 *
 * An empty $s1 means "from the beginning of $s"; an empty $s2 means "to the
 * end of $s". When a non-empty delimiter is not found, '' is returned.
 */
function textBetween($s1,$s2,$s){
  $leftDelim = strtolower($s1);
  $rightDelim = strtolower($s2);
  $haystack = strtolower($s);
  $leftLen = strlen($leftDelim);

  // Locate the left delimiter (position 0 when it is empty).
  $from = ($leftLen > 0) ? strpos($haystack, $leftDelim) : 0;
  if ($from === false) {
    return '';
  }
  $from += $leftLen;

  if ($rightDelim == '') {
    return substr($s, $from);
  }

  // Length of the wanted text = offset of the right delimiter in the remainder.
  $length = strpos(substr($haystack, $from), $rightDelim);
  if ($length === false) {
    return '';
  }

  return substr($s, $from, $length);
}

/**
 * Appends a timestamped line to the log file at $this->pathToLogFile.
 *
 * The file is created when it does not exist yet. When the file can be
 * neither created nor written, the script is terminated with die().
 *
 * @param string $logstr message to log
 * @return void
 */
function appendToLog($logstr)
{
	$timestamp = date("M d H:i:s");

	$log_append_str = "$timestamp " .$logstr;

	$path = $this->pathToLogFile;

	if(!file_exists($path))
	{
		// is_writable() is always false for a missing file, so creation has
		// to be decided by the containing directory instead.
		if(is_writable(dirname($path)) && touch($path))
		{
			// Readable and writable by everyone; a log file needs no execute bit.
			chmod($path, 0666);
		}
	}

	$fp = false;
	if(file_exists($path) && is_writable($path))
	{
		$fp = fopen($path, 'a+');
	}

	if(!$fp)
	{
		die("Unable to write to ".$this->pathToLogFile." ...");
	}

	fputs($fp, "$log_append_str\r\n");
	fclose($fp);
}

}//end class
// Return current item: Spider Engine