Location: PHPKode > scripts > Web Crawler using MySQL DB > web-crawler-using-mysql-db/spider_simple.class.php
<?

/**
* About author:
*  vivekanandan
* email: hide@address.com
*
* If you want any help with the spider or anything else in PHP, just mail me.
*
* About class:
*  WebSpider    -  the constructor sets the domain & URL to map
*  
*   processTagInPageData() 	- processes the anchor & frame tags, as Googlebot does
*   fetchURLPageData() 		- returns the HTML page content for the given URL
* 	isURLExists()			- checks whether the given URL has already been added to the DB
*   displayDomainRecords    - displays the records from the DB
* 	StoreUniqueURL 			- stores a URL in the DB if it is not already there
* 	processSpecificTagbyType-  parses every occurrence of a tag, truncating the parsed part from the string
*/



// Surface PHP errors in the response while the spider runs (ini values are strings).
ini_set('display_errors', '1');


/* table structure 

CREATE TABLE `spider` (
  `id` bigint(20) NOT NULL auto_increment,
  `domain` varchar(150) NOT NULL,
  `url` varchar(2000) NOT NULL,
  `parentid` int(11) NOT NULL,
  `visitflag` int(11) NOT NULL,
  `type` varchar(20) NOT NULL,
  `level` mediumint(9) NOT NULL,
  PRIMARY KEY  (`id`)
)
*/


/**
 * Single-page web spider that records the links of one page in the `spider` table.
 *
 * Typical flow: construct with a domain and a start URL, assign the public
 * $mDB* connection properties (the constructor does not set them), call
 * ProcessSpiderInit() to register the start URL, then fetchURLDataandParseURL()
 * to download the page and store every <a href> / <frame src> target.
 *
 * Database access uses mysqli prepared statements; the removed ext/mysql
 * functions are no longer called. Database failures are reported as
 * RuntimeException (mysqli_sql_exception, which mysqli itself may throw,
 * is a subclass of it).
 */
class WebSpider
{
    /** Depth value supplied by the caller; stored only, no method of this class reads it. */
    public $mMaxDepth;
    /** Domain under which URLs are stored; also sent as the HTTP referer. */
    public $mDomain;
    /** Database host; must be assigned by the caller before any DB method is used. */
    public $mDBHost;
    /** Database user name; assigned by the caller. */
    public $mDBUserName;
    /** Database password; assigned by the caller. */
    public $mDBPassword;
    /** Database (schema) name; assigned by the caller. */
    public $mDBDatabase;
    /** Declared for callers' use; no method of this class assigns it. */
    public $mURLPageData;
    /** Start URL that is registered and fetched. */
    public $mURL;

    /** Cached mysqli link, opened lazily by getConnection(). */
    private $mDBLink = null;
    /** Fingerprint of the settings $mDBLink was opened with. */
    private $mDBLinkKey = null;

    /**
     * Store the crawl settings.
     *
     * Declared before the legacy WebSpider() method so that older PHP versions
     * do not report a redefined constructor.
     *
     * @param string $pmDomain domain the crawl belongs to
     * @param int    $pmDepth  maximum depth (stored only)
     * @param string $pmURL    start URL
     */
    public function __construct($pmDomain, $pmDepth, $pmURL)
    {
        $this->mDomain   = $pmDomain;
        $this->mMaxDepth = $pmDepth;
        $this->mURL      = $pmURL;
    }

    /**
     * Legacy PHP 4-style constructor name, kept so explicit calls such as
     * parent::WebSpider(...) keep working. PHP 8 no longer treats a method
     * named after the class as a constructor, hence __construct() above.
     *
     * @param string $pmDomain domain the crawl belongs to
     * @param int    $pmDepth  maximum depth (stored only)
     * @param string $pmURL    start URL
     */
    public function WebSpider($pmDomain, $pmDepth, $pmURL)
    {
        $this->__construct($pmDomain, $pmDepth, $pmURL);
    }

    /**
     * Return an open mysqli link built from the $mDB* properties.
     *
     * The link is reused between calls and reopened only when one of the
     * connection properties has changed since it was created.
     *
     * @return mysqli
     * @throws RuntimeException when the connection cannot be established
     */
    private function getConnection()
    {
        $vKey = implode("\0", array(
            (string) $this->mDBHost,
            (string) $this->mDBUserName,
            (string) $this->mDBPassword,
            (string) $this->mDBDatabase,
        ));

        if ($this->mDBLink === null || $this->mDBLinkKey !== $vKey) {
            if ($this->mDBLink !== null) {
                mysqli_close($this->mDBLink);
                $this->mDBLink = null;
            }
            $oLink = mysqli_connect(
                $this->mDBHost,
                $this->mDBUserName,
                $this->mDBPassword,
                $this->mDBDatabase
            );
            if (!$oLink) {
                throw new RuntimeException(
                    'WebSpider: cannot connect to the database: ' . mysqli_connect_error()
                );
            }
            $this->mDBLink    = $oLink;
            $this->mDBLinkKey = $vKey;
        }

        return $this->mDBLink;
    }

    /**
     * Prepare a statement on the shared connection.
     *
     * @param string $pmSQL SQL text with ? placeholders
     * @return mysqli_stmt
     * @throws RuntimeException when the statement cannot be prepared
     */
    private function prepareStatement($pmSQL)
    {
        $oLink = $this->getConnection();
        $oStmt = mysqli_prepare($oLink, $pmSQL);
        if (!$oStmt) {
            throw new RuntimeException('WebSpider: cannot prepare query: ' . mysqli_error($oLink));
        }

        return $oStmt;
    }

    /**
     * Execute a prepared statement, turning a failure into an exception.
     *
     * @param mysqli_stmt $pmStmt statement with its parameters already bound
     * @throws RuntimeException when execution fails
     */
    private function executeStatement($pmStmt)
    {
        if (!mysqli_stmt_execute($pmStmt)) {
            $vError = mysqli_stmt_error($pmStmt);
            mysqli_stmt_close($pmStmt);
            throw new RuntimeException('WebSpider: query failed: ' . $vError);
        }
    }

    /**
     * Escape a value for output inside HTML text or attribute context.
     *
     * @param mixed $pmValue value to print
     * @return string
     */
    private function escapeHtml($pmValue)
    {
        return htmlspecialchars((string) $pmValue, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Count the rows already stored for a domain / URL pair.
     *
     * @param string $pmDomain domain column value
     * @param string $pmURL    raw (unescaped) URL
     * @return int number of matching rows, 0 when the URL is unknown
     * @throws RuntimeException on database failure
     */
    public function isURLExists($pmDomain, $pmURL)
    {
        $oStmt = $this->prepareStatement(
            'SELECT COUNT(id) FROM spider WHERE domain = ? AND url = ?'
        );
        mysqli_stmt_bind_param($oStmt, 'ss', $pmDomain, $pmURL);
        $this->executeStatement($oStmt);

        $vCount = 0;
        mysqli_stmt_bind_result($oStmt, $vCount);
        mysqli_stmt_fetch($oStmt);
        mysqli_stmt_close($oStmt);

        return (int) $vCount;
    }

    /**
     * Print an HTML summary and table of every URL stored for a domain.
     *
     * The domain, URL and type are HTML-escaped because they originate from
     * crawled pages and caller input.
     *
     * @param string $pmDomain domain whose rows are listed
     * @throws RuntimeException on database failure
     */
    public function displayDomainRecords($pmDomain)
    {
        $oStmt = $this->prepareStatement('SELECT COUNT(id) FROM spider WHERE domain = ?');
        mysqli_stmt_bind_param($oStmt, 's', $pmDomain);
        $this->executeStatement($oStmt);
        $vTotal = 0;
        mysqli_stmt_bind_result($oStmt, $vTotal);
        mysqli_stmt_fetch($oStmt);
        mysqli_stmt_close($oStmt);

        $oStmt = $this->prepareStatement(
            'SELECT url, type FROM spider WHERE domain = ? ORDER BY id ASC'
        );
        mysqli_stmt_bind_param($oStmt, 's', $pmDomain);
        $this->executeStatement($oStmt);
        $vURL  = null;
        $vType = null;
        mysqli_stmt_bind_result($oStmt, $vURL, $vType);

        echo '<strong>Domain</strong> : ' . $this->escapeHtml($pmDomain)
            . ' <strong>Total URL</strong> :' . (int) $vTotal . "\n";
        echo "<table width='80%'  cellspacing='2' cellpadding='2' border=\"1\">\n";
        echo "  <tr>\n";
        echo "    <td><strong>URL</strong></td>\n";
        echo "    <td><strong>Type</strong></td>\n";
        echo "  </tr>\n";
        while (mysqli_stmt_fetch($oStmt)) {
            echo "  <tr>\n";
            echo '    <td>' . $this->escapeHtml($vURL) . "</td>\n";
            echo '    <td>' . $this->escapeHtml($vType) . "</td>\n";
            echo "  </tr>\n";
        }
        echo "</table>\n";

        mysqli_stmt_close($oStmt);
    }

    /**
     * Insert a URL for a domain unless the same domain / URL pair is already stored.
     *
     * $pmParentId used to carry a default of 0, but it sits before required
     * parameters, so the default could never be used and is deprecated syntax
     * on PHP 8; every caller already has to pass all five arguments.
     *
     * @param string $pmDomain   domain column value
     * @param string $pmURL      raw URL to store
     * @param int    $pmParentId id of the row the URL was found on
     * @param int    $pmLevel    crawl level of the URL
     * @param string $pmType     origin of the URL (e.g. 'index', '<a', '<frame')
     * @throws RuntimeException on database failure
     */
    public function StoreUniqueURL($pmDomain, $pmURL, $pmParentId, $pmLevel, $pmType)
    {
        if ($this->isURLExists($pmDomain, $pmURL) > 0) {
            return;
        }

        $pmParentId = (int) $pmParentId;
        $pmLevel    = (int) $pmLevel;

        $oStmt = $this->prepareStatement(
            'INSERT INTO `spider` (`domain`, `url`, `parentid`, `visitflag`, `type`, `level`)'
            . ' VALUES (?, ?, ?, 0, ?, ?)'
        );
        mysqli_stmt_bind_param($oStmt, 'ssisi', $pmDomain, $pmURL, $pmParentId, $pmType, $pmLevel);
        $this->executeStatement($oStmt);
        mysqli_stmt_close($oStmt);
    }

    /**
     * Find the first occurrence of a tag that carries the given attribute.
     *
     * Tag and attribute names are matched case-insensitively. The attribute is
     * only looked for between the tag name and the next '>', so a tag without
     * the attribute is skipped instead of borrowing one from a later tag.
     *
     * @param string $pmData         markup to search
     * @param string $pmTagName      tag opener, e.g. '<a' or '<frame'
     * @param string $pmAtributeName attribute including '=', e.g. 'href='
     * @return array|false array('url' => attribute value, 'str' => $pmData from
     *                     the attribute onwards, to be fed back in to continue
     *                     the scan), or false when no further match exists
     */
    public function fetchLinkfromTag($pmData, $pmTagName, $pmAtributeName)
    {
        $pmData         = (string) $pmData;
        $pmTagName      = (string) $pmTagName;
        $pmAtributeName = (string) $pmAtributeName;
        if ($pmData === '' || $pmTagName === '' || $pmAtributeName === '') {
            return false;
        }

        $vDataLen = strlen($pmData);
        $vTagLen  = strlen($pmTagName);
        $vAttrLen = strlen($pmAtributeName);
        $vOffset  = 0;

        while ($vOffset < $vDataLen) {
            $vTagPos = stripos($pmData, $pmTagName, $vOffset);
            if ($vTagPos === false) {
                break;
            }

            // Always move past the tag name so the same tag is never matched twice.
            $vBodyStart = $vTagPos + $vTagLen;
            $vOffset    = $vBodyStart;
            if ($vBodyStart >= $vDataLen) {
                break;
            }

            // An unterminated tag extends to the end of the data.
            $vTagEnd  = strpos($pmData, '>', $vBodyStart);
            $vBodyLen = ($vTagEnd === false) ? $vDataLen - $vBodyStart : $vTagEnd - $vBodyStart;
            $vTagBody = (string) substr($pmData, $vBodyStart, $vBodyLen);
            if ($vTagBody === '') {
                continue;
            }

            $vAttrPos = stripos($vTagBody, $pmAtributeName);
            if ($vAttrPos === false) {
                continue;
            }

            $vAttrStart = $vBodyStart + $vAttrPos;

            return array(
                'url' => $this->readAttributeValue((string) substr($pmData, $vAttrStart + $vAttrLen)),
                'str' => (string) substr($pmData, $vAttrStart),
            );
        }

        return false;
    }

    /**
     * Read an attribute value from text that starts right after 'name='.
     *
     * Handles double-quoted, single-quoted and unquoted values. A quoted value
     * with no closing quote runs to the end of the text.
     *
     * @param string $pmText markup following the '=' sign
     * @return string attribute value, '' when there is none
     */
    private function readAttributeValue($pmText)
    {
        $pmText = ltrim((string) $pmText);
        if ($pmText === '') {
            return '';
        }

        $vQuote = $pmText[0];
        if ($vQuote === '"' || $vQuote === "'") {
            $vClose = strpos($pmText, $vQuote, 1);
            if ($vClose === false) {
                return (string) substr($pmText, 1);
            }

            return (string) substr($pmText, 1, $vClose - 1);
        }

        // Unquoted value: ends at the first whitespace or at the end of the tag.
        return (string) substr($pmText, 0, strcspn($pmText, " \t\r\n>"));
    }

    /**
     * Download a URL with cURL and return the response body.
     *
     * The request sends a browser user agent and $mDomain as referer; response
     * headers are excluded from the returned string.
     *
     * @param string $vURL URL to fetch
     * @return string|false response body, or false when the transfer fails
     */
    public function fetchURLPageData($vURL)
    {
        $rCurlRes = curl_init();
        if ($rCurlRes === false) {
            return false;
        }

        curl_setopt($rCurlRes, CURLOPT_URL, $vURL);
        curl_setopt($rCurlRes, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13)');
        curl_setopt($rCurlRes, CURLOPT_REFERER, $this->mDomain);
        curl_setopt($rCurlRes, CURLOPT_AUTOREFERER, true);
        curl_setopt($rCurlRes, CURLOPT_HEADER, 0);         // keep header lines out of the response
        curl_setopt($rCurlRes, CURLOPT_RETURNTRANSFER, 1); // return the body instead of TRUE

        return curl_exec($rCurlRes);
    }

    /**
     * Register the start URL of the crawl as an 'index' entry (parent 0, level 1).
     *
     * @throws RuntimeException on database failure
     */
    public function ProcessSpiderInit()
    {
        $this->StoreUniqueURL($this->mDomain, $this->mURL, 0, 1, 'index');
    }

    /**
     * Store the attribute value of every matching tag found in the markup.
     *
     * Each stored row uses $pmTagName as its type with parent id 1 and level 1.
     * Empty attribute values are skipped.
     *
     * @param string $pmData      markup to scan
     * @param string $pmTagName   tag opener, e.g. '<a'
     * @param string $pmAttribute attribute including '=', e.g. 'href='
     * @throws RuntimeException on database failure
     */
    public function processSpecificTagbyType($pmData, $pmTagName, $pmAttribute)
    {
        $aResult = $this->fetchLinkfromTag($pmData, $pmTagName, $pmAttribute);
        while ($aResult !== false) {
            if ($aResult['url'] !== '') {
                $this->StoreUniqueURL($this->mDomain, $aResult['url'], 1, 1, $pmTagName);
            }
            // 'str' is strictly shorter than the data it came from, so the scan terminates.
            $aResult = $this->fetchLinkfromTag($aResult['str'], $pmTagName, $pmAttribute);
        }
    }

    /**
     * Store the link targets of all anchor and frame tags in the markup.
     *
     * @param string $pmData markup to scan
     * @throws RuntimeException on database failure
     */
    public function processTagInPageData($pmData)
    {
        $this->processSpecificTagbyType($pmData, '<a', "href=");
        $this->processSpecificTagbyType($pmData, '<frame', "src=");
    }

    /**
     * Download the start URL and store every link found in it.
     *
     * Nothing is stored when the download fails.
     *
     * @throws RuntimeException on database failure
     */
    public function fetchURLDataandParseURL()
    {
        $vData = $this->fetchURLPageData($this->mURL);
        if (!is_string($vData)) {
            return;
        }
        $this->processTagInPageData($vData);
    }
}

?>
Return current item: Web Crawler using MySQL DB