Location: PHPKode > scripts > MegaSpider > megaspider/spider.class.php
<?php
/**
* Project:     MegaSpider : small fun project ;)
* File:        spider.class.php
*
* MegaSpider crawls a site, following its links and retrieving
* any data that you want to retrieve.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* For questions, help, comments, discussion, etc., please 
* send e-mail to marcin sznyra : hide@address.com
* 
* @package MegaSpider
* @desc Retrieving any data from website.
* @link http://www.marcinsznyra.info/
* @copyright 2005 Marcin Szymon Sznyra <hide@address.com>
* @license Public Domain LGPL
* @author Marcin Szymon Sznyra <hide@address.com>
* @version $Id: spider.class.php,v 1.0 RC 2005/05/05 08:32:25 MarcinSznyra Exp$;
* @todo add database driver and controller, 
*/

/**
* Small single-site crawler built on one reusable cURL handle.
*
* Typical use: create the object, then call assign() with the start URL.
* Every downloaded page is scanned for host names, local links and
* "mailto:" addresses; local links are queued in $_unique_hrefs and
* fetched one after another until none is left unvisited.
*
* Members whose name starts with an underscore are internal by convention.
* They stay public so that existing code that reads or tweaks them
* (for example $_sleep or the $_CURLOPT_* settings) keeps working.
*/
class MegaSpider
{
	/**
	* Start address of the crawled website, set by assign().
	* @var string url
	*/
	public $_url = null;

	/**
	* Domain address of $_url (not used by the class itself).
	* @var string domain name
	*/
	public $_domain = null;

	/**
	* Address of the most recently requested queued page.
	* @var string url
	* @deprecated
	*/
	public $_actual = null;

	/** CURL OPTIONS **/

	/**
	* cURL handle created in the constructor and reused for every request.
	* @var resource|object|false
	*/
	public $_CURL_RESOURCE = null;

	/**
	* Value passed to CURLOPT_FAILONERROR.
	* @var boolean
	*/
	public $_CURLOPT_FAILONERROR = false;

	/**
	* Value passed to CURLOPT_FOLLOWLOCATION.
	* @var boolean
	*/
	public $_CURLOPT_FOLLOWLOCATION = true;

	/**
	* Value passed to CURLOPT_RETURNTRANSFER.
	* @var boolean
	*/
	public $_CURLOPT_RETURNTRANSFER = true;

	/**
	* Value passed to CURLOPT_TIMEOUT (seconds).
	* @var integer
	*/
	public $_CURLOPT_TIMEOUT = 3;

	/**
	* Value passed to CURLOPT_POST, only when POST fields are configured.
	* @var boolean
	*/
	public $_CURLOPT_POST = true;

	/**
	* Request body passed to CURLOPT_POSTFIELDS; only a string longer
	* than one character is sent.
	* @var string|null
	*/
	public $_CURLOPT_POSTFIELDS = null;

	/**
	* Value passed to CURLOPT_USERAGENT when not empty.
	* @var string Curl browser identification
	*/
	public $_CURLOPT_USERAGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)";

	/**
	* Value passed to CURLOPT_USERPWD when longer than two characters.
	* @var string|null
	*/
	public $_CURLOPT_USERPWD = null;

	/**
	* Cookie save path; the constructor prefixes it with this file's directory.
	* @var string cookie save path
	*/
	public $_CURLOPT_COOKIEJAR = '/cookies/';

	/**
	* Cookie load path; the constructor prefixes it with this file's directory.
	* @var string cookie save path
	*/
	public $_CURLOPT_COOKIEFILE = '/cookies/';

	/** DEBUG MODE **/

	/**
	* When true the class prints an HTML progress log.
	* @var boolean mode
	*/
	public $_debug = true;

	/** SETTINGS **/

	/**
	* Seconds to sleep after every request.
	* @var integer time
	*/
	public $_sleep = 3;

	/** DATA **/

	/**
	* Hosts found so far, keyed by host name without a leading "www.".
	* Each entry holds the keys 'www', 'protocol' and 'port'.
	* @var array adressess
	*/
	public $_unique_addreses = array( );

	/**
	* Local links found so far, keyed by href.
	* Each entry holds the keys 'href', 'name' and 'visit' (request counter).
	* @var array links
	*/
	public $_unique_hrefs = array( );

	/**
	* E-mail addresses found so far (address => address).
	* @var array emails
	*/
	public $_unique_emails = array( );

	/** TIMER **/

	/**
	* Timestamp (seconds, float) stored by _run_time().
	* @var float|null
	*/
	public $_run_time = null;

	/**
	* Timestamp (seconds, float) stored by _end_time().
	* @var float|null
	*/
	public $_end_time = null;


	/**
	* Switches on full error display, raises the time limit to 24 hours,
	* prints the debug page header, starts the timer, resolves the cookie
	* paths relative to this file and creates the cURL handle.
	*
	* @param boolean|null $debug optional override of $_debug; null keeps the default
	* @return void
	* @access public
	*/
	public function __construct( $debug = null )
	{
		error_reporting( E_ALL );
		ini_set( 'display_errors' , true );
		set_time_limit( 3600 * 24 );

		if( $debug !== null )
		{
			$this -> _debug = (bool) $debug;
		}

		if( $this -> _debug == true )
		{
			// The three fragments used to overwrite each other, so only
			// the <body> tag was ever printed; they are concatenated now.
			$body  = '<html>';
			$body .= '<head><title>MegSpider</title></head>';
			$body .= '<body style="background:#000000;font-family:sans-serif;font-size:11px;color:#ffffff">';
			print( $body );

			print( '<h4>MegaSpider __Construct start running application</h4><br>' );
			$this -> _scrollDown( );
		}

		$this -> _run_time( );

		// NOTE(review): with the default '/cookies/' both paths name a directory,
		// not a file, so cURL presumably cannot persist cookies there - confirm
		// and point these at a writable file if persistence is wanted.
		$baseDir = realpath( dirname( __FILE__ ) );
		$this -> _CURLOPT_COOKIEJAR = $baseDir.'/'.$this -> _CURLOPT_COOKIEJAR;
		$this -> _CURLOPT_COOKIEFILE = $baseDir.'/'.$this -> _CURLOPT_COOKIEFILE;

		$this -> _CURL_RESOURCE = curl_init( );

		// curl_init() yields a resource on PHP < 8 and a CurlHandle object on
		// PHP >= 8, so success is "anything but false" rather than is_resource().
		if( $this -> _CURL_RESOURCE !== false )
		{
			$this -> _debugLine( __LINE__.' <b>_CURL init()</b> executed sucesfully <br>' );
		}
	}

	/**
	* PHP 4 style constructor name, kept so that code calling
	* parent::MegaSpider() keeps working. It must stay below __construct().
	*
	* @param boolean|null $debug see __construct()
	* @return void
	* @access public
	*/
	public function MegaSpider( $debug = null )
	{
		$this -> __construct( $debug );
	}


	/**
	* Sets the start address, crawls from it, then prints the
	* statistics and the total run time (both only in debug mode).
	*
	* @param string $url start address
	* @return void
	* @access public
	*/
	public function assign( $url = 'http://www.msn.com' )
	{
		$this -> _url = $url;

		$this -> _debugLine( __LINE__.' Assign method started : address -> '.$this -> _escape( $url ).'<br>' );

		$this -> getPage( $url );

		$this -> _stats( );
		$this -> _end_time( );
	}

	/**
	* Downloads and parses the given page, then keeps crawling until
	* no queued link is left unvisited.
	*
	* @param string $page full address, or a key of $_unique_hrefs
	* @return void
	* @access private
	*/
	public function getPage( $page )
	{
		$this -> _crawlPage( $page );
		$this -> getUniqueHref( );
	}

	/**
	* Processes every entry of $_unique_hrefs whose visit counter is
	* still 0, including entries discovered while doing so. Implemented
	* as a loop, so the call depth no longer grows with the page count.
	*
	* @return void
	* @access private
	*/
	public function getUniqueHref( )
	{
		while( ( $next = $this -> _nextUnvisitedHref( ) ) !== null )
		{
			$this -> _debugLine( '<br><br><h4>Parse page :: '.$this -> _escape( $this -> _unique_hrefs[$next]['name'] ).'</h4> <br>' );

			// _crawlPage() raises the visit counter of $next, so every
			// pass through the loop retires one queue entry.
			$this -> _crawlPage( $next );
		}
	}

	/**
	* Configures the shared cURL handle from the $_CURLOPT_* properties
	* and requests the given address.
	*
	* The result array is returned on failure as well (earlier versions
	* returned nothing in that case), so callers can inspect the error.
	*
	* @param string $address address to request
	* @return array 'pages' => curl_exec() result, 'error' => curl_errno(), 'ermsg' => curl_error()
	* @access private
	*/
	public function getContent( $address )
	{
		$handle = $this -> _CURL_RESOURCE;

		curl_setopt( $handle , CURLOPT_URL , $address );
		curl_setopt( $handle , CURLOPT_FAILONERROR , $this -> _CURLOPT_FAILONERROR );
		curl_setopt( $handle , CURLOPT_FOLLOWLOCATION , $this -> _CURLOPT_FOLLOWLOCATION );
		curl_setopt( $handle , CURLOPT_RETURNTRANSFER , $this -> _CURLOPT_RETURNTRANSFER );
		curl_setopt( $handle , CURLOPT_TIMEOUT , $this -> _CURLOPT_TIMEOUT );
		curl_setopt( $handle , CURLOPT_COOKIEJAR , $this -> _CURLOPT_COOKIEJAR );
		curl_setopt( $handle , CURLOPT_COOKIEFILE , $this -> _CURLOPT_COOKIEFILE );

		// is_string()/casts keep strlen() away from null, which newer PHP rejects.
		$postFields = $this -> _CURLOPT_POSTFIELDS;
		if( is_string( $postFields ) && strlen( $postFields ) > 1 )
		{
			curl_setopt( $handle , CURLOPT_POST , $this -> _CURLOPT_POST );
			curl_setopt( $handle , CURLOPT_POSTFIELDS , $postFields );
		}

		$userAgent = (string) $this -> _CURLOPT_USERAGENT;
		if( strlen( $userAgent ) > 0 )
		{
			curl_setopt( $handle , CURLOPT_USERAGENT , $userAgent );
		}

		$userPwd = (string) $this -> _CURLOPT_USERPWD;
		if( strlen( $userPwd ) > 2 )
		{
			curl_setopt( $handle , CURLOPT_USERPWD , $userPwd );
		}

		$ret = array( );
		$ret['pages'] = curl_exec( $handle );
		$ret['error'] = curl_errno( $handle );
		$ret['ermsg'] = curl_error( $handle );

		return $ret;
	}

	/**
	* Collects the targets of all "mailto:" links found in the given
	* html into $_unique_emails. A "?subject=..." style suffix is not
	* part of the stored address.
	*
	* @param string $content html code (by reference only to avoid a copy, not modified)
	* @return void
	* @access private
	*/
	public function _unique_emails( &$content )
	{
		$this -> _debugLine( __LINE__.' Parse unique emails <br>' );

		if( empty( $content ) )
		{
			$this -> _debugLine( __LINE__.' Content is empty! <br>' );
			return;
		}

		$this -> _debugLine( __LINE__.' Parse Parsing content <br>' );

		// Stop at quotes, whitespace, tag brackets and "?" so that neither
		// the rest of the page nor mailto parameters end up in the address.
		preg_match_all( '/mailto:([^"\'?\s<>]+)/i' , $content , $arrEmails );

		if( empty( $arrEmails[1] ) )
		{
			return;
		}

		$this -> _debugLine( __LINE__.' <b>Found not identified emails</b> <br>' );
		$this -> _debugLine( '---------------------------<br>' );

		foreach( $arrEmails[1] AS $email )
		{
			if( isset( $this -> _unique_emails[$email] ) )
			{
				$this -> _debugLine( __LINE__.' Parse: Address exits at _unique_emails, breaking : '.$this -> _escape( $email ).' <br>' );
				continue;
			}

			$this -> _unique_emails[$email] = $email;
			$this -> _debugLine( __LINE__.' Parse: Editing _unique_emails, making : '.$this -> _escape( $email ).' <br>' );
		}

		$this -> _debugLine( '---------------------------<br>' );
	}

	/**
	* Collects the host names of all addresses found in the given html
	* into $_unique_addreses. Hosts are keyed without a leading "www.";
	* protocol and port (with its leading colon) of the first occurrence
	* are stored, null when absent.
	*
	* @param string $content html code (by reference only to avoid a copy, not modified)
	* @return void
	* @access private
	*/
	public function _unique_addresses( &$content )
	{
		$this -> _debugLine( __LINE__.' Parse unique adressess <br>' );

		if( empty( $content ) )
		{
			return;
		}

		$this -> _debugLine( __LINE__.' Parse: Parsing content <br>' );

		// Groups: 1 protocol, 2 host, 3 ":port", 4 folder, 5 file/query.
		// Groups 4 and 5 are not stored but still decide where a match ends.
		preg_match_all( '/(http|https)?:\/\/?([a-zA-Z0-9\-\.]*\.[a-zA-Z]{2,5})(:[a-zA-Z0-9]*)?\/?([a-zA-Z0-9.-_]*\/)?([a-zA-Z0-9.-_?&=%+$]+)?/' , $content , $arrAddress );

		if( empty( $arrAddress[2] ) )
		{
			return;
		}

		$this -> _debugLine( __LINE__.' <b>Found not identified addresses</b> <br>' );
		$this -> _debugLine( '---------------------------<br>' );

		foreach( $arrAddress[2] AS $index => $host )
		{
			// Only a literal leading "www." is dropped; the old eregi pattern
			// removed "www" plus any character anywhere inside the host.
			$host = preg_replace( '/^www\./i' , '' , $host );

			if( isset( $this -> _unique_addreses[$host] ) )
			{
				$this -> _debugLine( __LINE__.' Parse: Address exits at _unique_addersses : '.$this -> _escape( $host ).' <br>' );
				continue;
			}

			$this -> _debugLine( __LINE__.' Parse: Editing _unique_addreses, making : '.$this -> _escape( $host ).' <br>' );

			$this -> _unique_addreses[$host] = array(
				'www'      => $host,
				'protocol' => empty( $arrAddress[1][$index] ) ? null : $arrAddress[1][$index],
				'port'     => empty( $arrAddress[3][$index] ) ? null : $arrAddress[3][$index]
			);
		}

		$this -> _debugLine( '---------------------------<br>' );
	}

	/**
	* Collects all local links (<a ... href="...">) found in the given
	* html into $_unique_hrefs with a visit counter of 0. A leading $_url
	* is cut off the href. Links are rejected when they are empty, start
	* with "#", or contain "http", "www", "mailto", "javascript" or ";".
	*
	* @param string $content html code (by reference only to avoid a copy, not modified)
	* @return void
	* @access private
	*/
	public function _unique_hrefs( &$content )
	{
		$this -> _debugLine( __LINE__.' Parse unique hrefs <br>' );

		if( empty( $content ) )
		{
			return;
		}

		$this -> _debugLine( __LINE__.' Parse: Parsing content <br>' );

		// Group 1 is the href, group 2 the link text. Attributes may precede
		// or follow href, and the lazy (.*?) keeps several links on one line apart.
		preg_match_all( '/<a\s(?:[^>]*?\s)?href="([^"]+)"[^>]*>(.*?)<\/a>/is' , $content , $arrAddress );

		if( empty( $arrAddress[1] ) )
		{
			return;
		}

		$this -> _debugLine( __LINE__.' <b>Found not identified hrefs</b> <br>' );
		$this -> _debugLine( '---------------------------<br>' );

		$base = (string) $this -> _url;
		$baseLength = strlen( $base );

		foreach( $arrAddress[1] AS $id => $href )
		{
			// Make absolute links to the crawled site relative again.
			if( $baseLength > 0 && stripos( $href , $base ) === 0 )
			{
				$href = (string) substr( $href , $baseLength );
			}

			$rejected = $href === ''
				|| strpos( $href , '#' ) === 0
				|| preg_match( '/http|www|mailto|javascript|;/i' , $href ) === 1;

			if( $rejected )
			{
				$this -> _debugLine( __LINE__.' Parse: bad href expresion, breaking : '.$this -> _escape( $href ).' <br>' );
				continue;
			}

			if( isset( $this -> _unique_hrefs[$href] ) )
			{
				$this -> _debugLine( __LINE__.' Parse: Href exitst at _unique_hrefs, breaking : '.$this -> _escape( $href ).' <br>' );
				continue;
			}

			// Recorded regardless of debug mode - the queue used to be filled
			// only while debugging, which stopped the crawl otherwise.
			$this -> _unique_hrefs[$href] = array(
				'href'  => $href,
				'name'  => $arrAddress[2][$id],
				'visit' => 0
			);

			$this -> _debugLine( __LINE__.' Parse: Editing _unique_hrefs, making : '.$this -> _escape( $href ).' <br>' );
		}

		$this -> _debugLine( '---------------------------<br>' );
	}

	/**
	* Prints, in debug mode only, how many hosts, e-mail addresses
	* and links have been collected.
	*
	* @return void
	* @access private
	*/
	public function _stats( )
	{
		if( $this -> _debug != true )
		{
			return;
		}

		print( '<br><br><h4>Total Statistic </h4>' );
		print( '---------------------------<br>' );
		$this -> _debugLine( 'Unique WWW addresses : '. count( $this -> _unique_addreses ).' <br>' );
		$this -> _debugLine( 'Unique EMAIL addresses : '. count( $this -> _unique_emails ).' <br>' );
		$this -> _debugLine( 'Total Pages parsed : '. count( $this -> _unique_hrefs ).' <br>' );
		print( '---------------------------<br>' );
	}


	/**
	* Prints, in debug mode only, a javascript snippet that scrolls
	* the browser window to the bottom of the log.
	*
	* @return void
	* @access private
	*/
	public function _scrollDown( )
	{
		if( $this -> _debug == true )
		{
			print( '<script> window.scrollBy(0,1000000); </script> ' );
		}
	}

	/**
	* Stores the current time as the start of the run.
	*
	* @return void
	* @access private
	*/
	public function _run_time( )
	{
		$this -> _run_time = microtime( true );
	}


	/**
	* Stores the current time as the end of the run and returns the
	* seconds elapsed since _run_time(), rounded to three decimals.
	* In debug mode the value is printed as well.
	*
	* @return float run time in seconds
	* @access public
	*/
	public function _end_time( )
	{
		$this -> _end_time = microtime( true );

		$total = round( $this -> _end_time - $this -> _run_time , 3 );

		if( $this -> _debug == true )
		{
			print( '<h4>Total Time : '.$total.' </h4>' );
		}

		return $total;
	}

	/**
	* Downloads one page and feeds its html to the three parsers.
	* When $page is a key of $_unique_hrefs its visit counter is raised
	* and the request goes to $_url joined with the stored href.
	* A failed download is reported in debug mode and not parsed.
	*
	* @param string $page full address, or a key of $_unique_hrefs
	* @return void
	*/
	private function _crawlPage( $page )
	{
		if( isset( $this -> _unique_hrefs[$page] ) )
		{
			$this -> _debugLine( __LINE__.' Found address in _unique_hrefs : '.$this -> _escape( $page ).' <br>' );

			$this -> _unique_hrefs[$page]['visit'] = $this -> _unique_hrefs[$page]['visit'] + 1;

			$this -> _debugLine( __LINE__.' Change visit number : '.$this -> _unique_hrefs[$page]['visit'].' <br>' );

			$page = $this -> _joinUrl( $this -> _url , $this -> _unique_hrefs[$page]['href'] );

			$this -> _debugLine( __LINE__.' Parse address : '.$this -> _escape( $page ).' <br>' );

			$this -> _actual = $page;
		}

		$result = $this -> getContent( $page );

		$this -> _debugLine( __LINE__.' Parse content <b>date : '.date('h:i:s').'</b><br>' );
		$this -> _debugLine( __LINE__.' Sleep time : '.($this -> _sleep).' seconds<br>' );

		// Pause after every request, successful or not, to go easy on the server.
		sleep( $this -> _sleep );

		$this -> _debugLine( __LINE__.' Parsing content date : '.date('h:i:s').'<br>' );

		if( $result['error'] != 0 )
		{
			// The cURL error text is not page content, so nothing is parsed.
			$this -> _debugLine( __LINE__.' Download failed ('.$result['error'].') : '.$this -> _escape( $result['ermsg'] ).' <br>' );
			return;
		}

		$content = $result['pages'];

		$this -> _unique_addresses( $content );
		$this -> _unique_hrefs( $content );
		$this -> _unique_emails( $content );
	}

	/**
	* Returns the key of the first $_unique_hrefs entry that has not
	* been requested yet, or null when there is none.
	*
	* @return string|integer|null
	*/
	private function _nextUnvisitedHref( )
	{
		foreach( $this -> _unique_hrefs AS $key => $entry )
		{
			if( $entry['visit'] == 0 )
			{
				return $key;
			}
		}

		return null;
	}

	/**
	* Joins the start address and a stored href with exactly one slash.
	* This is plain concatenation, not RFC 3986 resolution: "../" segments
	* and start addresses that end in a file name are not handled.
	*
	* @param string $base start address
	* @param string $path href relative to the start address
	* @return string
	*/
	private function _joinUrl( $base , $path )
	{
		return rtrim( (string) $base , '/' ).'/'.ltrim( (string) $path , '/' );
	}

	/**
	* Makes a scraped value safe to show inside the html debug log.
	*
	* @param string $text raw value
	* @return string
	*/
	private function _escape( $text )
	{
		return htmlentities( strip_tags( (string) $text ) );
	}

	/**
	* Prints one chunk of the html debug log and scrolls the browser
	* down; does nothing when debug mode is off.
	*
	* @param string $html ready-to-print html
	* @return void
	*/
	private function _debugLine( $html )
	{
		if( $this -> _debug == true )
		{
			print( $html );
			$this -> _scrollDown( );
		}
	}
}
?>
Return current item: MegaSpider