Location: PHPKode > scripts > URL and Email Scrapper > url-and-email-scrapper/scrapper.class.php
<?php
/**************************************************/
/*
Released by AwesomePHP.com, under the GPL License, a
copy of it should be attached to the zip file, or
you can view it on http://AwesomePHP.com/gpl.txt
*/
/**************************************************/

/*
URL Email Scrapper:
-------------------
*	Scrape emails and URLs.
*	Limit scraping to certain domain names.
*	Limit scraping to certain file extensions.
*	Exclude certain domain names from scraping.
*	Limit scraping by the number of pages.
*	No duplicate pages are scraped.
*/

/*
Breadth-first scraper that collects e-mail addresses and link URLs.

Typical use:
	$s = new scraperStart();
	$s->setOptions(10);                       // fetch at most 10 pages
	$s->setFile('emails.txt','urls.txt',"\n");
	$s->doOnly('html');                       // optional extension filter
	$s->onlyDomain('example.com');            // optional domain filter
	$s->startScrape('http://example.com/');
	$s->storeList();

Only http/https links are collected; relative links are resolved against
the root of the host of the page they were found on.
*/
class scraperStart{

	/* Variables */
	private $urlList = array();			/* queue of URLs waiting to be fetched */
	private $maxPages = 1;				/* maximum number of pages to fetch */
	private $curPages = 0;				/* number of pages fetched so far */
	private $fileEmail;					/* file the e-mail list is appended to */
	private $fileURL;					/* file the URL list is appended to */
	private $lineSep = "\n";			/* separator between stored items */
	private $onlyPages = array();		/* allowed extensions (lower case, no dot) */
	private $pagesDone = array();		/* set of fetched URLs: url => true */
	private $doneEmails = array();		/* every e-mail address found */
	private $doneURLS = array();		/* every allowed URL found */
	private $doOnlyDomain = array();	/* patterns a URL must match (any of them) */
	private $excludePages = array();	/* patterns a URL must not match */

	/*
	Set max level
	$pages: maximum number of pages to fetch. At least one page is always
	fetched by startScrape().
	*/
	public function setOptions($pages)
	{
		$this->maxPages = (int) $pages;
	}

	/*
	Set file to store
	$fileEmail: file the e-mail list is appended to by storeList()
	$fileURL:   file the URL list is appended to by storeList()
	$sep:       separator written between the items of a list
	*/
	public function setFile($fileEmail,$fileURL,$sep)
	{
		$this->fileEmail = $fileEmail;
		$this->fileURL = $fileURL;
		$this->lineSep = $sep;
	}

	/*
	Do only a certain domain
	$domainName is used as a case-insensitive regular expression (as the
	former eregi() call did); a URL is kept when it matches at least one
	registered pattern. May be called several times.
	*/
	public function onlyDomain($domainName)
	{
		$this->doOnlyDomain[] = $domainName;
	}

	/*
	Do not do these domains
	$domains is used as a case-insensitive regular expression; a URL that
	matches any registered pattern is dropped. May be called several times.
	*/
	public function excludeDomain($domains)
	{
		$this->excludePages[] = $domains;
	}

	/*
	Start scraping
	Queues $url and fetches pages until the page limit is reached or the
	queue is empty. Results are kept in memory until storeList() is called.
	*/
	public function startScrape($url)
	{
		if(!is_string($url) || $url === ''){ return;}

		$this->urlList[] = $url;
		$this->doScrape();
	}

	/*
	Do only these page types
	$type: file extension such as 'php' or 'html'. Case and a leading dot
	are ignored. When no type is registered, every extension is accepted.
	*/
	public function doOnly($type)
	{
		$this->onlyPages[] = strtolower(ltrim((string) $type,'.'));
	}

	/*
	function to end scrape and store lists in file
	Prints the number of unique e-mails and URLs, then appends both lists
	to the files given to setFile(). A file that cannot be written raises
	an E_USER_WARNING instead of aborting the script.
	*/
	public function storeList()
	{
		$this->doneURLS = $this->clearDubs($this->doneURLS);
		$this->doneEmails = $this->clearDubs($this->doneEmails);
		echo 'E->'.count($this->doneEmails);
		echo '<br />U->'.count($this->doneURLS);

		$this->appendList($this->fileEmail,$this->doneEmails);
		$this->appendList($this->fileURL,$this->doneURLS);
	}

	/*
	Append one list to a file - private
	Writes a leading newline followed by the items joined with the
	configured separator.
	*/
	private function appendList($filename,$items)
	{
		if(!is_string($filename) || $filename === ''){
			trigger_error('scraperStart: no output file set, list not stored',E_USER_WARNING);
			return;
		}

		$written = file_put_contents($filename,"\n".implode((string) $this->lineSep,$items),FILE_APPEND);
		if($written === false){
			trigger_error('scraperStart: could not write to '.$filename,E_USER_WARNING);
		}
	}

	/*
	Function to actually start scraping - private
	Iterative on purpose: one stack frame per page would overflow on large
	crawls. The first page is always fetched; after that the loop stops as
	soon as maxPages pages have been fetched.
	*/
	private function doScrape()
	{
		do {
			$thisURL = $this->getNextPage();
			if($thisURL === null){ break;}

			$this->scrapePage($thisURL);
			$this->curPages++;
		} while($this->curPages < $this->maxPages && count($this->urlList) > 0);
	}

	/*
	Fetch one page and collect its e-mails and links - private
	*/
	private function scrapePage($thisURL)
	{
		$htmlCode = $this->getContents($thisURL);
		if($htmlCode === ''){ return;}

		if(preg_match_all('/(\w+\.)*\w+@(\w+\.)*\w+(\w+\-\w+)*\.\w+/is',$htmlCode,$emails)){
			$this->addEmails($emails[0]);
		}

		/* href of every <a> tag, with either quote style and any other attributes */
		if(preg_match_all('~<\s*a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1~is',$htmlCode,$locals)){
			$thisURLvars = parse_url($thisURL);
			if(!is_array($thisURLvars)){ $thisURLvars = array();}

			$this->addUrls($this->editLocals($locals[2],$thisURLvars));
		}
	}

	/*
	Get page content, try several ways - private
	First the stream wrapper (file_get_contents), then a plain socket
	request for http/https URLs. Returns the page body, or '' when the
	page could not be fetched.
	*/
	private function getContents($url)
	{
		if(!is_string($url) || $url === ''){ return '';}

		$code = file_get_contents($url);
		if($code !== false && $code !== ''){ return $code;}

		$urlNew = parse_url($url);
		if(!is_array($urlNew) || empty($urlNew['host'])){ return '';}

		$scheme = isset($urlNew['scheme']) ? strtolower($urlNew['scheme']) : 'http';
		if($scheme !== 'http' && $scheme !== 'https'){ return '';}

		$secure = ($scheme === 'https');
		$defaultPort = $secure ? 443 : 80;
		$port = isset($urlNew['port']) ? (int) $urlNew['port'] : $defaultPort;
		$host = $urlNew['host'];

		/* the fragment is never sent to the server */
		$gets = (isset($urlNew['path']) && $urlNew['path'] !== '') ? $urlNew['path'] : '/';
		if(isset($urlNew['query']) && $urlNew['query'] !== ''){
			$gets .= '?'.$urlNew['query'];
		}

		$fp = fsockopen(($secure ? 'ssl://' : '').$host, $port, $errno, $errstr, 30);
		if(!$fp){ return '';}

		/* HTTP/1.0 so the server does not answer with chunked encoding */
		$out = "GET $gets HTTP/1.0\r\n";
		$out .= 'Host: '.$host.($port === $defaultPort ? '' : ':'.$port)."\r\n";
		$out .= "Connection: Close\r\n\r\n";

		fwrite($fp, $out);
		$response = stream_get_contents($fp);
		fclose($fp);

		if($response === false){ return '';}

		/* drop the response headers, keep only the body */
		$bodyStart = strpos($response,"\r\n\r\n");
		if($bodyStart === false){ return $response;}
		return (string) substr($response,$bodyStart + 4);
	}

	/*
	Remember the e-mail addresses found on a page - private
	*/
	private function addEmails($list)
	{
		foreach($this->clearDubs($list) as $item)
		{
			$this->doneEmails[] = $item;
		}
	}

	/*
	Remember the allowed URLs found on a page and queue them - private
	*/
	private function addUrls($list)
	{
		foreach($this->clearDubs($list) as $item)
		{
			if(!$this->isAllowed($item)){ continue;}

			$this->doneURLS[] = $item;
			/* no point in queueing a page that was already fetched */
			if(!isset($this->pagesDone[$item])){
				$this->urlList[] = $item;
			}
		}
	}

	/*
	Function to clear duplicates - private
	Keeps the first occurrence of every item, in order. The literal item
	'href' is dropped as well (kept from the original implementation).
	*/
	private function clearDubs($list)
	{
		$seen = array();
		$newList = array();
		foreach($list as $item){
			$key = (string) $item;
			if($key === 'href' || isset($seen[$key])){ continue;}

			$seen[$key] = true;
			$newList[] = $item;
		}
		return $newList;
	}

	/*
	Edit local links to be public - private
	$list:      raw href values
	$urlParsed: parse_url() result of the page the links were found on
	Returns absolute http/https URLs. Links with another scheme (mailto:,
	javascript:, ...), empty links, pure fragments, and relative links that
	cannot be resolved because the page has no host are left out.
	*/
	private function editLocals($list,$urlParsed)
	{
		$scheme = 'http';
		if(isset($urlParsed['scheme']) && strtolower($urlParsed['scheme']) === 'https'){
			$scheme = 'https';
		}

		$base = '';
		if(!empty($urlParsed['host'])){
			$base = $scheme.'://'.$urlParsed['host'];
			if(isset($urlParsed['port'])){ $base .= ':'.$urlParsed['port'];}
		}

		$newList = array();
		foreach($list as $item)
		{
			$item = trim($item);
			if($item === '' || $item[0] === '#'){ continue;}

			/* already absolute: keep web links, drop every other scheme */
			if(preg_match('~^([a-z][a-z0-9+.\-]*):~i',$item,$found)){
				$itemScheme = strtolower($found[1]);
				if($itemScheme === 'http' || $itemScheme === 'https'){
					$newList[] = $item;
				}
				continue;
			}

			/* protocol-relative link: only the scheme is missing */
			if(substr($item,0,2) === '//'){
				$newList[] = $scheme.':'.$item;
				continue;
			}

			if($base === ''){ continue;}

			if($item[0] === '/'){
				$newList[] = $base.$item;
			} else {
				$newList[] = $base.'/'.$item;
			}
		}
		return $newList;
	}

	/*
	Get next page to parse - private
	Returns the first queued URL that has not been fetched yet and marks
	it as done, or null when the queue holds nothing new.
	*/
	private function getNextPage()
	{
		while(count($this->urlList) > 0){
			$doPage = array_shift($this->urlList);
			if(!is_string($doPage) || $doPage === '' || isset($this->pagesDone[$doPage])){
				continue;
			}

			$this->pagesDone[$doPage] = true;
			return $doPage;
		}
		return null;
	}

	/*
	Case-insensitive regular expression test - private
	Replacement for eregi(), which was removed in PHP 7. An empty pattern
	never matches.
	*/
	private function matchesPattern($pattern,$subject)
	{
		if(!is_string($pattern) || $pattern === ''){ return false;}

		$regex = '~'.str_replace('~','\~',$pattern).'~i';
		return preg_match($regex,$subject) === 1;
	}

	/*
	Is the URL allowed by the domain and extension filters - private
	The extension test looks at the text after the last dot of the URL and
	accepts it when its first 2, 3 or 4 characters equal a registered type,
	so 'page.php?id=1' passes for type 'php'.
	*/
	private function isAllowed($url)
	{
		foreach($this->excludePages as $curDomain){
			if($this->matchesPattern($curDomain,$url)){ return false;}
		}

		if(count($this->doOnlyDomain) > 0){
			$is_here = false;
			foreach($this->doOnlyDomain as $curDomain){
				if($this->matchesPattern($curDomain,$url)){
					$is_here = true;
					break;
				}
			}
			if(!$is_here){ return false;}
		}

		/* no extension filter registered: accept everything */
		if(count($this->onlyPages) === 0){ return true;}

		$pieces = explode('.',$url);
		$tail = strtolower(end($pieces));
		foreach(array(2,3,4) as $length){
			if(in_array(substr($tail,0,$length),$this->onlyPages,true)){ return true;}
		}
		return false;
	}
}

?>
Return current item: URL and Email Scrapper