Location: PHPKode > projects > Simple Browser Proxy > SBP-0.3.2-development/SBP.class.inc
<?
/*	Author:		Josh Marquis
*	Edited:		08/20/03
*
*	License:	Refer to the file 'LICENSE'
*/

class SBP
{

	var $debug	=	"";	//If it's not empty it will cause a comment with it's contents to be appended to the end of the HTML//
	
//	Pulls in the configuration file, at most once per request.	//
//	PHP evaluates an included file in the scope of the including line, so any
//	plain variables assigned by config.inc are local to this method call.	//
function init()
{
	include_once("config.inc");
}

//	prepend the title with the following text	//
//	Inserts $text immediately after the first opening <title> tag of $source.	//
//	$source:	the document (HTML or an XML feed) as a string.
//	$text:		literal text to place in front of the existing title.
//	Returns the modified document; it is returned unchanged when there is no <title> tag.
function prependTitle($source, $text)
{
	//	'$' and '\' introduce back-references inside a preg_replace() replacement,
	//	so escape them: a title prefix such as "Costs $1" must be inserted verbatim.	//
	$safe_text	=	addcslashes($text, '\\$');

	//	Matched case-insensitively, but "<\$1>" re-emits the tag exactly as it was written:
	//	XML feeds can be broken if the case of the opening and closing tags does NOT match.	//
	$HTML	=	preg_replace("#<(title)>#i", "<\$1>".$safe_text, $source, 1);

	return $HTML;
}

//	my url parser that works better than the builtin function	//
//	Splits a URL into the parts the proxy needs.	//
//	$url:	absolute ("http://host/dir/file.ext?query") or host-less ("/dir/file.ext") URL.
//	Returns an array with the keys "scheme", "host" and "path"; a part that is absent
//	from the URL, or every part when the URL cannot be parsed, is an empty string.
function parseURL($url)
{
	//	protocol(1), auth user(2), auth password(3), hostname(4), path(5), filename(6), file extension(7) and query(8)	//
	//	The auth groups must not run across a '/', otherwise a port followed by an '@' later in
	//	the path ("http://host:8080/a@b/") is mistaken for "user:password@" and the wrong host is returned.
	//	The hostname stops at '?' so that "http://host?query" does not drag the query into the host.	//
	$pattern	=	"/^(?:(http[s]?):\/\/(?:([^:@\/]*):([^@\/]*)@)?([^\/\?]+))?((?:[\/])?(?:[^\.]*?)?(?:[\/])?)?(?:([^\/^\.]+)\.([^\?]+))?(?:\?(.+))?$/i";

	$matches	=	array();
	preg_match($pattern, (string) $url, $matches);

	//	preg_match() leaves $matches empty when nothing matched and drops trailing groups
	//	that did not take part in the match, so every index is checked before it is read.	//
	$URI_PARTS	=	array();
	$URI_PARTS["scheme"]	=	isset($matches[1]) ? $matches[1] : "";
	$URI_PARTS["host"]	=	isset($matches[4]) ? $matches[4] : "";
	$URI_PARTS["path"]	=	isset($matches[5]) ? $matches[5] : "";

	return $URI_PARTS;
}

//make the HTML source more difficult to match patterns to.//
//	Placeholder for HTML obfuscation: no scrambling is implemented, so the
//	markup is handed back exactly as it was received.	//
function scrambleHTML($HTML)
{
	$untouched	=	$HTML;

	return $untouched;
}

//	Placeholder for URL obfuscation: nothing is rewritten, the input is returned as-is.	//
function scrambleURL($HTML)
{
	$untouched	=	$HTML;

	return $untouched;
}
//replace the domain names with ips where applicable//
//NOTE:	If the hostname is unknown then the hostname is returned instead of the IP//
//NOTE: subdomains/vhosts don't like just the IP being used!	//
//	Replaces the host names of http:// links in $HTML with their IPv4 addresses.	//
//	$HTML:	the document source.
//	Returns the document with every occurrence of a resolved host name replaced by its IP.
//	NOTE:	gethostbyname() returns the name unchanged when it cannot be resolved, so unknown hosts stay as they are.
//	NOTE:	subdomains/vhosts don't like just the IP being used!
function domain2IP($HTML)
{
	//	match all of the domains that are listed on the page...
	//	TODO: handle all types of links: img, etc.  For now only http:// style links are handled.	//
	$pattern	=	"#http:\/\/(.*?)[\/|'|\"]#ims";	//get just the domain stuff....
	$matches	=	array();
	preg_match_all($pattern, $HTML, $matches);

	//	host name => IP, one entry per distinct host.  Starting from an empty array keeps a page
	//	without any http:// link from passing an undefined variable to count()/in_array().	//
	$ip_of	=	array();

	if(!empty($matches[1]))
	{
		foreach($matches[1] as $domain)
		{
			if(isset($ip_of[$domain]))
			{
				continue;
			}

			//	Only look up things that can be a host name.  Anything else (a port, stray text up to
			//	the next quote, an empty match) could not be resolved anyway and would have been
			//	replaced by itself; skipping it just saves the pointless lookup.	//
			if(!preg_match('/^[a-z0-9._\-]+$/i', $domain))
			{
				continue;
			}

			$ip_of[$domain]	=	gethostbyname($domain);
		}
	}

	if(empty($ip_of))
	{
		return $HTML;
	}

	//	strtr() tries the longest key first and never rescans text it has already replaced, so
	//	"www.example.com" is not mangled by an earlier replacement of "example.com".	//
	return strtr($HTML, $ip_of);
}

//fetch the images, cache them locally and then replace the URLs to the images on the page with one pointing at the cache//
//	Fetches files referenced by http:// URLs in $HTML, stores them in the local cache and
//	rewrites the URLs in the document to point at the cached copies.	//
//	$HTML:	the document source.
//	Returns the rewritten document.
//	Reads $ext_to_cache, $path_to_cache, $web_path_to_cache, $max_age_cache and $foolProof, which
//	are presumably defined by config.inc (not visible here -- confirm against that file).
//	A list of the collected URLs is appended to $this->debug.
function cacheFiles($HTML)
{
	include 'config.inc';

	//	list of file extensions to fetch for the user, compared case-insensitively and without
	//	surrounding blanks so that "jpg, gif" in the config also matches "PHOTO.JPG"	//
	$exts	=	array();
	foreach(explode(",", isset($ext_to_cache) ? $ext_to_cache : "") as $wanted_ext)
	{
		$exts[]	=	strtolower(trim($wanted_ext));
	}

	//	must start out as an array: it is searched with in_array() before anything is added	//
	$images_to_cache	=	array();

	//	first pass: every src=/background=/href= URL whose last three characters are a wanted extension	//
	$pattern	=	"#(?<=src=|background=|href=)['|\"|\s]http:\/\/(.*?)['|\"|\s](.*?)>#ims";
	$urls_matched	=	array();
	preg_match_all($pattern, $HTML, $urls_matched);

	if(!empty($urls_matched[1]))
	{
		foreach($urls_matched[1] as $image_to_cache)
		{
			$ext		=	strtolower(substr($image_to_cache, -3));
			$candidate	=	"http://".$image_to_cache;

			//	one entry per URL only	//
			if(in_array($ext, $exts, true) && !in_array($candidate, $images_to_cache, true))
			{
				$images_to_cache[]	=	$candidate;
			}
		}
	}

	//	second pass: img/script/iframe sources that the first pass skipped, such as ones without an extension or ones inside javascript	//
	//EXAMPLE:	document.write("<img src='http://ads.osdn.com/?ad_id=1715&alloc_id=1864&site_id=1&request_id=4691586&");	//
	$pattern	=	"#\<(img|script|iframe)(.*?)src=['|\"|\s]http:\/\/(.*?)['|\"|\s](.*?)>#ims";
	$urls_rematched	=	array();
	preg_match_all($pattern, $HTML, $urls_rematched);

	if(!empty($urls_rematched[3]))
	{
		foreach($urls_rematched[3] as $extless_image_to_cache)
		{
			$candidate	=	"http://".$extless_image_to_cache;

			//	NOTE(review): the original also required the last three characters to differ from ".",
			//	which is always true, so every source not yet listed was added.  That effective
			//	behaviour is kept; restricting this pass to extension-less URLs would be a policy change.	//
			if(!in_array($candidate, $images_to_cache, true))
			{
				$images_to_cache[]	=	$candidate;
			}
		}
	}

	if(!empty($images_to_cache))
	{
		//	record on the object (a plain local $debug would be thrown away on return)	//
		$this->debug	.=	print_r($images_to_cache, TRUE)."\n";
	}

	//fetch each file and store a copy of it here locally//
	//--the cache file is named after the MD5 sum of the URL//
	require_once "Fetcher.class.inc";

	$get	=	new Fetcher;		//new class for fetching//
	$get->init();

	foreach($images_to_cache as $image_to_cache_url)
	{
		$image_MD5	=	md5($image_to_cache_url);
		$cache_file	=	$path_to_cache.$image_MD5.".cache";
		$cache_url	=	$web_path_to_cache.$image_MD5.".cache";

		//	a cached copy is used only while it is younger than $max_age_cache seconds	//
		$is_fresh	=	@is_file($cache_file) && (time() - @filemtime($cache_file)) < $max_age_cache;

		if($is_fresh)
		{
			//replace the URL in the document with the URL of the corresponding cache file//
			$HTML	=	str_replace($image_to_cache_url, $cache_url, $HTML);
			continue;
		}

		if($get->fetch($image_to_cache_url))	//fetching was successful!....
		{
			$f_MD5	=	@fopen($cache_file, "wb");

			//	fopen() returns FALSE when the cache is not writable; writing to that would be a
			//	fatal TypeError on PHP 8, so the original URL is simply left in place then.	//
			if($f_MD5 !== false)
			{
				$isSuccessful	=	@fwrite($f_MD5, $get->results);
				@fclose($f_MD5);

				if($isSuccessful)
				{
					$HTML	=	str_replace($image_to_cache_url, $cache_url, $HTML);
				}
			}
		}
		elseif(empty($foolProof))
		{
			//get rid of the URL that SBP could not fetch.//
			$HTML	=	str_replace($image_to_cache_url, "", $HTML);
		}
	}

	return $HTML;
}
//	Measures the local cache directory and, when it is larger than allowed, deletes the oldest quarter of its files.	//
//	Takes no arguments and returns nothing; a report of what was found and removed is appended to $this->debug.
//	Reads $path_to_cache and $max_size_cache (in MB), presumably defined by config.inc -- confirm against that file.
function checkCache()
{
	include 'config.inc';

	$size	=	0;		//bytes//
	$age_of	=	array();	//file path => age in seconds//
	$now	=	time();

	if(is_dir($path_to_cache) && ($dh = opendir($path_to_cache)))
	{
		while(($filecnt = readdir($dh)) !== false)
		{
			if($filecnt == "." || $filecnt == "..")	continue;

			$entry	=	$path_to_cache.$filecnt;

			if(is_dir($entry))
			{
				//	NOTE(review): fsize() is not a PHP builtin and is not defined in this file; it is
				//	assumed to be a project helper that returns a directory's size.  Guarded so that a
				//	sub-directory in the cache does not abort the request when the helper is missing.	//
				if(function_exists('fsize'))
				{
					$size	+=	fsize($entry);
				}
			}
			else	//it's a file so add its size to the total and remember its age in case everything is oversized.//
			{
				$size		+=	filesize($entry);
				$age_of[$entry]	=	$now - filemtime($entry);
			}
		}

		closedir($dh);
	}

	$size	=	$size/1024/1024;	//MB//

	//give the user some useless information//
	$this->debug	.=	"\n\tLocal cache size: (".round($size,2)." / ".$max_size_cache." ) MB\n";

	if($size > $max_size_cache)
	{
		$degree	=	.25;

		//	readdir() returns entries in no particular order, so sort by age (largest first)
		//	to make sure it really is the oldest files that get removed	//
		arsort($age_of);

		//	whole number of files, rounded up so that a small cache still loses at least one file	//
		$length			=	(int) ceil(count($age_of) * $degree);
		$files_to_be_deleted	=	array_slice(array_keys($age_of), 0, $length);

		$this->debug	.=	"\nCache was too large...removing ".($degree * 100)."% of the cache.\n\n";
		foreach($files_to_be_deleted as $target)
		{
			if(unlink($target))
			{
				$this->debug .= "\t\tFile: ".$target." was removed from the cache.\n";
			}
		}
	}
}

//will set any headers that are needed for the browser to render the page properly.//
//if isURL is true then it treats specimen like a URL otherwise like a file!//
//	Sends a Content-Type header when the type of $specimen can be determined.	//
//	$specimen:	a URL when $isURL is true, otherwise the raw contents of the fetched file.
//	$isURL:		selects between looking at the URL's file extension and looking at the file's leading bytes.
//	Returns nothing.  When no type is recognised no header is sent, so whatever is already set stays in effect.
function setHeaders($specimen, $isURL)
{
	$ctype	=	"";	//make sure it isn't being forced.//

	if($isURL)
	{
		$type_of_ext	=	array(
			"pdf"	=>	"application/pdf",
//			"exe"	=>	"application/octet-stream",
//			"zip"	=>	"application/zip",
			"doc"	=>	"application/msword",
			"xls"	=>	"application/vnd.ms-excel",
			"ppt"	=>	"application/vnd.ms-powerpoint",
			"gif"	=>	"image/gif",
			"png"	=>	"image/png",
			"jpg"	=>	"image/jpeg",	//'image/jpg' is not a registered media type//
			"jpeg"	=>	"image/jpeg",
			"bmp"	=>	"image/bmp",
			"xml"	=>	"text/xml",
			"rdf"	=>	"text/xml",	//I couldn't get 'text/rdf' to display so I'm cheating  :(//
			"rss"	=>	"text/rss"
		);

		//	Only a suffix that follows a '.' is an extension: "http://host/pdf" must not be
		//	served as a PDF just because its last three letters spell one.	//
		$found	=	array();
		if(preg_match('#\.([a-z0-9]{3,4})$#i', $specimen, $found))
		{
			$ext	=	strtolower($found[1]);
			if(isset($type_of_ext[$ext]))
			{
				$ctype	=	$type_of_ext[$ext];
			}
		}
	}
	else
	{
		//	File contents: look for the signature at the start of the data instead of treating
		//	the first characters as if they were a file extension.	//
		if(strtolower(substr($specimen, 0, 3)) === "gif")		//"GIF87a" / "GIF89a"//
		{
			$ctype	=	"image/gif";
		}
		elseif(substr($specimen, 0, 4) === "\x89PNG")
		{
			$ctype	=	"image/png";
		}
		elseif(substr($specimen, 0, 4) === "%PDF")
		{
			$ctype	=	"application/pdf";
		}
		else
		{
			//	four characters starting at offset 6: the JPEG markers, or the "vers" of '<?xml version'	//
			switch(substr($specimen, 6, 4))
			{
				case "JFIF":	$ctype	=	"image/jpeg";	break;
				case "Exif":	$ctype	=	"image/jpeg";	break;
				case "vers":	$ctype	=	"text/xml";	break;
			}
		}
	}

	//	If a MIME type was discernable then set the header...otherwise just go with whatever is already there!	//
	if($ctype != "")
	{
		header("Content-Type: $ctype");
	}
}

//	take any local URLs and make them fully qualified URLs	//
//	Turns the local (relative and host-relative) URLs in $HTML into fully qualified ones and routes forms through the proxy.	//
//	$HTML:		the document source.
//	$url:		the URL the document was fetched from; its scheme, host and directory form the base.
//	$edited_tag:	not used by this method.
//	Returns the rewritten document.
function completeURLs($HTML, $url, $edited_tag)
{
	$URI_PARTS	=	$this->parseURL($url);

	//	cast to string so that a part the parser could not find does not reach trim() as NULL	//
	$scheme		=	isset($URI_PARTS["scheme"]) ? (string) $URI_PARTS["scheme"] : "";
	$path		=	trim(isset($URI_PARTS["path"]) ? (string) $URI_PARTS["path"] : "", "/");
	$host_url	=	trim(isset($URI_PARTS["host"]) ? (string) $URI_PARTS["host"] : "", "/");

	$host		=	$scheme."://".$host_url."/".$path."/";
	$host_no_path	=	$scheme."://".$host_url."/";

	//	make sure the host doesn't end in '//'	//
	$host		=	rtrim($host, '/')."/";

	//	Everything below is spliced into preg_replace() replacement strings, where '$' and '\'
	//	start a back-reference.  The base comes from a remote URL and may contain either
	//	character, so both are escaped in order to be inserted literally.	//
	$host_repl		=	addcslashes($host, '\\$');
	$host_no_path_repl	=	addcslashes($host_no_path, '\\$');

	//	PHP_SELF can carry request-supplied path info, so it is HTML-escaped before it is written into the page	//
	$self		=	isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : "";
	$self_repl	=	addcslashes(htmlspecialchars($self, ENT_QUOTES), '\\$');

	//	replace '//' with 'http://'	//
	$pattern	=	"#(?<=\"|'|=)\/\/#";	//	the '|=' is experimental as it's not necessary as far as I have seen	//
	$HTML		=	preg_replace($pattern, "http://", $HTML);

	//matches [src|href|background|action]="/  because in the following pattern I don't want the '/' to stay	//
	$pattern	=	"#(src|href|background|action)(=\"|='|=(?!'|\"))\/#i";
	$HTML		=	preg_replace($pattern, "\$1\$2".$host_no_path_repl, $HTML);

	//	whatever is still neither absolute nor a special scheme is relative to the document's directory	//
	$pattern	=	"#(href|src|background|action)(=\"|=(?!'|\")|=')(?!http|ftp|https|\"|'|javascript:|mailto:)#i";
	$HTML		=	preg_replace($pattern, "\$1\$2".$host_repl, $HTML);

	//	point every form at the proxy and carry the original target along in a hidden field	//
	//TODO: need to be able to clean off the crap after the action.//
	$pattern	=	"#action=(.*?)>#is";
	$replace	=	"action=".$self_repl."><input type=\"hidden\" name=\"original_url\" value=\"\$1\">";
	$HTML		=	preg_replace($pattern, $replace, $HTML);

	//matches '/[any assortment of chars or nums]/../'
	$pattern	=	"#\/(\w*?)\/\.\.\/(.*?)>#ims";
	$replace	=	"/\$2>";
	$HTML		=	preg_replace($pattern, $replace, $HTML);

	//matches '/./'
	$pattern	=	"#\/\.\/(.*?)>#ims";
	$replace	=	"/\$1>";
	$HTML		=	preg_replace($pattern, $replace, $HTML);

	//	handle CSS2 import's of CSS files	//
	//EXAMPLE:	<style type="text/css" media="screen">@import "/themes/blue/blue.css";</style>	//
//	$pattern	=	"#import .(.*?).;#ims";
//	$replace	=	"import '".$host."\$1';";
//	$HTML		=	preg_replace($pattern, $replace, $HTML);

	return $HTML;
}

//	redirect link targets through the proxy page	//
//	Redirects the link targets in $HTML through the proxy page.	//
//	$HTML:		the document source.
//	$edited_tag:	value of the edited="..." marker attribute that is put in front of every href this
//			method has handled, so that the same href is not rewritten a second time.
//	Returns the rewritten document.
function proxyURLs($HTML, $edited_tag)
{
	//	The marker is used both inside a pattern and inside replacement strings; quote it for each
	//	context so that characters such as '#', '.', '$' or '\' in it are taken literally.	//
	$tag_in_pattern	=	preg_quote($edited_tag, '#');
	$tag_in_replace	=	addcslashes($edited_tag, '\\$');

	//	PHP_SELF can carry request-supplied path info: HTML-escape it, then protect it as replacement text	//
	$self		=	isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : "";
	$self_in_replace	=	addcslashes(htmlspecialchars($self, ENT_QUOTES), '\\$');

	//	gotta remove the BASE tag for sites like yahoo.com	//
	//	OR make my proggy insert the FULL URL to itself	//
	$pattern	=	"#\<base(.*?)\>#ims";
	$replacement	=	"<!-- <base\$1> -->";	//comment it out for now//
	$HTML		=	preg_replace($pattern, $replacement, $HTML);

	//	mark <link> tags just before their 'href' so that the pattern below leaves them alone	//
	$pattern	=	"#\<link(.*?)(\shref=)#ims";
	$HTML		=	preg_replace($pattern, "<link\$1 edited=\"".$tag_in_replace."\"\$2", $HTML);

	//	every not yet marked href that has a </a> somewhere after it and does not point at
	//	mailto:, ftp or javascript: gets the proxy URL put in front of its target	//
	$pattern	=	"#(?<!edited=\"".$tag_in_pattern."\"\s)(href='|href=\"|href=(?!'|\"))(?=(.+)\</a\>)(?!mailto:|http://ftp|ftp|javascript:|'|\")#ims";
	$HTML		=	preg_replace($pattern, "edited=\"".$tag_in_replace."\" \$1".$self_in_replace.'?url=', $HTML);

	return $HTML;
}

//	embeds my javascript menu: its scripts go right after the opening HEAD tag and the menu bar right after the opening BODY tag.	//
//	Inserts the proxy's menu bar into a document.	//
//	The style sheet and scripts are placed right after the first <head> tag, the menu bar with
//	the URL form right after the first opening <body ...> tag.
//	$HTML:	the document source.
//	$url:	the URL currently being viewed; it is shown in the form's text field.
//	Returns the document with the menu inserted.
function embedMenu($HTML, $url)
{
	//	NOTE(review): $myDirPath was never defined inside this method, so every path below has
	//	always been relative to the working directory.  Made explicit to avoid undefined-variable warnings.	//
	$myDirPath	=	"";

	//	$PHP_SELF only exists with register_globals; use the superglobal like the rest of the class.
	//	Both values end up inside HTML attributes, so they are escaped.	//
	$self		=	htmlspecialchars(isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : "", ENT_QUOTES);
	$shown_url	=	htmlspecialchars((string) $url, ENT_QUOTES);

//HEADER--BEGIN//
	$style_sheet		=	"<link rel=\"stylesheet\" href=\"layersmenu-sbp.css\" type=\"text/css\" />\n";
	$browser_detection_code	=	"<script language=\"JavaScript\" type=\"text/javascript\">\n".
						file_get_contents($myDirPath."libjs/layersmenu-browser_detection.js")."\n".
						"</script>\n";
	$layersmenu_library	=	"<script language=\"JavaScript\" type=\"text/javascript\" src=\"libjs/layersmenu-library.js\"></script>\n";
	$layersmenu		=	"<script language=\"JavaScript\" type=\"text/javascript\" src=\"libjs/layersmenu.js\"></script>\n";

	//	*_once: a second call must not try to declare the menu classes again	//
	include_once ($myDirPath . "lib/PHPLIB.php");
	include_once ($myDirPath . "lib/layersmenu-common.inc.php");
	include_once ($myDirPath . "lib/layersmenu.inc.php");

	//$mid	=	new LayersMenu(140, 20, 20);
	$mid = new LayersMenu(6, 7, 2, 1);	// Gtk2-like

	$mid->setMenuStructureFile($myDirPath . "layersmenu-sbp.txt");
	$mid->parseStructureForMenu("hormenu1");
	$mid->newHorizontalMenu("hormenu1");

	$header	=	$mid->makeHeader();

	//	all previous stuff appended into one line	//
	$source	=	"\n<!-- BEGIN: menu insert -->\n".$style_sheet.$browser_detection_code.$layersmenu_library.$layersmenu.$header."\n<!-- END: menu insert -->\n\n";

	//	code immediately after the opening head tag	//
	//	The inserted JavaScript may contain '$' and '\', which preg_replace() would read as
	//	back-references inside a replacement string, so they are escaped first.	//
	$pattern	=	"#\<head\>#i";	//	ONLY MATCHES THIS EXACT TAG!	//
	$after_head	=	"<head>".addcslashes($source, '\\$');
	$HTML	=	preg_replace($pattern, $after_head, $HTML, 1);	//	limit to 1 match...cpinews.net was doing something weird.	//
//HEADER--END//

//BODY--BEGIN//
	$primary_table		="<table width=\"100%\" cellpadding=\"0\" cellspacing=\"0\" class=\"horbar\" style=\"border: 1px solid groove;\">\n";
	$secondary_table	=	"<tr valign=\"middle\">\n".
						"<td>\n".
							"<table width=\"100%\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\">\n";
	$td_menu		=				"<tr valign=\"center\">\n".
									"<td class=\"horbar2\" align=\"right\" style=\"border: 0px solid red\">\n".$mid->getMenu("hormenu1")."\n";
	$td_form		=					"</td>\n".
									"<td class=\"horbar2\" width=\"20\">&nbsp;</td>".
									"<td class=\"horbar2\">\n".
										"<form name=\"sbp_form\" method=\"POST\" action=\"".$self."\">\n".
											"<input type=\"text\" size=\"40\" name=\"form_url\" value=\"".$shown_url."\" class=\"horbar2\" style=\"width: 300px;\">\n".
											"<input type=\"submit\" name=\"Go\" value=\"Go\" style=\"width: 30px;\" ".
												"onClick=\"javascript:document.sbp_form.action='?url='+document.sbp_form.form_url.value;\">\n".
										"</form>\n".
									"</td>\n";
	$close_tables		=				"</tr>\n".
							"</table>\n".
						"</td>\n".
					"</tr>\n".
				"</table>\n".$mid->makeFooter()."\n";

	//	all previous stuff appended into one line	//
	$source	=	"\n<!-- BEGIN: menu insert -->\n".$primary_table.$secondary_table.$td_menu.$td_form.$close_tables."\n<!-- END: menu insert -->\n\n";

	//	code immediately after the opening body tag ("\$1" keeps the tag's own attributes)	//
	$pattern	=	"#\<body(.*?)\>#ims";
	$after_body	=	"<body\$1>".addcslashes($source, '\\$');
	$HTML	=	preg_replace($pattern, $after_body, $HTML, 1);
//BODY--END//

	return $HTML;
}

//	calculates the differences in microtime captures	//
//	Calculates the time elapsed between two microtime captures.	//
//	$start, $end:	either the "fraction seconds" string returned by microtime(), or a plain
//			number of seconds such as the float returned by microtime(true).
//	Returns the difference $end - $start in seconds.
function processTime($start, $end)
{
	$whole		=	array();	//full seconds of (start, end)//
	$fraction	=	array();	//fractional part of (start, end)//

	foreach(array($start, $end) as $capture)
	{
		if(is_string($capture) && strpos(trim($capture), " ") !== false)
		{
			//	microtime() format: the fraction comes first, then the seconds	//
			list($dec, $sec)	=	explode(" ", trim($capture));
			$whole[]		=	(float) $sec;
			$fraction[]		=	(float) $dec;
		}
		else
		{
			//	already one number; without this, explode() yields a single element and the seconds part is missing	//
			$whole[]	=	(float) $capture;
			$fraction[]	=	0.0;
		}
	}

	//	subtract the large second counts from each other first so the small fractions keep their precision	//
	return ($whole[1] - $whole[0]) + ($fraction[1] - $fraction[0]);
}

}	//end of SBP class//
?>
Return current item: Simple Browser Proxy