Location: PHPKode > projects > Simple Browser Proxy > SBP-0.3.2-development/SBP.class.inc
/*	Author:		Josh Marquis
*	Edited:		08/20/03
*	License:	Refer to the file 'LICENSE'
*/

/**
 * HTML rewriting helpers for the Simple Browser Proxy.
 *
 * Each public method takes a page's HTML, applies one transformation (title
 * prefix, URL completion, link proxying, resource caching, menu injection...)
 * and returns the rewritten HTML.
 *
 * NOTE(review): the listing this class was recovered from had lost every brace
 * and a number of control-flow lines. Places where the missing logic had to be
 * reconstructed are marked with NOTE(review) and should be confirmed against
 * the upstream SBP 0.3.2 sources.
 */
class SBP
{
	var $debug	=	"";	//If it's not empty, its contents are appended as a comment to the end of the HTML//

	/**
	 * Loads the configuration file.
	 *
	 * NOTE(review): config.inc is included inside this method, so any plain
	 * variables it assigns are local to init() and are gone once it returns.
	 * cacheFiles() and checkCache() include the file again themselves.
	 */
	function init()
	{
		include_once "config.inc";
	}

	/**
	 * Escapes a string so that preg_replace() inserts it literally.
	 *
	 * In a replacement string "\" and "$" introduce back-references; prefixing
	 * both with a backslash turns them into plain characters.
	 *
	 * @param string $text Text that must appear verbatim in the output.
	 * @return string Text that is safe to concatenate into a replacement.
	 */
	function _literalReplacement($text)
	{
		return addcslashes((string) $text, '\\$');
	}

	/**
	 * Returns $_SERVER['PHP_SELF'] escaped for use inside generated HTML.
	 *
	 * PHP_SELF is derived from the request URL, so it is escaped before it is
	 * written into attributes of the rewritten page.
	 *
	 * @return string Escaped script path, or "" when PHP_SELF is not set.
	 */
	function _selfURL()
	{
		$self	=	isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : "";
		return htmlspecialchars($self, ENT_QUOTES);
	}

	/**
	 * Splits a timestamp into its fractional and whole-second parts.
	 *
	 * @param string|float $stamp Either the "fraction seconds" string returned
	 *                            by microtime(), or a plain number of seconds.
	 * @return array array(fraction, seconds), both floats.
	 */
	function _microtimeParts($stamp)
	{
		$parts	=	explode(" ", trim((string) $stamp));
		if (count($parts) == 2) {
			return array((float) $parts[0], (float) $parts[1]);
		}
		return array(0.0, (float) $parts[0]);
	}

	/**
	 * Prepends text to the contents of the first <title> tag.
	 *
	 * @param string $source HTML (or XML feed) source.
	 * @param string $text   Text to insert right after the opening title tag.
	 * @return string The source with the title prefixed.
	 */
	function prependTitle($source, $text)
	{
		//	\$1 writes the tag name back exactly as it was matched, so its case is preserved;	//
		//	XML feeds can be broken if the case of the opening and closing tags do NOT match.	//
		$HTML	=	preg_replace("#<(title)>#i", "<\$1>".$this->_literalReplacement($text), $source, 1);

		return $HTML;
	}

	/**
	 * Splits a URL into scheme, host and directory path.
	 *
	 * @param string $url URL to parse.
	 * @return array Keys "scheme", "host" and "path"; a part that is absent
	 *               from the URL is returned as an empty string.
	 */
	function parseURL($url)
	{
		//	protocol(1), auth user(2), auth password(3), hostname(4), path(5), filename(6), file extension(7) and query(8)	//
		$pattern	=	"/^(?:(http[s]?):\/\/(?:(.*):(.*)@)?([^\/]+))?((?:[\/])?(?:[^\.]*?)?(?:[\/])?)?(?:([^\/^\.]+)\.([^\?]+))?(?:\?(.+))?$/i";

		$URI_PARTS	=	array("scheme" => "", "host" => "", "path" => "");
		if (preg_match($pattern, $url, $matches)) {
			//	trailing groups that did not take part in the match are missing from $matches	//
			$URI_PARTS["scheme"]	=	isset($matches[1]) ? $matches[1] : "";
			$URI_PARTS["host"]	=	isset($matches[4]) ? $matches[4] : "";
			$URI_PARTS["path"]	=	isset($matches[5]) ? $matches[5] : "";
		}
		return $URI_PARTS;
	}

	/**
	 * Placeholder for making the HTML source more difficult to match patterns to.
	 * Currently returns its input unchanged.
	 *
	 * @param string $HTML Page source.
	 * @return string The same page source.
	 */
	function scrambleHTML($HTML)
	{
		return $HTML;
	}

	/**
	 * Placeholder for URL scrambling. Currently returns its input unchanged.
	 *
	 * @param string $HTML Page source.
	 * @return string The same page source.
	 */
	function scrambleURL($HTML)
	{
		return $HTML;
	}

	/**
	 * Replaces the host names found in http:// links with their IP addresses.
	 *
	 * NOTE:	If the hostname is unknown then the hostname is kept instead of the IP.
	 * NOTE:	subdomains/vhosts don't like just the IP being used!
	 *
	 * @param string $HTML Page source.
	 * @return string Page source with resolvable host names replaced.
	 */
	function domain2IP($HTML)
	{
		//	TODO: handle all types of links: img, etc.	For now, this will only handle http:// style of links	//
		$pattern	=	"#http:\/\/(.*?)[\/|'|\"]#ims";	//get just the domain stuff....
		if (!preg_match_all($pattern, $HTML, $matches)) {
			return $HTML;	//	no http:// links at all	//
		}

		//	one lookup per distinct domain; gethostbyname() hands the name back unchanged when it cannot resolve it	//
		$ip_for	=	array();
		foreach (array_unique($matches[1]) as $domain) {
			if ($domain === "") {
				continue;
			}
			$ip_for[$domain]	=	gethostbyname($domain);
		}
		if (count($ip_for) == 0) {
			return $HTML;
		}

		//	strtr() tries the longest key first and never re-scans text it has already replaced,	//
		//	so 'www.example.com' is not mangled by the entry for 'example.com'.	//
		return strtr($HTML, $ip_for);
	}

	/**
	 * Fetches remote resources referenced by the page, stores them in the local
	 * cache and points the page at the cached copies.
	 *
	 * config.inc is expected to supply $ext_to_cache (comma separated list of
	 * extensions), $path_to_cache, $web_path_to_cache and $max_age_cache; the
	 * Fetcher class is expected to come from Fetcher.class.inc.
	 *
	 * @param string $HTML Page source.
	 * @return string Page source with cached resources re-pointed at the cache
	 *                and un-fetchable resource URLs removed.
	 */
	function cacheFiles($HTML)
	{
		include 'config.inc';

		$images_to_cache	=	array();
		//array of file extensions to fetch for the user//
		$exts	=	array_map('trim', explode(",", strtolower($ext_to_cache)));

		//parse out all the URLs that sit in a src/background/href attribute//
		$pattern	=	"#(?<=src=|background=|href=)['|\"|\s]http:\/\/(.*?)['|\"|\s](.*?)>#ims";
		preg_match_all($pattern, $HTML, $urls_matched);
		$found	=	isset($urls_matched[1]) ? $urls_matched[1] : array();
		foreach ($found as $image_to_cache) {
			$candidate	=	"http://".$image_to_cache;
			$ext		=	strtolower(substr($image_to_cache, -3));
			//	keep one entry per URL, and only URLs whose extension is on the list	//
			if (in_array($ext, $exts) && !in_array($candidate, $images_to_cache)) {
				$images_to_cache[]	=	$candidate;
			}
		}

		//go through the document again to try and get any urls that were missed, such as ones without an extension or ones inside javascript//
		//EXAMPLE:	document.write("<img src='http://ads.osdn.com/?ad_id=1715&alloc_id=1864&site_id=1&request_id=4691586&");	//
		$pattern	=	"#\<(img|script|iframe)(.*?)src=['|\"|\s]http:\/\/(.*?)['|\"|\s](.*?)>#ims";
		preg_match_all($pattern, $HTML, $urls_rematched);
		$found	=	isset($urls_rematched[3]) ? $urls_rematched[3] : array();
		foreach ($found as $extless_image_to_cache) {
			$candidate	=	"http://".$extless_image_to_cache;
			//	a '.' four characters from the end means the URL carries a three letter extension,	//
			//	so the first pass has already decided whether it is to be cached	//
			$has_extension	=	(substr($extless_image_to_cache, -4, 1) == ".");
			if (!$has_extension && !in_array($candidate, $images_to_cache)) {
				$images_to_cache[]	=	$candidate;
			}
		}
		$this->debug	.=	print_r($images_to_cache, TRUE)."\n";

		//fetch each file and store a copy of it locally, named after the MD5 sum of its URL//
		require_once "Fetcher.class.inc";
		$get		=	new Fetcher;
		$now		=	time();
		$replacements	=	array();
		foreach ($images_to_cache as $image_to_cache_url) {
			$cache_name	=	md5($image_to_cache_url).".cache";
			$cache_file	=	$path_to_cache.$cache_name;

			//a cache file that exists and is young enough is used as it is//
			if (is_file($cache_file) && ($now - filemtime($cache_file)) < $max_age_cache) {
				$replacements[$image_to_cache_url]	=	$web_path_to_cache.$cache_name;
				continue;
			}

			//	NOTE(review): the branch structure below was lost in the listing; it is rebuilt from	//
			//	the surviving comment ("get rid of the URL that SBP could not fetch") — confirm.	//
			if (!$get->fetch($image_to_cache_url)) {
				//get rid of the URL that SBP could not fetch.//
				$replacements[$image_to_cache_url]	=	"";
				continue;
			}

			//	'@' keeps a write failure from printing a warning into the proxied page	//
			$f_MD5	=	@fopen($cache_file, "wb");
			if ($f_MD5 === false) {
				continue;	//	cannot cache it, so the original URL stays in the page	//
			}
			$isSuccessful	=	fwrite($f_MD5, $get->results);
			fclose($f_MD5);
			if ($isSuccessful) {
				$replacements[$image_to_cache_url]	=	$web_path_to_cache.$cache_name;
			}
		}

		if (count($replacements) == 0) {
			return $HTML;
		}
		//	one pass, longest URL first, so a URL that is a prefix of another one cannot corrupt it	//
		return strtr($HTML, $replacements);
	}

	/**
	 * Reports the size of the local cache in $this->debug and, when it exceeds
	 * the configured maximum, deletes the oldest quarter of the cached files.
	 *
	 * config.inc is expected to supply $path_to_cache and $max_size_cache (MB).
	 */
	function checkCache()
	{
		include 'config.inc';

		$size	=	0;
		$ages	=	array();	//	file path => age in seconds	//
		$now	=	time();

		if ($dh	=	opendir($path_to_cache)) {
			while (($filecnt = readdir($dh)) !== false) {
				if ($filecnt == "." || $filecnt == "..") {
					continue;
				}
				$entry	=	$path_to_cache.$filecnt;

				//	NOTE(review): the condition guarding the fsize() call was lost in the listing;	//
				//	is_dir() is an assumption. fsize() is not a PHP built-in and is not defined in	//
				//	this file, so it is only called when something else has declared it.	//
				if (is_dir($entry)) {
					if (function_exists('fsize')) {
						$size	+=	fsize($entry);
					}
					continue;
				}

				//it's a file so add its size to the total and remember its age in case the cache is oversized.//
				$size		+=	filesize($entry);
				$ages[$entry]	=	$now - filemtime($entry);
			}
			closedir($dh);
		}
		$size	=	$size/1024/1024;	//MB//

		//give the user some useless information//
		$this->debug	.=	"\n\tLocal cache size: (".round($size,2)." / ".$max_size_cache." ) MB\n";

		if ($size > $max_size_cache) {
			//slim the cache by 25% starting with the oldest file first!//
			$degree	=	.25;
			arsort($ages);	//	largest age (= oldest file) first	//
			$length	=	(int) ceil(count($ages) * $degree);
			$files_to_be_deleted	=	array_slice(array_keys($ages), 0, $length);

			$this->debug	.=	"\nCache was too large...removing ".($degree * 100)."% of the cache.\n\n";
			foreach ($files_to_be_deleted as $target) {
				$target	=	(string) $target;
				if (@unlink($target)) {
					$this->debug	.=	"\t\tFile: ".$target." was removed from the cache.\n";
				}
			}
		}
	}

	/**
	 * Sends a Content-Type header when the type of the specimen can be worked out.
	 *
	 * @param string $specimen A URL when $isURL is true, otherwise the contents
	 *                         of the fetched file.
	 * @param bool   $isURL    Whether $specimen is a URL.
	 *
	 * NOTE(review): the if/switch/default lines of this method were lost in the
	 * listing; the structure is rebuilt from the surviving case labels and comments.
	 */
	function setHeaders($specimen, $isURL)
	{
		$ctype	=	"";	//make sure it isn't being forced.//

		$types	=	array(
			"pdf"	=>	"application/pdf",
			"doc"	=>	"application/msword",
			"xls"	=>	"application/vnd.ms-excel",
			"ppt"	=>	"application/vnd.ms-powerpoint",
			"gif"	=>	"image/gif",
			"png"	=>	"image/png",
			"jpg"	=>	"image/jpeg",
			"bmp"	=>	"image/bmp",
			"xml"	=>	"text/xml",
			"rdf"	=>	"text/xml",	//the author could not get 'text/rdf' to display, so this is a cheat//
			"rss"	=>	"text/rss",
		);

		if ($isURL) {
			$ext	=	substr($specimen, -3);
		} else {	//it's a file so grab the first 3 chars.//
			$ext	=	substr($specimen, 0, 3);
		}
		$ext	=	strtolower((string) $ext);

		if ($isURL && !isset($types[$ext])) {
			//	nothing matched: try again on the part of the URL in front of any query string or fragment	//
			$without_query	=	substr($specimen, 0, strcspn($specimen, "?#"));
			$ext		=	strtolower((string) substr($without_query, -3));
		}

		if (isset($types[$ext])) {
			$ctype	=	$types[$ext];
		} elseif (!$isURL) {
			//nothing was determined so try reading some more chars from the file.//
			$magic		=	substr($specimen, 0, 4);
			$further	=	substr($specimen, 6, 4);
			if ($further === "JFIF" || $further === "Exif") {
				$ctype	=	"image/jpeg";
			} elseif ($further === "vers") {	//	as in '<?xml version='	//
				$ctype	=	"text/xml";
			} elseif ($magic === "\x89PNG") {
				$ctype	=	"image/png";
			} elseif ($magic === "%PDF") {
				$ctype	=	"application/pdf";
			}
		}

		//	If a MIME type was discernable then set the header...otherwise just go with whatever is already there!	//
		if ($ctype != "") {
			header("Content-Type: $ctype");
		}
	}

	/**
	 * Turns the local URLs of a page into fully qualified URLs.
	 *
	 * @param string $HTML       Page source.
	 * @param string $url        URL the page was fetched from.
	 * @param string $edited_tag Not used by this method.
	 * @return string Page source with completed URLs and with form actions
	 *                redirected to this script.
	 */
	function completeURLs($HTML, $url, $edited_tag)
	{
		$URI_PARTS	=	$this->parseURL($url);
		$path		=	trim($URI_PARTS["path"], "/");
		$host_url	=	trim($URI_PARTS["host"], "/");

		$host_no_path	=	$URI_PARTS["scheme"]."://".$host_url."/";
		//	rtrim() plus a single '/' so that an empty path does not leave the base ending in '//'	//
		$host		=	rtrim($host_no_path.$path, "/")."/";

		//	both bases come from the URL, so they are escaped before being used as replacement text	//
		$root_base	=	$this->_literalReplacement($host_no_path);
		$dir_base	=	$this->_literalReplacement($host);

		//	replace '//' with 'http://'	//
		$pattern	=	"#(?<=\"|'|=)\/\/#";	//	the '|=' is experimental as it's not necessary as far as the author has seen	//
		$HTML		=	preg_replace($pattern, "http://", $HTML);

		//matches [src|href|background|action]="/  ; the leading '/' is consumed because the base already ends in one	//
		$pattern	=	"#(src|href|background|action)(=\"|='|=(?!'|\"))\/#i";
		$HTML		=	preg_replace($pattern, "\${1}\${2}".$root_base, $HTML);

		//everything that is still relative gets the directory of the page put in front of it//
		$pattern	=	"#(href|src|background|action)(=\"|=(?!'|\")|=')(?!http|ftp|https|\"|'|javascript:|mailto:)#i";
		$HTML		=	preg_replace($pattern, "\${1}\${2}".$dir_base, $HTML);

		//point every form at this script and carry the original target along in a hidden field//
		//TODO: need to be able to clean off the crap after the action.//
		$pattern	=	"#action=(.*?)>#is";
		$replace	=	"action=".$this->_literalReplacement($this->_selfURL())."><input type=\"hidden\" name=\"original_url\" value=\"\$1\">";
		$HTML		=	preg_replace($pattern, $replace, $HTML);

		//matches '/[any assortment of chars or nums]/../'
		$pattern	=	"#\/(\w*?)\/\.\.\/(.*?)>#ims";
		$replace	=	"/\$2>";
		$HTML		=	preg_replace($pattern, $replace, $HTML);

		//matches '/./'
		$pattern	=	"#\/\.\/(.*?)>#ims";
		$replace	=	"/\$1>";
		$HTML		=	preg_replace($pattern, $replace, $HTML);

		//	TODO: handle CSS2 imports of CSS files	//
		//EXAMPLE:	<style type="text/css" media="screen">@import "/themes/blue/blue.css";</style>	//
		return $HTML;
	}

	/**
	 * Redirects link targets through the proxy page.
	 *
	 * @param string $HTML       Page source.
	 * @param string $edited_tag Marker written as edited="..." in front of every
	 *                           href that has been handled.
	 * @return string Page source with links pointing at this script.
	 */
	function proxyURLs($HTML, $edited_tag)
	{
		$tag_in_pattern	=	preg_quote($edited_tag, "#");
		$tag_in_output	=	$this->_literalReplacement($edited_tag);

		//	gotta remove the BASE tag for sites like yahoo.com	//
		//	OR make the proxy insert the FULL URL to itself	//
		$pattern	=	"#\<base(.*?)\>#ims";
		$replacement	=	"<!-- <base\$1> -->";	//comment it out for now//
		$HTML		=	preg_replace($pattern, $replacement, $HTML);

		//	edit <link tags so that 'edited="$edited_tag" ' is just before 'href'; the look-behind below then skips them	//
		$pattern	=	"#\<link(.*?)(\shref=)#ims";
		$HTML		=	preg_replace($pattern, "<link\$1 edited=\"".$tag_in_output."\"\$2", $HTML);

		//matches everything with a </a> after it on the same line....fails to match when that is on another line.//
		$pattern	=	"#(?<!edited=\"".$tag_in_pattern."\"\s)(href='|href=\"|href=(?!'|\"))(?=(.+)\</a\>)(?!mailto:|http://ftp|ftp|javascript:|'|\")#ims";
		$replacement	=	"edited=\"".$tag_in_output."\" \$1".$this->_literalReplacement($this->_selfURL())."?url=";
		$HTML		=	preg_replace($pattern, $replacement, $HTML);

		return $HTML;
	}

	/**
	 * Embeds the javascript menu and the URL form into the page.
	 *
	 * The menu's style sheet and scripts are inserted right after the opening
	 * <head> tag, the menu bar with the URL form right after the opening <body> tag.
	 *
	 * NOTE(review): several string concatenations of this method were truncated
	 * in the listing (browser detection script, end of the form, closing table
	 * tags). They are completed here with the minimum markup needed to close what
	 * the surviving lines open — confirm against the upstream sources.
	 *
	 * @param string $HTML Page source.
	 * @param string $url  URL shown in the form's text field.
	 * @return string Page source with the menu embedded.
	 */
	function embedMenu($HTML, $url)
	{
		//	$myDirPath was never assigned in the original, so all paths below were relative; made explicit here	//
		$myDirPath	=	"";
		$self		=	$this->_selfURL();

		$style_sheet		=	"<link rel=\"stylesheet\" href=\"layersmenu-sbp.css\" type=\"text/css\" /></link>\n";

		//	NOTE(review): presumably the contents of the menu library's browser detection script	//
		//	were inlined here; the file name is an assumption — TODO confirm.	//
		$detection_file		=	$myDirPath."libjs/layersmenu-browser_detection.js";
		$detection_js		=	is_readable($detection_file) ? file_get_contents($detection_file) : "";
		$browser_detection_code	=	"<script language=\"JavaScript\" type=\"text/javascript\">\n".
						"<!--\n".
						$detection_js."\n".
						"// -->\n".
						"</script>\n";
		$layersmenu_library	=	"<script language=\"JavaScript\" type=\"text/javascript\" src=\"libjs/layersmenu-library.js\"></script>\n";
		$layersmenu		=	"<script language=\"JavaScript\" type=\"text/javascript\" src=\"libjs/layersmenu.js\"></script>\n";

		//	include_once: a second call must not load the library files again	//
		include_once ($myDirPath . "lib/PHPLIB.php");
		include_once ($myDirPath . "lib/layersmenu-common.inc.php");
		include_once ($myDirPath . "lib/layersmenu.inc.php");

		//$mid	=	new LayersMenu(140, 20, 20);
		$mid = new LayersMenu(6, 7, 2, 1);	// Gtk2-like

		$mid->setMenuStructureFile($myDirPath . "layersmenu-sbp.txt");
		//	NOTE(review): no call that parses the structure file or creates the "hormenu1" menu is	//
		//	visible before makeHeader()/getMenu() — possibly lost in the listing; TODO confirm.	//

		$header	=	$mid->makeHeader();

		//	code immediately after the opening head tag	//
		$source		=	"\n<!-- BEGIN: menu insert -->\n".$style_sheet.$browser_detection_code.$layersmenu_library.$layersmenu.$header."\n<!-- END: menu insert -->\n\n";
		$pattern	=	"#\<head\>#i";	//	ONLY MATCHES THIS EXACT TAG!	//
		//	the inserted script text is escaped so that '$' and '\' in it are not read as back-references	//
		$after_head	=	"<head>".$this->_literalReplacement($source);
		$HTML		=	preg_replace($pattern, $after_head, $HTML, 1);	//	limit to 1 match; some pages contain the tag more than once	//

		$primary_table		=	"<table width=\"100%\" cellpadding=\"0\" cellspacing=\"0\" class=\"horbar\" style=\"border: 1px solid groove;\">\n";
		$secondary_table	=	"<tr valign=\"middle\">\n".
						"<td>\n".
						"<table width=\"100%\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\">\n";
		$td_menu		=	"<tr valign=\"center\">\n".
						"<td class=\"horbar2\" align=\"right\" style=\"border: 0px solid red\">\n".$mid->getMenu("hormenu1")."\n";
		$td_form		=	"</td>\n".
						"<td class=\"horbar2\" width=\"20\">&nbsp;</td>".
						"<td class=\"horbar2\">\n".
						"<form name=\"sbp_form\" method=\"POST\" action=\"".$self."\">\n".
						"<input type=\"text\" size=\"40\" name=\"form_url\" value=\"".htmlspecialchars($url, ENT_QUOTES)."\" class=\"horbar2\" style=\"width: 300px;\">\n".
						"<input type=\"submit\" name=\"Go\" value=\"Go\" style=\"width: 30px;\" >\n".
						"</form>\n".
						"</td>\n";
		$close_tables		=	"</tr>\n".
						"</table>\n".
						"</td>\n".
						"</tr>\n".
						"</table>\n";

		//	code immediately after the opening body tag	//
		$source		=	"\n<!-- BEGIN: menu insert -->\n".$primary_table.$secondary_table.$td_menu.$td_form.$close_tables."\n<!-- END: menu insert -->\n\n";
		$pattern	=	"#\<body(.*?)\>#ims";
		$after_body	=	"<body\$1>".$this->_literalReplacement($source);
		$HTML		=	preg_replace($pattern, $after_body, $HTML, 1);

		return $HTML;
	}

	/**
	 * Calculates the time elapsed between two microtime captures.
	 *
	 * @param string|float $start Earlier capture: a microtime() string
	 *                            ("fraction seconds") or a float timestamp.
	 * @param string|float $end   Later capture, in either of the same forms.
	 * @return float Elapsed seconds.
	 */
	function processTime($start, $end)
	{
		list($a_dec, $a_sec)	=	$this->_microtimeParts($start);
		list($b_dec, $b_sec)	=	$this->_microtimeParts($end);

		//	the whole seconds are subtracted first so the small fractional difference keeps its precision	//
		return ($b_sec - $a_sec) + ($b_dec - $a_dec);
	}

}	//end of SBP class//
Return current item: Simple Browser Proxy