<?
/* Author: Josh Marquis
* Edited: 08/20/03
*
* License: Refer to the file 'LICENSE'
*/
/**
 * SBP: helper routines that rewrite a fetched HTML page so it can be served
 * through the proxy script (title tagging, URL completion, link proxying,
 * local caching of referenced files, menu embedding).
 *
 * Written to stay compatible with the PHP 4-era syntax used by the rest of the
 * file (no visibility keywords, no closures) while also running on PHP 8.
 */
class SBP
{
    // Debug text collected while processing. Per the original note, a non-empty value
    // is meant to be appended to the end of the HTML as a comment by the caller.
    var $debug = "";

    /**
     * Load the configuration file once.
     *
     * The include runs in this method's scope, so plain variables defined by
     * config.inc are local to this call and are not kept on the object.
     */
    function init()
    {
        include_once "config.inc";
    }

    /**
     * Escape a string so preg_replace() inserts it literally.
     *
     * In a preg_replace() replacement "\" and "$" introduce backreferences;
     * prefixing them with a backslash makes them literal.
     *
     * @param string $text Text destined for a preg_replace() replacement.
     * @return string The text with "\" and "$" escaped.
     */
    function _escapeReplacement($text)
    {
        return strtr((string) $text, array("\\" => "\\\\", "\$" => "\\\$"));
    }

    /**
     * The current script path, escaped for use inside an HTML attribute.
     *
     * PHP_SELF can carry request-supplied path info, so it is HTML-escaped
     * before being written into the page.
     *
     * @return string
     */
    function _selfURL()
    {
        $self = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : "";
        return htmlspecialchars($self, ENT_QUOTES);
    }

    /**
     * Prepend text to the contents of the first <title> element.
     *
     * The tag is matched case-insensitively and written back with its original
     * case, because XML feeds break when opening/closing tag case differs.
     *
     * @param string $source HTML/XML source.
     * @param string $text   Text to insert right after the opening title tag.
     * @return string The modified source.
     */
    function prependTitle($source, $text)
    {
        // $text is escaped so "$1" or "\1" inside it is not expanded as a backreference.
        $replacement = "<\${1}>".$this->_escapeReplacement($text);
        return preg_replace("#<(title)>#i", $replacement, $source, 1);
    }

    /**
     * Split a URL into scheme, host and path.
     *
     * The pattern captures: protocol(1), auth user(2), auth password(3),
     * hostname(4), path(5), filename(6), file extension(7) and query(8);
     * only 1, 4 and 5 are returned.
     *
     * @param string $url URL to parse.
     * @return array Keys "scheme", "host", "path"; a part that was not found is "".
     */
    function parseURL($url)
    {
        $pattern = "/^(?:(http[s]?):\/\/(?:(.*):(.*)@)?([^\/]+))?((?:[\/])?(?:[^\.]*?)?(?:[\/])?)?(?:([^\/^\.]+)\.([^\?]+))?(?:\?(.+))?$/i";
        $matches = array();
        preg_match($pattern, $url, $matches);

        // Trailing groups that did not take part in the match are absent from $matches.
        $URI_PARTS = array();
        $URI_PARTS["scheme"] = isset($matches[1]) ? $matches[1] : "";
        $URI_PARTS["host"]   = isset($matches[4]) ? $matches[4] : "";
        $URI_PARTS["path"]   = isset($matches[5]) ? $matches[5] : "";
        return $URI_PARTS;
    }

    /**
     * Placeholder for making the HTML source harder to pattern-match.
     * Currently returns its input unchanged.
     *
     * @param string $HTML
     * @return string
     */
    function scrambleHTML($HTML)
    {
        return $HTML;
    }

    /**
     * Placeholder for URL scrambling. Currently returns its input unchanged.
     *
     * @param string $HTML
     * @return string
     */
    function scrambleURL($HTML)
    {
        return $HTML;
    }

    /**
     * Replace host names found in http:// links with their IP addresses.
     *
     * NOTE: gethostbyname() returns the host name itself when it cannot be
     * resolved, so unknown hosts are left as they are.
     * NOTE: subdomains/vhosts often do not work when addressed by bare IP.
     * TODO: handle all types of links: img, etc. Only http:// links are handled.
     *
     * @param string $HTML Page source.
     * @return string Page source with resolvable host names replaced.
     */
    function domain2IP($HTML)
    {
        $pattern = "#http:\/\/(.*?)[\/|'|\"]#ims"; //get just the domain stuff....
        $matches = array();
        if (!preg_match_all($pattern, $HTML, $matches))
        {
            return $HTML; // no http:// links (or the pattern failed): nothing to do
        }

        // One lookup per distinct host. Empty captures are skipped.
        $host_to_ip = array();
        foreach ($matches[1] as $domain)
        {
            if ($domain !== "" && !isset($host_to_ip[$domain]))
            {
                $host_to_ip[$domain] = gethostbyname($domain);
            }
        }
        if (empty($host_to_ip))
        {
            return $HTML;
        }

        // strtr() tries the longest key first and never rescans replaced text, so
        // "www.example.com" is not damaged by an earlier replacement of "example.com".
        return strtr($HTML, $host_to_ip);
    }

    /**
     * Fetch files referenced by the page, store them in the local cache and
     * point the page at the cached copies.
     *
     * Uses $ext_to_cache, $path_to_cache, $web_path_to_cache, $max_age_cache and
     * $foolProof, which are presumably defined by config.inc, and the Fetcher
     * class from Fetcher.class.inc (its fetch()/results members are used as in
     * the original code).
     *
     * @param string $HTML Page source.
     * @return string Page source with cached URLs substituted.
     */
    function cacheFiles($HTML)
    {
        include 'config.inc';

        $images_to_cache = array();

        // Pass 1: src/background/href URLs whose last three characters are a configured extension.
        $pattern = "#(?<=src=|background=|href=)['|\"|\s]http:\/\/(.*?)['|\"|\s](.*?)>#ims";
        $exts = explode(",", $ext_to_cache);
        $urls_matched = array();
        if (preg_match_all($pattern, $HTML, $urls_matched))
        {
            foreach ($urls_matched[1] as $image_to_cache)
            {
                $candidate = "http://".$image_to_cache;
                $ext = substr($image_to_cache, -3);
                if (in_array($ext, $exts) && !in_array($candidate, $images_to_cache))
                {
                    $images_to_cache[] = $candidate;
                }
            }
        }

        // Pass 2: img/script/iframe sources that pass 1 would miss, such as URLs
        // without an extension or ones inside javascript.
        //EXAMPLE: document.write("<img src='http://ads.osdn.com/?ad_id=1715&alloc_id=1864&site_id=1&request_id=4691586&"); //
        $pattern = "#\<(img|script|iframe)(.*?)src=['|\"|\s]http:\/\/(.*?)['|\"|\s](.*?)>#ims";
        $urls_rematched = array();
        if (preg_match_all($pattern, $HTML, $urls_rematched))
        {
            foreach ($urls_rematched[3] as $extless_image_to_cache)
            {
                $candidate = "http://".$extless_image_to_cache;
                $ext = substr($extless_image_to_cache, -3);
                // NOTE(review): $ext is normally three characters, so this comparison with
                // "." is practically always true and every URL from this pass is added.
                // Kept as-is to preserve which URLs get cached - confirm the intent.
                if ($ext != "." && !in_array($candidate, $images_to_cache))
                {
                    $images_to_cache[] = $candidate;
                }
            }
        }

        if (!empty($images_to_cache))
        {
            // Recorded on the object (the original wrote to a throw-away local).
            $this->debug .= print_r($images_to_cache, TRUE)."\n";
        }

        // Cache files are named after the MD5 of their URL.
        require_once "Fetcher.class.inc";
        $get = new Fetcher; //new class for fetching//
        $get->init();

        foreach ($images_to_cache as $image_to_cache_url)
        {
            $cache_name = md5($image_to_cache_url).".cache";
            $cache_file = $path_to_cache.$cache_name;
            $cache_link = $web_path_to_cache.$cache_name;

            // A cached copy that is young enough is used without fetching again.
            if (@is_file($cache_file) && (time() - @filemtime($cache_file)) < $max_age_cache)
            {
                $HTML = str_replace($image_to_cache_url, $cache_link, $HTML);
                continue;
            }

            if (!$get->fetch($image_to_cache_url))
            {
                // Get rid of the URL that could not be fetched, unless $foolProof is set.
                if (!$foolProof)
                {
                    $HTML = str_replace($image_to_cache_url, "", $HTML);
                }
                continue;
            }

            $handle = @fopen($cache_file, "wb");
            if ($handle === false)
            {
                continue; // cache not writable: leave the original URL in place
            }
            $bytes_written = @fwrite($handle, $get->results);
            @fclose($handle);

            if ($bytes_written)
            {
                $HTML = str_replace($image_to_cache_url, $cache_link, $HTML);
            }
            else
            {
                // Do not leave an empty/partial file that would later be served as fresh.
                @unlink($cache_file);
            }
        }
        return $HTML;
    }

    /**
     * Total size in bytes of all files below a directory (recursive).
     *
     * @param string $dir Directory path.
     * @return int Size in bytes; 0 when the directory cannot be opened.
     */
    function _dirSize($dir)
    {
        $total = 0;
        $dh = @opendir($dir);
        if (!$dh)
        {
            return 0;
        }
        while (($entry = readdir($dh)) !== false)
        {
            if ($entry == "." || $entry == "..") continue;
            $full = rtrim($dir, "/")."/".$entry;
            if (is_dir($full) && !is_link($full)) // do not follow symlinked directories
            {
                $total += $this->_dirSize($full);
            }
            else
            {
                $total += (int) @filesize($full);
            }
        }
        closedir($dh);
        return $total;
    }

    /**
     * Report the cache size in $this->debug and, when it exceeds the limit,
     * delete the oldest quarter of the files directly inside the cache directory.
     *
     * Uses $path_to_cache and $max_size_cache (MB), presumably defined by config.inc.
     */
    function checkCache()
    {
        include 'config.inc';

        $size = 0;
        $mtimes = array(); // file path => modification time
        if (is_dir($path_to_cache) && ($dh = opendir($path_to_cache)))
        {
            while (($filecnt = readdir($dh)) !== false)
            {
                if ($filecnt == "." || $filecnt == "..") continue;
                $full = $path_to_cache.$filecnt;
                if (is_dir($full))
                {
                    $size += $this->_dirSize($full);
                }
                else // a file: count its size and remember it in case the cache is oversized
                {
                    $size += filesize($full);
                    $mtimes[$full] = filemtime($full);
                }
            }
            closedir($dh);
        }

        $size = $size/1024/1024; //MB//
        $this->debug .= "\n\tLocal cache size: (".round($size,2)." / ".$max_size_cache." ) MB\n";

        if ($size > $max_size_cache)
        {
            $degree = .25;
            // readdir() order is not chronological, so sort explicitly: oldest first.
            asort($mtimes);
            // Round up so an oversized cache always loses at least one file.
            $length = (int) ceil(count($mtimes) * $degree);
            $files_to_be_deleted = array_slice(array_keys($mtimes), 0, $length);
            $this->debug .= "\nCache was too large...removing ".($degree * 100)."% of the cache.\n\n";
            foreach ($files_to_be_deleted as $target)
            {
                if (unlink($target))
                {
                    $this->debug .= "\t\tFile: ".$target." was removed from the cache.\n";
                }
            }
        }
    }

    /**
     * Work out a MIME type for a URL or for raw file contents.
     *
     * URL mode looks at the last three characters of the URL. Content mode looks
     * only at file signatures, so ordinary text that happens to begin with e.g.
     * "doc" is not mistaken for a document type.
     *
     * @param string $specimen A URL, or the beginning of a file's contents.
     * @param bool   $isURL    True when $specimen is a URL.
     * @return string The MIME type, or "" when none could be determined.
     */
    function _detectContentType($specimen, $isURL)
    {
        if ($isURL)
        {
            switch (strtolower(substr($specimen, -3)))
            {
                case "pdf": return "application/pdf";
                case "doc": return "application/msword";
                case "xls": return "application/vnd.ms-excel";
                case "ppt": return "application/vnd.ms-powerpoint";
                case "gif": return "image/gif";
                case "png": return "image/png";
                case "jpg": return "image/jpeg"; // "image/jpg" is not a registered type
                case "bmp": return "image/bmp";
                case "xml": return "text/xml";
                case "rdf": return "text/xml"; // original note: 'text/rdf' would not display
                case "rss": return "text/rss";
            }
            return "";
        }

        // File contents: signatures at the very start.
        if (strncmp($specimen, "GIF8", 4) == 0)    return "image/gif";
        if (strncmp($specimen, "\x89PNG", 4) == 0) return "image/png";
        if (strncmp($specimen, "%PDF", 4) == 0)    return "application/pdf";

        // Signatures at offset 6: JPEG JFIF/Exif markers, and "<?xml version".
        switch (substr($specimen, 6, 4))
        {
            case "JFIF": return "image/jpeg";
            case "Exif": return "image/jpeg";
            case "vers": return "text/xml";
        }
        return "";
    }

    /**
     * Send a Content-Type header when a MIME type can be determined; otherwise
     * leave whatever header is already in place.
     *
     * @param string $specimen A URL, or the beginning of a file's contents.
     * @param bool   $isURL    True to treat $specimen as a URL, false as file contents.
     */
    function setHeaders($specimen, $isURL)
    {
        $ctype = $this->_detectContentType($specimen, $isURL);
        if ($ctype != "")
        {
            header("Content-Type: $ctype");
        }
    }

    /**
     * Turn local URLs in the page into fully qualified URLs based on $url.
     *
     * The rewrite steps run in a fixed order and each one relies on the result
     * of the previous one; the patterns are unchanged from the original.
     *
     * @param string $HTML       Page source.
     * @param string $url        URL the page was fetched from.
     * @param string $edited_tag Not used by this method; kept for the existing call signature.
     * @return string Page source with completed URLs.
     */
    function completeURLs($HTML, $url, $edited_tag)
    {
        $URI_PARTS = $this->parseURL($url);
        $path = trim($URI_PARTS["path"], "/");
        $host_url = trim($URI_PARTS["host"], "/");
        $host_no_path = $URI_PARTS["scheme"]."://".$host_url."/";
        // scheme://host/path/ with exactly one trailing '/' (also when the path is empty)
        $host = rtrim($host_no_path.$path."/", '/')."/";

        // replace '//' with 'http://' //
        // NOTE(review): always uses http, even when the page itself was https.
        $pattern = "#(?<=\"|'|=)\/\/#"; // the '|=' is experimental as it's not necessary as far as I have seen //
        $HTML = preg_replace($pattern, "http://", $HTML);

        // Root-relative URLs: [src|href|background|action]="/... -> host without the page path.
        // The leading '/' is consumed because $host_no_path already ends with one.
        $pattern = "#(src|href|background|action)(=\"|='|=(?!'|\"))\/#i";
        $HTML = preg_replace($pattern, "\${1}\${2}".$this->_escapeReplacement($host_no_path), $HTML);

        // Remaining relative URLs -> host plus the page's directory.
        $pattern = "#(href|src|background|action)(=\"|=(?!'|\")|=')(?!http|ftp|https|\"|'|javascript:|mailto:)#i";
        $HTML = preg_replace($pattern, "\${1}\${2}".$this->_escapeReplacement($host), $HTML);

        // Point forms at this script and carry the original target in a hidden field.
        //TODO: need to be able to clean off the crap after the action.//
        $pattern = "#action=(.*?)>#is";
        $replace = "action=".$this->_escapeReplacement($this->_selfURL())."><input type=\"hidden\" name=\"original_url\" value=\"\$1\">";
        $HTML = preg_replace($pattern, $replace, $HTML);

        // matches '/[any assortment of chars or nums]/../'
        $pattern = "#\/(\w*?)\/\.\.\/(.*?)>#ims";
        $replace = "/\$2>";
        $HTML = preg_replace($pattern, $replace, $HTML);

        // matches '/./'
        $pattern = "#\/\.\/(.*?)>#ims";
        $replace = "/\$1>";
        $HTML = preg_replace($pattern, $replace, $HTML);

        // CSS2 @import of stylesheets is not handled yet.
        //EXAMPLE: <style type="text/css" media="screen">@import "/themes/blue/blue.css";</style> //
        return $HTML;
    }

    /**
     * Redirect link targets through the proxy page.
     *
     * <base> tags are commented out, <link> tags are marked with
     * edited="$edited_tag" so they are skipped, and every other href that is
     * followed somewhere by a closing </a> is prefixed with "<this script>?url=".
     *
     * @param string $HTML       Page source.
     * @param string $edited_tag Marker value written into the edited="" attribute.
     * @return string Page source with proxied links.
     */
    function proxyURLs($HTML, $edited_tag)
    {
        // The BASE tag has to go for sites like yahoo.com; it is commented out for now.
        $pattern = "#\<base(.*?)\>#ims";
        $replacement = "<!-- <base\$1> -->";
        $HTML = preg_replace($pattern, $replacement, $HTML);

        $tag_in_pattern = preg_quote($edited_tag, "#");
        $tag_attribute = "edited=\"".$this->_escapeReplacement($edited_tag)."\"";

        // edit <link tags so that 'edited="$edit_tag" ' is just before 'href' //
        $pattern = "#\<link(.*?)(\shref=)#ims";
        $HTML = preg_replace($pattern, "<link\${1} ".$tag_attribute."\${2}", $HTML);

        // hrefs that are not already marked as edited and have a </a> after them
        $pattern = "#(?<!edited=\"".$tag_in_pattern."\"\s)(href='|href=\"|href=(?!'|\"))(?=(.+)\</a\>)(?!mailto:|http://ftp|ftp|javascript:|'|\")#ims";
        $replacement = $tag_attribute." \${1}".$this->_escapeReplacement($this->_selfURL().'?url=');
        $HTML = preg_replace($pattern, $replacement, $HTML);
        return $HTML;
    }

    /**
     * Embed the javascript menu: support code right after the opening <head>
     * tag and the menu bar with the URL form right after the opening <body> tag.
     *
     * Relies on the LayersMenu class, presumably provided by the lib/ files
     * included below.
     *
     * @param string $HTML Page source.
     * @param string $url  URL shown in the menu bar's address field.
     * @return string Page source with the menu inserted.
     */
    function embedMenu($HTML, $url)
    {
        // NOTE(review): $myDirPath was never defined in this scope, so all paths were
        // effectively relative to the working directory; made explicit here - confirm.
        $myDirPath = "";

        //HEADER--BEGIN//
        $style_sheet = "<link rel=\"stylesheet\" href=\"layersmenu-sbp.css\" type=\"text/css\" />\n";
        $browser_detection_code = "<script language=\"JavaScript\" type=\"text/javascript\">\n".
            file_get_contents($myDirPath."libjs/layersmenu-browser_detection.js")."\n".
            "</script>\n";
        $layersmenu_library = "<script language=\"JavaScript\" type=\"text/javascript\" src=\"libjs/layersmenu-library.js\"></script>\n";
        $layersmenu = "<script language=\"JavaScript\" type=\"text/javascript\" src=\"libjs/layersmenu.js\"></script>\n";

        // include_once: a second call must not load (and redeclare) the library again.
        include_once ($myDirPath . "lib/PHPLIB.php");
        include_once ($myDirPath . "lib/layersmenu-common.inc.php");
        include_once ($myDirPath . "lib/layersmenu.inc.php");

        $mid = new LayersMenu(6, 7, 2, 1); // Gtk2-like
        $mid->setMenuStructureFile($myDirPath . "layersmenu-sbp.txt");
        $mid->parseStructureForMenu("hormenu1");
        $mid->newHorizontalMenu("hormenu1");
        $header = $mid->makeHeader();

        $source = "\n<!-- BEGIN: menu insert -->\n".$style_sheet.$browser_detection_code.$layersmenu_library.$layersmenu.$header."\n<!-- END: menu insert -->\n\n";

        // Insert right after the first opening head tag (with or without attributes).
        // The inserted source contains javascript, so it is escaped to be inserted literally.
        $pattern = "#<head(\s[^>]*)?>#i";
        $after_head = "<head\${1}>".$this->_escapeReplacement($source);
        $HTML = preg_replace($pattern, $after_head, $HTML, 1); // only the first match
        //HEADER--END//

        //BODY--BEGIN//
        $self = $this->_selfURL();
        $safe_url = htmlspecialchars($url, ENT_QUOTES); // shown inside an attribute value

        $primary_table ="<table width=\"100%\" cellpadding=\"0\" cellspacing=\"0\" class=\"horbar\" style=\"border: 1px solid groove;\">\n";
        $secondary_table = "<tr valign=\"middle\">\n".
            "<td>\n".
            "<table width=\"100%\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\">\n";
        $td_menu = "<tr valign=\"center\">\n".
            "<td class=\"horbar2\" align=\"right\" style=\"border: 0px solid red\">\n".$mid->getMenu("hormenu1")."\n";
        $td_form = "</td>\n".
            "<td class=\"horbar2\" width=\"20\"> </td>".
            "<td class=\"horbar2\">\n".
            "<form name=\"sbp_form\" method=\"POST\" action=\"".$self."\">\n".
            "<input type=\"text\" size=\"40\" name=\"form_url\" value=\"".$safe_url."\" class=\"horbar2\" style=\"width: 300px;\">\n".
            "<input type=\"submit\" name=\"Go\" value=\"Go\" style=\"width: 30px;\" ".
            "onClick=\"javascript:document.sbp_form.action='?url='+document.sbp_form.form_url.value;\">\n".
            "</form>\n".
            "</td>\n";
        $close_tables = "</tr>\n".
            "</table>\n".
            "</td>\n".
            "</tr>\n".
            "</table>\n".$mid->makeFooter()."\n";

        $source = "\n<!-- BEGIN: menu insert -->\n".$primary_table.$secondary_table.$td_menu.$td_form.$close_tables."\n<!-- END: menu insert -->\n\n";

        // Insert right after the first opening body tag.
        $pattern = "#\<body(.*?)\>#ims";
        $after_body = "<body\${1}>".$this->_escapeReplacement($source);
        $HTML = preg_replace($pattern, $after_body, $HTML, 1);
        //BODY--END//
        return $HTML;
    }

    /**
     * Elapsed seconds between two microtime captures.
     *
     * Accepts the "msec sec" strings returned by microtime() as well as the
     * numeric values returned by microtime(true).
     *
     * @param string|float $start Capture taken first.
     * @param string|float $end   Capture taken last.
     * @return float Elapsed time in seconds.
     */
    function processTime($start, $end)
    {
        if (is_numeric($start) && is_numeric($end))
        {
            return (float) $end - (float) $start;
        }
        list($a_dec, $a_sec) = explode(" ", $start);
        list($b_dec, $b_sec) = explode(" ", $end);
        return (float) $b_sec - (float) $a_sec + (float) $b_dec - (float) $a_dec;
    }
} //end of SBP class//
?>