Location: PHPKode > scripts > CriosWeb_HTML_Cleaner > crioswebhtmlcleaner/HTMLCleaner.php
<?php
/*HTMLCleaner 1.0 [stable] (c) 2007-2009 Lucian Sabo
HTML source code cleaner (great help for cleaning MS Word content)
hide@address.com

Licensed under Creative Commons Attribution-Noncommercial-Share Alike 3.0 Unported (http://creativecommons.org/licenses/by-nc-sa/3.0/)
for personal, non-commercial use
--------*/
// Tag cleaning strategies; the chosen one is stored in Options['CleaningMethod'][0].
define('TAG_WHITELIST', 0);
define('TAG_BLACKLIST', 1);
// Attribute cleaning strategies; the chosen one is stored in Options['CleaningMethod'][1].
define('ATTRIB_WHITELIST', 0);
define('ATTRIB_BLACKLIST', 1);

/**
 * Regex/strip_tags based HTML cleaner, optionally assisted by the tidy extension.
 *
 * Usage: assign the markup to $html, adjust $Options if needed, call cleanUp().
 *
 * Note: cleanUp() only acts on TAG_WHITELIST and ATTRIB_BLACKLIST; the
 * TAG_BLACKLIST and ATTRIB_WHITELIST methods are defined but have no effect.
 */
class HTMLCleaner
{
    /** @var string Markup to clean; replaced by the cleaned markup after cleanUp(). */
    var $html = '';

    /** @var array Behaviour switches, see __construct() for the keys and defaults. */
    var $Options;

    /**
     * Tags kept by strip_tags() when the TAG_WHITELIST method is active.
     * Add <html><head><meta><title> to keep a full page; in that case also
     * disable the "trim everything before <body" step in cleanUp().
     *
     * @var string
     */
    var $Tag_whitelist = '<table><tbody><thead><tfoot><tr><th><td><colgroup><col>
<p><br><hr><blockquote>
<b><i><u><sub><sup><strong><em><tt><var>
<code><xmp><cite><pre><abbr><acronym><address><samp>
<fieldset><legend>
<a><img>
<h1><h2><h3><h4><h5><h6>
<ul><ol><li><dl><dt><dd>
<frame><frameset>
<form><input><select><option><optgroup><button><textarea>';

    /** @var string Regex alternation of attribute names removed under ATTRIB_BLACKLIST. */
    var $Attrib_blacklist = 'id|on[\w]+';

    /** @var array Inline tags that may be dropped when empty and merged when adjacent. */
    var $CleanUpTags = array('a','span','b','i','u','strong','em','big','small','tt','var','code','xmp','cite','pre','abbr','acronym','address','q','samp','sub','sup');

    /** @var array Configuration handed to tidy, see __construct(). */
    var $TidyConfig;

    /** @var string Encoding name handed to tidy. */
    var $Encoding = 'latin1';

    /** @var string */
    var $Version = '1.0';

    /**
     * Sets the default options and tidy configuration.
     *
     * A real __construct() is required: since PHP 8.0 a method named after
     * the class is no longer treated as a constructor, which left Options
     * and TidyConfig uninitialised.
     */
    function __construct() {
        $this->Options = array(
            'RemoveStyles'        => true,  // removes style definitions (style and class attributes)
            'IsWord'              => true,  // Microsoft Word flag - Word specific operations may occur
            'UseTidy'             => true,  // also use the tidy engine to clean up the source (recommended)
            'TidyBefore'          => false, // apply tidy first (not recommended: tidy sometimes messes up legitimate spaces)
            'CleaningMethod'      => array(TAG_WHITELIST, ATTRIB_BLACKLIST), // cleaning methods (tags, attributes)
            'OutputXHTML'         => true,  // converts to XHTML by using tidy (forces UseTidy)
            'FillEmptyTableCells' => true,  // fills empty cells with non-breaking spaces
            'DropEmptyParas'      => true,  // drops empty paragraphs and headings
            'Optimize'            => true,  // optimize code - drop empty / merge adjacent inline tags
            'Compress'            => false, // trims all spaces (line breaks, tabs) between tags and between words
            'StripSlashes'        => true   // run stripslashes() on the input (legacy magic-quotes behaviour);
                                            // set to false to preserve literal backslashes
        );

        // tidy configuration
        $this->TidyConfig = array(
            'indent'                      => true,  // a bit slow
            'output-xhtml'                => true,  // outputs the data in XHTML format
            'word-2000'                   => false, // removes proprietary data of MS Word documents saved as HTML
            //'clean'                     => true,  // too slow
            'drop-proprietary-attributes' => true,  // removes all attributes that are not part of a web standard
            'hide-comments'               => true,  // strips all comments
            'preserve-entities'           => true,  // preserve the well-formed entities as found in the input
            'quote-ampersand'             => true,  // output unadorned & characters as &amp;
            'show-body-only'              => true,
            'wrap'                        => 200    // number of characters allowed before a line is soft-wrapped
        );
    }

    /**
     * Legacy PHP 4 style constructor name, kept so existing explicit calls
     * ($obj->HTMLCleaner(), parent::HTMLCleaner()) keep working.
     */
    function HTMLCleaner() {
        $this->__construct();
    }

    /**
     * Removes every attribute whose name matches $attribs from $this->html.
     *
     * The attribute must be preceded by whitespace. Double quoted, single
     * quoted and unquoted values are handled, and whitespace is tolerated on
     * both sides of the "=" sign.
     *
     * NOTE(review): the pattern is applied to the whole string, so text that
     * looks like ` name=value` outside of a tag is removed as well.
     *
     * @param string $attribs Regex alternation of attribute names, e.g. 'class|style'.
     *                        It is inserted into the pattern unescaped.
     */
    function RemoveBlacklistedAttributes($attribs) {
        if ($attribs === null || $attribs === '') {
            return;
        }
        // Quoted alternatives come first so a quoted value is consumed as a
        // whole; an unquoted value ends at whitespace or at the end of the tag.
        $this->_replace(
            '/\s+(?:' . $attribs . ')\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]*)/i',
            ''
        );
    }

    /**
     * Runs $this->html through tidy using $TidyConfig and $Encoding and
     * stores the repaired markup back as a string.
     *
     * Prints a notice when no tidy support is available.
     */
    function TidyClean() {
        if (class_exists('tidy')) {
            $tidy = new tidy;
            $tidy->parseString($this->html, $this->TidyConfig, $this->Encoding);
            $tidy->cleanRepair();
            // Store the serialized output, not the tidy object itself, so the
            // later string operations and the caller always get a string.
            $this->html = tidy_get_output($tidy);
            return;
        }

        if (function_exists('tidy_parse_string')) {
            // procedural style for compatibility with PHP 4.3 (tidy 1.x API)
            tidy_set_encoding($this->Encoding);
            foreach ($this->TidyConfig as $key => $value) {
                tidy_setopt($key, $value);
            }
            tidy_parse_string($this->html);
            tidy_clean_repair();
            $this->html = tidy_get_output();
            return;
        }

        print("<b>No tidy support. Please enable it in your php.ini.\r\nOnly basic cleaning is beeing applied\r\n</b>");
    }

    /**
     * Cleans $this->html according to $Options and returns the result.
     *
     * Steps, in order: optional tidy pass (TidyBefore), stripslashes, tag
     * whitelisting, attribute removal, inline tag optimisation, empty
     * paragraph removal, whitespace trimming, empty table cell filling,
     * compression, optional tidy pass (default position).
     *
     * Side effects: updates $Encoding, $TidyConfig ('word-2000',
     * 'drop-proprietary-attributes', 'output-xhtml') and, when OutputXHTML
     * is set, forces $Options['UseTidy'] to true.
     *
     * @param string $encoding Encoding name for tidy; an empty value keeps the current one.
     * @return string The cleaned markup (also left in $this->html).
     */
    function cleanUp($encoding = 'latin1') {
        if (!empty($encoding)) {
            $this->Encoding = $encoding;
        }

        if ($this->_option('IsWord')) {
            $this->TidyConfig['word-2000'] = true;
            $this->TidyConfig['drop-proprietary-attributes'] = true;
        } else {
            $this->TidyConfig['word-2000'] = false;
        }

        if ($this->_option('OutputXHTML')) {
            // XHTML conversion is done by tidy, so tidy has to run
            $this->Options['UseTidy'] = true;
            $this->TidyConfig['output-xhtml'] = true;
        } else {
            $this->TidyConfig['output-xhtml'] = false;
        }

        $useTidy   = (bool) $this->_option('UseTidy');
        $tidyFirst = $useTidy && $this->_option('TidyBefore');
        $optimize  = (bool) $this->_option('Optimize');

        if ($tidyFirst) {
            $this->TidyClean();
        }

        // remove escape slashes (input that went through magic quotes / addslashes)
        if ($this->_option('StripSlashes', true)) {
            $this->html = stripslashes($this->html);
        }

        $methods      = (array) $this->_option('CleaningMethod', array());
        $tagMethod    = isset($methods[0]) ? $methods[0] : TAG_WHITELIST;
        $attribMethod = isset($methods[1]) ? $methods[1] : ATTRIB_BLACKLIST;

        if ($tagMethod == TAG_WHITELIST) {
            // trim everything before the body tag right away; the body tag
            // itself is not whitelisted, so strip_tags() removes it below
            $fromBody = stristr($this->html, '<body');
            if ($fromBody !== false) {
                $this->html = $fromBody;
            }
            // NOTE: strip_tags() mishandles a bare "<" in text
            // (ex: <p>1<2</p> gives <p>1</p>); this is not worked around here.
            $this->html = strip_tags($this->html, $this->Tag_whitelist);
        }

        if ($this->_option('RemoveStyles')) {
            $this->RemoveBlacklistedAttributes('class|style');
        }

        if ($this->_option('IsWord')) {
            // language markers and Office namespaced attributes (o:, v:, w:, x:, p:)
            $this->RemoveBlacklistedAttributes('lang|[ovwxp]:\w+');
        }

        if ($attribMethod == ATTRIB_BLACKLIST && !empty($this->Attrib_blacklist)) {
            $this->RemoveBlacklistedAttributes($this->Attrib_blacklist);
        }

        if ($optimize) {
            $this->_optimizeInlineTags();
        }

        // must run after the inline tag merge and before FillEmptyTableCells,
        // because it can strip &nbsp; enclosed in paragraphs
        if ($this->_option('DropEmptyParas')) {
            $this->_dropEmptyParas();
        }

        // trim extra spaces only if tidy is not set to indent
        if ($optimize && empty($this->TidyConfig['indent'])) {
            $this->_replace('/([^<>])[\s]+([^<>])/i', '\\1 \\2'); // trim spaces between words
            $this->_replace('/[\r\n]{2,}</', '<');                 // trim excess line breaks before tags
        }

        if ($this->_option('FillEmptyTableCells')) {
            $this->_replace("/<td([^>]*)>[\s]*<\/td>/i", "<td\\1>&nbsp;</td>");
        }

        if ($this->_option('Compress')) {
            $this->_replace('/>[\s]+/', '>');                     // trim spaces after tags
            $this->_replace('/[\s]+<\//', '</');                  // trim spaces before end tags
            $this->_replace('/[\s]+</', '<');                     // trim spaces before tags
            $this->_replace('/([^<>])[\s]+([^<>])/', '\\1 \\2');  // trim spaces between words
        }

        if ($useTidy && !$tidyFirst) {
            $this->TidyClean();
        }

        return $this->html;
    }

    /**
     * Internal: reads one entry of $Options, falling back to $default when
     * the key is missing (e.g. the caller replaced the whole array).
     */
    function _option($name, $default = false) {
        return (is_array($this->Options) && isset($this->Options[$name]))
            ? $this->Options[$name]
            : $default;
    }

    /**
     * Internal: applies preg_replace() to $this->html. When PCRE reports an
     * error (null result) the markup is left untouched instead of being lost.
     */
    function _replace($pattern, $replacement) {
        $result = preg_replace($pattern, $replacement, $this->html);
        if ($result !== null) {
            $this->html = $result;
        }
    }

    /**
     * Internal: drops inline tags that hold nothing but whitespace/&nbsp; and
     * merges adjacent identical inline tags, repeating until nothing changes.
     *
     * Tag names are matched exactly (so "i" does not match <img>, "b" does
     * not match <br>), and the whitespace/&nbsp; run found between or inside
     * the tags is kept so that words are not glued together.
     * Entries of $CleanUpTags are inserted into the patterns unescaped.
     */
    function _optimizeInlineTags() {
        do {
            $before = $this->html;
            foreach ($this->CleanUpTags as $tag) {
                // strip empty inline tags (must run before the merge)
                $this->_replace(
                    '/<(' . $tag . ')(?:\s[^>]*)?>((?:\s|&nbsp;)*)<\/\1\s*>/i',
                    '\\2'
                );
                // merge inline tags: "</b> <b>" becomes " "; only an
                // attribute-less opening tag is merged
                $this->_replace(
                    '/<\/(' . $tag . ')\s*>((?:\s|&nbsp;)*)<\1>/i',
                    '\\2'
                );
            }
            // every replacement shortens the string, so this terminates
        } while ($this->html !== $before);
    }

    /**
     * Internal: replaces paragraphs and headings (p, h1-h6) that contain only
     * whitespace and/or &nbsp; with a line break.
     */
    function _dropEmptyParas() {
        $this->_replace(
            '/<(p|h[1-6])(?:\s[^>]*)?>(?:\s|&nbsp;)*<\/\1\s*>/i',
            "\r\n"
        );
    }
}
// Return current item: CriosWeb_HTML_Cleaner