Location: PHPKode > scripts > wordDocumentHandler > worddocumenthandler/wordDocumentHandler.php
<?php
//########################################################################################
// -------------- Summary
// This class converts an MS Word document to HTML, RTF or plain text. Because MS Word can also open
// txt and rtf files, the input may be a doc, txt or rtf file.
// MS Word must be installed on the server, so this only works on a Windows OS.
//
// -------------- Author
// Logan Dugenoux - 2003
// hide@address.com
// http://www.peous.com/logan/
//
// -------------- License
// GPL
//
// -------------- Methods :
// - convertWordDocumentToString($inFile , $outFormat="html")				"html", "htm", "rtf" or "txt"
// - convertWordDocumentToFile($inFile ,$outFile, $outFormat="html")		"html", "htm", "rtf" or "txt"
// - cleanWordHTML( ... ) see help below
// - getLastError() returns last error if function returns false.
//
// ------------- Example :
// require ("wordDocumentHandler.php");
// $w = new wordDocumentHandler();
// $myWordFile = "my doc file.doc";
// $txt = $w->convertWordDocumentToString( $myWordFile , "txt" );
// echo  $txt;
//
//  ------------- About COM
// http://php.planetmirror.com/manual/en/faq.com.php
//
// * If you got error "Cannot instantiate non-existent class: com" 
//     Edit your php.ini and set com.allow_dcom=true
//
// Have fun !!!
//########################################################################################

	/**
	 * Converts documents through a MS Word COM instance (html, rtf or txt output) and
	 * cleans up the HTML that Word produces.
	 *
	 * The conversion methods return false on failure and store a message that can be
	 * read with getLastError().
	 */
	class wordDocumentHandler
	{
		// Message describing the last recorded failure, "" when none was recorded.
		var $lastError = "";

		/**
		 * Constructor. Nothing to initialise.
		 *
		 * Declared so that the legacy same-named method below is no longer treated as a
		 * (deprecated) PHP 4 style constructor.
		 */
		function __construct()
		{
		}

		/**
		 * Legacy PHP 4 style constructor name, kept so explicit calls to it still resolve.
		 */
		function wordDocumentHandler()
		{
		}

		/**
		 * Returns the message stored by the last failing conversion ("" if none).
		 */
		function getLastError()
		{
			return $this->lastError;
		}

		/**
		 * Converts $inFile with MS Word and returns the converted content as a string.
		 *
		 * The document is saved into a private temporary directory created under
		 * "<this file's directory>/wordDocumentHandler/", read back, and the temporary
		 * directory is then deleted.
		 *
		 * @param string $inFile     document to open in Word
		 * @param string $outFormat  "txt" or "rtf"; any other value produces HTML
		 * @return string|false      converted content, or false when nothing could be read
		 */
		function convertWordDocumentToString($inFile , $outFormat="html")
		{
			$this->lastError = "";

			// working space
			$baseDir = dirname(__FILE__)."/wordDocumentHandler/";
			if (!is_dir( $baseDir ))
				@mkdir( $baseDir );

			// tempnam() creates a file; only its unique name is wanted.
			$tempFile = tempnam( $baseDir, "wrd" );
			if ($tempFile === false)
			{
				$this->lastError = "Cannot create a temporary name in ".$baseDir;
				return false;
			}
			@unlink( $tempFile );

			// Strip dots from the generated file name only, never from the parent
			// directories (a path such as ".../example.com/..." must stay intact).
			$dataPath = dirname($tempFile)."/".str_replace(".", "", basename($tempFile))."/";
			if (!@mkdir( $dataPath ) && !is_dir( $dataPath ))
			{
				$this->lastError = "Cannot create temporary directory ".$dataPath;
				return false;
			}
			$outFile = $dataPath."document.html";

			// Conversion. The result is judged by the presence of the output file rather
			// than by the return value, exactly as the content was read unconditionally before.
			$this->convertWordDocumentToFile( $inFile, $outFile, $outFormat );
			$content = false;
			if (is_file( $outFile ))
				$content = file_get_contents( $outFile );
			if ($content === false && $this->lastError == "")
				$this->lastError = "MsWord did not produce ".$outFile;

			// Remove temp files
			$this->recursiveDirDelete( $dataPath );

			return $content;
		}

		/**
		 * Opens $inFile in MS Word through COM and saves it as $outFile.
		 *
		 * @param string $inFile     document to open in Word
		 * @param string $outFile    file Word must write
		 * @param string $outFormat  "txt" or "rtf"; any other value produces HTML
		 * @return bool              true on success; false otherwise (see getLastError())
		 */
		function convertWordDocumentToFile($inFile ,$outFile, $outFormat="html")
		{
			$this->lastError = "";

			// Format numbers passed to SaveAs (Word WdSaveFormat values).
			$outFormatNumber = 8;		// default = HTML
			if ($outFormat=="txt")		$outFormatNumber = 2;
			if ($outFormat=="rtf")		$outFormatNumber = 6;

			if (!class_exists( "COM" ))
			{
				$this->lastError = "COM object of microsoft word cannot be found. Check COM permissions or Office install";
				return false;
			}

			$comObject = null;
			$ok = false;
			try
			{
				// Create MsWord instance
				$comObject = new COM("Word.Application");
				if (!$comObject)
				{
					$this->lastError = "COM object of microsoft word cannot be found. Check COM permissions or Office install";
				}
				// Open doc in Word
				elseif (!$comObject->Documents->Open($inFile))
				{
					$this->lastError = $inFile." cannot be opened by Word";
				}
				// Save doc
				// NOTE(review): this keeps the original test on the value returned by SaveAs;
				// confirm what the COM call actually returns on success.
				elseif (!$comObject->ActiveDocument->SaveAs($outFile,$outFormatNumber))
				{
					$this->lastError = "MsWord cannot save ".$outFile;
				}
				else
				{
					$ok = true;
				}
			}
			catch (Exception $e)
			{
				// COM failures are reported as exceptions; turn them into the documented
				// "return false + getLastError()" contract.
				$this->lastError = "MsWord conversion of ".$inFile." failed: ".$e->getMessage();
				$ok = false;
			}

			if ($comObject)
			{
				// Always quit, otherwise the Word process stays alive on the server.
				try
				{
					$comObject->Quit(0);
				}
				catch (Exception $e)
				{
					// Best effort: nothing more can be done if Word refuses to quit.
				}
			}

			return $ok;
		}

		/**
		 * Cleans, in place, the HTML produced by MS Word. Every option is a 0/1 flag.
		 */
		function cleanWordHTML(
			&$htmlCnt,							// string to clean (modified in place)

			$supprimer_tout_style = 1,			// remove every inline style='...' attribute

			$supprimer_if = 1,					// remove all Microsoft conditional blocks (<![if ...]> ... <![endif]>)

			$supprimer_espaces = 1,				// collapse whitespace (needed by the 3 next options)

			$supprimer_def_styles_inutiles = 1,	// drop CSS rules not used by any class=... attribute (>200 KB!)

			$recherche_balises_approfindie = 0,	// also keep the rules of every tag name found in the document (SLOW)

			$binder_nom_classes = 1,			// rename the used classes to short names c0, c1, ...

			$supprimer_les_style_none = 1,		// remove "border-...:none" like declarations, which are the default anyway

			$supprimer_trucs_word = 1,			// remove various useless Word attributes, tags and properties

			$supprimer_balises_span = 1,		// remove <span> tags (not their content)

			$supprimer_toutes_balises = 0		// remove all tags (better use TXT output for conversion)
			)
		{
			//---------------
			// Do this before processing classes
			if ($supprimer_espaces)
			{
				$htmlCnt = str_replace( array("\n", "\r", "\t"), " ", $htmlCnt );
				$this->virerEspace( $htmlCnt );
			}

			// Remove IFs
			if ($supprimer_if)
			{
				$this->extractIf( $htmlCnt, 0 );
			}

			if ($supprimer_toutes_balises)
			{
				// the first pattern also drops the content of simple <style> blocks
				$htmlCnt = $this->pregReplaceSafe( '#<style>[^<]*</style>#', "", $htmlCnt );
				$htmlCnt = $this->pregReplaceSafe( '#<[^>]*>#', "", $htmlCnt );
			}

			if ($supprimer_def_styles_inutiles)
			{
				//---- Find the names whose CSS rules must be kept.
				// $lesClasses maps name => 1 (class used in a class=... attribute) or 2 (tag name).
				$lesClasses = array();

				// -1 classes referenced by class=...
				$balises = array();
				if (preg_match_all( "/class=[^>]*/", $htmlCnt, $balises, PREG_SET_ORDER ))
				{
					foreach ($balises as $balise)
					{
						$good = explode( " ", $balise[0] );
						// skip the 6 characters of "class=" and any quote around the name
						$nom = trim( (string)substr( $good[0], 6 ), "\"'" );
						if (strlen( $nom ) > 0)
							$lesClasses[$nom] = 1;
					}
				}

				if ($recherche_balises_approfindie)
				{
					// SLOW
					// -2 tag names taken directly from <tagName ...>
					$balises = array();
					if (preg_match_all( "/<[^>]*/", $htmlCnt, $balises, PREG_SET_ORDER ))
					{
						foreach ($balises as $balise)
						{
							$good = explode( " ", $balise[0] );
							$premier = substr( $good[0], 1, 1 );
							if ($premier == "/" || $premier == "!")
								continue;		// closing tag, comment or declaration
							$nom = (string)substr( $good[0], 1 );
							// Tags are type 2 so they are never renamed as if they were classes.
							if (strlen( $nom ) > 0 && !isset( $lesClasses[$nom] ))
								$lesClasses[$nom] = 2;
						}
					}
				}
				else
				{
					foreach (array( "h1", "h2", "h3", "h4", "h5", "h6" ) as $titre)
						$lesClasses[$titre] = 2;
				}

				// Short replacement name of every used class (empty when renaming is off).
				$nouveauxNoms = array();
				if ($binder_nom_classes)
				{
					$i = 0;
					foreach ($lesClasses as $k => $type)
					{
						if ($type == 1)
						{
							$nouveauxNoms[$k] = "c".$i;
							$i++;
						}
					}
				}

				// Find the CSS rules of the kept names: "name {...}"
				$alternatives = array();
				foreach ($lesClasses as $k => $type)
					$alternatives[] = preg_quote( (string)$k, "/" );

				$stylesDefString = "";
				$stylesDef = array();
				if (count( $alternatives ) > 0
					&& preg_match_all( "/(?<![A-Za-z0-9_-])(".implode( "|", $alternatives ).") *\\{[^}]*\\}/", $htmlCnt, $stylesDef, PREG_SET_ORDER ))
				{
					foreach ($stylesDef as $def)
					{
						$nom = $def[1];
						$selecteur = $nom;
						if (isset( $lesClasses[$nom] ) && $lesClasses[$nom] == 1)
						{
							// A class rule needs a leading "." to apply to class=... attributes.
							$selecteur = ".".(isset( $nouveauxNoms[$nom] ) ? $nouveauxNoms[$nom] : $nom);
						}
						$stylesDefString .= "\n".$selecteur.substr( $def[0], strlen( $nom ) )."\n";
					}
				}

				// Rename the classes inside class=... attributes only, so that document text
				// which happens to contain a class name is left alone.
				foreach ($nouveauxNoms as $k => $court)
				{
					$htmlCnt = $this->pregReplaceSafe(
						'/(class=["\']?)'.preg_quote( (string)$k, "/" ).'(?=["\'\s>]|$)/',
						'${1}'.$court,
						$htmlCnt );
				}

				// Remove all <style> ... </style> blocks
				$pFirstStyleBegin = $this->strpoz( $htmlCnt, "<style>", 0 );
				$pStyleBegin = $pFirstStyleBegin;
				while ($pStyleBegin != -1)
				{
					$pStyleEnd = $this->strpoz( $htmlCnt, "</style>", $pStyleBegin );
					if ($pStyleEnd == -1)
						break;		// unterminated block: leave it untouched
					// 8 = strlen("</style>")
					$htmlCnt = substr( $htmlCnt, 0, $pStyleBegin ).substr( $htmlCnt, $pStyleEnd+8 );
					// The text after the removed block now starts at $pStyleBegin.
					$pStyleBegin = $this->strpoz( $htmlCnt, "<style>", $pStyleBegin );
				}

				// Write only the necessary styles, where the first <style> block was
				if ($stylesDefString != "" && $pFirstStyleBegin != -1)
				{
					$htmlCnt = 	substr( $htmlCnt, 0, $pFirstStyleBegin ).
							"<style>\n<!--".$stylesDefString."-->\n</style>".
							substr( $htmlCnt, $pFirstStyleBegin );
				}
			}

			if ($supprimer_tout_style)
			{
				$htmlCnt = $this->pregReplaceSafe( "/style='[^']*'/", "", $htmlCnt );
			}

			if ($supprimer_les_style_none)
			{
				// These values are the default anyway
				$htmlCnt = str_replace(
					array(
						"text-decoration:none",
						"text-underline:none",
						"border-left:none",
						"border-top:none",
						"border-bottom:none",
						"border-right:none"
					),
					"",
					$htmlCnt );
			}

			if ($supprimer_trucs_word)
			{
				$htmlCnt = $this->pregReplaceSafe( '/v:shapes="[^"]*"/', "", $htmlCnt );
				$htmlCnt = $this->pregReplaceSafe( "/style='tab-stops:[^']*'/", "", $htmlCnt );
				$htmlCnt = $this->pregReplaceSafe( '#<o[^>]*></o:p>#', "", $htmlCnt );		// empty <o:p> tags
				$htmlCnt = $this->pregReplaceSafe( '#<p[^>]*></p>#', "", $htmlCnt );		// empty paragraphs
				// Remove whole "mso-xxx:value" / "field-code-xxx:value" declarations: the name,
				// then optionally ":" with a quoted string and/or a value running up to the
				// next ; ' " } < or >.
				$valeur = '[A-Za-z0-9-]*(?::(?:"[^"<>]*")?[^;\'"}<>]*)?';
				$htmlCnt = $this->pregReplaceSafe( '/mso-'.$valeur.'/', "", $htmlCnt );
				$htmlCnt = $this->pregReplaceSafe( '/field-code-'.$valeur.'/', "", $htmlCnt );
			}

			if ($supprimer_balises_span)
			{
				$htmlCnt = $this->pregReplaceSafe( '/<span[^>]*>/', "", $htmlCnt );
				$htmlCnt = str_replace( "</span>", "", $htmlCnt );
			}

			// Last optim
			if ($supprimer_espaces)
			{
				$this->virerEspace( $htmlCnt );
			}
		}



		//--------------- PRIVATE FUNCTIONS -----------------

		/**
		 * preg_replace() that returns the subject unchanged when the regex engine reports
		 * an error (null), so a failing pattern can never wipe the document.
		 */
		function pregReplaceSafe( $pattern, $replacement, $subject )
		{
			$res = preg_replace( $pattern, $replacement, $subject );
			return ($res === null) ? $subject : $res;
		}

		/**
		 * Collapses, in place, every run of spaces into a single space.
		 */
		function virerEspace( &$htmlCnt )
		{
			// str_replace() does not rescan what it just produced, so repeat until no
			// double space is left; each pass roughly halves the length of every run.
			while (strpos( $htmlCnt, "  " ) !== false)
			{
				$htmlCnt = str_replace( "  ", " ", $htmlCnt );
			}
		}

		/**
		 * Removes, in place, the Microsoft conditional blocks "<![if ...]> ... <![endif]>"
		 * and "<!--[if ...]> ... <![endif]-->" found from offset $pos onwards. The content
		 * of a block is kept only when its condition is exactly "!vml".
		 *
		 * A block whose closing "<![endif]" (or its final ">") is missing is left untouched.
		 *
		 * @param string $str  document, modified in place
		 * @param int    $pos  offset at which the search starts
		 * @return string|null the text from $pos when no block opener was found, else nothing
		 */
		function extractIf( &$str, $pos )
		{
			// Collect the offset of every block opener, left to right.
			$debuts = array();
			$curseur = $pos;
			while (true)
			{
				$pIf = $this->zmin(
					$this->strpoz( $str, "<![if", $curseur ),
					$this->strpoz( $str, "<!--[if", $curseur ) );
				if ($pIf < 0)
					break;
				$pIfEnd = $this->strpoz( $str, ">", $pIf );
				if ($pIfEnd < 0)
					break;		// opener never closed by ">": nothing usable after it
				$debuts[] = $pIf;
				$curseur = $pIfEnd;
			}

			if (count( $debuts ) == 0)
			{
				return substr( $str, $pos );
			}

			// Process the blocks from the last to the first: nested blocks are resolved
			// before their parent, and the offsets of earlier openers stay valid because
			// every edit happens at or after the opener being processed.
			for ($i = count( $debuts ) - 1; $i >= 0; $i--)
			{
				$pIf = $debuts[$i];
				$pIfEnd = $this->strpoz( $str, ">", $pIf );

				$pNextEndIf = $this->strpoz( $str, "<![endif]", $pIfEnd );
				if ($pNextEndIf < 0)
					continue;
				$pNextEndIfEnd = $this->strpoz( $str, ">", $pNextEndIf );
				if ($pNextEndIfEnd < 0)
					continue;

				// The condition is the text between "[if " and "]>"
				$pCond = $this->strpoz( $str, "[", $pIf );
				$ifCondition = substr( $str, $pCond+4, $pIfEnd-$pCond-5 );

				//	$pIf		$pIfEnd				$pNextEndIf		$pNextEndIfEnd
				//	<![if...	>		...			<![endif]		>
				$insideIf = "";
				if ($ifCondition == "!vml")
				{
					$insideIf = substr( $str, $pIfEnd+1, $pNextEndIf-($pIfEnd+1) );
				}

				$str = 		substr( $str, 0, $pIf ).
							$insideIf.
							substr( $str, $pNextEndIfEnd+1 );
			}
		}

		/**
		 * Returns the smaller of two offsets produced by strpoz(); when $p1 is negative
		 * ("not found") the result is $p2.
		 */
		function zmin( $p1, $p2 )
		{
			return (($p1>=0)&&(($p1<$p2)||($p2==-1)))?$p1:$p2;
		}

		/**
		 * strpos() variant returning -1 instead of false when $findme is not found.
		 * An empty needle or a start offset outside of the string also yields -1.
		 */
		function strpoz( $mystring, $findme, $start )
		{
			$mystring = (string)$mystring;
			$findme = (string)$findme;
			if ($findme === "" || $start < 0 || $start > strlen( $mystring ))
				return -1;
			$res = strpos( $mystring, $findme, $start );
			if ($res === false)
				return -1;
			return $res;
		}

		/**
		 * Deletes a file, or a directory with everything it contains.
		 *
		 * When PHP itself cannot delete an entry, a shell command is tried as a fallback.
		 *
		 * @param string $thePath  file or directory to delete
		 * @return bool            true when $thePath no longer exists afterwards
		 */
		function recursiveDirDelete($thePath)
		{
			$surWindows = (DIRECTORY_SEPARATOR == "\\");

			// Plain file (or link: never follow a link into another directory)
			if (@is_link( $thePath ) || !@is_dir( $thePath ))
			{
				@unlink( $thePath );
				clearstatcache();
				if (@file_exists( $thePath ))
				{
					if ($surWindows)
						@exec( "del ".escapeshellarg( str_replace( "/", "\\", $thePath ) ) );
					else
						@exec( "rm ".escapeshellarg( $thePath ) );
					clearstatcache();
				}
				return !@file_exists( $thePath );
			}

			// Directory: entries are addressed as $thePath.$file, so make sure it ends
			// with a separator.
			$dernier = substr( $thePath, -1 );
			if ($dernier != "/" && $dernier != "\\")
				$thePath .= "/";

			$dh = @opendir( $thePath );
			if ($dh)
			{
				while (($file = readdir( $dh )) !== false)
				{
					if ($file == "." || $file == "..")
						continue;
					$fullpath = $thePath.$file;
					if (@is_dir( $fullpath ) && !@is_link( $fullpath ))
						$fullpath .= "/";
					if (!$this->recursiveDirDelete( $fullpath ))
					{
						closedir( $dh );
						return false;
					}
				}
				closedir( $dh );
			}

			@rmdir( $thePath );
			clearstatcache();
			if (@file_exists( $thePath ))
			{
				// No trailing separator in the quoted shell argument.
				$sansSlash = rtrim( $thePath, "/\\" );
				if ($surWindows)
					@exec( "rmdir ".escapeshellarg( str_replace( "/", "\\", $sansSlash ) ) );
				else
					@exec( "rmdir ".escapeshellarg( $sansSlash ) );
				clearstatcache();
			}
			return !@file_exists( $thePath );
		}
	}
?>
Return current item: wordDocumentHandler