Location: PHPKode > scripts > Clean Up MS Word Mess Utility > index.php
<?PHP
/*
* Clean MS Word HTML
*
* @package Clean MS Word HTML
* @author $Author: sheiko $  
* @copyright (c) Dmitry Sheiko http://www.cmsdevelopment.com 
*/ 

// Announce the response as UTF-8 encoded HTML (header() only works before any output is sent).
header('Content-type: text/html; charset=UTF-8');
?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>Clean MS Word HTML</title>
<style>
    body { background-color: #D8DCE0; font-family: Tahoma; font-size: 14px; }
    h1 { font-family: Tahoma; color: #fff; font-size: 18px; }
    div.message-box {
        padding: 0;
        margin: 40px;
        width: 70%;
    }
    div.message-box > div.header {
        background-color: #3C5F87;
        -moz-border-radius: 5px 5px 0 0;
        -webkit-border-radius: 5px 5px 0 0;
        padding: 10px 20px;
    }
    div.message-box > div.body {
        padding: 10px 20px;
        background-color: #fff;
        -moz-border-radius: 0 0 5px 5px;
        -webkit-border-radius: 0 0 5px 5px;
    }
    h3 {font-size: 14px; margin: 5px 0 2px 0; color: #3C5F87; }
	iframe { width: 100%; height: 200px; border: 1px solid grey; background-color: #FFF; color: #000;}
	textarea { width: 100%; height: 200px; border: 1px solid grey; background-color: #FFF; color: #000;}
	.hidden { display: none; }
</style>
<script type="text/javascript">
<!--
(function(){

// JS Helpers
/**
 * Shorthand for looking up a single element by its id.
 *
 * @param {string} id value of the element's id attribute
 * @return {*} whatever document.getElementById returns for that id
 */
function $(id) {
    var found = document.getElementById(id);
    return found;
}
/**
 * Shorthand returning the first element with the given tag name.
 *
 * @param {string} name tag name to search for
 * @return {*} item 0 of document.getElementsByTagName(name);
 *             undefined when that collection is empty
 */
function $$(name) {
    var matches = document.getElementsByTagName(name);
    return matches[0];
}
/**
 * Shorthand returning the first element whose name attribute matches.
 *
 * @param {string} name value of the name attribute to search for
 * @return {*} item 0 of document.getElementsByName(name);
 *             undefined when that collection is empty
 */
function $$$(name) {
    var matches = document.getElementsByName(name);
    return matches[0];
}
/**
 * Registers an event handler on an element, using whichever event API
 * the document object exposes.
 *
 * @param {Object}   el    element to listen on
 * @param {string}   eType event type without the "on" prefix, e.g. 'click'
 * @param {Function} func  handler to invoke
 */
$.publish = function(el, eType, func) {
    // Standard DOM event model; the handler is registered for the capture phase.
    if (document.addEventListener) {
        el.addEventListener(eType, func, true);
        return;
    }
    // Legacy fallback: attachEvent expects the "on"-prefixed event name.
    if (document.attachEvent) {
        el.attachEvent('on' + eType, func, true);
    }
};

// Model
/**
 * Model: strips MS Word specific markup from an HTML fragment.
 *
 * Intended to be used as a constructor: `new cleanUp(html).run()` returns
 * the cleaned HTML. Every remove* method rewrites a privately held working
 * copy of the markup in place, case-insensitively and globally.
 *
 * The tag / attribute / prefix arguments are concatenated into a regular
 * expression verbatim, so callers may (and run() does) pass regex fragments
 * such as "[a-zA-Z]{1,3}" or "\\?xml". Plain names containing regex
 * metacharacters must be escaped by the caller.
 *
 * Closing tags are spelled "<\/" inside the string literals; that is the
 * same as "</" to the regex engine and keeps the inline script safe for
 * HTML parsers that treat "</" as the end of script content.
 *
 * @param {string} code HTML to be cleaned
 */
function cleanUp(code) {
    var _code = code;

    /**
     * Removes the opening and closing tags of an element, keeping its content.
     * @param {string} tag element name (regex fragment)
     */
    this.removeDecorator = function(tag) {
        _code = _code.replace(new RegExp("<"+tag+"[^>]*>","gi"), "");
        _code = _code.replace(new RegExp("<\/"+tag+">","gi"), "");
    };

    /**
     * Removes elements whose content is empty, i.e. consists only of
     * whitespace and/or &nbsp; entities.
     *
     * The whitespace class must be written "\\s" here: inside a string
     * literal a bare "\s" loses its backslash, which made the pattern match
     * runs of the letter "s" instead of whitespace.
     *
     * @param {string} tag element name (regex fragment)
     */
    this.removeEmptyElement = function(tag) {
        _code = _code.replace(new RegExp("<"+tag+"([^>])*>(?:&nbsp;|\\s)*<\/"+tag+">","gi"), "");
    };

    /**
     * Removes a double-quoted, non-empty attribute from any tag. At most one
     * occurrence per tag is removed, and the spaces left behind are not
     * trimmed here (run() collapses repeated spaces at the end).
     * @param {string} attr attribute name (regex fragment)
     */
    this.removeAttributeEverywhere = function(attr) {
        _code = _code.replace(new RegExp("<([a-zA-Z]+) ([^>]*)"+attr+"=\"[^\"]+\"([^>]*)>","gi"), "<$1 $2 $3>");
    };

    /**
     * Strips every attribute from the opening tags of the given element.
     * @param {string} tag element name (regex fragment)
     */
    this.removeTagAttributes = function(tag) {
        _code = _code.replace(new RegExp("<("+tag+") [^>]*>","gi"), "<$1>");
    };

    /**
     * Removes namespaced elements (prefix:name). Elements whose content has
     * no nested tags are removed together with their content; any remaining
     * opening or closing namespaced tags are then removed on their own.
     * @param {string} pref namespace prefix (regex fragment)
     */
    this.removeElementByPrefix = function(pref) {
        _code = _code.replace(new RegExp("<"+pref+":([^>])*>([^<])*<\/"+pref+":([^>])*>","gi"), "");
        _code = _code.replace(new RegExp("<\\/?"+pref+":([^>])*>","gi"), "");
    };

    /**
     * Removes an element together with its content. Only matches when the
     * content contains no "<", i.e. no nested tags.
     * @param {string} tag element name (regex fragment)
     */
    this.removeElement = function(tag) {
        _code = _code.replace(new RegExp("<"+tag+"([^>])*>([^<])*<\/"+tag+">","gi"), "");
    };

    /**
     * Removes every tag that starts with the given name (no closing tag is
     * looked for).
     * @param {string} tag element name (regex fragment)
     */
    this.removeSelfClosedElement = function(tag) {
        _code = _code.replace(new RegExp("<"+tag+"([^>])*>","gi"), "");
    };

    /**
     * Replaces every run of one or more repetitions of a pattern with a
     * substitute. The "+" quantifier binds to the last atom of the pattern.
     * @param {string} char  pattern to collapse (regex fragment)
     * @param {string} subst replacement for each run
     */
    this.removeCharRepeats = function(char, subst) {
        _code = _code.replace(new RegExp(char+"+","gi"), subst);
    };

    /**
     * Removes declarations that begin with "<!" followed by "-", "|" or "["
     * (comment openers and conditional-comment markers), up to the first ">".
     * Text between a conditional opener and its end marker is kept.
     */
    this.removeComments = function() {
        _code = _code.replace(new RegExp("<![-|\\[][^>]*>","gi"), "");
    };

    /**
     * Removes tags whose given attribute points at a local file: URL.
     * @param {string} tag  element name (regex fragment)
     * @param {string} attr attribute holding the URL (regex fragment)
     */
    this.removeLocalCalls = function(tag, attr) {
        _code = _code.replace(new RegExp("<"+tag+"\\s+[^>]*"+attr+"=\"file:[^>]+>","gi"), "");
    };

    /**
     * Applies the complete clean-up pipeline and returns the result.
     *
     * The order is significant: removeElement() only matches elements
     * without nested tags, so namespaced tags and comment markers are
     * stripped before the xml and style elements are removed.
     *
     * @return {string} cleaned HTML
     */
    this.run = function() {
        this.removeElementByPrefix("[a-zA-Z]{1,3}");
        this.removeDecorator('font');
        this.removeDecorator('span');
        this.removeEmptyElement('p');
        this.removeAttributeEverywhere('class');
        this.removeAttributeEverywhere('style');
        this.removeTagAttributes('div');
        this.removeTagAttributes('p');
        this.removeComments();
        this.removeLocalCalls('img', 'src');
        this.removeLocalCalls('link', 'href');
        this.removeSelfClosedElement("\\?xml");
        this.removeSelfClosedElement("meta");
        this.removeElement('xml');
        this.removeElement('style');
        this.removeCharRepeats(" ", " ");
        this.removeCharRepeats("\\n\\s", "\n");
        return _code;
    };

}

/**
 * Wires up the page: switches the "edit" iframe's document into design mode
 * and attaches click handlers to the controls named "reset" and "cleanUp".
 * Any exception raised while doing so is reported through alert().
 */
function Init() {
    // Resolved on every call so the handlers always address the iframe's current body.
    var editorBody = function() {
        return $('edit').contentWindow.document.body;
    };

    // Reset: hide the code view, restore the heading and empty the editor.
    var onReset = function(e) {
        $('targetCode').className = "hidden";
        $('richTextTitle').innerHTML = 'Rich Text';
        editorBody().innerHTML = '';
    };

    // Clean up: run the editor content through cleanUp and show the result
    // both as source (textarea "src") and rendered (back in the editor).
    var onCleanUp = function(e) {
        var dirtyHtml = editorBody().innerHTML;
        if (!(dirtyHtml.length > 0)) {
            alert('There is nothing to clean up. Insert rich text first.');
            return;
        }
        $('targetCode').className = "";
        $('richTextTitle').innerHTML = 'Clean Rich Text';
        var out = new cleanUp(dirtyHtml).run();
        $('src').value = out;
        editorBody().innerHTML = out;
    };

    try {
        $('edit').contentWindow.document.designMode = "on";
        $.publish($$$('reset'), 'click', onReset);
        $.publish($$$('cleanUp'), 'click', onCleanUp);
    } catch(e) {
        // Surface the failure to the user instead of failing silently.
        alert(e + " handled higher up.");
    }
}
// Run the initialisation once the window's load event fires.
window.onload = function onPageLoad() {
    Init();
};
})(); // -->
</script>
</head>
<body>

        <div class="message-box">
        <div class="header">
            <h1>Clean MS Word Utility</h1>
        </div>
        <div class="body">
        <p>
            This little utility is written by <a href="http://dsheiko.com/aboutme/">me</a> to help you clean up the HTML derived by MS-Word, which is initially full of extra code you don't need when posting the content on a blog.
        </p>
	<p>Select a fragment in MS-Word, copy it (Ctrl-C) and paste it into this application (Ctrl-V), then click the Clean up HTML button.
	You now have clean HTML.</p>

	<button name="cleanUp" type="button">Clean up HTML</button>
	<button name="reset" type="button">Reset</button>
	<h3 id="richTextTitle">Rich Text</h3>
	<iframe id="edit"></iframe>
	<div id="targetCode" class="hidden">
		<h3>Clean HTML Code</h3>
		<textarea id="src">
		</textarea>
	</div>
	<p>(C) 2010 Dmitry Sheiko</p>
        </div>
        </div>
</body>
</html>
Return current item: Clean Up MS Word Mess Utility