<?
/**
* This file requires the PEAR XML_HTMLSax3 package.
*/
require_once 'XML/HTMLSax3.php';
/**
 * Converts an HTML document to plain text using the XML_HTMLSax3 SAX parser.
 *
 * The class registers itself as the handler object of the parser and builds
 * the text incrementally in the element/data callbacks. Links are either
 * numbered in the text and listed in a trailing "References:" section, or
 * (when $build_links is true) re-emitted as <a> tags.
 *
 * The code is kept compatible with PHP 5 (no scalar type declarations, long
 * array syntax) while also loading and running on PHP 7 and 8.
 */
class Html2Text {
    /**
     * One-byte placeholder written around link text in build_links mode and
     * replaced by real <a>/</a> tags at the end of parse().
     * NOTE(review): a literal 0xFF byte in the document text is
     * indistinguishable from a placeholder.
     */
    public $anchor_char = "\xFF";

    /** Text accumulated so far / result of the last parse() call. */
    public $text = '';

    /** Declared by the original class but never used; kept for compatibility. */
    public $links_list;

    /** Link targets collected during the current parse, in document order. */
    public $link_list = array();

    /** True when links are re-emitted as <a> tags instead of references. */
    public $build_links = false;

    /** Elements whose closing tag ends the current line. */
    public $block_elements = array('div','ol','ul','table','p','blockquote');

    /** Elements whose character data is dropped entirely. */
    public $remove_elements = array('style','script');

    /**
     * Nesting counters keyed by lower-case element name, plus 'ol-count',
     * an array of running item numbers keyed by <ol> nesting depth.
     */
    public $was = array();

    /** Flags keyed by element name; 'a' is true while a collected link is open. */
    public $close = array();

    /** Initialised by the constructor; not read anywhere in this class. */
    public $text_count = 0;

    /** Initialised by the constructor; not read anywhere in this class. */
    public $links_count = 0;

    /**
     * Sets up an empty converter.
     */
    public function __construct() {
        $this->anchor_char = chr(255);
        $this->text_count = 0;
        $this->links_count = 0;
        $this->build_links = false;
        $this->link_list = array();
        $this->_resetState();
    }

    /**
     * PHP 4-style constructor name, kept so existing explicit calls such as
     * parent::Html2Text() keep working. PHP 8 no longer treats it as a
     * constructor, which is why __construct() exists.
     */
    public function Html2Text() {
        $this->__construct();
    }

    /**
     * Resets the per-document nesting counters and close flags.
     */
    protected function _resetState() {
        $this->was = array(
            'blockquote' => 0,
            'ul'         => 0,
            'ol'         => 0,
            'ol-count'   => array(),
        );
        foreach ($this->remove_elements as $element) {
            $this->was[$element] = 0;
        }
        $this->close = array();
    }

    /**
     * Returns the current nesting counter for an element (0 when unknown).
     *
     * @param string $name lower-case element name
     * @return int
     */
    protected function _depth($name) {
        if (isset($this->was[$name]) && is_int($this->was[$name])) {
            return $this->was[$name];
        }
        return 0;
    }

    /**
     * Increments the nesting counter for an element.
     *
     * @param string $name lower-case element name
     */
    protected function _bump($name) {
        $this->was[$name] = $this->_depth($name) + 1;
    }

    /**
     * Writes the bullet or number that starts a list item.
     *
     * NOTE(review): the <ul> depth is checked first, so an <ol> nested inside
     * a <ul> is rendered with bullets (unchanged from the original logic).
     */
    protected function _openListItem() {
        $ul = $this->_depth('ul');
        $ol = $this->_depth('ol');
        if ($ul > 0) {
            // One space of indentation per nesting level below the first.
            $this->text .= str_repeat(' ', $ul - 1).'* ';
        }
        else if ($ol > 0) {
            $this->text .= str_repeat(' ', $ol - 1);
            $number = isset($this->was['ol-count'][$ol]) ? $this->was['ol-count'][$ol] + 1 : 1;
            $this->was['ol-count'][$ol] = $number;
            $this->text .= $number.' ';
        }
    }

    /**
     * Records the target of an <a> element whose href is an http(s) or
     * mailto link; other anchors (including those without href) are ignored.
     *
     * @param mixed $attrs attribute map passed by the parser
     */
    protected function _openAnchor($attrs) {
        // Attribute names are matched case-insensitively, like element names.
        $attrs = is_array($attrs) ? array_change_key_case($attrs, CASE_LOWER) : array();
        $href = isset($attrs['href']) ? (string)$attrs['href'] : '';

        if (preg_match('/(http|https):\/\//', $href)) {
            $target = $href;
        }
        else if (preg_match('/mailto:(.*)$/', $href, $m)) {
            // Reference lists show the bare address; rebuilt links keep the full href.
            $target = $this->build_links ? $href : $m[1];
        }
        else {
            return;
        }

        // A link that is still open (unclosed or nested <a>) is finished first
        // so that every collected link gets exactly one closing marker.
        if (!empty($this->close['a'])) {
            $this->_finishAnchor();
        }

        $this->link_list[] = $target;
        $this->close['a'] = true;
        if ($this->build_links) {
            $this->text .= $this->anchor_char;
        }
    }

    /**
     * Writes the closing marker for the currently open link: a placeholder in
     * build_links mode, otherwise the "[n]" reference number.
     */
    protected function _finishAnchor() {
        $this->close['a'] = false;
        if ($this->build_links) {
            $this->text .= $this->anchor_char;
        }
        else {
            $this->text .= '['.count($this->link_list).']';
        }
    }

    /**
     * Parser callback for an opening tag.
     *
     * @param object $parser parser instance (unused)
     * @param string $name   element name
     * @param array  $attrs  attribute map
     * @return bool always true
     */
    public function _openHandler(&$parser, $name, $attrs) {
        $name = strtolower($name);
        switch ($name) {
            case 'hr':
                $this->text .= "\n---------------------\n";
                break;
            case 'br':
                $this->text .= "\n";
                break;
            case 'blockquote':
            case 'ul':
            case 'ol':
                $this->_bump($name);
                break;
            case 'li':
                $this->_openListItem();
                break;
            case 'a':
                $this->_openAnchor($attrs);
                break;
            default:
                if (in_array($name, $this->remove_elements)) {
                    $this->_bump($name);
                }
        }
        return true;
    }

    /**
     * Parser callback for a closing tag.
     *
     * @param object $parser parser instance (unused)
     * @param string $name   element name
     * @return bool always true
     */
    public function _closeHandler(&$parser, $name) {
        $name = strtolower($name);
        if (!empty($this->close[$name])) {
            if ($name == 'a') {
                $this->_finishAnchor();
            }
            else {
                $this->close[$name] = false;
            }
        }
        else if ($name == 'ul') {
            // Stray closing tags must not push the depth below zero.
            $depth = $this->_depth('ul');
            if ($depth > 0) {
                $this->was['ul'] = $depth - 1;
            }
        }
        else if ($name == 'ol') {
            $depth = $this->_depth('ol');
            if ($depth > 0) {
                // Forget the item counter of the list that just ended.
                unset($this->was['ol-count'][$depth]);
                $this->was['ol'] = $depth - 1;
            }
        }
        else if ($this->_depth($name) > 0) {
            $this->was[$name]--;
        }
        if (in_array($name, $this->block_elements)) {
            $this->text .= "\n";
        }
        return true;
    }

    /**
     * Parser callback for character data. Data inside removed elements is
     * dropped; inside blockquotes every line break is followed by a "> "
     * prefix (one '>' per nesting level). HTML entities are decoded.
     *
     * @param object $parser parser instance (unused)
     * @param string $data   character data
     * @return bool always true
     */
    public function _dataHandler(&$parser, $data) {
        foreach ($this->remove_elements as $element) {
            if ($this->_depth($element) > 0) {
                return true;
            }
        }
        $quoteDepth = $this->_depth('blockquote');
        if ($quoteDepth > 0) {
            $data = str_replace("\n", "\n".str_repeat('>', $quoteDepth)." ", $data);
        }
        // Use the application-wide charset when one is configured, otherwise
        // let PHP fall back to its default charset.
        if (isset($GLOBALS['DEFAULT_CHARSET'])) {
            $this->text .= html_entity_decode($data, ENT_NOQUOTES, $GLOBALS['DEFAULT_CHARSET']);
        }
        else {
            $this->text .= html_entity_decode($data, ENT_NOQUOTES);
        }
        return true;
    }

    /**
     * Parser callback for escape sequences (comments, doctype, ...); ignored.
     *
     * @param object $parser parser instance (unused)
     * @param string $data   escape content
     * @return bool always true
     */
    public function _escapeHandler(&$parser, $data) {
        return true;
    }

    /**
     * Returns the text accumulated so far.
     *
     * @return string
     */
    public function getText() {
        return $this->text;
    }

    /**
     * Empties the accumulated text.
     *
     * @return bool always true
     */
    public function clear() {
        $this->text = '';
        return true;
    }

    /**
     * Main parsing function: converts an HTML document to text.
     *
     * @param string $doc         HTML source
     * @param int    $width       wrap column; values <= 0 disable wrapping
     * @param bool   $build_links true to re-emit links as <a> tags, false to
     *                            number them and append a "References:" list
     * @return string the converted text (also available via getText())
     */
    public function parse($doc, $width = -1, $build_links = false) {
        $this->text = '';
        $this->build_links = $build_links;
        // Start every document with clean nesting state, so a malformed
        // document cannot affect the next one.
        $this->_resetState();

        // Save all '<' symbols that cannot start a tag by escaping them;
        // the data handler decodes the entity again.
        $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);
        // Web documents shouldn't contain the \x00 symbol
        $doc = str_replace("\x00", '', $doc);
        // Opera6 bug workaround: overlong UTF-8 form of '<', kept as text
        $doc = str_replace("\xC0\xBC", '&lt;', $doc);
        // UTF-7 encoding ASCII decode
        $doc = $this->repackUTF7($doc);

        // Instantiate the parser (objects are handles; "=& new" is a parse
        // error since PHP 7)
        $parser = new XML_HTMLSax3();
        // Set up the parser
        $parser->set_object($this);
        $parser->set_element_handler('_openHandler','_closeHandler');
        $parser->set_data_handler('_dataHandler');
        $parser->set_escape_handler('_escapeHandler');
        $parser->parse($doc);

        // A link left open at the end of the document still needs its marker.
        if (!empty($this->close['a'])) {
            $this->_finishAnchor();
        }

        // Collapse every whitespace run containing two or more newlines into
        // a single newline. No consecutive newlines remain afterwards, so the
        // former "3 or more newlines" replacement could never match.
        $this->text = preg_replace("/\n\s*\n/", "\n", $this->text);
        $this->text = trim($this->text);
        if ($width > 0) {
            $this->text = wordwrap($this->text, $width);
        }

        if ($this->build_links) {
            // Replace each placeholder pair with an opening and closing tag.
            foreach ($this->link_list as $link) {
                $start = strpos($this->text, $this->anchor_char);
                if ($start === false) {
                    break;
                }
                $this->text = substr_replace($this->text,'<a href="http://www.google.com/gwt/n?&mrestrict=xhtml&u='.urlencode($link).'">',$start,1);
                $end = strpos($this->text, $this->anchor_char);
                if ($end === false) {
                    // Never leave an opened tag without its closing tag.
                    $this->text .= '</a>';
                    break;
                }
                $this->text = substr_replace($this->text,'</a>',$end,1);
            }
        }
        else if (!empty($this->link_list)) {
            // build list of links
            $this->text .= "\nReferences:\n";
            foreach ($this->link_list as $k => $link) {
                $this->text .= sprintf('%3d : ', $k + 1);
                $this->text .= $link."\n";
            }
        }
        $this->link_list = array();
        return $this->text;
    }

    /**
     * UTF-7 decoding function: rewrites every "+<base64>-" sequence so that
     * ASCII characters hidden in it appear as plain characters.
     *
     * @param string $str input text
     * @return string
     */
    public function repackUTF7($str) {
        return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
    }

    /**
     * Additional UTF-7 decoding function (callback of repackUTF7()).
     *
     * The payload is base64-decoded and read as two-byte units. Units with a
     * zero first byte are emitted as their second byte; a run of units with a
     * non-zero first byte is re-encoded by repackUTF7Back().
     *
     * @param array $str regex match: [0] whole sequence, [1] base64 payload
     * @return string
     */
    public function repackUTF7Callback($str) {
        $decoded = base64_decode($str[1]);
        // A payload that decodes to nothing or to an odd number of bytes is
        // not made of whole two-byte units (e.g. the "+2-" in "1+2-3"), so
        // the sequence is left untouched instead of being mangled.
        if ($decoded === false || $decoded === '' || strlen($decoded) % 2 !== 0) {
            return $str[0];
        }
        $decoded = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $decoded);
        return preg_replace('/\x00(.)/', '$1', $decoded);
    }

    /**
     * Additional UTF-7 encoding function (callback of repackUTF7Callback()).
     *
     * @param array $str regex match: [1] leading units with a zero first
     *                   byte, [2] run of units with a non-zero first byte
     * @return string [1] followed by [2] re-encoded as "+<base64>-"
     */
    public function repackUTF7Back($str) {
        return $str[1].'+'.rtrim(base64_encode($str[2]), '=').'-';
    }
}
?>