Location: PHPKode > projects > CsWebmail > cswebmail-3.10/cswebmail-3.10/include/html2text.php
<?php
/**
 * HTML-to-plain-text conversion helper.
 *
 * This package requires the HTMLSax3 package; XML/HTMLSax3.php is loaded
 * with require_once, so it must be reachable through the include_path.
 *
 * The full "<?php" open tag is used on purpose: the short "<?" form is only
 * recognised when the short_open_tag ini setting is enabled.
 */
require_once 'XML/HTMLSax3.php';

/**
 * Converts an HTML document to plain text.
 *
 * The class registers itself as the handler object of an XML_HTMLSax3 parser
 * and builds the text incrementally in the _openHandler / _closeHandler /
 * _dataHandler callbacks. Links are either listed in a "References:" footer
 * (default) or re-emitted as <a> tags (build_links mode).
 */
class Html2Text {
  /** Single-byte placeholder written where an <a>/</a> tag goes in build_links mode. */
  public $anchor_char;
  /** Text accumulated so far by the handlers. */
  public $text = '';
  /** Legacy (misspelled) declaration; nothing in this class uses it. Kept for backward compatibility. */
  public $links_list;
  /** Link targets collected from <a href="..."> in document order. */
  public $link_list = array();
  /** When true, parse() re-emits links as <a> tags instead of a reference list. */
  public $build_links = false;
  /** Elements whose closing tag produces a line break. */
  public $block_elements = array('div','ol','ul','table','p','blockquote');
  /** Elements whose character data is dropped entirely. */
  public $remove_elements = array('style','script');
  /** Nesting counters per element name, plus 'ol-count' (item counter per <ol> level). */
  public $was = array();
  /** Flags for elements whose closing tag needs special output (only 'a' is used). */
  public $close = array();
  /** Currently open list elements ('ul' / 'ol'), outermost first. */
  public $list_stack = array();
  /** Set to 0 by the constructor; not used elsewhere in this class. */
  public $text_count = 0;
  /** Set to 0 by the constructor; not used elsewhere in this class. */
  public $links_count = 0;

  /**
   * Initialises the converter with empty state.
   */
  public function __construct(){
    $this->anchor_char = chr(255);
    $this->text_count = 0;
    $this->links_count = 0;
    $this->build_links = false;
    $this->link_list = array();
    $this->_resetState();
  }

  /**
   * PHP 4 style constructor name, kept so existing explicit calls
   * (e.g. parent::Html2Text()) keep working. PHP 8 no longer treats it as a
   * constructor, which is why __construct() does the real work.
   */
  public function Html2Text(){
    $this->__construct();
  }

  /**
   * Clears all per-document parser state (nesting counters, close flags,
   * list stack). Does not touch the text or the collected links.
   */
  private function _resetState(){
    $this->was = array('blockquote'=>0,'ul'=>0,'ol'=>0,'ol-count'=>array());
    $this->close = array();
    $this->list_stack = array();
  }

  /**
   * Current nesting depth recorded for an element name (0 when unknown).
   */
  private function _depth($name){
    return (isset($this->was[$name]) && is_int($this->was[$name])) ? $this->was[$name] : 0;
  }

  /**
   * Records that one more element of the given name is open.
   */
  private function _enter($name){
    $this->was[$name] = $this->_depth($name) + 1;
  }

  /**
   * Records that an element of the given name was closed; never goes below 0
   * so a stray closing tag cannot corrupt later output.
   */
  private function _leave($name){
    $depth = $this->_depth($name);
    if($depth > 0){
      $this->was[$name] = $depth - 1;
    }
  }

  /**
   * Removes the innermost open list of the given type from the list stack.
   */
  private function _popList($type){
    for($i = count($this->list_stack) - 1; $i >= 0; $i--){
      if($this->list_stack[$i] == $type){
        array_splice($this->list_stack, $i, 1);
        return;
      }
    }
  }

  /**
   * Appends the marker for a list item: two spaces of indent per enclosing
   * list level, then "* " for <ul> or "<n> " for <ol>. The innermost open
   * list decides the marker type, so an <ol> nested in a <ul> is numbered.
   * Outside of any list nothing is written.
   */
  private function _appendListMarker(){
    $depth = count($this->list_stack);
    if($depth == 0){
      return;
    }
    $this->text .= str_repeat('  ', $depth - 1);
    if($this->list_stack[$depth - 1] == 'ul'){
      $this->text .= '* ';
      return;
    }
    $level = $this->_depth('ol');
    if(!isset($this->was['ol-count']) || !is_array($this->was['ol-count'])){
      $this->was['ol-count'] = array();
    }
    if(!isset($this->was['ol-count'][$level])){
      $this->was['ol-count'][$level] = 0;
    }
    $this->was['ol-count'][$level]++;
    $this->text .= $this->was['ol-count'][$level].' ';
  }

  /**
   * Handles an opening <a>: remembers http(s) and mailto targets and arms the
   * close flag so _closeHandler() emits the matching marker. Anchors without
   * a usable href are ignored.
   *
   * @param mixed $attrs attribute map as delivered by the parser
   */
  private function _openAnchor($attrs){
    // Tag names are matched case-insensitively, so do the same for attribute names.
    $attrs = is_array($attrs) ? array_change_key_case($attrs, CASE_LOWER) : array();
    if(!isset($attrs['href']) || !is_string($attrs['href'])){
      return;
    }
    $href = $attrs['href'];
    $target = null;
    // URI schemes are case-insensitive, hence the "i" modifier.
    if(preg_match('/(http|https):\/\//i',$href)){
      $target = $href;
    }
    else if(preg_match('/mailto:(.*)$/i',$href,$m)){
      // The reference list shows the bare address, build_links keeps the full href.
      $target = $this->build_links ? $href : $m[1];
    }
    if($target === null){
      return;
    }
    $this->link_list[] = $target;
    $this->close['a'] = true;
    if($this->build_links){
      $this->text .= $this->anchor_char;
    }
  }

  /**
   * Charset passed to html_entity_decode(): $GLOBALS['DEFAULT_CHARSET'] when
   * the application defined it, otherwise the default_charset ini value,
   * otherwise UTF-8.
   */
  private function _charset(){
    if(isset($GLOBALS['DEFAULT_CHARSET']) && is_string($GLOBALS['DEFAULT_CHARSET']) && $GLOBALS['DEFAULT_CHARSET'] !== ''){
      return $GLOBALS['DEFAULT_CHARSET'];
    }
    $ini = ini_get('default_charset');
    return (is_string($ini) && $ini !== '') ? $ini : 'UTF-8';
  }

  /**
   * SAX callback for an opening tag.
   *
   * @param object $parser the parser invoking the callback (unused)
   * @param string $name   tag name, any case
   * @param array  $attrs  attribute map
   * @return bool always true
   */
  public function _openHandler(&$parser, $name, $attrs) {
    $name = strtolower($name);

    if($name == 'hr'){
      $this->text .= "\n---------------------\n";
    }
    else if($name == 'br'){
      $this->text .= "\n";
    }
    else if($name == 'blockquote'){
      $this->_enter('blockquote');
    }
    else if($name == 'ul'){
      $this->_enter('ul');
      $this->list_stack[] = 'ul';
    }
    else if($name == 'ol'){
      $this->_enter('ol');
      $this->list_stack[] = 'ol';
      // Numbering of a freshly opened list starts at 1.
      if(!isset($this->was['ol-count']) || !is_array($this->was['ol-count'])){
        $this->was['ol-count'] = array();
      }
      $this->was['ol-count'][$this->_depth('ol')] = 0;
    }
    else if($name == 'li'){
      $this->_appendListMarker();
    }
    else if($name == 'a'){
      $this->_openAnchor($attrs);
    }
    else if(in_array($name,$this->remove_elements)){
      $this->_enter($name);
    }
    return true;
  }

  /**
   * SAX callback for a closing tag.
   *
   * @param object $parser the parser invoking the callback (unused)
   * @param string $name   tag name, any case
   * @return bool always true
   */
  public function _closeHandler(&$parser,$name) {
    $name = strtolower($name);
    if(!empty($this->close[$name])){
      $this->close[$name] = false;
      if($name == 'a'){
        if($this->build_links)
          $this->text .= $this->anchor_char;
        else
          $this->text .= '['.count($this->link_list).']';
      }
    }
    else if($name == 'ul'){
      // Ignore a closing tag that has no matching opening tag.
      if($this->_depth('ul') > 0){
        $this->was['ul']--;
        $this->_popList('ul');
      }
    }
    else if($name == 'ol'){
      if($this->_depth('ol') > 0){
        unset($this->was['ol-count'][$this->was['ol']]);
        $this->was['ol']--;
        $this->_popList('ol');
      }
    }
    else{
      $this->_leave($name);
    }

    if(in_array($name,$this->block_elements)){
      $this->text .= "\n";
    }
    return true;
  }

  /**
   * SAX callback for character data. Data inside a removed element
   * (style/script by default) is dropped; inside blockquotes every new line
   * is prefixed with one '>' per nesting level. HTML entities are decoded
   * (quotes are left encoded, see ENT_NOQUOTES).
   *
   * @param object $parser the parser invoking the callback (unused)
   * @param string $data   raw character data
   * @return bool always true
   */
  public function _dataHandler(&$parser, $data) {
    foreach($this->remove_elements as $re){
      if($this->_depth($re) > 0) return true;
    }
    $quote_depth = $this->_depth('blockquote');
    if($quote_depth > 0){
      $data = str_replace("\n","\n".str_repeat('>',$quote_depth)." ",$data);
    }
    $this->text .= html_entity_decode($data,ENT_NOQUOTES,$this->_charset());
    return true;
  }

  /**
   * SAX callback for escape sequences; their content is discarded.
   *
   * @return bool always true
   */
  public function _escapeHandler(&$parser, $data) {
    return true;
  }

  /**
   * Returns the text accumulated so far.
   */
  public function getText(){
    return $this->text;
  }

  /**
   * Empties the accumulated text (collected links and nesting state are kept).
   *
   * @return bool always true
   */
  public function clear(){
    $this->text = '';
    return true;
  }

  /**
   * Main parsing function.
   *
   * Requires the XML_HTMLSax3 class to be loaded.
   *
   * @param string $doc         HTML document
   * @param int    $width       wrap the text at this column when > 0
   * @param bool   $build_links true: re-emit links as <a> tags;
   *                            false: append a numbered "References:" list
   * @return string the converted text
   */
  public function parse($doc,$width=-1,$build_links=false){
    // Start from a clean slate so an unbalanced previous document
    // (e.g. an unclosed <script>) cannot affect this one.
    $this->text = '';
    $this->build_links = $build_links;
    $this->link_list = array();
    $this->_resetState();

    // Save all '<' symbols that cannot start a tag
    $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);

    // Web documents shouldn't contain the \x00 symbol
    $doc = str_replace("\x00", '', $doc);

    // Opera6 bug workaround
    $doc = str_replace("\xC0\xBC", '&lt;', $doc);

    // UTF-7 encoding ASCII decode
    $doc = $this->repackUTF7($doc);

    // Instantiate the parser ("=& new" is a parse error since PHP 7)
    $parser = new XML_HTMLSax3();

    // Set up the parser
    $parser->set_object($this);

    $parser->set_element_handler('_openHandler','_closeHandler');
    $parser->set_data_handler('_dataHandler');
    $parser->set_escape_handler('_escapeHandler');

    $parser->parse($doc);

    // Collapse every run of blank / whitespace-only lines into one line break.
    // (This leaves no consecutive newlines behind, so no second pass is needed.)
    $this->text = preg_replace("/\n\s*\n/","\n",$this->text);
    $this->text = trim($this->text);

    if($width > 0)
      $this->text = wordwrap($this->text,$width);

    // build list of links
    if($this->build_links){
      $offset = 0;
      foreach($this->link_list as $link){
        $start = strpos($this->text,$this->anchor_char,$offset);
        if($start === false){
          // No placeholder left: nothing more to substitute.
          break;
        }
        $open = '<a href="http://www.google.com/gwt/n?&amp;mrestrict=xhtml&amp;u='.urlencode($link).'">';
        $this->text = substr_replace($this->text,$open,$start,1);
        $offset = $start + strlen($open);

        $end = strpos($this->text,$this->anchor_char,$offset);
        if($end === false){
          // The document never closed this anchor; close it at the end.
          $this->text .= '</a>';
          break;
        }
        $this->text = substr_replace($this->text,'</a>',$end,1);
        $offset = $end + strlen('</a>');
      }
    }
    else if(!empty($this->link_list)){
      $this->text .= "\nReferences:\n";
      foreach($this->link_list as $k=>$link){
        $this->text .= sprintf('%3d : ',$k+1);
        $this->text .= $link."\n";
      }
    }
    $this->link_list = array();

    return $this->text;
  }


  /**
   * UTF-7 decoding function: rewrites every "+<base64>-" sequence through
   * repackUTF7Callback() so that ASCII characters hidden in UTF-7 (such as
   * "+ADw-" for "<") become visible to the parser.
   *
   * @param string $str document text
   * @return string
   */
  public function repackUTF7($str){
    return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
  }

  /**
   * Additional UTF-7 decoding function (preg callback).
   *
   * Decodes the base64 payload into 2-byte units. A run of units whose first
   * byte is non-zero is re-encoded by repackUTF7Back(); from the remaining
   * units the leading \x00 byte is stripped, leaving the plain character.
   * The "s" modifier lets "." match a newline byte as well, so an encoded
   * line feed does not leave a stray NUL behind.
   *
   * @param array $str preg match; index 1 holds the base64 payload
   * @return string
   */
  public function repackUTF7Callback($str){
    $str = base64_decode($str[1]);
    $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/s', array($this, 'repackUTF7Back'), $str);
    return preg_replace('/\x00(.)/s', '$1', $str);
  }

  /**
   * Additional UTF-7 encoding function (preg callback): keeps group 1 as is
   * and wraps group 2 back into an unpadded "+<base64>-" sequence.
   *
   * @param array $str preg match with groups 1 and 2
   * @return string
   */
  public function repackUTF7Back($str){
    return $str[1].'+'.rtrim(base64_encode($str[2]), '=').'-';
  }
}

?>
Return current item: CsWebmail