<?php
/**
* html2text converts HTML Markup to Markdown [1]. It also supports
* Markdown Extra [2] by Michel Fortin [3].
*
* It started as a port of Aaron Swartz' [4] html2text.py [5] but
* has come a long way since. This is more than a mere port now!
*
* [1]: http://daringfireball.net/projects/markdown
* [2]: http://www.michelf.com/projects/php-markdown/extra/
* [3]: http://www.michelf.com/
* [4]: http://www.aaronsw.com/
* [5]: http://www.aaronsw.com/2002/html2text/
*
* @version 1.4
* @author Milian Wolff (hide@address.com,http://milianw.de)
* @license LGPL, see LICENSE.txt and below
* @copyright (C) 2007 Milian Wolff
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
class html2text {
# input html (as prepared by load_string())
var $html = '';
# output markdown, accumulated by out()
var $outtext = '';
# output control state
# $p_p: number of line breaks o() still has to emit before the next data (set by p() and pbr())
var $p_p = 0;
# $start: when truthy, o() swallows a single-space chunk and resets $p_p before writing
var $start = 1;
# NOTE(review): $space is not referenced by any method visible in this file
var $space = 0;
# true while every tag is written out as raw html (see force_html())
var $force_html = false;
# tag name and nesting depth at which raw html output was switched on
var $force_html_start = array('tag'=>'','parents'=>0);
# links
# $a: pending reference links, keyed by href (filled by previousIndex(), flushed by links())
var $a = array();
# $astack: attributes of the currently open <a> elements (null for anchors without href)
var $astack = array();
# $acount: running number used for reference links
var $acount = 0;
# lists
# $list: stack of open <ul>/<ol> elements, each array('name' => tag, 'num' => item counter)
var $list = array ();
# NOTE(review): $list_depth is not referenced by any method visible in this file
var $list_depth = 0;
# whether the last non-empty chunk written by o() ended with a newline
var $lastWasNL = false;
# indenting and appending
# $append: text written when the current heading closes (e.g. ' {#id}')
var $append = '';
# $indent: prefix inserted after every newline that passes through o()
var $indent = '';
# these elements will be dropped with all subelements
var $drop = array(
'script',
'head',
'style',
'form',
);
# these elements will be quietly ignored, their children will be parsed
var $ignore = array(
'wrapper', # important! load_string() wraps the whole document in this tag
'html', # closing html tag
'body', # closing body tag
'thead',
'tbody',
'tfoot',
);
# attributes that do not prevent a tag from being converted (checked in keep_tag())
var $has_attrs = array(
# tag => list of allowed attrs
'h1' => array('id'),
'h2' => array('id'),
'h3' => array('id'),
'h4' => array('id'),
'h5' => array('id'),
'h6' => array('id'),
'a' => array('href','title'),
'img' => array('src','alt','title'),
# tables
'th' => array('align'),
'td' => array('align'),
# footnotes
'sup' => array('id'),
'footnote' => array('nr'),
# abbreviations
'abbr' => array('title'),
'acronym' => array('title'),
);
# table state
# $max_len: column => widest cell length, computed when </table> is handled
var $max_len = array();
# $align: column => 'left' | 'center' | 'right' | null
var $align = array();
# $cols: column => (row => cell text length), filled by out()
var $cols = array();
# $rows: row => (column => cell text), filled by out()
var $rows = array();
# current column / row counters
var $col = 0;
var $row = 0;
# row numbers that contain <th> cells
var $header = array();
# parents: tag name => string with one character per open element of that name,
# '1' for a converted (parsed) element and '2' for one kept as html
var $parents = array();
# abbreviations: used both as a stack of pending titles and as map abbr => definition
var $abbrs = array();
# output buffers, indexed by nesting level (see buffer() / unbuffer())
var $buffer = array();
var $buffer_lvl = 0;
# options (set by the constructor)
var $LINKS_EACH_PARAGRAPH;
var $BODY_WIDTH;
var $KEEP_HTML;
# xml parser resource/object shared by all parsing methods
var $xml_parser;
/**
 * Store the options and set up the XML parser.
 *
 * $links_each_paragraph: if true, the list of reference links is written
 * after each block element, otherwise once at the end of the document.
 * $body_width: target line width in characters; optwrap() leaves the text
 * unwrapped for values below 30.
 * $keep_html: if true, html that cannot be expressed in Markdown is kept,
 * otherwise it is removed.
 *
 * The handlers are registered as array callables, so no xml_set_object()
 * call is needed.
 *
 * @param bool $links_each_paragraph default true
 * @param integer $body_width default 0
 * @param bool $keep_html default true
 * @return void
 */
function __construct($links_each_paragraph = true, $body_width = 0, $keep_html = true) {
    $this->LINKS_EACH_PARAGRAPH = $links_each_paragraph;
    $this->BODY_WIDTH = $body_width;
    $this->KEEP_HTML = $keep_html;

    $this->xml_parser = xml_parser_create();
    # keep tag names as written instead of upper-casing them
    xml_parser_set_option($this->xml_parser, XML_OPTION_CASE_FOLDING, 0);
    xml_set_element_handler($this->xml_parser, array($this, 'starttag'), array($this, 'endtag'));
    xml_set_character_data_handler($this->xml_parser, array($this, 'handle_data'));
    xml_set_default_handler($this->xml_parser, array($this, 'handle_default'));
}
/**
 * PHP 4 style constructor, kept so that existing explicit calls such as
 * parent::html2text(...) keep working. PHP 8 no longer treats a method named
 * like the class as a constructor, so the real work lives in __construct().
 *
 * @param bool $links_each_paragraph default true
 * @param integer $body_width default 0
 * @param bool $keep_html default true
 * @return void
 */
function html2text($links_each_paragraph = true, $body_width = 0, $keep_html = true) {
    $this->__construct($links_each_paragraph, $body_width, $keep_html);
}
/**
 * Convert an html string to Markdown.
 *
 * The markup is first normalised with a series of regular expressions so
 * that the XML parser accepts it, then parsed and finally post-processed by
 * close().
 *
 * @param string $html
 * @return string the converted text ('' for empty input)
 */
function load_string($html) {
    $html = trim($html);
    if (empty($html)) {
        return '';
    }
    # use unix style newlines
    $html = str_replace("\r", "\n", str_replace("\r\n", "\n", $html));
    # replace everything up to and including the opening <body> tag (doctype, xml declaration, ...)
    $html = preg_replace('#^.*<body[^>]*>#Us', '<html><body>', $html);
    /*
     * cope with bad html
     */
    # self-close <img> tags
    $html = preg_replace('/<img(?!.*\/>)([^>]*)>/Us', '<img$1 />', $html);
    # NOTE(review): as written the next two replacements substitute a string with
    # itself; presumably the replacement values were entity-encoded once - confirm
    # against the upstream source before changing them.
    $html = preg_replace('#&(?!amp;)#', '&', $html);
    $html = str_replace('<', '<', $html);
    # normalise self-closing tags to "<tag />"
    $html = preg_replace('#<([a-z]+[^>]*) ?/>#Us', '<$1 />', $html);
    # unmatched tags (poor performance)
    # NOTE(review): the replacement rebuilds exactly what was matched - confirm intent.
    preg_match_all('#<(([a-z]|h[1-6])+)(?= |>)#', $html, $matches);
    foreach ($matches[1] as $tag) {
        $html = preg_replace('#<'.$tag.'( |>)(.*)</'.$tag.'>#Us', '<'.$tag.'$1$2</'.$tag.'>', $html, 1);
    }
    # re-encode the content of <pre>|<code>
    # (closure instead of create_function(), which no longer exists in PHP 8)
    # NOTE(review): the inner str_replace() is a no-op as written, see the note above.
    $html = preg_replace_callback(
        '#(<pre[^>]*>\s*<code[^>]*>|<code[^>]*>|<pre[^>]*>)(.*)(</pre>\s*</code>|</code>|</pre>)#Us',
        function ($matches) {
            return $matches[1].str_replace('<', '<', $matches[2]).$matches[3];
        },
        $html
    );
    # handle empty attributes (e.g. <input checked>)
    $html = preg_replace_callback('#<([a-z]+)(?>[^>]* [^=]+(?> [^>]*)?) ?/?>#s', array($this, 'parse_empty_attribs'), $html);
    # fake wrapper
    $html = '<wrapper>'.$html.'</wrapper>';
    # footnotes
    $html = preg_replace_callback('#<div class="footnotes">\s*<hr />\s*<ol>\s*(<li id="fn:\d+">.+</li>)\s*</ol>\s*</div>#Us', array($this, 'footnotes'), $html);
    # last newline inside <pre> should not be parsed
    $html = preg_replace('#\n</code></pre>#s', '</code></pre>', $html);
    # some html elements should not be parsed if their children wont be parsed:
    if ($this->KEEP_HTML) {
        # <ul|ol><li class="asdf">, complex because we need to handle nested lists
        if (preg_match('#<li [^>]+>#', $html)) {
            preg_match_all('#(?:<li [^>]+>|</?(?:ul|ol)[^>]*>)#', $html, $matches, PREG_OFFSET_CAPTURE);
            $ins = ' forcehtml="1"';
            # stack of currently open lists
            $open_lists = array();
            # offsets (in the unmodified string) where $ins has to be inserted
            $insert_at = array();
            foreach ($matches[0] as $match) {
                $tag_text = $match[0];
                $tag_pos = $match[1];
                if (substr($tag_text, 0, 3) == '<li') {
                    if (empty($open_lists)) {
                        # <li> outside of any list: there is no list tag to mark
                        continue;
                    }
                    $top = count($open_lists) - 1;
                    if (!$open_lists[$top]['forced']) {
                        $open_lists[$top]['forced'] = true;
                        $insert_at[] = $open_lists[$top]['offset'];
                    }
                } elseif (substr($tag_text, 0, 2) == '</') { # close tag
                    array_pop($open_lists);
                } else { # open tag
                    array_push($open_lists, array(
                        # position right behind "<ul" / "<ol"
                        'offset' => $tag_pos + 3,
                        'forced' => strstr($tag_text, 'forcehtml=') !== false,
                    ));
                }
            }
            # insert from the back so that earlier offsets stay valid; adding a
            # running offset instead would misplace the attribute of an outer
            # list that is marked after one of its nested lists
            rsort($insert_at);
            foreach ($insert_at as $pos) {
                $html = substr_replace($html, $ins, $pos, 0);
            }
        }
        # <pre><code class="asdf">
        $html = preg_replace('#(?><pre>)\s*(<code .+>)#Us', '<pre forcehtml="1">$1', $html);
    }
    $this->html = $html;
    # ok, now lets start parsing!
    $this->parse();
    return $this->close();
}
/**
 * Rewrite a Markdown Extra footnote list into <footnotes>/<footnote> tags.
 *
 * Callback for preg_replace_callback(); $matches[1] holds the <li> items of
 * the footnote list.
 *
 * @param array $matches
 * @return string
 */
function footnotes($matches){
    $items = $matches[1];
    # drop the back reference link of every footnote
    $items = preg_replace('@<a href="#fnref:\d+" rev="footnote"[^>]*>&#8617;</a>@U', '', $items);
    # drop paragraphs that became empty
    $items = str_replace('<p></p>', '', $items);
    # <li id="fn:1">... -> <footnote nr="1">..., wrapped in a <footnotes> tag
    $wrapped = str_replace('<li id="fn:', '<footnote nr="', '<footnotes>'.$items.'</footnotes>');
    # ...</li> -> ...</footnote>
    return preg_replace('#</li>\s*(<footnote|</footnotes)#s', '</footnote>$1', $wrapped);
}
/**
 * Give attributes without a value (e.g. <input checked>) a value so that the
 * tag becomes valid XML.
 *
 * Callback for preg_replace_callback(): $matches[0] is the complete tag and
 * $matches[1] its name. Tags whose attributes are all of the form
 * name="value" are returned unchanged. Otherwise valueless attributes become
 * name="name" when KEEP_HTML is set and are removed when it is not; unquoted
 * values get double quotes.
 *
 * @param array $matches
 * @return string
 */
function parse_empty_attribs($matches){
    $tag = $matches[0];
    if (preg_match('#^<[a-z]+(?: [a-z]+=(?:"[^"]*"|\'[^\']*\'))+ ?/?>$#s', $tag)) {
        # mismatch, this tag is correct
        return $tag;
    }
    $name = $matches[1];
    # strip "<name" in front and ">" (or "/>") at the end
    $inner = rtrim(substr($tag, strlen($name) + 1));
    if (substr($inner, -1) === '>') {
        $inner = rtrim(substr($inner, 0, -1));
    }
    $self_closing = substr($inner, -1) === '/';
    if ($self_closing) {
        $inner = rtrim(substr($inner, 0, -1));
    }
    preg_match_all('#([^\s=/>"\']+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s"\'>]+))?#s', $inner, $found, PREG_SET_ORDER);
    $attr = '';
    foreach ($found as $item) {
        $key = $item[1];
        if (isset($item[2]) && $item[2] !== '') {
            $value = $item[2];
            if ($value[0] !== '"' && $value[0] !== "'") {
                $value = '"'.$value.'"';
            }
            $attr .= ' '.$key.'='.$value;
        } elseif ($this->KEEP_HTML) {
            $attr .= ' '.$key.'="'.$key.'"';
        }
    }
    return '<'.$name.$attr.($self_closing ? ' />' : '>');
}
/**
 * Convert the html stored in a file to Markdown.
 *
 * Only a failed read is reported as an error; an empty file is passed on to
 * load_string() like any other content.
 *
 * @param string $file path (or URL wrapper) readable by file_get_contents()
 * @return string|false the converted text, false if the file could not be read
 */
function load_file($file) {
    $contents = file_get_contents($file);
    # compare strictly: '' and '0' are valid file contents, only false is a failure
    if ($contents === false) {
        trigger_error('could not open XML input', E_USER_WARNING);
        return false;
    }
    return $this->load_string($contents);
}
/**
 * Feed $this->html to the XML parser, one line at a time.
 *
 * A parse error is reported as E_USER_WARNING together with the offending
 * line; parsing then continues with the next line.
 *
 * @param void
 * @return void
 */
function parse() {
    foreach (explode("\n", $this->html) as $line) {
        if (xml_parse($this->xml_parser, $line . "\n")) {
            continue;
        }
        $errcode = xml_get_error_code($this->xml_parser);
        $message = sprintf(
            "XML error #%d: %s at line %d:<br /><pre><code>%s</code></pre>",
            $errcode,
            xml_error_string($errcode),
            xml_get_current_line_number($this->xml_parser),
            htmlspecialchars($line)
        );
        trigger_error($message, E_USER_WARNING);
    }
}
/**
 * Free the parser, flush pending output and links, and return the cleaned-up
 * (and optionally wrapped) text.
 *
 * @param void
 * @return string
 */
function close() {
    xml_parser_free($this->xml_parser);
    $this->pbr();
    $this->o('', false, 'end');
    $this->out("\n");
    $this->links();

    $text = $this->outtext;
    # blockquotes: "> > " -> ">> "
    $text = preg_replace_callback('#^(\s*)((> )+)#m', array(&$this,'cleanup_bq'), $text);
    # cleanup
    $text = str_replace('>', '>', $text);
    $text = str_replace('<', '<', $text);
    $text = str_replace('&', '&', $text);
    # empty lines (not preformatted)
    $text = preg_replace('#^\s{1,4}$#m', '', $text);
    # empty quoted lines
    $text = preg_replace('#^(>+)\s{1,5}$#m', '$1', $text);
    $this->outtext = $text;

    return rtrim($this->optwrap($this->outtext));
}
/**
 * Collapse a blockquote prefix: "> > > " becomes ">>> ".
 *
 * Callback for preg_replace_callback(): $m[1] is the leading whitespace and
 * $m[2] the run of "> " markers.
 *
 * @param array $m matches
 * @return string
 */
function cleanup_bq($m){
    $leading = $m[1];
    # every marker is two characters long ("> ")
    $depth = strlen($m[2]) / 2;
    return $leading.str_repeat('>', $depth).' ';
}
/**
 * Default handler of the XML parser: copies html comments to the output and
 * ignores everything else it receives.
 *
 * @param resource $parser
 * @param string $data
 * @return void
 */
function handle_default($parser,$data){
    $is_comment = substr($data, 0, 4) == '<!--' && substr($data, -3) == '-->';
    if (!$is_comment) {
        return;
    }
    $this->outtext .= "\n\n".$data."\n";
}
/**
 * Character data handler of the XML parser (e.g. the DATA in <p>DATA</p>).
 * The text is passed to o() flagged as pure data.
 *
 * @param resource $parser
 * @param string $data
 * @return void
 */
function handle_data($parser, $data) {
    $puredata = true;
    $this->o($data, $puredata);
}
/**
 * Start element handler of the XML parser (e.g. <p>); delegates to
 * handle_tag() with $start = true.
 *
 * @param resource $parser
 * @param string $tag
 * @param array $attrs
 * @return void
 */
function starttag($parser, $tag, $attrs) {
    $is_start = true;
    $this->handle_tag($tag, $attrs, $is_start);
}
/**
 * End element handler of the XML parser (e.g. </p>); delegates to
 * handle_tag() without attributes and with $start = false.
 *
 * @param resource $parser
 * @param string $tag
 * @return void
 */
function endtag($parser, $tag) {
    $is_start = false;
    $this->handle_tag($tag, null, $is_start);
}
/**
 * Switch to raw html output for all children of $tag and remember at which
 * nesting depth of that tag the mode was entered.
 *
 * @param string $tag
 * @return void
 */
function force_html($tag){
    # number of elements of this name that are currently open
    $depth = 0;
    if (isset($this->parents[$tag])) {
        $depth = strlen($this->parents[$tag]);
    }
    $this->force_html = true;
    $this->force_html_start = array('tag' => $tag, 'parents' => $depth);
}
/**
 * Central dispatcher for start and end tags.
 *
 * Tags listed in $this->drop are buffered and discarded, tags in
 * $this->ignore are skipped (their children are still handled). With
 * KEEP_HTML enabled, tags that cannot be converted are written out as html
 * via keep_tag(). Everything else is converted by the switch below; after a
 * converted tag, $this->parents is updated with a '1' marker.
 *
 * @param string $tag tag name
 * @param array $attrs attributes of a start tag, null for an end tag
 * @param bool $start true for a start tag, false for an end tag
 * @return void
 */
function handle_tag($tag, $attrs, $start) {
if(in_array($tag,$this->drop)){ # drop tags with content
# everything between the start and end tag goes into a buffer whose content is never used
if($start){
$this->buffer();
} else {
$this->unbuffer();
}
return;
}
if(in_array($tag,$this->ignore)){ # drop tags but keep content
return;
}
# keeping the original html
if($this->KEEP_HTML){
if($start){
# is the force html attr set?
if(!$this->force_html && isset($attrs['forcehtml'])){
$this->force_html($tag);
}
# we'll have to keep this tag
if($this->force_html) {
$this->keep_tag($tag,$attrs,$start,true);
return;
} else {
# tag has attrs which can't be converted
if(!empty($attrs) && $this->keep_tag($tag,$attrs,$start)){
return;
}
}
} else {
if($this->force_html){
$this->keep_tag($tag,$attrs,$start,true);
# leave forced html mode once the element that started it is closed
if($tag == $this->force_html_start['tag'] && strlen($this->parents[$tag]) == $this->force_html_start['parents']){
$this->force_html = false;
}
return;
} elseif($this->parent($tag,'kept') && $this->keep_tag($tag,$attrs,$start)) {
# the matching start tag was kept as html, so the end tag is kept as well
return;
}
}
}
switch ($tag) {
case 'h1' :
case 'h2' :
case 'h3' :
case 'h4' :
case 'h5' :
case 'h6' :
$this->p();
if ($start) {
# one '#' per heading level, taken from the digit in the tag name
$this->o(str_repeat('#', intval($tag[1])) . ' ');
if(!empty($attrs['id'])){
# header id, written when the heading closes
$this->append = ' {#'.$attrs['id'].'}';
}
} else {
$this->out($this->append);
$this->append = '';
}
break;
case 'div' :
$this->p();
break;
case 'p' :
$this->p();
break;
case 'br' :
if ($start) {
$this->o(" \n");
}
break;
case 'hr' :
if ($start) {
$this->p();
$this->o('* * *');
$this->p();
}
break;
case 'blockquote' :
$this->indent('> ',$start);
if ($start) {
$this->start = true;
$this->out("\n\n".$this->indent);
}
break;
case 'em' :
case 'i' :
case 'u' :
# the same marker is written for the start and the end tag
$this->o('_');
break;
case 'strong' :
case 'b' :
$this->o('**');
break;
# footnotes
case 'sup':
if($start){
# only <sup id="fnref:N"> with no other attribute is a footnote reference
if(count($attrs) != 1 || !isset($attrs['id']) || !preg_match('#^fnref:(\d+)$#',$attrs['id'],$matches)){
# keep tag
$this->keep_tag($tag,$attrs,$start,true);
return;
}
# parse footnote
$this->out('[^'.$matches[1].']');
# omit output of link (<a href="#fn:1" rel="footnote">1</a>)
$this->buffer();
} else {
# last sup was not parsed -> keep tag
if(!$this->parent('sup')){
$this->keep_tag($tag,$attrs,$start);
return;
}
# sup was parsed -> reset buffer
$this->unbuffer();
}
break;
case 'footnotes':
$this->p();
break;
case 'footnote':
if($start){
$this->o('[^'.$attrs['nr']."]:\n".$this->indent.' ');
$this->start = true;
}
$this->indent(' ',$start);
break;
case 'a':
if($start) {
# buffer to check for inline links like <hide@address.com> and the like
if (isset ($attrs['href'])) {
$this->buffer();
array_push($this->astack, $attrs);
} else {
# anchor without href: nothing is buffered, null keeps the stack balanced
array_push($this->astack, null);
}
} else {
if($this->astack) {
$a = array_pop($this->astack);
if ($a) {
# for emails
$a['href'] = $this->decode($a['href']);
$buffer = $this->unbuffer();
$buffer_check = $this->decode(trim($buffer));
# link text equals the target (with or without "mailto:")?
if((substr($a['href'],0,7) == 'mailto:' && 'mailto:'.$buffer_check == $a['href']) || $a['href'] == $buffer_check){
# inline link
$this->out('<'.$buffer_check.'>',true);
} else {
# block link
$this->previousIndex($a);
$this->out('['.$buffer.']['.$a['count'].']',true);
}
}
}
}
break;
# abbreviations
case 'abbr':
case 'acronym':
if($start){
$this->buffer();
# remember the title until the element closes
array_push($this->abbrs,isset($attrs['title'])?$attrs['title']:'');
} else {
$abbr = $this->unbuffer();
$def = array_pop($this->abbrs);
# only add abbr if its not already defined
if(!isset($this->abbrs[$abbr])){
$this->abbrs[$abbr] = $def;
}
$this->o($abbr);
}
break;
case 'img' :
if ($start) {
if (isset ($attrs['src'])) {
# images share the reference list of the links, so src is stored as href
$attrs['href'] = $attrs['src'];
$alt = '';
if (isset ($attrs['alt'])) {
$alt = $attrs['alt'];
} elseif(isset($attrs['title'])){
$alt = $attrs['title'];
}
$this->previousIndex($attrs);
$this->o('!['.$alt.'][' . $attrs['count'] . ']');
}
}
break;
case 'code':
# do we have to keep this tag?
# or is a parent <pre> element existing?
if($this->keep_tag($tag,$attrs,$start) || $this->parent('pre')){
return;
}
# convert to `code` and handle backticks inside code block
# <code>foo`bar</code> has to get ``foo`bar`` and so forth
if($start){
$this->buffer();
} else {
$str = $this->unbuffer();
preg_match_all('#`+#',$str,$matches);
if(!empty($matches[0])){
# the delimiter is one backtick longer than the first run after rsort()
rsort($matches[0]);
$len = strlen($matches[0][0])+1;
} else {
$len = 1;
}
$ticks = str_repeat('`',$len);
$this->out($ticks.$str.$ticks);
}
break;
case 'dl' :
# note: if <dl> gets parsed, its direct children (<dd> and <dt>) will be parsed as well
if ($start) {
$this->p();
}
break;
case 'dd' :
# is the parent dl parsed?
if(!$this->parent('dl')){
$this->keep_tag($tag,$attrs,$start,true);
return;
}
if ($start) {
$this->o(': ');
$this->start = true;
} else {
$this->outtext .= "\n";
$this->pbr();
}
$this->indent(' ',$start);
break;
case 'dt' :
# is the parent dl parsed?
if(!$this->parent('dl')){
$this->keep_tag($tag,$attrs,$start,true);
return;
}
if (!$start) {
$this->pbr();
}
break;
case 'ol' :
case 'ul' :
# note: if this element gets parsed, its direct children <li>s will be parsed as well
if ($start) {
array_push($this->list, array (
'name' => $tag,
'num' => 0
));
} else {
array_pop($this->list);
$this->pbr();
}
break;
case 'li' :
if ($this->list) {
# reference, so that the item counter of the innermost list can be increased
$li = &$this->list[count($this->list) - 1];
}
# not inside a list or the list tag was not parsed
if(!isset($li) || !$this->parent($li['name'])){
$this->keep_tag($tag,$attrs,$start,true);
return;
}
if ($start) {
$this->pbr();
if($li['name'] == 'ul'){
$this->o('* ');
} else {
$li['num']++;
/**
* @todo line up <ol><li>s > 9 correctly.
*/
$this->o($li['num'].'. ');
}
$this->start = true;
$this->indent(' ',$start);
} else {
$this->indent(' ',$start);
}
break;
case 'table':
# NOTE: if the <table> tag gets parsed, all its children will be as well!
# finally: parse the whole table
if(!$start){
$this->outtext .= "\n\n";
$separator = array();
# separator row with the alignment markers
foreach($this->cols as $col => $arr){
# widest cell of this column
$this->max_len[$col] = max($arr);
$left = $right = '';
switch($this->align[$col]){
case 'center':
$right = ':';
# no break: center gets a colon on both sides
case 'left':
$left = ':';
break;
case 'right':
$right = ':';
break;
}
array_push($separator,$left.str_repeat('-',$this->max_len[$col]).$right);
}
$separator = '| '.implode(' | ',$separator).' |';
# set equal width
array_walk($this->rows,array(&$this,'fill_td'));
$rows = $this->rows;
foreach($rows as $row => $cols){
$this->pbr();
$this->o('| '.implode(' | ',$cols).' |');
# the separator follows every row that contains <th> cells
if(in_array($row,$this->header)){
$this->pbr();
$this->o($separator);
}
}
# reset the table state
$this->cols = array();
$this->rows = array();
$this->align = array();
$this->pbr();
}
break;
case 'tr':
# not inside a table or the parent table was not parsed
if(!$this->parent('table')){
$this->keep_tag($tag,$attrs,$start,true);
return;
}
if($start){
$this->row++;
} else {
$this->col = 0;
}
break;
case 'th':
# not inside a table or the parent table was not parsed
if(!$this->parent('table')){
$this->keep_tag($tag,$attrs,$start,true);
return;
}
if($start){
# remember this row as a header row
if(!in_array($this->row,$this->header)){
array_push($this->header,$this->row);
}
$this->col++;
$this->align[$this->col] = !empty($attrs['align']) ? $attrs['align'] : null;
}
break;
case 'td':
# not inside a table or the parent table was not parsed
if(!$this->parent('table')){
$this->keep_tag($tag,$attrs,$start,true);
return;
}
if($start){
$this->col++;
# a cell only sets the column alignment if none is stored yet
if(!empty($attrs['align']) && is_null($this->align[$this->col])){
$this->align[$this->col] = $attrs['align'];
if($attrs['align'] == 'center'){
$this->max_len[$this->col] +=2;
}
}
}
break;
case 'pre':
$this->indent(' ',$start,true);
if ($start) {
$this->pbr();
}
break;
default:
# unknown tag: always output as html
$this->keep_tag($tag,$attrs,$start,true);
return;
}
# if we want to keep all non convertible html this function has to know whether
# some parent elements were parsed or not (some elements need to know this as well)
if($start){
# '1' marks a parsed (converted) element
if(!isset($this->parents[$tag])){
$this->parents[$tag] = '1';
} else {
$this->parents[$tag] .= '1';
}
} else {
if($this->LINKS_EACH_PARAGRAPH && in_array($tag,array('p','ul','blockquote','ol','dl','table','h1','h2','h3','h4','h5','h6'))){
$this->links();
}
# remove the marker of the element that was just closed
$this->parents[$tag] = substr($this->parents[$tag],0,-1);
}
return;
}
/**
 * Low-level output: appends $str to the innermost active buffer, to the
 * current table cell, or to $this->outtext (in that order of precedence).
 *
 * @param string $str
 * @return void
 */
function out($str) {
    # buffering
    if ($this->buffer_lvl) {
        $this->buffer[$this->buffer_lvl] .= $str;
        return;
    }
    # plain output when we are not inside a parsed table cell
    if (!$this->parent('th') && !$this->parent('td')) {
        $this->outtext .= $str;
        return;
    }
    # this is for tables (see php markdown extra by michel fortin):
    # collect the cell text and its length, both are needed when </table> is handled
    $str = trim($str);
    $row = $this->row;
    $col = $this->col;
    if (isset($this->rows[$row][$col])) {
        $this->rows[$row][$col] .= $str;
    } else {
        $this->rows[$row][$col] = $str;
    }
    $length = strlen($str);
    if (isset($this->cols[$col][$row])) {
        $this->cols[$col][$row] += $length;
    } else {
        $this->cols[$col][$row] = $length;
    }
}
/**
 * High-level output: collapses whitespace, applies the current indent and
 * emits the line breaks requested by p()/pbr() before handing the data to
 * out().
 *
 * Only the empty string counts as "no data"; the text "0" is written like
 * any other text.
 *
 * @param string $data text to write
 * @param bool $puredata true for character data coming from the document;
 *                       its whitespace is collapsed unless inside <code>/<pre>
 * @param string $force 'end' finishes the output, any other truthy value
 *                      lets empty data pass
 * @return void
 */
function o($data, $puredata = false, $force = false) {
    $data = (string) $data;
    if ($this->parent('table') && trim($data) == '') { # drop whitespaces inside tables
        return;
    } elseif ($puredata && !$this->parent('code','both') && !$this->parent('pre','both')) { # keep whitespace for code
        $data = preg_replace('#\s+#', ' ', $data);
    }
    # compare with '' instead of testing truthiness, otherwise "0" would be dropped
    if ($data === '' && !$force) {
        return;
    }
    if (!empty($this->indent)) {
        $data = str_replace("\n", "\n".$this->indent, $data);
    }
    if ($this->start) {
        # no leading blank at the start of a block
        if ($data == ' ') {
            return;
        }
        $this->p_p = 0;
        $this->start = 0;
    }
    if ($force == 'end') {
        # It's the end.
        $this->p_p = 0;
        $this->out("\n");
    }
    if ($this->p_p) {
        if ($data == ' ') {
            return;
        }
        # write the pending line breaks, each followed by the current indent
        $data = ltrim($data);
        $this->out(str_repeat("\n".$this->indent, $this->p_p));
    }
    $this->p_p = 0;
    $this->out($data);
    if ($data !== '') {
        $this->lastWasNL = substr($data, -1) == "\n";
    }
}
/**
 * Write the pending abbreviations and the pending reference links
 * (" [n]: href (title)"), then clear the link list.
 *
 * @param void
 * @return void
 */
function links(){
    $this->abbrs();
    if (empty($this->a)) {
        return; # no links stored
    }
    $this->out("\n\n");
    # $this->a is keyed by href, every entry holds one or more links
    foreach ($this->a as $same_href) {
        /**
         * @todo base href
         */
        foreach ($same_href as $link) {
            $line = ' [' . $link['count'] . ']: ' . $link['href'];
            if (isset($link['title'])) {
                $line .= ' (' . $link['title'] . ')';
            }
            $this->out($line."\n");
        }
    }
    $this->a = array();
    $this->out("\n");
    $this->lastWasNL = true;
}
/**
 * Write the pending abbreviation definitions ("*[abbr]: definition") and
 * clear the list.
 *
 * @param void
 * @return void
 */
function abbrs(){
    if (empty($this->abbrs)) {
        return; # no abbrs stored
    }
    $definitions = '';
    foreach ($this->abbrs as $abbr => $def) {
        $definitions .= '*['.$abbr.']: '.$def."\n";
    }
    $this->out("\n\n");
    $this->out($definitions);
    $this->abbrs = array();
    $this->out("\n");
    $this->lastWasNL = true;
}
/**
 * Assign a reference number to a link.
 *
 * If a link with the same href and the same title is already pending, $attrs
 * is replaced by that stored link (including its 'count'). Otherwise the
 * link counter is increased, stored in $attrs['count'] and the link is added
 * to the pending list.
 *
 * Titles are compared as strings; a missing or empty title matches another
 * missing or empty title.
 *
 * @param array &$attrs link attributes, 'href' is required
 * @return void
 */
function previousIndex(&$attrs) {
    $href = $attrs['href'];
    $title = empty($attrs['title']) ? '' : (string) $attrs['title'];
    # check for existing link
    if (isset($this->a[$href])) {
        foreach ($this->a[$href] as $a) {
            $known_title = empty($a['title']) ? '' : (string) $a['title'];
            if ($known_title === $title) {
                $attrs = $a;
                return;
            }
        }
    }
    # if we come here, no matching link was found
    $this->acount++;
    $attrs['count'] = $this->acount;
    if (isset($this->a[$href])) {
        array_push($this->a[$href], $attrs);
    } else {
        $this->a[$href] = array($attrs);
    }
}
/**
 * Run replace_bad_html() on every element with a matching closing tag found
 * by the (recursive) pattern below.
 *
 * @param string $html
 * @return string
 */
function handle_bad_html($html){
    $pattern = '#<([a-z1-6]+)( [^>]*)?>(.*(?R).*)</\\1>#Us';
    $callback = array(&$this,'replace_bad_html');
    return preg_replace_callback($pattern, $callback, $html);
}
/**
 * Callback of handle_bad_html(): processes the element content recursively
 * and reassembles the element.
 *
 * @param array $matches 1 => tag name, 2 => attribute string, 3 => content
 * @return string
 */
function replace_bad_html($matches){
    $name = $matches[1];
    $attrs = $matches[2];
    # recursion
    $content = $this->handle_bad_html($matches[3]);
    return '<'.$name.$attrs.'>'.$content.'</'.$name.'>';
}
/**
 * Wrap the text to BODY_WIDTH characters. Nothing is wrapped when BODY_WIDTH
 * is smaller than 30. Continuation lines repeat the prefix of their line
 * (definition list, blockquote, list item or plain indent); table rows and
 * block links are never wrapped.
 *
 * @param string $text
 * @return string
 *
 * @todo wrapping of code (also kept code blocks)
 */
function optwrap($text) {
    $width = $this->BODY_WIDTH;
    if ($width < 30) {
        return $text;
    }
    $result = '';
    foreach (explode("\n", $text) as $para) {
        if (strlen($para) == 0) {
            $result .= "\n";
            continue;
        }
        if (preg_match('#^(\s*): #', $para, $m)) { # definition lists
            $prefix = isset($m[1]) ? $m[1] : '';
            $line = wordwrap($para, $width - strlen($prefix) - 4, "\n".$prefix.' ');
        } elseif (preg_match('#^(\s*>+)#', $para, $m)) { # blockquote
            $line = wordwrap($para, $width - (strlen($m[0])+1), "\n".$m[0].' ');
        } elseif (preg_match('#^\s*\|#', $para)) { # table
            $line = $para; # dont wrap
        } elseif (preg_match('#^(\s*)\*#', $para, $m)) { # list item @todo: ol
            $prefix = isset($m[1]) ? $m[1] : '';
            $prefix .= ' ';
            $line = wordwrap($para, $width - strlen($prefix), "\n".$prefix);
        } elseif (preg_match('#^ \[[^\]]+\]:#', $para)) { # block links
            # don't wrap at the moment
            $line = $para;
        } else { # something else
            preg_match('#^\s+#', $para, $m);
            $prefix = isset($m[0]) ? $m[0] : '';
            $line = wordwrap($para, $width - strlen($prefix), "\n".$prefix);
        }
        $result .= $line."\n";
    }
    return $result;
}
/**
 * Writes a tag to the output as html instead of converting it.
 *
 * Without $force the tag is only kept if KEEP_HTML is set and, for a start
 * tag, if it has an attribute that is not listed in $this->has_attrs for
 * this tag; for an end tag, if the matching start tag was kept. With $force
 * the tag is always written.
 *
 * Kept block tags ('div','center','li','dt','dd') that were not forced get
 * `markdown="1"` added; any other kept start tag switches on forced html
 * output for its children. A kept start tag is recorded in $this->parents
 * with a '2' marker, a kept end tag removes the last marker.
 *
 * @param string $tag
 * @param array $attrs attributes of a start tag
 * @param bool $start true for a start tag, false for an end tag
 * @param bool $force keep the tag unconditionally
 * @return bool true if the tag was written as html, false if the caller
 *              should convert it
 */
function keep_tag($tag,$attrs,$start,$force = false){
if(!$force && !$this->KEEP_HTML){
return false;
}
# start tag
if($start){
# if there is a attr which cannot be handled by markdown
# this tag will be kept.
if(isset($this->has_attrs[$tag])){
$known_attrs = $this->has_attrs[$tag];
} else {
$known_attrs = array();
}
if(!$force && count($known_attrs) >= count($attrs)){
if(empty($attrs) || count(array_diff(array_keys($attrs),$known_attrs)) == 0){
# tag can be handled by markdown!
return false;
}
}
# rebuild the attribute string; 'forcehtml' is an internal marker set by load_string()
$attr = '';
if(!empty($attrs)){
foreach($attrs as $key => $value){
if($key == 'forcehtml'){
continue;
}
$attr.=' '.$key.'="'.$value.'"';
}
}
if(!$force && in_array($tag,array('div','center','li','dt','dd'))){
$attr.= ' markdown="1"';
} elseif(!$this->force_html) {
# children of this tag are written as html as well
$this->force_html($tag);
}
$this->o('<'.$tag.$attr.'>',true);
# add to list of parents ('2' marks a kept element):
if(isset($this->parents[$tag])){
$this->parents[$tag] .= '2';
} else {
$this->parents[$tag] = '2';
}
# close tag
} else {
if(!$force && !$this->parent($tag,'kept')){
# the start tag of this element was not kept as html
return false;
}
$this->o('</'.$tag.'>');
# remove the marker of the element that was just closed
$this->parents[$tag] = substr($this->parents[$tag],0,-1);
# newlines after </tag>
if(in_array($tag,array('th','td','dt','dd','li','p'))){
$this->o("\n");
}
}
# newlines after <tag> and </tag>
if(in_array($tag,array('div','center','table','tr','ul','ol','dl','pre'))){
$this->o("\n");
}
return true;
}
/**
 * Pad every cell of a table row to the width of its column, honouring the
 * column alignment. Used as array_walk() callback on $this->rows.
 *
 * @param array &$row column => cell text
 * @return void
 */
function fill_td(&$row){
    foreach (array_keys($row) as $col) {
        $width = $this->max_len[$col];
        $align = $this->align[$col];
        if ($align == 'center') {
            # two extra characters for the colons on both sides of the separator
            $row[$col] = str_pad($row[$col], $width + 2, ' ', STR_PAD_BOTH);
        } elseif ($align == 'left') {
            # one extra character for the colon of the separator
            $row[$col] = str_pad($row[$col], $width + 1, ' ');
        } elseif ($align == 'right') {
            $row[$col] = str_pad($row[$col], $width + 1, ' ', STR_PAD_LEFT);
        } else {
            $row[$col] = str_pad($row[$col], $width, ' ');
        }
    }
}
/**
 * Request a single line break before the next output, unless line breaks
 * are already pending.
 *
 * @param void
 * @return void
 */
function pbr() {
    if ($this->p_p != 0) {
        return;
    }
    $this->p_p = 1;
}
/**
 * Request a paragraph break (two line breaks) before the next output.
 * Does nothing inside a parsed table.
 *
 * @param void
 * @return void
 */
function p() {
    if (!$this->parent('table')) {
        $this->p_p = 2;
    }
}
/**
 * Add $indent to the current line prefix (start tag) or take it off again
 * (end tag).
 *
 * @param string $indent
 * @param bool $start whether it's an opening tag or a closing one
 * @param bool $output shall $indent be written immediately? (only if $start is true)
 * @return void
 */
function indent($indent,$start,$output=false){
    if (!$start) {
        # closing tag: cut the indent off the end of the current prefix
        $len = strlen($indent);
        $this->indent = ($len >= strlen($this->indent))
            ? ''
            : substr($this->indent, 0, -$len);
        return;
    }
    if ($output) {
        $this->o($indent);
    }
    $this->indent .= $indent;
}
/**
 * Check whether an element named $parent is currently open.
 *
 * With $type 'parsed' or 'kept' only the innermost open element of that name
 * is examined: 'parsed' asks whether it was converted, 'kept' whether it was
 * written as html. 'both' accepts any open element of that name.
 *
 * @param string $parent name of the parent tag
 * @param string $type either 'parsed' or 'kept' or 'both'
 * @return bool
 */
function parent($parent,$type = 'parsed'){
    if (!isset($this->parents[$parent])) {
        return false;
    }
    $markers = $this->parents[$parent];
    if ($type == 'both') {
        return !empty($markers);
    }
    # '1' marks a parsed element, '2' a kept one
    $wanted = ($type == 'parsed') ? '1' : '2';
    return substr($markers, -1) === $wanted;
}
/**
 * Open a new output buffer level. Pending line breaks are written first, so
 * they end up in front of the buffered content.
 *
 * @param void
 * @return void
 */
function buffer(){
    $pending = $this->p_p;
    if ($pending) {
        $this->out(str_repeat("\n".$this->indent, $pending));
        $this->p_p = 0;
    }
    $level = ++$this->buffer_lvl;
    $this->buffer[$level] = '';
}
/**
 * Close the innermost output buffer level and return what it collected.
 *
 * @param void
 * @return string
 */
function unbuffer(){
    $level = $this->buffer_lvl;
    $collected = $this->buffer[$level];
    unset($this->buffer[$level]);
    $this->buffer_lvl = $level - 1;
    return $collected;
}
/**
 * Decode html entities, used to compare (obfuscated) e-mail links with their
 * link text.
 *
 * Named and numeric entities are decoded to ISO-8859-1. Numeric entities
 * that html_entity_decode() leaves alone are decoded afterwards, as long as
 * their code point fits into a single byte (<= 255); larger code points are
 * kept as entity instead of being turned into an unrelated byte.
 *
 * Uses preg_replace_callback() because the /e modifier is no longer
 * supported by PHP 7 and later.
 *
 * @author hide@address.com <http://www.php.net/manual/en/function.html-entity-decode.php#68536>
 * @param string $text
 * @param int $quote_style one of the ENT_* quote flags
 * @return string
 */
function decode($text,$quote_style = ENT_NOQUOTES){
    $text = html_entity_decode($text, $quote_style, 'ISO-8859-1'); // NOTE: UTF-8 does not work!
    # hexadecimal entities, e.g. &#x40;
    $text = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        $code = hexdec($m[1]);
        return $code > 255 ? $m[0] : chr((int) $code);
    }, $text);
    # decimal entities, e.g. &#64;
    $text = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        $code = (float) $m[1];
        return $code > 255 ? $m[0] : chr((int) $code);
    }, $text);
    return $text;
}
}
?>