<?php
/**
* Seltzlab - web applications development
* http://www.seltzlab.com
* mail: hide@address.com
* skype: seltzlab
*
* This work is licensed under Creative Commons Attribution-Share Alike 3.0
* http://creativecommons.org/licenses/by-sa/3.0/us/deed.en
*/
/**
* Class to analyze semantic data from an HTML fragment
*/
class seltz_analyzer
{
    /**
     * turn on/off debug output
     * @var boolean
     */
    private $debug = false;

    /**
     * autodetected document type, either 'url' or 'string'
     * @var string
     */
    public $doctype;

    /**
     * document text (empty string when a url could not be fetched)
     * @var string
     */
    public $doctext;

    /**
     * document language using 2 chars
     * @var string
     */
    public $doclang;

    /**
     * if doctype is url, here the document url
     * @var string
     */
    public $docurl;

    /**
     * the document encoding. If not given in the configuration it is read from
     * the encoding attribute of the <?xml ... ?> declaration; if that fails it
     * is set to iso-8859-1
     * @var string
     */
    public $encoding;

    /**
     * constants defined for our "good tags" weight in score computation
     */
    const weight_ucfirst = 4;
    const weight_pspell = 3;
    const weight_strong = 5;
    const weight_em = 5;
    const weight_span = 4;
    const weight_p = 1;
    const weight_div = 0;

    /**
     * tags we assume are good for a semantic meaning, mapped to their weight
     * @var array
     */
    public $goodTags = array(
        'strong' => self::weight_strong,
        'b' => self::weight_strong,
        'em' => self::weight_em,
        'i' => self::weight_em,
        'span' => self::weight_span,
        'p' => self::weight_p,
        'div' => self::weight_div
    );

    /**
     * tags we assume are particularly good for a semantic meaning: the scored
     * words found inside one of them are joined into a single phrase
     * @var array
     */
    public $reallyGoodTags = array(
        'strong',
        'b',
        'em',
        'i',
        'span'
    );

    /**
     * tags we assume are not good for a semantic meaning. Remember, at the
     * moment all is finalized in getting some keywords linked on wikipedia
     * @var array
     */
    public $badTags = array(
        'a',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'li'
    );

    /**
     * pspell dictionary handle, or null when spell checking is not available
     * (extension missing or no dictionary for the document language)
     * @var mixed
     */
    private $pspell;

    /**
     * @param string $doc url of the document or xhtml string
     * @param array $config configuration array. key debug bool default false, key encoding string default iso-8859-1, key doclang 2 char string default en
     */
    public function __construct($doc, $config = array())
    {
        if (isset($config['debug']))
            $this->debug = $config['debug'];

        // "[A-z]" also matched "[", "\", "]", "^", "_" and "`"; a scheme is letters only here.
        // NOTE(review): any stream wrapper (file://, php://, ...) is accepted, so $doc
        // must not come from an untrusted source without further validation.
        if (preg_match('/^[a-z]+:\/\//i', $doc)) {
            $this->doctype = 'url';
            $this->docurl = $doc;
            $content = file_get_contents($doc);
            // keep doctext a string even when the download fails
            $this->doctext = ($content === false) ? '' : $content;
        }
        else {
            $this->doctype = 'string';
            $this->doctext = $doc;
        }

        if (!empty($config['encoding']))
            $this->encoding = $config['encoding'];
        // look for the encoding in the XML declaration of the whole document
        else if (preg_match('/<\?xml\s[^>]*?\bencoding\s*=\s*(["\'])(.*?)\1/i', $this->doctext, $match) && $match[2] !== '')
            $this->encoding = $match[2];
        else
            $this->encoding = 'iso-8859-1';

        if (isset($config['doclang']))
            $this->doclang = $config['doclang'];
        else
            $this->doclang = 'en';

        // pspell is optional: without it the "unknown word" bonus is simply not applied
        $dictionary = function_exists('pspell_new') ? pspell_new($this->doclang, "", "", "", PSPELL_FAST) : false;
        $this->pspell = ($dictionary === false) ? null : $dictionary;

        $this->out_debug(array($this->doclang, $this->doctype, $this->encoding, $this->goodTags));
    }

    /**
     * Analyze the document and return the candidate keywords.
     *
     * @return array map of word (or phrase) => score, sorted by ascending score
     */
    public function buildStruct()
    {
        // analyze html's meanings
        $words = $this->analyze_xml();
        asort($words);
        $this->out_debug('RESULTS -------------------------');
        $this->out_debug($words);
        return $words;
    }

    /**
     * Split a text into words and score the ones that look like keywords.
     *
     * A word is scored when it starts with an uppercase char without being the
     * first word of the text or following a punctuation mark (weight_ucfirst),
     * and/or when the spell checker does not know it (weight_pspell). Two
     * capitalized words separated by exactly one blank are merged into a single
     * phrase. Words that score nothing are not returned; all-digit tokens are
     * ignored. Bytes outside printable ASCII are dropped.
     *
     * @param string $text text to analyze, html tags are stripped
     * @return array map of lowercase word (or phrase) => score
     */
    private function analyze_string($text)
    {
        $words_delimiter = array(' ', ';', ',', '.', ':', '\'', '"', '!', '?', '-', '(', ')', '[', ']', '/', '\\', '=', "\n", "\r", "\t");
        // every delimiter but the blank space
        $hard_delimiter = array_slice($words_delimiter, 1);
        $words = array();
        // analyze text without html and attach a final point to make the last word recognized
        $temptxt = strip_tags($text).'.';
        $length = strlen($temptxt);
        $current_word = '';
        $word_start = 0;
        $last_word = '';
        $last_word_index = '';
        $this->out_debug("Analyze string:".$temptxt.", strlen:".$length);
        for ($i = 0; $i < $length; $i++) {
            $char = $temptxt[$i];
            // test for delimiters first, otherwise \n, \r and \t would be dropped
            // by the printable filter below and glue two words together
            if (!in_array($char, $words_delimiter, true)) {
                $asciichar = ord($char);
                if ($asciichar >= 32 && $asciichar <= 126) {
                    if ($current_word === '')
                        $word_start = $i;
                    $current_word .= $char;
                }
                continue;
            }
            if ($current_word === '') {
                // two delimiters in a row (like ", ") break a sequence of capitalized words
                $last_word = '';
                $last_word_index = '';
                continue;
            }
            if (ctype_digit($current_word)) {
                // numbers are not keywords and do not change the previous word
                $current_word = '';
                continue;
            }
            $word_index = strtolower($current_word);
            $this->out_debug("\tFound word:".$word_index);
            $previous_char = ($word_start > 0) ? $temptxt[$word_start - 1] : '';
            // if first char is uppercase maybe is a candidate word...
            if (strlen($current_word) > 1 && $word_start > 0 && $current_word === ucfirst($current_word) && !in_array($previous_char, $hard_delimiter, true)) {
                // if the previous char was a blank space && the previous word was uppercase too
                if ($previous_char === ' ' && $last_word !== '' && $last_word === ucfirst($last_word)) {
                    // then it is highly possible that the two words go together
                    $previous_index = $last_word_index;
                    $carried = 0;
                    if (in_array($word_index, explode(' ', $previous_index), true))
                        $word_index = $previous_index;
                    else {
                        $word_index = $previous_index.' '.$word_index;
                        // move the score of the previous word to the merged phrase
                        if (isset($words[$previous_index]))
                            $carried = $words[$previous_index];
                        unset($words[$previous_index]);
                    }
                    if (!isset($words[$word_index]))
                        $words[$word_index] = 0;
                    $words[$word_index] += self::weight_ucfirst + $carried;
                    $this->out_debug("\tPrevious was uppercase.. get togheter: $word_index = $previous_index + ".$current_word);
                }
                else {
                    if (!isset($words[$word_index]))
                        $words[$word_index] = 0;
                    $words[$word_index] += self::weight_ucfirst;
                    $this->out_debug("\tFirst char uppercase, score:".$words[$word_index]);
                }
            }
            // if spell is not recognized maybe is a candidate word...
            if (strlen($current_word) > 1 && $this->pspell !== null && !pspell_check($this->pspell, $current_word)) {
                if (!isset($words[$word_index]))
                    $words[$word_index] = 0;
                $words[$word_index] += self::weight_pspell;
                $this->out_debug("\tWord not found in pspell, score:".$words[$word_index]);
            }
            $last_word_index = $word_index;
            $last_word = $current_word;
            $current_word = '';
        }
        $this->out_debug($words);
        return $words;
    }

    /**
     * Walk the document with XMLReader and score the text nodes.
     *
     * A text node is analyzed when it sits in a good tag outside of a bad tag,
     * or when the last closed element was a good tag. The weight of the current
     * tag is added to every scored word; inside a really good tag the scored
     * words are joined into one phrase that collects all their scores.
     *
     * @return array map of word (or phrase) => score, unsorted
     */
    private function analyze_xml()
    {
        $this->out_debug('PARSE XML.............');
        $tags = array_keys($this->goodTags);
        $xmlwords = array();
        $currentElement = '';
        $lastElement = '';
        $insideBad = false;

        // nothing to parse (XMLReader refuses an empty source)
        if (!is_string($this->doctext) || $this->doctext === '')
            return $xmlwords;

        $reader = new XMLReader();
        if (!$reader->XML($this->doctext, $this->encoding))
            return $xmlwords;
        // NOTE(review): loading the DTD and substituting entities is needed for XHTML
        // named entities but exposes the parser to external entities (XXE) when the
        // document is not trusted.
        $reader->setParserProperty(XMLReader::SUBST_ENTITIES, true);
        $reader->setParserProperty(XMLReader::LOADDTD, true);
        $reader->setParserProperty(XMLReader::VALIDATE, false);

        while ($reader->read()) {
            $el = strtolower($reader->name);
            $this->out_debug('xml node:'.$reader->nodeType.', '.$el.', currentElement: '.$currentElement.', last:'.$lastElement);
            switch ($reader->nodeType) {
                case XMLReader::ELEMENT:
                    $this->out_debug('start '.$el);
                    if (in_array($el, $tags, true))
                        $currentElement = $el;
                    else if (in_array($el, $this->badTags, true))
                        $insideBad = $el;
                    break;
                case XMLReader::END_ELEMENT:
                    $this->out_debug('end '.$el);
                    if ($insideBad === $el)
                        $insideBad = false;
                    $lastElement = $el;
                    $currentElement = '';
                    break;
                case XMLReader::TEXT:
                    if ((!$currentElement || $insideBad) && !in_array($lastElement, $tags, true))
                        break;
                    $isReallyGood = in_array($currentElement, $this->reallyGoodTags, true);
                    if ($isReallyGood)
                        $this->out_debug('inside a really good tag:'.$currentElement);
                    $this->out_debug('analyze node content:'.$reader->value);
                    $words = $this->analyze_string($reader->value);
                    // there may be no current tag at all (text following a closed good tag)
                    $tagWeight = isset($this->goodTags[$currentElement]) ? $this->goodTags[$currentElement] : 0;
                    if ($isReallyGood) {
                        // inside a really good tag we take the entire string as index
                        if ($words) {
                            // start from scratch for every text node
                            $score = 0;
                            $index = '';
                            foreach ($words as $word => $s) {
                                $this->out_debug('word '.$word.', add '.$s.' to '.$score);
                                $score += $tagWeight + $s;
                                // the phrase absorbs what the single word scored so far
                                if (isset($xmlwords[$word])) {
                                    $score += $xmlwords[$word];
                                    unset($xmlwords[$word]);
                                }
                                $index .= $word.' ';
                            }
                            $words = array(substr($index, 0, -1) => $score);
                        }
                    }
                    else {
                        foreach ($words as $word => $score) {
                            $this->out_debug('word '.$word.', add '.$tagWeight.' to '.$score);
                            $words[$word] = $tagWeight + $score;
                        }
                    }
                    $xmlwords = array_merge($xmlwords, $words);
                    break;
            }
        }
        return $xmlwords;
    }

    /**
     * Print a debug message (or every item of an array) inside a green <pre>
     * block, prefixed by a progressive counter. Does nothing when debug is off.
     *
     * @param mixed $o message or array of values to print
     */
    public function out_debug($o)
    {
        if (!$this->debug)
            return;
        // counter shared by all the calls, it numbers the printed lines
        static $debug_c = 0;
        echo "<pre style=\"color:green\">";
        $debug_c++;
        if (is_array($o)) {
            foreach ($o as $k => $tok) {
                echo "\n{$debug_c}] $k => "; print_r($tok);
                $debug_c++;
            }
        }
        else
            echo "\n{$debug_c}] $o";
        echo "</pre>";
    }
}
/**
 * Demo run: analyze an article, analyze its Open Text Summarizer summary too,
 * sum the two score maps and print the result (debug output enabled).
 *
 * @param string $article xhtml string (or url) to analyze
 * @param string $lang 2 chars language code, used for the analyzer and for ots
 */
function unit_test($article, $lang) {
    echo "<h1>Analizing for:</h1><div>".htmlspecialchars($article)."</div>";
    $mydoc = new seltz_analyzer($article, array('doclang' => $lang, 'debug' => true));
    $words = $mydoc->buildStruct();
    unset($mydoc);
    // here produce another analysis using Open Text Summarizer [see http://libots.sourceforge.net] output
    // both values are quoted for the shell so the article text cannot inject commands
    $command = 'echo '.escapeshellarg($article).' | ots --ratio 10 -d '.escapeshellarg($lang);
    // shell_exec gives null when ots fails or prints nothing
    $summary = "<div>".strip_tags((string) shell_exec($command))."</div>";
    $mydoc = new seltz_analyzer($summary, array('doclang' => $lang, 'debug' => true));
    $sumwords = $mydoc->buildStruct();
    // and then sum the two arrays' scores: words found only in one of them keep their score
    foreach ($sumwords as $word => $score) {
        $base = isset($words[$word]) ? $words[$word] : 0;
        $words[$word] = $base + $score;
    }
    asort($words);
    echo "<pre>";print_r($words);echo "</pre>";
}
?>