Location: PHPKode > scripts > Seltz analyzer > seltz-analyzer/seltz_analyzer.php
<?php

/**
 * Seltzlab - web applications development
 * http://www.seltzlab.com
 * mail: hide@address.com
 * skype: seltzlab
 * 
 * This work is licensed under Creative Commons Attribution-Share Alike 3.0
 * http://creativecommons.org/licenses/by-sa/3.0/us/deed.en
 */

/**
 * Class to analyze semantic data from an HTML fragment
 */
class seltz_analyzer
{
    /**
     * turn on/off debug output
     * @var boolean
     */
    private $debug = false;

    /**
     * autodetected document type, can be url or string
     * @var string
     */
    public $doctype;
    /**
     * document text
     * @var string
     */
    public $doctext;
    /**
     * document language using 2 chars
     * @var string
     */
    public $doclang;
    /**
     * if doctype is url, here the document url
     * @var string
     */
    public $docurl;
    /**
     * the document encoding; if not specified it is read from the XML declaration
     * of the document. If that fails it is set to iso-8859-1
     * @var string
     */
    public $encoding;

    /**
     * constants defined for our "good tags" weight in score computation
     */
    const weight_ucfirst = 4;
    const weight_pspell = 3;
    const weight_strong = 5;
    const weight_em = 5;
    const weight_span = 4;
    const weight_p = 1;
    const weight_div = 0;

    /**
     * array of tags we assume are good for a semantic meaning (tag name => weight)
     * @var array
     */
    public $goodTags = array(
        'strong' => self::weight_strong,
        'b' => self::weight_strong,
        'em' => self::weight_em,
        'i' => self::weight_em,
        'span' => self::weight_span,
        'p' => self::weight_p,
        'div' => self::weight_div
    );
    /**
     * array of tags we assume are particularly good for a semantic meaning
     * @var array
     */
    public $reallyGoodTags = array(
        'strong',
        'b',
        'em',
        'i',
        'span'
    );
    /**
     * array of tags we assume are not good for a semantic meaning. Remember, at the moment
     * everything is aimed at getting some keywords linked on wikipedia
     * @var array
     */
    public $badTags = array(
        'a',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'li'
    );

    /**
     * pspell dictionary handle, or null when the pspell extension or the
     * requested dictionary is not available (spell scoring is then skipped)
     * @var mixed
     */
    private $pspell = null;

    /**
     * @param string $doc url of the document or xhtml string
     * @param array $config configuration array. key debug bool default false, key encoding string default iso-8859-1, key doclang 2 char string default en
     */
    public function __construct($doc, $config = array())
    {
        if (!is_array($config))
            $config = array();
        $doc = (string) $doc;

        if (isset($config['debug']))
            $this->debug = $config['debug'];

        // A document is a URL when it starts with "<scheme>://".
        // NOTE(review): any stream wrapper (file://, php://, ...) is accepted here;
        // callers handling untrusted input should restrict the scheme themselves.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $doc)) {
            $this->doctype = 'url';
            $this->docurl = $doc;
            $content = file_get_contents($doc);
            if ($content === false) {
                $this->out_debug('Unable to fetch document: '.$doc);
                $content = '';
            }
            $this->doctext = $content;
        }
        else {
            $this->doctype = 'string';
            $this->doctext = $doc;
        }

        if (!empty($config['encoding']))
            $this->encoding = $config['encoding'];
        else
            $this->encoding = $this->detect_encoding($this->doctext);

        if (isset($config['doclang']))
            $this->doclang = $config['doclang'];
        else
            $this->doclang = 'en';

        // pspell is optional: without it words are scored on capitalization only
        if (function_exists('pspell_new')) {
            $dictionary = pspell_new($this->doclang, "", "", "", PSPELL_FAST);
            if ($dictionary !== false)
                $this->pspell = $dictionary;
        }

        $this->out_debug(array($this->doclang, $this->doctype, $this->encoding, $this->goodTags));
    }

    /**
     * Analyze the document and return the candidate keywords.
     *
     * @return array keyword (lowercase) => score, sorted by ascending score
     */
    public function buildStruct()
    {
        // analyze html's meanings
        $words = $this->analyze_xml();

        asort($words);

        $this->out_debug('RESULTS -------------------------');
        $this->out_debug($words);

        return $words;
    }

    /**
     * Read the encoding from the XML declaration of a document.
     *
     * @param string $text the document text
     * @return string the declared encoding, or iso-8859-1 when none is declared
     */
    private function detect_encoding($text)
    {
        if (preg_match('/<\?xml\s[^>]*?\bencoding\s*=\s*(["\'])([^"\']+)\1/i', $text, $match))
            return $match[2];

        return 'iso-8859-1';
    }

    /**
     * Current score of a word, 0 when the word has not been scored yet.
     *
     * @param array $words word => score
     * @param string $key the word to look up
     * @return int
     */
    private static function score_of($words, $key)
    {
        return isset($words[$key]) ? $words[$key] : 0;
    }

    /**
     * Tell whether a word begins with an uppercase ASCII letter.
     *
     * @param string $word
     * @return boolean
     */
    private function starts_uppercase($word)
    {
        return $word !== '' && ctype_upper($word[0]);
    }

    /**
     * Split a text into words and score the ones that look like keywords:
     * capitalized words that do not open the text (consecutive capitalized words
     * separated by one blank are joined into a single phrase) and words the
     * spell checker does not know.
     *
     * @param string $text text to analyze, html tags are stripped
     * @return array word or phrase (lowercase) => score; unscored words are not listed
     */
    private function analyze_string($text)
    {
        $words_delimiter = array(' ', ';', ',', '.', ':', '\'', '"', '!', '?', '-', '(', ')', '[', ']', '/', '\\', '=', "\n", "\r", "\t");
        // every delimiter but the blank space
        $punctuation = array_slice($words_delimiter, 1);

        $words = array();

        // analyze text without html and attach a final point to make the last word recognized
        $temptxt = strip_tags($text).'.';
        $length = strlen($temptxt);

        $current_word = '';
        $current_start = 0;
        $last_word = null;
        $last_word_index = null;

        $this->out_debug("Analyze string:".$temptxt.", strlen:".$length);

        for ($i = 0; $i < $length; $i++) {
            $char = $temptxt[$i];

            // delimiters are tested first so that \n, \r and \t really split words
            if (!in_array($char, $words_delimiter, true)) {
                $asciichar = ord($char);
                // only printable ascii chars take part in a word, anything else is dropped
                if ($asciichar >= 32 && $asciichar <= 126) {
                    if ($current_word === '')
                        $current_start = $i;
                    $current_word .= $char;
                }
                continue;
            }

            $word = $current_word;
            $word_start = $current_start;
            $current_word = '';

            if ($word === '') {
                // two delimiters in a row: the next word is not adjacent to the previous one
                $last_word = null;
                $last_word_index = null;
                continue;
            }
            if (ctype_digit($word))
                continue;

            $word_index = strtolower($word);
            $this->out_debug("\tFound word:".$word_index);

            // if first char is uppercase maybe is a candidate word... unless it opens
            // the text or directly follows a punctuation mark
            if (strlen($word) > 1 && $word_start > 0 && $this->starts_uppercase($word) && !in_array($temptxt[$word_start - 1], $punctuation, true)) {
                // if the previous char was a blank space && the previous word was uppercase too
                // then it is highly possible that the two words go together
                if ($temptxt[$word_start - 1] === ' ' && $last_word !== null && $this->starts_uppercase($last_word)) {
                    $previous_score = self::score_of($words, $last_word_index);
                    // drop the old entry first, so that re-using the same index cannot lose the score
                    unset($words[$last_word_index]);

                    if (in_array($word_index, explode(' ', $last_word_index), true))
                        $word_index = $last_word_index;
                    else
                        $word_index = $last_word_index.' '.$word_index;

                    $words[$word_index] = self::score_of($words, $word_index) + self::weight_ucfirst + $previous_score;
                    $this->out_debug("\tPrevious was uppercase.. get togheter: $word_index = $last_word_index + ".$word);
                }
                else {
                    $words[$word_index] = self::score_of($words, $word_index) + self::weight_ucfirst;
                    $this->out_debug("\tFirst char uppercase, score:".$words[$word_index]);
                }
            }
            // if spell is not recognized maybe is a candidate word...
            if (strlen($word) > 1 && $this->pspell !== null && !pspell_check($this->pspell, $word)) {
                $words[$word_index] = self::score_of($words, $word_index) + self::weight_pspell;
                $this->out_debug("\tWord not found in pspell, score:".$words[$word_index]);
            }
            $last_word_index = $word_index;
            $last_word = $word;
        }

        $this->out_debug($words);

        return $words;
    }

    /**
     * Walk the document with XMLReader and score the text found inside the good
     * tags. Text inside a bad tag is skipped unless it follows a closed good tag.
     *
     * @return array word or phrase (lowercase) => score
     */
    private function analyze_xml()
    {
        $this->out_debug('PARSE XML.............');

        $tags = array_keys($this->goodTags);

        $xmlwords = array();
        $currentElement = '';
        $lastElement = '';
        $insideBad = false;

        // nothing to parse (e.g. the url could not be fetched)
        if (!is_string($this->doctext) || $this->doctext === '')
            return $xmlwords;

        $reader = new XMLReader();
        if (!$reader->XML($this->doctext, $this->encoding)) {
            $this->out_debug('Unable to load the document in XMLReader');
            return $xmlwords;
        }
        // NOTE(review): DTD loading plus entity substitution resolves the XHTML named
        // entities, but also lets a document pull in external entities (XXE).
        // Reconsider these two properties when parsing untrusted documents.
        $reader->setParserProperty(XMLReader::SUBST_ENTITIES, true);
        $reader->setParserProperty(XMLReader::LOADDTD, true);
        $reader->setParserProperty(XMLReader::VALIDATE, false);

        while ($reader->read()) {
            $el = strtolower($reader->name);
            $this->out_debug('xml node:'.$reader->nodeType.', '.$el.', currentElement: '.$currentElement.', last:'.$lastElement);

            switch ($reader->nodeType) {
                case XMLReader::ELEMENT:
                    $this->out_debug('start '.$el);
                    if (in_array($el, $tags, true))
                        $currentElement = $el;
                    else if (in_array($el, $this->badTags, true))
                        $insideBad = $el;
                    break;

                case XMLReader::END_ELEMENT:
                    $this->out_debug('end '.$el);
                    if ($insideBad === $el)
                        $insideBad = false;
                    $lastElement = $el;
                    $currentElement = '';
                    break;

                case XMLReader::TEXT:
                    if ((!$currentElement || $insideBad) && !in_array($lastElement, $tags, true))
                        break;

                    $reallyGood = in_array($currentElement, $this->reallyGoodTags, true);
                    if ($reallyGood)
                        $this->out_debug('inside a really good tag:'.$currentElement);

                    // text that only follows a closed good tag has no current element: weight 0
                    $tagWeight = isset($this->goodTags[$currentElement]) ? $this->goodTags[$currentElement] : 0;

                    $this->out_debug('analyze node content:'.$reader->value);
                    $words = $this->analyze_string($reader->value);

                    if ($reallyGood) {
                        // inside a really good tag we take the entire string as index
                        if (count($words) > 0) {
                            // the score is local to this text node
                            $score = 0;
                            $parts = array();
                            foreach ($words as $word => $s) {
                                $this->out_debug('word '.$word.', add '.$s.' to '.$score);
                                $score += $tagWeight + $s;
                                if (isset($xmlwords[$word])) {
                                    $score += $xmlwords[$word];
                                    unset($xmlwords[$word]);
                                }
                                $parts[] = $word;
                            }
                            $words = array(implode(' ', $parts) => $score);
                        }
                    }
                    else {
                        foreach ($words as $word => $score) {
                            $this->out_debug('word '.$word.', add '.$tagWeight.' to '.$score);
                            $words[$word] = $tagWeight + $score;
                        }
                    }

                    $xmlwords = array_merge($xmlwords, $words);
                    break;
            }
        }

        $reader->close();

        return $xmlwords;
    }

    /**
     * Print a debug message (or every item of an array) inside a green <pre>
     * block, prefixed by a running counter. Does nothing when debug is off.
     *
     * @param mixed $o string or array to print
     */
    public function out_debug($o)
    {
        if (!$this->debug)
            return;

        static $debug_c = 0;
        echo "<pre style=\"color:green\">";

        $debug_c++;
        if (is_array($o)) {
            foreach ($o as $k => $tok) {
                echo "\n{$debug_c}] $k => "; print_r($tok);
                $debug_c++;
            }
        }
        else
            echo "\n{$debug_c}] $o";
        echo "</pre>";
    }
}

/**
 * Manual smoke test: analyze an article, analyze its summary produced by
 * Open Text Summarizer (the external "ots" command), and print the summed scores.
 *
 * @param string $article xhtml string (or url) to analyze
 * @param string $lang 2 chars document language, also passed to ots as dictionary
 */
function unit_test($article, $lang) {

    echo "<h1>Analizing for:</h1><div>".htmlspecialchars($article)."</div>";

    $mydoc = new seltz_analyzer($article, array('doclang' => $lang, 'debug' => true));
    $words = $mydoc->buildStruct();
    unset($mydoc);

    // here produce another analysis using Open Text Summarizer [see http://libots.sourceforge.net] output
    // both values are escaped so that they cannot break out of the shell command
    $command = 'echo '.escapeshellarg($article).' | ots --ratio 10 -d '.escapeshellarg($lang);
    // shell_exec gives null when the command fails or prints nothing
    $output = (string) shell_exec($command);
    $summary = "<div>".strip_tags($output)."</div>";
    $mydoc = new seltz_analyzer($summary, array('doclang' => $lang, 'debug' => true));
    $sumwords = $mydoc->buildStruct();

    // and then sum the two arrays' scores
    foreach ($words as $word => $score)
        $words[$word] = (isset($sumwords[$word]) ? $sumwords[$word] : 0) + $score;

    // words found only in the summary keep their summary score
    foreach ($sumwords as $word => $score)
        if (!isset($words[$word]))
            $words[$word] = $score;

    asort($words);
    echo "<pre>";print_r($words);echo "</pre>";
}

?>
Return current item: Seltz analyzer