Location: PHPKode > projects > Buzzword > buzzword-0.8.12/includes/relevance.inc
<?php

/*
 * buzzword
 * Copyright (c) 2003 Jon Tai
 *
 * $Id: relevance.inc 330 2004-04-17 06:53:38Z jon $
 *
 * This file is part of buzzword.
 *
 * buzzword is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * buzzword is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with buzzword; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

// displays related blog entries, galleries, links, etc.
// (thanks, http://www.nat.org/dashboard/)
//
// $self_object_name and $self_object_key identify the object that is
// being displayed; they are passed straight to get_relevance_objects()
//
// returns an HTML fragment containing one "related ..." section for each
// entry of $BUZZWORD_HOOKS['relevance'] that yields at least one object,
// or '' if relevance is disabled or no relevance hooks are registered
function relevance_hook($self_object_name, $self_object_key) {
    // bail out if relevance is disabled
    if (!get_pref('buzzword_relevance_limit'))
        return '';

    $related = '';

    // isset() guards the lookup so that a missing 'relevance' entry (or
    // an unset $BUZZWORD_HOOKS) does not raise an undefined index notice
    global $BUZZWORD_HOOKS;
    if ( (!isset($BUZZWORD_HOOKS['relevance'])) || (!is_array($BUZZWORD_HOOKS['relevance'])) )
        return $related;

    // note: this sorts the global hook list in place
    sort($BUZZWORD_HOOKS['relevance']);
    foreach ($BUZZWORD_HOOKS['relevance'] as $hook) {
        // $hook[1] is the peer object type to look up, $hook[2] is the
        // label used in the section heading
        $objects = get_relevance_objects($self_object_name, $self_object_key, $hook[1]);

        // hooks without any related objects produce no output at all
        if (count($objects)) {
            $related .= "<h1>related {$hook[2]}</h1>\n";
            $related .= "<div class=\"sidebar-container\">\n";

            if (get_pref('buzzword_relevance_terse')) {
                foreach ($objects as $object)
                    $related .= $object->get_display_terse_relevance();
            } else {
                // the non-terse form is wrapped in a single paragraph
                $related .= "<p>\n";
                foreach ($objects as $object)
                    $related .= $object->get_display_relevance();
                $related .= "</p>\n";
            }

            $related .= "</div>\n";
        }
    }

    return $related;
}

// returns objects of type $peer_object_name that are related to
// the object of type $self_object_name with key $self_object_key
//
// two objects are related when they share keywords in the
// relevance_cache table; candidates are ranked by a score built from the
// cached keyword counts (see below), best match first
//
// $peer_object_name is also used as the class name to instantiate, with
// the object key as the only constructor argument
//
// returns an array of at most get_pref('buzzword_relevance_limit')
// objects; the array is empty if nothing is related or the query fails
function get_relevance_objects($self_object_name, $self_object_key, $peer_object_name) {
    // save the unquoted name, it is needed later as a class name
    $object_type = $peer_object_name;

    $self_object_name = mysql_quote_string($self_object_name);
    $self_object_key = (int) $self_object_key;
    $peer_object_name = mysql_quote_string($peer_object_name);

    // perform a self-join to match up keywords
    $sql = 'SELECT ';
    $sql .= 'self_cache.relevance_score AS self_score, ';
    $sql .= 'peer_cache.relevance_score AS peer_score, ';
    $sql .= 'peer_cache.object_name AS object_name, ';
    $sql .= 'peer_cache.object_key AS object_key ';

    $sql .= 'FROM ';
    $sql .= DB_PREFIX.'relevance_cache AS self_cache, ';
    $sql .= DB_PREFIX.'relevance_cache AS peer_cache ';

    $sql .= "WHERE self_cache.object_name = $self_object_name ";
    $sql .= "AND self_cache.object_key = $self_object_key ";
    $sql .= "AND peer_cache.object_name = $peer_object_name ";

    // never relate an object to itself; the key alone is not enough to
    // identify "itself", because objects of different types can share
    // the same key and must not be excluded
    $sql .= "AND NOT (peer_cache.object_name = self_cache.object_name ";
    $sql .= "AND peer_cache.object_key = self_cache.object_key) ";

    $sql .= "AND self_cache.relevance_keyword = peer_cache.relevance_keyword ";
    $result = mysql_query($sql);

    // a failed query means there is nothing to relate
    if (!$result)
        return array();

    $scores = array();

    // assign scores to objects - maybe there's a way to have
    // mysql do this part too?
    while ($row = mysql_fetch_assoc($result)) {
        $key = $row['object_key'];
        if (empty($scores[$key]))
            $scores[$key] = 0;

        // 1.2 points are added for each time the word appears in the
        // original object
        $scores[$key] += ($row['self_score'] * 1.2);

        // 1.0 points are added for each time the word appears in the
        // related object
        $scores[$key] += ($row['peer_score'] * 1.0);
    }

    // conserve memory
    mysql_free_result($result);

    // sort the scores in reverse order (highest score first), keeping
    // the object keys as array keys
    arsort($scores);

    // the limit does not change while we collect objects, read it once
    $limit = get_pref('buzzword_relevance_limit');

    // fetch the objects
    $objects = array();

    foreach (array_keys($scores) as $key) {
        $object = new $object_type($key);
        if ( ($object->exists()) && ($object->is_accessible) && ($object->is_visible) )
            $objects[] = $object;

        // stop when we've reached the limit
        if (count($objects) == $limit)
            break;
    }

    return $objects;
}

// deletes cached key words
//
// removes every relevance_cache row that belongs to the given
// $object_name/$object_key pair; nothing is returned
function flush_relevance_keywords($object_name, $object_key) {
    // the name is quoted for SQL, the key is forced to an integer
    $quoted_name = mysql_quote_string($object_name);
    $numeric_key = (int) $object_key;

    $delete = sprintf(
        'DELETE FROM %srelevance_cache WHERE object_name = %s AND object_key = %d',
        DB_PREFIX,
        $quoted_name,
        $numeric_key
    );

    mysql_query($delete);
}

// caches an array of key words
//
// $object_name, $object_key - identify the object the keywords belong to
// $keywords                 - array mapping keyword => score, as returned
//                             by get_relevance_keywords()
//
// any keywords previously cached for the object are replaced; nothing is
// returned
function cache_relevance_keywords($object_name, $object_key, $keywords) {
    // first, remove all previous keywords for this $object_name/$object_key pair
    flush_relevance_keywords($object_name, $object_key);

    $object_name = mysql_quote_string($object_name);
    $object_key = (int) $object_key;

    // lock the relevance_cache table for performance
    $sql = 'LOCK TABLES '.DB_PREFIX.'relevance_cache WRITE';
    mysql_query($sql);

    // insert the new keywords; foreach always starts at the first element,
    // regardless of where the array's internal pointer was left
    foreach ($keywords as $relevance_keyword => $relevance_score) {
        $relevance_keyword = mysql_quote_string($relevance_keyword);
        $relevance_score = (int) $relevance_score;

        $sql = 'INSERT INTO '.DB_PREFIX.'relevance_cache ';
        $sql .= '(object_name, object_key, relevance_keyword, relevance_score) VALUES ';
        $sql .= "($object_name, $object_key, $relevance_keyword, $relevance_score)";
        mysql_query($sql);
    }

    // unlock the relevance_cache table
    $sql = 'UNLOCK TABLES';
    mysql_query($sql);
}

// given a string, returns an array of key words and their scores
// (thanks, http://libots.sourceforge.net/)
//
// $string - text to analyse; HTML tags, URLs and HTML entities are
//           stripped before the words are examined
//
// returns an array mapping each keyword (lower-cased, reduced to the
// characters a-z and 0-9) to the number of times it was accepted
//
// a word is rejected if it is shorter than 2 or longer than 100
// characters, is on the blacklist or consists entirely of digits;
// otherwise it is accepted if it contains a digit, contains a capital
// letter (a capital that merely starts a sentence does not count) or
// fails the spell check (pspell if available, else the ispell binary)
function get_relevance_keywords($string) {
    $debug = FALSE;
    $keywords = array();

    // strip out HTML, URLs, and HTML entities
    $string = strip_tags($string);
    $string = preg_replace('/(http|ftp)s?:\/\/[^\s]+/i', '', $string);
    $string = preg_replace('/&#?[a-z0-9]+;/i', '', $string);

    // ignore the possessive form (ie. jon's => jon)
    $string = preg_replace('/\'s\s/i', ' ', $string);

    // insert artificial word boundaries
    $string = preg_replace('/[!\?\.-]/', ' . ', $string);

    // split string into words
    $words = preg_split('/\s+/', $string);

    // start as if a sentence boundary had just been seen
    $prev_word = '.';

    // load the keyword blacklist; NULL means "not loaded yet", so an empty
    // or unreadable blacklist is not read again on every call
    static $keyword_blacklist = NULL;

    if ($keyword_blacklist === NULL) {
        $keyword_blacklist = array();

        // only feed file() a readable path, and only loop over its result
        // if it really is an array of lines
        $blacklist_path = '../includes/blacklist.inc';
        $file = is_readable($blacklist_path) ? file($blacklist_path) : FALSE;

        if (is_array($file)) {
            foreach ($file as $line) {
                // blank lines and lines starting with # are skipped
                $line = trim($line);
                if ( ($line != '') && ($line[0] != '#') )
                    $keyword_blacklist[$line] = TRUE;
            }
        }
    }

    // load pspell dictionary; it stays FALSE if pspell is unavailable or
    // the dictionary cannot be opened, in which case ispell is used
    $pspell_dictionary = FALSE;
    if (function_exists('pspell_new'))
        $pspell_dictionary = pspell_new('en');

    // examine each word
    foreach($words as $word) {
        if ($word != '') {
            // strip the word of non-alpha-numeric characters
            $keyword = strtolower(preg_replace('/[^a-z0-9]/i', '', $word));

            // flag to indicate that this word is a keyword
            // 0 => undecided, 1 => keyword, -1 => not a keyword
            $is_keyword = 0;

            // ignore very short (< 2 chars) and very long (> 100 chars) words
            if ($is_keyword == 0) {
                if ( (strlen($keyword) < 2) || (strlen($keyword) > 100) ) {
                    $is_keyword = -1;
                    if ($debug)
                        echo "$word (reject, word is too short or too long)\n";
                }
            }

            // if the word is on the keyword blacklist, it can't be a key word
            if ($is_keyword == 0) {
                if (!empty($keyword_blacklist[$keyword])) {
                   $is_keyword = -1;
                   if ($debug)
                       echo "$word (reject, word is on blacklist)\n";
                }
            }

            // if the word consists entirely of numbers, it's not a keyword
            if ($is_keyword == 0) {
                if (preg_match('/^[0-9]+$/', $keyword)) {
                    $is_keyword = -1;
                    if ($debug)
                        echo "$word (reject, word consists entirely of digits)\n";
                }
            }

            // if the word has a digit in it (but does not consist entirely
            // of digits), it is a keyword
            if ($is_keyword == 0) {
                if (preg_match('/[0-9]/', $keyword)) {
                    $is_keyword = 1;
                    if ($debug)
                        echo "$word (accept, word contains a digit)\n";
                }
            }

            // a capital letter anywhere in the string is considered a keyword
            // unless the previous word was a sentence boundary.  in that
            // case, only a capital letter that follows another letter
            // counts, so a capitalized sentence start is not enough
            if ($is_keyword == 0) {
                $regex = ($prev_word == '.') ? '/[a-zA-Z].*[A-Z]/' : '/[A-Z]/';
                if (preg_match($regex, $word)) {
                    $is_keyword = 1;
                    if ($debug) {
                        echo "$word (accept, word contains capital letter)\n";
                    }
                }
            }

            // if the word is incorrectly spelled, it's a keyword
            if ($is_keyword == 0) {
                // use pspell if possible, it's faster
                if ( ($pspell_dictionary !== FALSE) && function_exists('pspell_check') ) {
                    if (!pspell_check($pspell_dictionary, $keyword)) {
                        $is_keyword = 1;
                        if ($debug)
                            echo "$word (accept, word is incorrectly spelled)\n";
                    }
                } else {
                    $command = 'echo '.escapeshellarg($keyword).' | '.escapeshellarg(get_pref('buzzword_ispell_path')).' -a';

                    // exec() appends to an existing array, so start with an
                    // empty one for every word; otherwise $output[1] would
                    // keep pointing at the result of the first word checked
                    $output = array();
                    exec($command, $output, $exitcode);

                    if ($exitcode)
                        display_error("exec($command) failed with exit code $exitcode, exiting\n<br><br>\n".join("<br>\n", $output));

                    // the second output line is the one inspected; a line
                    // starting with * or + is treated as correctly spelled.
                    // a missing or empty line counts as misspelled
                    $result = isset($output[1]) ? $output[1] : '';
                    $marker = ($result != '') ? $result[0] : '';
                    if ( ($marker != '*') && ($marker != '+') ) {
                        $is_keyword = 1;
                        if ($debug)
                            echo "$word (accept, word is incorrectly spelled)\n";
                    }
                }
            }

            // debugging info
            if ($is_keyword == 0) {
                if ($debug)
                    echo "$word (reject, default)\n";
            }

            // add keywords to an array
            if ($is_keyword == 1) {
                if (empty($keywords[$keyword]))
                    $keywords[$keyword] = 0;
                $keywords[$keyword]++;
            }

            $prev_word = $word;
        }
    }

    return $keywords;
}

?>
Return current item: Buzzword