<?php
/*
* buzzword
* Copyright (c) 2003 Jon Tai
*
* $Id: relevance.inc 330 2004-04-17 06:53:38Z jon $
*
* This file is part of buzzword.
*
* buzzword is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* buzzword is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with buzzword; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
// displays related blog entries, galleries, links, etc.
// (thanks, http://www.nat.org/dashboard/)
//
// $self_object_name - type name of the object currently being displayed
// $self_object_key  - key of the object currently being displayed
//
// returns an HTML fragment with one "related ..." section per registered
// relevance hook that has at least one related object, or an empty string
// when relevance is disabled or no relevance hooks are registered
//
// each entry of $BUZZWORD_HOOKS['relevance'] is read as an array: element 1
// is passed to get_relevance_objects() as the peer object name and element 2
// is used in the section heading (element 0 is presumably a sort order,
// since the entries are sorted before use - confirm against the hook
// registration code)
function relevance_hook($self_object_name, $self_object_key) {
    // bail out if relevance is disabled
    if (!get_pref('buzzword_relevance_limit'))
        return '';

    global $BUZZWORD_HOOKS;

    // no hooks registered; checking isset() first avoids an undefined
    // index notice when the 'relevance' slot was never created
    if (!isset($BUZZWORD_HOOKS['relevance']) || !is_array($BUZZWORD_HOOKS['relevance']))
        return '';

    sort($BUZZWORD_HOOKS['relevance']);

    // the display mode does not change between hooks, so read it once
    $terse = get_pref('buzzword_relevance_terse');

    $related = '';
    foreach ($BUZZWORD_HOOKS['relevance'] as $hook) {
        $objects = get_relevance_objects($self_object_name, $self_object_key, $hook[1]);

        // hooks without any related object produce no output at all
        if (!count($objects))
            continue;

        $related .= "<h1>related {$hook[2]}</h1>\n";
        $related .= "<div class=\"sidebar-container\">\n";

        if ($terse) {
            foreach ($objects as $object)
                $related .= $object->get_display_terse_relevance();
        } else {
            // the verbose form is wrapped in a single paragraph
            $related .= "<p>\n";
            foreach ($objects as $object)
                $related .= $object->get_display_relevance();
            $related .= "</p>\n";
        }

        $related .= "</div>\n";
    }

    return $related;
}
// returns objects of type $peer_object_name that are related to
// the object of type $self_object_name with key $self_object_key
//
// $self_object_name - type name of the object to find relatives for
// $self_object_key  - key of that object (cast to an integer)
// $peer_object_name - type name of the related objects; it is also used
//                     as the class name that gets instantiated with each
//                     matching key
//
// returns an array of objects, highest score first, holding at most
// 'buzzword_relevance_limit' entries; only objects whose exists() is true
// and whose is_accessible and is_visible members are set are returned.
// an empty array is returned when the query fails or nothing matches
function get_relevance_objects($self_object_name, $self_object_key, $peer_object_name) {
    // keep the unquoted name, it is needed later to instantiate the objects
    $object_type = $peer_object_name;

    $self_object_name = mysql_quote_string($self_object_name);
    $self_object_key = (int) $self_object_key;
    $peer_object_name = mysql_quote_string($peer_object_name);

    // perform a self-join to match up keywords
    $sql = 'SELECT ';
    $sql .= 'self_cache.relevance_score AS self_score, ';
    $sql .= 'peer_cache.relevance_score AS peer_score, ';
    $sql .= 'peer_cache.object_name AS object_name, ';
    $sql .= 'peer_cache.object_key AS object_key ';
    $sql .= 'FROM ';
    $sql .= DB_PREFIX.'relevance_cache AS self_cache, ';
    $sql .= DB_PREFIX.'relevance_cache AS peer_cache ';
    $sql .= "WHERE self_cache.object_name = $self_object_name ";
    $sql .= "AND self_cache.object_key = $self_object_key ";
    $sql .= "AND peer_cache.object_name = $peer_object_name ";
    // exclude only the object itself: a peer of a different type may
    // legitimately share the same numeric key
    $sql .= "AND (peer_cache.object_name != $self_object_name ";
    $sql .= "OR peer_cache.object_key != $self_object_key) ";
    $sql .= "AND self_cache.relevance_keyword = peer_cache.relevance_keyword ";

    $result = mysql_query($sql);

    // a failed query yields no related objects
    if (!$result)
        return array();

    // assign scores to objects - maybe there's a way to have
    // mysql do this part too?
    $scores = array();
    while ($row = mysql_fetch_assoc($result)) {
        $key = $row['object_key'];

        if (!isset($scores[$key]))
            $scores[$key] = 0;

        // 1.2 points are added for each time the word appears in the
        // original object
        $scores[$key] += ($row['self_score'] * 1.2);

        // 1.0 points are added for each time the word appears in the
        // related object
        $scores[$key] += ($row['peer_score'] * 1.0);
    }

    // conserve memory
    mysql_free_result($result);

    // sort the scores in reverse order, keeping the key association
    arsort($scores);

    // the limit does not change while we iterate, so read it once
    $limit = get_pref('buzzword_relevance_limit');

    // fetch the objects, best match first
    $objects = array();
    foreach (array_keys($scores) as $key) {
        $object = new $object_type($key);

        if ( ($object->exists()) && ($object->is_accessible) && ($object->is_visible) )
            $objects[] = $object;

        // stop when we've reached the limit
        if (count($objects) == $limit)
            break;
    }

    return $objects;
}
// removes every cached keyword row that belongs to one object
//
// $object_name - type name of the object; quoted with mysql_quote_string()
// $object_key  - key of the object; cast to an integer
//
// returns nothing; the outcome of the DELETE query is not inspected
function flush_relevance_keywords($object_name, $object_key) {
    $quoted_name = mysql_quote_string($object_name);
    $numeric_key = (int) $object_key;

    // one statement wipes all keywords of the $object_name/$object_key pair
    $statement = sprintf(
        'DELETE FROM %srelevance_cache WHERE object_name = %s AND object_key = %d',
        DB_PREFIX,
        $quoted_name,
        $numeric_key
    );

    mysql_query($statement);
}
// caches an array of key words
//
// $object_name - type name of the object the keywords belong to
// $object_key  - key of that object (cast to an integer)
// $keywords    - array mapping keyword => score, in the shape returned by
//                get_relevance_keywords(); scores are cast to integers
//
// any keywords previously cached for the object are removed first, so the
// cache ends up holding exactly the given keywords. returns nothing; query
// results are not inspected
function cache_relevance_keywords($object_name, $object_key, $keywords) {
    // first, remove all previous keywords for this $object_name/$object_key pair
    flush_relevance_keywords($object_name, $object_key);

    // nothing to insert, so there is no point in locking the table
    if (!is_array($keywords) || !count($keywords))
        return;

    $object_name = mysql_quote_string($object_name);
    $object_key = (int) $object_key;

    // lock the relevance_cache table for performance
    mysql_query('LOCK TABLES '.DB_PREFIX.'relevance_cache WRITE');

    // insert the new keywords; foreach always starts at the first element,
    // regardless of where the array's internal pointer was left
    foreach ($keywords as $relevance_keyword => $relevance_score) {
        $relevance_keyword = mysql_quote_string($relevance_keyword);
        $relevance_score = (int) $relevance_score;

        $sql = 'INSERT INTO '.DB_PREFIX.'relevance_cache ';
        $sql .= '(object_name, object_key, relevance_keyword, relevance_score) VALUES ';
        $sql .= "($object_name, $object_key, $relevance_keyword, $relevance_score)";
        mysql_query($sql);
    }

    // unlock the relevance_cache table
    mysql_query('UNLOCK TABLES');
}
// given a string, returns an array of key words and their scores
// (thanks, http://libots.sourceforge.net/)
//
// $string - text to analyse; HTML tags, URLs and HTML entities are removed
//           before the text is split into words
//
// returns an array mapping each lower-cased keyword (letters and digits
// only) to the number of times it occurred in $string
//
// a word is judged by the first rule that applies:
//   1. shorter than 2 or longer than 100 characters -> rejected
//   2. listed in ../includes/blacklist.inc           -> rejected
//   3. consists entirely of digits                   -> rejected
//   4. contains a digit                              -> keyword
//   5. contains a capital letter (the first letter does not count
//      directly after a word boundary)               -> keyword
//   6. incorrectly spelled                           -> keyword
//   7. anything else                                 -> rejected
function get_relevance_keywords($string) {
    $debug = FALSE;
    $keywords = array();

    // strip out HTML, URLs, and HTML entities
    $string = strip_tags($string);
    $string = preg_replace('/(http|ftp)s?:\/\/[^\s]+/i', '', $string);
    $string = preg_replace('/&#?[a-z0-9]+;/i', '', $string);

    // ignore the possessive form (ie. jon's => jon), also when the
    // possessive is the very last thing in the string
    $string = preg_replace('/\'s(?=\s|$)/i', '', $string);

    // insert artificial word boundaries
    $string = preg_replace('/[!\?\.-]/', ' . ', $string);

    // split string into words
    $words = preg_split('/\s+/', $string);

    // the text starts at a word boundary
    $prev_word = '.';

    // load the keyword blacklist
    static $keyword_blacklist = array();

    // only re-read it from file if it's not already cached
    if (!count($keyword_blacklist)) {
        $blacklist_path = '../includes/blacklist.inc';
        $lines = is_readable($blacklist_path) ? file($blacklist_path) : FALSE;

        // a missing or unreadable blacklist simply means nothing is blacklisted
        if (is_array($lines)) {
            foreach ($lines as $line) {
                $line = trim($line);

                // skip blank lines and # comments
                if ( ($line != '') && ($line[0] != '#') )
                    $keyword_blacklist[$line] = TRUE;
            }
        }
    }

    // examine each word
    foreach ($words as $word) {
        if ($word === '')
            continue;

        // strip the word of non-alpha-numeric characters
        $keyword = strtolower(preg_replace('/[^a-z0-9]/i', '', $word));
        $length = strlen($keyword);

        // a capital letter anywhere in the string is considered a keyword
        // unless the previous word is a word boundary. in that case,
        // a capital letter anywhere in the string except the first letter
        // is considered a keyword
        $capital_regex = ($prev_word == '.') ? '/[a-zA-Z].*[A-Z]/' : '/[A-Z]/';

        // the first matching rule decides; the spell check comes last
        // because it is by far the most expensive test
        $is_keyword = FALSE;
        if ( ($length < 2) || ($length > 100) ) {
            $verdict = 'reject, word is too short or too long';
        } elseif (!empty($keyword_blacklist[$keyword])) {
            $verdict = 'reject, word is on blacklist';
        } elseif (preg_match('/^[0-9]+$/', $keyword)) {
            $verdict = 'reject, word consists entirely of digits';
        } elseif (preg_match('/[0-9]/', $keyword)) {
            $is_keyword = TRUE;
            $verdict = 'accept, word contains a digit';
        } elseif (preg_match($capital_regex, $word)) {
            $is_keyword = TRUE;
            $verdict = 'accept, word contains capital letter';
        } elseif (_relevance_is_misspelled($keyword)) {
            $is_keyword = TRUE;
            $verdict = 'accept, word is incorrectly spelled';
        } else {
            $verdict = 'reject, default';
        }

        // debugging info
        if ($debug)
            echo "$word ($verdict)\n";

        // count each occurrence of a keyword
        if ($is_keyword) {
            if (empty($keywords[$keyword]))
                $keywords[$keyword] = 0;
            $keywords[$keyword]++;
        }

        $prev_word = $word;
    }

    return $keywords;
}

// private helper of get_relevance_keywords(): tells whether $keyword is
// incorrectly spelled
//
// pspell is used when the extension is available and its 'en' dictionary
// could be loaded; otherwise the ispell binary configured in the
// 'buzzword_ispell_path' preference is run. returns TRUE when the word
// is not recognised
function _relevance_is_misspelled($keyword) {
    // load the pspell dictionary only once, and only when it is needed
    static $pspell_dictionary = null;

    // use pspell if possible, it's faster
    if (function_exists('pspell_new') && function_exists('pspell_check')) {
        if ($pspell_dictionary === null)
            $pspell_dictionary = pspell_new('en');

        // fall through to ispell when the dictionary failed to load
        if ($pspell_dictionary !== FALSE)
            return !pspell_check($pspell_dictionary, $keyword);
    }

    $command = 'echo '.escapeshellarg($keyword).' | '.escapeshellarg(get_pref('buzzword_ispell_path')).' -a';

    // exec() appends to an existing array, so always start from an
    // empty one; otherwise a stale verdict would be read back
    $output = array();
    $exitcode = 0;
    exec($command, $output, $exitcode);

    if ($exitcode)
        display_error("exec($command) failed with exit code $exitcode, exiting\n<br><br>\n".join("<br>\n", $output));

    // the first output line is skipped and the second one is read as the
    // verdict for our word; '*' and '+' mean the word was recognised
    $result = isset($output[1]) ? $output[1] : '';
    $flag = substr($result, 0, 1);

    return ($flag != '*') && ($flag != '+');
}
?>