<?
//------------------------------------------------------------------------------------------
// SpamFilter 1.1 - 13/09/2005
// Licensed under GPL
//
// Created by Rafael C.P. (a.k.a. Kurama_Youko)
// Contact: hide@address.com
// Personal homepage: http://www.inf.ufrgs.br/~rcpinto (Portuguese only, sorry!)
//
// SpamFilter is a PHP class to do smart filtering on e-mails or any type of text.
// Could be used in webmail applications or even in less obvious applications like forums and guestbooks.
// It's an adaptive filter, so it really learns what is spam and what is not.
// Being adaptive, it will misclassify some texts at the beginning,
// so you must manually move the misclassified texts to their correct category (NORMAL or SPAM).
//------------------------------------------------------------------------------------------
//------------------------------------------------------------------------------------------
// What´s New:
// - Words separated by line breaks now are separated correctly;
// - Option for MAP calculation.
// Thanks to Dario for both upgrades!!!
//------------------------------------------------------------------------------------------
//------------------------------------------------------------------------------------------
// Usage:
// include('spamfilter.php');
// $sf = new SpamFilter();
//
// //Basic mail filtering
// *** here goes your e-mail functions getting full contents of an e-mail and storing it in $text ***
// if ($sf->filter($text)) {
// *** your e-mail functions to delete mail or move it to a junk folder ***
// }
//
// //It was marked as spam, but it isn´t!
// $sf->movefrom($text,SPAM);
//
// //It was left as normal, but it isn´t!
// $sf->movefrom($text,NORMAL);
//------------------------------------------------------------------------------------------
// Category identifiers used by the SpamFilter API (0 = normal text, 1 = spam).
// Each constant is only defined when the including script has not defined it already.
defined('NORMAL') or define('NORMAL', 0);
defined('SPAM') or define('SPAM', 1);
//Adaptive word-frequency spam filter.
//Every filtered text is split into words; per-word counters of "seen in normal text" and
//"seen in spam" form the knowledge base, which is serialized to a file between requests.
class SpamFilter {
    public $data = array(); //Spam knowledge base: word => array(0 => normal count, 1 => spam count)
    public $threshold = 0.5; //Spam threshold
    public $filename = 'data.skb'; //Spam knowledge base filename
    public $autosave = true; //Auto-save spam knowledge base to file after each filtering or correction
    public $map = false; //Use MAP calculation
    public $tot_spam = 0; //Sum of all spam counters in $data (used by the MAP calculation)
    public $tot_nospam = 0; //Sum of all normal counters in $data (used by the MAP calculation)

    //Constructor
    // $filename: optional knowledge base file; when empty the default $filename is kept.
    //The file is created when missing and then loaded.
    function __construct($filename='') {
        if ($filename) $this->filename = $filename;
        if (!file_exists($this->filename)) {
            //A failed fopen() returns false; it must not be handed to fclose()
            $handle = fopen($this->filename,'a');
            if ($handle !== false) fclose($handle);
        }
        $this->load($this->filename);
    }

    //PHP 4-style constructor name, kept as a plain method so existing code that calls
    //$this->SpamFilter() or parent::SpamFilter() keeps working. It is not a constructor on PHP 8.
    function SpamFilter($filename='') {
        $this->__construct($filename);
    }

    //Loads spam knowledge base from file
    // $filename: optional file to switch to before loading.
    // Returns true on success (an empty file is a valid, empty knowledge base),
    // false when the file cannot be read or does not contain a serialized array.
    //The in-memory knowledge base and both totals are always reset first.
    function load($filename='') {
        if ($filename) $this->filename = $filename;
        $this->data = array();
        $this->tot_spam = 0;
        $this->tot_nospam = 0;
        //Read the raw bytes; joining file() lines with spaces would alter the serialized payload
        $contents = file_get_contents($this->filename);
        if ($contents === false) return false;
        if ($contents === '') return true;
        //NOTE(security): unserialize() must never instantiate objects from this file.
        //The options argument only exists since PHP 7, hence the version switch.
        if (version_compare(PHP_VERSION,'7.0.0','>='))
            $loaded = unserialize($contents,array('allowed_classes' => false));
        else
            $loaded = unserialize($contents);
        if (!is_array($loaded)) return false;
        //Rebuild every entry so both counters exist and are non-negative integers
        //(files written by older versions may hold missing, null or negative counters)
        foreach ($loaded as $word => $counts) {
            $normal = (is_array($counts) && isset($counts[0])) ? max(0,(int)$counts[0]) : 0;
            $spam = (is_array($counts) && isset($counts[1])) ? max(0,(int)$counts[1]) : 0;
            $this->data[$word] = array(0 => $normal, 1 => $spam);
            $this->tot_nospam += $normal;
            $this->tot_spam += $spam;
        }
        return true;
    }

    //Saves spam knowledge base to file
    // $filename: optional file to switch to before saving.
    // Returns true when the whole knowledge base was written, false otherwise.
    function save($filename='') {
        if ($filename) $this->filename = $filename;
        //LOCK_EX keeps two concurrent requests from interleaving their writes
        return file_put_contents($this->filename,serialize($this->data),LOCK_EX) !== false;
    }

    //Adds word $word to spam knowledge base (0=normal,1=spam)
    //Empty words are ignored. Both counters of a new word start at 0 and the matching
    //total is kept in sync so the MAP calculation never works on stale totals.
    function addword($word,$spam) {
        $word = (string)$word;
        if ($word === '') return;
        if (!isset($this->data[$word]) || !is_array($this->data[$word]))
            $this->data[$word] = array(0 => 0, 1 => 0);
        else
            $this->data[$word] += array(0 => 0, 1 => 0); //fill in a missing counter only
        if ($spam) {
            $this->data[$word][1]++;
            $this->tot_spam++;
        }
        else {
            $this->data[$word][0]++;
            $this->tot_nospam++;
        }
    }

    //Deletes word $word from spam knowledge base (0=normal,1=spam)
    //A counter that is missing or already 0 is left alone, so counters never turn negative.
    function delword($word,$spam) {
        $word = (string)$word;
        if ($word === '') return;
        $category = $spam ? 1 : 0;
        if (!isset($this->data[$word][$category]) || $this->data[$word][$category] <= 0) return;
        $this->data[$word][$category]--;
        if ($category) $this->tot_spam = max(0,$this->tot_spam - 1);
        else $this->tot_nospam = max(0,$this->tot_nospam - 1);
    }

    //Builds (once) the character translation table used by normalize():
    //separators become a space, accented letters become their plain ASCII letter.
    private function replacements() {
        static $table = null;
        if ($table === null) {
            $table = array();
            $separators = array("\n","\r","\t","\0",',','.','-','?','!','"','\'','\\','/','_',':','[',']','#','@',')','(');
            foreach ($separators as $separator) $table[$separator] = ' ';
            //Uppercase variants are listed too because strtolower() only folds ASCII letters
            $accents = array(
                'c' => array('ç','Ç'),
                'a' => array('á','ã','â','à','ä','Á','Ã','Â','À','Ä'),
                'e' => array('é','ê','è','ë','É','Ê','È','Ë'),
                'i' => array('í','î','ì','ï','Í','Î','Ì','Ï'),
                'o' => array('ó','õ','ô','ò','ö','Ó','Õ','Ô','Ò','Ö'),
                'u' => array('ú','û','ù','ü','Ú','Û','Ù','Ü')
            );
            foreach ($accents as $plain => $variants)
                foreach ($variants as $variant) $table[$variant] = $plain;
        }
        return $table;
    }

    //Normalizes text for comparison
    //Lowercases the text, turns line breaks and punctuation into spaces and strips accents.
    //NOTE(review): the accented letters match only when the text uses the same encoding as this file.
    function normalize($text) {
        return strtr(strtolower((string)$text),$this->replacements());
    }

    //Splits a text into its normalized, non-empty words.
    private function tokenize($text) {
        $words = array();
        foreach (explode(' ',$this->normalize($text)) as $piece) {
            //Consecutive separators produce empty pieces; "0" is a real word and is kept
            if ($piece !== '') $words[] = $piece;
        }
        return $words;
    }

    //Stores text words into spam knowledge base (0=normal,1=spam)
    function store($text,$spam) {
        foreach ($this->tokenize($text) as $word) {
            $this->addword($word,$spam);
        }
    }

    //Removes text words from spam knowledge base (0=normal,1=spam)
    function remove($text,$spam) {
        foreach ($this->tokenize($text) as $word) {
            $this->delword($word,$spam);
        }
    }

    //Moves text words in spam knowledge base from a type to another (0=from normal,1=from spam)
    //This is the manual correction entry point, so the result is written to disk when
    //$autosave is on; otherwise a correction made in one request would be lost.
    function movefrom($text,$spam) {
        $target = $spam ? 0 : 1;
        foreach ($this->tokenize($text) as $word) {
            $this->delword($word,$spam);
            $this->addword($word,$target);
        }
        if ($this->autosave) $this->save();
    }

    //Returns spam probability for a single word
    //Unknown words (and words whose counters are both 0) score a neutral 0.5.
    //Without MAP the score is spam count / (spam count + normal count).
    //With MAP (Dario's contribution) each count is first divided by its category total:
    // P(word|spam) / (P(word|spam) + P(word|normal)).
    function wordscore($word) {
        $normal = isset($this->data[$word][0]) ? max(0,$this->data[$word][0]) : 0;
        $spam = isset($this->data[$word][1]) ? max(0,$this->data[$word][1]) : 0;
        $total = $normal + $spam;
        if ($total == 0) return 0.5;
        if (!$this->map) return $spam / $total;
        //An empty category contributes probability 0 instead of dividing by zero
        $pSpam = $this->tot_spam > 0 ? $spam / $this->tot_spam : 0;
        $pNormal = $this->tot_nospam > 0 ? $normal / $this->tot_nospam : 0;
        $evidence = $pSpam + $pNormal;
        if ($evidence <= 0) return 0.5;
        return $pSpam / $evidence;
    }

    //Returns spam probability for a text
    //This is the average word score over the non-empty words; a text without words scores 0.5.
    function textscore($text) {
        $words = $this->tokenize($text);
        $total = count($words);
        if ($total == 0) return 0.5;
        $score = 0;
        foreach ($words as $word) {
            $score += $this->wordscore($word);
        }
        return $score / $total;
    }

    //If text's spam probability is greater or equal to threshold
    // store it as spam, else, as normal text
    // $threshold: optional new threshold; when given it replaces $this->threshold permanently.
    // Returns true when the text was classified (and stored) as spam, false otherwise.
    function filter($text,$threshold='') {
        if ($threshold) $this->threshold = $threshold;
        $isSpam = !($this->textscore($text) < $this->threshold);
        $this->store($text,$isSpam ? 1 : 0);
        if ($this->autosave) $this->save();
        return $isSpam;
    }
}
?>