Location: PHPKode > scripts > Spam Filter > spam-filter/spamfilter.php
<?php
//------------------------------------------------------------------------------------------
// SpamFilter 1.1 - 13/09/2005
// Licensed under GPL
//
// Created by Rafael C.P. (a.k.a. Kurama_Youko)
// Contact: hide@address.com
// Personal homepage: http://www.inf.ufrgs.br/~rcpinto (Portuguese only, sorry!)
//
// SpamFilter is a PHP class to do smart filtering on e-mails or any type of text.
// Could be used in webmail applications or even in less obvious applications like forums and guestbooks.
// It's an adaptive filter, so it really learns what is spam and what is not.
// Because it is adaptive, it will misclassify some texts at the beginning,
// so you must manually move wrongly classified texts to their correct category (NORMAL or SPAM).
//------------------------------------------------------------------------------------------

//------------------------------------------------------------------------------------------
// What's New:
// - Words separated by line breaks are now split correctly;
// - Option for MAP calculation.
// Thanks to Dario for both upgrades!!!
//------------------------------------------------------------------------------------------

//------------------------------------------------------------------------------------------
// Usage:
// include('spamfilter.php');
// $sf = new SpamFilter();
//
// //Basic mail filtering
// *** your e-mail functions go here: fetch the full contents of an e-mail and store it in $text ***
// if ($sf->filter($text)) {
//		*** your e-mail functions to delete mail or move it to a junk folder ***
// }
//
// //It was marked as spam, but it isn't!
// $sf->movefrom($text,SPAM);
// 
// //It was left as normal, but it isn't!
// $sf->movefrom($text,NORMAL);
//------------------------------------------------------------------------------------------

// Category labels (0 = normal text, 1 = spam); an existing definition is left untouched.
defined('NORMAL') || define('NORMAL', 0);
defined('SPAM') || define('SPAM', 1);

/**
 * Adaptive word-frequency spam filter.
 *
 * The knowledge base maps every word seen so far to two counters: how often
 * it occurred in normal text (index 0) and how often in spam (index 1).
 * A text's score is the mean of its words' spam probabilities; filter()
 * classifies a text and immediately learns from its own verdict, and
 * movefrom() corrects a wrong verdict afterwards.
 *
 * The properties are public because there are no setters: callers configure
 * the filter by assigning to them directly (e.g. $sf->map = true).
 */
class SpamFilter {

	/** @var array Spam knowledge base: word => array(0 => normal count, 1 => spam count) */
	public $data = array();

	/** @var float Texts whose score is greater than or equal to this value are spam */
	public $threshold = 0.5;

	/** @var string Spam knowledge base filename */
	public $filename = 'data.skb';

	/** @var bool Auto-save the knowledge base to file after each call to filter() */
	public $autosave = true;

	/** @var bool Use the MAP calculation in wordscore() */
	public $map = false;

	/** @var int Sum of the spam counters of every word (used by the MAP calculation) */
	public $tot_spam = 0;

	/** @var int Sum of the normal counters of every word (used by the MAP calculation) */
	public $tot_nospam = 0;

	/**
	 * Creates the knowledge base file if it does not exist yet and loads it.
	 *
	 * @param string $filename Knowledge base file; empty keeps the default.
	 */
	public function __construct($filename = '') {
		if ($filename) $this->filename = $filename;

		// Make sure a (possibly empty) knowledge base file exists. fopen() may
		// fail (e.g. read-only directory), so only close a real handle.
		if (!file_exists($this->filename)) {
			$handle = fopen($this->filename, 'a');
			if ($handle !== false) fclose($handle);
		}

		$this->load();
	}

	/**
	 * Legacy PHP 4-style constructor name.
	 *
	 * PHP 8 no longer treats a method named after the class as a constructor,
	 * so the real work lives in __construct(); this method only remains for
	 * code that calls it explicitly (e.g. parent::SpamFilter() in a subclass).
	 *
	 * @param string $filename Knowledge base file; empty keeps the default.
	 */
	public function SpamFilter($filename = '') {
		$this->__construct($filename);
	}

	/**
	 * Loads the spam knowledge base from file and recomputes the totals.
	 *
	 * The in-memory knowledge base is reset first, so after a failed load the
	 * filter is empty rather than half-initialised.
	 *
	 * @param string $filename Knowledge base file; empty keeps the current one.
	 * @return bool True on success (an empty file is a valid, empty knowledge
	 *              base); false if the file is unreadable or not a serialized array.
	 */
	public function load($filename = '') {
		if ($filename) $this->filename = $filename;

		$this->data = array();
		$this->tot_spam = 0;
		$this->tot_nospam = 0;

		if (!is_file($this->filename) || !is_readable($this->filename)) return false;

		$contents = file_get_contents($this->filename);
		if ($contents === false) return false;
		if (trim($contents) === '') return true; // freshly created knowledge base

		// NOTE(security): unserialize() must only ever see a file written by
		// save(). Where the option exists (PHP 7+), object instantiation is
		// disabled as well; the knowledge base only holds arrays of integers.
		if (defined('PHP_VERSION_ID') && PHP_VERSION_ID >= 70000) {
			$loaded = unserialize($contents, array('allowed_classes' => false));
		} else {
			$loaded = unserialize($contents);
		}
		if (!is_array($loaded)) return false;

		// Bring every entry into the canonical array(0 => normal, 1 => spam)
		// shape (older files may miss an index or hold negative counters) and
		// rebuild the totals from the cleaned counters.
		$this->data = $loaded;
		foreach (array_keys($loaded) as $word) {
			list($normal, $spam) = $this->counts($word);
			if ($normal === 0 && $spam === 0) {
				unset($this->data[$word]);
				continue;
			}
			$this->data[$word] = array(0 => $normal, 1 => $spam);
			$this->tot_nospam += $normal;
			$this->tot_spam += $spam;
		}

		return true;
	}

	/**
	 * Saves the spam knowledge base to file.
	 *
	 * @param string $filename Knowledge base file; empty keeps the current one.
	 * @return bool True if the whole knowledge base was written, false otherwise.
	 */
	public function save($filename = '') {
		if ($filename) $this->filename = $filename;

		$contents = serialize($this->data);
		// LOCK_EX keeps two concurrent saves from interleaving their output.
		$written = file_put_contents($this->filename, $contents, LOCK_EX);

		return $written !== false && $written === strlen($contents);
	}

	/**
	 * Adds one occurrence of $word to the knowledge base.
	 *
	 * @param string $word Word to count; the empty string is ignored.
	 * @param mixed  $spam Truthy counts the word as spam, falsy as normal.
	 */
	public function addword($word, $spam) {
		$word = (string)$word;
		if ($word === '') return;

		list($normal, $spamCount) = $this->counts($word);
		if ($spam) {
			$spamCount++;
			$this->tot_spam++;
		} else {
			$normal++;
			$this->tot_nospam++;
		}
		$this->data[$word] = array(0 => $normal, 1 => $spamCount);
	}

	/**
	 * Removes one occurrence of $word from the knowledge base.
	 *
	 * Counters never drop below zero, and a word whose counters are both zero
	 * is removed from the knowledge base altogether.
	 *
	 * @param string $word Word to remove; the empty string is ignored.
	 * @param mixed  $spam Truthy decrements the spam counter, falsy the normal one.
	 */
	public function delword($word, $spam) {
		$word = (string)$word;
		if ($word === '') return;

		list($normal, $spamCount) = $this->counts($word);
		if ($spam) {
			if ($spamCount > 0) {
				$spamCount--;
				$this->tot_spam = max(0, $this->tot_spam - 1);
			}
		} else {
			if ($normal > 0) {
				$normal--;
				$this->tot_nospam = max(0, $this->tot_nospam - 1);
			}
		}

		if ($normal === 0 && $spamCount === 0) {
			unset($this->data[$word]);
		} else {
			$this->data[$word] = array(0 => $normal, 1 => $spamCount);
		}
	}

	/**
	 * Normalizes text for comparison: folds the accented letters listed below
	 * to plain ASCII, lower-cases the text and turns whitespace, NUL and
	 * punctuation separators into single spaces (one space per character, so
	 * runs of separators yield runs of spaces).
	 *
	 * The accented letters are matched byte-for-byte, so the text must use the
	 * same encoding as this source file.
	 *
	 * @param string $text Raw text.
	 * @return string Normalized text.
	 */
	public function normalize($text) {
		static $accents = array(
			'ç' => 'c', 'Ç' => 'c',
			'á' => 'a', 'ã' => 'a', 'â' => 'a', 'à' => 'a', 'ä' => 'a',
			'Á' => 'a', 'Ã' => 'a', 'Â' => 'a', 'À' => 'a', 'Ä' => 'a',
			'é' => 'e', 'ê' => 'e', 'è' => 'e', 'ë' => 'e',
			'É' => 'e', 'Ê' => 'e', 'È' => 'e', 'Ë' => 'e',
			'í' => 'i', 'î' => 'i', 'ì' => 'i', 'ï' => 'i',
			'Í' => 'i', 'Î' => 'i', 'Ì' => 'i', 'Ï' => 'i',
			'ó' => 'o', 'õ' => 'o', 'ô' => 'o', 'ò' => 'o', 'ö' => 'o',
			'Ó' => 'o', 'Õ' => 'o', 'Ô' => 'o', 'Ò' => 'o', 'Ö' => 'o',
			'ú' => 'u', 'û' => 'u', 'ù' => 'u', 'ü' => 'u',
			'Ú' => 'u', 'Û' => 'u', 'Ù' => 'u', 'Ü' => 'u',
		);
		static $separators = array(
			"\n", "\r", "\t", "\0",
			',', '.', '-', '?', '!', '"', '\'', '\\', '/', '_', ':',
			'[', ']', '#', '@', ')', '(',
		);

		// Fold accents before lower-casing: strtolower() works on bytes and
		// does not lower-case multi-byte letters such as 'Á'.
		$text = strtr((string)$text, $accents);
		$text = strtolower($text);

		return str_replace($separators, ' ', $text);
	}

	/**
	 * Stores the words of a text in the knowledge base.
	 *
	 * @param string $text Raw text.
	 * @param mixed  $spam Truthy stores the words as spam, falsy as normal.
	 */
	public function store($text, $spam) {
		foreach ($this->tokenize($text) as $word) {
			$this->addword($word, $spam);
		}
	}

	/**
	 * Removes the words of a text from the knowledge base.
	 *
	 * @param string $text Raw text.
	 * @param mixed  $spam Truthy removes from the spam counters, falsy from the normal ones.
	 */
	public function remove($text, $spam) {
		foreach ($this->tokenize($text) as $word) {
			$this->delword($word, $spam);
		}
	}

	/**
	 * Moves the words of a text from one category to the other.
	 *
	 * @param string $text Raw text.
	 * @param int    $spam Category the text is currently stored under
	 *                     (0 = move from normal to spam, 1 = move from spam to normal).
	 */
	public function movefrom($text, $spam) {
		foreach ($this->tokenize($text) as $word) {
			$this->delword($word, $spam);
			$this->addword($word, 1 - $spam);
		}
	}

	/**
	 * Returns the spam probability of a single, already normalized word.
	 *
	 * Without MAP this is spam / (spam + normal). With MAP each counter is
	 * first divided by its category total, so a category with more training
	 * data does not dominate the result.
	 *
	 * @param string $word Normalized word.
	 * @return int|float Value between 0 and 1; 0.5 for a word with no recorded occurrences.
	 */
	public function wordscore($word) {
		list($normal, $spam) = $this->counts($word);
		$total = $normal + $spam;
		if ($total == 0) return 0.5;

		if (!$this->map) return $spam / $total;

		// A category without any training data contributes probability 0
		// instead of triggering a division by zero.
		$pSpam = $this->tot_spam > 0 ? $spam / $this->tot_spam : 0;
		$pNormal = $this->tot_nospam > 0 ? $normal / $this->tot_nospam : 0;
		$evidence = $pSpam + $pNormal;
		if ($evidence == 0) return 0.5;

		return $pSpam / $evidence;
	}

	/**
	 * Returns the spam probability of a text: the mean score of its words.
	 *
	 * Only real words count towards the mean; empty tokens produced by
	 * consecutive separators are ignored.
	 *
	 * @param string $text Raw text.
	 * @return int|float Value between 0 and 1; 0.5 for a text without words.
	 */
	public function textscore($text) {
		$words = $this->tokenize($text);
		$total = count($words);
		if ($total == 0) return 0.5;

		$score = 0;
		foreach ($words as $word) {
			$score += $this->wordscore($word);
		}

		return $score / $total;
	}

	/**
	 * Classifies a text and learns from the verdict: if the text's spam
	 * probability is greater than or equal to the threshold its words are
	 * stored as spam, otherwise as normal text.
	 *
	 * @param string $text      Raw text.
	 * @param mixed  $threshold Optional new threshold; any numeric value
	 *                          (including 0) replaces the current one and stays
	 *                          in effect for later calls.
	 * @return bool True if the text was classified as spam.
	 */
	public function filter($text, $threshold = '') {
		if (is_numeric($threshold)) $this->threshold = $threshold + 0;

		$isSpam = $this->textscore($text) >= $this->threshold;
		$this->store($text, $isSpam ? 1 : 0);
		if ($this->autosave) $this->save();

		return $isSpam;
	}

	/**
	 * Splits a text into its normalized, non-empty words.
	 *
	 * @param string $text Raw text.
	 * @return array List of words.
	 */
	private function tokenize($text) {
		$words = array();
		foreach (explode(' ', $this->normalize($text)) as $word) {
			// Compare against '' so that the word "0" is not discarded.
			if ($word !== '') $words[] = $word;
		}
		return $words;
	}

	/**
	 * Reads the counters of a word without touching missing array keys.
	 *
	 * @param string $word Word to look up.
	 * @return array array(normal count, spam count), both non-negative integers.
	 */
	private function counts($word) {
		$normal = 0;
		$spam = 0;
		if (isset($this->data[$word]) && is_array($this->data[$word])) {
			$entry = $this->data[$word];
			if (isset($entry[0])) $normal = max(0, (int)$entry[0]);
			if (isset($entry[1])) $spam = max(0, (int)$entry[1]);
		}
		return array($normal, $spam);
	}
}
?>
Return current item: Spam Filter