<?php
/*
* IntelligentCensor v2.0
* Copyright 2006-2008 sk89q
* Written by sk89q
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
/**
* @package com.therisenrealm.text
*/
/**
* Censors text while taking into account wildcard substitutions and casing.
*
* @package com.therisenrealm.text
* @version 2.0
* @author sk89q
* @copyright Copyright (c) 2006-2008, sk89q
*/
class IntelligentCensor
{
    /**
     * Words to censor, as a map of pattern => replacement.
     *
     * Patterns are matched case-insensitively on word boundaries and may
     * contain these wildcards:
     *  - "%" captures any run of word characters; when it is the first or
     *        last character of the pattern, the captured text is censored
     *        recursively and re-attached to the replacement,
     *  - "*" matches any run of word characters, which is discarded,
     *  - "+" allows the preceding character to repeat.
     *
     * @var array
     */
    public $inclusion = array();

    /**
     * Words that must never be censored. Entries must match the whole
     * candidate word (case-insensitively) and may use "*" and "+" with the
     * same meaning as in the inclusion list.
     *
     * @var array
     */
    public $exclusion = array();

    /**
     * Fallback text returned when the recursion limit is reached.
     *
     * @var string
     */
    public $censored_text = "[censored]";

    /**
     * Maximum recursion depth for censoring wildcard captures.
     *
     * @var int
     */
    public $max_recursion = 15;

    /**
     * Recursion level of the censor() call currently running the callback.
     *
     * @var int
     */
    private $c_level;

    /**
     * Inclusion pattern currently being applied (read by the callback).
     *
     * @var string
     */
    private $c_censor;

    /**
     * Replacement for the pattern currently being applied (read by the callback).
     *
     * @var string
     */
    private $c_replacement;

    /**
     * Constructs the object.
     *
     * @param array $inclusion Censor word list (pattern => replacement)
     * @param array $exclusion Censor exclusion word list
     */
    public function __construct(array $inclusion, array $exclusion = array())
    {
        $this->inclusion = $inclusion;
        $this->exclusion = $exclusion;
    }

    /**
     * Prepends the text captured by a leading % wildcard to the stem.
     * This implementation performs a plain concatenation; subclasses may
     * override it to apply grammar rules.
     *
     * @param string $stem The stem of the word
     * @param string $beginning The beginning to be prepended
     * @param string $replacement The censor replacement from the word list (for reference)
     * @return string Final attached version with beginning and stem
     */
    protected function attach_beginning($stem, $beginning, $replacement)
    {
        return $beginning.$stem;
    }

    /**
     * Appends the text captured by a trailing % wildcard to the stem,
     * adjusting the junction so that common English endings read correctly:
     *  - replacement ends in "y" and the ending is "s"  => "...ies"
     *  - replacement ends in "e" and the ending is "ing" or "ed" => the "e" is dropped
     *  - replacement ends in "p" and the ending is "y"  => the "p" is doubled
     *
     * The ending is compared case-insensitively so that upper-case input
     * (e.g. "ING") is conjugated the same way as lower-case input.
     *
     * @param string $stem The stem of the word
     * @param string $ending The ending to be appended
     * @param string $replacement The censor replacement from the word list (for reference)
     * @return string Final attached version with stem and ending
     */
    protected function attach_ending($stem, $ending, $replacement)
    {
        $replacement = (string) $replacement;
        // An empty replacement has no last letter; avoid an invalid string offset.
        $last_letter = ($replacement === '') ? '' : substr($replacement, -1);
        $suffix = strtolower((string) $ending);

        if ($last_letter === "y" && $suffix === "s") {
            // y => ies (keep the ending's casing: "s" => "ies", "S" => "IES")
            $stem = substr($stem, 0, -1);
            $ending = ($ending === "s") ? "ies" : "IES";
        } else if ($last_letter === "e" && ($suffix === "ing" || $suffix === "ed")) {
            // e => ing, e => ed
            $stem = substr($stem, 0, -1);
        } else if ($last_letter === "p" && $suffix === "y") {
            // Repeated consonants
            $stem .= $last_letter;
        }

        return $stem.$ending;
    }

    /**
     * Copies the casing of the original uncensored word onto the censored
     * version:
     *  - if the original has no upper-case letters (including originals with
     *    no letters at all), the censored text is returned untouched so that
     *    any casing in the replacement is preserved;
     *  - if the original is entirely upper-case, the result is upper-cased;
     *  - otherwise the casing is copied character by character from i=0
     *    (i.e. hAppY => sUpeRcrazy); characters beyond the length of the
     *    original are left as they are. Non-letters in the original count
     *    as lower-case.
     *
     * @param string $final The string that needs to be transformed
     * @param string $uncensored The original uncensored version, used for reference
     * @return string Final transformed string
     */
    protected function adapt_casing($final, $uncensored)
    {
        $final = (string) $final;
        $uncensored = (string) $uncensored;

        // Strict comparisons: loose == would compare numeric-looking words
        // such as "1e5" and "1E5" as numbers.
        if ($uncensored === strtolower($uncensored)) {
            // Lower-case (or caseless) original: keep the replacement's own casing
            return $final;
        }
        if ($uncensored === strtoupper($uncensored)) {
            return strtoupper($final);
        }

        // Mixed case: copy the capitalization letter by letter
        $shared = min(strlen($final), strlen($uncensored));
        $adapted = '';
        for ($i = 0; $i < $shared; $i++) {
            $reference = $uncensored[$i];
            if ($reference !== strtolower($reference)) {
                $adapted .= strtoupper($final[$i]);
            } else {
                $adapted .= strtolower($final[$i]);
            }
        }

        // Ran out of reference letters; the remainder is kept unchanged
        return $adapted.substr($final, $shared);
    }

    /**
     * Tells whether a matched word is protected by the exclusion list.
     *
     * @param string $word The matched word
     * @return bool True if the word must be left uncensored
     */
    private function is_excluded($word)
    {
        foreach ($this->exclusion as $match) {
            $pattern = preg_quote((string) $match, '#'); // Make it regexp safe
            $pattern = str_replace('\\*', '\\w*', $pattern); // For matches
            $pattern = str_replace('\\+', '+', $pattern); // For repeating characters
            if (preg_match('#^'.$pattern.'$#i', $word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Used as the censor callback. Builds the replacement for one match of
     * the pattern currently stored in the c_* properties.
     *
     * @param array $m Matches
     * @return string Censored version (or the original match if it is excluded)
     */
    private function censor_callback(array $m)
    {
        // Copy the "bound" state first: the recursive censor() calls below
        // overwrite and then restore these properties.
        $level = $this->c_level;
        $censor = (string) $this->c_censor;
        $replacement = (string) $this->c_replacement;

        // Exclusion!
        if ($this->is_excluded($m[0])) {
            return $m[0];
        }

        // Get rid of wildcard characters in the replacement
        $replacement_clean = str_replace(array("%", "*", "+"), "", $replacement);
        $final = '';
        $last = count($m) - 1;

        // Add beginning match
        if (substr($censor, 0, 1) === "%" && $m[1] !== '') {
            $beginning = $this->censor($m[1], $level + 1, $level, $censor, $replacement);
            // Adapt grammar
            $final = $this->attach_beginning($final, $beginning, $replacement_clean);
        }

        // Add the replacement
        $final .= $replacement_clean;

        // Add ending match
        if (substr($censor, -1) === "%" && $m[$last] !== '') {
            $end = $this->censor($m[$last], $level + 1, $level, $censor, $replacement);
            // Adapt grammar
            $final = $this->attach_ending($final, $end, $replacement_clean);
        }

        // Casing, then trim
        return trim($this->adapt_casing($final, $m[0]));
    }

    /**
     * Performs censoring on a string of text.
     *
     * Blank text is returned unchanged. When $level has reached
     * $max_recursion, $censored_text is returned instead. Empty patterns
     * are skipped, and a pattern whose regular expression fails leaves
     * the text as it was instead of discarding it.
     *
     * @param string $text The text
     * @param int $level Recursion level; internally incremented number
     * @param int $old_level Callback state to restore on return (internal, for recursion)
     * @param string|int $old_censor Callback state to restore on return (internal, for recursion)
     * @param string $old_replacement Callback state to restore on return (internal, for recursion)
     * @return string Final censored text string
     */
    public function censor($text, $level = 0, $old_level = 0, $old_censor = 0, $old_replacement = '')
    {
        // Blank?
        if (trim((string) $text) === "") {
            return $text;
        }

        // Too much recursion
        if ($level >= $this->max_recursion) {
            return $this->censored_text;
        }

        foreach ($this->inclusion as $censor => $replacement) {
            // PHP turns numeric-looking array keys into integers
            $censor = (string) $censor;
            if ($censor === '') {
                // An empty pattern would match at every word boundary
                continue;
            }

            // Need this for the callback
            $this->c_level = $level;
            $this->c_censor = $censor;
            $this->c_replacement = $replacement;

            $pattern = preg_quote($censor, '#'); // Make it regexp safe
            $pattern = "($pattern)"; // Add surrounding parentheses so we can add )( later
            $pattern = str_replace("%", ')(\\w*)(', $pattern); // For beginning and end matches
            $pattern = str_replace('\\*', '\\w*', $pattern); // For matches that don't matter
            $pattern = str_replace('\\+', '+', $pattern); // For repeating characters
            $pattern = str_replace("()", "", $pattern); // Clean up useless match groups

            $result = preg_replace_callback('#\\b'.$pattern.'\\b#i', array($this, 'censor_callback'), $text);
            // preg_replace_callback() returns null on error (e.g. an invalid
            // pattern); keep the text as it was rather than losing it.
            if ($result !== null) {
                $text = $result;
            }
        }

        // Fake scope...
        // The callback reads its "bound variables" from properties of this
        // object. That works for a linear stack, but when recursion is
        // involved the caller's values have to be put back before returning.
        $this->c_level = $old_level;
        $this->c_censor = $old_censor;
        $this->c_replacement = $old_replacement;

        return $text;
    }
}