<?php
/**
* TOKENIZER library
*
* This library has the following features:
* 1. you can define a list of different sequences to cut tokens, not only a single char.
* 2. you can define a list of markers to avoid the cutting of tokens.
* 3. for each marker you can define opening, closure and escaping sequence of chars: you are not obliged to use only one char.
*
* @package tokenizer
* @version 1
* @author Domenico Pontari <hide@address.com>
* @copyright Copyright (c) 2010, Domenico Pontari
* @license http://opensource.org/licenses/bsd-license.php New and Simplified BSD licenses
*/
/**
* This class defines the sequences of chars that prevent tokens from being cut
*
* Markers are sequences of chars to avoid the cutting of tokens. For each marker
* you can define opening, closure and escaping sequence of chars:
* you are not obliged to use only one char.
* @package tokenizer
*/
class marker {
    protected $name;
    protected $opening;
    protected $closure;
    protected $escaping;

    /**
     * Build a marker; the arguments are forwarded unchanged to setMarker().
     *
     * @param string $opening  sequence that starts the protected region
     * @param string $closure  sequence that ends it (defaults to $opening)
     * @param string $escaping sequence ignored while looking for the closure
     * @param string $name     label of the marker (defaults to $opening)
     */
    public function __construct ($opening, $closure = '', $escaping = '', $name = '') {
        $this->setMarker($opening, $closure, $escaping, $name);
    }

    /**
     * (Re)define every sequence of the marker in one call.
     *
     * @param string $opening  sequence that starts the protected region
     * @param string $closure  sequence that ends it; when empty the opening is reused
     * @param string $escaping sequence ignored while looking for the closure
     * @param string $name     label of the marker; when empty the opening is reused
     * @return void
     */
    public function setMarker ($opening, $closure = '', $escaping = '', $name = '') {
        // The comparisons are intentionally loose so that values such as null
        // fall back to the opening sequence exactly like an empty string does.
        $effectiveClosure = $closure;
        if ($closure == '') {
            $effectiveClosure = $opening;
        }

        $effectiveName = $name;
        if ($name == '') {
            $effectiveName = $opening;
        }

        $this->opening  = $opening;
        $this->closure  = $effectiveClosure;
        $this->escaping = $escaping;
        $this->name     = $effectiveName;
    }

    /** @return string label of the marker */
    public function getName () {
        return $this->name;
    }

    /** @return string sequence that starts the protected region */
    public function getOpening () {
        return $this->opening;
    }

    /** @return string sequence that ends the protected region */
    public function getClosure () {
        return $this->closure;
    }

    /** @return string escaping sequence (may be empty) */
    public function getEscaping () {
        return $this->escaping;
    }
}
/**
* This class performs the tokenization of a string into pieces
*
* @package tokenizer
*/
class tokenizer {
    /**
     * @var array list of literal strings on which the input is cut
     */
    protected $limits = array();
    /**
     * @var array list of marker objects delimiting regions that must not be cut
     * @see marker
     */
    protected $markers = array();
    /**
     * @var array|false tokens produced by the last tokenize() call; false before the first call
     */
    protected $tokens = false;
    /**
     * @var array|false one boolean per token, true when that token is a limit; false before the first call
     */
    protected $limitsMap = false;

    /**
     * Replace the list of markers used by the tokenizer.
     *
     * @param array $markers objects exposing getOpening(), getClosure() and getEscaping()
     * @return void
     */
    public function setMarkers ($markers) {
        $this->markers = $markers;
    }

    /**
     * Replace the list of limits used by the tokenizer.
     *
     * @param array $limits literal strings on which the input is cut
     * @return void
     */
    public function setLimits ($limits) {
        $this->limits = $limits;
    }

    /**
     * Retrieve again the result of the last tokenize() call.
     *
     * @param bool $withLimits when true the limit tokens are kept in the result
     * @return array|false the tokens, or false when tokenize() has not been called yet
     * @see tokenize
     */
    public function getTokens ($withLimits = false) {
        if ($withLimits || $this->tokens === false) {
            return $this->tokens;
        }
        $result = array();
        foreach ($this->tokens as $num => $token) {
            if (!$this->limitsMap[$num]) {
                $result[] = $token;
            }
        }
        return $result;
    }

    /**
     * @return array|false one boolean per token of the last tokenize() call
     *                     (true when the token is a limit), or false before the first call
     */
    public function getLimitsMap () {
        return $this->limitsMap;
    }

    /**
     * Execute a tokenization.
     *
     * The string is first cut on every limit (limits are kept as tokens of
     * their own); the pieces lying inside a region opened by a marker are then
     * glued back together. Markers whose opening equals their closure (quotes)
     * are opaque: nothing but their escaping and closure is recognised inside
     * them. Markers with a distinct closure (brackets) nest, and other markers
     * may open inside them.
     *
     * @param string $string the text to tokenize
     * @return array the tokens, limits included (an empty array when the split fails)
     */
    public function tokenize ($string) {
        $pieces = $this->splitOnLimits((string) $string);
        $this->tokens = $this->mergeProtectedPieces($pieces);

        $this->limitsMap = array();
        foreach ($this->tokens as $token) {
            $this->limitsMap[] = in_array($token, $this->limits, true);
        }
        return $this->tokens;
    }

    /**
     * Cut the string on every limit, keeping the limits as separate pieces.
     *
     * @param string $string
     * @return array
     */
    private function splitOnLimits ($string) {
        $alternatives = array();
        foreach ($this->limits as $limit) {
            $limit = (string) $limit;
            // An empty limit would match between every pair of characters.
            if ($limit !== '') {
                // Limits are literal strings: neutralise regex metacharacters
                // and the pattern delimiter.
                $alternatives[] = preg_quote($limit, '/');
            }
        }

        if (count($alternatives) == 0) {
            // Nothing to cut on: the whole string is a single token.
            return ($string === '') ? array() : array($string);
        }

        $pieces = preg_split(
            '/(' . implode('|', $alternatives) . ')/',
            $string,
            -1,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );
        return ($pieces === false) ? array() : $pieces;
    }

    /**
     * Glue together the pieces that lie inside a region opened by a marker.
     *
     * @param array $pieces result of splitOnLimits()
     * @return array
     */
    private function mergeProtectedPieces ($pieces) {
        $specs = array();
        foreach ($this->markers as $marker) {
            $opening = (string) $marker->getOpening();
            if ($opening === '') {
                // An empty opening would match everywhere without consuming input.
                continue;
            }
            $closure = (string) $marker->getClosure();
            if ($closure === '') {
                $closure = $opening;
            }
            $specs[] = array($opening, $closure, (string) $marker->getEscaping());
        }

        $tokens = array();
        $open = array(); // stack of the marker specs currently open, innermost last
        foreach ($pieces as $piece) {
            if (count($open) > 0) {
                $tokens[count($tokens) - 1] .= $piece;
            } else {
                $tokens[] = $piece;
            }
            $open = $this->scanPiece($piece, $specs, $open);
        }
        return $tokens;
    }

    /**
     * Walk through one piece and update the stack of open markers.
     *
     * @param string $piece
     * @param array  $specs list of array(opening, closure, escaping)
     * @param array  $open  stack of specs open before the piece
     * @return array stack of specs open after the piece
     */
    private function scanPiece ($piece, $specs, $open) {
        $length = strlen($piece);
        $i = 0;
        while ($i < $length) {
            $depth = count($open);
            if ($depth > 0) {
                list($opening, $closure, $escaping) = $open[$depth - 1];
                if ($escaping !== '' && $this->startsAt($piece, $i, $escaping)) {
                    // The escaping sequence hides a closure from the scan.
                    $i += strlen($escaping);
                    continue;
                }
                if ($this->startsAt($piece, $i, $closure)) {
                    array_pop($open);
                    $i += strlen($closure);
                    continue;
                }
                if ($opening === $closure) {
                    // Quote-like marker: its content is opaque.
                    $i++;
                    continue;
                }
            }

            $advance = 1;
            foreach ($specs as $spec) {
                if ($this->startsAt($piece, $i, $spec[0])) {
                    $open[] = $spec;
                    $advance = strlen($spec[0]);
                    break;
                }
            }
            $i += $advance;
        }
        return $open;
    }

    /**
     * Tell whether $needle occurs in $haystack exactly at $offset.
     *
     * @param string $haystack
     * @param int    $offset
     * @param string $needle
     * @return bool
     */
    private function startsAt ($haystack, $offset, $needle) {
        return substr($haystack, $offset, strlen($needle)) === $needle;
    }

    /**
     * Set default markers for the tokenizer:
     * 1. () without escaping chars
     * 2. ' with \' as escaping sequence
     * 3. " with \" as escaping sequence
     *
     * @return void
     */
    public function setDefaultMarkers () {
        $this->markers = array(
            new marker('(', ')'),
            new marker("'", "'", "\\'"),
            new marker('"', '"', '\\"'),
        );
    }

    /**
     * Set default limit: the blank space
     * @return void
     */
    public function setDefaultLimits () {
        $this->limits = array(' ');
    }

    /**
     * Return a new tokenizer with default limit and markers
     * @return tokenizer
     */
    public static function getDefaultTokenizer () {
        $result = new tokenizer();
        $result->setDefaultLimits();
        $result->setDefaultMarkers();
        return $result;
    }
}
/**
* @package tokenizer
*/
class parser {
    /** @var string expression to parse */
    protected $expression;
    /** @var tokenizer object whose tokenize() method cuts the expression */
    protected $tokenizer;
    /** @var array list of strings, each one an operator of the expression */
    protected $operators;
    /** @var string message describing the last failed lookup, empty when none */
    protected $errorMsg = '';
    /** @var array operators and operands found by the last parse() call */
    protected $elements = array();
    /** @var array one boolean per element, true when the element is an operator */
    protected $operatorsMap = array();

    /**
     * When no tokenizer is given a default one is created automatically.
     *
     * @param string          $expression
     * @param array           $operators list of operator strings
     * @param tokenizer|false $tokenizer
     */
    public function __construct ($expression, $operators, $tokenizer = false) {
        if ($tokenizer === false) {
            $tokenizer = tokenizer::getDefaultTokenizer();
        }
        $this->setTokenizer($tokenizer);
        $this->setOperators($operators);
        $this->setExpression($expression);
    }

    /**
     * @param string $expression
     * @return void
     */
    public function setExpression ($expression) {
        $this->expression = $expression;
    }

    /**
     * @param tokenizer $tokenizer
     * @return void
     * @see tokenizer
     */
    public function setTokenizer ($tokenizer) {
        $this->tokenizer = $tokenizer;
    }

    /**
     * @param array $operators an array of strings in which each string is an operator in the expression
     * @return void
     */
    public function setOperators ($operators) {
        $this->operators = $operators;
    }

    /**
     * @return string message set by the last operator/operand lookup, empty when it succeeded
     */
    public function getErrorMsg () {
        return $this->errorMsg;
    }

    /**
     * Parse the expression to find operators and operands.
     *
     * Every operator token becomes an element of its own; the consecutive
     * tokens lying between two operators are concatenated into one operand.
     *
     * @return array the elements (operators and operands) in expression order
     */
    public function parse () {
        $tokens = $this->tokenizer->tokenize($this->expression);
        if (!is_array($tokens)) {
            $tokens = array();
        }

        $elements = array();
        $operatorsMap = array();
        $previousWasOperator = false;
        foreach ($tokens as $token) {
            // Strict comparison: the same test decides both where an element
            // starts and how it is flagged in the operators map.
            $isOperator = in_array($token, $this->operators, true);
            if (count($elements) == 0 || $isOperator || $previousWasOperator) {
                $elements[] = $token;
                $operatorsMap[] = $isOperator;
            } else {
                $last = count($elements) - 1;
                $elements[$last] .= $token;
                // A grown operand may by coincidence equal an operator string.
                $operatorsMap[$last] = in_array($elements[$last], $this->operators, true);
            }
            $previousWasOperator = $isOperator;
        }

        $this->elements = $elements;
        $this->operatorsMap = $operatorsMap;
        return $this->elements;
    }

    /**
     * Get the operator position in the element list
     * @param int $nthOperator the n-th operator (0-based)
     * @return int|false false when the expression holds fewer operators
     */
    public function getOperatorPositionInElementList ($nthOperator) {
        $this->errorMsg = '';
        $currOperator = 0;
        foreach ($this->operatorsMap as $pos => $isOperator) {
            if ($isOperator) {
                if ($currOperator++ == $nthOperator) {
                    return $pos;
                }
            }
        }
        $this->errorMsg = "Don't exist $nthOperator operators in the expression";
        return false;
    }

    /**
     * Get the left operand of an operator in the expression
     * @param int $nthOperator the n-th operator (0-based)
     * @return string|false false when the operator does not exist or has nothing on its left
     */
    public function getLeftOperand ($nthOperator) {
        return $this->getNeighbourOfOperator($nthOperator, -1, 'left');
    }

    /**
     * Get the right operand of an operator in the expression
     * @param int $nthOperator the n-th operator (0-based)
     * @return string|false false when the operator does not exist or has nothing on its right
     */
    public function getRightOperand ($nthOperator) {
        return $this->getNeighbourOfOperator($nthOperator, 1, 'right');
    }

    /**
     * Return the element located $offset positions away from the n-th operator.
     *
     * @param int    $nthOperator the n-th operator (0-based)
     * @param int    $offset      -1 for the left neighbour, 1 for the right one
     * @param string $side        word used in the error message
     * @return string|false
     */
    private function getNeighbourOfOperator ($nthOperator, $offset, $side) {
        $pos = $this->getOperatorPositionInElementList($nthOperator);
        if ($pos === false) {
            return false;
        }
        $neighbour = $pos + $offset;
        if (!isset($this->elements[$neighbour])) {
            // The operator opens or closes the expression: no operand there.
            $this->errorMsg = "Operator $nthOperator has no $side operand";
            return false;
        }
        return $this->elements[$neighbour];
    }
}
?>