Location: PHPKode > scripts > Tokenizer > tokenizer/tokenizer.php
<?php
/**
 *  TOKENIZER library
 *
 *  This library has the following features:
 *  1. you can define a list of different sequences to cut tokens, not only a single char.
 *  2. you can define a list of markers to avoid the cutting of tokens.
 *  3. for each marker you can define opening, closure and escaping sequence of chars: you are not obliged to use only one char.
 *
 *  @package tokenizer
 *  @version 1
 *  @author Domenico Pontari <hide@address.com>
 *  @copyright Copyright (c) 2010, Domenico Pontari
 *  @license http://opensource.org/licenses/bsd-license.php New and Simplified BSD licenses
 */


/**
 *  This class defines the sequences of chars that prevent tokens from being cut
 *
 *  Markers are sequences of chars to avoid the cutting of tokens. For each marker
 *  you can define opening, closure and escaping sequence of chars:
 *  you are not obliged to use only one char.
 *  @package tokenizer
 */
class marker {
    /** Label of the marker; falls back to the opening sequence when none is given. */
    protected $name;
    /** Sequence of chars that starts a protected region. */
    protected $opening;
    /** Sequence of chars that ends a protected region; falls back to the opening sequence. */
    protected $closure;
    /** Sequence of chars returned by getEscaping(); empty string when none was given. */
    protected $escaping;

    /**
     *  Build a marker; the arguments are forwarded unchanged to setMarker().
     *
     *  @param string $opening
     *  @param string $closure
     *  @param string $escaping
     *  @param string $name
     *  @see setMarker
     */
    public function __construct ($opening, $closure = '', $escaping = '', $name = '') {
        $this->setMarker($opening, $closure, $escaping, $name);
    }

    /**
     *  (Re)define every sequence of the marker in one call.
     *
     *  @param string $opening  opening sequence
     *  @param string $closure  closing sequence; when it compares equal to '' the opening sequence is used
     *  @param string $escaping escaping sequence, stored as given
     *  @param string $name     marker name; when it compares equal to '' the opening sequence is used
     *  @return void
     */
    public function setMarker ($opening, $closure = '', $escaping = '', $name = '') {
        // Empty closure / name default to the opening sequence.
        if ($closure == '') {
            $closure = $opening;
        }
        if ($name == '') {
            $name = $opening;
        }

        $this->name = $name;
        $this->opening = $opening;
        $this->closure = $closure;
        $this->escaping = $escaping;
    }

    /** @return string the marker name */
    public function getName () {
        return $this->name;
    }

    /** @return string the opening sequence */
    public function getOpening () {
        return $this->opening;
    }

    /** @return string the closing sequence */
    public function getClosure () {
        return $this->closure;
    }

    /** @return string the escaping sequence */
    public function getEscaping () {
        return $this->escaping;
    }
}


/**
 *  This class performs the tokenization of a string into pieces
 *
 *  @package tokenizer
 */
class tokenizer {
    /**
     *  @var array an array of strings: the literal sequences on which the input is cut
     */
    protected $limits = array();

    /**
     *  @var array an array of markers to avoid the cutting of tokens
     *  @see marker
     */
    protected $markers = array();

    /**
     *  @var array|false an array of strings: the result of the last tokenize() call,
     *                   or false when tokenize() has not been called yet
     */
    protected $tokens = false;

    /**
     *  @var array|false an array of booleans parallel to $tokens: true when the token
     *                   at the same index is one of the limits
     */
    protected $limitsMap = false;

    /**
     *  @param array $markers an array of marker objects (anything exposing
     *                        getOpening(), getClosure() and getEscaping())
     *  @return void
     */
    public function setMarkers ($markers) { $this->markers = $markers; }

    /**
     *  @param array $limits an array of strings: the literal sequences on which the input is cut
     *  @return void
     */
    public function setLimits ($limits) { $this->limits = $limits; }

    /**
     *  Retrieve again the result of the last tokenize() call.
     *
     *  @param bool $withLimits when true the limit tokens are kept in the result,
     *                          otherwise they are filtered out
     *  @return array|false false when tokenize() has not been called yet
     *  @see tokenize
     */
    public function getTokens ($withLimits = false) {
        if (($withLimits) || ($this->tokens === false)) return $this->tokens;

        $result = array();
        foreach ($this->tokens as $num => $token)
            if (!$this->limitsMap[$num]) $result[] = $token;
        return $result;
    }

    /**
     *  @return array|false the map of limit tokens built by the last tokenize() call
     */
    public function getLimitsMap () { return $this->limitsMap; }

    /**
     *  Execute a tokenization
     *
     *  The string is first cut on every limit (limits are kept as tokens of their
     *  own), then the pieces that fall inside an open marker are glued back together.
     *
     *  @param string $string
     *  @return array the list of tokens, limits included
     */
    public function tokenize ($string) {
        $pieces = $this->splitOnLimits((string) $string);

        // One nesting counter per marker, carried from one piece to the next.
        $avoidLevel = array();
        foreach ($this->markers as $key => $marker) $avoidLevel[$key] = 0;

        $result = array();
        $someActiveMarker = false;
        foreach ($pieces as $token) {
            // While a marker is open the piece belongs to the token being built.
            if ($someActiveMarker) $result[count($result) - 1] .= $token;
                else $result[] = $token;

            $someActiveMarker = false;
            foreach ($this->markers as $key => $marker) {
                $avoidLevel[$key] = $this->scanToken($token, $marker, $avoidLevel[$key]);
                if ($avoidLevel[$key] > 0) $someActiveMarker = true;
            }
        }

        // Limits are compared as strings because tokens are always strings.
        $limitStrings = array_map('strval', $this->limits);
        $this->tokens = $result;
        $this->limitsMap = array();
        foreach ($this->tokens as $token)
            $this->limitsMap[] = in_array($token, $limitStrings, true);

        return $this->tokens;
    }

    /**
     *  Cut a string on every limit, keeping the limits as separate pieces.
     *
     *  Limits are treated as literal text: they are quoted before being placed in
     *  the regular expression, so chars such as . + | / do not act as metacharacters.
     *
     *  @param string $string
     *  @return array the non-empty pieces, in order
     */
    protected function splitOnLimits ($string) {
        $alternatives = array();
        foreach ($this->limits as $limit) {
            $limit = (string) $limit;
            // An empty limit would match between every pair of chars.
            if ($limit === '') continue;
            $alternatives[] = preg_quote($limit, '/');
        }

        if ($string === '') return array();
        // Without usable limits there is nothing to cut on.
        if (count($alternatives) == 0) return array($string);

        $pieces = preg_split(
            '/(' . implode('|', $alternatives) . ')/',
            $string,
            -1,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );
        // On a PCRE failure keep the input as a single token instead of losing it.
        return ($pieces === false) ? array($string) : $pieces;
    }

    /**
     *  Scan one piece for the sequences of a marker and update its nesting level.
     *
     *  @param string $token  the piece to scan
     *  @param marker $marker the marker whose sequences are searched
     *  @param int    $level  nesting level of the marker before the piece
     *  @return int nesting level of the marker after the piece
     */
    protected function scanToken ($token, $marker, $level) {
        $opening = (string) $marker->getOpening();
        $closure = (string) $marker->getClosure();
        $escaping = (string) $marker->getEscaping();
        $openingLen = strlen($opening);
        $closureLen = strlen($closure);
        $escapingLen = strlen($escaping);

        // A marker with an empty sequence would match at every position and the
        // scan would never advance: such a marker is ignored.
        if ($openingLen == 0 || $closureLen == 0) return $level;

        // Nesting can only be detected when opening and closure differ.
        $nestable = ($opening !== $closure);
        $tokenLen = strlen($token);

        for ($i = 0; $i < $tokenLen; $i++) {
            if ($level == 0) { // outside the marker: evaluate opening only
                if (substr($token, $i, $openingLen) === $opening) {
                    ++$level;
                    $i += $openingLen - 1;
                }
            } elseif ($escapingLen > 0 && substr($token, $i, $escapingLen) === $escaping) {
                // Skip the whole escaping sequence so the closure inside it is not seen.
                $i += $escapingLen - 1;
            } elseif (substr($token, $i, $closureLen) === $closure) {
                --$level;
                $i += $closureLen - 1;
            } elseif ($nestable && substr($token, $i, $openingLen) === $opening) {
                ++$level;
                $i += $openingLen - 1;
            }
        }
        return $level;
    }

    /**
     *  Set default markers for the tokenizer:
     *  1. () without escaping chars
     *  2. ' with \' as escaping sequence
     *  3. " with \" as escaping sequence
     *
     *  @return void
     */
    public function setDefaultMarkers () {
        $this->markers = array(
            new marker('(', ')'),
            new marker("'", "'", "\\'"),
            new marker('"', '"', '\\"'),
        );
    }

    /**
     *  Set default limit: the blank space
     *  @return void
     */
    public function setDefaultLimits () { $this->limits = array(' '); }

    /**
     *  Return a new tokenizer with default limit and markers
     *  @return tokenizer
     */
    public static function getDefaultTokenizer () {
        $result = new tokenizer();
        $result->setDefaultLimits();
        $result->setDefaultMarkers();
        return $result;
    }
}

/**
 *  @package tokenizer
 */
class parser {
    /** @var string the expression to parse */
    protected $expression;

    /** @var tokenizer the tokenizer used to cut the expression */
    protected $tokenizer;

    /** @var array an array of strings: the operators searched in the expression */
    protected $operators;

    /** @var string message describing the last failure, empty when there was none */
    protected $errorMsg = '';

    /** @var array operators and operands found by parse(), in order */
    protected $elements = array();

    /** @var array an array of booleans parallel to $elements: true for operators */
    protected $operatorsMap = array();

    /**
     *  When no tokenizer is given a default tokenizer is created automatically
     *
     *  @param string          $expression
     *  @param array           $operators an array of strings
     *  @param tokenizer|false $tokenizer
     */
    public function __construct ($expression, $operators, $tokenizer = false) {
        if ($tokenizer === false) $tokenizer = tokenizer::getDefaultTokenizer();
        $this->setTokenizer($tokenizer);
        $this->setOperators($operators);
        $this->setExpression($expression);
    }

    /**
     *  @param string $expression
     *  @return void
     */
    public function setExpression ($expression) { $this->expression = $expression; }

    /**
     *  @param tokenizer $tokenizer
     *  @return void
     *  @see tokenizer
     */
    public function setTokenizer ($tokenizer) { $this->tokenizer = $tokenizer; }

    /**
     *  @param array $operators an array of strings in which each string is an operator in the expression
     *  @return void
     */
    public function setOperators ($operators) { $this->operators = $operators; }

    /**
     *  @return string the message set by the last failing call, empty when there was none
     */
    public function getErrorMsg () { return $this->errorMsg; }

    /**
     *  Parse the expression to find operators and operands
     *
     *  Every token equal to an operator becomes an element of its own; the
     *  consecutive tokens between two operators are joined into one operand.
     *
     *  @return array|false the list of elements, or false when the tokenizer
     *                      did not return a list of tokens
     */
    public function parse () {
        $this->errorMsg = '';
        $this->elements = array();
        $this->operatorsMap = array();

        $tokens = $this->tokenizer->tokenize($this->expression);
        if (!is_array($tokens)) {
            $this->errorMsg = is_string($tokens) ? $tokens : 'Tokenization failed';
            return false;
        }

        $elements = array();
        // Starts as true so that the first token always opens a new element.
        $previousWasOperator = true;
        foreach ($tokens as $token) {
            $isOperator = in_array($token, $this->operators, true);
            if ($isOperator || $previousWasOperator) $elements[] = $token;
                else $elements[count($elements) - 1] .= $token;
            $previousWasOperator = $isOperator;
        }
        $this->elements = $elements;

        foreach ($this->elements as $el)
            $this->operatorsMap[] = in_array($el, $this->operators, true);

        return $this->elements;
    }

    /**
     *  Get the operator position in the element list
     *
     *  @param int $nthOperator the n-th operator, counting from 0
     *  @return int|false false when the expression has no such operator
     */
    public function getOperatorPositionInElementList ($nthOperator) {
        $this->errorMsg = '';

        $currOperator = 0;
        foreach ($this->operatorsMap as $pos => $isOperator) {
            if ($isOperator) {
                if ($currOperator++ == $nthOperator) return $pos;
            }
        }
        $this->errorMsg = "Don't exist $nthOperator operators in the expression";
        return false;
    }

    /**
     *  Get the left operand of an operator in the expression
     *
     *  @param int $nthOperator the n-th operator, counting from 0
     *  @return string|false false when the operator does not exist or is the first element
     */
    public function getLeftOperand ($nthOperator) {
        $pos = $this->getOperatorPositionInElementList($nthOperator);
        if ($pos === false) return false;
        // An operator at the very start of the expression has nothing on its left.
        if (!isset($this->elements[$pos - 1])) {
            $this->errorMsg = "Operator $nthOperator has no left operand";
            return false;
        }
        return $this->elements[$pos - 1];
    }

    /**
     *  Get the right operand of an operator in the expression
     *
     *  @param int $nthOperator the n-th operator, counting from 0
     *  @return string|false false when the operator does not exist or is the last element
     */
    public function getRightOperand ($nthOperator) {
        $pos = $this->getOperatorPositionInElementList($nthOperator);
        if ($pos === false) return false;
        // An operator at the very end of the expression has nothing on its right.
        if (!isset($this->elements[$pos + 1])) {
            $this->errorMsg = "Operator $nthOperator has no right operand";
            return false;
        }
        return $this->elements[$pos + 1];
    }
}

?>
Return current item: Tokenizer