Location: PHPKode > scripts > CSV Tokenizer > csv-tokenizer/CSVTokenizer.php
<?php
/* Copyright (c) 2005, Axis Data Management Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of Axis Data Management Corp nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL AXIS DATA MANAGEMENT CORP
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 *
 * Similar to strtok(), but all delimiters are significant (i.e., 
 * consecutive delimiters are not globbed into one), and empty string
 * tokens are permitted.  E.g., input String "" has one token of ""
 * regardless of delimiter, and " " (with delimiter of space) produces
 * two tokens of value "" (one for before the delimiter, and one for after).
 *
 * This class is called CSV Tokenizer, because this is the behavior 
 * required when using Comma-Separated-Value files (like with spreadsheets
 * and databases).
 * 
 * Here is how this class is typically used:
 *
 *     require "CSVTokenizer.php";
 *     ...
 *     $toker = new CSVTokenizer();
 *     $toker->setDelimiter($delim);
 *     $toker->setString($str);
 *     while (($token = $toker->nextToken()) !== false) {
 *         // Do something with each $token.
 *     }
 *
 * @see #displayTokens for other useful ways to use this class.
 * @author Blaine Simpson.  hide@address.com
 */ 

class CSVTokenizer {
    // Working copy of the input; always a string once setString() has run.
    private $buffer;
    private $delimiter = " "; // Default to a SINGLE space character
    private $offset = -1;     // -1 means "Not Initialized"
                              // > strlen($buffer) means all-used-up

    /**
     * Guards every read operation: throws unless setString() has been called.
     *
     * @throws LogicException (a subclass of Exception) when no input string
     *                        has been supplied yet.
     */
    private function checkready() {
        if ($this->offset < 0)
            throw new LogicException("Object not ready for use.  You must set "
            . "the input string with setString() first.");
    }

    /**
     * Sets the delimiter that separates tokens.
     *
     * Scalars and objects with __toString() are converted to a string before
     * being stored, so the value later handed to strpos()/substr_count() is
     * always a real one-byte string (older PHP versions interpret an integer
     * needle as a character ordinal instead of as text).
     *
     * @param indelim  Must be one character (byte) long.
     * @throws InvalidArgumentException (a subclass of Exception) when the
     *                                  delimiter is not exactly one byte long.
     */
    public function setDelimiter($indelim) {
        if (is_scalar($indelim)
                || (is_object($indelim) && method_exists($indelim, '__toString')))
            $indelim = (string) $indelim;
        if (!is_string($indelim))
            throw new InvalidArgumentException(
                "Your input delimiter of type " . gettype($indelim)
                . " is not 1 character long");
        if (strlen($indelim) !== 1)
            throw new InvalidArgumentException(
                "Your input delimiter '$indelim' is not 1 character long");
        $this->delimiter = $indelim;
    }

    /**
     * Makes a COPY of the input string to work with and rewinds the
     * tokenizer to the start of that string.
     *
     * The input is cast to a string, so null behaves like "" (one empty
     * token) without tripping over string functions that reject null.
     *
     * @param instr  The text to split into tokens.
     */
    public function setString($instr) {
        $this->buffer = (string) $instr;
        $this->offset = 0;
    }

    /**
     * Counts the tokens that nextToken() has not handed out yet, without
     * consuming any of them.
     *
     * @return int  Number of remaining tokens; 0 once all are used up.
     * @throws LogicException when setString() has not been called.
     */
    public function countTokens() {
        $this->checkready();
        // An offset past the end of the buffer means the final token has
        // already been returned.
        if ($this->offset > strlen($this->buffer)) return 0;
        // Every remaining delimiter ends one token, and whatever follows the
        // last delimiter (possibly "") is one more token.
        return substr_count($this->buffer, $this->delimiter, $this->offset) + 1;
    }

    /**
     * Returns the next token and advances past the delimiter that ended it.
     *
     * Always compare the result with !== false: "" and "0" are legitimate
     * tokens.
     *
     * @return string|false  The next token (possibly ""), or false when all
     *                       tokens have been consumed.
     * @throws LogicException when setString() has not been called.
     */
    public function nextToken() {
        $this->checkready();
        $length = strlen($this->buffer);
        $start = $this->offset;
        if ($start > $length) return false;

        // Locate the delimiter that terminates this token.  When the cursor
        // sits exactly at the end of the buffer there is nothing to search:
        // the token is the empty string that follows the final delimiter
        // (or the whole of an empty input).
        $end = ($start === $length)
                ? false
                : strpos($this->buffer, $this->delimiter, $start);
        if ($end === false) $end = $length;

        // Step 1 past the delimiter.  After the last token this leaves the
        // offset at strlen + 1, which marks the tokenizer as used up.
        $this->offset = $end + 1;

        // Return "" directly for empty tokens so the result never depends on
        // what substr() yields for a zero-length slice at end-of-string.
        if ($end === $start) return "";
        return substr($this->buffer, $start, $end - $start);
    }

    /**
     * Echos token counts and values for the given input string and delimiter
     * character.
     *
     * Use is trivially easy:
     *      require_once "CSVTokenizer.php";
     *      CSVTokenizer::displayTokens("one|two|three", "|");
     *
     * @param str    The text to tokenize.
     * @param delim  Must be a single character
     * @throws InvalidArgumentException when delim is not one character long.
     */
    public static function displayTokens($str, $delim) {
        $toker = new self();
        $toker->setDelimiter($delim);
        $toker->setString($str);
        $count = 0;
        echo $toker->countTokens() . " tokens total.\n";
        while (($token = $toker->nextToken()) !== false) {
            echo "    " . ++$count . ":  (" . $token . ")   ["
                    . $toker->countTokens() . " remaining]\n";
        }
        echo $toker->countTokens() . " tokens remaining.\n";
    }
}
?>
Return current item: CSV Tokenizer