Location: PHPKode > projects > Pelzini > pelzini-0.3/processor/c_lexer.php
<?php
/*
Copyright 2008 Josh Heidenreich

This file is part of Pelzini.

Pelzini is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Pelzini is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Pelzini.  If not, see <http://www.gnu.org/licenses/>.
*/


/**
* Contains the {@link CLexer} class
*
* @package Parsers
* @author Josh
* @since 0.2
**/

/**
* Tokenises a C file.
*
* The source is scanned from left to right. At each position the lexer skips
* whitespace, then tries the single-character tokens, then an ordered list of
* regular expressions; the first one that matches produces a Token and the
* scan resumes directly after the matched text. A character that matches
* nothing is skipped silently.
**/
class CLexer {
  // Should this be common for all lexers?
  // Characters that always form a token on their own, mapped to their token type.
  private $single_characters = array(
    '(' => TOKEN_OPEN_NORMAL_BRACKET,
    ')' => TOKEN_CLOSE_NORMAL_BRACKET,
    '{' => TOKEN_OPEN_CURLY_BRACKET,
    '}' => TOKEN_CLOSE_CURLY_BRACKET,
    '[' => TOKEN_OPEN_SQUARE_BRACKET,
    ']' => TOKEN_CLOSE_SQUARE_BRACKET,
    '=' => TOKEN_EQUALS,
    '.' => TOKEN_PERIOD,
    ',' => TOKEN_COMMA,
    ';' => TOKEN_SEMICOLON,
    '*' => TOKEN_ASTERIX
  );
  
  // Words emitted as TOKEN_RESERVED_WORD (matched case-sensitively, whole word only)
  private $reserved_words = array(
    'auto', 'break', 'case', 'const', 'continue', 'default', 'do', 'else', 'enum', 'extern',
    'for', 'goto', 'if', 'inline', 'register', 'restrict', 'return', 'sizeof', 'static',
    'struct', 'switch', 'typedef', 'union', 'volatile', 'while'
  );
  
  // Words emitted as TOKEN_RESERVED_VALUE (matched case-sensitively, whole word only)
  private $reserved_values = array('NULL');
  
  
  /**
  * Resets any state variables used by this class back to their initial state
  *
  * This lexer keeps no state between calls, so there is nothing to reset.
  **/
  public function resetState() {}
  
  /**
  * Builds a regular expression which matches any one of the given words at the
  * current offset, but only when the word is not followed by another identifier
  * character (so 'do' does not match the start of 'double').
  *
  * @param array $words The literal words to match
  * @return string A PCRE pattern anchored with \G
  **/
  private function buildWordRegex($words) {
    $quoted = array();
    foreach ($words as $word) {
      $quoted[] = preg_quote($word, '/');
    }
    return '/\G(?:' . implode('|', $quoted) . ')(?![A-Za-z0-9_$])/';
  }
  
  /**
  * Should return an array of zero or more Token objects
  *
  * Plain comments (those opening with slash-star followed by something other
  * than a second star) are removed before tokenising; docblocks are kept and
  * returned as TOKEN_DOCBLOCK tokens.
  *
  * @param string $source The C source code to tokenise
  * @return array The Token objects, in source order
  **/
  public function process($source) {
    $tokens = array();
    
    // strip comments. preg_replace returns NULL on a PCRE failure, in which case
    // the unstripped source is tokenised rather than nothing at all.
    $stripped = preg_replace('!/\*[^*].*?\*/!s', '', $source);
    if ($stripped !== null) {
      $source = $stripped;
    }
    
    // The length must be measured after stripping, otherwise the loop below
    // would read past the end of the (now shorter) string.
    $offset = 0;
    $length = strlen($source);
    
    // Ordered list of (pattern, token type, trim trailing whitespace?).
    // Every pattern is anchored with \G so it can only match at $offset.
    $patterns = array(
      // Preprocessor directive, up to and including the end of the line (or end of input)
      array('/\G#[a-z]+[^\n]*(?:\n|\z)/', TOKEN_C_PREPROCESSOR, false),
      
      // Docblock comment. The lookahead keeps the empty comment from being read
      // as the opening of a docblock.
      array('/\G\/\*\*(?!\/).*?\*\//s', TOKEN_DOCBLOCK, false),
      
      // Any other slash-star comment which survived the stripping above
      array('/\G\/\*.*?\*\//s', TOKEN_COMMENT, false),
      
      // Double-slash comment, terminated by a newline or the end of input
      array('/\G\/\/[^\n]*(?:\n|\z)/', TOKEN_COMMENT, true),
      
      // Double-quoted string; a backslash escapes whatever character follows it
      array('/\G"(?:[^"\\\\]++|\\\\.)*+"/s', TOKEN_STRING, false),
      
      // Single-quoted character literal, same escaping rule
      array('/\G\'(?:[^\'\\\\]++|\\\\.)*+\'/s', TOKEN_STRING, false),
      
      // Reserved words and values. Anything not understood specifically by the
      // analyser just gets the generic token.
      array($this->buildWordRegex($this->reserved_words), TOKEN_RESERVED_WORD, false),
      array($this->buildWordRegex($this->reserved_values), TOKEN_RESERVED_VALUE, false),
      
      // Numbers: hexadecimal has to be tried before decimal
      array('/\G0x[0-9A-F]+/i', TOKEN_NUMBER, false),
      array('/\G[0-9]+/', TOKEN_NUMBER, false),
      
      // Identifier
      array('/\G[a-z$_][a-z0-9$_]*/i', TOKEN_IDENTIFIER, false)
    );
    
    while ($offset < $length) {
      $char = $source[$offset];
      
      // Whitespace never produces a token
      if ($char === ' ' || $char === "\t" || $char === "\n" || $char === "\r") {
        $offset++;
        continue;
      }
      
      // Firstly, look for single character tokens
      if (isset($this->single_characters[$char])) {
        $tokens[] = new Token($this->single_characters[$char], $char);
        $offset++;
        continue;
      }
      
      // Now use regular expressions to find various other tokens
      // If one is found, add it to the list and move on
      foreach ($patterns as $pattern) {
        list($regex, $token_type, $trim) = $pattern;
        
        if (!preg_match($regex, $source, $matches, 0, $offset)) {
          continue;
        }
        
        $text = $matches[0];
        if ($text === '') {
          // A zero-length match would never advance the offset
          continue;
        }
        
        $tokens[] = new Token($token_type, $trim ? rtrim($text) : $text);
        $offset += strlen($text);
        
        // Restart the outer loop: a plain 'continue' here would fall through
        // to the skip below and lose the character following the token.
        continue 2;
      }
      
      // Nothing recognised this character, so skip it
      $offset++;
    }
    
    return $tokens;
  }
}

?>
Return current item: Pelzini