The process of handwriting lexical analyzer

如题，手写词法分析器。

总览

词法分析，就是将输入分解成一个个独立的词法符号，称为单词符号，即token。token可以理解成程序设计语言中具有独立含义的最小词法单位，在这个意义上和自然语言的单词含义有点像。但是，token不仅包括一般意义上的单词，还包括标点符号、操作符、分割符等等。

我选择sql的常用语句作为输入语言，进行词法分析。主要的过程如下：

梳理语言内容，书写必要的正则表达式
将正则表达式转换成NFA
将NFA转换成DFA
用代码实现DFA

梳理语言内容，书写必要的正则表达式

分析所选的语言，整理其中的类型，书写对应的正则表达式。整理结果如下，正则表达式都比较简单，我就不说了。

语言内容的梳理结果

关键字：create, table, int, not, null, char, float,primary, key,index, on, drop, select, from, where, and, insert, into, values, delete, quit, execfile
符号：( ) , ; =    <>  <   >   <=  >=
数字：123， 456.789
字符串: 'abcd\r\n\\ef23&*()'
标识符：[a-zA-Z_][a-zA-Z0-9_]*
注释: #开头 \n结尾

涵盖的sql语法

# DataType: int float char(n)  
# (1<=n<=255)
# only one line

# OPType: = <>  <   >   <=  >=

create table [tablename] (
  [attribute] int not null,
  [attribute] char(20),
  [attribute] float, 
  primary key([attribute])
);
# max : 32 attributes

create index [indexname] on [tablename(attribute)];

drop table [tablename];

drop index [indexname];

select [attribute],[attribute] from [tablename] where [attribute] > [number] and [attribute] < [number];

insert into [tablename] ([attribute list])values([values list]);

delete from [tablename] where [attribute] > [number];

quit;

execfile [filename];

将正则表达式转换成NFA

正则表达式是不能被计算机理解的，所以我们需要把它转换成DFA，中间以NFA作为过度。这部分的主要知识是如何将正则表达式转换成NFA。
直接的做法和简单的理论，可以参考轮子哥的《构建可配置的词法分析器》;
形式化的定义和相关的理论知识，可以参考《Elements of the Theory of Computation》第二章前3节的内容。

将NFA转换成DFA

NFA转换成DFA可以参考《Elements of the Theory of Computation》第二章第2节的内容。

用代码实现DFA

我最后得到的有限自动机如下，可以发现，指向End状态我使用了虚线，这是因为End状态其实是不存在的，它就是Start状态。最下面的长短间隔虚线部分是NFA而不是DFA，是因为处理关键字和标识符的有限自动机有些繁琐，每一个关键字都需要关键字字符数个状态，代码写起来实在是太臃肿，我小小hack了一下，直接用了NFA来处理。

The process of handwriting lexical analyzer_第1张图片

sqlnfa.png

万事具备了，只欠写代码了。具体的过程如下。

用代码定义途中的状态和类型
有些出口只有虚线的状态是不需要的，就像注释掉的几个状态。

enum class State{
  Start,
  InInteger,
  InFloat,
  InStringHalf,
  InEscape,
  //InString,
  InIdentifier,
  InComment,
  //InCommentEnd,
  //InOperator,
  //InKeyword,
  InKeyorId
};

enum class TokenType{
  Unknown,
  Integer,
  Float,
  String,
  Identifier,
  Comment,
  Operator,
  Keyword
};

定义Token的存储结构, 作为编译器，我们不仅需要得到最终的一个个token，还需要在报错时指出错误位置

struct Token{
  int line; //token所在的行
  int column; //token所在的列
  TokenType type; //token的类型
  std::string value; //token的内容

  Token(int _line,int _column,TokenType _type,const char* b,const char* e)
    :line(_line),column(_column),type(_type),value(b,e){
  }
};

通过下面的方式，把状态图转成代码

//define:
//char * pos为指向当前处理字符的指针
//State state为当前的状态
while(*pos){
  switch(state){
    case 某个状态:{
      switch(*pos){
        case 某个字符:{
          //根据状态图的情况进行不同的处理
          break;
        }
      }
      break;
    }
    ...
  }
}

运行实例

实例程序

std::string s("create table tname(\nid int,\nmark float,\nname char(100));\n"
                 "insert into tname (id,mark,name) values (1,15.6,'cmh');");
sql::Lex lex;
try{
  lex.parse(s);    
}catch(std::exception &e){
  print(e.what());
  return 0;
}
std::vector res = lex.getTokens();
print4(res); 
lex.clear();

运行结果：

     Value           Type      Line    Column
    create        Keyword         1         6
     table        Keyword         1        12
     tname     Identifier         1        18
         (       Operator         1        18
        id     Identifier         2         2
       int        Keyword         2         6
         ,       Operator         2         6
      mark     Identifier         3         4
     float        Keyword         3        10
         ,       Operator         3        10
      name     Identifier         4         4
      char        Keyword         4         9
         (       Operator         4         9
       100        Integer         4        13
         )       Operator         4        13
         )       Operator         4        14
         ;       Operator         4        15
    insert        Keyword         5         6
       int        Keyword         5        10
         o     Identifier         5        11
     tname     Identifier         5        17
         (       Operator         5        18
        id     Identifier         5        21
         ,       Operator         5        21
      mark     Identifier         5        26
         ,       Operator         5        26
      name     Identifier         5        31
         )       Operator         5        31
    values        Keyword         5        39
         (       Operator         5        40
         1        Integer         5        42
         ,       Operator         5        42
      15.6          Float         5        47
         ,       Operator         5        47
       cmh         String         5        52
         )       Operator         5        53
         ;       Operator         5        54

完整代码

#pragma once

#include 
#include 
#include 
#include 

namespace sql{
  enum class State{
    Start,
    InInteger,
    InFloat,
    InStringHalf,
    InEscape,
    //InString,
    InIdentifier,
    InComment,
    //InCommentEnd,
    //InOperator,
    //InKeyword,
    InKeyorId
  };

  enum class TokenType{
    Unknown,
    Integer,
    Float,
    String,
    Identifier,
    Comment,
    Operator,
    Keyword
  };

  class myexception: public std::exception{
    std::string s;
  public:
    myexception(const std::string &_s):s(_s){ }
    virtual const char* what() const throw(){
      return s.c_str();
    }
  };

  struct Token{
    int line;
    int column;
    TokenType type;
    std::string value;

    Token(int _line,int _column,TokenType _type,const char* b,const char* e)
      :line(_line),column(_column),type(_type),value(b,e){
    }
  };

  class Lex{
    struct CurrentState{
      State state;
      int line;
      int column;
      const char * tstart;
      const char * tend;

      CurrentState(State s,int l,int c,const char *ts,const char *te)
        :state(s),line(l),column(c),tstart(ts),tend(te){

      }
    };
    std::vector tokens;

   public:
    void clear(){
      tokens.clear();
    }
    void parse(std::string & source){
      CurrentState current(State::Start,1,1,source.c_str(),source.c_str());

      while(*current.tend){
        switch(current.state){
          case State::Start:{
            current.tstart = current.tend;
            switch(*current.tend){
              case '0':case '1':case '2':case '3':case '4':
              case '5':case '6':case '7':case '8':case '9':{
                current.state = State::InInteger;
                break;
              }
              case '\'':{
                current.state = State::InStringHalf;
                current.tstart++;
                break;
              }
              case '#':{
                current.state = State::InComment;
                current.tstart++;
                break;
              }
              case '(':case ')':case ',':case ';':case '=':{
                //current.state = State::Start;
                current.tend++;
                push(current,TokenType::Operator);
                current.tend--;
                break;
              }
              case '<':case '>':{
                //current.state = State::Start;
                if(*(current.tend+1)=='='){
                  current.tend += 2;
                  current.column += 2;
                  push(current,TokenType::Operator);
                  current.tend--;
                  current.column--;
                }else{
                  current.tend++;
                  push(current,TokenType::Operator);
                  current.tend--;
                }
                break;
              }
              case 'a':{
                handleKeyword({"and"},current,source);
                break;
              }
              case 'c':{
                handleKeyword({"create","char"},current,source);
                break;
              }
              case 'd':{
                handleKeyword({"delete","drop"},current,source);
                break;
              }
              case 'e':{
                handleKeyword({"execfile"},current,source);
                break;
              }
              case 'f':{
                handleKeyword({"float","from"},current,source);
                break;
              }
              case 'i':{
                handleKeyword({"insert","index","int","into"},current,source);
                break;
              }
              case 'k':{
                handleKeyword({"key"},current,source);
                break;
              }
              case 'n':{
                handleKeyword({"not","null"},current,source);
                break;
              }
              case 'o':{
                handleKeyword({"on"},current,source);
                break;
              }
              case 'p':{
                handleKeyword({"primary"},current,source);
                break;
              }
              case 'q':{
                handleKeyword({"quit"},current,source);
                break;
              }
              case 's':{
                handleKeyword({"select"},current,source);
                break;
              }
              case 't':{
                handleKeyword({"table"},current,source);
                break;
              }
              case 'v':{
                handleKeyword({"values"},current,source);
                break;
              }
              case 'w':{
                handleKeyword({"where"},current,source);
                break;
              }
              
              case 'b':
              case 'g':case 'h':case 'j':
              case 'l':case 'm':
              case 'r':
              case 'u':case 'x':case 'y':
              case 'z':case 'A':case 'B':case 'C':case 'D':
              case 'E':case 'F':case 'G':case 'H':case 'I':
              case 'J':case 'K':case 'L':case 'M':case 'N':
              case 'O':case 'P':case 'Q':case 'R':case 'S':
              case 'T':case 'U':case 'V':case 'W':case 'X':
              case 'Y':case 'Z':case '_':{
                current.state = State::InIdentifier;
                break;
              }
              case '\n':{
                //current.state = State::Start;
                current.line++;
                current.column = 0;
                break;
              }
              case ' ':{
                break;
              }
              case '\t':{
                break;
              }
              default:{
                throw myexception("error in start state");
                break;
              }
            }
            break;
          }
          case State::InKeyorId:{
            switch(*current.tend){
              case 'a':case 'b':case 'c':case 'd':case 'e':
              case 'f':case 'g':case 'h':case 'i':case 'j':
              case 'k':case 'l':case 'm':case 'n':case 'o':
              case 'p':case 'q':case 'r':case 's':case 't':
              case 'u':case 'v':case 'w':case 'x':case 'y':
              case 'z':case 'A':case 'B':case 'C':case 'D':
              case 'E':case 'F':case 'G':case 'H':case 'I':
              case 'J':case 'K':case 'L':case 'M':case 'N':
              case 'O':case 'P':case 'Q':case 'R':case 'S':
              case 'T':case 'U':case 'V':case 'W':case 'X':
              case 'Y':case 'Z':case '_':{
                current.state = State::InIdentifier;
                break;
              }
              default:{
                throw myexception("InKeyorId error");
                break;
              }
            }
            break;
          }
          case State::InInteger:{
            switch(*current.tend){
              case '0':case '1':case '2':case '3':case '4':
              case '5':case '6':case '7':case '8':case '9':{
                break;
              }
              case '.':{
                current.state = State::InFloat;
                break;
              }
              default:{
                current.state = State::Start;
                push(current,TokenType::Integer);
                current.tend--;
                current.column--;
                break;
              }
            }
            break;
          }
          case State::InFloat:{
            switch(*current.tend){
              case '0':case '1':case '2':case '3':case '4':
              case '5':case '6':case '7':case '8':case '9':{
                break;
              }
              default:{
                current.state = State::Start;
                push(current,TokenType::Float);
                current.tend--;
                current.column--;
                break;
              }
            }
            break;
          }
          case State::InStringHalf:{
            switch(*current.tend){
              case '\\':{
                current.state = State::InEscape;
                break;
              }
              case '\'':{
                current.state = State::Start;
                push(current,TokenType::String);
                break;
              }
              case '\n':{
                throw myexception("unexpected \\n in string");
                break;
              }
              default:{
                break;
              }
            }
            break;
          }
          case State::InEscape:{
            switch(*current.tend){
              case 'n':case '\'':case '\\':{
                current.state = State::InStringHalf;  
                break;
              }
              default:{
                throw myexception("undefined escape");
                break;
              }
            }
            break;
          }
          case State::InIdentifier:{
            switch(*current.tend){
              case '0':case '1':case '2':case '3':case '4':
              case '5':case '6':case '7':case '8':case '9':
              case 'a':case 'b':case 'c':case 'd':case 'e':
              case 'f':case 'g':case 'h':case 'i':case 'j':
              case 'k':case 'l':case 'm':case 'n':case 'o':
              case 'p':case 'q':case 'r':case 's':case 't':
              case 'u':case 'v':case 'w':case 'x':case 'y':
              case 'z':case 'A':case 'B':case 'C':case 'D':
              case 'E':case 'F':case 'G':case 'H':case 'I':
              case 'J':case 'K':case 'L':case 'M':case 'N':
              case 'O':case 'P':case 'Q':case 'R':case 'S':
              case 'T':case 'U':case 'V':case 'W':case 'X':
              case 'Y':case 'Z':case '_':{
                break;
              }
              default:{
                current.state = State::Start;
                push(current,TokenType::Identifier);
                current.tend--;
                current.column--;
                break;
              }
            }
            break;
          }
          case State::InComment:{
            switch(*current.tend){
              case '\n':{
                current.state = State::Start;
                push(current,TokenType::Comment);
                current.line++;
                current.column = 0;
                break;
              }
              default:{
                break;
              }
            }
            break;
          }
          default:{
            throw myexception("undefined state");
            break;
          }
        }
        current.tend++;
        current.column++;
      }
    }
    const std::vector & getTokens(){
      return tokens;
    }

  private:

    void push(const CurrentState & current,TokenType type){
      tokens.push_back(Token(current.line, current.column-1, type, current.tstart, current.tend));
    }

    void handleKeyword(std::vector && keys,CurrentState ¤t,std::string &source){
      int spos = current.tstart - source.c_str();
      for(int i=0;i