- 基本满足C语言的词法规则。
- 可以识别八进制,十六进制,浮点,科学计数法,同时支持后缀。
- 识别关键字。
- 识别字符和字符串中的转义。
一般自动机实现采用如下代码结构:
int state = 0;
switch(state){
case 0:
dosomeThing();
state = NextState;
break;
case 1:
...
}
这样的结构确实很规范,但是我觉得用起来有一点问题。
state = 2
这代表着跳转到状态2,这样的可读性不强,因为很难记住状态2是什么状态。可以把state变量改为enum来增强可读性。另外,要判断一个运算符是 `<` 还是 `<=`,就需要提前读取一个字符并增加状态,就会有三重switch嵌套,影响可读性。所以我采用了一般不推荐的goto语句。只要保证实现的状态机没有逻辑错误,使用goto不会造成复杂的结构。
stateName1:
dosomething;
...
goto stateName1;
...
goto stateName2;
stateName2:
...
实际使用msvc编译器发现,L和U的顺序其实没有限制,`LuL` 也是合法的数字后缀。如果用自动机实现所有的情况,需要三个上面那样的自动机。所以实际实现并没有采用自动机,而是检测L和U出现的次数。
'\.'
代表小数点,因为我不会在图里打小数点的转义
三个图是连起来的,名字相同的是同一个状态。处理后缀是专门的统计后缀每个字符出现的次数的处理程序,没有用自动机。
转义有 `\n`、`\t`,还有 `\0`、`\000`(三位八进制)和 `\xhh`(两位十六进制)。
运算符的处理就比较简单了,只需要看后面能否组成复合运算符或者变成其它运算符。比如以 `<` 开头的运算符有 `<<`、`<<=`、`<=` 和 `<` 这几种,自动机如下
需要注意的是 `/`,它既是注释的开头字符,也是 `/`、`/=` 的开头。
采用stl的unordered_map来保存关键字,识别到标识符之后,判断该标识符是否在map中,如果在,则是关键字。
遇到不符合自动机规则的字符之后,输出一个错误信息,然后跳过该字符串或者数字,也就是一直跳过字符,直到遇到空白符或运算符。
lex实现主要是把每一类单词转变成一个正则表达式,代码如下:
/* Named patterns used by the flex scanner (definitions section only).
 * notes     - block comment; the star-run form also accepts comments that
 *             end in several stars, such as one closed by two stars and a slash.
 * char      - escape body: simple escapes, one or three octal digits, or
 *             "\x" followed by two hex digits.
 * numberSCM - scientific notation; at least one exponent digit is required.
 */
digit [0-9]
digit8 [0-7]
digit16 [0-9a-fA-F]
postfix ((u|U)?(l|L)?(l|L)?)|((l|L)?(u|U)?(l|L)?)|((l|L)?(l|L)?(u|U)?)
postfixf (l|L)|(f|F)
letter [A-Za-z_]
note "//"[^\n]*
notes "/*"([^*]|\*+[^*/])*\*+"/"
id {letter}({letter}|{digit})*
char (\\[abfnrlvt\'\"\?\\])|(\\[0-7]([0-7][0-7])?)|("\\x"{digit16}{digit16})
achar \'([^\n\'\\]|{char})\'
string \"([^\n\\\"]|{char})*\"
number [1-9]{digit}*{postfix}?
number8 0{digit8}*{postfix}?
number16 ("0x"|"0X"){digit16}+{postfix}?
numberSCM {digit}*\.?{digit}+[eE][+-]?{digit}+{postfixf}?
numberF {digit}*\.{digit}+{postfixf}?
operator [\?\~\,\(\)\{\}\[\]\;\:\.]
注意:我是在linux上翻译lex文件,然后在windows上开发。用 `--nounistd` 选项可以不使用linux系统头文件,使用 `-+` 选项生成c++文件。
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
using std::ifstream;
using std::unordered_map;
using std::string;
using std::cout;
// Size in bytes of one half of the input buffer; main() allocates MaxLen * 2
// and refills the upper half once the read index passes MaxLen.
#define MaxLen 2048

// The C_* macros below expand to runs of `case` labels so that a character
// class can be dropped straight into a `switch` on the current character.

// Decimal digits 0-9.
#define C_DIGIT \
    case '0': case '1': case '2': case '3': case '4': \
    case '5': case '6': case '7': case '8': case '9':

// ASCII letters (the underscore is handled separately by the callers).
#define C_LETTER \
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': \
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': \
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': \
    case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': \
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': \
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': \
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': \
    case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':

// Whitespace characters that separate tokens.
#define C_BLANK \
    case ' ': case '\t': case '\v': case '\f': case '\n':

// Hexadecimal digits.
#define C_HEX_DIGIT \
    case '0': case '1': case '2': case '3': case '4': \
    case '5': case '6': case '7': case '8': case '9': \
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': \
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':

// Octal digits.
#define C_OCT_DIGIT \
    case '0': case '1': case '2': case '3': \
    case '4': case '5': case '6': case '7':

// Letters that may start an integer suffix.
#define C_NUMBER_POSTFIX \
    case 'l': case 'L': case 'u': case 'U':

// Operators/punctuators that are always a single character.
#define C_OPERATOR_SINGLE \
    case ',': case ';': case '?': case '~': case '(': case ')': \
    case '[': case ']': case '{': case '}':

// First characters of operators that may extend to a compound operator.
// '%' is included so that "%" and "%=" are dispatched to HandleOperator and
// so that a number directly followed by '%' terminates cleanly.
#define C_OPERATOR \
    case '+': case '-': case '*': case '/': case '%': case '&': case '|': \
    case ':': case '=': case '^': case '!': case '#': case '>': case '<':

// Every character that ends a number or an error-recovery skip.
#define C_SPLITE C_BLANK C_OPERATOR C_OPERATOR_SINGLE
// Keyword table: maps each reserved word to the numeric id printed for it.
// Existing ids are left untouched (including the unused id 34) so previously
// produced output stays comparable; keywords that were missing are appended
// with fresh ids starting at 57.
unordered_map<string, int> KeyTable = {
    {"asm",0},{"auto",1},{"bool",2},{"break",3},{"case",4},{"catch",5},{"char",6},
    {"class",7},{"const",8},{"continue",9},{"default",10},{"delete",11},{"do",12},{"double",13},
    {"else",14},{"enum",15},{"explicit",16},{"export",17},{"extern",18},{"false",19},{"float",20},
    {"for",21},{"friend",22},{"goto",23},{"if",24},{"inline",25},{"int",26},{"long",27},
    {"namespace",28},{"mutable",29},{"new",30},{"operator",31},{"private",32},{"protected",33},{"public",35},
    {"register",36},{"return",37},{"signed",38},{"sizeof",39},{"static",40},{"struct",41},{"switch",42},
    {"template",43},{"this",44},{"throw",45},{"true",46},{"try",47},{"typedef",48},{"void",49},
    {"typeid",50},{"typename",51},{"union",52},{"unsigned",53},{"using",54},{"virtual",55},{"volatile",56},
    // Previously missing keywords.
    {"while",57},{"short",58},{"wchar_t",59},
    {"const_cast",60},{"dynamic_cast",61},{"reinterpret_cast",62},{"static_cast",63}
};
// Compound-operator table: maps the operator spelling to the id printed for it.
// Id 8 is unused in the original numbering and is left unused so existing ids
// do not shift. "|=" replaces the invalid "\=" entry, and "&&" / "&=" are
// added because HandleOperator classifies them as compound operators.
unordered_map<string, int> Operator = {
    {"/=",0}, {"<=",1}, {"<<",2}, {"<<=",3}, {">=",4}, {">>=",5}, {">>",6}, {"%=",7}, {"...",9},
    {"+=",10}, {"++",11}, {"->",12}, {"-=",13}, {"--",14}, {"||",15}, {"|=",16}, {"::",17}, {"*=",18},
    {"==",19}, {"!=",20}, {"^=",21}, {"##",22}, {"&&",23}, {"&=",24}
};
// Token categories produced by the hand-written lexer. The numeric value is
// also used as an index into the per-category name and counter arrays, so the
// order must not change.
enum CPPTYPE
{
    CPP_NULL         = 0,   // lexical error
    CPP_NAME         = 1,   // identifier
    CPP_NUMBER_DEC   = 2,   // decimal integer
    CPP_NUMBER_SCM   = 3,   // floating point in scientific notation
    CPP_NUMBER_OCT   = 4,   // octal integer
    CPP_NUMBER_HEX   = 5,   // hexadecimal integer
    CPP_NUMBER_FLOAT = 6,   // plain floating point
    CPP_OPERATOR     = 7,   // single-character operator
    CPP_OPERATOR_COM = 8,   // compound (multi-character) operator
    CPP_CHAR         = 9,   // character literal
    CPP_STRING       = 10,  // string literal
    CPP_NOTES        = 11,  // comment
    CPP_KEY          = 12   // keyword
};
// Display name for each token category, indexed by the CPPTYPE value.
const char* CPPPTYPE_NAME[13] = {
    "错误",
    "标识符",
    "十进制数",
    "科学计数法浮点数",
    "八进制数",
    "十六进制数",
    "浮点数",
    "运算符",
    "复合运算符",
    "字符",
    "字符串",
    "注释",
    "关键字"
};
// Result of scanning one token.
struct TypeResult
{
    // Category of the token; value-initialised to the zero enumerator so a
    // default-constructed result never carries an indeterminate value.
    CPPTYPE type{};
    // Error description, read by the caller only for error results. Defaults
    // to an empty string so it is always safe to pass to printf("%s").
    const char * Adding = "";
};
// Error recovery: advances `start` to the next blank or operator character.
void GotoNextSplite(char *buff, int &start);
// Advances `start` past whitespace, counting newlines in LineNum.
void SkipBlank(char * buff, int &start);
// Number of '\n' characters consumed by SkipBlank so far.
int LineNum = 0;
/**
 * Scans one numeric literal beginning at buff[start] and classifies it.
 *
 * Implemented as a goto-driven state machine: DEC / OCT / HEX for integers,
 * MABY_DOUBLE for a leading-zero digit run containing 8 or 9 (only valid if it
 * turns out to be a floating literal), DOUBLE for the fraction, SCM for the
 * exponent, and POSTFIX for the suffix.
 *
 * @param buff  NUL-terminated input buffer.
 * @param start Index of the first character of the literal (a digit, or '.'
 *              followed by a digit). On return it indexes the first character
 *              after the literal, or the next separator after an error.
 * @return type is one of the CPP_NUMBER_* categories on success; on failure it
 *         is CPP_NULL and Adding holds the error message.
 *
 * A literal ends at any C_SPLITE character or at the terminating '\0'.
 */
TypeResult HandleNumber(char *buff, int &start) {
    // Suffix budget: a flag is true while that suffix letter may still appear
    // and is cleared once it has been consumed. Integer literals allow one U
    // and up to two L in any order; floating literals allow a single f/F/l/L.
    bool flag_L = false;
    bool flag_LL = false;
    bool flag_U = false;
    bool flag_FL = false;
    TypeResult Result;
    Result.type = CPPTYPE::CPP_NULL;
    Result.Adding = "";

    if (buff[start] == '0') {
        if (buff[start + 1] == 'x' || buff[start + 1] == 'X') {
            start += 2;
            goto HEX_FIRST;
        }
        goto OCT;
    }

DEC:
    if (buff[start] == '\0') {
        Result.type = CPPTYPE::CPP_NUMBER_DEC;
        goto FINISH;
    }
    switch (buff[start])
    {
    C_DIGIT
        start++;
        goto DEC;
    C_SPLITE
        Result.type = CPPTYPE::CPP_NUMBER_DEC;
        goto FINISH;
    C_NUMBER_POSTFIX
        flag_L = true; flag_LL = true; flag_U = true;
        Result.type = CPPTYPE::CPP_NUMBER_DEC;
        goto POSTFIX;
    case '.':
        start++;
        goto DOUBLE;
    case 'e': case 'E':
        start++;
        goto SCM;
    default:
        Result.Adding = "错误的十进制数";
        goto ERROR;
    }

OCT:
    if (buff[start] == '\0') {
        Result.type = CPPTYPE::CPP_NUMBER_OCT;
        goto FINISH;
    }
    switch (buff[start])
    {
    C_OCT_DIGIT
        start++;
        goto OCT;
    C_NUMBER_POSTFIX
        flag_L = true; flag_LL = true; flag_U = true;
        Result.type = CPPTYPE::CPP_NUMBER_OCT;
        goto POSTFIX;
    C_SPLITE
        Result.type = CPPTYPE::CPP_NUMBER_OCT;
        goto FINISH;
    case '.':
        start++;
        goto DOUBLE;
    case '8': case '9':
        // Not an octal digit; still fine if a '.' or exponent follows.
        goto MABY_DOUBLE;
    case 'e': case 'E':
        start++;
        goto SCM;
    default:
        Result.Adding = "不合法的八进制数";
        goto ERROR;
    }

HEX_FIRST:
    // "0x" must be followed by at least one hexadecimal digit.
    switch (buff[start])
    {
    C_HEX_DIGIT
        start++;
        goto HEX;
    default:
        Result.Adding = "不合法的十六进制数字";
        goto ERROR;
    }

HEX:
    if (buff[start] == '\0') {
        Result.type = CPPTYPE::CPP_NUMBER_HEX;
        goto FINISH;
    }
    switch (buff[start])
    {
    C_HEX_DIGIT
        start++;
        goto HEX;
    C_NUMBER_POSTFIX
        flag_L = true; flag_LL = true; flag_U = true;
        Result.type = CPPTYPE::CPP_NUMBER_HEX;
        goto POSTFIX;
    C_SPLITE
        Result.type = CPPTYPE::CPP_NUMBER_HEX;
        goto FINISH;
    default:
        Result.Adding = "不合法的十六进制数字";
        goto ERROR;
    }

MABY_DOUBLE:
    switch (buff[start])
    {
    C_DIGIT
        start++;
        goto MABY_DOUBLE;
    case '.':
        start++;
        goto DOUBLE;
    case 'e': case 'E':
        start++;
        goto SCM;
    default:
        Result.Adding = "不合法的八进制数";
        goto ERROR;
    }

DOUBLE:
    if (buff[start] == '\0') {
        Result.type = CPPTYPE::CPP_NUMBER_FLOAT;
        goto FINISH;
    }
    switch (buff[start])
    {
    C_DIGIT
        start++;
        goto DOUBLE;
    C_SPLITE
        Result.type = CPPTYPE::CPP_NUMBER_FLOAT;
        goto FINISH;
    case 'e': case 'E':
        start++;
        goto SCM;
    case 'l': case 'L': case 'F': case 'f':
        flag_FL = true;
        Result.type = CPPTYPE::CPP_NUMBER_FLOAT;
        goto POSTFIX;
    default:
        Result.Adding = "不合法的浮点数";
        goto ERROR;
    }

SCM:
    // Entered right after 'e'/'E': an optional sign is allowed exactly once,
    // and it must be followed by at least one decimal digit.
    if (buff[start] == '+' || buff[start] == '-')
        start++;
    switch (buff[start])
    {
    C_DIGIT
        start++;
        goto SCM_DIGITS;
    default:
        Result.Adding = "不合法的科学计数法";
        goto ERROR;
    }

SCM_DIGITS:
    if (buff[start] == '\0') {
        Result.type = CPPTYPE::CPP_NUMBER_SCM;
        goto FINISH;
    }
    switch (buff[start])
    {
    C_DIGIT
        start++;
        goto SCM_DIGITS;
    C_SPLITE
        Result.type = CPPTYPE::CPP_NUMBER_SCM;
        goto FINISH;
    C_NUMBER_POSTFIX
    case 'f': case 'F':
        // Only the floating suffix budget is granted, so 'u'/'U' is rejected
        // by POSTFIX with the suffix error message.
        Result.type = CPPTYPE::CPP_NUMBER_SCM;
        flag_FL = true;
        goto POSTFIX;
    default:
        Result.Adding = "不合法的科学计数法";
        goto ERROR;
    }

POSTFIX:
    if (buff[start] == '\0')
        goto FINISH;
    switch (buff[start])
    {
    case 'l': case 'L':
        if (flag_L) flag_L = false;
        else if (flag_LL) flag_LL = false;
        else if (flag_FL) flag_FL = false;
        else goto POSTFIX_ERROR;
        start++;
        goto POSTFIX;
    case 'U': case 'u':
        if (flag_U) flag_U = false;
        else goto POSTFIX_ERROR;
        start++;
        goto POSTFIX;
    case 'F': case 'f':
        if (flag_FL) flag_FL = false;
        else goto POSTFIX_ERROR;
        start++;
        goto POSTFIX;
    C_SPLITE
        goto FINISH;
    }
    // Any other character after the suffix is an error (falls through).

POSTFIX_ERROR:
    Result.Adding = "数字后缀错误";
ERROR:
    // Skip the rest of the malformed literal so scanning can resume.
    GotoNextSplite(buff, start);
    Result.type = CPPTYPE::CPP_NULL;
FINISH:
    return Result;
}
/**
 * Scans an operator or a comment beginning at buff[start].
 *
 * @param buff  NUL-terminated input buffer.
 * @param start Index of the operator's first character; advanced past the
 *              token on return.
 * @return CPP_OPERATOR for a one-character operator, CPP_OPERATOR_COM for a
 *         two- or three-character operator, CPP_NOTES for a comment, or
 *         CPP_NULL (with Adding set) for a block comment that is never closed.
 *
 * A line comment stops in front of its terminating '\n', so the newline is
 * left for the caller's whitespace skipping. Newlines inside a block comment
 * are consumed here.
 */
TypeResult HandleOperator(char*buff, int &start)
{
    TypeResult result;
    result.type = CPPTYPE::CPP_OPERATOR;
    result.Adding = "";

    const char current = buff[start];
    const char next = buff[start + 1];
    int length = 1;   // number of characters the operator occupies

    switch (current)
    {
    case '*': case '=': case '!': case '^': case '%':
        if (next == '=')
            length = 2;
        break;
    case '#': case ':':
        // "##" and "::"
        if (next == current)
            length = 2;
        break;
    case '+': case '&': case '|':
        // doubled form ("++", "&&", "||") or assignment form
        if (next == current || next == '=')
            length = 2;
        break;
    case '-':
        if (next == '-' || next == '>' || next == '=')
            length = 2;
        break;
    case '>': case '<':
        if (next == '=')
            length = 2;
        else if (next == current)
            // shift, optionally followed by '=' for shift-assignment
            length = (buff[start + 2] == '=') ? 3 : 2;
        break;
    case '/':
        if (next == '/') {
            // Line comment: runs to the end of the line or of the input.
            start += 2;
            while (buff[start] != '\n' && buff[start] != '\0')
                start++;
            result.type = CPPTYPE::CPP_NOTES;
            return result;
        }
        if (next == '*') {
            // Block comment: skip the opener first so that the '*' of "/*"
            // cannot be taken as the start of the closing "*/".
            start += 2;
            while (buff[start] != '\0') {
                if (buff[start] == '*' && buff[start + 1] == '/') {
                    start += 2;
                    result.type = CPPTYPE::CPP_NOTES;
                    return result;
                }
                start++;
            }
            result.type = CPPTYPE::CPP_NULL;
            result.Adding = "未结束的块注释";
            return result;
        }
        if (next == '=')
            length = 2;
        break;
    default:
        // Unknown character: consume it as a single operator.
        break;
    }

    start += length;
    if (length > 1)
        result.type = CPPTYPE::CPP_OPERATOR_COM;
    return result;
}
/**
 * Scans the remainder of a character or string literal.
 *
 * @param buff  NUL-terminated input buffer.
 * @param start Index of the first character after the opening quote; advanced
 *              past the closing quote on success.
 * @param flag  true to scan a string literal ("..."), false to scan a
 *              character literal ('.').
 * @return CPP_STRING or CPP_CHAR on success; CPP_NULL with Adding set to the
 *         error message otherwise.
 *
 * Escapes accepted: the simple escapes, a backslash followed by one to three
 * octal digits, and "\x" followed by one or more hexadecimal digits.
 * A literal that reaches a newline or the end of the input before its closing
 * quote is reported as an error, with start left on that character.
 */
TypeResult HandleString(char *buff, int &start, bool flag)
{
    TypeResult result;
    result.type = CPPTYPE::CPP_NULL;
    result.Adding = "";

    auto isHexDigit = [](char c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    };

    // Consumes the escape body that follows a backslash (start indexes the
    // character after the backslash). Returns false if it is not a valid escape.
    auto skipEscape = [&]() -> bool {
        const char c = buff[start];
        switch (c)
        {
        case '\\': case '\'': case '\"': case '\?':
        case 'a': case 'b': case 'f': case 'n': case 'r': case 't': case 'v':
            start++;
            return true;
        case 'x':
            if (!isHexDigit(buff[start + 1]))
                return false;
            start++;
            while (isHexDigit(buff[start]))
                start++;
            return true;
        default:
            break;
        }
        if (c >= '0' && c <= '7') {
            for (int digits = 0; digits < 3 && buff[start] >= '0' && buff[start] <= '7'; digits++)
                start++;
            return true;
        }
        return false;
    };

    // Reports an error and skips ahead to the next separator.
    auto fail = [&](const char *message) -> TypeResult {
        GotoNextSplite(buff, start);
        result.Adding = message;
        result.type = CPPTYPE::CPP_NULL;
        return result;
    };

    if (!flag) {
        // Character literal: exactly one character or one escape, then '\''.
        const char first = buff[start];
        if (first == '\n' || first == '\0') {
            result.Adding = "错误的字符结尾";
            return result;
        }
        if (first == '\'')
            return fail("错误的字符");   // empty literal ''
        if (first == '\\') {
            start++;
            if (!skipEscape())
                return fail("错误的转义");
        }
        else {
            start++;
        }
        if (buff[start] != '\'')
            return fail("错误的字符");
        start++;
        result.type = CPPTYPE::CPP_CHAR;
        return result;
    }

    // String literal: any run of characters/escapes up to the closing quote.
    for (;;) {
        const char c = buff[start];
        if (c == '\n' || c == '\0') {
            // Unterminated: leave start on the newline / terminator.
            result.Adding = "错误的字符结尾";
            return result;
        }
        start++;
        if (c == '"') {
            result.type = CPPTYPE::CPP_STRING;
            return result;
        }
        if (c == '\\' && !skipEscape())
            return fail("错误的转义");
    }
}
// Advances `start` over a run of letters, digits and underscores, leaving it
// on the first character that cannot be part of an identifier.
void inline HandleNames(char *buff, int &start) {
    for (;;) {
        switch (buff[start])
        {
        C_LETTER
        C_DIGIT
        case '_':
            ++start;
            break;      // leaves the switch only; the loop continues
        default:
            return;
        }
    }
}
// Returns a copy of the characters buff[start] .. buff[end - 1].
string my_print(char *buff, int start, int end) {
    const char *first = buff + start;
    return string(first, end - start);
}
// Driver of the hand-written lexer: reads "词法分析.cpp", splits it into
// tokens, prints each token with its category (and an id for identifiers,
// keywords and compound operators), and prints per-category counts, the line
// count and the character count when scanning stops.
int main()
{
// Next id to hand out to a newly seen identifier.
int CPPNAMECount = 0;
// Identifier -> id, in order of first appearance.
unordered_map<string, int> Table;
// Number of tokens seen per CPPTYPE value.
int count[13] = { 0 };
// Characters consumed in buffer halves that have already been discarded.
int AllLetter = 0;
// Input window of 2 * MaxLen bytes, zero-filled so that a short read leaves
// '\0' after the data.
// NOTE(review): the buffer is never released with delete[].
char *buff = new char[MaxLen*2];
memset(buff, 0, sizeof(char)*MaxLen * 2);
// index: current scan position; pre: start of the token being scanned.
int index = 0;
int pre = 0;
// NOTE(review): the stream is not checked for a failed open.
ifstream fs("词法分析.cpp");
fs.read(buff, MaxLen*2);
TypeResult result;
// Top of the scanning loop: dispatch on the first character of the token.
start:
pre = index;
switch (buff[index])
{
case '.':
// '.' may begin "...", a floating literal such as ".5", or be a lone operator.
switch (buff[index+1])
{
case '.':
if (buff[index + 2] != '.')
{
// ".." that is not followed by a third '.' is reported as an error.
result.type = CPPTYPE::CPP_NULL;
result.Adding = "不合法的运算符";
GotoNextSplite(buff, index);
goto OUT;
}
else {
index += 3;
result.type = CPPTYPE::CPP_OPERATOR_COM;
goto OUT;
}
C_DIGIT
goto NUM_HANDLE;
default:
index++;
result.type = CPPTYPE::CPP_OPERATOR;
goto OUT;
}
// Shared entry for numbers: reached by the goto above and by the digit case
// labels of the outer switch that follow.
NUM_HANDLE:
C_DIGIT
result = HandleNumber(buff, index);
goto OUT;
// Single-character operators, plus the backslash.
C_OPERATOR_SINGLE case '\\':
index++;
result.type = CPPTYPE::CPP_OPERATOR;
goto OUT;
// Characters that may begin a compound operator or a comment.
C_OPERATOR
result = HandleOperator(buff, index);
goto OUT;
C_LETTER case '_':
HandleNames(buff, index);
result.type = CPPTYPE::CPP_NAME;
goto OUT;
case '\'':
// Step over the opening quote; HandleString scans the rest.
index++;
result = HandleString(buff, index, false);
goto OUT;
case '"':
index++;
result = HandleString(buff, index, true);
goto OUT;
default:
// Any character not handled above, including the '\0' padding after the end
// of the data, stops the scan: print the statistics and exit.
// NOTE(review): an unexpected character therefore looks like end of input.
AllLetter += index;
for (int i = 0; i < 13; i++)
cout << CPPPTYPE_NAME[i] << ':' << count[i] << std::endl;
cout << "行数:" << LineNum << std::endl;
cout << "总字数:" << AllLetter << std::endl;
return 0;
}
// One token has been scanned: its text is buff[pre, index).
OUT:
string a = my_print(buff, pre, index);
// Comment text is not echoed.
if (CPPTYPE::CPP_NOTES != result.type)
cout << a;
if (result.type == CPPTYPE::CPP_NULL) {
// Error: print the message and resume scanning. This path skips both the
// per-category counting and the buffer refill check below.
printf("\t\t%s\n", result.Adding);
SkipBlank(buff, index);
goto start;
}
// An identifier found in KeyTable is reclassified as a keyword.
if (result.type == CPPTYPE::CPP_NAME)
if (KeyTable.count(a))
result.type = CPPTYPE::CPP_KEY;
// A remaining identifier gets the next free id on first appearance.
if (result.type == CPPTYPE::CPP_NAME && !Table.count(a))
Table.insert(std::pair<string,int>(a, CPPNAMECount++));
if (result.type != CPPTYPE::CPP_NOTES)
printf("\t \t%s\t", CPPPTYPE_NAME[result.type]);
count[result.type]++;
switch (result.type)
{
case CPPTYPE::CPP_NAME:
cout << Table[a] << std::endl; break;
case CPPTYPE::CPP_KEY:
cout << KeyTable[a] << std::endl; break;
case CPPTYPE::CPP_OPERATOR_COM:
// operator[] inserts a zero entry when the spelling is not in Operator.
cout << Operator[a] << std::endl; break;
default:
if (result.type != CPPTYPE::CPP_NOTES)
cout << a << std::endl; break;
}
SkipBlank(buff, index);
// Once the scan position is in the upper half, slide that half down, clear
// the upper half and read the next MaxLen bytes into it.
if (index >= MaxLen) {
index -= MaxLen;
AllLetter += MaxLen;
memcpy(buff, buff + MaxLen, MaxLen);
memset(buff + MaxLen, 0, MaxLen);
fs.read(buff + MaxLen, MaxLen);
}
goto start;
// Not reached: the loop only exits through the default case above.
return 0;
}
// Error recovery helper: advances `start` until it indexes a blank, an
// operator character, or the terminating '\0'. Stopping at '\0' keeps the
// scan inside a NUL-terminated buffer when the malformed token is the last
// thing in the input.
void GotoNextSplite(char *buff, int &start)
{
    while (buff[start] != '\0') {
        switch (buff[start])
        {
        C_BLANK
        C_OPERATOR
        C_OPERATOR_SINGLE
            return;
        default:
            start++;
            break;
        }
    }
}
// Advances `start` past whitespace and increments LineNum for every '\n'
// consumed. '\r' is skipped as well so that input with CRLF line endings is
// not mistaken for an unknown character by the caller.
void SkipBlank(char* buff, int &start) {
    for (;;) {
        switch (buff[start])
        {
        case '\n':
            LineNum++;
            start++;
            break;
        case ' ': case '\t': case '\v': case '\f': case '\r':
            start++;
            break;
        default:
            return;
        }
    }
}
/* Flex specification of the scanner (build with the C++ option, -+).
 *
 * Token codes returned by yylex():
 *   0 line comment, 1 block comment, 2 identifier, 4 character literal,
 *   5 string literal, 6 decimal, 7 octal, 8 hexadecimal, 9 scientific
 *   notation, 10 floating point, 11 single-character punctuation,
 *   12-42 and 46-51 explicit operators, 43 newline, 44 blanks,
 *   45 unrecognised character.
 *
 * NOTE(review): code 0 for a line comment is also the value yylex() returns
 * at end of input, so a driver looping "while (yylex())" stops at the first
 * line comment. Changing the code needs a matching change in the driver.
 *
 * notes uses the star-run form so that comments ending in several stars are
 * matched; numberSCM requires at least one exponent digit; the blank rule uses
 * + so it can never match the empty string.
 */
digit [0-9]
digit8 [0-7]
digit16 [0-9a-fA-F]
postfix ((u|U)?(l|L)?(l|L)?)|((l|L)?(u|U)?(l|L)?)|((l|L)?(l|L)?(u|U)?)
postfixf (l|L)|(f|F)
letter [A-Za-z_]
note "//"[^\n]*
notes "/*"([^*]|\*+[^*/])*\*+"/"
id {letter}({letter}|{digit})*
char (\\[abfnrlvt\'\"\?\\])|(\\[0-7]([0-7][0-7])?)|("\\x"{digit16}{digit16})
achar \'([^\n\'\\]|{char})\'
string \"([^\n\\\"]|{char})*\"
number [1-9]{digit}*{postfix}?
number8 0{digit8}*{postfix}?
number16 ("0x"|"0X"){digit16}+{postfix}?
numberSCM {digit}*\.?{digit}+[eE][+-]?{digit}+{postfixf}?
numberF {digit}*\.{digit}+{postfixf}?
operator [\?\~\,\(\)\{\}\[\]\;\:\.]
%%
"*=" {return 12;}
"*" {return 13;}
"==" {return 14;}
"=" {return 15;}
"!=" {return 16;}
"!" {return 17;}
"^=" {return 18;}
"^" {return 19;}
"##" {return 20;}
"#" {return 21;}
"++" {return 22;}
"+=" {return 23;}
"+" {return 24;}
"--" {return 25;}
"-=" {return 26;}
"->" {return 27;}
"-" {return 28;}
"||" {return 29;}
"|=" {return 30;}
"|" {return 31;}
"&&" {return 32;}
"&=" {return 33;}
"&" {return 34;}
"<<=" {return 35;}
"<<" {return 36;}
"<=" {return 37;}
"<" {return 38;}
">>=" {return 39;}
">>" {return 40;}
">=" {return 41;}
">" {return 42;}
"/=" {return 46;}
"/" {return 47;}
"%=" {return 48;}
"%" {return 49;}
"..." {return 50;}
"::" {return 51;}
{note} {return 0;}
{notes} {return 1;}
{id} {return 2;}
{achar} {return 4;}
{string} {return 5;}
{number} {return 6;}
{number8} {return 7;}
{number16} {return 8;}
{numberSCM} {return 9;}
{numberF} {return 10;}
{operator} {return 11;}
"\n" {return 43;}
[ \t\v\f]+ {return 44;}
. {return 45;}
%%
int yyFlexLexer::yywrap()
{
return 1;
}
#include
#include
// FlexLexer.h is normally found at /usr/include/FlexLexer.h on Linux; when developing on Windows, copy it over.
#include
#include
#include
#include
using std::ifstream;
using std::cout;
using std::endl;
using std::string;
using std::unordered_map;
// Keyword table defined in another translation unit; this is the one the
// flex driver looks identifiers up in.
extern unordered_map<string, int> KeyTable;
// Local copy of the keyword table (keyword -> id). Existing ids are kept,
// including the unused id 34; keywords that were missing are appended with
// fresh ids starting at 57.
unordered_map<string, int> KeyTable2 = {
    {"asm",0},{"auto",1},{"bool",2},{"break",3},{"case",4},{"catch",5},{"char",6},
    {"class",7},{"const",8},{"continue",9},{"default",10},{"delete",11},{"do",12},{"double",13},
    {"else",14},{"enum",15},{"explicit",16},{"export",17},{"extern",18},{"false",19},{"float",20},
    {"for",21},{"friend",22},{"goto",23},{"if",24},{"inline",25},{"int",26},{"long",27},
    {"namespace",28},{"mutable",29},{"new",30},{"operator",31},{"private",32},{"protected",33},{"public",35},
    {"register",36},{"return",37},{"signed",38},{"sizeof",39},{"static",40},{"struct",41},{"switch",42},
    {"template",43},{"this",44},{"throw",45},{"true",46},{"try",47},{"typedef",48},{"void",49},
    {"typeid",50},{"typename",51},{"union",52},{"unsigned",53},{"using",54},{"virtual",55},{"volatile",56},
    // Previously missing keywords.
    {"while",57},{"short",58},{"wchar_t",59},
    {"const_cast",60},{"dynamic_cast",61},{"reinterpret_cast",62},{"static_cast",63}
};
// Display name for each token category of the flex driver, indexed by the
// category number.
const char* name[] = {
    "行注释",
    "块注释",
    "标识符",
    "关键字",
    "字符",
    "字符串",
    "十进制数字",
    "八进制数字",
    "十六进制数字",
    "科学计数法数字",
    "浮点数",
    "运算符",
    "操作符"
};
/**
 * Driver for the flex-generated scanner: tokenises "词法分析.cpp", prints
 * every token with its category, and finally prints the per-category counts,
 * the number of lines and the number of characters scanned.
 *
 * @return 0 after a completed scan, 1 if the input file cannot be opened.
 *
 * NOTE(review): the scanner returns 0 for a line comment, which is also the
 * end-of-input value of yylex(), so the loop below ends at the first line
 * comment. Fixing that needs a coordinated change of the token code in the
 * flex specification.
 */
int maina()
{
    yyFlexLexer Scaner;
    ifstream ifile("词法分析.cpp");
    if (!ifile) {
        cout << "无法打开文件: 词法分析.cpp" << endl;
        return 1;
    }
    Scaner.switch_streams(ifile, std::cout);

    unordered_map<string, int> table;   // identifier -> id, by first appearance
    int c;
    string ID;
    int typeCount[13] = { 0 };          // tokens seen per category (index into name[])
    int IDcount = 0;
    int line = 0;                       // newlines seen so far
    int allLetter = 0;

    while ((c = Scaner.yylex()) != 0)
    {
        switch (c)
        {
        case 2:
            // Identifier or keyword, decided by a lookup in KeyTable.
            ID = string(Scaner.YYText());
            if (KeyTable.count(ID)) {
                cout << Scaner.YYText() << "\t\t关键字\t\t" << KeyTable[ID] << endl;
                typeCount[3]++;
            }
            else {
                if (!table.count(ID))
                    table.insert(std::pair<string, int>(ID, IDcount++));
                cout << Scaner.YYText() << "\t\t标识符\t\t" << table[ID] << endl;
                typeCount[2]++;
            }
            break;
        case 0:case 1:case 3:case 4:case 5:case 6:case 7:case 8:case 9:case 10:case 11:
            cout << Scaner.YYText() << "\t\t" << name[c] << "\t\t" << Scaner.YYText() << endl;
            typeCount[c]++;
            break;
        case 45:
            // `line` counts completed lines, so the current line is line + 1.
            cout << "\t\t在" << (line + 1) << "行 ->" << Scaner.YYText() << "<- 附近有错误" << endl;
            break;
        case 43:
            line++;
            // falls through: a newline prints nothing, like other blanks
        case 44:
            break;
        default:
            // Every remaining code is an explicit operator rule; count it in
            // the last category so the summary no longer reports 0 for it.
            cout << Scaner.YYText() << "\t\t运算符\t\t" << c << endl;
            typeCount[12]++;
            break;
        }
        allLetter += Scaner.YYLeng();
    }

    for (int i = 2; i < 13; i++)
        cout << name[i] << ":" << typeCount[i] << std::endl;
    cout << "总行数:" << line << endl;
    cout << "总字数:" << allLetter << endl;
    return 0;
}