要求:
(1) TINY+ 词法分析器以 TINY+ 源代码为输入,输出为识别出的 token 序列;
(2) 词法分析器以最长匹配为原则,例如‘:=’应识别为赋值符号而非单独的‘:’及‘=’;
(3) Token 以(种别码,属性值)表示,包含以下类型的种别码:
a) KEY 为关键字;
b) SYM 为系统特殊字符;
c) ID 为变量;
d) NUM 为数值常量;
e) STR 为字符串常量。
(4) 识别词法错误。词法分析器可以给出词法错误的行号并打印出对应的出错消息,主要包含以下类型的词法错误:
a) 非法字符。即不属于 TINY+ 字母表的字符,比如 $ 就是一个非法字符;
b) 字符串匹配错误,比如右部引号丢失,如 ‘scanner;
c) 注释的右部括号丢失或匹配错误,如 {this is an example。
核心代码:
TokenType 定义集:
globals.h
/* Kinds of token handled by the scanner and the token printer.
 * Enumerators take consecutive values in declaration order, so the
 * order below must not be changed.
 */
typedef enum {
    /* book-keeping tokens */
    ENDFILE,
    ERROR,

    /* reserved words */
    IF,
    THEN,
    ELSE,
    END,
    REPEAT,
    UNTIL,
    READ,
    WRITE,
    TRUE1,
    FALSE1,
    OR,
    AND,
    NOT,
    INT,
    BOOL1,
    STRING,
    FLOAT,
    DOUBLE,
    DO,
    WHILE,

    /* multicharacter tokens */
    ID,
    NUM,
    STR,

    /* special symbols */
    ASSIGN,
    EQ,
    LT,
    MT,
    ME,
    LE,
    PLUS,
    MINUS,
    TIMES,
    OVER,
    LPAREN,
    RPAREN,
    SEMI,
    COMMA,
    UPDOX,
    PERCENT
} TokenType;
扫描器:
scan.c
/* States of the scanner DFA.  Values follow declaration order, which
 * must be kept because the scanner prints the raw state number in its
 * "Scanner Bug" diagnostic.
 */
typedef enum {
    START,
    INASSIGN,
    INCOMMENT,
    INNUM,
    INID,
    DONE,
    INLE,
    INME,
    INUPDOX
} StateType;
/****************************************/
/* the primary function of the scanner */
/****************************************/
/* function getToken returns the
* next token in source file
*/
TokenType getToken(void)
{ /* index for storing into tokenString */
int tokenStringIndex = 0;
/* holds current token to be returned */
TokenType currentToken;
/* current state - always begins at START */
StateType state = START;
/* flag to indicate save to tokenString */
int save;
while (state != DONE)
{
int c = getNextChar();
save = TRUE;
switch (state)
{ case START:
if (isdigit(c))
state = INNUM;
else if (isalpha(c))
state = INID;
else if (c == ':')
state = INASSIGN;
else if (c == '>')
state = INME;
else if (c == '<')
state = INLE;
else if ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'))
save = FALSE;
else if (c == '{')
{ save = FALSE;
state = INCOMMENT;
}
else if (c == '\'')
{ save = FALSE;
state = INUPDOX;
}
else
{ state = DONE;
switch (c)
{ case EOF:
save = FALSE;
currentToken = ENDFILE;
break;
case '=':
currentToken = EQ;
break;
case '+':
currentToken = PLUS;
break;
case '-':
currentToken = MINUS;
break;
case '*':
currentToken = TIMES;
break;
case '/':
currentToken = OVER;
break;
case '(':
currentToken = LPAREN;
break;
case ')':
currentToken = RPAREN;
break;
case ';':
currentToken = SEMI;
break;
case ',':
currentToken = COMMA;
break;
case '%':
currentToken = PERCENT;
break;
default:
currentToken = ERROR;
break;
}
}
break;
case INCOMMENT:
save = FALSE;
if (c == EOF)
{
state = DONE;
currentToken = ERROR;
strcpy(tokenString,"Missing \" } \" !");
tokenStringIndex+=15;
}
else if (c == '}')
state = START;
break;
case INUPDOX:
if (c == '\'')
{
save = FALSE;
state = DONE;
currentToken = STR;
}
else if (!(linepos < bufsize))
{
save = FALSE;
state = DONE;
currentToken = ERROR;
strcpy(tokenString,"Missing \" \' \" !");
tokenStringIndex+=15;
}
break;
case INASSIGN:
state = DONE;
if (c == '=')
currentToken = ASSIGN;
else
{ /* backup in the input */
ungetNextChar();
save = FALSE;
currentToken = ERROR;
}
break;
case INNUM:
if (!isdigit(c))
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = NUM;
}
break;
case INLE:
if (c=='=')
{ state = DONE;
currentToken = LE;
}
else
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = LT;
}
break;
case INME:
if (c=='=')
{ state = DONE;
currentToken = ME;
}
else
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = MT;
}
break;
case INID:
if (!(isalpha(c)||isdigit(c)))
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = ID;
}
break;
case DONE:
default: /* should never happen */
fprintf(listing,"Scanner Bug: state= %d\n",state);
state = DONE;
currentToken = ERROR;
break;
}
if ((save) && (tokenStringIndex <= MAXTOKENLEN))
tokenString[tokenStringIndex++] = (char) c;
if (state == DONE)
{ tokenString[tokenStringIndex] = '\0';
if (currentToken == ID)
currentToken = reservedLookup(tokenString);
}
}
if (TraceScan) {
fprintf(listing,"\t%d: ",lineno);
printToken(currentToken,tokenString);
}
return currentToken;
} /* end getToken */
打印分词获得的token:
util.c
/* Procedure printToken prints a token and its lexeme to the listing
 * file.
 *
 * token       - kind of the token to print
 * tokenString - lexeme text; printed for reserved words, NUM, ID and
 *               STR tokens, and as the message of an ERROR token.
 *               Symbol tokens print a fixed spelling instead.
 *
 * Reserved words are labelled "KEY", special symbols "SYM".  A value
 * that matches no enumerator handled here is reported as
 * "Unknown token" with its numeric value.
 */
void printToken( TokenType token, const char* tokenString )
{
    switch (token)
    {
    case IF:
    case THEN:
    case ELSE:
    case END:
    case REPEAT:
    case UNTIL:
    case READ:
    case WRITE:
    case TRUE1:
    case FALSE1:
    case OR:
    case AND:
    case NOT:
    case INT:
    case BOOL1:
    case FLOAT:
    case STRING:
    case DOUBLE:
    case DO:
    case WHILE:
        fprintf(listing,"KEY: %s\n",tokenString);
        break;
    case ASSIGN:  fprintf(listing,"SYM: :=\n"); break;
    case LT:      fprintf(listing,"SYM: <\n");  break;
    case MT:      fprintf(listing,"SYM: >\n");  break;
    case LE:      fprintf(listing,"SYM: <=\n"); break;
    case ME:      fprintf(listing,"SYM: >=\n"); break;
    case EQ:      fprintf(listing,"SYM: =\n");  break;
    case COMMA:   fprintf(listing,"SYM: ,\n");  break;
    case UPDOX:   fprintf(listing,"SYM: \'\n"); break;
    /* a literal percent sign must be written "%%" in a format string;
     * a lone '%' is an invalid conversion specification */
    case PERCENT: fprintf(listing,"SYM: %%\n"); break;
    case LPAREN:  fprintf(listing,"SYM: (\n");  break;
    case RPAREN:  fprintf(listing,"SYM: )\n");  break;
    case SEMI:    fprintf(listing,"SYM: ;\n");  break;
    case PLUS:    fprintf(listing,"SYM: +\n");  break;
    case MINUS:   fprintf(listing,"SYM: -\n");  break;
    case TIMES:   fprintf(listing,"SYM: *\n");  break;
    case OVER:    fprintf(listing,"SYM: /\n");  break;
    case ENDFILE: fprintf(listing,"EOF\n");     break;
    case NUM:
        fprintf(listing,"NUM, val= %s\n",tokenString);
        break;
    case ID:
        fprintf(listing,"ID, name= %s\n",tokenString);
        break;
    case STR:
        fprintf(listing,"STR, val= %s\n",tokenString);
        break;
    case ERROR:
        fprintf(listing,"ERROR: %s\n",tokenString);
        break;
    default: /* should never happen */
        fprintf(listing,"Unknown token: %d\n",token);
        break;
    }
}
测试与用例:
错误用例: