编译原理及实践教材附带了TINY编译器,在这里对这个小型编译器的代码,做一下简单的解析.
TINY编译器的词法分析Lex源程序是:
%{
/*
 * Lex specification for the TINY scanner.
 *
 * The rules section maps reserved words, operators, numbers and
 * identifiers to their token codes, counts lines, and skips
 * whitespace and { ... } comments.  getToken() in the user-code
 * section wraps yylex() for the rest of the compiler.
 */
#include "globals.h"
#include "util.h"
#include "scan.h"
/* lexeme of identifier or reserved word */
char tokenString[MAXTOKENLEN+1];
%}

digit       [0-9]
number      {digit}+
letter      [a-zA-Z]
identifier  {letter}+
newline     \n
whitespace  [ \t]+

%%

"if"            {return IF;}
"then"          {return THEN;}
"else"          {return ELSE;}
"end"           {return END;}
"repeat"        {return REPEAT;}
"until"         {return UNTIL;}
"read"          {return READ;}
"write"         {return WRITE;}
":="            {return ASSIGN;}
"="             {return EQ;}
"<"             {return LT;}
"+"             {return PLUS;}
"-"             {return MINUS;}
"*"             {return TIMES;}
"/"             {return OVER;}
"("             {return LPAREN;}
")"             {return RPAREN;}
";"             {return SEMI;}
{number}        {return NUM;}
{identifier}    {return ID;}
{newline}       {lineno++;}
{whitespace}    {/* skip whitespace */}
"{"             { /* Skip a comment, still counting the lines it spans.
                     c is an int so that EOF cannot be confused with a
                     valid character value.
                     NOTE(review): some lex implementations return 0
                     rather than EOF from input() at end of input --
                     confirm for the generator in use. */
                  int c;
                  do
                  { c = input();
                    if (c == EOF) break;
                    if (c == '\n') lineno++;
                  } while (c != '}');
                }
.               {return ERROR;}

%%

/*
 * Returns the next token from the source file.
 *
 * On the first call the scanner is attached to `source` (input) and
 * `listing` (output) and the line counter is advanced to 1.  The
 * matched text is copied into tokenString, truncated to MAXTOKENLEN
 * characters.  When TraceScan is set, the token is echoed to the
 * listing together with the current line number.
 */
TokenType getToken(void)
{ static int firstTime = TRUE;
  TokenType currentToken;
  if (firstTime)
  { firstTime = FALSE;
    lineno++;
    yyin = source;
    yyout = listing;
  }
  currentToken = yylex();
  strncpy(tokenString,yytext,MAXTOKENLEN);
  /* strncpy does not terminate the destination when it truncates */
  tokenString[MAXTOKENLEN] = '\0';
  if (TraceScan) {
    fprintf(listing,"\t%d: ",lineno);
    printToken(currentToken,tokenString);
  }
  return currentToken;
}
TINY编译器的语法分析Yacc源程序是:
%{
/*
 * Yacc specification for the TINY parser.
 *
 * Every semantic value is a TreeNode pointer.  The grammar actions
 * build the syntax tree bottom-up; statements in a sequence are
 * chained through their sibling pointers, and the finished tree is
 * left in savedTree.
 */
#define YYPARSER /* distinguishes Yacc output from other code files */

#include "globals.h"
#include "util.h"
#include "scan.h"
#include "parse.h"

#define YYSTYPE TreeNode *
static char * savedName; /* for use in assignments */
static int savedLineNo;  /* ditto */
static TreeNode * savedTree; /* stores syntax tree for later return */

%}

%token IF THEN ELSE END REPEAT UNTIL READ WRITE
%token ID NUM
%token ASSIGN EQ LT PLUS MINUS TIMES OVER LPAREN RPAREN SEMI
%token ERROR

%% /* Grammar for TINY */

program     : stmt_seq
                 { savedTree = $1;}
            ;
stmt_seq    : stmt_seq SEMI stmt
                 { /* append the new statement at the end of the
                      sibling chain that starts at $1 */
                   YYSTYPE t = $1;
                   if (t != NULL)
                   { while (t->sibling != NULL)
                        t = t->sibling;
                     t->sibling = $3;
                     $$ = $1; }
                     else $$ = $3;
                 }
            | stmt  { $$ = $1; }
            ;
stmt        : if_stmt { $$ = $1; }
            | repeat_stmt { $$ = $1; }
            | assign_stmt { $$ = $1; }
            | read_stmt { $$ = $1; }
            | write_stmt { $$ = $1; }
            | error  { $$ = NULL; }
            ;
if_stmt     : IF exp THEN stmt_seq END
                 { $$ = newStmtNode(IfK);
                   $$->child[0] = $2;
                   $$->child[1] = $4;
                 }
            | IF exp THEN stmt_seq ELSE stmt_seq END
                 { $$ = newStmtNode(IfK);
                   $$->child[0] = $2;
                   $$->child[1] = $4;
                   $$->child[2] = $6;
                 }
            ;
repeat_stmt : REPEAT stmt_seq UNTIL exp
                 { $$ = newStmtNode(RepeatK);
                   $$->child[0] = $2;
                   $$->child[1] = $4;
                 }
            ;
assign_stmt : ID { /* mid-rule action: capture the target name and
                      line before the scanner moves on */
                   savedName = copyString(tokenString);
                   savedLineNo = lineno; }
              ASSIGN exp
                 { $$ = newStmtNode(AssignK);
                   $$->child[0] = $4;
                   $$->attr.name = savedName;
                   $$->lineno = savedLineNo;
                 }
            ;
read_stmt   : READ ID
                 { $$ = newStmtNode(ReadK);
                   $$->attr.name =
                     copyString(tokenString);
                 }
            ;
write_stmt  : WRITE exp
                 { $$ = newStmtNode(WriteK);
                   $$->child[0] = $2;
                 }
            ;
exp         : simple_exp LT simple_exp
                 { $$ = newExpNode(OpK);
                   $$->child[0] = $1;
                   $$->child[1] = $3;
                   $$->attr.op = LT;
                 }
            | simple_exp EQ simple_exp
                 { $$ = newExpNode(OpK);
                   $$->child[0] = $1;
                   $$->child[1] = $3;
                   $$->attr.op = EQ;
                 }
            | simple_exp { $$ = $1; }
            ;
simple_exp  : simple_exp PLUS term
                 { $$ = newExpNode(OpK);
                   $$->child[0] = $1;
                   $$->child[1] = $3;
                   $$->attr.op = PLUS;
                 }
            | simple_exp MINUS term
                 { $$ = newExpNode(OpK);
                   $$->child[0] = $1;
                   $$->child[1] = $3;
                   $$->attr.op = MINUS;
                 }
            | term { $$ = $1; }
            ;
term        : term TIMES factor
                 { $$ = newExpNode(OpK);
                   $$->child[0] = $1;
                   $$->child[1] = $3;
                   $$->attr.op = TIMES;
                 }
            | term OVER factor
                 { $$ = newExpNode(OpK);
                   $$->child[0] = $1;
                   $$->child[1] = $3;
                   $$->attr.op = OVER;
                 }
            | factor { $$ = $1; }
            ;
factor      : LPAREN exp RPAREN
                 { $$ = $2; }
            | NUM
                 { $$ = newExpNode(ConstK);
                   $$->attr.val = atoi(tokenString);
                 }
            | ID { $$ = newExpNode(IdK);
                   $$->attr.name =
                         copyString(tokenString);
                 }
            | error { $$ = NULL; }
            ;

%%

/*
 * Error callback invoked by the generated parser.
 *
 * Writes the message and the offending token to the listing, sets the
 * global Error flag, and returns 0.
 */
int yyerror(char * message)
{ fprintf(listing,"Syntax error at line %d: %s\n",lineno,message);
  fprintf(listing,"Current token: ");
  printToken(yychar,tokenString);
  Error = TRUE;
  return 0;
}
所用到的代码示例的输入文件SAMPLE.TNY:
{ Sample program in TINY language - computes factorial }
read x; { input an integer }
if 0 < x then { don't compute if x <= 0 }
  fact := 1;
  repeat
    fact := fact * x;
    x := x - 1
  until x = 0;
  write fact { output factorial of x }
end
教材附带的代码示例中,词法分析和语法分析程序都是手工编写的,我们在这里一步步对代码做简单的解析.
词法分析:
词法分析是编译的基础,主要任务是从左至右逐个字符地对源程序进行扫描,产生一个个单词记号(token).每个单词记号都是词法分析程序从剩余输入字符串的开头识别出的、符合某种格式的字符串;通过这一过程,作为字符串输入的源程序被改造成单词符号串这种中间形式.
单词的类型:
根据单词在语言中的作用将单词大致分为五类:
(1).关键字:这类单词在特定语言中有固定的意义;
(2).标识符:标识符的作用是为某个实体命名,以便于程序引用,可以用标识符来命名的实体包括变量,过程,类,对象等;
(3).常数:常数一般有整型,实型,布尔型,字符型等;
(4).运算符:分为算术运算符,布尔运算符,关系运算符;
(5).界符:类似于自然语言中的标点符号.
单词的类别码:
词法分析程序的输入是源程序字符串,输出是与源程序等价的符号序列.这些符号序列可以是如下的形式:
(类别码,单词的值)
其中类别码表示单词的种类,通常用整数表示.
现在进入到词法分析的源程序,首先看一下GLOBALS.H的头文件:
/* Token categories produced by the scanner. */
typedef enum
{
    /* book-keeping tokens */
    ENDFILE, ERROR,
    /* reserved words */
    IF, THEN, ELSE, END, REPEAT, UNTIL, READ, WRITE,
    /* multi-character tokens */
    ID, NUM,
    /* special symbols */
    ASSIGN, EQ, LT, PLUS, MINUS, TIMES, OVER, LPAREN, RPAREN, SEMI
} TokenType;

/* Node category: statement node or expression node. */
typedef enum { StmtK, ExpK } NodeKind;

/* Statement kinds: if, repeat, assign, read and write statements. */
typedef enum { IfK, RepeatK, AssignK, ReadK, WriteK } StmtKind;

/* Expression kinds: operator, constant, identifier. */
typedef enum { OpK, ConstK, IdK } ExpKind;

/* Expression result type. */
typedef enum { Void, Integer, Boolean } ExpType;

/* Maximum number of children a syntax-tree node can have. */
#define MAXCHILDREN 3

/* One node of the syntax tree. */
typedef struct treeNode
{
    struct treeNode * child[MAXCHILDREN]; /* child subtrees */
    struct treeNode * sibling;            /* right sibling */
    int lineno;
    NodeKind nodekind;                    /* selects which `kind` member applies */
    union { StmtKind stmt; ExpKind exp; } kind;
    union
    {
        TokenType op;  /* operator token */
        int val;       /* integer value */
        char * name;   /* identifier name */
    } attr;
    ExpType type;
} TreeNode;
然后再看一下词法扫描程序: SCAN.C
SCAN.C /*定义的状态*/ typedef enum { START, /*初始状态*/ INASSIGN, /*进入到赋值状态*/ INCOMMENT, /*进入到注释状态*/ INNUM, /*进入到数字状态*/ INID, /*进入到标志符状态*/ DONE /*状态结束*/ }StateType; /*每当语法分析程序需要一个单词时,就调用该子程序,得到 (类别码,单词的值)*/ TokenType getToken(void) { int tokenStringIndex = 0; TokenType currentToken; StateType state = START; int save; while (state != DONE) { int c = getNextChar(); /*从输入buf中读入一个字符*/ save = TRUE; switch (state) { case START: if (isdigit(c)) state = INNUM; else if (isalpha(c)) /*判断字母*/ state = INID; else if (c == ':') state = INASSIGN; else if ((c == ' ') || (c == '/t') || (c == '/n')) save = FALSE; else if (c == '{') { save = FALSE; state = INCOMMENT; } else { state = DONE; switch (c) { case EOF: save = FALSE; currentToken = ENDFILE; break; case '=': currentToken = EQ; break; case '<': currentToken = LT; break; case '+': currentToken = PLUS; break; case '-': currentToken = MINUS; break; case '*': currentToken = TIMES; break; case '/': currentToken = OVER; break; case '(': currentToken = LPAREN; break; case ')': currentToken = RPAREN; break; case ';': currentToken = SEMI; break; default: currentToken = ERROR; break; } } break; case INCOMMENT: save = FALSE; if (c == EOF) { state = DONE; currentToken = ENDFILE; } else if (c == '}') state = START; break; case INASSIGN: state = DONE; if (c == '=') currentToken = ASSIGN; else { /* backup in the input */ ungetNextChar(); save = FALSE; currentToken = ERROR; } break; case INNUM: if (!isdigit(c)) { /* backup in the input */ ungetNextChar(); save = FALSE; state = DONE; currentToken = NUM; } break; case INID: if (!isalpha(c)) { /* backup in the input */ ungetNextChar(); save = FALSE; state = DONE; currentToken = ID; } break; case DONE: default: /* should never happen */ fprintf(listing,"Scanner Bug: state= %d/n",state); state = DONE; currentToken = ERROR; break; } if ((save) && (tokenStringIndex <= MAXTOKENLEN)) { tokenString[tokenStringIndex++] = (char) c; } /*解析单词结束*/ if (state == DONE) { tokenString[tokenStringIndex] = '/0'; if 
(currentToken == ID) { currentToken = reservedLookup(tokenString); } } } if (TraceScan) { fprintf(listing,"/t%d: ",lineno); printToken(currentToken,tokenString); } return currentToken; } |
再来看语法分析程序PARSE.C,主要的作用是形成一棵语法树:
/* Current token, shared by all the parsing routines in this file. */
static TokenType token;

/* Prototypes for the recursive-descent parsing routines, which call one another. */
static TreeNode * stmt_sequence(void);
static TreeNode * statement(void);
static TreeNode * if_stmt(void);
static TreeNode * repeat_stmt(void);
static TreeNode * assign_stmt(void);
static TreeNode * read_stmt(void);
static TreeNode * write_stmt(void);
static TreeNode * exp(void);
static TreeNode * simple_exp(void);
static TreeNode * term(void);
static TreeNode * factor(void);
static void syntaxError(char * message) { fprintf(listing,"/n>>> "); fprintf(listing,"Syntax error at line %d: %s",lineno,message); Error = TRUE; } static void match(TokenType expected) { //匹配分出的单词,如果匹配的话,取下一个单词 if (token == expected) { token = getToken(); } else { syntaxError("unexpected token -> "); printToken(token,tokenString); fprintf(listing," "); } } TreeNode * stmt_sequence(void) { //形成一棵以第一条语句开始的参数语法树 TreeNode * t = statement(); TreeNode * p = t; while ((token!=ENDFILE) && (token!=END) && (token!=ELSE) && (token!=UNTIL)) { TreeNode * q; match(SEMI); q = statement(); if (q!=NULL) { if (t==NULL) t = p = q; else { p->sibling = q; //下一个语句是右兄弟结点,形成同一层级 p = q; } } } return t; } TreeNode * statement(void) { //对五种语句类型分别处理 TreeNode * t = NULL; switch (token) { case IF : t = if_stmt(); break; case REPEAT : t = repeat_stmt(); break; case ID : t = assign_stmt(); break; case READ : t = read_stmt(); break; case WRITE : t = write_stmt(); break; default : syntaxError("unexpected token -> "); printToken(token,tokenString); token = getToken(); break; } /* end case */ return t; } TreeNode * if_stmt(void) { //对文法:IF exp THEN stmt_seq END //IF exp THEN stmt_seq ELSE stmt_seq END //的处理 TreeNode * t = newStmtNode(IfK); match(IF); if (t!=NULL) t->child[0] = exp(); match(THEN); if (t!=NULL) t->child[1] = stmt_sequence(); if (token==ELSE) { match(ELSE); if (t!=NULL) t->child[2] = stmt_sequence(); } match(END); return t; } TreeNode * repeat_stmt(void) { //对文法:REPEAT stmt_seq UNTIL exp //的处理 TreeNode * t = newStmtNode(RepeatK); match(REPEAT); if (t!=NULL) t->child[0] = stmt_sequence(); match(UNTIL); if (t!=NULL) t->child[1] = exp(); return t; } TreeNode * assign_stmt(void) { //对赋值语句的处理 //文法:ID ASSIGN exp TreeNode * t = newStmtNode(AssignK); if ((t!=NULL) && (token==ID)) { t->attr.name = copyString(tokenString); } match(ID); match(ASSIGN); if (t!=NULL) t->child[0] = exp(); return t; } TreeNode * read_stmt(void) { //对文法: READ ID //的处理 TreeNode * t = newStmtNode(ReadK); match(READ); if 
((t!=NULL) && (token==ID)) t->attr.name = copyString(tokenString); match(ID); return t; } TreeNode * write_stmt(void) { //对文法: WRITE exp //的处理 TreeNode * t = newStmtNode(WriteK); match(WRITE); if (t!=NULL) t->child[0] = exp(); return t; } TreeNode * exp(void) { //对文法:exp: simple_exp LT simple_exp // | simple_exp EQ simple_exp // | simple_exp //的处理 //先生成了左边的子表达式 TreeNode * t = simple_exp(); if ((token==LT)||(token==EQ)) { TreeNode * p = newExpNode(OpK); //操作符表达式结点 if (p!=NULL) { p->child[0] = t; p->attr.op = token; //操作符类型 t = p; //t是需要返回的 } match(token); if (t!=NULL) { t->child[1] = simple_exp(); //在生成右边的子表达式 } } return t; } TreeNode * simple_exp(void) { //对文法:simple_exp : simple_exp PLUS term // | simple_exp MINUS term // | term //的处理 //先生成term项 TreeNode * t = term(); while ((token==PLUS)||(token==MINUS)) { TreeNode * p = newExpNode(OpK); //同样是构造操作符表达式结点 if (p!=NULL) { p->child[0] = t; p->attr.op = token; t = p; match(token); t->child[1] = term(); } } return t; } TreeNode * term(void) { //对文法: term : term TIMES factor // | term OVER factor // | factor //的处理 TreeNode * t = factor(); while ((token==TIMES)||(token==OVER)) { TreeNode * p = newExpNode(OpK); //同样的处理方法 if (p!=NULL) { p->child[0] = t; p->attr.op = token; t = p; match(token); p->child[1] = factor(); } } return t; } TreeNode * factor(void) { //对文法: factor : LPAREN exp RPAREN // | NUM // | ID //的处理 //判断单词的类型 TreeNode * t = NULL; switch (token) { case NUM : t = newExpNode(ConstK); if ((t!=NULL) && (token==NUM)) t->attr.val = atoi(tokenString); match(NUM); break; case ID : t = newExpNode(IdK); if ((t!=NULL) && (token==ID)) t->attr.name = copyString(tokenString); match(ID); break; case LPAREN : match(LPAREN); t = exp(); match(RPAREN); break; default: syntaxError("unexpected token -> "); printToken(token,tokenString); token = getToken(); break; } return t; } TreeNode * parse(void) { TreeNode * t; token = getToken(); t = stmt_sequence(); if (token!=ENDFILE) syntaxError("Code ends before file/n"); return t; } |
语法分析程序结束,形成一棵以第一条语句开始的参数语法树。
下面再看一下主程序main()的调用:
/*
 * Compiler driver.
 *
 * Takes the source file name as its only argument, appending ".tny"
 * when the name contains no '.'.  Opens the file, parses it, and
 * prints the syntax tree to the listing (stdout) when TraceParse is
 * set.  Exits with status 1 on a usage error, an over-long file name,
 * or a file that cannot be opened; returns 0 otherwise.
 */
int main( int argc, char * argv[] )
{
    TreeNode * syntaxTree;
    char pgm[120]; /* source code file name */

    if (argc != 2)
    {
        fprintf(stderr, "usage: %s <filename>\n", argv[0]);
        exit(1);
    }
    /* Refuse names that would overflow pgm once ".tny" and the
       terminating NUL are added. */
    if (strlen(argv[1]) + sizeof ".tny" > sizeof pgm)
    {
        fprintf(stderr, "File name too long: %s\n", argv[1]);
        exit(1);
    }
    strcpy(pgm, argv[1]);
    if (strchr(pgm, '.') == NULL)
    {
        strcat(pgm, ".tny");
    }
    source = fopen(pgm, "r");
    if (source == NULL)
    {
        fprintf(stderr, "File %s not found\n", pgm);
        exit(1);
    }
    listing = stdout;
    fprintf(listing, "\nTINY COMPILATION: %s\n", pgm);
    syntaxTree = parse(); /* parser entry point, defined in PARSE.C */
    if (TraceParse)
    {
        fprintf(listing, "\nSyntax tree:\n");
        printTree(syntaxTree);
    }
    fclose(source);
    return 0;
}