LCC编译器的源程序分析(7)词法分析

下面开始分析关键字、ID 等的识别。采用这种词法分析方法是最高效的,因为在识别的过程里就已经区分出它是哪个关键字,而不像其它的词法分析程序那样,需要先识别出字符串、再查表才能决定它是否是关键字。
#074         case 'i':
#075               if (rcp[0] == 'f'
#076               && !(map[rcp[1]]&(DIGIT|LETTER))) {
#077                    cp = rcp + 1;
#078                    return IF;
#079               }
#080               if (rcp[0] == 'n'
#081               && rcp[1] == 't'
#082               && !(map[rcp[2]]&(DIGIT|LETTER))) {
#083                    cp = rcp + 2;
#084                    tsym = inttype->u.sym;
#085                    return INT;
#086               }
#087               goto id;
#088         case 'h': case 'j': case 'k': case 'm': case 'n': case 'o':
#089         case 'p': case 'q': case 'x': case 'y': case 'z':
#090         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
#091         case 'G': case 'H': case 'I': case 'J': case 'K':
#092         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
#093         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
#094         case 'Y': case 'Z':
#095         id:
#096               if (limit - rcp < MAXLINE) {
#097                    cp = rcp - 1;
#098                    fillbuf();
#099                    rcp = ++cp;
#100               }
#101               assert(cp == rcp);
#102               token = (char *)rcp - 1;
#103               while (map[*rcp]&(DIGIT|LETTER))
#104                    rcp++;
#105               token = stringn(token, (char *)rcp - token);
#106               tsym = lookup(token, identifiers);
#107               cp = rcp;
#108               return ID;
第 74 行是识别以 i 开头的关键字和 ID 字符串。第 75 行到第 79 行是识别 if 关键字,它是通过缓冲区里连续的三个字符,即 i、f 和紧随其后的分隔符来区分的:如果第三个字符是分隔符,说明它就是 if 关键字;如果后面还有其它合法的标识符字符(字母或数字),就是变量 ID 了,比如 ifStmt 就是变量 ID。
第 80 行到第 86 行是识别 int 关键字。它返回记号 INT 的值,同时还取得 int 类型的符号信息并保存到 tsym 里。
第 87 行是当字符串不是关键字时,就跳到 id 标号那里,作为变量 ID 处理。
第 88 行到第 108 行都是把字符串识别为变量 ID,因为以这些字母开头的字符串不可能是关键字。
在第 96 行到第 100 行是在缓冲区剩余字符不足 MAXLINE 时重新填充字符缓冲区,以便能识别出完整的变量 ID。
在第 102 行到第 105 行里,就获取 ID 的字符串,并保存到 token 里。
在第 106 行里查找这个 ID 是否已经声明,如果没有声明,返回的是空指针,并赋给 tsym。
在第 108 行里返回 ID 这个记号来标识当前已经识别出一个 ID 了。
 
下面是进行数字串的识别:
#109         case '0': case '1': case '2': case '3': case '4':
#110         case '5': case '6': case '7': case '8': case '9': {
#111               unsigned long n = 0;
#112               if (limit - rcp < MAXLINE) {
#113                    cp = rcp - 1;
#114                    fillbuf();
#115                    rcp = ++cp;
#116               }
#117               assert(cp == rcp);
#118               token = (char *)rcp - 1;
#119               if (*token == '0' && (*rcp == 'x' || *rcp == 'X')) {
#120                    int d, overflow = 0;
#121                    while (*++rcp) {
#122                          if (map[*rcp]&DIGIT)
#123                               d = *rcp - '0';
#124                          else if (*rcp >= 'a' && *rcp <= 'f')
#125                               d = *rcp - 'a' + 10;
#126                          else if (*rcp >= 'A' && *rcp <= 'F')
#127                               d = *rcp - 'A' + 10;
#128                          else
#129                               break;
#130                          if (n&~(~0UL >> 4))
#131                               overflow = 1;
#132                          else
#133                               n = (n<<4) + d;
#134                    }
#135                    if ((char *)rcp - token <= 2)
#136                          error("invalid hexadecimal constant `%S'\n", token, (char *)rcp-token);
#137                    cp = rcp;
#138                    tsym = icon(n, overflow, 16);
#139               } else if (*token == '0') {
#140                    int err = 0, overflow = 0;
#141                    for ( ; map[*rcp]&DIGIT; rcp++) {
#142                          if (*rcp == '8' || *rcp == '9')
#143                               err = 1;
#144                          if (n&~(~0UL >> 3))
#145                               overflow = 1;
#146                          else
#147                               n = (n<<3) + (*rcp - '0');
#148                    }
#149                    if (*rcp == '.' || *rcp == 'e' || *rcp == 'E') {
#150                          cp = rcp;
#151                          tsym = fcon();
#152                          return FCON;
#153                    }
#154                    cp = rcp;
#155                    tsym = icon(n, overflow, 8);
#156                    if (err)
#157                          error("invalid octal constant `%S'\n", token, (char*)cp-token);
#158               } else {
#159                    int overflow = 0;
#160                    for (n = *token - '0'; map[*rcp]&DIGIT; ) {
#161                          int d = *rcp++ - '0';
#162                          if (n > (ULONG_MAX - d)/10)
#163                               overflow = 1;
#164                          else
#165                               n = 10*n + d;
#166                    }
#167                    if (*rcp == '.' || *rcp == 'e' || *rcp == 'E') {
#168                          cp = rcp;
#169                          tsym = fcon();
#170                          return FCON;
#171                    }
#172                    cp = rcp;
#173                    tsym = icon(n, overflow, 10);
#174               }
#175               return ICON;
#176         }
第 109 行到第 110 行匹配的都是以数字开头的字符,这是 C 标准里规定的数字常量的格式。
第 112 行到第 116 行也是继续填充缓冲区。
第 119 行到第 138 行是处理 16 进制的字符串,像 0x12AB 这样的字符串。通过 n = (n<<4) + d 来计算值有多大,最后调用函数 icon 来识别这个数字串是以什么后缀结尾的,比如 0x12ABL 这样的字符串,并且把结果保存到符号 tsym 里,最后返回 ICON 常量记号。
第 139 行到第 157 行是识别 8 进制数或浮点数字符串。在第 141 行到第 148 行里计算 8 进制值的大小。在第 149 行到第 153 行是识别以 0 开头的浮点数。
第 158 行到第 175 行是处理 10 进制的字符串和不是以 0 开头的浮点数。
在数字串处理里,还需要判断值的大小,如果超出了可表示的范围(溢出),就需要给出提示。
 
 
#177         case '.':
#178               if (rcp[0] == '.' && rcp[1] == '.') {
#179                    cp += 2;
#180                    return ELLIPSIS;
#181               }
#182               if ((map[*rcp]&DIGIT) == 0)
#183                    return '.';
#184               if (limit - rcp < MAXLINE) {
#185                    cp = rcp - 1;
#186                    fillbuf();
#187                    rcp = ++cp;
#188               }
#189               assert(cp == rcp);
#190               cp = rcp - 1;
#191               token = (char *)cp;
#192               tsym = fcon();
#193               return FCON;
第 178 行到第 181 行是识别 C 里的省略符 '...'。
第 182 行到第 193 行是识别以小数点开头的浮点数的处理。
 
 
#194         case 'L':
#195               if (*rcp == '\'') {
#196                    unsigned int *s = scon(*cp, wcput, wcbuf);
#197                    if (s - wcbuf > 2)
#198                          warning("excess characters in wide-character literal ignored\n");
#199                    tval.type = widechar;
#200                    tval.u.c.v.u = wcbuf[0];
#201                    tsym = &tval;
#202                    return ICON;
#203               } else if (*rcp == '"') {
#204                    unsigned int *s = scon(*cp, wcput, wcbuf);
#205                    tval.type = array(widechar, s - wcbuf, 0);
#206                    tval.u.c.v.p = wcbuf;
#207                    tsym = &tval;
#208                    return SCON;
#209               } else
#210                    goto id;
第 195 行到第 209 行都是识别宽字符或宽字符串;第 210 行是其它情况,跳到 id 标号按变量 ID 处理。
 
#211         case '\'': {
#212               char *s = scon(*--cp, cput, cbuf);
#213               if (s - cbuf > 2)
#214                    warning("excess characters in multibyte character literal ignored\n");
#215               tval.type = inttype;
#216               if (chartype->op == INT)
#217                    tval.u.c.v.i = extend(cbuf[0], chartype);
#218               else
#219                    tval.u.c.v.i = cbuf[0]&0xFF;
#220               tsym = &tval;
#221               return ICON;
#222               }
上面是单引号的处理。
 
#223         case '"': {
#224               char *s = scon(*--cp, cput, cbuf);
#225               tval.type = array(chartype, s - cbuf, 0);
#226               tval.u.c.v.p = cbuf;
#227               tsym = &tval;
#228               return SCON;
#229               }
上面是双引号字符串的处理。
 
#230         case 'a':
#231               if (rcp[0] == 'u'
#232               && rcp[1] == 't'
#233               && rcp[2] == 'o'
#234               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#235                    cp = rcp + 3;
#236                    return AUTO;
#237               }
#238               goto id;
#239         case 'b':
#240               if (rcp[0] == 'r'
#241               && rcp[1] == 'e'
#242               && rcp[2] == 'a'
#243               && rcp[3] == 'k'
#244               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#245                    cp = rcp + 4;
#246                    return BREAK;
#247               }
#248               goto id;
#249         case 'c':
#250               if (rcp[0] == 'a'
#251               && rcp[1] == 's'
#252               && rcp[2] == 'e'
#253               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#254                    cp = rcp + 3;
#255                    return CASE;
#256               }
#257               if (rcp[0] == 'h'
#258               && rcp[1] == 'a'
#259               && rcp[2] == 'r'
#260               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#261                    cp = rcp + 3;
#262                    tsym = chartype->u.sym;
#263                    return CHAR;
#264               }
#265               if (rcp[0] == 'o'
#266               && rcp[1] == 'n'
#267               && rcp[2] == 's'
#268               && rcp[3] == 't'
#269               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#270                    cp = rcp + 4;
#271                    return CONST;
#272               }
#273               if (rcp[0] == 'o'
#274               && rcp[1] == 'n'
#275               && rcp[2] == 't'
#276               && rcp[3] == 'i'
#277               && rcp[4] == 'n'
#278               && rcp[5] == 'u'
#279               && rcp[6] == 'e'
#280               && !(map[rcp[7]]&(DIGIT|LETTER))) {
#281                    cp = rcp + 7;
#282                    return CONTINUE;
#283               }
#284               goto id;
#285         case 'd':
#286               if (rcp[0] == 'e'
#287               && rcp[1] == 'f'
#288               && rcp[2] == 'a'
#289               && rcp[3] == 'u'
#290               && rcp[4] == 'l'
#291               && rcp[5] == 't'
#292               && !(map[rcp[6]]&(DIGIT|LETTER))) {
#293                    cp = rcp + 6;
#294                    return DEFAULT;
#295               }
#296               if (rcp[0] == 'o'
#297               && rcp[1] == 'u'
#298               && rcp[2] == 'b'
#299               && rcp[3] == 'l'
#300               && rcp[4] == 'e'
#301               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#302                    cp = rcp + 5;
#303                    tsym = doubletype->u.sym;
#304                    return DOUBLE;
#305               }
#306               if (rcp[0] == 'o'
#307               && !(map[rcp[1]]&(DIGIT|LETTER))) {
#308                    cp = rcp + 1;
#309                    return DO;
#310               }
#311               goto id;
#312         case 'e':
#313               if (rcp[0] == 'l'
#314               && rcp[1] == 's'
#315               && rcp[2] == 'e'
#316               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#317                    cp = rcp + 3;
#318                    return ELSE;
#319               }
#320               if (rcp[0] == 'n'
#321               && rcp[1] == 'u'
#322               && rcp[2] == 'm'
#323               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#324                    cp = rcp + 3;
#325                    return ENUM;
#326               }
#327               if (rcp[0] == 'x'
#328               && rcp[1] == 't'
#329               && rcp[2] == 'e'
#330               && rcp[3] == 'r'
#331               && rcp[4] == 'n'
#332               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#333                    cp = rcp + 5;
#334                    return EXTERN;
#335               }
#336               goto id;
#337         case 'f':
#338               if (rcp[0] == 'l'
#339               && rcp[1] == 'o'
#340               && rcp[2] == 'a'
#341               && rcp[3] == 't'
#342               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#343                    cp = rcp + 4;
#344                    tsym = floattype->u.sym;
#345                    return FLOAT;
#346               }
#347               if (rcp[0] == 'o'
#348               && rcp[1] == 'r'
#349               && !(map[rcp[2]]&(DIGIT|LETTER))) {
#350                    cp = rcp + 2;
#351                    return FOR;
#352               }
#353             goto id;
#354         case 'g':
#355               if (rcp[0] == 'o'
#356               && rcp[1] == 't'
#357               && rcp[2] == 'o'
#358               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#359                    cp = rcp + 3;
#360                    return GOTO;
#361               }
#362               goto id;
#363         case 'l':
#364             if (rcp[0] == 'o'
#365               && rcp[1] == 'n'
#366               && rcp[2] == 'g'
#367               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#368                    cp = rcp + 3;
#369                    return LONG;
#370               }
#371               goto id;
#372         case 'r':
#373               if (rcp[0] == 'e'
#374               && rcp[1] == 'g'
#375               && rcp[2] == 'i'
#376               && rcp[3] == 's'
#377               && rcp[4] == 't'
#378               && rcp[5] == 'e'
#379               && rcp[6] == 'r'
#380               && !(map[rcp[7]]&(DIGIT|LETTER))) {
#381                    cp = rcp + 7;
#382                    return REGISTER;
#383               }
#384               if (rcp[0] == 'e'
#385               && rcp[1] == 't'
#386               && rcp[2] == 'u'
#387               && rcp[3] == 'r'
#388               && rcp[4] == 'n'
#389               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#390                    cp = rcp + 5;
#391                    return RETURN;
#392               }
#393               goto id;
#394         case 's':
#395               if (rcp[0] == 'h'
#396               && rcp[1] == 'o'
#397               && rcp[2] == 'r'
#398               && rcp[3] == 't'
#399               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#400                    cp = rcp + 4;
#401                    return SHORT;
#402               }
#403               if (rcp[0] == 'i'
#404               && rcp[1] == 'g'
#405               && rcp[2] == 'n'
#406               && rcp[3] == 'e'
#407               && rcp[4] == 'd'
#408               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#409                    cp = rcp + 5;
#410                    return SIGNED;
#411               }
#412               if (rcp[0] == 'i'
#413               && rcp[1] == 'z'
#414               && rcp[2] == 'e'
#415               && rcp[3] == 'o'
#416               && rcp[4] == 'f'
#417               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#418                    cp = rcp + 5;
#419                    return SIZEOF;
#420               }
#421               if (rcp[0] == 't'
#422               && rcp[1] == 'a'
#423                && rcp[2] == 't'
#424               && rcp[3] == 'i'
#425               && rcp[4] == 'c'
#426               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#427                    cp = rcp + 5;
#428                    return STATIC;
#429               }
#430               if (rcp[0] == 't'
#431               && rcp[1] == 'r'
#432               && rcp[2] == 'u'
#433               && rcp[3] == 'c'
#434               && rcp[4] == 't'
#435               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#436                    cp = rcp + 5;
#437                    return STRUCT;
#438               }
#439               if (rcp[0] == 'w'
#440               && rcp[1] == 'i'
#441               && rcp[2] == 't'
#442               && rcp[3] == 'c'
#443               && rcp[4] == 'h'
#444               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#445                    cp = rcp + 5;
#446                    return SWITCH;
#447               }
#448               goto id;
#449         case 't':
#450               if (rcp[0] == 'y'
#451               && rcp[1] == 'p'
#452               && rcp[2] == 'e'
#453               && rcp[3] == 'd'
#454               && rcp[4] == 'e'
#455               && rcp[5] == 'f'
#456               && !(map[rcp[6]]&(DIGIT|LETTER))) {
#457                    cp = rcp + 6;
#458                    return TYPEDEF;
#459               }
#460               goto id;
#461         case 'u':
#462               if (rcp[0] == 'n'
#463               && rcp[1] == 'i'
#464               && rcp[2] == 'o'
#465               && rcp[3] == 'n'
#466               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#467                    cp = rcp + 4;
#468                    return UNION;
#469               }
#470               if (rcp[0] == 'n'
#471               && rcp[1] == 's'
#472               && rcp[2] == 'i'
#473               && rcp[3] == 'g'
#474               && rcp[4] == 'n'
#475               && rcp[5] == 'e'
#476               && rcp[6] == 'd'
#477               && !(map[rcp[7]]&(DIGIT|LETTER))) {
#478                    cp = rcp + 7;
#479                    return UNSIGNED;
#480               }
#481               goto id;
#482         case 'v':
#483               if (rcp[0] == 'o'
#484               && rcp[1] == 'i'
#485               && rcp[2] == 'd'
#486               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#487                    cp = rcp + 3;
#488                    tsym = voidtype->u.sym;
#489                    return VOID;
#490               }
#491               if (rcp[0] == 'o'
#492               && rcp[1] == 'l'
#493               && rcp[2] == 'a'
#494               && rcp[3] == 't'
#495               && rcp[4] == 'i'
#496               && rcp[5] == 'l'
#497               && rcp[6] == 'e'
#498               && !(map[rcp[7]]&(DIGIT|LETTER))) {
#499                    cp = rcp + 7;
#500                    return VOLATILE;
#501               }
#502               goto id;
#503         case 'w':
#504               if (rcp[0] == 'h'
#505               && rcp[1] == 'i'
#506               && rcp[2] == 'l'
#507               && rcp[3] == 'e'
#508               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#509                    cp = rcp + 4;
#510                    return WHILE;
#511               }
#512               goto id;
#513         case '_':
#514               if (rcp[0] == '_'
#515               && rcp[1] == 't'
#516               && rcp[2] == 'y'
#517               && rcp[3] == 'p'
#518               && rcp[4] == 'e'
#519               && rcp[5] == 'c'
#520               && rcp[6] == 'o'
#521               && rcp[7] == 'd'
#522               && rcp[8] == 'e'
#523               && !(map[rcp[9]]&(DIGIT|LETTER))) {
#524                    cp = rcp + 9;
#525                    return TYPECODE;
#526               }
#527               if (rcp[0] == '_'
#528               && rcp[1] == 'f'
#529               && rcp[2] == 'i'
#530               && rcp[3] == 'r'
#531               && rcp[4] == 's'
#532               && rcp[5] == 't'
#533               && rcp[6] == 'a'
#534               && rcp[7] == 'r'
#535               && rcp[8] == 'g'
#536               && !(map[rcp[9]]&(DIGIT|LETTER))) {
#537                    cp = rcp + 9;
#538                    return FIRSTARG;
#539               }
#540               goto id;
上面都是关键字的识别处理,如果不是关键字就是 ID 了。
 
#541         default:
#542               if ((map[cp[-1]]&BLANK) == 0)
#543                    if (cp[-1] < ' ' || cp[-1] >= 0177)
#544                          error("illegal character `\\0%o'\n", cp[-1]);
#545                    else
#546                          error("illegal character `%c'\n", cp[-1]);
#547         }
#548  }
#549 }
#550 
缺省分支处理的是非法字符:如果该字符不是空白字符,就报告非法字符的错误(不可打印的字符以八进制形式显示)。
到这里已经把词法分析全部分析完成,其实写一个词法分析不是艰难的事情。在 LCC 的词法分析器里只用了短短的 600 行左右的代码就完成了这个功能。这里没有使用状态机的分析方法,而是使用超前搜索缓冲区来识别所有记号。这种词法分析是非常高效的,速度奇快,比状态机的算法要快。不过,它的代码修改起来,就比较麻烦一点。对于一个不经常变关键字的编译器来说,是没有什么问题的。
词法分析完成了,下一次就要去分析语法的问题了。
 

你可能感兴趣的:(编译器)