词法分析是编译的基础,需要对程序中的单词进行划分,并生成token文件(主要存符号表的入口地址,以便获取进一步需要的信息),供语法分析阶段使用。同时要生成符号表,包括变量的和常量的,在之后的分析中会不断的查填符号表,将单词的类型,值等各项信息填完整,才能进行运算等操作。采用了较为底层的C语言。结果由于好久没用过了,犯了许多低级的错误,编程过程比较艰辛,下面总结遇到的问题。
1、C语言严格区分字符和字符串,%c和%s不要混用,否则造成异常。Java中经常使用String或StringBuilder,淡化了字符的概念,在C中需要注意。
2、自定义的头文件要用双引号而不是尖括号。
3、'\n'光标移到下一行,在打印源程序时,头文件打印空行,看上去像多打印了一个'\n',其实没错。
4、字符串不能直接用“=”赋值,需要搞清楚char型数组和char*的不同。strcpy是将内容进行拷贝,char型数组要用strcpy来赋值;而char*可以直接指向字符串常量——字符串常量表示的是一个地址,让char型的指针直接指向该地址,可以避免空间浪费。
5、在不同的处理函数中,要生成token,并赋值,然后让token的指针的数组指向该位置。但由于函数内的token变量是局部变量,跳出函数则失效,故指针指向的位置的内容将有可能改变,导致在该函数之外往文件中写token数组时乱码。
6、fgets读到行末尾时,会保留换行符'\n'并在其后添加'\0';判断字符串是否结束时,应判断'\0'而不是'\n'。
7、开辟的数组空间太小导致越界,也会导致乱码。
8、最费脑子的错误就是自己设计的headCh,作为每次识别的第一个字符。因为读文件的时候,文件指针会一直的向后移动,当时为了避免文件指针往回移动的时间开销,就用一个char类型的headCh来保存第一个字符。但是预处理的函数中,需要根据下一个字符判断是否结束预处理,故由headCh保存下一个字符,再读到buffer中时,buffer的第一个字符为下一行的第二个字符,即headCh+buffer的内容才是完整的一行的内容。所以为了不改变程序的处理过程,还要保持headCh始终在buffer[start]之前的一位,而处理程序是根据headCh的值来判断调用哪个处理函数的,故每次处理结束后需要给headCh和start重新赋值。
程序流程图:
Token:
错误处理:
源码:
main.c
#include "global.h"
#include
int main()
{
char fname[FSIZE];
error = fopen("error.txt", "w+");
if(error==NULL)
{
printf("cannot create error.txt!\n");
}
out = fopen("out.txt", "w+");
if(out==NULL)
{
printf("cannot create out.txt!\n");
}
printf("please input filename: \n");
scanf("%s", fname);
in = fopen(fname, "r");
if(in==NULL)
{
printf("error: cannot open file %s\n", fname);
return -1;
}
headCh = predeal(in);
while(fgets(buf, BSIZE, in)!=NULL)
{
head = headCh;
int len = strlen(buf);
buf[len-1] = '\0';
// printf("buf:$%s$\tline:%d\n", buf, line);
start = 0;
while(headCh != '\0' )
{
while(buf[start]==' ' && headCh == ' ')
{
start++;
}
if(headCh == ' ')
{
if(buf[start] == '\0')break;
headCh = buf[start++];
}
// printf("start = %d\n", start);
// printf("headCH=%c\n",headCh);
if(isalpha(headCh))
{
// printf("ooooooo: DEAL ALPHA\n");
dealAlpha();
}
else if(isdigit(headCh))
{
// printf("ooooooo: DEAL DIGIT\n");
dealDigit();
}
else if(headCh=='/')
{
// printf("ooooooo: DEAL NOTATION\n");
if(dealNotation()==-1)
{
printf("notation too long to analyze, skip this line...\n");
fputc('\n', out);
break;
}
}
else if(isBorder(headCh))
{
// printf("ooooooo: DEAL BORDER\n");
dealBorder();
}
else if(headCh=='\'' || headCh =='"')
{
// printf("ooooooo: DEAL CHAR\n");
dealChar(headCh);
}
else // not available start
{
fprintf(error, "L%d\tcannot analyse %c\n",line,headCh);
headCh = buf[start];
}
start++;
// flag = 1;
}
line++;
flag = 0;
headCh = goBlank(in);
if(isNotation==0)
{
fprintf(out, "%c%s\n", head, buf);
}
isNotation = 0;
}
puts("Everything has done...");
writeToken();
writeVarTable();
writeConTable();
fclose(in);
fclose(out);
fclose(error);
printf("错误日志\t\t\terror.txt\n");
printf("Token文件\t\t\ttoken.txt\n");
printf("无注释头文件的源文件日志\tout.txt\n");
printf("常量符号表\t\t\tconTable.txt\n");
printf("变量符号表\t\t\tvarTable.txt\n");
return 0;
}
/**预处理**/
char predeal(FILE *in)
{
char ch;
ch = goBlank(in);
while(ch == '#')
{
fgets(buf, BSIZE, in);
dealInclude(buf);
line++;
fputc('\n', out);
ch = goBlank(in);
}
printf("headers done...\n");
return ch;
}
/** deal headers, like #include <...>**/
void dealInclude(char *buf)
{
char include[15];
char ch;
int i=9;
strncpy(include, buf, 9);
include[9] = '\0';
//printf("%s%d",include,strlen(include));
if(strcmp(include, "include <")==0)
{
while((ch=buf[i])!='>')
{
i++;
if(ch=='\n')
{
fprintf(error, "L%d\theaders end without '>'\n",line);
break;
}
}
}
else
{
fprintf(error, "L%d\theaders format error\n",line);
}
}
/**
 * Skip whitespace on the input stream.
 *
 * Consumes spaces, tabs and newlines. Every newline consumed bumps the
 * global line counter and is echoed to out as an empty line.
 *
 * Returns the first character that is not a space, tab or newline
 * (fgetc's result converted to char, so end of input comes back as the
 * char-converted EOF value).
 */
char goBlank(FILE* in)
{
    int c;

    for (;;)
    {
        c = fgetc(in);
        if (c == '\n')
        {
            line++;
            fputc('\n', out);
            continue;
        }
        if (c != ' ' && c != '\t')
        {
            break;
        }
    }
    return (char)c;
}
/**deal begin with alpha**/
void dealAlpha()
{
int symbol;
int id;
char word[100];
Token token;
VarTable varTable;
int i;
word[0] = headCh;
for(i=start; isdigit(buf[i])||isalpha(buf[i]); i++)
{
word[i-start+1] = buf[i];
}
word[i-start+1] = '\0';
// forward = i;
start = i;
headCh = buf[start];
symbol = isKeyword(word);
/**not keyword**/
if(symbol == -1)
{
/* id = isInVarTable(word);
if(id ==-1) //not in the varTable
{
varTable.id = varTableNum;
strcpy(varTable.name, word);
varTableArray[varTableNum] = varTable;
varTableNum++;
id = varTable.id;
}*/
varTable.id = varTableNum;
strcpy(varTable.name, word);
varTableArray[varTableNum] = varTable;
varTableNum++;
token.symbol = IDN;
sprintf(token.attr, "%d", varTable.id);//change int to string
strcpy(token.name,word);
tokenArray[tokenNum] = token;
tokenNum++;
}
/** is keyword**/
else
{
token.symbol = symbol;
strcpy(token.name,word);
strcpy(token.attr,"--");
tokenArray[tokenNum] = token;
tokenNum++;
}
}
/**
 * Look a word up in keywordList.
 *
 * The list is terminated by an empty string. Returns the keyword's
 * position in the list plus 256 when the word matches an entry, or -1
 * when it is not a keyword.
 */
int isKeyword(char * word)
{
    char **entry = keywordList;

    while ((*entry)[0] != '\0')
    {
        if (strcmp(word, *entry) == 0)
        {
            return (int)(entry - keywordList) + 256;
        }
        entry++;
    }
    return -1;
}
/**将token数组写入文件**/
void writeToken()
{
FILE* ftoken;
int i=0;
ftoken = fopen("token.txt", "w+");
if(ftoken==NULL)
{
printf("cannot create file token.txt!\n");
}
for(i=0; i
#ifndef GLOBAL_H_INCLUDED
#define GLOBAL_H_INCLUDED
#include
#include
/* Capacity limits used by the lexer. */
/* Size of the shared line buffer `buf` (also the limit passed to fgets). */
#define BSIZE 1024
/* Size of the buffer that receives the input file name in main. */
#define FSIZE 50
/* Number of slots in tokenArray. */
#define TSIZE 1024
/* Number of slots in varTableArray. */
#define VTSIZE 1024
/* Number of slots in conTableArray. */
#define CTSIZE 1024
/* NOTE(review): not used in the code visible here; the accompanying notes
   say a block comment still unclosed after this many characters is
   reported as unterminated - confirm against dealNotation. */
#define LIMIT_NOTATION 10
/** Keyword codes.
 *  The values follow the order of keywordList: isKeyword returns
 *  (index in keywordList) + 256, so INCLUDE..WHILE line up with
 *  "include".."while". Keep both in sync when editing either one. **/
#define INCLUDE 256
#define AUTO 257
#define BREAK 258
#define CASE 259
#define CHAR 260
#define CONST 261
#define CONTINUE 262
#define DEFAULT 263
#define DO 264
#define DOUBLE 265
#define ELSE 266
#define ENUM 267
#define EXTERN 268
#define FLOAT 269
#define FOR 270
#define GOTO 271
#define IF 272
#define INT 273
#define LONG 274
#define REGISTER 275
#define RETURN 276
#define SHORT 277
#define SIGNED 278
#define SIZEOF 279
#define STATIC 280
#define STRUCT 281
#define SWITCH 282
#define TYPEDEF 283
#define UNION 284
#define UNSIGNED 285
#define VOLATILE 286
#define WHILE 287
/** Codes for identifiers and constants.
 *  IDN is the symbol dealAlpha assigns to non-keyword words. The other
 *  four are not used in the code visible here; presumably integer,
 *  floating, character and string constants - confirm in dealDigit and
 *  dealChar. **/
#define IDN 300
#define INUM 301
#define FNUM 302
#define CCHAR 303
#define CSTR 304
/** Codes for operators and delimiters ("borders").
 *  NOTE(review): codes 400..440 name 41 symbols while borderList holds 40
 *  strings. The names agree with borderList up to QUE ("?") and again
 *  from PP ("++") onward if the list is offset by one; SIG has no obvious
 *  counterpart in the list. Confirm how dealBorder maps list positions
 *  to codes. **/
#define PLUS 400
#define MINUS 401
#define MUL 402
#define DIV 403
#define REM 404
#define SEMI 405
#define COM 406
#define BLP 407
#define BRP 408
#define SRP 409
#define SLP 410
#define BIG 411
#define SML 412
#define EQU 413
#define MLP 414
#define MRP 415
#define COL 416
#define QUE 417
#define SIG 418
#define NOT 419
#define AND 420
#define OR 421
#define PP 422
#define MM 423
#define EQEQ 424
#define NOTL 425
#define NOTR 426
#define DECL 427
#define BIGE 428
#define SMLE 429
#define NOTE 430
#define AA 431
#define OO 432
#define ANDE 433
#define MINUE 434
#define MULE 435
#define DIVE 436
#define XOR 437
#define RIGHT 438
#define LEFT 439
#define TURN 440
/** Codes for escape characters.
 *  There are 12 codes (500..511) and changeList holds 12 characters in a
 *  matching order (a b f n r t v \ ? " ' 0); presumably code = 500 +
 *  index in changeList - confirm in dealChar. **/
#define CA 500
#define CB 501
#define CF 502
#define CN 503
#define CR 504
#define CT 505
#define CV 506
#define CBSL 507
#define CQUE 508
#define CDQM 509
#define CQM 510
#define ZERO 511
/** Record types shared by the lexer. **/

/* One entry of the variable (identifier) table. dealAlpha stores the
   entry's index in varTableArray as id and the identifier text as name. */
struct varTable
{
    int  id;
    char name[100];
};
typedef struct varTable VarTable;

/* One entry of the constant table; same layout as VarTable. */
struct conTable
{
    int  id;
    char name[100];
};
typedef struct conTable ConTable;

/* One lexical token: the word as written (name), its numeric class code
   (symbol) and an attribute string (attr). dealAlpha sets attr to the
   variable-table id in decimal for identifiers and to "--" for keywords. */
struct token
{
    char name[100];
    int  symbol;
    char attr[100];
};
typedef struct token Token;
/** Global state shared by all lexer routines.
 *  NOTE(review): these are definitions, not extern declarations, so this
 *  header can only be included from a single translation unit without
 *  causing duplicate-definition errors at link time. **/
// Error log; main opens it as "error.txt".
FILE *error;
// Echo of the source without headers/comments; main opens it as "out.txt".
FILE *out;
// Source file being analysed; main opens the name typed by the user.
FILE *in;
// Current source line number, used as the "L<n>" prefix in error messages.
int line=1;
// Number of entries used in tokenArray.
int tokenNum=0;
// Number of entries used in varTableArray.
int varTableNum=0;
// Presumably the number of entries used in conTableArray (not touched in
// the code visible here).
int conTableNum=0;
// Index into buf of the character that follows headCh.
int start = 0;
// Not used in the code visible here.
int forward = 0;
// Reset to 0 by main after every line; its other uses are not visible here.
int flag =0;
// Checked by main after each line: the line is echoed to out only while
// this is 0, then it is reset. Presumably set by dealNotation - confirm.
int isNotation=0;
// Line buffer filled by fgets; holds the current line minus its first
// character, which lives in headCh/head.
char buf[BSIZE];
Token tokenArray[TSIZE];
VarTable varTableArray[VTSIZE];
ConTable conTableArray[CTSIZE];
// Character currently being classified; logically sits in front of
// buf[start].
char headCh;
// Copy of the line's first character, kept so main can echo the full line.
char head;
// Operator and delimiter spellings: 21 single-character entries followed
// by 19 longer or additional operators (including the block-comment
// opener and closer).
char *borderList[] = { "+","-","*","/","%",";",",","{","}",")","(",">","<","=","[","]",":","?","!","&","|",
"++","--","==","/*","*/",":=",">=","<=","!=","&&","||","+=","-=","*=","/=","^",">>","<<","~"
};// order is significant; NOTE(review): 40 entries versus 41 border codes (400..440) - confirm the mapping
// Keyword spellings, terminated by an empty string that isKeyword uses as
// the end marker.
char *keywordList[] = {"include","auto","break","case","char","const","continue","default","do","double",
"else","enum","extern","float","for","goto","if","int","long","register",
"return","short","signed","sizeof","static","struct","switch","typedef",
"union","unsigned","volatile","while",""
};// order is significant: isKeyword returns index + 256, matching INCLUDE..WHILE
// Characters that may follow a backslash; same count and order as the
// escape codes CA..ZERO.
char changeList[12] = {'a', 'b', 'f','n','r','t','v','\\','?','"','\'','0'};
/** Function prototypes.
 *  Only some of the definitions are visible alongside this header; notes
 *  on the others describe how main uses them and should be confirmed
 *  against their definitions. **/
// Skips spaces, tabs and newlines on `in` (counting and echoing newlines)
// and returns the first other character.
char goBlank(FILE* in);
// Consumes the leading '#' lines and returns the first character of the
// first ordinary line.
char predeal(FILE* in);
// Validates the text after '#' of a header line. NOTE: the definition
// names this parameter `buf`; it is a text buffer, not a stream.
void dealInclude(char* in);
// Scans a word starting with a letter and emits a keyword or identifier
// token.
void dealAlpha();
// Called by main when headCh is a digit.
void dealDigit();
// Called by main when isBorder(headCh) is non-zero.
void dealBorder();
// Called by main when headCh is '/'; main treats a return value of -1 as
// "notation too long" and abandons the rest of the line.
int dealNotation();
// Called by main with the opening quote when headCh is ' or ".
void dealChar(char ch);
// Returns the keyword code (index in keywordList + 256) or -1.
int isKeyword(char *word);
// The three writers are called by main once scanning has finished.
void writeToken();
void writeVarTable();
void writeConTable();
// Used by main as a truth value to decide whether to call dealBorder.
int isBorder(char ch);
// Not called in the code visible here (the only call to isInVarTable is
// commented out in dealAlpha).
int isInVarTable(char *name);
int isInConTable(char *name);
#endif // GLOBAL_H_INCLUDED
1、不考虑自定义头文件,#include < 规定这样开头,尖括号前面只能空一格。
2、标识符长度< 100,否则越界。
3、字符常量的长度不可超过1;字符(串)常量必须有结束的引号。
4、注释会整行打印空行。
5、错误文件输出到error.txt;常量符号表:conTable.txt;变量符号表:varTable.txt;无头文件和注释的程序out.txt;token输出到token.txt
6、错误处理包括:不合法的浮点数,注释不封闭,转义字符不存在,头文件包含错误,程序中有非法字符。
7、为了显示注释不封闭的错误处理,定义超过10个字符的“/*”没有结尾就算不封闭。可以在global.h中进行更改,缓冲区大小,文件名长度,token, conTable, varTable的数组长度都可定义。由于struct中不是指针,比较占用空间,故不可分析太大的程序,容易数组越界。