C语言实现简易词法分析器

词法分析是编译的基础,需要对程序中的单词进行划分,并生成token文件(主要存符号表的入口地址,以便获取进一步需要的信息),供语法分析阶段使用。同时要生成符号表,包括变量的和常量的,在之后的分析中会不断的查填符号表,将单词的类型,值等各项信息填完整,才能进行运算等操作。采用了较为底层的C语言。结果由于好久没用过了,犯了许多低级的错误,编程过程比较艰辛,下面总结遇到的问题。

1、C语言严格区分字符和字符串,%c和%s不要混用,否则造成异常。Java中经常使用String或StringBuilder,淡化了字符的概念,在C中需要注意。

2、自定义的头文件要用双引号而不是尖括号。

3、'\n'光标移到下一行,在打印源程序时,头文件打印空行,看上去像多打印了一个'\n',其实没错。

4、字符串不能直接赋值,需要搞清楚char型数组和char*的不同。strcpy是将内容进行拷贝,char型数组要用strcpy来赋值;而char*可以直接指向字符串常量:字符串常量表示的是一个地址,可以让char型的指针直接指向该地址,避免空间浪费。

5、在不同的处理函数中,要生成token,并赋值,然后让token的指针的数组指向该位置。但由于函数内的token变量是局部变量,跳出函数则失效,故指针指向的位置的内容将有可能改变,导致在该函数之外往文件中写token数组时乱码。

6、fgets读到行末尾时会保留'\n'并在其后添加'\0',因此判断字符串结束时应判断'\0',而不是'\n'。

7、开辟的数组空间太小导致越界,也会导致乱码。

8、最费脑子的错误就是自己设计的headCh,作为每次识别的第一个字符。因为读文件的时候,文件指针会一直的向后移动,当时为了避免文件指针往回移动的时间开销,就用一个char类型的headCh来保存第一个字符。但是预处理的函数中,需要根据下一个字符判断是否结束预处理,故由headCh保存下一个字符,再读到buffer中时,buffer的第一个字符为下一行的第二个字符,即headCh+buffer的内容才是完整的一行的内容。所以为了不改变程序的处理过程,还要保持headCh始终在buffer[start]之前的一位,而处理程序是根据headCh的值来判断调用哪个处理函数的,故每次处理结束后需要给headCh和start重新赋值。

程序流程图:


Token:

错误处理:


源码:

main.c

#include "global.h"
#include 
/**
 * Entry point of the lexer.
 *
 * Opens error.txt and out.txt, asks for the source file name on stdin,
 * lets predeal() consume the leading '#' lines, then tokenizes the rest of
 * the file line by line and finally dumps the token / symbol tables.
 *
 * Line protocol: goBlank()/predeal() have already consumed the first
 * non-blank character of the upcoming line into headCh, so fgets() only
 * delivers the remainder of that line into buf.  While a line is being
 * scanned headCh always holds the character just before buf[start]; the
 * deal*() handlers leave headCh/start on the last character they looked at
 * and the loop then steps start forward by one.
 *
 * Returns 0 on success and -1 when a file cannot be opened or no file name
 * was entered.
 */
int main()
{
    char fname[FSIZE];
    char fmt[16];

    error = fopen("error.txt", "w+");
    if(error==NULL)
    {
        printf("cannot create error.txt!\n");
        return -1; /* every handler writes to this stream, so we cannot go on */
    }
    out = fopen("out.txt", "w+");
    if(out==NULL)
    {
        printf("cannot create out.txt!\n");
        fclose(error);
        return -1;
    }
    printf("please input filename: \n");
    /* Build "%<N>s" from the real buffer size so a long name cannot overflow fname. */
    snprintf(fmt, sizeof fmt, "%%%ds", (int)sizeof fname - 1);
    if(scanf(fmt, fname)!=1)
    {
        printf("error: no filename given\n");
        fclose(out);
        fclose(error);
        return -1;
    }
    in = fopen(fname, "r");
    if(in==NULL)
    {
        printf("error: cannot open file %s\n", fname);
        fclose(out);
        fclose(error);
        return -1;
    }
    headCh = predeal(in);

    /*
     * The end-of-file flag is set only when goBlank()/predeal() ran into the
     * end of the input, i.e. when headCh does not hold a real character.
     * Testing it here (instead of testing fgets) keeps a final one-character
     * line without trailing newline, e.g. a closing '}', from being dropped.
     */
    while(!feof(in) && !ferror(in))
    {
        if(fgets(buf, BSIZE, in)==NULL)
        {
            buf[0] = '\0'; /* headCh was the last byte of the file */
        }
        head = headCh;
        /* Strip the newline only if there is one; the last line may lack it. */
        buf[strcspn(buf, "\n")] = '\0';
        start = 0;
        while(headCh != '\0' )
        {
            /* skip a run of blanks and reload headCh from the buffer */
            while(buf[start]==' ' && headCh == ' ')
            {
                start++;
            }
            if(headCh == ' ')
            {
                if(buf[start] == '\0')break;
                headCh = buf[start++];
            }
            /* <ctype.h> functions require a value representable as unsigned char */
            if(isalpha((unsigned char)headCh))
            {
                dealAlpha();
            }
            else if(isdigit((unsigned char)headCh))
            {
                dealDigit();
            }
            else if(headCh=='/')
            {
                if(dealNotation()==-1)
                {
                    printf("notation too long to analyze, skip this line...\n");
                    fputc('\n', out);
                    break;
                }
            }
            else if(isBorder(headCh))
            {
                dealBorder();
            }
            else if(headCh=='\'' || headCh =='"')
            {
                dealChar(headCh);
            }
            else // not a character any token can start with
            {
                fprintf(error, "L%d\tcannot analyse %c\n",line,headCh);
                headCh = buf[start];

            }
            start++;
        }
        line++;
        flag = 0;
        /* fetch the first non-blank character of the next line */
        headCh = goBlank(in);
        /* echo the line to out.txt unless a handler marked it as a comment */
        if(isNotation==0)
        {
            fprintf(out, "%c%s\n", head, buf);
        }
        isNotation = 0;
    }


    puts("Everything has done...");
    writeToken();
    writeVarTable();
    writeConTable();
    fclose(in);
    fclose(out);
    fclose(error);
    printf("错误日志\t\t\terror.txt\n");
    printf("Token文件\t\t\ttoken.txt\n");
    printf("无注释头文件的源文件日志\tout.txt\n");
    printf("常量符号表\t\t\tconTable.txt\n");
    printf("变量符号表\t\t\tvarTable.txt\n");
    return 0;
}
/**
 * Preprocessing: consume the leading run of '#' lines.
 *
 * For every line whose first non-blank character is '#', the rest of the
 * line is read into buf, checked by dealInclude(), counted in `line`, and
 * replaced by an empty line in `out`.
 *
 * @param in  source stream, positioned at the start of the file
 * @return    the first non-blank character that does not start a '#' line
 *            (whatever goBlank() returns, including its end-of-file value)
 */
char predeal(FILE *in)
{
    char ch;
    ch = goBlank(in);
    while(ch  == '#')
    {
        if(fgets(buf, BSIZE, in)==NULL)
        {
            /* '#' was the last byte of the file: validate an empty
               directive instead of whatever an earlier line left in buf */
            buf[0] = '\0';
        }
        dealInclude(buf);
        line++;
        fputc('\n', out);
        ch = goBlank(in);
    }
    printf("headers done...\n");
    return ch;
}

/** deal headers, like #include <...>**/
void dealInclude(char *buf)
{
    char include[15];
    char ch;
    int i=9;
    strncpy(include, buf, 9);
    include[9] = '\0';
    //printf("%s%d",include,strlen(include));
    if(strcmp(include, "include <")==0)
    {
        while((ch=buf[i])!='>')
        {
            i++;
            if(ch=='\n')
            {
                fprintf(error, "L%d\theaders end without '>'\n",line);
                break;
            }
        }
    }
    else
    {
        fprintf(error, "L%d\theaders format error\n",line);
    }

}

/**
 * Skip spaces, tabs and newlines on the input stream.
 *
 * Every newline that is skipped increments `line` and is echoed to `out`
 * as an empty line.
 *
 * @param in  stream to read from
 * @return    the first character read that is not a space, tab or newline
 *            (the result of fgetc() stored in a char, so end of file is
 *            returned in that converted form)
 */
char goBlank(FILE* in)
{
    for(;;)
    {
        char c = fgetc(in);

        if(c=='\n')
        {
            line++;
            fputc('\n', out);
            continue;
        }
        if(c==' ' || c=='\t')
        {
            continue;
        }
        return c;
    }
}

/**deal begin with alpha**/
void dealAlpha()
{
    int symbol;
    int id;
    char word[100];
    Token token;
    VarTable varTable;
    int i;
    word[0] = headCh;
    for(i=start; isdigit(buf[i])||isalpha(buf[i]); i++)
    {
        word[i-start+1] = buf[i];
    }
    word[i-start+1] = '\0';
    // forward = i;
    start = i;
    headCh = buf[start];


    symbol = isKeyword(word);
    /**not keyword**/
    if(symbol == -1)
    {
        /*  id = isInVarTable(word);
          if(id ==-1) //not in the varTable
          {
              varTable.id = varTableNum;
              strcpy(varTable.name, word);
              varTableArray[varTableNum] = varTable;
              varTableNum++;
              id = varTable.id;
          }*/

        varTable.id = varTableNum;
        strcpy(varTable.name, word);
        varTableArray[varTableNum] = varTable;
        varTableNum++;

        token.symbol = IDN;
        sprintf(token.attr, "%d", varTable.id);//change int to string
        strcpy(token.name,word);
        tokenArray[tokenNum] = token;
        tokenNum++;
    }
    /** is keyword**/
    else
    {
        token.symbol = symbol;
        strcpy(token.name,word);
        strcpy(token.attr,"--");
        tokenArray[tokenNum] = token;
        tokenNum++;
    }
}

/**
 * Look a word up in keywordList.
 *
 * keywordList is walked until its terminating empty string.
 *
 * @param word  NUL-terminated word to test
 * @return      256 + the index of the matching entry, or -1 when the word
 *              is not a keyword
 */
int isKeyword(char * word)
{
    int idx = 0;

    while(keywordList[idx][0] != '\0')
    {
        if(!strcmp(word, keywordList[idx]))
        {
            return 256 + idx;
        }
        ++idx;
    }
    return -1;
}

/**将token数组写入文件**/
void writeToken()
{
    FILE* ftoken;
    int i=0;
    ftoken = fopen("token.txt", "w+");
    if(ftoken==NULL)
    {
        printf("cannot create file token.txt!\n");
    }

    for(i=0; i

global.h

#ifndef GLOBAL_H_INCLUDED
#define GLOBAL_H_INCLUDED

#include 
#include 

/* Size in bytes of the global line buffer `buf`; also the limit passed to fgets(). */
#define BSIZE   1024
/* Size in bytes of the file-name buffer read in main(). */
#define FSIZE   50
/* Number of entries in tokenArray. */
#define TSIZE   1024
/* Number of entries in varTableArray. */
#define VTSIZE 1024
/* Number of entries in conTableArray. */
#define CTSIZE  1024
/* Per the accompanying notes: a comment opener that is not closed within this
   many characters is treated as unterminated.
   NOTE(review): the code using it is not visible here - confirm. */
#define LIMIT_NOTATION 10

/**
 * Keyword token codes.
 * isKeyword() returns 256 + the index of the word in keywordList, so these
 * values must stay in the same order as keywordList.
 **/
#define INCLUDE         256
#define AUTO                 257
#define BREAK                258
#define CASE                259
#define CHAR                260
#define CONST               261
#define CONTINUE        262
#define DEFAULT             263
#define DO                      264
#define DOUBLE                  265
#define ELSE                    266
#define ENUM                267
#define EXTERN                      268
#define FLOAT               269
#define FOR                     270
#define GOTO                271
#define IF                      272
#define  INT                 273
#define  LONG                 274
#define REGISTER            275
#define RETURN             276
#define SHORT              277
#define SIGNED             278
#define SIZEOF           279
#define STATIC         280
#define STRUCT          281
#define SWITCH       282
#define TYPEDEF     283
#define UNION           284
#define UNSIGNED    285
#define VOLATILE     286
#define WHILE          287

/**
 * Token codes for identifiers and constants.
 * IDN is the code dealAlpha() assigns to identifiers.  The others are
 * presumably integer / floating / character / string constants - their
 * users are not visible here, confirm.
 **/
#define IDN                 300
#define INUM               301
#define FNUM                302
#define CCHAR               303
#define CSTR                 304

/**
 * Token codes for operators and delimiters ("border" symbols).
 * NOTE(review): 41 codes (400..440) are defined here while borderList holds
 * 40 strings, so the two are not a one-to-one index mapping as written;
 * verify against the code that maps symbols to codes.
 **/
#define PLUS                  400
#define MINUS                   401
#define MUL                     402
#define DIV                   403
#define REM                     404
#define SEMI                   405
#define  COM                    406
#define  BLP                    407
#define  BRP                        408
#define  SRP 409
#define  SLP 410
#define  BIG 411
#define  SML 412
#define  EQU 413
#define  MLP 414
#define  MRP 415
#define  COL 416
#define  QUE 417
#define  SIG 418
#define  NOT 419
#define  AND 420
#define  OR  421
#define  PP 422
#define  MM 423
#define  EQEQ 424
#define  NOTL 425
#define  NOTR 426
#define  DECL 427
#define  BIGE 428
#define  SMLE 429
#define  NOTE 430
#define  AA 431
#define  OO 432
#define  ANDE 433
#define  MINUE 434
#define  MULE 435
#define  DIVE 436
#define  XOR 437
#define  RIGHT 438
#define  LEFT 439
#define  TURN 440

/**
 * Token codes for escape ("change") characters.
 * The 12 codes are listed in the same order as the 12 entries of changeList;
 * presumably code = 500 + index in changeList - confirm against the user.
 **/
#define CA 500
#define CB 501
#define CF 502
#define CN 503
#define CR 504
#define CT 505
#define CV 506
#define CBSL 507
#define CQUE 508
#define CDQM 509
#define  CQM 510
#define ZERO 511

/** Record types stored in the global tables. **/

/* One entry of the variable (identifier) symbol table. */
typedef struct varTable
{
    int id;         /* set by dealAlpha() to the entry's index in varTableArray */
    char name[100]; /* identifier text, NUL-terminated */
} VarTable;

/* One entry of the constant symbol table; same layout as VarTable. */
typedef struct conTable
{
    int id;
    char name[100];
} ConTable;

/* One recognized token. */
typedef struct token
{
    char name[100]; /* the lexeme as it appears in the source */
    int symbol;     /* token code (one of the #define values) */
    char attr[100]; /* for identifiers the decimal id of the varTable entry,
                       for keywords the string "--" (see dealAlpha()) */
} Token;

/**
 * Global state shared by all routines.
 * NOTE(review): these are definitions, not declarations, so this header can
 * only be included from a single translation unit.
 **/
FILE *error; /* error log, opened as error.txt in main() */
FILE *out;   /* echo of the source, opened as out.txt in main() */
FILE *in;    /* source file being analysed */

int line=1;        /* current source line number, used in error messages */
int tokenNum=0;    /* number of used entries in tokenArray */
int varTableNum=0; /* number of used entries in varTableArray */
int conTableNum=0; /* number of used entries in conTableArray */
int start = 0;     /* index into buf of the character following headCh */
int forward = 0;   /* not used by the code visible here */
int flag =0;       /* reset to 0 after every line in main(); purpose not visible here */
int isNotation=0;  /* when non-zero main() does not echo the current line to out;
                      reset after every line */
char buf[BSIZE];   /* current line, without the first character (that one is in headCh) */
Token tokenArray[TSIZE];
VarTable varTableArray[VTSIZE];
ConTable conTableArray[CTSIZE];
char headCh;       /* character currently being classified; sits just before buf[start] */
char head;         /* copy of headCh taken at the start of a line, used to echo the line */
/* Operator / delimiter spellings. */
char *borderList[] = { "+","-","*","/","%",";",",","{","}",")","(",">","<","=","[","]",":","?","!","&","|",
                       "++","--","==","/*","*/",":=",">=","<=","!=","&&","||","+=","-=","*=","/=","^",">>","<<","~"
                     };/* order is significant; includes the comment markers */
/* Keyword spellings; the empty string terminates the list (see isKeyword()). */
char *keywordList[] = {"include","auto","break","case","char","const","continue","default","do","double",
                       "else","enum","extern","float","for","goto","if","int","long","register",
                       "return","short","signed","sizeof","static","struct","switch","typedef",
                       "union","unsigned","volatile","while",""
                      };/* order must match the keyword codes 256.. */
/* Characters that may follow a backslash in an escape sequence. */
char changeList[12] = {'a', 'b', 'f','n','r','t','v','\\','?','"','\'','0'};

/**
 * Function prototypes.
 * Routines described with "presumably" are defined outside the code visible
 * here; their descriptions are inferred from the call sites and must be
 * confirmed.
 **/
/* Skip spaces, tabs and newlines (counting lines); return the next character. */
char goBlank(FILE* in);
/* Consume the leading '#' lines; return the first character after them. */
char predeal(FILE* in);
/* Validate the text after '#'; the argument is the rest of that line, not a stream. */
void dealInclude(char* in);
/* Handle a lexeme starting with a letter (keyword or identifier). */
void dealAlpha();
/* Presumably handles a lexeme starting with a digit. */
void dealDigit();
/* Presumably handles operators and delimiters. */
void dealBorder();
/* Called when headCh is '/'; main() skips the rest of the line when it returns -1. */
int dealNotation();
/* Called with the opening quote character (' or "). */
void dealChar(char ch);
/* Return 256 + index in keywordList, or -1 if word is not a keyword. */
int isKeyword(char *word);
/* Write tokenArray to token.txt. */
void writeToken();
/* Presumably write the variable table to varTable.txt. */
void writeVarTable();
/* Presumably write the constant table to conTable.txt. */
void writeConTable();
/* Non-zero when ch can start an operator/delimiter (as used by main()). */
int isBorder(char ch);
/* Presumably look a name up in varTableArray; only referenced from disabled code here. */
int isInVarTable(char *name);
/* Presumably look a name up in conTableArray. */
int isInConTable(char *name);



#endif // GLOBAL_H_INCLUDED

程序说明:

1、不考虑自定义头文件,#include < 规定这样开头,尖括号前面只能空一格。
2、标识符长度< 100,否则越界。
3、字符(串)常量,长度不可超过1, 但需要结尾。
4、注释会整行打印空行。

5、错误文件输出到error.txt;常量符号表:conTable.txt;变量符号表:varTable.txt;无头文件和注释的程序out.txt;token输出到token.txt

6、错误处理包括:不合法的浮点数,注释不封闭,转义字符不存在,头文件包含错误,程序中有非法字符。

7、为了显示注释不封闭的错误处理,定义超过10个字符的“/*”没有结尾就算不封闭。可以在global.h中进行更改,缓冲区大小,文件名长度,token, conTable, varTable的数组长度都可定义。由于struct中不是指针,比较占用空间,故不可分析太大的程序,容易数组越界。

你可能感兴趣的:(编译原理)