大家都知道,构造编译器的第一步就是词法分析,即
在对程序设计语言的源程序进行扫描的过程中,将字符流形式的源程序转化为由各类单词符号组成的单词流。
当然,后续还有语法分析,语义分析等步骤,说到语义分析,就不得不提之前看到的一则新闻:英国少年成全球最年轻技术富豪 15岁获李嘉诚投资。他获得投资的原因
很简单:
2011年,15岁的尼克在家准备考试时,发现逐条点开各个新闻效率太低,于是开始动手设计一款通过语义分析算法来精简新闻的程序。后来这款应用程序被美国雅虎收购,
交易价值约为3000万美元。羡慕ing啊……好像跑题了,今天就说说词法分析程序的设计。
处理过程简述:
在一个程序设计语言中,一般都含有若干类单词符号,为此可首先为每类单词建立一张状态转换图,然后将这些状态转换图合并成一张统一的状态图,即得到了一个有限
自动机,再进行必要的确定化和状态数最小化处理,最后添加当进行状态转移时所需执行的语义动作,就可以据此构造词法分析程序了。以上状态转换图的画法,有限自动机的
确定化和最小化,我就不多说了,在编译课堂上都是基础知识。直接上我和小伙伴辛苦的劳动成果。
#include <ctype.h>
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Shared scanner state and token tables.
 *
 * Buffer capacities. The original 80/12-byte buffers could not hold a
 * realistic source file or an identifier longer than 11 characters.
 */
enum { PROG_CAPACITY = 4096, TOKEN_CAPACITY = 32 };

char prog[PROG_CAPACITY];   /* whole input text, loaded by main() */
char token[TOKEN_CAPACITY]; /* spelling of the token being scanned */
char ch;                    /* character currently examined by scaner() */

int syn;     /* class code of the last token; -1 marks a lexical error */
int p;       /* index of the next unread character in prog[] */
int m;       /* write index into token[] */
int n;       /* loop index used by the keyword lookup */
int row = 1; /* current line number, starting at 1 */
int col = 0; /* index in prog[] where the current line starts */

FILE *in;  /* input stream (in.txt) */
FILE *out; /* output stream (out.txt) */

double sum;  /* numeric literal: integer part, later integer + fraction */
double sum2; /* numeric literal: fractional part */
double sum3; /* numeric literal: decimal exponent */

/* Reserved words, in the same order as their class codes in rwnum[]. */
const char *rwtab[14] = {
    "begin", "end", "if", "then", "else", "while", "do",
    "switch", "case", "default", "goto", "break", "continue", "extern"
};

/* Printable name of every token class; class code k is stored at Table[k - 1]. */
const char *Table[] = {
    "BEGIN", "END", "IF", "THEN", "ELSE", "ID", "UCOUN", "LT", "LE", "EQ",
    "NE", "GT", "GE", "IS", "PL", "MI", "MU", "DI", "WHILE", "DO",
    "AND", "OR", "NOT", "SWITCH", "CASE", "DEFAULT", "GOTO", "BREAK",
    "CONTINUE", "EXTERN", "ADD", "SUB", "LEFT", "RIGHT", "LEFTLARGE",
    "RIGHTLARGE", "FENHAO", "FANXIEGANG", "MAOHAO", "FILEEND"
};

/* Class code of rwtab[i]. */
int rwnum[] = {1, 2, 3, 4, 5, 19, 20, 24, 25, 26, 27, 28, 29, 30};

/* Scans the next token from prog[] and stores its class code in syn. */
int scaner(void);
main()
{
p=0;
printf("***************编译原理词法分析******************\n");
char ch;
int i=0;
if((in=fopen("in.txt","r"))==NULL)
{
printf("不能打开程序文件!\n\n");
}
if((out=fopen("out.txt","w"))==NULL)
{
printf("不能打开写入文件!\n\n");
}
ch=fgetc(in);
prog[i++]=ch;
while(ch!=EOF)
{
ch=fgetc(in);prog[i++]=ch;
}
p=0;
do{
scaner();
switch(syn)
{
case 7:printf("(%s,%g)\n",Table[syn-1],sum*pow(10,sum3));
fprintf(out,"(%s,%g)\n",Table[syn-1],sum*pow(10,sum3));
break;
case 6:printf("(%s,%s)\n",Table[syn-1],token);
fprintf(out,"(%s,%s)\n",Table[syn-1],token);
break;
case -1:printf("you have input a wrong string at row(%d),col(%d)\n",row,p-col);
getchar();
return;
default: printf("(%s, )\n",Table[syn-1]);
fprintf(out,"(%s, )\n",Table[syn-1]);
break;
}
}while(syn!=40);
fclose(in);
fclose(out);
printf("-----全部结果已经存入out.txt文档-----\n");
fprintf(out,"--------完成--------\n");
}
scaner()
{
int j;
sum=0;
sum2=0;
sum3=0;
for(m=0;m<12;m++)token[m]=NULL;
ch=prog[p++];
m=0;
while((ch==' ')||(ch=='\t')||(ch=='\n'))
{
if(ch=='\n')
{row++;
col=p;}
ch=prog[p++];
} //去除空格字符,制表符,回车
if(ch=='/')
{
if(prog[p]=='/')
{
for(j=0;prog[p]!='\n';j++)
{
p++;
}
p++;
ch= prog[p++];
}
else if(prog[p]=='*')
{
p++;
for(j=0;prog[p]!='*';j++)
{p++;}
p++;
if(prog[p++]='/')
{ ch= prog[p++];}
}//去除注释
else
{
p--;
ch= prog[p++];
}
}
if(isalpha (ch))
{
while(isalnum (ch))
{
token[m++]=ch;
ch=prog[p++];
}
p--;
syn=6;//标识符
for(n=0;n<14;n++)
if(strcmp(token,rwtab[n])==0)
{ syn=rwnum[n];//关键字
break;
}
}
else if(isdigit(ch))
{ bool flag=0;
while(isdigit(ch))
{ sum=sum*10+ch-'0';
ch=prog[p++];
}
if (ch=='.')
{ double t=1.0;
ch=prog[p++];
int temp=ch-'0';
if(temp>9|temp<0)
{
syn=-1;
return;}
while(isdigit(ch))
{ double temp=1.0;
for(int i=t;i>0;i--)
temp=temp*10;
sum2=sum2+(ch-'0')/temp;
t++;
ch=prog[p++];
}
}
sum=sum+sum2;
if (ch=='E')
{
ch=prog[p++];
if(ch=='-')
{flag=1;
ch=prog[p++];}
else flag=0;
while(isdigit(ch))
{
sum3=sum3*10+ch-'0';
ch=prog[p++];
}
}
if (flag==1)
{
sum3=-sum3;
}
p--;
syn=7;//无符号数
}
else switch(ch)
{ case '<':token[m++]=ch;
ch=prog[p++];
if(ch=='=')
{ syn=9;
token[m++]=ch;
}
else if(ch=='>')
{ syn=11;
p--;
}
else
{ syn=8;
p--;
}
break;
case '>':token[m++]=ch;
ch=prog[p++];
if(ch=='=')
{ syn=13;
token[m++]=ch;
}
else
{ syn=12;
p--;
}
break;
case '+': token[m++]=ch;
ch=prog[p++];
if(ch=='+')
{ syn=31;
token[m++]=ch;
}
else
{ syn=15;
p--;
}
break;
case '-':token[m++]=ch;
ch=prog[p++];
if(ch=='-')
{ syn=32;
token[m++]=ch;
}
else
{ syn=16;
p--;
}
break;
case '!':token[m++]=ch;
syn=23;
break;
case '=':token[m++]=ch;
ch=prog[p++];
if(ch=='=')
{ syn=10;
token[m++]=ch;
}
else
{syn=14;
p--;}
break;
case '*': syn=17;
token[m++]=ch;
break;
case '/': syn=18;
token[m++]=ch;
break;
case '(': syn=33;
token[m++]=ch;
break;
case ')': syn=34;
token[m++]=ch;
break;
case '{': syn=35;
token[m++]=ch;
break;
case '}': syn=36;
token[m++]=ch;
break;
case ';': syn=37;
token[m++]=ch;
break;
case '\"': syn=38;
token[m++]=ch;
break;
case ':':syn=39;
token[m++]=ch;
break;
case '&':syn=21;
token[m++]=ch;
break;
case '|':syn=22;
token[m++]=ch;
break;
case EOF: syn=40;
token[m++]=ch;
break;
default: syn=-1;
break;
}
token[m++]='\0';
}
该程序以 in.txt 文档输入字符流,以二元式形式将单词流输出到 out.txt 文档。可以识别无符号数,标识符,关键字,还可以过滤空格,制表符,注释,不过无符号数的
识别好像还有问题,有待完善。去除注释也有漏洞,没来得及修改……愿大家提出建议,共同进步。