思路很简单:将源代码作为一个长字符串读入,之后通过switch语句及状态转换图进行词素识别,并对识别出的词素进行整理输出。想法很简单,具体的实现有一点小困难。自我感觉良好的部分是符号表类SymTable的创建,它为词素的判断提供了很大的帮助,并为以后分析器的改进提供了方便,例如将C语言分析器改为Java分析器,只需要改变Java语言中与C语言不同的关键字、运算符、界符等。另外的一个亮点就是switch语句的应用,大大简化了程序。本来按照运算符的分类,状态转换图至少有60个状态,还是挺恐怖的。废话不多说了,看代码吧:
package com.ant.model; public class MyModel { private String source; private String target; private StringBuffer SBSource; private StringBuffer SBTarget = new StringBuffer(); private StringBuffer lexBuf = new StringBuffer();//用于存储预读的词素,进行判断分析 private String lexStr; private int state = 0;//表明当前所处状态 private int start = 0;//表明状态转化图的出示状态 private char c; private int begin = 0;//类似指针,表明读取字符所在位置 private int forward = 0;//类似指针,用于表明向前搜索的位置 private SymTable sym_Table = new SymTable();//符号表,用于存放预定义的关键字信息,及分析过程中新添加的标识符 private int flags = 1;//用于标记终态是否需要指针后移一位,0,需要;1,不需要 private int note = 0; public MyModel() { } public MyModel(String source){ //接受源代码,并以StringBuffer的形式存储 this.source = source; SBSource = new StringBuffer(source); analysis(); } public void analysis(){ //对源代码进行分析 while(begin < source.length()){ SBTarget.append(nextToken() +'\n'); lexBuf.delete(0, lexBuf.length()); start = state = 0; } System.out.println(begin); System.out.println(source.length()); System.out.println(forward); } public String nextToken(){ //词法分析器的核心代码,根据状态转化图,对每一个词素进行识别,并将记号及其属性返回 /** * case 0~16:完成对运算符的识别 * case 17~19:完成对标识符的识别,并通过查找符号表,实现关键字的判断 */ while(true){ if(note == 1 ){ //读取内容为注释内容,另行处理 while (nextChar()!='*'){ lexBuf.append(nextChar()); forward++; } note = 0; return "< " + lexBuf.toString() + " " + "注释" + " >"; }else{ switch(state){ case 0: c = nextChar(); if(c == ' '||c == '\t'|| c == '\n'){ state = 0; forward = ++begin; System.out.println(forward); } else{ switch(c){ case '<': state = 13; break; case '>': state = 15; break; case '-': state = 3; break; case '!': state = 4; break; case '+': state = 5; break; case '*': state = 6; break; case '/': state = 7; break; case '%': state = 8; break; case '=': state = 9; break; case '&': state = 10; break; case '^': state = 11; break; case '|': state = 12; break; case '#': case '(': case ')': case '[': case ']': case '{': case '}': case '"': case '.': case '~': case '?': case ':': case ';': case ',': state = 1; break; default: state = fail(); } } 
break; case 1: return final_state(); case 2: flags = 0; return final_state(); case 3: //识别'-','->','--','-=' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '>': case '-': case '=': state = 1;break; default: state = 2; } break; case 4: //'!','!=' lexBuf.append(c); forward++; c = nextChar(); if(c == '=') state = 1; else state = 2; break; case 5: //'+','++','+=' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '+': case '=': state = 1;break; default: state = 2; } break; case 6: //'*','*=','*/' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '=': case '/': state = 1;break; default: state = 2; } break; case 7: //'/','/*','/=' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '*': case '=': state = 1;break; default: state = 2; } break; case 8: //'%','%=' lexBuf.append(c); forward++; c = nextChar(); if(c == '=') state = 1; else state = 2; break; case 9: //'=','==' lexBuf.append(c); forward++; c = nextChar(); if(c == '=') state = 1; else state = 2; break; case 10: //'&','&&','&=' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '&': case '=': state = 1;break; default: state = 2; } break; case 11: //'^','^=' lexBuf.append(c); forward++; c = nextChar(); if(c == '=') state = 1; else state = 2; break; case 12: //'|','||','|=' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '|': case '=': state = 1;break; default: state = 2; } break; case 13: //'<','<<','<=' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '<': state = 14;break; case '=': state = 1;break; default: state = 2; } break; case 14: //'<<=' lexBuf.append(c); forward++; c = nextChar(); if(c == '=') state = 1; else state = 2; break; case 15: //'>','>>','>=' lexBuf.append(c); forward++; c = nextChar(); switch(c){ case '>': state = 16;break; case '=': state = 1;break; default: state = 2; } break; case 16: //'>>=' lexBuf.append(c); forward++; c = nextChar(); if(c == '=') state = 1; else state = 2; break; //完成对标记符的捕获,并识别 case 17: 
if(Character.isLetter(c)) state = 18; else state = fail(); break; case 18: lexBuf.append(c); forward++; c = nextChar(); if(Character.isLetter(c)||Character.isDigit(c)) state = 18; else state = 19; break; case 19: flags = 0; return final_state(); //完成对数字的识别 case 20: if(Character.isDigit(c)) state = 21; else state = fail(); break; case 21: lexBuf.append(c); forward++; c = nextChar(); if(Character.isDigit(c)) state = 21; else if(c == '.' ) state = 22; else if(c == 'E'|| c == 'e') state = 24; else state = 27; break; case 22: lexBuf.append(c); forward++; c = nextChar(); if(Character.isDigit(c)) state = 23; else state = fail(); break; case 23: lexBuf.append(c); forward++; c = nextChar(); if(Character.isDigit(c)) state = 23; else if(c == 'E'|| c == 'e') state = 24; else state = 27; break; case 24: lexBuf.append(c); forward++; c = nextChar(); if(Character.isDigit(c)) state = 26; else if(c == '-'||c == '+') state = 25; else state = fail(); break; case 25: lexBuf.append(c); forward++; c = nextChar(); if(Character.isDigit(c)) state = 26; else state = fail(); break; case 26: lexBuf.append(c); forward++; c = nextChar(); if(Character.isDigit(c)) state = 26; else state = 27; case 27: lexStr = lexBuf.toString(); flags = 0; retract(); install_id(); return "< " + lexStr + " " + "NUM" + " >"; case 28: //完成对注释语句的读取 lexBuf.append(c); forward++; case 30: return " "; } } } } //当状态位终态是执行的操作 public String final_state(){ if(flags == 1) lexBuf.append(c); retract(); install_id(); if(lexStr.equals("/*")){ note = 1; } return "< " +'"'+ lexStr +'"' +" " + gettoken() + " >"; } //将搜索指示器回调1个字符位置,标记flags是否需要指针后移一位,0,需要;1,不需要 public void retract(){ if(flags == 0){ begin = forward; flags = 1; } else{ begin = ++forward; } } //在符号表中查找词素,当它被标记为关键字时,返回0;当为程序变量时,返回指向相应符号表表型的指针。 //若为找到该词素,则将该词素作为变量填入符号表,并返回指向新建表项的指针 public int install_id(){ lexStr = lexBuf.toString(); int index = sym_Table.lookup(lexStr); if(index == 0){ sym_Table.insert(lexStr,"id"); return sym_Table.getSymtable().size(); } else{ return 0; 
} } //在符号表中查找词素,若词素为关键字,则返回相应的记号,否则返回记号id public String gettoken(){ int index = sym_Table.lookup(lexStr); if(index == 0){ return "id"; } else { return sym_Table.getSymtable().get(index).getToken(); } } public char nextChar(){ //读取下一个字符 if(forward < source.length()){ return SBSource.charAt(forward); }else { return '@'; } } public int fail(){ //状态转化图识别失败 forward = begin;//有待商榷?? switch(start){ case 0: start = 17; break; case 17: start = 20; break; case 20: case 22: case 24: case 25: start = 30; break; default: } return start; } public String getTarget(){ //获取分析结果,并以String的形式返回给用户 target = SBTarget.toString(); return target; } }
package com.ant.model; import java.util.ArrayList; /** * 符号表,用于保存源语言结构的各种信息。包括关键字的预存储,及分析过程中的标识符的存储 * @author Administrator * */ public class SymTable { private StringBuffer lexmes = new StringBuffer();//用于存储形成标识符的字符串 private ArrayList<Symbol> symtable = new ArrayList<Symbol>();//符号表的实现,内为创建的符号表表项的内部类 private static char EOS = '@'; public SymTable() { symtable.add(new Symbol(-1,""));//向符号表插入指针指向为空的Symbol init();//完成关键字的预存储 } //完成C语言关键字的预存储 public void init(){ //C语言关键字的预存储 insert("auto","auto"); insert("break","break"); insert("case","case"); insert("char","cahr"); insert("const","const"); insert("continue","continue"); insert("default","default"); insert("do","do"); insert("double","double"); insert("else","else"); insert("enum","enum"); insert("extern","extern"); insert("float","flout"); insert("for","for"); insert("goto","goto"); insert("if","if"); insert("int","int"); insert("long","long"); insert("register","register"); insert("return","return"); insert("short","short"); insert("signed","signed"); insert("sizeof","sizeof"); insert("static","static"); insert("struct","struct"); insert("switch","switch"); insert("typedef","typedef"); insert("union","union"); insert("unsigned","unsigned"); insert("void","void"); insert("volatilc","volatile"); insert("while","while"); //C语言主要界符的预存储 insert("(","界符"); insert(")","界符"); insert("/*","界符"); insert("*/","界符"); insert("{","界符"); insert("}","界符"); insert(";","界符"); insert('"' + "","界符"); //C语言主要运算符的预存储 insert("->","运算符"); insert("[","运算符"); insert("]","运算符"); insert(".","运算符"); insert("!","运算符"); insert("~","运算符"); insert("++","运算符"); insert("--","运算符"); insert("-","运算符"); insert("*","运算符"); insert("&","运算符"); insert("/","运算符"); insert("%","运算符"); insert("+","运算符"); insert("-","运算符"); insert("<<","运算符"); insert(">>","运算符"); insert("<","运算符"); insert("<=","运算符"); insert(">","运算符"); insert(">=","运算符"); insert("==","运算符"); insert("!=","运算符"); insert("^","运算符"); insert("|","运算符"); insert("&&","运算符"); insert("||","运算符"); 
insert("?","运算符"); insert(":","运算符"); insert("=","运算符"); insert("+=","运算符"); insert("-=","运算符"); insert("*=","运算符"); insert("/=","运算符"); insert("%=","运算符"); insert(">>=","运算符"); insert("<<=","运算符"); insert("&=","运算符"); insert("^=","运算符"); insert("|=","运算符"); insert(",","运算符"); insert("#","特殊字符"); } //将字符串s和记号token插入相应的表项,并返回相应的表项的索引 public int insert(String s,String token){ int index = lexmes.length(); symtable.add(new Symbol(index,token)); lexmes.append(s + EOS); return symtable.size(); } //到符号表中查找字符串s,如果找到则返回相应的表项的索引,否则返回0 //ArrayList从0开始查找 public int lookup(String s){ for(int i = 1;i < symtable.size();i++){ StringBuffer lexBf = new StringBuffer();//用于暂时存储从lexmes中读取的字符 int index = symtable.get(i).getLexptr();//获取字符表中第i个元素所指示的字符串在lexmes中的起始位置 while(lexmes.charAt(index)!= EOS){ lexBf.append(lexmes.charAt(index)); index++; } //将从lexmes中读取的字符串进行比较,相同,则退出查找,否则继续,直至查找结束 if(s.equals(lexBf.toString())){ return i; } } return 0; } public static void main(String [] args){ SymTable sym = new SymTable(); System.out.println(sym.lookup("case")); } public ArrayList<Symbol> getSymtable() { return symtable; } public void setSymtable(ArrayList<Symbol> symtable) { this.symtable = symtable; } //创建内部类,用于存储符号表表项 public class Symbol{ private int lexptr;//用于标记词素在StringBuffer中的起始位置 private String token;//词素的相应记号,标记符统一为id,关键字的为相应的关键字 private String attribute;//词素的额外属性,暂时不予考虑, public Symbol(int lexptr,String token){ this.lexptr = lexptr; this.token = token; this.attribute = " "; } public Symbol(int lexptr,String token,String attribute){ this.lexptr = lexptr; this.token = token; this.attribute = attribute; } public void setLexptr(int lexptr) { this.lexptr = lexptr; } public int getLexptr() { return lexptr; } public void setToken(String token) { this.token = token; } public String getToken() { return token; } public void setAttribute(String attribute) { this.attribute = attribute; } public String getAttribute() { return attribute; } } }
分析器的界面就不用再说了吧,无非是一个输入框,一个输出框,一个执行按钮。
刚刚做完就忍不住和同志们分享一下,哈哈,当然还有很多不足。例如注释的识别,还有一些尚未发现的bug。不过感觉还是很好的。哈哈