继续完成前面一篇“设计有穷自动机DFA实现C++简单程序的词法分析、扫描(编译原理实验)”中词法分析扫描程序剩下的功能:去除多余的空行、空格和注释,对源程序进行压缩。
按实验要求(如下),这里需要考虑下面带星号*的第(3)(5)点:
实验中用到的C++源程序如下图:
思路:
其实也就是将源程序中的多余空格、注释、换行等都删除,整理成单单一行的源代码。
每次对扫描程序获取到的Token进行判断,根据上一个Token的类型(有关键字、标识符、数值、字符串、特殊符号)决定当前Token是否能够与上一个Token紧邻,也即不加任何空格。
例如上面截图中倒数第二行中的 else 和 cout 两个关键字之间就必须有空格分开,否则代码就会出错了。针对上面这个简单的C++源程序,观察其DFA图可以得出以下特点:
1、关键字与标识符不能紧邻,例如 int i中间必须有空格
2、关键字与关键字也不能紧邻,如上所述
3、另外关键字与字符串也不要紧邻
对于以上样例输入,先进行词法分析,然后将获得的Token压缩并保存在StringBuilder对象中,再写入到一个新的文件,最后再次对压缩后的文件进行扫描,判断压缩前后的扫描结果是否一致。
程序输出结果(包括压缩后的源代码)如下:
根据上面这三个特点,代码实现如下(高亮部分是与上一篇源代码不同之处):
- package lexical_analysis;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.FileReader;
- import java.io.PrintWriter;
/**
 * DFA-based scanner for a small subset of C++ that also "compresses" the
 * source: comments, blank lines and redundant whitespace are dropped while the
 * token stream is preserved.
 *
 * <p>Whitespace is kept only where removing it would change how the text is
 * tokenized: between two word characters that were separated in the source
 * (e.g. {@code int i}), between operator characters that would fuse into a
 * different operator or a comment opener (e.g. {@code + +}, {@code / *}),
 * between a keyword and a string literal, and inside preprocessor directives.
 * Each preprocessor directive is kept on a line of its own because a directive
 * extends to the end of its line.
 *
 * <p>Instances are not thread-safe; one scan runs at a time per instance.
 */
public class Scanner_2 {

    /** States of the scanner DFA. */
    private enum State {
        START, NUM, ID, EQ, NE, NM, NL, COMS, LINE_COM, MUL_COM1, MUL_COM2, QUOTED, DONE
    }

    /** Category of an emitted token; {@code Initial} means nothing was emitted yet. */
    private enum TokenType {
        Initial, ID, Num, Special, Str, KeyWord
    }

    /** Words reported as keywords by this scanner. */
    private static final String[] KEY_WORDS = {
        "include", "define", "iostream", "int", "float", "double",
        "main", "if", "else", "for", "while", "do", "goto", "switch",
        "case", "static", "cin", "cout"
    };

    /** Punctuation and comparison/shift operators reported as special symbols. */
    private static final String[] SPECIAL = {
        "{", "}", "[", "]", "(", ")",
        "#", ",", ".", ";", ":", "\\",
        "'", "\"", ">>", "<<", "!=", "=",
        "==", "<=", ">=", "++", "--",
        "<", ">", "!"
    };

    /** Arithmetic operators. */
    private static final String[] ARITHMETIC = {"+", "-", "*", "/", "%"};

    /**
     * Two-character sequences that would be read as a single operator or as a
     * comment opener if the whitespace between their halves were removed.
     */
    private static final String[] MERGING_PAIRS = {
        "++", "--", "->", "<<", ">>", "<=", ">=", "==", "!=", "&&", "||",
        "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "::", "//", "/*"
    };

    /** Reader over the file currently being scanned; only set while scanning. */
    private BufferedReader sourceFile;
    /** Category of the most recently emitted token. */
    private TokenType preType = TokenType.Initial;
    /** Compressed source produced by the current (or most recent) scan. */
    private final StringBuilder compressedStr = new StringBuilder();
    /** True when whitespace or a comment was skipped since the last emitted token. */
    private boolean separated = false;
    /** True once a token has been emitted from the current input line. */
    private boolean lineHasToken = false;
    /** True while the tokens being read belong to a preprocessor directive. */
    private boolean inDirective = false;
    /** Number of characters in {@link #lineBuf}, including the trailing newline. */
    private int bufSize = 0;
    /** Text of the current line as returned by the reader (no line terminator). */
    private String eachLine;
    /** Characters of the current line followed by a single {@code '\n'}. */
    private char[] lineBuf = new char[0];
    /** 1-based number of the current line. */
    private int lineNum = 0;
    /** Index in {@link #lineBuf} of the next character to hand out. */
    private int charPos = 0;
    /** Set once the reader has no more lines. */
    private boolean isEOF = false;

    /**
     * Resets all per-scan state so that one instance can scan several files
     * and every scan starts with an empty compressed buffer.
     */
    private void initial() {
        bufSize = 0;
        lineNum = 0;
        charPos = 0;
        isEOF = false;
        lineBuf = new char[0];
        preType = TokenType.Initial;
        separated = false;
        lineHasToken = false;
        inDirective = false;
        compressedStr.setLength(0);
    }

    /**
     * Scans {@code originalFile} from start to end, printing every token and
     * rebuilding the compressed source in memory.
     *
     * @param originalFile path of the source file to scan
     * @throws Exception if the file cannot be opened or read
     */
    private void scanning(String originalFile) throws Exception {
        this.initial();
        BufferedReader reader = new BufferedReader(new FileReader(originalFile));
        try {
            this.sourceFile = reader;
            while (!isEOF) {
                getToken();
            }
        } finally {
            this.sourceFile = null;
            reader.close();
        }
        System.out.println("========================> end scanning ...");
    }

    /**
     * Returns the next input character, loading a new line when the current
     * one is exhausted.
     *
     * <p>{@code readLine()} strips line terminators, so a single {@code '\n'}
     * is appended to every line. This lets the DFA see where a line ends
     * (needed to finish {@code //} comments and to separate tokens on adjacent
     * lines) and makes empty lines safe to read.
     *
     * @return the next character, or {@code '\0'} with {@link #isEOF} set when
     *         the input is exhausted
     * @throws Exception if reading from the file fails
     */
    private char getNextChar() throws Exception {
        if (charPos >= bufSize) {
            eachLine = sourceFile.readLine();
            if (eachLine == null) {
                isEOF = true;
                return '\0';
            }
            lineNum++;
            System.out.println(lineNum + ": " + eachLine);
            lineBuf = (eachLine + '\n').toCharArray();
            bufSize = lineBuf.length;
            charPos = 0;
            lineHasToken = false;
        }
        return lineBuf[charPos++];
    }

    /**
     * Pushes back the last {@code step} characters of the current line.
     *
     * @param step number of characters to un-read
     */
    private void unGetNextChar(int step) {
        if (!isEOF) {
            charPos = Math.max(0, charPos - step);
        }
    }

    /**
     * Reads the next token, skipping whitespace and comments, and hands it to
     * {@link #printToken(String)}.
     *
     * @return the token text, or an empty string when end of file was reached
     *         before another token started
     * @throws Exception if reading from the file fails
     */
    private String getToken() throws Exception {
        StringBuilder tokenStr = new StringBuilder();
        State currentState = State.START;
        char quote = '\0';        // delimiter of the literal being read in QUOTED
        boolean escaped = false;  // previous character in QUOTED was a backslash

        while (currentState != State.DONE) {
            char c = getNextChar();
            if (isEOF) {
                // The '\0' sentinel is not part of the source; never feed it to the DFA.
                break;
            }
            boolean isSave = true;
            switch (currentState) {
            case START:
                if (isDigit(c)) {
                    currentState = State.NUM;
                } else if (isLetter(c) || c == '_' || c == '.') {
                    // '.' starts an identifier-like token so that "iostream.h" scans as "iostream" + ".h".
                    currentState = State.ID;
                } else if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
                    isSave = false;
                    separated = true;
                    if (c == '\n') {
                        endOfLine();
                    }
                } else if (c == '!') {
                    currentState = State.NE;
                } else if (c == '=') {
                    currentState = State.EQ;
                } else if (c == '<') {
                    currentState = State.NM;
                } else if (c == '>') {
                    currentState = State.NL;
                } else if (c == '/') {
                    currentState = State.COMS;
                    isSave = false;
                } else if (c == '"' || c == '\'') {
                    quote = c;
                    escaped = false;
                    currentState = State.QUOTED;
                } else {
                    if (c == '#' && !lineHasToken) {
                        beginDirective();
                    }
                    currentState = State.DONE;
                }
                break;
            case NUM:
                if (!isDigit(c)) {
                    currentState = State.DONE;
                    unGetNextChar(1);
                    isSave = false;
                }
                break;
            case ID:
                if (!isLetter(c) && !isDigit(c) && c != '_') {
                    currentState = State.DONE;
                    unGetNextChar(1);
                    isSave = false;
                }
                break;
            case NE:
            case EQ:
                // "!=" / "==" when followed by '=', otherwise the single character already saved.
                if (c != '=') {
                    unGetNextChar(1);
                    isSave = false;
                }
                currentState = State.DONE;
                break;
            case NM:
                if (c != '=' && c != '<') {
                    unGetNextChar(1);
                    isSave = false;
                }
                currentState = State.DONE;
                break;
            case NL:
                if (c != '=' && c != '>') {
                    unGetNextChar(1);
                    isSave = false;
                }
                currentState = State.DONE;
                break;
            case QUOTED:
                if (c == '\n') {
                    // Unterminated literal: stop at the end of the line.
                    unGetNextChar(1);
                    isSave = false;
                    currentState = State.DONE;
                } else if (escaped) {
                    escaped = false;
                } else if (c == '\\') {
                    escaped = true;
                } else if (c == quote) {
                    currentState = State.DONE;
                }
                break;
            case COMS:
                isSave = false;
                if (c == '/') {
                    currentState = State.LINE_COM;
                } else if (c == '*') {
                    currentState = State.MUL_COM1;
                } else {
                    // A lone '/' is the division operator; keep it and re-read c.
                    tokenStr.append('/');
                    unGetNextChar(1);
                    currentState = State.DONE;
                }
                break;
            case LINE_COM:
                isSave = false;
                if (c == '\n') {
                    // Let START see the newline so end-of-line handling runs once.
                    unGetNextChar(1);
                    separated = true;
                    currentState = State.START;
                }
                break;
            case MUL_COM1:
                isSave = false;
                if (c == '*') {
                    currentState = State.MUL_COM2;
                }
                break;
            case MUL_COM2:
                isSave = false;
                if (c == '/') {
                    // A comment counts as whitespace; keep looking for a real token.
                    separated = true;
                    currentState = State.START;
                } else if (c != '*') {
                    currentState = State.MUL_COM1;
                }
                break;
            default:
                throw new IllegalStateException(
                        lineNum + " >> Scanner Bug : state = " + currentState);
            }
            if (isSave) {
                tokenStr.append(c);
            }
        }

        String token = tokenStr.toString();
        printToken(token);
        return token;
    }

    /** Marks the start of a preprocessor directive, which must begin on a fresh output line. */
    private void beginDirective() {
        int len = compressedStr.length();
        if (len > 0 && compressedStr.charAt(len - 1) != '\n') {
            compressedStr.append('\n');
        }
        inDirective = true;
    }

    /** Handles a newline seen between tokens: a directive ends with its line. */
    private void endOfLine() {
        if (inDirective) {
            compressedStr.append('\n');
            inDirective = false;
        }
    }

    /**
     * @param c character to test
     * @return true if {@code c} is an ASCII letter
     */
    private boolean isLetter(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    }

    /**
     * @param c character to test
     * @return true if {@code c} is an ASCII digit
     */
    private boolean isDigit(char c) {
        return '0' <= c && c <= '9';
    }

    /**
     * @param c character to test
     * @return true if {@code c} can be part of an identifier, keyword or number
     */
    private boolean isWordChar(char c) {
        return isLetter(c) || isDigit(c) || c == '_';
    }

    /**
     * @param token token text
     * @return true if {@code token} is non-empty and consists only of digits
     */
    private boolean isNum(String token) {
        if (token.length() == 0) {
            return false;
        }
        for (int i = 0; i < token.length(); i++) {
            if (!isDigit(token.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param token token text
     * @return true if {@code token} is one of the special symbols
     */
    private boolean isSpecial(String token) {
        return contains(SPECIAL, token);
    }

    /**
     * @param token token text
     * @return true if {@code token} is an arithmetic operator
     */
    private boolean isArithmetic(String token) {
        return contains(ARITHMETIC, token);
    }

    /**
     * @param token token text
     * @return true if {@code token} is one of the known keywords
     */
    private boolean isKeyWord(String token) {
        return contains(KEY_WORDS, token);
    }

    /** Linear membership test over a small constant table. */
    private static boolean contains(String[] table, String value) {
        for (int i = 0; i < table.length; i++) {
            if (table[i].equals(value)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether a space must be written before {@code token} so that the
     * compressed text is tokenized the same way as the original.
     *
     * @param token the token about to be appended (non-empty)
     * @return true if a single space has to precede the token
     */
    private boolean needsSpaceBefore(String token) {
        int len = compressedStr.length();
        if (len == 0) {
            return false;
        }
        char prev = compressedStr.charAt(len - 1);
        if (prev == '\n') {
            return false;
        }
        char first = token.charAt(0);
        if (preType == TokenType.KeyWord && first == '"') {
            // Keep keywords and string literals apart.
            return true;
        }
        if (!separated) {
            // Tokens that touched in the source can touch in the output.
            return false;
        }
        if (inDirective) {
            // Whitespace is significant inside directives (e.g. "#define F (x)").
            return true;
        }
        if (isWordChar(prev) && isWordChar(first)) {
            return true;
        }
        return contains(MERGING_PAIRS, "" + prev + first);
    }

    /**
     * Prints {@code token} with its category and appends it to the compressed
     * source, inserting a space first when required. Empty tokens (produced
     * when only whitespace or comments preceded end of file) are ignored.
     *
     * @param token token text
     */
    private void printToken(String token) {
        if (token.length() == 0) {
            return;
        }
        String label;
        TokenType type;
        if (isKeyWord(token)) {
            label = "关键字";
            type = TokenType.KeyWord;
        } else if (isSpecial(token)) {
            label = "特殊符号";
            type = TokenType.Special;
        } else if (isArithmetic(token)) {
            label = "算术运算符";
            type = TokenType.Special;
        } else if (isNum(token)) {
            label = "数值";
            type = TokenType.Num;
        } else if (token.startsWith("\"")) {
            label = "字符串";
            type = TokenType.Str;
        } else if (token.startsWith("'")) {
            label = "字符";
            type = TokenType.Str;
        } else {
            label = "标识符";
            type = TokenType.ID;
        }
        System.out.printf("%4d: %s --- %s\n", lineNum, token, label);

        if (needsSpaceBefore(token)) {
            this.compressedStr.append(' ');
        }
        this.compressedStr.append(token);
        preType = type;
        separated = false;
        lineHasToken = true;
    }

    /**
     * Prints the compressed source and writes it to {@code compressedFile},
     * replacing any existing content.
     *
     * @param compressedFile path of the file to write
     * @throws Exception if the file cannot be created or written
     */
    public void printCompressedFile(String compressedFile) throws Exception {
        System.out.println(this.compressedStr);
        PrintWriter writer = new PrintWriter(
                new FileOutputStream(new File(compressedFile)));
        try {
            writer.write(this.compressedStr.toString());
            // PrintWriter swallows I/O errors; checkError() flushes and reports them.
            if (writer.checkError()) {
                throw new java.io.IOException("failed to write " + compressedFile);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Scans a source file, writes its compressed form, scans the compressed
     * file again and reports whether both passes produce the same text.
     *
     * @param args optional: {@code args[0]} is the source file (default
     *             {@code cppSrc.cpp}), {@code args[1]} the compressed output
     *             file (default {@code afterCompressed.cpp})
     * @throws Exception if a file cannot be read or written
     */
    public static void main(String[] args) throws Exception {
        String sourcePath = args.length > 0 ? args[0] : "cppSrc.cpp";
        String compressedPath = args.length > 1 ? args[1] : "afterCompressed.cpp";

        Scanner_2 scanner = new Scanner_2();
        System.out.println("扫描未压缩源代码文件 >> ");
        scanner.scanning(sourcePath);
        String firstPass = scanner.compressedStr.toString();

        System.out.println("\n压缩之后的源代码 >> ");
        scanner.printCompressedFile(compressedPath);

        System.out.println("\n扫描压缩后的源代码文件 >> ");
        scanner.scanning(compressedPath);

        boolean same = firstPass.equals(scanner.compressedStr.toString());
        System.out.println("\n压缩前后扫描结果" + (same ? "一致" : "不一致"));
    }
}