注:参考自http://dukeland.hk,本博客系列内容为自己解读的成果,以备将来自己回顾使用。所有版权归原作者所有,如有任何问题,请联系原作者。
考虑到“字”(字符)的读取比较简单,所以把它和“词”(token)的分析放在一起,写成这一篇文章:
1, reader.js
/**
 * Character reader used by the scanner.
 *
 * It does three things: keeps the source text, keeps its own read position,
 * and offers functions to read the next character and to step back.
 *
 * @param {string} str - the complete source text to read from
 */
function Reader(str){
    this.data = str;
    this.currPos = 0;
    this.dataLength = str.length;
}

/**
 * Reads the next character and advances the read position.
 *
 * @returns {string|number} the next character, or -1 once every character
 *     has been consumed (end of stream)
 */
Reader.prototype.nextChar = function (){
    if (this.currPos >= this.dataLength){
        // The end-of-stream marker occupies one virtual slot just past the
        // data. That way a retract() issued right after reading -1 undoes
        // this read instead of rewinding onto the last real character (which
        // would make a look-ahead caller re-read that character forever).
        this.currPos = this.dataLength + 1;
        return -1; //end of stream
    }
    return this.data[this.currPos++];
};

/**
 * Steps the read position back, so that characters consumed as look-ahead
 * (needed to resolve ambiguous tokens) can be read again.
 *
 * @param {number} [n=1] - how many reads to undo; the position never goes
 *     below the start of the data
 */
Reader.prototype.retract = function (n){
    if (n === undefined || n === null){
        n = 1;
    }
    this.currPos -= n;
    if (this.currPos < 0){
        this.currPos = 0;
    }
};
2,scanner.js
/**
 * Lexical scanner: turns the characters delivered by a reader into tokens.
 *
 * Like the reader, it keeps its data (the reader), keeps its own state, and
 * offers functions to build a token and to read the next token.
 *
 * @param {Object} reader - character source exposing nextChar() (returns a
 *     character, or -1 at end of stream) and retract()
 */
function Scanner(reader){
    this.reader = reader;
    this.currentToken = new Token(); //storing the current analysed token
    this.currLine = 0; //the line number of the current line being read
    this.state = Scanner.START_STATE;
}

// The start state is also the final state. Reading a letter switches to the
// identifier state. An ambiguous character such as '/' gets a state of its
// own so that the logic stays easy to follow.
Scanner.START_STATE = 1;
Scanner.IDENTIFIER_STATE = Scanner.START_STATE + 1;
Scanner.SLASH_STATE = Scanner.IDENTIFIER_STATE + 1;

// Characters that always form a token on their own, mapped to the name of
// their constant in Token.tokens (resolved at scan time).
Scanner.SINGLE_CHAR_TOKENS = {
    ":": "COLON_TOKEN",
    ";": "SEMICOLON_TOKEN",
    "(": "LEFTPAREN_TOKEN",
    ")": "RIGHTPAREN_TOKEN",
    "{": "LEFTBRACE_TOKEN",
    "}": "RIGHTBRACE_TOKEN",
    "%": "MOD_TOKEN",
    "*": "MULT_TOKEN"
};

// True when c is an ASCII letter (the end-of-stream marker -1 is not).
Scanner.isLetter = function (c){
    return typeof c === "string" &&
        ((c >= "a" && c <= "z") || (c >= "A" && c <= "Z"));
};

// True when c is an ASCII digit (the end-of-stream marker -1 is not).
Scanner.isDigit = function (c){
    return typeof c === "string" && c >= "0" && c <= "9";
};

/**
 * Stores a token in this.currentToken. A token carries two pieces of data:
 * its type and its text.
 *
 * @param {number} type - one of the Token.tokens constants
 * @param {string} [text] - the token's text; left undefined when omitted
 * @returns {number} the token type that was passed in
 */
Scanner.prototype.makeToken = function (type, text){
    this.currentToken.type = type;
    this.currentToken.text = text;
    return type;
};

/**
 * Pushes a look-ahead character back to the reader, unless it is the
 * end-of-stream marker: nothing was consumed in that case, and retracting
 * would make the scanner read the previous character a second time.
 *
 * @param {string|number} c - the character that was just read
 */
Scanner.prototype.retractUnlessEos = function (c){
    if (c !== -1){
        this.reader.retract();
    }
};

/**
 * Consumes the next character only if it equals `expected`.
 *
 * @param {string} expected - the character to look for
 * @returns {boolean} true when the character was consumed
 */
Scanner.prototype.matchNext = function (expected){
    var c = this.reader.nextChar();
    if (c === expected){
        return true;
    }
    // On a mismatch the character has to go back, otherwise it would be lost.
    this.retractUnlessEos(c);
    return false;
};

/**
 * Reads the next token, much like the reader reads the next character.
 * The token itself is stored in this.currentToken.
 *
 * @returns {number} the type of the token that was read; EOS_TOKEN once the
 *     input is exhausted
 */
Scanner.prototype.nextToken = function(){
    var bufferStr = "";
    var c, d, e;
    while (true){
        switch (this.state){
        case Scanner.START_STATE:
            c = this.reader.nextChar();
            if (Scanner.isLetter(c)){
                // Words made of letters come first.
                this.state = Scanner.IDENTIFIER_STATE;
                //we need to remember what the token's text is
                bufferStr = c;
            }else if (Scanner.isDigit(c)){
                // Integer literal: collect the whole run of digits.
                bufferStr = c;
                d = this.reader.nextChar();
                while (Scanner.isDigit(d)){
                    bufferStr += d;
                    d = this.reader.nextChar();
                }
                this.retractUnlessEos(d);
                return this.makeToken(Token.tokens.INTLITERAL_TOKEN, bufferStr);
            }else if (c === -1){
                return this.makeToken(Token.tokens.EOS_TOKEN);
            }else if (Object.prototype.hasOwnProperty.call(Scanner.SINGLE_CHAR_TOKENS, c)){
                return this.makeToken(Token.tokens[Scanner.SINGLE_CHAR_TOKENS[c]]);
            }else{
                // Below: symbols whose meaning depends on the next character.
                switch (c){
                case "!":
                    if (this.matchNext("=")){
                        return this.makeToken(Token.tokens.NOTEQUAL_TOKEN);
                    }
                    return this.makeToken(Token.tokens.NOT_TOKEN);
                case "+":
                    if (this.matchNext("=")){
                        return this.makeToken(Token.tokens.PLUSASSIGN_TOKEN);
                    }
                    if (this.matchNext("+")){
                        return this.makeToken(Token.tokens.PLUSPLUS_TOKEN);
                    }
                    return this.makeToken(Token.tokens.PLUS_TOKEN);
                case "-":
                    if (this.matchNext("=")){
                        return this.makeToken(Token.tokens.MINUSASSIGN_TOKEN);
                    }
                    if (this.matchNext("-")){
                        return this.makeToken(Token.tokens.MINUSMINUS_TOKEN);
                    }
                    return this.makeToken(Token.tokens.MINUS_TOKEN);
                case "=":
                    if (this.matchNext("=")){
                        return this.makeToken(Token.tokens.EQUAL_TOKEN);
                    }
                    return this.makeToken(Token.tokens.ASSIGN_TOKEN);
                case ">":
                    if (this.matchNext("=")){
                        return this.makeToken(Token.tokens.GREATEREQUAL_TOKEN);
                    }
                    return this.makeToken(Token.tokens.GREATER_TOKEN);
                case "<":
                    if (this.matchNext("=")){
                        return this.makeToken(Token.tokens.LESSEQUAL_TOKEN);
                    }
                    return this.makeToken(Token.tokens.LESS_TOKEN);
                case "/":
                    // Division or the start of a comment; decided in SLASH_STATE.
                    this.state = Scanner.SLASH_STATE;
                    break;
                case "&":
                    if (this.matchNext("&")){
                        return this.makeToken(Token.tokens.AND_TOKEN);
                    }
                    // A lone '&' is reported and skipped; scanning continues.
                    Errors.push({
                        type: Errors.SYNTAX_ERROR,
                        msg: "You have only one &",
                        line: this.currLine
                    });
                    break;
                case "|":
                    if (this.matchNext("|")){
                        return this.makeToken(Token.tokens.OR_TOKEN);
                    }
                    // A lone '|' is reported and skipped; scanning continues.
                    Errors.push({
                        type: Errors.SYNTAX_ERROR,
                        msg: "You have only one |",
                        line: this.currLine
                    });
                    break;
                case "\r":
                    // "\r\n" is a single line break, so swallow the "\n".
                    this.matchNext("\n");
                    this.currLine++;
                    break;
                case "\n":
                    this.currLine++;
                    break;
                default:
                    //ignore them
                }
            }
            break;

        // Once a letter has been seen the scanner is in the identifier state,
        // and the following characters are handled here. Words are unbroken
        // runs of letters, so a space, any other character, or the end of the
        // stream finishes the word.
        case Scanner.IDENTIFIER_STATE:
            c = this.reader.nextChar();
            if (Scanner.isLetter(c)){
                bufferStr += c;
            }else{
                // End of stream also lands here: the buffered word is emitted
                // now and the following call reports EOS_TOKEN.
                this.retractUnlessEos(c);
                this.state = Scanner.START_STATE;
                switch (bufferStr){
                case "var":
                    return this.makeToken(Token.tokens.VAR_TOKEN);
                case "true":
                case "false":
                case "TRUE":
                case "FALSE":
                    return this.makeToken(Token.tokens.BOOLLITERAL_TOKEN, bufferStr);
                case "if":
                    return this.makeToken(Token.tokens.IF_TOKEN);
                case "else":
                    return this.makeToken(Token.tokens.ELSE_TOKEN);
                case "while":
                    return this.makeToken(Token.tokens.WHILE_TOKEN);
                case "print":
                    return this.makeToken(Token.tokens.PRINT_TOKEN);
                default:
                    return this.makeToken(Token.tokens.IDENTIFIER_TOKEN, bufferStr);
                }
            }
            break;

        case Scanner.SLASH_STATE:
            d = this.reader.nextChar();
            this.state = Scanner.START_STATE;
            if (d === "/"){
                //line comment: runs up to the line break or the end of stream
                bufferStr = "";
                d = this.reader.nextChar();
                while (d !== "\r" && d !== "\n" && d !== -1){
                    bufferStr += d;
                    d = this.reader.nextChar();
                }
                //to retract the line break char, so START_STATE counts the line
                this.retractUnlessEos(d);
                return this.makeToken(Token.tokens.LINECOMMENT_TOKEN, bufferStr);
            }else if (d === "*"){
                //block comment: runs up to "*/"; an unterminated comment
                //simply ends at the end of stream
                bufferStr = "";
                while (true){
                    d = this.reader.nextChar();
                    if (d === -1){
                        break;
                    }
                    if (d === "*"){
                        e = this.reader.nextChar();
                        if (e === "/"){
                            //meet */
                            break;
                        }
                        // Re-examine e on the next round: it may be another
                        // '*' (as in "**/"), a line break, or end of stream.
                        this.retractUnlessEos(e);
                        bufferStr += "*";
                    }else if (d === "\r"){
                        // "\r\n" inside a comment is still one line break.
                        bufferStr += this.matchNext("\n") ? "\r\n" : "\r";
                        this.currLine++;
                    }else{
                        if (d === "\n"){
                            this.currLine++;
                        }
                        bufferStr += d;
                    }
                }
                return this.makeToken(Token.tokens.BLOCKCOMMENT_TOKEN, bufferStr);
            }else{
                this.retractUnlessEos(d);
                return this.makeToken(Token.tokens.DIV_TOKEN);
            }
        }
    }
};
3,代码里用到了一堆 Token 常量,这里直接贴出来,不做过多解释。每个常量的值都定义为“前一个常量 + 1”,这种写法很讲究:在任意位置插入新的常量时,后面的值会自动顺延,不需要手工重新编号。
token.js
/**
 * Token class.
 *
 * @param {number} type - Token's type (one of the Token.tokens constants)
 * @param {string} [text] - the actual text that makes this token, may be
 *     null if it is not important
 */
function Token(type, text){
    this.type = type;
    this.text = text;
}

// Token type constants. Every value is defined as "previous constant + 1",
// which allows adding a new token easily later: insert it anywhere in the
// chain and the values after it shift automatically.
Token.tokens = {};
Token.tokens.EOS_TOKEN = 1; //end of stream
Token.tokens.COLON_TOKEN = Token.tokens.EOS_TOKEN + 1;
Token.tokens.SEMICOLON_TOKEN = Token.tokens.COLON_TOKEN + 1;
Token.tokens.LEFTPAREN_TOKEN = Token.tokens.SEMICOLON_TOKEN + 1;
Token.tokens.RIGHTPAREN_TOKEN = Token.tokens.LEFTPAREN_TOKEN + 1;
Token.tokens.LEFTBRACE_TOKEN = Token.tokens.RIGHTPAREN_TOKEN + 1;
Token.tokens.RIGHTBRACE_TOKEN = Token.tokens.LEFTBRACE_TOKEN + 1;
Token.tokens.MOD_TOKEN = Token.tokens.RIGHTBRACE_TOKEN + 1;
Token.tokens.VAR_TOKEN = Token.tokens.MOD_TOKEN + 1;
Token.tokens.TYPE_TOKEN = Token.tokens.VAR_TOKEN + 1;
Token.tokens.BOOLLITERAL_TOKEN = Token.tokens.TYPE_TOKEN + 1;
Token.tokens.INTLITERAL_TOKEN = Token.tokens.BOOLLITERAL_TOKEN + 1;
Token.tokens.IF_TOKEN = Token.tokens.INTLITERAL_TOKEN + 1;
Token.tokens.ELSE_TOKEN = Token.tokens.IF_TOKEN + 1;
Token.tokens.WHILE_TOKEN = Token.tokens.ELSE_TOKEN + 1;
Token.tokens.PRINT_TOKEN = Token.tokens.WHILE_TOKEN + 1;
Token.tokens.IDENTIFIER_TOKEN = Token.tokens.PRINT_TOKEN + 1;
Token.tokens.PLUS_TOKEN = Token.tokens.IDENTIFIER_TOKEN + 1;
Token.tokens.PLUSPLUS_TOKEN = Token.tokens.PLUS_TOKEN + 1;
Token.tokens.PLUSASSIGN_TOKEN = Token.tokens.PLUSPLUS_TOKEN + 1;
Token.tokens.MINUS_TOKEN = Token.tokens.PLUSASSIGN_TOKEN + 1;
Token.tokens.MINUSMINUS_TOKEN = Token.tokens.MINUS_TOKEN + 1;
Token.tokens.MINUSASSIGN_TOKEN = Token.tokens.MINUSMINUS_TOKEN + 1;
Token.tokens.MULT_TOKEN = Token.tokens.MINUSASSIGN_TOKEN + 1;
Token.tokens.DIV_TOKEN = Token.tokens.MULT_TOKEN + 1;
Token.tokens.ASSIGN_TOKEN = Token.tokens.DIV_TOKEN + 1;
Token.tokens.EQUAL_TOKEN = Token.tokens.ASSIGN_TOKEN + 1;
Token.tokens.NOTEQUAL_TOKEN = Token.tokens.EQUAL_TOKEN + 1;
Token.tokens.GREATER_TOKEN = Token.tokens.NOTEQUAL_TOKEN + 1;
Token.tokens.GREATEREQUAL_TOKEN = Token.tokens.GREATER_TOKEN + 1;
Token.tokens.LESS_TOKEN = Token.tokens.GREATEREQUAL_TOKEN + 1;
Token.tokens.LESSEQUAL_TOKEN = Token.tokens.LESS_TOKEN + 1;
Token.tokens.AND_TOKEN = Token.tokens.LESSEQUAL_TOKEN + 1;
Token.tokens.OR_TOKEN = Token.tokens.AND_TOKEN + 1;
Token.tokens.NOT_TOKEN = Token.tokens.OR_TOKEN + 1;
Token.tokens.LINECOMMENT_TOKEN = Token.tokens.NOT_TOKEN + 1;
Token.tokens.BLOCKCOMMENT_TOKEN = Token.tokens.LINECOMMENT_TOKEN + 1;

// Inverse look-up: numeric token type -> constant name. Object.keys visits
// own properties only and keeps the loop variable out of the global scope.
Token.backwardMap = {};
Object.keys(Token.tokens).forEach(function (name){
    Token.backwardMap[Token.tokens[name]] = name;
});