// Lexical scanner for C source text.
//
// The program reads a source file, then repeatedly calls scaner() and prints
// one "<code>\t\t<token>" line per token. Token codes:
//    0..31  keyword (index into keyWords)      32  identifier
//   33      decimal digit sequence             34..45  ( ) [ ] . ~ ? : , ; { }
//   46 !   47 !=   48 -   49 ->  50 --  51 -=  52 +   53 ++  54 +=
//   55 *   56 *=   57 &   58 &&  59 &=  60 %   61 %=
//   62 <   63 <=   64 <<  65 <<= 66 >   67 >=  68 >>  69 >>=
//   70 =   71 ==   72 |   73 ||  74 |=  75 ^   76 ^=
//   77 /   78 /=   79 line comment   80 block comment
//   81 lone '#'   82..90 directive (82 + index into precompiledDirectives)
//   91 character literal   92 string literal   -1 error / nothing to scan
//
// Define SCANNER_NO_MAIN to compile the scanner without main(), e.g. to link
// it into a test harness that provides its own entry point.

#include <cctype>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;

// Cursor over an in-memory copy of the text being scanned.
class SourceCode{
private:
    string code;   // full text
    int index;     // position of the next character to read
    int length;    // cached code.length()
public:
    SourceCode(){
        code="";
        index=0;
        length=0;
    }
    SourceCode(string code){
        this->code=code;
        index=0;
        length=static_cast<int>(this->code.length());
    }
    // Replaces the text and rewinds the cursor.
    void setCode(string code){
        this->code=code;
        index=0;
        length=static_cast<int>(this->code.length());
    }
    string getCode() const{
        return code;
    }
    int getIndex() const{
        return index;
    }
    // Returns the next character and advances. Past the end it returns '\0'
    // instead of indexing outside the string, but still advances so that a
    // following retract() stays balanced.
    char nextChar(){
        const char c = (index>=0 && index<length) ? code[index] : '\0';
        index++;
        return c;
    }
    // Returns the most recently read character, or '\0' if there is none.
    char getPreChar() const{
        const int prev = index-1;
        return (prev>=0 && prev<length) ? code[prev] : '\0';
    }
    // Un-reads one character; never moves before the start of the text.
    void retract(){
        if(index>0) index--;
    }
    bool end() const{
        return index>=length;
    }
};

SourceCode sourceCode;

// C keywords; the array index is the token code returned by scaner().
string keyWords[]={"auto","short","int","long","float","double","char","struct","union","enum",
                   "typedef","const","signed","unsigned","extern","register","static","volatile","void",
                   "if","else","switch","case","for","do","while","goto","continue","break",
                   "default","sizeof","return"};

// Recognised preprocessor directives; token code is 82 + array index.
// NOTE: codes 91/92 follow directly, so the list cannot grow without
// renumbering; "else", "line" and "pragma" are therefore not recognised.
string precompiledDirectives[]={"include","define","undef","if","ifdef","ifndef","elif","endif","error"};

// Value returned by getbc()/getbcBuLFandCR() when the input is exhausted.
static const char kNoChar = static_cast<char>(-1);

// isspace() that is safe for bytes >= 0x80 (plain char may be negative).
static bool isSpace(char ch){
    return isspace(static_cast<unsigned char>(ch)) != 0;
}

// Skips whitespace and returns the first non-whitespace character.
// Returns (char)-1 when the input is exhausted. Exactly one "read" past the
// end is performed in that case so callers may retract() unconditionally.
char getbc(){
    for(;;){
        if(sourceCode.end()){
            sourceCode.nextChar();
            return kNoChar;
        }
        const char ch = sourceCode.nextChar();
        if(!isSpace(ch)) return ch;
    }
}

// Like getbc(), but also stops at (and returns) '\r' or '\n'.
char getbcBuLFandCR(){
    for(;;){
        if(sourceCode.end()){
            sourceCode.nextChar();
            return kNoChar;
        }
        const char ch = sourceCode.nextChar();
        if(ch=='\r' || ch=='\n' || !isSpace(ch)) return ch;
    }
}

bool letter(char ch){
    return (ch>='a' && ch<='z') || (ch>='A' && ch<='Z');
}

bool digit(char ch){
    return ch>='0' && ch<='9';
}

bool digitOctonary(char ch){
    return ch>='0' && ch<='7';
}

bool digitHexadecimal(char ch){
    return (ch>='0' && ch<='9') || (ch>='a' && ch<='f') || (ch>='A' && ch<='F');
}

// Converts a string of hexadecimal digits to its value.
// Returns -1 if a non-hex character is found; an empty string yields 0.
int HexToDec(string s){
    int result=0;
    for(string::size_type i=0; i<s.size() && s[i]!='\0'; i++){
        const char c=s[i];
        int value;
        if(c>='0' && c<='9') value=c-'0';
        else if(c>='a' && c<='f') value=c-'a'+10;
        else if(c>='A' && c<='F') value=c-'A'+10;
        else return -1;
        result=result*16+value;
    }
    return result;
}

// Converts a string of octal digits to its value.
// Returns -1 if a non-octal character is found; an empty string yields 0.
int OctToDec(string s){
    int result=0;
    for(string::size_type i=0; i<s.size() && s[i]!='\0'; i++){
        if(s[i]<'0' || s[i]>'7') return -1;
        result=result*8+(s[i]-'0');
    }
    return result;
}

// Returns the index of s in keyWords, or 32 (the identifier code) if absent.
int isKeyWords(string &s){
    const int count = static_cast<int>(sizeof(keyWords)/sizeof(keyWords[0]));
    for(int i=0;i<count;i++){
        if(keyWords[i]==s) return i;
    }
    return 32;
}

// Returns the index of s in precompiledDirectives, or -1 if absent.
int isPrecompiledDirectives(string &s){
    const int count = static_cast<int>(sizeof(precompiledDirectives)/sizeof(precompiledDirectives[0]));
    for(int i=0;i<count;i++){
        if(precompiledDirectives[i]==s) return i;
    }
    return -1;
}

// Non-zero if ch is the character following '\' in a simple escape sequence.
int isSpecialChar(char ch){
    return ch=='n' || ch=='r' || ch=='t' || ch=='v' || ch=='a' || ch=='b' || ch=='f'
        || ch=='\'' || ch=='\"' || ch=='\\' || ch=='?';
}

// Outcome of decoding an escape sequence.
enum EscapeStatus{
    ESC_OK,       // sequence decoded
    ESC_BAD_HEX,  // "\x" not followed by a hex digit (fatal)
    ESC_UNKNOWN   // character after '\' starts no known escape (non-fatal)
};

// Decodes the escape sequence whose backslash has just been consumed.
// On ESC_OK the decoded character is stored in `out`. On failure the cursor
// is restored to just after the backslash. Status is reported separately
// from the value so that every byte (including 0xFF/0xFE) can be decoded.
static EscapeStatus decodeEscape(char &out){
    char ch = sourceCode.nextChar();
    switch(ch){
        case 'n':  out='\n'; return ESC_OK;
        case 'r':  out='\r'; return ESC_OK;
        case 't':  out='\t'; return ESC_OK;
        case 'v':  out='\v'; return ESC_OK;
        case 'a':  out='\a'; return ESC_OK;
        case 'b':  out='\b'; return ESC_OK;
        case 'f':  out='\f'; return ESC_OK;
        case '\'': out='\''; return ESC_OK;
        case '\"': out='\"'; return ESC_OK;
        case '\\': out='\\'; return ESC_OK;
        case '?':  out='\?'; return ESC_OK;
        case 'x':
        case 'X': {
            // One or two hexadecimal digits.
            ch = sourceCode.nextChar();
            if(!digitHexadecimal(ch)){
                sourceCode.retract();   // un-read the non-hex character
                sourceCode.retract();   // un-read the 'x'
                return ESC_BAD_HEX;
            }
            string digits(1,ch);
            ch = sourceCode.nextChar();
            if(digitHexadecimal(ch)) digits.append(1,ch);
            else sourceCode.retract();
            out = static_cast<char>(HexToDec(digits));
            return ESC_OK;
        }
        default:
            break;
    }
    if(digitOctonary(ch)){
        // One to three octal digits; values above 0xFF are truncated to char.
        string digits(1,ch);
        ch = sourceCode.nextChar();
        for(int i=0;i<2 && digitOctonary(ch);i++){
            digits.append(1,ch);
            ch = sourceCode.nextChar();
        }
        sourceCode.retract();
        out = static_cast<char>(OctToDec(digits));
        return ESC_OK;
    }
    sourceCode.retract();
    return ESC_UNKNOWN;
}

// Legacy interface to decodeEscape(): returns the decoded character, -1 for a
// malformed "\x" escape, or -2 for an unknown escape. The sentinels share the
// value space with real characters; scaner() uses decodeEscape() instead.
char getESC(){
    char value = '\0';
    switch(decodeEscape(value)){
        case ESC_OK:      return value;
        case ESC_BAD_HEX: return static_cast<char>(-1);
        default:          return static_cast<char>(-2);
    }
}

// Consumes the next character and appends it to token if it equals expected;
// otherwise leaves the cursor untouched.
static bool acceptChar(char expected, string &token){
    if(sourceCode.nextChar()==expected){
        token.append(1,expected);
        return true;
    }
    sourceCode.retract();
    return false;
}

// Scans the rest of a "//" comment; the terminating newline is not consumed.
static int scanLineComment(string &token){
    while(!sourceCode.end()){
        const char ch = sourceCode.nextChar();
        if(ch=='\r' || ch=='\n'){
            sourceCode.retract();
            break;
        }
        token.append(1,ch);
    }
    return 79;
}

// Scans the rest of a "/*" comment. Returns -1 if it is never closed.
static int scanBlockComment(string &token){
    bool previousWasStar = false;   // the '*' of the opener does not count
    while(!sourceCode.end()){
        const char ch = sourceCode.nextChar();
        token.append(1,ch);
        if(previousWasStar && ch=='/') return 80;
        previousWasStar = (ch=='*');
    }
    return -1;
}

// Scans what follows a '#': a directive name, or nothing before end of line.
static int scanDirective(string &token){
    char ch = getbcBuLFandCR();
    if(ch=='\r' || ch=='\n'){
        token.append(1,'#');
        return 81;
    }
    while(letter(ch)){
        token.append(1,ch);
        ch = sourceCode.nextChar();
    }
    sourceCode.retract();
    const int directive = isPrecompiledDirectives(token);
    if(directive==-1) return -1;
    token.insert(0,1,'#');
    return directive+82;
}

// Scans a character literal after its opening quote. The token receives the
// decoded character without quotes. An empty literal ('') is accepted with an
// empty token, as the scanner has always done.
static int scanCharLiteral(string &token){
    if(sourceCode.end()) return -1;
    char ch = sourceCode.nextChar();
    if(ch=='\'') return 91;
    if(ch!='\\'){
        token.append(1,ch);
        ch = sourceCode.nextChar();
        if(ch=='\'') return 91;
        sourceCode.retract();
        return -1;
    }
    char value = '\0';
    if(decodeEscape(value)!=ESC_OK) return -1;
    token.append(1,value);
    return sourceCode.nextChar()=='\'' ? 91 : -1;
}

// Scans a string literal after its opening quote. The token receives the
// decoded contents without quotes. Returns -1 if the literal is cut short by
// a newline or by the end of input, or contains a malformed "\x" escape.
static int scanStringLiteral(string &token){
    for(;;){
        if(sourceCode.end()) return -1;
        const char ch = sourceCode.nextChar();
        if(ch=='\"') return 92;
        if(ch=='\r' || ch=='\n') return -1;
        if(ch!='\\'){
            token.append(1,ch);
            continue;
        }
        char value = '\0';
        const EscapeStatus status = decodeEscape(value);
        if(status==ESC_BAD_HEX) return -1;
        // Non-fatal escape: ignore the backslash and keep the next character.
        if(status==ESC_UNKNOWN) value = sourceCode.nextChar();
        token.append(1,value);
    }
}

// Scans the next token from sourceCode.
// token receives the token text (cleared first); the return value is the
// token code listed at the top of this file, or -1 on error or when only
// whitespace remained. At least one character is always consumed.
int scaner(string &token){
    token.clear();
    char ch = getbc();

    // Identifier or keyword.
    if(ch=='_' || letter(ch)){
        while(ch=='_' || letter(ch) || digit(ch)){
            token.append(1,ch);
            ch = sourceCode.nextChar();
        }
        sourceCode.retract();
        return isKeyWords(token);   // 32 when not a keyword
    }

    // Decimal digit sequence.
    if(digit(ch)){
        while(digit(ch)){
            token.append(1,ch);
            ch = sourceCode.nextChar();
        }
        sourceCode.retract();
        return 33;
    }

    switch(ch){
        case '(': token.append(1,ch); return 34;
        case ')': token.append(1,ch); return 35;
        case '[': token.append(1,ch); return 36;
        case ']': token.append(1,ch); return 37;
        case '.': token.append(1,ch); return 38;
        case '~': token.append(1,ch); return 39;
        case '?': token.append(1,ch); return 40;
        case ':': token.append(1,ch); return 41;
        case ',': token.append(1,ch); return 42;
        case ';': token.append(1,ch); return 43;
        case '{': token.append(1,ch); return 44;
        case '}': token.append(1,ch); return 45;

        case '!':
            token.append(1,ch);
            return acceptChar('=',token) ? 47 : 46;
        case '-':
            token.append(1,ch);
            if(acceptChar('>',token)) return 49;
            if(acceptChar('-',token)) return 50;
            if(acceptChar('=',token)) return 51;
            return 48;
        case '+':
            token.append(1,ch);
            if(acceptChar('+',token)) return 53;
            if(acceptChar('=',token)) return 54;
            return 52;
        case '*':
            token.append(1,ch);
            return acceptChar('=',token) ? 56 : 55;
        case '&':
            token.append(1,ch);
            if(acceptChar('&',token)) return 58;
            if(acceptChar('=',token)) return 59;
            return 57;
        case '%':
            token.append(1,ch);
            return acceptChar('=',token) ? 61 : 60;
        case '<':
            token.append(1,ch);
            if(acceptChar('=',token)) return 63;
            if(acceptChar('<',token)) return acceptChar('=',token) ? 65 : 64;
            return 62;
        case '>':
            token.append(1,ch);
            if(acceptChar('=',token)) return 67;
            if(acceptChar('>',token)) return acceptChar('=',token) ? 69 : 68;
            return 66;
        case '=':
            token.append(1,ch);
            return acceptChar('=',token) ? 71 : 70;
        case '|':
            token.append(1,ch);
            if(acceptChar('|',token)) return 73;
            if(acceptChar('=',token)) return 74;
            return 72;
        case '^':
            token.append(1,ch);
            return acceptChar('=',token) ? 76 : 75;
        case '/':
            token.append(1,ch);
            if(acceptChar('=',token)) return 78;
            if(acceptChar('/',token)) return scanLineComment(token);
            if(acceptChar('*',token)) return scanBlockComment(token);
            return 77;

        case '#':  return scanDirective(token);
        case '\'': return scanCharLiteral(token);
        case '\"': return scanStringLiteral(token);

        default:
            return -1;   // unknown character or exhausted input
    }
}

#ifndef SCANNER_NO_MAIN
// Advances sourceCode past any whitespace at the cursor.
static void skipWhitespace(){
    while(!sourceCode.end()){
        if(!isSpace(sourceCode.nextChar())){
            sourceCode.retract();
            return;
        }
    }
}

// Prompts for a file path, scans the file and prints every token.
int main(){
    string inpath;
    string s,token;
    cout << "Please enter the source file's path:";
    if(!(cin >> inpath)){
        cerr << "No source file path given." << endl;
        return 1;
    }
    getchar();   // swallow the newline left behind by operator>>

    ifstream input(inpath.c_str(),ios::in);
    if(!input){
        cerr << "Cannot open source file: " << inpath << endl;
        return 1;
    }
    // get(char&) fails cleanly at EOF, so no sentinel byte is appended.
    char ch;
    while(input.get(ch)){
        s.append(1,ch);
    }
    sourceCode.setCode(s);

    // Skipping whitespace up front keeps trailing blanks/newlines from being
    // reported as an error token.
    skipWhitespace();
    while(!sourceCode.end())
    {
        cout << (scaner(token)) << "\t\t" ;
        cout << token << endl;
        skipWhitespace();
    }
    getchar();   // keep the console window open until Enter is pressed
    return 0;
}
#endif // SCANNER_NO_MAIN