// C语言词法分析器 (lexical analyzer for the C language)



#include <iostream>
#include <fstream>
#include <cctype>
#include <string>

using namespace std;


/**
 * Character cursor over the source text held in memory.
 *
 * nextChar() hands out characters one at a time and advances the cursor,
 * retract() steps the cursor back by one, and end() reports whether the
 * cursor has reached (or passed) the end of the text.
 *
 * Reads outside the text never touch the string: they yield '\0'. The cursor
 * itself still moves, so a nextChar()/retract() pair always cancels out, even
 * when the read happened past the end.
 */
class SourceCode{
private:
	std::string code;	// text being scanned
	int index;			// position of the next character nextChar() will return
	int length;			// cached code.length()

	// Returns the character at pos, or '\0' when pos is outside [0, length).
	char charAt(int pos) const{
		return (pos>=0 && pos<length) ? code[pos] : '\0';
	}

public:
	/** Creates an empty cursor; end() is true immediately. */
	SourceCode() : code(), index(0), length(0) {}

	/** Creates a cursor positioned at the first character of `code`. */
	SourceCode(const std::string &code)
		: code(code), index(0), length(static_cast<int>(code.length())) {}

	/** Replaces the text and rewinds the cursor to the beginning. */
	void setCode(const std::string &code){
		this->code=code;
		index=0;
		length=static_cast<int>(this->code.length());
	}

	/** Returns a copy of the whole text. */
	std::string getCode() const{
		return code;
	}

	/** Returns the current cursor position. */
	int getIndex() const{
		return index;
	}

	/** Returns the character under the cursor ('\0' outside the text) and advances the cursor. */
	char nextChar(){
		return charAt(index++);
	}

	/** Returns the character just before the cursor ('\0' if there is none). */
	char getPreChar() const{
		return charAt(index-1);
	}

	/** Moves the cursor one position back. */
	void retract(){
		index--;
	}

	/** Returns true once the cursor is at or beyond the end of the text. */
	bool end() const{
		return index>=length;
	}
};

// Global cursor over the text being scanned: main() loads it through setCode(),
// and the scanning functions in this file read characters from it.
SourceCode sourceCode;
// C keywords. The array index of a keyword is also its token code, and
// scaner() uses 32 for identifiers and 33 and up for other token kinds, so
// this table must keep exactly 32 entries.
std::string keyWords[]={"auto","short","int","long","float","double","char","struct","union","enum",
					"typedef","const","signed","unsigned","extern","register","static","volatile","void",
					"if","else","switch","case","for","do","while","goto","continue","break",
					"default","sizeof","return"};

// Preprocessor directive names (without '#'). scaner() reports a directive as
// its index + 82 and uses 91 for character constants, so this table must keep
// exactly 9 entries.
std::string precompiledDirectives[]={"include","define","undef","if","ifdef","ifndef","elif","endif","error"};

/**
 * Skips whitespace in sourceCode and returns the first non-blank character.
 *
 * @return the character found, or -1 when the text ran out while only
 *         whitespace was being read.
 */
char getbc(){
	char ch = sourceCode.nextChar();
	// isspace() is only defined for values representable as unsigned char (or EOF),
	// so bytes >= 0x80 (e.g. non-ASCII text in comments) must be converted first.
	while(!sourceCode.end() && isspace(static_cast<unsigned char>(ch))){
		ch = sourceCode.nextChar();
	}
	return isspace(static_cast<unsigned char>(ch)) ? static_cast<char>(-1) : ch;
}

/**
 * Like getbc(), but stops at line breaks: skips blanks other than '\r' and
 * '\n' and returns the first character that is not such a blank.
 *
 * @return the character found (possibly '\r' or '\n'), or -1 when the text
 *         ran out while only blanks were being read.
 */
char getbcBuLFandCR(){
	char ch = sourceCode.nextChar();
	// isspace() needs an unsigned char value; a plain negative char is undefined behaviour.
	bool skippable = ch!='\r' && ch!='\n' && isspace(static_cast<unsigned char>(ch));
	while(skippable && !sourceCode.end()){
		ch = sourceCode.nextChar();
		skippable = ch!='\r' && ch!='\n' && isspace(static_cast<unsigned char>(ch));
	}
	// Decide on the character itself, not on end(): end() is already true right
	// after the last character of the text has been read, and that character
	// may be perfectly valid.
	return skippable ? static_cast<char>(-1) : ch;
}

// Returns true when ch is an ASCII letter ('a'..'z' or 'A'..'Z').
bool letter(char ch){
	const bool isLower = ('a'<=ch) && (ch<='z');
	const bool isUpper = ('A'<=ch) && (ch<='Z');
	return isLower || isUpper;
}

// Returns true when ch is a decimal digit ('0'..'9').
bool digit(char ch){
	if(ch<'0') return false;
	return ch<='9';
}

// Returns true when ch is an octal digit ('0'..'7').
bool digitOctonary(char ch){
	const int value = ch-'0';
	return value>=0 && value<=7;
}

// Returns true when ch is a hexadecimal digit ('0'..'9', 'a'..'f' or 'A'..'F').
bool digitHexadecimal(char ch){
	if(ch>='0' && ch<='9') return true;
	if(ch>='a' && ch<='f') return true;
	return ch>='A' && ch<='F';
}

/**
 * Converts a string of hexadecimal digits (no "0x" prefix) to its value.
 *
 * @param s digits '0'..'9', 'a'..'f', 'A'..'F'
 * @return the value, 0 for an empty string, or -1 if s contains any other
 *         character. No overflow check is made, so s must be short enough
 *         for the result to fit in an int.
 */
int HexToDec(const std::string &s){
	int result=0;
	for(std::string::size_type i=0;i<s.length();i++){
		const char c=s[i];
		int value;
		if(c>='0' && c<='9')
			value=c-'0';
		else if(c>='a' && c<='f')
			value=c-'a'+10;		// 'a' stands for 10, not 0
		else if(c>='A' && c<='F')
			value=c-'A'+10;		// only A-F are hex digits
		else
			return -1;
		result=result*16+value;
	}
	return result;
}

// Converts a string of octal digits to its value.
// Returns -1 as soon as a character other than '0'..'7' is met; an empty
// string yields 0. Conversion stops at the terminating NUL.
int OctToDec(string s){
	int value=0;
	for(const char *p=s.c_str(); *p!='\0'; ++p){
		const bool isOctal = (*p>='0') && (*p<='7');
		if(!isOctal)
			return -1;
		value = value*8 + (*p-'0');
	}
	return value;
}

/**
 * Looks s up in the keyWords table.
 *
 * @return the index of the matching entry (which doubles as the keyword's
 *         token code), or 32 when s is not a keyword.
 */
int isKeyWords(const std::string &s){
	// Derive the bound from the table instead of hard-coding it, so the loop
	// cannot run past the array if the table is edited.
	const int count = static_cast<int>(sizeof(keyWords)/sizeof(keyWords[0]));
	for(int i=count-1;i>=0;i--){
		if(keyWords[i]==s)
			return i;
	}
	return 32;	// token code scaner() reports for plain identifiers
}

/**
 * Looks s up in the precompiledDirectives table.
 *
 * @return the index of the matching entry, or -1 when s is not a known
 *         directive name.
 */
int isPrecompiledDirectives(const std::string &s){
	// Derive the bound from the table instead of hard-coding it, so the loop
	// cannot run past the array if the table is edited.
	const int count = static_cast<int>(sizeof(precompiledDirectives)/sizeof(precompiledDirectives[0]));
	for(int i=count-1;i>=0;i--){
		if(precompiledDirectives[i]==s)
			return i;
	}
	return -1;
}

// Returns 1 when ch is one of the characters that may follow a backslash in
// a simple escape sequence (n r t v a b f ' " \ ?), otherwise 0.
int isSpecialChar(char ch){
	switch(ch){
		case 'n':
		case 'r':
		case 't':
		case 'v':
		case 'a':
		case 'b':
		case 'f':
		case '\'':
		case '\"':
		case '\\':
		case '?':
			return 1;
		default:
			return 0;
	}
}

/**
 * Decodes the escape sequence that follows a backslash. The backslash itself
 * must already have been consumed from sourceCode.
 *
 * Handles the simple escapes (\n \r \t \v \a \b \f \' \" \\ \?), \x or \X
 * followed by one or two hex digits, and one to three octal digits.
 *
 * @return the decoded character;
 *         -1 if \x is not followed by a hex digit (the cursor is moved back
 *            onto the 'x');
 *         -2 if the character after the backslash starts no known escape
 *            (the cursor is moved back onto that character).
 *
 * NOTE(review): the sentinels share the return type with real results, so a
 * sequence that decodes to the same char value as -1 or -2 (e.g. octal \377
 * where char is signed) cannot be told apart from an error by the caller.
 */
char getESC(){
	char ch = sourceCode.nextChar();
	std::string digits;

	switch(ch){
		case 'n':	return '\n';
		case 'r':	return '\r';
		case 't':	return '\t';
		case 'v':	return '\v';
		case 'a':	return '\a';
		case 'b':	return '\b';
		case 'f':	return '\f';
		case '\'':	return '\'';
		case '\"':	return '\"';
		case '\\':	return '\\';
		case '?':	return '\?';
		case 'x':
		case 'X':
			ch = sourceCode.nextChar();
			if(!digitHexadecimal(ch)){
				// Un-read both the offending character and the 'x'.
				sourceCode.retract();
				sourceCode.retract();
				return -1;
			}
			digits.append(1,ch);
			ch = sourceCode.nextChar();
			if(digitHexadecimal(ch))
				digits.append(1,ch);
			else
				sourceCode.retract();
			return static_cast<char>(HexToDec(digits));
		default:
			break;
	}

	if(!digitOctonary(ch)){
		sourceCode.retract();
		return -2;
	}

	// Octal escape: the first digit is in ch, up to two more may follow.
	digits.append(1,ch);
	ch = sourceCode.nextChar();
	for(int i=0;i<2 && digitOctonary(ch);i++){
		digits.append(1,ch);
		ch = sourceCode.nextChar();
	}
	sourceCode.retract();	// the last character read is not part of the escape
	return static_cast<char>(OctToDec(digits));
}

// Reads one more character: if it is '=', appends it to token and returns
// assignCode; otherwise pushes it back and returns plainCode.
static int scanOptionalAssign(std::string &token, int plainCode, int assignCode){
	const char next = sourceCode.nextChar();
	if(next=='='){
		token.append(1,next);
		return assignCode;
	}
	sourceCode.retract();
	return plainCode;
}

// Appends the rest of a "//" comment to token, up to (not including) the line
// break or the end of the text. Returns the line-comment code 79.
static int scanLineComment(std::string &token){
	// end() is tested before each read so that the last character of the
	// text still becomes part of the comment.
	while(!sourceCode.end()){
		const char next = sourceCode.nextChar();
		if(next=='\r' || next=='\n'){
			sourceCode.retract();
			break;
		}
		token.append(1,next);
	}
	return 79;
}

// Appends the rest of a block comment to token, through the closing "*/".
// Returns 80, or -1 if the text ends before the comment is closed.
static int scanBlockComment(std::string &token){
	// Only a '*' read after the opening "/*" can close the comment, so "/*/"
	// is not mistaken for a complete comment.
	bool afterStar = false;
	while(!sourceCode.end()){
		const char next = sourceCode.nextChar();
		token.append(1,next);
		if(afterStar && next=='/')
			return 80;
		afterStar = (next=='*');
	}
	return -1;
}

// Collects the body of a string literal (opening quote already consumed) into
// token, decoding escapes. Returns 92, or -1 on a line break, a bad \x
// escape, or the end of the text before the closing quote.
static int scanStringLiteral(std::string &token){
	while(!sourceCode.end()){
		char ch = sourceCode.nextChar();
		if(ch=='\"') return 92;
		if(ch=='\r' || ch=='\n') return -1;
		if(ch=='\\'){
			if(sourceCode.end()) return -1;
			ch = getESC();
			if(ch==-1) return -1;
			if(ch==-2) ch = sourceCode.nextChar();	// non-fatal escape: drop the backslash, keep the character
		}
		token.append(1,ch);
	}
	return -1;
}

/**
 * Scans the next token from sourceCode.
 *
 * @param token receives the token text (cleared first). For character and
 *              string constants it holds the decoded contents without quotes.
 * @return the token code, or -1 when no valid token could be read:
 *         0-31 keyword (index in keyWords), 32 identifier, 33 decimal integer,
 *         34-45 ( ) [ ] . ~ ? : , ; { }
 *         46 !  47 !=  48 -  49 ->  50 --  51 -=  52 +  53 ++  54 +=
 *         55 *  56 *=  57 &  58 &&  59 &=  60 %  61 %=
 *         62 <  63 <=  64 <<  65 <<=  66 >  67 >=  68 >>  69 >>=
 *         70 =  71 ==  72 |  73 ||  74 |=  75 ^  76 ^=
 *         77 /  78 /=  79 line comment  80 block comment
 *         81 lone '#'  82-90 preprocessor directive (index in
 *         precompiledDirectives + 82)  91 character constant  92 string constant
 */
int scaner(std::string &token){
	token.clear();
	char ch = getbc();

	// Identifier or keyword.
	if(ch=='_' || letter(ch)){
		while(ch=='_' || letter(ch) || digit(ch)){
			token.append(1,ch);
			ch = sourceCode.nextChar();
		}
		sourceCode.retract();
		return isKeyWords(token);	// yields 32 for a plain identifier
	}

	// Decimal integer.
	if(digit(ch)){
		while(digit(ch)){
			token.append(1,ch);
			ch = sourceCode.nextChar();
		}
		sourceCode.retract();
		return 33;
	}

	switch(ch){
		// Single-character punctuation.
		case '(':	token.append(1,ch);return 34;
		case ')':	token.append(1,ch);return 35;
		case '[':	token.append(1,ch);return 36;
		case ']':	token.append(1,ch);return 37;
		case '.':	token.append(1,ch);return 38;
		case '~':	token.append(1,ch);return 39;
		case '?':	token.append(1,ch);return 40;
		case ':':	token.append(1,ch);return 41;
		case ',':	token.append(1,ch);return 42;
		case ';':	token.append(1,ch);return 43;
		case '{':	token.append(1,ch);return 44;
		case '}':	token.append(1,ch);return 45;

		// Operators that are either alone or followed by '='.
		case '!':	token.append(1,ch);return scanOptionalAssign(token,46,47);
		case '*':	token.append(1,ch);return scanOptionalAssign(token,55,56);
		case '%':	token.append(1,ch);return scanOptionalAssign(token,60,61);
		case '=':	token.append(1,ch);return scanOptionalAssign(token,70,71);
		case '^':	token.append(1,ch);return scanOptionalAssign(token,75,76);

		case '-':
			token.append(1,ch);
			ch = sourceCode.nextChar();
			switch(ch){
				case '>':	token.append(1,ch);return 49;
				case '-':	token.append(1,ch);return 50;
				case '=':	token.append(1,ch);return 51;
				default:	sourceCode.retract();return 48;
			}

		case '+':
			token.append(1,ch);
			ch = sourceCode.nextChar();
			switch(ch){
				case '+':	token.append(1,ch);return 53;
				case '=':	token.append(1,ch);return 54;
				default:	sourceCode.retract();return 52;
			}

		case '&':
			token.append(1,ch);
			ch = sourceCode.nextChar();
			switch(ch){
				case '&':	token.append(1,ch);return 58;
				case '=':	token.append(1,ch);return 59;
				default:	sourceCode.retract();return 57;
			}

		case '|':
			token.append(1,ch);
			ch = sourceCode.nextChar();
			switch(ch){
				case '|':	token.append(1,ch);return 73;
				case '=':	token.append(1,ch);return 74;
				default:	sourceCode.retract();return 72;
			}

		case '<':
			token.append(1,ch);
			ch = sourceCode.nextChar();
			if(ch=='='){
				token.append(1,ch);
				return 63;
			}
			if(ch=='<'){
				token.append(1,ch);
				// "<<" or "<<=": the '=' belongs to the token and nothing after it is consumed.
				return scanOptionalAssign(token,64,65);
			}
			sourceCode.retract();
			return 62;

		case '>':
			token.append(1,ch);
			ch = sourceCode.nextChar();
			if(ch=='='){
				token.append(1,ch);
				return 67;
			}
			if(ch=='>'){
				token.append(1,ch);
				// ">>" or ">>=": the '=' belongs to the token and nothing after it is consumed.
				return scanOptionalAssign(token,68,69);
			}
			sourceCode.retract();
			return 66;

		case '/':
			token.append(1,ch);
			ch = sourceCode.nextChar();
			if(ch=='='){
				token.append(1,ch);
				return 78;
			}
			if(ch=='/'){
				token.append(1,ch);
				return scanLineComment(token);
			}
			if(ch=='*'){
				token.append(1,ch);
				return scanBlockComment(token);
			}
			sourceCode.retract();
			return 77;

		case '#':{
			ch = getbcBuLFandCR();
			if(ch=='\r' || ch=='\n'){
				// '#' alone on its line.
				token.append(1,'#');
				return 81;
			}
			while(letter(ch)){
				token.append(1,ch);
				ch = sourceCode.nextChar();
			}
			sourceCode.retract();

			const int directive = isPrecompiledDirectives(token);
			if(directive==-1) return -1;
			token.insert(token.begin(),'#');
			return directive+82;
		}

		case '\'':
			ch = sourceCode.nextChar();
			if(ch=='\'') return 91;		// '' is reported as a constant with empty text
			if(ch!='\\'){
				token.append(1,ch);
				ch = sourceCode.nextChar();
				if(ch=='\'') return 91;
				sourceCode.retract();
				return -1;
			}
			ch = getESC();
			if(ch==-1 || ch==-2) return -1;
			token.append(1,ch);
			ch = sourceCode.nextChar();
			return ch=='\'' ? 91 : -1;

		case '\"':
			return scanStringLiteral(token);

		default:
			break;
	}

	return -1;
}

/**
 * Asks for the path of a source file, loads the whole file into sourceCode
 * and prints one line per token: the token code, two tabs, the token text.
 * Waits for a key press before exiting.
 *
 * @return 0 on success, 1 if no path was entered or the file cannot be opened.
 */
int main(){
	std::string inpath;	// std::string instead of a fixed char buffer: no overflow on long paths

	std::cout << "Please enter the source file's path:";
	if(!(std::cin >> inpath)){
		std::cerr << "No source file path was entered." << '\n';
		return 1;
	}
	std::cin.get();	// consume the line break left behind by operator>>

	std::fstream input(inpath.c_str(),std::ios::in);
	if(!input.is_open()){
		// Reading from a stream that failed to open sets failbit but never
		// eofbit, so an eof()-controlled read loop would not terminate.
		std::cerr << "Cannot open source file: " << inpath << '\n';
		return 1;
	}

	std::string s;
	char ch;
	while(input.get(ch)){
		s.append(1,ch);
	}

	sourceCode.setCode(s);

	std::string token;
	while(!sourceCode.end())
	{
		const int code = scaner(token);
		std::cout << code << "\t\t" << token << '\n';
	}
	std::cout.flush();

	std::cin.get();	// keep the console window open until a key is pressed
	return 0;
}

// 你可能感兴趣的:(c语言词法分析器)