编译原理之标识符拆分


#pragma once
#include <cctype>
#include <string>
#include <unordered_set>

using namespace std;

// Reserved words of the C language known to the tokenizer. An identifier
// scanned by Tokenizer::next that is found in this set is reported as KEY.
// The element type is spelled out explicitly: deducing it from the string
// literals would yield a set of `const char*`, which compares addresses and
// cannot be queried with a std::string.
const std::unordered_set<std::string> keywords({
	"auto", "short", "int", "long", "float", "double",
	"char", "struct", "union", "enum", "typedef", "const",
	"unsigned", "signed", "extern", "register", "static",
	"volatile", "void", "if", "else", "switch", "case", "for",
	"do", "while", "goto", "continue", "break", "default",
	"sizeof", "return"
});
// Non-zero when the string `ch` is one of the keywords above.
#define iskey(ch) (keywords.count(ch))

// Characters the tokenizer treats as operators or punctuation. Besides
// starting an operator token, any of these also ends an identifier or a
// number. The element type is stated explicitly so the declaration does not
// depend on C++17 class template argument deduction.
const std::unordered_set<char> operators({
	'+', '-', '*' , '/', '%', '=',
	'!', '~', '&', '|', '(', ')',
	';', '>', '<'
});
// Non-zero when the character `ch` is one of the operator characters above.
#define isoper(ch) (operators.count(ch))

// Classification that Tokenizer::next assigns to the token it just read.
enum TokenType {
	KEY,	// word found in the keywords set
	ID,	// identifier: letters, digits and underscores, not a keyword
	NUM,	// numeric literal
	OP,	// operator or punctuation
	OTH,	// any other non-empty run of characters
	NONE	// nothing was read (the token is empty)
};

// Splits one statement into tokens. The statement is held internally together
// with a cursor; every call to next() reads one token starting at the cursor
// and advances past it.
class Tokenizer {
	std::string stmt;		// statement being scanned
	std::string::size_type idx;	// position in stmt of the next unread character

public:
	// Creates a tokenizer holding an empty statement.
	Tokenizer() : idx(0) {}

	// Creates a tokenizer over a copy of _stmt. The argument is only read, so
	// it is taken by const reference and may be a const string or a temporary.
	Tokenizer(const std::string& _stmt) : stmt(_stmt), idx(0) {}

	// Replaces the current statement by swapping contents with _stmt: after
	// the call _stmt holds the previous statement. The cursor is moved back to
	// the beginning so that scanning starts at the first character of the new
	// statement instead of at a stale position left over from the old one.
	void consume(std::string& _stmt) {
		stmt.swap(_stmt);
		idx = 0;
	}

	// Reads the next token into `token` and returns its classification.
	// Defined out of line.
	TokenType next(std::string& token);

	// Moves the cursor back to the beginning of the current statement.
	void reset() {
		idx = 0;
	}
};





#include "Tokenizer.h"

TokenType Tokenizer::next(string& token)
{
	token.clear();
	int cnt = 0;

	// skip spaces
	while (idx < stmt.length() && isspace(stmt[idx]))
		++idx;

	// parse token
	while (idx < stmt.length()) {
		// id
		if (isalpha(stmt[idx]) || stmt[idx] == '_') {
			size_t idx2 = idx;
			while (isalnum(stmt[idx]) || stmt[idx] == '_')
				if (++idx >= stmt.length())
					break;
			token.append(stmt.substr(idx2, idx - idx2));

			// check boundary
			if (!isspace(stmt[idx]) && !isoper(stmt[idx]))
				break;

			// check keywords
			if (iskey(token))
				return KEY;

			return ID;
		}

		// num
		else if (isalnum(stmt[idx]) && !isalpha(stmt[idx]) || stmt[idx] == '.') {
			// read hex preffix
			if (stmt[idx] == '0' && idx + 1 < stmt.length() && stmt[idx + 1] == 'x') {
				token.append(stmt.substr(idx, 2));
				idx += 2;
			}

			// read num and fp
			size_t idx2 = idx;
			while (isalnum(stmt[idx]) && !isalpha(stmt[idx]) || stmt[idx] == '.') {
				if (++idx >= stmt.length())
					break;
			}
			token.append(stmt.substr(idx2, idx - idx2));

			idx2 = idx;
			if (idx < stmt.length()) {
				// read suffix
			}

			// check boundary
			if (!isspace(stmt[idx]) && !isoper(stmt[idx]))
				break;

			return NUM;
		}

		// op 
		else if (isoper(stmt[idx])) {
			if (stmt[idx] == '+') {
				if (idx + 1 < stmt.length() && (stmt[idx + 1] == '+' || stmt[idx + 1] == '=')) {
					token.append(stmt.substr(idx, 2));
					idx += 2;
				}
				else
					token.append(1, stmt[idx++]);
			}
			else  if (stmt[idx] == '-') {
				if (idx + 1 < stmt.length() && (stmt[idx + 1] == '-' || stmt[idx + 1] == '=')) {
					token.append(stmt.substr(idx, 2));
					idx += 2;
				}
				else
					token.append(1, stmt[idx++]);
			}
			else if (stmt[idx] == '*' || stmt[idx] == '/' || stmt[idx] == '%'
				|| stmt[idx] == '&' || stmt[idx] == '|'
				|| stmt[idx] == '~' || stmt[idx] == '!' || stmt[idx] == '=') {
				if (idx + 1 < stmt.length() && stmt[idx + 1] == '=') {
					token.append(stmt.substr(idx, 2));
					idx += 2;
				}
				else
					token.append(1, stmt[idx++]);
			}
			else if (stmt[idx] == '>') {
				if (idx + 1 < stmt.length() && (stmt[idx + 1] == '>' || stmt[idx] == '=')) {
					token.append(stmt.substr(idx, 2));
					idx += 2;
				}
				else
					token.append(1, stmt[idx++]);
			}
			else if (stmt[idx] == '<') {
				if (idx + 1 < stmt.length() && (stmt[idx + 1] == '<' || stmt[idx] == '=')) {
					token.append(stmt.substr(idx, 2));
					idx += 2;
				}
				else
					token.append(1, stmt[idx++]);
			}
			else 	// ;, (, )
				token.append(1, stmt[idx++]);

			return OP;
		}

		// others
		else
			break;
	}

	// handle others
	int idx2 = idx;
	while (idx < stmt.length() && !isoper(stmt[idx]) && !isspace(stmt[idx]))
		++idx;
	token.append(stmt.substr(idx2, idx - idx2));

	if (token.empty())
		return NONE;
	else
		return OTH;
}




你可能感兴趣的:(编译原理,搜索,字符串操作,C/C++)