#pragma once
#include <cctype>
#include <cstddef>
#include <string>
#include <unordered_set>
using namespace std;
// Reserved words of C. Tokenizer::next reports an identifier found in this
// set as KEY rather than ID.
// The element type is spelled out so that lookups hash and compare the text
// of the word; a deduced element type would be const char*, which compares
// pointer values and cannot be queried with a std::string.
const std::unordered_set<std::string> keywords({
    "auto", "short", "int", "long", "float", "double",
    "char", "struct", "union", "enum", "typedef", "const",
    "unsigned", "signed", "extern", "register", "static",
    "volatile", "void", "if", "else", "switch", "for",
    "do", "while", "goto", "continue", "break", "case", "default",
    "sizeof", "return"
});
// Non-zero when the string `ch` is one of the reserved words above.
#define iskey(ch) (keywords.count(ch))
// Characters that start an operator or punctuator token. They also terminate
// identifiers, numbers and unclassified text in Tokenizer::next.
// The element type is explicit so the declaration does not depend on class
// template argument deduction.
const std::unordered_set<char> operators({
    '+', '-', '*', '/', '%', '=',
    '!', '~', '&', '|', '(', ')',
    ';', '>', '<'
});
// Non-zero when the character `ch` is one of the operator characters above.
#define isoper(ch) (operators.count(ch))
// Classification that Tokenizer::next reports for the token it has just read.
enum TokenType {
    KEY,   // identifier that is listed in the keyword set
    ID,    // identifier that is not a keyword
    NUM,   // numeric literal
    OP,    // operator or punctuator built from the operator characters
    OTH,   // any other non-empty run of characters
    NONE   // nothing was read: the returned token is empty
};
// Splits one statement of source text into tokens, one per call to next().
class Tokenizer {
    std::string stmt;            // text being tokenized
    std::string::size_type idx;  // offset in stmt of the next unread character
public:
    // Creates a tokenizer with no text; next() reports NONE until consume()
    // supplies some.
    Tokenizer() : idx(0) {}
    // Creates a tokenizer over a copy of _stmt. Taking a const reference lets
    // callers pass const strings and temporaries as well.
    Tokenizer(const std::string& _stmt) : stmt(_stmt), idx(0) {}
    // Swaps the tokenizer's text with _stmt: the tokenizer takes over the
    // caller's string without copying and the caller receives the previous
    // text. The read position is rewound so that it never refers to an offset
    // of the old text.
    void consume(std::string& _stmt) {
        stmt.swap(_stmt);
        idx = 0;
    }
    // Reads the next token into `token` and returns its classification.
    TokenType next(std::string& token);
    // Rewinds to the beginning of the current text.
    void reset() {
        idx = 0;
    }
};
#include "Tokenizer.h"
TokenType Tokenizer::next(string& token)
{
token.clear();
int cnt = 0;
// skip spaces
while (idx < stmt.length() && isspace(stmt[idx]))
++idx;
// parse token
while (idx < stmt.length()) {
// id
if (isalpha(stmt[idx]) || stmt[idx] == '_') {
size_t idx2 = idx;
while (isalnum(stmt[idx]) || stmt[idx] == '_')
if (++idx >= stmt.length())
break;
token.append(stmt.substr(idx2, idx - idx2));
// check boundary
if (!isspace(stmt[idx]) && !isoper(stmt[idx]))
break;
// check keywords
if (iskey(token))
return KEY;
return ID;
}
// num
else if (isalnum(stmt[idx]) && !isalpha(stmt[idx]) || stmt[idx] == '.') {
// read hex preffix
if (stmt[idx] == '0' && idx + 1 < stmt.length() && stmt[idx + 1] == 'x') {
token.append(stmt.substr(idx, 2));
idx += 2;
}
// read num and fp
size_t idx2 = idx;
while (isalnum(stmt[idx]) && !isalpha(stmt[idx]) || stmt[idx] == '.') {
if (++idx >= stmt.length())
break;
}
token.append(stmt.substr(idx2, idx - idx2));
idx2 = idx;
if (idx < stmt.length()) {
// read suffix
}
// check boundary
if (!isspace(stmt[idx]) && !isoper(stmt[idx]))
break;
return NUM;
}
// op
else if (isoper(stmt[idx])) {
if (stmt[idx] == '+') {
if (idx + 1 < stmt.length() && (stmt[idx + 1] == '+' || stmt[idx + 1] == '=')) {
token.append(stmt.substr(idx, 2));
idx += 2;
}
else
token.append(1, stmt[idx++]);
}
else if (stmt[idx] == '-') {
if (idx + 1 < stmt.length() && (stmt[idx + 1] == '-' || stmt[idx + 1] == '=')) {
token.append(stmt.substr(idx, 2));
idx += 2;
}
else
token.append(1, stmt[idx++]);
}
else if (stmt[idx] == '*' || stmt[idx] == '/' || stmt[idx] == '%'
|| stmt[idx] == '&' || stmt[idx] == '|'
|| stmt[idx] == '~' || stmt[idx] == '!' || stmt[idx] == '=') {
if (idx + 1 < stmt.length() && stmt[idx + 1] == '=') {
token.append(stmt.substr(idx, 2));
idx += 2;
}
else
token.append(1, stmt[idx++]);
}
else if (stmt[idx] == '>') {
if (idx + 1 < stmt.length() && (stmt[idx + 1] == '>' || stmt[idx] == '=')) {
token.append(stmt.substr(idx, 2));
idx += 2;
}
else
token.append(1, stmt[idx++]);
}
else if (stmt[idx] == '<') {
if (idx + 1 < stmt.length() && (stmt[idx + 1] == '<' || stmt[idx] == '=')) {
token.append(stmt.substr(idx, 2));
idx += 2;
}
else
token.append(1, stmt[idx++]);
}
else // ;, (, )
token.append(1, stmt[idx++]);
return OP;
}
// others
else
break;
}
// handle others
int idx2 = idx;
while (idx < stmt.length() && !isoper(stmt[idx]) && !isspace(stmt[idx]))
++idx;
token.append(stmt.substr(idx2, idx - idx2));
if (token.empty())
return NONE;
else
return OTH;
}