After repeated rounds of revising the code, testing, and analyzing the results, the lexical analyzer implements the features described below.
In particular, the preprocessing stage hit a snag while recognizing single-line and multi-line comments: because string and character constants must be supported, and those constants may themselves contain // or /*, string contents could be misread as comments. The escape character '\' made correctly recognizing strings another challenge. After several revisions to the matching code both problems were solved; the hypothetical inputs below illustrate the cases the preprocessor must handle.
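A minimal illustration (hypothetical input; everything before the trailing // annotation on each line must survive preprocessing intact):

char* a = "looks like a comment: // /*"; // the // and /* inside the string must be kept
char* b = "escaped quote: \" still inside"; // the \" must not terminate the string
char c = '\''; // an escaped quote inside a character constant
// only a line like this one is a real comment and is stripped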
Using the token stream produced by lexical analysis, the source file is printed in color according to each token's type code (see the sketch after this list):
Operators and delimiters: yellow.
Keywords: light blue.
Unsigned binary, octal, decimal, and hexadecimal integer constants: light red.
Identifiers: light green.
String constants: light yellow.
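A minimal sketch of that mapping, assuming the type codes assigned in Initialize() below (it restates the ranges that Highlight() checks inline):

static int PickColor(int Code)
{
if ((10 <= Code && Code <= 26) || (100 <= Code && Code <= 127)) return 6; // operators and delimiters: yellow
if ((1 <= Code && Code <= 7) || (27 <= Code && Code <= 53)) return 9; // keywords: light blue
if (Code == 9 || (204 <= Code && Code <= 206)) return 12; // integer constants: light red
if (Code == 8 || Code == 202 || Code == 203) return 10; // identifiers: light green
if (Code == 200 || Code == 201) return 14; // string and character constants: light yellow
return 15; // anything else: bright white
}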
When naive matching or regular-pattern matching fails, or an illegal character appears after a match completes but before the lookahead character is pushed back, the error is recorded and reported once analysis finishes.
Errors currently handled: unmatched delimiters ( ) ] } ), a multi-line comment missing its closing */, and undefined characters (@ ¥ `). The diagnostics follow the format shown below.
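Sample diagnostics (file name and line numbers are illustrative; the message text matches the format strings used in the code):

ERR: Test.c 40 no matching '*/' symbol was found.
ERR: Test.c 3 there is an unmatched ')' symbol.
ERR: Test.c 5 there is an undefined @ character whose ascii code is 64.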
One caveat: when the analyzer hits a lexical error it does not handle, the program may crash, as in the example below.
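For instance (hypothetical input): a string constant left open at the end of the file, such as

char* s = "abc

drives the while (true) loop in the string branch of TokenizeSpecial past the end of the buffer, since nothing bounds Index there.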
/**
* used to test keywords
* //ohhhhhhhhhhhhhhh
*/
void $Func$ForKey_2(int a, unsigned b)
{
start: if (1){}else{}
while (1) { continue; }
do{}while (1);
for (;;){ }}
switch (1){case 1: break;default: break;}
enum { run };
goto start;
}
struct FStruct{int val = 0000x1234;};
union FUnion{int val = 0B1 + 0b1 + 01 + 0X1 + 0x1;};
extern i;
// used to test operators
void __FuncForOperation5(const float c, long d, signed e, short f)
{
typedef int Moota;
register int i = 1234 + sizeof(Moota) + sizeof("/**/hello world//\\") + sizeof('\a');
volatile int j = i + 1 - 1 * 1 / 1 % 1 <= 1 < 1 > 1 >= 1 != 1 == 1;
auto int m = (i++) + (++i) + (i && i || i & i & i | i ^ i << 1) + i >> 1 + ~i;
m += 1;m -= 2;m *= 3;m /= 4;m %= 5;m <<= 6;m >>= 7;m &= 8;m |= 9;m ^= 10;
m = m ? 1 : 0;
struct FStruct* Ohhh;Ohhh->val;
struct FStruct Emmm;Emmm.val;
}
/*
int main(void)
{
$Func$ForKey_2(1, 1);__FuncForOperation5(1, 1, 1, 1);
return 0;
}
/** Copyright Moota, Private. All Rights Reserved. */
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <stack>
#include <vector>
#include "Windows.h"
/**
* Lexical analyzer
* The recognizable token categories are listed below (parenthesized entries go beyond the lab requirements)
*
* (1) Keywords
* if else while do for main return int float double char
* (type keywords: void unsigned long enum static short signed struct union)
* (control-statement keywords: continue break switch case default goto)
* (storage-class keywords: auto static extern register)
* (other keywords: const volatile sizeof typedef)
* (preprocessor directives: #xxx)
*
* (2) Operators
* = + - * / % < <= > >= != ==
* (arithmetic operators: ++ --)
* (relational operators)
* (logical operators: && || !)
* (bitwise operators: & | ^ ~ << >>)
* (compound assignment operators: += -= *= /= %= <<= >>= &= ^= |=)
* (miscellaneous operators: , ? : -> .)
*
* (3) Delimiters
* ; ( ) { }
* ([ ])
*
* (4) Constants
* regular expression digit1 digit2*; only unsigned decimal integer constants are required; digit1 is 1-9, digit2 is 0-9
* (unsigned binary integer constants: 0(B|b)(0-1)*)
* (unsigned octal integer constants: 0(0-7)*)
* (unsigned hexadecimal integer constants: 0(X|x)(0-9|a-f|A-F)*)
* (string constants: "(all char)*")
* (character constants: '(all char)*')
*
* (5) Identifiers
* regular expression letter(letter|digit)*, case-sensitive
* (_(letter|digit|$)*)
* ($(letter|digit|$)*)
*
* (6) Other symbols
* spaces (' '), tabs ('\t'), newlines ('\n'), and single/multi-line comments ("//", "\/\*") are discarded during lexical analysis
*/
class FLexicalAnalyzer
{
public:
FLexicalAnalyzer()
{
SetConsoleTitle(L"Lexical Analyzer V1.2.6");
ShowWindow(GetForegroundWindow(), SW_MAXIMIZE);
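// Code page 65001 = UTF-8, so the console renders the output correctly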
SetConsoleOutputCP(65001);
}
private:
// Utility functions
// Is the character lowercase?
static inline bool IsLower(char Data)
{
return 'a' <= Data and Data <= 'z';
}
// Is the character uppercase?
static inline bool IsUpper(char Data)
{
return 'A' <= Data and Data <= 'Z';
}
// Is the character a letter?
static inline bool IsLetter(char Data)
{
return IsLower(Data) || IsUpper(Data);
}
// Is the character a digit?
static inline bool IsDigit(char Data)
{
return '0' <= Data and Data <= '9';
}
// Read a file into a string
std::string ReadFile(const std::string& FilePath)
{
std::string Result;
std::ifstream InputFile(FilePath, std::ios::in);
if (!InputFile.is_open())
{
std::cerr << "The file failed to open, perhaps you need to change the file path." << "\n";
return Result;
}
char Ch;
while (InputFile.peek() != EOF)
{
InputFile.get(Ch);
Result += Ch;
}
InputFile.close();
TokenizeData.FilePath = FilePath;
return Result;
}
// Save a string to a file
static void SaveFile(const std::string& Data, const std::string& FilePath)
{
std::ofstream OutputFile(FilePath, std::ios::out);
OutputFile << Data;
OutputFile.close();
}
private:
std::map<std::string, int> KeywordMap; // Keyword symbol table
std::map<std::string, int> RegularMap; // Regular-pattern (constant, identifier) symbol table
std::map<std::string, int> SpecialMap; // Special-symbol (operator, delimiter) symbol table
// Token record
struct FTokenData
{
std::string Token; // Token lexeme
int Code = 0; // Type code
};
// Lexical analysis state
struct FTokenizeData
{
std::string FilePath; // Source file path
std::string Result; // Final token stream
std::string Data; // Data being analyzed
size_t Size = 0; // Data length
size_t Index = 0; // Current character index
char Start = ' '; // Current character
FTokenData TokenData; // Current token info
std::stack<int> Delimiter[3]; // Delimiter stacks for ( ), [ ], { }
size_t Lines = 0; // Current line during analysis
std::vector<std::string> Errors; // Error pool
std::vector<std::string> Warnings; // Warning pool
} TokenizeData;
/**
* Initialize the symbol tables
*/
void Initialize()
{
KeywordMap["main"] = 1;
KeywordMap["if"] = 2;
KeywordMap["else"] = 3;
KeywordMap["while"] = 4;
KeywordMap["do"] = 5;
KeywordMap["for"] = 6;
KeywordMap["return"] = 7;
RegularMap["letter(letter|digit)*"] = 8;
RegularMap["digit digit*"] = 9;
RegularMap["\"(all char)*\""] = 200;
RegularMap["\'(all char)*\'"] = 201;
RegularMap["_(letter|digit|$)*"] = 202;
RegularMap["$(letter|digit|$)*"] = 203;
RegularMap["0(B|b)(0-1)*"] = 204;
RegularMap["0(0-7)*"] = 205;
RegularMap["0(X|x)(0-9 a-f)*"] = 206;
SpecialMap["+"] = 10;
SpecialMap["-"] = 11;
SpecialMap["*"] = 12;
SpecialMap["/"] = 13;
SpecialMap["%"] = 14;
SpecialMap[">"] = 15;
SpecialMap[">="] = 16;
SpecialMap["<"] = 17;
SpecialMap["<="] = 18;
SpecialMap["=="] = 19;
SpecialMap["!="] = 20;
SpecialMap["="] = 21;
SpecialMap[";"] = 22;
SpecialMap["("] = 23;
SpecialMap[")"] = 24;
SpecialMap["{"] = 25;
SpecialMap["}"] = 26;
SpecialMap["["] = 100;
SpecialMap["]"] = 101;
SpecialMap[","] = 102;
SpecialMap[":"] = 103;
SpecialMap["++"] = 104;
SpecialMap["--"] = 105;
SpecialMap["&&"] = 106;
SpecialMap["||"] = 107;
SpecialMap["!"] = 108;
SpecialMap["&"] = 109;
SpecialMap["|"] = 110;
SpecialMap["^"] = 111;
SpecialMap["~"] = 112;
SpecialMap["<<"] = 113;
SpecialMap[">>"] = 114;
SpecialMap["+="] = 115;
SpecialMap["-="] = 116;
SpecialMap["*="] = 117;
SpecialMap["/="] = 118;
SpecialMap["%="] = 119;
SpecialMap["<<="] = 120;
SpecialMap[">>="] = 121;
SpecialMap["&="] = 122;
SpecialMap["|="] = 123;
SpecialMap["^="] = 124;
SpecialMap["?"] = 125;
SpecialMap["->"] = 126;
SpecialMap["."] = 127;
KeywordMap["int"] = 27;
KeywordMap["float"] = 28;
KeywordMap["double"] = 29;
KeywordMap["char"] = 30;
KeywordMap["void"] = 31;
KeywordMap["unsigned"] = 32;
KeywordMap["long"] = 33;
KeywordMap["const"] = 34;
KeywordMap["continue"] = 35;
KeywordMap["break"] = 36;
KeywordMap["enum"] = 37;
KeywordMap["switch"] = 38;
KeywordMap["case"] = 39;
KeywordMap["static"] = 40;
KeywordMap["auto"] = 41;
KeywordMap["short"] = 42;
KeywordMap["signed"] = 43;
KeywordMap["struct"] = 44;
KeywordMap["union"] = 45;
KeywordMap["goto"] = 46;
KeywordMap["default"] = 47;
KeywordMap["extern"] = 48;
KeywordMap["register"] = 49;
KeywordMap["volatile"] = 50;
KeywordMap["sizeof"] = 51;
KeywordMap["typedef"] = 52;
KeywordMap["#"] = 53;
std::cout << "///\n";
}
/**
* Preprocess the input data
* Strips comments and converts tab characters ('\t') to spaces
* Spaces are skipped automatically during lexical analysis
* Newlines are kept so lexical analysis can report line numbers
*/
std::string Preprocess(const std::string& Data)
{
std::string Result;
const size_t Size = Data.size();
size_t Lines = 0;
for (size_t i = 0; i < Size; ++i)
{
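// Copy string constants verbatim (honoring \ escapes) so // or /* inside them is not mistaken for a comment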
if (Data[i] == '"')
{
Result += Data[i];
++i;
while (i < Size && Data[i] != '"')
{
if (Data[i] == '\\')
{
Result += Data[i];
++i;
}
if (i < Size)
{
Result += Data[i];
++i;
}
}
if (Data[i] == '"')
{
Result += Data[i];
}
}
else if (Data[i] == '\'')
{
Result += Data[i];
++i;
while (i < Size && Data[i] != '\'')
{
if (Data[i] == '\\')
{
Result += Data[i];
++i;
}
if (i < Size)
{
Result += Data[i];
++i;
}
}
if (i < Size && Data[i] == '\'')
{
Result += Data[i];
}
}
else if (Data[i] == '/' && (i + 1) < Size && Data[i + 1] == '/') // strip single-line comments
{
while (i < Size && Data[i] != '\n')
{
++i;
}
if (i < Size && Data[i] == '\n')
{
++Lines;
Result += Data[i];
}
}
else if (Data[i] == '/' && (i + 1) < Size && Data[i + 1] == '*') // strip multi-line comments
{
i = i + 2;
bool IsFound = false;
while (i < Size)
{
if (Data[i] == '\n')
{
++Lines;
}
if (Data[i] == '*' && (i + 1) < Size && Data[i + 1] == '/')
{
IsFound = true;
i = i + 1; // leave i on the '/' so the for-loop's ++i moves just past the comment
break;
}
++i;
}
if (!IsFound)
{
TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(Lines) + " " + "no matching '*/' symbol was found.");
}
}
else if (Data[i] == '\n')
{
++Lines;
Result += Data[i];
}
else if (Data[i] == '\t') // convert tabs to spaces
{
Result += ' ';
}
else
{
Result += Data[i];
}
}
return Result;
}
/**
* Get the next valid character, skipping spaces
*/
bool GetValidChar()
{
bool Result = false;
while (TokenizeData.Index < TokenizeData.Size)
{
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start != ' ')
{
Result = true;
break;
}
}
return Result;
}
/**
* Recognize tokens that start with a letter:
* 1. keywords - letter(letter)*
* 2. identifiers - letter(letter|digit)*
*/
void TokenizeAlpha()
{
do
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
while (IsLetter(TokenizeData.Start) || IsDigit(TokenizeData.Start) || TokenizeData.Start == '$'|| TokenizeData.Start == '_');
const auto Found = KeywordMap.find(TokenizeData.TokenData.Token);
if (Found != KeywordMap.end())
{
TokenizeData.TokenData.Code = Found->second;
}
else
{
TokenizeData.TokenData.Code = RegularMap["letter(letter|digit)*"];
}
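// Push back the one-character lookahead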
TokenizeData.Index--;
}
/**
* Recognize tokens that start with a digit:
* 1. unsigned decimal integer constants - digit digit*
* 2. unsigned binary, octal, and hexadecimal constants starting with 0
*/
void TokenizeDigit()
{
if ('1' <= TokenizeData.Start && TokenizeData.Start <= '9')
{
do
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
while (IsDigit(TokenizeData.Start));
TokenizeData.TokenData.Code = RegularMap["digit digit*"];
TokenizeData.Index--;
}
else
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == 'B' || TokenizeData.Start == 'b')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
do
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
while (TokenizeData.Start == '0' || TokenizeData.Start == '1');
TokenizeData.TokenData.Code = RegularMap["0(B|b)(0-1)*"];
TokenizeData.Index--;
}
else if (TokenizeData.Start == 'X' || TokenizeData.Start == 'x')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
do
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
while (IsDigit(TokenizeData.Start) || ('a' <= TokenizeData.Start and TokenizeData.Start <= 'f') || ('A' <= TokenizeData.Start and TokenizeData.Start <= 'F'));
TokenizeData.TokenData.Code = RegularMap["0(X|x)(0-9 a-f)*"];
TokenizeData.Index--;
}
else if ('0' <= TokenizeData.Start and TokenizeData.Start <= '7')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
do
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
while ('0' <= TokenizeData.Start and TokenizeData.Start <= '7');
TokenizeData.TokenData.Code = RegularMap["0(0-7)*"];
TokenizeData.Index--;
}
else
{
TokenizeData.TokenData.Code = RegularMap["digit digit*"];
TokenizeData.Index--;
}
}
}
/**
* Recognize tokens that start with a non-letter, non-digit character:
* 1. operators
* 2. delimiters
* 3. string/character constants, _ and $ identifiers, and # directives
*/
void TokenizeSpecial()
{
for (auto& Special : SpecialMap)
{
if (TokenizeData.Start == Special.first[0])
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = Special.second;
break;
}
}
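// Track bracket balance so unmatched delimiters can be reported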
if (TokenizeData.Start == '(')
{
TokenizeData.Delimiter[0].push(TokenizeData.Start);
}
if (TokenizeData.Start == '[')
{
TokenizeData.Delimiter[1].push(TokenizeData.Start);
}
else if (TokenizeData.Start == '{')
{
TokenizeData.Delimiter[2].push(TokenizeData.Start);
}
else if (TokenizeData.Start == ')')
{
if (!TokenizeData.Delimiter[0].empty())
{
TokenizeData.Delimiter[0].pop();
}
else
{
TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an unmatched ')' symbol.");
}
}
else if (TokenizeData.Start == ']')
{
if (!TokenizeData.Delimiter[1].empty())
{
TokenizeData.Delimiter[1].pop();
}
else
{
TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an unmatched ']' symbol.");
}
}
else if (TokenizeData.Start == '}')
{
if (!TokenizeData.Delimiter[2].empty())
{
TokenizeData.Delimiter[2].pop();
}
else
{
TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an unmatched '}' symbol.");
}
}
else if (TokenizeData.Start == '!')
{
TokenizeData.TokenData.Code = SpecialMap["!"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["!="];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Code = SpecialMap["="];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["=="];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '<')
{
TokenizeData.TokenData.Code = SpecialMap["<"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["<="];
}
else if (TokenizeData.Start == '<')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["<<"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["<<="];
}
else
{
TokenizeData.Index--;
}
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '>')
{
TokenizeData.TokenData.Code = SpecialMap[">"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap[">="];
}
else if (TokenizeData.Start == '>')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap[">>"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap[">>="];
}
else
{
TokenizeData.Index--;
}
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '+')
{
TokenizeData.TokenData.Code = SpecialMap["+"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["+="];
}
else if (TokenizeData.Start == '+')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["++"];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '-')
{
TokenizeData.TokenData.Code = SpecialMap["-"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["-="];
}
else if (TokenizeData.Start == '-')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["--"];
}
else if (TokenizeData.Start == '>')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["->"];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '*')
{
TokenizeData.TokenData.Code = SpecialMap["*"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["*="];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '/')
{
TokenizeData.TokenData.Code = SpecialMap["/"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["/="];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '%')
{
TokenizeData.TokenData.Code = SpecialMap["%"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["%="];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '&')
{
TokenizeData.TokenData.Code = SpecialMap["&"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["&="];
}
else if (TokenizeData.Start == '&')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["&&"];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '|')
{
TokenizeData.TokenData.Code = SpecialMap["|"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["|="];
}
else if (TokenizeData.Start == '|')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["||"];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '^')
{
TokenizeData.TokenData.Code = SpecialMap["^"];
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
if (TokenizeData.Start == '=')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.TokenData.Code = SpecialMap["^="];
}
else
{
TokenizeData.Index--;
}
}
else if (TokenizeData.Start == '"')
{
TokenizeData.TokenData.Code = RegularMap["\"(all char)*\""];
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
while (true)
{
if (TokenizeData.Start == '"')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
break;
}
if (TokenizeData.Start == '\\')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
TokenizeData.Index--;
}
else if (TokenizeData.Start == '\'')
{
TokenizeData.TokenData.Code = RegularMap["\'(all char)*\'"];
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
while (true)
{
if (TokenizeData.Start == '\'')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
break;
}
if (TokenizeData.Start == '\\')
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
TokenizeData.Index--;
}
else if (TokenizeData.Start == '_')
{
TokenizeData.TokenData.Code = RegularMap["_(letter|digit|$)*"];
do
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
while (IsLetter(TokenizeData.Start) || IsDigit(TokenizeData.Start) || TokenizeData.Start == '$' || TokenizeData.Start == '_');
TokenizeData.Index--;
}
else if (TokenizeData.Start == '$')
{
TokenizeData.TokenData.Code = RegularMap["$(letter|digit|$)*"];
do
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
while (IsLetter(TokenizeData.Start) || IsDigit(TokenizeData.Start) || TokenizeData.Start == '$' || TokenizeData.Start == '_');
TokenizeData.Index--;
}
else if (TokenizeData.Start == '#')
{
TokenizeData.TokenData.Code = KeywordMap["#"];
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
while (IsLetter(TokenizeData.Start))
{
TokenizeData.TokenData.Token += TokenizeData.Start;
TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
}
TokenizeData.Index--;
}
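// Nothing matched: record the character as an undefined-symbol error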
if (TokenizeData.TokenData.Code == 0)
{
TokenizeData.TokenData.Token = TokenizeData.Start;
TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an undefined " +
TokenizeData.TokenData.Token + " character whose ascii code is " + std::to_string(static_cast<int>(TokenizeData.TokenData.Token[0])) + ".");
}
}
/**
* Tokenize the preprocessed data
*/
std::string Tokenize(const std::string& Data)
{
TokenizeData.Data = Data;
TokenizeData.Index = 0;
TokenizeData.Size = Data.size();
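// Dispatch on each token's first significant character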
while (GetValidChar())
{
TokenizeData.TokenData = {"", 0};
if (TokenizeData.Start == '\n')
{
TokenizeData.Lines++;
}
else if (IsLetter(TokenizeData.Start))
{
TokenizeAlpha();
}
else if (IsDigit(TokenizeData.Start))
{
TokenizeDigit();
}
else
{
TokenizeSpecial();
}
if (!TokenizeData.TokenData.Token.empty())
{
TokenizeData.Result += "(" + TokenizeData.TokenData.Token + "," + std::to_string(TokenizeData.TokenData.Code) + ")" + "\n";
}
}
return TokenizeData.Result;
}
/** Print colored text
* 0=black 1=blue 2=green 3=aqua
* 4=red 5=purple 6=yellow 7=white
* 8=gray 9=light blue 10=light green 11=light aqua
* 12=light red 13=light purple 14=light yellow 15=bright white
* @param ForeColor text color
* @param BackColor background color
*/
static void SetFontColor(int ForeColor, int BackColor)
{
SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), static_cast<WORD>(ForeColor + BackColor * 0x10));
}
// Templated for convenience; T is any type std::cout can print
template <typename T>
static void CoutWithColor(T t, int ForeColor = 7, int BackColor = 0)
{
SetFontColor(ForeColor, BackColor);
std::cout << t;
SetFontColor(7, 0);
Sleep(4);
}
/**
* Highlight the recognized tokens
*/
static void Highlight(const std::string& SourceCode, const std::string& TokenData)
{
const size_t CodeSize = SourceCode.size();
const size_t ContentSize = TokenData.size();
size_t Index = 0;
size_t Lines = 0;
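// Index walks the token stream in lockstep with the characters printed from SourceCode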
CoutWithColor(std::to_string(Lines) + "\t", 7);
++Lines;
for (size_t i = 0; i < CodeSize; ++i)
{
if (SourceCode[i] == '\n')
{
CoutWithColor(SourceCode[i] + std::to_string(Lines) + "\t", 7);
++Lines;
}
else if (SourceCode[i] == ' ' || SourceCode[i] == '\t')
{
CoutWithColor(SourceCode[i], 7);
}
else if (SourceCode[i] == '/' && (i + 1) < CodeSize && SourceCode[i + 1] == '/')
{
std::string Temp;
while (i < CodeSize && SourceCode[i] != '\n')
{
Temp += SourceCode[i++];
}
i--;
CoutWithColor(Temp, 2);
}
else if (SourceCode[i] == '/' && (i + 1) < CodeSize && SourceCode[i + 1] == '*')
{
std::string Temp;
Temp += SourceCode[i];
Temp += SourceCode[i + 1];
i = i + 2;
while (i < CodeSize)
{
if (SourceCode[i] == '\n')
{
CoutWithColor(Temp, 2);
CoutWithColor(SourceCode[i] + std::to_string(Lines) + "\t", 7);
++Lines;
++i;
Temp = "";
}
else if (SourceCode[i] == '*' && (i + 1) < CodeSize && SourceCode[i + 1] == '/')
{
Temp += SourceCode[i++];
Temp += SourceCode[i++];
break;
}
else
{
Temp += SourceCode[i++];
}
}
CoutWithColor(Temp, 2);
i--;
}
else
{
std::string Doc;
while (Index < ContentSize && TokenData[Index] != '\n')
{
Doc += TokenData[Index++];
}
if (Index < ContentSize && TokenData[Index] == '\n')
{
Index++;
}
size_t Mid = 0;
std::string Token;
int Code = 0;
// Find the ',' separator (scan from the right so tokens that themselves contain ',' are handled)
for (size_t Pos = Doc.size() - 1; Pos != 0; --Pos)
{
if (Doc[Pos] != ',')
{
Mid = Pos - 1;
}
else
{
break;
}
}
// Extract the token
for (size_t Start = 1; Start < Mid; ++Start)
{
Token += Doc[Start];
}
// Extract the type code
for (size_t Start = Mid + 1; Start < (Doc.size() - 1); ++Start)
{
Code = Code * 10 + (Doc[Start] - '0');
}
// Pick the color for this token type
int ForeColor = 15;
if ((10 <= Code and Code <= 26) or (100 <= Code and Code <= 127)) // operators and delimiters
{
ForeColor = 6;
}
else if ((1 <= Code and Code <= 7) or (27 <= Code and Code <= 53)) // keywords
{
ForeColor = 9;
}
else if (Code == 9 or (204 <= Code and Code <= 206)) // unsigned binary, octal, decimal, and hexadecimal integer constants
{
ForeColor = 12;
}
else if (Code == 8 or Code == 202 or Code == 203) // identifiers
{
ForeColor = 10;
}
else if (Code == 200 or Code == 201) // string and character constants
{
ForeColor = 14;
}
CoutWithColor(Token, ForeColor);
i += Token.size() - 1;
}
}
CoutWithColor("\n", 7);
}
/**
* Show the lexical analysis results
*/
void ShowResult() const
{
std::cout << "///\n";
CoutWithColor("VER: Copyright moota, private. all rights reserved.\n", 7);
CoutWithColor("INF: Lexical analysis is finished, " + std::to_string(TokenizeData.Warnings.size()) + " warnings, " +
std::to_string(TokenizeData.Errors.size()) + " errors.\n", 14);
for (auto& Info : TokenizeData.Warnings)
{
CoutWithColor(Info + "\n", 9);
}
for (auto& Info : TokenizeData.Errors)
{
CoutWithColor(Info + "\n", 12);
}
}
public:
/**
* Exercise the lexical analyzer
*/
void Main()
{
// Initialize the lexical analyzer
Initialize();
// Read the file to process
const std::string Doc = ReadFile("Test.c");
SaveFile(Doc, "TestRead.txt");
// Preprocess the input data
const std::string PreDoc = Preprocess(Doc);
SaveFile(PreDoc, "TestPre.txt");
// Tokenize
const std::string Token = Tokenize(PreDoc);
SaveFile(Token, "TestToken.txt");
// Highlight the recognized tokens
Highlight(Doc, Token);
// Show the analysis results
ShowResult();
getchar();
}
};
int main()
{
FLexicalAnalyzer LexicalAnalyzer;
LexicalAnalyzer.Main();
return 0;
}