// 为进一步熟悉编译原理中词法分析的实现过程,采用C++语言实现一个简单的针对C语言的词法分析器。此程序只能分析合法C语言代码段并生成token序列,无法进行预处理或错误识别。
// C++实现的简易C语言词法分析程序
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
/*共有如下六类字符表
第一类:标识符(iT) (_ | a~z | A~Z)(_ | a~z | A~Z | 0~9)*
第二类:常数(CT) (1~9)(0~9)*| 0(0~7)* | 0x(0~9 | a~f| A~F)+
第三类:关键字(kT)(32) 独立定义
第四类:界符与运算符(pT) 独立定义
第五类:字符(cT) '(o_letter | \(s_letter | x(0)*(digit | 空)(digit | 空)(digit | 空) | (0~7 | 空)(0~7 | 空)(0~7 | 空)))'
第六类: 字符串(sT) "(字符 | digit)*"
*/
/*——————————————————程序开始——————————————————*/
// Global state shared by all scanning routines: file streams, lexeme buffer, current character.
fstream in, out; // `in` is the source text being scanned; `out` receives the emitted token sequence
string token;    // scratch buffer in which the current lexeme is assembled
char tmp;        // character most recently read from `in`
// Keyword table: the 32 reserved words of C.
// A keyword is reported as <kT,n>, where n is its 1-based position in this
// table, so the order of the entries must not be changed.
static char kT[32][20] = {
"auto", "double", "int", "struct", "break", "else",
"long", "switch", "case", "enum", "register", "typedef",
"char", "extern", "return", "union", "const", "float",
"short", "unsigned", "continue", "for", "signed", "void",
"default", "goto", "sizeof", "volatile", "do", "while",
"static", "if"
};
// Delimiter / operator table.
// A symbol is reported as <pT,n>, where n is its 1-based position in this
// table. get_pT and get_other hard-code several of these positions, so the
// order of the entries must not be changed.
// NOTE: "{" and "}" appear twice (positions 30/31 and 35/36). The lookup in
// get_pT scans upwards and stops at the first match, so 35 and 36 are never
// reported.
static char pT[43][10] = {
"+", "+=", "++", "-", "-=", "--", "*", "*=", "/", "/=", // positions 1-10
"<", "<=", ">", ">=", "=", "==", "!", "!=", "&", "&&", // positions 11-20
"|", "||", "%", "%=", "<<", ">>", "->", "[", "]", "{", // positions 21-30
"}", ".", "\?", ":", "{", "}", ";", "(", ")", "^", // positions 31-40
",", "#", "~" // positions 41-43
};
vector<string> cT;// character-literal table; entries include the single quotes
vector<string> sT;// string-literal table; entries include the double quotes
vector<string> iT;// identifier table
vector<double> CT;// numeric-constant table
//判定函数
// True when ch is an ASCII letter, upper or lower case.
bool is_atoZ(char ch){
    const bool upper = ('A' <= ch) && (ch <= 'Z');
    const bool lower = ('a' <= ch) && (ch <= 'z');
    return upper || lower;
}
// True when ch is a non-zero decimal digit ('1'..'9').
bool is_1to9(char ch){
    return !(ch < '1' || ch > '9');
}
// True when ch is a decimal digit ('0'..'9').
bool is_0to9(char ch){
    return ('0' <= ch) && (ch <= '9');
}
// True when ch is a non-zero octal digit ('1'..'7').
bool is_1to7(char ch){
    if(ch < '1')
        return false;
    return ch <= '7';
}
// True when ch is an octal digit ('0'..'7').
bool is_0to7(char ch){
    return !(ch < '0') && !(ch > '7');
}
// True when ch is a hexadecimal digit: '0'..'9', 'A'..'F' or 'a'..'f'.
bool is_num_of_0x(char ch){
    if('0' <= ch && ch <= '9')
        return true;
    if('A' <= ch && ch <= 'F')
        return true;
    return 'a' <= ch && ch <= 'f';
}
// True when ch is a non-zero hexadecimal digit: '1'..'9', 'A'..'F' or 'a'..'f'.
bool is_num_of_0x_nz(char ch){
    if('1' <= ch && ch <= '9')
        return true;
    if('A' <= ch && ch <= 'F')
        return true;
    return 'a' <= ch && ch <= 'f';
}
// True when ch is white space that separates tokens and produces no token
// itself: space, newline, horizontal tab, carriage return, vertical tab or
// form feed. Carriage return matters for sources saved with CRLF line
// endings; without it '\r' would be handed to the operator scanner.
bool is_none(char ch){
    switch(ch){
        case ' ':
        case '\n':
        case '\t':
        case '\r':
        case '\v':
        case '\f':
            return true;
        default:
            return false;
    }
}
//情况判定
// Classifies the first character of a lexeme and returns the dispatch code
// used by main:
//   1  '_'              identifier
//   2  letter           identifier or keyword
//   3  '/'              comment, "/" or "/="
//   5  decimal digit    numeric constant
//   6  single quote     character literal
//   7  double quote     string literal
//   8  white space      skipped
//   9  anything else    delimiter / operator lookup
// Code 4 (octal / hexadecimal) is never produced: '0' satisfies is_0to9 and
// is therefore classified as 5 before any test for '0' could apply.
int get_case(char c){
    // Characters that select a scanner on their own. None of them is a
    // letter, a digit or white space, so testing them first changes nothing.
    switch(c){
        case '_':  return 1;
        case '/':  return 3;
        case '\'': return 6;
        case '\"': return 7;
        default:   break;
    }
    if(is_atoZ(c))
        return 2;
    if(is_0to9(c))
        return 5;
    if(is_none(c))
        return 8;
    return 9;
}
//具体处理
// Scans an identifier whose first character is already in `tmp` and writes
// its <iT,n> token to `out`, where n is the 1-based position of the
// identifier in iT. An identifier seen for the first time is appended to iT.
// On return the stream is positioned at the first character that does not
// belong to the identifier, and `token` is empty again.
void get_iT(void){
    token += tmp;
    // Inspect the next character with peek() and consume it only when it
    // belongs to the identifier. Reading it and seeking back would clear
    // eofbit while leaving failbit set at end of input, so a caller that
    // loops on eof() would never terminate.
    const int eof_mark = std::char_traits<char>::eof();
    for(int next = in.peek(); next != eof_mark; next = in.peek()){
        const char ch = static_cast<char>(next);
        if(!(is_atoZ(ch) || is_0to9(ch) || ch == '_'))
            break;
        token += ch;
        in.get();
    }
    // Find the identifier's slot; slot == iT.size() means it is new.
    std::size_t slot = 0;
    while(slot < iT.size() && iT[slot] != token)
        ++slot;
    if(slot == iT.size())
        iT.push_back(token);
    out << '<' << "iT" << ',' << slot + 1 << '>' << '\n';
    token.clear();
    return;
}
// Scans a word whose first character (a letter) is already in `tmp`.
// If the word is in the keyword table kT it is written to `out` as <kT,n>;
// otherwise it is an identifier and is written as <iT,n>, being appended to
// iT when seen for the first time. n is always the 1-based table position.
// On return the stream is positioned at the first character that does not
// belong to the word, and `token` is empty again.
void get_iT_or_kT(void){
    token += tmp;
    // Inspect the next character with peek() and consume it only when it
    // belongs to the word. Reading it and seeking back would clear eofbit
    // while leaving failbit set at end of input, so a caller that loops on
    // eof() would never terminate.
    const int eof_mark = std::char_traits<char>::eof();
    for(int next = in.peek(); next != eof_mark; next = in.peek()){
        const char ch = static_cast<char>(next);
        if(!(is_atoZ(ch) || is_0to9(ch) || ch == '_'))
            break;
        token += ch;
        in.get();
    }
    // Keywords take precedence over identifiers.
    const std::size_t keyword_count = sizeof(kT) / sizeof(kT[0]);
    for(std::size_t k = 0; k < keyword_count; ++k){
        if(token == kT[k]){
            out << '<' << "kT" << ',' << k + 1 << '>' << '\n';
            token.clear();
            return;
        }
    }
    // Find the identifier's slot; slot == iT.size() means it is new.
    std::size_t slot = 0;
    while(slot < iT.size() && iT[slot] != token)
        ++slot;
    if(slot == iT.size())
        iT.push_back(token);
    out << '<' << "iT" << ',' << slot + 1 << '>' << '\n';
    token.clear();
    return;
}
// Handles a lexeme that starts with '/', which has already been consumed.
//   "//"  line comment: skipped up to and including the newline, no token
//   "/*"  block comment: skipped up to and including "*/", no token
//   "/="  written to `out` as <pT,10>
//   "/"   written to `out` as <pT,9>; the following character is left unread
void get_other(void){
    const int next = in.peek();
    if(next == '/'){
        in.get();
        while(in.get(tmp)){
            if(tmp == '\n')
                return;
        }
        return;
    }
    if(next == '*'){
        in.get();
        // Remember the previous character rather than reading ahead after a
        // '*': a comment ending in "**/" must still be closed by its last
        // two characters. `before` starts as 0 so that the '*' of the
        // opening "/*" cannot take part in the terminator ("/*/" is open).
        char before = 0;
        while(in.get(tmp)){
            if(before == '*' && tmp == '/')
                return;
            before = tmp;
        }
        return;
    }
    if(next == '='){
        in.get();
        out << '<' << "pT" << ',' << "10" << '>' << '\n';
        return;
    }
    // Plain division operator. The look-ahead was only peeked, so nothing
    // has to be pushed back.
    out << '<' << "pT" << ',' << "9" << '>' << '\n';
}
// Writes the <CT,n> token for `value` to `out`, where n is the 1-based
// position of the value in CT. A value seen for the first time is appended.
// Exact comparison is intended: the table de-duplicates identical constants.
static void ct_emit(double value){
    std::size_t slot = 0;
    while(slot < CT.size() && CT[slot] != value)
        ++slot;
    if(slot == CT.size())
        CT.push_back(value);
    out << '<' << "CT" << ',' << slot + 1 << '>' << '\n';
}

// Numeric value of a hexadecimal digit, or -1 when `c` (a peek() result,
// possibly the end-of-file marker) is not a hexadecimal digit.
static int ct_hex_value(int c){
    if(c >= '0' && c <= '9')
        return c - '0';
    if(c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if(c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}

// Moves the run of decimal digits at the read position from `in` to `text`.
static void ct_take_digits(std::string& text){
    for(int c = in.peek(); c >= '0' && c <= '9'; c = in.peek())
        text += static_cast<char>(in.get());
}

// Scans the remainder of an octal or hexadecimal constant and writes its
// <CT,n> token to `out`.
// Precondition: the leading '0' has already been consumed. If the next
// character is 'x' or 'X' the digits after it are read as hexadecimal;
// otherwise the following digits 0-7 (possibly none, giving 0) are read as
// octal. The stream is left at the first character after the constant.
void get_0or0x(void){
    double value = 0;
    const int marker = in.peek();
    if(marker == 'x' || marker == 'X'){
        in.get(); // radix marker
        for(int digit = ct_hex_value(in.peek()); digit >= 0; digit = ct_hex_value(in.peek())){
            value = value*16 + digit;
            in.get();
        }
    }
    else{
        for(int c = in.peek(); c >= '0' && c <= '7'; c = in.peek()){
            value = value*8 + (c - '0');
            in.get();
        }
    }
    ct_emit(value);
    return;
}

// Scans a numeric constant whose first digit is already in `tmp` and writes
// its <CT,n> token to `out`.
//   0x... / 0X...          hexadecimal, delegated to get_0or0x
//   0 followed by 0-7 only octal
//   digits [. digits] [e|E [+|-] digits]   decimal integer or floating point
// Suffixes (u, l, f) are not recognised. The stream is left at the first
// character after the constant.
void get_CT(void){
    if(tmp == '0'){
        const int marker = in.peek();
        if(marker == 'x' || marker == 'X'){
            get_0or0x();
            return;
        }
    }
    // Collect the spelling of the constant and convert it once at the end.
    // Accumulating the fraction digit by digit is easy to get wrong and
    // compounds rounding error.
    std::string lexeme(1, tmp);
    bool is_float = false;
    ct_take_digits(lexeme);
    if(in.peek() == '.'){
        is_float = true;
        lexeme += static_cast<char>(in.get());
        ct_take_digits(lexeme);
    }
    const int exponent_mark = in.peek();
    if(exponent_mark == 'e' || exponent_mark == 'E'){
        is_float = true;
        lexeme += static_cast<char>(in.get());
        const int sign = in.peek();
        if(sign == '+' || sign == '-')
            lexeme += static_cast<char>(in.get());
        ct_take_digits(lexeme);
    }
    // A leading zero makes an integer octal, but only when every digit is a
    // valid octal digit and the constant is not floating point ("012.5" is
    // the decimal value 12.5).
    bool is_octal = !is_float && lexeme.size() > 1 && lexeme[0] == '0';
    for(std::size_t i = 0; is_octal && i < lexeme.size(); ++i){
        if(lexeme[i] > '7')
            is_octal = false;
    }
    double value = 0;
    if(is_octal){
        for(std::size_t i = 0; i < lexeme.size(); ++i)
            value = value*8 + (lexeme[i] - '0');
    }
    else{
        // The lexeme always starts with a digit, so strtod converts its
        // longest valid prefix; an exponent marker without digits is ignored.
        value = std::strtod(lexeme.c_str(), NULL);
    }
    ct_emit(value);
    return;
}
void get_cT(void){
token += tmp;
tmp = in.get();
if(tmp == '\\'){
token += tmp;
tmp = in.get();
if(tmp == 'x'){
while(!in.eof()){
tmp = in.get();
if(tmp == '0')
continue;
else
break;
}
if(tmp == '\''){
token += '0';
token += '0';
token += tmp;
}
else{
tmp = in.get();
if(tmp == '\''){
token += '0';
in.seekg(-2, in.cur);
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
}
else{
token += tmp;
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
}
}
}
else if(is_0to7(tmp)){
tmp = in.get();
if(tmp == '\''){
in.seekg(-2, in.cur);
token += '0';
token += '0';
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
}
else{
tmp = in.get();
if(tmp == '\''){
in.seekg(-3, in.cur);
token += '0';
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
}
else{
in.seekg(-3, in.cur);
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
tmp = in.get();
token += tmp;
}
}
}
else{
token += tmp;
tmp = in.get();
token += tmp;
}
}
else{
token += tmp;
tmp = in.get();
token += tmp;
}
cT.push_back(token);
out << '<' << "cT" << ',' << cT.size() << '>' << endl;
token.clear();
return;
}
// Scans a string literal whose opening double quote is already in `tmp`,
// appends its spelling (quotes included) to sT and writes <sT,n> to `out`,
// where n is the new size of sT (literals are not de-duplicated).
// The stream is left just after the closing quote; `token` is empty on
// return. If the input ends first, the text read so far is stored.
void get_sT(void){
    token += tmp;
    char ch;
    while(in.get(ch)){
        token += ch;
        if(ch == '\"')
            break;
        // A backslash carries the next character with it, so an escaped
        // quote (\") does not terminate the literal.
        if(ch == '\\' && in.get(ch))
            token += ch;
    }
    sT.push_back(token);
    out << '<' << "sT" << ',' << sT.size() << '>' << '\n';
    token.clear();
    return;
}
void get_pT(void){
if(tmp == '-'){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "5" << '>' << endl;
else if(tmp == '-')
out << '<' << "pT" << ',' << "6" << '>' << endl;
else if(tmp == '>')
out << '<' << "pT" << ',' << "27" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "4" << '>' << endl;
}
}
else if(tmp == '+'){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "2" << '>' << endl;
else if(tmp == '+')
out << '<' << "pT" << ',' << "3" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "1" << '>' << endl;
}
}
else if(tmp == '<'){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "12" << '>' << endl;
else if(tmp == '<')
out << '<' << "pT" << ',' << "25" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "11" << '>' << endl;
}
}
else if(tmp == '>'){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "14" << '>' << endl;
else if(tmp == '>')
out << '<' << "pT" << ',' << "26" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "13" << '>' << endl;
}
}
else if(tmp == '*'){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "8" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "7" << '>' << endl;
}
}
else if(tmp == '&'){
tmp = in.get();
if(tmp == '&')
out << '<' << "pT" << ',' << "20" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "19" << '>' << endl;
}
}
else if(tmp == '|'){
tmp = in.get();
if(tmp == '|')
out << '<' << "pT" << ',' << "22" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "21" << '>' << endl;
}
}
else if(tmp == '!'){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "18" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "17" << '>' << endl;
}
}
else if(tmp == '%'){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "24" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "23" << '>' << endl;
}
}
else if(tmp == '='){
tmp = in.get();
if(tmp == '=')
out << '<' << "pT" << ',' << "16" << '>' << endl;
else{
in.seekg(-1, in.cur);
out << '<' << "pT" << ',' << "15" << '>' << endl;
}
}
else{
token += tmp;
for(int i = 27; i < 43; i += 1){
if(token == pT[i]){
out << '<' << "pT" << ',' << i + 1 << '>' << endl;
token.clear();
break;
}
}
}
return;
}
//主程序
// Entry point: tokenises a C source file and writes one token per line.
// Usage: program [input-file [output-file]]
// When an argument is omitted the original fixed path is used.
// Returns 0 on success and 1 when either file cannot be opened.
int main(int argc, char* argv[]){
    const char* source_path = argc > 1 ? argv[1] : "/Users/no1/Desktop/main.txt";
    const char* result_path = argc > 2 ? argv[2] : "/Users/no1/Desktop/tmp.txt";
    in.open(source_path, std::ios::in);
    if(!in.is_open()){
        std::cerr << "cannot open input file: " << source_path << std::endl;
        return 1;
    }
    out.open(result_path, std::ios::out);
    if(!out.is_open()){
        std::cerr << "cannot open output file: " << result_path << std::endl;
        in.close();
        return 1;
    }
    token.clear();
    // Test the read itself, not eof() beforehand. eof() only becomes true
    // after a read has failed, so checking it first would dispatch the
    // end-of-file sentinel as if it were a character of the source.
    while(in.get(tmp)){
        switch(get_case(tmp)){
            case 1:
                get_iT();
                break;
            case 2:
                get_iT_or_kT();
                break;
            case 3:
                get_other();
                break;
            case 4:
                get_0or0x();
                break;
            case 5:
                get_CT();
                break;
            case 6:
                get_cT();
                break;
            case 7:
                get_sT();
                break;
            case 8:
                // white space: nothing to emit
                break;
            default:
                get_pT();
                break;
        }
    }
    std::cout << "done" << std::endl;
    in.close();
    out.close();
    return 0;
}