从txt文件中读入正则表达式
#include
#include
#include
#include
#include
#define MAX_TOKEN 100
using namespace std;
// One token class: its name, the regular expression that describes it,
// and a list of final-state ids.
struct Token
{
    std::string      name;        // token name
    std::string      regExp;      // regular expression for this token
    std::vector<int> finalState;  // final (accepting) states
};
int readTXT(string fileName,vector<Token>&tokens)
{
ifstream input(fileName);
if(!input)
{
cout<<"Failed"<<endl;
return -1;
}
//读取正则表达式
string line;//读入的每一行
while(getline(input,line))
{
int i;
Token temp;
for(i=0;line[i]!=':'&&i<line.size();i++)
{
temp.name+=line[i];//获取词的名字
}
temp.regExp=line.substr(i+1,line.size()-i);//获取词的正则表达式
tokens.push_back(temp);
}
input.close();
return 0;
}
把使用连接运算的地方换成“.”(符号可自定义),然后将中缀正则表达式转成后缀表达式。
#include
#include
#include
#include
#include
#include
using namespace std;
// Make concatenation explicit by inserting '.' between adjacent symbols.
//
// A '.' is inserted between reg[i] and reg[i+1] when
//   - reg[i] is neither '(' nor '|', and
//   - reg[i+1] is a letter, a digit, or '('.
// Example: "a(b|c)*d" becomes "a.(b|c)*.d".
//
// reg  infix regular expression
// Returns the expression with the concatenation operators added.
std::string add_symbol(std::string reg)
{
    std::string out;
    out.reserve(reg.size() * 2);

    for (std::string::size_type i = 0; i < reg.size(); ++i)
    {
        const char cur = reg[i];
        out += cur;
        if (i + 1 == reg.size())
        {
            break;  // nothing follows the last character
        }

        const char next = reg[i + 1];
        // isalnum requires a value representable as unsigned char.
        const bool nextStartsOperand =
            next == '(' || std::isalnum(static_cast<unsigned char>(next));
        const bool curCanPrecedeDot = cur != '(' && cur != '|';

        if (curCanPrecedeDot && nextStartsOperand)
        {
            out += '.';
        }
    }
    return out;
}
// Operator precedence table.
// convert() reads it as priorities[getNum(stack top)][getNum(incoming)]:
//   '<'  the incoming operator is pushed on top of the stack top
//   '>'  the stack top is popped to the output first
// The remaining marks ('E', 'x', '=') are never tested for by convert().
// Every row has one column per operator, in the order used by getNum().
const char *priorities[]=
{
    // #|.*()
    "E<<<<x",  // '#'
    ">><<<>",  // '|'
    ">>><<>",  // '.'
    ">>>><>",  // '*'
    "x<<<<=",  // '('
    "xxxxxx",  // ')'
};
// Map an operator character to its row/column index in the precedence table.
//
// c  character to look up
// Returns 0..5 for '#', '|', '.', '*', '(' and ')' respectively, and -1 for
// any other character (callers must not use -1 as a table index).
int getNum(char c)
{
    switch(c)
    {
    case '#':return 0;
    case '|':return 1;
    case '.':return 2;
    case '*':return 3;
    case '(':return 4;
    case ')':return 5;
    default: return -1;  // not an operator
    }
}
//中缀转后缀
string convert(string reg)
{
string post="";
stack<char>op;
op.push('#');
for(int i=0;i<reg.size();i++)
{
if(isalnum(reg[i]))
{
post+=reg[i];
}
//运算符处理
else
{
switch(reg[i])
{
case '(':
op.push(reg[i]);
break;
case ')':
//将栈顶运算符弹出并输出,直到遇到左括号
while(!op.empty()&&op.top()!='(')
{
post+=op.top();
op.pop();
}
op.pop();//弹出左括号,不输出
break;
default:
//优先级大于栈顶优先级
if(op.top()=='('||(op.top()!='('&&priorities[getNum(op.top())][getNum(reg[i])]=='<'))
{
op.push(reg[i]);
}
//优先级小于栈顶优先级
else
{
while(priorities[getNum(op.top())][getNum(reg[i])]=='>')
{
post+=op.top();
op.pop();
}
op.push(reg[i]);
}
break;
}
}
}
while(!op.empty())
{
post+=op.top();
op.pop();
}
post.pop_back();//去掉'#'
return post;
}
#include
#include
#include
#include
#include
#include
#define MAX 5000
using namespace std;
/* Note: the total number of states and the state numbering are different
   things. The total may shrink (concatenation merges states), whereas the
   numbering only ever grows. */
int state_count{0};  // total number of states
int nmbState{1};     // next state number to hand out
int nullSymbol{0};   // number of edges whose input symbol is '?'
// An NFA state, identified only by its number.
struct state
{
    int id;  // state number
};
// A transition between two states.
struct Edge
{
    state source;  // state the edge leaves
    state target;  // state the edge enters
    char  symbol;  // input character on the edge ('?' marks an empty edge)
};
// A single NFA unit: one start state, one end state and its edges.
struct cell
{
    state start;       // start state
    state end;         // end state
    int   count;       // number of entries of edges[] in use
    Edge  edges[MAX];  // edge storage; only the first count entries are valid
};
//合并后正则的NFA
struct bigCell
{
state start;
vector<state>end;
int count;
Edge edges[MAX];
};
// Build the basic NFA for a single character: two new states joined by one
// edge labelled c.
//
// c  input character for the edge
// Returns the new cell. Takes two state numbers from nmbState and adds 2 to
// state_count.
cell makeNFA(char c)
{
    // Allocate the two states first; the start gets the lower number.
    state from;
    state to;
    from.id = nmbState++;
    to.id = nmbState++;
    state_count += 2;

    cell unit;
    unit.start = from;
    unit.end = to;
    unit.count = 0;

    // Fill the only edge directly in its slot.
    Edge &only = unit.edges[unit.count++];
    only.source = from;
    only.target = to;
    only.symbol = c;

    return unit;
}
// Append all edges of s to the edges of t.
//
// s  cell whose edges are copied
// t  cell that receives them; t.count grows by s.count
//
// Returns 0 on success. Returns -1 and leaves t unchanged when the edges of
// s do not fit into the MAX slots of t.edges.
int edgesCopy(cell s,cell &t)
{
    if(s.count>MAX-t.count)
    {
        return -1;  // would write past the end of t.edges
    }
    for(int i=0;i<s.count;i++)
    {
        t.edges[t.count++]=s.edges[i];
    }
    return 0;
}
// Append all edges of s to the edges of the merged NFA t.
//
// s  cell whose edges are copied
// t  merged NFA that receives them; t.count grows by s.count
//
// Returns 0 on success. Returns -1 and leaves t unchanged when the edges of
// s do not fit into the MAX slots of t.edges.
int edgesCopy(cell s,bigCell &t)
{
    if(s.count>MAX-t.count)
    {
        return -1;  // would write past the end of t.edges
    }
    for(int i=0;i<s.count;i++)
    {
        t.edges[t.count++]=s.edges[i];
    }
    return 0;
}
// Build the NFA for the alternation left|right.
//
// Two new states are created, a start and an end. Empty ('?') edges lead
// from the new start to the start of each operand and from the end of each
// operand to the new end.
//
// left, right  operand NFAs
// Returns the combined cell. Takes two state numbers from nmbState, adds 2
// to state_count and 4 to nullSymbol.
cell orNFA(cell left,cell right)
{
    cell newCell;
    newCell.count=0;

    // The two new states.
    state s,t;
    s.id=nmbState++;
    t.id=nmbState++;
    state_count+=2;
    newCell.start=s;
    newCell.end=t;

    // The four new empty edges. They attach to each operand's recorded start
    // and end states: the first/last stored edge of a composite operand need
    // not touch its start/end (after a previous alternation or closure,
    // edges[0] still belongs to the innermost operand).
    const Edge added[4]=
    {
        {s,left.start,'?'},
        {s,right.start,'?'},
        {left.end,t,'?'},
        {right.end,t,'?'},
    };
    nullSymbol+=4;

    // Operand edges first, then the new ones.
    edgesCopy(left,newCell);
    edgesCopy(right,newCell);
    for(int i=0;i<4;i++)
    {
        newCell.edges[newCell.count++]=added[i];
    }
    return newCell;
}
// Build the NFA for the concatenation of left and right by merging the
// start state of right into the end state of left.
//
// left, right  operand NFAs
// Returns left extended with the (renumbered) edges of right; its end state
// becomes the end state of right. state_count is reduced by one, because
// exactly one state disappears in the merge.
cell andNFA(cell left,cell right)
{
    const int oldStart=right.start.id;
    const int joint=left.end.id;

    // Redirect every reference to right's old start state to the joint state.
    // Source and target are checked independently of each other.
    for(int i=0;i<right.count;i++)
    {
        Edge &e=right.edges[i];
        if(e.source.id==oldStart)
        {
            e.source.id=joint;
        }
        if(e.target.id==oldStart)
        {
            e.target.id=joint;
        }
    }
    // One state was merged away, however many edges referred to it.
    state_count--;

    right.start.id=joint;
    edgesCopy(right,left);
    left.end.id=right.end.id;
    return left;
}
// Build the NFA for the star closure a*.
//
// Two new states are created and four empty ('?') edges are added after the
// edges of a, in this order: new start -> new end, a.end -> a.start,
// new start -> a.start, a.end -> new end.
//
// a  operand NFA
// Returns the new cell. Takes two state numbers from nmbState, adds 2 to
// state_count and 4 to nullSymbol.
cell starNFA(cell a)
{
    // The two new states.
    state entry;
    state exit;
    entry.id=nmbState++;
    exit.id=nmbState++;
    state_count+=2;

    cell closure;
    closure.count=0;
    closure.start=entry;
    closure.end=exit;

    const Edge extra[4]=
    {
        {entry,exit,'?'},     // empty edge: skip the operand
        {a.end,a.start,'?'},  // loop edge: repeat the operand
        {entry,a.start,'?'},  // entering edge
        {a.end,exit,'?'},     // leaving edge
    };
    nullSymbol+=4;

    // Operand edges first, then the four new ones.
    edgesCopy(a,closure);
    for(int k=0;k<4;++k)
    {
        closure.edges[closure.count++]=extra[k];
    }
    return closure;
}
// Build an NFA from a postfix regular expression.
//
// '|' and '.' combine the two NFAs on top of the stack, '*' wraps the top
// one, and every other character becomes a basic single-character NFA.
//
// reg  postfix expression
// Returns the NFA left on top of the stack at the end. If the expression is
// empty, or an operator finds too few operands on the stack, an empty cell
// is returned instead: count 0, start and end ids both -1.
cell reg2NFA(std::string reg)
{
    // Result for empty or malformed input; -1 is never handed out as a
    // state number because nmbState starts at 1 and only grows.
    cell invalid=cell();
    invalid.start.id=-1;
    invalid.end.id=-1;

    std::stack<cell> s;  // stack of partial NFAs
    cell current,left,right;
    for(std::string::size_type i=0;i<reg.size();i++)
    {
        switch(reg[i])
        {
        case '|':
        case '.':
            // Binary operators need two operands.
            if(s.size()<2)
            {
                return invalid;
            }
            right=s.top();
            s.pop();
            left=s.top();
            s.pop();
            current=(reg[i]=='|')?orNFA(left,right):andNFA(left,right);
            s.push(current);
            break;
        case '*':
            // The closure needs one operand.
            if(s.empty())
            {
                return invalid;
            }
            left=s.top();
            s.pop();
            current=starNFA(left);
            s.push(current);
            break;
        default:
            // Letter or digit: basic NFA.
            current=makeNFA(reg[i]);
            s.push(current);
            break;
        }
    }
    if(s.empty())
    {
        return invalid;
    }
    current=s.top();
    s.pop();
    return current;
}
// Print a single NFA to standard output: a header, the start state, the end
// state, then one "source symbol target" line per edge and a blank line.
//
// c  NFA to print
// Returns 0.
int showNFA(cell c)
{
    std::cout<<"NFA信息:"<<std::endl
             <<"初态:"<<c.start.id<<std::endl
             <<"终态:"<<c.end.id<<std::endl;

    int k=0;
    while(k<c.count)
    {
        const Edge &e=c.edges[k];
        std::cout<<e.source.id<<" "<<e.symbol<<" "<<e.target.id<<std::endl;
        ++k;
    }
    std::cout<<std::endl;
    return 0;
}
// Print a merged NFA to standard output: a header, the start state, all end
// states on one line, then one "source symbol target" line per edge and a
// blank line.
//
// c  merged NFA to print
// Returns 0.
int showNFA(bigCell c)
{
    std::cout<<"NFA信息:"<<std::endl
             <<"初态:"<<c.start.id<<std::endl
             <<"终态:";
    for(const state &fin:c.end)
    {
        std::cout<<fin.id<<" ";
    }
    std::cout<<std::endl;

    int k=0;
    while(k<c.count)
    {
        const Edge &e=c.edges[k];
        std::cout<<e.source.id<<" "<<e.symbol<<" "<<e.target.id<<std::endl;
        ++k;
    }
    std::cout<<std::endl;
    return 0;
}
因为最后要构建词法分析器,所以做成一个大的NFA后期识别会比较方便,因此可选择将每个正则表达式的NFA合并为一个大的。
具体方法是新增一个编号为0的状态作为总起点,再从它向每个NFA的起点各连一条空边。
// Merge the NFAs of the individual regular expressions into one NFA.
//
// The merged NFA starts in a state numbered 0. For every input NFA, its end
// state is recorded as an end state of the result, its edges are copied, and
// an empty ('?') edge from state 0 to its start state is appended.
//
// nfa  NFAs to merge
// Returns the merged NFA. Adds 1 to nullSymbol per input NFA.
bigCell mergeNFA(std::vector<cell> nfa)
{
    bigCell merged;
    merged.start.id=0;
    merged.count=0;

    for(const cell &part:nfa)
    {
        merged.end.push_back(part.end);
        nullSymbol+=1;

        // Edges of the part first, then the edge linking the new start to it.
        edgesCopy(part,merged);
        Edge &link=merged.edges[merged.count++];
        link.source=merged.start;
        link.symbol='?';
        link.target=part.start;
    }
    return merged;
}