编译原理之正则表达式转NFA

本文转载自http://chriszz.sinaapp.com/?p=257

输入一个正则表达式,输出一个NFA。

我的做法:输入一个表示正则表达式的字符串,程序把生成的NFA写入一个.dot文件,再调用dot把它编译成pdf。dot命令包含在graphviz软件包中,fedora下需要先执行sudo yum install graphviz,然后用evince XXX.pdf就可以查看生成的NFA了。

具体算法是按照龙书上的Thompson算法来的。

废话不多说,放码过来:

/*
Author:ChrisZZ([email protected])
Time:2013-12-25 14:13:09
输入:正则表达式
输出:自动机
算法步骤:
1.把正则表达式转化为后缀表达式
2.把后缀表达式转化为NFA
3.用dot语言把NFA输出到PDF
参考:
1.Regular Expression Matching Can Be Simple And Fast
http://swtch.com/~rsc/regexp/regexp1.html
2.龙书 chap3.7.4 从正则表达式构造NFA
3.YCC学长的project中dot语言的使用
其他说明:
1.需要安装dot,并添加到系统path中
2.在windows下运行时,控制台因为编码不支持可能导致中文提示无法显示
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <stack>
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;

const int Match = 256;//label value marking the accepting (match) state
const int Split = 257;//label value marking an epsilon branch

// Counters of the enclosing group, saved while a parenthesised group is parsed.
struct Paren{
    int natom;  // operands of the enclosing group still waiting to be concatenated
    int nalt;   // '|' operators of the enclosing group still waiting to be emitted
};

/*
 * Converts an infix regular expression into postfix notation.
 *
 * Supported syntax: letters (as accepted by isalpha), grouping with '(' ')',
 * alternation '|' and Kleene star '*'. Whitespace is skipped. Concatenation
 * is implicit in the input and is written as an explicit '.' in the output,
 * e.g. "ab(c)" becomes "ab.c." and "a|b" becomes "ab|".
 *
 * re      : the infix regular expression.
 * returns : the postfix expression (empty when `re` holds no operands).
 * throws  : std::runtime_error for unbalanced parentheses, an operator that
 *           has no operand in front of it, an alternation with nothing after
 *           it, or any unsupported character.
 */
std::string re2post(std::string re){
    const std::string invalidRegExp = "非法的正则表达式";
    std::stack<Paren> parenStk;
    std::string postExpr;
    int nalt = 0;   // pending '|' operators in the current group
    int natom = 0;  // operands in the current group not yet joined by '.'

    const std::string::size_type len = re.length();
    for(std::string::size_type i=0; i<len; i++){
        // The <ctype.h> classifiers require a value representable as
        // unsigned char; a negative plain char would be undefined behaviour.
        const unsigned char ch = static_cast<unsigned char>(re[i]);
        if(isspace(ch)) continue;

        if(isalpha(ch)){
            // Two operands are already waiting: join them before adding a third.
            if(natom>1){
                natom--;
                postExpr += '.';
            }
            natom++;
            postExpr += re[i];
        }
        else if(ch=='('){
            // Same rule as for a letter: the emitted '.' consumes one operand,
            // so the counter must drop with it before it is saved.
            if(natom>1){
                natom--;
                postExpr += '.';
            }
            Paren saved;
            saved.natom = natom;
            saved.nalt = nalt;
            parenStk.push(saved);
            nalt = 0;
            natom = 0;
        }
        else if(ch==')'){
            if(natom==0 || parenStk.empty())
                throw std::runtime_error(invalidRegExp+":括号不匹配");
            // Flush the concatenations left inside the group, e.g. in
            // ((a|b)(c|d)) natom is 2 when the outer ')' is reached.
            while(--natom>0){
                postExpr += '.';
            }
            while(nalt-->0){
                postExpr += '|';
            }
            const Paren saved = parenStk.top();
            parenStk.pop();
            nalt = saved.nalt;
            natom = saved.natom + 1;  // the whole group counts as one operand
        }
        else if(ch=='*'){
            if(natom==0)
                throw std::runtime_error(invalidRegExp+":提前出现'*'");
            postExpr += re[i];
        }
        else if(ch=='|'){
            if(natom==0)
                throw std::runtime_error(invalidRegExp+":提前出现'|'");
            while(--natom>0){
                postExpr += '.';
            }
            nalt++;
        }
        else
            throw std::runtime_error(invalidRegExp);
    }

    if(!parenStk.empty())
        throw std::runtime_error(invalidRegExp+":括号不匹配");
    // An alternation whose right-hand side is missing ("a|") would produce a
    // postfix expression with too few operands; ')' already rejects this case.
    if(natom==0 && nalt>0)
        throw std::runtime_error(invalidRegExp+":'|'后缺少表达式");

    while(--natom>0){
        postExpr += '.';
    }
    while(nalt-->0){
        postExpr += '|';
    }
    return postExpr;
}

class NFA;

/*
 * A single NFA state. The meaning of `c`:
 *   c < 256  : the state has one outgoing edge labelled with character c
 *   c == 256 : accepting state, the match succeeded
 *   c == 257 : split state (epsilon branch)
 */
class State{
    friend class NFA;
    friend void nfa2graph(State* head, const std::string& re);

public:
    // Builds a state with the given label and successors; the id starts at 0.
    State(int c=256, State* out=NULL, State* out1=NULL)
        : c(c), id(0), out(out), out1(out1)
    {
    }

    // Stores the number used to identify this state.
    void setId(int id){
        this->id = id;
    }

private:
    int c;        // edge label, or one of the special values listed above
    int id;       // number of this state
    State* out;   // first successor
    State* out1;  // second successor, used when the state branches two ways
};

class NFA{
public:
    NFA(){
        head = NULL;
        tail = NULL;
    }
    NFA(const int& c){
        tail = new State(Match, NULL, NULL);
        head = new State(c, tail, NULL);
    }
    void doCat(NFA& nfa){
        tail->out = nfa.head;
        tail->c = Split;
        tail = nfa.tail;
        nfa.head = NULL;
        nfa.tail = NULL;
    }
    void doUnion(NFA& nfa){
        State* newHead = new State(Split, head, nfa.head);
        State* newTail = new State(Match, NULL, NULL);
        tail->c = Split;
        tail->out = newTail;
        nfa.tail->c = Split;
        nfa.tail->out = newTail;
        tail = newTail;
        head = newHead;
        nfa.head = NULL;
        nfa.tail = NULL;
    }
    void doStar(){
        State* newTail = new State(Match, NULL, NULL);
        State* newHead = new State(Split, head, newTail);
        tail->c = Split;
        tail->out = newTail;
        tail->out1 = head;
        tail = newTail;
        head = newHead;
    }

    void nfa2graph(const string& re){
        char myfile[100];
        printf("请输入一个文件名,用来保存生成的NFA-graph(不必提供后缀):\n");
        scanf("%s", myfile);
        printf("已将DOT文件存储在\"%s.dot\",\n", myfile);
        printf("PDF文件则存储在\"%s.dot.pdf\".\n", myfile);
        int i;
        while(myfile[i]!='\0')
            i++;
        myfile[i] = '.';
        myfile[i+1] = 'd';
        myfile[i+2] = 'o';
        myfile[i+3] = 't';
        myfile[i+4] = '\0';

        FILE *file = fopen(myfile, "w");

        fputs("digraph {\n", file);
        fputs("\t\"", file);
        int len=re.length();
        for(i=0; i<len; i++){
            fprintf(file, "%c", re[i]);
        }

        fputs("\" [shape = plaintext]\n", file);
        fputs("\trankdir = LR\n", file);
        fputs("\t\"\" [shape = point]\n", file);
        fputs("\t\"\" -> 1 [label = Start]\n\n", file);

        int id = 1;

        char circle[2000];
        memset(circle, 0, sizeof(circle));
        State* p;
        stack<State*> staStk;

        head->setId(id++);
        staStk.push(head);

        while(!staStk.empty()){
            p = staStk.top();
            staStk.pop();
            char flag = 1;
            cout << "p->c=" << p->c << endl;
            if(p->c < Match){
                cout << "p->out->id=" << p->out->id << endl;
                if(p->out->id==0){
                    p->out->id = id++;
                    cout << "id=" << id << endl;                }
                else
                    flag = 0;
                fprintf(file, "\t%d -> %d [label = \"%c\"]\n", p->id, (p->out)->id, p->c);
                State *what = p->out;
                if(flag) //push(*what);
                    staStk.push(what);
            } else if(p->c == Match){
                circle[p->id] = 1;
            } else{     //对应Split的情形
                if(p->out->id==0)
                    p->out->id = id++;
                else
                    flag = 0;
                fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out->id);
                State *what = p->out;
                if(flag) staStk.push(what);

                if(p->out1!=NULL){
                    flag = 1;

                    if(p->out1->id==0)
                        p->out1->id = id++;
                    else
                        flag = 0;
                    fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out1->id);
                    what = p->out1;
                    if(flag) staStk.push(what);
                }
            }
        }

        for(i=1; i<id; i++){
            fprintf(file, "\t%d [shape = circle", i);
            if(circle[i])
                fputs(", peripheries = 2", file);
            fprintf(file, "]\n");
        }

        fputs("}", file);
        fclose(file);

        char cmd[108];
        sprintf(cmd, "dot %s -O -Tpdf", myfile);
        if(system(cmd)==0){
            printf("成功生成pdf图像!\n");
            //printf("Linux用户可以使用evince file.pdf &命令打开~\n");
        }
        else
            printf("悲剧!生成pdf图像时出现错误..\n");
    }
private:
    State* head;
    State* tail;
};

/*
 * Builds an NFA from a postfix regular expression.
 *
 * '.' concatenates the two topmost fragments, '|' unites them, '*' applies
 * the Kleene star to the topmost fragment, and every other character becomes
 * a single-edge fragment labelled with that character.
 *
 * postExpr : postfix expression, e.g. "ab|*c.".
 * returns  : the fragment left on the stack after the whole input is read.
 * throws   : std::runtime_error when an operator finds too few operands,
 *            when the expression is empty, or when more than one fragment
 *            remains at the end.
 */
NFA post2nfa(const std::string& postExpr){
    std::stack<NFA> nfaStk;
    const std::string::size_type len = postExpr.length();

    for(std::string::size_type i=0; i<len; i++){
        const char op = postExpr[i];
        if(op=='.' || op=='|'){
            // top()/pop() on an empty std::stack is undefined behaviour, so
            // the operand count is checked before anything is popped.
            if(nfaStk.size() < 2)
                throw std::runtime_error("非法的后缀表达式:运算符缺少操作数");
            NFA rhs = nfaStk.top();
            nfaStk.pop();
            NFA lhs = nfaStk.top();
            nfaStk.pop();
            if(op=='.')
                lhs.doCat(rhs);
            else
                lhs.doUnion(rhs);
            nfaStk.push(lhs);
        }
        else if(op=='*'){
            if(nfaStk.empty())
                throw std::runtime_error("非法的后缀表达式:运算符缺少操作数");
            NFA operand = nfaStk.top();
            nfaStk.pop();
            operand.doStar();
            nfaStk.push(operand);
        }
        else{
            NFA alpha(op);
            nfaStk.push(alpha);
        }
    }

    if(nfaStk.empty())
        throw std::runtime_error("非法的后缀表达式:表达式为空");
    NFA result = nfaStk.top();
    nfaStk.pop();
    if(!nfaStk.empty())
        throw std::runtime_error("未知错误");
    return result;
}

int main(){
    string re;
    while(true){
        cout << "请输入一个正则表达式:\n";
        cin >> re;
        string postExpr = re2post(re);
        cout << "postExpr is : " << postExpr << endl;
        NFA nfa = post2nfa(postExpr);
        nfa.nfa2graph(re);
        cout << "继续吗?(y/n)\n" << endl;
        char c;
        cin >> c;
        while(c!='y' && c!='n'){
            cout << "请输入'y'或'n'.\n";
            c=getchar();
        }
        if(c=='n')
            break;
    }
    cout << "Bye~\n";
    return 0;
}

 

你可能感兴趣的:(正则表达式)