AC自动机(多模式串“KMP”)模版

参考博客:kuangbin《AC自动机小结》、《AC自动机算法》、《海量数据处理之Trie树(字典树)》


AC自动机(Aho-Corasick automaton)是建立在字典树(Trie)上的多模式串快速匹配算法;

一个典型的例子就是:给出N个单词和一篇文章,判断文章中出现了多少种(个)给定的单词。


要想理解AC自动机必须先学Trie。Trie是一棵k叉树,除根节点之外,每个节点都储存了一个字符(字母),于是从根节点往下看,每一条路径都对应一个单词(或单词的前缀)。

AC自动机就是建立在Trie数据结构上的一个算法,类似于在Trie树上做KMP。类似KMP的next指针,它也有一个fail指针,用来加速匹配。

它可以判断某单词是否在文章中出现(可重叠),以及出现的次数。

具体原理不赘述,网上各种解释遍地开花。对于偷懒者来说,求个模版就算了……

AC自动机算法主要有三个步骤

1)建Trie树

2)构造fail指针

3)匹配

【模版】(带详细解释)

const int MAXN = 500*200;    // max trie nodes: number of patterns * max pattern length
const int MAXL = 10000+10;   // max length of the text being searched
const int MAXM = 128;        // branching factor of the trie, i.e. alphabet size

/**
 * Aho-Corasick automaton over a MAXM-symbol byte alphabet, stored in flat arrays.
 *
 * Usage: init(), insert() every pattern, build() once, then query() each text.
 * query() must only be called after build(): build() is what replaces the -1
 * placeholders in next[][] with real transitions.
 * No check is made that the number of nodes stays below MAXN.
 */
struct Trie
{
    int next[MAXN][MAXM],fail[MAXN],end[MAXN];
    int root,L;   // root node index; L = number of nodes allocated so far

    // Maps a byte to its child slot, or -1 when it lies outside the alphabet.
    // Going through unsigned char keeps bytes >= 0x80 from becoming negative indices.
    static int slot(char c)
    {
        int v = static_cast<unsigned char>(c);
        return v < MAXM ? v : -1;
    }

    // Allocates a node with no children and no pattern id (-1); returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L++] = -1;
        return L-1;
    }

    // Resets the automaton to a single root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Inserts pattern buf and tags its final node with id.
    // Ids must lie in [0, 501) for query() to report them.
    // A pattern containing a byte outside the alphabet is skipped entirely.
    void insert(const char buf[],int id)
    {
        int len = strlen(buf);
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return;
        int now = root;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now] = id;
    }

    // Computes fail links breadth-first and fills every missing transition
    // with the transition of the fail node, so matching never has to backtrack.
    void build()
    {
        std::queue<int> Q;
        fail[root] = root;
        for(int i = 0;i < MAXM;i++)
            if(next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                Q.push(next[root][i]);
            }
        while( !Q.empty() )
        {
            int now = Q.front();
            Q.pop();
            for(int i = 0;i < MAXM;i++)
                if(next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]]=next[fail[now]][i];
                    Q.push(next[now][i]);
                }
        }
    }

    int num[501];   // ids of the patterns found by the last query(), in discovery order
    int cnt;        // number of valid entries in num after the last query()

    // Scans buf and records in num[0..cnt) the id of every pattern that occurs,
    // each id once. Returns 1 if at least one pattern was found, otherwise 0.
    // n (pattern count) and id (text number) are not used by this template; they
    // are kept so problem-specific variants can share the signature.
    int query(const char buf[],int n,int id)
    {
        const int cap = sizeof(num) / sizeof(num[0]);
        bool seen[sizeof(num) / sizeof(num[0])] = {false};
        memset(num, 0, sizeof(num));
        cnt = 0;
        int len = strlen(buf);
        int now = root;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(c < 0)
            {
                // No pattern contains this byte, so any match must start after it.
                now = root;
                continue;
            }
            now = next[now][c];
            // Walk the fail chain: every node on it is a suffix of the current prefix.
            for(int temp = now;temp != root;temp = fail[temp])
            {
                int pid = end[temp];
                // Recording each id once keeps cnt <= cap, so num cannot overflow.
                if(pid >= 0 && pid < cap && !seen[pid])
                {
                    seen[pid] = true;
                    num[cnt++] = pid;
                }
            }
        }
        (void)n;
        (void)id;
        return cnt > 0 ? 1 : 0;
    }
};

三道入门题,注意字符种类总数。

1、HDU 2222 Keywords Search  

题意:求出现了多少种单词

【代码】

/* ***********************************************
Author        :angon

************************************************ */
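#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <stack>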
using namespace std;
// Prints the CPU time used so far to stderr.
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
#define lld %I64d
// NOTE(review): the macros below were reconstructed from how the code uses them;
// the original line was damaged when the page was extracted.
// REP iterates i over [k, n); REPP iterates i over [k, n].
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
// Reads one int into d; evaluates to scanf's return value (EOF at end of input).
#define scan(d) scanf("%d",&d)
// Fills the whole array a with byte value k.
#define mst(a,k) memset(a,k,sizeof(a))
// Reads a non-negative decimal integer from stdin, skipping leading non-digits.
// Returns 0 if end of input is reached before any digit.
// NOTE(review): the function's original name was lost; "read" is an assumption.
inline int read()
{
    int s = 0;
    int ch = getchar();
    while(ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    for(; ch >= '0' && ch <= '9'; ch = getchar())
        s = s*10 + ch - '0';
    return s;
}

const int MAXN = 500010;    // max number of trie nodes
const int MAXL = 1000010;   // max length of the text being searched
const int MAXM = 26;        // branching factor of the trie (lowercase letters)

/**
 * Aho-Corasick automaton over 'a'..'z' that counts how many inserted patterns
 * occur in a text (HDU 2222).
 *
 * Usage: init(), insert() every pattern, build() once, then query().
 * query() clears the counters it consumes, so it gives the full answer only once
 * per init()/insert()/build() cycle.
 */
struct Trie
{
    int next[MAXN][MAXM],fail[MAXN],end[MAXN];
    int root,L;   // root node index; L = number of nodes allocated so far

    // Maps a character to its child slot, or -1 when it is not in 'a'..'z'.
    static int slot(char c)
    {
        int v = c - 'a';
        return (v >= 0 && v < MAXM) ? v : -1;
    }

    // Allocates a node with no children and a pattern count of 0; returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L++] = 0;
        return L-1;
    }

    // Resets the automaton to a single root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Inserts pattern buf; end[] counts how many times each pattern was inserted,
    // so duplicate patterns are each counted by query().
    // A pattern containing a character outside 'a'..'z' is skipped entirely.
    void insert(const char buf[])
    {
        int len = strlen(buf);
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return;
        int now = root;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now]++;
    }

    // Computes fail links breadth-first and fills every missing transition
    // with the transition of the fail node.
    void build()
    {
        std::queue<int> Q;
        fail[root] = root;
        for(int i = 0;i < MAXM;i++)
            if(next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                Q.push(next[root][i]);
            }
        while( !Q.empty() )
        {
            int now = Q.front();
            Q.pop();
            for(int i = 0;i < MAXM;i++)
                if(next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]]=next[fail[now]][i];
                    Q.push(next[now][i]);
                }
        }
    }

    // Returns the number of inserted patterns that occur in buf.
    // Each pattern node is counted once: its counter is zeroed after being added.
    int query(const char buf[])
    {
        int len = strlen(buf);
        int now = root;
        int res = 0;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(c < 0)
            {
                // No pattern contains this character, so matching restarts after it.
                now = root;
                continue;
            }
            now = next[now][c];
            for(int temp = now;temp != root;temp = fail[temp])
            {
                res += end[temp];
                end[temp] = 0;
            }
        }
        return res;
    }

    // Dumps every node (index, fail link, pattern count, transitions) to stdout.
    void debug()
    {
        for(int i = 0;i < L;i++)
        {
            printf("id = %3d,fail = %3d,end = %3d,chi = [",i,fail[i],end[i]);
            for(int j = 0;j < MAXM;j++)
                printf("%2d",next[i][j]);
            printf("]\n");
        }
    }
};
char buf[MAXL];   // shared input buffer for patterns and the text
Trie ac;          // kept global: the node arrays are far too large for the stack

// HDU 2222 driver: for each test case reads n patterns and one text,
// then prints how many of the patterns occur in the text.
// NOTE(review): "%s" reads are unbounded; input is assumed to fit in buf.
int main()
{
    //freopen("in.txt","r",stdin);
    //freopen("out.txt","w",stdout);
    int t;
    if(scanf("%d",&t) != 1)
        return 0;
    while(t-- > 0)
    {
        int n;
        if(scanf("%d",&n) != 1)
            break;
        ac.init();
        for(int i = 0;i < n;i++)
        {
            if(scanf("%s",buf) != 1)
                break;
            ac.insert(buf);
        }
        ac.build();
        // Treat a missing text as empty instead of re-querying the last pattern.
        if(scanf("%s",buf) != 1)
            buf[0] = '\0';
        printf("%d\n",ac.query(buf));
    }

    return 0;
}

2、HDU 3065 病毒侵袭持续中

题意:要求输出每个单词出现的次数

【代码】

/* ***********************************************
Author        :angon

************************************************ */
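#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <stack>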
using namespace std;
// Prints the CPU time used so far to stderr.
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
#define lld %I64d
// NOTE(review): the macros below were reconstructed from how the code uses them;
// the original line was damaged when the page was extracted.
// REP iterates i over [k, n); REPP iterates i over [k, n].
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
// Reads one int into d; evaluates to scanf's return value (EOF at end of input).
#define scan(d) scanf("%d",&d)
// Fills the whole array a with byte value k.
#define mst(a,k) memset(a,k,sizeof(a))
// Reads a non-negative decimal integer from stdin, skipping leading non-digits.
// Returns 0 if end of input is reached before any digit.
// NOTE(review): the function's original name was lost; "read" is an assumption.
inline int read()
{
    int s = 0;
    int ch = getchar();
    while(ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    for(; ch >= '0' && ch <= '9'; ch = getchar())
        s = s*10 + ch - '0';
    return s;
}

const int MAXN = 1010*50;    // max trie nodes: number of patterns * 50
const int MAXL = 2000000+10;   // max length of the text being searched
const int MAXM = 128;        // branching factor of the trie
char str[1005][100];         // pattern texts indexed by pattern id; printed by Trie::query

/**
 * Aho-Corasick automaton that counts the occurrences of every pattern in a
 * text, overlapping matches included (HDU 3065).
 *
 * Usage: init(), insert() every pattern, build() once, then query().
 */
struct Trie
{
    int next[MAXN][MAXM],fail[MAXN],end[MAXN];
    int root,L;   // root node index; L = number of nodes allocated so far

    // Maps a byte to its child slot, or -1 when it lies outside the alphabet.
    // Going through unsigned char keeps bytes >= 0x80 from becoming negative indices.
    static int slot(char c)
    {
        int v = static_cast<unsigned char>(c);
        return v < MAXM ? v : -1;
    }

    // Allocates a node with no children and no pattern id (-1); returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L++] = -1;
        return L-1;
    }

    // Resets the automaton to a single root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Inserts pattern buf and tags its final node with id.
    // Ids must lie in [0, 1001) for query() to count them.
    // A pattern containing a byte outside the alphabet is skipped entirely.
    void insert(const char buf[],int id)
    {
        int len = strlen(buf);
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return;
        int now = root;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now] = id;
    }

    // Computes fail links breadth-first and fills every missing transition
    // with the transition of the fail node.
    void build()
    {
        std::queue<int> Q;
        fail[root] = root;
        for(int i = 0;i < MAXM;i++)
            if(next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                Q.push(next[root][i]);
            }
        while( !Q.empty() )
        {
            int now = Q.front();
            Q.pop();
            for(int i = 0;i < MAXM;i++)
                if(next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]]=next[fail[now]][i];
                    Q.push(next[now][i]);
                }
        }
    }

    int num[1001];   // num[id] = occurrences of pattern id in the last queried text

    // Counts the occurrences of every pattern in buf, then prints
    // "<pattern>: <count>" for each of the first n ids with a non-zero count.
    void query(const char buf[],int n)
    {
        const int cap = sizeof(num) / sizeof(num[0]);
        memset(num, 0, sizeof(num));
        int len = strlen(buf);
        int now = root;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(c < 0)
            {
                // No pattern contains this byte, so matching restarts after it.
                now = root;
                continue;
            }
            now = next[now][c];
            // Every node on the fail chain is a pattern suffix ending at position i.
            for(int temp = now;temp != root;temp = fail[temp])
            {
                int pid = end[temp];
                if(pid >= 0 && pid < cap)
                    num[pid]++;
            }
        }
        for(int i = 0;i < n && i < cap;i++)
            if(num[i])
                printf("%s: %d\n",str[i],num[i]);
    }
};
char buf[MAXL];
Trie ac;
int main()
{
    //freopen("in.txt","r",stdin);
    //freopen("out.txt","w",stdout);
    int n;
    while(~scan(n))
    {
        ac.init();
        REP(i,0,n)
        {
            scanf("%s",str[i]);
            ac.insert(str[i],i);
        }
        ac.build();
        scanf("%s",buf);
        ac.query(buf,n);
    }


    return 0;
}

3、HDU 2896 病毒侵袭 

题意:输出出现了单词的编号

【代码】

/* ***********************************************
Author        :angon

************************************************ */
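#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <stack>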
using namespace std;
// Prints the CPU time used so far to stderr.
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
#define lld %I64d
// NOTE(review): the macros below were reconstructed from how the code uses them;
// the original line was damaged when the page was extracted.
// REP iterates i over [k, n); REPP iterates i over [k, n].
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
// Reads one int into d; evaluates to scanf's return value (EOF at end of input).
#define scan(d) scanf("%d",&d)
// Fills the whole array a with byte value k.
#define mst(a,k) memset(a,k,sizeof(a))
// Reads a non-negative decimal integer from stdin, skipping leading non-digits.
// Returns 0 if end of input is reached before any digit.
// NOTE(review): the function's original name was lost; "read" is an assumption.
inline int read()
{
    int s = 0;
    int ch = getchar();
    while(ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    for(; ch >= '0' && ch <= '9'; ch = getchar())
        s = s*10 + ch - '0';
    return s;
}

const int MAXN = 500*200;    // max trie nodes: number of patterns * max pattern length
const int MAXL = 10000+10;   // max length of the text being searched
const int MAXM = 128;        // branching factor of the trie

/**
 * Aho-Corasick automaton that reports which patterns occur in a text (HDU 2896).
 *
 * Usage: init(), insert() every pattern, build() once, then query() each text.
 */
struct Trie
{
    int next[MAXN][MAXM],fail[MAXN],end[MAXN];
    int root,L;   // root node index; L = number of nodes allocated so far

    // Maps a byte to its child slot, or -1 when it lies outside the alphabet.
    // Going through unsigned char keeps bytes >= 0x80 from becoming negative indices.
    static int slot(char c)
    {
        int v = static_cast<unsigned char>(c);
        return v < MAXM ? v : -1;
    }

    // Allocates a node with no children and no pattern id (-1); returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L++] = -1;
        return L-1;
    }

    // Resets the automaton to a single root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Inserts pattern buf and tags its final node with id.
    // Ids must lie in [0, 501) for query() to report them.
    // A pattern containing a byte outside the alphabet is skipped entirely.
    void insert(const char buf[],int id)
    {
        int len = strlen(buf);
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return;
        int now = root;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(next[now][c] == -1)
                next[now][c] = newnode();
            now = next[now][c];
        }
        end[now] = id;
    }

    // Computes fail links breadth-first and fills every missing transition
    // with the transition of the fail node.
    void build()
    {
        std::queue<int> Q;
        fail[root] = root;
        for(int i = 0;i < MAXM;i++)
            if(next[root][i] == -1)
                next[root][i] = root;
            else
            {
                fail[next[root][i]] = root;
                Q.push(next[root][i]);
            }
        while( !Q.empty() )
        {
            int now = Q.front();
            Q.pop();
            for(int i = 0;i < MAXM;i++)
                if(next[now][i] == -1)
                    next[now][i] = next[fail[now]][i];
                else
                {
                    fail[next[now][i]]=next[fail[now]][i];
                    Q.push(next[now][i]);
                }
        }
    }

    int num[501];   // ids of the patterns found by the last query(), sorted ascending

    // Scans buf for patterns. If none occurs, prints nothing and returns 0.
    // Otherwise prints "web <id>: " followed by the 1-based ids of the patterns
    // found, ascending and each listed once, and returns 1.
    // n (pattern count) is not used.
    int query(const char buf[],int n,int id)
    {
        const int cap = sizeof(num) / sizeof(num[0]);
        bool seen[sizeof(num) / sizeof(num[0])] = {false};
        memset(num, 0, sizeof(num));
        int len = strlen(buf);
        int now = root;
        int ct = 0;
        for(int i = 0;i < len;i++)
        {
            int c = slot(buf[i]);
            if(c < 0)
            {
                // No pattern contains this byte, so matching restarts after it.
                now = root;
                continue;
            }
            now = next[now][c];
            // Every node on the fail chain is a pattern suffix ending at position i.
            for(int temp = now;temp != root;temp = fail[temp])
            {
                int pid = end[temp];
                // Recording each id once keeps ct <= cap, so num cannot overflow.
                if(pid >= 0 && pid < cap && !seen[pid])
                {
                    seen[pid] = true;
                    num[ct++] = pid;
                }
            }
        }
        (void)n;
        if(ct == 0) return 0;
        printf("web %d: ",id);
        std::sort(num,num+ct);
        for(int i = 0;i < ct;i++)
            printf("%d%c",num[i]+1,i==ct-1?'\n':' ');
        return 1;
    }
};
char buf[MAXL];
Trie ac;
int main()
{
    //freopen("in.txt","r",stdin);
    //freopen("out.txt","w",stdout);
    int n;
    while(~scan(n))
    {
        ac.init();
        REP(i,0,n)
        {
            scanf("%s",buf);
            ac.insert(buf,i);
        }
        ac.build();
        int m; scan(m);
        int total = 0;
        REPP(i,1,m)
        {
            scanf("%s",buf);
            if(ac.query(buf,n,i))
                total++;
        }
        printf("total: %d\n",total);

    }


    return 0;
}


以上~


你可能感兴趣的:(AC自动机(多模式串“KMP")模版)