Aho-Corasick算法

开源AC算法链接 https://sourceforge.net/projects/multifast/files/

本文中采用的版本是multifast-v1.4.2。

/*
 * example1.c: This program illustrates how to use ahocorasick library
 * it shows how to use the search interface to find patterns
 * This file is part of multifast.
 *
    Copyright 2010-2013 Kamiar Kanani <[email protected]>

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdio.h>
#include <string.h>

#include "ahocorasick.h"

// Keywords loaded into the automata. main() numbers them 1..PATTERN_NUMBER
// in the order they appear here.
AC_ALPHABET_t * sample_patterns[] = {
    "taobao", "youku", "weixin",
    "weibo",  "iqiyi", "baidu"
};
// Number of entries in sample_patterns.
#define PATTERN_NUMBER (sizeof(sample_patterns) / sizeof(sample_patterns[0]))

// Input texts searched by main(), in this order.
AC_ALPHABET_t * input_text1 = "find.baidu.com/123";
AC_ALPHABET_t * input_text2 = "buy.taobao.com";
AC_ALPHABET_t * input_text3 = "video.youku.cn";
// 1. Define a call-back function of type AC_MATCH_CALBACK_f

// Match call-back handed to ac_automata_search().
// Prints one line per match event: the position stored in matchp, followed
// by the number and the string of every pattern listed in matchp->patterns.
//
// matchp : the match event to print
// param  : user pointer forwarded by the search; not used in this example
//
// Return value: always 0.
//   0        : continue searching
//   non-zero : stop searching; ac_automata_search() returns to its caller.
//              e.g. to keep only the first N matches, count them here and
//              return non-zero once the counter exceeds N.
int match_handler (AC_MATCH_t * matchp, void * param)
{
    unsigned int idx = 0;

    // NOTE(review): both format strings use %ld; confirm that 'position' and
    // 'rep.number' are declared as (signed) long in ahocorasick.h.
    printf ("@ %2ld: ", matchp->position);

    while (idx < matchp->match_num)
    {
        // CAUTION: 'astring' is only usable here if the pattern string has
        // permanent allocation inside your program's memory area; otherwise
        // it points to an incorrect memory place.
        printf ("#%ld (%s), ", matchp->patterns[idx].rep.number, matchp->patterns[idx].astring);
        idx++;
    }

    printf ("\n");

    return 0;
}


// Example driver: builds an automata from sample_patterns, then searches the
// three input texts, printing every match through match_handler().
// argc and argv are not used. Always returns 0.
int main (int argc, char ** argv)
{
    // 2. Define AC variables

    AC_AUTOMATA_t   *automata;
    AC_PATTERN_t    pattern;
    AC_TEXT_t       text;
    unsigned int    idx = 0;

    // 3. Get a new automata

    automata = ac_automata_init ();

    // 4. Add patterns to automata

    while (idx < PATTERN_NUMBER)
    {
        pattern.astring = sample_patterns[idx];
        pattern.rep.number = idx+1; // optional: 1-based pattern number
        pattern.length = strlen (pattern.astring);
        ac_automata_add (automata, &pattern);
        idx++;
    }

    // 5. Finalize automata.
    // This is mandatory once all patterns are added; afterwards no more
    // patterns can be added.

    ac_automata_finalize (automata);

    // 5.1. Display automata (optional, left disabled):
    //   ac_automata_display (automata, 'n');
    // the second argument determines the cast type of the pattern
    // representative: 'n' as number, 's' as string. This example fills the
    // integer part of the union (rep.number), so 'n' would be the right one.

    // 6. Set input text

    text.astring = input_text1;
    text.length = strlen (text.astring);

    printf ("Searching: \"%s\"\n", text.astring);

    // 7. Do search
    // The 5th argument is a (void *) that is forwarded to the call-back
    // function, so anything can be passed to it. This example passes nothing.
    // A typical practice is to define a struct that encloses whatever the
    // call-back needs, including input and output variables.

    ac_automata_search (automata, &text, 0, match_handler, 0);

    // do another search

    text.astring = input_text2;
    text.length = strlen (text.astring);

    printf ("Searching: \"%s\"\n", text.astring);

    ac_automata_search (automata, &text, 0, match_handler, 0);

    // and another, this time with the keep option (3rd argument) set: the
    // automata then considers the given text to be the next chunk of the
    // previous text. To understand the difference try it with 0 and 1 and
    // compare the results.

    text.astring = input_text3;
    text.length = strlen (text.astring);

    printf ("Searching: \"%s\" with \'keep\' enabled\n", text.astring);

    ac_automata_search (automata, &text, 1, match_handler, 0);

    // 8. Release automata
    // do not forget to release the automata after you are done with it

    ac_automata_release (automata);

    return 0;
}


1. 步骤一---初始化atm

/******************************************************************************
 * FUNCTION: ac_automata_init
 * Allocates a new, empty automata that is ready to accept patterns.
 * RETURN VALUE:
 * pointer to the new automata, or NULL if a memory allocation failed
******************************************************************************/
AC_AUTOMATA_t * ac_automata_init ()
{
    AC_AUTOMATA_t * thiz;

    /* Zero-filled allocation, so every field starts as 0/NULL */
    thiz = (AC_AUTOMATA_t *) calloc (1, sizeof(AC_AUTOMATA_t));
    if (!thiz)
        return NULL;

    /* Table of node pointers; starts with room for REALLOC_CHUNK_ALLNODES
     * entries. It is allocated before the root node so that a failure here
     * only has to free the automata itself. */
    thiz->all_nodes_max = REALLOC_CHUNK_ALLNODES;
    thiz->all_nodes = (AC_NODE_t **) malloc (thiz->all_nodes_max*sizeof(AC_NODE_t *));
    if (!thiz->all_nodes)
    {
        free (thiz);
        return NULL;
    }

    /* Create the root node.
     * NOTE(review): assumes node_create() reports failure by returning NULL;
     * confirm against its implementation. */
    thiz->root = node_create ();
    if (!thiz->root)
    {
        free (thiz->all_nodes);
        free (thiz);
        return NULL;
    }

    /* The root is the first node recorded in all_nodes */
    ac_automata_register_nodeptr (thiz, thiz->root);

    /* Start with the search state reset (per the original notes: current
     * node = root, base_position = 0) */
    ac_automata_reset (thiz);

    thiz->total_patterns = 0;
    thiz->automata_open = 1; /* open: patterns may be added */
    return thiz;
}
2. 步骤二---将对应的模式字符串加入到trie树中
/******************************************************************************
 * FUNCTION: ac_automata_add
 * Adds a pattern to the automata. In effect this builds the trie: the pattern
 * is walked character by character from the root, re-using existing edges and
 * creating the missing nodes.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
 * AC_PATTERN_t * patt: the pointer to the pattern to add
 * RETURN VALUE: AC_STATUS_t
 * ACERR_SUCCESS: the pattern was added
 * ACERR_AUTOMATA_CLOSED: the automata does not accept patterns any more
 * ACERR_ZERO_PATTERN: the pattern length is 0
 * ACERR_LONG_PATTERN: the pattern is longer than AC_PATTRN_MAX_LENGTH
 * ACERR_DUPLICATE_PATTERN: the pattern's last node is already marked final
******************************************************************************/
AC_STATUS_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt)
{
    unsigned int idx = 0;
    AC_NODE_t * cur = thiz->root;
    AC_NODE_t * child;
    AC_ALPHABET_t ch;

    /* Patterns are only accepted while the automata is open
     * (ac_automata_init sets automata_open to 1) */
    if (!thiz->automata_open)
        return ACERR_AUTOMATA_CLOSED;

    /* Reject empty patterns */
    if (!patt->length)
        return ACERR_ZERO_PATTERN;

    /* Reject patterns longer than the supported maximum */
    if (patt->length > AC_PATTRN_MAX_LENGTH)
        return ACERR_LONG_PATTERN;

    while (idx < patt->length)
    {
        ch = patt->astring[idx];

        /* Follow the outgoing edge labelled 'ch' if the current node has one */
        child = node_find_next(cur, ch);
        if (!child)
        {
            /* No such edge yet: create the child node behind a new edge,
             * set its depth and record it in the automata's all_nodes */
            child = node_create_next(cur, ch);
            child->depth = cur->depth + 1;
            ac_automata_register_nodeptr(thiz, child);
        }

        cur = child;
        idx++;
    }

    /* The node reached by the last character already ends a pattern */
    if (cur->final)
        return ACERR_DUPLICATE_PATTERN;

    /* Mark the end of the pattern and attach the pattern to that node */
    cur->final = 1;
    node_register_matchstr(cur, patt);
    thiz->total_patterns++;

    return ACERR_SUCCESS;
}

最终形成的树图如下:

Aho-Corasick算法_第1张图片

    此时,所有节点的failure_node都为NULL。

3.步骤三---为树上的每一个节点设置失败跳转节点


/******************************************************************************
 * FUNCTION: ac_automata_set_failure
 * Finds the failure node for the given node.
 * alphas[0 .. node->depth-1] is read as the characters on the path from the
 * root to 'node'. Every proper suffix of that path, longest first, is walked
 * from the root; the first suffix that exists in the trie supplies the
 * failure node. Example: with depth = 4 and alphas[1..3] = "tao" the suffixes
 * tried are "tao", "ao" and "o", in this order.
 * If no suffix can be walked, the root is used, unless node->failure_node
 * was already set.
******************************************************************************/
static void ac_automata_set_failure
    (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
{
    unsigned int start = 1;
    unsigned int k;
    AC_NODE_t * cand;

    while (start < node->depth)
    {
        /* Walk alphas[start .. depth-1] from the root; 'cand' turns NULL as
         * soon as an edge is missing */
        cand = thiz->root;
        k = start;
        while (k < node->depth && cand)
        {
            cand = node_find_next (cand, alphas[k]);
            k++;
        }

        if (cand)
        {
            /* The whole suffix exists in the trie: this is the failure node */
            node->failure_node = cand;
            break;
        }

        start++;
    }

    /* No failure node set: fall back to the root */
    if (!node->failure_node)
        node->failure_node = thiz->root;
}


/******************************************************************************
 * FUNCTION: ac_automata_traverse_setfailure
 * Traverses all automata nodes below 'node' using DFS (Depth First Search)
 * and sets the failure node of every node it passes through. 'alphas' is a
 * caller-provided buffer in which the characters of the current path are
 * kept, indexed by the depth of the edge's source node.
 * This function must be called after adding the last pattern to the
 * automata, i.e. after calling it no further pattern can be added.
******************************************************************************/
static void ac_automata_traverse_setfailure
    (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
{
    unsigned int edge = 0;
    AC_NODE_t * child;

    while (edge < node->outgoing_degree)
    {
        /* Record the character of this edge at the current depth */
        alphas[node->depth] = node->outgoing[edge].alpha;
        child = node->outgoing[edge].next;

        /* Find the failure node of the child first ... */
        ac_automata_set_failure (thiz, child, alphas);

        /* ... then descend into its own sub-tree */
        ac_automata_traverse_setfailure (thiz, child, alphas);

        edge++;
    }
}


/******************************************************************************
 * FUNCTION: ac_automata_finalize
 * Locates the failure node for all nodes and collects all matched patterns
 * for every node. It also sorts the outgoing edges of every node, so binary
 * search can be performed on them. After calling this function the automata
 * is closed and does not accept new patterns.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
******************************************************************************/
void ac_automata_finalize (AC_AUTOMATA_t * thiz)
{
    /* Scratch buffer for the characters of the path being traversed */
    AC_ALPHABET_t path[AC_PATTRN_MAX_LENGTH];
    AC_NODE_t * cur;
    unsigned int idx = 0;

    /* Depth-first walk over the trie that sets every node's failure node */
    ac_automata_traverse_setfailure (thiz, thiz->root, path);

    while (idx < thiz->all_nodes_num)
    {
        cur = thiz->all_nodes[idx];
        ac_automata_union_matchstrs (cur);
        node_sort_edges (cur); /* sort the outgoing edges of this node */
        idx++;
    }

    thiz->automata_open = 0; /* do not accept patterns any more */
}



4. 步骤四---查找


/******************************************************************************
 * FUNCTION: node_findbs_next
 * Finds the next node for a given alpha using binary search over the node's
 * outgoing edges. It is meant to be used after the pre-processing stage in
 * which the edges are sorted.
 * RETURN VALUE:
 * the node behind the edge labelled 'alpha', or NULL if there is no such edge
******************************************************************************/
AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha)
{
    int lo = 0;
    int hi = thiz->outgoing_degree - 1;
    int probe;
    AC_ALPHABET_t label;

    /* Closed interval [lo, hi]; empty once lo passes hi */
    while (lo <= hi)
    {
        probe = (lo + hi) / 2;
        label = thiz->outgoing[probe].alpha;

        if (label == alpha)
            return (thiz->outgoing[probe].next);

        if (label < alpha)
            lo = probe + 1;
        else
            hi = probe - 1;
    }

    return NULL;
}


/******************************************************************************
 * FUNCTION: ac_automata_search
 * Searches the input text using the given automata. On every match event it
 * calls the call-back function, which in turn returns an integer value to
 * ac_automata_search(): 0 means continue the search, non-0 means stop the
 * search and return to the caller.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
 * AC_TEXT_t * text: the input text that must be searched
 * int keep: non-0 if the input text is the successive chunk of the previously
 * given text; 0 resets the automata before searching
 * AC_MATCH_CALBACK_f callback: the function called on every match event
 * void * param: forwarded unchanged to the call-back function; useful for
 * passing data between the caller and the call-back.
 * RETURN VALUE:
 * -1: failed; automata is not finalized
 *  0: success; input text was searched to the end
 *  1: success; input text was searched partially (call-back broke the loop).
 *     In this case current_node and base_position are NOT updated.
******************************************************************************/
int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * text, int keep, 
        AC_MATCH_CALBACK_f callback, void * param)
{
    unsigned long pos = 0;
    AC_NODE_t * node;
    AC_NODE_t * step;
    AC_MATCH_t hit;

    if (thiz->automata_open)
        /* ac_automata_finalize() must be called first */
        return -1;

    thiz->text = 0;

    if (!keep)
        /* Fresh search: start over (per the original notes the reset puts
         * current_node back at the root) */
        ac_automata_reset(thiz);

    node = thiz->current_node;

    /* This is the main search loop.
     * it must be as lightweight as possible. */
    while (pos < text->length)
    {
        /* Binary-search the current node's edges for the current character */
        step = node_findbs_next(node, text->astring[pos]);

        if (step)
        {
            /* Edge found: take it and consume the character */
            node = step;
            pos++;
        }
        else if (node->failure_node /* we are not in the root node */)
        {
            /* No edge: fall back to the failure node and retry the same
             * character */
            node = node->failure_node;
        }
        else
        {
            /* No edge and no failure node: skip the character */
            pos++;
        }

        /* Report only when we arrived here through an alphabet transition
         * ('step' is set). After a fall-back to a failure node we must not
         * report, because that match was reported in the previous node. */
        if (step && node->final)
        {
            hit.position = pos + thiz->base_position;
            hit.match_num = node->matched_patterns_num;
            hit.patterns = node->matched_patterns;

            /* we found a match! do call-back */
            if (callback(&hit, param))
                return 1;
        }
    }

    /* save status variables, so that a later call with keep != 0 can resume */
    thiz->current_node = node;
    thiz->base_position += pos;
    return 0;
}



参考文档
1.  一个开源AC算法源码分析: http://blog.csdn.net/WJ_1062/article/details/48751951 
2.  从头到尾彻底理解KMP: http://blog.csdn.net/v_july_v/article/details/7041827 

你可能感兴趣的:(Aho-Corasick算法)