本文中采用的版本是multifast-v1.4.2。
/* * example1.c: This program illustrates how to use ahocorasick library * it shows how to use the search interface to find patterns * This file is part of multifast. * Copyright 2010-2013 Kamiar Kanani <[email protected]> multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see <http://www.gnu.org/licenses/>. */ #include <stdio.h> #include <string.h> #include "ahocorasick.h" AC_ALPHABET_t * sample_patterns[] = { "taobao", "youku", "weixin", "weibo", "iqiyi", "baidu" }; #define PATTERN_NUMBER (sizeof(sample_patterns)/sizeof(AC_ALPHABET_t *)) AC_ALPHABET_t * input_text1 = {"find.baidu.com/123"}; AC_ALPHABET_t * input_text2 = {"buy.taobao.com"}; AC_ALPHABET_t * input_text3 = {"video.youku.cn"}; // 1. Define a call-back function of AC_MATCH_CALBACK_t int match_handler (AC_MATCH_t * matchp, void * param) { unsigned int j; // in this example we don't use param printf ("@ %2ld: ", matchp->position); for (j=0; j < matchp->match_num; j++) printf ("#%ld (%s), ", matchp->patterns[j].rep.number, matchp->patterns[j].astring); // CAUTION: be careful about using m->matched_patterns[j].astring // if 'astring' has permanent allocation inside your program's // memory area, you can use it. otherwise it will point to // an incorrect memory place. 
printf ("\n"); return 0; // return 0 : continue searching // return none zero : stop searching // as soon as you get enough from search results, you can stop search and // return from ac_automata_search() and continue the rest of your program. // e.g. if you only need first N matches, define a counter and return none // zero after the counter exceeds N. // to find all matches always return 0 } int main (int argc, char ** argv) { unsigned int i; // 2. Define AC variables AC_AUTOMATA_t *atm; AC_PATTERN_t tmp_patt; AC_TEXT_t tmp_text; // 3. Get a new automata atm = ac_automata_init (); // 4. Add patterns to automata for (i=0; i<PATTERN_NUMBER; i++) { tmp_patt.astring = sample_patterns[i]; tmp_patt.rep.number = i+1; // optional tmp_patt.length = strlen (tmp_patt.astring); ac_automata_add (atm, &tmp_patt); } // 5. Finalize automata. ac_automata_finalize (atm); // after you have finished with adding patterns you must finalize the automata // from now you can not add patterns anymore. // 5.1. Display automata //ac_automata_display (atm, 'n'); // the second argument determines the cast type of the pattern representative. // 'n': as number // 's': as string // because we use the integer part of union (tmp_patt.rep.number) so we used 'n' printf ("Searching: \"%s\"\n", input_text1); // 6. Set input text tmp_text.astring = input_text1; tmp_text.length = strlen (tmp_text.astring); // 7. Do search ac_automata_search (atm, &tmp_text, 0, match_handler, 0); // the 5th option is a (void *), and it will be forwarded to the callback // function. you can pass everything you want to the callback function // using this argument. // in this example we don't send a parameter to callback function. 
// a typical practice is to define a struct that encloses whatever you want // to send the callback function, including input and output variables printf ("Searching: \"%s\"\n", input_text2); // do another search tmp_text.astring = input_text2; tmp_text.length = strlen (tmp_text.astring); ac_automata_search (atm, &tmp_text, 0, match_handler, 0); printf ("Searching: \"%s\" with \'keep\' enabled\n", input_text3); // and another tmp_text.astring = input_text3; tmp_text.length = strlen (tmp_text.astring); ac_automata_search (atm, &tmp_text, 1, match_handler, 0); // when the keep option (3rd argument) in set, then the automata // considers that the given text is the next chunk of the previous text. // to understand the difference try it with 0 and 1 and compare the result // 8. Release automata ac_automata_release (atm); // do not forget to release the automata after you have done with it return 0; }
1. 步骤一---初始化atm
AC_AUTOMATA_t * ac_automata_init () { AC_AUTOMATA_t * thiz = (AC_AUTOMATA_t *)malloc(sizeof(AC_AUTOMATA_t)); /*分配AC_AUTOMATA_t结构大小的内存给thiz*/ memset (thiz, 0, sizeof(AC_AUTOMATA_t));/*初始化内存空间为0*/ thiz->root = node_create (); /*创建根节点*/ thiz->all_nodes_max = REALLOC_CHUNK_ALLNODES; /*最大节点长度为200 #define REALLOC_CHUNK_ALLNODES 200*/ thiz->all_nodes = (AC_NODE_t **) malloc (thiz->all_nodes_max*sizeof(AC_NODE_t *)); /*为所有200个字节点分配内存大小*/ ac_automata_register_nodeptr (thiz, thiz->root); /*将根节点放入all_node中*/ ac_automata_reset (thiz);/*更新thiz的当前节点为根节点,base_position为0*/ thiz->total_patterns = 0; thiz->automata_open = 1;/*标记自动机可以添加模式串(open = 1)*/ return thiz; }2. 步骤二---将对应的模式字符串加入到tire树中
/******************************************************************************
 * FUNCTION: ac_automata_add
 * Adds a pattern to the automata. In effect this function builds the trie:
 * it walks down from the root one alphabet at a time, reusing the edges that
 * already exist and creating a node for every alphabet that has none.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
 * AC_PATTERN_t * patt: the pointer to the added pattern
 * RETURN VALUE: AC_STATUS_t
 * ACERR_AUTOMATA_CLOSED   : the automata no longer accepts patterns
 * ACERR_ZERO_PATTERN      : the pattern length is 0
 * ACERR_LONG_PATTERN      : the pattern is longer than AC_PATTRN_MAX_LENGTH
 * ACERR_DUPLICATE_PATTERN : the same pattern was already added
 * ACERR_SUCCESS           : the pattern was added
******************************************************************************/
AC_STATUS_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt)
{
    unsigned int pos;
    AC_NODE_t * cursor = thiz->root;

    /* automata_open is set to 1 by ac_automata_init() and cleared once the
     * automata has been finalized */
    if (!thiz->automata_open)
        return ACERR_AUTOMATA_CLOSED;

    if (!patt->length)
        return ACERR_ZERO_PATTERN;

    /* AC_PATTRN_MAX_LENGTH is 1024 according to the original annotation */
    if (patt->length > AC_PATTRN_MAX_LENGTH)
        return ACERR_LONG_PATTERN;

    for (pos = 0; pos < patt->length; pos++)
    {
        AC_ALPHABET_t symbol = patt->astring[pos];

        /* Look for an outgoing edge of 'cursor' labelled with 'symbol' */
        AC_NODE_t * child = node_find_next (cursor, symbol);

        if (!child)
        {
            /* No such edge: create the child node (per the original
             * annotation this also registers the new outgoing edge on
             * 'cursor'), set its depth and record it in all_nodes. */
            child = node_create_next (cursor, symbol);
            child->depth = cursor->depth + 1;
            ac_automata_register_nodeptr (thiz, child);
        }

        cursor = child;
    }

    /* A node that is already final means this exact pattern exists */
    if (cursor->final)
        return ACERR_DUPLICATE_PATTERN;

    cursor->final = 1;

    /* Attach the whole pattern to the node of its last alphabet */
    node_register_matchstr (cursor, patt);
    thiz->total_patterns++;

    return ACERR_SUCCESS;
}
最终形成的树图如下:
此时,所有节点的failure_node都为NULL。
3.步骤三---为树上的每一个节点设置失败跳转节点
/******************************************************************************
 * FUNCTION: ac_automata_set_failure
 * Finds the failure node for the given node.
 *
 * alphas[0 .. node->depth-1] holds the alphabets on the path from the root
 * to 'node'. Every proper suffix of that path is tried, longest first, by
 * walking it from the root. Example: with alphas[1]='t', alphas[2]='a',
 * alphas[3]='o' and depth 4, the strings "tao", "ao" and "o" are tried in
 * that order. The node reached by the first suffix that can be walked
 * completely becomes the failure node; if none can, the root is used.
******************************************************************************/
static void ac_automata_set_failure
    (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
{
    unsigned int start;

    for (start = 1; start < node->depth; start++)
    {
        AC_NODE_t * candidate = thiz->root;
        unsigned int k = start;

        /* Follow alphas[start .. depth-1] from the root; 'candidate' turns
         * NULL as soon as an edge is missing. */
        while (k < node->depth && candidate)
        {
            candidate = node_find_next (candidate, alphas[k]);
            k++;
        }

        if (candidate)
        {
            /* The whole suffix exists in the trie */
            node->failure_node = candidate;
            break;
        }
    }

    /* Fall back to the root when no failure node is set */
    if (!node->failure_node)
        node->failure_node = thiz->root;
}

/******************************************************************************
 * FUNCTION: ac_automata_traverse_setfailure
 * Traverses the nodes below 'node' using DFS (Depth First Search) and sets
 * the failure node of every node it reaches. 'alphas' is scratch space in
 * which the alphabets of the current root-to-node path are kept. This
 * function is called by ac_automata_finalize(), after the last pattern has
 * been added.
******************************************************************************/
static void ac_automata_traverse_setfailure
    (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
{
    unsigned int edge = 0;

    while (edge < node->outgoing_degree)
    {
        AC_NODE_t * child = node->outgoing[edge].next;

        /* Extend the recorded path with the alphabet of this edge */
        alphas[node->depth] = node->outgoing[edge].alpha;

        /* Set the failure node of the child ... */
        ac_automata_set_failure (thiz, child, alphas);

        /* ... then descend into its subtree */
        ac_automata_traverse_setfailure (thiz, child, alphas);

        edge++;
    }
}

/******************************************************************************
 * FUNCTION: ac_automata_finalize
 * Locates the failure node of all nodes and collects the matched patterns
 * of every node. It also sorts the outgoing edges of each node, so that
 * binary search can be performed on them. After calling this function the
 * automata is closed: no new patterns can be added to it.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
******************************************************************************/
void ac_automata_finalize (AC_AUTOMATA_t * thiz)
{
    /* Path buffer for the DFS; AC_PATTRN_MAX_LENGTH is 1024 according to
     * the original annotation */
    AC_ALPHABET_t path[AC_PATTRN_MAX_LENGTH];
    unsigned int idx = 0;

    /* Depth-first walk over the trie that sets every failure node */
    ac_automata_traverse_setfailure (thiz, thiz->root, path);

    while (idx < thiz->all_nodes_num)
    {
        AC_NODE_t * current = thiz->all_nodes[idx++];

        ac_automata_union_matchstrs (current);
        node_sort_edges (current); /* sort the node's outgoing edges */
    }

    thiz->automata_open = 0; /* do not accept patterns any more */
}
4. 步骤四---查找
/****************************************************************************** * FUNCTION: node_findbs_next * Find out the next node for a given Alpha. this function is used after the * pre-processing stage in which we sort edges. so it uses Binary Search. ******************************************************************************/ AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) { int min, max, mid; AC_ALPHABET_t amid; min = 0; max = thiz->outgoing_degree - 1; while (min <= max) { mid = (min+max) >> 1; amid = thiz->outgoing[mid].alpha; if (alpha > amid) min = mid + 1; else if (alpha < amid) max = mid - 1; else return (thiz->outgoing[mid].next); } return NULL; } /****************************************************************************** * FUNCTION: ac_automata_search * Search in the input text using the given automata. on match event it will * call the call-back function. and the call-back function in turn after doing * its job, will return an integer value to ac_automata_search(). 0 value means * continue search, and non-0 value means stop search and return to the caller. * PARAMS: * AC_AUTOMATA_t * thiz: the pointer to the automata * AC_TEXT_t * txt: the input text that must be searched * int keep: is the input text the successive chunk of the previous given text * void * param: this parameter will be send to call-back function. it is * useful for sending parameter to call-back function from caller function. * RETURN VALUE: * -1: failed; automata is not finalized * 0: success; input text was searched to the end * 1: success; input text was searched partially. 
(callback broke the loop) ******************************************************************************/ int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * text, int keep, AC_MATCH_CALBACK_f callback, void * param) { unsigned long position; AC_NODE_t * current; AC_NODE_t * next; AC_MATCH_t match; if (thiz->automata_open) /* you must call ac_automata_locate_failure() first */ return -1; thiz->text = 0; if (!keep) ac_automata_reset(thiz); /*将current_node重置为root*/ position = 0; current = thiz->current_node; /* This is the main search loop. * it must be as lightweight as possible. */ while (position < text->length) { /*在节点current的outgoing边上二分查找,如果找到则current=next,position++; 如果没有找到,有faliure节点,则current为failure节点,没有faliure节点则直接position++*/ if (!(next = node_findbs_next(current, text->astring[position]))) { if(current->failure_node /* we are not in the root node */) current = current->failure_node; else position++; } else { current = next; position++; } /*找到叶子节点,如果是没有failure节点的情况下,此时current==next*/ if (current->final && next) /* We check 'next' to find out if we came here after a alphabet * transition or due to a fail. in second case we should not report * matching because it was reported in previous node */ { /*将匹配信息记录到match结构中*/ match.position = position + thiz->base_position; match.match_num = current->matched_patterns_num; match.patterns = current->matched_patterns; /* we found a match! do call-back */ if (callback(&match, param)) /*调用回调函数进行匹配时的后续操作*/ return 1; } } /* save status variables */ thiz->current_node = current; /*记录本次结束时的current_node,以便keep参数为1时使用*/ thiz->base_position += position; return 0; }