AC算法实现这几天一直都在看多模式匹配的算法,昨天为了赶着能够在信息检索的课上show一下,就赶工写了一个AC算法的程序。实际上程序大部分代码来自于Snort,我做了一定的修改,将NFA和转化为DFA的函数合并到了一块,并且删除了很多冗余的变量和函数,感觉Snort代码写的有些乱,自我感觉我的代码应该比snort的效率和可读性都要高出一些。
我写的程序能够打印出每一个匹配到的关键词的位置,并且对关键词出现的次数进行统计。
下面是对gpl文件以free、software和copy为关键词进行的搜索。
Search the gpl.txt Match KeyWord COPY at 4 line 1 char Match KeyWord FREE at 4 line 26 char Match KeyWord SOFTWARE at 4 line 31 char Match KeyWord COPY at 6 line 26 char Match KeyWord SOFTWARE at 11 line 24 char Match KeyWord FREE at 12 line 0 char Match KeyWord FREE at 13 line 38 char Match KeyWord FREE at 13 line 66 char Match KeyWord SOFTWARE at 14 line 0 char Match KeyWord SOFTWARE at 14 line 27 char Match KeyWord FREE at 14 line 39 char Match KeyWord FREE at 15 line 46 char Match KeyWord SOFTWARE at 15 line 51 char Match KeyWord SOFTWARE at 16 line 13 char Match KeyWord FREE at 17 line 23 char Match KeyWord SOFTWARE at 17 line 28 char Match KeyWord SOFTWARE at 17 line 48 char Match KeyWord FREE at 21 line 19 char Match KeyWord SOFTWARE at 21 line 24 char Match KeyWord FREE at 21 line 54 char Match KeyWord FREE at 23 line 9 char Match KeyWord FREE at 23 line 41 char Match KeyWord SOFTWARE at 23 line 46 char Match KeyWord SOFTWARE at 25 line 40 char Match KeyWord FREE at 26 line 7 char Match KeyWord SOFTWARE at 31 line 25 char Match KeyWord COPY at 39 line 45 char Match KeyWord SOFTWARE at 39 line 59 char Match KeyWord COPY at 40 line 63 char Match KeyWord SOFTWARE at 41 line 29 char Match KeyWord FREE at 44 line 61 char Match KeyWord SOFTWARE at 45 line 0 char Match KeyWord SOFTWARE at 45 line 18 char Match KeyWord FREE at 50 line 15 char Match KeyWord SOFTWARE at 50 line 56 char Match KeyWord FREE at 51 line 63 char Match KeyWord FREE at 54 line 39 char Match KeyWord COPY at 56 line 39 char Match KeyWord COPY at 60 line 28 char Match KeyWord COPY at 63 line 23 char Match KeyWord COPY at 66 line 54 char Match KeyWord COPY at 72 line 22 char Match KeyWord COPY at 79 line 13 char Match KeyWord COPY at 81 line 48 char Match KeyWord COPY at 82 line 0 char Match KeyWord COPY at 84 line 47 char Match KeyWord COPY at 87 line 60 char Match KeyWord COPY at 90 line 25 char Match KeyWord COPY at 91 line 53 char Match KeyWord COPY at 106 line 42 char Match KeyWord COPY at 109 line 57 char Match KeyWord COPY at 134 line 13 char Match KeyWord SOFTWARE at 140 line 51 char Match KeyWord COPY at 145 line 21 char Match KeyWord SOFTWARE at 147 line 25 char Match KeyWord COPY at 167 line 10 char Match KeyWord COPY at 168 line 10 char Match KeyWord COPY at 170 line 13 char Match KeyWord COPY at 172 line 17 char Match KeyWord COPY at 174 line 13 char Match KeyWord COPY at 186 line 33 char Match KeyWord COPY at 191 line 21 char Match KeyWord FREE at 205 line 33 char Match KeyWord FREE at 218 line 17 char Match KeyWord SOFTWARE at 218 line 22 char Match KeyWord SOFTWARE at 220 line 44 char Match KeyWord SOFTWARE at 223 line 14 char Match KeyWord COPY at 230 line 42 char Match KeyWord COPY at 231 line 9 char Match KeyWord FREE at 237 line 9 char Match KeyWord SOFTWARE at 237 line 14 char Match KeyWord FREE at 245 line 64 char Match KeyWord SOFTWARE at 246 line 0 char Match KeyWord FREE at 247 line 63 char Match KeyWord SOFTWARE at 247 line 68 char Match KeyWord FREE at 250 line 65 char Match KeyWord SOFTWARE at 252 line 28 char Match KeyWord COPY at 252 line 46 char Match KeyWord FREE at 252 line 65 char Match KeyWord SOFTWARE at 253 line 0 char Match KeyWord FREE at 253 line 34 char Match KeyWord SOFTWARE at 253 line 39 char Match KeyWord FREE at 255 line 18 char Match KeyWord FREE at 255 line 56 char Match KeyWord SOFTWARE at 255 line 61 char Match KeyWord SOFTWARE at 256 line 38 char Match KeyWord FREE at 260 line 38 char Match KeyWord COPY at 262 line 32 char Match KeyWord COPY at 271 line 9 char Match KeyWord FREE at 286 line 0 char Match KeyWord SOFTWARE at 286 line 5 char Match KeyWord COPY at 291 line 5 char Match KeyWord COPY at 294 line 4 char Match KeyWord FREE at 296 line 20 char Match KeyWord SOFTWARE at 296 line 25 char Match KeyWord FREE at 298 line 8 char Match KeyWord SOFTWARE at 298 line 13 char Match KeyWord COPY at 306 line 31 char Match KeyWord FREE at 307 line 44 char Match KeyWord SOFTWARE at 307 line 49 char Match KeyWord COPY at 315 line 28 char Match KeyWord FREE at 317 line 12 char Match KeyWord SOFTWARE at 317 line 17 char Match KeyWord COPY at 326 line 27 char Match KeyWord COPY at 329 line 39 char ### Summary ### FREE : 32 SOFTWARE : 35 COPY : 38 |
entry.c
#include "acsmx.h" /* * Text Data Buffer */ unsigned char text[MAXLEN]; extern int nline; int main (int argc, char **argv) { int i, nocase = 0; FILE *fd; char filename[20]; ACSM_STRUCT * acsm; if (argc < 3) { fprintf (stderr,"Usage: acsmx filename pattern1 pattern2 ... -nocase\n"); exit (0); } acsm = acsmNew (); strcpy (filename, argv[1]); fd = fopen(filename,"r"); if(fd == NULL) { fprintf(stderr,"Open file error!\n"); exit(1); } for (i = 1; i < argc; i++) if (strcmp (argv[i], "-nocase") == 0) nocase = 1; for (i = 2; i < argc; i++) { if (argv[i][0] == '-') continue; acsmAddPattern (acsm, argv[i], strlen (argv[i]), nocase,argv[i]); } /* Generate GtoTo Table and Fail Table */ acsmCompile (acsm); /*Search Pattern*/ while ( fgets(text,MAXLEN,fd) ) { acsmSearch (acsm, text, strlen (text), PrintMatch); nline++; } PrintSummary(acsm->acsmPatterns); acsmFree (acsm); printf ("\n### AC Match Finished ###\n"); system("pause"); return (0); } |
acsm.h
/* ** ACSMX.H ** ** */ #ifndef ACSMX_H #define ACSMX_H #include <stdio.h> #include <stdlib.h> #include <string.h> /* * Prototypes */ #define ALPHABET_SIZE 256 #define MAXLEN 256 #define ACSM_FAIL_STATE -1 typedef struct _acsm_pattern { struct _acsm_pattern *next; unsigned char *patrn; unsigned char *casepatrn; int n; int nocase; void * id; int nmatch; } ACSM_PATTERN; typedef struct { /* Next state - based on input character */ int NextState[ ALPHABET_SIZE ]; /* Failure state - used while building NFA & DFA */ int FailState; /* List of patterns that end here, if any */ ACSM_PATTERN *MatchList; }ACSM_STATETABLE; /* * State machine Struct */ typedef struct { int acsmMaxStates; int acsmNumStates; ACSM_PATTERN * acsmPatterns; ACSM_STATETABLE * acsmStateTable; }ACSM_STRUCT; /* * Prototypes */ ACSM_STRUCT * acsmNew (); int acsmAddPattern( ACSM_STRUCT * p, unsigned char * pat, int n,int nocase); int acsmCompile ( ACSM_STRUCT * acsm ); int acsmSearch ( ACSM_STRUCT * acsm,unsigned char * T, int n, int (*Match) (ACSM_PATTERN * mlist, int nline,int index)); void acsmFree ( ACSM_STRUCT * acsm ); void PrintMatch (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index) ; void PrintSummary (ACSM_PATTERN * pattern) ; #endif |
acsm.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "acsmx.h"
#define MEMASSERT(p,s) if(!p){fprintf(stderr,"ACSM-No Memory: %s!\n",s);exit(0);}
/*Define the number of the line,when match a keyword*/
extern int nline=1;
/*
* Malloc the AC Memory
*/
static void *AC_MALLOC (int n)
{
void *p;
p = malloc (n);
return p;
}
/*
*Free the AC Memory
*/
static void AC_FREE (void *p)
{
if (p)
free (p);
}
/*
* Simple QUEUE NODE
*/
typedef struct _qnode
{
int state;
struct _qnode *next;
}QNODE;
/*
* Simple QUEUE Structure
*/
typedef struct _queue
{
QNODE * head, *tail;
int count;
}QUEUE;
/*
*Init the Queue
*/
static void queue_init (QUEUE * s)
{
s->head = s->tail = 0;
s->count = 0;
}
/*
* Add Tail Item to queue
*/
static void queue_add (QUEUE * s, int state)
{
QNODE * q;
/*Queue is empty*/
if (!s->head)
{
q = s->tail = s->head = (QNODE *) AC_MALLOC (sizeof (QNODE));
/*if malloc failed,exit the problom*/
MEMASSERT (q, "queue_add");
q->state = state;
q->next = 0; /*Set the New Node's Next Null*/
}
else
{
q = (QNODE *) AC_MALLOC (sizeof (QNODE));
MEMASSERT (q, "queue_add");
q->state = state;
q->next = 0;
/*Add the new Node into the queue*/
s->tail->next = q;
/*set the new node is the Queue's Tail*/
s->tail = q;
}
s->count++;
}
/*
* Remove Head Item from queue
*/
static int queue_remove (QUEUE * s)
{
int state = 0;
QNODE * q;
/*Remove A QueueNode From the head of the Queue*/
if (s->head)
{
q = s->head;
state = q->state;
s->head = s->head->next;
s->count--;
/*If Queue is Empty,After Remove A QueueNode*/
if (!s->head)
{
s->tail = 0;
s->count = 0;
}
/*Free the QueNode Memory*/
AC_FREE (q);
}
return state;
}
/*
*Return The count of the Node in the Queue
*/
static int queue_count (QUEUE * s)
{
return s->count;
}
/*
*Free the Queue Memory
*/
static void queue_free (QUEUE * s)
{
while (queue_count (s))
{
queue_remove (s);
}
}
/*
** Case Translation Table
*/
static unsigned char xlatcase[256];
/*
* Init the xlatcase Table,Trans alpha to UpperMode
* Just for the NoCase State
*/
static void init_xlatcase ()
{
int i;
for (i = 0; i < 256; i++)
{
xlatcase[i] = toupper (i);
}
}
/*
*Convert the pattern string into upper
*/
static void ConvertCaseEx (unsigned char *d, unsigned char *s, int m)
{
int i;
for (i = 0; i < m; i++)
{
d[i] = xlatcase[s[i]];
}
}
/*
* Add a pattern to the list of patterns terminated at this state.
* Insert at front of list.
*/
static void AddMatchListEntry (ACSM_STRUCT * acsm, int state, ACSM_PATTERN * px)
{
ACSM_PATTERN * p;
p = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
MEMASSERT (p, "AddMatchListEntry");
memcpy (p, px, sizeof (ACSM_PATTERN));
/*Add the new pattern to the pattern list*/
p->next = acsm->acsmStateTable[state].MatchList;
acsm->acsmStateTable[state].MatchList = p;
}
/*
* Add Pattern States
*/
static void AddPatternStates (ACSM_STRUCT * acsm, ACSM_PATTERN * p)
{
unsigned char *pattern;
int state=0, next, n;
n = p->n; /*The number of alpha in the pattern string*/
pattern = p->patrn;
/*
* Match up pattern with existing states
*/
for (; n > 0; pattern++, n--)
{
next = acsm->acsmStateTable[state].NextState[*pattern];
if (next == ACSM_FAIL_STATE)
break;
state = next;
}
/*
* Add new states for the rest of the pattern bytes, 1 state per byte
*/
for (; n > 0; pattern++, n--)
{
acsm->acsmNumStates++;
acsm->acsmStateTable[state].NextState[*pattern] = acsm->acsmNumStates;
state = acsm->acsmNumStates;
}
/*Here,An accept state,just add into the MatchListof the state*/
AddMatchListEntry (acsm, state, p);
}
/*
* Build Non-Deterministic Finite Automata
*/
static void Build_NFA (ACSM_STRUCT * acsm)
{
int r, s;
int i;
QUEUE q, *queue = &q;
ACSM_PATTERN * mlist=0;
ACSM_PATTERN * px=0;
/* Init a Queue */
queue_init (queue);
/* Add the state 0 transitions 1st */
/*1st depth Node's FailState is 0, fail(x)=0 */
for (i = 0; i < ALPHABET_SIZE; i++)
{
s = acsm->acsmStateTable[0].NextState[i];
if (s)
{
queue_add (queue, s);
acsm->acsmStateTable[s].FailState = 0;
}
}
/* Build the fail state transitions for each valid state */
while (queue_count (queue) > 0)
{
r = queue_remove (queue);
/* Find Final States for any Failure */
for (i = 0; i < ALPHABET_SIZE; i++)
{
int fs, next;
/*** Note NextState[i] is a const variable in this block ***/
if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)
{
queue_add (queue, s);
fs = acsm->acsmStateTable[r].FailState;
/*
* Locate the next valid state for 'i' starting at s
*/
/**** Note the variable "next" ****/
/*** Note "NextState[i]" is a const variable in this block ***/
while ((next=acsm->acsmStateTable[fs].NextState[i]) ==
ACSM_FAIL_STATE)
{
fs = acsm->acsmStateTable[fs].FailState;
}
/*
* Update 's' state failure state to point to the next valid state
*/
acsm->acsmStateTable[s].FailState = next;
}
else
{
acsm->acsmStateTable[r].NextState[i] =
acsm->acsmStateTable[acsm->acsmStateTable[r].FailState].NextState[i];
}
}
}
/* Clean up the queue */
queue_free (queue);
}
/*
* Build Deterministic Finite Automata from NFA
*/
static void Convert_NFA_To_DFA (ACSM_STRUCT * acsm)
{
int r, s;
int i;
QUEUE q, *queue = &q;
/* Init a Queue */
queue_init (queue);
/* Add the state 0 transitions 1st */
for (i = 0; i < ALPHABET_SIZE; i++)
{
s = acsm->acsmStateTable[0].NextState[i];
if (s)
{
queue_add (queue, s);
}
}
/* Start building the next layer of transitions */
while (queue_count (queue) > 0)
{
r = queue_remove (queue);
/* State is a branch state */
for (i = 0; i < ALPHABET_SIZE; i++)
{
if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)
{
queue_add (queue, s);
}
else
{
acsm->acsmStateTable[r].NextState[i] =
acsm->acsmStateTable[acsm->acsmStateTable[r].FailState].NextState[i];
}
}
}
/* Clean up the queue */
queue_free (queue);
}
/*
* Init the acsm DataStruct
*/
ACSM_STRUCT * acsmNew ()
{
ACSM_STRUCT * p;
init_xlatcase ();
p = (ACSM_STRUCT *) AC_MALLOC (sizeof (ACSM_STRUCT));
MEMASSERT (p, "acsmNew");
if (p)
memset (p, 0, sizeof (ACSM_STRUCT));
return p;
}
/*
* Add a pattern to the list of patterns for this state machine
*/
int acsmAddPattern (ACSM_STRUCT * p, unsigned char *pat, int n, int nocase)
{
ACSM_PATTERN * plist;
plist = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
MEMASSERT (plist, "acsmAddPattern");
plist->patrn = (unsigned char *) AC_MALLOC (n+1);
memset(plist->patrn+n,0,1);
ConvertCaseEx (plist->patrn, pat, n);
plist->casepatrn = (unsigned char *) AC_MALLOC (n+1);
memset(plist->casepatrn+n,0,1);
memcpy (plist->casepatrn, pat, n);
plist->n = n;
plist->nocase = nocase;
plist->nmatch=0;
/*Add the pattern into the pattern list*/
plist->next = p->acsmPatterns;
p->acsmPatterns = plist;
return 0;
}
/*
* Compile State Machine
*/
int acsmCompile (ACSM_STRUCT * acsm)
{
int i, k;
ACSM_PATTERN * plist;
/* Count number of states */
acsm->acsmMaxStates = 1; /*State 0*/
for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
{
acsm->acsmMaxStates += plist->n;
}
acsm->acsmStateTable = (ACSM_STATETABLE *) AC_MALLOC (sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);
MEMASSERT (acsm->acsmStateTable, "acsmCompile");
memset (acsm->acsmStateTable, 0,sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);
/* Initialize state zero as a branch */
acsm->acsmNumStates = 0;
/* Initialize all States NextStates to FAILED */
for (k = 0; k < acsm->acsmMaxStates; k++)
{
for (i = 0; i < ALPHABET_SIZE; i++)
{
acsm->acsmStateTable[k].NextState[i] = ACSM_FAIL_STATE;
}
}
/* This is very import */
/* Add each Pattern to the State Table */
for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
{
AddPatternStates (acsm, plist);
}
/* Set all failed state transitions which from state 0 to return to the 0'th state */
for (i = 0; i < ALPHABET_SIZE; i++)
{
if (acsm->acsmStateTable[0].NextState[i] == ACSM_FAIL_STATE)
{
acsm->acsmStateTable[0].NextState[i] = 0;
}
}
/* Build the NFA */
Build_NFA (acsm);
/* Convert the NFA to a DFA */
//Convert_NFA_To_DFA (acsm);
return 0;
}
/*64KB Memory*/
static unsigned char Tc[64*1024];
/*
* Search Text or Binary Data for Pattern matches
*/
int acsmSearch (ACSM_STRUCT * acsm, unsigned char *Tx, int n,void (*PrintMatch) (ACSM_PATTERN * mlist, int nline,int index))
{
int state;
ACSM_PATTERN * mlist;
unsigned char *Tend;
ACSM_STATETABLE * StateTable = acsm->acsmStateTable;
int nfound = 0; /*Number of the found(matched) patten string*/
unsigned char *T;
int index;
/* Case conversion */
ConvertCaseEx (Tc, Tx, n);
T = Tc;
Tend = T + n;
for (state = 0; T < Tend; T++)
{
state = StateTable[state].NextState[*T];
/* State is a accept state? */
if( StateTable[state].MatchList != NULL )
{
for( mlist=StateTable[state].MatchList; mlist!=NULL;
mlist=mlist->next )
{
/*Get the index of the Match Pattern String in the Text*/
index = T - mlist->n + 1 - Tc;
//mlist->nmatch++;
nfound++;
PrintMatch (acsm->acsmPatterns,mlist, nline,index);
}
}
}
return nfound;
}
/*
* Free all memory
*/
void acsmFree (ACSM_STRUCT * acsm)
{
int i;
ACSM_PATTERN * mlist, *ilist;
for (i = 0; i < acsm->acsmMaxStates; i++)
{
if (acsm->acsmStateTable[i].MatchList != NULL)
{
mlist = acsm->acsmStateTable[i].MatchList;
while (mlist)
{
ilist = mlist;
mlist = mlist->next;
AC_FREE (ilist);
}
}
}
AC_FREE (acsm->acsmStateTable);
}
/*
* Print A Match String's Information
*/
void PrintMatch (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index)
{
/* Count the Each Match Pattern */
ACSM_PATTERN *temp = pattern;
for (;temp!=NULL;temp=temp->next)
{
if (!strcmp(temp->patrn,mlist->patrn))
{
temp->nmatch++;
}
}
if(mlist->nocase)
fprintf (stdout, "Match KeyWord %s at %d line %d char\n", mlist->patrn,nline,index);
else
fprintf (stdout, "Match KeyWord %s at %d line %d char\n", mlist->casepatrn,nline,index);
}
/*
* Print Summary Information of the AC Match
*/
void PrintSummary (ACSM_PATTERN * pattern)
{
ACSM_PATTERN * mlist = pattern;
printf("\n### Summary ###\n");
for (;mlist!=NULL;mlist=mlist->next)
{
if(mlist->nocase)
printf("%12s : %5d\n",mlist->patrn,mlist->nmatch);
else
printf("%12s : %5d\n",mlist->casepatrn,mlist->nmatch);
}
}