AC自动机,百万级多模匹配

数据规模

  模式串:127w
  目标文本:750M(没统计有多少字符)

AC自动机

  其实就是trie树模仿KMP算法,构建了一个fail指针,实现匹配失败后不用回溯直接转移。对于结点cur,构建其孩子结点child的fail指针的算法

  1. 如果cur为根节点,设置child.fail为cur,否则下一步;
  2. 如果cur.fail存在与child字符相同的孩子节点fchild,设置child.fail为fchild,否则下一步;
  3. cur = cur.fail;
  4. 跳到1;

问题记录

  1. 百万模式串,且字符不局限于英文字母(汉字占2字节,用2个char存储),所以用孩子兄弟法存储trie树,不预留孩子空间。数据结构:
/* One trie node, stored in child/sibling form so no per-node child table is reserved. */
typedef struct _NODE {
	char c;              /* byte carried by this node; '\0' on the root */
	unsigned int isWord; /* 0: no pattern ends here; non-zero: a pattern ends here (doubles as a counter) */
	struct _NODE *child; /* first child, or NULL */
	struct _NODE *bro;   /* next sibling under the same parent, or NULL */
	struct _NODE *fail;  /* Aho-Corasick fail pointer */
} node;
  1. AC自动机成功匹配一个单词以后!
      当前结点匹配成功的话,它的fail结点对应的串也一定出现在文本中,因为fail结点对应的串是当前结点对应串的后缀;若fail结点也是某个模式串的结尾,该模式串同样匹配成功。
  2. 孩子兄弟表示法的树的广搜
      原想偷个懒,结果找了找,一下还没找到。一开始想写个非递归的,但按照传统的队列辅助方式,如果要实现带记录的遍历,就要记录每层压入的结点数。所以还是用递归怼上去吧…(注意:下面的递归实现实际上是深度优先的先序遍历,只是函数名沿用了bfs。)
/*
 * Walk the subtree rooted at cur and write every node whose isWord is
 * non-zero to fp as a "<path> <isWord>" line.
 *
 * Despite the name, this is a recursive depth-first (pre-order) traversal.
 * The path from the root is kept in a function-static buffer shared by all
 * recursion levels, so the function is not reentrant.
 *
 * cur: node to start from; a node whose c is '\0' (the root) adds nothing
 *      to the path. NULL is ignored.
 * fp:  open output stream.
 */
void bfs(struct _NODE *cur, FILE *fp)
{
	static char stack[100];
	static int top = 0;
	int pushed = 0;

	if (cur == NULL)
		return;

	if (cur->c != '\0') {
		/* Keep one byte for the terminator; a deeper path would overrun
		 * the buffer, so that subtree is skipped instead. */
		if (top >= (int)sizeof(stack) - 1)
			return;
		stack[top++] = cur->c;
		pushed = 1;
		if (cur->isWord > 0) {
			stack[top] = '\0';
			/* isWord is unsigned int, so it must be printed with %u. */
			fprintf(fp, "%s %u\n", stack, cur->isWord);
		}
	}

	for (struct _NODE *tmp = cur->child; tmp != NULL; tmp = tmp->bro)
		bfs(tmp, fp);

	/* Pop only what this call pushed, so siblings see the parent's path. */
	if (pushed)
		--top;
}
  1. 百分比进度显示
      其实这个和AC自动机没什么关系。我以前没有写过数据集这么大的程序,匹配目标文本的时候我还以为是卡住了…所以写了个显示百分比进度的功能,大概思路就是利用文件指针的索引位置计算百分比,程序每循环一定次数显示(刷新)一次。
      但是发现,加了这个功能之后,程序的运行速度明显慢了大概五分之一,应该是比较语句和printf的速度慢于文件读取速度的问题。
      网上也没有找到完美的方法,我猜测应该是用多线程写,但是多线程想要知道主线程的进度就要用临界区变量,感觉这个开销更大啊。
      待考!

源码

  因为要求用C语言写,所以自己写了个queue,就是一个常规的queue,包含push、pop、empty啥的,懒得贴代码了。

/* One trie node, stored in child/sibling form so no per-node child table is reserved. */
typedef struct _NODE {
	char c;              /* byte carried by this node; '\0' on the root */
	unsigned int isWord; /* 0: no pattern ends here; non-zero: a pattern ends here (doubles as a counter) */
	struct _NODE *child; /* first child, or NULL */
	struct _NODE *bro;   /* next sibling under the same parent, or NULL */
	struct _NODE *fail;  /* Aho-Corasick fail pointer */
} node;

/* Handle for a whole trie. */
typedef struct _TREE {
	struct _NODE *root; /* root node of the trie */
	int size;           /* size counter kept alongside the root */
} tree;

/* Cleared by init(); set to true by insertFromFile() after the patterns are
 * loaded and the fail pointers are built. queryInFile() refuses to run while
 * it is false. */
bool isReady;

/*
 * Initialise a trie: allocate a zero-filled root node, reset the size
 * counter and mark the automaton as not ready.
 *
 * tree: handle to initialise; any previous contents are overwritten, not freed.
 *
 * There is no way to report failure through the void return, so the program
 * is terminated if the root node cannot be allocated.
 */
void init(struct _TREE *tree)
{
	/* calloc gives the same all-zero node the malloc+memset pair produced. */
	tree->root = calloc(1, sizeof *tree->root);
	if (tree->root == NULL) {
		fprintf(stderr, "init: out of memory allocating trie root\n");
		exit(EXIT_FAILURE);
	}
	tree->size = 0;
	isReady = false;
}

/*
 * Look through the direct children of cur for the one carrying byte c.
 *
 * Returns that child, or NULL when cur is NULL, has no children, or none of
 * them carries c.
 */
struct _NODE *findNext(struct _NODE *cur, char c)
{
	struct _NODE *sib;

	if (cur == NULL)
		return NULL;

	/* Children form a singly linked list chained through bro. */
	for (sib = cur->child; sib != NULL; sib = sib->bro) {
		if (sib->c == c)
			return sib;
	}
	return NULL;
}

/*
 * Append a new child carrying byte c to the end of cur's child list.
 *
 * No check is made for an existing child with the same byte; callers are
 * expected to try findNext() first.
 *
 * Returns the new node, or NULL when cur is NULL or the allocation fails.
 * On failure the trie is left unchanged.
 */
struct _NODE *putNext(struct _NODE *cur, char c)
{
	if (cur == NULL)
		return NULL;

	/* Allocate before touching the trie so a failure cannot leave a
	 * dangling link. calloc yields the all-zero node that memset produced. */
	struct _NODE *fresh = calloc(1, sizeof *fresh);
	if (fresh == NULL)
		return NULL;
	fresh->c = c;

	/* Find the first empty link: cur->child itself, or the last sibling's bro. */
	struct _NODE **slot = &cur->child;
	while (*slot != NULL)
		slot = &(*slot)->bro;
	*slot = fresh;

	return fresh;
}

/*
 * Insert the string data + index into the trie rooted at cur, creating any
 * missing nodes, and mark the final node as a pattern end (isWord = 1).
 *
 * cur:   node to start from (normally the trie root); NULL is ignored.
 * data:  NUL-terminated pattern; NULL is ignored.
 * index: offset of the first byte of data to insert.
 *
 * If a node cannot be created, the nodes added so far stay in the trie but
 * no pattern end is marked, and a message is written to stderr.
 */
void insert(struct _NODE *cur, const char *data, int index)
{
	if (cur == NULL || data == NULL)
		return;

	/* Iterative form of the original tail recursion. */
	for (; data[index] != '\0'; ++index) {
		struct _NODE *next = findNext(cur, data[index]);
		if (next == NULL)
			next = putNext(cur, data[index]);
		if (next == NULL) {
			/* putNext returns NULL when it cannot create the node. */
			fprintf(stderr, "insert: failed to add a trie node\n");
			return;
		}
		cur = next;
	}
	cur->isWord = 1;
}

//释放trie树空间,太麻烦了,就没写...
void clear(struct _TREE *tree)
{

}

/*
 * Build the fail pointer of every node below root, visiting nodes level by
 * level with a FIFO queue so a parent's fail pointer is always set before
 * its children are processed.
 *
 * root: trie root (recognised by c == '\0'); NULL is ignored. If root is
 *       not the trie root, nothing is queued and no fail pointer changes.
 */
void updateFail(struct _NODE *root)
{
	myQueue queue;
	struct _NODE *kid;

	if (root == NULL)
		return;
	queueInit(&queue);

	/* Depth-1 nodes fail straight back to the root; they seed the queue. */
	if (root->c == '\0') {
		for (kid = root->child; kid != NULL; kid = kid->bro) {
			kid->fail = root;
			push(kid, &queue);
		}
	}

	while (!empty(&queue)) {
		struct _NODE *parent = (node *)pop(&queue);

		for (kid = parent->child; kid != NULL; kid = kid->bro) {
			/* Climb the parent's fail chain until some node has a child
			 * with the same byte, or the chain ends at the node whose
			 * own fail pointer is NULL (the root). */
			struct _NODE *probe = parent->fail;
			struct _NODE *hit = findNext(probe, kid->c);
			while (hit == NULL && probe->fail != NULL) {
				probe = probe->fail;
				hit = findNext(probe, kid->c);
			}
			/* No match anywhere on the chain: probe is the root here. */
			kid->fail = (hit != NULL) ? hit : probe;
			push(kid, &queue);
		}
	}

	queueClear(&queue);
	/* NOTE(review): isUpdateFail is not declared in the visible source (only
	 * isReady is) — confirm it exists elsewhere or drop this assignment. */
	isUpdateFail = true;
}

/*
 * Read patterns from a text file (one pattern per line), insert them into
 * the trie and build the fail pointers.
 *
 * tree:     trie set up by init(); clear() is called on it first.
 * fileName: path of the pattern file.
 *
 * Line endings ("\n" or "\r\n") are stripped. Empty lines are skipped.
 * Lines that do not fit the 100-byte line buffer are skipped entirely rather
 * than being inserted as truncated or split patterns.
 *
 * Returns 0 on success, -1 if the file cannot be opened. isReady is set to
 * true only on success.
 */
int insertFromFile(struct _TREE *tree, const char *fileName)
{
	clear(tree);
	FILE *fp = fopen(fileName, "r");
	if (fp == NULL) {
		printf("error when opening file \"%s\"\n", fileName);
		return -1;
	}
	printf("已打开\"%s\",开始读取模式串...\n", fileName);

	/* File length, used only for the progress display. */
	fseek(fp, 0, SEEK_END);
	long int total = ftell(fp);
	rewind(fp);

	int i = 0;
	char buffer[100];
	/* Test fgets itself: feof() only becomes true after a read has already
	 * failed, so looping on it would reprocess the stale buffer. */
	while (fgets(buffer, sizeof buffer, fp) != NULL) {
		size_t len = strcspn(buffer, "\r\n");

		if (buffer[len] == '\0' && !feof(fp)) {
			/* No line ending and not at end of file: the line was longer
			 * than the buffer. Discard the rest of it. */
			int ch;
			while ((ch = fgetc(fp)) != EOF && ch != '\n')
				;
			continue;
		}

		/* Strip the line ending only when present, so a final line without
		 * a newline keeps its last byte. */
		buffer[len] = '\0';
		if (len == 0)
			continue;

		insert(tree->root, buffer, 0);
		if (++i > 10000) {
			i = 0;
			if (total > 0)
				printf("%5.2f%%\r", 100.0 * ftell(fp) / total);
		}
	}
	fclose(fp);
	printf("模式串读取完毕!  \n");

	printf("开始构建fail指针...\r");
	updateFail(tree->root);
	printf("构建fail指针完毕! \n");
	isReady = true;
	return 0;
}

/*
 * Scan a text file with the automaton and count pattern occurrences.
 *
 * tree:     trie whose patterns and fail pointers were prepared by
 *           insertFromFile(); nothing is done while isReady is false.
 * fileName: path of the text to scan.
 *
 * Each time a pattern is found, the isWord field of its end node is
 * incremented. A newline resets the automaton to the root, so matches never
 * span lines.
 */
void queryInFile(struct _TREE *tree, const char *fileName)
{
	if (!isReady) {
		printf("请先读取模式串!\n");
		return;
	}
	FILE *fp = fopen(fileName, "r");
	if (fp == NULL) {
		printf("error when opening file \"%s\"\n", fileName);
		return;
	}
	printf("已打开\"%s\",开始匹配...\n", fileName);

	/* File length, used only for the progress display. */
	fseek(fp, 0, SEEK_END);
	long int total = ftell(fp);
	rewind(fp);

	struct _NODE *root = tree->root;
	struct _NODE *cur = root;
	int sinceReport = 0;
	int ch;
	/* fgetc returns int so EOF stays distinguishable from a 0xFF data byte;
	 * testing its result avoids processing EOF as a character. */
	while ((ch = fgetc(fp)) != EOF) {
		char c = (char)ch;
		if (c == '\n') {
			cur = root;
			continue;
		}

		/* On a mismatch keep following fail pointers until some state
		 * accepts c or the root has been tried as well. */
		struct _NODE *next = findNext(cur, c);
		while (next == NULL && cur != root) {
			cur = cur->fail;
			next = findNext(cur, c);
		}
		if (next != NULL)
			cur = next; /* otherwise cur is already the root */

		/* Every node on the fail chain spells a suffix of the text read so
		 * far, so each pattern end on the chain is a match. The whole chain
		 * is walked because pattern ends can sit behind non-pattern nodes;
		 * this costs up to the trie depth per input byte. */
		for (struct _NODE *hit = cur; hit != NULL && hit != root; hit = hit->fail) {
			if (hit->isWord != 0)
				++hit->isWord;
		}

		if (++sinceReport > 1000000) {
			sinceReport = 0;
			if (total > 0)
				printf("%5.2f%%\r", 100.0 * ftell(fp) / total);
		}
	}

	fclose(fp);
	printf("目标文本匹配完毕!\n");
}

/*
 * Walk the subtree rooted at cur and write every node whose isWord is
 * non-zero to fp as a "<path> <isWord>" line.
 *
 * Despite the name, this is a recursive depth-first (pre-order) traversal.
 * The path from the root is kept in a function-static buffer shared by all
 * recursion levels, so the function is not reentrant.
 *
 * cur: node to start from; a node whose c is '\0' (the root) adds nothing
 *      to the path. NULL is ignored.
 * fp:  open output stream.
 */
void bfs(struct _NODE *cur, FILE *fp)
{
	static char stack[100];
	static int top = 0;
	int pushed = 0;

	if (cur == NULL)
		return;

	if (cur->c != '\0') {
		/* Keep one byte for the terminator; a deeper path would overrun
		 * the buffer, so that subtree is skipped instead. */
		if (top >= (int)sizeof(stack) - 1)
			return;
		stack[top++] = cur->c;
		pushed = 1;
		if (cur->isWord > 0) {
			stack[top] = '\0';
			/* isWord is unsigned int, so it must be printed with %u. */
			fprintf(fp, "%s %u\n", stack, cur->isWord);
		}
	}

	for (struct _NODE *tmp = cur->child; tmp != NULL; tmp = tmp->bro)
		bfs(tmp, fp);

	/* Pop only what this call pushed, so siblings see the parent's path. */
	if (pushed)
		--top;
}

/*
 * Write the trie's patterns and their counters to fileName by handing the
 * root to bfs().
 *
 * Nothing is written (and the file is not created) when the root has no
 * children. A message is printed if the file cannot be opened.
 */
void printToFile(struct _TREE *tree, const char *fileName)
{
	FILE *out;

	if (tree->root->child == NULL)
		return;

	printf("正在保存匹配结果...\r");
	out = fopen(fileName, "w");
	if (out != NULL) {
		bfs(tree->root, out);
		fclose(out);
		printf("匹配结果已保存到\"%s\"!\n", fileName);
		return;
	}
	printf("error when opening file \"%s\"\n", fileName);
}

  入口main函数:

/*
 * Entry point: build the automaton from the pattern file, scan the target
 * text, then dump the per-pattern counters. Command-line arguments are unused.
 */
int main(int argc, char **argv)
{
	const char *patternFile = "patterns-127w.txt";
	const char *textFile = "content.txt";
	const char *resultFile = "result_unsorted.txt";
	tree test;

	(void)argc;
	(void)argv;

	init(&test);
	insertFromFile(&test, patternFile);
	queryInFile(&test, textFile);
	printToFile(&test, resultFile);

	/* Keeps the console window open (Windows "pause" command). */
	system("pause");
	return 0;
}

你可能感兴趣的:(算法)