目标:学习AC自动机,多模匹配。
要求:尽可能用纯Python实现,提升代码的扩展性。
一、什么是AC自动机?
AC自动机,Aho-Corasick automaton,该算法在1975年产生于贝尔实验室,是著名的多模匹配算法。要学会AC自动机,我们必须知道什么是Trie,也就是字典树。Trie树,又称单词查找树或键树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计和排序大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。
——摘自百度百科
二、AC自动机用来做什么?
一个常见的例子就是给出n个单词,再给出一段包含m个字符的文章,让你找出有多少个单词在文章里出现过。要搞懂AC自动机,先得有模式树(字典树)Trie和KMP模式匹配算法的基础知识。AC自动机算法分为3步:构造一棵Trie树,构造失败指针和模式匹配过程。
如果你对KMP算法了解的话,应该知道KMP算法中的next函数(shift函数或者fail函数)是干什么用的。KMP中我们用两个指针i和j分别表示,A[i-j+1..i]与B[1..j]完全相等。也就是说,i是不断增加的,随着i的增加j相应地变化,且j满足以A[i]结尾的长度为j的字符串正好匹配B串的前j个字符,当A[i+1]≠B[j+1],KMP的策略是调整j的位置(减小j值)使得A[i-j+1..i]与B[1..j]保持匹配且新的B[j+1]恰好与A[i+1]匹配,而next函数恰恰记录了这个j应该调整到的位置。同样AC自动机的失败指针具有同样的功能,也就是说当我们的模式串在Trie上进行匹配时,如果与当前节点的关键字不能继续匹配,就应该去当前节点的失败指针所指向的节点继续进行匹配。
三、AC自动机的Python安装
安装过这个包的朋友,相信都遇到过各种坑。
1、pip安装
官网:https://pypi.org/project/pyahocorasick/。源码下载:
安装方式:pip install pyahocorasick(python3),但尝试过的朋友会发现,这个包需要C编译器,如果自己的电脑中没有安装C编译器,是安装不成功的。pip install ahocorasick(python2)也无法安装。具体报错代码:
pip install pyahocorasick
Collecting pyahocorasick
Using cached https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz
Building wheels for collected packages: pyahocorasick
Running setup.py bdist_wheel for pyahocorasick ... error
Complete output from command /anaconda3/bin/python -u -c "import setuptools, tokenize;__file__='/private/var/folders/jd/6t6rh0991m72k_vxp02p7f440000gn/T/pip-install-_tg58exd/pyahocorasick/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /private/var/folders/jd/6t6rh0991m72k_vxp02p7f440000gn/T/pip-wheel-rbzdosp6 --python-tag cp37:
running bdist_wheel
running build
running build_ext
building 'ahocorasick' extension
creating build
creating build/temp.macosx-10.7-x86_64-3.7
gcc -Wno-unused-result -Wsign-compare -Wunreachable-code -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -I/anaconda3/include -arch x86_64 -I/anaconda3/include -arch x86_64 -DAHOCORASICK_UNICODE= -I/anaconda3/include/python3.7m -c pyahocorasick.c -o build/temp.macosx-10.7-x86_64-3.7/pyahocorasick.o
xcrun: error: invalid active developer path (/Library/Developer/CommandLineTools), missing xcrun at: /Library/Developer/CommandLineTools/usr/bin/xcrun
error: command 'gcc' failed with exit status 1
----------------------------------------
Failed building wheel for pyahocorasick
Running setup.py clean for pyahocorasick
Failed to build pyahocorasick
Installing collected packages: pyahocorasick
Running setup.py install for pyahocorasick ... error
Complete output from command /anaconda3/bin/python -u -c "import setuptools, tokenize;__file__='/private/var/folders/jd/6t6rh0991m72k_vxp02p7f440000gn/T/pip-install-_tg58exd/pyahocorasick/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" install --record /private/var/folders/jd/6t6rh0991m72k_vxp02p7f440000gn/T/pip-record-5oyl9c1l/install-record.txt --single-version-externally-managed --compile:
running install
running build
running build_ext
building 'ahocorasick' extension
creating build
creating build/temp.macosx-10.7-x86_64-3.7
gcc -Wno-unused-result -Wsign-compare -Wunreachable-code -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -I/anaconda3/include -arch x86_64 -I/anaconda3/include -arch x86_64 -DAHOCORASICK_UNICODE= -I/anaconda3/include/python3.7m -c pyahocorasick.c -o build/temp.macosx-10.7-x86_64-3.7/pyahocorasick.o
xcrun: error: invalid active developer path (/Library/Developer/CommandLineTools), missing xcrun at: /Library/Developer/CommandLineTools/usr/bin/xcrun
error: command 'gcc' failed with exit status 1
----------------------------------------
Command "/anaconda3/bin/python -u -c "import setuptools, tokenize;__file__='/private/var/folders/jd/6t6rh0991m72k_vxp02p7f440000gn/T/pip-install-_tg58exd/pyahocorasick/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" install --record /private/var/folders/jd/6t6rh0991m72k_vxp02p7f440000gn/T/pip-record-5oyl9c1l/install-record.txt --single-version-externally-managed --compile" failed with error code 1 in /private/var/folders/jd/6t6rh0991m72k_vxp02p7f440000gn/T/pip-install-_tg58exd/pyahocorasick/
如果直接下载Github中的源码,在使用ahocorasick.Automaton()函数会报错。那怎么办?
个人尝试着安装ahocorasick-python,官网:https://pypi.org/project/ahocorasick-python/,GitHub源码:源码。
但是结果发现Mac/Linux系统可以使用,Win10不行?瞬间无语了。demo环境用的是win10。
2、解决方案
网上查找了一些解决方案,主要包括三种:
(1)老老实实地装C编译器;
(2)python使用esmre代替ahocorasick实现ac自动机多模匹配
(3)个人改写ahocorasick——Python下的ahocorasick实现快速的关键字匹配
四、ahocorasick的Python代码
1、Python2代码:
# python2
# coding=utf-8
KIND = 16  # alphabet size: every symbol must have a code point in 0..15


class Node(object):
    """A single trie node of the automaton.

    Fields:
        fail: failure link; ``None`` until ``build_automation`` has run.
        next: list of ``KIND`` child slots, indexed by symbol code point.
        end:  ``True`` when a pattern is recognised at this node.
        word: the pattern recognised at this node, or ``None``.
    """

    static = 0  # number of Node objects created so far

    def __init__(self):
        self.fail = None
        self.next = [None] * KIND
        self.end = False
        self.word = None
        Node.static += 1


class AcAutomation(object):
    """Aho-Corasick automaton over an alphabet of ``KIND`` symbols.

    Usage: ``insert`` every pattern, call ``build_automation`` once, then
    call ``matchOne`` on each text. ``matchOne`` follows failure links, so
    it must not be called before ``build_automation``.
    """

    def __init__(self):
        self.root = Node()
        # Kept for backward compatibility; the build uses a local deque.
        self.queue = []

    def getIndex(self, char):
        """Return the child-slot index of *char* (its code point)."""
        return ord(char)  # - BASE

    def insert(self, string):
        """Add the pattern *string* to the trie.

        Every character of *string* must have a code point below ``KIND``;
        otherwise indexing the child list raises ``IndexError``.
        """
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] is None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True
        p.word = string

    def build_automation(self):
        """Compute the failure link of every node, breadth first."""
        from collections import deque

        root = self.root
        root.fail = None
        pending = deque([root])
        while pending:
            parent = pending.popleft()
            for i, child in enumerate(parent.next):
                if child is None:
                    continue
                if parent is root:
                    child.fail = root
                else:
                    # Walk the parent's failure chain until some node has a
                    # transition on symbol i; fall back to the root.
                    failp = parent.fail
                    while failp is not None and failp.next[i] is None:
                        failp = failp.fail
                    child.fail = root if failp is None else failp.next[i]
                # A pattern that is a proper suffix of the path to this node
                # ends here too. The failure target is shallower, so its
                # flags are already final; inherit them so matchOne does not
                # miss e.g. "bc" while walking the branch for "abcd".
                if not child.end and child.fail.end:
                    child.end = True
                    child.word = child.fail.word
                pending.append(child)

    def matchOne(self, string):
        """Return ``(True, pattern)`` for the first match in *string*.

        Returns ``(False, None)`` when no inserted pattern occurs.
        """
        p = self.root
        for char in string:
            index = self.getIndex(char)
            while p.next[index] is None and p is not self.root:
                p = p.fail
            if p.next[index] is not None:
                p = p.next[index]
            # Otherwise p is the root and stays there.
            if p.end:
                return True, p.word
        return False, None
class UnicodeAcAutomation(object):
    """Multi-pattern matcher for ``unicode`` text (Python 2).

    Text is encoded to bytes and every byte is split into two 4-bit symbols
    so that the wrapped ``AcAutomation`` only needs 16 child slots per node.

    Known limitation: matching runs on the symbol stream, so a pattern can
    in principle be reported at an offset that starts in the middle of a
    byte; the wrapped matcher does not expose the match position needed to
    filter such hits.
    """

    def __init__(self, encoding='utf-8'):
        self.ac = AcAutomation()
        self.encoding = encoding

    def getAcString(self, string):
        """Encode *string* and expand each byte into two symbols.

        For every byte the low nibble is emitted first, then the high one.
        """
        raw = bytearray(string.encode(self.encoding))
        return ''.join(chr(byte % 16) + chr(byte // 16) for byte in raw)

    def insert(self, string):
        """Add the ``unicode`` pattern *string*.

        Raises:
            TypeError: if *string* is not ``unicode``.
        """
        if not isinstance(string, unicode):
            raise TypeError('UnicodeAcAutomation:: insert type not unicode')
        self.ac.insert(self.getAcString(string))

    def build_automation(self):
        """Build the failure links of the wrapped automaton."""
        self.ac.build_automation()

    def matchOne(self, string):
        """Return ``(found, pattern)`` for the first match in *string*.

        *pattern* is decoded back to ``unicode``; it is ``None`` when
        nothing matched.

        Raises:
            TypeError: if *string* is not ``unicode``.
        """
        if not isinstance(string, unicode):
            raise TypeError('UnicodeAcAutomation:: matchOne type not unicode')
        retcode, ret = self.ac.matchOne(self.getAcString(string))
        if ret is not None:
            # Re-join each (low, high) symbol pair into the original byte and
            # decode with the same encoding that getAcString encoded with.
            raw = bytearray(
                ord(ret[i]) + ord(ret[i + 1]) * 16
                for i in range(0, len(ret) - 1, 2)
            )
            ret = raw.decode(self.encoding)
        return retcode, ret
def main():
    """Insert three sample words and print the first match in five texts."""
    ac = UnicodeAcAutomation()
    ac.insert(u'丁亚光')
    ac.insert(u'好吃的')
    ac.insert(u'好玩的')
    ac.build_automation()
    print(ac.matchOne(u'hi,丁亚光在干啥'))
    print(ac.matchOne(u'ab'))
    print(ac.matchOne(u'不能吃饭啊'))
    print(ac.matchOne(u'饭很好吃,有很多好好的吃的,'))
    print(ac.matchOne(u'有很多好玩的'))


if __name__ == '__main__':
    main()
输出:
(True, u'\u4e01\u4e9a\u5149')
(False, None)
(False, None)
(False, None)
(True, u'\u597d\u73a9\u7684')
可能很多朋友习惯了Python3,这里提供个人修改后的代码(主要是编码格式的修改)
2、Python3
# python3
# coding=utf-8
KIND = 16  # alphabet size: every symbol must have a code point in 0..15


class Node:
    """A single trie node of the automaton.

    Fields:
        fail: failure link; ``None`` until ``build_automation`` has run.
        next: list of ``KIND`` child slots, indexed by symbol code point.
        end:  ``True`` when a pattern is recognised at this node.
        word: the pattern recognised at this node, or ``None``.
    """

    static = 0  # number of Node objects created so far

    def __init__(self):
        self.fail = None
        self.next = [None] * KIND
        self.end = False
        self.word = None
        Node.static += 1


class AcAutomation:
    """Aho-Corasick automaton over an alphabet of ``KIND`` symbols.

    Usage: ``insert`` every pattern, call ``build_automation`` once, then
    call ``matchOne`` on each text. ``matchOne`` follows failure links, so
    it must not be called before ``build_automation``.
    """

    def __init__(self):
        self.root = Node()
        # Kept for backward compatibility; the build uses a local deque.
        self.queue = []

    def getIndex(self, char):
        """Return the child-slot index of *char* (its code point)."""
        return ord(char)  # - BASE

    def insert(self, string):
        """Add the pattern *string* to the trie.

        Every character of *string* must have a code point below ``KIND``;
        otherwise indexing the child list raises ``IndexError``.
        """
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] is None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True
        p.word = string

    def build_automation(self):
        """Compute the failure link of every node, breadth first."""
        from collections import deque

        root = self.root
        root.fail = None
        pending = deque([root])
        while pending:
            parent = pending.popleft()
            for i, child in enumerate(parent.next):
                if child is None:
                    continue
                if parent is root:
                    child.fail = root
                else:
                    # Walk the parent's failure chain until some node has a
                    # transition on symbol i; fall back to the root.
                    failp = parent.fail
                    while failp is not None and failp.next[i] is None:
                        failp = failp.fail
                    child.fail = root if failp is None else failp.next[i]
                # A pattern that is a proper suffix of the path to this node
                # ends here too. The failure target is shallower, so its
                # flags are already final; inherit them so matchOne does not
                # miss e.g. "bc" while walking the branch for "abcd".
                if not child.end and child.fail.end:
                    child.end = True
                    child.word = child.fail.word
                pending.append(child)

    def matchOne(self, string):
        """Return ``(True, pattern)`` for the first match in *string*.

        Returns ``(False, None)`` when no inserted pattern occurs.
        """
        p = self.root
        for char in string:
            index = self.getIndex(char)
            while p.next[index] is None and p is not self.root:
                p = p.fail
            if p.next[index] is not None:
                p = p.next[index]
            # Otherwise p is the root and stays there.
            if p.end:
                return True, p.word
        return False, None
class UnicodeAcAutomation:
    """Multi-pattern matcher for ``str`` text (Python 3).

    Text is encoded to bytes and every byte is split into two 4-bit symbols
    so that the wrapped ``AcAutomation`` only needs 16 child slots per node.

    Known limitation: matching runs on the symbol stream, so a pattern can
    in principle be reported at an offset that starts in the middle of a
    byte; the wrapped matcher does not expose the match position needed to
    filter such hits.
    """

    def __init__(self, encoding='utf-8'):
        self.ac = AcAutomation()
        self.encoding = encoding

    def getAcString(self, string):
        """Encode *string* and expand each byte into two symbols.

        For every byte the low nibble is emitted first, then the high one.
        """
        raw = string.encode(self.encoding)
        return ''.join(chr(byte % 16) + chr(byte // 16) for byte in raw)

    def insert(self, string):
        """Add the ``str`` pattern *string*.

        Raises:
            TypeError: if *string* is not a ``str``.
        """
        if not isinstance(string, str):
            raise TypeError('StrAcAutomation:: insert type not str')
        self.ac.insert(self.getAcString(string))

    def build_automation(self):
        """Build the failure links of the wrapped automaton."""
        self.ac.build_automation()

    def matchOne(self, string):
        """Return ``(found, pattern)`` for the first match in *string*.

        *pattern* is decoded back to ``str``; it is ``None`` when nothing
        matched.

        Raises:
            TypeError: if *string* is not a ``str``.
        """
        if not isinstance(string, str):
            raise TypeError('StrAcAutomation:: matchOne type not str')
        retcode, ret = self.ac.matchOne(self.getAcString(string))
        if ret is not None:
            # Re-join each (low, high) symbol pair into the original byte and
            # decode with the same encoding that getAcString encoded with.
            raw = bytes(
                ord(ret[i]) + ord(ret[i + 1]) * 16
                for i in range(0, len(ret) - 1, 2)
            )
            ret = raw.decode(self.encoding)
        return retcode, ret
def main():
    """Insert three sample words and print the first match in five texts."""
    ac = UnicodeAcAutomation()
    ac.insert('丁亚光')
    ac.insert('好吃的')
    ac.insert('好玩的')
    ac.build_automation()
    print(ac.matchOne('hi,丁亚光在干啥'))
    print(ac.matchOne('ab'))
    print(ac.matchOne('不能吃饭啊'))
    print(ac.matchOne('饭很好吃,有很多好好的吃的,'))
    print(ac.matchOne('有很多好玩的'))


if __name__ == '__main__':
    main()
输出:
(True, '丁亚光')
(False, None)
(False, None)
(False, None)
(True, '好玩的')
总结:ahocorasick个人改写的方法还有很多,比如根据ahocorasick-python的源码进行改写。其中ahocorasick-python的核心源码如下。
# coding:utf-8
# write by zhou
# revised by zw
class Node(object):
    """Trie node: a character plus a mapping from character to child node.

    Children are read with ``node[key]``, added with ``node[key] = child``
    and their keys are listed by iterating the node.
    """

    def __init__(self, str='', is_root=False):
        self._next_p = {}  # character -> child Node
        self.fail = None  # failure link, assigned by the automaton builder
        self.is_root = is_root
        self.str = str
        self.parent = None

    def __iter__(self):
        return iter(self._next_p.keys())

    def __getitem__(self, item):
        return self._next_p[item]

    def __setitem__(self, key, value):
        # setdefault keeps an already-registered child for this key; either
        # way the node stored under the key gets this node as its parent.
        child = self._next_p.setdefault(key, value)
        child.parent = self

    def __repr__(self):
        # object.__repr__ looks like "<pkg.Node object at 0x...>"; keep only
        # the address part that follows the final "at".
        address = object.__repr__(self)[1:-1].split('at')[-1].strip()
        return "<Node object '%s' at %s>" % (self.str, address)

    def __str__(self):
        return self.__repr__()
class AhoCorasick(object):
    """Aho-Corasick automaton built once from a fixed set of keywords."""

    def __init__(self, *words):
        """Build the trie and failure links for *words*.

        Duplicate words are collapsed. Every word must be non-empty; an
        empty word trips the ``assert`` in the trie-building helper.
        """
        self.words_set = set(words)
        self.words = list(self.words_set)
        self.words.sort(key=len)
        self._root = Node(is_root=True)
        # id(node) -> set of (keyword, len(keyword)) reported at that node.
        self._node_meta = {}
        # (depth, node) pairs; sorted by depth before links are computed.
        self._node_all = [(0, self._root)]

        # character -> every keyword containing that character; used to
        # limit which keywords are tried as suffixes at each trie node.
        words_by_char = {}
        for word in self.words:
            for char in word:
                words_by_char.setdefault(char, set()).add(word)

        def node_append(keyword):
            """Insert *keyword* and record the keywords ending at each node."""
            assert len(keyword) > 0, 'keyword must not be empty'
            current = self._root
            for depth, char in enumerate(keyword):
                if char not in current:
                    current[char] = Node(char)
                    self._node_all.append((depth + 1, current[char]))
                child = current[char]
                meta = self._node_meta.setdefault(id(child), set())
                if depth >= 1:
                    # Record every keyword that is a suffix of the path
                    # spelled so far.
                    prefix = keyword[:depth + 1]
                    for candidate in words_by_char[char]:
                        if prefix.endswith(candidate):
                            meta.add((candidate, len(candidate)))
                current = child
            if current is not self._root:
                self._node_meta[id(current)].add((keyword, len(keyword)))

        for word in self.words:
            node_append(word)
        self._node_all.sort(key=lambda x: x[0])
        self._make()

    def _make(self):
        """Assign the failure link of every node, shallowest nodes first."""
        for level, node in self._node_all:
            if node is self._root or level <= 1:
                node.fail = self._root
                continue
            # Walk the parent's failure chain until a node has a child for
            # this node's character, or the root is reached.
            candidate = node.parent.fail
            while node.str not in candidate and candidate is not self._root:
                candidate = candidate.fail
            if node.str in candidate:
                node.fail = candidate[node.str]
            else:
                node.fail = self._root

    def search(self, content, with_index=False):
        """Return the set of keywords that occur in *content*.

        With ``with_index=True`` each element is
        ``(keyword, (start, end))`` where ``content[start:end]`` is the
        occurrence; otherwise each element is the keyword itself.
        """
        result = set()
        node = self._root
        for index, char in enumerate(content):
            while char not in node and node is not self._root:
                node = node.fail
            if char not in node:
                continue  # at the root with no transition for this char
            node = node[char]
            for keyword, keyword_len in self._node_meta.get(id(node), ()):
                if with_index:
                    result.add(
                        (keyword, (index - keyword_len + 1, index + 1)))
                else:
                    result.add(keyword)
        return result
if __name__ == '__main__':
    # Demo: report every keyword found in the text with its (start, end) span.
    ac = AhoCorasick("abc", 'abe', 'acdabd', 'bdf', 'df', 'f', 'ac', 'cd', 'cda')
    print(ac.search('acdabdf', True))
输出:
{('cd', (1, 3)), ('acdabd', (0, 6)), ('df', (5, 7)), ('f', (6, 7)), ('bdf', (4, 7)), ('cda', (1, 4)), ('ac', (0, 2))}
参考文献:
1、AC自动机的python实现
2、70行Python实现AC自动机
3、序列比对(二十六)——精准匹配之KMP算法、Trie树以及AC自动机
4、关于AC自动机的思考