多模匹配的一个场景:从一段字符串中匹配多个模式字符串(关键字符串)
多模匹配的常见应用场景:(1)关键字过滤 (2)入侵检测 (3)病毒检测 (4)分词等
多模匹配具体算法有很多,常用的有(1)Trie树(2)AC算法(3)WM算法
输入:
关键字数组:[“研究”, “脂肪酶”, “水平下”, “哮喘”, “正相关”, “表达式”, “研究结果”]
待匹配字符:“该研究结果表明,IL-17A和IL-9高表达以及脂肪酶、CCL11水平下降与成人哮喘之间呈正相关。”
输出:
匹配成功的关键词组:[“研究”, “研究结果”, “脂肪酶”, “水平下”, “哮喘”, “正相关”]
测试说明:使用2万多个关键词,对2000来份字符串进行提取,提取速度均在2秒内
# -*- encoding=utf-8 -*-
class Node(object):
    """A single state of the keyword automaton (one trie node)."""

    def __init__(self):
        # Goto transitions: character -> child Node.
        self.next = {}
        # Failure link; stays None on the root and is filled in by Ahocorasick.make().
        self.fail = None
        # True when a keyword ends exactly at this node.
        self.isWord = False
        # Number of characters on the path from the root to this node.
        self.depth = 0
        # Closest node on the failure chain that ends a keyword (None if there is none).
        self.output = None


class Ahocorasick(object):
    """Aho-Corasick multi-pattern matcher.

    Typical use: call addWord() for every keyword, then make(), then search()
    or replace(). search()/replace() rebuild the failure links on their own if
    keywords were added since the last make().
    """

    def __init__(self):
        self.__root = Node()
        # False whenever the trie changed after the failure links were last built.
        self.__built = False

    def addWord(self, word):
        """
        Insert a keyword into the trie.

        @param word: keyword to add; an empty keyword is ignored because it
                     could never produce a meaningful match.
        """
        if not word:
            return
        node = self.__root
        for ch in word:
            child = node.next.get(ch)
            if child is None:
                child = Node()
                child.depth = node.depth + 1
                node.next[ch] = child
            node = child
        node.isWord = True
        self.__built = False

    def make(self):
        """
        Build the failure function (and the keyword output links).

        Nodes are visited breadth-first: the failure link of a node depends on
        the failure links of shallower nodes, so those must be final before the
        node itself is processed.
        """
        root = self.__root
        root.fail = None
        root.output = None
        queue = [root]
        head = 0  # index of the next node to expand; avoids O(n) list.pop(0)
        while head < len(queue):
            parent = queue[head]
            head += 1
            for ch, child in parent.next.items():
                # Walk the parent's failure chain until a state with a `ch` edge is found.
                fallback = parent.fail
                while fallback is not None and ch not in fallback.next:
                    fallback = fallback.fail
                child.fail = root if fallback is None else fallback.next[ch]
                # Shortcut to the next keyword that is a proper suffix of this state.
                if child.fail.isWord:
                    child.output = child.fail
                else:
                    child.output = child.fail.output
                queue.append(child)
        self.__built = True

    def __findSpans(self, content):
        """Return (start, end) index pairs, end inclusive, of every keyword occurrence."""
        if not self.__built:
            self.make()
        root = self.__root
        node = root
        spans = []
        for pos, ch in enumerate(content):
            # Fall back until the current state can consume `ch` (or we are at the root).
            while node is not root and ch not in node.next:
                node = node.fail
            node = node.next.get(ch, root)
            # Report the keyword ending here plus every keyword that is a suffix of it.
            hit = node if node.isWord else node.output
            while hit is not None:
                spans.append((pos - hit.depth + 1, pos))
                hit = hit.output
        return spans

    def search(self, content):
        """
        Find every keyword occurrence in `content`.

        @return: list of matched keyword strings, ordered by the position where
                 each match ends; a keyword is listed once per occurrence.
        """
        return [content[start:end + 1] for start, end in self.__findSpans(content)]

    def replace(self, content):
        """
        Mask every keyword occurrence in `content` with '*' characters.

        @return: a string of the same length as `content` in which each
                 character covered by a match is replaced by '*'.
        """
        spans = self.__findSpans(content)
        if not spans:
            return content
        chars = list(content)
        for start, end in spans:
            for i in range(start, end + 1):
                chars[i] = u'*'
        return u''.join(chars)
if __name__ == '__main__':
    # Demo run: build a matcher, load the sample keywords, scan one sentence.

    # 1. Create the matcher object.
    ah = Ahocorasick()

    # 2. Register the keywords (patterns), in the same order as before.
    for keyword in ('研究', "脂肪酶", "水平下", "哮喘", "正相关", "表达式", '研究结果'):
        ah.addWord(keyword)

    # 3. Build the failure function of the automaton.
    ah.make()

    # 4. Run the matcher over the sample text and print the matched keywords.
    text = '该研究结果表明,IL-17A和IL-9高表达以及脂肪酶、CCL11水平下降与成人哮喘之间呈正相关。'
    mm = ah.search(text)
    print(mm)
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.Vector;
/**
 * Wu-Manber style multi-pattern keyword detector (filter).
 *
 * <p>Usage (see {@link #main}): register keywords with {@link #addFilterKeyWord}, call
 * {@link #macth} for each text, and call {@link #clear} to drop the registered keywords.
 * For very large keyword sets consider changing how keywords are loaded, because every
 * keyword is kept in memory.
 *
 * <p>Candidate end positions are selected by exact character through the shift and hash
 * tables, while the final comparison of a candidate against the text ignores case. A keyword
 * whose last characters appear in the text in a different case may therefore not be found.
 */
public class WuManber {
    /** Number of distinct {@code char} values; size of the shift and hash tables. */
    private static final int MAX_INDEX = Character.MAX_VALUE + 1; // 65536

    /** Block size: how many trailing keyword characters the shift table looks at. */
    private int B = 2;
    /** Whether the shift and hash tables reflect the registered keywords. */
    private boolean initFlag = false;
    /** Safe skip distance for each character; 0 means "a keyword may end here". */
    private int shiftTable[] = new int[MAX_INDEX];
    /** Keywords bucketed by their final character; a bucket is null when unused. */
    @SuppressWarnings({"unchecked", "rawtypes"}) // generic array creation; only Vector<AtomicPattern> is stored
    private Vector<AtomicPattern> hashTable[] = new Vector[MAX_INDEX];
    /** Keyword groups registered through {@link #addFilterKeyWord}. */
    private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();

    public WuManber() {
    }

    public static void main(String[] args) {
        // 1. Create the detector.
        WuManber objWM = new WuManber();
        // Collect the keywords (patterns) to look for.
        Vector<String> vKey = new Vector<>();
        vKey.add("研究");
        vKey.add("哮喘");
        vKey.add("研究结果");
        vKey.add("脂肪酶");
        vKey.add("水平下");
        vKey.add("正相关");
        vKey.add("表达式");
        // 2. Register the keywords and continue only if that succeeded.
        if (objWM.addFilterKeyWord(vKey, 0)) {
            long startTime = System.currentTimeMillis(); // start of the timed section
            String text = "该研究结果表明,IL-17A和IL-9高表达以及脂肪酶、CCL11水平下降与成人哮喘之间呈正相关。";
            // 3. Run the match and print the result.
            List<String> sResult = objWM.macth(text, new Vector<Integer>(0));
            System.out.println(sResult);
            long endTime = System.currentTimeMillis();
            System.out.println("算法运行时间: " + (endTime - startTime) + "ms"); // elapsed time
        }
        // 4. Release the registered keywords.
        objWM.clear();
    }

    /**
     * Scans {@code content} and returns every registered keyword found in it.
     *
     * <p>The tables are built lazily on the first call after keywords were registered.
     *
     * @param content  text to scan; {@code null} or empty yields an empty result
     * @param levelSet receives the level of every keyword group whose keywords were all found;
     *                 may be {@code null} when the levels are not needed
     * @return the matched keywords in the order in which their occurrences end in the text;
     *         when several keywords end at the same position all of them are returned
     */
    public List<String> macth(String content, Vector<Integer> levelSet) {
        List<String> sResult = new ArrayList<>();
        if (!initFlag) {
            init();
        }
        if (content == null || content.isEmpty()) {
            return sResult;
        }
        Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
        int i = 0;
        while (i < content.length()) {
            char checkChar = content.charAt(i);
            int shift = shiftTable[checkChar];
            if (shift == 0) {
                // Some keyword ends with this character: verify the bucket against the text.
                Vector<AtomicPattern> matched = findMathAps(content, i + 1, hashTable[checkChar]);
                aps.addAll(matched);
                for (AtomicPattern ap : matched) {
                    sResult.add(ap.getPattern().str);
                }
                i++;
            } else {
                i += shift;
            }
        }
        if (levelSet != null) {
            parseAtomicPatternSet(aps, levelSet);
        }
        return sResult;
    }

    /**
     * Registers a group of keywords that share one level.
     *
     * <p>Passing all keywords in one vector keeps them all in memory; for very large keyword
     * sets a different loading strategy is advisable.
     *
     * @param keyWord keywords to register; {@code null} entries are skipped
     * @param level   level reported through {@code levelSet} of {@link #macth} for this group
     * @return {@code false} if {@code keyWord} is {@code null} or the tables have already been
     *         built (call {@link #clear} first); {@code true} otherwise
     */
    public boolean addFilterKeyWord(Vector<String> keyWord, int level) {
        if (initFlag || keyWord == null) {
            return false;
        }
        UnionPattern unionPattern = new UnionPattern();
        unionPattern.setLevel(level);
        for (String sPattern : keyWord) {
            if (sPattern == null) {
                continue;
            }
            AtomicPattern atomicPattern = new AtomicPattern(new Pattern(sPattern));
            unionPattern.addNewAtomicPattrn(atomicPattern);
            atomicPattern.setBelongUnionPattern(unionPattern);
        }
        tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
        return true;
    }

    /**
     * Tells whether a character is kept by {@link #preConvert}.
     *
     * @param ch character to test
     * @return {@code true} for ASCII digits and letters and for code units in
     *         {@code 0x4e00..0x952f}
     */
    private boolean isValidChar(char ch) {
        if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
            return true;
        }
        // Simplified Chinese ideograph range used by this filter.
        return ch >= 0x4e00 && ch <= 0x952f;
    }

    /**
     * Collects the levels of the keyword groups that are completely contained in the matches.
     *
     * <p>The vector {@code aps} is consumed: it is empty when this method returns.
     *
     * @param aps      atomic patterns matched in the text
     * @param levelSet receives one entry per matched pattern whose group is fully contained in
     *                 the not yet consumed part of {@code aps}
     */
    private void parseAtomicPatternSet(Vector<AtomicPattern> aps, Vector<Integer> levelSet) {
        while (aps.size() > 0) {
            AtomicPattern ap = aps.get(0);
            UnionPattern up = ap.belongUnionPattern;
            if (up.isIncludeAllAp(aps)) {
                levelSet.add(Integer.valueOf(up.getLevel()));
            }
            aps.remove(0);
        }
    }

    /**
     * Returns the candidates that really occur in {@code src} ending just before {@code end}.
     *
     * <p>The comparison ignores case and works on the text in place, so no substring is copied.
     *
     * @param src     text being scanned
     * @param end     exclusive end index of the candidate occurrence in {@code src}
     * @param destAps candidate patterns sharing the final character; may be {@code null}
     * @return the candidates that match, in bucket order
     */
    private Vector<AtomicPattern> findMathAps(String src, int end, Vector<AtomicPattern> destAps) {
        Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
        if (destAps == null) {
            return aps;
        }
        for (AtomicPattern ap : destAps) {
            String keyword = ap.getPattern().str;
            int start = end - keyword.length();
            if (start >= 0 && src.regionMatches(true, start, keyword, 0, keyword.length())) {
                aps.add(ap);
            }
        }
        return aps;
    }

    /**
     * Strips every character rejected by {@link #isValidChar} from the text.
     *
     * @param content text to clean
     * @return the text with only the accepted characters, in their original order
     */
    private String preConvert(String content) {
        StringBuilder retStr = new StringBuilder(content.length());
        for (int i = 0; i < content.length(); i++) {
            char ch = content.charAt(i);
            if (isValidChar(ch)) {
                retStr.append(ch);
            }
        }
        return retStr.toString();
    }

    /**
     * Builds the shift table and the hash table from the registered keywords.
     */
    private void init() {
        initFlag = true;
        for (int i = 0; i < MAX_INDEX; i++) {
            hashTable[i] = null;
        }
        shiftTableInit();
        hashTableInit();
    }

    /**
     * Drops all registered keywords and the tables built from them.
     */
    public void clear() {
        tmpUnionPatternSet.clear();
        for (int i = 0; i < MAX_INDEX; i++) {
            hashTable[i] = null;
        }
        initFlag = false;
    }

    /**
     * Fills the shift table.
     *
     * <p>A character gets shift 0 when some keyword ends with it, shift 1 when it is the
     * second-to-last character of some keyword, and the default shift otherwise. The default
     * is limited by the shortest keyword: with a one-character keyword a skip of two positions
     * could jump over an occurrence, so the default becomes 1 in that case.
     */
    private void shiftTableInit() {
        Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();

        int defaultShift = B;
        for (UnionPattern up : upSet) {
            for (AtomicPattern ap : up.getSet()) {
                int length = ap.getPattern().str.length();
                if (length > 0 && length < defaultShift) {
                    defaultShift = length;
                }
            }
        }
        for (int i = 0; i < MAX_INDEX; i++) {
            shiftTable[i] = defaultShift;
        }

        for (UnionPattern up : upSet) {
            for (AtomicPattern ap : up.getSet()) {
                Pattern pattern = ap.getPattern();
                int length = pattern.str.length();
                if (length == 0) {
                    continue; // an empty keyword can never match
                }
                if (length > 1 && shiftTable[pattern.charAtEnd(1)] != 0) {
                    shiftTable[pattern.charAtEnd(1)] = 1;
                }
                shiftTable[pattern.charAtEnd(0)] = 0;
            }
        }
    }

    /**
     * Fills the hash table: every non-empty keyword goes into the bucket of its last character.
     */
    private void hashTableInit() {
        Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
        for (UnionPattern up : upSet) {
            for (AtomicPattern ap : up.getSet()) {
                Pattern pattern = ap.getPattern();
                if (pattern.str.length() == 0) {
                    continue;
                }
                char last = pattern.charAtEnd(0);
                if (hashTable[last] == null) {
                    hashTable[last] = new Vector<AtomicPattern>();
                }
                hashTable[last].add(ap);
            }
        }
    }
}
/**
 * A single keyword (pattern string) with access to its characters counted from the end.
 */
class Pattern {
    /** The keyword text. */
    public String str;

    Pattern(String str) {
        this.str = str;
    }

    /**
     * Returns the character {@code index} positions before the end of the keyword.
     *
     * @param index 0 for the last character, 1 for the one before it, and so on
     * @return the requested character, or the NUL character when the keyword has no more than
     *         {@code index} characters
     */
    public char charAtEnd(int index) {
        int length = str.length();
        return index < length ? str.charAt(length - 1 - index) : 0;
    }

    /**
     * @return the keyword text
     */
    public String getStr() {
        return this.str;
    }
}
/**
 * One keyword together with the keyword group it was registered in.
 */
class AtomicPattern {
    private Pattern pattern;
    /** Group this keyword belongs to; set through {@link #setBelongUnionPattern}. */
    public UnionPattern belongUnionPattern;

    AtomicPattern(Pattern pattern) {
        this.pattern = pattern;
    }

    public UnionPattern getBelongUnionPattern() {
        return this.belongUnionPattern;
    }

    public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
        this.belongUnionPattern = belongUnionPattern;
    }

    public Pattern getPattern() {
        return this.pattern;
    }

    public void setPattern(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * Checks whether the given text ends with this keyword, ignoring case.
     *
     * @param str text whose tail is compared with the keyword
     * @return {@code true} when the last characters of {@code str} equal the keyword
     *         (case-insensitively); {@code false} when they differ or the text is shorter
     *         than the keyword
     */
    public boolean findMatchInString(String str) {
        String keyword = pattern.str;
        int offset = str.length() - keyword.length();
        // A negative offset means the text is shorter than the keyword.
        return offset >= 0 && str.regionMatches(true, offset, keyword, 0, keyword.length());
    }
}
/**
 * A group of keywords that share one level.
 */
class UnionPattern {
    /** Keywords of this group, in insertion order. */
    public Vector<AtomicPattern> apSet;
    private int level;

    UnionPattern() {
        apSet = new Vector<AtomicPattern>();
    }

    public void addNewAtomicPattrn(AtomicPattern ap) {
        apSet.add(ap);
    }

    public Vector<AtomicPattern> getSet() {
        return apSet;
    }

    /**
     * Tells whether every keyword of this group occurs in the given collection.
     *
     * @param inAps patterns to look in
     * @return {@code true} when each keyword of the group has a case-insensitively equal
     *         keyword in {@code inAps}; {@code false} otherwise, including when
     *         {@code inAps} has fewer entries than the group
     */
    public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) {
        if (inAps.size() < apSet.size()) {
            return false;
        }
        for (AtomicPattern member : apSet) {
            if (!isInAps(member, inAps)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Looks for a pattern with the same keyword text (ignoring case) in {@code inAps}.
     */
    private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) {
        String wanted = ap.getPattern().str;
        for (AtomicPattern candidate : inAps) {
            if (wanted.equalsIgnoreCase(candidate.getPattern().str)) {
                return true;
            }
        }
        return false;
    }

    public void setLevel(int level) {
        this.level = level;
    }

    public int getLevel() {
        return level;
    }
}
/**
 * Container for all registered keyword groups.
 */
class UnionPatternSet {
    /** Registered keyword groups, in insertion order. */
    public Vector<UnionPattern> unionPatternSet;

    UnionPatternSet() {
        unionPatternSet = new Vector<UnionPattern>();
    }

    /**
     * Appends a keyword group.
     *
     * @param up group to add
     */
    public void addNewUnionPattrn(UnionPattern up) {
        unionPatternSet.add(up);
    }

    /**
     * @return the backing vector of groups (not a copy)
     */
    public Vector<UnionPattern> getSet() {
        return this.unionPatternSet;
    }

    /**
     * Removes every registered group.
     */
    public void clear() {
        this.unionPatternSet.clear();
    }
}