Python 敏感词过滤的实现

一个简单的实现

class NaiveFilter:
    """Filter messages by plain substring replacement of known keywords.

    Every stored keyword is searched for in the (lower-cased) message and
    each occurrence is replaced by ``repl`` as a whole, so a keyword of any
    length collapses to a single ``repl``.

    >>> f = NaiveFilter()
    >>> f.add("sexy")
    >>> f.filter("hello sexy baby")
    'hello * baby'
    """

    def __init__(self):
        # Lower-cased, stripped, non-empty keywords.
        self.keywords = set()

    def add(self, keyword):
        """Register a single keyword.

        Args:
            keyword: ``str`` or UTF-8 encoded ``bytes``. Surrounding
                whitespace is removed and the keyword is lower-cased.
                Empty / whitespace-only keywords are ignored.
        """
        if not isinstance(keyword, str):
            keyword = keyword.decode('utf-8')
        keyword = keyword.strip().lower()
        # An empty keyword would make ``str.replace`` insert ``repl``
        # between every character of the message, so never store one.
        if keyword:
            self.keywords.add(keyword)

    def parse(self, path):
        """Load keywords from a UTF-8 text file, one keyword per line.

        Args:
            path: Path of the keyword file. Blank lines are skipped.
        """
        with open(path, encoding='utf-8') as f:
            for line in f:
                self.add(line)

    def filter(self, message, repl="*"):
        """Return ``message`` lower-cased with every keyword replaced.

        Args:
            message: Text to filter. ``bytes`` are decoded as UTF-8; any
                other non-string object is converted with ``str()``.
            repl: Replacement inserted for each keyword occurrence.

        Returns:
            The filtered, lower-cased message.
        """
        if isinstance(message, bytes):
            message = message.decode('utf-8')
        message = str(message).lower()
        # Longest keywords first (ties broken alphabetically) so the result
        # does not depend on set iteration order when keywords overlap.
        for kw in sorted(self.keywords, key=lambda k: (-len(k), k)):
            message = message.replace(kw, repl)
        return message

其中,strip() 用于去除关键词首尾的空白字符(包括换行符);关键词按 UTF-8 解码后统一转为小写。
parse() 函数打开关键词文件,逐行读取其中的关键词,并将其存入关键词集合中。

filter() 函数是过滤函数:先将消息转为小写,然后把其中出现的每个关键词替换成 repl(默认为 `*`)。

class BSFilter:
    """Filter messages using an inverted index from tokens to keywords.

    Uses a "Back Sorted Mapping": each keyword is indexed under the tokens
    it contains (whole words for ASCII alphanumeric words, single
    characters otherwise). When filtering, only keywords indexed under a
    token that actually occurs in the message are tried, which reduces the
    number of replacement passes. Each keyword occurrence is replaced by
    ``repl`` as a whole.

    >>> f = BSFilter()
    >>> f.add("sexy")
    >>> f.filter("hello sexy baby")
    'hello * baby'
    """

    def __init__(self):
        # Imported locally so the class does not depend on module-level
        # imports being present in the surrounding file.
        import re
        from collections import defaultdict

        self.keywords = []               # keywords in insertion order
        self.kwsets = set()              # same keywords, for O(1) dedup
        self.bsdict = defaultdict(set)   # token -> indexes into keywords
        self.pat_en = re.compile(r'^[0-9a-zA-Z]+$')  # english phrase or not

    def add(self, keyword):
        """Register a keyword and index it under its tokens.

        Args:
            keyword: ``str`` or UTF-8 encoded ``bytes``; it is lower-cased
                before being stored. Empty / whitespace-only keywords and
                duplicates are ignored.
        """
        if not isinstance(keyword, str):
            keyword = keyword.decode('utf-8')
        keyword = keyword.lower()
        # A keyword without any token can never be reached through the
        # index, so storing it would only waste space.
        if not keyword.strip() or keyword in self.kwsets:
            return
        self.keywords.append(keyword)
        self.kwsets.add(keyword)
        index = len(self.keywords) - 1
        for word in keyword.split():
            if self.pat_en.search(word):
                self.bsdict[word].add(index)
            else:
                for char in word:
                    self.bsdict[char].add(index)

    def parse(self, path):
        """Load keywords from a UTF-8 text file, one keyword per line.

        Args:
            path: Path of the keyword file.
        """
        with open(path, "r", encoding='utf-8') as f:
            for keyword in f:
                self.add(keyword.strip())

    def filter(self, message, repl="*"):
        """Return ``message`` lower-cased with indexed keywords replaced.

        Args:
            message: ``str`` or UTF-8 encoded ``bytes`` to filter.
            repl: Replacement inserted for each keyword occurrence.

        Returns:
            The filtered, lower-cased message.
        """
        if not isinstance(message, str):
            message = message.decode('utf-8')
        message = message.lower()
        for word in message.split():
            # ASCII alphanumeric words are looked up whole; anything else
            # is looked up character by character.
            tokens = (word,) if self.pat_en.search(word) else word
            for token in tokens:
                # ``.get`` instead of ``[]``: indexing a defaultdict would
                # insert an empty set for every unseen token, growing the
                # index on each call to ``filter``.
                for index in self.bsdict.get(token, ()):
                    message = message.replace(self.keywords[index], repl)
        return message

在上面的实现例子中,对于搜索查找进行了优化,对于英语单词,直接进行了按词索引字典查找。对于其他语言模式,我们采用逐字符查找匹配的一种模式。
BS:即 Back Sorted Mapping,先通过索引定位可能命中的关键词,以减少替换次数;注意它并不是 BFS(宽度优先搜索)。

class DFAFilter:
    """Filter messages with a trie-based deterministic finite automaton.

    Keywords are stored character by character in nested dicts; a node
    that ends a keyword carries the ``delimit`` key. Filtering walks the
    trie from each position of the message, so the work per position is
    bounded by the keyword length rather than by the number of keywords.

    Matching is shortest-first: scanning stops at the first keyword end
    reached, so with keywords ``"sex"`` and ``"sexy"`` the text ``"sexy"``
    becomes ``"***y"``. Every matched character is replaced by one
    ``repl``.

    >>> f = DFAFilter()
    >>> f.add("sexy")
    >>> f.filter("hello sexy baby")
    'hello **** baby'
    """

    def __init__(self):
        self.keyword_chains = {}   # root of the nested-dict trie
        self.delimit = '\x00'      # key marking "a keyword ends here"

    def add(self, keyword):
        """Insert a keyword into the trie.

        Args:
            keyword: ``str`` or UTF-8 encoded ``bytes``. It is lower-cased
                and stripped; empty keywords are ignored.
        """
        if not isinstance(keyword, str):
            keyword = keyword.decode('utf-8')
        chars = keyword.lower().strip()
        if not chars:
            return
        level = self.keyword_chains
        for char in chars:
            # Reuse the existing child node or create a new one.
            level = level.setdefault(char, {})
        level[self.delimit] = 0

    def parse(self, path):
        """Load keywords from a UTF-8 text file, one keyword per line.

        Args:
            path: Path of the keyword file.
        """
        with open(path, encoding='UTF-8') as f:
            for keyword in f:
                self.add(keyword.strip())

    def filter(self, message, repl="*"):
        """Return ``message`` lower-cased with keywords masked out.

        Args:
            message: ``str`` or UTF-8 encoded ``bytes`` to filter.
            repl: Replacement emitted once per matched character.

        Returns:
            The filtered, lower-cased message.
        """
        if not isinstance(message, str):
            message = message.decode('utf-8')
        message = message.lower()
        ret = []
        start = 0
        length = len(message)
        while start < length:
            level = self.keyword_chains
            matched = 0
            # Walk by index instead of slicing ``message[start:]``, which
            # would copy the rest of the message at every position.
            for pos in range(start, length):
                level = level.get(message[pos])
                if level is None:
                    break
                if self.delimit in level:
                    matched = pos - start + 1
                    break
            if matched:
                ret.append(repl * matched)
                start += matched
            else:
                # No keyword starts here: keep the character and move on.
                ret.append(message[start])
                start += 1
        return ''.join(ret)

DFA 即 Deterministic Finite Automaton,也就是确定有穷自动机。
这里使用嵌套的字典(即字典树)来实现。

你可能感兴趣的:(Python 敏感词过滤的实现)