基于DFA的敏感词检测和替换模块 SmallGFW


#encoding=utf-8

#DFA based text filter
#author=sunjoy
#version=0.3
class GFW(object):
    def __init__(self):
        self.d = {}
    
    #give a list of "ming gan ci"
    def set(self,keywords):
        p = self.d
        q = {}
        k = ''
        for word in keywords:
            word += chr(11)
            p = self.d
            for char in word:
                char = char.lower()
                if p=='':
                    q[k] = {}
                    p = q[k]
                if not (char in p):
                    p[char] = ''
                    q = p
                    k = char
                p = p[char]
        
        pass
    
    def replace(self,text,mask):
        """
        >>> gfw = GFW()
        >>> gfw.set(["sexy","girl","love","shit"])
        >>> s = gfw.replace("Shit!,Cherry is a sexy girl. She loves python.","*")
        >>> print s
        *!,Cherry is a * *. She *s python.
        """
        p = self.d
        i = 0
        j = 0
        z = 0
        result = []
        ln = len(text)
        while i+j<ln:
            #print i,j
            t = text[i+j].lower()
            #print hex(ord(t))
            if not (t in p):
                j = 0
                i += 1
                p = self.d
                continue
            p = p[t]
            j+=1
            if chr(11) in p:
                p = self.d
                result.append(text[z:i])
                result.append(mask)
                i = i+j
                z = i
                j = 0
        result.append(text[z:i+j])
        return "".join(result)
        
    def check(self,text):
        """
        >>> gfw = GFW()
        >>> gfw.set(["abd","defz","bcz"])
        >>> print gfw.check("xabdabczabdxaadefz")
        [(1, 3, 'abd'), (5, 3, 'bcz'), (8, 3, 'abd'), (14, 4, 'defz')]
        """
        p = self.d
        i = 0
        j = 0
        result = []
        ln = len(text)
        while i+j<ln:
            t = text[i+j].lower()
            #print i,j,hex(ord(t))
            if not (t in p):
                j = 0
                i += 1
                p = self.d
                continue
            p = p[t]
            j+=1
            #print p,i,j
            if chr(11) in p:
                p = self.d
                result.append((i,j,text[i:i+j]))
                i = i+j
                j = 0
        return result
        
if __name__=="__main__":
    import doctest,sys
    doctest.testmod(sys.modules[__name__])
    

   



smallgfw: 一个基于DFA的敏感词检测和替换模块,用法如doctest所示。

>>> gfw = GFW()
>>> gfw.set(["sexy","girl","love","shit"])#设置敏感词列表
>>> s = gfw.replace("shit!,Cherry is a sexy girl. She loves python.","*")
>>> print s
*!,Cherry is a * *. She *s python. #屏蔽后的效果

>>> gfw = GFW()
>>> gfw.set(["abd","defz","bcz"])
>>> print gfw.check("xabdabczabdxaadefz") #检测敏感词的出现位置
[(1, 3, 'abd'), (5, 3, 'bcz'), (8, 3, 'abd'), (14, 4, 'defz')] #例如,(5, 3, 'bcz')表示下标5之后长度为3的子串



from:http://www.oschina.net/p/smallgfw


你可能感兴趣的:(基于DFA的敏感词检测和替换模块 SmallGFW)