使用python开发敏感词检测过滤系统

这里使用 Python 的 bottle 框架实现一个简易的敏感词过滤系统,检索算法采用成熟的 DFA 关键词匹配算法。本系统只提供一套基于 HTTP 的 API,可供各个应用调用。这里只实现了最核心的业务,其他功能留待后续完善。

检索算法是网上找的DFA算法的python实现版本

smallgfw.py

#encoding=utf-8
#DFA based text filter
#version=0.3
class GFW(object):
    """Case-insensitive keyword filter backed by a character trie (DFA style).

    ``self.d`` is the trie root: a dict mapping a lower-cased character to
    its child node (another dict).  A node that completes a keyword carries
    the ``_END`` marker key.  Keywords and scanned text are both lower-cased
    one character at a time, which makes matching case-insensitive.

    Matches are reported left to right and never overlap.  A match is
    emitted as soon as a keyword is completed, so when one keyword is a
    prefix of another the shorter one wins.
    """

    # End-of-keyword marker.  It is deliberately not a character, so it can
    # never collide with keyword content; an in-band marker such as chr(11)
    # would make keywords containing that character match too early.
    _END = None

    def __init__(self):
        self.d = {}

    def set(self, keywords):
        """Add every keyword in *keywords* (an iterable of strings) to the trie.

        May be called repeatedly; keywords accumulate.  Empty keywords are
        skipped because the scanner only reports a match after consuming at
        least one character.
        """
        for word in keywords:
            if not word:
                continue
            node = self.d
            for char in word:
                # Lower-case per character, mirroring what _scan does to text.
                node = node.setdefault(char.lower(), {})
            node[self._END] = True

    def _scan(self, text):
        """Yield ``(start, length)`` for each keyword occurrence in *text*."""
        root = self.d
        end = self._END
        node = root
        start = 0
        length = 0
        size = len(text)
        while start + length < size:
            char = text[start + length].lower()
            if char not in node:
                # Dead end: retry from the character after the current start.
                start += 1
                length = 0
                node = root
                continue
            node = node[char]
            length += 1
            if end in node:
                yield start, length
                # Resume right after the match so matches never overlap.
                start += length
                length = 0
                node = root

    def replace(self, text, mask):
        """Return *text* with every keyword occurrence replaced by *mask*.

        Each occurrence is replaced by one copy of *mask*, regardless of the
        keyword's length.

        >>> gfw = GFW()
        >>> gfw.set(["sexy","girl","love","shit"])
        >>> s = gfw.replace("Shit!,Cherry is a sexy girl. She loves python.","*")
        >>> print(s)
        *!,Cherry is a * *. She *s python.
        """
        pieces = []
        tail = 0  # index of the first character not yet copied to the output
        for start, length in self._scan(text):
            pieces.append(text[tail:start])
            pieces.append(mask)
            tail = start + length
        pieces.append(text[tail:])
        return "".join(pieces)

    def check(self, text):
        """Return a list of ``(start, length, matched_text)`` tuples.

        ``matched_text`` is the slice of *text* as it appears in the input
        (original casing preserved).

        >>> gfw = GFW()
        >>> gfw.set(["abd","defz","bcz"])
        >>> print(gfw.check("xabdabczabdxaadefz"))
        [(1, 3, 'abd'), (5, 3, 'bcz'), (8, 3, 'abd'), (14, 4, 'defz')]
        """
        return [(start, length, text[start:start + length])
                for start, length in self._scan(text)]
        
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    import sys
    import doctest

    current_module = sys.modules[__name__]
    doctest.testmod(current_module)
    

    



然后编写基于 bottle 框架的 API 文件 localbottle.py:

#-*- coding:utf-8 -*-
#localhost testing
#caroltc 2014/10/7
from bottle import route, run, request
from smallgfw import *
import json
import sys

def initWords(path='words.txt'):
    """Load the sensitive-word list from *path*, one keyword per line.

    Only the line terminator (``\\n`` or ``\\r\\n``) is stripped, so the last
    word is kept intact even when the file has no trailing newline.  Blank
    lines are skipped.

    :param path: file to read; defaults to ``words.txt`` in the current
        working directory.
    :returns: list of keywords in file order.
    :raises IOError: if the file cannot be opened.

    NOTE(review): words are returned exactly as ``open(path, 'r')`` yields
    them; no explicit decoding is done here, so the file's encoding must
    agree with the text the words are later matched against -- confirm.
    """
    word_list = []
    # ``with`` closes the file even if reading raises.
    with open(path, 'r') as fp:
        for line in fp:
            word = line.rstrip('\r\n')
            if word:
                word_list.append(word)
    return word_list

@route('/replace', method="POST")
def replace():
    """POST /replace: return the ``words`` parameter with keywords masked.

    Every occurrence of a word from the sensitive-word list is replaced by
    ``**``.  A missing ``words`` parameter is treated as an empty string.
    """
    # Local import keeps this handler self-contained.
    from bottle import response

    # Python 2 idiom: make UTF-8 the process-wide default codec used for
    # implicit str/unicode conversions.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # The body is the caller's own text, not markup.  Serving it as plain
    # text stops browsers from interpreting any HTML it contains.
    response.content_type = 'text/plain; charset=UTF-8'

    getwords = request.params.words or ""
    gfw = GFW()
    # Load the sensitive-word list (the file is re-read on every request).
    gfw.set(initWords())
    return gfw.replace(getwords.encode('utf8'), "**")

@route('/check',method="POST")
def check():
    """POST /check: report the sensitive words found in the ``words`` parameter.

    Returns a JSON object with ``count`` (number of matches) and ``datas``
    (the list of ``(start, length, matched_text)`` entries produced by
    ``GFW.check``).  Offsets refer to the UTF-8 encoded text that is passed
    to the matcher.  A missing ``words`` parameter is treated as empty.
    """
    # Local import keeps this handler self-contained.
    from bottle import response

    # Python 2 idiom: make UTF-8 the process-wide default codec used for
    # implicit str/unicode conversions.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # The body is JSON, so label it as such instead of the HTML default.
    response.content_type = 'application/json'

    getwords = request.params.words or ""
    gfw = GFW()
    # Load the sensitive-word list (the file is re-read on every request).
    gfw.set(initWords())
    res = gfw.check(getwords.encode('utf8'))
    resp = {}
    resp['count'] = len(res)
    resp['datas']= res
    return json.dumps(resp)

@route('/test')
def test():
    """GET /test: serve a minimal HTML form that posts ``words`` to /replace."""
    # Python 2 idiom: make UTF-8 the process-wide default codec used for
    # implicit str/unicode conversions.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # The closing tag was misspelled as </from>, leaving the form unclosed.
    webdata = '<h1>check</h1><form action="/replace" method="post"><input type="text" name="words" /><input type="submit"></form>'
    return webdata


if __name__ == "__main__":
    # Start the development server only when this file is executed as a
    # script, so importing the module does not block on the server.
    # NOTE(review): binding port 80 usually needs elevated privileges, and
    # debug=True is meant for development -- confirm before deploying.
    run(host='localhost', port=80, debug=True)



敏感词文件为 words.txt,每行一个词即可。该文件采用 GB2312 编码,程序文件均为 UTF-8 编码。

测试一下api,均为POST请求

过滤敏感词API,直接返回过滤后的数据

使用python开发敏感词检测过滤系统_第1张图片

检测敏感词API,返回json格式数据

使用python开发敏感词检测过滤系统_第2张图片

用bottle来开发这样的小工具相当快,而且敏感词检测系统在很多应用场景都需要,独立出来写成接口可以提高效率,并且易于维护,国内目前第三方敏感词检测服务还不多,天朝的需求又很旺盛,可以试试搞个在线敏感词检测服务平台。

你可能感兴趣的:(使用python开发敏感词检测过滤系统)