这里使用 Python 的 Bottle 框架实现一个简易的敏感词过滤系统,算法采用成熟的 DFA 关键词匹配算法。本系统只提供一套基于 HTTP 的 API,可供各个应用调用。这里我只实现了最核心的业务,其他功能以后再完善。
检索算法采用的是网上找到的 DFA 算法的 Python 实现版本:
smallgfw.py
# encoding=utf-8
# DFA based text filter
# version=0.3


class GFW(object):
    """Case-insensitive keyword filter backed by a character trie (the "DFA").

    ``self.d`` is the trie root: nested dicts keyed by lower-cased
    characters.  A node at which a keyword ends additionally carries the
    marker key ``chr(11)``.  Keywords that themselves contain that marker
    character are not supported.
    """

    # Key stored in a trie node to flag "a keyword ends here".
    _END = chr(11)

    def __init__(self):
        self.d = {}

    def set(self, keywords):
        """Add every word of *keywords* (an iterable of strings) to the trie.

        Characters are lower-cased one by one before being stored, which is
        what makes matching case-insensitive.  Empty words are skipped so
        that the root never becomes an "end of keyword" node.  The method
        may be called several times; keywords accumulate.
        """
        end = self._END
        for word in keywords:
            if not word:
                continue
            node = self.d
            for char in word:
                char = char.lower()
                child = node.get(char)
                if not isinstance(child, dict):
                    # Missing branch (or a bare end marker in the way):
                    # create a real node so the walk can continue.
                    child = node[char] = {}
                node = child
            node.setdefault(end, '')

    def _scan(self, text):
        """Yield ``(start, length)`` for each keyword occurrence in *text*.

        The scan goes left to right and reports a keyword as soon as one is
        completed, so when one keyword is a prefix of another the shorter
        one wins.  Matches never overlap: scanning resumes right after the
        reported match.
        """
        root = self.d
        end = self._END
        size = len(text)
        node = root
        start = 0
        length = 0
        while start + length < size:
            child = node.get(text[start + length].lower())
            if not isinstance(child, dict):
                # Dead end: give up on this start position and retry from
                # the next character.
                start += 1
                length = 0
                node = root
                continue
            node = child
            length += 1
            if end in node:
                yield start, length
                start += length
                length = 0
                node = root

    def replace(self, text, mask):
        """Return *text* with every keyword occurrence replaced by *mask*.

        Each occurrence is replaced by one copy of *mask*, whatever the
        length of the keyword.

        >>> gfw = GFW()
        >>> gfw.set(["sexy", "girl", "love", "shit"])
        >>> s = gfw.replace("Shit!,Cherry is a sexy girl. She loves python.", "*")
        >>> print(s)
        *!,Cherry is a * *. She *s python.
        """
        pieces = []
        tail = 0  # index just past the previous match
        for start, length in self._scan(text):
            pieces.append(text[tail:start])
            pieces.append(mask)
            tail = start + length
        pieces.append(text[tail:])
        return "".join(pieces)

    def check(self, text):
        """Return a list of ``(offset, length, matched_text)`` tuples.

        ``matched_text`` is the slice of *text* as written (original case).

        >>> gfw = GFW()
        >>> gfw.set(["abd", "defz", "bcz"])
        >>> print(gfw.check("xabdabczabdxaadefz"))
        [(1, 3, 'abd'), (5, 3, 'bcz'), (8, 3, 'abd'), (14, 4, 'defz')]
        """
        return [
            (start, length, text[start:start + length])
            for start, length in self._scan(text)
        ]


if __name__ == "__main__":
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])
# -*- coding:utf-8 -*-
# localhost testing
# caroltc 2014/10/7
from bottle import route, run, request
from smallgfw import *
import json
import sys

# Python 2 only: make UTF-8 the interpreter-wide default encoding.
# ``reload(sys)`` is needed because ``setdefaultencoding`` is removed from
# the ``sys`` module during start-up.  Done once here rather than on every
# request.
reload(sys)
sys.setdefaultencoding('utf8')


def initWords():
    """Return the keywords listed in ``words.txt``, one keyword per line.

    Only the line terminator is stripped, so a last line without a trailing
    newline keeps its final character and CRLF files do not leave a stray
    ``\\r`` on each word.  Blank lines are skipped.
    """
    path = 'words.txt'
    word_list = []
    with open(path, 'r') as fp:
        for line in fp:
            word = line.rstrip('\r\n')
            if word:
                word_list.append(word)
    return word_list


def _load_filter():
    """Build a GFW filter from the current contents of ``words.txt``.

    The file is re-read on every call, so edits to the word list take
    effect without restarting the server.
    """
    gfw = GFW()
    gfw.set(initWords())  # load the sensitive-word list
    return gfw


@route('/replace', method="POST")
def replace():
    """POST /replace: return the ``words`` parameter with keywords masked.

    Every keyword occurrence is replaced by ``**``.
    """
    getwords = request.params.words or ""
    return _load_filter().replace(getwords.encode('utf8'), "**")


@route('/check', method="POST")
def check():
    """POST /check: report the keywords found in the ``words`` parameter.

    Returns a JSON object with ``count`` (number of matches) and ``datas``
    (the list produced by ``GFW.check``).  The text is UTF-8 encoded before
    scanning, so under Python 2 the offsets and lengths count bytes.
    """
    getwords = request.params.words or ""
    res = _load_filter().check(getwords.encode('utf8'))
    return json.dumps({'count': len(res), 'datas': res})


@route('/test')
def test():
    """GET /test: serve a minimal HTML form that posts to /replace."""
    webdata = '<h1>check</h1><form action="/replace" method="post"><input type="text" name="words" /><input type="submit"></form>'
    return webdata


run(host='localhost', port=80, debug=True)
测试一下 API,均为 POST 请求:
过滤敏感词API,直接返回过滤后的数据
检测敏感词API,返回json格式数据
用 Bottle 来开发这样的小工具相当快。敏感词检测在很多应用场景中都需要,把它独立出来写成接口可以提高效率,并且易于维护。国内目前第三方敏感词检测服务还不多,天朝的需求又很旺盛,可以试试搞个在线敏感词检测服务平台。