直播弹幕过滤敏感词之DFA算法

最近遇到了一个需求:将直播弹幕里面的敏感词过滤掉。

一  、DFA算法

二  、java实现

1. 初始化敏感词库,将敏感词加入到 HashMap 中;考虑到搜索效率,这里我们将敏感词库存储在 Redis 中。


public class SensitiveWordInit {
//字符编码
private String ENCODING = "GBK";
public HashMap sensitiveWordMap;

SensitiveWordInit(){
super();
}

/**
* @version 1.0
*/
public Map initKeyWord(){
try {
//读取敏感词库
Set keyWordSet = readSensitiveWordFile();
//将敏感词库加入到HashMap中
addSensitiveWordToHashMap(keyWordSet);
} catch (Exception e) {
e.printStackTrace();
}
return sensitiveWordMap;
}

/**
* 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:

* @param keyWordSet 敏感词库
* @version 1.0
*/
private void addSensitiveWordToHashMap(Set keyWordSet) {
//初始化敏感词容器,减少扩容操作
sensitiveWordMap = new HashMap(keyWordSet.size());
String key = null;
Map nowMap = null;
Map newWorMap = null;
//迭代keyWordSet
Iterator iterator = keyWordSet.iterator();
while(iterator.hasNext()){
//关键字
key = iterator.next();
nowMap = sensitiveWordMap;
for(int i = 0 ; i < key.length() ; i++){
//转换成char型
char keyChar = key.charAt(i);
//获取
Object wordMap = nowMap.get(keyChar);
//如果存在该key,直接赋值
if(wordMap != null){
nowMap = (Map) wordMap;
}
else{ //不存在则,则构建一个map,同时将isEnd设置为0,因为他不是最后一个
newWorMap = new HashMap();
//不是最后一个
newWorMap.put("isEnd", "0");
nowMap.put(keyChar, newWorMap);
nowMap = newWorMap;
}

if(i == key.length() - 1){
//最后一个
nowMap.put("isEnd", "1");
}
}
}
}

/**
* 读取敏感词库中的内容,将内容添加到set集合中
* @version 1.0
* @throws Exception
*/
@SuppressWarnings("resource")
private Set readSensitiveWordFile() throws Exception{
Set set = new HashSet();
//缓存读取
String SensitiveWord = RedisUtil.getInstance(5).get("SensitiveWord");
String[] words = SensitiveWord.split("\n");
for(String str : words){
set.add(str);
}
// //读取文件
// File file = new File("D:\\SensitiveWord.txt");
// InputStreamReader read = new InputStreamReader(new FileInputStream(file),ENCODING);
// try {
// //文件流是否存在
// if(file.isFile() && file.exists()){
// set = new HashSet();
// BufferedReader bufferedReader = new BufferedReader(read);
// String txt = null;
// //读取文件,将文件内容放入到set中
// while((txt = bufferedReader.readLine()) != null){
// set.add(txt);
// }
// }
// else{ //不存在抛出异常信息
// throw new Exception("敏感词库文件不存在");
// }
// } catch (Exception e) {
// throw e;
// }finally{
// read.close(); //关闭文件流
// }
return set;
}
}

             
           2. 写一个敏感词工具类,其方法包括判断是否包含敏感词、获取敏感词、替换敏感词

/**
 * Detects, extracts and masks sensitive words in a text by walking the DFA
 * (trie of nested maps) built by {@code SensitiveWordInit}.
 *
 * <p>Each trie node is a {@code Map}: a {@code Character} key leads to the
 * child node, and the {@code "isEnd"} key holds {@code "1"} when the node
 * completes a word.
 */
public class SensitivewordFilterUtil {
    /** Key under which a trie node stores its end-of-word marker. */
    private static final String END_FLAG_KEY = "isEnd";
    /** Marker value meaning "a word ends at this node". */
    private static final String IS_END = "1";
    /** Matches shorter than this many characters are ignored (single characters are never reported). */
    private static final int MIN_WORD_LENGTH = 2;
    /** Mask used when the caller passes no replacement string. */
    private static final String DEFAULT_REPLACE_CHAR = "*";

    /** Root of the trie. Kept raw for caller compatibility. */
    @SuppressWarnings("rawtypes")
    public Map sensitiveWordMap = null;
    /**
     * Minimum-match rule: stop at the shortest word found at a position.
     */
    public static int minMatchTYpe = 1;
    /**
     * Maximum-match rule: keep going and report the longest word found at a position.
     */
    public static int maxMatchType = 2;

    /**
     * Creates a filter and loads the sensitive-word trie.
     */
    public SensitivewordFilterUtil(){
        sensitiveWordMap = new SensitiveWordInit().initKeyWord();
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param txt the text to inspect; {@code null} is treated as containing nothing
     * @param matchType 1 = minimum match, 2 = maximum match
     * @return {@code true} if a sensitive word starts at any position of {@code txt}
     * @version 1.0
     */
    public boolean isContaintSensitiveWord(String txt,int matchType){
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            // One hit is enough; no need to scan the rest of the text.
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the distinct sensitive words that occur in the text, scanning
     * left to right and resuming after each match (matches do not overlap).
     *
     * @param txt the text to inspect; {@code null} yields an empty set
     * @param matchType 1 = minimum match, 2 = maximum match
     * @return the sensitive words found; never {@code null}
     * @version 1.0
     */
    public Set<String> getSensitiveWord(String txt , int matchType){
        Set<String> found = new HashSet<String>();
        if (txt == null) {
            return found;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length;
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * Returns a copy of the text in which every character of every sensitive
     * word is replaced by {@code replaceChar}.
     *
     * <p>Replacement is positional: the text is scanned once and each matched
     * span is masked where it was found. Words and the replacement are treated
     * literally, so regex metacharacters in either have no special meaning.
     *
     * @param txt the text to mask; returned unchanged when {@code null} or empty
     * @param matchType 1 = minimum match, 2 = maximum match
     * @param replaceChar the string substituted for each matched character; {@code null} means {@code "*"}
     * @return the masked text
     * @version 1.0
     */
    public String replaceSensitiveWord(String txt,int matchType,String replaceChar){
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        String unit = (replaceChar == null) ? DEFAULT_REPLACE_CHAR : replaceChar;
        StringBuilder masked = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                masked.append(getReplaceChars(unit, length));
                i += length;
            } else {
                masked.append(txt.charAt(i));
                i++;
            }
        }
        return masked.toString();
    }

    /**
     * Builds the mask for a word: {@code replaceChar} repeated {@code length} times.
     *
     * @param replaceChar the string to repeat
     * @param length the number of repetitions
     * @return the repeated string
     * @version 1.0
     */
    private String getReplaceChars(String replaceChar,int length){
        StringBuilder repeated = new StringBuilder();
        for (int i = 0; i < length; i++) {
            repeated.append(replaceChar);
        }
        return repeated.toString();
    }

    /**
     * Checks whether a sensitive word starts at {@code beginIndex}.
     *
     * <p>The trie is walked from the root, one text character per step. Every
     * time a node marked as a word end is reached at a depth of at least
     * {@code MIN_WORD_LENGTH}, that depth is remembered. Minimum match stops at
     * the first such node; maximum match continues until the path ends and
     * keeps the deepest one. Characters walked past the last word end are not
     * counted, so the result is always the length of a real word.
     *
     * @param txt the text to inspect
     * @param beginIndex the index in {@code txt} at which the word must start
     * @param matchType 1 = minimum match, 2 = maximum match
     * @return the length of the sensitive word starting at {@code beginIndex}, or 0 if there is none
     * @version 1.0
     */
    @SuppressWarnings({ "rawtypes"})
    public int checkSensitiveWord(String txt,int beginIndex,int matchType){
        if (txt == null || sensitiveWordMap == null) {
            return 0;
        }
        int matchedLength = 0; // length of the last complete word seen on the path
        int depth = 0;         // characters consumed so far
        Map<?, ?> node = sensitiveWordMap;
        for (int i = beginIndex; i < txt.length(); i++) {
            Object child = node.get(txt.charAt(i));
            if (!(child instanceof Map)) {
                // No transition for this character: the path ends here.
                break;
            }
            node = (Map<?, ?>) child;
            depth++;
            if (IS_END.equals(node.get(END_FLAG_KEY)) && depth >= MIN_WORD_LENGTH) {
                matchedLength = depth;
                if (SensitivewordFilterUtil.minMatchTYpe == matchType) {
                    break;
                }
            }
        }
        return matchedLength;
    }

    /**
     * Replaces a message that contains sensitive words with a canned message.
     *
     * <p>The canned messages are read from Redis key {@code "ReplaceMessage"}
     * as one newline-separated string, and one of them is picked at random. If
     * that key is missing or empty, the sensitive words are masked with
     * {@code "*"} instead, so the offending text is never returned as is.
     *
     * @param message the incoming message
     * @return {@code message} itself when it contains no sensitive word, otherwise its replacement
     */
    public String filterMessage(String message){
        // Minimum and maximum match agree on whether any word is present, so one scan suffices.
        if (!isContaintSensitiveWord(message, minMatchTYpe)) {
            return message;
        }
        // NOTE(review): the argument 5 presumably selects a Redis database index; confirm against RedisUtil.
        String messageText = RedisUtil.getInstance(5).get("ReplaceMessage");
        if (messageText == null || messageText.isEmpty()) {
            return replaceSensitiveWord(message, maxMatchType, DEFAULT_REPLACE_CHAR);
        }
        String[] messages = messageText.split("\n");
        Random random = new Random();
        return messages[random.nextInt(messages.length)];
    }
}


3.编写测试类

/**
 * Console demo: prints a sample message before and after it is passed
 * through {@code SensitiveWordService}.
 */
public class test {

    /**
     * Runs the demo.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        final String sample = "你是傻逼吗,fuck you ";
        final SensitiveWordService service = new SensitiveWordService();

        System.out.println("before>>>>>>>>>>>" + sample);
        System.out.println("after>>>>>>>>>>>>" + service.sensitiveWordFiltering(sample));
    }
}

/**
 * Thin service wrapper around {@code SensitivewordFilterUtil}.
 */
class SensitiveWordService{

    /**
     * Filters one message.
     *
     * <p>A new {@code SensitivewordFilterUtil} is created on every call. The
     * text is checked with match type 2; if nothing is found it is returned
     * unchanged. Otherwise the value returned by {@code filterMessage(text)}
     * is passed as the replacement string to
     * {@code replaceSensitiveWord(text, 2, ...)}.
     *
     * @param text the message to filter
     * @return {@code text} itself when no sensitive word is found, otherwise the
     *         result of {@code replaceSensitiveWord}
     */
    public String sensitiveWordFiltering(String text) {
        SensitivewordFilterUtil filter = new SensitivewordFilterUtil();
        if (!filter.isContaintSensitiveWord(text, 2)) {
            return text;
        }
        String replacement = filter.filterMessage(text);
        return filter.replaceSensitiveWord(text, 2, replacement);
    }

}


4.测试结果












你可能感兴趣的:(直播弹幕过滤敏感词之DFA算法)