运用DFA算法加密。
首先我先对敏感词库初始化,若我的敏感词库为
冰毒
白粉
大麻
大坏蛋
初始化之后得到的是下面这样:
{冰={毒={isEnd=1}, isEnd=0}, 白={粉={isEnd=1}, isEnd=0}, 大={麻={isEnd=1}, isEnd=0, 坏={蛋={isEnd=1}, isEnd=0}}}。
ok,我把初始化之后的数据用A来表示。假设待检测的文字为:张三是个大坏蛋,他竟然吸食白粉和冰毒。
后面检测文字中是否有敏感词的时候,先对要检测的文字迭代循环,逐个取出其中的字符(charAt),这样的话,
如果 A.get(charAt) 为空的话,说明这个字不在敏感词库中,比如 "张","三","是","个" ........
如果 A.get(charAt) 不为空的话,说明这个字存在敏感词库中,比如 "大","坏","蛋" ...........
假设我们检测到 "大" "坏" 的时候,发现这个字存在于敏感词库中,这个时候需要看项目需求,如果只是检测 输入框内是否含有敏感词,
那这里就可以不进行判断了,已经含有敏感词了。
如果要把所有的敏感词用 "*" 号替换的话,那就要继续往下匹配,判断该敏感词是否是最后一个......
以上就是基本思路了,下面上代码 ,不懂的可以留言给我。。。
温馨提示:
初始化敏感词库的时候
1、加了redis缓存
2、敏感词库我是放在了服务器下面
3、编码格式注意,代码里的编码格式要与你的敏感词库的编码格式一致。utf-8或者gbk。(win下把txt另存为可以看到,linux下vim txt,:set fileencoding)
linux下文件编码格式转换,这里是gbk -> utf-8:iconv -f gb18030 -t utf-8 sensitiveword.txt -o sensitiveword.utf8.txt && mv sensitiveword.utf8.txt sensitiveword.txt(注意:-o 不要直接指定源文件本身,否则输出文件会先被清空,导致源文件内容丢失)
你们用main方法测试的时候,要把缓存注释掉,敏感词库路径改为 你们本地。
/*
* Project: admin.common
*
* File Created at 2017年8月23日
*
* Copyright 2016 CMCC Corporation Limited.
* All rights reserved.
*
* This software is the confidential and proprietary information of
* ZYHY Company. ("Confidential Information"). You shall not
* disclose such Confidential Information and shall use it only in
* accordance with the terms of the license.
*/
package com.cmcc.admin.common.sensitive;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
/**
* @Type SensitiveWordFilter.java
* @Desc
* @author whb
* @date 2017年8月23日 下午1:56:38
* @version
*/
public class SensitiveWordFilter {
@SuppressWarnings("rawtypes")
private Map sensitiveWordMap = null;
public static int minMatchType = 1; //最小匹配规则
public static int maxMatchType = 2; //最大匹配规则
/**
* 构造函数,初始化敏感词库
* @throws Exception
* @since 1.8
* @author whb
*/
public SensitiveWordFilter() throws Exception {
sensitiveWordMap = new SensitiveWordInit().initKeyWord();
}
/**
* 检查文字中敏感词的长度
* @param txt
* @param beginIndex
* @param matchType
* @return 如果存在,则返回敏感词字符的长度,不存在返回0
* @since 1.8
* @author whb
*/
@SuppressWarnings("rawtypes")
public int checkSensitiveWord(String txt, int beginIndex, int matchType) {
Map nowMap = sensitiveWordMap;
boolean flag = false; //敏感词结束标识位:用于敏感词只有1位的情况
char word = 0;
int matchFlag = 0; //匹配标识数默认为0
for (int i = beginIndex; i < txt.length(); i++) {
word = txt.charAt(i);
nowMap = (Map) nowMap.get(word); //获取指定key
if (nowMap == null) {
break;//不存在,直接返回
}
//输入的字(排列组合的匹配)出现在敏感词库中,判断是否为最后一个
matchFlag++; //找到相应key,匹配标识+1
if (isEnd(nowMap)) { //如果为最后一个匹配规则,结束循环,返回匹配标识数
flag = true; //结束标志位为true
if (SensitiveWordFilter.minMatchType == matchType) {
break;//最小规则,直接返回,最大规则还需继续查找
}
}
}
if (matchFlag < 2 || !flag) { //长度必须大于等于1,为词
matchFlag = 0;
}
return matchFlag;
}
/**
* 是否包含敏感词
* @param txt
* @param matchType
* @return true:是;false:否
* @since 1.8
* @author whb
*/
public boolean isContaintSensitiveWord(String txt, int matchType) {
boolean flag = false;
for (int i = 0; i < txt.length(); i++) {
int matchFlag = this.checkSensitiveWord(txt, i, matchType);
if (matchFlag > 0) {
flag = true;
}
}
return flag;
}
/**
* 是否包含敏感词(重庆项目默认值,按最小匹配规则来,只要有敏感词就ok)
* 如果敏感词库为:
* 中
* 中国
* 中国人
* 初始化之后为:{中={isEnd=1, 国={人={isEnd=1}, isEnd=1}}}
* 测试的文字为:我是一名中国人。
* 1、按最小规则匹配, 匹配 中 的时候,就为最后一个了 直接break。
* 2、按最大规则匹配, 匹配 中 的时候,就为最后一个,继续匹配 国,人。
* @param txt
* @return true:是;false:否
* @since 1.8
* @author whb
*/
public boolean isSensitive(String txt) {
boolean flag = false;
for (int i = 0; i < txt.length(); i++) {
int matchFlag = this.checkSensitiveWord(txt, i, 1);
if (matchFlag > 0) {
flag = true;
}
}
return flag;
}
/**
* 获取文字中的敏感词
* @param txt
* @param matchType
* @return
* @since 1.8
* @author whb
*/
public Set getSensitiveWord(String txt, int matchType) {
Set sensitiveWordList = new HashSet();
for (int i = 0; i < txt.length(); i++) {
int length = checkSensitiveWord(txt, i, matchType);
if (length > 0) { //存在,加入list中
sensitiveWordList.add(txt.substring(i, i + length));
i = i + length - 1; //减1的原因,是因为for会自增
}
}
return sensitiveWordList;
}
/**
* 替换敏感字字符
* @param txt
* @param matchType
* @param replaceChar
* @return
* @since 1.8
* @author whb
*/
public String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
String resultTxt = txt;
Set set = this.getSensitiveWord(txt, matchType); //获取所有的敏感词
Iterator iterator = set.iterator();
String word = null;
String replaceString = null;
while (iterator.hasNext()) {
word = iterator.next();
replaceString = getReplaceChars(replaceChar, word.length());
resultTxt = resultTxt.replaceAll(word, replaceString);
}
return resultTxt;
}
/**
* 获取替换字符串
* @param replaceChar
* @param length
* @return
* @since 1.8
* @author whb
*/
private String getReplaceChars(String replaceChar, int length) {
String resultReplace = replaceChar;
for (int i = 1; i < length; i++) {
resultReplace += replaceChar;
}
return resultReplace;
}
/**
* 判断是否为最后一个
* @param nowMap
* @return
* @since 1.8
* @author whb
*/
@SuppressWarnings("rawtypes")
private boolean isEnd(Map nowMap) {
boolean flag = false;
if ("1".equals(nowMap.get("isEnd"))) {
flag = true;
}
return flag;
}
public static void main(String[] args) throws Exception {
SensitiveWordFilter filter = new SensitiveWordFilter();
System.out.println("敏感词的数量:" + filter.sensitiveWordMap.size());
String string = "王弘博是个大坏蛋,他竟然吸食白粉和冰毒";
System.out.println("待检测语句的字数:" + string.length());
long beginTime = System.currentTimeMillis();
Set set = filter.getSensitiveWord(string, 1);
String result = filter.replaceSensitiveWord(string, 1, "*");
boolean flag = filter.isSensitive(string);
System.out.println(flag);
long endTime = System.currentTimeMillis();
System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
System.out.println("敏感词处理之后为:"+result);
System.out.println("总共消耗时间为:" + (endTime - beginTime));
}
}
/**
* Revision history
* -------------------------------------------------------------------------
*
* Date Author Note
* -------------------------------------------------------------------------
* 2017年8月23日 whb create
*/
/*
* Project: admin.common
*
* File Created at 2017年8月23日
*
* Copyright 2016 CMCC Corporation Limited.
* All rights reserved.
*
* This software is the confidential and proprietary information of
* ZYHY Company. ("Confidential Information"). You shall not
* disclose such Confidential Information and shall use it only in
* accordance with the terms of the license.
*/
package com.cmcc.admin.common.sensitive;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.AbstractApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import com.cmcc.aqb.cache.redis.RedisClient;
/**
 * @Type SensitiveWordInit.java
 * @Desc Loads the sensitive-word library from a text file (one word per line), builds the
 *       nested-Map DFA used by SensitiveWordFilter, and caches the result in Redis.
 * @author whb
 * @date 2017年8月23日 下午1:57:03
 * @version
 */
public class SensitiveWordInit {

    /** Must match the encoding of the word-library file (utf-8 or gbk). */
    private static final String ENCODING = "utf-8";

    /** The DFA model: {char -> subMap, "isEnd" -> "0"/"1"}. Built lazily by initKeyWord(). */
    @SuppressWarnings("rawtypes")
    public HashMap sensitiveWordMap;

    public SensitiveWordInit() {
        super();
    }

    static RedisClient redisClient = null;
    private static String SPLIT = "#"; // separator used inside the cache key
    private static int EXPIRE_TIME = 3600; // cache TTL in seconds
    private static String SENSITIVE_WORD = SensitiveWordInit.class.getName();

    /** Builds the Redis cache key for the given type discriminator. */
    private String sensitiveWordKey(String type) {
        StringBuilder sb = new StringBuilder();
        sb.append(type).append(SPLIT).append("sensitiveWordInit");
        return sb.toString();
    }

    /**
     * Initializes the DFA: tries the Redis cache first; on a miss, reads the word file,
     * builds the model and writes it back to the cache.
     * @return the DFA map (never null on success)
     * @throws RuntimeException if the library cannot be loaded (original cause attached)
     * @since 1.8
     * @author whb
     */
    @SuppressWarnings({ "rawtypes", "resource" })
    public Map initKeyWord() {
        try {
            ApplicationContext ac = new ClassPathXmlApplicationContext(new String[] {
                    "spring/datasource.xml", "spring/cache.xml" });
            redisClient = (RedisClient) ac.getBean("redisClient");
            String key = sensitiveWordKey(SENSITIVE_WORD);
            sensitiveWordMap = redisClient.get(key);
            if (sensitiveWordMap == null) { // cache miss: rebuild from the file
                Set set = readSensitiveWordFile();
                addSensitiveWordToHashMap(set);
                redisClient.put(key, sensitiveWordMap, EXPIRE_TIME);
            }
            ((AbstractApplicationContext) ac).registerShutdownHook();
            return sensitiveWordMap;
        } catch (Exception e) {
            // Keep the original exception as the cause; swallowing it made failures
            // (missing file, wrong encoding, Redis down) impossible to diagnose.
            throw new RuntimeException("初始化敏感词库错误", e);
        }
    }

    /**
     * 读取敏感词库,并把内容放到set里 (one word per line).
     * @return set of words read from the file
     * @throws Exception if the file is missing or unreadable
     * @since 1.8
     * @author whb
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private Set readSensitiveWordFile() throws Exception {
        File file = new File("/home/sensitiveword.txt");
        // Check existence BEFORE opening the stream; the old code opened the
        // FileInputStream first, so a missing file threw FileNotFoundException and the
        // "敏感词库文件不存在" branch was unreachable.
        if (!file.isFile() || !file.exists()) {
            throw new Exception("敏感词库文件不存在");
        }
        Set set = new HashSet();
        // try-with-resources closes the reader; exceptions propagate to the caller
        // unchanged (no printStackTrace needed — initKeyWord wraps them with a cause).
        try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), ENCODING))) {
            String txt;
            while ((txt = bufferedReader.readLine()) != null) {
                set.add(txt);
            }
        }
        return set;
    }

    /**
     * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
     * 中 = {
     *      isEnd = 0
     *      国 = {
     *           isEnd = 1
     *           人 = {isEnd = 0
     *                民 = {isEnd = 1}
     *                }
     *           男  = {
     *                  isEnd = 0
     *                   人 = {
     *                        isEnd = 1
     *                       }
     *               }
     *           }
     *      }
     * 五 = {
     *      isEnd = 0
     *      星 = {
     *          isEnd = 0
     *          红 = {
     *              isEnd = 0
     *              旗 = {
     *                   isEnd = 1
     *                  }
     *              }
     *          }
     *      }
     * @param keyWordSet the words to insert
     * @since 1.8
     * @author whb
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private void addSensitiveWordToHashMap(Set keyWordSet) {
        sensitiveWordMap = new HashMap(keyWordSet.size()); // presize to avoid rehashing
        Iterator iterator = keyWordSet.iterator();
        while (iterator.hasNext()) {
            String key = (String) iterator.next();
            Map nowMap = sensitiveWordMap; // walk/extend the trie from the root
            for (int i = 0; i < key.length(); i++) {
                char charKey = key.charAt(i);
                Object wordMap = nowMap.get(charKey);
                if (wordMap != null) {
                    // Prefix already present: descend into the existing node.
                    nowMap = (Map) wordMap;
                } else {
                    // New branch: create a node, not yet the end of a word.
                    Map newWordMap = new HashMap();
                    newWordMap.put("isEnd", "0");
                    nowMap.put(charKey, newWordMap);
                    nowMap = newWordMap;
                }
                if (i == key.length() - 1) { // last char of this word: mark terminal
                    nowMap.put("isEnd", "1");
                }
            }
        }
    }
}
/**
* Revision history
* -------------------------------------------------------------------------
*
* Date Author Note
* -------------------------------------------------------------------------
* 2017年8月23日 whb create
*/