之前收集很多网站关键词检索工具类,分享一个很好用的,假如你的项目有评论,聊天这样的需求,可以参考,如果考虑不用自己添加词库,最好是去对接百度分控,网易等网站的敏感词过滤接口api
package net.jeeshop.web.action.front.modelComment;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* 敏感词过滤 工具类 -- 【匹配度高,可以使用】
*
* @author g
* @version 0.1
* @CreateDate 2018年11月16日 11:28:32
*/
public class SensitiveWord {
private StringBuilder replaceAll;//初始化
private String encoding = "UTF-8";
private String replceStr = "*";
private int replceSize = 500;
private String fileName = "CensorWords.txt";
private List arrayList;
public Set sensitiveWordSet;//包含的敏感词列表,过滤掉重复项
public List sensitiveWordList;//包含的敏感词列表,包括重复项,统计次数
/**
* 文件要求路径在src或resource下,默认文件名为CensorWords.txt
* @param fileName 词库文件名(含后缀)
*/
public SensitiveWord(String fileName)
{
this.fileName = fileName;
}
/**
* @param replceStr 敏感词被转换的字符
* @param replceSize 初始转义容量
*/
public SensitiveWord(String replceStr,int replceSize)
{
this.replceStr = fileName;
this.replceSize = replceSize;
}
public SensitiveWord()
{
}
/**
* @param str 将要被过滤信息
* @return 过滤后的信息
*/
public String filterInfo(String str)
{ sensitiveWordSet = new HashSet();
sensitiveWordList= new ArrayList<>();
StringBuilder buffer = new StringBuilder(str);
HashMap hash = new HashMap(arrayList.size());
String temp;
for(int x = 0; x < arrayList.size();x++)
{
temp = arrayList.get(x);
int findIndexSize = 0;
for(int start = -1;(start=buffer.indexOf(temp,findIndexSize)) > -1;)
{
//System.out.println("###replace="+temp);
findIndexSize = start+temp.length();//从已找到的后面开始找
Integer mapStart = hash.get(start);//起始位置
if(mapStart == null || (mapStart != null && findIndexSize > mapStart))//满足1个,即可更新map
{
hash.put(start, findIndexSize);
//System.out.println("###敏感词:"+buffer.substring(start, findIndexSize));
}
}
}
Collection values = hash.keySet();
for(Integer startIndex : values)
{
Integer endIndex = hash.get(startIndex);
//获取敏感词,并加入列表,用来统计数量
String sensitive = buffer.substring(startIndex, endIndex);
//System.out.println("###敏感词:"+sensitive);
if (!sensitive.contains("*")) {//添加敏感词到集合
sensitiveWordSet.add(sensitive);
sensitiveWordList.add(sensitive);
}
buffer.replace(startIndex, endIndex, replaceAll.substring(0,endIndex-startIndex));
}
hash.clear();
return buffer.toString();
}
/**
* 初始化敏感词库
*/
public void InitializationWork()
{
replaceAll = new StringBuilder(replceSize);
for(int x=0;x < replceSize;x++)
{
replaceAll.append(replceStr);
}
//加载词库
arrayList = new ArrayList();
InputStreamReader read = null;
BufferedReader bufferedReader = null;
try {
read = new InputStreamReader(SensitiveWord.class.getClassLoader().getResourceAsStream(fileName),encoding);
bufferedReader = new BufferedReader(read);
for(String txt = null;(txt = bufferedReader.readLine()) != null;){
if(!arrayList.contains(txt))
arrayList.add(txt);
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
try {
if(null != bufferedReader)
bufferedReader.close();
} catch (IOException e) {
e.printStackTrace();
}
try {
if(null != read)
read.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public StringBuilder getReplaceAll() {
return replaceAll;
}
public void setReplaceAll(StringBuilder replaceAll) {
this.replaceAll = replaceAll;
}
public String getReplceStr() {
return replceStr;
}
public void setReplceStr(String replceStr) {
this.replceStr = replceStr;
}
public int getReplceSize() {
return replceSize;
}
public void setReplceSize(int replceSize) {
this.replceSize = replceSize;
}
public String getFileName() {
return fileName;
}
public void setFileName(String fileName) {
this.fileName = fileName;
}
public List getArrayList() {
return arrayList;
}
public void setArrayList(List arrayList) {
this.arrayList = arrayList;
}
public String getEncoding() {
return encoding;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public static void main(String[] args){
long startNumer = System.currentTimeMillis();
SensitiveWord sw = new SensitiveWord("CensorWords.txt");
sw.InitializationWork();
//System.out.println("敏感词的数量:" + arrayList.size());
String str = "太多的伤yuming感情怀也许只局限于饲养基地 荧幕中的情节,主人公尝试着去用某种方式渐渐的很潇洒地释自杀指南怀那些自己经历的伤感。"
+ "然后法轮功 我们的扮演的角色就是跟随着主人yum公的喜红客联盟 怒于饲养基地 荧幕中的情节,主人公尝试着去用某种方式渐渐的很潇洒地释自杀指南怀那些自己经历的伤感。"
+ "然后法轮功 我们的扮演的角色就是跟随着主人yum公的喜红客联盟 怒哀20于饲养基地 荧幕中的情节,主人公尝试着去用某种方式渐渐的很潇洒地释自杀指南怀那些自己经历的伤感。"
+ "然后法轮功 我们的扮演的角色就是跟随着主人yum公的喜红客联盟 怒哀20哀2015/4/16 20152015/4/16乐而过于牵强的把自己的情感也附加于银幕情节中,然后感动就流泪,"
+ "关, 人, 流, 电, 发, 情, 太, 限, 法轮功, 个人, 经, 色, 许, 公, 动, 地, 方, 基, 在, 上, 红, 强, 自杀指南, 制, 卡, 三级片, 一, 夜, 多, 手机, 于, 自,"
+ "难过就躺在某一个人的怀里尽情的阐述心扉或者手机卡复制器一个人一杯红酒一部电影在夜三级片 深人静的晚上,关上电话静静的发呆着。";
System.out.println("被检测字符串长度:"+str.length());
str = sw.filterInfo(str);
long endNumber = System.currentTimeMillis();
//System.out.println("语句中包含敏感词的个数为:" + sensitiveWordSet.size() + "。包含:" + sensitiveWordSet);
//System.out.println("语句中包含敏感词的个数为:" + sensitiveWordList.size() + "。包含:" + sensitiveWordList);
System.out.println("总共耗时:"+(endNumber-startNumer)+"ms");
System.out.println("替换后的字符串为:\n"+str);
}
}