敏感词过滤工具类

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author Administrator
 * @create 2018/7/31
 */
public class SensitiveWordUtil {

    private static Logger logger = LoggerFactory.getLogger(SensitiveWordUtil.class);
    /**
     * 常量定义
     */
    private static final String KEY_IS_END = "isEnd";
    private static final String IS_END = "1";
    private static final String NOT_END = "0";
    private static final String filePath = "src/main/resources/dictionary.txt";
    /**
     * 敏感词匹配规则
     * 最小匹配规则,如:敏感词库["中国","中国人"],语句:"我是中国人",匹配结果:我是[中国]人
     * 最大匹配规则,如:敏感词库["中国","中国人"],语句:"我是中国人",匹配结果:我是[中国人]
     */
    public static final int MIN_MATCH_TYPE = 1;
    public static final int MAX_MaTCH_TYPE = 2;
    /**
     * 本地存储的DFA数据模型
     */
    private static Map sensitiveWordMap;

    static {
        // 初始化操作
        initSensitiveWordMap(getSensitiveWordSet());
    }

    /**
     * 获取本地词典
     * @return
     */
    private static Set getSensitiveWordSet() {
        InputStream inputStream = null;
        InputStreamReader inputStreamReader = null;
        BufferedReader bufferedReader = null;
        try {
            File file = new File(filePath);
            inputStream = new FileInputStream(file);
            inputStreamReader = new InputStreamReader(inputStream, "UTf-8");
            bufferedReader = new BufferedReader(inputStreamReader);
            Set sensitiveWordSet = new HashSet<>();
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                sensitiveWordSet.add(line);
            }
            logger.info("敏感词的数量:" + sensitiveWordSet.size());
            return sensitiveWordSet;
        }  catch (Exception e) {
            logger.error("获取本地敏感词库出错",e);
        } finally {
            // 关闭资源
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
                if (inputStreamReader != null) {
                    inputStreamReader.close();
                }
                if (inputStream != null) {
                    inputStream.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * 将词典转化为DFA数据模型
     * @param sensitiveWordSet
     */
    private static void initSensitiveWordMap(Set sensitiveWordSet) {
        // 校验
        if (sensitiveWordSet == null || sensitiveWordSet.size() <= 0) {
            return;
        }
        //初始化敏感词容器,减少扩容操作
        sensitiveWordMap = new HashMap(sensitiveWordSet.size());
        String key;
        Map nowMap;
        Map newWorMap;
        //迭代sensitiveWordSet
        Iterator iterator = sensitiveWordSet.iterator();
        while (iterator.hasNext()) {
            //关键字
            key = iterator.next();
            nowMap = sensitiveWordMap;
            for (int i = 0; i < key.length(); i++) {
                //转换成char型
                char keyChar = key.charAt(i);
                //库中获取关键字
                Object wordMap = nowMap.get(keyChar);
                //如果存在该key,直接赋值,用于下一个循环获取
                if (wordMap != null) {
                    nowMap = (Map) wordMap;
                } else {
                    //不存在则,则构建一个map,同时将isEnd设置为0,因为他不是最后一个
                    newWorMap = new HashMap<>(2);
                    //不是最后一个
                    newWorMap.put(KEY_IS_END, NOT_END);
                    nowMap.put(keyChar, newWorMap);
                    nowMap = newWorMap;
                }

                if (i == key.length() - 1) {
                    //最后一个
                    nowMap.put(KEY_IS_END, IS_END);
                }
            }
        }
    }

    /**
     * 检查文字中是否包含敏感字符,检查规则如下:
* @param txt * @param beginIndex * @param matchType * @return 如果存在,则返回敏感词字符的长度,不存在返回0 */ private static int checkSensitiveWord(String txt, int beginIndex, int matchType) { //敏感词结束标识位:用于敏感词只有1位的情况 boolean flag = false; //匹配标识数默认为0 int matchFlag = 0; char word; Map nowMap = sensitiveWordMap; for (int i = beginIndex; i < txt.length(); i++) { word = txt.charAt(i); //获取指定key nowMap = (Map) nowMap.get(word); if (nowMap != null) { //存在,则判断是否为最后一个 //找到相应key,匹配标识+1 matchFlag++; //如果为最后一个匹配规则,结束循环,返回匹配标识数 if ("1".equals(nowMap.get("isEnd"))) { //结束标志位为true flag = true; //最小规则,直接返回,最大规则还需继续查找 if (MIN_MATCH_TYPE == matchType) { break; } } } else {//不存在,直接返回 break; } } if (matchFlag < 2 || !flag) { //长度必须大于等于1,为词 matchFlag = 0; } return matchFlag; } /** * 获取文字中的敏感词 * @param txt 文字 * @param matchType 匹配规则 1:最小匹配规则,2:最大匹配规则 * @return */ private static Set getSensitiveWord(String txt, int matchType) { Set sensitiveWordSet = new HashSet<>(); for (int i = 0; i < txt.length(); i++) { //判断是否包含敏感字符 int length = checkSensitiveWord(txt, i, matchType); if (length > 0) { //存在,加入set中 sensitiveWordSet.add(txt.substring(i, i + length)); //减1的原因,是因为for会自增 i = i + length - 1; } } logger.info("语句中包含敏感词的个数为:" + sensitiveWordSet.size() + "。包含:" + sensitiveWordSet); return sensitiveWordSet; } /** * 判断文字是否包含敏感字符 * @param txt 文字 * @return 若包含返回true,否则返回false */ public static boolean contains(String txt) { logger.info("待检测的字符串为:" + txt); Set sensitiveWord = getSensitiveWord(txt, MAX_MaTCH_TYPE); if (sensitiveWord != null && sensitiveWord.size() > 0) { return true; } return false; } }

你可能感兴趣的:(敏感词过滤工具类)