DFA敏感词过滤算法

运用DFA算法进行敏感词匹配(注意:DFA是一种匹配算法,并非加密)。

首先我先对敏感词库初始化,若我的敏感词库为

冰毒
白粉
大麻
大坏蛋

初始化之后得到的是下面这样。:

{冰={毒={isEnd=1}, isEnd=0}, 白={粉={isEnd=1}, isEnd=0}, 大={麻={isEnd=1}, isEnd=0, 坏={蛋={isEnd=1}, isEnd=0}}}。

ok,我把初始化之后的数据用A来表示。假设待检测的文字为:张三是个大坏蛋,他竟然吸食白粉和冰毒。

后面检测文字中是否有敏感词的时候,先把要检测的文字迭代循环,并转换成charAt值,这样的话,

如果 A.get(charAt) 为空的话,说明这个字不在敏感词库中,比如 "张","三","是","个" ........

如果 A.get(charAt) 不为空的话,说明这个字存在敏感词库中,比如 "大","坏","蛋" ...........

假设我们检测到  "大" "坏" 的时候,发现这个字存在于敏感词库中,这个时候需要看项目需求,如果只是检测 输入框内是否含有敏感词,

那这里就可以不进行判断了,已经含有敏感词了。

如果要把所有的敏感词用 "*" 号替换的话,那就要继续往下匹配,判断该敏感词是否是最后一个......

以上就是基本思路了,下面上代码 ,不懂的可以留言给我。。。

温馨提示:

初始化敏感词库的时候

1、加了redis缓存

2、敏感词库我是放在了服务器下面

3、编码格式注意,代码里的编码格式要与你的敏感词库的编码格式一致。utf-8或者gbk。(win下把txt另存为可以看到,linux下vim txt,:set fileencoding)

linux下文件编码格式转换,这里是gbk -> utf-8:iconv -f gb18030 -t utf-8 sensitiveword.txt -o sensitiveword.txt.utf8 && mv sensitiveword.txt.utf8 sensitiveword.txt(注意:输入和输出不要写同一个文件,否则可能得到空文件)

你们用main方法测试的时候,要把缓存注释掉,敏感词库路径改为 你们本地。


/*
 * Project: admin.common
 *
 * File Created at 2017年8月23日
 *
 * Copyright 2016 CMCC Corporation Limited.
 * All rights reserved.
 *
 * This software is the confidential and proprietary information of
 * ZYHY Company. ("Confidential Information").  You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license.
 */
package com.cmcc.admin.common.sensitive;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * @Type SensitiveWordFilter.java
 * @Desc
 * @author whb
 * @date 2017年8月23日 下午1:56:38
 * @version
 */
public class SensitiveWordFilter {
    @SuppressWarnings("rawtypes")
    private Map sensitiveWordMap = null;
    public static int minMatchType = 1; //最小匹配规则
    public static int maxMatchType = 2; //最大匹配规则

    /**
     * 构造函数,初始化敏感词库
     * @throws Exception
     * @since 1.8
     * @author whb
     */
    public SensitiveWordFilter() throws Exception {
        sensitiveWordMap = new SensitiveWordInit().initKeyWord();
    }

    /**
     * 检查文字中敏感词的长度
     * @param txt
     * @param beginIndex
     * @param matchType
     * @return 如果存在,则返回敏感词字符的长度,不存在返回0
     * @since 1.8
     * @author whb
     */
    @SuppressWarnings("rawtypes")
    public int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        Map nowMap = sensitiveWordMap;
        boolean flag = false; //敏感词结束标识位:用于敏感词只有1位的情况
        char word = 0;
        int matchFlag = 0; //匹配标识数默认为0
        for (int i = beginIndex; i < txt.length(); i++) {
            word = txt.charAt(i);
            nowMap = (Map) nowMap.get(word); //获取指定key
            if (nowMap == null) {
                break;//不存在,直接返回
            }
            //输入的字(排列组合的匹配)出现在敏感词库中,判断是否为最后一个
            matchFlag++; //找到相应key,匹配标识+1
            if (isEnd(nowMap)) { //如果为最后一个匹配规则,结束循环,返回匹配标识数
                flag = true; //结束标志位为true
                if (SensitiveWordFilter.minMatchType == matchType) {
                    break;//最小规则,直接返回,最大规则还需继续查找
                }
            }
        }
        if (matchFlag < 2 || !flag) { //长度必须大于等于1,为词
            matchFlag = 0;
        }
        return matchFlag;
    }

    /**
     * 是否包含敏感词
     * @param txt
     * @param matchType
     * @return true:是;false:否
     * @since 1.8
     * @author whb
     */
    public boolean isContaintSensitiveWord(String txt, int matchType) {
        boolean flag = false;
        for (int i = 0; i < txt.length(); i++) {
            int matchFlag = this.checkSensitiveWord(txt, i, matchType);
            if (matchFlag > 0) {
                flag = true;
            }
        }
        return flag;
    }

    /**
     * 是否包含敏感词(重庆项目默认值,按最小匹配规则来,只要有敏感词就ok)
     * 如果敏感词库为:
     *          中
     *          中国
     *          中国人
     *  初始化之后为:{中={isEnd=1, 国={人={isEnd=1}, isEnd=1}}}
     *  测试的文字为:我是一名中国人。
     *  1、按最小规则匹配,  匹配 中 的时候,就为最后一个了 直接break。
     *  2、按最大规则匹配,  匹配 中 的时候,就为最后一个,继续匹配 国,人。
     * @param txt
     * @return true:是;false:否
     * @since 1.8
     * @author whb
     */
    public boolean isSensitive(String txt) {
        boolean flag = false;
        for (int i = 0; i < txt.length(); i++) {
            int matchFlag = this.checkSensitiveWord(txt, i, 1);
            if (matchFlag > 0) {
                flag = true;
            }
        }
        return flag;
    }

    /**
     * 获取文字中的敏感词
     * @param txt
     * @param matchType
     * @return
     * @since 1.8
     * @author whb
     */
    public Set getSensitiveWord(String txt, int matchType) {
        Set sensitiveWordList = new HashSet();
        for (int i = 0; i < txt.length(); i++) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) { //存在,加入list中
                sensitiveWordList.add(txt.substring(i, i + length));
                i = i + length - 1; //减1的原因,是因为for会自增
            }
        }
        return sensitiveWordList;
    }

    /**
     * 替换敏感字字符
     * @param txt
     * @param matchType
     * @param replaceChar
     * @return
     * @since 1.8
     * @author whb
     */
    public String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
        String resultTxt = txt;
        Set set = this.getSensitiveWord(txt, matchType); //获取所有的敏感词
        Iterator iterator = set.iterator();
        String word = null;
        String replaceString = null;
        while (iterator.hasNext()) {
            word = iterator.next();
            replaceString = getReplaceChars(replaceChar, word.length());
            resultTxt = resultTxt.replaceAll(word, replaceString);
        }
        return resultTxt;
    }

    /**
     * 获取替换字符串
     * @param replaceChar
     * @param length
     * @return
     * @since 1.8
     * @author whb
     */
    private String getReplaceChars(String replaceChar, int length) {
        String resultReplace = replaceChar;
        for (int i = 1; i < length; i++) {
            resultReplace += replaceChar;
        }
        return resultReplace;
    }

    /**
     * 判断是否为最后一个
     * @param nowMap
     * @return
     * @since 1.8
     * @author whb
     */
    @SuppressWarnings("rawtypes")
    private boolean isEnd(Map nowMap) {
        boolean flag = false;
        if ("1".equals(nowMap.get("isEnd"))) {
            flag = true;
        }
        return flag;
    }

    public static void main(String[] args) throws Exception {
        SensitiveWordFilter filter = new SensitiveWordFilter();
        System.out.println("敏感词的数量:" + filter.sensitiveWordMap.size());
        String string = "王弘博是个大坏蛋,他竟然吸食白粉和冰毒";
        System.out.println("待检测语句的字数:" + string.length());
        long beginTime = System.currentTimeMillis();
        Set set = filter.getSensitiveWord(string, 1);
         String result = filter.replaceSensitiveWord(string, 1, "*");
        boolean flag = filter.isSensitive(string);
        System.out.println(flag);
        long endTime = System.currentTimeMillis();
          System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
         System.out.println("敏感词处理之后为:"+result);
        System.out.println("总共消耗时间为:" + (endTime - beginTime));
    }
}

/**
 * Revision history
 * -------------------------------------------------------------------------
 *
 * Date Author Note
 * -------------------------------------------------------------------------
 * 2017年8月23日 whb create
 */

/*
 * Project: admin.common
 *
 * File Created at 2017年8月23日
 *
 * Copyright 2016 CMCC Corporation Limited.
 * All rights reserved.
 *
 * This software is the confidential and proprietary information of
 * ZYHY Company. ("Confidential Information").  You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license.
 */
package com.cmcc.admin.common.sensitive;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.springframework.context.ApplicationContext;
import org.springframework.context.support.AbstractApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import com.cmcc.aqb.cache.redis.RedisClient;

/**
 * @Type SensitiveWordInit.java
 * @Desc
 * @author whb
 * @date 2017年8月23日 下午1:57:03
 * @version
 */
public class SensitiveWordInit {

    private static final String ENCODING = "utf-8"; //字符编码

    @SuppressWarnings("rawtypes")
    public HashMap sensitiveWordMap;

    public SensitiveWordInit() {
        super();
    }

    static RedisClient redisClient = null;
    private static String SPILIT = "#";
    private static int EXPIRE_TIME = 3600;// seconds
    private static String SENSITIVE_WORD = SensitiveWordInit.class.getName();

    private String sensitiveWordKey(String type) {
        StringBuilder sb = new StringBuilder();
        sb.append(type).append(SPILIT).append("sensitiveWordInit");
        return sb.toString();
    }

    /**
     *
     * @return
     * @throws Exception
     * @since 1.8
     * @author whb
     */
    @SuppressWarnings({ "rawtypes", "resource" })
    public Map initKeyWord() {
        try {
            ApplicationContext ac = new ClassPathXmlApplicationContext(new String[] {
                    "spring/datasource.xml", "spring/cache.xml" });
            redisClient = (RedisClient) ac.getBean("redisClient");
            String key = sensitiveWordKey(SENSITIVE_WORD);
            sensitiveWordMap = redisClient.get(key);
            if (sensitiveWordMap == null) {
                Set set = readSensitiveWordFile();
                addSensitiveWordToHashMap(set);
                redisClient.put(key, sensitiveWordMap, EXPIRE_TIME);
            }
            ((AbstractApplicationContext) ac).registerShutdownHook();
            return sensitiveWordMap;
        } catch (Exception e) {
            throw new RuntimeException("初始化敏感词库错误");
        }
    }

    /**
     * 读取敏感词库,并把内容放到set里
     * @return
     * @throws Exception
     * @since 1.8
     * @author whb
     */
    private Set readSensitiveWordFile() throws Exception {
        Set set = null;
        File file = new File("/home/sensitiveword.txt");
        try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), ENCODING))) {
            if (file.isFile() && file.exists()) {
                set = new HashSet();
                String txt = null;
                while ((txt = bufferedReader.readLine()) != null) {
                    set.add(txt);
                }
            } else {
                throw new Exception("敏感词库文件不存在");
            }
        } catch (Exception e) {
            e.printStackTrace();
            throw e;
        }
        return set;

    }

    /**
     * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
* 中 = { * isEnd = 0 * 国 = {
* isEnd = 1 * 人 = {isEnd = 0 * 民 = {isEnd = 1} * } * 男 = { * isEnd = 0 * 人 = { * isEnd = 1 * } * } * } * } * 五 = { * isEnd = 0 * 星 = { * isEnd = 0 * 红 = { * isEnd = 0 * 旗 = { * isEnd = 1 * } * } * } * } * @param keyWordSet * @since 1.8 * @author whb */ @SuppressWarnings({ "rawtypes", "unchecked" }) private void addSensitiveWordToHashMap(Set keyWordSet) { sensitiveWordMap = new HashMap(keyWordSet.size()); //初始化敏感词容器,避免扩容操作 String key = null; Map nowMap = null; Map newWorMap = null; Iterator iterator = keyWordSet.iterator(); while (iterator.hasNext()) { key = iterator.next(); nowMap = sensitiveWordMap; for (int i = 0; i < key.length(); i++) { char charKey = key.charAt(i); //转换成char型 Object wordMap = nowMap.get(charKey); if (wordMap != null) { nowMap = (Map) wordMap; //一个一个放进Map中 } else { //不存在,则构建一个Map,同时将isEnd设置为0,因为它不是最后一个 newWorMap = new HashMap(); newWorMap.put("isEnd", "0");//不是最后一个 nowMap.put(charKey, newWorMap);//没有这个key,就把(isEnd,0) 放在Map中 nowMap = newWorMap; } if (i == key.length() - 1) { //最后一个 nowMap.put("isEnd", "1"); } } } } } /** * Revision history * ------------------------------------------------------------------------- * * Date Author Note * ------------------------------------------------------------------------- * 2017年8月23日 whb create */


你可能感兴趣的:(敏感词过滤)