关键词敏感字高效查找匹配算法

本算法对纯文本的匹配效率已优化到:在包含 2400 个敏感词的词库下,扫描 5000 字的文本约需 5 毫秒。

原理:基于多叉树(字典树,Trie)的查找。词库中的每个敏感词按字逐层存入嵌套的 HashMap,匹配时从文本的每个位置出发沿树逐字向下查找。

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;


/**
 * Builds the sensitive-word dictionary used for matching.
 *
 * <p>The dictionary is a trie stored as nested maps: each level maps a
 * {@code Character} to the map of the next level, and every node additionally
 * carries the string key {@code "isEnd"} whose value is {@code "1"} when a
 * word ends at that node and {@code "0"} otherwise.
 *
 * @author pipi
 */
public class SensitiveWordInit {
	/**
	 * Root of the sensitive-word trie; {@code null} until
	 * {@link #initKeyWord(List)} has completed successfully at least once.
	 */
	@SuppressWarnings("rawtypes")
	public static HashMap sensitiveWordMap;

	/**
	 * Rebuilds the dictionary from the given word entities.
	 *
	 * <p>Each word is trimmed before insertion. {@code null} entries, entries
	 * whose word is {@code null}, and words that are empty after trimming are
	 * skipped. If {@code sensitiveWords} is {@code null}, or an exception is
	 * raised while reading the words, the current dictionary is left untouched.
	 *
	 * @param sensitiveWords the word entities to load; may be {@code null}
	 * @return the current dictionary root (possibly {@code null} if it has
	 *         never been built)
	 */
	@SuppressWarnings("rawtypes")
	public Map initKeyWord(List<? extends SmsSensitiveWord> sensitiveWords) {
		if (sensitiveWords == null) {
			return sensitiveWordMap;
		}
		try {
			// Collect the distinct, trimmed words first so duplicates are inserted only once.
			Set<String> keyWordSet = new HashSet<String>();
			for (SmsSensitiveWord s : sensitiveWords) {
				if (s == null) {
					continue;
				}
				String word = s.getSensitiveWord();
				if (word == null) {
					continue;
				}
				word = word.trim();
				if (!word.isEmpty()) {
					keyWordSet.add(word);
				}
			}
			addSensitiveWordToHashMap(keyWordSet);
		} catch (Exception e) {
			// Best effort: keep whatever dictionary was already published and report the failure.
			e.printStackTrace();
		}
		return sensitiveWordMap;
	}

	/**
	 * Builds a fresh trie from {@code keyWordSet} and publishes it to
	 * {@link #sensitiveWordMap}.
	 *
	 * <p>The trie is assembled in a local map and assigned to the static field
	 * only once it is complete, so the field never refers to a half-built
	 * dictionary.
	 *
	 * @param keyWordSet the words to insert
	 */
	@SuppressWarnings("unchecked")
	private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
		// Pre-size the root by the number of words (an upper bound on distinct first characters).
		HashMap<Object, Object> root = new HashMap<Object, Object>(keyWordSet.size());
		for (String key : keyWordSet) {
			// Walk down from the root, creating missing nodes along the way.
			Map<Object, Object> nowMap = root;
			int lastIndex = key.length() - 1;
			for (int i = 0; i <= lastIndex; i++) {
				char keyChar = key.charAt(i);

				Object wordMap = nowMap.get(keyChar);
				if (wordMap != null) {
					nowMap = (Map<Object, Object>) wordMap;
				} else {
					Map<Object, Object> newWordMap = new HashMap<Object, Object>();
					newWordMap.put("isEnd", "0");
					nowMap.put(keyChar, newWordMap);
					nowMap = newWordMap;
				}

				// The node reached by the last character marks the end of a word.
				if (i == lastIndex) {
					nowMap.put("isEnd", "1");
				}
			}
		}
		sensitiveWordMap = root;
	}
}
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * Sensitive-word filtering utilities.
 *
 * <p>All methods read the trie held in
 * {@code SensitiveWordInit.sensitiveWordMap}: nested maps keyed by
 * {@code Character}, where a node whose {@code "isEnd"} entry equals
 * {@code "1"} terminates a word. When that dictionary has not been built
 * (the field is {@code null}) nothing matches.
 *
 * @author pipi
 */
public class SensitivewordEngine {

	/**
	 * Match type: stop at the shortest word that starts at the scanned position.
	 * Left non-final to keep the existing public field contract; treat as a constant.
	 */
	public static int minMatchTYpe = 1;

	/**
	 * Match type: keep going and report the longest word that starts at the
	 * scanned position. Any value other than {@link #minMatchTYpe} behaves this way.
	 * Left non-final to keep the existing public field contract; treat as a constant.
	 */
	public static int maxMatchType = 2;

	/**
	 * Returns the size of the dictionary's root level.
	 *
	 * <p>Note that this is the number of entries directly under the root of
	 * the trie, not the total number of words stored in it.
	 *
	 * @return the root-level size, or {@code 0} when the dictionary has not been built
	 */
	public static int getWordSize() {
		if (SensitiveWordInit.sensitiveWordMap == null) {
			return 0;
		}
		return SensitiveWordInit.sensitiveWordMap.size();
	}

	/**
	 * Tells whether the text contains at least one sensitive word.
	 *
	 * @param txt       the text to scan; {@code null} is treated as containing nothing
	 * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
	 * @return {@code true} if any position of {@code txt} starts a sensitive word
	 */
	public static boolean isContaintSensitiveWord(String txt, int matchType) {
		if (txt == null) {
			return false;
		}
		for (int i = 0; i < txt.length(); i++) {
			// One hit is enough; no need to scan the rest of the text.
			if (checkSensitiveWord(txt, i, matchType) > 0) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Collects the distinct sensitive words found in the text.
	 *
	 * <p>The text is scanned left to right; after a match the scan resumes
	 * right behind it, so matches never overlap.
	 *
	 * @param txt       the text to scan; {@code null} yields an empty set
	 * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
	 * @return the set of matched words (never {@code null})
	 */
	public static Set<String> getSensitiveWord(String txt, int matchType) {
		Set<String> sensitiveWordList = new HashSet<String>();
		if (txt == null) {
			return sensitiveWordList;
		}

		for (int i = 0; i < txt.length(); i++) {
			int length = checkSensitiveWord(txt, i, matchType);
			if (length > 0) {
				sensitiveWordList.add(txt.substring(i, i + length));
				// Jump to the last matched character; the loop increment moves past it.
				i = i + length - 1;
			}
		}

		return sensitiveWordList;
	}

	/**
	 * Masks every sensitive word in the text.
	 *
	 * <p>The text is scanned left to right and each matched character is
	 * replaced by one copy of {@code replaceChar}. Replacement is positional
	 * and literal: neither the matched word nor {@code replaceChar} is
	 * interpreted as a regular expression, and the result does not depend on
	 * the order in which words would be visited.
	 *
	 * @param txt         the text to filter; {@code null} is returned as is
	 * @param matchType   {@link #minMatchTYpe} or {@link #maxMatchType}
	 * @param replaceChar the string substituted for each matched character; must not be {@code null}
	 * @return the filtered text
	 */
	public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
		if (txt == null || txt.isEmpty()) {
			return txt;
		}
		StringBuilder resultTxt = new StringBuilder(txt.length());
		int i = 0;
		while (i < txt.length()) {
			int length = checkSensitiveWord(txt, i, matchType);
			if (length > 0) {
				resultTxt.append(getReplaceChars(replaceChar, length));
				i += length;
			} else {
				resultTxt.append(txt.charAt(i));
				i++;
			}
		}
		return resultTxt.toString();
	}

	/**
	 * Builds the mask for a word of the given length.
	 *
	 * @param replaceChar the string to repeat
	 * @param length      the number of repetitions wanted
	 * @return {@code replaceChar} repeated {@code length} times (at least once)
	 */
	private static String getReplaceChars(String replaceChar, int length) {
		StringBuilder resultReplace = new StringBuilder(replaceChar);
		for (int i = 1; i < length; i++) {
			resultReplace.append(replaceChar);
		}
		return resultReplace.toString();
	}

	/**
	 * Measures the sensitive word that starts at {@code beginIndex}.
	 *
	 * <p>The trie is followed character by character from {@code beginIndex}.
	 * With {@link #minMatchTYpe} the walk stops at the first word end; with
	 * any other match type it continues as far as the trie allows and the
	 * length of the longest complete word is returned. Characters walked
	 * beyond the last word end are not counted.
	 *
	 * @param txt        the text to scan; {@code null} yields {@code 0}
	 * @param beginIndex the index in {@code txt} where matching starts
	 * @param matchType  {@link #minMatchTYpe} or {@link #maxMatchType}
	 * @return the length of the matched word, or {@code 0} if no word starts at {@code beginIndex}
	 */
	@SuppressWarnings("rawtypes")
	public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
		Map nowMap = SensitiveWordInit.sensitiveWordMap;
		if (txt == null || nowMap == null) {
			return 0;
		}
		// Characters followed in the trie so far.
		int walked = 0;
		// Length at the most recent word end; stays 0 when no complete word was seen.
		int matchedLength = 0;
		for (int i = beginIndex; i < txt.length(); i++) {
			nowMap = (Map) nowMap.get(txt.charAt(i));
			if (nowMap == null) {
				break;
			}
			walked++;
			if ("1".equals(nowMap.get("isEnd"))) {
				matchedLength = walked;
				// Shortest-match mode is satisfied by the first complete word.
				if (SensitivewordEngine.minMatchTYpe == matchType) {
					break;
				}
			}
		}
		return matchedLength;
	}

}

 

你可能感兴趣的:(笔记)