全网关于过滤算法了解到的有以下几种:
1. arrayList.contains(txt)
2. DFA(循环机算法的实现)
3. 正则表达式实现
3. 多叉树,前缀树(精度高,复杂度低)
字段树的过滤算法复杂度比较好:算法比较假如敏感词平均长度为10,数量为100000,文本长度为 len。常规遍历方式,复杂度O(100000 * (len + 10));前缀树算法复杂度O(10 * len)。
1:读取关键词文本:
public void afterPropertiesSet() { rootNode = new TrieNode(); try { InputStream is = Thread.currentThread().getContextClassLoader() .getResourceAsStream("SensitiveWords.txt"); InputStreamReader read = new InputStreamReader(is); BufferedReader bufferedReader = new BufferedReader(read); String lineTxt; while ((lineTxt = bufferedReader.readLine()) != null) { lineTxt = lineTxt.trim(); addWord(lineTxt); } read.close(); } catch (Exception e) { logger.error("读取敏感词文件失败" + e.getMessage()); } }
2:判断是否是一个符号,过滤掉颜表情等各种符号
private boolean isSymbol(char c) { int ic = (int) c; // 0x2E80-0x9FFF 东亚文字范围 return !CharUtils.isAsciiAlphanumeric(c) && (ic < 0x2E80 || ic > 0x9FFF); }
3:敏感词文本生成前缀树
/** * 读取的文本生成前缀树 * @param lineTxt */ private void addWord(String lineTxt) { TrieNode tempNode = rootNode; // 循环每个字节 for (int i = 0; i < lineTxt.length(); ++i) { Character c = lineTxt.charAt(i); // 过滤空格 if (isSymbol(c)) { continue; } TrieNode node = tempNode.getSubNode(c); if (node == null) { // 没初始化 node = new TrieNode(); tempNode.addSubNode(c, node); } tempNode = node; if (i == lineTxt.length() - 1) { // 关键词结束, 设置结束标志 tempNode.setKeywordEnd(true); } } }
4:过滤敏感词
/** * 过滤敏感词 */ public String filter(String text) { if (StringUtils.isBlank(text)) { return text; } String replacement = DEFAULT_REPLACEMENT; StringBuilder result = new StringBuilder(); TrieNode tempNode = rootNode; int begin = 0; // 回滚数 int position = 0; // 当前比较的位置 while (position < text.length()) { char c = text.charAt(position); // 空格直接跳过 if (isSymbol(c)) { if (tempNode == rootNode) { result.append(c); ++begin; } ++position; continue; } tempNode = tempNode.getSubNode(c); // 当前位置的匹配结束 if (tempNode == null) { // 以begin开始的字符串不存在敏感词 result.append(text.charAt(begin)); // 跳到下一个字符开始测试 position = begin + 1; begin = position; // 回到树初始节点 tempNode = rootNode; } else if (tempNode.isKeywordEnd()) { // 发现敏感词, 从begin到position的位置用replacement替换掉 result.append(replacement); position = position + 1; begin = position; tempNode = rootNode; } else { ++position; } } result.append(text.substring(begin)); return result.toString(); }
也可以直接复制以下SensitiveService类到自己项目中;
package com.nowcoder.service; import org.apache.commons.lang.CharUtils; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.InitializingBean; import org.springframework.stereotype.Service; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Map; @Service public class SensitiveService implements InitializingBean { private static final Logger logger = LoggerFactory.getLogger(SensitiveService.class); /** * 默认敏感词替换符 */ private static final String DEFAULT_REPLACEMENT = "敏感词"; private class TrieNode { /** * true 关键词的终结 ; false 继续 */ private boolean end = false; /** * key下一个字符,value是对应的节点 */ private MapsubNodes = new HashMap<>(); /** * 向指定位置添加节点树 */ void addSubNode(Character key, TrieNode node) { subNodes.put(key, node); } /** * 获取下个节点 */ TrieNode getSubNode(Character key) { return subNodes.get(key); } boolean isKeywordEnd() { return end; } void setKeywordEnd(boolean end) { this.end = end; } public int getSubNodeCount() { return subNodes.size(); } } /** * 根节点 */ private TrieNode rootNode = new TrieNode(); /** * 判断是否是一个符号,过滤掉颜表情等各种符号 */ private boolean isSymbol(char c) { int ic = (int) c; // 0x2E80-0x9FFF 东亚文字范围 return !CharUtils.isAsciiAlphanumeric(c) && (ic < 0x2E80 || ic > 0x9FFF); } /** * 过滤敏感词 */ public String filter(String text) { if (StringUtils.isBlank(text)) { return text; } String replacement = DEFAULT_REPLACEMENT; StringBuilder result = new StringBuilder(); TrieNode tempNode = rootNode; int begin = 0; // 回滚数 int position = 0; // 当前比较的位置 while (position < text.length()) { char c = text.charAt(position); // 空格直接跳过 if (isSymbol(c)) { if (tempNode == rootNode) { result.append(c); ++begin; } ++position; continue; } tempNode = tempNode.getSubNode(c); // 当前位置的匹配结束 if (tempNode == null) { // 以begin开始的字符串不存在敏感词 result.append(text.charAt(begin)); // 跳到下一个字符开始测试 position = begin + 1; begin = position; // 回到树初始节点 tempNode = rootNode; } else if (tempNode.isKeywordEnd()) { // 发现敏感词, 从begin到position的位置用replacement替换掉 result.append(replacement); position = position + 1; begin = position; tempNode = rootNode; } else { ++position; } } result.append(text.substring(begin)); return result.toString(); } /** * 读取的文本生成前缀树 * @param lineTxt */ private void addWord(String lineTxt) { TrieNode tempNode = rootNode; // 循环每个字节 for (int i = 0; i < lineTxt.length(); ++i) { Character c = lineTxt.charAt(i); // 过滤空格 if (isSymbol(c)) { continue; } TrieNode node = tempNode.getSubNode(c); if (node == null) { // 没初始化 node = new TrieNode(); tempNode.addSubNode(c, node); } tempNode = node; if (i == lineTxt.length() - 1) { // 关键词结束, 设置结束标志 tempNode.setKeywordEnd(true); } } } @Override public void afterPropertiesSet() { rootNode = new TrieNode(); try { InputStream is = Thread.currentThread().getContextClassLoader() .getResourceAsStream("SensitiveWords.txt"); InputStreamReader read = new InputStreamReader(is); BufferedReader bufferedReader = new BufferedReader(read); String lineTxt; while ((lineTxt = bufferedReader.readLine()) != null) { lineTxt = lineTxt.trim(); addWord(lineTxt); } read.close(); } catch (Exception e) { logger.error("读取敏感词文件失败" + e.getMessage()); } } public static void main(String[] argv) { // SensitiveService s = new SensitiveService(); // s.addWord("色情"); // s.addWord("好色"); // System.out.print(s.filter("你好X色**情XX")); } }