到第六步为止,我们只是在理论上探讨了优化的步骤。最后,我们把各个方法放在一起进行测试;使用的敏感词越多,优化效果越明显:
package test; import static util.PrintUtil.print; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; public class Test { static int key_max = 0; // 敏感词最大长度 static String[] keys = {"办证", "气枪出售", "裸聊", "裸表演", "土枪卖"}; static String tContent = "再办证顶"; static ArrayList<String> first = new ArrayList<String>(); static String[] sortFirst; static char[] charFirst; static HashMap<String, ArrayList<String>> map = new HashMap<String, ArrayList<String>>(); static HashMap<String, String[]> sortMap = new HashMap<String, String[]>(); static HashMap<String, char[]> charMap = new HashMap<String, char[]>(); static void init(String[] keys) { ArrayList<String> temp; String key, value; int length; for (String k : keys) { if (!first.contains(k.substring(0, 1))) { first.add(k.substring(0, 1)); } length = k.length(); if (length > key_max) key_max = length; for (int i = 1; i < length; i ++) { key = k.substring(0, i); value = k.substring(i, i + 1); if (i == 1 && !first.contains(key)) { first.add(key); } // 有,添加 if (map.containsKey(key)) { if (!map.get(key).contains(value)) { map.get(key).add(value); } } // 没有添加 else { temp = new ArrayList<String>(); temp.add(value); map.put(key, temp); } } } sortFirst = first.toArray(new String[first.size()]); Arrays.sort(sortFirst); // 排序 charFirst = new char[first.size()]; for (int i = 0; i < charFirst.length; i ++) { charFirst[i] = first.get(i).charAt(0); } Arrays.sort(charFirst); // 排序 String[] sortValue; ArrayList<String> v; Map.Entry<String, ArrayList<String>> entry; Iterator<Entry<String, ArrayList<String>>> iter = map.entrySet().iterator(); while (iter.hasNext()) { entry = (Map.Entry<String, ArrayList<String>>) iter.next(); v = (ArrayList<String>)entry.getValue(); sortValue = v.toArray(new String[v.size()]); Arrays.sort(sortValue); // 排序 
sortMap.put(entry.getKey(), sortValue); } char[] charValue; iter = map.entrySet().iterator(); while (iter.hasNext()) { entry = (Map.Entry<String, ArrayList<String>>) iter.next(); v = (ArrayList<String>)entry.getValue(); charValue = new char[v.size()]; for (int i = 0; i < charValue.length; i ++) { charValue[i] = v.get(i).charAt(0); } Arrays.sort(charValue); // 排序 charMap.put(entry.getKey(), charValue); } } /** * 快速实现的方法 */ public final static String test1(String content) { for (String k : keys) { if (content.indexOf(k) > -1) return k; } return null; } /** * 优化一 */ public final static String test2(String content) { boolean bFirst = false; int length = content.length(); for (int i = 0; i < length; i ++) { if (first.contains(content.substring(i, i + 1))) { bFirst = true; break; } } return bFirst ? test1(content) : null; } /** * 优化二 */ public final static String test3(String content) { String r = null, f, g, c = content; ArrayList<String> temps; int length = c.length(); tag : for (int i = 0; i < length - 1; i++) { f = c.substring(i, i + 1); if (first.contains(f)) { for (int j = i + 1; j < length; j++) { f = c.substring(i, j); g = c.substring(j, j + 1); temps = map.get(f); if (temps == null) { // 找到了 //print("ok"); r = f; break tag; } if (temps.contains(g)) { if (j == length - 1) { //print("find!"); r = c.substring(i, j + 1); break tag; } } else { // 没有找到了 break; } } } } return r; } /** * 优化三 */ public final static String test4(String content) { String r = null, f, g, c = content; String[] temps; int length = c.length(); tag : for (int i = 0; i < length - 1; i++) { f = c.substring(i, i + 1); // 二分查找 if (Arrays.binarySearch(sortFirst, f) > -1) { for (int j = i + 1; j < length; j++) { f = c.substring(i, j); g = c.substring(j, j + 1); temps = sortMap.get(f); if (temps == null) { // 找到了 //print("ok"); r = f; break tag; } // 二分查找 if (Arrays.binarySearch(temps, g) > -1) { if (j == length - 1) { //print("find!"); r = c.substring(i, j + 1); break tag; } } else { // 没有找到了 break; 
} } } } return r; } /** * 优化四 */ public final static String test5(String content) { String r = null, f, c = content; char g; char[] temps; int length = c.length(); tag : for (int i = 0; i < length - 1; i++) { g = c.charAt(i); // 二分查找 if (Arrays.binarySearch(charFirst, g) > -1) { for (int j = i + 1; j < length; j++) { f = c.substring(i, j); g = c.charAt(j); temps = charMap.get(f); if (temps == null) { // 找到了 //print("ok"); r = f; break tag; } // 二分查找 if (Arrays.binarySearch(temps, g) > -1) { if (j == length - 1) { //print("find!"); r = c.substring(i, j + 1); break tag; } } else { // 没有找到了 break; } } } } return r; } /** * 优化五 */ public final static String test6(String content) { String r = null, c = content; char g; char[] temps; char[] keys = new char[key_max]; int length = c.length(), index; tag : for (int i = 0; i < length - 1; i++) { index = 0; g = c.charAt(i); // 过滤特殊字符 if (Arrays.binarySearch(filters, g) > -1) { continue; } // 二分查找 if (Arrays.binarySearch(charFirst, g) > -1) { keys[index++] = g; for (int j = i + 1; j < length; j++) { g = c.charAt(j); // 过滤特殊字符 if (Arrays.binarySearch(filters, g) > -1) { continue; } temps = charMap.get(String.valueOf(keys, 0, index)); if (temps == null) { // 找到了 //print("ok"); r = String.valueOf(keys, 0, index); break tag; } // 二分查找 if (Arrays.binarySearch(temps, g) > -1) { if (j == length - 1) { //print("find!"); keys[index++] = g; r = String.valueOf(keys, 0, index); break tag; } } else { // 没有找到了 break; } keys[index++] = g; } } } return r; } public static StringBuffer read(String file) throws IOException{ BufferedReader in = new BufferedReader(new FileReader(file)); String line = null; StringBuffer buffer = new StringBuffer(); while((line = in.readLine())!= null){ buffer.append(line); } return buffer; } // 过滤特殊字符[敏感词需要过滤、用户输入内容也需要过滤] static char[] filters = ",.~!@#$%^&*(){}[];':\"".toCharArray(); static { Arrays.sort(filters); /* 排序 */ } // 过滤特殊字符正则表达式 static String regexp = ",|\\.|\\(|\\)|\\*|&|\\^|%|\\$"; public static void 
main(String[] args) throws IOException { // 读取敏感词组 String[] keys = read("data/keyword1").toString().split("@"); tContent = read("data/test1").toString(); // 读取测试内容 init(keys); // 初始化 long time1; int max = 1000; String newContent; time1 = System.currentTimeMillis(); for (int i = 0; i < max; i ++) { newContent = tContent.replaceAll(regexp, ""); test1(newContent); } print("test1 time:" + (System.currentTimeMillis() - time1)); time1 = System.currentTimeMillis(); for (int i = 0; i < max; i ++) { newContent = tContent.replaceAll(regexp, ""); test2(newContent); } print("test2 time:" + (System.currentTimeMillis() - time1)); time1 = System.currentTimeMillis(); for (int i = 0; i < max; i ++) { newContent = tContent.replaceAll(regexp, ""); test3(newContent); } print("test3 time:" + (System.currentTimeMillis() - time1)); time1 = System.currentTimeMillis(); for (int i = 0; i < max; i ++) { newContent = tContent.replaceAll(regexp, ""); test4(newContent); } print("test4 time:" + (System.currentTimeMillis() - time1)); time1 = System.currentTimeMillis(); for (int i = 0; i < max; i ++) { newContent = tContent.replaceAll(regexp, ""); test5(newContent); } print("test5 time:" + (System.currentTimeMillis() - time1)); time1 = System.currentTimeMillis(); for (int i = 0; i < max; i ++) { // 取消正则过滤特殊字符 test6(tContent); } print("test6 time:" + (System.currentTimeMillis() - time1)); } }
代码有不妥之处,欢迎指出^_^。