以前做关键词或脏字过滤时,用的都是 TrieTree。后来偶然搜到了 yeerh 的这篇文章:http://www.cnblogs.com/yeerh/archive/2011/10/20/2219035.html 。比较了自己的实现和 yeerh 的 TrieTree 实现之后,发现作者在 trie node 上增加了一个 end 标记,能够加快搜索,确实优于自己的实现。所以把网站的关键词搜索换成了 yeerh 的实现,并改写成了 Java 版本。
先定义接口(设计还没有完全想好):
/**
 * Contract for a bad-word (keyword) filter.
 *
 * <p>Words are registered one at a time through {@link #addWord(String)};
 * the two query methods then inspect arbitrary text against the registered
 * words.
 */
public interface Check {

    /**
     * Registers a word with this filter.
     *
     * @param word the word to register
     */
    void addWord(String word);

    /**
     * Tells whether the given text contains a registered word.
     *
     * @param text the text to inspect
     * @return {@code true} if a registered word was found in {@code text}
     */
    boolean hasBadWord(String text);

    /**
     * Produces a version of the text in which registered words are covered
     * by the given mark character.
     *
     * @param text the text to filter
     * @param mark the character written over matched words
     * @return the filtered text
     */
    String replaceWith(String text, char mark);
}
import java.util.*;
/**
 * Trie-based implementation of {@link Check}.
 *
 * <p>Every registered word is stored character by character in a trie. A node
 * whose {@code end} flag is set marks the last character of a registered word,
 * so a scan can stop as soon as it reaches such a node.
 *
 * <p>User: fafu, Date: 14-7-25, Time: 5:37 PM
 */
public class TrieCheck implements Check {

    /** Root of the trie; it represents the empty prefix and carries no character. */
    private final TrieNode root = new TrieNode();

    /**
     * Adds a word to the trie. {@code null} and empty words are ignored.
     *
     * @param word the word to register
     */
    @Override
    public void addWord(String word) {
        if (word == null || word.length() == 0) {
            return;
        }
        TrieNode current = root;
        for (int i = 0; i < word.length(); i++) {
            current = current.add(word.charAt(i));
        }
        // Only the node of the final character marks a complete word.
        current.end = true;
    }

    /**
     * Reports whether any registered word occurs in the text.
     *
     * @param text the text to scan; may be {@code null}
     * @return {@code true} if a registered word occurs anywhere in {@code text};
     *         {@code false} otherwise, including for {@code null} or empty text
     */
    @Override
    public boolean hasBadWord(String text) {
        return findFirstBadWord(text) != null;
    }

    /**
     * Finds the first registered word in the text, trying every start position
     * from left to right and stopping at the shortest word found there.
     *
     * @param text the text to scan; may be {@code null}
     * @return the start index and the matched word, or {@code null} if there is none
     */
    private IndexWordPair findFirstBadWord(String text) {
        if (text == null || text.length() == 0) {
            return null;
        }
        for (int start = 0; start < text.length(); start++) {
            TrieNode current = root;
            for (int pos = start; pos < text.length(); pos++) {
                current = current.child.get(text.charAt(pos));
                if (current == null) {
                    break;
                }
                if (current.end) {
                    return new IndexWordPair(start, text.substring(start, pos + 1));
                }
            }
        }
        return null;
    }

    /**
     * Masks every character that belongs to an occurrence of a registered word.
     *
     * <p>Matching is always done against the original text, so overlapping
     * occurrences are all masked. At each start position the longest registered
     * word is used, which also covers any shorter word starting there.
     *
     * @param text the text to filter; may be {@code null}
     * @param mark the character written over matched words
     * @return the masked text; {@code text} itself when it is {@code null} or empty
     */
    @Override
    public String replaceWith(String text, char mark) {
        if (text == null || text.length() == 0) {
            return text;
        }
        char[] masked = text.toCharArray();
        for (int start = 0; start < text.length(); start++) {
            int end = longestMatchEnd(text, start);
            for (int k = start; k < end; k++) {
                masked[k] = mark;
            }
        }
        return new String(masked);
    }

    /**
     * Walks the trie from {@code start} and remembers the last complete word seen.
     *
     * @return the exclusive end index of the longest registered word starting at
     *         {@code start}, or {@code start} itself when no word starts there
     */
    private int longestMatchEnd(String text, int start) {
        int matchEnd = start;
        TrieNode current = root;
        for (int pos = start; pos < text.length(); pos++) {
            current = current.child.get(text.charAt(pos));
            if (current == null) {
                break;
            }
            if (current.end) {
                matchEnd = pos + 1;
            }
        }
        return matchEnd;
    }

    /** One trie node: its children keyed by character, plus the end-of-word flag. */
    private static class TrieNode {
        private final Map<Character, TrieNode> child = new HashMap<Character, TrieNode>();

        /** True only when a registered word ends at this node. */
        private boolean end;

        /**
         * Returns the child for {@code newChar}, creating it if it does not exist yet.
         */
        TrieNode add(char newChar) {
            TrieNode t = child.get(newChar);
            if (t == null) {
                t = new TrieNode();
                child.put(newChar, t);
            }
            return t;
        }
    }

    /** A matched word together with the index in the text where it starts. */
    private static final class IndexWordPair {
        final int index;
        final String word;

        IndexWordPair(int index, String word) {
            this.index = index;
            this.word = word;
        }
    }
}
再引入FastCheck版本:
import java.util.HashSet;
/**
 * Table-driven implementation of {@link Check}.
 *
 * <p>Instead of a trie, cheap per-character lookup tables reject most
 * candidate positions before a substring is looked up in a hash set:
 * <ul>
 *   <li>{@code fastCheck[c]}: bit {@code i} (0..6) is set when {@code c} occurs
 *       at position {@code i} of some word; bit 7 is set when it occurs at
 *       position 7 or later.</li>
 *   <li>{@code fastLength[c]}: bit {@code min(7, len - 2)} is set when a word of
 *       length {@code len >= 2} starts with {@code c}.</li>
 *   <li>{@code charCheck[c]}: {@code c} alone is a registered one-character word.</li>
 *   <li>{@code endCheck[c]}: some word of length {@code >= 2} ends with {@code c}.</li>
 *   <li>{@code hash}: all registered words of length {@code >= 2}.</li>
 * </ul>
 *
 * <p>User: fafu, Date: 14-7-25, Time: 4:49 PM
 */
public class FastCheck implements Check {
    private final HashSet<String> hash = new HashSet<String>();
    private final byte[] fastCheck = new byte[65536];
    private final byte[] fastLength = new byte[65536];
    private final boolean[] charCheck = new boolean[65536];
    private final boolean[] endCheck = new boolean[65536];
    private int maxWordLength = 0;
    private int minWordLength = Integer.MAX_VALUE;

    /**
     * Registers a word and updates the lookup tables. {@code null} and empty
     * words are ignored so they cannot corrupt {@code minWordLength}.
     *
     * @param word the word to register
     */
    public void addWord(String word) {
        if (word == null || word.length() == 0) {
            return;
        }
        maxWordLength = Math.max(maxWordLength, word.length());
        minWordLength = Math.min(minWordLength, word.length());
        for (int i = 0; i < 7 && i < word.length(); i++) {
            fastCheck[word.charAt(i)] |= (byte) (1 << i);
        }
        // All positions from 7 onwards share the top bit.
        for (int i = 7; i < word.length(); i++) {
            fastCheck[word.charAt(i)] |= 0x80;
        }
        if (word.length() == 1) {
            charCheck[word.charAt(0)] = true;
        } else {
            fastLength[word.charAt(0)] |= (byte) (1 << (Math.min(7, word.length() - 2)));
            endCheck[word.charAt(word.length() - 1)] = true;
            hash.add(word);
        }
    }

    /**
     * Reports whether any registered word occurs in the text.
     *
     * @param text the text to scan; may be {@code null}
     * @return {@code true} if a registered word occurs anywhere in {@code text};
     *         {@code false} otherwise, including for {@code null} or empty text
     */
    public boolean hasBadWord(String text) {
        if (text == null) {
            return false;
        }
        // Every position is tried as a start; none may be skipped, otherwise
        // a word beginning right after a failed candidate would be missed.
        for (int index = 0; index < text.length(); index++) {
            if (matchLength(text, index, false) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Masks every character that belongs to an occurrence of a registered word.
     *
     * <p>Matching is always done against the original text, so overlapping
     * occurrences are all masked. At each start position the longest registered
     * word is used, which also covers any shorter word starting there.
     *
     * @param text the text to filter; may be {@code null}
     * @param mark the character written over matched words
     * @return the masked text, or {@code null} when {@code text} is {@code null}
     */
    @Override
    public String replaceWith(String text, char mark) {
        if (text == null) {
            return null;
        }
        char[] ca = text.toCharArray();
        for (int index = 0; index < ca.length; index++) {
            int length = matchLength(text, index, true);
            for (int m = index; m < index + length; m++) {
                ca[m] = mark;
            }
        }
        return new String(ca);
    }

    /**
     * Returns the length of a registered word starting exactly at {@code index}.
     *
     * @param text    the text being scanned
     * @param index   the start position to test
     * @param longest {@code true} to return the longest word starting here,
     *                {@code false} to return as soon as any word is found
     * @return the length of the matched word, or 0 when no word starts at {@code index}
     */
    private int matchLength(String text, int index, boolean longest) {
        char begin = text.charAt(index);
        if ((fastCheck[begin] & 1) == 0) {
            // No registered word starts with this character.
            return 0;
        }
        int matched = 0;
        if (charCheck[begin]) {
            if (!longest) {
                return 1;
            }
            matched = 1;
        }
        // j is the offset from index; a word of length n ends at offset n - 1.
        int maxOffset = Math.min(maxWordLength, text.length() - index) - 1;
        for (int j = 1; j <= maxOffset; j++) {
            char current = text.charAt(index + j);
            if ((fastCheck[current] & (1 << Math.min(j, 7))) == 0) {
                // No word has this character at offset j, so nothing longer can match.
                break;
            }
            if (j + 1 >= minWordLength
                    && (fastLength[begin] & (1 << Math.min(j - 1, 7))) != 0
                    && endCheck[current]
                    && hash.contains(text.substring(index, index + j + 1))) {
                if (!longest) {
                    return j + 1;
                }
                matched = j + 1;
            }
        }
        return matched;
    }
}