现状:做爬虫的时候考虑爬下来的url要与已爬过的url(所有的,包括相关与不相关的)进行比较,看是否包含该url,如果包含则舍弃,否则爬行。但已爬行的url数据非常大,在判断是否包含该url时,造成了visited表的全表扫描,代价太大,无疑降低了爬行性能,浪费爬行时间。
Bloom-Filter算法简介:
Bloom-Filter,即布隆过滤器,1970年由Bloom提出。它可以用于检索一个元素是否在一个集合中,其优点是空间效率和查询时间都远远超过其他算法,其不足在于Bloom-Filter存在误判(可能把不在集合中的元素判为已存在)。常见的补救办法是再建立一个小的白名单,存储那些可能被误判的地址。
以前在看《数学之美》的时候,书中讲到过布隆过滤器,当时也是用于该场景,于是决定采用布隆过滤器来优化与visited表的比较时间。
但布隆过滤器代码有两种:
1. 使用 Java 自带的 BitSet
import java.util.BitSet;
/**
 * Single-key Bloom-filter demo backed by a {@link BitSet}.
 *
 * <p>Each instance is bound to the one {@code key} passed to the constructor and owns its
 * own bit set. The key is mapped to {@link #HASH_COUNT} bit positions by a family of
 * polynomial string hashes (multipliers 31 through 38).
 *
 * <p>The table size is an explicit power of two. It used to be written as
 * {@code 5000 << 10000}; Java only uses the low five bits of an {@code int} shift distance,
 * so that expression was really {@code 5000 << 16} (327,680,000). Masking a hash with
 * {@code size - 1} is only a valid range reduction for power-of-two sizes: with the old
 * value six bits of the mask were zero, so only 2^23 bit positions were ever reachable.
 */
public class bloomFilter {
    /** Number of addressable bits; must stay a power of two because hashes are reduced by masking. */
    private static final int DEFAULT_SIZE = 1 << 25;

    /** All-ones mask covering {@code [0, DEFAULT_SIZE)}. */
    private static final int BIT_MASK = DEFAULT_SIZE - 1;

    /** Number of hash functions, i.e. bits set per key. */
    private static final int HASH_COUNT = 8;

    /** The key this instance works on; a {@code null} key fails with an NPE on first use. */
    private final String key;

    private final BitSet bits = new BitSet(DEFAULT_SIZE);

    /**
     * Creates a filter bound to {@code key}.
     *
     * @param key the string that {@code add}, {@code exist} and {@code set0} operate on
     */
    public bloomFilter(String key) {
        this.key = key;
    }

    /**
     * Computes the bit positions of the key, one per hash function.
     *
     * @return {@link #HASH_COUNT} positions, each in {@code [0, DEFAULT_SIZE)}
     */
    private int[] lrandom() {
        int[] positions = new int[HASH_COUNT];
        for (int i = 0; i < HASH_COUNT; i++) {
            // Hash functions are numbered 1..HASH_COUNT.
            positions[i] = hashCode(key, i + 1);
        }
        return positions;
    }

    /**
     * Degenerate variant of {@link #lrandom()} that uses hash function 1 for every slot,
     * so all returned positions are equal. Not called anywhere in this class.
     *
     * @return {@link #HASH_COUNT} copies of the same position
     */
    private int[] sameLrandom() {
        int[] positions = new int[HASH_COUNT];
        int position = hashCode(key, 1);
        for (int i = 0; i < HASH_COUNT; i++) {
            positions[i] = position;
        }
        return positions;
    }

    /** Sets the key's bits, or prints a notice and does nothing if they are all set already. */
    private void add() {
        if (exist()) {
            System.out.println("已经包含("+key+")");
            return;
        }
        for (int position : lrandom()) {
            bits.set(position);
        }
    }

    /**
     * Tests whether every bit position of the key is set.
     *
     * @return {@code true} if all of the key's bits are set, {@code false} otherwise
     */
    private boolean exist() {
        for (int position : lrandom()) {
            if (!bits.get(position)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Clears the key's bits if they are currently all set.
     *
     * <p>NOTE(review): clearing bits is only safe here because an instance holds a single
     * key. In a filter shared by several keys this would also clear bits that other keys
     * rely on and make them look absent.
     *
     * @return {@code true} if the bits were cleared, {@code false} if the key was not present
     */
    private boolean set0() {
        if (!exist()) {
            return false;
        }
        for (int position : lrandom()) {
            bits.clear(position);
        }
        return true;
    }

    /**
     * Polynomial string hash with multiplier {@code 30 + Q}, reduced to a bit position.
     *
     * @param key the string to hash
     * @param Q   index of the hash function; selects the multiplier
     * @return a position in {@code [0, DEFAULT_SIZE)}
     */
    private int hashCode(String key, int Q) {
        int multiplier = 30 + Q;
        int h = 0;
        for (int i = 0, len = key.length(); i < len; i++) {
            // int overflow is intentional; changeInteger() masks the result into range.
            h = multiplier * h + key.charAt(i);
        }
        return changeInteger(h);
    }

    /**
     * Reduces an arbitrary (possibly negative) hash to a valid bit position.
     *
     * @param h raw hash value
     * @return {@code h} masked into {@code [0, DEFAULT_SIZE)}
     */
    private int changeInteger(int h) {
        return BIT_MASK & h;
    }

    /**
     * Demo: prints the table size, then the membership result after adding the key
     * and again after clearing it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        bloomFilter f = new bloomFilter("http://www.agrilink.cn/");
        System.out.println(DEFAULT_SIZE);
        f.add();
        System.out.println(f.exist());
        f.set0();
        System.out.println(f.exist());
    }
}
2. 还有一个 Java 版的实现,也是使用 BitSet
import java.util.BitSet;
/**
 * Bloom filter over strings backed by a {@link BitSet}.
 *
 * <p>Every added value sets one bit per hash function (one function per entry in
 * {@code seeds}). {@link #contains(String)} reports {@code true} only when all of a
 * value's bits are set, so a value that was added is always reported as present, while a
 * value that was never added can still be reported as present if its bits were set by
 * other values.
 */
public class SimpleBloomFilter {
    /** Number of bits in the filter (2^25). */
    private static final int DEFAULT_SIZE = 2 << 24;

    /** Multipliers of the polynomial hash functions; one hash function per seed. */
    private static final int[] seeds = new int[] {5, 7, 11, 13, 31, 37, 61};

    private final BitSet bits = new BitSet(DEFAULT_SIZE);
    private final SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Records {@code value} in the filter.
     *
     * @param value the string to add; must not be {@code null}
     * @throws NullPointerException if {@code value} is {@code null}
     */
    public void add(String value) {
        if (value == null) {
            // Fail fast with a clear message instead of an anonymous NPE inside hash().
            throw new NullPointerException("value must not be null");
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether {@code value} may have been added.
     *
     * @param value the string to look up; {@code null} is never contained
     * @return {@code false} if {@code value} is {@code null} or any of its bits is clear,
     *         {@code true} if all of its bits are set
     */
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            if (!bits.get(f.hash(value))) {
                // One clear bit is enough to prove the value was never added.
                return false;
            }
        }
        return true;
    }

    /** Seeded polynomial string hash that maps a string to a slot in {@code [0, cap)}. */
    public static class SimpleHash {
        private final int cap;
        private final int seed;

        /**
         * @param cap  number of slots; must be positive (need not be a power of two)
         * @param seed multiplier of the polynomial hash
         * @throws IllegalArgumentException if {@code cap} is not positive
         */
        public SimpleHash(int cap, int seed) {
            if (cap <= 0) {
                throw new IllegalArgumentException("cap must be positive: " + cap);
            }
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes {@code value} to a slot.
         *
         * @param value the string to hash; must not be {@code null}
         * @return a slot index in {@code [0, cap)}
         */
        public int hash(String value) {
            int result = 0;
            for (int i = 0, len = value.length(); i < len; i++) {
                // int overflow is intentional; the reduction below handles negatives.
                result = seed * result + value.charAt(i);
            }
            // Non-negative remainder. For a power-of-two cap this equals (cap - 1) & result,
            // but unlike the mask it also reaches every slot for any other cap.
            int slot = result % cap;
            return slot < 0 ? slot + cap : slot;
        }
    }

    /**
     * Demo: prints the membership result for a sample value before and after adding it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String value = "[email protected]";
        SimpleBloomFilter filter = new SimpleBloomFilter();
        System.out.println(filter.contains(value));
        filter.add(value);
        System.out.println(filter.contains(value));
    }
}