布隆过滤器在网络爬虫中的应用

现状:做爬虫的时候考虑爬下来的url要与已爬过的url(所有的,包括相关与不相关的)进行比较,看是否包含该url,如果包含则舍弃,否则爬行。但已爬行的url数据非常大,在判断是否包含该url时,造成了visited表的全表扫描,代价太大,无疑降低了爬行性能,浪费爬行时间。

 Bloom-Filter算法简介:
       Bloom-Filter,即布隆过滤器,1970年由Bloom提出。它可以用于检索一个元素是否在一个集合中,其优点是空间效率和查询时间都远远超过其他算法,其不足在于Bloom-Filter存在着误判。常见的补救办法是再建立一个小的白名单,存储那些可能被误判的地址。

由于以前在看《数学之美》的时候讲到过布隆过滤器,当时也是用于该场景,于是决定采用布隆过滤器来优化与visited表的比较时间。

但布隆过滤器代码有两种:

1 使用java自带的

import java.util.BitSet;

/**
 * A minimal Bloom-filter demo bound to a single key and backed by a {@link BitSet}.
 *
 * <p>The key is fixed at construction time; {@code add}, {@code exist} and {@code set0}
 * all operate on that key. Eight bit positions are derived from the key with a
 * polynomial string hash whose multiplier is {@code 30 + Q} for {@code Q = 1..8}.
 */
public class bloomFilter {

    /** Number of hash functions, i.e. how many bit positions represent the key. */
    private static final int HASH_COUNT = 8;

    /**
     * Number of bits in the table. It must be a power of two so that
     * {@code defaultSize - 1} is an all-ones mask and every position is reachable.
     *
     * <p>The previous value {@code 5000 << 10000} was misleading: Java only uses the
     * low five bits of an int shift distance, so it evaluated to {@code 5000 << 16}
     * (327680000), which is not a power of two and left many positions unreachable
     * through the mask.
     */
    private int defaultSize = 1 << 25;

    /** Bit mask that maps an arbitrary hash value into {@code [0, defaultSize)}. */
    private int basic = defaultSize - 1;

    /** The single key this instance works with. */
    private String key = null;

    /** The bit table. */
    private BitSet bits = new BitSet(defaultSize);

    /**
     * Creates a filter bound to the given key.
     *
     * @param key the key to hash; must not be null
     * @throws NullPointerException if {@code key} is null
     */
    public bloomFilter(String key){
        if (key == null) {
            // Fail here rather than later inside hashCode(String, int).
            throw new NullPointerException("key must not be null");
        }
        this.key = key;
    }

    /**
     * Computes the eight bit positions of the key, one per hash function.
     *
     * @return array of {@code HASH_COUNT} positions, each in {@code [0, defaultSize)}
     */
    private int[] lrandom(){
        int[] positions = new int[HASH_COUNT];
        for (int i = 0; i < HASH_COUNT; i++) {
            positions[i] = hashCode(key, i + 1);
        }
        return positions;
    }

    /**
     * Degenerate variant of {@link #lrandom()} in which every slot holds the position
     * produced by the first hash function ({@code Q = 1}). Not called by this class.
     *
     * @return array of {@code HASH_COUNT} identical positions
     */
    private int[] sameLrandom(){
        int[] positions = new int[HASH_COUNT];
        int position = hashCode(key, 1);
        for (int i = 0; i < HASH_COUNT; i++) {
            positions[i] = position;
        }
        return positions;
    }

    /**
     * Marks the key as present by setting all of its bit positions.
     * Prints a notice and does nothing if the key already tests as present.
     */
    private void add(){
        if(exist()){
            System.out.println("已经包含("+key+")");
            return;
        }
        for (int position : lrandom()) {
            bits.set(position);
        }
    }

    /**
     * Tests whether every bit position of the key is set.
     *
     * @return {@code true} if all positions are set, {@code false} otherwise
     */
    private boolean exist(){
        for (int position : lrandom()) {
            if (!bits.get(position)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Clears the key's bit positions if the key currently tests as present.
     *
     * <p>NOTE: clearing bits is only safe here because an instance tracks a single
     * key. In a table shared by several keys this would also erase bits that other
     * keys rely on.
     *
     * @return {@code true} if the bits were cleared, {@code false} if the key was absent
     */
    private boolean set0(){
        if(!exist()){
            return false;
        }
        for (int position : lrandom()) {
            bits.clear(position);
        }
        return true;
    }

    /**
     * Polynomial string hash with multiplier {@code 30 + Q}, reduced into the table range.
     * Integer overflow during accumulation is intentional (wrap-around arithmetic).
     *
     * @param key the string to hash
     * @param Q   index of the hash function; selects the multiplier
     * @return a position in {@code [0, defaultSize)}
     */
    private int hashCode(String key,int Q){
        int multiplier = 30 + Q;
        int h = 0;
        for (int i = 0; i < key.length(); i++) {
            h = multiplier * h + key.charAt(i);
        }
        return changeInteger(h);
    }

    /**
     * Maps a raw hash into the table range by masking with {@code basic}.
     *
     * @param h raw hash value, possibly negative
     * @return {@code h} restricted to the bits of the mask (always non-negative)
     */
    private int changeInteger(int h) {
        return basic & h;
    }

    /**
     * Demo: prints the table size, adds the key, checks it, clears it and checks again.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        bloomFilter f = new bloomFilter("http://www.agrilink.cn/");

        System.out.println(f.defaultSize);
        f.add();
        System.out.println(f.exist());
        f.set0();
        System.out.println(f.exist());
    }

}

 

2. 还有一个 Java 版的,也是使用 BitSet

 

import java.util.BitSet;
/**
 * A simple Bloom filter for strings backed by a {@link BitSet}.
 *
 * <p>Each value is mapped to one bit position per seed in {@code seeds} using a
 * polynomial string hash. {@link #contains(String)} may report false positives but
 * never reports a value as absent after it has been added.
 */
public class SimpleBloomFilter {
    /** Number of bits in the table; {@code 2 << 24} equals 2^25, a power of two. */
    private static final int DEFAULT_SIZE = 2 << 24;

    /** Multipliers of the hash functions, one hash function per seed. */
    private static final int[] seeds = new int[]{5, 7, 11, 13, 31, 37, 61};

    /** The bit table. */
    private final BitSet bits = new BitSet(DEFAULT_SIZE);

    /** One hash function per seed. */
    private final SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Adds a value by setting the bit chosen by every hash function.
     *
     * @param value the value to add; {@code null} is ignored, matching
     *              {@link #contains(String)}, which reports {@code null} as absent
     */
    public void add(String value) {
        if (value == null) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether a value may have been added.
     *
     * @param value the value to look up
     * @return {@code false} if {@code value} is null or any of its bits is unset;
     *         {@code true} if all of its bits are set (possibly a false positive)
     */
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            // One unset bit is enough to prove absence; stop hashing early.
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }

    /** Polynomial string hash reduced into {@code [0, cap)} by bit masking. */
    public static class SimpleHash {
        /** Table capacity; the mask {@code cap - 1} is applied to the raw hash. */
        private final int cap;

        /** Multiplier used while accumulating characters. */
        private final int seed;

        /**
         * @param cap  table capacity used to build the mask {@code cap - 1}
         * @param seed multiplier of the polynomial hash
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes a string; integer overflow wraps around intentionally.
         *
         * @param value the string to hash; must not be null
         * @return {@code (cap - 1) & h}, where {@code h} is the accumulated hash
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }
    }

    /**
     * Demo: looks a value up before and after adding it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String value = "[email protected]";
        SimpleBloomFilter filter = new SimpleBloomFilter();
        System.out.println(filter.contains(value));
        filter.add(value);
        System.out.println(filter.contains(value));
    }
}

 

你可能感兴趣的:(布隆过滤器在网络爬虫中的应用)