golang 布隆过滤器实现源码分析

golang 布隆过滤器

“github.com/willf/bloom”

源码分析

结构体定义:包含 m 和 k 两个参数。通过 README 可以知道,
m 是位数组(bitset)的大小,即总 bit 数;k 是 hash 函数的个数。

// A BloomFilter is used to ask whether an item is a member of a set.
type BloomFilter struct {
    // m is the number of bits in the filter.
    m uint
    // k is the number of hashing functions applied to each item.
    k uint
    // b is the bit set that stores the filter's bits.
    b *bitset.BitSet
}

// New creates a new Bloom filter with _m_ bits and _k_ hashing functions.
// Both _m_ and _k_ are forced to be at least one to avoid panics. The backing
// bit set is sized from the clamped _m_, so the stored filter size and the
// allocated bit set agree even when the caller passes zero.
func New(m uint, k uint) *BloomFilter {
    m = max(1, m)
    k = max(1, k)
    return &BloomFilter{m, k, bitset.New(m)}
}

这里使用了bitset作为数组实现
结构体定义:

// A BitSet is a set of bits. The zero value of a BitSet is an empty set of length 0.
type BitSet struct {
    // length is the number of bits passed to New as the size hint.
    length uint
    // set holds the bits packed into 64-bit words; New allocates
    // wordsNeeded(length) words for it.
    set    []uint64
}

// New creates a new BitSet with a hint that length bits will be required.
//
// NOTE(review): the "defer recover ...." line below is an abridged excerpt and
// is not compilable Go; consult the bitset package for the full recovery logic.
func New(length uint) (bset *BitSet) {
    defer recover ....
    bset = &BitSet{
        length,
        make([]uint64, wordsNeeded(length)), // compute the number of words actually allocated
    }
    return bset
}

每个 uint64 有 64 个 bit,分别表示 0~63 这 64 个整数是否存在
比如:

第一次add 0:
数组表示应该是1(二进制 1)
第二次add 10
数组表示应该是1024+1=1025(二进制 10000000001)
第三次add 64
因为已经大于63,所以只能新建一个uint64,所以数组应该有两个元素,1025和1

计算hash

// Add inserts data into the Bloom filter and returns the filter itself so
// that calls can be chained.
//
// One bit is set per hashing function, so every item is represented by k
// bits. When any of those k bits is still 0, the item has definitely not
// been added.
//
// The hash function is MurmurHash; see
// https://xiaobazhang.github.io/2018/06/19/MurmurHash%E7%AE%97%E6%B3%95/
func (f *BloomFilter) Add(data []byte) *BloomFilter {
    hashes := baseHashes(data)
    var round uint
    for round < f.k {
        pos := f.location(hashes, round)
        f.b.Set(pos)
        round++
    }
    return f
}

估算误判率(false positive rate)

// EstimateFalsePositiveRate empirically measures the false positive rate of
// a BloomFilter with m bits and k hash functions while it stores n entries.
//
// The filter is cleared, the integers 0..n-1 are added as 4-byte big-endian
// keys, and then 100,000 further integers starting at n+1 are tested; the
// fraction reported as present is the result. As a side effect the filter
// is cleared again before returning.
func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fpRate float64) {
    const rounds = uint32(100000)
    inserted := uint32(n)
    key := make([]byte, 4)

    // Start from an empty filter and insert the first n integers.
    f.ClearAll()
    for v := uint32(0); v < inserted; v++ {
        binary.BigEndian.PutUint32(key, v)
        f.Add(key)
    }

    // Probe keys that were never inserted; every hit is a false positive.
    hits := 0
    for r := uint32(0); r < rounds; r++ {
        binary.BigEndian.PutUint32(key, r+inserted+1)
        if f.Test(key) {
            hits++
        }
    }

    f.ClearAll()
    fpRate = float64(hits) / float64(rounds)
    return fpRate
}

根据n和fp估算m和k
证明公式:https://en.wikipedia.org/wiki/Bloom_filter

// EstimateParameters estimates requirements for m and k.
// Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
// used with permission.
//
// Given an expected number of items n and a target false positive rate p,
// it returns
//
//     m = ceil(-n * ln(p) / (ln 2)^2)   // number of bits
//     k = ceil(ln(2) * m / n)           // number of hashing functions
//
// When n is zero, (0, 0) is returned. Without this guard k would be computed
// from 0/0 (NaN), and converting NaN to uint is implementation-dependent.
// The formulas yield a positive m only for 0 < p < 1; other values of p are
// not validated here.
func EstimateParameters(n uint, p float64) (m uint, k uint) {
    if n == 0 {
        return 0, 0
    }
    m = uint(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)))
    k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n)))
    return m, k
}

// NewWithEstimates builds a Bloom filter sized for roughly n items at a
// false positive rate of fp, using the m and k values returned by
// EstimateParameters.
func NewWithEstimates(n uint, fp float64) *BloomFilter {
    bits, hashes := EstimateParameters(n, fp)
    filter := New(bits, hashes)
    return filter
}

你可能感兴趣的:(golang)