pybloom去重

环境

python3.6

pip3 install bitarray-0.8.1-cp36-cp36m-win_amd64.whl(pybloom_live依赖这个包,需要先安装)

pip3 install pybloom_live

bitarray 的 whl 安装包下载地址:https://www.lfd.uci.edu/~gohlke/pythonlibs/



1. pybloom_live

ScalableBloomFilter

from pybloom_live import ScalableBloomFilter

# Build a scalable filter that starts sized for 100 items with a target
# false-positive rate of 0.001.
# LARGE_SET_GROWTH is selected here; ScalableBloomFilter.SMALL_SET_GROWTH is
# the alternative growth mode.
sbf = ScalableBloomFilter(
    initial_capacity=100,
    error_rate=0.001,
    mode=ScalableBloomFilter.LARGE_SET_GROWTH,
)

url = "www.baidu.com"
# Host name previously contained a comma ("www.douban,com"); it is now a
# well-formed host, matching the BloomFilter example further down.
url2 = "www.douban.com"

# Only `url` is inserted, so only `url` is expected to be reported as present.
sbf.add(url)

print(url in sbf)  # True
print(url2 in sbf)  # False


BloomFilter

from pybloom_live import BloomFilter

# Fixed-capacity filter sized for 1000 items; only one URL is inserted.
bf = BloomFilter(capacity=1000)
bf.add("www.baidu.com")

# Check the inserted URL first, then one that was never added.
for candidate in ("www.baidu.com", "www.douban.com"):
    print(candidate in bf)  # True, then False



2. pybloom

BloomFilter 是定容的:容量在创建时由 capacity 参数指定,之后不会自动增长。

ScalableBloomFilter 可以自动扩容

# -*- coding: utf-8 -*-
from pybloom import BloomFilter

# capacity   -- number of elements the filter is sized for.
# error_rate -- target false-positive probability.
# NOTE: pybloom raises IndexError once the filter is filled past `capacity`;
# the exception is tied to the capacity, not to the error rate being exceeded.
f = BloomFilter(error_rate=0.001, capacity=1000)

# add() reports whether the key was already present, so ten fresh keys
# yield ten False values.
first_adds = [f.add(x) for x in range(10)]
print(first_adds)  # [False, False, False, False, False, False, False, False, False, False]

# Every key that was just inserted must be reported as a member.
print(all(x in f for x in range(10)))  # True

print(10 in f)  # False
print(5 in f)  # True

# Start over with an empty filter and fill it up to its declared capacity.
f = BloomFilter(error_rate=0.001, capacity=1000)
print(f.capacity)  # same value as the `capacity` argument
print('len(f):', len(f))

for i in range(f.capacity):
    f.add(i)

print('len(f):', len(f))

# len(f) may fall slightly short of the capacity; the relative shortfall is
# expected to stay within the configured error rate.
shortfall = 1.0 - len(f) / float(f.capacity)
print(shortfall <= f.error_rate + 2e-18)  # True

from pybloom import ScalableBloomFilter

# Scalable filter using the SMALL_SET_GROWTH mode; no fixed capacity is given.
sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

# Insert `count` distinct integers.
count = 10000
for i in range(count):
    sbf.add(i)

# The relative gap between len(sbf) and the number of inserts is expected to
# stay within the filter's error rate.
shortfall = 1.0 - len(sbf) / float(count)
print(shortfall <= sbf.error_rate + 2e-18)  # True

-

你可能感兴趣的:(pybloom去重)