BloomFilter——大规模数据处理利器:http://www.cnblogs.com/heaad/archive/2011/01/02/1924195.html
一、Java版的BloomFilter
JAVA Bitset应用总结:http://blog.csdn.net/originalintention/article/details/8224831
Bitset对象在内存中所占的大小等于:(指定大小/8) Bytes。
package cn.edu.xjtu.nhpcc.jenva.datastructure; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.BitSet; public class BloomFilter { /* BitSet初始分配2^24个bit */ private static final int DEFAULT_SIZE =1<<24; //each filter is 2^24 bits, equal 4MB /* 不同哈希函数的种子,一般应取质数 */ private static final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 }; private BitSet bitSet = new BitSet(DEFAULT_SIZE); /* 哈希函数对象 */ private SimpleHash[] func =new SimpleHash[seeds.length]; public BloomFilter() { for (int i =0; i < seeds.length; i++) { func[i] =new SimpleHash(DEFAULT_SIZE, seeds[i]); } } /** * @param args * @throws IOException * @throws ClassNotFoundException */ public static void main(String[] args) throws IOException, ClassNotFoundException { // TODO Auto-generated method stub BloomFilter filter = new BloomFilter(); filter.add("shengeng"); filter.add("hujun"); filter.add("Hello"); filter.add("大众点评"); String fileName = "ourbitset"; filter.persistBitSet(fileName); filter.bitSet = null; filter.loadPersistedBitSet(fileName); boolean b1 = filter.contains("shengeng"); boolean b2 = filter.contains("hello"); boolean b3 = filter.contains("大众点评"); boolean b4 = filter.contains("百度"); System.out.println(b1); System.out.println(b2); System.out.println(b3); System.out.println(b4); } // 将字符串标记到bits中 public void add(String value) { for (SimpleHash f : func) { bitSet.set(f.hash(value), true); } } //判断字符串是否已经被bits标记 public boolean contains(String value) { if (value ==null) { return false; } boolean ret =true; for (SimpleHash f : func) { ret = ret && bitSet.get(f.hash(value)); } return ret; } /* 哈希函数类 */ public static class SimpleHash { private int cap; private int seed; public SimpleHash(int cap, int seed) { this.cap = cap; this.seed = seed; } //hash函数,采用简单的加权和hash public int hash(String value) { int result =0; int len = value.length(); for (int i =0; i < len; i++) { result = seed * result + value.charAt(i); } return (cap -1) & result; } } /* * This function is for persistence of bitset. */ public void persistBitSet(String fileName) throws IOException{ File file = new File(fileName); FileOutputStream fileOutputStream = new FileOutputStream(file); ObjectOutputStream objectOutputStream = new ObjectOutputStream(fileOutputStream); objectOutputStream.writeObject(this.bitSet); objectOutputStream.close(); } /* * This function is to load persisted bitset. */ public void loadPersistedBitSet(String fileName) throws IOException, ClassNotFoundException{ File file = new File(fileName); FileInputStream fileInputStream = new FileInputStream(file); ObjectInputStream objectInputStream = new ObjectInputStream(fileInputStream); BitSet bitSet = (BitSet) objectInputStream.readObject(); this.bitSet = bitSet; } }
关于程序需要注意:
1. Bitset对象在内存中所占的大小等于:(指定大小/8) Bytes。本例中“指定大小”为DEFAULT_SIZE 。这个写了一个程序验证过了。
2. Java中一个对象占用了多少内存,只和它直接所有的变量以及构造函数里的变量有关系,与该类内部其它方法有多少变量没有关系。
二、Python下的BloomFilter
zhxue@xzh3:~/xzhprog/python$ sudo aptitude install -y python-pip
1. 使用pybloomfilter module(未成功)
zhxue@xzh3:~/xzhprog/python$ sudo pip install -y pybloomfiltermmap
zhxue@xzh3:~/xzhprog/python$ sudo apt-get install --reinstall wamerican //测试时所需要的一个文件
执行时,碰到下列问题:
zhxue@xzh3:~/xzhprog/python$ python testBloomFilter.py
Traceback (most recent call last):
File "testBloomFilter.py", line 1, in <module>
from pybloomfilter import BloomFilter
ImportError: No module named pybloomfilter
zhxue@xzh3:~/xzhprog/python$ vi testBloomFilter.py
于是:
zhxue@xzh3:~/xzhprog/python$ sudo pip install pybloomfilter
Downloading/unpacking pybloomfilter
Downloading pybloomfilter-1.0.tar.gz
Running setup.py egg_info for package pybloomfilter
Installing collected packages: pybloomfilter
Running setup.py install for pybloomfilter
Successfully installed pybloomfilter
Cleaning up...
zhxue@xzh3:~/xzhprog/python$
接着如下问题:
zhxue@xzh3:~/xzhprog/python$ python testBloomFilter.py
[-]calg library: http://c-algorithms.sourceforge.net
待续。。。。。
参考文献:
http://blog.csdn.net/pirage/article/details/8878846
http://axiak.github.io/pybloomfiltermmap/index.html#install
2. 直接编程(验证通过)
zhxue@xzh3:~/xzhprog/python$ sudo apt-get install -y python-dev
zhxue@xzh3:~/xzhprog/python$ sudo pip install bitarray
zhxue@xzh3:~/xzhprog/python$ sudo pip install mmh3
zhxue@xzh3:~/xzhprog/python$ sudo apt-get install --reinstall wamerican //测试时所需要的一个文件
from bitarray import bitarray
import mmh3
class BloomFilter:
def __init__(self, size, hash_count):
self.size = size
self.hash_count = hash_count
self.bit_array = bitarray(size)
self.bit_array.setall(0)
def add(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
self.bit_array[result] = 1
def lookup(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
if self.bit_array[result] == 0:
return "Nope"
return "Probably"
bf = BloomFilter(500000, 7)
lines = open("/usr/share/dict/american-english").read().splitlines()
for line in lines:
bf.add(line)
print bf.lookup("google")
print bf.lookup("Max")
print bf.lookup("mice")
print bf.lookup("3")
运行结果:
zhxue@xzh3:~/xzhprog/python$ python myBloomFilter.py
Nope
Probably
Probably
Nope
参考文献:
http://maxburstein.com/blog/creating-a-simple-bloom-filter/