BloomFilter


BloomFilter——大规模数据处理利器:http://www.cnblogs.com/heaad/archive/2011/01/02/1924195.html


一、Java版的BloomFilter

JAVA Bitset应用总结:http://blog.csdn.net/originalintention/article/details/8224831


Bitset对象在内存中所占的大小等于:(指定大小/8) Bytes。

package cn.edu.xjtu.nhpcc.jenva.datastructure;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.BitSet;

/**
 * A fixed-size Bloom filter for strings, backed by a {@link BitSet}.
 *
 * <p>Each value is mapped to one bit per seed in {@code seeds}; {@link #add(String)}
 * sets those bits and {@link #contains(String)} reports whether all of them are set.
 * Because different strings can map to the same bits, {@code contains} may return
 * {@code true} for a value that was never added.
 *
 * <p>The bit set can be written to and restored from a file with
 * {@link #persistBitSet(String)} and {@link #loadPersistedBitSet(String)}.
 */
public class BloomFilter {

	/*
	 * Number of bits in the filter: 2^24 bits, i.e. 2 MiB of bit storage.
	 * Must stay a power of two because SimpleHash reduces its result with
	 * a (cap - 1) bit mask.
	 */
	private static final int DEFAULT_SIZE = 1 << 24;
	/* Multipliers for the individual hash functions; primes are the usual choice. */
	private static final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 };
	/* Not final: loadPersistedBitSet replaces it with the deserialized instance. */
	private BitSet bitSet = new BitSet(DEFAULT_SIZE);
	/* One hash function object per seed. */
	private final SimpleHash[] func = new SimpleHash[seeds.length];

	/** Creates an empty filter with one {@link SimpleHash} per entry of {@code seeds}. */
	public BloomFilter()
	{
		for (int i = 0; i < seeds.length; i++)
		{
			func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
		}
	}

	/**
	 * Demo: adds a few strings, round-trips the bit set through the file
	 * {@code ourbitset} in the working directory, then prints four lookups.
	 *
	 * @param args unused
	 * @throws IOException if the file cannot be written or read
	 * @throws ClassNotFoundException if the persisted object cannot be deserialized
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException {
		BloomFilter filter = new BloomFilter();
		filter.add("shengeng");
		filter.add("hujun");
		filter.add("Hello");
		filter.add("大众点评");

		String fileName = "ourbitset";
		filter.persistBitSet(fileName);
		// Drop the in-memory bits so the lookups below exercise the reloaded copy.
		filter.bitSet = null;
		filter.loadPersistedBitSet(fileName);

		boolean b1 = filter.contains("shengeng");
		boolean b2 = filter.contains("hello");
		boolean b3 = filter.contains("大众点评");
		boolean b4 = filter.contains("百度");

		System.out.println(b1);
		System.out.println(b2);
		System.out.println(b3);
		System.out.println(b4);
	}

	/**
	 * Marks {@code value} in the filter by setting one bit per hash function.
	 *
	 * @param value the string to add; must not be {@code null}
	 * @throws NullPointerException if {@code value} is {@code null}
	 */
	public void add(String value)
	{
		if (value == null)
		{
			// Same exception type the hash loop would raise, but with a clear message.
			throw new NullPointerException("value must not be null");
		}
		for (SimpleHash f : func)
		{
			bitSet.set(f.hash(value), true);
		}
	}

	/**
	 * Tests whether every bit that {@code value} maps to is set.
	 *
	 * @param value the string to look up; {@code null} is never contained
	 * @return {@code false} if {@code value} is {@code null} or any of its bits is
	 *         clear; {@code true} otherwise
	 */
	public boolean contains(String value)
	{
		if (value == null)
		{
			return false;
		}
		for (SimpleHash f : func)
		{
			// One clear bit is enough to decide; skip the remaining hashes.
			if (!bitSet.get(f.hash(value)))
			{
				return false;
			}
		}
		return true;
	}

	/** Seeded polynomial string hash reduced to the range {@code [0, cap)}. */
	public static class SimpleHash
	{
		private final int cap;
		private final int seed;

		/**
		 * @param cap  size of the target range; the {@code (cap - 1)} mask in
		 *             {@link #hash(String)} only covers the whole range when
		 *             {@code cap} is a power of two
		 * @param seed multiplier applied at each character
		 */
		public SimpleHash(int cap, int seed)
		{
			this.cap = cap;
			this.seed = seed;
		}

		/**
		 * Computes {@code result = seed * result + char} over all characters
		 * (int overflow wraps) and masks the result with {@code cap - 1}.
		 *
		 * @param value the string to hash; must not be {@code null}
		 * @return the masked hash value
		 */
		public int hash(String value)
		{
			int result = 0;
			int len = value.length();
			for (int i = 0; i < len; i++)
			{
				result = seed * result + value.charAt(i);
			}
			return (cap - 1) & result;
		}
	}


	/**
	 * Serializes the current bit set to {@code fileName}, overwriting any existing file.
	 *
	 * @param fileName path of the file to write
	 * @throws IOException if the file cannot be opened or written
	 */
	public void persistBitSet(String fileName) throws IOException {
		// try-with-resources closes both streams even when writeObject fails.
		try (FileOutputStream fileOutputStream = new FileOutputStream(new File(fileName));
				ObjectOutputStream objectOutputStream = new ObjectOutputStream(fileOutputStream)) {
			objectOutputStream.writeObject(this.bitSet);
		}
	}

	/**
	 * Replaces the current bit set with the one serialized in {@code fileName}.
	 *
	 * <p>NOTE(review): this uses Java native deserialization; only load files
	 * written by {@link #persistBitSet(String)} from a trusted location.
	 *
	 * @param fileName path of the file to read
	 * @throws IOException if the file cannot be opened or read
	 * @throws ClassNotFoundException if the class of the serialized object cannot be found
	 */
	public void loadPersistedBitSet(String fileName) throws IOException, ClassNotFoundException {
		// The original never closed these streams; close them deterministically.
		try (FileInputStream fileInputStream = new FileInputStream(new File(fileName));
				ObjectInputStream objectInputStream = new ObjectInputStream(fileInputStream)) {
			BitSet loaded = (BitSet) objectInputStream.readObject();
			this.bitSet = loaded;
		}
	}

}




关于程序需要注意:


1. Bitset对象在内存中所占的大小等于:(指定大小/8) Bytes。本例中“指定大小”为DEFAULT_SIZE 。这个写了一个程序验证过了。

2. Java中一个对象占用多少内存,只与它的实例字段(以及这些字段所引用的对象,例如在构造函数中创建并赋给字段的对象)有关,与该类各个方法内部有多少局部变量没有关系。



二、Python下的BloomFilter

zhxue@xzh3:~/xzhprog/python$ sudo aptitude install -y  python-pip

1. 使用pybloomfilter module(未成功)

zhxue@xzh3:~/xzhprog/python$ sudo pip install -y  pybloomfiltermmap

zhxue@xzh3:~/xzhprog/python$ sudo apt-get install --reinstall wamerican //测试时所需要的一个文件

执行时,碰到下列问题:

zhxue@xzh3:~/xzhprog/python$ python testBloomFilter.py 
Traceback (most recent call last):
  File "testBloomFilter.py", line 1, in <module>
    from pybloomfilter import BloomFilter
ImportError: No module named pybloomfilter
zhxue@xzh3:~/xzhprog/python$ vi testBloomFilter.py 

于是:

zhxue@xzh3:~/xzhprog/python$ sudo pip install pybloomfilter

Downloading/unpacking pybloomfilter
  Downloading pybloomfilter-1.0.tar.gz
  Running setup.py egg_info for package pybloomfilter
    
Installing collected packages: pybloomfilter
  Running setup.py install for pybloomfilter
    
Successfully installed pybloomfilter
Cleaning up...
zhxue@xzh3:~/xzhprog/python$ 

接着如下问题:

zhxue@xzh3:~/xzhprog/python$ python testBloomFilter.py 

[-]calg library: http://c-algorithms.sourceforge.net

待续。。。。。

参考文献:

http://blog.csdn.net/pirage/article/details/8878846

http://axiak.github.io/pybloomfiltermmap/index.html#install


2. 直接编程(验证通过)

zhxue@xzh3:~/xzhprog/python$ sudo apt-get install -y  python-dev

zhxue@xzh3:~/xzhprog/python$ sudo pip install bitarray

zhxue@xzh3:~/xzhprog/python$ sudo pip install mmh3

zhxue@xzh3:~/xzhprog/python$ sudo apt-get install --reinstall wamerican //测试时所需要的一个文件


from bitarray import bitarray
import mmh3

class BloomFilter:
    """Bloom filter over a fixed-size bit array, indexed via seeded `mmh3.hash` calls."""

    def __init__(self, size, hash_count):
        # size: number of bits in the array.
        # hash_count: number of seeds (0 .. hash_count - 1) hashed per string.
        bits = bitarray(size)
        bits.setall(0)  # start with every bit cleared
        self.bit_array = bits
        self.hash_count = hash_count
        self.size = size

    def _positions(self, string):
        """Yield the bit index for each seed, in seed order."""
        for seed in xrange(self.hash_count):
            yield mmh3.hash(string, seed) % self.size

    def add(self, string):
        """Set every bit that `string` maps to."""
        for position in self._positions(string):
            self.bit_array[position] = 1

    def lookup(self, string):
        """Return "Nope" if any mapped bit is clear, otherwise "Probably"."""
        if all(self.bit_array[position] for position in self._positions(string)):
            return "Probably"
        return "Nope"

bf = BloomFilter(500000, 7)

lines = open("/usr/share/dict/american-english").read().splitlines()

for line in lines:
    bf.add(line)

print bf.lookup("google")
print bf.lookup("Max")
print bf.lookup("mice")
print bf.lookup("3")

运行结果:

zhxue@xzh3:~/xzhprog/python$ python myBloomFilter.py 
Nope
Probably
Probably
Nope

参考文献:

http://maxburstein.com/blog/creating-a-simple-bloom-filter/

你可能感兴趣的:(BloomFilter)