动态布隆过滤器Java实现

 布隆过滤器(Bloom Filter)是1970年由布隆提出的。它实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都远远超过一般的算法,缺点是有一定的误识别率和删除困难。当年,布隆过滤器还是静态的,即只能处理一定容量的数据,不能处理未知规模的数据。




在这里,我写了四个类。其中,类AbstractBloomFilter是抽象类,提供了大多数布隆过滤器共有的功能。类StaticBloomFilter是静态布隆过滤器,只能处理一定容量的数据。类TwoStageBloomFilter又称为二阶段布隆过滤器,提供元素地址查询功能,但同样只能处理一定容量的数据。类BinaryTreeBloomFilter是二叉树布隆过滤器,利用树形结构组织静态布隆过滤器,扩展性好,能够处理未知规模的数据。但需要注意,当数据量达到一定程度时,树形过滤器的内存占用量也会随之上升。目前我还在改进树形布隆过滤器,希望不久后可以支持文件存储,以节省内存空间。

 

参考:

 

静态布隆过滤器介绍: http://en.wikipedia.org/wiki/Bloom_filter

树形布隆过滤器介绍论文:http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5937232&tag=1   论文《A Forest-structured Bloom Filter with Flash Memory》

二阶段布隆过滤器介绍论文:http://www.springerlink.com/content/5700583110163736/ 论文:《TBF: A High-Efficient Query Mechanism in De-duplication Backup System》

一个静态布隆过滤器实现版本:http://code.google.com/p/java-bloomfilter/

 

代码缺陷

    注释不多,二叉树过滤器的实现还不够理想。希望后面能加上缓存策略,单个节点过滤器独立选择RAM存储或文件存储。


代码(3个见附件)

AbstractBloomFilter.java

 

StaticBloomFilter.java

 

 

TwoStageBloomFilter.java

 

import java.io.Serializable;

import java.util.ArrayList;

import java.util.BitSet;

import java.util.HashMap;

import java.util.SortedMap;

 

 

 

public class TwoStageBloomFilter<E> extends AbstractBloomFilter<E> implements Serializable{

 

private short[] list;//short length is 32;

private BitSet bitset;

private int listSize;

private int expectedNumberOfFilterElements; // expected (maximum) number of elements to be added

 

private int numberOfAddedElements; // number of elements actually added to the Bloom filter

private HashMap<Integer,Integer> second=new HashMap<Integer,Integer>();/*first Integer is XOR operation result of K hashValues;

                                                                        second Integer referes to the index in the following hashValueList */

private ArrayList<String> hashValueList=new ArrayList<String>();//hashValueList is used to save hash values

private ArrayList<String> addressList=new ArrayList<String>();//addressList is used to save addresses of chunks

private ArrayList<Integer> nextIndexList=new ArrayList<Integer>();//nextIndexList is used to save the next index of hashValueList ,who has the same XOR result

public TwoStageBloomFilter(float falsePositiveProbability,

int expectedNumberOfFilterElements) {

this((float)falsePositiveProbability, expectedNumberOfFilterElements,true);

}

public TwoStageBloomFilter(float falsePositiveProbability,

int expectedNumberOfFilterElements,boolean intmode) {

this.intmode=intmode;

this.numberOfAddedElements=0;

this.expectedNumberOfFilterElements=expectedNumberOfFilterElements;

this.falsePositiveProbability=falsePositiveProbability;

this.listSize=(int)(-this.K*this.expectedNumberOfFilterElements/(Math.log

(1-Math.pow(this.falsePositiveProbability, 1.0/this.K))));//-kn/ln(1-f^(1/k))

this.listSize=(this.listSize==0)?1:this.listSize;

if(intmode)

this.list=new short[this.listSize];

else

this.bitset=new BitSet(this.listSize);

}

public TwoStageBloomFilter(double falsePositiveProbability,

int expectedNumberOfFilterElements) {

this((float)falsePositiveProbability, expectedNumberOfFilterElements);

}

public TwoStageBloomFilter(int expectedNumberOfFilterElements) {

this((float)1.0/(expectedNumberOfFilterElements*100), expectedNumberOfFilterElements);

}

public TwoStageBloomFilter(float falsePositiveProbability,

int expectedNumberOfFilterElements, short k) {

this((float)falsePositiveProbability, expectedNumberOfFilterElements, k,true);

}

public TwoStageBloomFilter(float falsePositiveProbability,

int expectedNumberOfFilterElements, short k,boolean intmode) {

this.intmode=intmode;

this.K=k;

this.numberOfAddedElements=0;

this.expectedNumberOfFilterElements=expectedNumberOfFilterElements;

this.falsePositiveProbability=falsePositiveProbability;

this.listSize=(int)(-this.K*this.expectedNumberOfFilterElements/(Math.log

(1-Math.pow(this.falsePositiveProbability, 1.0/this.K))));//-kn/ln(1-f^(1/k))

this.listSize=(this.listSize==0)?1:this.listSize;

if(intmode)

this.list=new short[this.listSize];

else

this.bitset=new BitSet(this.listSize);

}

public TwoStageBloomFilter(double falsePositiveProbability,

int expectedNumberOfFilterElements, short k) {

this((float)falsePositiveProbability, expectedNumberOfFilterElements, k);

}

public TwoStageBloomFilter(int expectedNumberOfFilterElements, short k) {

this((float)1.0/(expectedNumberOfFilterElements*100), expectedNumberOfFilterElements, k);

}

@Override

protected boolean addElement(E element,String address) {

if(!this.spareSpaceAvailable())

{

System.err.println("One Bloom Filter is full");

return false;

}

byte[] bytes=element.toString().getBytes();

byte[] hashValue=this.getHashValue(bytes);

int[] pos=this.createHashes(hashValue);

if(this.contains(pos))

return true;

int xorResult=pos[0];

for(int j=1;j<pos.length;j++)

xorResult^=pos[j];

xorResult=Math.abs(xorResult%this.listSize);

Integer hashIndex=-1;

if((hashIndex=this.second.get(new Integer(xorResult)))==null)

{

this.second.put(xorResult, this.hashValueList.size());

this.hashValueList.add(new String(hashValue));

this.addressList.add(address);

this.nextIndexList.add(-1);

for(int j=0;j<K;j++)

{

if(intmode)

this.list[Math.abs(pos[j]%this.listSize)]++;

else

this.bitset.set(Math.abs(pos[j]%this.listSize));

}

this.numberOfAddedElements++;

return true;

}

Integer tmp;

String tmpStr;

while((tmp=this.nextIndexList.get(hashIndex))!=null)

{

tmpStr=this.hashValueList.get(hashIndex);

if(tmpStr.equals(new String(hashValue)))

return true;

if(tmpStr.equals("!!"))

{

this.hashValueList.set(hashIndex, new String(hashValue));

this.addressList.set(hashIndex, address);

return true;

}

hashIndex=tmp;

}

for(int j=0;j<K;j++)

{

if(intmode)

this.list[Math.abs(pos[j]%this.listSize)]++;

else

this.bitset.set(Math.abs(pos[j]%this.listSize));

}

this.nextIndexList.set(hashIndex, this.hashValueList.size());

this.hashValueList.add(new String(hashValue));

this.addressList.add(address);

this.nextIndexList.add(-1);

this.numberOfAddedElements++;

return true;

}

@Override

protected boolean removeElement(E element) {

if(!intmode)

{

System.err.println("BitSet does not support removeElement method,you can create BF using intmode=true");

return false;

}

byte[] bytes=element.toString().getBytes();

byte[] hashValue=this.getHashValue(bytes);

return this.removeByHashValue(hashValue);

}

@Override

protected boolean contains(E element) {

byte[] bytes=element.toString().getBytes();

int[] pos=this.createHashes(this.getHashValue(bytes));

return this.contains(pos);

}

@Override

protected void clear() {

if(intmode)

this.list=new short[this.listSize];

else

this.bitset=new BitSet(this.listSize);

this.hashValueList.clear();

this.addressList.clear();

this.nextIndexList.clear();

this.numberOfAddedElements=0;

}

 

protected String getAddressIfContains(E element) {

byte[] bytes=element.toString().getBytes();

byte[] hashValue=this.getHashValue(bytes);

int[] pos=this.createHashes(hashValue);

if(!this.contains(pos))

return null;

int xorResult=pos[0];

for(int j=1;j<pos.length;j++)

xorResult^=pos[j];

xorResult=Math.abs(xorResult%this.listSize);

Integer index;

if((index=this.second.get(xorResult))!=null)

{

while(!this.hashValueList.get(index).equals(new String(hashValue)))

{

index=this.nextIndexList.get(index);

if(index==-1)

return null;

}

return this.addressList.get(index);

}

return null;

}

@Override

protected boolean contains(int[] pos) {

if(intmode)

for(int j=0;j<K;j++)

{

if(this.list[Math.abs(pos[j]%this.listSize)]>0)

continue;

else

return false;

}

else

for(int j=0;j<K;j++)

{

if(this.bitset.get(Math.abs(pos[j]%this.listSize)))

continue;

else

return false;

}

return true;

}

public boolean removeByHashValue(byte[] hashValue)

{

int[] pos=this.createHashes(hashValue);

if(!this.contains(pos))

return false;

int xorResult=pos[0];

for(int j=1;j<pos.length;j++)

xorResult^=pos[j];

xorResult=Math.abs(xorResult%this.listSize);

Integer hashIndex=-1;

if((hashIndex=this.second.get(new Integer(xorResult)))==null)

{

return false;

}

do

{

if(hashIndex==-1)

return false;

if(this.hashValueList.get(hashIndex).equals(new String(hashValue)))

{

this.hashValueList.set(hashIndex, "!!");

break;

}

hashIndex=this.nextIndexList.get(hashIndex);

}while(true);

for(int j=0;j<this.K;j++)

{

this.list[Math.abs(pos[j]%this.listSize)]--;

}

this.numberOfAddedElements--;

return true;

}

public int getListSize()

{

return this.listSize;

}

public int getExpectedNumberOfFilterElements()

{

return this.expectedNumberOfFilterElements;

}

public int getNumberOfAddedElements()

{

return this.numberOfAddedElements;

}

 

@Override

protected boolean spareSpaceAvailable() {

return this.expectedNumberOfFilterElements>this.numberOfAddedElements;

}

 

@Override

protected AbstractBloomFilter clone() {

return new TwoStageBloomFilter(this.listSize);

}

 

@Override

protected boolean addElement(E element) {

System.err.println("addElement(E) is not applicable for TwoStageBloomFilter");

return false;

}

}

 

BinaryTreeBloomFilter.java

 


你可能感兴趣的:(java,dynamic,filter,布隆过滤器,bloom)