在这里,我写了四个类。其中,AbstractBloomFilter 是抽象类,提供了大多数布隆过滤器共有的功能。StaticBloomFilter 是静态布隆过滤器,只能处理固定容量的数据。TwoStageBloomFilter 又称为二阶段布隆过滤器,额外提供元素地址查询功能,但同样只能处理固定容量的数据。BinaryTreeBloomFilter 是二叉树布隆过滤器,利用树形结构组织多个静态布隆过滤器,扩展性好,能够处理规模未知的数据。但需要注意,当数据量达到一定程度时,树形过滤器的内存占用也会随之上升。目前树形布隆过滤器仍在改进中,希望不久后可以支持文件存储,以节省内存空间。
参考:
静态布隆过滤器介绍: http://en.wikipedia.org/wiki/Bloom_filter
树形布隆过滤器介绍论文:http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5937232&tag=1 论文《A Forest-structured Bloom Filter with Flash Memory》
二阶段布隆过滤器介绍论文:http://www.springerlink.com/content/5700583110163736/ 论文:《TBF: A High-Efficient Query Mechanism in De-duplication Backup System》
一个静态布隆过滤器实现版本:http://code.google.com/p/java-bloomfilter/
代码缺陷
代码中的注释不多,二叉树过滤器的实现也还不够理想。希望后续能加上缓存策略,使每个节点过滤器可以独立选择使用 RAM 存储还是文件存储。
代码(3个见附件)
AbstractBloomFilter.java
StaticBloomFilter.java
TwoStageBloomFilter.java
import java.io.Serializable;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.SortedMap;
/**
 * Two-stage Bloom filter that can also return an address stored with each element.
 *
 * <p>Stage one is an ordinary Bloom filter over {@code listSize} slots, kept either as
 * 16-bit counters ({@code intmode == true}, supports removal) or as a {@link BitSet}
 * ({@code intmode == false}, no removal). Stage two is a chained hash table: the XOR of
 * an element's K hash positions selects a bucket in {@link #second}, and the bucket is a
 * singly linked chain threaded through three parallel lists ({@link #hashValueList},
 * {@link #addressList}, {@link #nextIndexList}).
 *
 * <p>Hashing ({@code getHashValue}, {@code createHashes}) and the fields {@code K},
 * {@code intmode} and {@code falsePositiveProbability} are inherited from
 * {@code AbstractBloomFilter}, which is not part of this file.
 *
 * <p>Element hashes are stored in the second stage as lowercase hex strings. Data that was
 * serialized by an earlier version, which stored platform-charset decoded strings, will
 * not be found by lookups.
 *
 * @param <E> element type; elements are hashed through {@code toString()}
 */
public class TwoStageBloomFilter<E> extends AbstractBloomFilter<E> implements Serializable{
    /** Marker stored in {@link #hashValueList} for a removed entry whose slot may be reused. */
    private static final String TOMBSTONE = "!!";
    /** Value stored in {@link #nextIndexList} for the last entry of a chain. */
    private static final int END_OF_CHAIN = -1;
    /** Digits used by {@link #encodeHash(byte[])}. */
    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
    /** Message printed when removal is requested on a BitSet-backed filter. */
    private static final String REMOVE_UNSUPPORTED =
            "BitSet does not support removeElement method,you can create BF using intmode=true";

    private short[] list; // 16-bit counters, used when intmode is true
    private BitSet bitset; // plain bits, used when intmode is false
    private int listSize;
    private int expectedNumberOfFilterElements; // expected (maximum) number of elements to be added
    private int numberOfAddedElements; // number of elements actually added to the Bloom filter
    /** Bucket (XOR of the K hash positions, reduced modulo listSize) to index of the chain head. */
    private HashMap<Integer,Integer> second=new HashMap<Integer,Integer>();
    private ArrayList<String> hashValueList=new ArrayList<String>(); // encoded hash value of each entry
    private ArrayList<String> addressList=new ArrayList<String>(); // address stored with each entry
    private ArrayList<Integer> nextIndexList=new ArrayList<Integer>(); // next entry in the same bucket, or -1

    /**
     * Creates a counter-backed filter using the inherited default K.
     *
     * @param falsePositiveProbability desired false positive probability
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     */
    public TwoStageBloomFilter(float falsePositiveProbability,
            int expectedNumberOfFilterElements) {
        this(falsePositiveProbability, expectedNumberOfFilterElements, true);
    }

    /**
     * Creates a filter using the inherited default K.
     *
     * @param falsePositiveProbability desired false positive probability
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     * @param intmode {@code true} for 16-bit counters (removal supported), {@code false} for a BitSet
     */
    public TwoStageBloomFilter(float falsePositiveProbability,
            int expectedNumberOfFilterElements, boolean intmode) {
        initialise(falsePositiveProbability, expectedNumberOfFilterElements, intmode);
    }

    /**
     * Same as {@link #TwoStageBloomFilter(float, int)}; the probability is narrowed to float.
     *
     * @param falsePositiveProbability desired false positive probability
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     */
    public TwoStageBloomFilter(double falsePositiveProbability,
            int expectedNumberOfFilterElements) {
        this((float) falsePositiveProbability, expectedNumberOfFilterElements);
    }

    /**
     * Creates a counter-backed filter with false positive probability {@code 1 / (100 * n)}.
     *
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     */
    public TwoStageBloomFilter(int expectedNumberOfFilterElements) {
        this(defaultFalsePositiveProbability(expectedNumberOfFilterElements),
                expectedNumberOfFilterElements);
    }

    /**
     * Creates a counter-backed filter with an explicit number of hash functions.
     *
     * @param falsePositiveProbability desired false positive probability
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     * @param k number of hash positions per element
     */
    public TwoStageBloomFilter(float falsePositiveProbability,
            int expectedNumberOfFilterElements, short k) {
        this(falsePositiveProbability, expectedNumberOfFilterElements, k, true);
    }

    /**
     * Creates a filter with an explicit number of hash functions and storage mode.
     *
     * @param falsePositiveProbability desired false positive probability
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     * @param k number of hash positions per element
     * @param intmode {@code true} for 16-bit counters (removal supported), {@code false} for a BitSet
     */
    public TwoStageBloomFilter(float falsePositiveProbability,
            int expectedNumberOfFilterElements, short k, boolean intmode) {
        this.K = k; // must be set before the size is derived from it
        initialise(falsePositiveProbability, expectedNumberOfFilterElements, intmode);
    }

    /**
     * Same as {@link #TwoStageBloomFilter(float, int, short)}; the probability is narrowed to float.
     *
     * @param falsePositiveProbability desired false positive probability
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     * @param k number of hash positions per element
     */
    public TwoStageBloomFilter(double falsePositiveProbability,
            int expectedNumberOfFilterElements, short k) {
        this((float) falsePositiveProbability, expectedNumberOfFilterElements, k);
    }

    /**
     * Creates a counter-backed filter with false positive probability {@code 1 / (100 * n)}
     * and an explicit number of hash functions.
     *
     * @param expectedNumberOfFilterElements maximum number of elements the filter accepts
     * @param k number of hash positions per element
     */
    public TwoStageBloomFilter(int expectedNumberOfFilterElements, short k) {
        this(defaultFalsePositiveProbability(expectedNumberOfFilterElements),
                expectedNumberOfFilterElements, k);
    }

    /** Computes 1 / (100 * n) using a long product so that large n cannot overflow int. */
    private static float defaultFalsePositiveProbability(int expectedNumberOfFilterElements) {
        return 1.0f / (float) (expectedNumberOfFilterElements * 100L);
    }

    /** Stores the configuration, sizes the first stage as -kn / ln(1 - f^(1/k)) and allocates it. */
    private void initialise(float falsePositiveProbability,
            int expectedNumberOfFilterElements, boolean intmode) {
        this.intmode = intmode;
        this.numberOfAddedElements = 0;
        this.expectedNumberOfFilterElements = expectedNumberOfFilterElements;
        this.falsePositiveProbability = falsePositiveProbability;
        // Evaluated in double so that K * n cannot overflow int for large capacities.
        double size = -1.0 * this.K * this.expectedNumberOfFilterElements
                / Math.log(1 - Math.pow(this.falsePositiveProbability, 1.0 / this.K));
        // NaN casts to 0 and nonsensical inputs can go negative; always keep at least one slot.
        this.listSize = Math.max(1, (int) size);
        if (intmode) {
            this.list = new short[this.listSize];
        } else {
            this.bitset = new BitSet(this.listSize);
        }
    }

    /**
     * Adds an element together with its address.
     *
     * @param element element to add; hashed via {@code toString().getBytes()}
     * @param address address to associate with the element
     * @return {@code false} if the filter has reached its expected capacity; {@code true} if the
     *         element was stored, or if the filter already reports it as present
     */
    @Override
    protected boolean addElement(E element,String address) {
        if (!this.spareSpaceAvailable()) {
            System.err.println("One Bloom Filter is full");
            return false;
        }
        // Platform charset kept on purpose: hash values handed to removeByHashValue
        // may be computed elsewhere in the same way.
        byte[] hashValue = this.getHashValue(element.toString().getBytes());
        int[] pos = this.createHashes(hashValue);
        // NOTE(review): a first-stage false positive returns here without recording the
        // address, so getAddressIfContains will later return null for that element.
        if (this.contains(pos)) {
            return true;
        }
        String key = encodeHash(hashValue);
        int bucket = this.bucketOf(pos);
        Integer head = this.second.get(bucket);
        if (head == null) {
            this.second.put(bucket, this.hashValueList.size());
            this.appendEntry(key, address);
        } else {
            int index = head;
            while (true) {
                String stored = this.hashValueList.get(index);
                if (stored.equals(key)) {
                    return true;
                }
                if (stored.equals(TOMBSTONE)) {
                    // Reuse the slot of a removed entry; it stays linked into the chain.
                    this.hashValueList.set(index, key);
                    this.addressList.set(index, address);
                    this.markPositions(pos);
                    this.numberOfAddedElements++;
                    return true;
                }
                int next = this.nextIndexList.get(index);
                if (next == END_OF_CHAIN) {
                    break; // index is now the tail of the chain
                }
                index = next;
            }
            this.nextIndexList.set(index, this.hashValueList.size());
            this.appendEntry(key, address);
        }
        this.markPositions(pos);
        this.numberOfAddedElements++;
        return true;
    }

    /**
     * Removes an element. Only supported when the filter uses counters.
     *
     * @param element element to remove
     * @return {@code true} if an entry was found and removed; {@code false} otherwise, including
     *         when the filter is BitSet-backed
     */
    @Override
    protected boolean removeElement(E element) {
        if (!intmode) {
            System.err.println(REMOVE_UNSUPPORTED);
            return false;
        }
        byte[] hashValue = this.getHashValue(element.toString().getBytes());
        return this.removeByHashValue(hashValue);
    }

    /**
     * Tests the first stage only.
     *
     * @param element element to look up
     * @return {@code true} if every hash position of the element is set
     */
    @Override
    protected boolean contains(E element) {
        byte[] bytes = element.toString().getBytes();
        return this.contains(this.createHashes(this.getHashValue(bytes)));
    }

    /** Empties both stages and resets the element count; the configuration is kept. */
    @Override
    protected void clear() {
        if (intmode) {
            this.list = new short[this.listSize];
        } else {
            this.bitset = new BitSet(this.listSize);
        }
        // The bucket heads index into the lists below, so they must be dropped with them.
        this.second.clear();
        this.hashValueList.clear();
        this.addressList.clear();
        this.nextIndexList.clear();
        this.numberOfAddedElements = 0;
    }

    /**
     * Looks up the address stored with an element.
     *
     * @param element element to look up
     * @return the stored address, or {@code null} if the first stage rejects the element or the
     *         second stage holds no entry for its hash
     */
    protected String getAddressIfContains(E element) {
        byte[] hashValue = this.getHashValue(element.toString().getBytes());
        int[] pos = this.createHashes(hashValue);
        if (!this.contains(pos)) {
            return null;
        }
        int index = this.findEntry(this.bucketOf(pos), encodeHash(hashValue));
        return index == END_OF_CHAIN ? null : this.addressList.get(index);
    }

    /**
     * Tests the first stage for a set of hash positions.
     *
     * @param pos hash positions; the first K entries are read
     * @return {@code true} if all K positions are set (counter above zero, or bit set)
     */
    @Override
    protected boolean contains(int[] pos) {
        for (int j = 0; j < this.K; j++) {
            int slot = this.slotOf(pos[j]);
            boolean occupied = intmode ? this.list[slot] > 0 : this.bitset.get(slot);
            if (!occupied) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes the entry with the given hash value. Only supported when the filter uses counters.
     *
     * @param hashValue hash value as produced by the inherited {@code getHashValue}
     * @return {@code true} if an entry was found and removed; {@code false} otherwise, including
     *         when the filter is BitSet-backed
     */
    public boolean removeByHashValue(byte[] hashValue)
    {
        if (!intmode) {
            // There are no counters to decrement in BitSet mode (list is null).
            System.err.println(REMOVE_UNSUPPORTED);
            return false;
        }
        int[] pos = this.createHashes(hashValue);
        if (!this.contains(pos)) {
            return false;
        }
        int index = this.findEntry(this.bucketOf(pos), encodeHash(hashValue));
        if (index == END_OF_CHAIN) {
            return false;
        }
        // Leave a tombstone so the chain stays intact and the slot can be reused.
        this.hashValueList.set(index, TOMBSTONE);
        for (int j = 0; j < this.K; j++) {
            this.list[this.slotOf(pos[j])]--;
        }
        this.numberOfAddedElements--;
        return true;
    }

    /** @return number of slots in the first stage */
    public int getListSize()
    {
        return this.listSize;
    }

    /** @return the capacity this filter was created with */
    public int getExpectedNumberOfFilterElements()
    {
        return this.expectedNumberOfFilterElements;
    }

    /** @return number of elements currently stored */
    public int getNumberOfAddedElements()
    {
        return this.numberOfAddedElements;
    }

    /** @return {@code true} while fewer elements than the expected capacity are stored */
    @Override
    protected boolean spareSpaceAvailable() {
        return this.expectedNumberOfFilterElements > this.numberOfAddedElements;
    }

    /**
     * Creates a new, empty filter with this filter's false positive probability, capacity,
     * K and storage mode. Stored elements are not copied.
     *
     * @return the new empty filter
     */
    @Override
    protected AbstractBloomFilter clone() {
        return new TwoStageBloomFilter<E>((float) this.falsePositiveProbability,
                this.expectedNumberOfFilterElements, (short) this.K, this.intmode);
    }

    /**
     * Not supported: this filter needs an address with every element.
     *
     * @param element ignored
     * @return always {@code false}
     */
    @Override
    protected boolean addElement(E element) {
        System.err.println("addElement(E) is not applicable for TwoStageBloomFilter");
        return false;
    }

    /** Maps one hash position onto a first-stage slot in [0, listSize). */
    private int slotOf(int hash) {
        return Math.abs(hash % this.listSize);
    }

    /** XORs all hash positions together and reduces the result to a second-stage bucket. */
    private int bucketOf(int[] pos) {
        int xor = pos[0];
        for (int j = 1; j < pos.length; j++) {
            xor ^= pos[j];
        }
        return Math.abs(xor % this.listSize);
    }

    /** Increments the counter (or sets the bit) at each of the first K hash positions. */
    private void markPositions(int[] pos) {
        for (int j = 0; j < this.K; j++) {
            int slot = this.slotOf(pos[j]);
            if (intmode) {
                this.list[slot]++;
            } else {
                this.bitset.set(slot);
            }
        }
    }

    /** Appends a new chain-terminating entry to the three parallel lists. */
    private void appendEntry(String key, String address) {
        this.hashValueList.add(key);
        this.addressList.add(address);
        this.nextIndexList.add(END_OF_CHAIN);
    }

    /** Walks the chain of a bucket and returns the index of the entry equal to key, or -1. */
    private int findEntry(int bucket, String key) {
        Integer head = this.second.get(bucket);
        if (head == null) {
            return END_OF_CHAIN;
        }
        int index = head;
        while (index != END_OF_CHAIN) {
            if (this.hashValueList.get(index).equals(key)) {
                return index;
            }
            index = this.nextIndexList.get(index);
        }
        return END_OF_CHAIN;
    }

    /**
     * Encodes a hash as lowercase hex. Unlike decoding the bytes with a charset this is
     * lossless, platform independent, and can never produce the tombstone marker.
     */
    private static String encodeHash(byte[] hashValue) {
        char[] out = new char[hashValue.length * 2];
        for (int i = 0; i < hashValue.length; i++) {
            int unsigned = hashValue[i] & 0xFF;
            out[2 * i] = HEX_DIGITS[unsigned >>> 4];
            out[2 * i + 1] = HEX_DIGITS[unsigned & 0x0F];
        }
        return new String(out);
    }
}
BinaryTreeBloomFilter.java