bloomfilter的实现

Bloom filter 利用多个 hash 函数将每个 key 映射到位数组(bit array)中的若干个 bit 上,以允许少量误判(false positive)为代价,大幅节省存储空间。

搜索引擎的爬虫在判断某个页面是否已经爬取过时,就会用到 Bloom filter。

具体介绍可以看这篇博客http://www.cnblogs.com/heaad/archive/2011/01/02/1924195.html


下面是我的实现

#include "stdafx.h"
#include<cmath>
#include<iostream>
using namespace std;

// Bloom filter over C strings that probes a fixed-size bit array with two
// hash functions (hash1 and hash2).
class bloomfilter
{
public:
	// Creates an empty filter; N is recorded as the number of keys.
	bloomfilter(const int N);
	// Test-and-insert: returns true only if both bits selected by hash1 and
	// hash2 were already set. Any of the two bits that is still clear is set
	// as a side effect, so the key is present after the call.
	bool find(char*str);
	// Releases the bit array.
	~bloomfilter();

private:
	// The bit array is owned through a raw pointer and freed in the
	// destructor, so a member-wise copy would make two objects delete[] the
	// same buffer. Copying is therefore disabled: both operations are
	// declared private and intentionally left undefined.
	bloomfilter(const bloomfilter&);
	bloomfilter& operator=(const bloomfilter&);

	int hash1(char*str);          // first hash, reduced modulo m
	int hash2(char*str);          // second hash, reduced modulo m
	int getbit(const int nn);     // reads bit nn of the bitmap
	void setbit(const int nn);    // sets bit nn of the bitmap

	unsigned int m; // width of the bit array, in bits
	unsigned int k; // number of hash functions used
	double f;       // false-positive rate
	unsigned int n; // number of keys
	char*bitmap;    // bit storage, 8 bits per char
};

// Builds an empty filter. N is stored as the key count; the bit-array width
// (1000000 bits) and the number of hash functions (2) are fixed constants.
//
// Notes kept from the original author:
//   - reaching the false-positive rate f needs k = -ln(f) / ln(2) hash
//     functions:            k = -log(f) / log(2.0);
//   - f, n, m and k are related; a shortcut is taken here instead:
//                           n = m ln(0.6185) / ln(f)
//                           m = (n + 1)*log(f) / log(0.6185);
//     where m is a number of bits and one char holds 8 bits.
bloomfilter::bloomfilter(const int N)
	: m(1000000), k(2), f(0.00001), n(N), bitmap(0)
{
	// One extra byte covers the bits left over when m is not a multiple of 8.
	const unsigned int byteCount = m / 8 + 1;
	// The trailing () value-initializes the array, so every bit starts at 0.
	bitmap = new char[byteCount]();
}
// Frees the bit array that the constructor allocated with new[].
bloomfilter::~bloomfilter()
{
	delete[] this->bitmap;
}
// Tests str against the filter and inserts it in the same pass.
// Returns true only when both bits chosen by hash1 and hash2 were already
// set; every bit found clear is set before returning.
bool bloomfilter::find(char*str)
{
	// Braced initialization evaluates hash1 before hash2.
	const int positions[2] = { hash1(str), hash2(str) };

	bool allSet = true;
	for (int i = 0; i < 2; ++i)
	{
		// Positions are handled strictly in order, so when both hashes land
		// on the same bit the second probe sees the bit set by the first.
		if (!getbit(positions[i]))
		{
			setbit(positions[i]);
			allSet = false;
		}
	}
	return allSet;
}

// Bit operation: reads bit nn of the bitmap, i.e. bit (nn % 8) of byte (nn / 8).
// Returns 1 when the bit is set and 0 otherwise.
// nn is expected to be a non-negative index such as the ones produced by
// hash1/hash2.
int bloomfilter::getbit(const int nn)
{
	const int byteIndex = nn >> 3;
	const int bitOffset = nn % 8;
	// Read the byte as unsigned and mask the bit. Where char is signed, a
	// byte with its top bit set is negative; shifting it and taking % 2
	// gives -1, so comparing against 1 would report a set bit as clear.
	return (static_cast<unsigned char>(bitmap[byteIndex]) >> bitOffset) & 1;
}

// Bit operation: sets bit nn of the bitmap, i.e. bit (nn % 8) of byte (nn / 8).
// Setting a bit that is already set leaves the bitmap unchanged.
// nn is expected to be a non-negative index such as the ones produced by
// hash1/hash2.
void bloomfilter::setbit(const int nn)
{
	const int byteIndex = nn >> 3;
	const int bitOffset = nn % 8;
	// OR the mask in rather than adding it: adding to a bit that is already
	// set would carry into the neighbouring bits and corrupt the byte.
	const unsigned char current = static_cast<unsigned char>(bitmap[byteIndex]);
	bitmap[byteIndex] = static_cast<char>(current | (1u << bitOffset));
}

// First hash function; any other suitable hash can be substituted here.
// Computes h = 31 * h + c over the characters of str and reduces the result
// modulo the bit-array width m.
// str must point to a NUL-terminated string.
int bloomfilter::hash1(char*str)
{
	unsigned int h = 0;
	// Terminate on the NUL character, not on '\n': keys are C strings that
	// normally contain no newline, and scanning for '\n' would read past the
	// end of the buffer.
	for (const char *p = str; *p != '\0'; p++)
	{
		h = 31 * h + *p;
	}
	return h%m;
}

// Second hash function: folds every character c of str into the accumulator
// as c + (acc << 6) + (acc << 16) - acc, then reduces the result modulo the
// bit-array width m.
// str must point to a NUL-terminated string.
int bloomfilter::hash2(char*str)
{
	unsigned int acc = 0;
	// Walk up to the terminating NUL; this visits exactly the characters
	// that strlen(str) would count.
	for (const char *cursor = str; *cursor != '\0'; ++cursor)
	{
		acc = (*cursor) + (acc << 6) + (acc << 16) - acc;
	}
	return acc % m;
}



// Demo driver: probes the filter with three URLs (the first one twice) and
// prints the result of each find() call, then waits via "pause".
int _tmain(int argc, _TCHAR* argv[])
{
	// find() takes a non-const char*. A string literal does not convert to
	// char* in standard C++ (C++11 and later), so the keys are kept in
	// writable arrays instead of being passed as literals.
	char google[] = "www.google.com";
	char baidu[] = "www.baidu.com";

	bloomfilter bf(1000);
	cout << bf.find(google);
	cout << bf.find(baidu);
	cout << bf.find(google);
	system("pause");
	return 0;
}


你可能感兴趣的:(bloomfilter)