bloom filter实现

布隆过滤器的实现较为简单,在网上找了几个有名的字符串哈希函数。

在网址过滤时,如果往过滤器中插入失败(即该网址对应的所有哈希位都已经被置为1),就判定网址重复;由于不同网址的哈希位可能全部冲突,新的网址也可能被误判为重复,因此存在一定的误判率。

头文件:

/*
 * bloom_filter.h
 *
 *  Created on: May 2, 2012
 *      Author: joan
 */

#ifndef BLOOM_FILTER_H_
#define BLOOM_FILTER_H_

#include "../OPTION.h"

// number of hash functions applied to every key
#define HASH_FUN_NUM	7
// number of bits stored in each char of the bitmap
#define CHAR_BITS		8
// total number of bits in the bitmap; HASH_SIZE (bitmap size in chars) is not
// defined in this header -- presumably it comes from ../OPTION.h, TODO confirm
#define MAP_SIZE		(HASH_SIZE*CHAR_BITS)

// string hash function: takes a NUL-terminated string, returns a bit position
typedef unsigned int (*HashFunc)(const char *);

/*
 * Bloom filter for strings.
 *
 * Every key is mapped by HASH_FUN_NUM hash functions to bit positions in a
 * bitmap of HASH_SIZE chars (MAP_SIZE bits).
 *
 * The object owns the bitmap through a raw pointer, so copying is disabled:
 * a compiler-generated copy would share the pointer and both objects would
 * free it in their destructors.
 */
class bloom_filter
{
public:
	bloom_filter();
	~bloom_filter();
	/*
	 * Insert @key. Returns true if at least one bit was newly set,
	 * false if every bit for @key was already set (duplicate key, or a
	 * new key colliding on all positions).
	 */
	bool insert(const string &key);
private:
	// not copyable: declared private and intentionally left undefined
	bloom_filter(const bloom_filter &);
	bloom_filter &operator=(const bloom_filter &);

	// set one bit of the bitmap; true if the bit was not set before
	bool SetBit(const int pos);
	// the hash functions registered in hashp; each returns a value in [0, MAP_SIZE)
	static unsigned int SDBMHash(const char *str);
	static unsigned int RSHash(const char *str);
	static unsigned int JSHash(const char *str);
	static unsigned int ELFHash(const char *str);
	static unsigned int BKDRHash(const char *str);
	static unsigned int DJBHash(const char *str);
	static unsigned int APHash(const char *str);
private:
	char *bbmap;		//bloom filter bitmap, owned by this object
	HashFunc hashp[HASH_FUN_NUM];	//hash function pointers, one per hash function
};

#endif /* BLOOM_FILTER_H_ */
源文件:
/*
 * bloom_filter.cpp
 *
 *  Created on: May 2, 2012
 *      Author: joan
 */

#include "bloom_filter.h"

/*
 * Allocate a zero-filled bitmap of HASH_SIZE chars and register the
 * HASH_FUN_NUM hash functions in hashp.
 */
bloom_filter::bloom_filter()
	: bbmap(new char[HASH_SIZE]())	// trailing () value-initializes: every char starts at 0
{
	// order of the table fixes which function ends up in which hashp slot
	const HashFunc functions[HASH_FUN_NUM] =
	{
		SDBMHash, RSHash, JSHash, ELFHash, BKDRHash, DJBHash, APHash
	};

	for(int slot=0; slot<HASH_FUN_NUM; slot++)
	{
		hashp[slot] = functions[slot];
	}
}

/*
 * Release the bitmap array that the constructor allocated with new[].
 */
bloom_filter::~bloom_filter()
{
	delete [] bbmap;
}

/*
 * Insert @key into the bitmap.
 * Each of the HASH_FUN_NUM hash functions yields one bit position for the
 * key, and every one of those bits is set to 1.
 *
 * Returns true if at least one of those bits was 0 before the call.
 * Returns false if all of them were already 1: the key is either a
 * duplicate or a new key that collides on every position.
 */
bool bloom_filter::insert(const string &key)
{
	const char *text = key.c_str();
	bool changed = false;
	for(int idx=0; idx<HASH_FUN_NUM; idx++)
	{
		const int bit = (hashp[idx])(text);
		// SetBit has to run for every hash function, so it must never sit on
		// the right-hand side of a short-circuiting || (changed || SetBit(bit)
		// would stop setting bits after the first success).
		if( SetBit(bit) )
		{
			changed = true;
		}
	}
	return changed;
}

/*
 * Set the bit at position @pos of the bitmap.
 * Returns true if the bit was 0 and has now been set to 1,
 * false if it was already 1 (the bitmap is left untouched).
 */
bool bloom_filter::SetBit(const int pos)
{
	const unsigned int byteIndex = pos / CHAR_BITS;	// which char of bbmap
	const unsigned int bitIndex  = pos % CHAR_BITS;	// which bit inside that char
	const int mask = 1<<bitIndex;

	const bool wasClear = (bbmap[byteIndex] & mask) == 0;
	if( wasClear )
	{
		bbmap[byteIndex] |= mask;
	}
	return wasClear;
}

/*
 * SDBM string hash of the NUL-terminated string @str,
 * reduced to a bit position in [0, MAP_SIZE).
 */
unsigned int bloom_filter::SDBMHash(const char *str)
{
	unsigned int acc = 0;

	for (const char *p = str; *p; ++p)
	{
		// 65599 = 2^16 + 2^6 - 1, i.e. the same as (acc << 6) + (acc << 16) - acc
		acc = acc * 65599u + (*p);
	}

	return (acc % MAP_SIZE);
}

/*
 * RS string hash of the NUL-terminated string @str,
 * reduced to a bit position in [0, MAP_SIZE).
 */
unsigned int bloom_filter::RSHash(const char *str)
{
	const unsigned int step = 378551;
	unsigned int factor = 63689;
	unsigned int acc = 0;

	for (const char *p = str; *p; ++p)
	{
		acc = acc * factor + (*p);
		// the multiplier itself changes after every character
		factor *= step;
	}

	return (acc % MAP_SIZE);
}

/*
 * JS string hash of the NUL-terminated string @str,
 * reduced to a bit position in [0, MAP_SIZE).
 */
unsigned int bloom_filter::JSHash(const char *str)
{
	unsigned int acc = 1315423911;

	for (const char *p = str; *p; ++p)
	{
		const unsigned int mix = (acc << 5) + (*p) + (acc >> 2);
		acc ^= mix;
	}

	return (acc % MAP_SIZE);
}

/*
 * ELF string hash of the NUL-terminated string @str,
 * reduced to a bit position in [0, MAP_SIZE).
 */
unsigned int bloom_filter::ELFHash(const char *str)
{
	unsigned int acc = 0;

	for (const char *p = str; *p; ++p)
	{
		acc = (acc << 4) + (*p);

		// when any of the bits selected by the mask is set, XOR them into
		// lower bits (shifted right by 24) and then clear them
		const unsigned int high = acc & 0xF0000000L;
		if (high != 0)
		{
			acc = (acc ^ (high >> 24)) & ~high;
		}
	}

	return (acc % MAP_SIZE);
}

/*
 * BKDR string hash of the NUL-terminated string @str,
 * reduced to a bit position in [0, MAP_SIZE).
 */
unsigned int bloom_filter::BKDRHash(const char *str)
{
	const unsigned int multiplier = 131; // other usual choices: 31 1313 13131 131313 ...
	unsigned int acc = 0;

	for (const char *p = str; *p; ++p)
	{
		acc = acc * multiplier + (*p);
	}

	return (acc % MAP_SIZE);
}

/*
 * DJB string hash of the NUL-terminated string @str,
 * reduced to a bit position in [0, MAP_SIZE).
 */
unsigned int bloom_filter::DJBHash(const char *str)
{
	unsigned int acc = 5381;

	for (const char *p = str; *p; ++p)
	{
		// acc + (acc << 5) is acc * 33
		acc = acc * 33u + (*p);
	}

	return (acc % MAP_SIZE);
}

/*
 * AP string hash of the NUL-terminated string @str,
 * reduced to a bit position in [0, MAP_SIZE).
 * Characters at even and odd indexes are mixed in with different formulas.
 */
unsigned int bloom_filter::APHash(const char *str)
{
	unsigned int acc = 0;
	bool evenIndex = true;	// index 0 counts as even

	for (const char *p = str; *p; ++p)
	{
		if (evenIndex)
		{
			acc ^= ((acc << 7) ^ (*p) ^ (acc >> 3));
		}
		else
		{
			acc ^= (~((acc << 11) ^ (*p) ^ (acc >> 5)));
		}
		evenIndex = !evenIndex;
	}

	return (acc % MAP_SIZE);
}

你可能感兴趣的:(bloom filter实现)