[置顶] 布隆过滤器

(一)原理

     布隆过滤器的原理实际上就是位图和哈希表的互补:位图省空间,哈希表省时间。用多个哈希函数把K(某字符串)映射到位图中的若干位,这样便可以在位图中查询K的存在与否:只要有一位为0,K一定不存在;所有位都为1,K可能存在(存在误判)。

(二)代码实现如下

BitMap.h


#pragma once

#include <vector>

// Growable bitmap that records one presence bit per non-negative integer.
//
// Storage is a vector of size_t words, but only the low 32 bits of each word
// are used: bit `num` lives in word (num >> 5) at position (num % 32).
// `_size` tracks how many bits are currently set.
//
// Bits outside the allocated range are never touched: Set/ReSet return false
// for them and Test reports them as not set.
class BitMap
{
public:
	// Creates a bitmap with no storage; call Resize() before setting bits.
	BitMap()
		:_size(0)
	{}

	// Creates a bitmap that can hold at least `size` bits, all cleared.
	// The +1 guarantees at least one word even when size < 32.
	BitMap(size_t size)
		:_size(0)
	{
		_arrays.resize((size >> 5) + 1);
	}

	// Sets bit `num`.
	// Returns true if the bit was newly set; false if it was already set
	// (so `_size` is not counted twice) or if `num` is outside the storage.
	bool Set(size_t num)
	{
		const size_t index = num >> 5;   // which word (num / 32)
		if (index >= _arrays.size())
		{
			return false;
		}
		const size_t mask = BitMask(num);

		if (_arrays[index] & mask)
		{
			return false;
		}

		_arrays[index] |= mask;
		++_size;
		return true;
	}

	// Clears bit `num`.
	// Returns true if the bit was set and is now cleared; false if it was
	// already clear or if `num` is outside the storage.
	bool ReSet(size_t num)
	{
		const size_t index = num >> 5;
		if (index >= _arrays.size())
		{
			return false;
		}
		const size_t mask = BitMask(num);

		if ((_arrays[index] & mask) == 0)
		{
			return false;
		}

		_arrays[index] &= ~mask;
		--_size;
		return true;
	}

	// Returns true if bit `num` is set; false if it is clear or out of range.
	bool Test(size_t num) const
	{
		const size_t index = num >> 5;
		if (index >= _arrays.size())
		{
			return false;
		}
		return (_arrays[index] & BitMask(num)) != 0;
	}

	// Clears every bit while keeping the current capacity.
	void Clear()
	{
		_arrays.assign(_arrays.size(), 0);
		_size = 0;   // no bits remain set
	}

	// Changes the capacity to at least `size` bits. Existing bits that still
	// fit are preserved; newly added words start cleared.
	void Resize(size_t size)
	{
		const size_t oldWords = _arrays.size();
		_arrays.resize((size >> 5) + 1);
		if (_arrays.size() < oldWords)
		{
			// Shrinking may have dropped set bits, so recount.
			_size = CountSetBits();
		}
	}

protected:
	std::vector<size_t> _arrays;   // bit storage, 32 bits used per word
	size_t _size;                  // number of bits currently set

private:
	// Mask selecting the position of `num` inside its word. The shift is done
	// on size_t so that position 31 does not go through a signed int.
	static size_t BitMask(size_t num)
	{
		return static_cast<size_t>(1) << (num % 32);
	}

	// Counts the set bits across all words.
	size_t CountSetBits() const
	{
		size_t total = 0;
		for (size_t i = 0; i < _arrays.size(); ++i)
		{
			// word &= word - 1 clears the lowest set bit each iteration.
			for (size_t word = _arrays[i]; word != 0; word &= word - 1)
			{
				++total;
			}
		}
		return total;
	}
};

// Smoke test for BitMap: sets bits 1, 4 and 33, prints the state of bits
// 1, 2, 4 and 33, then clears 33 and 4 and prints the same four bits again.
void Test1()
{
	static const char* const labels[] = { "1?", "2?", "4?", "33?" };
	static const size_t probes[] = { 1, 2, 4, 33 };
	const size_t probeCount = sizeof(probes) / sizeof(probes[0]);

	BitMap bm(65);
	bm.Set(1);
	bm.Set(4);
	bm.Set(33);

	for (int round = 0; round < 2; ++round)
	{
		if (round == 1)
		{
			// Second round: clear two of the bits before printing again.
			bm.ReSet(33);
			bm.ReSet(4);
		}

		for (size_t i = 0; i < probeCount; ++i)
		{
			std::cout << labels[i] << bm.Test(probes[i]) << std::endl;
		}
	}
}

// Builds a bitmap sized for the largest size_t value and sets one bit in it.
void Test2()
{
	// -1 converted to size_t is the maximum representable value.
	const size_t everyBit = static_cast<size_t>(-1);
	const size_t probe = 10000000;

	BitMap bm(everyBit);
	bm.Set(probe);
}
Bloom.h

# include "BitMap.h"
// Bloom filter for strings built on top of BitMap.
//
// Every key is hashed with five different string hashes; each hash, reduced
// modulo `_capacity`, selects one bit in the bitmap. Set() turns those five
// bits on and IsIn() checks them. Because different keys can select the same
// bits, IsIn() returning true only means the key may have been added, while
// false means at least one of its bits is clear.
class Bloom
{
private:
	BitMap Map;         // bit storage
	size_t _capacity;   // number of addressable bits (a prime from _GetNextPrime)
public:
	// Creates a filter whose bit count is the first table prime greater than
	// `size` (or the largest table prime if `size` exceeds them all).
	Bloom(size_t size)
		: _capacity(_GetNextPrime(size))
	{
		Map.Resize(_capacity);
	}

	// Adds a NUL-terminated string to the filter. `key` must not be null.
	void Set(const char *key)
	{
		_SetBits(BKDRHash(key), SDBMHash(key), RSHash(key), APHash(key), JSHash(key));
	}

	// Adds a string to the filter. Hashing uses key.c_str(), so only the
	// characters before the first NUL contribute.
	void Set(const std::string & key)
	{
		_SetBits(HashFunc1()(key), HashFunc2()(key), HashFunc3()(key),
			HashFunc4()(key), HashFunc5()(key));
	}

	// Returns false if at least one of the key's five bits is clear;
	// returns true if all five are set. Hashes are evaluated lazily, so
	// checking stops at the first clear bit.
	bool IsIn(const std::string & key)
	{
		return Map.Test(HashFunc1()(key) % _capacity)
			&& Map.Test(HashFunc2()(key) % _capacity)
			&& Map.Test(HashFunc3()(key) % _capacity)
			&& Map.Test(HashFunc4()(key) % _capacity)
			&& Map.Test(HashFunc5()(key) % _capacity);
	}
private:
	// Turns on the bit selected by each of the five hash values.
	void _SetBits(size_t h1, size_t h2, size_t h3, size_t h4, size_t h5)
	{
		Map.Set(h1 % _capacity);
		Map.Set(h2 % _capacity);
		Map.Set(h3 % _capacity);
		Map.Set(h4 % _capacity);
		Map.Set(h5 % _capacity);
	}

	// BKDR string hash; the result is masked to 31 bits.
	static size_t BKDRHash(const char *str)
	{
		unsigned int seed = 131; // 31 131 1313 13131 131313
		unsigned int hash = 0;
		while (*str)
		{
			hash = hash * seed + (*str++);
		}

		return (hash & 0x7FFFFFFF);
	}

	// SDBM string hash (multiplier 65599).
	static size_t SDBMHash(const char *str)
	{
		size_t hash = 0;
		while (size_t ch = static_cast<size_t>(*str++))
		{
			hash = 65599 * hash + ch;
		}
		return hash;
	}

	// RS string hash; the multiplier itself changes after every character.
	static size_t RSHash(const char *str)
	{
		size_t hash = 0;
		size_t magic = 63689;
		while (size_t ch = static_cast<size_t>(*str++))
		{
			hash = hash * magic + ch;
			magic *= 378551;
		}
		return hash;
	}

	// AP string hash; even and odd character positions use different mixing.
	static size_t APHash(const char *str)
	{
		size_t hash = 0;
		bool evenPosition = true;
		while (size_t ch = static_cast<size_t>(*str++))
		{
			if (evenPosition)
			{
				hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
			}
			else
			{
				hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
			}
			evenPosition = !evenPosition;
		}
		return hash;
	}

	// JS string hash; the empty string hashes to 0.
	static size_t JSHash(const char *str)
	{
		if (!*str)
			return 0;

		size_t hash = 1315423911;
		while (size_t ch = static_cast<size_t>(*str++))
		{
			hash ^= ((hash << 5) + ch + (hash >> 2));
		}
		return hash;
	}

	// Functor wrappers that apply each hash to a std::string.
	struct HashFunc1
	{
		size_t operator()(const std::string& key) const
		{
			return BKDRHash(key.c_str());
		}
	};

	struct HashFunc2
	{
		size_t operator()(const std::string & key) const
		{
			return SDBMHash(key.c_str());
		}
	};

	struct HashFunc3
	{
		size_t operator()(const std::string & key) const
		{
			return RSHash(key.c_str());
		}
	};

	struct HashFunc4
	{
		size_t operator()(const std::string & key) const
		{
			return APHash(key.c_str());
		}
	};

	struct HashFunc5
	{
		size_t operator()(const std::string & key) const
		{
			return JSHash(key.c_str());
		}
	};

protected:
	// Returns the first prime in the table that is strictly greater than
	// `num`. If no table entry is greater, returns the largest entry instead
	// of reading past the end of the table.
	static unsigned long _GetNextPrime(unsigned long num)
	{
		const size_t _PrimeSize = 28;
		static const unsigned long _PrimeList[_PrimeSize] =
		{
			53ul, 97ul, 193ul, 389ul, 769ul,
			1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
			49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
			1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
			50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
			1610612741ul, 3221225473ul, 4294967291ul
		};
		for (size_t pos = 0; pos < _PrimeSize; ++pos)
		{
			if (_PrimeList[pos] > num)
			{
				return _PrimeList[pos];
			}
		}
		return _PrimeList[_PrimeSize - 1];
	}

};


# include<iostream>
using namespace std;
# include"Bloom.h"
// Demo: adds one key to a Bloom filter, then queries that key and a second
// key that was never added, printing each IsIn() result.
int main()
{
	Bloom b(90);
	// A string literal must be bound to a pointer-to-const.
	const char *p = "afshrajsys";
	b.Set(p);
	cout << b.IsIn(p) << endl;
	cout << b.IsIn("qq")<<endl;
	return 0;
}



你可能感兴趣的:(布隆过滤器)