(一)原理
布隆过滤器的原理实际上就是位图和哈希表的互补:位图省空间,哈希表省时间。插入K(某字符串)时,用多个哈希函数把K映射到位图中的若干位并全部置1;查询时只要其中有一位为0,K就一定不存在,若全部为1,则K可能存在(存在一定的误判率)。这样便可以在位图中查询K的存在与否。
(二)代码实现如下
#pragma once
#include <climits>
#include <cstddef>
#include <iostream>
#include <vector>

/// Fixed-capacity bitmap that also tracks how many bits are currently set.
///
/// Bits are packed into std::size_t words using the full width of the word,
/// so the layout is correct whether size_t is 32 or 64 bits wide.
/// Capacity is rounded up to a whole number of words; positions beyond the
/// allocated words are rejected with std::out_of_range instead of touching
/// memory outside the vector.
class BitMap
{
public:
    /// Creates a bitmap with no storage. Call Resize() before setting bits.
    BitMap()
        : _size(0)
    {
    }

    /// Creates a bitmap able to hold at least the positions 0..size.
    /// The "+ 1" guarantees at least one word even when size is smaller
    /// than the number of bits in a word.
    BitMap(std::size_t size)
        : _arrays(size / kBitsPerWord + 1, 0)
        , _size(0)
    {
    }

    /// Sets bit `num`.
    /// @return true if the bit was newly set, false if it was already set
    ///         (so the set-bit counter is only bumped once per position).
    /// @throws std::out_of_range if `num` lies beyond the allocated words.
    bool Set(std::size_t num)
    {
        std::size_t &word = WordFor(num);
        const std::size_t mask = MaskFor(num);
        if (word & mask)
        {
            return false;
        }
        word |= mask;
        ++_size;
        return true;
    }

    /// Clears bit `num`.
    /// @return true if the bit was set and has been cleared, false if it
    ///         was already clear.
    /// @throws std::out_of_range if `num` lies beyond the allocated words.
    bool ReSet(std::size_t num)
    {
        std::size_t &word = WordFor(num);
        const std::size_t mask = MaskFor(num);
        if (!(word & mask))
        {
            return false;
        }
        word &= ~mask;
        --_size;
        return true;
    }

    /// Reports whether bit `num` is set.
    /// @throws std::out_of_range if `num` lies beyond the allocated words.
    bool Test(std::size_t num) const
    {
        return (_arrays.at(num / kBitsPerWord) & MaskFor(num)) != 0;
    }

    /// Number of bits currently set.
    std::size_t Count() const
    {
        return _size;
    }

    /// Clears every bit while keeping the current capacity.
    void Clear()
    {
        _arrays.assign(_arrays.size(), 0);
        _size = 0;
    }

    /// Changes the capacity so that positions 0..size are addressable.
    /// Bits in words that survive the resize keep their value; the set-bit
    /// counter is recomputed because shrinking may discard set bits.
    void Resize(std::size_t size)
    {
        _arrays.resize(size / kBitsPerWord + 1);
        _size = 0;
        for (std::size_t i = 0; i < _arrays.size(); ++i)
        {
            _size += CountSetBits(_arrays[i]);
        }
    }

protected:
    std::vector<std::size_t> _arrays; // packed bit storage
    std::size_t _size;                // number of bits currently set

private:
    // Number of bit positions stored in one element of _arrays.
    static const std::size_t kBitsPerWord = sizeof(std::size_t) * CHAR_BIT;

    // Mask selecting `num`'s bit inside its word. The literal 1 is widened
    // to size_t first so the shift never happens in (signed) int.
    static std::size_t MaskFor(std::size_t num)
    {
        return static_cast<std::size_t>(1) << (num % kBitsPerWord);
    }

    // Bounds-checked access to the word holding `num`.
    std::size_t &WordFor(std::size_t num)
    {
        return _arrays.at(num / kBitsPerWord);
    }

    // Counts the 1-bits in `word` by repeatedly clearing the lowest one.
    static std::size_t CountSetBits(std::size_t word)
    {
        std::size_t count = 0;
        while (word != 0)
        {
            word &= word - 1;
            ++count;
        }
        return count;
    }
};

/// Demo: set a few bits, print their state, clear two of them, print again.
/// Declared inline because this function is defined in a header.
inline void Test1()
{
    BitMap bm(65);
    bm.Set(1);
    bm.Set(4);
    bm.Set(33);
    std::cout << "1?" << bm.Test(1) << '\n';
    std::cout << "2?" << bm.Test(2) << '\n';
    std::cout << "4?" << bm.Test(4) << '\n';
    std::cout << "33?" << bm.Test(33) << '\n';
    bm.ReSet(33);
    bm.ReSet(4);
    std::cout << "1?" << bm.Test(1) << '\n';
    std::cout << "2?" << bm.Test(2) << '\n';
    std::cout << "4?" << bm.Test(4) << '\n';
    std::cout << "33?" << bm.Test(33) << std::endl;
}

/// Demo: a bitmap covering every 32-bit value (about 512 MiB of storage).
/// The bound is spelled out explicitly; passing -1 would request the whole
/// size_t range, which cannot be allocated when size_t is 64 bits wide.
inline void Test2()
{
    BitMap bm(static_cast<std::size_t>(0xFFFFFFFFul));
    bm.Set(10000000);
}

// Bloom.h
#include <cstddef>
#include <string>

#include "BitMap.h"

/// Bloom filter for strings.
///
/// Each key is hashed with five different string hash functions; every hash,
/// reduced modulo the filter capacity, selects one bit in the underlying
/// BitMap. A key is reported as present only if all five bits are set, so a
/// negative answer is definite while a positive answer may be a false
/// positive. Hashing stops at the first NUL character of the key.
class Bloom
{
private:
    typedef std::size_t (*HashFn)(const char *);

    // Number of hash functions applied to every key.
    static const std::size_t kHashCount = 5;

    BitMap Map;            // bit storage
    std::size_t _capacity; // number of addressable bits (a prime)

public:
    /// Builds a filter whose capacity is the first tabulated prime greater
    /// than `size`.
    Bloom(std::size_t size)
        : _capacity(_GetNextPrime(size))
    {
        Map.Resize(_capacity);
    }

    /// Inserts a NUL-terminated key. Taken by value so that string literals,
    /// `char*` variables and temporaries are all accepted. A null pointer is
    /// ignored.
    void Set(const char *key)
    {
        if (!key)
        {
            return;
        }
        MarkAll(key);
    }

    /// Inserts a key given as std::string.
    void Set(const std::string &key)
    {
        MarkAll(key.c_str());
    }

    /// Returns false if `key` was definitely never inserted, true if it may
    /// have been. Hashes are evaluated one at a time and the check stops at
    /// the first bit found clear.
    bool IsIn(const std::string &key)
    {
        const HashFn *hashes = HashTable();
        const char *text = key.c_str();
        for (std::size_t i = 0; i < kHashCount; ++i)
        {
            if (!Map.Test(hashes[i](text) % _capacity))
            {
                return false;
            }
        }
        return true;
    }

private:
    // The five hash functions, in the order they are applied.
    static const HashFn *HashTable()
    {
        static const HashFn table[kHashCount] = {
            &Bloom::BKDRHash, &Bloom::SDBMHash, &Bloom::RSHash,
            &Bloom::APHash, &Bloom::JSHash
        };
        return table;
    }

    // Sets the bit selected by each hash of `text`.
    void MarkAll(const char *text)
    {
        const HashFn *hashes = HashTable();
        for (std::size_t i = 0; i < kHashCount; ++i)
        {
            Map.Set(hashes[i](text) % _capacity);
        }
    }

    // BKDR hash: multiply-and-add with seed 131, masked to 31 bits.
    static std::size_t BKDRHash(const char *str)
    {
        unsigned int seed = 131; // 31 131 1313 13131 131313
        unsigned int hash = 0;
        while (*str)
        {
            hash = hash * seed + (*str++);
        }
        return (hash & 0x7FFFFFFF);
    }

    // SDBM hash: hash = 65599 * hash + ch.
    static std::size_t SDBMHash(const char *str)
    {
        std::size_t hash = 0;
        while (std::size_t ch = static_cast<std::size_t>(*str++))
        {
            hash = 65599 * hash + ch;
        }
        return hash;
    }

    // RS hash: multiply-and-add with a multiplier that changes per character.
    static std::size_t RSHash(const char *str)
    {
        std::size_t hash = 0;
        std::size_t magic = 63689;
        while (std::size_t ch = static_cast<std::size_t>(*str++))
        {
            hash = hash * magic + ch;
            magic *= 378551;
        }
        return hash;
    }

    // AP hash: alternates between two mixing steps on even and odd positions.
    static std::size_t APHash(const char *str)
    {
        std::size_t hash = 0;
        long i = 0;
        while (std::size_t ch = static_cast<std::size_t>(*str++))
        {
            if ((i & 1) == 0)
            {
                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
            }
            else
            {
                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
            }
            ++i;
        }
        return hash;
    }

    // JS hash: shift-add-xor mixing; returns 0 for the empty string.
    static std::size_t JSHash(const char *str)
    {
        if (!*str)
        {
            return 0;
        }
        std::size_t hash = 1315423911;
        while (std::size_t ch = static_cast<std::size_t>(*str++))
        {
            hash ^= ((hash << 5) + ch + (hash >> 2));
        }
        return hash;
    }

protected:
    /// Returns the first tabulated prime strictly greater than `num`.
    /// When `num` is at least as large as the biggest entry, the biggest
    /// entry is returned rather than reading past the end of the table.
    unsigned long _GetNextPrime(unsigned long num)
    {
        static const std::size_t kPrimeCount = 28;
        static const unsigned long kPrimes[kPrimeCount] = {
            53ul, 97ul, 193ul, 389ul, 769ul,
            1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
            49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
            1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
            50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
            1610612741ul, 3221225473ul, 4294967291ul
        };
        for (std::size_t pos = 0; pos < kPrimeCount; ++pos)
        {
            if (kPrimes[pos] > num)
            {
                return kPrimes[pos];
            }
        }
        return kPrimes[kPrimeCount - 1];
    }
};
# include<iostream> using namespace std; # include"Bloom.h" int main() { Bloom b(90); char *p = "afshrajsys"; b.Set(p); cout << b.IsIn(p) << endl; cout << b.IsIn("qq")<<endl; }