hash函数常用的是mod 素数,或者使用乘法策略,取某些位,这些策略直接影响到hash table的resize,如果是mod素数的话,只能按素数来递增,如果是取乘法方法,只能以2^p来递增。
参考文献[1][2]等给出了很多,常用的字符串hash函数,但更有价值的还包括下面这两个:
murmur hash[3][4],其中[3]无法直接访问,不过本文的例子中有具体的实现,更多murmur hash[8]。
city hash见参考文献[5]
hash table的构建通常使用list来解决冲突的问题,c++ tr1中就是这样的,tr1的实现中使用了基于策略(policy-based)的设计方法,详情见参考文献[6],目前没有仔细看。这个参考文献是在浏览源码(/usr/include/c++/4.4/tr1_impl/hashtable)时,在头注释中发现的。
resize的策略包括:
策略1:
1)全部重新copy一遍,
策略2:
1)resize时,使用2个hash table,当insert时,只向新的hash table中insert,同时将old hash table中的r个数据放入新的table;查找时,2个hash table同时查询
2)如果old hash table数据已经全部移动完毕,删除old hash table
上面两个策略的详细信息见参考文献[7]
说到hash需要提一下bloomfilter,它通过hash实现,好的hash函数可以使bloomfilter具有很好的性能。它的一个使用方法,就是如果要查询数据库,可以在数据库前加一个bloomfilter,如果没在bloomfilter中,就不用查询数据库了,因为bloomfilter返回false是不会有错误的。cityhash和murmurhash来实现bloomfilter应该是非常好的选择。
#include "basictypes.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <map>
#include <string>
#include <tr1/unordered_map>
#include <vector>

#include "cityhash/include/city.h"

// Seed handed to both Murmur hashes by the benchmark in main().
const uint32 kFingerPrintSeed = 19820125;

// Receives the folded result of every timed loop in main(). Because it is
// volatile, the optimizer cannot discard the hash calls as dead code.
static volatile uint64 g_benchmark_sink = 0;

// 64-bit Murmur hash for 64-bit platforms.
//
// key:  start of the buffer to hash
// len:  number of bytes in the buffer
// seed: value mixed into the initial state
// Returns the 64-bit hash of the buffer.
uint64 MurmurHash64A(const void* key, int len, uint32 seed) {
  const uint64 m = 0xc6a4a7935bd1e995;
  const int r = 47;

  uint64 h = seed ^ (len * m);

  const uint8* data = static_cast<const uint8*>(key);
  const uint8* end = data + (len / 8) * sizeof(uint64);

  // Consume the input one 64-bit word at a time. memcpy is used instead of
  // dereferencing a casted pointer so that unaligned keys are read safely.
  while (data != end) {
    uint64 k;
    memcpy(&k, data, sizeof(k));
    data += sizeof(k);

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Mix in the remaining 0..7 bytes; every case deliberately falls through.
  switch (len & 7) {
    case 7: h ^= static_cast<uint64>(data[6]) << 48;  // fall through
    case 6: h ^= static_cast<uint64>(data[5]) << 40;  // fall through
    case 5: h ^= static_cast<uint64>(data[4]) << 32;  // fall through
    case 4: h ^= static_cast<uint64>(data[3]) << 24;  // fall through
    case 3: h ^= static_cast<uint64>(data[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint64>(data[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint64>(data[0]);
            h *= m;
  }

  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}

// 32-bit Murmur hash.
//
// key:  start of the buffer to hash
// len:  number of bytes in the buffer
// seed: value mixed into the initial state
// Returns the 32-bit hash of the buffer.
uint32 MurmurHash32A(const void* key, int len, uint32 seed) {
  const uint32 m = 0x5bd1e995;
  const int r = 24;

  uint32 h = seed ^ (len * m);

  const uint8* data = static_cast<const uint8*>(key);

  // Consume the input one 32-bit word at a time (memcpy: alignment-safe load).
  while (len >= 4) {
    uint32 k;
    memcpy(&k, data, sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += sizeof(k);
    len -= 4;
  }

  // Handle the last few bytes of the input array; cases fall through.
  switch (len) {
    case 3: h ^= static_cast<uint32>(data[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint32>(data[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint32>(data[0]);
            h *= m;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}

// All string hashes below take a NUL-terminated string and return a value
// masked to 31 bits (except CRC_hash, which returns a 16-bit checksum).
// Unless noted, bytes are added as plain `char`, so the contribution of
// bytes >= 0x80 depends on whether `char` is signed on the platform.

/* A Simple Hash Function: hash = 31 * hash + byte, bytes read as unsigned. */
unsigned int simple_hash(const char* str) {
  unsigned int hash = 0;
  for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str);
       *p; ++p) {
    hash = 31 * hash + *p;
  }
  return (hash & 0x7FFFFFFF);
}

/* RS Hash Function: multiplier `a` is itself scaled by `b` after every byte. */
unsigned int RS_hash(const char* str) {
  unsigned int b = 378551;
  unsigned int a = 63689;
  unsigned int hash = 0;
  while (*str) {
    hash = hash * a + (*str++);
    a *= b;
  }
  return (hash & 0x7FFFFFFF);
}

/* JS Hash Function */
unsigned int JS_hash(const char* str) {
  unsigned int hash = 1315423911;
  while (*str) {
    hash ^= ((hash << 5) + (*str++) + (hash >> 2));
  }
  return (hash & 0x7FFFFFFF);
}

/* P. J. Weinberger Hash Function */
unsigned int PJW_hash(const char* str) {
  const unsigned int bits_in_uint =
      static_cast<unsigned int>(sizeof(unsigned int) * 8);
  const unsigned int three_quarters = (bits_in_uint * 3) / 4;
  const unsigned int one_eighth = bits_in_uint / 8;
  // Mask selecting the top `one_eighth` bits of an unsigned int.
  const unsigned int high_bits =
      static_cast<unsigned int>(0xFFFFFFFF) << (bits_in_uint - one_eighth);
  unsigned int hash = 0;
  unsigned int test = 0;
  while (*str) {
    hash = (hash << one_eighth) + (*str++);
    // When bits reach the top of the word, fold them back in and clear them.
    if ((test = hash & high_bits) != 0) {
      hash = ((hash ^ (test >> three_quarters)) & (~high_bits));
    }
  }
  return (hash & 0x7FFFFFFF);
}

/* ELF Hash Function */
unsigned int ELF_hash(const char* str) {
  unsigned int hash = 0;
  unsigned int x = 0;
  while (*str) {
    hash = (hash << 4) + (*str++);
    // Fold the top nibble back into the hash, then clear it.
    if ((x = hash & 0xF0000000L) != 0) {
      hash ^= (x >> 24);
      hash &= ~x;
    }
  }
  return (hash & 0x7FFFFFFF);
}

/* BKDR Hash Function */
unsigned int BKDR_hash(const char* str) {
  unsigned int seed = 131;  // 31 131 1313 13131 131313 etc..
  unsigned int hash = 0;
  while (*str) {
    hash = hash * seed + (*str++);
  }
  return (hash & 0x7FFFFFFF);
}

/* SDBM Hash Function */
unsigned int SDBM_hash(const char* str) {
  unsigned int hash = 0;
  while (*str) {
    hash = (*str++) + (hash << 6) + (hash << 16) - hash;
  }
  return (hash & 0x7FFFFFFF);
}

/* DJB Hash Function */
unsigned int DJB_hash(const char* str) {
  unsigned int hash = 5381;
  while (*str) {
    hash += (hash << 5) + (*str++);
  }
  return (hash & 0x7FFFFFFF);
}

/* AP Hash Function: alternates between two mixing steps on even/odd bytes. */
unsigned int AP_hash(const char* str) {
  unsigned int hash = 0;
  for (int i = 0; *str; i++) {
    if ((i & 1) == 0) {
      hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));
    } else {
      hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));
    }
  }
  return (hash & 0x7FFFFFFF);
}

/*
 * CRC Hash Function.
 *
 * Adds the string up as sequential 16-bit words in host byte order, folds
 * the carries back into the low 16 bits and returns the one's complement of
 * the result, so the returned value always fits in 16 bits.
 */
unsigned int CRC_hash(const char* str) {
  size_t nleft = strlen(str);
  unsigned long long sum = 0;
  unsigned short answer = 0;

  /*
   * Add sequential 16 bit words to the accumulator (sum). memcpy is used for
   * the loads because the string is not guaranteed to be 2-byte aligned.
   */
  while (nleft > 1) {
    unsigned short word;
    memcpy(&word, str, sizeof(word));
    sum += word;
    str += sizeof(word);
    nleft -= 2;
  }

  /*
   * mop up an odd byte, if necessary: it lands in the first byte (in memory
   * order) of an otherwise zero 16-bit word.
   */
  if (nleft == 1) {
    memcpy(&answer, str, 1);
    sum += answer;
  }

  /*
   * add back carry outs from top 16 bits to low 16 bits
   * add hi 16 to low 16
   */
  sum = (sum >> 16) + (sum & 0xFFFF);
  /* add carry */
  sum += (sum >> 16);
  /* truncate to 16 bits */
  answer = static_cast<unsigned short>(~sum);
  return (answer & 0xFFFFFFFF);
}

// Converts the magnitude of `value` to base-36 text using the digits 0-9a-z.
//
// Digits are emitted least-significant first (the string is NOT reversed);
// the benchmark only needs random-looking keys. The base is the number of
// digit characters, excluding the array's terminating NUL, so the result
// never contains an embedded '\0'.
std::string Itoa(int value) {
  static const char kDigits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
  const unsigned int kBase = sizeof(kDigits) - 1;  // 36: skip the NUL

  // Take the magnitude in unsigned arithmetic so INT_MIN does not overflow.
  unsigned int magnitude = static_cast<unsigned int>(value);
  if (value < 0) {
    magnitude = 0u - magnitude;
  }

  std::string res;
  do {
    res += kDigits[magnitude % kBase];
    magnitude /= kBase;
  } while (magnitude > 0);
  return res;
}

// Returns the number of microseconds elapsed since the first call to
// GetTime() (the first call returns 0).
//
// Measuring from a process-local origin keeps the value inside the range of
// `int` for roughly 35 minutes, so `GetTime() - start` cannot overflow the
// way a truncated microseconds-since-epoch value can. Not thread-safe.
int GetTime() {
  static timeval origin;
  static bool has_origin = false;

  timeval now;
  gettimeofday(&now, NULL);
  if (!has_origin) {
    origin = now;
    has_origin = true;
  }

  const long long elapsed_us =
      static_cast<long long>(now.tv_sec - origin.tv_sec) * 1000000 +
      (now.tv_usec - origin.tv_usec);
  return static_cast<int>(elapsed_us);
}

// Hash functor for the unordered_map benchmark: CityHash64 over the string.
class StringHash {
 public:
  uint64 operator()(const std::string& s) const {
    return CityHash64(s.c_str(), s.size());
    // return MurmurHash64A(s.c_str(), s.size(), kFingerPrintSeed) % (unsigned int) 0xFFFFFFFF;
  }
};

// Equality functor for the unordered_map benchmark.
class StringEqual {
 public:
  bool operator()(const std::string& left, const std::string& right) const {
    return left == right;
  }
};

// Benchmark driver: builds kDataSize random keys, then prints the elapsed
// microseconds for each hash function and for insert/search on a
// CityHash-based tr1::unordered_map versus std::map.
int main(int argc, char** argv) {
  const int kDataSize = 1000000;

  // Each key is the concatenation of ten Itoa(rand()) fragments.
  std::vector<std::string> data;
  data.reserve(kDataSize);
  for (int i = 0; i < kDataSize; ++i) {
    std::string content;
    for (int j = 0; j < 10; ++j) {
      content += Itoa(rand());
    }
    data.push_back(content);
  }

  // Every loop folds its results into `acc` and publishes it through the
  // volatile sink so the measured work cannot be optimized away.
  uint64 acc = 0;

  // murmur test
  int start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc ^= MurmurHash64A(data[i].c_str(), data[i].size(), kFingerPrintSeed);
  }
  g_benchmark_sink = acc;
  printf("murmur64: %d\n", GetTime() - start);

  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc ^= MurmurHash32A(data[i].c_str(), data[i].size(), kFingerPrintSeed);
  }
  g_benchmark_sink = acc;
  printf("murmur32:%d\n", GetTime() - start);

  // simple hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc ^= simple_hash(data[i].c_str());
  }
  g_benchmark_sink = acc;
  printf("simple hash:%d\n", GetTime() - start);

  // bkdr hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc ^= BKDR_hash(data[i].c_str());
  }
  g_benchmark_sink = acc;
  printf("bkdr hash:%d\n", GetTime() - start);

  // AP hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc ^= AP_hash(data[i].c_str());
  }
  g_benchmark_sink = acc;
  printf("AP hash:%d\n", GetTime() - start);

  // City hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc ^= CityHash64(data[i].c_str(), data[i].size());
  }
  g_benchmark_sink = acc;
  printf("city hash:%d\n", GetTime() - start);

  typedef std::tr1::unordered_map<std::string, int, StringHash, StringEqual>
      CityHashMap;
  typedef std::map<std::string, int> TreeMap;

  // City hash insert
  CityHashMap my_map_city;
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    my_map_city[data[i]] = i;
  }
  printf("city hash insert:%d\n", GetTime() - start);

  // map insert
  TreeMap my_map_tree;
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    my_map_tree[data[i]] = i;
  }
  printf("tree map insert:%d\n", GetTime() - start);

  // City hash search (every key was inserted above, so operator[] only reads)
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc += my_map_city[data[i]];
  }
  g_benchmark_sink = acc;
  printf("city hash search:%d\n", GetTime() - start);

  // map search
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    acc += my_map_tree[data[i]];
  }
  g_benchmark_sink = acc;
  printf("tree map search:%d\n", GetTime() - start);

  return 0;
}
参考文献
[1]http://blog.csdn.net/liuben/article/details/5050697
[2]http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html
[3]http://sites.google.com/site/murmurhash/
[4]http://blog.csdn.net/wisage/article/details/7104866
[5]http://code.google.com/p/cityhash/
[6]http://gcc.gnu.org/onlinedocs/libstdc++/ext/pb_ds/index.html
[7]http://en.wikipedia.org/wiki/Hash_table
[8]http://en.wikipedia.org/wiki/MurmurHash
[9]http://hi.baidu.com/fdwm_lx/blog/item/f670e73582c8411d90ef3950.html
[10]http://www.cnblogs.com/Frandy/archive/2011/07/26/Hash_map_Unordered_map.html