关于Hash的总结

hash函数

  hash函数常用的是mod 素数,或者使用乘法策略,取某些位,这些策略直接影响到hash table的resize,如果是mod素数的话,只能按素数来递增,如果是取乘法方法,只能以2^p来递增。

  参考文献[1][2]等给出了很多,常用的字符串hash函数,但更有价值的还包括下面这两个:

  murmur hash[3][4],其中[3]无法直接访问,不过本文的例子中有具体的实现,更多murmur hash[8]。

  city hash见参考文献[5]

hash table的构建

  hash table的构建通常使用list(链表)来解决冲突的问题,c++ tr1中就是这样的。tr1的实现中使用了基于策略(policy-based)的设计方法,详情见参考文献[6],目前没有仔细看。这个参考文献是在浏览源码(/usr/include/c++/4.4/tr1_impl/hashtable)时,在头注释中发现的。

resize问题

  resize的策略包括:

策略1:

 1)全部重新copy一遍,

策略2:

  1)resize时,使用2个hash table:insert时,只向新的hash table中insert,同时将old hash table中的r个数据放入新的table;查找时,2个hash table同时查询。

  2)如果old hash table数据已经全部移动完毕,删除old hash table

上面两个策略的详细信息见参考文献[7]。

bloomFilter

 说到hash需要提一下bloomfilter,它通过hash实现,好的hash函数可以使bloomfilter具有很好的性能。它的一个使用方法,就是如果要查询数据库,可以在数据库前加一个bloomfilter,如果没在bloomfilter中,就不用查询数据库了,因为bloomfilter返回false是不会有错误的。用cityhash和murmurhash来实现bloomfilter应该是非常好的选择。

consistent hash(一致性hash)

  consistent hash主要用于分布式系统,当增加或删除一台主机时,不会造成严重的抖动,因为它的策略只会导致相邻的一台主机rehash,因此影响比较小。参考文献[9]讲的也挺清楚的,值得参考。

C++中的hashtable

  C++ tr1中的hashtable实现是unordered_map,本文的例子和参考文献[10],有简单的演示,我也看了部分unordered_map的实现,内部是hash_table实现的。其中参考文献[10]的参考文献给了更多有价值的参考信息。

本文的计算性能比较

代码如下:
#include "basictypes.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <map>
#include <string>
#include <tr1/unordered_map>
#include <vector>

#include "cityhash/include/city.h"
// Seed passed to the Murmur hash functions by the benchmark in this file.
const uint32 kFingerPrintSeed = 19820125;

// MurmurHash64A: 64-bit Murmur hash for 64-bit platforms.
//
//   key  - first byte of the buffer to hash; any alignment is accepted
//   len  - number of bytes to hash (expected to be non-negative; negative
//          lengths are not handled)
//   seed - value mixed into the initial state
//
// Returns the 64-bit hash. Whole 8-byte words are loaded in host byte order,
// so the result for a given buffer depends on the machine's endianness.
uint64 MurmurHash64A(const void* key, int len, uint32 seed) {
  const uint64 m = 0xc6a4a7935bd1e995;
  const int r = 47;

  uint64 h = seed ^ (len * m);

  const uint8* cursor = static_cast<const uint8*>(key);
  const uint8* const body_end = cursor + (len / 8) * 8;

  // Body: consume the input eight bytes at a time. memcpy is used instead of
  // dereferencing a casted uint64 pointer so that unaligned buffers (such as
  // std::string storage) are read without undefined behaviour.
  for (; cursor != body_end; cursor += 8) {
    uint64 k;
    memcpy(&k, cursor, sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Tail: fold in the remaining 0..7 bytes. Every case deliberately falls
  // through to the ones below it; the final multiply only runs when at least
  // one tail byte exists.
  switch (len & 7) {
    case 7: h ^= static_cast<uint64>(cursor[6]) << 48;  // fall through
    case 6: h ^= static_cast<uint64>(cursor[5]) << 40;  // fall through
    case 5: h ^= static_cast<uint64>(cursor[4]) << 32;  // fall through
    case 4: h ^= static_cast<uint64>(cursor[3]) << 24;  // fall through
    case 3: h ^= static_cast<uint64>(cursor[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint64>(cursor[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint64>(cursor[0]);
            h *= m;
  }

  // Final avalanche.
  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}

// MurmurHash32A: 32-bit Murmur hash.
//
//   key  - first byte of the buffer to hash; any alignment is accepted
//   len  - number of bytes to hash
//   seed - value mixed into the initial state
//
// Returns the 32-bit hash. Whole 4-byte words are loaded in host byte order,
// so the result for a given buffer depends on the machine's endianness.
uint32 MurmurHash32A(const void* key, int len, uint32 seed) {
  const uint32 m = 0x5bd1e995;
  const int r = 24;

  uint32 h = seed ^ (len * m);

  const uint8* cursor = static_cast<const uint8*>(key);

  // Body: mix four bytes at a time. memcpy replaces the original casted
  // pointer dereference, which was an unaligned, const-dropping load.
  while (len >= 4) {
    uint32 k;
    memcpy(&k, cursor, sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    cursor += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array. The cases intentionally
  // fall through; the multiply only runs when at least one tail byte exists.
  switch (len) {
    case 3: h ^= static_cast<uint32>(cursor[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint32>(cursor[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint32>(cursor[0]);
            h *= m;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}

/*
 * A Simple Hash Function.
 *
 * Folds every byte of the NUL-terminated string `str` into the hash as
 * hash = 31 * hash + byte, treating bytes as unsigned. Returns the result
 * with the top bit cleared, i.e. a value in [0, 0x7FFFFFFF].
 *
 * The `register` storage class used previously was dropped: it is no longer
 * valid in C++17 and never affected the result.
 */
unsigned int simple_hash(char *str)
{
	unsigned int hash = 0;

	for (const unsigned char *p = reinterpret_cast<const unsigned char *>(str); *p; p++)
		hash = 31 * hash + *p;

	return (hash & 0x7FFFFFFF);
}

/*
 * RS Hash Function.
 *
 * Walks the NUL-terminated string `str`; each character is added to the
 * running hash after the hash has been scaled by a multiplier, and the
 * multiplier itself is rescaled after every character. Returns the hash
 * with the top bit cleared.
 */
unsigned int RS_hash(char *str)
{
    const unsigned int step = 378551;
    unsigned int multiplier = 63689;
    unsigned int acc = 0;

    for (; *str != '\0'; ++str)
    {
        acc = acc * multiplier + *str;
        multiplier *= step;
    }

    return acc & 0x7FFFFFFF;
}

/*
 * JS Hash Function.
 *
 * Starts from a fixed initial value and, for every character of the
 * NUL-terminated string `str`, XORs the hash with
 * (hash << 5) + character + (hash >> 2). Returns the hash with the top bit
 * cleared.
 */
unsigned int JS_hash(char *str)
{
    unsigned int h = 1315423911;

    for (const char *p = str; *p != '\0'; ++p)
    {
        const unsigned int mixed = (h << 5) + *p + (h >> 2);
        h ^= mixed;
    }

    return h & 0x7FFFFFFF;
}

/*
 * P. J. Weinberger Hash Function.
 *
 * Shifts the hash left by one eighth of the width of unsigned int and adds
 * the next character. Whenever any of the top one-eighth bits become set,
 * they are XOR-ed back in lower down (shifted right by three quarters of the
 * width) and then cleared. Returns the hash with the top bit cleared.
 */
unsigned int PJW_hash(char *str)
{
    const unsigned int width          = (unsigned int)(sizeof(unsigned int) * 8);
    const unsigned int three_quarters = (unsigned int)((width * 3) / 4);
    const unsigned int one_eighth     = (unsigned int)(width / 8);
    const unsigned int high_mask      = (unsigned int)(0xFFFFFFFF) << (width - one_eighth);

    unsigned int h = 0;

    for (; *str; ++str)
    {
        h = (h << one_eighth) + *str;

        const unsigned int overflow = h & high_mask;
        if (overflow == 0)
        {
            continue;
        }
        h = (h ^ (overflow >> three_quarters)) & ~high_mask;
    }

    return h & 0x7FFFFFFF;
}

/*
 * ELF Hash Function.
 *
 * For each character of the NUL-terminated string `str`: shift the hash left
 * by four bits and add the character; if any of the top four bits are now
 * set, XOR them back in shifted right by 24 and clear them. Returns the hash
 * with the top bit cleared.
 */
unsigned int ELF_hash(char *str)
{
    unsigned int h = 0;

    for (const char *p = str; *p; ++p)
    {
        h = (h << 4) + *p;

        const unsigned int top = h & 0xF0000000u;
        if (top != 0)
        {
            h ^= top >> 24;
            h &= ~top;
        }
    }

    return h & 0x7FFFFFFF;
}

/*
 * BKDR Hash Function.
 *
 * Polynomial hash over the NUL-terminated string `str`:
 * hash = hash * 131 + character. Returns the hash with the top bit cleared.
 */
unsigned int BKDR_hash(char *str)
{
    const unsigned int factor = 131; // 31 131 1313 13131 131313 etc..
    unsigned int acc = 0;

    for (const char *p = str; *p != '\0'; ++p)
    {
        acc = acc * factor + *p;
    }

    return acc & 0x7FFFFFFF;
}

/*
 * SDBM Hash Function.
 *
 * For each character of the NUL-terminated string `str`:
 * hash = character + (hash << 6) + (hash << 16) - hash.
 * Returns the hash with the top bit cleared.
 */
unsigned int SDBM_hash(char *str)
{
    unsigned int h = 0;

    for (; *str != '\0'; ++str)
    {
        const unsigned int scaled = (h << 6) + (h << 16) - h;
        h = scaled + *str;
    }

    return h & 0x7FFFFFFF;
}

/*
 * DJB Hash Function.
 *
 * Starts at 5381 and, for each character of the NUL-terminated string `str`,
 * adds (hash << 5) + character to the hash. Returns the hash with the top
 * bit cleared.
 */
unsigned int DJB_hash(char *str)
{
    unsigned int h = 5381;

    for (const char *p = str; *p; ++p)
    {
        h = ((h << 5) + h) + *p;
    }

    return h & 0x7FFFFFFF;
}

/*
 * AP Hash Function.
 *
 * Alternates between two mixing steps over the NUL-terminated string `str`:
 * characters at even positions use (hash << 7) ^ c ^ (hash >> 3), characters
 * at odd positions use the bitwise complement of
 * (hash << 11) ^ c ^ (hash >> 5). Each step is XOR-ed into the hash.
 * Returns the hash with the top bit cleared.
 */
unsigned int AP_hash(char *str)
{
    unsigned int h = 0;
    bool even_position = true;

    for (; *str; ++str, even_position = !even_position)
    {
        const unsigned int c = *str;

        if (even_position)
        {
            h ^= (h << 7) ^ c ^ (h >> 3);
        }
        else
        {
            h ^= ~((h << 11) ^ c ^ (h >> 5));
        }
    }

    return h & 0x7FFFFFFF;
}

/*
 * CRC Hash Function.
 *
 * Despite the name, this computes a 16-bit checksum rather than a CRC: the
 * NUL-terminated string `str` is summed as a sequence of native-endian
 * 16-bit words, the carries are folded back into the low 16 bits, and the
 * result is bitwise complemented. The return value is in [0, 0xFFFF].
 *
 * Words are loaded with memcpy rather than through a casted
 * `unsigned short *`, so strings at odd addresses are read without
 * undefined behaviour.
 */
unsigned int CRC_hash(char *str)
{
    size_t              nleft   = strlen(str);
    const char         *cursor  = str;
    unsigned long long  sum     = 0;
    unsigned short int  answer  = 0;

    /*
     * Add sequential 16 bit words into a 64 bit accumulator (sum); the
     * carry bits above the low 16 are folded back in afterwards.
     */
    while ( nleft > 1 ) {
        unsigned short int word;
        memcpy( &word, cursor, sizeof( word ) );
        sum += word;
        cursor += sizeof( word );
        nleft -= sizeof( word );
    }
    /*
     * mop up an odd byte, if necessary: it is copied into the first byte
     * (in memory order) of the zero-initialised `answer`
     */
    if ( 1 == nleft ) {
        memcpy( &answer, cursor, 1 );
        sum += answer;
    }
    /*
     * add back carry outs from top 16 bits to low 16 bits
     * add hi 16 to low 16
     */
    sum = ( sum >> 16 ) + ( sum & 0xFFFF );
    /* add carry */
    sum += ( sum >> 16 );
    /* complement and truncate to 16 bits */
    answer = static_cast<unsigned short int>( ~sum );

    return answer;
}


// Converts the absolute value of `value` to its base-36 representation using
// the digits 0-9 followed by a-z, most significant digit first. The sign is
// dropped, so Itoa(-36) and Itoa(36) both return "10".
//
// Fixes over the previous version:
//   * the base is 36 (the number of digit characters); it used to be
//     sizeof(table) == 37, which counted the terminating NUL and could append
//     an embedded '\0' to the result;
//   * the magnitude is computed in unsigned arithmetic, so INT_MIN no longer
//     overflows when negated;
//   * digits are emitted most significant first instead of reversed.
std::string Itoa(int value) {
  static const char kDigits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
  const unsigned int kBase = sizeof(kDigits) - 1;  // exclude the trailing NUL

  unsigned int magnitude = static_cast<unsigned int>(value);
  if (value < 0) {
    magnitude = 0u - magnitude;  // well-defined even for INT_MIN
  }

  // Digits are produced least significant first, then reversed on return.
  std::string reversed;
  do {
    reversed += kDigits[magnitude % kBase];
    magnitude /= kBase;
  } while (magnitude > 0);

  return std::string(reversed.rbegin(), reversed.rend());
}
// Returns the current wall-clock time from gettimeofday() in microseconds,
// narrowed to int. The full microsecond count does not fit in an int, so the
// returned value wraps; it is only meaningful for differences over short
// intervals. The multiplication is done in 64-bit arithmetic so that it
// cannot overflow on platforms where tv_sec is a 32-bit type.
int GetTime() {
  timeval now;
  gettimeofday(&now, NULL);
  const long long micros =
      static_cast<long long>(now.tv_sec) * 1000000LL + now.tv_usec;
  return static_cast<int>(micros);
}
// Hash functor for std::string keys: hashes the string's bytes with
// CityHash64. Used as the hasher of the unordered_map in main().
class StringHash {
 public:
  uint64 operator()(const std::string& s) const {
    const char* const bytes = s.c_str();
    const std::string::size_type length = s.size();
    // Alternative hasher that was tried here:
    //   MurmurHash64A(s.c_str(), s.size(), kFingerPrintSeed) % (unsigned int) 0xFFFFFFFF
    return CityHash64(bytes, length);
  }
};
// Equality functor for std::string keys: true when both strings hold exactly
// the same character sequence.
class StringEqual {
 public:
  bool operator()(const std::string& left, const std::string& right) const {
    const int ordering = left.compare(right);
    return ordering == 0;
  }
};
int main(int argc, char** argv) {
  const int kDataSize = 1000000;
  std::string content = "";
  std::vector<std::string> data;
  for (int i = 0; i < kDataSize; ++i) {
    content = "";
    for (int j = 0; j < 10; ++j) {
      content += Itoa(rand());
    }
    data.push_back(content);
  }
  //murmur test

  int start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    MurmurHash64A(data[i].c_str(), data[i].size(), kFingerPrintSeed);
  }
  printf("murmur64: %d\n", GetTime() - start);  
  
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    MurmurHash32A(data[i].c_str(), data[i].size(), kFingerPrintSeed);   
  }
  printf("murmur32:%d\n", GetTime() - start);  
  //simple hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    simple_hash(const_cast<char*>(data[i].c_str()));
  }
  printf("simple hash:%d\n", GetTime() - start);  
  // bkdr hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    BKDR_hash(const_cast<char*>(data[i].c_str()));
  }
  printf("bkdr hash:%d\n", GetTime() - start);  

  // AP  hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    AP_hash(const_cast<char*>(data[i].c_str()));
  }
  printf("AP hash:%d\n", GetTime() - start);  

  // City  hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    CityHash64(data[i].c_str(), data[i].size());
  }
  printf("city hash:%d\n", GetTime() - start);  
    std::tr1::unordered_map<std::string, int, StringHash, StringEqual> my_map_city;
  // City  hash insert
  start = GetTime();  
  for (int i = 0; i < kDataSize; ++i) {
    my_map_city[data[i]] = i;
  }
  printf("city hash insert:%d\n", GetTime() - start);  
  
  // map insert
  std::map<std::string, int> my_map_tree;
  start = GetTime();  
  for (int i = 0; i < kDataSize; ++i) {
    my_map_tree[data[i]] = i;
  }
  printf("tree map insert:%d\n", GetTime() - start);  
  
  // City  hash search
  start = GetTime();
  int value = 0;
  for (int i = 0; i < kDataSize; ++i) {
    value = my_map_city[data[i]];
  }
  printf("city hash search:%d\n", GetTime() - start);  

  // map search
  start = GetTime();  
  for (int i = 0; i < kDataSize; ++i) {
    value = my_map_tree[data[i]];
  }
  printf("tree map search:%d\n", GetTime() - start);  

  
  
}


参考文献

[1]http://blog.csdn.net/liuben/article/details/5050697

[2]http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html

[3]http://sites.google.com/site/murmurhash/ 

[4]http://blog.csdn.net/wisage/article/details/7104866

[5]http://code.google.com/p/cityhash/

[6]http://gcc.gnu.org/onlinedocs/libstdc++/ext/pb_ds/index.html

[7]http://en.wikipedia.org/wiki/Hash_table

[8]http://en.wikipedia.org/wiki/MurmurHash

[9]http://hi.baidu.com/fdwm_lx/blog/item/f670e73582c8411d90ef3950.html

[10]http://www.cnblogs.com/Frandy/archive/2011/07/26/Hash_map_Unordered_map.html

你可能感兴趣的:(String,function,tree,table,insert,character)