Twitter-高性能hash_map

学习twitter的高性能散列表源码:


个人认为Twitter散列表的优点
1、使用C宏定义实现C++泛型的思想;
2、散列函数冲突小;
3、使用bitmap思想,标志位占用空间小;
4、自动扩展容量,判断扩容的条件;


个人认为Twitter散列表的缺点
1、值都通过指针访问,当值类型是简单整型时,指针会带来额外的空间开销;(不过值类型是基本整型的情况较少,而且可以通过宏参数把值类型直接指定为基本类型)


1、使用C的#define写类似于C++泛型的代码;

/*
 * Instantiate a complete hash table called `name`.
 * Expands to the struct typedef (__KHASH_TYPE) followed by the function
 * definitions (__KHASH_IMPL); every argument is forwarded unchanged.
 *   name         - symbol pasted into the generated type and function names
 *   SCOPE        - prefix placed in front of each generated function definition
 *   khkey_t      - key type
 *   khval_t      - value type
 *   kh_is_map    - non-zero when values are stored alongside keys
 *   __hash_func  - hash function or macro applied to a key
 *   __hash_equal - equality function or macro applied to two keys
 */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, \
                    __hash_func, __hash_equal) \
   __KHASH_TYPE(name, khkey_t, khval_t) \
   __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, \
                __hash_func, __hash_equal)

name:散列表的名称可以自定义;

khval_t:散列表的值类型可以自定义,在散列表里是指针;

khkey_t:散列表的key类型,由宏参数指定;文中的便捷宏提供了32位整数、64位整数和字符串(const char*)三种key;


__hash_func:散列函数可以选择,提供很多种散列函数;

/* --- BEGIN OF HASH FUNCTIONS --- */

/*! @function
  @abstract     Hash an integer key by using the key itself
  @param  key   The integer key [khint32_t]
  @return       The key converted to khint32_t
 */
#define kh_int_hash_func(key) ((khint32_t)(key))
/*! @function
  @abstract     Equality test for integer keys
  @param  a     First key
  @param  b     Second key
  @return       Non-zero when the keys compare equal, zero otherwise
 */
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     Fold a 64-bit integer key into a 32-bit hash
  @param  key   The integer key [khint64_t]
  @return       (key >> 33) xor key xor (key << 11), truncated to khint32_t
 */
#define kh_int64_hash_func(key) ((khint32_t)(((key) >> 33) ^ (key) ^ ((key) << 11)))
/*! @function
  @abstract     Equality test for 64-bit integer keys
  @param  a     First key
  @param  b     Second key
  @return       Non-zero when the keys compare equal, zero otherwise
 */
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     Hash a null-terminated string
  @param  s     Pointer to a null terminated string (must not be NULL)
  @return       The hash value; 0 for the empty string
  @discussion   Starts from the first character and, for every following
                character c, updates h to (h << 5) - h + c, i.e. 31 * h + c.
 */
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
   khint_t hash = (khint_t)*s;

   /* Empty string: nothing to fold in. */
   if (!hash)
      return hash;

   /* Advance first, so the loop starts at the second character. */
   while (*++s)
      hash = (hash << 5) - hash + (khint_t)*s;

   return hash;
}
/*! @function
  @abstract     Hash function for const char* keys
  @param  key   Pointer to a null terminated string [const char*]
  @return       Result of __ac_X31_hash_string(key) [khint_t]
 */
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
  @abstract     Equality test for const char* keys
  @param  a     First string
  @param  b     Second string
  @return       1 when strcmp() reports the strings equal, 0 otherwise
 */
#define kh_str_hash_equal(a, b) (!strcmp((a), (b)))

/*
 * Integer mixing hash: runs `key` through a fixed sequence of six
 * shift-and-add / shift-and-xor steps and returns the mixed value.
 * The order of the steps matters; each one reads the result of the previous.
 */
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
    key = key + ~(key << 15);
    key = key ^ (key >> 10);
    key = key + (key << 3);
    key = key ^ (key >> 6);
    key = key + ~(key << 11);
    key = key ^ (key >> 16);
    return key;
}
/*
 * Alternative integer hash: converts the key to khint_t and mixes it with
 * __ac_Wang_hash(). The argument is parenthesized under the cast so that an
 * expression argument (for example `a + b`) is converted as a whole rather
 * than only its first operand.
 */
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key))

__hash_equal:判断两个key是否相等的函数(宏),需要与key的类型配套:整数key用==比较,字符串key用strcmp比较;

下面的定义类型结构体:

__KHASH_TYPE(name, khkey_t, khval_t)
/*
 * Declares the table struct kh_<name>_s and its typedef kh_<name>_t.
 *   n_buckets   - number of buckets
 *   size        - number of elements currently stored
 *   n_occupied  - buckets that have been used (stored or since deleted)
 *   upper_bound - n_occupied threshold at which an insert triggers a resize
 *   flags       - per-bucket status bits
 *   keys        - array of keys, indexed by bucket
 *   vals        - array of values, indexed by bucket
 */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
   typedef struct kh_##name##_s { \
      khint_t n_buckets; \
      khint_t size; \
      khint_t n_occupied; \
      khint_t upper_bound; \
      khint32_t *flags; \
      khkey_t *keys; \
      khval_t *vals; \
   } kh_##name##_t;

下面的定义name和khval_t的函数:

__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/*
 * __KHASH_IMPL generates the function definitions for the table `name`.
 * SCOPE is pasted in front of every definition. kcalloc/kmalloc/krealloc/kfree,
 * kroundup32, __ac_fsize and __ac_HASH_UPPER are defined elsewhere.
 *
 * kh_init_<name>()
 *     Returns the result of kcalloc(1, sizeof(table)) (presumably a zero-filled
 *     table, or NULL on allocation failure - confirm against kcalloc).
 * kh_destroy_<name>(h)
 *     When h is non-NULL, releases keys, flags, vals and h itself with kfree.
 * kh_clear_<name>(h)
 *     When h and h->flags are non-NULL, fills the flag words with 0xaa (every
 *     2-bit pair becomes binary 10: "empty" bit set, "deleted" bit clear) and
 *     zeroes size and n_occupied. The keys/vals arrays are kept.
 * kh_get_<name>(h, key)
 *     Probes from hash(key) & (n_buckets - 1), growing the step by one on each
 *     probe. Returns the bucket index holding key, or h->n_buckets when the key
 *     is absent. With zero buckets it returns 0, which also equals h->n_buckets.
 *     The mask arithmetic assumes n_buckets is a power of two (presumably
 *     guaranteed by kroundup32 in resize - confirm).
 * kh_resize_<name>(h, new_n_buckets)
 *     Rounds the request with kroundup32 and raises it to at least 4. If the
 *     current size does not fit under the new load limit, nothing changes and
 *     0 is returned. Otherwise a fresh flag array is allocated and every live
 *     element is re-inserted in place, swapping out elements it lands on.
 *     Returns 0 on success and -1 when allocating flags/keys/vals fails on the
 *     expand path.
 *     NOTE(review): on the shrink path the krealloc results are assigned
 *     straight to h->keys / h->vals without a NULL check.
 * kh_put_<name>(h, key, ret)
 *     Resizes first when n_occupied has reached upper_bound, then finds the
 *     bucket for key. Returns the bucket index and sets *ret to:
 *       -1  resize failed (return value is h->n_buckets),
 *        0  key already present (stored key untouched),
 *        1  key stored in a never-used bucket,
 *        2  key stored in a previously deleted bucket.
 *     The value slot is not written; the caller assigns it.
 * kh_del_<name>(h, x)
 *     When x is not h->n_buckets and bucket x holds a live element, marks it
 *     deleted and decrements size. n_occupied is not changed.
 */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
   SCOPE kh_##name##_t *kh_init_##name(void) {                    \
      return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));      \
   }                                                  \
   SCOPE void kh_destroy_##name(kh_##name##_t *h)                 \
   {                                                  \
      if (h) {                                        \
         kfree((void *)h->keys); kfree(h->flags);              \
         kfree((void *)h->vals);                            \
         kfree(h);                                       \
      }                                               \
   }                                                  \
   SCOPE void kh_clear_##name(kh_##name##_t *h)                \
   {                                                  \
      if (h && h->flags) {                               \
         memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
         h->size = h->n_occupied = 0;                       \
      }                                               \
   }                                                  \
   SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key)  \
   {                                                  \
      if (h->n_buckets) {                                   \
         khint_t k, i, last, mask, step = 0; \
         mask = h->n_buckets - 1;                           \
         k = __hash_func(key); i = k & mask;                   \
         last = i; \
         while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
            i = (i + (++step)) & mask; \
            if (i == last) return h->n_buckets;                \
         }                                            \
         return __ac_iseither(h->flags, i)? h->n_buckets : i;     \
      } else return 0;                                   \
   }                                                  \
   SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
   { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
      khint32_t *new_flags = 0;                             \
      khint_t j = 1;                                     \
      {                                               \
         kroundup32(new_n_buckets);                            \
         if (new_n_buckets < 4) new_n_buckets = 4;             \
         if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
         else { /* hash table size to be changed (shrink or expand); rehash */ \
            new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));  \
            if (!new_flags) return -1;                      \
            memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
            if (h->n_buckets < new_n_buckets) { /* expand */      \
               khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
               if (!new_keys) { kfree(new_flags); return -1; }    \
               h->keys = new_keys;                          \
               if (kh_is_map) {                          \
                  khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
                  if (!new_vals) { kfree(new_flags); return -1; } \
                  h->vals = new_vals;                       \
               }                                      \
            } /* otherwise shrink */                        \
         }                                            \
      }                                               \
      if (j) { /* rehashing is needed */                       \
         for (j = 0; j != h->n_buckets; ++j) {                 \
            if (__ac_iseither(h->flags, j) == 0) {             \
               khkey_t key = h->keys[j];                    \
               khval_t val;                              \
               khint_t new_mask;                         \
               new_mask = new_n_buckets - 1;                   \
               if (kh_is_map) val = h->vals[j];             \
               __ac_set_isdel_true(h->flags, j);               \
               while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
                  khint_t k, i, step = 0; \
                  k = __hash_func(key);                     \
                  i = k & new_mask;                      \
                  while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
                  __ac_set_isempty_false(new_flags, i);        \
                  if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
                     { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
                     if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
                     __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
                  } else { /* write the element and jump out of the loop */ \
                     h->keys[i] = key;                   \
                     if (kh_is_map) h->vals[i] = val;       \
                     break;                              \
                  }                                   \
               }                                      \
            }                                         \
         }                                            \
         if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
            h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
            if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
         }                                            \
         kfree(h->flags); /* free the working space */            \
         h->flags = new_flags;                              \
         h->n_buckets = new_n_buckets;                      \
         h->n_occupied = h->size;                           \
         h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
      }                                               \
      return 0;                                          \
   }                                                  \
   SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
   {                                                  \
      khint_t x;                                         \
      if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
         if (h->n_buckets > (h->size<<1)) {                    \
            if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
               *ret = -1; return h->n_buckets;                 \
            }                                         \
         } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
            *ret = -1; return h->n_buckets;                    \
         }                                            \
      } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
      {                                               \
         khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
         x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
         if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
         else {                                          \
            last = i; \
            while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
               if (__ac_isdel(h->flags, i)) site = i;          \
               i = (i + (++step)) & mask; \
               if (i == last) { x = site; break; }             \
            }                                         \
            if (x == h->n_buckets) {                        \
               if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
               else x = i;                               \
            }                                         \
         }                                            \
      }                                               \
      if (__ac_isempty(h->flags, x)) { /* not present at all */      \
         h->keys[x] = key;                               \
         __ac_set_isboth_false(h->flags, x);                   \
         ++h->size; ++h->n_occupied;                           \
         *ret = 1;                                       \
      } else if (__ac_isdel(h->flags, x)) { /* deleted */            \
         h->keys[x] = key;                               \
         __ac_set_isboth_false(h->flags, x);                   \
         ++h->size;                                      \
         *ret = 2;                                       \
      } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
      return x;                                          \
   }                                                  \
   SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)          \
   {                                                  \
      if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {        \
         __ac_set_isdel_true(h->flags, x);                     \
         --h->size;                                      \
      }                                               \
   }

2、散列表支持动态扩展桶的数目;

在put函数里面,会判断,如果满足

h->n_occupied >= h->upper_bound

则通过resize进行扩展:

khint_t x;
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ 
 if (h->n_buckets > (h->size<<1)) { 
    if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ 
       *ret = -1; return h->n_buckets;                 
    }                                         
 } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ 
    *ret = -1; return h->n_buckets;                
 }                                          
}

3、使用bitmap作为桶有没有被放置的标记;

khint32_t *flags;// per-bucket status bitmap: two bits per bucket, sixteen buckets per khint32_t word

加上一系列对于bitmap的快速位操作,进行判断空、某个位置是否存在key等

/*
 * Bucket status bits. Each bucket i owns two adjacent bits inside word
 * flag[i >> 4], at bit offset (i & 0xf) << 1:
 *   bit value 1 - "deleted"
 *   bit value 2 - "empty"
 * The test macros return a non-zero mask value (not necessarily 1) when the
 * bit is set; __ac_iseither returns non-zero when either bit is set, i.e. the
 * bucket holds no live element. The setter macros modify the word in place.
 *
 * Both arguments are parenthesized in every expansion so that expression
 * arguments such as `base + 1` or `h->flags` select the intended word and
 * shift amount.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

4、定义一些工具函数

/*
 * Function declarations.
 * Declares, without defining, the seven functions that belong to the table
 * `name`: init, destroy, clear, get, resize, put and del. The declarations
 * carry no storage-class specifier, which for a function declaration has the
 * same meaning as writing `extern`. khval_t is accepted but not used.
 */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
   kh_##name##_t *kh_init_##name(void); \
   void kh_destroy_##name(kh_##name##_t *h); \
   void kh_clear_##name(kh_##name##_t *h); \
   khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
   int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
   khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
   void kh_del_##name(kh_##name##_t *h, khint_t x);
// Convenience macros: call the generated functions by table name, or read
// table fields directly through a table pointer.

/*!
  @abstract Name of the table type generated for `name`.
  @param  name  Name of the hash table [symbol]
 */
#define khash_t(name) kh_##name##_t

/*! @function
  @abstract     Create a hash table by calling kh_init_<name>().
  @param  name  Name of the hash table [symbol]
  @return       Pointer to the new hash table [khash_t(name)*]
 */
#define kh_init(name) kh_init_##name()

/*! @function
  @abstract     Release a hash table by calling kh_destroy_<name>().
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_destroy(name, h) kh_destroy_##name(h)

/*! @function
  @abstract     Empty a hash table while keeping its memory.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_clear(name, h) kh_clear_##name(h)

/*! @function
  @abstract     Change the number of buckets of a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  s     Requested number of buckets [khint_t]
 */
#define kh_resize(name, h, s) kh_resize_##name(h, s)

/*! @function
  @abstract     Insert a key into the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @param  r     Receives the outcome [int*]:
                -1 the operation failed;
                 0 the key was already present;
                 1 the key went into a never-used bucket;
                 2 the key went into a bucket whose element had been deleted
  @return       Iterator to the bucket of the key [khint_t]
 */
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)

/*! @function
  @abstract     Look a key up in the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the element, or kh_end(h) when the key is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove an element from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to remove [khint_t]
 */
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Tell whether a bucket holds a live element.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 when the bucket holds data; 0 otherwise [int]
 */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Key stored in a bucket (usable as an lvalue).
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
 */
#define kh_key(h, x) ((h)->keys[(x)])

/*! @function
  @abstract     Value stored in a bucket (usable as an lvalue).
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[(x)])

/*! @function
  @abstract     Same as kh_val().
 */
#define kh_value(h, x) kh_val(h, x)

/*! @function
  @abstract     First iterator position; always 0.
  @param  h     Pointer to the hash table [khash_t(name)*] (not evaluated)
  @return       The start iterator [khint_t]
 */
#define kh_begin(h) ((khint_t)0)

/*! @function
  @abstract     One-past-the-last iterator position.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator, equal to the bucket count [khint_t]
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Number of elements stored in the hash table.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements [khint_t]
 */
#define kh_size(h) ((h)->size)

/*! @function
  @abstract     Number of buckets in the hash table.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)

/*! @function
  @abstract     Visit every live entry of the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Lvalue that receives the key of the current entry
  @param  vvar  Lvalue that receives the value of the current entry
  @param  code  Statement(s) executed once per live entry
  @discussion   Buckets for which kh_exist() is false are passed over.
                The macro declares a local iterator named __i.
 */
#define kh_foreach(h, kvar, vvar, code) {                 \
   khint_t __i;                                           \
   for (__i = kh_begin(h); __i != kh_end(h); ++__i) {     \
      if (kh_exist(h,__i)) {                              \
         (kvar) = kh_key(h,__i);                          \
         (vvar) = kh_val(h,__i);                          \
         code;                                            \
      }                                                   \
   } }

/*! @function
  @abstract     Visit the value of every live entry of the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Lvalue that receives the value of the current entry
  @param  code  Statement(s) executed once per live entry
  @discussion   Buckets for which kh_exist() is false are passed over.
                The macro declares a local iterator named __i.
 */
#define kh_foreach_value(h, vvar, code) {                 \
   khint_t __i;                                           \
   for (__i = kh_begin(h); __i != kh_end(h); ++__i) {     \
      if (kh_exist(h,__i)) {                              \
         (vvar) = kh_val(h,__i);                          \
         code;                                            \
      }                                                   \
   } }

/* More convenient interfaces: ready-made instantiations for common key types.
   Each macro forwards to KHASH_INIT (defined elsewhere) with a fixed key type,
   hash function and equality test; the fourth argument is 0 for sets and 1 for
   maps. The set variants pass `char` as a placeholder value type. */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name)                            \
   KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t)                      \
   KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name)                             \
   KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t)                       \
   KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

/* Key type used by the string-keyed tables below. */
typedef const char *kh_cstr_t;
/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
  @discussion   Keys are hashed and compared by string content
                (kh_str_hash_func / kh_str_hash_equal).
 */
#define KHASH_SET_INIT_STR(name)                            \
   KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
  @discussion   Keys are hashed and compared by string content
                (kh_str_hash_func / kh_str_hash_equal).
 */
#define KHASH_MAP_INIT_STR(name, khval_t)                      \
   KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)

使用散列表

/* Instantiate a map named "32" with 32-bit integer keys and char values. */
KHASH_MAP_INIT_INT(32, char)


/*
 * Walk-through of the basic map operations: create, insert, look up,
 * delete, iterate and destroy.
 *
 * Both failure paths are handled before the table is touched:
 *   - kh_init() returns the allocator's result, which may be NULL;
 *   - kh_put() reports failure with ret == -1 and then returns kh_end(h),
 *     so writing kh_value(h, k) in that case would land past the arrays.
 */
void test_khash_map(void)
{
	int ret, is_missing;
	khiter_t k;
	khash_t(32)* h = kh_init(32);
	if (!h) return;                      /* allocation failed */

	k = kh_put(32, h, 5, &ret);          /* insert key 5 */
	if (ret < 0) {                       /* insert failed: k is not a valid bucket */
		kh_destroy(32, h);
		return;
	}
	kh_value(h, k) = 10;                 /* kh_put stores only the key; set the value here */

	k = kh_get(32, h, 10);               /* key 10 was never inserted */
	is_missing = (k == kh_end(h));
	(void)is_missing;                    /* kept for illustration only */

	k = kh_get(32, h, 5);
	kh_del(32, h, k);                    /* kh_del ignores kh_end(h) and non-live buckets */

	/* Iterate over all buckets, touching only the live ones. */
	for (k = kh_begin(h); k != kh_end(h); ++k)
		if (kh_exist(h, k)) kh_value(h, k) = 1;

	kh_destroy(32, h);
	return;
}

你可能感兴趣的:(机器学习,哈希算法,算法,散列表)