ddddfang

hash table

1. 概述

A hash table consists of an array of ‘buckets’（吊桶）, each of which stores a key-value pair. In order to locate the bucket where a key-value pair should be stored, the key is passed through a hashing function. This function returns an integer which is used as the pair’s index in the array of buckets. When we want to retrieve a key-value pair, we supply the key to the same hashing function, receive its index, and use the index to find it in the array. Array indexing has algorithmic complexity O(1), making hash tables fast at storing and retrieving data.
The hash function we choose should:

Take a string as its input and return a number between 0 and m, our desired bucket array length.
Return an even distribution of （平均分配）bucket indexes for an average set of inputs. If our hash function is unevenly distributed, it will put more items in some buckets than others. This will lead to a higher rate of collisions（冲突，碰撞） . Collisions reduce the efficiency of our hash table.

Hash functions map an infinitely（无限的） large number of inputs to a finite number of outputs

来自 https://github.com/jamesroutley/write-a-hash-table/tree/master/01-introduction

hash表的内部过程是将一个string类型的key映射得到一个（数组的）index，这个index内容放什么是自定义的（value）。

2. hash表实现的map和红黑树(or skiplist)实现的map

hash表实现map查找插入删除几乎常数时间,效率极高,但占用内存一般高于实际存储的节点个数(不然冲突问题可能影响到效率),内部无序(因此无法范围查找)
红黑树实现map查找插入删除O(lgN)时间,和hash没法比.但占用内存少(=节点个数),内部有序(可按key排序),灵活(找不到exact key时甚至可以找一个最接近的)
skiplist实现map效率上和红黑树相同,但查找插入删除实现都比红黑树简单,且空间占用在1/4等比递减的时候比红黑树有优势

一般而言,有序集合都是按key排序,若按插入顺序排序就简单了,再维护一个list即可

3. 一个存在问题的hash table实现

hashtable.h

#ifndef __HASH_TABLE_H
#define __HASH_TABLE_H

/* Multipliers of the two polynomial string hashes combined by double hashing.
 * Both are meant to be (distinct) primes. */
#define HT_PRIME_1 151
#define HT_PRIME_2 163  /* was 153, which is 3 * 3 * 17 and therefore not prime */
/* Base size a new table is created with; the table is never resized below it. */
#define HT_INITIAL_BASE_SIZE 100

/* One stored entry: a key string plus the value pointer associated with it. */
typedef struct
{
    char *key;      /* heap copy of the key; released together with the item */
    void *value;    /* opaque pointer supplied by the caller; the table never frees it */
} ht_item;

/* Open-addressing hash table.
 * `count` relative to `size` is the load that decides whether the slot array
 * must be resized: a table that is too full not only fails to accept new
 * entries, it also makes insertion much slower, so dynamic resizing is needed. */
typedef struct
{
    int base_size;      /* requested size; `size` is derived from it */
    int size;           /* number of slots in `items` */
    int count;          /* number of live entries currently stored */
    ht_item **items;    /* slot array; an unused slot holds NULL */
} ht_hash_table;

/* Creates an empty table. Returns NULL if memory cannot be allocated. */
ht_hash_table *ht_new(void);
/* Destroys the table and the key copies it owns. Stored values are not freed. */
void ht_del_hash_table(ht_hash_table* ht);

/* Associates `value` with `key` (the key string is copied).
 * Returns the value previously stored under `key`, which the caller should
 * release if it owns it; returns NULL when the key was not present before
 * (and also when the new entry could not be allocated). */
void *ht_insert(ht_hash_table *ht, const char *key, void *value);
/* Returns the value stored under `key`, or NULL if the key is not found. */
void *ht_search(ht_hash_table* ht, const char* key);
/* Removes `key` from the table; does nothing if the key is not present.
 * The stored value itself is not freed. */
void ht_delete(ht_hash_table* ht, const char* key);

/* Calls `apply(key, &value, cl)` once for every entry currently stored;
 * `cl` is passed through to the callback unchanged. */
void ht_map(ht_hash_table *ht,void apply(const char *key,void **value,void *cl),void *cl);

#endif

hashtable.c

#include 
#include 
#include 
#include 
#include 
#include "hashTable.h"

/*
 * Primality test by trial division.
 * Returns 1 if x is prime, 0 if x is composite, and -1 if x < 2
 * (primality is undefined there).
 */
static int is_prime(const int x)
{
    if (x < 2)
        return -1;
    if (x < 4)
        return 1;           /* 2 and 3 */
    if ((x % 2) == 0)
        return 0;
    /* `i <= x / i` means i*i <= x, but cannot overflow and needs no
     * floating-point sqrt() evaluated on every iteration. */
    for (int i = 3; i <= x / i; i += 2)
    {
        if ((x % i) == 0)
            return 0;
    }
    return 1;
}

/* Returns the first value >= x that is_prime() reports as prime. */
static int next_prime(int x)
{
    for (;;)
    {
        if (is_prime(x) == 1)
            return x;
        x++;
    }
}

static ht_item HT_DELETED_ITEM = {NULL, NULL};        //所有被delete的item都指向这货啊

/*
 * Allocates an item holding a private copy of key `k` and the pointer `v`.
 * Returns NULL if either allocation fails; nothing is leaked in that case.
 */
static ht_item *ht_new_item(const char *k, void *v)
{
    ht_item *item = malloc(sizeof *item);
    if (!item)
        return NULL;

    item->key = strdup(k);      /* strdup = malloc + copy */
    if (!item->key)
    {
        /* Never hand out an item with a NULL key: the probing code passes
         * item->key straight to strcmp(). */
        free(item);
        return NULL;
    }
    item->value = v;
    return item;
}

/* Frees an item together with its key copy; the value is left alone.
 * Passing NULL is a no-op. */
static void ht_del_item(ht_item *i)
{
    if (i == NULL)
        return;
    free(i->key);   /* free(NULL) is a no-op, so no separate check is needed */
    free(i);
}

/*
 * Polynomial string hash reduced to the table size.
 *
 * For a key of length n this computes
 *     (s[0]*a^(n-1) + s[1]*a^(n-2) + ... + s[n-1]) mod m
 * where each s[i] is taken as an unsigned byte.
 *
 * s: key string
 * a: multiplier (callers pass a prime); must be >= 0
 * m: hash table size; must be > 0
 * Returns a value in [0, m).
 *
 * The sum is evaluated with Horner's rule, reducing modulo m at every step.
 * The former pow()-based version converted values far outside the range of
 * long for keys longer than a few characters, and could return a negative
 * index for bytes >= 0x80 where char is signed.
 */
static int ht_hash(const char *s, const int a, const int m)
{
    unsigned long long hash = 0;
    const unsigned long long mult = (unsigned long long)a;
    const unsigned long long mod = (unsigned long long)m;

    for (const unsigned char *p = (const unsigned char *)s; *p != '\0'; p++)
    {
        /* hash < m <= INT_MAX and a <= INT_MAX, so the product fits easily. */
        hash = (hash * mult + *p) % mod;
    }
    return (int)hash;
}

/*
 * Double hashing: returns the bucket index to try for key `s` on the given
 * probe `attempt`. `attempt` is the number of collisions seen so far, so
 * attempt 0 yields plain hash_a.
 */
static int ht_get_hash(const char *s, const int num_buckets, const int attempt)
{
    const int hash_a = ht_hash(s, HT_PRIME_1, num_buckets);
    const int hash_b = ht_hash(s, HT_PRIME_2, num_buckets);

    /* The +1 keeps the step from being 0 when hash_b is 0. The step must also
     * not equal num_buckets (hash_b == num_buckets - 1), because that is 0
     * modulo num_buckets and the probe would never leave hash_a. */
    int step = hash_b + 1;
    if (step >= num_buckets)
        step = 1;

    /* Widen before multiplying so a large attempt count cannot overflow int. */
    return (int)((hash_a + (long long)attempt * step) % num_buckets);
}

/*
 * Allocates an empty hash table whose slot count is next_prime(base_size).
 * Returns NULL if either the table header or the slot array cannot be
 * allocated.
 */
static ht_hash_table *ht_new_sized(const int base_size)
{
    ht_hash_table *table = malloc(sizeof *table);
    if (table == NULL)
        return NULL;

    table->base_size = base_size;
    table->size = next_prime(base_size);
    table->count = 0;

    /* calloc zero-fills the array, so every slot starts out as NULL (unused) */
    table->items = calloc((size_t)table->size, sizeof(ht_item *));
    if (table->items == NULL)
    {
        free(table);
        return NULL;
    }
    return table;
}

/*
 * Rebuilds the table around a new base_size.
 *
 * Does nothing when base_size is below HT_INITIAL_BASE_SIZE (the table keeps
 * a minimum size) or when the new slot array cannot be allocated.
 *
 * Existing items are moved into the new slot array by pointer; nothing is
 * re-allocated, so once the new array exists the rehash cannot fail half-way
 * and lose entries.
 */
static void ht_resize(ht_hash_table *ht, const int base_size)
{
    if (base_size < HT_INITIAL_BASE_SIZE)
        return;
    ht_hash_table *new_ht = ht_new_sized(base_size);
    if (!new_ht)
        return;

    /* Re-home every live entry of the old slot array in the new one. */
    for (int i = 0; i < ht->size; i++)
    {
        ht_item *item = ht->items[i];
        if (item == NULL || item == &HT_DELETED_ITEM)
            continue;

        /* The new array contains no tombstones, so probe until a NULL slot. */
        int attempt = 0;
        int index = ht_get_hash(item->key, new_ht->size, attempt);
        while (new_ht->items[index] != NULL)
        {
            if (strcmp(new_ht->items[index]->key, item->key) == 0)
            {
                /* Same key met twice: the later one replaces the earlier one,
                 * exactly as inserting an existing key does. */
                ht_del_item(new_ht->items[index]);
                new_ht->items[index] = NULL;
                new_ht->count--;
                break;
            }
            attempt++;
            index = ht_get_hash(item->key, new_ht->size, attempt);
        }
        new_ht->items[index] = item;
        new_ht->count++;
        ht->items[i] = NULL;    /* ownership moved to the new array */
    }

    ht->base_size = new_ht->base_size;
    ht->count = new_ht->count;

    /* Swap the sizes: the old size is still needed to tear down the old array. */
    const int tmp_size = ht->size;
    ht->size = new_ht->size;
    new_ht->size = tmp_size;

    /* Swap the slot arrays, so `ht` now owns the new, resized array. */
    ht_item **tmp_items = ht->items;
    ht->items = new_ht->items;
    new_ht->items = tmp_items;

    /* new_ht now holds the old array (only NULLs and tombstones); release it. */
    ht_del_hash_table(new_ht);
}

/* Helper for insertion: resizes the table to twice its current base size. */
static void ht_resize_up(ht_hash_table *ht)
{
    ht_resize(ht, ht->base_size * 2);
}

/* Helper for deletion: resizes the table to half its current base size. */
static void ht_resize_down(ht_hash_table *ht)
{
    ht_resize(ht, ht->base_size / 2);
}

/*
 * Stores `value` under `key`.
 *
 * If the key is already present its value is replaced and the OLD value is
 * returned; a caller that owns that value should release it, otherwise the
 * overwrite leaks it. If the key is new, a copy of it is stored and NULL is
 * returned. NULL is also returned when the new entry cannot be created.
 *
 * The table is grown first when its load exceeds 70%.
 */
void *ht_insert(ht_hash_table *ht, const char *key, void *value)
{
    const long long load = (long long)ht->count * 100 / ht->size;
    if (load > 70)
    {
        ht_resize_up(ht);
    }

    /* Walk the whole probe chain before choosing a slot. Stopping at the
     * first tombstone would miss a copy of the key stored further along the
     * chain and create a duplicate entry. */
    int free_index = -1;        /* first reusable slot seen, if any */
    for (int attempt = 0; attempt < ht->size; attempt++)
    {
        const int index = ht_get_hash(key, ht->size, attempt);
        ht_item *cur_item = ht->items[index];

        if (cur_item == NULL)
        {
            /* End of the chain: the key is not in the table. */
            if (free_index < 0)
                free_index = index;
            break;
        }
        if (cur_item == &HT_DELETED_ITEM)
        {
            if (free_index < 0)
                free_index = index;
            continue;
        }
        if (strcmp(cur_item->key, key) == 0)
        {
            /* Inserting an existing key is an update; no allocation needed. */
            void *oldvalue = cur_item->value;
            cur_item->value = value;
            return oldvalue;
        }
    }

    if (free_index < 0)
        return NULL;            /* no usable slot found on the probe chain */

    ht_item *item = ht_new_item(key, value);
    if (!item)
        return NULL;
    ht->items[free_index] = item;
    ht->count++;
    return NULL;
}

/*
 * Looks up `key` and returns the value stored under it, or NULL when the key
 * is not in the table.
 */
void *ht_search(ht_hash_table *ht, const char *key)
{
    /* At most `size` probes, so the walk ends even when no slot is NULL. */
    for (int attempt = 0; attempt < ht->size; attempt++)
    {
        const ht_item *item = ht->items[ht_get_hash(key, ht->size, attempt)];

        if (item == NULL)
            break;              /* end of the probe chain: not present */

        /* A tombstone only marks a deleted entry; the key may still live
         * further along the chain, so skip it instead of stopping. */
        if (item != &HT_DELETED_ITEM && strcmp(item->key, key) == 0)
            return item->value;
    }
    return NULL;
}

/*
 * Removes `key` from the table. Does nothing if the key is not present.
 *
 * Only the item and its key copy are freed; the stored value is not.
 * The table is shrunk first when its load is below 10%.
 */
void ht_delete(ht_hash_table *ht, const char *key)
{
    const int load = ht->count * 100 / ht->size;
    if (load < 10)
    {
        ht_resize_down(ht);
    }

    /* A deleted slot is not reset to NULL, because that would cut the probe
     * chain for keys stored behind it; it is pointed at HT_DELETED_ITEM.
     * Tombstones can sit anywhere on the chain, so they are skipped here.
     * The walk is limited to `size` probes so it also ends when the table has
     * no NULL slot left (only live entries and tombstones). */
    for (int attempt = 0; attempt < ht->size; attempt++)
    {
        const int index = ht_get_hash(key, ht->size, attempt);
        ht_item *item = ht->items[index];

        if (item == NULL)
            return;             /* end of the chain: key not present */
        if (item == &HT_DELETED_ITEM)
            continue;
        if (strcmp(item->key, key) == 0)
        {
            ht_del_item(item);
            ht->items[index] = &HT_DELETED_ITEM;
            ht->count--;
            return;
        }
    }
}

/* Creates an empty table built from HT_INITIAL_BASE_SIZE.
 * Returns NULL if allocation fails. */
ht_hash_table *ht_new(void)
{
    return ht_new_sized(HT_INITIAL_BASE_SIZE);
}

/*
 * Destroys a table: frees every live item (with its key copy), the slot
 * array and the table itself. Stored values are not freed.
 * Passing NULL is a no-op, so the result of a failed ht_new() can be handed
 * in unconditionally.
 */
void ht_del_hash_table(ht_hash_table *ht)
{
    if (ht == NULL)
        return;

    for (int i = 0; i < ht->size; i++)
    {
        ht_item *item = ht->items[i];
        /* The tombstone is a shared static object and must never be freed. */
        if (item != NULL && item != &HT_DELETED_ITEM)
        {
            ht_del_item(item);
        }
    }
    free(ht->items);
    free(ht);
}

/*
 * Visits every live entry in slot order and calls
 * apply(key, &value, cl) on it. The callback receives the address of the
 * stored value pointer; `cl` is handed through unchanged.
 */
void ht_map(ht_hash_table *ht,void apply(const char *key,void **value,void *cl),void *cl)
{
    int slot = 0;
    while (slot < ht->size)
    {
        ht_item *entry = ht->items[slot++];
        if (entry == NULL || entry == &HT_DELETED_ITEM)
            continue;   /* unused slot or tombstone */
        apply(entry->key, &entry->value, cl);
    }
}

测试这个hash表的时候发现hash函数对插入key为有规律的string的时候性能并不好。hash表还有另一种实现是开链法，将冲突的element链成一个list。

hash表的关键点：

为了使hash取得较好的效率，一般都是动态伸缩 hash表 bucket 的 size，即当 insert 导致负载（count/size）超过某个阈值时就需要 resize（不然冲突链过长会导致hash性能下降）。如何resize？简单粗暴地采取“再新建一个bucket表 -> 将原表中所有node挨个重新插入到新表中”会出现问题：数据量巨大时，某一次节点插入耗时将很长！一致性hash？貌似实现起来有点复杂。《redis设计与实现》提到 redis 采用两张 bucket 表（主bucket0，副bucket1），当需要rehash时，每次插入、删除、查找会顺带将主bucket0中的一些元素rehash到bucket1，同时hash manager结构体会记录主bucket0已经rehash到哪个index了；总会有全部rehash完毕的时候，那时再释放bucket0，并让bucket1成为新的bucket0。
hash表数据类型的安全。k一定是string吗？hash表是将 XXX 杂糅成 hash_index，XXX 可不一定是 string。v同样可以是某个基本类型或指向某个复杂struct的指针。处理不好会crash或者内存泄漏。redis要求提供keyDup、valDup、keyDestruct、valDestruct等函数。可以很好解决这个问题。
hash函数的选择。上面的hash表实现之所以性能出现问题，和我的测试用例密不可分：我在插入数据的时候用的都是一些有规律的string，比如 key_0, key_1, ……，直接导致其冲突链过长。redis采用的是murmurhash函数，貌似这个函数被很多开源代码采用，对有规律的key表现出色。当然，murmurhash最好只是作为默认（default）的hash函数，应当允许替换。

所以我借鉴redis的思想自己实现hash表。目前测试起来性能还可以，后面有bug再修吧
目前发现一个问题：缺少一个接口：对hash表中原有(k,v)对进行 v++ 或者 v-- 的操作

4. 参考redis的hash table实现