redis数据结构底层实现

版本

6.0.3

基础类型robj

// server.h
typedef struct redisObject {
    unsigned type:4;
    unsigned encoding:4;
    unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
                            * LFU data (least significant 8 bits frequency
                            * and most significant 16 bits access time). */
    int refcount;
    void *ptr;
} robj;
// type:类型,有以下7种
#define OBJ_STRING 0    /* String object. */
#define OBJ_LIST 1      /* List object. */
#define OBJ_SET 2       /* Set object. */
#define OBJ_ZSET 3      /* Sorted set object. */
#define OBJ_HASH 4      /* Hash object. */
#define OBJ_MODULE 5    /* Module object. */
#define OBJ_STREAM 6    /* Stream object. */
// encoding:编码
#define OBJ_ENCODING_RAW 0     /* Raw representation */
#define OBJ_ENCODING_INT 1     /* Encoded as integer */
#define OBJ_ENCODING_HT 2      /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3  /* Encoded as zipmap */ 是个旧的表示方式,已不再用。在小于Redis 2.6的版本中才有。
#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */ 也是个旧的表示方式,已不再用。
#define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define OBJ_ENCODING_INTSET 6  /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7  /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8  /* Embedded sds string encoding */
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of ziplists */
#define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */
// refcount: 引用计数
// ptr:数据指针,指向真正的数据。
如字符串,type是OBJ_STRING,encoding是OBJ_ENCODING_RAW,OBJ_ENCODING_INT,OBJ_ENCODING_EMBSTR之一
type 可能的encoding
OBJ_STRING OBJ_ENCODING_INT、OBJ_ENCODING_RAW、OBJ_ENCODING_EMBSTR
OBJ_LIST OBJ_ENCODING_QUICKLIST
OBJ_SET OBJ_ENCODING_INTSET、OBJ_ENCODING_HT
OBJ_ZSET OBJ_ENCODING_ZIPLIST、OBJ_ENCODING_SKIPLIST
OBJ_HASH OBJ_ENCODING_ZIPLIST、OBJ_ENCODING_HT

1. String

type: OBJ_STRING

encoding:OBJ_ENCODING_RAW,OBJ_ENCODING_INT,OBJ_ENCODING_EMBSTR

encoding condition interface
OBJ_ENCODING_INT val可转化为long时 if (len <= 20 && string2l(s,len,&value)) string2ll
OBJ_ENCODING_EMBSTR 非long且len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT //44 createEmbeddedStringObject
OBJ_ENCODING_RAW 非以上情况时 createRawStringObjectt_s

redis的string实现是SDS(simple dynamic string),是作者自己定义的,而不是用C的字符串。

定义

// __attribute__ ((__packed__))  取消字节对齐,紧凑排列。

/* Note: sdshdr5 is never used, we just access the flags byte directly.
 * However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

// flags值定义
#define SDS_TYPE_5  0
#define SDS_TYPE_8  1
#define SDS_TYPE_16 2
#define SDS_TYPE_32 3
#define SDS_TYPE_64 4
  • len:记录当前已使用的字节数(不包括'\0'),获取SDS长度的复杂度为O(1)
  • alloc:记录当前字节数组总共分配的字节数量(不包括'\0')
  • flags:标记当前字节数组的属性,是sdshdr8还是sdshdr16等
  • buf:字节数组,用于保存字符串,包括结尾空白字符'\0'

创建字符串

  1. 算len,调用sdsnewlen
  2. 根据len决定用哪种类型
  3. 分配内存,赋值
  4. 计算len,alloc,flag
/* Create a new sds string starting from a null terminated C string. */
sds sdsnew(const char *init) {
    size_t initlen = (init == NULL) ? 0 : strlen(init);
    return sdsnewlen(init, initlen);
}

 // If NULL is used for 'init' the string is initialized with zero bytes.
 // If SDS_NOINIT is used, the buffer is left uninitialized;
sds sdsnewlen(const void *init, size_t initlen) {
    void *sh;
    sds s;
    char type = sdsReqType(initlen);
    /* Empty strings are usually created in order to append. Use type 8
     * since type 5 is not good at this. */
    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
    int hdrlen = sdsHdrSize(type);
    unsigned char *fp; /* flags pointer. */

    sh = s_malloc(hdrlen+initlen+1);
    if (sh == NULL) return NULL;
    if (init==SDS_NOINIT)
        init = NULL;
    else if (!init)
        memset(sh, 0, hdrlen+initlen+1);
    s = (char*)sh+hdrlen;
    fp = ((unsigned char*)s)-1;
    switch(type) {
        case SDS_TYPE_5: {
            *fp = type | (initlen << SDS_TYPE_BITS);
            break;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_16: {
            SDS_HDR_VAR(16,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_32: {
            SDS_HDR_VAR(32,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_64: {
            SDS_HDR_VAR(64,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
    }
    if (initlen && init)
        memcpy(s, init, initlen);
    s[initlen] = '\0';
    return s;
}

2、扩容

  1. 计算剩余可以空间,够用直接返回
  2. 计算扩容后所需长度newlen
  3. < 1M,成倍扩容
  4. >= 1M,每次只增加1M
  5. 根据newlen计算新的type
  6. 类型不变alloc,类型变了malloc,memcpy,free
sds sdsMakeRoomFor(sds s, size_t addlen) {
    void *sh, *newsh;
    size_t avail = sdsavail(s);
    size_t len, newlen;
    char type, oldtype = s[-1] & SDS_TYPE_MASK;
    int hdrlen;

    /* Return ASAP if there is enough space left. */
    if (avail >= addlen) return s;

    len = sdslen(s);
    sh = (char*)s-sdsHdrSize(oldtype);
    newlen = (len+addlen);
    if (newlen < SDS_MAX_PREALLOC)
        newlen *= 2;
    else
        newlen += SDS_MAX_PREALLOC;

    type = sdsReqType(newlen);

    /* Don't use type 5: the user is appending to the string and type 5 is
     * not able to remember empty space, so sdsMakeRoomFor() must be called
     * at every appending operation. */
    if (type == SDS_TYPE_5) type = SDS_TYPE_8;

    hdrlen = sdsHdrSize(type);
    if (oldtype==type) {
        newsh = s_realloc(sh, hdrlen+newlen+1);
        if (newsh == NULL) return NULL;
        s = (char*)newsh+hdrlen;
    } else {
        /* Since the header size changes, need to move the string forward,
         * and can't use realloc */
        newsh = s_malloc(hdrlen+newlen+1);
        if (newsh == NULL) return NULL;
        memcpy((char*)newsh+hdrlen, s, len+1);
        s_free(sh);
        s = (char*)newsh+hdrlen;
        s[-1] = type;
        sdssetlen(s, len);
    }
    sdssetalloc(s, newlen);
    return s;
}

string set过程

  1. 接受客户端协议并解析,此时全部为sds,编码为OBJ_ENCODING_RAW或OBJ_ENCODING_EMBSTR。
  2. set首先尝试把val转为OBJ_ENCODING_INT(tryObjectEncoding)。
  3. 然后真正set

与c字符串相比的优势

  1. 获取字符串长度O(1)。
  2. 预分配内存,不用每次修改字符串长度都重新分配内存,减少了内存重分配次数。
  3. 用len判断字符串结束,而不是空字符,可以存储二进制数据(如bitmap)。

2、List

type: OBJ_LIST

encoding: OBJ_ENCODING_QUICKLIST

quicklist是一个双向链表,链表的每个节点是一个ziplist

127.0.0.1:6379> lpush test 111
(integer) 2
127.0.0.1:6379> OBJECT ENCODING test
"quicklist"
typedef struct quicklistNode {
    struct quicklistNode *prev;
    struct quicklistNode *next;
    unsigned char *zl;              //ziplist ptr
    unsigned int sz;             /* ziplist size in bytes */
    unsigned int count : 16;     /* count of items in ziplist */
    unsigned int encoding : 2;   /* RAW==1 or LZF==2 */
    unsigned int container : 2;  /* NONE==1 or ZIPLIST==2 */
    unsigned int recompress : 1; /* was this node previous compressed? */
    unsigned int attempted_compress : 1; /* node can't compress; too small */
    unsigned int extra : 10; /* more bits to steal for future usage */
} quicklistNode;

[图片上传失败...(image-b3753c-1601286730477)]

ziplist简介

整体结构
/* The size of a ziplist header: two 32 bit integers for the total
 * bytes count and last item offset. One 16 bit integer for the number
 * of items field. */
// 两个32bit整数表示字节数和最后一个节点偏移,一个16bit整数表示节点数
#define ZIPLIST_HEADER_SIZE     (sizeof(uint32_t)*2+sizeof(uint16_t))

/* Size of the "end of ziplist" entry. Just one byte. */    // ziplist结尾,占用1字节
#define ZIPLIST_END_SIZE        (sizeof(uint8_t))

/* Create a new empty ziplist. */
unsigned char *ziplistNew(void) {
    unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
    unsigned char *zl = zmalloc(bytes);     // 分配下内存
    ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);    // 设置字节数
    ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);    //设置最后一个节点偏移
    ZIPLIST_LENGTH(zl) = 0;     // 长度0
    zl[bytes-1] = ZIP_END;      // 结尾固定为255 0xFF,支持双向遍历
    return zl;
}

[图片上传失败...(image-caafe1-1601286730477)]

节点结构

[图片上传失败...(image-efad29-1601286730477)]

  • pre_entry_length: 前一个节点的长度
    • 1 字节:如果前一节点的长度小于 254 字节,便使用一个字节保存它的值。
    • 5 字节:如果前一节点的长度大于等于 254 字节,那么将第 1 个字节的值设为 254 ,然后用接下来的 4 个字节保存实际长度。
  • encoding:编码格式,根据数据大小确定占用的内存,极致节约
  • length:content的内存占用
  • content:实际内容

[图片上传失败...(image-b9c6e5-1601286730477)]

3.2之前

ziplist or linkedlist

  • ziplist:连续内存,不需要每个节点都保存前后指针,内存占用少,但插入删除慢,适合数据少时使用
  • linkedlist:插入删除快。但每个节点需要保存前后指针,内存开销大。且内存不连续,容易产生内存碎片。

quicklist 优势

  • 结合ziplist和linkedlist的优点。
  • 双向链表前后插入删除快、单个节点是ziplist,减少内存开销,减少内存碎片。

配置参数

list-max-ziplist-size -2

我们来详细解释一下这个参数的含义。它可以取正值,也可以取负值。

当取正值的时候,表示按照数据项个数来限定每个quicklist节点上的ziplist长度。比如,当这个参数配置成5的时候,表示每个quicklist节点的ziplist最多包含5个数据项。

当取负值的时候,表示按照占用字节数来限定每个quicklist节点上的ziplist长度。这时,它只能取-1到-5这五个值,每个值含义如下:

  • -5: 每个quicklist节点上的ziplist大小不能超过64 Kb。(注:1kb => 1024 bytes)
  • -4: 每个quicklist节点上的ziplist大小不能超过32 Kb。
  • -3: 每个quicklist节点上的ziplist大小不能超过16 Kb。
  • -2: 每个quicklist节点上的ziplist大小不能超过8 Kb。(-2是Redis给出的默认值)
  • -1: 每个quicklist节点上的ziplist大小不能超过4 Kb。

list-compress-depth 0
这个参数表示一个quicklist两端不被压缩的节点个数。注:这里的节点个数是指quicklist双向链表的节点个数,而不是指ziplist里面的数据项个数。实际上,一个quicklist节点上的ziplist,如果被压缩,就是整体被压缩的。

参数list-compress-depth的取值含义如下:

  • 0: 是个特殊值,表示都不压缩。这是Redis的默认值。
  • 1: 表示quicklist两端各有1个节点不压缩,中间的节点压缩。
  • 2: 表示quicklist两端各有2个节点不压缩,中间的节点压缩。
  • 依此类推…

3、Hash

OBJ_ENCODING_ZIPLIST:
哈希对象保存的所有键值对的键和值的字符串长度都小于 64 字节;
哈希对象保存的键值对数量小于 512 个;

OBJ_ENCODING_HT: 不满于以上条件时。

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;
} dictEntry;

typedef struct dictType {   // 函数指针,为了每种类型对应自己的函数
    uint64_t (*hashFunction)(const void *key);          // hash 函数
    void *(*keyDup)(void *privdata, const void *key);   // key 复制函数
    void *(*valDup)(void *privdata, const void *obj);   // val 复制函数
    int (*keyCompare)(void *privdata, const void *key1, const void *key2); // key 比较函数
    void (*keyDestructor)(void *privdata, void *key);   // key 销毁函数
    void (*valDestructor)(void *privdata, void *obj);   // val 销毁函数
} dictType;

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
    dictEntry **table;
    unsigned long size;         // 空间,pow(2,n)
    unsigned long sizemask;     // size - 1,h = dictHashKey(ht, he->key) & n.sizemask; 用于确定桶的位置
    unsigned long used;         // 元素数
} dictht;

typedef struct dict {
    dictType *type;     // 函数接口
    void *privdata;     // 函数接口的可选参数
    dictht ht[2];       // 平时只有一个ht有值,rehash过程中两个都有值
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */  // rehash的进度
    unsigned long iterators; /* number of iterators currently running */
} dict;

/* Hash type hash table (note that small hashes are represented with ziplists) */
dictType hashDictType = {
    dictSdsHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictSdsKeyCompare,          /* key compare */
    dictSdsDestructor,          /* key destructor */
    dictSdsDestructor           /* val destructor */
};

/* Reset an hashtable already initialized with ht_init().
 * NOTE: This function should only called by ht_destroy(). */
static void _dictReset(dict *ht) {
    ht->table = NULL;
    ht->size = 0;
    ht->sizemask = 0;
    ht->used = 0;
}

/* Create a new hash table */
static dict *dictCreate(dictType *type, void *privDataPtr) {
    dict *ht = malloc(sizeof(*ht));
    _dictInit(ht,type,privDataPtr);
    return ht;
}

/* Initialize the hash table */
static int _dictInit(dict *ht, dictType *type, void *privDataPtr) {
    _dictReset(ht);
    ht->type = type;
    ht->privdata = privDataPtr;
    return DICT_OK;
}

[图片上传失败...(image-930d4f-1601286730477)]

  • 解决hash冲突,拉链法。
扩容与缩容
  1. 初始大小 4
  2. 扩容后size为第一个大于等于ht[0].used * 2 的 2^n
  3. 缩容后size为第一个大于等于 ht[0].used 的 2^n
负载因子为used/size
扩容时机
(1)负载因子>=1,且当前没有子进程在执行aof文件重写或者生成RDB文件

(2)负载因子>=5

缩容时机
    负载因子<0.1,且当前没有子进程在执行aof文件重写或者生成RDB文件
    
为什么限制BGSAVE 命令或 BGREWRITEAOF 命令执行时尽量不扩容缩容?
大多数操作系统都采用写时复制(copy-on-write)技术来优化子进程的使用效率,尽可能地避免在子进程存在期间进行哈希表扩展操作, 这可以避免不必要的内存写入操作, 最大限度地节约内存。

这里的内存修改主要是rehash过程中节点的指针修改。

关于写时复制可参考:https://juejin.im/post/6844903702373859335

渐进式rehash

  1. 为h[1]分配空间,同时存在两个dictht.
  2. rehashidx设为0,开始rehash
  3. 在 rehash 进行期间, 每次对字典执行添加、删除、查找或者更新操作时, 程序除了执行指定的操作以外, 还会顺带将 ht[0] 哈希表在 rehashidx 索引上的所有键值对 rehash 到 ht[1] , 当 rehash 工作完成之后, 程序将 rehashidx 属性的值增一。
  4. 当h[0]所有键值都被rehash到h[1],free h[0].table, h[0]=h[1], 重置h[1], rehashidx设为-1。rehash完成。
/* Performs N steps of incremental rehashing. Returns 1 if there are still
 * keys to move from the old to the new hash table, otherwise 0 is returned.
 *
 * Note that a rehashing step consists in moving a bucket (that may have more
 * than one key as we use chaining) from the old to the new hash table, however
 * since part of the hash table may be composed of empty spaces, it is not
 * guaranteed that this function will rehash even a single bucket, since it
 * will visit at max N*10 empty buckets in total, otherwise the amount of
 * work it does would be unbound and the function may block for a long time. */
int dictRehash(dict *d, int n) {
    int empty_visits = n*10; /* Max number of empty buckets to visit. */
    if (!dictIsRehashing(d)) return 0;

    while(n-- && d->ht[0].used != 0) {
        dictEntry *de, *nextde;

        /* Note that rehashidx can't overflow as we are sure there are more
         * elements because ht[0].used != 0 */
        assert(d->ht[0].size > (unsigned long)d->rehashidx);
        while(d->ht[0].table[d->rehashidx] == NULL) {
            d->rehashidx++;
            if (--empty_visits == 0) return 1;
        }
        de = d->ht[0].table[d->rehashidx];
        /* Move all the keys in this bucket from the old to the new hash HT */
        while(de) {
            uint64_t h;

            nextde = de->next;
            /* Get the index in the new hash table */
            h = dictHashKey(d, de->key) & d->ht[1].sizemask;
            de->next = d->ht[1].table[h];
            d->ht[1].table[h] = de;
            d->ht[0].used--;
            d->ht[1].used++;
            de = nextde;
        }
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* Check if we already rehashed the whole table... */
    if (d->ht[0].used == 0) {
        zfree(d->ht[0].table);
        d->ht[0] = d->ht[1];
        _dictReset(&d->ht[1]);
        d->rehashidx = -1;
        return 0;
    }

    /* More to rehash... */
    return 1;
}

rehash期间操作处理

  1. 删除、查找、更新在两个哈希表进行。
  2. 添加只在h[1]进行,h[0]只减不增,慢慢变空。

4、Set

  • type: OBJ_SET
  • encoding: OBJ_ENCODING_HT、 OBJ_ENCODING_INTSET

都是整数且数量不超过512使用INTSET,否则使用hashtable

/* Note that these encodings are ordered, so:
 * INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */
#define INTSET_ENC_INT16 (sizeof(int16_t))
#define INTSET_ENC_INT32 (sizeof(int32_t))
#define INTSET_ENC_INT64 (sizeof(int64_t))

typedef struct intset {
    uint32_t encoding;   // 以上三种之一,随着存储数据大小变更
    uint32_t length;
    int8_t contents[];
} intset;

static uint8_t _intsetValueEncoding(int64_t v) {
    if (v < INT32_MIN || v > INT32_MAX)
    |   return INTSET_ENC_INT64;
    else if (v < INT16_MIN || v > INT16_MAX)
    |   return INTSET_ENC_INT32;
    else
    |   return INTSET_ENC_INT16;
}

// t_set.c
/* Add the specified value into a set.
 *
 * If the value was already member of the set, nothing is done and 0 is
 * returned, otherwise the new element is added and 1 is returned. */
int setTypeAdd(robj *subject, sds value) {
    long long llval;
    if (subject->encoding == OBJ_ENCODING_HT) {
    |   dict *ht = subject->ptr;
    |   dictEntry *de = dictAddRaw(ht,value,NULL);
    |   if (de) {
    |   |   dictSetKey(ht,de,sdsdup(value));
    |   |   dictSetVal(ht,de,NULL);
    |   |   return 1;
    |   }
    } else if (subject->encoding == OBJ_ENCODING_INTSET) {
    |   if (isSdsRepresentableAsLongLong(value,&llval) == C_OK) {
    |   |   uint8_t success = 0;
    |   |   subject->ptr = intsetAdd(subject->ptr,llval,&success);
    |   |   if (success) {
    |   |   |   /* Convert to regular set when the intset contains
    |   |   |   |* too many entries. */
    |   |   |   if (intsetLen(subject->ptr) > server.set_max_intset_entries)
    |   |   |   |   setTypeConvert(subject,OBJ_ENCODING_HT);
    |   |   |   return 1;
    |   |   }
    |   } else {
    |   |   /* Failed to get integer from object, convert to regular set. */
    |   |   setTypeConvert(subject,OBJ_ENCODING_HT);

    |   |   /* The set *was* an intset and this value is not integer
    |   |   |* encodable, so dictAdd should always work. */
    |   |   serverAssert(dictAdd(subject->ptr,sdsdup(value),NULL) == DICT_OK);
    |   |   return 1;
    |   }
    } else {
    |   serverPanic("Unknown set encoding");
    }
    return 0;
}

add过程

  1. 当前是OBJ_ENCODING_HT,直接用hashtable的规则插入,值为NULL
  2. 当前是OBJ_ENCODING_INTSET,且要add的值是Int,add到set中,之后判断数量是否大于set_max_intset_entries(512),大于则转为hashtable
  3. 当前是OBJ_ENCODING_INTSET,且要add的值不是Int,先转换再add

intset插入过程

/* Insert an integer in the intset */
intset *intsetAdd(intset *is, int64_t value, uint8_t *success) {
    uint8_t valenc = _intsetValueEncoding(value);   // 计算value所需的encoding
    uint32_t pos;
    if (success) *success = 1;  // 默认成功

    /* Upgrade encoding if necessary. If we need to upgrade, we know that
    |* this value should be either appended (if > 0) or prepended (if < 0),
    |* because it lies outside the range of existing values. */
    if (valenc > intrev32ifbe(is->encoding)) {
    |   /* This always succeeds, so we don't need to curry *success. */
    |   return intsetUpgradeAndAdd(is,value);   // 提升encoding并add
    } else {
    |   /* Abort if the value is already present in the set.
    |   |* This call will populate "pos" with the right position to insert
    |   |* the value when it cannot be found. */
    |   if (intsetSearch(is,value,&pos)) {  // 存在返回1,pos为查到的位置. 不存在返回0,pos为要插入的位置
    |   |   if (success) *success = 0;  // 已经有了,失败
    |   |   return is;
    |   }

    |   is = intsetResize(is,intrev32ifbe(is->length)+1);   // 分配内存
    |   if (pos < intrev32ifbe(is->length)) intsetMoveTail(is,pos,pos+1); // 把pos的内存移到pos+1,留出空间  
    }

    _intsetSet(is,pos,value);   // 插入
    is->length = intrev32ifbe(intrev32ifbe(is->length)+1);  // 修改length
    return is;
}
  1. 需要提升encoding时,先提升,再插入。
  2. 不需要提升encoding时,如果已存在,返回失败。不存在则扩容,memmove,插入,修改length。

5、ZSet

  • type: OBJ_ZSET
  • encoding: OBJ_ENCODING_ZIPLIST、OBJ_ENCODING_SKIPLIST

元素数量少于128个,所有元素小于64字节时使用ziplist,其他情况使用skiplist

skiplist

https://15721.courses.cs.cmu.edu/spring2018/papers/08-oltpindexes1/pugh-skiplists-cacm1990.pdf

 * This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated scores.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows to traverse the list
 * from tail to head, useful for ZREVRANGE. */
// 基于论文Skip Lists: A Probabilistic Alternative to Balanced Trees,三点修改

// a. 支持重复score
// b. 不仅仅通过key比较,还通过其他数据
// c. 第一层有一个后退指针,可以从尾到头遍历,对ZREVRANGE很有用


#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^64 elements */
#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */

/* ZSETs use a specialized version of Skiplists */
typedef struct zskiplistNode {
    sds ele;            // 成员数据
    double score;       // 分数
    struct zskiplistNode *backward;     // 第一层链表中指向前一个节点的指针
    struct zskiplistLevel {     // 层
    |   struct zskiplistNode *forward;  // 指向下一个节点的指针
    |   unsigned long span;     // 节点跨度,用于计算排名
    } level[];
} zskiplistNode;

typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;   // 节点总数
    int level;          // 跳表的最大层数
} zskiplist;

typedef struct zset {
    dict *dict;     // 存储成员及分值,用于快速索引
    zskiplist *zsl;     // 跳跃表,按分值排序成员。
} zset;

[图片上传失败...(image-e010fd-1601286730477)]

总结下内存节约

减少数据存储时占用内存

  1. sds紧凑排列,分为四种长度,区分长短字符串使用不同存储方式。
  2. ziplist不需要存储前后指针,且每个节点根据数据类型使用尽量小的内存存储。
  3. intset结构,根据数据大小动态调整encoding。
  4. BGSAVE 命令或 BGREWRITEAOF 命令执行时,提高hashtable负载因子,尽量不进行rehash

连续内存,减少内存碎片

  1. quicklist的node为ziplist
  2. hash在全整数时使用intset

参考资料

  • redis设计与实现
  • redis6.0.3源码
  • 一些技术博客

http://zhangtielei.com/posts/blog-redis-skiplist.html

https://juejin.im/post/6854573221237719048

https://juejin.im/post/6844903618890432526

你可能感兴趣的:(redis数据结构底层实现)