Redis源码分析之数据结构

本篇我们来看一下Redis的数据存储结构。

数据库

Redis的数据库对应的结构体是redisDb,对应的结构体定义如下:

typedef struct redisDb {
    dict *dict;                 /* The keyspace for this DB */
    dict *expires;              /* Timeout of keys with a timeout set */
    dict *blocking_keys;        /* Keys with clients waiting for data (BLPOP)*/
    dict *ready_keys;           /* Blocked keys that received a PUSH */
    dict *watched_keys;         /* WATCHED keys for MULTI/EXEC CAS */
    int id;                     /* Database ID */
    long long avg_ttl;          /* Average TTL, just for stats */
    unsigned long expires_cursor; /* Cursor of the active expire cycle. */
    list *defrag_later;         /* List of key names to attempt to defrag one by one, gradually. */
} redisDb;
  1. dict中存储的是所有数据库中的数据,键为sds,值为robj。robj的定义如下:

    #define OBJ_STRING 0    /* String object. */
    #define OBJ_LIST 1      /* List object. */
    #define OBJ_SET 2       /* Set object. */
    #define OBJ_ZSET 3      /* Sorted set object. */
    #define OBJ_HASH 4      /* Hash object. */
    #define OBJ_MODULE 5    /* Module object. */
    #define OBJ_STREAM 6    /* Stream object. */
    #define OBJ_ENCODING_RAW 0     /* Raw representation */
    #define OBJ_ENCODING_INT 1     /* Encoded as integer */
    #define OBJ_ENCODING_HT 2      /* Encoded as hash table */
    #define OBJ_ENCODING_ZIPMAP 3  /* Encoded as zipmap */
    #define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */
    #define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
    #define OBJ_ENCODING_INTSET 6  /* Encoded as intset */
    #define OBJ_ENCODING_SKIPLIST 7  /* Encoded as skiplist */
    #define OBJ_ENCODING_EMBSTR 8  /* Embedded sds string encoding */
    #define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of ziplists */
    #define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */
    typedef struct redisObject {
        unsigned type:4;
        unsigned encoding:4;
        unsigned lru:24; // LRU time (relative to global lru_clock) or LFU data (least significant 8 bits frequency and most significant 16 bits access time).
        int refcount;
        void *ptr;
    } robj;
    
  2. expires中存储键和对应的过期时间。

  3. blocking_keys中存储因为执行阻塞操作而造成阻塞的key,如客户端执行了blpop yourlist,但是yourlist列表中没有元素,这是客户端会被阻塞。blocking_keys的键是对应的key,值是被这个key阻塞的客户端的列表。

  4. ready_keys中存储解除阻塞的key,如被blpop阻塞的客户端在对应列表有数据时应该被解除阻塞,ready_keys的键是对应的key,值是NULL,用于去重。

  5. watched_keys用于存储被WATCH命令监视修改的key,键是对应的key,值是watch这个key的客户端的列表。

  6. id是数据库的标识。

  7. avg_ttl存储主动删除过期键时,未过期的key的平均ttl。

  8. expires_cursor存储主动删除过期键时,删除的游标(实际是expires哈希表的entry数组的下标),这是由于在过期键很多时,不会一次性清除所有过期键,而是先清除部分,后面再清除剩余的部分。

  9. defrag_later存储需要碎片整理的key。

接下来我们来看一下Redis的不同类型的robj。

字符串

字符串类型的robj类型为OBJ_STRING,encoding为OBJ_ENCODING_RAW、OBJ_ENCODING_EMBSTR或OBJ_ENCODING_INT,ptr指向一个sds或直接一个long值:

typedef char *sds;
struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

sds实际上可以认为是一个变长的动态字节数组,根据flags,我们可以确定sds头的长度,进而确定sds的长度。OBJ_ENCODING_EMBSTR与OBJ_ENCODING_RAW的区别是,OBJ_ENCODING_EMBSTR存储不可变的短字符串,而且OBJ_ENCODING_EMBSTR的存储结构将robj和sds分配在连续的内存空间中。

列表

列表类型的robj类型为OBJ_LIST,encoding为OBJ_ENCODING_QUICKLIST,ptr指向一个quicklist:

#define QL_FILL_BITS 16
#define QL_COMP_BITS 16
#define QL_BM_BITS 4
#define QUICKLIST_NODE_ENCODING_RAW 1
#define QUICKLIST_NODE_ENCODING_LZF 2
typedef struct quicklist {
    quicklistNode *head;
    quicklistNode *tail;
    unsigned long count;        /* total count of all entries in all ziplists */
    unsigned long len;          /* number of quicklistNodes */
    int fill : QL_FILL_BITS;              /* fill factor for individual nodes */
    unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
    unsigned int bookmark_count: QL_BM_BITS; /* for defragment */
    quicklistBookmark bookmarks[]; /* for defragment */
} quicklist;
typedef struct quicklistNode {
    struct quicklistNode *prev;
    struct quicklistNode *next;
    unsigned char *zl;           /* ziplist */
    unsigned int sz;             /* ziplist size in bytes */
    unsigned int count : 16;     /* count of items in ziplist */
    unsigned int encoding : 2;   /* RAW==1 or LZF==2 */
    unsigned int container : 2;  /* ZIPLIST==2 */
    unsigned int recompress : 1; /* was this node previous compressed? */
    unsigned int attempted_compress : 1; /* node can't compress; too small */
    unsigned int extra : 10; /* more bits to steal for future usage */
} quicklistNode;
typedef struct quicklistBookmark {
    quicklistNode *node;
    char *name;
} quicklistBookmark;

quicklist是一个双端链表结构,每一个节点是一个ziplist,为什么要存储一个ziplist而不是直接存储数据呢,这是因为如果直接存储数据,每一条数据都需要新建一个quicklistNode结构体,而quicklistNode结构体本身会占据一些空间。quicklistNode.zl可能直接存储一个ziplist,也可能存储一个由LZF算法压缩过的ziplist,取决于quicklistNode.encoding,QUICKLIST_NODE_ENCODING_RAW代表未压缩,QUICKLIST_NODE_ENCODING_LZF代表是压缩后的节点。

quicklist.fill来源于配置list_max_ziplist_size,表示一个ziplist最多存储多少个元素。quicklist.compress来源于配置list_compress_depth,在压缩一个节点时,会同时尝试压缩quicklist的第compress个节点和倒数第compress个节点,同时在quicklist的长度小于compress*2的值时,不会压缩quicklist里的节点。

ziplist的存储结构如下:

总存储长度 最后一个元素偏移量 元素数 前一节点大小 节点1 ... 前一节点大小 节点n 结尾
4字节 4字节 2字节 1字节或5字节 - ... 1字节或5字节 - 1字节(0xff)
集合

集合可能的存储格式有两种,一是intset,在元素全部是整数的时候用这个结构存储,对应的encoding是OBJ_ENCODING_INTSET,一旦包含了非整数,存储结构就会退化为dict,对应的encoding为OBJ_ENCODING_HT。这里只介绍intset,dict放到哈希表时讲。

#define INTSET_ENC_INT16 (sizeof(int16_t))
#define INTSET_ENC_INT32 (sizeof(int32_t))
#define INTSET_ENC_INT64 (sizeof(int64_t))
typedef struct intset {
    uint32_t encoding;
    uint32_t length;
    int8_t contents[];
} intset;

intset.encoding存储每个元素的大小,在遇到encoding对应的整型不能表示的整数时,整个集合会升级,每个元素占用的空间会变大,intset.length存储元素的总数。intset.contents中存储所有的元素,它们是有序的,每次插入元素时,会进行二分查找,找到对应的插入位置,将后面的元素后移,然后插入元素到对应位置。

有序集合

有序集合的可能的存储格式有两种,ziplist(对应的encoding为OBJ_ENCODING_ZIPLIST)和zset(对应的encoding为OBJ_ENCODING_SKIPLIST),在所有键的长度都不超过zset_max_ziplist_value且集合中的元素总数不超过zset_max_ziplist_entries时存储结构为ziplist,否则为zset。ziplist我们前面已经介绍过了就不再重复介绍了,我们这里只介绍zset:

#define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define OBJ_ENCODING_SKIPLIST 7  /* Encoded as skiplist */
typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;

zset.dict的键是成员名,值是成员对应的分数。zset.zsl是一个跳表结构,每个节点存储了成员名和成员对应的分数,且是按分数排好序的。

typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
typedef struct zskiplistNode {
    sds ele;
    double score;
    struct zskiplistNode *backward;
    struct zskiplistLevel {
        struct zskiplistNode *forward;
        unsigned long span;
    } level[];
} zskiplistNode;

跳表是一类快速查找的数据结构,每个节点存储有多个指向不同跨度的下一个节点的指针,在查找节点时,先从最高层(最稀疏层)开始查找,找到最接近的一个节点时,再从下一层查找,一直找到最底层。

在插入跳表时,首先找到插入位置,然后生成一个随机数(按概率,比如50%的概率1,25%的概率2...)确定需要索引的层级,更新这些层级的相邻节点的指针并插入。

哈希表

哈希表的可能的存储格式有两种,ziplist(对应的encoding为OBJ_ENCODING_ZIPLIST)和dict(对应的encoding为OBJ_ENCODING_HT),在元素总数不超过hash_max_ziplist_entries时存储结构为ziplist,否则为dict。

#define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define OBJ_ENCODING_HT 2      /* Encoded as hash table */
#define HASH_SET_TAKE_FIELD (1<<0)
#define HASH_SET_TAKE_VALUE (1<<1)
#define HASH_SET_COPY 0
typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

数据存储在dictht中,每个dict有两个dictht,这是为了rehash时尽量不影响吞吐量,在rehash时,rehashidx存储了一个游标,值是rehash的进度,也就是有多少个entry从ht[0]移到了ht[1]。

typedef struct dictht {
    dictEntry **table;
    unsigned long size;
    unsigned long sizemask;
    unsigned long used;
} dictht;
typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;
} dictEntry;

dictht.used存储有多少个元素。

模块自定义数据
struct RedisModule {
    void *handle;   /* Module dlopen() handle. */
    char *name;     /* Module name. */
    int ver;        /* Module version. We use just progressive integers. */
    int apiver;     /* Module API version as requested during initialization.*/
    list *types;    /* Module data types. */
    list *usedby;   /* List of modules using APIs from this one. */
    list *using;    /* List of modules we use some APIs of. */
    list *filters;  /* List of filters the module has registered. */
    int in_call;    /* RM_Call() nesting level */
    int in_hook;    /* Hooks callback nesting level for this module (0 or 1). */
    int options;    /* Module options and capabilities. */
    int blocked_clients;         /* Count of RedisModuleBlockedClient in this module. */
    RedisModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */
};
typedef struct RedisModuleType {
    uint64_t id; /* Higher 54 bits of type ID + 10 lower bits of encoding ver. */
    struct RedisModule *module;
    moduleTypeLoadFunc rdb_load;
    moduleTypeSaveFunc rdb_save;
    moduleTypeRewriteFunc aof_rewrite;
    moduleTypeMemUsageFunc mem_usage;
    moduleTypeDigestFunc digest;
    moduleTypeFreeFunc free;
    moduleTypeAuxLoadFunc aux_load;
    moduleTypeAuxSaveFunc aux_save;
    int aux_save_triggers;
    char name[10]; /* 9 bytes name + null term. Charset: A-Z a-z 0-9 _- */
} moduleType;
typedef struct moduleValue {
    moduleType *type;
    void *value;
} moduleValue;

存储模块的自定义数据类型。

消息队列
typedef struct streamID {
    uint64_t ms;        /* Unix time in milliseconds. */
    uint64_t seq;       /* Sequence number. */
} streamID;
typedef struct raxNode {
    uint32_t iskey:1;     /* Does this node contain a key? */
    uint32_t isnull:1;    /* Associated value is NULL (don't store it). */
    uint32_t iscompr:1;   /* Node is compressed. */
    uint32_t size:29;     /* Number of children, or compressed string len. */
    unsigned char data[];
} raxNode;
typedef struct rax {
    raxNode *head;
    uint64_t numele;
    uint64_t numnodes;
} rax;
typedef struct stream {
    rax *rax;               /* The radix tree holding the stream. */
    uint64_t length;        /* Number of elements inside this stream. */
    streamID last_id;       /* Zero if there are yet no items. */
    rax *cgroups;           /* Consumer groups dictionary: name -> streamCG */
} stream;

消息队列对应的存储结构是stream,stream.rax存储数据,stream.streamID存储最后入队的元素的ID信息,stream.length存储元素数,stream.cgroups存储消费组信息。rax是基数树(radix tree)存储结构,用于快速查找元素。

你可能感兴趣的:(Redis源码分析之数据结构)