Memcached源码分析 - 消息回应（7）

Memcached源码分析 - 网络模型（1）
Memcached源码分析 - 命令解析（2）
Memcached源码分析 - 数据存储（3）
Memcached源码分析 - 增删改查操作（4）
Memcached源码分析 - 内存存储机制Slabs（5）
Memcached源码分析 - LRU淘汰算法（6）
Memcached源码分析 - 消息回应（7）

开篇

这篇文章作为Memcached源码系列的最后一篇文章，主要讲解Memcached响应请求的流程。整个过程我总结为协议部分、准备发送报文、执行报文发送、结束报文发送四大块内容。
整个内容其实也是借鉴了前人的经验，按照惯例在参考文献列出对应的参考文章以示尊重。

协议部分

这部分协议在memcached的github上的文档上找到，我截取了get命令的响应报文格式，根据get的key的个数返回内容并以END\r\n结尾。
VALUE <key> <flags> <bytes> [<cas unique>]\r\n
<data block>\r\n
VALUE <key> <flags> <bytes> [<cas unique>]\r\n
<data block>\r\n
END\r\n

Retrieval command:
------------------
The retrieval commands "get" and "gets" operate like this:

get <key>*\r\n
gets <key>*\r\n

- <key>* means one or more key strings separated by whitespace.

After this command, the client expects zero or more items, each of
which is received as a text line followed by a data block. After all
the items have been transmitted, the server sends the string

"END\r\n"

to indicate the end of response.

Each item sent by the server looks like this:

VALUE <key> <flags> <bytes> [<cas unique>]\r\n
<data block>\r\n

- <key> is the key for the item being sent

- <flags> is the flags value set by the storage command

- <bytes> is the length of the data block to follow, *not* including
  its delimiting \r\n

- <cas unique> is a unique 64-bit integer that uniquely identifies
  this specific item.

- <data block> is the data for this item.

server端响应报文的数据结构

数据结构图

说明：

待发送的数据登记在c->msglist当中：当前正在填充的节点是c->msglist[c->msgused - 1]，该节点写满后就追加一个新的msghdr节点继续写。
msglist中的元素是struct msghdr结构，每个msghdr通过msg_iov指向c->iov数组中的一段iovec，iovec再指向真正的数据缓冲区；单个msghdr的iovec个数达到IOV_MAX后就认为写满了。

结构相关的数据结构

重点关注下conn_new方法中核心变量的初始化。

// Allocate the iovec array; c->iovsize starts at IOV_LIST_INITIAL (400).
c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize);
// Allocate the msghdr array; c->msgsize starts at MSG_LIST_INITIAL (10).
c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize);

/*
 * Abridged view of memcached's per-connection state: only the fields that
 * take part in building and sending a response are listed here (other code
 * in this article uses further members such as rbuf and suffixlist).
 */
struct conn {
    char   *wbuf;
    char   *wcurr;
    int    wsize;
    int    wbytes;
    /** which state to go into after finishing current write */
    enum conn_states  write_and_go;
    void   *write_and_free; /** free this memory after finishing writing */

    char   *ritem;  /** when we read in an item's value, it goes here */
    int    rlbytes;
    void   *item;     /* for commands set/add/replace  */
    int    sbytes;    /* how many bytes to swallow */

    // Array of iovec entries that the msghdrs in msglist point into.
    // conn_new allocates IOV_LIST_INITIAL (400) entries up front; the article
    // cites a high-water mark of 600 (IOV_LIST_HIGHWAT; value not shown here).
    struct iovec *iov;
    // Capacity of iov[].
    int    iovsize;   /* number of elements allocated in iov[] */
    // How many iov[] entries are currently in use.
    int    iovused;   /* number of elements used in iov[] */

    // Array of msghdr structures describing the pending response.
    // conn_new allocates MSG_LIST_INITIAL (10) of them up front.
    struct msghdr *msglist;
    // Capacity of msglist[]; starts at 10 and is doubled by realloc when it
    // runs out. The article cites a high-water mark of 100
    // (MSG_LIST_HIGHWAT; value not shown here).
    int    msgsize;   /* number of elements allocated in msglist[] */
    // How many msglist[] entries are currently in use.
    int    msgused;   /* number of elements used in msglist[] */
    
    // Tracks which msglist entries have already been sent and which are
    // still pending.
    int    msgcurr;   /* element in msglist[] being transmitted now */
    int    msgbytes;  /* number of bytes in current msg */

    item   **ilist;   /* list of items to write out */
    int    isize;
    item   **icurr;
    int    ileft;
};


#include <sys/socket.h>
struct msghdr  { 
    void  * msg_name ;   / *  消息的协议地址  * / 
    socklen_t msg_namelen ;   / *  地址的长度  * / 
    struct iovec  * msg_iov ;   / *  多io缓冲区的地址  * / 
    int  msg_iovlen ;   / *  缓冲区的个数  * / 
    void  * msg_control ;   / *  辅助数据的地址  * / 
    socklen_t msg_controllen ;   / *  辅助数据的长度  * / 
    int  msg_flags ;   / *  接收消息的标识  * / 
} ;

/*
 * One scatter/gather element: a pointer to a buffer plus the number of bytes
 * to transfer from/to it. Reproduced here for reference.
 */
struct iovec {
    void *iov_base;  /* start address of this buffer; each entry may point at a different buffer */
    size_t iov_len;  /* number of bytes of data in this buffer */
};


/* Initial sizes of the per-connection buffers allocated by conn_new(). */

/* Bytes: initial write-buffer size (c->wsize); conn_shrink() also shrinks an
 * oversized read buffer back down to this size. */
#define DATA_BUFFER_SIZE 2048
/* Entries: initial capacity of c->ilist (c->isize). */
#define ITEM_LIST_INITIAL 200
/* Entries: initial capacity of c->suffixlist (c->suffixsize). */
#define SUFFIX_LIST_INITIAL 100
/* Entries: initial capacity of c->iov (c->iovsize). */
#define IOV_LIST_INITIAL 400
/* Entries: initial capacity of c->msglist (c->msgsize). */
#define MSG_LIST_INITIAL 10

/*
 * Return the connection object for socket `sfd`, creating it and caching it
 * in conns[] the first time the descriptor is seen.
 *
 * A newly created connection gets its buffers sized from the *_INITIAL
 * constants (the read buffer from `read_buffer_size`): a read buffer, a write
 * buffer, the item list, the suffix list, the iovec array and the msghdr
 * array.
 *
 * Parameters init_state, event_flags, transport and base are not used in the
 * part of the function shown here; they are consumed by the elided code.
 *
 * Returns the connection, or NULL when an allocation fails.
 */
conn *conn_new(const int sfd, enum conn_states init_state,
                const int event_flags,
                const int read_buffer_size, enum network_transport transport,
                struct event_base *base) {

    conn *c;
    c = conns[sfd];

    if (NULL == c) {
        if (!(c = (conn *)calloc(1, sizeof(conn)))) {
           // (failure accounting/logging omitted in this excerpt)
           // Without this return the code below would dereference NULL.
           return NULL;
        }

        c->rbuf = c->wbuf = 0;
        c->ilist = 0;
        c->suffixlist = 0;
        c->iov = 0;
        c->msglist = 0;
        c->hdrbuf = 0;

        c->rsize = read_buffer_size;
        c->wsize = DATA_BUFFER_SIZE;
        c->isize = ITEM_LIST_INITIAL;
        c->suffixsize = SUFFIX_LIST_INITIAL;
        c->iovsize = IOV_LIST_INITIAL;
        c->msgsize = MSG_LIST_INITIAL;
        c->hdrsize = 0;

        c->rbuf = (char *)malloc((size_t)c->rsize);
        c->wbuf = (char *)malloc((size_t)c->wsize);
        c->ilist = (item **)malloc(sizeof(item *) * c->isize);
        c->suffixlist = (char **)malloc(sizeof(char *) * c->suffixsize);
        // iov starts with IOV_LIST_INITIAL (400) entries
        c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize);
        // msglist starts with MSG_LIST_INITIAL (10) entries
        c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize);

        // Any failed allocation makes the connection unusable: release
        // whatever was obtained (free(NULL) is a no-op) and report failure
        // before the object is published in conns[].
        if (c->rbuf == NULL || c->wbuf == NULL || c->ilist == NULL ||
            c->suffixlist == NULL || c->iov == NULL || c->msglist == NULL) {
            free(c->rbuf);
            free(c->wbuf);
            free(c->ilist);
            free(c->suffixlist);
            free(c->iov);
            free(c->msglist);
            free(c);
            return NULL;
        }

        c->sfd = sfd;
        conns[sfd] = c;
    }
    // (remaining initialisation omitted in this excerpt)
    return c;
}

准备发送报文

准备发送报文的过程就是遍历get 命令指定的keys逐个进行获取保存到发送队列数据结构当中。整个过程如下：

do/while双层循环保证所有key完成遍历。
获取key对应的变量 it = limited_get(key, nkey, c, exptime, should_touch)。
通过add_iov方法按照协议格式写入响应的报文，内部细节进一步分析。
写完以后将状态设置为conn_mwrite进入报文发送流程

/*
 * Build the response to an ASCII "get"/"gets" request.
 *
 * Walks every key token, looks each key up with limited_get(), and for every
 * hit queues the pieces of a "VALUE ..." record on the connection's
 * msghdr/iovec lists through add_iov(). The item is remembered in c->ilist
 * so it can be released after transmission. Finally "END\r\n" is queued and
 * the connection is switched to conn_mwrite so the state machine sends it.
 *
 * c            connection the request arrived on (must not be NULL)
 * tokens       tokenised command line; keys start at tokens[KEY_TOKEN]
 * ntokens      number of tokens (reassigned when more keys are tokenised)
 * return_cas   true when the suffix should be built for a "gets" style reply
 * should_touch forwarded to limited_get()
 *
 * If any piece cannot be queued, the current item is released with
 * item_remove(), processing stops, and conn_release_items() is called
 * instead of entering conn_mwrite.
 */
static inline void process_get_command(conn *c, token_t *tokens, size_t ntokens, bool return_cas, bool should_touch) {
    char *key;
    size_t nkey;
    int i = 0;
    int si = 0;
    item *it;
    token_t *key_token = &tokens[KEY_TOKEN];
    char *suffix;
    int32_t exptime_int = 0;
    rel_time_t exptime = 0;
    bool fail_length = false;
    assert(c != NULL);

    // (related code omitted in this excerpt)
    do {
        while(key_token->length != 0) {

            key = key_token->value;
            nkey = key_token->length;
            
            // Look the item up
            it = limited_get(key, nkey, c, exptime, should_touch);

            if (it) {

                // Assemble the record in the format the protocol prescribes;
                // the terminating "END\r\n" is appended after all keys.
                if (return_cas || !settings.inline_ascii_response)
                {
                  int nbytes;
                  // NOTE(review): if _ascii_get_suffix_buf() can return NULL
                  // that case needs handling here -- confirm.
                  suffix = _ascii_get_suffix_buf(c, si);

                  si++;
                  nbytes = it->nbytes;
                  // Build the suffix; per the article it also carries the
                  // data length, so suffix_len covers both parts
                  // (make_ascii_get_suffix is not shown here).
                  int suffix_len = make_ascii_get_suffix(suffix, it, return_cas, nbytes);
                  // Queue "VALUE ", the key, the flags and the suffix. On
                  // failure drop our reference and abort, exactly like the
                  // inline-response branch below does.
                  if (add_iov(c, "VALUE ", 6) != 0 ||
                      add_iov(c, ITEM_key(it), it->nkey) != 0 ||
                      (settings.inline_ascii_response && add_iov(c, ITEM_suffix(it), it->nsuffix - 2) != 0) ||
                      add_iov(c, suffix, suffix_len) != 0)
                      {
                          item_remove(it);
                          goto stop;
                      }

                  if ((it->it_flags & ITEM_CHUNKED) == 0) {
                      if (add_iov(c, ITEM_data(it), it->nbytes) != 0) {
                          item_remove(it);
                          goto stop;
                      }
                  } else if (add_chunked_item_iovs(c, it, it->nbytes) != 0) {
                      item_remove(it);
                      goto stop;
                  }
                }
                else
                {
                  MEMCACHED_COMMAND_GET(c->sfd, ITEM_key(it), it->nkey,
                                        it->nbytes, ITEM_get_cas(it));
                  if (add_iov(c, "VALUE ", 6) != 0 ||
                      add_iov(c, ITEM_key(it), it->nkey) != 0)
                      {
                          item_remove(it);
                          goto stop;
                      }
                  if ((it->it_flags & ITEM_CHUNKED) == 0)
                      {
                          if (add_iov(c, ITEM_suffix(it), it->nsuffix + it->nbytes) != 0)
                          {
                              item_remove(it);
                              goto stop;
                          }
                      } else if (add_iov(c, ITEM_suffix(it), it->nsuffix) != 0 ||
                                 add_chunked_item_iovs(c, it, it->nbytes) != 0) {
                          item_remove(it);
                          goto stop;
                      }
                }
                
                // Remember the item in c->ilist
                *(c->ilist + i) = it;
                i++;
            } else {
                // (related code omitted in this excerpt)
            }

            key_token++;
        }

        // A non-NULL value on the terminating token means more keys remain
        // untokenised; tokenise the rest and keep going.
        if(key_token->value != NULL) {
            ntokens = tokenize_command(key_token->value, tokens, MAX_TOKENS);
            key_token = tokens;
        }

    } while(key_token->value != NULL);
stop:

    c->icurr = c->ilist;
    c->ileft = i;
    if (return_cas || !settings.inline_ascii_response) {
        c->suffixcurr = c->suffixlist;
        c->suffixleft = si;
    }
    
    // Queue the closing "END\r\n". key_token->value is still non-NULL when
    // we arrived here through `goto stop`, which selects the failure path.
    if (key_token->value != NULL || add_iov(c, "END\r\n", 5) != 0
        || (IS_UDP(c->transport) && build_udp_headers(c) != 0)) {
        conn_release_items(c);
    }
    else {
        // The key step: switch the connection to the write state
        conn_set_state(c, conn_mwrite);
        c->msgcurr = 0;
    }
}

add_iov就是把数据保存到msglist中的msghdr当中，期间对于需要扩容的数据结构按照2倍的速率进行扩容直至达到上限值。

负责保存发送的数据
m = &c->msglist[c->msgused - 1]
m->msg_iov[m->msg_iovlen].iov_base = (void *)buf;
m->msg_iov[m->msg_iovlen].iov_len = len;
c->msgbytes += len;
c->iovused++;
m->msg_iovlen++;
add_msghdr在c->msglist末尾追加一个新的msghdr，空间不足时先对c->msglist按2倍扩容。
ensure_iov_space在c->iov用满时按2倍扩容，并让各msghdr的msg_iov重新指向新数组。

/*
 * Queue `len` bytes starting at `buf` for transmission on connection `c`.
 *
 * Only the pointer and length are recorded in the next free iovec of the
 * current msghdr (c->msglist[c->msgused - 1]); the bytes are not copied, so
 * `buf` must stay valid until the message has been sent.
 *
 * Returns 0 on success, -1 if a new msghdr or more iovec space could not be
 * obtained.
 */
static int add_iov(conn *c, const void *buf, int len) {
    struct msghdr *m;
    int leftover;

    assert(c != NULL);

    if (IS_UDP(c->transport)) {
        // Only the TCP path is discussed here; the UDP path is omitted.
    } else {

        m = &c->msglist[c->msgused - 1];
        // The current msghdr already holds IOV_MAX iovecs: start a new one
        // (add_msghdr doubles msglist when it has no free slot). If that
        // fails we must not write past the full msghdr.
        if (m->msg_iovlen == IOV_MAX) {
            if (add_msghdr(c) != 0)
                return -1;
            m = &c->msglist[c->msgused - 1];
        }

        // Make sure c->iov has a free entry (doubles the array when full).
        if (ensure_iov_space(c) != 0)
            return -1;

        m->msg_iov[m->msg_iovlen].iov_base = (void *)buf;
        m->msg_iov[m->msg_iovlen].iov_len = len;
        c->msgbytes += len;
        c->iovused++;
        m->msg_iovlen++;
    }

    return 0;
}


/*
 * Guarantee that c->iov has room for at least one more entry.
 *
 * When every allocated entry is in use the array is doubled with realloc().
 * Because realloc() may move the array, each msghdr's msg_iov pointer is
 * then recomputed so it refers to its slice of the new array.
 *
 * Returns 0 on success, -1 if the array could not be grown; in that case
 * c->iov and c->iovsize are left unchanged and remain valid.
 */
static int ensure_iov_space(conn *c) {
    assert(c != NULL);

    // Out of entries: double the allocation
    if (c->iovused >= c->iovsize) {
        int i, iovnum;
        struct iovec *new_iov = (struct iovec *)realloc(c->iov,
                                ((size_t)c->iovsize * 2) * sizeof(struct iovec));

        // realloc failed: the old block is still owned by c->iov, so do not
        // overwrite it (that would leak it and leave a NULL array behind).
        if (new_iov == NULL)
            return -1;

        // Adopt the (possibly moved) array
        c->iov = new_iov;
        c->iovsize *= 2;

        // Re-point every msghdr at its slice of the new array
        for (i = 0, iovnum = 0; i < c->msgused; i++) {
            c->msglist[i].msg_iov = &c->iov[iovnum];
            iovnum += c->msglist[i].msg_iovlen;
        }
    }

    return 0;
}



/*
 * Append a fresh, zeroed msghdr to c->msglist and make it the current one.
 *
 * The list is doubled with realloc() when it has no free slot. The new
 * msghdr's msg_iov points at the first unused entry of c->iov, and
 * c->msgbytes is reset to 0 for the new message.
 *
 * Returns 0 on success, -1 if the list could not be grown; in that case
 * c->msglist, c->msgsize and c->msgused are left unchanged.
 */
static int add_msghdr(conn *c)
{
    struct msghdr *msg;

    assert(c != NULL);

    // No free slot: double the list
    if (c->msgsize == c->msgused) {
        msg = realloc(c->msglist, (size_t)c->msgsize * 2 * sizeof(struct msghdr));
        // On failure keep the old list; overwriting c->msglist with NULL
        // would leak it and crash on the dereference below.
        if (msg == NULL)
            return -1;
        c->msglist = msg;
        c->msgsize *= 2;
    }

    msg = c->msglist + c->msgused;
    memset(msg, 0, sizeof(struct msghdr));

    // The key point: msg_iov points into the shared iov array
    msg->msg_iov = &c->iov[c->iovused];

    c->msgbytes = 0;
    c->msgused++;

    return 0;
}

执行报文发送

transmit负责报文的发送，发送成功后将状态设置为conn_new_cmd进行结束报文的后续处理。这里重点关注下transmit过程。

/* State-machine case: send the queued response with transmit() and decide
 * what to do next from its result. (Fragment of a larger switch; the
 * enclosing function is not shown.) */
case conn_mwrite:
            // (related code omitted in this excerpt)
            switch (transmit(c)) {
            case TRANSMIT_COMPLETE:
                /* Everything has been written. */
                if (c->state == conn_mwrite) {
                    /* Release what the response referenced, then move on:
                       binary protocol goes to write_and_go, otherwise the
                       connection is ready for a new command. */
                    conn_release_items(c);
                    if(c->protocol == binary_prot) {
                        conn_set_state(c, c->write_and_go);
                    } else {
                        conn_set_state(c, conn_new_cmd);
                    }
                } else if (c->state == conn_write) {
                    /* Free the buffer that was handed over for this write,
                       if any, before moving to the follow-up state. */
                    if (c->write_and_free) {
                        free(c->write_and_free);
                        c->write_and_free = 0;
                    }
                    conn_set_state(c, c->write_and_go);
                } else {
                    /* Any other state is unexpected here: close. */
                    if (settings.verbose > 0)
                        fprintf(stderr, "Unexpected state %d\n", c->state);
                    conn_set_state(c, conn_closing);
                }
                break;

            case TRANSMIT_INCOMPLETE:
            case TRANSMIT_HARD_ERROR:
                break;                   /* Continue in state machine. */

            case TRANSMIT_SOFT_ERROR:
                /* Cannot write right now; `stop` presumably ends the
                   enclosing loop until the socket is writable -- the loop
                   itself is not visible here. */
                stop = true;
                break;
            }
            break;

transmit的内部过程其实就是for循环遍历直至所有数据发送完毕的过程。

遍历c->msglist依次进行发送。通过c->msglist[c->msgcurr].msg_iovlen == 0判断msglist当前下标的数据是否发送完成。
通过c->msgcurr++进行下一个下标对应的数据的发送。
通过sendmsg方法执行数据的真正发送。

/*
 * Send the next pending msghdr of this connection with sendmsg().
 *
 * A msghdr whose iovecs have all been consumed is skipped first. After a
 * successful (possibly partial) write, the fully written iovecs are dropped
 * from the msghdr and a partially written one is trimmed, so the next call
 * resumes exactly where this one stopped.
 *
 * Returns:
 *   TRANSMIT_COMPLETE   Nothing left to write.
 *   TRANSMIT_INCOMPLETE Some bytes were written; call again.
 *   TRANSMIT_SOFT_ERROR The socket would block; a write event was requested.
 *   TRANSMIT_HARD_ERROR Writing failed (c->state has been changed).
 */
static enum transmit_result transmit(conn *c) {
    assert(c != NULL);

    /* The current msg is exhausted; move on to the following one. */
    if (c->msgcurr < c->msgused &&
            c->msglist[c->msgcurr].msg_iovlen == 0) {
        c->msgcurr++;
    }

    /* No msghdr left: the whole response has been handed to the kernel. */
    if (c->msgcurr >= c->msgused) {
        return TRANSMIT_COMPLETE;
    }

    struct msghdr *cur = &c->msglist[c->msgcurr];
    ssize_t sent = sendmsg(c->sfd, cur, 0);

    if (sent > 0) {
        pthread_mutex_lock(&c->thread->stats.mutex);
        c->thread->stats.bytes_written += sent;
        pthread_mutex_unlock(&c->thread->stats.mutex);

        /* Discard every iovec that went out completely. */
        while (cur->msg_iovlen > 0 && sent >= cur->msg_iov->iov_len) {
            sent -= cur->msg_iov->iov_len;
            cur->msg_iov++;
            cur->msg_iovlen--;
        }

        /* The first remaining iovec may have gone out only in part: trim
           the bytes already written off its front. */
        if (sent > 0) {
            struct iovec *head = cur->msg_iov;
            head->iov_base = (caddr_t)head->iov_base + sent;
            head->iov_len -= sent;
        }
        return TRANSMIT_INCOMPLETE;
    }

    if (sent == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        /* Socket buffer is full: wait for a write event. */
        if (update_event(c, EV_WRITE | EV_PERSIST)) {
            return TRANSMIT_SOFT_ERROR;
        }
        if (settings.verbose > 0)
            fprintf(stderr, "Couldn't update event\n");
        conn_set_state(c, conn_closing);
        return TRANSMIT_HARD_ERROR;
    }

    /* sent == 0, or -1 with some other errno: a genuine failure. */
    if (settings.verbose > 0)
        perror("Failed to write, and not due to blocking");

    conn_set_state(c, IS_UDP(c->transport) ? conn_read : conn_closing);
    return TRANSMIT_HARD_ERROR;
}

结束报文发送

结束报文发送的过程比较简单，基本上就是设置状态进入下一次请求参数解析的过程。

conn_shrink方法在两次请求之间把过大的读缓冲区、ilist、msglist、iov收缩回初始大小
reset_cmd_handler重置连接状态：读缓冲区中还有数据则进入conn_parse_cmd解析下一条命令，否则进入conn_waiting等待新数据

/* State-machine case: get ready for the next command, limited by the
 * per-invocation request budget `nreqs`. (Fragment of a larger switch; the
 * enclosing function is not shown.) */
case conn_new_cmd:
            /* Only process nreqs at a time to avoid starving other
               connections */

            --nreqs;
            if (nreqs >= 0) {
                /* Budget left: reset the connection for the next command. */
                reset_cmd_handler(c);
            } else {
                /* Budget used up: count the yield and stop for now. */
                pthread_mutex_lock(&c->thread->stats.mutex);
                c->thread->stats.conn_yields++;
                pthread_mutex_unlock(&c->thread->stats.mutex);
                if (c->rbytes > 0) {
                    /* We have already read in data into the input buffer,
                       so libevent will most likely not signal read events
                       on the socket (unless more data is available. As a
                       hack we should just put in a request to write data,
                       because that should be possible ;-)
                    */
                    if (!update_event(c, EV_WRITE | EV_PERSIST)) {
                        if (settings.verbose > 0)
                            fprintf(stderr, "Couldn't update event\n");
                        conn_set_state(c, conn_closing);
                        break;
                    }
                }
                stop = true;
            }
            break;


/*
 * Put a connection back into its between-commands condition.
 *
 * Clears the command and sub-state, hands any item still attached to the
 * connection to item_remove(), lets conn_shrink() trim oversized buffers,
 * and then picks the next state: conn_parse_cmd when unread bytes are
 * already buffered, conn_waiting otherwise.
 */
static void reset_cmd_handler(conn *c) {
    c->cmd = -1;
    c->substate = bin_no_state;

    /* An item left over from the previous command is no longer needed. */
    if (c->item != NULL) {
        item_remove(c->item);
        c->item = NULL;
    }

    conn_shrink(c);

    conn_set_state(c, (c->rbytes > 0) ? conn_parse_cmd : conn_waiting);
}


/*
 * Trim a connection's buffers back to their initial sizes once they have
 * grown past their high-water marks, so that an occasional large "get" does
 * not keep a lot of server memory tied up for good.
 *
 * Call this only between requests: it can wipe output buffers. UDP
 * connections are left alone. When a realloc() fails, the affected buffer
 * simply keeps its current size.
 */
static void conn_shrink(conn *c) {
    assert(c != NULL);

    if (IS_UDP(c->transport))
        return;

    /* Read buffer: shrink only when it is above the high-water mark and the
       unread bytes fit into a buffer of DATA_BUFFER_SIZE. */
    if (c->rsize > READ_BUFFER_HIGHWAT && c->rbytes < DATA_BUFFER_SIZE) {
        /* Slide the unread bytes to the front before shrinking. */
        if (c->rcurr != c->rbuf)
            memmove(c->rbuf, c->rcurr, (size_t)c->rbytes);

        char *small_rbuf = (char *)realloc((void *)c->rbuf, DATA_BUFFER_SIZE);
        if (small_rbuf) {
            c->rbuf = small_rbuf;
            c->rsize = DATA_BUFFER_SIZE;
        }
        /* TODO check other branch... */
        c->rcurr = c->rbuf;
    }

    /* Item list. */
    if (c->isize > ITEM_LIST_HIGHWAT) {
        item **small_ilist = (item **)realloc((void *)c->ilist,
                                              ITEM_LIST_INITIAL * sizeof(c->ilist[0]));
        if (small_ilist) {
            c->ilist = small_ilist;
            c->isize = ITEM_LIST_INITIAL;
        }
        /* TODO check error condition? */
    }

    /* msghdr list. */
    if (c->msgsize > MSG_LIST_HIGHWAT) {
        struct msghdr *small_msglist = (struct msghdr *)realloc((void *)c->msglist,
                                              MSG_LIST_INITIAL * sizeof(c->msglist[0]));
        if (small_msglist) {
            c->msglist = small_msglist;
            c->msgsize = MSG_LIST_INITIAL;
        }
        /* TODO check error condition? */
    }

    /* iovec array. */
    if (c->iovsize > IOV_LIST_HIGHWAT) {
        struct iovec *small_iov = (struct iovec *)realloc((void *)c->iov,
                                              IOV_LIST_INITIAL * sizeof(c->iov[0]));
        if (small_iov) {
            c->iov = small_iov;
            c->iovsize = IOV_LIST_INITIAL;
        }
        /* TODO check return value */
    }
}

参考文章

Memcached官方doc
《Memcached源码分析 - Memcached源码分析之消息回应（3）》
struct msghdr 和 struct iovec