memcached的连接处理

一、相关结构

/*
 * States a connection can be in. The value stored in a connection selects
 * which branch of the state machine handles it next.
 * conn_max_state is not a real state; it only marks the end of the range.
 */
enum conn_states {
    conn_listening = 0, /* listening socket that accepts new connections */
    conn_new_cmd,       /* getting the connection ready for its next command */
    conn_waiting,       /* idle until the socket becomes readable */
    conn_read,          /* pulling a command line in from the socket */
    conn_parse_cmd,     /* attempting to parse a command out of the input buffer */
    conn_write,         /* sending a simple (single) response */
    conn_nread,         /* reading an exact, known number of bytes */
    conn_swallow,       /* reading and discarding bytes that are not needed */
    conn_closing,       /* tearing the connection down */
    conn_mwrite,        /* sending several items one after another */
    conn_max_state      /* upper bound of the state values (used for assertion) */
};

二、drive_machine()

当client发起tcp连接(telnet 127.0.0.1 11211)时,event_handler()会调用drive_machine()。之后根据struct conn的当前状态去处理。

1、case conn_listening

调用accept(),之后调用dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST,  DATA_BUFFER_SIZE, tcp_transport),其中sfd是刚刚accept的套接字,conn_new_cmd是设置给sfd的状态。与执行流程一节分析一样,dispatch_conn_new()将该sfd交给某个工作子线程管理。


当client在telnet中键入“set foo 0 0 3”后。

2、case conn_new_cmd(注意此时为工作线程处理)

--nreqs(nreqs = settings.reqs_per_event,本例为20,Maximum number of io to process on each io-event)。

如果nreqs<0,c->thread->stats.conn_yields++(放弃的conn数+1)。如果c->rbytes > 0(该连接的输入缓冲区中已经有数据),很可能不会有新数据到来,那么很可能这个事件永远不会再被signal,这个conn可能就真的被永远放弃了;为了防止这种情况,调用update_event(),将该事件的ev_flags由EV_READ更新为EV_WRITE,这样之后这个事件又被signal,这个连接可以稍后得以处理。本次放弃处理struct conn。

如果nreqs>=0,调用reset_cmd_handler(c)。先设置c->substate = bin_no_state,此时c->item应当为null(c->item is used to hold an item structure created after reading the command line of set/add/replace commands, but before we finished reading the actual data. The data is read into ITEM_data(item) to avoid extra copying.)。接着调用conn_shrink(c),作用为Shrinks a connection's buffers if they're too big。

        如果c->rbytes > 0,调用conn_set_state(c, conn_parse_cmd),否则调用conn_set_state(c, conn_waiting)。


本例中此时c->rbytes = 0,进入conn_waiting状态,继续处理。

3、case conn_waiting

update_event(c, EV_READ | EV_PERSIST)后,conn_set_state(c, conn_read)。然后退出本次处理。


进入conn_read状态。

4、case conn_read

        /*
         * conn_read: read from the socket, using the UDP reader or the
         * network (TCP) reader depending on the connection's transport,
         * then pick the next state from the read result.
         */
        case conn_read:
            res = IS_UDP(c->transport) ? try_read_udp(c) : try_read_network(c);

            switch (res) {
            case READ_NO_DATA_RECEIVED:
                /* Nothing arrived: go back to waiting for the socket. */
                conn_set_state(c, conn_waiting);
                break;
            case READ_DATA_RECEIVED:
                /* Something was read: try to parse a command from it. */
                conn_set_state(c, conn_parse_cmd);
                break;
            case READ_ERROR:
                /* Read failed: close the connection. */
                conn_set_state(c, conn_closing);
                break;
            case READ_MEMORY_ERROR: /* Failed to allocate more memory */
                /* State already set by try_read_network */
                break;
            }
            break;


此时已经读入一定的数据,进入conn_parse_cmd状态,继续处理。

5、case conn_parse_cmd

        /*
         * conn_parse_cmd: try to parse a command from what has been read.
         * A return of 0 from try_read_command() is treated as "no complete
         * command yet", so the connection goes back to waiting for input.
         */
        case conn_parse_cmd :
            if (try_read_command(c) == 0) {
                /* we need more data! */
                conn_set_state(c, conn_waiting);
            }

            break;


try_read_command(conn *c)设置c->protocol = binary_prot或ascii_prot,此处使用的是ascii_prot(binary_prot暂不讨论);最后它调用process_command(c, c->rcurr)。

/*
 * Split one ASCII command line into tokens and dispatch it to the handler
 * for that command.
 *
 * c        connection the command arrived on; must not be NULL.
 * command  the command line to tokenize (the tokenizer receives this
 *          buffer directly).
 *
 * Replies and state changes are made through out_string() /
 * conn_set_state() or by the per-command handlers. Unknown commands, or
 * known commands with an unexpected token count, get "ERROR".
 *
 * NOTE(review): the ntokens values tested below appear to include one
 * terminating token after the real ones (e.g. "set foo 0 0 3" yields 6) --
 * confirm against tokenize_command().
 */
static void process_command(conn *c, char *command) {

    token_t tokens[MAX_TOKENS];
    size_t ntokens;
    int comm;

    assert(c != NULL);

    MEMCACHED_PROCESS_COMMAND_START(c->sfd, c->rcurr, c->rbytes);

    if (settings.verbose > 1)
        fprintf(stderr, "<%d %s\n", c->sfd, command);

    /*
     * for commands set/add/replace, we build an item and read the data
     * directly into it, then continue in nread_complete().
     */

    /* Start the response from a clean message/iov list. */
    c->msgcurr = 0;
    c->msgused = 0;
    c->iovused = 0;
    /*
     * Adds a message header to the connection. Presumably this initialises
     * the msghdr at c->msglist + c->msgused so that its msg_iov points at
     * &c->iov[c->iovused]; for UDP it is said to call add_iov() as well
     * (not covered here) -- verify against add_msghdr().
     */
    if (add_msghdr(c) != 0) {
        out_string(c, "SERVER_ERROR out of memory preparing response");
        return;
    }

    /*
     * Only splits the whole command on spaces into an array; the end of the
     * array is marked by an entry with length = 0.
     */
    ntokens = tokenize_command(command, tokens, MAX_TOKENS);
    if (ntokens >= 3 &&
        ((strcmp(tokens[COMMAND_TOKEN].value, "get") == 0) ||
         (strcmp(tokens[COMMAND_TOKEN].value, "bget") == 0))) {

        process_get_command(c, tokens, ntokens, false);

    } else if ((ntokens == 6 || ntokens == 7) &&
               /* Each "&& (comm = ...)" records which storage command matched. */
               ((strcmp(tokens[COMMAND_TOKEN].value, "add") == 0 && (comm = NREAD_ADD)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "set") == 0 && (comm = NREAD_SET)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "replace") == 0 && (comm = NREAD_REPLACE)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "prepend") == 0 && (comm = NREAD_PREPEND)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "append") == 0 && (comm = NREAD_APPEND)) )) {

        process_update_command(c, tokens, ntokens, comm, false);

    } else if ((ntokens == 7 || ntokens == 8) && (strcmp(tokens[COMMAND_TOKEN].value, "cas") == 0 && (comm = NREAD_CAS))) {

        process_update_command(c, tokens, ntokens, comm, true);

    } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "incr") == 0)) {

        process_arithmetic_command(c, tokens, ntokens, 1);

    } else if (ntokens >= 3 && (strcmp(tokens[COMMAND_TOKEN].value, "gets") == 0)) {

        process_get_command(c, tokens, ntokens, true);

    } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "decr") == 0)) {

        process_arithmetic_command(c, tokens, ntokens, 0);

    } else if (ntokens >= 3 && ntokens <= 5 && (strcmp(tokens[COMMAND_TOKEN].value, "delete") == 0)) {

        process_delete_command(c, tokens, ntokens);

    } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "touch") == 0)) {

        process_touch_command(c, tokens, ntokens);

    } else if (ntokens >= 2 && (strcmp(tokens[COMMAND_TOKEN].value, "stats") == 0)) {

        process_stat(c, tokens, ntokens);

    } else if (ntokens >= 2 && ntokens <= 4 && (strcmp(tokens[COMMAND_TOKEN].value, "flush_all") == 0)) {
        time_t exptime = 0;

        set_noreply_maybe(c, tokens, ntokens);

        pthread_mutex_lock(&c->thread->stats.mutex);
        c->thread->stats.flush_cmds++;
        pthread_mutex_unlock(&c->thread->stats.mutex);

        /* No delay argument given: flush immediately. */
        if(ntokens == (c->noreply ? 3 : 2)) {
            settings.oldest_live = current_time - 1;
            item_flush_expired();
            out_string(c, "OK");
            return;
        }

        /*
         * strtol() only sets errno on failure and never clears it, so reset
         * it first; otherwise a stale ERANGE from an earlier call would make
         * a valid delay look like a range error.
         */
        errno = 0;
        exptime = strtol(tokens[1].value, NULL, 10);
        if(errno == ERANGE) {
            out_string(c, "CLIENT_ERROR bad command line format");
            return;
        }

        /*
          If exptime is zero realtime() would return zero too, and
          realtime(exptime) - 1 would overflow to the max unsigned
          value.  So we process exptime == 0 the same way we do when
          no delay is given at all.
        */
        if (exptime > 0)
            settings.oldest_live = realtime(exptime) - 1;
        else /* exptime <= 0 */
            settings.oldest_live = current_time - 1;
        item_flush_expired();
        out_string(c, "OK");
        return;

    } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "version") == 0)) {

        out_string(c, "VERSION " VERSION);

    } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "quit") == 0)) {

        conn_set_state(c, conn_closing);

    } else if (ntokens > 1 && strcmp(tokens[COMMAND_TOKEN].value, "slabs") == 0) {
        if (ntokens == 5 && strcmp(tokens[COMMAND_TOKEN + 1].value, "reassign") == 0) {
            int src, dst, rv;

            if (settings.slab_reassign == false) {
                out_string(c, "CLIENT_ERROR slab reassignment disabled");
                return;
            }

            /* Clear errno so the ERANGE test reflects only these two calls. */
            errno = 0;
            src = strtol(tokens[2].value, NULL, 10);
            dst = strtol(tokens[3].value, NULL, 10);

            if (errno == ERANGE) {
                out_string(c, "CLIENT_ERROR bad command line format");
                return;
            }

            /* Map the reassign result code onto its reply line. */
            rv = slabs_reassign(src, dst);
            switch (rv) {
            case REASSIGN_OK:
                out_string(c, "OK");
                break;
            case REASSIGN_RUNNING:
                out_string(c, "BUSY currently processing reassign request");
                break;
            case REASSIGN_BADCLASS:
                out_string(c, "BADCLASS invalid src or dst class id");
                break;
            case REASSIGN_NOSPARE:
                out_string(c, "NOSPARE source class has no spare pages");
                break;
            case REASSIGN_SRC_DST_SAME:
                out_string(c, "SAME src and dst class are identical");
                break;
            }
            return;
        } else if (ntokens == 4 &&
            (strcmp(tokens[COMMAND_TOKEN + 1].value, "automove") == 0)) {
            process_slabs_automove_command(c, tokens, ntokens);
        } else {
            out_string(c, "ERROR");
        }
    } else if ((ntokens == 3 || ntokens == 4) && (strcmp(tokens[COMMAND_TOKEN].value, "verbosity") == 0)) {
        process_verbosity_command(c, tokens, ntokens);
    } else {
        out_string(c, "ERROR");
    }
    return;
}


本次执行为set命令,故执行process_update_command (c, tokens, ntokens=6, comm=2, handle_cas=false)。其中调用item *it = item_alloc(key, nkey, flags, realtime(exptime), vlen)(数据结构一节讨论);c->item = it; c->ritem = ITEM_data(it); c->rlbytes = it->nbytes;(即将读入的数据value的长度+2) c->cmd = comm;(此处为set);conn_set_state(c, conn_nread)进入conn_nread状态。


进入conn_nread,继续处理。由于我们还未敲入value数据,理所当然会退出处理,进入事件循环,在进入事件循环前先update_event(c, EV_READ | EV_PERSIST)。这时我们在client输入“bar”,event_handler()又一次被调用,重新进入conn_nread状态。

6、case conn_nread

当读完c->rlbytes长度的数据后,执行complete_nread(c)。注意,读数据时,数据直接存放在c->ritem= ITEM_data(it)处,这样减少了数据复制次数。complete_nread(c)调用complete_nread_ascii(c)或complete_nread_binary(c)。

/*
 * Finish an ASCII storage command once the whole data block has been read
 * into c->item: count the command, check that the data ends in "\r\n",
 * store the item, send the matching one-line reply, and drop the
 * connection's reference to the item.
 */
static void complete_nread_ascii(conn *c) {
    assert(c != NULL);

    item *stored = c->item;
    int cmd = c->cmd;
    const char *reply;

    /* Per-thread stats are updated under the thread's stats mutex. */
    pthread_mutex_lock(&c->thread->stats.mutex);
    c->thread->stats.slab_stats[stored->slabs_clsid].set_cmds++;
    pthread_mutex_unlock(&c->thread->stats.mutex);

    /* The last two bytes of the data block must be the line terminator. */
    if (strncmp(ITEM_data(stored) + stored->nbytes - 2, "\r\n", 2) != 0) {
        reply = "CLIENT_ERROR bad data chunk";
    } else {
        /*
         * NOTE(review): the article says store_item() takes a finer-grained
         * lock and then calls do_store_item(); not visible here -- confirm.
         */
        switch (store_item(stored, cmd, c)) {
        case STORED:
            reply = "STORED";
            break;
        case EXISTS:
            reply = "EXISTS";
            break;
        case NOT_FOUND:
            reply = "NOT_FOUND";
            break;
        case NOT_STORED:
            reply = "NOT_STORED";
            break;
        default:
            reply = "SERVER_ERROR Unhandled storage type.";
            break;
        }
    }

    out_string(c, reply);

    /*
     * Release the c->item reference. Presumably store_item() took a
     * reference of its own, so this leaves the item's count at 1 --
     * author's assumption, verify.
     */
    item_remove(c->item);
    c->item = NULL;
}

此处对out_string(c, "STORED")作进一步的分析:

/*
 * Queue a single-line text reply on a connection.
 *
 * If the connection is in noreply mode the text is not sent: the flag is
 * cleared and the connection moves straight to conn_new_cmd. Otherwise the
 * text plus "\r\n" is copied into the write buffer, the connection is put
 * in conn_write, and conn_new_cmd is recorded as the state to enter once
 * the write has finished. A line that does not fit in the write buffer is
 * replaced by a fixed SERVER_ERROR line.
 */
static void out_string(conn *c, const char *str) {
    assert(c != NULL);

    const int chatty = settings.verbose > 1;

    if (c->noreply) {
        /* Reply suppressed: optionally log it, then reset the flag. */
        if (chatty)
            fprintf(stderr, ">%d NOREPLY %s\n", c->sfd, str);
        c->noreply = false;
        conn_set_state(c, conn_new_cmd);
        return;
    }

    if (chatty)
        fprintf(stderr, ">%d %s\n", c->sfd, str);

    /* Throw away any partially assembled output and start over. */
    c->iovused = 0;
    c->msgused = 0;
    c->msgcurr = 0;
    add_msghdr(c);

    size_t nbytes = strlen(str);
    if (nbytes + 2 > c->wsize) {
        /* ought to be always enough. just fail for simplicity */
        str = "SERVER_ERROR output line too long";
        nbytes = strlen(str);
    }

    /* Payload followed by the protocol line terminator. */
    memcpy(c->wbuf, str, nbytes);
    memcpy(c->wbuf + nbytes, "\r\n", 2);
    c->wcurr = c->wbuf;
    c->wbytes = nbytes + 2;

    conn_set_state(c, conn_write);
    /* State to switch to after the write completes. */
    c->write_and_go = conn_new_cmd;
}

之后进入conn_write状态,继续处理。

7、case conn_write

        case conn_write:
            /*
             * We want to write out a simple response. If we haven't already,
             * assemble it into a msgbuf list (this will be a single-entry
             * list for TCP or a two-entry list for UDP).
             */
            if (c->iovused == 0 || (IS_UDP(c->transport) && c->iovused == 1)) {
                /*
                 * Hand the pending bytes (buffer c->wcurr, length c->wbytes)
                 * to add_iov(). Presumably this fills the next iovec of the
                 * connection's current msghdr, i.e. sets up the buffer that
                 * the message will be sent from; for UDP the MTU also has to
                 * be considered (not covered here) -- verify against add_iov().
                 */
                if (add_iov(c, c->wcurr, c->wbytes) != 0) {
                    /* Could not queue the response: log if verbose, then close. */
                    if (settings.verbose > 0)
                        fprintf(stderr, "Couldn't build response\n");
                    conn_set_state(c, conn_closing);
                    break;
                }
            }

只要不出错,conn_write和conn_mwrite总是紧接着执行的(conn_write分支末尾没有break,会直接落入conn_mwrite分支)。进入conn_mwrite,继续处理。

8、case conn_mwrite

如果是UDP,需要进一步处理build_udp_headers(c)。然后transmit(c)。transmit()调用sendmsg(c->sfd, &c->msglist[c->msgcurr], 0)来发送数据,每次发送一个msghdr中的数据,并返回TRANSMIT_INCOMPLETE,从而反复调用transmit(c)。一般情况下,最终返回TRANSMIT_COMPLETE。(其他返回情况暂不考虑)

        对于TRANSMIT_COMPLETE,调用conn_set_state(c, c->write_and_go),注意前面out_string()中设置c->write_and_go = conn_new_cmd,这样又进入了conn_new_cmd状态,可以继续接收新的指令了。


其他状态:

9、case conn_closing

当运行中出错时,常常会进入此状态。

对于UDP,执行conn_cleanup(c)。如果c->item != NULL,item_remove(c->item);之后检测c->ileft,c->suffixleft,c->write_and_free,c->sasl_conn;对于UDP,最终设置conn_set_state(c, conn_read)。

对于TCP,执行conn_close(c)。首先调用event_del(&c->event),close(c->sfd),令allow_new_conns = true(一般情况下此值一直为true)。然后调用conn_cleanup(c)。最后将struct conn放入freeconns数组,或者conn_free(c)(释放c的各个动态空间)。


10、case conn_swallow

在前面conn_parse_cmd状态中,process_command()曾调用process_update_command(),该函数调用item_alloc(),若返回值为NULL(item都没分配成功,后面的数据又有何用,只能舍弃掉),则向客户端发送错误报告,并在发送完之后进入conn_swallow状态(c->write_and_go = conn_swallow; c->sbytes = vlen;)

实际上就是从客户端读入c->sbytes个字符,并舍弃掉。正常情况下,最终读完c->sbytes个字符,c->sbytes == 0,从而进入conn_new_cmd状态。

你可能感兴趣的:(memcached)