/**
 * States a connection can be in. The enumerators are consecutive starting
 * at zero, so conn_max_state equals the number of real states.
 */
enum conn_states {
    conn_listening = 0, /**< listening socket that accepts new connections */
    conn_new_cmd,       /**< get the connection ready for its next command */
    conn_waiting,       /**< idle until the socket becomes readable */
    conn_read,          /**< reading a command line from the socket */
    conn_parse_cmd,     /**< attempt to parse a command out of the input buffer */
    conn_write,         /**< sending a simple (single-buffer) response */
    conn_nread,         /**< reading a fixed, known number of bytes */
    conn_swallow,       /**< reading and discarding unwanted bytes */
    conn_closing,       /**< connection is being closed */
    conn_mwrite,        /**< sending several items one after another */
    conn_max_state      /**< upper bound of the state values (used for assertion) */
};
当client发起tcp连接(telnet 127.0.0.1 11211)时,event_handler()会调用drive_machine()。之后根据struct conn的当前状态去处理。
调用accept(),之后调用dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST, DATA_BUFFER_SIZE, tcp_transport),其中sfd是刚刚accept的套接字,conn_new_cmd是设置给sfd的状态。与执行流程一节分析一样,dispatch_conn_new()将该sfd交给某个工作子线程管理。
当client在telnet中键入“set foo 0 0 3”后。
--nreqs(nreqs = settings.reqs_per_event,本例为20,Maximum number of io to process on each io-event)。
如果nreqs<0,c->thread->stats.conn_yields++(放弃的conn数+1)。如果c->rbytes > 0(该连接的输入缓冲区中已经有数据),很可能不会有新数据到来,那么很可能这个事件永远不会再被signal,这个conn可能就真的被永远放弃了;为了防止这种情况,调用update_event(),将该事件的ev_flags由EV_READ更新为EV_WRITE,这样之后这个事件又被signal,这个连接可以稍后得以处理。本次放弃处理struct conn。
如果nreqs>=0,调用reset_cmd_handler(c)。先设置c->substate = bin_no_state,此时c->item应当为null(c->item is used to hold an item structure created after reading the command line of set/add/replace commands, but before we finished reading the actual data. The data is read into ITEM_data(item) to avoid extra copying.)。接着调用conn_shrink(c),作用为Shrinks a connection's buffers if they're too big。
如果c->rbytes > 0,调用conn_set_state(c, conn_parse_cmd),否则调用conn_set_state(c, conn_waiting)。
本例中此时c->rbytes = 0,进入conn_waiting状态,继续处理。
update_event(c, EV_READ | EV_PERSIST)后,conn_set_state(c, conn_read)。然后退出本次处理。
进入conn_read状态。
case conn_read:
/*
 * Read from the socket, using the UDP reader or the stream reader
 * depending on the connection's transport, then choose the next state
 * from the read result.
 */
res = IS_UDP(c->transport) ? try_read_udp(c) : try_read_network(c);
switch (res) {
case READ_NO_DATA_RECEIVED:
/* Nothing was read: go back to waiting for the socket. */
conn_set_state(c, conn_waiting);
break;
case READ_DATA_RECEIVED:
/* Input is available: try to parse a command from it. */
conn_set_state(c, conn_parse_cmd);
break;
case READ_ERROR:
/* The read failed: close the connection. */
conn_set_state(c, conn_closing);
break;
case READ_MEMORY_ERROR: /* Failed to allocate more memory */
/* State already set by try_read_network */
break;
}
break;
此时已经读入一定的数据,进入conn_parse_cmd状态,继续处理。
case conn_parse_cmd :
/*
 * Try to parse a command from the buffered input. A zero return from
 * try_read_command() means no complete command is available yet, so
 * the connection goes back to waiting for more input.
 */
if (try_read_command(c) == 0) {
/* we need more data! */
conn_set_state(c, conn_waiting);
}
break;
try_read_command(conn *c)设置c->protocol = binary_prot或ascii_prot,此处使用的是ascii_prot(binary_prot暂不讨论);最后它调用process_command(c, c->rcurr)。
/*
 * Split one ASCII command line into tokens and dispatch it to the matching
 * handler.
 *
 * c       - connection the command arrived on; must not be NULL.
 * command - command line text; it is handed to tokenize_command().
 *
 * Every branch either calls a process_*() handler, queues a reply through
 * out_string(), or (for "quit") moves the connection to conn_closing.
 * Unrecognized commands get the reply "ERROR".
 */
static void process_command(conn *c, char *command) {
    token_t tokens[MAX_TOKENS];
    size_t ntokens;
    int comm;

    assert(c != NULL);

    MEMCACHED_PROCESS_COMMAND_START(c->sfd, c->rcurr, c->rbytes);

    if (settings.verbose > 1)
        fprintf(stderr, "<%d %s\n", c->sfd, command);

    /*
     * for commands set/add/replace, we build an item and read the data
     * directly into it, then continue in nread_complete().
     */

    c->msgcurr = 0;
    c->msgused = 0;
    c->iovused = 0;
    /*
     * Adds a message header to the connection. Per the original notes this
     * initializes struct msghdr *msg = c->msglist + c->msgused
     * (msg->msg_iov = &c->iov[c->iovused]); for UDP it also calls add_iov(),
     * which is not covered here.
     */
    if (add_msghdr(c) != 0) {
        out_string(c, "SERVER_ERROR out of memory preparing response");
        return;
    }

    /*
     * Per the original notes: splits the whole command on spaces into an
     * array whose end is marked by an entry with length 0.
     */
    ntokens = tokenize_command(command, tokens, MAX_TOKENS);

    /*
     * In the storage-command tests below, "(comm = NREAD_x)" both records the
     * command code and takes part in the && test, so the match relies on each
     * NREAD_* constant being non-zero.
     */
    if (ntokens >= 3 &&
        ((strcmp(tokens[COMMAND_TOKEN].value, "get") == 0) ||
         (strcmp(tokens[COMMAND_TOKEN].value, "bget") == 0))) {

        process_get_command(c, tokens, ntokens, false);

    } else if ((ntokens == 6 || ntokens == 7) &&
               ((strcmp(tokens[COMMAND_TOKEN].value, "add") == 0 && (comm = NREAD_ADD)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "set") == 0 && (comm = NREAD_SET)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "replace") == 0 && (comm = NREAD_REPLACE)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "prepend") == 0 && (comm = NREAD_PREPEND)) ||
                (strcmp(tokens[COMMAND_TOKEN].value, "append") == 0 && (comm = NREAD_APPEND)) )) {

        process_update_command(c, tokens, ntokens, comm, false);

    } else if ((ntokens == 7 || ntokens == 8) && (strcmp(tokens[COMMAND_TOKEN].value, "cas") == 0 && (comm = NREAD_CAS))) {

        process_update_command(c, tokens, ntokens, comm, true);

    } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "incr") == 0)) {

        process_arithmetic_command(c, tokens, ntokens, 1);

    } else if (ntokens >= 3 && (strcmp(tokens[COMMAND_TOKEN].value, "gets") == 0)) {

        process_get_command(c, tokens, ntokens, true);

    } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "decr") == 0)) {

        process_arithmetic_command(c, tokens, ntokens, 0);

    } else if (ntokens >= 3 && ntokens <= 5 && (strcmp(tokens[COMMAND_TOKEN].value, "delete") == 0)) {

        process_delete_command(c, tokens, ntokens);

    } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "touch") == 0)) {

        process_touch_command(c, tokens, ntokens);

    } else if (ntokens >= 2 && (strcmp(tokens[COMMAND_TOKEN].value, "stats") == 0)) {

        process_stat(c, tokens, ntokens);

    } else if (ntokens >= 2 && ntokens <= 4 && (strcmp(tokens[COMMAND_TOKEN].value, "flush_all") == 0)) {
        time_t exptime = 0;

        set_noreply_maybe(c, tokens, ntokens);

        pthread_mutex_lock(&c->thread->stats.mutex);
        c->thread->stats.flush_cmds++;
        pthread_mutex_unlock(&c->thread->stats.mutex);

        /* No delay argument given: flush immediately. */
        if(ntokens == (c->noreply ? 3 : 2)) {
            settings.oldest_live = current_time - 1;
            item_flush_expired();
            out_string(c, "OK");
            return;
        }

        /*
         * strtol() only sets errno on failure and never clears it, so reset
         * it first; otherwise an ERANGE left over from an earlier call would
         * make a valid delay look out of range.
         */
        errno = 0;
        exptime = strtol(tokens[1].value, NULL, 10);
        if(errno == ERANGE) {
            out_string(c, "CLIENT_ERROR bad command line format");
            return;
        }

        /*
          If exptime is zero realtime() would return zero too, and
          realtime(exptime) - 1 would overflow to the max unsigned
          value.  So we process exptime == 0 the same way we do when
          no delay is given at all.
        */
        if (exptime > 0)
            settings.oldest_live = realtime(exptime) - 1;
        else /* exptime == 0 */
            settings.oldest_live = current_time - 1;
        item_flush_expired();
        out_string(c, "OK");
        return;

    } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "version") == 0)) {

        out_string(c, "VERSION " VERSION);

    } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "quit") == 0)) {

        conn_set_state(c, conn_closing);

    } else if (ntokens > 1 && strcmp(tokens[COMMAND_TOKEN].value, "slabs") == 0) {
        if (ntokens == 5 && strcmp(tokens[COMMAND_TOKEN + 1].value, "reassign") == 0) {
            int src, dst, rv;

            if (settings.slab_reassign == false) {
                out_string(c, "CLIENT_ERROR slab reassignment disabled");
                return;
            }

            /*
             * Clear errno before converting so the ERANGE test below reflects
             * only these two strtol() calls, not an earlier failure.
             */
            errno = 0;
            src = strtol(tokens[2].value, NULL, 10);
            dst = strtol(tokens[3].value, NULL, 10);

            if (errno == ERANGE) {
                out_string(c, "CLIENT_ERROR bad command line format");
                return;
            }

            rv = slabs_reassign(src, dst);
            switch (rv) {
            case REASSIGN_OK:
                out_string(c, "OK");
                break;
            case REASSIGN_RUNNING:
                out_string(c, "BUSY currently processing reassign request");
                break;
            case REASSIGN_BADCLASS:
                out_string(c, "BADCLASS invalid src or dst class id");
                break;
            case REASSIGN_NOSPARE:
                out_string(c, "NOSPARE source class has no spare pages");
                break;
            case REASSIGN_SRC_DST_SAME:
                out_string(c, "SAME src and dst class are identical");
                break;
            }
            return;
        } else if (ntokens == 4 &&
            (strcmp(tokens[COMMAND_TOKEN + 1].value, "automove") == 0)) {
            process_slabs_automove_command(c, tokens, ntokens);
        } else {
            out_string(c, "ERROR");
        }
    } else if ((ntokens == 3 || ntokens == 4) && (strcmp(tokens[COMMAND_TOKEN].value, "verbosity") == 0)) {
        process_verbosity_command(c, tokens, ntokens);
    } else {
        out_string(c, "ERROR");
    }
}
本次执行为set命令,故执行process_update_command (c, tokens, ntokens=6, comm=2, handle_cas=false)。其中调用item *it = item_alloc(key, nkey, flags, realtime(exptime), vlen)(数据结构一节讨论);c->item = it; c->ritem = ITEM_data(it); c->rlbytes = it->nbytes;(即将读入的数据value的长度+2) c->cmd = comm;(此处为set);conn_set_state(c, conn_nread)进入conn_nread状态。
进入conn_nread,继续处理。由于我们还未敲入value数据,理所当然会退出处理,进入事件循环,在进入事件循环前先update_event(c, EV_READ | EV_PERSIST)。这时我们在client输入“bar”,event_handler()又一次被调用,重新进入conn_nread状态。
当读完c->rlbytes长度的数据后,执行complete_nread(c)。注意,读数据时,数据直接存放在c->ritem= ITEM_data(it)处,这样减少了数据复制次数。complete_nread(c)调用complete_nread_ascii(c)或complete_nread_binary(c)。
/*
 * Finish an ASCII storage command once the item's data block has been read.
 *
 * Bumps the per-slab-class set counter under the thread stats mutex, checks
 * that the data block ends in "\r\n", stores the item, queues the matching
 * one-line reply, and finally drops the connection's reference to the item.
 */
static void complete_nread_ascii(conn *c) {
    assert(c != NULL);

    item *pending = c->item;
    const char *reply;

    pthread_mutex_lock(&c->thread->stats.mutex);
    c->thread->stats.slab_stats[pending->slabs_clsid].set_cmds++;
    pthread_mutex_unlock(&c->thread->stats.mutex);

    /* The last two bytes of the data block must be the line terminator. */
    const char *tail = ITEM_data(pending) + pending->nbytes - 2;

    if (strncmp(tail, "\r\n", 2) != 0) {
        reply = "CLIENT_ERROR bad data chunk";
    } else {
        /*
         * Per the original notes, store_item() takes the fine-grained lock
         * and then calls do_store_item(); its internals are not covered here.
         */
        switch (store_item(pending, c->cmd, c)) {
        case STORED:
            reply = "STORED";
            break;
        case EXISTS:
            reply = "EXISTS";
            break;
        case NOT_FOUND:
            reply = "NOT_FOUND";
            break;
        case NOT_STORED:
            reply = "NOT_STORED";
            break;
        default:
            reply = "SERVER_ERROR Unhandled storage type.";
            break;
        }
    }

    out_string(c, reply);

    /*
     * Release the c->item reference. Original author's guess (unverified):
     * store_item() added a reference, so dropping one here leaves the final
     * reference count at 1.
     */
    item_remove(c->item);
    c->item = 0;
}
/*
 * Queue a one-line text reply (str followed by "\r\n") on a connection.
 *
 * If the connection is flagged noreply, nothing is sent: the flag is
 * cleared and the connection goes straight to conn_new_cmd. Otherwise the
 * line is copied into the connection's write buffer, the state becomes
 * conn_write, and conn_new_cmd is recorded as the state to enter once the
 * write has finished.
 */
static void out_string(conn *c, const char *str) {
    assert(c != NULL);

    const bool trace = settings.verbose > 1;

    if (c->noreply) {
        if (trace)
            fprintf(stderr, ">%d NOREPLY %s\n", c->sfd, str);
        c->noreply = false;
        conn_set_state(c, conn_new_cmd);
        return;
    }

    if (trace)
        fprintf(stderr, ">%d %s\n", c->sfd, str);

    /* Throw away any partially built output before starting a new reply. */
    c->msgcurr = 0;
    c->msgused = 0;
    c->iovused = 0;
    add_msghdr(c);

    size_t nbytes = strlen(str);
    if ((nbytes + 2) > c->wsize) {
        /* Should always fit; fall back to a fixed error line for simplicity. */
        str = "SERVER_ERROR output line too long";
        nbytes = strlen(str);
    }

    /* Payload first, then the protocol line terminator. */
    memcpy(c->wbuf, str, nbytes);
    memcpy(c->wbuf + nbytes, "\r\n", 2);
    c->wbytes = nbytes + 2;
    c->wcurr = c->wbuf;

    conn_set_state(c, conn_write);
    /* State to switch to after the write completes. */
    c->write_and_go = conn_new_cmd;
}
case conn_write:
/*
 * We want to write out a simple response. If we haven't already,
 * assemble it into a msgbuf list (this will be a single-entry
 * list for TCP or a two-entry list for UDP).
 */
if (c->iovused == 0 || (IS_UDP(c->transport) && c->iovused == 1)) {
/*
 * Per the original notes, add_iov() mainly points the next msg_iov entry
 * of the connection's struct msghdr at the response data, i.e. it sets up
 * the msghdr's buffer space. For UDP the MTU also has to be taken into
 * account; that is not covered here.
 * NOTE(review): the original note wrote iov_base = (void *)(c->wbytes);
 * given the arguments passed, presumably iov_base = c->wcurr and
 * iov_len = c->wbytes. Confirm against add_iov().
 */
if (add_iov(c, c->wcurr, c->wbytes) != 0) {
if (settings.verbose > 0)
fprintf(stderr, "Couldn't build response\n");
/* The response could not be assembled: close the connection. */
conn_set_state(c, conn_closing);
break;
}
}
如果是UDP,需要进一步处理build_udp_headers(c)。然后transmit(c)。transmit()调用sendmsg(c->sfd, &c->msglist[c->msgcurr], 0)来发送数据,每次发送一个msghdr中的数据,并返回TRANSMIT_INCOMPLETE,从而反复调用transmit(c)。一般情况下,最终返回TRANSMIT_COMPLETE。(其他返回情况暂不考虑)
对于TRANSMIT_COMPLETE,调用conn_set_state(c, c->write_and_go),注意前面out_string()中设置c->write_and_go = conn_new_cmd,这样又进入了conn_new_cmd状态,可以继续接收新的指令了。
其他状态:
当运行中出错时,常常会进入conn_closing状态。
对于UDP,执行conn_cleanup(c)。如果c->item != NULL,item_remove(c->item);之后检测c->ileft,c->suffixleft,c->write_and_free,c->sasl_conn;对于UDP,最终设置conn_set_state(c, conn_read)。
对于TCP,执行conn_close(c)。首先调用event_del(&c->event),close(c->sfd),令allow_new_conns = true(一般情况下此值一直为true)。然后调用conn_cleanup(c)。最后将struct conn放入freeconns数组,或者conn_free(c)(释放c的各个动态空间)。
在前面conn_parse_cmd状态中,曾执行过process_update_command(),该函数调用item_alloc(),若返回值为NULL(item都没分配成功,后面的数据又有何用,只能舍弃掉),则向客户端发送错误报告,并在发送完之后进入conn_swallow状态(c->write_and_go = conn_swallow; c->sbytes = vlen;)。
conn_swallow状态实际上就是从客户端读入c->sbytes个字符,并舍弃掉。正常情况下,最终读完c->sbytes个字符,c->sbytes == 0,从而进入conn_new_cmd状态。