Memcached源码阅读之线程交互
Memcached按之前的分析可以知道,其是典型的Master-Worker线程模型,这种模型很典型,其工作模型是Master绑定端口,监听网络连接,接受网络连接之后,通过线程间通信来唤醒Worker线程,Worker线程对已经连接的描述符执行读写操作,这种模型简化了整个通信模型,下面分析下这个过程。
- /* Fragment of drive_machine(): the master (dispatcher) thread handling
-    its listening socket after libevent reports it readable. */
- case conn_listening:
- addrlen = sizeof(addr);
-
- if ((sfd = accept(c->sfd, (struct sockaddr *) &addr, &addrlen)) == -1)
- {
-
- /* EAGAIN/EWOULDBLOCK: nothing pending on the non-blocking
-    listener -- stop the loop and wait for the next event. */
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- {
- stop = true;
- }
-
- /* EMFILE: the process is out of file descriptors; temporarily
-    stop accepting until some connections close. */
- else if (errno == EMFILE)
- {
- if (settings.verbose > 0)
- fprintf(stderr, "Too many open connections\n");
- accept_new_conns(false);
- stop = true;
- }
- else
- {
- perror("accept()");
- stop = true;
- }
- break;
- }
-
- /* Make the accepted socket non-blocking before handing it off. */
- if ((flags = fcntl(sfd, F_GETFL, 0)) < 0
- || fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0)
- {
- perror("setting O_NONBLOCK");
- close(sfd);
- break;
- }
-
- /* With maxconns_fast enabled, reject the client immediately when
-    the connection limit is reached instead of letting it queue. */
- if (settings.maxconns_fast
- && stats.curr_conns + stats.reserved_fds
- >= settings.maxconns - 1)
- {
- str = "ERROR Too many open connections\r\n";
- res = write(sfd, str, strlen(str));
- close(sfd);
- STATS_LOCK();
- stats.rejected_conns++;
- STATS_UNLOCK();
- }
- else
- {
- /* Hand the new connection to a worker thread. */
- dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST,
- DATA_BUFFER_SIZE, tcp_transport);
- }
-
- stop = true;
- break;
这个是TCP的连接建立过程,由于UDP不需要建立连接,所以直接分发给Worker线程,让Worker线程进行读写操作,而TCP在建立连接之后,也执行连接分发(和UDP的一样),下面看看dispatch_conn_new内部是如何进行连接分发的。
- void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags,
- int read_buffer_size, enum network_transport transport) {
- CQ_ITEM *item = cqi_new();
- char buf[1];
- int tid = (last_thread + 1) % settings.num_threads;
-
- LIBEVENT_THREAD *thread = threads + tid;
-
- last_thread = tid;
-
- item->sfd = sfd;
- item->init_state = init_state;
- item->event_flags = event_flags;
- item->read_buffer_size = read_buffer_size;
- item->transport = transport;
-
- cq_push(thread->new_conn_queue, item);
-
- MEMCACHED_CONN_DISPATCH(sfd, thread->thread_id);
- buf[0] = 'c';
-
- if (write(thread->notify_send_fd, buf, 1) != 1) {
- perror("Writing to thread notify pipe");
- }
- }
投递到子线程的连接队列之后,同时通过往子线程的PIPE管道写入字符c来通知子线程,下面我们看看子线程是如何处理的?
-
- /* Worker-side setup: a persistent READ event on the notify pipe's
-    receive end, dispatching to thread_libevent_process when the
-    master writes to the pipe. */
- event_set(&me->notify_event, me->notify_receive_fd,
- EV_READ | EV_PERSIST, thread_libevent_process, me);
-
- static void thread_libevent_process(int fd, short which, void *arg) {
- LIBEVENT_THREAD *me = arg;
- CQ_ITEM *item;
- char buf[1];
-
- if (read(fd, buf, 1) != 1)
- if (settings.verbose > 0)
- fprintf(stderr, "Can't read from libevent pipe\n");
-
- switch (buf[0]) {
- case 'c':
- item = cq_pop(me->new_conn_queue);
-
- if (NULL != item) {
- conn *c = conn_new(item->sfd, item->init_state, item->event_flags,
- item->read_buffer_size, item->transport, me->base);
- if (c == NULL) {
- if (IS_UDP(item->transport)) {
- fprintf(stderr, "Can't listen for events on UDP socket\n");
- exit(1);
- } else {
- if (settings.verbose > 0) {
- fprintf(stderr, "Can't listen for events on fd %d\n",
- item->sfd);
- }
- close(item->sfd);
- }
- } else {
- c->thread = me;
- }
- cqi_free(item);
- }
- break;
- }
- }
之前分析过conn_new的执行流程,conn_new里面会建立sfd的网络监听libevent事件,事件回调函数为event_handler。
- /* Inside conn_new(): register the per-connection libevent event on
-    sfd; all subsequent socket activity funnels into event_handler. */
- event_set(&c->event, sfd, event_flags, event_handler, (void *) c);
- event_base_set(base, &c->event);
而event_handler的执行流程最终会进入到业务处理的状态机中,关于状态机,后续分析。
Memcached源码分析之状态机(一)
按我们之前的描述,Master线程建立连接之后,分发给Worker线程,而Worker线程处理业务逻辑时,会进入状态机,状态机按不同的状态处理业务逻辑,我们在分析连接分发时,已经看到了Master线程进入状态机时在有新连接建立的时候,后续的状态都是业务逻辑的状态,其处理流程如下图所示:
总共有10个状态(代码中的状态不止这些,有些没什么用,此处就没展现),listening状态是Master建立连接的过程,我们已经分析过了,我们接下来分不同的文章分析其余的9种状态。
- /* States of the per-connection state machine (drive_machine). */
- enum conn_states {
- conn_listening, /* master: socket is listening for new connections */
- conn_new_cmd, /* prepare this connection for the next command */
- conn_waiting, /* waiting for the socket to become readable */
- conn_read, /* reading data from the socket */
- conn_parse_cmd, /* try to parse a command from the input buffer */
- conn_write, /* writing out a simple response */
- conn_nread, /* reading a fixed number of bytes (e.g. a value) */
- conn_swallow, /* discarding unneeded input bytes */
- conn_closing, /* closing this connection */
- conn_mwrite, /* writing out many items sequentially */
- conn_max_state /* sentinel: count of states, not a real state */
- };
这篇文章先分析conn_new_cmd和conn_waiting状态,子线程最初进入的状态就是conn_new_cmd状态,这个状态主要是做一些清理。
- /* drive_machine fragment: conn_new_cmd -- reset per-command state.
-    nreqs caps how many requests one connection may run back-to-back,
-    so a single busy client cannot starve the other connections. */
- case conn_new_cmd:
- --nreqs;
- if (nreqs >= 0)
- {
- reset_cmd_handler(c);
- }
- else
- {
- /* Quota exhausted: record the yield, give others a turn. */
- pthread_mutex_lock(&c->thread->stats.mutex);
- c->thread->stats.conn_yields++;
- pthread_mutex_unlock(&c->thread->stats.mutex);
- if (c->rbytes > 0)
- {
- /* Input is already buffered, so a READ event would not fire;
-    arm WRITE to force an immediate callback next loop. */
- if (!update_event(c, EV_WRITE | EV_PERSIST))
- {
- if (settings.verbose > 0)
- fprintf(stderr, "Couldn't update event\n");
- conn_set_state(c, conn_closing);
- }
- }
- stop = true;
- }
- break;
-
- static void reset_cmd_handler(conn *c)
- {
- c->cmd = -1;
- c->substate = bin_no_state;
- if (c->item != NULL)
- {
- item_remove(c->item);
- c->item = NULL;
- }
- conn_shrink(c);
- if (c->rbytes > 0)
- {
- conn_set_state(c, conn_parse_cmd);
- }
- else
- {
- conn_set_state(c, conn_waiting);
- }
- }
-
- /* Shrink per-connection buffers that grew past their high-water marks
-    back toward their initial sizes; called between commands so memory
-    spent on one large request is not held forever. */
- static void conn_shrink(conn *c)
- {
- assert(c != NULL);
-
- /* UDP connections are never shrunk. */
- if (IS_UDP(c->transport))
- return;
-
- if (c->rsize > READ_BUFFER_HIGHWAT && c->rbytes < DATA_BUFFER_SIZE)
- {
- char *newbuf;
-
- /* Slide unread bytes to the front before shrinking, so the
-    realloc cannot cut them off. */
- if (c->rcurr != c->rbuf)
- memmove(c->rbuf, c->rcurr, (size_t) c->rbytes);
-
- newbuf = (char *) realloc((void *) c->rbuf, DATA_BUFFER_SIZE);
-
- /* Best effort: on realloc failure keep the old larger buffer. */
- if (newbuf)
- {
- c->rbuf = newbuf;
- c->rsize = DATA_BUFFER_SIZE;
- }
- c->rcurr = c->rbuf;
- }
-
- /* Same best-effort shrink for the item list... */
- if (c->isize > ITEM_LIST_HIGHWAT)
- {
- item **newbuf = (item**) realloc((void *) c->ilist,ITEM_LIST_INITIAL * sizeof(c->ilist[0]));
- if (newbuf)
- {
- c->ilist = newbuf;
- c->isize = ITEM_LIST_INITIAL;
- }
- }
-
- /* ...the msghdr list... */
- if (c->msgsize > MSG_LIST_HIGHWAT)
- {
- struct msghdr *newbuf = (struct msghdr *) realloc((void *) c->msglist,MSG_LIST_INITIAL * sizeof(c->msglist[0]));
- if (newbuf)
- {
- c->msglist = newbuf;
- c->msgsize = MSG_LIST_INITIAL;
- }
- }
-
- /* ...and the iovec list. */
- if (c->iovsize > IOV_LIST_HIGHWAT)
- {
- struct iovec *newbuf = (struct iovec *) realloc((void *) c->iov,IOV_LIST_INITIAL * sizeof(c->iov[0]));
- if (newbuf)
- {
- c->iov = newbuf;
- c->iovsize = IOV_LIST_INITIAL;
- }
- }
- }
从conn_new_cmd状态会进入conn_parse_cmd状态(如果有数据)或者conn_waiting(如果没有数据)状态,下面看看conn_waiting状态。
- /* drive_machine fragment: conn_waiting -- arm a READ event and hand
-    control back to libevent until data arrives, then read it. */
- case conn_waiting:
- if (!update_event(c, EV_READ | EV_PERSIST))
- {
- if (settings.verbose > 0)
- fprintf(stderr, "Couldn't update event\n");
- conn_set_state(c, conn_closing);
- break;
- }
-
- conn_set_state(c, conn_read);
- stop = true;
- break;
-
- static bool update_event(conn *c, const int new_flags)
- {
- assert(c != NULL);
-
- struct event_base *base = c->event.ev_base;
- if (c->ev_flags == new_flags)
- return true;
- if (event_del(&c->event) == -1)
- return false;
- event_set(&c->event, c->sfd, new_flags, event_handler, (void *) c);
- event_base_set(base, &c->event);
- c->ev_flags = new_flags;
- if (event_add(&c->event, 0) == -1)
- return false;
- return true;
- }
备注:图片参考地址,http://blog.chinaunix.net/uid-27767798-id-3415510.html
Memcached源码分析之状态机(二)
通过前面一篇文章分析得知,conn_waiting状态是在等待读取数据,conn_waiting通过修改libevent事件(修改为读事件)之后就进入了conn_read状态,该状态就是从网络中读取数据,下面我们详细分析conn_read状态。
- /* drive_machine fragment: conn_read -- pull bytes off the socket and
-    route based on the read result. */
- case conn_read:
- res = IS_UDP(c->transport) ? try_read_udp(c) : try_read_network(c);
-
- switch (res)
- {
- case READ_NO_DATA_RECEIVED:
- conn_set_state(c, conn_waiting); /* nothing yet: wait again */
- break;
- case READ_DATA_RECEIVED:
- conn_set_state(c, conn_parse_cmd); /* got bytes: parse them */
- break;
- case READ_ERROR:
- conn_set_state(c, conn_closing); /* peer closed or hard error */
- break;
- case READ_MEMORY_ERROR:
- break; /* error response already queued; state left unchanged */
- }
- break;
-
- /* Read from a TCP socket into the connection's input buffer, growing
-    the buffer (doubling, at most 4 times per call) as needed. Returns
-    READ_DATA_RECEIVED, READ_NO_DATA_RECEIVED, READ_ERROR, or
-    READ_MEMORY_ERROR. */
- static enum try_read_result try_read_network(conn *c)
- {
- enum try_read_result gotdata = READ_NO_DATA_RECEIVED;
- int res;
- int num_allocs = 0;
- assert(c != NULL);
-
- /* Compact: move any unread bytes to the front of the buffer. */
- if (c->rcurr != c->rbuf)
- {
- if (c->rbytes != 0)
- memmove(c->rbuf, c->rcurr, c->rbytes);
- c->rcurr = c->rbuf;
- }
-
- while (1)
- {
- if (c->rbytes >= c->rsize)
- {
- /* Cap growth at 4 doublings per call so one connection cannot
-    monopolize the worker; parse what we already have first. */
- if (num_allocs == 4)
- {
- return gotdata;
- }
- ++num_allocs;
- char *new_rbuf = realloc(c->rbuf, c->rsize * 2);
- if (!new_rbuf)
- {
- if (settings.verbose > 0)
- fprintf(stderr, "Couldn't realloc input buffer\n");
- c->rbytes = 0;
- out_string(c, "SERVER_ERROR out of memory reading request");
- c->write_and_go = conn_closing;
- return READ_MEMORY_ERROR;
- }
- c->rcurr = c->rbuf = new_rbuf;
- c->rsize *= 2;
- }
-
- int avail = c->rsize - c->rbytes;
- res = read(c->sfd, c->rbuf + c->rbytes, avail);
- if (res > 0)
- {
- pthread_mutex_lock(&c->thread->stats.mutex);
- c->thread->stats.bytes_read += res;
- pthread_mutex_unlock(&c->thread->stats.mutex);
- gotdata = READ_DATA_RECEIVED;
- c->rbytes += res;
- /* A completely full read may mean more data is pending. */
- if (res == avail)
- {
- continue;
- }
- else
- {
- break;
- }
- }
- /* read() == 0: the peer performed an orderly shutdown. */
- if (res == 0)
- {
- return READ_ERROR;
- }
- if (res == -1)
- {
- /* Non-blocking socket drained: not an error. */
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- {
- break;
- }
- return READ_ERROR;
- }
- }
- return gotdata;
- }
上面描述的是TCP的数据读取,下面我们分析下UDP的数据读取,UDP是数据报的形式,读取到一个,就是一个完整的数据报,所以其处理过程简单。
-
- /* Read a single UDP datagram. Each memcached UDP packet carries an
-    8-byte frame header ahead of the payload; only single-packet
-    requests are accepted here. */
- static enum try_read_result try_read_udp(conn *c)
- {
- int res;
-
- assert(c != NULL);
-
- /* Record the sender's address so the reply can be routed back. */
- c->request_addr_size = sizeof(c->request_addr);
- res = recvfrom(c->sfd, c->rbuf, c->rsize, 0, &c->request_addr,
- &c->request_addr_size);
- /* A valid packet must be strictly larger than the 8-byte header. */
- if (res > 8)
- {
- unsigned char *buf = (unsigned char *) c->rbuf;
- pthread_mutex_lock(&c->thread->stats.mutex);
- c->thread->stats.bytes_read += res;
- pthread_mutex_unlock(&c->thread->stats.mutex);
-
-
- /* Bytes 0-1: request id, big-endian. */
- c->request_id = buf[0] * 256 + buf[1];
-
-
- /* Bytes 4-5: total packet count -- only a count of exactly 1
-    (0x0001) is supported; multi-packet requests are rejected. */
- if (buf[4] != 0 || buf[5] != 1)
- {
- out_string(c, "SERVER_ERROR multi-packet request not supported");
- return READ_NO_DATA_RECEIVED;
- }
-
-
- /* Strip the frame header; payload moves to the buffer front. */
- res -= 8;
- memmove(c->rbuf, c->rbuf + 8, res);
-
- c->rbytes = res;
- c->rcurr = c->rbuf;
- return READ_DATA_RECEIVED;
- }
- return READ_NO_DATA_RECEIVED;
- }
Memcached源码分析之状态机(三)
按前面2篇文章的分析可以知道,从网络读取了数据之后,将会进入conn_parse_cmd状态,该状态是按协议来解析读取到的网络数据。
- /* drive_machine fragment: conn_parse_cmd -- try to parse a complete
-    command out of the buffered input. */
- case conn_parse_cmd:
-
- /* try_read_command() returning 0 means "not enough data yet":
-    go back to waiting for more input. */
- if (try_read_command(c) == 0)
- {
-
- conn_set_state(c, conn_waiting);
- }
-
- break;
-
- /* Detect the wire protocol (binary vs ascii) on first contact and,
-    for the binary protocol, parse one request header and dispatch it.
-    NOTE(review): this excerpt is truncated by the article -- the ascii
-    branch and the function's closing brace are omitted. */
- static int try_read_command(conn *c)
- {
- assert(c != NULL);
- assert(c->rcurr <= (c->rbuf + c->rsize));
- assert(c->rbytes > 0);
-
- /* Protocol not yet decided (or UDP): sniff the first byte. */
- if (c->protocol == negotiating_prot || c->transport == udp_transport)
- {
-
- if ((unsigned char) c->rbuf[0] == (unsigned char) PROTOCOL_BINARY_REQ)
- {
- c->protocol = binary_prot;
- }
- else
- {
- c->protocol = ascii_prot;
- }
-
- if (settings.verbose > 1)
- {
- fprintf(stderr, "%d: Client using the %s protocol\n", c->sfd,
- prot_text(c->protocol));
- }
- }
-
- if (c->protocol == binary_prot)
- {
-
- /* Need a full fixed-size header before anything can be parsed. */
- if (c->rbytes < sizeof(c->binary_header))
- {
-
- return 0;
- }
- else
- {
- #ifdef NEED_ALIGN
-
- /* Some platforms fault on misaligned loads: realign to 8. */
- if (((long)(c->rcurr)) % 8 != 0)
- {
-
- memmove(c->rbuf, c->rcurr, c->rbytes);
- c->rcurr = c->rbuf;
- if (settings.verbose > 1)
- {
- fprintf(stderr, "%d: Realign input buffer\n", c->sfd);
- }
- }
- #endif
- protocol_binary_request_header* req;
- req = (protocol_binary_request_header*) c->rcurr;
-
- /* Verbose mode: hex-dump the raw header bytes. */
- if (settings.verbose > 1)
- {
-
- int ii;
- fprintf(stderr, "<%d Read binary protocol data:", c->sfd);
- for (ii = 0; ii < sizeof(req->bytes); ++ii)
- {
- if (ii % 4 == 0)
- {
- fprintf(stderr, "\n<%d ", c->sfd);
- }
- fprintf(stderr, " 0x%02x", req->bytes[ii]);
- }
- fprintf(stderr, "\n");
- }
-
- /* Copy the header, converting multi-byte fields from network
-    to host byte order. */
- c->binary_header = *req;
- c->binary_header.request.keylen = ntohs(req->request.keylen);
- c->binary_header.request.bodylen = ntohl(req->request.bodylen);
- c->binary_header.request.cas = ntohll(req->request.cas);
-
- if (c->binary_header.request.magic != PROTOCOL_BINARY_REQ)
- {
- if (settings.verbose)
- {
- fprintf(stderr, "Invalid magic: %x\n",
- c->binary_header.request.magic);
- }
- conn_set_state(c, conn_closing);
- return -1;
- }
-
- /* Reset the outgoing msg/iov lists for the response. */
- c->msgcurr = 0;
- c->msgused = 0;
- c->iovused = 0;
- if (add_msghdr(c) != 0)
- {
- out_string(c, "SERVER_ERROR out of memory");
- return 0;
- }
-
- c->cmd = c->binary_header.request.opcode;
- c->keylen = c->binary_header.request.keylen;
- c->opaque = c->binary_header.request.opaque;
-
- c->cas = 0;
-
- /* Route to the opcode-specific handler (get, set, ...). */
- dispatch_bin_command(c);
-
- /* Consume the header from the input buffer. */
- c->rbytes -= sizeof(c->binary_header);
- c->rcurr += sizeof(c->binary_header);
- }
- }
文本协议的过程和二进制协议的过程类似,此处不分析,另外dispatch_bin_command是处理具体的(比如get,set等)操作的,是和二进制协议具体相关的,解析完一些数据之后,会进入到conn_nread的流程,也就是读取指定数目数据的过程,这个过程主要是做具体的操作了,比如get,add,set操作。
- /* conn_nread completion fragment: once the required bytes have been
-    read, dispatch by binary-protocol substate to the concrete op. */
- case bin_read_set_value:
- complete_update_bin(c);
- break;
- case bin_reading_get_key:
- process_bin_get(c);
- break;
状态机的整个处理过程就介绍到这里,其他的状态我们就不介绍了,后续的文章主要是分析一些数据的操作和内存结构,了解了这些之后,其实其他状态就相对容易很多。
上文来自:http://blog.csdn.net/lcli2009?viewmode=contents