ovs-vswitchd (referred to as "ovsd" below) is the main userspace management daemon of OVS. It reacts to configuration changes made through ovs-vsctl (via the OVSDB IDL), serves the OpenFlow interface used by controllers and by ovs-ofctl, and handles ovs-appctl (unixctl) commands. This article analyzes its overall processing mechanism.
1. The main function
bridge_init(remote);
free(remote);

exiting = false;
while (!exiting) {                     // main loop: runs until exit
    memory_run();
    if (memory_should_report()) {
        struct simap usage;

        simap_init(&usage);
        bridge_get_memory_usage(&usage);
        memory_report(&usage);
        simap_destroy(&usage);
    }
    bridge_run();                      // OpenFlow handling: controllers and ovs-ofctl
    unixctl_server_run(unixctl);       // ovs-appctl (unixctl) command handling
    netdev_run();

    memory_wait();
    bridge_wait();
    unixctl_server_wait(unixctl);
    netdev_wait();
    if (exiting) {
        poll_immediate_wake();
    }
    poll_block();                      // blocks here when there is nothing to process
    if (should_service_stop()) {
        exiting = true;
    }
}
bridge_exit();
unixctl_server_destroy(unixctl);
service_stop();
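The loop above follows a convention used throughout OVS: each subsystem exposes an X_run() function that performs whatever work is ready and an X_wait() function that registers the events it wants to be woken for, and poll_block() is the single place the process sleeps. Below is a minimal standalone sketch of that shape in plain POSIX C; it is not OVS code, and the stdin-echo "subsystem" is made up purely for illustration.

#include <poll.h>
#include <unistd.h>

int main(void)
{
    char buf[128];

    for (;;) {
        /* "wait" phase: declare what should wake us this iteration. */
        struct pollfd pfd = { .fd = STDIN_FILENO, .events = POLLIN };

        /* "block" phase: the single place the process sleeps. */
        if (poll(&pfd, 1, -1) < 0) {
            return 1;
        }

        /* "run" phase: do whatever became ready. */
        if (pfd.revents & POLLIN) {
            ssize_t n = read(STDIN_FILENO, buf, sizeof buf);
            if (n <= 0) {
                return 0;              /* EOF or error: leave the loop */
            }
            (void) write(STDOUT_FILENO, buf, n);
        }
    }
}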
2. The poll_block function

void
poll_block(void)
{
    struct poll_loop *loop = poll_loop();   // poll state carried by the ovsd process; the fd list is the key part
    struct poll_node *node;
    struct pollfd *pollfds;
    HANDLE *wevents = NULL;
    int elapsed;
    int retval;
    int i;

    /* Register fatal signal events before actually doing any real work for
     * poll_block. */
    fatal_signal_wait();

    if (loop->timeout_when == LLONG_MIN) {
        COVERAGE_INC(poll_zero_timeout);
    }

    timewarp_run();
    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);

#ifdef _WIN32
    wevents = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *wevents);
#endif

    /* Populate with all the fds and events. */
    i = 0;
    HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
        pollfds[i] = node->pollfd;
#ifdef _WIN32
        wevents[i] = node->wevent;
        if (node->pollfd.fd && node->wevent) {
            short int wsa_events = 0;
            if (node->pollfd.events & POLLIN) {
                wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
            }
            if (node->pollfd.events & POLLOUT) {
                wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
            }
            WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
        }
#endif
        i++;
    }

    retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,   // time_poll eventually calls the Linux poll()
                       loop->timeout_when, &elapsed);
    if (retval < 0) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
        VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
    } else if (!retval) {
        log_wakeup(loop->timeout_where, NULL, elapsed);
    } else if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
        i = 0;
        HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
            if (pollfds[i].revents) {
                log_wakeup(node->where, &pollfds[i], 0);
            }
            i++;
        }
    }

    free_poll_nodes(loop);
    loop->timeout_when = LLONG_MAX;
    loop->timeout_where = NULL;
    free(pollfds);
    free(wevents);

    /* Handle any pending signals before doing anything else. */
    fatal_signal_run();
    seq_woke();
}
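Stripped of the Windows branch and the logging, the core of poll_block() is: copy every registered node's pollfd into a flat array, hand the array to poll() (via time_poll()), then check revents to see which registration woke the process. A standalone miniature of just that mechanism, where a plain array stands in for OVS's hmap of poll_nodes:

#include <poll.h>
#include <stdio.h>

struct mini_node {                     /* stand-in for struct poll_node */
    struct pollfd pollfd;
    const char *where;                 /* who registered this fd */
};

int main(void)
{
    /* Nodes that poll_fd_wait()-style calls would have registered. */
    struct mini_node nodes[] = {
        { { .fd = 0, .events = POLLIN }, "stdin-reader" },
    };
    enum { N_NODES = sizeof nodes / sizeof nodes[0] };
    struct pollfd pollfds[N_NODES];
    int i, retval;

    for (i = 0; i < N_NODES; i++) {    /* populate, as poll_block() does */
        pollfds[i] = nodes[i].pollfd;
    }

    retval = poll(pollfds, N_NODES, 5000);      /* 5 s timeout */
    if (retval < 0) {
        perror("poll");
    } else if (!retval) {
        printf("woke up by timeout\n");
    } else {
        for (i = 0; i < N_NODES; i++) {         /* like log_wakeup() */
            if (pollfds[i].revents) {
                printf("woken by %s\n", nodes[i].where);
            }
        }
    }
    return 0;
}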
3. The time_poll function

int
time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED,
          long long int timeout_when, int *elapsed)
{
    long long int *last_wakeup = last_wakeup_get();
    long long int start;
    bool quiescent;
    int retval = 0;

    time_init();
    coverage_clear();
    coverage_run();
    if (*last_wakeup && !thread_is_pmd()) {
        log_poll_interval(*last_wakeup);
    }
    start = time_msec();

    timeout_when = MIN(timeout_when, deadline);
    quiescent = ovsrcu_is_quiescent();

    for (;;) {
        long long int now = time_msec();
        int time_left;

        if (now >= timeout_when) {
            time_left = 0;
        } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
            time_left = INT_MAX;
        } else {
            time_left = timeout_when - now;
        }

        if (!quiescent) {
            if (!time_left) {
                ovsrcu_quiesce();
            } else {
                ovsrcu_quiesce_start();
            }
        }

#ifndef _WIN32
        retval = poll(pollfds, n_pollfds, time_left);   // the call into the Linux poll()
        if (retval < 0) {
            retval = -errno;
        }
#else
        if (n_pollfds > MAXIMUM_WAIT_OBJECTS) {
            VLOG_ERR("Cannot handle more than maximum wait objects\n");
        } else if (n_pollfds != 0) {
            retval = WaitForMultipleObjects(n_pollfds, handles, FALSE,
                                            time_left);
        }
        if (retval < 0) {
            /* XXX This will be replace by a win error to errno
               conversion function */
            retval = -WSAGetLastError();
            retval = -EINVAL;
        }
#endif

        if (!quiescent && time_left) {
            ovsrcu_quiesce_end();
        }

        if (deadline <= time_msec()) {
#ifndef _WIN32
            fatal_signal_handler(SIGALRM);
#else
            VLOG_ERR("wake up from WaitForMultipleObjects after deadline");
            fatal_signal_handler(SIGTERM);
#endif
            if (retval < 0) {
                retval = 0;
            }
            break;
        }

        if (retval != -EINTR) {
            break;
        }
    }
    *last_wakeup = time_msec();
    refresh_rusage();
    *elapsed = *last_wakeup - start;
    return retval;
}
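The subtle part of time_poll() is converting timeout_when, an absolute wall-clock time in milliseconds, into the relative timeout that poll() expects, clamping it to INT_MAX, and retrying when a signal interrupts the call with EINTR. Here is a simplified standalone version of just that logic; the RCU quiescing and the Windows branch are dropped, and poll_until()/now_msec() are names made up for this sketch:

#include <errno.h>
#include <limits.h>
#include <poll.h>
#include <sys/time.h>

static long long now_msec(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (long long) tv.tv_sec * 1000 + tv.tv_usec / 1000;
}

/* Polls 'fds' until an event fires or the absolute time 'timeout_when'
 * (in ms) passes; EINTR is the only error that is retried. */
static int poll_until(struct pollfd *fds, int nfds, long long timeout_when)
{
    for (;;) {
        long long now = now_msec();
        int time_left;

        if (now >= timeout_when) {
            time_left = 0;                       /* deadline already passed */
        } else if (timeout_when - now > INT_MAX) {
            time_left = INT_MAX;                 /* clamp for poll()'s int */
        } else {
            time_left = (int) (timeout_when - now);
        }

        int retval = poll(fds, nfds, time_left);
        if (retval >= 0 || errno != EINTR) {
            return retval;
        }
        /* Interrupted by a signal: recompute time_left and poll again. */
    }
}

int main(void)
{
    struct pollfd pfd = { .fd = 0, .events = POLLIN };
    return poll_until(&pfd, 1, now_msec() + 3000) < 0;  /* 3 s deadline */
}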
As the code above shows, poll_block() ultimately calls the Linux poll() function. So which fds does it monitor, and when are those fds registered in the process's poll_loop? The rest of this article walks through the ofservice listener as an example; other handles are monitored in much the same way, only the registering functions differ slightly.
4. The bridge_wait function
void
bridge_wait(void)
{
    struct sset types;
    const char *type;

    ovsdb_idl_wait(idl);
    if (daemonize_txn) {
        ovsdb_idl_txn_wait(daemonize_txn);
    }

    if_notifier_wait();
    if (ifaces_changed) {
        poll_immediate_wake();
    }

    sset_init(&types);
    ofproto_enumerate_types(&types);
    SSET_FOR_EACH (type, &types) {
        ofproto_type_wait(type);
    }
    sset_destroy(&types);

    if (!hmap_is_empty(&all_bridges)) {
        struct bridge *br;

        HMAP_FOR_EACH (br, node, &all_bridges) {
            ofproto_wait(br->ofproto);          // entry point of the call chain we follow
        }
        stats_update_wait();
        status_update_wait();
    }
    system_stats_wait();
}

5. The ofproto_wait function
void
ofproto_wait(struct ofproto *p)
{
    p->ofproto_class->wait(p);
    if (p->ofproto_class->port_poll_wait) {
        p->ofproto_class->port_poll_wait(p);
    }
    seq_wait(connectivity_seq_get(), p->change_seq);
    connmgr_wait(p->connmgr);                   // next step in the chain
}

6. The connmgr_wait function
void
connmgr_wait(struct connmgr *mgr)
{
    struct ofservice *ofservice;
    struct ofconn *ofconn;
    size_t i;

    LIST_FOR_EACH (ofconn, node, &mgr->all_conns) {
        ofconn_wait(ofconn);
    }
    ofmonitor_wait(mgr);
    if (mgr->in_band) {
        in_band_wait(mgr->in_band);
    }
    if (mgr->fail_open) {
        fail_open_wait(mgr->fail_open);
    }
    HMAP_FOR_EACH (ofservice, node, &mgr->services) {
        pvconn_wait(ofservice->pvconn);         // wait on the ofservice listening handle
    }
    for (i = 0; i < mgr->n_snoops; i++) {
        pvconn_wait(mgr->snoops[i]);
    }
}
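Note that pvconn_wait() only arms the wakeup; the actual work happens on the run side, where connmgr_run() accepts the pending connection once poll_block() returns. The fragment below is simplified from connmgr_run(), with error handling and the ofconn/rconn setup elided, so treat it as a sketch rather than a verbatim quote:

/* Inside connmgr_run(), for each ofservice: */
struct vconn *vconn;
int retval = pvconn_accept(ofservice->pvconn, &vconn);  /* non-blocking */
if (!retval) {
    /* Wrap 'vconn' in an ofconn and start speaking OpenFlow on it. */
} else if (retval != EAGAIN) {
    VLOG_WARN_RL(&rl, "accept failed (%s)", ovs_strerror(retval));
}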
7. The pvconn_wait function

void
pvconn_wait(struct pvconn *pvconn)
{
    (pvconn->pvclass->wait)(pvconn);            // actually calls pvconn_pstream_wait()
}

8. The pvconn_pstream_wait function
static void
pvconn_pstream_wait(struct pvconn *pvconn)
{
    struct pvconn_pstream *ps = pvconn_pstream_cast(pvconn);
    pstream_wait(ps->pstream);
}

9. The pstream_wait function
void
pstream_wait(struct pstream *pstream)
{
    (pstream->class->wait)(pstream);            // actually calls fd_pstream_class's pfd_wait()
}

10. The pfd_wait function
static void
pfd_wait(struct pstream *pstream)
{
    struct fd_pstream *ps = fd_pstream_cast(pstream);
    poll_fd_wait(ps->fd, POLLIN);               // this fd is the listening handle opened when the ofservice was created
}
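POLLIN is the right event for a passive handle because, on a listening socket, poll() reports readability when a connection is pending, at which point accept() returns immediately. A standalone demonstration in plain POSIX C (port 16633 is an arbitrary choice for this example):

#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in sin;
    int lfd = socket(AF_INET, SOCK_STREAM, 0);

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    sin.sin_port = htons(16633);                /* arbitrary example port */
    if (lfd < 0 || bind(lfd, (struct sockaddr *) &sin, sizeof sin) < 0
        || listen(lfd, 10) < 0) {
        perror("socket/bind/listen");
        return 1;
    }

    /* The same registration pfd_wait() performs: passive fd, POLLIN. */
    struct pollfd pfd = { .fd = lfd, .events = POLLIN };
    if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
        int cfd = accept(lfd, NULL, NULL);      /* a connection is pending */
        printf("accepted connection, fd=%d\n", cfd);
        close(cfd);
    }
    close(lfd);
    return 0;
}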
11. The poll_fd_wait macro

#define poll_fd_wait(fd, events) poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)

12. The poll_fd_wait_at function
/* Registers 'fd' as waiting for the specified 'events' (which should be POLLIN
 * or POLLOUT or POLLIN | POLLOUT).  The following call to poll_block() will
 * wake up when 'fd' becomes ready for one or more of the requested events.
 *
 * On Windows, 'fd' must be a socket.
 *
 * The event registration is one-shot: only the following call to poll_block()
 * is affected.  The event will need to be re-registered after poll_block() is
 * called if it is to persist.
 *
 * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
 * automatically provide the caller's source file and line number for
 * 'where'.) */
void
poll_fd_wait_at(int fd, short int events, const char *where)
{
    poll_create_node(fd, 0, events, where);
}
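Because registration is one-shot, every OVS module re-arms its fds on each pass through the main loop: run, re-register, block. The sketch below shows a typical caller shape; poll_fd_wait() and poll_block() are the real API from lib/poll-loop.h, while do_ready_work() is a hypothetical placeholder, so the fragment is illustrative rather than compilable as-is.

#include "poll-loop.h"              /* OVS poll-loop API, path as in the OVS tree */

static void
fd_loop(int fd)
{
    for (;;) {
        do_ready_work(fd);          /* hypothetical: drain whatever is ready */
        poll_fd_wait(fd, POLLIN);   /* one-shot: arms only the next block */
        poll_block();               /* sleeps until 'fd' is readable again */
    }
}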
13. The poll_create_node function

static void
poll_create_node(int fd, HANDLE wevent, short int events, const char *where)
{
    struct poll_loop *loop = poll_loop();       // the process's poll_loop state
    struct poll_node *node;

    COVERAGE_INC(poll_create_node);

    /* Both 'fd' and 'wevent' cannot be set. */
    ovs_assert(!fd != !wevent);

    /* Check for duplicate.  If found, "or" the events. */
    node = find_poll_node(loop, fd, wevent);
    if (node) {
        node->pollfd.events |= events;          // fd already registered: just add the new events
    } else {
        node = xzalloc(sizeof *node);           // new poll_node, inserted into the poll_loop
        hmap_insert(&loop->poll_nodes, &node->hmap_node,
                    hash_2words(fd, (uint32_t) wevent));
        node->pollfd.fd = fd;
        node->pollfd.events = events;
#ifdef _WIN32
        if (!wevent) {
            wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
        }
#endif
        node->wevent = wevent;
        node->where = where;
    }
}

At this point we have seen how a handle is added to the poll_loop and then monitored by poll_block(). The other handles are registered and monitored in much the same way, so they are not covered one by one here.
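As a closing illustration, the find-or-create logic of poll_create_node() boils down to the following standalone sketch, where a flat array replaces OVS's hmap and fd_wait() is a made-up name:

#include <assert.h>
#include <poll.h>
#include <stdio.h>

#define MAX_NODES 16

static struct pollfd nodes[MAX_NODES];
static int n_nodes;

/* Register 'fd' for 'events'; a duplicate fd merges events instead of
 * adding a second entry, exactly like poll_create_node(). */
static void fd_wait(int fd, short events)
{
    for (int i = 0; i < n_nodes; i++) {
        if (nodes[i].fd == fd) {
            nodes[i].events |= events;   /* duplicate: "or" the events */
            return;
        }
    }
    assert(n_nodes < MAX_NODES);
    nodes[n_nodes].fd = fd;              /* new node */
    nodes[n_nodes].events = events;
    n_nodes++;
}

int main(void)
{
    fd_wait(0, POLLIN);
    fd_wait(0, POLLOUT);                 /* same fd: merged, not duplicated */
    printf("n_nodes=%d events=0x%x\n", n_nodes, (unsigned) nodes[0].events);
    return 0;
}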