ovs-vswitchd的主要处理流程在主循环while里,话不多说,直接看代码吧。
vswitchd/ovs-vswitchd.c
main(int argc, char *argv[])
char *unixctl_path = NULL;
struct unixctl_server *unixctl;
char *remote;
remote = parse_options(argc, argv, &unixctl_path);
unixctl_server_create(unixctl_path, &unixctl);
bridge_init(remote);
/* Create connection to database. */
//只是创建结构体,还没有真正连接数据库
idl = ovsdb_idl_create(remote, &ovsrec_idl_class, true, true);
lacp_init();
bond_init();
...
exiting = false;
cleanup = false;
while (!exiting) {
1. memory_run();
2. bridge_run();
2.0
a. 连接数据库
b. 发送 monitor_cond 请求给 ovsdb-server,一旦数据库配置有改变(比如添加/删除网桥),则会通知 ovs-vswitchd
ovsdb_idl_run(idl);
jsonrpc_session_run(idl->session);
//当s->stream 不为空时,分配 rpc 结构,后续使用 rpc 和 ovsdb-server 通信,
//但底层还是使用 stream 收发数据
if (s->stream) {
stream_connect(s->stream);
s->rpc = jsonrpc_open(s->stream);
rpc = xzalloc(sizeof *rpc);
rpc->name = xstrdup(stream_get_name(stream));
rpc->stream = stream;
byteq_init(&rpc->input, rpc->input_buffer, sizeof rpc->input_buffer);
ovs_list_init(&rpc->output);
s->stream = NULL;
}
//主动连接 ovsdb-server,将连接后的数据流放在 s->stream
switch (reconnect_run(s->reconnect, time_msec())) {
case RECONNECT_CONNECT:
jsonrpc_session_connect(s);
jsonrpc_stream_open(name, &s->stream, s->dscp);
}
//当 rpc 不为空后,就可以循环接收数据了
for (i = 0; jsonrpc_session_is_connected(idl->session) && i < 50; i++) {
//首先发送请求获取 schema
seqno = jsonrpc_session_get_seqno(idl->session);
if (idl->state_seqno != seqno) {
idl->state_seqno = seqno;
json_destroy(idl->request_id);
idl->request_id = NULL;
ovsdb_idl_txn_abort_all(idl);
//发送 get-schema 请求
ovsdb_idl_send_schema_request(idl);
//将state设置为 IDL_S_SCHEMA_REQUESTED
idl->state = IDL_S_SCHEMA_REQUESTED;
}
msg = jsonrpc_session_recv(idl->session);
if (!msg) {
break;
}
//收到 ovsdb-server 发送的通知,表示有配置改变
if (msg->type == JSONRPC_NOTIFY
&& !strcmp(msg->method, "update2")
&& msg->params->type == JSON_ARRAY
&& msg->params->u.array.n == 2
&& msg->params->u.array.elems[0]->type == JSON_STRING) {
/* Database contents changed. */
ovsdb_idl_parse_update(idl, msg->params->u.array.elems[1], OVSDB_UPDATE2);
} else if (msg->type == JSONRPC_REPLY && idl->request_id && json_equal(idl->request_id, msg->id)) {
json_destroy(idl->request_id);
idl->request_id = NULL;
switch (idl->state) {
//收到 get-schema 的 reply
case IDL_S_SCHEMA_REQUESTED:
/* Reply to our "get_schema" request. */
idl->schema = json_clone(msg->result);
//发送 monitor-cond 请求
ovsdb_idl_send_monitor_cond_request(idl);
//设置 state 为 IDL_S_MONITOR_COND_REQUESTED
idl->state = IDL_S_MONITOR_COND_REQUESTED;
break;
case IDL_S_MONITOR_REQUESTED:
case IDL_S_MONITOR_COND_REQUESTED:
/* Reply to our "monitor" or "monitor_cond" request. */
idl->change_seqno++;
ovsdb_idl_clear(idl);
if (idl->state == IDL_S_MONITOR_REQUESTED) {
idl->state = IDL_S_MONITORING;
ovsdb_idl_parse_update(idl, msg->result, OVSDB_UPDATE);
} else { /* IDL_S_MONITOR_COND_REQUESTED. */
//收到 monitor-cond 响应
idl->state = IDL_S_MONITORING_COND;
ovsdb_idl_parse_update(idl, msg->result, OVSDB_UPDATE2);
}
/* Schema is not useful after monitor request is accepted
* by the server. */
json_destroy(idl->schema);
idl->schema = NULL;
break;
case IDL_S_MONITORING_COND:
/* Conditional monitor clauses were updated. Send out
* the next condition changes, if any, immediately. */
ovsdb_idl_send_cond_change(idl);
idl->cond_seqno++;
break;
2.1
//获取配置
const struct ovsrec_open_vswitch *cfg;
cfg = ovsrec_open_vswitch_first(idl);
//使能硬件offload
netdev_set_flow_api_enabled(&cfg->other_config);
if (smap_get_bool(ovs_other_config, "hw-offload", false)) {
netdev_flow_api_enabled = true;
}
2.2 dpdk初始化
dpdk_init(&cfg->other_config);
if (smap_get_bool(ovs_other_config, "dpdk-init", false)) {
dpdk_init__(ovs_other_config);
/* Make sure things are initialized ... */
rte_eal_init(argc, argv);
rte_pdump_init(ovs_rundir());
/* Finally, register the dpdk classes */
netdev_dpdk_register();
netdev_register_provider(&dpdk_class);
netdev_register_provider(&dpdk_ring_class);
netdev_register_provider(&dpdk_vhost_class);
netdev_register_provider(&dpdk_vhost_client_class);
new_class->init()
struct netdev_registered_class *rc;
rc = xmalloc(sizeof *rc);
cmap_insert(&netdev_classes, &rc->cmap_node, hash_string(new_class->type, 0));
rc->class = new_class;
2.3 注册 ofproto_class
/* Initialize the ofproto library. This only needs to run once, but
* it must be done after the configuration is set. If the
* initialization has already occurred, bridge_init_ofproto()
* returns immediately. */
bridge_init_ofproto(cfg);
static bool initialized = false;
if (initialized) {
return;
}
struct shash iface_hints;
shash_init(&iface_hints);
if (cfg) {
//将配置的所有interface插入 iface_hints
for (i = 0; i < cfg->n_bridges; i++) {
const struct ovsrec_bridge *br_cfg = cfg->bridges[i];
int j;
for (j = 0; j < br_cfg->n_ports; j++) {
struct ovsrec_port *port_cfg = br_cfg->ports[j];
int k;
for (k = 0; k < port_cfg->n_interfaces; k++) {
struct ovsrec_interface *if_cfg = port_cfg->interfaces[k];
struct iface_hint *iface_hint;
iface_hint = xmalloc(sizeof *iface_hint);
iface_hint->br_name = br_cfg->name;
iface_hint->br_type = br_cfg->datapath_type;
iface_hint->ofp_port = iface_pick_ofport(if_cfg);
shash_add(&iface_hints, if_cfg->name, iface_hint);
}
}
}
}
ofproto_init(&iface_hints);
//目前只有一种 ofproto_class,即 ofproto_dpif_class
ofproto_class_register(&ofproto_dpif_class);
static const struct ofproto_class **ofproto_classes;
ofproto_classes[n_ofproto_classes++] = new_class;
/* Make a local copy, since we don't own 'iface_hints' elements. */
//先复制到 ofproto.c 中的静态变量 init_ofp_ports 中
SHASH_FOR_EACH(node, iface_hints) {
const struct iface_hint *orig_hint = node->data;
struct iface_hint *new_hint = xmalloc(sizeof *new_hint);
const char *br_type = ofproto_normalize_type(orig_hint->br_type);
new_hint->br_name = xstrdup(orig_hint->br_name);
new_hint->br_type = xstrdup(br_type);
new_hint->ofp_port = orig_hint->ofp_port;
shash_add(&init_ofp_ports, node->name, new_hint);
for (i = 0; i < n_ofproto_classes; i++) {
//目前只支持一种 ofproto_classes,即 ofproto_dpif_class,调用其 init 函数
ofproto_classes[i]->init(&init_ofp_ports); //init(const struct shash *iface_hints)
/* Make a local copy, since we don't own 'iface_hints' elements. */
//再复制到 ofproto_dpif.c 中的静态变量 init_ofp_ports 中
SHASH_FOR_EACH(node, iface_hints) {
const struct iface_hint *orig_hint = node->data;
struct iface_hint *new_hint = xmalloc(sizeof *new_hint);
new_hint->br_name = xstrdup(orig_hint->br_name);
new_hint->br_type = xstrdup(orig_hint->br_type);
new_hint->ofp_port = orig_hint->ofp_port;
shash_add(&init_ofp_ports, node->name, new_hint);
ofproto_unixctl_init();
unixctl_command_register("fdb/flush", "[bridge]", 0, 1, ofproto_unixctl_fdb_flush, NULL);
unixctl_command_register("fdb/show", "bridge", 1, 1, ofproto_unixctl_fdb_show, NULL);
...
ofproto_dpif_trace_init();
unixctl_command_register(
"ofproto/trace",
"{[dp_name] odp_flow | bridge br_flow} [OPTIONS...] "
"[-generate|packet]", 1, INT_MAX, ofproto_unixctl_trace, NULL);
...
udpif_init();
unixctl_command_register("upcall/show", "", 0, 0, upcall_unixctl_show, NULL);
unixctl_command_register("upcall/disable-megaflows", "", 0, 0, upcall_unixctl_disable_megaflows, NULL);
unixctl_command_register("upcall/enable-megaflows", "", 0, 0, upcall_unixctl_enable_megaflows, NULL);
...
initialized = true;
2.4
bridge_run__();
2.4.1
sset_init(&types);
//获取所有type,目前就两种:system和netdev
ofproto_enumerate_types(&types);
sset_clear(types);
//目前只支持一种 ofproto_classes,即 ofproto_dpif_class
for (i = 0; i < n_ofproto_classes; i++) {
ofproto_classes[i]->enumerate_types(types); //dp_enumerate_types
dp_initialize();
if (ovsthread_once_start(&once)) {
tnl_conf_seq = seq_create();
dpctl_unixctl_register();
tnl_port_map_init();
tnl_neigh_cache_init();
route_table_init();
static const struct dpif_class *base_dpif_classes[] = {
#if defined(__linux__) || defined(_WIN32)
&dpif_netlink_class,
#endif
&dpif_netdev_class,
};
for (i = 0; i < ARRAY_SIZE(base_dpif_classes); i++) {
dp_register_provider(base_dpif_classes[i]);
dp_register_provider__(new_class);
new_class->init()
struct registered_dpif_class * registered_class = xmalloc(sizeof *registered_class);
registered_class->dpif_class = new_class;
registered_class->refcount = 0;
shash_add(&dpif_classes, new_class->type, registered_class);
}
}
SHASH_FOR_EACH(node, &dpif_classes) {
const struct registered_dpif_class *registered_class = node->data;
sset_add(types, registered_class->dpif_class->type);
}
}
2.4.2
SSET_FOR_EACH (type, &types) {
ofproto_type_run(type);
datapath_type = ofproto_normalize_type(datapath_type); //return type && type[0] ? type : "system";
//根据 datapath_type 找到 ofproto_class,目前其实就一种
struct ofproto_class * class = ofproto_class_find__(datapath_type);
for (i = 0; i < n_ofproto_classes; i++) {
const struct ofproto_class *class = ofproto_classes[i];
struct sset types;
bool found;
sset_init(&types);
class->enumerate_types(&types);
found = sset_contains(&types, type);
sset_destroy(&types);
if (found) {
return class;
}
}
class->type_run(datapath_type) //type_run(const char *type)
//all_dpif_backers 是个全局变量。根据type找backer。
//system和netdev类型的数据通路会分别创建一个 backer
backer = shash_find_data(&all_dpif_backers, type);
dpif_run(backer->dpif)
//dpif_netlink_run 或者 dpif_netdev_run
dpif->dpif_class->run(dpif);
udpif_run(backer->udpif);
unixctl_command_reply(udpif->conns[i], NULL);
if (backer->recv_set_enable) {
udpif_set_threads(backer->udpif, n_handlers, n_revalidators);
if (!udpif->handlers && !udpif->revalidators) {
dpif_handlers_set(udpif->dpif, n_handlers);
//只有system类型的数据通路才有此函数 dpif_netlink_handlers_set
dpif->dpif_class->handlers_set(dpif, n_handlers);
udpif_start_threads(udpif, n_handlers, n_revalidators);
udpif->n_handlers = n_handlers;
udpif->n_revalidators = n_revalidators;
//启动线程,这些线程貌似只在 system 数据通路时有用
udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers);
for (i = 0; i < udpif->n_handlers; i++) {
struct handler *handler = &udpif->handlers[i];
handler->udpif = udpif;
handler->handler_id = i;
handler->thread = ovs_thread_create(
"handler", udpif_upcall_handler, handler);
}
udpif->revalidators = xzalloc(udpif->n_revalidators * sizeof *udpif->revalidators);
for (i = 0; i < udpif->n_revalidators; i++) {
struct revalidator *revalidator = &udpif->revalidators[i];
revalidator->udpif = udpif;
revalidator->thread = ovs_thread_create(
"revalidator", udpif_revalidator, revalidator);
}
}
}
if (backer->need_revalidate) {
//申请new_xcfg,将旧配置更新到new_xcfg
xlate_txn_start();
struct xbridge *xbridge;
struct xlate_cfg *xcfg;
ovs_assert(!new_xcfg);
new_xcfg = xmalloc(sizeof *new_xcfg);
hmap_init(&new_xcfg->xbridges);
hmap_init(&new_xcfg->xbundles);
hmap_init(&new_xcfg->xports);
hmap_init(&new_xcfg->xports_uuid);
xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
if (!xcfg) {
return;
}
HMAP_FOR_EACH (xbridge, hmap_node, &xcfg->xbridges) {
xlate_xbridge_copy(xbridge);
}
//将最新的配置添加到 new_xcfg
HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
struct ofport_dpif *ofport;
struct ofbundle *bundle;
if (ofproto->backer != backer) {
continue;
}
//创建 xbridge,并插入 xcfg->xbridges
xlate_ofproto_set(ofproto, ofproto->up.name,
ofproto->backer->dpif, ofproto->ml,
ofproto->stp, ofproto->rstp, ofproto->ms,
ofproto->mbridge, ofproto->sflow, ofproto->ipfix,
ofproto->netflow,
ofproto->up.forward_bpdu,
connmgr_has_in_band(ofproto->up.connmgr),
&ofproto->backer->rt_support);
//创建 xbundle,并插入 xcfg->xbundles
HMAP_FOR_EACH (bundle, hmap_node, &ofproto->bundles) {
xlate_bundle_set(ofproto, bundle, bundle->name,
bundle->vlan_mode, bundle->qinq_ethtype,
bundle->vlan, bundle->trunks, bundle->cvlans,
bundle->use_priority_tags,
bundle->bond, bundle->lacp,
bundle->floodable, bundle->protected);
}
//创建 xport,并插入 xcfg->xports,xbridge->xports 和 xbundle->xports
HMAP_FOR_EACH (ofport, up.hmap_node, &ofproto->up.ports) {
int stp_port = ofport->stp_port ? stp_port_no(ofport->stp_port) : -1;
xlate_ofport_set(ofproto, ofport->bundle, ofport,
ofport->up.ofp_port, ofport->odp_port,
ofport->up.netdev, ofport->cfm, ofport->bfd,
ofport->lldp, ofport->peer, stp_port,
ofport->rstp_port, ofport->qdscp,
ofport->n_qdscp, ofport->up.pp.config,
ofport->up.pp.state, ofport->is_tunnel,
ofport->may_enable);
}
}
xlate_txn_commit();
//xcfgp指针指向最新的配置 new_xcfg
struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
ovsrcu_set(&xcfgp, new_xcfg);
ovsrcu_synchronize();
xlate_xcfg_free(xcfg);
new_xcfg = NULL;
}
}
2.4.3
/* Let each bridge do the work that it needs to do. */
HMAP_FOR_EACH (br, node, &all_bridges) {
ofproto_run(br->ofproto);
p->ofproto_class->run(p); //run(struct ofproto *ofproto_)
if (ofproto->netflow) {
netflow_run(ofproto->netflow);
}
if (ofproto->sflow) {
dpif_sflow_run(ofproto->sflow);
}
if (ofproto->ipfix) {
dpif_ipfix_run(ofproto->ipfix);
}
stp_run(ofproto);
rstp_run(ofproto);
mac_learning_run(ofproto->ml)
mcast_snooping_run(ofproto->ms)
/* Expire OpenFlow flows whose idle_timeout or hard_timeout
* has passed. */
LIST_FOR_EACH_SAFE (rule, next_rule, expirable,
&ofproto->up.expirable) {
rule_expire(rule_dpif_cast(rule), now);
}
//处理 controller,snoop 等连接
connmgr_run(p->connmgr, handle_openflow);
LIST_FOR_EACH_SAFE (ofconn, next_ofconn, node, &mgr->all_conns) {
ofconn_run(ofconn, handle_openflow);
rconn_run(ofconn->rconn);
vconn_run(rc->vconn);
for (i = 0; i < rc->n_monitors; ) {
vconn_run(rc->monitors[i]);
vconn_recv(rc->monitors[i], &msg);
}
/* Limit the number of iterations to avoid starving other tasks. */
for (i = 0; i < 50 && ofconn_may_recv(ofconn); i++) {
struct ofpbuf *of_msg = rconn_recv(ofconn->rconn);
vconn_recv(rc->vconn, &buffer);
//将从controller收到的消息进行复制,发送给所有的monitor,即 snoop
//发送消息给controller时,也会复制,发送给所有monitor,可参考 rconn_send__
copy_to_monitor(rc, buffer);
for (i = 0; i < rc->n_monitors; ) {
struct vconn *vconn = rc->monitors[i];
clone = ofpbuf_clone(b);//buffer
vconn_send(vconn, clone);
}
//处理 openflow 消息
handle_openflow(ofconn, of_msg);
handle_openflow__(ofconn, ofp_msg);
ofptype_decode(&type, oh);
switch (type) {
/* OpenFlow requests. */
case OFPTYPE_ECHO_REQUEST:
return handle_echo_request(ofconn, oh);
case OFPTYPE_FEATURES_REQUEST:
return handle_features_request(ofconn, oh);
}
ofpbuf_delete(of_msg);
}
}
ofmonitor_run(mgr);
//处理 service controller 连接
HMAP_FOR_EACH (ofservice, node, &mgr->services) {
struct vconn *vconn;
pvconn_accept(ofservice->pvconn, &vconn);
struct rconn *rconn;
rconn = rconn_create(ofservice->probe_interval, 0, ofservice->dscp, vconn_get_allowed_versions(vconn));
struct rconn *rc = xzalloc(sizeof *rc);
ovs_mutex_init(&rc->mutex);
rc->state = S_VOID;
rc->state_entered = time_now();
rc->vconn = NULL;
rc->n_monitors = 0;
rconn_connect_unreliably(rconn, vconn, name);
rc->vconn = vconn;
struct ofconn *ofconn;
//创建 ofconn,将新连接添加到 mgr->all_conns
ofconn = ofconn_create(mgr, rconn, OFCONN_SERVICE, ofservice->enable_async_msgs);
struct ofconn *ofconn;
ofconn = xzalloc(sizeof *ofconn);
ofconn->connmgr = mgr;
ovs_list_push_back(&mgr->all_conns, &ofconn->node);
ofconn->rconn = rconn;
ofconn->type = type;
}
//处理 snoop
for (i = 0; i < mgr->n_snoops; i++) {
struct vconn *vconn;
//如果snoop有新连接,则 add_snooper
pvconn_accept(mgr->snoops[i], &vconn);
add_snooper(mgr, vconn);
struct ofconn *ofconn, *best;
/* Pick a controller for monitoring. */
best = NULL;
LIST_FOR_EACH (ofconn, node, &mgr->all_conns) {
if (ofconn->type == OFCONN_PRIMARY
&& (!best || snoop_preference(ofconn) > snoop_preference(best))) {
best = ofconn;
}
}
if (best) {
rconn_add_monitor(best->rconn, vconn);
rc->monitors[rc->n_monitors++] = vconn;
} else {
VLOG_INFO_RL(&rl, "no controller connection to snoop");
vconn_close(vconn);
}
}
}
2.5
bridge_reconfigure(cfg ? cfg : &null_cfg);
2.5.0 解析配置
//flow-limit 决定dpcls流表最大个数限制
ofproto_set_flow_limit(smap_get_int(&ovs_cfg->other_config, "flow-limit",
OFPROTO_FLOW_LIMIT_DEFAULT));
//max-idle 决定dpcls流表超时时间
ofproto_set_max_idle(smap_get_int(&ovs_cfg->other_config, "max-idle",
OFPROTO_MAX_IDLE_DEFAULT));
//vlan-limit 决定vlan头个数,单vlan还是双vlan
ofproto_set_vlan_limit(smap_get_int(&ovs_cfg->other_config, "vlan-limit",
LEGACY_MAX_VLAN_HEADERS));
//给 n_handlers 和 n_revalidators 赋值,用来在udpif_set_threads中决定起几个thread
ofproto_set_threads(smap_get_int(&ovs_cfg->other_config, "n-handler-threads", 0),
smap_get_int(&ovs_cfg->other_config, "n-revalidator-threads", 0));
int threads = MAX(count_cpu_cores(), 2);
n_revalidators = MAX(n_revalidators_, 0);
n_handlers = MAX(n_handlers_, 0);
if (!n_revalidators) {
n_revalidators = n_handlers
? MAX(threads - (int) n_handlers, 1)
: threads / 4 + 1;
}
if (!n_handlers) {
n_handlers = MAX(threads - (int) n_revalidators, 1);
}
2.5.1
add_del_bridges(ovs_cfg);
//获取最新的 bridge 配置
/* Collect new bridges' names and types. */
shash_init(&new_br);
for (i = 0; i < cfg->n_bridges; i++) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
const struct ovsrec_bridge *br_cfg = cfg->bridges[i];
//过滤掉 bridge name 包含特殊字符的bridge
if (strchr(br_cfg->name, '/') || strchr(br_cfg->name, '\\')) {
/* Prevent remote ovsdb-server users from accessing arbitrary
* directories, e.g. consider a bridge named "../../../etc/".
*
* Prohibiting "\" is only necessary on Windows but it's no great
* loss elsewhere. */
VLOG_WARN_RL(&rl, "ignoring bridge with invalid name \"%s\"",
br_cfg->name);
//将符合规则的bridge,添加到 new_br
} else if (!shash_add_once(&new_br, br_cfg->name, br_cfg)) {
VLOG_WARN_RL(&rl, "bridge %s specified twice", br_cfg->name);
}
}
//如果 all_bridges 中的bridge在最新bridge配置中不存在了或者type改变了,则需要删除此bridge
/* Get rid of deleted bridges or those whose types have changed.
* Update 'cfg' of bridges that still exist. */
HMAP_FOR_EACH_SAFE (br, next, node, &all_bridges) {
br->cfg = shash_find_data(&new_br, br->name);
if (!br->cfg || strcmp(br->type, ofproto_normalize_type(br->cfg->datapath_type))) {
bridge_destroy(br, true);
}
}
//将新添加的bridge插入 all_bridges
/* Add new bridges. */
SHASH_FOR_EACH(node, &new_br) {
const struct ovsrec_bridge *br_cfg = node->data;
//到 all_bridges 查找是否已经存在bridge,如果不存在则创建
if (!bridge_lookup(br_cfg->name)) {
bridge_create(br_cfg);
struct bridge *br;
br = xzalloc(sizeof *br);
br->name = xstrdup(br_cfg->name);
br->type = xstrdup(ofproto_normalize_type(br_cfg->datapath_type));
//bridge 的配置,包含多少端口等信息
br->cfg = br_cfg;
memcpy(&br->default_ea, &br_cfg->header_.uuid, ETH_ADDR_LEN);
eth_addr_mark_random(&br->default_ea);
hmap_init(&br->ports);
hmap_init(&br->ifaces);
hmap_init(&br->iface_by_name);
hmap_init(&br->mirrors);
hmap_init(&br->mappings);
//新创建的bridge,要插入 all_bridges
hmap_insert(&all_bridges, &br->node, hash_string(br->name, 0));
}
2.5.2 将br上port插入 br->wanted_ports
HMAP_FOR_EACH (br, node, &all_bridges) {
bridge_collect_wanted_ports(br, &br->wanted_ports);
//将bridge上最新配置的port插入hash表 wanted_ports
shash_init(wanted_ports);
for (i = 0; i < br->cfg->n_ports; i++) {
const char *name = br->cfg->ports[i]->name;
if (!shash_add_once(wanted_ports, name, br->cfg->ports[i])) {
VLOG_WARN("bridge %s: %s specified twice as bridge port", br->name, name);
}
}
//自动添加一个和bridge名字相同的 internal port
if (bridge_get_controllers(br, NULL)
&& !shash_find(wanted_ports, br->name)) {
VLOG_WARN("bridge %s: no port named %s, synthesizing one",
br->name, br->name);
ovsrec_interface_init(&br->synth_local_iface);
ovsrec_port_init(&br->synth_local_port);
br->synth_local_port.interfaces = &br->synth_local_ifacep;
br->synth_local_port.n_interfaces = 1;
br->synth_local_port.name = br->name;
br->synth_local_iface.name = br->name;
br->synth_local_iface.type = "internal";
br->synth_local_ifacep = &br->synth_local_iface;
shash_add(wanted_ports, br->name, &br->synth_local_port);
}
bridge_del_ports(br, &br->wanted_ports);
struct shash_node *port_node;
struct port *port, *next;
//删除已经不在wanted_ports中的port
/* Get rid of deleted ports.
* Get rid of deleted interfaces on ports that still exist. */
HMAP_FOR_EACH_SAFE (port, next, hmap_node, &br->ports) {
port->cfg = shash_find_data(wanted_ports, port->name);
if (!port->cfg) {
port_destroy(port);
} else {
//删除 port 下的不需要的 interface
port_del_ifaces(port);
/* Collect list of new interfaces. */
sset_init(&new_ifaces);
for (i = 0; i < port->cfg->n_interfaces; i++) {
const char *name = port->cfg->interfaces[i]->name;
const char *type = port->cfg->interfaces[i]->type;
if (strcmp(type, "null")) {
sset_add(&new_ifaces, name);
}
}
/* Get rid of deleted interfaces. */
LIST_FOR_EACH_SAFE (iface, next, port_elem, &port->ifaces) {
if (!sset_contains(&new_ifaces, iface->name)) {
iface_destroy(iface);
}
}
sset_destroy(&new_ifaces);
}
}
//更新iface的 cfg 和 type
/* Update iface->cfg and iface->type in interfaces that still exist. */
SHASH_FOR_EACH (port_node, wanted_ports) {
const struct ovsrec_port *port_rec = port_node->data;
for (i = 0; i < port_rec->n_interfaces; i++) {
const struct ovsrec_interface *cfg = port_rec->interfaces[i];
struct iface *iface = iface_lookup(br, cfg->name);
const char *type = iface_get_type(cfg, br->cfg);
const char *dp_type = br->cfg->datapath_type;
const char *netdev_type = ofproto_port_open_type(dp_type, type);
if (iface) {
iface->cfg = cfg;
iface->type = type;
iface->netdev_type = netdev_type;
}
}
}
2.5.3 删除最新配置中不存在的或者type改变的 ofproto
bridge_delete_ofprotos();
//types 为 system,netdev
ofproto_enumerate_types(&types);
sset_clear(types);
//只有一个ofproto_classes,即 ofproto_dpif_class
for (i = 0; i < n_ofproto_classes; i++) {
ofproto_classes[i]->enumerate_types(types); //enumerate_types
dp_enumerate_types(types);
//获取所有 dpif_classes 的 type,目前有两个:system和netdev
SHASH_FOR_EACH(node, &dpif_classes) {
const struct registered_dpif_class *registered_class = node->data;
sset_add(types, registered_class->dpif_class->type);
}
}
SSET_FOR_EACH (type, &types) {
ofproto_enumerate_names(type, &names); //enumerate_names
//获取所有 ofproto 的名字
struct ofproto_dpif *ofproto;
sset_clear(names);
HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
if (strcmp(type, ofproto->up.type)) {
continue;
}
sset_add(names, ofproto->up.name);
}
SSET_FOR_EACH (name, &names) {
//遍历names到 all_bridges 寻找,如果找不到或者桥type改变,则需要删除此 ofproto
br = bridge_lookup(name);
if (!br || strcmp(type, br->type)) {
ofproto_delete(name, type);
}
}
}
2.5.4 删除不需要的 port
HMAP_FOR_EACH (br, node, &all_bridges) {
if (br->ofproto) {
/* Main task: Iterate over the ports in 'br->ofproto' and remove the ports
* that are not configured in the database. (This commonly happens when
* ports have been deleted, e.g. with "ovs-vsctl del-port".)
*
* Side tasks: Reconfigure the ports that are still in 'br'. Delete ports
* that have the wrong OpenFlow port number (and arrange to add them back
* with the correct OpenFlow port number). */
bridge_delete_or_reconfigure_ports(br);
2.5.5 为每个桥创建 struct ofproto
HMAP_FOR_EACH_SAFE (br, next, node, &all_bridges) {
if (!br->ofproto) {
ofproto_create(br->name, br->type, &br->ofproto); {
//datapath_type 为 system or netdev
datapath_type = ofproto_normalize_type(datapath_type);
//ofproto_class 为 ofproto_dpif_class
struct ofproto_class * class = ofproto_class_find__(datapath_type);
//分配内存
struct ofproto *ofproto;
ofproto = class->alloc(); //alloc(void)
struct ofproto_dpif *ofproto = xzalloc(sizeof *ofproto);
return &ofproto->up;
ofproto->ofproto_class = class;
ofproto->name = xstrdup(datapath_name);
ofproto->type = xstrdup(datapath_type);
//将 ofproto 插入全局静态变量 all_ofprotos
hmap_insert(&all_ofprotos, &ofproto->hmap_node, hash_string(ofproto->name, 0));
ofproto->ofproto_class->construct(ofproto); //construct(struct ofproto *ofproto_)
struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
ofproto_tunnel_init();
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
if (ovsthread_once_start(&once)) {
fat_rwlock_init(&rwlock);
ovsthread_once_done(&once);
}
open_dpif_backer(ofproto->up.type, &ofproto->backer);
struct dpif_backer *backer;
//首先根据type查找,如果为kernel space datapath,则type为system,
//如果userspace datapath,则type为netdev。
/* All datapaths of a given type share a single dpif backer instance. */
backer = shash_find_data(&all_dpif_backers, type);
//如果已经创建了,则引用计数加1即可
if (backer) {
backer->refcount++;
*backerp = backer;
return 0;
}
//datapath 名字,ovs-netdev 或者 ovs-system
backer_name = xasprintf("ovs-%s", type);
backer = xmalloc(sizeof *backer);
dpif_create_and_open(backer_name, type, &backer->dpif);
dpif_create(name, type, dpifp);
do_open(name, type, true, dpifp);
type = dpif_normalize_type(type);
registered_class = dp_class_lookup(type);
//dpif_netlink_open 或者 dpif_netdev_open
registered_class->dpif_class->open(registered_class->dpif_class, name, create, &dpif);
dpif_open(name, type, dpifp);
do_open(name, type, false, dpifp);
type = dpif_normalize_type(type);
registered_class = dp_class_lookup(type);
registered_class->dpif_class->open(registered_class->dpif_class, name, create, &dpif);
如果为 dpif_netlink_open,则和openvswitch kernel module交互,创建datapath
dp_request.cmd = OVS_DP_CMD_NEW;
upcall_pid = 0;
dp_request.upcall_pid = &upcall_pid;
dp_request.name = name;
dp_request.user_features |= OVS_DP_F_UNALIGNED;
dp_request.user_features |= OVS_DP_F_VPORT_PIDS;
dpif_netlink_dp_transact(&dp_request, &dp, &buf);
backer->udpif = udpif_create(backer, backer->dpif);
struct udpif *udpif = xzalloc(sizeof *udpif);
udpif->dpif = dpif;
udpif->backer = backer;
atomic_init(&udpif->flow_limit, MIN(ofproto_flow_limit, 10000));
udpif->reval_seq = seq_create();
udpif->dump_seq = seq_create();
latch_init(&udpif->exit_latch);
latch_init(&udpif->pause_latch);
ovs_list_push_back(&all_udpifs, &udpif->list_node);
atomic_init(&udpif->enable_ufid, false);
atomic_init(&udpif->n_flows, 0);
atomic_init(&udpif->n_flows_timestamp, LLONG_MIN);
ovs_mutex_init(&udpif->n_flows_mutex);
udpif->ukeys = xmalloc(N_UMAPS * sizeof *udpif->ukeys);
for (int i = 0; i < N_UMAPS; i++) {
cmap_init(&udpif->ukeys[i].cmap);
ovs_mutex_init(&udpif->ukeys[i].mutex);
}
//只有用户空间netdev数据通路才会提供 register_upcall_cb
dpif_register_upcall_cb(dpif, upcall_cb, udpif);
if (dpif->dpif_class->register_upcall_cb) {
dpif->dpif_class->register_upcall_cb(dpif, cb, aux);
}
dpif_register_dp_purge_cb(dpif, dp_purge_cb, udpif);
backer->type = xstrdup(type);
backer->refcount = 1;
hmap_init(&backer->odp_to_ofport_map);
ovs_rwlock_init(&backer->odp_to_ofport_lock);
backer->need_revalidate = 0;
simap_init(&backer->tnl_backers);
backer->recv_set_enable = !ofproto_get_flow_restore_wait();
*backerp = backer;
/* Loop through the ports already on the datapath and remove any
* that we don't need anymore. */
ovs_list_init(&garbage_list);
dpif_port_dump_start(&port_dump, backer->dpif);
while (dpif_port_dump_next(&port_dump, &port)) {
node = shash_find(&init_ofp_ports, port.name);
if (!node && strcmp(port.name, dpif_base_name(backer->dpif))) {
garbage = xmalloc(sizeof *garbage);
garbage->odp_port = port.port_no;
ovs_list_push_front(&garbage_list, &garbage->list_node);
}
}
dpif_port_dump_done(&port_dump);
LIST_FOR_EACH_POP (garbage, list_node, &garbage_list) {
dpif_port_del(backer->dpif, garbage->odp_port, false);
free(garbage);
}
//将 backer 添加到全局变量 all_dpif_backers
shash_add(&all_dpif_backers, type, backer);
dpif_recv_set(backer->dpif, backer->recv_set_enable);
//只有system数据通路会提供 dpif_netlink_recv_set
dpif->dpif_class->recv_set(dpif, enable);
if (backer->recv_set_enable) {
udpif_set_threads(backer->udpif, n_handlers, n_revalidators);
}
uuid_generate(&ofproto->uuid);
ofproto->ml = mac_learning_create(MAC_ENTRY_DEFAULT_IDLE_TIME);
//如果 iface 所属桥的名字是 ofproto 的名字,则将 iface 从 init_ofp_ports 删除
SHASH_FOR_EACH_SAFE (node, next, &init_ofp_ports) {
struct iface_hint *iface_hint = node->data;
if (!strcmp(iface_hint->br_name, ofproto->up.name)) {
/* Check if the datapath already has this port. */
if (dpif_port_exists(ofproto->backer->dpif, node->name)) {
sset_add(&ofproto->ports, node->name);
}
free(iface_hint->br_name);
free(iface_hint->br_type);
free(iface_hint);
shash_delete(&init_ofp_ports, node);
}
}
//将 ofproto 插入 hash 表 all_ofproto_dpifs
hmap_insert(&all_ofproto_dpifs, &ofproto->all_ofproto_dpifs_node,
hash_string(ofproto->up.name, 0));
memset(&ofproto->stats, 0, sizeof ofproto->stats);
//创建 255 个 oftable
enum { N_TABLES = 255 };
ofproto_init_tables(ofproto_, N_TABLES);
ofproto->n_tables = n_tables;
ofproto->tables = xmalloc(n_tables * sizeof *ofproto->tables);
OFPROTO_FOR_EACH_TABLE (table, ofproto) {
oftable_init(table);
table->max_flows = UINT_MAX;
table->n_flows = 0;
}
//添加默认流表
add_internal_flows(ofproto);
add_internal_miss_flow(ofproto, id++, &ofpacts, &ofproto->miss_rule);
add_internal_miss_flow(ofproto, id++, &ofpacts, &ofproto->no_packet_in_rule);
add_internal_miss_flow(ofproto, id++, &ofpacts, &ofproto->drop_frags_rule);
ofproto->datapath_id = pick_datapath_id(ofproto);
//创建 ofport,添加到 ofproto
init_ports(ofproto);//init_ports(struct ofproto *p) {
OFPROTO_PORT_FOR_EACH (&ofproto_port, &dump, p) {
const char *name = ofproto_port.name;
if (shash_find(&p->port_by_name, name)) {
VLOG_WARN_RL(&rl, "%s: ignoring duplicate device %s in datapath", p->name, name);
} else {
struct netdev *netdev;
/* Check if an OpenFlow port number had been requested. */
node = shash_find(&init_ofp_ports, name);
if (node) {
const struct iface_hint *iface_hint = node->data;
simap_put(&p->ofp_requests, name, ofp_to_u16(iface_hint->ofp_port));
}
netdev = ofport_open(p, &ofproto_port, &pp);
struct netdev *netdev;
netdev_open(ofproto_port->name, ofproto_port->type, &netdev);
struct netdev_registered_class *rc;
rc = netdev_lookup_class(type && type[0] ? type : "system");
if (rc && ovs_refcount_try_ref_rcu(&rc->refcnt)) {
netdev = rc->class->alloc();
if (netdev) {
memset(netdev, 0, sizeof *netdev);
netdev->netdev_class = rc->class;
netdev->auto_classified = type && type[0] ? false : true;
rc->class->construct(netdev);
}
}
return netdev;
if (netdev) {
ofport_install(p, netdev, &pp);
struct ofport *ofport;
ofport = p->ofproto_class->port_alloc();
struct ofport_dpif *port = xzalloc(sizeof *port);
return &port->up;
ofport->ofproto = p;
ofport->netdev = netdev;
ofport->change_seq = netdev_get_change_seq(netdev);
ofport->pp = *pp;
ofport->ofp_port = pp->port_no;
ofport->created = time_msec();
/* Add port to 'p'. */
hmap_insert(&p->ports, &ofport->hmap_node, hash_ofp_port(ofport->ofp_port));
shash_add(&p->port_by_name, netdev_name, ofport);
update_mtu(p, ofport);
p->ofproto_class->port_construct(ofport);
//发送添加端口消息
connmgr_send_port_status(p->connmgr, NULL, pp, OFPPR_ADD);
}
}
} //OFPROTO_PORT_FOR_EACH
} //init_ports
} //ofproto_create
} //if (!br->ofproto)
} //HMAP_FOR_EACH_SAFE
2.5.6
//将 wanted_ports 中的 port 添加到 bridge 的 ports,ifaces 和 iface_by_name 表中
//遍历 wanted_ports 所有port的所有iface,将iface(patch类型的不用下发)下发到datapath,
//同时每个iface还会生成ofport,插入 ofproto->ports 中
HMAP_FOR_EACH (br, node, &all_bridges) {
bridge_add_ports(br, &br->wanted_ports);
/* First add interfaces that request a particular port number. */
bridge_add_ports__(br, wanted_ports, true);
/* Then add interfaces that want automatic port number assignment.
* We add these afterward to avoid accidentally taking a specifically
* requested port number. */
//bridge_add_ports__(struct bridge *br, const struct shash *wanted_ports, bool with_requested_port)
bridge_add_ports__(br, wanted_ports, false);
struct shash_node *port_node;
SHASH_FOR_EACH (port_node, wanted_ports) {
const struct ovsrec_port *port_cfg = port_node->data;
for (i = 0; i < port_cfg->n_interfaces; i++) {
const struct ovsrec_interface *iface_cfg = port_cfg->interfaces[i];
requested_ofp_port = iface_get_requested_ofp_port(iface_cfg);
iface_validate_ofport__(cfg->n_ofport_request, cfg->ofport_request);
if ((requested_ofp_port != OFPP_NONE) == with_requested_port) {
//到 br->iface_by_name 查看是否已经存在
struct iface *iface = iface_lookup(br, iface_cfg->name);
//不存在则创建
if (!iface) {
iface_create(br, iface_cfg, port_cfg); {
struct netdev *netdev;
struct iface *iface;
ofp_port_t ofp_port;
struct port *port;
char *errp = NULL;
int error;
iface_do_create(br, iface_cfg, &ofp_port, &netdev, &errp); {
type = ofproto_port_open_type(br->cfg->datapath_type, iface_get_type(iface_cfg, br->cfg));
const struct ofproto_class *class;
datapath_type = ofproto_normalize_type(datapath_type);
class = ofproto_class_find__(datapath_type);
//port_open_type
class->port_open_type(datapath_type, port_type)
dpif_port_open_type(datapath_type, port_type);
rc = shash_find_data(&dpif_classes, datapath_type);
//对于netdev dp来说,dpif_netdev_port_open_type
//对于system dp来说,此函数为空,所以返回原始的 port_type
port_type = rc->dpif_class->port_open_type(rc->dpif_class, port_type);
//如果配置的类型为 tap或者system,则返回即可。
//但是配置的类型为 internal 的话,会自动转换成 tap。
return strcmp(type, "internal") ? type : dpif_netdev_class_is_dummy(class) ? "dummy-internal" : "tap";
netdev_open(iface_cfg->name, type, &netdev);
//注册所有的 netdev_class
netdev_initialize();
if (ovsthread_once_start(&once)) {
netdev_vport_patch_register();
netdev_vport_tunnel_register();
netdev_register_provider(&netdev_linux_class);
netdev_register_provider(&netdev_internal_class);
netdev_register_provider(&netdev_tap_class);
new_class->init()
struct netdev_registered_class *rc;
rc = xmalloc(sizeof *rc);
cmap_insert(&netdev_classes, &rc->cmap_node, hash_string(new_class->type, 0));
rc->class = new_class;
}
//type 为 system,internal,tap等
struct netdev_registered_class *rc;
rc = netdev_lookup_class(type && type[0] ? type : "system");
//如果type为system,则 alloc 为 netdev_linux_alloc
netdev = rc->class->alloc();
struct netdev_linux *netdev = xzalloc(sizeof *netdev);
return &netdev->up;
memset(netdev, 0, sizeof *netdev);
netdev->netdev_class = rc->class;
netdev->auto_classified = type && type[0] ? false : true;
netdev->name = xstrdup(name);
//如果type为system,则 construct 为 netdev_linux_construct
rc->class->construct(netdev);
iface_set_netdev_config(iface_cfg, netdev, errp);
netdev_set_config(netdev, &iface_cfg->options, errp);
netdev->netdev_class->set_config(netdev, args ? args : &no_args, &verbose_error);
iface_set_netdev_mtu(iface_cfg, netdev);
netdev_set_mtu(netdev, *iface_cfg->mtu_request);
class->set_mtu(netdev, mtu)
//获取 portid
*ofp_portp = iface_pick_ofport(iface_cfg);
ofproto_port_add(br->ofproto, netdev, ofp_portp); {
//调用 ofproto_dpif_class 中的 port_add
//port_add(struct ofproto *ofproto_, struct netdev *netdev)
ofproto->ofproto_class->port_add(ofproto, netdev); {
struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
//如果是 patch 类型的port直接返回
//patch 端口不需要加到 datapath
if (netdev_vport_is_patch(netdev)) {
sset_add(&ofproto->ghost_ports, netdev_get_name(netdev));
return 0;
}
//获取端口名字
dp_port_name = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
//如果datapath还不存在此端口,则将端口加入到datapath,包括tunnel端口
if (!dpif_port_exists(ofproto->backer->dpif, dp_port_name)) {
//将端口加入 datapath
dpif_port_add(ofproto->backer->dpif, netdev, &port_no);
//dpif_netlink_port_add 或者 dpif_netdev_port_add
dpif->dpif_class->port_add(dpif, netdev, &port_no);
//如果为 dpif_netlink_port_add
struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
int error = EOPNOTSUPP;
if (!ovs_tunnels_out_of_tree) {
//openvswitch.ko 为 kernel 源码提供的,则调用此函数
error = dpif_netlink_rtnl_port_create_and_add(dpif, netdev, port_nop);
//创建vport,比如vxlan,会使用到kernel提供的vxlan模块
dpif_netlink_rtnl_port_create(netdev);
type = netdev_to_ovs_vport_type(netdev_get_type(netdev));
tnl_cfg = netdev_get_tunnel_config(netdev);
if (!tnl_cfg) {
return EOPNOTSUPP;
}
kind = vport_type_to_kind(type, tnl_cfg);
if (!kind) {
return EOPNOTSUPP;
}
name = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL;
err = dpif_netlink_rtnl_create(tnl_cfg, name, type, kind, flags);
/* tunnel unique info */
switch (type) {
case OVS_VPORT_TYPE_VXLAN:
nl_msg_put_u8(&request, IFLA_VXLAN_LEARNING, 0);
nl_msg_put_u8(&request, IFLA_VXLAN_COLLECT_METADATA, 1);
nl_msg_put_u8(&request, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP)) {
nl_msg_put_flag(&request, IFLA_VXLAN_GBP);
}
if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GPE)) {
nl_msg_put_flag(&request, IFLA_VXLAN_GPE);
}
nl_msg_put_be16(&request, IFLA_VXLAN_PORT, tnl_cfg->dst_port);
break;
...
}
nl_transact(NETLINK_ROUTE, &request, NULL);
name = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
//因为上面已经创建vxlan端口,此处使用OVS_VPORT_TYPE_NETDEV即可,
//表示不用openvswitch.ko创建vxlan端口
dpif_netlink_port_add__(dpif, name, OVS_VPORT_TYPE_NETDEV, NULL, port_nop);
//和kernel module openvswitch.ko 交互,创建vport
dpif_netlink_port_add__(dpif, name, ovs_type, NULL, port_nop);
dpif_netlink_vport_init(&request);
request.cmd = OVS_VPORT_CMD_NEW;
request.dp_ifindex = dpif->dp_ifindex;
request.type = type;
request.name = name;
dpif_netlink_vport_transact(&request, &reply, &buf);
} //ovs_tunnels_out_of_tree
if (error) {
//openvswitch.ko 为 ovs 源码提供的,则调用此函数
//或者创建非tunnel端口,也调用此函数
dpif_netlink_port_add_compat(dpif, netdev, port_nop);
name = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
ovs_type = netdev_to_ovs_vport_type(netdev_get_type(netdev));
tnl_cfg = netdev_get_tunnel_config(netdev);
if (tnl_cfg && (tnl_cfg->dst_port != 0 || tnl_cfg->exts)) {
if (tnl_cfg->dst_port) {
nl_msg_put_u16(&options, OVS_TUNNEL_ATTR_DST_PORT,
ntohs(tnl_cfg->dst_port));
}
return dpif_netlink_port_add__(dpif, name, ovs_type, &options, port_nop);
}else {
return dpif_netlink_port_add__(dpif, name, ovs_type, NULL, port_nop);
}
}
}
if (netdev_get_tunnel_config(netdev)) {
sset_add(&ofproto->ghost_ports, devname);
} else {
sset_add(&ofproto->ports, devname);
}
} //port_add
const char *netdev_name = netdev_get_name(netdev);
simap_put(&ofproto->ofp_requests, netdev_name, ofp_to_u16(ofp_port));
//创建/更新 ofport
update_port(ofproto, netdev_name);
struct ofport *port;
//先到 ofproto->ports 根据 ofp_port 查找是否已经存在
port = ofproto_get_port(ofproto, ofproto_port.ofp_port);
//如果已经存在,则更新配置
if (port && !strcmp(netdev_get_name(port->netdev), name)) {
update_mtu(ofproto, port);
}
else {
//如果不存在,则分配 ofport,并插入 ofproto->ports
ofport_install(ofproto, netdev, &pp);
const char *netdev_name = netdev_get_name(netdev);
struct ofport *ofport;
/* Create ofport. */
ofport = p->ofproto_class->port_alloc();
ofport->ofproto = p;
ofport->netdev = netdev;
ofport->change_seq = netdev_get_change_seq(netdev);
ofport->pp = *pp;
ofport->ofp_port = pp->port_no;
ofport->created = time_msec();
/* Add port to 'p'. */
hmap_insert(&p->ports, &ofport->hmap_node,
hash_ofp_port(ofport->ofp_port));
shash_add(&p->port_by_name, netdev_name, ofport);
update_mtu(p, ofport);
/* Let the ofproto_class initialize its private data. */
p->ofproto_class->port_construct(ofport);
}
} //ofproto_port_add
} //iface_do_create
/* Get or create the port structure. */
//到 br->ports 查找是否已经存在port
struct port *port;
port = port_lookup(br, port_cfg->name);
if (!port) {
port = port_create(br, port_cfg);
struct port *port;
port = xzalloc(sizeof *port);
port->bridge = br;
port->name = xstrdup(cfg->name);
port->cfg = cfg;
ovs_list_init(&port->ifaces);
hmap_insert(&br->ports, &port->hmap_node, hash_string(port->name, 0));
}
/* Create the iface structure. */
iface = xzalloc(sizeof *iface);
//将 iface 插入port->ifaces链表
ovs_list_push_back(&port->ifaces, &iface->port_elem);
//将 iface 按名字hash后,插入 br->iface_by_name hashmap 表
hmap_insert(&br->iface_by_name, &iface->name_node, hash_string(iface_cfg->name, 0));
iface->port = port;
iface->name = xstrdup(iface_cfg->name);
iface->ofp_port = ofp_port;
iface->netdev = netdev;
iface->type = iface_get_type(iface_cfg, br->cfg);
iface->netdev_type = ofproto_port_open_type(br->cfg->datapath_type, iface->type);
iface->cfg = iface_cfg;
//将 iface 按port号hash后,插入 br->ifaces 表
hmap_insert(&br->ifaces, &iface->ofp_port_node, hash_ofp_port(ofp_port));
} //iface_create
} //if (!iface)
} //if
} //for
}
}
2.5.7
HMAP_FOR_EACH (br, node, &all_bridges) {
bridge_configure_datapath_id(br);
HMAP_FOR_EACH (port, hmap_node, &br->ports) {
port_configure(port);
const struct ovsrec_port *cfg = port->cfg;
struct ofproto_bundle_settings s;
s.name = port->name;
s.slaves = xmalloc(ovs_list_size(&port->ifaces) * sizeof *s.slaves);
LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
s.slaves[s.n_slaves++] = iface->ofp_port;
}
/* Get VLAN tag. */
s.vlan = -1;
if (cfg->tag && *cfg->tag >= 0 && *cfg->tag <= 4095) {
s.vlan = *cfg->tag;
}
/* Get VLAN trunks. */
s.trunks = NULL;
if (cfg->n_trunks) {
s.trunks = vlan_bitmap_from_array(cfg->trunks, cfg->n_trunks);
}
s.cvlans = NULL;
if (cfg->n_cvlans) {
s.cvlans = vlan_bitmap_from_array(cfg->cvlans, cfg->n_cvlans);
}
/* Get VLAN mode. */
if (cfg->vlan_mode) {
if (!strcmp(cfg->vlan_mode, "access")) {
s.vlan_mode = PORT_VLAN_ACCESS;
} else if (!strcmp(cfg->vlan_mode, "trunk")) {
s.vlan_mode = PORT_VLAN_TRUNK;
} else if (!strcmp(cfg->vlan_mode, "native-tagged")) {
s.vlan_mode = PORT_VLAN_NATIVE_TAGGED;
} else if (!strcmp(cfg->vlan_mode, "native-untagged")) {
s.vlan_mode = PORT_VLAN_NATIVE_UNTAGGED;
} else if (!strcmp(cfg->vlan_mode, "dot1q-tunnel")) {
s.vlan_mode = PORT_VLAN_DOT1Q_TUNNEL;
} else {
/* This "can't happen" because ovsdb-server should prevent it. */
VLOG_WARN("port %s: unknown VLAN mode %s, falling "
"back to trunk mode", port->name, cfg->vlan_mode);
s.vlan_mode = PORT_VLAN_TRUNK;
}
} else {
if (s.vlan >= 0) {
s.vlan_mode = PORT_VLAN_ACCESS;
if (cfg->n_trunks || cfg->n_cvlans) {
VLOG_WARN("port %s: ignoring trunks in favor of implicit vlan",
port->name);
}
} else {
s.vlan_mode = PORT_VLAN_TRUNK;
}
}
s.lacp = port_configure_lacp(port, &lacp_settings);
port_configure_bond(port, &bond_settings);
ofproto_bundle_register(port->bridge->ofproto, port, &s);
//bundle_set
ofproto->ofproto_class->bundle_set(ofproto, aux, s)
struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
struct ofbundle *bundle;
bundle = xmalloc(sizeof *bundle);
bundle->ofproto = ofproto;
//将bundle插入hash表 ofproto->bundles
hmap_insert(&ofproto->bundles, &bundle->hmap_node, hash_pointer(aux, 0));
bundle->aux = aux;
bundle->name = NULL;
ovs_list_init(&bundle->ports);
bundle->vlan_mode = PORT_VLAN_TRUNK;
bundle->qinq_ethtype = ETH_TYPE_VLAN_8021AD;
bundle->vlan = -1;
bundle->trunks = NULL;
bundle->cvlans = NULL;
bundle->use_priority_tags = s->use_priority_tags;
bundle->lacp = NULL;
bundle->bond = NULL;
bundle->floodable = true;
bundle->protected = false;
LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
iface_set_ofport(iface->cfg, iface->ofp_port);
/* Clear eventual previous errors */
ovsrec_interface_set_error(iface->cfg, NULL);
iface_configure_cfm(iface);
iface_configure_qos(iface, port->cfg->qos);
iface_set_mac(br, port, iface);
ofproto_port_set_bfd(br->ofproto, iface->ofp_port, &iface->cfg->bfd);
ofproto_port_set_lldp(br->ofproto, iface->ofp_port, &iface->cfg->lldp);
ofproto_port_set_config(br->ofproto, iface->ofp_port, &iface->cfg->other_config);
}
}
bridge_configure_mirrors(br);
bridge_configure_forward_bpdu(br);
bridge_configure_mac_table(br);
bridge_configure_mcast_snooping(br);
bridge_configure_remotes(br, managers, n_managers);
struct ovsrec_controller **controllers;
size_t n_controllers;
struct ofproto_controller *ocs;
//获取配置的 controller
n_controllers = bridge_get_controllers(br, &controllers);
struct ovsrec_controller **controllers;
size_t n_controllers;
controllers = br->cfg->controller;
n_controllers = br->cfg->n_controller;
ocs = xmalloc((n_controllers + 1) * sizeof *ocs);
n_ocs = 0;
//默认为每个网桥添加controller "/usr/local/var/run/openvswitch/br1.mgmt"
bridge_ofproto_controller_for_mgmt(br, &ocs[n_ocs++]);
oc->target = xasprintf("punix:%s/%s.mgmt", ovs_rundir(), br->name);
oc->max_backoff = 0;
oc->probe_interval = 60;
oc->band = OFPROTO_OUT_OF_BAND;
for (i = 0; i < n_controllers; i++) {
struct ovsrec_controller *c = controllers[i];
bridge_ofproto_controller_from_ovsrec(c, &ocs[n_ocs]);
oc->target = c->target;
oc->max_backoff = c->max_backoff ? *c->max_backoff / 1000 : 8;
oc->probe_interval = c->inactivity_probe ? *c->inactivity_probe / 1000 : 5;
oc->band = (!c->connection_mode || !strcmp(c->connection_mode, "in-band")
? OFPROTO_IN_BAND : OFPROTO_OUT_OF_BAND);
n_ocs++;
}
ofproto_set_controllers(br->ofproto, ocs, n_ocs, bridge_get_allowed_versions(br));
connmgr_set_controllers(p->connmgr, controllers, n_controllers, allowed_versions);
shash_init(&new_controllers);
for (i = 0; i < n_controllers; i++) {
const struct ofproto_controller *c = &controllers[i];
//如果是 active 连接,即主动连接。则调用 add_controller
if (!vconn_verify_name(c->target)) {
add_controller(mgr, c->target, c->dscp, allowed_versions);
char *name = ofconn_make_name(mgr, target);
struct ofconn *ofconn;
ofconn = ofconn_create(mgr, rconn_create(5, 8, dscp, allowed_versions), OFCONN_PRIMARY, true);
struct ofconn *ofconn;
ofconn = xzalloc(sizeof *ofconn);
ofconn->connmgr = mgr;
//将 ofconn 插入 mgr->all_conns 链表,在 connmgr_run 中统一处理 mgr->all_conns 链表上的连接
ovs_list_push_back(&mgr->all_conns, &ofconn->node);
ofconn->rconn = rconn;
ofconn->type = type;
ofconn->enable_async_msgs = enable_async_msgs;
hmap_init(&ofconn->monitors);
ovs_list_init(&ofconn->updates);
hmap_init(&ofconn->bundles);
ofconn->next_bundle_expiry_check = time_msec() + BUNDLE_EXPIRY_INTERVAL;
ofconn_flush(ofconn);
rconn_connect(ofconn->rconn, target, name);
hmap_insert(&mgr->controllers, &ofconn->hmap_node, hash_string(target, 0));
} else if (!pvconn_verify_name(c->target)) {
//如果是被动连接,则调用 ofservice_create
ofservice_create(mgr, c->target, allowed_versions, c->dscp);
struct ofservice *ofservice;
struct pvconn *pvconn;
pvconn_open(target, allowed_versions, dscp, &pvconn);
ofservice = xzalloc(sizeof *ofservice);
hmap_insert(&mgr->services, &ofservice->node, hash_string(target, 0));
ofservice->pvconn = pvconn;
ofservice->allowed_versions = allowed_versions;
}
shash_add_once(&new_controllers, c->target, &controllers[i]);
}
/* Delete controllers that are no longer configured.
* Update configuration of all now-existing controllers. */
HMAP_FOR_EACH_SAFE (ofconn, next_ofconn, hmap_node, &mgr->controllers) {
...
}
/* Delete services that are no longer configured.
* Update configuration of all now-existing services. */
HMAP_FOR_EACH_SAFE (ofservice, next_ofservice, node, &mgr->services) {
...
}
/* Configure OpenFlow controller connection snooping. */
//默认为每个网桥添加 snoop "/usr/local/var/run/openvswitch/br1.snoop"
//用于将和controller通信的消息复制一份
if (!ofproto_has_snoops(br->ofproto)) {
struct sset snoops;
sset_init(&snoops);
sset_add_and_free(&snoops, xasprintf("punix:%s/%s.snoop", ovs_rundir(), br->name));
ofproto_set_snoops(br->ofproto, &snoops);
/* Sets the "snoops" for 'mgr' to the pvconn targets listed in 'snoops'.
*
* A "snoop" is a pvconn to which every OpenFlow message to or from the most
* important controller on 'mgr' is mirrored. */
connmgr_set_snoops(ofproto->connmgr, snoops);
set_pvconns(&mgr->snoops, &mgr->n_snoops, snoops);
pvconns = xmalloc(sset_count(sset) * sizeof *pvconns);
n_pvconns = 0;
SSET_FOR_EACH (name, sset) {
struct pvconn *pvconn;
pvconn_open(name, 0, 0, &pvconn);
pvconns[n_pvconns++] = pvconn;
}
*pvconnsp = pvconns;
*n_pvconnsp = n_pvconns;
}
bridge_configure_netflow(br);
bridge_configure_sflow(br, &sflow_bridge_number);
bridge_configure_ipfix(br);
bridge_configure_spanning_tree(br);
bridge_configure_tables(br);
bridge_configure_dp_desc(br);
bridge_configure_aa(br);
}
bridge_run__();
} //bridge_reconfigure
run_stats_update();
run_status_update();
run_system_stats();
} //bridge_run
3. netdev_run();
struct netdev_registered_class *rc;
CMAP_FOR_EACH (rc, cmap_node, &netdev_classes)
//对于 system 类型,run 为 netdev_linux_run,dpdk类型的run函数为空
//主要处理 link,mtu 改变等事件
rc->class->run(rc->class);
struct nl_sock *sock;
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
* RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
* if no such socket could be created. */
sock = netdev_linux_notify_sock();
static struct nl_sock *sock;
unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
if (ovsthread_once_start(&once)) {
nl_sock_create(NETLINK_ROUTE, &sock);
for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
nl_sock_join_mcgroup(sock, mcgroups[i]);
setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &multicast_group, sizeof multicast_group)
}
ovsthread_once_done(&once);
}
do {
error = nl_sock_recv(sock, &buf, false);
if (!error) {
struct rtnetlink_change change;
if (rtnetlink_parse(&buf, &change)) {
if (!change.ifname) {
change.ifname = if_indextoname(change.if_index, dev_name);
}
if (change.ifname) {
netdev_ = netdev_from_name(change.ifname);
}
//is_netdev_linux_class: return netdev_class->run == netdev_linux_run;
if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
netdev_linux_update(netdev, &change);
ovs_mutex_unlock(&netdev->mutex);
}
netdev_close(netdev_);
}
}
} while (!error)
}
也可参考:ovs-vswitchd源码分析 - 简书 (jianshu.com)