3. The bridge and ofproto modules are the core of vswitchd. At startup it initializes the bridges (driven by the OVSDB configuration); the main loop then runs, in turn, the worker RPC service, the bridges, and the network-device related calls.
----------vswitchd/ovs-vswitchd.c
bridge_init(remote);
free(remote);

exiting = false;
while (!exiting) {
    worker_run();
    bridge_run_fast();
    bridge_run();
    bridge_run_fast();
    unixctl_server_run(unixctl);
    netdev_run();
    ............
---------lib/worker.c
3.1 If this process has started a worker (client_sock >= 0) and nothing has broken, worker_run() receives the worker's replies and invokes the registered reply_cb for each one. The worker is effectively a thread substitute: a helper subprocess standing by to service whatever requests the main process sends it.
void worker_run(void)
{
    if (worker_is_running()) {
        int error;

        error = rxbuf_run(&client_rx, client_sock, sizeof(struct worker_reply));
        if (!error) {
            struct worker_reply *reply = client_rx.header.data;
            reply->reply_cb(&client_rx.payload, client_rx.fds,
                            client_rx.n_fds, reply->reply_aux);
            rxbuf_clear(&client_rx);
        } else if (error != EAGAIN) {
            worker_broke();
            VLOG_ABORT("receive from worker failed (%s)",
                       ovs_retval_to_string(error));
        }
    }
}
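For context, here is a minimal sketch of how the main process pairs a request with that reply_cb, assuming the worker_request()/worker_reply() API declared in lib/worker.h of this OVS version (signatures reproduced from memory, and my_request_cb/my_reply_cb/queue_work are hypothetical names):
/* Hypothetical sketch of the lib/worker.h request/reply pairing that
 * worker_run() services; not code from the OVS tree. */
static void
my_reply_cb(struct ofpbuf *reply, const int fds[], size_t n_fds, void *aux)
{
    /* Runs in the main process, from worker_run(), once the worker answers. */
}

static void
my_request_cb(struct ofpbuf *request, const int fds[], size_t n_fds)
{
    /* Runs in the worker process: do the blocking work here, then answer. */
    worker_reply(NULL, 0, NULL, 0);
}

static void
queue_work(void)
{
    /* Queue a request; a later worker_run() in the main loop will invoke
     * my_reply_cb() with the worker's answer. */
    worker_request(NULL, 0, NULL, 0, my_request_cb, my_reply_cb, NULL);
}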
3.2 bridge_run_fast() performs periodic activity that wants low latency: it iterates over all bridges and lets each one do its OpenFlow switch work (through bridge->ofproto). Calling it more than once per poll loop is worthwhile because the ofprotos are backed by the ofproto-dpif implementation, which benefits from the extra runs; that is why the main loop above invokes it both before and after bridge_run().
bridge_run() maintains system state by talking to OVSDB (reconfiguration, statistics refresh, and so on); its core per-bridge logic is again delegated to ofproto.
void bridge_run_fast(void)
{
    struct bridge *br;

    HMAP_FOR_EACH (br, node, &all_bridges) {
        ofproto_run_fast(br->ofproto);
    }
}
void bridge_run(void)
{
    static const struct ovsrec_open_vswitch null_cfg;
    const struct ovsrec_open_vswitch *cfg;
    struct ovsdb_idl_txn *reconf_txn = NULL;
    bool vlan_splinters_changed;
    struct bridge *br;

    ovsrec_open_vswitch_init((struct ovsrec_open_vswitch *) &null_cfg);

    /* (Re)configure if necessary. */
    if (!reconfiguring) {
        ovsdb_idl_run(idl);
        if (ovsdb_idl_is_lock_contended(idl)) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            struct bridge *br, *next_br;

            VLOG_ERR_RL(&rl, "another ovs-vswitchd process is running, "
                        "disabling this process until it goes away");
            HMAP_FOR_EACH_SAFE (br, next_br, node, &all_bridges) {
                bridge_destroy(br);
            }
            return;
        } else if (!ovsdb_idl_has_lock(idl)) {
            return;
        }
    }
    cfg = ovsrec_open_vswitch_first(idl);

    /* Let each bridge do the work that it needs to do. */
    HMAP_FOR_EACH (br, node, &all_bridges) {
        ofproto_run(br->ofproto);
    }

    /* Re-configure SSL.  We do this on every trip through the main loop,
     * instead of just when the database changes, because the contents of the
     * key and certificate files can change without the database changing.
     * We do this before bridge_reconfigure() because that function might
     * initiate SSL connections and thus requires SSL to be configured. */
    if (cfg && cfg->ssl) {
        const struct ovsrec_ssl *ssl = cfg->ssl;

        stream_ssl_set_key_and_cert(ssl->private_key, ssl->certificate);
        stream_ssl_set_ca_cert_file(ssl->ca_cert, ssl->bootstrap_ca_cert);
    }

    if (!reconfiguring) {
        /* If VLAN splinters are in use, then we need to reconfigure if VLAN
         * usage has changed. */
        vlan_splinters_changed = false;
        if (vlan_splinters_enabled_anywhere) {
            HMAP_FOR_EACH (br, node, &all_bridges) {
                if (ofproto_has_vlan_usage_changed(br->ofproto)) {
                    vlan_splinters_changed = true;
                    break;
                }
            }
        }
        if (ovsdb_idl_get_seqno(idl) != idl_seqno || vlan_splinters_changed) {
            idl_seqno = ovsdb_idl_get_seqno(idl);
            if (cfg) {
                reconf_txn = ovsdb_idl_txn_create(idl);
                bridge_reconfigure(cfg);
            } else {
                /* We still need to reconfigure to avoid dangling pointers to
                 * now-destroyed ovsrec structures inside bridge data. */
                bridge_reconfigure(&null_cfg);
            }
        }
    }
    if (reconfiguring) {
        if (cfg) {
            if (!reconf_txn) {
                reconf_txn = ovsdb_idl_txn_create(idl);
            }
            if (bridge_reconfigure_continue(cfg)) {
                ovsrec_open_vswitch_set_cur_cfg(cfg, cfg->next_cfg);
            }
        } else {
            bridge_reconfigure_continue(&null_cfg);
        }
    }
    if (reconf_txn) {
        ovsdb_idl_txn_commit(reconf_txn);
        ovsdb_idl_txn_destroy(reconf_txn);
        reconf_txn = NULL;
    }

    /* Refresh interface and mirror stats if necessary. */
    if (time_msec() >= iface_stats_timer) {
        if (cfg) {
            struct ovsdb_idl_txn *txn;

            txn = ovsdb_idl_txn_create(idl);
            HMAP_FOR_EACH (br, node, &all_bridges) {
                struct port *port;
                struct mirror *m;

                HMAP_FOR_EACH (port, hmap_node, &br->ports) {
                    struct iface *iface;

                    LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
                        iface_refresh_stats(iface);
                        iface_refresh_status(iface);
                    }
                }
                HMAP_FOR_EACH (m, hmap_node, &br->mirrors) {
                    mirror_refresh_stats(m);
                }
            }
            refresh_controller_status();
            ovsdb_idl_txn_commit(txn);
            ovsdb_idl_txn_destroy(txn); /* XXX */
        }
        iface_stats_timer = time_msec() + IFACE_STATS_INTERVAL;
    }
    run_system_stats();
    refresh_instant_stats();
}
3.3 The bridge and ofproto structure definitions (vswitchd/bridge.c, ofproto/ofproto-provider.h).
struct bridge {
    struct hmap_node node;        /* In the 'all_bridges' hmap that manages all bridges. */
    char *name;                   /* User-specified arbitrary name. */
    char *type;                   /* Datapath type. */
    uint8_t ea[ETH_ADDR_LEN];     /* Bridge Ethernet Address. */
    uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */
    const struct ovsrec_bridge *cfg;

    /* OpenFlow switch processing. */
    struct ofproto *ofproto;      /* OpenFlow switch. */

    /* Bridge ports. */
    struct hmap ports;            /* "struct port"s indexed by name. */
    struct hmap ifaces;           /* "struct iface"s indexed by ofp_port. */
    struct hmap iface_by_name;    /* "struct iface"s indexed by name. */
    struct list ofpp_garbage;     /* "struct ofpp_garbage" slated for removal. */
    struct hmap if_cfg_todo;      /* "struct if_cfg"s slated for creation.
                                   * Indexed on 'cfg->name'. */

    /* Port mirroring. */
    struct hmap mirrors;          /* "struct mirror" indexed by UUID. */
};
--------
An ofproto represents an OpenFlow switch. An ofproto implementation should normally only query these fields, never modify them.
struct ofproto {
    struct hmap_node hmap_node;  /* In a global hmap of all OpenFlow switches. */
    const struct ofproto_class *ofproto_class; /* The concrete OpenFlow implementation. */
    char *type;                  /* Datapath type. */
    char *name;                  /* Datapath name. */

    /* Settings. */
    uint64_t fallback_dpid;      /* Datapath ID if no better choice found. */
    uint64_t datapath_id;        /* Datapath ID. */
    unsigned flow_eviction_threshold; /* Threshold at which to begin flow
                                       * table eviction.  Only affects the
                                       * ofproto-dpif implementation. */
    bool forward_bpdu;           /* Option to allow forwarding of BPDU frames
                                  * when NORMAL action is invoked. */
    char *mfr_desc;              /* Manufacturer. */
    char *hw_desc;               /* Hardware. */
    char *sw_desc;               /* Software version. */
    char *serial_desc;           /* Serial number. */
    char *dp_desc;               /* Datapath description. */
    enum ofp_config_flags frag_handling; /* One of OFPC_*. */

    /* Datapath. */
    struct hmap ports;           /* Contains "struct ofport"s. */
    struct shash port_by_name;
    uint16_t max_ports;          /* Max possible OpenFlow port num, plus one. */

    /* Flow tables. */
    struct oftable *tables;
    int n_tables;

    /* OpenFlow connections. */
    struct connmgr *connmgr;

    /* Flow table operation tracking. */
    int state;                   /* Internal state. */
    struct list pending;         /* List of "struct ofopgroup"s. */
    unsigned int n_pending;      /* list_size(&pending). */
    struct hmap deletions;       /* All OFOPERATION_DELETE "ofoperation"s. */

    /* Flow table operation logging. */
    int n_add, n_delete, n_modify; /* Number of unreported ops of each kind. */
    long long int first_op, last_op; /* Range of times for unreported ops. */
    long long int next_op_report;    /* Time to report ops, or LLONG_MAX. */
    long long int op_backoff;        /* Earliest time to report ops again. */
};
3.4 bridge_run_fast() accomplishes its core work through ofproto_run_fast(), which calls the run_fast() function of the concrete OpenFlow implementation (p->ofproto_class->run_fast(p)).
ofproto.c: ofproto_run_fast() --> ofproto-dpif.c: run_fast(). The latter first iterates over all ports, calling port_run_fast() on each, then checks for upcalls (packets the kernel datapath could not handle and passed up to user space) and processes them in handle_upcalls().
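On the ofproto.c side, the dispatch is essentially a one-liner through the class vtable; an abbreviated sketch (from ofproto/ofproto.c of this era, with the rate-limit variable rl defined elsewhere in that file; details may differ slightly by version):
int ofproto_run_fast(struct ofproto *p)
{
    int error;

    /* Dispatch to the concrete implementation's run_fast, if it has one. */
    error = p->ofproto_class->run_fast ? p->ofproto_class->run_fast(p) : 0;
    if (error && error != EAGAIN) {
        VLOG_ERR_RL(&rl, "%s: fastpath run failed (%s)",
                    p->name, strerror(error));
    }
    return error;
}
The dpif implementation's run_fast() then looks like this: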
static int run_fast(struct ofproto *ofproto_)
{
    struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
    /* ofproto_dpif_cast() uses CONTAINER_OF to recover the address of the
     * enclosing ofproto_dpif from 'ofproto_', which is its embedded 'up'
     * member; nothing new is constructed or copied. */
    struct ofport_dpif *ofport;
    unsigned int work;

    HMAP_FOR_EACH (ofport, up.hmap_node, &ofproto->up.ports) {
        port_run_fast(ofport);
    }

    /* Process upcalls in batches to amortize the per-call overhead. */
    work = 0;
    while (work < FLOW_MISS_MAX_BATCH) {
        int retval = handle_upcalls(ofproto, FLOW_MISS_MAX_BATCH - work);
        if (retval <= 0) {
            return -retval;
        }
        work += retval;
    }
    return 0;
}
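For reference, ofproto_dpif_cast() is the usual CONTAINER_OF recovery (a sketch of the ofproto-dpif.c helper, lightly abridged):
static struct ofproto_dpif *ofproto_dpif_cast(const struct ofproto *ofproto)
{
    assert(ofproto->ofproto_class == &ofproto_dpif_class);
    /* Recover the address of the enclosing ofproto_dpif from the address of
     * its embedded 'up' member; nothing is allocated or copied. */
    return CONTAINER_OF(ofproto, struct ofproto_dpif, up);
}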
struct ofproto_dpif {
    struct hmap_node all_ofproto_dpifs_node; /* In 'all_ofproto_dpifs'. */
    struct ofproto up;
    struct dpif *dpif;

    /* Special OpenFlow rules. */
    struct rule_dpif *miss_rule;         /* Sends flow table misses to controller. */
    struct rule_dpif *no_packet_in_rule; /* Drops flow table misses. */

    /* Statistics. */
    uint64_t n_matches;

    /* Bridging. */
    struct netflow *netflow;
    struct dpif_sflow *sflow;
    struct hmap bundles;          /* Contains "struct ofbundle"s. */
    struct mac_learning *ml;
    struct ofmirror *mirrors[MAX_MIRRORS];
    bool has_mirrors;
    bool has_bonded_bundles;

    /* Expiration. */
    struct timer next_expiration;

    /* Facets. */
    struct hmap facets;
    struct hmap subfacets;
    struct governor *governor;

    /* Revalidation. */
    struct table_dpif tables[N_TABLES];
    enum revalidate_reason need_revalidate;
    struct tag_set revalidate_set;

    /* Support for debugging async flow mods. */
    struct list completions;

    bool has_bundle_action;       /* True when the first bundle action appears. */
    struct netdev_stats stats;    /* To account packets generated and consumed
                                   * in userspace. */

    /* Spanning tree. */
    struct stp *stp;
    long long int stp_last_tick;

    /* VLAN splinters. */
    struct hmap realdev_vid_map;  /* (realdev, vid) -> vlandev. */
    struct hmap vlandev_map;      /* vlandev -> (realdev, vid). */
};
3.5 port_run_fast() checks, against the CCM transmission timer that CFM (Connectivity Fault Management, lib/cfm.c) maintains from its ccm_interval, whether a Continuity Check Message (CCM, from IEEE 802.1ag) is due; if so it composes a CCM into a packet and sends it. Sending means building netlink message attributes, among them a flow key and actions, and finally invoking the execute method of struct dpif_class (see the fields of dpif_linux_class for the concrete implementation), which appears to hand the packet to the kernel for processing via the Generic Netlink mechanism.
static void port_run_fast(struct ofport_dpif *ofport)
{
    if (ofport->cfm && cfm_should_send_ccm(ofport->cfm)) {
        struct ofpbuf packet;

        ofpbuf_init(&packet, 0);
        cfm_compose_ccm(ofport->cfm, &packet, ofport->up.pp.hw_addr);
        send_packet(ofport, &packet);
        ofpbuf_uninit(&packet);
    }
}
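cfm_should_send_ccm() itself is just a timer check; a sketch of the lib/cfm.c logic (the timer field name is from memory and may differ):
/* lib/cfm.c (sketch): a CCM is due whenever the transmission timer, which is
 * re-armed from ccm_interval after each send, has expired. */
bool cfm_should_send_ccm(struct cfm *cfm)
{
    return timer_expired(&cfm->tx_timer);
}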
----
struct ofport_dpif {
    struct ofport up;

    uint32_t odp_port;
    struct ofbundle *bundle;      /* Bundle that contains this port, if any. */
    struct list bundle_node;      /* In struct ofbundle's "ports" list. */
    struct cfm *cfm;              /* Connectivity Fault Management, if any. */
    tag_type tag;                 /* Tag associated with this port. */
    uint32_t bond_stable_id;      /* stable_id to use as bond slave, or 0. */
    bool may_enable;              /* May be enabled in bonds. */
    long long int carrier_seq;    /* Carrier status changes. */

    /* Spanning tree. */
    struct stp_port *stp_port;    /* Spanning Tree Protocol, if any. */
    enum stp_state stp_state;     /* Always STP_DISABLED if STP not in use. */
    long long int stp_state_entered;

    struct hmap priorities;       /* Map of attached 'priority_to_dscp's. */
};
/* Sends 'packet' out of 'ofport'. */
static int send_packet(const struct ofport_dpif *ofport, struct ofpbuf *packet)
{
    const struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofport->up.ofproto);
    struct ofpbuf key, odp_actions;
    struct odputil_keybuf keybuf;
    uint16_t odp_port;
    struct flow flow;
    int error;

    flow_extract(packet, 0, 0, NULL, 0, &flow); /* Extract the flow from the packet. */
    odp_port = vsp_realdev_to_vlandev(ofproto, ofport->odp_port, flow.vlan_tci);
    /* Map the real device plus vlan_tci to the VLAN device (e.g. eth0.19). */
    if (odp_port != ofport->odp_port) {
        eth_pop_vlan(packet); /* Strip the outermost VLAN header from the packet. */
        flow.vlan_tci = htons(0);
    }

    ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
    odp_flow_key_from_flow(&key, &flow);
    /* Builds netlink attributes in 'key' from the fields of 'flow', e.g.
     * nl_msg_put_u32(buf, OVS_KEY_ATTR_IN_PORT,
     *                ofp_port_to_odp_port(flow->in_port)); -- lib/odp-util.c */

    ofpbuf_init(&odp_actions, 32);
    compose_sflow_action(ofproto, &odp_actions, &flow, odp_port);
    /* compose_sflow_action() prepends an sFlow sampling action if sFlow is
     * enabled; the next line appends the actual "output" flow action. */
    nl_msg_put_u32(&odp_actions, OVS_ACTION_ATTR_OUTPUT, odp_port);

    error = dpif_execute(ofproto->dpif, key.data, key.size,
                         odp_actions.data, odp_actions.size, packet);
    /* Packages the key and actions built above into a struct dpif_execute
     * (lib/dpif.h) and invokes the concrete implementation's execute method. */
    ofpbuf_uninit(&odp_actions);

    if (error) {
        VLOG_WARN_RL(&rl, "%s: failed to send packet on port %"PRIu32" (%s)",
                     ofproto->up.name, odp_port, strerror(error));
    }
    ofproto_update_local_port_stats(ofport->up.ofproto, packet->size, 0);
    return error;
}
-----------
Causes the datapath interface to execute 'actions' on the Ethernet frame in 'packet'. The key may look redundant given the packet, but it carries metadata that cannot be recovered from the packet itself, such as the tunnel and in_port.
int dpif_execute(struct dpif *dpif, const struct nlattr *key, size_t key_len,
                 const struct nlattr *actions, size_t actions_len,
                 const struct ofpbuf *buf)
{
    struct dpif_execute execute;

    execute.key = key;
    execute.key_len = key_len;
    execute.actions = actions;
    execute.actions_len = actions_len;
    execute.packet = buf;
    return dpif_execute__(dpif, &execute);
}
---------
static int dpif_execute__(struct dpif *dpif, const struct dpif_execute *execute)
{
    int error;

    COVERAGE_INC(dpif_execute);
    if (execute->actions_len > 0) {
        error = dpif->dpif_class->execute(dpif, execute);
    } else {
        error = 0;
    }
    log_execute_message(dpif, execute, error);
    return error;
}
(The details of the rest of this path remain to be filled in.)
-----> dpif_linux_execute ---> dpif_linux_execute__ ---> nl_sock_transact
       |dpif-linux.c|                                    |netlink-socket.c|
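A rough sketch of what that path amounts to, assuming the OVS_PACKET_CMD_EXECUTE Generic Netlink command defined by the datapath ABI (include/linux/openvswitch.h); execute_via_genl is a hypothetical name and the surrounding code only approximates dpif-linux.c:
/* Sketch: encode an OVS_PACKET_CMD_EXECUTE request and send it over the
 * Generic Netlink socket.  Attribute names are from the datapath ABI;
 * the surrounding code is illustrative, not the exact dpif-linux.c source. */
static int execute_via_genl(struct nl_sock *sock, int dp_ifindex, int family,
                            const struct dpif_execute *execute)
{
    uint64_t stub[1024 / 8];
    struct ofpbuf request;
    struct ovs_header *ovs_header;
    int error;

    ofpbuf_use_stub(&request, stub, sizeof stub);
    nl_msg_put_genlmsghdr(&request, 0, family, NLM_F_REQUEST,
                          OVS_PACKET_CMD_EXECUTE, OVS_PACKET_VERSION);
    ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
    ovs_header->dp_ifindex = dp_ifindex;

    nl_msg_put_unspec(&request, OVS_PACKET_ATTR_PACKET,
                      execute->packet->data, execute->packet->size);
    nl_msg_put_unspec(&request, OVS_PACKET_ATTR_KEY,
                      execute->key, execute->key_len);
    nl_msg_put_unspec(&request, OVS_PACKET_ATTR_ACTIONS,
                      execute->actions, execute->actions_len);

    error = nl_sock_transact(sock, &request, NULL); /* Blocks for the kernel's ack. */
    ofpbuf_uninit(&request);
    return error;
}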
--------------------------------------------
3.6 handle_upcalls() receives the packets (flow misses) that the kernel datapath could not handle, each wrapped in a struct dpif_upcall (lib/dpif.h).
static int handle_upcalls(struct ofproto_dpif *ofproto, unsigned int max_batch)
{
    struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
    struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
    uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
    int n_processed;
    int n_misses;
    int i;

    assert(max_batch <= FLOW_MISS_MAX_BATCH);

    n_misses = 0;
    for (n_processed = 0; n_processed < max_batch; n_processed++) {
        struct dpif_upcall *upcall = &misses[n_misses];
        struct ofpbuf *buf = &miss_bufs[n_misses];
        int error;

        /* Use the on-stack stub as the buffer's initial storage, avoiding a
         * heap allocation for small upcalls. */
        ofpbuf_use_stub(buf, miss_buf_stubs[n_misses],
                        sizeof miss_buf_stubs[n_misses]);
        error = dpif_recv(ofproto->dpif, upcall, buf);
        if (error) {
            /* EAGAIN means no more upcalls are queued. */
            ofpbuf_uninit(buf);
            break;
        }

        switch (classify_upcall(upcall)) {
        case MISS_UPCALL:
            /* Handle it later. */
            n_misses++;
            break;

        case SFLOW_UPCALL:
            /* SFLOW_UPCALL and BAD_UPCALL are handled (or dropped) on the
             * spot and their buffers freed; MISS_UPCALLs are deferred to
             * handle_miss_upcalls() below. */
            if (ofproto->sflow) {
                handle_sflow_upcall(ofproto, upcall);
            }
            ofpbuf_uninit(buf);
            break;

        case BAD_UPCALL:
            ofpbuf_uninit(buf);
            break;
        }
    }

    /* Handle deferred MISS_UPCALL processing. */
    handle_miss_upcalls(ofproto, misses, n_misses);
    for (i = 0; i < n_misses; i++) {
        ofpbuf_uninit(&miss_bufs[i]);
    }

    return n_processed;
}
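classify_upcall() distinguishes the three cases by the datapath upcall type and, for DPIF_UC_ACTION upcalls, by the user-action cookie; an abbreviated sketch of the ofproto-dpif.c logic, trimmed to the cases used above (later versions add more cookie types):
static enum upcall_type classify_upcall(const struct dpif_upcall *upcall)
{
    union user_action_cookie cookie;

    /* Flow-table misses are their own upcall type; everything else of
     * interest arrives as an "action" upcall whose cookie says who asked
     * for it (here, sFlow sampling). */
    if (upcall->type == DPIF_UC_MISS) {
        return MISS_UPCALL;
    } else if (upcall->type != DPIF_UC_ACTION) {
        return BAD_UPCALL;
    }

    memcpy(&cookie, &upcall->userdata, sizeof cookie);
    return cookie.type == USER_ACTION_COOKIE_SFLOW ? SFLOW_UPCALL : BAD_UPCALL;
}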
--------
/* Polls 'dpif' for an upcall.  On success, 'upcall' describes it and its data
 * is stored in 'buf'.  The caller must previously have enabled receiving
 * packets from 'dpif' with dpif_recv_set().  'upcall->packet' and
 * 'upcall->key' both point into the caller-provided 'buf', so they must not
 * be freed separately. */
int dpif_recv(struct dpif *dpif, struct dpif_upcall *upcall, struct ofpbuf *buf)
{
    int error = dpif->dpif_class->recv(dpif, upcall, buf);
    if (!error && !VLOG_DROP_DBG(&dpmsg_rl)) {
        struct ds flow;               /* Dynamic string (lib/dynamic-string.h). */
        char *packet;

        /* Renders the Ethernet frame as a string (lib/ofp-print.c). */
        packet = ofp_packet_to_string(upcall->packet->data, upcall->packet->size);

        ds_init(&flow);
        odp_flow_key_format(upcall->key, upcall->key_len, &flow);
        /* Parses the OVS_KEY_ATTR_* attributes out of the upcall's key into
         * 'flow', purely for log output. */

        VLOG_DBG("%s: %s upcall:\n%s\n%s",
                 dpif_name(dpif), dpif_upcall_type_to_string(upcall->type),
                 ds_cstr(&flow), packet);

        ds_destroy(&flow);
        free(packet);
    } else if (error && error != EAGAIN) {
        log_operation(dpif, "recv", error);
    }
    return error;
}
--------------
static void handle_miss_upcalls(struct ofproto_dpif *ofproto,
                                struct dpif_upcall *upcalls, size_t n_upcalls)
{
    struct dpif_upcall *upcall;
    struct flow_miss *miss;
    struct flow_miss misses[FLOW_MISS_MAX_BATCH];
    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
    struct hmap todo;
    int n_misses;
    size_t n_ops;
    size_t i;

    /* Construct the to-do list.  This extracts the flow from each packet and
     * collects the packets that share a flow into a single "flow_miss"
     * structure so that they can be processed together. */
    hmap_init(&todo);
    n_misses = 0;
    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
        struct flow_miss *miss = &misses[n_misses];
        struct flow_miss *existing_miss;
        struct flow flow;
        uint32_t hash;

        /* Much like odp_flow_key_to_flow(), ofproto_dpif_extract_flow_key()
         * (ofproto/ofproto-dpif.c) converts the fixed-length OVS_KEY_ATTR_*
         * attributes in the key into a struct flow, returning an ODP_FIT_*
         * value that says how well upcall->key matched our expectations. */
        miss->key_fitness = ofproto_dpif_extract_flow_key(
            ofproto, upcall->key, upcall->key_len, &flow, &miss->initial_tci,
            upcall->packet);
        if (miss->key_fitness == ODP_FIT_ERROR) {
            continue;
        }

        /* flow_extract() (lib/flow.c) fills in the fields of 'miss->flow'
         * from the packet plus the 'skb_priority', tunnel, and in_port
         * metadata (important); it also makes several pointers inside the
         * packet valid (e.g. packet->l4 = b.data), following the headers
         * layer by layer. */
        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark,
                     &flow.tunnel, flow.in_port, &miss->flow);

        /* Add the new packet to the to-do list, keyed by a hash of the flow
         * in struct flow_miss. */
        hash = flow_hash(&miss->flow, 0);
        existing_miss = flow_miss_find(&todo, &miss->flow, hash);
        if (!existing_miss) {
            hmap_insert(&todo, &miss->hmap_node, hash);
            miss->key = upcall->key;
            miss->key_len = upcall->key_len;
            miss->upcall_type = upcall->type;
            list_init(&miss->packets);
            n_misses++;
        } else {
            miss = existing_miss;
        }
        list_push_back(&miss->packets, &upcall->packet->list_node);
    }

    /* Then process the packets on the to-do list, building the batch of
     * operations to execute.  Depending on whether a flow deserves a facet,
     * handle_flow_miss() calls handle_flow_miss_without_facet() or
     * handle_flow_miss_with_facet(). */
    n_ops = 0;
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
    }
    assert(n_ops <= ARRAY_SIZE(flow_miss_ops));

    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
    /* dpif_operate() dispatches on each dpif_op's type to
     * dpif_flow_put/del/execute__ (lib/dpif.c). */

    /* Free memory and update facets. */
    for (i = 0; i < n_ops; i++) {
        struct flow_miss_op *op = &flow_miss_ops[i];

        switch (op->dpif_op.type) {
        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_PUT:
            if (!op->dpif_op.error) {
                op->subfacet->path = subfacet_want_path(op->subfacet->slow);
            }
            break;

        case DPIF_OP_FLOW_DEL:
            NOT_REACHED();
        }
        free(op->garbage);
    }
    hmap_destroy(&todo);
}
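Finally, the with/without-facet split mentioned in the comment above lives in handle_flow_miss(); a heavily abridged sketch of its shape (helper names and signatures approximated from this OVS era, so treat it as an outline rather than the exact source):
/* Abridged sketch: look up (or decide whether to create) a facet for the
 * flow, then queue the appropriate dpif operations onto the batch. */
static void handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss,
                             struct flow_miss_op *ops, size_t *n_ops)
{
    uint32_t hash = flow_hash(&miss->flow, 0);
    struct facet *facet;

    facet = facet_lookup_valid(ofproto, &miss->flow, hash);
    if (!facet) {
        struct rule_dpif *rule = rule_dpif_lookup(ofproto, &miss->flow);

        /* Low-volume flows are executed directly, without installing a facet
         * (and hence a kernel flow) for them. */
        if (!flow_miss_should_make_facet(ofproto, miss, hash)) {
            handle_flow_miss_without_facet(miss, rule, ops, n_ops);
            return;
        }
        facet = facet_create(rule, &miss->flow, hash);
    }
    handle_flow_miss_with_facet(miss, facet, ops, n_ops);
}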