ovn-controller是运行在chassis/hypervisor上的后台进程,向上通过OVSDB协议连接到OVN southbound数据库,向下通过OVSDB协议连接到ovs数据库,并通过openflow协议连接到ovs-vswitchd。
ovn-controller配置
ovn-controller从本地的ovs数据库中获取大部分它需要的配置。它可以通过db.sock连接到ovs数据库(默认位置是unix:/usr/local/var/run/openvswitch/db.sock)。
ovn-controller从本地ovs数据库的Open_vSwitch表中获取下列参数:
external_ids:system-id:
会更新到sbdb的Chassis的name字段。ovn-controller在运行过程中,不能动态修改system-id。
有两种办法可以修改system-id: 先停掉ovn-controller,再修改。或者修改完后,手动删除Chassis表中旧的记录。
external_ids:hostname:
会更新到sbdb的Chassis的hostname字段
external_ids:ovn-bridge:
用来连接逻辑端口的集成桥。如果不指定,则ovn-controller启动过程中会自动创建,集成桥名字默认为br-int。
external_ids:ovn-remote:
指定连接ovn southbound数据库的方式。
external_ids:ovn-encap-type:
指定其他chassis连接本节点时用的隧道类型。可以同时指定多个隧道类型。
用来连接chassis的隧道类型有geneve和stt,用来连接网关和chassis的隧道类型有geneve,stt和vxlan。
external_ids:ovn-encap-ip:
指定其他chassis连接本节点时用的ip地址。
external_ids:ovn-bridge-mappings:
键值对列表,用来映射物理网络名字到本地ovs网桥,意思为可通过此ovs网桥连接到物理网络名字所在的物理网络。
比如physnet1:br-eth0,physnet2:br-eth1
ovn-controller还会从本地ovs数据库的其他表中获取下列参数:
Bridge表中的datapath-type:
从集成桥中读取datapath-type字段,将其设置到sbdb的Chassis表的other_config:datapath-type字段。
Open_vSwitch表的iface-types:
设置到sbdb的Chassis表的other_config:iface-types字段
更新ovs数据库
上面列出的字段是ovn-controller需要读取的,同时它还会更新ovs数据库的下列字段
Port表中的external_ids:ovn-chassis-id:
此字段的存在表明这是一个由ovn-controller创建的tunnel端口,用来连接其他chassis。它的值是其他chassis的ID。
Port表中的external_ids:ovn-localnet-port:
此字段的存在表明这是一个由ovn-controller创建的patch端口,用来连接集成桥和其他网桥来实现localnet类型的逻辑端口。
Port表中的external_ids:ovn-l2gateway-port:
此字段的存在表明这是一个由ovn-controller创建的patch端口,用来连接集成桥和其他网桥来实现l2gateway类型的逻辑端口。
OVN southbound数据库
ovn-controller从sbdb中读取内容来指定它的操作。同时也会更新下面的表
Chassis:
ovn-controller启动时会在此表中创建一行,相当于上报自己的信息。
Encap:
ovn-controller启动时会在此表中创建一行,设置隧道类型和ip,用来告诉其他chassis如何连接本节点。
Port_Binding:
一旦逻辑端口对应的物理实体创建在chassis上,此chassis上的ovn-controller就会将Port_Binding中的Chassis字段更新为本节点。
MAC_Binding:
ovn-controller会根据put_arp和put_nd action更新此表。
源码分析
先看一下main函数大体逻辑,再拆分挨个分析。
main
//创建新线程,专门用来处理ovs-vswitchd上送的openflow消息
pinctrl_init();
ovs_thread_create("ovn_pinctrl", pinctrl_handler, &pinctrl);
//注册支持的匹配域到全局变量 symtab
lflow_init();
//通过 ovs_remote 连接到 ovsdb-server
//ovsrec_idl_class 指定了 ovsdb 中所有table的格式。
//false 表示默认不监听任何table
/* Connect to OVS OVSDB instance. */
struct ovsdb_idl_loop ovs_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovs_remote, &ovsrec_idl_class, false, true));
ctrl_register_ovs_idl(ovs_idl_loop.idl);
//注册感兴趣的table和column
ovsdb_idl_get_initial_snapshot(ovs_idl_loop.idl);
//配置 ovn sbdb,只是初始化了结构体 struct ovsdb_idl_loop,此时还不知道连接sbdb的信息,
//连接信息由 ovsdb 中 open_vswitch table 的 external_ids:ovn-remote 指定,等到后面连接到
//ovsdb后,才能真正连sbdb
//sbrec_idl_class 指定了 sbdb 中所有table的格式。
//true 表示默认监听所有table
/* Configure OVN SB database. */
struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create_unconnected(&sbrec_idl_class, true));
ovsdb_idl_set_leader_only(ovnsb_idl_loop.idl, false);
//忽略不感兴趣的内容
ovsdb_idl_omit(ovnsb_idl_loop.idl, &sbrec_sb_global_col_external_ids);
ovsdb_idl_omit(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_external_ids);
ovsdb_idl_omit(ovnsb_idl_loop.idl, &sbrec_port_binding_col_external_ids);
ovsdb_idl_omit(ovnsb_idl_loop.idl, &sbrec_ssl_col_external_ids);
...
//定义 Incremental Processing engine node,用来处理增量事件
/* Define inc-proc-engine nodes. */
ENGINE_NODE_CUSTOM_DATA(ct_zones, "ct_zones");
ENGINE_NODE_WITH_CLEAR_TRACK_DATA(runtime_data, "runtime_data");
ENGINE_NODE(mff_ovn_geneve, "mff_ovn_geneve");
#define SB_NODE(NAME, NAME_STR) ENGINE_NODE_SB(NAME, NAME_STR);
SB_NODES
#undef SB_NODE
#define OVS_NODE(NAME, NAME_STR) ENGINE_NODE_OVS(NAME, NAME_STR);
OVS_NODES
#undef OVS_NODE
...
//添加 engine_node 之间的依赖关系
/* Add dependencies between inc-proc-engine nodes. */
engine_add_input(&en_addr_sets, &en_sb_address_set,
addr_sets_sb_address_set_handler);
engine_add_input(&en_port_groups, &en_sb_port_group,
port_groups_sb_port_group_handler);
...
while (!exiting) {
//连接ovsdb并获取数据库内容
ovsdb_idl_loop_run(&ovs_idl_loop);
//连接sbdb并获取数据库内容
ovsdb_idl_loop_run(&ovnsb_idl_loop);
//通过 open_vswitch table 中 external_ids 的 ovn-bridge 指定已经存在的网桥,
//如果指定的网桥不存在,则 ovn-controller 会根据网桥名字自动创建。
//如果没有指定网桥,则 ovn-controller 也会自动创建一个网桥,网桥名字默认为 br-int。
process_br_int
//更新 sbdb 的 chassis 表,增加一行,保存本chassis信息。
//更新 sbdb 的 encap 表,增加一行,保存 tunnel 本端的ip和tunnel 类型
chassis_run
//通过openflow协议连接到集成网桥
ofctrl_run
//获取 chassis table 中其他 chassis 的 encap 信息,根据这些信息在本地的 br-int 上添加到
//其他 chassis 的 tunnel 口
encaps_run
//执行 engine node 的 change handler 函数,处理数据库内容变化,将logical flow转换成openflow流表
engine_run
en_flow_output_run
lflow_run
physical_run
en_runtime_data_run
binding_run
//在ha chassis之间的tunnel口上配置bfd
bfd_run
//根据配置创建patch端口
patch_run
//根据pinctrl_handler线程处理的消息,更新sbdb相关表项
pinctrl_run
//下发 openflow 流表信息到集成网桥 ovs-vswitchd
ofctrl_put
//执行ovs-appctl发送的debug命令
unixctl_server_run
}
- pinctrl_handler
新线程处理函数,用来处理ovs-vswitchd上送的ECHO_REQUEST,packet-in等消息
/* pinctrl_handler pthread function. */
static void *
pinctrl_handler(void *arg_)
struct pinctrl *pctrl = arg_;
/* OpenFlow connection to the switch. */
struct rconn *swconn;
/* Last seen sequence number for 'swconn'. When this differs from
* rconn_get_connection_seqno(rconn), 'swconn' has reconnected. */
unsigned int conn_seq_no = 0;
swconn = rconn_create(5, 0, DSCP_DEFAULT, 1 << OFP15_VERSION);
while (!latch_is_set(&pctrl->pinctrl_thread_exit)) {
long long int bfd_time = LLONG_MAX;
ovs_mutex_lock(&pinctrl_mutex);
//br_int_name 会在 pinctrl_run 中设置
pinctrl_rconn_setup(swconn, pctrl->br_int_name);
pinctrl_setup(swconn);
//连接到 br_int_name 指定的网桥上
if (br_int_name) {
char *target = xasprintf("unix:%s/%s.mgmt", ovs_rundir(), br_int_name);
//名字不相等时才去连接,只需要连接一次即可
if (strcmp(target, rconn_get_target(swconn))) {
VLOG_INFO("%s: connecting to switch", target);
rconn_connect(swconn, target, target);
}
free(target);
} else {
rconn_disconnect(swconn);
}
ovs_mutex_unlock(&pinctrl_mutex);
rconn_run(swconn);
if (rconn_is_connected(swconn)) {
if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
pinctrl_setup(swconn);
/* Fetch the switch configuration. The response later will allow us to
* change the miss_send_len to UINT16_MAX, so that we can enable
* asynchronous messages. */
queue_msg(swconn, ofpraw_alloc(OFPRAW_OFPT_GET_CONFIG_REQUEST, rconn_get_version(swconn), 0));
/* Set a packet-in format that supports userdata. */
queue_msg(swconn, ofputil_encode_set_packet_in_format(rconn_get_version(swconn), OFPUTIL_PACKET_IN_NXT2));
conn_seq_no = rconn_get_connection_seqno(swconn);
}
//从 ovs-vswitchd 接收消息
for (int i = 0; i < 50; i++) {
struct ofpbuf *msg = rconn_recv(swconn);
if (!msg) {
break;
}
//处理接收到的消息
const struct ofp_header *oh = msg->data;
enum ofptype type;
//解码获取 type
ofptype_decode(&type, oh);
//根据type做不同的处理
pinctrl_recv(swconn, oh, type);
//收到echo request消息,需要发送 echo reply
if (type == OFPTYPE_ECHO_REQUEST) {
queue_msg(swconn, ofputil_encode_echo_reply(oh));
//收到获取配置的响应消息
} else if (type == OFPTYPE_GET_CONFIG_REPLY) {
/* Enable asynchronous messages */
struct ofputil_switch_config config;
ofputil_decode_get_config_reply(oh, &config);
config.miss_send_len = UINT16_MAX;
set_switch_config(swconn, &config);
//收到 ovs-vswitchd 主动发送的 packet_in 消息
} else if (type == OFPTYPE_PACKET_IN) {
process_packet_in(swconn, oh);
struct ofputil_packet_in pin;
struct ofpbuf continuation;
ofputil_decode_packet_in(msg, true, NULL, NULL, &pin, NULL, NULL, &continuation);
struct ofpbuf userdata = ofpbuf_const_initializer(pin.userdata, pin.userdata_len);
const struct action_header *ah = ofpbuf_pull(&userdata, sizeof *ah);
struct dp_packet packet;
dp_packet_use_const(&packet, pin.packet, pin.packet_len);
struct flow headers;
flow_extract(&packet, &headers);
switch (ntohl(ah->opcode)) {
case ACTION_OPCODE_ARP:
pinctrl_handle_arp(swconn, &headers, &packet, &pin.flow_metadata,
&userdata);
break;
...
case ACTION_OPCODE_PUT_FDB:
ovs_mutex_lock(&pinctrl_mutex);
pinctrl_handle_put_fdb(&pin.flow_metadata.flow, &headers);
uint32_t dp_key = ntohll(md->metadata);
uint32_t port_key = md->regs[MFF_LOG_INPORT - MFF_REG0];
//二层学习功能,将fdb信息插入全局变量 put_fdbs,dp_key表示哪个datapath,
//headers->dl_src表示报文源mac,port_key表示接收报文的端口。
//在主线程的 pinctrl_run 函数中将fdb信息更新到 sbdb 的 fdb table 中
ovn_fdb_add(&put_fdbs, dp_key, headers->dl_src, port_key);
notify_pinctrl_main();
ovs_mutex_unlock(&pinctrl_mutex);
break;
}
} else { //忽略其他类型消息
if (VLOG_IS_DBG_ENABLED()) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300);
char *s = ofp_to_string(oh, ntohs(oh->length), NULL, NULL, 2);
VLOG_DBG_RL(&rl, "OpenFlow packet ignored: %s", s);
free(s);
}
}
ofpbuf_delete(msg);
}
if (may_inject_pkts()) {
ovs_mutex_lock(&pinctrl_mutex);
send_garp_rarp_run(swconn, &send_garp_rarp_time);
send_ipv6_ras(swconn, &send_ipv6_ra_time);
send_ipv6_prefixd(swconn, &send_prefixd_time);
send_mac_binding_buffered_pkts(swconn);
bfd_monitor_send_msg(swconn, &bfd_time);
ovs_mutex_unlock(&pinctrl_mutex);
ip_mcast_querier_run(swconn, &send_mcast_query_time);
}
}
ovs_mutex_lock(&pinctrl_mutex);
svc_monitors_run(swconn, &svc_monitors_next_run_time);
ovs_mutex_unlock(&pinctrl_mutex);
}
- chassis_run
chassis_run用来在sbdb的chassis和encap表中增加本chassis的信息,相当于向集群中上报。
//获取 open_vswitch table 中 external_ids 的 system-id
const char *chassis_id = get_ovs_chassis_id(ovs_table);
const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_table_first(ovs_table);
const char *chassis_id = cfg ? smap_get(&cfg->external_ids, "system-id") : NULL;
if (!chassis_id) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_WARN_RL(&rl, "'system-id' in Open_vSwitch database is missing.");
}
return chassis_id;
//只有open_vswitch table设置了 system-id,才会向sbdb更新本chassis的信息
//更新 sbdb 的 chassis table,增加一行,保存本chassis信息。
//更新 sbdb 的 encap table,增加一行,保存 tunnel 本端的ip和tunnel 类型
const struct sbrec_chassis *chassis = NULL;
const struct sbrec_chassis_private *chassis_private = NULL;
if (chassis_id) {
chassis = chassis_run(ovnsb_idl_txn, sbrec_chassis_by_name,
sbrec_chassis_private_by_name,
ovs_table, chassis_id,
br_int, &transport_zones,
&chassis_private);
struct ovs_chassis_cfg ovs_cfg;
*chassis_private = NULL;
/* Get the chassis config from the ovs table. */
ovs_chassis_cfg_init(&ovs_cfg);
chassis_parse_ovs_config(ovs_table, br_int, &ovs_cfg));
//获取 open_vswitch table 中 external_ids 的 ovn-encap-type 和 ovn-encap-ip
const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_table_first(ovs_table);
const char *encap_type = smap_get(&cfg->external_ids, "ovn-encap-type");
const char *encap_ips = smap_get(&cfg->external_ids, "ovn-encap-ip");
//获取 open_vswitch table 中 external_ids 的 其他参数
ovs_cfg->hostname = get_hostname(&cfg->external_ids);
ovs_cfg->bridge_mappings = get_bridge_mappings(&cfg->external_ids);
ovs_cfg->datapath_type = get_datapath_type(br_int);
ovs_cfg->encap_csum = get_encap_csum(&cfg->external_ids);
ovs_cfg->cms_options = get_cms_options(&cfg->external_ids);
ovs_cfg->monitor_all = get_monitor_all(&cfg->external_ids);
ovs_cfg->chassis_macs = get_chassis_mac_mappings(&cfg->external_ids);
ovs_cfg->enable_lflow_cache = get_enable_lflow_cache(&cfg->external_ids);
ovs_cfg->limit_lflow_cache = get_limit_lflow_cache(&cfg->external_ids);
ovs_cfg->memlimit_lflow_cache = get_memlimit_lflow_cache(&cfg->external_ids);
//将 encap_type 保存到 encap_type_set
chassis_parse_ovs_encap_type(encap_type, &ovs_cfg->encap_type_set)
//将 encap_ips 保存到 encap_ip_set
chassis_parse_ovs_encap_ip(encap_ips, &ovs_cfg->encap_ip_set)
chassis_parse_ovs_iface_types(cfg->iface_types,
cfg->n_iface_types,
&ovs_cfg->iface_types)
ovs_cfg->is_interconn = get_is_interconn(&cfg->external_ids);
//创建/更新 chassis table,添加一个 chassis_id
//可通过 ovn-sbctl list chassis 查看
const struct sbrec_chassis *chassis_rec = NULL;
//根据 chassis_id 查找 sbrec_chassis_by_name,如果不存在,则在 chassis table 中插入一行
bool existed = chassis_get_record(ovnsb_idl_txn, sbrec_chassis_by_name, chassis_id, &chassis_rec);
const struct sbrec_chassis *chassis =
chassis = chassis_lookup_by_name(sbrec_chassis_by_name, chassis_id);
if (!chassis && ovnsb_idl_txn) {
/* Create the chassis record. */
VLOG_DBG("Could not find Chassis, will create it: %s", chassis_id);
*chassis_rec = sbrec_chassis_insert(ovnsb_idl_txn);
return false;
}
*chassis_rec = chassis;
//更新 chassis row 字段
chassis_update(chassis_rec, ovnsb_idl_txn, &ovs_cfg, chassis_id, transport_zones);
//设置 chassis 的name列
sbrec_chassis_set_name(chassis_rec, chassis_id);
//设置chassis的hostname
sbrec_chassis_set_hostname(chassis_rec, ovs_cfg->hostname);
...
update_chassis_transport_zones(transport_zones, chassis_rec);
...
struct sbrec_encap **encaps;
size_t n_encap;
//在 encap table 中添加一行,保存tunnel类型和本端的ip
encaps =
chassis_build_encaps(ovnsb_idl_txn, &ovs_cfg->encap_type_set,
&ovs_cfg->encap_ip_set, chassis_id,
ovs_cfg->encap_csum, &n_encap);
const char *encap_ip;
const char *encap_type;
SSET_FOR_EACH (encap_ip, encap_ip_set) {
SSET_FOR_EACH (encap_type, encap_type_set) {
struct sbrec_encap *encap = sbrec_encap_insert(ovnsb_idl_txn);
sbrec_encap_set_type(encap, encap_type);
sbrec_encap_set_ip(encap, encap_ip);
sbrec_encap_set_options(encap, &options);
sbrec_encap_set_chassis_name(encap, chassis_id);
encaps[tunnel_count] = encap;
tunnel_count++;
}
}
//将 encap 信息更新到本 chassis 的 encaps 列中
sbrec_chassis_set_encaps(chassis_rec, encaps, n_encap);
}
- encaps_run
获取 chassis table 中,其他 chassis 的 encap 信息,根据这些信息在本地的 br-int 上创建到其他所有 chassis 的 tunnel 口
encaps_run(ovs_idl_txn,
bridge_table, br_int,
sbrec_chassis_table_get(ovnsb_idl_loop.idl),
chassis,
sbrec_sb_global_first(ovnsb_idl_loop.idl),
&transport_zones) {
const struct sbrec_chassis *chassis_rec;
const struct ovsrec_bridge *br;
struct tunnel_ctx tc = {
.chassis = SHASH_INITIALIZER(&tc.chassis),
.port_names = SSET_INITIALIZER(&tc.port_names),
.br_int = br_int,
.this_chassis = this_chassis
};
//获取本地所有bridge的所有端口,保存到 tc.port_names
//对于包含"ovn-chassis-id"的端口,说明这是之前添加过的tunnel口,还要将此信息添加到 tc.chassis
/* Collect all port names into tc.port_names.
*
* Collect all the OVN-created tunnels into tc.tunnel_hmap. */
OVSREC_BRIDGE_TABLE_FOR_EACH (br, bridge_table) {
for (size_t i = 0; i < br->n_ports; i++) {
const struct ovsrec_port *port = br->ports[i];
sset_add(&tc.port_names, port->name);
/*
* note that the id here is not just the chassis name, but the
* combination of <chassis_name><delim><encap_ip>
*/
const char *id = smap_get(&port->external_ids, "ovn-chassis-id");
if (id) {
if (!shash_find(&tc.chassis, id)) {
struct chassis_node *chassis = xzalloc(sizeof *chassis);
chassis->bridge = br;
chassis->port = port;
shash_add_assert(&tc.chassis, id, chassis);
} else {
/* Duplicate port for ovn-chassis-id. Arbitrarily choose
* to delete this one. */
ovsrec_bridge_update_ports_delvalue(br, port);
}
}
}
}
//遍历 chassis table,添加到其他 chassis 的 tunnel 口
SBREC_CHASSIS_TABLE_FOR_EACH (chassis_rec, chassis_table) {
//跳过本 chassis
if (strcmp(chassis_rec->name, this_chassis->name)) {
/* Create tunnels to the other Chassis belonging to the
* same transport zone */
if (!chassis_tzones_overlap(transport_zones, chassis_rec)) {
VLOG_DBG("Skipping encap creation for Chassis '%s' because "
"it belongs to different transport zones",
chassis_rec->name);
continue;
}
if (smap_get_bool(&chassis_rec->other_config, "is-remote", false)
&& !smap_get_bool(&this_chassis->other_config, "is-interconn",
false)) {
VLOG_DBG("Skipping encap creation for Chassis '%s' because "
"it is remote but this chassis is not interconn.",
chassis_rec->name);
continue;
}
chassis_tunnel_add(chassis_rec, sbg, &tc)
struct sbrec_encap *encap = preferred_encap(chassis_rec);
uint32_t pref_type = get_tunnel_type(encap->type);
for (int i = 0; i < chassis_rec->n_encaps; i++) {
uint32_t tun_type = get_tunnel_type(chassis_rec->encaps[i]->type);
if (tun_type != pref_type) {
continue;
}
//给 ovsdb 添加 tunnel 口
tunnel_add(tc, sbg, chassis_rec->name, chassis_rec->encaps[i]);
struct smap options = SMAP_INITIALIZER(&options);
smap_add(&options, "remote_ip", encap->ip);
smap_add(&options, "key", "flow");
const char *dst_port = smap_get(&encap->options, "dst_port");
const char *csum = smap_get(&encap->options, "csum");
char *tunnel_entry_id = NULL;
//tunnel_entry_id 格式: chassis_id@encap_ip
tunnel_entry_id = encaps_tunnel_id_create(new_chassis_id, encap->ip);
#define OVN_MVTEP_CHASSISID_DELIM '@'
return xasprintf("%s%c%s", chassis_id, OVN_MVTEP_CHASSISID_DELIM, encap_ip);
smap_add(&options, "csum", csum);
smap_add(&options, "dst_port", dst_port);
//根据 tunnel_entry_id 查找 tc->chassis,能找到说明之前添加过
struct chassis_node *chassis = shash_find_data(&tc->chassis, tunnel_entry_id);
//如果 tunnel 仍然被需要,则从 tc->chassis 删除它
if (chassis
&& chassis->port->n_interfaces == 1
&& !strcmp(chassis->port->interfaces[0]->type, encap->type)
&& smap_equal(&chassis->port->interfaces[0]->options, &options)) {
shash_find_and_delete(&tc->chassis, tunnel_entry_id);
free(chassis);
goto exit;
}
/* Choose a name for the new port. If we're replacing an old port, reuse
* its name, otherwise generate a new, unique name. */
char *port_name = (chassis
? xstrdup(chassis->port->name)
: tunnel_create_name(tc, new_chassis_id));
//插入 interface
struct ovsrec_interface *iface = ovsrec_interface_insert(tc->ovs_txn);
ovsrec_interface_set_name(iface, port_name);
ovsrec_interface_set_type(iface, encap->type);
ovsrec_interface_set_options(iface, &options);
//插入 port
struct ovsrec_port *port = ovsrec_port_insert(tc->ovs_txn);
ovsrec_port_set_name(port, port_name);
ovsrec_port_set_interfaces(port, &iface, 1);
//将 tunnel_entry_id 添加到端口的 external_ids
const struct smap id = SMAP_CONST1(&id, "ovn-chassis-id", tunnel_entry_id);
ovsrec_port_set_external_ids(port, &id);
ovsrec_bridge_update_ports_addvalue(tc->br_int, port);
tuncnt++;
}
}
}
}
- patch_run
sbdb中port_binding端口类型为patch,l3gateway,localnet和l2gateway的端口在物理实体上对应patch端口,
并且会在patch端口的external_ids字段标识端口类型。patch_run的作用就是创建这些逻辑端口对应的物理patch端口。
/* Figure out what patch ports already exist.
*
* ovn-controller does not create or use ports of type "ovn-l3gateway-port"
* or "ovn-logical-patch-port", but older version did. We still recognize
* them here, so that we delete them at the end of this function, to avoid
* leaving useless ports on upgrade. */
//遍历ovs的PORT表,获取当前所有的patch端口(external_ids包含ovn-localnet-port,ovn-l2gateway-port,
//ovn-l3gateway-port和ovn-logical-patch-port的端口)
struct shash existing_ports = SHASH_INITIALIZER(&existing_ports);
const struct ovsrec_port *port;
OVSREC_PORT_TABLE_FOR_EACH (port, port_table) {
if (smap_get(&port->external_ids, "ovn-localnet-port")
|| smap_get(&port->external_ids, "ovn-l2gateway-port")
|| smap_get(&port->external_ids, "ovn-l3gateway-port")
|| smap_get(&port->external_ids, "ovn-logical-patch-port")) {
shash_add(&existing_ports, port->name, port);
}
}
/* Create in the database any patch ports that should exist. Remove from
* 'existing_ports' any patch ports that do exist in the database and
* should be there. */
add_bridge_mappings(ovs_idl_txn, sbrec_port_binding_by_type, bridge_table,
ovs_table, br_int, &existing_ports, chassis,
local_datapaths);
/* Get ovn-bridge-mappings. */
struct shash bridge_mappings = SHASH_INITIALIZER(&bridge_mappings);
//在ovs_table的external_ids字段获取 ovn-bridge-mappings,此字段定义了
//外部网络和网桥的映射关系,比如:externalnet:br-eth1
add_ovs_bridge_mappings(ovs_table, bridge_table, &bridge_mappings);
//ovn-l2gateway-port类型的端口会在port_binding的选项 network_name 指定
//连接到的外部网络,network_name是在ovn-bridge-mappings定义的映射关系的 externalnet
add_bridge_mappings_by_type(ovs_idl_txn, sbrec_port_binding_by_type,
br_int, existing_ports, chassis,
&bridge_mappings, "l2gateway",
"ovn-l2gateway-port", local_datapaths, true);
/* Since having localnet ports that are not mapped on some chassis is a
* supported configuration used to implement multisegment switches with
* fabric L3 routing between segments, log the following message once per
* run but don't unnecessarily pollute the log file; pass
* 'log_missing_bridge = false'.
*/
//同理创建 ovn-localnet-port 类型的端口
add_bridge_mappings_by_type(ovs_idl_txn, sbrec_port_binding_by_type,
br_int, existing_ports, NULL,
&bridge_mappings, "localnet",
"ovn-localnet-port", local_datapaths, false);
shash_destroy(&bridge_mappings);
//删除不再需要的patch端口
/* Now 'existing_ports' only still contains patch ports that exist in the
* database but shouldn't. Delete them from the database. */
struct shash_node *port_node, *port_next_node;
SHASH_FOR_EACH_SAFE (port_node, port_next_node, &existing_ports) {
port = port_node->data;
shash_delete(&existing_ports, port_node);
remove_port(bridge_table, port);
}
shash_destroy(&existing_ports);
- engine_run
下图是engine node之间的依赖关系,主要是为了减少表项变化带来的负载,一个表项变化,只会引起以它作为输入的节点的变化。
按照不同的功能将openflow table划分如下
/* OpenFlow table numbers.
*
* These are heavily documented in ovn-architecture(7), please update it if
* you make any changes. */
//table 0专门用来将物理端口转换成其对应的逻辑端口
#define OFTABLE_PHY_TO_LOG 0
//table1-7暂时还没用到
//table 8用来对应logical_flow ingress 方向的第一个table,
//直到table 32(8+24)
#define OFTABLE_LOG_INGRESS_PIPELINE 8 /* First of LOG_PIPELINE_LEN tables. */
//table33-36暂时还没用到
//table37用来存放输出端口为其他chassis的流表
#define OFTABLE_REMOTE_OUTPUT 37
//table38用来存放输出端口为本chassis的流表
#define OFTABLE_LOCAL_OUTPUT 38
//table39用来做loopback检查,即输出端口不能为输入端口
#define OFTABLE_CHECK_LOOPBACK 39
//table40用来对应logical_flow egress 方向的第一个table,
//直到table 50(40+10)
#define OFTABLE_LOG_EGRESS_PIPELINE 40 /* First of LOG_PIPELINE_LEN tables. */
#define OFTABLE_SAVE_INPORT 64
//table65用来将逻辑端口转换成物理端口
#define OFTABLE_LOG_TO_PHY 65
#define OFTABLE_MAC_BINDING 66
#define OFTABLE_MAC_LOOKUP 67
#define OFTABLE_CHK_LB_HAIRPIN 68
#define OFTABLE_CHK_LB_HAIRPIN_REPLY 69
#define OFTABLE_CT_SNAT_FOR_VIP 70
#define OFTABLE_GET_FDB 71
#define OFTABLE_LOOKUP_FDB 72
根据sbdb数据库内容更新到openflow流表中
static void
en_flow_output_run(struct engine_node *node, void *data)
lflow_run(&l_ctx_in, &l_ctx_out);
//将 logical flow table 中的流表转换到 openflow table 中
add_logical_flows(l_ctx_in, l_ctx_out);
SBREC_LOGICAL_FLOW_TABLE_FOR_EACH (lflow, l_ctx_in->logical_flow_table) {
consider_logical_flow(lflow, &dhcp_opts, &dhcpv6_opts,
&nd_ra_opts, &controller_event_opts,
l_ctx_in, l_ctx_out));
consider_logical_flow__(lflow, dp,
dhcp_opts, dhcpv6_opts, nd_ra_opts,
controller_event_opts,
l_ctx_in, l_ctx_out)
/* Determine translation of logical table IDs to physical table IDs. */
uint8_t first_ptable = (ingress
? OFTABLE_LOG_INGRESS_PIPELINE
: OFTABLE_LOG_EGRESS_PIPELINE);
//ptable 表示要将流表添加到的 openflow table id,
//对于ingress方向,0-24(logical_flow table id) -> 8-32(openflow table id)
//对于egress方向,0-10(logical_flow table id) -> 40- 50(openflow table id)
uint8_t ptable = first_ptable + lflow->table_id;
//output_ptable 表示 output action 指向的table id,
//对于ingress方向,为 OFTABLE_REMOTE_OUTPUT
//对于egress方向,为 OFTABLE_SAVE_INPORT
uint8_t output_ptable = (ingress
? OFTABLE_REMOTE_OUTPUT
: OFTABLE_SAVE_INPORT);
ovnacts_parse_string(lflow->actions, &pp, &ovnacts, &prereqs);
add_matches_to_flow_table(lflow, dp, matches, ptable, output_ptable, &ovnacts, ingress, l_ctx_in, l_ctx_out);
if (!m->n) {
ofctrl_add_flow(l_ctx_out->flow_table, ptable, lflow->priority,
lflow->header_.uuid.parts[0], &m->match, &ofpacts,
&lflow->header_.uuid);
ofctrl_check_and_add_flow(desired_flows, table_id, priority, cookie, match, actions, sb_uuid, true);
struct desired_flow *f = desired_flow_alloc(table_id, priority, cookie, match, actions);
hmap_insert(&flow_table->match_flow_table, &f->match_hmap_node, f->flow.hash);
} else {
ofctrl_add_or_append_flow(l_ctx_out->flow_table, ptable, lflow->priority, 0,
&m->match, &conj, &lflow->header_.uuid);
struct desired_flow *f = desired_flow_alloc(table_id, priority, cookie, match, actions);
hmap_insert(&desired_flows->match_flow_table, &f->match_hmap_node, f->flow.hash);
}
}
add_neighbor_flows(l_ctx_in->sbrec_port_binding_by_name,
l_ctx_in->mac_binding_table, l_ctx_in->local_datapaths,
l_ctx_out->flow_table);
SBREC_MAC_BINDING_TABLE_FOR_EACH (b, mac_binding_table) {
consider_neighbor_flow(sbrec_port_binding_by_name, local_datapaths, b, flow_table);
ofctrl_add_flow(flow_table, OFTABLE_MAC_BINDING, 100,
b->header_.uuid.parts[0], &get_arp_match,
&ofpacts, &b->header_.uuid);
ofctrl_add_flow(flow_table, OFTABLE_MAC_LOOKUP, 100,
b->header_.uuid.parts[0], &lookup_arp_match,
&ofpacts, &b->header_.uuid);
}
add_lb_hairpin_flows(l_ctx_in->lb_table, l_ctx_in->local_datapaths, l_ctx_out->flow_table);
const struct sbrec_load_balancer *lb;
SBREC_LOAD_BALANCER_TABLE_FOR_EACH (lb, lb_table) {
consider_lb_hairpin_flows(lb, local_datapaths, flow_table);
ofctrl_add_flow(flow_table, OFTABLE_CHK_LB_HAIRPIN, 100,
lb->slb->header_.uuid.parts[0], &hairpin_match,
&ofpacts, &lb->slb->header_.uuid);
}
add_fdb_flows(l_ctx_in->fdb_table, l_ctx_in->local_datapaths, l_ctx_out->flow_table);
const struct sbrec_fdb *fdb;
SBREC_FDB_TABLE_FOR_EACH (fdb, fdb_table) {
consider_fdb_flows(fdb, local_datapaths, flow_table);
ofctrl_add_flow(flow_table, OFTABLE_GET_FDB, 100,
fdb->header_.uuid.parts[0], &match, &ofpacts,
&fdb->header_.uuid);
ofctrl_add_flow(flow_table, OFTABLE_LOOKUP_FDB, 100,
fdb->header_.uuid.parts[0], &lookup_match, &ofpacts,
&fdb->header_.uuid);
}
physical_run(&p_ctx, &fo->flow_table);
/* Set up flows in table 0 for physical-to-logical translation and in table
* 64 for logical-to-physical translation. */
const struct sbrec_port_binding *binding;
SBREC_PORT_BINDING_TABLE_FOR_EACH (binding, p_ctx->port_binding_table) {
consider_port_binding(p_ctx->sbrec_port_binding_by_name,
p_ctx->mff_ovn_geneve, p_ctx->ct_zones,
p_ctx->active_tunnels, p_ctx->local_datapaths,
binding, p_ctx->chassis,
flow_table, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100,
binding->header_.uuid.parts[0],
&match, ofpacts_p, &binding->header_.uuid);
ofctrl_add_flow(flow_table, OFTABLE_LOG_TO_PHY, 100,
binding->header_.uuid.parts[0],
&match, ofpacts_p, &binding->header_.uuid);
ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG,
tag ? 150 : 100, binding->header_.uuid.parts[0],
&match, ofpacts_p, &binding->header_.uuid);
}
- ofctrl_put
下发 openflow 流表信息到 ovs-vswitchd
/* OpenFlow messages to send to the switch to bring it up-to-date. */
struct ovs_list msgs = OVS_LIST_INITIALIZER(&msgs);
//遍历 flow_table,构造成 openflow 消息后,挂到链表 msgs
update_installed_flows_by_compare(flow_table, &bc, &msgs);
/* Iterate through the desired flows and add those that aren't found
* in the installed flow table. */
struct desired_flow *d;
HMAP_FOR_EACH (d, match_hmap_node, &flow_table->match_flow_table) {
i = installed_flow_lookup(&d->flow);
if (!i) {
ovn_flow_log(&d->flow, "adding installed");
installed_flow_add(&d->flow, bc, msgs);
/* Send flow_mod to add flow. */
struct ofputil_flow_mod fm = {
.match = d->match,
.priority = d->priority,
.table_id = d->table_id,
.ofpacts = d->ofpacts,
.ofpacts_len = d->ofpacts_len,
.new_cookie = htonll(d->cookie),
.command = OFPFC_ADD,
};
add_flow_mod(&fm, bc, msgs);
struct ofpbuf *msg = encode_flow_mod(fm);
struct ofputil_bundle_add_msg bam = {
.bundle_id = bc->bundle_id,
.flags = bc->flags,
.msg = msg->data,
};
struct ofpbuf *bundle_msg;
bundle_msg = ofputil_encode_bundle_add(OFP15_VERSION, &bam);
ofpbuf_delete(msg);
ovs_list_push_back(msgs, &bundle_msg->list_node);
/* Copy 'd' from 'flow_table' to installed_flows. */
i = installed_flow_dup(d);
hmap_insert(&installed_flows, &i->match_hmap_node, i->flow.hash);
link_installed_to_desired(i, d);
}
}
//将链表 msgs 中的 openflow消息发送到 ovs-vswitchd
if (!ovs_list_is_empty(&msgs)) {
/* Add a barrier to the list of messages. */
struct ofpbuf *barrier = ofputil_encode_barrier_request(OFP15_VERSION);
const struct ofp_header *oh = barrier->data;
ovs_be32 xid_ = oh->xid;
ovs_list_push_back(&msgs, &barrier->list_node);
/* Queue the messages. */
struct ofpbuf *msg;
LIST_FOR_EACH_POP (msg, list_node, &msgs) {
queue_msg(msg);
//swconn 是个全局变量,在ofctrl_run中赋值,表示到 ovs-vswitchd 的连接
rconn_send(swconn, msg, tx_counter);
}
}
参考
https://docs.openstack.org/neutron/latest/ovn/faq/index.html
https://docs.openstack.org/neutron/latest/admin/ovn/ovn.html
https://weiti.org/ovn/2018/01/03/ovn-l2-breakout-options.html
https://networkop.co.uk/blog/2016/12/10/ovn-part2/