ovn-northd is the core daemon of OVN. It translates OVN's high-level configuration into the logical configuration consumed by the ovn-controller daemons; more precisely, it converts the conventional logical network configuration stored in the OVN northbound database into logical flows for the logical datapath pipelines in the OVN southbound database.
The following two options specify how to connect to the northbound and southbound databases:
--ovnnb-db=database
Specifies the northbound database. The default is unix:/usr/local/var/run/ovn/ovnnb_db.sock
--ovnsb-db=database
Specifies the southbound database. The default is unix:/usr/local/var/run/ovn/ovnsb_db.sock
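For example, a hypothetical invocation using the default socket paths listed above:
root@master:~# ovn-northd --ovnnb-db=unix:/usr/local/var/run/ovn/ovnnb_db.sock \
    --ovnsb-db=unix:/usr/local/var/run/ovn/ovnsb_db.sock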
Now let's walk through the source code.
int
main(int argc, char *argv[])
//Initialize the connection to the NB DB.
//The first argument to ovsdb_idl_create, ovnnb_db, is the path used to connect to the NB database.
//The second argument, nbrec_idl_class, is the schema of the NB database.
//The third argument (monitor_everything_by_default) is true: by default, everything in the database is monitored.
/* We want to detect (almost) all changes to the ovn-nb db. */
struct ovsdb_idl_loop ovnnb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovnnb_db, &nbrec_idl_class, true, true));
//Columns that ovn-northd itself writes; changes to them should not trigger an alert.
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl,
&nbrec_nb_global_col_nb_cfg_timestamp);
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_sb_cfg);
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl,
&nbrec_nb_global_col_sb_cfg_timestamp);
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_hv_cfg);
ovsdb_idl_omit_alert(ovnnb_idl_loop.idl,
&nbrec_nb_global_col_hv_cfg_timestamp);
//Initialize the connection to the SB DB.
//The first argument to ovsdb_idl_create, ovnsb_db, is the path used to connect to the SB database.
//The second argument, sbrec_idl_class, is the schema of the SB database.
//The third argument (monitor_everything_by_default) is false: by default, nothing in the database is monitored.
/* We want to detect only selected changes to the ovn-sb db. */
struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
ovsdb_idl_create(ovnsb_db, &sbrec_idl_class, false, true));
//Selectively register only the tables and columns of interest.
//Monitor selected columns of the SB_Global table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_sb_global);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_nb_cfg);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_options);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_ipsec);
...
//Monitor selected columns of the Logical_Flow table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_logical_flow);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_logical_flow_col_logical_datapath);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_logical_flow_col_logical_dp_group);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_pipeline);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_table_id);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_priority);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_match);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_actions);
...
//Monitor selected columns of the Port_Binding table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_port_binding);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_datapath);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_logical_port);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_tunnel_key);
add_column_noalert(ovnsb_idl_loop.idl,
&sbrec_port_binding_col_parent_port);
...
//Monitor selected columns of the FDB table.
ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_fdb);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_fdb_col_mac);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_fdb_col_dp_key);
add_column_noalert(ovnsb_idl_loop.idl, &sbrec_fdb_col_port_key);
while (!exiting) {
//Process one round of updates from the databases.
ovn_db_run(&ctx, sbrec_chassis_by_name, &ovnsb_idl_loop, ovn_internal_version);
//Update the southbound DB from the northbound DB; see the ovnnb_db_run analysis below.
ovnnb_db_run(ctx, sbrec_chassis_by_name, ovnsb_idl_loop, &datapaths, &ports, &lr_list, start_time, ovn_internal_version);
//Update the northbound DB from the southbound DB; see the ovnsb_db_run analysis below.
ovnsb_db_run(ctx, ovnsb_idl_loop, &ports, start_time);
}
- ovnnb_db_run
static void
ovnnb_db_run(struct northd_context *ctx,
struct ovsdb_idl_index *sbrec_chassis_by_name,
struct ovsdb_idl_loop *sb_loop,
struct hmap *datapaths, struct hmap *ports,
struct ovs_list *lr_list,
int64_t loop_start_time,
const char *ovn_internal_version)
const struct nbrec_nb_global *nb = nbrec_nb_global_first(ctx->ovnnb_idl);
//Store the mac_prefix option in the global variable mac_prefix; if the option is unset or all zeros, a random prefix is generated.
const char *mac_addr_prefix = set_mac_prefix(smap_get(&nb->options, "mac_prefix"));
//The svc_monitor_mac address is used by the SB Service_Monitor table.
/* MAC allocated for service monitor usage. Just one mac is allocated
 * for this purpose and ovn-controller's on each chassis will make use
 * of this mac when sending out the packets to monitor the services
 * defined in Service_Monitor Southbound table. Since these packets
 * are all locally handled, having just one mac is good enough. */
static char svc_monitor_mac[ETH_ADDR_STRLEN + 1];
static struct eth_addr svc_monitor_mac_ea;
//If svc_monitor_mac is set in the options of the NB_Global table, use the configured value.
const char *monitor_mac = smap_get(&nb->options, "svc_monitor_mac");
if (monitor_mac) {
if (eth_addr_from_string(monitor_mac, &svc_monitor_mac_ea)) {
snprintf(svc_monitor_mac, sizeof svc_monitor_mac,
ETH_ADDR_FMT, ETH_ADDR_ARGS(svc_monitor_mac_ea));
} else {
monitor_mac = NULL;
}
}
struct smap options;
smap_clone(&options, &nb->options);
//Save mac_prefix into options.
smap_add(&options, "mac_prefix", mac_addr_prefix);
//If svc_monitor_mac is not set in the NB_Global table, generate a random one.
if (!monitor_mac) {
eth_addr_random(&svc_monitor_mac_ea);
snprintf(svc_monitor_mac, sizeof svc_monitor_mac,
ETH_ADDR_FMT, ETH_ADDR_ARGS(svc_monitor_mac_ea));
//Save svc_monitor_mac into options.
smap_replace(&options, "svc_monitor_mac", svc_monitor_mac);
}
char *max_tunid = xasprintf("%d", get_ovn_max_dp_key_local(ctx));
//Save max_tunid into options.
smap_replace(&options, "max_tunid", max_tunid);
free(max_tunid);
//Save northd_internal_version into options.
smap_replace(&options, "northd_internal_version", ovn_internal_version);
nbrec_nb_global_verify_options(nb);
//Finally, write the updated options back to the NB_Global table.
nbrec_nb_global_set_options(nb, &options);
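The updated options can then be inspected in the NB DB, e.g. (a hypothetical example; the values shown are illustrative only):
root@master:~# ovn-nbctl --columns=options list NB_Global
options             : {mac_prefix="0a:f2:19", max_tunid="16711680", northd_internal_version="21.06.0-20.16.1-56.0", svc_monitor_mac="4e:d5:06:1f:3c:a1"}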
//The build_xxx functions below are the core of ovn-northd. They all follow the same pattern: update southbound tables from northbound tables. A few key flows are analyzed below.
//Sync the current NB logical switches and routers into the SB Datapath_Binding table;
//allocate a struct ovn_datapath for each datapath and add it to the hmap datapaths. See 1.1 below.
build_datapaths(ctx, datapaths, lr_list);
//Update the SB Port_Binding table from the NB Logical_Switch_Port and Logical_Router_Port tables. See 1.2 below.
build_ports(ctx, sbrec_chassis_by_name, datapaths, ports);
build_ovn_lbs(ctx, datapaths, ports, &lbs);
build_ipam(datapaths, ports);
build_port_group_lswitches(ctx, &port_groups, ports);
build_lrouter_groups(ports, lr_list);
build_ip_mcast(ctx, datapaths);
build_mcast_groups(ctx, datapaths, ports, &mcast_groups, &igmp_groups);
build_meter_groups(ctx, &meter_groups);
build_bfd_table(ctx, &bfd_connections, ports);
//Generate the SB Logical_Flow table from the NB datapaths, ports, and other configuration. See 1.3 below.
build_lflows(ctx, datapaths, ports, &port_groups, &mcast_groups, &igmp_groups, &meter_groups, &lbs, &bfd_connections);
1.1 build_datapaths
//Collect all datapaths in the NB DB: each logical switch and each logical router counts as one datapath.
//Write them into the SB Datapath_Binding table, one row per datapath.
build_datapaths(ctx, datapaths, lr_list);
//Read all rows of the SB Datapath_Binding table, plus the NB Logical_Switch and Logical_Router tables.
//Datapaths that exist only in the SB DB must eventually be deleted from Datapath_Binding.
//Datapaths that exist only in the NB DB will be inserted into Datapath_Binding.
//Datapaths present in both DBs may need their Datapath_Binding rows updated.
join_datapaths(ctx, datapaths, &sb_only, &nb_only, &both, lr_list);
/* Assign explicitly requested tunnel ids first. */
struct hmap dp_tnlids = HMAP_INITIALIZER(&dp_tnlids);
struct ovn_datapath *od, *next;
LIST_FOR_EACH (od, list, &both) {
ovn_datapath_assign_requested_tnl_id(&dp_tnlids, od);
}
LIST_FOR_EACH (od, list, &nb_only) {
ovn_datapath_assign_requested_tnl_id(&dp_tnlids, od);
}
//Keep the tunnel_keys already assigned to existing datapaths, recording them in dp_tnlids.
/* Keep nonconflicting tunnel IDs that are already assigned. */
LIST_FOR_EACH (od, list, &both) {
if (!od->tunnel_key && ovn_add_tnlid(&dp_tnlids, od->sb->tunnel_key)) {
od->tunnel_key = od->sb->tunnel_key;
}
}
/* Assign new tunnel ids where needed. */
uint32_t hint = 0;
LIST_FOR_EACH_SAFE (od, next, list, &both) {
ovn_datapath_allocate_key(ctx, datapaths, &dp_tnlids, od, &hint);
}
//Allocate tunnel_keys for new datapaths, counting up from 1 (see the allocator sketch after this code block).
LIST_FOR_EACH_SAFE (od, next, list, &nb_only) {
ovn_datapath_allocate_key(ctx, datapaths, &dp_tnlids, od, &hint);
}
/* Sync tunnel ids from nb to sb. */
LIST_FOR_EACH (od, list, &both) {
if (od->sb->tunnel_key != od->tunnel_key) {
sbrec_datapath_binding_set_tunnel_key(od->sb, od->tunnel_key);
}
ovn_datapath_update_external_ids(od);
}
//Datapaths that exist only in the NB DB are inserted into Datapath_Binding.
LIST_FOR_EACH (od, list, &nb_only) {
od->sb = sbrec_datapath_binding_insert(ctx->ovnsb_txn);
//Update the row's external_ids.
ovn_datapath_update_external_ids(od);
//Set the row's tunnel_key.
sbrec_datapath_binding_set_tunnel_key(od->sb, od->tunnel_key);
}
ovn_destroy_tnlids(&dp_tnlids);
//Datapaths that exist only in the SB DB are deleted from Datapath_Binding.
/* Delete southbound records without northbound matches. */
LIST_FOR_EACH_SAFE (od, next, list, &sb_only) {
ovs_list_remove(&od->list);
sbrec_datapath_binding_delete(od->sb);
ovn_datapath_destroy(datapaths, od);
}
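To make the tunnel-key scheme concrete, here is a minimal self-contained sketch of the same idea (this is not the real ovn-northd code; the actual helpers are ovn_add_tnlid and ovn_datapath_allocate_key, while MAX_TNL_KEY, tnlid_add, and tnlid_allocate below are invented for illustration): reserve explicitly requested keys first, keep existing nonconflicting keys, then hand out fresh keys counting up from 1 with a moving hint.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_TNL_KEY 16   /* tiny keyspace for the example; OVN's is much larger */

static bool used[MAX_TNL_KEY + 1];

/* Reserve a specific key, e.g. one explicitly requested via requested-tnl-key.
 * Fails for key 0, out-of-range keys, and conflicts. */
static bool
tnlid_add(uint32_t key)
{
    if (key == 0 || key > MAX_TNL_KEY || used[key]) {
        return false;
    }
    used[key] = true;
    return true;
}

/* Allocate the next free key, scanning upward from '*hint' and wrapping
 * around at the end of the keyspace. Returns 0 when exhausted. */
static uint32_t
tnlid_allocate(uint32_t *hint)
{
    for (uint32_t i = 0; i < MAX_TNL_KEY; i++) {
        uint32_t key = (*hint + i) % MAX_TNL_KEY + 1;
        if (tnlid_add(key)) {
            *hint = key;
            return key;
        }
    }
    return 0;
}

int main(void)
{
    uint32_t hint = 0;
    tnlid_add(3);                           /* an explicitly requested key */
    printf("%u\n", tnlid_allocate(&hint));  /* prints 1 */
    printf("%u\n", tnlid_allocate(&hint));  /* prints 2 */
    printf("%u\n", tnlid_allocate(&hint));  /* prints 4, since 3 is taken */
    return 0;
}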
1.2 build_ports
build_ports(ctx, sbrec_chassis_by_name, datapaths, ports);
struct ovs_list sb_only, nb_only, both;
struct hmap tag_alloc_table = HMAP_INITIALIZER(&tag_alloc_table);
struct hmap chassis_qdisc_queues = HMAP_INITIALIZER(&chassis_qdisc_queues);
/* sset which stores the set of ha chassis group names used. */
struct sset active_ha_chassis_grps =
SSET_INITIALIZER(&active_ha_chassis_grps);
//Read all rows of the SB Port_Binding table.
//Read every port of each datapath: for a logical switch datapath this means the Logical_Switch_Port table,
//for a logical router datapath the Logical_Router_Port table.
//All ports are put into ports.
//Ports found only in the SB Port_Binding table go into sb_only.
//Ports found only in the NB Logical_Switch_Port / Logical_Router_Port tables go into nb_only.
//Ports present in both DBs go into both.
join_logical_ports(ctx, datapaths, ports, &chassis_qdisc_queues,
&tag_alloc_table, &sb_only, &nb_only, &both);
//If some ports exist only in the SB Port_Binding table (i.e. they were deleted from the NB DB), stale MAC_Binding rows must be purged.
/* Purge stale Mac_Bindings if ports are deleted. */
bool remove_mac_bindings = !ovs_list_is_empty(&sb_only);
//If a port has the requested-tnl-key option set, assign that explicitly requested tunnel key first;
//on success the key is stored in op->tunnel_key.
//Tunnel keys are assigned/updated only for ports in both and nb_only, not for ports in sb_only,
//since those ports no longer exist and are about to be deleted.
/* Assign explicitly requested tunnel ids first. */
struct ovn_port *op, *next;
LIST_FOR_EACH (op, list, &both) {
ovn_port_assign_requested_tnl_id(op);
}
LIST_FOR_EACH (op, list, &nb_only) {
ovn_port_assign_requested_tnl_id(op);
}
/* Keep nonconflicting tunnel IDs that are already assigned. */
LIST_FOR_EACH (op, list, &both) {
if (!op->tunnel_key) {
ovn_port_add_tnlid(op, op->sb->tunnel_key);
}
}
//Ports without an explicitly requested tunnel key get one allocated automatically.
/* Assign new tunnel ids where needed. */
LIST_FOR_EACH_SAFE (op, next, list, &both) {
ovn_port_allocate_key(ports, op);
}
LIST_FOR_EACH_SAFE (op, next, list, &nb_only) {
ovn_port_allocate_key(ports, op);
}
//Insert each NB-only port (LSPs as well as LRPs) into the SB Port_Binding table and fill in the related columns.
/* Add southbound record for each unmatched northbound record. */
LIST_FOR_EACH_SAFE (op, next, list, &nb_only) {
op->sb = sbrec_port_binding_insert(ctx->ovnsb_txn);
ovn_port_update_sbrec(ctx, sbrec_chassis_by_name, op,
&chassis_qdisc_queues,
&active_ha_chassis_grps);
sbrec_port_binding_set_logical_port(op->sb, op->key);
}
//A port that no longer exists in the NB DB must not exist in the SB DB either, so SB-only ports are deleted.
/* Delete southbound records without northbound matches. */
if (!ovs_list_is_empty(&sb_only)) {
LIST_FOR_EACH_SAFE (op, next, list, &sb_only) {
ovs_list_remove(&op->list);
sbrec_port_binding_delete(op->sb);
ovn_port_destroy(ports, op);
}
}
//Purge the stale MAC_Binding rows.
if (remove_mac_bindings) {
cleanup_mac_bindings(ctx, datapaths, ports);
}
1.3 build_lflows
This function stages flows by calling ovn_lflow_add, which inserts them into the lflows hmap; it then iterates over lflows and writes each flow into the SB Logical_Flow table.
First, look at ovn_lflow_add. As shown below, it is just a macro that forwards to ovn_lflow_add_at, whose parameters are:
LFLOW_MAP: the map that collects the staged flows
OD: a struct ovn_datapath, identifying the datapath
PRIORITY: the flow priority
MATCH: the flow match expression
ACTIONS: the flow actions
STAGE: an enum ovn_stage value, which combines the datapath type, the packet direction, and the table id.
#define ovn_lflow_add(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS) \
ovn_lflow_add_at(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS, true, \
NULL, OVS_SOURCE_LOCATOR)
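For example, the VLAN-drop call used later in this walkthrough simply expands to an ovn_lflow_add_at call with the extra defaulted arguments:
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present", "drop;");
/* expands to: */
ovn_lflow_add_at(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present", "drop;", true, NULL, OVS_SOURCE_LOCATOR);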
//Structure that holds one staged logical flow.
struct ovn_lflow {
struct hmap_node hmap_node;
struct ovn_datapath *od; /* 'logical_datapath' in SB schema. */
struct hmapx od_group; /* Hash map of 'struct ovn_datapath *'. */
enum ovn_stage stage;
uint16_t priority;
char *match;
char *actions;
char *stage_hint;
const char *where;
};
Next, focus on the enum ovn_stage type. As defined below, it encodes the datapath type (logical switch or logical router), the packet direction to match (ingress or egress), and the table id (each table implements a distinct function).
/* A stage within an OVN logical switch or router.
*
* An "enum ovn_stage" indicates whether the stage is part of a logical switch
* or router, whether the stage is part of the ingress or egress pipeline, and
* the table within that pipeline. The first three components are combined to
* form the stage's full name, e.g. S_SWITCH_IN_PORT_SEC_L2,
* S_ROUTER_OUT_DELIVERY. */
enum ovn_stage {
#define PIPELINE_STAGES \
/* Logical switch ingress stages. */ \
PIPELINE_STAGE(SWITCH, IN, PORT_SEC_L2, 0, "ls_in_port_sec_l2") \
PIPELINE_STAGE(SWITCH, IN, PORT_SEC_IP, 1, "ls_in_port_sec_ip") \
PIPELINE_STAGE(SWITCH, IN, PORT_SEC_ND, 2, "ls_in_port_sec_nd") \
PIPELINE_STAGE(SWITCH, IN, LOOKUP_FDB , 3, "ls_in_lookup_fdb") \
PIPELINE_STAGE(SWITCH, IN, PUT_FDB, 4, "ls_in_put_fdb") \
PIPELINE_STAGE(SWITCH, IN, PRE_ACL, 5, "ls_in_pre_acl") \
PIPELINE_STAGE(SWITCH, IN, PRE_LB, 6, "ls_in_pre_lb") \
PIPELINE_STAGE(SWITCH, IN, PRE_STATEFUL, 7, "ls_in_pre_stateful") \
PIPELINE_STAGE(SWITCH, IN, ACL_HINT, 8, "ls_in_acl_hint") \
PIPELINE_STAGE(SWITCH, IN, ACL, 9, "ls_in_acl") \
PIPELINE_STAGE(SWITCH, IN, QOS_MARK, 10, "ls_in_qos_mark") \
PIPELINE_STAGE(SWITCH, IN, QOS_METER, 11, "ls_in_qos_meter") \
PIPELINE_STAGE(SWITCH, IN, LB, 12, "ls_in_lb") \
PIPELINE_STAGE(SWITCH, IN, STATEFUL, 13, "ls_in_stateful") \
PIPELINE_STAGE(SWITCH, IN, PRE_HAIRPIN, 14, "ls_in_pre_hairpin") \
PIPELINE_STAGE(SWITCH, IN, NAT_HAIRPIN, 15, "ls_in_nat_hairpin") \
PIPELINE_STAGE(SWITCH, IN, HAIRPIN, 16, "ls_in_hairpin") \
PIPELINE_STAGE(SWITCH, IN, ARP_ND_RSP, 17, "ls_in_arp_rsp") \
PIPELINE_STAGE(SWITCH, IN, DHCP_OPTIONS, 18, "ls_in_dhcp_options") \
PIPELINE_STAGE(SWITCH, IN, DHCP_RESPONSE, 19, "ls_in_dhcp_response") \
PIPELINE_STAGE(SWITCH, IN, DNS_LOOKUP, 20, "ls_in_dns_lookup") \
PIPELINE_STAGE(SWITCH, IN, DNS_RESPONSE, 21, "ls_in_dns_response") \
PIPELINE_STAGE(SWITCH, IN, EXTERNAL_PORT, 22, "ls_in_external_port") \
PIPELINE_STAGE(SWITCH, IN, L2_LKUP, 23, "ls_in_l2_lkup") \
PIPELINE_STAGE(SWITCH, IN, L2_UNKNOWN, 24, "ls_in_l2_unknown") \
\
/* Logical switch egress stages. */ \
PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 0, "ls_out_pre_lb") \
PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 1, "ls_out_pre_acl") \
PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful") \
PIPELINE_STAGE(SWITCH, OUT, LB, 3, "ls_out_lb") \
PIPELINE_STAGE(SWITCH, OUT, ACL_HINT, 4, "ls_out_acl_hint") \
PIPELINE_STAGE(SWITCH, OUT, ACL, 5, "ls_out_acl") \
PIPELINE_STAGE(SWITCH, OUT, QOS_MARK, 6, "ls_out_qos_mark") \
PIPELINE_STAGE(SWITCH, OUT, QOS_METER, 7, "ls_out_qos_meter") \
PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 8, "ls_out_stateful") \
PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP, 9, "ls_out_port_sec_ip") \
PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2, 10, "ls_out_port_sec_l2") \
\
/* Logical router ingress stages. */ \
PIPELINE_STAGE(ROUTER, IN, ADMISSION, 0, "lr_in_admission") \
PIPELINE_STAGE(ROUTER, IN, LOOKUP_NEIGHBOR, 1, "lr_in_lookup_neighbor") \
PIPELINE_STAGE(ROUTER, IN, LEARN_NEIGHBOR, 2, "lr_in_learn_neighbor") \
PIPELINE_STAGE(ROUTER, IN, IP_INPUT, 3, "lr_in_ip_input") \
PIPELINE_STAGE(ROUTER, IN, DEFRAG, 4, "lr_in_defrag") \
PIPELINE_STAGE(ROUTER, IN, UNSNAT, 5, "lr_in_unsnat") \
PIPELINE_STAGE(ROUTER, IN, DNAT, 6, "lr_in_dnat") \
PIPELINE_STAGE(ROUTER, IN, ECMP_STATEFUL, 7, "lr_in_ecmp_stateful") \
PIPELINE_STAGE(ROUTER, IN, ND_RA_OPTIONS, 8, "lr_in_nd_ra_options") \
PIPELINE_STAGE(ROUTER, IN, ND_RA_RESPONSE, 9, "lr_in_nd_ra_response") \
PIPELINE_STAGE(ROUTER, IN, IP_ROUTING, 10, "lr_in_ip_routing") \
PIPELINE_STAGE(ROUTER, IN, IP_ROUTING_ECMP, 11, "lr_in_ip_routing_ecmp") \
PIPELINE_STAGE(ROUTER, IN, POLICY, 12, "lr_in_policy") \
PIPELINE_STAGE(ROUTER, IN, POLICY_ECMP, 13, "lr_in_policy_ecmp") \
PIPELINE_STAGE(ROUTER, IN, ARP_RESOLVE, 14, "lr_in_arp_resolve") \
PIPELINE_STAGE(ROUTER, IN, CHK_PKT_LEN , 15, "lr_in_chk_pkt_len") \
PIPELINE_STAGE(ROUTER, IN, LARGER_PKTS, 16, "lr_in_larger_pkts") \
PIPELINE_STAGE(ROUTER, IN, GW_REDIRECT, 17, "lr_in_gw_redirect") \
PIPELINE_STAGE(ROUTER, IN, ARP_REQUEST, 18, "lr_in_arp_request") \
\
/* Logical router egress stages. */ \
PIPELINE_STAGE(ROUTER, OUT, UNDNAT, 0, "lr_out_undnat") \
PIPELINE_STAGE(ROUTER, OUT, SNAT, 1, "lr_out_snat") \
PIPELINE_STAGE(ROUTER, OUT, EGR_LOOP, 2, "lr_out_egr_loop") \
PIPELINE_STAGE(ROUTER, OUT, DELIVERY, 3, "lr_out_delivery")
#define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME) \
S_##DP_TYPE##_##PIPELINE##_##STAGE \
= OVN_STAGE_BUILD(DP_##DP_TYPE, P_##PIPELINE, TABLE),
PIPELINE_STAGES
#undef PIPELINE_STAGE
};
//The low 8 bits hold the table id, bit 8 the pipeline, and bit 9 the datapath type.
/* Returns an "enum ovn_stage" built from the arguments.
*
* (It's better to use ovn_stage_build() for type-safety reasons, but inline
* functions can't be used in enums or switch cases.) */
#define OVN_STAGE_BUILD(DP_TYPE, PIPELINE, TABLE) \
(((DP_TYPE) << 9) | ((PIPELINE) << 8) | (TABLE))
Expanding the macros above yields definitions like the following (bit layout: dp type | pipeline | table id; DP_SWITCH and P_IN are both 0):
enum ovn_stage {
    S_SWITCH_IN_PORT_SEC_L2 = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 0),  /* 0 | 0 | 00000000 = 0 */
    S_SWITCH_IN_PORT_SEC_IP = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 1),  /* 0 | 0 | 00000001 = 1 */
    S_SWITCH_IN_PORT_SEC_ND = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 2),  /* 0 | 0 | 00000010 = 2 */
    S_SWITCH_IN_LOOKUP_FDB  = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 3),  /* 0 | 0 | 00000011 = 3 */
    ....
};
Since an ovn_stage thus packs three pieces of information, the following helpers extract them from a stage value:
/* Returns the pipeline to which 'stage' belongs. */
static enum ovn_pipeline
ovn_stage_get_pipeline(enum ovn_stage stage)
{
return (stage >> 8) & 1;
}
/* Returns the table to which 'stage' belongs. */
static uint8_t
ovn_stage_get_table(enum ovn_stage stage)
{
return stage & 0xff;
}
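As a quick sanity check, the encode/decode round trip can be reproduced standalone. This is a sketch: the DP_ and P_ enumerator values of 0 and 1 are assumed here, matching the expansion shown above.
#include <stdio.h>

/* Same bit layout as ovn-northd: bit 9 = datapath type, bit 8 = pipeline,
 * low 8 bits = table id. */
enum ovn_datapath_type { DP_SWITCH, DP_ROUTER };   /* assumed values 0 and 1 */
enum ovn_pipeline { P_IN, P_OUT };                 /* assumed values 0 and 1 */

#define OVN_STAGE_BUILD(DP_TYPE, PIPELINE, TABLE) \
    (((DP_TYPE) << 9) | ((PIPELINE) << 8) | (TABLE))

int main(void)
{
    /* S_ROUTER_OUT_DELIVERY from the stage table above: ROUTER, OUT, table 3. */
    unsigned int stage = OVN_STAGE_BUILD(DP_ROUTER, P_OUT, 3);
    printf("stage=0x%x pipeline=%u table=%u\n",
           stage, (stage >> 8) & 1, stage & 0xff);
    /* prints: stage=0x303 pipeline=1 table=3 */
    return 0;
}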
build_lflows constructs struct ovn_lflow entries from datapaths, ports, and the other inputs, and inserts them into the SB Logical_Flow table.
build_lflows(ctx, datapaths, ports, &port_groups, &mcast_groups,
&igmp_groups, &meter_groups, &lbs, &bfd_connections);
build_lswitch_and_lrouter_flows(datapaths, ports,
port_groups, &lflows, mcgroups,
igmp_groups, meter_groups, lbs,
bfd_connections);
char *svc_check_match = xasprintf("eth.dst == %s", svc_monitor_mac);
struct lswitch_flow_build_info lsi = {
.datapaths = datapaths,
.ports = ports,
.port_groups = port_groups,
.lflows = lflows,
.mcgroups = mcgroups,
.igmp_groups = igmp_groups,
.meter_groups = meter_groups,
.lbs = lbs,
.bfd_connections = bfd_connections,
.svc_check_match = svc_check_match,
.match = DS_EMPTY_INITIALIZER,
.actions = DS_EMPTY_INITIALIZER,
};
/* Combined build - all lflow generation from lswitch and lrouter
 * will move here and will be reorganized by iterator type.
 */
HMAP_FOR_EACH (od, key_node, datapaths) {
build_lswitch_and_lrouter_iterate_by_od(od, &lsi);
//(The body of build_lswitch_and_lrouter_iterate_by_od is expanded inline below, hence the lsi-> references.)
/* Build Logical Switch Flows. */
build_lswitch_lflows_pre_acl_and_acl(od, lsi->port_groups, lsi->lflows,
lsi->meter_groups, lsi->lbs);
build_fwd_group_lflows(od, lsi->lflows);
build_lswitch_lflows_admission_control(od, lsi->lflows);
/* Logical VLANs not supported. */
if (!is_vlan_transparent(od)) {
/* Block logical VLANs. */
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100,
"vlan.present", "drop;");
}
/* Broadcast/multicast source address is invalid. */
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "eth.src[40]", "drop;");
build_lswitch_input_port_sec_od(od, lsi->lflows);
build_lswitch_learn_fdb_od(od, lsi->lflows);
build_lswitch_arp_nd_responder_default(od, lsi->lflows);
build_lswitch_dns_lookup_and_response(od, lsi->lflows);
build_lswitch_dhcp_and_dns_defaults(od, lsi->lflows);
build_lswitch_destination_lookup_bmcast(od, lsi->lflows, &lsi->actions);
build_lswitch_output_port_sec_od(od, lsi->lflows);
/* Build Logical Router Flows. */
build_adm_ctrl_flows_for_lrouter(od, lsi->lflows);
build_neigh_learning_flows_for_lrouter(od, lsi->lflows, &lsi->match,
&lsi->actions);
build_ND_RA_flows_for_lrouter(od, lsi->lflows);
build_static_route_flows_for_lrouter(od, lsi->lflows, lsi->ports,
lsi->bfd_connections);
build_mcast_lookup_flows_for_lrouter(od, lsi->lflows, &lsi->match,
&lsi->actions);
build_ingress_policy_flows_for_lrouter(od, lsi->lflows, lsi->ports);
build_arp_resolve_flows_for_lrouter(od, lsi->lflows);
build_check_pkt_len_flows_for_lrouter(od, lsi->lflows, lsi->ports,
&lsi->match, &lsi->actions);
build_gateway_redirect_flows_for_lrouter(od, lsi->lflows, &lsi->match,
&lsi->actions);
build_arp_request_flows_for_lrouter(od, lsi->lflows, &lsi->match,
&lsi->actions);
build_misc_local_traffic_drop_flows_for_lrouter(od, lsi->lflows);
build_lrouter_arp_nd_for_datapath(od, lsi->lflows);
build_lrouter_nat_defrag_and_lb(od, lsi->lflows, lsi->meter_groups,
lsi->lbs, &lsi->match, &lsi->actions);
}
HMAP_FOR_EACH (op, key_node, ports) {
build_lswitch_and_lrouter_iterate_by_op(op, &lsi);
}
HMAP_FOR_EACH (lb, hmap_node, lbs) {
build_lswitch_arp_nd_service_monitor(lb, lsi.lflows,
&lsi.actions,
&lsi.match);
}
build_lswitch_flows(datapaths, lflows);
//Iterate over lflows, inserting each flow into the SB Logical_Flow table.
struct ovn_lflow *next_lflow;
HMAP_FOR_EACH_SAFE (lflow, next_lflow, hmap_node, &lflows) {
//Extract the pipeline (ingress or egress) from lflow->stage.
const char *pipeline = ovn_stage_get_pipeline_name(lflow->stage);
//Extract the table id from lflow->stage.
uint8_t table = ovn_stage_get_table(lflow->stage);
//Insert a new row into the SB Logical_Flow table.
sbflow = sbrec_logical_flow_insert(ctx->ovnsb_txn);
if (lflow->od) {
//Set the logical_datapath column to the Datapath_Binding row lflow->od->sb.
sbrec_logical_flow_set_logical_datapath(sbflow, lflow->od->sb);
}
//Set the logical_dp_group column from lflow->od_group.
ovn_sb_set_lflow_logical_dp_group(ctx, &dp_groups, sbflow, &lflow->od_group);
//Set the pipeline column.
sbrec_logical_flow_set_pipeline(sbflow, pipeline);
//Set the table id.
sbrec_logical_flow_set_table_id(sbflow, table);
//Set the priority.
sbrec_logical_flow_set_priority(sbflow, lflow->priority);
//Set the match expression.
sbrec_logical_flow_set_match(sbflow, lflow->match);
//Set the actions.
sbrec_logical_flow_set_actions(sbflow, lflow->actions);
/* Trim the source locator lflow->where, which looks something like
* "ovn/northd/ovn-northd.c:1234", down to just the part following the
* last slash, e.g. "ovn-northd.c:1234". */
const char *slash = strrchr(lflow->where, '/');
const char *where = slash ? slash + 1 : lflow->where;
struct smap ids = SMAP_INITIALIZER(&ids);
smap_add(&ids, "stage-name", ovn_stage_to_str(lflow->stage));
smap_add(&ids, "source", where);
if (lflow->stage_hint) {
smap_add(&ids, "stage-hint", lflow->stage_hint);
}
//Store the remaining metadata in external_ids.
sbrec_logical_flow_set_external_ids(sbflow, &ids);
smap_destroy(&ids);
ovn_lflow_destroy(&lflows, lflow);
}
hmap_destroy(&lflows);
For example, the following ovn_lflow_add call installs a flow that drops VLAN-tagged packets; the corresponding logical flow is shown below it.
ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present", "drop;");
//resulting logical flow
table=0 (ls_in_port_sec_l2 ), priority=100 , match=(vlan.present), action=(drop;)
- ovnsb_db_run
This checks the chassis column of the SB Port_Binding table. When that column is non-empty, the logical port has been bound to a chassis, so the corresponding logical port in the NB DB must be set to up.
ovnsb_db_run(ctx, ovnsb_idl_loop, &ports, start_time);
/* Handle changes to the 'chassis' column of the 'Port_Binding' table. When
* this column is not empty, it means we need to set the corresponding logical
* port as 'up' in the northbound DB. */
handle_port_binding_changes(ctx, ports, &ha_ref_chassis_map);
const struct sbrec_port_binding *sb;
SBREC_PORT_BINDING_FOR_EACH(sb, ctx->ovnsb_idl) {
struct ovn_port *op = ovn_port_find(ports, sb->logical_port);
//Skip bindings that have no corresponding logical switch port.
if (!op || !op->nbsp) {
/* The logical port doesn't exist for this port binding. This can
* happen under normal circumstances when ovn-northd hasn't gotten
* around to pruning the Port_Binding yet. */
continue;
}
//Default to false.
bool up = false;
//A router-type port peers with a logical router port and is always up.
if (lsp_is_router(op->nbsp)) {
up = true;
//If the chassis column of the Port_Binding row is non-empty, mark the port up.
} else if (sb->chassis) {
up = smap_get_bool(&sb->chassis->other_config,
OVN_FEATURE_PORT_UP_NOTIF, false)
? sb->n_up && sb->up[0]
: true;
}
//Write the up state to the NB logical switch port.
if (!op->nbsp->up || *op->nbsp->up != up) {
nbrec_logical_switch_port_set_up(op->nbsp, &up, 1);
}
if (build_ha_chassis_ref && ctx->ovnsb_txn && sb->chassis) {
/* Check and add the chassis which has claimed this 'sb'
* to the ha chassis group's ref_chassis if required. */
build_ha_chassis_group_ref_chassis(ctx, sb, op,
ha_ref_chassis_map);
}
}
Finally, here are the logical flows generated for a logical switch ls1.
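(The topology assumed here, one switch ls1 with three ports ls1-vm1 to ls1-vm3 and MACs 00:00:00:00:00:01 to :03, could be created with commands along these lines; the exact setup is not part of the original capture:)
root@master:~# ovn-nbctl ls-add ls1
root@master:~# for i in 1 2 3; do ovn-nbctl lsp-add ls1 ls1-vm$i; ovn-nbctl lsp-set-addresses ls1-vm$i "00:00:00:00:00:0$i"; done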
root@master:~# ovn-sbctl lflow-list
Datapath: "ls1" (845314a0-ad79-4ac8-ac44-9fe2421478c2) Pipeline: ingress
//eth.src[40] matches packets whose source MAC has the multicast/broadcast bit set
table=0 (ls_in_port_sec_l2 ), priority=100 , match=(eth.src[40]), action=(drop;)
table=0 (ls_in_port_sec_l2 ), priority=100 , match=(vlan.present), action=(drop;)
table=0 (ls_in_port_sec_l2 ), priority=50 , match=(inport == "ls1-vm1" && eth.src == {00:00:00:00:00:01}), action=(next;)
table=0 (ls_in_port_sec_l2 ), priority=50 , match=(inport == "ls1-vm2" && eth.src == {00:00:00:00:00:02}), action=(next;)
table=0 (ls_in_port_sec_l2 ), priority=50 , match=(inport == "ls1-vm3" && eth.src == {00:00:00:00:00:03}), action=(next;)
//a match of 1 matches every packet
table=1 (ls_in_port_sec_ip ), priority=0 , match=(1), action=(next;)
table=2 (ls_in_port_sec_nd ), priority=90 , match=(inport == "ls1-vm1" && eth.src == 00:00:00:00:00:01 && arp.sha == 00:00:00:00:00:01), action=(next;)
table=2 (ls_in_port_sec_nd ), priority=90 , match=(inport == "ls1-vm1" && eth.src == 00:00:00:00:00:01 && ip6 && nd && ((nd.sll == 00:00:00:00:00:00 || nd.sll == 00:00:00:00:00:01) || ((nd.tll == 00:00:00:00:00:00 || nd.tll == 00:00:00:00:00:01)))), action=(next;)
table=2 (ls_in_port_sec_nd ), priority=90 , match=(inport == "ls1-vm2" && eth.src == 00:00:00:00:00:02 && arp.sha == 00:00:00:00:00:02), action=(next;)
table=2 (ls_in_port_sec_nd ), priority=90 , match=(inport == "ls1-vm2" && eth.src == 00:00:00:00:00:02 && ip6 && nd && ((nd.sll == 00:00:00:00:00:00 || nd.sll == 00:00:00:00:00:02) || ((nd.tll == 00:00:00:00:00:00 || nd.tll == 00:00:00:00:00:02)))), action=(next;)
table=2 (ls_in_port_sec_nd ), priority=90 , match=(inport == "ls1-vm3" && eth.src == 00:00:00:00:00:03 && arp.sha == 00:00:00:00:00:03), action=(next;)
table=2 (ls_in_port_sec_nd ), priority=90 , match=(inport == "ls1-vm3" && eth.src == 00:00:00:00:00:03 && ip6 && nd && ((nd.sll == 00:00:00:00:00:00 || nd.sll == 00:00:00:00:00:03) || ((nd.tll == 00:00:00:00:00:00 || nd.tll == 00:00:00:00:00:03)))), action=(next;)
table=2 (ls_in_port_sec_nd ), priority=80 , match=(inport == "ls1-vm1" && (arp || nd)), action=(drop;)
table=2 (ls_in_port_sec_nd ), priority=80 , match=(inport == "ls1-vm2" && (arp || nd)), action=(drop;)
table=2 (ls_in_port_sec_nd ), priority=80 , match=(inport == "ls1-vm3" && (arp || nd)), action=(drop;)
table=2 (ls_in_port_sec_nd ), priority=0 , match=(1), action=(next;)
table=3 (ls_in_lookup_fdb ), priority=0 , match=(1), action=(next;)
table=4 (ls_in_put_fdb ), priority=0 , match=(1), action=(next;)
table=5 (ls_in_pre_acl ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=5 (ls_in_pre_acl ), priority=0 , match=(1), action=(next;)
table=6 (ls_in_pre_lb ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(next;)
table=6 (ls_in_pre_lb ), priority=110 , match=(nd || nd_rs || nd_ra || mldv1 || mldv2), action=(next;)
table=6 (ls_in_pre_lb ), priority=0 , match=(1), action=(next;)
table=7 (ls_in_pre_stateful ), priority=100 , match=(reg0[0] == 1), action=(ct_next;)
table=7 (ls_in_pre_stateful ), priority=0 , match=(1), action=(next;)
table=8 (ls_in_acl_hint ), priority=0 , match=(1), action=(next;)
table=9 (ls_in_acl ), priority=34000, match=(eth.dst == $svc_monitor_mac), action=(next;)
table=9 (ls_in_acl ), priority=0 , match=(1), action=(next;)
table=10(ls_in_qos_mark ), priority=0 , match=(1), action=(next;)
table=11(ls_in_qos_meter ), priority=0 , match=(1), action=(next;)
table=12(ls_in_lb ), priority=0 , match=(1), action=(next;)
table=13(ls_in_stateful ), priority=100 , match=(reg0[1] == 1), action=(ct_commit { ct_label.blocked = 0; }; next;)
table=13(ls_in_stateful ), priority=100 , match=(reg0[2] == 1 && ip4 && sctp), action=(reg1 = ip4.dst; reg2[0..15] = sctp.dst; ct_lb;)
table=13(ls_in_stateful ), priority=100 , match=(reg0[2] == 1 && ip4 && tcp), action=(reg1 = ip4.dst; reg2[0..15] = tcp.dst; ct_lb;)
table=13(ls_in_stateful ), priority=100 , match=(reg0[2] == 1 && ip4 && udp), action=(reg1 = ip4.dst; reg2[0..15] = udp.dst; ct_lb;)
table=13(ls_in_stateful ), priority=100 , match=(reg0[2] == 1 && ip6 && sctp), action=(xxreg1 = ip6.dst; reg2[0..15] = sctp.dst; ct_lb;)
table=13(ls_in_stateful ), priority=100 , match=(reg0[2] == 1 && ip6 && tcp), action=(xxreg1 = ip6.dst; reg2[0..15] = tcp.dst; ct_lb;)
table=13(ls_in_stateful ), priority=100 , match=(reg0[2] == 1 && ip6 && udp), action=(xxreg1 = ip6.dst; reg2[0..15] = udp.dst; ct_lb;)
table=13(ls_in_stateful ), priority=0 , match=(1), action=(next;)
table=14(ls_in_pre_hairpin ), priority=0 , match=(1), action=(next;)
table=15(ls_in_nat_hairpin ), priority=0 , match=(1), action=(next;)
table=16(ls_in_hairpin ), priority=0 , match=(1), action=(next;)
table=17(ls_in_arp_rsp ), priority=0 , match=(1), action=(next;)
table=18(ls_in_dhcp_options ), priority=0 , match=(1), action=(next;)
table=19(ls_in_dhcp_response), priority=0 , match=(1), action=(next;)
table=20(ls_in_dns_lookup ), priority=0 , match=(1), action=(next;)
table=21(ls_in_dns_response ), priority=0 , match=(1), action=(next;)
table=22(ls_in_external_port), priority=0 , match=(1), action=(next;)
table=23(ls_in_l2_lkup ), priority=110 , match=(eth.dst == $svc_monitor_mac), action=(handle_svc_check(inport);)
table=23(ls_in_l2_lkup ), priority=70 , match=(eth.mcast), action=(outport = "_MC_flood"; output;)
table=23(ls_in_l2_lkup ), priority=50 , match=(eth.dst == 00:00:00:00:00:01), action=(outport = "ls1-vm1"; output;)
table=23(ls_in_l2_lkup ), priority=50 , match=(eth.dst == 00:00:00:00:00:02), action=(outport = "ls1-vm2"; output;)
table=23(ls_in_l2_lkup ), priority=50 , match=(eth.dst == 00:00:00:00:00:03), action=(outport = "ls1-vm3"; output;)
table=23(ls_in_l2_lkup ), priority=0 , match=(1), action=(outport = get_fdb(eth.dst); next;)
table=24(ls_in_l2_unknown ), priority=50 , match=(outport == "none"), action=(drop;)
table=24(ls_in_l2_unknown ), priority=0 , match=(1), action=(output;)
Datapath: "ls1" (845314a0-ad79-4ac8-ac44-9fe2421478c2) Pipeline: egress
table=0 (ls_out_pre_lb ), priority=110 , match=(eth.src == $svc_monitor_mac), action=(next;)
table=0 (ls_out_pre_lb ), priority=110 , match=(nd || nd_rs || nd_ra || mldv1 || mldv2), action=(next;)
table=0 (ls_out_pre_lb ), priority=0 , match=(1), action=(next;)
table=1 (ls_out_pre_acl ), priority=110 , match=(eth.src == $svc_monitor_mac), action=(next;)
table=1 (ls_out_pre_acl ), priority=0 , match=(1), action=(next;)
table=2 (ls_out_pre_stateful), priority=100 , match=(reg0[0] == 1), action=(ct_next;)
table=2 (ls_out_pre_stateful), priority=0 , match=(1), action=(next;)
table=3 (ls_out_lb ), priority=0 , match=(1), action=(next;)
table=4 (ls_out_acl_hint ), priority=0 , match=(1), action=(next;)
table=5 (ls_out_acl ), priority=34000, match=(eth.src == $svc_monitor_mac), action=(next;)
table=5 (ls_out_acl ), priority=0 , match=(1), action=(next;)
table=6 (ls_out_qos_mark ), priority=0 , match=(1), action=(next;)
table=7 (ls_out_qos_meter ), priority=0 , match=(1), action=(next;)
table=8 (ls_out_stateful ), priority=100 , match=(reg0[1] == 1), action=(ct_commit { ct_label.blocked = 0; }; next;)
table=8 (ls_out_stateful ), priority=100 , match=(reg0[2] == 1), action=(ct_lb;)
table=8 (ls_out_stateful ), priority=0 , match=(1), action=(next;)
table=9 (ls_out_port_sec_ip ), priority=0 , match=(1), action=(next;)
table=10(ls_out_port_sec_l2 ), priority=100 , match=(eth.mcast), action=(output;)
table=10(ls_out_port_sec_l2 ), priority=50 , match=(outport == "ls1-vm1" && eth.dst == {00:00:00:00:00:01}), action=(output;)
table=10(ls_out_port_sec_l2 ), priority=50 , match=(outport == "ls1-vm2" && eth.dst == {00:00:00:00:00:02}), action=(output;)
table=10(ls_out_port_sec_l2 ), priority=50 , match=(outport == "ls1-vm3" && eth.dst == {00:00:00:00:00:03}), action=(output;)