ovn-northd Source Code Analysis

ovn-northd is the core daemon of OVN. It translates OVN's high-level configuration into the logical configuration consumed by the ovn-controller daemons; concretely, it converts the conventional logical network configuration stored in the OVN northbound database into logical flows in the OVN southbound database.

The following two options specify how to connect to the northbound and southbound databases:

--ovnnb-db=database
  Specifies the northbound database; the default is unix:/usr/local/var/run/ovn/ovnnb_db.sock
--ovnsb-db=database
  Specifies the southbound database; the default is unix:/usr/local/var/run/ovn/ovnsb_db.sock

Now let's walk through the source code.

int
main(int argc, char *argv[])
    //Initialize the connection to the NB DB.
    //The first argument of ovsdb_idl_create, ovnnb_db, is the path to the NB database.
    //The second argument, nbrec_idl_class, is the NB database schema.
    //The third argument (monitor_everything_by_default) is true: monitor everything in the database by default.
    /* We want to detect (almost) all changes to the ovn-nb db. */
    struct ovsdb_idl_loop ovnnb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
        ovsdb_idl_create(ovnnb_db, &nbrec_idl_class, true, true));
    //Columns whose changes we do not need to be alerted about.
    ovsdb_idl_omit_alert(ovnnb_idl_loop.idl,
                         &nbrec_nb_global_col_nb_cfg_timestamp);
    ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_sb_cfg);
    ovsdb_idl_omit_alert(ovnnb_idl_loop.idl,
                         &nbrec_nb_global_col_sb_cfg_timestamp);
    ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_hv_cfg);
    ovsdb_idl_omit_alert(ovnnb_idl_loop.idl,
                         &nbrec_nb_global_col_hv_cfg_timestamp);

    //Initialize the connection to the SB DB.
    //The first argument of ovsdb_idl_create, ovnsb_db, is the path to the SB database.
    //The second argument, sbrec_idl_class, is the SB database schema.
    //The third argument (monitor_everything_by_default) is false: monitor nothing in the database by default.
    /* We want to detect only selected changes to the ovn-sb db. */
    struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
        ovsdb_idl_create(ovnsb_db, &sbrec_idl_class, false, true));
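
The add_column_noalert() helper used throughout the rest of main() is a tiny wrapper defined in ovn-northd.c: it registers a column for replication but suppresses change alerts for it, because ovn-northd writes these columns itself and does not need to wake up on its own updates:

static void
add_column_noalert(struct ovsdb_idl *idl,
                   const struct ovsdb_idl_column *column)
{
    ovsdb_idl_add_column(idl, column);
    ovsdb_idl_omit_alert(idl, column);
}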
    //Selectively register only the columns we care about.
    //Monitor selected columns of the SB_Global table.
    ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_sb_global);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_nb_cfg);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_options);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_ipsec);
    ...
    //Monitor selected columns of the Logical_Flow table.
    ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_logical_flow);
    add_column_noalert(ovnsb_idl_loop.idl,
                       &sbrec_logical_flow_col_logical_datapath);
    add_column_noalert(ovnsb_idl_loop.idl,
                       &sbrec_logical_flow_col_logical_dp_group);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_pipeline);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_table_id);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_priority);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_match);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_actions);
    ...
    //Monitor selected columns of the Port_Binding table.
    ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_port_binding);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_datapath);
    add_column_noalert(ovnsb_idl_loop.idl,
                       &sbrec_port_binding_col_logical_port);
    add_column_noalert(ovnsb_idl_loop.idl,
                       &sbrec_port_binding_col_tunnel_key);
    add_column_noalert(ovnsb_idl_loop.idl,
                       &sbrec_port_binding_col_parent_port);
    ...
    //Monitor selected columns of the FDB table.
    ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_fdb);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_fdb_col_mac);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_fdb_col_dp_key);
    add_column_noalert(ovnsb_idl_loop.idl, &sbrec_fdb_col_port_key);

    while (!exiting) {
        //Recompute everything from the current database contents.
        ovn_db_run(&ctx, sbrec_chassis_by_name, &ovnsb_idl_loop, ovn_internal_version);
            //Update the southbound database from the northbound database; see section 1 below.
            ovnnb_db_run(ctx, sbrec_chassis_by_name, ovnsb_idl_loop, &datapaths, &ports, &lr_list, start_time, ovn_internal_version);
            //Update the northbound database from the southbound database; see section 2 below.
            ovnsb_db_run(ctx, ovnsb_idl_loop, &ports, start_time);
    }
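
For context, these calls sit inside the standard ovsdb_idl_loop transaction cycle. A simplified sketch of the surrounding loop (SB-lock/leadership handling, unixctl processing and exit handling omitted):

    while (!exiting) {
        //Open one transaction per database for this iteration.
        struct northd_context ctx = {
            .ovnnb_idl = ovnnb_idl_loop.idl,
            .ovnnb_txn = ovsdb_idl_loop_run(&ovnnb_idl_loop),
            .ovnsb_idl = ovnsb_idl_loop.idl,
            .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
        };

        ovn_db_run(&ctx, sbrec_chassis_by_name, &ovnsb_idl_loop,
                   ovn_internal_version);

        //Commit both transactions (a no-op when nothing changed) and block
        //until either database changes again.
        ovsdb_idl_loop_commit_and_wait(&ovnnb_idl_loop);
        ovsdb_idl_loop_commit_and_wait(&ovnsb_idl_loop);
        poll_block();
    }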
  1. ovnnb_db_run
static void
ovnnb_db_run(struct northd_context *ctx,
             struct ovsdb_idl_index *sbrec_chassis_by_name,
             struct ovsdb_idl_loop *sb_loop,
             struct hmap *datapaths, struct hmap *ports,
             struct ovs_list *lr_list,
             int64_t loop_start_time,
             const char *ovn_internal_version)
    //Fetch the NB_Global row first; the option handling below reads nb->options.
    const struct nbrec_nb_global *nb = nbrec_nb_global_first(ctx->ovnnb_idl);

    //Copy the mac_prefix option into the global variable mac_prefix; if the option is unset or all zeros, a random prefix is generated.
    const char *mac_addr_prefix = set_mac_prefix(smap_get(&nb->options, "mac_prefix"));

    //svc_monitor_mac is the MAC address used by the SB Service_Monitor table.
    /* MAC allocated for service monitor usage. Just one mac is allocated
     * for this purpose and ovn-controller's on each chassis will make use
     * of this mac when sending out the packets to monitor the services
     * defined in Service_Monitor Southbound table. Since these packets
     * are all locally handled, having just one mac is good enough. */
    static char svc_monitor_mac[ETH_ADDR_STRLEN + 1];
    static struct eth_addr svc_monitor_mac_ea;

    //If svc_monitor_mac is set in the NB_Global table's options, use that value.
    const char *monitor_mac = smap_get(&nb->options, "svc_monitor_mac");
    if (monitor_mac) {
        if (eth_addr_from_string(monitor_mac, &svc_monitor_mac_ea)) {
            snprintf(svc_monitor_mac, sizeof svc_monitor_mac,
                     ETH_ADDR_FMT, ETH_ADDR_ARGS(svc_monitor_mac_ea));
        } else {
            monitor_mac = NULL;
        }
    }
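
    /* Note: eth_addr_from_string() parses a "xx:xx:xx:xx:xx:xx" string into
     * a struct eth_addr, and ETH_ADDR_FMT / ETH_ADDR_ARGS format one back;
     * ETH_ADDR_STRLEN is the length of that textual form.  All of these come
     * from the OVS library headers, so the snprintf() above simply
     * normalizes whatever string was configured. */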

    struct smap options;
    smap_clone(&options, &nb->options);
    //Store mac_prefix into options.
    smap_add(&options, "mac_prefix", mac_addr_prefix);

    //If the NB_Global table does not set svc_monitor_mac, generate a random one.
    if (!monitor_mac) {
        eth_addr_random(&svc_monitor_mac_ea);
        snprintf(svc_monitor_mac, sizeof svc_monitor_mac,
                 ETH_ADDR_FMT, ETH_ADDR_ARGS(svc_monitor_mac_ea));
        //Store svc_monitor_mac into options.
        smap_replace(&options, "svc_monitor_mac", svc_monitor_mac);
    }

    char *max_tunid = xasprintf("%d", get_ovn_max_dp_key_local(ctx));
    //Store max_tunid into options.
    smap_replace(&options, "max_tunid", max_tunid);
    free(max_tunid);
    //Store northd_internal_version into options.
    smap_replace(&options, "northd_internal_version", ovn_internal_version);

    nbrec_nb_global_verify_options(nb);
    //Finally, write options back to the NB_Global table.
    nbrec_nb_global_set_options(nb, &options);
    
    //The build_xxx functions below are the core of ovn-northd. They all follow the same pattern: update southbound tables from the northbound tables. Let's look at a few key flows.
    
    //Sync the latest logical switches and routers from the NB DB into the SB Datapath_Binding table,
    //allocating a struct ovn_datapath for each datapath and adding it to the hmap datapaths. See section 1.1 below.
    build_datapaths(ctx, datapaths, lr_list);
    //Update the SB Port_Binding table from the NB Logical_Switch_Port and Logical_Router_Port tables. See section 1.2 below.
    build_ports(ctx, sbrec_chassis_by_name, datapaths, ports);
    build_ovn_lbs(ctx, datapaths, ports, &lbs);
    build_ipam(datapaths, ports);
    build_port_group_lswitches(ctx, &port_groups, ports);
    build_lrouter_groups(ports, lr_list);
    build_ip_mcast(ctx, datapaths);
    build_mcast_groups(ctx, datapaths, ports, &mcast_groups, &igmp_groups);
    build_meter_groups(ctx, &meter_groups);
    build_bfd_table(ctx, &bfd_connections, ports);
    //Generate the SB Logical_Flow table from the NB datapaths, ports and other configuration. See section 1.3 below.
    build_lflows(ctx, datapaths, ports, &port_groups, &mcast_groups, &igmp_groups, &meter_groups, &lbs, &bfd_connections);

1.1 build_datapaths

//Collect all datapaths from the NB DB (every logical switch and every logical router counts as one datapath)
//and write them into the SB Datapath_Binding table, one row per datapath.
build_datapaths(ctx, datapaths, lr_list);
    //Read all rows of the SB Datapath_Binding table and the NB Logical_Switch and Logical_Router tables, then classify:
    //datapaths that exist only in the SB DB must eventually be deleted from Datapath_Binding;
    //datapaths that exist only in the NB DB will be inserted into Datapath_Binding;
    //datapaths present in both may need their Datapath_Binding rows updated.
    join_datapaths(ctx, datapaths, &sb_only, &nb_only, &both, lr_list);

    /* Assign explicitly requested tunnel ids first. */
    struct hmap dp_tnlids = HMAP_INITIALIZER(&dp_tnlids);
    struct ovn_datapath *od, *next;
    LIST_FOR_EACH (od, list, &both) {
        ovn_datapath_assign_requested_tnl_id(&dp_tnlids, od);
    }
    LIST_FOR_EACH (od, list, &nb_only) {
        ovn_datapath_assign_requested_tnl_id(&dp_tnlids, od);
    }

    //Record the tunnel_keys already assigned to existing datapaths in dp_tnlids.
    /* Keep nonconflicting tunnel IDs that are already assigned. */
    LIST_FOR_EACH (od, list, &both) {
        if (!od->tunnel_key && ovn_add_tnlid(&dp_tnlids, od->sb->tunnel_key)) {
            od->tunnel_key = od->sb->tunnel_key;
        }
    }
    
    /* Assign new tunnel ids where needed. */
    uint32_t hint = 0;
    LIST_FOR_EACH_SAFE (od, next, list, &both) {
        ovn_datapath_allocate_key(ctx, datapaths, &dp_tnlids, od, &hint);
    }
    
    //Allocate a tunnel_key for each new datapath, counting up from 1.
    LIST_FOR_EACH_SAFE (od, next, list, &nb_only) {
        ovn_datapath_allocate_key(ctx, datapaths, &dp_tnlids, od, &hint);
    }
    
    /* Sync tunnel ids from nb to sb. */
    LIST_FOR_EACH (od, list, &both) {
        if (od->sb->tunnel_key != od->tunnel_key) {
            sbrec_datapath_binding_set_tunnel_key(od->sb, od->tunnel_key);
        }
        ovn_datapath_update_external_ids(od);
    }
    //Datapaths that exist only in the NB DB are inserted into the Datapath_Binding table.
    LIST_FOR_EACH (od, list, &nb_only) {
        od->sb = sbrec_datapath_binding_insert(ctx->ovnsb_txn);
        //Update the external_ids of the Datapath_Binding row.
        ovn_datapath_update_external_ids(od);
        //Update the tunnel_key of the Datapath_Binding row.
        sbrec_datapath_binding_set_tunnel_key(od->sb, od->tunnel_key);
    }
    ovn_destroy_tnlids(&dp_tnlids);

    //Datapaths that exist only in the SB DB must be deleted from the Datapath_Binding table.
    /* Delete southbound records without northbound matches. */
    LIST_FOR_EACH_SAFE (od, next, list, &sb_only) {
        ovs_list_remove(&od->list);
        sbrec_datapath_binding_delete(od->sb);
        ovn_datapath_destroy(datapaths, od);
    }
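
Both ovn_datapath_allocate_key here and the port key allocation in section 1.2 below ultimately rely on ovn_allocate_tnlid() from lib/ovn-util.c to pick a free key. A simplified sketch of its round-robin search ('name' is used only for the error log, omitted here):

//Return the candidate after 'tnlid', wrapping from max back around to min.
static uint32_t
next_tnlid(uint32_t tnlid, uint32_t min, uint32_t max)
{
    return tnlid + 1 <= max ? tnlid + 1 : min;
}

uint32_t
ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
                   uint32_t max, uint32_t *hint)
{
    //Scan from *hint + 1, wrapping around, until a free key is found;
    //ovn_add_tnlid() returns false if the key is already in 'set'.
    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
         tnlid = next_tnlid(tnlid, min, max)) {
        if (ovn_add_tnlid(set, tnlid)) {
            *hint = tnlid;
            return tnlid;
        }
    }
    //Every key in [min, max] is already in use.
    return 0;
}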

1.2 build_ports

build_ports(ctx, sbrec_chassis_by_name, datapaths, ports);
    struct ovs_list sb_only, nb_only, both;
    struct hmap tag_alloc_table = HMAP_INITIALIZER(&tag_alloc_table);
    struct hmap chassis_qdisc_queues = HMAP_INITIALIZER(&chassis_qdisc_queues);

    /* sset which stores the set of ha chassis group names used. */
    struct sset active_ha_chassis_grps =
        SSET_INITIALIZER(&active_ha_chassis_grps);

    //Read all rows of the SB Port_Binding table, then read every port of each datapath:
    //for a logical switch datapath this means the Logical_Switch_Port table,
    //for a logical router datapath the Logical_Router_Port table.
    //Put all ports into ports;
    //ports that exist only in the SB Port_Binding table go into sb_only;
    //ports that exist only in the NB Logical_Switch_Port/Logical_Router_Port tables go into nb_only;
    //ports present in both DBs go into both.
    join_logical_ports(ctx, datapaths, ports, &chassis_qdisc_queues,
                       &tag_alloc_table, &sb_only, &nb_only, &both);

    //If any ports exist only in the SB Port_Binding table (i.e. they are about to be deleted), their stale Mac_Binding entries must be purged.
    /* Purge stale Mac_Bindings if ports are deleted. */
    bool remove_mac_bindings = !ovs_list_is_empty(&sb_only);
    
    //If a port has the requested-tnl-key option, assign the explicitly requested tunnel key first;
    //on success the key is stored in op->tunnel_key.
    //Tunnel keys are assigned/updated only for ports in both and nb_only; ports in sb_only are skipped,
    //since they no longer exist and are about to be deleted.
    /* Assign explicitly requested tunnel ids first. */
    struct ovn_port *op, *next;
    LIST_FOR_EACH (op, list, &both) {
        ovn_port_assign_requested_tnl_id(op);
    }
    LIST_FOR_EACH (op, list, &nb_only) {
        ovn_port_assign_requested_tnl_id(op);
    }

    /* Keep nonconflicting tunnel IDs that are already assigned. */
    LIST_FOR_EACH (op, list, &both) {
        if (!op->tunnel_key) {
            ovn_port_add_tnlid(op, op->sb->tunnel_key);
        }
    }

    //Ports without an explicitly requested tunnel key get one allocated automatically.
    /* Assign new tunnel ids where needed. */
    LIST_FOR_EACH_SAFE (op, next, list, &both) {
        ovn_port_allocate_key(ports, op);
    }
    LIST_FOR_EACH_SAFE (op, next, list, &nb_only) {
        ovn_port_allocate_key(ports, op);
    }

    //Insert the NB ports (both LSPs and LRPs) into the SB Port_Binding table and fill in the related columns.
    /* Add southbound record for each unmatched northbound record. */
    LIST_FOR_EACH_SAFE (op, next, list, &nb_only) {
        op->sb = sbrec_port_binding_insert(ctx->ovnsb_txn);
        ovn_port_update_sbrec(ctx, sbrec_chassis_by_name, op,
                              &chassis_qdisc_queues,
                              &active_ha_chassis_grps);
        sbrec_port_binding_set_logical_port(op->sb, op->key);
    }

    //A port absent from the NB DB must not exist in the SB DB either, so delete the ports found only in the SB DB.
    /* Delete southbound records without northbound matches. */
    if (!ovs_list_is_empty(&sb_only)) {
        LIST_FOR_EACH_SAFE (op, next, list, &sb_only) {
            ovs_list_remove(&op->list);
            sbrec_port_binding_delete(op->sb);
            ovn_port_destroy(ports, op);
        }
    }
    
    //Purge the stale Mac_Binding entries.
    if (remove_mac_bindings) {
        cleanup_mac_bindings(ctx, datapaths, ports);
    }   
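
The explicit-request path above reads the requested-tnl-key option of the logical port (documented in ovn-nb(5)). A simplified sketch of ovn_port_assign_requested_tnl_id(), with the VXLAN-mode range check omitted:

static void
ovn_port_assign_requested_tnl_id(struct ovn_port *op)
{
    //A logical switch port keeps its options in nbsp, a router port in nbrp.
    const struct smap *options = (op->nbsp
                                  ? &op->nbsp->options
                                  : &op->nbrp->options);
    uint32_t tunnel_key = smap_get_int(options, "requested-tnl-key", 0);
    if (tunnel_key && !ovn_port_add_tnlid(op, tunnel_key)) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rl, "Logical port %s requests same tunnel key "
                     "%"PRIu32" as another logical port",
                     op->key, tunnel_key);
    }
}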

1.3 build_lflows
This happens in two passes: ovn_lflow_add first stages each flow in the lflows hmap, then the code iterates over lflows and inserts every flow into the SB Logical_Flow table.

Let's first look at ovn_lflow_add. As shown below it is just a macro that forwards to ovn_lflow_add_at, whose parameters are:
LFLOW_MAP: the hmap that collects the flows
OD: a struct ovn_datapath identifying the datapath
STAGE: an enum ovn_stage value, which combines the datapath type, the packet direction and the table id
PRIORITY: the flow priority
MATCH: the flow match expression
ACTIONS: the flow actions

#define ovn_lflow_add(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS) \
    ovn_lflow_add_at(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS, true, \
                     NULL, OVS_SOURCE_LOCATOR)

//The structure that holds a logical flow.
struct ovn_lflow {
    struct hmap_node hmap_node;

    struct ovn_datapath *od;     /* 'logical_datapath' in SB schema.  */
    struct hmapx od_group;       /* Hash map of 'struct ovn_datapath *'. */
    enum ovn_stage stage;
    uint16_t priority;
    char *match;
    char *actions;
    char *stage_hint;
    const char *where;
};
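
Flows in this hmap are keyed by everything except the datapath itself, which is what lets identical flows from different datapaths share a single entry via od_group. A sketch of the hash, assuming ovn_logical_flow_hash() from lib/ovn-util.c (which hashes table id, pipeline, priority, match and actions):

static uint32_t
ovn_lflow_hash(const struct ovn_lflow *lflow)
{
    //Hash table id, pipeline, priority, match and actions; the datapath is
    //deliberately excluded so equal flows can be grouped across datapaths.
    return ovn_logical_flow_hash(ovn_stage_get_table(lflow->stage),
                                 ovn_stage_get_pipeline_name(lflow->stage),
                                 lflow->priority, lflow->match,
                                 lflow->actions);
}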

Next, let's focus on the enum ovn_stage type. As defined below, it encodes the datapath type (logical switch or logical router),
the packet direction (ingress or egress) and the table id (each table implements a different function).

/* A stage within an OVN logical switch or router.
 *
 * An "enum ovn_stage" indicates whether the stage is part of a logical switch
 * or router, whether the stage is part of the ingress or egress pipeline, and
 * the table within that pipeline.  The first three components are combined to
 * form the stage's full name, e.g. S_SWITCH_IN_PORT_SEC_L2,
 * S_ROUTER_OUT_DELIVERY. */
enum ovn_stage {
#define PIPELINE_STAGES                                                   \
    /* Logical switch ingress stages. */                                  \
    PIPELINE_STAGE(SWITCH, IN,  PORT_SEC_L2,    0, "ls_in_port_sec_l2")   \
    PIPELINE_STAGE(SWITCH, IN,  PORT_SEC_IP,    1, "ls_in_port_sec_ip")   \
    PIPELINE_STAGE(SWITCH, IN,  PORT_SEC_ND,    2, "ls_in_port_sec_nd")   \
    PIPELINE_STAGE(SWITCH, IN,  LOOKUP_FDB ,    3, "ls_in_lookup_fdb")    \
    PIPELINE_STAGE(SWITCH, IN,  PUT_FDB,        4, "ls_in_put_fdb")       \
    PIPELINE_STAGE(SWITCH, IN,  PRE_ACL,        5, "ls_in_pre_acl")       \
    PIPELINE_STAGE(SWITCH, IN,  PRE_LB,         6, "ls_in_pre_lb")        \
    PIPELINE_STAGE(SWITCH, IN,  PRE_STATEFUL,   7, "ls_in_pre_stateful")  \
    PIPELINE_STAGE(SWITCH, IN,  ACL_HINT,       8, "ls_in_acl_hint")      \
    PIPELINE_STAGE(SWITCH, IN,  ACL,            9, "ls_in_acl")           \
    PIPELINE_STAGE(SWITCH, IN,  QOS_MARK,      10, "ls_in_qos_mark")      \
    PIPELINE_STAGE(SWITCH, IN,  QOS_METER,     11, "ls_in_qos_meter")     \
    PIPELINE_STAGE(SWITCH, IN,  LB,            12, "ls_in_lb")            \
    PIPELINE_STAGE(SWITCH, IN,  STATEFUL,      13, "ls_in_stateful")      \
    PIPELINE_STAGE(SWITCH, IN,  PRE_HAIRPIN,   14, "ls_in_pre_hairpin")   \
    PIPELINE_STAGE(SWITCH, IN,  NAT_HAIRPIN,   15, "ls_in_nat_hairpin")   \
    PIPELINE_STAGE(SWITCH, IN,  HAIRPIN,       16, "ls_in_hairpin")       \
    PIPELINE_STAGE(SWITCH, IN,  ARP_ND_RSP,    17, "ls_in_arp_rsp")       \
    PIPELINE_STAGE(SWITCH, IN,  DHCP_OPTIONS,  18, "ls_in_dhcp_options")  \
    PIPELINE_STAGE(SWITCH, IN,  DHCP_RESPONSE, 19, "ls_in_dhcp_response") \
    PIPELINE_STAGE(SWITCH, IN,  DNS_LOOKUP,    20, "ls_in_dns_lookup")    \
    PIPELINE_STAGE(SWITCH, IN,  DNS_RESPONSE,  21, "ls_in_dns_response")  \
    PIPELINE_STAGE(SWITCH, IN,  EXTERNAL_PORT, 22, "ls_in_external_port") \
    PIPELINE_STAGE(SWITCH, IN,  L2_LKUP,       23, "ls_in_l2_lkup")       \
    PIPELINE_STAGE(SWITCH, IN,  L2_UNKNOWN,    24, "ls_in_l2_unknown")    \
                                                                          \
    /* Logical switch egress stages. */                                   \
    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       0, "ls_out_pre_lb")         \
    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      1, "ls_out_pre_acl")        \
    PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")   \
    PIPELINE_STAGE(SWITCH, OUT, LB,           3, "ls_out_lb")             \
    PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,     4, "ls_out_acl_hint")       \
    PIPELINE_STAGE(SWITCH, OUT, ACL,          5, "ls_out_acl")            \
    PIPELINE_STAGE(SWITCH, OUT, QOS_MARK,     6, "ls_out_qos_mark")       \
    PIPELINE_STAGE(SWITCH, OUT, QOS_METER,    7, "ls_out_qos_meter")      \
    PIPELINE_STAGE(SWITCH, OUT, STATEFUL,     8, "ls_out_stateful")       \
    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP,  9, "ls_out_port_sec_ip")    \
    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2, 10, "ls_out_port_sec_l2")    \
                                                                      \
    /* Logical router ingress stages. */                              \
    PIPELINE_STAGE(ROUTER, IN,  ADMISSION,       0, "lr_in_admission")    \
    PIPELINE_STAGE(ROUTER, IN,  LOOKUP_NEIGHBOR, 1, "lr_in_lookup_neighbor") \
    PIPELINE_STAGE(ROUTER, IN,  LEARN_NEIGHBOR,  2, "lr_in_learn_neighbor") \
    PIPELINE_STAGE(ROUTER, IN,  IP_INPUT,        3, "lr_in_ip_input")     \
    PIPELINE_STAGE(ROUTER, IN,  DEFRAG,          4, "lr_in_defrag")       \
    PIPELINE_STAGE(ROUTER, IN,  UNSNAT,          5, "lr_in_unsnat")       \
    PIPELINE_STAGE(ROUTER, IN,  DNAT,            6, "lr_in_dnat")         \
    PIPELINE_STAGE(ROUTER, IN,  ECMP_STATEFUL,   7, "lr_in_ecmp_stateful") \
    PIPELINE_STAGE(ROUTER, IN,  ND_RA_OPTIONS,   8, "lr_in_nd_ra_options") \
    PIPELINE_STAGE(ROUTER, IN,  ND_RA_RESPONSE,  9, "lr_in_nd_ra_response") \
    PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING,      10, "lr_in_ip_routing")   \
    PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING_ECMP, 11, "lr_in_ip_routing_ecmp") \
    PIPELINE_STAGE(ROUTER, IN,  POLICY,          12, "lr_in_policy")       \
    PIPELINE_STAGE(ROUTER, IN,  POLICY_ECMP,     13, "lr_in_policy_ecmp")  \
    PIPELINE_STAGE(ROUTER, IN,  ARP_RESOLVE,     14, "lr_in_arp_resolve")  \
    PIPELINE_STAGE(ROUTER, IN,  CHK_PKT_LEN   ,  15, "lr_in_chk_pkt_len")  \
    PIPELINE_STAGE(ROUTER, IN,  LARGER_PKTS,     16, "lr_in_larger_pkts")  \
    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,     17, "lr_in_gw_redirect")  \
    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,     18, "lr_in_arp_request")  \
                                                                      \
    /* Logical router egress stages. */                               \
    PIPELINE_STAGE(ROUTER, OUT, UNDNAT,    0, "lr_out_undnat")        \
    PIPELINE_STAGE(ROUTER, OUT, SNAT,      1, "lr_out_snat")          \
    PIPELINE_STAGE(ROUTER, OUT, EGR_LOOP,  2, "lr_out_egr_loop")      \
    PIPELINE_STAGE(ROUTER, OUT, DELIVERY,  3, "lr_out_delivery")

#define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME)   \
    S_##DP_TYPE##_##PIPELINE##_##STAGE                          \
        = OVN_STAGE_BUILD(DP_##DP_TYPE, P_##PIPELINE, TABLE),
    PIPELINE_STAGES
#undef PIPELINE_STAGE
};

//The low 8 bits hold the table id, bit 8 the pipeline, and bit 9 the datapath type.
/* Returns an "enum ovn_stage" built from the arguments.
 *
 * (It's better to use ovn_stage_build() for type-safety reasons, but inline
 * functions can't be used in enums or switch cases.) */
#define OVN_STAGE_BUILD(DP_TYPE, PIPELINE, TABLE) \
    (((DP_TYPE) << 9) | ((PIPELINE) << 8) | (TABLE))

Expanding the macros above, ovn_stage works out to the following (the trailing comments show the bit layout dp|pipeline|table):

enum ovn_stage {
    S_SWITCH_IN_PORT_SEC_L2 = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 0),  /* 0|0|00000000 */
    S_SWITCH_IN_PORT_SEC_IP = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 1),  /* 0|0|00000001 */
    S_SWITCH_IN_PORT_SEC_ND = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 2),  /* 0|0|00000010 */
    S_SWITCH_IN_LOOKUP_FDB  = OVN_STAGE_BUILD(DP_SWITCH, P_IN, 3),  /* 0|0|00000011 */
    ....
}
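
A quick worked example of the encoding, assuming the enum values from the source (DP_SWITCH = 0, DP_ROUTER = 1, P_IN = 0, P_OUT = 1):

/* S_ROUTER_OUT_DELIVERY = OVN_STAGE_BUILD(DP_ROUTER, P_OUT, 3)
 *                       = (1 << 9) | (1 << 8) | 3
 *                       = 0x303
 * Decoding 0x303: bit 9 = 1 -> router, bit 8 = 1 -> egress,
 * low byte = 3 -> table 3, i.e. "lr_out_delivery". */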

As shown above, an ovn_stage value packs three pieces of information, so the following helpers are provided to extract them:

/* Returns the pipeline to which 'stage' belongs. */
static enum ovn_pipeline
ovn_stage_get_pipeline(enum ovn_stage stage)
{
    return (stage >> 8) & 1;
}

/* Returns the table to which 'stage' belongs. */
static uint8_t
ovn_stage_get_table(enum ovn_stage stage)
{
    return stage & 0xff;
}
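
The third piece, the datapath type, lives in bit 9. The source recovers it with ovn_stage_to_datapath_type(), implemented as a switch over PIPELINE_STAGES; given the bit layout, a minimal equivalent sketch would be:

/* Returns the datapath type (switch or router) to which 'stage' belongs.
 * Sketch based on the bit layout; the real code uses a switch statement. */
static enum ovn_datapath_type
ovn_stage_get_datapath_type(enum ovn_stage stage)
{
    return (stage >> 9) & 1;
}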

build_lflows constructs struct ovn_lflow flows from datapaths, ports and the other inputs, and inserts them into the Logical_Flow table of the SB DB:

build_lflows(ctx, datapaths, ports, &port_groups, &mcast_groups,
             &igmp_groups, &meter_groups, &lbs, &bfd_connections);
    build_lswitch_and_lrouter_flows(datapaths, ports,
                                    port_groups, &lflows, mcgroups,
                                    igmp_groups, meter_groups, lbs,
                                    bfd_connections);
        char *svc_check_match = xasprintf("eth.dst == %s", svc_monitor_mac);
        struct lswitch_flow_build_info lsi = {
            .datapaths = datapaths,
            .ports = ports,
            .port_groups = port_groups,
            .lflows = lflows,
            .mcgroups = mcgroups,
            .igmp_groups = igmp_groups,
            .meter_groups = meter_groups,
            .lbs = lbs,
            .bfd_connections = bfd_connections,
            .svc_check_match = svc_check_match,
            .match = DS_EMPTY_INITIALIZER,
            .actions = DS_EMPTY_INITIALIZER,
        };
        /* Combined build - all lflow generation from lswitch and lrouter
         * will move here and will be reogranized by iterator type.
         */
        HMAP_FOR_EACH (od, key_node, datapaths) {
            build_lswitch_and_lrouter_iterate_by_od(od, &lsi);
                /* Build Logical Switch Flows. */
                build_lswitch_lflows_pre_acl_and_acl(od, lsi->port_groups, lsi->lflows,
                                                     lsi->meter_groups, lsi->lbs);

                build_fwd_group_lflows(od, lsi->lflows);
                build_lswitch_lflows_admission_control(od, lsi->lflows);
                    /* Logical VLANs not supported. */
                    if (!is_vlan_transparent(od)) {
                        /* Block logical VLANs. */
                        ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100,
                                      "vlan.present", "drop;");
                    }

                    /* Broadcast/multicast source address is invalid. */
                    ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "eth.src[40]", "drop;");
                build_lswitch_input_port_sec_od(od, lsi->lflows);
                build_lswitch_learn_fdb_od(od, lsi->lflows);
                build_lswitch_arp_nd_responder_default(od, lsi->lflows);
                build_lswitch_dns_lookup_and_response(od, lsi->lflows);
                build_lswitch_dhcp_and_dns_defaults(od, lsi->lflows);
                build_lswitch_destination_lookup_bmcast(od, lsi->lflows, &lsi->actions);
                build_lswitch_output_port_sec_od(od, lsi->lflows);

                /* Build Logical Router Flows. */
                build_adm_ctrl_flows_for_lrouter(od, lsi->lflows);
                build_neigh_learning_flows_for_lrouter(od, lsi->lflows, &lsi->match,
                                                       &lsi->actions);
                build_ND_RA_flows_for_lrouter(od, lsi->lflows);
                build_static_route_flows_for_lrouter(od, lsi->lflows, lsi->ports,
                                                     lsi->bfd_connections);
                build_mcast_lookup_flows_for_lrouter(od, lsi->lflows, &lsi->match,
                                                     &lsi->actions);
                build_ingress_policy_flows_for_lrouter(od, lsi->lflows, lsi->ports);
                build_arp_resolve_flows_for_lrouter(od, lsi->lflows);
                build_check_pkt_len_flows_for_lrouter(od, lsi->lflows, lsi->ports,
                                                      &lsi->match, &lsi->actions);
                build_gateway_redirect_flows_for_lrouter(od, lsi->lflows, &lsi->match,
                                                         &lsi->actions);
                build_arp_request_flows_for_lrouter(od, lsi->lflows, &lsi->match,
                                                    &lsi->actions);
                build_misc_local_traffic_drop_flows_for_lrouter(od, lsi->lflows);
                build_lrouter_arp_nd_for_datapath(od, lsi->lflows);
                build_lrouter_nat_defrag_and_lb(od, lsi->lflows, lsi->meter_groups,
                                                lsi->lbs, &lsi->match, &lsi->actions);
        }
        HMAP_FOR_EACH (op, key_node, ports) {
            build_lswitch_and_lrouter_iterate_by_op(op, &lsi);
        }
        HMAP_FOR_EACH (lb, hmap_node, lbs) {
            build_lswitch_arp_nd_service_monitor(lb, lsi.lflows,
                                                 &lsi.actions,
                                                 &lsi.match);
        }
        build_lswitch_flows(datapaths, lflows);

    //Iterate over lflows and insert each lflow into the SB Logical_Flow table.
    struct ovn_lflow *next_lflow;
    HMAP_FOR_EACH_SAFE (lflow, next_lflow, hmap_node, &lflows) {
        //Extract the pipeline (ingress or egress) from lflow->stage.
        const char *pipeline = ovn_stage_get_pipeline_name(lflow->stage);
        //Extract the table id from lflow->stage.
        uint8_t table = ovn_stage_get_table(lflow->stage);
        
        //Insert a new row into the SB DB.
        sbflow = sbrec_logical_flow_insert(ctx->ovnsb_txn);
        if (lflow->od) {
            //Set the logical_datapath column to the Datapath_Binding row lflow->od->sb.
            sbrec_logical_flow_set_logical_datapath(sbflow, lflow->od->sb);
        }
        //Set the logical_dp_group column from lflow->od_group.
        ovn_sb_set_lflow_logical_dp_group(ctx, &dp_groups, sbflow, &lflow->od_group);
        //Set the pipeline column.
        sbrec_logical_flow_set_pipeline(sbflow, pipeline);
        //Set the table id.
        sbrec_logical_flow_set_table_id(sbflow, table);
        //Set the priority.
        sbrec_logical_flow_set_priority(sbflow, lflow->priority);
        //Set the match expression.
        sbrec_logical_flow_set_match(sbflow, lflow->match);
        //Set the actions.
        sbrec_logical_flow_set_actions(sbflow, lflow->actions);

        /* Trim the source locator lflow->where, which looks something like
         * "ovn/northd/ovn-northd.c:1234", down to just the part following the
         * last slash, e.g. "ovn-northd.c:1234". */
        const char *slash = strrchr(lflow->where, '/');
        const char *where = slash ? slash + 1 : lflow->where;

        struct smap ids = SMAP_INITIALIZER(&ids);
        smap_add(&ids, "stage-name", ovn_stage_to_str(lflow->stage));
        smap_add(&ids, "source", where);
        if (lflow->stage_hint) {
            smap_add(&ids, "stage-hint", lflow->stage_hint);
        }
        //Store the remaining metadata in external_ids.
        sbrec_logical_flow_set_external_ids(sbflow, &ids);
        smap_destroy(&ids);

        ovn_lflow_destroy(&lflows, lflow);
    }
    hmap_destroy(&lflows);

For example, the following ovn_lflow_add call installs a flow that drops VLAN-tagged packets; the corresponding logical flow is shown below it.

ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present", "drop;");
//logical flow
table=0 (ls_in_port_sec_l2  ), priority=100  , match=(vlan.present), action=(drop;)
  2. ovnsb_db_run
    Watch the chassis column of the SB Port_Binding table: when it is non-empty, the logical port has been bound to a physical endpoint, so the corresponding logical port in the NB DB must be marked up.
ovnsb_db_run(ctx, ovnsb_idl_loop, &ports, start_time);
    /* Handle changes to the 'chassis' column of the 'Port_Binding' table.  When
     * this column is not empty, it means we need to set the corresponding logical
     * port as 'up' in the northbound DB. */
    handle_port_binding_changes(ctx, ports, &ha_ref_chassis_map);
        const struct sbrec_port_binding *sb;
        SBREC_PORT_BINDING_FOR_EACH(sb, ctx->ovnsb_idl) {
            struct ovn_port *op = ovn_port_find(ports, sb->logical_port);

            //Skip port bindings that are not logical switch ports.
            if (!op || !op->nbsp) {
                /* The logical port doesn't exist for this port binding.  This can
                 * happen under normal circumstances when ovn-northd hasn't gotten
                 * around to pruning the Port_Binding yet. */
                continue;
            }

            //Default to false.
            bool up = false;

            //A router-type port peers with a logical router port and is always up.
            if (lsp_is_router(op->nbsp)) {
                up = true;
            //If the chassis column of the Port_Binding row is non-empty, set up to true.
            } else if (sb->chassis) {
                up = smap_get_bool(&sb->chassis->other_config,
                                   OVN_FEATURE_PORT_UP_NOTIF, false)
                     ? sb->n_up && sb->up[0]
                     : true;
            }

            //Set the up field of the logical switch port in the NB DB.
            if (!op->nbsp->up || *op->nbsp->up != up) {
                nbrec_logical_switch_port_set_up(op->nbsp, &up, 1);
            }

            if (build_ha_chassis_ref && ctx->ovnsb_txn && sb->chassis) {
                /* Check and add the chassis which has claimed this 'sb'
                 * to the ha chassis group's ref_chassis if required. */
                build_ha_chassis_group_ref_chassis(ctx, sb, op,
                                                   ha_ref_chassis_map);
            }
        }

Finally, here is the logical flow dump for the logical switch ls1:

root@master:~# ovn-sbctl lflow-list
Datapath: "ls1" (845314a0-ad79-4ac8-ac44-9fe2421478c2)  Pipeline: ingress
  //eth.src[40] matches packets whose source MAC has the multicast/broadcast bit set
  table=0 (ls_in_port_sec_l2  ), priority=100  , match=(eth.src[40]), action=(drop;)
  table=0 (ls_in_port_sec_l2  ), priority=100  , match=(vlan.present), action=(drop;)
  table=0 (ls_in_port_sec_l2  ), priority=50   , match=(inport == "ls1-vm1" && eth.src == {00:00:00:00:00:01}), action=(next;)
  table=0 (ls_in_port_sec_l2  ), priority=50   , match=(inport == "ls1-vm2" && eth.src == {00:00:00:00:00:02}), action=(next;)
  table=0 (ls_in_port_sec_l2  ), priority=50   , match=(inport == "ls1-vm3" && eth.src == {00:00:00:00:00:03}), action=(next;)
  //a match of 1 matches all packets
  table=1 (ls_in_port_sec_ip  ), priority=0    , match=(1), action=(next;)
  table=2 (ls_in_port_sec_nd  ), priority=90   , match=(inport == "ls1-vm1" && eth.src == 00:00:00:00:00:01 && arp.sha == 00:00:00:00:00:01), action=(next;)
  table=2 (ls_in_port_sec_nd  ), priority=90   , match=(inport == "ls1-vm1" && eth.src == 00:00:00:00:00:01 && ip6 && nd && ((nd.sll == 00:00:00:00:00:00 || nd.sll == 00:00:00:00:00:01) || ((nd.tll == 00:00:00:00:00:00 || nd.tll == 00:00:00:00:00:01)))), action=(next;)
  table=2 (ls_in_port_sec_nd  ), priority=90   , match=(inport == "ls1-vm2" && eth.src == 00:00:00:00:00:02 && arp.sha == 00:00:00:00:00:02), action=(next;)
  table=2 (ls_in_port_sec_nd  ), priority=90   , match=(inport == "ls1-vm2" && eth.src == 00:00:00:00:00:02 && ip6 && nd && ((nd.sll == 00:00:00:00:00:00 || nd.sll == 00:00:00:00:00:02) || ((nd.tll == 00:00:00:00:00:00 || nd.tll == 00:00:00:00:00:02)))), action=(next;)
  table=2 (ls_in_port_sec_nd  ), priority=90   , match=(inport == "ls1-vm3" && eth.src == 00:00:00:00:00:03 && arp.sha == 00:00:00:00:00:03), action=(next;)
  table=2 (ls_in_port_sec_nd  ), priority=90   , match=(inport == "ls1-vm3" && eth.src == 00:00:00:00:00:03 && ip6 && nd && ((nd.sll == 00:00:00:00:00:00 || nd.sll == 00:00:00:00:00:03) || ((nd.tll == 00:00:00:00:00:00 || nd.tll == 00:00:00:00:00:03)))), action=(next;)
  table=2 (ls_in_port_sec_nd  ), priority=80   , match=(inport == "ls1-vm1" && (arp || nd)), action=(drop;)
  table=2 (ls_in_port_sec_nd  ), priority=80   , match=(inport == "ls1-vm2" && (arp || nd)), action=(drop;)
  table=2 (ls_in_port_sec_nd  ), priority=80   , match=(inport == "ls1-vm3" && (arp || nd)), action=(drop;)
  table=2 (ls_in_port_sec_nd  ), priority=0    , match=(1), action=(next;)
  table=3 (ls_in_lookup_fdb   ), priority=0    , match=(1), action=(next;)
  table=4 (ls_in_put_fdb      ), priority=0    , match=(1), action=(next;)
  table=5 (ls_in_pre_acl      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
  table=5 (ls_in_pre_acl      ), priority=0    , match=(1), action=(next;)
  table=6 (ls_in_pre_lb       ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(next;)
  table=6 (ls_in_pre_lb       ), priority=110  , match=(nd || nd_rs || nd_ra || mldv1 || mldv2), action=(next;)
  table=6 (ls_in_pre_lb       ), priority=0    , match=(1), action=(next;)
  table=7 (ls_in_pre_stateful ), priority=100  , match=(reg0[0] == 1), action=(ct_next;)
  table=7 (ls_in_pre_stateful ), priority=0    , match=(1), action=(next;)
  table=8 (ls_in_acl_hint     ), priority=0    , match=(1), action=(next;)
  table=9 (ls_in_acl          ), priority=34000, match=(eth.dst == $svc_monitor_mac), action=(next;)
  table=9 (ls_in_acl          ), priority=0    , match=(1), action=(next;)
  table=10(ls_in_qos_mark     ), priority=0    , match=(1), action=(next;)
  table=11(ls_in_qos_meter    ), priority=0    , match=(1), action=(next;)
  table=12(ls_in_lb           ), priority=0    , match=(1), action=(next;)
  table=13(ls_in_stateful     ), priority=100  , match=(reg0[1] == 1), action=(ct_commit { ct_label.blocked = 0; }; next;)
  table=13(ls_in_stateful     ), priority=100  , match=(reg0[2] == 1 && ip4 && sctp), action=(reg1 = ip4.dst; reg2[0..15] = sctp.dst; ct_lb;)
  table=13(ls_in_stateful     ), priority=100  , match=(reg0[2] == 1 && ip4 && tcp), action=(reg1 = ip4.dst; reg2[0..15] = tcp.dst; ct_lb;)
  table=13(ls_in_stateful     ), priority=100  , match=(reg0[2] == 1 && ip4 && udp), action=(reg1 = ip4.dst; reg2[0..15] = udp.dst; ct_lb;)
  table=13(ls_in_stateful     ), priority=100  , match=(reg0[2] == 1 && ip6 && sctp), action=(xxreg1 = ip6.dst; reg2[0..15] = sctp.dst; ct_lb;)
  table=13(ls_in_stateful     ), priority=100  , match=(reg0[2] == 1 && ip6 && tcp), action=(xxreg1 = ip6.dst; reg2[0..15] = tcp.dst; ct_lb;)
  table=13(ls_in_stateful     ), priority=100  , match=(reg0[2] == 1 && ip6 && udp), action=(xxreg1 = ip6.dst; reg2[0..15] = udp.dst; ct_lb;)
  table=13(ls_in_stateful     ), priority=0    , match=(1), action=(next;)
  table=14(ls_in_pre_hairpin  ), priority=0    , match=(1), action=(next;)
  table=15(ls_in_nat_hairpin  ), priority=0    , match=(1), action=(next;)
  table=16(ls_in_hairpin      ), priority=0    , match=(1), action=(next;)
  table=17(ls_in_arp_rsp      ), priority=0    , match=(1), action=(next;)
  table=18(ls_in_dhcp_options ), priority=0    , match=(1), action=(next;)
  table=19(ls_in_dhcp_response), priority=0    , match=(1), action=(next;)
  table=20(ls_in_dns_lookup   ), priority=0    , match=(1), action=(next;)
  table=21(ls_in_dns_response ), priority=0    , match=(1), action=(next;)
  table=22(ls_in_external_port), priority=0    , match=(1), action=(next;)
  table=23(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac), action=(handle_svc_check(inport);)
  table=23(ls_in_l2_lkup      ), priority=70   , match=(eth.mcast), action=(outport = "_MC_flood"; output;)
  table=23(ls_in_l2_lkup      ), priority=50   , match=(eth.dst == 00:00:00:00:00:01), action=(outport = "ls1-vm1"; output;)
  table=23(ls_in_l2_lkup      ), priority=50   , match=(eth.dst == 00:00:00:00:00:02), action=(outport = "ls1-vm2"; output;)
  table=23(ls_in_l2_lkup      ), priority=50   , match=(eth.dst == 00:00:00:00:00:03), action=(outport = "ls1-vm3"; output;)
  table=23(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
  table=24(ls_in_l2_unknown   ), priority=50   , match=(outport == "none"), action=(drop;)
  table=24(ls_in_l2_unknown   ), priority=0    , match=(1), action=(output;)
  
Datapath: "ls1" (845314a0-ad79-4ac8-ac44-9fe2421478c2)  Pipeline: egress
  table=0 (ls_out_pre_lb      ), priority=110  , match=(eth.src == $svc_monitor_mac), action=(next;)
  table=0 (ls_out_pre_lb      ), priority=110  , match=(nd || nd_rs || nd_ra || mldv1 || mldv2), action=(next;)
  table=0 (ls_out_pre_lb      ), priority=0    , match=(1), action=(next;)
  table=1 (ls_out_pre_acl     ), priority=110  , match=(eth.src == $svc_monitor_mac), action=(next;)
  table=1 (ls_out_pre_acl     ), priority=0    , match=(1), action=(next;)
  table=2 (ls_out_pre_stateful), priority=100  , match=(reg0[0] == 1), action=(ct_next;)
  table=2 (ls_out_pre_stateful), priority=0    , match=(1), action=(next;)
  table=3 (ls_out_lb          ), priority=0    , match=(1), action=(next;)
  table=4 (ls_out_acl_hint    ), priority=0    , match=(1), action=(next;)
  table=5 (ls_out_acl         ), priority=34000, match=(eth.src == $svc_monitor_mac), action=(next;)
  table=5 (ls_out_acl         ), priority=0    , match=(1), action=(next;)
  table=6 (ls_out_qos_mark    ), priority=0    , match=(1), action=(next;)
  table=7 (ls_out_qos_meter   ), priority=0    , match=(1), action=(next;)
  table=8 (ls_out_stateful    ), priority=100  , match=(reg0[1] == 1), action=(ct_commit { ct_label.blocked = 0; }; next;)
  table=8 (ls_out_stateful    ), priority=100  , match=(reg0[2] == 1), action=(ct_lb;)
  table=8 (ls_out_stateful    ), priority=0    , match=(1), action=(next;)
  table=9 (ls_out_port_sec_ip ), priority=0    , match=(1), action=(next;)
  table=10(ls_out_port_sec_l2 ), priority=100  , match=(eth.mcast), action=(output;)
  table=10(ls_out_port_sec_l2 ), priority=50   , match=(outport == "ls1-vm1" && eth.dst == {00:00:00:00:00:01}), action=(output;)
  table=10(ls_out_port_sec_l2 ), priority=50   , match=(outport == "ls1-vm2" && eth.dst == {00:00:00:00:00:02}), action=(output;)
  table=10(ls_out_port_sec_l2 ), priority=50   , match=(outport == "ls1-vm3" && eth.dst == {00:00:00:00:00:03}), action=(output;)
