ovs conntrack及nat

本文分析ovs中对conntrack的支持,分为命令行解析ct action及ovs-vswitchd端对conntrack和nat的处理流程。

根据datapath不同,实现也不一样,但是原理都类似。对于kernel datapath来说,使用kernel的conntrack来实现,对于userspace datapath来说,ovs本身来实现,可参考 lib/conntrack.c 文件。

conntrack相关字段

匹配域

可参考 http://openvswitch.org/support/dist-docs/ovs-fields.7.txt

  1. ct_state 连接状态,可能的值如下
new 通过ct action指定报文经过conntrack模块处理,不一定有commit。通常是数据流的第一个数据包
est 表示conntrack模块看到了报文双向数据流,一定是在commit 的conntrack后
rel 表示和已经存在的conntrack相关,比如icmp不可达消息或者ftp的数据流
rpl 表示反方向的报文
inv 无效的,表示conntrack模块没有正确识别到报文,比如L3/L4 protocol handler没有加载,或者L3/L4 protocol handler认为报文错误
trk 表示报文经过了conntrack模块处理,如果这个flag不设置,其他flag都不能被设置
snat 表示报文经过了snat,即源ip或者源端口被转换
dnat 表示报文经过了dnat,即目的ip或者目的端口被转换

这些flag得结合"+"或者"-"来使用,"+"表示必须匹配,"-"表示必须不匹配。可以同时指定多个flag,比如 ct_state=+trk+new。

数据包经过ct模块处理了就会设置状态 trk。什么叫经过ct模块处理?流表的action指定了ct,并且报文通过了协议验证。
pkt->md.ct_state = CS_TRACKED

什么是 commit?只有ct的action有了commit,才会在内存中建立connection

  1. ct_zone zone用来隔离连接跟踪表项,可以通过ct zone action来设置
  2. ct_mark 32位的值,可以通过 ct exec(set_field: 1->ct_mark)来设置。报文第一次匹配后,通过此action设置ct_mark到报文的metadata,重新注入datapath时,用来匹配流表指定的ct_mark。
  3. ct_label 128位的值,可以通过 ct exec(set_field: 1->ct_label)来设置,用法和ct_mark类似
  4. ct_nw_src / ct_ipv6_src 用来匹配conntrack表项原始方向的源ip
  5. ct_nw_dst / ct_ipv6_dst 用来匹配conntrack表项原始方向的目的ip
  6. ct_nw_proto 用来匹配conntrack表项原始方向的协议类型
  7. ct_tp_src 用来匹配conntrack表项原始方向的源端口号
  8. ct_tp_dst 用来匹配conntrack表项原始方向的目的端口号

匹配域和flow 中下的以下字段对应,用来匹配流表

struct flow {
    ...
    uint8_t ct_state;           /* Connection tracking state. */
    uint8_t ct_nw_proto;        /* CT orig tuple IP protocol. */
    uint16_t ct_zone;           /* Connection tracking zone. */
    uint32_t ct_mark;           /* Connection mark.*/
    ovs_be32 ct_nw_src;         /* CT orig tuple IPv4 source address. */
    ovs_be32 ct_nw_dst;         /* CT orig tuple IPv4 destination address. */
    struct in6_addr ct_ipv6_src; /* CT orig tuple IPv6 source address. */
    struct in6_addr ct_ipv6_dst; /* CT orig tuple IPv6 destination address. */
    ovs_be16 ct_tp_src;         /* CT original tuple source port/ICMP type. */
    ovs_be16 ct_tp_dst;         /* CT original tuple dst port/ICMP code. */
    ...
}

动作

ovs通过ct action实现conntrack,格式如下,ct会将报文送到conntrack模块进行处理
ct([argument][,argument…])

ct支持下面的参数

commit 只有执行了commit,才会在conntrack模块创建conntrack表项
force 配合commit使用:如果已存在的conntrack表项方向与当前报文相反,则强制删除旧表项并按当前方向重新提交连接
table 跳转到指定的table执行
zone 设置zone,隔离conntrack
exec 执行其他action,目前只支持设置ct_mark和ct_label,比如exec(set_field: 1->ct_mark)
alg= 指定alg类型,目前只支持ftp和tftp
nat 指定ip和port

流表例子

#添加nat表项
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, in_port=veth_l0, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333))"

//在一个ct里指定多次nat,只有最后一个nat生效,可参考do_xlate_actions中,ctx->ct_nat_action = ofpact_get_NAT(a)只有一个ctx->ct_nat_action 
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333), nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"

//可以通过指定多个ct,实现fullnat,即同时转换源目的ip。
//但是这两个ct必须指定不同的zone,否则只有第一个ct生效。因为在 handle_nat 中,判断只有zone不一样才会进行后续的nat操作
//错误方式,指定了src和dst nat,但是zone相同,只有前面的snat生效
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333)), ct(commit,nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"

//正确方式,使用不同zone,指定fullnat
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,zone=100, nat(src=10.1.1.240-10.2.2.2:2222-3333)), ct(commit, zone=200, nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"

源码分析

命令行解析ct参数

比如下面这条流表,通过ct_state匹配没经过conntrack处理的报文,一般刚被ovs接收的报文都能匹配到,执行的action是ct,其参数为commit和nat,表示需要创建conntrack表项,同时对报文做snat。
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, in_port=veth_l0, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333))"

重点分析下命令行解析ct action的代码。
先看下面两个结构体,struct ofpact_conntrack用来保存ct后面的参数,并使用另一个结构体struct ofpact_nat专门保存ct的nat信息。

/* OFPACT_NAT.
 *
 * Used for NXAST_NAT.
 *
 * Holds the nat() specification nested inside a ct action: whether to
 * translate source or destination, and the optional address/port ranges. */
struct ofpact_nat {
    struct ofpact ofpact; //header; type is OFPACT_NAT
    uint8_t range_af; /* AF_UNSPEC, AF_INET, or AF_INET6 */
    uint16_t flags;  /* NX_NAT_F_* */
    struct {
        struct {
            uint16_t min;          /* Low end of the L4 port range. */
            uint16_t max;          /* High end of the L4 port range. */
        } proto;
        union {                    /* Address range; valid arm selected
                                    * by 'range_af'. */
            struct {
                ovs_be32 min;
                ovs_be32 max;
            } ipv4;
            struct {
                struct in6_addr min;
                struct in6_addr max;
            } ipv6;
        } addr;
    } range;
};

/* OFPACT_CT.
 *
 * Used for NXAST_CT.
 *
 * Fixed part of a ct action; any nested actions (nat, set_field from
 * exec(...)) follow immediately in 'actions', and ofpact.len covers the
 * total length including them. */
struct ofpact_conntrack {
    OFPACT_PADDED_MEMBERS(
        struct ofpact ofpact; //header, e.g. {ofpact = {type = OFPACT_CT, raw = 255 '\377', len = 32}
        uint16_t flags; //NX_CT_F_COMMIT and/or NX_CT_F_FORCE
        uint16_t zone_imm; //immediate zone id ("zone=N")
        struct mf_subfield zone_src; //zone read from a packet field ("zone=field[a..b]")
        uint16_t alg; //ALG helper type (FTP/TFTP)
        uint8_t recirc_table; //table to recirculate to, or NX_CT_RECIRC_NONE
    );
    struct ofpact actions[0]; //nested actions, e.g. struct ofpact_nat {type = OFPACT_NAT, raw = 255 '\377', len = 48}
};

解析命令行参数 ct 指定的action,保存到 struct ofpact_conntrack,如果同时指定了nat,则nat信息保存在struct ofpact_nat,位置在 struct ofpact_conntrack->actions,最后会更新 struct ofpact_conntrack->ofpact.len 为总长度

/* Parses the argument string of a "ct" OpenFlow action into a
 * struct ofpact_conntrack appended to 'ofpacts'.  Nested actions
 * ("nat", "exec(...)") are appended right after the ofpact_conntrack
 * and accounted for in its ofpact.len by ofpact_finish_CT().
 *
 * Returns NULL on success, otherwise a malloc()'d error string that
 * the caller must free. */
static char * OVS_WARN_UNUSED_RESULT
parse_CT(char *arg, const struct ofputil_port_map *port_map,
         struct ofpbuf *ofpacts, enum ofputil_protocol *usable_protocols)
{
    const size_t ct_offset = ofpacts_pull(ofpacts);
    struct ofpact_conntrack *oc;
    char *error = NULL;
    char *key, *value;

    /* ofpact_put_CT is generated by the ofpact_put_##ENUM macro in
     * ./include/openvswitch/ofp-actions.h; it appends an OFPACT_CT
     * action to 'ofpacts'. */
    oc = ofpact_put_CT(ofpacts);
    oc->flags = 0;
    oc->recirc_table = NX_CT_RECIRC_NONE;
    while (ofputil_parse_key_value(&arg, &key, &value)) {
        if (!strcmp(key, "commit")) {
            oc->flags |= NX_CT_F_COMMIT;
        } else if (!strcmp(key, "force")) {
            oc->flags |= NX_CT_F_FORCE;
        } else if (!strcmp(key, "table")) {
            error = str_to_u8(value, "recirc_table", &oc->recirc_table);
            if (!error && oc->recirc_table == NX_CT_RECIRC_NONE) {
                error = xasprintf("invalid table %#"PRIx8, oc->recirc_table);
            }
        } else if (!strcmp(key, "zone")) {
            error = str_to_u16(value, "zone", &oc->zone_imm);

            /* Not a literal zone number; retry parsing it as a field
             * reference. */
            if (error) {
                free(error);
                error = mf_parse_subfield(&oc->zone_src, value);
                if (error) {
                    return error;
                }
            }
        } else if (!strcmp(key, "alg")) {
            error = str_to_connhelper(value, &oc->alg);
        } else if (!strcmp(key, "nat")) {
            const size_t nat_offset = ofpacts_pull(ofpacts);
            /* Parse the nested nat() specification; it appends a
             * struct ofpact_nat after the CT action. */
            error = parse_NAT(value, port_map, ofpacts, usable_protocols);
            /* Update CT action pointer and length. */
            ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
            oc = ofpacts->header;
        } else if (!strcmp(key, "exec")) {
            /* Hide existing actions from ofpacts_parse_copy(), so the
             * nesting can be handled transparently. */
            enum ofputil_protocol usable_protocols2;
            const size_t exec_offset = ofpacts_pull(ofpacts);

            /* Initializes 'usable_protocol2', fold it back to
             * '*usable_protocols' afterwards, so that we do not lose
             * restrictions already in there. */
            /* Parses the nested exec() actions, e.g. set_field in
             * ct(commit,exec(set_field:1->ct_mark)): the field after
             * "->" (ct_mark) is the key, the value before it (1) is
             * the value; ultimately handled by parse_SET_FIELD. */
            error = ofpacts_parse_copy(value, port_map, ofpacts, &usable_protocols2, false, OFPACT_CT);
            *usable_protocols &= usable_protocols2;
            ofpacts->header = ofpbuf_push_uninit(ofpacts, exec_offset);
            oc = ofpacts->header;
        } else {
            error = xasprintf("invalid argument to \"ct\" action: `%s'", key);
        }
        if (error) {
            break;
        }
    }
    if (!error && oc->flags & NX_CT_F_FORCE && !(oc->flags & NX_CT_F_COMMIT)) {
        error = xasprintf("\"force\" flag requires \"commit\" flag.");
    }
    /* Update struct ofpact_conntrack->ofpact.len to the total length,
     * including any nested nat/exec actions appended above. */
    ofpact_finish_CT(ofpacts, &oc);
    ofpbuf_push_uninit(ofpacts, ct_offset);
    return error;
}

/* Parses the argument of a nested "nat" specification inside a "ct"
 * action into a struct ofpact_nat appended to 'ofpacts'.
 *
 * Returns NULL on success, otherwise a malloc()'d error string that
 * the caller must free. */
static char * OVS_WARN_UNUSED_RESULT
parse_NAT(char *arg,
          const struct ofputil_port_map *port_map OVS_UNUSED,
          struct ofpbuf *ofpacts,
          enum ofputil_protocol *usable_protocols OVS_UNUSED)
{
    struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
    char *key, *value;

    nat->flags = 0;
    nat->range_af = AF_UNSPEC;

    /* Consume the comma-separated key[=value] list one entry at a time. */
    while (ofputil_parse_key_value(&arg, &key, &value)) {
        char *err = NULL;

        if (!strcmp(key, "src") || !strcmp(key, "dst")) {
            /* Same range syntax for both directions; only the flag differs. */
            nat->flags |= strcmp(key, "src") ? NX_NAT_F_DST : NX_NAT_F_SRC;
            err = str_to_nat_range(value, nat);
        } else if (!strcmp(key, "persistent")) {
            nat->flags |= NX_NAT_F_PERSISTENT;
        } else if (!strcmp(key, "hash")) {
            nat->flags |= NX_NAT_F_PROTO_HASH;
        } else if (!strcmp(key, "random")) {
            nat->flags |= NX_NAT_F_PROTO_RANDOM;
        } else {
            err = xasprintf("invalid key \"%s\" in \"nat\" argument",
                            key);
        }
        if (err) {
            return err;
        }
    }

    /* Validate the resulting flag combination. */
    if (nat->flags & NX_NAT_F_SRC && nat->flags & NX_NAT_F_DST) {
        return xasprintf("May only specify one of \"src\" or \"dst\".");
    }
    if (!(nat->flags & NX_NAT_F_SRC || nat->flags & NX_NAT_F_DST)) {
        if (nat->flags) {
            return xasprintf("Flags allowed only with \"src\" or \"dst\".");
        }
        if (nat->range_af != AF_UNSPEC) {
            return xasprintf("Range allowed only with \"src\" or \"dst\".");
        }
    }
    if (nat->flags & NX_NAT_F_PROTO_HASH && nat->flags & NX_NAT_F_PROTO_RANDOM) {
        return xasprintf("Both \"hash\" and \"random\" are not allowed.");
    }

    return NULL;
}

解析成功后,可能的格式如下,其中struct ofpact_conntrack后面紧跟着其他嵌套的action,struct ofpact_conntrack->ofpact.len指定了ct参数总长度,包含nat和set_field的长度。struct ofpact_conntrack结构肯定是在前面,struct ofpact_nat和struct ofpact_set_field根据命令行指定的顺序而定,可以指定多次。

struct ofpact_conntrack(OFPACT_CT) + struct ofpact_nat(OFPACT_NAT) + struct ofpact_set_field(OFPACT_SET_FIELD)

辅助函数
在上面解析代码中,有一些函数定义需要经过宏展开后才能看到,比如ofpact_put_CT,ofpact_finish_CT和ofpact_put_NAT
这些函数都是在头文件./include/openvswitch/ofp-actions.h中定义的。宏OFPACT定义了五个函数,用来根据action类型进行操作。

/* For every action listed in OFPACTS, this macro generates one size
 * constant and five inline helpers keyed by the action's ENUM/STRUCT
 * pair:
 *   - OFPACT_##ENUM##_SIZE: size of the fixed part of the struct (up
 *     to MEMBER when MEMBER is a trailing variable-length member,
 *     otherwise the aligned full struct size),
 *   - ofpact_get_##ENUM() / ofpact_get_##ENUM##_nullable(): checked
 *     downcasts from a generic struct ofpact,
 *   - ofpact_put_##ENUM(): appends the action to an ofpbuf,
 *   - ofpact_init_##ENUM(): initializes an action in place,
 *   - ofpact_finish_##ENUM(): finalizes ofpact.len after any nested
 *     data has been appended. */
#define OFPACT(ENUM, STRUCT, MEMBER, NAME)                              \
    BUILD_ASSERT_DECL(offsetof(struct STRUCT, ofpact) == 0);            \
                                                                        \
    enum { OFPACT_##ENUM##_SIZE                                         \
           = (offsetof(struct STRUCT, MEMBER)                           \
              ? offsetof(struct STRUCT, MEMBER)                         \
              : OFPACT_ALIGN(sizeof(struct STRUCT))) };                 \
                                                                        \
    static inline struct STRUCT *                                       \
    ofpact_get_##ENUM(const struct ofpact *ofpact)                      \
    {                                                                   \
        ovs_assert(ofpact->type == OFPACT_##ENUM);                      \
        return ALIGNED_CAST(struct STRUCT *, ofpact);                   \
    }                                                                   \
                                                                        \
    static inline struct STRUCT *                                       \
    ofpact_get_##ENUM##_nullable(const struct ofpact *ofpact)           \
    {                                                                   \
        ovs_assert(!ofpact || ofpact->type == OFPACT_##ENUM);           \
        return ALIGNED_CAST(struct STRUCT *, ofpact);                   \
    }                                                                   \
                                                                        \
    static inline struct STRUCT *                                       \
    ofpact_put_##ENUM(struct ofpbuf *ofpacts)                           \
    {                                                                   \
        return (struct STRUCT *) ofpact_put(ofpacts, OFPACT_##ENUM,     \
                                            OFPACT_##ENUM##_SIZE);      \
    }                                                                   \
                                                                        \
    static inline void                                                  \
    ofpact_init_##ENUM(struct STRUCT *ofpact)                           \
    {                                                                   \
        ofpact_init(&ofpact->ofpact, OFPACT_##ENUM,                     \
                    OFPACT_##ENUM##_SIZE);                              \
    }                                                                   \
                                                                        \
    static inline void                                                  \
    ofpact_finish_##ENUM(struct ofpbuf *ofpbuf, struct STRUCT **ofpactp) \
    {                                                                   \
        struct ofpact *ofpact = &(*ofpactp)->ofpact;                    \
        ovs_assert(ofpact->type == OFPACT_##ENUM);                      \
        *ofpactp = (struct STRUCT *) ofpact_finish(ofpbuf, ofpact);     \
    }
/* Instantiate the helpers above for every action type. */
OFPACTS
#undef OFPACT

OFPACTS 为如下的宏定义:
   ENUM                    STRUCT              MEMBER  NAME
#define OFPACTS                                                         \
    /* Output. */                                                       \
    OFPACT(OUTPUT,          ofpact_output,      ofpact, "output")       \
    ...
    /* Header changes. */                                               \
    OFPACT(SET_FIELD,       ofpact_set_field,   ofpact, "set_field")    \
    ...
    OFPACT(CT,              ofpact_conntrack,   ofpact, "ct")           \
    OFPACT(CT_CLEAR,        ofpact_null,        ofpact, "ct_clear")     \
    OFPACT(NAT,             ofpact_nat,         ofpact, "nat")          \

比如对于CT action来说,宏展开后为

    /* Illustration: result of expanding OFPACT(CT, ofpact_conntrack,
     * ofpact, "ct").  The trailing backslashes are left over from the
     * macro definition and kept here verbatim. */
    enum { OFPACT_CT_SIZE                                         \
           = (offsetof(struct ofpact_conntrack, ofpact)                           \
              ? offsetof(struct ofpact_conntrack, ofpact)                         \
              : OFPACT_ALIGN(sizeof(struct ofpact_conntrack))) };                 \
                                                                        \
    static inline struct ofpact_conntrack *                                       \
    ofpact_get_CT(const struct ofpact *ofpact)                      \
    {                                                                   \
        ovs_assert(ofpact->type == OFPACT_CT);                      \
        return ALIGNED_CAST(struct ofpact_conntrack *, ofpact);                   \
    }                                                                   \
                                                                        \
    static inline struct ofpact_conntrack *                                       \
    ofpact_get_CT_nullable(const struct ofpact *ofpact)           \
    {                                                                   \
        ovs_assert(!ofpact || ofpact->type == OFPACT_CT);           \
        return ALIGNED_CAST(struct ofpact_conntrack *, ofpact);                   \
    }                                                                   \
                                                                        \
    static inline struct ofpact_conntrack *                                       \
    ofpact_put_CT(struct ofpbuf *ofpacts)                           \
    {                                                                   \
        return (struct ofpact_conntrack *) ofpact_put(ofpacts, OFPACT_CT,     \
                                            OFPACT_CT_SIZE);      \
    }                                                                   \
                                                                        \
    static inline void                                                  \
    ofpact_init_CT(struct ofpact_conntrack *ofpact)                           \
    {                                                                   \
        ofpact_init(&ofpact->ofpact, OFPACT_CT,                     \
                    OFPACT_CT_SIZE);                              \
    }                                                                   \
                                                                        \
    static inline void                                                  \
    ofpact_finish_CT(struct ofpbuf *ofpbuf, struct ofpact_conntrack **ofpactp) \
    {                                                                   \
        struct ofpact *ofpact = &(*ofpactp)->ofpact;                    \
        ovs_assert(ofpact->type == OFPACT_CT);                      \
        *ofpactp = (struct ofpact_conntrack *) ofpact_finish(ofpbuf, ofpact);     \
    }

ovs-vswitchd端处理

ovs-vswitchd接收到命令行添加流表消息并解析后,添加到本地flow table中,等待匹配报文。

slowpath解析ct action
ovs接收到报文后,查找fastpath失败,继续slowpath查找,如果匹配到的流表的action为ct,处理流程如下

do_xlate_actions
const struct ofpact *a;
OFPACT_FOR_EACH (a, ofpacts, ofpacts_len)
    switch (a->type)
    //action为CT
    case OFPACT_CT:
        //ofpact_get_CT获取struct ofpact_conntrack及其后面嵌套的action
        compose_conntrack_action(ctx, ofpact_get_CT(a));

将 struct ofpact_conntrack 结构中action信息转换到datapath能识别的action结构odp_actions中。

static void
compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc)
    //内部再次调用do_xlate_actions,解析nat和ct_mark,ct_label信息
    do_xlate_actions(ofc->actions, ofpact_ct_get_action_len(ofc), ctx);
        //获取nat信息,保存到 ctx->ct_nat_action,如果指定了多次nat,只有最后一次会生效
        case OFPACT_NAT:
            /* This will be processed by compose_conntrack_action(). */
            ctx->ct_nat_action = ofpact_get_NAT(a);
            break;

        //解析 ct_mark 或者 ct_label 并保存到 flow->ct_mark和 flow->ct_label
        case OFPACT_SET_FIELD:
            set_field = ofpact_get_SET_FIELD(a);
            mf = set_field->field;

            /* Set the field only if the packet actually has it. */
            if (mf_are_prereqs_ok(mf, flow, wc)) {
                mf_mask_field_masked(mf, ofpact_set_field_mask(set_field), wc);
                mf_set_flow_value_masked(mf, set_field->value,
                                         ofpact_set_field_mask(set_field),
                                         flow);
    if (ofc->zone_src.field) {
        zone = mf_get_subfield(&ofc->zone_src, &ctx->xin->flow);
    } else {
        zone = ofc->zone_imm;
    }

    //添加第一个 datapath action OVS_ACTION_ATTR_CT
    //OVS_ACTION_ATTR_CT 开始
    ct_offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_CT);
    if (ofc->flags & NX_CT_F_COMMIT) {
        nl_msg_put_flag(ctx->odp_actions, ofc->flags & NX_CT_F_FORCE ? OVS_CT_ATTR_FORCE_COMMIT : OVS_CT_ATTR_COMMIT);
        if (ctx->xbridge->support.ct_eventmask) {
            nl_msg_put_u32(ctx->odp_actions, OVS_CT_ATTR_EVENTMASK, OVS_CT_EVENTMASK_DEFAULT);
        }
    }

    nl_msg_put_u16(ctx->odp_actions, OVS_CT_ATTR_ZONE, zone);
    put_ct_mark(&ctx->xin->flow, ctx->odp_actions, ctx->wc);
        if (wc->masks.ct_mark) {
            struct {
                uint32_t key;
                uint32_t mask;
            } *odp_ct_mark;

            odp_ct_mark = nl_msg_put_unspec_uninit(odp_actions, OVS_CT_ATTR_MARK, sizeof(*odp_ct_mark));
            odp_ct_mark->key = flow->ct_mark & wc->masks.ct_mark;
            odp_ct_mark->mask = wc->masks.ct_mark;
        }
    put_ct_label(&ctx->xin->flow, ctx->odp_actions, ctx->wc);

    put_ct_helper(ctx, ctx->odp_actions, ofc);
        if (ofc->alg) {
            switch(ofc->alg) {
            case IPPORT_FTP:
                nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "ftp");
                break;
            case IPPORT_TFTP:
                nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "tftp");
                break;
            default:
                xlate_report_error(ctx, "cannot serialize ct_helper %d", ofc->alg);
                break;
            }
        }

    put_ct_nat(ctx);
        struct ofpact_nat *ofn = ctx->ct_nat_action;
        nat_offset = nl_msg_start_nested(ctx->odp_actions, OVS_CT_ATTR_NAT);
        if (ofn->flags & NX_NAT_F_SRC || ofn->flags & NX_NAT_F_DST) {
            nl_msg_put_flag(ctx->odp_actions, ofn->flags & NX_NAT_F_SRC
                            ? OVS_NAT_ATTR_SRC : OVS_NAT_ATTR_DST);
            if (ofn->flags & NX_NAT_F_PERSISTENT) {
                nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PERSISTENT);
            }
            if (ofn->flags & NX_NAT_F_PROTO_HASH) {
                nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_HASH);
            } else if (ofn->flags & NX_NAT_F_PROTO_RANDOM) {
                nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_RANDOM);
            }
            ...
        }
        nl_msg_end_nested(ctx->odp_actions, nat_offset);

    ctx->ct_nat_action = NULL;
    //OVS_ACTION_ATTR_CT 结束
    nl_msg_end_nested(ctx->odp_actions, ct_offset);

    //如果配置 ct(table=x) 则需要添加第二个 datapath action OVS_ACTION_ATTR_RECIRC
    //recirc_table 值为table id,表示需要转到其他table继续执行,比如 actions=ct(table=0)
    //值为 NX_CT_RECIRC_NONE,说明不需要
    if (ofc->recirc_table == NX_CT_RECIRC_NONE) {
        /* If we do not recirculate as part of this action, hide the results of
         * connection tracking from subsequent recirculations. */
        ctx->conntracked = false;
    } else {
        /* Use ct_* fields from datapath during recirculation upcall. */
        ctx->conntracked = true;
        compose_recirculate_and_fork(ctx, ofc->recirc_table);
            uint32_t recirc_id;
            ctx->freezing = true;
            recirc_id = finish_freezing__(ctx, table);
                struct frozen_state state = {
                    //保存需要跳转到的 table id,即 recirc_table
                    .table_id = table,
                    .ofproto_uuid = ctx->xbridge->ofproto->uuid,
                    .stack = ctx->stack.data,
                    .stack_size = ctx->stack.size,
                    .mirrors = ctx->mirrors,
                    .conntracked = ctx->conntracked,
                    .xport_uuid = ctx->xin->xport_uuid,
                    .ofpacts = ctx->frozen_actions.data,
                    .ofpacts_len = ctx->frozen_actions.size,
                    .action_set = ctx->action_set.data,
                    .action_set_len = ctx->action_set.size,
                };
                frozen_metadata_from_flow(&state.metadata, &ctx->xin->flow);

                //获取 recirc_id,保存到 odp_actions,作为datapath的其中一个action
                id = recirc_alloc_id_ctx(&state);
                    uint32_t hash = frozen_state_hash(state);
                    struct recirc_id_node *node = recirc_ref_equal(state, hash);
                    node = recirc_alloc_id__(state, hash);
                        struct recirc_id_node *node = xzalloc(sizeof *node);
                        node->hash = hash;
                        ovs_refcount_init(&node->refcount);
                        frozen_state_clone(CONST_CAST(struct frozen_state *, &node->state), state);
                        cmap_insert(&id_map, &node->id_node, node->id);
                        cmap_insert(&metadata_map, &node->metadata_node, node->hash);
                        return node;
                    node->id;
                nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, id);
        ctx->conntracked = false;
    }

通过上面的解析,对于openflow中的action ct会生成在datapath中用的action,可能包含两种action: OVS_ACTION_ATTR_CT和OVS_ACTION_ATTR_RECIRC,前者又包含了commit(OVS_CT_ATTR_COMMIT或OVS_CT_ATTR_FORCE_COMMIT),ct_mark(OVS_CT_ATTR_MARK), ct_label和nat(OVS_CT_ATTR_NAT)等信息,后者仅仅包含了recirc_id,用来在报文重新注入datapath后查找对应的table id,即用来跳转到指定table执行。

fastpath执行ct action
在将上面获取到的action添加到datapath后,还需要立即对触发slowpath的报文执行action。

packet_batch_per_flow_execute
    actions = dp_netdev_flow_get_actions(flow);
    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow, actions->actions, actions->size, now);
        odp_execute_actions
            //遍历执行匹配流表的所有 actions
            NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len)
                int type = nl_attr_type(a);
                //dp_execute_cb
                dp_execute_action(dp, batch, a, may_steal);
                    //执行 ct action
                    case OVS_ACTION_ATTR_CT: {
                        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), nl_attr_get_size(a)) {
                            enum ovs_ct_attr sub_type = nl_attr_type(b);
                            switch(sub_type) {
                            case OVS_CT_ATTR_FORCE_COMMIT:
                                force = true;
                                /* fall through. */
                            case OVS_CT_ATTR_COMMIT:
                                commit = true;
                                break;
                            case OVS_CT_ATTR_ZONE:
                                zone = nl_attr_get_u16(b);
                                break;
                            case OVS_CT_ATTR_HELPER:
                                helper = nl_attr_get_string(b);
                                break;
                            case OVS_CT_ATTR_MARK:
                                setmark = nl_attr_get(b);
                                break;
                            case OVS_CT_ATTR_LABELS:
                                setlabel = nl_attr_get(b);
                                break;
                            case OVS_CT_ATTR_EVENTMASK:
                                /* Silently ignored, as userspace datapath does not generate
                                 * netlink events. */
                                break;
                            case OVS_CT_ATTR_NAT: {
                                const struct nlattr *b_nest;
                                unsigned int left_nest;
                                bool ip_min_specified = false;
                                bool proto_num_min_specified = false;
                                bool ip_max_specified = false;
                                bool proto_num_max_specified = false;
                                memset(&nat_action_info, 0, sizeof nat_action_info);
                                nat_action_info_ref = &nat_action_info;

                                NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
                                    enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);

                                    switch (sub_type_nest) {
                                    case OVS_NAT_ATTR_SRC:
                                    case OVS_NAT_ATTR_DST:
                                        nat_config = true;
                                        nat_action_info.nat_action |=
                                            ((sub_type_nest == OVS_NAT_ATTR_SRC)
                                                ? NAT_ACTION_SRC : NAT_ACTION_DST);
                                        break;
                                    case OVS_NAT_ATTR_IP_MIN:
                                        memcpy(&nat_action_info.min_addr,
                                               nl_attr_get(b_nest),
                                               nl_attr_get_size(b_nest));
                                        ip_min_specified = true;
                                        break;
                                    case OVS_NAT_ATTR_IP_MAX:
                                        memcpy(&nat_action_info.max_addr,
                                               nl_attr_get(b_nest),
                                               nl_attr_get_size(b_nest));
                                        ip_max_specified = true;
                                        break;
                                    case OVS_NAT_ATTR_PROTO_MIN:
                                        nat_action_info.min_port =
                                            nl_attr_get_u16(b_nest);
                                        proto_num_min_specified = true;
                                        break;
                                    case OVS_NAT_ATTR_PROTO_MAX:
                                        nat_action_info.max_port =
                                            nl_attr_get_u16(b_nest);
                                        proto_num_max_specified = true;
                                        break;
                                    //persistent,hash和random在 userspace datapath中没用到
                                    case OVS_NAT_ATTR_PERSISTENT:
                                    case OVS_NAT_ATTR_PROTO_HASH:
                                    case OVS_NAT_ATTR_PROTO_RANDOM:
                                        break;
                                    case OVS_NAT_ATTR_UNSPEC:
                                    case __OVS_NAT_ATTR_MAX:
                                        OVS_NOT_REACHED();
                                    }
                                }

                                if (ip_min_specified && !ip_max_specified) {
                                    nat_action_info.max_addr = nat_action_info.min_addr;
                                }
                                if (proto_num_min_specified && !proto_num_max_specified) {
                                    nat_action_info.max_port = nat_action_info.min_port;
                                }
                                if (proto_num_min_specified || proto_num_max_specified) {
                                    if (nat_action_info.nat_action & NAT_ACTION_SRC) {
                                        nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
                                    } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
                                        nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
                                    }
                                }
                                break;
                            }
                            }
                        }

                        /* We won't be able to function properly in this case, hence
                         * complain loudly. */
                        if (nat_config && !commit) {
                            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
                            VLOG_WARN_RL(&rl, "NAT specified without commit.");
                        }
                        //struct dp_netdev 是全局的,所以 dp->conntrack 也是全局的,多个pmd共享dp->conntrack
                        struct dp_netdev *dp = pmd->dp;
                        conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force, commit, zone, setmark, setlabel, helper, nat_action_info_ref);
                    //跳转到其他table执行
                    case OVS_ACTION_ATTR_RECIRC:
                        if (*depth < MAX_RECIRC_DEPTH) {
                            struct dp_packet_batch recirc_pkts;

                            if (!may_steal) {
                               dp_packet_batch_clone(&recirc_pkts, packets_);
                               packets_ = &recirc_pkts;
                            }

                            struct dp_packet *packet;
                            DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
                                //获取 recirc_id
                                packet->md.recirc_id = nl_attr_get_u32(a);
                            }

                            (*depth)++;
                            //重新进入 slowpath,查找指定table的openflow流表
                            dp_netdev_recirculate(pmd, packets_);
                                dp_netdev_input__(pmd, packets, true, 0);
                                    emc_processing
                                    fast_path_processing
                            (*depth)--;

                            return;
                        }

对于 ct action 来说,conntrack_execute 是主要处理函数。下面是其核心流程的带注释摘录(关键子调用的实现被内联展开,便于阅读,并非逐字源码):

int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                  ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
                  const uint32_t *setmark,
                  const struct ovs_key_ct_labels *setlabel,
                  const char *helper,
                  const struct nat_action_info_t *nat_action_info)
    for (size_t i = 0; i < cnt; i++) {
        //从 pkts 中提取报文信息到 ct->key,并判断报文是否合法
        if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone))
            ctx->key.zone = zone;
            ctx->key.dl_type = dl_type;
            extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, !hwol_good_l3_csum);
                key->src.addr.ipv4 = ip->ip_src;
                key->dst.addr.ipv4 = ip->ip_dst;
                key->nw_proto = ip->ip_proto;
            extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3, !hwol_good_l4_csum);
            //计算 hash 值
            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
        {
            //如果报文不合法,则设置 CS_INVALID 后,继续处理下一个报文
            pkts[i]->md.ct_state = CS_INVALID;
            write_ct_md(pkts[i], zone, NULL, NULL, NULL);
            continue;
        }
        //开始处理合法报文
        process_one(ct, pkts[i], &ctx, zone, force, commit, now, setmark, setlabel, nat_action_info, helper);
            struct conn *conn;
            //根据 hash 值,得出一个 hash 桶
            unsigned bucket = hash_to_bucket(ctx->hash);
                #define CONNTRACK_BUCKETS_SHIFT 8
                #define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
                //hash 桶大小 256
                return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;

            //根据 ctx->key 查找 conn,如果是reply方向数据流,则设置reply标志
            conn_key_lookup(&ct->buckets[bucket], ctx, now);
                uint32_t hash = ctx->hash;
                struct conn *conn;
                HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
                    if (!conn_key_cmp(&conn->key, &ctx->key)
                            && !conn_expired(conn, now)) {
                        ctx->conn = conn;
                        ctx->reply = false;
                        break;
                    }
                    if (!conn_key_cmp(&conn->rev_key, &ctx->key)
                            && !conn_expired(conn, now)) {
                        ctx->conn = conn;
                        ctx->reply = true;
                        break;
                    }
                }
            conn = ctx->conn;

            /* Delete found entry if in wrong direction. 'force' implies commit. */
            if (conn && force && ctx->reply) {
                conn_clean(ct, conn, &ct->buckets[bucket]);
                conn = NULL;
            }

            bool create_new_conn = false;
            struct conn conn_for_un_nat_copy;
            conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
            bool ftp_ctl = is_ftp_ctl(pkt);

            if (OVS_LIKELY(conn)) {
                if (ftp_ctl) {
                    /* Keep sequence tracking in sync with the source of the
                     * sequence skew. */
                    if (ctx->reply != conn->seq_skew_dir) {
                        handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                                       !!nat_action_info);
                        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                            bucket);
                    } else {
                        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                            bucket);
                        handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                                       !!nat_action_info);
                    }
                } else {
                    create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                        bucket);
                }
                if (nat_action_info && !create_new_conn) {
                    handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
                }

            }else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
                                       nat_action_info)) {
                create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                    bucket);
            } else {
                if (ctx->icmp_related) {
                    /* An icmp related conn should always be found; no new
                       connection is created based on an icmp related packet. */
                    pkt->md.ct_state = CS_INVALID;
                } else {
                    create_new_conn = true;
                }
            }

            if (OVS_UNLIKELY(create_new_conn)) {
                conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info, &conn_for_un_nat_copy, helper, alg_exp);
                    unsigned bucket = hash_to_bucket(ctx->hash);
                    struct conn *nc = NULL;
                    
                    //四层协议判断报文是否有效
                    if (!valid_new(pkt, &ctx->key))
                        return l4_protos[key->nw_proto]->valid_new(pkt);
                    {
                        pkt->md.ct_state = CS_INVALID;
                        return nc;
                    }

                    //设置 CS_NEW
                    pkt->md.ct_state = CS_NEW;

                    //只有设置了 commit,才会将conn添加到hash表
                    if (commit) {
                        //判断是否超过 conn 表项最大限制
                        unsigned int n_conn_limit;
                        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
                        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
                            COVERAGE_INC(conntrack_full);
                            return nc;
                        }

                        //创建新表项
                        nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
                            struct conn *newconn;
                            //tcp_new_conn
                            newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
                            newconn->key = *key;
                            return newconn;

                        ctx->conn = nc;
                        nc->rev_key = nc->key;
                        //翻转key
                        conn_key_reverse(&nc->rev_key);

                        if (nat_action_info) {
                            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
                            if (alg_exp) {
                            } else {
                                *conn_for_un_nat_copy = *nc;
                                ct_rwlock_wrlock(&ct->resources_lock);
                                //根据nat配置,选择合适的ip和port
                                bool nat_res = nat_select_range_tuple(ct, nc, conn_for_un_nat_copy);
                                    bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys, nat_conn, ct->hash_basis);
                                        //将 nat 的conn插入 nat_conn_keys
                                        hmap_insert(nat_conn_keys, &nat_conn_key->node, nat_conn_key_hash);
                                if (!nat_res) {
                                    goto nat_res_exhaustion;
                                }
                                /* Update nc with nat adjustments made to
                                 * conn_for_un_nat_copy by nat_select_range_tuple(). */
                                *nc = *conn_for_un_nat_copy;
                                ct_rwlock_unlock(&ct->resources_lock);
                            }
                            //设置 conn_type 为 CT_CONN_TYPE_UN_NAT,表示此表项需要nat
                            conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
                            conn_for_un_nat_copy->nat_info = NULL;
                            conn_for_un_nat_copy->alg = NULL;
                            //将报文做nat转换
                            nat_packet(pkt, nc, ctx->icmp_related);
                                if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
                                    pkt->md.ct_state |= CS_SRC_NAT;
                                    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
                                        struct ip_header *nh = dp_packet_l3(pkt);
                                        packet_set_ipv4_addr(pkt, &nh->ip_src, conn->rev_key.dst.addr.ipv4_aligned);
                                    }
                                    if (!related) {
                                        pat_packet(pkt, conn);
                                    }
                                } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
                                    pkt->md.ct_state |= CS_DST_NAT;
                                }
                        }
                        //将新建表项插入hash表
                        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
                        //增加表项个数
                        atomic_count_inc(&ct->n_conn);
                    }
                    return nc;
            }

            write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
                pkt->md.ct_state |= CS_TRACKED;
                pkt->md.ct_zone = zone;
                pkt->md.ct_mark = conn ? conn->mark : 0;
                pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;

                pkt->md.ct_orig_tuple_ipv6 = false;
                if (key) {
                    if (key->dl_type == htons(ETH_TYPE_IP)) {
                        //ct_orig_tuple 保存原始报文(第一次进ct模块时)的五元组信息
                        pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
                            key->src.addr.ipv4_aligned,
                            key->dst.addr.ipv4_aligned,
                            key->nw_proto != IPPROTO_ICMP
                            ? key->src.port : htons(key->src.icmp_type),
                            key->nw_proto != IPPROTO_ICMP
                            ? key->dst.port : htons(key->src.icmp_code),
                            key->nw_proto,
                        };
                    }
                }

            if (conn && setmark) {
                set_mark(pkt, conn, setmark[0], setmark[1]);
            }

            if (conn && setlabel) {
                set_label(pkt, conn, &setlabel[0], &setlabel[1]);
            }
    }
}

清除conntrack表项

创建 datapath 时(create_dp_netdev 中调用 conntrack_init),会启动专门的清理线程 clean_thread_main,周期性地清除已超时过期的 conntrack 表项:

create_dp_netdev
    struct dp_netdev *dp;

    dp = xzalloc(sizeof *dp);
    conntrack_init(&dp->conntrack);
        ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
        
/* Background sweeper for the userspace conntrack table.  Repeatedly asks
 * conntrack_clean() to expire old connections, then sleeps until either the
 * suggested next wakeup time or an exit request on 'clean_thread_exit'. */
static void *
clean_thread_main(void *f_)
{
    struct conntrack *ct = f_;

    while (!latch_is_set(&ct->clean_thread_exit)) {
        long long now = time_msec();
        long long next_wakeup = conntrack_clean(ct, now);

        /* If the cleaner fell behind schedule (next_wakeup already in the
         * past), retry after the minimum interval; otherwise sleep until the
         * suggested time, but never sooner than CT_CLEAN_INTERVAL from now. */
        long long deadline = next_wakeup < now
                             ? now + CT_CLEAN_MIN_INTERVAL
                             : MAX(next_wakeup, now + CT_CLEAN_INTERVAL);
        poll_timer_wait_until(deadline);

        /* Also wake up promptly if the exit latch is set. */
        latch_wait(&ct->clean_thread_exit);
        poll_block();
    }

    return NULL;
}

参考

https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/
https://zhuanlan.zhihu.com/p/25089778

你可能感兴趣的:(ovs conntrack及nat)