This article analyzes conntrack support in OVS, covering both the command-line parsing of the ct action and the handling of conntrack and NAT on the ovs-vswitchd side.
The implementation differs per datapath, but the principles are similar. The kernel datapath reuses the kernel's conntrack; the userspace datapath implements conntrack itself, see lib/conntrack.c.
Conntrack-related fields
Match fields
See http://openvswitch.org/support/dist-docs/ovs-fields.7.txt
- ct_state: the connection state; possible values are
new: the packet was sent through the conntrack module via a ct action, not necessarily with commit. Usually the first packet of a flow.
est: conntrack has seen traffic in both directions; this can only appear after the connection was committed.
rel: related to an existing connection, e.g. an ICMP unreachable message or an FTP data connection.
rpl: the packet is in the reply direction.
inv: invalid; conntrack could not identify the packet correctly, e.g. the L3/L4 protocol handler is not loaded, or the L3/L4 protocol handler considers the packet malformed.
trk: the packet has been processed by the conntrack module; unless this flag is set, none of the other flags can be set.
snat: the packet went through SNAT (source IP or port rewritten).
dnat: the packet went through DNAT (destination IP or port rewritten).
These flags are used with a "+" or "-" prefix: "+" means the flag must match, "-" means it must not. Several flags can be combined, e.g. ct_state=+trk+new; a fuller example follows the field list below.
A packet gets the trk state once the ct module has processed it. What counts as "processed by the ct module"? The flow's action specified ct, and the packet passed protocol validation:
pkt->md.ct_state = CS_TRACKED
What is commit? A connection is only created in memory when the ct action carries commit.
- ct_zone: zones isolate conntrack entries; set via the ct action's zone parameter
- ct_mark: a 32-bit value that can be set via ct(exec(set_field:1->ct_mark)). When the first packet of a flow matches, this action stores ct_mark in the packet metadata; once the packet is reinjected into the datapath, it can match flows that specify ct_mark (see the example after this list).
- ct_label: a 128-bit value that can be set via ct(exec(set_field:1->ct_label)); used like ct_mark
- ct_nw_src / ct_ipv6_src: matches the source IP of the conntrack entry's original direction
- ct_nw_dst / ct_ipv6_dst: matches the destination IP of the conntrack entry's original direction
- ct_nw_proto: matches the IP protocol of the conntrack entry's original direction
- ct_tp_src: matches the source port of the conntrack entry's original direction
- ct_tp_dst: matches the destination port of the conntrack entry's original direction
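A minimal end-to-end sketch of these fields in use (illustrative only; br0 and the ports p1/p2 are hypothetical): untracked TCP is first run through conntrack, new connections arriving on p1 are committed with ct_mark=1, and established packets whose connection carries that mark are forwarded.
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(table=1)"
ovs-ofctl add-flow br0 "table=1, priority=50, ct_state=+trk+new, tcp, in_port=p1, actions=ct(commit,exec(set_field:1->ct_mark)),p2"
ovs-ofctl add-flow br0 "table=1, priority=50, ct_state=+trk+est, ct_mark=1, tcp, actions=p2"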
The match fields correspond to the following members of struct flow and are used when matching flows:
struct flow {
...
uint8_t ct_state; /* Connection tracking state. */
uint8_t ct_nw_proto; /* CT orig tuple IP protocol. */
uint16_t ct_zone; /* Connection tracking zone. */
uint32_t ct_mark; /* Connection mark.*/
ovs_be32 ct_nw_src; /* CT orig tuple IPv4 source address. */
ovs_be32 ct_nw_dst; /* CT orig tuple IPv4 destination address. */
struct in6_addr ct_ipv6_src; /* CT orig tuple IPv6 source address. */
struct in6_addr ct_ipv6_dst; /* CT orig tuple IPv6 destination address. */
ovs_be16 ct_tp_src; /* CT original tuple source port/ICMP type. */
ovs_be16 ct_tp_dst; /* CT original tuple dst port/ICMP code. */
...
}
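Note that the original-direction tuple fields (ct_nw_src/ct_nw_dst, ct_ipv6_src/ct_ipv6_dst, ct_nw_proto, ct_tp_src/ct_tp_dst) are only meaningful for tracked packets: as write_ct_md() later in this article shows, they are filled into pkt->md.ct_orig_tuple from the connection key recorded when the packet first passed through the ct module.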
Actions
OVS implements conntrack through the ct action, whose format is shown below; ct sends the packet to the conntrack module for processing:
ct([argument][,argument…])
ct supports the following parameters:
commit: a conntrack entry is created in the conntrack module only when commit is executed
force: delete an existing conntrack entry found in the wrong (reply) direction and commit a new one; "force" implies commit
table: recirculate to the specified table
zone: set the zone, isolating conntrack entries from each other
exec: execute additional actions; currently only setting ct_mark and ct_label is supported, e.g. exec(set_field:1->ct_mark)
alg=: specify the ALG type; currently only ftp and tftp are supported
nat: specify the NAT IP addresses and ports
Flow table examples
# Add a NAT entry
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, in_port=veth_l0, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333))"
// If nat is specified several times within one ct, only the last one takes effect; see do_xlate_actions, where ctx->ct_nat_action = ofpact_get_NAT(a) keeps only a single ctx->ct_nat_action
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333), nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"
// Full NAT, i.e. translating source and destination IP at the same time, can be achieved by specifying multiple ct actions.
// The two ct actions must use different zones, otherwise only the first one takes effect: handle_nat only performs the subsequent NAT when the zone differs.
// Wrong: src and dst NAT specified with the same zone, so only the leading SNAT takes effect
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333)), ct(commit,nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"
// Correct: use different zones to get full NAT
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, actions=ct(commit,zone=100, nat(src=10.1.1.240-10.2.2.2:2222-3333)), ct(commit, zone=200, nat(dst=10.1.1.240-10.2.2.2:2222-3333)), veth_r0"
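Whether the expected entries (and their zones) were actually created can be checked by dumping the userspace datapath's conntrack table. The command below exists as shown; the sample output is abridged and illustrative, since the exact fields vary by version.
ovs-appctl dpctl/dump-conntrack
# tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=33618,dport=22),reply=(src=10.1.1.2,dst=10.2.2.2,sport=22,dport=2222),zone=100,protoinfo=(state=ESTABLISHED)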
Source code analysis
Command-line parsing of the ct parameters
Take the flow below as an example: ct_state matches packets not yet processed by conntrack (packets freshly received by OVS generally match), and the action is ct with the parameters commit and nat, meaning a conntrack entry must be created and the packet must be SNATed.
ovs-ofctl add-flow br0 "table=0, priority=50, ct_state=-trk, tcp, in_port=veth_l0, actions=ct(commit,nat(src=10.1.1.240-10.2.2.2:2222-3333))"
Let's look closely at the code that parses the ct action.
First, two structures: struct ofpact_conntrack holds the parameters that follow ct, and a separate structure, struct ofpact_nat, holds the ct action's NAT information.
/* OFPACT_NAT.
*
* Used for NXAST_NAT. */
struct ofpact_nat {
struct ofpact ofpact; //type is OFPACT_NAT
uint8_t range_af; /* AF_UNSPEC, AF_INET, or AF_INET6 */
uint16_t flags; /* NX_NAT_F_* */
struct {
struct {
uint16_t min;
uint16_t max;
} proto;
union {
struct {
ovs_be32 min;
ovs_be32 max;
} ipv4;
struct {
struct in6_addr min;
struct in6_addr max;
} ipv6;
} addr;
} range;
};
/* OFPACT_CT.
*
* Used for NXAST_CT. */
struct ofpact_conntrack {
OFPACT_PADDED_MEMBERS(
struct ofpact ofpact; //{ofpact = {type = OFPACT_CT, raw = 255 '\377', len = 32}
uint16_t flags; //NX_CT_F_COMMIT and NX_CT_F_FORCE
uint16_t zone_imm; //zone
struct mf_subfield zone_src;
uint16_t alg; //ALG type
uint8_t recirc_table; //table to recirculate to
);
struct ofpact actions[0]; //holds the nat info: struct ofpact_nat {type = OFPACT_NAT, raw = 255 '\377', len = 48}
};
parse_CT parses the arguments of the ct action into struct ofpact_conntrack; if nat is specified as well, the NAT information is stored in a struct ofpact_nat placed at struct ofpact_conntrack->actions. Finally, struct ofpact_conntrack->ofpact.len is updated to the total length.
static char * OVS_WARN_UNUSED_RESULT
parse_CT(char *arg, const struct ofputil_port_map *port_map,
struct ofpbuf *ofpacts, enum ofputil_protocol *usable_protocols)
{
const size_t ct_offset = ofpacts_pull(ofpacts);
struct ofpact_conntrack *oc;
char *error = NULL;
char *key, *value;
//ofpact_put_CT is generated by the ofpact_put_##ENUM macro in ./include/openvswitch/ofp-actions.h:1163
//sets OFPACT_CT
oc = ofpact_put_CT(ofpacts);
oc->flags = 0;
oc->recirc_table = NX_CT_RECIRC_NONE;
while (ofputil_parse_key_value(&arg, &key, &value)) {
if (!strcmp(key, "commit")) {
oc->flags |= NX_CT_F_COMMIT;
} else if (!strcmp(key, "force")) {
oc->flags |= NX_CT_F_FORCE;
} else if (!strcmp(key, "table")) {
error = str_to_u8(value, "recirc_table", &oc->recirc_table);
if (!error && oc->recirc_table == NX_CT_RECIRC_NONE) {
error = xasprintf("invalid table %#"PRIx8, oc->recirc_table);
}
} else if (!strcmp(key, "zone")) {
error = str_to_u16(value, "zone", &oc->zone_imm);
if (error) {
free(error);
error = mf_parse_subfield(&oc->zone_src, value);
if (error) {
return error;
}
}
} else if (!strcmp(key, "alg")) {
error = str_to_connhelper(value, &oc->alg);
} else if (!strcmp(key, "nat")) {
const size_t nat_offset = ofpacts_pull(ofpacts);
//parse the nat info
error = parse_NAT(value, port_map, ofpacts, usable_protocols);
/* Update CT action pointer and length. */
ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
oc = ofpacts->header;
} else if (!strcmp(key, "exec")) {
/* Hide existing actions from ofpacts_parse_copy(), so the
* nesting can be handled transparently. */
enum ofputil_protocol usable_protocols2;
const size_t exec_offset = ofpacts_pull(ofpacts);
/* Initializes 'usable_protocol2', fold it back to
* '*usable_protocols' afterwards, so that we do not lose
* restrictions already in there. */
//parse the exec arguments, e.g. set_field in ct(commit,exec(set_field:1->ct_mark)) (ct_mark after the "->" is the key, the 1 before it is the value)
//parse_SET_FIELD
error = ofpacts_parse_copy(value, port_map, ofpacts, &usable_protocols2, false, OFPACT_CT);
*usable_protocols &= usable_protocols2;
ofpacts->header = ofpbuf_push_uninit(ofpacts, exec_offset);
oc = ofpacts->header;
} else {
error = xasprintf("invalid argument to \"ct\" action: `%s'", key);
}
if (error) {
break;
}
}
if (!error && oc->flags & NX_CT_F_FORCE && !(oc->flags & NX_CT_F_COMMIT)) {
error = xasprintf("\"force\" flag requires \"commit\" flag.");
}
//update struct ofpact_conntrack->ofpact.len to include the nat length
ofpact_finish_CT(ofpacts, &oc);
ofpbuf_push_uninit(ofpacts, ct_offset);
return error;
}
static char * OVS_WARN_UNUSED_RESULT
parse_NAT(char *arg,
const struct ofputil_port_map *port_map OVS_UNUSED,
struct ofpbuf *ofpacts,
enum ofputil_protocol *usable_protocols OVS_UNUSED)
{
struct ofpact_nat *on = ofpact_put_NAT(ofpacts);
char *key, *value;
on->flags = 0;
on->range_af = AF_UNSPEC;
while (ofputil_parse_key_value(&arg, &key, &value)) {
char *error = NULL;
if (!strcmp(key, "src")) {
on->flags |= NX_NAT_F_SRC;
error = str_to_nat_range(value, on);
} else if (!strcmp(key, "dst")) {
on->flags |= NX_NAT_F_DST;
error = str_to_nat_range(value, on);
} else if (!strcmp(key, "persistent")) {
on->flags |= NX_NAT_F_PERSISTENT;
} else if (!strcmp(key, "hash")) {
on->flags |= NX_NAT_F_PROTO_HASH;
} else if (!strcmp(key, "random")) {
on->flags |= NX_NAT_F_PROTO_RANDOM;
} else {
error = xasprintf("invalid key \"%s\" in \"nat\" argument",
key);
}
if (error) {
return error;
}
}
if (on->flags & NX_NAT_F_SRC && on->flags & NX_NAT_F_DST) {
return xasprintf("May only specify one of \"src\" or \"dst\".");
}
if (!(on->flags & NX_NAT_F_SRC || on->flags & NX_NAT_F_DST)) {
if (on->flags) {
return xasprintf("Flags allowed only with \"src\" or \"dst\".");
}
if (on->range_af != AF_UNSPEC) {
return xasprintf("Range allowed only with \"src\" or \"dst\".");
}
}
if (on->flags & NX_NAT_F_PROTO_HASH && on->flags & NX_NAT_F_PROTO_RANDOM) {
return xasprintf("Both \"hash\" and \"random\" are not allowed.");
}
return NULL;
}
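str_to_nat_range() (not shown) fills on->range from a string such as "10.1.1.240-10.2.2.2:2222-3333". Below is a self-contained sketch of that decomposition for the IPv4, full min-max form; it is illustrative, not OVS code (the real parser also accepts single values and bracketed IPv6 addresses, where a bare ':' would be ambiguous).
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(void)
{
    char buf[] = "10.1.1.240-10.2.2.2:2222-3333";
    struct in_addr addr_min, addr_max;
    uint16_t port_min, port_max;

    char *ports = strchr(buf, ':');   /* "2222-3333": the proto range */
    *ports++ = '\0';
    char *amax = strchr(buf, '-');    /* "10.2.2.2": the address max */
    *amax++ = '\0';
    char *pmax = strchr(ports, '-');
    *pmax++ = '\0';

    inet_pton(AF_INET, buf, &addr_min);   /* -> range.addr.ipv4.min */
    inet_pton(AF_INET, amax, &addr_max);  /* -> range.addr.ipv4.max */
    port_min = atoi(ports);               /* -> range.proto.min */
    port_max = atoi(pmax);                /* -> range.proto.max */

    printf("addr %s-%s ports %u-%u\n", buf, amax, port_min, port_max);
    return 0;
}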
After successful parsing, the layout may look as follows: struct ofpact_conntrack is immediately followed by the nested actions, and struct ofpact_conntrack->ofpact.len gives the total length of the ct parameters, including the nat and set_field parts. struct ofpact_conntrack always comes first; struct ofpact_nat and struct ofpact_set_field follow in the order given on the command line and may appear several times.
struct ofpact_conntrack(OFPACT_CT) + struct ofpact_nat(OFPACT_NAT) + struct ofpact_set_field(OFPACT_SET_FIELD)
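The nested part can then be walked like a normal ofpact list. Its length is the total ct length minus the fixed header, which is what the ofpact_ct_get_action_len() helper used later by compose_conntrack_action() computes; the snippet below is a close paraphrase of the header, so treat the details as a sketch.
static inline size_t
ofpact_ct_get_action_len(const struct ofpact_conntrack *oc)
{
    return oc->ofpact.len - offsetof(struct ofpact_conntrack, actions);
}
/* Sketch: iterate the nested actions inside a parsed ct. */
static int
count_ct_nested_actions(const struct ofpact_conntrack *oc)
{
    const struct ofpact *a;
    int n = 0;

    OFPACT_FOR_EACH (a, oc->actions, ofpact_ct_get_action_len(oc)) {
        n++;    /* a->type is OFPACT_NAT or OFPACT_SET_FIELD here */
    }
    return n;
}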
Helper functions
In the parsing code above, some function definitions only become visible after macro expansion, e.g. ofpact_put_CT, ofpact_finish_CT and ofpact_put_NAT.
They are all defined in the header ./include/openvswitch/ofp-actions.h. The OFPACT macro defines five functions that operate on a given action type.
#define OFPACT(ENUM, STRUCT, MEMBER, NAME) \
BUILD_ASSERT_DECL(offsetof(struct STRUCT, ofpact) == 0); \
\
enum { OFPACT_##ENUM##_SIZE \
= (offsetof(struct STRUCT, MEMBER) \
? offsetof(struct STRUCT, MEMBER) \
: OFPACT_ALIGN(sizeof(struct STRUCT))) }; \
\
static inline struct STRUCT * \
ofpact_get_##ENUM(const struct ofpact *ofpact) \
{ \
ovs_assert(ofpact->type == OFPACT_##ENUM); \
return ALIGNED_CAST(struct STRUCT *, ofpact); \
} \
\
static inline struct STRUCT * \
ofpact_get_##ENUM##_nullable(const struct ofpact *ofpact) \
{ \
ovs_assert(!ofpact || ofpact->type == OFPACT_##ENUM); \
return ALIGNED_CAST(struct STRUCT *, ofpact); \
} \
\
static inline struct STRUCT * \
ofpact_put_##ENUM(struct ofpbuf *ofpacts) \
{ \
return (struct STRUCT *) ofpact_put(ofpacts, OFPACT_##ENUM, \
OFPACT_##ENUM##_SIZE); \
} \
\
static inline void \
ofpact_init_##ENUM(struct STRUCT *ofpact) \
{ \
ofpact_init(&ofpact->ofpact, OFPACT_##ENUM, \
OFPACT_##ENUM##_SIZE); \
} \
\
static inline void \
ofpact_finish_##ENUM(struct ofpbuf *ofpbuf, struct STRUCT **ofpactp) \
{ \
struct ofpact *ofpact = &(*ofpactp)->ofpact; \
ovs_assert(ofpact->type == OFPACT_##ENUM); \
*ofpactp = (struct STRUCT *) ofpact_finish(ofpbuf, ofpact); \
}
OFPACTS
#undef OFPACT
OFPACTS is the following macro definition:
ENUM STRUCT MEMBER NAME
#define OFPACTS \
/* Output. */ \
OFPACT(OUTPUT, ofpact_output, ofpact, "output") \
...
/* Header changes. */ \
OFPACT(SET_FIELD, ofpact_set_field, ofpact, "set_field") \
...
OFPACT(CT, ofpact_conntrack, ofpact, "ct") \
OFPACT(CT_CLEAR, ofpact_null, ofpact, "ct_clear") \
OFPACT(NAT, ofpact_nat, ofpact, "nat") \
For the CT action, for example, the macro expands to:
enum { OFPACT_CT_SIZE \
= (offsetof(struct ofpact_conntrack, ofpact) \
? offsetof(struct ofpact_conntrack, ofpact) \
: OFPACT_ALIGN(sizeof(struct ofpact_conntrack))) }; \
\
static inline struct ofpact_conntrack * \
ofpact_get_CT(const struct ofpact *ofpact) \
{ \
ovs_assert(ofpact->type == OFPACT_CT); \
return ALIGNED_CAST(struct ofpact_conntrack *, ofpact); \
} \
\
static inline struct ofpact_conntrack * \
ofpact_get_CT_nullable(const struct ofpact *ofpact) \
{ \
ovs_assert(!ofpact || ofpact->type == OFPACT_CT); \
return ALIGNED_CAST(struct ofpact_conntrack *, ofpact); \
} \
\
static inline struct ofpact_conntrack * \
ofpact_put_CT(struct ofpbuf *ofpacts) \
{ \
return (struct ofpact_conntrack *) ofpact_put(ofpacts, OFPACT_CT, \
OFPACT_CT_SIZE); \
} \
\
static inline void \
ofpact_init_CT(struct ofpact_conntrack *ofpact) \
{ \
ofpact_init(&ofpact->ofpact, OFPACT_CT, \
OFPACT_CT_SIZE); \
} \
\
static inline void \
ofpact_finish_CT(struct ofpbuf *ofpbuf, struct ofpact_conntrack **ofpactp) \
{ \
struct ofpact *ofpact = &(*ofpactp)->ofpact; \
ovs_assert(ofpact->type == OFPACT_CT); \
*ofpactp = (struct ofpact_conntrack *) ofpact_finish(ofpbuf, ofpact); \
}
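A hedged sketch of how these generated helpers fit together, following the same put-then-finish pattern as parse_CT() above (illustrative; assumes the OVS headers):
struct ofpbuf ofpacts;
struct ofpact_conntrack *oc;

ofpbuf_init(&ofpacts, 64);
oc = ofpact_put_CT(&ofpacts);         /* appends OFPACT_CT_SIZE bytes */
oc->flags |= NX_CT_F_COMMIT;
oc->recirc_table = NX_CT_RECIRC_NONE;
/* Appending nested actions (e.g. ofpact_put_NAT(&ofpacts)) may reallocate
 * the buffer and invalidate 'oc'; that is why parse_CT() does the
 * ofpacts_pull()/ofpbuf_push_uninit() dance to refresh the pointer. */
ofpact_finish_CT(&ofpacts, &oc);      /* pads and finalizes ofpact.len */
ofpbuf_uninit(&ofpacts);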
Processing in ovs-vswitchd
After ovs-vswitchd receives the flow-add message from the command line and parses it, the flow is added to the local flow table, waiting for packets to match.
Slow path: translating the ct action
When OVS receives a packet and the fast-path lookup misses, the slow-path lookup takes over; if the matched flow's action is ct, processing proceeds as follows:
do_xlate_actions
const struct ofpact *a;
OFPACT_FOR_EACH (a, ofpacts, ofpacts_len)
switch (a->type)
//the action is CT
case OFPACT_CT:
//ofpact_get_CT fetches struct ofpact_conntrack along with the nested actions that follow it
compose_conntrack_action(ctx, ofpact_get_CT(a));
compose_conntrack_action translates the action information in struct ofpact_conntrack into odp_actions, the action format the datapath understands.
static void
compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc)
//recursively calls do_xlate_actions to translate the nat, ct_mark and ct_label information
do_xlate_actions(ofc->actions, ofpact_ct_get_action_len(ofc), ctx);
//saves the nat info into ctx->ct_nat_action; if nat was specified several times, only the last one takes effect
case OFPACT_NAT:
/* This will be processed by compose_conntrack_action(). */
ctx->ct_nat_action = ofpact_get_NAT(a);
break;
//translates ct_mark or ct_label and stores them into flow->ct_mark and flow->ct_label
case OFPACT_SET_FIELD:
set_field = ofpact_get_SET_FIELD(a);
mf = set_field->field;
/* Set the field only if the packet actually has it. */
if (mf_are_prereqs_ok(mf, flow, wc)) {
mf_mask_field_masked(mf, ofpact_set_field_mask(set_field), wc);
mf_set_flow_value_masked(mf, set_field->value,
ofpact_set_field_mask(set_field),
flow);
if (ofc->zone_src.field) {
zone = mf_get_subfield(&ofc->zone_src, &ctx->xin->flow);
} else {
zone = ofc->zone_imm;
}
//add the first datapath action, OVS_ACTION_ATTR_CT
//OVS_ACTION_ATTR_CT starts here
ct_offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_CT);
if (ofc->flags & NX_CT_F_COMMIT) {
nl_msg_put_flag(ctx->odp_actions, ofc->flags & NX_CT_F_FORCE ? OVS_CT_ATTR_FORCE_COMMIT : OVS_CT_ATTR_COMMIT);
if (ctx->xbridge->support.ct_eventmask) {
nl_msg_put_u32(ctx->odp_actions, OVS_CT_ATTR_EVENTMASK, OVS_CT_EVENTMASK_DEFAULT);
}
}
nl_msg_put_u16(ctx->odp_actions, OVS_CT_ATTR_ZONE, zone);
put_ct_mark(&ctx->xin->flow, ctx->odp_actions, ctx->wc);
if (wc->masks.ct_mark) {
struct {
uint32_t key;
uint32_t mask;
} *odp_ct_mark;
odp_ct_mark = nl_msg_put_unspec_uninit(odp_actions, OVS_CT_ATTR_MARK, sizeof(*odp_ct_mark));
odp_ct_mark->key = flow->ct_mark & wc->masks.ct_mark;
odp_ct_mark->mask = wc->masks.ct_mark;
}
put_ct_label(&ctx->xin->flow, ctx->odp_actions, ctx->wc);
put_ct_helper(ctx, ctx->odp_actions, ofc);
if (ofc->alg) {
switch(ofc->alg) {
case IPPORT_FTP:
nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "ftp");
break;
case IPPORT_TFTP:
nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "tftp");
break;
default:
xlate_report_error(ctx, "cannot serialize ct_helper %d", ofc->alg);
break;
}
}
put_ct_nat(ctx);
struct ofpact_nat *ofn = ctx->ct_nat_action;
nat_offset = nl_msg_start_nested(ctx->odp_actions, OVS_CT_ATTR_NAT);
if (ofn->flags & NX_NAT_F_SRC || ofn->flags & NX_NAT_F_DST) {
nl_msg_put_flag(ctx->odp_actions, ofn->flags & NX_NAT_F_SRC
? OVS_NAT_ATTR_SRC : OVS_NAT_ATTR_DST);
if (ofn->flags & NX_NAT_F_PERSISTENT) {
nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PERSISTENT);
}
if (ofn->flags & NX_NAT_F_PROTO_HASH) {
nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_HASH);
} else if (ofn->flags & NX_NAT_F_PROTO_RANDOM) {
nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_RANDOM);
}
...
}
nl_msg_end_nested(ctx->odp_actions, nat_offset);
ctx->ct_nat_action = NULL;
//OVS_ACTION_ATTR_CT ends here
nl_msg_end_nested(ctx->odp_actions, ct_offset);
//if ct(table=x) was configured, a second datapath action, OVS_ACTION_ATTR_RECIRC, must be added
//recirc_table holds the table id to continue in, e.g. actions=ct(table=0)
//the value NX_CT_RECIRC_NONE means no recirculation is needed
if (ofc->recirc_table == NX_CT_RECIRC_NONE) {
/* If we do not recirculate as part of this action, hide the results of
* connection tracking from subsequent recirculations. */
ctx->conntracked = false;
} else {
/* Use ct_* fields from datapath during recirculation upcall. */
ctx->conntracked = true;
compose_recirculate_and_fork(ctx, ofc->recirc_table);
uint32_t recirc_id;
ctx->freezing = true;
recirc_id = finish_freezing__(ctx, table);
struct frozen_state state = {
//save the table id to recirculate to, i.e. recirc_table
.table_id = table,
.ofproto_uuid = ctx->xbridge->ofproto->uuid,
.stack = ctx->stack.data,
.stack_size = ctx->stack.size,
.mirrors = ctx->mirrors,
.conntracked = ctx->conntracked,
.xport_uuid = ctx->xin->xport_uuid,
.ofpacts = ctx->frozen_actions.data,
.ofpacts_len = ctx->frozen_actions.size,
.action_set = ctx->action_set.data,
.action_set_len = ctx->action_set.size,
};
frozen_metadata_from_flow(&state.metadata, &ctx->xin->flow);
//allocate a recirc_id and store it in odp_actions as another datapath action
id = recirc_alloc_id_ctx(&state);
uint32_t hash = frozen_state_hash(state);
struct recirc_id_node *node = recirc_ref_equal(state, hash);
node = recirc_alloc_id__(state, hash);
struct recirc_id_node *node = xzalloc(sizeof *node);
node->hash = hash;
ovs_refcount_init(&node->refcount);
frozen_state_clone(CONST_CAST(struct frozen_state *, &node->state), state);
cmap_insert(&id_map, &node->id_node, node->id);
cmap_insert(&metadata_map, &node->metadata_node, node->hash);
return node;
node->id;
nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, id);
ctx->conntracked = false;
}
Through the translation above, the OpenFlow ct action generates the actions used in the datapath, possibly of two kinds: OVS_ACTION_ATTR_CT and OVS_ACTION_ATTR_RECIRC. The former carries the commit (OVS_CT_ATTR_COMMIT or OVS_CT_ATTR_FORCE_COMMIT), ct_mark (OVS_CT_ATTR_MARK), ct_label and nat (OVS_CT_ATTR_NAT) information; the latter carries only the recirc_id, which is used after reinjection into the datapath to recover the table id, i.e. to jump to the specified table.
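The generated actions show up in the installed datapath flows. The dump command exists as shown; the sample line is abridged and illustrative:
ovs-appctl dpctl/dump-flows
# recirc_id(0),in_port(2),eth_type(0x0800),ipv4(proto=6,frag=no),
#   actions:ct(commit,zone=100,nat(src=10.1.1.240-10.2.2.2:2222-3333)),recirc(0x13)
# (the trailing recirc(...) only appears when ct(table=...) was specified)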
Fast path: executing the ct action
After the actions obtained above are installed into the datapath, they must also be executed immediately on the packet that triggered the slow path.
packet_batch_per_flow_execute
actions = dp_netdev_flow_get_actions(flow);
dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow, actions->actions, actions->size, now);
odp_execute_actions
//iterate over and execute all actions of the matched flow
NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len)
int type = nl_attr_type(a);
//dp_execute_cb
dp_execute_action(dp, batch, a, may_steal);
//execute the ct action
case OVS_ACTION_ATTR_CT: {
NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), nl_attr_get_size(a)) {
enum ovs_ct_attr sub_type = nl_attr_type(b);
switch(sub_type) {
case OVS_CT_ATTR_FORCE_COMMIT:
force = true;
/* fall through. */
case OVS_CT_ATTR_COMMIT:
commit = true;
break;
case OVS_CT_ATTR_ZONE:
zone = nl_attr_get_u16(b);
break;
case OVS_CT_ATTR_HELPER:
helper = nl_attr_get_string(b);
break;
case OVS_CT_ATTR_MARK:
setmark = nl_attr_get(b);
break;
case OVS_CT_ATTR_LABELS:
setlabel = nl_attr_get(b);
break;
case OVS_CT_ATTR_EVENTMASK:
/* Silently ignored, as userspace datapath does not generate
* netlink events. */
break;
case OVS_CT_ATTR_NAT: {
const struct nlattr *b_nest;
unsigned int left_nest;
bool ip_min_specified = false;
bool proto_num_min_specified = false;
bool ip_max_specified = false;
bool proto_num_max_specified = false;
memset(&nat_action_info, 0, sizeof nat_action_info);
nat_action_info_ref = &nat_action_info;
NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
switch (sub_type_nest) {
case OVS_NAT_ATTR_SRC:
case OVS_NAT_ATTR_DST:
nat_config = true;
nat_action_info.nat_action |=
((sub_type_nest == OVS_NAT_ATTR_SRC)
? NAT_ACTION_SRC : NAT_ACTION_DST);
break;
case OVS_NAT_ATTR_IP_MIN:
memcpy(&nat_action_info.min_addr,
nl_attr_get(b_nest),
nl_attr_get_size(b_nest));
ip_min_specified = true;
break;
case OVS_NAT_ATTR_IP_MAX:
memcpy(&nat_action_info.max_addr,
nl_attr_get(b_nest),
nl_attr_get_size(b_nest));
ip_max_specified = true;
break;
case OVS_NAT_ATTR_PROTO_MIN:
nat_action_info.min_port =
nl_attr_get_u16(b_nest);
proto_num_min_specified = true;
break;
case OVS_NAT_ATTR_PROTO_MAX:
nat_action_info.max_port =
nl_attr_get_u16(b_nest);
proto_num_max_specified = true;
break;
//persistent, hash and random are not used in the userspace datapath
case OVS_NAT_ATTR_PERSISTENT:
case OVS_NAT_ATTR_PROTO_HASH:
case OVS_NAT_ATTR_PROTO_RANDOM:
break;
case OVS_NAT_ATTR_UNSPEC:
case __OVS_NAT_ATTR_MAX:
OVS_NOT_REACHED();
}
}
if (ip_min_specified && !ip_max_specified) {
nat_action_info.max_addr = nat_action_info.min_addr;
}
if (proto_num_min_specified && !proto_num_max_specified) {
nat_action_info.max_port = nat_action_info.min_port;
}
if (proto_num_min_specified || proto_num_max_specified) {
if (nat_action_info.nat_action & NAT_ACTION_SRC) {
nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
} else if (nat_action_info.nat_action & NAT_ACTION_DST) {
nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
}
}
break;
}
}
}
/* We won't be able to function properly in this case, hence
* complain loudly. */
if (nat_config && !commit) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
VLOG_WARN_RL(&rl, "NAT specified without commit.");
}
//struct dp_netdev is global, so dp->conntrack is global too; multiple PMD threads share dp->conntrack
struct dp_netdev *dp = pmd->dp;
conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force, commit, zone, setmark, setlabel, helper, nat_action_info_ref);
//recirculate to another table
case OVS_ACTION_ATTR_RECIRC:
if (*depth < MAX_RECIRC_DEPTH) {
struct dp_packet_batch recirc_pkts;
if (!may_steal) {
dp_packet_batch_clone(&recirc_pkts, packets_);
packets_ = &recirc_pkts;
}
struct dp_packet *packet;
DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
//fetch the recirc_id
packet->md.recirc_id = nl_attr_get_u32(a);
}
(*depth)++;
//re-enter the slow path to look up the OpenFlow flows of the specified table
dp_netdev_recirculate(pmd, packets_);
dp_netdev_input__(pmd, packets, true, 0);
emc_processing
fast_path_processing
(*depth)--;
return;
}
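Note how the two datapath actions fit together: the recirc_id written into packet->md.recirc_id becomes part of the flow key for the next lookup, and during the resulting upcall it resolves, through the frozen_state saved by recirc_alloc_id_ctx() above, back to the frozen table id, stack and conntrack status, so translation resumes in the requested OpenFlow table.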
For the ct action, conntrack_execute is the main processing function:
int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
const uint32_t *setmark,
const struct ovs_key_ct_labels *setlabel,
const char *helper,
const struct nat_action_info_t *nat_action_info)
for (size_t i = 0; i < cnt; i++) {
//extract the packet info from pkts into ctx->key and check whether the packet is valid
if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone))
ctx->key.zone = zone;
ctx->key.dl_type = dl_type;
extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, !hwol_good_l3_csum);
key->src.addr.ipv4 = ip->ip_src;
key->dst.addr.ipv4 = ip->ip_dst;
key->nw_proto = ip->ip_proto;
extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3, !hwol_good_l4_csum);
//compute the hash
ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
{
//if the packet is invalid, set CS_INVALID and continue with the next packet
pkts[i]->md.ct_state = CS_INVALID;
write_ct_md(pkts[i], zone, NULL, NULL, NULL);
continue;
}
//start processing a valid packet
process_one(ct, pkts[i], &ctx, zone, force, commit, now, setmark, setlabel, nat_action_info, helper);
struct conn *conn;
//derive a hash bucket from the hash value
unsigned bucket = hash_to_bucket(ctx->hash);
#define CONNTRACK_BUCKETS_SHIFT 8
#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
//256 hash buckets
return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
//look up the conn by ctx->key; if the packet belongs to the reply direction, set the reply flag
conn_key_lookup(&ct->buckets[bucket], ctx, now);
uint32_t hash = ctx->hash;
struct conn *conn;
HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
if (!conn_key_cmp(&conn->key, &ctx->key)
&& !conn_expired(conn, now)) {
ctx->conn = conn;
ctx->reply = false;
break;
}
if (!conn_key_cmp(&conn->rev_key, &ctx->key)
&& !conn_expired(conn, now)) {
ctx->conn = conn;
ctx->reply = true;
break;
}
}
conn = ctx->conn;
/* Delete found entry if in wrong direction. 'force' implies commit. */
if (conn && force && ctx->reply) {
conn_clean(ct, conn, &ct->buckets[bucket]);
conn = NULL;
}
bool create_new_conn = false;
struct conn conn_for_un_nat_copy;
conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
bool ftp_ctl = is_ftp_ctl(pkt);
if (OVS_LIKELY(conn)) {
if (ftp_ctl) {
/* Keep sequence tracking in sync with the source of the
* sequence skew. */
if (ctx->reply != conn->seq_skew_dir) {
handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
!!nat_action_info);
create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
bucket);
} else {
create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
bucket);
handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
!!nat_action_info);
}
} else {
create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
bucket);
}
if (nat_action_info && !create_new_conn) {
handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
}
}else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
nat_action_info)) {
create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
bucket);
} else {
if (ctx->icmp_related) {
/* An icmp related conn should always be found; no new
connection is created based on an icmp related packet. */
pkt->md.ct_state = CS_INVALID;
} else {
create_new_conn = true;
}
}
if (OVS_UNLIKELY(create_new_conn)) {
conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info, &conn_for_un_nat_copy, helper, alg_exp);
unsigned bucket = hash_to_bucket(ctx->hash);
struct conn *nc = NULL;
//the L4 protocol handler checks whether the packet is valid
if (!valid_new(pkt, &ctx->key))
return l4_protos[key->nw_proto]->valid_new(pkt);
{
pkt->md.ct_state = CS_INVALID;
return nc;
}
//set CS_NEW
pkt->md.ct_state = CS_NEW;
//the conn is only added to the hash table when commit was set
if (commit) {
//check whether the maximum number of conn entries would be exceeded
unsigned int n_conn_limit;
atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
COVERAGE_INC(conntrack_full);
return nc;
}
//create the new entry
nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
struct conn *newconn;
//tcp_new_conn
newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
newconn->key = *key;
return newconn;
ctx->conn = nc;
nc->rev_key = nc->key;
//reverse the key
conn_key_reverse(&nc->rev_key);
if (nat_action_info) {
nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
if (alg_exp) {
} else {
*conn_for_un_nat_copy = *nc;
ct_rwlock_wrlock(&ct->resources_lock);
//pick a suitable IP and port according to the NAT configuration
bool nat_res = nat_select_range_tuple(ct, nc, conn_for_un_nat_copy);
bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys, nat_conn, ct->hash_basis);
//insert the NAT conn into nat_conn_keys
hmap_insert(nat_conn_keys, &nat_conn_key->node, nat_conn_key_hash);
if (!nat_res) {
goto nat_res_exhaustion;
}
/* Update nc with nat adjustments made to
* conn_for_un_nat_copy by nat_select_range_tuple(). */
*nc = *conn_for_un_nat_copy;
ct_rwlock_unlock(&ct->resources_lock);
}
//set conn_type to CT_CONN_TYPE_UN_NAT: this copy is the reverse (un-NAT) entry used to find the connection after translation
conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
conn_for_un_nat_copy->nat_info = NULL;
conn_for_un_nat_copy->alg = NULL;
//apply the NAT translation to the packet
nat_packet(pkt, nc, ctx->icmp_related);
if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
pkt->md.ct_state |= CS_SRC_NAT;
if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
struct ip_header *nh = dp_packet_l3(pkt);
packet_set_ipv4_addr(pkt, &nh->ip_src, conn->rev_key.dst.addr.ipv4_aligned);
}
if (!related) {
pat_packet(pkt, conn);
}
} else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
pkt->md.ct_state |= CS_DST_NAT;
}
}
//insert the new entry into the hash table
hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
//increment the entry count
atomic_count_inc(&ct->n_conn);
}
return nc;
}
write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
pkt->md.ct_state |= CS_TRACKED;
pkt->md.ct_zone = zone;
pkt->md.ct_mark = conn ? conn->mark : 0;
pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
pkt->md.ct_orig_tuple_ipv6 = false;
if (key) {
if (key->dl_type == htons(ETH_TYPE_IP)) {
//ct_orig_tuple keeps the 5-tuple of the original packet (the first time it entered the ct module)
pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
key->src.addr.ipv4_aligned,
key->dst.addr.ipv4_aligned,
key->nw_proto != IPPROTO_ICMP
? key->src.port : htons(key->src.icmp_type),
key->nw_proto != IPPROTO_ICMP
? key->dst.port : htons(key->src.icmp_code),
key->nw_proto,
};
}
}
if (conn && setmark) {
set_mark(pkt, conn, setmark[0], setmark[1]);
}
if (conn && setlabel) {
set_label(pkt, conn, &setlabel[0], &setlabel[1]);
}
}
}
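One helper from the walkthrough is worth spelling out: conn_key_reverse(), used to build the new entry's rev_key, simply swaps the two endpoints of the tuple, so reply-direction packets are found via conn->rev_key in conn_key_lookup() above. A sketch consistent with lib/conntrack.c:
static void
conn_key_reverse(struct conn_key *key)
{
    /* The reverse key is the same 5-tuple with source and destination
     * endpoints swapped. */
    struct ct_endpoint tmp = key->src;

    key->src = key->dst;
    key->dst = tmp;
}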
Removing conntrack entries
When the datapath is created, a dedicated thread, clean_thread_main, is started to remove expired conntrack entries.
create_dp_netdev
struct dp_netdev *dp;
dp = xzalloc(sizeof *dp);
conntrack_init(&dp->conntrack);
ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
static void *
clean_thread_main(void *f_)
{
struct conntrack *ct = f_;
while (!latch_is_set(&ct->clean_thread_exit)) {
long long next_wake;
long long now = time_msec();
next_wake = conntrack_clean(ct, now);
if (next_wake < now) {
poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
} else {
poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
}
latch_wait(&ct->clean_thread_exit);
poll_block();
}
return NULL;
}
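conntrack_clean() itself (not expanded here) sweeps every bucket, removes entries whose expiration has passed, and returns the time of the next pending expiration; the CT_CLEAN_INTERVAL/CT_CLEAN_MIN_INTERVAL bounds above keep the thread from waking too rarely or too often. A condensed sketch of that contract, not the verbatim implementation:
static long long
conntrack_clean_sketch(struct conntrack *ct, long long now)
{
    long long next_wakeup = now + CT_CLEAN_INTERVAL;

    for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
        /* Under the bucket lock: walk the per-bucket expiration lists,
         * conn_clean() every conn whose expiration is <= now, and remember
         * the earliest remaining expiration in next_wakeup. */
    }
    return next_wakeup;
}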
References
https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/
https://zhuanlan.zhihu.com/p/25089778