本文介绍ovs分类器中很重要的两个结构体:flow和miniflow,及其相关函数。
flow
flow中保存的是报文相关的字段和其他一些元数据,用来匹配流表,主要包含如下四个层次内容:
a. metadata: 入端口号,寄存器等信息
b. l2: 源目的mac,vlan和mpls等信息
c. l3: ipv4/ipv6源目的ip,ttl等信息
d. l4: 源目的端口号,icmp code和type等信息。
在分类器中查找流表时,这四部分内容被分阶段使用:先匹配低层的字段,匹配成功后才继续匹配更高层的字段;一旦匹配失败,就不必再匹配更高层次的字段。这样既能加快匹配速度,也能使下发到datapath的流表更模糊。
flow的一个特点是它整个结构是8字节对齐的,在2.8版本中它的大小是672字节。
/* Packet header fields and metadata used as the lookup key.
 *
 * The members are grouped, in order, into four sections marked by the
 * comments below: metadata, L2, L3 and L4.  The L2, L3 and L4 sections are
 * each annotated as 64-bit aligned, and the pad1/pad2/pad3 members exist to
 * pad their neighbours out to 64 bits, so the struct can be viewed as an
 * array of uint64_t units. */
struct flow {
/* Metadata */
// struct flow_tnl is itself 8-byte aligned.
struct flow_tnl tunnel; /* Encapsulating tunnel parameters. */
ovs_be64 metadata; /* OpenFlow Metadata. */
uint32_t regs[FLOW_N_REGS]; /* Registers. */
uint32_t skb_priority; /* Packet priority for QoS. */
uint32_t pkt_mark; /* Packet mark. */
uint32_t dp_hash; /* Datapath computed hash value. The exact
* computation is opaque to the user space. */
union flow_in_port in_port; /* Input port.*/
uint32_t recirc_id; /* Must be exact match. */
uint8_t ct_state; /* Connection tracking state. */
uint8_t ct_nw_proto; /* CT orig tuple IP protocol. */
uint16_t ct_zone; /* Connection tracking zone. */
uint32_t ct_mark; /* Connection mark.*/
ovs_be32 packet_type; /* OpenFlow packet type. */
ovs_u128 ct_label; /* Connection label. */
uint32_t conj_id; /* Conjunction ID. */
ofp_port_t actset_output; /* Output port in action set. */
/* L2, Order the same as in the Ethernet header! (64-bit aligned) */
struct eth_addr dl_dst; /* Ethernet destination address. */
struct eth_addr dl_src; /* Ethernet source address. */
ovs_be16 dl_type; /* Ethernet frame type.
Note: This also holds the Ethertype for L3
packets of type PACKET_TYPE(1, Ethertype) */
uint8_t pad1[2]; /* Pad to 64 bits. */
union flow_vlan_hdr vlans[FLOW_MAX_VLAN_HEADERS]; /* VLANs */
ovs_be32 mpls_lse[ROUND_UP(FLOW_MAX_MPLS_LABELS, 2)]; /* MPLS label stack
(with padding). */
/* L3 (64-bit aligned) */
ovs_be32 nw_src; /* IPv4 source address or ARP SPA. */
ovs_be32 nw_dst; /* IPv4 destination address or ARP TPA. */
ovs_be32 ct_nw_src; /* CT orig tuple IPv4 source address. */
ovs_be32 ct_nw_dst; /* CT orig tuple IPv4 destination address. */
struct in6_addr ipv6_src; /* IPv6 source address. */
struct in6_addr ipv6_dst; /* IPv6 destination address. */
struct in6_addr ct_ipv6_src; /* CT orig tuple IPv6 source address. */
struct in6_addr ct_ipv6_dst; /* CT orig tuple IPv6 destination address. */
ovs_be32 ipv6_label; /* IPv6 flow label. */
uint8_t nw_frag; /* FLOW_FRAG_* flags. */
uint8_t nw_tos; /* IP ToS (including DSCP and ECN). */
uint8_t nw_ttl; /* IP TTL/Hop Limit. */
uint8_t nw_proto; /* IP protocol or low 8 bits of ARP opcode. */
struct in6_addr nd_target; /* IPv6 neighbor discovery (ND) target. */
struct eth_addr arp_sha; /* ARP/ND source hardware address. */
struct eth_addr arp_tha; /* ARP/ND target hardware address. */
ovs_be16 tcp_flags; /* TCP flags/ICMPv6 ND options type.
* With L3 to avoid matching L4. */
ovs_be16 pad2; /* Pad to 64 bits. */
struct ovs_key_nsh nsh; /* Network Service Header keys */
/* L4 (64-bit aligned) */
ovs_be16 tp_src; /* TCP/UDP/SCTP source port/ICMP type. */
ovs_be16 tp_dst; /* TCP/UDP/SCTP destination port/ICMP code. */
ovs_be16 ct_tp_src; /* CT original tuple source port/ICMP type. */
ovs_be16 ct_tp_dst; /* CT original tuple dst port/ICMP code. */
ovs_be32 igmp_group_ip4; /* IGMP group IPv4 address/ICMPv6 ND reserved
* field.
* Keep last for BUILD_ASSERT_DECL below. */
ovs_be32 pad3; /* Pad to 64 bits. */
};
miniflow
miniflow是压缩版的struct flow,其包含两部分内容,一个是flowmap用来记录和flow的对应关系,flowmap中的每一个bit对应struct flow中的一个uint64_t字段,如果bit为1,则flow中对应的uint64_t字段为非0值,如果bit为0,则flow中对应的uint64_t字段为0;另一部分是flowmap后面的内存,由调用者根据flowmap中bit为1的个数*8字节申请内存,用来保存flow中非0的uint64_t。
前面说到flow结构体必须是8字节对齐的,就是为了和miniflow配合使用,比如如果struct flow大小为672字节,则包含84个8字节,在miniflow中使用84个bit即可表示flow信息,一个bit对应flow的一个uint64_t。
struct flow是一个很大的结构体,前面提到它占用672字节空间,但是大部分字段都是0,是用不到的。如果flow中只有一个8字节包含非0值,则miniflow使用24字节(flowmap占用固定的16字节,加上flowmap后面保存的8字节内存)即可表示flow的全部有用信息,相比flow的672字节,大大节省了内存。
所以使用miniflow表示flow有如下两个好处
a. 使用miniflow可以节省内存
b. 如果只想遍历flow中的非0字段,可以借助miniflow直接找到这些非0字段,从而节省遍历时间
miniflow定义如下
// struct flow is 8-byte aligned, so dividing its size by 8 gives the number
// of 8-byte (uint64_t) units it contains.
#define FLOW_U64S (sizeof(struct flow) / sizeof(uint64_t))
// map_t is the word type of the bitmap; MAP_T_BITS is its width in bits
// (64 where unsigned long long is 8 bytes wide).
typedef unsigned long long map_t;
#define MAP_T_BITS (sizeof(map_t) * CHAR_BIT)
// Each bit stands for one uint64_t unit of struct flow; FLOWMAP_UNITS is the
// minimum number of map_t words needed to provide FLOW_U64S bits.
#define FLOWMAP_UNITS DIV_ROUND_UP(FLOW_U64S, MAP_T_BITS)
/* Bitmap with one bit per uint64_t unit of struct flow. */
struct flowmap {
map_t bits[FLOWMAP_UNITS];
};
/* Compressed form of struct flow: the map, followed in memory by the values
 * described in the comment inside the struct. */
struct miniflow {
struct flowmap map;
/* Followed by:
* uint64_t values[n];
* where 'n' is miniflow_n_values(miniflow). */
};
函数
miniflow_extract
miniflow_extract用来从报文中提取flow信息,并保存到miniflow中。调用miniflow_extract的函数必须保证miniflow中用于存放values的空间至少有 FLOW_U64S * 8 字节。
/* Caller is responsible for initializing 'dst' with enough storage for
* FLOW_U64S * 8 bytes. */
void
miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
{
const struct pkt_metadata *md = &packet->md;
const void *data = dp_packet_data(packet);
size_t size = dp_packet_size(packet);
ovs_be32 packet_type = packet->packet_type;
uint64_t *values = miniflow_values(dst);
struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,
values + FLOW_U64S };
const char *frame;
ovs_be16 dl_type = OVS_BE16_MAX;
uint8_t nw_frag, nw_tos, nw_ttl, nw_proto;
uint8_t *ct_nw_proto_p = NULL;
ovs_be16 ct_tp_src = 0, ct_tp_dst = 0;
...
...
// Store the packet metadata into the miniflow.
if (md->skb_priority || md->pkt_mark) {
miniflow_push_uint32(mf, skb_priority, md->skb_priority);
miniflow_push_uint32(mf, pkt_mark, md->pkt_mark);
}
// Store md->dp_hash into the miniflow.
miniflow_push_uint32(mf, dp_hash, md->dp_hash);
// Store the packet's input port into the miniflow.
miniflow_push_uint32(mf, in_port, odp_to_u32(md->in_port.odp_port));
...
...
// Store the layer-2 fields into the miniflow.
/* Link layer. */
ASSERT_SEQUENTIAL(dl_dst, dl_src);
miniflow_push_macs(mf, dl_dst, data);
/* VLAN */
union flow_vlan_hdr vlans[FLOW_MAX_VLAN_HEADERS];
size_t num_vlans = parse_vlan(&data, &size, vlans);
dl_type = parse_ethertype(&data, &size);
miniflow_push_be16(mf, dl_type, dl_type);
miniflow_pad_to_64(mf, dl_type);
if (num_vlans > 0) {
miniflow_push_words_32(mf, vlans, vlans, num_vlans);
}
...
...
// Store the layer-3 fields into the miniflow.
/* Push both source and destination address at once. */
miniflow_push_words(mf, nw_src, &nh->ip_src, 1);
...
...
// Store the layer-4 fields into the miniflow.
if (OVS_LIKELY(nw_proto == IPPROTO_TCP)) {
if (OVS_LIKELY(size >= TCP_HEADER_LEN)) {
const struct tcp_header *tcp = data;
miniflow_push_be32(mf, arp_tha.ea[2], 0);
miniflow_push_be32(mf, tcp_flags,
TCP_FLAGS_BE32(tcp->tcp_ctl));
miniflow_push_be16(mf, tp_src, tcp->tcp_src);
miniflow_push_be16(mf, tp_dst, tcp->tcp_dst);
miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
}
}
在上面将value保存到miniflow时,用到了几个辅助宏,比如下面的miniflow_push_uint32用来将一个32位的值保存到miniflow中FIELD对应的位置。它首先调用offsetof获取FIELD在flow中的偏移字节数;因为flow是8字节对齐的,所以一个四字节的成员变量要么位于某个8字节单元的起始位置,要么位于8字节单元的中间位置,即偏移对8取模的结果肯定为0或者4。然后调用miniflow_push_uint32_保存到对应的位置:如果取模结果为0,则设置map中对应的bit为1,并把值写入当前8字节单元的前4字节;如果取模结果为4,则断言对应的bit已经为1,把值写入当前8字节单元的后4字节,再把data指针移到下一个8字节单元。
/* Stores the 32-bit VALUE into the miniflow being built through context MF,
 * for the 'struct flow' member that lives at byte offset OFS.
 *
 * MF must expose 'data' (the 64-bit unit currently being filled) and 'end'
 * (one past the last usable unit).  OFS is expected to be 0 or 4 modulo 8;
 * any other offset is silently ignored by this macro.
 *
 * Comments inside the macro body must be block comments followed by a line
 * continuation: a '//' comment without a trailing backslash would end the
 * macro definition at that line. */
#define miniflow_push_uint32_(MF, OFS, VALUE)                           \
    {                                                                   \
        MINIFLOW_ASSERT(MF.data < MF.end);                              \
                                                                        \
        if ((OFS) % 8 == 0) {                                           \
            /* Member starts a 64-bit unit: mark the unit in the map */ \
            /* and write the first 32-bit half of the unit. */          \
            miniflow_set_map(MF, OFS / 8);                              \
            *(uint32_t *)MF.data = VALUE;                               \
        } else if ((OFS) % 8 == 4) {                                    \
            /* Member is the second half of a unit: its map bit must */ \
            /* already be set.  Write the second 32-bit half and */     \
            /* advance to the next 64-bit unit. */                      \
            miniflow_assert_in_map(MF, OFS / 8);                        \
            *((uint32_t *)MF.data + 1) = VALUE;                         \
            MF.data++;                                                  \
        }                                                               \
    }

/* Stores the 32-bit VALUE for member FIELD of 'struct flow', deriving the
 * byte offset from the struct layout. */
#define miniflow_push_uint32(MF, FIELD, VALUE)                          \
    miniflow_push_uint32_(MF, offsetof(struct flow, FIELD), VALUE)
需要注意的是,一定要按照flow中成员的顺序依次保存到miniflow。
miniflow_expand
miniflow_expand用来将miniflow中的值恢复到flow结构体中。
/* Expands miniflow 'src' into the full-size flow 'dst', so that 'dst' ends up
 * as a copy of 'src'. */
void
miniflow_expand(const struct miniflow *src, struct flow *dst)
{
    /* Clear the whole flow first; the union below only ORs values in. */
    memset(dst, 0, sizeof(struct flow));

    flow_union_with_miniflow(dst, src);
}
/* Perform a bitwise OR of miniflow 'src' flow data with the equivalent
* fields in 'dst', storing the result in 'dst'. */
static inline void
flow_union_with_miniflow(struct flow *dst, const struct miniflow *src)
{
flow_union_with_miniflow_subset(dst, src, src->map);
}
/* Perform a bitwise OR of miniflow 'src' flow data specified in 'subset' with
* the equivalent fields in 'dst', storing the result in 'dst'. 'subset' must
* be a subset of 'src's map.
*
* 'dst' is accessed as an array of uint64_t units.  Values are read from
* 'src' one after another, starting at miniflow_get_values(src), one value
* per index visited in 'subset'. */
static inline void
flow_union_with_miniflow_subset(struct flow *dst, const struct miniflow *src,
struct flowmap subset)
{
uint64_t *dst_u64 = (uint64_t *) dst;
const uint64_t *p = miniflow_get_values(src);
map_t map;
// Walk every map_t word of 'subset'.
FLOWMAP_FOR_EACH_MAP (map, subset) {
size_t idx;
// Visit the index of every non-zero bit in the current word and OR the
// next stored value into the matching unit of 'dst'.
MAP_FOR_EACH_INDEX(idx, map) {
dst_u64[idx] |= *p++;
}
// Move the base to the units covered by the next map_t word.
dst_u64 += MAP_T_BITS;
}
}