From the preceding analysis, user space now holds the netlink messages that the kernel sent up for packets that failed to match any flow in the kernel datapath; the next step is to process these upcalls in batches.
This handling breaks down into three stages: 1) build the set of flow_miss entries from the dpif_upcalls and fill in their fields; 2) construct the datapath actions; 3) execute each flow_miss_op->dpif_op, i.e. talk to the kernel.
The data structures involved in upcall handling are:
A flow_miss batches packets that share the same flow, which should perform better than handling them one by one: the structure queues up the datapath-interface-related data, each flow_miss corresponds to one or more packets to be sent, and it may also lead to a flow being installed in the dpif. (The hash lookup that drives this batching is sketched after the struct.)
/* So far we only batch the operations that affect flow setup time the most.
* It's possible to batch more than that, but the benefit might be minimal. */
struct flow_miss {
struct hmap_node hmap_node;
struct flow flow; // the flow shared by all packets in this batch;
enum odp_key_fitness key_fitness;
const struct nlattr *key;
size_t key_len;
ovs_be16 initial_tci;
struct list packets; // all packets that have this flow;
enum dpif_upcall_type upcall_type;
};
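The batching above depends on looking up an existing flow_miss with an identical flow in the to-do hmap. A minimal sketch of what flow_miss_find() (ofproto/ofproto-dpif.c) has to do, assuming the iteration macro from lib/hmap.h; details may differ slightly from the real source:
/* Find the flow_miss in 'todo' whose flow is identical to 'flow', if any.
 * Packets that merely collide in the hash bucket are filtered out by
 * flow_equal(), so only packets with exactly the same flow get batched. */
static struct flow_miss *
flow_miss_find(struct hmap *todo, const struct flow *flow, uint32_t hash)
{
    struct flow_miss *miss;

    HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) {
        if (flow_equal(&miss->flow, flow)) {
            return miss;
        }
    }
    return NULL;
}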
The enum odp_key_fitness describes how well the flow key coming up from the kernel (the sequence of netlink attributes) matches what user space expects; how that judgement is made is sketched after the enum.
/* These values are arranged so that greater values are "more important" than lesser ones. In particular, a single flow key can fit the descriptions for
* both ODP_FIT_TOO_LITTLE and ODP_FIT_TOO_MUCH. Such a key is treated as ODP_FIT_TOO_LITTLE. */
enum odp_key_fitness {
ODP_FIT_PERFECT, /* The key had exactly the fields we expect. */
ODP_FIT_TOO_MUCH, /* The key had fields we don't understand. */
ODP_FIT_TOO_LITTLE, /* The key lacked fields we expected to see. */
ODP_FIT_ERROR, /* The key was invalid. */
};
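As for how the fitness is judged: odp_flow_key_to_flow() (shown further down) builds a bitmap of the attributes actually present in the key ('present_attrs') and a bitmap of the attributes the parsed flow implies should be present ('expected_attrs'), then compares the two. A hedged sketch of that final comparison; the helper name check_expectations and the exact handling are assumptions about lib/odp-util.c:
/* Sketch: derive an ODP_FIT_* value from the two attribute bitmaps.
 * The error logging done by the real code is omitted here. */
static enum odp_key_fitness
check_expectations(uint64_t present_attrs, int out_of_range_attr,
                   uint64_t expected_attrs)
{
    /* An attribute we expected is missing: the key tells us too little. */
    if (expected_attrs & ~present_attrs) {
        return ODP_FIT_TOO_LITTLE;
    }

    /* An attribute we did not expect (or could not even represent in the
     * bitmap) is present: the key tells us more than we understand. */
    if ((present_attrs & ~expected_attrs) || out_of_range_attr) {
        return ODP_FIT_TOO_MUCH;
    }

    return ODP_FIT_PERFECT;
}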
struct flow_miss_op {
struct dpif_op dpif_op; // identifies the operation type and carries its arguments;
struct subfacet *subfacet; // subfacet this op belongs to, giving access to the flow, rule, etc.;
void *garbage; /* Pointer to pass to free(), NULL if none. */
uint64_t stub[1024 / 8]; /* Temporary buffer. */
};
----lib/dpif.h
struct dpif_op {
enum dpif_op_type type;
int error;
union {
struct dpif_flow_put flow_put;
struct dpif_flow_del flow_del;
struct dpif_execute execute;
} u;
};
/* Operation batching interface.*/
enum dpif_op_type {
DPIF_OP_FLOW_PUT = 1,
DPIF_OP_FLOW_DEL,
DPIF_OP_EXECUTE,
};
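A minimal usage sketch of this batching interface, assuming the field layouts of struct dpif_flow_put and struct dpif_execute in lib/dpif.h; the wrapper name install_and_execute is hypothetical:
/* Sketch: batch one flow installation and one packet execution in a single
 * dpif_operate() call; each op reports its own result in 'error'. */
static int
install_and_execute(struct dpif *dpif,
                    const struct nlattr *key, size_t key_len,
                    const struct nlattr *actions, size_t actions_len,
                    const struct ofpbuf *packet)
{
    struct dpif_op put_op, exec_op;
    struct dpif_op *ops[2];

    put_op.type = DPIF_OP_FLOW_PUT;
    put_op.u.flow_put.flags = DPIF_FP_CREATE;
    put_op.u.flow_put.key = key;                 /* nlattrs describing the flow */
    put_op.u.flow_put.key_len = key_len;
    put_op.u.flow_put.actions = actions;         /* nlattrs describing the actions */
    put_op.u.flow_put.actions_len = actions_len;
    put_op.u.flow_put.stats = NULL;

    exec_op.type = DPIF_OP_EXECUTE;
    exec_op.u.execute.key = key;
    exec_op.u.execute.key_len = key_len;
    exec_op.u.execute.actions = actions;
    exec_op.u.execute.actions_len = actions_len;
    exec_op.u.execute.packet = packet;

    ops[0] = &put_op;
    ops[1] = &exec_op;
    dpif_operate(dpif, ops, 2);                  /* one batched round trip to the kernel */

    return put_op.error ? put_op.error : exec_op.error;
}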
The main framework:
static void handle_miss_upcalls(struct ofproto_dpif *ofproto, struct dpif_upcall *upcalls, size_t n_upcalls) {
struct dpif_upcall *upcall;
struct flow_miss *miss;
struct flow_miss misses[FLOW_MISS_MAX_BATCH]; // FLOW_MISS_MAX_BATCH is 50
struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
struct hmap todo;
int n_misses;
size_t n_ops;
size_t i;
//Construct a to-do list: extract the flow from each packet and gather the packets that share an identical flow into one "flow_miss", so that they can be processed together.
hmap_init(&todo);
n_misses = 0;
for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) { // iterate over the upcalls;
struct flow_miss *miss = &misses[n_misses];
struct flow_miss *existing_miss;
struct flow flow;
uint32_t hash;
//Much like odp_flow_key_to_flow(), ofproto_dpif_extract_flow_key() (ofproto/ofproto-dpif.c) converts the key_len bytes
//of OVS_KEY_ATTR_* attributes in 'key' into a struct flow and returns an ODP_FIT_* value describing how well upcall->key
//matches what we expect; on return the flow structure has been filled in.
miss->key_fitness = ofproto_dpif_extract_flow_key( ofproto, upcall->key, upcall->key_len, &flow, &miss->initial_tci, upcall->packet);
if (miss->key_fitness == ODP_FIT_ERROR) {
continue;
}
//Fill in miss->flow from the packet itself; this also sets some pointer fields inside 'packet' (l2/l3/l4/l7).
flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark, &flow.tunnel, flow.in_port, &miss->flow);
//Add new flows to the to-do list (keyed by a hash over struct flow_miss's flow).
hash = flow_hash(&miss->flow, 0);
existing_miss = flow_miss_find(&todo, &miss->flow, hash); // do not insert duplicates;
if (!existing_miss) {
hmap_insert(&todo, &miss->hmap_node, hash);
miss->key = upcall->key;
miss->key_len = upcall->key_len;
miss->upcall_type = upcall->type;
list_init(&miss->packets);
n_misses++;
} else {
miss = existing_miss;
}
list_push_back(&miss->packets, &upcall->packet->list_node);
//Append the packet belonging to this upcall to the flow_miss's packet list (see Figure 3).
}
//---------------------以上是第一个阶段--------------------------
//Then process each entry in the to-do list, deciding for each flow whether it should be cached as a facet, and call handle_flow_miss_without_facet() or handle_flow_miss_with_facet() accordingly (sketched after this function).
n_ops = 0;
HMAP_FOR_EACH (miss, hmap_node, &todo) {
handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
}
assert(n_ops <= ARRAY_SIZE(flow_miss_ops));
//----------------------第二个阶段------------------------------
/* Execute batch. */
for (i = 0; i < n_ops; i++) {
dpif_ops[i] = &flow_miss_ops[i].dpif_op;
}
dpif_operate(ofproto->dpif, dpif_ops, n_ops);
//For the Linux datapath this ends up calling dpif_linux_operate(), which talks to the kernel over a netlink socket.
/* Free memory and update facets. */
for (i = 0; i < n_ops; i++) {
struct flow_miss_op *op = &flow_miss_ops[i];
switch (op->dpif_op.type) {
case DPIF_OP_EXECUTE:
break;
case DPIF_OP_FLOW_PUT:
if (!op->dpif_op.error) {
op->subfacet->path = subfacet_want_path(op->subfacet->slow);
}
break;
case DPIF_OP_FLOW_DEL:
NOT_REACHED();
}
free(op->garbage);
}
hmap_destroy(&todo);
}
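For reference, a simplified sketch of the per-miss dispatch that handle_flow_miss() (ofproto/ofproto-dpif.c) performs in the second stage; the helper names appear in that file, but the signatures here are simplified:
/* Sketch: decide whether this flow is worth caching as a facet and emit
 * the corresponding flow_miss_ops.  Signatures are simplified. */
static void
handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss,
                 struct flow_miss_op *ops, size_t *n_ops)
{
    struct facet *facet = facet_lookup_valid(ofproto, &miss->flow);

    if (!facet) {
        struct rule_dpif *rule = rule_dpif_lookup(ofproto, &miss->flow);

        /* Flows not worth caching only get DPIF_OP_EXECUTE ops for the
         * packets queued on 'miss'. */
        if (!flow_miss_should_make_facet(ofproto, miss)) {
            handle_flow_miss_without_facet(miss, rule, ops, n_ops);
            return;
        }
        facet = facet_create(rule, &miss->flow);
    }

    /* Translate the actions once, add a DPIF_OP_FLOW_PUT to install the
     * datapath flow, plus DPIF_OP_EXECUTE ops for the queued packets. */
    handle_flow_miss_with_facet(miss, facet, ops, n_ops);
}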
-------ofproto/ofproto-dpif.c
Similar in purpose to odp_flow_key_to_flow(), this function parses the key_len bytes of netlink attributes in 'key' into a struct flow:
static enum odp_key_fitness ofproto_dpif_extract_flow_key(const struct ofproto_dpif *ofproto,
const struct nlattr *key, size_t key_len,
struct flow *flow, ovs_be16 *initial_tci,
struct ofpbuf *packet)
{
enum odp_key_fitness fitness;
fitness = odp_flow_key_to_flow(key, key_len, flow);
if (fitness == ODP_FIT_ERROR) {
return fitness;
}
*initial_tci = flow->vlan_tci;
//vsp_adjust_flow() (ofproto/ofproto-dpif.c): at this point 'flow' represents a packet received on 'ofproto'.  It checks
//whether flow->in_port is a Linux VLAN device; if so, it rewrites in_port to the device that really carries the VLAN,
//sets vlan_tci to the VLAN VID, and returns true.  In the usual case VLAN splinters are not configured, so it changes
//nothing and returns false.
if (vsp_adjust_flow(ofproto, flow)) {
if (packet) {
/* Make the packet resemble the flow, so that it gets sent to an
* OpenFlow controller properly, so that it looks correct for
* sFlow, and so that flow_extract() will get the correct vlan_tci
* if it is called on 'packet'.
*
* The allocated space inside 'packet' probably also contains
* 'key', that is, both 'packet' and 'key' are probably part of a
* struct dpif_upcall (see the large comment on that structure
* definition), so pushing data on 'packet' is in general not a
* good idea since it could overwrite 'key' or free it as a side
* effect. However, it's OK in this special case because we know
* that 'packet' is inside a Netlink attribute: pushing 4 bytes
* will just overwrite the 4-byte "struct nlattr", which is fine
* since we don't need that header anymore. */
eth_push_vlan(packet, flow->vlan_tci);
}
/* Let the caller know that we can't reproduce 'key' from 'flow'. */
if (fitness == ODP_FIT_PERFECT) { /* vsp_adjust_flow() changed 'flow', so 'key' can no longer be reproduced from it; downgrade a perfect fit (cf. the comment above). */
fitness = ODP_FIT_TOO_MUCH;
}
}
return fitness;
}
-------------lib/odp-util.c
odp_flow_key_to_flow() parses the key_len bytes of netlink attributes in 'key' into a struct flow.  It does not take the packet as an argument because none of the OVS_KEY_ATTR_* attributes currently understood need packet data: the presence of higher-layer attributes can always be inferred from the lower-layer protocol attributes, e.g. if the protocol in OVS_KEY_ATTR_IPV4 or OVS_KEY_ATTR_IPV6 is IPPROTO_TCP, then an OVS_KEY_ATTR_TCP attribute is expected.
enum odp_key_fitness odp_flow_key_to_flow(const struct nlattr *key, size_t key_len, struct flow *flow)
{
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
const struct nlattr *attrs[OVS_KEY_ATTR_MAX + 1];
uint64_t expected_attrs;
uint64_t present_attrs;
int out_of_range_attr;
memset(flow, 0, sizeof *flow);
/* Parse attributes. */
if (!parse_flow_nlattrs(key, key_len, attrs, &present_attrs, &out_of_range_attr)) {
return ODP_FIT_ERROR;
}
expected_attrs = 0;
/* Metadata. */
if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_PRIORITY)) {
flow->skb_priority = nl_attr_get_u32(attrs[OVS_KEY_ATTR_PRIORITY]);
expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_PRIORITY;
}
if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK)) {
flow->skb_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]);
expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK;
}
if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_TUN_ID)) {
flow->tunnel.tun_id = nl_attr_get_be64(attrs[OVS_KEY_ATTR_TUN_ID]);
expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_TUN_ID;
}
if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_IN_PORT)) {
uint32_t in_port = nl_attr_get_u32(attrs[OVS_KEY_ATTR_IN_PORT]);
if (in_port >= UINT16_MAX || in_port >= OFPP_MAX) {
VLOG_ERR_RL(&rl, "in_port %"PRIu32" out of supported range",
in_port);
return ODP_FIT_ERROR;
}
flow->in_port = odp_port_to_ofp_port(in_port);
expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_IN_PORT;
} else {
flow->in_port = OFPP_NONE;
}
/* Ethernet header. */
if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_ETHERNET)) {
const struct ovs_key_ethernet *eth_key;
eth_key = nl_attr_get(attrs[OVS_KEY_ATTR_ETHERNET]);
memcpy(flow->dl_src, eth_key->eth_src, ETH_ADDR_LEN);
memcpy(flow->dl_dst, eth_key->eth_dst, ETH_ADDR_LEN);
}
expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ETHERNET;
/* Get Ethertype or 802.1Q TPID or FLOW_DL_TYPE_NONE. */
if (!parse_ethertype(attrs, present_attrs, &expected_attrs, flow)) {
return ODP_FIT_ERROR;
}
if (flow->dl_type == htons(ETH_TYPE_VLAN)) {
return parse_8021q_onward(attrs, present_attrs, out_of_range_attr, expected_attrs, flow, key, key_len);
}
return parse_l3_onward(attrs, present_attrs, out_of_range_attr, expected_attrs, flow, key, key_len);
}
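For completeness, a sketch of what parse_flow_nlattrs() does to fill 'attrs[]', 'present_attrs' and 'out_of_range_attr'; the duplicate and length checks of the real lib/odp-util.c code are omitted, so treat this as an approximation:
/* Sketch: index every known OVS_KEY_ATTR_* attribute in the key and set
 * its bit in '*present_attrs'; attribute types above OVS_KEY_ATTR_MAX do
 * not fit in the 64-bit bitmap, so one of them is remembered separately. */
static bool
parse_flow_nlattrs(const struct nlattr *key, size_t key_len,
                   const struct nlattr *attrs[], uint64_t *present_attrs,
                   int *out_of_range_attr)
{
    const struct nlattr *nla;
    size_t left;

    *present_attrs = 0;
    *out_of_range_attr = 0;
    NL_ATTR_FOR_EACH (nla, left, key, key_len) {
        uint16_t type = nl_attr_type(nla);

        if (type > OVS_KEY_ATTR_MAX) {
            *out_of_range_attr = type;
        } else {
            attrs[type] = nla;
            *present_attrs |= UINT64_C(1) << type;
        }
    }
    return !left;    /* leftover bytes mean a malformed key (ODP_FIT_ERROR) */
}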
------lib/flow.h
struct flow {
struct flow_tnl tunnel; /* Encapsulating tunnel parameters. */
ovs_be64 metadata; /* OpenFlow Metadata. */
struct in6_addr ipv6_src; /* IPv6 source address. */
struct in6_addr ipv6_dst; /* IPv6 destination address. */
struct in6_addr nd_target; /* IPv6 neighbor discovery (ND) target. */
uint32_t skb_priority; /* Packet priority for QoS. */
uint32_t regs[FLOW_N_REGS]; /* Registers. */
ovs_be32 nw_src; /* IPv4 source address. */
ovs_be32 nw_dst; /* IPv4 destination address. */
ovs_be32 ipv6_label; /* IPv6 flow label. */
uint16_t in_port; /* OpenFlow port number of input port. */
uint32_t skb_mark; /* Packet mark. */
ovs_be16 vlan_tci; /* If 802.1Q, TCI | VLAN_CFI; otherwise 0. */
ovs_be16 dl_type; /* Ethernet frame type. */
ovs_be16 tp_src; /* TCP/UDP source port. */
ovs_be16 tp_dst; /* TCP/UDP destination port. */
uint8_t dl_src[6]; /* Ethernet source address. */
uint8_t dl_dst[6]; /* Ethernet destination address. */
uint8_t nw_proto; /* IP protocol or low 8 bits of ARP opcode. */
uint8_t nw_tos; /* IP ToS (including DSCP and ECN). */
uint8_t arp_sha[6]; /* ARP/ND source hardware address. */
uint8_t arp_tha[6]; /* ARP/ND target hardware address. */
uint8_t nw_ttl; /* IP TTL/Hop Limit. */
uint8_t nw_frag; /* FLOW_FRAG_* flags. */
uint8_t zeros[2]; /* Must be zero. */
};
----------lib/flow.c
flow_extract() (lib/flow.c) fills in the fields of 'flow' from the packet contents plus the given 'skb_priority', 'skb_mark', 'tnl' and 'ofp_in_port' (important).  It also makes the pointers inside 'packet' valid, advancing layer by layer (e.g. packet->l4 = b.data): l2 points at the Ethernet header, l3 at the payload of the Ethernet frame (i.e. after the vlan_header if present), l4 at the IPv4 payload, and l7 at the payload after the TCP, UDP or ICMP header.  As for how the flow obtained earlier from the netlink attributes relates to the flow parsed here from upcall->packet: the key-derived flow only supplies the metadata that cannot be recovered from the packet bytes (skb_priority, skb_mark, tunnel, in_port, initial_tci), which is exactly what handle_miss_upcalls() passes into flow_extract(); miss->flow itself is then rebuilt from the packet.
void flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark,
const struct flow_tnl *tnl, uint16_t ofp_in_port, struct flow *flow)
{
struct ofpbuf b = *packet;
struct eth_header *eth;
COVERAGE_INC(flow_extract);
memset(flow, 0, sizeof *flow);
if (tnl) {
assert(tnl != &flow->tunnel);
flow->tunnel = *tnl;
}
flow->in_port = ofp_in_port;
flow->skb_priority = skb_priority;
flow->skb_mark = skb_mark;
packet->l2 = b.data;
packet->l3 = NULL;
packet->l4 = NULL;
packet->l7 = NULL;
if (b.size < sizeof *eth) {
return;
}
/* Link layer. */
eth = b.data;
memcpy(flow->dl_src, eth->eth_src, ETH_ADDR_LEN);
memcpy(flow->dl_dst, eth->eth_dst, ETH_ADDR_LEN);
/* dl_type, vlan_tci. */
ofpbuf_pull(&b, ETH_ADDR_LEN * 2);
if (eth->eth_type == htons(ETH_TYPE_VLAN)) {
parse_vlan(&b, flow);
}
flow->dl_type = parse_ethertype(&b);
/* Network layer. */
packet->l3 = b.data;
if (flow->dl_type == htons(ETH_TYPE_IP)) {
const struct ip_header *nh = pull_ip(&b);
if (nh) {
packet->l4 = b.data;
flow->nw_src = get_unaligned_be32(&nh->ip_src);
flow->nw_dst = get_unaligned_be32(&nh->ip_dst);
flow->nw_proto = nh->ip_proto;
flow->nw_tos = nh->ip_tos;
if (IP_IS_FRAGMENT(nh->ip_frag_off)) {
flow->nw_frag = FLOW_NW_FRAG_ANY;
if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
flow->nw_frag |= FLOW_NW_FRAG_LATER;
}
}
flow->nw_ttl = nh->ip_ttl;
if (!(nh->ip_frag_off & htons(IP_FRAG_OFF_MASK))) {
if (flow->nw_proto == IPPROTO_TCP) {
parse_tcp(packet, &b, flow);
} else if (flow->nw_proto == IPPROTO_UDP) {
parse_udp(packet, &b, flow);
} else if (flow->nw_proto == IPPROTO_ICMP) {
const struct icmp_header *icmp = pull_icmp(&b);
if (icmp) {
flow->tp_src = htons(icmp->icmp_type);
flow->tp_dst = htons(icmp->icmp_code);
packet->l7 = b.data;
}
}
}
}
} else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
if (parse_ipv6(&b, flow)) {
return;
}
packet->l4 = b.data;
if (flow->nw_proto == IPPROTO_TCP) {
parse_tcp(packet, &b, flow);
} else if (flow->nw_proto == IPPROTO_UDP) {
parse_udp(packet, &b, flow);
} else if (flow->nw_proto == IPPROTO_ICMPV6) {
if (parse_icmpv6(&b, flow)) {
packet->l7 = b.data;
}
}
} else if (flow->dl_type == htons(ETH_TYPE_ARP) ||
flow->dl_type == htons(ETH_TYPE_RARP)) {
const struct arp_eth_header *arp = pull_arp(&b);
if (arp && arp->ar_hrd == htons(1)
&& arp->ar_pro == htons(ETH_TYPE_IP)
&& arp->ar_hln == ETH_ADDR_LEN
&& arp->ar_pln == 4) {
/* We only match on the lower 8 bits of the opcode. */
if (ntohs(arp->ar_op) <= 0xff) {
flow->nw_proto = ntohs(arp->ar_op);
}
flow->nw_src = arp->ar_spa;
flow->nw_dst = arp->ar_tpa;
memcpy(flow->arp_sha, arp->ar_sha, ETH_ADDR_LEN);
memcpy(flow->arp_tha, arp->ar_tha, ETH_ADDR_LEN);
}
}
}
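A hypothetical usage example: running flow_extract() over a raw Ethernet frame held in a caller-supplied buffer ('frame'/'frame_len' are assumed), with no tunnel metadata and default priority/mark:
struct ofpbuf packet;
struct flow flow;

/* Wrap the frame without copying it; ofpbuf_use_const() is from lib/ofpbuf.h. */
ofpbuf_use_const(&packet, frame, frame_len);
flow_extract(&packet, 0 /* skb_priority */, 0 /* skb_mark */,
             NULL /* tnl */, 1 /* ofp_in_port */, &flow);

/* Afterwards packet.l2/l3/l4/l7 point into the frame and 'flow' holds the
 * parsed header fields, e.g. flow.dl_type, flow.nw_proto, flow.tp_dst. */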