How exactly does user space handle a dpif_upcall? (1) Building flow_miss structures for batch processing


          From the preceding analysis, user space now holds the netlink messages that the kernel sent up for packets that failed to match in the kernel flow table. The next step is to process these upcalls in batches; the overall flow is described below.


          This module has three stages: 1) build the set of flow_miss structures from the dpif_upcalls and fill in their fields; 2) construct the datapath actions; 3) execute each flow_miss_op->dpif_op, i.e. communicate with the kernel.

The data structures used to handle upcalls:

A flow_miss batches packets that share the same flow characteristics, which tends to perform better than handling them one by one, so this structure queues up the related datapath-interface data. Each flow_miss corresponds to one or more packets to be sent, and may additionally cause a flow entry to be installed in the dpif.
/* So far we only batch the operations that affect flow setup time the most.
* It's possible to batch more than that, but the benefit might be minimal. */
struct flow_miss {
    struct hmap_node hmap_node;
    struct flow flow;  // the flow characteristics;
    enum odp_key_fitness key_fitness;
    const struct nlattr *key;
    size_t key_len;
    ovs_be16 initial_tci;
    struct list packets;    // all packets that share these flow characteristics;
    enum dpif_upcall_type upcall_type;
};
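
Later stages walk the packets batched in one flow_miss with the list macros from lib/list.h. A minimal sketch of the pattern (process_one() is a hypothetical per-packet handler, not a real OVS function):
struct ofpbuf *packet;

LIST_FOR_EACH (packet, list_node, &miss->packets) {
    /* Every packet on this list produced the same flow key, so one lookup
     * and translation result can be applied to all of them. */
    process_one(packet);    /* hypothetical per-packet handler */
}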

The enum odp_key_fitness describes how well this flow key from the kernel (a sequence of netlink attributes) fits what user space expects (how that is judged is sketched after the enum);
/* These values are arranged so that greater values are "more important" than
 * lesser ones.  In particular, a single flow key can fit the descriptions for
 * both ODP_FIT_TOO_LITTLE and ODP_FIT_TOO_MUCH.  Such a key is treated as
 * ODP_FIT_TOO_LITTLE. */
enum odp_key_fitness {
    ODP_FIT_PERFECT,            /* The key had exactly the fields we expect. */
    ODP_FIT_TOO_MUCH,           /* The key had fields we don't understand. */
    ODP_FIT_TOO_LITTLE,         /* The key lacked fields we expected to see. */
    ODP_FIT_ERROR,              /* The key was invalid. */
};
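
As for how the fit is judged: while parsing, the code tracks a bitmap of attributes actually present and a bitmap of attributes it expected to consume, then compares the two. A sketch modeled on check_expectations() in lib/odp-util.c (the real function also takes key/key_len so it can log the offending key):
static enum odp_key_fitness
check_expectations(uint64_t present_attrs, int out_of_range_attr,
                   uint64_t expected_attrs)
{
    if (expected_attrs & ~present_attrs) {
        return ODP_FIT_TOO_LITTLE;   /* an expected attribute is missing */
    }
    if ((present_attrs & ~expected_attrs) || out_of_range_attr) {
        return ODP_FIT_TOO_MUCH;     /* unrecognized or extra attributes */
    }
    return ODP_FIT_PERFECT;
}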
struct flow_miss_op {
    struct dpif_op dpif_op;    // identifies the operation type, and thus its handler;
    struct subfacet *subfacet;    // the subfacet, from which the flow, rule, and related data can be reached;
    void *garbage;              /* Pointer to pass to free(), NULL if none. */
    uint64_t stub[1024 / 8];    /* Temporary buffer. */
};
----lib/dpif.h
struct dpif_op {
    enum dpif_op_type type;
    int error;
    union {
        struct dpif_flow_put flow_put;
        struct dpif_flow_del flow_del;
        struct dpif_execute execute;
    } u;
};
/* Operation batching interface.*/
enum dpif_op_type {
    DPIF_OP_FLOW_PUT = 1,
    DPIF_OP_FLOW_DEL,
    DPIF_OP_EXECUTE,
};
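
The point of this batching interface is that several datapath operations travel in a single transaction rather than one round trip each. A minimal usage sketch, assuming 'dpif' is an open datapath handle (the union fields are elided here; handle_miss_upcalls() below shows the real pattern):
struct dpif_op put_op, exec_op;
struct dpif_op *ops[2];

put_op.type = DPIF_OP_FLOW_PUT;
/* ... fill put_op.u.flow_put (flow key, actions, flags) ... */
exec_op.type = DPIF_OP_EXECUTE;
/* ... fill exec_op.u.execute (flow key, actions, packet) ... */

ops[0] = &put_op;
ops[1] = &exec_op;
dpif_operate(dpif, ops, 2);    /* one batched trip to the datapath */

/* dpif_operate() records a per-operation result in each op's 'error' field. */
if (put_op.error) {
    /* the flow was not installed */
}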

     The main framework:
static void
handle_miss_upcalls(struct ofproto_dpif *ofproto, struct dpif_upcall *upcalls,
                    size_t n_upcalls)
{
    struct dpif_upcall *upcall;
    struct flow_miss *miss;
    struct flow_miss misses[FLOW_MISS_MAX_BATCH];   // FLOW_MISS_MAX_BATCH is 50
    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
    struct hmap todo;
    int n_misses;
    size_t n_ops;
    size_t i;

    // Build a to-do list: extract the flow from each packet, then gather packets
    // with the same flow characteristics into one "flow_miss" so that they can be handled together.
    hmap_init(&todo);
    n_misses = 0;
    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {  // iterate over the upcalls;
        struct flow_miss *miss = &misses[n_misses];
        struct flow_miss *existing_miss;
        struct flow flow;
        uint32_t hash;

        // Like odp_flow_key_to_flow(), ofproto_dpif_extract_flow_key() (ofproto/ofproto-dpif.c)
        // converts the key_len bytes of OVS_KEY_ATTR_* attributes in 'key' into a struct flow,
        // returning an ODP_FIT_* value that says how well upcall->key fits what we expect;
        // on return the flow structure has been filled in.
        miss->key_fitness = ofproto_dpif_extract_flow_key(ofproto, upcall->key, upcall->key_len,
                                                          &flow, &miss->initial_tci, upcall->packet);
        if (miss->key_fitness == ODP_FIT_ERROR) {
            continue;
        }
        // build miss->flow; this also sets some of the layer pointers inside the packet;
        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark, &flow.tunnel, flow.in_port, &miss->flow);

        // Add new flows to the todo list, keyed by a hash of the flow inside struct flow_miss.
        hash = flow_hash(&miss->flow, 0);
        existing_miss = flow_miss_find(&todo, &miss->flow, hash);  // avoid duplicate entries;
        if (!existing_miss) {
            hmap_insert(&todo, &miss->hmap_node, hash);
            miss->key = upcall->key;
            miss->key_len = upcall->key_len;
            miss->upcall_type = upcall->type;
            list_init(&miss->packets);

            n_misses++;
        } else {
            miss = existing_miss;
        }
        list_push_back(&miss->packets, &upcall->packet->list_node);
        // append the packet for this upcall to this flow_miss's packet list (see Figure 3);
    }
     //--------------------- end of stage 1 --------------------------
    // Then, for the packets on the todo list, check whether each flow fully matches the flow table
    // and call handle_flow_miss_without_facet() or handle_flow_miss_with_facet() accordingly;
    n_ops = 0;
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
    }
    assert(n_ops <= ARRAY_SIZE(flow_miss_ops));
      //---------------------- stage 2 ------------------------------
    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
     // this ends up calling dpif_linux_operate(), which talks to the kernel over a netlink socket;
    /* Free memory and update facets. */
    for (i = 0; i < n_ops; i++) {
        struct flow_miss_op *op = &flow_miss_ops[i];

        switch (op->dpif_op.type) {
        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_PUT:
            if (!op->dpif_op.error) {
                op->subfacet->path = subfacet_want_path(op->subfacet->slow);
            }
            break;

        case DPIF_OP_FLOW_DEL:
            NOT_REACHED();
        }

        free(op->garbage);
    }
    hmap_destroy(&todo);
}
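
The deduplication in stage 1 relies on flow_miss_find(), which is not shown above. A sketch consistent with its use here: walk the hmap bucket for 'hash' and return the existing flow_miss with an equal flow, if any:
static struct flow_miss *
flow_miss_find(struct hmap *todo, const struct flow *flow, uint32_t hash)
{
    struct flow_miss *miss;

    HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) {
        if (flow_equal(&miss->flow, flow)) {
            return miss;
        }
    }
    return NULL;
}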

-------ofproto/ofproto-dpif.c
     Similar in purpose to odp_flow_key_to_flow(), this function parses the key_len bytes of netlink attributes in 'key' into a struct flow:
static enum odp_key_fitness
ofproto_dpif_extract_flow_key(const struct ofproto_dpif *ofproto,
                              const struct nlattr *key, size_t key_len,
                              struct flow *flow, ovs_be16 *initial_tci,
                              struct ofpbuf *packet)
{
    enum odp_key_fitness fitness;

    fitness = odp_flow_key_to_flow(key, key_len, flow);
    if (fitness == ODP_FIT_ERROR) {
        return fitness;
    }
    *initial_tci = flow->vlan_tci;
      // (ofproto/ofproto-dpif.c) At this point 'flow' represents a packet received on 'ofproto'.
      // vsp_adjust_flow() checks whether flow->in_port refers to a Linux VLAN device; if so, it
      // rewrites in_port to the device that actually carries the VLAN, sets vlan_tci to the VLAN
      // VID, and returns true. In the usual case no VLAN splinters are configured, so it changes
      // nothing and returns false.
    if (vsp_adjust_flow(ofproto, flow)) {
        if (packet) {
            /* Make the packet resemble the flow, so that it gets sent to an
             * OpenFlow controller properly, so that it looks correct for
             * sFlow, and so that flow_extract() will get the correct vlan_tci
             * if it is called on 'packet'.
             *
             * The allocated space inside 'packet' probably also contains
             * 'key', that is, both 'packet' and 'key' are probably part of a
             * struct dpif_upcall (see the large comment on that structure
             * definition), so pushing data on 'packet' is in general not a
             * good idea since it could overwrite 'key' or free it as a side
             * effect.  However, it's OK in this special case because we know
             * that 'packet' is inside a Netlink attribute: pushing 4 bytes
             * will just overwrite the 4-byte "struct nlattr", which is fine
             * since we don't need that header anymore. */
            eth_push_vlan(packet, flow->vlan_tci);
        }

        /* Let the caller know that we can't reproduce 'key' from 'flow'. */
        if (fitness == ODP_FIT_PERFECT) {     // 'flow' was just modified, so it no longer reproduces 'key' exactly; downgrade the fit
            fitness = ODP_FIT_TOO_MUCH;
        }
    }

    return fitness;
}

-------------lib/odp-util.c
     Parses the key_len bytes of netlink attributes in 'key' into a struct flow. No packet is taken as a parameter, because none of the OVS_KEY_ATTR_* attributes currently understood need packet data. The attributes that must be present can always be inferred from the lower-layer protocol attributes: for example, if the protocol value in OVS_KEY_ATTR_IPV4 or OVS_KEY_ATTR_IPV6 is IPPROTO_TCP, then an OVS_KEY_ATTR_TCP attribute should appear as well.
enum odp_key_fitness
odp_flow_key_to_flow(const struct nlattr *key, size_t key_len, struct flow *flow)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
    const struct nlattr *attrs[OVS_KEY_ATTR_MAX + 1];
    uint64_t expected_attrs;
    uint64_t present_attrs;
    int out_of_range_attr;

    memset(flow, 0, sizeof *flow);

    /* Parse attributes. */
    if (!parse_flow_nlattrs(key, key_len, attrs, &present_attrs, &out_of_range_attr)) {
        return ODP_FIT_ERROR;
    }
    expected_attrs = 0;

    /* Metadata. */
    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_PRIORITY)) {
        flow->skb_priority = nl_attr_get_u32(attrs[OVS_KEY_ATTR_PRIORITY]);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_PRIORITY;
    }

    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK)) {
        flow->skb_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK;
    }

    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_TUN_ID)) {
        flow->tunnel.tun_id = nl_attr_get_be64(attrs[OVS_KEY_ATTR_TUN_ID]);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_TUN_ID;
    }

    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_IN_PORT)) {
        uint32_t in_port = nl_attr_get_u32(attrs[OVS_KEY_ATTR_IN_PORT]);
        if (in_port >= UINT16_MAX || in_port >= OFPP_MAX) {
            VLOG_ERR_RL(&rl, "in_port %"PRIu32" out of supported range",
                        in_port);
            return ODP_FIT_ERROR;
        }
        flow->in_port = odp_port_to_ofp_port(in_port);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_IN_PORT;
    } else {
        flow->in_port = OFPP_NONE;
    }

    /* Ethernet header. */
    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_ETHERNET)) {
        const struct ovs_key_ethernet *eth_key;

        eth_key = nl_attr_get(attrs[OVS_KEY_ATTR_ETHERNET]);
        memcpy(flow->dl_src, eth_key->eth_src, ETH_ADDR_LEN);
        memcpy(flow->dl_dst, eth_key->eth_dst, ETH_ADDR_LEN);
    }
    expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ETHERNET;

    /* Get Ethertype or 802.1Q TPID or FLOW_DL_TYPE_NONE. */
    if (!parse_ethertype(attrs, present_attrs, &expected_attrs, flow)) {
        return ODP_FIT_ERROR;
    }

    if (flow->dl_type == htons(ETH_TYPE_VLAN)) {
        return parse_8021q_onward(attrs, present_attrs, out_of_range_attr, expected_attrs, flow, key, key_len);
    }
    return parse_l3_onward(attrs, present_attrs, out_of_range_attr, expected_attrs, flow, key, key_len);
}
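
parse_flow_nlattrs(), called at the top, is what fills in 'present_attrs' and 'out_of_range_attr': it walks the netlink attributes, rejects duplicates, records any attribute type above OVS_KEY_ATTR_MAX as out of range, and indexes the rest by type. A sketch of the core loop (the real function also validates each attribute's payload length against odp_flow_key_attr_len()):
static bool
parse_flow_nlattrs(const struct nlattr *key, size_t key_len,
                   const struct nlattr *attrs[], uint64_t *present_attrsp,
                   int *out_of_range_attrp)
{
    const struct nlattr *nla;
    uint64_t present_attrs = 0;
    size_t left;

    *out_of_range_attrp = 0;
    NL_ATTR_FOR_EACH (nla, left, key, key_len) {
        uint16_t type = nl_attr_type(nla);

        if (type > OVS_KEY_ATTR_MAX) {
            *out_of_range_attrp = type;     /* attribute we do not know at all */
        } else {
            if (present_attrs & (UINT64_C(1) << type)) {
                return false;               /* duplicate attribute: invalid key */
            }
            present_attrs |= UINT64_C(1) << type;
            attrs[type] = nla;
        }
    }

    *present_attrsp = present_attrs;
    return true;
}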

------lib/flow.h
struct flow {
    struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. */
    ovs_be64 metadata;          /* OpenFlow Metadata. */
    struct in6_addr ipv6_src;   /* IPv6 source address. */
    struct in6_addr ipv6_dst;   /* IPv6 destination address. */
    struct in6_addr nd_target;  /* IPv6 neighbor discovery (ND) target. */
    uint32_t skb_priority;      /* Packet priority for QoS. */
    uint32_t regs[FLOW_N_REGS]; /* Registers. */
    ovs_be32 nw_src;            /* IPv4 source address. */
    ovs_be32 nw_dst;            /* IPv4 destination address. */
    ovs_be32 ipv6_label;        /* IPv6 flow label. */
    uint16_t in_port;           /* OpenFlow port number of input port. */
    uint32_t skb_mark;          /* Packet mark. */
    ovs_be16 vlan_tci;          /* If 802.1Q, TCI | VLAN_CFI; otherwise 0. */
    ovs_be16 dl_type;           /* Ethernet frame type. */
    ovs_be16 tp_src;            /* TCP/UDP source port. */
    ovs_be16 tp_dst;            /* TCP/UDP destination port. */
    uint8_t dl_src[6];          /* Ethernet source address. */
    uint8_t dl_dst[6];          /* Ethernet destination address. */
    uint8_t nw_proto;           /* IP protocol or low 8 bits of ARP opcode. */
    uint8_t nw_tos;             /* IP ToS (including DSCP and ECN). */
    uint8_t arp_sha[6];         /* ARP/ND source hardware address. */
    uint8_t arp_tha[6];         /* ARP/ND target hardware address. */
    uint8_t nw_ttl;             /* IP TTL/Hop Limit. */
    uint8_t nw_frag;            /* FLOW_FRAG_* flags. */
    uint8_t zeros[2];           /* Must be zero. */
};
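
Note the trailing 'zeros[2]': struct flow is laid out without padding holes precisely so that it can be hashed and compared as raw memory. flow_hash(), which handle_miss_upcalls() uses to key the todo hmap, is just a whole-struct hash; a sketch matching the inline helper in lib/flow.h (treat the exact hash function as an assumption of this OVS version):
static inline uint32_t
flow_hash(const struct flow *flow, uint32_t basis)
{
    /* Safe only because struct flow has no uninitialized padding;
     * 'zeros' is explicit padding and must be zero. */
    return hash_words((const uint32_t *) flow, sizeof *flow / 4, basis);
}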

----------lib/flow.c
      flow_extract() (lib/flow.c) fills in the fields of 'flow' from the packet together with 'skb_priority', 'skb_mark', 'tnl', and 'ofp_in_port' (important). It also makes the layer pointers in the packet valid as it works its way down (e.g. packet->l4 = b.data): l2 points at the Ethernet header, l3 at the Ethernet frame's payload (just past the vlan_header, if there is one), l4 at the IPv4 payload, and l7 at the payload following the TCP, UDP, or ICMP header. As for how the flow obtained earlier from the netlink attributes relates to the flow extracted here from upcall->packet: the key-derived flow supplies the metadata (skb_priority, skb_mark, tunnel, in_port) that is passed into flow_extract(), while the protocol header fields are re-derived from the packet itself, so the two describe the same flow.
void
flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark,
             const struct flow_tnl *tnl, uint16_t ofp_in_port,
             struct flow *flow)
{
    struct ofpbuf b = *packet;
    struct eth_header *eth;

    COVERAGE_INC(flow_extract);

    memset(flow, 0, sizeof *flow);

    if (tnl) {
        assert(tnl != &flow->tunnel);
        flow->tunnel = *tnl;
    }
    flow->in_port = ofp_in_port;
    flow->skb_priority = skb_priority;
    flow->skb_mark = skb_mark;

    packet->l2 = b.data;
    packet->l3 = NULL;
    packet->l4 = NULL;
    packet->l7 = NULL;

    if (b.size < sizeof *eth) {
        return;
    }

    /* Link layer. */
    eth = b.data;
    memcpy(flow->dl_src, eth->eth_src, ETH_ADDR_LEN);
    memcpy(flow->dl_dst, eth->eth_dst, ETH_ADDR_LEN);

    /* dl_type, vlan_tci. */
    ofpbuf_pull(&b, ETH_ADDR_LEN * 2);
    if (eth->eth_type == htons(ETH_TYPE_VLAN)) {
        parse_vlan(&b, flow);
    }
    flow->dl_type = parse_ethertype(&b);

    /* Network layer. */
    packet->l3 = b.data;
    if (flow->dl_type == htons(ETH_TYPE_IP)) {
        const struct ip_header *nh = pull_ip(&b);
        if (nh) {
            packet->l4 = b.data;

            flow->nw_src = get_unaligned_be32(&nh->ip_src);
            flow->nw_dst = get_unaligned_be32(&nh->ip_dst);
            flow->nw_proto = nh->ip_proto;

            flow->nw_tos = nh->ip_tos;
            if (IP_IS_FRAGMENT(nh->ip_frag_off)) {
                flow->nw_frag = FLOW_NW_FRAG_ANY;
                if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
                    flow->nw_frag |= FLOW_NW_FRAG_LATER;
                }
            }
            flow->nw_ttl = nh->ip_ttl;

            if (!(nh->ip_frag_off & htons(IP_FRAG_OFF_MASK))) {
                if (flow->nw_proto == IPPROTO_TCP) {
                    parse_tcp(packet, &b, flow);
                } else if (flow->nw_proto == IPPROTO_UDP) {
                    parse_udp(packet, &b, flow);
                } else if (flow->nw_proto == IPPROTO_ICMP) {
                    const struct icmp_header *icmp = pull_icmp(&b);
                    if (icmp) {
                        flow->tp_src = htons(icmp->icmp_type);
                        flow->tp_dst = htons(icmp->icmp_code);
                        packet->l7 = b.data;
                    }
                }
            }
        }
    } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
        if (parse_ipv6(&b, flow)) {
            return;
        }

        packet->l4 = b.data;
        if (flow->nw_proto == IPPROTO_TCP) {
            parse_tcp(packet, &b, flow);
        } else if (flow->nw_proto == IPPROTO_UDP) {
            parse_udp(packet, &b, flow);
        } else if (flow->nw_proto == IPPROTO_ICMPV6) {
            if (parse_icmpv6(&b, flow)) {
                packet->l7 = b.data;
            }
        }
    } else if (flow->dl_type == htons(ETH_TYPE_ARP) ||
               flow->dl_type == htons(ETH_TYPE_RARP)) {
        const struct arp_eth_header *arp = pull_arp(&b);
        if (arp && arp->ar_hrd == htons(1)
            && arp->ar_pro == htons(ETH_TYPE_IP)
            && arp->ar_hln == ETH_ADDR_LEN
            && arp->ar_pln == 4) {
            /* We only match on the lower 8 bits of the opcode. */
            if (ntohs(arp->ar_op) <= 0xff) {
                flow->nw_proto = ntohs(arp->ar_op);
            }

            flow->nw_src = arp->ar_spa;
            flow->nw_dst = arp->ar_tpa;
            memcpy(flow->arp_sha, arp->ar_sha, ETH_ADDR_LEN);
            memcpy(flow->arp_tha, arp->ar_tha, ETH_ADDR_LEN);
        }
    }
}
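
To tie the pieces together, a small hedged usage example: after flow_extract() the caller can branch on the parsed fields, and the packet's layer pointers point directly at each header (the variables here are illustrative):
struct flow flow;

/* No tunnel metadata and no known ingress port; parse headers only. */
flow_extract(packet, 0, 0, NULL, OFPP_NONE, &flow);

if (flow.dl_type == htons(ETH_TYPE_IP) && flow.nw_proto == IPPROTO_TCP) {
    struct tcp_header *tcp = packet->l4;   /* l4 points at the TCP header */
    ovs_be16 src = tcp->tcp_src;           /* same value as flow.tp_src */
}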

