DPDK flow_classify 源码阅读

代码部分

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#include <stdint.h>
#include <inttypes.h>
#include <getopt.h>

#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_flow.h>
#include <rte_flow_classify.h>
#include <rte_table_acl.h>

/* Descriptor counts passed to rte_eth_rx_queue_setup()/rte_eth_tx_queue_setup(). */
#define RX_RING_SIZE 1024
#define TX_RING_SIZE 1024

/* main() creates the mbuf pool with NUM_MBUFS * nb_ports elements. */
#define NUM_MBUFS 8191
/* Cache size argument given to rte_pktmbuf_pool_create(). */
#define MBUF_CACHE_SIZE 250
/* Upper bound on packets fetched by one rte_eth_rx_burst() call. */
#define BURST_SIZE 32

/* Capacity of the rules[] handle array. */
#define MAX_NUM_CLASSIFY 30
/* n_rules of the ACL table; add_rules() stops reading at one less than this. */
#define FLOW_CLASSIFY_MAX_RULE_NUM 91
/* Largest priority value accepted by parse_ipv4_5tuple_rule(). */
#define FLOW_CLASSIFY_MAX_PRIORITY 8
/* Not referenced in the code shown here. */
#define FLOW_CLASSIFIER_NAME_SIZE 64

/* A rule-file line whose first character is this is treated as a comment. */
#define COMMENT_LEAD_CHAR   ('#')
/* Name of the long command-line option that carries the rules file path. */
#define OPTION_RULE_IPV4    "rule_ipv4"
/* Application log type and an ERR-level logging shorthand built on RTE_LOG. */
#define RTE_LOGTYPE_FLOW_CLASSIFY   RTE_LOGTYPE_USER3
#define flow_classify_log(format, ...) \
        RTE_LOG(ERR, FLOW_CLASSIFY, format, ##__VA_ARGS__)

/*
 * Split a 32-bit value into four bytes, most significant byte first.
 *
 * ip         - 32-bit value to split (evaluated four times, so it must not
 *              have side effects).
 * a, b, c, d - pointers receiving bits 31..24, 23..16, 15..8 and 7..0.
 *
 * Every argument is parenthesized so that compound expressions such as
 * `hi | lo` or `ptrs + 1` expand with the intended precedence.
 */
#define uint32_t_to_char(ip, a, b, c, d) do {\
        *(a) = (unsigned char)(((ip) >> 24) & 0xff);\
        *(b) = (unsigned char)(((ip) >> 16) & 0xff);\
        *(c) = (unsigned char)(((ip) >> 8) & 0xff);\
        *(d) = (unsigned char)((ip) & 0xff);\
    } while (0)

enum {
    CB_FLD_SRC_ADDR,     // 0
    CB_FLD_DST_ADDR,     // 1
    CB_FLD_SRC_PORT,     // 2
    CB_FLD_SRC_PORT_DLM, // 3 
    CB_FLD_SRC_PORT_MASK,// 4 
    CB_FLD_DST_PORT,     // 5 
    CB_FLD_DST_PORT_DLM, // 6
    CB_FLD_DST_PORT_MASK,// 7 
    CB_FLD_PROTO,        // 8
    CB_FLD_PRIORITY,     // 9 
    CB_FLD_NUM,          // 10 
};

/* Command-line configuration: path of the IPv4 rules file (set by parse_args()). */
static struct{
    const char *rule_ipv4_name;
} parm_config;

/* Token that must separate a port from its mask in a rule line. */
const char cb_port_delim[] = ":";

/* Default port configuration; port_init() copies it before adjusting offloads. */
static const struct rte_eth_conf port_conf_default = {
    .rxmode = {
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .ignore_offload_bitfield = 1,
    },
};

/* Application-side wrapper holding the library's classifier handle. */
struct flow_classifier { 
    struct rte_flow_classifier *cls;
};
/* For the layout of rte_flow_classifier see the DPDK sample/programmer's guide. */

/*
 * Reference copy of the library structure (kept as a comment only; the
 * authoritative definition lives in the DPDK library and may differ
 * between releases):
 *
 * struct rte_flow_classifier {
 *
 *     // classifier parameters, supplied through the params struct at create()
 *     char name[RTE_FLOW_CLASSIFIER_MAX_NAME_SZ];
 *     int socket_id;
 *
 *     // remaining fields are internal
 *     // n-tuple filter, i.e. the match fields of a flow rule
 *     struct rte_eth_ntuple_filter ntuple_filter;
 *
 *     // tables
 *     struct rte_cls_table tables[RTE_FLOW_CLASSIFY_TABLE_MAX];
 *     uint32_t table_mask;
 *     uint32_t num_tables;
 *
 *     uint16_t nb_pkts;
 *     struct rte_flow_classify_table_entry
 *         *entries[RTE_PORT_IN_BURST_SIZE_MAX];
 * } __rte_cache_aligned;
 */

/* Cache-aligned container; main() uses its size when allocating the classifier. */
struct flow_classifier_acl {
    struct flow_classifier cls;
} __rte_cache_aligned;

/* ACL field definitions for IPv4 5 tuple rule */

/* field_index values: position of each field inside an ACL rule. */
enum {
    PROTO_FIELD_IPV4, /* 0: L4 protocol */
    SRC_FIELD_IPV4,   /* 1: source address */
    DST_FIELD_IPV4,   /* 2: destination address */
    SRCP_FIELD_IPV4,  /* 3: source port */
    DSTP_FIELD_IPV4,  /* 4: destination port */
    NUM_FIELDS_IPV4   /* 5: number of fields */
};

/* input_index values: the input group each field is read from. */
enum {
    PROTO_INPUT_IPV4,
    SRC_INPUT_IPV4,
    DST_INPUT_IPV4,
    SRCP_DESTP_INPUT_IPV4 /* shared by the source and destination port fields */
};


/*
 * struct rte_acl_field_def describes one field of an ACL rule. Each field
 * carries five attributes (summarised from the DPDK ACL library guide):
 *   type        - how the field is matched;
 *   size        - size of the field in bytes;
 *   field_index - zero-based position of the field inside a rule
 *                 (0..n-1 for n fields);
 *   input_index - input group of the field: every input field except the
 *                 first must be grouped into 4 consecutive bytes, and this
 *                 index names the group;
 *   offset      - byte offset of the field from the start of the lookup
 *                 buffer.
 */

/*
 * Layout rules for ACL rule definitions (per the DPDK ACL library guide):
 *   1. the first field must be exactly one byte long;
 *   2. all following fields must be grouped into sets of 4 consecutive bytes.
 * The guide gives performance as the reason: the lookup handles the first
 * input byte during flow setup and its unrolled inner loop then consumes
 * 4 input bytes at a time.
 */

/* Five fields; each entry fills in the five attributes described above. */
static struct rte_acl_field_def ipv4_defs[NUM_FIELDS_IPV4] = {
    /* first input field - always one byte long. */
    {
        /* Field types are explained at https://www.cnblogs.com/danxi/p/6650757.html */
        .type = RTE_ACL_FIELD_TYPE_BITMASK,
        .size = sizeof(uint8_t), /* one byte */
        .field_index = PROTO_FIELD_IPV4, /* both indices come from the enums above */
        .input_index = PROTO_INPUT_IPV4,
        .offset = sizeof(struct ether_hdr) +
            offsetof(struct ipv4_hdr, next_proto_id),
    },
    /* next input field (IPv4 source address) - 4 consecutive bytes. */
    {
        /* rte_flow uses a bit mask for IPv4 addresses */
        .type = RTE_ACL_FIELD_TYPE_BITMASK, 
        .size = sizeof(uint32_t),
        .field_index = SRC_FIELD_IPV4,
        .input_index = SRC_INPUT_IPV4,
        .offset = sizeof(struct ether_hdr) +
            offsetof(struct ipv4_hdr, src_addr),
    },
    /* next input field (IPv4 destination address) - 4 consecutive bytes. */
    {
        /* rte_flow uses a bit mask for IPv4 addresses */
        .type = RTE_ACL_FIELD_TYPE_BITMASK,
        .size = sizeof(uint32_t),
        .field_index = DST_FIELD_IPV4,
        .input_index = DST_INPUT_IPV4,
        .offset = sizeof(struct ether_hdr) +
            offsetof(struct ipv4_hdr, dst_addr),
    },
    /*
     * Next 2 fields (src & dst ports) form 4 consecutive bytes.
     * They share the same input index.
     */
    {
        /* rte_flow uses a bit mask for protocol ports */
        .type = RTE_ACL_FIELD_TYPE_BITMASK, 
        .size = sizeof(uint16_t),
        .field_index = SRCP_FIELD_IPV4,
        .input_index = SRCP_DESTP_INPUT_IPV4,
        /* Port offsets are taken from struct tcp_hdr for every protocol. */
        .offset = sizeof(struct ether_hdr) +
            sizeof(struct ipv4_hdr) +
            offsetof(struct tcp_hdr, src_port),
    },
    {
        /* rte_flow uses a bit mask for protocol ports */
        .type = RTE_ACL_FIELD_TYPE_BITMASK,
        .size = sizeof(uint16_t),
        .field_index = DSTP_FIELD_IPV4,
        .input_index = SRCP_DESTP_INPUT_IPV4,
        .offset = sizeof(struct ether_hdr) +
            sizeof(struct ipv4_hdr) +
            offsetof(struct tcp_hdr, dst_port),
    },
};

/* flow classify data */
static int num_classify_rules; /* next free slot in rules[] */
static struct rte_flow_classify_rule *rules[MAX_NUM_CLASSIFY]; /* handles returned by table_entry_add */
static struct rte_flow_classify_ipv4_5tuple_stats ntuple_stats;  /* filled by queries; lcore_main() prints counter1 and ipv4_5tuple.proto */
static struct rte_flow_classify_stats classify_stats = { /* stats argument handed to rte_flow_classifier_query() */
        .stats = (void **)&ntuple_stats
};

/* parameters for rte_flow_classify_validate and
 * rte_flow_classify_table_entry_add functions
 */

/*
 * struct rte_flow_item has four members (see rte_flow.h,
 * http://doc.dpdk.org/api/rte__flow_8h_source.html):
 *   1. type - an enum value naming the item type;
 *   2. spec - pointer to the structure for that item type; may be NULL to
 *             request a broad (non-specific) match, in which case last and
 *             mask must be NULL as well;
 *   3. last - optional pointer to a structure of the same type that defines
 *             an inclusive range;
 *   4. mask - simple bit mask applied before spec and last are interpreted.
 */
static struct rte_flow_item  eth_item = { RTE_FLOW_ITEM_TYPE_ETH,
    0, 0, 0 };
static struct rte_flow_item  end_item = { RTE_FLOW_ITEM_TYPE_END,
    0, 0, 0 };

/* sample actions:
 * "actions count / end"
 */
struct rte_flow_query_count count = { /* counter query structure */
    .reset = 1, /* Reset counters after query */
    .hits_set = 1, /* hits field is enabled */
    .bytes_set = 1, /* bytes field is enabled */
    .hits = 0, /* Number of hits for this rule */
    .bytes = 0, /* Number of bytes through this rule */
};
static struct rte_flow_action count_action = { RTE_FLOW_ACTION_TYPE_COUNT, &count};
static struct rte_flow_action end_action = { RTE_FLOW_ACTION_TYPE_END, 0}; /* COUNT and END are the only actions this program uses */

static struct rte_flow_action actions[2]; 
/*
 * rte_flow actions are described in the rte_flow chapter of the
 * programmer's guide: http://doc.dpdk.org/guides/prog_guide/rte_flow.html
 * The actions array lists what is done when a packet matches the pattern.
 * add_classify_rule() fills it with count_action followed by end_action.
 *
 * NOTE(review): the exact layout of struct rte_flow_action was not checked;
 * presumably the first member is an enum rte_flow_action_type (see
 * http://doc.dpdk.org/api/rte__flow_8h.html#a78f0386e683cfc491462a771df8b971a)
 * and the second points at the action's configuration, here the counter
 * query structure. Confirm against rte_flow.h.
 */


/* sample attributes */
static struct rte_flow_attr attr;
/*
 * struct rte_flow_attr holds the attributes of one flow rule, see
 * http://doc.dpdk.org/api/structrte__flow__attr.html
 * Members as listed there:
 *   uint32_t group        group number
 *   uint32_t priority     priority within the group
 *   uint32_t ingress:1    rule applies to ingress traffic
 *   uint32_t egress:1     rule applies to egress traffic
 *   uint32_t transfer:1   not investigated here
 *   uint32_t reserved:29  reserved, must be zero
 */

/* flow_classify.c: * Based on DPDK skeleton forwarding example. */

/*
 * Initializes a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 *
 * The sequence is the same as in the basic forwarding (skeleton) sample:
 * configure the device, set up one RX and one TX queue, start the port,
 * print its MAC address and enable promiscuous mode.
 *
 * port      - id of the Ethernet port to set up. Declared uint16_t so the
 *             16-bit ids produced by RTE_ETH_FOREACH_DEV in main() are not
 *             truncated.
 * mbuf_pool - pool handed to the RX queue.
 *
 * Returns 0 on success, -1 if the port id is not valid, otherwise the
 * non-zero value returned by the failing DPDK call.
 */
static inline int
port_init(uint16_t port, struct rte_mempool *mbuf_pool)
{
    struct rte_eth_conf port_conf = port_conf_default;
    struct ether_addr addr;
    const uint16_t rx_rings = 1, tx_rings = 1;
    int retval;
    uint16_t q;
    struct rte_eth_dev_info dev_info;
    struct rte_eth_txconf txconf;

    if (!rte_eth_dev_is_valid_port(port))
        return -1;

    /* Request the fast-free TX offload only when the device advertises it. */
    rte_eth_dev_info_get(port, &dev_info);
    if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
        port_conf.txmode.offloads |=
            DEV_TX_OFFLOAD_MBUF_FAST_FREE;

    /* Configure the Ethernet device. */
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0)
        return retval;

    /* Allocate and set up 1 RX queue per Ethernet port. */
    for (q = 0; q < rx_rings; q++) {
        retval = rte_eth_rx_queue_setup(port, q, RX_RING_SIZE,
                rte_eth_dev_socket_id(port), NULL, mbuf_pool);
        if (retval < 0)
            return retval;
    }

    /* Start from the driver's default TX config and apply the port offloads. */
    txconf = dev_info.default_txconf;
    txconf.txq_flags = ETH_TXQ_FLAGS_IGNORE;
    txconf.offloads = port_conf.txmode.offloads;
    /* Allocate and set up 1 TX queue per Ethernet port. */
    for (q = 0; q < tx_rings; q++) {
        retval = rte_eth_tx_queue_setup(port, q, TX_RING_SIZE,
                rte_eth_dev_socket_id(port), &txconf);
        if (retval < 0)
            return retval;
    }

    /* Start the Ethernet port. */
    retval = rte_eth_dev_start(port);
    if (retval < 0)
        return retval;

    /* Display the port MAC address. */
    rte_eth_macaddr_get(port, &addr);
    printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
               " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            port,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

    /* Enable RX in promiscuous mode for the Ethernet device. */
    rte_eth_promiscuous_enable(port);

    return 0;
}

/*
 * The lcore main. This is the main thread that does the work, reading from
 * an input port classifying the packets and writing to an output port.
 *
 * cls_app - application classifier whose table already holds the rules
 *           recorded in rules[].
 *
 * Never returns: it polls all ports until the process is terminated.
 */
static __attribute__((noreturn)) void
lcore_main(struct flow_classifier *cls_app)
{
    uint16_t port;
    int ret;
    int i = 0;

    /*
     * Experiment: delete one rule (index 7) before polling starts.
     * The slot is only deleted when a rule was actually loaded there, and
     * it is cleared after a successful delete so that the polling loop
     * below never queries an entry that has been removed from the table.
     */
    if (rules[7] != NULL) {
        ret = rte_flow_classify_table_entry_delete(cls_app->cls,
                rules[7]);
        if (ret)
            printf("table_entry_delete failed [7] %d\n\n", ret);
        else {
            printf("table_entry_delete succeeded [7]\n\n");
            rules[7] = NULL;
        }
    } else {
        printf("table_entry_delete skipped [7]: no rule loaded\n\n");
    }

    /*
     * Check that the port is on the same NUMA node as the polling thread
     * for best performance.
     */
    RTE_ETH_FOREACH_DEV(port)
        if (rte_eth_dev_socket_id(port) > 0 &&
            rte_eth_dev_socket_id(port) != (int)rte_socket_id()) {
            printf("\n\n");
            printf("WARNING: port %u is on remote NUMA node\n",
                   port);
            printf("to polling thread.\n");
            printf("Performance will not be optimal.\n");
        }
    printf("\nCore %u forwarding packets. ", rte_lcore_id());
    printf("[Ctrl+C to quit]\n");

    /* Run until the application is quit or killed. */
    for (;;) {
        /*
         * Receive packets on a port, classify them and forward them
         * on the paired port.
         * The mapping is 0 -> 1, 1 -> 0, 2 -> 3, 3 -> 2, etc.
         */
        RTE_ETH_FOREACH_DEV(port) {
            /* Get burst of RX packets, from first port of pair. */
            struct rte_mbuf *bufs[BURST_SIZE];
            const uint16_t nb_rx = rte_eth_rx_burst(port, 0,
                    bufs, BURST_SIZE);

            if (unlikely(nb_rx == 0))
                continue;

            /*
             * Query every stored rule against this burst. Arguments of
             * rte_flow_classifier_query(): classifier handle, the mbufs
             * of the burst, number of packets in the burst, the rule to
             * query, and the stats structure to fill.
             */
            for (i = 0; i < MAX_NUM_CLASSIFY; i++) {
                if (rules[i] == NULL)
                    continue;

                ret = rte_flow_classifier_query(
                    cls_app->cls,
                    bufs, nb_rx, rules[i],
                    &classify_stats);
                if (ret) {
                    printf(
                        "rule [%d] query failed ret [%d]\n\n",
                        i, ret);
                } else {
                    /* 0 is treated as "the rule matched in this burst". */
                    printf(
                    "rule[%d] count=%"PRIu64"\n",
                    i, ntuple_stats.counter1);

                    printf("proto = %d\n",
                    ntuple_stats.ipv4_5tuple.proto);
                }
            }

            /* Send burst of TX packets, to second port of pair. */
            const uint16_t nb_tx = rte_eth_tx_burst(port ^ 1, 0,
                    bufs, nb_rx);

            /* Free any unsent packets. */
            if (unlikely(nb_tx < nb_rx)) {
                uint16_t buf;

                for (buf = nb_tx; buf < nb_rx; buf++)
                    rte_pktmbuf_free(bufs[buf]);
            }
        }
    }
}

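/*
 * Parse IPv4 5 tuple rules file, ipv4_rules_file.txt.
 * Expected format:
 * <src_ipv4_addr>'/'<masklen> <space> \
 * <dst_ipv4_addr>'/'<masklen> <space> \
 * <src_port> <space> ":" <src_port_mask> <space> \
 * <dst_port> <space> ":" <dst_port_mask> <space> \
 * <proto>'/'<proto_mask> <space> \
 * <priority>
 */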

/*
 * Parse one unsigned number from the string at *in.
 *
 * in   - in/out cursor; on success it is advanced to the character after
 *        the delimiter, so successive calls walk through e.g. "2.2.2.3/24".
 * fd   - receives the parsed value.
 * base - numeric base handed to strtoul(); 0 selects automatic detection,
 *        so "0xffff" is read as hexadecimal.
 * lim  - largest value accepted.
 * dlm  - character that must directly follow the number ('\0' for the end
 *        of the string).
 *
 * Returns 0 on success and -EINVAL when strtoul() reports an error, when
 * no digits were present, when the number is not followed by dlm, or when
 * it exceeds lim. *fd and *in are left untouched on failure.
 */
static int
get_cb_field(char **in, uint32_t *fd, int base, unsigned long lim,
        char dlm)
{
    unsigned long val;
    char *end;

    errno = 0;
    val = strtoul(*in, &end, base);

    /*
     * end == *in means strtoul() consumed nothing; without this check an
     * empty field (as in "1..2.3") would silently be accepted as 0.
     */
    if (errno != 0 || end == *in || end[0] != dlm || val > lim)
        return -EINVAL;
    *fd = (uint32_t)val;
    *in = end + 1; /* step over the delimiter */
    return 0;
}

/*
 * Parse an IPv4 network written as "a.b.c.d/len", e.g. "2.2.2.3/24".
 *
 * in       - string to parse.
 * addr     - receives the address assembled with IPv4(a, b, c, d).
 * mask_len - receives the prefix length.
 *
 * Each octet may be at most UINT8_MAX and the prefix length at most 32
 * (sizeof(uint32_t) * CHAR_BIT). Returns 0 on success, -EINVAL otherwise.
 */
static int
parse_ipv4_net(char *in, uint32_t *addr, uint32_t *mask_len)
{
    /* Delimiter expected after each of the four octets. */
    static const char octet_dlm[4] = { '.', '.', '.', '/' };
    uint32_t octet[4];
    uint32_t prefix;
    unsigned int k;

    for (k = 0; k < 4; k++) {
        if (get_cb_field(&in, &octet[k], 0, UINT8_MAX,
                octet_dlm[k]) != 0)
            return -EINVAL;
    }

    /* The prefix length is the last token and runs to the end of string. */
    if (get_cb_field(&in, &prefix, 0,
            sizeof(uint32_t) * CHAR_BIT, 0) != 0)
        return -EINVAL;

    *addr = IPv4(octet[0], octet[1], octet[2], octet[3]);
    *mask_len = prefix;
    return 0;
}

/*
 * Convert one line of the rules file into an rte_eth_ntuple_filter.
 *
 * str           - the line; it is modified in place by strtok_r().
 * ntuple_filter - receives src/dst address and prefix length, src/dst port
 *                 and mask, protocol and mask, and priority.
 *
 * A rule occupies one line. Format and resulting token indices in in[]:
 *
 *   #src_ip/len  dst_ip/len  sport : mask     dport : mask     proto/mask prio
 *   2.2.2.3/24   2.2.2.7/24  32    : 0xffff   33    : 0xffff   17/0xff    0
 *   0            1           2     3 4        5     6 7        8          9
 *
 * Note that src_ip_mask / dst_ip_mask are filled with the prefix length
 * here; add_classify_rule() turns them into bit masks later.
 *
 * Returns 0 on success and a negative value when a token is missing,
 * malformed or out of range, or when the priority exceeds
 * FLOW_CLASSIFY_MAX_PRIORITY.
 */
static int
parse_ipv4_5tuple_rule(char *str, struct rte_eth_ntuple_filter *ntuple_filter)
{
    int i, ret;
    char *s, *sp, *in[CB_FLD_NUM];
    static const char *dlm = " \t\n";
    int dim = CB_FLD_NUM;
    uint32_t temp;

    /* Split the line into exactly CB_FLD_NUM whitespace-separated tokens. */
    s = str;
    for (i = 0; i != dim; i++, s = NULL) {
        in[i] = strtok_r(s, dlm, &sp);
        if (in[i] == NULL)
            return -EINVAL;
    }

    /* Source address and prefix length. */
    ret = parse_ipv4_net(in[CB_FLD_SRC_ADDR],
            &ntuple_filter->src_ip,
            &ntuple_filter->src_ip_mask);
    if (ret != 0) {
        flow_classify_log("failed to read source address/mask: %s\n",
            in[CB_FLD_SRC_ADDR]);
        return ret;
    }

    /* Destination address and prefix length. */
    ret = parse_ipv4_net(in[CB_FLD_DST_ADDR],
            &ntuple_filter->dst_ip,
            &ntuple_filter->dst_ip_mask);
    if (ret != 0) {
        /* Fixed: this message used to say "source" for the destination. */
        flow_classify_log(
            "failed to read destination address/mask: %s\n",
            in[CB_FLD_DST_ADDR]);
        return ret;
    }

    /* Source port: must fit in 16 bits. */
    if (get_cb_field(&in[CB_FLD_SRC_PORT], &temp, 0, UINT16_MAX, 0))
        return -EINVAL;
    ntuple_filter->src_port = (uint16_t)temp;

    /* The token between port and mask must be exactly ":". */
    if (strncmp(in[CB_FLD_SRC_PORT_DLM], cb_port_delim,
            sizeof(cb_port_delim)) != 0)
        return -EINVAL;

    /* Source port mask. */
    if (get_cb_field(&in[CB_FLD_SRC_PORT_MASK], &temp, 0, UINT16_MAX, 0))
        return -EINVAL;
    ntuple_filter->src_port_mask = (uint16_t)temp;

    /* Destination port. */
    if (get_cb_field(&in[CB_FLD_DST_PORT], &temp, 0, UINT16_MAX, 0))
        return -EINVAL;
    ntuple_filter->dst_port = (uint16_t)temp;

    if (strncmp(in[CB_FLD_DST_PORT_DLM], cb_port_delim,
            sizeof(cb_port_delim)) != 0)
        return -EINVAL;

    /* Destination port mask. */
    if (get_cb_field(&in[CB_FLD_DST_PORT_MASK], &temp, 0, UINT16_MAX, 0))
        return -EINVAL;
    ntuple_filter->dst_port_mask = (uint16_t)temp;

    /*
     * "proto/mask" is a single token: the first call consumes the
     * protocol up to '/', the second reads the mask from the advanced
     * cursor.
     */
    if (get_cb_field(&in[CB_FLD_PROTO], &temp, 0, UINT8_MAX, '/'))
        return -EINVAL;
    ntuple_filter->proto = (uint8_t)temp;

    if (get_cb_field(&in[CB_FLD_PROTO], &temp, 0, UINT8_MAX, 0))
        return -EINVAL;
    ntuple_filter->proto_mask = (uint8_t)temp;

    /* Priority. */
    if (get_cb_field(&in[CB_FLD_PRIORITY], &temp, 0, UINT16_MAX, 0))
        return -EINVAL;
    ntuple_filter->priority = (uint16_t)temp;
    if (ntuple_filter->priority > FLOW_CLASSIFY_MAX_PRIORITY)
        ret = -EINVAL;

    return ret;
}

/*
 * Bypass comment and empty lines.
 *
 * buff - NUL-terminated line read from the rules file.
 *
 * Returns 1 when the line should be skipped: its first character is
 * COMMENT_LEAD_CHAR, or it contains nothing but whitespace (including the
 * empty string). Returns 0 otherwise. A comment marker that follows
 * leading whitespace is not recognised as a comment.
 */
static inline int
is_bypass_line(char *buff)
{
    int i = 0;

    /* comment line */
    if (buff[0] == COMMENT_LEAD_CHAR)
        return 1;
    /* empty line */
    while (buff[i] != '\0') {
        /*
         * isspace() is only defined for EOF and values representable as
         * unsigned char; a plain char may be negative for bytes >= 0x80.
         */
        if (!isspace((unsigned char)buff[i]))
            return 0;
        i++;
    }
    return 1;
}

/*
 * Convert a prefix length into a 32-bit mask with that many leading
 * (most significant) bits set, e.g. 24 -> 0xFFFFFF00.
 *
 * depth_val - number of leading one bits; values of 32 or more yield
 *             0xFFFFFFFF, 0 yields 0.
 *
 * The computation stays in unsigned arithmetic and never shifts by the
 * full width or by a negative amount, both of which are undefined.
 */
static uint32_t
convert_depth_to_bitmask(uint32_t depth_val)
{
    if (depth_val == 0)
        return 0;
    if (depth_val >= 32)
        return UINT32_MAX;
    /* Clear the low (32 - depth_val) bits. */
    return ~(UINT32_MAX >> depth_val);
}

static int
add_classify_rule(struct rte_eth_ntuple_filter *pattern_ipv4_5tuple,
        struct flow_classifier *cls_app) 
        // 对 rte_flow_classify_table_entry_add() 的一层封装,主要是设定好参数,从rte_eth_ntuple_filter 转换成 flow_item
{
    int ret = -1;
    int key_found;
    struct rte_flow_error error;
    /* rte_flow_item: ACL 规则的详细内容。
    会从最低协议层开始堆叠flow_item来形成一个匹配模式。必须由 end_item 结尾。
    */
    struct rte_flow_item_ipv4 ipv4_spec; // (todo) rte_flow_item . Matches an IPv4 header.
    struct rte_flow_item_ipv4 ipv4_mask;

    struct rte_flow_item ipv4_udp_item;
    struct rte_flow_item ipv4_tcp_item;
    struct rte_flow_item ipv4_sctp_item;

    struct rte_flow_item_udp udp_spec;
    struct rte_flow_item_udp udp_mask;
    struct rte_flow_item udp_item;

    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask;
    struct rte_flow_item tcp_item;

    struct rte_flow_item_sctp sctp_spec;
    struct rte_flow_item_sctp sctp_mask;
    struct rte_flow_item sctp_item;

    struct rte_flow_item pattern_ipv4_5tuple[4]; // ntuple_filter 结构体 --> rte_flow_item 结构体数组
    struct rte_flow_classify_rule *rule;

    uint8_t ipv4_proto;

    if (num_classify_rules >= MAX_NUM_CLASSIFY) {
        printf(
            "\nINFO:  classify rule capacity %d reached\n",
            num_classify_rules);
        return ret;
    }

    /* set up parameters for validate and add */
    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
    ipv4_spec.hdr.next_proto_id = ntuple_filter->proto; // 协议号
    ipv4_spec.hdr.src_addr = ntuple_filter->src_ip; // 源IP
    ipv4_spec.hdr.dst_addr = ntuple_filter->dst_ip; // 目的IP
    ipv4_proto = ipv4_spec.hdr.next_proto_id; 
    // 把这三个参数从ntuple_filter结构体提取到 rte_flow_item_ipv4 的一个专门的结构体:ipv4_spec 

    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
    ipv4_mask.hdr.next_proto_id = ntuple_filter->proto_mask; // 协议掩码
    ipv4_mask.hdr.src_addr = ntuple_filter->src_ip_mask;
    ipv4_mask.hdr.src_addr =
        convert_depth_to_bitmask(ipv4_mask.hdr.src_addr);
    ipv4_mask.hdr.dst_addr = ntuple_filter->dst_ip_mask; // 源IP地址的掩码
    ipv4_mask.hdr.dst_addr =
        convert_depth_to_bitmask(ipv4_mask.hdr.dst_addr); // 目的IP地址的掩码
    // 把这三个参数从ntuple_filter结构体提取到 rte_flow_item_ipv4 的一个专门的结构体 :ipv4_mask

    switch (ipv4_proto) { // 根据协议设置L3、L4的item
    case IPPROTO_UDP: // UDP
        ipv4_udp_item.type = RTE_FLOW_ITEM_TYPE_IPV4;
        ipv4_udp_item.spec = &ipv4_spec;
        ipv4_udp_item.mask = &ipv4_mask;
        ipv4_udp_item.last = NULL;

        udp_spec.hdr.src_port = ntuple_filter->src_port;
        udp_spec.hdr.dst_port = ntuple_filter->dst_port;
        udp_spec.hdr.dgram_len = 0;
        udp_spec.hdr.dgram_cksum = 0;

        udp_mask.hdr.src_port = ntuple_filter->src_port_mask;
        udp_mask.hdr.dst_port = ntuple_filter->dst_port_mask;
        udp_mask.hdr.dgram_len = 0;
        udp_mask.hdr.dgram_cksum = 0;

        udp_item.type = RTE_FLOW_ITEM_TYPE_UDP;
        udp_item.spec = &udp_spec;
        udp_item.mask = &udp_mask;
        udp_item.last = NULL;

        attr.priority = ntuple_filter->priority;
        pattern_ipv4_5tuple[1] = ipv4_udp_item; // L3 item 是 ipv4_upd
        pattern_ipv4_5tuple[2] = udp_item; // L4 item 是 udp_item
        break;
    case IPPROTO_TCP: // TCP
        ipv4_tcp_item.type = RTE_FLOW_ITEM_TYPE_IPV4;
        ipv4_tcp_item.spec = &ipv4_spec;
        ipv4_tcp_item.mask = &ipv4_mask;
        ipv4_tcp_item.last = NULL;

        memset(&tcp_spec, 0, sizeof(tcp_spec));
        tcp_spec.hdr.src_port = ntuple_filter->src_port;
        tcp_spec.hdr.dst_port = ntuple_filter->dst_port;

        memset(&tcp_mask, 0, sizeof(tcp_mask));
        tcp_mask.hdr.src_port = ntuple_filter->src_port_mask;
        tcp_mask.hdr.dst_port = ntuple_filter->dst_port_mask;

        tcp_item.type = RTE_FLOW_ITEM_TYPE_TCP;
        tcp_item.spec = &tcp_spec;
        tcp_item.mask = &tcp_mask;
        tcp_item.last = NULL;

        attr.priority = ntuple_filter->priority;
        pattern_ipv4_5tuple[1] = ipv4_tcp_item; // L3 item 是 ipv4_tcp
        pattern_ipv4_5tuple[2] = tcp_item; // L4 item 是 tcp_item
        break;
    case IPPROTO_SCTP:
        ipv4_sctp_item.type = RTE_FLOW_ITEM_TYPE_IPV4;
        ipv4_sctp_item.spec = &ipv4_spec;
        ipv4_sctp_item.mask = &ipv4_mask;
        ipv4_sctp_item.last = NULL;

        sctp_spec.hdr.src_port = ntuple_filter->src_port;
        sctp_spec.hdr.dst_port = ntuple_filter->dst_port;
        sctp_spec.hdr.cksum = 0;
        sctp_spec.hdr.tag = 0;

        sctp_mask.hdr.src_port = ntuple_filter->src_port_mask;
        sctp_mask.hdr.dst_port = ntuple_filter->dst_port_mask;
        sctp_mask.hdr.cksum = 0;
        sctp_mask.hdr.tag = 0;

        sctp_item.type = RTE_FLOW_ITEM_TYPE_SCTP;
        sctp_item.spec = &sctp_spec;
        sctp_item.mask = &sctp_mask;
        sctp_item.last = NULL;

        attr.priority = ntuple_filter->priority;
        pattern_ipv4_5tuple[1] = ipv4_sctp_item;
        pattern_ipv4_5tuple[2] = sctp_item;
        break;
    default:
        return ret;
    }

    attr.ingress = 1; // rules 适用于入口流量
    
    pattern_ipv4_5tuple[0] = eth_item;// L2 item,放在pattern_ipv4_5tuple[0],一定是eth_item
    // L3 item 放在数组下标1,L4 item放在数组下标2
    pattern_ipv4_5tuple[3] = end_item; // 最后一个 item 一定要用 end_item 结尾。

    actions[0] = count_action; // 流匹配的动作是 计数
    actions[1] = end_action; // (terminated by the END pattern item)

    /* Validate and add rule */
    /* 验证这条规则的有效性
    参数:
    1. classifer 指针
    2. attr 指针,流规则的属性,详细内容见上。
    3. rte_flow_item 结构体数组(terminated by the END pattern item),也就是 ACL 规则的详细内容
    4. rte_flow_action 结构体数组(terminated by the END pattern item),表示流规则的动作,比如QUEUE, DROP, END等等,
    5. struct rte_flow_error,出错时存放信息。
    */
    ret = rte_flow_classify_validate(cls_app->cls, &attr,
            pattern_ipv4_5tuple, actions, &error);
    if (ret) { // 成功时返回 0 
        printf("table entry validate failed ipv4_proto = %u\n",
            ipv4_proto);
        return ret;
    }

    // 调用 rte_flow_classify_table_entry_add() 将规则添加到 rte_flow_classifier 对象中的 table。
    /* 五个参数
        1. classifier 的指针。
        2. attr 指针。
        3. rte_flow_item 结构体数组,也就是 ACL 规则的详细内容。
        4. rte_flow_action 结构体数组,表示流规则的动作。
        5. 一个int指针,如果规则已经存在则返回1,否则返回0。
        6. 仅出错时存放信息。
    */
    rule = rte_flow_classify_table_entry_add(
            cls_app->cls, &attr, pattern_ipv4_5tuple,
            actions, &key_found, &error);
    if (rule == NULL) { // 添加成功时返回的是rule的有效句柄,否则为NULL
        printf("table entry add failed ipv4_proto = %u\n",
            ipv4_proto);
        ret = -1;
        return ret;
    }

    rules[num_classify_rules] = rule; // 将rule存放在一个数组里,方便删除等操作
    num_classify_rules++;
    return 0;
}

static int
add_rules(const char *rule_path, struct flow_classifier *cls_app) 
// 封装一层,主要是文件操作,把txt中的一行解析成 rte_eth_ntuple_filter 结构体
{
    FILE *fh;
    char buff[LINE_MAX];
    unsigned int i = 0;
    unsigned int total_num = 0;
    struct rte_eth_ntuple_filter ntuple_filter; // 用于定义n-tuple过滤器条目的结构体
    int ret;

    fh = fopen(rule_path, "rb"); // 打开 ipv4_rules_file.txt
    if (fh == NULL)
        rte_exit(EXIT_FAILURE, "%s: fopen %s failed\n", __func__,
            rule_path);

    ret = fseek(fh, 0, SEEK_SET); // 设置文件指针fh的位置指向文件开头
    if (ret) // 成功,返回0
        rte_exit(EXIT_FAILURE, "%s: fseek %d failed\n", __func__,
            ret);

    i = 0;
    while (fgets(buff, LINE_MAX, fh) != NULL) { // 读取一行内容
        i++;

        if (is_bypass_line(buff)) // 跳过空行 or 以井号开头的注释
            continue;

        if (total_num >= FLOW_CLASSIFY_MAX_RULE_NUM - 1) { // 有最大规则数量(行数)限制
            printf("\nINFO: classify rule capacity %d reached\n",
                total_num);
            break;
        }

        if (parse_ipv4_5tuple_rule(buff, &ntuple_filter) != 0) // 规则的 parser 解析txt的一行输入,存放到ntuple_filter结构体里
            rte_exit(EXIT_FAILURE,
                "%s Line %u: parse rules error\n",
                rule_path, i);

        if (add_classify_rule(&ntuple_filter, cls_app) != 0) // 添加这条五元组规则到 ACL 中
            rte_exit(EXIT_FAILURE, "add rule error\n");

        total_num++;
    }

    fclose(fh);
    return 0;
}

/*
 * Print the command-line synopsis of the application.
 *
 * prgname - program name shown at the start of the usage text.
 */
static void
print_usage(const char *prgname)
{
    printf("%s usage:\n", prgname);
    printf("[EAL options] --  --"OPTION_RULE_IPV4"=FILE: "
           "specify the ipv4 rules file.\n"
           "Each rule occupies one line in the file.\n");
}

/*
 * Parse the application's own command-line arguments (those that follow
 * the EAL arguments).
 *
 * The only recognised option is --rule_ipv4=FILE; its argument is stored
 * in parm_config.rule_ipv4_name. Any other option prints the usage text.
 *
 * Returns optind - 1 on success and -1 on an unrecognised option. optind
 * is reset to 1 before a successful return.
 */
static int
parse_args(int argc, char **argv)
{
    static struct option lgopts[] = {
        {OPTION_RULE_IPV4, 1, 0, 0},
        {NULL, 0, 0, 0}
    };
    char *prgname = argv[0];
    char **argvopt = argv;
    int option_index;
    int opt;
    int consumed;

    for (;;) {
        opt = getopt_long(argc, argvopt, "", lgopts, &option_index);
        if (opt == EOF)
            break;

        /* getopt_long() returns 0 for a matched long option. */
        if (opt != 0) {
            print_usage(prgname);
            return -1;
        }

        if (strncmp(lgopts[option_index].name, OPTION_RULE_IPV4,
                sizeof(OPTION_RULE_IPV4)) == 0)
            parm_config.rule_ipv4_name = optarg;
    }

    /* Put the program name in the slot just before the first non-option. */
    if (optind >= 0)
        argv[optind - 1] = prgname;

    consumed = optind - 1;
    optind = 1; /* reset getopt lib */
    return consumed;
}

/*
 * The main function, which does initialization and calls the lcore_main
 * function.
 */
int
main(int argc, char *argv[])
{
    struct rte_mempool *mbuf_pool;
    uint8_t nb_ports;
    uint16_t portid;
    int ret;
    int socket_id;

    // 以下可以在 dpdk api data struct 中查看
    struct rte_table_acl_params table_acl_params; // ACL table 的参数
    struct rte_flow_classify_table_params cls_table_params; // Parameters for table creation
    struct flow_classifier *cls_app;  // 分流器
    // 分流器的内部结构要见https://doc.dpdk.org/guides/prog_guide/flow_classify_lib.html#classifier-creation
    
    struct rte_flow_classifier_params cls_params; // classifier 的参数
    uint32_t size;

    /* Initialize the Environment Abstraction Layer (EAL). */
    ret = rte_eal_init(argc, argv); // 初始化 EAL
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

    argc -= ret;
    argv += ret;

    /* parse application arguments (after the EAL ones) */
    ret = parse_args(argc, argv); // 解析 flow_classify 的命令行参数
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid flow_classify parameters\n");

    /* Check that there is an even number of ports to send/receive on. */
    nb_ports = rte_eth_dev_count(); // 网口数目必须是偶数
    if (nb_ports < 2 || (nb_ports & 1))
        rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");

    /* Creates a new mempool in memory to hold the mbufs. */
    // 创建mempool
    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", NUM_MBUFS * nb_ports,
        MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());

    if (mbuf_pool == NULL)
        rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

    /* Initialize all ports. */
    RTE_ETH_FOREACH_DEV(portid) // 端口初始化 与basicfw的一样
        if (port_init(portid, mbuf_pool) != 0)
            rte_exit(EXIT_FAILURE, "Cannot init port %"PRIu8 "\n",
                    portid);

    if (rte_lcore_count() > 1) // 只需要一个逻辑核心
        printf("\nWARNING: Too many lcores enabled. Only 1 used.\n");

    socket_id = rte_eth_dev_socket_id(0); // 返回 0 号网口所在的NUMA socket id号

    /* Memory allocation */
    // 为分流器 cls_app 分配内存
    size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct flow_classifier_acl));// 返回大于或等于宏定义参数的第一个缓存对齐值
    cls_app = rte_zmalloc(NULL, size, RTE_CACHE_LINE_SIZE); // DPDK的malloc:从调用该函数的核上的同一个NUMA socket的大页面区域分配堆内存。
                                                            // zmalloc 就是清零 与 calloc 相似
    /* rte_zmalloc 参数三个:
        1. 指示这块区域分配给怎样的object类型。用于debug用途。可以写NULL
        2. size (in bytes) to be allocated,这里分配一个cache缓存行的字节。
        3. align
            if 0, 会返回一个适合任何类型变量的指针,就像 malloc
            否则,返回一个内存区域是 align 的对齐倍数,显然最小对齐是高速缓存行大小,宏:RTE_CACHE_LINE_SIZE
    */
    if (cls_app == NULL) // 分配内存失败
        rte_exit(EXIT_FAILURE, "Cannot allocate classifier memory\n");

    // classifier 的参数 有两个: name 和 socket id
    // 需要在调用 create() API 之前由应用程序初始化
    cls_params.name = "flow_classifier";
    cls_params.socket_id = socket_id;

    // 调用 rte_flow_classifier_create() 函数来创建rte_flow_classifier对象。
    // 参数是 rte_flow_classifier_params 结构体指针
    cls_app->cls = rte_flow_classifier_create(&cls_params);
    if (cls_app->cls == NULL) { // 创建失败
        rte_free(cls_app);
        rte_exit(EXIT_FAILURE, "Cannot create classifier\n");
    }

    /* initialise ACL table params */
    // 填写 ACL 的初始化参数
    // 四个字段:
    table_acl_params.name = "table_acl_ipv4_5tuple"; // ACL的名字
    table_acl_params.n_rules = FLOW_CLASSIFY_MAX_RULE_NUM; // 表中最大ACL规则数量:91 
    table_acl_params.n_rule_fields = RTE_DIM(ipv4_defs); // 一条ACL规则中的有多少个字段(fields)
    //宏定义如下:#define RTE_DIM(a) (sizeof (a) / sizeof ((a)[0])) 直观看就是返回数组的长度。

    memcpy(table_acl_params.field_format, ipv4_defs, sizeof(ipv4_defs)); 
    //  ACL rule 的详细内容 specification
    //  ACL 规则的字段也必须由应用程序初始化。

    /* initialise table create params */
    // 填写 表 的创建参数
    // 三个字段:
    cls_table_params.ops = &rte_table_acl_ops; //表操作(特定于每个表类型),(todo:这里不清楚具体是怎么操作的
    cls_table_params.arg_create = &table_acl_params; // 传递给表的用于创建的参数 这里是ACL的初始化参数结构体的指针
    cls_table_params.type = RTE_FLOW_CLASSIFY_TABLE_ACL_IP4_5TUPLE; // table's type,是一个 enum 

    // rte_flow_classify_table_create() 向classifier对象添加一个表。
    // 参数两个:1. 流分类器的指针 2. 表创建的参数
    ret = rte_flow_classify_table_create(cls_app->cls, &cls_table_params);
    if (ret) { // 返回值:成功时返回 0
        rte_flow_classifier_free(cls_app->cls);
        rte_free(cls_app);
        rte_exit(EXIT_FAILURE, "Failed to create classifier table\n");
    }

    /* read file of IPv4 5 tuple rules and initialize parameters
     * for rte_flow_classify_validate and rte_flow_classify_table_entry_add
     * API's.
     */
    // 然后它读取ipv4_rules_file.txt文件,验证流规则是否合法,然后初始化rte_flow_classify_table_entry_add() API 的参数,使用此API将规则添加到ACL表。
    if (add_rules(parm_config.rule_ipv4_name, cls_app)) {
        rte_flow_classifier_free(cls_app->cls);
        rte_free(cls_app);
        rte_exit(EXIT_FAILURE, "Failed to add rules\n");
    }

    /* Call lcore_main on the master core only. */ // todo
    lcore_main(cls_app);

    return 0;
}

基本看完了,但开头有很多结构体和宏定义,没有办法在 API doc 中找到确切的页面。原因有两个:一是 DPDK 源码对这些数据结构改动比较频繁,文档没有及时跟上;二是代码里用到了一些 Intel 的宏定义,它们并没有出现在 DPDK 的 API doc 中。

flow_classify 这个程序做的事情分为如下几步骤:

  1. EAL初始化、端口初始化、分配内存等,与basicfw是一样的。
  2. 创建 flow_classifier 对象。这一过程在代码中分为好几个阶段:为 classifier 分配内存、创建 classifier 对象、填写 ACL 的初始化参数、填写 table 的创建参数、把 table 添加到 classifier 中。
  3. 读取 ipv4_rules_file.txt 这个文件,文件中一行是一条规则,一行的内容是一个 IPv4 的五元组加优先级。如果通过了输入的合法性验证,就把里面的内容提取成特定的数据结构,插入到 classifier 里。2、3 两步过程中封装了多层,还涉及非常多的数据结构和 API,不容易搞懂。(其实也不需要完全搞懂,我后面有说,继续往下看)
  4. 添加完规则后进入 lcore_main 主线程,死循环收包(参照 basicfw)。每次收上来一批包,就用 DPDK 的 API 对 classifier 里的每条规则都进行一次 query。如果 query 返回 0(表示这批包中有与该规则匹配的 packet),就打印该 rule 的计数器 counter1 和协议号;否则打印 "rule [i] query failed ret [错误码]" 的提示语句。然后不论匹配是否成功,都把这批包从配对的另一个端口转发出去。

我们可以看看 ipv4_rules_file.txt 这个文件的内容:

#src_ip/masklen dst_ip/masklen src_port : mask dst_port : mask proto/mask priority
#
2.2.2.3/24 2.2.2.7/24 32 : 0xffff 33 : 0xffff 17/0xff 0
9.9.9.3/24 9.9.9.7/24 32 : 0xffff 33 : 0xffff 17/0xff 1
9.9.9.3/24 9.9.9.7/24 32 : 0xffff 33 : 0xffff 6/0xff 2
9.9.8.3/24 9.9.8.7/24 32 : 0xffff 33 : 0xffff 6/0xff 3
6.7.8.9/24 2.3.4.5/24 32 : 0x0000 33 : 0x0000 132/0xff 4
6.7.8.9/32 192.168.0.36/32 10 : 0xffff 11 : 0xffff 6/0xfe 5
6.7.8.9/24 192.168.0.36/24 10 : 0xffff 11 : 0xffff 6/0xfe 6
6.7.8.9/16 192.168.0.36/16 10 : 0xffff 11 : 0xffff 6/0xfe 7
6.7.8.9/8 192.168.0.36/8 10 : 0xffff 11 : 0xffff 6/0xfe 8

可以看到,DPDK 在 flow_classify 中对 flow 的定义是根据 IPv4 的五元组 + 优先级来的。优先级的作用是:如果一个包同时满足了多条规则,则匹配的是优先级最高的那一条。

综上所述,这个 flow_classify 程序的功能是:首先在文件 ipv4_rules_file.txt 中预设一些“五元组 + 优先级”的 rules,然后运行这个程序;在网口收包时,如果收到了满足某条 rule 的流,则会提示并在相对应的 rule 上计数。由于代码太复杂,我们不需要对代码进行修改或自行编程,只需修改 ipv4_rules_file.txt 这个文件的内容后,运行自带的程序即可。DPDK 还有一个 sample 叫做 flow_filtering,我猜想它的大部分程序内容应该会和 flow_classify 相似,区别会体现在 lcore_main 主线程中:flow_filtering 会把不满足流规则的包丢弃。

运行情况

root@ubuntu:/home/chang/dpdk/examples/flow_classify/build# ./flow_classify -c 1 -n 4 -- --rule_ipv4="../ipv4_rules_file.txt"
EAL: Detected 8 lcore(s)
EAL: No free hugepages reported in hugepages-1048576kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: PCI device 0000:02:01.0 on NUMA socket -1
EAL:   Invalid NUMA socket, default to 0
EAL:   probe driver: 8086:100f net_e1000_em
EAL: PCI device 0000:02:02.0 on NUMA socket -1
EAL:   Invalid NUMA socket, default to 0
EAL:   probe driver: 8086:100f net_e1000_em
EAL: PCI device 0000:02:03.0 on NUMA socket -1
EAL:   Invalid NUMA socket, default to 0
EAL:   probe driver: 8086:100f net_e1000_em
EAL: PCI device 0000:02:04.0 on NUMA socket -1
EAL:   Invalid NUMA socket, default to 0
EAL:   probe driver: 8086:100f net_e1000_em
Port 0 MAC: 00 0c 29 f7 4d 25
Port 1 MAC: 00 0c 29 f7 4d 2f
table_entry_delete succeeded [7]


Core 0 forwarding packets. [Ctrl+C to quit]

rule [0] query failed ret [-22]

rule [1] query failed ret [-22]

rule [2] query failed ret [-22]

rule [3] query failed ret [-22]

rule [4] query failed ret [-22]

rule [5] query failed ret [-22]

rule [6] query failed ret [-22]

rule [7] query failed ret [-22]

rule [8] query failed ret [-22]

rule [0] query failed ret [-22]

rule [1] query failed ret [-22]

rule [2] query failed ret [-22]

rule [3] query failed ret [-22]

rule [4] query failed ret [-22]

rule [5] query failed ret [-22]

rule [6] query failed ret [-22]

rule [7] query failed ret [-22]

rule [8] query failed ret [-22]

rule [0] query failed ret [-22]

rule [1] query failed ret [-22]

rule [2] query failed ret [-22]

rule [3] query failed ret [-22]

rule [4] query failed ret [-22]

rule [5] query failed ret [-22]

rule [6] query failed ret [-22]

rule [7] query failed ret [-22]

rule [8] query failed ret [-22]

我没有改动自带的规则文件,因此不会有匹配成功,提示的都是匹配失败(返回值 -22 即 -EINVAL)。下一步的思路是熟悉 pktgen 等发包工具的使用,发出特定五元组的包,并相应地修改规则文件,使两者匹配。

reference

  • 他人的文章:
    • 丹西 - DPDK报文分类与访问控制
    • cumirror - DPDK ACL算法介绍
    • Generic Flow API简介
  • 文档:
    • API doc 中的 flow_classify、rte_flow.h
  • programmer's guides 中的
    • 9. Generic flow API 有 Rules 的详细信息。
    • ※25. Flow Classification Library 介绍了创建 classifier、添加规则、parsing、query 的过程。
    • 41. Packet Classification and Access Control,介绍了 ACL 与 rule 的关系。
  • sample guides 中的
    • 9. Flow Classify Sample Application

你可能感兴趣的:(DPDK flow_classify 源码阅读)