skeleton
功能:基础的二层转发工具。将偶数个网口两两配对,从0口接收到的包转发到1口,从1口接收到的包转发到0口,以此类推。
该例程用到了内存缓冲池mbuf_pool以及mbuf进行接包转包。
skeleton
编译及运行结果
编译命令如下:
cd examples/skeleton
export RTE_SDK=/home/lianpeng/dpdk-19.11
make clean
make
运行命令如下:
cd build
./skeleton -c 1 #只需要一个核心参与运行即可
结果输出:
EAL: Detected 4 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
EAL: Selected IOVA mode 'PA'
EAL: Probing VFIO support...
EAL: PCI device 0000:00:03.0 on NUMA socket -1
EAL: Invalid NUMA socket, default to 0
EAL: probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:08.0 on NUMA socket -1
EAL: Invalid NUMA socket, default to 0
EAL: probe driver: 8086:100e net_e1000_em
Port 0 MAC: 08 00 27 60 56 7d
Port 1 MAC: 08 00 27 ea 05 4f
Core 0 forwarding packets. [Ctrl+C to quit]
main.c
源码解析
#include <stdint.h>
#include <inttypes.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
/* Tunables shared by port_init(), lcore_main() and main(). */
#define RX_RING_SIZE 1024 // Requested size of each RX ring (starting value of nb_rxd in port_init)
#define TX_RING_SIZE 1024 // Requested size of each TX ring (starting value of nb_txd in port_init)
#define NUM_MBUFS 8191 // mbufs budgeted per port (RX+TX); main() sizes the pool as NUM_MBUFS * port count
#define MBUF_CACHE_SIZE 250 // Passed as the cache_size argument of rte_pktmbuf_pool_create()
#define BURST_SIZE 32 // Maximum number of packets requested by one rte_eth_rx_burst() call
/*
 * Baseline port configuration that port_init() copies for every port.
 * Only the maximum RX packet length is set (to RTE_ETHER_MAX_LEN); all
 * other fields are left zero-initialized.
 */
static const struct rte_eth_conf port_conf_default = {
	.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN,
};
port_init
函数/* basicfwd.c: Basic DPDK skeleton forwarding example. */
/*
* Initializes a given port using global settings and with the RX buffers
* coming from the mbuf_pool passed as a parameter.
*/
static inline int
port_init(uint16_t port, struct rte_mempool *mbuf_pool)
{
struct rte_eth_conf port_conf = port_conf_default; //初始化网口配置
const uint16_t rx_rings = 1, tx_rings = 1; //队列数量
uint16_t nb_rxd = RX_RING_SIZE; //初始化当前接收队列容量
uint16_t nb_txd = TX_RING_SIZE; //初始化当前发送队列容量
int retval; //函数返回值,临时变量
uint16_t q;
struct rte_eth_dev_info dev_info;
struct rte_eth_txconf txconf;
if (!rte_eth_dev_is_valid_port(port)) //判断当前PORT号是否合法
return -1;
retval = rte_eth_dev_info_get(port, &dev_info);//获取当前的网口配置
if (retval != 0) {
//如果获取失败则报错并返回该retval
printf("Error during getting device (port %u) info: %s\n",
port, strerror(-retval));
return retval;
}
//?
if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
port_conf.txmode.offloads |=
DEV_TX_OFFLOAD_MBUF_FAST_FREE;
/* Configure the Ethernet device. */
// 配置以太网口设备,para:端口号、接收环数、发送环个数、网口配置
retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
if (retval != 0)
return retval;
//检查Rx和Tx描述符(mbuf)的数量是否满足网卡的描述符限制
//不满足将其调整为边界(改变其值)
retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &nb_rxd, &nb_txd);
if (retval != 0)
return retval;
//队列初始化:对指定端口的某个队列
//指定内存描述符数量,报文缓冲区,并配置队列
/* Allocate and set up 1 RX queue per Ethernet port. */
for (q = 0; q < rx_rings; q++) {
retval = rte_eth_rx_queue_setup(port, q, nb_rxd,
rte_eth_dev_socket_id(port), NULL, mbuf_pool);
if (retval < 0)
return retval;
}
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
/* Allocate and set up 1 TX queue per Ethernet port. */
for (q = 0; q < tx_rings; q++) {
retval = rte_eth_tx_queue_setup(port, q, nb_txd,
rte_eth_dev_socket_id(port), &txconf);
if (retval < 0)
return retval;
}
/* Start the Ethernet port. */
//网口配置好就可以启动了
retval = rte_eth_dev_start(port);
if (retval < 0)
return retval;
/* Display the port MAC address. */
struct rte_ether_addr addr;
retval = rte_eth_macaddr_get(port, &addr);
if (retval != 0)
return retval;
printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
" %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
port,
addr.addr_bytes[0], addr.addr_bytes[1],
addr.addr_bytes[2], addr.addr_bytes[3],
addr.addr_bytes[4], addr.addr_bytes[5]);
//将网卡设置为混杂模式
/* Enable RX in promiscuous mode for the Ethernet device. */
retval = rte_eth_promiscuous_enable(port);
if (retval != 0)
return retval;
return 0;
}
lcore_main
函数,即线程执行的函数/*
* The lcore main. This is the main thread that does the work, reading from
* an input port and writing to an output port.
*/
static __attribute__((noreturn)) void
lcore_main(void)
{
	uint16_t port;

	/*
	 * Warn about every port that sits on a different NUMA node than the
	 * polling thread, since cross-node access costs performance.
	 * Node 0 is a valid node id and must be compared too: the earlier
	 * "> 0" test skipped ports on node 0, so a thread running on another
	 * node was never warned about them. Negative ids are not usable node
	 * ids and are still skipped.
	 */
	RTE_ETH_FOREACH_DEV(port) {
		const int port_socket = rte_eth_dev_socket_id(port);

		if (port_socket >= 0 && port_socket != (int)rte_socket_id()) {
			printf("WARNING, port %u is on remote NUMA node to "
					"polling thread.\n\tPerformance will "
					"not be optimal.\n", port);
		}
	}

	printf("\nCore %u forwarding packets. [Ctrl+C to quit]\n",
			rte_lcore_id());

	/* Run until the application is quit or killed; this never returns. */
	for (;;) {
		/*
		 * Receive packets on a port and forward them on the paired
		 * port. The mapping is 0 -> 1, 1 -> 0, 2 -> 3, 3 -> 2, etc.
		 * (the peer of a port is computed as port ^ 1).
		 */
		RTE_ETH_FOREACH_DEV(port) {
			/* Get a burst of RX packets from queue 0 of this port. */
			struct rte_mbuf *bufs[BURST_SIZE];
			const uint16_t nb_rx = rte_eth_rx_burst(port, 0,
					bufs, BURST_SIZE);

			/*
			 * unlikely() is a branch-prediction hint to the
			 * compiler: an empty burst is treated as the rare case.
			 */
			if (unlikely(nb_rx == 0))
				continue;

			/* Send the whole burst on queue 0 of the paired port. */
			const uint16_t nb_tx = rte_eth_tx_burst(port ^ 1, 0,
					bufs, nb_rx);

			/*
			 * Packets that could not be queued for TX are dropped:
			 * free them so their mbufs are not leaked.
			 */
			if (unlikely(nb_tx < nb_rx)) {
				uint16_t buf;

				for (buf = nb_tx; buf < nb_rx; buf++)
					rte_pktmbuf_free(bufs[buf]);
			}
		}
	}
}
main
函数
/*
* The main function, which does initialization and calls the per-lcore
* functions.
*/
int
main(int argc, char *argv[])
{
struct rte_mempool *mbuf_pool; //内存缓冲池指针
unsigned nb_ports;
uint16_t portid;
/* Initialize the Environment Abstraction Layer (EAL). */
int ret = rte_eal_init(argc, argv); //首先初始化环境抽象层
if (ret < 0)
rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
argc -= ret;
argv += ret;
/* Check that there is an even number of ports to send/receive on. */
nb_ports = rte_eth_dev_count_avail();
//获取网卡数量,如果非偶数或为0则报错
if (nb_ports < 2 || (nb_ports & 1))
rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");
/* Creates a new mempool in memory to hold the mbufs. */
//创建一个内存缓冲池,大小为 NUM_MBUFS*网卡数量
mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", NUM_MBUFS * nb_ports,
MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool == NULL)
rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
/* Initialize all ports. */
RTE_ETH_FOREACH_DEV(portid)
if (port_init(portid, mbuf_pool) != 0)
rte_exit(EXIT_FAILURE, "Cannot init port %"PRIu16 "\n",
portid);
if (rte_lcore_count() > 1) //只需要一个线程
printf("\nWARNING: Too many lcores enabled. Only 1 used.\n");
/* Call lcore_main on the master core only. */
lcore_main();
return 0;
}
rte_eal_init(argc, argv)
调用函数的源代码如下:
//source code
/* helper to create a mbuf pool */
struct rte_mempool *
rte_pktmbuf_pool_create(const char *name, unsigned n,
unsigned cache_size, uint16_t priv_size, uint16_t data_room_size,
int socket_id)
函数参数:
const char *name
, 是内存缓冲池名,可以根据mempool的名字进行查找,使用rte_mempool_lookup()
接口即可;unsigned n
, 表示缓冲池中可容纳的mbuf(数据包)个数,本例程调用时为NUM_MBUFS*nb_ports,即每个网口对应NUM_MBUFS个mbuf;unsigned cache_size
,设置的cache大小,本例程为MBUF_CACHE_SIZE;uint16_t priv_size
,每个mbuf私有数据区的大小,本例程为0;uint16_t data_room_size
,每个mbuf数据缓冲区的大小,本例程为RTE_MBUF_DEFAULT_BUF_SIZE,查看宏发现它的值>2048;int socket_id
,当前的socket_id。函数返回值为内存指针,指向缓冲池空间。
static inline uint16_t
rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
函数功能:
对于网口port_id,从其接收队列queue_id中取出最多nb_pkts个包,并将这些包的mbuf指针依次存入数组rx_pkts中。
函数参数:
uint16_t port_id
,网口id;uint16_t queue_id
,队列id;struct rte_mbuf **rx_pkts
,mbuf指针数组的首地址,用于存放接收到的数据包;const uint16_t nb_pkts
,最多接收包的个数,本例程为BURST_SIZE。函数返回值为读取到的数据包的数量,<=nb_pkts。
static inline uint16_t
rte_eth_tx_burst(uint16_t port_id, uint16_t queue_id,
struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
函数功能:
对于网口port_id,将内存指针tx_pkts指向的nb_pkts个包放入queue_id队列上。
函数参数:
uint16_t port_id
,网口id,本例程选择的网口id为当前网口与1异或;uint16_t queue_id
,队列id;struct rte_mbuf **tx_pkts
,mbuf指针数组的首地址,数组中存放待发送的数据包;uint16_t nb_pkts
,最多发送包的个数,本例程为刚刚网口所接收到的数据包的个数。函数返回值为实际发送的数据包的数量,<=nb_pkts。
static inline void rte_pktmbuf_free(struct rte_mbuf *m)
函数功能:
将数据包m的内存空间归还给数据包缓冲池。对于比较大、在内存空间中被分片的数据包,将其所有的分片内存空间都归还给数据包缓冲池。
/* Fallback definition, used only when likely() is not already defined. */
#ifndef likely
/* Hint to the compiler that x, normalized to 0 or 1 by !!, is expected to be 1 (true). */
#define likely(x) __builtin_expect(!!(x), 1)
#endif /* likely */
在计算机体系结构中,流水线处理器采用分支预测的方法进行指令预取;但是对于条件分支指令,如果预测错误,已经预取的指令将被丢弃,导致流水线利用率降低。
likely
是倾向于将指令预取为if条件满足的指令块,unlikely
倾向于将指令预取为if条件不满足的指令块。