dpdk-skeleton例程解析

dpdk-skeleton例程解析

  • `skeleton`功能
  • `skeleton`编译及运行结果
  • `main.c`源码解析
  • 重要函数/宏定义
    • 1. 环境抽象层初始化(rte_eal_init)
    • 2. 内存缓冲池初始化
    • 3. 物理层收包函数
    • 4. 物理层发包函数
    • 5. 数据包内存释放(DROP接收到的包)
    • 6. unlikely/likely分支预测

skeleton功能

基础的二层转发工具。将偶数个网口两两配对:从0口接收到的包转发到1口,从1口接收到的包转发到0口,2口与3口同理,以此类推。
该例程用到了内存缓冲池mbuf_pool以及mbuf进行接包转包。

skeleton编译及运行结果

编译命令如下:

cd examples/skeleton
export RTE_SDK=/home/lianpeng/dpdk-19.11
make clean
make

运行命令如下:

cd build
./skeleton -c 1 #只需要一个核心参与运行即可

结果输出:

EAL: Detected 4 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
EAL: Selected IOVA mode 'PA'
EAL: Probing VFIO support...
EAL: PCI device 0000:00:03.0 on NUMA socket -1
EAL:   Invalid NUMA socket, default to 0
EAL:   probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:08.0 on NUMA socket -1
EAL:   Invalid NUMA socket, default to 0
EAL:   probe driver: 8086:100e net_e1000_em
Port 0 MAC: 08 00 27 60 56 7d
Port 1 MAC: 08 00 27 ea 05 4f

Core 0 forwarding packets. [Ctrl+C to quit]

main.c源码解析

  1. 头文件引用及宏定义
#include <stdint.h>
#include <inttypes.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>

#define RX_RING_SIZE 1024 // requested number of RX descriptors per receive queue
#define TX_RING_SIZE 1024 // requested number of TX descriptors per transmit queue

#define NUM_MBUFS 8191 // mbufs reserved in the pool per port (pool holds NUM_MBUFS * nb_ports)
#define MBUF_CACHE_SIZE 250 // cache_size argument passed to rte_pktmbuf_pool_create()
#define BURST_SIZE 32  // maximum number of packets requested by one rte_eth_rx_burst() call
/*
 * Default port configuration; port_init() copies it into each port's
 * rte_eth_conf before configuring the device. Only the RX mode is
 * customized: the maximum received packet length is RTE_ETHER_MAX_LEN
 * (the maximum Ethernet frame length, which is not the same as the MTU).
 */
static const struct rte_eth_conf port_conf_default = {

	.rxmode = {

		.max_rx_pkt_len = RTE_ETHER_MAX_LEN,
	},
};
  2. 初始化网口port_init函数
/* basicfwd.c: Basic DPDK skeleton forwarding example. */

/*
 * Initializes a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint16_t port, struct rte_mempool *mbuf_pool)
{
     
	struct rte_eth_conf port_conf = port_conf_default; //初始化网口配置
	const uint16_t rx_rings = 1, tx_rings = 1; //队列数量
	uint16_t nb_rxd = RX_RING_SIZE; //初始化当前接收队列容量
	uint16_t nb_txd = TX_RING_SIZE; //初始化当前发送队列容量
	int retval; //函数返回值,临时变量
	uint16_t q;
	struct rte_eth_dev_info dev_info;
	struct rte_eth_txconf txconf;

	if (!rte_eth_dev_is_valid_port(port)) //判断当前PORT号是否合法
		return -1;

	retval = rte_eth_dev_info_get(port, &dev_info);//获取当前的网口配置
	if (retval != 0) {
     //如果获取失败则报错并返回该retval
		printf("Error during getting device (port %u) info: %s\n",
				port, strerror(-retval));
		return retval;
	}
	//?
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;

	/* Configure the Ethernet device. */
	// 配置以太网口设备,para:端口号、接收环数、发送环个数、网口配置
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;
	//检查Rx和Tx描述符(mbuf)的数量是否满足网卡的描述符限制
	//不满足将其调整为边界(改变其值)
	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &nb_rxd, &nb_txd);
	if (retval != 0)
		return retval;
	//队列初始化:对指定端口的某个队列
	//指定内存描述符数量,报文缓冲区,并配置队列
	/* Allocate and set up 1 RX queue per Ethernet port. */
	for (q = 0; q < rx_rings; q++) {
     
		retval = rte_eth_rx_queue_setup(port, q, nb_rxd,
				rte_eth_dev_socket_id(port), NULL, mbuf_pool);
		if (retval < 0)
			return retval;
	}
	txconf = dev_info.default_txconf;
	txconf.offloads = port_conf.txmode.offloads;
	/* Allocate and set up 1 TX queue per Ethernet port. */
	for (q = 0; q < tx_rings; q++) {
     
		retval = rte_eth_tx_queue_setup(port, q, nb_txd,
				rte_eth_dev_socket_id(port), &txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the Ethernet port. */
	//网口配置好就可以启动了
	retval = rte_eth_dev_start(port);
	if (retval < 0)
		return retval;

	/* Display the port MAC address. */
	struct rte_ether_addr addr;
	retval = rte_eth_macaddr_get(port, &addr);
	if (retval != 0)
		return retval;

	printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
			   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
			port,
			addr.addr_bytes[0], addr.addr_bytes[1],
			addr.addr_bytes[2], addr.addr_bytes[3],
			addr.addr_bytes[4], addr.addr_bytes[5]);
	//将网卡设置为混杂模式
	/* Enable RX in promiscuous mode for the Ethernet device. */
	retval = rte_eth_promiscuous_enable(port);
	if (retval != 0)
		return retval;
	
	return 0;
}
  3. lcore_main函数,即线程执行的函数
/*
 * The lcore main. This is the main thread that does the work, reading from
 * an input port and writing to an output port.
 *
 * Ports are paired by flipping the lowest bit of the port id
 * (0 <-> 1, 2 <-> 3, ...), which relies on main() having checked that the
 * number of ports is even. Only queue 0 of each port is used. The function
 * never returns.
 */
static __attribute__((noreturn)) void
lcore_main(void)
{
	uint16_t port;

	/*
	 * Check that the port is on the same NUMA node as the polling thread
	 * for best performance.
	 *
	 * Socket 0 is a valid NUMA node, so the comparison is made for every
	 * non-negative socket id; a negative id is treated as "unknown" and
	 * skipped. (The previous "> 0" test silently ignored ports located on
	 * socket 0 when the polling thread ran on another socket.)
	 */
	RTE_ETH_FOREACH_DEV(port) {
		const int port_socket = rte_eth_dev_socket_id(port);

		if (port_socket >= 0 && port_socket != (int)rte_socket_id())
			printf("WARNING, port %u is on remote NUMA node to "
					"polling thread.\n\tPerformance will "
					"not be optimal.\n", port);
	}

	printf("\nCore %u forwarding packets. [Ctrl+C to quit]\n",
			rte_lcore_id());

	/* Run until the application is quit or killed. */
	for (;;) {
		/*
		 * Receive packets on a port and forward them on the paired
		 * port. The mapping is 0 -> 1, 1 -> 0, 2 -> 3, 3 -> 2, etc.
		 */
		RTE_ETH_FOREACH_DEV(port) {
			/* Get burst of RX packets, from first port of pair. */
			struct rte_mbuf *bufs[BURST_SIZE];
			const uint16_t nb_rx = rte_eth_rx_burst(port, 0,
					bufs, BURST_SIZE);

			/*
			 * unlikely() is a compiler hint that an empty burst is
			 * the uncommon case; when nothing was received, move on
			 * to the next port.
			 */
			if (unlikely(nb_rx == 0))
				continue;

			/* Send burst of TX packets, to second port of pair. */
			const uint16_t nb_tx = rte_eth_tx_burst(port ^ 1, 0,
					bufs, nb_rx);

			/*
			 * Free any unsent packets: mbufs in [nb_tx, nb_rx) were
			 * not accepted by the TX queue, so they are dropped here
			 * by freeing them.
			 */
			if (unlikely(nb_tx < nb_rx)) {
				uint16_t buf;

				for (buf = nb_tx; buf < nb_rx; buf++)
					rte_pktmbuf_free(bufs[buf]);
			}
		}
	}
}
  4. main函数

/*
 * The main function, which does initialization and calls the per-lcore
 * functions.
 */
int
main(int argc, char *argv[])
{
     
	struct rte_mempool *mbuf_pool; //内存缓冲池指针
	unsigned nb_ports; 
	uint16_t portid;

	/* Initialize the Environment Abstraction Layer (EAL). */
	int ret = rte_eal_init(argc, argv); //首先初始化环境抽象层
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

	argc -= ret;
	argv += ret;

	/* Check that there is an even number of ports to send/receive on. */
	nb_ports = rte_eth_dev_count_avail();
	//获取网卡数量,如果非偶数或为0则报错
	if (nb_ports < 2 || (nb_ports & 1)) 
		rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");

	/* Creates a new mempool in memory to hold the mbufs. */
	//创建一个内存缓冲池,大小为  NUM_MBUFS*网卡数量 
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", NUM_MBUFS * nb_ports,
		MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());

	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	/* Initialize all ports. */
	RTE_ETH_FOREACH_DEV(portid)
		if (port_init(portid, mbuf_pool) != 0)
			rte_exit(EXIT_FAILURE, "Cannot init port %"PRIu16 "\n",
					portid);

	if (rte_lcore_count() > 1) //只需要一个线程
		printf("\nWARNING: Too many lcores enabled. Only 1 used.\n");

	/* Call lcore_main on the master core only. */
	lcore_main();

	return 0;
}

重要函数/宏定义

1. 环境抽象层初始化(rte_eal_init)

rte_eal_init(argc, argv)

2. 内存缓冲池初始化

调用函数的源代码如下:

//source code
 /* helper to create a mbuf pool */
 struct rte_mempool *
 rte_pktmbuf_pool_create(const char *name, unsigned n,
     unsigned cache_size, uint16_t priv_size, uint16_t data_room_size,
     int socket_id)

函数参数:

  1. const char *name, 是内存缓冲池名,可以根据mempool的名字进行查找,使用rte_mempool_lookup()接口即可;
  2. unsigned n, 表示内存缓冲池中mbuf(数据包缓冲区)的个数,本例程调用时为NUM_MBUFS*nb_ports,即平均每个网口有NUM_MBUFS个mbuf;
  3. unsigned cache_size,设置的cache大小,本例程为MBUF_CACHE_SIZE;
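  4. uint16_t priv_size,每个mbuf中应用私有数据区的大小,本例程为0;
  5. uint16_t data_room_size,每个mbuf中数据缓冲区的大小(包含headroom),本例程为RTE_MBUF_DEFAULT_BUF_SIZE,查看宏发现它的值>2048;
  6. int socket_id,分配内存所在的NUMA socket,本例程传入rte_socket_id()返回的当前socket。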

函数返回值为内存指针,指向缓冲池空间。

3. 物理层收包函数

static inline uint16_t
rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
		 struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)

函数功能:
对于网口port_id,从接收队列queue_id中最多取出nb_pkts个包,并将这些包的mbuf指针依次存入数组rx_pkts中。

函数参数:

  1. uint16_t port_id,网口id;
  2. uint16_t queue_id,队列id;
  3. struct rte_mbuf **rx_pkts,mbuf指针数组的首地址,用于存放接收到的数据包指针,数组长度至少为nb_pkts;
  4. const uint16_t nb_pkts,最多接收包的个数,本例程为BURST_SIZE。

函数返回值为读取到的数据包的数量,<=nb_pkts。

4. 物理层发包函数

static inline uint16_t
rte_eth_tx_burst(uint16_t port_id, uint16_t queue_id,
		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)

函数功能:
对于网口port_id,将内存指针tx_pkts指向的nb_pkts个包放入queue_id队列上。

函数参数:

  1. uint16_t port_id,网口id,本例程选择的网口id为当前网口与1异或;
  2. uint16_t queue_id,队列id;
  3. struct rte_mbuf **tx_pkts,内存指针,指向发送队列的首个数据包;
  4. uint16_t nb_pkts,最多发送包的个数,本例程为刚刚网口所接收到的数据包的个数。

函数返回值为实际发送的数据包的数量,<=nb_pkts。

5. 数据包内存释放(DROP接收到的包)

static inline void rte_pktmbuf_free(struct rte_mbuf *m)

函数功能:
将数据包m的内存空间归还给数据包缓冲池。对于比较大、在内存空间中被分片的数据包,将其所有的分片内存空间都归还给数据包缓冲池。

6. unlikely/likely分支预测

/*
 * likely(x): tells the compiler that condition x is expected to be true.
 * !!(x) normalizes x to 0 or 1 before it is compared against the expected
 * value 1 by __builtin_expect. Defined only if no likely macro exists yet.
 */
#ifndef likely
#define likely(x)	__builtin_expect(!!(x), 1)
#endif /* likely */

在计算机体系结构中,流水线级指令处理器采用分支预测的方法进行指令预取,但是对于条件指令,如果预取错误将会导致利用率降低。
likely是倾向于将指令预取为if条件满足的指令块,unlikely倾向于将指令预取为if条件不满足的指令块。

你可能感兴趣的:(dpdk)