DPDK-pdump工作原理解析

目录

 

1 dpdk 配置

2 抓包原理

3 源码解析


1 dpdk 配置

在使用dpdk-pdump之前,需要先设置编译配置参数:在config/common_base文件中分别设置CONFIG_RTE_LIBRTE_PMD_PCAP=y和CONFIG_RTE_LIBRTE_PDUMP=y,然后重新编译DPDK。

配置环境变量:在 ~/.bashrc中加入如下两行:

export RTE_SDK=dpdk_dir

export RTE_TARGET=x86_64-native-linuxapp-gcc

2 抓包原理

dpdk-pdump使用时,作为secondary进程依附于primary进程。primary进程中启动server端,初始化pdump抓包框架任务;dpdk-pdump进程是作为client端向primary进程发送开始/停止抓包请求,然后primary进程拷贝一份数据包到ring中,secondary进程从ring中读取出来,并保存为pcap文件。因此,可以看出在primary进程中需要初始化pdump server(即调用rte_pdump_init())。

3 源码解析

DPDK-pdump工作原理解析_第1张图片

  1. A(即primary进程)采用rx-worker-tx的模型进行报文的处理,其中调用rte_pdump_init会启动dump_thread,即图中红色的message线程;
  2. pdump采用secondary模式启动,与A共享mmap映射的内存空间;
  3. pdump启动过程中会创建mbuf_pool和ring,用于后续接收A中报文的拷贝;
  4. pdump会通过rte_eth_dev_attach方式创建vdev,且采用eth_pcap驱动进行初始化,留意init中的open_tx_pcap;
  5. pdump向A发送开启抓包的消息(通过Unix域数据报套接字,即AF_UNIX + SOCK_DGRAM,而非网络UDP),消息内容为前面创建的mbuf_pool、ring以及抓包的port和对应的queue;
  6. A中的dump_thread收到消息后,获取相应信息,在port上注册call_back函数;
  7. 对于开启抓包的port,在rx_burst/tx_burst时会先调用call_back,这里对应pdump_rx/pdump_tx,它会由mbuf_pool中分配mbuf进行报文的复制,同时enqueue到ring中;(mbuf_pool和ring在步骤3中创建,在步骤5中传递给A)
  8. pdump进行ring的dequeue操作获取拷贝报文;
  9. 拷贝报文通过rte_eth_tx_burst发送给vdev;
  10. vdev通过eth_pcap的tx_pkt_burst发送报文,即调用eth_pcap_tx_dumper完成报文的pcap存储(pcap_dump)。

下面重点讲解源码部分:

其中主函数的主要代码如下:

/* parse app arguments */
	if (argc > 1) {
		ret = launch_args_parse(argc, argv, argp[0]);
		if (ret < 0)
			rte_exit(EXIT_FAILURE, "Invalid argument\n");
	}

	/* create mempool, ring and vdevs info */
	create_mp_ring_vdev();
	enable_pdump();
	dump_packets();

	cleanup_pdump_resources();
	/* dump debug stats */
	print_pdump_stats();

 其中 launch_args_parse()函数主要是解析参数,如果用法为:

  ./dpdk-pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/capture.pcap'

 则将其中的port,queue,rx-dev 分别传给函数 parse_uint_value(), parse_queue(),parse_rxtxdev()进行解析,最终将所有的参数分别设置到 struct pdump_tuples 中。

然后使用create_mp_ring_vdev()创建mempool, rte_ring,vdev:

mbuf_pool = rte_mempool_lookup(mempool_name);
		if (mbuf_pool == NULL) {
			/* create mempool */
			mbuf_pool = rte_pktmbuf_pool_create(mempool_name,
					pt->total_num_mbufs,
					MBUF_POOL_CACHE_SIZE, 0,
					pt->mbuf_data_size,
					rte_socket_id());
			if (mbuf_pool == NULL) {
				cleanup_rings();
				rte_exit(EXIT_FAILURE,
					"Mempool creation failed: %s\n",
					rte_strerror(rte_errno));
			}
		}

后面分别根据传入的参数,创建rx_ring,tx_ring。其中比较重要的函数是rte_eth_dev_attach() 

if (rte_eth_dev_attach(vdev_args, &portid) < 0) {
				cleanup_rings();
				rte_exit(EXIT_FAILURE,
					"vdev creation failed:%s:%d\n",
					__func__, __LINE__);
			}

其中传入的参数 vdev_args:net_pcap_rx_0, tx_iface=/tmp/capture.pcap 

其中rte_eth_dev_attach()主要内容如下:

/* parse devargs, then retrieve device name and args */
	if (rte_eal_parse_devargs_str(devargs, &name, &args))
		goto err;

	ret = rte_eal_dev_attach(name, args);
	if (ret < 0)
		goto err;

	/* no point looking at the port count if no port exists */
	if (!rte_eth_dev_count()) {
		RTE_LOG(ERR, EAL, "No port found for device (%s)\n", name);
		ret = -1;
		goto err;
	}

	/* if nothing happened, there is a bug here, since some driver told us
	 * it did attach a device, but did not create a port.
	 */
	if (current == rte_eth_dev_count()) {
		ret = -1;
		goto err;
	}

	*port_id = eth_dev_last_created_port;
	ret = 0;

上面代码中,rte_eal_parse_devargs_str()函数将devargs字符串拆分为设备名(name,即net_pcap_rx_0)和设备参数(args,即后面的tx_iface=/tmp/capture.pcap部分),然后二者一并传入rte_eal_dev_attach():

int rte_eal_dev_attach(const char *name, const char *devargs)
{
	struct rte_pci_addr addr;

	if (name == NULL || devargs == NULL) {
		RTE_LOG(ERR, EAL, "Invalid device or arguments provided\n");
		return -EINVAL;
	}

	if (eal_parse_pci_DomBDF(name, &addr) == 0) {
		if (rte_eal_pci_probe_one(&addr) < 0)
			goto err;

	} else {
		if (rte_eal_vdev_init(name, devargs))
			goto err;
	}

	return 0;

这主要是创建虚拟设备。

下面看下主函数中另一个函数enable_pdump()

static void
enable_pdump(void)
{
	int i;
	struct pdump_tuples *pt;
	int ret = 0, ret1 = 0;
     // 此处可以指定server socket路径,可以使用默认值
	if (server_socket_path[0] != 0)
		ret = rte_pdump_set_socket_dir(server_socket_path,
				RTE_PDUMP_SOCKET_SERVER);
	if (ret == 0 && client_socket_path[0] != 0) {
		ret = rte_pdump_set_socket_dir(client_socket_path,
				RTE_PDUMP_SOCKET_CLIENT);
	}
	if (ret < 0) {
		cleanup_pdump_resources();
		rte_exit(EXIT_FAILURE,
				"failed to set socket paths of server:%s, "
				"client:%s\n",
				server_socket_path,
				client_socket_path);
	}

	for (i = 0; i < num_tuples; i++) {
		pt = &pdump_t[i];
		if (pt->dir == RTE_PDUMP_FLAG_RXTX) {
			if (pt->dump_by_type == DEVICE_ID) {
				ret = rte_pdump_enable_by_deviceid(
						pt->device_id,
						pt->queue,
						RTE_PDUMP_FLAG_RX,
						pt->rx_ring,
						pt->mp, NULL);
				ret1 = rte_pdump_enable_by_deviceid(
						pt->device_id,
						pt->queue,
						RTE_PDUMP_FLAG_TX,
						pt->tx_ring,
						pt->mp, NULL);
			} else if (pt->dump_by_type == PORT_ID) {
				ret = rte_pdump_enable(pt->port, pt->queue,
						RTE_PDUMP_FLAG_RX,
						pt->rx_ring, pt->mp, NULL);
				ret1 = rte_pdump_enable(pt->port, pt->queue,
						RTE_PDUMP_FLAG_TX,
						pt->tx_ring, pt->mp, NULL);
			}
		} else if (pt->dir == RTE_PDUMP_FLAG_RX) {
			if (pt->dump_by_type == DEVICE_ID)
				ret = rte_pdump_enable_by_deviceid(
						pt->device_id,
						pt->queue,
						pt->dir, pt->rx_ring,
						pt->mp, NULL);
			else if (pt->dump_by_type == PORT_ID)
				ret = rte_pdump_enable(pt->port, pt->queue,
						pt->dir,
						pt->rx_ring, pt->mp, NULL);
		} else if (pt->dir == RTE_PDUMP_FLAG_TX) {
			if (pt->dump_by_type == DEVICE_ID)
				ret = rte_pdump_enable_by_deviceid(
						pt->device_id,
						pt->queue,
						pt->dir,
						pt->tx_ring, pt->mp, NULL);
			else if (pt->dump_by_type == PORT_ID)
				ret = rte_pdump_enable(pt->port, pt->queue,
						pt->dir,
						pt->tx_ring, pt->mp, NULL);
		}
		if (ret < 0 || ret1 < 0) {
			cleanup_pdump_resources();
			rte_exit(EXIT_FAILURE, "%s\n", rte_strerror(rte_errno));
		}
	}
}

其中比较重要的函数是rte_pdump_enable_by_deviceid(),里面有个创建套接字发送消息的函数:pdump_create_client_socket(struct pdump_request *p)

/*
 * Send one pdump request to the server socket and wait for its response.
 *
 * An AF_UNIX datagram socket is created and bound to the client socket
 * path, the request is sent to the server socket path, and one response
 * datagram is read back. The socket is closed and the client socket path
 * is unlinked before returning.
 *
 * @param p
 *   Request to send; exactly sizeof(struct pdump_request) bytes are sent.
 * @return
 *   The err_value field of the server response when the exchange completes;
 *   otherwise a non-zero value (-1 for socket/send/recv failures) with
 *   rte_errno set.
 */
static int
pdump_create_client_socket(struct pdump_request *p)
{
	int ret, socket_fd;
	int pid;
	ssize_t n;	/* sendto()/recvfrom() return ssize_t, not int */
	struct pdump_response server_resp;
	struct sockaddr_un addr, serv_addr, from;
	socklen_t addr_len, serv_len;

	pid = getpid();

	socket_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
	if (socket_fd < 0) {
		RTE_LOG(ERR, PDUMP,
			"client socket(): %s:pid(%d):tid(%u), %s:%d\n",
			strerror(errno), pid, rte_sys_gettid(),
			__func__, __LINE__);
		rte_errno = errno;
		return -1;
	}

	/*
	 * Clear the local address before filling it in: the exit path unlinks
	 * addr.sun_path even when the path lookup below fails, so the buffer
	 * must never hold indeterminate bytes.
	 */
	memset(&addr, 0, sizeof(addr));
	ret = pdump_get_socket_path(addr.sun_path, sizeof(addr.sun_path),
				RTE_PDUMP_SOCKET_CLIENT);
	if (ret != 0) {
		RTE_LOG(ERR, PDUMP,
			"Failed to get client socket path: %s:%d\n",
			__func__, __LINE__);
		rte_errno = errno;
		goto exit;
	}
	addr.sun_family = AF_UNIX;
	addr_len = sizeof(struct sockaddr_un);

	/* Bind to the client path so the server has an address to reply to. */
	ret = bind(socket_fd, (struct sockaddr *) &addr, addr_len);
	if (ret) {
		RTE_LOG(ERR, PDUMP,
			"client bind(): %s, %s:%d\n",
			strerror(errno), __func__, __LINE__);
		rte_errno = errno;
		goto exit;
	}

	serv_len = sizeof(struct sockaddr_un);
	memset(&serv_addr, 0, sizeof(serv_addr));
	ret = pdump_get_socket_path(serv_addr.sun_path,
				sizeof(serv_addr.sun_path),
				RTE_PDUMP_SOCKET_SERVER);
	if (ret != 0) {
		RTE_LOG(ERR, PDUMP,
			"Failed to get server socket path: %s:%d\n",
			__func__, __LINE__);
		rte_errno = errno;
		goto exit;
	}
	serv_addr.sun_family = AF_UNIX;

	n = sendto(socket_fd, p, sizeof(struct pdump_request), 0,
			(struct sockaddr *)&serv_addr, serv_len);
	if (n < 0) {
		RTE_LOG(ERR, PDUMP,
			"failed to send to server:%s, %s:%d\n",
			strerror(errno), __func__, __LINE__);
		rte_errno = errno;
		ret = -1;
		goto exit;
	}

	n = recvfrom(socket_fd, &server_resp,
			sizeof(struct pdump_response), 0,
			(struct sockaddr *)&from, &serv_len);
	if (n < 0) {
		RTE_LOG(ERR, PDUMP,
			"failed to recv from server:%s, %s:%d\n",
			strerror(errno), __func__, __LINE__);
		rte_errno = errno;
		ret = -1;
		goto exit;
	}
	/*
	 * A datagram shorter than the response structure leaves err_value
	 * (partly) unwritten; refuse to interpret it.
	 */
	if ((size_t)n < sizeof(struct pdump_response)) {
		RTE_LOG(ERR, PDUMP,
			"short response from server: %s:%d\n",
			__func__, __LINE__);
		rte_errno = EIO;
		ret = -1;
		goto exit;
	}
	ret = server_resp.err_value;

exit:
	close(socket_fd);
	unlink(addr.sun_path);
	return ret;
}

上面主要是创建套接字,发送接收信息。

然后是dump_packets()

/*
 * Capture loop: until quit_signal becomes non-zero, walk all configured
 * tuples and hand each enabled direction's ring and vdev to pdump_rxtx().
 */
static inline void
dump_packets(void)
{
	struct pdump_tuples *tuple;
	int idx;

	while (!quit_signal) {
		idx = 0;
		while (idx < num_tuples) {
			tuple = &pdump_t[idx];
			idx++;

			/* dir is a bit mask, so RX and TX are tested independently. */
			if (tuple->dir & RTE_PDUMP_FLAG_RX)
				pdump_rxtx(tuple->rx_ring, tuple->rx_vdev_id,
					&tuple->stats);
			if (tuple->dir & RTE_PDUMP_FLAG_TX)
				pdump_rxtx(tuple->tx_ring, tuple->tx_vdev_id,
					&tuple->stats);
		}
	}
}

pdump_rxtx()函数主要是从ring中取出mbuf,然后通过rte_eth_tx_burst()发送给vdev,未能发送的mbuf会被释放:
/*
 * Move one burst of packets from a capture ring to a vdev.
 *
 * Up to BURST_SIZE mbufs are dequeued from 'ring' and transmitted on queue 0
 * of 'vdev_id'. Any mbuf the vdev did not accept is freed here. The
 * dequeue/tx/freed counters in 'stats' are updated accordingly.
 *
 * @param ring     Ring to dequeue the packets from.
 * @param vdev_id  Port id of the vdev to transmit on.
 * @param stats    Counters to update.
 */
static inline void
pdump_rxtx(struct rte_ring *ring, uint8_t vdev_id, struct pdump_stats *stats)
{
	/* write input packets of port to vdev for pdump */
	struct rte_mbuf *rxtx_bufs[BURST_SIZE];

	/* first dequeue packets from ring of primary process */
	const uint16_t nb_in_deq = rte_ring_dequeue_burst(ring,
			(void *)rxtx_bufs, BURST_SIZE);
	stats->dequeue_pkts += nb_in_deq;

	if (nb_in_deq) {
		/* then sent on vdev */
		uint16_t nb_in_txd = rte_eth_tx_burst(
				vdev_id,
				0, rxtx_bufs, nb_in_deq);
		stats->tx_pkts += nb_in_txd;

		/* free whatever the vdev did not take so no mbuf is leaked */
		for (; unlikely(nb_in_txd < nb_in_deq); nb_in_txd++) {
			rte_pktmbuf_free(rxtx_bufs[nb_in_txd]);
			stats->freed_pkts++;
		}
	}
}

至此,dpdk-pdump抓包程序的源码解析完成。

你可能感兴趣的:(C/C++)