DPDK初始化分析(二)

目录

一、概述

二、流程分析

2.1 rte_eal_intr_init

2.2 rte_mp_channel_init

2.3 rte_mp_dev_hotplug_init

2.4 rte_bus_scan

2.5 rte_bus_get_iommu_class


一、概述

初始化分析的第二个部分,主要包括中断初始化和bus扫描流程。

二、流程分析

2.1 rte_eal_intr_init

/**
 * Initialise EAL interrupt handling.
 *
 * Resets the global list of interrupt sources, creates the pipe that is
 * used to ask the interrupt thread to rebuild its epoll wait list, and
 * starts the "eal-intr-thread" control thread running
 * eal_intr_thread_main().
 *
 * @return
 *   0 on success, -1 if the pipe cannot be created, otherwise the non-zero
 *   value returned by rte_ctrl_thread_create(). rte_errno is set on failure.
 */
int rte_eal_intr_init(void)
{
	int ret = 0;

	/* init the global interrupt source head */
	TAILQ_INIT(&intr_sources);

	/**
	 * create a pipe which will be waited by epoll and notified to
	 * rebuild the wait list of epoll.
	 */
	if (pipe(intr_pipe.pipefd) < 0) {
		rte_errno = errno;
		return -1;
	}

	/* create the host thread to wait/handle the interrupt */
	ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
			eal_intr_thread_main, NULL);
	if (ret != 0) {
		rte_errno = -ret;
		RTE_LOG(ERR, EAL,
			"Failed to create thread for interrupt handling\n");
		/* no thread will ever wait on the pipe: do not leak its two ends */
		close(intr_pipe.pipefd[0]);
		close(intr_pipe.pipefd[1]);
	}

	return ret;
}
  • 没什么可说的,把全局变量列在这:intr_sources、intr_pipe、intr_thread。intr_sources记录所有注册的中断源,后面注册函数会说到;intr_pipe是管道,用于在注册后通知中断线程重建(rebuild)epoll的wait list;intr_thread就是这里创建的中断处理线程

看一下ctrl thread eal_intr_thread_main,函数不贴了,就是将intr_pipe.readfd和intr_sources中的src->intr_handle.fd加入epoll,然后通过eal_intr_handle_interrupts检测epoll事件,重点看下eal_intr_process_interrupts

		/*
		 * The event is on the read end of the notification pipe: drain
		 * it and return -1 so that the caller goes back and rebuilds
		 * the epoll wait list.
		 */
		if (events[n].data.fd == intr_pipe.readfd){
			int r = read(intr_pipe.readfd, buf.charbuf,
					sizeof(buf.charbuf));
			/* the number of bytes read is not used */
			RTE_SET_USED(r);
			return -1;
		}
  • 如果是管道的读端有数据,就把数据读出并返回-1,回到上层重新rebuild wait list;在注册部分会向管道的写端(writefd)写入数据来触发这一过程
		/*
		 * With intr_lock held, search the registered interrupt sources
		 * for the one whose fd matches the fd of this epoll event.
		 */
		rte_spinlock_lock(&intr_lock);
		TAILQ_FOREACH(src, &intr_sources, next)
			if (src->intr_handle.fd ==
					events[n].data.fd)
				break;
		/* no source matches this fd: drop the lock and go on to the next event */
		if (src == NULL){
			rte_spinlock_unlock(&intr_lock);
			continue;
		}
  • 事件fd要和src->intr_handle.fd匹配(后面会说到这个fd就是/dev/uioX),不匹配就继续检测下一个事件

接下来根据src->intr_handle.type确定接收字节长度,如UIO是sizeof(buf.uio_intr_count), alarm是sizeof(buf.timerfd_num),VFIO是sizeof(buf.vfio_intr_count)

		/*
		 * Run every callback attached to the source. intr_lock is held
		 * on entry; it is released around each callback invocation and
		 * re-taken before moving to the next list entry.
		 */
		if (call) {

			/* Finally, call all callbacks. */
			TAILQ_FOREACH(cb, &src->callbacks, next) {

				/* make a copy and unlock. */
				active_cb = *cb;
				rte_spinlock_unlock(&intr_lock);

				/* call the actual callback */
				active_cb.cb_fn(active_cb.cb_arg);

				/*get the lock back. */
				rte_spinlock_lock(&intr_lock);
			}
		}

如果读取成功呢,执行src上挂的callback。

接下来看中断是如何注册的

int rte_intr_callback_register(const struct rte_intr_handle *intr_handle, rte_intr_callback_fn cb, void *cb_arg)

第二和第三个参数就是刚说到的callback,用下面的结构描述:

/* One user callback (function plus argument) attached to an interrupt source. */
struct rte_intr_callback {
	/* linkage in the owning source's callback list */
	TAILQ_ENTRY(rte_intr_callback) next;
	rte_intr_callback_fn cb_fn;  /**< callback address */
	void *cb_arg;                /**< parameter for callback */
};

在分析函数前,先看一下具体的数据结构。前面提到了intr_sources,来看一下注册到这个管理结构上的具体数据类型:

/* One registered interrupt source: an interrupt handle and its callbacks. */
struct rte_intr_source {
	/* linkage in the global intr_sources list */
	TAILQ_ENTRY(rte_intr_source) next;
	struct rte_intr_handle intr_handle; /**< interrupt handle */
	struct rte_intr_cb_list callbacks;  /**< user callbacks */
	/* NOTE(review): not used in the code shown here - presumably marks
	 * that the callbacks are currently being executed; confirm.
	 */
	uint32_t active;
};

callbacks我们已经说过了,intr_handle就是函数的第一个参数,也一起看下:

/*
 * Description of one interrupt source as seen by the EAL: the file
 * descriptor that is polled for events, the handle type, and the
 * per-vector event fd bookkeeping.
 */
struct rte_intr_handle {
	RTE_STD_C11
	/* the two descriptors share storage; only one of them is meaningful */
	union {
		int vfio_dev_fd;  /**< VFIO device file descriptor */
		int uio_cfg_fd;  /**< UIO cfg file desc for uio_pci_generic */
	};
	int fd;	 /**< interrupt event file descriptor */
	enum rte_intr_handle_type type;  /**< handle type */
	uint32_t max_intr;             /**< max interrupt requested */
	uint32_t nb_efd;               /**< number of available efd(event fd) */
	uint8_t efd_counter_size;      /**< size of efd counter, used for vdev */
	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
				       /**< intr vector epoll event */
	int *intr_vec;                 /**< intr vector number array */
};

这个结构在这个系列文章都会接触到,现在先略过,遇到再详细介绍。

其实了解了数据结构,注册过程就非常简单了,就是构造并初始化,把它挂到intr_sources上。

2.2 rte_mp_channel_init

  • /var/run/dpdk/rte/mp_socket_*
  • mp_filter = mp_socket_*(basename部分)
  • mp_dir_path = /var/run/dpdk/rte(dirname部分)

 

  • dirname和basename会改变原始字符串,注意恢复一下
/**
 * Create the AF_UNIX datagram socket used for multi-process messages and
 * bind it to the path produced by create_socket_path().
 *
 * The descriptor is stored in the file-level mp_fd. Any stale socket file
 * left at that path is removed before binding.
 *
 * @return
 *   The socket descriptor on success, -1 if the socket cannot be created
 *   or bound.
 */
static int open_socket_fd(void)
{
	struct sockaddr_un un;

	peer_name[0] = '\0';

	mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
	if (mp_fd < 0) {
		/* without this check the code would go on to bind()/close() on -1 */
		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
		return -1;
	}

	memset(&un, 0, sizeof(un));
	un.sun_family = AF_UNIX;

	create_socket_path(peer_name, un.sun_path, sizeof(un.sun_path));

	unlink(un.sun_path); /* May still exist since last run */

	if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
			un.sun_path, strerror(errno));
		close(mp_fd);
		return -1;
	}

	RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
	return mp_fd;
}
  • 这里创建一个AF_UNIX类型的数据报(SOCK_DGRAM)socket用于进程间通信

接着创建名为rte_mp_handle的控制线程,rte_ctrl_thread_create会尝试将自己绑定在检测到但是没有使用的lcore上,如果lcore都在使用,就在master上执行。最终执行

/*
 * Body of the multi-process message thread: receive messages forever and
 * pass each successfully read one, together with the sender address, to
 * process_msg(). The thread argument is unused.
 */
static void * mp_handle(void *arg __rte_unused)
{
	struct sockaddr_un sender;
	struct mp_msg_internal incoming;

	for (;;) {
		/* a failed read is simply retried */
		if (read_msg(&incoming, &sender) != 0)
			continue;

		process_msg(&incoming, &sender);
	}

	return NULL;
}

read_msg是对AF_UNIX的标准接收流程:

/*
 * Receive one message from the multi-process socket mp_fd.
 *
 * The payload is read straight into *m; its expected length is the size of
 * *m minus the size of the fd array, which is transported as SCM_RIGHTS
 * ancillary data instead. The sender address is stored in *s.
 *
 * Returns 0 on success, -1 if recvmsg() fails or if the message is short,
 * over-long or truncated.
 */
static int read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
	const int payload_len = sizeof(*m) - sizeof(m->msg.fds);
	char ancillary[CMSG_SPACE(sizeof(m->msg.fds))];
	struct cmsghdr *hdr;
	struct msghdr rx;
	struct iovec vec;
	int received;

	vec.iov_base = m;
	vec.iov_len  = payload_len;

	memset(&rx, 0, sizeof(rx));
	rx.msg_name = s;
	rx.msg_namelen = sizeof(*s);
	rx.msg_iov = &vec;
	rx.msg_iovlen = 1;
	rx.msg_control = ancillary;
	rx.msg_controllen = sizeof(ancillary);

	received = recvmsg(mp_fd, &rx, 0);
	if (received < 0) {
		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
		return -1;
	}

	/* reject anything that is not exactly one complete message */
	if ((rx.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) ||
			received != payload_len) {
		RTE_LOG(ERR, EAL, "truncted msg\n");
		return -1;
	}

	/* copy the passed descriptors from the first SCM_RIGHTS header, if any */
	hdr = CMSG_FIRSTHDR(&rx);
	while (hdr != NULL) {
		if (hdr->cmsg_level == SOL_SOCKET &&
				hdr->cmsg_type == SCM_RIGHTS) {
			memcpy(m->msg.fds, CMSG_DATA(hdr), sizeof(m->msg.fds));
			break;
		}
		hdr = CMSG_NXTHDR(&rx, hdr);
	}

	return 0;
}

最终msg在process_msg中被处理:

/*
 * Dispatch one received multi-process message.
 *
 * Replies (type MP_REP or MP_IGN) are matched against a pending request
 * from the same peer path and message name. Any other message is routed to
 * the action registered under the message name.
 *
 * m: the received message
 * s: address of the peer that sent it
 */
static void process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
	struct pending_request *pending_req;
	struct action_entry *entry;
	struct rte_mp_msg *msg = &m->msg;
	rte_mp_t action = NULL;

	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);

	if (m->type == MP_REP || m->type == MP_IGN) {
		/* set only for async requests; acted upon after the lock is dropped */
		struct pending_request *req = NULL;

		pthread_mutex_lock(&pending_requests.lock);
		pending_req = find_pending_request(s->sun_path, msg->name);
		if (pending_req) {
			memcpy(pending_req->reply, msg, sizeof(*msg));
			/* -1 indicates that we've been asked to ignore */
			pending_req->reply_received =
				m->type == MP_REP ? 1 : -1;

			/* sync requests are signalled through their condition variable */
			if (pending_req->type == REQUEST_TYPE_SYNC)
				pthread_cond_signal(&pending_req->sync.cond);
			else if (pending_req->type == REQUEST_TYPE_ASYNC)
				req = async_reply_handle_thread_unsafe(
						pending_req);
		} else
			RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
		pthread_mutex_unlock(&pending_requests.lock);

		/* runs outside pending_requests.lock */
		if (req != NULL)
			trigger_async_action(req);
		return;
	}

	/* fetch the action pointer under the lock, invoke it after unlocking */
	pthread_mutex_lock(&mp_mutex_action);
	entry = find_action_entry_by_name(msg->name);
	if (entry != NULL)
		action = entry->action;
	pthread_mutex_unlock(&mp_mutex_action);

	if (!action) {
		if (m->type == MP_REQ && !internal_config.init_complete) {
			/* if this is a request, and init is not yet complete,
			 * and callback wasn't registered, we should tell the
			 * requester to ignore our existence because we're not
			 * yet ready to process this request.
			 */
			struct rte_mp_msg dummy;

			memset(&dummy, 0, sizeof(dummy));
			strlcpy(dummy.name, msg->name, sizeof(dummy.name));
			mp_send(&dummy, s->sun_path, MP_IGN);
		} else {
			RTE_LOG(ERR, EAL, "Cannot find action: %s\n",
				msg->name);
		}
	} else if (action(msg, s->sun_path) < 0) {
		/* a negative return from the action is only logged */
		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
	}
}
  • process_msg就是找到注册在action_entry_list的action作为响应,注册接口就是前面提到的:

int __rte_experimental rte_mp_action_register(const char *name, rte_mp_t action)

2.3 rte_mp_dev_hotplug_init

就是注册一个action

#define EAL_DEV_MP_ACTION_REQUEST      "eal_dev_mp_request"

handle_secondary_request

2.4 rte_bus_scan

rte_bus_scan是bus scan提供的主接口,内部会调用所有bus->scan。接口的目的是扫描所有bus下注册的设备

这里分析一下pci设备扫描过程——同内核扫描流程不同,DPDK只是将kernel扫描pci后建立的sysfs信息读取出来,获得内核已经扫描好的pci信息

[drivers/bus/pci/linux/pci.c]

/*
 * Abridged excerpt: local declarations, the "error" label, cleanup and the
 * return statement are not shown.
 *
 * Walks the PCI sysfs directory and hands every entry whose name parses as
 * a PCI address to pci_scan_one().
 */
int rte_pci_scan(void)
{
	dir = opendir(rte_pci_get_sysfs_path());

	while ((e = readdir(dir)) != NULL) {
		/* skip ".", ".." and any other dot-prefixed entry */
		if (e->d_name[0] == '.')
			continue;

		/* ignore entries whose name is not a PCI address */
		if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
			continue;

		/* full path of this device's sysfs directory */
		snprintf(dirname, sizeof(dirname), "%s/%s",
				rte_pci_get_sysfs_path(), e->d_name);

		/* a failure on any single device aborts the whole scan */
		if (pci_scan_one(dirname, &addr) < 0)
			goto error;
	}
}
  • bus下默认的设备路径在/sys/bus/pci/devices

在linux设备模型中总线类型下挂有属于该bus的device和driver的文件夹,每个文件夹里存在具体的device指向实际的设备文件:

[a@localhost driver]$ ll /sys/bus/pci/devices/
lrwxrwxrwx. 1 root root 0 4月  15 14:22 0000:00:00.0 -> ../../../devices/pci0000:00/0000:00:00.0
lrwxrwxrwx. 1 root root 0 4月  15 14:22 0000:00:01.0 -> ../../../devices/pci0000:00/0000:00:01.0
...

  • parse_pci_addr_format 就是将上面列出的目录项名称(如0000:00:00.0)解析出来,得到DBDF(domain:bus:device.function)

接下来看pci_scan_one函数,函数名称是针对一个具体的设备说的。函数首先会为扫描到的设备分配空间,再通过访问上述路径,先读取一些诸如vendor_id、device_id这些位于pci配置空间但由kernel通过sysfs导出的信息;如果启用了sriov,还有max_vfs、sriov_numvfs这些信息。这里先看一下pci设备抽象

/*
 * A PCI device as seen by the PCI bus driver: the generic rte_device plus
 * PCI address, IDs, memory resources, interrupt handles and bound driver.
 */
struct rte_pci_device {
	TAILQ_ENTRY(rte_pci_device) next;   /**< Next probed PCI device. */
	struct rte_device device;           /**< Inherit core device */
	struct rte_pci_addr addr;           /**< PCI location. */
	struct rte_pci_id id;               /**< PCI ID. */
	struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
					    /**< PCI Memory Resource */
	struct rte_intr_handle intr_handle; /**< Interrupt handle */
	struct rte_pci_driver *driver;      /**< PCI driver used in probing */
	uint16_t max_vfs;                   /**< sriov enable if not zero */
	enum rte_kernel_driver kdrv;        /**< Kernel driver passthrough */
	char name[PCI_PRI_STR_SIZE+1];      /**< PCI location (ASCII) */
	/* NOTE(review): presumably the interrupt handle for VFIO device
	 * request notifications - not used in the code shown here; confirm.
	 */
	struct rte_intr_handle vfio_req_intr_handle;
};

该结构描述了基本的pci设备抽象:

  • 继承rte_device
  • pci设备的属性 rte_pci_addr(DBDF),rte_pci_id(配置空间),rte_mem_resource(io mapping & memory mapping),max_vfs
  • 中断 (intr_handle, vfio_req_intr_handle)
  • driver

这里再顺便看一眼rte_device提供的基本抽象功能:

/*
 * Bus-independent part of a device; bus-specific device structures embed
 * it (struct rte_pci_device does so through its "device" member).
 */
struct rte_device {
	TAILQ_ENTRY(rte_device) next; /**< Next device */
	const char *name;             /**< Device name */
	const struct rte_driver *driver; /**< Driver assigned after probing */
	const struct rte_bus *bus;    /**< Bus handle assigned on scan */
	int numa_node;                /**< NUMA node connection */
	struct rte_devargs *devargs;  /**< Arguments for latest probing */
};
  • 我们看到在DPDK初始化分析(一)中看到的构建rte_device的参数devargs,还有与device关联的bus和driver

pci_name_set用于设备名称的初始化

/**
 * Set the names of a scanned PCI device.
 *
 * dev->name always receives the canonical name derived from the PCI
 * address. dev->device.devargs is set to the devargs found for the device
 * (NULL if none), and dev->device.name points either at the devargs name or,
 * when there are no devargs, at dev->name.
 *
 * @param dev
 *   Device whose addr field is already filled in.
 */
void
pci_name_set(struct rte_pci_device *dev)
{
	struct rte_devargs *devargs;

	/* Each device has its internal, canonical name set. */
	rte_pci_device_name(&dev->addr,
			dev->name, sizeof(dev->name));
	devargs = pci_devargs_lookup(dev);
	dev->device.devargs = devargs;
	/* In blacklist mode, if the device is not blacklisted, no
	 * rte_devargs exists for it.
	 */
	if (devargs != NULL) {
		/* If an rte_devargs exists, the generic rte_device uses the
		 * given name as its name.
		 */
		dev->device.name = dev->device.devargs->name;
	} else {
		/* Otherwise, it uses the internal, canonical form. */
		dev->device.name = dev->name;
	}
}
  • dev->name使用的格式为DBDF
  • dev->device.name使用设备参数devargs的名称,如果没有devargs,则指向dev->name,两者保持相同
  • dev->device.devargs = devargs

接着会解析设备的resource资源和对应的驱动情况,流程很简单,这里将相关的目录内容列举一下:

resource:

[root@localhost e1000]# cat /sys/bus/pci/devices/0000\:02\:01.0/resource
0x00000000fd5a0000 0x00000000fd5bffff 0x0000000000140204             (phy_start,phy_end, flag)
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x00000000fdff0000 0x00000000fdffffff 0x0000000000140204
...

driver:

[root@localhost e1000]# ll /sys/bus/pci/devices/0000\:02\:01.0/driver
/sys/bus/pci/devices/0000:02:01.0/driver -> ../../../../bus/pci/drivers/e1000

上图是我在虚拟机上列的e1000 pci设备的驱动信息,在DPDK上接管的端口属于下面的三种:

	/*
	 * Translate the kernel driver name into the kdrv type of the device.
	 * NOTE(review): ret and driver are set before this excerpt -
	 * presumably ret == 0 means a bound kernel driver was found and its
	 * name stored in driver; confirm.
	 */
	if (!ret) {
		if (!strcmp(driver, "vfio-pci"))
			dev->kdrv = RTE_KDRV_VFIO;
		else if (!strcmp(driver, "igb_uio"))
			dev->kdrv = RTE_KDRV_IGB_UIO;
		else if (!strcmp(driver, "uio_pci_generic"))
			dev->kdrv = RTE_KDRV_UIO_GENERIC;
		else
			/* some other driver name */
			dev->kdrv = RTE_KDRV_UNKNOWN;
	} else
		dev->kdrv = RTE_KDRV_NONE;

当rte_pci_device的基本信息都收集完成后,会将这个结构按PCI地址顺序加入rte_pci_bus.device_list链表。

 

至此,在bus scan阶段,一个rte_pci_device被初始化的部分列在下面:

struct rte_pci_device {
    TAILQ_ENTRY(rte_pci_device) next
    struct rte_device device;         
    struct rte_pci_addr addr;        
    struct rte_pci_id id;             
    struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
    struct rte_intr_handle intr_handle; /**< Interrupt handle */
    struct rte_pci_driver *driver;      /**< PCI driver used in probing */
    uint16_t max_vfs;                   /**< sriov enable if not zero */
    enum rte_kernel_driver kdrv;        /**< Kernel driver passthrough */
    char name[PCI_PRI_STR_SIZE+1];      /**< PCI location (ASCII) */
    struct rte_intr_handle vfio_req_intr_handle;
};

此时,结构的状态如下图所示:

DPDK初始化分析(二)_第1张图片

2.5 rte_bus_get_iommu_class

如果没有指定iommu(iova)的mode,要使用rte_bus_get_iommu_class检测,对应bus->get_iommu_class,根据代码,如果使用kni,那么会强制iova是RTE_IOVA_PA,上述函数在pci总线下对应rte_pci_get_iommu_class

 

 

 

 

 

你可能感兴趣的:(DPDK)