Contents
1. Overview
2. Flow Analysis
2.1 rte_eal_intr_init
2.2 rte_mp_channel_init
2.3 rte_mp_dev_hotplug_init
2.4 rte_bus_scan
2.5 rte_bus_get_iommu_class
1. Overview

This is the second part of the initialization analysis; it mainly covers interrupt initialization and the bus scan flow.

2. Flow Analysis

2.1 rte_eal_intr_init

rte_eal_intr_init initializes the interrupt source list, the notification pipe, and the interrupt handling thread:
int rte_eal_intr_init(void)
{
    int ret = 0;

    /* init the global interrupt source head */
    TAILQ_INIT(&intr_sources);

    /**
     * create a pipe which will be waited by epoll and notified to
     * rebuild the wait list of epoll.
     */
    if (pipe(intr_pipe.pipefd) < 0) {
        rte_errno = errno;
        return -1;
    }

    /* create the host thread to wait/handle the interrupt */
    ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
            eal_intr_thread_main, NULL);
    if (ret != 0) {
        rte_errno = -ret;
        RTE_LOG(ERR, EAL,
            "Failed to create thread for interrupt handling\n");
    }

    return ret;
}
Now look at the control thread body, eal_intr_thread_main. The full function is not reproduced here; condensed, it builds an epoll wait list containing intr_pipe.readfd plus the src->intr_handle.fd of every source on intr_sources, then waits for events in eal_intr_handle_interrupts.
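A condensed sketch of the thread body (the comments are mine; error handling and a few handle types are omitted, so treat this as an outline rather than the verbatim source):

static void *
eal_intr_thread_main(void *arg)
{
    for (;;) {
        struct epoll_event ev;
        struct rte_intr_source *src;
        unsigned int numfds = 0;

        /* a fresh epoll instance is built on every pass */
        int pfd = epoll_create(1);

        /* the pipe is written whenever a callback is (un)registered,
         * which wakes this thread so it can rebuild the wait list */
        ev.events = EPOLLIN | EPOLLPRI;
        ev.data.fd = intr_pipe.readfd;
        epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, &ev);
        numfds++;

        /* add the fd of every registered interrupt source */
        rte_spinlock_lock(&intr_lock);
        TAILQ_FOREACH(src, &intr_sources, next) {
            if (TAILQ_EMPTY(&src->callbacks))
                continue; /* skip sources with no callback */
            ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
            ev.data.fd = src->intr_handle.fd;
            epoll_ctl(pfd, EPOLL_CTL_ADD, src->intr_handle.fd, &ev);
            numfds++;
        }
        rte_spinlock_unlock(&intr_lock);

        /* block on epoll_wait and dispatch events; returns when the
         * wait list has to be rebuilt (pipe event) */
        eal_intr_handle_interrupts(pfd, numfds);
        close(pfd);
    }
    return NULL;
}

eal_intr_handle_interrupts loops on epoll_wait and hands every ready event to eal_intr_process_interrupts, which is the interesting part: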
        if (events[n].data.fd == intr_pipe.readfd) {
            /* the pipe was written by callback (un)registration:
             * drain it and return so the epoll wait list gets rebuilt */
            int r = read(intr_pipe.readfd, buf.charbuf,
                    sizeof(buf.charbuf));
            RTE_SET_USED(r);
            return -1;
        }

        /* find the interrupt source that owns this fd */
        rte_spinlock_lock(&intr_lock);
        TAILQ_FOREACH(src, &intr_sources, next)
            if (src->intr_handle.fd ==
                    events[n].data.fd)
                break;

        /* not found: the source may have been unregistered meanwhile */
        if (src == NULL) {
            rte_spinlock_unlock(&intr_lock);
            continue;
        }
Next, the number of bytes to read is determined by src->intr_handle.type: for UIO it is sizeof(buf.uio_intr_count), for alarm it is sizeof(buf.timerfd_num), and for VFIO it is sizeof(buf.vfio_intr_count).
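Condensed from the same function, the type switch and the subsequent read look roughly like this (a sketch, not verbatim; the VDEV/external handle types are omitted):

        switch (src->intr_handle.type) {
        case RTE_INTR_HANDLE_UIO:
        case RTE_INTR_HANDLE_UIO_INTX:
            bytes_read = sizeof(buf.uio_intr_count);
            break;
        case RTE_INTR_HANDLE_ALARM:
            bytes_read = sizeof(buf.timerfd_num);
            break;
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
        case RTE_INTR_HANDLE_VFIO_LEGACY:
            bytes_read = sizeof(buf.vfio_intr_count);
            break;
        default:
            bytes_read = 1;
            break;
        }

        if (bytes_read > 0) {
            /* read to clear the ready-to-be-read flag for epoll_wait */
            bytes_read = read(events[n].data.fd, &buf, bytes_read);
            if (bytes_read > 0)
                call = true;
        }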
        if (call) {
            /* Finally, call all callbacks. */
            TAILQ_FOREACH(cb, &src->callbacks, next) {
                /* make a copy and unlock. */
                active_cb = *cb;
                rte_spinlock_unlock(&intr_lock);

                /* call the actual callback */
                active_cb.cb_fn(active_cb.cb_arg);

                /* get the lock back. */
                rte_spinlock_lock(&intr_lock);
            }
        }
If the read succeeds, every callback attached to the src is invoked, as the loop above shows.
Next, let's see how an interrupt callback gets registered:
int rte_intr_callback_register(const struct rte_intr_handle *intr_handle, rte_intr_callback_fn cb, void *cb_arg)
The second and third arguments form the callback just mentioned, described by the following structure:
struct rte_intr_callback {
    TAILQ_ENTRY(rte_intr_callback) next;
    rte_intr_callback_fn cb_fn;  /**< callback address */
    void *cb_arg;                /**< parameter for callback */
};
Before analyzing the function, look at the data structures involved. intr_sources was mentioned earlier; the element type hung on that management list is:
struct rte_intr_source {
    TAILQ_ENTRY(rte_intr_source) next;
    struct rte_intr_handle intr_handle; /**< interrupt handle */
    struct rte_intr_cb_list callbacks;  /**< user callbacks */
    uint32_t active;
};
callbacks has already been covered; intr_handle is the first argument of the function, so let's look at it as well:
struct rte_intr_handle {
    RTE_STD_C11
    union {
        int vfio_dev_fd;  /**< VFIO device file descriptor */
        int uio_cfg_fd;   /**< UIO cfg file desc for uio_pci_generic */
    };
    int fd;                             /**< interrupt event file descriptor */
    enum rte_intr_handle_type type;     /**< handle type */
    uint32_t max_intr;                  /**< max interrupt requested */
    uint32_t nb_efd;                    /**< number of available efd(event fd) */
    uint8_t efd_counter_size;           /**< size of efd counter, used for vdev */
    int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /**< intr vectors/efds mapping */
    struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
                                        /**< intr vector epoll event */
    int *intr_vec;                      /**< intr vector number array */
};
This structure shows up throughout the series; it is skipped here and will be explained in detail when we run into it again.
Once the data structures are clear, the registration process is very simple: build and initialize an rte_intr_source with its rte_intr_callback, and hang it on intr_sources.
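As a usage sketch (the my_dev structure and handler below are hypothetical, made up for illustration; only rte_intr_callback_register itself is the DPDK API): a driver registers its handler against the intr_handle it owns, and the registration pokes intr_pipe so the interrupt thread rebuilds its epoll wait list.

#include <rte_interrupts.h>

struct my_dev {                      /* hypothetical device context */
    struct rte_intr_handle intr_handle;
    /* ... */
};

/* hypothetical handler: runs in the eal-intr-thread context */
static void
my_dev_intr_handler(void *cb_arg)
{
    struct my_dev *dev = cb_arg;
    /* acknowledge/handle the interrupt for this device */
    (void)dev;
}

static int
my_dev_setup_irq(struct my_dev *dev)
{
    /* creates an rte_intr_source for dev->intr_handle if needed,
     * appends the callback, and notifies the interrupt thread */
    return rte_intr_callback_register(&dev->intr_handle,
            my_dev_intr_handler, dev);
}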
2.2 rte_mp_channel_init

rte_mp_channel_init sets up the EAL multi-process (IPC) channel. One detail worth noting while it builds the socket paths:
- dirname and basename may modify the string passed to them, so the path has to be rebuilt afterwards (see the sketch below).
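The relevant part of rte_mp_channel_init looks roughly like this (a condensed sketch, not a verbatim quote):

    /* create filter path */
    create_socket_path("*", path, sizeof(path));
    strlcpy(mp_filter, basename(path), sizeof(mp_filter));

    /* basename() may have modified path, so recreate it */
    create_socket_path("*", path, sizeof(path));
    strlcpy(mp_dir_path, dirname(path), sizeof(mp_dir_path));

With the filter name and directory recorded, open_socket_fd creates and binds the Unix domain datagram socket used for IPC: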
static int open_socket_fd(void)
{
    struct sockaddr_un un;

    peer_name[0] = '\0';
    mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);

    memset(&un, 0, sizeof(un));
    un.sun_family = AF_UNIX;

    create_socket_path(peer_name, un.sun_path, sizeof(un.sun_path));

    unlink(un.sun_path); /* May still exist since last run */

    if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
        RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
            un.sun_path, strerror(errno));
        close(mp_fd);
        return -1;
    }

    RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
    return mp_fd;
}
Next a control thread named rte_mp_handle is created. rte_ctrl_thread_create tries to pin the thread to a detected but unused lcore; if all lcores are in use, it runs on the master lcore. The thread body is:
static void * mp_handle(void *arg __rte_unused)
{
    struct mp_msg_internal msg;
    struct sockaddr_un sa;

    while (1) {
        if (read_msg(&msg, &sa) == 0)
            process_msg(&msg, &sa);
    }

    return NULL;
}
read_msg is the standard receive flow for an AF_UNIX socket:
static int read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
    int msglen;
    struct iovec iov;
    struct msghdr msgh;
    char control[CMSG_SPACE(sizeof(m->msg.fds))];
    struct cmsghdr *cmsg;
    int buflen = sizeof(*m) - sizeof(m->msg.fds);

    memset(&msgh, 0, sizeof(msgh));
    iov.iov_base = m;
    iov.iov_len = buflen;

    msgh.msg_name = s;
    msgh.msg_namelen = sizeof(*s);
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = control;
    msgh.msg_controllen = sizeof(control);

    msglen = recvmsg(mp_fd, &msgh, 0);
    if (msglen < 0) {
        RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
        return -1;
    }

    if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
        RTE_LOG(ERR, EAL, "truncted msg\n");
        return -1;
    }

    /* read auxiliary FDs if any */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
            cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        if ((cmsg->cmsg_level == SOL_SOCKET) &&
                (cmsg->cmsg_type == SCM_RIGHTS)) {
            memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
            break;
        }
    }

    return 0;
}
The received msg finally lands in process_msg:
static void process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
    struct pending_request *pending_req;
    struct action_entry *entry;
    struct rte_mp_msg *msg = &m->msg;
    rte_mp_t action = NULL;

    RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);

    if (m->type == MP_REP || m->type == MP_IGN) {
        struct pending_request *req = NULL;

        pthread_mutex_lock(&pending_requests.lock);
        pending_req = find_pending_request(s->sun_path, msg->name);
        if (pending_req) {
            memcpy(pending_req->reply, msg, sizeof(*msg));
            /* -1 indicates that we've been asked to ignore */
            pending_req->reply_received =
                m->type == MP_REP ? 1 : -1;

            if (pending_req->type == REQUEST_TYPE_SYNC)
                pthread_cond_signal(&pending_req->sync.cond);
            else if (pending_req->type == REQUEST_TYPE_ASYNC)
                req = async_reply_handle_thread_unsafe(
                        pending_req);
        } else
            RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
        pthread_mutex_unlock(&pending_requests.lock);

        if (req != NULL)
            trigger_async_action(req);
        return;
    }

    pthread_mutex_lock(&mp_mutex_action);
    entry = find_action_entry_by_name(msg->name);
    if (entry != NULL)
        action = entry->action;
    pthread_mutex_unlock(&mp_mutex_action);

    if (!action) {
        if (m->type == MP_REQ && !internal_config.init_complete) {
            /* if this is a request, and init is not yet complete,
             * and callback wasn't registered, we should tell the
             * requester to ignore our existence because we're not
             * yet ready to process this request.
             */
            struct rte_mp_msg dummy;

            memset(&dummy, 0, sizeof(dummy));
            strlcpy(dummy.name, msg->name, sizeof(dummy.name));
            mp_send(&dummy, s->sun_path, MP_IGN);
        } else {
            RTE_LOG(ERR, EAL, "Cannot find action: %s\n",
                msg->name);
        }
    } else if (action(msg, s->sun_path) < 0) {
        RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
    }
}
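To tie the pieces together, here is a hypothetical usage sketch of the IPC API that rides on this channel. The action name "my_app_ping", ping_action and send_ping are made up for illustration; rte_mp_action_register, rte_mp_reply and rte_mp_request_sync are the real (experimental, in this release) API:

#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <rte_eal.h>
#include <rte_string_fns.h>

/* hypothetical action: runs in the mp_handle thread of the receiving process */
static int
ping_action(const struct rte_mp_msg *msg, const void *peer)
{
    struct rte_mp_msg reply;

    memset(&reply, 0, sizeof(reply));
    strlcpy(reply.name, msg->name, sizeof(reply.name));
    return rte_mp_reply(&reply, peer);
}

/* register the action under a name, e.g. at startup:
 *     rte_mp_action_register("my_app_ping", ping_action);
 */

/* in the other process: send a request and wait synchronously for replies */
static int
send_ping(void)
{
    struct rte_mp_msg req;
    struct rte_mp_reply replies;
    struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
    int ret;

    memset(&req, 0, sizeof(req));
    strlcpy(req.name, "my_app_ping", sizeof(req.name));

    ret = rte_mp_request_sync(&req, &replies, &ts);
    if (ret == 0)
        free(replies.msgs); /* caller owns the reply array */
    return ret;
}

The reply must reuse the request's name, because process_msg matches replies to pending requests by peer path and message name (find_pending_request above).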
2.3 rte_mp_dev_hotplug_init

rte_mp_dev_hotplug_init simply registers an IPC action:
int __rte_experimental rte_mp_action_register(const char *name, rte_mp_t action)
The name is EAL_DEV_MP_ACTION_REQUEST:
#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request"
and the registered action is handle_secondary_request, which process_msg will dispatch to when a device hotplug request arrives over the mp channel.
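For reference, the function itself is roughly the following (a sketch of the version this series follows; details may differ slightly between releases):

int rte_mp_dev_hotplug_init(void)
{
    int ret;

    /* register the handler for hotplug requests coming over the mp channel */
    ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST,
            handle_secondary_request);
    if (ret && rte_errno != ENOTSUP) {
        RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
            EAL_DEV_MP_ACTION_REQUEST);
        return ret;
    }

    return 0;
}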
2.4 rte_bus_scan

rte_bus_scan is the top-level entry point of the bus scan: internally it walks all registered buses and invokes each bus->scan. Its purpose is to discover the devices present on every bus; a sketch follows.
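A sketch of rte_bus_scan (close to the source; a failed scan only logs an error and the loop continues):

int rte_bus_scan(void)
{
    int ret;
    struct rte_bus *bus = NULL;

    /* every bus registered via RTE_REGISTER_BUS sits on rte_bus_list */
    TAILQ_FOREACH(bus, &rte_bus_list, next) {
        ret = bus->scan();
        if (ret)
            RTE_LOG(ERR, EAL, "Scan for (%s) bus failed.\n",
                bus->name);
    }

    return 0;
}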
Here we look at the PCI device scan. Unlike the kernel's own scan, DPDK merely reads back the sysfs entries the kernel created when it scanned the PCI bus, i.e. it reuses the PCI information the kernel has already gathered.
[drivers/bus/pci/linux/pci.c]
int rte_pci_scan(void)
{
    /* excerpt: local declarations and error/cleanup handling are omitted here */
    dir = opendir(rte_pci_get_sysfs_path());

    while ((e = readdir(dir)) != NULL) {
        if (e->d_name[0] == '.')
            continue;

        if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
            continue;

        snprintf(dirname, sizeof(dirname), "%s/%s",
                rte_pci_get_sysfs_path(), e->d_name);

        if (pci_scan_one(dirname, &addr) < 0)
            goto error;
    }
}
In the Linux device model, each bus type has devices and drivers directories under it; the entries in devices are symlinks that point to the actual device nodes in sysfs:
[a@localhost driver]$ ll /sys/bus/pci/devices/
lrwxrwxrwx. 1 root root 0 4月 15 14:22 0000:00:00.0 -> ../../../devices/pci0000:00/0000:00:00.0
lrwxrwxrwx. 1 root root 0 4月 15 14:22 0000:00:01.0 -> ../../../devices/pci0000:00/0000:00:01.0
...
Next comes pci_scan_one; as the name says, it handles one specific device. The function first allocates an rte_pci_device for the device, then reads attributes such as vendor_id and device_id (values that live in PCI config space but are exported through sysfs by the kernel), and, if SR-IOV is enabled, max_vfs or sriov_numvfs (see the sketch after the two structure definitions below). First, the PCI device abstraction:
struct rte_pci_device {
    TAILQ_ENTRY(rte_pci_device) next;    /**< Next probed PCI device. */
    struct rte_device device;            /**< Inherit core device */
    struct rte_pci_addr addr;            /**< PCI location. */
    struct rte_pci_id id;                /**< PCI ID. */
    struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
                                         /**< PCI Memory Resource */
    struct rte_intr_handle intr_handle;  /**< Interrupt handle */
    struct rte_pci_driver *driver;       /**< PCI driver used in probing */
    uint16_t max_vfs;                    /**< sriov enable if not zero */
    enum rte_kernel_driver kdrv;         /**< Kernel driver passthrough */
    char name[PCI_PRI_STR_SIZE+1];       /**< PCI location (ASCII) */
    struct rte_intr_handle vfio_req_intr_handle;
};
This structure is the basic abstraction of a PCI device. While we are at it, also take a quick look at the generic abstraction provided by rte_device:
struct rte_device {
    TAILQ_ENTRY(rte_device) next;      /**< Next device */
    const char *name;                  /**< Device name */
    const struct rte_driver *driver;   /**< Driver assigned after probing */
    const struct rte_bus *bus;         /**< Bus handle assigned on scan */
    int numa_node;                     /**< NUMA node connection */
    struct rte_devargs *devargs;       /**< Arguments for latest probing */
};
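Back in pci_scan_one: the id and max_vfs fields are filled by reading sysfs attributes under the device directory. Roughly (a condensed sketch with error handling stripped; filename and tmp are locals of the real function):

    dev = malloc(sizeof(*dev));
    memset(dev, 0, sizeof(*dev));
    dev->addr = *addr;

    /* vendor id from <sysfs path>/<pci addr>/vendor */
    snprintf(filename, sizeof(filename), "%s/vendor", dirname);
    eal_parse_sysfs_value(filename, &tmp);
    dev->id.vendor_id = (uint16_t)tmp;

    /* device id from <sysfs path>/<pci addr>/device */
    snprintf(filename, sizeof(filename), "%s/device", dirname);
    eal_parse_sysfs_value(filename, &tmp);
    dev->id.device_id = (uint16_t)tmp;

    /* max_vfs (igb_uio) or sriov_numvfs (newer kernels) when SR-IOV is enabled */
    snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
    if (!access(filename, F_OK) &&
            eal_parse_sysfs_value(filename, &tmp) == 0)
        dev->max_vfs = (uint16_t)tmp;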
pci_name_set initializes the device names:
static void
pci_name_set(struct rte_pci_device *dev)
{
    struct rte_devargs *devargs;

    /* Each device has its internal, canonical name set. */
    rte_pci_device_name(&dev->addr,
            dev->name, sizeof(dev->name));
    devargs = pci_devargs_lookup(dev);
    dev->device.devargs = devargs;

    /* In blacklist mode, if the device is not blacklisted, no
     * rte_devargs exists for it.
     */
    if (devargs != NULL)
        /* If an rte_devargs exists, the generic rte_device uses the
         * given name as its name.
         */
        dev->device.name = dev->device.devargs->name;
    else
        /* Otherwise, it uses the internal, canonical form. */
        dev->device.name = dev->name;
}
Next, the device's resource file and the driver it is bound to are parsed. The flow is simple; the relevant sysfs contents are listed below:
resource:
[root@localhost e1000]# cat /sys/bus/pci/devices/0000\:02\:01.0/resource
0x00000000fd5a0000 0x00000000fd5bffff 0x0000000000140204   (phys_start, phys_end, flags)
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x00000000fdff0000 0x00000000fdffffff 0x0000000000140204
...
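Each line of the resource file holds (phys_start, phys_end, flags) for one BAR; hedging slightly on the exact source, the parsing loop in pci_parse_sysfs_resource amounts to:

    /* one line per BAR, up to PCI_MAX_RESOURCE of them */
    for (i = 0; i < PCI_MAX_RESOURCE; i++) {
        if (fgets(buf, sizeof(buf), f) == NULL)
            goto error;

        /* "<phys_start> <phys_end> <flags>", all hex numbers */
        if (pci_parse_one_sysfs_resource(buf, sizeof(buf),
                &phys_addr, &end_addr, &flags) < 0)
            goto error;

        /* only memory BARs are recorded; they get mmap'ed at probe time */
        if (flags & IORESOURCE_MEM) {
            dev->mem_resource[i].phys_addr = phys_addr;
            dev->mem_resource[i].len = end_addr - phys_addr + 1;
            dev->mem_resource[i].addr = NULL; /* not mapped yet */
        }
    }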
driver:
[root@localhost e1000]# ll /sys/bus/pci/devices/0000\:02\:01.0/driver
/sys/bus/pci/devices/0000:02:01.0/driver -> ../../../../bus/pci/drivers/e1000
The listing above shows the driver symlink of an e1000 PCI device in my VM. A port taken over by DPDK is bound to one of the following three drivers:
    if (!ret) {
        if (!strcmp(driver, "vfio-pci"))
            dev->kdrv = RTE_KDRV_VFIO;
        else if (!strcmp(driver, "igb_uio"))
            dev->kdrv = RTE_KDRV_IGB_UIO;
        else if (!strcmp(driver, "uio_pci_generic"))
            dev->kdrv = RTE_KDRV_UIO_GENERIC;
        else
            dev->kdrv = RTE_KDRV_UNKNOWN;
    } else
        dev->kdrv = RTE_KDRV_NONE;
Once the basic information of the rte_pci_device has been collected, the structure is inserted into the PCI bus's device list (rte_pci_bus.device_list).
At this point in the bus scan stage, the parts of an rte_pci_device that have been filled in are marked below:
struct rte_pci_device {
    TAILQ_ENTRY(rte_pci_device) next;    /* linked into the PCI bus device list */
    struct rte_device device;            /* name/devargs set by pci_name_set */
    struct rte_pci_addr addr;            /* parsed from the sysfs directory name */
    struct rte_pci_id id;                /* vendor/device ids read from sysfs */
    struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
                                         /* parsed from the "resource" file */
    struct rte_intr_handle intr_handle;  /**< Interrupt handle */
    struct rte_pci_driver *driver;       /**< PCI driver used in probing */
    uint16_t max_vfs;                    /* read from max_vfs/sriov_numvfs */
    enum rte_kernel_driver kdrv;         /* derived from the bound kernel driver */
    char name[PCI_PRI_STR_SIZE+1];       /* set by pci_name_set */
    struct rte_intr_handle vfio_req_intr_handle;
};
The unannotated fields (intr_handle, driver, vfio_req_intr_handle) are still untouched here; they are filled in later, during probing.
2.5 rte_bus_get_iommu_class

If no IOMMU (IOVA) mode was specified, rte_bus_get_iommu_class is used to detect one; it delegates to each bus's get_iommu_class. According to the code, if KNI is in use the IOVA mode is forced to RTE_IOVA_PA. For the PCI bus, get_iommu_class maps to rte_pci_get_iommu_class.
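A sketch of rte_bus_get_iommu_class (close to the source of the release this series follows; treat the details as approximate):

enum rte_iova_mode
rte_bus_get_iommu_class(void)
{
    int mode = RTE_IOVA_DC;
    struct rte_bus *bus;

    /* let every bus vote on the IOVA mode it can support */
    TAILQ_FOREACH(bus, &rte_bus_list, next) {
        if (bus->get_iommu_class)
            mode |= bus->get_iommu_class();
    }

    /* anything other than a unanimous RTE_IOVA_VA falls back to PA */
    if (mode != RTE_IOVA_VA)
        mode = RTE_IOVA_PA;

    return mode;
}

Each bus that implements get_iommu_class casts a vote; unless the combined result is exactly RTE_IOVA_VA, the default RTE_IOVA_PA is used.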