本文主要以ixgbe设备为例,说明向dpdk添加一个ixgbe设备的大致过程。
1、使用dpdk的程序(如ovs)调用rte_dev_probe向dpdk注册一个设备,rte_dev_probe的核心处理函数为local_dev_probe,这个函数主要包含了设备总线的匹配、pci设备的bar空间映射,以及最终为设备添加ixgbe驱动。下面看一下这个函数:
/*
 * Probe a device described by a devargs string and hand it to its bus.
 *
 * devargs: textual device description, parsed by rte_devargs_parse().
 * new_dev: set to NULL on entry; on return after a plug attempt that either
 *          succeeded or left the device probed, set to the matching device.
 *
 * Returns -ENOMEM if the devargs record cannot be allocated, -ENOTSUP if the
 * bus has no plug callback, -ENODEV if the bus cannot find the device after
 * scanning, otherwise the value returned by the failing helper or by the
 * bus plug callback.
 */
int
local_dev_probe(const char *devargs, struct rte_device **new_dev)
{
	struct rte_devargs *dargs;
	struct rte_device *found;
	int ret;

	*new_dev = NULL;

	dargs = calloc(1, sizeof *dargs);
	if (!dargs)
		return -ENOMEM;

	/* Parse the string; this identifies the bus of the device (PCI bus). */
	ret = rte_devargs_parse(dargs, devargs);
	if (ret != 0)
		goto fail_devargs;

	if (!dargs->bus->plug) {
		RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
			dargs->bus->name);
		ret = -ENOTSUP;
		goto fail_devargs;
	}

	ret = rte_devargs_insert(&dargs);
	if (ret != 0)
		goto fail_devargs;

	/*
	 * The rte_devargs will be referenced in the matching rte_device.
	 * Scanning the bus (rte_pci_scan for PCI, per the original note)
	 * adds the device to the bus.
	 */
	ret = dargs->bus->scan();
	if (ret != 0)
		goto fail_devargs;

	found = dargs->bus->find_device(NULL, cmp_dev_name, dargs->name);
	if (!found) {
		RTE_LOG(ERR, EAL, "Cannot find device (%s)\n",
			dargs->name);
		ret = -ENODEV;
		goto fail_devargs;
	}

	/*
	 * A matching device exists, so from here on it is responsible for
	 * the devargs inserted above; they must not be removed manually.
	 *
	 * Plugging maps the device's BAR resources and finds the matching
	 * driver module (per the original note).
	 */
	ret = found->bus->plug(found);
	if (ret == 0 || rte_dev_is_probed(found)) {
		*new_dev = found;
		return ret;
	}

	/* Plug failed and the device has never been probed successfully. */
	RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n",
		found->name);
	return ret;

fail_devargs:
	/* Free by hand only when rte_devargs_remove() reports failure. */
	if (rte_devargs_remove(dargs) != 0) {
		free(dargs->args);
		free(dargs);
	}
	return ret;
}
2、local_dev_probe的plug最终调用pci_plug,然后遍历bus上的所有驱动为设备匹配驱动,匹配驱动的函数为rte_pci_match。从这个函数可以看出,其实就是检查驱动的id_table里的信息是否能匹配上设备的pci信息。以ixgbe为例,这里的id_table是一开始就定义好的,然后存放在struct rte_pci_driver rte_ixgbe_pmd里,最终通过RTE_PMD_REGISTER_PCI将ixgbe_pmd驱动注册到pci总线上。
/*
 * Check whether any entry of a PCI driver's id_table matches a PCI device.
 *
 * An entry matches when each of its vendor, device, subsystem vendor,
 * subsystem device and class fields either equals the device's value or is
 * the corresponding wildcard (PCI_ANY_ID, or RTE_CLASS_ANY_ID for class_id).
 *
 * Returns 1 on the first matching entry, 0 if the table has none.
 */
int
rte_pci_match(const struct rte_pci_driver *pci_drv,
	      const struct rte_pci_device *pci_dev)
{
	const struct rte_pci_id *entry = pci_drv->id_table;

	/* The table is terminated by an entry whose vendor_id is 0. */
	while (entry->vendor_id != 0) {
		if ((entry->vendor_id == pci_dev->id.vendor_id ||
		     entry->vendor_id == PCI_ANY_ID) &&
		    (entry->device_id == pci_dev->id.device_id ||
		     entry->device_id == PCI_ANY_ID) &&
		    (entry->subsystem_vendor_id ==
		     pci_dev->id.subsystem_vendor_id ||
		     entry->subsystem_vendor_id == PCI_ANY_ID) &&
		    (entry->subsystem_device_id ==
		     pci_dev->id.subsystem_device_id ||
		     entry->subsystem_device_id == PCI_ANY_ID) &&
		    (entry->class_id == pci_dev->id.class_id ||
		     entry->class_id == RTE_CLASS_ANY_ID))
			return 1;

		entry++;
	}

	return 0;
}
3、为设备找到驱动后,接下来比较重要的一步是为设备映射资源信息。如果使用vfio驱动,则调用pci_vfio_map_resource。这个函数一开始先通过rte_vfio_setup_device为设备分配vfio_container_id、vfio_group_id,同时设置iommu_type,然后调用dma_map_func将rte_eal_get_configuration()->mem_config的内存信息进行dma映射,这里的mem_config表示dpdk管理的内存信息(从这里看,dpdk应该是一开始会将所有内存都进行dma映射?后面驱动的rx_ring、tx_ring分配dma地址的时候,貌似也没有进一步的dma映射,而是直接使用这里分配好的iova地址)。完成dma映射后,通过VFIO_DEVICE_GET_INFO获取设备的信息(主要是pci的bar个数信息及中断信息)。
获取到pci的bar个数信息后,先通过pci_vfio_get_region_info获取每个bar region的地址偏移及大小信息,然后再通过pci_vfio_mmap_bar将其映射到用户空间,并将映射后的bar地址信息存放在rte_pci_device->mem_resource。
4、接下来主要是调用驱动的probe函数,初始化设备信息。以ixgbe为例,最终调用eth_ixgbe_pci_probe,该probe函数主要是调用rte_eth_dev_create:
/*
 * Create (primary process) or attach to (secondary process) an ethdev for
 * the given device and run its initialisation callbacks.
 *
 * device:                   underlying rte_device; stored in ethdev->device.
 * name:                     ethdev name used for allocation / attach.
 * priv_data_size:           size of the driver private data; when non-zero
 *                           the primary process allocates it with
 *                           rte_zmalloc_socket() on device->numa_node.
 * ethdev_bus_specific_init: optional bus-level init callback (may be NULL).
 * bus_init_params:          argument passed to ethdev_bus_specific_init.
 * ethdev_init:              mandatory driver init callback; the
 *                           RTE_FUNC_PTR_OR_ERR_RET check presumably returns
 *                           -EINVAL when it is NULL.
 * init_params:              argument passed to ethdev_init.
 *
 * Returns 0 (the value of the last callback) on success, -ENODEV if the
 * ethdev cannot be allocated or attached, -ENOMEM if the private data
 * cannot be allocated, or the non-zero value of a failing callback. On a
 * failure after the ethdev was obtained, the port is released.
 *
 * Every error message ends with a newline so that it is emitted as a
 * complete log line.
 */
int __rte_experimental
rte_eth_dev_create(struct rte_device *device, const char *name,
	size_t priv_data_size,
	ethdev_bus_specific_init ethdev_bus_specific_init,
	void *bus_init_params,
	ethdev_init_t ethdev_init, void *init_params)
{
	struct rte_eth_dev *ethdev;
	int retval;

	RTE_FUNC_PTR_OR_ERR_RET(*ethdev_init, -EINVAL);

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* Allocate a new ethdev structure. */
		ethdev = rte_eth_dev_allocate(name);
		if (!ethdev)
			return -ENODEV;

		if (priv_data_size) {
			ethdev->data->dev_private = rte_zmalloc_socket(
				name, priv_data_size, RTE_CACHE_LINE_SIZE,
				device->numa_node);

			if (!ethdev->data->dev_private) {
				RTE_LOG(ERR, EAL,
					"failed to allocate private data\n");
				retval = -ENOMEM;
				goto probe_failed;
			}
		}
	} else {
		/* Secondary processes attach to an ethdev that already exists. */
		ethdev = rte_eth_dev_attach_secondary(name);
		if (!ethdev) {
			RTE_LOG(ERR, EAL, "secondary process attach failed, "
				"ethdev doesn't exist\n");
			return -ENODEV;
		}
	}

	ethdev->device = device;

	if (ethdev_bus_specific_init) {
		/* Bus-level setup, e.g. the device's numa_node information. */
		retval = ethdev_bus_specific_init(ethdev, bus_init_params);
		if (retval) {
			RTE_LOG(ERR, EAL,
				"ethdev bus specific initialisation failed\n");
			goto probe_failed;
		}
	}

	/*
	 * Driver-level hardware initialisation. For ixgbe (per the original
	 * note) this sets up the rx/tx functions, the MAC address and the
	 * device's PCI information, and turns off interrupt mode. It also
	 * copies the previously mapped BAR0 address into ixgbe_hw->hw_addr,
	 * which the driver later uses to access the device registers.
	 */
	retval = ethdev_init(ethdev, init_params);
	if (retval) {
		RTE_LOG(ERR, EAL, "ethdev initialisation failed\n");
		goto probe_failed;
	}

	rte_eth_dev_probing_finish(ethdev);

	return retval;

probe_failed:
	rte_eth_dev_release_port(ethdev);
	return retval;
}