DPDK学习(rte_eal_init)
DPDK学习(eal_thread_loop)
/**
 * Generic bus descriptor: a list node carrying the bus name, the bus
 * callbacks (scan, probe, find_device, plug, unplug, parse) and the bus
 * configuration.
 */
struct rte_bus {
TAILQ_ENTRY(rte_bus) next; /**< Next bus object in linked list */
const char *name; /**< Name of the bus */
rte_bus_scan_t scan; /**< Scan for devices attached to bus */
rte_bus_probe_t probe; /**< Probe devices on bus */
rte_bus_find_device_t find_device; /**< Find a device on the bus */
rte_bus_plug_t plug; /**< Probe single device for drivers */
rte_bus_unplug_t unplug; /**< Remove single device from driver */
rte_bus_parse_t parse; /**< Parse a device name */
struct rte_bus_conf conf; /**< Bus configuration */
};
/* Declare the list-head type "struct rte_bus_list" for struct rte_bus entries. */
TAILQ_HEAD(rte_bus_list, rte_bus);
/* Expansion of TAILQ_HEAD, shown for reference: the head stores a pointer to
 * the first element and the address of the last element's next pointer. */
#define TAILQ_HEAD(name, type) \
struct name { \
struct type *tqh_first; /* first element */ \
struct type **tqh_last; /* addr of last next element */ \
}
/* Define rte_bus_list, the list onto which rte_bus_register() appends buses. */
struct rte_bus_list rte_bus_list =
TAILQ_HEAD_INITIALIZER(rte_bus_list);
将rte_pci_bus插入rte_bus_list链表
/**
 * PCI bus object: embeds the generic struct rte_bus and adds the heads of the
 * PCI device list and the PCI driver list.
 */
struct rte_pci_bus {
struct rte_bus bus; /**< Inherit the generic class */
struct rte_pci_device_list device_list; /**< List of PCI devices */
struct rte_pci_driver_list driver_list; /**< List of PCI drivers */
};
/* Define rte_pci_bus: wire the PCI implementations into the generic bus
 * callbacks and start with empty device and driver lists. The bus name is not
 * set here; RTE_REGISTER_BUS assigns it. */
struct rte_pci_bus rte_pci_bus = {
.bus = {
.scan = rte_pci_scan,
.probe = rte_pci_probe,
.find_device = pci_find_device,
.plug = pci_plug,
.unplug = pci_unplug,
.parse = pci_parse,
},
.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};
/* Register the embedded generic bus under the name "pci". */
RTE_REGISTER_BUS(pci, rte_pci_bus.bus);
/*
 * Register a bus under the name `nm`.
 *
 * Expands to an init function businitfn_<nm>() that stores the stringified
 * name in (bus).name and passes the bus to rte_bus_register(). Per the
 * original annotation, RTE_INIT_PRIO declares that function as a GCC
 * constructor, so it runs before main().
 *
 * Note: the backslash must be the last character on every continued line, so
 * explanatory comments live here instead of after a line continuation.
 */
#define RTE_REGISTER_BUS(nm, bus) \
RTE_INIT_PRIO(businitfn_ ##nm, 101); \
static void businitfn_ ##nm(void) \
{\
	(bus).name = RTE_STR(nm);\
	rte_bus_register(&bus); \
}
/*
 * Register a bus by appending it to the global rte_bus_list.
 *
 * Before insertion the bus is checked with RTE_VERIFY: the pointer must be
 * non-NULL, the name must be non-empty, and the scan, probe and find_device
 * callbacks must be set. A bus that provides plug must also provide unplug.
 */
void
rte_bus_register(struct rte_bus *bus)
{
RTE_VERIFY(bus);
RTE_VERIFY(bus->name && strlen(bus->name));
/* A bus should mandatorily have the scan implemented */
RTE_VERIFY(bus->scan);
RTE_VERIFY(bus->probe);
RTE_VERIFY(bus->find_device);
/* Buses supporting driver plug also require unplug. */
RTE_VERIFY(!bus->plug || bus->unplug);
/* Append the bus (e.g. rte_pci_bus.bus) to the tail of rte_bus_list */
TAILQ_INSERT_TAIL(&rte_bus_list, bus, next);
RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name);
}
将rte_ixgbe_pmd插入rte_pci_bus.driver_list链表
/**
 * PCI driver descriptor: a list node embedding the core struct rte_driver,
 * plus the owning PCI bus, the probe/remove callbacks, the table of supported
 * PCI IDs and the driver flags.
 */
struct rte_pci_driver {
TAILQ_ENTRY(rte_pci_driver) next; /**< Next in list. */
struct rte_driver driver; /**< Inherit core driver. */
struct rte_pci_bus *bus; /**< PCI bus reference. */
pci_probe_t *probe; /**< Device Probe function. */
pci_remove_t *remove; /**< Device Remove function. */
const struct rte_pci_id *id_table; /**< ID table, NULL terminated. */
uint32_t drv_flags; /**< Flags controlling handling of device. */
};
/* Define rte_ixgbe_pmd, the PCI driver object of the ixgbe PMD: its PCI ID
 * table, driver flags and probe/remove callbacks. The driver name and bus
 * pointer are not set here; they are filled in during registration. */
static struct rte_pci_driver rte_ixgbe_pmd = {
.id_table = pci_id_ixgbe_map,
.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC,
.probe = eth_ixgbe_pci_probe,
.remove = eth_ixgbe_pci_remove,
};
/* Register the driver under the name "net_ixgbe". */
RTE_PMD_REGISTER_PCI(net_ixgbe, rte_ixgbe_pmd);
/*
 * Register a PCI driver under the name `nm`.
 *
 * Expands to an init function pciinitfn_<nm>() that stores the stringified
 * name in (pci_drv).driver.name and passes the driver to rte_pci_register(),
 * followed by RTE_PMD_EXPORT_NAME(nm, __COUNTER__). Per the original
 * annotation, RTE_INIT declares the init function as a GCC constructor, so it
 * runs before main().
 *
 * Note: the backslash must be the last character on every continued line, so
 * explanatory comments live here instead of after a line continuation.
 */
#define RTE_PMD_REGISTER_PCI(nm, pci_drv) \
RTE_INIT(pciinitfn_ ##nm); \
static void pciinitfn_ ##nm(void) \
{\
	(pci_drv).driver.name = RTE_STR(nm);\
	rte_pci_register(&pci_drv); \
} \
RTE_PMD_EXPORT_NAME(nm, __COUNTER__)
void
rte_pci_register(struct rte_pci_driver *driver)
{
/* 将rte_ixgbe_pmd插入rte_pci_bus.driver_list链表 */
TAILQ_INSERT_TAIL(&rte_pci_bus.driver_list, driver, next);
driver->bus = &rte_pci_bus;
}
/**
 * Global EAL configuration: the master lcore id, lcore and service-lcore
 * counts, the per-lcore role table, the process type and a pointer to the
 * memory configuration. The structure is declared packed.
 */
struct rte_config {
uint32_t master_lcore; /**< Id of the master lcore */
uint32_t lcore_count; /**< Number of available logical cores. */
uint32_t service_lcore_count;/**< Number of available service cores. */
enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */
/** Primary or secondary configuration */
enum rte_proc_type_t process_type;
/**
* Pointer to memory configuration, which may be shared across multiple
* DPDK instances
*/
struct rte_mem_config *mem_config;
} __attribute__((__packed__));
/* Define the file-local rte_config instance; mem_config initially points at
 * early_mem_config. */
static struct rte_config rte_config = {
.mem_config = &early_mem_config,
};
/**
 * Per-lcore state: detection flag, thread id, the two pipes used to talk to
 * the master, the function/argument/return value of the job being run, the
 * lcore state, its socket/core placement, its cpuset and its role.
 */
struct lcore_config {
unsigned detected; /**< true if lcore was detected */
pthread_t thread_id; /**< pthread identifier */
int pipe_master2slave[2]; /**< communication pipe with master */
int pipe_slave2master[2]; /**< communication pipe with master */
lcore_function_t * volatile f; /**< function to call */
void * volatile arg; /**< argument of function */
volatile int ret; /**< return value of function */
volatile enum rte_lcore_state_t state; /**< lcore state */
unsigned socket_id; /**< physical socket id for this lcore */
unsigned core_id; /**< core number on socket for this lcore */
int core_index; /**< relative index, starting from 0 */
rte_cpuset_t cpuset; /**< cpu set which the lcore affinity to */
uint8_t core_role; /**< role of core eg: OFF, RTE, SERVICE */
};
/* Define the lcore_config array: one entry per possible lcore (RTE_MAX_LCORE). */
struct lcore_config lcore_config[RTE_MAX_LCORE];
MASTER lcore的主循环函数
/* Launch threads, called at application init().
 *
 * Abridged excerpt: "..." marks code elided from the original function.
 * Visible steps, in order: detect lcores, parse the command line, initialise
 * memory, set up the master lcore, scan the buses, create one thread per
 * slave lcore and wait for them, then probe the buses.
 * On the failure paths shown, rte_errno is set and -1 is returned. */
int
rte_eal_init(int argc, char **argv)
{
...
/* rte_eal_cpu_init() ->
* eal_cpu_core_id()
* eal_cpu_socket_id()
* Reads /sys/devices/system/[cpu|node]
* Sets lcore_config->[core_role|core_id|socket_id] */
if (rte_eal_cpu_init() < 0) {
rte_eal_init_alert("Cannot detect lcores.");
rte_errno = ENOTSUP;
return -1;
}
/* eal_parse_args() ->
* eal_parse_common_option() ->
* eal_parse_coremask()
* eal_parse_master_lcore()
* eal_parse_lcores()
* eal_adjust_config()
* Parses the -c, --master_lcore and --lcores options
* eal_parse_lcores() determines the usable logical CPUs
* eal_adjust_config() sets rte_config.master_lcore to 0 (the first lcore becomes the MASTER lcore) */
fctret = eal_parse_args(argc, argv);
if (fctret < 0) {
rte_eal_init_alert("Invalid 'command line' arguments.");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
}
...
/* Initialise hugepage information */
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
rte_errno = ENOMEM;
return -1;
}
...
/* eal_thread_init_master() ->
* eal_thread_set_affinity()
* Marks the current thread as the MASTER lcore
* eal_thread_set_affinity() binds the MASTER lcore to its logical CPU */
eal_thread_init_master(rte_config.master_lcore);
...
/* rte_bus_scan() ->
* rte_pci_scan() ->
* pci_scan_one() ->
* pci_parse_sysfs_resource()
* rte_pci_add_device()
* Walks rte_bus_list and calls each bus's scan callback; for pci this is rte_pci_scan()
* Walks /sys/bus/pci/devices and allocates a struct rte_pci_device for every DBSF
* Reads and parses each DBSF's resource file line by line into dev->mem_resource[i]
* Inserts dev into the rte_pci_bus.device_list list */
if (rte_bus_scan()) {
rte_eal_init_alert("Cannot scan the buses for devices\n");
rte_errno = ENODEV;
return -1;
}
/* pthread_create() ->
* eal_thread_loop() ->
* eal_thread_set_affinity()
* Creates one thread per SLAVE lcore; the thread function is eal_thread_loop()
* eal_thread_set_affinity() binds the SLAVE lcore to its logical CPU */
RTE_LCORE_FOREACH_SLAVE(i) {
/*
* create communication pipes between master thread
* and children
*/
/* The MASTER lcore creates the pipes used for MASTER <-> SLAVE lcore communication (parent/child threads) */
if (pipe(lcore_config[i].pipe_master2slave) < 0)
rte_panic("Cannot create pipe\n");
if (pipe(lcore_config[i].pipe_slave2master) < 0)
rte_panic("Cannot create pipe\n");
lcore_config[i].state = WAIT; /* put the SLAVE lcore into the WAIT state */
/* create a thread for each lcore */
ret = pthread_create(&lcore_config[i].thread_id, NULL,
eal_thread_loop, NULL);
...
}
/*
* Launch a dummy function on all slave lcores, so that master lcore
* knows they are all ready when this function returns.
*/
rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
rte_eal_mp_wait_lcore();
...
/* Probe all the buses and devices/drivers on them */
/* rte_bus_probe() ->
* rte_pci_probe() ->
* pci_probe_all_drivers() ->
* rte_pci_probe_one_driver() ->
* rte_pci_match()
* rte_pci_map_device() ->
* pci_uio_map_resource()
* eth_ixgbe_pci_probe()
* Walks rte_bus_list and calls each bus's probe callback; for pci this is rte_pci_probe()
* rte_pci_probe()/pci_probe_all_drivers() walk rte_pci_bus.device_list/driver_list respectively to match devices with drivers
* Maps the BARs and calls the driver's probe callback; for ixgbe this is eth_ixgbe_pci_probe() */
if (rte_bus_probe()) {
rte_eal_init_alert("Cannot probe devices\n");
rte_errno = ENOTSUP;
return -1;
}
...
}
/**
 * A PCI device: a list node embedding the core struct rte_device, plus the
 * device's PCI address and ID, its memory resources, interrupt handle,
 * associated driver, VF count, kernel driver kind and textual PCI location.
 */
struct rte_pci_device {
	TAILQ_ENTRY(rte_pci_device) next;   /**< Next probed PCI device. */
	struct rte_device device;           /**< Inherit core device */
	/* DBSF */
	struct rte_pci_addr addr;           /**< PCI location. */
	struct rte_pci_id id;               /**< PCI ID. */
	struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
					    /**< PCI Memory Resource */
	struct rte_intr_handle intr_handle; /**< Interrupt handle */
	struct rte_pci_driver *driver;      /**< Associated driver */
	uint16_t max_vfs;                   /**< sriov enable if not zero */
	enum rte_kernel_driver kdrv;        /**< Kernel driver passthrough */
	char name[PCI_PRI_STR_SIZE+1];      /**< PCI location (ASCII) */
};
/**
 * One memory resource of a device: its bus (physical) address, its length and
 * the virtual address it is mapped at, if any.
 */
struct rte_mem_resource {
/* Bus address */
uint64_t phys_addr; /**< Physical address, 0 if not resource. */
uint64_t len; /**< Length of the resource. */
/* Virtual address */
void *addr; /**< Virtual address, NULL when not mapped. */
};
/**
 * Record of the mappings made for one PCI device: a list node holding the
 * device's PCI address, a path, the number of mappings and the per-mapping
 * details.
 */
struct mapped_pci_resource {
TAILQ_ENTRY(mapped_pci_resource) next;
/* DBSF */
struct rte_pci_addr pci_addr;
char path[PATH_MAX];
int nb_maps;
struct pci_map maps[PCI_MAX_RESOURCE];
};
/* Declare the list-head type for mapped_pci_resource entries. */
TAILQ_HEAD(mapped_pci_res_list, mapped_pci_resource);
/* [root@localhost ~]# cat /sys/bus/pci/devices/0000:07:00.0/resource
* 0x00000000df800000 0x00000000df9fffff 0x000000000014220c
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x00000000dfa04000 0x00000000dfa07fff 0x000000000014220c
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x00000000dfc80000 0x00000000dfcfffff 0x000000000004e200
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 0x0000000000000000 0x0000000000000000 0x0000000000000000
* 每列分别表示start、end、flag */
/*
 * Parse a sysfs PCI "resource" file into dev->mem_resource[].
 *
 * Reads one line per resource slot, up to PCI_MAX_RESOURCE lines. Each line
 * is split into start address, end address and flags by
 * pci_parse_one_sysfs_resource(). Slots whose flags contain IORESOURCE_MEM
 * get their bus address and length recorded, with the virtual address left
 * NULL (not mapped yet). Other slots are left untouched.
 *
 * Returns 0 on success, -1 if the file cannot be opened, a line cannot be
 * read, or a line cannot be parsed.
 */
static int
pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev)
{
	char line[BUFSIZ];
	uint64_t start, end, flags;
	int status = -1;
	int slot;
	FILE *fp;

	/* e.g. /sys/bus/pci/devices/<DBSF>/resource */
	fp = fopen(filename, "r");
	if (fp == NULL) {
		RTE_LOG(ERR, EAL, "Cannot open sysfs resource\n");
		return -1;
	}

	for (slot = 0; slot < PCI_MAX_RESOURCE; slot++) {
		/* one line per resource slot */
		if (fgets(line, sizeof(line), fp) == NULL) {
			RTE_LOG(ERR, EAL,
				"%s(): cannot read resource\n", __func__);
			goto out;
		}

		if (pci_parse_one_sysfs_resource(line, sizeof(line), &start,
				&end, &flags) < 0)
			goto out;

		/* only memory-space resources are recorded */
		if (!(flags & IORESOURCE_MEM))
			continue;

		dev->mem_resource[slot].phys_addr = start;      /* bus address */
		dev->mem_resource[slot].len = end - start + 1;
		dev->mem_resource[slot].addr = NULL;            /* not mapped for now */
	}
	status = 0;

out:
	fclose(fp);
	return status;
}
/*
 * Parse one line of a sysfs PCI "resource" file.
 *
 * The line is split in place on spaces into exactly three fields, which are
 * converted from hexadecimal text into *phys_addr, *end_addr and *flags.
 *
 * Returns 0 on success, -1 if the line does not split into three fields or a
 * conversion sets errno. In the latter case the outputs have already been
 * written.
 */
int
pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr,
	uint64_t *end_addr, uint64_t *flags)
{
	/* The same three pointers viewed either by name or as an array. */
	union pci_resource_info {
		struct {
			char *phys_addr;
			char *end_addr;
			char *flags;
		};
		char *ptrs[PCI_RESOURCE_FMT_NVAL];
	} fields;
	int nb_fields;

	nb_fields = rte_strsplit(line, len, fields.ptrs, 3, ' ');
	if (nb_fields != 3) {
		RTE_LOG(ERR, EAL,
			"%s(): bad resource format\n", __func__);
		return -1;
	}

	/* Clear errno so a conversion failure can be detected afterwards. */
	errno = 0;
	*phys_addr = strtoull(fields.phys_addr, NULL, 16);
	*end_addr = strtoull(fields.end_addr, NULL, 16);
	*flags = strtoull(fields.flags, NULL, 16);
	if (errno == 0)
		return 0;

	RTE_LOG(ERR, EAL,
		"%s(): bad resource format\n", __func__);
	return -1;
}
/*
 * Map every non-empty memory resource (BAR) of a PCI device.
 *
 * The interrupt handle is first reset to "unknown". Processes that are not
 * primary delegate to pci_uio_map_secondary(). Otherwise a
 * mapped_pci_resource record is allocated, each resource with a non-zero bus
 * address is mapped via pci_uio_map_resource_by_index(), and the record is
 * appended to the UIO resource list.
 *
 * Returns 0 on success, the allocator's return value if allocation fails,
 * or -1 if a mapping fails. In that case the mappings made so far are
 * unmapped and the record is released.
 */
int
pci_uio_map_resource(struct rte_pci_device *dev)
{
	struct mapped_pci_res_list *res_list =
		RTE_TAILQ_CAST(rte_uio_tailq.head, mapped_pci_res_list);
	struct mapped_pci_resource *res = NULL;
	int bar, k, rc;
	int nb_mapped = 0;

	dev->intr_handle.fd = -1;
	dev->intr_handle.uio_cfg_fd = -1;
	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;

	/* secondary processes reuse the details already recorded */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return pci_uio_map_secondary(dev);

	/* allocate the record describing this device's mappings */
	rc = pci_uio_alloc_resource(dev, &res);
	if (rc != 0)
		return rc;

	/* walk all resource slots, mapping the populated ones */
	for (bar = 0; bar < PCI_MAX_RESOURCE; bar++) {
		/* a zero bus address marks an empty slot */
		if (dev->mem_resource[bar].phys_addr == 0)
			continue;

		rc = pci_uio_map_resource_by_index(dev, bar, res, nb_mapped);
		if (rc != 0)
			goto unwind;

		nb_mapped++;
	}

	res->nb_maps = nb_mapped;                 /* number of mapped BARs */
	TAILQ_INSERT_TAIL(res_list, res, next);   /* publish the record */
	return 0;

unwind:
	/* undo the mappings that succeeded before the failure */
	for (k = 0; k < nb_mapped; k++) {
		pci_unmap_resource(res->maps[k].addr,
				(size_t)res->maps[k].size);
		rte_free(res->maps[k].path);
	}
	pci_uio_free_resource(dev, res);
	return -1;
}
/*
 * Map one memory resource (BAR) of a PCI device through its sysfs
 * "resourceN" file.
 *
 * dev:     device whose resource res_idx is to be mapped
 * res_idx: index into dev->mem_resource[] and N in the "resourceN" file name
 * uio_res: mapping record whose maps[map_idx] entry is filled in
 * map_idx: slot in uio_res->maps[] to use
 *
 * On success the mapping details (bus address, size, virtual address, offset,
 * path) are stored in uio_res->maps[map_idx], the virtual address is stored in
 * dev->mem_resource[res_idx].addr, and the global pci_map_addr is advanced
 * past the new mapping. Returns 0 on success, -1 on failure. On the failure
 * paths after allocation, the path buffer is freed.
 */
int
pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
struct mapped_pci_resource *uio_res, int map_idx)
{
int fd;
char devname[PATH_MAX];
void *mapaddr;
struct rte_pci_addr *loc;
struct pci_map *maps;
loc = &dev->addr;
maps = uio_res->maps;
/* update devname for mmap */
snprintf(devname, sizeof(devname),
"%s/" PCI_PRI_FMT "/resource%d",
pci_get_sysfs_path(),
loc->domain, loc->bus, loc->devid,
loc->function, res_idx);
/* allocate memory to keep path */
maps[map_idx].path = rte_malloc(NULL, strlen(devname) + 1, 0);
if (maps[map_idx].path == NULL) {
RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n",
strerror(errno));
return -1;
}
/*
* open resource file, to mmap it
*/
fd = open(devname, O_RDWR); /* e.g. opens /sys/bus/pci/devices/0000:07:00.0/resource0 */
if (fd < 0) {
RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
devname, strerror(errno));
goto error;
}
/* try mapping somewhere close to the end of hugepages */
if (pci_map_addr == NULL)
pci_map_addr = pci_find_max_end_va();
mapaddr = pci_map_resource(pci_map_addr, fd, 0,
(size_t)dev->mem_resource[res_idx].len, 0); /* map the BAR */
/* the descriptor is closed right after the mapping attempt */
close(fd);
if (mapaddr == MAP_FAILED)
goto error;
/* the next mapping is requested right after this one */
pci_map_addr = RTE_PTR_ADD(mapaddr,
(size_t)dev->mem_resource[res_idx].len);
maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr; /* bus address */
maps[map_idx].size = dev->mem_resource[res_idx].len;
maps[map_idx].addr = mapaddr; /* virtual address returned by mmap() */
maps[map_idx].offset = 0;
/* path was allocated with strlen(devname) + 1 bytes, so the copy fits */
strcpy(maps[map_idx].path, devname);
dev->mem_resource[res_idx].addr = mapaddr; /* virtual address returned by mmap() */
return 0;
error:
rte_free(maps[map_idx].path);
return -1;
}
/*
 * Map a PCI memory resource into the process address space.
 *
 * Calls mmap() to map `size` bytes of `fd`, starting at `offset`, readable
 * and writable, MAP_SHARED plus `additional_flags`, with `requested_addr` as
 * the address hint. In other words, it maps the bus-address range behind the
 * file to a virtual address.
 *
 * Returns the mapped address, or MAP_FAILED (after logging) on failure.
 */
void *
pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
		 int additional_flags)
{
	const int prot = PROT_READ | PROT_WRITE;
	const int flags = MAP_SHARED | additional_flags;
	void *va = mmap(requested_addr, size, prot, flags, fd, offset);

	if (va != MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, " PCI memory mapped at %p\n", va);
		return va;
	}

	RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
		__func__, fd, requested_addr,
		(unsigned long)size, (unsigned long)offset,
		strerror(errno), va);
	return va;
}
/*
 * PCI probe callback of the ixgbe PMD. Delegates to
 * rte_eth_dev_pci_generic_probe() with sizeof(struct ixgbe_adapter) as the
 * private-data size and eth_ixgbe_dev_init() as the init callback, and
 * returns its result. pci_drv is unused.
 */
static int eth_ixgbe_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
struct rte_pci_device *pci_dev)
{
return rte_eth_dev_pci_generic_probe(pci_dev,
sizeof(struct ixgbe_adapter), eth_ixgbe_dev_init);
}
/*
 * Generic PCI probe helper for ethdev drivers (abridged: "..." marks elided
 * code). The visible steps allocate an ethdev for pci_dev with
 * private_data_size bytes of private data, then run the driver's dev_init
 * callback on it.
 */
static inline int
rte_eth_dev_pci_generic_probe(struct rte_pci_device *pci_dev,
size_t private_data_size, eth_dev_pci_callback_t dev_init)
{
...
eth_dev = rte_eth_dev_pci_allocate(pci_dev, private_data_size);
...
ret = dev_init(eth_dev); /* for ixgbe this is eth_ixgbe_dev_init() */
...
}
/*
 * Allocate an ethdev for a PCI device (abridged: "..." marks elided code).
 * The visible steps allocate the ethdev by name, then allocate zeroed,
 * cache-line-aligned private data of private_data_size bytes on the device's
 * NUMA node.
 */
static inline struct rte_eth_dev *
rte_eth_dev_pci_allocate(struct rte_pci_device *dev, size_t private_data_size)
{
...
/* rte_eth_dev_allocate() ->
* rte_eth_dev_find_free_port()
* rte_eth_dev_data_alloc()
* eth_dev_get() */
eth_dev = rte_eth_dev_allocate(name);
...
/* Allocate the private data; for ixgbe this is struct ixgbe_adapter */
eth_dev->data->dev_private = rte_zmalloc_socket(name,
private_data_size, RTE_CACHE_LINE_SIZE,
dev->device.numa_node);
...
}
/*
 * Allocate an ethdev by name (abridged: "..." marks elided code). The visible
 * steps pick a free port id, allocate the shared device data array, and fetch
 * the ethdev for that port.
 */
struct rte_eth_dev *
rte_eth_dev_allocate(const char *name)
{
...
/* Walk the rte_eth_devices array to find a free device */
port_id = rte_eth_dev_find_free_port();
...
/* Allocate the rte_eth_dev_data array */
rte_eth_dev_data_alloc();
...
/* Set the state of the device for port_id to RTE_ETH_DEV_ATTACHED */
eth_dev = eth_dev_get(port_id);
...
}
/*
 * Device init callback of the ixgbe PMD (abridged: "..." marks elided code).
 *
 * The visible steps install the driver's ops table and burst RX/TX handlers
 * on eth_dev, copy the PCI IDs and the mapped resource 0 address into hw,
 * initialise the shared code and the hardware, and copy the permanent MAC
 * address into eth_dev->data->mac_addrs[0].
 */
static int
eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
{
	...
	eth_dev->dev_ops = &ixgbe_eth_dev_ops; /* install the ixgbe_eth_dev_ops function table */
	eth_dev->rx_pkt_burst = &ixgbe_recv_pkts; /* burst receive function */
	eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts; /* burst transmit function */
	eth_dev->tx_pkt_prepare = &ixgbe_prep_pkts;
	...
	hw->device_id = pci_dev->id.device_id; /* device_id */
	hw->vendor_id = pci_dev->id.vendor_id; /* vendor_id */
	hw->hw_addr = (void *)pci_dev->mem_resource[0].addr; /* virtual address of the BAR obtained from mmap() */
	...
	/* ixgbe_init_shared_code() ->
	 * ixgbe_set_mac_type()
	 * ixgbe_init_ops_82599()
	 * ixgbe_set_mac_type() sets hw->mac.type from vendor_id and device_id; for 82599 it is ixgbe_mac_82599EB
	 * hw->mac.ops is then set by the function matching hw->mac.type; for 82599 this is ixgbe_init_ops_82599() */
	diag = ixgbe_init_shared_code(hw);
	...
	/* ixgbe_init_hw() ->
	 * ixgbe_call_func() ->
	 * ixgbe_init_hw_generic() ->
	 * ixgbe_reset_hw_82599() ->
	 * ixgbe_get_mac_addr_generic()
	 * Obtains the NIC's MAC address */
	diag = ixgbe_init_hw(hw);
	...
	ether_addr_copy((struct ether_addr *) hw->mac.perm_addr,
			&eth_dev->data->mac_addrs[0]); /* copy the NIC's MAC address into eth_dev->data->mac_addrs */
	...
}
/* ethdev operations table of the ixgbe PMD (abridged: "..." marks elided
 * entries). It is installed on the device by eth_ixgbe_dev_init(). */
static const struct eth_dev_ops ixgbe_eth_dev_ops = {
.dev_configure = ixgbe_dev_configure,
.dev_start = ixgbe_dev_start,
...
.rx_queue_setup = ixgbe_dev_rx_queue_setup,
...
.tx_queue_setup = ixgbe_dev_tx_queue_setup,
...
};
图片来源于
http://blog.chinaunix.net/uid-20528014-id-314322.html
http://blog.chinaunix.net/uid-20528014-id-315798.html
http://blog.chinaunix.net/uid-20528014-id-315801.html
DPDK使用mmap()将总线地址映射到用户空间虚拟地址
Kernel使用ioremap()将总线地址映射到内核空间虚拟地址