目录
一、概述
二、驱动注册
三、驱动初始化
3.1 rte_eth_dev_create
3.2 eth_i40e_dev_init
3.2.2 i40e_init_adminq_parameter
3.2.3 i40e_init_adminq
3.2.4 i40e_hw_init
3.2.5 i40e_config_automask
3.2.6 i40e_set_default_pctype_table
3.2.7 i40e_filter_input_set_init
3.2.8 config_floating_veb
3.2.9 i40e_clear_pxe_mode
3.2.10 i40e_dev_sync_phy_type
3.2.11 i40e_configure_registers
3.2.12 i40e_get_cap
3.2.13 i40e_configure_registers
3.2.14 i40e_get_cap
3.2.15 i40e_pf_parameter_init
3.2.16 i40e_res_pool_init
3.2.17 i40e_init_lan_hmc
3.2.18 i40e_configure_lan_hmc
3.2.19 i40e_set_fc
3.2.20 i40e_pf_setup
3.2.21 i40e_vsi_config_double_vlan
3.2.22 i40e_dcb_init_configure
3.2.23 i40e_pf_host_init
3.2.24 rte_intr_callback_register
3.2.25 i40e_pf_config_irq0
3.2.26 i40e_pf_enable_irq0
3.2.27 rte_intr_enable
3.2.28 i40e_flex_payload_reg_set_default
3.2.29 i40e_add_tx_flow_control_drop_filter
3.2.30 i40e_aq_set_mac_config
3.2.31 i40e_tm_conf_init
3.2.32 i40e_init_customized_info
3.2.33 i40e_init_ethtype_filter_list
3.2.34 i40e_init_tunnel_filter_list
3.2.35 i40e_init_fdir_filter_list
3.2.36 i40e_init_queue_region_conf
四、总结
五、参考
分析一下驱动程序的初始化,以i40e为例,一些无关紧要的函数就直接列个标题或做简要说明了。
本文主要还是自己的一个备忘。我自己读的时候也有很多含混不清的地方:一方面是自己水平有限,另一方面是公开版的datasheet很多地方说得含糊,甚至相关寄存器都没有直接提供,有的地方只能通过代码实现反推,所以仅供参考。
在DPDK初始化分析(一)中,我们知道驱动程序最开始的注册发生在main之前,如i40e:
[drivers/net/i40e/i40e_ethdev.c]
/* Register the rte_i40e_pmd PCI driver descriptor; net_i40e is the name passed to the registration macro. */
RTE_PMD_REGISTER_PCI(net_i40e, rte_i40e_pmd);
注册driver的过程也很简单,就是将driver统一管理起来
void rte_pci_register(struct rte_pci_driver *driver)
{
TAILQ_INSERT_TAIL(&rte_pci_bus.driver_list, driver, next);
driver->bus = &rte_pci_bus;
}
进而在bus probe时,driver匹配bus下面的device
相比于kernel,driver的结构很简单,如基类:
/**
 * Generic driver descriptor: a list link plus the driver's name and alias.
 */
struct rte_driver {
TAILQ_ENTRY(rte_driver) next; /**< Next in list. */
const char *name; /**< Driver name. */
const char *alias; /**< Driver alias. */
};
一个pci的driver
/**
 * PCI driver descriptor. Embeds the generic struct rte_driver and adds the
 * PCI-specific parts: owning bus, probe/remove callbacks, the table of
 * supported device IDs and the RTE_PCI_DRV_* behaviour flags.
 */
struct rte_pci_driver {
TAILQ_ENTRY(rte_pci_driver) next; /**< Next in list. */
struct rte_driver driver; /**< Inherit core driver. */
struct rte_pci_bus *bus; /**< PCI bus reference. */
pci_probe_t *probe; /**< Device Probe function. */
pci_remove_t *remove; /**< Device Remove function. */
const struct rte_pci_id *id_table; /**< ID table, NULL terminated. */
uint32_t drv_flags; /**< Flags RTE_PCI_DRV_*. */
};
在上一篇说到bus probe时,有两个地方没有展开:一个是设备资源的map,一个是driver的初始化。下面以i40e为例说明一下:
/*
 * PCI driver descriptor of the i40e PMD: the probe/remove entry points,
 * the device ID table it matches and the RTE_PCI_DRV_* flags it requests.
 */
static struct rte_pci_driver rte_i40e_pmd = {
	.probe     = eth_i40e_pci_probe,
	.remove    = eth_i40e_pci_remove,
	.id_table  = pci_id_i40e_map,
	.drv_flags = RTE_PCI_DRV_INTR_LSC |
		     RTE_PCI_DRV_NEED_MAPPING |
		     RTE_PCI_DRV_IOVA_AS_VA,
};
看到drv_flags里有RTE_PCI_DRV_NEED_MAPPING标志,会执行rte_pci_map_device,完整的分析参见DPDK分析——UIO
当驱动和device匹配后会执行驱动的probe函数,对i40e来说是eth_i40e_pci_probe
[drivers/net/i40e/i40e_ethdev.c]
/**
 * PCI probe callback of the i40e PMD.
 *
 * Parses the device arguments (when present), creates the PF ethdev and then
 * tries to create one VF representor ethdev for every entry in
 * eth_da.representor_ports.
 *
 * @param pci_drv  Matching PCI driver (unused).
 * @param pci_dev  PCI device being probed.
 * @return 0 on success; the error code from devargs parsing or from PF
 *         creation; -ENODEV if the PF ethdev cannot be looked up by name
 *         afterwards. A representor that fails to be created is only logged
 *         and does not change the return value.
 */
static int eth_i40e_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
	struct rte_pci_device *pci_dev)
{
	char name[RTE_ETH_NAME_MAX_LEN];
	struct rte_eth_devargs eth_da = { .nb_representor_ports = 0 };
	int i, retval;

	if (pci_dev->device.devargs) {
		/* Pass the address of eth_da so the parser can fill it in. */
		retval = rte_eth_devargs_parse(pci_dev->device.devargs->args,
				&eth_da);
		if (retval)
			return retval;
	}

	/* Create the PF port; its private data is a struct i40e_adapter. */
	retval = rte_eth_dev_create(&pci_dev->device, pci_dev->device.name,
		sizeof(struct i40e_adapter),
		eth_dev_pci_specific_init, pci_dev,
		eth_i40e_dev_init, NULL);

	/* Stop here on failure or when no representor was requested. */
	if (retval || eth_da.nb_representor_ports < 1)
		return retval;

	/* probe VF representor ports */
	struct rte_eth_dev *pf_ethdev = rte_eth_dev_allocated(
		pci_dev->device.name);

	if (pf_ethdev == NULL)
		return -ENODEV;

	for (i = 0; i < eth_da.nb_representor_ports; i++) {
		struct i40e_vf_representor representor = {
			.vf_id = eth_da.representor_ports[i],
			.switch_domain_id = I40E_DEV_PRIVATE_TO_PF(
				pf_ethdev->data->dev_private)->switch_domain_id,
			.adapter = I40E_DEV_PRIVATE_TO_ADAPTER(
				pf_ethdev->data->dev_private)
		};

		/* representor port net_bdf_port */
		snprintf(name, sizeof(name), "net_%s_representor_%d",
			pci_dev->device.name, eth_da.representor_ports[i]);

		retval = rte_eth_dev_create(&pci_dev->device, name,
			sizeof(struct i40e_vf_representor), NULL, NULL,
			i40e_vf_representor_init, &representor);

		if (retval)
			PMD_DRV_LOG(ERR, "failed to create i40e vf "
				"representor %s.", name);
	}

	return 0;
}
其中devargs的来源有两个,第一,启动时通过-b/-w指定的。第二通过(rte_eal_hotplug_add->rte_dev_probe->local_dev_probe->rte_devargs_insert)。在i40e这个参数应该没有指定。
rte_eth_dev_allocate分配以太网抽象,名称为pci_dev->device.name,目前DPDK使用全局rte_eth_devices管理以太网设备
/* Global table of Ethernet device slots, RTE_MAX_ETHPORTS entries in total. */
struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS];
rte_eth_dev_allocate就是检测一下设备有没有被分配出去,如果没有则从上述数组中分配第一个空闲的rte_eth_dev,它在数组中的下标称为port_id。先看一下数据结构:
/**
 * Per-port Ethernet device object: the PMD burst functions, pointers to the
 * device data and per-process private data, the PMD ops table, the backing
 * device and interrupt handle, and the user callback lists.
 */
struct rte_eth_dev {
eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. */
/**
 * Next two fields are per-device data but *data is shared between
 * primary and secondary processes and *process_private is per-process
 * private. The second one is managed by PMDs if necessary.
 */
struct rte_eth_dev_data *data; /**< Pointer to device data. */
void *process_private; /**< Pointer to per-process device data. */
const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
struct rte_device *device; /**< Backing device */
struct rte_intr_handle *intr_handle; /**< Device interrupt handle */
/** User application callbacks for NIC interrupts */
struct rte_eth_dev_cb_list link_intr_cbs;
/**
 * User-supplied functions called from rx_burst to post-process
 * received packets before passing them to the user
 */
struct rte_eth_rxtx_callback *post_rx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];
/**
 * User-supplied functions called from tx_burst to pre-process
 * packets before passing them to the driver for transmission.
 */
struct rte_eth_rxtx_callback *pre_tx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];
enum rte_eth_dev_state state; /**< Flag indicating the port state */
void *security_ctx; /**< Context for security ops */
} __rte_cache_aligned;
rte_eth_dev的data成员(struct rte_eth_dev_data)指向设备的数据,这些数据保存在全局变量rte_eth_dev_shared_data中,创建设备抽象时通过下面的函数关联:
/*
 * Return the ethdev slot for port_id, after binding it to its entry in the
 * shared data area and recording port_id as the last created port.
 */
static struct rte_eth_dev * eth_dev_get(uint16_t port_id)
{
	struct rte_eth_dev *slot = rte_eth_devices + port_id;

	slot->data = rte_eth_dev_shared_data->data + port_id;
	eth_dev_last_created_port = port_id;

	return slot;
}
/**
 * Per-port device data referenced by rte_eth_dev.data: name, queue arrays,
 * PMD private data pointer, link/configuration state, MAC address tables,
 * port id, state bits and per-queue state.
 */
struct rte_eth_dev_data {
char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */
void **rx_queues; /**< Array of pointers to RX queues. */
void **tx_queues; /**< Array of pointers to TX queues. */
uint16_t nb_rx_queues; /**< Number of RX queues. */
uint16_t nb_tx_queues; /**< Number of TX queues. */
struct rte_eth_dev_sriov sriov; /**< SRIOV data */
void *dev_private;
/**< PMD-specific private data.
 * @see rte_eth_dev_release_port()
 */
struct rte_eth_link dev_link; /**< Link-level information & status. */
struct rte_eth_conf dev_conf; /**< Configuration applied to device. */
uint16_t mtu; /**< Maximum Transmission Unit. */
uint32_t min_rx_buf_size;
/**< Common RX buffer size handled by all queues. */
uint64_t rx_mbuf_alloc_failed; /**< RX ring mbuf allocation failures. */
struct ether_addr *mac_addrs;
/**< Device Ethernet link address.
 * @see rte_eth_dev_release_port()
 */
uint64_t mac_pool_sel[ETH_NUM_RECEIVE_MAC_ADDR];
/**< Bitmap associating MAC addresses to pools. */
struct ether_addr *hash_mac_addrs;
/**< Device Ethernet MAC addresses of hash filtering.
 * @see rte_eth_dev_release_port()
 */
uint16_t port_id; /**< Device [external] port identifier. */
__extension__
uint8_t promiscuous : 1, /**< RX promiscuous mode ON(1) / OFF(0). */
scattered_rx : 1, /**< RX of scattered packets is ON(1) / OFF(0) */
all_multicast : 1, /**< RX all multicast mode ON(1) / OFF(0). */
dev_started : 1, /**< Device state: STARTED(1) / STOPPED(0). */
lro : 1; /**< RX LRO is ON(1) / OFF(0) */
uint8_t rx_queue_state[RTE_MAX_QUEUES_PER_PORT];
/**< Queues state: STARTED(1) / STOPPED(0). */
uint8_t tx_queue_state[RTE_MAX_QUEUES_PER_PORT];
/**< Queues state: STARTED(1) / STOPPED(0). */
uint32_t dev_flags; /**< Capabilities. */
enum rte_kernel_driver kdrv; /**< Kernel driver passthrough. */
int numa_node; /**< NUMA node connection. */
struct rte_vlan_filter_conf vlan_filter_conf;
/**< VLAN filter configuration. */
struct rte_eth_dev_owner owner; /**< The port owner. */
uint16_t representor_id;
/**< Switch-specific identifier.
 * Valid if RTE_ETH_DEV_REPRESENTOR in dev_flags.
 */
} __rte_cache_aligned;
rte_eth_dev分配完成后,分配其私有数据,和kernel netdev_priv类似
/*
 * Excerpt: allocate the PMD private area (priv_data_size bytes, aligned to
 * RTE_CACHE_LINE_SIZE) on the device's NUMA node. Skipped when
 * priv_data_size is 0; on failure set -ENOMEM and jump to probe_failed.
 */
if (priv_data_size) {
ethdev->data->dev_private = rte_zmalloc_socket(
name, priv_data_size, RTE_CACHE_LINE_SIZE,
device->numa_node);
if (!ethdev->data->dev_private) {
RTE_LOG(ERR, EAL, "failed to allocate private data");
retval = -ENOMEM;
goto probe_failed;
}
}
在i40e,这个私有数据是
/*
 * Private data of an i40e ethdev (the size passed to rte_eth_dev_create for
 * the PF port): hardware state, PF-or-VF specific state, RX/TX path
 * selection flags, PTP time counters and the ptype/pctype mapping tables.
 */
struct i40e_adapter {
/* Common for both PF and VF */
struct i40e_hw hw;
struct rte_eth_dev *eth_dev;
/* Specific for PF or VF */
union {
struct i40e_pf pf;
struct i40e_vf vf;
};
/* For vector PMD */
bool rx_bulk_alloc_allowed;
bool rx_vec_allowed;
bool tx_simple_allowed;
bool tx_vec_allowed;
/* For PTP */
struct rte_timecounter systime_tc;
struct rte_timecounter rx_tstamp_tc;
struct rte_timecounter tx_tstamp_tc;
/* ptype mapping table */
uint32_t ptype_tbl[I40E_MAX_PKT_TYPE] __rte_cache_min_aligned;
/* flow type to pctype mapping table */
uint64_t pctypes_tbl[I40E_FLOW_TYPE_MAX] __rte_cache_min_aligned;
/* Bit i set when pctypes_tbl[i] is non-zero */
uint64_t flow_types_mask;
/* OR of all pctypes_tbl[] entries */
uint64_t pctypes_mask;
/* For devargs */
uint8_t use_latest_vec;
/* For RSS reta table update */
uint8_t rss_reta_updated;
};
调用总线特定的初始化函数,对PCI设备(如i40e)来说是:
/*
 * PCI bus specific ethdev init hook: copies the PCI device information into
 * the ethdev.
 *
 * Returns 0 on success, -ENODEV when no bus device was supplied.
 */
static inline int eth_dev_pci_specific_init(struct rte_eth_dev *eth_dev, void *bus_device) {
	if (bus_device == NULL)
		return -ENODEV;

	rte_eth_copy_pci_info(eth_dev, (struct rte_pci_device *)bus_device);

	return 0;
}
就是copy一下相关的信息,直接贴了:
/*
 * Copy PCI-level information into an ethdev: the interrupt handle, the
 * LSC/RMV interrupt capability flags derived from the driver flags, the
 * kernel driver type and the NUMA node.
 *
 * Logs an error and does nothing if either pointer is NULL.
 */
static inline void
rte_eth_copy_pci_info(struct rte_eth_dev *eth_dev,
		      struct rte_pci_device *pci_dev)
{
	uint32_t flags = 0;

	if (eth_dev == NULL || pci_dev == NULL) {
		RTE_ETHDEV_LOG(ERR, "NULL pointer eth_dev=%p pci_dev=%p",
			       (void *)eth_dev, (void *)pci_dev);
		return;
	}

	/* Translate PCI driver flags into ethdev device flags. */
	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
		flags |= RTE_ETH_DEV_INTR_LSC;
	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_RMV)
		flags |= RTE_ETH_DEV_INTR_RMV;

	eth_dev->intr_handle = &pci_dev->intr_handle;
	eth_dev->data->dev_flags = flags;
	eth_dev->data->kdrv = pci_dev->kdrv;
	eth_dev->data->numa_node = pci_dev->device.numa_node;
}
接下来的ethdev_init也是驱动特定的,这里是eth_i40e_dev_init
/* Excerpt: install the i40e ops table and the RX / TX / TX-prepare burst functions. */
dev->dev_ops = &i40e_eth_dev_ops;
dev->rx_pkt_burst = i40e_recv_pkts;
dev->tx_pkt_burst = i40e_xmit_pkts;
dev->tx_pkt_prepare = i40e_prep_pkts;
i40e_set_default_ptype_table就是初始化了一下包含所有ptype类型的数组,ptype是接收描述符回写(write-back)中的一个字段,表示接收报文的类型。
rte_eth_copy_pci_info这个功能和前面eth_dev_pci_specific_init重合了,不知道的几个意思.
接下来初始化rte_eth_dev私有数据 i40e_hw,没啥好说的,需要注意一点就是:
/* Excerpt: use the address of PCI memory resource 0 as the base address stored in hw->hw_addr. */
hw->hw_addr = (uint8_t *)(pci_dev->mem_resource[0].addr);
这里addr是大页映射过的地址,在UIO已经分析过了,所以后续可以直接通过I40E_READ_REG去访问pci bar0空间
接下来执行一些琐碎的东西,包括一些数据结构之间指向关系,和一些硬件部件的reset操作
和kernel驱动函数名字差不多,确定mac_type这个是区分网卡类型的,另外读取portid和pf id,初始化nvm
在i40e中,一个重要的改变是引入了admin queues, 以前许多直接访问的配置、资源都要通过admin queues,这里根据手册的描述解释一下admin queues
引入admin queues的原因:
admin queues分成 ATQ和ARQ
其实这两个队列和收发包队列的原理是一模一样的:队列中是描述符,
队列在内存中的位置、大小以及头尾指针由ATQBAH、ATQBAL、ATQLEN、ATQH、ATQT这几个寄存器描述,大概就是下面这个图的意思
无论是ATQ还是ARQ,它们都有direct command和indirect command之分,direct command是指command能容纳在一个desc之内,如果不能容纳,就是indirect command,此时要将desc 的地址位指向一块buffer用于fw使用或传递数据。
所以command分为admin direct command和admin indirect command。这些desc在初始化时的填充方式和fw回写后的内容在手册里都有说明,这里就不再贴了,对照着找就行了。下面附上一张opcode的图,说明下面的信息获取都要通过admin queues
说完了原理,接下来分析一下函数:
static inline void i40e_init_adminq_parameter(struct i40e_hw *hw)
{
hw->aq.num_arq_entries = I40E_AQ_LEN;
hw->aq.num_asq_entries = I40E_AQ_LEN;
hw->aq.arq_buf_size = I40E_AQ_BUF_SZ;
hw->aq.asq_buf_size = I40E_AQ_BUF_SZ;
}
i40e_init_adminq是admin queues初始化的主体,DPDK使用下面的结构描述admin queues:
/*
 * Admin Queue information: the receive (ARQ) and send (ASQ) rings, their
 * configured depth and buffer size, firmware/API version numbers, one
 * spinlock per direction and the last status seen on each queue.
 */
struct i40e_adminq_info {
struct i40e_adminq_ring arq; /* receive queue */
struct i40e_adminq_ring asq; /* send queue */
u32 asq_cmd_timeout; /* send queue cmd write back timeout*/
u16 num_arq_entries; /* receive queue depth */
u16 num_asq_entries; /* send queue depth */
u16 arq_buf_size; /* receive queue buffer size */
u16 asq_buf_size; /* send queue buffer size */
u16 fw_maj_ver; /* firmware major version */
u16 fw_min_ver; /* firmware minor version */
u32 fw_build; /* firmware build number */
u16 api_maj_ver; /* api major version */
u16 api_min_ver; /* api minor version */
struct i40e_spinlock asq_spinlock; /* Send queue spinlock */
struct i40e_spinlock arq_spinlock; /* Receive queue spinlock */
/* last status values on send and receive queues */
enum i40e_admin_queue_err asq_last_status;
enum i40e_admin_queue_err arq_last_status;
};
/*
 * One admin queue ring (used for both ASQ and ARQ): descriptor ring memory,
 * the array of per-descriptor DMA buffer descriptors, software
 * producer/consumer indices and the head/tail/len/bah/bal values passed to
 * wr32() when the ring registers are programmed.
 */
struct i40e_adminq_ring {
struct i40e_virt_mem dma_head; /* space for dma structures */
struct i40e_dma_mem desc_buf; /* descriptor ring memory */
struct i40e_virt_mem cmd_buf; /* command buffer memory */
/* Per-descriptor buffer info array; points into dma_head.va */
union {
struct i40e_dma_mem *asq_bi;
struct i40e_dma_mem *arq_bi;
} r;
u16 count; /* Number of descriptors */
u16 rx_buf_len; /* Admin Receive Queue buffer length */
/* used for interrupt processing */
u16 next_to_use;
u16 next_to_clean;
/* used for queue tracking */
u32 head;
u32 tail;
u32 len;
u32 bah;
u32 bal;
};
i40e_adminq_init_regs初始化上述结构的规格
/**
 * Initialise the Admin Send Queue (ASQ).
 *
 * Resets the software indices, allocates the descriptor ring and the
 * per-descriptor buffers, then programs the queue base registers. On success
 * hw->aq.asq.count is set to the configured queue depth.
 *
 * @param hw  Hardware structure owning the admin queue.
 * @return I40E_SUCCESS, or the status of the step that failed. Whatever was
 *         allocated before the failing step is released again through the
 *         driver's adminq cleanup helpers i40e_free_adminq_asq() and
 *         i40e_free_asq_bufs(), which are defined alongside this function
 *         in the driver and not shown here.
 */
enum i40e_status_code i40e_init_asq(struct i40e_hw *hw)
{
	enum i40e_status_code ret_code = I40E_SUCCESS;

	hw->aq.asq.next_to_use = 0;
	hw->aq.asq.next_to_clean = 0;

	/* allocate the ring memory */
	ret_code = i40e_alloc_adminq_asq_ring(hw);
	if (ret_code != I40E_SUCCESS)
		goto init_adminq_exit;

	/* allocate buffers in the rings */
	ret_code = i40e_alloc_asq_bufs(hw);
	if (ret_code != I40E_SUCCESS)
		goto init_adminq_free_rings;

	/* initialize base registers */
	ret_code = i40e_config_asq_regs(hw);
	if (ret_code != I40E_SUCCESS)
		goto init_config_regs;

	/* success! */
	hw->aq.asq.count = hw->aq.num_asq_entries;
	goto init_adminq_exit;

init_adminq_free_rings:
	/* buffer allocation failed: release the ring memory */
	i40e_free_adminq_asq(hw);
	return ret_code;

init_config_regs:
	/* register programming failed: release the buffers */
	i40e_free_asq_bufs(hw);

init_adminq_exit:
	return ret_code;
}
这个函数是asq(admin发送队列)的初始化,初始化时head和tail应该是重合的,它们在软件中的记录next_to_clean/next_to_use也都是0
接下来:
enum i40e_status_code i40e_alloc_adminq_asq_ring(struct i40e_hw *hw)
{
enum i40e_status_code ret_code;
ret_code = i40e_allocate_dma_mem(hw, &hw->aq.asq.desc_buf,
i40e_mem_atq_ring,
(hw->aq.num_asq_entries *
sizeof(struct i40e_aq_desc)),
I40E_ADMINQ_DESC_ALIGNMENT);
if (ret_code)
return ret_code;
ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.cmd_buf,
(hw->aq.num_asq_entries *
sizeof(struct i40e_asq_cmd_details)));
if (ret_code) {
i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf);
return ret_code;
}
return ret_code;
}
i40e_allocate_dma_mem分配desc ring的空间,注意这部分需要DMA可访问的内存,其物理地址和长度要写入BAH、BAL、LEN寄存器
i40e_allocate_virt_mem分配command details(每个desc对应一个i40e_asq_cmd_details)所用的空间
STATIC enum i40e_status_code i40e_alloc_asq_bufs(struct i40e_hw *hw)
{
enum i40e_status_code ret_code;
struct i40e_dma_mem *bi;
int i;
/* No mapped memory needed yet, just the buffer info structures */
ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.dma_head,
(hw->aq.num_asq_entries * sizeof(struct i40e_dma_mem)));
if (ret_code)
goto alloc_asq_bufs;
hw->aq.asq.r.asq_bi = (struct i40e_dma_mem *)hw->aq.asq.dma_head.va;
/* allocate the mapped buffers */
for (i = 0; i < hw->aq.num_asq_entries; i++) {
bi = &hw->aq.asq.r.asq_bi[i];
ret_code = i40e_allocate_dma_mem(hw, bi,
i40e_mem_asq_buf,
hw->aq.asq_buf_size,
I40E_ADMINQ_DESC_ALIGNMENT);
if (ret_code)
goto unwind_alloc_asq_bufs;
}
alloc_asq_bufs:
return ret_code;
unwind_alloc_asq_bufs:
/* don't try to free the one that failed... */
i--;
for (; i >= 0; i--)
i40e_free_dma_mem(hw, &hw->aq.asq.r.asq_bi[i]);
i40e_free_virt_mem(hw, &hw->aq.asq.dma_head);
return ret_code;
}
这个函数分配描述符对应的buffers
附一张图:
接下来i40e_init_arq和asq的初始化过程差不多,区别在于arq没有i40e_asq_cmd_details,而且对于接收来说,ARQT(tail)初始化时应该指向ring末尾,desc也都是预先初始化好的
如在alloc buffer时:
/*
 * Excerpt from the ARQ buffer allocation: for every receive-queue entry,
 * allocate a DMA buffer and pre-fill the matching descriptor so that it
 * already points at that buffer.
 */
/* allocate the mapped buffers */
for (i = 0; i < hw->aq.num_arq_entries; i++) {
bi = &hw->aq.arq.r.arq_bi[i];
ret_code = i40e_allocate_dma_mem(hw, bi,
i40e_mem_arq_buf,
hw->aq.arq_buf_size,
I40E_ADMINQ_DESC_ALIGNMENT);
if (ret_code)
goto unwind_alloc_arq_bufs;
/* now configure the descriptors for use */
desc = I40E_ADMINQ_DESC(hw->aq.arq, i);
desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_BUF);
/* buffers larger than I40E_AQ_LARGE_BUF additionally get the LB flag */
if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF)
desc->flags |= CPU_TO_LE16(I40E_AQ_FLAG_LB);
desc->opcode = 0;
/* This is in accordance with Admin queue design, there is no
 * register for buffer size configuration
 */
desc->datalen = CPU_TO_LE16((u16)bi->size);
desc->retval = 0;
desc->cookie_high = 0;
desc->cookie_low = 0;
/* address of the buffer (bi->pa), split into high/low dwords */
desc->params.external.addr_high =
CPU_TO_LE32(I40E_HI_DWORD(bi->pa));
desc->params.external.addr_low =
CPU_TO_LE32(I40E_LO_DWORD(bi->pa));
desc->params.external.param0 = 0;
desc->params.external.param1 = 0;
}
而且在初始化dma寄存器时
/* Excerpt: program the ARQ base address (low/high dword of desc_buf.pa). */
wr32(hw, hw->aq.arq.bal, I40E_LO_DWORD(hw->aq.arq.desc_buf.pa));
wr32(hw, hw->aq.arq.bah, I40E_HI_DWORD(hw->aq.arq.desc_buf.pa));
/* Update tail in the HW to post pre-allocated buffers */
wr32(hw, hw->aq.arq.tail, hw->aq.num_arq_entries - 1);
接下来通过一个具体的例子说明admin queues的使用方法:
例子是i40e_aq_get_firmware_version,主要是下面两个调用:
/* Excerpt: build a direct get-version command and send it with no data buffer (buff NULL, size 0). */
i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_version);
status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
这是一个send command,先看描述符是如何填充的:
/*
 * Prepare a descriptor for a direct admin command: clear the whole
 * descriptor, then set the SI flag and the requested opcode (both stored
 * via CPU_TO_LE16).
 */
void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc, u16 opcode)
{
	/* start from an all-zero descriptor */
	i40e_memset((void *)desc, 0, sizeof(*desc), I40E_NONDMA_MEM);

	desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_SI);
	desc->opcode = CPU_TO_LE16(opcode);
}
接着是调用发送接口:
/*
 * Signature only (the body is walked through piece by piece below).
 * Sends one command descriptor on the Admin Send Queue. buff/buff_size
 * describe the optional data buffer; a non-NULL buff is treated as an
 * indirect command. cmd_details may be NULL.
 */
enum i40e_status_code i40e_asq_send_command(struct i40e_hw *hw,
struct i40e_aq_desc *desc,
void *buff, /* can be NULL */
u16 buff_size,
struct i40e_asq_cmd_details *cmd_details)
进入函数主体,分段来说:
/*
 * Excerpt: fetch the details entry that belongs to the next_to_use slot.
 * Copy the caller's cmd_details into it when given, otherwise clear it.
 */
details = I40E_ADMINQ_DETAILS(hw->aq.asq, hw->aq.asq.next_to_use);
if (cmd_details) {
i40e_memcpy(details,
cmd_details,
sizeof(struct i40e_asq_cmd_details),
I40E_NONDMA_TO_NONDMA);
/* If the cmd_details are defined copy the cookie. The
 * CPU_TO_LE32 is not needed here because the data is ignored
 * by the FW, only used by the driver
 */
if (details->cookie) {
desc->cookie_high =
CPU_TO_LE32(I40E_HI_DWORD(details->cookie));
desc->cookie_low =
CPU_TO_LE32(I40E_LO_DWORD(details->cookie));
}
} else {
i40e_memset(details, 0,
sizeof(struct i40e_asq_cmd_details),
I40E_NONDMA_MEM);
}
设置标记:
/* clear requested flags and then set additional flags if defined */
/* flags_dis / flags_ena are taken from the details entry of this command */
desc->flags &= ~CPU_TO_LE16(details->flags_dis);
desc->flags |= CPU_TO_LE16(details->flags_ena);
copy desc
/* initialize the temp desc pointer with the right desc */
desc_on_ring = I40E_ADMINQ_DESC(hw->aq.asq, hw->aq.asq.next_to_use);
/* if the desc is available copy the temp desc to the right place */
/* i.e. copy the caller's descriptor into the ring slot at next_to_use */
i40e_memcpy(desc_on_ring, desc, sizeof(struct i40e_aq_desc),
I40E_NONDMA_TO_DMA);
copy buffer
/* if buff is not NULL assume indirect command */
if (buff != NULL) {
/* DMA buffer pre-allocated for the ring slot at next_to_use */
dma_buff = &(hw->aq.asq.r.asq_bi[hw->aq.asq.next_to_use]);
/* copy the user buff into the respective DMA buff */
i40e_memcpy(dma_buff->va, buff, buff_size,
I40E_NONDMA_TO_DMA);
desc_on_ring->datalen = CPU_TO_LE16(buff_size);
/* Update the address values in the desc with the pa value
 * for respective buffer
 */
desc_on_ring->params.external.addr_high =
CPU_TO_LE32(I40E_HI_DWORD(dma_buff->pa));
desc_on_ring->params.external.addr_low =
CPU_TO_LE32(I40E_LO_DWORD(dma_buff->pa));
}
增加next_to_use, 回绕处理,如果非延迟,直接写tail,这样可以立即发送出
/* Excerpt: advance next_to_use, wrapping to 0 at the end of the ring. */
(hw->aq.asq.next_to_use)++;
if (hw->aq.asq.next_to_use == hw->aq.asq.count)
hw->aq.asq.next_to_use = 0;
/* unless the command is postponed, write the new index to the tail register now */
if (!details->postpone)
wr32(hw, hw->aq.asq.tail, hw->aq.asq.next_to_use);
非异步且非延迟,尝试轮询结果
/*
 * Excerpt: for a command that is neither async nor postponed, poll
 * i40e_asq_done() in steps of 50 (delay units of i40e_usec_delay,
 * presumably microseconds) until it reports completion or the accumulated
 * delay reaches asq_cmd_timeout.
 */
if (!details->async && !details->postpone) {
u32 total_delay = 0;
do {
/* AQ designers suggest use of head for better
 * timing reliability than DD bit
 */
if (i40e_asq_done(hw))
break;
i40e_usec_delay(50);
total_delay += 50;
} while (total_delay < hw->aq.asq_cmd_timeout);
}
一些简单的操作,注释说的很明白了——一些寄存器处理global reset无法reset,就将他们的值写一个非初始化值,这里列一下备忘
这部分内容本来想放到下一篇,但是驱动初始化时没有相关的背景不易看懂代码,因此就在这里一并说明一下,和大多数网卡一样,报文送到cpu前可以做预解析,手册中称为Receive Classification filters,按照优先级可以分为以下四种(filters type):
不同的filters匹配报文后有不同的动作, 如丢弃报文,将报文送入某个index 的queue等等,具体可以参照手册。
在继续描述之前,列一下手册里提到的一些术语(如果我理解错误,请告诉我:) )
截取了手册的章节,protocol具体类型参见该小节描述
下面用一个图来说明在接收方向,以fd filter为例说明报文解析的过程:
首先关注GLQF_ORT和GLQF_PIT,这两个寄存器都是预先定义(predefined)的——从NVM中加载进来的,GLQF_ORT参照下图:
GLQF_ORT在手册里没有找到对应的寄存器,这是一个64项,每项四个字节的表
/* Address of GLQF_ORT entry _i: base 0x00268900, 4 bytes per entry. Defined only if not already provided. */
#ifndef I40E_GLQF_ORT
#define I40E_GLQF_ORT(_i) (0x00268900 + ((_i) * 4))
#endif
/* Address of GLQF_PIT entry _i: base 0x00268C80, 4 bytes per entry. Defined only if not already provided. */
#ifndef I40E_GLQF_PIT
#define I40E_GLQF_PIT(_i) (0x00268C80 + ((_i) * 4))
#endif
它通过PIT_INDX索引GLQF_PIT,FIELD_CNT指示在GLQF_PIT中的项数,GLQF_PIT是一个32项,每项四个字节的表,每项由下面的组成:
这些字段对照上面的图看就好了。
好了,了解了上面的知识,下面按照我的理解说明一下报文解析的过程:
PRTQF_FD_INSET手册里没有,通过代码看下:
/* Address of PRTQF_FD_INSET register _j of entry _i: base 0x00250000, stride 64 bytes per _i and 32 bytes per _j. */
#define I40E_PRTQF_FD_INSET(_i, _j) (0x00250000 + ((_i) * 64 + (_j) * 32)) /* _i=0...63, _j=0...1 */
这说明PRTQF_FD_INSET共有64项,每项包含两个32位寄存器(共8字节),我们对照field vector结构来看看PRTQF_FD_INSET的表项是如何指定提取字段的,这里再贴一下:
上图说明在field vector中DMAC是占三个word(0:2), SMAC也占三个word(3:5),接下来看下代码是如何填充PRTQF_FD_INSET来选择提取这两项的
/* Destination MAC address: bits 63..61 of the 64-bit input-set value */
#define I40E_REG_INSET_L2_DMAC 0xE000000000000000ULL
/* Source MAC address: bits 60..58 of the 64-bit input-set value */
#define I40E_REG_INSET_L2_SMAC 0x1C00000000000000ULL
可以看到I40E_REG_INSET_L2_DMAC在内存中的值是末尾三个“1”,和field vector中对应,而I40E_REG_INSET_L2_SMAC也是一样,注意下大小端很容易分析出来。
基本流程介绍完了,这时候看下函数本身
/*
 * Fill the adapter's default flow-type -> pctype table.
 *
 * pctypes_tbl[] is indexed by flow type (RTE_ETH_FLOW_*); each entry is a
 * bitmask with bit <pctype> set for every pctype mapped to that flow type.
 * Afterwards flow_types_mask gets one bit per flow type with a non-empty
 * entry, and pctypes_mask is the OR of all entries.
 * The "..." line stands for further table assignments elided from this
 * excerpt.
 */
void __attribute__((cold))
i40e_set_default_pctype_table(struct rte_eth_dev *dev)
{
struct i40e_adapter *ad =
I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private);
int i;
/* start from an empty table and empty masks */
for (i = 0; i < I40E_FLOW_TYPE_MAX; i++)
ad->pctypes_tbl[i] = 0ULL;
ad->flow_types_mask = 0ULL;
ad->pctypes_mask = 0ULL;
ad->pctypes_tbl[RTE_ETH_FLOW_FRAG_IPV4] =
(1ULL << I40E_FILTER_PCTYPE_FRAG_IPV4);
ad->pctypes_tbl[RTE_ETH_FLOW_NONFRAG_IPV4_UDP] =
(1ULL << I40E_FILTER_PCTYPE_NONF_IPV4_UDP);
...
/* derive the two summary masks from the filled table */
for (i = 0; i < I40E_FLOW_TYPE_MAX; i++) {
if (ad->pctypes_tbl[i])
ad->flow_types_mask |= (1ULL << i);
ad->pctypes_mask |= ad->pctypes_tbl[i];
}
}
这个函数初始化pctype,对应ad->pctypes_tbl。由于pctype值都比较大,这里以flow_type为索引,用bit位保存pctype,其中flow type像下面这样定义:
/*
 * Flow type identifiers. They are small consecutive integers, used above as
 * the index into pctypes_tbl[] and as the bit position in flow_types_mask.
 * RTE_ETH_FLOW_MAX is one past the last defined flow type.
 */
#define RTE_ETH_FLOW_UNKNOWN 0
#define RTE_ETH_FLOW_RAW 1
#define RTE_ETH_FLOW_IPV4 2
#define RTE_ETH_FLOW_FRAG_IPV4 3
#define RTE_ETH_FLOW_NONFRAG_IPV4_TCP 4
#define RTE_ETH_FLOW_NONFRAG_IPV4_UDP 5
#define RTE_ETH_FLOW_NONFRAG_IPV4_SCTP 6
#define RTE_ETH_FLOW_NONFRAG_IPV4_OTHER 7
#define RTE_ETH_FLOW_IPV6 8
#define RTE_ETH_FLOW_FRAG_IPV6 9
#define RTE_ETH_FLOW_NONFRAG_IPV6_TCP 10
#define RTE_ETH_FLOW_NONFRAG_IPV6_UDP 11
#define RTE_ETH_FLOW_NONFRAG_IPV6_SCTP 12
#define RTE_ETH_FLOW_NONFRAG_IPV6_OTHER 13
#define RTE_ETH_FLOW_L2_PAYLOAD 14
#define RTE_ETH_FLOW_IPV6_EX 15
#define RTE_ETH_FLOW_IPV6_TCP_EX 16
#define RTE_ETH_FLOW_IPV6_UDP_EX 17
#define RTE_ETH_FLOW_PORT 18
/**< Consider device port number as a flow differentiator */
#define RTE_ETH_FLOW_VXLAN 19 /**< VXLAN protocol based flow */
#define RTE_ETH_FLOW_GENEVE 20 /**< GENEVE protocol based flow */
#define RTE_ETH_FLOW_NVGRE 21 /**< NVGRE protocol based flow */
#define RTE_ETH_FLOW_VXLAN_GPE 22 /**< VXLAN-GPE protocol based flow */
#define RTE_ETH_FLOW_MAX 23
下面这条语句保存一个pctype的信息:
/* Entry for flow type FRAG_IPV4: a bitmask with only bit I40E_FILTER_PCTYPE_FRAG_IPV4 set. */
ad->pctypes_tbl[RTE_ETH_FLOW_FRAG_IPV4] =
(1ULL << I40E_FILTER_PCTYPE_FRAG_IPV4);
掩码信息,对照看就好了:
/* Bit i marks flow type i; pctypes_mask accumulates the pctype bits of that flow type. */
ad->flow_types_mask |= (1ULL << i);
ad->pctypes_mask |= ad->pctypes_tbl[i];
这个函数是填充各个filters的input set,其实就是通过分析pctype来填充对应的xxx_inset寄存器。
/*
 * Program the default input set of every pctype.
 *
 * For each pctype from NONF_IPV4_UDP up to L2_PAYLOAD that maps to a known
 * flow type, the default input set is translated into a 64-bit register
 * value and written as two 32-bit halves to the flow-director input-set
 * registers. Unless multi-driver support is enabled, the same value is also
 * written to the hash input-set registers, and the FD/hash mask registers
 * are written (unused ones cleared). The input set is finally recorded in
 * pf. The "..." line stands for local declarations elided from this excerpt.
 */
static void i40e_filter_input_set_init(struct i40e_pf *pf)
{
...
for (pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
pctype <= I40E_FILTER_PCTYPE_L2_PAYLOAD; pctype++) {
flow_type = i40e_pctype_to_flowtype(pf->adapter, pctype);
/* skip pctypes that have no flow type mapping */
if (flow_type == RTE_ETH_FLOW_UNKNOWN)
continue;
input_set = i40e_get_default_input_set(pctype);
/* num = number of mask registers produced for this input set */
num = i40e_generate_inset_mask_reg(input_set, mask_reg,
I40E_INSET_MASK_NUM_REG);
inset_reg = i40e_translate_input_set_reg(hw->mac.type,
input_set);
/* flow director input set: low 32 bits, then high 32 bits */
i40e_check_write_reg(hw, I40E_PRTQF_FD_INSET(pctype, 0),
(uint32_t)(inset_reg & UINT32_MAX));
i40e_check_write_reg(hw, I40E_PRTQF_FD_INSET(pctype, 1),
(uint32_t)((inset_reg >>
I40E_32_BIT_WIDTH) & UINT32_MAX));
/* global registers are only written when multi-driver support is off */
if (!pf->support_multi_driver) {
i40e_check_write_global_reg(hw,
I40E_GLQF_HASH_INSET(0, pctype),
(uint32_t)(inset_reg & UINT32_MAX));
i40e_check_write_global_reg(hw,
I40E_GLQF_HASH_INSET(1, pctype),
(uint32_t)((inset_reg >>
I40E_32_BIT_WIDTH) & UINT32_MAX));
for (i = 0; i < num; i++) {
i40e_check_write_global_reg(hw,
I40E_GLQF_FD_MSK(i, pctype),
mask_reg[i]);
i40e_check_write_global_reg(hw,
I40E_GLQF_HASH_MSK(i, pctype),
mask_reg[i]);
}
/*clear unused mask registers of the pctype */
for (i = num; i < I40E_INSET_MASK_NUM_REG; i++) {
i40e_check_write_global_reg(hw,
I40E_GLQF_FD_MSK(i, pctype),
0);
i40e_check_write_global_reg(hw,
I40E_GLQF_HASH_MSK(i, pctype),
0);
}
} else {
PMD_DRV_LOG(ERR, "Input set setting is not supported.");
}
I40E_WRITE_FLUSH(hw);
/* store the default input set */
if (!pf->support_multi_driver)
pf->hash_input_set[pctype] = input_set;
pf->fdir.input_set[pctype] = input_set;
}
}
/* Remember the input set per pctype for the hash and flow-director filters. */
pf->hash_input_set[pctype] = input_set;
pf->fdir.input_set[pctype] = input_set;
最后记录input set
/*
 * Handler for the PF "other causes" interrupt (irq0).
 *
 * Disables irq0, reads the cause register PFINT_ICR0 and logs every cause
 * bit that is set. VF reset and admin queue causes are additionally
 * dispatched to their handlers. irq0 is re-enabled before returning, also
 * when no interrupt event was indicated.
 *
 * param is the struct rte_eth_dev the handler was registered for.
 */
static void i40e_dev_interrupt_handler(void *param)
{
	struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
	struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private);
	uint32_t cause;

	/* Disable interrupt */
	i40e_pf_disable_irq0(hw);

	/* read out interrupt causes */
	cause = I40E_READ_REG(hw, I40E_PFINT_ICR0);

	if ((cause & I40E_PFINT_ICR0_INTEVENT_MASK) == 0) {
		/* No interrupt event indicated */
		PMD_DRV_LOG(INFO, "No interrupt event");
	} else {
		if (cause & I40E_PFINT_ICR0_ECC_ERR_MASK)
			PMD_DRV_LOG(ERR, "ICR0: unrecoverable ECC error");
		if (cause & I40E_PFINT_ICR0_MAL_DETECT_MASK)
			PMD_DRV_LOG(ERR, "ICR0: malicious programming detected");
		if (cause & I40E_PFINT_ICR0_GRST_MASK)
			PMD_DRV_LOG(INFO, "ICR0: global reset requested");
		if (cause & I40E_PFINT_ICR0_PCI_EXCEPTION_MASK)
			PMD_DRV_LOG(INFO, "ICR0: PCI exception activated");
		if (cause & I40E_PFINT_ICR0_STORM_DETECT_MASK)
			PMD_DRV_LOG(INFO, "ICR0: a change in the storm control state");
		if (cause & I40E_PFINT_ICR0_HMC_ERR_MASK)
			PMD_DRV_LOG(ERR, "ICR0: HMC error");
		if (cause & I40E_PFINT_ICR0_PE_CRITERR_MASK)
			PMD_DRV_LOG(ERR, "ICR0: protocol engine critical error");

		if (cause & I40E_PFINT_ICR0_VFLR_MASK) {
			PMD_DRV_LOG(INFO, "ICR0: VF reset detected");
			i40e_dev_handle_vfr_event(dev);
		}
		if (cause & I40E_PFINT_ICR0_ADMINQ_MASK) {
			PMD_DRV_LOG(INFO, "ICR0: adminq event");
			i40e_dev_handle_aq_msg(dev);
		}
	}

	/* Enable interrupt */
	i40e_pf_enable_irq0(hw);
}
先分析其中uio的部分,vfio的部分后续分析vfio时再补上
/*
 * Enable interrupts on a UIO interrupt handle by writing the int value 1 to
 * its file descriptor.
 *
 * Returns 0 on success, -1 (after logging the errno text) if the write
 * fails.
 */
static int uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
	const int value = 1;

	if (write(intr_handle->fd, &value, sizeof(value)) >= 0)
		return 0;

	RTE_LOG(ERR, EAL,
		"Error enabling interrupts for fd %d (%s)\n",
		intr_handle->fd, strerror(errno));
	return -1;
}
就是向/dev/uioX对应的fd中写入一个值为1的int,具体参见 DPDK分析——UIO
好多函数只列了标题,后续考虑补上
【1】Intel® Ethernet Controller X710/XXV710/XL710: Datasheet