DPDK初始化分析(五)

目录

一、概述

二、驱动注册

三、驱动初始化

3.1 rte_eth_dev_create

3.2 eth_i40e_dev_init

3.2.1 i40e_init_shared_code

3.2.2 i40e_init_adminq_parameter

3.2.3 i40e_init_adminq

3.2.4 i40e_hw_init

3.2.5 i40e_config_automask

3.2.6 i40e_set_default_pctype_table​

3.2.7 i40e_filter_input_set_init

3.2.8 config_floating_veb

3.2.9 i40e_clear_pxe_mode

3.2.10 i40e_dev_sync_phy_type

3.2.11 i40e_configure_registers

3.2.12 i40e_get_cap

3.2.13 i40e_configure_registers

3.2.14 i40e_get_cap

3.2.15 i40e_pf_parameter_init

3.2.16 i40e_res_pool_init

3.2.17 i40e_init_lan_hmc

3.2.18 i40e_configure_lan_hmc

3.2.19 i40e_set_fc

3.2.20 i40e_pf_setup

3.2.21 i40e_vsi_config_double_vlan

3.2.22 i40e_dcb_init_configure

3.2.23 i40e_pf_host_init

3.2.24 rte_intr_callback_register

3.2.25 i40e_pf_config_irq0

3.2.26 i40e_pf_enable_irq0

3.2.27 rte_intr_enable

3.2.28 i40e_flex_payload_reg_set_default

3.2.29 i40e_add_tx_flow_control_drop_filter

3.2.30 i40e_aq_set_mac_config

3.2.31 i40e_tm_conf_init

3.2.32 i40e_init_customized_info

3.2.33 i40e_init_ethtype_filter_list

3.2.34 i40e_init_tunnel_filter_list

3.2.35 i40e_init_fdir_filter_list

3.2.36 i40e_init_queue_region_conf

四、总结

五、参考


一、概述

分析一下驱动程序的初始化,以i40e为例,一些无关紧要的函数就直接列个标题或做简要说明了。

本文主要还是自己的一个备忘。我自己读的时候也有很多含混不清的地方:一方面是自己水平有限,另一方面是公开版的datasheet很多地方说得含糊,甚至相关寄存器都没有直接给出,有的地方只能通过代码实现反推,所以本文仅供参考。

二、驱动注册

在DPDK初始化分析(一)中,我们知道驱动程序最开始的注册发生在main之前,如i40e:

[driver/net/i40e/i40e_ethdev.c]

RTE_PMD_REGISTER_PCI(net_i40e, rte_i40e_pmd);

注册driver的过程也很简单,就是将driver统一管理起来

/*
 * Register a PCI driver: record the PCI bus as the driver's bus and append
 * the driver to that bus's driver list.
 */
void rte_pci_register(struct rte_pci_driver *driver)
{
	/* Bind the driver to the (single, global) PCI bus object. */
	driver->bus = &rte_pci_bus;

	/* Queue it at the tail of the bus's driver list. */
	TAILQ_INSERT_TAIL(&rte_pci_bus.driver_list, driver, next);
}

进而在bus probe时,driver匹配bus下面的device

相比于kernel,driver的结构很简单,如基类:

/**
 * Generic driver record: list linkage plus a name and an alias.
 * Bus-specific driver structures embed it (rte_pci_driver does so through
 * its "driver" member).
 */
struct rte_driver {
	TAILQ_ENTRY(rte_driver) next;  /**< Next in list. */
	const char *name;                   /**< Driver name. */
	const char *alias;              /**< Driver alias. */
};

一个pci的driver

/**
 * PCI driver record.
 * rte_pci_register() links instances on rte_pci_bus.driver_list and fills in
 * the "bus" back-pointer; the remaining members are supplied by the driver.
 */
struct rte_pci_driver {
	TAILQ_ENTRY(rte_pci_driver) next;  /**< Next in list. */
	struct rte_driver driver;          /**< Inherit core driver. */
	struct rte_pci_bus *bus;           /**< PCI bus reference. */
	pci_probe_t *probe;                /**< Device Probe function. */
	pci_remove_t *remove;              /**< Device Remove function. */
	const struct rte_pci_id *id_table; /**< ID table, NULL terminated. */
	uint32_t drv_flags;                /**< Flags RTE_PCI_DRV_*. */
};

在上一篇说到bus probe时,有两个地方没有展开:一个是设备资源的map,一个是driver的初始化。下面以i40e为例说明一下:

/*
 * PCI driver instance of the i40e PMD (the object handed to
 * RTE_PMD_REGISTER_PCI).
 */
static struct rte_pci_driver rte_i40e_pmd = {
	/* Table of PCI ids this driver matches. */
	.id_table = pci_id_i40e_map,
	/*
	 * RTE_PCI_DRV_INTR_LSC is later translated into RTE_ETH_DEV_INTR_LSC
	 * by rte_eth_copy_pci_info().
	 */
	.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC |
		     RTE_PCI_DRV_IOVA_AS_VA,
	/* Called when a device matches / is removed. */
	.probe = eth_i40e_pci_probe,
	.remove = eth_i40e_pci_remove,
};

看到drv_flags里有RTE_PCI_DRV_NEED_MAPPING标志,会执行rte_pci_map_device,完整的分析参见DPDK分析——UIO

三、驱动初始化

当驱动和device匹配后会执行驱动的probe函数,对i40e来说是eth_i40e_pci_probe

[driver/net/i40e/i40e_ethdev.c]

/**
 * PCI probe callback of the i40e PMD.
 *
 * Creates the PF ethdev for @pci_dev and, when the device arguments request
 * representor ports, one additional ethdev per requested VF representor.
 *
 * @param pci_drv  Matching PCI driver (unused).
 * @param pci_dev  PCI device being probed.
 * @return 0 on success; the error returned by devargs parsing or by the PF
 *         rte_eth_dev_create() call; -ENODEV if the PF ethdev cannot be
 *         looked up afterwards. A failure to create an individual
 *         representor is only logged and does not change the return value.
 */
static int eth_i40e_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
	struct rte_pci_device *pci_dev)
{
	char name[RTE_ETH_NAME_MAX_LEN];
	struct rte_eth_devargs eth_da = { .nb_representor_ports = 0 };
	int i, retval;

	/* Parse the ethdev-level device arguments, if any were supplied. */
	if (pci_dev->device.devargs) {
		retval = rte_eth_devargs_parse(pci_dev->device.devargs->args,
				&eth_da);
		if (retval)
			return retval;
	}

	/* Create the PF port; its private data is a struct i40e_adapter. */
	retval = rte_eth_dev_create(&pci_dev->device, pci_dev->device.name,
		sizeof(struct i40e_adapter),
		eth_dev_pci_specific_init, pci_dev,
		eth_i40e_dev_init, NULL);

	/* Done on failure, or when no representor ports were requested. */
	if (retval || eth_da.nb_representor_ports < 1)
		return retval;

	/* probe VF representor ports */
	struct rte_eth_dev *pf_ethdev = rte_eth_dev_allocated(
		pci_dev->device.name);

	if (pf_ethdev == NULL)
		return -ENODEV;

	for (i = 0; i < eth_da.nb_representor_ports; i++) {
		struct i40e_vf_representor representor = {
			.vf_id = eth_da.representor_ports[i],
			.switch_domain_id = I40E_DEV_PRIVATE_TO_PF(
				pf_ethdev->data->dev_private)->switch_domain_id,
			.adapter = I40E_DEV_PRIVATE_TO_ADAPTER(
				pf_ethdev->data->dev_private)
		};

		/* representor port net_bdf_port */
		snprintf(name, sizeof(name), "net_%s_representor_%d",
			pci_dev->device.name, eth_da.representor_ports[i]);

		retval = rte_eth_dev_create(&pci_dev->device, name,
			sizeof(struct i40e_vf_representor), NULL, NULL,
			i40e_vf_representor_init, &representor);

		/* Best effort: keep going with the remaining representors. */
		if (retval)
			PMD_DRV_LOG(ERR, "failed to create i40e vf "
				"representor %s.", name);
	}

	return 0;
}
  • pci_dev->device.devargs在bus scan的时候赋值

其中devargs的来源有两个,第一,启动时通过-b/-w指定的。第二通过(rte_eal_hotplug_add->rte_dev_probe->local_dev_probe->rte_devargs_insert)。在i40e这个参数应该没有指定。

3.1 rte_eth_dev_create

  • rte_eth_dev_create,类比于内核的netdevice driver probe中的接口创建及注册。

rte_eth_dev_allocate分配以太网抽象,名称为pci_dev->device.name,目前DPDK使用全局rte_eth_devices管理以太网设备

struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS];

rte_eth_dev_allocate就是检测一下该名称的设备有没有被分配出去,如果没有,则从上述数组中分配第一个空闲的rte_eth_dev,它在数组中的下标称为port_id。先看一下数据结构:

/**
 * Per-port ethernet device object. Instances live in the global
 * rte_eth_devices[] array and are indexed by port id (see eth_dev_get()).
 */
struct rte_eth_dev {
	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
	eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. */
	/**
	 * Next two fields are per-device data but *data is shared between
	 * primary and secondary processes and *process_private is per-process
	 * private. The second one is managed by PMDs if necessary.
	 */
	struct rte_eth_dev_data *data;  /**< Pointer to device data. */
	void *process_private; /**< Pointer to per-process device data. */
	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
	struct rte_device *device; /**< Backing device */
	struct rte_intr_handle *intr_handle; /**< Device interrupt handle */
	/** User application callbacks for NIC interrupts */
	struct rte_eth_dev_cb_list link_intr_cbs;
	/**
	 * User-supplied functions called from rx_burst to post-process
	 * received packets before passing them to the user
	 */
	struct rte_eth_rxtx_callback *post_rx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];
	/**
	 * User-supplied functions called from tx_burst to pre-process
	 * packets before passing them to the driver for transmission.
	 */
	struct rte_eth_rxtx_callback *pre_tx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];
	enum rte_eth_dev_state state; /**< Flag indicating the port state */
	void *security_ctx; /**< Context for security ops */
} __rte_cache_aligned;

rte_eth_dev_data指向设备的数据,这些数据保存在全局变量rte_eth_dev_shared_data中,创建设备抽象时通过下面的函数关联:

/*
 * Return the rte_eth_dev slot for port_id after pointing its "data" member
 * at the matching entry of the shared data array. Also records port_id as
 * the most recently created port.
 */
static struct rte_eth_dev * eth_dev_get(uint16_t port_id)
{
	struct rte_eth_dev *dev = rte_eth_devices + port_id;

	dev->data = rte_eth_dev_shared_data->data + port_id;
	eth_dev_last_created_port = port_id;

	return dev;
}

 

/**
 * Per-port data block. eth_dev_get() points rte_eth_dev.data at
 * rte_eth_dev_shared_data->data[port_id], i.e. one of these per port.
 */
struct rte_eth_dev_data {
	char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */

	void **rx_queues; /**< Array of pointers to RX queues. */
	void **tx_queues; /**< Array of pointers to TX queues. */
	uint16_t nb_rx_queues; /**< Number of RX queues. */
	uint16_t nb_tx_queues; /**< Number of TX queues. */

	struct rte_eth_dev_sriov sriov;    /**< SRIOV data */

	void *dev_private;
			/**< PMD-specific private data.
			 *   @see rte_eth_dev_release_port()
			 */

	struct rte_eth_link dev_link;   /**< Link-level information & status. */
	struct rte_eth_conf dev_conf;   /**< Configuration applied to device. */
	uint16_t mtu;                   /**< Maximum Transmission Unit. */
	uint32_t min_rx_buf_size;
			/**< Common RX buffer size handled by all queues. */

	uint64_t rx_mbuf_alloc_failed; /**< RX ring mbuf allocation failures. */
	struct ether_addr *mac_addrs;
			/**< Device Ethernet link address.
			 *   @see rte_eth_dev_release_port()
			 */
	uint64_t mac_pool_sel[ETH_NUM_RECEIVE_MAC_ADDR];
			/**< Bitmap associating MAC addresses to pools. */
	struct ether_addr *hash_mac_addrs;
			/**< Device Ethernet MAC addresses of hash filtering.
			 *   @see rte_eth_dev_release_port()
			 */
	uint16_t port_id;           /**< Device [external] port identifier. */

	__extension__
	uint8_t promiscuous   : 1, /**< RX promiscuous mode ON(1) / OFF(0). */
		scattered_rx : 1,  /**< RX of scattered packets is ON(1) / OFF(0) */
		all_multicast : 1, /**< RX all multicast mode ON(1) / OFF(0). */
		dev_started : 1,   /**< Device state: STARTED(1) / STOPPED(0). */
		lro         : 1;   /**< RX LRO is ON(1) / OFF(0) */
	uint8_t rx_queue_state[RTE_MAX_QUEUES_PER_PORT];
			/**< Queues state: STARTED(1) / STOPPED(0). */
	uint8_t tx_queue_state[RTE_MAX_QUEUES_PER_PORT];
			/**< Queues state: STARTED(1) / STOPPED(0). */
	uint32_t dev_flags;             /**< Capabilities. */
	enum rte_kernel_driver kdrv;    /**< Kernel driver passthrough. */
	int numa_node;                  /**< NUMA node connection. */
	struct rte_vlan_filter_conf vlan_filter_conf;
			/**< VLAN filter configuration. */
	struct rte_eth_dev_owner owner; /**< The port owner. */
	uint16_t representor_id;
			/**< Switch-specific identifier.
			 *   Valid if RTE_ETH_DEV_REPRESENTOR in dev_flags.
			 */
} __rte_cache_aligned;
  • 看起来就是网卡硬件队列,link情况,统计等信息

rte_eth_dev分配完成后,分配其私有数据,和kernel netdev_priv类似

		if (priv_data_size) {
			ethdev->data->dev_private = rte_zmalloc_socket(
				name, priv_data_size, RTE_CACHE_LINE_SIZE,
				device->numa_node);

			if (!ethdev->data->dev_private) {
				RTE_LOG(ERR, EAL, "failed to allocate private data");
				retval = -ENOMEM;
				goto probe_failed;
			}
		}

在i40e,这个私有数据是

/*
 * Private data of an i40e port. eth_i40e_pci_probe() passes
 * sizeof(struct i40e_adapter) to rte_eth_dev_create() as the private data
 * size, so this is what rte_eth_dev_data.dev_private points at.
 */
struct i40e_adapter {
	/* Common for both PF and VF */
	struct i40e_hw hw;
	struct rte_eth_dev *eth_dev;

	/* Specific for PF or VF */
	union {
		struct i40e_pf pf;
		struct i40e_vf vf;
	};

	/* For vector PMD */
	bool rx_bulk_alloc_allowed;
	bool rx_vec_allowed;
	bool tx_simple_allowed;
	bool tx_vec_allowed;

	/* For PTP */
	struct rte_timecounter systime_tc;
	struct rte_timecounter rx_tstamp_tc;
	struct rte_timecounter tx_tstamp_tc;

	/* ptype mapping table */
	uint32_t ptype_tbl[I40E_MAX_PKT_TYPE] __rte_cache_min_aligned;
	/* flow type to pctype mapping table */
	uint64_t pctypes_tbl[I40E_FLOW_TYPE_MAX] __rte_cache_min_aligned;
	/* bit i set when pctypes_tbl[i] is non-zero */
	uint64_t flow_types_mask;
	/* OR of all pctypes_tbl[] entries */
	uint64_t pctypes_mask;

	/* For devargs */
	uint8_t use_latest_vec;

	/* For RSS reta table update */
	uint8_t rss_reta_updated;
};

调用驱动特定的初始化函数,在i40e中是

/*
 * Bus-specific init hook for PCI ethdevs: copies PCI-level information from
 * the device passed as bus_device into eth_dev.
 * Returns 0 on success, -ENODEV when bus_device is NULL.
 */
static inline int eth_dev_pci_specific_init(struct rte_eth_dev *eth_dev, void *bus_device) {
	struct rte_pci_device *pci = bus_device;

	if (pci == NULL)
		return -ENODEV;

	rte_eth_copy_pci_info(eth_dev, pci);
	return 0;
}

就是copy一下相关的信息,直接贴了:

/*
 * Copy PCI-level information into an ethdev: the interrupt handle, the
 * LSC/RMV interrupt capability flags, the kernel driver type and the NUMA
 * node. Logs an error and does nothing when either pointer is NULL.
 */
static inline void
rte_eth_copy_pci_info(struct rte_eth_dev *eth_dev,
	struct rte_pci_device *pci_dev)
{
	uint32_t flags = 0;

	if (!eth_dev || !pci_dev) {
		RTE_ETHDEV_LOG(ERR, "NULL pointer eth_dev=%p pci_dev=%p",
			(void *)eth_dev, (void *)pci_dev);
		return;
	}

	eth_dev->intr_handle = &pci_dev->intr_handle;

	/* Translate PCI driver interrupt flags into ethdev flags. */
	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
		flags |= RTE_ETH_DEV_INTR_LSC;
	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_RMV)
		flags |= RTE_ETH_DEV_INTR_RMV;
	eth_dev->data->dev_flags = flags;

	eth_dev->data->numa_node = pci_dev->device.numa_node;
	eth_dev->data->kdrv = pci_dev->kdrv;
}

3.2 eth_i40e_dev_init

接下来的ethdev_init也是驱动特定的,这里是eth_i40e_dev_init

	dev->dev_ops = &i40e_eth_dev_ops;
	dev->rx_pkt_burst = i40e_recv_pkts;
	dev->tx_pkt_burst = i40e_xmit_pkts;
	dev->tx_pkt_prepare = i40e_prep_pkts;
  • 关键的接口列了一下,下篇分析报文收发再展开

i40e_set_default_ptype_table就是初始化了一下保存ptype所有类型的数组,ptype是接收描述符write-back(wb)中的一个field,表示接收报文的类型。

DPDK初始化分析(五)_第1张图片

rte_eth_copy_pci_info这个功能和前面eth_dev_pci_specific_init重合了,不知道的几个意思.

接下来初始化rte_eth_dev私有数据 i40e_hw,没啥好说的,需要注意一点就是:

hw->hw_addr = (uint8_t *)(pci_dev->mem_resource[0].addr);

这里addr是BAR0经过mmap映射后得到的用户态虚拟地址,映射过程在UIO一文中已经分析过了,所以后续可以直接通过I40E_READ_REG去访问pci bar0空间

接下来执行一些琐碎的东西,包括一些数据结构之间指向关系,和一些硬件部件的reset操作

3.2.1 i40e_init_shared_code

和kernel驱动函数名字差不多,确定mac_type这个是区分网卡类型的,另外读取portid和pf id,初始化nvm

3.2.2 i40e_init_adminq_parameter

在i40e中,一个重要的改变是引入了admin queues, 以前许多直接访问的配置、资源都要通过admin queues,这里根据手册的描述解释一下admin queues

引入admin queues的原因:

  • 驱动和具体设备资源中间引入FW抽象接口,这样硬件上一些修改就不会影响到驱动程序
  • 去除驱动中不重要的MMIO访问
  • 统一VF/PF访问

admin queues分成 ATQ和ARQ

  • ATQ(admin transmit queue):driver通过ATQ发送一个命令,FW处理后将命令的处理结果回写
  • ARQ admin receive queue, driver准备好buffer,当FW post event时,填充描述符和buffer

其实这两个队列和收发包队列的原理是一模一样的:队列中是描述符,

DPDK初始化分析(五)_第2张图片

队列的位置和大小由ATQBAH、ATQBAL、ATQLEN、ATQH、ATQT这几个寄存器描述,大概就是下面这个图的意思

DPDK初始化分析(五)_第3张图片

  • BAH/BAL base address, 指定环形缓冲区起始地址
  • LEN指定desc count
  • 对transmit来说,driver控制tail,fw控制head,这样当driver发送command时候,将desc填充到tail,并将tail移动,这样fw会检测head-tail这之间的desc进行对command的处理
  • 对receive来说,driver预先将desc全部填充好,所以tail初始时指向ring末尾;当fw要post event时,就使用已填充好的desc及其对应的buffer

无论是ATQ还是ARQ,它们都有direct command和indirect command之分,direct command是指command能容纳在一个desc之内,如果不能容纳,就是indirect command,此时要将desc 的地址位指向一块buffer用于fw使用或传递数据。

所以command分为admin direct command和admin indirect command,这些desc在初始化时的填充方式和fw回写后的内容在手册里都写了,这里就不再贴了,对照着找就行了。下面附上一张opcode的图,说明下面的信息获取都要通过admin queues

DPDK初始化分析(五)_第4张图片

说完了原理,接下来分析一下函数:

static inline void i40e_init_adminq_parameter(struct i40e_hw *hw)
{
	hw->aq.num_arq_entries = I40E_AQ_LEN;
	hw->aq.num_asq_entries = I40E_AQ_LEN;
	hw->aq.arq_buf_size = I40E_AQ_BUF_SZ;
	hw->aq.asq_buf_size = I40E_AQ_BUF_SZ;
}

3.2.3 i40e_init_adminq

该函数是admin queues初始化的主体,DPDK使用下面的结构描述admin queues

/* Admin Queue information */
/*
 * Top-level admin queue state kept in i40e_hw (accessed as hw->aq): the two
 * rings, their configured depth and buffer size, firmware/API version
 * fields, per-queue locks and the last status seen on each queue.
 */
struct i40e_adminq_info {
	struct i40e_adminq_ring arq;    /* receive queue */
	struct i40e_adminq_ring asq;    /* send queue */
	u32 asq_cmd_timeout;            /* send queue cmd write back timeout*/
	u16 num_arq_entries;            /* receive queue depth */
	u16 num_asq_entries;            /* send queue depth */
	u16 arq_buf_size;               /* receive queue buffer size */
	u16 asq_buf_size;               /* send queue buffer size */
	u16 fw_maj_ver;                 /* firmware major version */
	u16 fw_min_ver;                 /* firmware minor version */
	u32 fw_build;                   /* firmware build number */
	u16 api_maj_ver;                /* api major version */
	u16 api_min_ver;                /* api minor version */

	struct i40e_spinlock asq_spinlock; /* Send queue spinlock */
	struct i40e_spinlock arq_spinlock; /* Receive queue spinlock */

	/* last status values on send and receive queues */
	enum i40e_admin_queue_err asq_last_status;
	enum i40e_admin_queue_err arq_last_status;
};
/*
 * One admin queue ring (used for both the send and the receive queue):
 * the descriptor ring memory, the per-descriptor buffer bookkeeping and the
 * software ring indexes.
 */
struct i40e_adminq_ring {
	struct i40e_virt_mem dma_head;	/* space for dma structures */
	struct i40e_dma_mem desc_buf;	/* descriptor ring memory */
	struct i40e_virt_mem cmd_buf;	/* command buffer memory */

	/* Array of per-descriptor DMA buffers; storage comes from dma_head. */
	union {
		struct i40e_dma_mem *asq_bi;
		struct i40e_dma_mem *arq_bi;
	} r;

	u16 count;		/* Number of descriptors */
	u16 rx_buf_len;		/* Admin Receive Queue buffer length */

	/* used for interrupt processing */
	u16 next_to_use;
	u16 next_to_clean;

	/*
	 * used for queue tracking; these are passed as the register argument
	 * of wr32() when the ring is programmed
	 */
	u32 head;
	u32 tail;
	u32 len;
	u32 bah;
	u32 bal;
};
  • i40e_adminq_ring 是具体ring结构的抽象,对照前面说过的结构很容易理解,在实现中关注next_to_use, next_to_clean,这两个量是记录软件处理ring指针位置的。

i40e_adminq_init_regs初始化上述结构的规格

/**
 * i40e_init_asq - initialize the admin send queue (ASQ)
 * @hw: pointer to the hardware structure
 *
 * Resets the software ring indexes, allocates the descriptor ring and the
 * per-descriptor buffers, programs the queue registers and finally records
 * the ring size in hw->aq.asq.count.
 *
 * Returns I40E_SUCCESS, or the status of the first step that failed (after
 * releasing what had been allocated so far).
 *
 * NOTE(review): the cleanup labels below were missing from this excerpt and
 * are restored to mirror the upstream i40e base driver; confirm against the
 * i40e_adminq.c of the DPDK release being described.
 */
enum i40e_status_code i40e_init_asq(struct i40e_hw *hw)
{
	enum i40e_status_code ret_code = I40E_SUCCESS;

	hw->aq.asq.next_to_use = 0;
	hw->aq.asq.next_to_clean = 0;

	/* allocate the ring memory */
	ret_code = i40e_alloc_adminq_asq_ring(hw);
	if (ret_code != I40E_SUCCESS)
		goto init_adminq_exit;

	/* allocate buffers in the rings */
	ret_code = i40e_alloc_asq_bufs(hw);
	if (ret_code != I40E_SUCCESS)
		goto init_adminq_free_rings;

	/* initialize base registers */
	ret_code = i40e_config_asq_regs(hw);
	if (ret_code != I40E_SUCCESS)
		goto init_config_regs;

	/* success! */
	hw->aq.asq.count = hw->aq.num_asq_entries;
	goto init_adminq_exit;

init_adminq_free_rings:
	/* buffer allocation failed: release the descriptor ring */
	i40e_free_adminq_asq(hw);
	return ret_code;

init_config_regs:
	/* register programming failed: release the buffers */
	i40e_free_asq_bufs(hw);

init_adminq_exit:
	return ret_code;
}

这个函数是asq(admin发送队列)的初始化,初始化时head和tail应该是重合的,它们对应的软件记录next_to_clean/next_to_use也都是0

接下来:

enum i40e_status_code i40e_alloc_adminq_asq_ring(struct i40e_hw *hw)
{
	enum i40e_status_code ret_code;

	ret_code = i40e_allocate_dma_mem(hw, &hw->aq.asq.desc_buf,
					 i40e_mem_atq_ring,
					 (hw->aq.num_asq_entries *
					 sizeof(struct i40e_aq_desc)),
					 I40E_ADMINQ_DESC_ALIGNMENT);
	if (ret_code)
		return ret_code;

	ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.cmd_buf,
					  (hw->aq.num_asq_entries *
					  sizeof(struct i40e_asq_cmd_details)));
	if (ret_code) {
		i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf);
		return ret_code;
	}

	return ret_code;
}
  • i40e_allocate_dma_mem 分配desc ring空间,注意这部分内存需要拿到物理地址,对应BAH、BAL、LEN寄存器

  • i40e_allocate_virt_mem分配每个desc对应的command details(i40e_asq_cmd_details)空间

STATIC enum i40e_status_code i40e_alloc_asq_bufs(struct i40e_hw *hw)
{
	enum i40e_status_code ret_code;
	struct i40e_dma_mem *bi;
	int i;

	/* No mapped memory needed yet, just the buffer info structures */
	ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.dma_head,
		(hw->aq.num_asq_entries * sizeof(struct i40e_dma_mem)));
	if (ret_code)
		goto alloc_asq_bufs;
	hw->aq.asq.r.asq_bi = (struct i40e_dma_mem *)hw->aq.asq.dma_head.va;

	/* allocate the mapped buffers */
	for (i = 0; i < hw->aq.num_asq_entries; i++) {
		bi = &hw->aq.asq.r.asq_bi[i];
		ret_code = i40e_allocate_dma_mem(hw, bi,
						 i40e_mem_asq_buf,
						 hw->aq.asq_buf_size,
						 I40E_ADMINQ_DESC_ALIGNMENT);
		if (ret_code)
			goto unwind_alloc_asq_bufs;
	}
alloc_asq_bufs:
	return ret_code;

unwind_alloc_asq_bufs:
	/* don't try to free the one that failed... */
	i--;
	for (; i >= 0; i--)
		i40e_free_dma_mem(hw, &hw->aq.asq.r.asq_bi[i]);
	i40e_free_virt_mem(hw, &hw->aq.asq.dma_head);

	return ret_code;
}
  • 这个函数分配描述符对应的buffers

附一张图:

DPDK初始化分析(五)_第5张图片

接下来i40e_init_arq和asq的初始化过程差不多,区别在于arq没有i40e_asq_cmd_details,而且对于接收来说ARQT初始化时应该指向ring末尾,而且desc应该都是预先初始化好的

如在alloc buffer时:

	/* allocate the mapped buffers */
	for (i = 0; i < hw->aq.num_arq_entries; i++) {
		bi = &hw->aq.arq.r.arq_bi[i];
		ret_code = i40e_allocate_dma_mem(hw, bi,
						 i40e_mem_arq_buf,
						 hw->aq.arq_buf_size,
						 I40E_ADMINQ_DESC_ALIGNMENT);
		if (ret_code)
			goto unwind_alloc_arq_bufs;

		/* now configure the descriptors for use */
		desc = I40E_ADMINQ_DESC(hw->aq.arq, i);

		desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_BUF);
		if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF)
			desc->flags |= CPU_TO_LE16(I40E_AQ_FLAG_LB);
		desc->opcode = 0;
		/* This is in accordance with Admin queue design, there is no
		 * register for buffer size configuration
		 */
		desc->datalen = CPU_TO_LE16((u16)bi->size);
		desc->retval = 0;
		desc->cookie_high = 0;
		desc->cookie_low = 0;
		desc->params.external.addr_high =
			CPU_TO_LE32(I40E_HI_DWORD(bi->pa));
		desc->params.external.addr_low =
			CPU_TO_LE32(I40E_LO_DWORD(bi->pa));
		desc->params.external.param0 = 0;
		desc->params.external.param1 = 0;
	}

而且在初始化dma寄存器时

	wr32(hw, hw->aq.arq.bal, I40E_LO_DWORD(hw->aq.arq.desc_buf.pa));
	wr32(hw, hw->aq.arq.bah, I40E_HI_DWORD(hw->aq.arq.desc_buf.pa));

	/* Update tail in the HW to post pre-allocated buffers */
	wr32(hw, hw->aq.arq.tail, hw->aq.num_arq_entries - 1);

接下来通过一个具体的例子说明admin queues的使用方法:

例子是i40e_aq_get_firmware_version,主要是下面两个调用:

	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_version);
	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);

这是一个send command,先看描述符是如何填充的:

/*
 * Prepare a descriptor for a direct admin command: clear the whole
 * descriptor, then set the opcode and the I40E_AQ_FLAG_SI flag (both stored
 * little-endian).
 */
void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc, u16 opcode)
{
	/* start from an all-zero descriptor */
	i40e_memset((void *)desc, 0, sizeof(*desc), I40E_NONDMA_MEM);

	desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_SI);
	desc->opcode = CPU_TO_LE16(opcode);
}
  • 主要就是填充opcode,这里是i40e_aqc_opc_get_version

接着是调用发送接口:

enum i40e_status_code i40e_asq_send_command(struct i40e_hw *hw,
                struct i40e_aq_desc *desc,
                void *buff, /* can be NULL */
                u16  buff_size,
                struct i40e_asq_cmd_details *cmd_details)

  • desc 就是描述符
  • buff, buff_size 是每个command可能用到的buffer,上面的图中有展示
  • cmd_details 上面我们也分配了对应的结构了,后面会看到它的作用

进入函数主体,分段来说:

	details = I40E_ADMINQ_DETAILS(hw->aq.asq, hw->aq.asq.next_to_use);
	if (cmd_details) {
		i40e_memcpy(details,
			    cmd_details,
			    sizeof(struct i40e_asq_cmd_details),
			    I40E_NONDMA_TO_NONDMA);

		/* If the cmd_details are defined copy the cookie.  The
		 * CPU_TO_LE32 is not needed here because the data is ignored
		 * by the FW, only used by the driver
		 */
		if (details->cookie) {
			desc->cookie_high =
				CPU_TO_LE32(I40E_HI_DWORD(details->cookie));
			desc->cookie_low =
				CPU_TO_LE32(I40E_LO_DWORD(details->cookie));
		}
	} else {
		i40e_memset(details, 0,
			    sizeof(struct i40e_asq_cmd_details),
			    I40E_NONDMA_MEM);
	}
  • 如果用户传递了cmd_details,就将其copy到next_to_use对应的details槽位中(并在cookie非0时写入desc),否则将该槽位清零

设置标记:

	/* clear requested flags and then set additional flags if defined */
	desc->flags &= ~CPU_TO_LE16(details->flags_dis);
	desc->flags |= CPU_TO_LE16(details->flags_ena);

copy desc

	/* initialize the temp desc pointer with the right desc */
	desc_on_ring = I40E_ADMINQ_DESC(hw->aq.asq, hw->aq.asq.next_to_use);

	/* if the desc is available copy the temp desc to the right place */
	i40e_memcpy(desc_on_ring, desc, sizeof(struct i40e_aq_desc),
		    I40E_NONDMA_TO_DMA);

copy buffer

	/* if buff is not NULL assume indirect command */
	if (buff != NULL) {
		dma_buff = &(hw->aq.asq.r.asq_bi[hw->aq.asq.next_to_use]);
		/* copy the user buff into the respective DMA buff */
		i40e_memcpy(dma_buff->va, buff, buff_size,
			    I40E_NONDMA_TO_DMA);
		desc_on_ring->datalen = CPU_TO_LE16(buff_size);

		/* Update the address values in the desc with the pa value
		 * for respective buffer
		 */
		desc_on_ring->params.external.addr_high =
				CPU_TO_LE32(I40E_HI_DWORD(dma_buff->pa));
		desc_on_ring->params.external.addr_low =
				CPU_TO_LE32(I40E_LO_DWORD(dma_buff->pa));
	}

增加next_to_use, 回绕处理,如果非延迟,直接写tail,这样可以立即发送出

	(hw->aq.asq.next_to_use)++;
	if (hw->aq.asq.next_to_use == hw->aq.asq.count)
		hw->aq.asq.next_to_use = 0;
	if (!details->postpone)
		wr32(hw, hw->aq.asq.tail, hw->aq.asq.next_to_use);

非异步且非延迟,尝试轮询结果

	if (!details->async && !details->postpone) {
		u32 total_delay = 0;

		do {
			/* AQ designers suggest use of head for better
			 * timing reliability than DD bit
			 */
			if (i40e_asq_done(hw))
				break;
			i40e_usec_delay(50);
			total_delay += 50;
		} while (total_delay < hw->aq.asq_cmd_timeout);
	}

 

3.2.4 i40e_hw_init

一些简单的操作,注释说得很明白了——有些寄存器除了global reset之外无法被reset,因此需要由驱动把它们写成期望的初始值,这里列一下备忘

  • extended_tag, 清I40E_PFQF_CTL_0,Disable symmetric hash per port

3.2.5 i40e_config_automask

3.2.6 i40e_set_default_pctype_table

这部分内容本来想放到下一篇,但是驱动初始化时没有相关的背景不易看懂代码,因此就在这里一并说明一下,和大多数网卡一样,报文送到cpu前可以做预解析,手册中称为Receive Classification filters,按照优先级可以分为以下四种(filters type):

  • EtherType
  • Flow Director (FD)
  • MAC / VLAN
  • Hash filters (包括RSS)

不同的filters匹配报文后有不同的动作, 如丢弃报文,将报文送入某个index 的queue等等,具体可以参照手册。

在继续描述之前,列一下手册里提到的一些术语(如果我理解错误,请告诉我:) )

  • protocol layers  它是整个报文的一部分,硬件会解析报文前480字节,确定报文类型可以是L2, L3, L4

DPDK初始化分析(五)_第6张图片

截取了手册的章节,protocol具体类型参见该小节描述

  • filter field 硬件会根据配置从protocol layer中提取最多三个部分,称为filter field,每个field最多30bytes

DPDK初始化分析(五)_第7张图片

  • field vector   提取的filter field 组成 field vector,这是一个128-byte的结构,存储报文的内容

DPDK初始化分析(五)_第8张图片

DPDK初始化分析(五)_第9张图片

DPDK初始化分析(五)_第10张图片

  • pctype        packet classification type,用于指示报文分类
  • input set     指示从field vector提取报文字段的规则,类型由pctype指定

DPDK初始化分析(五)_第11张图片

下面用一个图来说明在接收方向,以fd filter为例说明报文解析的过程:

DPDK初始化分析(五)_第12张图片

首先关注GLQF_ORT 和GLQF_PIT,这两个寄存器都是预先定义(predefined)的——从NVM中加载进来的,GLQF_ORT参照下图:

DPDK初始化分析(五)_第13张图片

GLQF_ORT在手册里没有找到对应的寄存器,这是一个64项,每项四个字节的表

/*
 * Fallback definitions, used only when these macros are not already defined.
 * Each yields the register offset of entry _i in a table of 4-byte entries
 * (base + _i * 4).
 */
#ifndef I40E_GLQF_ORT
#define I40E_GLQF_ORT(_i)    (0x00268900 + ((_i) * 4))
#endif
#ifndef I40E_GLQF_PIT
#define I40E_GLQF_PIT(_i)    (0x00268C80 + ((_i) * 4))
#endif

它通过PIT_INDX索引GLQF_PIT,FIELD_CNT指示在GLQF_PIT中的项数,GLQF_PIT是一个32项,每项四个字节的表,每项由下面的组成:

DPDK初始化分析(五)_第14张图片

这些字段对照上面的图看就好了。

好了,了解了上面的知识,下面按照我的理解说明一下报文解析的过程:

  1. 报文来临,硬件解析480字节,识别出报文类型,得到protocol layer匹配了GLQF_ORT 的第m项,PIT_INDX=m,FIELD_CNT=2,说明解析报文在GLQF_PIT的index是m且在PIT中占2项
  2. 来看GLQF_PIT的m和m+1项,这两项对应两个filter field,对应上述寄存器的SOURCE_OFF和FSIZE字段,通过这两个字段从protocol layer将filter field提取出来,填充到field vector中
  3. field vector可以被所有filters共享,这里我们以fd为例,fd通过自己的input set指定从field vector获取的内容,上图中描述的是将input set置位的bit从field vector取出来,在实际读代码的过程中发现input set填充的寄存器PRTQF_FD_INSET和上面的图有些小差异。

PRTQF_FD_INSET手册里没有,通过代码看下:

/*
 * Register offset of flow director input set word _j of entry _i:
 * the offset advances by 64 per _i and by 32 per _j.
 */
#define I40E_PRTQF_FD_INSET(_i, _j)      (0x00250000 + ((_i) * 64 + (_j) * 32)) /* _i=0...63, _j=0...1 */

这说明PRTQF_FD_INSET是64项每项8byte的表,我们对照field vector结构来看看PRTQF_FD_INSET的表项是如何指定提取字段的,这里再贴一下:

上图说明在field vector中DMAC是占三个word(0:2), SMAC也占三个word(3:5),接下来看下代码是如何填充PRTQF_FD_INSET来选择提取这两项的

/* Destination MAC address: the three most significant bits, 63..61 */
#define I40E_REG_INSET_L2_DMAC                   0xE000000000000000ULL
/* Source MAC address: the next three bits, 60..58 */
#define I40E_REG_INSET_L2_SMAC                   0x1C00000000000000ULL

可以看到I40E_REG_INSET_L2_DMAC的值是最高的三个bit(63:61)为"1",和field vector中DMAC占用的三个word(0:2)对应;I40E_REG_INSET_L2_SMAC也是一样,是接下来的三个bit(60:58),对应word(3:5)。注意一下大小端,很容易分析出来。

基本流程介绍完了,这时候看下函数本身

/*
 * Build the default flow-type -> pctype mapping of the adapter.
 * pctypes_tbl[] is indexed by flow type; each entry is a bitmask with bit
 * (1 << pctype) set for the pctype mapped to that flow type. After the
 * table is filled, flow_types_mask gets one bit per flow type that has a
 * non-zero entry and pctypes_mask is the OR of all entries.
 * (Excerpt: most of the table assignments are elided with "...".)
 */
void __attribute__((cold))
i40e_set_default_pctype_table(struct rte_eth_dev *dev)
{
	struct i40e_adapter *ad =
			I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
	struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private);
	int i;

	/* start from an empty table and empty masks */
	for (i = 0; i < I40E_FLOW_TYPE_MAX; i++)
		ad->pctypes_tbl[i] = 0ULL;
	ad->flow_types_mask = 0ULL;
	ad->pctypes_mask = 0ULL;

	/* one bit per pctype, stored at the index of its flow type */
	ad->pctypes_tbl[RTE_ETH_FLOW_FRAG_IPV4] =
				(1ULL << I40E_FILTER_PCTYPE_FRAG_IPV4);
	ad->pctypes_tbl[RTE_ETH_FLOW_NONFRAG_IPV4_UDP] =
				(1ULL << I40E_FILTER_PCTYPE_NONF_IPV4_UDP);
    ...


	/* derive the two summary masks from the filled table */
	for (i = 0; i < I40E_FLOW_TYPE_MAX; i++) {
		if (ad->pctypes_tbl[i])
			ad->flow_types_mask |= (1ULL << i);
		ad->pctypes_mask |= ad->pctypes_tbl[i];
	}
}

这个函数初始化pctype,对应ad->pctypes_tbl,由于pctype值都比较大,使用flow_type为索引、使用bit保存pctype,其中flow type像下面这样定义:

/*
 * Flow type identifiers. i40e_set_default_pctype_table() uses them as
 * indexes into pctypes_tbl[] and as bit positions in flow_types_mask.
 * RTE_ETH_FLOW_MAX is one past the highest flow type defined here.
 */
#define RTE_ETH_FLOW_UNKNOWN             0
#define RTE_ETH_FLOW_RAW                 1
#define RTE_ETH_FLOW_IPV4                2
#define RTE_ETH_FLOW_FRAG_IPV4           3
#define RTE_ETH_FLOW_NONFRAG_IPV4_TCP    4
#define RTE_ETH_FLOW_NONFRAG_IPV4_UDP    5
#define RTE_ETH_FLOW_NONFRAG_IPV4_SCTP   6
#define RTE_ETH_FLOW_NONFRAG_IPV4_OTHER  7
#define RTE_ETH_FLOW_IPV6                8
#define RTE_ETH_FLOW_FRAG_IPV6           9
#define RTE_ETH_FLOW_NONFRAG_IPV6_TCP   10
#define RTE_ETH_FLOW_NONFRAG_IPV6_UDP   11
#define RTE_ETH_FLOW_NONFRAG_IPV6_SCTP  12
#define RTE_ETH_FLOW_NONFRAG_IPV6_OTHER 13
#define RTE_ETH_FLOW_L2_PAYLOAD         14
#define RTE_ETH_FLOW_IPV6_EX            15
#define RTE_ETH_FLOW_IPV6_TCP_EX        16
#define RTE_ETH_FLOW_IPV6_UDP_EX        17
#define RTE_ETH_FLOW_PORT               18
	/**< Consider device port number as a flow differentiator */
#define RTE_ETH_FLOW_VXLAN              19 /**< VXLAN protocol based flow */
#define RTE_ETH_FLOW_GENEVE             20 /**< GENEVE protocol based flow */
#define RTE_ETH_FLOW_NVGRE              21 /**< NVGRE protocol based flow */
#define RTE_ETH_FLOW_VXLAN_GPE          22 /**< VXLAN-GPE protocol based flow */
#define RTE_ETH_FLOW_MAX                23

下面这条语句保存一个pctype的信息:

	ad->pctypes_tbl[RTE_ETH_FLOW_FRAG_IPV4] =
				(1ULL << I40E_FILTER_PCTYPE_FRAG_IPV4);

掩码信息,对照看就好了:

ad->flow_types_mask |= (1ULL << i);
ad->pctypes_mask |= ad->pctypes_tbl[i];

3.2.7 i40e_filter_input_set_init

这个函数是填充各个filters的input set,其实就是通过分析pctype来填充对应的xxx_inset寄存器。

/*
 * Program the default input set of every mapped pctype into the flow
 * director (FD) input set registers and, unless multi-driver support is
 * enabled, into the global hash input set and FD/hash mask registers; the
 * input set is also remembered in pf.
 * (Excerpt: the local declarations at the top are elided with "...".)
 */
static void i40e_filter_input_set_init(struct i40e_pf *pf)
{
    ...
	for (pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
	     pctype <= I40E_FILTER_PCTYPE_L2_PAYLOAD; pctype++) {
		 flow_type = i40e_pctype_to_flowtype(pf->adapter, pctype);

		/* pctypes without a flow type mapping are left untouched */
		if (flow_type == RTE_ETH_FLOW_UNKNOWN)
			continue;

		input_set = i40e_get_default_input_set(pctype);

		/* num = how many entries of mask_reg[] were filled in */
		num = i40e_generate_inset_mask_reg(input_set, mask_reg,
						   I40E_INSET_MASK_NUM_REG);

		/* 64-bit register image of the input set */
		inset_reg = i40e_translate_input_set_reg(hw->mac.type,
					input_set);

		/* FD input set: low 32 bits into word 0, high 32 bits into word 1 */
		i40e_check_write_reg(hw, I40E_PRTQF_FD_INSET(pctype, 0),
				      (uint32_t)(inset_reg & UINT32_MAX));
		i40e_check_write_reg(hw, I40E_PRTQF_FD_INSET(pctype, 1),
				     (uint32_t)((inset_reg >>
				     I40E_32_BIT_WIDTH) & UINT32_MAX));
		if (!pf->support_multi_driver) {
			/* hash input set: same value, global registers */
			i40e_check_write_global_reg(hw,
					    I40E_GLQF_HASH_INSET(0, pctype),
					    (uint32_t)(inset_reg & UINT32_MAX));
			i40e_check_write_global_reg(hw,
					     I40E_GLQF_HASH_INSET(1, pctype),
					     (uint32_t)((inset_reg >>
					      I40E_32_BIT_WIDTH) & UINT32_MAX));

			/* the same masks go to both the FD and hash mask registers */
			for (i = 0; i < num; i++) {
				i40e_check_write_global_reg(hw,
						    I40E_GLQF_FD_MSK(i, pctype),
						    mask_reg[i]);
				i40e_check_write_global_reg(hw,
						  I40E_GLQF_HASH_MSK(i, pctype),
						  mask_reg[i]);
			}
			/*clear unused mask registers of the pctype */
			for (i = num; i < I40E_INSET_MASK_NUM_REG; i++) {
				i40e_check_write_global_reg(hw,
						    I40E_GLQF_FD_MSK(i, pctype),
						    0);
				i40e_check_write_global_reg(hw,
						  I40E_GLQF_HASH_MSK(i, pctype),
						  0);
			}
		} else {
			/* global registers are not written in multi-driver mode */
			PMD_DRV_LOG(ERR, "Input set setting is not supported.");
		}
		I40E_WRITE_FLUSH(hw);

		/* store the default input set */
		if (!pf->support_multi_driver)
			pf->hash_input_set[pctype] = input_set;
		pf->fdir.input_set[pctype] = input_set;
	}
}
  • 首先根据pctype查找到flow_type
  • 根据pctype确定input set,这里input set不是实际写入寄存器的,是使用bit将pctype的信息转换,说明具体要提取报文的信息,具体可以参考i40e_get_default_input_set中的default_inset_table数组
  • 接下来通过i40e_translate_input_set_reg将input set转换成可以写入寄存器的实际数据,都是通过表映射的,寄存器的内容我们已经看到过了,就是每项的bit对应field vector的项,表的名称inset_map_common
  • 接下来就是将寄存器的值写入
  • 以上是fd input set的写入,最后也将hash input set写入,二者差不多,就不写了
pf->hash_input_set[pctype] = input_set;
pf->fdir.input_set[pctype] = input_set;

最后记录input set

3.2.8 config_floating_veb

3.2.9 i40e_clear_pxe_mode

3.2.10 i40e_dev_sync_phy_type

3.2.11 i40e_configure_registers

3.2.12 i40e_get_cap

3.2.13 i40e_configure_registers

3.2.14 i40e_get_cap

3.2.15 i40e_pf_parameter_init

3.2.16 i40e_res_pool_init

3.2.17 i40e_init_lan_hmc

3.2.18 i40e_configure_lan_hmc

3.2.19 i40e_set_fc

3.2.20 i40e_pf_setup

3.2.21 i40e_vsi_config_double_vlan

3.2.22 i40e_dcb_init_configure

3.2.23 i40e_pf_host_init

3.2.24 rte_intr_callback_register

/*
 * Interrupt callback for the PF irq0 vector; param is the rte_eth_dev.
 * Interrupts are disabled while the ICR0 cause register is read and
 * decoded: most causes are only logged, a VF reset and an admin queue event
 * are dispatched to their handlers. Interrupts are re-enabled on exit in
 * every case.
 */
static void i40e_dev_interrupt_handler(void *param)
{
	struct rte_eth_dev *eth_dev = (struct rte_eth_dev *)param;
	struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
	uint32_t cause;

	/* Disable interrupt */
	i40e_pf_disable_irq0(hw);

	/* read out interrupt causes */
	cause = I40E_READ_REG(hw, I40E_PFINT_ICR0);

	if ((cause & I40E_PFINT_ICR0_INTEVENT_MASK) == 0) {
		/* No interrupt event indicated */
		PMD_DRV_LOG(INFO, "No interrupt event");
	} else {
		/* causes that are reported but not acted upon */
		if (cause & I40E_PFINT_ICR0_ECC_ERR_MASK)
			PMD_DRV_LOG(ERR, "ICR0: unrecoverable ECC error");
		if (cause & I40E_PFINT_ICR0_MAL_DETECT_MASK)
			PMD_DRV_LOG(ERR, "ICR0: malicious programming detected");
		if (cause & I40E_PFINT_ICR0_GRST_MASK)
			PMD_DRV_LOG(INFO, "ICR0: global reset requested");
		if (cause & I40E_PFINT_ICR0_PCI_EXCEPTION_MASK)
			PMD_DRV_LOG(INFO, "ICR0: PCI exception activated");
		if (cause & I40E_PFINT_ICR0_STORM_DETECT_MASK)
			PMD_DRV_LOG(INFO, "ICR0: a change in the storm control state");
		if (cause & I40E_PFINT_ICR0_HMC_ERR_MASK)
			PMD_DRV_LOG(ERR, "ICR0: HMC error");
		if (cause & I40E_PFINT_ICR0_PE_CRITERR_MASK)
			PMD_DRV_LOG(ERR, "ICR0: protocol engine critical error");

		/* causes that are handled */
		if (cause & I40E_PFINT_ICR0_VFLR_MASK) {
			PMD_DRV_LOG(INFO, "ICR0: VF reset detected");
			i40e_dev_handle_vfr_event(eth_dev);
		}
		if (cause & I40E_PFINT_ICR0_ADMINQ_MASK) {
			PMD_DRV_LOG(INFO, "ICR0: adminq event");
			i40e_dev_handle_aq_msg(eth_dev);
		}
	}

	/* Enable interrupt */
	i40e_pf_enable_irq0(hw);
}

3.2.25 i40e_pf_config_irq0

3.2.26 i40e_pf_enable_irq0

3.2.27 rte_intr_enable

先分析里面的uio部分,vfio相关的内容留到后续分析vfio的时候补上

/*
 * Enable interrupts on a UIO device by writing the integer 1 to the
 * handle's file descriptor.
 * Returns 0 on success, -1 (after logging the errno text) when write() fails.
 */
static int uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
	const int value = 1;

	if (write(intr_handle->fd, &value, sizeof(value)) >= 0)
		return 0;

	RTE_LOG(ERR, EAL,
		"Error enabling interrupts for fd %d (%s)\n",
		intr_handle->fd, strerror(errno));
	return -1;
}

就是向/dev/uioX中写入一个值为1的int,具体参见 DPDK分析——UIO

3.2.28 i40e_flex_payload_reg_set_default

3.2.29 i40e_add_tx_flow_control_drop_filter

3.2.30 i40e_aq_set_mac_config

3.2.31 i40e_tm_conf_init

3.2.32 i40e_init_customized_info

3.2.33 i40e_init_ethtype_filter_list

3.2.34 i40e_init_tunnel_filter_list

3.2.35 i40e_init_fdir_filter_list

3.2.36 i40e_init_queue_region_conf

 

四、总结

好多函数只列了标题,后续考虑补上

五、参考

【1】Intel® Ethernet Controller X710/XXV710/XL710: Datasheet

 

 

 

你可能感兴趣的:(DPDK)