DPDK学习(网卡收发包)

ixgbe_rx/tx_queue

/** RX queue state: HW descriptor ring, SW mbuf ring and bookkeeping. */
struct ixgbe_rx_queue {
	struct rte_mempool  *mb_pool; /**< mbuf pool to populate RX ring. */
	volatile union ixgbe_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
	uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
	volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
	volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
	struct ixgbe_rx_entry *sw_ring; /**< address of RX software ring. */
	struct ixgbe_scattered_rx_entry *sw_sc_ring; /**< address of scattered Rx software ring. */
	struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
	struct rte_mbuf *pkt_last_seg; /**< Last segment of current packet. */
	uint64_t            mbuf_initializer; /**< value to init mbufs */
	uint16_t            nb_rx_desc; /**< number of RX descriptors. */
	uint16_t            rx_tail;  /**< next descriptor to poll (driver-side tail). */
	uint16_t            nb_rx_hold; /**< number of held free RX desc. */
	uint16_t rx_nb_avail; /**< nr of staged pkts ready to ret to app */
	uint16_t rx_next_avail; /**< idx of next staged pkt to ret to app */
	uint16_t rx_free_trigger; /**< triggers rx buffer allocation */
	uint16_t            rx_using_sse; /**< nonzero when the vector (SSE) RX path is in use */
#ifdef RTE_IXGBE_INC_VECTOR
	uint16_t            rxrearm_nb;     /**< number of remaining to be re-armed */
	uint16_t            rxrearm_start;  /**< the idx we start the re-arming from */
#endif
	uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
	uint16_t            queue_id; /**< RX queue index. */
	uint16_t            reg_idx;  /**< RX queue register index. */
	uint16_t            pkt_type_mask;  /**< Packet type mask for different NICs. */
	uint8_t             port_id;  /**< Device port identifier. */
	uint8_t             crc_len;  /**< 0 if CRC stripped, 4 otherwise. */
	uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
	uint8_t             rx_deferred_start; /**< not in global dev start. */
	/** flags to set in mbuf when a vlan is detected. */
	uint64_t            vlan_flags;
	/** need to alloc dummy mbuf, for wraparound when scanning hw ring */
	struct rte_mbuf fake_mbuf;
	/** hold packets to return to application */
	struct rte_mbuf *rx_stage[RTE_PMD_IXGBE_RX_MAX_BURST*2];
};

/** TX queue state: HW descriptor ring, SW entry ring and thresholds. */
struct ixgbe_tx_queue {
	/** TX ring virtual address. */
	volatile union ixgbe_adv_tx_desc *tx_ring;
	uint64_t            tx_ring_phys_addr; /**< TX ring DMA address. */
	union {
		struct ixgbe_tx_entry *sw_ring; /**< address of SW ring for scalar PMD. */
		struct ixgbe_tx_entry_v *sw_ring_v; /**< address of SW ring for vector PMD */
	};
	volatile uint32_t   *tdt_reg_addr; /**< Address of TDT register. */
	uint16_t            nb_tx_desc;    /**< number of TX descriptors. */
	uint16_t            tx_tail;       /**< current value of TDT reg. */
	/** Start freeing TX buffers if there are fewer free descriptors than
	    this value. */
	uint16_t            tx_free_thresh;
	/** Number of TX descriptors to use before RS bit is set. */
	uint16_t            tx_rs_thresh;
	/** Number of TX descriptors used since RS bit was set. */
	uint16_t            nb_tx_used;
	/** Index to last TX descriptor to have been cleaned. */
	uint16_t            last_desc_cleaned;
	/** Total number of TX descriptors ready to be allocated. */
	uint16_t            nb_tx_free;
	uint16_t tx_next_dd; /**< next desc to scan for DD bit */
	uint16_t tx_next_rs; /**< next desc to set RS bit */
	uint16_t            queue_id;      /**< TX queue index. */
	uint16_t            reg_idx;       /**< TX queue register index. */
	uint8_t             port_id;       /**< Device port identifier. */
	uint8_t             pthresh;       /**< Prefetch threshold register. */
	uint8_t             hthresh;       /**< Host threshold register. */
	uint8_t             wthresh;       /**< Write-back threshold reg. */
	uint32_t txq_flags; /**< Holds flags for this TXq */
	uint32_t            ctx_curr;      /**< Hardware context states. */
	/** Hardware context0 history. */
	struct ixgbe_advctx_info ctx_cache[IXGBE_CTX_NUM];
	const struct ixgbe_txq_ops *ops;       /**< txq ops */
	uint8_t             tx_deferred_start; /**< not in global dev start. */
};

ixgbe_rx/tx_entry

/** One slot of the RX software ring; mirrors one HW RX descriptor. */
struct ixgbe_rx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};

/** One slot of the TX software ring; mirrors one HW TX descriptor. */
struct ixgbe_tx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
	uint16_t next_id; /**< Index of next descriptor in ring. */
	uint16_t last_id; /**< Index of last scattered descriptor. */
};

rte_eth_rx/tx_burst()

/** Retrieve up to nb_pkts received packets from the given queue of port_id.
 *  Excerpt — elisions ("...") omit validation/tracing from the real function. */
static inline uint16_t
rte_eth_rx_burst(uint8_t port_id, uint16_t queue_id,
		 struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
	/* Look up the device that owns port_id. */
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
	...
	/* Dispatch to the PMD burst callback; for ixgbe this is ixgbe_recv_pkts(). */
	/* NOTE(review): nb_rx is int16_t in this excerpt; confirm against the
	 * upstream definition, which returns uint16_t. */
	int16_t nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
			rx_pkts, nb_pkts);
	...
}

/** Send up to nb_pkts packets on the given queue of port_id.
 *  Excerpt — elisions ("...") omit validation/tracing from the real function. */
static inline uint16_t
rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	/* Look up the device that owns port_id. */
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
	...
	/* Dispatch to the PMD burst callback; for ixgbe this is ixgbe_xmit_pkts(). */
	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
}

ixgbe_recv_pkts()

接收时回写:
1、网卡使用DMA写Rx FIFO中的Frame到Rx Ring Buffer中的mbuf,设置desc的DD为1
2、网卡驱动取走mbuf后,将desc恢复为read格式(即清零hdr_addr,从而清零DD),并在持有的desc数超过rx_free_thresh时更新RDT

/** Scalar ixgbe RX burst: harvest completed descriptors, refill the ring
 *  with fresh mbufs, and hand received mbufs to the application.
 *  Excerpt — elisions ("...") omit error paths and stats from the real code. */
uint16_t
ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	...
	nb_rx = 0;
	nb_hold = 0;
	rxq = rx_queue;
	rx_id = rxq->rx_tail; /* plays the role of ixgbe's next_to_clean */
	rx_ring = rxq->rx_ring;
	sw_ring = rxq->sw_ring;
	...
	while (nb_rx < nb_pkts) {
		...
		/* Descriptor that rx_tail currently points at. */
		rxdp = &rx_ring[rx_id];
		/* Stop when the NIC has not yet written back DD for this desc. */
		staterr = rxdp->wb.upper.status_error;
		if (!(staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
			break;
		/* Take a local copy of the completed descriptor. */
		rxd = *rxdp;
		...
		/* Allocate a fresh mbuf to refill this ring slot. */
		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
		...
		nb_hold++; /* count descriptors consumed from the ring */
		rxe = &sw_ring[rx_id]; /* SW entry holding the old (filled) mbuf */
		rx_id++; /* advance to the next descriptor; the ring wraps */
		if (rx_id == rxq->nb_rx_desc)
			rx_id = 0;
		...
		rte_ixgbe_prefetch(sw_ring[rx_id].mbuf); /* prefetch next mbuf */
		...
		/* Every 4 descriptors, prefetch the next cache line of both rings. */
		if ((rx_id & 0x3) == 0) {
			rte_ixgbe_prefetch(&rx_ring[rx_id]);
			rte_ixgbe_prefetch(&sw_ring[rx_id]);
		}
		...
		rxm = rxe->mbuf; /* rxm = old mbuf carrying the received frame */
		rxe->mbuf = nmb; /* slot now owns the freshly allocated mbuf */
		dma_addr =
			rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(nmb)); /* bus address of the new mbuf */
		rxdp->read.hdr_addr = 0; /* clears DD (hdr_addr overlays status in the union); NIC will re-read this desc */
		rxdp->read.pkt_addr = dma_addr; /* NIC will DMA the next frame here */
		...
		pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
				      rxq->crc_len); /* frame length minus CRC */
		rxm->data_off = RTE_PKTMBUF_HEADROOM;
		rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
		rxm->nb_segs = 1;
		rxm->next = NULL;
		rxm->pkt_len = pkt_len;
		rxm->data_len = pkt_len;
		rxm->port = rxq->port_id;
		...
		if (likely(pkt_flags & PKT_RX_RSS_HASH)) /* RSS hash present */
			rxm->hash.rss = rte_le_to_cpu_32(
						rxd.wb.lower.hi_dword.rss);
		else if (pkt_flags & PKT_RX_FDIR) { /* flow-director match */
			rxm->hash.fdir.hash = rte_le_to_cpu_16(
					rxd.wb.lower.hi_dword.csum_ip.csum) &
					IXGBE_ATR_HASH_MASK;
			rxm->hash.fdir.id = rte_le_to_cpu_16(
					rxd.wb.lower.hi_dword.csum_ip.ip_id);
		}
		...
		rx_pkts[nb_rx++] = rxm; /* hand the old mbuf to the application */
	}
	rxq->rx_tail = rx_id; /* remember the next descriptor to poll */
	...
	nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
	/* Once enough descriptors have been consumed (rx_free_thresh,
	 * default 32), return them to hardware by advancing RDT. */
	if (nb_hold > rxq->rx_free_thresh) {
		...
		rx_id = (uint16_t) ((rx_id == 0) ?
				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id); /* write rx_id into RDT */
		nb_hold = 0; /* all held descriptors handed back to HW */
	}
	rxq->nb_rx_hold = nb_hold; /* carry the remainder into the next call */
	return nb_rx;
}

ixgbe_xmit_pkts()

发送时回写的三种情况(默认为第一种,回写取决于RS):
1、TXDCTL[n].WTHRESH = 0 and a descriptor that has RS set is ready to be written back.
2、TXDCTL[n].WTHRESH > 0 and TXDCTL[n].WTHRESH descriptors have accumulated.
3、TXDCTL[n].WTHRESH > 0 and the corresponding EITR counter has reached zero. The timer expiration flushes any accumulated descriptors and sets an interrupt event (TXDW).

发送时回写:
1、挂载每个包的最后一个分段时,若自上次设置RS以来使用的desc数达到上限(tx_rs_thresh,默认为32),设置RS
2、burst发包的最后一个包的最后一个分段,设置RS

/** Scalar ixgbe TX burst: map each mbuf segment onto a TX descriptor,
 *  set EOP/RS bits, and kick the NIC by writing TDT.
 *  Excerpt — elisions ("...") omit context-descriptor and offload handling. */
uint16_t
ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
		uint16_t nb_pkts)
{
	...
	txq = tx_queue;
	sw_ring = txq->sw_ring;
	txr     = txq->tx_ring;
	tx_id   = txq->tx_tail; /* plays the role of ixgbe's next_to_use */
	txe = &sw_ring[tx_id]; /* SW entry that tx_tail points at */
	txp = NULL;
	...
	/* If free descriptors dropped below tx_free_thresh (default 32),
	 * try to reclaim completed ones. */
	if (txq->nb_tx_free < txq->tx_free_thresh)
		ixgbe_xmit_cleanup(txq);
	...
	/* TX loop */
	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
		...
		tx_pkt = *tx_pkts++; /* mbuf to transmit */
		pkt_len = tx_pkt->pkt_len; /* total packet length */
		...
		nb_used = (uint16_t)(tx_pkt->nb_segs + new_ctx); /* descriptors this packet needs */
		...
		tx_last = (uint16_t) (tx_id + nb_used - 1); /* index of the packet's last desc */
		...
		if (tx_last >= txq->nb_tx_desc) /* the ring wraps around */
			tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
		...
		if (nb_used > txq->nb_tx_free) {
			...
			if (ixgbe_xmit_cleanup(txq) != 0) {
				/* Could not clean any descriptors */
				if (nb_tx == 0) /* nothing sent yet: report 0 */
					return 0;
				goto end_of_tx; /* partial burst: stop and publish what was queued */
			}
			...
		}
		...
		/* A packet may span several segments; m_seg walks them. */
		m_seg = tx_pkt;
		do {
			txd = &txr[tx_id]; /* HW descriptor for this segment */
			txn = &sw_ring[txe->next_id]; /* next SW entry */
			...
			txe->mbuf = m_seg; /* attach the segment to the SW entry */
			...
			slen = m_seg->data_len; /* segment length */
			buf_dma_addr = rte_mbuf_data_dma_addr(m_seg); /* segment bus address */
			txd->read.buffer_addr =
				rte_cpu_to_le_64(buf_dma_addr); /* tell HW where to DMA from */
			txd->read.cmd_type_len =
				rte_cpu_to_le_32(cmd_type_len | slen); /* command bits + length */
			...
			txe->last_id = tx_last; /* remember the packet's last desc */
			tx_id = txe->next_id; /* advance to the next desc */
			txe = txn; /* advance to the next SW entry */
			m_seg = m_seg->next; /* advance to the next segment */
		} while (m_seg != NULL);
		...
		/* Last segment of the packet: mark End Of Packet. */
		cmd_type_len |= IXGBE_TXD_CMD_EOP;
		txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used); /* descs used since last RS */
		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used); /* remaining free descs */
		...
		if (txq->nb_tx_used >= txq->tx_rs_thresh) { /* reached RS threshold (default 32): request write-back */
			...
			cmd_type_len |= IXGBE_TXD_CMD_RS;
			...
			txp = NULL; /* NULL means RS already set for this stretch */
		} else
			txp = txd; /* non-NULL remembers the desc still lacking RS */
		...
		txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
	}
	...
end_of_tx:
	/* End of burst: ensure the final descriptor requests write-back. */
	...
	if (txp != NULL) /* RS not set yet on the last packet: set it now */
		txp->read.cmd_type_len |= rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
	...
	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id); /* write tx_id into TDT */
	txq->tx_tail = tx_id; /* remember the next desc to use */
	...
	return nb_tx;
}

ixgbe_xmit_cleanup()

/** Reclaim up to tx_rs_thresh TX descriptors whose DD bit the NIC has
 *  written back. Returns 0 on success, -1 if nothing could be cleaned.
 *  Excerpt — elisions ("...") omit locals and logging from the real code. */
static inline int
ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq)
{
	...
	uint16_t last_desc_cleaned = txq->last_desc_cleaned;
	...
	/* Candidate entry at which this cleanup round would end. */
	desc_to_clean_to = (uint16_t)(last_desc_cleaned + txq->tx_rs_thresh);
	if (desc_to_clean_to >= nb_tx_desc) /* the ring wraps around */
		desc_to_clean_to = (uint16_t)(desc_to_clean_to - nb_tx_desc);
	...
	/* Follow last_id to the final descriptor of that entry's packet. */
	desc_to_clean_to = sw_ring[desc_to_clean_to].last_id;
	status = txr[desc_to_clean_to].wb.status;
	/* If HW has not set DD on that descriptor, nothing to reclaim. */
	if (!(status & rte_cpu_to_le_32(IXGBE_TXD_STAT_DD))) {
		...
		return -(1);
	}
	...
	/* Number of descriptors to reclaim, accounting for ring wrap. */
	if (last_desc_cleaned > desc_to_clean_to)
		nb_tx_to_clean = (uint16_t)((nb_tx_desc - last_desc_cleaned) +
							desc_to_clean_to);
	else
		nb_tx_to_clean = (uint16_t)(desc_to_clean_to -
						last_desc_cleaned);
	...
	txr[desc_to_clean_to].wb.status = 0; /* clear DD for reuse */
	...
	txq->last_desc_cleaned = desc_to_clean_to; /* record cleanup progress */
	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + nb_tx_to_clean); /* return descs to the free pool */
	...
	return 0;
}

相关标签:DPDK