struct ixgbe_rx_queue {
struct rte_mempool *mb_pool; /**< mbuf pool to populate RX ring. */
volatile union ixgbe_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
uint64_t rx_ring_phys_addr; /**< RX ring DMA address. */
volatile uint32_t *rdt_reg_addr; /**< RDT register address. */
volatile uint32_t *rdh_reg_addr; /**< RDH register address. */
struct ixgbe_rx_entry *sw_ring; /**< address of RX software ring. */
struct ixgbe_scattered_rx_entry *sw_sc_ring; /**< address of scattered Rx software ring. */
struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
struct rte_mbuf *pkt_last_seg; /**< Last segment of current packet. */
uint64_t mbuf_initializer; /**< value to init mbufs */
uint16_t nb_rx_desc; /**< number of RX descriptors. */
uint16_t rx_tail; /**< current value of RDT register. */
uint16_t nb_rx_hold; /**< number of held free RX desc. */
uint16_t rx_nb_avail; /**< nr of staged pkts ready to ret to app */
uint16_t rx_next_avail; /**< idx of next staged pkt to ret to app */
uint16_t rx_free_trigger; /**< triggers rx buffer allocation */
uint16_t rx_using_sse; /**< indicates that vector RX is in use */
#ifdef RTE_IXGBE_INC_VECTOR
uint16_t rxrearm_nb; /**< number of remaining to be re-armed */
uint16_t rxrearm_start; /**< the idx we start the re-arming from */
#endif
uint16_t rx_free_thresh; /**< max free RX desc to hold. */
uint16_t queue_id; /**< RX queue index. */
uint16_t reg_idx; /**< RX queue register index. */
uint16_t pkt_type_mask; /**< Packet type mask for different NICs. */
uint8_t port_id; /**< Device port identifier. */
uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise. */
uint8_t drop_en; /**< If not 0, set SRRCTL.Drop_En. */
uint8_t rx_deferred_start; /**< not in global dev start. */
/** flags to set in mbuf when a vlan is detected. */
uint64_t vlan_flags;
/** need to alloc dummy mbuf, for wraparound when scanning hw ring */
struct rte_mbuf fake_mbuf;
/** hold packets to return to application */
struct rte_mbuf *rx_stage[RTE_PMD_IXGBE_RX_MAX_BURST*2];
};
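The staging fields (rx_stage, rx_nb_avail, rx_next_avail) only matter in the burst/bulk-alloc RX path, where descriptors are scanned in groups and the resulting mbufs are parked in rx_stage before being handed out. A minimal sketch of the hand-out step, assuming only the field semantics documented above (illustrative, not the PMD's exact code; the reset of rx_next_avail once the stage drains is omitted):
static inline uint16_t
rx_return_staged(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	uint16_t i;
	uint16_t n = (uint16_t)RTE_MIN(nb_pkts, rxq->rx_nb_avail);

	/* Copy out packets staged by an earlier scan of the HW ring */
	for (i = 0; i < n; i++)
		rx_pkts[i] = rxq->rx_stage[rxq->rx_next_avail + i];

	rxq->rx_nb_avail = (uint16_t)(rxq->rx_nb_avail - n);
	rxq->rx_next_avail = (uint16_t)(rxq->rx_next_avail + n);
	return n;
}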
struct ixgbe_tx_queue {
/** TX ring virtual address. */
volatile union ixgbe_adv_tx_desc *tx_ring;
uint64_t tx_ring_phys_addr; /**< TX ring DMA address. */
union {
struct ixgbe_tx_entry *sw_ring; /**< address of SW ring for scalar PMD. */
struct ixgbe_tx_entry_v *sw_ring_v; /**< address of SW ring for vector PMD */
};
volatile uint32_t *tdt_reg_addr; /**< Address of TDT register. */
uint16_t nb_tx_desc; /**< number of TX descriptors. */
uint16_t tx_tail; /**< current value of TDT reg. */
/** Start freeing TX buffers if there are fewer free descriptors than
this value. */
uint16_t tx_free_thresh;
/** Number of TX descriptors to use before RS bit is set. */
uint16_t tx_rs_thresh;
/** Number of TX descriptors used since RS bit was set. */
uint16_t nb_tx_used;
/** Index to last TX descriptor to have been cleaned. */
uint16_t last_desc_cleaned;
/** Total number of TX descriptors ready to be allocated. */
uint16_t nb_tx_free;
uint16_t tx_next_dd; /**< next desc to scan for DD bit */
uint16_t tx_next_rs; /**< next desc to set RS bit */
uint16_t queue_id; /**< TX queue index. */
uint16_t reg_idx; /**< TX queue register index. */
uint8_t port_id; /**< Device port identifier. */
uint8_t pthresh; /**< Prefetch threshold register. */
uint8_t hthresh; /**< Host threshold register. */
uint8_t wthresh; /**< Write-back threshold reg. */
uint32_t txq_flags; /**< Holds flags for this TXq */
uint32_t ctx_curr; /**< Hardware context states. */
/** Hardware context0 history. */
struct ixgbe_advctx_info ctx_cache[IXGBE_CTX_NUM];
const struct ixgbe_txq_ops *ops; /**< txq ops */
uint8_t tx_deferred_start; /**< not in global dev start. */
};
struct ixgbe_rx_entry {
struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};
struct ixgbe_tx_entry {
struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
uint16_t next_id; /**< Index of next descriptor in ring. */
uint16_t last_id; /**< Index of last scattered descriptor. */
};
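next_id and last_id give the TX software ring its structure: next_id links the entries into a circle, and last_id records, for each entry used by a packet, the index of that packet's final descriptor, so cleanup can poll a single DD bit per packet. A sketch of the initialization, modeled on the PMD's queue-reset logic (illustrative, not verbatim; the helper name is hypothetical):
static void
tx_sw_ring_init(struct ixgbe_tx_queue *txq)
{
	uint16_t prev = (uint16_t)(txq->nb_tx_desc - 1);
	uint16_t i;

	for (i = 0; i < txq->nb_tx_desc; i++) {
		txq->sw_ring[i].mbuf = NULL;
		txq->sw_ring[i].last_id = i; /* points at itself until a packet spans it */
		txq->sw_ring[prev].next_id = i; /* circular linkage */
		prev = i;
	}
}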
static inline uint16_t
rte_eth_rx_burst(uint8_t port_id, uint16_t queue_id,
struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
/* Look up the device for this port_id */
struct rte_eth_dev *dev = &rte_eth_devices[port_id];
...
/* For ixgbe this resolves to ixgbe_recv_pkts() */
int16_t nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
rx_pkts, nb_pkts);
...
}
static inline uint16_t
rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
/* Look up the device for this port_id */
struct rte_eth_dev *dev = &rte_eth_devices[port_id];
...
/* For ixgbe this resolves to ixgbe_xmit_pkts() */
return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
}
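For context, an application drives these two entry points from a polling loop. A minimal run-to-completion sketch over one RX/TX queue pair; BURST_SIZE and the use of queue 0 are illustrative choices, not anything the PMD mandates:
#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST_SIZE 32 /* illustrative burst size */

static void
fwd_loop(uint8_t port_id)
{
	struct rte_mbuf *bufs[BURST_SIZE];

	for (;;) {
		uint16_t nb_rx = rte_eth_rx_burst(port_id, 0, bufs, BURST_SIZE);
		uint16_t nb_tx = rte_eth_tx_burst(port_id, 0, bufs, nb_rx);

		/* Free whatever the TX ring could not accept */
		while (nb_tx < nb_rx)
			rte_pktmbuf_free(bufs[nb_tx++]);
	}
}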
Write-back on receive:
1. The NIC DMAs the frame from its Rx FIFO into the mbuf attached to a descriptor in the Rx ring, then sets the descriptor's DD bit to 1.
2. After the driver harvests the mbuf, it clears the descriptor's DD bit back to 0 and updates RDT. The two descriptor views involved are sketched below.
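ixgbe_recv_pkts() below juggles two views of the same 16-byte descriptor: software fills the read format, and the NIC overwrites it with the write-back format. An abridged and slightly simplified rendering of the union (see ixgbe_type.h for the authoritative definition):
union ixgbe_adv_rx_desc {
	struct { /* written by software */
		__le64 pkt_addr; /* DMA address of the packet buffer */
		__le64 hdr_addr; /* header buffer address; DD must be 0 */
	} read;
	struct { /* written back by the NIC */
		struct {
			__le32 lo_dword; /* packet type / RSS type (abridged) */
			union {
				__le32 rss; /* RSS hash */
				struct {
					__le16 ip_id; /* reused as the FDIR id */
					__le16 csum; /* reused as the FDIR hash */
				} csum_ip;
			} hi_dword;
		} lower;
		struct {
			__le32 status_error; /* status, including the DD bit */
			__le16 length; /* frame length */
			__le16 vlan; /* stripped VLAN tag */
		} upper;
	} wb;
};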
uint16_t
ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
{
...
nb_rx = 0;
nb_hold = 0;
rxq = rx_queue;
rx_id = rxq->rx_tail; /* equivalent to the kernel ixgbe driver's next_to_clean */
rx_ring = rxq->rx_ring;
sw_ring = rxq->sw_ring;
...
while (nb_rx < nb_pkts) {
...
/* Pointer to the descriptor at rx_id */
rxdp = &rx_ring[rx_id];
/* If the NIC has not written the DD bit back yet, the descriptor is not ready: stop */
staterr = rxdp->wb.upper.status_error;
if (!(staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
break;
/* Copy the descriptor contents */
rxd = *rxdp;
...
/* Allocate a replacement mbuf */
nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
...
nb_hold++; /* count descriptors taken from the NIC but not yet returned to it */
rxe = &sw_ring[rx_id]; /* sw_ring entry holding the old (filled) mbuf */
rx_id++; /* advance to the next descriptor; the ring wraps around */
if (rx_id == rxq->nb_rx_desc)
rx_id = 0;
...
rte_ixgbe_prefetch(sw_ring[rx_id].mbuf); /* prefetch the next mbuf */
...
/* Every 4th descriptor, prefetch the next cache line of descriptors
(4 x 16-byte descriptors per 64-byte line) and the matching sw_ring entries */
if ((rx_id & 0x3) == 0) {
rte_ixgbe_prefetch(&rx_ring[rx_id]);
rte_ixgbe_prefetch(&sw_ring[rx_id]);
}
...
rxm = rxe->mbuf; /* rxm points to the old, NIC-filled mbuf */
rxe->mbuf = nmb; /* hand the new mbuf to the ring slot */
dma_addr =
rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(nmb)); /* bus address of the new mbuf's data buffer */
rxdp->read.hdr_addr = 0; /* zeroing hdr_addr clears DD; the NIC will read this descriptor again */
rxdp->read.pkt_addr = dma_addr; /* point the descriptor at the new buffer for the NIC to fill */
...
pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
rxq->crc_len); /* frame length minus CRC */
rxm->data_off = RTE_PKTMBUF_HEADROOM;
rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
rxm->nb_segs = 1;
rxm->next = NULL;
rxm->pkt_len = pkt_len;
rxm->data_len = pkt_len;
rxm->port = rxq->port_id;
...
if (likely(pkt_flags & PKT_RX_RSS_HASH)) /* RSS */
rxm->hash.rss = rte_le_to_cpu_32(
rxd.wb.lower.hi_dword.rss);
else if (pkt_flags & PKT_RX_FDIR) { /* FDIR */
rxm->hash.fdir.hash = rte_le_to_cpu_16(
rxd.wb.lower.hi_dword.csum_ip.csum) &
IXGBE_ATR_HASH_MASK;
rxm->hash.fdir.id = rte_le_to_cpu_16(
rxd.wb.lower.hi_dword.csum_ip.ip_id);
}
...
rx_pkts[nb_rx++] = rxm; /* hand the filled mbuf to the caller */
}
rxq->rx_tail = rx_id; /* remember the next descriptor to poll */
...
nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
/* Once enough descriptors are held back (rx_free_thresh, default 32), return them to the NIC by updating RDT */
if (nb_hold > rxq->rx_free_thresh) {
...
rx_id = (uint16_t) ((rx_id == 0) ?
(rxq->nb_rx_desc - 1) : (rx_id - 1));
IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id); /* write the index just behind rx_tail into RDT */
nb_hold = 0; /* reset nb_hold */
}
rxq->nb_rx_hold = nb_hold; /* carry the remainder over to the next call */
return nb_rx;
}
Three cases trigger descriptor write-back on transmit (the first is the default; write-back depends on the RS bit). The WTHRESH value comes from ixgbe_tx_queue's wthresh field, programmed into TXDCTL as sketched after this list:
1. TXDCTL[n].WTHRESH = 0 and a descriptor that has RS set is ready to be written back.
2. TXDCTL[n].WTHRESH > 0 and TXDCTL[n].WTHRESH descriptors have accumulated.
3. TXDCTL[n].WTHRESH > 0 and the corresponding EITR counter has reached zero. The timer expiration flushes any accumulated descriptors and sets an interrupt event (TXDW).
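The pthresh/hthresh/wthresh values stored in ixgbe_tx_queue end up in TXDCTL when the queue is started. A sketch of that programming step, assuming the 82599 field layout (PTHRESH in bits 6:0, HTHRESH in bits 14:8, WTHRESH in bits 22:16); the helper name is hypothetical:
static void
txq_program_txdctl(struct ixgbe_hw *hw, struct ixgbe_tx_queue *txq)
{
	uint32_t txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(txq->reg_idx));

	txdctl |= (uint32_t)(txq->pthresh & 0x7F); /* prefetch threshold */
	txdctl |= ((uint32_t)(txq->hthresh & 0x7F) << 8); /* host threshold */
	txdctl |= ((uint32_t)(txq->wthresh & 0x7F) << 16); /* write-back threshold */
	IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(txq->reg_idx), txdctl);
}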
The driver requests write-back (sets RS) in two places:
1. When attaching the last segment of a packet: if the descriptors used since the last RS (nb_tx_used) have reached the threshold (tx_rs_thresh, default 32), set RS.
2. On the last segment of the last packet of the burst, set RS if it was not already set.
uint16_t
ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
{
...
txq = tx_queue;
sw_ring = txq->sw_ring;
txr = txq->tx_ring;
tx_id = txq->tx_tail; /* equivalent to the kernel ixgbe driver's next_to_use */
txe = &sw_ring[tx_id]; /* sw_ring entry at tx_tail */
txp = NULL;
...
/* If free descriptors have fallen below tx_free_thresh (default 32), reclaim descriptors the NIC has finished with */
if (txq->nb_tx_free < txq->tx_free_thresh)
ixgbe_xmit_cleanup(txq);
...
/* TX loop */
for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
...
tx_pkt = *tx_pkts++; /* the mbuf to transmit */
pkt_len = tx_pkt->pkt_len; /* its total packet length */
...
nb_used = (uint16_t)(tx_pkt->nb_segs + new_ctx); /* descriptors needed: one per segment, plus an optional context descriptor */
...
tx_last = (uint16_t) (tx_id + nb_used - 1); /* index of this packet's last descriptor */
...
if (tx_last >= txq->nb_tx_desc) /* the ring wraps around */
tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
...
if (nb_used > txq->nb_tx_free) {
...
if (ixgbe_xmit_cleanup(txq) != 0) {
/* Could not clean any descriptors */
if (nb_tx == 0) /* nothing sent yet: report 0 */
return 0;
goto end_of_tx; /* some packets already queued: stop here and commit them */
}
...
}
...
/* A packet may consist of several segments; m_seg walks the chain from the first */
m_seg = tx_pkt;
do {
txd = &txr[tx_id]; /* hardware descriptor for this segment */
txn = &sw_ring[txe->next_id]; /* next sw_ring entry */
...
txe->mbuf = m_seg; /* attach m_seg to this entry */
...
slen = m_seg->data_len; /* segment length */
buf_dma_addr = rte_mbuf_data_dma_addr(m_seg); /* segment's bus address */
txd->read.buffer_addr =
rte_cpu_to_le_64(buf_dma_addr); /* bus address into the descriptor */
txd->read.cmd_type_len =
rte_cpu_to_le_32(cmd_type_len | slen); /* length and command flags into the descriptor */
...
txe->last_id = tx_last; /* every entry of this packet records the packet's last descriptor */
tx_id = txe->next_id; /* advance to the next descriptor */
txe = txn; /* advance to the next entry */
m_seg = m_seg->next; /* advance to the next segment */
} while (m_seg != NULL);
...
/* last segment of this packet */
cmd_type_len |= IXGBE_TXD_CMD_EOP;
txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used); /* descriptors used since the last RS */
txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used); /* free descriptors remaining */
...
if (txq->nb_tx_used >= txq->tx_rs_thresh) { /* threshold reached (tx_rs_thresh, default 32): request write-back by setting RS */
...
cmd_type_len |= IXGBE_TXD_CMD_RS;
...
txp = NULL; /* txp == NULL means RS has been set */
} else
txp = txd; /* non-NULL txp remembers the descriptor that still lacks RS */
...
txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
}
...
end_of_tx:
/* last segment of the last packet of the burst */
...
if (txp != NULL) /* if RS was never set in the loop, set it on the final descriptor */
txp->read.cmd_type_len |= rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
...
IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id); /* write tx_id into TDT, handing the descriptors to the NIC */
txq->tx_tail = tx_id; /* remember the next free descriptor */
...
return nb_tx;
}
static inline int
ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq)
{
...
uint16_t last_desc_cleaned = txq->last_desc_cleaned;
...
/* Aim to clean up to tx_rs_thresh descriptors beyond the last cleaned one */
desc_to_clean_to = (uint16_t)(last_desc_cleaned + txq->tx_rs_thresh);
if (desc_to_clean_to >= nb_tx_desc) /* the ring wraps around */
desc_to_clean_to = (uint16_t)(desc_to_clean_to - nb_tx_desc);
...
/* Map through last_id to the final descriptor of the packet occupying that slot */
desc_to_clean_to = sw_ring[desc_to_clean_to].last_id;
status = txr[desc_to_clean_to].wb.status;
/* If that descriptor's DD bit is 0, the NIC has not finished it: fail with -1 */
if (!(status & rte_cpu_to_le_32(IXGBE_TXD_STAT_DD))) {
...
return -(1);
}
...
/* Number of descriptors about to be cleaned */
if (last_desc_cleaned > desc_to_clean_to) /* the range wraps past the ring end */
nb_tx_to_clean = (uint16_t)((nb_tx_desc - last_desc_cleaned) +
desc_to_clean_to);
else
nb_tx_to_clean = (uint16_t)(desc_to_clean_to -
last_desc_cleaned);
...
txr[desc_to_clean_to].wb.status = 0; /* clear DD so this slot is not mistaken for done again */
...
txq->last_desc_cleaned = desc_to_clean_to; /* record the new cleanup position */
txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + nb_tx_to_clean); /* return the cleaned descriptors to the free pool */
...
return 0;
}
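A worked example of the wraparound arithmetic above, with hypothetical numbers (nb_tx_desc = 512, last_desc_cleaned = 500, tx_rs_thresh = 32; the indirection through sw_ring[].last_id is ignored for brevity):
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t nb_tx_desc = 512, last_desc_cleaned = 500, tx_rs_thresh = 32;
	uint16_t desc_to_clean_to, nb_tx_to_clean;

	desc_to_clean_to = (uint16_t)(last_desc_cleaned + tx_rs_thresh); /* 532 */
	if (desc_to_clean_to >= nb_tx_desc)
		desc_to_clean_to = (uint16_t)(desc_to_clean_to - nb_tx_desc); /* wraps to 20 */

	if (last_desc_cleaned > desc_to_clean_to) /* 500 > 20: the range wraps */
		nb_tx_to_clean = (uint16_t)((nb_tx_desc - last_desc_cleaned) +
				desc_to_clean_to); /* 12 + 20 = 32 */
	else
		nb_tx_to_clean = (uint16_t)(desc_to_clean_to - last_desc_cleaned);

	printf("clean up to desc %u, freeing %u descriptors\n",
		desc_to_clean_to, nb_tx_to_clean); /* prints 20 and 32 */
	return 0;
}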