dpdk mbuf之结构体学习

struct rte_mbuf

struct rte_mbuf的源码定义如下。由于成员越来越多,struct rte_mbuf在定义时有意跨越了两个cache line的大小,
并通过MARKER cacheline0和MARKER cacheline1来标识这两个cache line:cacheline0到cacheline1之间的成员都位于cache line 0,原则上基础性的、被频繁访问的数据都放在cache line 0里面;而cacheline1之后的成员都位于cache line 1。

/**
 * The generic packet buffer (mbuf) descriptor.
 *
 * The layout deliberately spans two cache lines, delimited by the
 * cacheline0 and cacheline1 markers: the fields between the two markers
 * sit in the first cache line, while the fields after cacheline1 (used
 * only in the slow path or on TX) sit in the second one.
 */
struct rte_mbuf {
	MARKER cacheline0;

	void *buf_addr;           /**< Virtual address of segment buffer. */
	/**
	 * Physical address of segment buffer.
	 * Force alignment to 8-bytes, so as to ensure we have the exact
	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
	 * working on vector drivers easier.
	 */
	RTE_STD_C11
	union {
		rte_iova_t buf_iova;
		rte_iova_t buf_physaddr; /**< deprecated */
	} __rte_aligned(sizeof(rte_iova_t));

	/* next 8 bytes are initialised on RX descriptor rearm */
	MARKER64 rearm_data;
	uint16_t data_off;

	/**
	 * Reference counter. Its size should at least equal to the size
	 * of port field (16 bits), to support zero-copy broadcast.
	 * It should only be accessed using the following functions:
	 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
	 * rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
	 * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC
	 * config option.
	 */
	RTE_STD_C11
	union {
		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
		uint16_t refcnt;              /**< Non-atomically accessed refcnt */
	};
	uint16_t nb_segs;         /**< Number of segments. */

	/** Input port (16 bits to support more than 256 virtual ports).
	 * The event eth Tx adapter uses this field to specify the output port.
	 */
	uint16_t port;

	uint64_t ol_flags;        /**< Offload features. */

	/* remaining bytes are set on RX when pulling packet from descriptor */
	MARKER rx_descriptor_fields1;

	/*
	 * The packet type, which is the combination of outer/inner L2, L3, L4
	 * and tunnel types. The packet_type is about data really present in the
	 * mbuf. Example: if vlan stripping is enabled, a received vlan packet
	 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
	 * vlan is stripped from the data.
	 */
	RTE_STD_C11
	union {
		uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
		struct {
			uint32_t l2_type:4; /**< (Outer) L2 type. */
			uint32_t l3_type:4; /**< (Outer) L3 type. */
			uint32_t l4_type:4; /**< (Outer) L4 type. */
			uint32_t tun_type:4; /**< Tunnel type. */
			RTE_STD_C11
			union {
				uint8_t inner_esp_next_proto;
				/**< ESP next protocol type, valid if
				 * RTE_PTYPE_TUNNEL_ESP tunnel type is set
				 * on both Tx and Rx.
				 */
				__extension__
				struct {
					uint8_t inner_l2_type:4;
					/**< Inner L2 type. */
					uint8_t inner_l3_type:4;
					/**< Inner L3 type. */
				};
			};
			uint32_t inner_l4_type:4; /**< Inner L4 type. */
		};
	};

	uint32_t pkt_len;         /**< Total pkt len: sum of all segments. */
	uint16_t data_len;        /**< Amount of data in segment buffer. */
	/** VLAN TCI (CPU order), valid if PKT_RX_VLAN is set. */
	uint16_t vlan_tci;

	RTE_STD_C11
	union {
		union {
			uint32_t rss;     /**< RSS hash result if RSS enabled */
			struct {
				union {
					struct {
						uint16_t hash;
						uint16_t id;
					};
					uint32_t lo;
					/**< Second 4 flexible bytes */
				};
				uint32_t hi;
				/**< First 4 flexible bytes or FD ID, dependent
				 * on PKT_RX_FDIR_* flag in ol_flags.
				 */
			} fdir;	/**< Filter identifier if FDIR enabled */
			struct {
				uint32_t lo;
				uint32_t hi;
				/**< The event eth Tx adapter uses this field
				 * to store Tx queue id.
				 * @see rte_event_eth_tx_adapter_txq_set()
				 */
			} sched;          /**< Hierarchical scheduler */
			/**< User defined tags. See rte_distributor_process() */
			uint32_t usr;
		} hash;                   /**< hash information */
		struct {
			/**
			 * Application specific metadata value
			 * for egress flow rule match.
			 * Valid if PKT_TX_METADATA is set.
			 * Located here to allow conjunct use
			 * with hash.sched.hi.
			 */
			uint32_t tx_metadata;
			uint32_t reserved;
		};
	};

	/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ is set. */
	uint16_t vlan_tci_outer;

	uint16_t buf_len;         /**< Length of segment buffer. */

	/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
	 * are not normalized but are always the same for a given port.
	 */
	uint64_t timestamp;

	/* second cache line - fields only used in slow path or on TX */
	MARKER cacheline1 __rte_cache_min_aligned;

	RTE_STD_C11
	union {
		void *userdata;   /**< Can be used for external metadata */
		uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */
	};

	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
	struct rte_mbuf *next;    /**< Next segment of scattered packet. */

	/* fields to support TX offloads */
	RTE_STD_C11
	union {
		uint64_t tx_offload;       /**< combined for easy fetch */
		__extension__
		struct {
			uint64_t l2_len:7;
			/**< L2 (MAC) Header Length for non-tunneling pkt.
			 * Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
			 */
			uint64_t l3_len:9; /**< L3 (IP) Header Length. */
			uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
			uint64_t tso_segsz:16; /**< TCP TSO segment size */

			/* fields for TX offloading of tunnels */
			uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */
			uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */

			/* uint64_t unused:8; */
		};
	};

	/** Size of the application private data. In case of an indirect
	 * mbuf, it stores the direct mbuf private data size. */
	uint16_t priv_size;

	/** Timesync flags for use with IEEE1588. */
	uint16_t timesync;

	/** Sequence number. See also rte_reorder_insert(). */
	uint32_t seqn;

	/** Shared data for external buffer attached to mbuf. See
	 * rte_pktmbuf_attach_extbuf().
	 */
	struct rte_mbuf_ext_shared_info *shinfo;

} __rte_cache_aligned;

buf_addr

当前mbuf数据buf的虚拟地址。对于标准的mbuf,buf_addr指向的内存位于mbuf起始地址之后,偏移量为一个mbuf头(sizeof(struct rte_mbuf))加上私有数据的大小(priv_size)。如下所示:
m->buf_addr = (char *)m + sizeof(struct rte_mbuf) + priv_size;
初始化这个变量是在我们创建mbuf的mempool的时候完成的

rte_pktmbuf_pool_create
    rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
        rte_pktmbuf_init
            	m->buf_addr = (char *)m + mbuf_size;

相关api:
rte_pktmbuf_mtod 返回mbuf数据buf的开始位置,已经做了data_off偏移。

buf的物理地址

union {
	rte_iova_t buf_iova;
	rte_iova_t buf_physaddr; /**< deprecated */
} __rte_aligned(sizeof(rte_iova_t));

mbuf对应的物理地址,一般mbuf物理地址在初始化mempool的时候就设置了,在mbuf对应obj的head里面存放,如下结构体的objhdr里面的iova/physaddr

/**
 * Mempool object header.
 *
 * Links the object into a list, records the mempool that owns it and
 * the object's IO address. The debug cookie field exists only when
 * RTE_LIBRTE_MEMPOOL_DEBUG is defined.
 */
struct rte_mempool_objhdr {
	STAILQ_ENTRY(rte_mempool_objhdr) next; /**< Next in list. */
	struct rte_mempool *mp;          /**< The mempool owning the object. */
	RTE_STD_C11
	union {
		/* Both members alias the same storage; physaddr is the old name. */
		rte_iova_t iova;         /**< IO address of the object. */
		phys_addr_t physaddr;    /**< deprecated - Physical address of the object. */
	};
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
	uint64_t cookie;                 /**< Debug cookie. */
#endif
};

这个转化关系如下:
m->buf_iova = rte_mempool_virt2iova(m) + sizeof(struct rte_mbuf) + priv_size;

data_off

这个变量是标识mbuf的data room开始地址到报文起始位置的偏移,默认是设置为RTE_PKTMBUF_HEADROOM(128),
我们在创建一个mbuf的mem pool的时候,会指定data room的大小,如下所示的data_room_size参数,

/**
 * Create a packet mbuf pool.
 *
 * Thin wrapper: forwards every argument unchanged to
 * rte_pktmbuf_pool_create_by_ops() and passes NULL as its last argument.
 *
 * @param name            Name given to the pool.
 * @param n               Number of elements in the pool.
 * @param cache_size      Cache size, forwarded as-is.
 * @param priv_size       Size of the application private data per mbuf.
 * @param data_room_size  Size of the data buffer of each mbuf.
 * @param socket_id       Socket identifier, forwarded as-is.
 * @return Whatever rte_pktmbuf_pool_create_by_ops() returns.
 */
struct rte_mempool *
rte_pktmbuf_pool_create(const char *name, unsigned int n,
	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
	int socket_id)
{
	struct rte_mempool *mp;

	/* NOTE(review): NULL presumably selects the default ops - confirm. */
	mp = rte_pktmbuf_pool_create_by_ops(name, n, cache_size, priv_size,
			data_room_size, socket_id, NULL);

	return mp;
}

data_room_size标识每一个mbuf的数据报文的最大值,一般会设置大于一个mtu+128B的头部预留空间
dpdk提供一个默认宏定义:

#define    RTE_PKTMBUF_HEADROOM 128
#define	RTE_MBUF_DEFAULT_DATAROOM	2048
#define	RTE_MBUF_DEFAULT_BUF_SIZE	(RTE_MBUF_DEFAULT_DATAROOM + RTE_PKTMBUF_HEADROOM)

所以当我们从mbuf pool alloc一块mbuf过来的时候,都会reset一下mbuf的变量,里面就包含了重置data_off,具体如下:

/**
 * Reset the data offset of an mbuf to its default headroom.
 *
 * data_off becomes RTE_PKTMBUF_HEADROOM, clamped to buf_len so that the
 * offset never points past the end of the segment buffer.
 *
 * @param m  The mbuf whose data_off is rewritten.
 */
static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m)
{
	const uint16_t headroom = (uint16_t)RTE_PKTMBUF_HEADROOM;
	const uint16_t buf_len = (uint16_t)m->buf_len;

	/* Take the smaller of the two values. */
	m->data_off = (uint16_t)(headroom < buf_len ? headroom : buf_len);
}
/**
 * Reset the per-packet fields of an mbuf to their default values.
 *
 * Afterwards the mbuf is a single, unchained segment holding no data,
 * with no offload flags, packet type or VLAN information, an invalid
 * port, and data_off set back by rte_pktmbuf_reset_headroom().
 *
 * @param m  The mbuf to reset.
 */
static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
{
	/* Segment chain: one segment, nothing linked behind it. */
	m->nb_segs = 1;
	m->next = NULL;

	/* No payload yet; data offset back to the default headroom. */
	m->data_len = 0;
	m->pkt_len = 0;
	rte_pktmbuf_reset_headroom(m);

	/* Clear per-packet metadata. */
	m->port = MBUF_INVALID_PORT;
	m->packet_type = 0;
	m->ol_flags = 0;
	m->tx_offload = 0;
	m->vlan_tci_outer = 0;
	m->vlan_tci = 0;

	/* Kept last so the check sees the fully reset mbuf. */
	__rte_mbuf_sanity_check(m, 1);
}

mbuf引用计数 refcnt

用来表示mbuf被引用的次数。在mbuf被释放回mempool的时候,会通过RTE_ASSERT检查引用计数必须为1,否则断言失败(RTE_ASSERT只有在开启断言的构建中才会生效)。

/**
 * Return an mbuf to the mempool it was allocated from.
 *
 * Asserts that the mbuf is direct, has a reference count of exactly 1,
 * is not chained to a next segment and has nb_segs == 1, then runs the
 * mbuf sanity check and puts the object back into m->pool.
 *
 * @param m  The mbuf to give back.
 */
static __rte_always_inline void
rte_mbuf_raw_free(struct rte_mbuf *m)
{
	struct rte_mempool *owner;

	/* Preconditions, checked in this order. */
	RTE_ASSERT(RTE_MBUF_DIRECT(m));
	RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
	RTE_ASSERT(m->next == NULL);
	RTE_ASSERT(m->nb_segs == 1);
	__rte_mbuf_sanity_check(m, 0);

	/* The owning pool is recorded in the mbuf itself. */
	owner = m->pool;
	rte_mempool_put(owner, m);
}

具体相关的api:

static inline uint16_t rte_mbuf_refcnt_read(const struct rte_mbuf *m)
static inline void rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
static inline uint16_t rte_mbuf_refcnt_update(struct rte_mbuf *m, int16_t value)

mbuf分段存储

涉及到的参数有:
uint16_t nb_segs 表示当前的mbuf报文有多少个分段
struct rte_mbuf *next 表示下一个分段的地址,单向链表连接,如下图所示
dpdk mbuf之结构体学习_第1张图片

端口号

uint16_t port u16的端口号,表示输入输出端口号,无效值是UINT16_MAX

卸载特性标识 ol_flags

dpdk用一个u64来定义这个flag,使用的时候是按bit使用的,要么就是某个比特表示特定的意思,要么就是几个比特的组合表示特定的意思, 具体定义如下:

/*
 * ol_flags bit definitions. Names starting with PKT_RX_ occupy the low
 * bits, names starting with PKT_TX_ occupy the high bits.
 */

/* ---- RX flags: single-bit indications ---- */
#define PKT_RX_VLAN                     (1ULL << 0)
#define PKT_RX_RSS_HASH                 (1ULL << 1)  /**< RX packet with RSS hash result. */
#define PKT_RX_FDIR                     (1ULL << 2)  /**< RX packet with FDIR match indicate. */
#define PKT_RX_EIP_CKSUM_BAD            (1ULL << 5)  /**< External IP header checksum error. */
#define PKT_RX_VLAN_STRIPPED            (1ULL << 6)
#define PKT_RX_IEEE1588_PTP             (1ULL << 9)  /**< RX IEEE1588 L2 Ethernet PT Packet. */
#define PKT_RX_IEEE1588_TMST            (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet.*/
#define PKT_RX_FDIR_ID                  (1ULL << 13) /**< FD id reported if FDIR match. */
#define PKT_RX_FDIR_FLX                 (1ULL << 14) /**< Flexible bytes reported if FDIR match. */
#define PKT_RX_QINQ_STRIPPED            (1ULL << 15)
#define PKT_RX_LRO                      (1ULL << 16)
#define PKT_RX_TIMESTAMP                (1ULL << 17)
#define PKT_RX_SEC_OFFLOAD              (1ULL << 18)
#define PKT_RX_SEC_OFFLOAD_FAILED       (1ULL << 19)
#define PKT_RX_QINQ                     (1ULL << 20)

/* ---- RX IP checksum status: two-bit field made of bits 4 and 7 ---- */
#define PKT_RX_IP_CKSUM_MASK            ((1ULL << 4) | (1ULL << 7))
#define PKT_RX_IP_CKSUM_BAD             (1ULL << 4)
#define PKT_RX_IP_CKSUM_GOOD            (1ULL << 7)
#define PKT_RX_IP_CKSUM_NONE            ((1ULL << 4) | (1ULL << 7))

/* ---- RX L4 checksum status: two-bit field made of bits 3 and 8 ---- */
#define PKT_RX_L4_CKSUM_MASK            ((1ULL << 3) | (1ULL << 8))
#define PKT_RX_L4_CKSUM_BAD             (1ULL << 3)
#define PKT_RX_L4_CKSUM_GOOD            (1ULL << 8)
#define PKT_RX_L4_CKSUM_NONE            ((1ULL << 3) | (1ULL << 8))

/* ---- RX outer L4 checksum status: two-bit field made of bits 21 and 22 ---- */
#define PKT_RX_OUTER_L4_CKSUM_MASK      ((1ULL << 21) | (1ULL << 22))
#define PKT_RX_OUTER_L4_CKSUM_UNKNOWN   0
#define PKT_RX_OUTER_L4_CKSUM_BAD       (1ULL << 21)
#define PKT_RX_OUTER_L4_CKSUM_GOOD      (1ULL << 22)
#define PKT_RX_OUTER_L4_CKSUM_INVALID   ((1ULL << 21) | (1ULL << 22))

/* ---- TX flags: single-bit requests ---- */
#define PKT_TX_METADATA                 (1ULL << 40)
#define PKT_TX_OUTER_UDP_CKSUM          (1ULL << 41)
#define PKT_TX_UDP_SEG                  (1ULL << 42)
#define PKT_TX_SEC_OFFLOAD              (1ULL << 43)
#define PKT_TX_MACSEC                   (1ULL << 44)
#define PKT_TX_QINQ                     (1ULL << 49) /**< TX packet with double VLAN inserted. */
#define PKT_TX_TCP_SEG                  (1ULL << 50)
#define PKT_TX_IEEE1588_TMST            (1ULL << 51) /**< TX IEEE1588 packet to timestamp. */
#define PKT_TX_IP_CKSUM                 (1ULL << 54)
#define PKT_TX_IPV4                     (1ULL << 55)
#define PKT_TX_IPV6                     (1ULL << 56)
#define PKT_TX_VLAN                     (1ULL << 57)
#define PKT_TX_OUTER_IP_CKSUM           (1ULL << 58)
#define PKT_TX_OUTER_IPV4               (1ULL << 59)
#define PKT_TX_OUTER_IPV6               (1ULL << 60)

/*
 * ---- TX tunnel type: four-bit field starting at bit 45 ----
 * These are enumerated values inside the field, not independent bits;
 * compare (ol_flags & PKT_TX_TUNNEL_MASK) against one of them.
 */
#define PKT_TX_TUNNEL_VXLAN             (0x1ULL << 45)
#define PKT_TX_TUNNEL_GRE               (0x2ULL << 45)
#define PKT_TX_TUNNEL_IPIP              (0x3ULL << 45)
#define PKT_TX_TUNNEL_GENEVE            (0x4ULL << 45)
#define PKT_TX_TUNNEL_MPLSINUDP         (0x5ULL << 45)
#define PKT_TX_TUNNEL_VXLAN_GPE         (0x6ULL << 45)
#define PKT_TX_TUNNEL_IP                (0xDULL << 45)
#define PKT_TX_TUNNEL_UDP               (0xEULL << 45)
#define PKT_TX_TUNNEL_MASK              (0xFULL << 45)

/* ---- TX L4 checksum request: two-bit field starting at bit 52 ---- */
#define PKT_TX_L4_NO_CKSUM              (0ULL << 52) /**< Disable L4 cksum of TX pkt. */
#define PKT_TX_TCP_CKSUM                (1ULL << 52) /**< TCP cksum of TX pkt. computed by NIC. */
#define PKT_TX_SCTP_CKSUM               (2ULL << 52) /**< SCTP cksum of TX pkt. computed by NIC. */
#define PKT_TX_UDP_CKSUM                (3ULL << 52) /**< UDP cksum of TX pkt. computed by NIC. */
#define PKT_TX_L4_MASK                  (3ULL << 52) /**< Mask for L4 cksum offload request. */

/* ---- mbuf attachment flags ---- */
#define EXT_ATTACHED_MBUF               (1ULL << 61)
#define IND_ATTACHED_MBUF               (1ULL << 62) /**< Indirect attached mbuf */

报文类型packet_type

主要用来表示报文的L2/L3/L4 and tunnel information
具体定义在 dpdk/lib/librte_mbuf/rte_mbuf_ptype.h里面

报文长度信息

具体涉及到的变量有:
uint32_t pkt_len 表示总的报文大小的长度,包含所有seg分段报文的报文长度
uint16_t data_len 表示当前mbuf的报文数据长度
uint16_t buf_len 表示当前mbuf整个数据buf(segment buffer)的长度,即headroom加上其后数据空间的总容量,由创建mbuf pool时指定的data_room_size决定,与当前实际存放的data_len无关
这里把buf_len一起列出来对比,是因为这三个都是mbuf里面表示长度的变量,容易混淆,放在一起比较更容易理解。

vlan信息

主要涉及的变量有:
uint16_t vlan_tci 表示vlan报文的tci,如果ol_flags设置了PKT_RX_VLAN,这个变量才有意义
uint16_t vlan_tci_outer 表示qinq报文的外层vlan信息,如果ol_flags设置了PKT_RX_QINQ,这个变量才有意义

报文的hash信息

用一个u64大小的union来表示,主要包含

  1. 硬件nic RSS后的结果
  2. fdir 过滤标识符信息
  3. sched 分层调度
	union {
		union {
			uint32_t rss;     /**< RSS hash result if RSS enabled */
			struct {
				union {
					struct {
						uint16_t hash;
						uint16_t id;
					};
					uint32_t lo;
					/**< Second 4 flexible bytes */
				};
				uint32_t hi;
				/**< First 4 flexible bytes or FD ID, dependent
				 * on PKT_RX_FDIR_* flag in ol_flags.
				 */
			} fdir;	/**< Filter identifier if FDIR enabled */
			struct {
				uint32_t lo;
				uint32_t hi;
				/**< The event eth Tx adapter uses this field
				 * to store Tx queue id.
				 * @see rte_event_eth_tx_adapter_txq_set()
				 */
			} sched;          /**< Hierarchical scheduler */
			/**< User defined tags. See rte_distributor_process() */
			uint32_t usr;
		} hash;                   /**< hash information */
		struct {
			/**
			 * Application specific metadata value
			 * for egress flow rule match.
			 * Valid if PKT_TX_METADATA is set.
			 * Located here to allow conjunct use
			 * with hash.sched.hi.
			 */
			uint32_t tx_metadata;
			uint32_t reserved;
		};
	};

报文的时间戳timestamp

只有ol_flags设置了PKT_RX_TIMESTAMP时,这个变量才有意义,此时驱动会将报文的接收时间戳填充进来。

pool

表示mbuf从这个pool申请来的,释放mbuf的时候用到

/**
 * Return an mbuf to the mempool it was allocated from.
 *
 * Asserts that the mbuf is direct, has a reference count of exactly 1,
 * is not chained to a next segment and has nb_segs == 1, then runs the
 * mbuf sanity check and puts the object back into m->pool.
 *
 * @param m  The mbuf to give back.
 */
static __rte_always_inline void
rte_mbuf_raw_free(struct rte_mbuf *m)
{
	struct rte_mempool *owner;

	/* Preconditions, checked in this order. */
	RTE_ASSERT(RTE_MBUF_DIRECT(m));
	RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
	RTE_ASSERT(m->next == NULL);
	RTE_ASSERT(m->nb_segs == 1);
	__rte_mbuf_sanity_check(m, 0);

	/* The owning pool is recorded in the mbuf itself. */
	owner = m->pool;
	rte_mempool_put(owner, m);
}

TX offloads

使用一个u64的变量来表示tx offload的信息,一般tx offload需要设置以太网报文头的信息,
如l2_len、l3_len、l4_len等等,这个一般是根据nic硬件支持的类型来设置和使用的。

	union {
		uint64_t tx_offload;       /**< combined for easy fetch */
		__extension__
		struct {
			uint64_t l2_len:7;
			/**< L2 (MAC) Header Length for non-tunneling pkt.
			 * Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
			 */
			uint64_t l3_len:9; /**< L3 (IP) Header Length. */
			uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
			uint64_t tso_segsz:16; /**< TCP TSO segment size */

			/* fields for TX offloading of tunnels */
			uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */
			uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */

			/* uint64_t unused:8; */
		};
	};

其他信息

uint16_t priv_size表示mbuf里面私有数据空间大小
uint16_t timesync表示IEEE1588标准的时间同步标志
uint32_t seqn 表示mbuf的序列号,是dpdk的重排序库(reorder library,参见rte_reorder_insert())用到的标识,它会根据这里的序列号对报文重新排序。

你可能感兴趣的:(dpdk)