rss 是网卡提供的分流机制。用来将报文分流到不同的收包队列,以提高收包性能。
引用 Intel 82599 10 GbE Controller Datasheet 其中的 Section 7.1.2.8.1, RSS Hash Function
一节。
- The receive packet is parsed into the header fields used by the hash operation (such as IP addresses, TCP port, etc.)
- A hash calculation is performed.
The 82599 supports a single hash function, as defined by MSFT RSS.
The 82599 therefore does not indicate to the device driver which hash function is used.
The 32-bit result is fed into the packet receive descriptor.
- The seven LSBs of the hash result are used as an index into a 128-entry redirection table.
Each entry provides a 4-bit RSS output index.

When RSS is enabled, the 82599 provides software with the following information as:
- Required by Microsoft (MSFT) RSS
- Provided for device driver assist:
- * A Dword result of the MSFT RSS hash function, to be used by the stack for flow classification, is written into the receive packet descriptor (required by MSFT RSS).
- * A 4-bit RSS Type field conveys the hash function used for the specific packet (required by MSFT RSS).
Enabling rules:
- RSS is enabled in the MRQC register.
- RSS enabling cannot be done dynamically while it must be preceded by a software reset.
- RSS status field in the descriptor write-back is enabled when the RXCSUM.PCSD bit is set (fragment checksum is disabled).
RSS is therefore mutually exclusive with UDP fragmentation checksum offload.
- Support for RSS is not provided when legacy receive descriptor format is used.
Disabling rules:
- Disabling RSS on the fly is not allowed, and the 82599 must be reset after RSS is disabled.
- When RSS is disabled, packets are assigned an RSS output index = zero.
When multiple request queues are enabled in RSS mode, un-decodable packets are assigned an RSS output index = zero.
The 32-bit tag (normally a result of the hash function) equals zero.
注意:
- rss hash:即 32-bit 的哈希结果,填充到 struct rte_mbuf 的 hash.rss 字段中。
- RSS output index:由 rss hash 的低 7 位查 redirection table 得到的 4-bit 输出索引。
- 当 RSS 被禁用,或者报文无法解析时,rss hash 和 RSS output index 设置为 0。

struct rte_eth_rss_conf 用于配置网卡的 rss。
/* RSS configuration of a port: the hash key, its length, and the hash functions to apply. */
struct rte_eth_rss_conf {
uint8_t *rss_key; /**< If not NULL, 40-byte hash key. */
uint8_t rss_key_len; /**< hash key length in bytes. */
uint64_t rss_hf; /**< Hash functions to apply - see below. */
};
字段 | 描述 |
---|---|
rss_key | rss_key 数组。如果 为 NULL,留给网卡设置 rss_key。 |
rss_key_len | rss_key 数组的字节数。 |
rss_hf | 需要对报文的分析的元组类型。常用的组合有 l3: ETH_RSS_IP , l3+l4: ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP 。 |
/* Example port configuration: RSS is enabled and no rss_key is supplied. */
static struct rte_eth_conf eth_conf = {
.rxmode = {
.mq_mode = ETH_MQ_RX_RSS /* distribute received packets with RSS */
},
.rx_adv_conf = {
.rss_conf = {
.rss_key = NULL, /* leave the choice of rss_key to the NIC driver */
.rss_key_len = 0, /* length of the rss_key array in bytes */
.rss_hf = ETH_RSS_IP /* compute the rss hash from the l3 tuple */
| ETH_RSS_UDP /* + compute the rss hash from the UDP tuple */
| ETH_RSS_TCP /* + compute the rss hash from the TCP tuple */
},
/* ... */
}
/* ... */
};
问题:
比如网卡82599ES 使用的是 ixgbe 的驱动。
在 ixgbe_rss_configure() 中,会使用 对称的 rss key 作为默认值。
ixgbe 的驱动 用于 rss 配置的代码:
/* filepath:
* dpdk/drivers/net/ixgbe/ixgbe_rxtx.c
*/
/* The ixgbe driver uses a symmetric rss key as its default value:
 * the 40 bytes are the two-byte pattern 0x6D, 0x5A repeated. */
static uint8_t rss_intel_key[40] = {
0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A,
0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A,
0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A,
0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A,
0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A,
};
/*
 * Excerpt of the ixgbe RSS setup (the elided part is marked with "...").
 * When the configuration carries no rss_key, rss_intel_key is substituted
 * before the configuration is handed to ixgbe_hw_rss_hash_set().
 */
static void
ixgbe_rss_configure(struct rte_eth_dev *dev)
{
/* ... */
if (rss_conf.rss_key == NULL)
rss_conf.rss_key = rss_intel_key; /* <-- Default hash key */
ixgbe_hw_rss_hash_set(hw, &rss_conf);
}
应用笔记:
ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP
。为了避免驱动使用了不合适的 rss_key,最好还是自己显式指定。
下面的代码,就提供了 对称 和 非对称 rss key 以供选择。
rss_hash_key 的数值可以参考论文 Scalable TCP Session Monitoring with Symmetric Receive-side Scaling。
/* Macros that select the default rss configuration (key and hash functions) */
#define RSS_HASH_KEY_DEFAULT rss_hash_key_symmetric
#define RSS_HASH_FUNCTION_DEFAULT (ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP)
/* symmetric_rss_key: the two-byte pattern 0x6D, 0x5A repeated over all 40 bytes */
static uint8_t rss_hash_key_symmetric[40] =
{ 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A
, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A
, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A
, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A
, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A, 0x6D, 0x5A
};
/* asymmetric_rss_key */
static uint8_t rss_hash_key_asymmetric[40] =
{ 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2
, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0
, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4
, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c
, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
/* rss config: same shape as a driver-default setup, but the key and the
 * hash functions are specified explicitly through the *_DEFAULT macros. */
static struct rte_eth_conf eth_conf = {
.rxmode = {
.mq_mode = ETH_MQ_RX_RSS /* ETH_MQ_RX_RSS */
},
.rx_adv_conf = {
.rss_conf = {
.rss_key = RSS_HASH_KEY_DEFAULT, /* <-- explicitly specified rss_key */
.rss_key_len = RTE_DIM(RSS_HASH_KEY_DEFAULT), /* length of the rss_key array in bytes */
.rss_hf = RSS_HASH_FUNCTION_DEFAULT /* <-- explicitly specified rss hash function */
},
/* ... */
}
/* ... */
};
/* 静态的 rss 配置方法 */
int error_code;
error_code = rte_eth_dev_configure(port_id
, rx_queue_count
, tx_queue_count
, ð_conf /* <-- 传入上面定义的 eth_conf */
);
除了上面静态的 rss 配置方法。也可以通过 rte_eth_dev_rss_hash_update() 来动态配置。
下面的例子,就是一个 rte_eth_dev_rss_hash_update() 的包裹函数。
static int
port_set_rss_config(uint_8 port_id
, uint64_t hash_function
, uint8_t *hash_key
, uint8_t hash_key_len
)
{
struct rte_eth_rss_conf rss_conf;
int error_code;
rss_conf.rss_key = hash_key;
rss_conf.rss_key_len = hash_key_len;
rss_conf.rss_hf = hash_function;
error_code = rte_eth_dev_rss_hash_update(port_id, &rss_conf);
if (0 != error_code) {
goto fail;
}
return 0;
/* error handle */
fail:
switch (error_code) {
case -ENODEV:
printf("port index %d invalid\n", port_id);
break;
case -ENOTSUP:
printf("operation not supported by device\n");
break;
default:
printf("operation failed - error_code=%d\n", error_code);
break;
}
return -1;
}
rss hash 的算法可以参考 Intel 82599 10 GbE Controller Datasheet 其中的 Section 7.1.2.8.1, RSS Hash Function
一节。
算法的伪代码表示如下:
ComputeHash(input[], N)
For hash-input input[] of length N bytes (8N bits) and a random secret key K of 320 bits
Result = 0;
For each bit b in input[] {
if (b == 1) then Result ^= (left-most 32 bits of K);
shift K left 1 bit position;
}
return Result;
注意:
例子:
项目 | 数值(十进制) | uint8_t [] 数组形式表示 (小端对齐十六进制) |
---|---|---|
src_ipv4 | 66.9.149.187 | {0xbb, 0x95, 0x09, 0x42} |
dst_ipv4 | 161.142.100.80 | {0x50, 0x64, 0x8e, 0xa1} |
sport | 2794 | {0xea, 0x0a} |
dport | 1766 | {0xe6, 0x06} |
输入参数 | 数值 |
---|---|
input[] | {0xbb, 0x95, 0x09, 0x42, 0x50, 0x64, 0x8e, 0xa1} |
N | 8 |
输入参数 | 数值 |
---|---|
input[] | {0xbb, 0x95, 0x09, 0x42, 0x50, 0x64, 0x8e, 0xa1, 0xea, 0x0a, 0xe6, 0x06} |
N | 12 |
应用笔记:
rss hash
值。DPDK rss hash 实现函数 有两个,分别是 rte_softrss() 和 rte_softrss_be()。
它们都定义在 dpdk/lib/librte_hash/rte_thash.h
中。
注意:
- 对于相同的 input_tuple,rte_softrss() 和 rte_softrss_be() 计算出相同的 rss hash。
- rte_softrss_be() 使用经 rte_convert_rss_key() 转换后的 rss_key,少执行了 rte_cpu_to_be_32() 大小端转换。所以相比 rte_softrss() 会高效点。

DPDK rte_softrss() 的源代码:
/**
 * Generic implementation. Can be used with original rss_key
 *
 * For every set bit of the input (scanned from the most significant bit of
 * each 32-bit word) the 32-bit window of the key that starts at the same bit
 * offset is XOR-ed into the result. The key is read as 32-bit words, and word
 * [j + 1] is read for every input word j, so rss_key must provide at least
 * (input_len + 1) * 4 bytes.
 *
 * @param input_tuple
 *   Pointer to input tuple
 * @param input_len
 *   Length of input_tuple in 4-bytes chunks
 * @param rss_key
 *   Pointer to RSS hash key.
 * @return
 *   Calculated hash value.
 */
static inline uint32_t
rte_softrss(uint32_t *input_tuple, uint32_t input_len,
		const uint8_t *rss_key)
{
	uint32_t i, j, ret = 0;

	for (j = 0; j < input_len; j++) {
		for (i = 0; i < 32; i++) {
			/* Unsigned mask: a signed 1 << 31 (i == 0) would be undefined behaviour. */
			if (input_tuple[j] & (UINT32_C(1) << (31 - i))) {
				/* The 64-bit widening keeps the i == 0 case (shift by 32) defined. */
				ret ^= rte_cpu_to_be_32(((const uint32_t *)rss_key)[j]) << i |
					(uint32_t)((uint64_t)(rte_cpu_to_be_32(((const uint32_t *)rss_key)[j + 1])) >>
					(32 - i));
			}
		}
	}
	return ret;
}
DPDK rte_softrss_be() 的源代码:
/**
 * Build the converted key that rte_softrss_be() works with.
 *
 * Each 32-bit word of the original key is passed through
 * rte_be_to_cpu_32() and stored at the same index of the target key.
 * Only whole 32-bit words are converted (len / 4 of them).
 *
 * @param orig
 *   pointer to original RSS key
 * @param targ
 *   pointer to target RSS key
 * @param len
 *   RSS key length
 */
static inline void
rte_convert_rss_key(const uint32_t *orig, uint32_t *targ, int len)
{
	const int word_count = len >> 2;
	int w = 0;

	while (w < word_count) {
		targ[w] = rte_be_to_cpu_32(orig[w]);
		w++;
	}
}
/**
 * Optimized implementation.
 * If you want the calculated hash value matches NIC RSS value
 * you have to use special converted key with rte_convert_rss_key() fn.
 *
 * The key words are used as stored, with no per-bit byte swap. Word [j + 1]
 * of the key is read for every input word j, so rss_key must provide at
 * least (input_len + 1) * 4 bytes.
 *
 * @param input_tuple
 *   Pointer to input tuple
 * @param input_len
 *   Length of input_tuple in 4-bytes chunks
 * @param *rss_key
 *   Pointer to RSS hash key.
 * @return
 *   Calculated hash value.
 */
static inline uint32_t
rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
		const uint8_t *rss_key)
{
	uint32_t i, j, ret = 0;

	for (j = 0; j < input_len; j++) {
		for (i = 0; i < 32; i++) {
			/* Unsigned mask: a signed 1 << 31 (i == 0) would be undefined behaviour. */
			if (input_tuple[j] & (UINT32_C(1) << (31 - i))) {
				/* The 64-bit widening keeps the i == 0 case (shift by 32) defined. */
				ret ^= ((const uint32_t *)rss_key)[j] << i |
					(uint32_t)((uint64_t)(((const uint32_t *)rss_key)[j + 1]) >> (32 - i));
			}
		}
	}
	return ret;
}
另外,rte_thash.h 也定义了 struct rte_ipv4_tuple 和 struct rte_ipv6_tuple,用来放置报文的元组信息。
注意:
/*
 * Input-tuple lengths for the software RSS hash functions. Each macro takes
 * a byte size derived from the tuple struct and divides it by 4, giving a
 * count of 32-bit words. The L3 variants subtract the size of the sctp_tag
 * member from the struct size; the L4 variants use the whole struct.
 */
/**
* length in dwords of input tuple to
* calculate hash of ipv4 header only
*/
#define RTE_THASH_V4_L3_LEN ((sizeof(struct rte_ipv4_tuple) - \
sizeof(((struct rte_ipv4_tuple *)0)->sctp_tag)) / 4)
/**
* length in dwords of input tuple to
* calculate hash of ipv4 header +
* transport header
*/
#define RTE_THASH_V4_L4_LEN ((sizeof(struct rte_ipv4_tuple)) / 4)
/**
* length in dwords of input tuple to
* calculate hash of ipv6 header only
*/
#define RTE_THASH_V6_L3_LEN ((sizeof(struct rte_ipv6_tuple) - \
sizeof(((struct rte_ipv6_tuple *)0)->sctp_tag)) / 4)
/**
* length in dwords of input tuple to
* calculate hash of ipv6 header +
* transport header
*/
#define RTE_THASH_V6_L4_LEN ((sizeof(struct rte_ipv6_tuple)) / 4)
/**
* IPv4 tuple
* addresses and ports/sctp_tag have to be CPU byte order
*
* The anonymous union overlays the two 16-bit ports with the 32-bit
* sctp_tag, so the last 4 bytes hold either the port pair or the tag.
*/
struct rte_ipv4_tuple {
uint32_t src_addr;
uint32_t dst_addr;
union {
struct {
uint16_t dport;
uint16_t sport;
};
uint32_t sctp_tag;
};
};
/**
* IPv6 tuple
* Addresses have to be filled by rte_thash_load_v6_addr()
* ports/sctp_tag have to be CPU byte order
*
* The anonymous union overlays the two 16-bit ports with the 32-bit
* sctp_tag, so the last 4 bytes hold either the port pair or the tag.
*/
struct rte_ipv6_tuple {
uint8_t src_addr[16];
uint8_t dst_addr[16];
union {
struct {
uint16_t dport;
uint16_t sport;
};
uint32_t sctp_tag;
};
};
/* Storage for either an IPv4 or an IPv6 input tuple. */
union rte_thash_tuple {
	struct rte_ipv4_tuple v4;
	struct rte_ipv6_tuple v6;
};
以下的例子参考的是 dpdk/app/test/test_thash.c
。
union rte_thash_tuple tuple;
uint32_t rss_l3_original, rss_l3l4_original;
uint32_t rss_l3_converted, rss_l3l4_converted;
uint8_t default_rss_key[] = {
0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
uint8_t rss_key_be[RTE_DIM(default_rss_key)];
tuple.v4.src_addr = IPv4(66, 9, 149, 187);
tuple.v4.dst_addr = IPv4(161, 142, 100, 80);
tuple.v4.sport = 2794;
tuple.v4.dport = 1766;
/*Calculate hash with original key*/
rss_l3_original = rte_softrss((uint32_t *)&tuple,
RTE_THASH_V4_L3_LEN, default_rss_key);
rss_l3l4_original = rte_softrss((uint32_t *)&tuple,
RTE_THASH_V4_L4_LEN, default_rss_key);
assert(0x323e8fc2 == rss_l3_original);
assert(0x51ccc178 == rss_l3l4_original);
/*Calculate hash with converted key*/
rss_l3_converted = rte_softrss_be((uint32_t *)&tuple,
RTE_THASH_V4_L3_LEN, rss_key_be);
rss_l3l4_converted = rte_softrss_be((uint32_t *)&tuple,
RTE_THASH_V4_L4_LEN, rss_key_be);
assert(0x323e8fc2 == rss_l3_converted);
assert(0x51ccc178 == rss_l3l4_converted);
Intel 82599 10 GbE Controller Datasheet
Scalable TCP Session Monitoring with Symmetric Receive-side Scaling
DPDK 之 Symmetric Receive-side Scaling