device dpdk0 {
rx {
queue_number 8
descriptor_number 1024
rss all
}
tx {
queue_number 8
descriptor_number 1024
}
fdir {
mode perfect
pballoc 64k
status matched
}
! promisc_mode
kni_name dpdk0.kni
}
static struct rte_eth_conf default_port_conf =
{
.rxmode =
{
.mq_mode = ETH_MQ_RX_RSS,
.max_rx_pkt_len = ETHER_MAX_LEN,
.split_hdr_size = 0,
.offloads = DEV_RX_OFFLOAD_IPV4_CKSUM,
},
.rx_adv_conf =
{
.rss_conf =
{
.rss_key = NULL,
.rss_hf = /*ETH_RSS_IP*/ ETH_RSS_TCP,
},
},
.txmode =
{
.mq_mode = ETH_MQ_TX_NONE,
},
.fdir_conf =
{
.mode = RTE_FDIR_MODE_PERFECT,
.pballoc = RTE_FDIR_PBALLOC_64K,
.status = RTE_FDIR_REPORT_STATUS /*_ALWAYS*/,
.mask =
{
.vlan_tci_mask = 0x0,
.ipv4_mask =
{
.src_ip = 0x00000000,
.dst_ip = 0xFFFFFFFF,
},
.ipv6_mask =
{
.src_ip =
{
0, 0, 0, 0
},
.dst_ip =
{ 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
},
.src_port_mask = 0x0000,
/* to be changed according to slave lcore number in use */
.dst_port_mask = 0x00F8,
.mac_addr_byte_mask = 0x00,
.tunnel_type_mask = 0,
.tunnel_id_mask = 0,
},
.drop_queue = 127,
.flex_conf =
{
.nb_payloads = 0,
.nb_flexmasks = 0,
},
},
};
每个lcore都有自己的sa_pool,用于管理本地分配的
相关结构体
enum
{
SA_F_USED = 0x01,
};
/**
* if really need to to save memory, we can;
* 1. use hlist_head
* 2. use uint8_t flag
* 3. remove sa_entry.addr, and get IP from sa_pool->ifa
* 4. to __packed__ sa_entry.
* 5. alloc sa_entries[] for 65536/cpu_num only.
* 6. create sa_entry_pool only if pool_hash hit.
* since when dest (like RS) num may small.
*/
/* socket address (sa) is pair. */
struct sa_entry
{
struct list_head list; /* node of sa_pool. */
//标志,目前只有SA_F_USED,主要标记该端口是否被使用了(空闲/busy状态)
uint32_t flags; /* SA_F_XXX */
union inet_addr addr;
__be16 port;
};
struct sa_entry_pool
{
//sa_entry hash表
struct sa_entry sa_entries[MAX_PORT];
//used sa_entry链表头
struct list_head used_enties;
//free sa_entry链表头
struct list_head free_enties;
/* another way is use total_used/free_cnt in sa_pool,
* so that we need not travels the hash to get stats.
* we use cnt here, since we may need per-pool stats. */
//统计计数
uint16_t used_cnt;
uint16_t free_cnt;
uint32_t miss_cnt;
};
/* no lock needed because inet_ifaddr.sa_pool
* is per-lcore. */
struct sa_pool
{
//sa_pool所属的ip层地址块
struct inet_ifaddr * ifa; /* back-pointer */
//low,high端口
uint16_t low; /* min port */
uint16_t high; /* max port */
//引用计数
rte_atomic32_t refcnt;
/* hashed pools by dest's . if no dest provided,
* just use first pool. it's not need create/destroy pool
* for each dest, that'll be too complicated. */
struct sa_entry_pool *pool_hash;
//hash表中桶个数
uint8_t pool_hash_sz;
uint32_t flags; /* SA_POOL_F_XXX */
/* fdir filter ID */
uint32_t filter_id[MAX_FDIR_PROTO];
};
struct sa_fdir
{
/* the ports one lcore can use means
* "(fdir.mask & port) == port_base" */
//掩码
uint16_t mask; /* filter's port mask */
//所在lcore id,每个lcore上分配一个
lcoreid_t lcore;
//在选取本地lcore可用port时时,lport 经过掩码后,得到的值如果等于 port_base 就会分配到这个核心
__be16 port_base;
uint16_t soft_id; /* current unsed soft-id,
* increase after use. */
};
程序初始化时调用sa_pool_init初始化全局fdir表
static struct sa_fdir sa_fdirs[DPVS_MAX_LCORE];
int sa_pool_init(void)
{
int shift;
lcoreid_t cid;
uint16_t port_base;
/* enabled lcore should not change after init */
//获取已启用的lcore个数和掩码
netif_get_slave_lcores(&sa_nlcore, &sa_lcore_mask);
/* how many mask bits needed ? */
//计算log2(sas_nlcore),向上取整
for (shift = 0; (0x1 << shift) < sa_nlcore; shift++)
{
;
}
if (shift >= 16)
{
return(EDPVS_INVAL); /* bad config */
}
port_base = 0;
for (cid = 0; cid < DPVS_MAX_LCORE; cid++)
{
//如果cid>=64或者该lcore没有被启用,跳过
if (cid >= 64 || !(sa_lcore_mask & (1L << cid)))
{
continue;
}
assert(rte_lcore_is_enabled(cid) && cid != rte_get_master_lcore());
//sa_fdirs是per-lcore数据结构,设置mask掩码,主要是通过&操作代替取余操作
sa_fdirs[cid].mask = ~((~0x0) << shift);
sa_fdirs[cid].lcore = cid;
//fdir计算时,lport 经过掩码后,得到的值如果等于 port_base 就会分配到这个核心
sa_fdirs[cid].port_base = htons(port_base);
sa_fdirs[cid].soft_id = 0;
port_base++;
}
return(EDPVS_OK);
}
在使用ipvsadm添加lip时,ifa_add_set调用sa_pool_create初始化sa_pool
int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high)
{
int err;
struct sa_pool *ap;
struct sa_fdir *fdir;
uint32_t filtids[MAX_FDIR_PROTO];
lcoreid_t cid = rte_lcore_id();
//判断当前lcore是否在sa_lcore_mask中
if (cid > 64 || !((sa_lcore_mask & (1UL << cid))))
{
if (cid == rte_get_master_lcore())
{
return(EDPVS_OK); /* no sapool on master */
}
return(EDPVS_INVAL);
}
//low,high端口默认值分别是1025,65535
low = low ? : DEF_MIN_PORT;
high = high ? : DEF_MAX_PORT;
//参数校验
if (!ifa || low > high || low == 0 || high >= MAX_PORT)
{
RTE_LOG(ERR, SAPOOL, "%s: bad arguments\\n", __func__);
return(EDPVS_INVAL);
}
//通过lcore id获取对应的sa_fdir对象
fdir = &sa_fdirs[cid];
//分配sa_pool对象
ap = rte_zmalloc(NULL, sizeof(struct sa_pool), 0);
if (unlikely(!ap))
{
return(EDPVS_NOMEM);
}
//设置sa_pool相关参数
ap->ifa = ifa;
ap->low = low;
ap->high = high;
ap->flags = 0;
rte_atomic32_set(&ap->refcnt, 1);
//分配sa_pool中socket地址的哈希表,默认hash桶的数量为16
err = sa_pool_alloc_hash(ap, sa_pool_hash_size, fdir);
if (err != EDPVS_OK)
{
goto free_ap;
}
filtids[0] = fdir->soft_id++;
filtids[1] = fdir->soft_id++;
//增加fdir filter,用于fdir匹配
err = sa_add_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr,
fdir->port_base, filtids); /* thread-safe ? */
if (err != EDPVS_OK)
{
goto free_hash;
}
ap->filter_id[0] = filtids[0];
ap->filter_id[1] = filtids[1];
ifa->sa_pool = ap;
/* inc ifa->refcnt to hold it */
rte_atomic32_inc(&ifa->refcnt);
#ifdef CONFIG_DPVS_SAPOOL_DEBUG
{
char addr[64];
RTE_LOG(INFO, SAPOOL, "[%02d] %s: sa pool created -- %s\\n", rte_lcore_id(),
__func__, inet_ntop(ifa->af, &ifa->addr, addr, sizeof(addr)) ? : NULL);
}
#endif
return(EDPVS_OK);
free_hash:
sa_pool_free_hash(ap);
free_ap:
rte_free(ap);
return(err);
}
sa_pool_alloc_hash
static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz,
const struct sa_fdir *fdir)
{
int hash;
struct sa_entry_pool *pool;
uint32_t port; /* should be u32 or 65535==0 */
//分配sa_pool->pool_hash桶分配
ap->pool_hash = rte_malloc(NULL, sizeof(struct sa_entry_pool) * hash_sz,
RTE_CACHE_LINE_SIZE);
if (!ap->pool_hash)
{
return(EDPVS_NOMEM);
}
//设置哈希表桶大小为hash_sz
ap->pool_hash_sz = hash_sz;
/* the big loop takes about 17ms */
for (hash = 0; hash < hash_sz; hash++)
{
pool = &ap->pool_hash[hash];
//初始化used_entry和free_entry链表头
INIT_LIST_HEAD(&pool->used_enties);
INIT_LIST_HEAD(&pool->free_enties);
pool->used_cnt = 0;
pool->free_cnt = 0;
//根据fdir->mask &&((uint16_t)port & fdir->mask) == ntohs(fdir->port_base) 为当前lcore分配地址
for (port = ap->low; port <= ap->high; port++)
{
struct sa_entry *sa;
if (fdir->mask &&
((uint16_t)port & fdir->mask) != ntohs(fdir->port_base))
{
continue;
}
sa = &pool->sa_entries[(uint16_t)port];
sa->addr = ap->ifa->addr;
sa->port = htons((uint16_t)port);
list_add_tail(&sa->list, &pool->free_enties);
pool->free_cnt++;
}
}
return(EDPVS_OK);
}
sa_add_filter
static inline int sa_add_filter(int af, struct netif_port *dev, lcoreid_t cid,
const union inet_addr *dip, __be16 dport,
uint32_t filter_id[MAX_FDIR_PROTO])
{
return(__add_del_filter(af, dev, cid, dip, dport, filter_id, true));
}
__add_del_filter
static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid,
const union inet_addr *dip, __be16 dport,
uint32_t filter_id[MAX_FDIR_PROTO], bool add)
{
queueid_t queue;
int err;
enum rte_filter_op op, rop;
//flow Director设置,action为接收数据
struct rte_eth_fdir_filter filt[MAX_FDIR_PROTO] =
{
{
.action.behavior = RTE_ETH_FDIR_ACCEPT,
.action.report_status = RTE_ETH_FDIR_REPORT_ID,
.soft_id = filter_id[0],
},
{
.action.behavior = RTE_ETH_FDIR_ACCEPT,
.action.report_status = RTE_ETH_FDIR_REPORT_ID,
.soft_id = filter_id[1],
},
};
if (af == AF_INET)
{
//IPv4 flow dirctor设置,只对(目的ip,目的端口)进行过滤,,具体为做过mask.dst_port_mask掩码后的dport
//当dport为0,lcore_count=4,则0,4,8,12......的目的端口会进入该lcore
filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP;
filt[0].input.flow.tcp4_flow.ip.dst_ip = dip->in.s_addr;
filt[0].input.flow.tcp4_flow.dst_port = dport;
filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
filt[1].input.flow.udp4_flow.ip.dst_ip = dip->in.s_addr;
filt[1].input.flow.udp4_flow.dst_port = dport;
}
else if (af == AF_INET6)
{
filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_TCP;
memcpy(filt[0].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr));
filt[0].input.flow.tcp6_flow.dst_port = dport;
filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_UDP;
memcpy(filt[1].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr));
filt[1].input.flow.udp6_flow.dst_port = dport;
}
else
{
return(EDPVS_NOTSUPP);
}
//判断设备是否flow director功能,主要对dpdk设备和bond设备,内部调用rte_eth_dev_filter_supported
if (dev->netif_ops && dev->netif_ops->op_filter_supported)
{
if (dev->netif_ops->op_filter_supported(dev, RTE_ETH_FILTER_FDIR) < 0)
{
if (dev->nrxq <= 1)
{
return(EDPVS_OK);
}
RTE_LOG(ERR, SAPOOL, "%s: FDIR is not supported by device %s. Only"
" single rxq can be configured.\\n", __func__, dev->name);
return(EDPVS_NOTSUPP);
}
}
else
{
RTE_LOG(ERR, SAPOOL, "%s: FDIR support of device %s is not known.\\n",
__func__, dev->name);
return(EDPVS_INVAL);
}
//获取lcore上处理port的哪些接收队列
err = netif_get_queue(dev, cid, &queue);
if (err != EDPVS_OK)
{
return(err);
}
//设置flow Director的接收队列为queue,绑定网卡硬件队列
filt[0].action.rx_queue = filt[1].action.rx_queue = queue;
op = add ? RTE_ETH_FILTER_ADD : RTE_ETH_FILTER_DELETE;
//将过滤条件更新到网卡
netif_mask_fdir_filter(af, dev, &filt[0]);
netif_mask_fdir_filter(af, dev, &filt[1]);
//内部调用rte_eth_dev_filter_ctrl添加flow filter规则
err = netif_fdir_filter_set(dev, op, &filt[0]);
if (err != EDPVS_OK)
{
return(err);
}
err = netif_fdir_filter_set(dev, op, &filt[1]);
if (err != EDPVS_OK)
{
rop = add ? RTE_ETH_FILTER_DELETE : RTE_ETH_FILTER_ADD;
netif_fdir_filter_set(dev, rop, &filt[0]);
return(err);
}
#ifdef CONFIG_DPVS_SAPOOL_DEBUG
{
char ipaddr[64];
RTE_LOG(DEBUG, SAPOOL, "FDIR: %s %s %s TCP/UDP "
"ip %s port %d (0x%04x) mask 0x%04X queue %d lcore %2d filterID %d/%d\\n",
add ? "add" : "del", dev->name,
af == AF_INET ? "IPv4" : "IPv6",
inet_ntop(af, dip, ipaddr, sizeof(ipaddr)) ? : "::",
ntohs(dport), ntohs(dport), sa_fdirs[cid].mask, queue, cid,
filter_id[0], filter_id[1]);
}
#endif
return(err);
}
netif_mask_fdir_filter
void netif_mask_fdir_filter(int af, const struct netif_port *port,
struct rte_eth_fdir_filter *filt)
{
struct rte_eth_fdir_info fdir_info;
const struct rte_eth_fdir_masks *fmask;
//首先获取fdir_filter中的input flow定义
union rte_eth_fdir_flow * flow = &filt->input.flow;
/**
union rte_eth_fdir_flow {
struct rte_eth_l2_flow l2_flow;
struct rte_eth_udpv4_flow udp4_flow;
struct rte_eth_tcpv4_flow tcp4_flow;
struct rte_eth_sctpv4_flow sctp4_flow;
struct rte_eth_ipv4_flow ip4_flow;
struct rte_eth_udpv6_flow udp6_flow;
struct rte_eth_tcpv6_flow tcp6_flow;
struct rte_eth_sctpv6_flow sctp6_flow;
struct rte_eth_ipv6_flow ipv6_flow;
struct rte_eth_mac_vlan_flow mac_vlan_flow;
struct rte_eth_tunnel_flow tunnel_flow;
};
*/
/* There exists a defect here. If the netif_port 'port' is not PORT_TYPE_GENERAL,
* mask fdir_filter of the port would fail. The correct way to accomplish the
* function is to register this method for all device types. Considering the flow
* is not changed after masking, we just skip netif_ports other than physical ones. */
if (port->type != PORT_TYPE_GENERAL)
{
return;
}
//retrieve fdir information
if (rte_eth_dev_filter_ctrl(port->id, RTE_ETH_FILTER_FDIR,
RTE_ETH_FILTER_INFO, &fdir_info) < 0)
{
RTE_LOG(DEBUG, NETIF, "%s: Fail to fetch fdir info of %s !\\n",
__func__, port->name);
return;
}
fmask = &fdir_info.mask;
/* ipv4 flow */
if (af == AF_INET)
{
flow->ip4_flow.src_ip &= fmask->ipv4_mask.src_ip;
flow->ip4_flow.dst_ip &= fmask->ipv4_mask.dst_ip;
flow->ip4_flow.tos &= fmask->ipv4_mask.tos;
flow->ip4_flow.ttl &= fmask->ipv4_mask.ttl;
flow->ip4_flow.proto &= fmask->ipv4_mask.proto;
flow->tcp4_flow.src_port &= fmask->src_port_mask;
flow->tcp4_flow.dst_port &= fmask->dst_port_mask;
return;
}
/* ipv6 flow */
if (af == AF_INET6)
{
flow->ipv6_flow.src_ip[0] &= fmask->ipv6_mask.src_ip[0];
flow->ipv6_flow.src_ip[1] &= fmask->ipv6_mask.src_ip[1];
flow->ipv6_flow.src_ip[2] &= fmask->ipv6_mask.src_ip[2];
flow->ipv6_flow.src_ip[3] &= fmask->ipv6_mask.src_ip[3];
flow->ipv6_flow.dst_ip[0] &= fmask->ipv6_mask.dst_ip[0];
flow->ipv6_flow.dst_ip[1] &= fmask->ipv6_mask.dst_ip[1];
flow->ipv6_flow.dst_ip[2] &= fmask->ipv6_mask.dst_ip[2];
flow->ipv6_flow.dst_ip[3] &= fmask->ipv6_mask.dst_ip[3];
flow->ipv6_flow.tc &= fmask->ipv6_mask.tc;
flow->ipv6_flow.proto &= fmask->ipv6_mask.proto;
flow->ipv6_flow.hop_limits &= fmask->ipv6_mask.hop_limits;
flow->tcp6_flow.src_port &= fmask->src_port_mask;
flow->tcp6_flow.dst_port &= fmask->dst_port_mask;
return;
}
}
netif_port_fdir_dstport_mask_set
/*
* fdir mask must be set according to configured slave lcore number
* */
inline static int netif_port_fdir_dstport_mask_set(struct netif_port *port)
{
uint8_t slave_nb;
int shift;
netif_get_slave_lcores(&slave_nb, NULL);
for (shift = 0; (0x1 << shift) < slave_nb; shift++)
{
;
}
if (shift >= 16)
{
RTE_LOG(ERR, NETIF, "%s: %s's fdir dst_port_mask init failed\\n",
__func__, port->name);
return(EDPVS_NOTSUPP);
}
#if RTE_VERSION >= 0x10040010
port->dev_conf.fdir_conf.mask.dst_port_mask = htons(~((~0x0) << shift));
#else
port->dev_conf.fdir_conf.mask.dst_port_mask = ~((~0x0) << shift);
#endif
RTE_LOG(INFO, NETIF, "%s:dst_port_mask=%0x\\n", port->name,
port->dev_conf.fdir_conf.mask.dst_port_mask);
return(EDPVS_OK);
}
上文fdir设置主要是用于fnat中outbound方向数据流量回到dpvs时,与inbound流量进入dpvs所在lcore保持一致,避免流表查找的锁和缓存失效等。
主要看下dpvs中如何从sa_pool中分配本地地址和本地端口,对于所有新建连接,都会调用dp_vs_conn_new注册流表
dp_vs_conn_new
struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf,
const struct dp_vs_iphdr *iph,
struct dp_vs_conn_param *param,
struct dp_vs_dest *dest,
uint32_t flags)
{
//full-nat 特殊处理
if (dest->fwdmode == DPVS_FWD_MODE_FNAT)
{
//绑定LB本地socket,fullnat做了双向nat
if ((err = dp_vs_laddr_bind(new, dest->svc)) != EDPVS_OK)
{
goto unbind_dest;
}
}
}
dp_vs_laddr_bind
int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc)
{
struct dp_vs_laddr *laddr = NULL;
int i;
uint16_t sport = 0;
struct sockaddr_storage dsin, ssin;
bool found = false;
//一些常规校验
if (!conn || !conn->dest || !svc)
{
return(EDPVS_INVAL);
}
//如果传输层协议不是TCP或者UDP也直接返回错误,因为在设置fdir时只关注了tcp和udp协议
if (svc->proto != IPPROTO_TCP && svc->proto != IPPROTO_UDP)
{
return(EDPVS_NOTSUPP);
}
if (dp_vs_conn_is_template(conn))
{
return(EDPVS_OK);
}
/*
* some time allocate lport fails for one laddr,
* but there's also some resource on another laddr.
*/
for (i = 0; i < dp_vs_laddr_max_trails && i < svc->num_laddrs; i++)
{
/* select a local IP from service */
//首先选择一个laddr
laddr = __get_laddr(svc);
if (!laddr)
{
RTE_LOG(ERR, IPVS, "%s: no laddr available.\\n", __func__);
return(EDPVS_RESOURCE);
}
//首先将dsin与ssin清零
memset(&dsin, 0, sizeof(struct sockaddr_storage));
memset(&ssin, 0, sizeof(struct sockaddr_storage));
if (laddr->af == AF_INET)
{
//fnat要将目的信息修改为conn选择的RS的ip:port,同时将源ip和port修改为local ip相关,因为要保证outbound方向数据
//同样返回值dpvs
struct sockaddr_in *daddr, *saddr;
daddr = (struct sockaddr_in *)&dsin;
daddr->sin_family = laddr->af;
daddr->sin_addr = conn->daddr.in;
daddr->sin_port = conn->dport;
saddr = (struct sockaddr_in *)&ssin;
saddr->sin_family = laddr->af;
saddr->sin_addr = laddr->addr.in;
}
else
{
struct sockaddr_in6 *daddr, *saddr;
daddr = (struct sockaddr_in6 *)&dsin;
daddr->sin6_family = laddr->af;
daddr->sin6_addr = conn->daddr.in6;
daddr->sin6_port = conn->dport;
saddr = (struct sockaddr_in6 *)&ssin;
saddr->sin6_family = laddr->af;
saddr->sin6_addr = laddr->addr.in6;
}
//sa_fetch获取端口,来填充完整的dsin,ssin地址,选取port主要用于通过fdir后outbound方向数据返回当前lcore处理
if (sa_fetch(laddr->af, laddr->iface, &dsin, &ssin) != EDPVS_OK)
{
char buf[64];
if (inet_ntop(laddr->af, &laddr->addr, buf, sizeof(buf)) == NULL)
{
snprintf(buf, sizeof(buf), "::");
}
#ifdef CONFIG_DPVS_IPVS_DEBUG
RTE_LOG(DEBUG, IPVS, "%s: [%02d] no lport available on %s, "
"try next laddr.\\n", __func__, rte_lcore_id(), buf);
#endif
put_laddr(laddr);
continue;
}
//sport为选取出来的port
sport = (laddr->af == AF_INET ? (((struct sockaddr_in *)&ssin)->sin_port)
: (((struct sockaddr_in6 *)&ssin)->sin6_port));
found = true;
break;
}
//如果选取laddr和lport失败,返回错误
if (!found)
{
#ifdef CONFIG_DPVS_IPVS_DEBUG
RTE_LOG(ERR, IPVS, "%s: [%02d] no lip/lport available !!\\n",
__func__, rte_lcore_id());
#endif
return(EDPVS_RESOURCE);
}
rte_atomic32_inc(&laddr->conn_counts);
/* overwrite related fields in out-tuplehash and conn */
//设置conn的local address和local port
conn->laddr = laddr->addr;
conn->lport = sport;
//设置outbound方向tuplehash_entry的连接地址信息
tuplehash_out(conn).daddr = laddr->addr;
tuplehash_out(conn).dport = sport;
//关联选取的laddr
conn->local = laddr;
return(EDPVS_OK);
}
__get_laddr
static inline struct dp_vs_laddr *__get_laddr(struct dp_vs_service *svc)
{
int step;
struct dp_vs_laddr *laddr = NULL;
/* if list not inited ? list_empty() returns true ! */
assert(svc->laddr_list.next);
//如果没有laddr,直接返回
if (list_empty(&svc->laddr_list))
{
return(NULL);
}
//随记产生一个步数
step = __laddr_step(svc);
while (step-- > 0)
{
if (unlikely(!svc->laddr_curr))
{
svc->laddr_curr = svc->laddr_list.next;
}
else
{
svc->laddr_curr = svc->laddr_curr->next;
}
//循环链表,过滤链表头
if (svc->laddr_curr == &svc->laddr_list)
{
svc->laddr_curr = svc->laddr_list.next;
}
}
//获取struct dp_vs_laddr
laddr = list_entry(svc->laddr_curr, struct dp_vs_laddr, list);
rte_atomic32_inc(&laddr->refcnt);
return(laddr);
}
__laddr_step
static inline int __laddr_step(struct dp_vs_service *svc)
{
/* Why can't we always use the next laddr(rr scheduler) to setup new session?
* Because realserver rr/wrr scheduler may get synchronous with the laddr rr
* scheduler. If so, the local IP may stay invariant for a specified realserver,
* which is a hurt for realserver concurrency performance. To avoid the problem,
* we just choose 5% sessions to use the one after the next laddr randomly.
* */
if (strncmp(svc->scheduler->name, "rr", 2) == 0 ||
strncmp(svc->scheduler->name, "wrr", 3) == 0)
{
return((random() % 100) < 5 ? 2 : 1);
}
return(1);
}
sa_fetch
int sa_fetch(int af, struct netif_port *dev,
const struct sockaddr_storage *daddr,
struct sockaddr_storage *saddr)
{
if (unlikely(daddr && daddr->ss_family != af))
{
return(EDPVS_INVAL);
}
if (unlikely(saddr && saddr->ss_family != af))
{
return(EDPVS_INVAL);
}
if (AF_INET == af)
{
return(sa4_fetch(dev, (const struct sockaddr_in *)daddr,
(struct sockaddr_in *)saddr));
}
else if (AF_INET6 == af)
{
return(sa6_fetch(dev, (const struct sockaddr_in6 *)daddr,
(struct sockaddr_in6 *)saddr));
}
else
{
return(EDPVS_NOTSUPP);
}
}
sa4_fetch
/*
* fetch unused pair by given hint.
* given @ap equivalent to @dev+@saddr, and dport is useless.
* with routing's help, the mapping looks like,
*
* +------+------------+-------+-------------------
* | | ap | | Is possible to
* |daddr | dev & saddr| sport | fetch addr pair?
* +------+------------+-------+-------------------
* Y Y ? Y Possible
* Y Y Y ? Possible
* Y Y ? ? Possible
* Y N ? Y Possible
* Y N Y ? Possible
* Y N ? ? Possible
* N Y ? Y Possible
* N Y Y ? Possible
* N Y ? ? Possible
* N N ? Y Not Possible
* N N Y ? Possible
* N N ? ? Not Possible
*
* daddr is a hint to found dev/saddr (by route/netif module).
* dev is also a hint, the saddr(ifa) is the key.
* af is needed when both saddr and daddr are NULL.
*/
static int sa4_fetch(struct netif_port *dev,
const struct sockaddr_in *daddr,
struct sockaddr_in *saddr)
{
struct inet_ifaddr *ifa;
struct flow4 fl;
struct route_entry *rt;
int err;
assert(saddr);
//首先对参数进行校验
if (saddr && saddr->sin_addr.s_addr != INADDR_ANY && saddr->sin_port != 0)
{
return(EDPVS_OK); /* everything is known, why call this function ? */
}
/* if source IP is assiged, we can find ifa->sa_pool
* without @daddr and @dev. */
//如果指定了sin_addr
if (saddr->sin_addr.s_addr)
{
//在net_device上获取对应local ip对应的IP地址信息块inet_ifaddr,主要用于获取sa_pool
ifa = inet_addr_ifa_get(AF_INET, dev, (union inet_addr *)&saddr->sin_addr);
if (!ifa)
{
return(EDPVS_NOTEXIST);
}
//如果没有配置sa_pool,则返回出错
if (!ifa->sa_pool)
{
RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without sapool.", __func__);
inet_addr_ifa_put(ifa);
return(EDPVS_INVAL);
}
//获取lport
err = sa_pool_fetch(sa_pool_hash(ifa->sa_pool, (struct sockaddr_storage *)daddr),
(struct sockaddr_storage *)saddr);
if (err == EDPVS_OK)
{
rte_atomic32_inc(&ifa->sa_pool->refcnt);
}
inet_addr_ifa_put(ifa);
return(err);
}
/* try to find source ifa by @dev and @daddr */
//如果未指定saddr,则首先根据目的地址查找出口路由
memset(&fl, 0, sizeof(struct flow4));
fl.fl4_oif = dev;
fl.fl4_daddr.s_addr = daddr ? daddr->sin_addr.s_addr : htonl(INADDR_ANY);
fl.fl4_saddr.s_addr = saddr ? saddr->sin_addr.s_addr : htonl(INADDR_ANY);
rt = route4_output(&fl);
if (!rt)
{
return(EDPVS_NOROUTE);
}
/* select source address. */
//选择一个源ip地址
if (!rt->src.s_addr)
{
inet_addr_select(AF_INET, rt->port, (union inet_addr *)&rt->dest,
RT_SCOPE_UNIVERSE, (union inet_addr *)&rt->src);
}
//根据选取的源ip获取对应的ip信息控制块
ifa = inet_addr_ifa_get(AF_INET, rt->port, (union inet_addr *)&rt->src);
if (!ifa)
{
route4_put(rt);
return(EDPVS_NOTEXIST);
}
route4_put(rt);
if (!ifa->sa_pool)
{
RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without pool.",
__func__);
inet_addr_ifa_put(ifa);
return(EDPVS_INVAL);
}
/* do fetch socket address */
err = sa_pool_fetch(sa_pool_hash(ifa->sa_pool,
(struct sockaddr_storage *)daddr),
(struct sockaddr_storage *)saddr);
if (err == EDPVS_OK)
{
rte_atomic32_inc(&ifa->sa_pool->refcnt);
}
inet_addr_ifa_put(ifa);
return(err);
}
sa_pool_hash
/* hash dest's . if no dest provided, just use first pool. */
static inline struct sa_entry_pool *
sa_pool_hash(const struct sa_pool *ap, const struct sockaddr_storage *ss)
{
uint32_t hashkey;
assert(ap && ap->pool_hash && ap->pool_hash_sz >= 1);
if (!ss)
{
return(&ap->pool_hash[0]);
}
if (ss->ss_family == AF_INET)
{
uint16_t vect[2];
const struct sockaddr_in *sin = (const struct sockaddr_in *)ss;
vect[0] = ntohl(sin->sin_addr.s_addr) & 0xffff;
vect[1] = ntohs(sin->sin_port);
hashkey = (vect[0] + vect[1]) % ap->pool_hash_sz;
return(&ap->pool_hash[hashkey]);
}
else if (ss->ss_family == AF_INET6)
{
uint32_t vect[5] = { 0 };
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)ss;
vect[0] = sin6->sin6_port;
memcpy(&vect[1], &sin6->sin6_addr, 16);
hashkey = rte_jhash_32b(vect, 5, sin6->sin6_family) % ap->pool_hash_sz;
return(&ap->pool_hash[hashkey]);
}
else
{
return(NULL);
}
}
sa_pool_fetch
static inline int sa_pool_fetch(struct sa_entry_pool *pool,
struct sockaddr_storage *ss)
{
assert(pool && ss);
struct sa_entry * ent;
struct sockaddr_in * sin = (struct sockaddr_in *)ss;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
//从free_entries链表中获取第一个可用sa_entry
ent = list_first_entry_or_null(&pool->free_enties, struct sa_entry, list);
if (!ent)
{
#ifdef CONFIG_DPVS_SAPOOL_DEBUG
RTE_LOG(DEBUG, SAPOOL, "%s: no entry (used/free %d/%d)\\n", __func__,
pool->used_cnt, pool->free_cnt);
#endif
pool->miss_cnt++;
return(EDPVS_RESOURCE);
}
//设置local ip和port
if (ss->ss_family == AF_INET)
{
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = ent->addr.in.s_addr;
sin->sin_port = ent->port;
}
else if (ss->ss_family == AF_INET6)
{
sin6->sin6_family = AF_INET6;
sin6->sin6_addr = ent->addr.in6;
sin6->sin6_port = ent->port;
}
else
{
return(EDPVS_NOTSUPP);
}
//标记entry正在使用中
ent->flags |= SA_F_USED;
//将其移动到used列表中
list_move_tail(&ent->list, &pool->used_enties);
//同时更新使用统计信息
pool->used_cnt++;
pool->free_cnt--;
#ifdef CONFIG_DPVS_SAPOOL_DEBUG
{
char addr[64];
RTE_LOG(DEBUG, SAPOOL, "%s: %s:%d fetched!\\n", __func__,
inet_ntop(ss->ss_family, &ent->addr, addr, sizeof(addr)) ? : NULL,
ntohs(ent->port));
}
#endif
return(EDPVS_OK);
}