arp_init
/*
 * Set up the ARP / neighbour subsystem.
 *
 * Initialises every bucket of the per-lcore neighbour hash table, records the
 * calling lcore as the master, registers the ARP packet type and the
 * neighbour sockopts, creates the neighbour rings and finally registers the
 * ring-processing job once for forwarding workers and once for the master.
 *
 * Returns EDPVS_OK on success, otherwise the error code of the first
 * registration that failed. Steps completed before a failure are not
 * rolled back.
 */
static int arp_init(void)
{
    int lcore, bucket;
    int err;

    /* each lcore owns a private neighbour table */
    for (lcore = 0; lcore < DPVS_MAX_LCORE; lcore++) {
        for (bucket = 0; bucket < NEIGH_TAB_SIZE; bucket++) {
            INIT_LIST_HEAD(&neigh_table[lcore][bucket]);
        }
    }

    master_cid = rte_lcore_id();

    /* register the ARP packet type described by arp_pkt_type */
    arp_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_ARP);
    err = netif_register_pkt(&arp_pkt_type);
    if (err != EDPVS_OK)
        return err;

    /* control-plane (sockopt) callbacks */
    err = sockopt_register(&neigh_sockopts);
    if (err != EDPVS_OK)
        return err;

    /* rings used to pass neighbour information between lcores */
    neigh_ring_init();

    /* forwarding workers: slow job, run every
     * NEIGH_PROCESS_MAC_RING_INTERVAL loops */
    snprintf(neigh_jobs[0].name, sizeof(neigh_jobs[0].name) - 1, "%s", "neigh_sync");
    neigh_jobs[0].func = neigh_process_ring;
    neigh_jobs[0].data = NULL;
    neigh_jobs[0].type = LCORE_JOB_SLOW;
    neigh_jobs[0].skip_loops = NEIGH_PROCESS_MAC_RING_INTERVAL;
    err = dpvs_lcore_job_register(&neigh_jobs[0], LCORE_ROLE_FWD_WORKER);
    if (err != EDPVS_OK)
        return err;

    /* master: same handler, registered as a regular loop job */
    snprintf(neigh_jobs[1].name, sizeof(neigh_jobs[1].name) - 1, "%s", "neigh_sync");
    neigh_jobs[1].func = neigh_process_ring;
    neigh_jobs[1].data = NULL;
    neigh_jobs[1].type = LCORE_JOB_LOOP;
    err = dpvs_lcore_job_register(&neigh_jobs[1], LCORE_ROLE_MASTER);
    if (err != EDPVS_OK)
        return err;

    return EDPVS_OK;
}
看下netif_deliver_mbuf中对ARP包的特殊处理
/*
 * Excerpt of netif_deliver_mbuf() showing only the ARP-specific handling;
 * the code before and after it is elided ("...") in this listing.
 *
 * For an ARP packet that did not itself come from a ring (!pkts_from_ring),
 * an ARP reply is cloned once for every other forwarding lcore and the clone
 * is enqueued on that lcore's arp_ring.
 */
static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf,
uint16_t eth_type,
struct netif_port *dev,
struct netif_queue_conf *qconf,
bool forward2kni,
lcoreid_t cid,
bool pkts_from_ring)
{
...
if (pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring)
{
struct rte_mempool *mbuf_pool;
struct rte_mbuf * mbuf_clone;
uint8_t i;
struct arp_hdr * arp;
unsigned socket_id;
/* clones are allocated from the pool indexed by the current socket id */
socket_id = rte_socket_id();
mbuf_pool = pktmbuf_pool[socket_id];
/* step over the Ethernet header to locate the ARP header, then put the
 * Ethernet header back so the mbuf is unchanged afterwards */
rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr));
arp = rte_pktmbuf_mtod(mbuf, struct arp_hdr *);
rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr));
/* only ARP replies are fanned out: clone the mbuf and enqueue one clone
 * on the arp_ring of every other forwarding lcore */
if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY)
{
for (i = 0; i < DPVS_MAX_LCORE; i++)
{
/* skip the current lcore, non-forwarding lcores and the master lcore */
if ((i == cid) || (!is_lcore_id_fwd(i)) ||
(i == rte_get_master_lcore()))
{
continue;
}
/*rte_pktmbuf_clone will not clone pkt.data, just copy pointer!*/
mbuf_clone = rte_pktmbuf_clone(mbuf, mbuf_pool);
/* a failed clone is skipped silently for this lcore */
if (mbuf_clone)
{
int ret = rte_ring_enqueue(arp_ring[i], mbuf_clone);
/* -EDQUOT only logs; the clone is not freed on this path.
 * NOTE(review): presumably the object was still enqueued (ring
 * watermark exceeded) - confirm against the DPDK version in use */
if (unlikely(-EDQUOT == ret))
{
RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d quota exceeded\\n",
__func__, i);
}
/* any other negative result: log and release the clone */
else if (ret < 0)
{
RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d enqueue failed\\n",
__func__, i);
rte_pktmbuf_free(mbuf_clone);
}
}
}
}
}
...
}
ARP协议pkt_type
/*
 * Packet-type descriptor for ARP: frames whose type equals ETHER_TYPE_ARP
 * (stored big-endian) are associated with neigh_resolve_input().
 * arp_init() re-assigns .type and passes this object to netif_register_pkt().
 * .port is NULL.
 */
static struct pkt_type arp_pkt_type = {
.type = rte_cpu_to_be_16(ETHER_TYPE_ARP),
.func = neigh_resolve_input,
.port = NULL,
};
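ARP数据包(注意:下面结构体为rte_arp_hdr/rte_arp_ipv4,其字段命名与本文其他代码中使用的struct arp_hdr(arp_hrd/arp_pro/arp_hln/arp_pln/arp_op)不同)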
/**
 * ARP header IPv4 payload.
 *
 * Sender and target hardware/protocol address pairs that follow the fixed
 * part of an ARP header. The structure is packed and 2-byte aligned, so it
 * can be overlaid on packet data; do not reorder the fields.
 */
struct rte_arp_ipv4 {
struct rte_ether_addr arp_sha; /**< sender hardware address */
uint32_t arp_sip; /**< sender IP address */
struct rte_ether_addr arp_tha; /**< target hardware address */
uint32_t arp_tip; /**< target IP address */
} __attribute__((__packed__)) __attribute__((aligned(2)));
/**
 * ARP header.
 *
 * Fixed ARP fields followed by the IPv4 address payload (arp_data). The
 * structure is packed and 2-byte aligned, so it can be overlaid on packet
 * data; do not reorder the fields.
 *
 * NOTE(review): the functions in this file access the header through
 * struct arp_hdr with fields arp_hrd / arp_pro / arp_hln / arp_pln / arp_op
 * and the ARP_OP_* constants, not through the names declared here - confirm
 * which definition the build actually uses.
 */
struct rte_arp_hdr {
uint16_t arp_hardware; /* format of hardware address */
#define RTE_ARP_HRD_ETHER 1 /* ARP Ethernet address format */
uint16_t arp_protocol; /* format of protocol address */
uint8_t arp_hlen; /* length of hardware address */
uint8_t arp_plen; /* length of protocol address */
uint16_t arp_opcode; /* ARP opcode (command) */
#define RTE_ARP_OP_REQUEST 1 /* request to resolve address */
#define RTE_ARP_OP_REPLY 2 /* response to previous request */
#define RTE_ARP_OP_REVREQUEST 3 /* request proto addr given hardware */
#define RTE_ARP_OP_REVREPLY 4 /* response giving protocol address */
#define RTE_ARP_OP_INVREQUEST 8 /* request to identify peer */
#define RTE_ARP_OP_INVREPLY 9 /* response identifying peer */
struct rte_arp_ipv4 arp_data;
} __attribute__((__packed__)) __attribute__((aligned(2)));
neigh_resolve_input
int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port)
{
//arp指向接收包的arp首部
struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *);
struct ether_hdr *eth;
uint32_t ipaddr;
struct neighbour_entry *neighbour = NULL;
unsigned int hashkey;
struct inet_ifaddr *ifa;
//根据ARP数据包中的arp_tip获取对应的IP信息配置块
ifa = inet_addr_ifa_get(AF_INET, port, (union inet_addr*)&arp->arp_data.arp_tip);
if (!ifa)
return EDPVS_KNICONTINUE;
inet_addr_ifa_put(ifa);
//eth指向L2层首部
eth = (struct ether_hdr *)rte_pktmbuf_prepend(m,
(uint16_t)sizeof(struct ether_hdr));
//判断ARP请求类型,如果是ARP_OP_REQUEST,生成ARP应答包(此处复用接收到的ARP请求包),调用netif_xmit发送出去
if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REQUEST) {
//填充回复包的L2层地址
ether_addr_copy(ð->s_addr, ð->d_addr);
rte_memcpy(ð->s_addr, &port->addr, 6);
//arp包中的操作类型变为ARP_OP_REPLY
arp->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY);
ether_addr_copy(&arp->arp_data.arp_sha, &arp->arp_data.arp_tha);
ether_addr_copy(ð->s_addr, &arp->arp_data.arp_sha);
ipaddr = arp->arp_data.arp_sip;
arp->arp_data.arp_sip = arp->arp_data.arp_tip;
arp->arp_data.arp_tip = ipaddr;
m->l2_len = sizeof(struct ether_hdr);
m->l3_len = sizeof(struct arp_hdr);
netif_xmit(m, port);
return EDPVS_OK;
} else if (arp->arp_op == htons(ARP_OP_REPLY)) {
ipaddr = arp->arp_data.arp_sip;
//如果数据包是ARP_OP_REPLY,根据源ip和网卡生成hashkey
hashkey = neigh_hashkey(AF_INET, (union inet_addr *)&ipaddr, port);
//查询neighbour条目
neighbour = neigh_lookup_entry(AF_INET, (union inet_addr *)&ipaddr,
port, hashkey);
//如果查找到邻居项缓存,并且不是STATIC类型,则首先更新缓存项中的邻居mac地址
if (neighbour && !(neighbour->flag & NEIGHBOUR_STATIC)) {
neigh_edit(neighbour, &arp->arp_data.arp_sha);
} else {
//否则,创建新的邻居缓存项并加入邻居项hash表中
neighbour = neigh_add_table(AF_INET, (union inet_addr *)&ipaddr,
&arp->arp_data.arp_sha, port, hashkey, 0);
if (!neighbour) {
RTE_LOG(ERR, NEIGHBOUR, "%s: add neighbour wrong\\n", __func__);
rte_pktmbuf_free(m);
return EDPVS_NOMEM;
}
}
//更新邻居项状态
neigh_entry_state_trans(neighbour, 1);
//将缓存在邻居项等待队列中的数据报发送出去
neigh_send_mbuf_cach(neighbour);
return EDPVS_KNICONTINUE;
} else {
//其他操作类型,dpvs不处理
rte_pktmbuf_free(m);
return EDPVS_DROP;
}
}
邻居项定义
/*
 * One entry of the per-lcore neighbour cache (see neigh_table).
 * Field order determines the memory layout; the struct is cache aligned.
 */
struct neighbour_entry {
/* address family of ip_addr (AF_INET and AF_INET6 are handled in this file) */
int af;
/* linkage into a neigh_table hash bucket */
struct list_head neigh_list;
/* neighbour IP address */
union inet_addr ip_addr;
/* neighbour MAC address */
struct ether_addr eth_addr;
/* port the neighbour is attached to */
struct netif_port *port;
/* timer that drives the neighbour state machine (neighbour_timer_event) */
struct dpvs_timer timer;
/* neighbour_mbuf_entry items (each holding an mbuf pointer) waiting to be
 * sent once the neighbour is resolved */
struct list_head queue_list;
/* presumably the number of items on queue_list - TODO confirm */
uint32_t que_num;
/* current state, one of DPVS_NUD_S_* */
uint32_t state;
/* seconds value stored by neigh_entry_state_trans() when it re-arms timer */
uint32_t ts;
/* bit flags; NEIGHBOUR_STATIC is tested against this field */
uint8_t flag;
} __rte_cache_aligned;
邻居项状态迁移
/*
 * Neighbour entry states. The values are used as indices into
 * nud_state.next_state[] and nud_timeouts[], so they must stay dense and
 * start at 0; DPVS_NUD_S_MAX is the number of real states.
 */
enum {
    DPVS_NUD_S_NONE      = 0,
    DPVS_NUD_S_SEND      = 1,
    DPVS_NUD_S_REACHABLE = 2,
    DPVS_NUD_S_PROBE     = 3,
    DPVS_NUD_S_DELAY     = 4,
    DPVS_NUD_S_MAX       = 5  /*Reserved*/
};
/*
 * Short aliases for the neighbour states, used to keep the nud_states
 * table compact.
 * NOTE(review): the table also uses sNKP, and neigh_entry_state_trans()
 * tests DPVS_NUD_S_KEEP; neither is defined in the visible part of this
 * file - presumably they are defined elsewhere, confirm.
 */
#define sNNO DPVS_NUD_S_NONE
#define sNSD DPVS_NUD_S_SEND
#define sNRE DPVS_NUD_S_REACHABLE
#define sNPR DPVS_NUD_S_PROBE
#define sNDE DPVS_NUD_S_DELAY
/*
 * One row of the state transition table: next_state[cur] is the state to
 * move to when the row's event occurs while the entry is in state cur.
 */
struct nud_state {
int next_state[DPVS_NUD_S_MAX];
};
/*
 * Neighbour state transition table, indexed as
 * nud_states[event].next_state[current_state].
 *
 * Rows are events (the idx argument of neigh_entry_state_trans), columns are
 * the current state. An sNKP cell is compared against DPVS_NUD_S_KEEP by
 * neigh_entry_state_trans(), which then leaves the entry untouched.
 */
static struct nud_state nud_states[] = {
    /* current state:       sNNO  sNSD  sNRE  sNPR  sNDE */
    /* 0: send arp    */ { {sNSD, sNSD, sNKP, sNDE, sNDE} },
    /* 1: recv arp    */ { {sNRE, sNRE, sNRE, sNRE, sNRE} },
    /* 2: ack confirm */ { {sNKP, sNKP, sNRE, sNRE, sNRE} },
    /* 3: mbuf ref    */ { {sNKP, sNKP, sNKP, sNPR, sNKP} },
    /* 4: timeout     */ { {sNNO, sNNO, sNPR, sNNO, sNNO} },
};
邻居项不同状态超时时间
/* default timeout of the REACHABLE state, in seconds */
#define DPVS_NEIGH_TIMEOUT_DEF 60
/*
 * Timer duration for each neighbour state, in seconds (the values are
 * assigned to timeval.tv_sec when the entry timer is armed).
 */
static int nud_timeouts[DPVS_NUD_S_MAX] = {
[DPVS_NUD_S_NONE] = 2,
[DPVS_NUD_S_SEND] = 3,
[DPVS_NUD_S_REACHABLE] = DPVS_NEIGH_TIMEOUT_DEF,
[DPVS_NUD_S_PROBE] = 30,
[DPVS_NUD_S_DELAY] = 3,
};
状态迁移处理
void neigh_entry_state_trans(struct neighbour_entry *neighbour, int idx)
{
struct timeval timeout;
/* DPVS_NUD_S_KEEP is not a real state, just use it to keep original state */
//如果状态迁移后保持原有状态或者邻居项的状态为STATIC(一般是系统管理员配置的),则不作任何处理
if ((nud_states[idx].next_state[neighbour->state] != DPVS_NUD_S_KEEP)
&& !(neighbour->flag & NEIGHBOUR_STATIC)) {
//首先获取原有状态
int old_state = neighbour->state;
struct timespec now = { 0 };
//设置邻居项的新状态
neighbour->state = nud_states[idx].next_state[neighbour->state];
if (neighbour->state == old_state) {
if (likely(clock_gettime(CLOCK_REALTIME_COARSE, &now)) == 0)
/* frequent timer updates hurt performance,
* do not update timer unless half timeout passed */
if ((now.tv_sec - neighbour->ts) * 2 < nud_timeouts[old_state])
return;
}
//重新获取邻居项的超时时间,更新超时定时器
timeout.tv_sec = nud_timeouts[neighbour->state];
timeout.tv_usec = 0;
dpvs_time_rand_delay(&timeout, 200000); /* delay 200ms randomly to avoid timer performance problem */
dpvs_timer_update_nolock(&neighbour->timer, &timeout, false);
neighbour->ts = now.tv_sec;
#ifdef CONFIG_DPVS_NEIGH_DEBUG
if (neighbour->state != old_state)
{
char buf[512];
dump_neigh_entry(neighbour, buf, sizeof(buf));
RTE_LOG(INFO, NEIGHBOUR, "[%02d] neighbor (%s) trans state: %s -> %s, idx:%d.\\n",
rte_lcore_id(), buf, nud_state_name(old_state),
nud_state_name(neighbour->state), idx);
}
#endif
}
}
创建新的邻居项
neigh_add_table
/*
 * Allocate a neighbour entry, fill it in and insert it into the calling
 * lcore's neighbour table.
 *
 * @af:       address family stored in the entry.
 * @ipaddr:   neighbour IP address (copied).
 * @eth_addr: neighbour MAC, or NULL when it is not known yet. With a MAC the
 *            entry starts in DPVS_NUD_S_REACHABLE, without one in
 *            DPVS_NUD_S_NONE.
 * @port:     port the neighbour is attached to.
 * @hashkey:  bucket key passed to neigh_hash().
 * @flag:     entry flags; with NEIGHBOUR_STATIC no timer is scheduled.
 *
 * Returns the new entry, or NULL if the allocation fails.
 */
struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr,
                                        const struct ether_addr *eth_addr,
                                        struct netif_port *port,
                                        unsigned int hashkey, int flag)
{
    struct neighbour_entry *new_neighbour;
    struct timeval delay;
    lcoreid_t cid = rte_lcore_id();

    new_neighbour = dpvs_mempool_get(neigh_mempool, sizeof(struct neighbour_entry));
    if (unlikely(new_neighbour == NULL))
        return NULL;

    /*
     * Start from a fully zeroed entry. Nothing visible here guarantees that
     * the pool returns zeroed memory, and the code below does not set every
     * field: ts (read by neigh_entry_state_trans), eth_addr when no MAC is
     * given, and timer for static entries would otherwise hold stale bytes.
     */
    memset(new_neighbour, 0, sizeof(*new_neighbour));

    rte_memcpy(&new_neighbour->ip_addr, ipaddr, sizeof(union inet_addr));
    new_neighbour->flag = flag;
    new_neighbour->af = af;

    if (eth_addr) {
        /* MAC already known, e.g. learned from a received reply */
        rte_memcpy(&new_neighbour->eth_addr, eth_addr,
                   sizeof(new_neighbour->eth_addr));
        new_neighbour->state = DPVS_NUD_S_REACHABLE;
    } else {
        /* unresolved entry */
        new_neighbour->state = DPVS_NUD_S_NONE;
    }

    new_neighbour->port = port;
    new_neighbour->que_num = 0;
    INIT_LIST_HEAD(&new_neighbour->queue_list);

    /* the initial timer duration depends on the initial state */
    delay.tv_sec = nud_timeouts[new_neighbour->state];
    delay.tv_usec = 0;

    /* dynamic entries are driven by neighbour_timer_event, which removes an
     * entry that times out in DPVS_NUD_S_NONE; static entries get no timer */
    if (!(new_neighbour->flag & NEIGHBOUR_STATIC)) {
        dpvs_time_rand_delay(&delay, 200000); /* delay 200ms randomly to avoid timer performance problem */
        dpvs_timer_sched(&new_neighbour->timer, &delay,
                         neighbour_timer_event, new_neighbour, false);
    }

    /* insert into this lcore's table and account for it */
    neigh_hash(new_neighbour, hashkey);
    neigh_nums[cid]++;

#ifdef CONFIG_DPVS_NEIGH_DEBUG
    {
        char buf[512];
        dump_neigh_entry(new_neighbour, buf, sizeof(buf));
        RTE_LOG(INFO, NEIGHBOUR, "[%02d] add neigh entry: %s\\n", cid, buf);
    }
#endif

    return new_neighbour;
}
邻居项超时处理
/*
 * Timer callback of a neighbour entry; @data is the struct neighbour_entry.
 *
 * An entry that times out in DPVS_NUD_S_NONE is expired and the result of
 * neigh_entry_expire() is returned. In any other state the "timeout" event
 * (row 4 of nud_states) is applied and DTIMER_OK is returned.
 */
static int neighbour_timer_event(void *data)
{
    struct neighbour_entry *entry = data;

    if (entry->state != DPVS_NUD_S_NONE) {
        /* advance the state machine with the timeout event */
        neigh_entry_state_trans(entry, 4);
        return DTIMER_OK;
    }

    /* nothing resolved before the timer fired: drop the entry */
    return neigh_entry_expire(entry);
}
邻居项清理
/*
 * Remove @neighbour from the calling lcore's table and release it.
 *
 * Cancels the entry timer, unhashes the entry, frees every packet still
 * queued on it, returns the entry to neigh_mempool and decrements the
 * per-lcore entry counter. Asserts that it is not running on the master
 * lcore. Always returns DTIMER_STOP.
 */
static int neigh_entry_expire(struct neighbour_entry *neighbour)
{
    struct neighbour_mbuf_entry *pending, *pending_next;
    lcoreid_t lcore = rte_lcore_id();

    assert(lcore != master_cid);

    /* stop the timer before the entry disappears from the table */
    dpvs_timer_cancel_nolock(&neighbour->timer, false);
    neigh_unhash(neighbour);

#ifdef CONFIG_DPVS_NEIGH_DEBUG
    {
        char buf[512];
        dump_neigh_entry(neighbour, buf, sizeof(buf));
        RTE_LOG(INFO, NEIGHBOUR, "%s:[%02d] del neigh entry: %s\\n", __func__, lcore, buf);
    }
#endif

    /* release pkts saved in neighbour entry */
    list_for_each_entry_safe(pending, pending_next,
                             &neighbour->queue_list, neigh_mbuf_list) {
        list_del(&pending->neigh_mbuf_list);
        rte_pktmbuf_free(pending->m);
        dpvs_mempool_put(neigh_mempool, pending);
    }

    /* finally give the entry itself back to the pool */
    dpvs_mempool_put(neigh_mempool, neighbour);
    neigh_nums[lcore]--;

    return DTIMER_STOP;
}
邻居项确认
/*
 * Confirm reachability of @nexthop on @port.
 *
 * Walks the matching hash bucket of the calling lcore's neighbour table and
 * applies the "ack confirm" event (row 2 of nud_states) to every non-static
 * entry that matches (@af, @nexthop, @port).
 */
void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port)
{
    struct neighbour_entry *entry;
    unsigned int bucket;
    lcoreid_t lcore = rte_lcore_id();

    /*find nexhop/neighbour to confirm, no matter whether it is the route in*/
    bucket = neigh_hashkey(af, nexthop, port);

    list_for_each_entry(entry, &neigh_table[lcore][bucket], neigh_list) {
        if (!neigh_key_cmp(af, entry, nexthop, port))
            continue;
        if (entry->flag & NEIGHBOUR_STATIC)
            continue;
        neigh_entry_state_trans(entry, 2);
    }
}
发送ARP请求(neigh_state_confirm / neigh_send_arp)
static void neigh_state_confirm(struct neighbour_entry *neighbour)
{
union inet_addr saddr, daddr;
memset(&saddr, 0, sizeof(saddr));
if (neighbour->af == AF_INET) {
daddr.in.s_addr = neighbour->ip_addr.in.s_addr;
//选择出口saddr
inet_addr_select(AF_INET, neighbour->port, &daddr, 0, &saddr);
if (!saddr.in.s_addr)
RTE_LOG(ERR, NEIGHBOUR, "%s: no source ip\\n", __func__);
//发送ARP请求
if (neigh_send_arp(neighbour->port, saddr.in.s_addr,
daddr.in.s_addr) != EDPVS_OK)
RTE_LOG(ERR, NEIGHBOUR, "%s: send arp failed\\n", __func__);
} else if (neighbour->af == AF_INET6) {
ipv6_addr_copy(&daddr.in6, &neighbour->ip_addr.in6);
inet_addr_select(AF_INET6, neighbour->port, &daddr, 0, &saddr);
if (ipv6_addr_any(&saddr.in6))
RTE_LOG(ERR, NEIGHBOUR, "%s: no source ip\\n", __func__);
ndisc_solicit(neighbour, &saddr.in6);
}
}
/*
 * Build a broadcast ARP request and transmit it on @port.
 *
 * @port:   output port; its mbuf_pool supplies the packet and its MAC is
 *          used as the Ethernet source and ARP sender hardware address.
 * @src_ip: sender IP placed in the request, copied as given.
 * @dst_ip: target IP to resolve, copied as given.
 *
 * The frame is 60 bytes long: Ethernet header, ARP header and zero padding.
 *
 * Returns EDPVS_OK once the packet has been handed to netif_xmit() (its
 * result is not inspected), or EDPVS_NOMEM if no mbuf could be allocated or
 * the mbuf has no room for the frame.
 */
static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst_ip)
{
    /* length of the frame handed to the driver, padding included */
    const uint16_t frame_len = 60;
    struct rte_mbuf *m;
    struct ether_hdr *eth;
    struct arp_hdr *arp;
    uint32_t addr;

    m = rte_pktmbuf_alloc(port->mbuf_pool);
    if (unlikely(m == NULL)) {
        return EDPVS_NOMEM;
    }
    m->userdata = NULL;

    /*
     * Reserve the whole frame before writing to it. rte_pktmbuf_append()
     * sets pkt_len/data_len and fails when the tailroom is too small,
     * instead of the lengths being forced to 60 after the fact.
     */
    eth = (struct ether_hdr *)rte_pktmbuf_append(m, frame_len);
    if (unlikely(eth == NULL)) {
        rte_pktmbuf_free(m);
        return EDPVS_NOMEM;
    }
    /* zero headers and trailing padding in one go */
    memset(eth, 0, frame_len);

    /* the ARP header directly follows the Ethernet header */
    arp = (struct arp_hdr *)&eth[1];

    /* L2: broadcast from this port */
    memset(&eth->d_addr, 0xFF, 6);
    ether_addr_copy(&port->addr, &eth->s_addr);
    eth->ether_type = htons(ETHER_TYPE_ARP);

    /* ARP payload: sender is this port, target MAC stays all-zero */
    rte_memcpy(&arp->arp_data.arp_sha, &port->addr, 6);
    addr = src_ip;
    inetAddrCopy(&arp->arp_data.arp_sip, &addr);
    addr = dst_ip;
    inetAddrCopy(&arp->arp_data.arp_tip, &addr);

    /* fixed ARP fields: Ethernet / IPv4 request */
    arp->arp_hrd = htons(ARP_HRD_ETHER);
    arp->arp_pro = htons(ETHER_TYPE_IPv4);
    arp->arp_hln = 6;
    arp->arp_pln = 4;
    arp->arp_op = htons(ARP_OP_REQUEST);

    m->l2_len = sizeof(struct ether_hdr);
    m->l3_len = sizeof(struct arp_hdr);

#ifdef CONFIG_DPVS_NEIGH_DEBUG
    dump_arp_hdr("send", arp, port->id);
#endif

    netif_xmit(m, port);
    return EDPVS_OK;
}