VXLAN 的 GRO 处理不是通过静态定义的常量实现的,而是通过动态注册实现的。在分析 UDP 层处理时我们已经知道,它注册到全局变量 udp_offload_base 中。下面先看注册过程:
vxlan_socket_create函数
/*
 * Create new listen socket if needed.
 *
 * Allocates a vxlan_sock, opens the UDP socket for @port, fills in the
 * per-socket GRO offload callbacks, links the socket into the per-net hash
 * under vn->sock_lock and finally marks it as a UDP encapsulation socket.
 *
 * @net:   network namespace the socket lives in
 * @port:  UDP port, network byte order
 * @rcv:   receive handler stored in vs->rcv
 * @data:  opaque pointer stored in vs->data
 * @flags: VXLAN_F_* flags; only the VXLAN_F_RCV_FLAGS subset is kept in vs
 *
 * Returns the new vxlan_sock, ERR_PTR(-ENOMEM) if the allocation fails, or
 * the error from vxlan_create_sock() re-cast with ERR_CAST().
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
					      vxlan_rcv_t *rcv, void *data,
					      u32 flags)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
	bool ipv6 = !!(flags & VXLAN_F_IPV6);
	struct udp_tunnel_sock_cfg tunnel_cfg;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

	INIT_WORK(&vs->del_work, vxlan_del_work);

	sock = vxlan_create_sock(net, ipv6, port, flags);
	if (IS_ERR(sock)) {
		pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
			PTR_ERR(sock));
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	atomic_set(&vs->refcnt, 1);
	vs->rcv = rcv;
	vs->data = data;
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);

	/* Initialize the vxlan udp offloads structure */
	vs->udp_offloads.port = port;
	vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive;
	vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;

	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	/* This is where the VXLAN GRO offload gets registered. */
	vxlan_notify_add_rx_port(vs);
	spin_unlock(&vn->sock_lock);

	/* Mark socket as an encapsulation socket. */
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = vxlan_udp_encap_recv;
	tunnel_cfg.encap_destroy = NULL;

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return vs;
}
vxlan_notify_add_rx_port函数
static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) { struct net_device *dev; struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); sa_family_t sa_family = sk->sk_family; __be16 port = inet_sk(sk)->inet_sport; int err; if (sa_family == AF_INET) { err = udp_add_offload(&vs->udp_offloads);<span style="white-space:pre"> </span>//注册vxlan gro offload对象 if (err) pr_warn("vxlan: udp_add_offload failed with status %d\n", err); } rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (dev->netdev_ops->ndo_add_vxlan_port) dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, port); } rcu_read_unlock(); }udp_add_offload函数
int udp_add_offload(struct udp_offload *uo) { struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); if (!new_offload) return -ENOMEM; new_offload->offload = uo; spin_lock(&udp_offload_lock); new_offload->next = udp_offload_base; rcu_assign_pointer(udp_offload_base, new_offload);<span style="white-space:pre"> </span>//注册到udp_offload_base全局变量中 spin_unlock(&udp_offload_lock); return 0; }接下来,我们看下vxlan gro offload的实现,包括vxlan_gro_receive和vxlan_gro_complete:
vxlan_gro_receive函数
static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb, struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; int flush = 1; struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, //通过udp_offload对象获得vxlan_sock对象 udp_offloads); u32 flags; struct gro_remcsum grc; skb_gro_remcsum_init(&grc); //初始化grc off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); vh = skb_gro_header_fast(skb, off_vx); //得到vxlan头 if (skb_gro_header_hard(skb, hlen)) { vh = skb_gro_header_slow(skb, hlen, off_vx); if (unlikely(!vh)) goto out; } skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ //移动到数据区,实际是内层mac头 skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); //csum值刷新 flags = ntohl(vh->vx_flags); if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { //即发送方携带SKB_GSO_TUNNEL_REMCSUM标记 vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), //remcsum校验和刷新 ntohl(vh->vx_vni), &grc, !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); if (!vh) //校验不通过,提交当前报文到协议栈 goto out; } flush = 0; for (p = *head; p; p = p->next) { //遍历gro_list中的报文 if (!NAPI_GRO_CB(p)->same_flow) continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); if (vh->vx_flags != vh2->vx_flags || //flags和vni相同才是同一个流 vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } pp = eth_gro_receive(head, skb); //调用mac层的gro_receive out: skb_gro_remcsum_cleanup(skb, &grc); NAPI_GRO_CB(skb)->flush |= flush; return pp; }eth_gro_receive函数
struct sk_buff **eth_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct sk_buff *p, **pp = NULL; struct ethhdr *eh, *eh2; unsigned int hlen, off_eth; const struct packet_offload *ptype; __be16 type; int flush = 1; off_eth = skb_gro_offset(skb); hlen = off_eth + sizeof(*eh); eh = skb_gro_header_fast(skb, off_eth); //得到mac头 if (skb_gro_header_hard(skb, hlen)) { eh = skb_gro_header_slow(skb, hlen, off_eth); if (unlikely(!eh)) goto out; } flush = 0; for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; eh2 = (struct ethhdr *)(p->data + off_eth); if (compare_ether_header(eh, eh2)) { //mac头相同则为同一个流 NAPI_GRO_CB(p)->same_flow = 0; continue; } } type = eh->h_proto; //得到3层协议类型,例如IPV4协议 rcu_read_lock(); ptype = gro_find_receive_by_type(type); if (ptype == NULL) {<span style="white-space:pre"> </span>//未找到协议注册的gro offload等,则把报文提交给协议栈 flush = 1; goto out_unlock; } skb_gro_pull(skb, sizeof(*eh)); //报文移到IP头 skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); //刷新csum值 pp = ptype->callbacks.gro_receive(head, skb); //调用ip层的gro_receive函数 out_unlock: rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; return pp; }
static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, struct udp_offload *uoff) { udp_tunnel_gro_complete(skb, nhoff); //设置skb_shinfo(skb)->gso_type值 return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); }eth_gro_complete函数
int eth_gro_complete(struct sk_buff *skb, int nhoff) { struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); //得到mac头 __be16 type = eh->h_proto; struct packet_offload *ptype; int err = -ENOSYS; if (skb->encapsulation) skb_set_inner_mac_header(skb, nhoff); //设置inner mac header rcu_read_lock(); ptype = gro_find_complete_by_type(type); if (ptype != NULL) err = ptype->callbacks.gro_complete(skb, nhoff + //调用ip层的gro_complete函数 sizeof(struct ethhdr)); rcu_read_unlock(); return err; }