4.1.12内核已经支持vxlan报文的gro功能,意味着vxlan报文交给协议栈之前,已经被聚合过了,而在早期的内核中聚合逻辑是在encap_rcv函数之后实现的。
之前分析的UDP报文处理中,可以知道如果udp_sock定义了encap_rcv函数,将会把报文交给该函数处理,而不是传统的保存到sock队列,唤醒进程收包。
udp_sock定义的encap_rcv函数是在vxlan_socket_create函数中设置的,实际是vxlan_udp_encap_recv函数。
vxlan_udp_encap_recv函数
/*
 * Callback from net/ipv4/udp.c to receive packets.
 *
 * Reads the VXLAN header that follows the UDP header of @skb, validates
 * its flags and VNI, and hands the packet to the vs->rcv handler of the
 * vxlan_sock stored as @sk's user data.
 *
 * Returns 0 when the packet was consumed here (passed to vs->rcv or
 * freed on the drop path), 1 when it is handed back to the caller as a
 * non-VXLAN packet (the error / bad_flags paths).
 */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_sock *vs;
	struct vxlanhdr *vxh;
	u32 flags, vni;
	struct vxlan_metadata md = {0};

	/* Need Vxlan and inner Ethernet header to be present */
	if (!pskb_may_pull(skb, VXLAN_HLEN))	/* packet length check */
		goto error;

	/*
	 * The VXLAN header is taken to start right after the UDP header:
	 * "udp_hdr(skb) + 1" is pointer arithmetic that steps over exactly
	 * one object of the type udp_hdr() returns.
	 */
	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
	flags = ntohl(vxh->vx_flags);
	vni = ntohl(vxh->vx_vni);

	if (flags & VXLAN_HF_VNI) {
		/* Flag handled; clear it so only unprocessed bits remain. */
		flags &= ~VXLAN_HF_VNI;
	} else {
		/* VNI flag always required to be set */
		goto bad_flags;
	}

	/*
	 * Pull VXLAN_HLEN bytes with protocol ETH_P_TEB; per the original
	 * annotation this advances the skb to the inner (encapsulated) frame.
	 */
	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
		goto drop;
	/*
	 * Header pointer is derived again after the pull -- presumably
	 * because the pull may have moved the skb's header data.
	 */
	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		goto drop;

	/*
	 * Remote checksum offload, only when the receiving socket enabled it.
	 * Original annotation (not verifiable from this excerpt): VXLAN_HF_RCO
	 * means the sender had VXLAN_F_REMCSUM_TX set and the packet's
	 * ip_summed was CHECKSUM_PARTIAL.
	 */
	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
				    !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL));
		/* A NULL result means remcsum processing failed: drop. */
		if (!vxh)
			goto drop;

		flags &= ~VXLAN_HF_RCO;		/* RCO flag handled */
		/*
		 * Keep only the bits covered by VXLAN_VNI_MASK; per the
		 * original annotation this clears the low 8 bits and leaves
		 * just the VNI ID.
		 */
		vni &= VXLAN_VNI_MASK;
	}

	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
		struct vxlanhdr_gbp *gbp;

		/* Reinterpret the same header bytes as the GBP layout. */
		gbp = (struct vxlanhdr_gbp *)vxh;
		md.gbp = ntohs(gbp->policy_id);

		if (gbp->dont_learn)
			md.gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md.gbp |= VXLAN_GBP_POLICY_APPLIED;

		flags &= ~VXLAN_GBP_USED_BITS;
	}

	/*
	 * At this point every handled flag has been cleared. Reject the
	 * packet if any flag bit is still set or if vni carries bits outside
	 * VXLAN_VNI_MASK.
	 */
	if (flags || vni & ~VXLAN_VNI_MASK) {
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in
		 * reserved fields are to be ignored. The approach here
		 * maintains compatibility with previous stack code, and also
		 * is more robust and provides a little more security in
		 * adding extensions to VXLAN.
		 */
		goto bad_flags;
	}

	/* Stored exactly as read from the header (no ntohl() applied). */
	md.vni = vxh->vx_vni;
	/*
	 * Per the original annotation: the kernel VXLAN driver installs its
	 * own vxlan_rcv here; when the port was created by the in-kernel
	 * OVS, the vxlan_rcv defined by OVS is used instead.
	 */
	vs->rcv(vs, skb, &md);
	return 0;

drop:
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;

bad_flags:
	/*
	 * NOTE(review): the second "goto bad_flags" above is reached after
	 * iptunnel_pull_header() has already run, so the skb handed back via
	 * "return 1" is no longer in its original state -- confirm that the
	 * caller tolerates this.
	 */
	netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
		   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));

	/* Falls through into the error return below. */
error:
	/* Return non vxlan pkt */
	return 1;
}
vxlan_rcv函数(内核自带OVS创建vxlan端口时指定)
/*
 * Receive handler used for VXLAN ports created by the in-kernel OVS.
 *
 * Builds an ovs_tunnel_info from the outer IP/UDP headers of @skb and the
 * VXLAN metadata in @md, then passes the packet to ovs_vport_receive() on
 * the vport stored in @vs->data.
 */
static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
		      struct vxlan_metadata *md)
{
	struct ovs_tunnel_info tun_info;
	struct vxlan_port *vxlan_port;
	struct vport *vport = vs->data;
	struct iphdr *iph;
	/* Tunnel options carry the group-based-policy value from @md. */
	struct ovs_vxlan_opts opts = {
		.gbp = md->gbp,
	};
	__be64 key;
	__be16 flags;

	/* TUNNEL_CSUM is added only when the UDP checksum field is non-zero. */
	flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);

	vxlan_port = vxlan_vport(vport);
	/* Advertise the VXLAN option only if GBP is enabled and present. */
	if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
		flags |= TUNNEL_VXLAN_OPT;

	/* Save outer tunnel values */
	iph = ip_hdr(skb);
	/*
	 * Tunnel key: md->vni converted with ntohl(), shifted right by 8,
	 * then widened to a big-endian 64-bit value.
	 */
	key = cpu_to_be64(ntohl(md->vni) >> 8);
	ovs_flow_tun_info_init(&tun_info, iph,
			       udp_hdr(skb)->source, udp_hdr(skb)->dest,
			       key, flags, &opts, sizeof(opts));

	/* Hand the packet to the OVS receive path. */
	ovs_vport_receive(vport, skb, &tun_info);
}
vxlan_rcv(内核自带)
/*
 * Receive handler of the kernel's built-in VXLAN driver.
 *
 * Looks up the VXLAN device for the VNI in @md, prepares @skb as an
 * Ethernet frame received on that device, optionally learns the sender
 * via vxlan_snoop(), applies ECN decapsulation, updates per-CPU receive
 * statistics and queues the packet with netif_rx(). On any failure the
 * skb is freed.
 */
static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
		      struct vxlan_metadata *md)
{
	struct iphdr *oip = NULL;
	struct ipv6hdr *oip6 = NULL;
	struct vxlan_dev *vxlan;
	struct pcpu_sw_netstats *stats;
	union vxlan_addr saddr;
	__u32 vni;
	int err = 0;
	union vxlan_addr *remote_ip;

	/* md->vni converted with ntohl() and shifted right by 8. */
	vni = ntohl(md->vni) >> 8;
	/* Is this VNI defined? */
	vxlan = vxlan_vs_find_vni(vs, vni);
	if (!vxlan)
		goto drop;

	remote_ip = &vxlan->default_dst.remote_ip;
	skb_reset_mac_header(skb);
	/* Second argument is true when the device lives in a different netns. */
	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
	/*
	 * Determine the frame's protocol against the VXLAN device.
	 * Per the original annotation this also sets skb->dev to the VXLAN
	 * device; the later vxlan_snoop(skb->dev, ...) call relies on that.
	 */
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	/*
	 * Called over the Ethernet header span (eth_hdr(skb), ETH_HLEN);
	 * presumably adjusts the skb checksum for the header that was just
	 * pulled -- TODO confirm.
	 */
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	/* i.e. drop when the source MAC equals the VXLAN device's own MAC. */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		goto drop;

	/* Re-examine inner Ethernet packet */
	/*
	 * NOTE(review): oip/oip6 are read before skb_reset_network_header()
	 * below, so they presumably still point at the outer IP header; the
	 * address family is chosen from the device's default remote address.
	 */
	if (remote_ip->sa.sa_family == AF_INET) {
		oip = ip_hdr(skb);
		saddr.sin.sin_addr.s_addr = oip->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		oip6 = ipv6_hdr(skb);
		saddr.sin6.sin6_addr = oip6->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	/*
	 * With learning enabled, record the (source MAC, source IP) pair in
	 * the VXLAN forwarding database; a non-zero result drops the packet.
	 */
	if ((vxlan->flags & VXLAN_F_LEARN) &&
	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
		goto drop;

	skb_reset_network_header(skb);
	/* The group-based-policy value from the metadata becomes the mark. */
	skb->mark = md->gbp;

	/* ECN decapsulation using the header captured above. */
	if (oip6)
		err = IP6_ECN_decapsulate(oip6, skb);
	if (oip)
		err = IP_ECN_decapsulate(oip, skb);

	if (unlikely(err)) {
		if (log_ecn_error) {
			if (oip6)
				net_info_ratelimited("non-ECT from %pI6\n",
						     &oip6->saddr);
			if (oip)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &oip->saddr, oip->tos);
		}
		/* Only err > 1 is fatal; err == 1 is at most logged. */
		if (err > 1) {
			++vxlan->dev->stats.rx_frame_errors;
			++vxlan->dev->stats.rx_errors;
			goto drop;
		}
	}

	/* Per-CPU receive counters, updated inside the u64_stats bracket. */
	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	/*
	 * Queue the decapsulated frame for the network stack; per the
	 * original annotation skb->dev is the VXLAN device at this point.
	 */
	netif_rx(skb);

	return;
drop:
	/* Consume bad packet */
	kfree_skb(skb);
}