【Linux4.1.12源码分析】VXLAN报文内核协议栈处理

4.1.12内核已经支持vxlan报文的gro功能,意味着vxlan报文交给协议栈之前,已经被聚合过了,而在早期的内核中聚合逻辑是在encap_rcv函数之后实现的。

之前分析的UDP报文处理中,可以知道如果udp_sock定义了encap_rcv函数,将会把报文交给该函数处理,而不是传统的保存到sock队列,唤醒进程收包。

udp_sock定义的encap_rcv函数是在vxlan_socket_create函数中设置的,实际是vxlan_udp_encap_recv函数。

vxlan_udp_encap_recv函数

/*
 * Callback from net/ipv4/udp.c to receive packets.
 *
 * Validates the VXLAN header that follows the UDP header, handles the RCO
 * and GBP header extensions when the matching bits are set in vs->flags,
 * and hands the packet to the receive handler stored in the vxlan_sock
 * (vs->rcv).
 *
 * Returns 0 when the skb was passed to vs->rcv or freed here, and 1 when
 * the packet is too short or carries unexpected flags (the skb is not
 * freed on that path).
 */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_sock *vs;
	struct vxlanhdr *vxh;
	u32 flags, vni;
	struct vxlan_metadata md = {0};

	/* Need Vxlan and inner Ethernet header to be present */
	if (!pskb_may_pull(skb, VXLAN_HLEN))	/* packet length check */
		goto error;

	/* "+ 1" on the UDP header pointer steps over exactly one UDP header,
	 * so the result is the first byte after it: the VXLAN header.
	 */
	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);	/* VXLAN header pointer */
	flags = ntohl(vxh->vx_flags);	/* host-order copies used for the checks below */
	vni = ntohl(vxh->vx_vni);

	if (flags & VXLAN_HF_VNI) {		/* a VXLAN packet must carry this flag */
		flags &= ~VXLAN_HF_VNI;
	} else {
		/* VNI flag always required to be set */
		goto bad_flags;
	}

	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))	/* step to the inner frame; failure drops the packet */
		goto drop;
	/* Header pointer is derived again after the pull.
	 * NOTE(review): presumably because the pull may relocate skb data;
	 * confirm against iptunnel_pull_header.
	 */
	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);

	/* The vxlan_sock is kept in the socket's user data; no socket state,
	 * no receive handler, so the packet is dropped.
	 */
	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		goto drop;

	/* NOTE(review): per the original annotation the sender sets
	 * VXLAN_HF_RCO when VXLAN_F_REMCSUM_TX is enabled and the skb is
	 * CHECKSUM_PARTIAL; that is not visible in this function, confirm
	 * against the transmit path.
	 */
	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {  /* RCO flag in header and REMCSUM_RX set in vs->flags */
		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,	/* remote checksum offload processing */
				    !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL));	/* a NULL result drops the packet */
		if (!vxh)
			goto drop;

		flags &= ~VXLAN_HF_RCO;		/* RCO flag has been consumed */
		vni &= VXLAN_VNI_MASK;		/* keep only the bits covered by VXLAN_VNI_MASK */
	}

	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
		struct vxlanhdr_gbp *gbp;

		/* Reinterpret the header as the GBP layout and collect the
		 * policy id plus the two GBP bits into md.gbp.
		 */
		gbp = (struct vxlanhdr_gbp *)vxh;
		md.gbp = ntohs(gbp->policy_id);

		if (gbp->dont_learn)
			md.gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md.gbp |= VXLAN_GBP_POLICY_APPLIED;

		flags &= ~VXLAN_GBP_USED_BITS;
	}

	if (flags || vni & ~VXLAN_VNI_MASK) {	/* leftover flags, or vni bits outside VXLAN_VNI_MASK */
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * in reserved fields are to be ignored. The approach here
		 * maintains compatibility with previous stack code, and also
		 * is more robust and provides a little more security in
		 * adding extensions to VXLAN.
		 */

		goto bad_flags;
	}

	/* Pass the raw header field (not the ntohl'ed local copy) on. */
	md.vni = vxh->vx_vni;
	vs->rcv(vs, skb, &md);	/* per the article: the vxlan driver's vxlan_rcv, or OVS's own vxlan_rcv for ports created by the in-kernel OVS */
	return 0;

drop:
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;

bad_flags:
	netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
		   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));

	/* bad_flags falls through into error: the skb is not freed here. */
error:
	/* Return non vxlan pkt */
	return 1;
}
vxlan_rcv函数(内核自带OVS创建vxlan端口时指定)

/*
 * Receive handler used for vxlan ports created by the in-kernel OVS.
 *
 * Builds the OVS tunnel metadata (tunnel key taken from the VNI, tunnel
 * flags, GBP option, outer addresses and UDP ports) and passes the packet
 * to the OVS vport receive path.
 */
static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
		      struct vxlan_metadata *md)
{
	struct vport *vport = vs->data;
	struct vxlan_port *vxlan_port = vxlan_vport(vport);
	struct ovs_vxlan_opts opts = {
		.gbp = md->gbp,
	};
	struct ovs_tunnel_info tun_info;
	__be16 flags = TUNNEL_KEY;
	__be64 key;

	/* A non-zero outer UDP checksum is reported as TUNNEL_CSUM. */
	if (udp_hdr(skb)->check != 0)
		flags |= TUNNEL_CSUM;

	/* Advertise the GBP option only when the port enables it and the
	 * packet actually carried a non-zero value.
	 */
	if ((vxlan_port->exts & VXLAN_F_GBP) && md->gbp)
		flags |= TUNNEL_VXLAN_OPT;

	/* md->vni is in network byte order; shifting the host-order value
	 * right by 8 gives the number used as the tunnel key.
	 */
	key = cpu_to_be64(ntohl(md->vni) >> 8);

	/* Save outer tunnel values */
	ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
			       udp_hdr(skb)->source, udp_hdr(skb)->dest,
			       key, flags, &opts, sizeof(opts));

	/* Hand the packet to the OVS receive function. */
	ovs_vport_receive(vport, skb, &tun_info);
}
vxlan_rcv函数(内核自带vxlan驱动的实现)

/*
 * Receive handler of the kernel's own vxlan driver.
 *
 * Looks up the vxlan device for the packet's VNI, prepares the skb as an
 * Ethernet frame received on that device, optionally learns the source
 * MAC / outer source address pair, applies ECN decapsulation, updates the
 * per-CPU receive counters and queues the skb with netif_rx().
 * The skb is freed on every drop path.
 */
static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
		      struct vxlan_metadata *md)
{
	struct iphdr *oip = NULL;
	struct ipv6hdr *oip6 = NULL;
	struct vxlan_dev *vxlan;
	struct pcpu_sw_netstats *stats;
	union vxlan_addr saddr;
	__u32 vni;
	int err = 0;
	union vxlan_addr *remote_ip;

	/* md->vni is in network byte order; the lookup key is the host-order
	 * value shifted right by 8.
	 */
	vni = ntohl(md->vni) >> 8;
	/* Is this VNI defined? */
	vxlan = vxlan_vs_find_vni(vs, vni);
	if (!vxlan)
		goto drop;

	remote_ip = &vxlan->default_dst.remote_ip;
	skb_reset_mac_header(skb);	/* MAC header now starts at the current data pointer */
	/* Second argument is true when the device lives in a different
	 * network namespace than vxlan->net.
	 */
	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
	skb->protocol = eth_type_trans(skb, vxlan->dev);	/* determine the frame's protocol; also sets skb->dev to the vxlan device */
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);	/* checksum adjustment for the ETH_HLEN bytes of Ethernet header */

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))	/* source MAC equals the vxlan device's MAC: drop */
		goto drop;

	/* Re-examine inner Ethernet packet */
	/* The network header has not been reset yet at this point, so
	 * ip_hdr()/ipv6_hdr() still refer to the outer header; its source
	 * address is recorded in saddr. The address family is chosen from the
	 * device's default remote address.
	 */
	if (remote_ip->sa.sa_family == AF_INET) {
		oip = ip_hdr(skb);
		saddr.sin.sin_addr.s_addr = oip->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		oip6 = ipv6_hdr(skb);
		saddr.sin6.sin6_addr = oip6->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	if ((vxlan->flags & VXLAN_F_LEARN) &&
	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))	/* FDB learning: record the MAC / IP association; non-zero result drops */
		goto drop;

	/* From here on the network header refers to the inner packet. */
	skb_reset_network_header(skb);
	skb->mark = md->gbp;

	if (oip6)
		err = IP6_ECN_decapsulate(oip6, skb);
	if (oip)
		err = IP_ECN_decapsulate(oip, skb);	/* ECN check between outer header and inner packet */

	if (unlikely(err)) {
		if (log_ecn_error) {
			if (oip6)
				net_info_ratelimited("non-ECT from %pI6\n",
						     &oip6->saddr);
			if (oip)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &oip->saddr, oip->tos);
		}
		/* Only err > 1 is fatal: count it and drop. Other non-zero
		 * values are (optionally) logged and the packet continues.
		 */
		if (err > 1) {
			++vxlan->dev->stats.rx_frame_errors;
			++vxlan->dev->stats.rx_errors;
			goto drop;
		}
	}

	/* Per-CPU receive counters, updated inside the u64_stats section. */
	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	netif_rx(skb);			/* hand over to the network stack; skb->dev is the vxlan device */

	return;
drop:
	/* Consume bad packet */
	kfree_skb(skb);
}


你可能感兴趣的:(Linux4.1.12源码分析)