【OVS2.5.0源码分析】vxlan发包流程分析

发包处理函数最终会调用到ovs_vport_send函数,该函数最终会调用vport_ops的send函数。

1、ovs_vport_send函数

void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	int mtu = vport->dev->mtu;

	if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
		net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
				     vport->dev->name,
				     packet_length(skb), mtu);
		vport->dev->stats.tx_errors++;
		goto drop;
	}

	skb->dev = vport->dev;		//vport关联的设备,vxlan端口的设备为vxlan_4789
	vport->ops->send(skb);		//实际调用ovs_vxlan_netdev_vport_ops的vxlan_xmit函数
	return;

drop:
	kfree_skb(skb);
}
2、vxlan_xmit函数

#define vxlan_xmit rpl_vxlan_xmit			//在3.18内核中,OVS还是使用自己的vxlan实现
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;		//ovs_vport_send函数中完成设置
	struct vxlan_dev *vxlan = netdev_priv(dev);	//该信息在创建vxlan端口对应的net_device时就初始化了
	const struct ip_tunnel_info *info;

	info = skb_tunnel_info(skb);    //得到tunnel信息,即execute_set_action函数设置的内容

	skb_reset_mac_header(skb);

	if ((vxlan->flags & VXLAN_F_PROXY))	//当前没有此标记
		goto out;

	if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
	    info && info->mode & IP_TUNNEL_INFO_TX) {
		vxlan_xmit_one(skb, dev, NULL, false);	//发送报文
		return NETDEV_TX_OK;
	}
out:
	pr_warn("vxlan: unsupported flag set %x", vxlan->flags);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}
3、vxlan_xmit_one函数

static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
			   struct vxlan_rdst *rdst, bool did_rsc)
{
	struct ip_tunnel_info *info;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct sock *sk = vxlan->vn_sock->sock->sk;
	unsigned short family = vxlan_get_sk_family(vxlan->vn_sock);	//通过sock判断是IPV4还是IPV6
	struct rtable *rt = NULL;
	const struct iphdr *old_iph;
	struct flowi4 fl4;
	union vxlan_addr *dst;
	union vxlan_addr remote_ip;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	__be16 src_port = 0, dst_port;
	u32 vni;
	__be16 df = 0;
	__u8 tos, ttl;
	int err;
	u32 flags = vxlan->flags;

	info = skb_tunnel_info(skb);		//从skb中获取tunnel info

	if (rdst) {		//不进入此分支
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
		vni = rdst->remote_vni;
		dst = &rdst->remote_ip;
	} else {
		if (!info) {		//说明当前实现,对于报文从vxlan端口出去,必须设置tunnel info
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
		if (family != ip_tunnel_info_af(info))		//判断协议类型是否一致
			goto drop;

		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;	//目的端口优先从tunnel info中获取
		vni = be64_to_cpu(info->key.tun_id);			//VNI信息从tunnel info中获取
		remote_ip.sa.sa_family = family;
		if (family == AF_INET)
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;	//目的IP地址从tunnel info中获取
		else
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;	//目的IP地址从tunnel info中获取
		dst = &remote_ip;
	}

	if (vxlan_addr_any(dst)) {		//目的IP地址为全零,当前不支持
		if (did_rsc) {
			/* short-circuited back to local bridge */
			WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
				  dev->name);
		}
		goto drop;
	}

	old_iph = ip_hdr(skb);			//skb的IP头

	ttl = vxlan->cfg.ttl;			//获取vxlan配置中的ttl
	if (!ttl && vxlan_addr_multicast(dst))
		ttl = 1;

	tos = vxlan->cfg.tos;			//获取vxlan配置中的tos
	if (tos == 1)
		tos = ip_tunnel_get_dsfield(old_iph, skb);

	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,	//计算源端口
				     vxlan->cfg.port_max, true);

	if (info) {
		if (info->key.tun_flags & TUNNEL_CSUM)
			flags |= VXLAN_F_UDP_CSUM;
		else
			flags &= ~VXLAN_F_UDP_CSUM;

		ttl = info->key.ttl;		//优先使用tunnel info中的ttl
		tos = info->key.tos;		//优先使用tunnel info中的tos

		if (info->options_len)
			md = ip_tunnel_info_opts(info);
	} else {
		md->gbp = skb->mark;
	}

	if (dst->sa.sa_family == AF_INET) {		//如果是IPV4
		if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT))
			df = htons(IP_DF);

		memset(&fl4, 0, sizeof(fl4));
		fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
		fl4.flowi4_tos = RT_TOS(tos);
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_proto = IPPROTO_UDP;
		fl4.daddr = dst->sin.sin_addr.s_addr;
		fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr;

		rt = ip_route_output_key(vxlan->net, &fl4);		//路由表查找
		if (IS_ERR(rt)) {
			netdev_dbg(dev, "no route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (rt_dst(rt).dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.collisions++;
			goto rt_tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		if (rt->rt_flags & RTCF_LOCAL &&
		    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {	//正常场景不进入此分支
			struct vxlan_dev *dst_vxlan;

			ip_rt_put(rt);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
				  dev->name);
			goto tx_error;
		}

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);		
		ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt));		//如果ttl为零,则从rt表项中获取
		err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,		//发送报文
				     dst->sin.sin_addr.s_addr, tos, ttl, df,
				     src_port, dst_port, htonl(vni << 8), md,
				     !net_eq(vxlan->net, dev_net(vxlan->dev)),
				     flags);
		if (err < 0) {
			/* skb is already freed. */
			skb = NULL;
			goto rt_tx_error;
		}

		iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
#if IS_ENABLED(CONFIG_IPV6)
	} else {			//IPV6情况,暂不分析
		struct dst_entry *ndst;
		struct flowi6 fl6;
		u32 rt6i_flags;

		memset(&fl6, 0, sizeof(fl6));
		fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
		fl6.daddr = dst->sin6.sin6_addr;
		fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_proto = IPPROTO_UDP;

#ifdef HAVE_IPV6_DST_LOOKUP_NET
		if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) {
#else
#ifdef HAVE_IPV6_STUB
		if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
#else
		ndst = ip6_route_output(vxlan->net, sk, &fl6);
		if (ndst->error) {
#endif
#endif
			netdev_dbg(dev, "no route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (ndst->dev == dev) {
			netdev_dbg(dev, "circular route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dst_release(ndst);
			dev->stats.collisions++;
			goto tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
		if (rt6i_flags & RTF_LOCAL &&
		    !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			dst_release(ndst);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
				  dev->name);
			goto tx_error;
		}

		ttl = ttl ? : ip6_dst_hoplimit(ndst);
		err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
				      0, ttl, src_port, dst_port, htonl(vni << 8), md,
				      !net_eq(vxlan->net, dev_net(vxlan->dev)),
				      flags);
#endif
	}

	return;

drop:
	dev->stats.tx_dropped++;
	goto tx_free;

rt_tx_error:
	ip_rt_put(rt);
tx_error:
	dev->stats.tx_errors++;
tx_free:
	dev_kfree_skb(skb);
}
4、vxlan_xmit_skb函数
static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
			  __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
			  __be16 src_port, __be16 dst_port, __be32 vni,
			  struct vxlan_metadata *md, bool xnet, u32 vxflags)
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
	bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM);		//默认为false
	int type = 0;

	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {		//默认不进此分支
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
		     skb->csum_offset == offsetof(struct tcphdr, check))) {
			udp_sum = false;
			type |= SKB_GSO_TUNNEL_REMCSUM;

			if (!SKB_GSO_TUNNEL_REMCSUM) {
				kfree_skb(skb);
				return -EOPNOTSUPP;
			}
		}
	}

	min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len	//vxlan外层的大小
			+ VXLAN_HLEN + sizeof(struct iphdr)
			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);			//扩展线性区长度
	if (unlikely(err)) {
		kfree_skb(skb);
		return err;
	}

	skb = vlan_hwaccel_push_inside(skb);			//vlan信息添加到payload中
	if (WARN_ON(!skb))
		return -ENOMEM;

	skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true);	//设置inner相关的head,设置encapsulation
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));	//添加vxlan头
	vxh->vx_flags = htonl(VXLAN_HF_VNI);
	vxh->vx_vni = vni;						//设置vni值

	if (type & SKB_GSO_TUNNEL_REMCSUM) {				//默认不进此分支
		u16 hdrlen = sizeof(struct vxlanhdr);
		u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
			   VXLAN_RCO_SHIFT;

		if (skb->csum_offset == offsetof(struct udphdr, check))
			data |= VXLAN_RCO_UDP;

		vxh->vx_vni |= htonl(data);
		vxh->vx_flags |= htonl(VXLAN_HF_RCO);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
			skb->encapsulation = 0;
#endif
		}
	}
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);

	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));	//设置内部协议

	return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,		//封装传输层,发送报文
				   ttl, df, src_port, dst_port, xnet,
				   !(vxflags & VXLAN_F_UDP_CSUM));
}

5、udp_tunnel_xmit_skb函数

int rpl_udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk,
			    struct sk_buff *skb, __be32 src, __be32 dst,
			    __u8 tos, __u8 ttl, __be16 df, __be16 src_port,
			    __be16 dst_port, bool xnet, bool nocheck)		//内核小于3.18,使用OVS实现
{
	struct udphdr *uh;

	__skb_push(skb, sizeof(*uh));		//封装UDP头
	skb_reset_transport_header(skb);	//设置UDP header指针
	uh = udp_hdr(skb);

	uh->dest = dst_port;
	uh->source = src_port;
	uh->len = htons(skb->len);

	udp_set_csum(nocheck, skb, src, dst, skb->len);	//计算UDP csum

	return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,	//IP层封装,发送报文	
			     tos, ttl, df, xnet);
}
6、iptunnel_xmit函数

int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
                      __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl,
                      __be16 df, bool xnet)		//内核小于3.18,使用OVS实现
{
	int pkt_len = skb->len;
	struct iphdr *iph;
	int err;

	skb_scrub_packet(skb, xnet);

	skb_clear_hash(skb);
	skb_dst_set(skb, &rt_dst(rt));

#if 0
	/* Do not clear ovs_skb_cb.  It will be done in gso code. */
	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
#endif

	/* Push down and install the IP header. */
	__skb_push(skb, sizeof(struct iphdr));		//添加IP头
	skb_reset_network_header(skb);			//设置ip header指针

	iph = ip_hdr(skb);

	iph->version	=	4;
	iph->ihl	=	sizeof(struct iphdr) >> 2;
	iph->frag_off	=	df;
	iph->protocol	=	proto;
	iph->tos	=	tos;
	iph->daddr	=	dst;
	iph->saddr	=	src;
	iph->ttl	=	ttl;

#ifdef HAVE_IP_SELECT_IDENT_USING_DST_ENTRY
	__ip_select_ident(iph, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1);
#elif defined(HAVE_IP_SELECT_IDENT_USING_NET)
	__ip_select_ident(dev_net(rt->dst.dev), iph,
			  skb_shinfo(skb)->gso_segs ?: 1);	//设置IP header的ID值,此为内核的方法;
#else
	__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
#endif

	err = ip_local_out(skb);			//发送报文
	if (unlikely(net_xmit_eval(err)))
		pkt_len = 0;
	return pkt_len;
}
7、ip_local_out函数

int rpl_ip_local_out(struct sk_buff *skb)	//内核小于3.18,使用OVS实现
{
	int ret = NETDEV_TX_OK;
	int id = -1;

	if (!OVS_GSO_CB(skb)->fix_segment)	//如果fix_segment为空,则直接发送报文,不进行GSO分段;默认会进行GSO分段
		return output_ip(skb);

	if (skb_is_gso(skb)) {
		struct iphdr *iph;

		iph = ip_hdr(skb);
		id = ntohs(iph->id);
		skb = tnl_skb_gso_segment(skb, 0, false);	//报文GSO分段
		if (!skb || IS_ERR(skb))
			return 0;
	}  else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		int err;

		err = skb_checksum_help(skb);
		if (unlikely(err))
			return 0;
	}

	while (skb) {
		struct sk_buff *next_skb = skb->next;
		struct iphdr *iph;

		skb->next = NULL;

		iph = ip_hdr(skb);
		if (id >= 0)
			iph->id = htons(id++);

		ret = output_ip(skb);		//发送分段后的报文
		skb = next_skb;
	}
	return ret;
}
8、output_ip函数

static int output_ip(struct sk_buff *skb)
{
	int ret = NETDEV_TX_OK;
	int err;

	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

#undef ip_local_out
	err = ip_local_out(skb);		//调用linux内核的接口
	if (unlikely(net_xmit_eval(err)))
		ret = err;

	return ret;
}
vxlan中checksum offload特性还待分析,当前仅对默认的行为进行了分析,待后续补充。

你可能感兴趣的:(OVS2.5.0源码分析)