发包处理函数最终会调用ovs_vport_send函数,该函数再通过vport->ops->send回调,调用该端口对应的vport_ops中的send函数。
1、ovs_vport_send函数
void ovs_vport_send(struct vport *vport, struct sk_buff *skb) { int mtu = vport->dev->mtu; if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", vport->dev->name, packet_length(skb), mtu); vport->dev->stats.tx_errors++; goto drop; } skb->dev = vport->dev; //vport关联的设备,vxlan端口的设备为vxlan_4789 vport->ops->send(skb);<span style="white-space:pre"> </span>//实际调用ovs_vxlan_netdev_vport_ops的vxlan_xmit函数 return; drop: kfree_skb(skb); }2、vxlan_xmit函数
#define vxlan_xmit rpl_vxlan_xmit<span style="white-space:pre"> </span>//在3.18内核中,OVS还是使用自己的vxlan实现 netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev;<span style="white-space:pre"> </span>//ovs_vport_send函数中完成设置 struct vxlan_dev *vxlan = netdev_priv(dev);<span style="white-space:pre"> </span>//该信息在创建vxlan端口对应的net_device时就初始化了 const struct ip_tunnel_info *info; info = skb_tunnel_info(skb); //得到tunnel信息,即execute_set_action函数设置的内容 skb_reset_mac_header(skb); if ((vxlan->flags & VXLAN_F_PROXY))<span style="white-space:pre"> </span>//当前没有此标记 goto out; if (vxlan->flags & VXLAN_F_COLLECT_METADATA && info && info->mode & IP_TUNNEL_INFO_TX) { vxlan_xmit_one(skb, dev, NULL, false);<span style="white-space:pre"> </span>//发送报文 return NETDEV_TX_OK; } out: pr_warn("vxlan: unsupported flag set %x", vxlan->flags); kfree_skb(skb); return NETDEV_TX_OK; }3、vxlan_xmit_one函数
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; unsigned short family = vxlan_get_sk_family(vxlan->vn_sock); //通过sock判断是IPV4还是IPV6 struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; union vxlan_addr remote_ip; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; int err; u32 flags = vxlan->flags; info = skb_tunnel_info(skb); //从skb中获取tunnel info if (rdst) { //不进入此分支 dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = rdst->remote_vni; dst = &rdst->remote_ip; } else { if (!info) { //说明当前实现,对于报文从vxlan端口出去,必须设置tunnel info WARN_ONCE(1, "%s: Missing encapsulation instructions\n", dev->name); goto drop; } if (family != ip_tunnel_info_af(info)) //判断协议类型是否一致 goto drop; dst_port = info->key.tp_dst ? 
: vxlan->cfg.dst_port; //目的端口优先从tunnel info中获取 vni = be64_to_cpu(info->key.tun_id); //VNI信息从tunnel info中获取 remote_ip.sa.sa_family = family; if (family == AF_INET) remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; //目的IP地址从tunnel info中获取 else remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; //目的IP地址从tunnel info中获取 dst = &remote_ip; } if (vxlan_addr_any(dst)) { //目的IP地址为全零,当前不支持 if (did_rsc) { /* short-circuited back to local bridge */ WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", dev->name); } goto drop; } old_iph = ip_hdr(skb); //skb的IP头 ttl = vxlan->cfg.ttl; //获取vxlan配置中的ttl if (!ttl && vxlan_addr_multicast(dst)) ttl = 1; tos = vxlan->cfg.tos; //获取vxlan配置中的tos if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, //计算源端口 vxlan->cfg.port_max, true); if (info) { if (info->key.tun_flags & TUNNEL_CSUM) flags |= VXLAN_F_UDP_CSUM; else flags &= ~VXLAN_F_UDP_CSUM; ttl = info->key.ttl; //优先使用tunnel info中的ttl tos = info->key.tos; //优先使用tunnel info中的tos if (info->options_len) md = ip_tunnel_info_opts(info); } else { md->gbp = skb->mark; } if (dst->sa.sa_family == AF_INET) { //如果是IPV4 if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)) df = htons(IP_DF); memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = rdst ? 
rdst->remote_ifindex : 0; fl4.flowi4_tos = RT_TOS(tos); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; fl4.daddr = dst->sin.sin_addr.s_addr; fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr; rt = ip_route_output_key(vxlan->net, &fl4); //路由表查找 if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &dst->sin.sin_addr.s_addr); dev->stats.tx_carrier_errors++; goto tx_error; } if (rt_dst(rt).dev == dev) { netdev_dbg(dev, "circular route to %pI4\n", &dst->sin.sin_addr.s_addr); dev->stats.collisions++; goto rt_tx_error; } /* Bypass encapsulation if the destination is local */ if (rt->rt_flags & RTCF_LOCAL && !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { //正常场景不进入此分支 struct vxlan_dev *dst_vxlan; ip_rt_put(rt); dst_vxlan = vxlan_find_vni(vxlan->net, vni, dst->sa.sa_family, dst_port, vxlan->flags); if (!dst_vxlan) goto tx_error; WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", dev->name); goto tx_error; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt)); //如果ttl为零,则从rt表项中获取 err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, //发送报文 dst->sin.sin_addr.s_addr, tos, ttl, df, src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), flags); if (err < 0) { /* skb is already freed. */ skb = NULL; goto rt_tx_error; } iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats); #if IS_ENABLED(CONFIG_IPV6) } else { //IPV6情况,暂不分析 struct dst_entry *ndst; struct flowi6 fl6; u32 rt6i_flags; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = rdst ? 
rdst->remote_ifindex : 0; fl6.daddr = dst->sin6.sin6_addr; fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr; fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = IPPROTO_UDP; #ifdef HAVE_IPV6_DST_LOOKUP_NET if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) { #else #ifdef HAVE_IPV6_STUB if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { #else ndst = ip6_route_output(vxlan->net, sk, &fl6); if (ndst->error) { #endif #endif netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); dev->stats.tx_carrier_errors++; goto tx_error; } if (ndst->dev == dev) { netdev_dbg(dev, "circular route to %pI6\n", &dst->sin6.sin6_addr); dst_release(ndst); dev->stats.collisions++; goto tx_error; } /* Bypass encapsulation if the destination is local */ rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; if (rt6i_flags & RTF_LOCAL && !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan; dst_release(ndst); dst_vxlan = vxlan_find_vni(vxlan->net, vni, dst->sa.sa_family, dst_port, vxlan->flags); if (!dst_vxlan) goto tx_error; WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", dev->name); goto tx_error; } ttl = ttl ? : ip6_dst_hoplimit(ndst); err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, 0, ttl, src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), flags); #endif } return; drop: dev->stats.tx_dropped++; goto tx_free; rt_tx_error: ip_rt_put(rt); tx_error: dev->stats.tx_errors++; tx_free: dev_kfree_skb(skb); }4、vxlan_xmit_skb函数
static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be32 vni, struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; int err; bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM); //默认为false int type = 0; if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { //默认不进此分支 int csum_start = skb_checksum_start_offset(skb); if (csum_start <= VXLAN_MAX_REMCSUM_START && !(csum_start & VXLAN_RCO_SHIFT_MASK) && (skb->csum_offset == offsetof(struct udphdr, check) || skb->csum_offset == offsetof(struct tcphdr, check))) { udp_sum = false; type |= SKB_GSO_TUNNEL_REMCSUM; if (!SKB_GSO_TUNNEL_REMCSUM) { kfree_skb(skb); return -EOPNOTSUPP; } } } min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len //vxlan外层的大小 + VXLAN_HLEN + sizeof(struct iphdr) + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); //扩展线性区长度 if (unlikely(err)) { kfree_skb(skb); return err; } skb = vlan_hwaccel_push_inside(skb); //vlan信息添加到payload中 if (WARN_ON(!skb)) return -ENOMEM; skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true); //设置inner相关的head,设置encapsulation if (IS_ERR(skb)) return PTR_ERR(skb); vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); //添加vxlan头 vxh->vx_flags = htonl(VXLAN_HF_VNI); vxh->vx_vni = vni; //设置vni值 if (type & SKB_GSO_TUNNEL_REMCSUM) { //默认不进此分支 u16 hdrlen = sizeof(struct vxlanhdr); u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> VXLAN_RCO_SHIFT; if (skb->csum_offset == offsetof(struct udphdr, check)) data |= VXLAN_RCO_UDP; vxh->vx_vni |= htonl(data); vxh->vx_flags |= htonl(VXLAN_HF_RCO); if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) skb->encapsulation = 0; #endif } } if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, vxflags, md); 
ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); //设置内部协议 return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos, //封装传输层,发送报文 ttl, df, src_port, dst_port, xnet, !(vxflags & VXLAN_F_UDP_CSUM)); }
5、udp_tunnel_xmit_skb函数
int rpl_udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck) //内核小于3.18,使用OVS实现 { struct udphdr *uh; __skb_push(skb, sizeof(*uh)); //封装UDP头 skb_reset_transport_header(skb); //设置UDP header指针 uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); udp_set_csum(nocheck, skb, src, dst, skb->len); //计算UDP csum return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, //IP层封装,发送报文 tos, ttl, df, xnet); }6、iptunnel_xmit函数
int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) //内核小于3.18,使用OVS实现 { int pkt_len = skb->len; struct iphdr *iph; int err; skb_scrub_packet(skb, xnet); skb_clear_hash(skb); skb_dst_set(skb, &rt_dst(rt)); #if 0 /* Do not clear ovs_skb_cb. It will be done in gso code. */ memset(IPCB(skb), 0, sizeof(*IPCB(skb))); #endif /* Push down and install the IP header. */ __skb_push(skb, sizeof(struct iphdr)); //添加IP头 skb_reset_network_header(skb); //设置ip header指针 iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = df; iph->protocol = proto; iph->tos = tos; iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; #ifdef HAVE_IP_SELECT_IDENT_USING_DST_ENTRY __ip_select_ident(iph, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1); #elif defined(HAVE_IP_SELECT_IDENT_USING_NET) __ip_select_ident(dev_net(rt->dst.dev), iph, skb_shinfo(skb)->gso_segs ?: 1); //设置IP header的ID值,此为内核的方法; #else __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); #endif err = ip_local_out(skb); //发送报文 if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; }7、ip_local_out函数
int rpl_ip_local_out(struct sk_buff *skb) //内核小于3.18,使用OVS实现 { int ret = NETDEV_TX_OK; int id = -1; if (!OVS_GSO_CB(skb)->fix_segment) //如果fix_segment为空,则直接发送报文,不进行GSO分段;默认会进行GSO分段 return output_ip(skb); if (skb_is_gso(skb)) { struct iphdr *iph; iph = ip_hdr(skb); id = ntohs(iph->id); skb = tnl_skb_gso_segment(skb, 0, false); //报文GSO分段 if (!skb || IS_ERR(skb)) return 0; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int err; err = skb_checksum_help(skb); if (unlikely(err)) return 0; } while (skb) { struct sk_buff *next_skb = skb->next; struct iphdr *iph; skb->next = NULL; iph = ip_hdr(skb); if (id >= 0) iph->id = htons(id++); ret = output_ip(skb); //发送分段后的报文 skb = next_skb; } return ret; }8、output_ip函数
static int output_ip(struct sk_buff *skb) { int ret = NETDEV_TX_OK; int err; memset(IPCB(skb), 0, sizeof(*IPCB(skb))); #undef ip_local_out err = ip_local_out(skb); //调用linux内核的接口 if (unlikely(net_xmit_eval(err))) ret = err; return ret; }vxlan中checksum offload特性还待分析,当前仅对默认的行为进行了分析,待后续补充。