发包处理函数最终会调用 ovs_vport_send 函数,该函数再通过 vport->ops 调用对应 vport_ops 的 send 函数。
1、ovs_vport_send函数
/*
 * ovs_vport_send - hand a packet to the transmit callback of a vport.
 * @vport: output port; its device supplies the MTU, the name used in the
 *         warning and the tx_errors counter.
 * @skb:   packet to transmit.
 *
 * A non-GSO packet whose packet_length() exceeds the device MTU is reported
 * through a rate-limited warning, counted in tx_errors and freed.  Any other
 * packet gets skb->dev pointed at the vport's device and is passed to
 * vport->ops->send().
 */
void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	int dev_mtu = vport->dev->mtu;

	if (unlikely(packet_length(skb) > dev_mtu && !skb_is_gso(skb))) {
		net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
				     vport->dev->name,
				     packet_length(skb), dev_mtu);
		vport->dev->stats.tx_errors++;
		kfree_skb(skb);
		return;
	}

	/*
	 * Device associated with the vport; per the original notes, for a
	 * vxlan port this is the device named vxlan_4789.
	 */
	skb->dev = vport->dev;

	/*
	 * Per the original notes, for a vxlan port this resolves to the
	 * vxlan_xmit function of ovs_vxlan_netdev_vport_ops.
	 */
	vport->ops->send(skb);
}
2、vxlan_xmit函数
/* Per the original notes: on a 3.18 kernel OVS still uses its own vxlan implementation. */
#define vxlan_xmit rpl_vxlan_xmit

/*
 * rpl_vxlan_xmit - transmit entry point of the OVS compat vxlan device.
 * @skb: packet to send; skb->dev must already point at the vxlan device.
 *
 * The packet is forwarded to vxlan_xmit_one() only when the device has
 * VXLAN_F_COLLECT_METADATA set, does not have VXLAN_F_PROXY set, and the skb
 * carries tunnel info whose mode includes IP_TUNNEL_INFO_TX.  In every other
 * case a warning is logged and the skb is freed.
 *
 * Return: always NETDEV_TX_OK; the skb is consumed on every path.
 */
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
	/* ovs_vport_send() stored the output device in skb->dev. */
	struct net_device *dev = skb->dev;
	/* Private data of the device; per the original notes it was set up when the vxlan port's net_device was created. */
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct ip_tunnel_info *info;

	/* Tunnel metadata attached to the skb; per the original notes this is what execute_set_action stored. */
	info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	/* Per the original notes this flag is not set in the setup being analysed. */
	if (vxlan->flags & VXLAN_F_PROXY)
		goto out;

	if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
	    info && info->mode & IP_TUNNEL_INFO_TX) {
		/* No explicit remote (rdst == NULL): the destination comes from the tunnel info. */
		vxlan_xmit_one(skb, dev, NULL, false);
		return NETDEV_TX_OK;
	}

out:
	/* The trailing newline terminates the log record. */
	pr_warn("vxlan: unsupported flag set %x\n", vxlan->flags);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}
3、vxlan_xmit_one函数
/*
 * vxlan_xmit_one - encapsulate one packet in VXLAN and transmit it.
 * @skb:     packet to send.  It is consumed on every path: either handed to
 *           vxlan_xmit_skb()/vxlan6_xmit_skb() or freed at the tx_free label.
 * @dev:     the vxlan net_device the packet is leaving through.
 * @rdst:    optional explicit remote.  When NULL, destination port, VNI and
 *           destination address are taken from the skb's tunnel info.
 * @did_rsc: only consulted to decide whether to warn when the destination is
 *           the any-address.
 *
 * Error accounting on @dev->stats: tx_dropped for the "drop" label,
 * tx_errors for "tx_error"/"rt_tx_error", plus tx_carrier_errors when no
 * route is found and collisions when the route points back at @dev.
 *
 * NOTE(review): in the IPv6 branch the result of vxlan6_xmit_skb() is stored
 * in err but never inspected, and no stats are updated there.
 */
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct vxlan_rdst *rdst, bool did_rsc)
{
struct ip_tunnel_info *info;
struct vxlan_dev *vxlan = netdev_priv(dev);
struct sock *sk = vxlan->vn_sock->sock->sk;
unsigned short family = vxlan_get_sk_family(vxlan->vn_sock); // address family (IPv4 or IPv6) derived from the socket
struct rtable *rt = NULL;
const struct iphdr *old_iph;
struct flowi4 fl4;
union vxlan_addr *dst;
union vxlan_addr remote_ip;
struct vxlan_metadata _md;
struct vxlan_metadata *md = &_md;
__be16 src_port = 0, dst_port;
u32 vni;
__be16 df = 0;
__u8 tos, ttl;
int err;
u32 flags = vxlan->flags;
info = skb_tunnel_info(skb); // fetch the tunnel info from the skb
if (rdst) { // not taken when called from rpl_vxlan_xmit, which passes NULL
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
vni = rdst->remote_vni;
dst = &rdst->remote_ip;
} else {
if (!info) { // i.e. in this implementation a packet leaving through the vxlan port must carry tunnel info
WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
dev->name);
goto drop;
}
if (family != ip_tunnel_info_af(info)) // socket and tunnel info must use the same address family
goto drop;
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; // destination port: tunnel info first, configured port as fallback
vni = be64_to_cpu(info->key.tun_id); // VNI is taken from the tunnel info
remote_ip.sa.sa_family = family;
if (family == AF_INET)
remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; // destination IP address is taken from the tunnel info
else
remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; // destination IP address is taken from the tunnel info
dst = &remote_ip;
}
if (vxlan_addr_any(dst)) { // destination is the any (all-zero) address per vxlan_addr_any(): not supported here
if (did_rsc) {
/* short-circuited back to local bridge */
WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
dev->name);
}
goto drop;
}
old_iph = ip_hdr(skb); // IP header of the skb as it currently stands (before encapsulation)
ttl = vxlan->cfg.ttl; // ttl from the vxlan configuration
if (!ttl && vxlan_addr_multicast(dst))
ttl = 1;
tos = vxlan->cfg.tos; // tos from the vxlan configuration
// A configured tos of 1 means: take the value from ip_tunnel_get_dsfield() instead.
if (tos == 1)
tos = ip_tunnel_get_dsfield(old_iph, skb);
src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, // compute the UDP source port
vxlan->cfg.port_max, true);
if (info) {
// The UDP checksum flag follows the tunnel info's TUNNEL_CSUM bit.
if (info->key.tun_flags & TUNNEL_CSUM)
flags |= VXLAN_F_UDP_CSUM;
else
flags &= ~VXLAN_F_UDP_CSUM;
ttl = info->key.ttl; // the tunnel info's ttl takes precedence
tos = info->key.tos; // the tunnel info's tos takes precedence
if (info->options_len)
md = ip_tunnel_info_opts(info);
} else {
md->gbp = skb->mark;
}
if (dst->sa.sa_family == AF_INET) { // IPv4
if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT))
df = htons(IP_DF);
memset(&fl4, 0, sizeof(fl4));
fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
fl4.flowi4_tos = RT_TOS(tos);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_proto = IPPROTO_UDP;
fl4.daddr = dst->sin.sin_addr.s_addr;
fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr;
rt = ip_route_output_key(vxlan->net, &fl4); // route lookup
if (IS_ERR(rt)) {
netdev_dbg(dev, "no route to %pI4\n",
&dst->sin.sin_addr.s_addr);
dev->stats.tx_carrier_errors++;
goto tx_error;
}
if (rt_dst(rt).dev == dev) {
netdev_dbg(dev, "circular route to %pI4\n",
&dst->sin.sin_addr.s_addr);
dev->stats.collisions++;
goto rt_tx_error;
}
/* Bypass encapsulation if the destination is local */
if (rt->rt_flags & RTCF_LOCAL &&
!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { // per the original notes, not taken in the normal case
struct vxlan_dev *dst_vxlan;
ip_rt_put(rt);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
dst->sa.sa_family, dst_port,
vxlan->flags);
if (!dst_vxlan)
goto tx_error;
WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
dev->name);
goto tx_error;
}
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt)); // a ttl of zero falls back to the hop limit of the route
err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, // send the packet
dst->sin.sin_addr.s_addr, tos, ttl, df,
src_port, dst_port, htonl(vni << 8), md,
!net_eq(vxlan->net, dev_net(vxlan->dev)),
flags);
if (err < 0) {
/* skb is already freed. */
skb = NULL;
goto rt_tx_error;
}
iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
#if IS_ENABLED(CONFIG_IPV6)
} else { // IPv6 case, not analysed in these notes
struct dst_entry *ndst;
struct flowi6 fl6;
u32 rt6i_flags;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
fl6.daddr = dst->sin6.sin6_addr;
fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
fl6.flowi6_mark = skb->mark;
fl6.flowi6_proto = IPPROTO_UDP;
// Exactly one of the three lookup variants below is compiled in; each opens the same "no route" block.
#ifdef HAVE_IPV6_DST_LOOKUP_NET
if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) {
#else
#ifdef HAVE_IPV6_STUB
if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
#else
ndst = ip6_route_output(vxlan->net, sk, &fl6);
if (ndst->error) {
#endif
#endif
netdev_dbg(dev, "no route to %pI6\n",
&dst->sin6.sin6_addr);
dev->stats.tx_carrier_errors++;
goto tx_error;
}
if (ndst->dev == dev) {
netdev_dbg(dev, "circular route to %pI6\n",
&dst->sin6.sin6_addr);
dst_release(ndst);
dev->stats.collisions++;
goto tx_error;
}
/* Bypass encapsulation if the destination is local */
rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
if (rt6i_flags & RTF_LOCAL &&
!(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
struct vxlan_dev *dst_vxlan;
dst_release(ndst);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
dst->sa.sa_family, dst_port,
vxlan->flags);
if (!dst_vxlan)
goto tx_error;
WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
dev->name);
goto tx_error;
}
ttl = ttl ? : ip6_dst_hoplimit(ndst);
err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
0, ttl, src_port, dst_port, htonl(vni << 8), md,
!net_eq(vxlan->net, dev_net(vxlan->dev)),
flags);
#endif
}
return;
// Cleanup labels: each one falls through to the next unless it jumps.
drop:
dev->stats.tx_dropped++;
goto tx_free;
rt_tx_error:
// Release the IPv4 route obtained above before counting the error.
ip_rt_put(rt);
tx_error:
dev->stats.tx_errors++;
tx_free:
// skb is NULL here when vxlan_xmit_skb() already freed it.
dev_kfree_skb(skb);
}
4、vxlan_xmit_skb函数
/*
 * vxlan_xmit_skb - prepend the VXLAN header to an skb and send it over UDP/IPv4.
 * @rt:       route to the remote endpoint.
 * @sk:       socket passed through to udp_tunnel_xmit_skb().
 * @skb:      packet to encapsulate.
 * @src/@dst: outer IPv4 source and destination addresses.
 * @tos/@ttl/@df: outer IP header fields, passed through.
 * @src_port/@dst_port: outer UDP ports, passed through.
 * @vni:      value stored as-is in the vx_vni field of the VXLAN header.
 * @md:       metadata used only when VXLAN_F_GBP is set in @vxflags.
 * @xnet:     passed through to udp_tunnel_xmit_skb().
 * @vxflags:  VXLAN_F_* flags controlling checksum, remote-checksum offload
 *            and GBP handling.
 *
 * Return: the result of udp_tunnel_xmit_skb(), or a negative errno when an
 * earlier step fails.  The skb is freed here on the -EOPNOTSUPP and
 * skb_cow_head() failure paths; the remaining early returns do not free it here.
 */
static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
__be16 src_port, __be16 dst_port, __be32 vni,
struct vxlan_metadata *md, bool xnet, u32 vxflags)
{
struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM); // per the original notes: false by default
int type = 0;
if ((vxflags & VXLAN_F_REMCSUM_TX) &&
skb->ip_summed == CHECKSUM_PARTIAL) { // per the original notes: not taken by default
int csum_start = skb_checksum_start_offset(skb);
if (csum_start <= VXLAN_MAX_REMCSUM_START &&
!(csum_start & VXLAN_RCO_SHIFT_MASK) &&
(skb->csum_offset == offsetof(struct udphdr, check) ||
skb->csum_offset == offsetof(struct tcphdr, check))) {
udp_sum = false;
type |= SKB_GSO_TUNNEL_REMCSUM;
// Bail out when SKB_GSO_TUNNEL_REMCSUM evaluates to 0 (presumably a compat definition for kernels without it; verify).
if (!SKB_GSO_TUNNEL_REMCSUM) {
kfree_skb(skb);
return -EOPNOTSUPP;
}
}
}
min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len // room needed for the outer headers added by vxlan
+ VXLAN_HLEN + sizeof(struct iphdr)
+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
/* Need space for new headers (invalidates iph ptr) */
err = skb_cow_head(skb, min_headroom); // make sure that much headroom is available
if (unlikely(err)) {
kfree_skb(skb);
return err;
}
skb = vlan_hwaccel_push_inside(skb); // move the vlan information into the packet payload
if (WARN_ON(!skb))
return -ENOMEM;
skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true); // per the original notes: sets up the inner header offsets and the encapsulation mark
if (IS_ERR(skb))
return PTR_ERR(skb);
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); // prepend the vxlan header
vxh->vx_flags = htonl(VXLAN_HF_VNI);
vxh->vx_vni = vni; // store the VNI
if (type & SKB_GSO_TUNNEL_REMCSUM) { // per the original notes: not taken by default
u16 hdrlen = sizeof(struct vxlanhdr);
u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
VXLAN_RCO_SHIFT;
if (skb->csum_offset == offsetof(struct udphdr, check))
data |= VXLAN_RCO_UDP;
// The remote-checksum-offload data is OR-ed into the vx_vni field and flagged with VXLAN_HF_RCO.
vxh->vx_vni |= htonl(data);
vxh->vx_flags |= htonl(VXLAN_HF_RCO);
if (!skb_is_gso(skb)) {
skb->ip_summed = CHECKSUM_NONE;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
skb->encapsulation = 0;
#endif
}
}
if (vxflags & VXLAN_F_GBP)
vxlan_build_gbp_hdr(vxh, vxflags, md);
ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); // record the inner protocol
return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos, // add the transport (UDP) header and send the packet
ttl, df, src_port, dst_port, xnet,
!(vxflags & VXLAN_F_UDP_CSUM));
}
5、udp_tunnel_xmit_skb函数
/*
 * rpl_udp_tunnel_xmit_skb - prepend a UDP header and pass the packet on to
 * iptunnel_xmit().
 *
 * Per the original notes, kernels older than 3.18 use this OVS implementation.
 *
 * @rt, @sk, @src, @dst, @tos, @ttl, @df, @xnet are forwarded unchanged to
 * iptunnel_xmit(); @src_port and @dst_port fill the UDP header; @nocheck is
 * forwarded to udp_set_csum().
 *
 * Return: whatever iptunnel_xmit() returns.
 */
int rpl_udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk,
			    struct sk_buff *skb, __be32 src, __be32 dst,
			    __u8 tos, __u8 ttl, __be16 df, __be16 src_port,
			    __be16 dst_port, bool xnet, bool nocheck)
{
	struct udphdr *udph;

	/* Make room for the UDP header and mark it as the transport header. */
	__skb_push(skb, sizeof(*udph));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);
	udph->source = src_port;
	udph->dest = dst_port;
	/* The UDP length is the whole skb length, header included. */
	udph->len = htons(skb->len);

	/* Let udp_set_csum() deal with the UDP checksum according to nocheck. */
	udp_set_csum(nocheck, skb, src, dst, skb->len);

	/* IP-layer encapsulation and transmission. */
	return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
			     tos, ttl, df, xnet);
}
6、iptunnel_xmit函数
/*
 * rpl_iptunnel_xmit - prepend the outer IPv4 header and send the packet.
 *
 * Per the original notes, kernels older than 3.18 use this OVS implementation.
 *
 * @sk:    not referenced in this function body.
 * @rt:    route whose dst entry is attached to the skb.
 * @skb:   packet to send; the IPv4 header is pushed in front of its data.
 * @src/@dst/@proto/@tos/@ttl/@df: values written into the new IPv4 header.
 * @xnet:  forwarded to skb_scrub_packet().
 *
 * Return: skb->len as measured on entry (before the IP header is pushed), or
 * 0 when net_xmit_eval() of the ip_local_out() result is non-zero.
 */
int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
		      __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl,
		      __be16 df, bool xnet)
{
	int sent_len = skb->len;
	struct iphdr *outer;
	int rc;

	skb_scrub_packet(skb, xnet);
	skb_clear_hash(skb);
	skb_dst_set(skb, &rt_dst(rt));

#if 0
	/* Do not clear ovs_skb_cb. It will be done in gso code. */
	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
#endif

	/* Reserve the IPv4 header and make it the network header. */
	__skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);

	outer = ip_hdr(skb);
	outer->version = 4;
	outer->ihl = sizeof(struct iphdr) >> 2;
	outer->tos = tos;
	outer->frag_off = df;
	outer->ttl = ttl;
	outer->protocol = proto;
	outer->saddr = src;
	outer->daddr = dst;

	/* Choose the IP ID with the kernel's helper; its signature differs between kernel versions. */
#ifdef HAVE_IP_SELECT_IDENT_USING_DST_ENTRY
	__ip_select_ident(outer, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1);
#elif defined(HAVE_IP_SELECT_IDENT_USING_NET)
	__ip_select_ident(dev_net(rt->dst.dev), outer,
			  skb_shinfo(skb)->gso_segs ?: 1);
#else
	__ip_select_ident(outer, skb_shinfo(skb)->gso_segs ?: 1);
#endif

	/* Send the packet. */
	rc = ip_local_out(skb);

	return unlikely(net_xmit_eval(rc)) ? 0 : sent_len;
}
7、ip_local_out函数
/*
 * rpl_ip_local_out - OVS replacement for ip_local_out() that performs
 * software GSO segmentation / checksum completion before output.
 *
 * Per the original notes, kernels older than 3.18 use this OVS implementation.
 *
 * @skb: packet to send; ownership passes to this function.
 *
 * When OVS_GSO_CB(skb)->fix_segment is not set the packet goes straight to
 * output_ip().  Otherwise a GSO packet is segmented with
 * tnl_skb_gso_segment() and every resulting segment is sent with consecutive
 * IP IDs starting from the original header's ID; a non-GSO packet with
 * CHECKSUM_PARTIAL has its checksum completed in software first.
 *
 * Return: the result of the last output_ip() call, or 0 when segmentation or
 * checksum completion fails.
 *
 * NOTE(review): returning 0 on those failures makes a dropped packet look
 * like success to the caller; kept as-is for backward compatibility.
 */
int rpl_ip_local_out(struct sk_buff *skb)
{
	int ret = NETDEV_TX_OK;
	int id = -1;

	/* No fix_segment callback: send directly, without software GSO (per the original notes, segmentation is the default). */
	if (!OVS_GSO_CB(skb)->fix_segment)
		return output_ip(skb);

	if (skb_is_gso(skb)) {
		struct iphdr *iph;

		/* Remember the IP ID so the segments can be numbered consecutively. */
		iph = ip_hdr(skb);
		id = ntohs(iph->id);

		/* GSO segmentation of the packet. */
		skb = tnl_skb_gso_segment(skb, 0, false);
		if (!skb || IS_ERR(skb))
			return 0;
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		int err;

		err = skb_checksum_help(skb);
		if (unlikely(err)) {
			/*
			 * skb_checksum_help() does not free the skb on
			 * failure and nobody else owns it at this point, so
			 * release it here instead of leaking it.
			 */
			kfree_skb(skb);
			return 0;
		}
	}

	while (skb) {
		struct sk_buff *next_skb = skb->next;
		struct iphdr *iph;

		/* Detach the segment from the list before sending it. */
		skb->next = NULL;

		iph = ip_hdr(skb);
		if (id >= 0)
			iph->id = htons(id++);

		/* Send the (possibly segmented) packet. */
		ret = output_ip(skb);
		skb = next_skb;
	}
	return ret;
}
8、output_ip函数
/*
 * output_ip - clear the skb's IP control block and pass the packet to
 * ip_local_out().
 * @skb: packet to send.
 *
 * Return: NETDEV_TX_OK, or the ip_local_out() result when net_xmit_eval()
 * of that result is non-zero.
 */
static int output_ip(struct sk_buff *skb)
{
	int rc;

	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

	/* Drop the compat macro so the call below is the Linux kernel's own ip_local_out(). */
#undef ip_local_out
	rc = ip_local_out(skb);

	return unlikely(net_xmit_eval(rc)) ? rc : NETDEV_TX_OK;
}
VXLAN 的 checksum offload 特性尚未分析;本文仅分析了默认配置下的发包行为,其余内容待后续补充。