VXLAN 的 GRO 处理不是通过定义常量实现的,而是通过动态注册实现的。在 UDP 层处理时我们已经知道,offload 对象是注册到 udp_offload_base 全局变量中的,下面先看一下注册过程。
vxlan_socket_create函数
/*
 * vxlan_socket_create - allocate and set up a new VXLAN listen socket.
 * @net:   network namespace the socket is created in
 * @port:  UDP port (network byte order) to bind
 * @rcv:   receive callback stored in the new vxlan_sock
 * @data:  opaque pointer stored next to @rcv
 * @flags: VXLAN_F_* flags; only the VXLAN_F_RCV_FLAGS subset is kept on
 *         the socket, and the VXLAN_F_IPV6 bit is handed to
 *         vxlan_create_sock() as its ipv6 argument
 *
 * Returns the new vxlan_sock (refcnt set to 1), ERR_PTR(-ENOMEM) when
 * the allocation fails, or the error pointer produced by
 * vxlan_create_sock().
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
					      vxlan_rcv_t *rcv, void *data,
					      u32 flags)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct udp_tunnel_sock_cfg tunnel_cfg;
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int bucket;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (vs == NULL)
		return ERR_PTR(-ENOMEM);

	for (bucket = 0; bucket < VNI_HASH_SIZE; bucket++)
		INIT_HLIST_HEAD(&vs->vni_list[bucket]);

	INIT_WORK(&vs->del_work, vxlan_del_work);

	sock = vxlan_create_sock(net, !!(flags & VXLAN_F_IPV6), port, flags);
	if (IS_ERR(sock)) {
		pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
			PTR_ERR(sock));
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	vs->rcv = rcv;
	vs->data = data;
	vs->flags = flags & VXLAN_F_RCV_FLAGS;
	atomic_set(&vs->refcnt, 1);

	/* Fill in the UDP offload descriptor with the VXLAN GRO callbacks. */
	vs->udp_offloads.port = port;
	vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive;
	vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;

	/*
	 * Publish the socket in the per-port hash and announce the new RX
	 * port; vxlan_notify_add_rx_port() is where the GRO offload gets
	 * registered.
	 */
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	vxlan_notify_add_rx_port(vs);
	spin_unlock(&vn->sock_lock);

	/* Mark the socket as an encapsulation socket. */
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = vxlan_udp_encap_recv;
	tunnel_cfg.encap_destroy = NULL;
	tunnel_cfg.sk_user_data = vs;
	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return vs;
}
vxlan_notify_add_rx_port函数
/*
 * vxlan_notify_add_rx_port - announce a new VXLAN receive port.
 * @vs: VXLAN socket whose port is being added
 *
 * For AF_INET sockets the socket's udp_offload descriptor is registered
 * via udp_add_offload(); a failure there is only logged. Afterwards every
 * net_device in the socket's namespace that implements ndo_add_vxlan_port
 * is told about the (family, port) pair.
 */
static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
{
	struct sock *sk = vs->sock->sk;
	struct net *net = sock_net(sk);
	sa_family_t sa_family = sk->sk_family;
	__be16 port = inet_sk(sk)->inet_sport;
	struct net_device *dev;

	if (sa_family == AF_INET) {
		int err = udp_add_offload(&vs->udp_offloads);

		if (err)
			pr_warn("vxlan: udp_add_offload failed with status %d\n", err);
	}

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		/* Skip devices that do not provide the hook. */
		if (!dev->netdev_ops->ndo_add_vxlan_port)
			continue;

		dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, port);
	}
	rcu_read_unlock();
}
udp_add_offload函数
int udp_add_offload(struct udp_offload *uo)
{
struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
if (!new_offload)
return -ENOMEM;
new_offload->offload = uo;
spin_lock(&udp_offload_lock);
new_offload->next = udp_offload_base;
rcu_assign_pointer(udp_offload_base, new_offload); //注册到udp_offload_base全局变量中
spin_unlock(&udp_offload_lock);
return 0;
}
接下来,我们看下vxlan gro offload的实现,包括vxlan_gro_receive和vxlan_gro_complete:
vxlan_gro_receive函数
/*
 * vxlan_gro_receive - VXLAN-level GRO receive callback.
 * @head: list of held packets, walked through ->next
 * @skb:  incoming packet whose current GRO offset is at the VXLAN header
 * @uoff: udp_offload descriptor embedded in the owning vxlan_sock
 *
 * Pulls the VXLAN header, optionally runs vxlan_gro_remcsum(), clears
 * same_flow on held packets whose VXLAN flags or VNI differ from @skb,
 * and then hands the packet to eth_gro_receive().
 *
 * Returns whatever eth_gro_receive() returns, or NULL when the header
 * cannot be obtained or vxlan_gro_remcsum() returns NULL; in those two
 * cases the packet's flush bit is set.
 */
static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
struct sk_buff *skb,
struct udp_offload *uoff)
{
struct sk_buff *p, **pp = NULL;
struct vxlanhdr *vh, *vh2;
unsigned int hlen, off_vx;
int flush = 1;
struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, // recover the owning vxlan_sock from its embedded udp_offload
udp_offloads);
u32 flags;
struct gro_remcsum grc;
skb_gro_remcsum_init(&grc); // initialise grc; it is passed to the cleanup helper on every exit path
off_vx = skb_gro_offset(skb);
hlen = off_vx + sizeof(*vh);
vh = skb_gro_header_fast(skb, off_vx); // fetch the VXLAN header at the current GRO offset
if (skb_gro_header_hard(skb, hlen)) {
vh = skb_gro_header_slow(skb, hlen, off_vx);
if (unlikely(!vh))
goto out;
}
skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ // GRO offset now sits past the VXLAN header, where eth_gro_receive() below reads an Ethernet header
skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); // checksum adjustment for the pulled header bytes (per helper name)
flags = ntohl(vh->vx_flags);
if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { // header carries the RCO flag and this socket has REMCSUM_RX enabled (presumably remote checksum offload by the sender - confirm)
vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), // delegate to the remcsum helper; its result replaces vh
ntohl(vh->vx_vni), &grc,
!!(vs->flags &
VXLAN_F_REMCSUM_NOPARTIAL));
if (!vh) // helper returned NULL: bail out with flush still 1
goto out;
}
flush = 0;
for (p = *head; p; p = p->next) { // walk the held packets
if (!NAPI_GRO_CB(p)->same_flow)
continue;
vh2 = (struct vxlanhdr *)(p->data + off_vx);
if (vh->vx_flags != vh2->vx_flags || // a held packet stays in the same flow only if both flags and VNI match
vh->vx_vni != vh2->vx_vni) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
}
pp = eth_gro_receive(head, skb); // continue with the Ethernet-level GRO receive handler
out:
skb_gro_remcsum_cleanup(skb, &grc);
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
}
eth_gro_receive函数
/*
 * eth_gro_receive - Ethernet-level GRO receive handler.
 * @head: list of held packets, walked through ->next
 * @skb:  incoming packet whose current GRO offset is at an Ethernet header
 *
 * Clears same_flow on every held packet whose Ethernet header differs
 * from @skb's, then looks up the GRO receive handler for the header's
 * h_proto and, if one exists, pulls the Ethernet header and calls it.
 *
 * Returns the next-layer handler's result, or NULL when the header cannot
 * be obtained or no handler is registered for the protocol; in both of
 * those cases the packet's flush bit is set.
 */
struct sk_buff **eth_gro_receive(struct sk_buff **head,
				 struct sk_buff *skb)
{
	const struct packet_offload *ptype;
	struct sk_buff **pp = NULL;
	struct sk_buff *held;
	struct ethhdr *eh;
	unsigned int off_eth = skb_gro_offset(skb);
	unsigned int hlen = off_eth + sizeof(*eh);
	int flush = 1;

	eh = skb_gro_header_fast(skb, off_eth);
	if (skb_gro_header_hard(skb, hlen)) {
		eh = skb_gro_header_slow(skb, hlen, off_eth);
		if (unlikely(!eh))
			goto out;
	}

	flush = 0;

	/* A differing Ethernet header takes a held packet out of the flow. */
	for (held = *head; held; held = held->next) {
		if (NAPI_GRO_CB(held)->same_flow &&
		    compare_ether_header(eh, (struct ethhdr *)(held->data + off_eth)))
			NAPI_GRO_CB(held)->same_flow = 0;
	}

	rcu_read_lock();
	ptype = gro_find_receive_by_type(eh->h_proto);
	if (ptype) {
		/* Step over the Ethernet header and pass on to the next layer. */
		skb_gro_pull(skb, sizeof(*eh));
		skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
		pp = ptype->callbacks.gro_receive(head, skb);
	} else {
		/* No GRO handler for this protocol: request a flush. */
		flush = 1;
	}
	rcu_read_unlock();
out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
/*
 * vxlan_gro_complete - VXLAN-level GRO completion callback.
 * @skb:   merged packet
 * @nhoff: offset of the VXLAN header within @skb
 * @uoff:  udp_offload descriptor (not used here)
 *
 * Calls udp_tunnel_gro_complete() (presumably to set the tunnel GSO type
 * on the skb - confirm against that helper), then lets
 * eth_gro_complete() finish the packet starting right after the VXLAN
 * header. Returns eth_gro_complete()'s result.
 */
static int vxlan_gro_complete(struct sk_buff *skb, int nhoff,
			      struct udp_offload *uoff)
{
	int inner_off = nhoff + sizeof(struct vxlanhdr);

	udp_tunnel_gro_complete(skb, nhoff);

	return eth_gro_complete(skb, inner_off);
}
eth_gro_complete函数
/*
 * eth_gro_complete - Ethernet-level GRO completion handler.
 * @skb:   merged packet
 * @nhoff: offset of the Ethernet header within @skb->data
 *
 * Records @nhoff as the inner MAC header when the skb is marked as
 * encapsulated, then dispatches to the GRO complete handler registered
 * for the Ethernet header's h_proto, passing the offset just past the
 * Ethernet header.
 *
 * Returns that handler's result, or -ENOSYS if none is registered.
 */
int eth_gro_complete(struct sk_buff *skb, int nhoff)
{
	struct packet_offload *ptype;
	struct ethhdr *eh;
	__be16 type;
	int err;

	eh = (struct ethhdr *)(skb->data + nhoff);
	type = eh->h_proto;

	if (skb->encapsulation)
		skb_set_inner_mac_header(skb, nhoff);

	rcu_read_lock();
	ptype = gro_find_complete_by_type(type);
	if (ptype)
		err = ptype->callbacks.gro_complete(skb,
						    nhoff + sizeof(struct ethhdr));
	else
		err = -ENOSYS;
	rcu_read_unlock();

	return err;
}