vxlan端口是通过ovs-vsctl命令创建的,整个调用流程如下:
bridge_reconfigure->bridge_add_ports->bridge_add_ports__->iface_create->iface_do_create->ofproto_port_add->port_add(ofproto_class)->dpif_port_add->dpif_netlink_port_add(dpif_class)->dpif_netlink_port_add__->内核分界线->ovs_vport_cmd_new->new_vport->ovs_vport_add
本篇着重介绍内核部分的处理流程。另外,ofproto_class的port_add函数也非常关键:只有当datapath中还不存在名为dp_port_name的后端端口时,它才会调用dpif_port_add去创建,因此vxlan的后端设备在系统中只会创建一个:
port_add函数
static int port_add(struct ofproto *ofproto_, struct netdev *netdev) { struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); const char *devname = netdev_get_name(netdev); char namebuf[NETDEV_VPORT_NAME_BUFSIZE]; const char *dp_port_name; if (netdev_vport_is_patch(netdev)) { //如果是patch端口,不需要创建后端设备 sset_add(&ofproto->ghost_ports, netdev_get_name(netdev)); return 0; } dp_port_name = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf); if (!dpif_port_exists(ofproto->backer->dpif, dp_port_name)) { //vxlan类型的端口,后端设备是唯一的,只能创建一个 odp_port_t port_no = ODPP_NONE; int error; error = dpif_port_add(ofproto->backer->dpif, netdev, &port_no); if (error) { return error; } if (netdev_get_tunnel_config(netdev)) { simap_put(&ofproto->backer->tnl_backers, dp_port_name, odp_to_u32(port_no)); } } if (netdev_get_tunnel_config(netdev)) { sset_add(&ofproto->ghost_ports, devname); } else { sset_add(&ofproto->ports, devname); } return 0; }
ovs_vport_cmd_new函数
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct vport_parms parms; struct sk_buff *reply; struct vport *vport; struct datapath *dp; u32 port_no; int err; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; port_no = a[OVS_VPORT_ATTR_PORT_NO] ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; if (port_no >= DP_MAX_PORTS) return -EFBIG; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); restart: dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto exit_unlock_free; if (port_no) { vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) goto exit_unlock_free; } else { for (port_no = 1; ; port_no++) { if (port_no >= DP_MAX_PORTS) { err = -EFBIG; goto exit_unlock_free; } vport = ovs_vport_ovsl(dp, port_no); if (!vport) break; } } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; vport = new_vport(&parms); //创建vport对象 err = PTR_ERR(vport); if (IS_ERR(vport)) { if (err == -EAGAIN) goto restart; goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); //构造响应请求信息 BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info); //响应请求 return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; }
/* Called with ovs_mutex. */ static struct vport *new_vport(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_add(parms); //创建vport if (!IS_ERR(vport)) { struct datapath *dp = parms->dp; struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); //添加vport到datapath中,根据port_no检索 hlist_add_head_rcu(&vport->dp_hash_node, head); } return vport; }
struct vport *ovs_vport_add(const struct vport_parms *parms) { struct vport_ops *ops; struct vport *vport; ops = ovs_vport_lookup(parms); if (ops) { struct hlist_head *bucket; if (!try_module_get(ops->owner)) return ERR_PTR(-EAFNOSUPPORT); vport = ops->create(parms); //vxlan端口为vxlan_create函数 if (IS_ERR(vport)) { module_put(ops->owner); return vport; } bucket = hash_bucket(ovs_dp_get_net(vport->dp), ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); //添加到datapath中,根据hash检索 return vport; } /* Unlock to attempt module load and return -EAGAIN if load * was successful as we need to restart the port addition * workflow. */ ovs_unlock(); request_module("vport-type-%d", parms->type); ovs_lock(); if (!ovs_vport_lookup(parms)) return ERR_PTR(-EAFNOSUPPORT); else return ERR_PTR(-EAGAIN); }
static struct vport *vxlan_create(const struct vport_parms *parms) { struct vport *vport; vport = vxlan_tnl_create(parms); //创建vxlan端口 if (IS_ERR(vport)) return vport; return ovs_netdev_link(vport, parms->name); //注册设备rx_handler函数 }
static struct vport *vxlan_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; struct net_device *dev; struct vport *vport; struct nlattr *a; int err; struct vxlan_config conf = { .no_share = true, .flags = VXLAN_F_COLLECT_METADATA, /* Don't restrict the packets that can be sent by MTU */ .mtu = IP_MAX_MTU, }; if (!options) { err = -EINVAL; goto error; } a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); if (a && nla_len(a) == sizeof(u16)) { conf.dst_port = htons(nla_get_u16(a)); } else { /* Require destination port from userspace. */ err = -EINVAL; goto error; } vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); //创建vport对象 if (IS_ERR(vport)) return vport; a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); if (a) { err = vxlan_configure_exts(vport, a, &conf); if (err) { ovs_vport_free(vport); goto error; } } rtnl_lock(); dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); //创建net_device设备 if (IS_ERR(dev)) { rtnl_unlock(); ovs_vport_free(vport); return ERR_CAST(dev); } dev_change_flags(dev, dev->flags | IFF_UP); //会调用设备驱动的open函数 rtnl_unlock(); return vport; error: return ERR_PTR(err); }
/*
 * Open handler for a vxlan net_device.
 *
 * Obtains a vxlan socket for the device's configured destination port,
 * attaches the device to it, joins the multicast group when the default
 * remote address is multicast, and arms the age timer when an age interval
 * is configured.
 *
 * Returns 0 on success, the error from vxlan_sock_add(), or the error from
 * vxlan_igmp_join() (-EADDRINUSE from the join is treated as success).
 */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_sock *vs;
	int err;

	/* Get (or create) the vxlan socket for this port. */
	vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port,
			    vxlan->cfg.no_share, vxlan->flags);
	if (IS_ERR(vs))
		return PTR_ERR(vs);

	/* Attach the device to the socket (original note: adds vs and vni). */
	vxlan_vs_add_dev(vs, vxlan);

	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
		err = vxlan_igmp_join(vxlan);
		/* -EADDRINUSE is not treated as a failure here. */
		if (err && err != -EADDRINUSE) {
			vxlan_sock_release(vs);
			return err;
		}
	}

	if (vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

	return 0;
}
/*
 * Returns a vxlan socket for 'port' in 'net'.
 *
 * With 'no_share' set, a new socket is always created.  Otherwise an
 * existing socket for the same family/port/flags is looked up under
 * sock_lock and reused by taking a reference.  If the reference cannot be
 * taken because the count is already zero, ERR_PTR(-EBUSY) is returned.
 * When no matching socket exists, a new one is created.
 */
static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
					 bool no_share, u32 flags)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	bool ipv6 = flags & VXLAN_F_IPV6;

	if (no_share)
		return vxlan_socket_create(net, port, flags);

	/* Shared mode: try to reuse an existing socket first. */
	spin_lock(&vn->sock_lock);
	vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags);
	if (vs && !atomic_add_unless(&vs->refcnt, 1, 0))
		vs = ERR_PTR(-EBUSY);
	spin_unlock(&vn->sock_lock);

	/* Found (or found-but-busy): return it; otherwise create one. */
	return vs ? vs : vxlan_socket_create(net, port, flags);
}
/*
 * Allocates and initialises a vxlan_sock bound to 'port' in 'net'.
 *
 * The new socket starts with a reference count of 1, is added to the
 * per-net socket hash under sock_lock, and is configured as a UDP
 * encapsulation socket whose receive callback is vxlan_udp_encap_recv().
 *
 * Returns the vxlan_sock, ERR_PTR(-ENOMEM) when allocation fails, or the
 * error from vxlan_create_sock().
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
					      u32 flags)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct udp_tunnel_sock_cfg tunnel_cfg;
	bool ipv6 = !!(flags & VXLAN_F_IPV6);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int bucket;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (bucket = 0; bucket < VNI_HASH_SIZE; bucket++)
		INIT_HLIST_HEAD(&vs->vni_list[bucket]);

	INIT_WORK(&vs->del_work, vxlan_del_work);

	/* Create the underlying socket. */
	sock = vxlan_create_sock(net, ipv6, port, flags);
	if (IS_ERR(sock)) {
		pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
			PTR_ERR(sock));
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	atomic_set(&vs->refcnt, 1);
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);

	/* Initialize the vxlan udp offloads structure */
#ifdef HAVE_UDP_OFFLOAD
	vs->udp_offloads.port = port;
	vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive;
	vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;
	/* Original note: registers with the global offload object so that
	 * vxlan packets can be GRO-aggregated.
	 */
	vxlan_notify_add_rx_port(vs);
#endif

	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	spin_unlock(&vn->sock_lock);

	/* Mark socket as an encapsulation socket. */
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = vxlan_udp_encap_recv;
	tunnel_cfg.encap_destroy = NULL;

	/* Install the configuration (including encap_rcv) on the socket. */
	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return vs;
}
综上,内核在创建vxlan端口时主要完成以下三件事:
1、创建net_device设备对象,通过该对象可以获得vxlan_dev以及vxlan配置信息;
2、创建vxlan socket,并设置encap_rcv函数;
3、注册vxlan offload到内核,实现vxlan报文的gro,根据UDP端口号来判断;
内核收到报文后,在GRO流程中先进入udp_gro_receive,它根据UDP端口号找到已注册的vxlan_gro_receive;该函数作为内外层报文之间的桥梁,完成vxlan报文的GRO聚合。报文聚合后通过netif_receive_skb_internal上送到协议栈,协议栈一直处理到UDP socket收包,然后交给vxlan_udp_encap_recv函数进行处理。