When a network packet arrives at the host, it eventually reaches the protocol stack's netif_receive_skb() function, which hands the packet over to OVS through the rx_handler registered on the receiving device. That rx_handler is in fact the netdev_frame_hook() function defined by OVS. Starting from netdev_frame_hook(), this article follows the main processing path a packet takes through the datapath.
1. The netdev_frame_hook function
This function is the attachment point between OVS and the kernel, so its prototype has to follow the kernel's rx_handler definition and therefore varies with the kernel version; the #if/#elif chain selects the matching variant (the excerpt below starts inside that chain, at the variant for newer kernels).
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
    struct sk_buff *skb = *pskb;

    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;

    port_receive(skb);
    return RX_HANDLER_CONSUMED;
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) || \
      defined HAVE_RHEL_OVS_HOOK
/* Called with rcu_read_lock and bottom-halves disabled. */
static struct sk_buff *netdev_frame_hook(struct sk_buff *skb)
{
    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return skb;

    port_receive(skb);
    return NULL;
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
/*
 * Used as br_handle_frame_hook.  (Cannot run bridge at the same time, even on
 * different set of devices!)
 */
/* Called with rcu_read_lock and bottom-halves disabled. */
static struct sk_buff *netdev_frame_hook(struct net_bridge_port *p,
                                         struct sk_buff *skb)
{
    port_receive(skb);
    return NULL;
}
#else
#error
#endif
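How does this hook get onto the device in the first place? When a netdev port is added to the datapath, OVS registers netdev_frame_hook() as the device's rx_handler and stores the vport pointer as rx_handler_data. The snippet below is only a minimal sketch of that idea, loosely modeled on the port-attach path in vport-netdev.c; the function name attach_frame_hook is made up for the illustration, and error unwinding is stripped down:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

struct vport;    /* defined in vport.h */

/* Sketch only: attach netdev_frame_hook to a device (the real code also sets
 * device flags and unwinds properly on failure). */
static int attach_frame_hook(struct vport *vport, struct net_device *dev)
{
    int err;

    rtnl_lock();
    /* netdev_frame_hook becomes dev->rx_handler; the vport pointer is saved
     * as rx_handler_data so that ovs_netdev_get_vport(skb->dev) can recover
     * it for every received frame. */
    err = netdev_rx_handler_register(dev, netdev_frame_hook, vport);
    if (!err)
        dev_set_promiscuity(dev, 1);
    rtnl_unlock();
    return err;
}

With this in place, netif_receive_skb() finds a non-NULL rx_handler on the port device and invokes netdev_frame_hook() for every frame, which is exactly where this article picks up.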
2. The port_receive macro
#ifndef HAVE_METADATA_DST
#define port_receive(skb)  netdev_port_receive(skb, NULL)
#else
/* The packet carries tunnel metadata, i.e. the kernel stack itself already
 * supports tunneled packets. */
#define port_receive(skb)  netdev_port_receive(skb, skb_tunnel_info(skb))
#endif

3. The netdev_port_receive function
/* Must be called with rcu_read_lock. */
void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info)
{
    struct vport *vport;

    /* Get the vport from the netdev; this is what allows the packet to be
     * forwarded inside the datapath. */
    vport = ovs_netdev_get_vport(skb->dev);
    if (unlikely(!vport))
        goto error;

    if (unlikely(skb_warn_if_lro(skb)))
        goto error;

    /* Make our own copy of the packet.  Otherwise we will mangle the
     * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). */
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (unlikely(!skb))
        return;

    skb_push(skb, ETH_HLEN);
    ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);

    ovs_vport_receive(vport, skb, tun_info);    /* continue processing based on the vport */
    return;
error:
    kfree_skb(skb);
}
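The ovs_netdev_get_vport() call above is the counterpart of the registration sketched under section 1: conceptually it just reads back the pointer that was stored as rx_handler_data. The following is a rough sketch of that idea only; the real helper in vport-netdev.c differs in details (exact RCU accessor, version-specific variants), so treat it as an assumption about the shape of the code rather than the code itself:

#include <linux/netdevice.h>

struct vport;    /* defined in vport.h */

/* Sketch: recover the vport registered as the device's rx_handler_data. */
static struct vport *get_vport_sketch(struct net_device *dev)
{
    if (likely(dev->priv_flags & IFF_OVS_DATAPATH))    /* set when the device was attached to a datapath */
        return (struct vport *)rcu_dereference(dev->rx_handler_data);
    return NULL;    /* not an OVS datapath port */
}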
4. The ovs_vport_receive function

int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
                      const struct ip_tunnel_info *tun_info)
{
    struct sw_flow_key key;
    int error;

    OVS_CB(skb)->input_vport = vport;
    OVS_CB(skb)->mru = 0;
    if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
        u32 mark;

        mark = skb->mark;
        skb_scrub_packet(skb, true);
        skb->mark = mark;
        tun_info = NULL;
    }

    ovs_skb_init_inner_protocol(skb);
    skb_clear_ovs_gso_cb(skb);

    /* Extract flow from 'skb' into 'key'. */
    error = ovs_flow_key_extract(tun_info, skb, &key);    /* build the flow key from the packet */
    if (unlikely(error)) {
        kfree_skb(skb);
        return error;
    }
    ovs_dp_process_packet(skb, &key);    /* process the packet */
    return 0;
}
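OVS_CB(skb) used above is the small amount of per-packet metadata OVS keeps in the skb's control buffer (skb->cb); the input_vport recorded here is read back later by ovs_dp_process_packet(). Roughly, and leaving out version-specific fields, it has the following shape (abridged sketch, not the exact struct ovs_skb_cb from datapath.h):

struct vport;    /* defined in vport.h */

/* Abridged sketch of the per-packet metadata stored in skb->cb. */
struct ovs_skb_cb_sketch {
    struct vport   *input_vport;    /* port the packet arrived on */
    unsigned short  mru;            /* max receive unit for reassembled fragments, 0 if unused */
};
#define OVS_CB_SKETCH(skb)  ((struct ovs_skb_cb_sketch *)(skb)->cb)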
5. The ovs_dp_process_packet function

void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
    const struct vport *p = OVS_CB(skb)->input_vport;
    struct datapath *dp = p->dp;
    struct sw_flow *flow;
    struct sw_flow_actions *sf_acts;
    struct dp_stats_percpu *stats;
    u64 *stats_counter;
    u32 n_mask_hit;

    stats = this_cpu_ptr(dp->stats_percpu);

    /* Look up flow. */
    flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),    /* look up the flow table */
                                     &n_mask_hit);
    if (unlikely(!flow)) {    /* no flow matched: send the packet up to the upcall handler (userspace) */
        struct dp_upcall_info upcall;
        int error;

        memset(&upcall, 0, sizeof(upcall));
        upcall.cmd = OVS_PACKET_CMD_MISS;
        upcall.portid = ovs_vport_find_upcall_portid(p, skb);
        upcall.mru = OVS_CB(skb)->mru;

        error = ovs_dp_upcall(dp, skb, key, &upcall);
        if (unlikely(error))
            kfree_skb(skb);
        else
            consume_skb(skb);
        stats_counter = &stats->n_missed;
        goto out;
    }

    ovs_flow_stats_update(flow, key->tp.flags, skb);
    sf_acts = rcu_dereference(flow->sf_acts);    /* fetch the matched flow's actions */
    ovs_execute_actions(dp, skb, sf_acts, key);  /* execute the actions on the packet */
    stats_counter = &stats->n_hit;

out:
    /* Update datapath statistics. */
    u64_stats_update_begin(&stats->syncp);
    (*stats_counter)++;
    stats->n_mask_hit += n_mask_hit;
    u64_stats_update_end(&stats->syncp);
}
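ovs_flow_tbl_lookup_stats() is where the kernel flow ("megaflow") cache is consulted: the table holds a list of masks installed by userspace, and for each mask the packet key is ANDed with it and then searched among the flows using that mask; n_mask_hit counts how many masks had to be tried before a hit. The toy code below only illustrates that matching idea; every type and name in it is invented for the illustration, and the real lookup hashes the masked key into per-mask hash tables instead of scanning linearly:

/* Toy stand-ins for the real flow-table structures (illustration only). */
struct toy_key   { unsigned int bits; };
struct toy_mask  { unsigned int bits; struct toy_mask *next; };
struct toy_flow  { struct toy_key match; struct toy_mask *mask; struct toy_flow *next; };
struct toy_table { struct toy_mask *masks; struct toy_flow *flows; };

static struct toy_flow *toy_lookup(const struct toy_table *tbl,
                                   const struct toy_key *key,
                                   unsigned int *n_mask_hit)
{
    const struct toy_mask *mask;
    struct toy_flow *flow;

    for (mask = tbl->masks; mask; mask = mask->next) {
        unsigned int masked = key->bits & mask->bits;    /* wildcard the don't-care bits */

        (*n_mask_hit)++;
        for (flow = tbl->flows; flow; flow = flow->next)
            if (flow->mask == mask && flow->match.bits == masked)
                return flow;    /* first matching mask wins */
    }
    return NULL;    /* miss: the caller builds an OVS_PACKET_CMD_MISS upcall */
}

A miss therefore means no installed mask/flow pair covers the packet; it is punted to ovs-vswitchd, which decides how to handle it and typically installs a new kernel flow so that subsequent packets of the same flow stay in the datapath.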
6. The ovs_execute_actions function

int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
                        const struct sw_flow_actions *acts,
                        struct sw_flow_key *key)
{
    int level = this_cpu_read(exec_actions_level);
    int err;

    if (unlikely(level >= EXEC_ACTIONS_LEVEL_LIMIT)) {
        if (net_ratelimit())
            pr_warn("%s: packet loop detected, dropping.\n",
                    ovs_dp_name(dp));
        kfree_skb(skb);
        return -ELOOP;
    }

    this_cpu_inc(exec_actions_level);
    err = do_execute_actions(dp, skb, key,
                             acts->actions, acts->actions_len);    /* execute the action list */

    if (!level)
        process_deferred_actions(dp);
    this_cpu_dec(exec_actions_level);

    /* This return status currently does not reflect the errors
     * encountered during deferred actions execution. Probably needs to
     * be fixed in the future. */
    return err;
}

7. The do_execute_actions function
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
                              struct sw_flow_key *key,
                              const struct nlattr *attr, int len)
{
    /* Every output action needs a separate clone of 'skb', but the common
     * case is just a single output action, so that doing a clone and
     * then freeing the original skbuff is wasteful.  So the following code
     * is slightly obscure just to avoid that. */
    int prev_port = -1;
    const struct nlattr *a;
    int rem;

    for (a = attr, rem = len; rem > 0;
         a = nla_next(a, &rem)) {
        int err = 0;

        if (unlikely(prev_port != -1)) {    /* a previous output action is pending: send a copy out of that port */
            struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);    /* clone the packet */

            if (out_skb)
                /* Transmit the clone.  To keep things simple this article
                 * follows the output action; the other actions are analyzed
                 * in later articles. */
                do_output(dp, out_skb, prev_port, key);

            prev_port = -1;
        }

        switch (nla_type(a)) {
        case OVS_ACTION_ATTR_OUTPUT:
            prev_port = nla_get_u32(a);
            break;

        case OVS_ACTION_ATTR_USERSPACE:
            output_userspace(dp, skb, key, a, attr, len);
            break;

        case OVS_ACTION_ATTR_HASH:
            execute_hash(skb, key, a);
            break;

        case OVS_ACTION_ATTR_PUSH_MPLS:
            err = push_mpls(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_POP_MPLS:
            err = pop_mpls(skb, key, nla_get_be16(a));
            break;

        case OVS_ACTION_ATTR_PUSH_VLAN:
            err = push_vlan(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_POP_VLAN:
            err = pop_vlan(skb, key);
            break;

        case OVS_ACTION_ATTR_RECIRC:
            err = execute_recirc(dp, skb, key, a, rem);
            if (nla_is_last(a, rem)) {
                /* If this is the last action, the skb has
                 * been consumed or freed.
                 * Return immediately. */
                return err;
            }
            break;

        case OVS_ACTION_ATTR_SET:
            err = execute_set_action(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_SET_MASKED:
        case OVS_ACTION_ATTR_SET_TO_MASKED:
            err = execute_masked_set_action(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_SAMPLE:
            err = sample(dp, skb, key, a, attr, len);
            break;

        case OVS_ACTION_ATTR_CT:
            if (!is_flow_key_valid(key)) {
                err = ovs_flow_key_update(skb, key);
                if (err)
                    return err;
            }

            err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
                                 nla_data(a));

            /* Hide stolen IP fragments from user space. */
            if (err)
                return err == -EINPROGRESS ? 0 : err;
            break;
        }

        if (unlikely(err)) {
            kfree_skb(skb);
            return err;
        }
    }

    if (prev_port != -1)
        do_output(dp, skb, prev_port, key);    /* last output: send the original skb, no clone needed */
    else
        consume_skb(skb);

    return 0;
}
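The prev_port bookkeeping implements the clone-saving trick described in the comment at the top of the function: an output action is not executed when it is seen, but only once the next action (or the end of the list) shows it is not the last one. As a hypothetical trace, for a flow whose action list is output:1,output:2,output:3:

/* Hypothetical action list: output:1, output:2, output:3
 *
 * iteration 1: sees output:1 -> prev_port = 1              (nothing sent yet)
 * iteration 2: prev_port != -1 -> clone skb, do_output(clone, port 1);
 *              sees output:2 -> prev_port = 2
 * iteration 3: prev_port != -1 -> clone skb, do_output(clone, port 2);
 *              sees output:3 -> prev_port = 3
 * after loop : prev_port != -1 -> do_output(original skb, port 3), no clone
 *
 * Net effect: N output actions cost N-1 clones, and the common single-output
 * case costs none. */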
8. The do_output function

static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
                      struct sw_flow_key *key)
{
    struct vport *vport = ovs_vport_rcu(dp, out_port);    /* look up the vport of the output port */

    if (likely(vport)) {
        u16 mru = OVS_CB(skb)->mru;

        if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
            ovs_vport_send(vport, skb);    /* transmit the packet */
        } else if (mru <= vport->dev->mtu) {
            __be16 ethertype = key->eth.type;

            if (!is_flow_key_valid(key)) {
                if (eth_p_mpls(skb->protocol))
                    ethertype = ovs_skb_get_inner_protocol(skb);
                else
                    ethertype = vlan_get_protocol(skb);
            }

            ovs_fragment(vport, skb, mru, ethertype);
        } else {
            OVS_NLERR(true, "Cannot fragment IP frames");
            kfree_skb(skb);
        }
    } else {
        kfree_skb(skb);
    }
}

9. The ovs_vport_send function
void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
    int mtu = vport->dev->mtu;

    if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
        net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
                             vport->dev->name,
                             packet_length(skb), mtu);
        vport->dev->stats.tx_errors++;
        goto drop;
    }

    skb->dev = vport->dev;    /* set skb->dev to the netdev associated with the vport */
    /* Call the send op of this vport's vport_ops.  For ovs_netdev_vport_ops
     * this ends up in the kernel's dev_queue_xmit(); other vport types are
     * analyzed in later articles. */
    vport->ops->send(skb);
    return;

drop:
    kfree_skb(skb);
}

At this point the packet's whole journey, from entering OVS to leaving OVS, has been covered. This is only the simplest path; later articles will flesh it out step by step.