【OVS2.5.0源码分析】datapath之主流程分析

网络报文到达主机后,最终会到达协议栈的netif_receive_skb函数,该函数会通过设备对象的rx_handler函数把报文交给OVS处理。 而该rx_handler函数其实就是OVS 定义的netdev_frame_hook函数,本篇内容就是从netdev_frame_hook函数开始,分析报文在datapath中的整个主处理过程。

1、netdev_frame_hook函数

该函数为OVS与内核桥接点,所以函数定义受内核定义影响

static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;

	port_receive(skb);
	return RX_HANDLER_CONSUMED;
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) || \
      defined HAVE_RHEL_OVS_HOOK
/* Called with rcu_read_lock and bottom-halves disabled. */
static struct sk_buff *netdev_frame_hook(struct sk_buff *skb)
{
	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return skb;

	port_receive(skb);
	return NULL;
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
/*
 * Used as br_handle_frame_hook.  (Cannot run bridge at the same time, even on
 * different set of devices!)
 */
/* Called with rcu_read_lock and bottom-halves disabled. */
static struct sk_buff *netdev_frame_hook(struct net_bridge_port *p,
					 struct sk_buff *skb)
{
	port_receive(skb);
	return NULL;
}
#else
#error
#endif

2、port_receive函数

#ifndef HAVE_METADATA_DST
#define port_receive(skb)  netdev_port_receive(skb, NULL)
#else
#define port_receive(skb)  netdev_port_receive(skb, skb_tunnel_info(skb))   //报文中包含隧道信息,说明协议栈支持隧道报文了
#endif
3、netdev_port_receive函数

/* Must be called with rcu_read_lock. */
void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info)
{
	struct vport *vport;

	vport = ovs_netdev_get_vport(skb->dev);    //通过netdev设备获得vport对象,是实现在datapath中转发的基础
	if (unlikely(!vport))
		goto error;

	if (unlikely(skb_warn_if_lro(skb)))
		goto error;

	/* Make our own copy of the packet.  Otherwise we will mangle the
	 * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
	 */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return;

	skb_push(skb, ETH_HLEN);
	ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
	ovs_vport_receive(vport, skb, tun_info);     //基于vport进行处理
	return;
error:
	kfree_skb(skb);
}
4、ovs_vport_receive函数

int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
		      const struct ip_tunnel_info *tun_info)
{
	struct sw_flow_key key;
	int error;

	OVS_CB(skb)->input_vport = vport;
	OVS_CB(skb)->mru = 0;
	if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
		u32 mark;

		mark = skb->mark;
		skb_scrub_packet(skb, true);
		skb->mark = mark;
		tun_info = NULL;
	}

	ovs_skb_init_inner_protocol(skb);
	skb_clear_ovs_gso_cb(skb);
	/* Extract flow from 'skb' into 'key'. */
	error = ovs_flow_key_extract(tun_info, skb, &key);	//根据报文生成key
	if (unlikely(error)) {
		kfree_skb(skb);
		return error;
	}
	ovs_dp_process_packet(skb, &key);	//报文处理
	return 0;
}
5、ovs_dp_process_packet函数

void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
	const struct vport *p = OVS_CB(skb)->input_vport;
	struct datapath *dp = p->dp;
	struct sw_flow *flow;
	struct sw_flow_actions *sf_acts;
	struct dp_stats_percpu *stats;
	u64 *stats_counter;
	u32 n_mask_hit;

	stats = this_cpu_ptr(dp->stats_percpu);

	/* Look up flow. */
	flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),	//查询转发表
					 &n_mask_hit);
	if (unlikely(!flow)) {		//如果没有查到流表,则上送的upcall线程处理
		struct dp_upcall_info upcall;
		int error;

		memset(&upcall, 0, sizeof(upcall));
		upcall.cmd = OVS_PACKET_CMD_MISS;
		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
		upcall.mru = OVS_CB(skb)->mru;
		error = ovs_dp_upcall(dp, skb, key, &upcall);
		if (unlikely(error))
			kfree_skb(skb);
		else
			consume_skb(skb);
		stats_counter = &stats->n_missed;
		goto out;
	}

	ovs_flow_stats_update(flow, key->tp.flags, skb);
	sf_acts = rcu_dereference(flow->sf_acts);		//获取action
	ovs_execute_actions(dp, skb, sf_acts, key);		//对报文执行action

	stats_counter = &stats->n_hit;

out:
	/* Update datapath statistics. */
	u64_stats_update_begin(&stats->syncp);
	(*stats_counter)++;
	stats->n_mask_hit += n_mask_hit;
	u64_stats_update_end(&stats->syncp);
}
6、ovs_execute_actions函数

int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	int level = this_cpu_read(exec_actions_level);
	int err;

	if (unlikely(level >= EXEC_ACTIONS_LEVEL_LIMIT)) {
		if (net_ratelimit())
			pr_warn("%s: packet loop detected, dropping.\n",
				ovs_dp_name(dp));

		kfree_skb(skb);
		return -ELOOP;
	}

	this_cpu_inc(exec_actions_level);
	err = do_execute_actions(dp, skb, key,
				 acts->actions, acts->actions_len);    //执行action

	if (!level)
		process_deferred_actions(dp);

	this_cpu_dec(exec_actions_level);

	/* This return status currently does not reflect the errors
	 * encounted during deferred actions execution. Probably needs to
	 * be fixed in the future.
	 */
	return err;
}
7、do_execute_actions函数

static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
			      struct sw_flow_key *key,
			      const struct nlattr *attr, int len)
{
	/* Every output action needs a separate clone of 'skb', but the common
	 * case is just a single output action, so that doing a clone and
	 * then freeing the original skbuff is wasteful.  So the following code
	 * is slightly obscure just to avoid that.
	 */
	int prev_port = -1;
	const struct nlattr *a;
	int rem;

	for (a = attr, rem = len; rem > 0;
	     a = nla_next(a, &rem)) {
		int err = 0;

		if (unlikely(prev_port != -1)) {     //从某个端口发出
			struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);    //克隆报文

			if (out_skb)
				do_output(dp, out_skb, prev_port, key);    //输出报文,为简化起见,本篇以output为例,其他action在后续分析

			prev_port = -1;
		}

		switch (nla_type(a)) {
		case OVS_ACTION_ATTR_OUTPUT:
			prev_port = nla_get_u32(a);
			break;

		case OVS_ACTION_ATTR_USERSPACE:
			output_userspace(dp, skb, key, a, attr, len); 
			break;

		case OVS_ACTION_ATTR_HASH:
			execute_hash(skb, key, a);
			break;

		case OVS_ACTION_ATTR_PUSH_MPLS:
			err = push_mpls(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_POP_MPLS:
			err = pop_mpls(skb, key, nla_get_be16(a));
			break;

		case OVS_ACTION_ATTR_PUSH_VLAN:
			err = push_vlan(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_POP_VLAN:
			err = pop_vlan(skb, key);
			break;

		case OVS_ACTION_ATTR_RECIRC:
			err = execute_recirc(dp, skb, key, a, rem);
			if (nla_is_last(a, rem)) {
				/* If this is the last action, the skb has
				 * been consumed or freed.
				 * Return immediately.
				 */
				return err;
			}
			break;

		case OVS_ACTION_ATTR_SET:
			err = execute_set_action(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_SET_MASKED:
		case OVS_ACTION_ATTR_SET_TO_MASKED:
			err = execute_masked_set_action(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_SAMPLE:
			err = sample(dp, skb, key, a, attr, len);
			break;

		case OVS_ACTION_ATTR_CT:
			if (!is_flow_key_valid(key)) {
				err = ovs_flow_key_update(skb, key);
				if (err)
					return err;
			}

			err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
					     nla_data(a));

			/* Hide stolen IP fragments from user space. */
			if (err)
				return err == -EINPROGRESS ? 0 : err;
			break;
		}

		if (unlikely(err)) {
			kfree_skb(skb);
			return err;
		}
	}

	if (prev_port != -1)
		do_output(dp, skb, prev_port, key);
	else
		consume_skb(skb);

	return 0;
}
8、do_output函数

static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
		      struct sw_flow_key *key)
{
	struct vport *vport = ovs_vport_rcu(dp, out_port);           //获取出端口的vport对象

	if (likely(vport)) {
		u16 mru = OVS_CB(skb)->mru;

		if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {   
			ovs_vport_send(vport, skb);                   //发送报文
		} else if (mru <= vport->dev->mtu) {
			__be16 ethertype = key->eth.type;

			if (!is_flow_key_valid(key)) {
				if (eth_p_mpls(skb->protocol))
					ethertype = ovs_skb_get_inner_protocol(skb);
				else
					ethertype = vlan_get_protocol(skb);
			}

			ovs_fragment(vport, skb, mru, ethertype);
		} else {
			OVS_NLERR(true, "Cannot fragment IP frames");
			kfree_skb(skb);
		}
	} else {
		kfree_skb(skb);
	}
}
9、ovs_vport_send函数
void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	int mtu = vport->dev->mtu;

	if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
		net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
				     vport->dev->name,
				     packet_length(skb), mtu);
		vport->dev->stats.tx_errors++;
		goto drop;
	}

	skb->dev = vport->dev;      //skb的dev设备,设置成vport关联的netdev设备
	vport->ops->send(skb);      //调用vport对应vport_ops的send函数,如果是ovs_netdev_vport_ops,则调用内核的dev_queue_xmit函数,其他种类的vport后续分析
	return;

drop:
	kfree_skb(skb);
}
至此报文从进入到OVS到报文离开OVS已经全部完成,这里只是最简单的流程,后续逐步丰富之。



你可能感兴趣的:(网络,openvswitch,ovs)