netif_receive_skb函数是协议栈报文接收的入口,一般由驱动调用,把报文送入协议栈,4.1.12内核做了对sk的封装,目的是什么没还未搞清楚。
1、netif_receive_skb函数
static inline int netif_receive_skb(struct sk_buff *skb) { return netif_receive_skb_sk(skb->sk, skb); }2、netif_receive_skb_sk函数
int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb) { trace_netif_receive_skb_entry(skb); return netif_receive_skb_internal(skb); }3、netif_receive_skb_internal函数
static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(netdev_tstamp_prequeue, skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); //支持rps,选择报文处理的CPU if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); //交给相应的CPU处理 rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; }4、__netif_receive_skb函数
static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned long pflags = current->flags; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ current->flags |= PF_MEMALLOC; ret = __netif_receive_skb_core(skb, true); tsk_restore_flags(current, pflags, PF_MEMALLOC); } else ret = __netif_receive_skb_core(skb, false); return ret; }5、__netif_receive_skb_core函数
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); //重置network header,此时skb已经指向IP头(没有vlan的情况下) if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); //重置mac len pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; //设置报文的iif值 __this_cpu_inc(softnet_data.processed); if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { //vxlan报文处理,剥除vxlan头 skb = skb_vlan_untag(skb); //剥除vlan头 if (unlikely(!skb)) goto out; } #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); goto ncls; } #endif if (pfmemalloc) //此类报文不允许ptype_all处理,即tcpdump也抓不到 goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { //遍历ptype_all,如果有则做相应处理,例如raw socket和tcpdump实现 if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { //设备上注册ptype_all,做相应的处理,更加精细的控制 if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_CLS_ACT if (static_key_false(&ingress_needed)) { skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; } skb->tc_verd = 0; ncls: #endif if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); //设备rx_handler,加入OVS时会注册为OVS的入口函数 if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { //执行rx_handler处理,例如进入OVS,OVS不支持报头中携带vlan的报文 case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb))) {<span style="white-space:pre"> </span> if (skb_vlan_tag_get_id(skb)) skb->pkt_type = PACKET_OTHERHOST; /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ skb->vlan_tci = 0; } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, //根据全局定义的协议处理报文 &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, //根据设备上注册的协议进行处理 &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); //如果设备发生变化,那么还需要针对新设备的注册协议进行处理 } if (pt_prev) { if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); //调用协议处理 } else { drop: atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: return ret; }
1、vlan报文的处理,主要是循环把vlan头剥掉,如果qinq场景,两个vlan都会被剥掉;
2、交给rx_handler处理,例如OVS、linux bridge等;
3、ptype_all处理,例如抓包程序、raw socket等;
4、ptype_base处理,交给协议栈处理,例如ip、arp、rarp等;
后续将逐个针对这些场景进行分析。