本文主要内容:下半部的实现,分析数据包从上半部结束后到L3的处理过程。
内核版本:2.6.37
Author:zhangskd @ csdn blog
下半部的实现
接收数据包的下半部处理流程为:
net_rx_action // 软中断
|--> process_backlog() // 默认poll
|--> __netif_receive_skb() // L2处理函数
|--> ip_rcv() // L3入口
net_rx_action
软中断(NET_RX_SOFTIRQ)的处理函数net_rx_action()主要做了:
遍历sd->poll_list,对于每个处于轮询状态的设备,调用它的poll()函数来处理数据包。
如果设备在本次poll中用完了它的weight(work == weight):若该设备的NAPI正在被禁止(napi_disable_pending),则把设备从sd->poll_list上删除,否则把设备移动到sd->poll_list的队尾。
每次软中断最多允许处理netdev_budget(300)个数据包,最长运行时间为2 jiffies(HZ=1000时约为2ms)。
每个设备每次poll最多允许处理n->weight个数据包;对于非NAPI设备(使用backlog),该值为weight_p(64)。
如果在这次软中断中没处理完,则再次设置NET_RX_SOFTIRQ标志触发软中断。
- static void net_rx_action(struct softirq_action *h) /* NET_RX_SOFTIRQ handler: polls the devices queued on this CPU's sd->poll_list */
- {
- struct softnet_data *sd = &__get_cpu_var(softnet_data); /* per-CPU receive state */
- unsigned long time_limit = jiffies + 2; /* deadline: two jiffies after entry */
- int budget = netdev_budget; /* work this run may still do, summed over all devices */
- void *have; /* value returned by netpoll_poll_lock(), handed back to netpoll_poll_unlock() */
-
- local_irq_disable();
- /* The list-empty test and the budget/deadline check below run with local IRQs off. */
-
- while(! list_empty(&sd->poll_list)) {
-
- struct napi_struct *n;
- int work, weight;
-
- /* Stop once the budget is spent or the deadline has passed; */
- /* softnet_break re-raises NET_RX_SOFTIRQ so whatever is still on */
- /* the poll list is handled by a later softirq run. */
-
-
- if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
- goto softnet_break;
-
- local_irq_enable();
-
- /* Local IRQs stay enabled from here until the local_irq_disable() */
- /* that follows the poll() call, so the device's poll routine runs */
- /* with interrupts on. */
- /* NOTE(review): reading the list head with IRQs enabled relies on */
- /* list-ownership rules that are not visible in this listing. */
-
- n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
-
- have = netpoll_poll_lock(n);
- weight = n->weight; /* per-device cap passed to this poll() call */
-
- /* poll() is invoked only while NAPI_STATE_SCHED is set in n->state; */
- /* otherwise work stays 0 and the entry is left where it is. */
- /* NOTE(review): presumably guards against a concurrent netpoll */
- /* poller - confirm against netpoll_poll_lock(). */
-
- work = 0;
-
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- /* work receives poll()'s return value; it is charged against budget below. */
- work = n->poll(n, weight);
-
- trace_napi_poll(n);
- }
- WARN_ON_ONCE(work > weight); /* warn (once) if poll() reports more than it was allowed */
-
- budget -= work;
-
- local_irq_disable();
- /* Only a device that used its entire weight is touched here. */
- if (unlikely(work == weight)) {
- /* Disable pending: complete the NAPI instance, with IRQs enabled around the call. */
- if (unlikely(napi_disable_pending(n))) {
- local_irq_enable();
- napi_complete(n);
- local_irq_disable();
-
- } else
- /* Otherwise requeue the device at the tail of the poll list. */
- list_move_tail(&n->poll_list, &sd->poll_list);
- }
- netpoll_poll_unlock(have);
- }
-
- out:
- net_rps_action_and_irq_enable(sd); /* NOTE(review): judging by its name, this also re-enables local IRQs - confirm */
-
- #ifdef CONFIG_NET_DMA
- ...
- #endif
-
- return;
- /* Budget or time exhausted: count the squeeze and re-raise the softirq (IRQs are off here). */
- softnet_break:
- sd->time_squeeze++;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- goto out;
- }
当调用napi_struct的poll()来处理数据包时,本地中断是开启的,这意味着新的数据包可以继续添加到
输入队列中。
process_backlog
如果网卡驱动不支持NAPI,则默认的napi_struct->poll()函数为process_backlog()。
process_backlog()的主要工作:
1. 处理sd->process_queue中的数据包
分别取出每个skb,从队列中删除。
开本地中断,调用__netif_receive_skb()把skb从L2传递到L3,然后关本地中断。
这说明在处理skb时,是允许网卡中断把数据包添加到接收队列(sd->input_pkt_queue)中的。
2. 如果处理完sd->process_queue中的数据包了,quota还没用完
把接收队列添加到sd->process_queue处理队列的尾部后,初始化接收队列。
接下来会继续处理sd->process_queue中的数据包。
3. 如果本次能处理完sd->process_queue和sd->input_pkt_queue中的所有数据包
把napi_struct从sd->poll_list队列中删除掉,清除NAPI_STATE_SCHED标志。
- static int process_backlog(struct napi_struct *napi, int quota) /* poll routine for the per-CPU backlog; returns the number of skbs handed to __netif_receive_skb() */
- {
- int work = 0; /* skbs processed so far in this call */
- struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); /* napi is the backlog member embedded in softnet_data */
-
- #ifdef CONFIG_RPS
- ...
- #endif
-
- napi->weight = weight_p;
- local_irq_disable();
- /* Queue manipulation below runs with local IRQs off; they are enabled only around __netif_receive_skb(). */
- while(work < quota) {
- struct sk_buff *skb;
- unsigned int qlen;
-
- /* Phase 1: drain sd->process_queue one skb at a time. */
- /* IRQs are on while an skb is being processed, so the input */
- /* queue can keep filling in the meantime. */
- while((skb = __skb_dequeue(&sd->process_queue))) {
- local_irq_enable();
-
- __netif_receive_skb(skb);
-
- local_irq_disable();
- input_queue_head_incr(sd);
- /* Quota reached: return with IRQs enabled; napi stays on the poll list. */
- if (++work >= quota) {
- local_irq_enable();
- return work;
- }
- }
- /* Phase 2: process_queue is empty - move everything queued on input_pkt_queue over, under rps_lock. */
- rps_lock(sd);
- qlen = skb_queue_len(&sd->input_pkt_queue);
-
- if (qlen)
- skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue);
-
- /* Fewer packets were waiting than the remaining quota: this call can finish them all. */
- if (qlen < quota - work) {
- /* Take napi off the poll list and clear its whole state word (which includes NAPI_STATE_SCHED). */
- list_del(&napi->poll_list);
-
- napi->state = 0;
- quota = work + qlen; /* shrink quota so the loop ends after the qlen packets just spliced */
- }
- rps_unlock(sd);
- }
-
- local_irq_enable();
- return work;
- }
从sk_buff_head队列中取出第一个skb,并把它从队列中删除。
- /* __skb_dequeue - take the first skb off a queue. */
- /* @list: queue head to dequeue from. */
- /* Looks at the head entry with skb_peek(); when that yields a */
- /* non-NULL skb it is unlinked from @list before being returned. */
- /* Returns the skb that was removed, or the NULL from skb_peek(). */
- /* NOTE(review): no locking is visible here - callers presumably serialize access; confirm. */
- static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
- {
- struct sk_buff *skb = skb_peek(list);
- if (skb)
- __skb_unlink(skb, list);
- return skb;
- }
把list添加到head的队尾,然后把list重新初始化。
- /* skb_queue_splice_tail_init - append one queue to the tail of another. */
- /* @list: queue whose entries are moved; reinitialized afterwards. */
- /* @head: queue that receives the entries at its tail. */
- /* Does nothing when @list is empty. Otherwise the entries are */
- /* spliced in between head->prev and head, head->qlen grows by */
- /* list->qlen, and @list is reset with __skb_queue_head_init(). */
-
- static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head)
- {
- if (! skb_queue_empty(list)) {
- __skb_queue_splice(list, head->prev, (struct sk_buff *)head);
- head->qlen += list->qlen;
- __skb_queue_head_init(list);
- }
- }
__netif_receive_skb
__netif_receive_skb()的主要工作为:
处理NETPOLL、网卡绑定、入口流量控制、桥接、VLAN。
遍历嗅探器(ETH_P_ALL)链表ptype_all。对于每个注册的sniffer,调用它的处理函数
packet_type->func(),例如tcpdump。
赋值skb->network_header,根据skb->protocol从三层协议哈希表ptype_base中找到对应的
三层协议。如果三层协议是ETH_P_IP,相应的packet_type为ip_packet_type, 协议处理函数为ip_rcv()。
- static int __netif_receive_skb(struct sk_buff *skb) /* L2 receive path: runs taps and hooks, then hands skb to the matching L3 handler; returns that handler's result or NET_RX_DROP */
- {
- struct packet_type *ptype, *pt_prev; /* pt_prev: last matching handler, delivered one step late */
- rx_handler_func_t *rx_handler;
- struct net_device *orig_dev; /* device the skb arrived on */
- struct net_device *master;
- struct net_device *null_or_orig; /* NULL lets handlers with dev == NULL match; orig_dev restricts to exact matches */
- struct net_device *orig_or_bond;
- int ret = NET_RX_DROP;
- __be16 type;
-
- if (! netdev_tstamp_prequeue)
- net_timestamp_check(skb);
- trace_netif_receive_skb(skb);
-
- /* netpoll gets the first look; if it claims the skb, report a drop to the caller. */
- if (netpoll_receive_skb(skb))
- return NET_RX_DROP;
- /* Record the receiving interface index once, if not already set. */
- if (! skb->skb_iif)
- skb->skb_iif = skb->dev->ifindex;
-
- /* Bonding: decide which device(s) handlers must be bound to in order to see this skb. */
- null_or_orig = NULL;
- orig_dev = skb->dev;
- master = ACCESS_ONCE(orig_dev->master);
-
- if (skb->deliver_no_wcard)
- null_or_orig = orig_dev;
- else if (master) {
- if (skb_bond_should_drop(skb, master)) {
- skb->deliver_no_wcard = 1;
- null_or_orig = orig_dev; /* wildcard (dev == NULL) handlers no longer match */
- } else
- skb->dev = master; /* present the skb as received on the master device */
- }
-
- __this_cpu_inc(softnet_data.processed);
- /* Initialize both the network and the transport header offsets before computing mac_len. */
- skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
- skb->mac_len = skb->network_header - skb->mac_header;
- pt_prev = NULL;
-
- rcu_read_lock();
-
- /* An skb flagged TC_NCLS has the flag cleared and skips the taps and ingress handling below. */
- #ifdef CONFIG_NET_CLS_ACT
- if (skb->tc_verd & TC_NCLS) {
- skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
- goto ncls;
- }
- #endif
-
- /* Taps registered on ptype_all (sniffers). Delivery is deferred by */
- /* one: each new match first delivers the previous match through */
- /* deliver_skb(), and the newest match is remembered in pt_prev. */
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
- ptype->dev == orig_dev) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
- }
- }
-
- #ifdef CONFIG_NET_CLS_ACT
- skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
- if (! skb)
- goto out;
- ncls:
- #endif
-
- /* Per-device rx_handler hook: flush the pending handler first; a NULL return means the skb was consumed. */
- rx_handler = rcu_dereference(skb->dev->rx_handler);
- if (rx_handler) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
- }
- skb = rx_handler(skb);
- if (! skb)
- goto out;
- }
-
- /* VLAN-tagged skb: flush the pending handler, then let vlan_hwaccel_do_receive() process it. */
- if (vlan_tx_tag_present(skb)) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
- }
-
- if (vlan_hwaccel_do_receive(&skb)) {
- ret = __netif_receive_skb(skb); /* re-run the receive path on the skb as updated through &skb */
- goto out;
- } else if (unlikely(! skb))
- goto out;
- }
-
- /* When skb->dev is a VLAN device whose real device is a bonding */
- /* device, handlers bound to that real device are matched too */
- /* (through orig_or_bond) in the protocol lookup below. */
-
-
-
- orig_or_bond = orig_dev;
- if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
- (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
- orig_or_bond = vlan_dev_real_dev(skb->dev);
- }
-
- type = skb->protocol;
- /* Look up handlers in the ptype_base bucket selected by the protocol number. */
- list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
- if (ptype->type == type && (ptype->dev == null_or_orig || ptype->dev == skb->dev
- || ptype->dev == orig_dev || ptype->dev == orig_or_bond)) {
- /* Same deferred delivery as for the taps: deliver the */
- /* previous match now and remember this one. */
-
-
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
- }
- }
- /* The last matching handler is called directly (not through deliver_skb()); with no match the skb is counted as dropped and freed. */
- if (pt_prev) {
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
- } else {
- atomic_long_inc(&skb->dev->rx_dropped);
- kfree_skb(skb);
- ret = NET_RX_DROP;
- }
-
- out:
- rcu_read_unlock();
- return ret;
- }
-
L3协议处理函数
- #define PTYPE_HASH_SIZE (16) /* number of buckets in ptype_base */
- #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) /* applied to ntohs(type) to select a ptype_base bucket */
- static DEFINE_SPINLOCK(ptype_lock); /* NOTE(review): presumably serializes updates to the lists below - no user is shown in this listing */
- /* Handler registries walked by __netif_receive_skb(). */
- static struct list_head ptype_base[PTYPE_HASH_SIZE]; /* protocol-specific handlers, bucketed by protocol number */
- static struct list_head ptype_all; /* handlers offered every packet, whatever its protocol (sniffers) */
packet_type用于描述一个协议:
- struct packet_type { /* describes one registered protocol handler */
- __be16 type; /* protocol number in network byte order; compared against skb->protocol */
- struct net_device *dev; /* device the handler is bound to; NULL matches when the receive path allows wildcards */
- /* Handler entry point. __netif_receive_skb() calls it as */
- /* func(skb, skb->dev, this packet_type, orig_dev). */
- int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
- ...
- struct list_head list; /* links this entry into ptype_all or a ptype_base bucket */
- };
IP协议:
- /* packet_type entry for IP: skbs whose protocol is ETH_P_IP are handed to ip_rcv(). */
- static struct packet_type ip_packet_type = {
- .type = cpu_to_be16(ETH_P_IP), /* stored in network byte order, like skb->protocol */
- .func = ip_rcv, /* L3 entry point */
- ...
- };
- #define ETH_P_IP 0x0800 /* protocol number used as ip_packet_type.type (host byte order here) */