Linux网络协议栈分析——从设备驱动到链路层

在网上看到高手总结出来的,省的自己花时间再研究,放到此处以便学习。 

原文:http://ericxiao.cublog.cn/   

 

  成都的天气好像越来越好了,前几天还穿着外套直打哆嗦,到今天已经“拨开阴云见太阳”,暖洋洋的,心情也暖洋洋的。暖和得正好想睡觉。打个呵欠,把网络设备管理这部分总结下吧。

     Linux素以优秀的网络管理能力而著称,linux为何具有这么高的效率?我们从网络设备的管理说起。

     Linux为何要对网络设备单独管理呢?这是因为协议栈很多地方都会涉及到网络设备:小至IP地址的设置,大至IP路由的更新,都离不开高效的网络设备管理。将网络设备单独管理可以提高效率!

     每个网络设备,在linux中都会对应一个数据结构,net_device。 就从这个结构说起

Linux 2.6.21中,对net_device定义如下:

struct net_device

{

     //设备的名称,例如常见的“eth0”等

     char          name[IFNAMSIZ];

     //共享内存的起始,结束地址

     unsigned long      mem_end; /* shared mem end  */

     unsigned long      mem_start;    /* shared mem start    */

     //网络设备的I/O基地址

     unsigned long      base_addr;    /* device I/O address  */

     //被赋予的中断号

     unsigned int       irq;     /* device IRQ number   */

     //在多端口设备上使用哪一个端口

     unsigned char      if_port; /* Selectable AUI, TP,..*/

     //为设备分配的DMA通道

     unsigned char      dma;     /* DMA channel         */

     //设备的状态

     unsigned long      state;

     // 下一个net_device

     struct net_device  *next;

     //初始化函数。

     int           (*init)(struct net_device *dev);

     struct net_device  *next_sched;

 

     /* Interface index. Unique device identifier   */

     //设备在内核中对应的序号

     int           ifindex;

     int           iflink;

 

     //获得接口状态的函数指针

     struct net_device_stats* (*get_stats)(struct net_device *dev);

     struct iw_statistics*  (*get_wireless_stats)(struct net_device *dev);

 

     struct iw_handler_def * wireless_handlers;

     struct ethtool_ops *ethtool_ops;

     //传输状态。检查传输是否被锁住

     unsigned long      trans_start;  /* Time (in jiffies) of last Tx  */

     //最使使用的时间

     unsigned long      last_rx; /* Time of last Rx */

     //接口标志

     unsigned short         flags;   /* interface flags (a la BSD)    */

     unsigned short         gflags;

        unsigned short          priv_flags; /* Like 'flags' but invisible to userspace. */

        unsigned short          unused_alignment_fixer; /* Because we need priv_flags,

                                                         * and we want to be 32-bit aligned.

                                                         */

 

     unsigned      mtu; /* interface MTU value      */

     unsigned short         type;    /* interface hardware type  */

     unsigned short         hard_header_len;   /* hardware hdr length */

     void          *priv;   /* pointer to private data  */

 

     struct net_device  *master; /* Pointer to master device of a group,

                         * which this device is member of.

                         */

 

     /* Interface address info. */

     unsigned char      broadcast[MAX_ADDR_LEN];    /* hw bcast add    */

     unsigned char      dev_addr[MAX_ADDR_LEN]; /* hw address */

     unsigned char      addr_len; /* hardware address length  */

 

     struct dev_mc_list *mc_list; /* Multicast mac addresses  */

     int           mc_count; /* Number of installed mcasts    */

     int           promiscuity;

     int           allmulti;

 

     int           watchdog_timeo;

     struct timer_list  watchdog_timer;

 

     /* Protocol specific pointers */

    

     void               *atalk_ptr;   /* AppleTalk link */

     void          *ip_ptr; /* IPv4 specific data  */ 

     void                    *dn_ptr;        /* DECnet specific data */

     void                    *ip6_ptr;       /* IPv6 specific data */

     void          *ec_ptr; /* Econet specific data */

     void          *ax25_ptr;    /* AX.25 specific data */

 

     struct list_head   poll_list;    /* Link to poll list   */

     int           quota;

     int           weight;

 

     struct Qdisc       *qdisc;

     struct Qdisc       *qdisc_sleeping;

     struct Qdisc       *qdisc_ingress;

     struct list_head   qdisc_list;

     unsigned long      tx_queue_len; /* Max frames per queue allowed */

 

     /* ingress path synchronizer */

     spinlock_t         ingress_lock;

     /* hard_start_xmit synchronizer */

     spinlock_t         xmit_lock;

     /* cpu id of processor entered to hard_start_xmit or -1,

        if nobody entered there.

      */

     int           xmit_lock_owner;

     /* device queue lock */

     spinlock_t         queue_lock;

     /* Number of references to this device */

     atomic_t      refcnt;

     /* delayed register/unregister */

     struct list_head   todo_list;

     /* device name hash chain */

     struct hlist_node  name_hlist;

     /* device index hash chain */

     struct hlist_node  index_hlist;

 

     /* register/unregister state machine */

     enum { NETREG_UNINITIALIZED=0,

            NETREG_REGISTERING,  /* called register_netdevice */

            NETREG_REGISTERED,   /* completed register todo */

            NETREG_UNREGISTERING,     /* called unregister_netdevice */

            NETREG_UNREGISTERED, /* completed unregister todo */

            NETREG_RELEASED,     /* called free_netdev */

     } reg_state;

 

     /* Net device features */

     int           features;

#define NETIF_F_SG     1    /* Scatter/gather IO. */

#define NETIF_F_IP_CSUM     2    /* Can checksum only TCP/UDP over IPv4. */

#define NETIF_F_NO_CSUM     4    /* Does not require checksum. F.e. loopack. */

#define NETIF_F_HW_CSUM     8    /* Can checksum all the packets. */

#define NETIF_F_HIGHDMA     32   /* Can DMA to high memory. */

#define NETIF_F_FRAGLIST    64   /* Scatter/gather IO. */

#define NETIF_F_HW_VLAN_TX  128  /* Transmit VLAN hw acceleration */

#define NETIF_F_HW_VLAN_RX  256  /* Receive VLAN hw acceleration */

#define NETIF_F_HW_VLAN_FILTER   512  /* Receive filtering on VLAN */

#define NETIF_F_VLAN_CHALLENGED  1024 /* Device cannot handle VLAN packets */

#define NETIF_F_TSO         2048 /* Can offload TCP/IP segmentation */

#define NETIF_F_LLTX        4096 /* LockLess TX */

 

     /* Called after device is detached from network. */

     void          (*uninit)(struct net_device *dev);

     /* Called after last user reference disappears. */

     void          (*destructor)(struct net_device *dev);

 

     /* Pointers to interface service routines.     */

     //打开函数指针

     int           (*open)(struct net_device *dev);

     //设备停用时调用此函数

     int           (*stop)(struct net_device *dev);

     //初始化数据包的传输

     int           (*hard_start_xmit) (struct sk_buff *skb,

                                struct net_device *dev);

#define HAVE_NETDEV_POLL

     //轮询函数

     int           (*poll) (struct net_device *dev, int *quota);

     //建立硬件头信息

     int           (*hard_header) (struct sk_buff *skb,

                            struct net_device *dev,

                            unsigned short type,

                            void *daddr,

                            void *saddr,

                            unsigned len);

     //ARP解析之后,重构头部

     int           (*rebuild_header)(struct sk_buff *skb);

#define HAVE_MULTICAST     

     //多播支持函数    

     void          (*set_multicast_list)(struct net_device *dev);

#define HAVE_SET_MAC_ADDR         

     int           (*set_mac_address)(struct net_device *dev,

                               void *addr);

#define HAVE_PRIVATE_IOCTL

     int           (*do_ioctl)(struct net_device *dev,

                           struct ifreq *ifr, int cmd);

#define HAVE_SET_CONFIG

     int           (*set_config)(struct net_device *dev,

                             struct ifmap *map);

#define HAVE_HEADER_CACHE

     int           (*hard_header_cache)(struct neighbour *neigh,

                                 struct hh_cache *hh);

     void          (*header_cache_update)(struct hh_cache *hh,

                                   struct net_device *dev,

                                   unsigned char *  haddr);

#define HAVE_CHANGE_MTU

     int           (*change_mtu)(struct net_device *dev, int new_mtu);

 

#define HAVE_TX_TIMEOUT

     void          (*tx_timeout) (struct net_device *dev);

 

     void          (*vlan_rx_register)(struct net_device *dev,

                                struct vlan_group *grp);

     void          (*vlan_rx_add_vid)(struct net_device *dev,

                               unsigned short vid);

     void          (*vlan_rx_kill_vid)(struct net_device *dev,

                                unsigned short vid);

 

     int           (*hard_header_parse)(struct sk_buff *skb,

                                 unsigned char *haddr);

     int           (*neigh_setup)(struct net_device *dev, struct neigh_parms *);

     int           (*accept_fastpath)(struct net_device *, struct dst_entry*);

#ifdef CONFIG_NETPOLL

     int           netpoll_rx;

#endif

#ifdef CONFIG_NET_POLL_CONTROLLER

     void                    (*poll_controller)(struct net_device *dev);

#endif

 

     /* bridge stuff */

     //对应的网桥端口(以后分析)

     struct net_bridge_port *br_port;

 

#ifdef CONFIG_NET_DIVERT

     /* this will get initialized at each interface type init routine */

     struct divert_blk  *divert;

#endif /* CONFIG_NET_DIVERT */

 

     /* class/net/name entry */

     struct class_device    class_dev;

     /* how much padding had been added by alloc_netdev() */

     int padded;

}

晕,太多的成员。太庞大了。不要紧,等到要使用到相应成员的时候再来解释好了。

 

 

注意到这么庞大的结构中,有个成员叫: struct net_device  *next,呵呵,很熟悉吧,就是用它来建立网络设备的链表。

每一个网络设备启动的时候,都会调用register_netdev() (drivers/net/net_init.c)

跟踪这个函数:

/*
 * register_netdev - register a network device, taking the RTNL lock.
 * @dev: device to register
 *
 * Resolves the interface name first: a name containing '%' is treated as a
 * template and expanded through dev_alloc_name(); an empty name or one that
 * starts with a space falls back to the "eth%d" template.  The actual work
 * is then delegated to register_netdevice().
 *
 * Returns the negative result of dev_alloc_name() if name allocation fails,
 * otherwise whatever register_netdevice() returns.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();

	/* The caller passed a format string: allocate a concrete name. */
	if (strchr(dev->name, '%')) {
		rc = dev_alloc_name(dev, dev->name);
		if (rc < 0)
			goto unlock;
	}

	/* Back compatibility hook: unnamed devices become ethN. */
	if (dev->name[0] == 0 || dev->name[0] == ' ') {
		rc = dev_alloc_name(dev, "eth%d");
		if (rc < 0)
			goto unlock;
	}

	rc = register_netdevice(dev);

unlock:
	rtnl_unlock();
	return rc;
}

 

跟踪至: register_netdevice(struct net_device *dev)  (net/core/dev.c)

int register_netdevice(struct net_device *dev)

{

     struct hlist_head *head;

     struct hlist_node *p;

     int ret;

 

     BUG_ON(dev_boot_phase);

     ASSERT_RTNL();

 

     /* When net_device's are persistent, this will be fatal. */

     BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);

 

     spin_lock_init(&dev->queue_lock);

     spin_lock_init(&dev->xmit_lock);

     dev->xmit_lock_owner = -1;

#ifdef CONFIG_NET_CLS_ACT

     spin_lock_init(&dev->ingress_lock);

#endif

 

     ret = alloc_divert_blk(dev);

     if (ret)

         goto out;

 

     dev->iflink = -1;

 

     /* Init, if this function is available */

     //如果dev -> init 被赋值,那么调用此函数

     if (dev->init) {

         ret = dev->init(dev);

         if (ret) {

              if (ret > 0)

                   ret = -EIO;

              goto out_err;

         }

     }

 

     //判断name 是否合法

     if (!dev_valid_name(dev->name)) {

         ret = -EINVAL;

         goto out_err;

     }

     //为此设备分配一个index

     dev->ifindex = dev_new_index();

     if (dev->iflink == -1)

         dev->iflink = dev->ifindex;

 

     /* Check for existence of name */

 

     //所有网络设备,以名字作为哈希主键存在dev_name_head中,该变量是一个哈希数组

     //找到该名字对应的链表

     //如果内核中已经含有此名字的网络设备,出错退出

     head = dev_name_hash(dev->name);

     hlist_for_each(p, head) {

         struct net_device *d

              = hlist_entry(p, struct net_device, name_hlist);

         if (!strncmp(d->name, dev->name, IFNAMSIZ)) {

              ret = -EEXIST;

              goto out_err;

         }

     }

 

     /* Fix illegal SG+CSUM combinations. */

     if ((dev->features & NETIF_F_SG) &&

         !(dev->features & (NETIF_F_IP_CSUM |

                     NETIF_F_NO_CSUM |

                     NETIF_F_HW_CSUM))) {

         printk("%s: Dropping NETIF_F_SG since no checksum feature./n",

                dev->name);

         dev->features &= ~NETIF_F_SG;

     }

 

     /*

      *   nil rebuild_header routine,

      *   that should be never called and used as just bug trap.

      */

 

     //为rebuild_header赋默认值

     if (!dev->rebuild_header)

         dev->rebuild_header = default_rebuild_header;

 

     /*

      *   Default initial state at registry is that the

      *   device is present.

      */

 

     set_bit(__LINK_STATE_PRESENT, &dev->state);

 

     dev->next = NULL;

     dev_init_scheduler(dev);

     write_lock_bh(&dev_base_lock);

     //初始化的时候,有struct net_device **dev_tail = &dev_base;

     //这段代码的意思实际就是:把dev加入dev_base为首结点队链表的尾部

     *dev_tail = dev;

     dev_tail = &dev->next;

     //把此结点加入到以名字为哈希主键的链表数组dev_name_head中

     hlist_add_head(&dev->name_hlist, head);

     //把此结点加到以序号为主键的链表数组dev_index_head中

     hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));

     dev_hold(dev);

     dev->reg_state = NETREG_REGISTERING;

     write_unlock_bh(&dev_base_lock);

 

     /* Notify protocols, that a new device appeared. */

     //在通知链表上发送事件

     notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);

 

     /* Finish registration after unlock */

     net_set_todo(dev);

     ret = 0;

 

out:

     return ret;

out_err:

     free_divert_blk(dev);

     goto out;

}

 

从此可以看出,新加入一个设备时,会插入三个位置:以名字为哈希键组织的dev_name_head,以序号为哈希键的哈希数组dev_index_head,还有链表dev_base。它们为快速查找网络设备提供了基础。事实上,在内核中经常要根据index找到dev,或者根据name找到dev,我们遇到的时候再分析。

 

到现在,我们可以在内核中顺藤摸瓜的找到每一个网络设备了。

还有很重要的。设备更改了配置,要怎么通知跟他相关的子系统呢?例如,网卡更新了IP,如何使路由得到更新?

接着往下看:

注意到上面注册代码中所调用的一个函数notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev).

该函数的作用是,在通知链表netdev_chain上发送NETDEV_REGISTER消息,所有与该通知链表关联的子系统都可以收到此消息。这样就可以快速地更新整个系统的配置信息。

以路由子系统为例,来讲述该过程:

在IPV4子系统加载的时候,会调用ip_init(),接着调用fib_init(),然后再调用ip_fib_init()

跟踪一下此函数:

/*
 * ip_fib_init - set up the IPv4 FIB and subscribe it to device events.
 *
 * With CONFIG_IP_MULTIPLE_TABLES the rule subsystem is initialised via
 * fib_rules_init(); otherwise the local and main tables are created with
 * fib_hash_init().  In both configurations the FIB then registers its
 * notifier blocks for net-device events and for inet address events.
 */
void __init ip_fib_init(void)
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
	fib_rules_init();
#else
	ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
	ip_fib_main_table  = fib_hash_init(RT_TABLE_MAIN);
#endif

	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);
}

register_netdevice_notifier是做什么的呢?往下跟踪:

int register_netdevice_notifier(struct notifier_block *nb)

{

     struct net_device *dev;

     int err;

 

     rtnl_lock();

     //注册通知链

     err = notifier_chain_register(&netdev_chain, nb);

     if (!err) {

         for (dev = dev_base; dev; dev = dev->next) {

              nb->notifier_call(nb, NETDEV_REGISTER, dev);

 

              if (dev->flags & IFF_UP)

                   nb->notifier_call(nb, NETDEV_UP, dev);

         }

     }

     rtnl_unlock();

     return err;

}

呵呵,它在netdev_chain上注册了通知块,当此链上有事件发生时,会调用fib_netdev_notifier中登记的处理函数。看一下fib_netdev_notifier的定义:

 

/*
 * Notifier block that ip_fib_init() registers on the net-device chain;
 * events delivered through it are handled by fib_netdev_event().
 */
struct notifier_block fib_netdev_notifier = {

     .notifier_call =fib_netdev_event,

};

 

OK,现在越来越具体了,如果netdev_chain有事件,会调用fib_netdev_event处理。继续跟踪:

static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)

{

     struct net_device *dev = ptr;

     struct in_device *in_dev = __in_dev_get(dev);

     //设备注销

     if (event == NETDEV_UNREGISTER) {

         fib_disable_ip(dev, 2);

         return NOTIFY_DONE;

     }

 

     if (!in_dev)

         return NOTIFY_DONE;

 

     switch (event) {

     //设备UP

     case NETDEV_UP:

         for_ifa(in_dev) {

              fib_add_ifaddr(ifa);

         } endfor_ifa(in_dev);

#ifdef CONFIG_IP_ROUTE_MULTIPATH

         fib_sync_up(dev);

#endif

         rt_cache_flush(-1);

         break;

     //设备DOWN

     case NETDEV_DOWN:

         fib_disable_ip(dev, 0);

         break;

     //设备参数改变

     case NETDEV_CHANGEMTU:

     case NETDEV_CHANGE:

         rt_cache_flush(0);

         break;

     }

     return NOTIFY_DONE;

}

 

路由部份的代码将在后续的笔记中给出。至此,整个网络设备的架构非常的清晰了!

 

 

它主要完成:对网卡对应的net_device赋初值,并调用register_netdev向内核完成网络设备的注册。网络设备注册我们在上一节中已经说过,这里不再赘述。

看一下net_device中几个关键的函数:

// Called when the device is about to be opened

netdev->open = e100_open;

// Called when the device is stopped

netdev->stop = e100_close;

// Called when the device has a packet to transmit

netdev->hard_start_xmit = e100_xmit_frame;

到此时,网卡的初始化工作已经完成了。之后就可以操作网卡了。

那网卡应该怎么使用呢?必须首先唤起网卡,即使之UP,例如 ifconfig eth0 up

此时,内核会根据接口名字“eth0”找到对应的net_device.然后调用 net_device-> open.即:e100_open。

分析如下:

static int e100_open(struct net_device *netdev)

{

         struct nic *nic = netdev_priv(netdev);

         int err = 0;

 

         //网卡正在UP,关闭载波信号

         netif_carrier_off(netdev);

         if((err = e100_up(nic)))

                   DPRINTK(IFUP, ERR, "Cannot open interface, aborting./n");

         return err;

}

我们关心的是e100_up。跟踪如下:

static int e100_up(struct nic *nic)

{

         int err;

 

         //分配收包队列

         if((err = e100_rx_alloc_list(nic)))

                   return err;

         //分配控制队列

         if((err = e100_alloc_cbs(nic)))

                   goto err_rx_clean_list;

         //硬件初始化

         if((err = e100_hw_init(nic)))

                   goto err_clean_cbs;

         //多播

         e100_set_multicast_list(nic->netdev);

         //开始接收数据

         e100_start_receiver(nic);

         mod_timer(&nic->watchdog, jiffies);

         //注册中断例程

         if((err = request_irq(nic->pdev->irq, e100_intr, SA_SHIRQ,

                   nic->netdev->name, nic->netdev)))

                   goto err_no_irq;

         //启用中断

         e100_enable_irq(nic);

         netif_wake_queue(nic->netdev);

         return 0;

 

err_no_irq:

         del_timer_sync(&nic->watchdog);

err_clean_cbs:

         e100_clean_cbs(nic);

err_rx_clean_list:

         e100_rx_clean_list(nic);

         return err;

}

在此函数中,我们可以看到,它主要完成了:建立接收环形DMA缓冲区,注册中断处理函数。

环形DMA缓冲区的建立是由e100_rx_alloc_list(nic)完成的:

static int e100_rx_alloc_list(struct nic *nic)

{

         struct rx *rx;

         // nic->params.rfds.count,接收缓存的总个数

         unsigned int i, count = nic->params.rfds.count;

         //rx_to_use:正在存在数据的位置

         //rx_to_clean:数据的初始为止。所以。数据的有限位置是从rx_to_use到rx_to_use

         nic->rx_to_use = nic->rx_to_clean = NULL;

         if(!(nic->rxs = kmalloc(sizeof(struct rx) * count, GFP_ATOMIC)))

                   return -ENOMEM;

         memset(nic->rxs, 0, sizeof(struct rx) * count);

         //遍历并建立循环链表

         for(rx = nic->rxs, i = 0; i < count; rx++, i++) {

                   rx->next = (i + 1 < count) ? rx + 1 : nic->rxs;

                   rx->prev = (i == 0) ? nic->rxs + count - 1 : rx - 1;

                   if(e100_rx_alloc_skb(nic, rx)) {

                            e100_rx_clean_list(nic);

                            return -ENOMEM;

                   }

         }

         //初始化起如位置为nic->rxs

         nic->rx_to_use = nic->rx_to_clean = nic->rxs;

 

         return 0;

}

为设备建立DMA映射的主函数为e100_rx_alloc_skb().分析如下:

static inline int e100_rx_alloc_skb(struct nic *nic, struct rx *rx)

{

         unsigned int rx_offset = 2; /* u32 align protocol headers */

         if(!(rx->skb = dev_alloc_skb(RFD_BUF_LEN + rx_offset)))

                   return -ENOMEM;

         /* Align, init, and map the RFD. */

         rx->skb->dev = nic->netdev;

         //在数据存储区之前空出offset空间

skb_reserve(rx->skb, rx_offset);

//skb->data前部置RFD

         memcpy(rx->skb->data, &nic->blank_rfd, sizeof(struct rfd));

         //DMA内存映射,映射至skb->data

         rx->dma_addr = pci_map_single(nic->pdev, rx->skb->data,

                   RFD_BUF_LEN, PCI_DMA_BIDIRECTIONAL);

 

         /* Link the RFD to end of RFA by linking previous RFD to

l        this one, and clearing EL bit of previous.  */

//初始化前一个skb中的控制信息

         if(rx->prev->skb) {

                   struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data;

                   put_unaligned(cpu_to_le32(rx->dma_addr),

                            (u32 *)&prev_rfd->link);

                   wmb();

                   prev_rfd->command &= ~cpu_to_le16(cb_el);

                   pci_dma_sync_single_for_device(nic->pdev, rx->prev->dma_addr,

                            sizeof(struct rfd), PCI_DMA_TODEVICE);

         }

 

         return 0;

}

在这个函数里,主要完成了:DMA环形链表的建立。在这里涉及到了一个重要的数据结构sk_buff.稍后再给出它的结构分析。在这里我们只要知道在skb->data里储存的是接收数据就OK了。值得一提的是,Intel 100M 网卡对接收数据的处理,跟平时遇到的网卡不一样,接收数据时会由接收控制RU写入接收信息,由此判断接收是否完全等信息。也就是我们在代码里面看到的rfd.所以,在skb->data对应的就是rfd+网络传过来的数据.

到这里,接收准备工作已经完成了。

 

 

 

四:数据接收

为了了解网卡数据接收的过程。有必要先讨论DMA的具体过程。

DMA传输数据可以分为以下几个步骤:

首先:CPU向DMA送命令,如DMA方式,主存地址,传送的字数等,之后CPU执行原来的程序.

然后DMA控制器在I/O设备与主存间交换数据:设备准备好数据后,DMA控制器向CPU发出总线请求,取得总线控制权,进行数据传送,同时修改主存地址和字数计数器,并检查计数器的值是否为零;不为零则继续传送,若已为零,则向CPU发中断请求。

也就是说,网卡收到包时,将它放入当前skb->data中。再来一个包时。DMA会修改卡上主存地址,转到skb->next,将数据放入其中。这也就是,一个skb->data存储一个数据包的原因。

好了,现在就可以来看具体的代码实现了。

当网络数据到达时,网卡将其放到DMA内存,然后向CPU报告中断,CPU根据中断向量找到中断处理例程,也就是我们前面注册的e100_intr()进行处理。

static irqreturn_t e100_intr(int irq, void *dev_id, struct pt_regs *regs)

{

         struct net_device *netdev = dev_id;

         struct nic *nic = netdev_priv(netdev);

         u8 stat_ack = readb(&nic->csr->scb.stat_ack);

 

         DPRINTK(INTR, DEBUG, "stat_ack = 0x%02X/n", stat_ack);

 

         if(stat_ack == stat_ack_not_ours ||          /* Not our interrupt */

            stat_ack == stat_ack_not_present)   /* Hardware is ejected */

                   return IRQ_NONE;

 

         /* Ack interrupt(s) */

         //发送中断ACK。Cpu向设备发送ACK。表示此中断已经处理

         writeb(stat_ack, &nic->csr->scb.stat_ack);

 

         /* We hit Receive No Resource (RNR); restart RU after cleaning */

         if(stat_ack & stat_ack_rnr)

                   nic->ru_running = 0;

         //禁用中断

         e100_disable_irq(nic);

         //CPU开始调度此设备。转而会运行netdev->poll

         netif_rx_schedule(netdev);

 

         return IRQ_HANDLED;

}

netif_rx_schedule(netdev)之后,CPU开始调度此设备,轮询设备是否有数据要处理,随后调用netdev->poll函数,即:e100_poll()

/*
 * e100_poll - poll routine of the e100 driver.
 * @netdev: device being polled
 * @budget: in/out count of packets that may still be processed
 *
 * Processes received packets up to min(netdev->quota, *budget) and cleans
 * completed transmits.  If neither produced any work, or the interface is
 * no longer running, polling is completed, device interrupts are re-enabled
 * and 0 is returned.  Otherwise the work done is subtracted from *budget
 * and netdev->quota and 1 is returned so the device stays scheduled.
 */
static int e100_poll(struct net_device *netdev, int *budget)
{
	struct nic *nic = netdev_priv(netdev);
	unsigned int limit = min(netdev->quota, *budget);
	unsigned int handled = 0;
	int tx_work;

	e100_rx_clean(nic, &handled, limit);
	tx_work = e100_tx_clean(nic);

	/* Still busy and still running: account for the work and stay polled. */
	if ((tx_work || handled != 0) && netif_running(netdev)) {
		*budget -= handled;
		netdev->quota -= handled;
		return 1;
	}

	/* If no Rx and Tx cleanup work was done, exit polling mode. */
	netif_rx_complete(netdev);
	e100_enable_irq(nic);
	return 0;
}

跟踪进e100_rx_clean():

/*
 * e100_rx_clean - hand received packets upward and refill the ring.
 * @nic:        driver-private adapter state
 * @work_done:  running count of packets indicated, updated by
 *              e100_rx_indicate()
 * @work_to_do: limit passed through to e100_rx_indicate()
 *
 * First walks the ring from rx_to_clean while entries hold an skb, stopping
 * as soon as e100_rx_indicate() returns non-zero.  Then walks from
 * rx_to_use while entries have no skb, allocating a replacement for each
 * and stopping on the first allocation failure.  Finally (re)starts the
 * receiver.
 */
static inline void e100_rx_clean(struct nic *nic, unsigned int *work_done,
	unsigned int work_to_do)
{
	struct rx *cur;

	/* Indicate newly arrived packets */
	cur = nic->rx_to_clean;
	while (cur->skb) {
		if (e100_rx_indicate(nic, cur, work_done, work_to_do))
			break; /* No more to clean */
		cur = cur->next;
		nic->rx_to_clean = cur;
	}

	/* Alloc new skbs to refill list */
	cur = nic->rx_to_use;
	while (!cur->skb) {
		if (unlikely(e100_rx_alloc_skb(nic, cur)))
			break; /* Better luck next time (see watchdog) */
		cur = cur->next;
		nic->rx_to_use = cur;
	}

	e100_start_receiver(nic);
}

在这里,它会遍历环形DMA中的数据,即从nic->rx_to_clean开始的数据,直至数据全部处理完

进入处理函数:e100_rx_indicate()

static inline int e100_rx_indicate(struct nic *nic, struct rx *rx,

         unsigned int *work_done, unsigned int work_to_do)

{

         struct sk_buff *skb = rx->skb;

         //从这里取得rfd.其中包括了一些接收信息,但不是链路传过来的有效数据

         struct rfd *rfd = (struct rfd *)skb->data;

         u16 rfd_status, actual_size;

 

         if(unlikely(work_done && *work_done >= work_to_do))

                   return -EAGAIN;

         //同步DMA缓存

         pci_dma_sync_single_for_cpu(nic->pdev, rx->dma_addr,

                   sizeof(struct rfd), PCI_DMA_FROMDEVICE);

         //取得接收状态

         rfd_status = le16_to_cpu(rfd->status);

         DPRINTK(RX_STATUS, DEBUG, "status=0x%04X/n", rfd_status);

         /* If data isn't ready, nothing to indicate */

         //没有接收完全,返回

         if(unlikely(!(rfd_status & cb_complete)))

                     return -EAGAIN;

         //取得接收数据的长度

         actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;

         if(unlikely(actual_size > RFD_BUF_LEN - sizeof(struct rfd)))

                   actual_size = RFD_BUF_LEN - sizeof(struct rfd);

 

         //取消DMA缓存映射

         pci_unmap_single(nic->pdev, rx->dma_addr,

                   RFD_BUF_LEN, PCI_DMA_FROMDEVICE);

         //由于RFD不是链路传入的数据,清除

         skb_reserve(skb, sizeof(struct rfd));

         //调整skb中的tail指针,与len更新

         skb_put(skb, actual_size);

         //取得链路层协议

         skb->protocol = eth_type_trans(skb, nic->netdev);

         //接收失败

         if(unlikely(!(rfd_status & cb_ok))) {

                   /* Don't indicate if hardware indicates errors */

                   nic->net_stats.rx_dropped++;

                   dev_kfree_skb_any(skb);

         }

         //数据超长。Drop it  

else if(actual_size > nic->netdev->mtu + VLAN_ETH_HLEN) {

                   /* Don't indicate oversized frames */

                   nic->rx_over_length_errors++;

                   nic->net_stats.rx_dropped++;

                   dev_kfree_skb_any(skb);

         } else {

                   //成功的接收了,更新统计计数

                   nic->net_stats.rx_packets++;

                   nic->net_stats.rx_bytes += actual_size;

                   nic->netdev->last_rx = jiffies;

                   //送至上次协议处理

                   netif_receive_skb(skb);

                   if(work_done)

                            (*work_done)++;

         }

 

         rx->skb = NULL;

 

         return 0;

}

上面代码中要去判断接收是否完全,为什么要去判断呢?根据DMA机制,不是网卡把数据放入DMA缓冲区之后,才向CPU发中断的吗?呵呵。在这里进行接收完全判断是因为:

1:由其它原因造成的中断

2:在处理中断时候。数据又到达了。网卡依然会把它放至下一个skb。而在代码处理中是遍历处理的,也就是说处理下一个skb的时候,可能网卡正在传数据。

好了,运行到netif_receive_skb()之后,数据包被送到上层。关于后续的处理流程,以后会有专题讨论

 

五:数据的发送

在进入到发送函数之前,我们先来看e100_up()->e100_alloc_cbs函数:

static int e100_alloc_cbs(struct nic *nic)

{

         struct cb *cb;

         unsigned int i, count = nic->params.cbs.count;

 

         nic->cuc_cmd = cuc_start;

         nic->cb_to_use = nic->cb_to_send = nic->cb_to_clean = NULL;

         nic->cbs_avail = 0;

        

         //线性DMA映射,这里返回的是虚拟地址,供CPU使用的

         nic->cbs = pci_alloc_consistent(nic->pdev,

                   sizeof(struct cb) * count, &nic->cbs_dma_addr);

         if(!nic->cbs)

                   return -ENOMEM;

         //建立环形的发送缓冲区

         for(cb = nic->cbs, i = 0; i < count; cb++, i++) {

                   cb->next = (i + 1 < count) ? cb + 1 : nic->cbs;

                   cb->prev = (i == 0) ? nic->cbs + count - 1 : cb - 1;

 

                   cb->dma_addr = nic->cbs_dma_addr + i * sizeof(struct cb);

                   cb->link = cpu_to_le32(nic->cbs_dma_addr +

                            ((i+1) % count) * sizeof(struct cb));

                   cb->skb = NULL;

         }

         //初始化各指针,使其指向缓冲初始位置

         nic->cb_to_use = nic->cb_to_send = nic->cb_to_clean = nic->cbs;

         nic->cbs_avail = count;

 

         return 0;

}

在这一段代码里,完成了发送的准备工作,建立了发送环形缓存。在发送数据时,只要把数据送入缓存即可。

数据最终会调用dev-> hard_start_xmit函数。在e100代码里,也就是e100_xmit_frame(). 进入里面看下:

static int e100_xmit_frame(struct sk_buff *skb, struct net_device *netdev)

{

         struct nic *nic = netdev_priv(netdev);

         int err;

 

         if(nic->flags & ich_10h_workaround) {

                   e100_exec_cmd(nic, cuc_nop, 0);

                   udelay(1);

         }

         err = e100_exec_cb(nic, skb, e100_xmit_prepare);

         switch(err) {

         case -ENOSPC:

                   /* We queued the skb, but now we're out of space. */

                   netif_stop_queue(netdev);

                   break;

         case -ENOMEM:

                   /* This is a hard error - log it. */

                   DPRINTK(TX_ERR, DEBUG, "Out of Tx resources, returning skb/n");

                   netif_stop_queue(netdev);

                   return 1;

         }

 

         netdev->trans_start = jiffies;

         return 0;

}

继续跟踪进 e100_exec_cb(nic, skb, e100_xmit_prepare);

static inline int e100_exec_cb(struct nic *nic, struct sk_buff *skb,

         void (*cb_prepare)(struct nic *, struct cb *, struct sk_buff *))

{

         struct cb *cb;

         unsigned long flags;

         int err = 0;

 

         spin_lock_irqsave(&nic->cb_lock, flags);

 

         if(unlikely(!nic->cbs_avail)) {

                   err = -ENOMEM;

                   goto err_unlock;

         }

        

         //将skb 推入环形发送缓冲

         //cb_to_use:发送缓冲当前的使用位置

         cb = nic->cb_to_use;

         nic->cb_to_use = cb->next;

         nic->cbs_avail--;

         cb->skb = skb;

 

         if(unlikely(!nic->cbs_avail))

                   err = -ENOSPC;

 

         cb_prepare(nic, cb, skb);

 

         /* Order is important otherwise we'll be in a race with h/w:

          * set S-bit in current first, then clear S-bit in previous. */

         cb->command |= cpu_to_le16(cb_s);

         wmb();

         cb->prev->command &= cpu_to_le16(~cb_s);

         //当发送数据不为空。将余下数剧全部发送

         while(nic->cb_to_send != nic->cb_to_use) {

                   if(unlikely(e100_exec_cmd(nic, nic->cuc_cmd,

                            nic->cb_to_send->dma_addr))) {

                            /* Ok, here's where things get sticky.  It's

                             * possible that we can't schedule the command

                             * because the controller is too busy, so

                             * let's just queue the command and try again

                             * when another command is scheduled. */

                            break;

                   } else {

                            nic->cuc_cmd = cuc_resume;

                            nic->cb_to_send = nic->cb_to_send->next;

                   }

         }

 

err_unlock:

         spin_unlock_irqrestore(&nic->cb_lock, flags);

 

         return err;

}

在这里我们看到,发送数据过程主要由e100_exec_cmd完成。跟踪进去

/*
 * e100_exec_cmd - issue one command through the SCB registers.
 * @nic:      driver-private adapter state
 * @cmd:      command byte to write to scb.cmd_lo
 * @dma_addr: address written to scb.gen_ptr unless @cmd is cuc_resume
 *
 * Runs under nic->cmd_lock with interrupts saved/disabled.  Waits up to
 * E100_WAIT_SCB_TIMEOUT iterations for scb.cmd_lo to read as zero, adding a
 * 5us delay per iteration once more than half the iterations are used up.
 *
 * Returns 0 once the command has been written, or -EAGAIN if the register
 * never cleared (nothing is written in that case).
 */
static inline int e100_exec_cmd(struct nic *nic, u8 cmd, dma_addr_t dma_addr)
{
	unsigned long irq_flags;
	unsigned int spins;
	int rc = 0;

	spin_lock_irqsave(&nic->cmd_lock, irq_flags);

	/* Previous command is accepted when SCB clears */
	for (spins = 0; spins < E100_WAIT_SCB_TIMEOUT; spins++) {
		if (likely(!readb(&nic->csr->scb.cmd_lo)))
			break;
		cpu_relax();
		if (unlikely(spins > (E100_WAIT_SCB_TIMEOUT >> 1)))
			udelay(5);
	}

	if (unlikely(spins == E100_WAIT_SCB_TIMEOUT)) {
		rc = -EAGAIN;
	} else {
		/* Write the address register first, then the command byte. */
		if (unlikely(cmd != cuc_resume))
			writel(dma_addr, &nic->csr->scb.gen_ptr);
		writeb(cmd, &nic->csr->scb.cmd_lo);
	}

	spin_unlock_irqrestore(&nic->cmd_lock, irq_flags);

	return rc;
}

从此可以看到。Intel 100M网卡对发送数据的处理,只需将地址,命令写入相应的寄存器即可。详细资料可以查看intel 100M网卡的说明。

令人不解的是,在发送数据时,不需要将发送长度写入相关寄存器吗?那它又是如何确定发送长度的呢?

 

sk_buff结构分析

sk_buff是我们遇到的第二个重要的结构,在内核中经常被缩写成skb.在linux 2.6.21它被定义成:

struct sk_buff {

         //指向下一个skb

struct sk_buff               *next;

//上一个skb

         struct sk_buff               *prev;

 

         struct sk_buf0f_head   *list;

         //对应的sock。这也是个重要的结构,在传输层的时候我们再来分析

         struct sock          *sk;

         //接收或者发送时间戳

         struct timeval               stamp;

         //接收或者发送时对应的net_device

         struct net_device *dev;

         //接收的net_device

         struct net_device *input_dev;

         //数据包对应的真实net_device.关于虚拟设备可以在之后的网桥模式分析中讨论

         struct net_device *real_dev;

         //ip层的相关信息

         union {

                   struct tcphdr       *th;

                   struct udphdr      *uh;

                   struct icmphdr    *icmph;

                   struct igmphdr    *igmph;

                   struct iphdr         *ipiph;

                   struct ipv6hdr     *ipv6h;

                   unsigned char      *raw;

         } h;

         //协议层的相关信息

         union {

                   struct iphdr         *iph;

                   struct ipv6hdr     *ipv6h;

                   struct arphdr       *arph;

                   unsigned char      *raw;

         } nh;

         //链路层的相关信息

         union {

                unsigned char     *raw;

         } mac;

         //在路由子系统中再来分析这一结构

         struct  dst_entry        *dst;

         struct         sec_path    *sp;

 

         /*

          * This is the control buffer. It is free to use for every

          * layer. Please put your private variables there. If you

          * want to keep them across layers you have to do a skb_clone()

          * first. This is owned by whoever has the skb queued ATM.

          */

         char                     cb[40];

         //各层的数据长度

         unsigned int                  len,

                                     data_len,

                                     mac_len,

                                     csum;

         unsigned char                local_df,

                                     cloned,

                                     pkt_type,

                                     ip_summed;

         __u32                           priority;

         unsigned short              protocol,

                                     security;

 

         void                     (*destructor)(struct sk_buff *skb);

#ifdef CONFIG_NETFILTER

        unsigned long                  nfmark;

         __u32                           nfcache;

         __u32                           nfctinfo;

         struct nf_conntrack      *nfct;

#ifdef CONFIG_NETFILTER_DEBUG

        unsigned int           nf_debug;

#endif

#ifdef CONFIG_BRIDGE_NETFILTER

         struct nf_bridge_info    *nf_bridge;

#endif

#endif /* CONFIG_NETFILTER */

#if defined(CONFIG_HIPPI)

         union {

                   __u32                  ifield;

         } private;

#endif

#ifdef CONFIG_NET_SCHED

       __u32                       tc_index;        /* traffic control index */

#ifdef CONFIG_NET_CLS_ACT

         __u32           tc_verd;               /* traffic control verdict */

         __u32           tc_classid;            /* traffic control classid */

#endif

 

#endif

 

 

         /* These elements must be at the end, see alloc_skb() for details.  */

         unsigned int                  truesize;

         //引用计数

         atomic_t              users;

         //存储空间的起始地址

         unsigned char      *head,

         //网络数据的起始起址

                                     *data,

         //存放网络数据的结束地址

                                     *tail,

         //存储空间的结束地址

                                     *end;

}

对应我们上面的网卡驱动分析。接收到的数据是存放在data至tail之间的区域。

Skb通常还有常用的几个函数,一一列举分析如下:

struct sk_buff *alloc_skb(unsigned int size,int gfp_mask)

分配存储空间为size的skb,内存分配级别为gfp_mask.注意这里的存储空间的含义,即为skb->data至skb->tail的区域

struct sk_buff *skb_clone(struct sk_buff *skb, int priority)

克隆出的skb指向同一个结构,同时会增加skb的引用计数

struct sk_buff *skb_copy(const struct sk_buff *skb, int priority)

复制一个全新的skb

void kfree_skb(struct sk_buff *skb)

当skb的引用计数为1的时候,释放此skb

unsigned char *skb_put(struct sk_buff *skb, unsigned int len)

使skb的存储空间扩大len.即使tail指针下移

unsigned char *skb_push(struct sk_buff *skb, unsigned int len)

push,即在数据区之前推入一段数据,使data指针上移len个字节。

void skb_reserve(struct sk_buff *skb, unsigned int len)

该操作使data指针跟tail指针同时下移,即扩大存储区域之前的空间

int skb_headroom(const struct sk_buff *skb)

返回data之前可用的空间数量

int skb_tailroom(const struct sk_buff *skb)

返回缓存区中tail之后可用的空间大小

 

二:从网卡驱动说起。

         以intel 100M 网卡驱动为例简要概述数据包的接收与发送流程。代码见(drivers/net/e100.c)

网卡是属于PCI设备,它的注册跟一般的PCI设备注册没什么两样。

/*
 * e100_init_module - module entry point.
 *
 * Prints the driver description/version and copyright lines when the
 * "debug" level enables NETIF_MSG_DRV, then registers e100_driver as a PCI
 * driver.
 *
 * Returns the result of pci_module_init().
 */
static int __init e100_init_module(void)
{
        /* (1 << debug) - 1 turns the debug level into a message-class bitmask. */
        if(((1 << debug) - 1) & NETIF_MSG_DRV) {
                /* Formats must end in "\n" (not "/n") so each banner line is terminated. */
                printk(KERN_INFO PFX "%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
                printk(KERN_INFO PFX "%s\n", DRV_COPYRIGHT);
        }

        /* Register with the PCI core. */
        return pci_module_init(&e100_driver);
}

其中e100_driver对应为网卡的pci_driver.

static struct pci_driver e100_driver = {

         //驱动对应的名字

         .name =         DRV_NAME,

         //匹配类型

         .id_table =     e100_id_table,

         //侦测函数

         .probe =        e100_probe,

         //移除函数,设备移除时将调用此函数

         .remove =       __devexit_p(e100_remove),

#ifdef CONFIG_PM

         .suspend =      e100_suspend,

         .resume =       e100_resume,

#endif

}

当总线探测到PCI设备符合e100_id_table中的参数时,将会调用e100_probe,开始设备的初始化

在e100_probe中:

/*
 * e100_probe - PCI probe callback: set up and register one adapter.
 * @pdev: the PCI device being probed
 * @ent:  the matching id-table entry; a non-zero driver_data sets the "ich"
 *        flag on the adapter
 *
 * Allocates the net_device (with a struct nic as private area), fills in its
 * callbacks, enables the PCI device, claims and maps its registers, loads
 * the MAC address from the EEPROM and finally registers the net_device.
 *
 * Returns 0 on success or a negative errno. On failure, the err_out_* labels
 * release what was acquired so far, in reverse order of acquisition.
 */
static int __devinit e100_probe(struct pci_dev *pdev,
        const struct pci_device_id *ent)
{
        struct net_device *netdev;
        struct nic *nic;
        int err;

        /*
         * Allocate the net_device. alloc_etherdev() is the Ethernet-specific
         * allocator; the size argument reserves the private area for struct nic.
         */
        if(!(netdev = alloc_etherdev(sizeof(struct nic)))) {
                if(((1 << debug) - 1) & NETIF_MSG_PROBE)
                        printk(KERN_ERR PFX "Etherdev alloc failed, abort.\n");
                return -ENOMEM;
        }

        /* Install the driver's callbacks in the net_device. */
        netdev->open = e100_open;
        netdev->stop = e100_close;
        netdev->hard_start_xmit = e100_xmit_frame;
        netdev->get_stats = e100_get_stats;
        netdev->set_multicast_list = e100_set_multicast_list;
        netdev->set_mac_address = e100_set_mac_address;
        netdev->change_mtu = e100_change_mtu;
        netdev->do_ioctl = e100_do_ioctl;
        /* Hook up the ethtool operations. */
        SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops);
        netdev->tx_timeout = e100_tx_timeout;
        netdev->watchdog_timeo = E100_WATCHDOG_PERIOD;
        /* Polling (NAPI) callback and its weight. */
        netdev->poll = e100_poll;
        netdev->weight = E100_NAPI_WEIGHT;
#ifdef CONFIG_NET_POLL_CONTROLLER
        netdev->poll_controller = e100_netpoll;
#endif

        /*
         * Fetch the private area of the net_device (sized by the
         * alloc_etherdev() argument above) and cross-link it with the
         * net_device and the PCI device.
         */
        nic = netdev_priv(netdev);
        nic->netdev = netdev;
        nic->pdev = pdev;
        nic->msg_enable = (1 << debug) - 1;
        pci_set_drvdata(pdev, netdev);

        /* Enable the PCI device before its DMA and I/O memory are used. */
        if((err = pci_enable_device(pdev))) {
                DPRINTK(PROBE, ERR, "Cannot enable PCI device, aborting.\n");
                goto err_out_free_dev;
        }

        /* Resource 0 must be I/O memory (IORESOURCE_MEM), otherwise give up. */
        if(!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
                DPRINTK(PROBE, ERR, "Cannot find proper PCI device "
                        "base address, aborting.\n");
                err = -ENODEV;
                goto err_out_disable_pdev;
        }

        /* Claim the device's PCI resource regions under the driver's name. */
        if((err = pci_request_regions(pdev, DRV_NAME))) {
                DPRINTK(PROBE, ERR, "Cannot obtain PCI resources, aborting.\n");
                goto err_out_disable_pdev;
        }

        /*
         * Enable bus mastering and ask for a 32-bit DMA mask; a non-zero
         * return from pci_set_dma_mask() means no usable DMA configuration.
         */
        pci_set_master(pdev);
        if((err = pci_set_dma_mask(pdev, 0xFFFFFFFFULL))) {
                DPRINTK(PROBE, ERR, "No usable DMA configuration, aborting.\n");
                goto err_out_free_res;
        }

        SET_MODULE_OWNER(netdev);
        SET_NETDEV_DEV(netdev, &pdev->dev);

        /*
         * Map the device registers (resource 0, sizeof(struct csr) bytes) so
         * that later register accesses go through nic->csr.
         */
        nic->csr = ioremap(pci_resource_start(pdev, 0), sizeof(struct csr));
        if(!nic->csr) {
                DPRINTK(PROBE, ERR, "Cannot map device registers, aborting.\n");
                err = -ENOMEM;
                goto err_out_free_res;
        }

        if(ent->driver_data)
                nic->flags |= ich;
        else
                nic->flags &= ~ich;

        spin_lock_init(&nic->cb_lock);
        spin_lock_init(&nic->cmd_lock);

        /* Prepare the watchdog and LED-blink timers; both get nic as their argument. */
        init_timer(&nic->watchdog);
        nic->watchdog.function = e100_watchdog;
        nic->watchdog.data = (unsigned long)nic;
        init_timer(&nic->blink_timer);
        nic->blink_timer.function = e100_blink_led;
        nic->blink_timer.data = (unsigned long)nic;

        /* Allocate the driver's memory (released again by e100_free()). */
        if((err = e100_alloc(nic))) {
                DPRINTK(PROBE, ERR, "Cannot alloc driver memory, aborting.\n");
                goto err_out_iounmap;
        }

        /* Give the nic fields their defaults, then reset the hardware and init the PHY. */
        e100_get_defaults(nic);
        e100_hw_reset(nic);
        e100_phy_init(nic);

        /* Load the EEPROM contents; the MAC address is taken from them below. */
        if((err = e100_eeprom_load(nic)))
                goto err_out_free;

        /* The first ETH_ALEN bytes of the EEPROM image become netdev->dev_addr. */
        memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN);
        if(!is_valid_ether_addr(netdev->dev_addr)) {
                DPRINTK(PROBE, ERR, "Invalid MAC address from "
                        "EEPROM, aborting.\n");
                err = -EAGAIN;
                goto err_out_free;
        }

        /* Wol magic packet can be enabled from eeprom */
        if((nic->mac >= mac_82558_D101_A4) &&
           (nic->eeprom[eeprom_id] & eeprom_id_wol))
                nic->flags |= wol_magic;

        pci_enable_wake(pdev, 0, nic->flags & (wol_magic | e100_asf(nic)));

        /* Register the network device with the kernel. */
        if((err = register_netdev(netdev))) {
                DPRINTK(PROBE, ERR, "Cannot register net device, aborting.\n");
                goto err_out_free;
        }

        DPRINTK(PROBE, INFO, "addr 0x%lx, irq %d, "
                "MAC addr %02X:%02X:%02X:%02X:%02X:%02X\n",
                pci_resource_start(pdev, 0), pdev->irq,
                netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2],
                netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5]);

        return 0;

        /* Error unwinding: each label undoes one acquisition and falls through. */
err_out_free:
        e100_free(nic);
err_out_iounmap:
        iounmap(nic->csr);
err_out_free_res:
        pci_release_regions(pdev);
err_out_disable_pdev:
        pci_disable_device(pdev);
err_out_free_dev:
        pci_set_drvdata(pdev, NULL);
        free_netdev(netdev);
        return err;
}

 

<<prison break>>第三季的第五集,终于在翘首企盼中姗姗来迟了,scofid用它惊人的智慧一次次化险为夷,但在邪恶的sona监狱他将如何逃脱呢?这我们不得而知,但我们可以分析Linux网络驱动来得到数据包是怎么通过物理接口的这一层“prison”束缚来达到通信目的:-)

         一:预备知识

         关于I/O内存映射。

         设备通过控制总线,数据总线,状态总线与CPU相连。控制总线传送控制信号,例如,网卡的启用。数据总线控制数据传输,例如,网卡发送数据,状态总线一般都是读取设备的当前状态,例如读取网卡的MAC地址。

         在传统的操作中,都是通过读写设备寄存器的值来实现。但是这样耗费了CPU时钟。而且每取一次值都要读取设备寄存器,造成了效率的低下。在现代操作系统中,引入了I/O内存映射。即把寄存器的值映射到主存。对设备寄存器的操作,转换为对主存的操作,这样极大的提高了效率。

         关于DMA

         这是关于设备数据处理的一种方式。传统的处理方法为:当设备接收到数据,向CPU报告中断。CPU处理中断,把数据放到内存。

         在现代操作系统中引入的DMA是指,设备接收到数据时,把数据放至DMA内存,再向CPU产生中断。这样节省了大量的CPU时间

         关于软中断与NAPI

         在现代操作系统中,对中断的处理速度要求越来越高。为了响应中断,将中断分为两部份,即上半部与下半部。上半部将数据推入处理队列,响应中断。然后再由下半部调度完成余下的任务。

         NAPI是2.6新引入的一个概念,它在发生中断的时候,禁用中断。然后处理数据。之后,每隔一定的时间,它会主动向设备询问是否有数据要处理。

         I/O,DMA在后续代码分析中会讨论在linux2.6.21中的实现。软中断与NAPI的详细知识将会在分析中断处理的时候,一一为你道来

你可能感兴趣的:(Linux网络协议栈分析——从设备驱动到链路层)