Socket Buffers 由以下两部分组成:
1) Packet data: 它是在网络上传输的数据,其存储的内容对应于协议数据单元PDU (Protocol Data Unit)
2) Management data: 当包在kernel中进行处理时,kernel需要另外一些数据,如pointer, timers等,它们是协议实体间交换信息的ICI(Interface Control Information)
Socket Buffer构成如下图所示:
在Kernel处理过程中,网络数据以Socket Buffer的形式存在。
当app通过socket发送数据时,socket将创建一个对应的socket buffer,并把需要发送的数据(payload)放于其中。当它通过各个协议层时,每一层的包头将被插入到payload的前面;由于在创建socket buffer时已为包头预留了足够空间,插入包头时无需再移动或复制payload。按此方案,payload只被copy两次:
1) 从用户空间copy到kernel空间
2) 发送数据到network adapter
在协议层间传递时,其数据变化如下图所示:
Socket Buffer数据结构如下所示:
/*
 * struct sk_buff - socket buffer.
 *
 * Kernel-side descriptor for one network packet. It carries the management
 * data used while the packet is processed (list links, timestamp, owning
 * socket, device, checksum state, per-layer header positions) together with
 * the head/data/tail/end members that describe the packet data area.
 *
 * The layout below is unchanged from the quoted kernel definition: member
 * order and types are preserved. Each preprocessor directive sits on its
 * own line, as the C preprocessor requires.
 */
struct sk_buff {
	/* These two members must be first. */
	struct sk_buff		*next;
	struct sk_buff		*prev;

	ktime_t			tstamp;

	struct sock		*sk;
	struct net_device	*dev;

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48] __aligned(8);

	unsigned long		_skb_refdst;
#ifdef CONFIG_XFRM
	struct sec_path		*sp;
#endif
	unsigned int		len,
				data_len;
	__u16			mac_len,
				hdr_len;
	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};
	__u32			priority;
	kmemcheck_bitfield_begin(flags1);
	__u8			local_df:1,
				cloned:1,
				ip_summed:2,
				nohdr:1,
				nfctinfo:3;
	__u8			pkt_type:3,
				fclone:2,
				ipvs_property:1,
				peeked:1,
				nf_trace:1;
	kmemcheck_bitfield_end(flags1);
	__be16			protocol;

	void			(*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	struct nf_conntrack	*nfct;
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
	struct sk_buff		*nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	struct nf_bridge_info	*nf_bridge;
#endif

	int			skb_iif;
#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif

	__u32			rxhash;

	__u16			queue_mapping;
	kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2;
#endif
	__u8			ooo_okay:1;
	kmemcheck_bitfield_end(flags2);

	/* 0/13 bit hole */

#ifdef CONFIG_NET_DMA
	dma_cookie_t		dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32			secmark;
#endif
	union {
		__u32		mark;
		__u32		dropcount;
	};

	__u16			vlan_tci;

	sk_buff_data_t		transport_header;	/* transport-layer header */
	sk_buff_data_t		network_header;		/* network-layer header */
	sk_buff_data_t		mac_header;		/* link-layer header */

	/* These elements must be at the end, see alloc_skb() for details. */
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	unsigned char		*head,
				*data;
	unsigned int		truesize;
	atomic_t		users;
};
对Socket Buffers的操作分为以下三类:
1) 创建、释放和复制socket buffers
2) 操作sk_buff结构中的参数和指针,主要是改变包数据空间的操作
3) 管理socket buffer队列
其相关函数如下所示:
alloc_skb() include/linux/skbuff.h
dev_alloc_skb() net/core/skbuff.c
skb_copy() net/core/skbuff.c
skb_copy_expand() net/core/skbuff.c
skb_clone() net/core/skbuff.c
kfree_skb() net/core/skbuff.c
dev_kfree_skb() include/linux/skbuff.h
kfree_skbmem() net/core/skbuff.c
include/linux/skbuff.h
skb_get()
skb_unshare()
skb_put()
skb_push()
skb_pull()
skb_tailroom()
skb_headroom()
skb_realloc_headroom()
skb_reserve()
skb_trim()
skb_cow()
如果Socket Buffer当前没有被处理,则它由sk_buff_head管理;sk_buff_head通过双向链表管理这些socket buffer,如下图所示:
/*
 * struct sk_buff_head - anchor of a socket-buffer queue.
 *
 * Starts with the same next/prev pair as struct sk_buff, followed by a
 * 32-bit qlen counter (presumably the number of queued buffers, going by
 * the name) and a spinlock.
 */
struct sk_buff_head {
	/* These two members must be first. */
	struct sk_buff	*next;
	struct sk_buff	*prev;

	__u32		qlen;
	spinlock_t	lock;
};
include/linux/skbuff.h
skb_queue_head_init()
skb_queue_empty()
skb_queue_len()
include/linux/skbuff.h
skb_queue_head()
skb_queue_tail()
skb_dequeue()
skb_dequeue_tail()
skb_queue_purge()
skb_insert()
skb_append()
skb_unlink()
skb_peek()
skb_peek_tail()
在Linux系统的网络架构中,基于软件的协议(software-based protocol)与网络适配器(network adapters)间的接口通过network devices来实现。一个network-device接口需要满足以下要求:
1) 是network adapter的技术抽象
2) 提供统一的接口供协议实体访问
【网络设备】不同于【字符设备】和【块设备】,其主要区别如下:
1) 网络设备在/dev下不存在对应的设备名,即不可通过read和write进行读写操作
2) 网络设备基于包进行处理,且必须经过复杂协议的处理(如TCP和UDP)
net_device定义如下:
struct net_device { /* * This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. */ char name[IFNAMSIZ]; struct pm_qos_request_list pm_qos_req; /* device name hash chain */ struct hlist_node name_hlist; /* snmp alias */ char *ifalias; // 硬件相关的信息 /* * I/O specific fields * FIXME: Merge these and struct ifmap into one */ unsigned long mem_end; /* shared mem end */ unsigned long mem_start; /* shared mem start */ unsigned long base_addr; /* device I/O address */ unsigned int irq; /* device IRQ number */ /* * Some hardware also needs these fields, but they are not * part of the usual set specified in Space.c. */ unsigned long state; struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; /* currently active device features */ u32 features; /* user-changeable features */ u32 hw_features; /* user-requested features */ u32 wanted_features; /* mask of features inheritable by VLAN devices */ u32 vlan_features; /* Interface index. Unique device identifier */ int ifindex; int iflink; struct net_device_stats stats; atomic_long_t rx_dropped; /* dropped packets by core network * Do not use this in drivers. */ // 管理操作 /* Management operations */ const struct net_device_ops *netdev_ops; // 最终调用网络设备驱动方法 const struct ethtool_ops *ethtool_ops; // 硬件头描述 /* Hardware header description */ const struct header_ops *header_ops; unsigned int flags; /* interface flags (a la BSD) */ unsigned int priv_flags; /* Like 'flags' but invisible to userspace. 
*/ unsigned short gflags; unsigned short padded; /* How much padding added by alloc_netdev() */ unsigned char operstate; /* RFC2863 operstate */ unsigned char link_mode; /* mapping policy to operstate */ unsigned char if_port; /* Selectable AUI, TP,..*/ unsigned char dma; /* DMA channel */ unsigned int mtu; /* interface MTU value */ unsigned short type; /* interface hardware type */ unsigned short hard_header_len; /* hardware hdr length */ /* extra head- and tailroom the hardware may need, but not in all cases * can this be guaranteed, especially tailroom. Some cases also use * LL_MAX_HEADER instead to allocate the skb. */ unsigned short needed_headroom; unsigned short needed_tailroom; // 接口地址信息 /* Interface address info. */ unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */ unsigned char addr_assign_type; /* hw address assignment type */ unsigned char addr_len; /* hardware address length */ unsigned short dev_id; /* for shared network cards */ spinlock_t addr_list_lock; struct netdev_hw_addr_list uc; /* Unicast mac addresses */ struct netdev_hw_addr_list mc; /* Multicast mac addresses */ int uc_promisc; unsigned int promiscuity; unsigned int allmulti; // 协议相关的指针 /* Protocol specific pointers */ void *atalk_ptr; /* AppleTalk link */ struct in_device __rcu *ip_ptr; /* IPv4 specific data */ struct dn_dev __rcu *dn_ptr; /* DECnet specific data */ struct inet6_dev __rcu *ip6_ptr; /* IPv6 specific data */ void *ec_ptr; /* Econet specific data */ void *ax25_ptr; /* AX.25 specific data */ struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data, assign before registering */ // 在接收通道中需要缓存的数据 /* * Cache lines mostly used on receive path (including eth_type_trans()) */ unsigned long last_rx; /* Time of last Rx * This should not be set in * drivers, unless really needed, * because network stack (bonding) * use it if/when necessary, to * avoid dirtying this cache line. 
*/ struct net_device *master; /* Pointer to master device of a group, * which this device is member of. */ /* Interface address info used in eth_type_trans() */ unsigned char *dev_addr; /* hw address, (before bcast because most packets are unicast) */ struct netdev_hw_addr_list dev_addrs; /* list of device hw addresses */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ #ifdef CONFIG_RPS struct kset *queues_kset; struct netdev_rx_queue *_rx; /* Number of RX queues allocated at register_netdev() time */ unsigned int num_rx_queues; /* Number of RX queues currently active in device */ unsigned int real_num_rx_queues; #ifdef CONFIG_RFS_ACCEL /* CPU reverse-mapping for RX completion interrupts, indexed * by RX queue number. Assigned by driver. This must only be * set if the ndo_rx_flow_steer operation is defined. */ struct cpu_rmap *rx_cpu_rmap; #endif #endif rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; struct netdev_queue __rcu *ingress_queue; // 在发送通道中需要缓存的数据 /* * Cache lines mostly used on transmit path */ struct netdev_queue *_tx ____cacheline_aligned_in_smp; /* Number of TX queues allocated at alloc_netdev_mq() time */ unsigned int num_tx_queues; /* Number of TX queues currently active in device */ unsigned int real_num_tx_queues; /* root qdisc from userspace point of view */ struct Qdisc *qdisc; unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps; #endif /* These may be needed for future network-power-down code. */ /* * trans_start here is expensive for high speed devices on SMP, * please use netdev_queue->trans_start instead. 
*/ unsigned long trans_start; /* Time (in jiffies) of last Tx */ int watchdog_timeo; /* used by dev_watchdog() */ struct timer_list watchdog_timer; /* Number of references to this device */ int __percpu *pcpu_refcnt; /* delayed register/unregister */ struct list_head todo_list; /* device index hash chain */ struct hlist_node index_hlist; struct list_head link_watch_list; /* register/unregister state machine */ enum { NETREG_UNINITIALIZED=0, NETREG_REGISTERED, /* completed register_netdevice */ NETREG_UNREGISTERING, /* called unregister_netdevice */ NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ NETREG_DUMMY, /* dummy device for NAPI poll */ } reg_state:8; bool dismantle; /* device is going do be freed */ enum { RTNL_LINK_INITIALIZED, RTNL_LINK_INITIALIZING, } rtnl_link_state:16; /* Called from unregister, can be used to call free_netdev */ void (*destructor)(struct net_device *dev); #ifdef CONFIG_NETPOLL struct netpoll_info *npinfo; #endif #ifdef CONFIG_NET_NS /* Network namespace this network device is inside */ struct net *nd_net; #endif /* mid-layer private */ union { void *ml_priv; struct pcpu_lstats __percpu *lstats; /* loopback stats */ struct pcpu_tstats __percpu *tstats; /* tunnel stats */ struct pcpu_dstats __percpu *dstats; /* dummy stats */ }; /* GARP */ struct garp_port __rcu *garp_port; /* class/net/name entry */ struct device dev; /* space for optional device, statistics, and wireless sysfs groups */ const struct attribute_group *sysfs_groups[4]; /* rtnetlink link ops */ const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; u8 num_tc; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; /* n-tuple filter list attached to this device */ struct ethtool_rx_ntuple_list ethtool_ntuple_list; /* phy device may attach itself for hardware timestamping */ struct phy_device *phydev; /* 
group the device belongs to */ int group; }
net_device是每个网络设备的基础,它不仅包含network adapter硬件信息(如:interrupt, ports, driver functions等),也包含高层网络协议的配置数据(如:IP address, subnet mask等).
在/sys/class/net下列出了所有网络设备的名字,例如在我的设备上为:
shell@android:/sys/class/net # ll
lrwxrwxrwx root root 2013-07-05 17:08 ip6tnl0
lrwxrwxrwx root root 2013-07-05 17:08 lo (loopback设备)
lrwxrwxrwx root root 2013-07-05 17:08 sit0
lrwxrwxrwx root root 2000-01-01 08:00 wlan0 (Wifi设备)
从上面的协议实例看net_device。
位于文件:kernel/net/core/dev.c
int register_netdev(struct net_device *dev)
void unregister_netdev(struct net_device *dev)
网络设备(net_device)与一个已经存在的network adapter一一对应。
位于文件:kernel/net/core/dev.c
int dev_open(struct net_device *dev)
int dev_close(struct net_device *dev)
位于文件:kernel/net/core/dev.c
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
void free_netdev(struct net_device *dev)
int dev_alloc_name(struct net_device *dev, const char *name)
struct net_device *dev_get_by_index(struct net *net, int ifindex)
struct net_device *dev_get_by_name(struct net *net, const char *name)
void dev_load(struct net *net, const char *name)
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
网络状态有如下值:
NETDEV_UP: 激活一个网络设备 (dev_open)
NETDEV_DOWN: 禁止一个网络设备 (dev_close)
NETDEV_CHANGE: 通知网络设备状态变化
NETDEV_REGISTER: 网络设备已经被注册,但是没有打开实例
NETDEV_UNREGISTER: 网络设备已经被删除
NETDEV_CHANGEMTU: 网络设备MTU被修改
NETDEV_CHANGEADDR: 网络设备硬件地址被修改
NETDEV_CHANGENAME: 网络设备名字被修改
int dev_queue_xmit(struct sk_buff *skb) // kernel/net/core/dev.c
它由高层的协议实例调用,以通过一个net_device(skb->dev)发送一个socket buffer.
struct net_device_ops { int (*ndo_init)(struct net_device *dev); void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); void (*ndo_set_multicast_list)(struct net_device *dev); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); void (*ndo_tx_timeout) (struct net_device *dev); struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp); void (*ndo_vlan_rx_add_vid)(struct net_device *dev, unsigned short vid); void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev, struct netpoll_info *info); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, int queue, u16 vlan, u8 qos); int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate); int (*ndo_get_vf_config)(struct net_device *dev, int vf, struct ifla_vf_info *ivf); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct 
sk_buff *skb); int (*ndo_setup_tc)(struct net_device *dev, u8 tc); #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); #define NETDEV_FCOE_WWNN 0 #define NETDEV_FCOE_WWPN 1 int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); #endif #ifdef CONFIG_RFS_ACCEL int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, u16 rxq_index, u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); u32 (*ndo_fix_features)(struct net_device *dev, u32 features); int (*ndo_set_features)(struct net_device *dev, u32 features); }
在net_device被激活之前,我们必须找到一个匹配的network adapter。网络驱动(network driver)的初始化函数(init/probe)负责找到一个匹配的network adapter并且使用对应的信息初始化net_device。在驱动的probe函数中,主要完成以下任务:(参考:kernel/drivers/net/pci-skeleton.c)
1) 创建net_device
2) 填充相关的硬件信息
3) 调用register_netdev进行注册
4) 设置net_device->netdev_ops (netdev_ops由驱动实现;注意:此步骤必须在第3步调用register_netdev之前完成,因为注册过程会访问netdev_ops)
1) 打开: ifconfig wlan0 up->ioctl->dev_open->net_device.netdev_ops.ndo_open
2) 关闭: ifconfig wlan0 down->ioctl->dev_close->net_device.netdev_ops.ndo_stop
在驱动中实现与ndo_start_xmit对应的函数
其流程如下图所示:
在网络驱动的中断处理函数中,首先调用dev_alloc_skb创建socket buffer,然后把接收到的数据copy到其中,最后调用netif_rx把socket buffer放入队列中,供协议层处理。
int netif_rx(struct sk_buff *skb) // kernel/net/core/dev.c
1) 逻辑链路控制(LLC)层由Linux内核实现,网络适配器通过net_device连接到操作系统内核。
2) 数据链路层(Data-Link Layer)由LLC和MAC组成。LLC隐藏了所有不同传输介质的差异,从而向上层协议提供统一的接口;而MAC层则反映了不同传输技术(传输协议,如802.3与802.11不同)的差异。
逻辑链路控制 LLC (Logical Link Control)子层
媒体接入控制 MAC (Medium Access Control)子层
与接入到传输媒体有关的内容都放在 MAC子层,而LLC 子层则与传输媒体无关,不管采用何种协议的局域网对 LLC 子层来说都是透明的。