Socket Buffers 由以下两部分组成:
1) Packet data: 它是在网络上传输的数据,其存储位置对应的PDU (Protocol Data Unit)
2) Management data: 当包在kernel中进行处理时,kernel需要另外一些数据,如pointer, timers等,它们是协议实体间交换信息的ICI(Interface Control Information)
Socket Buffer构成如下图所示:
在Kernel处理过程中,网络数据以Socket Buffer的形式存在。
当app通过socket发送数据时,socket将创建一个对应的socket buffer,并把需要发送的数据(payload)放于其中。当它通过各个协议层时,每一层的包头将被插入到payload的前面,在创建socket buffer时,为包头预留了足够空间。按此方案,payload被copy两次:
1) 从用户空间copy到kernel空间
2) 发送数据到network adapter
在协议层间传递时,其数据变化如下图所示:
Socket Buffer数据结构如下所示:
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
ktime_t tstamp;
struct sock *sk;
struct net_device *dev;
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);
unsigned long _skb_refdst;
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
unsigned int len,
data_len;
__u16 mac_len,
hdr_len;
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
__u32 priority;
kmemcheck_bitfield_begin(flags1);
__u8 local_df:1,
cloned:1,
ip_summed:2,
nohdr:1,
nfctinfo:3;
__u8 pkt_type:3,
fclone:2,
ipvs_property:1,
peeked:1,
nf_trace:1;
kmemcheck_bitfield_end(flags1);
__be16 protocol;
void (*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge;
#endif
int skb_iif;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
__u32 rxhash;
__u16 queue_mapping;
kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ooo_okay:1;
kmemcheck_bitfield_end(flags2);
/* 0/13 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 dropcount;
};
__u16 vlan_tci;
sk_buff_data_t transport_header; //传输层头
sk_buff_data_t network_header; //网络层头
sk_buff_data_t mac_header; //链路层头
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;
atomic_t users;
}
对Socket Buffers的操作分为以下三类:
1) 创建、释放和复制sokcet buffers
2) 操作sk_buff结构中的参数和指针,主要是改变包数据空间的操作
3) 管理socket buffer队列
其相关函数如下所示:
alloc_skb() include/linux/skbuff.h
dev_alloc_skb() net/core/skbuff.c
skb_copy() net/core/skbuff.c
skb_copy_expand() net/core/skbuff.c
skb_clone() net/core/skbuff.c
kfree_skb() net/core/skbuff.c
dev_kfree_skb() include/linux/skbuff.h
kfree_skbmem() net/core/skbuff.c
include/linux/skbuff.h
skb_get()
skb_unshare()
skb_put()
skb_push()
skb_pull()
skb_tailroom()
skb_headroom()
skb_realloc_headroom()
skb_reserve()
skb_trim()
skb_cow()
如果Socket Buffer不是正在被处理,则它被sk_buff_head管理,它通过双向链表进行管理,如下图所示:
struct sk_buff_head {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen;
spinlock_t lock;
};
include/linux/skbuff.h
skb_queue_head_init()
skb_queue_empty()
skb_queue_len()
include/linux/skbuff.h
skb_queue_head()
skb_queue_tail()
skb_dequeue()
skb_dequeue_tail()
skb_queue_purge()
skb_insert()
skb_append()
skb_unlink()
skb_peek()
skb_peek_tail()
在Linux系统中的网络架构,基于软件的协议(software-based protocol)与网络适配器(network adapters)间的接口通过network devices来实现。一个network-device接口需要满足以下要求:
1) 是network adapter的技术抽象
2) 提供统一的接口供协议实体访问
【网络设备】不同于【字符设备】和【块设备】,其主要区别如下:
1) 网络设备在/dev下不存在对应的设备名,即不可通过read和write进行读写操作
2) 网络设备基于包进行处理,且必须经过复杂协议的处理(如TCP和UDP)
net_device定义如下:
struct net_device {
/*
* This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* of the interface.
*/
char name[IFNAMSIZ];
struct pm_qos_request_list pm_qos_req;
/* device name hash chain */
struct hlist_node name_hlist;
/* snmp alias */
char *ifalias;
// 硬件相关的信息
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
unsigned long mem_end; /* shared mem end */
unsigned long mem_start; /* shared mem start */
unsigned long base_addr; /* device I/O address */
unsigned int irq; /* device IRQ number */
/*
* Some hardware also needs these fields, but they are not
* part of the usual set specified in Space.c.
*/
unsigned long state;
struct list_head dev_list;
struct list_head napi_list;
struct list_head unreg_list;
/* currently active device features */
u32 features;
/* user-changeable features */
u32 hw_features;
/* user-requested features */
u32 wanted_features;
/* mask of features inheritable by VLAN devices */
u32 vlan_features;
/* Interface index. Unique device identifier */
int ifindex;
int iflink;
struct net_device_stats stats;
atomic_long_t rx_dropped; /* dropped packets by core network
* Do not use this in drivers.
*/
// 管理操作
/* Management operations */
const struct net_device_ops *netdev_ops; // 最终调用网络设备驱动方法
const struct ethtool_ops *ethtool_ops;
// 硬件头描述
/* Hardware header description */
const struct header_ops *header_ops;
unsigned int flags; /* interface flags (a la BSD) */
unsigned int priv_flags; /* Like 'flags' but invisible to userspace. */
unsigned short gflags;
unsigned short padded; /* How much padding added by alloc_netdev() */
unsigned char operstate; /* RFC2863 operstate */
unsigned char link_mode; /* mapping policy to operstate */
unsigned char if_port; /* Selectable AUI, TP,..*/
unsigned char dma; /* DMA channel */
unsigned int mtu; /* interface MTU value */
unsigned short type; /* interface hardware type */
unsigned short hard_header_len; /* hardware hdr length */
/* extra head- and tailroom the hardware may need, but not in all cases
* can this be guaranteed, especially tailroom. Some cases also use
* LL_MAX_HEADER instead to allocate the skb.
*/
unsigned short needed_headroom;
unsigned short needed_tailroom;
// 接口地址信息
/* Interface address info. */
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
unsigned char addr_assign_type; /* hw address assignment type */
unsigned char addr_len; /* hardware address length */
unsigned short dev_id; /* for shared network cards */
spinlock_t addr_list_lock;
struct netdev_hw_addr_list uc; /* Unicast mac addresses */
struct netdev_hw_addr_list mc; /* Multicast mac addresses */
int uc_promisc;
unsigned int promiscuity;
unsigned int allmulti;
// 协议相关的指针
/* Protocol specific pointers */
void *atalk_ptr; /* AppleTalk link */
struct in_device __rcu *ip_ptr; /* IPv4 specific data */
struct dn_dev __rcu *dn_ptr; /* DECnet specific data */
struct inet6_dev __rcu *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
void *ax25_ptr; /* AX.25 specific data */
struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
assign before registering */
// 在接收通道中需要缓存的数据
/*
* Cache lines mostly used on receive path (including eth_type_trans())
*/
unsigned long last_rx; /* Time of last Rx
* This should not be set in
* drivers, unless really needed,
* because network stack (bonding)
* use it if/when necessary, to
* avoid dirtying this cache line.
*/
struct net_device *master; /* Pointer to master device of a group,
* which this device is member of.
*/
/* Interface address info used in eth_type_trans() */
unsigned char *dev_addr; /* hw address, (before bcast
because most packets are
unicast) */
struct netdev_hw_addr_list dev_addrs; /* list of device
hw addresses */
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
#ifdef CONFIG_RPS
struct kset *queues_kset;
struct netdev_rx_queue *_rx;
/* Number of RX queues allocated at register_netdev() time */
unsigned int num_rx_queues;
/* Number of RX queues currently active in device */
unsigned int real_num_rx_queues;
#ifdef CONFIG_RFS_ACCEL
/* CPU reverse-mapping for RX completion interrupts, indexed
* by RX queue number. Assigned by driver. This must only be
* set if the ndo_rx_flow_steer operation is defined. */
struct cpu_rmap *rx_cpu_rmap;
#endif
#endif
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
struct netdev_queue __rcu *ingress_queue;
// 在发送通道中需要缓存的数据
/*
* Cache lines mostly used on transmit path
*/
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
unsigned int num_tx_queues;
/* Number of TX queues currently active in device */
unsigned int real_num_tx_queues;
/* root qdisc from userspace point of view */
struct Qdisc *qdisc;
unsigned long tx_queue_len; /* Max frames per queue allowed */
spinlock_t tx_global_lock;
#ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_maps;
#endif
/* These may be needed for future network-power-down code. */
/*
* trans_start here is expensive for high speed devices on SMP,
* please use netdev_queue->trans_start instead.
*/
unsigned long trans_start; /* Time (in jiffies) of last Tx */
int watchdog_timeo; /* used by dev_watchdog() */
struct timer_list watchdog_timer;
/* Number of references to this device */
int __percpu *pcpu_refcnt;
/* delayed register/unregister */
struct list_head todo_list;
/* device index hash chain */
struct hlist_node index_hlist;
struct list_head link_watch_list;
/* register/unregister state machine */
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
NETREG_DUMMY, /* dummy device for NAPI poll */
} reg_state:8;
bool dismantle; /* device is going do be freed */
enum {
RTNL_LINK_INITIALIZED,
RTNL_LINK_INITIALIZING,
} rtnl_link_state:16;
/* Called from unregister, can be used to call free_netdev */
void (*destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL
struct netpoll_info *npinfo;
#endif
#ifdef CONFIG_NET_NS
/* Network namespace this network device is inside */
struct net *nd_net;
#endif
/* mid-layer private */
union {
void *ml_priv;
struct pcpu_lstats __percpu *lstats; /* loopback stats */
struct pcpu_tstats __percpu *tstats; /* tunnel stats */
struct pcpu_dstats __percpu *dstats; /* dummy stats */
};
/* GARP */
struct garp_port __rcu *garp_port;
/* class/net/name entry */
struct device dev;
/* space for optional device, statistics, and wireless sysfs groups */
const struct attribute_group *sysfs_groups[4];
/* rtnetlink link ops */
const struct rtnl_link_ops *rtnl_link_ops;
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size;
u8 num_tc;
struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
u8 prio_tc_map[TC_BITMASK + 1];
/* n-tuple filter list attached to this device */
struct ethtool_rx_ntuple_list ethtool_ntuple_list;
/* phy device may attach itself for hardware timestamping */
struct phy_device *phydev;
/* group the device belongs to */
int group;
}
net_device是每个网络设备的基础,它不仅包含network adapter硬件信息(如:interrupt, ports, driver functions等),也包含高层网络协议的配置数据(如:IP address, subnet mask等).
在/sys/class/net下列出来所有网络设备的名字,如我的为:
shell@android:/sys/class/net # ll
lrwxrwxrwx root root 2013-07-05 17:08 ip6tnl0
lrwxrwxrwx root root 2013-07-05 17:08 lo (loopback设备)
lrwxrwxrwx root root 2013-07-05 17:08 sit0
lrwxrwxrwx root root 2000-01-01 08:00 wlan0 (Wifi设备)
从上面的协议实例看net_device。
位于文件:kernel/net/core/dev.c
int register_netdev(struct net_device *dev)
void unregister_netdev(struct net_device *dev)
网络设备(net_device)与一个已经存在的network adapter一一对应。
位于文件:kernel/net/core/dev.c
int dev_open(struct net_device *dev)
int dev_close(struct net_device *dev)
位于文件:kernel/net/core/dev.c
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
void free_netdev(struct net_device *dev)
int dev_alloc_name(struct net_device *dev, const char *name)
struct net_device *dev_get_by_index(struct net *net, int ifindex)
struct net_device *dev_get_by_name(struct net *net, const char *name)
void dev_load(struct net *net, const char *name)
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
网络状态有如下值:
NETDEV_UP: 激活一个网络设备 (dev_open)
NETDEV_DOWN: 禁止一个网络设备 (dev_close)
NETDEV_CHANGE: 通知网络设备状态变化
NETDEV_REGISTER: 网络设备已经被注册,但是没有打开实例
NETDEV_UNREGISTER: 网络设备已经被删除
NETDEV_CHANGEMTU: 网络设备MTU被修改
NETDEV_CHANGEADDR: 网络设备硬件地址被修改
NETDEV_CHANGENAME:网络设备名字被修改
int dev_queue_xmit(struct sk_buff *skb) // kernel/net/core/dev.c
它由高层的协议实例调用,以通过一个net_device(skb->dev)发送一个socket buffer.
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
void (*ndo_uninit)(struct net_device *dev);
int (*ndo_open)(struct net_device *dev);
int (*ndo_stop)(struct net_device *dev);
netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb,
struct net_device *dev);
u16 (*ndo_select_queue)(struct net_device *dev,
struct sk_buff *skb);
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
void (*ndo_set_multicast_list)(struct net_device *dev);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
int (*ndo_do_ioctl)(struct net_device *dev,
struct ifreq *ifr, int cmd);
int (*ndo_set_config)(struct net_device *dev,
struct ifmap *map);
int (*ndo_change_mtu)(struct net_device *dev,
int new_mtu);
int (*ndo_neigh_setup)(struct net_device *dev,
struct neigh_parms *);
void (*ndo_tx_timeout) (struct net_device *dev);
struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
struct rtnl_link_stats64 *storage);
struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
void (*ndo_vlan_rx_register)(struct net_device *dev,
struct vlan_group *grp);
void (*ndo_vlan_rx_add_vid)(struct net_device *dev,
unsigned short vid);
void (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
unsigned short vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
void (*ndo_poll_controller)(struct net_device *dev);
int (*ndo_netpoll_setup)(struct net_device *dev,
struct netpoll_info *info);
void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
int (*ndo_set_vf_mac)(struct net_device *dev,
int queue, u8 *mac);
int (*ndo_set_vf_vlan)(struct net_device *dev,
int queue, u16 vlan, u8 qos);
int (*ndo_set_vf_tx_rate)(struct net_device *dev,
int vf, int rate);
int (*ndo_get_vf_config)(struct net_device *dev,
int vf,
struct ifla_vf_info *ivf);
int (*ndo_set_vf_port)(struct net_device *dev,
int vf,
struct nlattr *port[]);
int (*ndo_get_vf_port)(struct net_device *dev,
int vf, struct sk_buff *skb);
int (*ndo_setup_tc)(struct net_device *dev, u8 tc);
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
int (*ndo_fcoe_enable)(struct net_device *dev);
int (*ndo_fcoe_disable)(struct net_device *dev);
int (*ndo_fcoe_ddp_setup)(struct net_device *dev,
u16 xid,
struct scatterlist *sgl,
unsigned int sgc);
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
u16 xid);
int (*ndo_fcoe_ddp_target)(struct net_device *dev,
u16 xid,
struct scatterlist *sgl,
unsigned int sgc);
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
#endif
#ifdef CONFIG_RFS_ACCEL
int (*ndo_rx_flow_steer)(struct net_device *dev,
const struct sk_buff *skb,
u16 rxq_index,
u32 flow_id);
#endif
int (*ndo_add_slave)(struct net_device *dev,
struct net_device *slave_dev);
int (*ndo_del_slave)(struct net_device *dev,
struct net_device *slave_dev);
u32 (*ndo_fix_features)(struct net_device *dev,
u32 features);
int (*ndo_set_features)(struct net_device *dev,
u32 features);
}
在net_device被激活之前,我们必须找到一个匹配的network adapter。网络驱动(network driver)的初始化函数(init/probe)负责找到一个匹配的network adapter并且使用对应的信息初始化net_device。在驱动的probe函数中,主要完成以下任务:(参考:kernel/drivers/net/pci-skeleton.c)
1) 创建net_device
2) 填充相关的硬件信息
3) 调用register_netdev进行注册
4) 设置net_device->netdev_ops (netdev_ops由驱动实现)
1) 打开: ifconfig wlan0 up->ioctl->dev_open->net_device.netdev_ops.ndo_open
2) 关闭: ifconfig wlan0 down->ioctl->dev_close->net_device.netdev_ops.ndo_stop
在驱动中实现与ndo_start_xmit对应的函数
其流程如下图所示:
在网络驱动的中断处理函数中,首先调用dev_alloc_skb创建socket buffer,然后把接收到的数据copy到其中,最后调用netif_rx把socket buffer放入队列中,供协议层处理。
int netif_rx(struct sk_buff *skb) // kernel/net/core/dev.c
1) 逻辑链路控制(LLC)层由Linux内核实现,网络适配器通过net_device连接到操作系统内核。
2) 数据链路层(Data-Link Layer)由LLC和MAC组成。LLC隐藏了所有不同传输介质的差异,从而以上层协议提供统一的接口;而MAC层则反应了不同传输技术(传输协议,如802.3与802.11不同)的差异。
逻辑链路控制 LLC (Logical Link Control)子层
媒体接入控制 MAC (Medium Access Control)子层
与接入到传输媒体有关的内容都放在 MAC子层,而LLC 子层则与传输媒体无关,不管采用何种协议的局域网对 LLC 子层来说都是透明的。