本文分析基于Linux Kernel 3.2.1
原创作品,转载请标明http://blog.csdn.net/yming0221/article/details/7979838
更多请查看专栏http://blog.csdn.net/column/details/linux-kernel-net.html
作者:闫明
Linux内核中协议族有INET协议族,UNIX协议族等,我们还是以INET协议族为例。
下面是内核中的协议族声明:
-
- #define AF_UNSPEC 0
- #define AF_UNIX 1 /* Unix domain sockets */
- #define AF_LOCAL 1 /* POSIX name for AF_UNIX */
- #define AF_INET 2 /* Internet IP Protocol */
- #define AF_AX25 3 /* Amateur Radio AX.25 */
- #define AF_IPX 4 /* Novell IPX */
- #define AF_APPLETALK 5 /* AppleTalk DDP */
- #define AF_NETROM 6 /* Amateur Radio NET/ROM */
- #define AF_BRIDGE 7 /* Multiprotocol bridge */
- #define AF_ATMPVC 8 /* ATM PVCs */
- #define AF_X25 9 /* Reserved for X.25 project */
- #define AF_INET6 10 /* IP version 6 */
- #define AF_ROSE 11 /* Amateur Radio X.25 PLP */
- #define AF_DECnet 12 /* Reserved for DECnet project */
- #define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/
- #define AF_SECURITY 14 /* Security callback pseudo AF */
- #define AF_KEY 15 /* PF_KEY key management API */
- #define AF_NETLINK 16
- #define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */
- #define AF_PACKET 17 /* Packet family */
- #define AF_ASH 18 /* Ash */
- #define AF_ECONET 19 /* Acorn Econet */
- #define AF_ATMSVC 20 /* ATM SVCs */
- #define AF_RDS 21 /* RDS sockets */
- #define AF_SNA 22 /* Linux SNA Project (nutters!) */
- #define AF_IRDA 23 /* IRDA sockets */
- #define AF_PPPOX 24 /* PPPoX sockets */
- #define AF_WANPIPE 25 /* Wanpipe API Sockets */
- #define AF_LLC 26 /* Linux LLC */
- #define AF_CAN 29 /* Controller Area Network */
- #define AF_TIPC 30 /* TIPC sockets */
- #define AF_BLUETOOTH 31 /* Bluetooth sockets */
- #define AF_IUCV 32 /* IUCV sockets */
- #define AF_RXRPC 33 /* RxRPC sockets */
- #define AF_ISDN 34 /* mISDN sockets */
- #define AF_PHONET 35 /* Phonet sockets */
- #define AF_IEEE802154 36 /* IEEE802154 sockets */
- #define AF_CAIF 37 /* CAIF sockets */
- #define AF_ALG 38 /* Algorithm sockets */
- #define AF_NFC 39 /* NFC sockets */
- #define AF_MAX 40 /* For now.. */
内核中的PF_***和AF_***其实可以混用,它的宏定义如下:
-
- #define PF_UNSPEC AF_UNSPEC
- #define PF_UNIX AF_UNIX
- #define PF_LOCAL AF_LOCAL
- #define PF_INET AF_INET
- #define PF_AX25 AF_AX25
- #define PF_IPX AF_IPX
- #define PF_APPLETALK AF_APPLETALK
- #define PF_NETROM AF_NETROM
- #define PF_BRIDGE AF_BRIDGE
- #define PF_ATMPVC AF_ATMPVC
- #define PF_X25 AF_X25
- #define PF_INET6 AF_INET6
- #define PF_ROSE AF_ROSE
- #define PF_DECnet AF_DECnet
- #define PF_NETBEUI AF_NETBEUI
- #define PF_SECURITY AF_SECURITY
- #define PF_KEY AF_KEY
- #define PF_NETLINK AF_NETLINK
- #define PF_ROUTE AF_ROUTE
- #define PF_PACKET AF_PACKET
- #define PF_ASH AF_ASH
- #define PF_ECONET AF_ECONET
- #define PF_ATMSVC AF_ATMSVC
- #define PF_RDS AF_RDS
- #define PF_SNA AF_SNA
- #define PF_IRDA AF_IRDA
- #define PF_PPPOX AF_PPPOX
- #define PF_WANPIPE AF_WANPIPE
- #define PF_LLC AF_LLC
- #define PF_CAN AF_CAN
- #define PF_TIPC AF_TIPC
- #define PF_BLUETOOTH AF_BLUETOOTH
- #define PF_IUCV AF_IUCV
- #define PF_RXRPC AF_RXRPC
- #define PF_ISDN AF_ISDN
- #define PF_PHONET AF_PHONET
- #define PF_IEEE802154 AF_IEEE802154
- #define PF_CAIF AF_CAIF
- #define PF_ALG AF_ALG
- #define PF_NFC AF_NFC
- #define PF_MAX AF_MAX
以后的分析就是以INET协议族为例来分析的。
下面的结构体就是在系统初始化时用来管理协议族初始化的结构体:
- struct net_proto_family {
- int family;
- int (*create)(struct net *net, struct socket *sock,
- int protocol, int kern);
- struct module *owner;
- };
第一个属性就是协议族的宏定义,如PF_INET;
第二个属性create是该协议族创建套接字时调用的函数指针(INET协议族对应的就是inet_create);
INET协议族对应该结构的定义如下:
- static const struct net_proto_family inet_family_ops = {
- .family = PF_INET,
- .create = inet_create,
- .owner = THIS_MODULE,
- };
下面结构体是协议族操作集结构体定义:
- struct proto_ops {
- int family;
- struct module *owner;
- int (*release) (struct socket *sock);
- int (*bind) (struct socket *sock,
- struct sockaddr *myaddr,
- int sockaddr_len);
- int (*connect) (struct socket *sock,
- struct sockaddr *vaddr,
- int sockaddr_len, int flags);
- int (*socketpair)(struct socket *sock1,
- struct socket *sock2);
- int (*accept) (struct socket *sock,
- struct socket *newsock, int flags);
- int (*getname) (struct socket *sock,
- struct sockaddr *addr,
- int *sockaddr_len, int peer);
- unsigned int (*poll) (struct file *file, struct socket *sock,
- struct poll_table_struct *wait);
- int (*ioctl) (struct socket *sock, unsigned int cmd,
- unsigned long arg);
- #ifdef CONFIG_COMPAT
- int (*compat_ioctl) (struct socket *sock, unsigned int cmd,
- unsigned long arg);
- #endif
- int (*listen) (struct socket *sock, int len);
- int (*shutdown) (struct socket *sock, int flags);
- int (*setsockopt)(struct socket *sock, int level,
- int optname, char __user *optval, unsigned int optlen);
- int (*getsockopt)(struct socket *sock, int level,
- int optname, char __user *optval, int __user *optlen);
- #ifdef CONFIG_COMPAT
- int (*compat_setsockopt)(struct socket *sock, int level,
- int optname, char __user *optval, unsigned int optlen);
- int (*compat_getsockopt)(struct socket *sock, int level,
- int optname, char __user *optval, int __user *optlen);
- #endif
- int (*sendmsg) (struct kiocb *iocb, struct socket *sock,
- struct msghdr *m, size_t total_len);
- int (*recvmsg) (struct kiocb *iocb, struct socket *sock,
- struct msghdr *m, size_t total_len,
- int flags);
- int (*mmap) (struct file *file, struct socket *sock,
- struct vm_area_struct * vma);
- ssize_t (*sendpage) (struct socket *sock, struct page *page,
- int offset, size_t size, int flags);
- ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len, unsigned int flags);
- };
INET协议族中TCP和UDP协议对应的上述操作集的定义不同:
TCP协议在INET层操作集inet_stream_ops
- const struct proto_ops inet_stream_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet_getname,
- .poll = tcp_poll,
- .ioctl = inet_ioctl,
- .listen = inet_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
- .splice_read = tcp_splice_read,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
- .compat_ioctl = inet_compat_ioctl,
- #endif
- };
UDP协议在INET层操作集inet_dgram_ops
- const struct proto_ops inet_dgram_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_dgram_connect,
- .socketpair = sock_no_socketpair,
- .accept = sock_no_accept,
- .getname = inet_getname,
- .poll = udp_poll,
- .ioctl = inet_ioctl,
- .listen = sock_no_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
- .compat_ioctl = inet_compat_ioctl,
- #endif
- };
上面两个操作集是属于INET协议族层次,可以由协议族层套接字socket来管理,下面是协议族层次的套接字结构体(BSD Socket)定义:
-
-
-
-
-
-
-
-
-
-
- struct socket {
- socket_state state;
-
- kmemcheck_bitfield_begin(type);
- short type;
- kmemcheck_bitfield_end(type);
-
- unsigned long flags;
-
- struct socket_wq __rcu *wq;
-
- struct file *file;
- struct sock *sk;
- const struct proto_ops *ops;
- };
最后一个属性就指向了上面所述的操作集。若使用TCP协议,ops就是inet_stream_ops,若是UDP协议,ops就是inet_dgram_ops。
short type属性的取值可以是如下值(注意:下面列出的数值与MIPS架构的定义一致;在include/linux/net.h的通用定义中SOCK_STREAM = 1,SOCK_DGRAM = 2,其余取值相同):
- enum sock_type {
- SOCK_DGRAM = 1,
- SOCK_STREAM = 2,
- SOCK_RAW = 3,
- SOCK_RDM = 4,
- SOCK_SEQPACKET = 5,
- SOCK_DCCP = 6,
- SOCK_PACKET = 10,
- };
传输层的协议操作集结构体定义:
- struct proto {
- void (*close)(struct sock *sk,
- long timeout);
- int (*connect)(struct sock *sk,
- struct sockaddr *uaddr,
- int addr_len);
- int (*disconnect)(struct sock *sk, int flags);
-
- struct sock * (*accept) (struct sock *sk, int flags, int *err);
-
- int (*ioctl)(struct sock *sk, int cmd,
- unsigned long arg);
- int (*init)(struct sock *sk);
- void (*destroy)(struct sock *sk);
- void (*shutdown)(struct sock *sk, int how);
- int (*setsockopt)(struct sock *sk, int level,
- int optname, char __user *optval,
- unsigned int optlen);
- int (*getsockopt)(struct sock *sk, int level,
- int optname, char __user *optval,
- int __user *option);
- #ifdef CONFIG_COMPAT
- int (*compat_setsockopt)(struct sock *sk,
- int level,
- int optname, char __user *optval,
- unsigned int optlen);
- int (*compat_getsockopt)(struct sock *sk,
- int level,
- int optname, char __user *optval,
- int __user *option);
- int (*compat_ioctl)(struct sock *sk,
- unsigned int cmd, unsigned long arg);
- #endif
- int (*sendmsg)(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len);
- int (*recvmsg)(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg,
- size_t len, int noblock, int flags,
- int *addr_len);
- int (*sendpage)(struct sock *sk, struct page *page,
- int offset, size_t size, int flags);
- int (*bind)(struct sock *sk,
- struct sockaddr *uaddr, int addr_len);
-
- int (*backlog_rcv) (struct sock *sk,
- struct sk_buff *skb);
-
-
- void (*hash)(struct sock *sk);
- void (*unhash)(struct sock *sk);
- void (*rehash)(struct sock *sk);
- int (*get_port)(struct sock *sk, unsigned short snum);
- void (*clear_sk)(struct sock *sk, int size);
-
-
- #ifdef CONFIG_PROC_FS
- unsigned int inuse_idx;
- #endif
-
-
- void (*enter_memory_pressure)(struct sock *sk);
- atomic_long_t *memory_allocated;
- struct percpu_counter *sockets_allocated;
-
-
-
-
-
-
- int *memory_pressure;
- long *sysctl_mem;
- int *sysctl_wmem;
- int *sysctl_rmem;
- int max_header;
- bool no_autobind;
-
- struct kmem_cache *slab;
- unsigned int obj_size;
- int slab_flags;
-
- struct percpu_counter *orphan_count;
-
- struct request_sock_ops *rsk_prot;
- struct timewait_sock_ops *twsk_prot;
-
- union {
- struct inet_hashinfo *hashinfo;
- struct udp_table *udp_table;
- struct raw_hashinfo *raw_hash;
- } h;
-
- struct module *owner;
-
- char name[32];
-
- struct list_head node;
- #ifdef SOCK_REFCNT_DEBUG
- atomic_t socks;
- #endif
- };
该结构体和proto_ops的区别是:该结构体和具体的传输层协议相关,其中的函数指针指向对应的协议的相应的操作函数。
TCP协议的操作集定义如下:
- struct proto tcp_prot = {
- .name = "TCP",
- .owner = THIS_MODULE,
- .close = tcp_close,
- .connect = tcp_v4_connect,
- .disconnect = tcp_disconnect,
- .accept = inet_csk_accept,
- .ioctl = tcp_ioctl,
- .init = tcp_v4_init_sock,
- .destroy = tcp_v4_destroy_sock,
- .shutdown = tcp_shutdown,
- .setsockopt = tcp_setsockopt,
- .getsockopt = tcp_getsockopt,
- .recvmsg = tcp_recvmsg,
- .sendmsg = tcp_sendmsg,
- .sendpage = tcp_sendpage,
- .backlog_rcv = tcp_v4_do_rcv,
- .hash = inet_hash,
- .unhash = inet_unhash,
- .get_port = inet_csk_get_port,
- .enter_memory_pressure = tcp_enter_memory_pressure,
- .sockets_allocated = &tcp_sockets_allocated,
- .orphan_count = &tcp_orphan_count,
- .memory_allocated = &tcp_memory_allocated,
- .memory_pressure = &tcp_memory_pressure,
- .sysctl_mem = sysctl_tcp_mem,
- .sysctl_wmem = sysctl_tcp_wmem,
- .sysctl_rmem = sysctl_tcp_rmem,
- .max_header = MAX_TCP_HEADER,
- .obj_size = sizeof(struct tcp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
- .twsk_prot = &tcp_timewait_sock_ops,
- .rsk_prot = &tcp_request_sock_ops,
- .h.hashinfo = &tcp_hashinfo,
- .no_autobind = true,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
- #endif
- };
UDP协议的操作集则为:
- struct proto udp_prot = {
- .name = "UDP",
- .owner = THIS_MODULE,
- .close = udp_lib_close,
- .connect = ip4_datagram_connect,
- .disconnect = udp_disconnect,
- .ioctl = udp_ioctl,
- .destroy = udp_destroy_sock,
- .setsockopt = udp_setsockopt,
- .getsockopt = udp_getsockopt,
- .sendmsg = udp_sendmsg,
- .recvmsg = udp_recvmsg,
- .sendpage = udp_sendpage,
- .backlog_rcv = __udp_queue_rcv_skb,
- .hash = udp_lib_hash,
- .unhash = udp_lib_unhash,
- .rehash = udp_v4_rehash,
- .get_port = udp_v4_get_port,
- .memory_allocated = &udp_memory_allocated,
- .sysctl_mem = sysctl_udp_mem,
- .sysctl_wmem = &sysctl_udp_wmem_min,
- .sysctl_rmem = &sysctl_udp_rmem_min,
- .obj_size = sizeof(struct udp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
- .h.udp_table = &udp_table,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udp_setsockopt,
- .compat_getsockopt = compat_udp_getsockopt,
- #endif
- .clear_sk = sk_prot_clear_portaddr_nulls,
- };
现在介绍struct socket结构体中的sk属性,它是一个struct sock类型的结构体指针,这个结构体就是传输层的套接字,所有套接字通过该结构来使用网络协议的所有服务。该结构体的定义较长,完整定义见include/net/sock.h。
若sk_family是PF_INET,则sk_type可以取值:SOCK_STREAM,SOCK_DGRAM,SOCK_RAW。其中sk_prot就是指向具体协议的操作集,如TCP协议就为tcp_prot。
若要将协议族操作集和具体协议操作集整合起来为IP协议提供接口,就需要下面的结构体定义:
- struct inet_protosw {
- struct list_head list;
-
-
- unsigned short type;
- unsigned short protocol;
-
- struct proto *prot;
- const struct proto_ops *ops;
-
- char no_check;
- unsigned char flags;
- };
INET三种套接字定义的inetsw_array数组如下:
- static struct inet_protosw inetsw_array[] =
- {
- {
- .type = SOCK_STREAM,
- .protocol = IPPROTO_TCP,
- .prot = &tcp_prot,
- .ops = &inet_stream_ops,
- .no_check = 0,
- .flags = INET_PROTOSW_PERMANENT |
- INET_PROTOSW_ICSK,
- },
-
- {
- .type = SOCK_DGRAM,
- .protocol = IPPROTO_UDP,
- .prot = &udp_prot,
- .ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_PERMANENT,
- },
-
- {
- .type = SOCK_DGRAM,
- .protocol = IPPROTO_ICMP,
- .prot = &ping_prot,
- .ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_REUSE,
- },
-
- {
- .type = SOCK_RAW,
- .protocol = IPPROTO_IP,
- .prot = &raw_prot,
- .ops = &inet_sockraw_ops,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_REUSE,
- }
- };
不过,在初始化的时候我们会将上面数组中的元素按套接字类型插入inetsw链表数组中。其定义如下:
- static struct list_head inetsw[SOCK_MAX];
那内核中套接字struct socket、struct sock、struct inet_sock、struct tcp_sock、struct raw_sock、struct udp_sock、struct inet_connection_sock、struct inet_timewait_sock和struct tcp_timewait_sock的关系是怎样的呢?
*struct socket这个是BSD层的socket,应用程序会通过系统调用首先创建该类型套接字,它和具体协议无关。
*struct inet_sock是INET协议族使用的socket结构,可以看成位于INET层,是struct sock的一个扩展。它的第一个属性就是struct sock结构。
*struct sock是与具体传输层协议相关的套接字,所有内核的操作都基于这个套接字。
*struct tcp_sock是TCP协议的套接字表示,它是对struct inet_connection_sock的扩展,其第一个属性就是struct inet_connection_sock inet_conn。
*struct raw_sock是原始类型的套接字表示,ICMP协议就使用这种套接字,其是对struct inet_sock的扩展,其第一个属性就是struct inet_sock inet。
*struct udp_sock是UDP协议套接字表示,其是对struct inet_sock套接字的扩展。
*struct inet_connection_sock是所有面向连接协议的套接字,是对struct inet_sock套接字扩展。
最后两个(struct inet_timewait_sock和struct tcp_timewait_sock)是用于TIME_WAIT状态超时控制的套接字。
就拿struct inet_sock和struct sock为例来说明,为什么内核中可以直接将sock结构体首地址强制转换成inet_sock的首地址?并且inet_sock的大小要大于sock,直接进行如下强制转换
- static inline struct inet_sock *inet_sk(const struct sock *sk)
- {
- return (struct inet_sock *)sk;
- }
不会发生内存非法访问吗?!答案是不会,因为在分配的时候分配的并不只是struct sock结构体大小的存储空间!
可以细看sock结构体分配的代码:
- struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
- struct proto *prot)
- {
- struct sock *sk;
-
- sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
- if (sk) {
- sk->sk_family = family;
- sk->sk_prot = sk->sk_prot_creator = prot;
- sock_lock_init(sk);
- sock_net_set(sk, get_net(net));
- atomic_set(&sk->sk_wmem_alloc, 1);
-
- sock_update_classid(sk);
- }
-
- return sk;
- }
紧接着调用sk_prot_alloc函数分配:
- static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
- int family)
- {
- struct sock *sk;
- struct kmem_cache *slab;
-
- slab = prot->slab;
- if (slab != NULL) {
- sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
- ..............................
- } else
- sk = kmalloc(prot->obj_size, priority);
-
- .....................
-
- return sk;
- ......................
- }
上面的代码中首先判断该协议是否有专用的slab高速缓存(prot->slab),如果有就从该高速缓存中分配,否则直接调用kmalloc在内存中分配空间,不过两种情况下分配的大小都是prot->obj_size。
如果是TCP协议,tcp_prot中指明该属性的大小为.obj_size = sizeof(struct tcp_sock)。
所以,程序中给struct sock指针分配的不是该结构体的实际大小,而是大于其实际大小,以便其扩展套接字的属性占用。
以图例说明tcp_sock是如何从sock强制转换来的:
下篇将分析套接字的绑定、连接等一系列操作的实现。
下篇将分析套接字的操作函数。