Socket source code analysis: creation

Data structures

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @fasync_list: Asynchronous wake up list
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wait: wait queue for several uses
 *  @type: socket type (%SOCK_STREAM, etc)
 */
struct socket {
	socket_state		state;	// socket state
	unsigned long		flags;	// socket flag bits
	const struct proto_ops	*ops;	// protocol-specific socket operations table
	struct fasync_struct	*fasync_list;	// asynchronous wake-up list
	struct file		*file;	// file pointer associated with this socket
	struct sock		*sk;	// pointer to the protocol-specific sock structure
	wait_queue_head_t	wait;	// wait queue
	short			type;	// socket type, e.g. SOCK_STREAM
};

As the socket structure shows, socket is the generic, common part of a BSD-style socket, while the sock structure it points to is the part tied to the specific protocol in use. You can think of sock as the protocol-dependent half abstracted out of socket; a sock is attached to the socket according to the protocol being used. Let's look at the sock structure next.

struct sock {
	/*
	 * Now struct inet_timewait_sock also uses sock_common, so please just
	 * don't add nothing before this first member (__sk_common) --acme
	 */
	struct sock_common	__sk_common;	// common part, shared with inet_timewait_sock
#define sk_family		__sk_common.skc_family	// address family
#define sk_state		__sk_common.skc_state	// connection state
#define sk_reuse		__sk_common.skc_reuse	// address reuse flag
#define sk_bound_dev_if		__sk_common.skc_bound_dev_if	// bound device ID
#define sk_node			__sk_common.skc_node	// link into the main hash table
#define sk_bind_node		__sk_common.skc_bind_node	// link into the bind hash table
#define sk_refcnt		__sk_common.skc_refcnt	// reference count
#define sk_hash			__sk_common.skc_hash	// hash value
#define sk_prot			__sk_common.skc_prot	// protocol function table
#define sk_net			__sk_common.skc_net	// owning network namespace
	unsigned char		sk_shutdown : 2,	// shutdown state, mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
				sk_no_check : 2,	// whether to checksum packets
				sk_userlocks : 4;	// user locks, %SO_SNDBUF and %SO_RCVBUF settings
	unsigned char		sk_protocol;	// which protocol of the family this socket uses
	unsigned short		sk_type;	// socket type, e.g. SOCK_STREAM
	int			sk_rcvbuf;	// receive buffer length (bytes)
	socket_lock_t		sk_lock;	// used for synchronization
	/*
	 * The backlog queue is special, it is always used with
	 * the per-socket spinlock held and requires low latency
	 * access. Therefore we special case it's implementation.
	 */
	struct {
		struct sk_buff *head;	// earliest received packet
		struct sk_buff *tail;	// most recently received packet
	} sk_backlog;	// backlog queue
	wait_queue_head_t	*sk_sleep;	// the sock's wait queue
	struct dst_entry	*sk_dst_cache;	// cached route entry
	struct xfrm_policy	*sk_policy[2];	// flow policies
	rwlock_t		sk_dst_lock;	// lock for the cached route entry
	atomic_t		sk_rmem_alloc;	// bytes in the receive queue
	atomic_t		sk_wmem_alloc;	// bytes in the send queue
	atomic_t		sk_omem_alloc;	// bytes of optional/other allocations
	int			sk_sndbuf;	// total send buffer length
	struct sk_buff_head	sk_receive_queue;	// receive queue (packets received)
	struct sk_buff_head	sk_write_queue;		// write queue (packets being sent)
	struct sk_buff_head	sk_async_wait_queue;	// packets copied by DMA TODO
	int			sk_wmem_queued;	// memory taken by all queued packets
	int			sk_forward_alloc;	// memory available for allocation
	gfp_t			sk_allocation;	// allocation mode
	int			sk_route_caps;	// route capability flags
	int			sk_gso_type;	// generic segmentation offload (GSO) type TODO
	unsigned int		sk_gso_max_size;	// maximum size used when building GSO segments
	int			sk_rcvlowat;	// SO_RCVLOWAT setting
	unsigned long 		sk_flags;	// SO_BROADCAST, SO_KEEPALIVE, SO_OOBINLINE, SO_LINGER settings
	unsigned long	        sk_lingertime;	// linger time, governs when the socket closes
	struct sk_buff_head	sk_error_queue;	// queue of error packets
	struct proto		*sk_prot_creator;	// the proto that created this sock
	rwlock_t		sk_callback_lock;	// lock used by bottom-half handlers
	int			sk_err,			// error code
				sk_err_soft;	// persistent ("soft") errors
	atomic_t		sk_drops;	// drop counter for raw sockets
	unsigned short		sk_ack_backlog;		// current number of established connections
	unsigned short		sk_max_ack_backlog;	// maximum backlog, set in listen()
	__u32			sk_priority;	// priority
	struct ucred		sk_peercred;	// SO_PEERCRED setting
	long			sk_rcvtimeo;	// SO_RCVTIMEO receive timeout
	long			sk_sndtimeo;	// SO_SNDTIMEO send timeout
	struct sk_filter      	*sk_filter;	// the sock's packet filter
	void			*sk_protinfo;	// private area, defined by the protocol family when slab caches are not used
	struct timer_list	sk_timer;	// the sock's flush timer
	ktime_t			sk_stamp;		// time the last packet was received
	struct socket		*sk_socket;	// pointer to the owning socket
	void			*sk_user_data;	// data supplied by RPC
	struct page		*sk_sndmsg_page;	// cached page holding the data block being sent
	struct sk_buff		*sk_send_head;	// head of the packets to transmit
	__u32			sk_sndmsg_off;	// end offset of the data block within the cached page
	int			sk_write_pending;	// count of pending writes
	void			*sk_security;	// used by security modules
	__u32			sk_mark;	// generic packet mark
	/* XXX 4 bytes hole on 64 bit */
	void			(*sk_state_change)(struct sock *sk);			// called after the sock's state changes
	void			(*sk_data_ready)(struct sock *sk, int bytes);	// called once arriving data has been processed
	void			(*sk_write_space)(struct sock *sk);				// called when send space becomes available
	void			(*sk_error_report)(struct sock *sk);			// called to handle errors
  	int			(*sk_backlog_rcv)(struct sock *sk,					// processes backlog packets
						  struct sk_buff *skb);  
	void                    (*sk_destruct)(struct sock *sk);		// the sock's destructor
};

The parts common to all sockets and closest to the application live in the socket structure, while protocol-specific state lives in the sock structure; the two are then hooked together, a flexible and elegant design.

Notice that a sock's packets are represented by sk_buff; every protocol uses the sk_buff structure to encapsulate and carry packets. Let's look at its layout.

struct sk_buff {
	/* These two members must be first. */
	struct sk_buff		*next;	// next packet in the queue
	struct sk_buff		*prev;	// previous packet in the queue

	struct sock		*sk;	// the sock this packet belongs to
	ktime_t			tstamp;	// time the packet arrived
	struct net_device	*dev;	// network device that received the packet

	union {
		struct  dst_entry	*dst;	// route entry
		struct  rtable		*rtable;	// routing table entry
	};
	struct	sec_path	*sp;	// security path, used by xfrm

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48];	// control buffer

	unsigned int		len,	// total length of all data blocks
				data_len;		// total length of the fragmented/scattered data
	__u16			mac_len,	// length of the link-layer header
				hdr_len;		// writable header length of a cloned skb
	union {
		__wsum		csum;		// checksum
		struct {
			__u16	csum_start;	// start of the checksummed area, relative to skb->head
			__u16	csum_offset;// where the checksum is stored, relative to csum_start
		};
	};
	__u32			priority;	// packet priority within the queue
	__u8			local_df:1,	// whether local fragmentation is allowed
				cloned:1,		// whether the skb may be/has been cloned
				ip_summed:2,	// IP checksum flags
				nohdr:1,		// used in transit: the header must not be modified
				nfctinfo:3;		// conntrack relation of the packet
	__u8			pkt_type:3,	// packet type
				fclone:2,		// clone status of the skb
				ipvs_property:1,// the packet is owned by ipvs
				peeked:1,		// whether the packet has already been peeked at
				nf_trace:1;		// netfilter packet tracing flag
	__be16			protocol;	// packet protocol as seen by the low-level driver

	void			(*destructor)(struct sk_buff *skb);					// destructor for the packet
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	struct nf_conntrack	*nfct;
	struct sk_buff		*nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	struct nf_bridge_info	*nf_bridge;	// bridge-related data
#endif

	int			iif;
#ifdef CONFIG_NETDEVICES_MULTIQUEUE
	__u16			queue_mapping;
#endif
#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2;
#endif
	/* 14 bit hole */

#ifdef CONFIG_NET_DMA
	dma_cookie_t		dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32			secmark;
#endif

	__u32			mark;

	sk_buff_data_t		transport_header;	// transport-layer header within the data block
	sk_buff_data_t		network_header;		// network-layer header within the data block
	sk_buff_data_t		mac_header;			// link-layer header within the data block
	/* These elements must be at the end, see alloc_skb() for details.  */
	sk_buff_data_t		tail;	// end of the data block
	sk_buff_data_t		end;	// end of the buffer
	unsigned char		*head,	// start of the buffer
				*data;			// start of the data block
	unsigned int		truesize;	// true size of the packet (structure size plus data size)
	atomic_t		users;	// reference count
};

So we have the shared part (struct socket), the generic part (struct sock), and the protocol-specific part (struct inet_sock).

tcp_sock is tightly coupled to the TCP protocol; let's look at its contents.

struct tcp_sock {
	/* inet_connection_sock has to be the first member of tcp_sock */
	struct inet_connection_sock	inet_conn;	// per the comment above, this must be the first member of tcp_sock TODO why?
	u16	tcp_header_len;	/* Bytes of tcp header to send	*/
	u16	xmit_size_goal;	/* Goal for segmenting output packets	*/

/*
 *	Header prediction flags
 *	0x5?10 << 16 + snd_wnd in net byte order
 */
	__be32	pred_flags;

/*
 *	RFC793 variables by their proper names. This means you can
 *	read the code and the spec side by side (and laugh ...)
 *	See RFC793 and RFC1122. The RFC writes these in capitals.
 */
 	u32	rcv_nxt;	/* What we want to receive next		*/
	u32	copied_seq;	/* Head of yet unread data		*/
	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
 	u32	snd_nxt;	/* Next sequence we send		*/

 	u32	snd_una;	/* First byte we want an ack for	*/
 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */

	/* Data for direct copy to user */
	struct {
		struct sk_buff_head	prequeue;			// the prequeue
		struct task_struct	*task;				// process consuming the prequeue
		struct iovec		*iov;				// the application's receive buffer
		int			memory;						// prequeue packet/memory counter
		int			len;						// prequeue length
#ifdef CONFIG_NET_DMA
		/* members for async copy */
		struct dma_chan		*dma_chan;
		int			wakeup;
		struct dma_pinned_list	*pinned_list;
		dma_cookie_t		dma_cookie;
#endif
	} ucopy;

	u32	snd_wl1;	/* Sequence for window update		*/
	u32	snd_wnd;	/* The window we expect to receive	*/
	u32	max_window;	/* Maximal window ever seen from peer	*/
	u32	mss_cache;	/* Cached effective mss, not including SACKS. TODO: mss, SACK */

	u32	window_clamp;	/* Maximal window to advertise		*/
	u32	rcv_ssthresh;	/* Current window clamp			*/

	u32	frto_highmark;	/* snd_nxt when RTO occurred */
	u8	reordering;	/* Packet reordering metric.		*/
	u8	frto_counter;	/* Number of new acks after RTO */
	u8	nonagle;	/* Disable Nagle algorithm? TODO: Nagle */
	u8	keepalive_probes; /* num of allowed keep alive probes	*/

/* RTT measurement */
	u32	srtt;		/* smoothed round trip time << 3 	*/
	u32	mdev;		/* medium deviation			*/
	u32	mdev_max;	/* maximal mdev for the last rtt period	*/
	u32	rttvar;		/* smoothed mdev_max			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/

	u32	packets_out;	/* Packets which are "in flight"	*/
	u32	retrans_out;	/* Retransmitted packets out		*/
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
	struct tcp_options_received rx_opt;

/*
 *	Slow start and congestion control (see also Nagle, and Karn & Partridge) TODO: slow start and congestion control
 */
 	u32	snd_ssthresh;	/* Slow start size threshold		*/
 	u32	snd_cwnd;	/* Sending congestion window		*/
	u32	snd_cwnd_cnt;	/* Linear increase counter		*/
	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
	u32	snd_cwnd_used;
	u32	snd_cwnd_stamp;

	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here */

 	u32	rcv_wnd;	/* Current receiver window	*/
	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */

/*	SACKs data	*/
	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

	struct tcp_sack_block recv_sack_cache[4];

	struct sk_buff *highest_sack;   /* highest skb with SACK received
					 * (validity guaranteed only if
					 * sacked_out > 0)
					 */

	/* from STCP, retrans queue hinting */
	struct sk_buff* lost_skb_hint;

	struct sk_buff *scoreboard_skb_hint;
	struct sk_buff *retransmit_skb_hint;
	struct sk_buff *forward_skb_hint;

	int     lost_cnt_hint;
	int     retransmit_cnt_hint;

	u32	lost_retrans_low;	/* Sent seq after any rxmit (lowest) */

	u16	advmss;		/* Advertised MSS			*/
	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
	u32	lost_out;	/* Lost packets			*/
	u32	sacked_out;	/* SACK'd packets			*/
	u32	fackets_out;	/* FACK'd packets			*/
	u32	high_seq;	/* snd_nxt at onset of congestion	*/

	u32	retrans_stamp;	/* Timestamp of the last retransmit,
				 * also used in SYN-SENT to remember stamp of
				 * the first SYN. */
	u32	undo_marker;	/* tracking retrans started here. */
	int	undo_retrans;	/* number of undoable retransmissions. */
	u32	urg_seq;	/* Seq of received urgent pointer */
	u16	urg_data;	/* Saved octet of OOB data and control flags */
	u8	urg_mode;	/* In urgent mode		*/
	u8	ecn_flags;	/* ECN status bits.			*/
	u32	snd_up;		/* Urgent pointer		*/

	u32	total_retrans;	/* Total retransmits for entire connection */
	u32	bytes_acked;	/* Appropriate Byte Counting - RFC3465 */

	unsigned int		keepalive_time;	  /* time before keep alive takes place */
	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */
	int			linger2;

	unsigned long last_synq_overflow; 

	u32	tso_deferred;

/* Receiver side RTT estimation */
	struct {
		u32	rtt;
		u32	seq;
		u32	time;
	} rcv_rtt_est;

/* Receiver queue space */
	struct {
		int	space;
		u32	seq;
		u32	time;
	} rcvq_space;

/* TCP-specific MTU probe information. */
	struct {
		u32		  probe_seq_start;
		u32		  probe_seq_end;
	} mtu_probe;

#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
	struct tcp_sock_af_ops	*af_specific;

/* TCP MD5 Signagure Option information */
	struct tcp_md5sig_info	*md5sig_info;
#endif
};

Demo

Having covered these data structures, let's get into the socket-related source code proper.

First, let's look at the typical flow of a server program.


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main()
{
    struct sockaddr_in server_address;
    struct sockaddr_in client_address;
    int server_fd, client_fd;
    socklen_t server_len, client_len;
    char back[20] = "ok";

    server_fd = socket(AF_INET, SOCK_STREAM, 0);

    server_address.sin_family = AF_INET;
    server_address.sin_addr.s_addr = inet_addr("192.168.1.1");
    server_address.sin_port = htons(54188);
    server_len = sizeof(server_address);

    bind(server_fd, (struct sockaddr *)&server_address, server_len);

    /* Create a listen queue for the socket (up to 10 pending connections)
     * and listen for client connection requests. */
    listen(server_fd, 10);

    while (1) {
        char buf[20];
        printf("server is waiting\n");
        /* When execution reaches here, a client's connection request has
         * arrived; accept it, clone a new socket connected to the client,
         * record the client's "phone number" in client_address, and get
         * back the fd of the established connection. */
        client_len = sizeof(client_address);
        client_fd = accept(server_fd, (struct sockaddr *)&client_address, &client_len);

        /* Use read and write to receive bytes from the client and reply. */
        read(client_fd, buf, 20);
        write(client_fd, back, 20);
        printf("received from client= %s\n", buf);

        close(client_fd);
    }
    close(server_fd);
    exit(0);
}

In short: socket() creates the server socket, bind() associates the address structure with it, listen() starts listening for client connection requests, and accept() yields an fd for each connection; because the socket is exposed through the VFS, it can then be accessed like a file with read/write.
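
For completeness, here is the matching client flow as a hedged sketch (the address and port mirror the server above; error handling omitted):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = inet_addr("192.168.1.1");
    addr.sin_port = htons(54188);

    connect(fd, (struct sockaddr *)&addr, sizeof(addr));

    write(fd, "hello", 6);              /* the server loop echoes this back */
    char buf[20];
    read(fd, buf, sizeof(buf));
    printf("received from server= %s\n", buf);

    close(fd);
    return 0;
}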

Creating the socket

The server calls the socket() function; the library function it invokes can be found in the glibc source.

#include <errno.h>
#include <sys/socket.h>

/* Create a new socket of type TYPE in domain DOMAIN, using
   protocol PROTOCOL.  If PROTOCOL is zero, one is chosen automatically.
   Returns a file descriptor for the new socket, or -1 for errors.  */
int
__socket (domain, type, protocol)
     int domain;
     int type;
     int protocol;
{
  __set_errno (ENOSYS);
  return -1;
}


weak_alias (__socket, socket)
stub_warning (socket)
#include <stub-tag.h>

Here weak_alias() declares socket() as a weak alias of __socket(). Following the corresponding socket.S assembly, it goes through system_call(), which indexes the system call table sys_call_table and ultimately lands in sys_socketcall(); this is also the system call entry point for bind(), listen(), accept() and friends. (The compat variant compat_sys_socketcall() is listed below; its dispatch logic mirrors sys_socketcall().)

/* Define unique numbers for the operations permitted on socket.  Linux
   uses a single system call for all these functions.  The relevant code
   file is /usr/include/linux/net.h.
   We cannot use a enum here because the values are used in assembler
   code.  */

#define SOCKOP_socket		1
#define SOCKOP_bind		2
#define SOCKOP_connect		3
#define SOCKOP_listen		4
#define SOCKOP_accept		5
#define SOCKOP_getsockname	6
#define SOCKOP_getpeername	7
#define SOCKOP_socketpair	8
#define SOCKOP_send		9
#define SOCKOP_recv		10
#define SOCKOP_sendto		11
#define SOCKOP_recvfrom		12
#define SOCKOP_shutdown		13
#define SOCKOP_setsockopt	14
#define SOCKOP_getsockopt	15
#define SOCKOP_sendmsg		16
#define SOCKOP_recvmsg		17

asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
{
	int ret;
	u32 a[6];
	u32 a0, a1;

	if (call < SYS_SOCKET || call > SYS_RECVMSG)
		return -EINVAL;
	if (copy_from_user(a, args, nas[call]))
		return -EFAULT;
	a0 = a[0];
	a1 = a[1];

	switch (call) {
	case SYS_SOCKET:
		ret = sys_socket(a0, a1, a[2]);
		break;
	case SYS_BIND:
		ret = sys_bind(a0, compat_ptr(a1), a[2]);
		break;
	case SYS_CONNECT:
		ret = sys_connect(a0, compat_ptr(a1), a[2]);
		break;
	case SYS_LISTEN:
		ret = sys_listen(a0, a1);
		break;
	case SYS_ACCEPT:
		ret = sys_accept(a0, compat_ptr(a1), compat_ptr(a[2]));
		break;
	case SYS_GETSOCKNAME:
		ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
		break;
	case SYS_GETPEERNAME:
		ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
		break;
	case SYS_SOCKETPAIR:
		ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
		break;
	case SYS_SEND:
		ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
		break;
	case SYS_SENDTO:
		ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
		break;
	case SYS_RECV:
		ret = sys_recv(a0, compat_ptr(a1), a[2], a[3]);
		break;
	case SYS_RECVFROM:
		ret = sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), compat_ptr(a[5]));
		break;
	case SYS_SHUTDOWN:
		ret = sys_shutdown(a0,a1);
		break;
	case SYS_SETSOCKOPT:
		ret = compat_sys_setsockopt(a0, a1, a[2],
				compat_ptr(a[3]), a[4]);
		break;
	case SYS_GETSOCKOPT:
		ret = compat_sys_getsockopt(a0, a1, a[2],
				compat_ptr(a[3]), compat_ptr(a[4]));
		break;
	case SYS_SENDMSG:
		ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
		break;
	case SYS_RECVMSG:
		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}
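
On 32-bit x86, user space can poke this multiplexer directly. A hedged sketch (it assumes an i386 build where SYS_socketcall is defined; on most 64-bit ABIs socket() is a separate system call and this path does not exist):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
#ifdef SYS_socketcall
    /* Argument block read back by copy_from_user() in the kernel;
     * 1 is SOCKOP_socket/SYS_SOCKET from the tables above. */
    unsigned long args[3] = { 2 /* AF_INET */, 1 /* SOCK_STREAM */, 0 };
    long fd = syscall(SYS_socketcall, 1, args);
    printf("socketcall returned fd %ld\n", fd);
#endif
    return 0;
}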

Dispatching on the call number (SOCKOP_socket / SYS_SOCKET) then leads to the corresponding system call handler, sys_socket():

asmlinkage long sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock);
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

First sock_create() creates the socket, then sock_map_fd() ties it into the VFS and returns the corresponding fd (retval), so the socket can be managed uniformly as a file.

Allocating and initializing the socket structure

Let's follow sock_create() first; this function allocates and initializes the socket structure.

int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

sock_create()'s first three parameters are exactly those passed to socket(), and the final struct socket ** parameter receives the result; it simply forwards to __sock_create():

static int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 *      Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			       current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	sock = sock_alloc();	// allocate the socket structure
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;		// record the socket type

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (net_families[family] == NULL)			// is the protocol family's operations table registered?
		request_module("net-pf-%d", family);	// if not, try to load the module that registers it
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);	// fetch the protocol family's operations table
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(net, sock, protocol);		// call the create function of the family's operations table
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;		// hand back the result

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

So this function first allocates space for the socket structure, then uses the family parameter, AF_INET (2), to look up the corresponding protocol family operations table, and finally calls that table's create function and returns the result. Let's start with sock_alloc(), which allocates both the socket structure and a file inode for the server program.

static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	inode = new_inode(sock_mnt->mnt_sb);	// create an inode on the socket file system; the socket is allocated along with it
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode);					// recover the socket pointer from the inode

	inode->i_mode = S_IFSOCK | S_IRWXUGO;	// mark the inode as a socket
	inode->i_uid = current->fsuid;			// owned by the current process's fsuid
	inode->i_gid = current->fsgid;			// and its fsgid

	get_cpu_var(sockets_in_use)++;			
	put_cpu_var(sockets_in_use);			// per-CPU sockets_in_use++
	return sock;
}

Here sock_mnt is the root mount of the sockfs pseudo file system, so this effectively allocates an inode inside the socket file system; the server program can then read/write through that inode. Let's look at new_inode() first.

struct inode *new_inode(struct super_block *sb)
{
	/*
	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
	 * error if st_ino won't fit in target struct field. Use 32bit counter
	 * here to attempt to avoid that.
	 */
	static unsigned int last_ino;
	struct inode * inode;

	spin_lock_prefetch(&inode_lock);
	
	inode = alloc_inode(sb);	// dispatch through the superblock's operations table
	if (inode) {				// post-process the freshly allocated inode
		spin_lock(&inode_lock);
		inodes_stat.nr_inodes++;
		list_add(&inode->i_list, &inode_in_use);
		list_add(&inode->i_sb_list, &sb->s_inodes);
		inode->i_ino = ++last_ino;
		inode->i_state = 0;
		spin_unlock(&inode_lock);
	}
	return inode;
}

Now the SOCKET_I(inode) helper:

static inline struct socket *SOCKET_I(struct inode *inode)
{
	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

#define container_of(ptr, type, member) ({			\
	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
	(type *)( (char *)__mptr - offsetof(type,member) );})

This looks a bit tangled; substituting the arguments, it is equivalent to:

#define container_of(inode, struct socket_alloc, vfs_inode) ({			\
	const typeof( ((struct socket_alloc *)0)->vfs_inode ) *__mptr = (inode);	\
	(struct socket_alloc *)( (char *)__mptr - offsetof(struct socket_alloc,vfs_inode) );})

struct socket_alloc {
	struct socket socket;
	struct inode vfs_inode;
};

#define OFFSETOF(strct, elem)	((long)&(((struct strct *)NULL)->elem))

Now it's simple: the offsetof macro yields a member's byte offset within a struct, here vfs_inode within struct socket_alloc. Subtracting that offset from the inode pointer (which is the address of vfs_inode) gives the start address of the enclosing socket_alloc, which is also the address of its socket member, since that comes first. The macro is pure pointer arithmetic to reach the socket; the socket_alloc itself is allocated under new_inode().
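
A minimal user-space sketch of the same trick (struct pair and its members are invented for illustration):

#include <stdio.h>
#include <stddef.h>

struct pair {
    int   first;
    float second;
};

/* Same definition as the kernel's container_of, minus the type check. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct pair p = { 1, 2.0f };
    float *inner = &p.second;                 /* only the member pointer */
    struct pair *outer = container_of(inner, struct pair, second);
    printf("%d\n", outer->first);             /* prints 1 */
    return 0;
}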

Back in new_inode(): it calls alloc_inode(), which dispatches through the superblock's operations table.

static struct inode *alloc_inode(struct super_block *sb)
{
	static const struct address_space_operations empty_aops;
	static struct inode_operations empty_iops;
	static const struct file_operations empty_fops;
	struct inode *inode;

	if (sb->s_op->alloc_inode)
		inode = sb->s_op->alloc_inode(sb);
	else
		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (inode) {
		...
	}
	return inode;
}

By this point sb->s_op was already set to sockfs_ops during sock_init(), via get_sb_pseudo(); so the call here is sockfs_ops->alloc_inode.

static struct super_operations sockfs_ops = {
	.alloc_inode =	sock_alloc_inode,
	.destroy_inode =sock_destroy_inode,
	.statfs =	simple_statfs,
};

Looking up sockfs_ops, the function called here is sock_alloc_inode(), which performs the allocation of the socket_alloc structure.

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;

	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);	// allocate a socket_alloc from the slab cache
	if (!ei)
		return NULL;
	init_waitqueue_head(&ei->socket.wait);	// initialize the wait queue head
	// initialize the socket fields
	ei->socket.fasync_list = NULL;
	ei->socket.state = SS_UNCONNECTED;    // state starts out unconnected
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}

This performs the memory allocation and the socket initialization. Note that kmem_cache_alloc() allocates straight from the slab cache sock_inode_cachep, which was created during sock_init() via init_inodecache().

[ TODO: the two slab functions, kmem_cache_alloc and kmem_cache_create ]
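
For reference, a hedged sketch of how those two slab functions pair up, written against the 2.6-era API (the cache name, struct item, and my_init are invented):

/* Create a cache of fixed-size objects once, at init time. */
struct item { int value; };
static struct kmem_cache *item_cache;

static int __init my_init(void)
{
	item_cache = kmem_cache_create("item_cache", sizeof(struct item),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	return item_cache ? 0 : -ENOMEM;
}

/* Per-allocation: pop a pre-sized object from the cache... */
static struct item *item_get(void)
{
	return kmem_cache_alloc(item_cache, GFP_KERNEL);
}

/* ...and push it back on free. */
static void item_put(struct item *it)
{
	kmem_cache_free(item_cache, it);
}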

Initializing the socket with the protocol family's function table

Back in __sock_create(): it first checks net_families[2], i.e. whether the AF_INET protocol family operations table has been installed (non-NULL). That registration happens during kernel initialization; the relevant chain is listed here:

inet_init -> fs_initcall(inet_init);

#define fs_initcall(fn)			__define_initcall("5",fn,5)

static int __init inet_init(void)
{
    ...

	/*
	 *	Tell SOCKET that we are alive...
	 */

	(void)sock_register(&inet_family_ops);

    ...
}

int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
		       NPROTO);
		return -ENOBUFS;
	}

	spin_lock(&net_family_lock);
	if (net_families[ops->family])
		err = -EEXIST;
	else {
		net_families[ops->family] = ops;
		err = 0;
	}
	spin_unlock(&net_family_lock);

	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}

static struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner	= THIS_MODULE,
};

Ultimately sock_register() installs inet_family_ops into net_families[PF_INET]; and PF_INET is just AF_INET:

#define PF_INET		AF_INET

Returning to __sock_create(): it now executes the create function of the inet_family_ops table, i.e. inet_create().

static int inet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;
	// check the socket type and whether the hash secret is set
	if (sock->type != SOCK_RAW &&	// raw type
	    sock->type != SOCK_DGRAM &&	// datagram type (UDP)
	    !inet_ehash_secret)
		build_ehash_secret();

The type passed in from socket() is SOCK_STREAM, the stream type; for it the code also checks whether the hash secret has been set, calling build_ehash_secret() otherwise.

void build_ehash_secret(void)
{
	u32 rnd;
	do {
		get_random_bytes(&rnd, sizeof(rnd)); // obtain a non-zero random number
	} while (rnd == 0);
	spin_lock_bh(&inetsw_lock);
	if (!inet_ehash_secret)
		inet_ehash_secret = rnd;	// use the random number as the hash secret
	spin_unlock_bh(&inetsw_lock);
}

Back in inet_create(), note the variable struct inet_protosw *answer. The inet_protosw structure registers a socket interface for an IP protocol: the information a protocol exposes toward the socket layer is kept in this structure, one instance per IP protocol.

/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
	struct list_head list;

        /* These two fields form the lookup key. */
	unsigned short	 type;	   /* This is the 2nd argument to socket(2), the socket type. */
	unsigned short	 protocol; /* This is the L4 protocol number. */

	struct proto	 *prot;	/* pointer to the corresponding protocol structure */
	const struct proto_ops *ops; /* pointer to the protocol's socket operations table */
  
	int              capability; /* Which (if any) capability do
				      * we need to use this socket
				      * interface?
                                      */
	char             no_check;   /* checksum on rcv/xmit/none? */
	unsigned char	 flags;      /* See INET_PROTOSW_* below. */
};

Continuing with inet_create():

	sock->state = SS_UNCONNECTED; // mark the socket as not yet connected

	/* Look for the requested type/protocol pair. */
	answer = NULL;
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();	// RCU lock; suited to read-mostly data
	list_for_each_rcu(p, &inetsw[sock->type]) {
		answer = list_entry(p, struct inet_protosw, list);

		/* Check the non-wild match: does the protocol number match one already registered with the kernel? */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases: the wildcard (dummy) IP protocol. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
		answer = NULL;
	}

	if (unlikely(answer == NULL)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM): a fully specified name
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP): otherwise the generic name
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	err = -EAFNOSUPPORT;
	if (!inet_netns_ok(net, protocol))
		goto out_rcu_unlock;
Everything between rcu_read_lock and rcu_read_unlock is an RCU read-side critical section.

#define list_for_each_rcu(pos, head) \
	for (pos = rcu_dereference((head)->next); \
		prefetch(pos->next), pos != (head); \
		pos = rcu_dereference(pos->next))

#define rcu_dereference(p)     ({ \
				typeof(p) _________p1 = ACCESS_ONCE(p); \
				smp_read_barrier_depends(); \
				(_________p1); \
				})

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

So the list_for_each_rcu macro, under RCU protection, walks the inetsw[] list for the requested socket type until it finds a matching entry; each node is an inet_protosw. The inetsw array of lists is likewise populated in inet_init():

static int __init inet_init(void)
{
	struct sk_buff *dummy_skb;
	struct inet_protosw *q;
	struct list_head *r;
	int rc = -EINVAL;

    ...

    (void)sock_register(&inet_family_ops);
    
    ...

    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

    ...
}

static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,				// TCP, the stream protocol
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.capability = -1,
		.no_check =   0,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	{
		.type =       SOCK_DGRAM,				// UDP, the datagram protocol
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.capability = -1,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_PERMANENT,
       },


       {
	       .type =       SOCK_RAW,				// raw socket
	       .protocol =   IPPROTO_IP,	/* wild card: the dummy IP protocol */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .capability = CAP_NET_RAW,
	       .no_check =   UDP_CSUM_DEFAULT,
	       .flags =      INET_PROTOSW_REUSE,
       }
};

As inet_init() shows, the array is registered with inet_register_protosw().

static struct list_head inetsw[SOCK_MAX];

void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. */
	answer = NULL;
	last_perm = &inetsw[p->type];
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);

		/* Check only the non-wild match. */
		if (INET_PROTOSW_PERMANENT & answer->flags) {
			if (protocol == answer->protocol)
				break;
			last_perm = lh;
		}

		answer = NULL;
	}
	if (answer)
		goto out_permanent;

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry.  This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, last_perm);
out:
	spin_unlock_bh(&inetsw_lock);

	synchronize_net();

	return;

out_permanent:
	printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
	       protocol);
	goto out;

out_illegal:
	printk(KERN_ERR
	       "Ignoring attempt to register invalid socket type %d.\n",
	       p->type);
	goto out;
}

The function walks inetsw[p->type] with the list_for_each macro; if an existing entry carries the INET_PROTOSW_PERMANENT flag and registers the same protocol as p, the registration is rejected. Otherwise list_add_rcu() links p->list into the queue, after the last permanent entry.

So inet_init() links each element of inetsw_array into the corresponding list of the inetsw array.

That concludes registration; back to inet_create().

Recall server_fd = socket(AF_INET, SOCK_STREAM, 0): protocol is 0 and type is SOCK_STREAM, the TCP type, so answer ends up pointing at TCP's inet_protosw entry. The requested protocol, IPPROTO_IP (0), does not equal that entry's protocol (IPPROTO_TCP), so the wildcard branch is taken; IPPROTO_IP is just the dummy protocol:

    IPPROTO_IP = 0,	   /* Dummy protocol for TCP.  */
#define IPPROTO_IP		IPPROTO_IP

So protocol = answer->protocol: protocol is reassigned TCP's number, 6. The entry's capability is -1, so the capability check passes, and inet_netns_ok() then verifies that the protocol may be used in this network namespace.
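
This resolution can be observed from user space. A sketch (SO_PROTOCOL only exists on kernels newer than the one analyzed here, so treat its availability as an assumption):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);   /* protocol 0 = IPPROTO_IP */
    int proto = 0;
    socklen_t len = sizeof(proto);

#ifdef SO_PROTOCOL
    getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &proto, &len);
    printf("resolved protocol: %d\n", proto);   /* prints 6 (IPPROTO_TCP) */
#endif
    return 0;
}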

Let's continue with inet_create():

	sock->ops = answer->ops;	// inet_stream_ops
	answer_prot = answer->prot;	// tcp_prot
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	BUG_TRAP(answer_prot->slab != NULL);

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);	// allocate the sock structure
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	inet = inet_sk(sk);		
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	if (SOCK_RAW == sock->type) {
		inet->num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->id = 0;

	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_family	   = PF_INET;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;	// install the backlog packet handler

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);	// transport-layer init hook; for tcp_prot this is tcp_v4_init_sock
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

So the TCP socket operations table inet_stream_ops is hooked into the socket's protocol operations, and answer->prot is saved into answer_prot and passed as a parameter to sk_alloc(). [ TODO: socket -- transport-layer proto -- network-layer inet_proto ]

Allocating and initializing the sock structure

Let's look at sk_alloc(), whose prot parameter is answer->prot, i.e. tcp_prot:

struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
	}

	return sk;
}

sk_prot_alloc() allocates a generic sock structure:

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);	// slab allocation from the protocol's own sock cache
	else
		sk = kmalloc(prot->obj_size, priority);	// otherwise allocate from the general-purpose caches

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

Whether the sock is allocated from a dedicated slab cache or from the general-purpose caches depends on whether the proto structure provides one.

After a successful allocation, family is assigned, tcp_prot is stored into both sk_prot and sk_prot_creator, and sock_lock_init() initializes sk_lock, the lock used to synchronize the sock structure. sk_lock is a socket_lock_t, a lock type dedicated to sockets:

typedef struct {
	spinlock_t		slock;
	int			owned;
	wait_queue_head_t	wq;
	/*
	 * We express the mutex-alike socket_lock semantics
	 * to the lock validator by explicitly managing
	 * the slock as a lock variant (in addition to
	 * the slock itself):
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} socket_lock_t;

It contains a spinlock, slock, and a wait queue head, wq. sock_lock_init_class_and_name initializes its contents:

static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

#define sock_lock_init_class_and_name(sk, sname, skey, name, key) 	\
do {									\
	sk->sk_lock.owned = 0;					\
	init_waitqueue_head(&sk->sk_lock.wq);				\
	spin_lock_init(&(sk)->sk_lock.slock);				\
	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
			sizeof((sk)->sk_lock));				\
	lockdep_set_class_and_name(&(sk)->sk_lock.slock,		\
		       	(skey), (sname));				\
	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
} while (0)

Back in sk_alloc: the net parameter passed down is current->nsproxy->net_ns, the network namespace recorded in the current process. sock_net_set(sk, get_net(net)) records the owning net namespace, and get_net(net) increments the net structure's reference count.

static inline
void sock_net_set(struct sock *sk, struct net *net)
{
#ifdef CONFIG_NET_NS
	sk->sk_net = net;
#endif
}

static inline struct net *get_net(struct net *net)
{
	atomic_inc(&net->count);
	return net;
}

Back in inet_create(): sk_alloc() has allocated and initialized the sock structure; if the allocation failed (sk == NULL), we bail out directly.

Next comes inet = inet_sk(sk), which obtains a struct inet_sock pointer from the sock pointer:

static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	return (struct inet_sock *)sk;
}

struct inet_sock {
	/* sk and pinet6 has to be the first two members of inet_sock */
	struct sock		sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	struct ipv6_pinfo	*pinet6;
#endif
	/* Socket demultiplex comparisons on incoming packets. */
	__be32			daddr;        // destination address
	__be32			rcv_saddr;    
	__be16			dport;        // destination port
	__u16			num;          // local port
	__be32			saddr;
	__s16			uc_ttl;
	__u16			cmsg_flags;
	struct ip_options	*opt;
	__be16			sport;
	__u16			id;
	__u8			tos;
	__u8			mc_ttl;
	__u8			pmtudisc;
	__u8			recverr:1,
				is_icsk:1,
				freebind:1,
				hdrincl:1,
				mc_loop:1;
	int			mc_index;
	__be32			mc_addr;
	struct ip_mc_socklist	*mc_list;
	struct {
		unsigned int		flags;
		unsigned int		fragsize;
		struct ip_options	*opt;
		struct dst_entry	*dst;
		int			length; /* Total length of all frames */
		__be32			addr;
		struct flowi		fl;
	} cork;
};

As you can see, this is the protocol-specific ("private") data structure of the socket.

Further on, sock_init_data(sock, sk) performs further initialization of the freshly allocated sock and hooks the socket and the sock together.

void sock_init_data(struct socket *sock, struct sock *sk)
{	/* These queues are not kept on generic list_heads but on sk_buff queues: */
	skb_queue_head_init(&sk->sk_receive_queue);	// initialize the receive queue
	skb_queue_head_init(&sk->sk_write_queue);	// initialize the write (send) queue
	skb_queue_head_init(&sk->sk_error_queue);	// initialize the error packet queue
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue); // queue of packets for DMA copy
#endif
#endif

	sk->sk_send_head	=	NULL;	// head of the packets to transmit

	init_timer(&sk->sk_timer);		// initialize the sock's flush timer

	sk->sk_allocation	=	GFP_KERNEL;				// allocation mode; may sleep when no memory is available
	sk->sk_rcvbuf		=	sysctl_rmem_default;	// default receive buffer length, 32767
	sk->sk_sndbuf		=	sysctl_wmem_default;	// default send buffer length, 32767
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;					// point at the owning socket

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;							// and the socket points back at the sock
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid 	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}

Note the initialization of the three important packet queues here; their heads are sk_buff_head structures:

struct sk_buff_head {
	/* These two members must be first. */
	struct sk_buff	*next;
	struct sk_buff	*prev;

	__u32		qlen;
	spinlock_t	lock;
};

This is a doubly linked queue; qlen is the queue length and lock is the lock used for concurrency control.
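
For intuition, here is a sketch of what initializing such a queue amounts to, mirroring the kernel's skb_queue_head_init from memory (so treat the details as an assumption):

/* An empty queue is a circular doubly linked list whose head points
 * at itself, with no packets accounted for yet. */
static inline void my_skb_queue_head_init(struct sk_buff_head *list)
{
	spin_lock_init(&list->lock);
	list->prev = list->next = (struct sk_buff *)list;
	list->qlen = 0;
}

For reference, the listing that follows is tcp_prot, the proto table that sk_prot was pointed at in sk_alloc(); its init hook is what gets invoked next.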

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};

Then sk->sk_prot->init(sk) is called, i.e. tcp_prot's init, which is the tcp_v4_init_sock() function:

static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

inet_connection_sock should look familiar: it is the first member of tcp_sock. The function above is mostly initialization and assignment, ending with an increment of tcp_sockets_allocated.

[Figure: the socket, sock and related data structures after creation]

struct inet_connection_sock {
	/* inet_sock has to be the first member! */
	struct inet_sock	  icsk_inet;				// the INET family's sock structure
	struct request_sock_queue icsk_accept_queue;	// queue of connections being accepted
	struct inet_bind_bucket	  *icsk_bind_hash;		// the bind hash bucket
	unsigned long		  icsk_timeout;				// timeout
 	struct timer_list	  icsk_retransmit_timer;	// retransmit timer, fires when no ACK arrives
 	struct timer_list	  icsk_delack_timer;		// delayed-ACK timer
	__u32			  icsk_rto;						// retransmission timeout
	__u32			  icsk_pmtu_cookie;				// most recent pmtu
	const struct tcp_congestion_ops *icsk_ca_ops;	// congestion-control operations
	const struct inet_connection_sock_af_ops *icsk_af_ops;	// AF_INET-specific operations table
	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);	// hook for syncing the mss
	__u8			  icsk_ca_state;		// congestion-control state
	__u8			  icsk_retransmits;		// number of retransmissions
	__u8			  icsk_pending;			// pending timer events
	__u8			  icsk_backoff;			// timer backoff
	__u8			  icsk_syn_retries;		// allowed number of SYN retries
	__u8			  icsk_probes_out;		// unanswered window probes sent
	__u16			  icsk_ext_hdr_len;		// network protocol header length
	struct {
		__u8		  pending;	 /* ACK is pending			   */
		__u8		  quick;	 /* Scheduled number of quick acks	   */
		__u8		  pingpong;	 /* The session is interactive		   */
		__u8		  blocked;	 /* Delayed ACK was blocked by socket lock */
		__u32		  ato;		 /* Predicted tick of soft clock	   */
		unsigned long	  timeout;	 /* Currently scheduled timeout		   */
		__u32		  lrcvtime;	 /* timestamp of last received data packet */
		__u16		  last_seg_size; /* Size of last incoming segment	   */
		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
	} icsk_ack;
	struct {
		int		  enabled;

		/* Range of MTUs to search */
		int		  search_high;
		int		  search_low;

		/* Information on the current probe. */
		int		  probe_size;
	} icsk_mtup;
	u32			  icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
};

With that, the socket is created and initialized; the figure above shows the resulting data structures. At this point the socket's state is SS_UNCONNECTED and the sock's sk_state is TCP_CLOSE. Back in sys_socket(), the next step is retval = sock_map_fd(sock).

The socket and the file system

int sock_map_fd(struct socket *sock)
{
	struct file *newfile;
	int fd = sock_alloc_fd(&newfile);			// allocate a file descriptor and a file structure for the socket

	if (likely(fd >= 0)) {
		int err = sock_attach_fd(sock, newfile);	// attach the socket to the file structure

		if (unlikely(err < 0)) {				// on error, release the file and the descriptor
			put_filp(newfile);
			put_unused_fd(fd);
			return err;
		}
		fd_install(fd, newfile);				// hook the file structure up to the descriptor
	}
	return fd;
}

First, sock_alloc_fd() obtains file structure space and a file descriptor:

static int sock_alloc_fd(struct file **filep)
{
	int fd;

	fd = get_unused_fd();	// grab a free file descriptor
	if (likely(fd >= 0)) {
		struct file *file = get_empty_filp();	// allocate file structure space

		*filep = file;
		if (unlikely(!file)) {
			put_unused_fd(fd);
			return -ENFILE;
		}
	} else
		*filep = NULL;
	return fd;
}

This is file system territory: a free fd is obtained from the current process and a free file structure is allocated from the file system; if either step fails, whatever was acquired is released.

With the allocations done, sock_attach_fd() runs:

static int sock_attach_fd(struct socket *sock, struct file *file)
{
	struct dentry *dentry;
	struct qstr name = { .name = "" };

	dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);	// create a dentry on the socket file system; sock_mnt is a vfsmount
	if (unlikely(!dentry))
		return -ENOMEM;

	dentry->d_op = &sockfs_dentry_operations;	// install sockfs's dentry operations table into the dentry
	/*
	 * We dont want to push this dentry into global dentry hash table.
	 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
	 * This permits a working /proc/$pid/fd/XXX on sockets
	 */
	dentry->d_flags &= ~DCACHE_UNHASHED;
	d_instantiate(dentry, SOCK_INODE(sock));

	sock->file = file;
	init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
		  &socket_file_ops);		// initialize the socket's file structure, passing the socket_file_ops table
	SOCK_INODE(sock)->i_fop = &socket_file_ops;
	file->f_flags = O_RDWR;
	file->f_pos = 0;
	file->private_data = sock;		// the file system can reach the socket through private_data

	return 0;
}

Here is sockfs_dentry_operations, the sockfs dentry operations table:

static struct dentry_operations sockfs_dentry_operations = {
	.d_delete = sockfs_delete_dentry,
	.d_dname  = sockfs_dname,
};

and socket_file_ops, the file operations table:

static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read,
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.open =		sock_no_open,	/* special open code to disallow open via /proc */
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};

So although we read and write a socket with read/write, internally this table maps those calls onto the concrete socket operations, giving users the convenience and uniformity of operating on a file.
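
That is also why plain read()/write() on a socket fd behave like recv()/send() with no flags. A quick sketch:

#include <unistd.h>
#include <sys/socket.h>

/* Given a connected socket fd, each pair below is equivalent. */
void echo_once(int fd)
{
	char buf[128];
	ssize_t n = read(fd, buf, sizeof(buf));   /* == recv(fd, buf, sizeof(buf), 0) */
	if (n > 0)
		write(fd, buf, (size_t)n);        /* == send(fd, buf, n, 0) */
}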

bind()

Looking at the server demo again: after socket() completes, bind(server_fd, (struct sockaddr *)&server_address, server_len) binds the address to the socket.

Tracing its implementation: bind also goes through sys_socketcall(); matching the SYS_BIND argument leads to the system call sys_bind():

asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
	struct socket *sock;
	char address[MAX_SOCK_ADDR];
	int err, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);     // find the socket from the fd
	if (sock) {
		err = move_addr_to_kernel(umyaddr, addrlen, address);   // copy the address from user space into kernel space
		if (err >= 0) {
			err = security_socket_bind(sock,
						   (struct sockaddr *)address,
						   addrlen);
			if (!err)
				err = sock->ops->bind(sock,
						      (struct sockaddr *)
						      address, addrlen);                  // call the protocol's bind function, inet_stream_ops->bind()
		}
		fput_light(sock->file, fput_needed);
	}
	return err;
}

Let's look at sockfd_lookup_light():

static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
	struct file *file;
	struct socket *sock;

	*err = -EBADF;
	file = fget_light(fd, fput_needed);		// find the file pointer from the fd
	if (file) {
		sock = sock_from_file(file, err);	// recover the socket pointer from the file
		if (sock)
			return sock;
		fput_light(file, *fput_needed);
	}
	return NULL;
}

fget_light/fput_light are file operations: fget_light finds the file pointer in the current process's files_struct and takes a reference; fput_light drops the reference. If the socket is obtained, it is returned directly. The interesting part is sock_from_file():

static struct socket *sock_from_file(struct file *file, int *err)
{
	if (file->f_op == &socket_file_ops)
		return file->private_data;	/* set in sock_map_fd */

	*err = -ENOTSOCK;
	return NULL;
}

As noted earlier, file->private_data stores the socket pointer. Through sockfd_lookup_light() we thus recover the socket created and initialized before; move_addr_to_kernel() then copies the address into kernel space:

int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
{
	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
		return -EINVAL;
	if (ulen == 0)
		return 0;
	if (copy_from_user(kaddr, uaddr, ulen))
		return -EFAULT;
	return audit_sockaddr(ulen, kaddr);
}

Further down, security_socket_bind() is a security hook; with no security module configured it simply returns 0. Then sock->ops->bind() is invoked; since socket->ops was set to answer->ops earlier (see inet_create() above if you've forgotten), this is really inet_stream_ops->bind:

const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = tcp_sendmsg,
	.recvmsg	   = sock_common_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = tcp_sendpage,
	.splice_read	   = tcp_splice_read,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};

Checking the table above, .bind is the inet_bind() function:

int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct inet_sock *inet = inet_sk(sk);
	unsigned short snum;
	int chk_addr_ret;
	int err;

	/* If the socket has its own bind function then use it. (RAW) */
	if (sk->sk_prot->bind) {
		err = sk->sk_prot->bind(sk, uaddr, addr_len);   // if the protocol provides its own bind, use it; here sk->sk_prot is tcp_prot
		goto out;
	}
	err = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); // classify the address using the routing tables

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed.  It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 *  is temporarily down)
	 */
	err = -EADDRNOTAVAIL;
	if (!sysctl_ip_nonlocal_bind &&
	    !inet->freebind &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
	    chk_addr_ret != RTN_LOCAL &&                // not a local unicast address
	    chk_addr_ret != RTN_MULTICAST &&            // not a multicast address
	    chk_addr_ret != RTN_BROADCAST)              // not a broadcast address
		goto out;

	snum = ntohs(addr->sin_port);                   // extract the port number
	err = -EACCES;
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
		goto out;

	/*      We keep a pair of addresses. rcv_saddr is the one
	 *      used by hash lookups, and saddr is used for transmit.
	 *
	 *      In the BSD API these are the same except where it
	 *      would be illegal to use them (multicast/broadcast) in
	 *      which case the sending device address is used.
	 */
	lock_sock(sk);  // take the sock lock

	/* Check these errors (active socket, double bind). */
	err = -EINVAL;
	if (sk->sk_state != TCP_CLOSE || inet->num) // check the state and whether a port was already assigned
		goto out_release_sock;

	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;          // rcv_saddr is used for hash lookups, saddr for transmit (both set to the IP address)
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		inet->saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. */
	if (sk->sk_prot->get_port(sk, snum)) {              // inet_csk_get_port()
		inet->saddr = inet->rcv_saddr = 0;              // on failure, clear the addresses just set
		err = -EADDRINUSE;
		goto out_release_sock;
	}

	if (inet->rcv_saddr)                        // if an address was set, add the lock flag: address bound
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)                                   // if a port was set, add the lock flag: port bound
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->sport = htons(inet->num);             // record the source port
	inet->daddr = 0;                            // reset the destination address
	inet->dport = 0;                            // reset the destination port
	sk_dst_reset(sk);                           // reset the cached route
	err = 0;
out_release_sock:
	release_sock(sk);   // drop the sock lock
out:
	return err;
}

Our sk->sk_prot is tcp_prot, and searching that structure turns up no .bind member, so execution continues past the check. Two data structures are involved here, sockaddr_in and sockaddr:

struct sockaddr_in {
  sa_family_t		sin_family;	/* Address family		*/
  __be16		sin_port;	/* Port number			*/
  struct in_addr	sin_addr;	/* Internet address		*/

  /* Pad to size of `struct sockaddr'. */
  unsigned char		__pad[__SOCK_SIZE__ - sizeof(short int) -
			sizeof(unsigned short int) - sizeof(struct in_addr)];
};

struct sockaddr {
  sa_family_t		sin_family;	/* Address family		*/
  char            sa_data[14];
}

Because the two structures have the same size and a compatible layout, they can be cast into each other; presumably for compatibility reasons, inet_bind() converts the incoming sockaddr back into a sockaddr_in.
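
The user-space side of this convention, as a sketch (bind_v4 is an invented helper):

#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Fill the protocol-specific sockaddr_in, then pass it through the
 * generic sockaddr "handle"; the kernel casts it back, as inet_bind does. */
int bind_v4(int fd, const char *ip, unsigned short port)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port   = htons(port);
    sin.sin_addr.s_addr = inet_addr(ip);
    return bind(fd, (const struct sockaddr *)&sin, sizeof(sin));
}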

sock_net(sk) returns the sk->sk_net pointer; if the user has not set up a separate network namespace, this is the system default init_net. inet_addr_type() is then called to classify the address:

unsigned int inet_addr_type(struct net *net, __be32 addr)
{
	return __inet_dev_addr_type(net, NULL, addr);
}

static inline unsigned __inet_dev_addr_type(struct net *net,
					    const struct net_device *dev,
					    __be32 addr)
{
	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
	struct fib_result	res;
	unsigned ret = RTN_BROADCAST;
	struct fib_table *local_table;

	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))		// is it a zero-net or limited-broadcast address?
		return RTN_BROADCAST;
	if (ipv4_is_multicast(addr))							// is it a multicast address?
		return RTN_MULTICAST;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	local_table = fib_get_table(net, RT_TABLE_LOCAL);		// look up the local routing table
	if (local_table) {
		ret = RTN_UNICAST;
		if (!local_table->tb_lookup(local_table, &fl, &res)) {
			if (!dev || dev == res.fi->fib_dev)
				ret = res.type;
			fib_res_put(&res);
		}
	}
	return ret;
}

The struct flowi appearing here is the routing key. flowi.nl_u is a union containing the structs ip4_u, ip6_u and dn_u, so struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } } stores the IP address as the key's destination address. A quick look at the flowi structure:

struct flowi {		// routing key structure
	int	oif;		// network device used for sending
	int	iif;		// network device used for receiving
	__u32	mark;	// packet mark

	union {
		struct {
			__be32			daddr;		// destination address
			__be32			saddr;		// source (sender) address
			__u8			tos;		// type of service (TOS)
			__u8			scope;		// scope
		} ip4_u;
		
		struct {
			struct in6_addr		daddr;
			struct in6_addr		saddr;
			__be32			flowlabel;
		} ip6_u;

		struct {
			__le16			daddr;
			__le16			saddr;
			__u8			scope;
		} dn_u;
	} nl_u;								// this union serves the network layer
#define fld_dst		nl_u.dn_u.daddr		
#define fld_src		nl_u.dn_u.saddr
#define fld_scope	nl_u.dn_u.scope
#define fl6_dst		nl_u.ip6_u.daddr
#define fl6_src		nl_u.ip6_u.saddr
#define fl6_flowlabel	nl_u.ip6_u.flowlabel
#define fl4_dst		nl_u.ip4_u.daddr
#define fl4_src		nl_u.ip4_u.saddr
#define fl4_tos		nl_u.ip4_u.tos
#define fl4_scope	nl_u.ip4_u.scope

	__u8	proto;		// transport-layer protocol
	__u8	flags;		// flag bits
	union {
		struct {
			__be16	sport;	// source (sender) port
			__be16	dport;	// destination (receiver) port
		} ports;

		struct {
			__u8	type;
			__u8	code;
		} icmpt;			// ICMP type

		struct {
			__le16	sport;
			__le16	dport;
		} dnports;

		__be32		spi;

		struct {
			__u8	type;
		} mht;
	} uli_u;							// this union serves the transport layer
#define fl_ip_sport	uli_u.ports.sport
#define fl_ip_dport	uli_u.ports.dport
#define fl_icmp_type	uli_u.icmpt.type
#define fl_icmp_code	uli_u.icmpt.code
#define fl_ipsec_spi	uli_u.spi
#define fl_mh_type	uli_u.mht.type
	__u32           secid;	/* used by xfrm; see secid.txt */
} __attribute__((__aligned__(BITS_PER_LONG/8)));

struct fib_result holds the result of a route lookup, and struct fib_table is the routing table structure. The function first checks whether the IP address addr is the zero network, the local broadcast address, or a multicast address:

static inline bool ipv4_is_zeronet(__be32 addr)
{
	return (addr & htonl(0xff000000)) == htonl(0x00000000);
}

This checks whether the high 8 bits of addr are zero, i.e. whether it is a zero-net address.

static inline bool ipv4_is_lbcast(__be32 addr)
{
	/* limited broadcast */
	return addr == htonl(INADDR_BROADCAST);
}

#define	INADDR_BROADCAST	((unsigned long int) 0xffffffff)

An address of all ones (255.255.255.255) is judged to be the limited broadcast address.

static inline bool ipv4_is_multicast(__be32 addr)
{
	return (addr & htonl(0xf0000000)) == htonl(0xe0000000);
}

If the high 4 bits of addr are 1110, the address belongs to the multicast range (224.0.0.0/4).

Zeronet and broadcast addresses return RTN_BROADCAST directly, and multicast addresses return RTN_MULTICAST; anything else is looked up in the local routing table and the lookup result is returned.
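These three predicates are pure bit tests, so they can be reproduced verbatim in user space to see how different addresses would be classified; a small sketch:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* userspace copies of the three kernel predicates above */
static int is_zeronet(uint32_t a)   { return (a & htonl(0xff000000)) == htonl(0x00000000); }
static int is_lbcast(uint32_t a)    { return a == htonl(0xffffffff); }
static int is_multicast(uint32_t a) { return (a & htonl(0xf0000000)) == htonl(0xe0000000); }

static void classify(const char *s)
{
	uint32_t a = inet_addr(s);	/* already in network byte order */

	printf("%-15s zeronet=%d lbcast=%d multicast=%d\n",
	       s, is_zeronet(a), is_lbcast(a), is_multicast(a));
}

int main(void)
{
	classify("0.0.0.0");		/* zeronet        -> RTN_BROADCAST */
	classify("255.255.255.255");	/* limited bcast  -> RTN_BROADCAST */
	classify("224.0.0.1");		/* multicast      -> RTN_MULTICAST */
	classify("192.168.1.1");	/* falls through to the table lookup */
	return 0;
}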

fib_get_table() has two definitions in the kernel, selected by whether CONFIG_IP_MULTIPLE_TABLES is configured; we pick the simpler single-routing-table variant to analyse.

static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct hlist_head *ptr;

	ptr = id == RT_TABLE_LOCAL ?
		&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
		&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
	return hlist_entry(ptr->first, struct fib_table, tb_hlist);
}

The net passed in here is sock_net(sk), i.e. the system default init_net, and id is RT_TABLE_LOCAL. net->ipv4 is a struct netns_ipv4, which carries the IPv4 protocol's state within the network namespace.

struct netns_ipv4 {
#ifdef CONFIG_SYSCTL
	struct ctl_table_header	*forw_hdr;
	struct ctl_table_header	*frags_hdr;
	struct ctl_table_header	*ipv4_hdr;
#endif
	struct ipv4_devconf	*devconf_all;
	struct ipv4_devconf	*devconf_dflt;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	struct fib_rules_ops	*rules_ops;
#endif
	struct hlist_head	*fib_table_hash;
	struct sock		*fibnl;

	struct sock		**icmp_sk;
	struct sock		*tcp_sock;

	struct netns_frags	frags;
#ifdef CONFIG_NETFILTER
	struct xt_table		*iptable_filter;
	struct xt_table		*iptable_mangle;
	struct xt_table		*iptable_raw;
	struct xt_table		*arptable_filter;
#endif

	int sysctl_icmp_echo_ignore_all;
	int sysctl_icmp_echo_ignore_broadcasts;
	int sysctl_icmp_ignore_bogus_error_responses;
	int sysctl_icmp_ratelimit;
	int sysctl_icmp_ratemask;
	int sysctl_icmp_errors_use_inbound_ifaddr;
};

All of IPv4's routing tables are linked into the fib_table_hash array. Each array element is an hlist_head, i.e. a chain, and each routing table hooks into the appropriate chain through its embedded tb_hlist node.

[TODO]https://blog.csdn.net/panxj856856/article/details/87981937

If the local routing table is found, its lookup method local_table->tb_lookup(local_table, &fl, &res) is called with key fl to fill in the struct fib_result. The dev argument passed down is NULL, so ret is set to res.type and returned.

Back in inet_bind(), snum = ntohs(addr->sin_port) extracts the port number. Ports below 1024 are reserved for the system (0–1023), so binding one requires the CAP_NET_BIND_SERVICE capability. The code then checks the socket state and whether a local port has already been set, and stores the IP address into inet->rcv_saddr and inet->saddr. If the address type is multicast, broadcast or zeronet, the source address is cleared to 0 while the receive address is left unchanged.
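The privileged-port rule is easy to observe from user space: without CAP_NET_BIND_SERVICE, binding a port below 1024 fails with EACCES. A quick sketch (run unprivileged):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(80);		/* < 1024: needs CAP_NET_BIND_SERVICE */
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind to port 80");	/* EACCES when unprivileged */
	else
		printf("bound (running with privilege?)\n");
	return 0;
}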

Then tcp_prot->get_port, i.e. inet_csk_get_port(), is called to check whether the port may be bound.

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;	// tcp_prot.h.hashinfo, i.e. tcp_hashinfo
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);		// network namespace of this sock

The function is long, so let's take it piece by piece. First hashinfo, a struct inet_hashinfo pointer, is obtained from tcp_prot.h.hashinfo, i.e. tcp_hashinfo. The inet_hashinfo structure bundles together the hash tables a protocol uses for binding and lookup.

struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 *
	 * TIME_WAIT sockets use a separate chain (twchain).
	 */ // established socks all live in this table; it covers two kinds of chains, one for established socks and one (twchain) for TIME_WAIT socks
	struct inet_ehash_bucket	*ehash;	// established-connections hash table
	rwlock_t			*ehash_locks;	// chain locks
	unsigned int			ehash_size;	// table size
	unsigned int			ehash_locks_mask;	// lock mask

	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 */
	struct inet_bind_hashbucket	*bhash;	// hash table of bound local ports

	unsigned int			bhash_size;	// bind table size
	/* Note : 4 bytes padding on 64 bit arches */

	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct hlist_head		listening_hash[INET_LHTABLE_SIZE];	// listening-socket hash table

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * are often dirty.
	 */
	rwlock_t			lhash_lock ____cacheline_aligned;
	atomic_t			lhash_users;
	wait_queue_head_t		lhash_wait;	// wait queue head
	struct kmem_cache			*bind_bucket_cachep;	// slab cache for bind buckets
};

As we can see, this structure maintains the hash tables used by the INET protocol family.

We also come across another data structure:

struct inet_bind_hashbucket {	// hash bucket
	spinlock_t		lock;		// spinlock protecting the chain
	struct hlist_head	chain;	// chain of bind buckets
};

This is a hash bucket guarded by a spinlock; chain is the head of the bucket's list.

Further down is struct hlist_node *node. An hlist_node is a node of a hash list, hung off an hlist_head; here it serves as the cursor while walking a chain.

The next line declares struct inet_bind_bucket *tb:

struct inet_bind_bucket {			// bind bucket
	struct net		*ib_net;		// owning network namespace
	unsigned short		port;		// port number
	signed short		fastreuse;	// whether fast reuse is allowed
	struct hlist_node	node;		// links the bucket into the hash bucket's chain
	struct hlist_head	owners;		// list of socks bound to this port
};

This is the structure that gets linked into the inet_bind_hashbucket's chain.

Continuing with inet_csk_get_port():

	local_bh_disable();					// disable bottom halves while touching the bind hash
	if (!snum) {						// no port specified: let the kernel pick one
		int remaining, rover, low, high;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;
		rover = net_random() % remaining + low;

		do {	// search for a free local port
			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->ib_net == net && tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
			if (++rover > high)
				rover = low;
		} while (--remaining > 0);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover;

In our case snum was specified, but if it were not (port number 0) this branch runs, meaning the kernel allocates a port.

First inet_get_local_port_range() is called to obtain the range of allowed port numbers.

void inet_get_local_port_range(int *low, int *high)
{
	unsigned seq;
	do {
		seq = read_seqbegin(&sysctl_port_range_lock);

		*low = sysctl_local_port_range[0];
		*high = sysctl_local_port_range[1];
	} while (read_seqretry(&sysctl_port_range_lock, seq));
}

Using a seqlock read (an optimistic lock), the two bounds are read from the kernel's port range array sysctl_local_port_range, {32768, 61000} by default; this is the same range exposed via /proc/sys/net/ipv4/ip_local_port_range.

A candidate port is then computed from a random number: rover = net_random() % remaining + low. The loop makes sure the candidate is unused; if every port in the range turns out to be taken, ret stays 1 and control jumps to fail, otherwise the first suitable candidate becomes the port (snum = rover).
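sysctl_local_port_range is the array behind /proc/sys/net/ipv4/ip_local_port_range, so the first step of the selection can be mimicked from user space; a sketch using rand() in place of net_random():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	int low = 32768, high = 61000;	/* fallbacks matching the kernel defaults */
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");

	if (f) {
		if (fscanf(f, "%d %d", &low, &high) != 2) {
			low = 32768;
			high = 61000;
		}
		fclose(f);
	}

	int remaining = (high - low) + 1;
	srand(time(NULL));
	int rover = rand() % remaining + low;	/* same shape as net_random() % remaining + low */

	printf("range [%d, %d], first candidate port: %d\n", low, high, rover);
	return 0;
}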

	head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
	spin_lock(&head->lock);
	inet_bind_bucket_for_each(tb, node, &head->chain)
		if (tb->ib_net == net && tb->port == rover)
			goto next;

The logic here works like hashmap.get(rover): rover is hashed (reduced modulo the table size) into an index into tcp_hashinfo's bhash table, the corresponding chain is fetched, and the chain is walked; finding an entry with the same net and port means the port is already bound.

#define inet_bind_bucket_for_each(tb, node, head) \
	hlist_for_each_entry(tb, node, head, node)

#define hlist_for_each_entry(tpos, pos, head, member)			 \
	for (pos = (head)->first;					 \
	     pos && ({ prefetch(pos->next); 1;}) &&			 \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
	     pos = pos->next)

Here we can see the detail: the bucket structure inet_bind_bucket links into the chain of the hash bucket inet_bind_hashbucket through its node member.
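To make the shape of bhash concrete, here is a hypothetical, heavily simplified userspace model: an array of chains, a lookup that walks a chain the way inet_bind_bucket_for_each does, and an insert playing the role of inet_bind_bucket_create. This mirrors the structure, not the kernel's actual code:

#include <stdio.h>
#include <stdlib.h>

#define BHASH_SIZE 64

/* cut-down stand-ins for inet_bind_bucket / inet_bind_hashbucket */
struct bucket {
	unsigned short port;
	struct bucket *next;	/* plays the role of tb->node */
};

static struct bucket *bhash[BHASH_SIZE];	/* plays the role of head->chain */

static unsigned bhashfn(unsigned short port) { return port % BHASH_SIZE; }

/* like the inet_bind_bucket_for_each() walk: non-NULL means the port is taken */
static struct bucket *lookup(unsigned short port)
{
	for (struct bucket *tb = bhash[bhashfn(port)]; tb; tb = tb->next)
		if (tb->port == port)
			return tb;
	return NULL;
}

/* like inet_bind_bucket_create(): allocate and link at the chain head */
static void insert(unsigned short port)
{
	struct bucket *tb = malloc(sizeof(*tb));
	tb->port = port;
	tb->next = bhash[bhashfn(port)];
	bhash[bhashfn(port)] = tb;
}

int main(void)
{
	insert(8080);
	printf("8080 taken? %s\n", lookup(8080) ? "yes" : "no");
	printf("9090 taken? %s\n", lookup(9090) ? "yes" : "no");
	return 0;
}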

Our server program specified a port number, so the kernel does not need to allocate one; we continue with inet_csk_get_port():

	} else {	// look in the hash bucket's chain for a bucket with the same port
		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->ib_net == net && tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {	// does the bucket already have bound socks?
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;	// reuse allowed
		} else {
			ret = 1;	// check the bucket's sock list for conflicts
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:	// bucket not found: create one
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;	// mark the bucket as reusable
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!inet_csk(sk)->icsk_bind_hash) // not yet bound to a bucket
		inet_bind_hash(sk, tb, snum);	// bind it
	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

Now the hash bucket is searched for the bucket tb matching the specified port. If it is found, control moves to tb_found. tb->owners is the head of a sock list; if that list is not empty, we check whether the bucket supports fast reuse (fastreuse > 0) and whether our sock also allows reuse and is not in the listening state, in which case we jump to success. Otherwise inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb) runs, which is ipv4_specific->bind_conflict, i.e. the inet_csk_bind_conflict() function:

int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb)
{
	const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {	// same device, or one of them unbound?
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)				// addresses equal, or either a wildcard?
					break;
			}
		}
	}
	return node != NULL;
}

The code is straightforward: the sk_for_each_bound macro walks the tb->owners list, with each sock in turn as sk2, and compares sk against sk2. If the two are bound to the same device (or either has no device binding), reuse is not permitted, and their receive addresses overlap (equal, or either is the wildcard), the bind "conflicts".
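The effect of this conflict check is visible from user space: a second bind to the same wildcard address and port fails with EADDRINUSE, which is the ret = 1 propagated back out through inet_bind(). A sketch (assuming port 9000 is free):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int bind_to(int fd, unsigned short port)
{
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	return bind(fd, (struct sockaddr *)&addr, sizeof(addr));
}

int main(void)
{
	int a = socket(AF_INET, SOCK_STREAM, 0);
	int b = socket(AF_INET, SOCK_STREAM, 0);

	printf("first bind:  %d\n", bind_to(a, 9000));	/* 0: bucket created */
	if (bind_to(b, 9000) < 0)
		perror("second bind");	/* EADDRINUSE: the conflict check fired */
	return 0;
}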

Back in inet_csk_get_port(): if no bucket was found, control goes to tb_not_found, where inet_bind_bucket_create() allocates a new bucket, records the port number and other fields in it, and links the bucket into the hash bucket's chain.

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);	// allocate the bucket

	if (tb != NULL) {
		tb->ib_net       = hold_net(net);		// record the network namespace
		tb->port      = snum;					// record the port number
		tb->fastreuse = 0;						// fastreuse starts at 0, adjusted per sock
		INIT_HLIST_HEAD(&tb->owners);			// initialise the sock list
		hlist_add_head(&tb->node, &head->chain);// link the bucket into the hash bucket's chain
	}
	}
	return tb;
}

At success, inet_csk(sk)->icsk_bind_hash tells us whether the sock has a bucket bound yet. inet_csk() simply casts the sock pointer to struct inet_connection_sock, the INET connection-oriented sock we met earlier, and its icsk_bind_hash field is tested. If the sock has no bucket yet, inet_bind_hash() links it into the bucket's sock list.

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->num = snum;			// record the local port
	sk_add_bind_node(sk, &tb->owners);	// link sk into the bucket's owners list
	inet_csk(sk)->icsk_bind_hash = tb;	// remember which bucket we bound to
}

First the port number is recorded in the inet_sock's num field, then the current sock is added to the bucket's owners list, and finally the inet_connection_sock's icsk_bind_hash records tb. With that, the binding work is complete.

Back in inet_bind():

	/* Make sure we are allowed to bind here. */
	if (sk->sk_prot->get_port(sk, snum)) {              // inet_csk_get_port()
		inet->saddr = inet->rcv_saddr = 0;              // on failure, clear the addresses set earlier
		err = -EADDRINUSE;
		goto out_release_sock;
	}

	if (inet->rcv_saddr)                        // an address was set: lock it to mark the address as bound
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)                                   // a port was set: lock it to mark the port as bound
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->sport = htons(inet->num);             // record the source port
	inet->daddr = 0;                            // reset the destination address
	inet->dport = 0;                            // reset the destination port
	sk_dst_reset(sk);                           // reset the cached route
	err = 0;
out_release_sock:
	release_sock(sk);   // unlock the sock
out:
	return err;
}

bind() comes to a close here; along the way we set aside the local routing tables and the internals of local_table->tb_lookup(local_table, &fl, &res) for later.

 
