/**
* struct socket - general BSD socket
* @state: socket state (%SS_CONNECTED, etc)
* @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
* @ops: protocol specific socket operations
* @fasync_list: Asynchronous wake up list
* @file: File back pointer for gc
* @sk: internal networking protocol agnostic socket representation
* @wait: wait queue for several uses
* @type: socket type (%SOCK_STREAM, etc)
*/
struct socket {
socket_state state; // socket state
unsigned long flags; // socket flags
const struct proto_ops *ops; // protocol-specific operations table
struct fasync_struct *fasync_list; // asynchronous wake-up list
struct file *file; // file back-pointer associated with this socket
struct sock *sk; // pointer to the protocol-specific sock structure
wait_queue_head_t wait; // wait queue
short type; // socket type
};
As the definition shows, struct socket is the common, protocol-independent part of a socket, while the embedded sock pointer carries the part that depends on the protocol actually in use. Think of sock as what was factored out of socket: a sock structure for the chosen protocol is hooked into the socket. Let's look at struct sock next.
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common; // shared with inet_timewait_sock
#define sk_family __sk_common.skc_family // address family
#define sk_state __sk_common.skc_state // connection state
#define sk_reuse __sk_common.skc_reuse // whether address reuse is allowed
#define sk_bound_dev_if __sk_common.skc_bound_dev_if // bound device ifindex
#define sk_node __sk_common.skc_node // link into the main hash table
#define sk_bind_node __sk_common.skc_bind_node // link into the bind hash table
#define sk_refcnt __sk_common.skc_refcnt // reference count
#define sk_hash __sk_common.skc_hash // hash value
#define sk_prot __sk_common.skc_prot // protocol function table
#define sk_net __sk_common.skc_net // owning network namespace
unsigned char sk_shutdown : 2, // shutdown state, mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
sk_no_check : 2, // whether checksumming may be skipped
sk_userlocks : 4; // user locks, %SO_SNDBUF and %SO_RCVBUF settings
unsigned char sk_protocol; // which protocol of the family this socket uses
unsigned short sk_type; // socket type, e.g. SOCK_STREAM
int sk_rcvbuf; // receive buffer size (bytes)
socket_lock_t sk_lock; // used for synchronization
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
* access. Therefore we special case it's implementation.
*/
struct {
struct sk_buff *head; // first packet placed in the backlog
struct sk_buff *tail; // last packet placed in the backlog
} sk_backlog; // backlog queue
wait_queue_head_t *sk_sleep; // the sock's wait queue
struct dst_entry *sk_dst_cache; // cached route (destination entry)
struct xfrm_policy *sk_policy[2]; // flow policies
rwlock_t sk_dst_lock; // lock for the cached route
atomic_t sk_rmem_alloc; // bytes committed to the receive queue
atomic_t sk_wmem_alloc; // bytes committed to the send queue
atomic_t sk_omem_alloc; // bytes of option/other allocations
int sk_sndbuf; // total size of the send buffer
struct sk_buff_head sk_receive_queue; // receive queue (packets received)
struct sk_buff_head sk_write_queue; // send queue (packets being sent)
struct sk_buff_head sk_async_wait_queue; // packets copied by DMA (CONFIG_NET_DMA)
int sk_wmem_queued; // memory used by all queued packets
int sk_forward_alloc; // memory available (forward-allocated)
gfp_t sk_allocation; // allocation mode
int sk_route_caps; // route capability flags
int sk_gso_type; // GSO (generic segmentation offload) type
unsigned int sk_gso_max_size; // maximum size used when building GSO segments
int sk_rcvlowat; // SO_RCVLOWAT setting
unsigned long sk_flags; // SO_BROADCAST, SO_KEEPALIVE, SO_OOBINLINE, SO_LINGER settings
unsigned long sk_lingertime; // linger time, bounds how long close() may wait
struct sk_buff_head sk_error_queue; // queue of error packets
struct proto *sk_prot_creator; // proto that created this sock
rwlock_t sk_callback_lock; // lock used by the callback handlers
int sk_err, // last error code
sk_err_soft; // "soft" errors that persist without killing the connection
atomic_t sk_drops; // raw socket drop counter
unsigned short sk_ack_backlog; // current number of established-but-unaccepted connections
unsigned short sk_max_ack_backlog; // maximum backlog given to listen()
__u32 sk_priority; // priority
struct ucred sk_peercred; // SO_PEERCRED setting
long sk_rcvtimeo; // SO_RCVTIMEO, receive timeout
long sk_sndtimeo; // SO_SNDTIMEO, send timeout
struct sk_filter *sk_filter; // socket filter
void *sk_protinfo; // private area, protocol-family specific when slab caches are not used
struct timer_list sk_timer; // the sock's timer
ktime_t sk_stamp; // time of the last received packet
struct socket *sk_socket; // back-pointer to the owning socket
void *sk_user_data; // data set by RPC-layer users
struct page *sk_sndmsg_page; // cached page for building outgoing data
struct sk_buff *sk_send_head; // head of the packets to transmit
__u32 sk_sndmsg_off; // current offset inside sk_sndmsg_page
int sk_write_pending; // writes pending on this socket
void *sk_security; // used by security modules
__u32 sk_mark; // generic packet mark
/* XXX 4 bytes hole on 64 bit */
void (*sk_state_change)(struct sock *sk); // called when the sock's state changes
void (*sk_data_ready)(struct sock *sk, int bytes); // called when data is ready to be read
void (*sk_write_space)(struct sock *sk); // called when send space becomes available
void (*sk_error_report)(struct sock *sk); // called to report errors
int (*sk_backlog_rcv)(struct sock *sk, // processes packets from the backlog queue
struct sk_buff *skb);
void (*sk_destruct)(struct sock *sk); // sock destructor
};
The part shared with the application side lives in struct socket, while the protocol-related state lives in struct sock; the two are then hooked to each other, which is a flexible and elegant design.
Inside sock, packets are represented by struct sk_buff: every protocol uses sk_buff to wrap and carry its packets, so let's look at that structure as well.
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next; // next packet in the queue
struct sk_buff *prev; // previous packet in the queue
struct sock *sk; // sock this packet belongs to
ktime_t tstamp; // time the packet arrived
struct net_device *dev; // network device the packet was received on
union {
struct dst_entry *dst; // routing entry
struct rtable *rtable; // routing table entry
};
struct sec_path *sp; // security path, used by xfrm
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48]; // control buffer
unsigned int len, // total length of the data
data_len; // length of the paged/fragmented part of the data
__u16 mac_len, // length of the link-layer header
hdr_len; // writable header length when the packet is cloned
union {
__wsum csum; // checksum
struct {
__u16 csum_start; // offset of the checksum start relative to skb->head
__u16 csum_offset; // offset from csum_start where the checksum is stored
};
};
__u32 priority; // queueing priority of the packet
__u8 local_df:1, // whether local fragmentation is allowed
cloned:1, // whether the skb has been cloned
ip_summed:2, // IP checksum status flags
nohdr:1, // payload-only reference: the header must not be modified
nfctinfo:3; // conntrack relationship of the packet
__u8 pkt_type:3, // packet class (host, broadcast, ...)
fclone:2, // fast-clone state
ipvs_property:1, // owned by IPVS
peeked:1, // packet has already been peeked at once
nf_trace:1; // netfilter packet-tracing flag
__be16 protocol; // packet protocol as seen by the driver
void (*destructor)(struct sk_buff *skb); // destructor for the packet
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge; // bridge-netfilter data
#endif
int iif;
#ifdef CONFIG_NETDEVICES_MULTIQUEUE
__u16 queue_mapping;
#endif
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
/* 14 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
__u32 mark;
sk_buff_data_t transport_header; // transport-layer header within the data
sk_buff_data_t network_header; // network-layer header within the data
sk_buff_data_t mac_header; // link-layer header within the data
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail; // end of the data actually present
sk_buff_data_t end; // end of the buffer
unsigned char *head, // start of the buffer
*data; // start of the data
unsigned int truesize; // real footprint of the packet (structure size plus buffer size)
atomic_t users; // reference count
};
So far: the shared part is struct socket, the generic protocol part is struct sock, and the INET-specific part is struct inet_sock.
The contents of tcp_sock are tied closely to the TCP protocol; here is the structure:
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn; // per the comment above, this must stay the first member so that a tcp_sock pointer can be cast to and from the more generic sock types
u16 tcp_header_len; /* Bytes of tcp header to send */
u16 xmit_size_goal; /* Goal for segmenting output packets */
/*
* Header prediction flags
* 0x5?10 << 16 + snd_wnd in net byte order
*/
__be32 pred_flags;
/*
* RFC793 variables by their proper names. This means you can
* read the code and the spec side by side (and laugh ...)
* See RFC793 and RFC1122. The RFC writes these in capitals.
*/
u32 rcv_nxt; /* What we want to receive next */
u32 copied_seq; /* Head of yet unread data */
u32 rcv_wup; /* rcv_nxt on last window update sent */
u32 snd_nxt; /* Next sequence we send */
u32 snd_una; /* First byte we want an ack for */
u32 snd_sml; /* Last byte of the most recently transmitted small packet */
u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
u32 lsndtime; /* timestamp of last sent data packet (for restart window) */
/* Data for direct copy to user */
struct {
struct sk_buff_head prequeue; // prequeue of packets waiting to be processed
struct task_struct *task; // process that will process the prequeue
struct iovec *iov; // user-space (application) buffer that receives the data
int memory; // memory accounted to the prequeue
int len; // bytes the user still wants to read
#ifdef CONFIG_NET_DMA
/* members for async copy */
struct dma_chan *dma_chan;
int wakeup;
struct dma_pinned_list *pinned_list;
dma_cookie_t dma_cookie;
#endif
} ucopy;
u32 snd_wl1; /* Sequence for window update */
u32 snd_wnd; /* The window we expect to receive */
u32 max_window; /* Maximal window ever seen from peer */
u32 mss_cache; /* Cached effective mss, not including SACKS */
u32 window_clamp; /* Maximal window to advertise */
u32 rcv_ssthresh; /* Current window clamp */
u32 frto_highmark; /* snd_nxt when RTO occurred */
u8 reordering; /* Packet reordering metric. */
u8 frto_counter; /* Number of new acks after RTO */
u8 nonagle; /* Disable Nagle algorithm? */
u8 keepalive_probes; /* num of allowed keep alive probes */
/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
u32 mdev; /* medium deviation */
u32 mdev_max; /* maximal mdev for the last rtt period */
u32 rttvar; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */
u32 packets_out; /* Packets which are "in flight" */
u32 retrans_out; /* Retransmitted packets out */
/*
* Options received (usually on last packet, some only on SYN packets).
*/
struct tcp_options_received rx_opt;
/*
* Slow start and congestion control (see also Nagle, and Karn & Partridge)
*/
u32 snd_ssthresh; /* Slow start size threshold */
u32 snd_cwnd; /* Sending congestion window */
u32 snd_cwnd_cnt; /* Linear increase counter */
u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
u32 snd_cwnd_used;
u32 snd_cwnd_stamp;
struct sk_buff_head out_of_order_queue; /* Out of order segments go here */
u32 rcv_wnd; /* Current receiver window */
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
u32 pushed_seq; /* Last pushed seq, required to talk to windows */
/* SACKs data */
struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
struct tcp_sack_block recv_sack_cache[4];
struct sk_buff *highest_sack; /* highest skb with SACK received
* (validity guaranteed only if
* sacked_out > 0)
*/
/* from STCP, retrans queue hinting */
struct sk_buff* lost_skb_hint;
struct sk_buff *scoreboard_skb_hint;
struct sk_buff *retransmit_skb_hint;
struct sk_buff *forward_skb_hint;
int lost_cnt_hint;
int retransmit_cnt_hint;
u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */
u16 advmss; /* Advertised MSS */
u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 lost_out; /* Lost packets */
u32 sacked_out; /* SACK'd packets */
u32 fackets_out; /* FACK'd packets */
u32 high_seq; /* snd_nxt at onset of congestion */
u32 retrans_stamp; /* Timestamp of the last retransmit,
* also used in SYN-SENT to remember stamp of
* the first SYN. */
u32 undo_marker; /* tracking retrans started here. */
int undo_retrans; /* number of undoable retransmissions. */
u32 urg_seq; /* Seq of received urgent pointer */
u16 urg_data; /* Saved octet of OOB data and control flags */
u8 urg_mode; /* In urgent mode */
u8 ecn_flags; /* ECN status bits. */
u32 snd_up; /* Urgent pointer */
u32 total_retrans; /* Total retransmits for entire connection */
u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */
unsigned int keepalive_time; /* time before keep alive takes place */
unsigned int keepalive_intvl; /* time interval between keep alive probes */
int linger2;
unsigned long last_synq_overflow;
u32 tso_deferred;
/* Receiver side RTT estimation */
struct {
u32 rtt;
u32 seq;
u32 time;
} rcv_rtt_est;
/* Receiver queue space */
struct {
int space;
u32 seq;
u32 time;
} rcvq_space;
/* TCP-specific MTU probe information. */
struct {
u32 probe_seq_start;
u32 probe_seq_end;
} mtu_probe;
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
struct tcp_sock_af_ops *af_specific;
/* TCP MD5 Signagure Option information */
struct tcp_md5sig_info *md5sig_info;
#endif
};
With these data structures in mind, let's start on the socket-related source code proper.
First, the usual flow of a TCP server:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
struct sockaddr_in server_address;
struct sockaddr_in client_address;
int server_fd, client_fd;
socklen_t server_len, client_len;

server_fd = socket(AF_INET, SOCK_STREAM, 0);
server_address.sin_family = AF_INET;
server_address.sin_addr.s_addr = inet_addr("192.168.1.1");
server_address.sin_port = htons(54188);
server_len = sizeof(server_address);
bind(server_fd, (struct sockaddr *)&server_address, server_len);
/* Create the socket's listen queue (allowing 10 pending connections)
   and start listening for client connection requests. */
listen(server_fd, 10);
while (1) {
char recv_buf[20];
printf("server is waiting\n");
/* When execution reaches this point a client connection request has
   arrived.  accept() takes the request, clones a socket connected to
   the client, records the client's address ("phone number") in
   client_address, and returns the fd of the new connection. */
client_len = sizeof(client_address);
client_fd = accept(server_fd, (struct sockaddr *)&client_address, &client_len);
/* Use read() and write() to receive a string from the client and echo it back. */
read(client_fd, recv_buf, sizeof(recv_buf));
write(client_fd, recv_buf, sizeof(recv_buf));
printf("received from client= %s\n", recv_buf);
close(client_fd);
}
close(server_fd);
exit(0);
}
In short: socket() creates the server socket, bind() attaches the address structure to it, listen() starts listening for client connection requests, accept() yields an fd for each incoming connection, and because the socket is wired into the VFS that fd can then be used with ordinary read()/write().
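For completeness, here is a minimal client counterpart, a sketch under the assumption that the server above is reachable at 192.168.1.1:54188 (error handling omitted for brevity):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in server_address;
    char buf[20];
    int fd = socket(AF_INET, SOCK_STREAM, 0);   /* same entry point we trace below */

    memset(&server_address, 0, sizeof(server_address));
    server_address.sin_family = AF_INET;
    server_address.sin_addr.s_addr = inet_addr("192.168.1.1");
    server_address.sin_port = htons(54188);

    /* connect() also enters the kernel through the socketcall dispatcher */
    connect(fd, (struct sockaddr *)&server_address, sizeof(server_address));
    write(fd, "hello", 6);
    read(fd, buf, sizeof(buf));
    printf("received from server= %s\n", buf);
    close(fd);
    return 0;
}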
The server calls socket(); the library routine it lands in can be found in the glibc sources:
#include <errno.h>
#include <sys/socket.h>
/* Create a new socket of type TYPE in domain DOMAIN, using
protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically.
Returns a file descriptor for the new socket, or -1 for errors. */
int
__socket (domain, type, protocol)
int domain;
int type;
int protocol;
{
__set_errno (ENOSYS);
return -1;
}
weak_alias (__socket, socket)
stub_warning (socket)
#include <stub-tag.h>
Here weak_alias() declares socket() as a weak alias of __socket(); this file is only the generic stub that returns ENOSYS. Following the architecture-specific socket.S assembly instead, the call goes through system_call(), which indexes sys_call_table and ends up in the kernel's sys_socketcall(). That same entry point also serves bind(), listen(), accept() and the rest of the socket API.
/* Define unique numbers for the operations permitted on socket. Linux
uses a single system call for all these functions. The relevant code
file is /usr/include/linux/net.h.
We cannot use a enum here because the values are used in assembler
code. */
#define SOCKOP_socket 1
#define SOCKOP_bind 2
#define SOCKOP_connect 3
#define SOCKOP_listen 4
#define SOCKOP_accept 5
#define SOCKOP_getsockname 6
#define SOCKOP_getpeername 7
#define SOCKOP_socketpair 8
#define SOCKOP_send 9
#define SOCKOP_recv 10
#define SOCKOP_sendto 11
#define SOCKOP_recvfrom 12
#define SOCKOP_shutdown 13
#define SOCKOP_setsockopt 14
#define SOCKOP_getsockopt 15
#define SOCKOP_sendmsg 16
#define SOCKOP_recvmsg 17
asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
{
int ret;
u32 a[6];
u32 a0, a1;
if (call < SYS_SOCKET || call > SYS_RECVMSG)
return -EINVAL;
if (copy_from_user(a, args, nas[call]))
return -EFAULT;
a0 = a[0];
a1 = a[1];
switch (call) {
case SYS_SOCKET:
ret = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
ret = sys_bind(a0, compat_ptr(a1), a[2]);
break;
case SYS_CONNECT:
ret = sys_connect(a0, compat_ptr(a1), a[2]);
break;
case SYS_LISTEN:
ret = sys_listen(a0, a1);
break;
case SYS_ACCEPT:
ret = sys_accept(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_GETSOCKNAME:
ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_GETPEERNAME:
ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_SOCKETPAIR:
ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
break;
case SYS_SEND:
ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
break;
case SYS_SENDTO:
ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
break;
case SYS_RECV:
ret = sys_recv(a0, compat_ptr(a1), a[2], a[3]);
break;
case SYS_RECVFROM:
ret = sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), compat_ptr(a[5]));
break;
case SYS_SHUTDOWN:
ret = sys_shutdown(a0,a1);
break;
case SYS_SETSOCKOPT:
ret = compat_sys_setsockopt(a0, a1, a[2],
compat_ptr(a[3]), a[4]);
break;
case SYS_GETSOCKOPT:
ret = compat_sys_getsockopt(a0, a1, a[2],
compat_ptr(a[3]), compat_ptr(a[4]));
break;
case SYS_SENDMSG:
ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
break;
case SYS_RECVMSG:
ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
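As an aside, on a 32-bit x86 system all of these calls really are multiplexed through socketcall(2), so the dispatcher can be exercised directly with syscall(2). A hedged sketch (only meaningful on ABIs that define SYS_socketcall; SOCKOP_socket == 1 per the table above):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/syscall.h>

#define SOCKOP_socket 1   /* SYS_SOCKET sub-call number */

int main(void)
{
#ifdef SYS_socketcall
    long args[3] = { AF_INET, SOCK_STREAM, 0 };
    /* equivalent to socket(AF_INET, SOCK_STREAM, 0) on this ABI */
    long fd = syscall(SYS_socketcall, SOCKOP_socket, args);
    printf("socketcall(SYS_SOCKET, ...) returned fd %ld\n", fd);
#else
    puts("this ABI exposes sys_socket() directly instead of socketcall()");
#endif
    return 0;
}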
(The listing above is actually compat_sys_socketcall(), the 32-bit-compat dispatcher; the native sys_socketcall() in net/socket.c has the same switch structure.) For the SOCKOP_socket/SYS_SOCKET call number the dispatcher invokes sys_socket():
asmlinkage long sys_socket(int family, int type, int protocol)
{
int retval;
struct socket *sock;
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
retval = sock_map_fd(sock);
if (retval < 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
sock_release(sock);
return retval;
}
sock_create() builds the socket, then sock_map_fd() hooks it into the VFS and returns the file descriptor (retval) through which it will be managed from now on.
Let's follow sock_create() first; it allocates and initializes the socket structure.
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
The first three parameters of sock_create() are exactly what socket() passed in, and the final struct socket ** parameter receives the result; the function simply forwards to __sock_create():
static int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc(); // allocate the socket structure
if (!sock) {
if (net_ratelimit())
printk(KERN_WARNING "socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type; // record the socket type
#if defined(CONFIG_KMOD)
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (net_families[family] == NULL) // is the protocol-family operations table registered?
request_module("net-pf-%d", family); // if not, try to load the module that provides it
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); // fetch the protocol-family operations table
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
err = pf->create(net, sock, protocol); // call the family's create function
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock; // hand the result back
return 0;
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
rcu_read_unlock();
goto out_sock_release;
}
So the function first allocates a socket structure, then uses the family argument AF_INET (2) to look up the matching protocol-family operations table, and finally calls that table's create function. Let's start with sock_alloc(), which allocates the socket structure together with an inode for the server program.
static struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
inode = new_inode(sock_mnt->mnt_sb); // create an inode on sockfs; the socket is allocated along with it
if (!inode)
return NULL;
sock = SOCKET_I(inode); // get the socket pointer from the inode
inode->i_mode = S_IFSOCK | S_IRWXUGO; // mark the inode as a socket
inode->i_uid = current->fsuid; // owner: current process uid
inode->i_gid = current->fsgid; // group: current process gid
get_cpu_var(sockets_in_use)++;
put_cpu_var(sockets_in_use); // bump the per-CPU sockets_in_use counter
return sock;
}
Here sock_mnt is the mount of the sockfs pseudo-filesystem, so this effectively allocates an inode inside sockfs; the server program can later read and write the socket through that inode. Let's look at new_inode() first.
struct inode *new_inode(struct super_block *sb)
{
/*
* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
* error if st_ino won't fit in target struct field. Use 32bit counter
* here to attempt to avoid that.
*/
static unsigned int last_ino;
struct inode * inode;
spin_lock_prefetch(&inode_lock);
inode = alloc_inode(sb); // goes through the superblock's operations table
if (inode) { // finish setting up the freshly allocated inode
spin_lock(&inode_lock);
inodes_stat.nr_inodes++;
list_add(&inode->i_list, &inode_in_use);
list_add(&inode->i_sb_list, &sb->s_inodes);
inode->i_ino = ++last_ino;
inode->i_state = 0;
spin_unlock(&inode_lock);
}
return inode;
}
Next, the SOCKET_I(inode) helper.
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
This looks a little messy; expanded for our case it is equivalent to:
#define container_of(inode, struct socket_alloc, vfs_inode) ({ \
const typeof( ((struct socket_alloc *)0)->vfs_inode ) *__mptr = (inode); \
(struct socket_alloc *)( (char *)__mptr - offsetof(struct socket_alloc,vfs_inode) );})
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
#define OFFSETOF(strct, elem) ((long)&(((struct strct *)NULL)->elem))
Now it is simple: offsetof gives the byte offset of a member inside its structure, here the offset of vfs_inode inside struct socket_alloc. Subtracting that offset from the inode pointer (the address of the embedded vfs_inode) yields the address of the enclosing socket_alloc, which is also the address of its first member, the socket. The macro is pure pointer arithmetic; the actual socket_alloc allocation happens under new_inode().
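The same pointer arithmetic can be reproduced in a few lines of user-space C. A sketch with a simplified container_of (the kernel's version adds type checking) and a made-up socket_alloc_demo structure standing in for socket_alloc:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct socket_alloc_demo {          /* stand-in for struct socket_alloc */
    int socket;                     /* plays the role of struct socket  */
    int vfs_inode;                  /* plays the role of struct inode   */
};

int main(void)
{
    struct socket_alloc_demo ei;
    int *inode = &ei.vfs_inode;     /* all we are given is the embedded member */

    /* step back by offsetof() to recover the enclosing structure */
    struct socket_alloc_demo *back =
        container_of(inode, struct socket_alloc_demo, vfs_inode);

    printf("recovered container: %p, original: %p\n",
           (void *)back, (void *)&ei);
    return 0;
}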
Back in new_inode(), the allocation itself is done by alloc_inode(), which calls into the superblock's operations table.
static struct inode *alloc_inode(struct super_block *sb)
{
static const struct address_space_operations empty_aops;
static struct inode_operations empty_iops;
static const struct file_operations empty_fops;
struct inode *inode;
if (sb->s_op->alloc_inode)
inode = sb->s_op->alloc_inode(sb);
else
inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
if (inode) {
...
}
return inode;
}
By this point sb->s_op has already been set up during sock_init(): get_sb_pseudo() installed sockfs_ops, so the call here is sockfs_ops->alloc_inode.
static struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
.destroy_inode =sock_destroy_inode,
.statfs = simple_statfs,
};
Looking at sockfs_ops, alloc_inode is sock_alloc_inode(), which allocates the socket_alloc structure.
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); // allocate a socket_alloc from the slab cache
if (!ei)
return NULL;
init_waitqueue_head(&ei->socket.wait); // initialize the wait-queue head
// initialize the embedded socket
ei->socket.fasync_list = NULL;
ei->socket.state = SS_UNCONNECTED; // state: not connected yet
ei->socket.flags = 0;
ei->socket.ops = NULL;
ei->socket.sk = NULL;
ei->socket.file = NULL;
return &ei->vfs_inode;
}
This does the memory allocation plus the basic initialization of the socket. kmem_cache_alloc() takes the object straight from the sock_inode_cachep slab cache, which was created by init_inodecache() during sock_init().
(On the two slab functions: kmem_cache_create() creates a dedicated object cache and kmem_cache_alloc() hands out objects from it; a short sketch follows.)
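A minimal sketch of the two slab calls, in the style of a 2.6-era module. "demo_cache" and struct demo_obj are made up for illustration, and the ctor argument is left NULL because its prototype changed several times across 2.6 releases:

#include <linux/slab.h>
#include <linux/errno.h>

struct demo_obj {
    int value;
};

static struct kmem_cache *demo_cachep;

static int demo_cache_setup(void)
{
    /* create a dedicated slab cache, analogous to sock_inode_cachep */
    demo_cachep = kmem_cache_create("demo_cache", sizeof(struct demo_obj),
                                    0, SLAB_HWCACHE_ALIGN, NULL);
    if (!demo_cachep)
        return -ENOMEM;
    return 0;
}

static struct demo_obj *demo_obj_alloc(void)
{
    /* grab one object from the cache, as sock_alloc_inode() does */
    return kmem_cache_alloc(demo_cachep, GFP_KERNEL);
}

static void demo_obj_free(struct demo_obj *obj)
{
    kmem_cache_free(demo_cachep, obj);
}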
Back in __sock_create(): net_families[2] is checked against NULL to see whether the AF_INET protocol-family operations table has been registered (this happens during kernel initialization). The relevant registration path is:
inet_init -> fs_initcall(inet_init);
#define fs_initcall(fn) __define_initcall("5",fn,5)
static int __init inet_init(void)
{
...
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
...
}
int sock_register(const struct net_proto_family *ops)
{
int err;
if (ops->family >= NPROTO) {
printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
NPROTO);
return -ENOBUFS;
}
spin_lock(&net_family_lock);
if (net_families[ops->family])
err = -EEXIST;
else {
net_families[ops->family] = ops;
err = 0;
}
spin_unlock(&net_family_lock);
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
return err;
}
static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
So sock_register() stores inet_family_ops into net_families[PF_INET]; and PF_INET is simply AF_INET:
#define PF_INET AF_INET
Continuing in __sock_create(), the create function of the registered family table inet_family_ops is invoked, i.e. inet_create():
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
struct sock *sk;
struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
// check the socket type and make sure the hash secret exists
if (sock->type != SOCK_RAW && // not a raw socket
sock->type != SOCK_DGRAM && // not a datagram (UDP) socket
!inet_ehash_secret)
build_ehash_secret();
The type passed down from socket() is SOCK_STREAM, so unless the hash secret has already been generated, build_ehash_secret() is called to set it up:
void build_ehash_secret(void)
{
u32 rnd;
do {
get_random_bytes(&rnd, sizeof(rnd)); // get a non-zero random number
} while (rnd == 0);
spin_lock_bh(&inetsw_lock);
if (!inet_ehash_secret)
inet_ehash_secret = rnd; // use the random number as the hash secret
spin_unlock_bh(&inetsw_lock);
}
Back in inet_create(), note the variable struct inet_protosw *answer. inet_protosw describes the socket-level interface of an IP protocol: the information each IP protocol exposes towards the socket layer lives in one of these structures, one per protocol.
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2): the socket type */
unsigned short protocol; /* This is the L4 protocol number. */
struct proto *prot; /* pointer to the protocol's proto structure */
const struct proto_ops *ops; /* pointer to the protocol's operations table */
int capability; /* Which (if any) capability do
* we need to use this socket
* interface?
*/
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
Continuing with inet_create():
sock->state = SS_UNCONNECTED; // mark the socket as not connected
/* Look for the requested type/protocol pair. */
answer = NULL;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock(); // RCU read lock; suited to read-mostly data
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
/* Check the non-wild match: does the requested protocol match a registered one? */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases (the IPPROTO_IP wildcard). */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
answer = NULL;
}
if (unlikely(answer == NULL)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM): first try the fully qualified module name
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP): otherwise fall back to the generic module name
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
err = -EAFNOSUPPORT;
if (!inet_netns_ok(net, protocol))
goto out_rcu_unlock;
Everything between rcu_read_lock and rcu_read_unlock is an RCU read-side critical section.
#define list_for_each_rcu(pos, head) \
for (pos = rcu_dereference((head)->next); \
prefetch(pos->next), pos != (head); \
pos = rcu_dereference(pos->next))
#define rcu_dereference(p) ({ \
typeof(p) _________p1 = ACCESS_ONCE(p); \
smp_read_barrier_depends(); \
(_________p1); \
})
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
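The macros above implement the read side of RCU. For orientation, here is a hedged sketch of the overall pattern with a hypothetical demo_item list (more recent kernels spell the loop list_for_each_entry_rcu): readers run lock-free inside rcu_read_lock(), while writers serialize among themselves with a spinlock and publish with list_add_rcu(), just as inet_register_protosw() does below with inetsw_lock:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_item {
    int key;
    struct list_head list;
};

static LIST_HEAD(demo_list);
static DEFINE_SPINLOCK(demo_lock);

static struct demo_item *demo_lookup(int key)
{
    struct demo_item *it, *found = NULL;

    rcu_read_lock();                    /* read-side critical section */
    list_for_each_entry_rcu(it, &demo_list, list) {
        if (it->key == key) {
            found = it;
            break;
        }
    }
    rcu_read_unlock();
    /* in real code the object must not be used after rcu_read_unlock()
     * unless it is reference-counted; simplified here */
    return found;
}

static void demo_publish(struct demo_item *it)
{
    spin_lock(&demo_lock);              /* writers exclude each other */
    list_add_rcu(&it->list, &demo_list);
    spin_unlock(&demo_lock);
}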
The list_for_each_rcu macro thus walks, under RCU protection, the inetsw[] queue that matches the requested socket type; the entries are inet_protosw structures. The inetsw[] queues themselves are populated during inet_init():
static int __init inet_init(void)
{
struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
...
(void)sock_register(&inet_family_ops);
...
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
...
}
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM, // TCP stream protocol
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM, // UDP datagram protocol
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_RAW, // raw socket
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.capability = CAP_NET_RAW,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
As inet_init() shows, each element of this array is registered with inet_register_protosw().
static struct list_head inetsw[SOCK_MAX];
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;
spin_lock_bh(&inetsw_lock);
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) {
if (protocol == answer->protocol)
break;
last_perm = lh;
}
answer = NULL;
}
if (answer)
goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
synchronize_net();
return;
out_permanent:
printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
protocol);
goto out;
out_illegal:
printk(KERN_ERR
"Ignoring attempt to register invalid socket type %d.\n",
p->type);
goto out;
}
The function loops over inetsw[p->type] with list_for_each; if an already-registered permanent entry (INET_PROTOSW_PERMANENT) uses the same protocol, registration is refused, otherwise p->list is linked in after the last permanent entry.
So inet_init() links every element of inetsw_array into the per-type queues of inetsw[].
That wraps up registration; back to inet_create().
Recall server_fd = socket(AF_INET, SOCK_STREAM, 0): protocol is 0 and type is SOCK_STREAM, so answer ends up pointing at TCP's inet_protosw entry. Since the requested protocol IPPROTO_IP (0) differs from that entry's protocol, the wildcard branch applies:
IPPROTO_IP = 0, /* Dummy protocol for TCP. */
#define IPPROTO_IP IPPROTO_IP
so protocol = answer->protocol, i.e. protocol becomes TCP's protocol number 6. The entry's capability is -1, so no capability check applies, and inet_netns_ok() then confirms the protocol may be used in this network namespace.
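A quick user-space check of this wildcard behaviour (SO_PROTOCOL requires a newer kernel than the 2.6 series analysed here, so treat this as a hedged aside): passing 0 or IPPROTO_TCP yields the same kind of socket.

#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    /* protocol 0 (IPPROTO_IP) is the wildcard: inet_create() resolves it to
     * the matching entry's own protocol, so both calls give a TCP socket. */
    int a = socket(AF_INET, SOCK_STREAM, 0);
    int b = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    int proto_a = 0, proto_b = 0;
    socklen_t len = sizeof(int);

    getsockopt(a, SOL_SOCKET, SO_PROTOCOL, &proto_a, &len);
    len = sizeof(int);
    getsockopt(b, SOL_SOCKET, SO_PROTOCOL, &proto_b, &len);
    printf("proto_a=%d proto_b=%d (IPPROTO_TCP=%d)\n",
           proto_a, proto_b, IPPROTO_TCP);
    return 0;
}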
Continuing with inet_create():
sock->ops = answer->ops; // inet_stream_ops
answer_prot = answer->prot; // tcp_prot
answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
BUG_TRAP(answer_prot->slab != NULL);
err = -ENOBUFS;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); // allocate the sock structure
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
if (SOCK_RAW == sock->type) {
inet->num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; // install the backlog-processing function
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
sk_refcnt_debug_inc(sk);
if (inet->num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->sport = htons(inet->num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk); // call the transport-layer init hook; for tcp_prot this is tcp_v4_init_sock()
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
So the TCP operations table inet_stream_ops is hooked onto the socket's ops, and answer->prot is stored in answer_prot and passed as a parameter to sk_alloc(). (Roughly: struct proto_ops faces the socket layer, struct proto faces the transport layer.)
Let's look at sk_alloc(); the prot parameter here is answer->prot, i.e. tcp_prot.
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
sock_net_set(sk, get_net(net));
}
return sk;
}
sk_prot_alloc() allocates the generic sock structure:
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
struct sock *sk;
struct kmem_cache *slab;
slab = prot->slab;
if (slab != NULL)
sk = kmem_cache_alloc(slab, priority); // slab allocation from the protocol's own sock cache
else
sk = kmalloc(prot->obj_size, priority); // otherwise allocate from the general-purpose caches
if (sk != NULL) {
if (security_sk_alloc(sk, family, priority))
goto out_free;
if (!try_module_get(prot->owner))
goto out_free_sec;
}
return sk;
out_free_sec:
security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
else
kfree(sk);
return NULL;
}
Whether the sock comes from the protocol's dedicated slab cache or from the general-purpose allocator depends on whether prot provides a slab cache.
Once allocated, the family is recorded, tcp_prot is stored in both sk_prot and sk_prot_creator, and sock_lock_init() initializes sk_lock, the lock that synchronizes access to the sock. sk_lock has type socket_lock_t, a lock dedicated to sockets:
typedef struct {
spinlock_t slock;
int owned;
wait_queue_head_t wq;
/*
* We express the mutex-alike socket_lock semantics
* to the lock validator by explicitly managing
* the slock as a lock variant (in addition to
* the slock itself):
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
} socket_lock_t;
It contains a spinlock slock and a wait-queue head wq. sock_lock_init_class_and_name initializes them:
static inline void sock_lock_init(struct sock *sk)
{
sock_lock_init_class_and_name(sk,
af_family_slock_key_strings[sk->sk_family],
af_family_slock_keys + sk->sk_family,
af_family_key_strings[sk->sk_family],
af_family_keys + sk->sk_family);
}
#define sock_lock_init_class_and_name(sk, sname, skey, name, key) \
do { \
sk->sk_lock.owned = 0; \
init_waitqueue_head(&sk->sk_lock.wq); \
spin_lock_init(&(sk)->sk_lock.slock); \
debug_check_no_locks_freed((void *)&(sk)->sk_lock, \
sizeof((sk)->sk_lock)); \
lockdep_set_class_and_name(&(sk)->sk_lock.slock, \
(skey), (sname)); \
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
} while (0)
Back in sk_alloc(): the net parameter is current->nsproxy->net_ns, the network namespace recorded in the current process. sock_net_set(sk, get_net(net)) records the owning namespace in the sock, and get_net(net) bumps the namespace's reference count.
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
#ifdef CONFIG_NET_NS
sk->sk_net = net;
#endif
}
static inline struct net *get_net(struct net *net)
{
atomic_inc(&net->count);
return net;
}
Back in inet_create(): sk_alloc() has allocated and initialized the sock (if it returned NULL we simply bail out).
Next, inet = inet_sk(sk) turns the sock pointer into a struct inet_sock pointer:
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
return (struct inet_sock *)sk;
}
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo *pinet6;
#endif
/* Socket demultiplex comparisons on incoming packets. */
__be32 daddr; // destination address
__be32 rcv_saddr; // locally bound address, used for hash lookups
__be16 dport; // destination port
__u16 num; // local port (host byte order)
__be32 saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
struct ip_options *opt;
__be16 sport;
__u16 id;
__u8 tos;
__u8 mc_ttl;
__u8 pmtudisc;
__u8 recverr:1,
is_icsk:1,
freebind:1,
hdrincl:1,
mc_loop:1;
int mc_index;
__be32 mc_addr;
struct ip_mc_socklist *mc_list;
struct {
unsigned int flags;
unsigned int fragsize;
struct ip_options *opt;
struct dst_entry *dst;
int length; /* Total length of all frames */
__be32 addr;
struct flowi fl;
} cork;
};
This is the INET-specific part of the socket.
Next, sock_init_data(sock, sk) finishes initializing the newly allocated sock and wires socket and sock to each other:
void sock_init_data(struct socket *sock, struct sock *sk)
{ /* these queues are not plain list_heads but sk_buff_head queues: */
skb_queue_head_init(&sk->sk_receive_queue); // initialize the receive queue
skb_queue_head_init(&sk->sk_write_queue); // initialize the send queue
skb_queue_head_init(&sk->sk_error_queue); // initialize the error-packet queue
#ifdef CONFIG_NET_DMA
skb_queue_head_init(&sk->sk_async_wait_queue); // queue of packets copied by DMA
#endif
sk->sk_send_head = NULL; // head of the packets to transmit
init_timer(&sk->sk_timer); // initialize the sock's timer
sk->sk_allocation = GFP_KERNEL; // allocation mode; may sleep when memory is tight
sk->sk_rcvbuf = sysctl_rmem_default; // receive buffer size (system default)
sk->sk_sndbuf = sysctl_wmem_default; // send buffer size (system default)
sk->sk_state = TCP_CLOSE;
sk->sk_socket = sock; // point at the owning socket
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
sk->sk_sleep = &sock->wait;
sock->sk = sk; // and let the socket point back at the sock
} else
sk->sk_sleep = NULL;
rwlock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
sk->sk_peercred.pid = 0;
sk->sk_peercred.uid = -1;
sk->sk_peercred.gid = -1;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = ktime_set(-1L, 0);
atomic_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
Note the three important packet queues initialized here; their heads are sk_buff_head structures:
struct sk_buff_head {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen;
spinlock_t lock;
};
So it is a doubly linked queue, with qlen holding the queue length and lock protecting concurrent access.
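A hedged sketch of the sk_buff_head helpers used throughout this code (hypothetical usage, error handling trimmed):

#include <linux/skbuff.h>

static struct sk_buff_head demo_queue;

static void demo_queue_usage(void)
{
    struct sk_buff *skb;

    skb_queue_head_init(&demo_queue);     /* next = prev = &queue, qlen = 0 */

    skb = alloc_skb(128, GFP_KERNEL);     /* allocate a small packet buffer */
    if (!skb)
        return;

    skb_queue_tail(&demo_queue, skb);     /* enqueue; takes queue->lock internally */

    skb = skb_dequeue(&demo_queue);       /* dequeue from the head, NULL if empty */
    if (skb)
        kfree_skb(skb);                   /* drop our reference */
}

Returning to inet_create(): the proto attached to the new sock was tcp_prot, whose operations table is listed next.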
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.backlog_rcv = tcp_v4_do_rcv,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
};
Then sk->sk_prot->init(sk) is called, i.e. tcp_prot's init hook, which is tcp_v4_init_sock():
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
tp->snd_cwnd = 2;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_af_ops = &ipv4_specific;
icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
tp->af_specific = &tcp_sock_ipv4_specific;
#endif
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
atomic_inc(&tcp_sockets_allocated);
return 0;
}
inet_connection_sock should look familiar: it is the first member of tcp_sock. The rest of the function is plain initialization, and finally tcp_sockets_allocated is incremented.
struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock icsk_inet; // the INET-family inet_sock
struct request_sock_queue icsk_accept_queue; // queue of connections waiting to be accepted
struct inet_bind_bucket *icsk_bind_hash; // bind bucket this sock is attached to
unsigned long icsk_timeout; // timeout
struct timer_list icsk_retransmit_timer; // retransmit timer (fires when no ACK arrives)
struct timer_list icsk_delack_timer; // delayed-ACK timer
__u32 icsk_rto; // retransmission timeout
__u32 icsk_pmtu_cookie; // last observed PMTU
const struct tcp_congestion_ops *icsk_ca_ops; // congestion-control operations
const struct inet_connection_sock_af_ops *icsk_af_ops; // AF_INET-specific operations table
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); // hook to resync the MSS
__u8 icsk_ca_state; // congestion-control state
__u8 icsk_retransmits; // number of retransmissions
__u8 icsk_pending; // pending timer events
__u8 icsk_backoff; // exponential backoff counter
__u8 icsk_syn_retries; // allowed number of SYN retries
__u8 icsk_probes_out; // unanswered window probes sent
__u16 icsk_ext_hdr_len; // length of network-protocol extension headers
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock */
__u32 ato; /* Predicted tick of soft clock */
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
struct {
int enabled;
/* Range of MTUs to search */
int search_high;
int search_low;
/* Information on the current probe. */
int probe_size;
} icsk_mtup;
u32 icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
};
At this point the socket has been created and initialized: the socket's state is SS_UNCONNECTED and the sock's sk_state is TCP_CLOSE. Back in sys_socket(), the next step is retval = sock_map_fd(sock).
int sock_map_fd(struct socket *sock)
{
struct file *newfile;
int fd = sock_alloc_fd(&newfile); // allocate a file descriptor and a file structure for the socket
if (likely(fd >= 0)) {
int err = sock_attach_fd(sock, newfile); // attach the socket to the file structure
if (unlikely(err < 0)) { // on error, release the file and the descriptor
put_filp(newfile);
put_unused_fd(fd);
return err;
}
fd_install(fd, newfile); // wire the file structure to the descriptor
}
return fd;
}
First sock_alloc_fd() obtains the file structure and the descriptor:
static int sock_alloc_fd(struct file **filep)
{
int fd;
fd = get_unused_fd(); // grab a free descriptor
if (likely(fd >= 0)) {
struct file *file = get_empty_filp(); // allocate an empty file structure
*filep = file;
if (unlikely(!file)) {
put_unused_fd(fd);
return -ENFILE;
}
} else
*filep = NULL;
return fd;
}
These are ordinary VFS operations: a free fd is taken from the current process and an empty file structure is allocated; if either step fails, whatever was obtained is released again.
With both in hand, sock_attach_fd() is executed:
static int sock_attach_fd(struct socket *sock, struct file *file)
{
struct dentry *dentry;
struct qstr name = { .name = "" };
dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); // allocate a dentry on sockfs (sock_mnt is its vfsmount)
if (unlikely(!dentry))
return -ENOMEM;
dentry->d_op = &sockfs_dentry_operations; // install sockfs's dentry operations on the new dentry
/*
* We dont want to push this dentry into global dentry hash table.
* We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
* This permits a working /proc/$pid/fd/XXX on sockets
*/
dentry->d_flags &= ~DCACHE_UNHASHED;
d_instantiate(dentry, SOCK_INODE(sock));
sock->file = file;
init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
&socket_file_ops); // initialize the file structure, passing the socket_file_ops operations table
SOCK_INODE(sock)->i_fop = &socket_file_ops;
file->f_flags = O_RDWR;
file->f_pos = 0;
file->private_data = sock; // so the socket can later be found from the file via private_data
return 0;
}
Here is sockfs_dentry_operations, sockfs's dentry operations table:
static struct dentry_operations sockfs_dentry_operations = {
.d_delete = sockfs_delete_dentry,
.d_dname = sockfs_dname,
};
And socket_file_ops, the file operations table:
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.aio_read = sock_aio_read,
.aio_write = sock_aio_write,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
};
So when we read()/write() a socket, the call is mapped through this table onto the socket-specific handlers; from the user's point of view the socket behaves just like any other file.
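A small user-space illustration: on a connected socket fd, read()/write() (routed through socket_file_ops, i.e. sock_aio_read/sock_aio_write in this kernel) and recv()/send() with flags == 0 (routed through sys_socketcall) behave the same.

#include <unistd.h>
#include <sys/socket.h>

static void echo_once(int connected_fd)
{
    char buf[64];
    ssize_t n;

    n = recv(connected_fd, buf, sizeof(buf), 0);  /* same as read(fd, buf, sizeof(buf)) */
    if (n > 0)
        write(connected_fd, buf, (size_t)n);      /* same as send(fd, buf, n, 0) */
}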
Back to the demo server: after socket() succeeds, bind(server_fd, (struct sockaddr *)&server_address, server_len) binds the address to the socket.
Following its implementation: bind also goes through sys_socketcall(), and the SYS_BIND case dispatches to sys_bind():
asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
struct socket *sock;
char address[MAX_SOCK_ADDR];
int err, fput_needed;
sock = sockfd_lookup_light(fd, &err, &fput_needed); // find the socket from the fd
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, address); // copy the address from user space into the kernel
if (err >= 0) {
err = security_socket_bind(sock,
(struct sockaddr *)address,
addrlen);
if (!err)
err = sock->ops->bind(sock,
(struct sockaddr *)
address, addrlen); // call the protocol's bind, here inet_stream_ops->bind()
}
fput_light(sock->file, fput_needed);
}
return err;
}
Let's look at sockfd_lookup_light():
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
struct file *file;
struct socket *sock;
*err = -EBADF;
file = fget_light(fd, fput_needed); // get the file pointer from the fd
if (file) {
sock = sock_from_file(file, err); // get the socket pointer from the file
if (sock)
return sock;
fput_light(file, *fput_needed);
}
return NULL;
}
fget_light()/fput_light() are file-table helpers: fget_light() looks the file pointer up in the current process's files_struct (taking a reference where needed) and fput_light() drops it; once the socket is found it is returned directly. The interesting part here is sock_from_file():
static struct socket *sock_from_file(struct file *file, int *err)
{
if (file->f_op == &socket_file_ops)
return file->private_data; /* set in sock_map_fd */
*err = -ENOTSOCK;
return NULL;
}
As noted earlier, file->private_data stores the socket pointer. So sockfd_lookup_light() hands back the socket created earlier, and move_addr_to_kernel() copies the address into kernel space:
int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
{
if (ulen < 0 || ulen > MAX_SOCK_ADDR)
return -EINVAL;
if (ulen == 0)
return 0;
if (copy_from_user(kaddr, uaddr, ulen))
return -EFAULT;
return audit_sockaddr(ulen, kaddr);
}
security_socket_bind() belongs to the security framework and simply returns 0 when no security module is configured. Then sock->ops->bind() is called; since socket->ops was set to answer->ops in inet_create(), this is inet_stream_ops->bind:
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = tcp_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
.splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
From this table, .bind is inet_bind():
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
unsigned short snum;
int chk_addr_ret;
int err;
/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len); // if the protocol supplies its own bind, use it; sk->sk_prot here is tcp_prot
goto out;
}
err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); // classify the address via the routing tables
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your ISDN link
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
if (!sysctl_ip_nonlocal_bind &&
!inet->freebind &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL && // not a local unicast address
chk_addr_ret != RTN_MULTICAST && // not multicast
chk_addr_ret != RTN_BROADCAST) // not broadcast
goto out;
snum = ntohs(addr->sin_port); // extract the port number
err = -EACCES;
if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
* In the BSD API these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
lock_sock(sk); // lock the sock
/* Check these errors (active socket, double bind). */
err = -EINVAL;
if (sk->sk_state != TCP_CLOSE || inet->num) // check the state and whether a port is already set
goto out_release_sock;
inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; // rcv_saddr is used for hash lookups, saddr for transmission (both set to the IP address)
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) { // inet_csk_get_port()
inet->saddr = inet->rcv_saddr = 0; // on failure, clear the addresses just set
err = -EADDRINUSE;
goto out_release_sock;
}
if (inet->rcv_saddr) // an address has been set: record that it is locked in
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum) // a port has been set: record that it is locked in
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->sport = htons(inet->num); // record the source port
inet->daddr = 0; // clear the destination address
inet->dport = 0; // clear the destination port
sk_dst_reset(sk); // drop any cached route
err = 0;
out_release_sock:
release_sock(sk); // unlock
out:
return err;
}
sk->sk_prot is tcp_prot, and that structure defines no .bind, so execution simply falls through. Two data structures show up here, sockaddr_in and sockaddr:
struct sockaddr_in {
sa_family_t sin_family; /* Address family */
__be16 sin_port; /* Port number */
struct in_addr sin_addr; /* Internet address */
/* Pad to size of `struct sockaddr'. */
unsigned char __pad[__SOCK_SIZE__ - sizeof(short int) -
sizeof(unsigned short int) - sizeof(struct in_addr)];
};
struct sockaddr {
sa_family_t sa_family; /* Address family */
char sa_data[14]; /* 14 bytes of address data */
};
Because the two structures have the same size and a compatible layout, they can be cast into each other; for compatibility, the sockaddr passed in is cast back to sockaddr_in inside inet_bind().
sock_net(sk) returns sk->sk_net, which is the default init_net namespace unless the process has its own; inet_addr_type() then classifies the address:
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
return __inet_dev_addr_type(net, NULL, addr);
}
static inline unsigned __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
__be32 addr)
{
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
struct fib_result res;
unsigned ret = RTN_BROADCAST;
struct fib_table *local_table;
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) // zero-network or limited-broadcast address?
return RTN_BROADCAST;
if (ipv4_is_multicast(addr)) // multicast address?
return RTN_MULTICAST;
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
#endif
local_table = fib_get_table(net, RT_TABLE_LOCAL); // look up the local routing table
if (local_table) {
ret = RTN_UNICAST;
if (!local_table->tb_lookup(local_table, &fl, &res)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
fib_res_put(&res);
}
}
return ret;
}
The struct flowi that appears here is the routing lookup key. flowi.nl_u is a union of the ip4_u, ip6_u and dn_u structures, so struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } } simply stores the IP address as the key's destination address. Here is the flowi structure:
struct flowi { // routing key
int oif; // output interface index
int iif; // input interface index
__u32 mark; // packet mark (e.g. set by netfilter)
union {
struct {
__be32 daddr; // destination address
__be32 saddr; // source address
__u8 tos; // type of service (TOS)
__u8 scope; // scope
} ip4_u;
struct {
struct in6_addr daddr;
struct in6_addr saddr;
__be32 flowlabel;
} ip6_u;
struct {
__le16 daddr;
__le16 saddr;
__u8 scope;
} dn_u;
} nl_u; // network-layer part of the key
#define fld_dst nl_u.dn_u.daddr
#define fld_src nl_u.dn_u.saddr
#define fld_scope nl_u.dn_u.scope
#define fl6_dst nl_u.ip6_u.daddr
#define fl6_src nl_u.ip6_u.saddr
#define fl6_flowlabel nl_u.ip6_u.flowlabel
#define fl4_dst nl_u.ip4_u.daddr
#define fl4_src nl_u.ip4_u.saddr
#define fl4_tos nl_u.ip4_u.tos
#define fl4_scope nl_u.ip4_u.scope
__u8 proto; // transport-layer protocol
__u8 flags; // flags
union {
struct {
__be16 sport; // source port
__be16 dport; // destination port
} ports;
struct {
__u8 type;
__u8 code;
} icmpt; // ICMP type/code
struct {
__le16 sport;
__le16 dport;
} dnports;
__be32 spi;
struct {
__u8 type;
} mht;
} uli_u; // transport-layer part of the key
#define fl_ip_sport uli_u.ports.sport
#define fl_ip_dport uli_u.ports.dport
#define fl_icmp_type uli_u.icmpt.type
#define fl_icmp_code uli_u.icmpt.code
#define fl_ipsec_spi uli_u.spi
#define fl_mh_type uli_u.mht.type
__u32 secid; /* used by xfrm; see secid.txt */
} __attribute__((__aligned__(BITS_PER_LONG/8)));
struct fib_result holds the result of a route lookup, and struct fib_table is a routing table. The function first checks whether addr is the zero network, the limited broadcast address, or a multicast address:
static inline bool ipv4_is_zeronet(__be32 addr)
{
return (addr & htonl(0xff000000)) == htonl(0x00000000);
}
So a zero-network address is one whose top 8 bits are zero.
static inline bool ipv4_is_lbcast(__be32 addr)
{
/* limited broadcast */
return addr == htonl(INADDR_BROADCAST);
}
#define INADDR_BROADCAST ((unsigned long int) 0xffffffff)
An address of all ones is the limited broadcast address.
static inline bool ipv4_is_multicast(__be32 addr)
{
return (addr & htonl(0xf0000000)) == htonl(0xe0000000);
}
An address whose top 4 bits are 1110 is a multicast address.
Zero-network and broadcast addresses return RTN_BROADCAST directly, multicast returns RTN_MULTICAST; anything else requires a lookup in the local routing table, whose result is returned.
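The three tests can be replayed in user space. A small sketch that classifies a few sample addresses the same way (network byte order, like the kernel helpers):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static const char *classify(uint32_t addr)
{
    if ((addr & htonl(0xff000000)) == htonl(0x00000000))
        return "zeronet           -> RTN_BROADCAST";
    if (addr == htonl(0xffffffff))
        return "limited broadcast -> RTN_BROADCAST";
    if ((addr & htonl(0xf0000000)) == htonl(0xe0000000))
        return "multicast         -> RTN_MULTICAST";
    return "needs a routing-table lookup";
}

int main(void)
{
    const char *samples[] = { "0.0.0.1", "255.255.255.255", "224.0.0.1", "192.168.1.1" };
    for (int i = 0; i < 4; i++)
        printf("%-15s : %s\n", samples[i], classify(inet_addr(samples[i])));
    return 0;
}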
fib_get_table() has two implementations, selected by CONFIG_IP_MULTIPLE_TABLES; let's take the simpler single-table one.
static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct hlist_head *ptr;
ptr = id == RT_TABLE_LOCAL ?
&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
return hlist_entry(ptr->first, struct fib_table, tb_hlist);
}
The net passed in is sock_net(sk), i.e. the default init_net, and id is RT_TABLE_LOCAL. net->ipv4 is a netns_ipv4 structure carrying IPv4's per-namespace state:
struct netns_ipv4 {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr;
struct ctl_table_header *frags_hdr;
struct ctl_table_header *ipv4_hdr;
#endif
struct ipv4_devconf *devconf_all;
struct ipv4_devconf *devconf_dflt;
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops;
#endif
struct hlist_head *fib_table_hash;
struct sock *fibnl;
struct sock **icmp_sk;
struct sock *tcp_sock;
struct netns_frags frags;
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
struct xt_table *iptable_mangle;
struct xt_table *iptable_raw;
struct xt_table *arptable_filter;
#endif
int sysctl_icmp_echo_ignore_all;
int sysctl_icmp_echo_ignore_broadcasts;
int sysctl_icmp_ignore_bogus_error_responses;
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_errors_use_inbound_ifaddr;
};
Every IPv4 routing table is linked into the fib_table_hash array; each array element is an hlist_head queue, and each table hangs off the matching queue via its tb_hlist member.
[TODO] https://blog.csdn.net/panxj856856/article/details/87981937
If the local table is found, its lookup method local_table->tb_lookup(local_table, &fl, &res) fills a struct fib_result for the key fl; dev was passed down as NULL, so ret is set to res.type and returned.
Back in inet_bind(): snum = ntohs(addr->sin_port) extracts the port; ports below 1024 (PROT_SOCK) are reserved by the system, so binding one requires CAP_NET_BIND_SERVICE. Then the sock state and any already-assigned port are checked, and the IP address is stored in both the receive address and the source address of the inet_sock; for multicast and broadcast addresses the source address is cleared while the receive address is kept.
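A quick user-space demonstration of the privileged-port rule mentioned above (EACCES is expected when run without root or CAP_NET_BIND_SERVICE):

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in addr;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(80);   /* < PROT_SOCK (1024): needs CAP_NET_BIND_SERVICE */

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
        printf("bind(:80) failed: %s\n", strerror(errno));  /* EACCES when unprivileged */
    else
        puts("bind(:80) succeeded (running as root or with CAP_NET_BIND_SERVICE)");

    close(fd);
    return 0;
}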
Then tcp_prot->get_port, i.e. inet_csk_get_port(), checks whether the port may be bound:
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; // tcp_prot->h.hashinfo, i.e. tcp_hashinfo
struct inet_bind_hashbucket *head;
struct hlist_node *node;
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk); // network namespace of the sock
The function is long, so let's take it piece by piece. First hashinfo, an inet_hashinfo pointer obtained through tcp_prot.h.hashinfo, i.e. tcp_hashinfo. inet_hashinfo gathers a protocol's binding hash tables:
struct inet_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
*
* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
*
* TIME_WAIT sockets use a separate chain (twchain).
*/ // connected socks are linked into this hash; it has two chains: one for established socks and one (twchain) for socks in TIME_WAIT
struct inet_ehash_bucket *ehash; // hash of established connections
rwlock_t *ehash_locks; // locks for the chains
unsigned int ehash_size; // number of chains
unsigned int ehash_locks_mask; // lock mask
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
struct inet_bind_hashbucket *bhash; // hash buckets managing bound port numbers
unsigned int bhash_size; // number of buckets
/* Note : 4 bytes padding on 64 bit arches */
/* All sockets in TCP_LISTEN state will be in here. This is the only
* table where wildcard'd TCP sockets can exist. Hash function here
* is just local port number.
*/
struct hlist_head listening_hash[INET_LHTABLE_SIZE]; // listening hash queues
/* All the above members are written once at bootup and
* never written again _or_ are predominantly read-access.
*
* Now align to a new cache line as all the following members
* are often dirty.
*/
rwlock_t lhash_lock ____cacheline_aligned;
atomic_t lhash_users;
wait_queue_head_t lhash_wait; // wait-queue head
struct kmem_cache *bind_bucket_cachep; // slab cache for bind buckets
};
So this structure maintains the INET family's hash tables.
Another data structure also appears:
struct inet_bind_hashbucket { // hash bucket
spinlock_t lock; // spinlock
struct hlist_head chain; // chain of bind buckets
};
It is a hash bucket protected by a spinlock; chain is the bucket's hash chain.
Next comes struct hlist_node *node, a hash-chain node that gets linked into an hlist_head.
One line further there is struct inet_bind_bucket *tb:
struct inet_bind_bucket { // bind bucket
struct net *ib_net; // network namespace
unsigned short port; // port number
signed short fastreuse; // whether the port may be quickly reused
struct hlist_node node; // node linking this bucket into the hashbucket's chain
struct hlist_head owners; // queue of socks bound to this port
};
Each inet_bind_bucket is linked into an inet_bind_hashbucket.
Continuing with inet_csk_get_port():
local_bh_disable(); // disable bottom halves while manipulating the bind hash
if (!snum) { // no port number was specified
int remaining, rover, low, high;
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
rover = net_random() % remaining + low;
do { // search the local port range for a free port
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == rover)
goto next;
break;
next:
spin_unlock(&head->lock);
if (++rover > high)
rover = low;
} while (--remaining > 0);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
if (remaining <= 0)
goto fail;
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover;
Our server did specify a port, but if none was given (port 0) this branch asks the kernel to pick one.
First inet_get_local_port_range() fetches the allowed range:
void inet_get_local_port_range(int *low, int *high)
{
unsigned seq;
do {
seq = read_seqbegin(&sysctl_port_range_lock);
*low = sysctl_local_port_range[0];
*high = sysctl_local_port_range[1];
} while (read_seqretry(&sysctl_port_range_lock, seq));
}
Under a seqlock it reads the two values of the kernel's sysctl_local_port_range array, by default {32768, 61000}.
A candidate port is then chosen at random, rover = net_random() % remaining + low, and the loop makes sure the candidate is not already in use; if the whole range is exhausted, ret stays 1 and we jump to fail, otherwise the first free candidate becomes the port.
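The low/high pair read by inet_get_local_port_range() is exported to user space through procfs, so the range the kernel would pick from can simply be read back; a small sketch:

#include <stdio.h>

int main(void)
{
    int low = 0, high = 0;
    FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");

    if (f && fscanf(f, "%d %d", &low, &high) == 2)
        printf("ephemeral ports: %d - %d (%d candidates)\n",
               low, high, high - low + 1);
    if (f)
        fclose(f);
    return 0;
}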
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == rover)
goto next;
This part works much like a hash-map lookup keyed on rover: the port is hashed into an index of tcp_hashinfo's bhash array, the corresponding chain is taken, and the chain is walked; finding an entry with the same namespace and port means the port is already bound.
#define inet_bind_bucket_for_each(tb, node, head) \
hlist_for_each_entry(tb, node, head, node)
#define hlist_for_each_entry(tpos, pos, head, member) \
for (pos = (head)->first; \
pos && ({ prefetch(pos->next); 1;}) && \
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
pos = pos->next)
Here the detail becomes visible: each inet_bind_bucket is linked through its node member into the chain of an inet_bind_hashbucket.
Our server did specify a port, so the kernel does not need to pick one; continuing with inet_csk_get_port():
} else { // look for a bucket with the same port in the hash chain
head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == snum)
goto tb_found;
}
tb = NULL;
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) { // is the bucket's sock queue non-empty?
if (tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
goto success; // reuse is allowed
} else {
ret = 1; // otherwise check the bucket's sock queue for conflicts
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
goto fail_unlock;
}
}
tb_not_found: // no bucket yet: create one
ret = 1;
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1; // mark the bucket as reusable
else
tb->fastreuse = 0;
} else if (tb->fastreuse &&
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
tb->fastreuse = 0;
success:
if (!inet_csk(sk)->icsk_bind_hash) // not yet attached to a bind bucket
inet_bind_hash(sk, tb, snum); // attach it
BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
ret = 0;
fail_unlock:
spin_unlock(&head->lock);
fail:
local_bh_enable();
return ret;
}
Now the hash bucket is searched for a bucket tb matching the requested port. If one is found we land at tb_found: tb->owners is the head of a sock queue; if that queue is not empty, the code checks whether the bucket allows fast reuse (fastreuse > 0) and whether our sock also allows reuse and is not listening, in which case we jump to success. Otherwise inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb) is executed, which for ipv4_specific is inet_csk_bind_conflict():
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb)
{
const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
struct sock *sk2;
struct hlist_node *node;
int reuse = sk->sk_reuse;
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners list belong to the same net - the
* one this bucket belongs to.
*/
sk_for_each_bound(sk2, node, &tb->owners) {
if (sk != sk2 &&
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { // same device (or one of them unbound)?
if (!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) {
const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
if (!sk2_rcv_saddr || !sk_rcv_saddr ||
sk2_rcv_saddr == sk_rcv_saddr) // same bound address (or one of them a wildcard)?
break;
}
}
}
return node != NULL;
}
The code is straightforward: the sk_for_each_bound macro walks the tb->owners queue, calling each entry sk2, and compares sk against sk2; if they are on the same device and bound to the same address, and reuse does not apply, there is a conflict.
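From user space, sk->sk_reuse is controlled by SO_REUSEADDR, which is exactly what the conflict check above consults. A hedged sketch (both sockets set SO_REUSEADDR and neither listens, so the second bind() is expected to succeed here; drop the option and it should fail with EADDRINUSE):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int bind_port(int reuse, unsigned short port)
{
    struct sockaddr_in addr;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(port);
    return bind(fd, (struct sockaddr *)&addr, sizeof(addr));
}

int main(void)
{
    printf("first bind : %d\n", bind_port(1, 54188));
    printf("second bind: %d\n", bind_port(1, 54188)); /* 0 here; -1 without SO_REUSEADDR */
    return 0;
}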
Back in inet_csk_get_port(): if no bucket was found we go to tb_not_found, where inet_bind_bucket_create() allocates a new bucket, records the port and other details in it, and links it into the hash bucket.
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
*/
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); // allocate the bucket
if (tb != NULL) {
tb->ib_net = hold_net(net); // record the network namespace
tb->port = snum; // record the port number
tb->fastreuse = 0; // fast reuse off by default; adjusted from the sock below
INIT_HLIST_HEAD(&tb->owners); // initialize the sock queue
hlist_add_head(&tb->node, &head->chain); // link the bucket into the hash bucket
}
return tb;
}
At success, inet_csk(sk)->icsk_bind_hash tells us whether the sock is already attached to a bind bucket; inet_csk() simply casts the sock pointer to inet_connection_sock (the INET connection-oriented sock seen earlier) and icsk_bind_hash is inspected. If the sock is not yet attached, inet_bind_hash() links it onto the bucket's sock queue:
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
inet_sk(sk)->num = snum;
sk_add_bind_node(sk, &tb->owners);
inet_csk(sk)->icsk_bind_hash = tb;
}
It stores the port number in the inet_sock, adds the sock to the bucket's owners queue, and records the bucket tb in the inet_connection_sock's icsk_bind_hash. That completes the binding work.
Back in inet_bind():
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) { // inet_csk_get_port()
inet->saddr = inet->rcv_saddr = 0; // on failure, clear the addresses just set
err = -EADDRINUSE;
goto out_release_sock;
}
if (inet->rcv_saddr) // an address has been set: record that it is locked in
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum) // a port has been set: record that it is locked in
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->sport = htons(inet->num); // record the source port
inet->daddr = 0; // clear the destination address
inet->dport = 0; // clear the destination port
sk_dst_reset(sk); // drop any cached route
err = 0;
out_release_sock:
release_sock(sk); // unlock
out:
return err;
}
That wraps up bind(); the pieces left for later are the local routing table and the local_table->tb_lookup(local_table, &fl, &res) path.