struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.mtu_reduced = tcp_v4_mtu_reduced,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
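How is this ops table actually consumed? In 3.10, inet_bind() (net/ipv4/af_inet.c) calls sk->sk_prot->get_port(sk, snum), which for TCP resolves to inet_csk_get_port() through the .get_port initializer above; a nonzero return becomes -EADDRINUSE. Below is a minimal userspace sketch of that dispatch pattern only; every name in it is illustrative, not a kernel symbol.
#include <stdio.h>

struct sock;                            /* opaque: the pattern only needs a pointer */

/* stand-in for the relevant slice of struct proto */
struct proto_sketch {
	const char *name;
	int (*get_port)(struct sock *sk, unsigned short snum);
};

/* stand-in for inet_csk_get_port(); 0 means success, as in the kernel */
static int sketch_get_port(struct sock *sk, unsigned short snum)
{
	(void)sk;
	printf("get_port called for port %u\n", (unsigned)snum);
	return 0;
}

static const struct proto_sketch tcp_sketch = {
	.name     = "TCP",
	.get_port = sketch_get_port,
};

int main(void)
{
	/* mirrors inet_bind(): if (sk->sk_prot->get_port(sk, snum)) err = -EADDRINUSE; */
	if (tcp_sketch.get_port(NULL, 8080) != 0)
		fprintf(stderr, "EADDRINUSE\n");
	return 0;
}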
Among tcp_prot's initializers, note the following member:
.h.hashinfo = &tcp_hashinfo,
The tcp_hashinfo object it points to is declared and typed as follows:
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
struct inet_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
*
* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
*
* TIME_WAIT sockets use a separate chain (twchain).
*/
struct inet_ehash_bucket *ehash; //established hash bucket array
spinlock_t *ehash_locks;
unsigned int ehash_mask; //bucket count minus one; the kernel log reports ehash_mask + 1 = 512 here
unsigned int ehash_locks_mask;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
struct inet_bind_hashbucket *bhash; //bind hash bucket array
unsigned int bhash_size; //number of buckets in bhash; the kernel log reports 512 here
/* 4 bytes hole on 64 bit */
struct kmem_cache *bind_bucket_cachep;
/* All the above members are written once at bootup and
* never written again _or_ are predominantly read-access.
*
* Now align to a new cache line as all the following members
* might be often dirty.
*/
/* All sockets in TCP_LISTEN state will be in here. This is the only
* table where wildcard'd TCP sockets can exist. Hash function here
* is just local port number.
*/
struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE]
____cacheline_aligned_in_smp;
atomic_t bsockets;
};
The main members defined inside this structure are:
a. The established hash table and its related fields
struct inet_ehash_bucket *ehash; //established hash bucket array
spinlock_t *ehash_locks; //locks protecting the established table
unsigned int ehash_mask; //established hash mask, i.e. bucket count minus one
unsigned int ehash_locks_mask; //mask for indexing into ehash_locks
The inet_ehash_bucket bucket is defined as follows:
struct inet_ehash_bucket {
struct hlist_nulls_head chain;
struct hlist_nulls_head twchain;
};
struct hlist_nulls_head {
struct hlist_nulls_node *first; //first node in the chain
};
struct hlist_nulls_node {
struct hlist_nulls_node *next, **pprev; //hash chain linkage pointers
};
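A note on why the "nulls" list variants are used for ehash, and why tcp_init() below passes the bucket index i to INIT_HLIST_NULLS_HEAD(): established-side lookups run locklessly under RCU with SLAB_DESTROY_BY_RCU, so a socket can be freed and reused on a different chain while a reader is still walking it. Each chain therefore ends not in NULL but in a "nulls" marker, an odd value encoding the bucket index; a reader that reaches the end compares the marker against the bucket it started from and restarts the lookup on a mismatch. Below is a minimal userspace sketch of just the marker encoding, mirroring include/linux/list_nulls.h:
#include <stdio.h>

/* same encoding as the kernel's NULLS_MARKER(): value shifted left, low bit set */
#define NULLS_MARKER(v) ((void *)(((unsigned long)(v) << 1) | 1UL))

static int is_a_nulls(const void *ptr)          /* low bit set => end marker, not a node */
{
	return ((unsigned long)ptr & 1UL) != 0;
}

static unsigned long get_nulls_value(const void *ptr)  /* recover the encoded bucket index */
{
	return (unsigned long)ptr >> 1;
}

int main(void)
{
	unsigned long bucket = 7;               /* hypothetical ehash slot */
	void *end = NULLS_MARKER(bucket);       /* what INIT_HLIST_NULLS_HEAD(&..., 7) stores */

	if (is_a_nulls(end) && get_nulls_value(end) == bucket)
		printf("traversal ended in the bucket it started from: no restart needed\n");
	return 0;
}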
b. The bind hash table and its bucket memory cache
struct inet_bind_hashbucket *bhash; //bind hash bucket array
unsigned int bhash_size; //number of buckets in the bhash array; the kernel log reports 512
/* 4 bytes hole on 64 bit */
struct kmem_cache *bind_bucket_cachep; //slab cache for inet_bind_bucket allocations
The inet_bind_hashbucket bind hash bucket is defined as follows:
struct inet_bind_hashbucket {
spinlock_t lock;
struct hlist_head chain;
};
struct hlist_head {
struct hlist_node *first;
};
struct hlist_node {
struct hlist_node *next, **pprev;
};
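To make the upcoming bucket selection concrete: inet_csk_get_port() locates a port's bucket with hashinfo->bhash[inet_bhashfn(net, port, hashinfo->bhash_size)]. In 3.10 inet_bhashfn() computes (lport + net_hash_mix(net)) & (bhash_size - 1); the sketch below drops the net_hash_mix() term (it is 0 without CONFIG_NET_NS), an assumption made purely for illustration.
#include <stdio.h>

/* simplified inet_bhashfn(): bhash_size is a power of two, so masking
 * with (bhash_size - 1) is a cheap modulo */
static unsigned int bhashfn(unsigned short lport, unsigned int bhash_size)
{
	return lport & (bhash_size - 1);
}

int main(void)
{
	unsigned int bhash_size = 512;  /* the bucket count this article's kernel reports */

	/* port 8080 lands in bucket 8080 & 511 == 400 */
	printf("port 8080 -> bucket %u\n", bhashfn(8080, bhash_size));
	/* ports 8080 and 8592 (8080 + 512) collide in the same bucket */
	printf("port 8592 -> bucket %u\n", bhashfn(8592, bhash_size));
	return 0;
}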
Source path: linux-3.10.x/net/ipv4/tcp.c
Call path: inet_init() --> tcp_init()
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long limit;
int max_rshare, max_wshare, cnt;
unsigned int i;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
percpu_counter_init(&tcp_sockets_allocated, 0); //count of allocated TCP sockets
percpu_counter_init(&tcp_orphan_count, 0); //count of orphaned TCP sockets
//create the slab cache backing tcp_hashinfo.bind_bucket_cachep
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
*
* The methodology is similar to that of the buffer cache.
*/
//allocate the established hash bucket array
tcp_hashinfo.ehash =
alloc_large_system_hash("TCP established",
sizeof(struct inet_ehash_bucket),
thash_entries,
17, /* one slot per 128 KB of memory */
0,
NULL,
//receives the resulting ehash_mask (bucket count minus one)
&tcp_hashinfo.ehash_mask,
0,
thash_entries ? 0 : 512 * 1024);
//initialize every bucket up to ehash_mask (i.e. all ehash_mask + 1 buckets)
for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
//initialize the nulls chains; bucket index i becomes the chain's "nulls" end marker
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
}
//allocate and initialize the ehash locks and their count
if (inet_ehash_locks_alloc(&tcp_hashinfo)) //ehash lock allocation
panic("TCP: failed to alloc ehash_locks");
//---------------------------------------------------------------------
//allocate the bind hash bucket array
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_mask + 1, //i.e. the established bucket count, 512 here
17, /* one slot per 128 KB of memory */
0,
//alloc_large_system_hash() stores log2 of the bucket count here (9);
//the 1U << shift just below expands it to the real count, 512
&tcp_hashinfo.bhash_size,
NULL,
0,
64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) { //walk every bind hash bucket
spin_lock_init(&tcp_hashinfo.bhash[i].lock); //initialize the bucket's lock
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); //initialize the bucket's chain
}
cnt = tcp_hashinfo.ehash_mask + 1;
tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
sysctl_tcp_max_orphans = cnt / 2;
sysctl_max_syn_backlog = max(128, cnt / 256);
tcp_init_mem(&init_net);
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);
sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_wmem[1] = 16*1024;
sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_rmem[1] = 87380;
sysctl_tcp_rmem[2] = max(87380, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
//the kernel logs: "TCP: Hash tables configured (established 512 bind 512)"
tcp_metrics_init();
tcp_register_congestion_control(&tcp_reno);
tcp_tasklet_init();
}
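A quick sanity check of the sizing relationships in tcp_init(): ehash_mask holds the established bucket count minus one, while for bhash alloc_large_system_hash() writes back log2 of the count, which the 1U << tcp_hashinfo.bhash_size line then expands. The standalone sketch below assumes the 512-bucket configuration reported in this article's kernel log:
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int ehash_mask = 511;  /* 512 established buckets, mask = count - 1 */
	unsigned int bhash_size = 9;    /* log2 written back by alloc_large_system_hash() */

	bhash_size = 1U << bhash_size;  /* the expansion step from tcp_init() */

	assert(ehash_mask + 1 == 512);
	assert(bhash_size == 512);

	/* matches: "TCP: Hash tables configured (established 512 bind 512)" */
	printf("established %u bind %u\n", ehash_mask + 1, bhash_size);
	return 0;
}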
The material in sections 1~3 above is all groundwork for section 4; we can now walk through the source of inet_csk_get_port().
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
//get the global hash container for this sock's protocol family: hashinfo ==
//tcp_hashinfo, wired up through .h.hashinfo in struct proto tcp_prot. Parts of
//tcp_hashinfo are initialized in tcp_init(); to understand how the pieces fit
//together, refer back to that function
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk); //user id owning this socket
local_bh_disable();
//snum == 0: the application passed port 0 (or never specified one), so the
//kernel auto-selects an unused ephemeral port at random
if (!snum) { //no port specified
int remaining, rover, low, high;
again:
inet_get_local_port_range(&low, &high); //get the local port range, 32768-61000 by default, configurable via /proc/sys/net/ipv4/ip_local_port_range
remaining = (high - low) + 1; //number of candidate ports
smallest_rover = rover = net_random() % remaining + low; //pick a random starting port
smallest_size = -1;
do {
//skip ports reserved through ip_local_reserved_ports
if (inet_is_reserved_local_port(rover))
goto next_nolock; //reserved: advance to the next port (++rover)
//hash the port number to find the bucket chain head it belongs to
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
/* lock the hash bucket */
spin_lock(&head->lock);
/* walk the bucket's chain; internally inet_bind_bucket_for_each uses
container_of to recover each tb from its embedded node member */
inet_bind_bucket_for_each(tb, &head->chain)
/* the port is already in use: run the conflict checks */
if (net_eq(ib_net(tb), net) && tb->port == rover) {
if (((tb->fastreuse > 0 && //tb allows fast reuse
sk->sk_reuse && //this socket allows address reuse (SO_REUSEADDR)
sk->sk_state != TCP_LISTEN) || //and is not listening
(tb->fastreuseport > 0 &&
sk->sk_reuseport &&
uid_eq(tb->fastuid, uid))) && //same owning uid (SO_REUSEPORT)
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners; /* remember how many sockets own this port */
smallest_rover = rover; /* and remember the port itself */
/* if the system already has many bound ports, check this one for a bind conflict right away */
if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = smallest_rover; /* no conflict: use this port */
goto tb_found;
}
}
/* check for a bind conflict: can this port be reused? */
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
goto tb_found;
}
goto next; /* this port cannot be reused: try the next one */
}
/* found an unused port: leave the loop */
break; //no tb on this bucket matched, so a fresh inet_bind_bucket is created below
next:
spin_unlock(&head->lock);
next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
if (remaining <= 0) {
if (smallest_size != -1) {
snum = smallest_rover;
goto have_snum;
}
goto fail;
}
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover; /* the automatically selected port */
} else { /* the application specified a port to bind */
have_snum: //a concrete port number is in hand
/* Reaching here means the caller supplied its own port:
1. inet_bhashfn(net, snum, hashinfo->bhash_size): compute the index into the
   struct inet_bind_hashbucket array
2. head = &hashinfo->bhash[*]: the hash bucket this port belongs to
3. inet_bind_bucket_for_each(tb, &head->chain): walk the bucket's chain (an
   hlist) on which every bound port is registered; container_of recovers the
   enclosing tb (struct inet_bind_bucket) from each node member -- see
   inet_bind_bucket_create() for how ports are put onto this chain
4. net_eq(ib_net(tb), net) && tb->port == snum: same network namespace (a
   socket belongs to the struct net it was created in) and same port
*/
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum) //does a chain entry match the requested port?
goto tb_found; /* the port is already in use */
}
tb = NULL;
goto tb_not_found;
tb_found:
/* the port already has sockets bound to it */
if (!hlist_empty(&tb->owners)) { //an empty list means tb is unused
/* forced binding: ignore any bind conflict! */
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
//decide from the socket's flags whether this port can be fast-reused
if (((tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) { /* the explicitly-specified-port case */
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { /* bind conflict */
/* the automatically chosen port conflicts: try again, at most 5 times.
 * Arguably this inner if is unnecessary, since the automatic-selection path
 * already performed the same checks before jumping to tb_found.
 */
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
}
goto fail_unlock;
}
}
}
tb_not_found: //reached when no tb for this port exists in the hash bucket
ret = 1;
/* allocate and initialize an inet_bind_bucket and hook it into the bucket chain */
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) { //inet_bind_bucket_create() leaves tb->owners empty
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) //sk->sk_reuse is set up in inet_create()
tb->fastreuse = 1;
else
tb->fastreuse = 0;
if (sk->sk_reuseport) { //SO_REUSEPORT is set
tb->fastreuseport = 1;
tb->fastuid = uid; //record the owning uid
} else
tb->fastreuseport = 0;
} else {
if (tb->fastreuse && //previously fast-reusable, but
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) //this socket forbids reuse or is listening
tb->fastreuse = 0; //so disable fast reuse
if (tb->fastreuseport &&
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) //reuseport off or different uid
tb->fastreuseport = 0;
}
success:
/* record the inet_bind_bucket in the icsk */
if (!inet_csk(sk)->icsk_bind_hash) //not yet bound to a bucket; inet_bind_hash() below does it
inet_bind_hash(sk, tb, snum); //important: links sk onto tb->owners and sets icsk_bind_hash = tb
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
spin_unlock(&head->lock);
fail:
local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);
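Before summarizing, it is worth seeing the two userspace inputs that steer this function: binding to port 0 takes the auto-selection branch (snum == 0), and SO_REUSEADDR is what makes sk->sk_reuse nonzero for the fastreuse checks. A minimal sketch using the standard sockets API (error handling largely omitted):
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);

	/* SO_REUSEADDR -> sk->sk_reuse, consulted by the fastreuse logic above */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = 0;   /* port 0: inet_csk_get_port() sees snum == 0 and auto-selects */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0 &&
	    getsockname(fd, (struct sockaddr *)&addr, &len) == 0)
		printf("kernel chose port %u\n", ntohs(addr.sin_port));

	close(fd);
	return 0;
}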
The main job of this function is to examine the local port handed down from the
protocol-layer bind() and decide whether a port was actually specified:
a. If the port is 0 (unspecified), auto-select one from the range returned by
inet_get_local_port_range(); otherwise use the port the application passed in
through the system call
b. Once a port has been chosen, or a valid one was supplied, validate it:
determine whether it is already in use and whether the requesting user id
matches the existing binding. If the port is unused, continue at d.; otherwise
continue at c.
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum) //does a chain entry match the requested port?
goto tb_found; /* the port is already in use */
c. Check whether the tb (struct inet_bind_bucket) for this port is in use,
whether the port may be reused, and then run the conflict detection
if (!hlist_empty(&tb->owners)) { //an empty list means tb is unused
/* forced binding: ignore any bind conflict! */
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
d. If the port is unused, allocate an inet_bind_bucket to hold the application's
port and add it to the hash bucket chain, as inet_bind_bucket_create() below shows
/* allocate and initialize an inet_bind_bucket structure, returning tb */
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb != NULL) {
write_pnet(&tb->ib_net, hold_net(net)); //point tb->ib_net at the namespace net
tb->port = snum; //record the bound port
tb->fastreuse = 0;
tb->fastreuseport = 0;
tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners); //initialize tb->owners to an empty list
hlist_add_head(&tb->node, &head->chain); //link the new tb->node onto the bucket chain head->chain
}
return tb;
}
e. Once the inet_bind_bucket tb has been created, inet_bind_hash(sk, tb, snum)
links the socket onto tb->owners through its sk->sk_bind_node member
f. At this point the port is bound both in the hash bucket and on the sock.
A core mechanism used throughout is obtaining a structure's pointer from a
pointer to one of its members:
inet_bind_bucket_for_each(tb, &head->chain)
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
struct inet_bind_hashbucket {
spinlock_t lock;
struct hlist_head chain;
};
struct inet_bind_bucket {
#ifdef CONFIG_NET_NS
struct net *ib_net; //owning network namespace
#endif
unsigned short port; //the bound port number
signed char fastreuse; //fast-reuse flag, initialized to 0
signed char fastreuseport; //fast-reuseport flag, initialized to 0
kuid_t fastuid;
int num_owners; //number of owning sockets, initialized to 0
struct hlist_node node;
struct hlist_head owners;
};
Expanding inet_bind_bucket_for_each(tb, &head->chain) macro by macro:
#define inet_bind_bucket_for_each(tb, head) \
hlist_for_each_entry(tb, head, node)
#define hlist_for_each_entry(pos, head, member) \
for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
#define hlist_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? hlist_entry(____ptr, type, member) : NULL; \
})
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
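Unrolling the chain above: hlist_entry() is container_of(), which recovers the address of the enclosing structure by subtracting the member's offset within it from the member's own address. A self-contained userspace rendering of the idea (the types here are stand-ins for inet_bind_bucket and its node member, not kernel definitions):
#include <stddef.h>
#include <stdio.h>

/* the classic portable form; the kernel's version adds a typeof() type check */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { struct node *next; };

struct bucket_entry {              /* stand-in for struct inet_bind_bucket */
	unsigned short port;
	struct node link;          /* stand-in for the embedded node member */
};

int main(void)
{
	struct bucket_entry e = { .port = 8080 };
	struct node *p = &e.link;  /* a chain stores only this member's address */

	/* recover the enclosing structure, exactly what hlist_entry(..., node) does */
	struct bucket_entry *back = container_of(p, struct bucket_entry, link);
	printf("recovered port %u\n", (unsigned)back->port);  /* prints 8080 */
	return 0;
}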