TCP收到对端发送的数据后,通常不能立即交付应用进程。在应用进程取走数据之前,数据需要保存在接收缓存之中。如果应用进程取数据的速度比TCP从对端收数据的速度慢,则接收缓存中的数据会越来越多。因此在skb被放入接收缓存之前必须检查接收缓存能容纳的内存数,如果超出限制则必须丢弃skb。
tcp_rcv_established中会检查接收缓存的使用情况:
5076 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, 5077 const struct tcphdr *th, unsigned int len) 5078 { 5079 struct tcp_sock *tp = tcp_sk(sk); ... 5201 if (!eaten) { ... 5205 if ((int)skb->truesize > sk->sk_forward_alloc) //剩余空间无法容纳skb 5206 goto step5;//进入慢速路径 ... 5221 /* Bulk data transfer: receiver */ 5222 eaten = tcp_queue_rcv(sk, skb, tcp_header_len, 5223 &fragstolen); ... 5265 step5: ... 5275 tcp_data_queue(sk, skb); ...tcp_queue_rcv 函数:
4244 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, 4245 bool *fragstolen) 4246 { 4247 int eaten; 4248 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); 4249 4250 __skb_pull(skb, hdrlen); 4251 eaten = (tail && 4252 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; 4253 tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4254 if (!eaten) { 4255 __skb_queue_tail(&sk->sk_receive_queue, skb); 4256 skb_set_owner_r(skb, sk); 4257 } 4258 return eaten; 4259 }skb_set_owner_r函数:
1995 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) 1996 { 1997 skb_orphan(skb); 1998 skb->sk = sk; 1999 skb->destructor = sock_rfree; 2000 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 2001 sk_mem_charge(sk, skb->truesize); //sk->sk_forward_alloc -= size 2002 }在tcp_data_queue 函数中:
4300 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 4301 { 4302 const struct tcphdr *th = tcp_hdr(skb); 4303 struct tcp_sock *tp = tcp_sk(sk); 4304 int eaten = -1; 4305 bool fragstolen = false; ... 4344 if (eaten <= 0) { 4345 queue_and_out: 4346 if (eaten < 0 && 4347 tcp_try_rmem_schedule(sk, skb, skb->truesize))//检查是否可以占用接收缓存的skb->truesize大小的空间 4348 goto drop; 4349 4350 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); 4351 } ... 4415 tcp_data_queue_ofo(sk, skb); 4416 }tcp_try_rmem_schedule会试着调整接收缓存空间来接收数据 :
4061 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, 4062 unsigned int size) 4063 { 4064 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || //已分配内存超过限制 4065 !sk_rmem_schedule(sk, skb, size)) { //接收缓存无法容纳size大小的数据 4066 4067 if (tcp_prune_queue(sk) < 0) //整理接收队列 4068 return -1; 4069 4070 if (!sk_rmem_schedule(sk, skb, size)) { //再次检查缓存空间是否够用 4071 if (!tcp_prune_ofo_queue(sk)) //清空乱序队列,释放缓存空间 4072 return -1; 4073 4074 if (!sk_rmem_schedule(sk, skb, size)) //再次检查缓存空间是否够用 4075 return -1; 4076 } 4077 } 4078 return 0; 4079 }sk_rmem_schedule 用于检查缓存空间是否够用:
1375 static inline bool 1376 sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) 1377 { 1378 if (!sk_has_account(sk)) 1379 return true; 1380 return size<= sk->sk_forward_alloc || //剩余预分配内存够用 1381 __sk_mem_schedule(sk, size, SK_MEM_RECV) || //增加预分配内存和已分配内存 1382 skb_pfmemalloc(skb); //skb中的内存是用PFMEMALLOC方式申请的,这种方式申请的是紧急内存 1383 }tcp_prune_queue 和tcp_prune_ofo_queue分别用于整理接收队列和乱序队列:
4594 static bool tcp_prune_ofo_queue(struct sock *sk) 4595 { 4596 struct tcp_sock *tp = tcp_sk(sk); 4597 bool res = false; 4598 4599 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4600 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); 4601 __skb_queue_purge(&tp->out_of_order_queue); //释放乱序队列中的所有数据 4602 4603 /* Reset SACK state. A conforming SACK implementation will 4604 * do the same at a timeout based retransmit. When a connection 4605 * is in a sad state like this, we care only about integrity 4606 * of the connection not performance. 4607 */ 4608 if (tp->rx_opt.sack_ok) 4609 tcp_sack_reset(&tp->rx_opt); 4610 sk_mem_reclaim(sk); //更新缓存空间信息 4611 res = true; 4612 } 4613 return res; 4614 } ... 4623 static int tcp_prune_queue(struct sock *sk) 4624 { 4625 struct tcp_sock *tp = tcp_sk(sk); 4626 4627 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); 4628 4629 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED); 4630 4631 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 4632 tcp_clamp_window(sk); //试图缩小接收缓存大小并更新最大通告窗口大小 4633 else if (sk_under_memory_pressure(sk)) 4634 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); //缩小最大通告窗口大小 4635 4636 tcp_collapse_ofo_queue(sk); //合并乱序队列中连续的数据块以节省空间 4637 if (!skb_queue_empty(&sk->sk_receive_queue)) 4638 tcp_collapse(sk, &sk->sk_receive_queue, 4639 skb_peek(&sk->sk_receive_queue), 4640 NULL, 4641 tp->copied_seq, tp->rcv_nxt); //合并接收队列中未被读取的数据 4642 sk_mem_reclaim(sk); //更新缓存空间信息 4643 4644 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) //接收缓存够用了 4645 return 0; 4646 4647 /* Collapsing did not help, destructive actions follow. 4648 * This must not ever occur. */ 4649 4650 tcp_prune_ofo_queue(sk); //清理乱序队列 ... 4662 tp->pred_flags = 0; //内存紧张,禁用快速处理路径 4663 return -1; 4664 }tcp_clamp_window用于更新最大通告窗口大小:
410 static void tcp_clamp_window(struct sock *sk) 411 { 412 struct tcp_sock *tp = tcp_sk(sk); 413 struct inet_connection_sock *icsk = inet_csk(sk); 414 415 icsk->icsk_ack.quick = 0; 416 417 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && //接收缓存大小小于最大接收缓存大小 418 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && //应用进程没有设置接收缓存大小 419 !sk_under_memory_pressure(sk) && //不处于内存压力之下 420 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { //全局已分配TCP内存小于最低限制 421 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 422 sysctl_tcp_rmem[2]); 423 } 424 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) 425 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); 426 }当剩余预分配内存比较多时 sk_mem_reclaim函数会回收一部分 预分配内存:
1385 static inline void sk_mem_reclaim(struct sock *sk) 1386 { 1387 if (!sk_has_account(sk)) 1388 return; 1389 if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) 1390 __sk_mem_reclaim(sk); 1391 }__sk_mem_reclaim:
2005 void __sk_mem_reclaim(struct sock *sk) 2006 { 2007 sk_memory_allocated_sub(sk, 2008 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); 2009 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; 2010 2011 if (sk_under_memory_pressure(sk) && 2012 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2013 sk_leave_memory_pressure(sk); 2014 }TCP调用 tcp_data_queue_ofo 函数将skb放入乱序队列时也会使用skb_set_owner_r函数更新接收缓存信息:
4121 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4122 { 4123 struct tcp_sock *tp = tcp_sk(sk); 4124 struct sk_buff *skb1; 4125 u32 seq, end_seq; ... 4239 end: 4240 if (skb) 4241 skb_set_owner_r(skb, sk); 4242 }
应用进程在tcp_recvmsg函数中将数据读取完毕后,接收缓存中的skb就会被释放。skb释放时会调用在skb_set_owner_r函数中设置的sock_rfree函数:
1560 void sock_rfree(struct sk_buff *skb) 1561 { 1562 struct sock *sk = skb->sk; 1563 unsigned int len = skb->truesize; 1564 1565 atomic_sub(len, &sk->sk_rmem_alloc); 1566 sk_mem_uncharge(sk, len); //释放预分配内存 1567 }应用进程从接收缓冲区中复制出数据后,在读取路径上由tcp_rcv_space_adjust函数调整接收缓存大小并更新相关内存信息:
522 void tcp_rcv_space_adjust(struct sock *sk) 523 { 524 struct tcp_sock *tp = tcp_sk(sk); 525 int time; 526 int space; 527 528 if (tp->rcvq_space.time == 0) 529 goto new_measure; 530 531 time = tcp_time_stamp - tp->rcvq_space.time; 532 if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) 533 return; 534 535 space = 2 * (tp->copied_seq - tp->rcvq_space.seq); 536 537 space = max(tp->rcvq_space.space, space); 538 539 if (tp->rcvq_space.space != space) { //有新的数据被应用进程copy出去 540 int rcvmem; 541 542 tp->rcvq_space.space = space; 543 544 if (sysctl_tcp_moderate_rcvbuf && 545 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 546 int new_clamp = space; 547 548 /* Receive space grows, normalize in order to 549 * take into account packet headers and sk_buff 550 * structure overhead. 551 */ 552 space /= tp->advmss; 553 if (!space) 554 space = 1; 555 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); 556 while (tcp_win_from_space(rcvmem) < tp->advmss) 557 rcvmem += 128; 558 space *= rcvmem; 559 space = min(space, sysctl_tcp_rmem[2]); 560 if (space > sk->sk_rcvbuf) { 561 sk->sk_rcvbuf = space; 562 563 /* Make the window clamp follow along. */ 564 tp->window_clamp = new_clamp; 565 } 566 } 567 } 568 569 new_measure: 570 tp->rcvq_space.seq = tp->copied_seq; 571 tp->rcvq_space.time = tcp_time_stamp; 572 }上面我们了解了接收队列和乱序队列的管理,接下来看看其它类型的接收队列(prequeue队列、异步等待队列、backlog队列)的缓存管理。
(1)prequeue队列:
1919 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) 1920 { 1921 struct tcp_sock *tp = tcp_sk(sk); 1922 1923 if (sysctl_tcp_low_latency || !tp->ucopy.task) 1924 return false; 1925 1926 if (skb->len <= tcp_hdrlen(skb) && 1927 skb_queue_len(&tp->ucopy.prequeue) == 0) 1928 return false; 1929 1930 skb_dst_force(skb); 1931 __skb_queue_tail(&tp->ucopy.prequeue, skb); 1932 tp->ucopy.memory += skb->truesize; 1933 if (tp->ucopy.memory > sk->sk_rcvbuf) { 1934 struct sk_buff *skb1; 1935 1936 BUG_ON(sock_owned_by_user(sk)); 1937 1938 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { 1939 sk_backlog_rcv(sk, skb1); 1940 NET_INC_STATS_BH(sock_net(sk), 1941 LINUX_MIB_TCPPREQUEUEDROPPED); 1942 } 1943 1944 tp->ucopy.memory = 0; ...可见prequeue队列的缓存管理很简单:一旦占用内存超出sk_rcvbuf限制,就立即将队列中的全部skb逐个交给sk_backlog_rcv(即tcp_v4_do_rcv)处理并清空prequeue队列(统计计数器虽名为DROPPED,但skb并非被丢弃,而是提前走了正常接收处理路径)。
(2)异步等待队列:放入这个队列中的skb不纳入缓存管理。
(3)backlog队列:
1961 int tcp_v4_rcv(struct sk_buff *skb) 1962 { ... 2039 } else if (unlikely(sk_add_backlog(sk, skb, 2040 sk->sk_rcvbuf + sk->sk_sndbuf))) { ...sk_add_backlog函数会检查发送缓存和接收缓存的和是否有可用空间:
768 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb, 769 unsigned int limit) 770 { 771 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); 772 773 return qsize > limit; 774 } 775 776 /* The per-socket spinlock must be held here. */ 777 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, 778 unsigned int limit) 779 { 780 if (sk_rcvqueues_full(sk, skb, limit)) 781 return -ENOBUFS; 782 783 __sk_add_backlog(sk, skb); 784 sk->sk_backlog.len += skb->truesize; 785 return 0; 786 }