In socket programming there are several calls that send TCP data: send, sendmsg, and sendpage. All of them, however, ultimately funnel into the kernel function tcp_sendmsg.
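As a reminder of what arrives at this entry point, here is a minimal userspace sketch (sock_fd and send_two_ways are hypothetical names; sock_fd is assumed to be an already-connected TCP socket) showing that a plain send() and a gathered sendmsg() express the same operation. Both land in tcp_sendmsg(), the latter with a multi-element msg_iov that the function walks as shown below:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* sock_fd is assumed to be a connected TCP socket (hypothetical). */
ssize_t send_two_ways(int sock_fd)
{
	const char part1[] = "hello ";
	const char part2[] = "world";

	/* Variant 1: plain send(); it enters the kernel and reaches
	 * tcp_sendmsg() with a single-element iovec. */
	if (send(sock_fd, part1, strlen(part1), 0) < 0)
		return -1;

	/* Variant 2: sendmsg() with a gather list; tcp_sendmsg()
	 * iterates over msg_iov exactly as in the code below. */
	struct iovec iov[2] = {
		{ .iov_base = (void *)part1, .iov_len = strlen(part1) },
		{ .iov_base = (void *)part2, .iov_len = strlen(part2) },
	};
	struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2 };

	return sendmsg(sock_fd, &msg, 0);
}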
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t size)
{
	struct sock *sk = sock->sk;
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags;
	int mss_now, size_goal;
	int err, copied;
	long timeo;

	/* This function is entered from system calls, so take the
	 * socket lock. */
	lock_sock(sk);
	TCP_CHECK_TIMER(sk);

	flags = msg->msg_flags;
	/* Compute the send timeout. */
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	/* Compute the current MSS and pick up xmit_size_goal; the goal
	 * normally equals the MSS, but not when GSO is in use. */
	mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));
	size_goal = tp->xmit_size_goal;

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;
The function then enters the data-copying stage. Its main job is to move the data described by the iov array into skbs that respect the size limit (size_goal).
	while (--iovlen >= 0) {
		int seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;

		while (seglen > 0) {
			int copy;

			skb = tcp_write_queue_tail(sk);

			if (!tcp_send_head(sk) ||
			    (copy = size_goal - skb->len) <= 0) {

new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				/* select_size() decides how much linear memory
				 * to allocate: normally one MSS, but with SG
				 * only room for the headers. */
				skb = sk_stream_alloc_skb(sk, select_size(sk),
						sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				/* Upper bound on the amount of data per skb,
				 * normally the MSS. */
				copy = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_tailroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				if (copy > skb_tailroom(skb))
					copy = skb_tailroom(skb);
				if ((err = skb_add_data(skb, from, copy)) != 0)
					goto do_fault;
			} else {
				int merge = 0;
				int i = skb_shinfo(skb)->nr_frags;
				struct page *page = TCP_PAGE(sk);
				int off = TCP_OFF(sk);

				if (skb_can_coalesce(skb, i, page, off) &&
				    off != PAGE_SIZE) {
					/* We can extend the last page
					 * fragment. */
					merge = 1;
				} else if (i == MAX_SKB_FRAGS ||
					   (!i &&
					   !(sk->sk_route_caps & NETIF_F_SG))) {
					/* Need to add new fragment and cannot
					 * do this because interface is non-SG,
					 * or because all the page slots are
					 * busy. */
					tcp_mark_push(tp, skb);
					goto new_segment;
				} else if (page) {
					if (off == PAGE_SIZE) {
						put_page(page);
						TCP_PAGE(sk) = page = NULL;
						off = 0;
					}
				} else
					off = 0;

				if (copy > PAGE_SIZE - off)
					copy = PAGE_SIZE - off;

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				if (!page) {
					/* Allocate new cache page. */
					if (!(page = sk_stream_alloc_page(sk)))
						goto wait_for_memory;
				}

				/* Time to copy data. We are close to
				 * the end! */
				err = skb_copy_to_page(sk, from, skb, page,
						       off, copy);
				if (err) {
					/* If this page was new, give it to the
					 * socket so it does not get leaked.
					 */
					if (!TCP_PAGE(sk)) {
						TCP_PAGE(sk) = page;
						TCP_OFF(sk) = 0;
					}
					goto do_error;
				}

				/* Update the skb. */
				if (merge) {
					skb_shinfo(skb)->frags[i - 1].size += copy;
				} else {
					skb_fill_page_desc(skb, i, page, off, copy);
					if (TCP_PAGE(sk)) {
						get_page(page);
					} else if (off + copy < PAGE_SIZE) {
						get_page(page);
						TCP_PAGE(sk) = page;
					}
				}

				TCP_OFF(sk) = off + copy;
			}
When appending data, if the skb at the tail of the write queue still has free room, as much data as fits is added to that existing skb; otherwise a new skb is constructed. On a NIC without scatter-gather (SG) support, the data is copied into the skb's linear buffer; with SG support it is copied into the skb's page-fragment array instead, and once that array is full a new skb has to be allocated.
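The shape of this loop is easier to see stripped of the kernel machinery. The sketch below is a simplified userspace model (hypothetical names fill_segments and struct segment; linear buffers only, no SG, no memory accounting) of the same idea: keep filling the tail segment up to size_goal, and open a new one when it is full:

#include <string.h>
#include <sys/uio.h>

#define SIZE_GOAL 1460	/* stand-in for tp->xmit_size_goal (one MSS) */

struct segment {	/* crude stand-in for a linear-only skb */
	size_t len;
	char data[SIZE_GOAL];
};

/* Copy an iovec stream into SIZE_GOAL-sized segments, the way
 * tcp_sendmsg() fills each skb up to size_goal before starting a new
 * one.  Returns the number of segments used, or -1 on overflow. */
static int fill_segments(const struct iovec *iov, int iovlen,
			 struct segment *segs, int max_segs)
{
	int used = 0;

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		const char *from = iov->iov_base;
		iov++;

		while (seglen > 0) {
			struct segment *tail = used ? &segs[used - 1] : NULL;
			size_t copy;

			/* No segment yet, or the tail one is full: open a
			 * new segment (the "new_segment:" case above). */
			if (!tail || tail->len >= SIZE_GOAL) {
				if (used == max_segs)
					return -1;
				tail = &segs[used++];
				tail->len = 0;
			}

			copy = SIZE_GOAL - tail->len;
			if (copy > seglen)
				copy = seglen;

			memcpy(tail->data + tail->len, from, copy);
			tail->len += copy;
			from += copy;
			seglen -= copy;
		}
	}
	return used;
}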
sk_stream_alloc_skb is TCP's buffer allocation function and implements the send-buffer accounting mechanism.
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned. */
	size = ALIGN(size, 4);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (skb) {
		if (sk_wmem_schedule(sk, skb->truesize)) {
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb_reserve(skb, skb_tailroom(skb) - size);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}
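Two details deserve a worked example: ALIGN(size, 4) rounds the requested size up to the next multiple of 4 so the TCP header stays 32-bit aligned, and the skb_reserve() call trims the tailroom so the caller is left with exactly size bytes after the header space. The arithmetic, modeled as a standalone userspace sketch (the numbers are illustrative and ignore the extra slack alloc_skb may actually hand back):

#include <stdio.h>

/* Same definition as the kernel's ALIGN() for power-of-two alignment. */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	int max_header = 128;		/* stand-in for sk->sk_prot->max_header */
	int size = ALIGN(1446, 4);	/* 1446 rounds up to 1448 */

	/* The skb is allocated with room for headers plus data... */
	int allocated = size + max_header;
	/* ...and skb_reserve(skb, skb_tailroom(skb) - size) leaves the
	 * caller exactly `size` bytes of tailroom, pushing the rest to
	 * the front where the protocol headers will be built. */
	int headroom = allocated - size;

	printf("aligned size=%d, allocated=%d, headroom=%d\n",
	       size, allocated, headroom);
	return 0;
}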
Next comes the handling that actually transmits the queued data.
			if (!copied)
				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

			tp->write_seq += copy;
			/* Update the sequence number just past the last byte
			 * of data in this skb. */
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			/* All the data has been copied; jump out and let
			 * tcp_push send it. */
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < size_goal || (flags & MSG_OOB))
				continue;

			/* If the data awaiting a push exceeds half of the
			 * largest window the peer has ever advertised, call
			 * __tcp_push_pending_frames. */
			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				/* When this skb is the only one on the send
				 * queue, send it with tcp_push_one. */
				tcp_push_one(sk, mss_now);
			continue;
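For reference, forced_push() in kernels of this generation is a one-line test in net/ipv4/tcp.c (quoted from memory, so treat the exact form as approximate): push as soon as more than half of the largest window the peer has ever advertised is sitting unpushed.

/* From net/ipv4/tcp.c: push when the unpushed data exceeds half of the
 * largest receive window ever announced by the peer. */
static inline int forced_push(struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}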
tcp_push, too, sends packets by calling __tcp_push_pending_frames. The difference between __tcp_push_pending_frames and tcp_push_one is that the former may transmit several packets while the latter transmits exactly one. Both ultimately call tcp_write_xmit, as the sketch of the two wrappers below shows.
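In this kernel generation the two wrappers differ essentially only in the push_one argument they pass down. Roughly, from net/ipv4/tcp_output.c (quoted from memory; details such as the probe-timer handling may vary slightly between versions):

/* Push out any pending frames; may transmit several packets. */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	struct sk_buff *skb = tcp_send_head(sk);

	if (!skb)
		return;

	/* push_one == 0: keep sending while the windows allow it. */
	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, sk->sk_allocation))
		tcp_check_probe_timer(sk);
}

/* Send exactly one skb at the head of the send queue. */
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
	struct sk_buff *skb = tcp_send_head(sk);

	BUG_ON(!skb || skb->len < mss_now);

	/* push_one == 1: tcp_write_xmit() breaks out after one packet. */
	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}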
The function is analyzed below.
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			  int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;

	sent_pkts = 0;

	/* When more than one packet may go out, probe the path MTU first. */
	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		if (!result) {
			return 0;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}

	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		/* TSO handling: without GSO, tso_segs is 1; otherwise it is
		 * the packet length divided by the MSS. */
		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
		BUG_ON(!tso_segs);

		/* Congestion window check. */
		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota)
			break;

		/* Send (receiver) window check. */
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
			break;

		if (tso_segs == 1) {
			/* Nagle algorithm check. */
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			if (!push_one && tcp_tso_should_defer(sk, skb))
				break;
		}

		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    cwnd_quota);

		/* If this skb is longer than the limit, it must be split. */
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			break;

		TCP_SKB_CB(skb)->when = tcp_time_stamp;

		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

		/* Advance the send_head. This one is sent out.
		 * This call will increment packets_out (the number of
		 * packets currently in flight in the network).
		 */
		tcp_event_new_data_sent(sk, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts++;

		if (push_one)
			break;
	}

	if (likely(sent_pkts)) {
		tcp_cwnd_validate(sk);
		return 0;
	}
	return !tp->packets_out && tcp_send_head(sk);
}
If nothing was transmitted, the function returns 1 only when there are no packets left in flight (tp->packets_out == 0) yet the send queue is still non-empty; in every other case it returns 0. The nonzero return is what lets __tcp_push_pending_frames arm the zero-window probe timer, as in the wrapper sketch above.
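Of the gating tests in the loop, tcp_snd_wnd_test enforces the receiver's advertised window: a segment may go out only if its end sequence stays within snd_una + snd_wnd. A small userspace model of that check, assuming these semantics (seq_after and snd_wnd_test are hypothetical names; the kernel's after() macro does the same wrap-safe comparison):

#include <stdio.h>
#include <stdint.h>

/* Wrap-safe "a is after b" for 32-bit sequence numbers, like the
 * kernel's after() macro. */
static int seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

/* Model of tcp_snd_wnd_test(): a segment fits only if its end sequence
 * stays inside snd_una + snd_wnd (the receiver's advertised window). */
static int snd_wnd_test(uint32_t end_seq, uint32_t snd_una, uint32_t snd_wnd)
{
	return !seq_after(end_seq, snd_una + snd_wnd);
}

int main(void)
{
	/* snd_una = 1000, window = 4380 bytes: a segment ending at
	 * sequence 5380 fits, one ending at 5381 does not. */
	printf("%d\n", snd_wnd_test(5380, 1000, 4380));	/* prints 1 */
	printf("%d\n", snd_wnd_test(5381, 1000, 4380));	/* prints 0 */
	return 0;
}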
The tcp_cwnd_test function:
/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
					 struct sk_buff *skb)
{
	u32 in_flight, cwnd;

	/* Don't be strict about the congestion window for the final FIN. */
	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	    tcp_skb_pcount(skb) == 1)
		return 1;

	/* Number of packets still in flight in the network. */
	in_flight = tcp_packets_in_flight(tp);
	cwnd = tp->snd_cwnd;
	if (in_flight < cwnd)
		/* What may be sent now is whatever is left of the congestion
		 * window after subtracting the in-flight packets. */
		return (cwnd - in_flight);

	return 0;
}
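A quick worked example of that quota, with tcp_packets_in_flight() replaced by a plain parameter (illustrative values only, not kernel code):

#include <stdio.h>

/* Mirror of the tcp_cwnd_test() arithmetic: how many more segments may
 * enter the network given the congestion window and what is in flight. */
static unsigned int cwnd_quota(unsigned int cwnd, unsigned int in_flight)
{
	return in_flight < cwnd ? cwnd - in_flight : 0;
}

int main(void)
{
	/* cwnd = 10 segments, 7 already unacknowledged: 3 may be sent. */
	printf("%u\n", cwnd_quota(10, 7));	/* prints 3 */
	/* Window fully used (or overshot during loss recovery): send
	 * nothing. */
	printf("%u\n", cwnd_quota(10, 12));	/* prints 0 */
	return 0;
}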
Whenever packets have actually been sent, tcp_cwnd_validate is called to adjust the congestion window.
/* Congestion window validation. (RFC2861) */
static void tcp_cwnd_validate(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->packets_out >= tp->snd_cwnd) {
		/* Network is feed fully. */
		tp->snd_cwnd_used = 0;
		tp->snd_cwnd_stamp = tcp_time_stamp;
	} else {
		/* Network starves. */
		if (tp->packets_out > tp->snd_cwnd_used)
			tp->snd_cwnd_used = tp->packets_out;

		if (sysctl_tcp_slow_start_after_idle &&
		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >=
		    inet_csk(sk)->icsk_rto)
			tcp_cwnd_application_limited(sk);
	}
}
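RFC 2861's intent here is that a congestion window the application never fills is not evidence of network capacity. A loose userspace model of the decision (validate_cwnd is a hypothetical name; the decay toward actual usage only approximates what tcp_cwnd_application_limited() does, which also consults the initial window and congestion state):

#include <stdio.h>

/* Simplified model of tcp_cwnd_validate(): returns the new cwnd.
 * now/stamp are in clock ticks; rto is one retransmission timeout. */
static unsigned int validate_cwnd(unsigned int cwnd, unsigned int packets_out,
				  unsigned int cwnd_used,
				  int now, int stamp, int rto)
{
	if (packets_out >= cwnd)
		return cwnd;	/* window fully used: nothing to adjust */

	/* Application-limited for at least a full RTO: decay the window
	 * toward what was actually used. */
	if (now - stamp >= rto) {
		unsigned int used = packets_out > cwnd_used ?
				    packets_out : cwnd_used;
		return (cwnd + used) / 2;
	}
	return cwnd;
}

int main(void)
{
	/* cwnd 20, but only 4 packets ever in flight, idle for >= 1 RTO:
	 * the window decays to (20 + 4) / 2 = 12. */
	printf("%u\n", validate_cwnd(20, 4, 4, 300, 100, 200));
	return 0;
}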