Linux内核在默认情况下,把urgent data实现为OOB数据。
在内核态,使用kernel_sendmsg/kernel_sendpage完成发送,只不过需要加上MSG_OOB标志,表示要发送的URG数据。
分片主要在kernel_sendmsg中完成,在OOB数据的处理上,它和kernel_sendpage是一致的。
/*
 * tcp_sendmsg - excerpt of the TCP send path, shown for its MSG_OOB handling.
 *
 * Walks the caller's iovec, copies user data into skbs on the socket write
 * queue (either into the skb head or into page fragments), and pushes the
 * queued segments. Returns the number of bytes copied, or the value of
 * sk_stream_error() when nothing was copied before the failure.
 *
 * NOTE(review): this is an abridged listing; the lines of full-width dots
 * mark code that was elided, including the declarations of the locals used
 * below (flags, iov, iovlen, tp, skb, sg, copied, err, timeo, ...).
 */
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
。。。。。。。。。。。。。。
/* When flags carries MSG_OOB, tcp_send_mss() computes size_goal with
 * large segments disallowed, i.e. TSO is effectively turned off. */
mss_now = tcp_send_mss(sk, &size_goal, flags);
。。。。。。。。。。。。。。
/* Outer loop: one iteration per iovec entry. */
while (--iovlen >= 0) {
size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
/* Inner loop: consume this iovec entry in chunks of 'copy' bytes. */
while (seglen > 0) {
int copy = 0;
int max = size_goal;
skb = tcp_write_queue_tail(sk);
/* If tcp_send_head(sk) is non-NULL, compute how much room is left
 * in the tail skb; skbs without checksum offload are capped at
 * mss_now instead of size_goal. */
if (tcp_send_head(sk)) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb->len;
}
/* No room in the tail skb (or nothing to append to): start a new one. */
if (copy <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation);
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
}
/* Try to append data to the end of skb. */
if (copy > seglen)
copy = seglen;
/* Where to copy to? */
if (skb_availroom(skb) > 0) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, from, copy);
if (err)
goto do_fault;
} else {
/* No head room: copy into a page fragment, reusing the socket's
 * cached page (sk_sndmsg_page / sk_sndmsg_off) when possible. */
int merge = 0;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = sk->sk_sndmsg_page;
int off;
/* page_count == 1 means only the socket references the cached page,
 * so it can be rewritten from offset 0. */
if (page && page_count(page) == 1)
sk->sk_sndmsg_off = 0;
off = sk->sk_sndmsg_off;
if (skb_can_coalesce(skb, i, page, off) &&
off != PAGE_SIZE) {
/* We can extend the last page
* fragment. */
merge = 1;
} else if (i == MAX_SKB_FRAGS || !sg) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
* or because all the page slots are
* busy. */
tcp_mark_push(tp, skb);
goto new_segment;
} else if (page) {
/* Cached page is completely used: drop it so a new one is allocated. */
if (off == PAGE_SIZE) {
put_page(page);
sk->sk_sndmsg_page = page = NULL;
off = 0;
}
} else
off = 0;
/* Never copy past the end of the page. */
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (!page) {
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
}
/* Time to copy data. We are close to
* the end! */
err = skb_copy_to_page_nocache(sk, from, skb,
page, off, copy);
if (err) {
/* If this page was new, give it to the
* socket so it does not get leaked.
*/
if (!sk->sk_sndmsg_page) {
sk->sk_sndmsg_page = page;
sk->sk_sndmsg_off = 0;
}
goto do_error;
}
/* Update the skb. */
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
skb_fill_page_desc(skb, i, page, off, copy);
/* Take an extra page reference when the page is (or becomes)
 * the socket's cached page as well as an skb fragment. */
if (sk->sk_sndmsg_page) {
get_page(page);
} else if (off + copy < PAGE_SIZE) {
get_page(page);
sk->sk_sndmsg_page = page;
}
}
sk->sk_sndmsg_off = off + copy;
}
/* First chunk copied by this call: clear PSH on the skb. */
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
/* Account for the bytes just queued. */
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
skb_shinfo(skb)->gso_segs = 0;
from += copy;
copied += copy;
/* All user data consumed: leave both loops and push. */
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
/* For OOB data, even when the current segment is full, keep
 * accumulating segments as long as send buffer and OOB data
 * remain; nothing is pushed from inside the loop. */
if (skb->len < max || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
/* Flush what has been queued so far, then wait for memory; the MSS
 * and size goal are recomputed after the wait. */
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
out:
/* Normal exit: push whatever was queued (OOB data is sent from here). */
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
release_sock(sk);
return copied;
do_fault:
/* The copy into a fresh, still-empty skb failed: unlink and free it. */
if (!skb->len) {
tcp_unlink_write_queue(skb, sk);
/* It is the one place in all of TCP, except connection
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
sk_wmem_free_skb(sk, skb);
}
do_error:
/* Partial success: report the bytes already queued rather than the error. */
if (copied)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
release_sock(sk);
return err;
}
tcp_sendmsg中,涉及对OOB数据的处理主要有:
1、在调用tcp_send_mss确定分片大小的时候:
/*
 * tcp_send_mss - compute the current MSS and the segment size goal.
 * @sk:        socket being written to
 * @size_goal: out parameter, receives the result of tcp_xmit_size_goal()
 * @flags:     sendmsg flags; only MSG_OOB is examined here
 *
 * Returns the value of tcp_current_mss(sk).
 */
static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);

	/*
	 * The last argument is 0 when MSG_OOB is set. Per the original
	 * note this is "large_allowed", so urgent data is sent with TSO
	 * turned off.
	 */
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}
如果是OOB数据,其实是关闭了TSO功能,这样做的原因是:天知道各个网卡芯片在执行分片的时候咋个处理TCP报头中的URG标志和urgent pointer。
2、在确定何时开始执行分片的发送的时候:
如果是OOB数据,即使当前已经积累了一整个分片,也不会像普通的数据一样执行发送(tcp_push),而是继续积累直到用户下发的数据全部分片或者snd_buf/内存用尽。
3、执行tcp_push的时候:
在用户下发的数据全部分片或者snd_buf/内存用尽后,进入tcp_push执行发送操作(所有的OOB数据,都会通过这个接口来执行发送)
/*
 * tcp_push - mark and transmit the pending segments of a socket.
 * @sk:      socket whose write queue is pushed
 * @flags:   sendmsg flags (MSG_MORE and, via tcp_mark_urg(), MSG_OOB)
 * @mss_now: current MSS passed through to the transmit routine
 * @nonagle: Nagle mode to use unless MSG_MORE forces TCP_NAGLE_CORK
 *
 * Does nothing when tcp_send_head(sk) is NULL.
 */
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	if (tcp_send_head(sk)) {
		struct tcp_sock *tp = tcp_sk(sk);

		/* Set PSH on the tail skb unless the caller announced more data. */
		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, tcp_write_queue_tail(sk));

		/*
		 * tcp_mark_urg() sets tp->snd_up, which marks the socket as
		 * being in OOB (urgent) send mode; the urgent pointer refers
		 * to the first byte following the urgent data.
		 */
		tcp_mark_urg(tp, flags);

		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}
/* In urgent mode, and this segment starts before the recorded urgent
 * pointer (tp->snd_up): the header may need the URG flag. */
if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
if (before(tp->snd_up, tcb->seq + 0x10000)) {
/* snd_up is less than 0x10000 past seq: send the exact 16-bit offset. */
th->urg_ptr = htons(tp->snd_up - tcb->seq);
th->urg = 1;
} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
/* Offset does not fit in 16 bits: clamp to 0xFFFF, but only when
 * seq + 0xFFFF lies after snd_nxt. */
th->urg_ptr = htons(0xFFFF);
th->urg = 1;
}
}
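只要当前待发送的skb的seq在tcp_sock记录的urgent pointer(tp->snd_up)前面,就需要在报头中对URG标志置位,同时把TCP报头中的urgent pointer设置为tp->snd_up相对于该报文seq的偏移。如果tcp_sock记录的urgent pointer与该报文seq的距离大于16位能表示的最大值(并且tcb->seq + 0xFFFF在tp->snd_nxt之后),就置TCP报头中的urgent pointer为65535。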
在收到对方ACK的处理流程tcp_ack--->tcp_clean_rtx_queue中:
/* If snd_up falls between the previous snd_una and the new snd_una,
 * move snd_up to snd_una. */
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;