【Linux4.1.12源码分析】协议栈gro收包之TCP处理

TCP的GRO实现定义在tcpv4_offload对象中:

/*
 * Offload callback table for TCP over IPv4.
 * Binds the GSO segmentation hook and the two GRO hooks (receive and
 * complete) to their tcp4_* implementations.
 */
static const struct net_offload tcpv4_offload = {
	.callbacks = {
		.gso_segment	=	tcp4_gso_segment,
		.gro_receive	=	tcp4_gro_receive,
		.gro_complete	=	tcp4_gro_complete,
	},
};
tcp4_gro_receive函数

/*
 * IPv4 front end of the TCP GRO receive path.
 *
 * Validates the TCP checksum of @skb unless the packet is already marked for
 * flush, then hands over to the protocol-independent tcp_gro_receive().
 *
 * Returns NULL (with NAPI_GRO_CB(skb)->flush set to 1) when checksum
 * validation reports a failure, otherwise whatever tcp_gro_receive() returns.
 */
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	/* Already going to be flushed: verifying the checksum would be wasted work. */
	if (NAPI_GRO_CB(skb)->flush)
		return tcp_gro_receive(head, skb);

	/* A non-zero result is treated as a validation failure: mark for flush and stop. */
	if (skb_gro_checksum_validate(skb, IPPROTO_TCP,
				      inet_gro_compute_pseudo)) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	/* Checksum accepted: continue with the IP-version independent TCP handling. */
	return tcp_gro_receive(head, skb);
}
tcp_gro_receive函数

/*
 * tcp_gro_receive - try to merge a received TCP segment into a held GRO packet.
 * @head: list of packets currently held for GRO (linked through ->next)
 * @skb:  newly received segment; its GRO offset is at the TCP header
 *
 * Looks in @head for a packet that is still marked same_flow and whose TCP
 * ports match @skb.  If one is found and the two TCP headers are compatible,
 * @skb is merged into it via skb_gro_receive().
 *
 * Returns NULL, or the list position of the matching held packet when that
 * packet should no longer be held (merge not possible, or @skb carries a
 * flag / short payload that ends aggregation).  The final flush decision for
 * @skb itself is OR-ed into NAPI_GRO_CB(skb)->flush.
 * NOTE(review): what the caller does with the returned position is not
 * visible here; the surrounding article says the packet is delivered to the
 * protocol stack.
 */
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;	// default for the early "goto out" error paths: flush skb
	int i;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);	// TCP header at the current GRO offset
	/* Presumably: the fast pointer cannot cover hlen bytes, so use the slow path. */
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;	// TCP header length in bytes, options included
	if (thlen < sizeof(*th))	// shorter than the fixed header: malformed
		goto out;

	hlen = off + thlen;
	/* Repeat the header availability check, now for the full header with options. */
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	skb_gro_pull(skb, thlen);	// step over the TCP header to the payload

	len = skb_gro_len(skb);		// remaining length, i.e. the TCP payload
	flags = tcp_flag_word(th);

	/* Walk the held packets looking for one belonging to the same TCP flow. */
	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)	// already ruled out earlier
			continue;

		th2 = tcp_hdr(p);	// TCP header of the held packet

		/*
		 * One 32-bit comparison starting at ->source; per the original
		 * annotation this covers both the source and destination port.
		 */
		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;	// p now points at the held packet of the same flow
	}

	/* No match: p is NULL here because the loop ran off the end of the list. */
	goto out_check_final;

found:
	/* Include the IP ID check below from the inner most IP hdr */
	/* Start from the flush state already recorded on p (original note: set by the MAC/IP layers). */
	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
	/* CWR on the incoming segment prevents merging. */
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	/* Any flag difference other than CWR/FIN/PSH prevents merging. */
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	/* Different ACK numbers prevent merging. */
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	/* TCP options must be identical; they are compared 32 bits at a time. */
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = tcp_skb_mss(p);	// MSS recorded on the held packet

	/*
	 * len is unsigned, so (len - 1) >= mss is true both when the payload
	 * exceeds mss and when it is empty (0 - 1 wraps around).
	 */
	flush |= (len - 1) >= mss;
	/* The new segment must start exactly where the held data ends. */
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	/*
	 * Merge only when nothing objected.  On refusal or merge failure reset
	 * mss to 1 so the "len < mss" test below fires only for an empty payload.
	 */
	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}

	p = *head;		// the held packet that absorbed skb
	th2 = tcp_hdr(p);
	/* Carry FIN/PSH from the merged segment over to the aggregated header. */
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	/*
	 * flush is recomputed from scratch here.  A payload shorter than mss
	 * (per the original note, usually the last segment of a burst) or any
	 * of URG/PSH/RST/SYN/FIN ends aggregation.
	 */
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	/*
	 * A same-flow packet exists and either skb was not merged into it
	 * (skb_gro_receive() sets same_flow on skb when it succeeds) or
	 * aggregation must end: report the held packet's list position.
	 */
	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= (flush != 0);	// record the flush decision on skb

	return pp;
}
skb_gro_receive函数

/*
 * skb_gro_receive - append the payload of @skb to the held packet *@head.
 * @head: points at the held same-flow packet (p)
 * @skb:  segment to merge; its GRO offset marks where the payload starts
 *
 * One of three strategies is used:
 *  1. headlen <= offset: the payload lives entirely in page fragments, so
 *     skb's frag descriptors are moved into the frag array of the last
 *     buffer of p.
 *  2. skb->head_frag: the payload part of the linear head becomes one new
 *     frag, followed by a copy of skb's own frags.
 *  3. otherwise, or when the frag array would overflow MAX_SKB_FRAGS: skb is
 *     chained onto p's frag_list ("merge").
 *
 * Returns 0 on success, -E2BIG when the combined length would reach 65536.
 */
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	unsigned int len = skb_gro_len(skb);
	struct sk_buff *lp, *p = *head;		// p: held packet of the same flow
	unsigned int delta_truesize;

	if (unlikely(p->len + len >= 65536))	// aggregated packet would get too long
		return -E2BIG;

	lp = NAPI_GRO_CB(p)->last;	// last buffer of p; per the original note it is p itself initially
	pinfo = skb_shinfo(lp);

	if (headlen <= offset) {	// linear area holds headers only; payload is all in frags
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;	// frag count after merging

		if (nr_frags > MAX_SKB_FRAGS)	// frag array would overflow: chain via frag_list instead
			goto merge;

		offset -= headlen;	// header bytes still left inside the first frag
		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;

		/*
		 * Copy skb's frag descriptors behind lp's existing ones, walking
		 * backwards so that frag ends on the first copied descriptor.
		 * NOTE(review): the do/while form assumes skbinfo->nr_frags >= 1.
		 */
		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		do {
			*--frag = *--frag2;
		} while (--i);

		frag->page_offset += offset;		// trim leftover header bytes off the first copied frag
		skb_frag_size_sub(frag, offset);

		/* all fragments truesize : remove (head size + sk_buff) */
		delta_truesize = skb->truesize -
				 SKB_TRUESIZE(skb_end_offset(skb));

		/* skb no longer accounts for any paged data. */
		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;		// skb was absorbed; marked to be freed
		goto done;
	} else if (skb->head_frag) {		// original note: set for skbs built by the ixgbe driver
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		struct page *page = virt_to_head_page(skb->head);	// page backing the linear area
		unsigned int first_size = headlen - offset;	// payload bytes in the linear area
		unsigned int first_offset;

		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)	// frag array would overflow: chain via frag_list instead
			goto merge;

		/* Byte offset of the payload start within that page. */
		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;

		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

		frag->page.p	  = page;		// new frag describes the payload held in skb's linear area
		frag->page_offset = first_offset;
		skb_frag_size_set(frag, first_size);

		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);	// then skb's own frags
		/* We dont need to clear skbinfo->nr_frags here */

		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;	// skb was absorbed; its head now belongs to p
		goto done;
	}

	/* Reached by fall-through or by the gotos above, before either branch changed any state. */
merge:
	delta_truesize = skb->truesize;
	if (offset > headlen) {		// part of the headers sits in frags[0]
		unsigned int eat = offset - headlen;

		skbinfo->frags[0].page_offset += eat;		// drop those header bytes from frags[0]
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}

	__skb_pull(skb, offset);	// advance skb->data past the headers

	if (NAPI_GRO_CB(p)->last == p)	// nothing chained onto p yet
		skb_shinfo(p)->frag_list = skb;		// skb becomes the first frag_list entry
	else
		NAPI_GRO_CB(p)->last->next = skb;	// append after the current tail
	NAPI_GRO_CB(p)->last = skb;	// skb is the new tail
	/*
	 * NOTE(review): the original annotation says this frees skb's linear
	 * area; the helper's name suggests it only releases the header
	 * reference. Its body is not shown here - confirm.
	 */
	__skb_header_release(skb);
	lp = p;		// only p needs the accounting below on this path

done:
	NAPI_GRO_CB(p)->count++;	// one more merged segment (later stored as gso_segs)
	p->data_len += len;		// payload was added as non-linear data
	p->truesize += delta_truesize;	// account for the memory taken over from skb
	p->len += len;		// total length grows by the payload
	if (lp != p) {		// frags were attached to a chained buffer: update it as well
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	NAPI_GRO_CB(skb)->same_flow = 1;	// tells the caller that skb was merged
	return 0;
}


tcp4_gro_complete函数

/*
 * IPv4 front end of the TCP GRO completion step.
 * @skb:   aggregated packet about to leave GRO
 * @thoff: offset of the TCP header within @skb
 *
 * Re-seeds the TCP checksum field from the IPv4 addresses and the TCP length
 * (skb->len - thoff), tags the packet as SKB_GSO_TCPV4 and finishes in the
 * protocol-independent tcp_gro_complete(), whose result is returned.
 */
static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
	struct tcphdr *tcph = tcp_hdr(skb);
	const struct iphdr *ip4h = ip_hdr(skb);

	/* Store the complemented tcp_v4_check() result computed with a zero base. */
	tcph->check = ~tcp_v4_check(skb->len - thoff,
				    ip4h->saddr, ip4h->daddr, 0);

	/* Mark the aggregate as TCP/IPv4 GSO. */
	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
tcp_gro_complete函数

/*
 * Protocol-independent TCP GRO completion.
 * @skb: aggregated packet about to leave GRO
 *
 * Sets the packet up for CHECKSUM_PARTIAL with the checksum location pointing
 * at the TCP check field, records the number of merged segments as gso_segs
 * and adds SKB_GSO_TCP_ECN when the TCP header has CWR set.  Always returns 0.
 */
int tcp_gro_complete(struct sk_buff *skb)
{
	struct tcphdr *tcph = tcp_hdr(skb);

	/* Checksum metadata: start at the TCP header, result goes into ->check. */
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->csum_start = (unsigned char *)tcph - skb->head;

	/* Segment count accumulated while merging. */
	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

	/* CWR present on the aggregated header: flag the packet as TCP ECN. */
	if (tcph->cwr)
		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

	return 0;
}

tcp4_gro_receive实现了报文合并,根据报文的不同,有几种可能(下文的"相关flag"指URG、PSH、RST、SYN、FIN这五个TCP标记;此外,数据区长度小于mss的报文也按携带相关flag处理):

1)找到同流报文,但合并后的报文长度将达到或超过65536字节(skb_gro_receive返回-E2BIG),该同流报文将会被提交给协议栈,当前报文在mac层被放到gro_list;

2)找到同流报文,合并成功,当前报文未携带相关flag,将不会有报文被提交到协议栈;

3)找到同流报文,合并成功,当前报文携带相关flag,同流报文和当前报文一起被提交到协议栈;

4)未找到同流报文,当前报文携带相关flag,当前报文将被提交给协议栈;

5)未找到同流报文,当前报文未携带相关flag,当前报文将被保存到gro_list中;

你可能感兴趣的:(Linux4.1.12源码分析)