TCP GRO的实现定义在tcpv4_offload对象中
/*
 * IPv4 TCP offload descriptor: wires the TCP GSO segmentation handler and
 * the GRO receive/complete handlers into the callbacks table.
 */
static const struct net_offload tcpv4_offload = {
	.callbacks.gso_segment	= tcp4_gso_segment,
	.callbacks.gro_receive	= tcp4_gro_receive,
	.callbacks.gro_complete	= tcp4_gro_complete,
};
tcp4_gro_receive函数
/*
 * IPv4 front end for TCP GRO receive.
 *
 * Validates the TCP checksum (unless the packet is already marked for
 * flush) and then delegates to the protocol-independent tcp_gro_receive().
 *
 * Returns NULL after setting the packet's flush flag when checksum
 * validation fails; otherwise returns whatever tcp_gro_receive() returns.
 */
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	/* A packet that will be flushed anyway is not worth checksumming. */
	if (NAPI_GRO_CB(skb)->flush)
		return tcp_gro_receive(head, skb);

	if (skb_gro_checksum_validate(skb, IPPROTO_TCP,
				      inet_gro_compute_pseudo)) {
		/* Validation failed: mark for flush and skip aggregation. */
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	/* Checksum is fine: run the IP-version-agnostic TCP GRO logic. */
	return tcp_gro_receive(head, skb);
}
tcp_gro_receive函数
/*
 * tcp_gro_receive - try to merge an incoming TCP segment into a packet of
 * the same flow that is already held on the GRO list.
 *
 * @head: link to the first entry of the GRO list to search
 * @skb:  incoming segment; its GRO offset is expected to be at the TCP header
 *
 * Returns NULL when no held packet has to be flushed, otherwise the list
 * link that points at the held same-flow packet (presumably the caller
 * flushes that packet up the stack - confirm against the caller).
 * NAPI_GRO_CB(skb)->flush is additionally set when skb itself must not be
 * kept for further aggregation.
 */
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	/* mss stays 1 unless a same-flow packet is found and merged into. */
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	/* Default to "flush" so the early "goto out" error paths flag skb. */
	int flush = 1;
	int i;
	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	/* Locate the TCP header; take the slow path if the fast one is not usable. */
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}
	/* TCP header length in bytes (doff scaled by 4). */
	thlen = th->doff * 4;
	/* A header shorter than struct tcphdr is malformed. */
	if (thlen < sizeof(*th))
		goto out;
	hlen = off + thlen;
	/* Re-check header access now that the full length (with options) is known. */
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}
	/* Step over the TCP header so the GRO offset is at the payload. */
	skb_gro_pull(skb, thlen);
	/* Payload length of the incoming segment. */
	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);
	/* Walk the held packets looking for one that belongs to the same flow. */
	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;
		/* TCP header of the held packet. */
		th2 = tcp_hdr(p);
		/* One 32-bit compare starting at ->source: differing ports mean a different flow. */
		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
		/* p now refers to the same-flow packet and *head is the link to it. */
		goto found;
	}
	/* No same-flow packet: p is NULL here. */
	goto out_check_final;
found:
	/* Include the IP ID check below from the inner most IP hdr */
	/* Start from the flush state recorded on p (per the original note, set by the MAC/IP layers). */
	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
	/* A segment carrying CWR is never merged. */
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	/* Any flag difference other than CWR/FIN/PSH prevents merging. */
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	/* Differing ack_seq prevents merging. */
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	/* TCP options must match word for word. */
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);
	/* mss as reported by tcp_skb_mss() for the held packet. */
	mss = tcp_skb_mss(p);
	/*
	 * Payload larger than mss prevents merging; len is unsigned, so
	 * len == 0 wraps around and is rejected by this test as well.
	 */
	flush |= (len - 1) >= mss;
	/* The segment must start exactly where the held data ends. */
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
	/* Merge only when nothing objected; a non-zero skb_gro_receive() result also counts as failure. */
	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}
	/* Re-read the same-flow packet and its TCP header after the merge. */
	p = *head;
	th2 = tcp_hdr(p);
	/* Propagate FIN/PSH from the merged segment onto the held packet. */
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
out_check_final:
	/*
	 * flush is recomputed from scratch here. After a successful merge a
	 * payload shorter than mss requests a flush (per the original note,
	 * typically the last segment of a burst); on the other paths mss is 1,
	 * so only an empty payload does.
	 */
	flush = len < mss;
	/* Any of URG/PSH/RST/SYN/FIN also requests a flush. */
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));
	/*
	 * A same-flow packet exists and either skb was not merged into it
	 * (skb_gro_receive() sets same_flow on skb when it succeeds) or the
	 * final flush conditions hold: hand back the link to that packet.
	 */
	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;
out:
	/* Record on skb whether it has to be flushed as well. */
	NAPI_GRO_CB(skb)->flush |= (flush != 0);
	return pp;
}
skb_gro_receive函数
/*
 * skb_gro_receive - append the payload of skb to the GRO packet *head.
 *
 * @head: link to the held same-flow packet
 * @skb:  incoming segment whose GRO offset is at its payload
 *
 * Three strategies are tried in order:
 *  1. skb's linear area holds no payload (headlen <= offset): move skb's
 *     page frags into the frag array of the last skb of the held packet.
 *  2. skb->head_frag is set: describe the linear payload as a page frag
 *     and copy skb's own frags after it.
 *  3. otherwise (or when the frag array would overflow): chain skb itself
 *     onto the held packet's frag_list.
 *
 * Returns 0 on success, -E2BIG when the merged length would reach 65536.
 */
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	unsigned int len = skb_gro_len(skb);
	/* p is the held packet of the same flow as skb. */
	struct sk_buff *lp, *p = *head;
	unsigned int delta_truesize;
	/* The limit is on total length (not on packet count): refuse once it would reach 65536. */
	if (unlikely(p->len + len >= 65536))
		return -E2BIG;
	/* Last skb of the held packet (per the original note, initially p itself). */
	lp = NAPI_GRO_CB(p)->last;
	pinfo = skb_shinfo(lp);
	/* Linear area ends at or before the payload: all payload lives in page frags. */
	if (headlen <= offset) {
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		/* Frag count after the merge. */
		int nr_frags = pinfo->nr_frags + i;
		/* Frag array would overflow: fall back to frag_list chaining. */
		if (nr_frags > MAX_SKB_FRAGS)
			goto merge;
		/* Header bytes that spill into skb's first frag. */
		offset -= headlen;
		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;
		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		/*
		 * Copy skb's frags back to front; afterwards frag points at the
		 * first copied entry.
		 * NOTE(review): the do/while assumes skb has at least one frag.
		 */
		do {
			*--frag = *--frag2;
		} while (--i);
		/* Trim the header bytes off the first copied frag. */
		frag->page_offset += offset;
		skb_frag_size_sub(frag, offset);
		/* all fragments truesize : remove (head size + sk_buff) */
		delta_truesize = skb->truesize -
				 SKB_TRUESIZE(skb_end_offset(skb));
		/* skb no longer accounts for any paged data. */
		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;
		/* skb has been merged and is marked to be freed. */
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
		goto done;
	/* Per the original note, skbs built by the ixgbe driver have head_frag set - not verifiable here. */
	} else if (skb->head_frag) {
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		/* Page backing skb's linear area. */
		struct page *page = virt_to_head_page(skb->head);
		/* Payload bytes held in the linear area. */
		unsigned int first_size = headlen - offset;
		unsigned int first_offset;
		/* One frag for the linear payload plus skb's own frags must fit. */
		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
			goto merge;
		/* Offset of the payload within that page. */
		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;
		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
		/* New frag describing the payload stored in skb's linear area. */
		frag->page.p = page;
		frag->page_offset = first_offset;
		skb_frag_size_set(frag, first_size);
		/* Append skb's existing frags right after it. */
		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
		/* We dont need to clear skbinfo->nr_frags here */
		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
		/* skb has been merged; marked to be freed with the STOLEN_HEAD variant. */
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
		goto done;
	}
merge:
	/* Chaining keeps the whole skb, so its full truesize is accounted. */
	delta_truesize = skb->truesize;
	/* Part of the header sits in frag 0: strip it from there. */
	if (offset > headlen) {
		unsigned int eat = offset - headlen;
		skbinfo->frags[0].page_offset += eat;
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}
	/* Pull the remaining header bytes so skb->data is at the payload. */
	__skb_pull(skb, offset);
	/* Nothing chained yet: skb becomes the head of p's frag_list ... */
	if (NAPI_GRO_CB(p)->last == p)
		skb_shinfo(p)->frag_list = skb;
	/* ... otherwise link it after the current tail. */
	else
		NAPI_GRO_CB(p)->last->next = skb;
	/* skb is the new tail of the chain. */
	NAPI_GRO_CB(p)->last = skb;
	/* NOTE(review): the original note calls this releasing skb's linear area; confirm the helper's exact semantics. */
	__skb_header_release(skb);
	/* Make lp == p so the lp-specific accounting below is skipped. */
	lp = p;
done:
	/* One more segment merged (tcp_gro_complete() copies count to gso_segs). */
	NAPI_GRO_CB(p)->count++;
	/* Grow the held packet by the merged payload and its truesize share. */
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;
	/* Frags were added to a chained tail skb: keep its own accounting in sync. */
	if (lp != p) {
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	/* Mark skb as merged into a held packet. */
	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}
tcp4_gro_complete函数
/*
 * IPv4 completion hook for an aggregated TCP packet.
 *
 * @skb:   aggregated packet
 * @thoff: offset of the TCP header, subtracted from skb->len to obtain
 *         the TCP length
 *
 * Re-seeds the TCP checksum field from the IPv4 addresses and TCP length,
 * tags the packet as SKB_GSO_TCPV4 and finishes in tcp_gro_complete(),
 * whose result is returned.
 */
static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
	struct tcphdr *tcph = tcp_hdr(skb);
	const struct iphdr *ip4h = ip_hdr(skb);
	int tcp_len = skb->len - thoff;

	/* Refresh the checksum field for the merged length. */
	tcph->check = ~tcp_v4_check(tcp_len, ip4h->saddr, ip4h->daddr, 0);

	/* Flag the aggregate as TCPv4 GSO. */
	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
tcp_gro_complete函数
int tcp_gro_complete(struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
skb->csum_start = (unsigned char *)th - skb->head; //设置ip_summed及相关值
skb->csum_offset = offsetof(struct tcphdr, check);
skb->ip_summed = CHECKSUM_PARTIAL;
skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; //设置segs
if (th->cwr)
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; //如果当前报文携带cwr标记,则携带SKB_GSO_TCP_ECN标记
return 0;
}
tcp_gro_receive的处理结果可归纳为以下五种场景(下文“相关flag”指URG/PSH/RST/SYN/FIN):
1)找到同流报文,但合并后的总长度将达到或超过65536字节(skb_gro_receive返回-E2BIG),该同流报文将会被提交给协议栈,当前报文在mac层被放到gro_list;
2)找到同流报文,合并成功,当前报文未携带相关flag,将不会有报文被提交到协议栈;
3)找到同流报文,合并成功,当前报文携带相关flag,同流报文和当前报文一起被提交到协议栈;
4)未找到同流报文,当前报文携带相关flag,当前报文将被提交给协议栈;
5)未找到同流报文,当前报文未携带相关flag,当前报文将被保存到gro_list中;