TCP 的 GRO 实现定义在 tcpv4_offload 对象中:
/*
 * Offload callback table for TCP over IPv4: one GSO segmentation hook
 * plus the GRO receive/complete hooks.
 */
static const struct net_offload tcpv4_offload = {
	.callbacks = {
		.gso_segment	= tcp4_gso_segment,
		.gro_receive	= tcp4_gro_receive,
		.gro_complete	= tcp4_gro_complete,
	},
};

/* tcp4_gro_receive function */
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, inet_gro_compute_pseudo)) { //如果flush为0,需要检测csum NAPI_GRO_CB(skb)->flush = 1; //如果检测失败则flush置1,报文将被提交到协议栈 return NULL; } return tcp_gro_receive(head, skb); //TCP gro receive处理,与IP协议无关 }tcp_gro_receive函数
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct sk_buff *p; struct tcphdr *th; struct tcphdr *th2; unsigned int len; unsigned int thlen; __be32 flags; unsigned int mss = 1; unsigned int hlen; unsigned int off; int flush = 1; int i; off = skb_gro_offset(skb); hlen = off + sizeof(*th); th = skb_gro_header_fast(skb, off); //得到TCP头 if (skb_gro_header_hard(skb, hlen)) { th = skb_gro_header_slow(skb, hlen, off); if (unlikely(!th)) goto out; } thlen = th->doff * 4; //得到TCP头的长度 if (thlen < sizeof(*th)) goto out; hlen = off + thlen; if (skb_gro_header_hard(skb, hlen)) { //检测报文 th = skb_gro_header_slow(skb, hlen, off); if (unlikely(!th)) goto out; } skb_gro_pull(skb, thlen); //报文移动到payload数据区 len = skb_gro_len(skb); //得到报文的数据区长度 flags = tcp_flag_word(th); for (; (p = *head); head = &p->next) { //遍历gro_list中的报文 if (!NAPI_GRO_CB(p)->same_flow) continue; th2 = tcp_hdr(p); //得到报文tcp头 if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { //源和目的端口不一致的不是同一个流 NAPI_GRO_CB(p)->same_flow = 0; continue; } goto found; //找到同一个流的报文,则跳出循环,即p指向同一个流的skb } goto out_check_final; found: /* Include the IP ID check below from the inner most IP hdr */ flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; //得到flush值,经过MAC/IP层设置 flush |= (__force int)(flags & TCP_FLAG_CWR); //如果当前报文携带CWR标记,则flush置1 flush |= (__force int)((flags ^ tcp_flag_word(th2)) & //如果当前报文和同流报文在(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)标记之外的标记不相同,则置flush为1 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); //如果当前报文和同流报文的ack_seq不同,则置flush为1 for (i = sizeof(*th); i < thlen; i += 4) //如果当前报文和同流报文的TCP头option信息不同,则置flush为1 flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); mss = tcp_skb_mss(p); //得到mss值 flush |= (len - 1) >= mss; //如果当前报文数据区长度超过mss,则置flush为1 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); //如果当前报文和同流报文不连续,则置flush为1 if (flush || skb_gro_receive(head, skb)) { 
//如果flush为0,则把当前报文合并到同流报文 mss = 1; goto out_check_final; } p = *head; //同流报文 th2 = tcp_hdr(p); tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); //如果当前报文包含(TCP_FLAG_FIN | TCP_FLAG_PSH)标记,则同流报文也添加该标记 out_check_final: flush = len < mss; //报文长度小于mss,一般是一个流的最后报文,需要尽快提交报文 flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | //如果报文携带这5个标记,则flush为1 TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) //p不为空,即找到同流报文,两种场景,1)同流报文超过65536;2)flush为1 pp = head; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); //设置当前报文的flush,决定是否提交当前报文到协议栈 return pp; }skb_gro_receive函数
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); struct sk_buff *lp, *p = *head; //p指向gro_list中与当前报文同流的skb unsigned int delta_truesize; if (unlikely(p->len + len >= 65536)) //超过最大报文数,返回错误将提交同流报文 return -E2BIG; lp = NAPI_GRO_CB(p)->last; //初始时,last指向p自身 pinfo = skb_shinfo(lp); if (headlen <= offset) { //如果线性区长度小于offset,即frag中还有报头数据 skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; //合并后的frag数 if (nr_frags > MAX_SKB_FRAGS) //如果合并后的frag超过最大frag数,则需要merge goto merge; offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; frag = pinfo->frags + nr_frags; frag2 = skbinfo->frags + i; do { *--frag = *--frag2; } while (--i); frag->page_offset += offset; //修正第一个frag,需要减掉报头数据 skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ delta_truesize = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb)); skb->truesize -= skb->data_len; skb->len -= skb->data_len; skb->data_len = 0; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; //当前报文被合并,待释放 goto done; } else if (skb->head_frag) { //ixgbe驱动创建的skb,该标记为true int nr_frags = pinfo->nr_frags; skb_frag_t *frag = pinfo->frags + nr_frags; struct page *page = virt_to_head_page(skb->head); //得到线性区的page unsigned int first_size = headlen - offset; unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) //合并后的frag数超过最大frag数,则需要merge goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + offset; pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; frag->page.p = page; //该frag报文报文线性区中的数据 frag->page_offset = first_offset; skb_frag_size_set(frag, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); //拷贝frag /* We dont need to clear skbinfo->nr_frags here */ delta_truesize = skb->truesize - 
SKB_DATA_ALIGN(sizeof(struct sk_buff)); NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; //当前报文被合并,待释放 goto done; } merge: delta_truesize = skb->truesize; if (offset > headlen) { //如果offset大于报文的线性区长度,意味着frag中有部分数据是报文头 unsigned int eat = offset - headlen; skbinfo->frags[0].page_offset += eat; //调整frag0中的数据,减掉报文头 skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; offset = headlen; } __skb_pull(skb, offset); //当前报文移动到数据区 if (NAPI_GRO_CB(p)->last == p) //初始状态时(skb第一次放到gro_list中),且没有merge过 skb_shinfo(p)->frag_list = skb; //报文保存到frag_list中 else NAPI_GRO_CB(p)->last->next = skb; //报文保存到frag_list中的最后一个报文的 NAPI_GRO_CB(p)->last = skb; //merge过以后,报文都放在frag_list链表中 __skb_header_release(skb); //释放skb的线性区 lp = p; done: NAPI_GRO_CB(p)->count++; //count加一,最后设置为segs p->data_len += len; //同流报文的长度加上当前报文的数据区长度 p->truesize += delta_truesize; //同流报文的truesize加上当前报文的truesize p->len += len; //同流报文的长度增加当前报文的长度 if (lp != p) { //当lp与p不相同时,lp报文相关长度信息也需要调整 lp->data_len += len; lp->truesize += delta_truesize; lp->len += len; } NAPI_GRO_CB(skb)->same_flow = 1; //same_flow置1,说明报文已经被合并到gro_list中 return 0; }
tcp4_gro_complete函数
static int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, //刷新check值 iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; //置GSO_TCPV4标记 return tcp_gro_complete(skb); }tcp_gro_complete
int tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); skb->csum_start = (unsigned char *)th - skb->head; //设置ip_summed及相关值 skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; //设置segs if (th->cwr) skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; //如果当前报文携带cwr标记,则携带SKB_GSO_TCP_ECN标记 return 0; }
综上,tcp_gro_receive 的处理结果可分为以下几种情况:
1)找到同流报文,但合并后长度将达到或超过 65536 字节(skb_gro_receive 返回 -E2BIG),合并失败,该同流报文将会被提交给协议栈,当前报文在 MAC 层被放到 gro_list;
2)找到同流报文,合并成功,当前报文未携带相关 flag,将不会有报文被提交到协议栈;
3)找到同流报文,合并成功,当前报文携带相关 flag(或数据长度小于 mss),同流报文和当前报文一起被提交到协议栈;
4)未找到同流报文,当前报文携带相关 flag,当前报文将被提交给协议栈;
5)未找到同流报文,当前报文未携带相关 flag,当前报文将被保存到 gro_list 中;