5.1 GRO (Generic Receive Offload)

  GRO (Generic Receive Offload) aggregates multiple TCP segments into a single skb and delivers them to the upper protocol layers as one large packet, which reduces the per-skb processing overhead in the stack and improves TCP receive performance. The feature requires support from the NIC driver. A "super" skb that has absorbed several skbs traverses the network stack only once, which lowers the CPU load.
     GRO is an optimization of the receive path, and only NAPI-style drivers can support it. So besides kernel support, the driver must call the corresponding interfaces to enable it. GRO is switched on with ethtool -K <dev> gro on; if that command fails, the NIC driver itself does not support GRO.
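     If you want to query or toggle GRO programmatically instead of running ethtool, the legacy SIOCETHTOOL ioctl with the ETHTOOL_GGRO/ETHTOOL_SGRO commands can be used (this is one of the interfaces ethtool itself has used; newer kernels also expose the switch through ETHTOOL_GFEATURES and the ethtool netlink API). A minimal userspace sketch for querying the GRO state, for illustration only:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <linux/ethtool.h>
    #include <linux/sockios.h>

    int main(int argc, char **argv)
    {
        const char *ifname = argc > 1 ? argv[1] : "eth0";
        struct ethtool_value eval = { .cmd = ETHTOOL_GGRO };  /* "get GRO" command */
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
            perror("socket");
            return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
        ifr.ifr_data = (void *)&eval;

        if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
            perror("ETHTOOL_GGRO");   /* e.g. the driver does not support GRO */
            close(fd);
            return 1;
        }

        printf("%s: generic-receive-offload %s\n", ifname, eval.data ? "on" : "off");
        close(fd);
        return 0;
    }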
     GRO is similar in spirit to TSO, except that TSO applies to the transmit side, while GRO works on receive. A driver that supports GRO reads packets in its NAPI poll callback and feeds them into the stack through the GRO entry points napi_gro_receive() or napi_gro_frags().
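     To make the driver side concrete, here is a minimal, hypothetical NAPI poll() sketch (foo_ring and foo_fetch_rx_skb are made-up names, not taken from any real driver); the only GRO-specific part is that completed skbs are handed to napi_gro_receive() instead of netif_receive_skb():

    /* Hypothetical driver code, for illustration only. */
    #include <linux/netdevice.h>

    struct foo_ring {
        struct napi_struct napi;
        /* ... RX descriptor ring state of the imaginary device ... */
    };

    /* Hypothetical helper: build an skb from the next completed RX descriptor,
     * or return NULL when the ring is empty. */
    static struct sk_buff *foo_fetch_rx_skb(struct foo_ring *ring);

    static int foo_poll(struct napi_struct *napi, int budget)
    {
        struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
        int done = 0;

        while (done < budget) {
            struct sk_buff *skb = foo_fetch_rx_skb(ring);

            if (!skb)
                break;

            /* With GRO: hand the skb to the GRO layer instead of
             * calling netif_receive_skb() directly. */
            napi_gro_receive(napi, skb);
            done++;
        }

        if (done < budget)
            napi_complete(napi);   /* also flushes napi->gro_list */

        return done;
    }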

     Let us now walk through how GRO processes a packet. When receiving, the NIC driver calls napi_gro_receive():

3863 static void skb_gro_reset_offset(struct sk_buff *skb)
3864 {
3865     const struct skb_shared_info *pinfo = skb_shinfo(skb);
3866     const skb_frag_t *frag0 = &pinfo->frags[0];
3867
3868     NAPI_GRO_CB(skb)->data_offset = 0;
3869     NAPI_GRO_CB(skb)->frag0 = NULL;
3870     NAPI_GRO_CB(skb)->frag0_len = 0;
3871
3872     if (skb->mac_header == skb->tail &&
3873         pinfo->nr_frags &&
3874         !PageHighMem(skb_frag_page(frag0))) {//mac_header == tail means there is no data in the linear area, so the headers live in frags[0]; record frag0 if that page is not in high memory
3875         NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3876         NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3877     }
3878 }
3879     
3880 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3881 {   
3882     skb_gro_reset_offset(skb);
3883
3884     return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3885 }
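   The NAPI_GRO_CB() macro used above simply reinterprets the skb's control buffer (skb->cb) as GRO bookkeeping state. For this kernel generation the structure looks roughly as follows (field layout abbreviated):

    /* include/linux/netdevice.h (roughly, for this kernel generation) */
    struct napi_gro_cb {
        void *frag0;            /* virtual address of frags[0], if the headers live there */
        unsigned int frag0_len; /* length of frag0 */
        int data_offset;        /* how far headers have been parsed, relative to skb->data */
        int flush;              /* non-zero: this skb must not be merged */
        u16 count;              /* number of segments aggregated so far */
        u8 same_flow;           /* non-zero: may belong to the same flow as a queued skb */
        u8 free;                /* NAPI_GRO_FREE / NAPI_GRO_FREE_STOLEN_HEAD */
        unsigned long age;      /* jiffies when the packet was queued on gro_list */
        struct sk_buff *last;   /* tail of the frag_list chain built by skb_gro_receive() */
        /* ... */
    };

    #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)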
   There are two places where GRO pushes data into the protocol stack. One is napi_skb_finish(), which looks at the return value of dev_gro_receive() to decide whether the packet must be delivered; the other is when the NAPI loop finishes and napi_complete() is called. Let us look at napi_skb_finish() first:
3836 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3837 {
3838     switch (ret) {
3839     case GRO_NORMAL://deliver the packet to the protocol stack
3840         if (netif_receive_skb(skb))
3841             ret = GRO_DROP;
3842         break;
3843
3844     case GRO_DROP:
3845         kfree_skb(skb);
3846         break;
3847
3848     case GRO_MERGED_FREE://the skb can be freed: GRO has already merged its data and keeps it elsewhere
3849         if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3850             kmem_cache_free(skbuff_head_cache, skb);
3851         else
3852             __kfree_skb(skb);
3853         break;
3854     
3855     case GRO_HELD://the skb is being held on the GRO list without having been merged, so it must not be freed
3856     case GRO_MERGED:
3857         break;
3858     }       
3859
3860     return ret;
3861 }
  dev_gro_receive() merges skbs and decides whether an aggregated skb should be pushed into the network stack:
3743 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3744 {
3745     struct sk_buff **pp = NULL;    
3746     struct packet_offload *ptype;  
3747     __be16 type = skb->protocol;   
3748     struct list_head *head = &offload_base;
3749     int same_flow;
3750     enum gro_result ret;
3751
3752     if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))//GRO is not enabled on this device, or netpoll is active
3753         goto normal;
3754
3755     if (skb_is_gso(skb) || skb_has_frag_list(skb))//the skb is GSO or already carries a frag_list, i.e. it is already an aggregated packet; do not aggregate again
3756         goto normal;
3757
3758     gro_list_prepare(napi, skb);//mark which skbs on the GRO list may belong to the same flow as this one at the link layer
3759
3760     rcu_read_lock();
3761     list_for_each_entry_rcu(ptype, head, list) {//walk the list of registered GRO protocol handlers (offload_base)
3762         if (ptype->type != type || !ptype->callbacks.gro_receive)//look for the network-layer handler matching skb->protocol
3763             continue;
3764
3765         skb_set_network_header(skb, skb_gro_offset(skb));
3766         skb_reset_mac_len(skb);
3767         NAPI_GRO_CB(skb)->same_flow = 0;
3768         NAPI_GRO_CB(skb)->flush = 0;
3769         NAPI_GRO_CB(skb)->free = 0;
3770
3771         pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);//calls inet_gro_receive() or ipv6_gro_receive() to try to merge the skb
3772         break;
3773     }
3774     rcu_read_unlock();
3775
3776     if (&ptype->list == head)//no handler was found
3777         goto normal;
3778
3779     same_flow = NAPI_GRO_CB(skb)->same_flow;
3780     ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3781
3782     if (pp) {
3783         struct sk_buff *nskb = *pp;
3784
3785         *pp = nskb->next;
3786         nskb->next = NULL;
3787         napi_gro_complete(nskb);//finalize the aggregated skb's headers and deliver it to the protocol stack
3788         napi->gro_count--;
3789     }
3790
3791     if (same_flow)//an skb of the same flow was found and the current skb has been merged into it
3792         goto ok;
3793
3794     if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)//the skb must not be aggregated, or the GRO list is full
3795         goto normal;
3796
3797     napi->gro_count++;
3798     NAPI_GRO_CB(skb)->count = 1;
3799     NAPI_GRO_CB(skb)->age = jiffies;
3800     skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3801     skb->next = napi->gro_list;//the skb starts a new flow: nothing on gro_list can be merged with it, so link it onto the GRO list
3802     napi->gro_list = skb;
3803     ret = GRO_HELD; //the packet is held by GRO and must not be freed
3804
3805 pull:
3806     if (skb_headlen(skb) < skb_gro_offset(skb)) {//if the headers are not entirely in the linear area, copy them there
3807         int grow = skb_gro_offset(skb) - skb_headlen(skb);
3808
3809         BUG_ON(skb->end - skb->tail < grow);
3810
3811         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);//copy the header bytes from the first page (frag0) into the linear area
3812
3813         skb->tail += grow;
3814         skb->data_len -= grow;
3815
3816         skb_shinfo(skb)->frags[0].page_offset += grow;
3817         skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3818
3819         if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {//the first page has been consumed entirely
3820             skb_frag_unref(skb, 0); //drop the reference on the now-empty page
3821             memmove(skb_shinfo(skb)->frags,
3822                 skb_shinfo(skb)->frags + 1,
3823                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));//shift the remaining frag entries forward
3824         }
3825     }
3826
3827 ok:
3828     return ret;
3829
3830 normal:
3831     ret = GRO_NORMAL;
3832     goto pull;
3833 }
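   gro_list_prepare(), called at the top of dev_gro_receive(), performs only the link-layer part of the flow match: it walks napi->gro_list and clears same_flow for every queued skb whose device, VLAN tag or MAC header differs from the new packet. Its logic is roughly the following (simplified sketch, not verbatim kernel source):

    static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
    {
        struct sk_buff *p;
        unsigned int maclen = skb->dev->hard_header_len;

        for (p = napi->gro_list; p; p = p->next) {
            unsigned long diffs;

            diffs  = (unsigned long)p->dev ^ (unsigned long)skb->dev;
            diffs |= p->vlan_tci ^ skb->vlan_tci;
            if (!diffs)
                diffs = memcmp(skb_mac_header(p),
                               skb_gro_mac_header(skb), maclen);

            /* same_flow here only means "same flow as far as L2 can tell";
             * inet_gro_receive()/tcp_gro_receive() refine it below. */
            NAPI_GRO_CB(p)->same_flow = !diffs;
        }
    }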
   inet_gro_receive() is the network-layer GRO receive handler:
1351 static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1352                      struct sk_buff *skb)
1353 {       
1354     const struct net_offload *ops;
1355     struct sk_buff **pp = NULL;
1356     struct sk_buff *p;
1357     const struct iphdr *iph;
1358     unsigned int hlen;
1359     unsigned int off;
1360     unsigned int id;
1361     int flush = 1;
1362     int proto;
1363
1364     off = skb_gro_offset(skb);
1365     hlen = off + sizeof(*iph); //current GRO parse offset (start of the IP header) plus the basic IP header length
1366     iph = skb_gro_header_fast(skb, off);
1367     if (skb_gro_header_hard(skb, hlen)) {
1368         iph = skb_gro_header_slow(skb, hlen, off);
1369         if (unlikely(!iph))
1370             goto out;
1371     }
1372
1373     proto = iph->protocol;
1374
1375     rcu_read_lock();
1376     ops = rcu_dereference(inet_offloads[proto]);//find the transport-layer offload handler registered for this protocol
1377     if (!ops || !ops->callbacks.gro_receive)
1378         goto out_unlock;
1379
1380     if (*(u8 *)iph != 0x45)//not IPv4, or the header is not exactly 20 bytes (the version/ihl byte must be 0x45)
1381         goto out_unlock;
1382
1383     if (unlikely(ip_fast_csum((u8 *)iph, 5)))//bad IP header checksum
1384         goto out_unlock;
1385
1386     id = ntohl(*(__be32 *)&iph->id);
1387     flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
1388     id >>= 16;
1389
1390     for (p = *head; p; p = p->next) {
1391         struct iphdr *iph2;
1392
1393         if (!NAPI_GRO_CB(p)->same_flow)//already marked as a different flow by the lower layer
1394             continue;
1395
1396         iph2 = ip_hdr(p);
1397
1398         if ((iph->protocol ^ iph2->protocol) |
1399             ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
1400             ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {//protocol, source or destination address differs: not the same flow
1401             NAPI_GRO_CB(p)->same_flow = 0;
1402             continue;
1403         }
1404
1405         /* All fields must match except length and checksum. */
1406         NAPI_GRO_CB(p)->flush |=
1407             (iph->ttl ^ iph2->ttl) |
1408             (iph->tos ^ iph2->tos) |
1409             ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);//flush p if TTL or TOS differs, or the IP id is not exactly consecutive
1410
1411         NAPI_GRO_CB(p)->flush |= flush;
1412     }
1413
1414     NAPI_GRO_CB(skb)->flush |= flush;
1415     skb_gro_pull(skb, sizeof(*iph));
1416     skb_set_transport_header(skb, skb_gro_offset(skb));
1417
1418     pp = ops->callbacks.gro_receive(head, skb);//for TCP this points to tcp4_gro_receive()
1419
1420 out_unlock:
1421     rcu_read_unlock();
1422
1423 out:
1424     NAPI_GRO_CB(skb)->flush |= flush;
1425
1426     return pp;
1427 }
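   The bit manipulation at lines 1386-1388 above is compact; expanded, it performs the following two checks (illustrative expansion of those lines, not kernel source):

    u32 first_word = ntohl(*(__be32 *)iph);       /* version/ihl | tos | tot_len */
    u32 id_word    = ntohl(*(__be32 *)&iph->id);  /* id (high 16 bits) | frag_off (low 16 bits) */

    /* flush keeps only the low 16 bits of the OR, so it becomes non-zero when either
     *  - tot_len differs from the length GRO actually sees for this packet, or
     *  - frag_off is not exactly IP_DF (DF set, MF clear, fragment offset 0).
     */
    flush = (u16)((first_word ^ skb_gro_len(skb)) | (id_word ^ IP_DF));

    /* the high 16 bits of id_word are the real IP id, used below for the
     * "ids must be consecutive" check against packets already on the GRO list */
    id = id_word >> 16;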
  tcp4_gro_receive() is the transport-layer GRO receive handler:

2806 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2807 {       
2808     const struct iphdr *iph = skb_gro_network_header(skb);
2809     __wsum wsum;
2810     __sum16 sum;
2811
2812     switch (skb->ip_summed) {
2813     case CHECKSUM_COMPLETE:
2814         if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2815                   skb->csum)) {
2816             skb->ip_summed = CHECKSUM_UNNECESSARY;
2817             break;
2818         }
2819 flush:
2820         NAPI_GRO_CB(skb)->flush = 1;
2821         return NULL;
2822     
2823     case CHECKSUM_NONE:
2824         wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2825                       skb_gro_len(skb), IPPROTO_TCP, 0);
2826         sum = csum_fold(skb_checksum(skb,
2827                          skb_gro_offset(skb),
2828                          skb_gro_len(skb),
2829                          wsum));
2830         if (sum)
2831             goto flush;
2832
2833         skb->ip_summed = CHECKSUM_UNNECESSARY;
2834         break;
2835     }
2836
2837     return tcp_gro_receive(head, skb);
2838 }
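   Both inet_gro_receive() and tcp4_gro_receive() read headers through the skb_gro_header_fast()/skb_gro_header_hard()/skb_gro_header_slow() helpers. These tie back to the frag0 pointer set up in skb_gro_reset_offset(): the fast path reads the header directly from the first page fragment, and only falls back to pulling data into the linear area when frag0 does not hold enough bytes. In this kernel generation they look roughly like this:

    static inline void *skb_gro_header_fast(struct sk_buff *skb, unsigned int offset)
    {
        return NAPI_GRO_CB(skb)->frag0 + offset;    /* header bytes live in frags[0] */
    }

    static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
    {
        return NAPI_GRO_CB(skb)->frag0_len < hlen;  /* frag0 too short for hlen bytes? */
    }

    static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
                                            unsigned int offset)
    {
        if (!pskb_may_pull(skb, hlen))              /* pull the header into the linear area */
            return NULL;

        NAPI_GRO_CB(skb)->frag0 = NULL;             /* frag0 can no longer be used */
        NAPI_GRO_CB(skb)->frag0_len = 0;
        return skb->data + offset;
    }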
   tcp_gro_receive():
3011 struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
3012 {
3013     struct sk_buff **pp = NULL;
3014     struct sk_buff *p;
3015     struct tcphdr *th;
3016     struct tcphdr *th2;
3017     unsigned int len;
3018     unsigned int thlen;
3019     __be32 flags;
3020     unsigned int mss = 1;
3021     unsigned int hlen;
3022     unsigned int off;
3023     int flush = 1;
3024     int i;
3025
3026     off = skb_gro_offset(skb);
3027     hlen = off + sizeof(*th);
3028     th = skb_gro_header_fast(skb, off);
3029     if (skb_gro_header_hard(skb, hlen)) {
3030         th = skb_gro_header_slow(skb, hlen, off);
3031         if (unlikely(!th))
3032             goto out;
3033     }
3034
3035     thlen = th->doff * 4;
3036     if (thlen < sizeof(*th))
3037         goto out;
3038
3039     hlen = off + thlen;
3040     if (skb_gro_header_hard(skb, hlen)) {
3041         th = skb_gro_header_slow(skb, hlen, off);
3042         if (unlikely(!th))
3043             goto out;
3044     }
3045
3046     skb_gro_pull(skb, thlen);
3047
3048     len = skb_gro_len(skb);
3049     flags = tcp_flag_word(th);
3050
3051     for (; (p = *head); head = &p->next) {
3052         if (!NAPI_GRO_CB(p)->same_flow)//the lower layers already decided this is not the same flow
3053             continue;
3054
3055         th2 = tcp_hdr(p);
3056
3057         if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {//the 32-bit compare covers both the source and destination ports; if either differs this is not the same flow
3058             NAPI_GRO_CB(p)->same_flow = 0;
3059             continue;
3060         }
3061
3062         goto found;
3063     }
3064
3065     goto out_check_final;
3066
3067 found:
3068     flush = NAPI_GRO_CB(p)->flush;
3069     flush |= (__force int)(flags & TCP_FLAG_CWR);//CWR means congestion was signalled, so the cached packets must be pushed up immediately for TCP congestion control
3070     flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
3071           ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));//any control flag other than CWR, FIN or PSH differing also forces an immediate flush
3072     flush |= (__force int)(th->ack_seq ^ th2->ack_seq);//a different ack_seq forces a flush so that TCP send-buffer space can be released promptly
3073     for (i = sizeof(*th); i < thlen; i += 4)
3074         flush |= *(u32 *)((u8 *)th + i) ^
3075              *(u32 *)((u8 *)th2 + i);//the TCP options must be identical as well
3076
3077     mss = skb_shinfo(p)->gso_size;
3078
3079     flush |= (len - 1) >= mss;
3080     flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
3081
3082     if (flush || skb_gro_receive(head, skb)) { //a non-zero flush means the skb must go up immediately and may not be merged via skb_gro_receive(); a failed merge lands here too
3083         mss = 1;
3084         goto out_check_final;
3085     }
3086
3087     p = *head;
3088     th2 = tcp_hdr(p);
3089     tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
3090
3091 out_check_final:
3092     flush = len < mss;
3093     flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
3094                     TCP_FLAG_RST | TCP_FLAG_SYN |
3095                     TCP_FLAG_FIN));//URG, PSH, RST, SYN or FIN causes the aggregated packet to be delivered to the stack immediately
3096
3097     if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
3098         pp = head;
3099
3100 out:
3101     NAPI_GRO_CB(skb)->flush |= flush;
3102
3103     return pp;
3104 }
   A detail worth noting: line 3057 looks as if it compared only the source port. In fact, because the 16-bit source and destination ports sit next to each other at the start of the TCP header, the single 32-bit XOR at &th->source compares both ports at once, so the destination port is checked as well.
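   A tiny userspace illustration of that trick (not kernel code): the two 16-bit ports are packed into one 32-bit word, so a single XOR catches a mismatch in either of them:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* The first four bytes of a TCP header: source port, then destination port. */
    struct tcp_ports {
        uint16_t source;
        uint16_t dest;
    };

    int main(void)
    {
        struct tcp_ports a = { .source = 0x1234, .dest = 0x0050 };  /* same source, dest 80 */
        struct tcp_ports b = { .source = 0x1234, .dest = 0x1f90 };  /* same source, dest 8080 */
        uint32_t wa, wb;

        memcpy(&wa, &a, sizeof(wa));
        memcpy(&wb, &b, sizeof(wb));

        /* Non-zero: the destination ports differ even though the source ports match,
         * so the single 32-bit compare is enough to tell the flows apart. */
        printf("xor = 0x%08x\n", wa ^ wb);
        return 0;
    }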
    skb_gro_receive() merges an skb into another skb of the same flow:
2942 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2943 {
2944     struct sk_buff *p = *head;
2945     struct sk_buff *nskb;
2946     struct skb_shared_info *skbinfo = skb_shinfo(skb);
2947     struct skb_shared_info *pinfo = skb_shinfo(p);
2948     unsigned int headroom;
2949     unsigned int len = skb_gro_len(skb);
2950     unsigned int offset = skb_gro_offset(skb);
2951     unsigned int headlen = skb_headlen(skb);
2952     unsigned int delta_truesize;
2953
2954     if (p->len + len >= 65536)
2955         return -E2BIG;
2956
2957     if (pinfo->frag_list)  //p already chains skbs on frag_list (the no-scatter-gather case): just append
2958         goto merge;
2959     else if (headlen <= offset) {//the linear area holds at most the headers, so the payload lives entirely in the frag pages: steal skb's frags
2960         skb_frag_t *frag;
2961         skb_frag_t *frag2;
2962         int i = skbinfo->nr_frags;
2963         int nr_frags = pinfo->nr_frags + i;
2964
2965         offset -= headlen;
2966
2967         if (nr_frags > MAX_SKB_FRAGS)
2968             return -E2BIG;
2969
2970         pinfo->nr_frags = nr_frags;
2971         skbinfo->nr_frags = 0;
2972
2973         frag = pinfo->frags + nr_frags;
2974         frag2 = skbinfo->frags + i;
2975         do {//copy skb's frag entries onto the end of p's frags array
2976             *--frag = *--frag2;
2977         } while (--i);
2978
2979         frag->page_offset += offset;//skip the remaining header bytes so that only payload is kept
2980         skb_frag_size_sub(frag, offset);
2981
2982         /* all fragments truesize : remove (head size + sk_buff) */
2983         delta_truesize = skb->truesize -
2984                  SKB_TRUESIZE(skb_end_offset(skb));
2985
2986         skb->truesize -= skb->data_len;
2987         skb->len -= skb->data_len;
2988         skb->data_len = 0;
2989
2990         NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
2991         goto done;
2992     } else if (skb->head_frag) {//the skb head itself is backed by a page fragment (build_skb()), so it can be turned into a frag of p without copying
2993         int nr_frags = pinfo->nr_frags;
2994         skb_frag_t *frag = pinfo->frags + nr_frags;
2995         struct page *page = virt_to_head_page(skb->head);
2996         unsigned int first_size = headlen - offset;
2997         unsigned int first_offset;
2998
2999         if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
3000             return -E2BIG;
3001
3002         first_offset = skb->data -
3003                    (unsigned char *)page_address(page) +
3004                    offset;
3005
3006         pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
3007
3008         frag->page.p      = page;
3009         frag->page_offset = first_offset;
3010         skb_frag_size_set(frag, first_size);
3011
3012         memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
3013         /* We dont need to clear skbinfo->nr_frags here */
3014
3015         delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
3016         NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
3017         goto done;
3018     } else if (skb_gro_len(p) != pinfo->gso_size)
3019         return -E2BIG;
3020 //neither case above applies: the NIC does not do scatter-gather and the data sits in each skb's linear area, so build a header-only skb and chain the packets on its frag_list
3021     headroom = skb_headroom(p);
3022     nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);//allocate the new header-only skb
3023     if (unlikely(!nskb))
3024         return -ENOMEM;
3025
3026     __copy_skb_header(nskb, p);
3027     nskb->mac_len = p->mac_len;
3028
3029     skb_reserve(nskb, headroom);
3030     __skb_put(nskb, skb_gro_offset(p));
3031
3032     skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
3033     skb_set_network_header(nskb, skb_network_offset(p));
3034     skb_set_transport_header(nskb, skb_transport_offset(p));
3035
3036     __skb_pull(p, skb_gro_offset(p));
3037     memcpy(skb_mac_header(nskb), skb_mac_header(p),
3038            p->data - skb_mac_header(p));
3039
3040     skb_shinfo(nskb)->frag_list = p;//the old head of the flow becomes the first skb on nskb's frag_list
3041     skb_shinfo(nskb)->gso_size = pinfo->gso_size;
3042     pinfo->gso_size = 0;
3043     skb_header_release(p);
3044     NAPI_GRO_CB(nskb)->last = p;
3045
3046     nskb->data_len += p->len;
3047     nskb->truesize += p->truesize;
3048     nskb->len += p->len;
3049
3050     *head = nskb;
3051     nskb->next = p->next;
3052     p->next = NULL;
3053
3054     p = nskb;
3055
3056 merge:
3057     delta_truesize = skb->truesize;
3058     if (offset > headlen) {
3059         unsigned int eat = offset - headlen;
3060
3061         skbinfo->frags[0].page_offset += eat;
3062         skb_frag_size_sub(&skbinfo->frags[0], eat);
3063         skb->data_len -= eat;
3064         skb->len -= eat;
3065         offset = headlen;
3066     }
3067
3068     __skb_pull(skb, offset);
3069
3070     NAPI_GRO_CB(p)->last->next = skb;//append the skb to the tail of the merged chain (tracked by ->last)
3071     NAPI_GRO_CB(p)->last = skb;
3072     skb_header_release(skb);
3073
3074 done:
3075     NAPI_GRO_CB(p)->count++;
3076     p->data_len += len;
3077     p->truesize += delta_truesize;
3078     p->len += len;
3079
3080     NAPI_GRO_CB(skb)->same_flow = 1;  //mark that this skb has been merged into an skb of the same flow
3081     return 0;
3082 }

  As we can see, when the NIC supports scatter-gather I/O, GRO merges multiple skbs into the frags page array of a single skb; otherwise the packets are chained on that skb's frag_list.

  Even when the flow above leaves an skb parked on the GRO list instead of delivering it immediately, it does not stay there for long: the receive softirq calls napi_gro_flush() to push the queued packets into the protocol stack:

3696 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3697 {
3698     struct sk_buff *skb, *prev = NULL;
3699
3700     /* scan list and build reverse chain */
3701     for (skb = napi->gro_list; skb != NULL; skb = skb->next) {//build a reverse chain so the list can then be walked from oldest to newest
3702         skb->prev = prev;
3703         prev = skb;      
3704     }
3705
3706     for (skb = prev; skb; skb = prev) {
3707         skb->next = NULL;
3708
3709         if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)//only old packets should be flushed and this one was queued in the current jiffy, so stop here
3710             return;      
3711
3712         prev = skb->prev;
3713         napi_gro_complete(skb);  //finalize and deliver the packet to the protocol stack
3714         napi->gro_count--;
3715     }
3716
3717     napi->gro_list = NULL;
3718 }
   A packet queued on the GRO list just one jiffy before "now" is already treated as old and delivered. So as long as the softirq calls napi_gro_flush() once per jiffy, enabling GRO adds at most about one jiffy (1 ms or 10 ms, depending on HZ) of extra latency.
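   In this kernel generation napi_gro_flush() is reached from roughly two places: net_rx_action(), which flushes a device's GRO list with flush_old = (HZ >= 1000) when the device has used up its whole budget and is rotated to the back of the poll list, and napi_complete(), which the driver calls when polling ends and which flushes everything. A simplified sketch of the latter (guard checks omitted, not verbatim source):

    void napi_complete(struct napi_struct *n)
    {
        unsigned long flags;

        napi_gro_flush(n, false);   /* false: deliver every skb still held on gro_list */

        local_irq_save(flags);
        __napi_complete(n);         /* remove the instance from the poll list, clear NAPI_STATE_SCHED */
        local_irq_restore(flags);
    }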
   napi_gro_complete() first fixes up the per-layer headers of the aggregated packet and then hands it to the network protocol stack:

3658 static int napi_gro_complete(struct sk_buff *skb)
3659 {                   
3660     struct packet_offload *ptype;
3661     __be16 type = skb->protocol;
3662     struct list_head *head = &offload_base;
3663     int err = -ENOENT;
3664
3665     BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3666     
3667     if (NAPI_GRO_CB(skb)->count == 1) {
3668         skb_shinfo(skb)->gso_size = 0;
3669         goto out;
3670     }
3671
3672     rcu_read_lock();
3673     list_for_each_entry_rcu(ptype, head, list) {
3674         if (ptype->type != type || !ptype->callbacks.gro_complete)
3675             continue;
3676     
3677         err = ptype->callbacks.gro_complete(skb);//points to inet_gro_complete() or ipv6_gro_complete()
3678         break;
3679     }
3680     rcu_read_unlock();
3681
3682     if (err) {
3683         WARN_ON(&ptype->list == head);
3684         kfree_skb(skb);
3685         return NET_RX_SUCCESS;
3686     }
3687
3688 out:
3689     return netif_receive_skb(skb);//enter the network protocol stack
3690 }
  inet_gro_complete() fixes up the network-layer header:
1429 static int inet_gro_complete(struct sk_buff *skb)
1430 {
1431     __be16 newlen = htons(skb->len - skb_network_offset(skb));
1432     struct iphdr *iph = ip_hdr(skb);
1433     const struct net_offload *ops;
1434     int proto = iph->protocol;
1435     int err = -ENOSYS;   
1436
1437     csum_replace2(&iph->check, iph->tot_len, newlen);//update the IP checksum, since the aggregated packets now share a single IP header
1438     iph->tot_len = newlen;//update the total length to cover the aggregated payload
1439
1440     rcu_read_lock();     
1441     ops = rcu_dereference(inet_offloads[proto]);
1442     if (WARN_ON(!ops || !ops->callbacks.gro_complete))
1443         goto out_unlock;
1444
1445     err = ops->callbacks.gro_complete(skb);//for TCP this points to tcp4_gro_complete()
1446
1447 out_unlock:
1448     rcu_read_unlock();
1449
1450     return err;
1451 }
  tcp4_gro_complete() fixes up the TCP header:
2840 int tcp4_gro_complete(struct sk_buff *skb)
2841 {
2842     const struct iphdr *iph = ip_hdr(skb);
2843     struct tcphdr *th = tcp_hdr(skb);
2844     
2845     th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2846                   iph->saddr, iph->daddr, 0);//re-seed the TCP checksum with the pseudo-header of the aggregated packet (completed later via CHECKSUM_PARTIAL)
2847     skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2848
2849     return tcp_gro_complete(skb);
2850 }       
   tcp_gro_complete():
3107 int tcp_gro_complete(struct sk_buff *skb)
3108 {       
3109     struct tcphdr *th = tcp_hdr(skb);
3110
3111     skb->csum_start = skb_transport_header(skb) - skb->head;
3112     skb->csum_offset = offsetof(struct tcphdr, check);
3113     skb->ip_summed = CHECKSUM_PARTIAL;
3114         
3115     skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
3116
3117     if (th->cwr)
3118         skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
3119
3120     return 0;
3121 }   

  In short, GRO keeps only one copy of the MAC, IP and TCP headers for all packets that can be merged, while their payload is stored in the frags array or on the frag_list, which greatly increases the amount of payload carried per packet the stack has to process.
     After GRO processing the skb is handed to the entry point of the Linux network stack. Once an aggregated skb is inside the stack, the IP and TCP receive functions call pskb_may_pull() to pull the parts of the GRO skb they need into the linear area:

1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
1963     const struct iphdr *iph;
1964     const struct tcphdr *th;
1965     struct sock *sk;
1966     int ret;
1967     struct net *net = dev_net(skb->dev);
1968 
1969     if (skb->pkt_type != PACKET_HOST)
1970         goto discard_it;
1971 
1972     /* Count it even if it's bad */
1973     TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1974 
1975     if (!pskb_may_pull(skb, sizeof(struct tcphdr)))   //make sure the basic TCP header is in the linear area
1976         goto discard_it;
1977 
1978     th = tcp_hdr(skb);
1979 
1980     if (th->doff < sizeof(struct tcphdr) / 4)
1981         goto bad_packet;
1982     if (!pskb_may_pull(skb, th->doff * 4))  //pull the entire TCP header, including options, into the linear area
1983         goto discard_it;
...

  pskb_may_pull():

1459 static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
1460 {
1461     if (likely(len <= skb_headlen(skb))) //the linear area already holds at least len bytes
1462         return 1;
1463     if (unlikely(len > skb->len)) //the skb does not contain len bytes in total: malformed packet
1464         return 0;
1465     return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL; //more data has to be moved into the linear area
1466 }
  __pskb_pull_tail():

1453 unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1454 {
1455     /* If skb has not enough free space at tail, get new one
1456      * plus 128 bytes for future expansions. If we have enough
1457      * room at tail, reallocate without expansion only if skb is cloned.
1458      */
1459     int i, k, eat = (skb->tail + delta) - skb->end;
1460     
1461     if (eat > 0 || skb_cloned(skb)) {//not enough tail room, or the data area is shared (cloned)
1462         if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
1463                      GFP_ATOMIC))//grow the linear area
1464             return NULL;
1465     }
1466
1467     if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))//copy delta bytes of non-linear data into the linear area
1468         BUG();
1469         
1470     /* Optimization: no fragments, no reasons to preestimate
1471      * size of pulled pages. Superb.
1472      */
1473     if (!skb_has_frag_list(skb))//no skbs on frag_list
1474         goto pull_pages;
1475
1476     /* Estimate size of pulled pages. */
1477     eat = delta;
1478     for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1479         int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1480
1481         if (size >= eat)//the frags array alone covers the pulled length
1482             goto pull_pages;
1483         eat -= size;
1484     }
1485                 
1486     /* If we need update frag list, we are in troubles.
1487      * Certainly, it possible to add an offset to skb data,
1488      * but taking into account that pulling is expected to
1489      * be very rare operation, it is worth to fight against
1490      * further bloating skb head and crucify ourselves here instead.
1491      * Pure masohism, indeed. 8)8)
1492      */
1493     if (eat) {//the frag_list also has to be trimmed
1494         struct sk_buff *list = skb_shinfo(skb)->frag_list;
1495         struct sk_buff *clone = NULL;
1496         struct sk_buff *insp = NULL;
1497
1498         do {
1499             BUG_ON(!list);
1500
1501             if (list->len <= eat) {
1502                 /* Eaten as whole. */
1503                 eat -= list->len;
1504                 list = list->next;
1505                 insp = list;
1506             } else {
1507                 /* Eaten partially. */
1508
1509                 if (skb_shared(list)) {
1510                     /* Sucks! We need to fork list. :-( */
1511                     clone = skb_clone(list, GFP_ATOMIC);
1512                     if (!clone)
1513                         return NULL;
1514                     insp = list->next;
1515                     list = clone;
1516                 } else {
1517                     /* This may be pulled without
1518                      * problems. */
1519                     insp = list;
1520                 }
1521                 if (!pskb_pull(list, eat)) {
1522                     kfree_skb(clone);
1523                     return NULL;
1524                 }
1525                 break;
1526             }
1527         } while (eat);
1528
1529         /* Free pulled out fragments. */
1530         while ((list = skb_shinfo(skb)->frag_list) != insp) {//free the skbs whose data has been completely pulled
1531             skb_shinfo(skb)->frag_list = list->next;
1532             kfree_skb(list);
1533         }
1534         /* And insert new clone at head. */
1535         if (clone) {
1536             clone->next = list;
1537             skb_shinfo(skb)->frag_list = clone;
1538         }
1539     }
1540     /* Success! Now we may commit changes to skb data. */
1541
1542 pull_pages:
1543     eat = delta;
1544     k = 0;
1545     for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1546         int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1547
1548         if (size <= eat) {//this page has been fully pulled; release it
1549             skb_frag_unref(skb, i);
1550             eat -= size;
1551         } else {
1552             skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
1553             if (eat) {
1554                 skb_shinfo(skb)->frags[k].page_offset += eat;
1555                 skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
1556                 eat = 0;
1557             }
1558             k++;
1559         }
1560     }
1561     skb_shinfo(skb)->nr_frags = k;
1562
1563     skb->tail     += delta;
1564     skb->data_len -= delta;
1565
1566     return skb_tail_pointer(skb);
1567 }
   skb_copy_bits() copies data from the non-linear areas of an skb into a given buffer (here, the linear area):
1585 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
1586 {   
1587     int start = skb_headlen(skb);
1588     struct sk_buff *frag_iter;
1589     int i, copy;
1590
1591     if (offset > (int)skb->len - len)
1592         goto fault;
1593
1594     /* Copy header. */
1595     if ((copy = start - offset) > 0) {
1596         if (copy > len)
1597             copy = len;
1598         skb_copy_from_linear_data_offset(skb, offset, to, copy);
1599         if ((len -= copy) == 0)
1600             return 0;
1601         offset += copy;
1602         to     += copy;
1603     }       
1604                 
1605     for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {//copy data from the pages in the frags array
1606         int end;
1607         skb_frag_t *f = &skb_shinfo(skb)->frags[i];
1608
1609         WARN_ON(start > offset + len);
1610
1611         end = start + skb_frag_size(f);
1612         if ((copy = end - offset) > 0) {
1613             u8 *vaddr;
1614
1615             if (copy > len)
1616                 copy = len;
1617
1618             vaddr = kmap_atomic(skb_frag_page(f));
1619             memcpy(to,
1620                    vaddr + f->page_offset + offset - start,
1621                    copy);//copy this page's data into the destination buffer
1622             kunmap_atomic(vaddr);
1623
1624             if ((len -= copy) == 0)//stop once the requested length has been copied
1625                 return 0;
1626             offset += copy;
1627             to     += copy;
1628         }
1629         start = end;
1630     }
1631
1632     skb_walk_frags(skb, frag_iter) {//the frags pages did not provide enough bytes, so continue with the skbs chained on frag_list
1633         int end;
1634
1635         WARN_ON(start > offset + len);
1636
1637         end = start + frag_iter->len;
1638         if ((copy = end - offset) > 0) {
1639             if (copy > len)
1640                 copy = len;
1641             if (skb_copy_bits(frag_iter, offset - start, to, copy))
1642                 goto fault;
1643             if ((len -= copy) == 0)//stop once the requested length has been copied
1644                 return 0;
1645             offset += copy;
1646             to     += copy;
1647         }
1648         start = end;
1649     }
1650
1651     if (!len)
1652         return 0;
1653
1654 fault:
1655     return -EFAULT;
1656 }

  The pull performed by pskb_may_pull() guarantees that the complete TCP header ends up in the linear area, so GRO does not get in the way of TCP processing. The payload that is still scattered over non-contiguous memory is copied into the application's buffer when the process reads it with a system call; that happens in tcp_recvmsg():

1545 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1546         size_t len, int nonblock, int flags, int *addr_len)
1547 {   
1548     struct tcp_sock *tp = tcp_sk(sk);
...
1856                 err = skb_copy_datagram_iovec(skb, offset,
1857                         msg->msg_iov, used);//copy the data to the user buffer
...
   skb_copy_datagram_iovec() copies data from both the linear and the non-linear areas of an skb into the user buffer; everything received through GRO is handed to the application here:
314 int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
315                 struct iovec *to, int len)
316 {
317     int start = skb_headlen(skb);
318     int i, copy = start - offset;
319     struct sk_buff *frag_iter;
320
321     trace_skb_copy_datagram_iovec(skb, len);
322
323     /* Copy header. */
324     if (copy > 0) {
325         if (copy > len)
326             copy = len;
327         if (memcpy_toiovec(to, skb->data + offset, copy))//copy the data in the linear area
328             goto fault;
329         if ((len -= copy) == 0)
330             return 0;
331         offset += copy;
332     }
333
334     /* Copy paged appendix. Hmm... why does this look so complicated? */
335     for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
336         int end;
337         const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
338
339         WARN_ON(start > offset + len);
340
341         end = start + skb_frag_size(frag);
342         if ((copy = end - offset) > 0) {
343             int err;
344             u8  *vaddr;
345             struct page *page = skb_frag_page(frag);
346
347             if (copy > len)
348                 copy = len;
349             vaddr = kmap(page);            
350             err = memcpy_toiovec(to, vaddr + frag->page_offset +
351                          offset - start, copy);//copy the data stored in the pages of the frags array
352             kunmap(page);
353             if (err)      
354                 goto fault;
355             if (!(len -= copy))                
356                 return 0;
357             offset += copy;
358         }
359         start = end;
360     }
361
362     skb_walk_frags(skb, frag_iter) {
363         int end;
364
365         WARN_ON(start > offset + len);
366
367         end = start + frag_iter->len;
368         if ((copy = end - offset) > 0) {
369             if (copy > len)
370                 copy = len;
371             if (skb_copy_datagram_iovec(frag_iter,
372                             offset - start,
373                             to, copy))//copy the data held by the skbs chained on frag_list
374                 goto fault;
375             if ((len -= copy) == 0)
376                 return 0;
377             offset += copy;
378         }
379         start = end;
380     }
381     if (!len)
382         return 0;
383
384 fault:
385     return -EFAULT;
386 }
  With that, the whole GRO path is complete.
