The following analyzes how GRO processes incoming packets. When receiving packets, the NIC driver calls the napi_gro_receive function to hand the data to GRO:
static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	if (skb->mac_header == skb->tail &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {	// mac_header equals skb->tail and the page is not in high memory: the packet headers live in the first frag page
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
	}
}

gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}

GRO delivers data into the protocol stack at two points. One is in napi_skb_finish, which looks at the return value of dev_gro_receive to decide whether the packet should be passed into the stack; the other is when the NAPI poll loop finishes and napi_complete runs, as sketched below.
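For reference, here is a condensed sketch of that second delivery point (reconstructed from memory of kernels of this vintage, not quoted from the listings above): when a NAPI poll round ends, napi_complete flushes whatever is still held on the GRO list before completing the poll, so held packets never outlive the poll cycle.

/* Condensed sketch, not a verbatim kernel listing. */
void napi_complete(struct napi_struct *n)
{
	unsigned long flags;

	napi_gro_flush(n, false);	// push everything still held on gro_list into the stack
	local_irq_save(flags);
	__napi_complete(n);
	local_irq_restore(flags);
}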
Let us look at the napi_skb_finish function first:

static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:	// deliver the packet into the protocol stack
		if (netif_receive_skb(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:	// the skb can be freed: GRO has already merged its data into an aggregate and keeps it there
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			kmem_cache_free(skbuff_head_cache, skb);
		else
			__kfree_skb(skb);
		break;

	case GRO_HELD:		// the skb is held on the GRO list without being merged, so it must be kept, not freed
	case GRO_MERGED:
		break;
	}

	return ret;
}

The dev_gro_receive function merges skbs and decides whether the merged, larger skb should be delivered into the network stack:
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;

	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))	// GRO is not enabled for this device
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb))	// the skb is GSO or already carries a frag_list: it is already aggregated and must not be aggregated again
		goto normal;

	gro_list_prepare(napi, skb);	// compare the skbs on the GRO list with this one at the link layer to see if they belong to the same flow

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {	// walk the list of registered offload handlers
		if (ptype->type != type || !ptype->callbacks.gro_receive)	// find the network-layer GRO receive handler
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);	// inet_gro_receive or ipv6_gro_receive merges the skb
		break;
	}
	rcu_read_unlock();

	if (&ptype->list == head)	// no handler found
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);	// finalize the aggregated skb and deliver it to the stack
		napi->gro_count--;
	}

	if (same_flow)	// an skb of the same flow was found and the current skb has already been merged into it
		goto ok;

	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)	// the skb cannot be aggregated, or the GRO list is full
		goto normal;

	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;	// the skb starts a new flow: nothing on gro_list can merge with it, so add it to the GRO list
	napi->gro_list = skb;
	ret = GRO_HELD;	// hold the current skb; it must not be freed

pull:
	if (skb_headlen(skb) < skb_gro_offset(skb)) {	// the headers are not entirely in the linear area, so copy the missing part into it
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);	// move data from the first page into the linear area

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {	// the first page has been entirely moved to the linear area
			skb_frag_unref(skb, 0);	// release the page
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));	// shift the frags array forward
		}
	}

ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}

The inet_gro_receive function is the network-layer (IPv4) skb aggregation handler:
static struct sk_buff **inet_gro_receive(struct sk_buff **head,
					 struct sk_buff *skb)
{
	const struct net_offload *ops;
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	const struct iphdr *iph;
	unsigned int hlen;
	unsigned int off;
	unsigned int id;
	int flush = 1;
	int proto;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*iph);	// length up to the end of the basic IP header
	iph = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		iph = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!iph))
			goto out;
	}

	proto = iph->protocol;

	rcu_read_lock();
	ops = rcu_dereference(inet_offloads[proto]);	// find the handler registered by the transport layer
	if (!ops || !ops->callbacks.gro_receive)
		goto out_unlock;

	if (*(u8 *)iph != 0x45)	// not IPv4, or the header length is not 20 bytes
		goto out_unlock;

	if (unlikely(ip_fast_csum((u8 *)iph, 5)))	// bad header checksum
		goto out_unlock;

	id = ntohl(*(__be32 *)&iph->id);
	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));	// flush if tot_len does not match the GRO length, or if the packet is fragmented / DF is not set as expected
	id >>= 16;

	for (p = *head; p; p = p->next) {
		struct iphdr *iph2;

		if (!NAPI_GRO_CB(p)->same_flow)	// not the same flow as the current packet
			continue;

		iph2 = ip_hdr(p);

		if ((iph->protocol ^ iph2->protocol) |
		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {	// protocol, saddr or daddr differs: not the same flow
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		/* All fields must match except length and checksum. */
		NAPI_GRO_CB(p)->flush |=
			(iph->ttl ^ iph2->ttl) |
			(iph->tos ^ iph2->tos) |
			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);	// ttl and tos must match and the IP ID must be consecutive, otherwise the aggregate must be flushed

		NAPI_GRO_CB(p)->flush |= flush;
	}

	NAPI_GRO_CB(skb)->flush |= flush;
	skb_gro_pull(skb, sizeof(*iph));
	skb_set_transport_header(skb, skb_gro_offset(skb));

	pp = ops->callbacks.gro_receive(head, skb);	// tcp4_gro_receive or tcp6_gro_receive

out_unlock:
	rcu_read_unlock();

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}

The tcp4_gro_receive function is the transport-layer skb aggregation handler:
struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	const struct iphdr *iph = skb_gro_network_header(skb);
	__wsum wsum;
	__sum16 sum;

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}
flush:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;

	case CHECKSUM_NONE:
		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
					  skb_gro_len(skb), IPPROTO_TCP, 0);
		sum = csum_fold(skb_checksum(skb,
					     skb_gro_offset(skb),
					     skb_gro_len(skb),
					     wsum));
		if (sum)
			goto flush;

		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	}

	return tcp_gro_receive(head, skb);
}

The tcp_gro_receive function then performs the TCP-layer flow matching and merging:
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;
	int i;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	skb_gro_pull(skb, thlen);

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)	// already ruled out at the IP layer
			continue;

		th2 = tcp_hdr(p);

		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {	// the source/destination port pair differs: not the same flow
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;
	}

	goto out_check_final;

found:
	flush = NAPI_GRO_CB(p)->flush;
	flush |= (__force int)(flags & TCP_FLAG_CWR);	// congestion was signalled: the buffered packets must be pushed into the stack at once so TCP congestion control can react
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
			       ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));	// any control-flag difference other than CWR, FIN and PSH forces immediate delivery
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);	// a different ACK number forces immediate delivery so the peer's send buffer can be freed promptly
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);	// the TCP options must match as well

	mss = skb_shinfo(p)->gso_size;

	flush |= (len - 1) >= mss;
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	if (flush || skb_gro_receive(head, skb)) {	// a non-zero flush means the skb must go straight into the stack; such a packet is not merged via skb_gro_receive
		mss = 1;
		goto out_check_final;
	}

	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));	// URG, PSH, RST, SYN or FIN: deliver the aggregate to the stack immediately

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}

A point worth noting here: at first glance the flow match seems to compare only the source port, but in fact it compares both ports. In struct tcphdr the 16-bit source and dest fields are adjacent, so the 32-bit read starting at &th->source covers the source and destination ports together, and the single XOR detects a mismatch in either one.
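A minimal user-space sketch of that point (my own illustration, not kernel code; struct tcp_ports is a stand-in for the first four bytes of struct tcphdr):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for the first four bytes of struct tcphdr: source then dest port. */
struct tcp_ports {
	uint16_t source;
	uint16_t dest;
};

int main(void)
{
	struct tcp_ports a = { .source = 0x3039, .dest = 0x0050 };	/* 12345 -> 80 */
	struct tcp_ports b = { .source = 0x3039, .dest = 0x01bb };	/* 12345 -> 443 */
	uint32_t wa, wb;

	memcpy(&wa, &a, sizeof(wa));	/* the same bytes a u32 read at &th->source would see */
	memcpy(&wb, &b, sizeof(wb));

	/* A non-zero XOR means the packets differ in the source or the destination port. */
	printf("same port pair: %s\n", (wa ^ wb) ? "no" : "yes");
	return 0;
}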
When the packet can be merged, tcp_gro_receive calls skb_gro_receive, which folds the new skb into the aggregate at the head of its flow:

int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff *p = *head;
	struct sk_buff *nskb;
	struct skb_shared_info *skbinfo = skb_shinfo(skb);
	struct skb_shared_info *pinfo = skb_shinfo(p);
	unsigned int headroom;
	unsigned int len = skb_gro_len(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	unsigned int delta_truesize;

	if (p->len + len >= 65536)
		return -E2BIG;

	if (pinfo->frag_list)	// the aggregate already chains skbs on frag_list, i.e. the non-scatter-gather path is in use
		goto merge;
	else if (headlen <= offset) {	// the tail of the headers and all of the payload live in frag pages
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;

		offset -= headlen;

		if (nr_frags > MAX_SKB_FRAGS)
			return -E2BIG;

		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;

		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		do {	// append the new skb's frags after the aggregate's existing frags
			*--frag = *--frag2;
		} while (--i);

		frag->page_offset += offset;	// strip the remaining header bytes, keep only the payload
		skb_frag_size_sub(frag, offset);

		/* all fragments truesize : remove (head size + sk_buff) */
		delta_truesize = skb->truesize -
				 SKB_TRUESIZE(skb_end_offset(skb));

		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
		goto done;
	} else if (skb->head_frag) {	// the skb head itself is backed by a page, so it can be turned into a frag (scatter-gather I/O)
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		struct page *page = virt_to_head_page(skb->head);
		unsigned int first_size = headlen - offset;
		unsigned int first_offset;

		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
			return -E2BIG;

		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;

		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

		frag->page.p	  = page;
		frag->page_offset = first_offset;
		skb_frag_size_set(frag, first_size);

		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
		/* We dont need to clear skbinfo->nr_frags here */

		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
		goto done;
	} else if (skb_gro_len(p) != pinfo->gso_size)
		return -E2BIG;

	/* No scatter-gather I/O: the NIC did not place the data in frag pages,
	 * so fall back to chaining whole skbs on frag_list. */
	headroom = skb_headroom(p);
	nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);	// allocate a new head skb for the aggregate
	if (unlikely(!nskb))
		return -ENOMEM;

	__copy_skb_header(nskb, p);
	nskb->mac_len = p->mac_len;

	skb_reserve(nskb, headroom);
	__skb_put(nskb, skb_gro_offset(p));

	skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
	skb_set_network_header(nskb, skb_network_offset(p));
	skb_set_transport_header(nskb, skb_transport_offset(p));

	__skb_pull(p, skb_gro_offset(p));
	memcpy(skb_mac_header(nskb), skb_mac_header(p),
	       p->data - skb_mac_header(p));

	skb_shinfo(nskb)->frag_list = p;	// the old aggregate head becomes the first entry of frag_list
	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
	pinfo->gso_size = 0;
	skb_header_release(p);
	NAPI_GRO_CB(nskb)->last = p;

	nskb->data_len += p->len;
	nskb->truesize += p->truesize;
	nskb->len += p->len;

	*head = nskb;
	nskb->next = p->next;
	p->next = NULL;

	p = nskb;

merge:
	delta_truesize = skb->truesize;
	if (offset > headlen) {
		unsigned int eat = offset - headlen;

		skbinfo->frags[0].page_offset += eat;
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}

	__skb_pull(skb, offset);

	NAPI_GRO_CB(p)->last->next = skb;	// append the packet to the aggregate's frag_list
	NAPI_GRO_CB(p)->last = skb;
	skb_header_release(skb);

done:
	NAPI_GRO_CB(p)->count++;
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;

	NAPI_GRO_CB(skb)->same_flow = 1;	// mark that this skb found a matching flow and has been merged
	return 0;
}
As can be seen, when the NIC uses scatter-gather I/O, GRO merges multiple skbs into one skb's frags page array; otherwise it chains them onto the skb's frag_list. The sketch below makes the two layouts concrete.
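A short kernel-style helper written for this note (gro_skb_total_len is illustrative, not a kernel function): for an aggregated skb, the shared headers sit in the linear area, while the merged payloads sit either in the frags[] page array or in skbs chained on frag_list, and the three parts together add up to skb->len.

#include <linux/skbuff.h>

/* Sum up where an aggregated skb keeps its bytes; the result equals skb->len. */
static unsigned int gro_skb_total_len(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct sk_buff *iter;
	unsigned int len = skb_headlen(skb);		/* linear area: the shared headers */
	int i;

	for (i = 0; i < shinfo->nr_frags; i++)		/* scatter-gather case: payload pages */
		len += skb_frag_size(&shinfo->frags[i]);

	skb_walk_frags(skb, iter)			/* non scatter-gather case: chained skbs */
		len += iter->len;

	return len;
}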
Even if an skb is held on the GRO list in the flow above instead of being delivered immediately, it will not linger there for long, because the receive softirq calls napi_gro_flush to push the packets on the GRO list into the protocol stack:
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	struct sk_buff *skb, *prev = NULL;

	/* scan list and build reverse chain */
	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {	// so the second loop can walk from oldest to newest
		skb->prev = prev;
		prev = skb;
	}

	for (skb = prev; skb; skb = prev) {
		skb->next = NULL;

		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)	// only old packets should be flushed, and this one was queued in the current jiffy
			return;

		prev = skb->prev;
		napi_gro_complete(skb);	// deliver the packet into the protocol stack
		napi->gro_count--;
	}

	napi->gro_list = NULL;
}

A packet that joined the GRO list even one jiffy before the current one is already treated as old and is delivered to the stack. So as long as the receive softirq calls napi_gro_flush once per jiffy, enabling GRO adds at most one jiffy (1 ms to 10 ms, depending on CONFIG_HZ) of latency.
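A back-of-the-envelope sketch of that bound (user-space, purely illustrative; it assumes the softirq flushes old packets once per jiffy, as described above): one jiffy is 1000 / HZ milliseconds, so the worst-case added latency depends only on CONFIG_HZ.

#include <stdio.h>

int main(void)
{
	const int hz_values[] = { 100, 250, 300, 1000 };	/* common CONFIG_HZ settings */
	unsigned int i;

	for (i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++)
		printf("HZ=%-4d -> one jiffy = %g ms of worst-case extra GRO latency\n",
		       hz_values[i], 1000.0 / hz_values[i]);
	return 0;
}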
The napi_gro_complete function finalizes an aggregated skb and hands it to netif_receive_skb:

static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int err = -ENOENT;

	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

	if (NAPI_GRO_CB(skb)->count == 1) {	// nothing was merged into this skb, so no header fixup is needed
		skb_shinfo(skb)->gso_size = 0;
		goto out;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = ptype->callbacks.gro_complete(skb);	// inet_gro_complete or ipv6_gro_complete
		break;
	}
	rcu_read_unlock();

	if (err) {
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	return netif_receive_skb(skb);	// enter the network protocol stack
}

The inet_gro_complete function updates the network-layer header of the aggregated skb:
static int inet_gro_complete(struct sk_buff *skb)
{
	__be16 newlen = htons(skb->len - skb_network_offset(skb));
	struct iphdr *iph = ip_hdr(skb);
	const struct net_offload *ops;
	int proto = iph->protocol;
	int err = -ENOSYS;

	csum_replace2(&iph->check, iph->tot_len, newlen);	// update the IP checksum: the merged packets now share a single IP header
	iph->tot_len = newlen;	// update the total length

	rcu_read_lock();
	ops = rcu_dereference(inet_offloads[proto]);
	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
		goto out_unlock;

	err = ops->callbacks.gro_complete(skb);	// tcp4_gro_complete or tcp6_gro_complete

out_unlock:
	rcu_read_unlock();

	return err;
}

The tcp4_gro_complete function updates the TCP header:
int tcp4_gro_complete(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);	// recompute the TCP pseudo-header checksum for the aggregated packet; the rest is completed later via CHECKSUM_PARTIAL
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}

The tcp_gro_complete function:
int tcp_gro_complete(struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

	if (th->cwr)
		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

	return 0;
}
In summary, GRO's basic principle is to keep only one set of MAC, IP and TCP headers for packets that can be merged at all three layers, and to store the data portions in the frags array or on frag_list. This greatly improves the ratio of payload carried to per-packet processing.
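To put a rough number on that gain (a back-of-the-envelope sketch; the 1448-byte payload per segment is an assumption for a 1500-byte MTU with TCP timestamps): since skb_gro_receive caps an aggregate at 64 KiB (the p->len + len >= 65536 check above), one merged skb can stand in for roughly 45 wire segments, so the IP and TCP receive paths run once instead of ~45 times.

#include <stdio.h>

int main(void)
{
	const unsigned int payload_per_seg = 1448;	/* assumed: 1500 MTU - 20 IP - 32 TCP w/ timestamps */
	const unsigned int aggregate_limit = 65536;	/* GRO aggregate size cap in skb_gro_receive */
	unsigned int segs = aggregate_limit / payload_per_seg;

	printf("segments merged into one aggregate: ~%u\n", segs);
	printf("protocol-stack traversals saved per 64 KiB: ~%u\n", segs - 1);
	return 0;
}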
After GRO processing completes, the skb is handed to the entry point of the Linux network stack for protocol processing. Once an aggregated skb enters the stack, the network-layer and TCP handlers call pskb_may_pull to pull the data they need from the GRO skb into its linear area:
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))	// pull the basic TCP header into the linear area
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))	// pull the complete TCP header, including options, into the linear area
		goto discard_it;
	...
The pskb_may_pull function:
static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
	if (likely(len <= skb_headlen(skb)))	// the linear area already holds at least len bytes
		return 1;
	if (unlikely(len > skb->len))	// the whole skb holds less than len bytes: malformed packet
		return 0;
	return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;	// grow the data in the linear area
}

The __pskb_pull_tail function:
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
	/* If skb has not enough free space at tail, get new one
	 * plus 128 bytes for future expansions. If we have enough
	 * room at tail, reallocate without expansion only if skb is cloned.
	 */
	int i, k, eat = (skb->tail + delta) - skb->end;

	if (eat > 0 || skb_cloned(skb)) {	// not enough tail room, or the data is shared
		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
				     GFP_ATOMIC))	// expand the linear area
			return NULL;
	}

	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))	// copy delta bytes of non-linear data to the tail of the linear area
		BUG();

	/* Optimization: no fragments, no reasons to preestimate
	 * size of pulled pages. Superb.
	 */
	if (!skb_has_frag_list(skb))	// no skbs on frag_list
		goto pull_pages;

	/* Estimate size of pulled pages. */
	eat = delta;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (size >= eat)	// the frags array alone covers the pulled length
			goto pull_pages;
		eat -= size;
	}

	/* If we need update frag list, we are in troubles.
	 * Certainly, it possible to add an offset to skb data,
	 * but taking into account that pulling is expected to
	 * be very rare operation, it is worth to fight against
	 * further bloating skb head and crucify ourselves here instead.
	 * Pure masohism, indeed. 8)8)
	 */
	if (eat) {	// part of the pull comes from frag_list: trim it accordingly
		struct sk_buff *list = skb_shinfo(skb)->frag_list;
		struct sk_buff *clone = NULL;
		struct sk_buff *insp = NULL;

		do {
			BUG_ON(!list);

			if (list->len <= eat) {
				/* Eaten as whole. */
				eat -= list->len;
				list = list->next;
				insp = list;
			} else {
				/* Eaten partially. */

				if (skb_shared(list)) {
					/* Sucks! We need to fork list. :-( */
					clone = skb_clone(list, GFP_ATOMIC);
					if (!clone)
						return NULL;
					insp = list->next;
					list = clone;
				} else {
					/* This may be pulled without
					 * problems. */
					insp = list;
				}
				if (!pskb_pull(list, eat)) {
					kfree_skb(clone);
					return NULL;
				}
				break;
			}
		} while (eat);

		/* Free pulled out fragments. */
		while ((list = skb_shinfo(skb)->frag_list) != insp) {	// free the skbs whose data has been fully pulled out
			skb_shinfo(skb)->frag_list = list->next;
			kfree_skb(list);
		}
		/* And insert new clone at head. */
		if (clone) {
			clone->next = list;
			skb_shinfo(skb)->frag_list = clone;
		}
	}
	/* Success! Now we may commit changes to skb data. */

pull_pages:
	eat = delta;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (size <= eat) {	// this page's data has been fully pulled into the linear area: release it
			skb_frag_unref(skb, i);
			eat -= size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb->tail += delta;
	skb->data_len -= delta;

	return skb_tail_pointer(skb);
}

The skb_copy_bits function copies data from the skb, including its non-linear parts, into a flat buffer; here it is used to fill the linear area:
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
	int start = skb_headlen(skb);
	struct sk_buff *frag_iter;
	int i, copy;

	if (offset > (int)skb->len - len)
		goto fault;

	/* Copy header. */
	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		skb_copy_from_linear_data_offset(skb, offset, to, copy);
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {	// copy from the pages in the frags array
		int end;
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(f);
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			vaddr = kmap_atomic(skb_frag_page(f));
			memcpy(to,
			       vaddr + f->page_offset + offset - start,
			       copy);	// copy the page contents to the destination
			kunmap_atomic(vaddr);

			if ((len -= copy) == 0)	// done once the requested length has been copied
				return 0;
			offset += copy;
			to += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {	// the frags pages were not enough; continue with the skbs on frag_list
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_bits(frag_iter, offset - start, to, copy))
				goto fault;
			if ((len -= copy) == 0)	// done once the requested length has been copied
				return 0;
			offset += copy;
			to += copy;
		}
		start = end;
	}

	if (!len)
		return 0;

fault:
	return -EFAULT;
}
The pulling done by pskb_may_pull guarantees that the whole TCP header ends up in the linear area, so GRO does not interfere with TCP protocol processing. When an application receives data through a system call, the data that is still scattered across non-contiguous memory is copied into the application's buffer. The application's receive path goes through tcp_recvmsg:
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	...
		err = skb_copy_datagram_iovec(skb, offset,
					      msg->msg_iov, used);	// copy the data into the user buffer
	...

The skb_copy_datagram_iovec function copies all of the data in an skb, from both the linear and the non-linear areas, into the user buffer; data received via GRO is fully delivered to the application here:
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	trace_skb_copy_datagram_iovec(skb, len);

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))	// copy the linear-area data
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);	// copy the data held in the frags pages
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_iovec(frag_iter,
						    offset - start,
						    to, copy))	// copy the data hanging off frag_list
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

At this point, the entire GRO path is complete.