<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);">网卡驱动接收到报文后,通过netif_receive_skb提交报文到协议栈处理。由于网络设备MTU一般都设置为1500,对于TCP报文,如果每收到一个报文就立即提交给协议栈处理,是非常低效的;一般是先聚合后再提交给协议栈,这样可以极大地降低内核的开销。内核提供了napi_gro_receive函数,通过该函数可以实现报文聚合后再提交给协议栈。</span>
1、napi_gro_receive函数
/*
 * napi_gro_receive - hand a received skb to the GRO layer.
 * @napi: NAPI context, passed through to dev_gro_receive()
 * @skb:  the received packet
 *
 * Resets the skb's GRO offset state, runs the packet through
 * dev_gro_receive() and lets napi_skb_finish() act on the verdict.
 *
 * Return: the verdict as returned by napi_skb_finish().
 */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	trace_napi_gro_receive_entry(skb);

	/* Initialise the per-skb GRO state (NAPI_GRO_CB) before it is used. */
	skb_gro_reset_offset(skb);

	/*
	 * The verdict from dev_gro_receive() decides what napi_skb_finish()
	 * does with the skb (deliver, free, or leave it alone).
	 */
	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}

/* 2. The napi_skb_finish function */
/*
 * napi_skb_finish - act on a GRO verdict for one skb.
 * @ret: verdict to act on
 * @skb: the packet the verdict applies to
 *
 * Return: @ret, except that GRO_NORMAL becomes GRO_DROP when
 * netif_receive_skb_internal() returns non-zero.
 */
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		/* Not aggregated: deliver to the stack right away. */
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:
		/*
		 * The packet was merged into another skb, so this skb must be
		 * released. With NAPI_GRO_FREE_STOLEN_HEAD only the sk_buff
		 * struct goes back to skbuff_head_cache (presumably because
		 * the data is now owned by the merged skb -- TODO confirm).
		 */
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			kmem_cache_free(skbuff_head_cache, skb);
		else
			__kfree_skb(skb);
		break;

	case GRO_HELD:
	case GRO_MERGED:
		/* The skb is still in use by the GRO layer: do not free it. */
		break;
	}

	return ret;
}

/* 3. The dev_gro_receive function */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_offload *ptype; __be16 type = skb->protocol; struct list_head *head = &offload_base; //packet_offload链表 int same_flow; enum gro_result ret; int grow; if (!(skb->dev->features & NETIF_F_GRO)) //如果设备不支持GRO,则直接提交报文给协议栈处理 goto normal; if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) //如果报文是GSO报文,包含frag_list,或csum_bad则提交给协议栈处理 goto normal; gro_list_prepare(napi, skb); //遍历gro_list中的报文和当前报文是否同流,相同的入口设备、vlan_tci、mac头相同 rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { //遍历packet_offload链表,找到和当前协议相同的packet_offload,IP报文为ip_packet_offload if (ptype->type != type || !ptype->callbacks.gro_receive) continue; skb_set_network_header(skb, skb_gro_offset(skb)); //设置network header,驱动调用napi_gro_receive前需要把报文移到network header skb_reset_mac_len(skb); //设置mac长度 NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ switch (skb->ip_summed) { //根据ip_summed字段初始化参数 case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; NAPI_GRO_CB(skb)->csum_cnt = 0; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; NAPI_GRO_CB(skb)->csum_valid = 0; break; default: NAPI_GRO_CB(skb)->csum_cnt = 0; NAPI_GRO_CB(skb)->csum_valid = 0; } pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); //调用网络层的gro_receive函数 break; } rcu_read_unlock(); if (&ptype->list == head) //没有匹配到packet_offload对象,则直接提交报文给协议栈 goto normal; same_flow = NAPI_GRO_CB(skb)->same_flow; //网络层gro_receive处理后,same_flow可能被刷新 ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; if (pp) { //如果pp不为空,说明该报文需要提交给协议栈 struct sk_buff *nskb = *pp; *pp = nskb->next; nskb->next = NULL; napi_gro_complete(nskb); //提交给协议栈 napi->gro_count--; } if (same_flow) //如果是相同的流,则返回GRO_MERGED_FREE 或 GRO_MERGED,报文不会被提交给协议栈 goto ok; if (NAPI_GRO_CB(skb)->flush) //未匹配到流,且flush被置1,则直接提交报文给协议栈 goto normal; if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { //gro_list中的报文超过了设定值 struct sk_buff *nskb = napi->gro_list; /* locate the end of the list to select the 'oldest' flow */ while (nskb->next) { pp = &nskb->next; nskb = *pp; } *pp = NULL; nskb->next = NULL; napi_gro_complete(nskb); //取出最早的报文,提交给协议栈处理 } else { napi->gro_count++; } NAPI_GRO_CB(skb)->count = 1; //未匹配到流,且flush未被置1,则把该报文插入到gro_list中,待以后匹配处理 NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; pull: grow = skb_gro_offset(skb) - skb_headlen(skb); if (grow > 0) //当前数据偏移如果超过线性区,则需要扩展线性区,线性区长度由驱动保证够用 gro_pull_from_frag0(skb, grow); //扩展报文线性区 ok: return ret; normal: ret = GRO_NORMAL; goto pull; }
4、napi_gro_complete函数
static int napi_gro_complete(struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); if (NAPI_GRO_CB(skb)->count == 1) { //count等于1,说明只有当前一个报文,直接提交给协议栈 skb_shinfo(skb)->gso_size = 0; goto out; } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; err = ptype->callbacks.gro_complete(skb, 0); //调用网络层的gro_complete函数 break; } rcu_read_unlock(); if (err) { WARN_ON(&ptype->list == head); kfree_skb(skb); return NET_RX_SUCCESS; } out: return netif_receive_skb_internal(skb); //提交给网络协议栈 }
总结下,一个报文有几种命运:
1)当前报文立即被提交给协议栈处理;
2)当前报文被合并到gro_list中已有的报文(GRO_MERGED/GRO_MERGED_FREE),或作为新的流保存到gro_list(GRO_HELD),暂不提交给协议栈;
3)gro_list中已合并的报文满足条件被提交给协议栈;
4)gro_list中最早的报文被提交给协议栈。