报文提交给内核协议栈处理后,最终会调用到__netif_receive_skb_core函数,如果报文没有被rx_handler消费掉,最终会交给ptype_base中注册的协议处理,包括内核注册的协议,也包括raw socket等创建的协议处理。本文将分析普通ipv4报文的处理过程,处理入口函数为ip_rcv函数。
1、ip_rcv函数
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct iphdr *iph; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) //丢弃掉不是发往本机的报文,网卡开启混杂模式会收到此类报文 goto drop; IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); //检查是否skb为share,是则克隆报文 if (!skb) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto out; } if (!pskb_may_pull(skb, sizeof(struct iphdr))) //确保skb还可以容纳标准的报头(即20字节) goto inhdr_error; iph = ip_hdr(skb); //得到IP头 /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) //ip头长度至少为20字节(ihl>=5),只支持v4 goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); IP_ADD_STATS_BH(dev_net(dev), IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) //确保skb还可以容纳实际的报头(ihl*4) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) //ip头csum校验 goto csum_error; len = ntohs(iph->tot_len); if (skb->len < len) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { //去除多余的字节 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } skb->transport_header = skb->network_header + iph->ihl*4; //设置传输层header /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); //清空cb,即inet_skb_parm值 /* Must drop socket now because of tproxy. */ skb_orphan(skb); return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, //调用netfilter,实现iptables功能,通过后调用ip_rcv_finish dev, NULL, ip_rcv_finish); csum_error: IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); inhdr_error: IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: return NET_RX_DROP; }
2、ip_rcv_finish函数
static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) { const struct net_protocol *ipprot; int protocol = iph->protocol; //得到传输层协议 ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->early_demux) { ipprot->early_demux(skb); //对于socket报文,可以通过socket快速获取路由表 /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ if (!skb_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, //路由查询,决定后续处理:向上传递、转发、丢弃 iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), LINUX_MIB_IPRPFILTER); goto drop; } } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; rt = skb_rtable(skb); //得到路由表项,统计组播和广播报文 if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); //后续处理,本机处理为ip_local_deliver,转发为ip_forward drop: kfree_skb(skb); return NET_RX_DROP; }
3、ip_local_deliver函数
int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) //如果是ip分片报文,则需要报文组装完整后才能提交给上层 return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); //调用netfilter,实现iptables功能,通过后调用ip_local_deliver_finish }4、ip_local_deliver_finish函数
static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); __skb_pull(skb, skb_network_header_len(skb)); //报文移动到传输层头 rcu_read_lock(); { int protocol = ip_hdr(skb)->protocol; //得到IP头中的协议类型,即4层协议 const struct net_protocol *ipprot; int raw; resubmit: raw = raw_local_deliver(skb, protocol); //AF_INET的raw sock处理入口 ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { int ret; if (!ipprot->no_policy) { //4.1.12内核中,所有协议的no_policy都为1,条件不成立 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { //ipsec策略检测 kfree_skb(skb); goto out; } nf_reset(skb); } ret = ipprot->handler(skb); //交给上层处理报文,UDP/TCP/ICMP等等 if (ret < 0) { protocol = -ret; goto resubmit; } IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); } else { //协议未定义 if (!raw) { //如果不是raw,则检测ipse策略,如果检测通过则发送ICMP消息 if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb(skb); } else { IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } out: rcu_read_unlock(); return 0; }