本文主要讲解了Linux内核IP层的整体架构,以及对从网卡接收到的报文的处理流程,使用的内核版本是2.6.32.27。
为了方便理解,本文采用整体流程图加伪代码的方式,对Linux内核中IP层的整体实现架构和对网卡报文的处理流程进行了讲解,希望可以对大家有所帮助。阅读本文需要读者对C语言有一定的了解。
IP层的整体实现架构
IP层接收底层数据报文的处理流程
/* Receive path of the IPv4 layer, condensed as pseudo-code: local declarations (iph, len, rth) and the drop and inhdr_error labels of ip_rcv are omitted from this excerpt. */
/* IPv4 protocol entry point; per the article it is invoked for ETH_P_IP packets after the NET_RX_SOFTIRQ softirq has fired. */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { /* Filter out packets addressed to other hosts (these are seen while the NIC is in promiscuous mode). */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; iph = ip_hdr(skb); /* The header length field must be at least 5 and the version must be 4 (IPv4). */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; /* Check that the claimed header length (ihl*4 bytes) is really present in the skb, i.e. that it is not a forged length. */ if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); /* Verify the header checksum. */ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error; /* tot_len is converted from network byte order: an skb shorter than tot_len is counted as truncated and dropped; a tot_len shorter than the header is a header error. */ len = ntohs(iph->tot_len); if (skb->len < len) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* When the real size does not match what the socket buffer records (skb->len), trim the packet to len; the packet is counted as discarded and dropped if trimming fails. */ if (pskb_trim_rcsum(skb, len)) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } /* Run the hooks registered on NF_INET_PRE_ROUTING (IP_PRE_ROUTING); when the hooks have finished, ip_rcv_finish is called. The hooks are examined in detail in the later article on the firewall. */ return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); } /* What NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish) expands to: okfn is called only when nf_hook_thresh returns 1. */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN) { int __ret; \ if ((__ret=nf_hook_thresh(pf, hook, (skb), indev, outdev, okfn, thresh, 1)) == 1)\ __ret = (okfn)(skb); \ __ret; } static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh, int cond) { /* Call the registered firewall hooks one by one. */ return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); } /* Follow-up processing once the packet has been received. */ static int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; /* When no route is attached yet (skb_dst(skb) == NULL), call ip_route_input to determine the route of the packet; if it reports an error (per the article: no route could be found in the FIB) the packet is dropped. ip_route_input is covered in the separate article on IP routing. */ if (skb_dst(skb) == NULL) { int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { goto drop; 
} } /* If the IP header carries options (ihl > 5), handle them with ip_rcv_options (per the article this builds ip_options); a non-zero result drops the packet. */ if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; /* Dispatch according to the dst_entry by calling skb_dst(skb)->input(skb): unicast or multicast destined for the local host enters ip_local_deliver(); unicast to be forwarded enters ip_forward(); multicast to be forwarded enters ip_mr_input(). */ return dst_input(skb); /* The braces below only illustrate what dst_input() does. */ { skb_dst(skb)->input(skb) } drop: kfree_skb(skb); return NET_RX_DROP; } /* Registration of the destination dispatch handlers (the input and output callbacks stored in the route entry). */ static int __mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { //....... rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; //...... } /* Output-side counterpart: installs ip_local_deliver, ip_mc_output and ip_mr_input depending on the route flags and type. */ static int __mkroute_output(struct rtable **result, struct fib_result *res, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, unsigned flags) { //...... if (flags & RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; rth->rt_spec_dst = fl->fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->u.dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (res->type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(oldflp->fl4_dst)) { rth->u.dst.input = ip_mr_input; rth->u.dst.output = ip_mc_output; } } #endif } //...... }
如果IP报文需要转发,那么分析流程如下
/* Forwarding path: ip_forward -> ip_forward_finish -> ip_output -> ip_finish_output -> ip_finish_output2 -> neighbour output. Pseudo-code: the declarations of rt, iph, dev, dst, hh_len and hh_alen are omitted from this excerpt. */
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------- /* Unicast forwarding; takes care of all forwarding-related work. */ int ip_forward(struct sk_buff *skb) { /* Drop packets whose pkt_type is not PACKET_HOST. */ if (skb->pkt_type != PACKET_HOST) goto drop; /* If the TTL is already 1 or less, drop the packet and send back ICMP_TIME_EXCEEDED (see too_many_hops). */ if (ip_hdr(skb)->ttl <= 1) goto too_many_hops; /* If skb->len exceeds the route MTU and the Don't-Fragment bit is set, drop the packet and send back ICMP_FRAG_NEEDED carrying the MTU. */ if (unlikely(skb->len > dst_mtu(&rt->u.dst) && (ip_hdr(skb)->frag_off & htons(IP_DF)))) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dst_mtu(&rt->u.dst))); goto drop; } /* Make sure there is enough room for the MAC header of the output device plus dst.header_len; per the article skb_cow creates a new, sufficiently large skb and copies all of the original data. The packet is dropped if skb_cow fails. */ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; iph = ip_hdr(skb); /* Decrement the TTL by 1. */ ip_decrease_ttl(iph); /* Run the hooks registered on NF_INET_FORWARD (IP_FORWARD); once all firewall hooks have run, continue in ip_forward_finish. */ return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, ip_forward_finish); /* NOTE(review): no goto to sr_failed is visible in this excerpt. */ sr_failed: /* * Strict routing permits no gatewaying */ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); goto drop; too_many_hops: /* Tell the sender its packet died... */ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); return NET_RX_DROP; } /* Per the article this function does little unless FASTROUTE is enabled; it hands the processed packet to the output stage. */ static int ip_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); /* Handle IP options with ip_forward_options when any are present (optlen != 0). */ if (unlikely(opt->optlen)) ip_forward_options(skb); /* Hand the packet to the output stage; the braces below only illustrate what dst_output() does. */ return dst_output(skb); { skb_dst(skb)->output(skb); } } /* Registration of the destination dispatch handlers (the input and output callbacks stored in the route entry). */ static int __mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { //....... rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; //...... 
} /* Output handler that __mkroute_input stores in dst.output for forwarded packets. */ int ip_output(struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; /* Point skb->dev at the output device. */ skb->dev = dev; /* Set the layer-2 packet type to ETH_P_IP. */ skb->protocol = htons(ETH_P_IP); /* Process the packet with the hooks registered on NF_INET_POST_ROUTING (NF_IP_POST_ROUTING), then continue in ip_finish_output. The last argument is true only while IPSKB_REROUTED is not set; presumably the hooks are skipped otherwise - TODO confirm against NF_HOOK_COND. */ return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* Decide whether IP fragmentation is required. */ static int ip_finish_output(struct sk_buff *skb) { /* If the packet is larger than the MTU and is not a GSO packet, fragment it and pass the fragments to ip_finish_output2; otherwise pass it to ip_finish_output2 directly. */ if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); } /* Neighbour operation tables: in this excerpt both use neigh_resolve_output for output and dev_queue_xmit for hh_output. */ static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .output = neigh_resolve_output, .hh_output = dev_queue_xmit, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .output = neigh_resolve_output, .hh_output = dev_queue_xmit, }; /* Selects the output routine of the cached layer-2 header according to the neighbour state. */ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, __be16 protocol) { struct hh_cache *hh; //...... if (n->nud_state & NUD_CONNECTED) hh->hh_output = n->ops->hh_output; /* i.e. dev_queue_xmit */ else hh->hh_output = n->ops->output; //...... } static void neigh_suspect(struct neighbour *neigh) { //..... 
neigh->output = neigh->ops->output; /* i.e. neigh_resolve_output */ } /* Final IP output step: ensure layer-2 headroom, then hand the packet to the neighbour layer. */ static inline int ip_finish_output2(struct sk_buff *skb) { /* If there is not enough room for the layer-2 header, reallocate an skb with sufficient headroom, copy the data into the new skb and free the original one. NOTE(review): skb2 is used without a NULL check in this excerpt. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); if (skb->sk) skb_set_owner_w(skb2, skb->sk); kfree_skb(skb); skb = skb2; } /* If the route entry already references a cached layer-2 header (dst->hh), go through neigh_hh_output. */ if (dst->hh) return neigh_hh_output(dst->hh, skb); /* Without dst->hh but with dst->neighbour, start address resolution, i.e. neigh_resolve_output. */ else if (dst->neighbour) return dst->neighbour->output(skb); /* NOTE(review): no return is shown for the case where neither dst->hh nor dst->neighbour is set. */ } static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) { /* Copy the cached layer-2 header directly into the packet data area in front of skb->data, then extend the data area by hh_len. */ memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); skb_push(skb, hh_len); /* Call hh->hh_output(skb), i.e. dev_queue_xmit, to send the packet through the hardware. */ return hh->hh_output(skb); }
如果IP是上送本地CPU的报文,处理流程如下
/* Local delivery path: ip_local_deliver -> ip_local_deliver_finish -> the transport handler looked up in inet_protos. Pseudo-code: the declarations of raw, protocol, hash, ipprot and ret are omitted from this excerpt. */
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------- /* Local delivery of a packet. */ int ip_local_deliver(struct sk_buff *skb) { /* Collect and reassemble IP fragments; when ip_defrag returns non-zero (per the article: reassembly is not complete yet) return 0 and wait for the remaining fragments. */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } /* Run the NF_INET_LOCAL_IN (NF_IP_LOCAL_IN) filter hooks, then continue in ip_local_deliver_finish. */ return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish); } /* Protocol dispatch once IP-layer processing is complete. */ static int ip_local_deliver_finish(struct sk_buff *skb) { resubmit: /* If this is a RAW-IP packet, hand it to the RAW-IP processing (the article marks this step as uncertain). */ raw = raw_local_deliver(skb, protocol); /* MAX_INET_PROTOS-1 is the mask applied to the protocol number from the IP header; this computes the slot of the protocol in inet_protos. */ hash = protocol & (MAX_INET_PROTOS - 1); /* Per the article, ipprot at the IP layer manages all transport protocols; the entry is read from inet_protos under RCU. */ ipprot = rcu_dereference(inet_protos[hash]); /* If a matching protocol is found, call its handler; a negative return value is negated and resubmitted as the new protocol number. */ if (ipprot != NULL) { ret = ipprot->handler(skb); if (ret < 0) { protocol = -ret; goto resubmit; } } /* No matching handler was found. */ else { /* If raw_local_deliver did not take the packet either (the article marks this as uncertain), send ICMP_DEST_UNREACH / ICMP_PROT_UNREACH to the peer; the packet is freed in both cases. */ if (!raw) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb(skb); } return 0; } /* Transport protocol tables; only the receive handler of each is shown. */ static const struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, /*TCP*/ }; static const struct net_protocol udp_protocol = { .handler = udp_rcv, /*UDP*/ }; static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, /*ICMP*/ }; static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, /*IGMP*/ };
通过上面的分析讲解,我们就可以很清楚地了解到IP层是如何接收一个来自于网卡的数据包,并如何进行三层报文的处理。
关于二层数据报文的处理流程,以及报文是如何被送到三层进行处理的,请参考我前面的博客。
希望大家批评指正