网络收包流程从网卡驱动开始,一直往上,涉及NAPI、GRO、RPS等特性,但是一般最后都会调用__netif_receive_skb函数:
函数主要有几个处理:
1、vlan报文的处理,主要是循环把vlan头剥掉,如果qinq场景,两个vlan都会被剥掉;
2、交给rx_handler处理,例如OVS、linux bridge等;
3、ptype_all处理,例如抓包程序、raw socket等;
4、ptype_base处理,交给协议栈处理,例如ip、arp、rarp等;
- static int __netif_receive_skb(struct sk_buff *skb) /* Core receive path: hands the skb to the ptype_all taps, the device rx_handler, VLAN processing and finally the ptype_base protocol handlers; returns the last handler's result, or NET_RX_DROP. */
- {
- struct packet_type *ptype, *pt_prev;
- rx_handler_func_t *rx_handler;
- struct net_device *orig_dev;
- struct net_device *null_or_dev;
- bool deliver_exact = false;
- int ret = NET_RX_DROP;
- __be16 type;
-
- if (!netdev_tstamp_prequeue)
- net_timestamp_check(skb);
-
- trace_netif_receive_skb(skb);
-
- if (netpoll_receive_skb(skb))
- return NET_RX_DROP;
-
- if (!skb->skb_iif)
- skb->skb_iif = skb->dev->ifindex;
- orig_dev = skb->dev;
-
- skb_reset_network_header(skb); //point both the L3 and L4 header offsets at skb->data; by this point the L2 header has already been processed
- skb_reset_transport_header(skb);
- skb_reset_mac_len(skb);
-
- pt_prev = NULL;
-
- rcu_read_lock();
-
- another_round:
-
- __this_cpu_inc(softnet_data.processed);
-
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
- skb = vlan_untag(skb);
- if (unlikely(!skb))
- goto out;
- }
-
- #ifdef CONFIG_NET_CLS_ACT
- if (skb->tc_verd & TC_NCLS) {
- skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
- goto ncls;
- }
- #endif
-
- list_for_each_entry_rcu(ptype, &ptype_all, list) { //before handing the packet to protocol-specific handlers, first call the handlers registered in ptype_all
- if (!ptype->dev || ptype->dev == skb->dev) { //the most common user is tcpdump, which obtains every received packet here
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype; //pt_prev is an optimisation: a matched handler is only invoked once the next match has been found
- }
- }
-
- #ifdef CONFIG_NET_CLS_ACT
- skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
- if (!skb)
- goto out;
- ncls:
- #endif
- rx_handler = rcu_dereference(skb->dev->rx_handler); //original note: "decided by the specific driver"; NOTE(review): presumably whatever is registered on the device (e.g. OVS, Linux bridge) - confirm
- if (rx_handler) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
- }
- switch (rx_handler(&skb)) {
- case RX_HANDLER_CONSUMED:
- goto out;
- case RX_HANDLER_ANOTHER:
- goto another_round;
- case RX_HANDLER_EXACT:
- deliver_exact = true; /* no break: falls through to RX_HANDLER_PASS */
- case RX_HANDLER_PASS:
- break;
- default:
- BUG();
- }
- }
-
- if (vlan_tx_tag_present(skb)) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
- }
- if (vlan_do_receive(&skb)) {
- ret = __netif_receive_skb(skb);
- goto out;
- } else if (unlikely(!skb))
- goto out;
- }
-
- /* deliver only exact match when indicated */
- null_or_dev = deliver_exact ? skb->dev : NULL;
-
- type = skb->protocol;
- list_for_each_entry_rcu(ptype,
- &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
- if (ptype->type == type &&
- (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
- ptype->dev == orig_dev)) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev); //original note: atomic_inc(&skb->users); presumably deliver_skb takes an extra skb reference before calling the handler
- pt_prev = ptype;
- }
- }
-
- if (pt_prev) {
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); //the last matched handler is called directly, without the extra reference-count increment
- } else {
- atomic_long_inc(&skb->dev->rx_dropped);
- kfree_skb(skb);
- /* Jamal, now you will not able to escape explaining
- * me how you were going to use this. :-)
- */
- ret = NET_RX_DROP;
- }
-
- out:
- rcu_read_unlock();
- return ret;
- }
该函数涉及两个全局变量:
- static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; /* per-protocol handler lists, indexed by ntohs(type) & PTYPE_HASH_MASK */
- static struct list_head ptype_all __read_mostly; /* handlers walked for every received packet */
看几个常见的packet_type,这些都在相应的协议初始化的时候调用dev_add_pack加入到特定的链表中:
- static struct packet_type ip_packet_type __read_mostly = { /* packet_type entry for IPv4 (ETH_P_IP) */
- .type = cpu_to_be16(ETH_P_IP),
- .func = ip_rcv, /* receive handler, invoked through pt_prev->func / deliver_skb */
- .gso_send_check = inet_gso_send_check,
- .gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
- };
-
- static struct packet_type arp_packet_type __read_mostly = { /* packet_type entry for ARP (ETH_P_ARP) */
- .type = cpu_to_be16(ETH_P_ARP),
- .func = arp_rcv, /* receive handler for ARP frames */
- };
在ip_rcv函数中会对L3头做一些有效性检测:
- int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) /* IPv4 receive entry: validates the IP header and lengths, trims padding, then passes the skb through the NF_INET_PRE_ROUTING hook to ip_rcv_finish; returns NET_RX_DROP on any failure path. */
- {
- const struct iphdr *iph;
- u32 len;
-
- /* When the interface is in promisc. mode, drop all the crap
- * that it receives, do not try to analyse it.
- */
- if (skb->pkt_type == PACKET_OTHERHOST) //pkt_type is set by the driver from the MAC address; frames whose MAC is not this host's are dropped here
- goto drop;
-
-
- IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
-
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
- goto out;
- }
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto inhdr_error;
-
- iph = ip_hdr(skb);
-
- /*
- * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
- *
- * Is the datagram acceptable?
- *
- * 1. Length at least the size of an ip header
- * 2. Version of 4
- * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
- * 4. Doesn't have a bogus length
- */
-
- if (iph->ihl < 5 || iph->version != 4)
- goto inhdr_error;
-
- if (!pskb_may_pull(skb, iph->ihl*4))
- goto inhdr_error;
-
- iph = ip_hdr(skb);
-
- if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) //verify the IP header checksum
- goto inhdr_error;
- len = ntohs(iph->tot_len); //tot_len in the IP header is the real length; skb->len is set by the driver and may be larger because small frames are padded
- if (skb->len < len) {//e.g. on r8169, a UDP packet with a 1-byte payload has tot_len 20+8+1=29 while skb->len is 46 = 64(min)-14-4 (the original note labels this 4 as "vlan"; NOTE(review): in a minimum-size Ethernet frame these 4 bytes are normally the FCS - confirm)
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
- goto drop;
- } else if (len < (iph->ihl*4))
- goto inhdr_error;
-
- /* Our transport medium may have padded the buffer out. Now we know it
- * is IP we can trim to the true length of the frame.
- * Note this now means skb->len holds ntohs(iph->tot_len).
- */
- if (pskb_trim_rcsum(skb, len)) { //strip the padding bytes
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
-
- /* Remove any debris in the socket control block */
- memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-
- /* Must drop socket now because of tproxy. */
- skb_orphan(skb);
-
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
- ip_rcv_finish);
-
- inhdr_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
- drop:
- kfree_skb(skb);
- out:
- return NET_RX_DROP;
- }
然后调用ip_rcv_finish:
- static int ip_rcv_finish(struct sk_buff *skb) /* Second stage of IPv4 receive: performs the input route lookup if the skb has no dst, handles IP options, updates multicast/broadcast counters and hands the skb to dst_input(). */
- {
- const struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt;
-
- /*
- * Initialise the virtual path cache for the packet. It describes
- * how the packet travels inside Linux networking.
- */
- if (skb_dst(skb) == NULL) {
- int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,//route lookup: decides from the destination address whether the packet is received locally or forwarded (if forwarding is enabled)
- iph->tos, skb->dev);
- if (unlikely(err)) {
- if (err == -EHOSTUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INADDRERRORS);
- else if (err == -ENETUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INNOROUTES);
- else if (err == -EXDEV)
- NET_INC_STATS_BH(dev_net(skb->dev),
- LINUX_MIB_IPRPFILTER);
- goto drop;
- }
- }
-
- #ifdef CONFIG_IP_ROUTE_CLASSID
- if (unlikely(skb_dst(skb)->tclassid)) {
- struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
- u32 idx = skb_dst(skb)->tclassid;
- st[idx&0xFF].o_packets++;
- st[idx&0xFF].o_bytes += skb->len;
- st[(idx>>16)&0xFF].i_packets++;
- st[(idx>>16)&0xFF].i_bytes += skb->len;
- }
- #endif
-
- if (iph->ihl > 5 && ip_rcv_options(skb))
- goto drop;
-
- rt = skb_rtable(skb);
- if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
- skb->len);
- } else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
- skb->len);
-
- return dst_input(skb); //i.e. skb_dst(skb)->input(skb); the callback is assigned during route lookup, and for local delivery it is ip_local_deliver
-
- drop:
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- int ip_local_deliver(struct sk_buff *skb) /* Local delivery entry: passes fragments to ip_defrag() first, then runs the NF_INET_LOCAL_IN hook with ip_local_deliver_finish as the continuation. */
- {
- /*
- * Reassemble IP fragments.
- */
-
- if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
- return 0; /* non-zero from ip_defrag: stop here and report 0 to the caller */
- }
-
- return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
- ip_local_deliver_finish);
- }
略过ip defrag流程,直接调用ip_local_deliver_finish,该函数根据L3头指定的L4协议,调用特定的函数:
- static int ip_local_deliver_finish(struct sk_buff *skb) /* Strips the IP header and dispatches the skb to the L4 handler registered in inet_protos[] for the header's protocol number; always returns 0. */
- {
- struct net *net = dev_net(skb->dev);
-
- __skb_pull(skb, ip_hdrlen(skb)); //advance data past the L3 header, so data now points at the L4 header
-
- /* Point into the IP datagram, just past the header. */
- skb_reset_transport_header(skb);
-
- rcu_read_lock();
- {
- int protocol = ip_hdr(skb)->protocol; //L4 protocol type, e.g. TCP or UDP
- int hash, raw;
- const struct net_protocol *ipprot;
-
- resubmit:
- raw = raw_local_deliver(skb, protocol); //a non-zero result is counted as a delivery below when no protocol handler is registered
-
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]); //e.g. udp_protocol for UDP
- if (ipprot != NULL) {
- int ret;
-
- if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
- if (net_ratelimit())
- printk("%s: proto %d isn't netns-ready\n",
- __func__, protocol);
- kfree_skb(skb);
- goto out;
- }
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
- }
- ret = ipprot->handler(skb); //e.g. udp_rcv for UDP
- if (ret < 0) { /* a negative return asks for resubmission as protocol -ret */
- protocol = -ret;
- goto resubmit;
- }
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
- } else {
- if (!raw) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- } else
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
- }
- }
- out:
- rcu_read_unlock();
-
- return 0;
- }
udp调用udp_rcv,最后调用__udp4_lib_rcv:
- int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, /* UDP/UDP-Lite receive: validates the header and lengths, looks up the destination socket and queues the skb to it, or answers with ICMP port unreachable when no socket matches. */
- int proto)
- {
- struct sock *sk;
- struct udphdr *uh;
- unsigned short ulen;
- struct rtable *rt = skb_rtable(skb);
- __be32 saddr, daddr;
- struct net *net = dev_net(skb->dev);
-
- /*
- * Validate the packet.
- */
- if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- goto drop; /* No space for header. */
-
- uh = udp_hdr(skb);
- ulen = ntohs(uh->len);
- saddr = ip_hdr(skb)->saddr;
- daddr = ip_hdr(skb)->daddr;
-
- if (ulen > skb->len)
- goto short_packet;
-
- if (proto == IPPROTO_UDP) {
- /* UDP validates ulen. */
- if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
- goto short_packet;
- uh = udp_hdr(skb);
- }
-
- if (udp4_csum_init(skb, uh, proto))
- goto csum_error;
-
- if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
- return __udp4_lib_mcast_deliver(net, skb, uh,
- saddr, daddr, udptable);
-
- sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); //look up the matching sock structure by IP addresses and port numbers
- //the receiving process sleeps on the corresponding list
- if (sk != NULL) { //non-NULL means a process is waiting for this data
- int ret = udp_queue_rcv_skb(sk, skb);
- sock_put(sk);
-
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
- if (ret > 0)
- return -ret;
- return 0;
- }
-
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto drop;
- nf_reset(skb);
-
- /* No socket. Drop packet silently, if checksum is wrong */
- if (udp_lib_checksum_complete(skb))
- goto csum_error;
-
- UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-
- /*
- * Hmm. We got an UDP packet to a port to which we
- * don't wanna listen. Ignore it.
- */
- kfree_skb(skb);
- return 0;
- } /* NOTE: the short_packet, csum_error and drop labels targeted by the gotos above are not shown in this excerpt */
首先看一下sock的hash查找函数:__udp4_lib_lookup_skb,该函数涉及hash表的一些查找,主要看一下具体的匹配函数:
- static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, /* Scores how well sk matches the packet's addresses, ports and interface; returns -1 for no match, otherwise a score that grows by 2 for each bound field that matches. */
- unsigned short hnum,
- __be16 sport, __be32 daddr, __be16 dport, int dif)
- {
- int score = -1;
-
- if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
- !ipv6_only_sock(sk)) {
- struct inet_sock *inet = inet_sk(sk);
-
- score = (sk->sk_family == PF_INET ? 1 : 0); //usually PF_INET
- if (inet->inet_rcv_saddr) { //set when bind() specified an address, otherwise INADDR_ANY
- if (inet->inet_rcv_saddr != daddr)
- return -1;
- score += 2;
- }
- if (inet->inet_daddr) { //usually 0; see inet_bind()
- if (inet->inet_daddr != saddr)
- return -1;
- score += 2;
- }
- if (inet->inet_dport) { //usually 0
- if (inet->inet_dport != sport)
- return -1;
- score += 2;
- }
- if (sk->sk_bound_dev_if) { //usually 0
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score += 2;
- }
- }
- return score;
- }
该函数使用端口号寻找hash表中项,然后根据各个参数决定score,score大于-1表示找到对应的sock
找到sock后,去掉一些有效性检测,udp_queue_rcv_skb的逻辑如下:
- if (sk_rcvqueues_full(sk, skb)) //over the limit (sk->sk_rmem_alloc)
- goto drop;
-
- rc = 0;
-
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk)) /* socket not owned by a user context: queue directly */
- rc = __udp_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb)) { /* owned: try the backlog; non-zero means it could not be queued */
- bh_unlock_sock(sk);
- goto drop;
- }
- bh_unlock_sock(sk);
分成两种情况:
1)sk没有被人占用,则把skb加入sk_receive_queue,然后唤醒等待的进程。
2)如果sk被人占用,则把skb加入backlog链表,释放sk的时候会处理这种流程
先看第一种情况:
- int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* Appends skb to sk->sk_receive_queue after receive-buffer accounting and the socket filter, then calls sk_data_ready; returns 0 on success or a negative error (-ENOMEM, -ENOBUFS, or the filter's error). */
- {
- int err;
- int skb_len;
- unsigned long flags;
- struct sk_buff_head *list = &sk->sk_receive_queue; //get the list head of the receive queue
-
- /* Cast sk->rcvbuf to unsigned... It is pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf) {
- atomic_inc(&sk->sk_drops);
- trace_sock_rcvqueue_full(sk, skb);
- return -ENOMEM;
- }
-
- err = sk_filter(sk, skb);
- if (err)
- return err;
-
- if (!sk_rmem_schedule(sk, skb->truesize)) {
- atomic_inc(&sk->sk_drops);
- return -ENOBUFS;
- }
-
- skb->dev = NULL;
- skb_set_owner_r(skb, sk);
-
- /* Cache the SKB length before we tack it onto the receive
- * queue. Once it is added it no longer belongs to us and
- * may be freed by other threads of control pulling packets
- * from the queue.
- */
- skb_len = skb->len;
-
- /* we escape from rcu protected region, make sure we dont leak
- * a norefcounted dst
- */
- skb_dst_force(skb);
- spin_lock_irqsave(&list->lock, flags);
- skb->dropcount = atomic_read(&sk->sk_drops);
- __skb_queue_tail(list, skb);
- spin_unlock_irqrestore(&list->lock, flags);
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len); //assigned to sock_def_readable when sock_init_data initialises the socket
- return 0;
- }
- static void sock_def_readable(struct sock *sk, int len) /* Data-ready callback: wakes sleepers on the socket's wait queue for readable events and calls sk_wake_async(); len is not used in the body. */
- {
- struct socket_wq *wq;
-
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq)) /* only issue the wake-up when someone is actually sleeping */
- wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
- POLLRDNORM | POLLRDBAND);
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
- rcu_read_unlock();
- }
再看第二种情况,加入到对应的链表:
- static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) /* Appends skb to the tail of the singly linked sk->sk_backlog list (head/tail pointers). */
- {
- /* dont let skb dst not refcounted, we are going to leave rcu lock */
- skb_dst_force(skb);
-
- if (!sk->sk_backlog.tail) /* empty list: skb becomes the head */
- sk->sk_backlog.head = skb;
- else
- sk->sk_backlog.tail->next = skb;
-
- sk->sk_backlog.tail = skb;
- skb->next = NULL;
- }
释放sock的时候会判断该链表:
- void release_sock(struct sock *sk) /* Releases ownership of the socket: processes any backlog queued while it was owned, clears sk_lock.owned and wakes waiters on sk_lock.wq. */
- {
- /*
- * The sk_lock has mutex_unlock() semantics:
- */
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
- spin_lock_bh(&sk->sk_lock.slock);
- if (sk->sk_backlog.tail) /* non-NULL tail means skbs were added to the backlog */
- __release_sock(sk);
- sk->sk_lock.owned = 0;
- if (waitqueue_active(&sk->sk_lock.wq))
- wake_up(&sk->sk_lock.wq);
- spin_unlock_bh(&sk->sk_lock.slock);
- }
__release_sock会遍历tail对应链表上的所有skb,分别调用sk_backlog_rcv函数:
- static void __release_sock(struct sock *sk) /* Drains the backlog: repeatedly detaches the whole list, then feeds each skb to sk_backlog_rcv() with the socket spinlock dropped, until no new skbs have been queued. */
- __releases(&sk->sk_lock.slock)
- __acquires(&sk->sk_lock.slock)
- {
- struct sk_buff *skb = sk->sk_backlog.head;
-
- do {
- sk->sk_backlog.head = sk->sk_backlog.tail = NULL; /* take the current list private before unlocking */
- bh_unlock_sock(sk);
-
- do {
- struct sk_buff *next = skb->next;
-
- WARN_ON_ONCE(skb_dst_is_noref(skb));
- skb->next = NULL;
- sk_backlog_rcv(sk, skb); //i.e. sk->sk_backlog_rcv(sk, skb), which is sk->sk_prot->backlog_rcv
-
- /*
- * We are in process context here with softirqs
- * disabled, use cond_resched_softirq() to preempt.
- * This is safe to do because we've taken the backlog
- * queue private:
- */
- cond_resched_softirq();
-
- skb = next;
- } while (skb != NULL);
-
- bh_lock_sock(sk);
- } while ((skb = sk->sk_backlog.head) != NULL); /* loop again if more skbs arrived while unlocked */
-
- /*
- * Doing the zeroing here guarantee we can not loop forever
- * while a wild producer attempts to flood us.
- */
- sk->sk_backlog.len = 0;
- }
对于udp为__udp_queue_rcv_skb:
- static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* Queues skb on the UDP socket via ip_queue_rcv_skb(); on failure it updates the UDP error counters, frees the skb and returns -1, otherwise returns 0. */
- {
- int rc;
-
- if (inet_sk(sk)->inet_daddr)
- sock_rps_save_rxhash(sk, skb->rxhash);
-
- rc = ip_queue_rcv_skb(sk, skb); //calls sock_queue_rcv_skb, i.e. the same path as the first case
- if (rc < 0) {
- int is_udplite = IS_UDPLITE(sk);
-
- /* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
- trace_udp_fail_queue_rcv_skb(rc, sk);
- return -1;
- }
-
- return 0;
-
- }
一句话总结,对于udp而言,__netif_receive_skb把底层传上来的skb放到sock对应的sk_receive_queue链表中,然后唤醒等待数据的进程。
ARP报文处理:
在netif_receive_skb()函数中,可以看出处理的是像ARP、IP这些链路层以上的协议,那么,链路层报头是在哪里去掉的呢?答案是在网卡驱动中:在调用netif_receive_skb()前,驱动会先执行:
skb->protocol = eth_type_trans(skb, bp->dev);
经该函数处理后,skb->data跳过以太网报头,由mac_header指示以太网报头:
进入netif_receive_skb()函数
list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list)
按照协议类型依次由相应的协议模块进行处理,而所有的协议模块处理都会注册在ptype_base中,实际是链表结构。
net/core/dev.c
static struct list_head ptype_base __read_mostly; /* Taps */
而相应的协议模块是通过dev_add_pack()函数加入的:
/*
 * Register a packet_type handler: ETH_P_ALL handlers are added to ptype_all,
 * every other type is added to the ptype_base bucket selected by its type
 * hash. The insertion is done under ptype_lock.
 */
void dev_add_pack(struct packet_type *pt)
{
int hash;
spin_lock_bh(&ptype_lock);
if (pt->type == htons(ETH_P_ALL))
list_add_rcu(&pt->list, &ptype_all);
else {
hash = ntohs(pt->type) & PTYPE_HASH_MASK; /* same bucket computation the receive path uses for lookup */
list_add_rcu(&pt->list, &ptype_base[hash]);
}
spin_unlock_bh(&ptype_lock);
}
以ARP处理为例
该模块的定义,它会在arp_init()中注册进ptype_base链表中:
/* packet_type entry for ARP: frames with type ETH_P_ARP are handled by arp_rcv. */
static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
然后在根据报文的TYPE来在ptype_base中查找相应协议模块进行处理时,实际调用arp_rcv()进行接收
arp_rcv() --> arp_process()
arp = arp_hdr(skb);
……
arp_ptr= (unsigned char *)(arp+1);
sha= arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
arp_ptr += dev->addr_len;
memcpy(&tip, arp_ptr, 4);
操作后这指针位置:
然后判断是ARP请求报文,这时先查询路由表ip_route_input()
if (arp->ar_op == htons(ARPOP_REQUEST) &&
ip_route_input(skb, tip, sip, 0, dev) == 0)
在ip_route_input()函数中,先在cache中查询是否存在相应的路由表项:
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
缓存的路由项在内核中组织成hash表的形式,因此在查询时,先算出hash值,再查找该项对应的链表rt_hash_table[hash].chain即可。这里可以看到,缓存路由项包括了源IP地址、目的IP地址、网卡号。
如果在缓存中没有查到匹配项,或指定不查询cache,则查询路由表ip_route_input_slow();
进入ip_route_input_slow()函数,最终调用fib_lookup()得到查询结果fib_result
if ((err = fib_lookup(net, &fl, &res)) != 0)
如果结果fib_result合法,则需要更新路由缓存,将此次查询结果写入缓存
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
在查找完路由表后,回到arp_process()函数,如果路由项指向本地,则应由本机接收该报文:
if (addr_type == RTN_LOCAL) {
……
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
neigh_release(n);
}
}
goto out;
}
首先更新邻居表neigh_event_ns(),然后发送ARP响应 – arp_send。
至此,大致的ARP流程完成。由于ARP部分涉及到路由表以及邻居表,这都是很大的概念。