5. 网络层和传输层收包处理

网络层和传输层收包处理

  1. 网络层在哪收包?
  2. 如何递交到传输层?

在这里我们不讨论路由子系统,也就是假设我知道数据包应该被网络层哪个函数处理。

框架

有了上一节的铺垫,我们已经知道协议栈初始化时注册了哪些处理函数。我们分析__netif_receive_skb_core函数中的那个函数指针调用ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);,它在IPv4中调用的是ip_rcv,在IPv6中调用的是ipv6_rcv。我们以ip_rcv为例子继续分析。先放上一张蓝图,后面看代码时能给我们一个框架印象。

转发流程.jpg

实现

/*
 * IPv4 receive routine (excerpt: the body between the declarations and the
 * final return is elided in this listing).
 *
 * The visible part ends by handing the skb to the NF_INET_PRE_ROUTING
 * netfilter hook, with ip_rcv_finish passed as the function to continue with.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    const struct iphdr *iph;
    struct net *net;
    u32 len;

    // ...... (elided in this excerpt)

    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
               net, NULL, skb, dev, NULL,
               ip_rcv_finish);
}

ip_rcv函数在对skb的IP头做必要的检查后,像往常一样会调用ip_rcv_finish函数。这类带finish后缀的函数,是在netfilter检查通过后调用的。关于netfilter,后面再说,这个坑和路由子系统一样,是个大坑。

/*
 * Continuation of ip_rcv, passed to NF_HOOK there.
 *
 * Steps visible below:
 *   1. let an L3 master device handler take the skb (l3mdev_ip_rcv);
 *   2. optionally run the L4 protocol's early_demux callback;
 *   3. if the skb has no valid dst yet, look up the input route with
 *      ip_route_input_noref();
 *   4. update accounting/statistics and process IP options if present;
 *   5. hand the packet to dst_input().
 *
 * Returns NET_RX_SUCCESS when l3mdev_ip_rcv() returns no skb, NET_RX_DROP
 * on the drop paths (after kfree_skb), otherwise whatever dst_input() returns.
 */
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    int (*edemux)(struct sk_buff *skb);
    struct net_device *dev = skb->dev;
    struct rtable *rt;
    int err;

    /* if ingress device is enslaved to an L3 master device pass the
     * skb to its handler for processing
     */
    skb = l3mdev_ip_rcv(skb);
    if (!skb)
        return NET_RX_SUCCESS;

    // Early demux: only attempted when the sysctl is enabled, the skb has
    // no dst and no socket attached yet, and the packet is not a fragment.
    if (net->ipv4.sysctl_ip_early_demux &&
        !skb_dst(skb) &&
        !skb->sk &&
        !ip_is_fragment(iph)) {
        const struct net_protocol *ipprot;
        int protocol = iph->protocol;

        ipprot = rcu_dereference(inet_protos[protocol]);
        if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
            err = edemux(skb);
            if (unlikely(err))
                goto drop_error;
            /* must reload iph, skb->head might have changed */
            iph = ip_hdr(skb);
        }
    }

    /*
     *  Initialise the virtual path cache for the packet. It describes
     *  how the packet travels inside Linux networking.
     */
    // Route lookup
    if (!skb_valid_dst(skb)) {
        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                       iph->tos, dev);
        if (unlikely(err))
            goto drop_error;
    }

#ifdef CONFIG_IP_ROUTE_CLASSID
    if (unlikely(skb_dst(skb)->tclassid)) {
        struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
        u32 idx = skb_dst(skb)->tclassid;
        st[idx&0xFF].o_packets++;
        st[idx&0xFF].o_bytes += skb->len;
        st[(idx>>16)&0xFF].i_packets++;
        st[(idx>>16)&0xFF].i_bytes += skb->len;
    }
#endif

    // ihl > 5 means the IP header is longer than 5 32-bit words, i.e. it
    // carries options; a non-zero ip_rcv_options() result drops the packet.
    if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
    } else if (rt->rt_type == RTN_BROADCAST) {
        __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
    } else if (skb->pkt_type == PACKET_BROADCAST ||
           skb->pkt_type == PACKET_MULTICAST) {
        struct in_device *in_dev = __in_dev_get_rcu(dev);

        /* RFC 1122 3.3.6:
         *
         *   When a host sends a datagram to a link-layer broadcast
         *   address, the IP destination address MUST be a legal IP
         *   broadcast or IP multicast address.
         *
         *   A host SHOULD silently discard a datagram that is received
         *   via a link-layer broadcast (see Section 2.4) but does not
         *   specify an IP multicast or broadcast destination address.
         *
         * This doesn't explicitly say L2 *broadcast*, but broadcast is
         * in a way a form of multicast and the most common use case for
         * this is 802.11 protecting against cross-station spoofing (the
         * so-called "hole-196" attack) so do it for both.
         */
        if (in_dev &&
            IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
            goto drop;
    }
    
    // Call the input handler assigned by the routing subsystem; it may be
    // ip_local_deliver or ip_forward.
    return dst_input(skb);

drop:
    kfree_skb(skb);
    return NET_RX_DROP;

drop_error:
    // -EXDEV is counted under LINUX_MIB_IPRPFILTER before dropping.
    if (err == -EXDEV)
        __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
    goto drop;
}

ip_rcv_finish函数最重要的任务就是:通过路由子系统,找到我们的数据包后续应该交给谁来处理,这个任务是由ip_route_input_noref函数完成的。在开篇我们说过,在这里我们预设已经知道数据包后续将交给谁处理:

  1. 发往本地的数据包:ip_local_deliver
  2. 转发的数据包:ip_forward
  3. 组播数据包:ip_mr_input

接下来我们以发往本地的TCP包作为例子继续分析。

/*
 * Input handler for packets addressed to the local host.
 *
 * Fragments are first passed to ip_defrag(); when that returns non-zero the
 * function returns 0 without going further. Otherwise the skb is handed to
 * the NF_INET_LOCAL_IN netfilter hook, with ip_local_deliver_finish passed
 * as the function to continue with.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    /*
     *  Reassemble IP fragments.
     */
    struct net *net = dev_net(skb->dev);

    if (ip_is_fragment(ip_hdr(skb))) {
        if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
            return 0;
    }

    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
               net, NULL, skb, skb->dev, NULL,
               ip_local_deliver_finish);
}

同样的手法,继续看ip_local_deliver_finish

/*
 * Continuation of ip_local_deliver: hands the packet to the L4 protocol
 * handler registered in inet_protos[] for the IP header's protocol number.
 *
 * A negative return from the handler is negated and used as a new protocol
 * number, and the lookup/delivery is repeated (the "resubmit" label).
 * When no handler is registered and raw_local_deliver() returned 0, an ICMP
 * protocol-unreachable is sent if the xfrm policy check passes, and the skb
 * is freed. Always returns 0.
 */
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    // Pull skb_network_header_len(skb) bytes off the front of the skb data.
    __skb_pull(skb, skb_network_header_len(skb));

    rcu_read_lock();
    {
        int protocol = ip_hdr(skb)->protocol;
        const struct net_protocol *ipprot;
        int raw;

    resubmit:
        raw = raw_local_deliver(skb, protocol);
        // Fetch the matching net_protocol entry from the inet_protos array;
        // for TCP this is tcp_protocol.
        ipprot = rcu_dereference(inet_protos[protocol]);
        if (ipprot) {
            int ret;

            if (!ipprot->no_policy) {
                if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    kfree_skb(skb);
                    goto out;
                }
                nf_reset(skb);
            }
            // Call the L4 protocol handler; for TCP this is
            // tcp_protocol->handler, i.e. tcp_v4_rcv().
            ret = ipprot->handler(skb);
            if (ret < 0) {
                protocol = -ret;
                goto resubmit;
            }
            __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
        } else {
            if (!raw) {
                if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
                    icmp_send(skb, ICMP_DEST_UNREACH,
                          ICMP_PROT_UNREACH, 0);
                }
                kfree_skb(skb);
            } else {
                __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
                consume_skb(skb);
            }
        }
    }
 out:
    rcu_read_unlock();

    return 0;
}

ip_local_deliver_finish函数以ip_hdr(skb)->protocol作为索引,从全局变量inet_protos中找到对应协议的处理函数。inet_protos是啥?在哪初始化的?前面在协议栈初始化时已经介绍了:它是inet_init函数通过调用inet_add_protocol注册的。

if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)      // register the TCP protocol
        pr_crit("%s: Cannot add TCP protocol\n", __func__);

/* net_protocol entry for TCP, registered under IPPROTO_TCP via inet_add_protocol(). */
static struct net_protocol tcp_protocol = {
    .early_demux    =   tcp_v4_early_demux,
    .early_demux_handler =  tcp_v4_early_demux,
    .handler    =   tcp_v4_rcv,     // invoked from ip_local_deliver_finish as ipprot->handler(skb)
    .err_handler    =   tcp_v4_err,
    .no_policy  =   1,
    .netns_ok   =   1,
    .icmp_strict_tag_validation = 1,
};

继续看tcp_v4_rcv

/*
 * TCP receive handler registered as tcp_protocol.handler (excerpt: several
 * parts, including the initialisation of th and the targets of the gotos,
 * are elided in this listing).
 *
 * The visible part looks up the socket for the segment, branches away for
 * TIME_WAIT sockets, and then either processes the skb directly or adds it
 * to the backlog, depending on whether the socket is owned by a user.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);
    int sdif = inet_sdif(skb);
    const struct iphdr *iph;
    const struct tcphdr *th;
    bool refcounted;
    struct sock *sk;
    int ret;
    // ...... (elided in this excerpt)
lookup:
    // Look up the matching sock from the 4-tuple.
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
                   th->dest, sdif, &refcounted);
    if (!sk)
        goto no_tcp_socket;

process:
    // A socket in TIME_WAIT state goes to its dedicated handling (not analysed
    // in this article; to be covered with the TCP teardown state transitions).
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;
    // ...... (elided in this excerpt)
    
    // Per the author, this macro simply tests (sk)->sk_lock.owned, which is 1
    // while process context is using this sock.
    if (!sock_owned_by_user(sk)) {
        // Let tcp_v4_do_rcv process this skb (per the author, in effect it is
        // placed on the receive queue).
        ret = tcp_v4_do_rcv(sk, skb);
    } else if (tcp_add_backlog(sk, skb)) {// a process is using this sock, so the buffer goes to sk_backlog; non-zero return means discard
        goto discard_and_relse;
    }
    bh_unlock_sock(sk);

    // ...... (elided in this excerpt)
}

tcp_v4_rcv函数主要做以下几个工作:

  1. 设置TCP_CB;
  2. 查找控制块;
  3. 根据控制块状态做不同处理,包括TCP_TIME_WAIT状态处理,TCP_NEW_SYN_RECV状态处理,TCP_LISTEN状态处理;
  4. 接收TCP段;

我们假设收到这个数据包时,TCP连接已经建立(sk->sk_state == TCP_ESTABLISHED)。这时会调用tcp_v4_do_rcv函数将数据包保存到接收队列,等待交给对应的应用层进程。

/*
 * Per-socket TCP receive processing (excerpt: everything after the
 * ESTABLISHED fast path is elided in this listing).
 *
 * For a socket in TCP_ESTABLISHED state the cached receive dst is checked
 * and released if needed, the skb is passed to tcp_rcv_established(), and
 * 0 is returned.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        sk_mark_napi_id(sk, skb);
        if (dst) {
            // Release the cached dst when the incoming interface index differs
            // from the recorded one or dst->ops->check() returns 0/NULL.
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                !dst->ops->check(dst, 0)) {
                dst_release(dst);
                sk->sk_rx_dst = NULL;
            }
        }
        tcp_rcv_established(sk, skb);
        return 0;
    }
    
    // ...... (elided in this excerpt)
}

如上,tcp_v4_do_rcv函数对于已经建立的连接,会通过tcp_rcv_established将数据包加入到连接对应的struct sock的接收队列中,这样应用层进程通过read类的系统IO接口就可以接收到数据了。tcp_rcv_established有点复杂,打算将它单独拿出来分析。参考:大佬写的tcp_rcv_established函数分析

总结

到这,我们协议栈的收包任务就完成了。一共花了五篇博客的篇幅,显然这里基本没有什么报文处理的细节,就连sk_buff怎么用也没有介绍。只是简单地将协议栈的调用流程过了一遍,这样在后面分析细节时,能做到心中有数,知道当前的处理是位于协议栈的哪一块,在哪个路径下处理的。

打算将后续的内容分为如下几个大块:

  1. 网络编程接口函数和在内核的处理;
  2. 协议栈发包流程;
  3. 协议栈各层级的处理细节;

你可能感兴趣的:(5. 网络层和传输层收包处理)