【内核IPSec代码分析2】报文转发处理过程

注:本人使用的LINUX内核版本是2.6.35。

端到端或点到端的IPSec VPN会使用到IPv4报文的转发处理过程。LINUX系统作为IPSec VPN安全网关,网关内侧网络的一台终端发送IPv4报文到网关外侧网络,报文需经网关路由转发出去,目的可能为另一台安全网关或另一台终端。

IPv4报文转发处理是由ip_forward函数完成的。下面简单介绍一下儿ip_forward是如何被调用的。
net/ipv4/ip_input.c: ip_rcv()是处理接收的ipv4报文的入口函数
ip_rcv():
-> 调用NF_INET_PRE_ROUTING Netfilter回调,然后调用ip_rcv_finish
ip_rcv_finish:
-> 调用ip_route_input_noref查询路由
-> 调用ip_route_input_slow
-> 根据源和目的ip,调用fib_lookup查询FIB表
-> 如果为广播ip,走广播处理,如果为本地ip,走本地处理:将dst->input设为ip_local_deliver;如果都不是,则调用ip_mkroute_input,将dst->input设为ip_forward,dst->output设为ip_output
-> 调用dst_input(),即dst->input(),所以转发就会调用到ip_forward

ip_forward:
-> 调用xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)
-> 调用xfrm4_route_forward(skb)
-> 调用xfrm_route_forward()
-> 调用__xfrm_route_forward()
-> 调用NF_INET_FORWARD Netfilter,然后调用ip_forward_finish

/*
 * Forwarding-path handler for an IPv4 packet.
 *
 * Sanity-checks the packet, verifies the IPsec forward policy, resolves the
 * IPsec (xfrm) route, enforces strict source routing and the path MTU,
 * decrements the TTL and finally runs the NF_INET_FORWARD netfilter hook
 * with ip_forward_finish() as the continuation.
 *
 * Returns NET_RX_SUCCESS when the router-alert chain consumed the packet,
 * NET_RX_DROP after freeing the skb on any failure, and otherwise the
 * result of NF_HOOK().
 */
int ip_forward(struct sk_buff *skb)
{
    u32 mtu;
    struct iphdr *iph;  /* Our header */
    struct rtable *rt;  /* Route we use */
    struct ip_options *opt  = &(IPCB(skb)->opt);

    /* that should never happen */
    if (skb->pkt_type != PACKET_HOST)
        goto drop;

    /* A packet that already has a socket attached is not forwarded. */
    if (unlikely(skb->sk))
        goto drop;

    if (skb_warn_if_lro(skb))
        goto drop;

    /* Check the IPsec forward policy (XFRM_POLICY_FWD) for this packet. */
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
        goto drop;

    /* Router-alert packets may be consumed by ip_call_ra_chain(). */
    if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
        return NET_RX_SUCCESS;

    skb_forward_csum(skb);

    /*
     *  According to the RFC, we must first decrease the TTL field. If
     *  that reaches zero, we must reply an ICMP control message telling
     *  that the packet's lifetime expired.
     */
    if (ip_hdr(skb)->ttl <= 1)
        goto too_many_hops;

    /*
     * Look up the IPsec (xfrm) route. On the slow path,
     * __xfrm_route_forward() stores the result with skb_dst_set(), so the
     * skb's dst may differ afterwards.
     */
    if (!xfrm4_route_forward(skb))
        goto drop;

    /* Read the route only now, after the xfrm lookup may have replaced it. */
    rt = skb_rtable(skb);

    if (opt->is_strictroute && rt->rt_uses_gateway)
        goto sr_failed;

    IPCB(skb)->flags |= IPSKB_FORWARDED;
    mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
    if (ip_exceeds_mtu(skb, mtu)) {
        /* Too big for the route: count it and tell the sender the usable MTU. */
        IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
              htonl(mtu));
        goto drop;
    }

    /* We are about to mangle packet. Copy it! */
    if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
        goto drop;
    /* Re-read the header pointer after skb_cow(); presumably the data may have moved. */
    iph = ip_hdr(skb);

    /* Decrease ttl after skb cow done */
    ip_decrease_ttl(iph);

    /*
     *  We now generate an ICMP HOST REDIRECT giving the route
     *  we calculated. Skipped for source-routed packets and for packets
     *  that carry a sec_path.
     */
    if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
        !skb_sec_path(skb))
        ip_rt_send_redirect(skb);

    skb->priority = rt_tos2priority(iph->tos);

    return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
               skb->dev, rt->dst.dev, ip_forward_finish);

sr_failed:
    /*
     *  Strict routing permits no gatewaying
     */
     icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
     goto drop;

too_many_hops:
    /* Tell the sender its packet died... */
    IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
    icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
    /* fall through: the packet is freed below */
drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}

ip_forward_finish:
-> 调用dst_output_sk
-> 调用dst->output即安全路由的xfrm4_output,详见《发送过程》

/*
 * Continuation of ip_forward(), handed to NF_HOOK() for NF_INET_FORWARD.
 *
 * Accounts the packet in the forwarding counters, processes IP options
 * when the packet carries any, and passes the packet on through
 * dst_output_sk().
 *
 * Returns whatever dst_output_sk() returns.
 */
static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
{
    struct net *net = dev_net(skb_dst(skb)->dev);

    IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
    IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len);

    /* Only packets that actually carry IP options need option processing. */
    if (unlikely(IPCB(skb)->opt.optlen))
        ip_forward_options(skb);

    skb_sender_cpu_clear(skb);
    return dst_output_sk(sk, skb);
}

__xfrm_route_forward():
-> 调用xfrm_decode_session
-> 调用__xfrm_decode_session,调用(struct xfrm_policy_afinfo *) afinfo->decode_session,即net/ipv4/xfrm4_policy.c:_decode_session4(),初始化并解析报文中的一些信息到struct flowi fl(包括mark,sport,dport,proto,spi,saddr,daddr,tos)
-> 这时skb的dst为原始路由,调用skb_dst_force强制使原始路由至少被引用一次
-> 调用xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE)查询IPSec 策略获得新的安全路由(struct dst_entry *)dst
-> 调用skb_dst_set()设置skb的dst为新的安全路由

/*
 * Fast-path wrapper around __xfrm_route_forward().
 *
 * Returns 1 without any lookup when the namespace has no outbound policies
 * or when the packet's dst is flagged DST_NOXFRM; otherwise returns 1 if
 * __xfrm_route_forward() reports success and 0 if it reports failure.
 */
static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
    struct net *netns = dev_net(skb->dev);

    /* No outbound policy configured at all: nothing to look up. */
    if (!netns->xfrm.policy_count[XFRM_POLICY_OUT])
        return 1;

    /* The current route is exempt from xfrm processing. */
    if (skb_dst(skb)->flags & DST_NOXFRM)
        return 1;

    return __xfrm_route_forward(skb, family) != 0;
}

/* IPv4 entry point: runs the forward-path xfrm route lookup with AF_INET. */
static inline int xfrm4_route_forward(struct sk_buff *skb)
{
    const unsigned short family = AF_INET;

    return xfrm_route_forward(skb, family);
}

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
    struct net *net = dev_net(skb->dev);
    struct flowi fl;
    struct dst_entry *dst;
    int res = 1;

    if (xfrm_decode_session(skb, &fl, family) < 0) {
        XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
        return 0;
    }

    skb_dst_force(skb);

    dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
    if (IS_ERR(dst)) {
        res = 0;
        dst = NULL;
    }
    skb_dst_set(skb, dst);
    return res;
}

xfrm_lookup(
struct net *net,
struct dst_entry *dst_orig, //dst_orig不为空,为原始路由dst
const struct flowi *fl, //xfrm_decode_session 得到的流信息
struct sock *sk,
int flags):
-> 如果sk不为空且sk->sk_policy[XFRM_POLICY_OUT]不为空(可以通过socket的set option设置策略),则调用xfrm_sk_policy_lookup从socket的policy中查找匹配流信息fl的策略,然后调用xfrm_expand_policies,最后调用xfrm_resolve_and_create_bundle从已建立好的IPSec SA状态中查找匹配策略的SA状态
-> 如果sk为空或sk->sk_policy[XFRM_POLICY_OUT]为空(可能还有其他情况),则调用flow_cache_lookup(net, fl, family, dir, resolver, &xflo),其中xflo.dst_orig为原始路由dst_entry,flow_cache_lookup返回的是struct flow_cache_object结构体指针,这个结构体类似一个基类,它有两个派生类,一个是struct xfrm_dst,一个是struct xfrm_policy,这个结构体指针会被保存在struct flow_cache_entry中,具体是保存的哪种取决于dir和resolver函数。这里是OUT方向和xfrm_bundle_lookup,所以返回的是struct xfrm_dst类型。另一个使用flow_cache_lookup的地方是在xfrm_policy_check,那里返回则是struct xfrm_policy类型,这里就不多说了。如果获得flo成功,则返回它对应的struct xfrm_dst的struct dst_entry。这两个类型也是基类和派生类的关系,如果是原始路由则派生类是struct rtable,如果是安全路由则派生类是struct xfrm_dst。

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 *
 * @net:      network namespace whose xfrm policy counters and sysctls are used
 * @dst_orig: the original (plain) route; its ops->family selects the family
 * @fl:       flow description to match policies against
 * @sk:       optional socket; may be NULL. When it has an outbound policy,
 *            that policy is tried before the flow cache.
 * @flags:    XFRM_LOOKUP_* bits. XFRM_LOOKUP_ICMP and
 *            XFRM_LOOKUP_KEEP_DST_REF are tested here; the full value is
 *            also passed to the bundle resolver through struct xfrm_flo.
 *
 * Returns dst_orig when no policy applies or the flow passes untransformed,
 * the bundle's dst_entry when transforms apply, or ERR_PTR() on failure:
 * -EREMOTE or -EAGAIN when the needed states are not available yet, -EPERM
 * when the policy blocks the flow, -ENOENT for ICMP lookups without a
 * matching policy, or an error propagated from a helper. On the error paths
 * dst_orig is released unless XFRM_LOOKUP_KEEP_DST_REF is set.
 */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                  const struct flowi *fl,
                  struct sock *sk, int flags)
{
    struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
    struct flow_cache_object *flo;
    struct xfrm_dst *xdst;
    struct dst_entry *dst, *route;
    u16 family = dst_orig->ops->family;
    u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
    /* drop_pols: number of entries in pols[] to put on exit. */
    int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

    dst = NULL;
    xdst = NULL;
    route = NULL;

    /* Step 1: a per-socket outbound policy, if present, is tried first. */
    if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
        num_pols = 1;
        pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
        err = xfrm_expand_policies(fl, family, pols,
                       &num_pols, &num_xfrms);
        if (err < 0)
            goto dropdst;

        if (num_pols) {
            if (num_xfrms <= 0) {
                /* No transforms to apply: skip bundle creation. */
                drop_pols = num_pols;
                goto no_transform;
            }

            xdst = xfrm_resolve_and_create_bundle(
                    pols, num_pols, fl,
                    family, dst_orig);
            if (IS_ERR(xdst)) {
                xfrm_pols_put(pols, num_pols);
                err = PTR_ERR(xdst);
                goto dropdst;
            } else if (xdst == NULL) {
                /* No bundle was produced: treat the flow as untransformed. */
                num_xfrms = 0;
                drop_pols = num_pols;
                goto no_transform;
            }

            dst_hold(&xdst->u.dst);
            xdst->u.dst.flags |= DST_NOCACHE;
            route = xdst->route;
        }
    }

    /* Step 2: no socket-policy bundle, so consult the flow cache. */
    if (xdst == NULL) {
        struct xfrm_flo xflo;

        xflo.dst_orig = dst_orig;
        xflo.flags = flags;

        /* To accelerate a bit...  */
        if ((dst_orig->flags & DST_NOXFRM) ||
            !net->xfrm.policy_count[XFRM_POLICY_OUT])
            goto nopol;

        flo = flow_cache_lookup(net, fl, family, dir,
                    xfrm_bundle_lookup, &xflo);
        if (flo == NULL)
            goto nopol;
        if (IS_ERR(flo)) {
            err = PTR_ERR(flo);
            goto dropdst;
        }
        /* The cache object is embedded in struct xfrm_dst as member 'flo'. */
        xdst = container_of(flo, struct xfrm_dst, flo);

        /* Work on a local copy of the bundle's policy pointers. */
        num_pols = xdst->num_pols;
        num_xfrms = xdst->num_xfrms;
        memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
        route = xdst->route;
    }

    dst = &xdst->u.dst;
    if (route == NULL && num_xfrms > 0) {
        /* The only case when xfrm_bundle_lookup() returns a
         * bundle with null route, is when the template could
         * not be resolved. It means policies are there, but
         * bundle could not be created, since we don't yet
         * have the xfrm_state's. We need to wait for KM to
         * negotiate new SA's or bail out with error.*/
        if (net->xfrm.sysctl_larval_drop) {
            XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
            err = -EREMOTE;
            goto error;
        }

        err = -EAGAIN;

        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
        goto error;
    }

    /*
     * Reached by fall-through with dst pointing into xdst, or by goto from
     * the socket-policy branch, in which case dst is still NULL.
     */
no_transform:
    if (num_pols == 0)
        goto nopol;

    /* ICMP lookups only accept policies that carry XFRM_POLICY_ICMP. */
    if ((flags & XFRM_LOOKUP_ICMP) &&
        !(pols[0]->flags & XFRM_POLICY_ICMP)) {
        err = -ENOENT;
        goto error;
    }

    /* Record the time of use on every matched policy. */
    for (i = 0; i < num_pols; i++)
        pols[i]->curlft.use_time = get_seconds();

    if (num_xfrms < 0) {
        /* Prohibit the flow */
        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
        err = -EPERM;
        goto error;
    } else if (num_xfrms > 0) {
        /* Flow transformed */
        dst_release(dst_orig);
    } else {
        /* Flow passes untransformed */
        dst_release(dst);
        dst = dst_orig;
    }
ok:
    xfrm_pols_put(pols, drop_pols);
    /* Flag the returned dst when its xfrm state is in tunnel mode. */
    if (dst && dst->xfrm &&
        dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
        dst->flags |= DST_XFRM_TUNNEL;
    return dst;

nopol:
    /* No policy: hand back the original route unless this is an ICMP lookup. */
    if (!(flags & XFRM_LOOKUP_ICMP)) {
        dst = dst_orig;
        goto ok;
    }
    err = -ENOENT;
error:
    dst_release(dst);
dropdst:
    /* The caller keeps its dst_orig reference only if it asked to. */
    if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
        dst_release(dst_orig);
    xfrm_pols_put(pols, drop_pols);
    return ERR_PTR(err);
}

下面是最重要的一个函数了,为了加快IPSec转发处理的速度,内核将以前已经匹配到策略的流(由selector选择)通过flow_cache_entry缓存了起来,将它匹配到的策略和查找到的SA状态保存在这个结构体中,这样下次再转发这个流的报文时直接从缓存中查找。

下面还需整理:
flow_cache_lookup(net, fl, family, dir, resolver, dst_orig):
-> 根据flowi key在flow cache hash table中查找,如果没有找到,则创建fle(flow cache entry),并赋值family,dir,key(是从__xfrm_lookup传过来的fl),并调用resolver(xfrm_bundle_lookup),创建fle->object,
回调xfrm_bundle_lookup(net, fl, family, dir, oldflo, dst_orig)
struct flow_cache_object flo是struct xfrm_dst的成员,可以通过container_of获得struct xfrm_dst的指针 xdst。
如果oldflo不为空,则用container_of取得xdst,检查xdst中的policies是否有用。
如果oldflo为空(可能还有其他情况),xdst为空,则
__xfrm_policy_lookup保存到pols
调用xfrm_policy_lookup_bytype在subtype和maintype中查找
从net->xfrm.policy_bydst[dir].table哈希表中查找,从net->xfrm.policy_inexact[dir]中查找,选出两个中priority最大的policy存到pol[0]。
调用xfrm_expand_policies,如果为subtype找到的policy,则从maintype中再找,如果找到则存到pol[1]
得到总共的pol_nr和pols的xfrm_nr。

xfrm_resolve_and_create_bundle保存到new_xdst
1、调用xfrm_tmpl_resolve – 遍历policy调用xfrm_tmpl_resolve_one
根据policy->xfrm_nr遍历policy的xfrm_vec,struct xfrm_tmpl
调用xfrm_state_find根据family,reqid,mark,dst_addr,src_addr,mode, proto,spid从net->xfrm.state_bydst哈希表中查找xfrm_state,先使用src_addr和dst_addr查,如果没找到则用dst_addr查,如果还没找到,则。。。创建xfrm_state
如果policy个数有多个,则调用xfrm_state_sort选择一个
2、调用xfrm_bundle_create()根据多个xfrm_state生成bundle
遍历xfrm_state
调用xfrm_alloc_dst分配xdst->调用dst_alloc(&net->xfrm.xfrm4_dst_ops)从mem_cache中分配struct xfrm_dst,并用struct dst_entry *dst对dst_entry进行初始化
如果不是传输模式,要使用隧道的目的地址重查路由,获得新的dst_orig,用于最后报文的发送
xfrm4_fill_dst
赋值dst1->output为xfrm_state的outer_mode->afinfo->output
将每一个xfrm_state的dst_entry串成一个双向链表,next指向上一级处理,child指向下一级处理,最后一级处理的child是dst_orig。

释放原xdst,返回&new_xdst->flo

你可能感兴趣的:(LINUX内核)