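注:本人使用的LINUX内核版本是2.6.35。需要注意的是,下文贴出的函数源码使用了dst_output_sk、skb_sender_cpu_clear、ip_dst_mtu_maybe_forward等接口,取自更新的内核版本,与2.6.35的实现不完全相同,阅读时请注意区分。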
端到端或点到端的IPSec VPN会使用到IPv4报文的转发处理过程。LINUX系统作为IPSec VPN安全网关,网关内侧网络的一台终端发送IPv4报文到网关外侧网络,报文需经网关路由转发出去,目的可能为另一台安全网关或另一台终端。
IPv4报文转发处理是由ip_forward函数完成的。下面简单介绍一下ip_forward是如何被调用的。
net/ipv4/ip_input.c: ip_rcv()是处理接收的ipv4报文的入口函数
ip_rcv():
-> 调用NF_INET_PRE_ROUTING Netfilter回调,然后调用ip_rcv_finish
ip_rcv_finish:
-> 调用ip_route_input_noref查询路由
-> 调用ip_route_input_slow
-> 根据源和目的ip,调用fib_lookup查询FIB表
-> 如果为广播ip,走广播处理,如果为本地ip,走本地处理:将dst->input设为ip_local_deliver;如果都不是,则调用ip_mkroute_input,将dst->input设为ip_forward,dst->output设为ip_output
-> 调用dst_input(),即dst->input(),所以转发就会调用到ip_forward
ip_forward:
-> 调用xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)检查转发方向的IPSec策略
-> 调用xfrm4_route_forward(skb)
-> 调用xfrm_route_forward()
-> 调用__xfrm_route_forward()
-> 调用NF_INET_FORWARD Netfilter,然后调用ip_forward_finish
/*
 * ip_forward - IPv4 forwarding path for a received packet.
 *
 * Sequence, as implemented below:
 *   1. sanity checks (pkt_type, attached socket, LRO warning helper);
 *   2. XFRM forward-policy check;
 *   3. router-alert hand-off;
 *   4. TTL check (before any modification of the packet);
 *   5. xfrm4_route_forward(), after which the skb's route is re-read;
 *   6. strict-source-route and MTU checks;
 *   7. skb_cow(), TTL decrement, optional ICMP redirect;
 *   8. NF_INET_FORWARD hook with ip_forward_finish as continuation.
 *
 * Returns NET_RX_SUCCESS when a router-alert packet is taken by
 * ip_call_ra_chain(), NET_RX_DROP (after kfree_skb) on every rejection
 * path, otherwise whatever NF_HOOK() returns.
 */
int ip_forward(struct sk_buff *skb)
{
u32 mtu;
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
/* that should never happen */
if (skb->pkt_type != PACKET_HOST)
goto drop;
/* An skb that already has a socket attached is not forwarded. */
if (unlikely(skb->sk))
goto drop;
/* NOTE(review): helper not shown here; by its name it warns about and
 * rejects LRO-aggregated packets - confirm against its definition. */
if (skb_warn_if_lro(skb))
goto drop;
/* Check the IPsec forward (XFRM_POLICY_FWD) policy. */
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
goto drop;
/* Router-alert option present and the RA chain returned non-zero:
 * stop here and report success without forwarding. */
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
skb_forward_csum(skb);
/*
* According to the RFC, we must first decrease the TTL field. If
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
/* Look up the IPsec ("secure") route; this may replace the skb's dst. */
if (!xfrm4_route_forward(skb))
goto drop;
/* Read the route only now, after xfrm4_route_forward() has run. */
rt = skb_rtable(skb);
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
/* Too big for the route MTU: count a fragmentation failure and send
 * ICMP "fragmentation needed" carrying the MTU, then drop. */
if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
goto drop;
}
/* We are about to mangle packet. Copy it! */
/* Headroom requested: link-layer reserve of the output device plus the
 * dst's header_len. */
if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
/* Re-fetch the header pointer after skb_cow(). */
iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
/* Only when IPSKB_DOREDIRECT is set, there is no source-route option,
 * and the skb carries no sec path. */
if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
skb->dev, rt->dst.dev, ip_forward_finish);
sr_failed:
/*
* Strict routing permits no gatewaying
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
too_many_hops:
/* Tell the sender its packet died... */
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
/* Falls through into drop. */
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
ip_forward_finish:
-> 调用dst_output_sk
-> 调用dst->output即安全路由的xfrm4_output,详见《发送过程》
/*
 * ip_forward_finish - last step of forwarding, used as the continuation
 * of the NF_INET_FORWARD hook.
 *
 * Bumps the forwarded-datagram and out-octet counters for the namespace
 * of the dst's device, processes IP options when the packet has any,
 * and passes the packet to dst_output_sk(), whose result is returned.
 */
static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
{
	/* Accounting first: one forwarded datagram, skb->len octets. */
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

	/* Options are the uncommon case; skip the call when optlen is zero. */
	if (unlikely(IPCB(skb)->opt.optlen))
		ip_forward_options(skb);

	skb_sender_cpu_clear(skb);

	return dst_output_sk(sk, skb);
}
__xfrm_route_forward():
-> 调用xfrm_decode_session
-> 调用__xfrm_decode_session,调用(struct xfrm_policy_afinfo *) afinfo->decode_session,即net/ipv4/xfrm4_policy.c:_decode_session4(),初始化并解析报文中的一些信息到struct flowi fl(包括mark,sport,dport,proto,spi,saddr,daddr,tos)
-> 这时skb的dst为原始路由,调用skb_dst_force强制使原始路由至少被引用一次
-> 调用xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE)查询IPSec策略,获得新的安全路由(struct dst_entry *)dst
-> 调用skb_dst_set()设置skb的dst为新的安全路由
/*
 * xfrm_route_forward - decide whether a forwarded skb needs an XFRM
 * route lookup, and perform it if so.
 *
 * Returns 1 when no lookup is needed (no XFRM_POLICY_OUT policies are
 * counted for the namespace, or the skb's dst is flagged DST_NOXFRM) or
 * when __xfrm_route_forward() reports a non-zero result; 0 otherwise.
 */
static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
	struct net *net = dev_net(skb->dev);

	/* Fast path: no outbound policies are counted for this namespace. */
	if (!net->xfrm.policy_count[XFRM_POLICY_OUT])
		return 1;

	/* The current route is marked as exempt from XFRM processing. */
	if (skb_dst(skb)->flags & DST_NOXFRM)
		return 1;

	/* Slow path; normalise the result to 0/1. */
	return __xfrm_route_forward(skb, family) != 0;
}
/*
 * xfrm4_route_forward - IPv4 convenience wrapper: runs
 * xfrm_route_forward() with the address family fixed to AF_INET and
 * returns its result unchanged.
 */
static inline int xfrm4_route_forward(struct sk_buff *skb)
{
	const unsigned short family = AF_INET;

	return xfrm_route_forward(skb, family);
}
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
struct net *net = dev_net(skb->dev);
struct flowi fl;
struct dst_entry *dst;
int res = 1;
if (xfrm_decode_session(skb, &fl, family) < 0) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
return 0;
}
skb_dst_force(skb);
dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
if (IS_ERR(dst)) {
res = 0;
dst = NULL;
}
skb_dst_set(skb, dst);
return res;
}
xfrm_lookup(
struct net *net,
struct dst_entry **dst_p, //*dst_p不为空,为原始路由dst(下文贴出的代码中,该参数为struct dst_entry *dst_orig)
struct flowi *fl, //xfrm_decode_session 得到的流信息
struct sock *sk,
int flags):
-> 如果sk不为空且sk->sk_policy[XFRM_POLICY_OUT]不为空(可以通过socket的set option设置策略),则调用xfrm_sk_policy_lookup从socket的policy中查找匹配流信息fl的策略,然后调用xfrm_expand_policies,最后调用xfrm_resolve_and_create_bundle从已建立好的IPSec SA状态中查找匹配策略的SA状态
-> 如果sk为空或sk->sk_policy[XFRM_POLICY_OUT]为空(可能还有其他情况,总之上一步没有得到xdst),则调用flow_cache_lookup(net, fl, family, dir, resolver, dst_orig),dst_orig为原始路由dst_entry。flow_cache_lookup返回的是struct flow_cache_object结构体指针,这个结构体类似一个基类,它有两个派生类,一个是struct xfrm_dst,一个是struct xfrm_policy,这个结构体指针会被保存在struct flow_cache_entry中,具体保存的是哪种取决于dir和resolver函数。这里是OUT方向和xfrm_bundle_lookup,所以返回的是struct xfrm_dst类型。另一个使用flow_cache_lookup的地方是在xfrm_policy_check,那里返回的则是struct xfrm_policy类型,这里就不多说了。如果获得flo成功,则返回它对应的struct xfrm_dst中的struct dst_entry。struct dst_entry与包含它的结构体也是基类和派生类的关系:如果是原始路由,则派生类是struct rtable;如果是安全路由,则派生类是struct xfrm_dst。
/* Main function: finds/creates a bundle for given flow.
*
* At the moment we eat a raw IP route. Mostly to speed up lookups
* on interfaces with disabled IPsec.
*
* @net:      namespace whose policy counters, sysctl_larval_drop and
*            statistics are used
* @dst_orig: the original route; its ops->family selects the family
* @fl:       flow used for the policy/bundle lookup
* @sk:       may be NULL; when set and it has an XFRM_POLICY_OUT socket
*            policy, that policy is tried before the flow cache
* @flags:    lookup flags; XFRM_LOOKUP_ICMP and XFRM_LOOKUP_KEEP_DST_REF
*            are tested in this function, and the full value is passed
*            to the resolver through xflo.flags
*
* Returns:
*   - the bundle's dst when transforms apply (dst_orig is released);
*   - dst_orig itself when the flow passes untransformed or no policy
*     applies (and XFRM_LOOKUP_ICMP is not set);
*   - ERR_PTR(err) on failure. Errors set directly here are -EREMOTE,
*     -EAGAIN, -EPERM and -ENOENT; errors from xfrm_expand_policies(),
*     xfrm_resolve_and_create_bundle() and flow_cache_lookup() are
*     propagated. On every error path dst_orig is released unless
*     XFRM_LOOKUP_KEEP_DST_REF is set.
*/
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
const struct flowi *fl,
struct sock *sk, int flags)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct flow_cache_object *flo;
struct xfrm_dst *xdst;
struct dst_entry *dst, *route;
u16 family = dst_orig->ops->family;
u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
/* drop_pols: how many entries of pols[] this function must put at exit. */
int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
dst = NULL;
xdst = NULL;
route = NULL;
/* Stage 1: per-socket policy, if the caller supplied a socket that has one. */
if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
num_pols = 1;
pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
goto dropdst;
if (num_pols) {
/* Policy found but no transforms to apply (or a blocking
 * policy, num_xfrms < 0): skip bundle creation. */
if (num_xfrms <= 0) {
drop_pols = num_pols;
goto no_transform;
}
xdst = xfrm_resolve_and_create_bundle(
pols, num_pols, fl,
family, dst_orig);
if (IS_ERR(xdst)) {
/* Policies are put here, so drop_pols stays 0 for dropdst. */
xfrm_pols_put(pols, num_pols);
err = PTR_ERR(xdst);
goto dropdst;
} else if (xdst == NULL) {
/* No bundle: treat the flow as untransformed. */
num_xfrms = 0;
drop_pols = num_pols;
goto no_transform;
}
/* Take a reference on the new bundle and flag it DST_NOCACHE. */
dst_hold(&xdst->u.dst);
xdst->u.dst.flags |= DST_NOCACHE;
route = xdst->route;
}
}
/* Stage 2: nothing from the socket path - consult the flow cache. */
if (xdst == NULL) {
struct xfrm_flo xflo;
/* Context handed to the resolver (xfrm_bundle_lookup). */
xflo.dst_orig = dst_orig;
xflo.flags = flags;
/* To accelerate a bit... */
if ((dst_orig->flags & DST_NOXFRM) ||
!net->xfrm.policy_count[XFRM_POLICY_OUT])
goto nopol;
flo = flow_cache_lookup(net, fl, family, dir,
xfrm_bundle_lookup, &xflo);
if (flo == NULL)
goto nopol;
if (IS_ERR(flo)) {
err = PTR_ERR(flo);
goto dropdst;
}
/* The cache object is embedded in struct xfrm_dst as member 'flo'. */
xdst = container_of(flo, struct xfrm_dst, flo);
num_pols = xdst->num_pols;
num_xfrms = xdst->num_xfrms;
/* Copy the policy pointers out of the bundle; drop_pols stays 0 on
 * this path, so they are not put by this function. */
memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
route = xdst->route;
}
dst = &xdst->u.dst;
if (route == NULL && num_xfrms > 0) {
/* The only case when xfrm_bundle_lookup() returns a
* bundle with null route, is when the template could
* not be resolved. It means policies are there, but
* bundle could not be created, since we don't yet
* have the xfrm_state's. We need to wait for KM to
* negotiate new SA's or bail out with error.*/
/* Both branches below bump the same counter and fail; only the
 * error code differs (-EREMOTE with larval drop, else -EAGAIN). */
if (net->xfrm.sysctl_larval_drop) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
err = -EREMOTE;
goto error;
}
err = -EAGAIN;
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
goto error;
}
/* Reached by fall-through (dst points into xdst) or by goto from the
 * socket-policy stage (dst is still NULL in that case). */
no_transform:
if (num_pols == 0)
goto nopol;
/* ICMP lookups are only allowed when the first policy has XFRM_POLICY_ICMP. */
if ((flags & XFRM_LOOKUP_ICMP) &&
!(pols[0]->flags & XFRM_POLICY_ICMP)) {
err = -ENOENT;
goto error;
}
/* Stamp every matched policy with the current time. */
for (i = 0; i < num_pols; i++)
pols[i]->curlft.use_time = get_seconds();
if (num_xfrms < 0) {
/* Prohibit the flow */
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
err = -EPERM;
goto error;
} else if (num_xfrms > 0) {
/* Flow transformed */
dst_release(dst_orig);
} else {
/* Flow passes untransformed */
/* NOTE(review): dst may be NULL here when arriving via
 * goto no_transform; presumably dst_release() accepts NULL - confirm. */
dst_release(dst);
dst = dst_orig;
}
ok:
xfrm_pols_put(pols, drop_pols);
/* Mark the returned dst when its xfrm state is in tunnel mode. */
if (dst && dst->xfrm &&
dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
dst->flags |= DST_XFRM_TUNNEL;
return dst;
nopol:
/* No policy applies: hand back the original route, unless this is an
 * ICMP lookup, which fails with -ENOENT instead. */
if (!(flags & XFRM_LOOKUP_ICMP)) {
dst = dst_orig;
goto ok;
}
err = -ENOENT;
error:
dst_release(dst);
dropdst:
if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
dst_release(dst_orig);
xfrm_pols_put(pols, drop_pols);
return ERR_PTR(err);
}
下面是最重要的一个函数了,为了加快IPSec转发处理的速度,内核将以前已经匹配到策略的流(由selector选择)通过flow_cache_entry缓存了起来,将它匹配到的策略和查找到的SA状态保存在这个结构体中,这样下次再转发这个流的报文时直接从缓存中查找。
下面还需整理:
flow_cache_lookup(net, fl, family, dir, resolver, dst_orig):
-> 根据flowi key在flow cache hash table中查找,如果没有找到,则创建fle(flow cache entry),并赋值family,dir,key(是从__xfrm_lookup传过来的fl),并调用resolver(xfrm_bundle_lookup),创建fle->object。
回调xfrm_bundle_lookup(net, fl, family, dir, oldflo, dst_orig)
struct flow_cache_object flo是struct xfrm_dst的成员,可以通过container_of获得struct xfrm_dst的指针xdst。
如果oldflo不为空,则用container_of取得xdst,检查xdst中的policies是否仍然有效。
如果oldflo为空(可能还有其他情况),xdst为空,则
__xfrm_policy_lookup保存到pols
调用xfrm_policy_lookup_bytype在subtype和maintype中查找
从net->xfrm.policy_bydst[dir].table哈希表中查找,从net->xfrm.policy_inexact[dir]中查找,选出两个中priority最大的policy存到pol[0]。
调用xfrm_expand_policies,如果为subtype找到的policy,则从maintype中再找,如果找到则存到pol[1]
得到总共的pol_nr和pols的xfrm_nr。
调用xfrm_resolve_and_create_bundle,结果保存到new_xdst
1、调用xfrm_tmpl_resolve – 遍历policy调用xfrm_tmpl_resolve_one
根据policy->xfrm_nr遍历policy的xfrm_vec,struct xfrm_tmpl
调用xfrm_state_find根据family,reqid,mark,dst_addr,src_addr,mode,proto,spi从net->xfrm.state_bydst哈希表中查找xfrm_state,先使用src_addr和dst_addr查,如果没找到则用dst_addr查,如果还没找到,则创建一个临时的(larval状态的)xfrm_state,并通知密钥管理程序(KM)协商新的SA
如果policy有多个,则调用xfrm_state_sort对查找到的xfrm_state进行排序
2、调用xfrm_bundle_create()根据多个xfrm_state生成bundle
遍历xfrm_state
调用xfrm_alloc_dst分配xdst->调用dst_alloc(&net->xfrm.xfrm4_dst_ops)从mem_cache中分配struct xfrm_dst,并用struct dst_entry *dst对dst_entry进行初始化
如果不是传输模式,要使用隧道的目的地址重查路由,获得新的dst_orig,用于最后报文的发送
调用xfrm4_fill_dst
赋值dst1->output为xfrm_state的outer_mode->afinfo->output
将每一个xfrm_state的dst_entry串成一个双向链表,next指向上一级处理,child指向下一级处理,最后一级处理的child是dst_orig。
释放原xdst,返回&new_xdst->flo