netfilter之conntrack连接跟踪

连接跟踪conntrack是状态防火墙和NAT的基础,每个经过conntrack处理的数据包的skb->nfctinfo都会设置如下值之一,后续流程中NAT模块根据此值做不同的处理,filter模块可以在扩展匹配中指定state进行不同的处理。

enum ip_conntrack_info {
    /* Part of an established connection (either direction). */
    //收到双向报文,连接已经建立,对original方向报文设置此标志
    IP_CT_ESTABLISHED,

    /* Like NEW, but related to an existing connection, or ICMP error
       (in either direction). */
    IP_CT_RELATED,

    /* Started a new connection to track (only
           IP_CT_DIR_ORIGINAL); may be a retransmission. */
    //收到original方向数据包,连接还未建立
    IP_CT_NEW,

    /* >= this indicates reply direction */
    //收到reply方向数据包,说明连接建立
    IP_CT_IS_REPLY,

    IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
    IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
    IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,   
    /* Number of distinct IP_CT types (no NEW in reply dirn). */
    IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
};

nf hook函数注册

跟连接跟踪相关的hook函数包含下面两个:重组相关的和conntrack处理相关的。

注册报文重组hook函数。

static struct nf_hook_ops ipv4_defrag_ops[] = {
    {
        .hook       = ipv4_conntrack_defrag,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_PRE_ROUTING,
        .priority   = NF_IP_PRI_CONNTRACK_DEFRAG,
    },
    {
        .hook           = ipv4_conntrack_defrag,
        .owner          = THIS_MODULE,
        .pf             = NFPROTO_IPV4,
        .hooknum        = NF_INET_LOCAL_OUT,
        .priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
    },
};
static int __init nf_defrag_init(void)
{
    return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
}

module_init(nf_defrag_init);

注册conntrack hook函数。

/* Connection tracking may drop packets, but never alters them, so
   make it the first hook. */
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
    {
        .hook       = ipv4_conntrack_in,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_PRE_ROUTING,
        .priority   = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook       = ipv4_conntrack_local,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_OUT,
        .priority   = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook       = ipv4_helper,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_POST_ROUTING,
        .priority   = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook       = ipv4_confirm,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_POST_ROUTING,
        .priority   = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
    {
        .hook       = ipv4_helper,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_IN,
        .priority   = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook       = ipv4_confirm,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_IN,
        .priority   = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
};
static int __init nf_conntrack_l3proto_ipv4_init(void)
{
    ...
    ret = nf_register_hooks(ipv4_conntrack_ops,
                ARRAY_SIZE(ipv4_conntrack_ops));
    ...
}

上面的hook函数会注册到二维数组nf_hook中。

enum nf_inet_hooks {
    NF_INET_PRE_ROUTING,
    NF_INET_LOCAL_IN,
    NF_INET_FORWARD,
    NF_INET_LOCAL_OUT,
    NF_INET_POST_ROUTING,
    NF_INET_NUMHOOKS
};
enum {
    NFPROTO_UNSPEC =  0,
    NFPROTO_INET   =  1,
    NFPROTO_IPV4   =  2,
    NFPROTO_ARP    =  3,
    NFPROTO_BRIDGE =  7,
    NFPROTO_IPV6   = 10,
    NFPROTO_DECNET = 12,
    NFPROTO_NUMPROTO,
};
extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];

int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
{
    unsigned int i;

    for (i = 0; i < n; i++) {
         nf_register_hook(®[i]);
    }
}

int nf_register_hook(struct nf_hook_ops *reg)
{
    struct nf_hook_ops *elem;

    mutex_lock(&nf_hook_mutex);
    list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
        if (reg->priority < elem->priority)
            break;
    }
    list_add_rcu(®->list, elem->list.prev);
    mutex_unlock(&nf_hook_mutex);
#ifdef HAVE_JUMP_LABEL
    static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
    return 0;
}

重组和conntrack hook注册成功后,nf_hook内容如下标黄,这也是ipv4的连接跟踪模块用到的hook函数,小括号中的数字是hook函数的优先级。在同一个hook点上,数字越小优先级越高。


image.png

发送给本机的数据会经过 NF_INET_PRE_ROUTING 和 NF_INET_LOCAL_IN 两个hook点,所以hook函数调用顺序为:
ipv4_conntrack_defrag -> ipv4_conntrack_in -> ipv4_helper -> ipv4_confirm。

由本机转发的数据会经过 NF_INET_PRE_ROUTING 和 NF_INET_FORWARD 和 NF_INET_POST_ROUTING 三个hook点,所以hook函数调用顺序为:
ipv4_conntrack_defrag -> ipv4_conntrack_in -> ipv4_helper -> ipv4_confirm。

本机发送的数据会经过 NF_INET_LOCAL_OUT 和NF_INET_POST_ROUTING 两个hook点,所以hook函数调用顺序为:
ipv4_conntrack_defrag -> ipv4_conntrack_local -> ipv4_helper -> ipv4_confirm。

可看到不管数据包从哪来到哪去,经过的连接跟踪模块处理基本是一样的,唯一的区别是ipv4_conntrack_in和ipv4_conntrack_local,后者增加了对数据包长度的校验,即只有从本机发出去的报文才需要校验长度。

static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
                      struct sk_buff *skb,
                      const struct net_device *in,
                      const struct net_device *out,
                      int (*okfn)(struct sk_buff *))
{
    return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
}

static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
                     struct sk_buff *skb,
                     const struct net_device *in,
                     const struct net_device *out,
                     int (*okfn)(struct sk_buff *))
{
    /* root is playing with raw sockets. */
    if (skb->len < sizeof(struct iphdr) ||
        ip_hdrlen(skb) < sizeof(struct iphdr))
        return NF_ACCEPT;
    return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
}

hook函数执行

下面分别分析这四个hook函数。

  1. ipv4_conntrack_defrag
    重组分片报文。重组完整前不让数据包进行下一步
static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
                      struct sk_buff *skb,
                      const struct net_device *in,
                      const struct net_device *out,
                      int (*okfn)(struct sk_buff *))
{
    struct sock *sk = skb->sk;
    struct inet_sock *inet = inet_sk(skb->sk);
    //对于PF_INET类型的socket,并且inet->nodefrag置位了,则
    //不允许重组,返回NF_ACCEPT
    if (sk && (sk->sk_family == PF_INET) &&
        inet->nodefrag)
        return NF_ACCEPT;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#if !IS_ENABLED(CONFIG_NF_NAT)
    /* Previously seen (loopback)?  Ignore.  Do this before
       fragment check. */
    //nfct不为空,并且没有IPS_TEMPLATE_BIT标志,说明此ct是
    //在raw表匹配到target为notrack的规则。
    if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
        return NF_ACCEPT;
#endif
#endif
    /* Gather fragments. */
    //如果是分片报文,只处理分片报文
    if (ip_is_fragment(ip_hdr(skb))) {
        //获取重组的user。user表示谁来执行重组,或者说在哪个
        //模块哪个阶段重组
        enum ip_defrag_users user =
            nf_ct_defrag_user(ops->hooknum, skb);
        //返回值为非零表示未完成重组(只收到第一片或者某几
        //片),需要将skb保存到队列,或者重组过程出错,此时
        //需要释放skb。
        if (nf_ct_ipv4_gather_frags(skb, user))
            return NF_STOLEN;
    }
    return NF_ACCEPT;
}
  1. nf_conntrack_in
    连接跟踪处理的主函数nf_conntrack_in,其会用到l3proto和l4proto函数,先看一下这两组函数的注册。

注册l3proto Ipv4的处理函数。

struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
    .l3proto     = PF_INET,
    .name        = "ipv4",
    .pkt_to_tuple    = ipv4_pkt_to_tuple, //获取源目的ip
    .invert_tuple    = ipv4_invert_tuple,  //源目的ip调换
    .print_tuple     = ipv4_print_tuple,  //打印出源目的ip
    .get_l4proto     = ipv4_get_l4proto, //获取ip报文总长度和四层协议号
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .tuple_to_nlattr = ipv4_tuple_to_nlattr,
    .nlattr_tuple_size = ipv4_nlattr_tuple_size,
    .nlattr_to_tuple = ipv4_nlattr_to_tuple,
    .nla_policy  = ipv4_nla_policy,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
    .ctl_table_path  = "net/ipv4/netfilter",
#endif
    .init_net    = ipv4_init_net,
    .me      = THIS_MODULE,
};
static int __init nf_conntrack_l3proto_ipv4_init(void)
    ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);

int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)
{
    int ret = 0;
    struct nf_conntrack_l3proto *old;

    if (proto->l3proto >= AF_MAX)
        return -EBUSY;

    if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size)
        return -EINVAL;

    mutex_lock(&nf_ct_proto_mutex);
    old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
                    lockdep_is_held(&nf_ct_proto_mutex));

    if (proto->nlattr_tuple_size)
        proto->nla_size = 3 * proto->nlattr_tuple_size();

    rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
}

注册l4proto处理函数

int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
    rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto);

//ipv4的l4注册了tcp,udp和icmp这三种协议
static int __init nf_conntrack_l3proto_ipv4_init(void)
    ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
    ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
    ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);

//以udp为例说明
struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
{
    .l3proto        = PF_INET,
    .l4proto        = IPPROTO_UDP,
    .name           = "udp",
    .pkt_to_tuple       = udp_pkt_to_tuple, //获取源目的port
    .invert_tuple       = udp_invert_tuple,//源目的port调换
    .print_tuple        = udp_print_tuple,//打印源目的port
    .packet         = udp_packet, //更新ct中定时器超时时间并更新统计计数
    .get_timeouts       = udp_get_timeouts, //获取ct超时时间
    .new            = udp_new, //创建新ct时调用
    .error          = udp_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .tuple_to_nlattr    = nf_ct_port_tuple_to_nlattr,
    .nlattr_to_tuple    = nf_ct_port_nlattr_to_tuple,
    .nlattr_tuple_size  = nf_ct_port_nlattr_tuple_size,
    .nla_policy     = nf_ct_port_nla_policy,
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
    .ctnl_timeout       = {
        .nlattr_to_obj  = udp_timeout_nlattr_to_obj,
        .obj_to_nlattr  = udp_timeout_obj_to_nlattr,
        .nlattr_max = CTA_TIMEOUT_UDP_MAX,
        .obj_size   = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
        .nla_policy = udp_timeout_nla_policy,
    },
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
    .init_net       = udp_init_net,
    .get_net_proto      = udp_get_net_proto,
};

nf_conntrack_in 会用到上面注册的l3proto和l4proto函数

unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
        struct sk_buff *skb)
{
    struct nf_conn *ct, *tmpl = NULL;
    enum ip_conntrack_info ctinfo;
    struct nf_conntrack_l3proto *l3proto;
    struct nf_conntrack_l4proto *l4proto;
    unsigned int *timeouts;
    unsigned int dataoff;
    u_int8_t protonum;
    int set_reply = 0;
    int ret;
    //如果skb已经有ct了,并且有template标志
    //IPS_TEMPLATE_BIT,说明报文在raw表经过了
    //notrack处理,不用记录在连接跟踪表,可以直接返回。
    //如果有template标志IPS_TEMPLATE_BIT,说明是helper相关 
    //的处理,保存tmpl,将skb->nfct置空,后面重新给它分配ct
    if (skb->nfct) {
        /* Previously seen (loopback or untracked)?  Ignore. */
        tmpl = (struct nf_conn *)skb->nfct;
        if (!nf_ct_is_template(tmpl)) {
            NF_CT_STAT_INC_ATOMIC(net, ignore);
            return NF_ACCEPT;
        }
        skb->nfct = NULL;
    }

    /* rcu_read_lock()ed by nf_hook_slow */
    //根据pf到nf_ct_l3protos获取l3proto
    //对于ipv4,l3proto为nf_conntrack_l3proto_ipv4,用于获取三层源目的ip。
    l3proto = __nf_ct_l3proto_find(pf);
    ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
                   &dataoff, &protonum);
    if (ret <= 0) {
        pr_debug("not prepared to track yet or error occurred\n");
        NF_CT_STAT_INC_ATOMIC(net, error);
        NF_CT_STAT_INC_ATOMIC(net, invalid);
        ret = -ret;
        goto out;
    }
    //根据pf和四层协议号到nf_ct_protos找l4proto。
    //对于udp协议来说,l4proto就是nf_conntrack_l4proto_udp4,
    //用于获取四层源目的端口号等信息。
    l4proto = __nf_ct_l4proto_find(pf, protonum);

    /* It may be an special packet, error, unclean...
     * inverse of the return code tells to the netfilter
     * core what to do with the packet. */
    if (l4proto->error != NULL) {
        ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
                     pf, hooknum);
        if (ret <= 0) {
            NF_CT_STAT_INC_ATOMIC(net, error);
            NF_CT_STAT_INC_ATOMIC(net, invalid);
            ret = -ret;
            goto out;
        }
        /* ICMP[v6] protocol trackers may assign one conntrack. */
        if (skb->nfct)
            goto out;
    }

    ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
                   l3proto, l4proto, &set_reply, &ctinfo);
    if (!ct) {
        /* Not valid part of a connection */
        NF_CT_STAT_INC_ATOMIC(net, invalid);
        ret = NF_ACCEPT;
        goto out;
    }

    if (IS_ERR(ct)) {
        /* Too stressed to deal. */
        NF_CT_STAT_INC_ATOMIC(net, drop);
        ret = NF_DROP;
        goto out;
    }

    NF_CT_ASSERT(skb->nfct);

    /* Decide what timeout policy we want to apply to this flow. */
    //不同的四层协议根据各自特点提供了不同的超时时间,udp提供如下两种
    //static unsigned int udp_timeouts[UDP_CT_MAX] = {
    //  [UDP_CT_UNREPLIED]  = 30*HZ,
    //  [UDP_CT_REPLIED]    = 180*HZ,
    //};
    timeouts = nf_ct_timeout_lookup(net, ct, l4proto);

    //对于udp来说,调用udp_packet->nf_ct_refresh_acct更新ct超
    //时时间,保证此条数据流不断,ct就不会被删除。并且更新统
    //计计数到acct中。
    ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
    if (ret <= 0) {
        /* Invalid: inverse of the return code tells
         * the netfilter core what to do */
        pr_debug("nf_conntrack_in: Can't track with proto module\n");
        nf_conntrack_put(skb->nfct);
        skb->nfct = NULL;
        NF_CT_STAT_INC_ATOMIC(net, invalid);
        if (ret == -NF_DROP)
            NF_CT_STAT_INC_ATOMIC(net, drop);
        ret = -ret;
        goto out;
    }

    //对于一个新创建的连接跟踪项后,当第一次收到reply方向的数
    //据包后,则会设置nf_conn->status的IPS_SEEN_REPLY_BIT
    //位为1,当设置成功且IPS_SEEN_REPLY_BIT位的原来值为0
    //时,则调用nf_conntrack_event_cache ,由nfnetlink模块处理
    //状态改变的事件。
    if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
        nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
    if (tmpl) {
        /* Special case: we have to repeat this hook, assign the
         * template again to this packet. We assume that this packet
         * has no conntrack assigned. This is used by nf_ct_tcp. */
        if (ret == NF_REPEAT)
            skb->nfct = (struct nf_conntrack *)tmpl;
        else
            nf_ct_put(tmpl);
    }

    return ret;
}
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
          struct sk_buff *skb,
          unsigned int dataoff,
          u_int16_t l3num,
          u_int8_t protonum,
          struct nf_conntrack_l3proto *l3proto,
          struct nf_conntrack_l4proto *l4proto,
          int *set_reply,
          enum ip_conntrack_info *ctinfo)
{
    struct nf_conntrack_tuple tuple;
    struct nf_conntrack_tuple_hash *h;
    struct nf_conn *ct;
    u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
    u32 hash;
    //获取五元组信息
    if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
                 dataoff, l3num, protonum, &tuple, l3proto,
                 l4proto)) {
        pr_debug("resolve_normal_ct: Can't get tuple\n");
        return NULL;
    }

    /* look for tuple match */
    hash = hash_conntrack_raw(&tuple, zone);
    //到全局confirm表net->ct.hash中查找是否已经存在此条流
    h = __nf_conntrack_find_get(net, zone, &tuple, hash);
    if (!h) {
        //如果查找不到,需要分配一个ct。根据tuple获取反方向的
        //reply tuple,将他俩赋值给ct的tuplehash。并将original
        //的tuplehash挂到 net.ct.pcpu_lists->unconfirmed表中
        h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
                   skb, dataoff, hash);
        if (!h)
            return NULL;
        if (IS_ERR(h))
            return (void *)h;
    }
    ct = nf_ct_tuplehash_to_ctrack(h);

    //如果是reply方向的数据包,设置 ctinfo = 
    //IP_CT_ESTABLISHED_REPLY,
    //如果是original方向的数据包,分为如下几种情况:
    //a. original方向第一个数据包,则设置ctinfo = IP_CT_NEW
    //b. original方向的非第一个数据包,并且已经收到reply的数据
    //包,则设置ctinfo = IP_CT_ESTABLISHED
    //c.original方向的数据包,并且是其他连接的期望连接,则设置ctinfo = IP_CT_RELATED
    //d. original方向的非第一个数据包,但是还没有收到reply包,也设置ctinfo = IP_CT_NEW
    /* It exists; we have (non-exclusive) reference. */
    if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
        *ctinfo = IP_CT_ESTABLISHED_REPLY;
        /* Please set reply bit if this packet OK */
        *set_reply = 1;
    } else {
        /* Once we've had two way comms, always ESTABLISHED. */
        if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
            pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
            *ctinfo = IP_CT_ESTABLISHED;
        } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
            pr_debug("nf_conntrack_in: related packet for %p\n",
                 ct);
            *ctinfo = IP_CT_RELATED;
        } else {
            pr_debug("nf_conntrack_in: new packet for %p\n", ct);
            *ctinfo = IP_CT_NEW;
        }
        *set_reply = 0;
    }
    //将ct和ctinfo保存到数据包
    skb->nfct = &ct->ct_general;
    skb->nfctinfo = *ctinfo;
    return ct;
}

//只有original方向的报文才会执行此函数
//分配ct
/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
           const struct nf_conntrack_tuple *tuple,
           struct nf_conntrack_l3proto *l3proto,
           struct nf_conntrack_l4proto *l4proto,
           struct sk_buff *skb,
           unsigned int dataoff, u32 hash)
{
    struct nf_conn *ct;
    struct nf_conn_help *help;
    struct nf_conntrack_tuple repl_tuple;
    struct nf_conntrack_ecache *ecache;
    struct nf_conntrack_expect *exp = NULL;
    u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
    struct nf_conn_timeout *timeout_ext;
    unsigned int *timeouts;
    //由tuple获取reply方向的tuple
    if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
        pr_debug("Can't invert tuple.\n");
        return NULL;
    }

    ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
                  hash);
    if (IS_ERR(ct))
        return (struct nf_conntrack_tuple_hash *)ct;

    if (tmpl && nfct_synproxy(tmpl)) {
        nfct_seqadj_ext_add(ct);
        nfct_synproxy_ext_add(ct);
    }

    timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
    if (timeout_ext)
        timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
    else
        timeouts = l4proto->get_timeouts(net);

    if (!l4proto->new(ct, skb, dataoff, timeouts)) {
        nf_conntrack_free(ct);
        pr_debug("init conntrack: can't track with proto module\n");
        return NULL;
    }

    if (timeout_ext)
        nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC);

    nf_ct_acct_ext_add(ct, GFP_ATOMIC);
    nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
    nf_ct_labels_ext_add(ct);

    ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
    nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
                 ecache ? ecache->expmask : 0,
                 GFP_ATOMIC);

    local_bh_disable();
    //如果有了期望连接,则需要到net->ct.expect_hash查找自己是
    //否是期望连接,如果是,需要设置 IPS_EXPECTED_BIT,并
    //将 ct->master 指向主连接
    if (net->ct.expect_count) {
        spin_lock(&nf_conntrack_expect_lock);
        exp = nf_ct_find_expectation(net, zone, tuple);
        if (exp) {
            pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
                 ct, exp);
            /* Welcome, Mr. Bond.  We've been expecting you... */
            __set_bit(IPS_EXPECTED_BIT, &ct->status);
            /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
            ct->master = exp->master;
            if (exp->helper) {
                help = nf_ct_helper_ext_add(ct, exp->helper,
                                GFP_ATOMIC);
                if (help)
                    rcu_assign_pointer(help->helper, exp->helper);
            }

#ifdef CONFIG_NF_CONNTRACK_MARK
            ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
            ct->secmark = exp->master->secmark;
#endif
            NF_CT_STAT_INC(net, expect_new);
        }
        spin_unlock(&nf_conntrack_expect_lock);
    }
    if (!exp) {
        __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
        NF_CT_STAT_INC(net, new);
    }

    /* Now it is inserted into the unconfirmed list, bump refcount */
    nf_conntrack_get(&ct->ct_general);
    //暂时将ct保存到本cpu的unconfirm链表中
    nf_ct_add_to_unconfirmed_list(ct);

    local_bh_enable();

    if (exp) {
        if (exp->expectfn)
            exp->expectfn(ct, exp);
        nf_ct_expect_put(exp);
    }

    return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}
  1. ipv4_help
    ipv4_help主要是执行匹配到的helper函数,进行一些扩展操作,比如ftp的数据通道建立,nat的转换。可参考ftp提供的helper:help
static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
                struct sk_buff *skb,
                const struct net_device *in,
                const struct net_device *out,
                int (*okfn)(struct sk_buff *))
{
    struct nf_conn *ct;
    enum ip_conntrack_info ctinfo;
    const struct nf_conn_help *help;
    const struct nf_conntrack_helper *helper;

    /* This is where we call the helper: as the packet goes out. */
    ct = nf_ct_get(skb, &ctinfo);
    if (!ct || ctinfo == IP_CT_RELATED_REPLY)
        return NF_ACCEPT;

    //从ct的扩展区域尝试获取helper
    help = nfct_help(ct);
    if (!help)
        return NF_ACCEPT;

    /* rcu_read_lock()ed by nf_hook_slow */
    helper = rcu_dereference(help->helper);
    if (!helper)
        return NF_ACCEPT;

    return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
                ct, ctinfo);
}
  1. ipv4_confirm
    ipv4_confirm是优先级最低的hook函数,数据包能走到这里就肯定不会被netfilter丢弃,所以可以将它的ct从uncomfirm(per-cpu)转到confirm(全局的)链表上。
static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
                 struct sk_buff *skb,
                 const struct net_device *in,
                 const struct net_device *out,
                 int (*okfn)(struct sk_buff *))
{
    struct nf_conn *ct;
    enum ip_conntrack_info ctinfo;

    ct = nf_ct_get(skb, &ctinfo);
    if (!ct || ctinfo == IP_CT_RELATED_REPLY)
        goto out;

    /* adjust seqs for loopback traffic only in outgoing direction */
    if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
        !nf_is_loopback_packet(skb)) {
        if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
            NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
            return NF_DROP;
        }
    }
out:
    /* We've seen it coming out the other side: confirm it */
    return nf_conntrack_confirm(skb);
}

/* Confirm a connection: returns NF_DROP if packet must be dropped. */
static inline int nf_conntrack_confirm(struct sk_buff *skb)
{
    struct nf_conn *ct = (struct nf_conn *)skb->nfct;
    int ret = NF_ACCEPT;

    if (ct && !nf_ct_is_untracked(ct)) {
        if (!nf_ct_is_confirmed(ct))
            //将 ct->tuplehash[IP_CT_DIR_ORIGINAL] 从
            //unconfirm hash链上删除,并将ct-
            //>tuplehash[IP_CT_DIR_ORIGINAL]
            //和ct->tuplehash[IP_CT_DIR_REPLY]根据hash同时
            //添加到全局confirm hash链上
            ret = __nf_conntrack_confirm(skb);
        if (likely(ret == NF_ACCEPT))
            //调用通知链上的函数通知netlink模块
            nf_ct_deliver_cached_events(ct);
    }
    return ret;
}

连接跟踪是个基础模块,总结如下图,其他利用它实现功能的模块在其他文章中记录。


image.png

你可能感兴趣的:(netfilter之conntrack连接跟踪)