NAT唯一五元组选取

使用iptables进行nat设置时，可以使用如下扩展选项：

# SNAT 源地址转换，用在 POSTROUTING、INPUT 链
--to-source [ipaddr[-ipaddr]][:port[-port]]
--random        # 映射到随机端口号
--random-fully  # 映射到随机端口号（PRNG 完全随机化）
--persistent    # 映射到固定地址

# DNAT 目的地址转换，用在 PREROUTING、OUTPUT 链
--to-destination [ipaddr[-ipaddr]][:port[-port]]
--random        # 映射到随机端口号
--persistent    # 映射到固定地址

在内核中有如下几个标志与上面的选项对应：

/* An IP address range was specified. */
#define NF_NAT_RANGE_MAP_IPS            (1 << 0)
/* An explicit port range was specified. */
#define NF_NAT_RANGE_PROTO_SPECIFIED        (1 << 1)
/* Randomized port: the starting offset is computed by the secure_port
 * function. Corresponds to --random. */
#define NF_NAT_RANGE_PROTO_RANDOM        (1 << 2)
/* Persistent mapping: the same client keeps the same translated address.
 * Corresponds to --persistent. */
#define NF_NAT_RANGE_PERSISTENT            (1 << 3)
/* Fully random port selection. Corresponds to --random-fully. */
#define NF_NAT_RANGE_PROTO_RANDOM_FULLY        (1 << 4)

// Some of the flags above may be combined.

// Either of the two randomization flags.
#define NF_NAT_RANGE_PROTO_RANDOM_ALL        \
    (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
// Mask covering every range flag defined above.
#define NF_NAT_RANGE_MASK                    \
    (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED |    \
     NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT |    \
     NF_NAT_RANGE_PROTO_RANDOM_FULLY)

构建nat信息

netfilter在两个地方会构建nat信息。一个是在命中nat规则后构建nat信息，另外一个是related（关联）连接会构建nat信息，在expect函数中。构建nat信息都是使用函数nf_nat_setup_info进行构建，两者的差异在于range参数：前者由iptables规则设置，后者由helper函数确定。nat会修改连接跟踪，但仅仅修改应答方向的五元组。

/* Set up the NAT mapping of a not-yet-confirmed conntrack entry.
 *
 * A unique tuple is picked for the requested manip type within @range; if
 * it differs from the current tuple, the conntrack's reply tuple is
 * rewritten and the matching IPS_SRC_NAT / IPS_DST_NAT status bit is set.
 *
 * @ct:        conntrack entry to set up
 * @range:     NAT range handed on to get_unique_tuple()
 * @maniptype: NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST (anything else warns)
 *
 * Returns NF_ACCEPT on success or when @ct is already confirmed, and
 * NF_DROP when NAT of this type was already initialized on @ct or the
 * seqadj extension could not be added.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
          const struct nf_nat_range *range,
          enum nf_nat_manip_type maniptype)
{
    struct net *net = nf_ct_net(ct);/* network namespace this conntrack belongs to */
    struct nf_conntrack_tuple curr_tuple, new_tuple;

    /* Can't setup nat info for confirmed ct. */
    /* A confirmed connection is left untouched. */
    if (nf_ct_is_confirmed(ct))
        return NF_ACCEPT;

    WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
        maniptype != NF_NAT_MANIP_DST);

    if (WARN_ON(nf_nat_initialized(ct, maniptype)))
        return NF_DROP;

    /* What we've got will look like inverse of reply. Normally
     * this is what is in the conntrack, except for prior
     * manipulations (future optimization: if num_manips == 0,
     * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
     * The inverted reply tuple serves as the current original-direction
     * tuple.
     */
    nf_ct_invert_tuplepr(&curr_tuple,
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
    /* Derive the post-NAT original-direction tuple from the current one. */
    get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
    /* The unique tuple, once inverted, becomes the conntrack's reply tuple. */
    /* The reply tuple only needs changing when the new tuple differs from
     * the current one. */
    if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        struct nf_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* Build the new reply-direction tuple from the new tuple. */
        nf_ct_invert_tuplepr(&reply, &new_tuple);
        /* Replace the reply-direction tuple. */
        nf_conntrack_alter_reply(ct, &reply);

        /* Non-atomic: we own this at the moment. */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
        else
            ct->status |= IPS_DST_NAT;
        /* A connection with a helper must also carry the seq-adj
         * extension; add it if it is missing and drop on failure. */
        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;
    }
    /* For source NAT, link the conntrack into the nf_nat_bysource hash
     * table (done whether or not the tuple changed). */
    /* find_appropriate_src() walks this table so that the same client
     * can be given the same translated source again. */
    if (maniptype == NF_NAT_MANIP_SRC) {
        unsigned int srchash;
        spinlock_t *lock;

        srchash = hash_by_src(net,
                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
        spin_lock_bh(lock);
        hlist_add_head_rcu(&ct->nat_bysource,
                   &nf_nat_bysource[srchash]);
        spin_unlock_bh(lock);
    }

    /* It's done. Mark NAT of this manip type as completed. */
    if (maniptype == NF_NAT_MANIP_DST)
        ct->status |= IPS_DST_NAT_DONE;
    else
        ct->status |= IPS_SRC_NAT_DONE;

    return NF_ACCEPT;
}

重点分析get_unique_tuple函数

nf_ct_invert_tuplepr(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 这条语句求出了curr_tuple。对于首包或者尚未经过nat的连接来说，其值就是请求方向的五元组，没有不同；对于此前已经做过一次nat的连接，应答方向的五元组已被修改，因此求出的curr_tuple是前一次nat之后的请求方向五元组，与原始五元组不同。

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. 
 *
 * @tuple:      output, the tuple that was selected
 * @orig_tuple: the current original-direction tuple
 * @range:      NAT range configured by the caller
 * @ct:         conntrack entry the tuple is being chosen for
 * @maniptype:  NAT manip type, determined by the hook point
 *
 * The whole selection runs inside an RCU read-side section.
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
         const struct nf_conntrack_tuple *orig_tuple,
         const struct nf_nat_range *range,
         struct nf_conn *ct,
         enum nf_nat_manip_type maniptype)
{
    const struct nf_conntrack_zone *zone;
    const struct nf_nat_l3proto *l3proto;
    const struct nf_nat_l4proto *l4proto;
    struct net *net = nf_ct_net(ct);

    zone = nf_ct_zone(ct);

    rcu_read_lock();
    l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
    l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
                    orig_tuple->dst.protonum);

    /* 1) If this srcip/proto/src-proto-part is currently mapped,
     * and that same mapping gives a unique tuple within the given
     * range, use that.
     *
     * This is only required for source (ie. NAT/masq) mappings.
     * So far, we don't do local source mappings, so multiple
     * manips not an issue.
     */
    if (maniptype == NF_NAT_MANIP_SRC && // case 1: source NAT with no randomization flag set
        !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* try the original tuple first */
        /* If the original tuple already lies inside the SNAT range and
        ** is not in use, take it unchanged (no translation needed). */
        if (in_range(l3proto, l4proto, orig_tuple, range)) {
            if (!nf_nat_used_tuple(orig_tuple, ct)) {
                *tuple = *orig_tuple;
                goto out;
            }/* already in use: fall through to the later steps */
            
        /* Original tuple is outside the range: look for an existing
         * mapping of the same source and reuse its translated source. */    
        } else if (find_appropriate_src(net, zone, l3proto, l4proto,
                        orig_tuple, tuple, range)) {
            pr_debug("get_unique_tuple: Found current src map\n");
            /* Done if the reused mapping yields an unused tuple. */
            if (!nf_nat_used_tuple(tuple, ct))
                goto out;
        }
    }

    /* 2) Select the least-used IP/proto combination in the given range */
    /* Reached for DNAT, or when the SNAT shortcut above found nothing;
     * start again from the original tuple and choose the IP. */
    *tuple = *orig_tuple;
    find_best_ips_proto(zone, tuple, range, ct, maniptype);

    /* 3) The per-protocol part of the manip is made to map into
     * the range to make a unique tuple.
     */

    /* Only bother mapping if it's not already in range and unique */
    /* Only applies when no randomization flag is set. */
    if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {// an explicit port range was given
            /* Keep the current port when it is inside the range AND
             * either the range is a single port or the tuple is unused. */
            if (l4proto->in_range(tuple, maniptype,
                          &range->min_proto,
                          &range->max_proto) &&
                (range->min_proto.all == range->max_proto.all ||
                 !nf_nat_used_tuple(tuple, ct)))
                goto out;
        } else if (!nf_nat_used_tuple(tuple, ct)) {// no port range given and the tuple is unused: take it as is
            goto out;
        }
    }

    /* Last change: get protocol to try to obtain unique tuple. */
    /* Let the L4 protocol pick a port that makes the tuple unique. */
    l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
    rcu_read_unlock();
}

find_appropriate_src

/* Only called for SRC manip.
 *
 * Walk the nf_nat_bysource bucket that @tuple hashes to, looking for an
 * existing entry with the same source (per same_src()), the same network
 * namespace and the same zone. For each such entry the inverted reply
 * tuple is written to @result with its destination replaced by
 * @tuple->dst; the first one that passes in_range() wins.
 *
 * Returns 1 when @result holds a usable mapping, 0 otherwise. Note that
 * @result may have been overwritten by a rejected candidate even when 0
 * is returned.
 */
static int
find_appropriate_src(struct net *net,
             const struct nf_conntrack_zone *zone,
             const struct nf_nat_l3proto *l3proto,
             const struct nf_nat_l4proto *l4proto,
             const struct nf_conntrack_tuple *tuple,
             struct nf_conntrack_tuple *result,
             const struct nf_nat_range *range)
{
    unsigned int bucket = hash_by_src(net, tuple);
    const struct nf_conn *cand;

    hlist_for_each_entry_rcu(cand, &nf_nat_bysource[bucket], nat_bysource) {
        /* Skip entries that are not for this source... */
        if (!same_src(cand, tuple))
            continue;
        /* ...or live in another network namespace... */
        if (!net_eq(net, nf_ct_net(cand)))
            continue;
        /* ...or belong to a different zone. */
        if (!nf_ct_zone_equal(cand, zone, IP_CT_DIR_ORIGINAL))
            continue;

        /* Copy source part from reply tuple: inverting the reply
         * tuple yields the translated source of this entry. */
        nf_ct_invert_tuplepr(result,
                     &cand->tuplehash[IP_CT_DIR_REPLY].tuple);
        /* Put back the destination of the tuple being translated. */
        result->dst = tuple->dst;

        /* Accept only a mapping inside the requested range;
         * otherwise keep scanning the bucket. */
        if (in_range(l3proto, l4proto, result, range))
            return 1;
    }

    return 0;
}

find_best_ips_proto

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 *
 * In this implementation the address is not chosen by tracking usage:
 * it is derived from a hash and scaled into [min_addr, max_addr].
 * The source or destination address of @tuple is rewritten in place,
 * depending on @maniptype. Nothing is done unless NF_NAT_RANGE_MAP_IPS
 * is set in @range.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
            struct nf_conntrack_tuple *tuple,
            const struct nf_nat_range *range,
            const struct nf_conn *ct,
            enum nf_nat_manip_type maniptype)
{
    union nf_inet_addr *var_ipp;
    unsigned int i, max;
    /* Host order */
    u32 minip, maxip, j, dist;
    bool full_range;

    /* No IP mapping?  Do nothing. */
    if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
        return;

    if (maniptype == NF_NAT_MANIP_SRC)/* point at the address this manip type rewrites */
        var_ipp = &tuple->src.u3;
    else
        var_ipp = &tuple->dst.u3;

    /* Fast path: only one choice. The range holds a single address, so
     * use it directly. */
    if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
        *var_ipp = range->min_addr;
        return;
    }
    // Index of the last 32-bit word of the address within the word array.
    if (nf_ct_l3num(ct) == NFPROTO_IPV4)
        max = sizeof(var_ipp->ip) / sizeof(u32) - 1;// expected to be 0
    else
        max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;// expected to be 3

    /* Hashing source and destination IPs gives a fairly even
     * spread in practice (if there are a small number of IPs
     * involved, there usually aren't that many connections
     * anyway).  The consistency means that servers see the same
     * client coming from the same IP (some Internet Banking sites
     * like this), even across reboots.
     * With NF_NAT_RANGE_PERSISTENT the hash is seeded with 0 instead of
     * (last destination word ^ zone id), and the destination is not mixed
     * in below either, so only the source address feeds the hash and a
     * given client always gets the same value.
     */
    j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
           range->flags & NF_NAT_RANGE_PERSISTENT ?
            0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);
    // Scale the hash into the allowed range, one 32-bit word at a time.
    full_range = false;
    for (i = 0; i <= max; i++) {
        /* If first bytes of the address are at the maximum, use the
         * distance. Otherwise use the full range.
         */
        if (!full_range) {
            minip = ntohl((__force __be32)range->min_addr.all[i]);
            maxip = ntohl((__force __be32)range->max_addr.all[i]);
            dist  = maxip - minip + 1;
        } else {
            minip = 0;
            dist  = ~0;
        }

        var_ipp->all[i] = (__force __u32)
            htonl(minip + reciprocal_scale(j, dist));
        /* Once a word differs from max_addr's word, the remaining words
         * are no longer bounded by min/max and use the full range. */
        if (var_ipp->all[i] != range->max_addr.all[i])
            full_range = true;

        /* Fold the destination word into the hash for the next round,
         * unless the mapping must be persistent. */
        if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
            j ^= (__force u32)tuple->dst.u3.all[i];
    }
}

l4proto->unique_tuple

l4proto->unique_tuple的实现（例如TCP、UDP）最终都会调用nf_nat_l4proto_unique_tuple，并额外传入各协议自己维护的rover参数（记录上一次选用的端口偏移），因此下面的函数比上面的调用多一个参数。

/*
 * Pick a port (source port for SNAT, destination port for DNAT) that
 * makes @tuple unique, rewriting it in place.
 *
 * Without an explicit port range, DNAT leaves the destination port
 * untouched, while SNAT may change the source port within a class that
 * depends on the current port:
 *   - below 512:   1-511
 *   - 512-1023:    600-1023
 *   - 1024 and up: 1024-65535
 * With an explicit range, that range is used (min/max are swapped if
 * they are reversed).
 *
 * The starting offset is taken from:
 *   - the L3 secure_port hook for NF_NAT_RANGE_PROTO_RANDOM,
 *   - a pseudo-random number for NF_NAT_RANGE_PROTO_RANDOM_FULLY,
 *   - *@rover otherwise.
 * The candidate port is min + (offset modulo range size); while the
 * tuple is in use the offset is incremented and the check repeated.
 *
 * @rover: read as the starting offset and written back with the final
 *         offset, but only when no randomization flag is set.
 */
void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
                 struct nf_conntrack_tuple *tuple,
                 const struct nf_nat_range *range,
                 enum nf_nat_manip_type maniptype,
                 const struct nf_conn *ct,
                 u16 *rover)
{
    unsigned int range_size, min, max, i;
    __be16 *portptr;
    u_int16_t off;

    if (maniptype == NF_NAT_MANIP_SRC)
        portptr = &tuple->src.u.all;
    else
        portptr = &tuple->dst.u.all;

    /* If no range specified... */
    if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {/* no explicit port range */
        /* If it's dst rewrite, can't change port */
        if (maniptype == NF_NAT_MANIP_DST)
            return;
        /* A port below 1024 must stay below 1024 after translation. */
        if (ntohs(*portptr) < 1024) {
            /* Loose convention: >> 512 is credential passing */
            /* Below 512: choose within 1-511. */
            if (ntohs(*portptr) < 512) {
                min = 1;
                range_size = 511 - min + 1;
            } else {
                /* 512-1023: choose within 600-1023. */
                min = 600;
                range_size = 1023 - min + 1;
            }
        } else {// 1024 and above: choose within 1024-65535
            min = 1024;
            range_size = 65535 - 1024 + 1;
        }
    } else {// an explicit port range was given
        min = ntohs(range->min_proto.all);
        max = ntohs(range->max_proto.all);
        if (unlikely(max < min))
            swap(max, min);
        range_size = max - min + 1;
    }

    /* Choose the starting offset; off is 16 bits wide, so wider values
     * are truncated on assignment. */
    if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
        off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC
                          ? tuple->dst.u.all
                          : tuple->src.u.all);
    } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
        off = prandom_u32();
    } else {
        off = *rover;
    }

    for (i = 0; ; ++off) {
        *portptr = htons(min + off % range_size);
        /* Retry with the next offset while the tuple is in use. The ++i
        ** test comes first, so after range_size attempts the in-use check
        ** is skipped and the last candidate is kept even if it is not
        ** unique. NOTE(review): such a duplicate is presumably rejected
        ** later when the conntrack is confirmed - verify against caller. */
        if (++i != range_size && nf_nat_used_tuple(tuple, ct))
            continue;
        /* Without randomization, remember the offset that was used. */
        if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
            *rover = off;
        return;
    }
}