上一节我们讲了NAT是基于链接跟踪实现的:当一条链接跟踪建立后,要改变它reply方向的tuple才能做NAT,链接跟踪的NAT由函数nf_nat_setup_info实现。
1、nf_nat_setup_info
nf_nat_setup_info对链接跟踪做NAT,只会改变链接跟踪reply方向的ip、端口,不会改变数据包的ip、端口。数据包的NAT在上一节已经介绍了,是在PRE_ROUTING、POST_ROUTING链的hook点根据链接跟踪的reply方向对数据包做NAT。
nf_nat_setup_info主要做以下几件事
(1)获取链接跟踪和nat关联的结构体struct nf_conn_nat,如果为空就调用nf_ct_ext_add添加,添加失败才直接返回
(2)调用nf_nat_initialized判断是否已经做了链接跟踪的NAT(已经做过则触发BUG_ON)
(3)调用nf_ct_invert_tuplepr对reply方向的tuple取反赋值给curr_tuple,也就是orig tuple
(4)get_unique_tuple这个函数是关键,它得到一个新的tuple new_tuple,这个new_tuple是做了NAT的orig方向。
(5)调用nf_ct_tuple_equal比较curr_tuple和new_tuple是否相等,如果不相等就要做NAT,改变链接跟踪reply方向的tuple
(6)调用nf_ct_invert_tuplepr对new_tuple取反,得到新的reply方向的tuple reply
(7)nf_conntrack_alter_reply改变链接跟踪tuple的reply方向完成链接跟踪的NAT
(8)如果这条链接跟踪是第一次做NAT(IPS_NAT_DONE_MASK中的标志都没有设置),就要把它添加到nat_bysource链表中
(9)设置已经做NAT的标志IPS_DST_NAT_DONE_BIT/IPS_SRC_NAT_DONE_BIT
/*
 * nf_nat_setup_info - set up the NAT binding of a conntrack entry.
 * @ct:        conntrack entry to bind
 * @range:     range handed to get_unique_tuple() to select the new tuple
 * @maniptype: IP_NAT_MANIP_SRC or IP_NAT_MANIP_DST
 *
 * No packet is touched here: the function only picks a translated tuple
 * for the original direction and, if it differs from the current one,
 * installs its inverse as the conntrack's reply tuple.
 *
 * Returns NF_ACCEPT on every path, including when the NAT extension
 * cannot be added.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;
	struct nf_conn_nat *nat;
	/* Snapshot taken before any status bit is changed below. */
	int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);

	/* A NAT helper or ctnetlink may already have attached the extension. */
	nat = nfct_nat(ct);
	if (!nat)
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
	if (!nat) {
		pr_debug("failed to add NAT extension\n");
		return NF_ACCEPT;
	}

	NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC ||
		     maniptype == IP_NAT_MANIP_DST);
	BUG_ON(nf_nat_initialized(ct, maniptype));

	/*
	 * The tuple we start from is the inverse of the current reply
	 * tuple.  Normally that equals the original-direction tuple stored
	 * in the conntrack, except after earlier manipulations.
	 */
	nf_ct_invert_tuplepr(&curr_tuple,
			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* Derive the translated tuple from the current one. */
	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	/* Only rewrite the reply direction when the tuple actually changed. */
	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Invert the new tuple so that replies will be recognised. */
		nf_ct_invert_tuplepr(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this conntrack at the moment. */
		ct->status |= (maniptype == IP_NAT_MANIP_SRC) ? IPS_SRC_NAT
							      : IPS_DST_NAT;
	}

	/* First binding for this conntrack: link it into the by-source hash. */
	if (have_to_hash) {
		unsigned int srchash;

		srchash = hash_by_src(net, nf_ct_zone(ct),
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);

		spin_lock_bh(&nf_nat_lock);
		/*
		 * nf_conntrack_alter_reply() might have re-allocated the
		 * extension area, so fetch the NAT extension again.
		 */
		nat = nfct_nat(ct);
		nat->ct = ct;
		hlist_add_head_rcu(&nat->bysource,
				   &net->ipv4.nat_bysource[srchash]);
		spin_unlock_bh(&nf_nat_lock);
	}

	/* Done: record that this manipulation type has been set up. */
	if (maniptype != IP_NAT_MANIP_DST)
		set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
	else
		set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);

	return NF_ACCEPT;
}
2、nf_ct_invert_tuplepr
调用__nf_ct_l3proto_find获取三层链接跟踪的操作函数结构体struct nf_conntrack_l3proto实例,调用__nf_ct_l4proto_find获取四层链接跟踪的操作函数结构体struct nf_conntrack_l4proto实例,然后调用nf_ct_invert_tuple根据orig方向取反方向的tuple。
/*
 * nf_ct_invert_tuplepr - invert a tuple, looking up the protocol handlers.
 * @inverse: output, receives the inverted tuple
 * @orig:    tuple to invert
 *
 * Looks up the L3 handler by orig->src.l3num and the L4 handler by
 * (orig->src.l3num, orig->dst.protonum) inside an RCU read-side section
 * and passes both to nf_ct_invert_tuple().
 *
 * Returns whatever nf_ct_invert_tuple() returns.
 */
bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	const struct nf_conntrack_l3proto *l3proto;
	const struct nf_conntrack_l4proto *l4proto;
	bool inverted;

	rcu_read_lock();
	l3proto = __nf_ct_l3proto_find(orig->src.l3num);
	l4proto = __nf_ct_l4proto_find(orig->src.l3num, orig->dst.protonum);
	inverted = nf_ct_invert_tuple(inverse, orig, l3proto, l4proto);
	rcu_read_unlock();

	return inverted;
}
nf_ct_invert_tuple调用三层、四层的invert_tuple根据orig的nf_conntrack_tuple获取反方向的nf_conntrack_tuple。
/*
 * nf_ct_invert_tuple - build the opposite-direction tuple of @orig.
 * @inverse: output tuple; zeroed first
 * @orig:    tuple to invert
 * @l3proto: layer-3 handler whose invert_tuple() callback is used
 * @l4proto: layer-4 handler whose invert_tuple() callback is used
 *
 * Returns false if the layer-3 callback fails (in which case only the
 * zeroing, the l3num copy and whatever the callback wrote have been
 * applied to @inverse); otherwise returns the layer-4 callback's result.
 */
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof(*inverse));
	inverse->src.l3num = orig->src.l3num;

	/* Layer 3 fills in its part of the inverted tuple first. */
	if (!l3proto->invert_tuple(inverse, orig))
		return false;

	/* The inverse travels in the other direction, same L4 protocol. */
	inverse->dst.dir = !orig->dst.dir;
	inverse->dst.protonum = orig->dst.protonum;

	/* Layer 4 completes the inverted tuple and decides the result. */
	return l4proto->invert_tuple(inverse, orig);
}
3、get_unique_tuple
这个函数主要是获取唯一的做了nat的tuple。
(1)首先如果是SNAT且没有设置IP_NAT_RANGE_PROTO_RANDOM,就调用find_appropriate_src在nat_bysource链表中查找已经做了NAT的tuple,如果找到了而且没有被其他链接跟踪使用就直接返回
(2)find_best_ips_proto做ip地址的nat
(3)四层协议做NAT:如果设置了IP_NAT_RANGE_PROTO_RANDOM标志也就是随机的,就调用四层协议的unique_tuple获取唯一没有被使用的端口做NAT;如果设置了IP_NAT_RANGE_PROTO_SPECIFIED也就是指定了端口范围,就要调用in_range判断此端口是否在指定范围内,在范围内且没有被使用就直接采用,否则再调用unique_tuple选择一个唯一的端口。
/*
 * get_unique_tuple - choose the translated tuple for a conntrack.
 * @tuple:      output, receives the selected tuple
 * @orig_tuple: tuple the selection starts from
 * @range:      requested NAT range and flags
 * @ct:         conntrack being set up
 * @maniptype:  IP_NAT_MANIP_SRC or IP_NAT_MANIP_DST
 *
 * Three steps: reuse an existing source mapping if possible, otherwise
 * pick an IP via find_best_ips_proto(), then let the layer-4 NAT protocol
 * adjust its part of the tuple when required.
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	const struct nf_nat_protocol *proto;
	u16 zone = nf_ct_zone(ct);

	/*
	 * 1) If this srcip/proto/src-proto-part is currently mapped, and
	 * that same mapping gives a unique tuple within the given range,
	 * use that.  Only needed for source (i.e. NAT/masq) mappings, and
	 * skipped when randomization was requested.
	 */
	if (maniptype == IP_NAT_MANIP_SRC &&
	    !(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
	    find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
		pr_debug("get_unique_tuple: Found current src map\n");
		/* Take the existing mapping only if nobody else uses it. */
		if (!nf_nat_used_tuple(tuple, ct))
			return;
	}

	/* 2) Start from the original tuple and translate the IP address. */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/*
	 * 3) The per-protocol part of the manip is made to map into the
	 * range to make a unique tuple.
	 */
	rcu_read_lock();
	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);

	if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
		/* Randomization requested: always ask the protocol. */
		proto->unique_tuple(tuple, range, maniptype, ct);
	} else if (((range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) &&
		    !proto->in_range(tuple, maniptype,
				     &range->min, &range->max)) ||
		   nf_nat_used_tuple(tuple, ct)) {
		/*
		 * Last chance: the tuple is outside the specified protocol
		 * range or is already in use, so let the protocol try to
		 * obtain a unique one.
		 */
		proto->unique_tuple(tuple, range, maniptype, ct);
	}
	/* Otherwise the tuple is already in range and unused: keep it. */

	rcu_read_unlock();
}
3.1 find_appropriate_src
对于SNAT就会调用这个函数,在已经做了NAT的表nat_bysource中查找源相同(same_src)且zone相同的链接跟踪,如果找到了就对这个链接跟踪reply方向的tuple取反,并把目的部分替换成当前tuple的目的部分得到目标tuple,然后调用in_range判断目标tuple是否在range指定的范围内,在范围内就返回1,否则继续查找。
/*
 * find_appropriate_src - look for a reusable source mapping.
 * @net:    network namespace whose nat_bysource table is searched
 * @zone:   conntrack zone that candidates must match
 * @tuple:  tuple whose source is looked up
 * @result: output; overwritten for every candidate that is examined
 * @range:  range the candidate must fall into
 *
 * Only called for SRC manip.
 *
 * Returns 1 with @result holding the mapping when a candidate passes
 * in_range(), 0 otherwise (the contents of @result are then not
 * meaningful).
 */
static int
find_appropriate_src(struct net *net, u16 zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range *range)
{
	unsigned int h = hash_by_src(net, zone, tuple);
	const struct nf_conn_nat *nat;
	const struct nf_conn *ct;
	const struct hlist_node *n;
	int found = 0;

	rcu_read_lock();
	/* Walk the by-source bucket this tuple hashes to. */
	hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
		ct = nat->ct;

		/* Skip entries with a different source or zone. */
		if (!same_src(ct, tuple) || nf_ct_zone(ct) != zone)
			continue;

		/*
		 * Copy the source part from the inverted reply tuple, and
		 * keep the destination of the tuple being looked up.
		 */
		nf_ct_invert_tuplepr(result,
				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
		result->dst = tuple->dst;

		if (in_range(result, range)) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
3.2 find_best_ips_proto
这个函数根据range在指定的IP地址范围内选择一个IP地址做NAT
/*
 * find_best_ips_proto - pick the translated IP address for @tuple.
 * @zone:      conntrack zone, mixed into the hash
 * @tuple:     tuple to modify in place
 * @range:     IP range and flags
 * @ct:        conntrack being set up (not used in this function)
 * @maniptype: IP_NAT_MANIP_SRC rewrites the source address, anything
 *             else rewrites the destination address
 *
 * Does nothing unless IP_NAT_RANGE_MAP_IPS is set.
 */
static void
find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	__be32 *var_ipp;
	/* Host order */
	u_int32_t minip, maxip, j;
	u32 second_word;

	/* No IP mapping requested: leave the tuple alone. */
	if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
		return;

	var_ipp = (maniptype == IP_NAT_MANIP_SRC) ? &tuple->src.u3.ip
						  : &tuple->dst.u3.ip;

	/* Fast path: the range holds a single address. */
	if (range->min_ip == range->max_ip) {
		*var_ipp = range->min_ip;
		return;
	}

	minip = ntohl(range->min_ip);
	maxip = ntohl(range->max_ip);

	/*
	 * Hash the source IP together with (destination IP ^ zone) so the
	 * same inputs always select the same address.  With
	 * IP_NAT_RANGE_PERSISTENT only the source IP feeds the hash.
	 */
	if (range->flags & IP_NAT_RANGE_PERSISTENT)
		second_word = 0;
	else
		second_word = (__force u32)tuple->dst.u3.ip ^ zone;

	j = jhash_2words((__force u32)tuple->src.u3.ip, second_word, 0);

	/*
	 * Scale the 32-bit hash into an offset in [0, maxip - minip].
	 * NOTE(review): the span is computed in 32 bits, so a range
	 * covering all 2^32 addresses wraps to 0 and always yields minip.
	 */
	j = ((u64)j * (maxip - minip + 1)) >> 32;
	*var_ipp = htonl(minip + j);
}