入口在net/ipv4/netfilter/iptable_nat.c中
static const struct xt_table nf_nat_ipv4_table = {
.name = "nat",
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.table_init = iptable_nat_table_init,
};
static int __init iptable_nat_init(void)
{
int ret = register_pernet_subsys(&iptable_nat_net_ops);
if (ret)
return ret;
ret = iptable_nat_table_init(&init_net);
if (ret)
unregister_pernet_subsys(&iptable_nat_net_ops);
return ret;
}
static int __net_init iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
int ret;
/* nat表已经初始化过了 */
if (net->ipv4.nat_table)
return 0;
/* 分配初始化表,用于下面的注册 */
repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
if (repl == NULL)
return -ENOMEM;
/* 表注册 */
ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
NULL, &net->ipv4.nat_table);
if (ret < 0) {
kfree(repl);
return ret;
}
ret = ipt_nat_register_lookups(net);
if (ret < 0) {
ipt_unregister_table(net, net->ipv4.nat_table, NULL);
net->ipv4.nat_table = NULL;
}
kfree(repl);
return ret;
}
这个hook函数的实现直接调用了ipt_do_table这个标准规则匹配函数,会根据当前链上的nat规则逐一匹配和处理。
static unsigned int iptable_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
return ipt_do_table(skb, state, state->net->ipv4.nat_table);
}
/* 遍历钩子链上的所有规则,进行标准匹配和扩展匹配,执行其target操作 */
unsigned int
ipt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
struct xt_table *table)
{
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e, **jumpstack;
unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
stackidx = 0;
ip = ip_hdr(skb);
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
acpar.state = state;
WARN_ON(!(table->valid_hooks & (1 << hook)));
local_bh_disable();
addend = xt_write_recseq_begin();
private = READ_ONCE(table->private); /* Address dependency. */
cpu = smp_processor_id();
/* 首个规则地址 */
table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
/* Switch to alternate jumpstack if we're being invoked via TEE.
* TEE issues XT_CONTINUE verdict on original skb so we must not
* clobber the jumpstack.
*
* For recursion via REJECT or SYNPROXY the stack will be clobbered
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
/* 获取对应链上的首个匹配规则 */
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
struct xt_counters *counter;
WARN_ON(!e);
/* 标准match */
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
no_match:
/* 未匹配成功,继续下一个规则 */
e = ipt_next_entry(e);
continue;
}
/* 扩展match */
xt_ematch_foreach(ematch, e) {
acpar.match = ematch->u.kernel.match;
acpar.matchinfo = ematch->data;
/* 只要有返回不匹配的,则说明匹配当前规则失败 */
if (!acpar.match->match(skb, &acpar))
goto no_match;
}
counter = xt_get_this_cpu_counter(&e->counters);
ADD_COUNTER(*counter, skb->len, 1);
/* 标准match和扩展match都成功 */
/* 获取target */
t = ipt_get_target_c(e);
WARN_ON(!t->u.kernel.target);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
trace_packet(state->net, skb, hook, state->in,
state->out, table->name, private, e);
#endif
/* Standard target? */
/* 标准target */
if (!t->u.kernel.target->target) {
int v;
v = ((struct xt_standard_target *)t)->verdict;
/* 不会跳转到用户自定义规则 */
if (v < 0) {
/* Pop from stack? */
/* 不是XT_RETURN,则跳出处理结果 */
if (v != XT_RETURN) {
verdict = (unsigned int)(-v) - 1;
break;
}
/* XT_RETURN则继续匹配下一条规则 */
if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
} else {
e = jumpstack[--stackidx];
e = ipt_next_entry(e);
}
continue;
}
/* 记录跳转规则,以便返回时获取下一跳规则进行后续匹配 */
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
if (unlikely(stackidx >= private->stacksize)) {
verdict = NF_DROP;
break;
}
jumpstack[stackidx++] = e;
}
/* 获取自定义规则 */
e = get_entry(table_base, v);
continue;
}
/* 扩展target,执行target回调 */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
/* 需要继续匹配 */
if (verdict == XT_CONTINUE) {
/* Target might have changed stuff. */
ip = ip_hdr(skb);
e = ipt_next_entry(e);
/* 跳出处理匹配结果 */
} else {
/* Verdict */
break;
}
/* 无hotdrop,继续匹配 */
} while (!acpar.hotdrop);
xt_write_recseq_end(addend);
local_bh_enable();
/* drop标记 */
if (acpar.hotdrop)
return NF_DROP;
/* 返回匹配结果 */
else return verdict;
}
入口在net/ipv4/netfilter/nf_nat_proto.c中
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv4_in,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
/* After packet filtering, change source */
{
.hook = nf_nat_ipv4_out,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv4_local_fn,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
/* After packet filtering, change source */
{
.hook = nf_nat_ipv4_fn,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
};
函数工作在PRE_ROUTING钩子点,进行DNAT转换
static unsigned int
nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
unsigned int ret;
/* 获取目的地址 */
__be32 daddr = ip_hdr(skb)->daddr;
/* DNAT转换 */
ret = nf_nat_ipv4_fn(priv, skb, state);
/* 转换之后,目的地址发生变化,释放路由缓存 */
if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
return ret;
}
函数工作在LOCAL_IN钩子点,进行SNAT转换;
static unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return NF_ACCEPT;
if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
state->hook))
return NF_DROP;
else
return NF_ACCEPT;
}
}
return nf_nat_inet_fn(priv, skb, state);
}
函数工作在LOCAL_OUT钩子点,进行DNAT转换
static unsigned int
nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
unsigned int ret;
int err;
/* DNAT转换 */
ret = nf_nat_ipv4_fn(priv, skb, state);
if (ret != NF_ACCEPT)
return ret;
ct = nf_ct_get(skb, &ctinfo);
/* 转换成功 */
if (ct) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
/* ip地址发生变化 */
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
/* 重新查路由 */
err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
#ifdef CONFIG_XFRM
else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.dst.u.all !=
ct->tuplehash[!dir].tuple.src.u.all) {
err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
#endif
}
return ret;
}
函数工作在POST_ROUTING钩子点,进行SNAT转换;
static unsigned int
nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
#ifdef CONFIG_XFRM
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
int err;
#endif
unsigned int ret;
/* SNAT转换 */
ret = nf_nat_ipv4_fn(priv, skb, state);
#ifdef CONFIG_XFRM
if (ret != NF_ACCEPT)
return ret;
if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
return ret;
ct = nf_ct_get(skb, &ctinfo);
if (ct) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
if (ct->tuplehash[dir].tuple.src.u3.ip !=
ct->tuplehash[!dir].tuple.dst.u3.ip ||
(ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.src.u.all !=
ct->tuplehash[!dir].tuple.dst.u.all)) {
err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
}
#endif
return ret;
}
nf_nat_ipv4_fn完成具体的SNAT或者DNAT的转换流程,上面的四个钩子函数都会调用该函数
static unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return NF_ACCEPT;
if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
state->hook))
return NF_DROP;
else
return NF_ACCEPT;
}
}
return nf_nat_inet_fn(priv, skb, state);
}
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
/* 获取是进行DNAT还是SNAT,其中PRE_ROUTING和LOCAL_OUT进行DNAT,LOCAL_IN和POST_ROUTING进行SNAT */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
/* 获取skb关联的连接跟踪sf_conn */
ct = nf_ct_get(skb, &ctinfo);
/* Can't track? It's not due to stress, or conntrack would
* have dropped it. Hence it's the user's responsibilty to
* packet filter it out, or implement conntrack/NAT for that
* protocol. 8) --RR
*/
if (!ct)
return NF_ACCEPT;
/* 获取NAT扩展 */
nat = nfct_nat(ct);
/* 判断连接跟踪状态 */
switch (ctinfo) {
case IP_CT_RELATED:
case IP_CT_RELATED_REPLY:
/* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
case IP_CT_NEW:
/* Seen it before? This can happen for loopback, retrans,
* or local packets.
*/
/* 尚未进行过NAT转换 */
if (!nf_nat_initialized(ct, maniptype)) {
struct nf_nat_lookup_hook_priv *lpriv = priv;
struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
unsigned int ret;
int i;
if (!e)
goto null_bind;
for (i = 0; i < e->num_hook_entries; i++) {
ret = e->hooks[i].hook(e->hooks[i].priv, skb,
state);
if (ret != NF_ACCEPT)
return ret;
/* 打NAT转换标记 */
if (nf_nat_initialized(ct, maniptype))
goto do_nat;
}
null_bind:
/* 连接跟踪进行NAT */
ret = nf_nat_alloc_null_binding(ct, state->hook);
if (ret != NF_ACCEPT)
return ret;
} else {
/* 进行过NAT转换 */
pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct, ct->status);
/* 出接口发生改变 */
if (nf_nat_oif_changed(state->hook, ctinfo, nat,
state->out))
goto oif_changed;
}
break;
default:
/* ESTABLISHED */
WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
ctinfo != IP_CT_ESTABLISHED_REPLY);
if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
do_nat:
/* skb数据包进行NAT转换修改 */
return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_DROP;
}
2.8 nf_nat_alloc_null_binding
unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
2.9 __nf_nat_alloc_null_binding
static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
/* Force range to this IP; let proto decide mapping for
* per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
* Use reply in case it's already been mangled (eg local packet).
*/
/* 使用应答方向的ip地址,LOCAL_OUT会先经过mangle,可能改变了 */
union nf_inet_addr ip =
(manip == NF_NAT_MANIP_SRC ?
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
/* 设置range */
struct nf_nat_range2 range = {
.flags = NF_NAT_RANGE_MAP_IPS,
.min_addr = ip,
.max_addr = ip,
};
/* 进行NAT转换 */
return nf_nat_setup_info(ct, &range, manip);
}
2.8 nf_nat_setup_info
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype)
{
struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
/* Can't setup nat info for confirmed ct. */
/* 已经确认的,返回accpet */
if (nf_ct_is_confirmed(ct))
return NF_ACCEPT;
WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
maniptype != NF_NAT_MANIP_DST);
if (WARN_ON(nf_nat_initialized(ct, maniptype)))
return NF_DROP;
/* What we've got will look like inverse of reply. Normally
* this is what is in the conntrack, except for prior
* manipulations (future optimization: if num_manips == 0,
* orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
*/
/* 从应答tuple反向得到当前tuple */
nf_ct_invert_tuple(&curr_tuple,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* 根据当前tuple和range得到NAT转换之后的的tuple */
get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
/* NAT转换之后和之前的tuple不同 */
if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
struct nf_conntrack_tuple reply;
/* Alter conntrack table so will recognize replies. */
/* 通过新tuple得到reply_tuple */
nf_ct_invert_tuple(&reply, &new_tuple);
/* 加入到reply hash */
nf_conntrack_alter_reply(ct, &reply);
/* Non-atomic: we own this at the moment. */
/* 更新状态需要做NAT */
if (maniptype == NF_NAT_MANIP_SRC)
ct->status |= IPS_SRC_NAT;
else
ct->status |= IPS_DST_NAT;
/* 扩展项的调整 */
if (nfct_help(ct) && !nfct_seqadj(ct))
if (!nfct_seqadj_ext_add(ct))
return NF_DROP;
}
/* SNAT */
if (maniptype == NF_NAT_MANIP_SRC) {
unsigned int srchash;
spinlock_t *lock;
srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
hlist_add_head_rcu(&ct->nat_bysource,
&nf_nat_bysource[srchash]);
spin_unlock_bh(lock);
}
/* It's done. */
/* NAT转换完成 */
if (maniptype == NF_NAT_MANIP_DST)
ct->status |= IPS_DST_NAT_DONE;
else
ct->status |= IPS_SRC_NAT_DONE;
return NF_ACCEPT;
}
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
const struct nf_nat_range2 *range,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
const struct nf_conntrack_zone *zone;
struct net *net = nf_ct_net(ct);
zone = nf_ct_zone(ct);
/* 1) If this srcip/proto/src-proto-part is currently mapped,
* and that same mapping gives a unique tuple within the given
* range, use that.
*
* This is only required for source (ie. NAT/masq) mappings.
* So far, we don't do local source mappings, so multiple
* manips not an issue.
*/
/* SNAT && 没有打RANDOM_ALL标记 */
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* try the original tuple first */
/* 查看orig_tuple是否满足范围要求 */
if (in_range(orig_tuple, range)) {
/* tuple尚未被使用 */
if (!nf_nat_used_tuple(orig_tuple, ct)) {
/* 使用原tuple */
*tuple = *orig_tuple;
return;
}
/* ori_range不满足要求,则从bysource_table中查找一个满足范围的tuple */
} else if (find_appropriate_src(net, zone,
orig_tuple, tuple, range)) {
pr_debug("get_unique_tuple: Found current src map\n");
/* tuple尚未被使用 */
if (!nf_nat_used_tuple(tuple, ct))
return;
}
}
/* 从给定range中选择一个最少使用的组合 */
/* 2) Select the least-used IP/proto combination in the given range */
*tuple = *orig_tuple;
find_best_ips_proto(zone, tuple, range, ct, maniptype);
/* 3) The per-protocol part of the manip is made to map into
* the range to make a unique tuple.
*/
/* Only bother mapping if it's not already in range and unique */
/* 没有打RANDOM_ALL标记 */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* 有SPECIFIED标记,对端口号进行检查 */
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
/* 端口号已经在范围之内&&(端口最小最大范围相等||tuple没有使用) */
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
&range->min_proto,
&range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
return;
/* 没有SPECIFIED标记,端口号不变,tuple没有被使用 */
} else if (!nf_nat_used_tuple(tuple, ct)) {
return;
}
}
/* Last chance: get protocol to try to obtain unique tuple. */
/* 随机选择端口号 */
nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
2.9 nf_nat_packet
unsigned int nf_nat_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum,
struct sk_buff *skb)
{
/* 获取进行SNAT还是DNAT */
enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
/* 获取方向 */
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned int verdict = NF_ACCEPT;
unsigned long statusbit;
/* 设置NAT标记 */
if (mtype == NF_NAT_MANIP_SRC)
statusbit = IPS_SRC_NAT;
else
statusbit = IPS_DST_NAT;
/* Invert if this is reply dir. */
/* 应答方向需要取反 */
if (dir == IP_CT_DIR_REPLY)
statusbit ^= IPS_NAT_MASK;
/* Non-atomic: these bits don't change. */
/* 需要做NAT */
if (ct->status & statusbit)
/* 将ip地址和端口的NAT转换结果写入skb */
verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);
return verdict;
}