int
iptables_main(int argc, char *argv[])
{
// Parse the user-supplied iptables command line and fill in table/handle.
// NOTE(review): abridged excerpt — declarations of ret/table/handle and
// the function's closing brace are omitted in this note.
ret = do_command4(argc, argv, &table, &handle, false);
if (ret) {
ret = iptc_commit(handle);
iptc_free(handle);
}
iptc_commit 函数实际上是通过 TC_COMMIT 实现的
int
TC_COMMIT(struct xtc_handle *handle)
{
// Compile the handle's cached rule set into a struct ipt_replace (repl),
// the binary blob representing the whole table.
ret = iptcc_compile_table(handle, repl);
// Push the blob into the kernel in one shot via SO_SET_REPLACE.
// NOTE(review): abridged excerpt — declarations of ret/repl and the
// function's closing brace are omitted in this note.
ret = setsockopt(handle->sockfd, TC_IPPROTO, SO_SET_REPLACE, repl,
sizeof(*repl) + repl->size);
内核部分
/*
 * Dispatch an IPv4 iptables setsockopt command.
 * Fix vs. note excerpt: the switch was never closed (unbalanced braces),
 * `ret` was undeclared, and there was no default case; other kernel cases
 * remain elided.
 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	int ret;

	switch (cmd) {
	/* Replace an entire table's rule set with a userspace blob. */
	case IPT_SO_SET_REPLACE:
		ret = do_replace(sock_net(sk), user, len);
		break;
	/* Add to the saved per-rule packet/byte counters. */
	case IPT_SO_SET_ADD_COUNTERS:
		ret = do_add_counters(sock_net(sk), user, len, 0);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}
/*
 * Copy a struct ipt_replace (header + rule entries) from userspace,
 * validate/translate it, and swap it in as the table's new rule set.
 * Fix vs. note excerpt: the `free_newinfo` / `free_newinfo_untrans`
 * goto targets were referenced but missing (which also left `iter`
 * unused); restored the kernel's cleanup tail.
 */
static int
do_replace(struct net *net, const void __user *user, unsigned int len)
{
	int ret;
	struct ipt_replace tmp;
	struct xt_table_info *newinfo;
	void *loc_cpu_entry;
	struct ipt_entry *iter;

	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
		return -EFAULT;

	/* overflow check */
	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
		return -ENOMEM;
	if (tmp.num_counters == 0)
		return -EINVAL;

	/* Force NUL-termination of the user-supplied table name. */
	tmp.name[sizeof(tmp.name)-1] = 0;

	newinfo = xt_alloc_table_info(tmp.size);
	if (!newinfo)
		return -ENOMEM;

	/* The rule entries hang off the end of the userspace header. */
	loc_cpu_entry = newinfo->entries;
	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
			   tmp.size) != 0) {
		ret = -EFAULT;
		goto free_newinfo;
	}

	/* Validate hook entry points, jumps, matches and targets. */
	ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
	if (ret != 0)
		goto free_newinfo;

	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
			   tmp.num_counters, tmp.counters);
	if (ret)
		goto free_newinfo_untrans;
	return 0;

 free_newinfo_untrans:
	/* Undo the per-entry setup done by translate_table. */
	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
		cleanup_entry(iter, net);
 free_newinfo:
	xt_free_table_info(newinfo);
	return ret;
}
struct xt_table_info *
xt_replace_table(struct xt_table *table,
unsigned int num_counters,
struct xt_table_info *newinfo,
int *error)
{
/* NOTE(review): heavily abridged excerpt — only the pointer swap that
 * installs the new rule blob is kept; the real function does this under
 * the appropriate synchronization, returns the old xt_table_info, and
 * is not closed in this note. */
table->private = newinfo;
table->private 与 net->nf.hooks_ipv4[hook] 的关系是在 xt_hook_ops_alloc 初始化的 hook 函数中实现的,hook 函数会在 table->private 中查询需要符合的规则。
通过 TC_COMMIT 函数使用参数 SO_SET_REPLACE 宏,将 iptables 的规则替换到内核中。这里的规则变化是替换整张表,而不是插入某条规则。
在 linux-4.19\net\ipv4\netfilter\iptable_filter.c
文件中 Linux 内核对 iptables 的 filter 表进行了注册与注销函数的声明。
module_init(iptable_filter_init);
module_exit(iptable_filter_fini);
首先注册 nf_hook_ops 结构,该结构包含了 hook 的信息。
/* One netfilter hook registration: where to hook and what to call. */
struct nf_hook_ops {
/* User fills in from here down. */
nf_hookfn *hook; // callback invoked at the hook point to evaluate packets
struct net_device *dev; // network device this hook is bound to (if any)
void *priv; // private data passed to the callback (for iptables: the table)
u_int8_t pf; // protocol family of the hook, e.g. NFPROTO_IPV4, NFPROTO_ARP
unsigned int hooknum; // which hook point this is (e.g. NF_INET_LOCAL_IN) — not a count
/* Hooks are ordered in ascending priority. */
int priority; // priority among hooks at the same point; lower runs earlier
};
xt_table 是用于配置表的一个结构。
/* Static definition of the IPv4 "filter" table. */
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS, // bitmask of hook points this table attaches to
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER, // priority relative to other tables at the same hook
.table_init = iptable_filter_table_init,
};
FILTER_VALID_HOOKS 宏的表示方法。
/* The filter table attaches to exactly three hook points:
 * LOCAL_IN, FORWARD and LOCAL_OUT. */
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
iptables 实际替换规则时所使用的结构。
/* Userspace blob handed to the kernel via SO_SET_REPLACE: table name,
 * hook layout, and the serialized rule entries hanging off the end. */
struct ipt_replace {
/* Which table. */
char name[XT_TABLE_MAXNAMELEN];
/* Which hook entry points are valid: bitmask. You can't
change this. */
unsigned int valid_hooks;
/* Number of entries */
unsigned int num_entries;
/* Total size of new entries */
unsigned int size;
/* Hook entry points. */
unsigned int hook_entry[NF_INET_NUMHOOKS];
/* Underflow points. */
unsigned int underflow[NF_INET_NUMHOOKS];
/* Information about old entries: */
/* Number of counters (must be equal to current number of entries). */
unsigned int num_counters;
/* The old entries' counters. */
struct xt_counters __user *counters;
/* The entries (hang off end: not really an array). */
struct ipt_entry entries[0];
};
表示一个表所应该包含的全部信息
/* Kernel-side representation of one table's complete rule set;
 * installed as table->private by xt_replace_table. */
struct xt_table_info {
/* Size per table */
unsigned int size;
/* Number of entries: FIXME. --RR */
unsigned int number;
/* Initial number of entries. Needed for module usage count */
unsigned int initial_entries;
/* Entry points and underflows */
unsigned int hook_entry[NF_INET_NUMHOOKS];
unsigned int underflow[NF_INET_NUMHOOKS];
/*
 * Number of user chains. Since tables cannot have loops, at most
 * @stacksize jumps (number of user chains) can possibly be made.
 */
unsigned int stacksize;
void ***jumpstack; // per-CPU stacks used by ipt_do_table for chain jumps
unsigned char entries[0] __aligned(8); // serialized rules follow the struct
};
根据 packet_filter 声明 nf_hook_ops 结构,iptable_filter_hook 是用于匹配规则所使用的函数。
/* Module init: build the hook ops from the table definition, then register
 * the per-namespace subsystem so each netns gets its own filter table.
 * NOTE(review): abridged — declarations of filter_ops/ret and error
 * handling are omitted in this excerpt. */
static int __init iptable_filter_init(void)
{
filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
ret = register_pernet_subsys(&iptable_filter_net_ops);
return ret;
}
iptable_filter_net_ops 包括了 filter 表的初始化和退出函数。register_pernet_subsys 的作用如下,意思就是使用 namespace 隔离不同的 filter table,以及 filter table 也是每个命名空间默认创建的表。
/**
* register_pernet_subsys - register a network namespace subsystem
* @ops: pernet operations structure for the subsystem
*
* Register a subsystem which has init and exit functions
* that are called when network namespaces are created and
* destroyed respectively.
*
* When registered all network namespace init functions are
* called for every existing network namespace. Allowing kernel
* modules to have a race free view of the set of network namespaces.
*
* When a new network namespace is created all of the init
* methods are called in the order in which they were registered.
*
* When a network namespace is destroyed all of the exit methods
* are called in the reverse of the order with which they were
* registered.
*/
通过以下调用关系,里面比较简单,不分析。
iptable_filter_net_ops
-> iptable_filter_net_init // 默认开启 forward
-> iptable_filter_table_init
ipt_alloc_initial_table 函数实际是一个大个儿的宏,用于根据 xt_table 结构参数申请一个 ipt_replace 结构,该结构的细节上面已经说明。
ipt_register_table 函数会把这些信息(表、hook 节点等)与 net 结构进行关联。
/* Per-netns init of the filter table: allocate a default ipt_replace blob
 * from the table definition, then register table and hooks on this netns.
 * NOTE(review): abridged — declaration of err, the NULL check of repl and
 * the freeing of repl are omitted in this excerpt. */
static int __net_init iptable_filter_table_init(struct net *net)
{
struct ipt_replace *repl;
repl = ipt_alloc_initial_table(&packet_filter);
err = ipt_register_table(net, &packet_filter, repl, filter_ops,
&net->ipv4.iptable_filter); // result pointer stored per-netns
return err;
}
此处 newinfo 信息为表示一个表所需要的全部信息,而 loc_cpu_entry 为该表所关联的 iptables 的具体规则。
经过 translate_table 对上面初始化完成的信息进行检查后,使用 xt_register_table 函数对表进行注册, 使用 nf_register_net_hooks 函数对 hook 节点进行注册。
/* Build an xt_table_info from the initial ipt_replace blob, validate it,
 * then register the table and its netfilter hooks for this netns.
 * NOTE(review): abridged excerpt — allocation/translate/register error
 * checks and cleanup paths are omitted in this note. */
int ipt_register_table(struct net *net, const struct xt_table *table,
const struct ipt_replace *repl,
const struct nf_hook_ops *ops, struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
loc_cpu_entry = newinfo->entries;
/* Copy the initial rules out of the replace blob. */
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
new_table = xt_register_table(net, table, &bootstrap, newinfo);
/* One hook registration per bit set in valid_hooks. */
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
return ret;
}
初始化传入的 xt_table_info 结构,其中 xt_replace_table 函数的作用就是将 newinfo 赋给 table->private 。
最后将 table 链入 net 的 xt 结构。其 table 数组的每个成员就是不同协议族的 iptables 表结构。
/* Duplicate the static table definition, install newinfo as its rule set
 * via xt_replace_table, and link it into this netns's per-family table list.
 * NOTE(review): abridged excerpt — locking, the `unlock` label targeted by
 * the goto, and error/cleanup paths are omitted in this note. */
struct xt_table *xt_register_table(struct net *net,
const struct xt_table *input_table,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo)
{
int ret;
struct xt_table_info *private;
struct xt_table *t, *table;
/* Don't add one object to multiple lists. */
table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
/* Simplifies replace_table code. */
table->private = bootstrap;
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
private = table->private;
/* save number of initial entries */
private->initial_entries = private->number;
list_add(&table->list, &net->xt.tables[table->af]);
return table;
}
nf_register_net_hooks // 为每一个 hook 节点都在 net 结构上注册
->nf_register_net_hook // 根据不同协议族带入不同参数,向 net 结构注册 filter_ops 结构
->__nf_register_net_hook
在 net 结构的 nf 成员(例如 net->nf.hooks_ipv4 + hooknum )下找到对应协议族的 hook 链(数组表示的)的头部。 初始化一个新的 hook 条目插入到对应的 hook 链中。
/* Insert one nf_hook_ops into the hook-entry array for (pf, hooknum)
 * on this netns, publishing the grown array via RCU.
 * NOTE(review): abridged excerpt — error checks, mutex_unlock and the
 * freeing of the old entry array are omitted in this note. */
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
mutex_lock(&nf_hook_mutex);
// Fetch the current nf_hook_entries from the RCU-protected slot.
p = nf_entry_dereference(*pp);
// Grow the array by one entry, keeping priority order.
new_hooks = nf_hook_entries_grow(p, reg);
// Publish the new array back through RCU.
rcu_assign_pointer(*pp, new_hooks);
hooks_validate(new_hooks);
return 0;
}
从 net->nf.hooks_ipv4[hook] 取出对应的 nf_hook_entries 类型的 head 用于接下来的 hook 遍历查找。
nf_hook_state_init 用于初始化 state,最后调用 hook 函数的时候会根据这个参数的各个状态决定规则的匹配。
/* Entry point from the network stack: look up the hook-entry list for
 * (pf, hook) on this netns and run it via nf_hook_slow.
 * Returns 1 if the packet should continue (caller invokes okfn), <= 0
 * if it was consumed/dropped. */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
struct sock *sk, struct sk_buff *skb,
struct net_device *indev, struct net_device *outdev,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
struct nf_hook_entries *hook_head = NULL;
int ret = 1;
switch (pf) {
case NFPROTO_IPV4:
hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
break;
case NFPROTO_IPV6:
hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
break;
/* NOTE(review): the `...` below elides the remaining protocol cases,
 * the switch's closing brace, and the matching rcu_read_lock(). */
...
if (hook_head) {
struct nf_hook_state state;
/* Bundle hook number, family, devices, sk/net and okfn so the
 * hook callbacks can match against them. */
nf_hook_state_init(&state, hook, pf, indev, outdev,
sk, net, okfn);
ret = nf_hook_slow(skb, &state, hook_head, 0);
}
rcu_read_unlock();
return ret;
}
nf_hook_entry_hookfn 调用具体 xt_hook_ops_alloc 注册的 hook 函数,对参数得出判断进而返回 accept、drop 等。
/* Walk the hook entries starting at index s, invoking each registered
 * hook callback until one returns a terminating verdict.
 * Returns 1 to continue traversal (accept), 0 if consumed (stolen/queued),
 * or a negative errno on drop. */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
const struct nf_hook_entries *e, unsigned int s)
{
unsigned int verdict;
int ret;
for (; s < e->num_hook_entries; s++) {
verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
break; /* this hook accepted; try the next one */
case NF_DROP:
kfree_skb(skb);
/* Upper verdict bits may carry an errno; default to -EPERM. */
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
return ret;
case NF_QUEUE:
ret = nf_queue(skb, state, e, s, verdict);
if (ret == 1)
continue; /* queueing refused; keep traversing */
return ret;
default:
/* Implicit handling for NF_STOLEN, as well as any other
 * non conventional verdicts.
 */
return 0;
}
}
return 1;
}
规则是挂在 table->private 上的,而匹配规则所使用的匹配函数却是挂在 hook 节点上的。
/* Hook callback registered for the filter table: evaluate the packet
 * against this netns's filter rules (state->net->ipv4.iptable_filter). */
static unsigned int
iptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}
/* Core iptables rule walker: traverse the table's rule entries for the
 * current hook point, evaluating IP header match + extension matches per
 * rule, following jumps/returns via a per-CPU jumpstack, until a target
 * yields a final verdict (NF_ACCEPT, NF_DROP, ...). */
unsigned int
ipt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
struct xt_table *table)
{
unsigned int hook = state->hook; // which hook point we were invoked at
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e, **jumpstack;
unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
stackidx = 0;
ip = ip_hdr(skb);
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
 * if it was a normal packet. All other fragments are treated
 * normally, except that they will NEVER match rules that ask
 * things we don't know, ie. tcp syn flag or ports). If the
 * rule is also a fragment-specific rule, non-fragments won't
 * match it. */
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
acpar.state = state;
WARN_ON(!(table->valid_hooks & (1 << hook)));
local_bh_disable();
addend = xt_write_recseq_begin();
private = READ_ONCE(table->private); /* Address dependency. */
cpu = smp_processor_id();
table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
/* Switch to alternate jumpstack if we're being invoked via TEE.
 * TEE issues XT_CONTINUE verdict on original skb so we must not
 * clobber the jumpstack.
 *
 * For recursion via REJECT or SYNPROXY the stack will be clobbered
 * but it is no problem since absolute verdict is issued by these.
 */
if (static_key_false(&xt_tee_enabled))
jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
/* Start at this hook's entry point in the rule blob. */
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
struct xt_counters *counter;
WARN_ON(!e);
/* Cheap IP-header match (addresses, proto, iface, fragment) first. */
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
no_match:
e = ipt_next_entry(e);
continue;
}
/* Then every extension match attached to this rule must pass. */
xt_ematch_foreach(ematch, e) {
acpar.match = ematch->u.kernel.match;
acpar.matchinfo = ematch->data;
if (!acpar.match->match(skb, &acpar))
goto no_match;
}
/* Rule matched: bump its packet/byte counters. */
counter = xt_get_this_cpu_counter(&e->counters);
ADD_COUNTER(*counter, skb->len, 1);
t = ipt_get_target_c(e);
WARN_ON(!t->u.kernel.target);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
trace_packet(state->net, skb, hook, state->in,
state->out, table->name, private, e);
#endif
/* Standard target? */
if (!t->u.kernel.target->target) {
int v;
v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
/* Negative verdicts encode final verdicts as -(v)-1. */
verdict = (unsigned int)(-v) - 1;
break;
}
if (stackidx == 0) {
/* RETURN from a base chain: fall through to the
 * hook's underflow (chain policy). */
e = get_entry(table_base,
private->underflow[hook]);
} else {
e = jumpstack[--stackidx];
e = ipt_next_entry(e);
}
continue;
}
/* Jump to another chain: push a return address unless this
 * is a plain fall-through or an explicit goto. */
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
if (unlikely(stackidx >= private->stacksize)) {
verdict = NF_DROP;
break;
}
jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
continue;
}
/* Non-standard target: invoke its callback for the verdict. */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
if (verdict == XT_CONTINUE) {
/* Target might have changed stuff. */
ip = ip_hdr(skb);
e = ipt_next_entry(e);
} else {
/* Verdict */
break;
}
} while (!acpar.hotdrop);
xt_write_recseq_end(addend);
local_bh_enable();
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
}
match
target
https://segmentfault.com/a/1190000019449845
https://segmentfault.com/a/1190000019455385?utm_source=sf-similar-article
https://segmentfault.com/a/1190000019470135?utm_source=sf-similar-article
https://segmentfault.com/a/1190000019540796?utm_source=sf-similar-article
https://segmentfault.com/a/1190000019605260?utm_source=sf-similar-article