iptables 内部调用过程

iptables 调用过程

/*
 * Excerpt of the iptables userspace entry point. Only the command handling
 * is shown; the remainder of the function is omitted from this excerpt.
 */
int
iptables_main(int argc, char *argv[])
{
	// Parse the iptables command entered by the user and set up handle.
	ret = do_command4(argc, argv, &table, &handle, false);
	/* Commit and release the handle only when do_command4() returned non-zero. */
	if (ret) {
		ret = iptc_commit(handle);
		iptc_free(handle);
	}

iptc_commit 函数实际的实现是使用 TC_COMMIT 实现的

/*
 * Excerpt of TC_COMMIT: compiles the handle into a replace blob and pushes
 * it to the kernel. The remainder of the function is omitted here.
 */
int
TC_COMMIT(struct xtc_handle *handle)
{
    // Convert handle into the repl structure, which represents the iptables rules.
    ret = iptcc_compile_table(handle, repl);

    // This call writes repl into the kernel: the payload is the repl header
    // followed by repl->size bytes.
    ret = setsockopt(handle->sockfd, TC_IPPROTO, SO_SET_REPLACE, repl,
			 sizeof(*repl) + repl->size);

内核部分

/*
 * Dispatch a set-type socket option for IPv4 tables based on cmd.
 * user/len describe the user-space buffer handed to the selected handler.
 * NOTE: this excerpt omits the remaining cases and the end of the switch.
 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	switch (cmd) {
    // Replace the rule set of a table.
	case IPT_SO_SET_REPLACE:
		ret = do_replace(sock_net(sk), user, len);
		break;
    // NOTE(review): presumably updates rule counters - confirm against
    // do_add_counters(), which is not shown here.
	case IPT_SO_SET_ADD_COUNTERS:
		ret = do_add_counters(sock_net(sk), user, len, 0);
		break;

	return ret;
}
/*
 * Handle a table replace request coming from user space.
 *
 * Copies the struct ipt_replace header from user, allocates a new
 * xt_table_info sized by tmp.size, copies the rule blob that follows the
 * header, then passes it to translate_table() and __do_replace().
 *
 * Returns 0 on success or a negative errno on failure.
 *
 * NOTE: the free_newinfo / free_newinfo_untrans labels targeted by the
 * gotos below are not shown in this excerpt.
 */
static int
do_replace(struct net *net, const void __user *user, unsigned int len)
{
	int ret;
	struct ipt_replace tmp;
	struct xt_table_info *newinfo;
	void *loc_cpu_entry;
	struct ipt_entry *iter;

	/* Fetch only the fixed-size header first. */
	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
		return -EFAULT;

	/* overflow check */
	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
		return -ENOMEM;
	if (tmp.num_counters == 0)
		return -EINVAL;

	/* Force NUL termination of the user-supplied table name. */
	tmp.name[sizeof(tmp.name)-1] = 0;

	newinfo = xt_alloc_table_info(tmp.size);
	if (!newinfo)
		return -ENOMEM;

	/* The rule entries sit immediately after the header in the user buffer. */
	loc_cpu_entry = newinfo->entries;
	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
			   tmp.size) != 0) {
		ret = -EFAULT;
		goto free_newinfo;
	}
    
    
	ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
	if (ret != 0)
		goto free_newinfo;

	/* Install newinfo for the table named tmp.name. */
	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
			   tmp.num_counters, tmp.counters);
	if (ret)
		goto free_newinfo_untrans;
	return 0;

}
/*
 * Excerpt of xt_replace_table: only the step that installs the new rule
 * set on the table is shown; the rest of the function is omitted.
 */
struct xt_table_info *
xt_replace_table(struct xt_table *table,
	      unsigned int num_counters,
	      struct xt_table_info *newinfo,
	      int *error)
{

	/* Point the table at the new rule set. */
	table->private = newinfo;

table->private 与 net->nf.hooks_ipv4[hook] 的关系是在 xt_hook_ops_alloc 初始化的 hook 函数中实现的,hook 函数会在 table->private 中查询需要符合的规则。

通过 TC_COMMIT 函数,以 SO_SET_REPLACE 宏作为参数,将 iptables 的规则替换到内核中。这里的规则变化是整表替换,而不是插入某一条规则。

注册 filter 表

使用的结构

linux-4.19\net\ipv4\netfilter\iptable_filter.c 文件中 Linux 内核对 iptables 的 filter 表进行了注册与注销函数的声明。

/* Register the filter table module's init and exit entry points. */
module_init(iptable_filter_init);
module_exit(iptable_filter_fini);

首先注册 nf_hook_ops 结构,该结构包含了 hook 的信息。

/* Describes one hook function to be registered at a netfilter hook point. */
struct nf_hook_ops {
	/* User fills in from here down. */
	nf_hookfn		*hook;        // function invoked at the hook point to evaluate a packet against the rules
	struct net_device	*dev;    // network device associated with the hook
	void			*priv;           // private data for the hook; NOTE(review): iptable_filter_hook in this file ignores priv and reads the table from state->net - confirm what is stored here
	u_int8_t		pf;           // protocol family the hook belongs to, e.g. NFPROTO_IPV4, NFPROTO_ARP
	unsigned int		hooknum;   // hook point this entry attaches to (used as the hook index on registration), not a count
	/* Hooks are ordered in ascending priority. */
	int			priority;       // priority of the hook
};

xt_table 是用于配置表的一个结构。

/* Template describing the IPv4 "filter" table. */
static const struct xt_table packet_filter = {
	.name		= "filter",
	.valid_hooks	= FILTER_VALID_HOOKS,   // bitmask of the hook points this table attaches to
	.me		= THIS_MODULE,
	.af		= NFPROTO_IPV4,
	.priority	= NF_IP_PRI_FILTER,        // priority
	.table_init	= iptable_filter_table_init,
};

FILTER_VALID_HOOKS 宏的表示方法。

/* Bitmask with one bit set per hook point used by the filter table:
 * LOCAL_IN, FORWARD and LOCAL_OUT. */
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
			    (1 << NF_INET_FORWARD) | \
			    (1 << NF_INET_LOCAL_OUT))

iptable 实际替换规则时所使用的结构。

/*
 * Header of a table replace request. do_replace() copies sizeof(struct
 * ipt_replace) bytes from user space first, then `size` bytes of rule
 * entries that follow the header.
 */
struct ipt_replace {
	/* Which table. */
	char name[XT_TABLE_MAXNAMELEN];

	/* Which hook entry points are valid: bitmask.  You can't
           change this. */
	unsigned int valid_hooks;

	/* Number of entries */
	unsigned int num_entries;

	/* Total size of new entries */
	unsigned int size;

	/* Hook entry points. */
	unsigned int hook_entry[NF_INET_NUMHOOKS];

	/* Underflow points. */
	unsigned int underflow[NF_INET_NUMHOOKS];

	/* Information about old entries: */
	/* Number of counters (must be equal to current number of entries). */
	unsigned int num_counters;
	/* The old entries' counters. */
	struct xt_counters __user *counters;

	/* The entries (hang off end: not really an array). */
	struct ipt_entry entries[0];
};

表示一个表所应该包含的全部信息

/*
 * In-kernel description of one table's rule set. An instance is what
 * table->private points to; the rule blob is stored in the trailing
 * entries[] member.
 */
struct xt_table_info {
	/* Size per table */
	unsigned int size;
	/* Number of entries: FIXME. --RR */
	unsigned int number;
	/* Initial number of entries. Needed for module usage count */
	unsigned int initial_entries;

	/* Entry points and underflows */
	unsigned int hook_entry[NF_INET_NUMHOOKS];
	unsigned int underflow[NF_INET_NUMHOOKS];

	/*
	 * Number of user chains. Since tables cannot have loops, at most
	 * @stacksize jumps (number of user chains) can possibly be made.
	 */
	unsigned int stacksize;
	/* Indexed by CPU id in ipt_do_table() to obtain that CPU's jump stack. */
	void ***jumpstack;

	unsigned char entries[0] __aligned(8);
};

注册流程

根据 packet_filter 声明 nf_hook_ops 结构,iptable_filter_hook 是用于匹配规则所使用的函数。

/*
 * Module init for the filter table: build the hook ops from packet_filter
 * with iptable_filter_hook as the hook function, then register the
 * per-network-namespace operations.
 * Returns the result of register_pernet_subsys().
 * NOTE: declarations and error handling are omitted from this excerpt.
 */
static int __init iptable_filter_init(void)
{

	filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);

	ret = register_pernet_subsys(&iptable_filter_net_ops);

	return ret;
}

iptable_filter_net_ops 包括了 filter 表的初始化和退出函数。register_pernet_subsys 的作用如下,意思就是使用 namespace 隔离不同的 filter table,以及 filter table 也是每个命名空间默认创建的表。

/**
 *      register_pernet_subsys - register a network namespace subsystem
 *	@ops:  pernet operations structure for the subsystem
 *
 *	Register a subsystem which has init and exit functions
 *	that are called when network namespaces are created and
 *	destroyed respectively.
 *
 *	When registered all network namespace init functions are
 *	called for every existing network namespace.  Allowing kernel
 *	modules to have a race free view of the set of network namespaces.
 *
 *	When a new network namespace is created all of the init
 *	methods are called in the order in which they were registered.
 *
 *	When a network namespace is destroyed all of the exit methods
 *	are called in the reverse of the order with which they were
 *	registered.
 */

通过以下调用关系,里面比较简单,不分析。

register_pernet_subsys(&iptable_filter_net_ops)
-> iptable_filter_net_init // 默认开启 forward
-> iptable_filter_table_init

ipt_alloc_initial_table 函数实际是一个大个儿的宏,用于根据 xt_table 结构参数申请一个 ipt_replace 结构,该结构的细节上面已经说明。

ipt_register_table 函数会把这些信息(表、hook 节点等)与 net 结构进行关联。

/*
 * Create the filter table for one network namespace: build the initial
 * ipt_replace from the packet_filter template and register it together
 * with filter_ops. The resulting table pointer is stored in
 * net->ipv4.iptable_filter.
 * Returns the result of ipt_register_table().
 * NOTE: some declarations and checks are omitted from this excerpt.
 */
static int __net_init iptable_filter_table_init(struct net *net)
{
	struct ipt_replace *repl;

	repl = ipt_alloc_initial_table(&packet_filter);

	err = ipt_register_table(net, &packet_filter, repl, filter_ops,
				 &net->ipv4.iptable_filter);

	return err;
}

此处 newinfo 信息为表示一个表所需要的全部信息,而 loc_cpu_entry 为该表所关联的 iptables 的具体规则。
经过 translate_table 对上面初始化完成的信息进行检查后,使用 xt_register_table 函数对表进行注册, 使用 nf_register_net_hooks 函数对 hook 节点进行注册。

/*
 * Register an IPv4 table and its hooks in a network namespace.
 *
 * @net:   namespace to register in
 * @table: table template
 * @repl:  initial rule set; repl->size bytes of repl->entries are copied
 * @ops:   hook ops to register for the table
 * @res:   output table pointer (not written in the portion shown here)
 *
 * Returns the result of nf_register_net_hooks().
 * NOTE: error handling is omitted from this excerpt.
 */
int ipt_register_table(struct net *net, const struct xt_table *table,
		       const struct ipt_replace *repl,
		       const struct nf_hook_ops *ops, struct xt_table **res)
{
	int ret;
	struct xt_table_info *newinfo;
	/* Zeroed placeholder handed to xt_register_table() as the initial private info. */
	struct xt_table_info bootstrap = {0};
	void *loc_cpu_entry;
	struct xt_table *new_table;

	newinfo = xt_alloc_table_info(repl->size);

	/* Copy the initial rules into the newly allocated table info. */
	loc_cpu_entry = newinfo->entries;
	memcpy(loc_cpu_entry, repl->entries, repl->size);

	ret = translate_table(net, newinfo, loc_cpu_entry, repl);

	new_table = xt_register_table(net, table, &bootstrap, newinfo);

	/* hweight32() of the valid_hooks bitmask is passed as the number of ops to register. */
	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));

	return ret;
}

初始化传入的 xt_table_info 结构,其中 xt_replace_table 函数的作用就是将 newinfo 赋给 table->private 。
最后将 table 链入 net 的 xt 结构。其 tables 数组的每个成员就是不同协议族的 iptables 表结构。

/*
 * Duplicate input_table, install newinfo as its private rule set, and add
 * the copy to the namespace's per-family table list.
 *
 * Returns the newly created table.
 * NOTE: locking, error paths and the `unlock` label targeted by the goto
 * are not shown in this excerpt.
 */
struct xt_table *xt_register_table(struct net *net,
				   const struct xt_table *input_table,
				   struct xt_table_info *bootstrap,
				   struct xt_table_info *newinfo)
{
	int ret;
	struct xt_table_info *private;
	struct xt_table *t, *table;

	/* Don't add one object to multiple lists. */
	table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);

	/* Simplifies replace_table code. */
	table->private = bootstrap;

	/* Swap the bootstrap placeholder for the real rule set. */
	if (!xt_replace_table(table, 0, newinfo, &ret))
		goto unlock;

	private = table->private;

	/* save number of initial entries */
	private->initial_entries = private->number;

	/* Link the table into the list for its address family. */
	list_add(&table->list, &net->xt.tables[table->af]);

	return table;
}

nf_register_net_hooks // 为每一个 hook 节点都在 net 结构上注册
->nf_register_net_hook // 根据不同协议族带入不同参数,向 net 结构注册 filter_ops 结构
->__nf_register_net_hook

在 net 结构的 nf 成员(例如 net->nf.hooks_ipv4 + hooknum )下找到对应协议族的 hook 链(数组表示的)的头部。 初始化一个新的 hook 条目插入到对应的 hook 链中。

/*
 * Insert the hook described by reg into the hook array selected by
 * (net, pf, reg->hooknum, reg->dev).
 *
 * Returns 0.
 * NOTE: error handling and the release of nf_hook_mutex are not shown in
 * this excerpt.
 */
static int __nf_register_net_hook(struct net *net, int pf,
				  const struct nf_hook_ops *reg)
{
	struct nf_hook_entries *p, *new_hooks;
	struct nf_hook_entries __rcu **pp;

	/* Locate the slot holding the hook array for this family/hook point. */
	pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);

	mutex_lock(&nf_hook_mutex);

	// Fetch the current nf_hook_entries from the RCU-protected pointer.
	p = nf_entry_dereference(*pp);
	
	/* Build a new array containing the existing hooks plus reg. */
	new_hooks = nf_hook_entries_grow(p, reg);

		// Publish the newly built hook array through the RCU pointer.
		rcu_assign_pointer(*pp, new_hooks);

	hooks_validate(new_hooks);

	return 0;
}

报文匹配

从 net->nf.hooks_ipv4[hook] 取出对应的 nf_hook_entries 类型的 head 用于接下来的 hook 遍历查找。
nf_hook_state_init 用于初始化 state,最后调用 hook 函数的时候会根据这个参数的各个状态决定规则的匹配。

/*
 * Run the hooks registered for (pf, hook) in the given namespace on skb.
 *
 * Returns 1 when no hook array is installed for the hook point; otherwise
 * returns the result of nf_hook_slow().
 *
 * NOTE: this excerpt elides the remaining protocol families and the end of
 * the switch ("..."), and the rcu_read_lock() that pairs with the
 * rcu_read_unlock() below is not shown.
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
			  struct sock *sk, struct sk_buff *skb,
			  struct net_device *indev, struct net_device *outdev,
			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
	struct nf_hook_entries *hook_head = NULL;
	int ret = 1;

	/* Pick the per-family hook array for this hook point. */
	switch (pf) {
	case NFPROTO_IPV4:
		hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
		break;
	case NFPROTO_IPV6:
		hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
		break;
    ...

	if (hook_head) {
		struct nf_hook_state state;

		/* Bundle the call context that is handed to every hook function. */
		nf_hook_state_init(&state, hook, pf, indev, outdev,
				   sk, net, okfn);

		/* Walk the hook array starting at index 0. */
		ret = nf_hook_slow(skb, &state, hook_head, 0);
	}
	rcu_read_unlock();

	return ret;
}

nf_hook_entry_hookfn 调用具体 xt_hook_ops_alloc 注册的 hook 函数,对参数得出判断进而返回 accept、drop 等。

/*
 * Call the hook functions in e, starting at index s, until one of them
 * returns a verdict other than NF_ACCEPT.
 *
 * Returns:
 *   1  - every hook accepted the packet;
 *   the error taken from NF_DROP_GETERR(verdict), or -EPERM when that is 0,
 *        after freeing skb, if a hook dropped the packet;
 *   the result of nf_queue() for NF_QUEUE, unless it is 1, in which case
 *        traversal continues with the next hook;
 *   0  - for any other verdict.
 */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
		 const struct nf_hook_entries *e, unsigned int s)
{
	unsigned int verdict;
	int ret;

	for (; s < e->num_hook_entries; s++) {
		verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
		switch (verdict & NF_VERDICT_MASK) {
		case NF_ACCEPT:
			/* Move on to the next hook. */
			break;
		case NF_DROP:
			kfree_skb(skb);
			ret = NF_DROP_GETERR(verdict);
			if (ret == 0)
				ret = -EPERM;
			return ret;
		case NF_QUEUE:
			ret = nf_queue(skb, state, e, s, verdict);
			if (ret == 1)
				continue;
			return ret;
		default:
			/* Implicit handling for NF_STOLEN, as well as any other
			 * non conventional verdicts.
			 */
			return 0;
		}
	}

	return 1;
}

hook 函数

规则是挂在 table->private 上的,而匹配规则所使用的匹配函数却是挂在 hook 节点上的。

/*
 * Hook function of the filter table: evaluates skb against the filter
 * table of the namespace found in state. priv is not used here.
 * Returns the verdict produced by ipt_do_table().
 */
static unsigned int
iptable_filter_hook(void *priv, struct sk_buff *skb,
		    const struct nf_hook_state *state)
{
	return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}

/*
 * Evaluate skb against the rules of table for the hook point given in
 * state->hook.
 *
 * Starting at the table's entry point for the hook, each rule is checked
 * against the IP header and its extension matches. On a match the rule's
 * counter is updated and its target decides what happens next: continue
 * with the next rule, jump/return between chains via the per-CPU jump
 * stack, or end traversal with a verdict.
 *
 * Returns NF_DROP if a match or target set acpar.hotdrop, otherwise the
 * verdict reached by the traversal.
 */
unsigned int
ipt_do_table(struct sk_buff *skb,
	     const struct nf_hook_state *state,
	     struct xt_table *table)
{
	unsigned int hook = state->hook;    // which hook point is being evaluated
	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
	const struct iphdr *ip;
	/* Initializing verdict to NF_DROP keeps gcc happy. */
	unsigned int verdict = NF_DROP;
	const char *indev, *outdev;
	const void *table_base;
	struct ipt_entry *e, **jumpstack;
	unsigned int stackidx, cpu;
	const struct xt_table_info *private;
	struct xt_action_param acpar;
	unsigned int addend;

	/* Initialization */
	stackidx = 0;
	ip = ip_hdr(skb);
	/* Fall back to the empty name when no in/out device is set. */
	indev = state->in ? state->in->name : nulldevname;
	outdev = state->out ? state->out->name : nulldevname;
	/* We handle fragments by dealing with the first fragment as
	 * if it was a normal packet.  All other fragments are treated
	 * normally, except that they will NEVER match rules that ask
	 * things we don't know, ie. tcp syn flag or ports).  If the
	 * rule is also a fragment-specific rule, non-fragments won't
	 * match it. */
	acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
	acpar.thoff   = ip_hdrlen(skb);
	acpar.hotdrop = false;
	acpar.state   = state;

	/* The table must have been declared valid for this hook point. */
	WARN_ON(!(table->valid_hooks & (1 << hook)));
	local_bh_disable();
	addend = xt_write_recseq_begin();
	private = READ_ONCE(table->private); /* Address dependency. */
	cpu        = smp_processor_id();
	table_base = private->entries;
	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];

	/* Switch to alternate jumpstack if we're being invoked via TEE.
	 * TEE issues XT_CONTINUE verdict on original skb so we must not
	 * clobber the jumpstack.
	 *
	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
	 * but it is no problem since absolute verdict is issued by these.
	 */
	if (static_key_false(&xt_tee_enabled))
		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);

	/* First rule of the chain attached to this hook point. */
	e = get_entry(table_base, private->hook_entry[hook]);

	do {
		const struct xt_entry_target *t;
		const struct xt_entry_match *ematch;
		struct xt_counters *counter;

		WARN_ON(!e);
		/* Check the IP header part of the rule first. */
		if (!ip_packet_match(ip, indev, outdev,
		    &e->ip, acpar.fragoff)) {
 no_match:
			e = ipt_next_entry(e);
			continue;
		}

		/* Every extension match of the rule must succeed. */
		xt_ematch_foreach(ematch, e) {
			acpar.match     = ematch->u.kernel.match;
			acpar.matchinfo = ematch->data;
			if (!acpar.match->match(skb, &acpar))
				goto no_match;
		}

		/* The rule matched: account the packet on this CPU's counter. */
		counter = xt_get_this_cpu_counter(&e->counters);
		ADD_COUNTER(*counter, skb->len, 1);

		t = ipt_get_target_c(e);
		WARN_ON(!t->u.kernel.target);

#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
		/* The packet is traced: log it */
		if (unlikely(skb->nf_trace))
			trace_packet(state->net, skb, hook, state->in,
				     state->out, table->name, private, e);
#endif
		/* Standard target? */
		if (!t->u.kernel.target->target) {
			int v;

			v = ((struct xt_standard_target *)t)->verdict;
			if (v < 0) {
				/* Pop from stack? */
				if (v != XT_RETURN) {
					/* Negative values other than XT_RETURN encode a final verdict. */
					verdict = (unsigned int)(-v) - 1;
					break;
				}
				if (stackidx == 0) {
					/* Nothing to return to: continue at the hook's underflow entry. */
					e = get_entry(table_base,
					    private->underflow[hook]);
				} else {
					/* Resume after the rule that performed the jump. */
					e = jumpstack[--stackidx];
					e = ipt_next_entry(e);
				}
				continue;
			}
			/* Non-negative v is an offset into the table. Remember the
			 * current rule for a later return unless the offset is simply
			 * the next rule or the rule is flagged IPT_F_GOTO. */
			if (table_base + v != ipt_next_entry(e) &&
			    !(e->ip.flags & IPT_F_GOTO)) {
				if (unlikely(stackidx >= private->stacksize)) {
					verdict = NF_DROP;
					break;
				}
				jumpstack[stackidx++] = e;
			}

			e = get_entry(table_base, v);
			continue;
		}

		/* Extension target: let the target function decide. */
		acpar.target   = t->u.kernel.target;
		acpar.targinfo = t->data;

		verdict = t->u.kernel.target->target(skb, &acpar);
		if (verdict == XT_CONTINUE) {
			/* Target might have changed stuff. */
			ip = ip_hdr(skb);
			e = ipt_next_entry(e);
		} else {
			/* Verdict */
			break;
		}
	} while (!acpar.hotdrop);

	xt_write_recseq_end(addend);
	local_bh_enable();

	/* hotdrop overrides whatever verdict the traversal produced. */
	if (acpar.hotdrop)
		return NF_DROP;
	else return verdict;
}

TODO

match
target

资料

https://segmentfault.com/a/1190000019449845
https://segmentfault.com/a/1190000019455385?utm_source=sf-similar-article
https://segmentfault.com/a/1190000019470135?utm_source=sf-similar-article
https://segmentfault.com/a/1190000019540796?utm_source=sf-similar-article
https://segmentfault.com/a/1190000019605260?utm_source=sf-similar-article

你可能感兴趣的:(linux,networking,网络,linux)