本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn:
[email protected]
来源: http://yfydz.cublog.cn
1. 前言
netfilter中的防火墙规则是通过用户层的iptables命令来进行编辑的。而规则都是从属于某个表的(见我以前关于netfilter新表的文章)。一般在mangle表对数据进行修改,在nat表对数据进行NAT,在filter表进行过滤。所不同的是NAT表中的规则只对新包(NEW/RELATED)进行处理,而MANGLE和FILTER表中的规则对所有数据包都处理。
以下Linux内核代码版本为2.4.26。
2. 数据结构
每条规则是用结构struct ipt_entry来定义的:
/* include/linux/netfilter_ipv4/ip_tables.h */
/*
 * One rule as stored in a table's rule blob: a fixed header followed by
 * a variable number of matches and, last, the target (the rule's action).
 */
struct ipt_entry
{
/* Basic match: protocol, source/destination address and mask,
 * incoming/outgoing interface. */
struct ipt_ip ip;
/* Mark with fields that we care about. */
unsigned int nfcache;
/* Size of ipt_entry + matches */
/* i.e. the offset, from the start of this entry, of the target. */
u_int16_t target_offset;
/* Size of ipt_entry + matches + target */
/* i.e. the offset, from the start of this entry, of the next rule. */
u_int16_t next_offset;
/* Back pointer */
/* Return point of the rule. */
unsigned int comefrom;
/* Packet and byte counters. */
struct ipt_counters counters;
/* The matches (if any), then the target. */
unsigned char elems[0];
};
参数说明:
struct ipt_ip ip:基本匹配项,包括协议、源地址/掩码、目的地址/掩码、进入网卡、出网卡等
unsigned int nfcache:标志项
u_int16_t target_offset:规则动作的偏移位置
u_int16_t next_offset:下一个规则的偏移位置
unsigned int comefrom:规则返回点
struct ipt_counters counters:计数器
unsigned char elems[0]:规则匹配项表,最后是动作项
ipt_ip结构:
/*
 * Basic IP-level match criteria carried by every rule (the `ip` member
 * of struct ipt_entry).
 */
struct ipt_ip {
/* Source and destination IP addr */
struct in_addr src, dst;
/* Mask for src and dest IP addr */
struct in_addr smsk, dmsk;
/* Incoming and outgoing interface names, and a mask for each. */
char iniface[IFNAMSIZ], outiface[IFNAMSIZ];
unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ];
/* Protocol, 0 = ANY */
u_int16_t proto;
/* Flags word */
u_int8_t flags;
/* Inverse flags */
u_int8_t invflags;
};
规则中的匹配项结构,注意这不是描述匹配的结构struct ipt_match
/*
 * A match item as embedded inside a rule.  This is not struct ipt_match,
 * which describes the match itself.
 */
struct ipt_entry_match
{
union {
/* View used by userspace (iptables): only the name is supplied. */
struct {
u_int16_t match_size;
/* Used by userspace */
char name[IPT_FUNCTION_MAXNAMELEN];
} user;
/* View used inside the kernel: points at the concrete match module. */
struct {
u_int16_t match_size;
/* Used inside the kernel */
struct ipt_match *match;
} kernel;
/* Total length */
u_int16_t match_size;
} u;
/* Trailing variable-length data belonging to this match. */
unsigned char data[0];
};
规则中的目标(规则动作)项结构,注意这不是描述目标的结构struct ipt_target
/*
 * A target (rule action) item as embedded inside a rule.  This is not
 * struct ipt_target, which describes the target itself.
 */
struct ipt_entry_target
{
union {
/* View used by userspace (iptables): only the name is supplied. */
struct {
u_int16_t target_size;
/* Used by userspace */
char name[IPT_FUNCTION_MAXNAMELEN];
} user;
/* View used inside the kernel: points at the concrete target module. */
struct {
u_int16_t target_size;
/* Used inside the kernel */
struct ipt_target *target;
} kernel;
/* Total length */
u_int16_t target_size;
} u;
/* Trailing variable-length data; passed to the target function as
 * t->data by ipt_do_table(). */
unsigned char data[0];
};
3. 规则集操作函数
netfilter处理规则的基本函数为ipt_do_table():filter/mangle表的处理最终都要进入该函数,而nat表只有NEW/RELATED的包才进入该函数。该函数遍历所定义的规则集,顺次进行匹配,一旦和规则的条件匹配成功,则按规则指定的动作返回,返回值可能为NF_ACCEPT/NF_DROP/NF_QUEUE/NF_STOLEN等。
/* net/ipv4/netfilter/ip_tables.c */
/*
 * ipt_do_table - run a packet through the rules of one table at one hook.
 *
 * pskb:     pointer to the packet; a target may change the packet, so the
 *           header pointers are re-read after every target call
 * hook:     hook number whose chain is traversed
 * in, out:  input/output devices, either may be NULL
 * table:    table whose rule set is walked (under table->lock, read side)
 * userdata: passed through untouched to the target function
 *
 * Rules are tried in order starting at the hook's entry point.  Returns the
 * verdict produced by the first rule that yields one, NF_DROP if a match
 * set hotdrop, or always NF_ACCEPT when DEBUG_ALLOW_ALL is defined.
 */
unsigned int
ipt_do_table(struct sk_buff **pskb,
unsigned int hook,
const struct net_device *in,
const struct net_device *out,
struct ipt_table *table,
void *userdata)
{
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))) = { 0 };
u_int16_t offset;
struct iphdr *ip;
void *protohdr;
u_int16_t datalen;
int hotdrop = 0;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
void *table_base;
struct ipt_entry *e, *back;
/* Initialization */
ip = (*pskb)->nh.iph;
/* ihl counts 32-bit words: protohdr is the first byte after the IP
 * header and datalen is the packet length without that header. */
protohdr = (u_int32_t *)ip + ip->ihl;
datalen = (*pskb)->len - ip->ihl * 4;
/* If the packet's input or output device is NULL, nulldevname stands in
 * for its name during rule matching. */
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
offset = ntohs(ip->frag_off) & IP_OFFSET;
read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
/* Locate the start of the rule set.  A table may define rules at several
 * hook points, but all of them are stored together in one array; the
 * offset added here is selected by the current CPU's number. */
table_base = (void *)table->private->entries
+ TABLE_OFFSET(table->private,
cpu_number_map(smp_processor_id()));
/* First rule for this hook. */
e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
/* Check noone else using our table */
if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
&& ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
smp_processor_id(),
table->name,
&((struct ipt_entry *)table_base)->comefrom,
((struct ipt_entry *)table_base)->comefrom);
}
((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
#endif
/* Last rule of this hook's rule set: it is the chain's default action,
 * either accept everything or reject everything. */
/* For return from builtin chain */
back = get_entry(table_base, table->private->underflow[hook]);
/* This looks like an endless loop, but because the last rule is the
 * chain's default action (accept all or reject all) the loop is always
 * left, barring something unexpected. */
do {
IP_NF_ASSERT(e);
IP_NF_ASSERT(back);
(*pskb)->nfcache |= e->nfcache;
/* Match the basic elements (those defined in struct ipt_ip) first; only
 * if they match are the remaining matches tried. */
if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
struct ipt_entry_target *t;
/* Run every individual match attached to the rule; a non-zero result
 * means the rule does not match. */
if (IPT_MATCH_ITERATE(e, do_match,
*pskb, in, out,
offset, protohdr,
datalen, &hotdrop) != 0)
goto no_match;
/* All conditions matched: bump the rule's counters. */
ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
/* Fetch the rule's target. */
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
/* Standard target? */
if (!t->u.kernel.target->target) {
int v;
/* Standard target: a final verdict is stored as a negative number, e.g.
 * ACCEPT is encoded as -NF_ACCEPT-1 and DROP as -NF_DROP-1. */
v = ((struct ipt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
if (v != IPT_RETURN) {
/* Convert the encoded value back to the real verdict. */
verdict = (unsigned)(-v) - 1;
break;
}
/* IPT_RETURN: go back to the chain we came from and keep looping. */
e = back;
back = get_entry(table_base,
back->comefrom);
continue;
}
/* v >= 0 is an offset from table_base to jump to.  Unless that is simply
 * the entry following this one, remember where to return to. */
if (table_base + v
!= (void *)e + e->next_offset) {
/* Save old back ptr in next entry */
struct ipt_entry *next
= (void *)e + e->next_offset;
next->comefrom
= (void *)back - table_base;
/* set back pointer to next entry */
back = next;
}
e = get_entry(table_base, v);
} else {
/* Not a standard target: the target is a separately defined module. */
/* Targets which reenter must return
abs. verdicts */
#ifdef CONFIG_NETFILTER_DEBUG
((struct ipt_entry *)table_base)->comefrom
= 0xeeeeeeec;
#endif
/* Call the target module's target() function. */
verdict = t->u.kernel.target->target(pskb,
hook,
in, out,
t->data,
userdata);
#ifdef CONFIG_NETFILTER_DEBUG
if (((struct ipt_entry *)table_base)->comefrom
!= 0xeeeeeeec
&& verdict == IPT_CONTINUE) {
printk("Target %s reentered!\n",
t->u.kernel.target->name);
verdict = NF_DROP;
}
((struct ipt_entry *)table_base)->comefrom
= 0x57acc001;
#endif
/* The target may have modified the packet, and *pskb may no longer be the
 * original packet but a copy, so the header-derived values must be
 * recomputed. */
/* Target might have changed stuff. */
ip = (*pskb)->nh.iph;
protohdr = (u_int32_t *)ip + ip->ihl;
datalen = (*pskb)->len - ip->ihl * 4;
if (verdict == IPT_CONTINUE)
/* IPT_CONTINUE: carry on with the next rule.
 * Note that IPT_RETURN is not supported here. */
e = (void *)e + e->next_offset;
else
/* Verdict */
break;
}
} else {
/* Rule did not match: move on to the next rule. */
no_match:
e = (void *)e + e->next_offset;
}
/* Match modules receive &hotdrop and may set it to have the packet
 * dropped, although normally a match does not drop packets. */
} while (!hotdrop);
#ifdef CONFIG_NETFILTER_DEBUG
((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
#endif
read_unlock_bh(&table->lock);
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
#else
if (hotdrop)
return NF_DROP;
else return verdict;
#endif
}
4. 规则的修改
netfilter本质上是以数组方式保存规则集的,而每条规则的大小可能各不相同,因此编辑规则时的操作比较麻烦。iptables的各种编辑规则的命令,实际上都是替换操作:IPT_SO_SET_REPLACE,对应的处理函数为do_replace()。
/* net/ipv4/netfilter/ip_tables.c */
/*
 * do_replace - handler for IPT_SO_SET_REPLACE: replace a table's whole
 * rule set with one supplied by userspace.
 *
 * user: userspace buffer holding a struct ipt_replace header immediately
 *       followed by tmp.size bytes of rule entries
 * len:  total length of that buffer
 *
 * Returns 0 on success or a negative errno.  On success the old rule set
 * is destroyed and its counters are copied back to tmp.counters; a
 * failure of that final copy is deliberately ignored.
 *
 * Every size in the header is user-controlled, so all arithmetic on
 * tmp.size and tmp.num_counters is checked for wrap-around before it is
 * used for an allocation or a copy.
 */
static int
do_replace(void *user, unsigned int len)
{
	int ret;
	unsigned int aligned_size;
	struct ipt_replace tmp;
	struct ipt_table *t;
	struct ipt_table_info *newinfo, *oldinfo;
	struct ipt_counters *counters;

	/* Fetch the header describing the new rule set. */
	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
		return -EFAULT;

	/* NOTE(review): tmp.name is later used as a string and comes straight
	 * from userspace -- confirm it is NUL-terminated somewhere. */

	/* Hack: Causes ipchains to give correct error msg --RR */
	/* Length check, done by subtraction so that a huge tmp.size cannot
	 * wrap sizeof(tmp) + tmp.size around to a small value. */
	if (len < sizeof(tmp) || len - sizeof(tmp) != tmp.size)
		return -ENOPROTOOPT;

	/* Overflow checks.  Rounding tmp.size up must not wrap, and one
	 * aligned copy per CPU plus the info header must still fit in
	 * 32 bits.  (Assumes SMP_ALIGN() rounds its argument up to a
	 * cache-line multiple -- TODO confirm against its definition.) */
	aligned_size = SMP_ALIGN(tmp.size);
	if (aligned_size < tmp.size)
		return -ENOMEM;
	if (aligned_size
	    > (~0U - sizeof(struct ipt_table_info)) / smp_num_cpus)
		return -ENOMEM;
	/* The counter array size must not wrap either. */
	if (tmp.num_counters > ~0U / sizeof(struct ipt_counters))
		return -ENOMEM;

	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
	if ((aligned_size >> PAGE_SHIFT) + 2 > num_physpages)
		return -ENOMEM;

	/* Allocate the real rule-set memory: one aligned copy per CPU. */
	newinfo = vmalloc(sizeof(struct ipt_table_info)
			  + aligned_size * smp_num_cpus);
	if (!newinfo)
		return -ENOMEM;

	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
			   tmp.size) != 0) {
		ret = -EFAULT;
		goto free_newinfo;
	}

	/* Space for the old rule set's counters, to be handed back to
	 * userspace. */
	counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
	if (!counters) {
		ret = -ENOMEM;
		goto free_newinfo;
	}
	memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));

	/* Translate the rules and check that they are valid. */
	ret = translate_table(tmp.name, tmp.valid_hooks,
			      newinfo, tmp.size, tmp.num_entries,
			      tmp.hook_entry, tmp.underflow);
	if (ret != 0)
		goto free_newinfo_counters;

	duprintf("ip_tables: Translated table\n");

	/* Look up the ipt_table being replaced.  On success ipt_mutex is
	 * presumably held on return: every later exit path releases it. */
	t = find_table_lock(tmp.name, &ret, &ipt_mutex);
	if (!t)
		goto free_newinfo_counters_untrans;

	/* You lied! */
	if (tmp.valid_hooks != t->valid_hooks) {
		duprintf("Valid hook crap: %08X vs %08X\n",
			 tmp.valid_hooks, t->valid_hooks);
		ret = -EINVAL;
		goto free_newinfo_counters_untrans_unlock;
	}

	/* Swap the new rule set in for the old one. */
	oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
	if (!oldinfo)
		goto free_newinfo_counters_untrans_unlock;

	/* Update module usage count based on number of rules */
	duprintf("do_replace: ldnum=%u, initnum=%u, newnum=%u\n",
		 oldinfo->number, oldinfo->initial_entries, newinfo->number);
	/* The count only changes when the number of rules crosses
	 * initial_entries in either direction. */
	if (t->me && (oldinfo->number <= oldinfo->initial_entries) &&
	    (newinfo->number > oldinfo->initial_entries))
		__MOD_INC_USE_COUNT(t->me);
	else if (t->me && (oldinfo->number > oldinfo->initial_entries) &&
		 (newinfo->number <= oldinfo->initial_entries))
		__MOD_DEC_USE_COUNT(t->me);

	/* Get the old counters. */
	get_counters(oldinfo, counters);

	/* Decrease module usage counts and free resource */
	/* Runs cleanup_entry over every rule of the old set. */
	IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
	vfree(oldinfo);

	/* Silent error: too late now. */
	/* Hand the old counters back to userspace. */
	copy_to_user(tmp.counters, counters,
		     sizeof(struct ipt_counters) * tmp.num_counters);
	vfree(counters);
	up(&ipt_mutex);
	return 0;

free_newinfo_counters_untrans_unlock:
	up(&ipt_mutex);
free_newinfo_counters_untrans:
	IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
free_newinfo_counters:
	vfree(counters);
free_newinfo:
	vfree(newinfo);
	return ret;
}
处理过程中比较重要的两个函数为translate_table()和replace_table(),也都在ip_tables.c中定义:
/*
 * translate_table - validate a rule blob already copied into newinfo and
 * prepare it for use.
 *
 * name:         table name, handed to check_entry
 * valid_hooks:  bitmask of hooks that must have an entry point and an
 *               underflow
 * newinfo:      table info whose entries[] holds the first copy of the rules
 * size:         size in bytes of one copy of the rules
 * number:       number of rules the caller claims the blob contains
 * hook_entries: per-hook entry offsets supplied by the caller
 * underflows:   per-hook underflow offsets supplied by the caller
 *
 * Returns 0 on success; -EINVAL for a wrong rule count or a valid hook
 * left without entry/underflow; -ELOOP when mark_source_chains() rejects
 * the chains; otherwise the error reported by the per-entry checks.
 */
static int
translate_table(const char *name,
		unsigned int valid_hooks,
		struct ipt_table_info *newinfo,
		unsigned int size,
		unsigned int number,
		const unsigned int *hook_entries,
		const unsigned int *underflows)
{
	unsigned int hook;
	unsigned int seen;
	unsigned int cpu;
	int err;

	newinfo->size = size;
	newinfo->number = number;

	/* Start every hook off at an impossible offset so that unassigned
	 * ones can be recognised below. */
	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
		newinfo->hook_entry[hook] = 0xFFFFFFFF;
		newinfo->underflow[hook] = 0xFFFFFFFF;
	}

	duprintf("translate_table: size %u\n", newinfo->size);

	/* First pass: walk the entries checking sizes and offsets, counting
	 * them as we go. */
	seen = 0;
	err = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
				check_entry_size_and_hooks,
				newinfo,
				newinfo->entries,
				newinfo->entries + size,
				hook_entries, underflows, &seen);
	if (err != 0)
		return err;

	if (seen != number) {
		duprintf("translate_table: %u not %u entries\n",
			 seen, number);
		return -EINVAL;
	}

	/* Every valid hook must have been given both an entry point and an
	 * underflow by the pass above. */
	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
		if (!(valid_hooks & (1 << hook)))
			continue;

		if (newinfo->hook_entry[hook] == 0xFFFFFFFF) {
			duprintf("Invalid hook entry %u %u\n",
				 hook, hook_entries[hook]);
			return -EINVAL;
		}
		if (newinfo->underflow[hook] == 0xFFFFFFFF) {
			duprintf("Invalid underflow %u %u\n",
				 hook, underflows[hook]);
			return -EINVAL;
		}
	}

	/* Reject user-defined chains that form a loop. */
	if (!mark_source_chains(newinfo, valid_hooks))
		return -ELOOP;

	/* Second pass: per-entry sanity checks.  If one fails, the entries
	 * counted so far are cleaned up again. */
	seen = 0;
	err = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
				check_entry, name, size, &seen);
	if (err != 0) {
		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
				  cleanup_entry, &seen);
		return err;
	}

	/* Each CPU has its own copy of the rules: duplicate the first copy
	 * into the slot of every other CPU. */
	for (cpu = 1; cpu < smp_num_cpus; cpu++) {
		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*cpu,
		       newinfo->entries,
		       SMP_ALIGN(newinfo->size));
	}

	return 0;
}
/*
 * replace_table - atomically install newinfo as the table's rule set.
 *
 * table:        table to update
 * num_counters: number of rules the caller believes the current set has
 * newinfo:      replacement rule set
 * error:        receives -EAGAIN when num_counters does not match
 *
 * Returns the previous rule set, which the caller then owns, or NULL with
 * *error set when the current set is not the size the caller expected.
 */
static struct ipt_table_info *
replace_table(struct ipt_table *table,
	      unsigned int num_counters,
	      struct ipt_table_info *newinfo,
	      int *error)
{
	struct ipt_table_info *previous = NULL;

#ifdef CONFIG_NETFILTER_DEBUG
	{
		unsigned int cpu;

		/* Stamp the first entry of every per-CPU copy with the
		 * value ipt_do_table() expects to find when idle. */
		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
			struct ipt_entry *first =
				(void *)newinfo->entries
				+ TABLE_OFFSET(newinfo, cpu);

			first->comefrom = 0xdead57ac;
		}
	}
#endif

	/* Do the substitution. */
	write_lock_bh(&table->lock);

	/* Check inside lock: is the old number correct? */
	if (num_counters == table->private->number) {
		/* table->private points at the rule set: keep the old
		 * pointer for the caller and switch to the new set. */
		previous = table->private;
		table->private = newinfo;
		newinfo->initial_entries = previous->initial_entries;
	} else {
		duprintf("num_counters != table->private->number (%u/%u)\n",
			 num_counters, table->private->number);
		*error = -EAGAIN;
	}

	write_unlock_bh(&table->lock);
	return previous;
}
5. 结论
netfilter的规则是数组方式顺序保存,但每个元素(规则)的大小是不同的,每条规则除了基本部分相同外,还包括不同数量的匹配和目标项。规则匹配是顺序匹配,而编辑时实际上是将整个规则集全部替换。
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/22214587/viewspace-709498/,如需转载,请注明出处,否则将追究法律责任。