iptables是linux下常用的一个防火墙软件,可以实现对网络访问的各种限制。iptables相当于防火墙的客户端,负责与用户进行交互,其后台依赖于内核的netfilter模块。iptables的各种配置,最终都是由netfilter模块来实现的。
iptables分为4个表:filter表,nat表,raw表,mangle表。filter表为默认表。如下面的指令,不指定表名,默认操作的是filter表。
iptables -I INPUT -s 1.2.3.4 -j ACCEPT
再如下面的指令,操作的是nat表,需要用-t选项指明操作的表名。
iptables -t nat -A PREROUTING -d 1.2.3.4 -p tcp -m tcp --dport 81 -j DNAT --to-destination 192.168.0.2:8180
ipv4 filter的初始化函数是iptable_filter_init,代码在net/ipv4/netfilter/iptable_filter.c文件中。
/*
 * Module init for the IPv4 "filter" table.
 *
 * Allocates the hook-ops array for packet_filter (every element uses
 * iptable_filter_hook as its callback) and then registers the per-netns
 * operations. If that registration fails, the hook-ops array is released
 * again before the error is returned.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init iptable_filter_init(void)
{
	int err;

	/* One nf_hook_ops per valid hook of packet_filter. */
	filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
	if (IS_ERR(filter_ops))
		return PTR_ERR(filter_ops);

	err = register_pernet_subsys(&iptable_filter_net_ops);
	if (err >= 0)
		return err;

	/* Registration failed: nothing else owns filter_ops yet. */
	kfree(filter_ops);
	return err;
}
packet_filter的定义如下:
/*
 * Bitmask of the netfilter hook points the filter table attaches to:
 * LOCAL_IN, FORWARD and LOCAL_OUT (three bits set).
 */
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
/*
 * Template describing the IPv4 "filter" table: its name, the hooks it is
 * valid on, the owning module, the protocol family, the hook priority and
 * the function used to initialise the table.
 */
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER,
.table_init = iptable_filter_table_init,
};
xt_hook_ops_alloc的处理逻辑是根据valid_hooks计算需要添加几个钩子处理函数。由FILTER_VALID_HOOKS的定义可知,其3个bit位是1,因此num_hooks为3。调用kcalloc申请三个nf_hook_ops对象,并分别赋值。
xt_hook_ops_alloc代码如下:
/*
 * Allocate one nf_hook_ops per bit set in table->valid_hooks.
 *
 * @table: table template supplying valid_hooks, af and priority
 * @fn:    hook callback stored in every allocated element
 *
 * Each element gets the callback, the table's protocol family, the hook
 * number matching the bit position, and the table's priority.
 *
 * Returns the kcalloc'ed array, ERR_PTR(-EINVAL) when no hook bit is set,
 * or ERR_PTR(-ENOMEM) when the allocation fails.
 */
struct nf_hook_ops *
xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
{
	unsigned int mask = table->valid_hooks;
	/* Number of set bits in the mask; 3 for packet_filter. */
	uint8_t count = hweight32(mask);
	uint8_t filled = 0;
	uint8_t bit = 0;
	struct nf_hook_ops *ops;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	ops = kcalloc(count, sizeof(*ops), GFP_KERNEL);
	if (!ops)
		return ERR_PTR(-ENOMEM);

	/* Scan the mask from bit 0 upwards, filling one slot per set bit. */
	while (filled < count && mask != 0) {
		if (mask & 1) {
			struct nf_hook_ops *slot = &ops[filled];

			slot->hook = fn;
			slot->pf = table->af;
			slot->hooknum = bit;
			slot->priority = table->priority;
			++filled;
		}
		mask >>= 1;
		++bit;
	}

	return ops;
}
iptable_filter_net_ops的初始化方法是,iptable_filter_net_init,最后调到iptable_filter_table_init。
/*
 * Create and register the filter table for one network namespace.
 *
 * Does nothing (returns 0) when net->ipv4.iptable_filter is already set.
 * Otherwise it builds the initial ruleset, sets the verdict of entry 1
 * (the FORWARD hook) according to the `forward` setting, registers the
 * table, and frees the temporary ruleset blob.
 *
 * Returns 0 on success, -ENOMEM if the initial ruleset cannot be
 * allocated, or the result of ipt_register_table().
 */
static int __net_init iptable_filter_table_init(struct net *net)
{
	struct ipt_standard *builtin;
	struct ipt_replace *repl;
	int err;

	if (net->ipv4.iptable_filter)
		return 0;

	repl = ipt_alloc_initial_table(&packet_filter);
	if (!repl)
		return -ENOMEM;

	/* Entry 1 is the FORWARD hook */
	builtin = (struct ipt_standard *)repl->entries;
	if (forward)
		builtin[1].target.verdict = -NF_ACCEPT - 1;
	else
		builtin[1].target.verdict = -NF_DROP - 1;

	err = ipt_register_table(net, &packet_filter, repl, filter_ops,
				 &net->ipv4.iptable_filter);
	/* repl is only a template; it is freed on success and failure alike. */
	kfree(repl);
	return err;
}
将ipt_alloc_initial_table展开后,代码如下:
/*
 * Build the initial ruleset blob for a table.
 *
 * The blob is a struct ipt_replace header followed by one ipt_standard
 * entry per valid hook and a single trailing ipt_error entry. Every
 * standard entry is initialised with IPT_STANDARD_INIT(NF_ACCEPT).
 *
 * @info: table template; its valid_hooks and name are used.
 *
 * Returns the kzalloc'ed blob, or NULL if the allocation fails.
 */
void *ipt_alloc_initial_table(const struct xt_table *info)
{
unsigned int hook_mask = info->valid_hooks;
unsigned int nhooks = hweight32(hook_mask); /* 3 for the filter table */
unsigned int bytes = 0, hooknum = 0, i = 0;
/* In-memory layout of the blob: header, then nhooks standard entries. */
struct {
struct ipt_replace repl;
struct ipt_standard entries[];
} *tbl;
struct ipt_error *term;
/* Offset just past entries[nhooks], rounded up to the alignment of *term. */
size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) +
__alignof__(*term) - 1) & ~(__alignof__(*term) - 1);
tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL);
if (tbl == NULL)
return NULL;
term = (struct ipt_error *)&(((char *)tbl)[term_offset]);
/*
 * NOTE(review): strncpy leaves the name unterminated if info->name fills
 * the whole buffer - confirm that table names are always shorter.
 */
strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name));
*term = (struct ipt_error)IPT_ERROR_INIT;
tbl->repl.valid_hooks = hook_mask;
/* nhooks standard entries plus the error terminator. */
tbl->repl.num_entries = nhooks + 1;
tbl->repl.size = nhooks * sizeof(struct ipt_standard) +
sizeof(struct ipt_error);
/*
 * For every set bit in the mask, record the current byte offset as both
 * the hook entry point and the underflow, then append a standard entry.
 */
for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) {
if (!(hook_mask & 1))
continue;
tbl->repl.hook_entry[hooknum] = bytes;
tbl->repl.underflow[hooknum] = bytes;
tbl->entries[i++] = (struct ipt_standard)
IPT_STANDARD_INIT(NF_ACCEPT);
bytes += sizeof(struct ipt_standard);
}
return tbl;
}
在分析ipt_register_table前,先看下xt_alloc_table_info的代码:
/*
 * Allocate an xt_table_info with room for `size` bytes of rule data
 * directly after the header.
 *
 * Only the header is zeroed; info->size is set to the requested `size`.
 *
 * Returns NULL when the total size overflows, when the request is too
 * large relative to totalram_pages, or when kvmalloc fails.
 */
struct xt_table_info *xt_alloc_table_info(unsigned int size)
{
	struct xt_table_info *info;
	size_t total = sizeof(*info) + size;

	/* Reject wrap-around of header + payload. */
	if (total < sizeof(*info))
		return NULL;

	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
		return NULL;

	info = kvmalloc(total, GFP_KERNEL);
	if (info) {
		memset(info, 0, sizeof(*info));
		info->size = size;
	}

	return info;
}
申请的实际大小为sizeof(xt_table_info) + size,且xt_table_info类型的info的size设置为入参。
/*
 * Register an IPv4 table in a network namespace and attach its hooks.
 *
 * @net:   network namespace the table is registered in
 * @table: table template
 * @repl:  initial ruleset; repl->size bytes of repl->entries are copied
 * @ops:   hook operations to register for the table
 * @res:   where the pointer to the newly registered table is stored
 *
 * Returns 0 on success. Returns -ENOMEM if the table info cannot be
 * allocated, otherwise the error from translate_table(),
 * xt_register_table() or nf_register_net_hooks(). If hook registration
 * fails, the table is unregistered again and *res is reset to NULL.
 */
int ipt_register_table(struct net *net, const struct xt_table *table,
const struct ipt_replace *repl,
const struct nf_hook_ops *ops, struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
struct xt_table *new_table;
// For the initial filter ruleset walked through in this article,
// repl->size is nhooks * sizeof(struct ipt_standard) + sizeof(struct ipt_error)
// with nhooks == 3, so the allocation holds one xt_table_info followed by
// 3 ipt_standard entries and 1 ipt_error entry.
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
// Copy the rule blob into the new table info. For the initial ruleset the
// entries were filled with IPT_STANDARD_INIT(NF_ACCEPT) by
// ipt_alloc_initial_table.
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0)
goto out_free;
// xt_register_table is not shown here. Per the original author's note it
// duplicates the table template, links the copy onto
// net->xt.tables[table->af] and sets new_table->private to newinfo.
new_table = xt_register_table(net, table, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
ret = PTR_ERR(new_table);
goto out_free;
}
/* set res now, will see skbs right after nf_register_net_hooks */
// For the filter table, *res is net->ipv4.iptable_filter; it now points at
// new_table.
WRITE_ONCE(*res, new_table);
// Register one hook op per valid hook (filter_ops for the filter table).
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret != 0) {
__ipt_unregister_table(net, new_table);
*res = NULL;
}
return ret;
out_free:
// Reached only before the table was successfully registered.
xt_free_table_info(newinfo);
return ret;
}
注册完filter_ops后,经过NF_INET_LOCAL_IN,NF_INET_FORWARD,NF_INET_LOCAL_OUT链的数据,会调用iptable_filter_hook进行处理。iptable_filter_hook的代码如下:
/*
 * Netfilter hook callback for the filter table.
 *
 * On the LOCAL_OUT hook, packets shorter than an IPv4 header (by skb
 * length or by the header-length field) are accepted without consulting
 * the rules. Every other packet is evaluated by ipt_do_table() against
 * the filter table of the namespace in `state`.
 */
static unsigned int
iptable_filter_hook(void *priv, struct sk_buff *skb,
		    const struct nf_hook_state *state)
{
	if (state->hook == NF_INET_LOCAL_OUT) {
		/* root is playing with raw sockets. */
		if (skb->len < sizeof(struct iphdr))
			return NF_ACCEPT;
		if (ip_hdrlen(skb) < sizeof(struct iphdr))
			return NF_ACCEPT;
	}

	return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}
/*
 * Walk the rules of `table` for the hook in `state` and return a verdict.
 *
 * NOTE(review): this is an abridged excerpt. In the text shown here,
 * indev, outdev, acpar.fragoff and acpar.hotdrop are read but never
 * assigned, verdict is never updated, and the no_match label has no
 * corresponding goto. The elided parts must be taken from the full kernel
 * source before this code is relied on.
 */
unsigned int
ipt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
struct xt_table *table)
{
unsigned int hook = state->hook; // e.g. NF_INET_LOCAL_IN
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e;
unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
ip = ip_hdr(skb); // network-layer (IP) header
private = table->private;
table_base = private->entries; // per the author, entries is the last member of xt_table_info
// Locate the first ipt_entry for this hook.
// get_entry ==> (struct ipt_entry *)(base + offset)
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
struct xt_counters *counter;
WARN_ON(!e);
// Match the IP packet against the rule; on a mismatch move on to the
// next rule. Per the author, ip_packet_match checks the standard
// criteria such as source/destination address, in/out interface and
// protocol.
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
no_match:
e = ipt_next_entry(e);
continue;
}
} while (!acpar.hotdrop);
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
}
总结一下:一个xt_table结构,代表netfilter一个表,其中filter表对应的是packet_filter。packet_filter的信息,保存到了net->ipv4.iptable_filter上。通过table->private获取到xt_table_info的指针。通过get_entry获取struct ipt_entry结构的指针。一个struct ipt_entry代表filter链中的一条配置。
/*
 * One iptables rule: a fixed header followed by the variable-length
 * matches and target stored in elems[].
 */
struct ipt_entry {
struct ipt_ip ip;
/* Mark with fields that we care about. */
unsigned int nfcache;
/* Size of ipt_entry + matches */
__u16 target_offset;
/* Size of ipt_entry + matches + target */
__u16 next_offset;
/* Back pointer */
unsigned int comefrom;
/* Packet and byte counters. */
struct xt_counters counters;
/* The matches (if any), then the target. */
unsigned char elems[0];
};
说起来有点抽象,让我们进入内核,把这些信息打印出来吧。
首先在filter表的INPUT链添加几条数据,指令如下:
iptables -I INPUT -s 1.2.3.4 -j ACCEPT
iptables -I INPUT -p tcp ! -s 1.2.3.5 -j ACCEPT
iptables -I INPUT -p udp -d 1.2.3.0/24 -j ACCEPT
配置结果如下:
怎么把内核里的数据,打印出来呢。可以通过编写字符驱动,实现驱动的打开操作,在打开操作的回调函数中,将所需的信息打印出来。
void ipt_entry_print(struct ipt_entry* ipt_entry)
{
if (NULL == ipt_entry)
{
printk("ipt_entry is null\n");
return;
}
struct ipt_ip ip = ipt_entry->ip;
// 打印ip
printk("---ip---\n");
printk("src ip: %X, src mask: %X\n", ip.src, ip.smsk);
printk("dst ip: %X, dst mask: %X\n", ip.dst, ip.dmsk);
printk("iniface: %s\n", ip.iniface);
printk("outiface: %s\n", ip.outiface);
/*
IPPROTO_IP 0
IPPROTO_TCP 6
IPPROTO_UDP 17
*/
printk("proto: %d\n", ip.proto); // 6-tcp
printk("flags: %d\n", ip.flags);
/*
取反标志
#define IPT_INV_VIA_IN 0x01 Invert the sense of IN IFACE.
#define IPT_INV_VIA_OUT 0x02 Invert the sense of OUT IFACE
#define IPT_INV_TOS 0x04 Invert the sense of TOS.
#define IPT_INV_SRCIP 0x08 Invert the sense of SRC IP.
#define IPT_INV_DSTIP 0x10 Invert the sense of DST OP.
#define IPT_INV_FRAG 0x20 Invert the sense of FRAG.
#define IPT_INV_PROTO XT_INV_PROTO
#define IPT_INV_MASK 0x7F All possible flag bits mask.
*/
printk("invflags: %d\n", ip.invflags); // 8-源地址取反
printk("---ipt_entry---\n");
printk("nfcache: %d\n", ipt_entry->nfcache);
printk("target_offset: %d\n", ipt_entry->target_offset);
printk("next_offset: %d\n", ipt_entry->next_offset);
printk("comefrom: %d\n", ipt_entry->comefrom);
}
static int hello_open(struct inode* inode, struct file*filep)
{
printk("hello_open\n");
struct task_struct *tsk = current;
struct net *net;
struct xt_table *xt_filter;
struct xt_table_info *filter_info;
const void* table_base;
int i = 0;
int local_in_hook_entry;
struct ipt_entry* ipt_entry;
struct nsproxy *nsprx = tsk->nsproxy; //命名空间
if (NULL == nsprx)
{
printk("nsprx is null\n");
return 0;
}
printk("hello_open get net\n");
net = nsprx->net_ns;
if (NULL == net)
{
printk("net is null\n");
return 0;
}
printk("hello_open get xt_table\n");
xt_filter = net->ipv4.iptable_filter;
if (NULL == xt_filter)
{
printk("xt_filter is null\n");
return 0;
}
// 打印xt_table信息
printk("xt_table: af - %d\n", xt_filter->af);
printk("xt_table: name - %s\n", xt_filter->name);
printk("xt_table: valid_hooks - %d\n", xt_filter->valid_hooks);
printk("xt_table: priority - %d\n", xt_filter->priority);
filter_info = xt_filter->private;
if (NULL == filter_info)
{
printk("filter_info is null\n");
return 0;
}
printk("filter_info: size - %d\n", filter_info->size);
printk("filter_info: number - %d\n", filter_info->number); // 4?
printk("filter_info: initial_entries - %d\n", filter_info->initial_entries);
printk("filter_info: stacksize - %d\n", filter_info->stacksize);
table_base = filter_info->entries;
local_in_hook_entry = filter_info->hook_entry[NF_INET_LOCAL_IN];
printk("filter_info: local_in_hook_entry - %d\n", local_in_hook_entry);
// 其实获取到的是ipt_standard的地址,ipt_entry在ipt_standardh中
ipt_entry = table_base + local_in_hook_entry;
if (NULL == ipt_entry)
{
printk("ipt_entry is null\n");
return 0;
}
for (; i < 24; ++i)
{
printk("***BEGIN***\n");
ipt_entry_print(ipt_entry);
ipt_entry = (void *)ipt_entry + ipt_entry->next_offset;
printk("***END***\n\n");
}
//struct ipt_entry* nxt_ipt_entry = (void *)ipt_entry + ipt_entry->next_offset;
//ipt_entry_print(nxt_ipt_entry);
printk("hello_open finish\n");
return 0;
}
驱动编译,安装完成后,再编写客户端程序,打开字符驱动文件,如:
int fd = open("/dev/test",O_RDWR);
打开/dev/test后,可触发字符驱动执行hello_open。打印的内核日志,可以通过dmesg指令查看。
[ 7539.033208] hello_open
[ 7539.033209] hello_open get net
[ 7539.033209] hello_open get xt_table
[ 7539.033210] xt_table: af - 2 // NFPROTO_IPV4
[ 7539.033210] xt_table: name - filter // 名称
[ 7539.033211] xt_table: valid_hooks - 14 // 第1,2,3 bit位为1(从0开始)
[ 7539.033211] xt_table: priority - 0
[ 7539.033212] filter_info: size - 4272
[ 7539.033212] filter_info: number - 26 //共26条配置?
[ 7539.033213] filter_info: initial_entries - 4
[ 7539.033213] filter_info: stacksize - 5
[ 7539.033213] filter_info: local_in_hook_entry - 0
[ 7539.033214] ***BEGIN***
[ 7539.033214] ---ip--- // 对应第1条配置,目的地址为 1.2.3.0/24
[ 7539.033215] src ip: 0, src mask: 0
[ 7539.033215] dst ip: 30201, dst mask: FFFFFF
[ 7539.033216] iniface:
[ 7539.033216] outiface:
[ 7539.033216] proto: 17 // UDP协议
[ 7539.033217] flags: 0
[ 7539.033217] invflags: 0
[ 7539.033217] ---ipt_entry---
[ 7539.033218] nfcache: 0
[ 7539.033218] target_offset: 112
[ 7539.033219] next_offset: 152
[ 7539.033219] comefrom: 2
[ 7539.033219] ***END***
[ 7539.033219] ***BEGIN***
[ 7539.033220] ---ip--- // 对应第2条配置,源地址不是1.2.3.5
[ 7539.033220] src ip: 5030201, src mask: FFFFFFFF
[ 7539.033221] dst ip: 0, dst mask: 0
[ 7539.033221] iniface:
[ 7539.033221] outiface:
[ 7539.033222] proto: 6 // TCP协议
[ 7539.033222] flags: 0
[ 7539.033223] invflags: 8 // #define IPT_INV_SRCIP 0x08
[ 7539.033223] ---ipt_entry---
[ 7539.033223] nfcache: 0
[ 7539.033224] target_offset: 112
[ 7539.033224] next_offset: 152
[ 7539.033224] comefrom: 2
[ 7539.033225] ***END***
[ 7539.033225] ***BEGIN***
[ 7539.033225] ---ip--- // 对应第3条配置,对源地址1.2.3.4放行
[ 7539.033226] src ip: 4030201, src mask: FFFFFFFF
[ 7539.033226] dst ip: 0, dst mask: 0
[ 7539.033226] iniface:
[ 7539.033227] outiface:
[ 7539.033227] proto: 0 // 所有协议
[ 7539.033227] flags: 0
[ 7539.033228] invflags: 0
[ 7539.033228] ---ipt_entry---
[ 7539.033228] nfcache: 0
[ 7539.033229] target_offset: 112
[ 7539.033229] next_offset: 152
[ 7539.033229] comefrom: 2
[ 7539.033230] ***END***