iptables与内核的交互

 关于iptables的工作原理,主要分为三个方面:用户程序对规则的处理,内核对用户命令的处理,内核中netfilter对数据包的过滤(Ref:netfilter分析3-钩子函数执行流程)。
 本文大致分析iptables用户态程序如何解析规则,并将规则配置到内核中。以如下命令为例:

iptables -A INPUT -i eth0 -p tcp -s 192.168.100.0/24 --dport 22 -m state --state NEW,ESTABLISHED -j ACCEPT
iptables -A OUTPUT -o eth0 -p tcp --sport 22 -m state --state ESTABLISHED -j ACCEPT

 主要分析第一句:

iptables -A INPUT -i eth0 -p tcp -s 192.168.100.0/24 --dport 22 -m state --state NEW,ESTABLISHED -j ACCEPT

用户空间

 代码版本:iptables-1.8.7。
 iptables的客户端和内核共享一些数据结构。例如:
ipt_entry 、xt_entry_match、xt_tcp。

/*
 * One IPv4 rule: a fixed header followed by a variable-length area (elems)
 * that holds the rule's matches and then its target.
 */
struct ipt_entry {
    struct ipt_ip ip;
    /* Mark with fields that we care about. */
    unsigned int nfcache;
    /* Size of ipt_entry + matches, i.e. byte offset of the target */
    __u16 target_offset;
    /* Size of ipt_entry + matches + target, i.e. total size of this rule */
    __u16 next_offset;
    /* Back pointer */
    unsigned int comefrom;
    /* Packet and byte counters. */
    struct xt_counters counters;
    /* The matches (if any), then the target. */
    unsigned char elems[0];
};
/*
 * Header placed in front of each match's private data. The union offers a
 * userspace view (match identified by name and revision) and a kernel view
 * (match identified by pointer); every view starts with match_size.
 */
struct xt_entry_match {
    union {
        struct {
            __u16 match_size;
            /* Used by userspace */
            char name[XT_EXTENSION_MAXNAMELEN];
            __u8 revision;
        } user;
        struct {
            __u16 match_size;
            /* Used inside the kernel */
            struct xt_match *match;
        } kernel;
        /* Total length */
        __u16 match_size;
    } u;
    /* Match-specific payload follows the header. */
    unsigned char data[0];
};
/* Private data of the "tcp" match: port ranges plus TCP flag/option tests. */
struct xt_tcp {
    __u16 spts[2];          /* Source port range. */
    __u16 dpts[2];          /* Destination port range. */
    __u8 option;            /* TCP Option iff non-zero*/
    __u8 flg_mask;          /* TCP flags mask byte */
    __u8 flg_cmp;           /* TCP flags compare byte */
    __u8 invflags;          /* Inverse flags */
};

 主函数为iptables_main(iptables-standalone.c)。

/*
 * Entry point of the iptables command (abridged excerpt: the declaration of
 * ret and the final return are not shown).
 * The table name starts out as "filter". The command line is parsed by
 * do_command4(); when that returns non-zero the result is committed with
 * iptc_commit() and the handle is released with iptc_free().
 */
int
iptables_main(int argc, char *argv[])
{
    char *table = "filter";
    struct xtc_handle *handle = NULL;
    ret = do_command4(argc, argv, &table, &handle, false);
    if (ret) {
        ret = iptc_commit(handle);
        iptc_free(handle);
    }
}

 -A INPUT的解析代码:

/*
 * Excerpt of do_command4(): handling of the -A option only (the enclosing
 * option loop and switch are not shown).
 */
int do_command4(int argc, char *argv[], char **table,
        struct xtc_handle **handle, bool restore)
{
        case 'A':
            /* Record the append command, then keep the option argument as the chain name. */
            add_command(&command, CMD_APPEND, CMD_NONE,
                    cs.invert);
            chain = optarg;
            break;
}

 -i eth0的解析代码:

/*
 * Excerpt of do_command4(): handling of the -i (input interface) option only
 * (the enclosing option loop and switch are not shown).
 */
int do_command4(int argc, char *argv[], char **table,
        struct xtc_handle **handle, bool restore)
{
        case 'i':
            /* An empty interface name is rejected as a parameter problem. */
            if (*optarg == '\0')
                xtables_error(PARAMETER_PROBLEM,
                    "Empty interface is likely to be "
                    "undesired");
            set_option(&cs.options, OPT_VIANAMEIN, &cs.fw.ip.invflags,
                   cs.invert);
            /* Fill the rule's input-interface name and mask from the argument. */
            xtables_parse_interface(optarg,
                    cs.fw.ip.iniface,
                    cs.fw.ip.iniface_mask);
            break;
}

 -p tcp -s 192.168.100.0/24 --dport 22
 ip段(192.168.100.0/24)的解析:

/*
 * Excerpt of do_command4(): address parsing only. When a source or
 * destination host/network/mask string is set, it is handed to
 * xtables_ipparse_multiple() together with output pointers for the address
 * array, the mask array and the element count.
 */
int do_command4(int argc, char *argv[], char **table,
        struct xtc_handle **handle, bool restore)
{
    if (shostnetworkmask)
        xtables_ipparse_multiple(shostnetworkmask, &saddrs,
                     &smasks, &nsaddrs);

    if (dhostnetworkmask)
        xtables_ipparse_multiple(dhostnetworkmask, &daddrs,
                     &dmasks, &ndaddrs);
}

 --dport 22的参数解析,需要tcp_match模块,命令中已经指定了协议(-p tcp)。

/*
 * Userspace descriptor of the "tcp" match. Its private data is a
 * struct xt_tcp (size and userspacesize), and its callbacks cover help,
 * initialisation, option parsing, printing, saving and translation.
 */
static struct xtables_match tcp_match = {
    .family     = NFPROTO_UNSPEC,
    .name       = "tcp",
    .version    = XTABLES_VERSION,
    .size       = XT_ALIGN(sizeof(struct xt_tcp)),
    .userspacesize  = XT_ALIGN(sizeof(struct xt_tcp)),
    .help       = tcp_help,
    .init       = tcp_init,
    .parse      = tcp_parse,
    .print      = tcp_print,
    .save       = tcp_save,
    .extra_opts = tcp_opts,
    .xlate      = tcp_xlate,
};

 相应的解析函数:

/*
 * Handle an option that the main option switch did not recognise
 * (abridged excerpt: declarations of m and matchp are not shown, and no
 * return is shown for the case where nothing handles the option).
 *
 * The option code cs->c is tried, in order, against:
 *   1. the current target's option range,
 *   2. the option range of every not-yet-completed match,
 *   3. a protocol match obtained from load_proto().
 *
 * Returns 0 when a target or match parser consumed the option, and 1 after
 * a protocol match was loaded so that the caller reruns getopt.
 */
int command_default(struct iptables_command_state *cs,
            struct xtables_globals *gl)
{
    /* 1. Option belongs to the target's range: let the target parse it. */
    if (cs->target != NULL &&
        (cs->target->parse != NULL || cs->target->x6_parse != NULL) &&
        cs->c >= cs->target->option_offset &&
        cs->c < cs->target->option_offset + XT_OPTION_OFFSET_SCALE) {
        xtables_option_tpcall(cs->c, cs->argv, cs->invert,
                      cs->target, &cs->fw);
        return 0;
    }

    /* 2. Otherwise look for a loaded match whose option range contains cs->c. */
    for (matchp = cs->matches; matchp; matchp = matchp->next) {
        m = matchp->match;

        /* Skip completed matches and matches without any parser. */
        if (matchp->completed ||
            (m->x6_parse == NULL && m->parse == NULL))
            continue;
        if (cs->c < matchp->match->option_offset ||
            cs->c >= matchp->match->option_offset + XT_OPTION_OFFSET_SCALE)
            continue;
        xtables_option_mpcall(cs->c, cs->argv, cs->invert, m, &cs->fw);
        return 0;
    }

    /* Try loading protocol */
    m = load_proto(cs);
    if (m != NULL) {
        size_t size;

        cs->proto_used = 1;

        /* Allocate the match header plus the match's private data, zeroed. */
        size = XT_ALIGN(sizeof(struct xt_entry_match)) + m->size;

        m->m = xtables_calloc(1, size);
        m->m->u.match_size = size;
        strcpy(m->m->u.user.name, m->name);
        m->m->u.user.revision = m->revision;
        xs_init_match(m);

        /* Merge the new match's options into the global option table. */
        if (m->x6_options != NULL)
            gl->opts = xtables_options_xfrm(gl->orig_opts,
                            gl->opts,
                            m->x6_options,
                            &m->option_offset);
        else
            gl->opts = xtables_merge_options(gl->orig_opts,
                             gl->opts,
                             m->extra_opts,
                             &m->option_offset);
        if (gl->opts == NULL)
            xtables_error(OTHER_PROBLEM, "can't alloc memory!");
        /* Step back one argument so the same option is parsed again. */
        optind--;
        /* Indicate to rerun getopt *immediately* */
        return 1;
    }
}
/*
 * Dispatch option c to match m (abridged excerpt: only the path for matches
 * without an x6_parse callback is shown).
 * On that path the match's parse callback, if any, is called with the option
 * code made relative to the match's option_offset.
 */
void xtables_option_mpcall(unsigned int c, char **argv, bool invert,
               struct xtables_match *m, void *fw)
{
    if (m->x6_parse == NULL) {
    if (m->parse != NULL)
        m->parse(c - m->option_offset, argv, invert,
             &m->mflags, fw, &m->m);
    return;
    }
}

 tcp_parse会将端口数据写入struct xt_tcp中。
 load_proto会按照protocol查找并加载对应的xtables_match。

/*
 * Look up the match that belongs to the rule's protocol.
 *
 * Returns NULL when should_load_proto() declines; otherwise returns the
 * result of find_proto() for cs->protocol, asking it to try loading the
 * extension and to append it to cs->matches.
 */
struct xtables_match *load_proto(struct iptables_command_state *cs)
{
    struct xtables_match *proto_match = NULL;

    if (should_load_proto(cs))
        proto_match = find_proto(cs->protocol, XTF_TRY_LOAD,
                     cs->options & OPT_NUMERIC,
                     &cs->matches);

    return proto_match;
}
/*
 * Find the match named after a protocol (abridged excerpt: as shown, the
 * name is passed straight to xtables_find_match() and nolookup is unused).
 */
static struct xtables_match *
find_proto(const char *pname, enum xtables_tryload tryload,
       int nolookup, struct xtables_rule_match **matches)
{
     return xtables_find_match(pname, tryload, matches);
}

 命令行中的数据会加载到struct xt_entry_match。之后被复制到struct ipt_entry中。

/*
 * Build one contiguous rule blob from its parts.
 *
 * The result starts with a copy of *fw, followed by every match in the
 * matches list (in list order) and finally the target. target_offset is set
 * to the size of the header plus all matches, next_offset to that plus the
 * target size. The blob is obtained from xtables_malloc() and returned to
 * the caller.
 */
static struct ipt_entry *
generate_entry(const struct ipt_entry *fw,
           struct xtables_rule_match *matches,
           struct xt_entry_target *target)
{
    struct xtables_rule_match *cur;
    struct ipt_entry *entry;
    unsigned int matches_end = sizeof(struct ipt_entry);
    unsigned int offset = 0;

    /* First pass: measure the header plus all matches. */
    for (cur = matches; cur; cur = cur->next)
        matches_end += cur->match->m->u.match_size;

    entry = xtables_malloc(matches_end + target->u.target_size);
    *entry = *fw;
    entry->target_offset = matches_end;
    entry->next_offset = matches_end + target->u.target_size;

    /* Second pass: copy each match's data back to back into elems. */
    for (cur = matches; cur; cur = cur->next) {
        memcpy(entry->elems + offset, cur->match->m,
               cur->match->m->u.match_size);
        offset += cur->match->m->u.match_size;
    }

    /* The target goes right after the last match. */
    memcpy(entry->elems + offset, target, target->u.target_size);

    return entry;
}

 数据复制。

/*
 * Append the rule once for every (source, destination) address pair
 * (abridged excerpt: declarations of i, j and ret are not shown).
 *
 * For each pair the address and mask fields of fw->ip are overwritten and
 * the entry is handed to iptc_append_entry(). The individual results are
 * combined with &=, so a single zero result makes the returned value zero.
 */
static int
append_entry(const xt_chainlabel chain,
         struct ipt_entry *fw,
         unsigned int nsaddrs,
         const struct in_addr saddrs[],
         const struct in_addr smasks[],
         unsigned int ndaddrs,
         const struct in_addr daddrs[],
         const struct in_addr dmasks[],
         int verbose,
         struct xtc_handle *handle)
{
    for (i = 0; i < nsaddrs; i++) {
        fw->ip.src.s_addr = saddrs[i].s_addr;
        fw->ip.smsk.s_addr = smasks[i].s_addr;
        for (j = 0; j < ndaddrs; j++) {
            fw->ip.dst.s_addr = daddrs[j].s_addr;
            fw->ip.dmsk.s_addr = dmasks[j].s_addr;
            if (verbose)
                print_firewall_line(fw, handle);
            ret &= iptc_append_entry(chain, fw, handle);
        }
    }

    return ret;
}
/*
 * Append entry e to a chain in the userspace cache (abridged excerpt: the
 * return type, the chain lookup that sets c, the declaration of r and the
 * success return are not shown).
 * A rule of e->next_offset bytes is allocated and the whole entry is copied
 * into it. On allocation failure errno is set to ENOMEM and 0 is returned.
 */
iptc_append_entry(const IPT_CHAINLABEL chain,
        const STRUCT_ENTRY *e,
        struct xtc_handle *handle)
{
    if (!(r = iptcc_alloc_rule(c, e->next_offset))) {
        DEBUGP("unable to allocate rule for chain `%s'\n", chain);
        errno = ENOMEM;
        return 0;
    }
    /* next_offset is the full size of the entry: header, matches and target. */
    memcpy(r->entry, e, e->next_offset);
}
/*
 * allocate and initialize a new rule for the cache
 * (abridged excerpt: the allocation of r itself is not shown; the visible
 * part records the owning chain and the entry size in the new rule)
 */
static struct rule_head *iptcc_alloc_rule(struct chain_head *c, unsigned int size)
{
    r->chain = c;
    r->size = size;
    return r;
}

 解析action,-j ACCEPT。

/*
 * Excerpt of do_command4(): handling of the -j (jump/target) option only
 * (the enclosing option loop and switch are not shown).
 */
int do_command4(int argc, char *argv[], char **table,
        struct xtc_handle **handle, bool restore)
{
        case 'j':
            set_option(&cs.options, OPT_JUMP, &cs.fw.ip.invflags,
                   cs.invert);
            /* Resolve the target name and allocate its xt_entry_target. */
            command_jump(&cs, optarg);
            break;
}
/*
 * Resolve the argument of -j and prepare the target buffer (abridged
 * excerpt: the declaration of size and any further initialisation are not
 * shown).
 * When xtables_find_target() finds no target extension the function returns
 * early. Otherwise a zeroed buffer of header size plus the target's private
 * size is allocated and its target_size field is filled in.
 */
void command_jump(struct iptables_command_state *cs, const char *jumpto)
{
    cs->jumpto = xt_parse_target(jumpto);
    /* TRY_LOAD (may be chain name) */
    cs->target = xtables_find_target(cs->jumpto, XTF_TRY_LOAD);

    if (cs->target == NULL)
        return;

    size = XT_ALIGN(sizeof(struct xt_entry_target)) + cs->target->size;

    cs->target->t = xtables_calloc(1, size);
    cs->target->t->u.target_size = size;
}

 ACCEPT,DROP,QUEUE,RETURN对应的是standard target。

/*
 * Userspace descriptor of the "standard" target. Its private data is a
 * single aligned int (size and userspacesize), and only a help callback is
 * registered.
 */
static struct xtables_target standard_target = {
    .family     = NFPROTO_UNSPEC,
    .name       = "standard",
    .version    = XTABLES_VERSION,
    .size       = XT_ALIGN(sizeof(int)),
    .userspacesize  = XT_ALIGN(sizeof(int)),
    .help       = standard_help,
};

 xt_entry_target分配的大小:

/* Buffer size = aligned target header + the target's private data size. */
size = XT_ALIGN(sizeof(struct xt_entry_target)) + cs->target->size;
/* Zero-initialised allocation of the whole target buffer. */
cs->target->t = xtables_calloc(1, size);

 standard target的target->size大小为XT_ALIGN(sizeof(int))。最终分配的结构体为xt_standard_target。

/* A standard target: the generic target header followed by an int verdict. */
struct xt_standard_target {
    struct xt_entry_target target;
    int verdict;
};

 整理成内核需要的格式,向内核提交:

/*
 * Push the cached ruleset to the kernel (abridged excerpt: declarations,
 * the allocation of repl, error handling and the return are not shown).
 * The table is first sized with iptcc_compile_table_prep(), then compiled
 * into repl, and finally sent with a single SO_SET_REPLACE setsockopt call.
 */
int
TC_COMMIT(struct xtc_handle *handle)
{
    /* Replace, then map back the counters. */
    STRUCT_REPLACE *repl;
    new_number = iptcc_compile_table_prep(handle, &new_size);
    ret = iptcc_compile_table(handle, repl);
    /* Option length is the replace header plus repl->size bytes. */
    ret = setsockopt(handle->sockfd, TC_IPPROTO, SO_SET_REPLACE, repl,
             sizeof(*repl) + repl->size);
}

内核空间

/*
 * Kernel-side setsockopt handler (abridged excerpt: the declaration of ret
 * is not shown, and only the IPT_SO_SET_REPLACE command is shown).
 * IPT_SO_SET_REPLACE is forwarded to do_replace(); any other command shown
 * here yields -EINVAL.
 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
    switch (cmd) {
    case IPT_SO_SET_REPLACE:
        ret = do_replace(sock_net(sk), user, len);
        break;
    default:
        ret = -EINVAL;
    }
    return ret;
}
/*
 * Replace a table with the ruleset supplied by userspace (abridged excerpt:
 * declarations, the copy of the header into tmp, the free_newinfo label and
 * the final return are not shown).
 */
static int
do_replace(struct net *net, const void __user *user, unsigned int len)
{
    /* Allocate kernel storage for tmp.size bytes of entries. */
    newinfo = xt_alloc_table_info(tmp.size);
    if (!newinfo)
        return -ENOMEM;

    loc_cpu_entry = newinfo->entries;
    /* The entries follow the header in the user buffer, at offset sizeof(tmp). */
    if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
               tmp.size) != 0) {
        ret = -EFAULT;
        goto free_newinfo;
    }
    ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
    if (ret != 0)
        goto free_newinfo;
    
    ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
            tmp.num_counters, tmp.counters);
}
/*
 * Look up the named AF_INET table and install newinfo in it (abridged
 * excerpt: declarations of oldinfo and ret, error handling, and the return
 * are not shown). xt_replace_table() hands back the previous table info.
 */
static int
__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
         struct xt_table_info *newinfo, unsigned int num_counters,
         void __user *counters_ptr)
{
    struct xt_table *t;
    t = xt_request_find_table_lock(net, AF_INET, name);
    oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
}
/*
 * Swap the table's rule data (abridged excerpt: only the assignment that
 * makes newinfo the table's private data is shown; the return is omitted).
 */
struct xt_table_info *
xt_replace_table(struct xt_table *table,
          unsigned int num_counters,
          struct xt_table_info *newinfo,
          int *error)
{
    table->private = newinfo;
}

原有规则的处理

 用户层调用setsockopt将数据配置到内核。do_replace函数会重新配置规则。但是用户可以多次配置iptables。这里就引入一个问题:之前内核中的iptables规则到哪里去了呢?难道被冲掉了吗?
 iptables在重新解析规则时,会调用getsockopt将内核中的规则拷贝出来,然后重新配置。

/*
 * Excerpt of do_command4(): handle creation only. When the caller passed no
 * handle, one is created for the selected table with iptc_init().
 */
int do_command4(int argc, char *argv[], char **table,
        struct xtc_handle **handle, bool restore)
{
    /* only allocate handle if we weren't called with a handle */
    if (!*handle)
        *handle = iptc_init(*table);
}
/*
 * Create a handle for a table and load its current rules from the kernel
 * (abridged excerpt: declarations, socket creation, the error label and the
 * return of the handle are not shown).
 * Two getsockopt calls are made: SO_GET_INFO into info, then SO_GET_ENTRIES
 * into h->entries.
 */
struct xtc_handle *
iptc_init(const char *tablename)
{
    strcpy(info.name, tablename);
/* Fetch the table info, which includes the size of the entries. */
    if (getsockopt(sockfd, TC_IPPROTO, SO_GET_INFO, &info, &s) < 0) {
        close(sockfd);
        return NULL;
    }
    h = alloc_handle(&info);
    /* Initialize current state */
    h->sockfd = sockfd;
    h->info = info;

    h->entries->size = h->info.size;

    /* Buffer length passed to the kernel: get-entries header plus entries. */
    tmp = sizeof(STRUCT_GET_ENTRIES) + h->info.size;
    if (getsockopt(h->sockfd, TC_IPPROTO, SO_GET_ENTRIES, h->entries,
               &tmp) < 0)
        goto error;
}

 getsockopt(h->sockfd, TC_IPPROTO,SO_GET_ENTRIES, h->entries, &tmp) 从内核中拷贝原有的entries。
 内核中对应的处理函数:

/*
 * getsockopt handler for compat callers.
 *
 * Callers lacking CAP_NET_ADMIN in the socket's network-namespace user
 * namespace get -EPERM. IPT_SO_GET_INFO goes to get_info() with 1 as its
 * last argument, IPT_SO_GET_ENTRIES goes to compat_get_entries(), and every
 * other command is passed on to do_ipt_get_ctl().
 */
static int
compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
    if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
        return -EPERM;

    if (cmd == IPT_SO_GET_INFO)
        return get_info(sock_net(sk), user, len, 1);

    if (cmd == IPT_SO_GET_ENTRIES)
        return compat_get_entries(sock_net(sk), user, len);

    return do_ipt_get_ctl(sk, cmd, user, len);
}

 读取规则信息之后,iptables重新处理数据:

/*
 * parse an iptables blob into it's pieces
 * (abridged excerpt: declarations of prev and num, later passes and the
 * return are not shown)
 */
static int parse_table(struct xtc_handle *h)
{
    /* First pass: over ruleset blob */
    /* cache_add_entry is applied to each entry in the h->entries->size bytes of entrytable. */
    ENTRY_ITERATE(h->entries->entrytable, h->entries->size,
            cache_add_entry, h, &prev, &num);
}
/*
 * main parser function: add an entry from the blob to the cache
 *
 * Abridged excerpt: only the branch for an entry that starts a built-in
 * chain is shown; the preceding branches, the declarations of builtin and
 * offset, and the new_rule label are omitted.
 *
 * In the shown branch a chain head named after the hook is allocated,
 * tagged with the hook number and added to the handle, then control jumps
 * to new_rule. On allocation failure errno is set to ENOMEM and -1 is
 * returned.
 */
static int cache_add_entry(STRUCT_ENTRY *e,
               struct xtc_handle *h,
               STRUCT_ENTRY **prev,
               unsigned int *num)
{
else if ((builtin = iptcb_ent_is_hook_entry(e, h)) != 0) {
        struct chain_head *c =
            iptcc_alloc_chain_head((char *)hooknames[builtin-1],
                        builtin);
        if (!c) {
            /*
             * errno holds positive error numbers; the negated form is
             * only the kernel's return-value convention.
             */
            errno = ENOMEM;
            return -1;
        }
        /* Trace only after the NULL check, since the message uses c. */
        DEBUGP_C("%u:%u new builtin chain: %p (rules=%p)\n",
            *num, offset, c, &c->rules);

        c->hooknum = builtin;

        __iptcc_p_add_chain(h, c, offset, num);

        /* FIXME: this is ugly. */
        goto new_rule;
    } 
}

 内核中在初始化table的时候,会配置chain。博客——netfilter分析2-表在内核的初始化——有更详尽的分析。
 以filter表为例:

/*
 * Per-network-namespace initialisation of the filter table (abridged
 * excerpt: only the construction of the initial ruleset from packet_filter
 * is shown; the declaration of repl, table registration and the return are
 * omitted).
 */
static int __net_init iptable_filter_table_init(struct net *net)
{
    repl = ipt_alloc_initial_table(&packet_filter);
}
/*
 * Build the initial ruleset blob for an IPv4 table by expanding the
 * xt_alloc_initial_table() macro with the ipt/IPT type prefixes.
 * NOTE(review): info is not referenced here directly; presumably the macro
 * body reads it — confirm against the full macro.
 */
void *ipt_alloc_initial_table(const struct xt_table *info)
{
    return xt_alloc_initial_table(ipt, IPT);
}
/*
 * Build the initial ruleset blob for a table (abridged excerpt: the
 * definitions of nhooks, hook_mask, hooknum, bytes and i, and the setup of
 * the terminating entry, are not shown).
 *
 * The blob is a <type>_replace header followed by an array of
 * <type>_standard entries. term_offset is the end of entries[nhooks]
 * rounded up to the alignment of the terminating <type>_error entry, and
 * the zeroed allocation covers that offset plus the terminator.
 *
 * For every set bit in hook_mask, the hook's hook_entry and underflow
 * offsets are both set to the current byte offset, the next entry is
 * initialised with <typ2>_STANDARD_INIT(NF_ACCEPT), and the offset advances
 * by one standard entry. The statement expression evaluates to tbl.
 */
#define xt_alloc_initial_table(type, typ2) ({ \
    struct { \
        struct type##_replace repl; \
        struct type##_standard entries[]; \
    } *tbl; \
    struct type##_error *term; \
    size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \
        __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \
    tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \
    for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \
        if (!(hook_mask & 1)) \
            continue; \
        tbl->repl.hook_entry[hooknum] = bytes; \
        tbl->repl.underflow[hooknum]  = bytes; \
        tbl->entries[i++] = (struct type##_standard) \
            typ2##_STANDARD_INIT(NF_ACCEPT); \
        bytes += sizeof(struct type##_standard); \
    } \
    tbl; \
})

Reference:

[1]25个iptables常用示例
[2]Netfilter之AF_INET协议族rule、match、target
[3]iptables 防火墙-filter表
[4]Netfilter 是如何工作的(二):表(table)与规则(rule)
[5]Netfilter是如何工作的(三):扩展匹配条件和动作
[6]netfilter分析2-表在内核的初始化
[7]netfilter分析3-钩子函数执行流程
[8]iptables性能 -- Kube-proxy引入user define chain思考
[9]ipset详解 使用ipset提高iptables的控制效率

你可能感兴趣的:(iptables与内核的交互)