Linux下快速解析nf_conntrack

1. 背景

项目的需求是统计系统当前的TCP连接数;
于是想到了 nf_conntrack 这个Linux内核提供的记录和跟踪连接状态的功能;
然后写了个程序解析 /proc/net/nf_conntrack 这个映射文件,后来悲剧就发生了:当conntrack表记录增加到1w以上之后,解析速度急速下降;到了10w规模后,解析耗时几十秒都不能完成……
后来终于翻到了netfilter的老巢,发现了解决方法:libmnl、libnetfilter_conntrack

2. 使用

核心原理是通过netlink套接字的方式,与内核交互,查询得到结果
libmnl基本方法:

/* Open a netlink socket on the given bus (the example below passes NETLINK_NETFILTER). */
extern struct mnl_socket *mnl_socket_open(int bus);
/* Bind the socket; the example below uses groups = 0 and pid = MNL_SOCKET_AUTOPID. */
extern int mnl_socket_bind(struct mnl_socket *nl, unsigned int groups, pid_t pid);
/* Close the netlink socket. */
extern int mnl_socket_close(struct mnl_socket *nl);
/* Send siz bytes starting at req; the example treats a return of -1 as failure. */
extern ssize_t mnl_socket_sendto(const struct mnl_socket *nl, const void *req, size_t siz);
/* Receive into buf (capacity siz); the example treats > 0 as received bytes and -1 as failure. */
extern ssize_t mnl_socket_recvfrom(const struct mnl_socket *nl, void *buf, size_t siz);

libnetfilter_conntrack则主要是对获取的结果进行解析,比如拿出源地址、协议簇信息

/*
 * conntrack attributes
 *
 * Identifiers for the individual fields of a conntrack entry; the example
 * callback further down reads ATTR_ORIG_L4PROTO through nfct_get_attr_u8().
 *
 * - The trailing comment on each enumerator gives the width/kind of the
 *   value stored under that attribute (u8/u16/u32/u64/u128, string, or
 *   variable length).
 * - Enumerators marked "alias" share the numeric value of the enumerator
 *   they are assigned from; they do not consume a new number.
 * - An explicit "= N" appears on every fourth value up to 64; the
 *   remaining enumerators take the next consecutive value.
 * - ATTR_MAX is the last enumerator and therefore equals the number of
 *   distinct attribute values declared before it.
 */
enum nf_conntrack_attr {
    ATTR_ORIG_IPV4_SRC = 0,         /* u32 bits */
    ATTR_IPV4_SRC = ATTR_ORIG_IPV4_SRC, /* alias */
    ATTR_ORIG_IPV4_DST,         /* u32 bits */
    ATTR_IPV4_DST = ATTR_ORIG_IPV4_DST, /* alias */
    ATTR_REPL_IPV4_SRC,         /* u32 bits */
    ATTR_REPL_IPV4_DST,         /* u32 bits */
    ATTR_ORIG_IPV6_SRC = 4,         /* u128 bits */
    ATTR_IPV6_SRC = ATTR_ORIG_IPV6_SRC, /* alias */
    ATTR_ORIG_IPV6_DST,         /* u128 bits */
    ATTR_IPV6_DST = ATTR_ORIG_IPV6_DST, /* alias */
    ATTR_REPL_IPV6_SRC,         /* u128 bits */
    ATTR_REPL_IPV6_DST,         /* u128 bits */
    ATTR_ORIG_PORT_SRC = 8,         /* u16 bits */
    ATTR_PORT_SRC = ATTR_ORIG_PORT_SRC, /* alias */
    ATTR_ORIG_PORT_DST,         /* u16 bits */
    ATTR_PORT_DST = ATTR_ORIG_PORT_DST, /* alias */
    ATTR_REPL_PORT_SRC,         /* u16 bits */
    ATTR_REPL_PORT_DST,         /* u16 bits */
    ATTR_ICMP_TYPE = 12,            /* u8 bits */
    ATTR_ICMP_CODE,             /* u8 bits */
    ATTR_ICMP_ID,               /* u16 bits */
    ATTR_ORIG_L3PROTO,          /* u8 bits */
    ATTR_L3PROTO = ATTR_ORIG_L3PROTO,   /* alias */
    ATTR_REPL_L3PROTO = 16,         /* u8 bits */
    ATTR_ORIG_L4PROTO,          /* u8 bits */
    ATTR_L4PROTO = ATTR_ORIG_L4PROTO,   /* alias */
    ATTR_REPL_L4PROTO,          /* u8 bits */
    ATTR_TCP_STATE,             /* u8 bits */
    ATTR_SNAT_IPV4 = 20,            /* u32 bits */
    ATTR_DNAT_IPV4,             /* u32 bits */
    ATTR_SNAT_PORT,             /* u16 bits */
    ATTR_DNAT_PORT,             /* u16 bits */
    ATTR_TIMEOUT = 24,          /* u32 bits */
    ATTR_MARK,              /* u32 bits */
    ATTR_ORIG_COUNTER_PACKETS,      /* u64 bits */
    ATTR_REPL_COUNTER_PACKETS,      /* u64 bits */
    ATTR_ORIG_COUNTER_BYTES = 28,       /* u64 bits */
    ATTR_REPL_COUNTER_BYTES,        /* u64 bits */
    ATTR_USE,               /* u32 bits */
    ATTR_ID,                /* u32 bits */
    ATTR_STATUS = 32,           /* u32 bits  */
        ATTR_TCP_FLAGS_ORIG,            /* u8 bits */
    ATTR_TCP_FLAGS_REPL,            /* u8 bits */
    ATTR_TCP_MASK_ORIG,         /* u8 bits */
    ATTR_TCP_MASK_REPL = 36,        /* u8 bits */
    ATTR_MASTER_IPV4_SRC,           /* u32 bits */
    ATTR_MASTER_IPV4_DST,           /* u32 bits */
    ATTR_MASTER_IPV6_SRC,           /* u128 bits */
    ATTR_MASTER_IPV6_DST = 40,      /* u128 bits */
    ATTR_MASTER_PORT_SRC,           /* u16 bits */
    ATTR_MASTER_PORT_DST,           /* u16 bits */
    ATTR_MASTER_L3PROTO,            /* u8 bits */
    ATTR_MASTER_L4PROTO = 44,       /* u8 bits */
    ATTR_SECMARK,               /* u32 bits */
    ATTR_ORIG_NAT_SEQ_CORRECTION_POS,   /* u32 bits */
    ATTR_ORIG_NAT_SEQ_OFFSET_BEFORE,    /* u32 bits */
    ATTR_ORIG_NAT_SEQ_OFFSET_AFTER = 48,    /* u32 bits */
    ATTR_REPL_NAT_SEQ_CORRECTION_POS,   /* u32 bits */
    ATTR_REPL_NAT_SEQ_OFFSET_BEFORE,    /* u32 bits */
    ATTR_REPL_NAT_SEQ_OFFSET_AFTER,     /* u32 bits */
    ATTR_SCTP_STATE = 52,           /* u8 bits */
    ATTR_SCTP_VTAG_ORIG,            /* u32 bits */
    ATTR_SCTP_VTAG_REPL,            /* u32 bits */
    ATTR_HELPER_NAME,           /* string (30 bytes max) */
    ATTR_DCCP_STATE = 56,           /* u8 bits */
    ATTR_DCCP_ROLE,             /* u8 bits */
    ATTR_DCCP_HANDSHAKE_SEQ,        /* u64 bits */
    ATTR_TCP_WSCALE_ORIG,           /* u8 bits */
    ATTR_TCP_WSCALE_REPL = 60,      /* u8 bits */
    ATTR_ZONE,              /* u16 bits */
    ATTR_SECCTX,                /* string */
    ATTR_TIMESTAMP_START,           /* u64 bits, linux >= 2.6.38 */
    ATTR_TIMESTAMP_STOP = 64,       /* u64 bits, linux >= 2.6.38 */
    ATTR_HELPER_INFO,           /* variable length */
    ATTR_CONNLABELS,            /* variable length */
    ATTR_CONNLABELS_MASK,           /* variable length */
    ATTR_ORIG_ZONE,             /* u16 bits */
    ATTR_REPL_ZONE,             /* u16 bits */
    ATTR_SNAT_IPV6,             /* u128 bits */
    ATTR_DNAT_IPV6,             /* u128 bits */
    ATTR_SYNPROXY_ISN,          /* u32 bits */
    ATTR_SYNPROXY_ITS,          /* u32 bits */
    ATTR_SYNPROXY_TSOFF,            /* u32 bits */
    ATTR_MAX
};

3. 例子

以下例子为打印TCP当前连接情况
main函数主要就是创建一个netlink套接字,发送请求IPCTNL_MSG_CT_GET获取整个conntrack表信息
最终结果接收在buf中,使用mnl_cb_run进行循环解析。

/*
 * Dump the IPv4 conntrack table over a NETLINK_NETFILTER socket.
 *
 * Builds an IPCTNL_MSG_CT_GET request with NLM_F_DUMP, sends it, then feeds
 * every received buffer to mnl_cb_run(), which invokes data_cb() once per
 * netlink message.
 *
 * Returns 0 on success; on any failure prints the name of the failing call
 * via perror() and exits with EXIT_FAILURE.
 */
int main(void)
{
    struct mnl_socket *nl;
    struct nlmsghdr *nlh;
    struct nfgenmsg *nfh;
    char buf[MNL_SOCKET_BUFFER_SIZE];
    unsigned int seq, portid;
    /* Name of the call whose -1 result ended the receive loop. */
    const char *failed_call = "mnl_socket_recvfrom";
    int ret;

    nl = mnl_socket_open(NETLINK_NETFILTER);
    if (nl == NULL) {
        perror("mnl_socket_open");
        exit(EXIT_FAILURE);
    }

    if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
        perror("mnl_socket_bind");
        mnl_socket_close(nl);
        exit(EXIT_FAILURE);
    }
    portid = mnl_socket_get_portid(nl);

    /* Netlink header: ctnetlink subsystem, "get" message, full-table dump. */
    nlh = mnl_nlmsg_put_header(buf);
    nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET;
    nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
    /* The same seq is passed to mnl_cb_run() below to match the replies. */
    nlh->nlmsg_seq = seq = time(NULL);

    /* nfnetlink header: restrict the dump to AF_INET entries. */
    nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg));
    nfh->nfgen_family = AF_INET;
    nfh->version = NFNETLINK_V0;
    nfh->res_id = 0;

    ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
    if (ret == -1) {
        /* Report the call that actually failed (was "mnl_socket_recvfrom"). */
        perror("mnl_socket_sendto");
        mnl_socket_close(nl);
        exit(EXIT_FAILURE);
    }

    /* The request buffer is reused for the replies. */
    ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
    while (ret > 0) {
        ret = mnl_cb_run(buf, ret, seq, portid, data_cb, NULL);
        if (ret <= MNL_CB_STOP) {
            /* Dump finished (MNL_CB_STOP) or the callback run failed (-1). */
            failed_call = "mnl_cb_run";
            break;
        }
        ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
    }
    if (ret == -1) {
        perror(failed_call);
        mnl_socket_close(nl);
        exit(EXIT_FAILURE);
    }

    mnl_socket_close(nl);
    return 0;
}

以下为 回调函数的实现,在本例子中,则筛选出TCP连接进行展示

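#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <arpa/inet.h>

#include <libmnl/libmnl.h>
#include <libnetfilter_conntrack/libnetfilter_conntrack.h>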

/*
 * mnl_cb_run() callback: decode one ctnetlink message into a conntrack
 * object and print it when its original-direction L4 protocol is TCP.
 *
 * nlh:  the netlink message handed over by mnl_cb_run().
 * data: user pointer supplied to mnl_cb_run(); unused (main passes NULL).
 *
 * Always returns MNL_CB_OK, so an entry that cannot be allocated, parsed
 * or formatted is skipped instead of aborting the whole dump.
 */
static int data_cb(const struct nlmsghdr *nlh, void *data)
{
    struct nf_conntrack *ct;
    char buf[4096];

    (void)data;

    ct = nfct_new();
    if (ct == NULL) {
        return MNL_CB_OK;
    }

    /* Do not inspect or print an object the parser could not fill in. */
    if (nfct_nlmsg_parse(nlh, ct) < 0) {
        nfct_destroy(ct);
        return MNL_CB_OK;
    }

    if (nfct_get_attr_u8(ct, ATTR_ORIG_L4PROTO) == IPPROTO_TCP) {
        /* Only print when formatting succeeded, so buf is never read uninitialised. */
        if (nfct_snprintf(buf, sizeof(buf), ct, NFCT_T_UNKNOWN, NFCT_O_DEFAULT, 0) >= 0) {
            printf("%s\n", buf);
        }
    }

    nfct_destroy(ct);

    return MNL_CB_OK;
}

运行结果涉及本机一些地址,就不展示了,结果与 /proc/net/nf_conntrack一致,但到10w记录的环境下,并不会有巨大的开销。

4. 总结

只要不设置notrack标识的连接,就可以通过 nf_conntrack进行获取连接数;
而使用libmnl+libnetfilter_conntrack的netlink套接字的方式,比直接cat文件速度快很多;
查看了官方手册,发现libnetfilter_conntrack不仅可解析conntrack表,还能够进行监控、修改等高级操作,功能十分强大!

参考文章:
[1] https://en.wikipedia.org/wiki/Netfilter
[2] https://www.netfilter.org/projects/libnetfilter_conntrack/index.html

你可能感兴趣的:(linux,socket)