dpvs学习笔记: 12 TOA 实现原理

在 full-nat two-arm 模式下,后端 real server 获取到请求的来源都是 dpvs local ip, 如何获取真实的 client ip 呢?这就需要 toa 模块,原理都说是修改了 rs 机器获取 ip 的函数,具体如何初现呢?

tcp option 字段

关于 tcp header 可以参考 wiki, 我把截图贴上来

dpvs学习笔记: 12 TOA 实现原理_第1张图片
tcp header

我们知道 ip header 里 src address 肯定是 dpvs local ip, 否则数据包无法发送。那么 client ip 放哪里呢?就是在 tcp header 的 option 字段中。

option 字段最长 40 bytes. 每填充一个选项由三部分构成:op-kind, op-length, op-data. 最常用的 mss 字段就是放在 option 里。只要构建一个不冲突的 op-kind 就可以把 client ip 填充进去。ipv4 的度度是 4 bytes, ipv6 是 16 bytes. 看来整个 option 字段在不久就会不够用。

dpvs 写 tcp option address

DPVS fullnat 在调用 tcp_fnat_in_handler 时会调用 tcp_in_add_toa 写到 mbuf.

static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf,
                          struct tcphdr *tcph)
{
    uint32_t mtu;
    struct tcpopt_addr *toa;
    uint32_t tcp_opt_len;

    uint8_t *p, *q, *tail;
    struct route_entry *rt;

    if (unlikely(conn->af != AF_INET && conn->af != AF_INET6))
        return EDPVS_NOTSUPP;

    tcp_opt_len = conn->af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR;
    /*
     * check if we can add the new option
     */
    /* skb length and tcp option length checking */
    if ((rt = mbuf->userdata) != NULL) {
        mtu = rt->mtu;
    } else if (conn->in_dev) { /* no route for fast-xmit */
        mtu = conn->in_dev->mtu;
    } else {
        RTE_LOG(DEBUG, IPVS, "add toa: MTU unknown.\n");
        return EDPVS_NOROUTE;
    }

    if (unlikely(mbuf->pkt_len > (mtu - tcp_opt_len))) {
        RTE_LOG(DEBUG, IPVS, "add toa: need fragment, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_FRAG;
    }

    /* maximum TCP header is 60, and 40 for options */
    if (unlikely((60 - (tcph->doff << 2)) < tcp_opt_len)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no TCP header room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /* check tail room and expand mbuf.
     * have to pull all bits in segments for later operation. */
    if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0))
        return EDPVS_INVPKT;
    tail = (uint8_t *)rte_pktmbuf_append(mbuf, tcp_opt_len);
    if (unlikely(!tail)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no mbuf tail room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /*
     * now add address option
     */

    /* move data down, including existing tcp options
     * @p is last data byte,
     * @q is new position of last data byte */
    p = tail - 1;
    q = p + tcp_opt_len;
    while (p >= ((uint8_t *)tcph + sizeof(struct tcphdr))) {
        *q = *p;
        p--, q--;
    }

    /* insert toa right after TCP basic header */
    toa = (struct tcpopt_addr *)(tcph + 1);
    toa->opcode = TCP_OPT_ADDR;
    toa->opsize = tcp_opt_len;
    toa->port = conn->cport;

    if (conn->af == AF_INET) {
        struct tcpopt_ip4_addr *toa_ip4 = (struct tcpopt_ip4_addr *)(tcph + 1);
        toa_ip4->addr = conn->caddr.in;
    }
    else {
        struct tcpopt_ip6_addr *toa_ip6 = (struct tcpopt_ip6_addr *)(tcph + 1);
        toa_ip6->addr = conn->caddr.in6;
    }


    /* reset tcp header length */
    tcph->doff += tcp_opt_len >> 2;

    /* reset ip header total length */
    if (conn->af == AF_INET)
        ip4_hdr(mbuf)->total_length =
            htons(ntohs(ip4_hdr(mbuf)->total_length) + tcp_opt_len);
    else
        ip6_hdr(mbuf)->ip6_plen =
            htons(ntohs(ip6_hdr(mbuf)->ip6_plen) + tcp_opt_len);

    /* tcp csum will be recalc later, 
     * so as IP hdr csum since iph.tot_len has been chagned. */
    return EDPVS_OK;
}
  1. 根据 ipv4 ipv6 来确定 toa 需要的长度,2 bytes op-kind, 2 bytes op-length 再加上地址长度。所以 ipv4 共需 8 bytes, ipv6 共需 20 bytes
  2. TCP header 最大长度 60,option 最大长度 40,确何不会超过
  3. rte_pktmbuf_append 将 mbuf 扩展空间,能容纳 toa
  4. 填充 tcpopt_addr 结构体,op-kind TCP_OPT_ADDR 是 254,非官方 tcp/ip 认可的值。端口值是 conn->cport, 最后填充 conn->caddr.in 或 conn->caddr.in6 地址。

real server 安装 toa

很简单,make 编绎后生成 toa.ko 驱动,然后 insmod toa.ko 即可。所有 real server 都需要安装。先看下 module_init 函数 toa_init

static int __init
toa_init(void)
{

    TOA_INFO("TOA " TOA_VERSION " by pukong.wjm\n");

    /* alloc statistics array for toa */
    ext_stats = alloc_percpu(struct toa_stat_mib);
    if (NULL == ext_stats)
        return 1;
    proc_net_fops_create(&init_net, "toa_stats", 0, &toa_stats_fops);

    /* get the address of function sock_def_readable
     * so later we can know whether the sock is for rpc, tux or others
     */
    sk_data_ready_addr = kallsyms_lookup_name("sock_def_readable");
    TOA_INFO("CPU [%u] sk_data_ready_addr = "
        "kallsyms_lookup_name(sock_def_readable) = %lu\n",
         smp_processor_id(), sk_data_ready_addr);
    if (0 == sk_data_ready_addr) {
        TOA_INFO("cannot find sock_def_readable.\n");
        goto err;
    }

#ifdef TOA_IPV6_ENABLE
    if (0 != get_kernel_ipv6_symbol()) {
        TOA_INFO("get ipv6 struct from kernel fail.\n");
        goto err;
    }
#endif
    
    /* hook funcs for parse and get toa */
    hook_toa_functions();

    TOA_INFO("toa loaded\n");
    return 0;

err:
    proc_net_remove(&init_net, "toa_stats");
    if (NULL != ext_stats) {
        free_percpu(ext_stats);
        ext_stats = NULL;
    }

    return 1;
}
  1. proc_net_fops_create 在 /proc 文件系统下注册 /proc/net/toa_stats 用于查看统计使用
  2. kallsyms_lookup_name 根据名称来获取 sock_def_readable 地址
  3. get_kernel_ipv6_symbol 如果支持 ipv6, 获取相应的回调函数地址
  4. hook_toa_functions 将 toa 功能 hook 进内核
    dpvs学习笔记: 12 TOA 实现原理_第2张图片
    proc_net_fops_create
/* replace the functions with our functions */
static inline int
hook_toa_functions(void)
{
    /* hook inet_getname for ipv4 */
    struct proto_ops *inet_stream_ops_p =
            (struct proto_ops *)&inet_stream_ops;
    /* hook tcp_v4_syn_recv_sock for ipv4 */
    struct inet_connection_sock_af_ops *ipv4_specific_p =
            (struct inet_connection_sock_af_ops *)&ipv4_specific;

    inet_stream_ops_p->getname = inet_getname_toa;
    TOA_INFO("CPU [%u] hooked inet_getname <%p> --> <%p>\n",
        smp_processor_id(), inet_getname, inet_stream_ops_p->getname);

    ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock_toa;
    TOA_INFO("CPU [%u] hooked tcp_v4_syn_recv_sock <%p> --> <%p>\n",
        smp_processor_id(), tcp_v4_syn_recv_sock,
        ipv4_specific_p->syn_recv_sock);

#ifdef TOA_IPV6_ENABLE
    inet6_stream_ops_p->getname = inet6_getname_toa;
    TOA_INFO("CPU [%u] hooked inet6_getname <%p> --> <%p>\n",
        smp_processor_id(), inet6_getname, inet6_stream_ops_p->getname);

    ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_toa;
    TOA_INFO("CPU [%u] hooked tcp_v6_syn_recv_sock <%p> --> <%p>\n",
        smp_processor_id(), tcp_v6_syn_recv_sock_org_pt,
        ipv6_specific_p->syn_recv_sock);
#endif

    return 0;
}

仔细看看也不难,就是将 inet ops 回调函数 getname 替换为 toa 的。但是我有问题,如果请求不来自 dpvs,普通的请求会不会也受影响?

可以看到 hook 了两个函数 tcp_v4_syn_recv_sock_toa 和 inet_getname_toa

real server 获取 client ip

当完成三次握手时调用 tcp_v4_syn_recv_sock_toa

static struct sock *
tcp_v4_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb,
            struct request_sock *req, struct dst_entry *dst)
{
    struct sock *newsock = NULL;

    TOA_DBG("tcp_v4_syn_recv_sock_toa called\n");

    /* call orginal one */
    newsock = tcp_v4_syn_recv_sock(sk, skb, req, dst);

    /* set our value if need */
    if (NULL != newsock && NULL == newsock->sk_user_data) {
        newsock->sk_user_data = get_toa_data(AF_INET, skb);
        if (NULL != newsock->sk_user_data)
            TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT);
        else
            TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT);

        TOA_DBG("tcp_v4_syn_recv_sock_toa: set "
            "sk->sk_user_data to %p\n",
            newsock->sk_user_data);
    }
    return newsock;
}
  1. 调用原有函数 tcp_v4_syn_recv_sock 处理,也就是就里兼容了原有逻辑,普通非 toa 请求也会正常获取到 ip
  2. 额外调用 get_toa_data 生成地址,可以看到地址放到了 sk->sk_user_data 字段。
static void *get_toa_data(int af, struct sk_buff *skb)
{
    struct tcphdr *th;
    int length;
    unsigned char *ptr;

    TOA_DBG("get_toa_data called\n");

    if (NULL != skb) {
        th = tcp_hdr(skb);
        length = (th->doff * 4) - sizeof(struct tcphdr);
        ptr = (unsigned char *) (th + 1);

        while (length > 0) {
            int opcode = *ptr++;
            int opsize;
            switch (opcode) {
            case TCPOPT_EOL:
                return NULL;
            case TCPOPT_NOP:    /* Ref: RFC 793 section 3.1 */
                length--;
                continue;
            default:
                opsize = *ptr++;
                if (opsize < 2) /* "silly options" */
                    return NULL;
                if (opsize > length)
                    /* don't parse partial options */
                    return NULL;
                if (TCPOPT_TOA == opcode &&
                    TCPOLEN_IP4_TOA == opsize) {

                    struct toa_ip4_data tdata;
                    void *ret_ptr = NULL;

                    memcpy(&tdata, ptr - 2, sizeof(tdata));
                    TOA_DBG("af = %d, find toa data: ip = "
                        TOA_NIPQUAD_FMT", port = %u\n",
                        af,
                        TOA_NIPQUAD(tdata.ip),
                        ntohs(tdata.port));
                    if (af == AF_INET) {
                        memcpy(&ret_ptr, &tdata,
                            sizeof(ret_ptr));
                        TOA_DBG("coded ip4 toa data: %p\n",
                            ret_ptr);
                        return ret_ptr;
                    }
#ifdef TOA_IPV6_ENABLE
                    else if (af == AF_INET6) {
                        struct toa_ip6_data *ptr_toa_ip6 =
                            kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                        if (!ptr_toa_ip6) {
                            return NULL;
                        }
                        ptr_toa_ip6->opcode = opcode;
                        ptr_toa_ip6->opsize = TCPOLEN_IP6_TOA;
                        ipv6_addr_set(&ptr_toa_ip6->in6_addr, 0, 0,
                            htonl(0x0000FFFF), tdata.ip);
                        TOA_DBG("coded ip6 toa data: %p\n",
                            ptr_toa_ip6);
                        TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                        return ptr_toa_ip6;
                    }
#endif
                }

#ifdef TOA_IPV6_ENABLE
                if (TCPOPT_TOA == opcode &&
                    TCPOLEN_IP6_TOA == opsize &&
                    af == AF_INET6) {
                    struct toa_ip6_data *ptr_toa_ip6 =
                        kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                    if (!ptr_toa_ip6) {
                            return NULL;
                    }
                    memcpy(ptr_toa_ip6, ptr - 2, sizeof(struct toa_ip6_data));

                    TOA_DBG("find toa_v6 data : ip = "
                        TOA_NIP6_FMT", port = %u,"
                        " coded ip6 toa data: %p\n",
                        TOA_NIP6(ptr_toa_ip6->in6_addr),
                        ptr_toa_ip6->port,
                        ptr_toa_ip6);
                    TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                    return ptr_toa_ip6;
                }
#endif
                ptr += opsize - 2;
                length -= opsize;
            }
        }
    }
    return NULL;
}
  1. 遍历所有 option, 根据 opcode 来处理 ipv4 或是 ipv6
  2. 将 toa struct 复制一份,然后返回

然后当 real server 调用 getpeername 或是 getsocketname 时调用 inet_getname_toa 来获取 ip,如果是 ipv6 则调用 inet6_getname_toa

inet_getname_toa(struct socket *sock, struct sockaddr *uaddr,
        int *uaddr_len, int peer)
{
    int retval = 0;
    struct sock *sk = sock->sk;
    struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
    struct toa_ip4_data tdata;

    TOA_DBG("inet_getname_toa called, sk->sk_user_data is %p\n",
        sk->sk_user_data);

    /* call orginal one */
    retval = inet_getname(sock, uaddr, uaddr_len, peer);

    /* set our value if need */
    if (retval == 0 && NULL != sk->sk_user_data && peer) {
        if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) {
            memcpy(&tdata, &sk->sk_user_data, sizeof(tdata));
            if (TCPOPT_TOA == tdata.opcode &&
                TCPOLEN_IP4_TOA == tdata.opsize) {
                TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
                TOA_DBG("inet_getname_toa: set new sockaddr, ip "
                    TOA_NIPQUAD_FMT" -> "TOA_NIPQUAD_FMT
                    ", port %u -> %u\n",
                    TOA_NIPQUAD(sin->sin_addr.s_addr),
                    TOA_NIPQUAD(tdata.ip), ntohs(sin->sin_port),
                    ntohs(tdata.port));
                sin->sin_port = tdata.port;
                sin->sin_addr.s_addr = tdata.ip;
            } else { /* sk_user_data doesn't belong to us */
                TOA_INC_STATS(ext_stats,
                        GETNAME_TOA_MISMATCH_CNT);
                TOA_DBG("inet_getname_toa: invalid toa data, "
                    "ip "TOA_NIPQUAD_FMT" port %u opcode %u "
                    "opsize %u\n",
                    TOA_NIPQUAD(tdata.ip), ntohs(tdata.port),
                    tdata.opcode, tdata.opsize);
            }
        } else {
            TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT);
        }
    } else { /* no need to get client ip */
        TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT);
    }

    return retval;
}
  1. 调用原有 inet_getname 函数,获取 ip,兼容原有内核逻辑
  2. 判断 sk_user_data 不为空,并且结构体 op-kind op-length 与 ipv4 toa 的相等,获取 ip port ,并填充 sin

小结

实现原理还真简单,只不过有两个隐患。

  1. 如果 option 以后扩充其它内容,长度不够咋办?资源本身就不多
  2. op-kind 254 现在不被 tcp/ip 官方认可,以后会不会被占用?

你可能感兴趣的:(dpvs学习笔记: 12 TOA 实现原理)