LVS layer-4 load balancing

LVS (Linux Virtual Server) is a layer-4 load balancer built on the netfilter framework. It consists of two parts: the user-space ipvsadm command used for configuration and management, and the in-kernel core module (ip_vs.ko).

The commonly used LVS forwarding modes are DR, tunnel and NAT (DNAT). In DR and tunnel mode only the request packets pass through LVS; the responses are returned by the RS directly to the client. In NAT mode port mapping may be involved, so the responses must also pass back through LVS and be SNATed before they can be sent to the client.

LVS uses a scheduler to pick a suitable RS; grep the source for register_ip_vs_scheduler to see the supported schedulers.

Module initialization

Using IPVS requires loading the ip_vs module first (modprobe ip_vs); let's look at how this module initializes.

module_init(ip_vs_init);
static int __init ip_vs_init(void)
{
    ip_vs_control_init();

    ip_vs_protocol_init();

    ip_vs_conn_init();

    register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */

    register_pernet_device(&ipvs_core_dev_ops);

    nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));

    ip_vs_register_nl_ioctl();

    pr_info("ipvs loaded.\n");

    return 0;
}
  1. ip_vs_control_init
    This mainly initializes two hash tables, ip_vs_svc_table and ip_vs_svc_fwm_table.

/*
 *  Hash table: for virtual service lookups
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];

/* the service table hashed by fwmark */
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];

int __init ip_vs_control_init(void)
{
    int idx;

    //initialize the two hash tables, each with 256 buckets
    /* Initialize svc_table, ip_vs_svc_fwm_table */
    for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
        INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
        INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
    }

    smp_wmb();  /* Do we really need it now ? */

    //register a netdevice notifier: when a NIC goes down, ip_vs_dst_event
    //is called to clean up the entries related to that device
    register_netdevice_notifier(&ip_vs_dst_notifier);

    return 0;
}
  2. ip_vs_protocol_init
    Registers the per-protocol ip_vs_protocol structures into ip_vs_proto_table; if an ip_vs_protocol provides an init function, it is called right after registration.

struct ip_vs_protocol ip_vs_protocol_tcp = {
    .name =         "TCP",
    .protocol =     IPPROTO_TCP,
    .num_states =       IP_VS_TCP_S_LAST,
    .dont_defrag =      0,
    .init =         NULL,
    .exit =         NULL,
    .init_netns =       __ip_vs_tcp_init,
    .exit_netns =       __ip_vs_tcp_exit,
    .register_app =     tcp_register_app,
    .unregister_app =   tcp_unregister_app,
    .conn_schedule =    tcp_conn_schedule,
    .conn_in_get =      ip_vs_conn_in_get_proto,
    .conn_out_get =     ip_vs_conn_out_get_proto,
    .snat_handler =     tcp_snat_handler,
    .dnat_handler =     tcp_dnat_handler,
    .csum_check =       tcp_csum_check,
    .state_name =       tcp_state_name,
    .state_transition = tcp_state_transition,
    .app_conn_bind =    tcp_app_conn_bind,
    .debug_packet =     ip_vs_tcpudp_debug_packet,
    .timeout_change =   tcp_timeout_change,
};

#define IP_VS_PROTO_TAB_SIZE        32  /* must be power of 2 */
#define IP_VS_PROTO_HASH(proto)     ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];

static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
    unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);

    pp->next = ip_vs_proto_table[hash];
    ip_vs_proto_table[hash] = pp;

    if (pp->init != NULL)
        pp->init(pp);

    return 0;
}

int __init ip_vs_protocol_init(void)
{
    char protocols[64];
#define REGISTER_PROTOCOL(p)            \
    do {                    \
        register_ip_vs_protocol(p); \
        strcat(protocols, ", ");    \
        strcat(protocols, (p)->name);   \
    } while (0)

    protocols[0] = '\0';
    protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
    REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
    REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_SCTP
    REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
    REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
    REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
    pr_info("Registered protocols (%s)\n", &protocols[2]);

    return 0;
}
  3. ip_vs_conn_init
    Mainly sets up the hash table ip_vs_conn_tab that stores the connection entries.

/*
 *  Connection hash table: for input and output packets lookups of IPVS
 */
static struct hlist_head *ip_vs_conn_tab __read_mostly;

#define CONFIG_IP_VS_TAB_BITS   12
static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;

int __init ip_vs_conn_init(void)
{
    int idx;
    //the connection table is also a hash table, with 1 << 12 = 4096 buckets
    /* Compute size and mask */
    ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
    ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;

    /*
     * Allocate the connection hash table and initialize its list heads
     */
    //allocate the 4096 hash buckets
    ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));

    /* Allocate ip_vs_conn slab cache */
    ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
                          sizeof(struct ip_vs_conn), 0,
                          SLAB_HWCACHE_ALIGN, NULL);

    pr_info("Connection hash table configured "
        "(size=%d, memory=%ldKbytes)\n",
        ip_vs_conn_tab_size,
        (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
    IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
          sizeof(struct ip_vs_conn));

    for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
        INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);

    for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
        spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
    }

    /* calculate the random value for connection hash */
    get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

    return 0;
}

  4. Registering the pernet operations ipvs_core_ops and ipvs_core_dev_ops
    The init function is called for every net namespace to perform the per-netns initialization.

static struct pernet_operations ipvs_core_ops = {
    .init = __ip_vs_init,
    .exit = __ip_vs_cleanup,
    .id   = &ip_vs_net_id,
    .size = sizeof(struct netns_ipvs),
};

//ipvs_core_dev_ops only provides exit, which is called when the module is unloaded.
static struct pernet_operations ipvs_core_dev_ops = {
    .exit = __ip_vs_dev_cleanup,
};

//mainly initializes the fields of struct netns_ipvs
static int __net_init __ip_vs_init(struct net *net)
{
    struct netns_ipvs *ipvs;

    ipvs = net_generic(net, ip_vs_net_id);
    if (ipvs == NULL)
        return -ENOMEM;

    /* Hold the beast until a service is registerd */
    ipvs->enable = 0;
    ipvs->net = net;
    /* Counters used for creating unique names */
    ipvs->gen = atomic_read(&ipvs_netns_cnt);
    atomic_inc(&ipvs_netns_cnt);
    net->ipvs = ipvs;
    //initialize the estimator-related fields
    if (ip_vs_estimator_net_init(net) < 0)
        goto estimator_fail;

    if (ip_vs_control_net_init(net) < 0)
        goto control_fail;

    if (ip_vs_protocol_net_init(net) < 0)
        goto protocol_fail;

    if (ip_vs_app_net_init(net) < 0)
        goto app_fail;

    if (ip_vs_conn_net_init(net) < 0)
        goto conn_fail;

    if (ip_vs_sync_net_init(net) < 0)
        goto sync_fail;

    printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
             sizeof(struct netns_ipvs), ipvs->gen);
    return 0;
}
  5. Registering the hook functions with the netfilter framework

static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
    /* After packet filtering, change source only for VS/NAT */
    {
        .hook       = ip_vs_reply4,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_IN,
        .priority   = NF_IP_PRI_NAT_SRC - 2,
    },
    /* After packet filtering, forward packet through VS/DR, VS/TUN,
     * or VS/NAT(change destination), so that filtering rules can be
     * applied to IPVS. */
    {
        .hook       = ip_vs_remote_request4,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_IN,
        .priority   = NF_IP_PRI_NAT_SRC - 1,
    },
    /* Before ip_vs_in, change source only for VS/NAT */
    {
        .hook       = ip_vs_local_reply4,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_OUT,
        .priority   = NF_IP_PRI_NAT_DST + 1,
    },
    /* After mangle, schedule and forward local requests */
    {
        .hook       = ip_vs_local_request4,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_OUT,
        .priority   = NF_IP_PRI_NAT_DST + 2,
    },
    /* After packet filtering (but before ip_vs_out_icmp), catch icmp
     * destined for 0.0.0.0/0, which is for incoming IPVS connections */
    {
        .hook       = ip_vs_forward_icmp,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_FORWARD,
        .priority   = 99,
    },
    /* After packet filtering, change source only for VS/NAT */
    {
        .hook       = ip_vs_reply4,
        .owner      = THIS_MODULE,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_FORWARD,
        .priority   = 100,
    },
};
nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
  6. ip_vs_register_nl_ioctl
    Registers two ways for user space to talk to the kernel, sockopt and netlink; ipvsadm can use either of them to issue commands.

int __init ip_vs_register_nl_ioctl(void)
{
    nf_register_sockopt(&ip_vs_sockopts);

    ip_vs_genl_register();

    return 0;
}

Registering a scheduler

register_ip_vs_scheduler adds a scheduler to the global list ip_vs_schedulers.
In the 3.18.79 kernel I am reading there are already a dozen or so scheduler implementations. Each scheduler is a separate module; when the module is loaded it initializes itself and registers the scheduler in the global variable ip_vs_schedulers (a quick check follows below).
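
For example, simply loading a scheduler module registers it (a quick check; this assumes the sh scheduler is built as a module, which is the common distro configuration):

# load the sh scheduler module; ip_vs is pulled in as a dependency
modprobe ip_vs_sh
# both ip_vs and ip_vs_sh should now be listed
lsmod | grep ip_vs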

//global list head holding the registered schedulers
static LIST_HEAD(ip_vs_schedulers);

int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
    struct ip_vs_scheduler *sched;

    if (!scheduler) {
        pr_err("%s(): NULL arg\n", __func__);
        return -EINVAL;
    }

    //the scheduler must have a name
    if (!scheduler->name) {
        pr_err("%s(): NULL scheduler_name\n", __func__);
        return -EINVAL;
    }

    /* increase the module use count */
    ip_vs_use_count_inc();

    mutex_lock(&ip_vs_sched_mutex);

    //refuse to register the same scheduler twice
    if (!list_empty(&scheduler->n_list)) {
        mutex_unlock(&ip_vs_sched_mutex);
        ip_vs_use_count_dec();
        pr_err("%s(): [%s] scheduler already linked\n",
               __func__, scheduler->name);
        return -EINVAL;
    }

    /*
     *  Make sure that the scheduler with this name doesn't exist
     *  in the scheduler list.
     */
    //check whether a scheduler with the same name already exists
    list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
        if (strcmp(scheduler->name, sched->name) == 0) {
            mutex_unlock(&ip_vs_sched_mutex);
            ip_vs_use_count_dec();
            pr_err("%s(): [%s] scheduler already existed "
                   "in the system\n", __func__, scheduler->name);
            return -EINVAL;
        }
    }
    /*
     *  Add it into the d-linked scheduler list
     */
    //finally insert the scheduler into the global list ip_vs_schedulers
    list_add(&scheduler->n_list, &ip_vs_schedulers);
    mutex_unlock(&ip_vs_sched_mutex);

    pr_info("[%s] scheduler registered.\n", scheduler->name);

    return 0;
}

Below is the sh (source IP hash) scheduler; the rest of the analysis also uses sh as the example.

module_init(ip_vs_sh_init);

static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
    .name =         "sh",
    .refcnt =       ATOMIC_INIT(0),
    .module =       THIS_MODULE,
    .n_list  =      LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
    .init_service =     ip_vs_sh_init_svc,
    .done_service =     ip_vs_sh_done_svc,
    .add_dest =     ip_vs_sh_dest_changed,
    .del_dest =     ip_vs_sh_dest_changed,
    .upd_dest =     ip_vs_sh_dest_changed,
    .schedule =     ip_vs_sh_schedule,
};

static int __init ip_vs_sh_init(void)
{
    return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}

Creating a service and adding an RS

The basic usage of IPVS is as follows: first create a service, specifying the protocol, IP, port and scheduling algorithm; then add real backend IPs and ports to that service, specifying the forwarding mode.
The service is what is exposed to clients: when a client request arrives (destination IP is the service IP), the scheduling algorithm picks a suitable RS from the backend pool, and the packet is forwarded to the RS according to the forwarding mode.

//flush all rules
ipvsadm -C
//add a virtual service: -s sets the scheduling algorithm to sh, -t means TCP, 1.1.1.10:8080 is the vip and its port
ipvsadm -A -t 1.1.1.10:8080 -s sh
//add an RS to the service: -g selects DR mode, -r specifies the RS 1.1.1.4:8080
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g
//list the current configuration
ipvsadm -ln

Let's now look at the code paths for creating a service and adding an RS.

  1. Creating a service
    The structure struct ip_vs_service_user_kern carries the parameters passed to the kernel when ipvsadm adds a service.

struct ip_vs_service_user_kern {
    /* virtual service addresses */
    u16         af;
    //the specified protocol
    u16         protocol;
    //the specified vip
    union nf_inet_addr  addr;       /* virtual ip address */
    //the specified vport
    __be16          port;
    u32         fwmark;     /* firwall mark of service */

    /* virtual service options */
    //name of the specified scheduler
    char            *sched_name;
    char            *pe_name;
    unsigned int        flags;      /* virtual service flags */
    unsigned int        timeout;    /* persistent timeout in sec */
    __be32          netmask;    /* persistent netmask or plen */
};

After ipvsadm issues the add-service command, the kernel runs ip_vs_add_service.

static int
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
          struct ip_vs_service **svc_p)
{
    int ret = 0, i;
    struct ip_vs_scheduler *sched = NULL;
    struct ip_vs_pe *pe = NULL;
    struct ip_vs_service *svc = NULL;
    struct netns_ipvs *ipvs = net_ipvs(net);

    /* increase the module use count */
    ip_vs_use_count_inc();
    
    //If the scheduler name is not "none", look it up by name in the global
    //list ip_vs_schedulers. If the first lookup fails, try to auto-load the
    //module and look again; if both lookups fail, return an error.
    /* Lookup the scheduler by 'u->sched_name' */
    if (strcmp(u->sched_name, "none")) {
        sched = ip_vs_scheduler_get(u->sched_name);
        if (!sched) {
            pr_info("Scheduler module ip_vs_%s not found\n",
                u->sched_name);
            ret = -ENOENT;
            goto out_err;
        }
    }
    
    //if a pe was specified, look up the pe by its name
    if (u->pe_name && *u->pe_name) {
        pe = ip_vs_pe_getbyname(u->pe_name);
        if (pe == NULL) {
            pr_info("persistence engine module ip_vs_pe_%s "
                "not found\n", u->pe_name);
            ret = -ENOENT;
            goto out_err;
        }
    }

    //allocate the service structure
    svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);

    //allocate the per-cpu stats
    svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);

    //initialize the per-cpu stats
    for_each_possible_cpu(i) {
        struct ip_vs_cpu_stats *ip_vs_stats;
        ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
        u64_stats_init(&ip_vs_stats->syncp);
    }

    /* I'm the first user of the service */
    atomic_set(&svc->refcnt, 0);
    //copy the values set on the command line into the service
    svc->af = u->af;
    svc->protocol = u->protocol;
    ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
    svc->port = u->port;
    svc->fwmark = u->fwmark;
    svc->flags = u->flags;
    svc->timeout = u->timeout * HZ;
    svc->netmask = u->netmask;
    svc->net = net;
    //initialize the destinations list, which will hold the RSs
    INIT_LIST_HEAD(&svc->destinations);
    spin_lock_init(&svc->sched_lock);
    spin_lock_init(&svc->stats.lock);

    /* Bind the scheduler */
    if (sched) {
        //Bind the svc to the chosen scheduler: call the scheduler's
        //init_service function and assign sched to svc->scheduler.
        //init_service allocates the scheduler's private data and
        //stores it in svc->sched_data.
        ret = ip_vs_bind_scheduler(svc, sched);
        if (ret)
            goto out_err;
        sched = NULL;
    }

    /* Bind the ct retriever */
    RCU_INIT_POINTER(svc->pe, pe);
    pe = NULL;

    /* Update the virtual service counters */
    if (svc->port == FTPPORT)
        atomic_inc(&ipvs->ftpsvc_counter);
    else if (svc->port == 0)
        atomic_inc(&ipvs->nullsvc_counter);
    //put svc->stats onto the list ipvs->est_list
    ip_vs_start_estimator(net, &svc->stats);

    /* Count only IPv4 services for old get/setsockopt interface */
    if (svc->af == AF_INET)
        //bump the service count
        ipvs->num_services++;

    //Add the svc to one of the global tables:
    //hash it by <protocol, vaddr, vport> into ip_vs_svc_table, or
    //hash it by fwmark into ip_vs_svc_fwm_table.
    /* Hash the service into the service table */
    ip_vs_svc_hash(svc);

    *svc_p = svc;
    /* Now there is a service - full throttle */
    //as soon as there is at least one service, enable is set
    ipvs->enable = 1;

    return 0;
}

/*
 *  Bind a service with a scheduler
 */
int ip_vs_bind_scheduler(struct ip_vs_service *svc,
             struct ip_vs_scheduler *scheduler)
{
    int ret;
    //ip_vs_sh_init_svc
    if (scheduler->init_service) {
        ret = scheduler->init_service(svc);
        if (ret) {
            pr_err("%s(): init error\n", __func__);
            return ret;
        }
    }
    rcu_assign_pointer(svc->scheduler, scheduler);
    return 0;
}

static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
    struct ip_vs_sh_state *s;

    /* allocate the SH table for this service */
    s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
    if (s == NULL)
        return -ENOMEM;

    svc->sched_data = s;
    IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
          "current service\n",
          sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);

    /* assign the hash buckets with current dests */
    ip_vs_sh_reassign(s, svc);

    return 0;
}

/*
 *      Assign all the hash buckets of the specified table with the service.
 */
static int
ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
    int i;
    struct ip_vs_sh_bucket *b;
    struct list_head *p;
    struct ip_vs_dest *dest;
    int d_count;
    bool empty;

    b = &s->buckets[0];
    p = &svc->destinations;
    empty = list_empty(p);
    d_count = 0;
    for (i = 0; i < IP_VS_SH_TAB_SIZE; i++) {
        dest = rcu_dereference_protected(b->dest, 1);
        if (dest)
            ip_vs_dest_put(dest);
        if (empty)
            RCU_INIT_POINTER(b->dest, NULL);
        else {
            if (p == &svc->destinations)
                p = p->next;

            dest = list_entry(p, struct ip_vs_dest, n_list);
            ip_vs_dest_hold(dest);
            RCU_INIT_POINTER(b->dest, dest);

            IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
                      i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
                      atomic_read(&dest->weight));

            /* Don't move to next dest until filling weight */
            if (++d_count >= atomic_read(&dest->weight)) {
                p = p->next;
                d_count = 0;
            }

        }
        b++;
    }
    return 0;
}
  2. Adding an RS
    Similarly, the structure below is used to pass the parameters (see the ipvsadm example right after this list).
    conn_flags: the forwarding mode, e.g. DR or tunnel; if not given on the command line, DR is the default.
    weight: the weight of this RS; the larger the value, the more likely it is to be chosen. Defaults to 1 if not given.
    port: the port given on the command line; in DR or tunnel mode it is automatically replaced by the service port, because those modes never rewrite the packet's port.
    u_threshold: if the connection count exceeds this value, the dest is flagged IP_VS_DEST_F_OVERLOAD, meaning it carries too much load. Defaults to 0.
    l_threshold: if the connection count drops below this value, IP_VS_DEST_F_OVERLOAD is cleared. Defaults to 0.
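
    For reference, these fields map onto ipvsadm switches. A hypothetical example, reusing the vip and RS from the commands above and assuming -w/-x/-y are the weight and threshold switches of your ipvsadm build:

# add an RS in DR mode with weight 3, upper threshold 1000 and lower threshold 800
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g -w 3 -x 1000 -y 800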

struct ip_vs_dest_user_kern {
    /* destination server address */
    union nf_inet_addr  addr;
    __be16          port;

    /* real server options */
    unsigned int        conn_flags; /* connection flags */
    int         weight;     /* destination weight */

    /* thresholds for active connections */
    u32         u_threshold;    /* upper threshold */
    u32         l_threshold;    /* lower threshold */

    /* Address family of addr */
    u16         af;
};

After ipvsadm issues the add-RS command, the kernel runs ip_vs_add_dest.

static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
    struct ip_vs_dest *dest;
    union nf_inet_addr daddr;
    __be16 dport = udest->port;
    int ret;

    EnterFunction(2);

    if (udest->weight < 0) {
        pr_err("%s(): server weight less than zero\n", __func__);
        return -ERANGE;
    }

    if (udest->l_threshold > udest->u_threshold) {
        pr_err("%s(): lower threshold is higher than upper threshold\n",
            __func__);
        return -ERANGE;
    }

    ip_vs_addr_copy(udest->af, &daddr, &udest->addr);

    /* We use function that requires RCU lock */
    rcu_read_lock();
    //check svc->destinations to see whether the dest already exists
    dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
    rcu_read_unlock();

    //it already exists, return
    if (dest != NULL) {
        IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
        return -EEXIST;
    }

    /*
     * Check if the dest already exists in the trash and
     * is from the same service
     */
    //If a dest is deleted while connections still reference it, it is put on the
    //list ipvs->dest_trash. If an identical dest is found on ipvs->dest_trash,
    //it can simply be taken back and reused.
    dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
    if (dest != NULL) {
        IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
                  "dest->refcnt=%d, service %u/%s:%u\n",
                  IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
                  atomic_read(&dest->refcnt),
                  dest->vfwmark,
                  IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
                  ntohs(dest->vport));

        __ip_vs_update_dest(svc, dest, udest, 1);
        ret = 0;
    } else {
        /*
         * Allocate and initialize the dest structure
         */
        //otherwise allocate a new dest structure; this path also ends up calling __ip_vs_update_dest
        ret = ip_vs_new_dest(svc, udest, &dest);
    }
    LeaveFunction(2);

    return ret;
}

static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
            struct ip_vs_dest_user_kern *udest, int add)
{
    struct netns_ipvs *ipvs = net_ipvs(svc->net);
    struct ip_vs_service *old_svc;
    struct ip_vs_scheduler *sched;
    int conn_flags;

    /* We cannot modify an address and change the address family */
    BUG_ON(!add && udest->af != dest->af);

    if (add && udest->af != svc->af)
        ipvs->mixed_address_family_dests++;

    /* set the weight and the flags */
    atomic_set(&dest->weight, udest->weight);
    conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
    conn_flags |= IP_VS_CONN_F_INACTIVE;

    /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
    if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
        conn_flags |= IP_VS_CONN_F_NOOUTPUT;
    } else {
        /*
         *    Put the real service in rs_table if not present.
         *    For now only for NAT!
         */
        ip_vs_rs_hash(ipvs, dest);
    }
    atomic_set(&dest->conn_flags, conn_flags);

    /* bind the service */
    old_svc = rcu_dereference_protected(dest->svc, 1);
    if (!old_svc) {
        __ip_vs_bind_svc(dest, svc);
    } else {
        if (old_svc != svc) {
            ip_vs_zero_stats(&dest->stats);
            __ip_vs_bind_svc(dest, svc);
            __ip_vs_svc_put(old_svc, true);
        }
    }

    /* set the dest status flags */
    dest->flags |= IP_VS_DEST_F_AVAILABLE;

    if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
    dest->u_threshold = udest->u_threshold;
    dest->l_threshold = udest->l_threshold;

    dest->af = udest->af;

    spin_lock_bh(&dest->dst_lock);
    __ip_vs_dst_cache_reset(dest);
    spin_unlock_bh(&dest->dst_lock);

    if (add) {
        ip_vs_start_estimator(svc->net, &dest->stats);
        //add the dest to the svc's circular doubly linked list destinations
        list_add_rcu(&dest->n_list, &svc->destinations);
        svc->num_dests++;
        sched = rcu_dereference_protected(svc->scheduler, 1);
        //call the scheduler's add_dest to redistribute the dests; for the
        //sh scheduler, add_dest is ip_vs_sh_dest_changed
        if (sched && sched->add_dest)
            sched->add_dest(svc, dest);
    } else {
        sched = rcu_dereference_protected(svc->scheduler, 1);
        if (sched && sched->upd_dest)
            sched->upd_dest(svc, dest);
    }
}

ip_vs_sh_state is the sh scheduler's private data. Each bucket holds one dest, and IP_VS_SH_TAB_SIZE is 256, so for sh a single service can use at most 256 dests (RSs).
ip_vs_sh_reassign assigns the dests from the svc->destinations list into buckets->dest. Since there are only 256 buckets, no matter how many dests are added, only the last 256 added take effect (with every dest at weight 1); a dest with a larger weight occupies weight buckets.

struct ip_vs_sh_state {
    struct rcu_head         rcu_head;
    struct ip_vs_sh_bucket      buckets[IP_VS_SH_TAB_SIZE];
};

static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
                 struct ip_vs_dest *dest)
{
    struct ip_vs_sh_state *s = svc->sched_data;

    /* assign the hash buckets with the updated service */
    ip_vs_sh_reassign(s, svc);

    return 0;
}

/*
 *      Assign all the hash buckets of the specified table with the service.
 */
static int
ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
    int i;
    struct ip_vs_sh_bucket *b;
    struct list_head *p;
    struct ip_vs_dest *dest;
    int d_count;
    bool empty;

    b = &s->buckets[0];
    p = &svc->destinations;
    empty = list_empty(p);
    d_count = 0;
    for (i = 0; i < IP_VS_SH_TAB_SIZE; i++) {
        dest = rcu_dereference_protected(b->dest, 1);
        if (dest)
            ip_vs_dest_put(dest);
        if (empty)
            RCU_INIT_POINTER(b->dest, NULL);
        else {
            //svc->destinations is a circular doubly linked list
            if (p == &svc->destinations)
                p = p->next;

            dest = list_entry(p, struct ip_vs_dest, n_list);
            ip_vs_dest_hold(dest);
            RCU_INIT_POINTER(b->dest, dest);

            IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
                      i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
                      atomic_read(&dest->weight));

            /* Don't move to next dest until filling weight */
            if (++d_count >= atomic_read(&dest->weight)) {
                p = p->next;
                d_count = 0;
            }
        }
        b++;
    }
    return 0;
}

If the forwarding mode is DR or tunnel, the port of the added dest is automatically replaced by the svc's port; see the snippet from the ipvsadm command-line code below.

static int process_options(int argc, char **argv, int reading_stdin)
        /*
         * The destination port must be equal to the service port
         * if the IP_VS_CONN_F_TUNNEL or IP_VS_CONN_F_DROUTE is set.
         * Don't worry about this if fwmark is used.
         */
        if (!ce.svc.fwmark &&
            (fwd_method == IP_VS_CONN_F_TUNNEL ||
             fwd_method == IP_VS_CONN_F_DROUTE))
            ce.dest.port = ce.svc.port;

//add RS 1.1.1.3:12 to 1.1.1.10:8080; although port 12 is specified, DR mode converts it to 8080
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g
//add RS 1.1.1.3:13 to 1.1.1.10:8080; port 13 is converted to 8080 as well, an identical RS already exists, so the command fails
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:13 -g
Destination already exists

Packet forwarding in DR mode

Assume the following svc and RS are configured on the LB:

//-s sh selects the sh (source hash) scheduler
ipvsadm -A -t 1.1.1.10:8080 -s sh
//-g selects DR forwarding mode
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g

The flow from the client to the server is: 2.2.2.1:4444 -> 1.1.1.10:8080.
At the LB's LOCAL_IN hook the packet passes through ip_vs_reply4 and ip_vs_remote_request4. ip_vs_reply4 calls ip_vs_out(ops->hooknum, skb, AF_INET), which handles packets in the reply direction, so we skip it for now.

The important path is ip_vs_remote_request4, shown below.

static unsigned int
ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
              const struct net_device *in,
              const struct net_device *out,
              int (*okfn)(struct sk_buff *))
{
    return ip_vs_in(ops->hooknum, skb, AF_INET);
}

static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
    struct net *net;
    struct ip_vs_iphdr iph;
    struct ip_vs_protocol *pp;
    struct ip_vs_proto_data *pd;
    struct ip_vs_conn *cp;
    int ret, pkts;
    struct netns_ipvs *ipvs;

    //if the packet has already been handled by ipvs, just return
    /* Already marked as IPVS request or reply? */
    if (skb->ipvs_property)
        return NF_ACCEPT;

    /*
     *  Big tappo:
     *  - remote client: only PACKET_HOST
     *  - route: used for struct net when skb->dev is unset
     */
    if (unlikely((skb->pkt_type != PACKET_HOST &&
              hooknum != NF_INET_LOCAL_OUT) ||
             !skb_dst(skb))) {
        ip_vs_fill_iph_skb(af, skb, &iph);
        IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
                  " ignored in hook %u\n",
                  skb->pkt_type, iph.protocol,
                  IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
        return NF_ACCEPT;
    }
    /* ipvs enabled in this netns ? */
    net = skb_net(skb);
    ipvs = net_ipvs(net);
    if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
        return NF_ACCEPT;
    //extract the IP header info
    ip_vs_fill_iph_skb(af, skb, &iph);
    ...
    //get the protocol data from the L4 protocol, and from it pp, the set of protocol-specific operations
    /* Protocol supported? */
    pd = ip_vs_proto_data_get(net, iph.protocol);
    if (unlikely(!pd))
        return NF_ACCEPT;
    pp = pd->pp;
    /*
     * Check if the packet belongs to an existing connection entry
     */
    //First look up the connection table ip_vs_conn_tab by caddr and cport.
    //If the connection already exists, take the fast path and call packet_xmit
    //to forward the packet. Otherwise take the slow path: find the svc, let the
    //scheduler pick a suitable RS, create a new connection and bind the transmit
    //function to packet_xmit, which is then called to forward the packet.
    cp = pp->conn_in_get(af, skb, &iph, 0);
    ...
    //no connection yet, a new one has to be created
    if (unlikely(!cp) && !iph.fragoffs) {
        /* No (second) fragments need to enter here, as nf_defrag_ipv6
         * replayed fragment zero will already have created the cp
         */
        int v;
        //1. Call the protocol's conn_schedule (tcp_conn_schedule for TCP),
        //which looks for a matching svc by destination IP and port. If one
        //is found, it calls sched->schedule (ip_vs_sh_schedule for the sh
        //scheduler) to pick a suitable RS and creates the connection via
        //ip_vs_conn_new.
        /* Schedule and create new connection entry into &cp */
        if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
            return v;
    }
    ...
    ip_vs_in_stats(cp, skb);
    ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
    
    //2. call the bound transmit function
    if (cp->packet_xmit)
        ret = cp->packet_xmit(skb, cp, pp, &iph);
        /* do not touch skb anymore */
    else {
        IP_VS_DBG_RL("warning: packet_xmit is null");
        ret = NF_ACCEPT;
         }
}
  1. pp->conn_schedule, which for TCP is tcp_conn_schedule

static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
          int *verdict, struct ip_vs_conn **cpp,
          struct ip_vs_iphdr *iph)
{
    struct net *net;
    struct ip_vs_service *svc;
    struct tcphdr _tcph, *th;
    struct netns_ipvs *ipvs;
    //get the L4 header
    th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
    if (th == NULL) {
        *verdict = NF_DROP;
        return 0;
    }
    net = skb_net(skb);
    ipvs = net_ipvs(net);
    /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
    rcu_read_lock();
    //look for a matching svc by the packet's destination IP and port
    if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
        (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
                      &iph->daddr, th->dest))) {
        int ignored;

        if (ip_vs_todrop(ipvs)) {
            /*
             * It seems that we are very loaded.
             * We have to drop this packet :(
             */
            rcu_read_unlock();
            *verdict = NF_DROP;
            return 0;
        }

        /*
         * Let the virtual server select a real server for the
         * incoming connection, and create a connection entry.
         */
        //svc found: let it choose an RS and create a new connection
        *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
        if (!*cpp && ignored <= 0) {
            if (!ignored)
                *verdict = ip_vs_leave(svc, skb, pd, iph);
            else
                *verdict = NF_DROP;
            rcu_read_unlock();
            return 0;
        }
    }
    rcu_read_unlock();
    /* NF_ACCEPT */
    return 1;
}

struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
           struct ip_vs_proto_data *pd, int *ignored,
           struct ip_vs_iphdr *iph)
{
    struct ip_vs_protocol *pp = pd->pp;
    struct ip_vs_conn *cp = NULL;
    struct ip_vs_scheduler *sched;
    struct ip_vs_dest *dest;
    __be16 _ports[2], *pptr;
    unsigned int flags;

    *ignored = 1;
    /*
     * IPv6 frags, only the first hit here.
     */
    pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
    if (pptr == NULL)
        return NULL;
    ...
    sched = rcu_dereference(svc->scheduler);
    if (sched) {
        /* read svc->sched_data after svc->scheduler */
        smp_rmb();
        //1.1 call the scheduler's schedule to find a suitable dest; for
        //the sh scheduler this is ip_vs_sh_schedule
        dest = sched->schedule(svc, skb, iph);
    } else {
        dest = NULL;
    }
    if (dest == NULL) {
        IP_VS_DBG(1, "Schedule: no dest found.\n");
        return NULL;
    }

    flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
         && iph->protocol == IPPROTO_UDP) ?
        IP_VS_CONN_F_ONE_PACKET : 0;

    /*
     *    Create a connection entry.
     */
    {
        struct ip_vs_conn_param p;

        ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
                      &iph->saddr, pptr[0], &iph->daddr,
                      pptr[1], &p);
        //1.2 create the new connection
        cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
                    dest->port ? dest->port : pptr[1],
                    flags, dest, skb->mark);
        if (!cp) {
            *ignored = -1;
            return NULL;
        }
    }

    ip_vs_conn_stats(cp, svc);
    return cp;
}

1.1 The sh scheduler's schedule function ip_vs_sh_schedule hashes the source IP to find a suitable RS.

static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
          struct ip_vs_iphdr *iph)
{
    struct ip_vs_dest *dest;
    struct ip_vs_sh_state *s;
    __be16 port = 0;

    //whether the source port is included in the hash
    if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
        port = ip_vs_sh_get_port(skb, iph);

    s = (struct ip_vs_sh_state *) svc->sched_data;

    if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
        //compute the hash and pick an RS; if the chosen RS is unavailable,
        //walk on from it until an available RS is found
        dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
    else
        //compute the hash and pick an RS
        dest = ip_vs_sh_get(svc, s, &iph->saddr, port);

    if (!dest) {
        ip_vs_scheduler_err(svc, "no destination available");
        return NULL;
    }

    IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
              IP_VS_DBG_ADDR(svc->af, &iph->saddr),
              IP_VS_DBG_ADDR(dest->af, &dest->addr),
              ntohs(dest->port));

    return dest;
}

static inline struct ip_vs_dest *
ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
         const union nf_inet_addr *addr, __be16 port)
{
    unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
    struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);

    return (!dest || is_unavailable(dest)) ? NULL : dest;
}

1.2 Creating a new connection
A new connection entry is created, hashed by caddr and cport, and inserted into the hash table ip_vs_conn_tab.

The connection entry holds the client info (caddr, cport), the service info (vaddr, vport) and the real server info (daddr, dport).

Depending on the forwarding mode, a different transmit function is bound.

/*
 *  Create a new connection entry and hash it into the ip_vs_conn_tab
 */
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
           const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
           struct ip_vs_dest *dest, __u32 fwmark)
{
    struct ip_vs_conn *cp;
    struct netns_ipvs *ipvs = net_ipvs(p->net);
    struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
                               p->protocol);
    //allocate the connection entry
    cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
    INIT_HLIST_NODE(&cp->c_list);
    setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
    ip_vs_conn_net_set(cp, p->net);
    cp->af         = p->af;
    cp->daf        = dest_af;
    cp->protocol       = p->protocol;
    ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
    cp->cport      = p->cport;
    /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
    ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
               &cp->vaddr, p->vaddr);
    cp->vport      = p->vport;
    ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
    cp->dport          = dport;
    cp->flags      = flags;
    cp->fwmark         = fwmark;
    if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
        ip_vs_pe_get(p->pe);
        cp->pe = p->pe;
        cp->pe_data = p->pe_data;
        cp->pe_data_len = p->pe_data_len;
    } else {
        cp->pe = NULL;
        cp->pe_data = NULL;
        cp->pe_data_len = 0;
    }
    spin_lock_init(&cp->lock);

    /*
     * Set the entry is referenced by the current thread before hashing
     * it in the table, so that other thread run ip_vs_random_dropentry
     * but cannot drop this entry.
     */
    atomic_set(&cp->refcnt, 1);

    cp->control = NULL;
    atomic_set(&cp->n_control, 0);
    atomic_set(&cp->in_pkts, 0);

    cp->packet_xmit = NULL;
    cp->app = NULL;
    cp->app_data = NULL;
    /* reset struct ip_vs_seq */
    cp->in_seq.delta = 0;
    cp->out_seq.delta = 0;

    atomic_inc(&ipvs->conn_count);
    if (flags & IP_VS_CONN_F_NO_CPORT)
        atomic_inc(&ip_vs_conn_no_cport_cnt);

    /* Bind the connection with a destination server */
    cp->dest = NULL;
    ip_vs_bind_dest(cp, dest);

    /* Set its state and timeout */
    cp->state = 0;
    cp->old_state = 0;
    cp->timeout = 3*HZ;
    cp->sync_endtime = jiffies & ~3UL;

    /* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
    if (p->af == AF_INET6)
        ip_vs_bind_xmit_v6(cp);
    else
#endif
        //bind the transmit function according to the forwarding mode
        ip_vs_bind_xmit(cp);

    if (unlikely(pd && atomic_read(&pd->appcnt)))
        ip_vs_bind_app(cp, pd->pp);

    /*
     * Allow conntrack to be preserved. By default, conntrack
     * is created and destroyed for every packet.
     * Sometimes keeping conntrack can be useful for
     * IP_VS_CONN_F_ONE_PACKET too.
     */

    if (ip_vs_conntrack_enabled(ipvs))
        cp->flags |= IP_VS_CONN_F_NFCT;

    /* Hash it in the ip_vs_conn_tab finally */
    //finally insert the connection into the global hash table ip_vs_conn_tab[hash]
    ip_vs_conn_hash(cp);

    return cp;
}

When the connection is created, the transmit function matching the forwarding mode is bound.

/*
 *  Bind a connection entry with the corresponding packet_xmit.
 *  Called by ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
    switch (IP_VS_FWD_METHOD(cp)) {
    case IP_VS_CONN_F_MASQ:
        cp->packet_xmit = ip_vs_nat_xmit;
        break;

    case IP_VS_CONN_F_TUNNEL:
#ifdef CONFIG_IP_VS_IPV6
        if (cp->daf == AF_INET6)
            cp->packet_xmit = ip_vs_tunnel_xmit_v6;
        else
#endif
            cp->packet_xmit = ip_vs_tunnel_xmit;
        break;

    case IP_VS_CONN_F_DROUTE:
        cp->packet_xmit = ip_vs_dr_xmit;
        break;

    case IP_VS_CONN_F_LOCALNODE:
        cp->packet_xmit = ip_vs_null_xmit;
        break;

    case IP_VS_CONN_F_BYPASS:
        cp->packet_xmit = ip_vs_bypass_xmit;
        break;
    }
}
  2. The DR-mode transmit function ip_vs_dr_xmit

int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
          struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
    int local;

    EnterFunction(10);

    rcu_read_lock();
    //look up the route by cp->daddr.ip (the RS IP), not by the destination IP in the skb (the vip)
    local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
                   IP_VS_RT_MODE_LOCAL |
                   IP_VS_RT_MODE_NON_LOCAL |
                   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
    if (local < 0)
        goto tx_error;
    if (local) {
        rcu_read_unlock();
        //if the route is local, just return accept
        return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
    }

    ip_send_check(ip_hdr(skb));

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->ignore_df = 1;
    //not local: send it out through NF_INET_LOCAL_OUT
    ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
    rcu_read_unlock();

    LeaveFunction(10);
    return NF_STOLEN;

  tx_error:
    kfree_skb(skb);
    rcu_read_unlock();
    LeaveFunction(10);
    return NF_STOLEN;
}

static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
                     struct ip_vs_conn *cp, int local)
{
    int ret = NF_STOLEN;
    //set ipvs_property to 1, marking the skb as handled by ipvs
    skb->ipvs_property = 1;
    if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
        ip_vs_notrack(skb);
    if (!local) {
        ip_vs_drop_early_demux_sk(skb);
        skb_forward_csum(skb);
        NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
            dst_output);
    } else
        ret = NF_ACCEPT;
    return ret;
}

At the NF_INET_LOCAL_OUT hook two other ipvs hook functions also run, but because skb->ipvs_property is set they do not process the packet again and simply return accept.

ip_vs_local_reply4 --> ip_vs_out
    /* Already marked as IPVS request or reply? */
    if (skb->ipvs_property)
        return NF_ACCEPT;

ip_vs_local_request4 --> ip_vs_in
    /* Already marked as IPVS request or reply? */
    if (skb->ipvs_property)
        return NF_ACCEPT;

After the hook functions return, dst_output is called, i.e. ip_output.

int ip_output(struct sock *sk, struct sk_buff *skb)
{
    struct net_device *dev = skb_dst(skb)->dev;

    IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

    skb->dev = dev;
    skb->protocol = htons(ETH_P_IP);
    //passes the NF_INET_POST_ROUTING hook, where nothing special happens,
    //and then calls ip_finish_output
    return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
                ip_finish_output,
                !(IPCB(skb)->flags & IPSKB_REROUTED));
}

ip_finish_output -> ip_finish_output2 calls into the neighbour subsystem to fill in the MAC addresses, and this is the key step: the source MAC must become the LB's MAC and the destination MAC the RS's MAC. This works because the earlier route lookup used the RS's IP address rather than the vip carried in the packet. rt->rt_gateway is the destination RS address when source and destination are on the same subnet, or the gateway address otherwise; since the LB and the RS are on the same subnet here, rt->rt_gateway is the RS's address.

static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
    if (rt->rt_gateway)
        return rt->rt_gateway;
    return daddr;
}

ip_finish_output2
    //Look up the neigh entry for the nexthop; if there is none, create one.
    //Only once the RS's MAC address is known can the packet be sent out.
    nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
    neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
    if (unlikely(!neigh))
        neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
    if (!IS_ERR(neigh)) {
        dst_neigh_output(dst, neigh, skb);

dst_neigh_output
      //if the ARP state is valid, dev_queue_xmit is called to send the packet out
      if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
        return neigh_hh_output(hh, skb);
    else
        //In neigh_resolve_output the packet is queued on arp_queue and an ARP
        //request is sent; once the ARP reply arrives, the queued packet is
        //sent out.
        return n->output(n, skb); //neigh_resolve_output
  3. Summary of DR-mode forwarding

a. When the LB forwards a packet to an RS it does not touch the IP layer; only the destination MAC is rewritten to the RS's MAC, so the LB and the RSs must be on the same subnet.
b. Both the LB and the RSs must accept packets whose destination IP is the vip, so the vip has to be configured on the LB and on every RS.
    On the LB, keepalived is typically used to run an active/standby LB pair, with the vip floating between them.
    On each RS the vip must be configured on the lo interface (with a /32 mask), and all NICs on the RS need the following settings (see the sketch after this list):
        arp_ignore=1 --> only answer ARP requests whose target address is configured on the receiving NIC
        arp_announce=2 --> ignore the packet's source IP and pick a suitable address on the outgoing NIC as the ARP source
c. Client requests reach the RS via the LB, but responses do not pass back through the LB, so the RS must be able to reach the client directly and the RS's gateway must not point to the LB.
d. Port mapping is not supported, so there is no need to specify a port when adding an RS; even if one is given, it is converted to the svc's port.
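
A minimal sketch of the RS-side settings described in point b, assuming the vip 1.1.1.10 from the earlier example and eth0 as the RS NIC:

# on each RS: put the vip on lo with a /32 mask
ip addr add 1.1.1.10/32 dev lo
# suppress ARP replies/announcements for the vip as described above
sysctl -w net.ipv4.conf.all.arp_ignore=1
sysctl -w net.ipv4.conf.all.arp_announce=2
sysctl -w net.ipv4.conf.eth0.arp_ignore=1
sysctl -w net.ipv4.conf.eth0.arp_announce=2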

As the code above shows, a packet from the client traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks on the LB, each with two ipvs hook functions (ip_vs_reply4 and ip_vs_remote_request4 at LOCAL_IN, ip_vs_local_reply4 and ip_vs_local_request4 at LOCAL_OUT), but the only one that actually does work is ip_vs_remote_request4 at LOCAL_IN.

TUNNEL forwarding mode

The LVS website's description of tunnel mode is worth reading; two passages are quoted here.
If you want to try a test LVS-Tun setup on the bench, take a standard LVS-DR setup LVS-DR example, change lo on the realservers to tunl0 (and handle the ARP problem on tunl0) and change the ipvsadm switch from -g to -i . If your clients are going to be sending large packets, you need to set the MTU (see MTU for the ipip packet DIP->RIP). This can be done on the realserver with iptables (see tunl MTU solved) or iproute2 (see setting the MTU by route).
The passage above says that to try an LVS-Tun setup you can start from an LVS-DR setup with small changes: remove the vip from lo and configure it on tunl0 instead, and when adding the RSs with ipvsadm change the forwarding switch from -g to -i.

In LVS-Tun, the tunl0 device holds the VIP, just as the lo device holds the device for LVS-DR. You need to build the tunl0 device into the Linux kernel (in networking options - IP:tunneling) - it is turned off by default. The tunnelling (ipip) can be built as a module, in which case you'll have to insmod ipip before you can use it, or you can build ipip directly into the kernel. With a kernel enabled for ipip, you should be able to see the unconfigured tunl0 device with ifconfig or with ip addr show (Feb 2004 - my ifconfig used to see the unconfigured tunl0, but it doesn't anymore.)
The passage above says that if ipip support is built into the kernel, the virtual device tunl0 appears automatically; if it is built as a module, load it with modprobe ipip and the tunl0 device appears as well, as shown below.

[root@test1 ~]# modprobe ipip
[root@test1 ~]# lsmod | grep ipip
ipip                   16384  0
tunnel4                16384  1 ipip
ip_tunnel              24576  1 ipip
[root@test1 ~]# ip a
...
4: tunl0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
    link/ipip 0.0.0.0 brd 0.0.0.0

Then you configure the tunl0 device

ifconfig tunl0 192.168.1.110 netmask 255.255.255.255 broadcast 192.168.1.110
or
ip addr add dev tunl0 192.168.1.110/32 brd 192.168.1.110
Note
the VIP is a /32 addr, so the brd addr is the VIP, not x.x.x.255.

On the LB the tunnel encapsulation does not depend on any other module: ip_vs_tunnel_xmit encapsulates the packet into an IPIP packet itself and sends it out, which also means tunnel mode only supports IPIP tunnels. The RS side must load the ipip module in advance in order to decapsulate the IPIP packets. After processing, the RS sends its response directly to the client, unencapsulated and without passing through the LB. (A minimal setup sketch follows.)
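
A minimal sketch of switching the earlier DR example to tunnel mode, following the steps quoted above (addresses reuse the earlier examples):

# on the LB: same service, but register the RS with -i (tunnel mode)
ipvsadm -A -t 1.1.1.10:8080 -s sh
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:8080 -i

# on the RS: load ipip and put the vip on tunl0 instead of lo
modprobe ipip
ip addr add 1.1.1.10/32 brd 1.1.1.10 dev tunl0
ip link set tunl0 up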

The code path on the LB is mostly the same as for LVS-DR: the packet from the client traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks, each with two ipvs hook functions (ip_vs_reply4 and ip_vs_remote_request4, ip_vs_local_reply4 and ip_vs_local_request4), and only ip_vs_remote_request4 at LOCAL_IN does the real work; the only difference is that the transmit function called at the end is ip_vs_tunnel_xmit instead of ip_vs_dr_xmit.

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0  up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, processe
 *   the request and return the response packets directly to the client
 *   without passing the load balancer. This can greatly increase the
 *   scalability of virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
          struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
    struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
    struct rtable *rt;          /* Route to the other host */
    __be32 saddr;               /* Source for tunnel */
    struct net_device *tdev;        /* Device to other host */
    __u8 next_protocol = 0;
    __u8 dsfield = 0;
    __u8 ttl = 0;
    __be16 df = 0;
    __be16 *dfp = NULL;
    struct iphdr  *iph;         /* Our new IP header */
    unsigned int max_headroom;      /* The extra header space needed */
    int ret, local;

    EnterFunction(10);

    rcu_read_lock();
    //look up the route by cp->daddr; note daddr is the RS IP chosen by the
    //scheduler, not the vip carried in the packet
    local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
                   IP_VS_RT_MODE_LOCAL |
                   IP_VS_RT_MODE_NON_LOCAL |
                   IP_VS_RT_MODE_CONNECT |
                   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
    if (local < 0)
        goto tx_error;
    
    //if the route is local, return accept
    if (local) {
        rcu_read_unlock();
        return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
    }

    //take the output device from the routing result
    rt = skb_rtable(skb);
    tdev = rt->dst.dev;

    /*
     * Okay, now see if we can stuff it in the buffer as-is.
     */
    //compute the extra header room needed on the output device
    max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

    /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
    dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
    //check whether max_headroom can hold the encapsulated packet,
    //and pull the needed fields out of the inner IP header
    skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
                     &next_protocol, NULL, &dsfield,
                     &ttl, dfp);
    if (IS_ERR(skb))
        goto tx_error;

    skb = iptunnel_handle_offloads(
        skb, false, __tun_gso_type_mask(AF_INET, cp->af));
    if (IS_ERR(skb))
        goto tx_error;

    skb->transport_header = skb->network_header;
    //make room for one more IP header in front of the skb's IP header, for the outer IP
    skb_push(skb, sizeof(struct iphdr));
    skb_reset_network_header(skb);
    memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
    //fill in the outer IP header: saddr was provided by the route lookup, daddr is cp->daddr, i.e. the RS IP
    /*
     *  Push down and install the IPIP header.
     */
    iph         =   ip_hdr(skb);
    iph->version        =   4;
    iph->ihl        =   sizeof(struct iphdr)>>2;
    iph->frag_off       =   df;
    iph->protocol       =   next_protocol;
    iph->tos        =   dsfield;
    iph->daddr      =   cp->daddr.ip;
    iph->saddr      =   saddr;
    iph->ttl        =   ttl;
    ip_select_ident(skb, NULL);

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->ignore_df = 1;

    ret = ip_vs_tunnel_xmit_prepare(skb, cp);
    if (ret == NF_ACCEPT)
        //Hand the packet to the LOCAL_OUT hook. At this point it is already an
        //IPIP packet: the outer IPs are the LB/RS tunnel endpoints, the inner IPs are the client and the vip.
        ip_local_out(skb);
    else if (ret == NF_DROP)
        kfree_skb(skb);
    rcu_read_unlock();

    LeaveFunction(10);

    return NF_STOLEN;

  tx_error:
    if (!IS_ERR(skb))
        kfree_skb(skb);
    rcu_read_unlock();
    LeaveFunction(10);
    return NF_STOLEN;
}

NAT (DNAT) forwarding mode

The code path on the LB is mostly the same as for LVS-DR: the packet from the client traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks, each with two ipvs hook functions (ip_vs_reply4 and ip_vs_remote_request4, ip_vs_local_reply4 and ip_vs_local_request4), and only ip_vs_remote_request4 at LOCAL_IN does the real work; the only difference is that the transmit function called at the end is ip_vs_nat_xmit instead of ip_vs_dr_xmit.

int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
           struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
    struct rtable *rt;      /* Route to the other host */
    int local, rc, was_input;

    EnterFunction(10);

    rcu_read_lock();
    /* check if it is a connection of no-client-port */
    if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
        __be16 _pt, *p;

        p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
        if (p == NULL)
            goto tx_error;
        ip_vs_conn_fill_cport(cp, *p);
        IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
    }

    was_input = rt_is_input_route(skb_rtable(skb));
    //look up the route by cp->daddr.ip, not by the destination IP in the skb (the vip)
    local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
                   IP_VS_RT_MODE_LOCAL |
                   IP_VS_RT_MODE_NON_LOCAL |
                   IP_VS_RT_MODE_RDR, NULL, ipvsh);
    if (local < 0)
        goto tx_error;
    rt = skb_rtable(skb);
    ...
    //call the protocol's dnat handler (tcp_dnat_handler), which rewrites the destination port to cp->dport
    /* mangle the packet */
    if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
        goto tx_error;
    //rewrite the destination IP to cp->daddr.ip
    ip_hdr(skb)->daddr = cp->daddr.ip;
    //recompute the IP checksum
    ip_send_check(ip_hdr(skb));

    IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");

    /* FIXME: when application helper enlarges the packet and the length
       is larger than the MTU of outgoing device, there will be still
       MTU problem. */

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->ignore_df = 1;

    rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
    rcu_read_unlock();

    LeaveFunction(10);
    return rc;

  tx_error:
    kfree_skb(skb);
    rcu_read_unlock();
    LeaveFunction(10);
    return NF_STOLEN;
}

static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
                     struct ip_vs_conn *cp, int local)
{
    int ret = NF_STOLEN;

    skb->ipvs_property = 1;
    if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
        ip_vs_notrack(skb);
    else
        ip_vs_update_conntrack(skb, cp, 1);

    /* Remove the early_demux association unless it's bound for the
     * exact same port and address on this host after translation.
     */
    if (!local || cp->vport != cp->dport ||
        !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
        ip_vs_drop_early_demux_sk(skb);

    if (!local) {
        skb_forward_csum(skb);
        //likewise, the DNATed packet is sent out through the LOCAL_OUT hook
        NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
            dst_output);
    } else
        ret = NF_ACCEPT;

    return ret;
}

So the flow arriving at the LB is cip:cport -> vip:vport,
and after DNAT it becomes cip:cport -> rip:rport.
The RS's response flow is rip:rport -> cip:cport, and rip:rport has to be translated back to vip:vport, so this flow must pass through the LB for SNAT. Because its destination IP is not an LB address, the RS's default gateway has to point to the LB. When the response reaches the LB, the route lookup finds that the destination IP is not local, so the packet must be forwarded (net.ipv4.ip_forward = 1 is required) via ip_forward, which finally passes it through the NF_INET_FORWARD hook:

NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
               rt->dst.dev, ip_forward_finish);

Two ipvs-related functions are registered at this hook, ip_vs_forward_icmp and ip_vs_reply4; the former obviously handles ICMP, so the interesting one is ip_vs_reply4.

static unsigned int
ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
         const struct net_device *in, const struct net_device *out,
         int (*okfn)(struct sk_buff *))
{
    return ip_vs_out(ops->hooknum, skb, AF_INET);
}
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
    struct net *net = NULL;
    struct ip_vs_iphdr iph;
    struct ip_vs_protocol *pp;
    struct ip_vs_proto_data *pd;
    struct ip_vs_conn *cp;

    EnterFunction(11);

    /* Already marked as IPVS request or reply? */
    if (skb->ipvs_property)
        return NF_ACCEPT;

    /* Bad... Do not break raw sockets */
    if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
             af == AF_INET)) {
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(skb->sk);

        if (inet && sk->sk_family == PF_INET && inet->nodefrag)
            return NF_ACCEPT;
    }

    if (unlikely(!skb_dst(skb)))
        return NF_ACCEPT;

    net = skb_net(skb);
    if (!net_ipvs(net)->enable)
        return NF_ACCEPT;

    ip_vs_fill_iph_skb(af, skb, &iph);
#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET6) {
        if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
            int related;
            int verdict = ip_vs_out_icmp_v6(skb, &related,
                            hooknum, &iph);

            if (related)
                return verdict;
        }
    } else
#endif
        if (unlikely(iph.protocol == IPPROTO_ICMP)) {
            int related;
            int verdict = ip_vs_out_icmp(skb, &related, hooknum);

            if (related)
                return verdict;
        }

    pd = ip_vs_proto_data_get(net, iph.protocol);
    if (unlikely(!pd))
        return NF_ACCEPT;
    pp = pd->pp;

    /* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET)
#endif
        if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
            if (ip_vs_gather_frags(skb,
                           ip_vs_defrag_user(hooknum)))
                return NF_STOLEN;

            ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
        }

    /*
     * Check if the packet belongs to an existing entry
     */
    //The client-to-RS connection was hashed by cip and cport, so in the reverse
    //direction the lookup uses the destination IP and port (i.e. cip and cport).
    cp = pp->conn_out_get(af, skb, &iph, 0);
    //only if a connection entry is found does the packet need to be handled
    if (likely(cp))
        return handle_response(af, skb, pd, cp, &iph);
    ...
}

static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
        struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
    struct ip_vs_protocol *pp = pd->pp;

    IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");

    if (!skb_make_writable(skb, iph->len))
        goto drop;

    /* mangle the packet */
    //call the protocol's snat_handler, i.e. tcp_snat_handler,
    //which rewrites the source port to the vport
    if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
        goto drop;

    {
        //rewrite the source IP to vaddr
        ip_hdr(skb)->saddr = cp->vaddr.ip;
        ip_send_check(ip_hdr(skb));
    }

    /*
     * nf_iterate does not expect change in the skb->dst->dev.
     * It looks like it is not fatal to enable this code for hooks
     * where our handlers are at the end of the chain list and
     * when all next handlers use skb->dst->dev and not outdev.
     * It will definitely route properly the inout NAT traffic
     * when multiple paths are used.
     */

    /* For policy routing, packets originating from this
     * machine itself may be routed differently to packets
     * passing through.  We want this packet to be routed as
     * if it came from this machine itself.  So re-compute
     * the routing information.
     */
    if (ip_vs_route_me_harder(af, skb))
        goto drop;

    IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");

    ip_vs_out_stats(cp, skb);
    ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
    skb->ipvs_property = 1;
    if (!(cp->flags & IP_VS_CONN_F_NFCT))
        ip_vs_notrack(skb);
    else
        ip_vs_update_conntrack(skb, cp, 0);
    ip_vs_conn_put(cp);

    LeaveFunction(11);
    //Finally just return accept; after the hook functions return,
    //ip_forward_finish sends the packet on to the client.
    return NF_ACCEPT;

drop:
    ip_vs_conn_put(cp);
    kfree_skb(skb);
    LeaveFunction(11);
    return NF_STOLEN;
}

Summary of NAT forwarding (a setup sketch follows below):
The RS's gateway must point to the DIP.
Both request and response packets pass through the LB, so under heavy traffic the LB can become the bottleneck.
Port mapping is supported.
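
A hypothetical NAT-mode setup illustrating these points (the DIP 1.1.1.1 and the RS port 80 are made-up values for this sketch):

# on the LB: enable forwarding and add the service; -m selects NAT mode,
# so the RS port may differ from the service port (port mapping)
sysctl -w net.ipv4.ip_forward=1
ipvsadm -A -t 1.1.1.10:8080 -s sh
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:80 -m

# on each RS: the default route must point to the LB's DIP
ip route replace default via 1.1.1.1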

References

http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/

See also: lvs四层负载均衡 - 简书 (jianshu.com)
