LVS (Linux Virtual Server) is a layer-4 load balancer built on the netfilter framework. It has two parts: the user-space ipvsadm configuration and management tool, and the core kernel module.
The commonly used forwarding modes are DR, tunnel and NAT (DNAT). In DR and tunnel mode only request packets pass through LVS; replies go from the RS straight back to the client. NAT mode allows port mapping, so replies must also pass back through LVS to be SNATed before they reach the client.
LVS uses a scheduler to pick a suitable RS; searching the code for register_ip_vs_scheduler shows the supported schedulers.
Using IPVS requires loading the ip_vs module first (modprobe ip_vs); after a quick check below, we look at how this module is initialized.
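A minimal sketch of that check (the grep assumes you are standing in a kernel source tree; commands are standard modutils/grep usage):
modprobe ip_vs
lsmod | grep ^ip_vs                                      # core module, plus any scheduler modules already loaded
grep -rl register_ip_vs_scheduler net/netfilter/ipvs/    # each match is one scheduler implementation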
module_init(ip_vs_init);
static int __init ip_vs_init(void)
{
ip_vs_control_init();
ip_vs_protocol_init();
ip_vs_conn_init();
register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
register_pernet_device(&ipvs_core_dev_ops);
nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
ip_vs_register_nl_ioctl();
pr_info("ipvs loaded.\n");
return 0;
}
/*
* Hash table: for virtual service lookups
*/
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by */
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
int __init ip_vs_control_init(void)
{
int idx;
//Initialize the two hash tables below; each has 256 buckets
/* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
smp_wmb(); /* Do we really need it now ? */
//Register a netdevice notifier: when a NIC goes down, ip_vs_dst_event is
//called to clear the entries related to that device
register_netdevice_notifier(&ip_vs_dst_notifier);
return 0;
}
struct ip_vs_protocol ip_vs_protocol_tcp = {
.name = "TCP",
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
.init = NULL,
.exit = NULL,
.init_netns = __ip_vs_tcp_init,
.exit_netns = __ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
.conn_in_get = ip_vs_conn_in_get_proto,
.conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = tcp_snat_handler,
.dnat_handler = tcp_dnat_handler,
.csum_check = tcp_csum_check,
.state_name = tcp_state_name,
.state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
};
#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
pp->next = ip_vs_proto_table[hash];
ip_vs_proto_table[hash] = pp;
if (pp->init != NULL)
pp->init(pp);
return 0;
}
int __init ip_vs_protocol_init(void)
{
char protocols[64];
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
strcat(protocols, ", "); \
strcat(protocols, (p)->name); \
} while (0)
protocols[0] = '\0';
protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_SCTP
REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
pr_info("Registered protocols (%s)\n", &protocols[2]);
return 0;
}
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
static struct hlist_head *ip_vs_conn_tab __read_mostly;
#define CONFIG_IP_VS_TAB_BITS 12
static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
int __init ip_vs_conn_init(void)
{
int idx;
//The connection table is also a hash table; the bucket count is 1 << 12 = 4096
/* Compute size and mask */
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
/*
* Allocate the connection hash table and initialize its list heads
*/
//Allocate the 4096 hash buckets
ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
/* Allocate ip_vs_conn slab cache */
ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
sizeof(struct ip_vs_conn), 0,
SLAB_HWCACHE_ALIGN, NULL);
pr_info("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
ip_vs_conn_tab_size,
(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
return 0;
}
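The bucket count comes from ip_vs_conn_tab_bits. On kernels where it is exposed as the conn_tab_bits module parameter (an assumption worth verifying on your build), it can be tuned when the module is loaded, for example:
# assumption: ip_vs exports conn_tab_bits as a read-only module parameter
modprobe ip_vs conn_tab_bits=16                   # 1 << 16 = 65536 hash buckets
cat /sys/module/ip_vs/parameters/conn_tab_bits    # confirm the value in effect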
4. Registering the pernet operations ipvs_core_ops and ipvs_core_dev_ops
For every net namespace the init callback is invoked to perform the per-netns initialization.
static struct pernet_operations ipvs_core_ops = {
.init = __ip_vs_init,
.exit = __ip_vs_cleanup,
.id = &ip_vs_net_id,
.size = sizeof(struct netns_ipvs),
};
//ipvs_core_dev_ops only provides exit, which is called when the module is unloaded.
static struct pernet_operations ipvs_core_dev_ops = {
.exit = __ip_vs_dev_cleanup,
};
//Mainly initializes the fields of struct netns_ipvs
static int __net_init __ip_vs_init(struct net *net)
{
struct netns_ipvs *ipvs;
ipvs = net_generic(net, ip_vs_net_id);
if (ipvs == NULL)
return -ENOMEM;
/* Hold the beast until a service is registerd */
ipvs->enable = 0;
ipvs->net = net;
/* Counters used for creating unique names */
ipvs->gen = atomic_read(&ipvs_netns_cnt);
atomic_inc(&ipvs_netns_cnt);
net->ipvs = ipvs;
//Initialize the estimator-related fields
if (ip_vs_estimator_net_init(net) < 0)
goto estimator_fail;
if (ip_vs_control_net_init(net) < 0)
goto control_fail;
if (ip_vs_protocol_net_init(net) < 0)
goto protocol_fail;
if (ip_vs_app_net_init(net) < 0)
goto app_fail;
if (ip_vs_conn_net_init(net) < 0)
goto conn_fail;
if (ip_vs_sync_net_init(net) < 0)
goto sync_fail;
printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
sizeof(struct netns_ipvs), ipvs->gen);
return 0;
}
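Since the state lives in struct netns_ipvs, every network namespace gets its own independent IPVS instance; a quick way to observe this (ns1 is a hypothetical namespace name):
ip netns add ns1
ip netns exec ns1 ipvsadm -A -t 1.1.1.10:8080 -s sh   # this service exists only inside ns1
ip netns exec ns1 ipvsadm -ln
ipvsadm -ln                                           # the init namespace does not see it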
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
.hook = ip_vs_remote_request4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
};
nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
int __init ip_vs_register_nl_ioctl(void)
{
nf_register_sockopt(&ip_vs_sockopts);
ip_vs_genl_register();
return 0;
}
register_ip_vs_scheduler is called to register a scheduler onto the global list ip_vs_schedulers.
In the 3.18.79 kernel used here there are already a dozen or so scheduler implementations. Each one is built as its own module; when the module is loaded it initializes itself and registers the scheduler into the global ip_vs_schedulers list.
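Each scheduler module is named ip_vs_<name>; ip_vs_add_service tries to auto-load the requested one (as described later), but they can also be loaded and inspected by hand:
modprobe ip_vs_sh            # the source-hash scheduler used as the example below
modprobe ip_vs_rr            # round robin, another in-tree scheduler
lsmod | grep ip_vs_          # every loaded scheduler shows up as its own module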
//Global list head holding the registered schedulers
static LIST_HEAD(ip_vs_schedulers);
int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
struct ip_vs_scheduler *sched;
if (!scheduler) {
pr_err("%s(): NULL arg\n", __func__);
return -EINVAL;
}
//The scheduler name must not be empty
if (!scheduler->name) {
pr_err("%s(): NULL scheduler_name\n", __func__);
return -EINVAL;
}
/* increase the module use count */
ip_vs_use_count_inc();
mutex_lock(&ip_vs_sched_mutex);
//An already-registered scheduler must not be registered a second time
if (!list_empty(&scheduler->n_list)) {
mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
return -EINVAL;
}
/*
* Make sure that the scheduler with this name doesn't exist
* in the scheduler list.
*/
//Check whether a scheduler with the same name already exists
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
return -EINVAL;
}
}
/*
* Add it into the d-linked scheduler list
*/
//Finally insert the scheduler into the global list ip_vs_schedulers
list_add(&scheduler->n_list, &ip_vs_schedulers);
mutex_unlock(&ip_vs_sched_mutex);
pr_info("[%s] scheduler registered.\n", scheduler->name);
return 0;
}
Below is the sh (source IP hash) scheduler; the rest of the analysis also uses sh as the example.
module_init(ip_vs_sh_init);
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
.name = "sh",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
.add_dest = ip_vs_sh_dest_changed,
.del_dest = ip_vs_sh_dest_changed,
.upd_dest = ip_vs_sh_dest_changed,
.schedule = ip_vs_sh_schedule,
};
static int __init ip_vs_sh_init(void)
{
return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
Basic IPVS usage is as follows: first create a service, specifying the protocol, IP, port and scheduling algorithm; then add real backend IPs and ports to that service and choose a forwarding mode.
The service is what is exposed to clients: when a request arrives (its destination IP is the service IP), the scheduling algorithm picks a suitable RS from the backend pool and the packet is sent to that RS according to the forwarding mode.
//Flush all rules
ipvsadm -C
//Add a virtual service: -s selects the sh scheduling algorithm, -t means TCP, 1.1.1.10:8080 is the VIP and its port
ipvsadm -A -t 1.1.1.10:8080 -s sh
//Add an RS to the service: -g selects DR mode, -r specifies the RS 1.1.1.4:8080
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g
//List the current configuration
ipvsadm -ln
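The forwarding mode is chosen per RS by the flag passed to ipvsadm -a; a minimal sketch reusing the addresses above (-i and -m are standard ipvsadm flags, shown for comparison; only one of these would be used for a given RS):
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g    # DR (direct routing)
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -i    # IPIP tunnel
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:80   -m    # NAT (masquerading); only -m allows port mapping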
Next, let's walk through the code paths for creating a service and adding an RS.
struct ip_vs_service_user_kern {
/* virtual service addresses */
u16 af;
//protocol specified by the user
u16 protocol;
//the specified VIP
union nf_inet_addr addr; /* virtual ip address */
//the specified vport
__be16 port;
u32 fwmark; /* firwall mark of service */
/* virtual service options */
//name of the specified scheduler
char *sched_name;
char *pe_name;
unsigned int flags; /* virtual service flags */
unsigned int timeout; /* persistent timeout in sec */
__be32 netmask; /* persistent netmask or plen */
};
After ipvsadm issues the add-service command, the kernel side runs ip_vs_add_service.
static int
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
//If the scheduler name is not "none", look it up by name in the global list
//ip_vs_schedulers. If the first lookup fails, try to auto-load the module
//and look again; if both lookups fail, return an error.
/* Lookup the scheduler by 'u->sched_name' */
if (strcmp(u->sched_name, "none")) {
sched = ip_vs_scheduler_get(u->sched_name);
if (!sched) {
pr_info("Scheduler module ip_vs_%s not found\n",
u->sched_name);
ret = -ENOENT;
goto out_err;
}
}
//If a persistence engine (pe) was specified, look it up by name
if (u->pe_name && *u->pe_name) {
pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
ret = -ENOENT;
goto out_err;
}
}
//Allocate the service structure
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
//Allocate the per-CPU stats structure
svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
//Initialize the per-CPU stats
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *ip_vs_stats;
ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
u64_stats_init(&ip_vs_stats->syncp);
}
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
//Copy the values given on the command line into the service
svc->af = u->af;
svc->protocol = u->protocol;
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->net = net;
//Initialize the destinations list, which will hold the RSs
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
//Bind the svc to the chosen scheduler: call the scheduler's init_service
//callback and assign sched to svc->scheduler. init_service allocates the
//scheduler's private data and stores it in svc->sched_data
ret = ip_vs_bind_scheduler(svc, sched);
if (ret)
goto out_err;
sched = NULL;
}
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
//Put svc->stats onto the ipvs->est_list list
ip_vs_start_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
//Increment the service count
ipvs->num_services++;
//Add the svc into the global table ip_vs_svc_table (hashed by
//protocol/addr/port) or into ip_vs_svc_fwm_table (hashed by fwmark)
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
*svc_p = svc;
/* Now there is a service - full throttle */
//As soon as there is at least one service, enable is set
ipvs->enable = 1;
return 0;
}
/*
* Bind a service with a scheduler
*/
int ip_vs_bind_scheduler(struct ip_vs_service *svc,
struct ip_vs_scheduler *scheduler)
{
int ret;
//ip_vs_sh_init_svc
if (scheduler->init_service) {
ret = scheduler->init_service(svc);
if (ret) {
pr_err("%s(): init error\n", __func__);
return ret;
}
}
rcu_assign_pointer(svc->scheduler, scheduler);
return 0;
}
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_sh_state *s;
/* allocate the SH table for this service */
s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
svc->sched_data = s;
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
/* assign the hash buckets with current dests */
ip_vs_sh_reassign(s, svc);
return 0;
}
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
int d_count;
bool empty;
b = &s->buckets[0];
p = &svc->destinations;
empty = list_empty(p);
d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
dest = rcu_dereference_protected(b->dest, 1);
if (dest)
ip_vs_dest_put(dest);
if (empty)
RCU_INIT_POINTER(b->dest, NULL);
else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
ip_vs_dest_hold(dest);
RCU_INIT_POINTER(b->dest, dest);
IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
atomic_read(&dest->weight));
/* Don't move to next dest until filling weight */
if (++d_count >= atomic_read(&dest->weight)) {
p = p->next;
d_count = 0;
}
}
b++;
}
return 0;
}
struct ip_vs_dest_user_kern {
/* destination server address */
union nf_inet_addr addr;
__be16 port;
/* real server options */
unsigned int conn_flags; /* connection flags */
int weight; /* destination weight */
/* thresholds for active connections */
u32 u_threshold; /* upper threshold */
u32 l_threshold; /* lower threshold */
/* Address family of addr */
u16 af;
};
After ipvsadm issues the add-RS command, the kernel side runs ip_vs_add_dest.
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
union nf_inet_addr daddr;
__be16 dport = udest->port;
int ret;
EnterFunction(2);
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
}
if (udest->l_threshold > udest->u_threshold) {
pr_err("%s(): lower threshold is higher than upper threshold\n",
__func__);
return -ERANGE;
}
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
rcu_read_lock();
//Search svc->destinations to check whether this dest already exists
dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
rcu_read_unlock();
//It already exists, so return
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
return -EEXIST;
}
/*
* Check if the dest already exists in the trash and
* is from the same service
*/
//If a dest is deleted while connections still reference it, it is kept on
//the ipvs->dest_trash list. If an identical dest is found there, it can be
//taken back and reused directly.
dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
if (dest != NULL) {
IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
"dest->refcnt=%d, service %u/%s:%u\n",
IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
//Otherwise allocate a new dest structure; that path also ends up calling __ip_vs_update_dest
ret = ip_vs_new_dest(svc, udest, &dest);
}
LeaveFunction(2);
return ret;
}
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
struct ip_vs_service *old_svc;
struct ip_vs_scheduler *sched;
int conn_flags;
/* We cannot modify an address and change the address family */
BUG_ON(!add && udest->af != dest->af);
if (add && udest->af != svc->af)
ipvs->mixed_address_family_dests++;
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
ip_vs_rs_hash(ipvs, dest);
}
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
old_svc = rcu_dereference_protected(dest->svc, 1);
if (!old_svc) {
__ip_vs_bind_svc(dest, svc);
} else {
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
__ip_vs_svc_put(old_svc, true);
}
}
/* set the dest status flags */
dest->flags |= IP_VS_DEST_F_AVAILABLE;
if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
dest->af = udest->af;
spin_lock_bh(&dest->dst_lock);
__ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
if (add) {
ip_vs_start_estimator(svc->net, &dest->stats);
//Add the dest to the svc's circular doubly-linked destinations list.
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
//Call the scheduler's add_dest to redistribute the dests; for the sh
//scheduler add_dest is ip_vs_sh_dest_changed
if (sched && sched->add_dest)
sched->add_dest(svc, dest);
} else {
sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched && sched->upd_dest)
sched->upd_dest(svc, dest);
}
}
ip_vs_sh_state is the sh scheduler's private data structure. Each bucket holds one dest, and IP_VS_SH_TAB_SIZE is 256, so for sh a single service can effectively hold at most 256 dests (RSs).
ip_vs_sh_reassign copies the dests on the svc->destinations list into buckets->dest. Because there are only 256 buckets, adding more dests than that means only the 256 most recently added ones take effect (assuming every dest has weight 1); a dest with a larger weight occupies that many consecutive buckets (see the ipvsadm sketch after the structure below).
struct ip_vs_sh_state {
struct rcu_head rcu_head;
struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
};
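The weight mentioned above is set with ipvsadm's -w option; with the sketch below (addresses reused from the earlier example), the 256 buckets would end up filled in a repeating 3:1 pattern between the two dests:
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:8080 -g -w 3   # occupies 3 consecutive buckets per round
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g -w 1   # occupies 1 bucket per round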
static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
struct ip_vs_dest *dest)
{
struct ip_vs_sh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
ip_vs_sh_reassign(s, svc);
return 0;
}
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
int d_count;
bool empty;
b = &s->buckets[0];
p = &svc->destinations;
empty = list_empty(p);
d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
dest = rcu_dereference_protected(b->dest, 1);
if (dest)
ip_vs_dest_put(dest);
if (empty)
RCU_INIT_POINTER(b->dest, NULL);
else {
//svc->destinations is a circular doubly-linked list
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
ip_vs_dest_hold(dest);
RCU_INIT_POINTER(b->dest, dest);
IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
atomic_read(&dest->weight));
/* Don't move to next dest until filling weight */
if (++d_count >= atomic_read(&dest->weight)) {
p = p->next;
d_count = 0;
}
}
b++;
}
return 0;
}
If the forwarding mode is DR or tunnel, the port of the dest being added is automatically rewritten to the svc's port; see the snippet from the ipvsadm command-line code below.
static int process_options(int argc, char **argv, int reading_stdin)
/*
* The destination port must be equal to the service port
* if the IP_VS_CONN_F_TUNNEL or IP_VS_CONN_F_DROUTE is set.
* Don't worry about this if fwmark is used.
*/
if (!ce.svc.fwmark &&
(fwd_method == IP_VS_CONN_F_TUNNEL ||
fwd_method == IP_VS_CONN_F_DROUTE))
ce.dest.port = ce.svc.port;
//Add RS 1.1.1.3:12 to 1.1.1.10:8080; port 12 is specified, but because the forwarding mode is DR it is rewritten to 8080
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g
//Add another RS 1.1.1.3:13 to 1.1.1.10:8080; after port 13 is rewritten to 8080, an identical RS already exists, so the command fails
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:13 -g
Destination already exists
Assume the following svc and RS have been configured on the LB:
//-s sh selects the sh (source hash) scheduler
ipvsadm -A -t 1.1.1.10:8080 -s sh
//-g selects DR forwarding mode
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g
The client-to-server flow is 2.2.2.1:4444 -> 1.1.1.10:8080.
At the LB's LOCAL_IN hook the packet is processed by ip_vs_reply4 and ip_vs_remote_request4. ip_vs_reply4 calls ip_vs_out(ops->hooknum, skb, AF_INET), which handles reply traffic (source rewriting for VS/NAT); we leave it aside for now.
The interesting path is ip_vs_remote_request4, shown below.
static unsigned int
ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
return ip_vs_in(ops->hooknum, skb, AF_INET);
}
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
struct netns_ipvs *ipvs;
//If the packet has already been handled by IPVS, return immediately
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/*
* Big tappo:
* - remote client: only PACKET_HOST
* - route: used for struct net when skb->dev is unset
*/
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
ip_vs_fill_iph_skb(af, skb, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
net = skb_net(skb);
ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
//Extract the IP header information
ip_vs_fill_iph_skb(af, skb, &iph);
...
//Look up the protocol data by the L4 protocol, and from it get pp, the set of protocol-specific operations
/* Protocol supported? */
pd = ip_vs_proto_data_get(net, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
//First look up the connection table ip_vs_conn_tab by caddr/cport. If an
//entry exists, take the fast path and forward via packet_xmit. Otherwise
//take the slow path: find the svc, let the scheduler pick a suitable RS,
//create a new connection and bind a transmit function to packet_xmit,
//which is then called to forward the packet
cp = pp->conn_in_get(af, skb, &iph, 0);
...
//No connection yet, a new one has to be created
if (unlikely(!cp) && !iph.fragoffs) {
/* No (second) fragments need to enter here, as nf_defrag_ipv6
* replayed fragment zero will already have created the cp
*/
int v;
//1. Call the protocol's conn_schedule (tcp_conn_schedule for TCP): look up
//a matching svc by destination IP and port; if one is found, call
//sched->schedule (ip_vs_sh_schedule for the sh scheduler) to pick an RS,
//then create the connection with ip_vs_conn_new
/* Schedule and create new connection entry into &cp */
if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
return v;
}
...
ip_vs_in_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
//2. Call the bound transmit function
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp, &iph);
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;
}
}
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct netns_ipvs *ipvs;
//Get the L4 header
th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
if (th == NULL) {
*verdict = NF_DROP;
return 0;
}
net = skb_net(skb);
ipvs = net_ipvs(net);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
rcu_read_lock();
//Look up a matching svc by the packet's destination IP and port
if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
(svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
&iph->daddr, th->dest))) {
int ignored;
if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
//Found an svc: let it pick an RS and create a new connection
*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
else
*verdict = NF_DROP;
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_proto_data *pd, int *ignored,
struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
unsigned int flags;
*ignored = 1;
/*
* IPv6 frags, only the first hit here.
*/
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return NULL;
...
sched = rcu_dereference(svc->scheduler);
if (sched) {
/* read svc->sched_data after svc->scheduler */
smp_rmb();
//1.1 Call the scheduler's schedule callback to find a suitable dest; for
//the sh scheduler this is ip_vs_sh_schedule
dest = sched->schedule(svc, skb, iph);
} else {
dest = NULL;
}
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
&& iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
/*
* Create a connection entry.
*/
{
struct ip_vs_conn_param p;
ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
&iph->saddr, pptr[0], &iph->daddr,
pptr[1], &p);
//1.2 Create the new connection
cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
dest->port ? dest->port : pptr[1],
flags, dest, skb->mark);
if (!cp) {
*ignored = -1;
return NULL;
}
}
ip_vs_conn_stats(cp, svc);
return cp;
}
1.1 The sh scheduler's schedule function is ip_vs_sh_schedule, which hashes the source IP to find a suitable RS.
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_sh_state *s;
__be16 port = 0;
//Whether the source port is included in the hash
if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
port = ip_vs_sh_get_port(skb, iph);
s = (struct ip_vs_sh_state *) svc->sched_data;
if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
//Compute the hash and pick an RS; if that RS is unavailable, walk on
//from it until a usable RS is found
dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
else
//Compute the hash and pick an RS
dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &iph->saddr),
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
return dest;
}
static inline struct ip_vs_dest *
ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
return (!dest || is_unavailable(dest)) ? NULL : dest;
}
1.2 Creating the new connection
A new connection entry is created, hashed by caddr and cport, and inserted into the hash table ip_vs_conn_tab.
The entry carries the client's caddr/cport, the service's vaddr/vport and the real server's daddr/dport.
Depending on the forwarding mode, a different transmit function is bound.
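Connection entries can also be observed from user space while traffic is flowing; both commands below are standard interfaces and show the cip:cport, vip:vport and rip:rport of each entry:
ipvsadm -lnc                   # dump the IPVS connection table
cat /proc/net/ip_vs_conn       # the same information via procfs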
/*
* Create a new connection entry and hash it into the ip_vs_conn_tab
*/
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
struct netns_ipvs *ipvs = net_ipvs(p->net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
p->protocol);
//Allocate the connection entry
cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->daf = dest_af;
cp->protocol = p->protocol;
ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
&cp->vaddr, p->vaddr);
cp->vport = p->vport;
ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
cp->fwmark = fwmark;
if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
ip_vs_pe_get(p->pe);
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
} else {
cp->pe = NULL;
cp->pe_data = NULL;
cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
/*
* Set the entry is referenced by the current thread before hashing
* it in the table, so that other thread run ip_vs_random_dropentry
* but cannot drop this entry.
*/
atomic_set(&cp->refcnt, 1);
cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
cp->packet_xmit = NULL;
cp->app = NULL;
cp->app_data = NULL;
/* reset struct ip_vs_seq */
cp->in_seq.delta = 0;
cp->out_seq.delta = 0;
atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
cp->old_state = 0;
cp->timeout = 3*HZ;
cp->sync_endtime = jiffies & ~3UL;
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
if (p->af == AF_INET6)
ip_vs_bind_xmit_v6(cp);
else
#endif
//Bind a transmit function according to the forwarding mode
ip_vs_bind_xmit(cp);
if (unlikely(pd && atomic_read(&pd->appcnt)))
ip_vs_bind_app(cp, pd->pp);
/*
* Allow conntrack to be preserved. By default, conntrack
* is created and destroyed for every packet.
* Sometimes keeping conntrack can be useful for
* IP_VS_CONN_F_ONE_PACKET too.
*/
if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;
/* Hash it in the ip_vs_conn_tab finally */
//Finally insert the entry into the global hash table ip_vs_conn_tab[hash]
ip_vs_conn_hash(cp);
return cp;
}
When the connection is created, the transmit function matching the forwarding mode is bound:
/*
* Bind a connection entry with the corresponding packet_xmit.
* Called by ip_vs_conn_new.
*/
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
switch (IP_VS_FWD_METHOD(cp)) {
case IP_VS_CONN_F_MASQ:
cp->packet_xmit = ip_vs_nat_xmit;
break;
case IP_VS_CONN_F_TUNNEL:
#ifdef CONFIG_IP_VS_IPV6
if (cp->daf == AF_INET6)
cp->packet_xmit = ip_vs_tunnel_xmit_v6;
else
#endif
cp->packet_xmit = ip_vs_tunnel_xmit;
break;
case IP_VS_CONN_F_DROUTE:
cp->packet_xmit = ip_vs_dr_xmit;
break;
case IP_VS_CONN_F_LOCALNODE:
cp->packet_xmit = ip_vs_null_xmit;
break;
case IP_VS_CONN_F_BYPASS:
cp->packet_xmit = ip_vs_bypass_xmit;
break;
}
}
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
int local;
EnterFunction(10);
rcu_read_lock();
//Route lookup by cp->daddr.ip (the RS IP), not by the destination IP in the skb (the VIP)
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
if (local < 0)
goto tx_error;
if (local) {
rcu_read_unlock();
//If the route says local, returning accept is enough
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
ip_send_check(ip_hdr(skb));
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
//Not local: send the packet out through NF_INET_LOCAL_OUT
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
{
int ret = NF_STOLEN;
//Set ipvs_property to 1 to mark the packet as handled by IPVS
skb->ipvs_property = 1;
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
dst_output);
} else
ret = NF_ACCEPT;
return ret;
}
At the NF_INET_LOCAL_OUT hook the other two IPVS hook functions also run, but because skb->ipvs_property is set they do not process the packet again and just return accept.
ip_vs_local_reply4 --> ip_vs_out
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
ip_vs_local_request4 --> ip_vs_in
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
After returning from the hook functions, dst_output is called, which here means ip_output.
int ip_output(struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
//Go through the NF_INET_POST_ROUTING hook (nothing special happens there),
//then call ip_finish_output
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
ip_finish_output -> ip_finish_output2 calls into the neighbour subsystem to fill in the MAC addresses, and this is the key step: the source MAC becomes the LB's MAC and the destination MAC the RS's MAC. This works because the earlier route lookup used the RS's IP address rather than the VIP carried in the packet. rt->rt_gateway holds the destination RS address when it is on the same subnet, and the gateway address otherwise; since the LB and the RS are on the same subnet, rt->rt_gateway is the RS's address.
static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
if (rt->rt_gateway)
return rt->rt_gateway;
return daddr;
}
ip_finish_output2
//Look up the neigh entry for nexthop; if there is none, create one.
//Only once the RS's MAC address is known can the packet be sent out.
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
dst_neigh_output(dst, neigh, skb);
dst_neigh_output
//If the ARP state is valid, the packet is sent out via dev_queue_xmit
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
return neigh_hh_output(hh, skb);
else
//In neigh_resolve_output the packet is queued on arp_queue and an ARP
//request is sent; once the ARP reply arrives, the queued packet is
//transmitted
return n->output(n, skb); //neigh_resolve_output
a. When the LB forwards a packet to the RS it does not touch the IP layer; it only rewrites the destination MAC to the RS's MAC, so the LB and the RS must be on the same L2 network segment.
b. Both the LB and the RS must accept packets whose destination IP is the VIP, so the VIP has to be configured on both (a configuration sketch follows this list).
On the LB side, keepalived is typically used to run an active/standby LB pair, with the VIP floating between the two LBs.
On the RS the VIP must be configured on the lo interface (with a /32 mask), and every NIC on the RS needs:
arp_ignore=1 --> only answer ARP requests whose target address is configured on the receiving interface
arp_announce=2 --> ignore the packet's source IP and pick a suitable address on the outgoing interface as the ARP source
c. Client requests are forwarded through the LB to the RS, but replies do not go back through the LB, so the RS must be able to reach the client directly and its gateway must not point to the LB.
d. Port mapping is not supported, so there is no need to specify a port when adding an RS; even if one is given, it is rewritten to the svc's port.
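A minimal RS-side configuration sketch for the DR setup above (VIP 1.1.1.10 from the earlier example; the sysctls are the arp_ignore/arp_announce settings from point b, applied to all interfaces plus lo):
ip addr add 1.1.1.10/32 dev lo                 # VIP on loopback with a /32 mask
sysctl -w net.ipv4.conf.all.arp_ignore=1
sysctl -w net.ipv4.conf.lo.arp_ignore=1
sysctl -w net.ipv4.conf.all.arp_announce=2
sysctl -w net.ipv4.conf.lo.arp_announce=2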
As the code above shows, a packet sent by the client traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks on the LB. Each hook has two IPVS functions registered (ip_vs_reply4 and ip_vs_remote_request4, ip_vs_local_reply4 and ip_vs_local_request4), but the only one that actually does the work is ip_vs_remote_request4 on LOCAL_IN.
See the LVS project site's description of tunnel mode; two passages are quoted below.
If you want to try a test LVS-Tun setup on the bench, take a standard LVS-DR setup LVS-DR example, change lo on the realservers to tunl0 (and handle the ARP problem on tunl0) and change the ipvsadm switch from -g to -i . If your clients are going to be sending large packets, you need to set the MTU (see MTU for the ipip packet DIP->RIP). This can be done on the realserver with iptables (see tunl MTU solved) or iproute2 (see setting the MTU by route).
In short: to try an LVS-Tun setup on the bench, take an LVS-DR setup and tweak it slightly: move the VIP from lo to tunl0 on the realservers (and handle the ARP problem on tunl0), and change the ipvsadm forwarding switch from -g to -i when adding the RS.
In LVS-Tun, the tunl0 device holds the VIP, just as the lo device holds the device for LVS-DR. You need to build the tunl0 device into the Linux kernel (in networking options - IP:tunneling) - it is turned off by default. The tunnelling (ipip) can be built as a module, in which case you'll have to insmod ipip before you can use it, or you can build ipip directly into the kernel. With a kernel enabled for ipip, you should be able to see the unconfigured tunl0 device with ifconfig or with ip addr show (Feb 2004 - my ifconfig used to see the unconfigured tunl0, but it doesn't anymore.)
In short: if the ipip code is built into the kernel, the virtual device tunl0 appears automatically; if it is built as a module, run modprobe ipip first, which also creates tunl0. For example:
[root@test1 ~]# modprobe ipip
[root@test1 ~]# lsmod | grep ipip
ipip 16384 0
tunnel4 16384 1 ipip
ip_tunnel 24576 1 ipip
[root@test1 ~]# ip a
...
4: tunl0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
link/ipip 0.0.0.0 brd 0.0.0.0
Then you configure the tunl0 device
ifconfig tunl0 192.168.1.110 netmask 255.255.255.255 broadcast 192.168.1.110
or
ip addr add dev tunl0 192.168.1.110/32 brd 192.168.1.110
Note
the VIP is a /32 addr, so the brd addr is the VIP, not x.x.x.255.
On the LB, building the tunnel packet does not depend on any other module: ip_vs_tunnel_xmit encapsulates the packet as IPIP itself and sends it out, which also means tunnel mode only supports IPIP tunnels. The RS side must load the ipip module in advance in order to decapsulate the packets; after processing, the RS sends the reply directly to the client, without encapsulation and without passing through the LB.
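A corresponding RS-side sketch for LVS-Tun (same VIP; the rp_filter relaxation is not mentioned in the quoted HOWTO and is an assumption, commonly needed because the decapsulated packet arrives on tunl0 with the VIP as its destination):
modprobe ipip                                  # creates the tunl0 device
ip link set tunl0 up
ip addr add 1.1.1.10/32 dev tunl0              # VIP on tunl0 instead of lo
sysctl -w net.ipv4.conf.tunl0.arp_ignore=1
sysctl -w net.ipv4.conf.tunl0.arp_announce=2
sysctl -w net.ipv4.conf.tunl0.rp_filter=0      # assumption: relax reverse-path filtering for tunl0
sysctl -w net.ipv4.conf.all.rp_filter=0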
The code path on the LB is mostly the same as for LVS-DR: the client's packet traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks, each with two IPVS hook functions (ip_vs_reply4 and ip_vs_remote_request4, ip_vs_local_reply4 and ip_vs_local_request4), and only ip_vs_remote_request4 on LOCAL_IN does the real work; the only difference is that the transmit function called at the end is ip_vs_tunnel_xmit instead of ip_vs_dr_xmit.
/*
* IP Tunneling transmitter
*
* This function encapsulates the packet in a new IP packet, its
* destination will be set to cp->daddr. Most code of this function
* is taken from ipip.c.
*
* It is used in VS/TUN cluster. The load balancer selects a real
* server from a cluster based on a scheduling algorithm,
* encapsulates the request packet and forwards it to the selected
* server. For example, all real servers are configured with
* "ifconfig tunl0 up". When the server receives
* the encapsulated packet, it will decapsulate the packet, processe
* the request and return the response packets directly to the client
* without passing the load balancer. This can greatly increase the
* scalability of virtual server.
*
* Used for ANY protocol
*/
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
struct rtable *rt; /* Route to the other host */
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
__u8 next_protocol = 0;
__u8 dsfield = 0;
__u8 ttl = 0;
__be16 df = 0;
__be16 *dfp = NULL;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
EnterFunction(10);
rcu_read_lock();
//Route lookup by cp->daddr; note daddr is the IP of the RS chosen by the
//scheduler, not the VIP carried in the packet
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_CONNECT |
IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
if (local < 0)
goto tx_error;
//If the route says local, return accept
if (local) {
rcu_read_unlock();
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
//Take the output device from the route result
rt = skb_rtable(skb);
tdev = rt->dst.dev;
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
//Compute the extra header space needed for the output device
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
//First check whether max_headroom can hold the encapsulated packet,
//and pull the needed fields out of the inner IP header
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, NULL, &dsfield,
&ttl, dfp);
if (IS_ERR(skb))
goto tx_error;
skb = iptunnel_handle_offloads(
skb, false, __tun_gso_type_mask(AF_INET, cp->af));
if (IS_ERR(skb))
goto tx_error;
skb->transport_header = skb->network_header;
//Make room for one more IP header in front of the skb's IP header, for the outer IP
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
//Fill in the outer IP header: saddr was provided by the route lookup, daddr is cp->daddr, i.e. the RS IP
/*
* Push down and install the IPIP header.
*/
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr)>>2;
iph->frag_off = df;
iph->protocol = next_protocol;
iph->tos = dsfield;
iph->daddr = cp->daddr.ip;
iph->saddr = saddr;
iph->ttl = ttl;
ip_select_ident(skb, NULL);
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
//Hand the packet to the LOCAL_OUT hook; it is now an IPIP packet whose
//outer IPs are the LB/RS tunnel addresses and whose inner IPs are the
//client and the VIP
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
tx_error:
if (!IS_ERR(skb))
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
The code path on the LB is mostly the same as for LVS-DR: the client's packet traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks, each with two IPVS hook functions (ip_vs_reply4 and ip_vs_remote_request4, ip_vs_local_reply4 and ip_vs_local_request4), and only ip_vs_remote_request4 on LOCAL_IN does the real work; the only difference is that the transmit function called at the end is ip_vs_nat_xmit instead of ip_vs_dr_xmit.
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
int local, rc, was_input;
EnterFunction(10);
rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
was_input = rt_is_input_route(skb_rtable(skb));
//Route lookup by cp->daddr.ip, not by the destination IP in the skb (the VIP)
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR, NULL, ipvsh);
if (local < 0)
goto tx_error;
rt = skb_rtable(skb);
...
//Call the protocol's dnat_handler (tcp_dnat_handler), rewriting the packet's destination port to cp->dport
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
//Rewrite the destination IP to cp->daddr.ip
ip_hdr(skb)->daddr = cp->daddr.ip;
//Recompute the IP checksum
ip_send_check(ip_hdr(skb));
IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
rcu_read_unlock();
LeaveFunction(10);
return rc;
tx_error:
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
{
int ret = NF_STOLEN;
skb->ipvs_property = 1;
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 1);
/* Remove the early_demux association unless it's bound for the
* exact same port and address on this host after translation.
*/
if (!local || cp->vport != cp->dport ||
!ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
ip_vs_drop_early_demux_sk(skb);
if (!local) {
skb_forward_csum(skb);
//As before, send the DNATed packet out through the LOCAL_OUT hook
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
dst_output);
} else
ret = NF_ACCEPT;
return ret;
}
So the flow arriving at the LB is cip:cport -> vip:vport,
and after DNAT it becomes cip:cport -> rip:rport.
The RS's reply flow is rip:rport -> cip:cport. rip:rport has to be translated back to vip:vport, so the reply must go through the LB for SNAT; and since its destination IP is not an address of the LB, the RS's default gateway must point to the LB. When the reply reaches the LB, the route lookup finds a non-local destination, so the packet has to be forwarded (net.ipv4.ip_forward = 1 must be set) via the ip_forward function, which ends by passing the NF_INET_FORWARD hook:
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
rt->dst.dev, ip_forward_finish);
Two IPVS-related functions are registered at this hook, ip_vs_forward_icmp and ip_vs_reply4; the former obviously handles ICMP, so the one that matters here is ip_vs_reply4.
static unsigned int
ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
return ip_vs_out(ops->hooknum, skb, AF_INET);
}
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
EnterFunction(11);
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/* Bad... Do not break raw sockets */
if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(skb->sk);
if (inet && sk->sk_family == PF_INET && inet->nodefrag)
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
net = skb_net(skb);
if (!net_ipvs(net)->enable)
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_out_icmp_v6(skb, &related,
hooknum, &iph);
if (related)
return verdict;
}
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
int verdict = ip_vs_out_icmp(skb, &related, hooknum);
if (related)
return verdict;
}
pd = ip_vs_proto_data_get(net, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET)
#endif
if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
if (ip_vs_gather_frags(skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
//The connection was created and hashed by cip/cport on the client-to-RS
//path, so in the reverse direction the lookup key is the destination IP
//and port, which are again cip and cport
cp = pp->conn_out_get(af, skb, &iph, 0);
//Only if a connection entry is found does the packet need handling
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph);
...
}
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, iph->len))
goto drop;
/* mangle the packet */
//Call the protocol's snat_handler (tcp_snat_handler), which rewrites the
//source port back to the vport
if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
goto drop;
{
//Rewrite the source IP back to vaddr
ip_hdr(skb)->saddr = cp->vaddr.ip;
ip_send_check(ip_hdr(skb));
}
/*
* nf_iterate does not expect change in the skb->dst->dev.
* It looks like it is not fatal to enable this code for hooks
* where our handlers are at the end of the chain list and
* when all next handlers use skb->dst->dev and not outdev.
* It will definitely route properly the inout NAT traffic
* when multiple paths are used.
*/
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through. We want this packet to be routed as
* if it came from this machine itself. So re-compute
* the routing information.
*/
if (ip_vs_route_me_harder(af, skb))
goto drop;
IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
ip_vs_conn_put(cp);
LeaveFunction(11);
//Finally just return accept; after the hook returns, ip_forward_finish
//is called and the packet is eventually delivered to the client
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
LeaveFunction(11);
return NF_STOLEN;
}
Summary of NAT (DNAT) forwarding:
The RS's gateway must point to the LB's DIP
Both requests and replies pass through the LB, so under heavy traffic the LB becomes the bottleneck
Port mapping is supported (a minimal setup sketch follows)
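A minimal LB/RS setup sketch for NAT mode (VIP and RS reused from earlier; 1.1.1.1 stands in for the LB's DIP and is a hypothetical address):
# on the LB
sysctl -w net.ipv4.ip_forward=1                # replies must be forwarded back through the LB
ipvsadm -A -t 1.1.1.10:8080 -s sh
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:80 -m   # -m = NAT; port mapping 8080 -> 80 is allowed
# on each RS (1.1.1.1 is the hypothetical DIP on the LB)
ip route replace default via 1.1.1.1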
http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/
See also: lvs四层负载均衡 - 简书 (jianshu.com)