1. 前言
ipvs是章文嵩先生主持的一个开源项目,早在2.2内核时就已经以内核补丁形式出现,RedHat6.1和6.0的一个重要区别就是增加了IPVS。从2.4.24后IPVS已经成为Linux官方标准内核的一部分。2.2时IPVS是完全独立的部分;2.4以后借用了netfilter的一些处理机制,主体仍然比较独立,不过部分功能和netfilter有重复的地方。
IPVS官方网站为: http://www.linuxvirtualserver.org
以下内核代码版本2.6.17.11, ipvs版本为1.2.1。
2. IPVS的外部表现
根据LVS官方网站的介绍,LVS支持三种负载均衡模式:NAT,tunnel和direct routing(DR)。NAT是通用模式,所有交互数据必须通过均衡器;后两种则是一种半连接处理方式,请求数据通过均衡器,而服务器的回应则是直接路由返回的。这两种方法的区别是:tunnel模式下由于进行了IP封装所以可路由;而DR方式是通过修改MAC地址来实现的,所以均衡器和服务器必须在同一网段。
3. 几个重要结构
3.1 协议
这个结构用来描述IPVS支持的IP协议。IPVS目前支持TCP、UDP、AH和ESP这4种承载于IP之上的协议。
/*
 * Describes one IP protocol supported by IPVS, together with the
 * per-protocol callbacks used for scheduling, connection lookup,
 * NAT, checksum handling, state tracking and timeouts.
 */
struct ip_vs_protocol {
// next entry in the protocol list
struct ip_vs_protocol *next;
// protocol name: "TCP", "UDP", ...
char *name;
// protocol number: 6, 17, ...
__u16 protocol;
// NOTE(review): the original note here read "do not allocate"; judging by
// the field name this presumably means "do not defragment" -- confirm
int dont_defrag;
// application counter for this protocol, i.e. the number of
// multi-connection protocols (app incarnations) attached to it
atomic_t appcnt; /* counter of proto app incs */
// array of timeouts, one per protocol state
int *timeout_table; /* protocol timeout table */
// protocol initialization
void (*init)(struct ip_vs_protocol *pp);
// protocol release
void (*exit)(struct ip_vs_protocol *pp);
// protocol scheduling
int (*conn_schedule)(struct sk_buff *skb,
struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp);
// look up the IPVS connection for the "in" direction
struct ip_vs_conn *
(*conn_in_get)(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse);
// look up the IPVS connection for the "out" direction
struct ip_vs_conn *
(*conn_out_get)(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse);
// source NAT operation
int (*snat_handler)(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp);
// destination NAT operation
int (*dnat_handler)(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp);
// protocol checksum computation/check
int (*csum_check)(struct sk_buff *skb, struct ip_vs_protocol *pp);
// name of a protocol state, e.g. "LISTEN", "ESTABLISH", ...
const char *(*state_name)(int state);
// protocol state transition
int (*state_transition)(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_protocol *pp);
// register an application
int (*register_app)(struct ip_vs_app *inc);
// unregister an application
void (*unregister_app)(struct ip_vs_app *inc);
// presumably binds a connection to a registered application -- confirm
int (*app_conn_bind)(struct ip_vs_conn *cp);
// print a packet (debugging)
void (*debug_packet)(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg);
// adjust timeouts
void (*timeout_change)(struct ip_vs_protocol *pp, int flags);
// set the protocol timeout for a given state
int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to);
};
3.2 IPVS连接
这个结构用来描述IPVS的连接。IPVS的连接和netfilter定义的连接类似
/*
 * IP_VS structure allocated for each dynamically scheduled connection
 *
 * Describes one IPVS connection: the client/virtual/real-server address
 * triple, its timer and state, and the transmit and application hooks.
 */
struct ip_vs_conn {
// hash list linkage
struct list_head c_list; /* hashed list heads */
/* Protocol, addresses and port numbers */
// client address
__u32 caddr; /* client address */
// virtual address the service exposes to the outside
__u32 vaddr; /* virtual address */
// actual (real) server address
__u32 daddr; /* destination address */
// client port
__u16 cport;
// virtual port the service exposes to the outside
__u16 vport;
// actual (real) server port
__u16 dport;
// protocol type
__u16 protocol; /* Which protocol (TCP/UDP) */
/* counter and timer */
// connection reference count
atomic_t refcnt; /* reference count */
// timer
struct timer_list timer; /* Expiration timer */
// timeout value
volatile unsigned long timeout; /* timeout */
/* Flags and state transition */
// lock protecting state transitions
spinlock_t lock; /* lock for state transition */
volatile __u16 flags; /* status flags */
volatile __u16 state; /* state info */
/* Control members */
// master connection, e.g. for FTP
struct ip_vs_conn *control; /* Master control connection */
// number of child connections
atomic_t n_control; /* Number of controlled ones */
// the real server
struct ip_vs_dest *dest; /* real server */
// counter of incoming packets
atomic_t in_pkts; /* incoming packet counter */
/* packet transmitter for different forwarding methods. If it
mangles the packet, it must return NF_DROP or better NF_STOLEN,
otherwise this must be changed to a sk_buff **.
*/
// packet transmit function
int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp);
/* Note: we can group the following members into a structure,
in order to save more space, and the following members are
only used in VS/NAT anyway */
// IPVS application
struct ip_vs_app *app; /* bound ip_vs_app object */
// application private data
void *app_data; /* Application private data */
// sequence number information for incoming data
struct ip_vs_seq in_seq; /* incoming seq. struct */
// sequence number information for outgoing data
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
3.3 IPVS服务
这个结构用来描述IPVS对外的虚拟服务器信息。
/*
 * The information about the virtual service offered to the net
 * and the forwarding entries
 */
struct ip_vs_service {
// list hashed by the ordinary key: protocol, address, port
struct list_head s_list; /* for normal service table */
// list hashed by nfmark (the article's author feels this is unnecessary)
struct list_head f_list; /* for fwmark-based service table */
// reference count
atomic_t refcnt; /* reference counter */
// use count
atomic_t usecnt; /* use counter */
// protocol
__u16 protocol; /* which protocol (TCP/UDP) */
// virtual server address
__u32 addr; /* IP address for virtual service */
// virtual port
__u16 port; /* port number for the service */
// this is the nfmark carried in the skb
__u32 fwmark; /* firewall mark of the service */
// flags
unsigned flags; /* service status flags */
// timeout
unsigned timeout; /* persistent timeout in ticks */
// network mask
__u32 netmask; /* grouping granularity */
// list of real servers
struct list_head destinations; /* real server d-linked list */
// number of real servers
__u32 num_dests; /* number of servers */
// service statistics
struct ip_vs_stats stats; /* statistics for the service */
// application
struct ip_vs_app *inc; /* bind conns to this app inc */
/* for scheduling */
// scheduler pointer
struct ip_vs_scheduler *scheduler; /* bound scheduler object */
rwlock_t sched_lock; /* lock sched_data */
void *sched_data; /* scheduler application data */
};
3.4 IPVS目的服务器
这个结构用来描述具体的真实服务器的信息
/*
 * The real server destination forwarding entry
 * with ip address, port number, and so on.
 */
struct ip_vs_dest {
// list linkage: per-service destination list and the table of all dests
struct list_head n_list; /* for the dests in the service */
struct list_head d_list; /* for table with all the dests */
// server address
__u32 addr; /* IP address of the server */
// server port
__u16 port; /* port number of the server */
// destination flags; declared volatile
volatile unsigned flags; /* dest status flags */
// connection flags
atomic_t conn_flags; /* flags to copy to conn */
// server weight
atomic_t weight; /* server weight */
// reference count
atomic_t refcnt; /* reference counter */
// statistics
struct ip_vs_stats stats; /* statistics */
/* connection counters and thresholds */
// active connections
atomic_t activeconns; /* active connections */
// inactive connections
atomic_t inactconns; /* inactive connections */
// persistent connections
atomic_t persistconns; /* persistent connections */
// upper connection limit
__u32 u_threshold; /* upper threshold */
// lower connection limit
__u32 l_threshold; /* lower threshold */
/* for destination cache */
spinlock_t dst_lock; /* lock of dst_cache */
struct dst_entry *dst_cache; /* destination cache entry */
u32 dst_rtos; /* RT_TOS(tos) for dst */
/* for virtual service */
struct ip_vs_service *svc; /* service it belongs to */
__u16 protocol; /* which protocol (TCP/UDP) */
__u32 vaddr; /* virtual IP address */
__u16 vport; /* virtual port number */
__u32 vfwmark; /* firewall mark of service */
};
3.5 IPVS调度器
这个结构用来描述IPVS调度算法,目前调度方法包括rr,wrr,lc, wlc, lblc, lblcr, dh, sh等
/*
 * The scheduler object
 *
 * Describes one IPVS scheduling algorithm: its identity (name, owning
 * module, reference count) plus the callbacks invoked on a service.
 */
struct ip_vs_scheduler {
struct list_head n_list; /* d-linked list head */
char *name; /* scheduler name */
atomic_t refcnt; /* reference counter */
struct module *module; /* THIS_MODULE/NULL */
/* scheduler initializing service */
int (*init_service)(struct ip_vs_service *svc);
/* scheduling service finish */
int (*done_service)(struct ip_vs_service *svc);
/* scheduler updating service */
int (*update_service)(struct ip_vs_service *svc);
/* selecting a server from the given service */
struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
const struct sk_buff *skb);
};
3.6 IPVS应用
IPVS应用是针对多连接协议的, 目前也就只支持FTP。由于ip_vs_app.c是从2.2过来的,没有管内核是否本身有NAT的情况,所以相当于自身实现了应用协议的NAT处理,包括内容信息的改变,TCP序列号确认号的调整等,而现在这些都由netfilter实现了,IPVS可以不用管这些,只处理连接调度就行了。
IPVS的应用模块化还不是很好,在处理连接端口时,还要判断是否是FTPPORT,也就是说并不支持其他多连接协议。应该像netfilter一样为每个多连接协议设置一个helper,自动调用,而不用在程序里判断端口。
/*
 * The application module object (a.k.a. app incarnation)
 */
struct ip_vs_app
{
// links this object into the application list
struct list_head a_list; /* member in app list */
int type; /* IP_VS_APP_TYPE_xxx */
char *name; /* application module name */
// protocol: TCP, UDP, ...
__u16 protocol;
// the module itself
struct module *module; /* THIS_MODULE/NULL */
// list of the concrete incarnations of this application
struct list_head incs_list; /* list of incarnations */
/* members for application incarnations */
// links the app structure into the application table of the
// corresponding protocol (TCP, UDP, ...)
struct list_head p_list; /* member in proto app list */
struct ip_vs_app *app; /* its real application */
__u16 port; /* port number in net order */
atomic_t usecnt; /* usage counter */
/* output hook: return false if can't linearize. diff set for TCP. */
int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff **, int *diff);
/* input hook: return false if can't linearize. diff set for TCP. */
int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff **, int *diff);
/* ip_vs_app initializer */
int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
/* ip_vs_app finish */
int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *);
/* not used now */
int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *,
struct ip_vs_protocol *);
void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *);
int * timeout_table;
int * timeouts;
int timeouts_size;
int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app,
int *verdict, struct ip_vs_conn **cpp);
struct ip_vs_conn *
(*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app,
const struct iphdr *iph, unsigned int proto_off,
int inverse);
struct ip_vs_conn *
(*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app,
const struct iphdr *iph, unsigned int proto_off,
int inverse);
int (*state_transition)(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_app *app);
void (*timeout_change)(struct ip_vs_app *app, int flags);
};
3.7 用户空间数据结构
用户空间信息是ipvsadm程序接收用户输入后传递给内核ipvs的信息,信息都是很直接的,没有各种控制信息。ipvsadm和ipvs的关系相当于iptables和netfilter的关系。
3.7.1 用户空间的虚拟服务信息
/*
 * The struct ip_vs_service_user and struct ip_vs_dest_user are
 * used to set IPVS rules through setsockopt.
 *
 * This one carries the user-space description of a virtual service.
 */
struct ip_vs_service_user {
/* virtual service addresses */
u_int16_t protocol;
u_int32_t addr; /* virtual ip address */
u_int16_t port;
u_int32_t fwmark; /* firewall mark of service */
/* virtual service options */
char sched_name[IP_VS_SCHEDNAME_MAXLEN];
unsigned flags; /* virtual service flags */
unsigned timeout; /* persistent timeout in sec */
u_int32_t netmask; /* persistent netmask */
};
3.7.2 用户空间的真实服务器信息
/*
 * User-space description of a real server (destination): its address
 * and port plus the per-destination options and thresholds.
 */
struct ip_vs_dest_user {
/* destination server address */
u_int32_t addr;
u_int16_t port;
/* real server options */
unsigned conn_flags; /* connection flags */
int weight; /* destination weight */
/* thresholds for active connections */
u_int32_t u_threshold; /* upper threshold */
u_int32_t l_threshold; /* lower threshold */
};
3.7.3 用户空间的统计信息
/*
 * IPVS statistics object (for user space)
 *
 * Cumulative counters come first (byte counters are 64-bit), followed
 * by the current rate values.
 */
struct ip_vs_stats_user
{
__u32 conns; /* connections scheduled */
__u32 inpkts; /* incoming packets */
__u32 outpkts; /* outgoing packets */
__u64 inbytes; /* incoming bytes */
__u64 outbytes; /* outgoing bytes */
__u32 cps; /* current connection rate */
__u32 inpps; /* current in packet rate */
__u32 outpps; /* current out packet rate */
__u32 inbps; /* current in byte rate */
__u32 outbps; /* current out byte rate */
};
3.7.4 用户空间的获取信息结构
/* The argument to IP_VS_SO_GET_INFO */
/* General IPVS information: version, hash table size, service count. */
struct ip_vs_getinfo {
/* version number */
unsigned int version;
/* size of connection hash table */
unsigned int size;
/* number of virtual services */
unsigned int num_services;
};
3.7.5 用户空间的服务规则项信息
/* The argument to IP_VS_SO_GET_SERVICE */
/* One virtual-service rule entry as seen from user space. */
struct ip_vs_service_entry {
/* which service: user fills in these */
u_int16_t protocol;
u_int32_t addr; /* virtual address */
u_int16_t port;
u_int32_t fwmark; /* firewall mark of service */
/* service options */
char sched_name[IP_VS_SCHEDNAME_MAXLEN];
unsigned flags; /* virtual service flags */
unsigned timeout; /* persistent timeout */
u_int32_t netmask; /* persistent netmask */
/* number of real servers */
unsigned int num_dests;
/* statistics */
struct ip_vs_stats_user stats;
};
3.7.6 用户空间的服务器项信息
/*
 * One real-server entry as seen from user space: address, options,
 * thresholds, connection counters and statistics.
 */
struct ip_vs_dest_entry {
u_int32_t addr; /* destination address */
u_int16_t port;
unsigned conn_flags; /* connection flags */
int weight; /* destination weight */
u_int32_t u_threshold; /* upper threshold */
u_int32_t l_threshold; /* lower threshold */
u_int32_t activeconns; /* active connections */
u_int32_t inactconns; /* inactive connections */
u_int32_t persistconns; /* persistent connections */
/* statistics */
struct ip_vs_stats_user stats;
};
3.7.7 用户空间的获取服务器项信息
/* The argument to IP_VS_SO_GET_DESTS */
/* Request/response header used to fetch the real servers of one service. */
struct ip_vs_get_dests {
/* which service: user fills in these */
u_int16_t protocol;
u_int32_t addr; /* virtual address */
u_int16_t port;
u_int32_t fwmark; /* firewall mark of service */
/* number of real servers */
unsigned int num_dests;
/* the real servers */
/* zero-length trailing array; presumably num_dests entries follow the
   header in the same buffer -- confirm against the sockopt handler */
struct ip_vs_dest_entry entrytable[0];
};
3.7.8 用户空间的获取虚拟服务项信息
/* The argument to IP_VS_SO_GET_SERVICES */
/* Header used to fetch the table of virtual services. */
struct ip_vs_get_services {
/* number of virtual services */
unsigned int num_services;
/* service table */
/* zero-length trailing array; presumably num_services entries follow the
   header in the same buffer -- confirm against the sockopt handler */
struct ip_vs_service_entry entrytable[0];
};
3.7.9 用户空间的获取超时信息结构
/* The argument to IP_VS_SO_GET_TIMEOUT */
/* Timeout values exchanged with user space.
   NOTE(review): units are not stated here (presumably seconds) -- confirm. */
struct ip_vs_timeout_user {
int tcp_timeout;
int tcp_fin_timeout;
int udp_timeout;
};
3.7.10 用户空间的获取IPVS内核守护进程信息结构
/* The argument to IP_VS_SO_GET_DAEMON */
/* Information about the IPVS kernel sync daemon as seen from user space. */
struct ip_vs_daemon_user {
/* sync daemon state (master/backup) */
int state;
/* multicast interface name */
char mcast_ifn[IP_VS_IFNAME_MAXLEN];
/* SyncID we belong to */
int syncid;
};