有一个问题:IPVS中的local client 指什么? 在2.6.32的代码中,多了对于Local client的判断。
查阅IPVS的官方文档http://www.linuxvirtualserver.org/Documents.html, 可以看到所谓Local client 就是指director本身也作为一个server的情况,在这种情况下,director可以在本地(locally)处理packet,而不是进行转发,所以称为local client(我觉得这里叫local server 更合适)。
对于local client的情况,如果要将包发送给本地,那么在ip_vs_in中会创建这样的一个connection entry。现在对于local client在ip_vs_in中的处理还是没弄明白。理论上,对于从local server发给client 的包,它不会经过forward这个点,但是会经过localOutput这个点。所以,可以在localOutput这个点,添加钩子,并判断数据包是不是conn_out_get,然后调用handle_response进行处理。但是,现在在程序中,把这些处理放到了ip_vs_in函数中,它是添加在localInput这个hook点的,不符合逻辑。
所以,合理的推断是,第一次数据包进入ip_vs_in时,创建了connection entry。然后,经过ip_vs_in的处理,将目的地址和端口改成了DR的另一个地址和端口,然后将包发送。发送之后,数据包会再次经过ip_vs_in。也就是说 从内核发往本地的另一个网口的包,会再次的经过ip_rcv。目前我还没有找到这个调用过程,但是我猜是这样的(这个推断应该是正确的)。
另外,在IPVS中,对于connection,从client(sip, sport)->server(dip,dport)和server(dip,dport)->client(sip,sport)的双向的数据为同一个connection entry。
对于tcp和udp,在协议中都包含了连接查找函数conn_in_get和conn_out_get,分别用于正向连接查找和反向连接查找。它们最终调用的也分别是__ip_vs_conn_in_get和__ip_vs_conn_out_get,在这两个函数中,对于正向和反向,最后查找到的是同一个connection。
另外,conn_out_get只用在NAT的情况 和 local client的情况。
/*
 * Netfilter registration record for ip_vs_in(): IPv4 (PF_INET) packets
 * at the NF_IP_LOCAL_IN hook point, with hook priority 100.
 */
static struct nf_hook_ops ip_vs_in_ops = {
	.owner    = THIS_MODULE,
	.pf       = PF_INET,
	.hooknum  = NF_IP_LOCAL_IN,
	.priority = 100,
	.hook     = ip_vs_in,
};
ip_vs_in 是所有从client发过来的数据包的总入口。也就是说数据包首先经过ip_vs_in的处理。
/*
 * Check if it's for virtual services, look it up,
 * and send it on its way...
 *
 * Netfilter hook function (referenced by ip_vs_in_ops in this file).
 * Steps, in order:
 *   1. ignore anything that is not PACKET_HOST;
 *   2. give ICMP / ICMPv6 packets to the ICMP helpers;
 *   3. find the IPVS protocol handler for the packet;
 *   4. find an existing connection, treat the packet as a response if
 *      only the reverse lookup matches, or ask the scheduler for a new
 *      connection;
 *   5. update statistics and state, transmit through cp->packet_xmit,
 *      and possibly synchronise the connection.
 *
 * @hooknum: netfilter hook number; only forwarded to the ICMP helpers.
 * @skb:     the packet being processed.
 * @in, @out, @okfn: not used in this function.
 *
 * Returns a netfilter verdict (NF_ACCEPT, NF_DROP, or whatever the
 * helpers / cp->packet_xmit return).
 */
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
int ret, restart, af, pkts;
/* Address family is derived from the link-layer protocol of the skb. */
af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
/*
* Big tappo: only PACKET_HOST, including loopback for local client
* Don't handle local packets on IPv6 for now
*/
if (unlikely(skb->pkt_type != PACKET_HOST)) {
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
skb->pkt_type,
iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr));
return NF_ACCEPT;
}
/*
 * ICMP handling. With CONFIG_IP_VS_IPV6 the trailing "else" below binds
 * to the IPv4 "if" that follows the #endif, so the IPv4 ICMP check only
 * runs when af != AF_INET6. Without the option only the IPv4 check is
 * compiled. In both branches: if the helper reports the packet as
 * related, its verdict is returned; otherwise the header info is
 * re-read (presumably because the helper may have replaced or changed
 * the skb data -- confirm against the helpers).
 */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
if (related)
return verdict;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
if (related)
return verdict;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
/* Protocol supported? */
pp = ip_vs_proto_get(iph.protocol);
if (unlikely(!pp))
return NF_ACCEPT;
/*
* Check if the packet belongs to an existing connection entry
*/
cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
if (unlikely(!cp)) {
int v;
/*
 * For local client packets, it could be a response: try the reverse
 * (out) lookup and, on a hit, let handle_response() produce the verdict.
 * (The original annotator marked this step as not fully understood.)
 */
cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
if (cp)
return handle_response(af, skb, pp, cp, iph.len);
/*
 * No connection in either direction: ask the protocol's scheduler.
 * When it returns 0 the verdict it stored in v is returned directly.
 */
if (!pp->conn_schedule(af, skb, pp, &v, &cp))
return v;
}
if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal");
return NF_ACCEPT;
}
IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
if (sysctl_ip_vs_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
/* don't restart its timer, and silently
drop the packet. */
__ip_vs_conn_put(cp);
return NF_DROP;
}
ip_vs_in_stats(cp, skb);
/* NOTE(review): restart is assigned here but never read in this function. */
restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
/* Hand the packet to the connection's transmit routine, if it has one. */
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp);
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;
}
/* Increase its packet counter and check if it is needed
* to be synchronized
*
* Sync connection if it is about to close to
* encourage the standby servers to update the connections timeout
*
* ip_vs_sync_conn() is called only for AF_INET while the
* IP_VS_STATE_MASTER bit is set, and only when one of these holds:
*  - the connection is non-TCP, or TCP in ESTABLISHED state, and
*    pkts % sysctl_ip_vs_sync_threshold[1] equals
*    sysctl_ip_vs_sync_threshold[0];
*  - the connection is TCP, its state differs from old_state, and the
*    new state is FIN_WAIT, CLOSE_WAIT or TIME_WAIT.
*/
pkts = atomic_add_return(1, &cp->in_pkts);
if (af == AF_INET &&
(ip_vs_sync_state & IP_VS_STATE_MASTER) &&
(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
(pkts % sysctl_ip_vs_sync_threshold[1]
== sysctl_ip_vs_sync_threshold[0])) ||
((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
(cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
(cp->state == IP_VS_TCP_S_TIME_WAIT)))))
ip_vs_sync_conn(cp);
/* Remember the state so the next packet can detect a state change. */
cp->old_state = cp->state;
ip_vs_conn_put(cp);
return ret;
}
forward处的钩子。这个函数对转发包进行处理, 只用在NAT模式的均衡处理,处理的是服务器返回的包,因为TUNNEL和DR方式下都是直接发给了client,不经过load balancer的处理。客户端请求的包也不经过这个hook,客户端的请求包经过的是local in的hook。
但如果设置了DNAT规则,数据包在PREROUTING点进行了目的地址修改,这样就不会再进入INPUT点而是直接转到FORWARD点处理,这时针对该包的IPVS连接是没有建立的。
/*
 * Netfilter registration record for ip_vs_out(): IPv4 (PF_INET) packets
 * at the NF_IP_FORWARD hook point, with hook priority 100.
 */
static struct nf_hook_ops ip_vs_out_ops = {
	.owner    = THIS_MODULE,
	.pf       = PF_INET,
	.hooknum  = NF_IP_FORWARD,
	.priority = 100,
	.hook     = ip_vs_out,
};
/*
 * Netfilter hook function (referenced by ip_vs_out_ops in this file) for
 * packets at the FORWARD hook point. It looks up the IPVS connection in
 * the "out" direction; on a hit the packet is treated as a real-server
 * reply: the protocol snat_handler is applied, the IP source address is
 * rewritten to cp->vaddr, the packet is re-routed and marked as handled.
 *
 * @hooknum: netfilter hook number (not used in this function).
 * @pskb:    pointer to the packet pointer; the helpers called here may
 *           replace *pskb, which is why it is re-read after each of them.
 * @in, @out, @okfn: not used in this function.
 *
 * Returns NF_ACCEPT, NF_DROP, NF_STOLEN, or the verdict of
 * ip_vs_out_icmp() for related ICMP packets.
 */
static unsigned int ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
int ihl;
// Flag already set: the packet was processed by IPVS before (it is set at the end of this function), so let it pass.
if (skb->ipvs_property)
return NF_ACCEPT;
iph = skb->nh.iph;
if (unlikely(iph->protocol == IPPROTO_ICMP)) {
// Let the ICMP helper deal with ICMP errors that may belong to a connection (e.g. address/port unreachable).
int related, verdict = ip_vs_out_icmp(pskb, &related);
if (related)
return verdict;
// Not related: reload the local pointers from *pskb and continue.
skb = *pskb;
iph = skb->nh.iph;
}
// Look up the IPVS protocol handler (original note: one of tcp/udp/ah/esp).
pp = ip_vs_proto_get(iph->protocol);
if (unlikely(!pp))
return NF_ACCEPT;
// Reassemble fragments unless the protocol sets dont_defrag.
// (Original author's note: this should rarely trigger because packets are normally defragmented when they enter netfilter.)
if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) {
skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
if (!skb)
return NF_STOLEN;
iph = skb->nh.iph;
*pskb = skb;
}
ihl = iph->ihl << 2; // IP header length in bytes
// Look up the IPVS connection in the "out" direction.
cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
if (unlikely(!cp)) { // No connection. Original author's note: possibly a request-direction packet that arrived via DNAT; the purpose of this branch was unclear to the author.
if (sysctl_ip_vs_nat_icmp_send && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP)) {
__u16 _ports[2], *pptr;
// Read the first two 16-bit fields after the IP header (source and destination port).
pptr = skb_header_pointer(skb, ihl, sizeof(_ports), _ports);
if (pptr == NULL)
return NF_ACCEPT;
// Look for a real server matching the source address and source port.
// For request-direction packets nothing is found and IPVS leaves the packet alone.
if (ip_vs_lookup_real_service(iph->protocol, iph->saddr, pptr[0])) {
// Packet comes from a known real server but has no connection entry:
// unless it is a TCP reset, answer with ICMP port unreachable and drop it.
if (iph->protocol != IPPROTO_TCP || !is_tcp_reset(skb)) {
icmp_send(skb,ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); // send ICMP port-unreachable
return NF_DROP;
}
}
}
return NF_ACCEPT;
}
// Connection found: the packet is a reply from a real server.
// The first ihl bytes of the skb must be writable before they are modified.
if (!ip_vs_make_skb_writable(pskb, ihl))
goto drop;
// Protocol-specific rewrite (e.g. TCP/UDP ports) through the optional snat_handler.
if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
goto drop;
skb = *pskb;
// Only the source address is rewritten, since this is a server reply.
skb->nh.iph->saddr = cp->vaddr;
ip_send_check(skb->nh.iph); // recompute the IP header checksum
// Recompute the route, passing RTN_LOCAL (original note: as for locally generated packets).
if (__ip_route_me_harder(pskb, RTN_LOCAL) != 0)
goto drop;
skb = *pskb;
// IPVS output statistics
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); // state transition
ip_vs_conn_put(cp); // release the connection (original note: also adjusts its timeout)
skb->ipvs_property = 1; // mark the packet as processed by IPVS
return NF_ACCEPT;
// Error path: release the connection, free the packet ourselves and report it as stolen.
drop:
ip_vs_conn_put(cp);
kfree_skb(*pskb);
return NF_STOLEN;
}
/*
 * Netfilter registration record for ip_vs_forward_icmp(): IPv4 (PF_INET)
 * packets at the NF_IP_FORWARD hook point. Priority 99 is numerically
 * lower than the 100 used by ip_vs_out_ops on the same hook, so that
 * this hook runs before ip_vs_out.
 */
static struct nf_hook_ops ip_vs_forward_icmp_ops = {
	.owner    = THIS_MODULE,
	.pf       = PF_INET,
	.hooknum  = NF_IP_FORWARD,
	.priority = 99,
	.hook     = ip_vs_forward_icmp,
};
//这个函数对转发的ICMP包进行处理, 处理由于服务器失效而引起的网络或端口不可达的ICMP信息,其他和服务器无关的ICMP信息不处理.
static unsigned int ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in,
const struct net_device *out, int (*okfn)(struct sk_buff *))
{
int r;
if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
return ip_vs_in_icmp(pskb, &r, hooknum);
}
/*
 * Handle an incoming ICMP packet that may refer to an IPVS connection.
 *
 * The IP header embedded in the ICMP payload is used to look up the
 * connection (conn_in_get is called with 1 as its last argument); on a
 * hit the packet is forwarded through ip_vs_icmp_xmit().
 *
 * @pskb:    pointer to the packet pointer; updated if fragments are gathered.
 * @related: output flag. Set to 1 on entry and reset to 0 only when the
 *           ICMP type is not DEST_UNREACH, SOURCE_QUENCH or TIME_EXCEEDED.
 * @hooknum: netfilter hook number; selects the defragmentation user id.
 *
 * Returns a netfilter verdict: NF_STOLEN if fragment gathering consumed
 * the skb, NF_DROP for an unreadable ICMP header or a failed checksum,
 * NF_ACCEPT when the packet is not handled here, otherwise the result
 * of ip_vs_icmp_xmit().
 */
static int ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
{
struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
unsigned int offset, ihl, verdict;
*related = 1; // Assume the ICMP packet is related to IPVS until shown otherwise; ip_vs_in in this file checks this flag to decide whether to return the verdict.
if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
// Reassemble fragments; the defrag user id depends on the hook point.
skb = ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ? IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
if (!skb)
return NF_STOLEN;
*pskb = skb;
}
iph = skb->nh.iph; // outer IP header
offset = ihl = iph->ihl * 4; // offset of the ICMP header = outer IP header length
// Fetch the ICMP header.
ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
if (ic == NULL)
return NF_DROP;
// Any ICMP type other than these three is not related to IPVS.
if ((ic->type != ICMP_DEST_UNREACH) && (ic->type != ICMP_SOURCE_QUENCH) && (ic->type != ICMP_TIME_EXCEEDED)) {
*related = 0;
return NF_ACCEPT;
}
offset += sizeof(_icmph); // advance to the IP header embedded in the ICMP payload
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT;
// Look up the protocol of the embedded original packet, not ICMP itself.
pp = ip_vs_proto_get(cih->protocol);
if (!pp)
return NF_ACCEPT;
// If the embedded packet is a non-first fragment and the protocol sets dont_defrag, leave it alone.
if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && pp->dont_defrag))
return NF_ACCEPT;
offset += cih->ihl * 4; // skip the embedded IP header
// Look up the IPVS connection from the embedded header.
cp = pp->conn_in_get(skb, pp, cih, offset, 1);
if (!cp)
return NF_ACCEPT;
// From here on the default verdict is to drop the packet.
verdict = NF_DROP;
// Verify the checksum unless it is marked CHECKSUM_UNNECESSARY.
// NOTE(review): the helper is given the outer IP header length as offset, so this presumably covers the ICMP message rather than the IP header -- confirm.
if (skb->ip_summed != CHECKSUM_UNNECESSARY && ip_vs_checksum_complete(skb, ihl)) {
IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", NIPQUAD(iph->saddr));
goto out;
}
ip_vs_in_stats(cp, skb); // input statistics
// For embedded TCP/UDP the offset passed on also covers the first 4 bytes: source and destination port.
if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
offset += 2 * sizeof(__u16);
// Transmit the ICMP packet.
verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
// Common exit once a connection was found: release it and return the verdict.
out:
__ip_vs_conn_put(cp);
return verdict;
}
发送各种ICMP错误信息包
/*
 * Transmit an ICMP packet that belongs to IPVS connection cp.
 *
 * For connections whose forwarding method is not IP_VS_CONN_F_MASQ the
 * packet goes through cp->packet_xmit unchanged. For MASQ connections
 * the packet is routed, checked against the MTU, rewritten with
 * ip_vs_nat_icmp() and sent with IP_VS_XMIT().
 *
 * @skb:    the ICMP packet.
 * @cp:     the IPVS connection the packet belongs to.
 * @pp:     the protocol handler of the embedded packet.
 * @offset: number of leading bytes that must be writable before the rewrite.
 *
 * Returns the result of cp->packet_xmit (or NF_ACCEPT when it is NULL)
 * for non-MASQ connections; NF_STOLEN on the MASQ path, both after a
 * successful transmit and after an error (the skb is freed here then).
 */
int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset)
{
struct rtable *rt; /* Route to the other host */
int mtu;
int rc;
// Not a NAT (MASQ) connection (original note: i.e. TUNNEL or DR): send directly through the connection's transmit routine.
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
rc = cp->packet_xmit(skb, cp, pp);
else
rc = NF_ACCEPT;
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
goto out;
}
// Look up the route for this connection.
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
goto tx_error_icmp;
mtu = dst_mtu(&rt->u.dst);
// Packet is larger than the MTU but has the DF bit set: release the route, report ICMP "fragmentation needed", and free the packet.
if ((skb->len > mtu) && (skb->nh.iph->frag_off & __constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
goto tx_error;
}
// Make the first offset bytes of the skb writable.
if (!ip_vs_make_skb_writable(&skb, offset))
goto tx_error_put;
// Ensure enough headroom for the output device's hardware header.
if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
goto tx_error_put;
// Replace the skb's old route with the one just looked up.
dst_release(skb->dst);
skb->dst = &rt->u.dst;
// Rewrite the ICMP packet for NAT.
ip_vs_nat_icmp(skb, pp, cp, 0);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
// Send the packet through the LOCAL_OUT hook point; the macro is quoted below for reference.
/* #define IP_VS_XMIT(skb, rt) \
* do { \
* (skb)->ipvs_property = 1; \
* (skb)->ip_summed = CHECKSUM_NONE; \
* NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, (rt)->u.dst.dev, dst_output); \
* } while (0)
*/
IP_VS_XMIT(skb, rt);
rc = NF_STOLEN; // the skb has been sent above and must not go back to the normal IP stack
goto out;
// No route: signal link failure, then fall through to free the skb.
tx_error_icmp:
dst_link_failure(skb);
// Free the skb ourselves and report it as stolen.
tx_error:
dev_kfree_skb(skb);
rc = NF_STOLEN;
out:
return rc;
// Error after the route was obtained: release the route first, then take the common error path.
tx_error_put:
ip_rt_put(rt);
goto tx_error;
}
/*
 * Netfilter registration record for ip_vs_post_routing(): IPv4 (PF_INET)
 * packets at the NF_IP_POST_ROUTING hook point. The priority is one less
 * than NF_IP_PRI_NAT_SRC so that this hook runs before source NAT.
 */
static struct nf_hook_ops ip_vs_post_routing_ops = {
	.owner    = THIS_MODULE,
	.pf       = PF_INET,
	.hooknum  = NF_IP_POST_ROUTING,
	.priority = NF_IP_PRI_NAT_SRC - 1,
	.hook     = ip_vs_post_routing,
};
这个函数对最后要发出的包进行检查,这个包是经过IPVS修改过了,不用再被netfilter进行修改,那么返回NF_STOP.
如果没被IPVS处理过,继续后续hook点操作
static
unsigned
int
ip_vs_post_routing(unsigned
int
hooknum,
struct
sk_buff **pskb,
const
struct
net_device *
in
,
const
struct
net_device *
out
,
int
(*okfn)(
struct
sk_buff *
)){
if (!((*pskb)->ipvs_property)) //如果没被IPVS处理过,继续后续hook点操作
return NF_ACCEPT; //STOP就不继续后面的低优先级的hook_ops的操作了
return NF_STOP;
}