syn-proxy logic2
主要的调用关系如下:
ip_vs_in() -->
conn_schedule() ==> tcp_conn_schedule() --> ip_vs_synproxy_ack_rcv()
依赖NF_INET_PRE_ROUTING链上的ip_vs_in()hook函数,该hook函数用来确认当前ack报文是否存在对应的syn-cookie来判断是否为正常的ack报文,如果为正常的ack报文则向rs发送SYN报文来发起连接。ip_vs_in()源码如下:
/*
 * Netfilter hook: check if the packet is for a virtual service, look it up,
 * and send it on its way...
 *
 * Returns a netfilter verdict (NF_ACCEPT, NF_DROP, or whatever the
 * ICMP / response / scheduling helpers hand back).
 *
 * NOTE: this listing is an excerpt. It stops after the
 * destination-availability check; the rest of the function body is not
 * reproduced here.
 */
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn) (struct sk_buff *))
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
int ret, restart, af, pkts;
int v = NF_DROP; /* for FULLNAT */
int res_dir; /* for FULLNAT */
/* Pick the address family from the L3 protocol, then parse the IP header. */
af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
/*
 * Big tappo: only PACKET_HOST, including loopback for local client
 * Don't handle local packets on IPv6 for now
 */
/*
 * pkt_type can take the following values:
 *   PACKET_HOST:      the packet is addressed to this host
 *   PACKET_BROADCAST: broadcast packet
 *   PACKET_MULTICAST: multicast packet
 *   PACKET_OTHERHOST: the packet is addressed to another host; it is
 *                     dropped unless this host is configured to forward
 * ip_vs only cares about packets addressed to this host; everything else
 * is left to the regular kernel network stack.
 */
if (unlikely(skb->pkt_type != PACKET_HOST)) {
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
skb->pkt_type,
iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr));
return NF_ACCEPT;
}
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
if (related)
return verdict;
/* Not a related ICMPv6 packet: re-parse the header and continue. */
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
/*
 * Handle ICMP packets. Per the original annotation (ip_vs_in_icmp() is not
 * shown here) this covers:
 *   1. reassembly of fragmented IP packets;
 *   2. ICMP types DEST_UNREACH, SOURCE_QUENCH and TIME_EXCEEDED; other
 *      ICMP types are left to the protocol stack.
 */
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
if (related)
return verdict;
/* Not a related ICMP packet: re-parse the header and continue. */
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
/* Protocol supported? */
pp = ip_vs_proto_get(iph.protocol);
if (unlikely(!pp))
return NF_ACCEPT;
/*
 * Check if the packet belongs to an existing connection entry
 */
/*
 * For a flow that is in syn-proxy logic 2, no connection has been created
 * for it yet, so execution goes straight to the "create a new connection"
 * branch below.
 */
cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0, &res_dir);
if (likely(cp)) {
/* For full-nat/local-client packets, it could be a response */
if (res_dir == IP_VS_CIDX_F_IN2OUT) {
return handle_response(af, skb, pp, cp, iph.len);
}
} else {
/* create a new connection */
/* Note: this declaration shadows the function-scope `v` declared above. */
int v;
/*
 * The core of syn-proxy logic 2 lives in
 * tcp_conn_schedule() --> ip_vs_synproxy_ack_rcv().
 * A zero return from conn_schedule() means the packet has been fully
 * handled and `v` holds the verdict to return.
 */
if (!pp->conn_schedule(af, skb, pp, &v, &cp))
return v;
}
if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal");
return NF_ACCEPT;
}
IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
if (sysctl_ip_vs_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
/* don't restart its timer, and silently
drop the packet. */
__ip_vs_conn_put(cp);
return NF_DROP;
}
tcp_conn_schedule()函数源码如下:
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
{
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct ip_vs_iphdr iph;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
if (th == NULL) {
*verdict = NF_DROP;
return 0;
}
/*
* Syn-proxy step 2 logic: receive client's
* 3-handshake Ack packet
*/
if (ip_vs_synproxy_ack_rcv(af, skb, th, pp, cpp, &iph, verdict) == 0) {
return 0;
}
if (th->syn && !th->ack && !th->fin && !th->rst &&
(svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
th->dest))) {
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb, 0);
if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
ip_vs_service_put(svc);
return 1;
}
/* drop tcp packet which send to vip and !vport */
if (sysctl_ip_vs_tcp_drop_entry &&
(svc = ip_vs_lookup_vip(af, iph.protocol, &iph.daddr))) {
IP_VS_INC_ESTATS(ip_vs_esmib, DEFENCE_TCP_DROP);
*verdict = NF_DROP;
return 0;
}
return 1;
}
ip_vs_synproxy_ack_rcv()源码如下:
/*
* Syn-proxy step 2 logic
* Receive client's 3-handshakes Ack packet, do cookie check
* and then send syn to rs after creating a session.
*
*/
int
ip_vs_synproxy_ack_rcv(int af, struct sk_buff *skb, struct tcphdr *th,
struct ip_vs_protocol *pp, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph, int *verdict)
{
struct ip_vs_synproxy_opt opt;
struct ip_vs_service *svc;
int res_cookie_check;
/*
* Don't check svc syn-proxy flag, as it may
* be changed after syn-proxy step 1.
*/
/* 判断是否为ack包,并且能够根据请求的dst ip及port拿到对应的svc结构体 */
if (!th->syn && th->ack && !th->rst && !th->fin &&
(svc =
ip_vs_service_get(af, skb->mark, iph->protocol, &iph->daddr,
th->dest))) {
/* 当前load太高,需要丢弃该数据包 */
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/* 在开启synproxy_defer时,ack包中必须要存在payload */
if (sysctl_ip_vs_synproxy_defer &&
!syn_proxy_ack_has_data(skb, iph, th)) {
/* update statistics */
IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_NULL_ACK);
/*
* When expecting ack packet with payload,
* we get a pure ack, so have to drop it.
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/*
* Import: set tcp hdr before cookie check, as it
* will be used in cookie_check funcs.
*/
skb_set_transport_header(skb, iph→len);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
res_cookie_check = ip_vs_synproxy_v6_cookie_check(skb,
ntohl
(th->
ack_seq)
- 1,
&opt);
} else
#endif
{
/* ip_vs_synproxy_v4_cookie_check()
* 1.使用check_tcp_syn_cookie()来校验该ack包是否合法:
* 以下为check_tcp_syn_cookie()源码:
* static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
* __be16 sport, __be16 dport, __u32 sseq,
* __u32 count, __u32 maxdiff)
* {
* __u32 diff;
*
* /* Strip away the layers from the cookie */
* /* 这里的cookie就是client ack包的ack seq -1(即lvs发送给client的syn-ack报文的seq,该seq存储了client syn包的各种信息,以此来校验是否为合法的三次握手报文) */
* cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; // 从cookie中拿掉根据这个tcp流相关信息拿到的sha1 hash值1和syn包的seq值
* /* 在synproxy logic1中,我们知道syn-cookie中的高8位为系统的开机分钟数,低24位由地址、端口、开机分钟数计算出的sha1 hash值2(32bit)和根据tcp option拼接成
* * 的data计算得出,而我们在判断本次收到的包是否为正常的建连请求时,只需对比这时的cookie高8位与当前系统开机分钟数的差值,即syn包与ack包到达间隔的最大值
* * 是否满足系统的设定值即可。若满足,则从cookie中去除sha1 hash值2并返回(此时的返回值中仅包含低22位即client syn包的tcp option),否则返回(__u32)-1
* */
* /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
* diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
* if (diff >= maxdiff)
* return (__u32)-1;
*
* return (cookie -
* cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
* & COOKIEMASK; /* Leaving the data behind */
* }
* 2. 根据ack包的tcp option更改opt的值
*/
res_cookie_check = ip_vs_synproxy_v4_cookie_check(skb,
ntohl
(th->
ack_seq)
- 1,
&opt);
}
if (!res_cookie_check) {
/* cookie不可用,丢弃 */
/* update statistics */
IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_BAD_ACK);
/*
* Cookie check fail, drop it.
*/
IP_VS_DBG(6, "syn_cookie check failed seq=%u\n",
ntohl(th->ack_seq) - 1);
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/* update statistics */
IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_OK_ACK);
/* 此时判断为正常的连接请求,开始分配相关资源 */
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb, 1);
if (!*cpp) {
IP_VS_DBG(6, "ip_vs_schedule failed\n");
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
/*
* Release service, we don't need it any more.
*/
ip_vs_service_put(svc);
/*
* Do anything but print a error msg when fail.
* Because session will be correctly freed in ip_vs_conn_expire.
*/
/* 向rs发送syn包开始三次握手 */
if (!syn_proxy_send_rs_syn(af, th, *cpp, skb, pp, &opt)) {
IP_VS_ERR_RL("syn_proxy_send_rs_syn failed!\n");
}
/* count in the ack packet (STOLEN by synproxy) */
ip_vs_in_stats(*cpp, skb);
/*
* Active sesion timer, and dec refcnt.
* Also stole the skb, and let caller return immediately.
*/
ip_vs_conn_put(*cpp);
*verdict = NF_STOLEN;
return 0;
}
return 1;
}
syn_proxy_send_rs_syn()源码如下:
/*
 * Create a SYN packet and send it to the real server (rs).
 * ATTENTION: a copy of the SYN skb is also stored in cp when SYN
 * retransmission is turned on.
 *
 *   af  - address family of the client's ACK packet
 *   th  - TCP header of the client's ACK; ports and seq are taken from it
 *   cp  - connection entry; its packet_xmit callback sends the SYN
 *   skb - the client's ACK packet; its addresses, TOS, device and MAC
 *         header are read here
 *   pp  - protocol handler, passed through to cp->packet_xmit
 *   opt - TCP options to put into the SYN
 *
 * Returns 0 if cp->packet_xmit is NULL or the skb allocation fails, and
 * 1 once the SYN has been handed to cp->packet_xmit (whose own result is
 * not checked).
 */
static int
syn_proxy_send_rs_syn(int af, const struct tcphdr *th,
		      struct ip_vs_conn *cp, struct sk_buff *skb,
		      struct ip_vs_protocol *pp, struct ip_vs_synproxy_opt *opt)
{
	struct sk_buff *syn_skb;
	int tcp_hdr_size;
	__u8 tcp_flags = TCPCB_FLAG_SYN;
	unsigned int tcphoff;
	struct tcphdr *new_th;

	if (!cp->packet_xmit) {
		IP_VS_ERR_RL("warning: packet_xmit is null");
		return 0;
	}

	syn_skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
	if (unlikely(syn_skb == NULL)) {
		IP_VS_ERR_RL("alloc skb failed when send rs syn packet\n");
		return 0;
	}

	/* Reserve space for headers; they are pushed into this headroom. */
	skb_reserve(syn_skb, MAX_TCP_HEADER);

	tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			(opt->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			(opt->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			/* SACK_PERM is in the place of NOP NOP of TS */
			((opt->sack_ok
			  && !opt->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));

	new_th = (struct tcphdr *)skb_push(syn_skb, tcp_hdr_size);

	/* Compose tcp header */
	skb_reset_transport_header(syn_skb);
	syn_skb->csum = 0;

	/* Set tcp hdr */
	new_th->source = th->source;
	new_th->dest = th->dest;
	/*
	 * The handshake ACK carries the client's initial sequence number
	 * plus one, so subtracting one gives the seq for the SYN sent to rs.
	 */
	new_th->seq = htonl(ntohl(th->seq) - 1);
	new_th->ack_seq = 0;
	/* Bytes 12-13 of the TCP header: data offset (in words) and flags. */
	*(((__u16 *) new_th) + 6) =
	    htons(((tcp_hdr_size >> 2) << 12) | tcp_flags);
	/* FIX_ME: what window should we use */
	new_th->window = htons(5000);
	new_th->check = 0;
	new_th->urg_ptr = 0;
	new_th->urg = 0;
	new_th->ece = 0;
	new_th->cwr = 0;

	syn_proxy_syn_build_options((__be32 *) (new_th + 1), opt);

	/*
	 * Set ip hdr
	 * Attention: set source and dest addr to ack skb's.
	 * we rely on packet_xmit func to do NATs thing.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		struct ipv6hdr *ack_iph = ipv6_hdr(skb);
		struct ipv6hdr *iph =
		    (struct ipv6hdr *)skb_push(syn_skb, sizeof(struct ipv6hdr));

		tcphoff = sizeof(struct ipv6hdr);
		skb_reset_network_header(syn_skb);
		memcpy(&iph->saddr, &ack_iph->saddr, sizeof(struct in6_addr));
		memcpy(&iph->daddr, &ack_iph->daddr, sizeof(struct in6_addr));

		/*
		 * NOTE(review): priority and flow_lbl are never assigned in
		 * this function; if alloc_skb() does not zero the data area
		 * they carry stale bytes -- confirm.
		 */
		iph->version = 6;
		iph->nexthdr = NEXTHDR_TCP;
		iph->payload_len = htons(tcp_hdr_size);
		iph->hop_limit = IPV6_DEFAULT_HOPLIMIT;

		/* TCP checksum over the segment plus the IPv6 pseudo-header. */
		new_th->check = 0;
		syn_skb->csum =
		    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
		new_th->check =
		    csum_ipv6_magic(&iph->saddr, &iph->daddr,
				    syn_skb->len - tcphoff, IPPROTO_TCP,
				    syn_skb->csum);
	} else
#endif
	{
		struct iphdr *ack_iph = ip_hdr(skb);
		u32 rtos = RT_TOS(ack_iph->tos);
		struct iphdr *iph =
		    (struct iphdr *)skb_push(syn_skb, sizeof(struct iphdr));

		tcphoff = sizeof(struct iphdr);
		skb_reset_network_header(syn_skb);
		/* First 16 bits: version 4, header length 5 words, then TOS. */
		*((__u16 *) iph) = htons((4 << 12) | (5 << 8) | (rtos & 0xff));
		iph->tot_len = htons(syn_skb->len);
		/*
		 * NOTE(review): iph->id is never assigned in this function;
		 * if alloc_skb() does not zero the data area it carries
		 * stale bytes -- confirm.
		 */
		iph->frag_off = htons(IP_DF);
		/* FIX_ME: what ttl should we use */
		iph->ttl = IPDEFTTL;
		iph->protocol = IPPROTO_TCP;
		iph->saddr = ack_iph->saddr;
		iph->daddr = ack_iph->daddr;

		ip_send_check(iph);

		/* TCP checksum over the segment plus the IPv4 pseudo-header. */
		new_th->check = 0;
		syn_skb->csum =
		    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
		new_th->check =
		    csum_tcpudp_magic(iph->saddr, iph->daddr,
				      syn_skb->len - tcphoff, IPPROTO_TCP,
				      syn_skb->csum);
	}

	/* Save syn_skb if syn retransmission is on */
	if (sysctl_ip_vs_synproxy_syn_retry > 0) {
		/*
		 * NOTE(review): the skb_copy() result is not checked, so
		 * cp->syn_skb may end up NULL while syn_retry_max is still
		 * set -- confirm the retransmit path tolerates that.
		 */
		cp->syn_skb = skb_copy(syn_skb, GFP_ATOMIC);
		atomic_set(&cp->syn_retry_max, sysctl_ip_vs_synproxy_syn_retry);
	}

	/* Save info for fast_response_xmit */
	if (sysctl_ip_vs_fast_xmit && skb->dev &&
	    likely(skb->dev->type == ARPHRD_ETHER) &&
	    skb_mac_header_was_set(skb)) {
		struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);

		/* First packet on this connection: remember and pin the device. */
		if (likely(cp->indev == NULL)) {
			cp->indev = skb->dev;
			dev_hold(cp->indev);
		}

		/* Ingress device changed: release the old one, pin the new one. */
		if (unlikely(cp->indev != skb->dev)) {
			dev_put(cp->indev);
			cp->indev = skb->dev;
			dev_hold(cp->indev);
		}

		memcpy(cp->src_hwaddr, eth->h_source, ETH_ALEN);
		memcpy(cp->dst_hwaddr, eth->h_dest, ETH_ALEN);
		IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SYNPROXY_SAVE);
		IP_VS_DBG_RL("syn_proxy_send_rs_syn netdevice:%s\n",
			     netdev_name(skb->dev));
	}

	/* count in the syn packet (note: the ACK skb is what is passed) */
	ip_vs_in_stats(cp, skb);

	/* If xmit failed, syn_skb will be freed correctly. */
	cp->packet_xmit(syn_skb, cp, pp);

	return 1;
}