syn-proxy logic2

主要的调用关系如下:

ip_vs_in() -->

       conn_schedule() ==> tcp_conn_schedule()

                                                --> ip_vs_synproxy_ack_rcv()

syn-proxy logic2依赖挂在NF_INET_PRE_ROUTING链上的hook函数ip_vs_in()。该hook函数通过校验当前ack报文是否携带合法的syn-cookie,来判断它是否为正常的ack报文;如果是正常的ack报文,则向rs发送SYN报文来发起连接。ip_vs_in()源码如下:

/*
* Check if it's for virtual services, look it up,
* and send it on its way...
*
* Hook entry point for incoming packets. In the part of the function
* shown here it:
*   - accepts (NF_ACCEPT) anything that is not PACKET_HOST, whose protocol
*     has no ip_vs_protocol handler, or that matches no connection and no
*     service;
*   - returns the ICMP handler's verdict for ICMP/ICMPv6 packets that are
*     related to a connection;
*   - returns handle_response() for packets found to travel in the
*     IN2OUT direction of an existing connection;
*   - otherwise asks pp->conn_schedule() to create a connection and returns
*     its verdict when that call returns 0;
*   - drops (NF_DROP) packets whose destination server is not available.
*
* NOTE(review): this is an excerpt; the function body is not closed within
* it, so the statements that follow the server-status check are not shown.
*/
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn) (struct sk_buff *))
{
      struct ip_vs_iphdr iph; 
      struct ip_vs_protocol *pp; 
      struct ip_vs_conn *cp; 
      int ret, restart, af, pkts;
      int v = NF_DROP; /* for FULLNAT */
      int res_dir; /* for FULLNAT */

      /* Address family is derived from the link-layer protocol of the skb. */
      af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;

      ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

      /* 
       * Big tappo: only PACKET_HOST, including loopback for local client
       * Don't handle local packets on IPv6 for now
       */

       /* pkt_type can take the following values:
        *
        * PACKET_HOST:      the packet is addressed to this host
        *
        * PACKET_BROADCAST: broadcast packet
        *
        * PACKET_MULTICAST: multicast packet
        *
        * PACKET_OTHERHOST: the packet is addressed to another host; it is
        *                   dropped if this host is not configured to forward
        *
        * ip_vs only cares about packets addressed to this host; everything
        * else is left to the regular kernel protocol stack.
        */
       if (unlikely(skb->pkt_type != PACKET_HOST)) {
           IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
               skb->pkt_type,
               iph.protocol,
               IP_VS_DBG_ADDR(af, &iph.daddr));
           return NF_ACCEPT;
}

#ifdef CONFIG_IP_VS_IPV6
     if (af == AF_INET6) {
         if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
             int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);

             if (related)
                 return verdict;
             /* Not related: re-read the IP header and keep processing. */
             ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
         } 
    } else

#endif

   /* Handle ICMP packets, which (per the original annotation) covers:
    *
    * 1. reassembly of IP-layer fragments
    *
    * 2. ICMP types DEST_UNREACH, SOURCE_QUENCH and TIME_EXCEEDED; all other
    *    ICMP types are handed on to the protocol stack
    *
    * Note: when CONFIG_IP_VS_IPV6 is set, this `if` is the body of the
    * dangling `else` above, so it only runs for non-IPv6 packets.
    */

   if (unlikely(iph.protocol == IPPROTO_ICMP)) {
         int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);

       if (related)
           return verdict;
       /* Not related: re-read the IP header and keep processing. */
       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
   }

   /* Protocol supported? */
   pp = ip_vs_proto_get(iph.protocol);
   if (unlikely(!pp))
       return NF_ACCEPT;

   /*
    * Check if the packet belongs to an existing connection entry
    */

 /* For a flow that is in syn-proxy logic2 no connection has been created
  * for it yet, so the lookup misses and the "create a new connection"
  * branch below is taken directly. */
   cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0, &res_dir);

   if (likely(cp)) {
   /* For full-nat/local-client packets, it could be a response */
       if (res_dir == IP_VS_CIDX_F_IN2OUT) {
           return handle_response(af, skb, pp, cp, iph.len);
       }
   } else {
   /* create a new connection */
   /* Note: this `v` shadows the function-level `v` declared above. */
   int v;
/* The main syn-proxy logic2 work happens in
 * tcp_conn_schedule() --> ip_vs_synproxy_ack_rcv(); when conn_schedule()
 * returns 0 the verdict it stored in v is returned as-is. */
   if (!pp->conn_schedule(af, skb, pp, &v, &cp))
       return v;
   }

   if (unlikely(!cp)) {
       /* sorry, all this trouble for a no-hit :) */
       IP_VS_DBG_PKT(12, pp, skb, 0,
           "packet continues traversal as normal");
       return NF_ACCEPT;
   }

   IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");

   /* Check the server status */

   if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
      /* the destination server is not available */

     if (sysctl_ip_vs_expire_nodest_conn) {
         /* try to expire the connection immediately */
         ip_vs_conn_expire_now(cp);
   }
   /* don't restart its timer, and silently
      drop the packet. */
   __ip_vs_conn_put(cp);
   return NF_DROP;

}
/* NOTE(review): the brace above closes the server-status check, not the
 * function; the remainder of ip_vs_in() is not part of this excerpt. */

tcp_conn_schedule()函数源码如下:

static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 
                                int *verdict, struct ip_vs_conn **cpp)
{ 
      struct ip_vs_service *svc;
      struct tcphdr _tcph, *th; 
      struct ip_vs_iphdr iph; 

      ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

      th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
      if (th == NULL) {
          *verdict = NF_DROP;
          return 0;
      } 

      /* 
       * Syn-proxy step 2 logic: receive client's
       * 3-handshake Ack packet
       */
       if (ip_vs_synproxy_ack_rcv(af, skb, th, pp, cpp, &iph, verdict) == 0) { 
           return 0;
       } 

       if (th->syn && !th->ack && !th->fin && !th->rst &&
           (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
                      th->dest))) {
            if (ip_vs_todrop()) {
                  /* 
                   * It seems that we are very loaded.
                   * We have to drop this packet :(
                   */ 
                   ip_vs_service_put(svc);
                   *verdict = NF_DROP;
                    return 0;  

            } 

            /* 
             * Let the virtual server select a real server for the
             * incoming connection, and create a connection entry.
             */
             *cpp = ip_vs_schedule(svc, skb, 0);

             if (!*cpp) {
                  *verdict = ip_vs_leave(svc, skb, pp);
                  return 0;
             }
             ip_vs_service_put(svc);
             return 1;
         }

         /* drop tcp packet which send to vip and !vport */
         if (sysctl_ip_vs_tcp_drop_entry &&
            (svc = ip_vs_lookup_vip(af, iph.protocol, &iph.daddr))) {
                 IP_VS_INC_ESTATS(ip_vs_esmib, DEFENCE_TCP_DROP);
                 *verdict = NF_DROP;
                 return 0;
        }

       return 1;
}

ip_vs_synproxy_ack_rcv()源码如下:

/* 
* Syn-proxy step 2 logic
* Receive client's 3-handshakes Ack packet, do cookie check
* and then send syn to rs after creating a session.
* 
*/ 
int 
ip_vs_synproxy_ack_rcv(int af, struct sk_buff *skb, struct tcphdr *th, 
                           struct ip_vs_protocol *pp, struct ip_vs_conn **cpp,
                           struct ip_vs_iphdr *iph, int *verdict)
{ 
      struct ip_vs_synproxy_opt opt; 
      struct ip_vs_service *svc;
      int res_cookie_check;

     /* 
      * Don't check svc syn-proxy flag, as it may
      * be changed after syn-proxy step 1.
      */

/* 判断是否为ack包,并且能够根据请求的dst ip及port拿到对应的svc结构体 */
      if (!th->syn && th->ack && !th->rst && !th->fin &&
           (svc =
               ip_vs_service_get(af, skb->mark, iph->protocol, &iph->daddr,
                                       th->dest))) {

/* 当前load太高,需要丢弃该数据包 */
          if (ip_vs_todrop()) {
               /* 
                * It seems that we are very loaded.
                * We have to drop this packet :(
                */
                ip_vs_service_put(svc);
                *verdict = NF_DROP;
                return 0;
          } 
 /* 在开启synproxy_defer时,ack包中必须要存在payload */
         if (sysctl_ip_vs_synproxy_defer &&
               !syn_proxy_ack_has_data(skb, iph, th)) {
               /* update statistics */
              IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_NULL_ACK);
               /*
                * When expecting ack packet with payload,
                * we get a pure ack, so have to drop it.
                */
               ip_vs_service_put(svc);

               *verdict = NF_DROP;
                return 0;
          } 

          /* 
           * Import: set tcp hdr before cookie check, as it
           * will be used in cookie_check funcs.
           */ 
          skb_set_transport_header(skb, iph→len);

#ifdef CONFIG_IP_VS_IPV6
         if (af == AF_INET6) {
              res_cookie_check = ip_vs_synproxy_v6_cookie_check(skb,
                                                                     ntohl
                                                                     (th->
                                                                      ack_seq)
                                                                      - 1,
                                                                     &opt);
         } else
#endif 
        { 

              /* ip_vs_synproxy_v4_cookie_check()

               * 1.使用check_tcp_syn_cookie()来校验该ack包是否合法:

               *    以下为check_tcp_syn_cookie()源码:

               *    static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
               *                                                             __be16 sport, __be16 dport, __u32 sseq,
               *                                                            __u32 count, __u32 maxdiff)
               *  {
               *            __u32 diff;
               *
               *          /* Strip away the layers from the cookie */

               *          /* 这里的cookie就是client ack包的ack seq -1(即lvs发送给client的syn-ack报文的seq,该seq存储了client syn包的各种信息,以此来校验是否为合法的三次握手报文) */
               *          cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;  // 从cookie中拿掉根据这个tcp流相关信息拿到的sha1 hash值1和syn包的seq值

               *          /* 在synproxy logic1中,我们知道syn-cookie中的高8位为系统的开机分钟数,低24位由地址、端口、开机分钟数计算出的sha1 hash值2(32bit)和根据tcp option拼接成

               *           * 的data计算得出,而我们在判断本次收到的包是否为正常的建连请求时,只需对比这时的cookie高8位与当前系统开机分钟数的差值,即syn包与ack包到达间隔的最大值

               *           * 是否满足系统的设定值即可。若满足,则从cookie中去除sha1 hash值2并返回(此时的返回值中仅包含低22位即client syn包的tcp option),否则返回(__u32)-1

               *           */
               *         /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
               *         diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
               *         if (diff >= maxdiff)
               *              return (__u32)-1;
               *
               *         return (cookie -
               *                  cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) 
               *                  & COOKIEMASK; /* Leaving the data behind */
               *   }

               * 2. 根据ack包的tcp option更改opt的值

               */
              res_cookie_check = ip_vs_synproxy_v4_cookie_check(skb,
                                                                     ntohl
                                                                     (th->
                                                                      ack_seq)
                                                                      - 1,
                                                                     &opt);
        } 

        if (!res_cookie_check) {

              /* cookie不可用,丢弃 */
              /* update statistics */
              IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_BAD_ACK);
              /* 
               * Cookie check fail, drop it.
               */
              IP_VS_DBG(6, "syn_cookie check failed seq=%u\n",
                        ntohl(th->ack_seq) - 1);
              ip_vs_service_put(svc);
              *verdict = NF_DROP;
              return 0;
        } 

        /* update statistics */

        IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_OK_ACK);

        /* 此时判断为正常的连接请求,开始分配相关资源 */

        /*
         * Let the virtual server select a real server for the
         * incoming connection, and create a connection entry.
         */
         *cpp = ip_vs_schedule(svc, skb, 1);
         if (!*cpp) {
              IP_VS_DBG(6, "ip_vs_schedule failed\n");
              *verdict = ip_vs_leave(svc, skb, pp);
              return 0;
         } 

        /*
         * Release service, we don't need it any more.
         */
         ip_vs_service_put(svc);

        /*
         * Do anything but print a error msg when fail.
         * Because session will be correctly freed in ip_vs_conn_expire.
         */

         /* 向rs发送syn包开始三次握手 */
         if (!syn_proxy_send_rs_syn(af, th, *cpp, skb, pp, &opt)) {
                IP_VS_ERR_RL("syn_proxy_send_rs_syn failed!\n");
        }

        /* count in the ack packet (STOLEN by synproxy) */
        ip_vs_in_stats(*cpp, skb);

        /*
         * Active sesion timer, and dec refcnt. 
         * Also stole the skb, and let caller return immediately.
         */
         ip_vs_conn_put(*cpp);
         *verdict = NF_STOLEN;
          return 0;
     } 

     return 1;
}

syn_proxy_send_rs_syn()源码如下:

/*
 * Create syn packet and send it to rs.
 * ATTENTION: we also store syn skb in cp if syn retransimition
 * is tured on.
 *
 * @af:  address family of the client's ACK (AF_INET or AF_INET6)
 * @th:  TCP header of the client's ACK; ports are copied from it and the
 *       SYN's sequence number is the ACK's seq minus one
 * @cp:  connection the SYN belongs to; its packet_xmit sends the packet
 * @skb: the client's ACK; source of the IP addresses and, for fast xmit,
 *       of the device and MAC addresses
 * @pp:  protocol handler passed through to packet_xmit
 * @opt: TCP options recovered from the cookie, used to size and build the
 *       SYN's option block
 *
 * Returns 0 if cp->packet_xmit is NULL or the skb allocation fails,
 * 1 otherwise. The result of packet_xmit itself is not reported.
 */
static int
syn_proxy_send_rs_syn(int af, const struct tcphdr *th,
		      struct ip_vs_conn *cp, struct sk_buff *skb,
		      struct ip_vs_protocol *pp, struct ip_vs_synproxy_opt *opt)
{
	struct sk_buff *syn_skb;
	int tcp_hdr_size;
	__u8 tcp_flags = TCPCB_FLAG_SYN;
	unsigned int tcphoff;
	struct tcphdr *new_th;

	if (!cp->packet_xmit) {
		IP_VS_ERR_RL("warning: packet_xmit is null");
		return 0;
	}

	syn_skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
	if (unlikely(syn_skb == NULL)) {
		IP_VS_ERR_RL("alloc skb failed when send rs syn packet\n");
		return 0;
	}

	/* Reserve space for headers */
	skb_reserve(syn_skb, MAX_TCP_HEADER);
	tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			(opt->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			(opt->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			/* SACK_PERM is in the place of NOP NOP of TS */
			((opt->sack_ok
			  && !opt->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));

	new_th = (struct tcphdr *)skb_push(syn_skb, tcp_hdr_size);
	/* Compose tcp header */
	skb_reset_transport_header(syn_skb);
	syn_skb->csum = 0;

	/* Set tcp hdr */
	new_th->source = th->source;
	new_th->dest = th->dest;
	/* The SYN's seq is the ACK's seq minus one. */
	new_th->seq = htonl(ntohl(th->seq) - 1);
	new_th->ack_seq = 0;
	/*
	 * Write the 16-bit word at 16-bit offset 6 of the header in one
	 * store: header length in 32-bit words in the top 4 bits, flags
	 * in the low bits.
	 */
	*(((__u16 *) new_th) + 6) =
	    htons(((tcp_hdr_size >> 2) << 12) | tcp_flags);
	/* FIX_ME: what window should we use */
	new_th->window = htons(5000);
	new_th->check = 0;
	new_th->urg_ptr = 0;
	new_th->urg = 0;
	new_th->ece = 0;
	new_th->cwr = 0;

	/* Options are written right behind the fixed TCP header. */
	syn_proxy_syn_build_options((__be32 *) (new_th + 1), opt);

	/*
	 * Set ip hdr
	 * Attention: set source and dest addr to ack skb's.
	 * we rely on packet_xmit func to do NATs thing.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		struct ipv6hdr *ack_iph = ipv6_hdr(skb);
		struct ipv6hdr *iph =
		    (struct ipv6hdr *)skb_push(syn_skb, sizeof(struct ipv6hdr));

		tcphoff = sizeof(struct ipv6hdr);
		skb_reset_network_header(syn_skb);
		memcpy(&iph->saddr, &ack_iph->saddr, sizeof(struct in6_addr));
		memcpy(&iph->daddr, &ack_iph->daddr, sizeof(struct in6_addr));

		/*
		 * NOTE(review): only the fields below are assigned; the
		 * traffic-class/flow-label bits are never written in this
		 * function. Confirm whether packet_xmit fills them in.
		 */
		iph->version = 6;
		iph->nexthdr = NEXTHDR_TCP;
		iph->payload_len = htons(tcp_hdr_size);
		iph->hop_limit = IPV6_DEFAULT_HOPLIMIT;

		/* TCP checksum over the pseudo header and the TCP segment. */
		new_th->check = 0;
		syn_skb->csum =
		    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
		new_th->check =
		    csum_ipv6_magic(&iph->saddr, &iph->daddr,
				    syn_skb->len - tcphoff, IPPROTO_TCP,
				    syn_skb->csum);
	} else
#endif
	{
		struct iphdr *ack_iph = ip_hdr(skb);
		u32 rtos = RT_TOS(ack_iph->tos);
		struct iphdr *iph =
		    (struct iphdr *)skb_push(syn_skb, sizeof(struct iphdr));

		tcphoff = sizeof(struct iphdr);
		skb_reset_network_header(syn_skb);
		/* First 16 bits in one store: version 4, ihl 5, then tos. */
		*((__u16 *) iph) = htons((4 << 12) | (5 << 8) | (rtos & 0xff));
		iph->tot_len = htons(syn_skb->len);
		iph->frag_off = htons(IP_DF);
		/* FIX_ME: what ttl shoule we use */
		iph->ttl = IPDEFTTL;
		iph->protocol = IPPROTO_TCP;
		iph->saddr = ack_iph->saddr;
		iph->daddr = ack_iph->daddr;
		/*
		 * NOTE(review): iph->id is never assigned in this function;
		 * confirm whether packet_xmit sets it.
		 */

		ip_send_check(iph);

		/* TCP checksum over the pseudo header and the TCP segment. */
		new_th->check = 0;
		syn_skb->csum =
		    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
		new_th->check =
		    csum_tcpudp_magic(iph->saddr, iph->daddr,
				      syn_skb->len - tcphoff, IPPROTO_TCP,
				      syn_skb->csum);
	}

	/* Save syn_skb if syn retransmission is on */
	if (sysctl_ip_vs_synproxy_syn_retry > 0) {
		/*
		 * NOTE(review): the skb_copy() result is not checked, so
		 * cp->syn_skb may be NULL here; presumably the retransmit
		 * path copes with that - verify.
		 */
		cp->syn_skb = skb_copy(syn_skb, GFP_ATOMIC);
		atomic_set(&cp->syn_retry_max, sysctl_ip_vs_synproxy_syn_retry);
	}

	/* Save info for fast_response_xmit */
	if (sysctl_ip_vs_fast_xmit && skb->dev &&
	    likely(skb->dev->type == ARPHRD_ETHER) &&
	    skb_mac_header_was_set(skb)) {
		struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);

		/* First device seen for this connection: take a reference. */
		if (likely(cp->indev == NULL)) {
			cp->indev = skb->dev;
			dev_hold(cp->indev);
		}

		/* Device changed: drop the old reference, take a new one. */
		if (unlikely(cp->indev != skb->dev)) {
			dev_put(cp->indev);
			cp->indev = skb->dev;
			dev_hold(cp->indev);
		}

		memcpy(cp->src_hwaddr, eth->h_source, ETH_ALEN);
		memcpy(cp->dst_hwaddr, eth->h_dest, ETH_ALEN);
		IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SYNPROXY_SAVE);
		IP_VS_DBG_RL("syn_proxy_send_rs_syn netdevice:%s\n",
			     netdev_name(skb->dev));
	}

	/* count in the syn packet */
	ip_vs_in_stats(cp, skb);

	/* If xmit failed, syn_skb will be freed correctly. */
	cp->packet_xmit(syn_skb, cp, pp);

	return 1;
}