IPVS源代码分析---hook函数

有一个问题:IPVS中的local client 指什么? 在2.6.32的代码中,多了对于Local client的判断。
查阅IPVS的官方文档http://www.linuxvirtualserver.org/Documents.html, 可以看到所谓Local client 就是指director本身也作为一个server的情况,在这种情况下,director可以处理packet locally,而不是进行转发,所以称为local client(我觉得这里叫local server 更合适)。

对于local client的情况,如果要将包发送给本地,那么在ip_vs_in中会创建这样的一个connection entry。现在对于local client在ip_vs_in中的处理还是没弄明白。理论上,对于从local server发给client 的包,它不会经过forward这个点,但是会经过localOutput这个点。所以,可以在localOutput这个点,添加钩子,并判断数据包是不是conn_out_get,然后调用handle_response进行处理。但是,现在在程序中,把这些处理放到了ip_vs_in函数中,它是添加在localInput这个hook点的,不符合逻辑。
所以,合理的推断是,第一次数据包进入ip_vs_in时,创建了connection entry。然后,经过ip_vs_in的处理,将目的地址和端口改成了DR的另一个地址和端口,然后将包发送。发送之后,数据包会再次经过ip_vs_in。也就是说 从内核发往本地的另一个网口的包,会再次的经过ip_rcv。目前我还没有找到这个调用过程,但是我猜是这样的(这个推断应该是正确的)。


另外,在IPVS中,对于connection,从client(sip, sport)->server(dip,dport)和server(dip,dport)->client(sip,sport)的双向的数据为同一个connection entry。
对于tcp和udp,在协议中都包含了连接查找函数conn_in_get和conn_out_get,分别用于正向连接查找和反向连接查找。它们最终调用的也分别是__ip_vs_conn_in_get和__ip_vs_conn_out_get,在这两个函数中,对于正向和反向,最后查找到的是同一个connection。
另外,conn_out_get只用在NAT的情况 和 local client的情况。
 
  

/*
 * Netfilter hook registration record for ip_vs_in: protocol family
 * PF_INET, hook point NF_IP_LOCAL_IN, priority 100.
 */
static struct nf_hook_ops ip_vs_in_ops = {
        .hook           = ip_vs_in,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_LOCAL_IN,
        .priority       = 100,
};
ip_vs_in 是所有从client发过来的数据包的总入口。也就是说数据包首先经过ip_vs_in的处理。

/*
 *	Check if it's for virtual services, look it up,
 *	and send it on its way...
 *
 *	Netfilter hook function. Steps, in order:
 *	  1. ignore anything that is not PACKET_HOST;
 *	  2. give ICMP / ICMPv6 packets to the ICMP handler and return its
 *	     verdict when the packet is related to an IPVS connection;
 *	  3. look up an existing connection (conn_in_get); if none, try the
 *	     reverse lookup (conn_out_get) and treat a hit as a response,
 *	     otherwise ask the protocol to schedule a new connection;
 *	  4. drop the packet if the connection's destination is unavailable;
 *	  5. update stats/state, transmit through cp->packet_xmit, and
 *	     possibly synchronize the connection.
 *
 *	Returns a netfilter verdict: NF_ACCEPT when the packet is not
 *	handled here, NF_DROP when the destination is unavailable, otherwise
 *	whatever the ICMP handler, handle_response(), conn_schedule() or
 *	packet_xmit() produced.
 */
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
	 const struct net_device *in, const struct net_device *out,
	 int (*okfn)(struct sk_buff *))
{
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn *cp;
	int ret, restart, af, pkts;

	/* Address family is derived from the link-level protocol of the skb */
	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;

	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

	/*
	 *	Only handle PACKET_HOST packets (this includes loopback
	 *	traffic for local clients).
	 *	Don't handle local packets on IPv6 for now
	 */
	if (unlikely(skb->pkt_type != PACKET_HOST)) {
		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
			      skb->pkt_type,
			      iph.protocol,
			      IP_VS_DBG_ADDR(af, &iph.daddr));
		return NF_ACCEPT;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);

			/* ICMPv6 related to an IPVS connection: handler's verdict is final */
			if (related)
				return verdict;
			/* Not related: re-read the header before continuing */
			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);

			/* ICMP related to an IPVS connection: handler's verdict is final */
			if (related)
				return verdict;
			/* Not related: re-read the header before continuing */
			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
		}

	/* Protocol supported? */
	pp = ip_vs_proto_get(iph.protocol);
	if (unlikely(!pp))
		return NF_ACCEPT;

	/*
	 * Check if the packet belongs to an existing connection entry
	 */
	cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);

	if (unlikely(!cp)) {
		int v;

		/*
		 * For local client packets, it could be a response: retry
		 * the lookup in the output direction and, on a hit, let
		 * handle_response() process the packet.
		 */
		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
		if (cp)
			return handle_response(af, skb, pp, cp, iph.len);

		/*
		 * No entry in either direction: try to schedule a new
		 * connection. A zero return means the scheduler already
		 * decided the verdict, stored in v.
		 */
		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
			return v;
	}

	if (unlikely(!cp)) {
		/* sorry, all this trouble for a no-hit :) */
		IP_VS_DBG_PKT(12, pp, skb, 0,
			      "packet continues traversal as normal");
		return NF_ACCEPT;
	}

	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");

	/* Check the server status */
	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
		/* the destination server is not available */

		if (sysctl_ip_vs_expire_nodest_conn) {
			/* try to expire the connection immediately */
			ip_vs_conn_expire_now(cp);
		}
		/* don't restart its timer, and silently
		   drop the packet. */
		__ip_vs_conn_put(cp);
		return NF_DROP;
	}

	ip_vs_in_stats(cp, skb);
	/* NOTE(review): 'restart' is assigned here but not read again in this function */
	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
	if (cp->packet_xmit)
		ret = cp->packet_xmit(skb, cp, pp);
		/* do not touch skb anymore */
	else {
		IP_VS_DBG_RL("warning: packet_xmit is null");
		ret = NF_ACCEPT;
	}

	/* Increase its packet counter and check if it is needed
	 * to be synchronized
	 *
	 * Sync connection if it is about to close to
	 * encourage the standby servers to update the connections timeout
	 *
	 * Sync happens only for AF_INET while the MASTER sync state is set,
	 * and when either:
	 *  - the connection is non-TCP or TCP ESTABLISHED, and the packet
	 *    count modulo sysctl_ip_vs_sync_threshold[1] equals
	 *    sysctl_ip_vs_sync_threshold[0]; or
	 *  - the connection is TCP, its state just changed, and the new
	 *    state is FIN_WAIT, CLOSE_WAIT or TIME_WAIT.
	 */
	pkts = atomic_add_return(1, &cp->in_pkts);
	if (af == AF_INET &&
	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
	    (((cp->protocol != IPPROTO_TCP ||
	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
	      (pkts % sysctl_ip_vs_sync_threshold[1]
	       == sysctl_ip_vs_sync_threshold[0])) ||
	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
		ip_vs_sync_conn(cp);
	/* Remember the state so the next packet can detect a state change */
	cp->old_state = cp->state;

	ip_vs_conn_put(cp);
	return ret;
}


forward处的钩子。这个函数对转发包进行处理, 只用在NAT模式的均衡处理,处理的是服务器返回的包,因为TUNNEL和DR方式下都是直接发给了client,不经过load balancer的处理。客户端请求的包也不经过这个hook,客户端的请求包经过的是local in的hook。
但如果设置了DNAT规则,数据包在PREROUTING点进行了目的地址修改,这样就不会再进入INPUT点而是直接转到FORWARD点处理,这时针对该包的IPVS连接是没有建立的。
/*
 * Netfilter hook registration record for ip_vs_out: protocol family
 * PF_INET, hook point NF_IP_FORWARD, priority 100.
 */
static struct nf_hook_ops ip_vs_out_ops = {
        .hook           = ip_vs_out,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_FORWARD,
        .priority       = 100,
};

/*
 * FORWARD hook: handle packets travelling from a real server back to the
 * client. If the packet matches an IPVS connection (conn_out_get), its
 * protocol header is rewritten by the protocol's snat_handler, the source
 * address is replaced by the connection's virtual address, the route is
 * recomputed, and the packet is marked as processed by IPVS.
 *
 * Returns NF_ACCEPT for packets that are not handled or were rewritten
 * successfully, NF_DROP after sending an ICMP port-unreachable for an
 * unmatched packet coming from a known real service, NF_STOLEN when the
 * skb was consumed (failed defragmentation or the drop path), or the
 * verdict of ip_vs_out_icmp() for related ICMP packets.
 */
static unsigned int ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *))
{
      struct sk_buff  *skb = *pskb;
      struct iphdr    *iph;
      struct ip_vs_protocol *pp;
      struct ip_vs_conn *cp;
      int ihl;

      // One-bit flag: when set, the packet was already processed by IPVS, so return at once
      if (skb->ipvs_property)
            return NF_ACCEPT;

      iph = skb->nh.iph;
      if (unlikely(iph->protocol == IPPROTO_ICMP)) {
            // Handle ICMP errors possibly related to a connection (e.g. address/port unreachable)
	    int related, verdict = ip_vs_out_icmp(pskb, &related);
            if (related)
                  return verdict;

            // ip_vs_out_icmp received pskb and may have replaced the skb: re-fetch both pointers
            skb = *pskb;
            iph = skb->nh.iph;
      }
      // Look up the IPVS protocol handler (one of tcp/udp/ah/esp per the original note)
      pp = ip_vs_proto_get(iph->protocol);
      if (unlikely(!pp))
            return NF_ACCEPT;

      // Reassemble fragments unless the protocol sets dont_defrag.
      // Original author's note: this should rarely trigger, since fragments are
      // expected to be reassembled when the packet enters netfilter.
	if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) {
            skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);

            // NULL: the fragment was consumed (queued or freed) by the reassembly code
            if (!skb)
                  return NF_STOLEN;

            iph = skb->nh.iph;
            *pskb = skb;
      }
      ihl = iph->ihl << 2; // IP header length in bytes
      // Look up the IPVS connection in the output (server -> client) direction
      cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
      if (unlikely(!cp)) { // No connection found (original note: possibly a request-direction packet that got here via DNAT)
            // Only when sysctl_ip_vs_nat_icmp_send is set and the protocol is TCP or UDP
            if (sysctl_ip_vs_nat_icmp_send && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP)) {
                  __u16 _ports[2], *pptr;
                  // Fetch the first two 16-bit words after the IP header (source and destination port)
                  pptr = skb_header_pointer(skb, ihl, sizeof(_ports), _ports);
                  if (pptr == NULL)
                        return NF_ACCEPT;

                  // Look up a real server by source address and source port; for a
                  // request-direction packet nothing is found and IPVS leaves the packet alone
                  if (ip_vs_lookup_real_service(iph->protocol, iph->saddr, pptr[0])) {
                        // Packet comes from a real server but has no connection entry:
                        // unless it is a TCP reset, answer with ICMP port unreachable and drop it
                        if (iph->protocol != IPPROTO_TCP || !is_tcp_reset(skb)) {
                              icmp_send(skb,ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); // send ICMP port unreachable
                              return NF_DROP;
                        }
                  }

            }
            return NF_ACCEPT;

      }
      // Connection found: this packet is a reply from the real server.
      // The skb must be writable before it is modified.
      if (!ip_vs_make_skb_writable(pskb, ihl))
            goto drop;

      // Rewrite the protocol-specific part (e.g. TCP/UDP ports) through the protocol's snat_handler
      if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
             goto drop;

      // The calls above received pskb and may have replaced the skb: re-fetch it
      skb = *pskb;
      // Server reply: only the source address is rewritten, to the virtual address
      skb->nh.iph->saddr = cp->vaddr;
      ip_send_check(skb->nh.iph); // recompute the IP header checksum

      // Recompute the routing information, treating the packet as locally generated (RTN_LOCAL)
      if (__ip_route_me_harder(pskb, RTN_LOCAL) != 0)
            goto drop;

      skb = *pskb;
      // IPVS output statistics
      ip_vs_out_stats(cp, skb);
      ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); // connection state transition
      ip_vs_conn_put(cp); // release the connection reference (original note: also adjusts the connection timeout)

      skb->ipvs_property = 1; // mark the packet as already processed by IPVS

      return NF_ACCEPT;
drop:
      // Error path: release the connection, free the skb ourselves and report it as stolen
      ip_vs_conn_put(cp);
      kfree_skb(*pskb);
      return NF_STOLEN;
}

/*
 * Netfilter hook registration record for ip_vs_forward_icmp: protocol
 * family PF_INET, hook point NF_IP_FORWARD, priority 99.
 */
static struct nf_hook_ops ip_vs_forward_icmp_ops = {
        .hook           = ip_vs_forward_icmp,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_FORWARD,
        .priority       = 99, // runs before ip_vs_out_ops (priority 100) on the same hook
};

/*
 * FORWARD hook for ICMP packets. Non-ICMP packets are accepted untouched;
 * ICMP packets are handed to ip_vs_in_icmp(), whose verdict is returned.
 * Per the original note, the ICMP messages of interest are network/port
 * unreachable errors caused by a failed real server; ICMP unrelated to the
 * servers is not processed.
 *
 * The "related" flag filled in by ip_vs_in_icmp() is not used here.
 */
static unsigned int ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in,
                                       const struct net_device *out, int (*okfn)(struct sk_buff *))
{
        int related;

        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP)
                return ip_vs_in_icmp(pskb, &related, hooknum);

        return NF_ACCEPT;
}

/*
 * Handle an incoming ICMP packet that may refer to an IPVS connection.
 *
 * The IP header embedded in the ICMP payload is used to find the
 * connection; on a match the packet is forwarded through ip_vs_icmp_xmit().
 *
 * @pskb:    in/out packet pointer; updated if defragmentation returns a new skb
 * @related: set to 0 when the ICMP type is not one of DEST_UNREACH,
 *           SOURCE_QUENCH or TIME_EXCEEDED; left at 1 on every other path
 * @hooknum: hook the packet was seen on; selects the defragmentation user id
 *
 * Returns a netfilter verdict (NF_ACCEPT, NF_DROP, NF_STOLEN, or the result
 * of ip_vs_icmp_xmit()).
 */
static int ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
{
      struct sk_buff *skb = *pskb;
      struct iphdr *iph;
      struct icmphdr  _icmph, *ic;
      struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
      struct ip_vs_conn *cp;
      struct ip_vs_protocol *pp;
      unsigned int offset, ihl, verdict;

      *related = 1; // tells the caller whether this ICMP packet is related to IPVS; ip_vs_forward_icmp ignores it

      if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
            // Reassemble IP fragments first
            skb = ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ? IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
            if (!skb)
                  return NF_STOLEN;
            *pskb = skb;
      }
      iph = skb->nh.iph; // outer IP header
      offset = ihl = iph->ihl * 4; // start of the IP payload
      // Fetch the ICMP header
      ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
      if (ic == NULL)
            return NF_DROP;
      // Any ICMP type other than these three is unrelated to IPVS
      if ((ic->type != ICMP_DEST_UNREACH) && (ic->type != ICMP_SOURCE_QUENCH) && (ic->type != ICMP_TIME_EXCEEDED)) {
            *related = 0;
            return NF_ACCEPT;
      }

      offset += sizeof(_icmph); // skip the ICMP header to reach the embedded IP header
      cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
      if (cih == NULL)
            return NF_ACCEPT;
      // Look up the protocol of the original packet embedded in the ICMP message, not ICMP itself
      pp = ip_vs_proto_get(cih->protocol);
      if (!pp)
            return NF_ACCEPT;
      // Embedded header has a non-zero fragment offset and the protocol sets dont_defrag: leave the packet alone
      if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && pp->dont_defrag))
            return NF_ACCEPT;
      offset += cih->ihl * 4; // + embedded IP header length
      // Look up the IPVS connection from the embedded header
      // NOTE(review): the final argument 1 presumably requests an inverse lookup - confirm against conn_in_get
      cp = pp->conn_in_get(skb, pp, cih, offset, 1);
      if (!cp)
            return NF_ACCEPT;
      // Default verdict is to drop the packet
      verdict = NF_DROP;
      // Verify the checksum starting at offset ihl unless ip_summed is CHECKSUM_UNNECESSARY
      if (skb->ip_summed != CHECKSUM_UNNECESSARY && ip_vs_checksum_complete(skb, ihl)) {
            IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", NIPQUAD(iph->saddr));
            goto out;
      }
      ip_vs_in_stats(cp, skb); // input statistics
      // For embedded TCP/UDP, extend the offset by the first 4 bytes: source and destination port
      if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
            offset += 2 * sizeof(__u16);
      // Transmit the ICMP packet for this connection
      verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
out:
      // Release the connection reference taken by the lookup
      __ip_vs_conn_put(cp);
      return verdict;
}

发送各种ICMP错误信息包
/*
 * Transmit an ICMP packet that belongs to IPVS connection cp.
 *
 * For connections whose forwarding method is not IP_VS_CONN_F_MASQ the
 * packet is sent with the connection's own packet_xmit function. For MASQ
 * (NAT) connections the packet is routed towards the destination, checked
 * against the MTU, made writable up to 'offset' bytes, rewritten by
 * ip_vs_nat_icmp() and sent with IP_VS_XMIT.
 *
 * @skb:    the ICMP packet
 * @cp:     connection the packet belongs to
 * @pp:     protocol handler of the embedded packet
 * @offset: number of leading bytes that must be writable
 *
 * Returns the packet_xmit() result or NF_ACCEPT on the non-MASQ path;
 * NF_STOLEN on the MASQ path, both after a successful send and after an
 * error (the skb is freed on error).
 */
int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset)
{
      struct rtable   *rt;    /* Route to the other host */
      int mtu;
      int rc;
      // Not a NAT (MASQ) connection, e.g. TUNNEL or DR: send with the connection's own transmit function
      if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
            if (cp->packet_xmit)
                  rc = cp->packet_xmit(skb, cp, pp);
            else
                  rc = NF_ACCEPT;

            /* do not touch skb anymore */
            atomic_inc(&cp->in_pkts);
            goto out;
      }
      // Look up the output route
      if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
	goto tx_error_icmp;

      mtu = dst_mtu(&rt->u.dst);
      // Packet exceeds the MTU but has DF set: send an ICMP "fragmentation needed" error
      if ((skb->len > mtu) && (skb->nh.iph->frag_off & __constant_htons(IP_DF))) {
             ip_rt_put(rt);
             icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
             IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
             goto tx_error;
      }
      // Make the first 'offset' bytes of the skb writable
      if (!ip_vs_make_skb_writable(&skb, offset))
             goto tx_error_put;
      // Reserve enough headroom for the output device's hardware header
      if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
	    goto tx_error_put;
      // Replace the skb's old route with the new one
      dst_release(skb->dst);
      skb->dst = &rt->u.dst;
      // Rewrite the ICMP packet for NAT
      ip_vs_nat_icmp(skb, pp, cp, 0);
      /* Another hack: avoid icmp_send in ip_fragment */
      skb->local_df = 1;

      // Send the packet through the LOCAL_OUT hook chain; the macro is quoted here for reference
      /* #define IP_VS_XMIT(skb, rt)                             \
       * do {                                                    \
       *       (skb)->ipvs_property = 1;                       \
       *       (skb)->ip_summed = CHECKSUM_NONE;               \
       *       NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, (rt)->u.dst.dev, dst_output);           \
       * } while (0)
       */
      IP_VS_XMIT(skb, rt);
      rc = NF_STOLEN; // the skb must not go back to the normal IP stack
      goto out;
tx_error_icmp:
      // No route found: report link failure, then fall through to free the skb
      dst_link_failure(skb);
tx_error:
      dev_kfree_skb(skb);
      rc = NF_STOLEN;
out:
      return rc;
tx_error_put:
      // Error after the route was obtained: drop the route reference, then free the skb
      ip_rt_put(rt);
      goto tx_error;
}
/*
 * Netfilter hook registration record for ip_vs_post_routing: protocol
 * family PF_INET, hook point NF_IP_POST_ROUTING, priority one below
 * NF_IP_PRI_NAT_SRC.
 */
static struct nf_hook_ops ip_vs_post_routing_ops = {
        .hook           = ip_vs_post_routing,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_POST_ROUTING,
        .priority       = NF_IP_PRI_NAT_SRC-1, // runs just before source NAT
};

这个函数对最后要发出的包进行检查:如果这个包已经被IPVS修改过,就不用再被netfilter修改,此时返回NF_STOP。
 
  
如果没被IPVS处理过,继续后续hook点操作
static unsigned int ip_vs_post_routing(unsigned int hooknum, struct sk_buff **pskb, const struct net_device * in , const struct net_device * out , int (*okfn)( struct sk_buff * )){
        if (!((*pskb)->ipvs_property)) //如果没被IPVS处理过,继续后续hook点操作                
              return NF_ACCEPT;        //STOP就不继续后面的低优先级的hook_ops的操作了        
    return NF_STOP;
}

你可能感兴趣的:(IPVS的研究和分析)