kube-proxy细节分析

其实kube-proxy的代码本身并不复杂,只是有个细节容易被大家忽略。大家可能都知道它有轮询的负载均衡策略,是通过iptables实现的,那它是怎样控制平均转发的呢?iptables的statistic模块支持random模式,那又是怎样控制每个后端的权重(概率)的呢?
看代码,一步一步分析

    {
        tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT}
        for _, table := range tablesNeedServicesChain {
            if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil {
                glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err)
                return
            }
        }

        tableChainsNeedJumpServices := []struct {
            table utiliptables.Table
            chain utiliptables.Chain
        }{
            {utiliptables.TableFilter, utiliptables.ChainInput},
            {utiliptables.TableFilter, utiliptables.ChainOutput},
            {utiliptables.TableNAT, utiliptables.ChainOutput},
            {utiliptables.TableNAT, utiliptables.ChainPrerouting},
        }
        comment := "kubernetes service portals"
        args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)}
        for _, tc := range tableChainsNeedJumpServices {
            if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil {
                glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err)
                return
            }
        }
    }

首先是在filter表的INPUT/OUTPUT链和nat表的OUTPUT/PREROUTING链上建立规则,全部跳转到KUBE-SERVICES链
效果如下:

-A OUTPUT -m comment --comment "kubernetes service portals" -j KUBE-SERVICES

-A PREROUTING -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
-A OUTPUT -m comment --comment "kubernetes service portals" -j KUBE-SERVICES

这样进入本机的流量(PREROUTING/INPUT)和本机发出的流量(OUTPUT)都会被KUBE-SERVICES链截获了

当然,有些流量需要做SNAT之后再出去,所以还要在nat表的POSTROUTING链上建立跳转到KUBE-POSTROUTING链的规则:

    {
        if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil {
            glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err)
            return
        }

        comment := "kubernetes postrouting rules"
        args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)}
        if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
            glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err)
            return
        }
    }

效果如下:

-A POSTROUTING -m comment --comment "kubernetes postrouting rules" -j KUBE-POSTROUTING
-A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -m mark --mark 0x4000/0x4000 -j MASQUERADE

现在开始建立kubernetes proxy的各个链

    writeLine(proxier.filterChains, "*filter")
    writeLine(proxier.natChains, "*nat")

    // Make sure we keep stats for the top-level chains, if they existed
    // (which most should have because we created them above).
    if chain, ok := existingFilterChains[kubeServicesChain]; ok {
        writeLine(proxier.filterChains, chain)
    } else {
        writeLine(proxier.filterChains, utiliptables.MakeChainLine(kubeServicesChain))
    }
    if chain, ok := existingNATChains[kubeServicesChain]; ok {
        writeLine(proxier.natChains, chain)
    } else {
        writeLine(proxier.natChains, utiliptables.MakeChainLine(kubeServicesChain))
    }
    if chain, ok := existingNATChains[kubeNodePortsChain]; ok {
        writeLine(proxier.natChains, chain)
    } else {
        writeLine(proxier.natChains, utiliptables.MakeChainLine(kubeNodePortsChain))
    }
    if chain, ok := existingNATChains[kubePostroutingChain]; ok {
        writeLine(proxier.natChains, chain)
    } else {
        writeLine(proxier.natChains, utiliptables.MakeChainLine(kubePostroutingChain))
    }
    if chain, ok := existingNATChains[KubeMarkMasqChain]; ok {
        writeLine(proxier.natChains, chain)
    } else {
        writeLine(proxier.natChains, utiliptables.MakeChainLine(KubeMarkMasqChain))
    }

这段代码创建了KUBE-SERVICES、KUBE-NODEPORTS、KUBE-POSTROUTING、KUBE-MARK-MASQ这几条链;如果链已经存在,则沿用原有的链定义行,以保留这些顶层链的统计计数

通过kubernetes创建的service会分配一个clusterIP,这些clusterIP是在iptables上面实现的

        args := []string{
            "-A", string(kubeServicesChain),
            "-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcNameString),
            "-m", protocol, "-p", protocol,
            "-d", fmt.Sprintf("%s/32", svcInfo.clusterIP.String()),
            "--dport", fmt.Sprintf("%d", svcInfo.port),
        }
        if proxier.masqueradeAll {
            writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
        }
        if len(proxier.clusterCIDR) > 0 {
            writeLine(proxier.natRules, append(args, "! -s", proxier.clusterCIDR, "-j", string(KubeMarkMasqChain))...)
        }
        writeLine(proxier.natRules, append(args, "-j", string(svcChain))...)

上面就是截获发往clusterIP的流量,并跳转到该服务对应的链(KUBE-SVC-xxx),真正的DNAT在后面每个endpoint对应的链里完成。这里需要补充的是,如果一个服务后面有多个endpoint,就要在这些endpoint之间做负载均衡:

for i, endpointChain := range endpointChains {
            // Balancing rules in the per-service chain.
            args := []string{
                "-A", string(svcChain),
                "-m", "comment", "--comment", svcNameString,
            }
            if i < (n - 1) {
                // Each rule is a probabilistic match.
                args = append(args,
                    "-m", "statistic",
                    "--mode", "random",
                    "--probability", fmt.Sprintf("%0.5f", 1.0/float64(n-i)))
            }
            // The final (or only if n == 1) rule is a guaranteed match.
            args = append(args, "-j", string(endpointChain))
            writeLine(proxier.natRules, args...)

            // Rules in the per-endpoint chain.
            args = []string{
                "-A", string(endpointChain),
                "-m", "comment", "--comment", svcNameString,
            }
            // Handle traffic that loops back to the originator with SNAT.
            writeLine(proxier.natRules, append(args,
                "-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i].endpoint, ":")[0]),
                "-j", string(KubeMarkMasqChain))...)
            // Update client-affinity lists.
            if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP {
                args = append(args, "-m", "recent", "--name", string(endpointChain), "--set")
            }
            // DNAT to final destination.
            args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", endpoints[i].endpoint)
            writeLine(proxier.natRules, args...)
        }

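上面通过循环的方式为每个后端endpoint创建转发规则,概率由--probability后面的1.0/float64(n-i)计算得出。譬如有两个endpoint的场景,概率依次是0.5和1(最后一条规则不带statistic匹配,必然命中),也就是第一条规则50%概率、第二条100%概率;三个endpoint时类似,依次为33%、50%、100%。注意iptables规则是按顺序匹配的,后一条规则只有在前面的规则都没有命中时才会被执行,所以每个endpoint实际被选中的概率是相等的:以三个endpoint为例,第一个是1/3,第二个是(1-1/3)×1/2=1/3,第三个是(1-1/3)×(1-1/2)×1=1/3。下面是10个endpoint的例子。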

kubectl get svc --all-namespaces
NAMESPACE      NAME                    CLUSTER-IP      EXTERNAL-IP   PORT(S)                      AGE
admin          docker2048              10.13.52.135    11.11.1.1     80/TCP                       1d


[root@master-62 ~]# 
[root@master-62 ~]# iptables-save |grep 10.13.52.135
-A KUBE-SERVICES -d 10.13.52.135/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 cluster IP" -m tcp --dport 80 -j KUBE-SVC-MHWEDWK6NM5OGU2T
[root@master-62 ~]# 
[root@master-62 ~]# 
[root@master-62 ~]# iptables-save |grep KUBE-SVC-MHWEDWK6NM5OGU2T
:KUBE-SVC-MHWEDWK6NM5OGU2T - [0:0]
-A KUBE-SERVICES -d 10.13.52.135/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 cluster IP" -m tcp --dport 80 -j KUBE-SVC-MHWEDWK6NM5OGU2T
-A KUBE-SERVICES -d 11.11.1.1/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 external IP" -m tcp --dport 80 -m physdev ! --physdev-is-in -m addrtype ! --src-type LOCAL -j KUBE-SVC-MHWEDWK6NM5OGU2T
-A KUBE-SERVICES -d 11.11.1.1/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 external IP" -m tcp --dport 80 -m addrtype --dst-type LOCAL -j KUBE-SVC-MHWEDWK6NM5OGU2T
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.10000000009 -j KUBE-SEP-VC767CJYOTCBCN3B
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.11110999994 -j KUBE-SEP-HQELSIUR5HSCB2VN
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.12500000000 -j KUBE-SEP-X2UDSU7Q4UA4IKY7
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.14286000002 -j KUBE-SEP-DQ3TZIZIDTXU77P7
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.16667000018 -j KUBE-SEP-A3JWOZYQIIDDEKNM
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.20000000019 -j KUBE-SEP-6EZ2MUBOPU2WH44E
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.25000000000 -j KUBE-SEP-4KG3GD3BQ5TCAUPR
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.33332999982 -j KUBE-SEP-6EXLETYC4LYB5NLM
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.50000000000 -j KUBE-SEP-VLQQMEFA6Y5RZLE7
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -j KUBE-SEP-CXDZACZ7ESWWLYJM

你可能感兴趣的:(Kubernetes)