Kubernetes version:
v1.9.11
OS:
NAME="CentOS Linux"
VERSION="7 (Core)"
ID="centos"
ID_LIKE="rhel fedora"
VERSION_ID="7"
PRETTY_NAME="CentOS Linux 7 (Core)"
ANSI_COLOR="0;31"
CPE_NAME="cpe:/o:centos:centos:7"
HOME_URL="https://www.centos.org/"
BUG_REPORT_URL="https://bugs.centos.org/"
CENTOS_MANTISBT_PROJECT="CentOS-7"
CENTOS_MANTISBT_PROJECT_VERSION="7"
REDHAT_SUPPORT_PRODUCT="centos"
REDHAT_SUPPORT_PRODUCT_VERSION="7"
问题描述:在部署完 k8s 集群之后,部署的 ClusterIP 类型的服务都能正常使用,但其中一些 NodePort 类型的服务不能正常访问。
然后查看 ipvs 中的规则,发现没有对应的规则。把 kube-proxy 的启动日志级别调高到 4 之后查看日志,发现其他类型的 service 都有添加 ipvs virtual server 的日志,
唯独 NodePort 类型的 service 没有对应的添加 ipvs virtual server 的日志。于是通过阅读处理 NodePort service 的代码,找出了问题的原因。
代码具体位置:
https://github.com/kubernetes/kubernetes/blob/release-1.9/pkg/proxy/ipvs/proxier.go#L1312
https://github.com/kubernetes/kubernetes/blob/release-1.9/pkg/proxy/ipvs/proxier.go#L176
下面截取代码片段:
// This is where all of the ipvs calls happen.
// assumes proxier.mu is held
func (proxier *Proxier) syncProxyRules() {
......
if svcInfo.nodePort != 0 {
lp := utilproxy.LocalPort{
Description: "nodePort for " + svcNameString,
IP: "",
Port: svcInfo.nodePort,
Protocol: protocol,
}
if proxier.portsMap[lp] != nil {
glog.V(4).Infof("Port %s was open before and is still needed", lp.String())
replacementPortsMap[lp] = proxier.portsMap[lp]
} else {
socket, err := proxier.portMapper.OpenLocalPort(&lp)
if err != nil {
glog.Errorf("can't open %s, skipping this nodePort: %v", lp.String(), err)
continue
}
if lp.Protocol == "udp" {
isIPv6 := utilproxy.IsIPv6(svcInfo.clusterIP)
utilproxy.ClearUDPConntrackForPort(proxier.exec, lp.Port, isIPv6)
}
replacementPortsMap[lp] = socket
} // We're holding the port, so it's OK to install ipvs rules.
// Nodeports need SNAT, unless they're local.
// ipset call
if !svcInfo.onlyNodeLocalEndpoints {
entry = &utilipset.Entry{
// No need to provide ip info
Port: svcInfo.nodePort,
Protocol: protocol,
SetType: utilipset.BitmapPort,
}
switch protocol {
case "tcp":
proxier.nodePortSetTCP.activeEntries.Insert(entry.String())
case "udp":
proxier.nodePortSetUDP.activeEntries.Insert(entry.String())
default:
// It should never hit
glog.Errorf("Unsupported protocol type: %s", protocol)
}
}
// Build ipvs kernel routes for each node ip address
nodeIPs, err := proxier.ipGetter.NodeIPs()
if err != nil {
glog.Errorf("Failed to get node IP, err: %v", err)
} else {
for _, nodeIP := range nodeIPs {
// ipvs call
serv := &utilipvs.VirtualServer{
Address: nodeIP,
Port: uint16(svcInfo.nodePort),
Protocol: string(svcInfo.protocol),
Scheduler: proxier.ipvsScheduler,
}
if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP {
serv.Flags |= utilipvs.FlagPersistent
serv.Timeout = uint32(svcInfo.stickyMaxAgeSeconds)
}
// There is no need to bind Node IP to dummy interface, so set parameter `bindAddr` to `false`.
if err := proxier.syncService(svcNameString, serv, false); err == nil {
activeIPVSServices[serv.String()] = true
if err := proxier.syncEndpoint(svcName, svcInfo.onlyNodeLocalEndpoints, serv); err != nil {
glog.Errorf("Failed to sync endpoint for service: %v, err: %v", serv, err)
}
} else {
glog.Errorf("Failed to sync service: %v, err: %v", serv, err)
}
}
}
}
}
......
}
proxier.ipGetter.NodeIPs() 的真实实现如下:
// NodeIPs collects the IP addresses assigned to this host's network
// interfaces whose name starts with "eth".
//
// It lists the interfaces with net.Interfaces, skips every interface whose
// name does not carry the "eth" prefix, and appends the IP of each
// *net.IPNet address found on the remaining interfaces.
//
// Returns the collected IPs. An error is returned only when net.Interfaces
// itself fails; failures for an individual interface are reported through
// utilruntime.HandleError and that interface is skipped.
//
// NOTE(review): when no interface name starts with "eth", nothing is ever
// appended and the function returns a nil slice together with a nil error,
// so a caller cannot tell "no node IPs" apart from success.
func (r *realIPGetter) NodeIPs() (ips []net.IP, err error) {
interfaces, err := net.Interfaces()
if err != nil {
return nil, err
}
for i := range interfaces {
name := interfaces[i].Name
// We assume node ip bind to eth{x}
// Any interface with a different name prefix is ignored entirely.
if !strings.HasPrefix(name, "eth") {
continue
}
// The err declared with := inside the loop body shadows the named result,
// so per-interface failures never reach the caller.
intf, err := net.InterfaceByName(name)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Failed to get interface by name: %s, error: %v", name, err))
continue
}
addrs, err := intf.Addrs()
if err != nil {
utilruntime.HandleError(fmt.Errorf("Failed to get addresses from interface: %s, error: %v", name, err))
continue
}
for _, a := range addrs {
// Only addresses of concrete type *net.IPNet contribute an IP.
if ipnet, ok := a.(*net.IPNet); ok {
ips = append(ips, ipnet.IP)
}
}
}
// Naked return: yields the accumulated ips and the outer err, which is nil
// once net.Interfaces has succeeded.
return
}
syncProxyRules 方法处理 NodePort 类型的 service 的核心代码主要是:
nodeIPs, err := proxier.ipGetter.NodeIPs()
for _, nodeIP := range nodeIPs {
// ipvs call
serv := &utilipvs.VirtualServer{
Address: nodeIP,
Port: uint16(svcInfo.nodePort),
Protocol: string(svcInfo.protocol),
Scheduler: proxier.ipvsScheduler,
}
if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP {
serv.Flags |= utilipvs.FlagPersistent
serv.Timeout = uint32(svcInfo.stickyMaxAgeSeconds)
}
// There is no need to bind Node IP to dummy interface, so set parameter `bindAddr` to `false`.
if err := proxier.syncService(svcNameString, serv, false); err == nil {
activeIPVSServices[serv.String()] = true
if err := proxier.syncEndpoint(svcName, svcInfo.onlyNodeLocalEndpoints, serv); err != nil {
glog.Errorf("Failed to sync endpoint for service: %v, err: %v", serv, err)
}
} else {
glog.Errorf("Failed to sync service: %v, err: %v", serv, err)
}
}
// syncService reconciles the IPVS virtual server described by vs and, when
// bindAddr is true, ensures its address is bound to DefaultDummyDevice.
//
// svcName is used only in log messages. The function looks up the currently
// applied virtual server; if none is found it adds vs, and if one is found
// that is not Equal to vs it issues an update. It returns the error from the
// add/update call or from the address binding, and nil otherwise.
func (proxier *Proxier) syncService(svcName string, vs *utilipvs.VirtualServer, bindAddr bool) error {
// The lookup error is discarded; a nil result is treated as "not found".
appliedVirtualServer, _ := proxier.ipvs.GetVirtualServer(vs)
if appliedVirtualServer == nil || !appliedVirtualServer.Equal(vs) {
if appliedVirtualServer == nil {
// IPVS service is not found, create a new service
glog.V(3).Infof("Adding new service %q %s:%d/%s", svcName, vs.Address, vs.Port, vs.Protocol)
if err := proxier.ipvs.AddVirtualServer(vs); err != nil {
glog.Errorf("Failed to add IPVS service %q: %v", svcName, err)
return err
}
} else {
// IPVS service was changed, update the existing one
// During updates, service VIP will not go down
glog.V(3).Infof("IPVS service %s was changed", svcName)
// NOTE(review): the value passed here is appliedVirtualServer (the state
// read back above), not vs (the desired state) — looks like the changed
// settings are never pushed; confirm against the UpdateVirtualServer contract.
if err := proxier.ipvs.UpdateVirtualServer(appliedVirtualServer); err != nil {
glog.Errorf("Failed to update IPVS service, err:%v", err)
return err
}
}
}
// bind service address to dummy interface even if service not changed,
// in case that service IP was removed by other processes
if bindAddr {
_, err := proxier.netlinkHandle.EnsureAddressBind(vs.Address.String(), DefaultDummyDevice)
if err != nil {
glog.Errorf("Failed to bind service address to dummy device %q: %v", svcName, err)
return err
}
}
return nil
}
其中 proxier.syncService 调用 ipvs 的 API 来添加 ipvs virtual server。文章开头提到的问题是日志里压根没有添加 NodePort 类型 service 的记录,所以
我根据代码上下文推断 nodeIPs 为空:只有 nodeIPs 为空时循环体才不会执行,也就不会为 NodePort 类型的 service 添加 ipvs virtual server。然后再查看
func (r *realIPGetter) NodeIPs() (ips []net.IP, err error) 的实现,发现其中有几行代码可能是导致问题的原因:
if !strings.HasPrefix(name, "eth") {
continue
}
这个方法在获取节点 IP 时只取名称以 eth 开头的网卡上的 IP。与运维同事确认后得知,该主机的网卡名称不是以 eth 开头的。最后修改了这个获取 IP 的方法并重新编译代码,NodePort 类型的服务即可正常访问。
最后查阅相关文档,确认该问题已在 v1.10.0 版本中修复,具体请查看修复该问题的 PR。