kubernetes v1.12.1
用于管理 Node 对象,主要包括来几个两个主要功能:
Node Controller 负载发现,管理和监控集群中的各个 Node 节点。kubelet 在启动时通过 API Server 注册节点信息,并定时向 API Server 发送节点信息。API Server 收到将信息写入 etcd。信息包括节点健康状况,节点资源,节点名称,节点地址信息,操作系统版本,docker 版本,kubelet 版本
nodeInfo:
architecture: amd64
bootID: 6eca61f9-80de-4f23-a3f8-388dba810a3f
containerRuntimeVersion: docker://17.12.1-ce
kernelVersion: 3.10.0-327.el7.x86_64
kubeProxyVersion: v1.12.1
kubeletVersion: v1.12.1
machineID: 39619c0debc84e85a745c45f16fb13c0
operatingSystem: linux
osImage: CentOS Linux 7 (Core)
systemUUID: 420300C1-B585-FCDF-B363-A6CEB34EA1EF
监测节点的健康状况。当节点变为不可访问时,节点控制器负责将NodeStatus的NodeReady条件更新为ConditionUnknown,随后从节点中卸载所有pod ,如果节点继续无法访问,(默认超时时间为40 --node-monitor-period秒,开始报告ConditionUnknown,之后为5m开始卸载)。节点控制器按每秒来检查每个节点的状态
Kube-controller-manager: 周期性检查所有节点状态,当节点处于 NotReady 状态超过一段时间后,驱逐该节点上所有 pod
路径pkg/controller/nodelifecycle/node_lifecycyle_controller.go
// Controller is the controller that manages node's life cycle.
type Controller struct {
taintManager *scheduler.NoExecuteTaintManager
podInformerSynced cache.InformerSynced
cloud cloudprovider.Interface
kubeClient clientset.Interface
runTaintManager bool
// if set to true Controller will taint Nodes with 'TaintNodeNotReady' and 'TaintNodeUnreachable'
// taints instead of evicting Pods itself.
useTaintBasedEvictions bool
// if set to true, NodeController will taint Nodes based on its condition for 'NetworkUnavailable',
// 'MemoryPressure', 'OutOfDisk' and 'DiskPressure'.
taintNodeByCondition bool
nodeUpdateQueue workqueue.Interface
}
注册nodelifecyclle,controllers["nodelifecycle"] = startNodeLifecycleController
// NewControllerInitializers is a public map of named controller groups (you can start more than one in an init func)
// paired to their InitFunc. This allows for structured downstream composition and subdivision.
func NewControllerInitializers(loopMode ControllerLoopMode) map[string]InitFunc {
controllers := map[string]InitFunc{}
controllers["nodeipam"] = startNodeIpamController
controllers["nodelifecycle"] = startNodeLifecycleController
return controllers
}
调用NewNodeLifecycleController函数初始化
NewNodeLifeCycleController初始化函数(第2章节讲解)
lifecycleController.Run函数主要执行体(第3章节讲解)
func startNodeLifecycleController(ctx ControllerContext) (http.Handler, bool, error) {
lifecycleController, err := lifecyclecontroller.NewNodeLifecycleController(
ctx.InformerFactory.Core().V1().Pods(),
ctx.InformerFactory.Core().V1().Nodes(),
ctx.InformerFactory.Extensions().V1beta1().DaemonSets(),
ctx.Cloud,
ctx.ClientBuilder.ClientOrDie("node-controller"),
ctx.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration,
ctx.ComponentConfig.NodeLifecycleController.NodeStartupGracePeriod.Duration,
ctx.ComponentConfig.NodeLifecycleController.NodeMonitorGracePeriod.Duration,
ctx.ComponentConfig.NodeLifecycleController.PodEvictionTimeout.Duration,
ctx.ComponentConfig.NodeLifecycleController.NodeEvictionRate,
ctx.ComponentConfig.NodeLifecycleController.SecondaryNodeEvictionRate,
ctx.ComponentConfig.NodeLifecycleController.LargeClusterSizeThreshold,
ctx.ComponentConfig.NodeLifecycleController.UnhealthyZoneThreshold,
ctx.ComponentConfig.NodeLifecycleController.EnableTaintManager,
utilfeature.DefaultFeatureGate.Enabled(features.TaintBasedEvictions),
utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition),
)
if err != nil {
return nil, true, err
}
go lifecycleController.Run(ctx.Stop)
return nil, true, nil
}
函数定义,参数较多,关注有pod ds
- -node-monitor-period duration: NodeController同步NodeStatus的时间间隔(default 5s)
- --node-startup-grace-period duration: 在标记节点不健康之前,允许开始节点不响应的时间 (default 1m0s)
- --node-monitor-grace-period duration: 在标记节点不健康之前,允许运行节点不响应的时间,必须是n倍的kubelet's nodeStatusUpdateFrequency,N意味着kubelet报告node状态重试的次数(default 40s)
- --pod-eviction-timeout duration: 在失败的节点上删除pod的宽限时间 (default 5m0s)
- --node-eviction-rate float32: 当zone健康node失败情况,删除节点上的pod的速率 (default 0.1)
- --secondary-node-eviction-rate float32: 当zone不健康node失败情况,删除节点上的pod的速率,如果集群大小小于 large-cluster-size-threshold,则隐式地将设置为0。(default 0.01)
- --large-cluster-size-threshold int32: Number of nodes from which NodeController treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller(default 50)
- --unhealthy-zone-threshold float32: not ready 节点(至少3个)的比例达到该值时,将 Zone 标记为不健康 (default 0.55)
- --enable-taint-manager:如果设置为true则开启NoExecute Taints,将驱逐所有节点上(拥有这种污点的节点)不容忍运行pod (default true)
// NewNodeLifecycleController returns a new taint controller.
func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer,
daemonSetInformer extensionsinformers.DaemonSetInformer,
cloud cloudprovider.Interface,
kubeClient clientset.Interface,
nodeMonitorPeriod time.Duration,
nodeStartupGracePeriod time.Duration,
nodeMonitorGracePeriod time.Duration,
podEvictionTimeout time.Duration,
evictionLimiterQPS float32,
secondaryEvictionLimiterQPS float32,
largeClusterThreshold int32,
unhealthyZoneThreshold float32,
runTaintManager bool,
useTaintBasedEvictions bool,
taintNodeByCondition bool) (*Controller, error)
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"})
if kubeClient != nil && kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
metrics.RegisterMetricAndTrackRateLimiterUsage("node_lifecycle_controller", kubeClient.CoreV1().RESTClient().GetRateLimiter())
}
nc := &Controller{
cloud: cloud,
kubeClient: kubeClient,
now: metav1.Now,
knownNodeSet: make(map[string]*v1.Node),
nodeStatusMap: make(map[string]nodeStatusData),
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
return nodeutil.ExistsInCloudProvider(cloud, nodeName)
},
nodeShutdownInCloudProvider: func(ctx context.Context, node *v1.Node) (bool, error) {
return nodeutil.ShutdownInCloudProvider(ctx, cloud, node)
},
recorder: recorder,
nodeMonitorPeriod: nodeMonitorPeriod,
nodeStartupGracePeriod: nodeStartupGracePeriod,
nodeMonitorGracePeriod: nodeMonitorGracePeriod,
zonePodEvictor: make(map[string]*scheduler.RateLimitedTimedQueue),
zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue),
zoneStates: make(map[string]ZoneState),
podEvictionTimeout: podEvictionTimeout,
evictionLimiterQPS: evictionLimiterQPS,
secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
largeClusterThreshold: largeClusterThreshold,
unhealthyZoneThreshold: unhealthyZoneThreshold,
runTaintManager: runTaintManager,
useTaintBasedEvictions: useTaintBasedEvictions && runTaintManager,
taintNodeByCondition: taintNodeByCondition,
nodeUpdateQueue: workqueue.New(),
}
if useTaintBasedEvictions {
glog.Infof("Controller is using taint based evictions.")
}
nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
nc.computeZoneStateFunc = nc.ComputeZoneState
包括, AddFunc UpdateFunc DeleteFunc
注册podInformerSynced为podInformer.Informer().HasSynced
podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
pod := obj.(*v1.Pod)
if nc.taintManager != nil {
nc.taintManager.PodUpdated(nil, pod)
}
},
UpdateFunc: func(prev, obj interface{}) {
prevPod := prev.(*v1.Pod)
newPod := obj.(*v1.Pod)
if nc.taintManager != nil {
nc.taintManager.PodUpdated(prevPod, newPod)
}
},
DeleteFunc: func(obj interface{}) {
pod, isPod := obj.(*v1.Pod)
// We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly.
if !isPod {
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
glog.Errorf("Received unexpected object: %v", obj)
return
}
pod, ok = deletedState.Obj.(*v1.Pod)
if !ok {
glog.Errorf("DeletedFinalStateUnknown contained non-Pod object: %v", deletedState.Obj)
return
}
}
if nc.taintManager != nil {
nc.taintManager.PodUpdated(pod, nil)
}
},
})
nc.podInformerSynced = podInformer.Informer().HasSynced
--enable-taint-manager:如果设置为true则开启NoExecute Taints,将驱逐所有节点上(拥有这种污点的节点)不容忍运行pod (default true)
添加node informer的event handler,包括taintManager.NodeUpdated
if nc.runTaintManager {
nc.taintManager = scheduler.NewNoExecuteTaintManager(kubeClient)
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error {
nc.taintManager.NodeUpdated(nil, node)
return nil
}),
UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(oldNode, newNode *v1.Node) error {
nc.taintManager.NodeUpdated(oldNode, newNode)
return nil
}),
DeleteFunc: nodeutil.CreateDeleteNodeHandler(func(node *v1.Node) error {
nc.taintManager.NodeUpdated(node, nil)
return nil
}),
})
}
// NOTE(resouer): nodeInformer to substitute deprecated taint key (notReady -> not-ready).
// Remove this logic when we don't need this backwards compatibility
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error {
return nc.doFixDeprecatedTaintKeyPass(node)
}),
UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
return nc.doFixDeprecatedTaintKeyPass(newNode)
}),
})
nc.nodeLister = nodeInformer.Lister()
nc.nodeInformerSynced = nodeInformer.Informer().HasSynced
nc.daemonSetStore = daemonSetInformer.Lister()
nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced
// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *Controller) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
glog.Infof("Starting node controller")
defer glog.Infof("Shutting down node controller")
if !controller.WaitForCacheSync("taint", stopCh, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
return
}
if nc.runTaintManager {
go nc.taintManager.Run(stopCh)
}
if nc.taintNodeByCondition {
// Close node update queue to cleanup go routine.
defer nc.nodeUpdateQueue.ShutDown()
// Start workers to update NoSchedule taint for nodes.
for i := 0; i < scheduler.UpdateWorkerSize; i++ {
// Thanks to "workqueue", each worker just need to get item from queue, because
// the item is flagged when got from queue: if new event come, the new item will
// be re-queued until "Done", so no more than one worker handle the same item and
// no event missed.
go wait.Until(nc.doNoScheduleTaintingPassWorker, time.Second, stopCh)
}
}
if nc.useTaintBasedEvictions {
// Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated
// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
go wait.Until(nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod, stopCh)
} else {
// Managing eviction of nodes:
// When we delete pods off a node, if the node was not empty at the time we then
// queue an eviction watcher. If we hit an error, retry deletion.
go wait.Until(nc.doEvictionPass, scheduler.NodeEvictionPeriod, stopCh)
}
// Incorporate the results of node status pushed from kubelet to master.
go wait.Until(func() {
if err := nc.monitorNodeStatus(); err != nil {
glog.Errorf("Error monitoring node status: %v", err)
}
}, nc.nodeMonitorPeriod, stopCh)
<-stopCh
}
// We are listing nodes from local cache as we can tolerate some small delays
// comparing to state from etcd and there is eventual consistency anyway.
nodes, err := nc.nodeLister.List(labels.Everything())
if err != nil {
return err
}
added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes)
for i := range newZoneRepresentatives {
nc.addPodEvictorForNewZone(newZoneRepresentatives[i])
}
for i := range added {
glog.V(1).Infof("Controller observed a new Node: %#v", added[i].Name)
nodeutil.RecordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name))
nc.knownNodeSet[added[i].Name] = added[i]
nc.addPodEvictorForNewZone(added[i])
if nc.useTaintBasedEvictions {
nc.markNodeAsReachable(added[i])
} else {
nc.cancelPodEviction(added[i])
}
}
for i := range deleted {
glog.V(1).Infof("Controller observed a Node deletion: %v", deleted[i].Name)
nodeutil.RecordNodeEvent(nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name))
delete(nc.knownNodeSet, deleted[i].Name)
}
zoneToNodeConditions := map[string][]*v1.NodeCondition{}
for i := range nodes {
var gracePeriod time.Duration
var observedReadyCondition v1.NodeCondition
var currentReadyCondition *v1.NodeCondition
node := nodes[i].DeepCopy()
if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeStatusUpdateRetry, func() (bool, error) {
gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
if err == nil {
return true, nil
}
name := node.Name
node, err = nc.kubeClient.CoreV1().Nodes().Get(name, metav1.GetOptions{})
if err != nil {
glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
return false, err
}
return false, nil
}); err != nil {
glog.Errorf("Update status of Node '%v' from Controller error: %v. "+
"Skipping - no pods will be evicted.", node.Name, err)
continue
}
逻辑多,欠套之深,看着哎
观察到的node condition未ready时,
如果useTaintBasedEvictions为true,
如果node已经被Taint为UnreachableTaint,则将其改成NotReadyTaint,否则将node加入到taint队列
如果useTaintBasedEvictions为false,
在descisionTimestamp < readyTransitionTimestamp + nc.podEvictionTimeout (默认5分钟)
将node入队PodEvictor,由PodEvictor负责处理
decisionTimestamp := nc.now()
if currentReadyCondition != nil {
// Check eviction timeout against decisionTimestamp
if observedReadyCondition.Status == v1.ConditionFalse {
if nc.useTaintBasedEvictions {
// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
taintToAdd := *NotReadyTaintTemplate
if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
glog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
}
} else if nc.markNodeForTainting(node) {
glog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.",
node.Name,
decisionTimestamp,
)
}
} else {
if decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node) {
glog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v",
node.Name,
decisionTimestamp,
nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
nc.podEvictionTimeout,
)
}
}
}
}
观察到的node condition为unknown时,
如果useTaintBasedEvictions为true,
如果node已经被Taint为NotReadyTaint,则将其改成Unreachable Taint,否则将node加入到taint队列
如果useTaintBasedEvictions为false,
在descisionTimestamp < probeTimestamp + nc.podEvictionTimeout (默认5分钟)
将node入队PodEvictor,由PodEvictor负责处理
if observedReadyCondition.Status == v1.ConditionUnknown {
if nc.useTaintBasedEvictions {
// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
taintToAdd := *UnreachableTaintTemplate
if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
glog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
}
} else if nc.markNodeForTainting(node) {
glog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.",
node.Name,
decisionTimestamp,
)
}
} else {
if decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node) {
glog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v",
node.Name,
decisionTimestamp,
nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
nc.podEvictionTimeout-gracePeriod,
)
}
}
}
}
观察到的node condition为true时,
如果useTaintBasedEvictions为true,
标记node为reachable状态
如果useTaintBasedEvictions为false,
删除node在pod eviction,node已经是ready情况,取小pod eviction处理
if observedReadyCondition.Status == v1.ConditionTrue {
if nc.useTaintBasedEvictions {
removed, err := nc.markNodeAsReachable(node)
if err != nil {
glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
}
if removed {
glog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name)
}
} else {
if nc.cancelPodEviction(node) {
glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
}
}
// remove shutdown taint this is needed always depending do we use taintbased or not
err := nc.markNodeAsNotShutdown(node)
if err != nil {
glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
}
}
报告event: NodeNotReady ,并标记所有pod也为not ready
// Report node event.
if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
nodeutil.RecordNodeStatusChange(nc.recorder, node, "NodeNotReady")
if err = nodeutil.MarkAllPodsNotReady(nc.kubeClient, node); err != nil {
utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
}
}
2.1 WaitForCacheSync 函数等待 PodInformer、NodeInformer、DaemonSetInformer 的HasSyncs 都返回 true,全部完成同步
// WaitForCacheSync is a wrapper around cache.WaitForCacheSync that generates log messages
// indicating that the controller identified by controllerName is waiting for syncs, followed by
// either a successful or failed sync.
func WaitForCacheSync(controllerName string, stopCh <-chan struct{}, cacheSyncs ...cache.InformerSynced) bool {
glog.Infof("Waiting for caches to sync for %s controller", controllerName)
if !cache.WaitForCacheSync(stopCh, cacheSyncs...) {
utilruntime.HandleError(fmt.Errorf("Unable to sync caches for %s controller", controllerName))
return false
}
glog.Infof("Caches are synced for %s controller", controllerName)
return true
}
WaitForCacheSync 每隔 100 毫秒全部执行一次,直到所有 cacheSyncs 都返回 true,表示全部 cache 同步
const syncedPollPeriod = 100 * time.Millisecond
// WaitForCacheSync waits for caches to populate. It returns true if it was successful, false
// if the controller should shutdown
func WaitForCacheSync(stopCh <-chan struct{}, cacheSyncs ...InformerSynced) bool {
err := wait.PollUntil(syncedPollPeriod,
func() (bool, error) {
for _, syncFunc := range cacheSyncs {
if !syncFunc() {
return false, nil
}
}
return true, nil
},
stopCh)
if err != nil {
glog.V(2).Infof("stop requested")
return false
}
glog.V(4).Infof("caches populated")
return true
}
初始化一个 node manager 结构体,包括一堆线程数的个数
调用 waitForCacheSync 进行 cache 同步
启动 goroutine 进行 node 状态监控。对节点进行分类,包括 added deleted newZoneRepresentatives,分别进行处理。跟新所有节点状态