一、引言
在 K8S 系统运行过程中,kubelet 需要定期向 API Server 上报节点运行状态(也就是心跳消息)
本文从源码角度分析下 kubelet 进行节点状态上报的工作机制
其实,心跳的原理比较简单,就是启动一个协程,定期向 APIServer 发送更新数据!
二、源码分析
首先,来到 kubelet 入口,cmd/kubelet/kubelet.go#main
func main() {
rand.Seed(time.Now().UnixNano())
command := app.NewKubeletCommand(server.SetupSignalHandler())
logs.InitLogs()
defer logs.FlushLogs()
if err := command.Execute(); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
}
入口函数通过 NewKubeletCommand() 构造了一个 cobra Command 对象,并通过调用 Execute() 执行了该 Command
NewKubeletCommand 的定义位于 cmd/kubelet/app/server.go#NewKubeletCommand
// NewKubeletCommand creates a *cobra.Command object with default parameters
func NewKubeletCommand(stopCh <-chan struct{}) *cobra.Command {
...
cmd := &cobra.Command{
Use: componentKubelet,
...
Run: func(cmd *cobra.Command, args []string) {
...
// run the kubelet
klog.V(5).Infof("KubeletConfiguration: %#v", kubeletServer.KubeletConfiguration)
if err := Run(kubeletServer, kubeletDeps, stopCh); err != nil {
klog.Fatal(err)
}
},
}
return cmd
}
该函数除了处理了一大堆的参数解析外,在 Run 函数的末尾,最终调起了 kubelet 服务
继续追踪,来到 cmd/kubelet/app/server.go#Run
// Run runs the specified KubeletServer with the given Dependencies. This should never exit.
// The kubeDeps argument may be nil - if so, it is initialized from the settings on KubeletServer.
// Otherwise, the caller is assumed to have set up the Dependencies object and a default one will
// not be generated.
func Run(s *options.KubeletServer, kubeDeps *kubelet.Dependencies, stopCh <-chan struct{}) error {
// To help debugging, immediately log version
klog.Infof("Version: %+v", version.Get())
if err := initForOS(s.KubeletFlags.WindowsService); err != nil {
return fmt.Errorf("failed OS init: %v", err)
}
if err := run(s, kubeDeps, stopCh); err != nil {
return fmt.Errorf("failed to run Kubelet: %v", err)
}
return nil
}
继续追踪,来到 cmd/kubelet/app/server.go#run
func run(s *options.KubeletServer, kubeDeps *kubelet.Dependencies, stopCh <-chan struct{}) (err error) {
...
if err := RunKubelet(s, kubeDeps, s.RunOnce); err != nil {
return err
}
...
select {
case <-done:
break
case <-stopCh:
break
}
return nil
}
继续追踪到 cmd/kubelet/app/server.go#RunKubelet
func RunKubelet(kubeServer *options.KubeletServer, kubeDeps *kubelet.Dependencies, runOnce bool) error {
...
// process pods and exit.
if runOnce {
if _, err := k.RunOnce(podCfg.Updates()); err != nil {
return fmt.Errorf("runonce failed: %v", err)
}
klog.Infof("Started kubelet as runonce")
} else {
startKubelet(k, podCfg, &kubeServer.KubeletConfiguration, kubeDeps, kubeServer.EnableServer)
klog.Infof("Started kubelet")
}
return nil
}
继续追踪,到 cmd/kubelet/app/server.go#startKubelet
func startKubelet(k kubelet.Bootstrap, podCfg *config.PodConfig, kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *kubelet.Dependencies, enableServer bool) {
// start the kubelet
go wait.Until(func() {
k.Run(podCfg.Updates())
}, 0, wait.NeverStop)
// start the kubelet server
if enableServer {
go k.ListenAndServe(net.ParseIP(kubeCfg.Address), uint(kubeCfg.Port), kubeDeps.TLSOptions, kubeDeps.Auth, kubeCfg.EnableDebuggingHandlers, kubeCfg.EnableContentionProfiling)
}
if kubeCfg.ReadOnlyPort > 0 {
go k.ListenAndServeReadOnly(net.ParseIP(kubeCfg.Address), uint(kubeCfg.ReadOnlyPort))
}
}
继续追踪到 pkg/kubelet/kubelet.go#Run
// Run starts the kubelet reacting to config updates
func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {
if kl.logServer == nil {
kl.logServer = http.StripPrefix("/logs/", http.FileServer(http.Dir("/var/log/")))
}
if kl.kubeClient == nil {
klog.Warning("No api server defined - no node status update will be sent.")
}
// Start the cloud provider sync manager
if kl.cloudResourceSyncManager != nil {
go kl.cloudResourceSyncManager.Run(wait.NeverStop)
}
if err := kl.initializeModules(); err != nil {
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.KubeletSetupFailed, err.Error())
klog.Fatal(err)
}
// Start volume manager
go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop)
if kl.kubeClient != nil {
// Start syncing node status immediately, this may set up things the runtime needs to run.
go wait.Until(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, wait.NeverStop)
go kl.fastStatusUpdateOnce()
// start syncing lease
if utilfeature.DefaultFeatureGate.Enabled(features.NodeLease) {
go kl.nodeLeaseController.Run(wait.NeverStop)
}
}
go wait.Until(kl.updateRuntimeUp, 5*time.Second, wait.NeverStop)
// Start loop to sync iptables util rules
if kl.makeIPTablesUtilChains {
go wait.Until(kl.syncNetworkUtil, 1*time.Minute, wait.NeverStop)
}
// Start a goroutine responsible for killing pods (that are not properly
// handled by pod workers).
go wait.Until(kl.podKiller, 1*time.Second, wait.NeverStop)
// Start component sync loops.
kl.statusManager.Start()
kl.probeManager.Start()
// Start syncing RuntimeClasses if enabled.
if kl.runtimeClassManager != nil {
go kl.runtimeClassManager.Run(wait.NeverStop)
}
// Start the pod lifecycle event generator.
kl.pleg.Start()
kl.syncLoop(updates, kl)
}
可以看到,这个 Run 方法开启了众多的处理协程,主要都是用来同步配置的,我们主要关注节点状态上报心跳
重点关注到这句:
if kl.kubeClient != nil {
// Start syncing node status immediately, this may set up things the runtime needs to run.
go wait.Until(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, wait.NeverStop)
go kl.fastStatusUpdateOnce()// start syncing lease (这是一种新的心跳方式)
if utilfeature.DefaultFeatureGate.Enabled(features.NodeLease) {
go kl.nodeLeaseController.Run(wait.NeverStop)
}
}
kl.syncNodeStatus 节点状态同步函数将每隔 kl.nodeStatusUpdateFrequency(默认 10s)执行一次
下面,来到 syncNodeStatus 函数定义 pkg/kubelet/kubelet_node_status.go#syncNodeStatus
// syncNodeStatus should be called periodically from a goroutine.
// It synchronizes node status to master if there is any change or enough time
// passed from the last sync, registering the kubelet first if necessary.
func (kl *Kubelet) syncNodeStatus() {
kl.syncNodeStatusMux.Lock()
defer kl.syncNodeStatusMux.Unlock()
if kl.kubeClient == nil || kl.heartbeatClient == nil {
return
}
if kl.registerNode {
// This will exit immediately if it doesn't need to do anything.
kl.registerWithAPIServer()
}
if err := kl.updateNodeStatus(); err != nil {
klog.Errorf("Unable to update node status: %v", err)
}
}
这里还进行了同步状态判断,如果是注册节点,则执行 registerWithAPIServer,否则,执行 updateNodeStatus
我们主要看 updateNodeStatus,pkg/kubelet/kubelet_node_status.go#updateNodeStatus
// updateNodeStatus updates node status to master with retries if there is any
// change or enough time passed from the last sync.
func (kl *Kubelet) updateNodeStatus() error {
klog.V(5).Infof("Updating node status")
for i := 0; i < nodeStatusUpdateRetry; i++ {
if err := kl.tryUpdateNodeStatus(i); err != nil {
if i > 0 && kl.onRepeatedHeartbeatFailure != nil {
kl.onRepeatedHeartbeatFailure()
}
klog.Errorf("Error updating node status, will retry: %v", err)
} else {
return nil
}
}
return fmt.Errorf("update node status exceeds retry count")
}
跟踪到 pkg/kubelet/kubelet_node_status.go#tryUpdateNodeStatus
// tryUpdateNodeStatus tries to update node status to master if there is any
// change or enough time passed from the last sync.
func (kl *Kubelet) tryUpdateNodeStatus(tryNumber int) error {
// In large clusters, GET and PUT operations on Node objects coming
// from here are the majority of load on apiserver and etcd.
// To reduce the load on etcd, we are serving GET operations from
// apiserver cache (the data might be slightly delayed but it doesn't
// seem to cause more conflict - the delays are pretty small).
// If it result in a conflict, all retries are served directly from etcd.
opts := metav1.GetOptions{}
if tryNumber == 0 {
util.FromApiserverCache(&opts)
}
node, err := kl.heartbeatClient.CoreV1().Nodes().Get(string(kl.nodeName), opts)
if err != nil {
return fmt.Errorf("error getting node %q: %v", kl.nodeName, err)
}
originalNode := node.DeepCopy()
if originalNode == nil {
return fmt.Errorf("nil %q node object", kl.nodeName)
}
podCIDRChanged := false
if node.Spec.PodCIDR != "" {
// Pod CIDR could have been updated before, so we cannot rely on
// node.Spec.PodCIDR being non-empty. We also need to know if pod CIDR is
// actually changed.
if podCIDRChanged, err = kl.updatePodCIDR(node.Spec.PodCIDR); err != nil {
klog.Errorf(err.Error())
}
}
kl.setNodeStatus(node)
now := kl.clock.Now()
if utilfeature.DefaultFeatureGate.Enabled(features.NodeLease) && now.Before(kl.lastStatusReportTime.Add(kl.nodeStatusReportFrequency)) {
if !podCIDRChanged && !nodeStatusHasChanged(&originalNode.Status, &node.Status) {
return nil
}
}
// Patch the current status on the API server
updatedNode, _, err := nodeutil.PatchNodeStatus(kl.heartbeatClient.CoreV1(), types.NodeName(kl.nodeName), originalNode, node)
if err != nil {
return err
}
kl.lastStatusReportTime = now
kl.setLastObservedNodeAddresses(updatedNode.Status.Addresses)
// If update finishes successfully, mark the volumeInUse as reportedInUse to indicate
// those volumes are already updated in the node's status
kl.volumeManager.MarkVolumesAsReportedInUse(updatedNode.Status.VolumesInUse)
return nil
}
该函数的重点在于通过 setNodeStatus 函数调用,给 node 对象赋予必需的状态信息,然后通过调用 nodeutil.PatchNodeStatus() 将 Node 状态通告给 APIServer
setNodeStatus 的定义如下 pkg/kubelet/kubelet_node_status.go#setNodeStatus
// setNodeStatus fills in the Status fields of the given Node, overwriting
// any fields that are currently set.
// TODO(madhusudancs): Simplify the logic for setting node conditions and
// refactor the node status condition code out to a different file.
func (kl *Kubelet) setNodeStatus(node *v1.Node) {
for i, f := range kl.setNodeStatusFuncs {
klog.V(5).Infof("Setting node status at position %v", i)
if err := f(node); err != nil {
klog.Warningf("Failed to set some node status fields: %s", err)
}
}
}
可见是调用了之前注册的一系列处理函数来完成对 node 的状态赋值的,那这些函数在哪里注册的呢?
当然是在初始化 kubelet 对象的时候了,跟踪到 pkg/kubelet/kubelet.go#NewMainKubelet
func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
kubeDeps *Dependencies,
crOptions *config.ContainerRuntimeOptions,
containerRuntime string,
runtimeCgroups string,
hostnameOverride string,
nodeIP string,
providerID string,
cloudProvider string,
certDirectory string,
rootDirectory string,
registerNode bool,
registerWithTaints []api.Taint,
allowedUnsafeSysctls []string,
remoteRuntimeEndpoint string,
remoteImageEndpoint string,
experimentalMounterPath string,
experimentalKernelMemcgNotification bool,
experimentalCheckNodeCapabilitiesBeforeMount bool,
experimentalNodeAllocatableIgnoreEvictionThreshold bool,
minimumGCAge metav1.Duration,
maxPerPodContainerCount int32,
maxContainerCount int32,
masterServiceNamespace string,
registerSchedulable bool,
nonMasqueradeCIDR string,
keepTerminatedPodVolumes bool,
nodeLabels map[string]string,
seccompProfileRoot string,
bootstrapCheckpointPath string,
nodeStatusMaxImages int32) (*Kubelet, error) {
...
// Generating the status funcs should be the last thing we do,
// since this relies on the rest of the Kubelet having been constructed.
klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs()
return klet, nil
}
kubelet 初始化函数很长,但最后也是调用了 defaultNodeStatusFuncs() 对 setNodeStatusFuncs 进行了赋值
跟踪到 pkg/kubelet/kubelet_node_status.go#defaultNodeStatusFuncs
// defaultNodeStatusFuncs is a factory that generates the default set of
// setNodeStatus funcs
func (kl *Kubelet) defaultNodeStatusFuncs() []func(*v1.Node) error {
// if cloud is not nil, we expect the cloud resource sync manager to exist
var nodeAddressesFunc func() ([]v1.NodeAddress, error)
if kl.cloud != nil {
nodeAddressesFunc = kl.cloudResourceSyncManager.NodeAddresses
}
var validateHostFunc func() error
if kl.appArmorValidator != nil {
validateHostFunc = kl.appArmorValidator.ValidateHost
}
var setters []func(n *v1.Node) error
setters = append(setters,
nodestatus.NodeAddress(kl.nodeIP, kl.nodeIPValidator, kl.hostname, kl.hostnameOverridden, kl.externalCloudProvider, kl.cloud, nodeAddressesFunc),
nodestatus.MachineInfo(string(kl.nodeName), kl.maxPods, kl.podsPerCore, kl.GetCachedMachineInfo, kl.containerManager.GetCapacity,
kl.containerManager.GetDevicePluginResourceCapacity, kl.containerManager.GetNodeAllocatableReservation, kl.recordEvent),
nodestatus.VersionInfo(kl.cadvisor.VersionInfo, kl.containerRuntime.Type, kl.containerRuntime.Version),
nodestatus.DaemonEndpoints(kl.daemonEndpoints),
nodestatus.Images(kl.nodeStatusMaxImages, kl.imageManager.GetImageList),
nodestatus.GoRuntime(),
)
if utilfeature.DefaultFeatureGate.Enabled(features.AttachVolumeLimit) {
setters = append(setters, nodestatus.VolumeLimits(kl.volumePluginMgr.ListVolumePluginWithLimits))
}
setters = append(setters,
nodestatus.MemoryPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderMemoryPressure, kl.recordNodeStatusEvent),
nodestatus.DiskPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderDiskPressure, kl.recordNodeStatusEvent),
nodestatus.PIDPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderPIDPressure, kl.recordNodeStatusEvent),
nodestatus.ReadyCondition(kl.clock.Now, kl.runtimeState.runtimeErrors, kl.runtimeState.networkErrors, validateHostFunc, kl.containerManager.Status, kl.recordNodeStatusEvent),
nodestatus.VolumesInUse(kl.volumeManager.ReconcilerStatesHasBeenSynced, kl.volumeManager.GetVolumesInUse),
// TODO(mtaufen): I decided not to move this setter for now, since all it does is send an event
// and record state back to the Kubelet runtime object. In the future, I'd like to isolate
// these side-effects by decoupling the decisions to send events and partial status recording
// from the Node setters.
kl.recordNodeSchedulableEvent,
)
return setters
}
可见, defaultNodeStatusFuncs 注册了各种状态的处理函数,包括:节点地址、镜像、Go环境等等,具体就不细说了
对于二次开发而言,如果我们需要 APIServer 掌握更多的 Node 信息,不妨在此处添加自定义函数!
而处理函数定义的细节就在 pkg/kubelet/nodestatus/setters.go
// NodeAddress returns a Setter that updates address-related information on the node.
func NodeAddress(nodeIP net.IP, // typically Kubelet.nodeIP
validateNodeIPFunc func(net.IP) error, // typically Kubelet.nodeIPValidator
hostname string, // typically Kubelet.hostname
hostnameOverridden bool, // was the hostname force set?
externalCloudProvider bool, // typically Kubelet.externalCloudProvider
cloud cloudprovider.Interface, // typically Kubelet.cloud
nodeAddressesFunc func() ([]v1.NodeAddress, error), // typically Kubelet.cloudResourceSyncManager.NodeAddresses
) Setter {
return func(node *v1.Node) error {
if nodeIP != nil {
if err := validateNodeIPFunc(nodeIP); err != nil {
return fmt.Errorf("failed to validate nodeIP: %v", err)
}
klog.V(2).Infof("Using node IP: %q", nodeIP.String())
}
if externalCloudProvider {
if nodeIP != nil {
if node.ObjectMeta.Annotations == nil {
node.ObjectMeta.Annotations = make(map[string]string)
}
node.ObjectMeta.Annotations[kubeletapis.AnnotationProvidedIPAddr] = nodeIP.String()
}
// We rely on the external cloud provider to supply the addresses.
return nil
}
if cloud != nil {
nodeAddresses, err := nodeAddressesFunc()
if err != nil {
return err
}
if nodeIP != nil {
enforcedNodeAddresses := []v1.NodeAddress{}
var nodeIPType v1.NodeAddressType
for _, nodeAddress := range nodeAddresses {
if nodeAddress.Address == nodeIP.String() {
enforcedNodeAddresses = append(enforcedNodeAddresses, v1.NodeAddress{Type: nodeAddress.Type, Address: nodeAddress.Address})
nodeIPType = nodeAddress.Type
break
}
}
if len(enforcedNodeAddresses) > 0 {
for _, nodeAddress := range nodeAddresses {
if nodeAddress.Type != nodeIPType && nodeAddress.Type != v1.NodeHostName {
enforcedNodeAddresses = append(enforcedNodeAddresses, v1.NodeAddress{Type: nodeAddress.Type, Address: nodeAddress.Address})
}
}
enforcedNodeAddresses = append(enforcedNodeAddresses, v1.NodeAddress{Type: v1.NodeHostName, Address: hostname})
node.Status.Addresses = enforcedNodeAddresses
return nil
}
return fmt.Errorf("failed to get node address from cloud provider that matches ip: %v", nodeIP)
}
switch {
case len(nodeAddresses) == 0:
// the cloud provider didn't specify any addresses
nodeAddresses = append(nodeAddresses, v1.NodeAddress{Type: v1.NodeHostName, Address: hostname})
case !hasAddressType(nodeAddresses, v1.NodeHostName) && hasAddressValue(nodeAddresses, hostname):
// the cloud provider didn't specify an address of type Hostname,
// but the auto-detected hostname matched an address reported by the cloud provider,
// so we can add it and count on the value being verifiable via cloud provider metadata
nodeAddresses = append(nodeAddresses, v1.NodeAddress{Type: v1.NodeHostName, Address: hostname})
case hostnameOverridden:
// the hostname was force-set via flag/config.
// this means the hostname might not be able to be validated via cloud provider metadata,
// but was a choice by the kubelet deployer we should honor
var existingHostnameAddress *v1.NodeAddress
for i := range nodeAddresses {
if nodeAddresses[i].Type == v1.NodeHostName {
existingHostnameAddress = &nodeAddresses[i]
break
}
}
if existingHostnameAddress == nil {
// no existing Hostname address found, add it
klog.Warningf("adding overridden hostname of %v to cloudprovider-reported addresses", hostname)
nodeAddresses = append(nodeAddresses, v1.NodeAddress{Type: v1.NodeHostName, Address: hostname})
} else {
// override the Hostname address reported by the cloud provider
klog.Warningf("replacing cloudprovider-reported hostname of %v with overridden hostname of %v", existingHostnameAddress.Address, hostname)
existingHostnameAddress.Address = hostname
}
}
node.Status.Addresses = nodeAddresses
} else {
var ipAddr net.IP
var err error
// 1) Use nodeIP if set
// 2) If the user has specified an IP to HostnameOverride, use it
// 3) Lookup the IP from node name by DNS and use the first valid IPv4 address.
// If the node does not have a valid IPv4 address, use the first valid IPv6 address.
// 4) Try to get the IP from the network interface used as default gateway
if nodeIP != nil {
ipAddr = nodeIP
} else if addr := net.ParseIP(hostname); addr != nil {
ipAddr = addr
} else {
var addrs []net.IP
addrs, _ = net.LookupIP(node.Name)
for _, addr := range addrs {
if err = validateNodeIPFunc(addr); err == nil {
if addr.To4() != nil {
ipAddr = addr
break
}
if addr.To16() != nil && ipAddr == nil {
ipAddr = addr
}
}
}
if ipAddr == nil {
ipAddr, err = utilnet.ChooseHostInterface()
}
}
if ipAddr == nil {
// We tried everything we could, but the IP address wasn't fetchable; error out
return fmt.Errorf("can't get ip address of node %s. error: %v", node.Name, err)
}
node.Status.Addresses = []v1.NodeAddress{
{Type: v1.NodeInternalIP, Address: ipAddr.String()},
{Type: v1.NodeHostName, Address: hostname},
}
}
return nil
}
}
func hasAddressType(addresses []v1.NodeAddress, addressType v1.NodeAddressType) bool {
for _, address := range addresses {
if address.Type == addressType {
return true
}
}
return false
}
func hasAddressValue(addresses []v1.NodeAddress, addressValue string) bool {
for _, address := range addresses {
if address.Address == addressValue {
return true
}
}
return false
}
// MachineInfo returns a Setter that updates machine-related information on the node.
func MachineInfo(nodeName string,
maxPods int,
podsPerCore int,
machineInfoFunc func() (*cadvisorapiv1.MachineInfo, error), // typically Kubelet.GetCachedMachineInfo
capacityFunc func() v1.ResourceList, // typically Kubelet.containerManager.GetCapacity
devicePluginResourceCapacityFunc func() (v1.ResourceList, v1.ResourceList, []string), // typically Kubelet.containerManager.GetDevicePluginResourceCapacity
nodeAllocatableReservationFunc func() v1.ResourceList, // typically Kubelet.containerManager.GetNodeAllocatableReservation
recordEventFunc func(eventType, event, message string), // typically Kubelet.recordEvent
) Setter {
return func(node *v1.Node) error {
// Note: avoid blindly overwriting the capacity in case opaque
// resources are being advertised.
if node.Status.Capacity == nil {
node.Status.Capacity = v1.ResourceList{}
}
var devicePluginAllocatable v1.ResourceList
var devicePluginCapacity v1.ResourceList
var removedDevicePlugins []string
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
info, err := machineInfoFunc()
if err != nil {
// TODO(roberthbailey): This is required for test-cmd.sh to pass.
// See if the test should be updated instead.
node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(maxPods), resource.DecimalSI)
klog.Errorf("Error getting machine info: %v", err)
} else {
node.Status.NodeInfo.MachineID = info.MachineID
node.Status.NodeInfo.SystemUUID = info.SystemUUID
for rName, rCap := range cadvisor.CapacityFromMachineInfo(info) {
node.Status.Capacity[rName] = rCap
}
if podsPerCore > 0 {
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(
int64(math.Min(float64(info.NumCores*podsPerCore), float64(maxPods))), resource.DecimalSI)
} else {
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(
int64(maxPods), resource.DecimalSI)
}
if node.Status.NodeInfo.BootID != "" &&
node.Status.NodeInfo.BootID != info.BootID {
// TODO: This requires a transaction, either both node status is updated
// and event is recorded or neither should happen, see issue #6055.
recordEventFunc(v1.EventTypeWarning, events.NodeRebooted,
fmt.Sprintf("Node %s has been rebooted, boot id: %s", nodeName, info.BootID))
}
node.Status.NodeInfo.BootID = info.BootID
if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
// TODO: all the node resources should use ContainerManager.GetCapacity instead of deriving the
// capacity for every node status request
initialCapacity := capacityFunc()
if initialCapacity != nil {
node.Status.Capacity[v1.ResourceEphemeralStorage] = initialCapacity[v1.ResourceEphemeralStorage]
}
}
devicePluginCapacity, devicePluginAllocatable, removedDevicePlugins = devicePluginResourceCapacityFunc()
if devicePluginCapacity != nil {
for k, v := range devicePluginCapacity {
if old, ok := node.Status.Capacity[k]; !ok || old.Value() != v.Value() {
klog.V(2).Infof("Update capacity for %s to %d", k, v.Value())
}
node.Status.Capacity[k] = v
}
}
for _, removedResource := range removedDevicePlugins {
klog.V(2).Infof("Set capacity for %s to 0 on device removal", removedResource)
// Set the capacity of the removed resource to 0 instead of
// removing the resource from the node status. This is to indicate
// that the resource is managed by device plugin and had been
// registered before.
//
// This is required to differentiate the device plugin managed
// resources and the cluster-level resources, which are absent in
// node status.
node.Status.Capacity[v1.ResourceName(removedResource)] = *resource.NewQuantity(int64(0), resource.DecimalSI)
}
}
// Set Allocatable.
if node.Status.Allocatable == nil {
node.Status.Allocatable = make(v1.ResourceList)
}
// Remove extended resources from allocatable that are no longer
// present in capacity.
for k := range node.Status.Allocatable {
_, found := node.Status.Capacity[k]
if !found && v1helper.IsExtendedResourceName(k) {
delete(node.Status.Allocatable, k)
}
}
allocatableReservation := nodeAllocatableReservationFunc()
for k, v := range node.Status.Capacity {
value := *(v.Copy())
if res, exists := allocatableReservation[k]; exists {
value.Sub(res)
}
if value.Sign() < 0 {
// Negative Allocatable resources don't make sense.
value.Set(0)
}
node.Status.Allocatable[k] = value
}
if devicePluginAllocatable != nil {
for k, v := range devicePluginAllocatable {
if old, ok := node.Status.Allocatable[k]; !ok || old.Value() != v.Value() {
klog.V(2).Infof("Update allocatable for %s to %d", k, v.Value())
}
node.Status.Allocatable[k] = v
}
}
// for every huge page reservation, we need to remove it from allocatable memory
for k, v := range node.Status.Capacity {
if v1helper.IsHugePageResourceName(k) {
allocatableMemory := node.Status.Allocatable[v1.ResourceMemory]
value := *(v.Copy())
allocatableMemory.Sub(value)
if allocatableMemory.Sign() < 0 {
// Negative Allocatable resources don't make sense.
allocatableMemory.Set(0)
}
node.Status.Allocatable[v1.ResourceMemory] = allocatableMemory
}
}
return nil
}
}
// VersionInfo returns a Setter that updates version-related information on the node.
func VersionInfo(versionInfoFunc func() (*cadvisorapiv1.VersionInfo, error), // typically Kubelet.cadvisor.VersionInfo
runtimeTypeFunc func() string, // typically Kubelet.containerRuntime.Type
runtimeVersionFunc func() (kubecontainer.Version, error), // typically Kubelet.containerRuntime.Version
) Setter {
return func(node *v1.Node) error {
verinfo, err := versionInfoFunc()
if err != nil {
// TODO(mtaufen): consider removing this log line, since returned error will be logged
klog.Errorf("Error getting version info: %v", err)
return fmt.Errorf("error getting version info: %v", err)
}
node.Status.NodeInfo.KernelVersion = verinfo.KernelVersion
node.Status.NodeInfo.OSImage = verinfo.ContainerOsVersion
runtimeVersion := "Unknown"
if runtimeVer, err := runtimeVersionFunc(); err == nil {
runtimeVersion = runtimeVer.String()
}
node.Status.NodeInfo.ContainerRuntimeVersion = fmt.Sprintf("%s://%s", runtimeTypeFunc(), runtimeVersion)
node.Status.NodeInfo.KubeletVersion = version.Get().String()
// TODO: kube-proxy might be different version from kubelet in the future
node.Status.NodeInfo.KubeProxyVersion = version.Get().String()
return nil
}
}
// DaemonEndpoints returns a Setter that updates the daemon endpoints on the node.
func DaemonEndpoints(daemonEndpoints *v1.NodeDaemonEndpoints) Setter {
return func(node *v1.Node) error {
node.Status.DaemonEndpoints = *daemonEndpoints
return nil
}
}
// Images returns a Setter that updates the images on the node.
// imageListFunc is expected to return a list of images sorted in descending order by image size.
// nodeStatusMaxImages is ignored if set to -1.
func Images(nodeStatusMaxImages int32,
imageListFunc func() ([]kubecontainer.Image, error), // typically Kubelet.imageManager.GetImageList
) Setter {
return func(node *v1.Node) error {
// Update image list of this node
var imagesOnNode []v1.ContainerImage
containerImages, err := imageListFunc()
if err != nil {
// TODO(mtaufen): consider removing this log line, since returned error will be logged
klog.Errorf("Error getting image list: %v", err)
node.Status.Images = imagesOnNode
return fmt.Errorf("error getting image list: %v", err)
}
// we expect imageListFunc to return a sorted list, so we just need to truncate
if int(nodeStatusMaxImages) > -1 &&
int(nodeStatusMaxImages) < len(containerImages) {
containerImages = containerImages[0:nodeStatusMaxImages]
}
for _, image := range containerImages {
names := append(image.RepoDigests, image.RepoTags...)
// Report up to MaxNamesPerImageInNodeStatus names per image.
if len(names) > MaxNamesPerImageInNodeStatus {
names = names[0:MaxNamesPerImageInNodeStatus]
}
imagesOnNode = append(imagesOnNode, v1.ContainerImage{
Names: names,
SizeBytes: image.Size,
})
}
node.Status.Images = imagesOnNode
return nil
}
}
// GoRuntime returns a Setter that sets GOOS and GOARCH on the node.
func GoRuntime() Setter {
return func(node *v1.Node) error {
node.Status.NodeInfo.OperatingSystem = goruntime.GOOS
node.Status.NodeInfo.Architecture = goruntime.GOARCH
return nil
}
}
// ReadyCondition returns a Setter that updates the v1.NodeReady condition on the node.
func ReadyCondition(
nowFunc func() time.Time, // typically Kubelet.clock.Now
runtimeErrorsFunc func() []string, // typically Kubelet.runtimeState.runtimeErrors
networkErrorsFunc func() []string, // typically Kubelet.runtimeState.networkErrors
appArmorValidateHostFunc func() error, // typically Kubelet.appArmorValidator.ValidateHost, might be nil depending on whether there was an appArmorValidator
cmStatusFunc func() cm.Status, // typically Kubelet.containerManager.Status
recordEventFunc func(eventType, event string), // typically Kubelet.recordNodeStatusEvent
) Setter {
return func(node *v1.Node) error {
// NOTE(aaronlevy): NodeReady condition needs to be the last in the list of node conditions.
// This is due to an issue with version skewed kubelet and master components.
// ref: https://github.com/kubernetes/kubernetes/issues/16961
currentTime := metav1.NewTime(nowFunc())
newNodeReadyCondition := v1.NodeCondition{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
Reason: "KubeletReady",
Message: "kubelet is posting ready status",
LastHeartbeatTime: currentTime,
}
rs := append(runtimeErrorsFunc(), networkErrorsFunc()...)
requiredCapacities := []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods}
if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
requiredCapacities = append(requiredCapacities, v1.ResourceEphemeralStorage)
}
missingCapacities := []string{}
for _, resource := range requiredCapacities {
if _, found := node.Status.Capacity[resource]; !found {
missingCapacities = append(missingCapacities, string(resource))
}
}
if len(missingCapacities) > 0 {
rs = append(rs, fmt.Sprintf("Missing node capacity for resources: %s", strings.Join(missingCapacities, ", ")))
}
if len(rs) > 0 {
newNodeReadyCondition = v1.NodeCondition{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
Reason: "KubeletNotReady",
Message: strings.Join(rs, ","),
LastHeartbeatTime: currentTime,
}
}
// Append AppArmor status if it's enabled.
// TODO(tallclair): This is a temporary message until node feature reporting is added.
if appArmorValidateHostFunc != nil && newNodeReadyCondition.Status == v1.ConditionTrue {
if err := appArmorValidateHostFunc(); err == nil {
newNodeReadyCondition.Message = fmt.Sprintf("%s. AppArmor enabled", newNodeReadyCondition.Message)
}
}
// Record any soft requirements that were not met in the container manager.
status := cmStatusFunc()
if status.SoftRequirements != nil {
newNodeReadyCondition.Message = fmt.Sprintf("%s. WARNING: %s", newNodeReadyCondition.Message, status.SoftRequirements.Error())
}
readyConditionUpdated := false
needToRecordEvent := false
for i := range node.Status.Conditions {
if node.Status.Conditions[i].Type == v1.NodeReady {
if node.Status.Conditions[i].Status == newNodeReadyCondition.Status {
newNodeReadyCondition.LastTransitionTime = node.Status.Conditions[i].LastTransitionTime
} else {
newNodeReadyCondition.LastTransitionTime = currentTime
needToRecordEvent = true
}
node.Status.Conditions[i] = newNodeReadyCondition
readyConditionUpdated = true
break
}
}
if !readyConditionUpdated {
newNodeReadyCondition.LastTransitionTime = currentTime
node.Status.Conditions = append(node.Status.Conditions, newNodeReadyCondition)
}
if needToRecordEvent {
if newNodeReadyCondition.Status == v1.ConditionTrue {
recordEventFunc(v1.EventTypeNormal, events.NodeReady)
} else {
recordEventFunc(v1.EventTypeNormal, events.NodeNotReady)
klog.Infof("Node became not ready: %+v", newNodeReadyCondition)
}
}
return nil
}
}
// MemoryPressureCondition returns a Setter that updates the v1.NodeMemoryPressure condition on the node.
func MemoryPressureCondition(nowFunc func() time.Time, // typically Kubelet.clock.Now
pressureFunc func() bool, // typically Kubelet.evictionManager.IsUnderMemoryPressure
recordEventFunc func(eventType, event string), // typically Kubelet.recordNodeStatusEvent
) Setter {
return func(node *v1.Node) error {
currentTime := metav1.NewTime(nowFunc())
var condition *v1.NodeCondition
// Check if NodeMemoryPressure condition already exists and if it does, just pick it up for update.
for i := range node.Status.Conditions {
if node.Status.Conditions[i].Type == v1.NodeMemoryPressure {
condition = &node.Status.Conditions[i]
}
}
newCondition := false
// If the NodeMemoryPressure condition doesn't exist, create one
if condition == nil {
condition = &v1.NodeCondition{
Type: v1.NodeMemoryPressure,
Status: v1.ConditionUnknown,
}
// cannot be appended to node.Status.Conditions here because it gets
// copied to the slice. So if we append to the slice here none of the
// updates we make below are reflected in the slice.
newCondition = true
}
// Update the heartbeat time
condition.LastHeartbeatTime = currentTime
// Note: The conditions below take care of the case when a new NodeMemoryPressure condition is
// created and as well as the case when the condition already exists. When a new condition
// is created its status is set to v1.ConditionUnknown which matches either
// condition.Status != v1.ConditionTrue or
// condition.Status != v1.ConditionFalse in the conditions below depending on whether
// the kubelet is under memory pressure or not.
if pressureFunc() {
if condition.Status != v1.ConditionTrue {
condition.Status = v1.ConditionTrue
condition.Reason = "KubeletHasInsufficientMemory"
condition.Message = "kubelet has insufficient memory available"
condition.LastTransitionTime = currentTime
recordEventFunc(v1.EventTypeNormal, "NodeHasInsufficientMemory")
}
} else if condition.Status != v1.ConditionFalse {
condition.Status = v1.ConditionFalse
condition.Reason = "KubeletHasSufficientMemory"
condition.Message = "kubelet has sufficient memory available"
condition.LastTransitionTime = currentTime
recordEventFunc(v1.EventTypeNormal, "NodeHasSufficientMemory")
}
if newCondition {
node.Status.Conditions = append(node.Status.Conditions, *condition)
}
return nil
}
}
// PIDPressureCondition returns a Setter that updates the v1.NodePIDPressure condition on the node.
func PIDPressureCondition(nowFunc func() time.Time, // typically Kubelet.clock.Now
pressureFunc func() bool, // typically Kubelet.evictionManager.IsUnderPIDPressure
recordEventFunc func(eventType, event string), // typically Kubelet.recordNodeStatusEvent
) Setter {
return func(node *v1.Node) error {
currentTime := metav1.NewTime(nowFunc())
var condition *v1.NodeCondition
// Check if NodePIDPressure condition already exists and if it does, just pick it up for update.
for i := range node.Status.Conditions {
if node.Status.Conditions[i].Type == v1.NodePIDPressure {
condition = &node.Status.Conditions[i]
}
}
newCondition := false
// If the NodePIDPressure condition doesn't exist, create one
if condition == nil {
condition = &v1.NodeCondition{
Type: v1.NodePIDPressure,
Status: v1.ConditionUnknown,
}
// cannot be appended to node.Status.Conditions here because it gets
// copied to the slice. So if we append to the slice here none of the
// updates we make below are reflected in the slice.
newCondition = true
}
// Update the heartbeat time
condition.LastHeartbeatTime = currentTime
// Note: The conditions below take care of the case when a new NodePIDPressure condition is
// created and as well as the case when the condition already exists. When a new condition
// is created its status is set to v1.ConditionUnknown which matches either
// condition.Status != v1.ConditionTrue or
// condition.Status != v1.ConditionFalse in the conditions below depending on whether
// the kubelet is under PID pressure or not.
if pressureFunc() {
if condition.Status != v1.ConditionTrue {
condition.Status = v1.ConditionTrue
condition.Reason = "KubeletHasInsufficientPID"
condition.Message = "kubelet has insufficient PID available"
condition.LastTransitionTime = currentTime
recordEventFunc(v1.EventTypeNormal, "NodeHasInsufficientPID")
}
} else if condition.Status != v1.ConditionFalse {
condition.Status = v1.ConditionFalse
condition.Reason = "KubeletHasSufficientPID"
condition.Message = "kubelet has sufficient PID available"
condition.LastTransitionTime = currentTime
recordEventFunc(v1.EventTypeNormal, "NodeHasSufficientPID")
}
if newCondition {
node.Status.Conditions = append(node.Status.Conditions, *condition)
}
return nil
}
}
// DiskPressureCondition returns a Setter that updates the v1.NodeDiskPressure condition on the node.
func DiskPressureCondition(nowFunc func() time.Time, // typically Kubelet.clock.Now
pressureFunc func() bool, // typically Kubelet.evictionManager.IsUnderDiskPressure
recordEventFunc func(eventType, event string), // typically Kubelet.recordNodeStatusEvent
) Setter {
return func(node *v1.Node) error {
currentTime := metav1.NewTime(nowFunc())
var condition *v1.NodeCondition
// Check if NodeDiskPressure condition already exists and if it does, just pick it up for update.
for i := range node.Status.Conditions {
if node.Status.Conditions[i].Type == v1.NodeDiskPressure {
condition = &node.Status.Conditions[i]
}
}
newCondition := false
// If the NodeDiskPressure condition doesn't exist, create one
if condition == nil {
condition = &v1.NodeCondition{
Type: v1.NodeDiskPressure,
Status: v1.ConditionUnknown,
}
// cannot be appended to node.Status.Conditions here because it gets
// copied to the slice. So if we append to the slice here none of the
// updates we make below are reflected in the slice.
newCondition = true
}
// Update the heartbeat time
condition.LastHeartbeatTime = currentTime
// Note: The conditions below take care of the case when a new NodeDiskPressure condition is
// created and as well as the case when the condition already exists. When a new condition
// is created its status is set to v1.ConditionUnknown which matches either
// condition.Status != v1.ConditionTrue or
// condition.Status != v1.ConditionFalse in the conditions below depending on whether
// the kubelet is under disk pressure or not.
if pressureFunc() {
if condition.Status != v1.ConditionTrue {
condition.Status = v1.ConditionTrue
condition.Reason = "KubeletHasDiskPressure"
condition.Message = "kubelet has disk pressure"
condition.LastTransitionTime = currentTime
recordEventFunc(v1.EventTypeNormal, "NodeHasDiskPressure")
}
} else if condition.Status != v1.ConditionFalse {
condition.Status = v1.ConditionFalse
condition.Reason = "KubeletHasNoDiskPressure"
condition.Message = "kubelet has no disk pressure"
condition.LastTransitionTime = currentTime
recordEventFunc(v1.EventTypeNormal, "NodeHasNoDiskPressure")
}
if newCondition {
node.Status.Conditions = append(node.Status.Conditions, *condition)
}
return nil
}
}
// VolumesInUse returns a Setter that updates the volumes in use on the node.
func VolumesInUse(syncedFunc func() bool, // typically Kubelet.volumeManager.ReconcilerStatesHasBeenSynced
volumesInUseFunc func() []v1.UniqueVolumeName, // typically Kubelet.volumeManager.GetVolumesInUse
) Setter {
return func(node *v1.Node) error {
// Make sure to only update node status after reconciler starts syncing up states
if syncedFunc() {
node.Status.VolumesInUse = volumesInUseFunc()
}
return nil
}
}
// VolumeLimits returns a Setter that updates the volume limits on the node.
func VolumeLimits(volumePluginListFunc func() []volume.VolumePluginWithAttachLimits, // typically Kubelet.volumePluginMgr.ListVolumePluginWithLimits
) Setter {
return func(node *v1.Node) error {
if node.Status.Capacity == nil {
node.Status.Capacity = v1.ResourceList{}
}
if node.Status.Allocatable == nil {
node.Status.Allocatable = v1.ResourceList{}
}
pluginWithLimits := volumePluginListFunc()
for _, volumePlugin := range pluginWithLimits {
attachLimits, err := volumePlugin.GetVolumeLimits()
if err != nil {
klog.V(4).Infof("Error getting volume limit for plugin %s", volumePlugin.GetPluginName())
continue
}
for limitKey, value := range attachLimits {
node.Status.Capacity[v1.ResourceName(limitKey)] = *resource.NewQuantity(value, resource.DecimalSI)
node.Status.Allocatable[v1.ResourceName(limitKey)] = *resource.NewQuantity(value, resource.DecimalSI)
}
}
return nil
}
}
好了,kubelet 与 APIServer 之间的心跳机制就暂时分析到这里,其实目前 kubelet 还支持另外一种心跳方式
// start syncing lease
if utilfeature.DefaultFeatureGate.Enabled(features.NodeLease) {
go kl.nodeLeaseController.Run(wait.NeverStop)
}
就是这种 NodeLease feature 特性了,不过默认是不启用的,相关细节改天再聊吧!
更多分享,敬请期待!