The kubelet is the primary "node agent" that runs on each node. It can register the node with the apiserver using one of the following: the hostname, a flag that overrides the hostname, or cloud-provider-specific logic.
The kubelet works in terms of a PodSpec, a YAML or JSON object that describes a pod. It takes a set of PodSpecs provided through various mechanisms (primarily through the apiserver) and ensures that the containers described in those PodSpecs are running and healthy. The kubelet does not manage containers that were not created by Kubernetes.
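For a concrete sense of what a PodSpec describes, here is an illustrative pod built with the api types that appear later in this walkthrough. This is only a sketch: the import path and field values are assumptions for the era of code discussed here, not something taken from the kubelet source.
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api" // import path assumed
)

func main() {
	// A minimal pod: one container, a name, and a namespace. This is the kind
	// of desired state the kubelet receives and then keeps running.
	pod := api.Pod{
		ObjectMeta: api.ObjectMeta{Name: "nginx", Namespace: api.NamespaceDefault},
		Spec: api.PodSpec{
			Containers: []api.Container{
				{Name: "nginx", Image: "nginx:1.9"},
			},
		},
	}
	fmt.Printf("%+v\n", pod)
}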
The kubelet's core job is to keep every pod on its node running and healthy, syncing the local pod state against what the apiserver reports. Let's start with an overview of the whole flow.
func main() {
runtime.GOMAXPROCS(runtime.NumCPU()) // let Go use as many OS threads as there are CPUs
s := app.NewKubeletServer() // build a KubeletServer populated with default values
s.AddFlags(pflag.CommandLine)
util.InitFlags()
util.InitLogs()
defer util.FlushLogs()
verflag.PrintAndExitIfRequested()
if err := s.Run(nil); err != nil { // run the kubelet server
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
}
The entry point is short and clear: initialize the server object, then run it.
// NewKubeletServer will create a new KubeletServer with default values.
func NewKubeletServer() *KubeletServer {
return &KubeletServer{
Address: net.ParseIP("0.0.0.0"), // address the kubelet listens on
AuthPath: util.NewStringFlag("/var/lib/kubelet/kubernetes_auth"), // deprecated
CAdvisorPort: 4194,
CertDirectory: "/var/run/kubernetes", // directory for TLS certificates
CgroupRoot: "",
ConfigureCBR0: false,
ContainerRuntime: "docker", // container runtime to use
CPUCFSQuota: false,
DockerDaemonContainer: "/docker-daemon",
DockerExecHandlerName: "native",
EnableDebuggingHandlers: true,
EnableServer: true,
FileCheckFrequency: 20 * time.Second,
HealthzBindAddress: net.ParseIP("127.0.0.1"), // bind address of the healthz endpoint
HealthzPort: 10248,
HostNetworkSources: kubelet.AllSource,
HostPIDSources: kubelet.AllSource,
HostIPCSources: kubelet.AllSource,
HTTPCheckFrequency: 20 * time.Second,
ImageGCHighThresholdPercent: 90,
ImageGCLowThresholdPercent: 80,
KubeConfig: util.NewStringFlag("/var/lib/kubelet/kubeconfig"),
LowDiskSpaceThresholdMB: 256,
MasterServiceNamespace: api.NamespaceDefault,
MaxContainerCount: 100,
MaxPerPodContainerCount: 2,
MaxOpenFiles: 1000000,
MinimumGCAge: 1 * time.Minute,
NetworkPluginDir: "/usr/libexec/kubernetes/kubelet-plugins/net/exec/", // network plugin directory
NetworkPluginName: "",
NodeStatusUpdateFrequency: 10 * time.Second,
OOMScoreAdj: qos.KubeletOomScoreAdj,
PodInfraContainerImage: dockertools.PodInfraContainerImage,
Port: ports.KubeletPort,
ReadOnlyPort: ports.KubeletReadOnlyPort,
RegisterNode: true, // will be ignored if no apiserver is configured
RegistryBurst: 10,
ResourceContainer: "/kubelet",
RktPath: "",
RktStage1Image: "",
RootDirectory: defaultRootDir,
SerializeImagePulls: true,
SyncFrequency: 10 * time.Second, // how often to re-sync pods
SystemContainer: "",
}
}
....
// Run runs the specified KubeletServer for the given KubeletConfig. This should never exit.
// The kcfg argument may be nil - if so, it is initialized from the settings on KubeletServer.
// Otherwise, the caller is assumed to have set up the KubeletConfig object and all defaults
// will be ignored.
func (s *KubeletServer) Run(kcfg *KubeletConfig) error {
if kcfg == nil {
cfg, err := s.KubeletConfig() // build the KubeletConfig
if err != nil {
return err
}
kcfg = cfg
clientConfig, err := s.CreateAPIServerClientConfig() // build the client config for talking to the apiserver
if err == nil {
kcfg.KubeClient, err = client.New(clientConfig) // create the apiserver client
}
if err != nil && len(s.APIServerList) > 0 {
glog.Warningf("No API client: %v", err)
}
cloud, err := cloudprovider.InitCloudProvider(s.CloudProvider, s.CloudConfigFile) // initialize the cloud provider
if err != nil {
return err
}
glog.V(2).Infof("Successfully initialized cloud provider: %q from the config file: %q\n", s.CloudProvider, s.CloudConfigFile)
kcfg.Cloud = cloud
}
if kcfg.CAdvisorInterface == nil {
ca, err := cadvisor.New(s.CAdvisorPort) // create the cAdvisor interface for resource monitoring
if err != nil {
return err
}
kcfg.CAdvisorInterface = ca
}
util.ReallyCrash = s.ReallyCrashForTesting
rand.Seed(time.Now().UTC().UnixNano())
credentialprovider.SetPreferredDockercfgPath(s.RootDirectory)
glog.V(2).Infof("Using root directory: %v", s.RootDirectory)
// TODO(vmarmol): Do this through container config.
oomAdjuster := oom.NewOomAdjuster()
if err := oomAdjuster.ApplyOomScoreAdj(0, s.OOMScoreAdj); err != nil {
glog.Warning(err)
}
if err := RunKubelet(kcfg, nil); err != nil { // run the kubelet
return err
}
if s.HealthzPort > 0 { // start the healthz endpoint
healthz.DefaultHealthz()
go util.Until(func() {
err := http.ListenAndServe(net.JoinHostPort(s.HealthzBindAddress.String(), strconv.Itoa(s.HealthzPort)), nil)
if err != nil {
glog.Errorf("Starting health server failed: %v", err)
}
}, 5*time.Second, util.NeverStop)
}
if s.RunOnce { // in run-once mode, return instead of blocking forever
return nil
}
// run forever
select {}
}
At this level the flow is: build the configuration, create the client used to talk to the apiserver, set up monitoring (cAdvisor, healthz), and then hand everything to RunKubelet.
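A pattern worth noting before going further: the healthz server above, and most of the long-running loops below, are started with go util.Until(f, period, util.NeverStop). The following is a simplified sketch of that helper's behavior, inferred only from how it is called here (run f, wait one period, repeat until the stop channel is closed; util.NeverStop is a channel that never closes), not copied from the util package.
package main

import (
	"fmt"
	"time"
)

// Until runs f every period until stopCh is closed (simplified sketch).
func Until(f func(), period time.Duration, stopCh <-chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
		}
		f()
		select {
		case <-stopCh:
			return
		case <-time.After(period):
		}
	}
}

func main() {
	stop := make(chan struct{})
	go Until(func() { fmt.Println("tick") }, time.Second, stop)
	time.Sleep(3 * time.Second)
	close(stop)
}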
// RunKubelet is responsible for setting up and running a kubelet. It is used in three different applications:
// 1 Integration tests
// 2 Kubelet binary
// 3 Standalone 'kubernetes' binary
// Eventually, #2 will be replaced with instances of #3
func RunKubelet(kcfg *KubeletConfig, builder KubeletBuilder) error {
...
if builder == nil {
builder = createAndInitKubelet // default builder
}
if kcfg.OSInterface == nil {
kcfg.OSInterface = kubecontainer.RealOS{}
}
k, podCfg, err := builder(kcfg) // build the kubelet and its pod config
if err != nil {
return fmt.Errorf("failed to create kubelet: %v", err)
}
util.ApplyRLimitForSelf(kcfg.MaxOpenFiles)
// process pods and exit.
if kcfg.Runonce {
if _, err := k.RunOnce(podCfg.Updates()); err != nil {
return fmt.Errorf("runonce failed: %v", err)
}
glog.Infof("Started kubelet as runonce")
} else {
startKubelet(k, podCfg, kcfg) // otherwise start the kubelet and keep it running
glog.Infof("Started kubelet")
}
return nil
}
func startKubelet(k KubeletBootstrap, podCfg *config.PodConfig, kc *KubeletConfig) {
// start the kubelet
go util.Until(func() { k.Run(podCfg.Updates()) }, 0, util.NeverStop) // process pod updates
// start the kubelet server
if kc.EnableServer { // whether to start the kubelet API server
go util.Until(func() {
k.ListenAndServe(kc.Address, kc.Port, kc.TLSOptions, kc.EnableDebuggingHandlers)
}, 0, util.NeverStop)
}
if kc.ReadOnlyPort > 0 {
go util.Until(func() {
k.ListenAndServeReadOnly(kc.Address, kc.ReadOnlyPort)
}, 0, util.NeverStop)
}
}
The construction itself happens in createAndInitKubelet.
func createAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.PodConfig, err error) {
// TODO: block until all sources have delivered at least one update to the channel, or break the sync loop
// up into "per source" synchronizations
// TODO: KubeletConfig.KubeClient should be a client interface, but client interface misses certain methods
// used by kubelet. Since NewMainKubelet expects a client interface, we need to make sure we are not passing
// a nil pointer to it when what we really want is a nil interface.
var kubeClient client.Interface
if kc.KubeClient != nil {
kubeClient = kc.KubeClient
}
gcPolicy := kubelet.ContainerGCPolicy{
MinAge: kc.MinimumGCAge,
MaxPerPodContainer: kc.MaxPerPodContainerCount,
MaxContainers: kc.MaxContainerCount,
} // container GC policy: minimum age plus per-pod and total container limits
daemonEndpoints := &api.NodeDaemonEndpoints{
KubeletEndpoint: api.DaemonEndpoint{Port: int(kc.Port)},
}
pc = makePodSourceConfig(kc) // build the pod source config that everything downstream watches
k, err = kubelet.NewMainKubelet(
kc.Hostname,
kc.NodeName,
kc.DockerClient,
kubeClient,
kc.RootDirectory,
kc.PodInfraContainerImage,
kc.SyncFrequency,
float32(kc.RegistryPullQPS),
kc.RegistryBurst,
kc.EventRecordQPS,
kc.EventBurst,
gcPolicy,
pc.SeenAllSources,
kc.RegisterNode,
kc.StandaloneMode,
kc.ClusterDomain,
kc.ClusterDNS,
kc.MasterServiceNamespace,
kc.VolumePlugins,
kc.NetworkPlugins,
kc.NetworkPluginName,
kc.StreamingConnectionIdleTimeout,
kc.Recorder,
kc.CAdvisorInterface,
kc.ImageGCPolicy,
kc.DiskSpacePolicy,
kc.Cloud,
kc.NodeStatusUpdateFrequency,
kc.ResourceContainer,
kc.OSInterface,
kc.CgroupRoot,
kc.ContainerRuntime,
kc.RktPath,
kc.RktStage1Image,
kc.Mounter,
kc.Writer,
kc.DockerDaemonContainer,
kc.SystemContainer,
kc.ConfigureCBR0,
kc.PodCIDR,
kc.MaxPods,
kc.DockerExecHandler,
kc.ResolverConfig,
kc.CPUCFSQuota,
daemonEndpoints,
kc.SerializeImagePulls,
)
if err != nil {
return nil, nil, err
}
k.BirthCry()
k.StartGarbageCollection() // start container and image garbage collection
return k, pc, nil
}
The pod sources are wired up as follows.
func makePodSourceConfig(kc *KubeletConfig) *config.PodConfig {
// source of all configuration
cfg := config.NewPodConfig(config.PodConfigNotificationIncremental, kc.Recorder)
// define file config source
if kc.ConfigFile != "" {
glog.Infof("Adding manifest file: %v", kc.ConfigFile)
config.NewSourceFile(kc.ConfigFile, kc.NodeName, kc.FileCheckFrequency, cfg.Channel(kubelet.FileSource))
}
// define url config source
if kc.ManifestURL != "" {
glog.Infof("Adding manifest url %q with HTTP header %v", kc.ManifestURL, kc.ManifestURLHeader)
config.NewSourceURL(kc.ManifestURL, kc.ManifestURLHeader, kc.NodeName, kc.HTTPCheckFrequency, cfg.Channel(kubelet.HTTPSource))
}
if kc.KubeClient != nil {
glog.Infof("Watching apiserver")
config.NewSourceApiserver(kc.KubeClient, kc.NodeName, cfg.Channel(kubelet.ApiserverSource)) // watch pods bound to this node via the apiserver
}
return cfg
}
The client is used to watch the apiserver for changes to the pods bound to this node.
// NewSourceApiserver creates a config source that watches and pulls from the apiserver.
func NewSourceApiserver(c *client.Client, nodeName string, updates chan<- interface{}) {
lw := cache.NewListWatchFromClient(c, "pods", api.NamespaceAll, fields.OneTermEqualSelector(client.PodHost, nodeName))
newSourceApiserverFromLW(lw, updates) // watch remote pod changes
}
// newSourceApiserverFromLW creates a config source that watches and pulls from the apiserver.
func newSourceApiserverFromLW(lw cache.ListerWatcher, updates chan<- interface{}) {
send := func(objs []interface{}) {
var pods []*api.Pod
for _, o := range objs {
pods = append(pods, o.(*api.Pod))
}
updates <- kubelet.PodUpdate{Pods: pods, Op: kubelet.SET, Source: kubelet.ApiserverSource} // push the full pod set onto the updates channel
}
cache.NewReflector(lw, &api.Pod{}, cache.NewUndeltaStore(send, cache.MetaNamespaceKeyFunc), 0).Run()
}
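To make the data flow concrete, the message pushed onto updates looks roughly like the following. This is an illustrative sketch inferred from the fields used above (Pods, Op, Source) and the operations handled later in syncLoopIteration; the exact definition and import path belong to the kubelet package and may differ.
package sketch

import "k8s.io/kubernetes/pkg/api" // import path assumed

// PodOperation enumerates the kinds of changes a source can report.
type PodOperation int

const (
	ADD PodOperation = iota
	UPDATE
	REMOVE
	SET
)

// PodUpdate is the message delivered over the updates channel (illustrative).
type PodUpdate struct {
	Pods   []*api.Pod   // pods the update applies to
	Op     PodOperation // ADD, UPDATE, REMOVE, or SET
	Source string       // FileSource, HTTPSource, or ApiserverSource
}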
At this point the pod-watching path is in place: changes are delivered over the updates channel. Next, look at NewMainKubelet, which constructs the kubelet instance itself.
// New creates a new Kubelet for use in main
func NewMainKubelet(
hostname string,
nodeName string,
dockerClient dockertools.DockerInterface,
kubeClient client.Interface,
rootDirectory string,
podInfraContainerImage string,
resyncInterval time.Duration,
pullQPS float32,
pullBurst int,
eventQPS float32,
eventBurst int,
containerGCPolicy ContainerGCPolicy,
sourcesReady SourcesReadyFn,
registerNode bool,
standaloneMode bool,
clusterDomain string,
clusterDNS net.IP,
masterServiceNamespace string,
volumePlugins []volume.VolumePlugin,
networkPlugins []network.NetworkPlugin,
networkPluginName string,
streamingConnectionIdleTimeout time.Duration,
recorder record.EventRecorder,
cadvisorInterface cadvisor.Interface,
imageGCPolicy ImageGCPolicy,
diskSpacePolicy DiskSpacePolicy,
cloud cloudprovider.Interface,
nodeStatusUpdateFrequency time.Duration,
resourceContainer string,
osInterface kubecontainer.OSInterface,
cgroupRoot string,
containerRuntime string,
rktPath string,
rktStage1Image string,
mounter mount.Interface,
writer kubeio.Writer,
dockerDaemonContainer string,
systemContainer string,
configureCBR0 bool,
podCIDR string,
pods int,
dockerExecHandler dockertools.ExecHandler,
resolverConfig string,
cpuCFSQuota bool,
daemonEndpoints *api.NodeDaemonEndpoints,
serializeImagePulls bool,
) (*Kubelet, error) {
if rootDirectory == "" {
return nil, fmt.Errorf("invalid root directory %q", rootDirectory)
}
if resyncInterval <= 0 {
return nil, fmt.Errorf("invalid sync frequency %d", resyncInterval)
}
if systemContainer != "" && cgroupRoot == "" {
return nil, fmt.Errorf("invalid configuration: system container was specified and cgroup root was not specified")
}
dockerClient = dockertools.NewInstrumentedDockerInterface(dockerClient) // wrap the Docker client with instrumentation
serviceStore := cache.NewStore(cache.MetaNamespaceKeyFunc)
if kubeClient != nil {
// TODO: cache.NewListWatchFromClient is limited as it takes a client implementation rather
// than an interface. There is no way to construct a list+watcher using resource name.
listWatch := &cache.ListWatch{
ListFunc: func() (runtime.Object, error) {
return kubeClient.Services(api.NamespaceAll).List(labels.Everything())
},
WatchFunc: func(resourceVersion string) (watch.Interface, error) {
return kubeClient.Services(api.NamespaceAll).Watch(labels.Everything(), fields.Everything(), resourceVersion)
},
}
cache.NewReflector(listWatch, &api.Service{}, serviceStore, 0).Run() // reflector that keeps the service cache in sync
}
serviceLister := &cache.StoreToServiceLister{Store: serviceStore} // lister over the cached services
nodeStore := cache.NewStore(cache.MetaNamespaceKeyFunc)
if kubeClient != nil {
// TODO: cache.NewListWatchFromClient is limited as it takes a client implementation rather
// than an interface. There is no way to construct a list+watcher using resource name.
fieldSelector := fields.Set{client.ObjectNameField: nodeName}.AsSelector()
listWatch := &cache.ListWatch{
ListFunc: func() (runtime.Object, error) {
return kubeClient.Nodes().List(labels.Everything(), fieldSelector) // list only this node (filtered by name)
},
WatchFunc: func(resourceVersion string) (watch.Interface, error) {
return kubeClient.Nodes().Watch(labels.Everything(), fieldSelector, resourceVersion)
},
}
cache.NewReflector(listWatch, &api.Node{}, nodeStore, 0).Run()
}
nodeLister := &cache.StoreToNodeLister{Store: nodeStore} // lister over the cached node objects
// TODO: get the real node object of ourself,
// and use the real node name and UID.
// TODO: what is namespace for node?
nodeRef := &api.ObjectReference{
Kind: "Node",
Name: nodeName,
UID: types.UID(nodeName),
Namespace: "",
}
containerGC, err := newContainerGC(dockerClient, containerGCPolicy) // container garbage collector
if err != nil {
return nil, err
}
imageManager, err := newImageManager(dockerClient, cadvisorInterface, recorder, nodeRef, imageGCPolicy) // image manager (image GC)
if err != nil {
return nil, fmt.Errorf("failed to initialize image manager: %v", err)
}
diskSpaceManager, err := newDiskSpaceManager(cadvisorInterface, diskSpacePolicy) // disk space manager
if err != nil {
return nil, fmt.Errorf("failed to initialize disk manager: %v", err)
}
statusManager := status.NewManager(kubeClient) // pod status manager
readinessManager := kubecontainer.NewReadinessManager()
containerRefManager := kubecontainer.NewRefManager()
volumeManager := newVolumeManager() // volume manager
oomWatcher := NewOOMWatcher(cadvisorInterface, recorder)
klet := &Kubelet{
hostname: hostname,
nodeName: nodeName,
dockerClient: dockerClient,
kubeClient: kubeClient,
rootDirectory: rootDirectory,
resyncInterval: resyncInterval,
containerRefManager: containerRefManager,
readinessManager: readinessManager,
httpClient: &http.Client{},
sourcesReady: sourcesReady,
registerNode: registerNode,
standaloneMode: standaloneMode,
clusterDomain: clusterDomain,
clusterDNS: clusterDNS,
serviceLister: serviceLister,
nodeLister: nodeLister,
runtimeMutex: sync.Mutex{},
runtimeUpThreshold: maxWaitForContainerRuntime,
lastTimestampRuntimeUp: time.Time{},
masterServiceNamespace: masterServiceNamespace,
streamingConnectionIdleTimeout: streamingConnectionIdleTimeout,
recorder: recorder,
cadvisor: cadvisorInterface,
containerGC: containerGC,
imageManager: imageManager,
diskSpaceManager: diskSpaceManager,
statusManager: statusManager,
volumeManager: volumeManager,
cloud: cloud,
nodeRef: nodeRef,
nodeStatusUpdateFrequency: nodeStatusUpdateFrequency,
resourceContainer: resourceContainer,
os: osInterface,
oomWatcher: oomWatcher,
cgroupRoot: cgroupRoot,
mounter: mounter,
writer: writer,
configureCBR0: configureCBR0,
podCIDR: podCIDR,
pods: pods,
syncLoopMonitor: util.AtomicValue{},
resolverConfig: resolverConfig,
cpuCFSQuota: cpuCFSQuota,
daemonEndpoints: daemonEndpoints,
}
if plug, err := network.InitNetworkPlugin(networkPlugins, networkPluginName, &networkHost{klet}); err != nil { // initialize the network plugin
return nil, err
} else {
klet.networkPlugin = plug
}
machineInfo, err := klet.GetCachedMachineInfo()
if err != nil {
return nil, err
}
oomAdjuster := oom.NewOomAdjuster()
procFs := procfs.NewProcFs()
// Initialize the runtime.
switch containerRuntime { // pick the container runtime implementation
case "docker":
// Only supported one for now, continue.
klet.containerRuntime = dockertools.NewDockerManager(
dockerClient,
recorder,
readinessManager,
containerRefManager,
machineInfo,
podInfraContainerImage,
pullQPS,
pullBurst,
containerLogsDir,
osInterface,
klet.networkPlugin,
klet,
klet.httpClient,
dockerExecHandler,
oomAdjuster,
procFs,
klet.cpuCFSQuota,
serializeImagePulls,
)
case "rkt":
conf := &rkt.Config{
Path: rktPath,
Stage1Image: rktStage1Image,
InsecureSkipVerify: true,
}
rktRuntime, err := rkt.New(
conf,
klet,
recorder,
containerRefManager,
readinessManager,
klet.volumeManager,
serializeImagePulls,
)
if err != nil {
return nil, err
}
klet.containerRuntime = rktRuntime
// No Docker daemon to put in a container.
dockerDaemonContainer = ""
default:
return nil, fmt.Errorf("unsupported container runtime %q specified", containerRuntime)
}
// Setup container manager, can fail if the devices hierarchy is not mounted
// (it is required by Docker however).
containerManager, err := newContainerManager(mounter, cadvisorInterface, dockerDaemonContainer, systemContainer, resourceContainer) // container manager for the system and kubelet cgroups
if err != nil {
return nil, fmt.Errorf("failed to create the Container Manager: %v", err)
}
klet.containerManager = containerManager
go util.Until(klet.syncNetworkStatus, 30*time.Second, util.NeverStop) // periodically sync network status
if klet.kubeClient != nil {
// Start syncing node status immediately, this may set up things the runtime needs to run.
go util.Until(klet.syncNodeStatus, klet.nodeStatusUpdateFrequency, util.NeverStop) // periodically sync node status to the apiserver
}
// Wait for the runtime to be up with a timeout.
if err := waitUntilRuntimeIsUp(klet.containerRuntime, maxWaitForContainerRuntime); err != nil {
return nil, fmt.Errorf("timed out waiting for %q to come up: %v", containerRuntime, err)
}
klet.lastTimestampRuntimeUp = time.Now()
klet.runner = klet.containerRuntime
klet.podManager = newBasicPodManager(klet.kubeClient) // basic pod manager
runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime)
if err != nil {
return nil, err
}
klet.runtimeCache = runtimeCache
klet.podWorkers = newPodWorkers(runtimeCache, klet.syncPod, recorder) // pod workers: one sync goroutine per pod
metrics.Register(runtimeCache)
if err = klet.setupDataDirs(); err != nil {
return nil, err
}
if err = klet.volumePluginMgr.InitPlugins(volumePlugins, &volumeHost{klet}); err != nil { // initialize volume plugins
return nil, err
}
// If the container logs directory does not exist, create it.
if _, err := os.Stat(containerLogsDir); err != nil {
if err := osInterface.Mkdir(containerLogsDir, 0755); err != nil {
glog.Errorf("Failed to create directory %q: %v", containerLogsDir, err)
}
}
klet.backOff = util.NewBackOff(resyncInterval, maxContainerBackOff)
klet.podKillingCh = make(chan *kubecontainer.Pod, podKillingChannelCapacity)
return klet, nil
}
After the kubelet instance is initialized, its Run method is called to start consuming the pod updates it receives.
// Run starts the kubelet reacting to config updates
func (kl *Kubelet) Run(updates <-chan PodUpdate) {
if kl.logServer == nil {
kl.logServer = http.StripPrefix("/logs/", http.FileServer(http.Dir("/var/log/")))
}
if kl.kubeClient == nil {
glog.Warning("No api server defined - no node status update will be sent.")
}
// Move Kubelet to a container.
if kl.resourceContainer != "" {
// Fixme: I need to reside inside ContainerManager interface.
err := util.RunInResourceContainer(kl.resourceContainer)
if err != nil {
glog.Warningf("Failed to move Kubelet to container %q: %v", kl.resourceContainer, err)
}
glog.Infof("Running in container %q", kl.resourceContainer)
}
if err := kl.imageManager.Start(); err != nil { // start the image manager
kl.recorder.Eventf(kl.nodeRef, "KubeletSetupFailed", "Failed to start ImageManager %v", err)
glog.Errorf("Failed to start ImageManager, images may not be garbage collected: %v", err)
}
if err := kl.cadvisor.Start(); err != nil { // start cAdvisor
kl.recorder.Eventf(kl.nodeRef, "KubeletSetupFailed", "Failed to start CAdvisor %v", err)
glog.Errorf("Failed to start CAdvisor, system may not be properly monitored: %v", err)
}
if err := kl.containerManager.Start(); err != nil { // start the container manager
kl.recorder.Eventf(kl.nodeRef, "KubeletSetupFailed", "Failed to start ContainerManager %v", err)
glog.Errorf("Failed to start ContainerManager, system may not be properly isolated: %v", err)
}
if err := kl.oomWatcher.Start(kl.nodeRef); err != nil {
kl.recorder.Eventf(kl.nodeRef, "KubeletSetupFailed", "Failed to start OOM watcher %v", err)
glog.Errorf("Failed to start OOM watching: %v", err)
}
go util.Until(kl.updateRuntimeUp, 5*time.Second, util.NeverStop)
// Start a goroutine responsible for killing pods (that are not properly
// handled by pod workers).
go util.Until(kl.podKiller, 1*time.Second, util.NeverStop) // periodically kill pods that the pod workers could not handle
// Run the system oom watcher forever.
kl.statusManager.Start()
kl.syncLoop(updates, kl) // main loop: process all updates
}
Next, the syncLoop processing flow.
// syncLoop is the main loop for processing changes. It watches for changes from
// three channels (file, apiserver, and http) and creates a union of them. For
// any new change seen, will run a sync against desired state and running state. If
// no changes are seen to the configuration, will synchronize the last known desired
// state every sync-frequency seconds. Never returns.
func (kl *Kubelet) syncLoop(updates <-chan PodUpdate, handler SyncHandler) {
glog.Info("Starting kubelet main sync loop.")
kl.resyncTicker = time.NewTicker(kl.resyncInterval)
var housekeepingTimestamp time.Time
for {
if !kl.containerRuntimeUp() { // wait for the container runtime to come up
time.Sleep(5 * time.Second)
glog.Infof("Skipping pod synchronization, container runtime is not up.")
continue
}
if !kl.doneNetworkConfigure() { // wait for the network to be configured
time.Sleep(5 * time.Second)
glog.Infof("Skipping pod synchronization, network is not configured")
continue
}
// Make sure we sync first to receive the pods from the sources before
// performing housekeeping.
if !kl.syncLoopIteration(updates, handler) { // process one batch of updates
break
}
// We don't want to perform housekeeping too often, so we set a minimum
// period for it. Housekeeping would be performed at least once every
// kl.resyncInterval, and *no* more than once every
// housekeepingMinimumPeriod.
// TODO (#13418): Investigate whether we can/should spawn a dedicated
// goroutine for housekeeping
if !kl.sourcesReady() {
// If the sources aren't ready, skip housekeeping, as we may
// accidentally delete pods from unready sources.
glog.V(4).Infof("Skipping cleanup, sources aren't ready yet.")
} else if housekeepingTimestamp.IsZero() {
housekeepingTimestamp = time.Now()
} else if time.Since(housekeepingTimestamp) > housekeepingMinimumPeriod {
glog.V(4).Infof("SyncLoop (housekeeping)")
if err := handler.HandlePodCleanups(); err != nil {
glog.Errorf("Failed cleaning pods: %v", err)
}
housekeepingTimestamp = time.Now()
}
}
}
func (kl *Kubelet) syncLoopIteration(updates <-chan PodUpdate, handler SyncHandler) bool {
kl.syncLoopMonitor.Store(time.Now())
select {
case u, open := <-updates: // receive an update
if !open {
glog.Errorf("Update channel is closed. Exiting the sync loop.")
return false
}
switch u.Op {
case ADD:
glog.V(2).Infof("SyncLoop (ADD): %q", kubeletUtil.FormatPodNames(u.Pods))
handler.HandlePodAdditions(u.Pods) // handle added pods
case UPDATE:
glog.V(2).Infof("SyncLoop (UPDATE): %q", kubeletUtil.FormatPodNames(u.Pods))
handler.HandlePodUpdates(u.Pods) // handle updated pods
case REMOVE:
glog.V(2).Infof("SyncLoop (REMOVE): %q", kubeletUtil.FormatPodNames(u.Pods))
handler.HandlePodDeletions(u.Pods) // handle removed pods
case SET:
// TODO: Do we want to support this?
glog.Errorf("Kubelet does not support snapshot update")
}
case <-kl.resyncTicker.C:
// Periodically syncs all the pods and performs cleanup tasks.
glog.V(4).Infof("SyncLoop (periodic sync)")
handler.HandlePodSyncs(kl.podManager.GetPods())
}
kl.syncLoopMonitor.Store(time.Now())
return true
}
The handler passed to syncLoopIteration is the Kubelet itself, so these calls land in the Kubelet's own methods.
func (kl *Kubelet) HandlePodAdditions(pods []*api.Pod) {
start := time.Now()
sort.Sort(podsByCreationTime(pods))
for _, pod := range pods {
kl.podManager.AddPod(pod) // register the pod with the podManager
if isMirrorPod(pod) {
kl.handleMirrorPod(pod, start)
continue
}
// Note that allPods includes the new pod since we added at the
// beginning of the loop.
allPods := kl.podManager.GetPods()
// We failed pods that we rejected, so activePods include all admitted
// pods that are alive and the new pod.
activePods := kl.filterOutTerminatedPods(allPods) // active (non-terminated) pods, including the new one
// Check if we can admit the pod; if not, reject it.
if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok { // admit the pod, or reject it
kl.rejectPod(pod, reason, message)
continue
}
mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)
kl.dispatchWork(pod, SyncPodCreate, mirrorPod, start)
}
}
func (kl *Kubelet) HandlePodUpdates(pods []*api.Pod) {
start := time.Now()
for _, pod := range pods {
kl.podManager.UpdatePod(pod) // update the pod in the podManager
if isMirrorPod(pod) {
kl.handleMirrorPod(pod, start)
continue
}
// TODO: Evaluate if we need to validate and reject updates.
mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)
kl.dispatchWork(pod, SyncPodUpdate, mirrorPod, start)
}
}
func (kl *Kubelet) HandlePodDeletions(pods []*api.Pod) {
start := time.Now()
for _, pod := range pods {
kl.podManager.DeletePod(pod) // remove the pod from the podManager
if isMirrorPod(pod) {
kl.handleMirrorPod(pod, start)
continue
}
// Deletion is allowed to fail because the periodic cleanup routine
// will trigger deletion again.
if err := kl.deletePod(pod.UID); err != nil { // stop the pod's containers
glog.V(2).Infof("Failed to delete pod %q, err: %v", kubeletUtil.FormatPodName(pod), err)
}
}
}
Whether a pod is added or updated, the actual work is handed off to a pod worker via dispatchWork.
func (kl *Kubelet) dispatchWork(pod *api.Pod, syncType SyncPodType, mirrorPod *api.Pod, start time.Time) {
if kl.podIsTerminated(pod) { // nothing to do for terminated pods
return
}
// Run the sync in an async worker.
kl.podWorkers.UpdatePod(pod, mirrorPod, func() {
metrics.PodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start))
}) // hand the pod (new or updated) to its worker
// Note the number of containers for new pods.
if syncType == SyncPodCreate {
metrics.ContainersPerPodCount.Observe(float64(len(pod.Spec.Containers)))
}
}
dispatchWork then calls the podWorkers' UpdatePod function, shown below together with the per-pod worker loop.
func (p *podWorkers) managePodLoop(podUpdates <-chan workUpdate) {
var minRuntimeCacheTime time.Time
for newWork := range podUpdates { // consume work items for this pod
func() {
defer p.checkForUpdates(newWork.pod.UID, newWork.updateCompleteFn)
// We would like to have the state of the containers from at least
// the moment when we finished the previous processing of that pod.
if err := p.runtimeCache.ForceUpdateIfOlder(minRuntimeCacheTime); err != nil {
glog.Errorf("Error updating the container runtime cache: %v", err)
return
}
pods, err := p.runtimeCache.GetPods() // pods currently known to the runtime
if err != nil {
glog.Errorf("Error getting pods while syncing pod: %v", err)
return
}
err = p.syncPodFn(newWork.pod, newWork.mirrorPod,
kubecontainer.Pods(pods).FindPodByID(newWork.pod.UID), newWork.updateType) // sync the pod (the kubelet's syncPod)
if err != nil {
glog.Errorf("Error syncing pod %s, skipping: %v", newWork.pod.UID, err)
p.recorder.Eventf(newWork.pod, "FailedSync", "Error syncing pod, skipping: %v", err)
return
}
minRuntimeCacheTime = time.Now()
newWork.updateCompleteFn() // signal that this update has been processed
}()
}
}
// Apply the new setting to the specified pod. updateComplete is called when the update is completed.
func (p *podWorkers) UpdatePod(pod *api.Pod, mirrorPod *api.Pod, updateComplete func()) {
uid := pod.UID
var podUpdates chan workUpdate
var exists bool
// TODO: Pipe this through from the kubelet. Currently kubelets operating with
// snapshot updates (PodConfigNotificationSnapshot) will send updates, creates
// and deletes as SET operations, which makes updates indistinguishable from
// creates. The intent here is to communicate to the pod worker that it can take
// certain liberties, like skipping status generation, when it receives a create
// event for a pod.
updateType := SyncPodUpdate
p.podLock.Lock()
defer p.podLock.Unlock()
if podUpdates, exists = p.podUpdates[uid]; !exists {
// We need to have a buffer here, because checkForUpdates() method that
// puts an update into channel is called from the same goroutine where
// the channel is consumed. However, it is guaranteed that in such case
// the channel is empty, so buffer of size 1 is enough.
podUpdates = make(chan workUpdate, 1)
p.podUpdates[uid] = podUpdates
// Creating a new pod worker either means this is a new pod, or that the
// kubelet just restarted. In either case the kubelet is willing to believe
// the status of the pod for the first pod worker sync. See corresponding
// comment in syncPod.
updateType = SyncPodCreate // first worker for this pod: treat it as a create
go func() {
defer util.HandleCrash()
p.managePodLoop(podUpdates) // start the per-pod worker loop
}()
}
if !p.isWorking[pod.UID] {
p.isWorking[pod.UID] = true // mark the worker as busy and deliver the work
podUpdates <- workUpdate{
pod: pod,
mirrorPod: mirrorPod,
updateCompleteFn: updateComplete,
updateType: updateType,
}
} else {
p.lastUndeliveredWorkUpdate[pod.UID] = workUpdate{
pod: pod,
mirrorPod: mirrorPod,
updateCompleteFn: updateComplete,
updateType: updateType,
}
}
}
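The design choice worth calling out in UpdatePod and managePodLoop is one worker goroutine per pod UID, fed by a channel with a buffer of one, plus a lastUndeliveredWorkUpdate slot so that a burst of updates for the same pod collapses into at most one pending item. The standalone sketch below reproduces just that coalescing pattern; all names and types here are illustrative, not the kubelet's.
package main

import (
	"fmt"
	"sync"
	"time"
)

// workers keeps one goroutine per key and coalesces bursts of updates,
// mirroring the buffered-channel-of-one pattern used by podWorkers.
type workers struct {
	mu              sync.Mutex
	updates         map[string]chan string
	isWorking       map[string]bool
	lastUndelivered map[string]string
}

func newWorkers() *workers {
	return &workers{
		updates:         map[string]chan string{},
		isWorking:       map[string]bool{},
		lastUndelivered: map[string]string{},
	}
}

func (w *workers) Update(key, work string) {
	w.mu.Lock()
	defer w.mu.Unlock()
	ch, ok := w.updates[key]
	if !ok {
		// First update for this key: start its dedicated worker loop.
		ch = make(chan string, 1)
		w.updates[key] = ch
		go w.loop(key, ch)
	}
	if !w.isWorking[key] {
		w.isWorking[key] = true
		ch <- work
	} else {
		// Worker is busy: keep only the most recent undelivered work item.
		w.lastUndelivered[key] = work
	}
}

func (w *workers) loop(key string, ch chan string) {
	for work := range ch {
		fmt.Printf("syncing %s: %s\n", key, work)
		time.Sleep(10 * time.Millisecond) // stand-in for the real sync
		w.done(key, ch)
	}
}

func (w *workers) done(key string, ch chan string) {
	w.mu.Lock()
	defer w.mu.Unlock()
	if work, ok := w.lastUndelivered[key]; ok {
		delete(w.lastUndelivered, key)
		ch <- work // the channel is drained at this point, so the size-1 buffer suffices
	} else {
		w.isWorking[key] = false
	}
}

func main() {
	w := newWorkers()
	w.Update("pod-a", "create")
	w.Update("pod-a", "update-1")
	w.Update("pod-a", "update-2") // coalesces with update-1
	time.Sleep(100 * time.Millisecond)
}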
The syncPodFn the worker calls is the kubelet's syncPod function that was passed in at construction time.
func (kl *Kubelet) syncPod(pod *api.Pod, mirrorPod *api.Pod, runningPod kubecontainer.Pod, updateType SyncPodType) error {
podFullName := kubecontainer.GetPodFullName(pod) // namespaced pod name
uid := pod.UID
start := time.Now()
var firstSeenTime time.Time
if firstSeenTimeStr, ok := pod.Annotations[ConfigFirstSeenAnnotationKey]; !ok {
glog.V(3).Infof("First seen time not recorded for pod %q", pod.UID)
} else {
firstSeenTime = kubeletTypes.ConvertToTimestamp(firstSeenTimeStr).Get()
}
// Before returning, regenerate status and store it in the cache.
defer func() {
if isStaticPod(pod) && mirrorPod == nil { // static pod without a mirror pod yet
// No need to cache the status because the mirror pod does not
// exist yet.
return
}
status, err := kl.generatePodStatus(pod) // regenerate the pod status
if err != nil {
glog.Errorf("Unable to generate status for pod with name %q and uid %q info with error(%v)", podFullName, uid, err)
} else {
podToUpdate := pod
if mirrorPod != nil {
podToUpdate = mirrorPod
}
existingStatus, ok := kl.statusManager.GetPodStatus(podToUpdate.UID) // previously recorded status
if !ok || existingStatus.Phase == api.PodPending && status.Phase == api.PodRunning &&
!firstSeenTime.IsZero() {
metrics.PodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime))
}
kl.statusManager.SetPodStatus(podToUpdate, status) // cache the new status
}
}()
// Kill pods we can't run.
if err := canRunPod(pod); err != nil || pod.DeletionTimestamp != nil { // kill pods that cannot run or are being deleted
if err := kl.killPod(pod, runningPod); err != nil {
util.HandleError(err)
}
return err
}
// Create Mirror Pod for Static Pod if it doesn't already exist
if isStaticPod(pod) {
if mirrorPod != nil && !kl.podManager.IsMirrorPodOf(mirrorPod, pod) {
// The mirror pod is semantically different from the static pod. Remove
// it. The mirror pod will get recreated later.
glog.Errorf("Deleting mirror pod %q because it is outdated", podFullName)
if err := kl.podManager.DeleteMirrorPod(podFullName); err != nil {
glog.Errorf("Failed deleting mirror pod %q: %v", podFullName, err)
}
}
if mirrorPod == nil {
glog.V(3).Infof("Creating a mirror pod for static pod %q", podFullName)
if err := kl.podManager.CreateMirrorPod(pod); err != nil {
glog.Errorf("Failed creating a mirror pod %q: %v", podFullName, err)
}
_, ok := kl.podManager.GetMirrorPodByPod(pod)
if !ok {
glog.Errorf("Mirror pod not available")
}
}
}
if err := kl.makePodDataDirs(pod); err != nil { // create the pod's data directories
glog.Errorf("Unable to make pod data directories for pod %q (uid %q): %v", podFullName, uid, err)
return err
}
// Starting phase:
ref, err := api.GetReference(pod)
if err != nil {
glog.Errorf("Couldn't make a ref to pod %q: '%v'", podFullName, err)
}
// Mount volumes.
podVolumes, err := kl.mountExternalVolumes(pod) // mount the pod's volumes
if err != nil {
if ref != nil {
kl.recorder.Eventf(ref, "FailedMount", "Unable to mount volumes for pod %q: %v", podFullName, err)
}
glog.Errorf("Unable to mount volumes for pod %q: %v; skipping pod", podFullName, err)
return err
}
kl.volumeManager.SetVolumes(pod.UID, podVolumes) // record the mounted volumes
// The kubelet is the source of truth for pod status. It ignores the status sent from
// the apiserver and regenerates status for every pod update, incrementally updating
// the status it received at pod creation time.
//
// The container runtime needs 2 pieces of information from the status to sync a pod:
// The terminated state of containers (to restart them) and the podIp (for liveness probes).
// New pods don't have either, so we skip the expensive status generation step.
//
// If we end up here with a create event for an already running pod, it could result in a
// restart of its containers. This cannot happen unless the kubelet restarts, because the
// delete before the second create would cancel this pod worker.
//
// If the kubelet restarts, we have a bunch of running containers for which we get create
// events. This is ok, because the pod status for these will include the podIp and terminated
// status. Any race conditions here effectively boils down to -- the pod worker didn't sync
// state of a newly started container with the apiserver before the kubelet restarted, so
// it's OK to pretend like the kubelet started them after it restarted.
//
// Also note that deletes currently have an updateType of `create` set in UpdatePods.
// This, again, does not matter because deletes are not processed by this method.
var podStatus api.PodStatus
if updateType == SyncPodCreate {
// This is the first time we are syncing the pod. Record the latency
// since kubelet first saw the pod if firstSeenTime is set.
if !firstSeenTime.IsZero() {
metrics.PodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime))
}
podStatus = pod.Status
podStatus.StartTime = &unversioned.Time{Time: start}
kl.statusManager.SetPodStatus(pod, podStatus) // first sync: record the start time as the pod status
glog.V(3).Infof("Not generating pod status for new pod %q", podFullName)
} else {
var err error
podStatus, err = kl.generatePodStatus(pod) // regenerate the current status
if err != nil {
glog.Errorf("Unable to get status for pod %q (uid %q): %v", podFullName, uid, err)
return err
}
}
pullSecrets, err := kl.getPullSecretsForPod(pod)
if err != nil {
glog.Errorf("Unable to get pull secrets for pod %q (uid %q): %v", podFullName, uid, err)
return err
}
err = kl.containerRuntime.SyncPod(pod, runningPod, podStatus, pullSecrets, kl.backOff) // let the container runtime reconcile the pod
if err != nil {
return err
}
ingress, egress, err := extractBandwidthResources(pod) // read the pod's bandwidth-shaping annotations
if err != nil {
return err
}
if egress != nil || ingress != nil {
if pod.Spec.HostNetwork {
kl.recorder.Event(pod, "HostNetworkNotSupported", "Bandwidth shaping is not currently supported on the host network")
} else if kl.shaper != nil {
status, found := kl.statusManager.GetPodStatus(pod.UID)
if !found {
statusPtr, err := kl.containerRuntime.GetPodStatus(pod)
if err != nil {
glog.Errorf("Error getting pod for bandwidth shaping")
return err
}
status = *statusPtr
}
if len(status.PodIP) > 0 {
err = kl.shaper.ReconcileCIDR(fmt.Sprintf("%s/32", status.PodIP), egress, ingress)
}
} else {
kl.recorder.Event(pod, "NilShaper", "Pod requests bandwidth shaping, but the shaper is undefined")
}
}
return nil
}
syncPod checks all of the pod's prerequisites before starting anything; when the Docker runtime is in use, the call below is what actually starts the containers.
// Sync the running pod to match the specified desired pod.
func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus api.PodStatus, pullSecrets []api.Secret, backOff *util.Backoff) error {
start := time.Now()
defer func() {
metrics.ContainerManagerLatency.WithLabelValues("SyncPod").Observe(metrics.SinceInMicroseconds(start))
}()
podFullName := kubecontainer.GetPodFullName(pod)
containerChanges, err := dm.computePodContainerChanges(pod, runningPod, podStatus) // compute which containers to start, keep, or kill
if err != nil {
return err
}
glog.V(3).Infof("Got container changes for pod %q: %+v", podFullName, containerChanges)
if containerChanges.StartInfraContainer || (len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0) {
if len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0 {
glog.V(4).Infof("Killing Infra Container for %q because all other containers are dead.", podFullName)
} else {
glog.V(4).Infof("Killing Infra Container for %q, will start new one", podFullName)
}
// Killing phase: if we want to start new infra container, or nothing is running kill everything (including infra container)
err = dm.KillPod(pod, runningPod) // kill the whole pod, including the infra container
if err != nil {
return err
}
} else {
// Otherwise kill any containers in this pod which are not specified as ones to keep.
for _, container := range runningPod.Containers { // keep only the running containers we still want
_, keep := containerChanges.ContainersToKeep[kubeletTypes.DockerID(container.ID)]
if !keep {
glog.V(3).Infof("Killing unwanted container %+v", container)
// attempt to find the appropriate container policy
var podContainer *api.Container
for i, c := range pod.Spec.Containers {
if c.Name == container.Name {
podContainer = &pod.Spec.Containers[i]
break
}
}
err = dm.KillContainerInPod(container.ID, podContainer, pod) // kill containers that are not on the keep list
if err != nil {
glog.Errorf("Error killing container: %v", err)
}
}
}
}
// If we should create infra container then we do it first.
podInfraContainerID := containerChanges.InfraContainerId // ID of the pod infra (pause) container
if containerChanges.StartInfraContainer && (len(containerChanges.ContainersToStart) > 0) {
glog.V(4).Infof("Creating pod infra container for %q", podFullName)
podInfraContainerID, err = dm.createPodInfraContainer(pod) // create the pod infra container
// Call the networking plugin
if err == nil {
err = dm.networkPlugin.SetUpPod(pod.Namespace, pod.Name, podInfraContainerID) // set up pod networking via the network plugin
}
if err != nil {
glog.Errorf("Failed to create pod infra container: %v; Skipping pod %q", err, podFullName)
return err
}
// Setup the host interface (FIXME: move to networkPlugin when ready)
podInfraContainer, err := dm.client.InspectContainer(string(podInfraContainerID)) // inspect the infra container
if err != nil {
glog.Errorf("Failed to inspect pod infra container: %v; Skipping pod %q", err, podFullName)
return err
}
if err = hairpin.SetUpContainer(podInfraContainer.State.Pid, "eth0"); err != nil {
glog.Warningf("Hairpin setup failed for pod %q: %v", podFullName, err) // 设置网络
}
// Find the pod IP after starting the infra container in order to expose
// it safely via the downward API without a race and be able to use podIP in kubelet-managed /etc/hosts file.
pod.Status.PodIP = dm.determineContainerIP(pod.Name, pod.Namespace, podInfraContainer) // determine the pod IP
}
// Start everything
for idx := range containerChanges.ContainersToStart {
container := &pod.Spec.Containers[idx]
// containerChanges.StartInfraContainer causes the containers to be restarted for config reasons
// ignore backoff
if !containerChanges.StartInfraContainer && dm.doBackOff(pod, container, podStatus, backOff) {
glog.V(4).Infof("Backing Off restarting container %+v in pod %v", container, podFullName)
continue
}
glog.V(4).Infof("Creating container %+v in pod %v", container, podFullName)
err := dm.imagePuller.PullImage(pod, container, pullSecrets) // pull the container image
dm.updateReasonCache(pod, container, "PullImageError", err) // record the result in the reason cache
if err != nil {
glog.Warningf("Failed to pull image %q from pod %q and container %q: %v", container.Image, kubecontainer.GetPodFullName(pod), container.Name, err)
continue
}
if container.SecurityContext != nil && container.SecurityContext.RunAsNonRoot {
err := dm.verifyNonRoot(container)
dm.updateReasonCache(pod, container, "VerifyNonRootError", err)
if err != nil {
glog.Errorf("Error running pod %q container %q: %v", kubecontainer.GetPodFullName(pod), container.Name, err)
continue
}
}
// TODO(dawnchen): Check RestartPolicy.DelaySeconds before restart a container
// Note: when configuring the pod's containers anything that can be configured by pointing
// to the namespace of the infra container should use namespaceMode. This includes things like the net namespace
// and IPC namespace. PID mode cannot point to another container right now.
// See createPodInfraContainer for infra container setup.
namespaceMode := fmt.Sprintf("container:%v", podInfraContainerID)
_, err = dm.runContainerInPod(pod, container, namespaceMode, namespaceMode, getPidMode(pod)) // start the container in the infra container's namespaces
dm.updateReasonCache(pod, container, "RunContainerError", err)
if err != nil {
// TODO(bburns) : Perhaps blacklist a container after N failures?
glog.Errorf("Error running pod %q container %q: %v", kubecontainer.GetPodFullName(pod), container.Name, err)
continue
}
// Successfully started the container; clear the entry in the failure
// reason cache.
dm.clearReasonCache(pod, container)
}
return nil
}
At this point the creation of a pod's containers is essentially complete. After several layers of initialization, the kubelet ends up watching the information pushed from the apiserver and managing pod state through the podWorkers, which is how each node keeps its pods maintained.
This post has only briefly walked through the kubelet's initialization flow and taken a first look at the preparation work around pods, in order to build intuition for how the kubelet works. Many details have been left out, such as pod deletion and update handling and container networking, which deserve further study. Corrections are welcome if anything here is wrong.