kubernetes源码阅读之controller manager启动

照旧先是在cmd里面创建CMServer然后启动,看代码如下

func main() {
	s := options.NewCMServer()
	s.AddFlags(pflag.CommandLine)

	flag.InitFlags()
	util.InitLogs()
	defer util.FlushLogs()

	verflag.PrintAndExitIfRequested()

	if err := app.Run(s); err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
}
在创建里面首先是一堆配置,主要是配置work线程数,将在run方法里面启动对应个数的线程,

func NewCMServer() *CMServer {
	s := CMServer{
		KubeControllerManagerConfiguration: componentconfig.KubeControllerManagerConfiguration{
			Port:                              ports.ControllerManagerPort,
			Address:                           "0.0.0.0",
			ConcurrentEndpointSyncs:           5,//Endpoint controller线程数
			ConcurrentServiceSyncs:            1,//service controller线程数
			ConcurrentRCSyncs:                 5,//RC controller线程数
			ConcurrentRSSyncs:                 5,//RS controller线程数
			ConcurrentDaemonSetSyncs:          2,//DaemonSet controller线程数
			ConcurrentJobSyncs:                5,//job controller线程数
			ConcurrentResourceQuotaSyncs:      5,//资源配额 controller线程数
			ConcurrentDeploymentSyncs:         5,//Deployment controller线程数
			ConcurrentNamespaceSyncs:          2,//命名空间 controller线程数
			ConcurrentSATokenSyncs:            5,
			LookupCacheSizeForRC:              4096,
			LookupCacheSizeForRS:              4096,
			LookupCacheSizeForDaemonSet:       1024,
			ServiceSyncPeriod:                 unversioned.Duration{Duration: 5 * time.Minute},
			NodeSyncPeriod:                    unversioned.Duration{Duration: 10 * time.Second},
			ResourceQuotaSyncPeriod:           unversioned.Duration{Duration: 5 * time.Minute},
			NamespaceSyncPeriod:               unversioned.Duration{Duration: 5 * time.Minute},
			PVClaimBinderSyncPeriod:           unversioned.Duration{Duration: 15 * time.Second},
			HorizontalPodAutoscalerSyncPeriod: unversioned.Duration{Duration: 30 * time.Second},
			DeploymentControllerSyncPeriod:    unversioned.Duration{Duration: 30 * time.Second},
			MinResyncPeriod:                   unversioned.Duration{Duration: 12 * time.Hour},
			RegisterRetryCount:                10,
			PodEvictionTimeout:                unversioned.Duration{Duration: 5 * time.Minute},
			NodeMonitorGracePeriod:            unversioned.Duration{Duration: 40 * time.Second},
			NodeStartupGracePeriod:            unversioned.Duration{Duration: 60 * time.Second},
			NodeMonitorPeriod:                 unversioned.Duration{Duration: 5 * time.Second},
			ClusterName:                       "kubernetes",
			NodeCIDRMaskSize:                  24,
			ConfigureCloudRoutes:              true,
			TerminatedPodGCThreshold:          12500,
			VolumeConfiguration: componentconfig.VolumeConfiguration{
				EnableHostPathProvisioning: false,
				EnableDynamicProvisioning:  true,
				PersistentVolumeRecyclerConfiguration: componentconfig.PersistentVolumeRecyclerConfiguration{
					MaximumRetry:             3,
					MinimumTimeoutNFS:        300,
					IncrementTimeoutNFS:      30,
					MinimumTimeoutHostPath:   60,
					IncrementTimeoutHostPath: 30,
				},
				FlexVolumePluginDir: "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/",
			},
			ContentType:             "application/vnd.kubernetes.protobuf",
			KubeAPIQPS:              20.0,
			KubeAPIBurst:            30,
			LeaderElection:          leaderelection.DefaultLeaderElectionConfiguration(),
			ControllerStartInterval: unversioned.Duration{Duration: 0 * time.Second},
			EnableGarbageCollector:  false,
			ConcurrentGCSyncs:       5,
			ClusterSigningCertFile:  "/etc/kubernetes/ca/ca.pem",
			ClusterSigningKeyFile:   "/etc/kubernetes/ca/ca.key",
		},
	}
	s.LeaderElection.LeaderElect = true
	return &s
}

还配置了ca证书的路径,容器垃圾回收线程数以及一些超时时间和同步周期等时间设置。创建完成之后是启动

// 启动CMServer.  This should never exit.
func Run(s *options.CMServer) error {
	if c, err := configz.New("componentconfig"); err == nil {
		c.Set(s.KubeControllerManagerConfiguration)
	} else {
		glog.Errorf("unable to register configz: %s", err)
	}
	kubeconfig, err := clientcmd.BuildConfigFromFlags(s.Master, s.Kubeconfig)
	if err != nil {
		return err
	}

	kubeconfig.ContentConfig.ContentType = s.ContentType
	// Override kubeconfig qps/burst settings from flags
	kubeconfig.QPS = s.KubeAPIQPS
	kubeconfig.Burst = int(s.KubeAPIBurst)

	kubeClient, err := client.New(kubeconfig)
	if err != nil {
		glog.Fatalf("Invalid API configuration: %v", err)
	}
	//启动server,这些都是系统监控的api,如/metrics可以记录api调用次数
	go func() {
		mux := http.NewServeMux()
		healthz.InstallHandler(mux)
		if s.EnableProfiling {
			mux.HandleFunc("/debug/pprof/", pprof.Index)
			mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
			mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
		}
		configz.InstallHandler(mux)
		mux.Handle("/metrics", prometheus.Handler())

		server := &http.Server{
			Addr:    net.JoinHostPort(s.Address, strconv.Itoa(int(s.Port))),
			Handler: mux,
		}
		glog.Fatal(server.ListenAndServe())
	}()
	//这里定义了启动controllers的启动方法。下面有调用这个方法
	run := func(stop <-chan struct{}) {
		err := StartControllers(s, kubeClient, kubeconfig, stop)
		glog.Fatalf("error running controllers: %v", err)
		panic("unreachable")
	}
	//k8s的HA方案中CM是一主多备,如果自己不是leader则不启动CM
	if !s.LeaderElection.LeaderElect {
		run(nil)
		panic("unreachable")
	}

	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartLogging(glog.Infof)
	eventBroadcaster.StartRecordingToSink(kubeClient.Events(""))
	recorder := eventBroadcaster.NewRecorder(api.EventSource{Component: "controller-manager"})

	id, err := os.Hostname()
	if err != nil {
		return err
	}

	leaderelection.RunOrDie(leaderelection.LeaderElectionConfig{
		EndpointsMeta: api.ObjectMeta{
			Namespace: "kube-system",
			Name:      "kube-controller-manager",
		},
		Client:        kubeClient,
		Identity:      id,
		EventRecorder: recorder,
		LeaseDuration: s.LeaderElection.LeaseDuration.Duration,
		RenewDeadline: s.LeaderElection.RenewDeadline.Duration,
		RetryPeriod:   s.LeaderElection.RetryPeriod.Duration,
		Callbacks: leaderelection.LeaderCallbacks{
			OnStartedLeading: run,//这里使用上面定义的run方法如果自己是leader启动controllers
			OnStoppedLeading: func() {
				glog.Fatalf("leaderelection lost")
			},
		},
	})
	panic("unreachable")
}

上面是启动各个controllers,有两个细节分别介绍一下,第一是StartControllers这个方法,怎样启动各个controller的,由于方法太长,我就不粘全部代码了,如下

func StartControllers(s *options.CMServer, kubeClient *client.Client, kubeconfig *restclient.Config, stop <-chan struct{}) error {
	sharedInformers := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(kubeconfig, "shared-informers")), ResyncPeriod(s)())
	//创建并启动endpoint controller
	go endpointcontroller.NewEndpointController(sharedInformers.Pods().Informer(), clientset.NewForConfigOrDie(restclient.AddUserAgent(kubeconfig, "endpoint-controller"))).
		Run(int(s.ConcurrentEndpointSyncs), wait.NeverStop)
	time.Sleep(wait.Jitter(s.ControllerStartInterval.Duration, ControllerStartJitter))
	//创建并启动replication controller
	go replicationcontroller.NewReplicationManager(
		sharedInformers.Pods().Informer(),
		clientset.NewForConfigOrDie(restclient.AddUserAgent(kubeconfig, "replication-controller")),
		ResyncPeriod(s),
		replicationcontroller.BurstReplicas,
		int(s.LookupCacheSizeForRC),
		s.EnableGarbageCollector,
	).Run(int(s.ConcurrentRCSyncs), wait.NeverStop)
这样就逐一创建并启动各个controller了

第二个细节是选主leaderelection.RunOrDie这个是选主的机制,在以后详细介绍利用etcd选主过程,这里简单提一下,他们启动的时候都会抢先注册自己为leader,当然只有一个会成功,其它的节点watch,如果leader挂了后其它节点会成为leader启动自己CM

func (le *LeaderElector) acquire() {
	stop := make(chan struct{})
	wait.JitterUntil(func() {
		succeeded := le.tryAcquireOrRenew()
		le.maybeReportTransition()
		if !succeeded {
			glog.V(4).Infof("failed to renew lease %v/%v", le.config.EndpointsMeta.Namespace, le.config.EndpointsMeta.Name)
			return
		}
		le.config.EventRecorder.Eventf(&api.Endpoints{ObjectMeta: le.config.EndpointsMeta}, api.EventTypeNormal, "%v became leader", le.config.Identity)
		glog.Infof("sucessfully acquired lease %v/%v", le.config.EndpointsMeta.Namespace, le.config.EndpointsMeta.Name)
		close(stop)
	}, le.config.RetryPeriod, JitterFactor, true, stop)
}

上面的方法就是循环调用tryAcquireOrRenew获取leader锁,有几个参数

DefaultLeaseDuration = 15 * time.Second
DefaultRenewDeadline = 10 * time.Second
DefaultRetryPeriod   = 2 * time.Second

这些都是选主过程使用的,譬如第一个就是租约时间,如果你是leader但这个时效只有15秒,ok解释清楚上面两个问题,下面就针对replication controller介绍一下创建和启动

首先看struct的定义

type ReplicationManager struct {
	kubeClient clientset.Interface//访问apiserver的客户端
	podControl controller.PodControlInterface//pod操作函数的封装,在controller_util.go里面定义实现
	//CreatePods、CreatePodsOnNode、DeletePod等pod操作方法,这些方法都由RealPodControl去调用apiserver完成创建实现

	// internalPodInformer is used to hold a personal informer.  If we're using
	// a normal shared informer, then the informer will be started for us.  If
	// we have a personal informer, we must start it ourselves.   If you start
	// the controller using NewReplicationManager(passing SharedInformer), this
	// will be null
	internalPodInformer framework.SharedIndexInformer

	// An rc is temporarily suspended after creating/deleting these many replicas.
	// It resumes normal action after observing the watch events for them.
	burstReplicas int
	// To allow injection of syncReplicationController for testing.
	syncHandler func(rcKey string) error

	// A TTLCache of pod creates/deletes each rc expects to see.
	expectations *controller.UIDTrackingControllerExpectations

	// A store of replication controllers, populated by the rcController
	rcStore cache.StoreToReplicationControllerLister
	// 监控RC的变化,实现RC同步的任务调度逻辑
	rcController *framework.Controller
	// A store of pods, populated by the podController
	podStore cache.StoreToPodLister
	//监控pod绑定变化,实现pod同步的任务调度逻辑
	podController framework.ControllerInterface
	// podStoreSynced如果最新一次完成同步则返回true
	// Added as a member to the struct to allow injection for testing.
	podStoreSynced func() bool

	lookupCache *controller.MatchingCache

	// Controllers that need to be synced
	queue *workqueue.Type

	// garbageCollectorEnabled denotes if the garbage collector is enabled. RC
	// manager behaves differently if GC is enabled.
	garbageCollectorEnabled bool
}

然后看创建

func newReplicationManager(eventRecorder record.EventRecorder, podInformer framework.SharedIndexInformer, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, burstReplicas int, lookupCacheSize int, garbageCollectorEnabled bool) *ReplicationManager {
	if kubeClient != nil && kubeClient.Core().GetRESTClient().GetRateLimiter() != nil {
		metrics.RegisterMetricAndTrackRateLimiterUsage("replication_controller", kubeClient.Core().GetRESTClient().GetRateLimiter())
	}
	//创建ReplicationManager,这里面包括了podControl,它定义了pod的创建等操作
	rm := &ReplicationManager{
		kubeClient: kubeClient,
		podControl: controller.RealPodControl{
			KubeClient: kubeClient,
			Recorder:   eventRecorder,
		},
		burstReplicas: burstReplicas,
		expectations:  controller.NewUIDTrackingControllerExpectations(controller.NewControllerExpectations()),
		queue:         workqueue.New(),
		garbageCollectorEnabled: garbageCollectorEnabled,
	}
	//这里通过NewIndexerInformer返回两个重要的变量,rcStore是缓存rc信息的,rcController是rc的控制器
	rm.rcStore.Indexer, rm.rcController = framework.NewIndexerInformer(
		&cache.ListWatch{
			ListFunc: func(options api.ListOptions) (runtime.Object, error) {
				return rm.kubeClient.Core().ReplicationControllers(api.NamespaceAll).List(options)
			},
			WatchFunc: func(options api.ListOptions) (watch.Interface, error) {
				return rm.kubeClient.Core().ReplicationControllers(api.NamespaceAll).Watch(options)
			},
		},
		&api.ReplicationController{},
		// TODO: Can we have much longer period here?
		FullControllerResyncPeriod,
		framework.ResourceEventHandlerFuncs{
			AddFunc:    rm.enqueueController,
			UpdateFunc: rm.updateRC,
			// This will enter the sync loop and no-op, because the controller has been deleted from the store.
			// Note that deleting a controller immediately after scaling it to 0 will not work. The recommended
			// way of achieving this is by performing a `stop` operation on the controller.
			DeleteFunc: rm.enqueueController,
		},
		cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc},
	)
	//添加事件处理handler方法,针对于不同的event调用不同方法
	podInformer.AddEventHandler(framework.ResourceEventHandlerFuncs{
		AddFunc: rm.addPod,
		// This invokes the rc for every pod change, eg: host assignment. Though this might seem like overkill
		// the most frequent pod update is status, and the associated rc will only list from local storage, so
		// it should be ok.
		UpdateFunc: rm.updatePod,
		DeleteFunc: rm.deletePod,
	})
	rm.podStore.Indexer = podInformer.GetIndexer()
	rm.podController = podInformer.GetController()

	rm.syncHandler = rm.syncReplicationController
	rm.podStoreSynced = rm.podController.HasSynced
	rm.lookupCache = controller.NewMatchingCache(lookupCacheSize)
	return rm
}
还是有细节要介绍,第一个是NewIndexerInformer
func NewIndexerInformer(
	lw cache.ListerWatcher,
	objType runtime.Object,
	resyncPeriod time.Duration,
	h ResourceEventHandler,
	indexers cache.Indexers,
) (cache.Indexer, *Controller) {
	// This will hold the client state, as we know it.
	clientState := cache.NewIndexer(DeletionHandlingMetaNamespaceKeyFunc, indexers)

	// This will hold incoming changes. Note how we pass clientState in as a
	// KeyLister, that way resync operations will result in the correct set
	// of update/delete deltas.
	fifo := cache.NewDeltaFIFO(cache.MetaNamespaceKeyFunc, nil, clientState)

	cfg := &Config{
		Queue:            fifo,//存放变化的资源(需要同步的资源)
		ListerWatcher:    lw,//获取和监控资源对象的变化
		ObjectType:       objType,
		FullResyncPeriod: resyncPeriod,
		RetryOnError:     false,

		Process: func(obj interface{}) error {
			// from oldest to newest
			for _, d := range obj.(cache.Deltas) {
				switch d.Type {
				case cache.Sync, cache.Added, cache.Updated:
					if old, exists, err := clientState.Get(d.Object); err == nil && exists {
						if err := clientState.Update(d.Object); err != nil {
							return err
						}
						h.OnUpdate(old, d.Object)//调用UpdateFunc
					} else {
						if err := clientState.Add(d.Object); err != nil {
							return err
						}
						h.OnAdd(d.Object)//调用AddFunc
					}
				case cache.Deleted:
					if err := clientState.Delete(d.Object); err != nil {
						return err
					}
					h.OnDelete(d.Object)//调用DeleteFunc
				}
			}
			return nil
		},
	}
	return clientState, New(cfg)
}
上面的方法返回的clientState是本地缓存client,为了避免反复请求apiserver。然后根据Config配置创建Controller。这个Config定义了lw监听和处理监听返回消息的方法Process,当处理不同的事件调用不同的方法。现在rc controller已经创建好了

创建完成之后启动,启动的方法如下

func (rm *ReplicationManager) Run(workers int, stopCh <-chan struct{}) {
	defer utilruntime.HandleCrash()
	glog.Infof("Starting RC Manager")
	go rm.rcController.Run(stopCh)
	go rm.podController.Run(stopCh)
	for i := 0; i < workers; i++ {
		go wait.Until(rm.worker, time.Second, stopCh)
	}

	if rm.internalPodInformer != nil {
		go rm.internalPodInformer.Run(stopCh)
	}

	<-stopCh
	glog.Infof("Shutting down RC Manager")
	rm.queue.ShutDown()
}
这个run的方法是先启动两个controller,启动方法如下

func (c *Controller)  Run(stopCh <-chan struct{}) {
	defer utilruntime.HandleCrash()
	r := cache.NewReflector(
		c.config.ListerWatcher,
		c.config.ObjectType,
		c.config.Queue,//FIFO
		c.config.FullResyncPeriod,
	)

	c.reflectorMutex.Lock()
	c.reflector = r
	c.reflectorMutex.Unlock()

	r.RunUntil(stopCh)

	wait.Until(c.processLoop, time.Second, stopCh)//每秒调用一次
}
上面的方法就是通过Reflector的ListAndWatch将变化的rc或者pod放到本地FIFO当中详细部分后面再逐一介绍

然后启动多个work线程去处理,FIFO中待处理的任务,通过syncReplicationController里面的manageReplicas

// manageReplicas checks and updates replicas for the given replication controller.
func (rm *ReplicationManager) manageReplicas(filteredPods []*api.Pod, rc *api.ReplicationController) {
	diff := len(filteredPods) - int(rc.Spec.Replicas)
	rcKey, err := controller.KeyFunc(rc)
	if err != nil {
		glog.Errorf("Couldn't get key for replication controller %#v: %v", rc, err)
		return
	}
	if diff < 0 {
		diff *= -1
		if diff > rm.burstReplicas {
			diff = rm.burstReplicas
		}
		// TODO: Track UIDs of creates just like deletes. The problem currently
		// is we'd need to wait on the result of a create to record the pod's
		// UID, which would require locking *across* the create, which will turn
		// into a performance bottleneck. We should generate a UID for the pod
		// beforehand and store it via ExpectCreations.
		rm.expectations.ExpectCreations(rcKey, diff)
		var wg sync.WaitGroup
		wg.Add(diff)
		glog.V(2).Infof("Too few %q/%q replicas, need %d, creating %d", rc.Namespace, rc.Name, rc.Spec.Replicas, diff)
		for i := 0; i < diff; i++ {
			go func() {
				defer wg.Done()
				var err error
				if rm.garbageCollectorEnabled {
					var trueVar = true
					controllerRef := &api.OwnerReference{
						APIVersion: getRCKind().GroupVersion().String(),
						Kind:       getRCKind().Kind,
						Name:       rc.Name,
						UID:        rc.UID,
						Controller: &trueVar,
					}
					err = rm.podControl.CreatePodsWithControllerRef(rc.Namespace, rc.Spec.Template, rc, controllerRef)
				} else {
					err = rm.podControl.CreatePods(rc.Namespace, rc.Spec.Template, rc)
				}
				if err != nil {
					// Decrement the expected number of creates because the informer won't observe this pod
					glog.V(2).Infof("Failed creation, decrementing expectations for controller %q/%q", rc.Namespace, rc.Name)
					rm.expectations.CreationObserved(rcKey)
					rm.enqueueController(rc)
					utilruntime.HandleError(err)
				}
			}()
		}
		wg.Wait()
	} else if diff > 0 {
		if diff > rm.burstReplicas {
			diff = rm.burstReplicas
		}
		glog.V(2).Infof("Too many %q/%q replicas, need %d, deleting %d", rc.Namespace, rc.Name, rc.Spec.Replicas, diff)
		// No need to sort pods if we are about to delete all of them
		if rc.Spec.Replicas != 0 {
			// Sort the pods in the order such that not-ready < ready, unscheduled
			// < scheduled, and pending < running. This ensures that we delete pods
			// in the earlier stages whenever possible.
			sort.Sort(controller.ActivePods(filteredPods))
		}
		// Snapshot the UIDs (ns/name) of the pods we're expecting to see
		// deleted, so we know to record their expectations exactly once either
		// when we see it as an update of the deletion timestamp, or as a delete.
		// Note that if the labels on a pod/rc change in a way that the pod gets
		// orphaned, the rs will only wake up after the expectations have
		// expired even if other pods are deleted.
		deletedPodKeys := []string{}
		for i := 0; i < diff; i++ {
			deletedPodKeys = append(deletedPodKeys, controller.PodKey(filteredPods[i]))
		}
		// We use pod namespace/name as a UID to wait for deletions, so if the
		// labels on a pod/rc change in a way that the pod gets orphaned, the
		// rc will only wake up after the expectation has expired.
		rm.expectations.ExpectDeletions(rcKey, deletedPodKeys)
		var wg sync.WaitGroup
		wg.Add(diff)
		for i := 0; i < diff; i++ {
			go func(ix int) {
				defer wg.Done()
				if err := rm.podControl.DeletePod(rc.Namespace, filteredPods[ix].Name, rc); err != nil {
					// Decrement the expected number of deletes because the informer won't observe this deletion
					podKey := controller.PodKey(filteredPods[ix])
					glog.V(2).Infof("Failed to delete %v due to %v, decrementing expectations for controller %q/%q", podKey, err, rc.Namespace, rc.Name)
					rm.expectations.DeletionObserved(rcKey, podKey)
					rm.enqueueController(rc)
					utilruntime.HandleError(err)
				}
			}(i)
		}
		wg.Wait()
	}
}

上面的方法完成对具体pod增删







你可能感兴趣的:(Kubernetes)