照旧先是在cmd里面创建CMServer然后启动,看代码如下
func main() {
s := options.NewCMServer()
s.AddFlags(pflag.CommandLine)
flag.InitFlags()
util.InitLogs()
defer util.FlushLogs()
verflag.PrintAndExitIfRequested()
if err := app.Run(s); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
}
在创建里面首先是一堆配置,主要是配置work线程数,将在run方法里面启动对应个数的线程,
func NewCMServer() *CMServer {
s := CMServer{
KubeControllerManagerConfiguration: componentconfig.KubeControllerManagerConfiguration{
Port: ports.ControllerManagerPort,
Address: "0.0.0.0",
ConcurrentEndpointSyncs: 5,//Endpoint controller线程数
ConcurrentServiceSyncs: 1,//service controller线程数
ConcurrentRCSyncs: 5,//RC controller线程数
ConcurrentRSSyncs: 5,//RS controller线程数
ConcurrentDaemonSetSyncs: 2,//DaemonSet controller线程数
ConcurrentJobSyncs: 5,//job controller线程数
ConcurrentResourceQuotaSyncs: 5,//资源配额 controller线程数
ConcurrentDeploymentSyncs: 5,//Deployment controller线程数
ConcurrentNamespaceSyncs: 2,//命名空间 controller线程数
ConcurrentSATokenSyncs: 5,
LookupCacheSizeForRC: 4096,
LookupCacheSizeForRS: 4096,
LookupCacheSizeForDaemonSet: 1024,
ServiceSyncPeriod: unversioned.Duration{Duration: 5 * time.Minute},
NodeSyncPeriod: unversioned.Duration{Duration: 10 * time.Second},
ResourceQuotaSyncPeriod: unversioned.Duration{Duration: 5 * time.Minute},
NamespaceSyncPeriod: unversioned.Duration{Duration: 5 * time.Minute},
PVClaimBinderSyncPeriod: unversioned.Duration{Duration: 15 * time.Second},
HorizontalPodAutoscalerSyncPeriod: unversioned.Duration{Duration: 30 * time.Second},
DeploymentControllerSyncPeriod: unversioned.Duration{Duration: 30 * time.Second},
MinResyncPeriod: unversioned.Duration{Duration: 12 * time.Hour},
RegisterRetryCount: 10,
PodEvictionTimeout: unversioned.Duration{Duration: 5 * time.Minute},
NodeMonitorGracePeriod: unversioned.Duration{Duration: 40 * time.Second},
NodeStartupGracePeriod: unversioned.Duration{Duration: 60 * time.Second},
NodeMonitorPeriod: unversioned.Duration{Duration: 5 * time.Second},
ClusterName: "kubernetes",
NodeCIDRMaskSize: 24,
ConfigureCloudRoutes: true,
TerminatedPodGCThreshold: 12500,
VolumeConfiguration: componentconfig.VolumeConfiguration{
EnableHostPathProvisioning: false,
EnableDynamicProvisioning: true,
PersistentVolumeRecyclerConfiguration: componentconfig.PersistentVolumeRecyclerConfiguration{
MaximumRetry: 3,
MinimumTimeoutNFS: 300,
IncrementTimeoutNFS: 30,
MinimumTimeoutHostPath: 60,
IncrementTimeoutHostPath: 30,
},
FlexVolumePluginDir: "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/",
},
ContentType: "application/vnd.kubernetes.protobuf",
KubeAPIQPS: 20.0,
KubeAPIBurst: 30,
LeaderElection: leaderelection.DefaultLeaderElectionConfiguration(),
ControllerStartInterval: unversioned.Duration{Duration: 0 * time.Second},
EnableGarbageCollector: false,
ConcurrentGCSyncs: 5,
ClusterSigningCertFile: "/etc/kubernetes/ca/ca.pem",
ClusterSigningKeyFile: "/etc/kubernetes/ca/ca.key",
},
}
s.LeaderElection.LeaderElect = true
return &s
}
// 启动CMServer. This should never exit.
func Run(s *options.CMServer) error {
if c, err := configz.New("componentconfig"); err == nil {
c.Set(s.KubeControllerManagerConfiguration)
} else {
glog.Errorf("unable to register configz: %s", err)
}
kubeconfig, err := clientcmd.BuildConfigFromFlags(s.Master, s.Kubeconfig)
if err != nil {
return err
}
kubeconfig.ContentConfig.ContentType = s.ContentType
// Override kubeconfig qps/burst settings from flags
kubeconfig.QPS = s.KubeAPIQPS
kubeconfig.Burst = int(s.KubeAPIBurst)
kubeClient, err := client.New(kubeconfig)
if err != nil {
glog.Fatalf("Invalid API configuration: %v", err)
}
//启动server,这些都是系统监控的api,如/metrics可以记录api调用次数
go func() {
mux := http.NewServeMux()
healthz.InstallHandler(mux)
if s.EnableProfiling {
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
}
configz.InstallHandler(mux)
mux.Handle("/metrics", prometheus.Handler())
server := &http.Server{
Addr: net.JoinHostPort(s.Address, strconv.Itoa(int(s.Port))),
Handler: mux,
}
glog.Fatal(server.ListenAndServe())
}()
//这里定义了启动controllers的启动方法。下面有调用这个方法
run := func(stop <-chan struct{}) {
err := StartControllers(s, kubeClient, kubeconfig, stop)
glog.Fatalf("error running controllers: %v", err)
panic("unreachable")
}
//k8s的HA方案中CM是一主多备,如果自己不是leader则不启动CM
if !s.LeaderElection.LeaderElect {
run(nil)
panic("unreachable")
}
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(glog.Infof)
eventBroadcaster.StartRecordingToSink(kubeClient.Events(""))
recorder := eventBroadcaster.NewRecorder(api.EventSource{Component: "controller-manager"})
id, err := os.Hostname()
if err != nil {
return err
}
leaderelection.RunOrDie(leaderelection.LeaderElectionConfig{
EndpointsMeta: api.ObjectMeta{
Namespace: "kube-system",
Name: "kube-controller-manager",
},
Client: kubeClient,
Identity: id,
EventRecorder: recorder,
LeaseDuration: s.LeaderElection.LeaseDuration.Duration,
RenewDeadline: s.LeaderElection.RenewDeadline.Duration,
RetryPeriod: s.LeaderElection.RetryPeriod.Duration,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: run,//这里使用上面定义的run方法如果自己是leader启动controllers
OnStoppedLeading: func() {
glog.Fatalf("leaderelection lost")
},
},
})
panic("unreachable")
}
func StartControllers(s *options.CMServer, kubeClient *client.Client, kubeconfig *restclient.Config, stop <-chan struct{}) error {
sharedInformers := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(kubeconfig, "shared-informers")), ResyncPeriod(s)())
//创建并启动endpoint controller
go endpointcontroller.NewEndpointController(sharedInformers.Pods().Informer(), clientset.NewForConfigOrDie(restclient.AddUserAgent(kubeconfig, "endpoint-controller"))).
Run(int(s.ConcurrentEndpointSyncs), wait.NeverStop)
time.Sleep(wait.Jitter(s.ControllerStartInterval.Duration, ControllerStartJitter))
//创建并启动replication controller
go replicationcontroller.NewReplicationManager(
sharedInformers.Pods().Informer(),
clientset.NewForConfigOrDie(restclient.AddUserAgent(kubeconfig, "replication-controller")),
ResyncPeriod(s),
replicationcontroller.BurstReplicas,
int(s.LookupCacheSizeForRC),
s.EnableGarbageCollector,
).Run(int(s.ConcurrentRCSyncs), wait.NeverStop)
这样就逐一创建并启动各个controller了
第二个细节是选主leaderelection.RunOrDie这个是选主的机制,在以后详细介绍利用etcd选主过程,这里简单提一下,他们启动的时候都会抢先注册自己为leader,当然只有一个会成功,其它的节点watch,如果leader挂了后其它节点会成为leader启动自己CM
func (le *LeaderElector) acquire() {
stop := make(chan struct{})
wait.JitterUntil(func() {
succeeded := le.tryAcquireOrRenew()
le.maybeReportTransition()
if !succeeded {
glog.V(4).Infof("failed to renew lease %v/%v", le.config.EndpointsMeta.Namespace, le.config.EndpointsMeta.Name)
return
}
le.config.EventRecorder.Eventf(&api.Endpoints{ObjectMeta: le.config.EndpointsMeta}, api.EventTypeNormal, "%v became leader", le.config.Identity)
glog.Infof("sucessfully acquired lease %v/%v", le.config.EndpointsMeta.Namespace, le.config.EndpointsMeta.Name)
close(stop)
}, le.config.RetryPeriod, JitterFactor, true, stop)
}
DefaultLeaseDuration = 15 * time.Second
DefaultRenewDeadline = 10 * time.Second
DefaultRetryPeriod = 2 * time.Second
这些都是选主过程使用的,譬如第一个就是租约时间,如果你是leader但这个时效只有15秒,ok解释清楚上面两个问题,下面就针对replication controller介绍一下创建和启动
首先看struct的定义
type ReplicationManager struct {
kubeClient clientset.Interface//访问apiserver的客户端
podControl controller.PodControlInterface//pod操作函数的封装,在controller_util.go里面定义实现
//CreatePods、CreatePodsOnNode、DeletePod等pod操作方法,这些方法都由RealPodControl去调用apiserver完成创建实现
// internalPodInformer is used to hold a personal informer. If we're using
// a normal shared informer, then the informer will be started for us. If
// we have a personal informer, we must start it ourselves. If you start
// the controller using NewReplicationManager(passing SharedInformer), this
// will be null
internalPodInformer framework.SharedIndexInformer
// An rc is temporarily suspended after creating/deleting these many replicas.
// It resumes normal action after observing the watch events for them.
burstReplicas int
// To allow injection of syncReplicationController for testing.
syncHandler func(rcKey string) error
// A TTLCache of pod creates/deletes each rc expects to see.
expectations *controller.UIDTrackingControllerExpectations
// A store of replication controllers, populated by the rcController
rcStore cache.StoreToReplicationControllerLister
// 监控RC的变化,实现RC同步的任务调度逻辑
rcController *framework.Controller
// A store of pods, populated by the podController
podStore cache.StoreToPodLister
//监控pod绑定变化,实现pod同步的任务调度逻辑
podController framework.ControllerInterface
// podStoreSynced如果最新一次完成同步则返回true
// Added as a member to the struct to allow injection for testing.
podStoreSynced func() bool
lookupCache *controller.MatchingCache
// Controllers that need to be synced
queue *workqueue.Type
// garbageCollectorEnabled denotes if the garbage collector is enabled. RC
// manager behaves differently if GC is enabled.
garbageCollectorEnabled bool
}
然后看创建
func newReplicationManager(eventRecorder record.EventRecorder, podInformer framework.SharedIndexInformer, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, burstReplicas int, lookupCacheSize int, garbageCollectorEnabled bool) *ReplicationManager {
if kubeClient != nil && kubeClient.Core().GetRESTClient().GetRateLimiter() != nil {
metrics.RegisterMetricAndTrackRateLimiterUsage("replication_controller", kubeClient.Core().GetRESTClient().GetRateLimiter())
}
//创建ReplicationManager,这里面包括了podControl,它定义了pod的创建等操作
rm := &ReplicationManager{
kubeClient: kubeClient,
podControl: controller.RealPodControl{
KubeClient: kubeClient,
Recorder: eventRecorder,
},
burstReplicas: burstReplicas,
expectations: controller.NewUIDTrackingControllerExpectations(controller.NewControllerExpectations()),
queue: workqueue.New(),
garbageCollectorEnabled: garbageCollectorEnabled,
}
//这里通过NewIndexerInformer返回两个重要的变量,rcStore是缓存rc信息的,rcController是rc的控制器
rm.rcStore.Indexer, rm.rcController = framework.NewIndexerInformer(
&cache.ListWatch{
ListFunc: func(options api.ListOptions) (runtime.Object, error) {
return rm.kubeClient.Core().ReplicationControllers(api.NamespaceAll).List(options)
},
WatchFunc: func(options api.ListOptions) (watch.Interface, error) {
return rm.kubeClient.Core().ReplicationControllers(api.NamespaceAll).Watch(options)
},
},
&api.ReplicationController{},
// TODO: Can we have much longer period here?
FullControllerResyncPeriod,
framework.ResourceEventHandlerFuncs{
AddFunc: rm.enqueueController,
UpdateFunc: rm.updateRC,
// This will enter the sync loop and no-op, because the controller has been deleted from the store.
// Note that deleting a controller immediately after scaling it to 0 will not work. The recommended
// way of achieving this is by performing a `stop` operation on the controller.
DeleteFunc: rm.enqueueController,
},
cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc},
)
//添加事件处理handler方法,针对于不同的event调用不同方法
podInformer.AddEventHandler(framework.ResourceEventHandlerFuncs{
AddFunc: rm.addPod,
// This invokes the rc for every pod change, eg: host assignment. Though this might seem like overkill
// the most frequent pod update is status, and the associated rc will only list from local storage, so
// it should be ok.
UpdateFunc: rm.updatePod,
DeleteFunc: rm.deletePod,
})
rm.podStore.Indexer = podInformer.GetIndexer()
rm.podController = podInformer.GetController()
rm.syncHandler = rm.syncReplicationController
rm.podStoreSynced = rm.podController.HasSynced
rm.lookupCache = controller.NewMatchingCache(lookupCacheSize)
return rm
}
还是有细节要介绍,第一个是NewIndexerInformer
func NewIndexerInformer(
lw cache.ListerWatcher,
objType runtime.Object,
resyncPeriod time.Duration,
h ResourceEventHandler,
indexers cache.Indexers,
) (cache.Indexer, *Controller) {
// This will hold the client state, as we know it.
clientState := cache.NewIndexer(DeletionHandlingMetaNamespaceKeyFunc, indexers)
// This will hold incoming changes. Note how we pass clientState in as a
// KeyLister, that way resync operations will result in the correct set
// of update/delete deltas.
fifo := cache.NewDeltaFIFO(cache.MetaNamespaceKeyFunc, nil, clientState)
cfg := &Config{
Queue: fifo,//存放变化的资源(需要同步的资源)
ListerWatcher: lw,//获取和监控资源对象的变化
ObjectType: objType,
FullResyncPeriod: resyncPeriod,
RetryOnError: false,
Process: func(obj interface{}) error {
// from oldest to newest
for _, d := range obj.(cache.Deltas) {
switch d.Type {
case cache.Sync, cache.Added, cache.Updated:
if old, exists, err := clientState.Get(d.Object); err == nil && exists {
if err := clientState.Update(d.Object); err != nil {
return err
}
h.OnUpdate(old, d.Object)//调用UpdateFunc
} else {
if err := clientState.Add(d.Object); err != nil {
return err
}
h.OnAdd(d.Object)//调用AddFunc
}
case cache.Deleted:
if err := clientState.Delete(d.Object); err != nil {
return err
}
h.OnDelete(d.Object)//调用DeleteFunc
}
}
return nil
},
}
return clientState, New(cfg)
}
上面的方法返回的clientState是本地缓存client,为了避免反复请求apiserver。然后根据Config配置创建Controller。这个Config定义了lw监听和处理监听返回消息的方法Process,当处理不同的事件调用不同的方法。现在rc controller已经创建好了
创建完成之后启动,启动的方法如下
func (rm *ReplicationManager) Run(workers int, stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
glog.Infof("Starting RC Manager")
go rm.rcController.Run(stopCh)
go rm.podController.Run(stopCh)
for i := 0; i < workers; i++ {
go wait.Until(rm.worker, time.Second, stopCh)
}
if rm.internalPodInformer != nil {
go rm.internalPodInformer.Run(stopCh)
}
<-stopCh
glog.Infof("Shutting down RC Manager")
rm.queue.ShutDown()
}
这个run的方法是先启动两个controller,启动方法如下
func (c *Controller) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
r := cache.NewReflector(
c.config.ListerWatcher,
c.config.ObjectType,
c.config.Queue,//FIFO
c.config.FullResyncPeriod,
)
c.reflectorMutex.Lock()
c.reflector = r
c.reflectorMutex.Unlock()
r.RunUntil(stopCh)
wait.Until(c.processLoop, time.Second, stopCh)//每秒调用一次
}
上面的方法就是通过Reflector的ListAndWatch将变化的rc或者pod放到本地FIFO当中详细部分后面再逐一介绍
然后启动多个work线程去处理,FIFO中待处理的任务,通过syncReplicationController里面的manageReplicas
// manageReplicas checks and updates replicas for the given replication controller.
func (rm *ReplicationManager) manageReplicas(filteredPods []*api.Pod, rc *api.ReplicationController) {
diff := len(filteredPods) - int(rc.Spec.Replicas)
rcKey, err := controller.KeyFunc(rc)
if err != nil {
glog.Errorf("Couldn't get key for replication controller %#v: %v", rc, err)
return
}
if diff < 0 {
diff *= -1
if diff > rm.burstReplicas {
diff = rm.burstReplicas
}
// TODO: Track UIDs of creates just like deletes. The problem currently
// is we'd need to wait on the result of a create to record the pod's
// UID, which would require locking *across* the create, which will turn
// into a performance bottleneck. We should generate a UID for the pod
// beforehand and store it via ExpectCreations.
rm.expectations.ExpectCreations(rcKey, diff)
var wg sync.WaitGroup
wg.Add(diff)
glog.V(2).Infof("Too few %q/%q replicas, need %d, creating %d", rc.Namespace, rc.Name, rc.Spec.Replicas, diff)
for i := 0; i < diff; i++ {
go func() {
defer wg.Done()
var err error
if rm.garbageCollectorEnabled {
var trueVar = true
controllerRef := &api.OwnerReference{
APIVersion: getRCKind().GroupVersion().String(),
Kind: getRCKind().Kind,
Name: rc.Name,
UID: rc.UID,
Controller: &trueVar,
}
err = rm.podControl.CreatePodsWithControllerRef(rc.Namespace, rc.Spec.Template, rc, controllerRef)
} else {
err = rm.podControl.CreatePods(rc.Namespace, rc.Spec.Template, rc)
}
if err != nil {
// Decrement the expected number of creates because the informer won't observe this pod
glog.V(2).Infof("Failed creation, decrementing expectations for controller %q/%q", rc.Namespace, rc.Name)
rm.expectations.CreationObserved(rcKey)
rm.enqueueController(rc)
utilruntime.HandleError(err)
}
}()
}
wg.Wait()
} else if diff > 0 {
if diff > rm.burstReplicas {
diff = rm.burstReplicas
}
glog.V(2).Infof("Too many %q/%q replicas, need %d, deleting %d", rc.Namespace, rc.Name, rc.Spec.Replicas, diff)
// No need to sort pods if we are about to delete all of them
if rc.Spec.Replicas != 0 {
// Sort the pods in the order such that not-ready < ready, unscheduled
// < scheduled, and pending < running. This ensures that we delete pods
// in the earlier stages whenever possible.
sort.Sort(controller.ActivePods(filteredPods))
}
// Snapshot the UIDs (ns/name) of the pods we're expecting to see
// deleted, so we know to record their expectations exactly once either
// when we see it as an update of the deletion timestamp, or as a delete.
// Note that if the labels on a pod/rc change in a way that the pod gets
// orphaned, the rs will only wake up after the expectations have
// expired even if other pods are deleted.
deletedPodKeys := []string{}
for i := 0; i < diff; i++ {
deletedPodKeys = append(deletedPodKeys, controller.PodKey(filteredPods[i]))
}
// We use pod namespace/name as a UID to wait for deletions, so if the
// labels on a pod/rc change in a way that the pod gets orphaned, the
// rc will only wake up after the expectation has expired.
rm.expectations.ExpectDeletions(rcKey, deletedPodKeys)
var wg sync.WaitGroup
wg.Add(diff)
for i := 0; i < diff; i++ {
go func(ix int) {
defer wg.Done()
if err := rm.podControl.DeletePod(rc.Namespace, filteredPods[ix].Name, rc); err != nil {
// Decrement the expected number of deletes because the informer won't observe this deletion
podKey := controller.PodKey(filteredPods[ix])
glog.V(2).Infof("Failed to delete %v due to %v, decrementing expectations for controller %q/%q", podKey, err, rc.Namespace, rc.Name)
rm.expectations.DeletionObserved(rcKey, podKey)
rm.enqueueController(rc)
utilruntime.HandleError(err)
}
}(i)
}
wg.Wait()
}
}
上面的方法完成对具体pod增删