【kubernetes/k8s源码分析】 controller-manager之StatefulSet源码分析

StatefulSet和Deployment的区别

Deployment部署无状态服务,StatefulSet部署有状态服务

官方给出的建议是,如果部署的应用满足以下一个或多个部署需求,则建议使用StatefulSet

  • 稳定的、唯一的网络标识。
  • 稳定的、持久的存储。
  • 有序的、优雅的部署和伸缩。
  • 有序的、优雅的删除和停止。
  • 有序的、自动的滚动更新。

     稳定主要是针对Pod发生re-schedule后仍然要保持之前的网络标识和持久化存储。网络标识包括hostname、集群内DNS中该Pod对应的域名,并不能保证Pod re-schedule之后IP不变。如果使用Deployment,需要考虑滚动更新的过程中的参数控制(maxSurge、maxUnavailable)、每个应用的IP池预留造成的IP浪费等等问题

0. 从这里开始啊

// NewControllerInitializers is a public map of named controller groups (you can start more than one in an init func)
// paired to their InitFunc.  This allows for structured downstream composition and subdivision.
func NewControllerInitializers() map[string]InitFunc {
	controllers := map[string]InitFunc{}
	
	controllers["statefulset"] = startStatefulSetController
	
	return controllers
}

  0.1 startStatefulsetController函数

  •   初始化一个StatefulSetController对象。最后启动该对象的Run时,进入循环处理流程
func startStatefulSetController(ctx ControllerContext) (bool, error) {
	if !ctx.AvailableResources[schema.GroupVersionResource{Group: "apps", Version: "v1beta1", Resource: "statefulsets"}] {
		return false, nil
	}
	go statefulset.NewStatefulSetController(
		ctx.InformerFactory.Core().V1().Pods(),
		ctx.InformerFactory.Apps().V1beta1().StatefulSets(),
		ctx.InformerFactory.Core().V1().PersistentVolumeClaims(),
		ctx.InformerFactory.Apps().V1beta1().ControllerRevisions(),
		ctx.ClientBuilder.ClientOrDie("statefulset-controller"),
	).Run(1, ctx.Stop)
	return true, nil
}

1. NewStatefulSetController函数

    路径: pkg/controller/statefulset/stateful_set.go,主要ListWatch Pod和StatefulSet

  1.1 创建stattefulsetController结构体,包括queue

	ssc := &StatefulSetController{
		kubeClient: kubeClient,
		control: NewDefaultStatefulSetControl(
			NewRealStatefulPodControl(
				kubeClient,
				setInformer.Lister(),
				podInformer.Lister(),
				pvcInformer.Lister(),
				recorder),
			NewRealStatefulSetStatusUpdater(kubeClient, setInformer.Lister()),
			history.NewHistory(kubeClient, revInformer.Lister()),
		),
		pvcListerSynced: pvcInformer.Informer().HasSynced,
		queue:           workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "statefulset"),
		podControl:      controller.RealPodControl{KubeClient: kubeClient, Recorder: recorder},

		revListerSynced: revInformer.Informer().HasSynced,
	}

  1.2 PodInformer注册了add update delete EventHandler,会将Pod对应的StatefulSet加入到queue中

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		// lookup the statefulset and enqueue
		AddFunc: ssc.addPod,
		// lookup current and old statefulset if labels changed
		UpdateFunc: ssc.updatePod,
		// lookup statefulset accounting for deletion tombstones
		DeleteFunc: ssc.deletePod,
	})

  1.3 SetInformer注册了add update EventHandler,将StatefulSet加入到 queue中

	setInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc: ssc.enqueueStatefulSet,
			UpdateFunc: func(old, cur interface{}) {
				oldPS := old.(*apps.StatefulSet)
				curPS := cur.(*apps.StatefulSet)
				if oldPS.Status.Replicas != curPS.Status.Replicas {
					glog.V(4).Infof("Observed updated replica count for StatefulSet: %v, %d->%d", curPS.Name, oldPS.Status.Replicas, curPS.Status.Replicas)
				}
				ssc.enqueueStatefulSet(cur)
			},
			DeleteFunc: ssc.enqueueStatefulSet,
		},
		statefulSetResyncPeriod,
	)
	ssc.setLister = setInformer.Lister()
	ssc.setListerSynced = setInformer.Informer().HasSynced

2. func (ssc *StatefulSetController) sync(key string) error

   从 queue中pop一个StatefulSet对象,然后执行sync进行操作

  2.1 调statefulset的Get API得到statefulSet对象

	set, err := ssc.setLister.StatefulSets(namespace).Get(name)
	if errors.IsNotFound(err) {
		glog.Infof("StatefulSet has been deleted %v", key)
		return nil
	}

  2.2 sync中

  • 根据setLabel匹配出所有revisions、然后检查这些revisions中是否有OwnerReference为空的,如果有,那说明存在Orphaned的Revisions。
  • 调用getPodsForStatefulSet获取StatefulSet应该管理的Pods
	selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("error converting StatefulSet %v selector: %v", key, err))
		// This is a non-transient error, so don't retry.
		return nil
	}

	if err := ssc.adoptOrphanRevisions(set); err != nil {
		return err
	}

	pods, err := ssc.getPodsForStatefulSet(set, selector)
	if err != nil {
		return err
	}

	return ssc.syncStatefulSet(set, pods)

3.(ssc *defaultStatefulSetControl) UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error函数

  3.1 获取currentRevision和updateRevision

	// list all revisions and sort them
	revisions, err := ssc.ListRevisions(set)
	if err != nil {
		return err
	}
	history.SortControllerRevisions(revisions)

	// get the current, and update revisions
	currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions)
	if err != nil {
		return err
	}

  3.2  updateStatefulSet函数

    第四节讲解 

	// perform the main update function and get the status
	status, err := ssc.updateStatefulSet(set, currentRevision, updateRevision, collisionCount, pods)
	if err != nil {
		return err
	}

  3.3 最终调用API更新statefulset状态

func (ssc *defaultStatefulSetControl) updateStatefulSetStatus(
	set *apps.StatefulSet,
	status *apps.StatefulSetStatus) error {

	// complete any in progress rolling update if necessary
	completeRollingUpdate(set, status)

	// if the status is not inconsistent do not perform an update
	if !inconsistentStatus(set, status) {
		return nil
	}

	// copy set and update its status
	set = set.DeepCopy()
	if err := ssc.statusUpdater.UpdateStatefulSetStatus(set, status); err != nil {
		return err
	}

	return nil
}

4. func (ssc *defaultStatefulSetControl) updateStatefulSet(
   set *apps.StatefulSet,
   currentRevision *apps.ControllerRevision,
   updateRevision *apps.ControllerRevision,
   collisionCount int32,
   pods []*v1.Pod) (*apps.StatefulSetStatus, error)函数

 4.1 获取currentRevision和updateRevision

    并设置generation,currentRevision, updateRevision等信息到StatefulSet status

	// get the current and update revisions of the set.
	currentSet, err := ApplyRevision(set, currentRevision)
	if err != nil {
		return nil, err
	}
	updateSet, err := ApplyRevision(set, updateRevision)
	if err != nil {
		return nil, err
	}

	// set the generation, and revisions in the returned status
	status := apps.StatefulSetStatus{}
	status.ObservedGeneration = new(int64)
	*status.ObservedGeneration = set.Generation
	status.CurrentRevision = currentRevision.Name
	status.UpdateRevision = updateRevision.Name
	status.CollisionCount = new(int32)
	*status.CollisionCount = collisionCount

 4.2 pods分成两个slice

  • replicas: 0 <= getOrdinal(pod) < set.Spec.Replicas
  • condemned: set.Spec.Replicas <= getOrdinal(pod)
replicaCount := int(*set.Spec.Replicas)
// slice that will contain all Pods such that 0 <= getOrdinal(pod) < set.Spec.Replicas
replicas := make([]*v1.Pod, replicaCount)
// slice that will contain all Pods such that set.Spec.Replicas <= getOrdinal(pod)
condemned := make([]*v1.Pod, 0, len(pods))
unhealthy := 0
firstUnhealthyOrdinal := math.MaxInt32

 4.3 将pod分为两个slice,replicas与condemned

	// First we partition pods into two lists valid replicas and condemned Pods
	for i := range pods {
		status.Replicas++

		// count the number of running and ready replicas
		if isRunningAndReady(pods[i]) {
			status.ReadyReplicas++
		}

		// count the number of current and update replicas
		if isCreated(pods[i]) && !isTerminating(pods[i]) {
			if getPodRevision(pods[i]) == currentRevision.Name {
				status.CurrentReplicas++
			} else if getPodRevision(pods[i]) == updateRevision.Name {
				status.UpdatedReplicas++
			}
		}

		if ord := getOrdinal(pods[i]); 0 <= ord && ord < replicaCount {
			// if the ordinal of the pod is within the range of the current number of replicas,
			// insert it at the indirection of its ordinal
			replicas[ord] = pods[i]

		} else if ord >= replicaCount {
			// if the ordinal is greater than the number of replicas add it to the condemned list
			condemned = append(condemned, pods[i])
		}
		// If the ordinal could not be parsed (ord < 0), ignore the Pod.
	}

 4.4 从repilcas和condemned中找出第一个unhealthy的Pod

   ordinal最小的unhealth pod

// find the first unhealthy Pod
	for i := range replicas {
		if !isHealthy(replicas[i]) {
			unhealthy++
			if ord := getOrdinal(replicas[i]); ord < firstUnhealthyOrdinal {
				firstUnhealthyOrdinal = ord
				firstUnhealthyPod = replicas[i]
			}
		}
	}

	for i := range condemned {
		if !isHealthy(condemned[i]) {
			unhealthy++
			if ord := getOrdinal(condemned[i]); ord < firstUnhealthyOrdinal {
				firstUnhealthyOrdinal = ord
				firstUnhealthyPod = condemned[i]
			}
		}
	}

  判断healthy:

【kubernetes/k8s源码分析】 controller-manager之StatefulSet源码分析_第1张图片

  4.5 DeletionTimestamp不空为删除的pod,直接返回,monotonic是否设置并行,为设置则为串行

	// If the StatefulSet is being deleted, don't do anything other than updating
	// status.
	if set.DeletionTimestamp != nil {
		return &status, nil
	}

	monotonic := !allowsBurst(set)

  4.6 遍历valid replicas

  • pod Failed (pod.Status.Phase = Failed), 删除并new这个pod object(注意revisions匹配)pod.Status.Phase = "",pod还没有recreate,则Create,ParallelPodManagement = OrderedReady,则直接返回当前status。ParallelPodManagement = Parallel,则for下一个
  • pod正在删除且ParallelPodManagement = OrderedReady,返回status
  • pod不是RunningAndReady状态,且ParallelPodManagement = OrderedReady,返回status
  • pod与statefulset的identity和storage是否匹配,不匹配则调用apiserver Update Stateful Pod进行updateIdentity和updateStorage(创建PVC)
	// Examine each replica with respect to its ordinal
	for i := range replicas {
		// delete and recreate failed pods
		if isFailed(replicas[i]) {
			
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[i]); err != nil {
				return &status, err
			}
			if getPodRevision(replicas[i]) == currentRevision.Name {
				status.CurrentReplicas--
			} else if getPodRevision(replicas[i]) == updateRevision.Name {
				status.UpdatedReplicas--
			}
			status.Replicas--
			replicas[i] = newVersionedStatefulSetPod(
				currentSet,
				updateSet,
				currentRevision.Name,
				updateRevision.Name,
				i)
		}
		// If we find a Pod that has not been created we create the Pod
		if !isCreated(replicas[i]) {
			
		}
		
		if isTerminating(replicas[i]) && monotonic {
		
		}
	
		if !isRunningAndReady(replicas[i]) && monotonic {
			
		}
	}

  4.7 遍历condemned replicas,index由大到小都被删除

  •   清理isTerminating状态pod.Status.Phase == v1.PodFailed
  •   DeleteStatefulPod调用API接口删除pod
	for target := len(condemned) - 1; target >= 0; target-- {
		// wait for terminating pods to expire
		if isTerminating(condemned[target]) {
			
			// block if we are in monotonic mode
			if monotonic {
				return &status, nil
			}
			continue
		}
		// if we are in monotonic mode and the condemned target is not the first unhealthy Pod block
		if !isRunningAndReady(condemned[target]) && monotonic && condemned[target] != firstUnhealthyPod {
			
			return &status, nil
		}


		if err := ssc.podControl.DeleteStatefulPod(set, condemned[target]); err != nil {
			return &status, err
		}
		if getPodRevision(condemned[target]) == currentRevision.Name {
			status.CurrentReplicas--
		} else if getPodRevision(condemned[target]) == updateRevision.Name {
			status.UpdatedReplicas--
		}
		if monotonic {
			return &status, nil
		}
	}

  4.8 UpdateStrategy Type是OnDelete

  •  如果UpdateStrategy Type是OnDelete,只有Pods被手动删除后,才会触发Recreate
	// for the OnDelete strategy we short circuit. Pods will be updated when they are manually deleted.
	if set.Spec.UpdateStrategy.Type == apps.OnDeleteStatefulSetStrategyType {
		return &status, nil
	}

  4.9 

  • RollingUpdate:Partition不设置就相当于0,相当于全部pods进行更新
  • pod revision不是updateRevision且不是正在删除的,则删除这个pod
  • pod不是healthy的,直接返回状态
	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
	updateMin := 0
	if set.Spec.UpdateStrategy.RollingUpdate != nil {
		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
	}
	// we terminate the Pod with the largest ordinal that does not match the update revision.
	for target := len(replicas) - 1; target >= updateMin; target-- {

		// delete the Pod if it is not already terminating and does not match the update revision.
		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
			glog.V(4).Infof("StatefulSet %s/%s terminating Pod %s for update",
				set.Namespace,
				set.Name,
				replicas[target].Name)
			err := ssc.podControl.DeleteStatefulPod(set, replicas[target])
			status.CurrentReplicas--
			return &status, err
		}

		// wait for unhealthy Pods on update
		if !isHealthy(replicas[target]) {
			glog.V(4).Infof(
				"StatefulSet %s/%s is waiting for Pod %s to update",
				set.Namespace,
				set.Name,
				replicas[target].Name)
			return &status, nil
		}

	}

5. newVersionedStatefulSetPod函数

【kubernetes/k8s源码分析】 controller-manager之StatefulSet源码分析_第2张图片

  • 更新策略是RollingUpdate且Partition为0或者ordinal < Partition,则使用currentRevision创建该Pod Object。
  • 其余使用updateRevision
func newVersionedStatefulSetPod(currentSet, updateSet *apps.StatefulSet, currentRevision, updateRevision string, ordinal int) *v1.Pod {
	if currentSet.Spec.UpdateStrategy.Type == apps.RollingUpdateStatefulSetStrategyType &&
		(currentSet.Spec.UpdateStrategy.RollingUpdate == nil && ordinal < int(currentSet.Status.CurrentReplicas)) ||
		(currentSet.Spec.UpdateStrategy.RollingUpdate != nil && ordinal < int(*currentSet.Spec.UpdateStrategy.RollingUpdate.Partition)) {
		pod := newStatefulSetPod(currentSet, ordinal)
		setPodRevision(pod, currentRevision)
		return pod
	}
	pod := newStatefulSetPod(updateSet, ordinal)
	setPodRevision(pod, updateRevision)
	return pod
}

 

 

你可能感兴趣的:(【kubernetes/k8s源码分析】 controller-manager之StatefulSet源码分析)