ReplicaSet syncReplicaSet的逻辑,首先找到rs下相关的pod
// 从informer中获取当前ns下所有的pod
allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
if err != nil {
return err
}
// 忽略不健康的pod
filteredPods := controller.FilterActivePods(allPods)
// 找到被该rs的labelSelector和ownerRefControllerBy选中的pod
filteredPods, err = rsc.claimPods(ctx, rs, selector, filteredPods)
if err != nil {
return err
}
计算Pod的健康状态
func IsPodActive(p *v1.Pod) bool {
return v1.PodSucceeded != p.Status.Phase &&
v1.PodFailed != p.Status.Phase &&
p.DeletionTimestamp == nil
}
计算当前实例数与期望实例数的diff,进而判断扩缩实例数
func (rsc *ReplicaSetController) manageReplicas(ctx context.Context, filteredPods []*v1.Pod, rs *apps.ReplicaSet) error {
// 计算当前实例数与期望实例数的diff
diff := len(filteredPods) - int(*(rs.Spec.Replicas))
rsKey, err := controller.KeyFunc(rs)
if err != nil {
utilruntime.HandleError(fmt.Errorf("couldn't get key for %v %#v: %v", rsc.Kind, rs, err))
return nil
}
// 扩容,新建实例数
if diff < 0 {
diff *= -1
// burstReplicas默认值为500
if diff > rsc.burstReplicas {
diff = rsc.burstReplicas
}
rsc.expectations.ExpectCreations(rsKey, diff)
klog.V(2).InfoS("Too few replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "creating", diff)
successfulCreations, err := slowStartBatch(diff, controller.SlowStartInitialBatchSize, func() error {
err := rsc.podControl.CreatePods(ctx, rs.Namespace, &rs.Spec.Template, rs, metav1.NewControllerRef(rs, rsc.GroupVersionKind))
if err != nil {
if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
return nil
}
}
return err
})
if skippedPods := diff - successfulCreations; skippedPods > 0 {
klog.V(2).Infof("Slow-start failure. Skipping creation of %d pods, decrementing expectations for %v %v/%v", skippedPods, rsc.Kind, rs.Namespace, rs.Name)
for i := 0; i < skippedPods; i++ {
// Decrement the expected number of creates because the informer won't observe this pod
rsc.expectations.CreationObserved(rsKey)
}
}
return err
} else if diff > 0 {
// 缩容,删除实例
// burstReplicas默认值为500
if diff > rsc.burstReplicas {
diff = rsc.burstReplicas
}
klog.V(2).InfoS("Too many replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "deleting", diff)
// 获取deployment ownerRefControllerBy的所有rs的所有pod
relatedPods, err := rsc.getIndirectlyRelatedPods(rs)
utilruntime.HandleError(err)
// 找到需要被删除的pod列表
podsToDelete := getPodsToDelete(filteredPods, relatedPods, diff)
// 在cache中记录需要被删除的podkey
rsc.expectations.ExpectDeletions(rsKey, getPodKeys(podsToDelete))
errCh := make(chan error, diff)
var wg sync.WaitGroup
wg.Add(diff)
for _, pod := range podsToDelete {
go func(targetPod *v1.Pod) {
defer wg.Done()
// 删除pod
if err := rsc.podControl.DeletePod(ctx, rs.Namespace, targetPod.Name, rs); err != nil {
// 从cache中移除已被删除的podkey
podKey := controller.PodKey(targetPod)
rsc.expectations.DeletionObserved(rsKey, podKey)
if !apierrors.IsNotFound(err) {
klog.V(2).Infof("Failed to delete %v, decremented expectations for %v %s/%s", podKey, rsc.Kind, rs.Namespace, rs.Name)
errCh <- err
}
}
}(pod)
}
wg.Wait()
select {
case err := <-errCh:
// all errors have been reported before and they're likely to be the same, so we'll only return the first one we hit.
if err != nil {
return err
}
default:
}
}
return nil
}
着重看一下getPodsToDelete方法
func getPodsToDelete(filteredPods, relatedPods []*v1.Pod, diff int) []*v1.Pod {
// 如果需要删除的实例数 小于 当前rs的活跃实例数,则对实例数删除优先级进行排序
// 如果需要删除的实例数 大于或等于 当前rs的活跃实例数,则不需要排序,直接返回所有实例数
if diff < len(filteredPods) {
// 对该deployment下所有rs的所有pod所在node进行计数,并将node对应的计数值作为到当前rs下pod的rank值
podsWithRanks := getPodsRankedByRelatedPodsOnSameNode(filteredPods, relatedPods)
// 参考rank值进行排序
sort.Sort(podsWithRanks)
reportSortingDeletionAgeRatioMetric(filteredPods, diff)
}
return filteredPods[:diff]
}
getPodsRankedByRelatedPodsOnSameNode按照在同一node上的pod数量设置对应的rank值,并返回ActivePodsWithRanks方便进行排序
func getPodsRankedByRelatedPodsOnSameNode(podsToRank, relatedPods []*v1.Pod) controller.ActivePodsWithRanks {
// 对该deployment下所有rs的所有pod所在node进行计数
podsOnNode := make(map[string]int)
for _, pod := range relatedPods {
if controller.IsPodActive(pod) {
podsOnNode[pod.Spec.NodeName]++
}
}
// 并将node对应的计数值作为到当前rs下pod的rank值
ranks := make([]int, len(podsToRank))
for i, pod := range podsToRank {
ranks[i] = podsOnNode[pod.Spec.NodeName]
}
// 以ActivePodsWithRanks格式返回,方便进行排序
return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks, Now: metav1.Now()}
}
看一下ActivePodsWithRanks结构体
type ActivePodsWithRanks struct {
// Pods is a list of pods.
Pods []*v1.Pod
// Rank is a ranking of pods. This ranking is used during sorting when
// comparing two pods that are both scheduled, in the same phase, and
// having the same ready status.
Rank []int
// Now is a reference timestamp for doing logarithmic timestamp comparisons.
// If zero, comparison happens without scaling.
Now metav1.Time
}
排序算法
func (s ActivePodsWithRanks) Less(i, j int) bool {
// 1. 未被调度到节点的pod优先被删除,Unassigned < assigned
// If only one of the pods is unassigned, the unassigned one is smaller
if s.Pods[i].Spec.NodeName != s.Pods[j].Spec.NodeName && (len(s.Pods[i].Spec.NodeName) == 0 || len(s.Pods[j].Spec.NodeName) == 0) {
return len(s.Pods[i].Spec.NodeName) == 0
}
// 2. pending优先于unknown优先于running被删除, PodPending < PodUnknown < PodRunning
if podPhaseToOrdinal[s.Pods[i].Status.Phase] != podPhaseToOrdinal[s.Pods[j].Status.Phase] {
return podPhaseToOrdinal[s.Pods[i].Status.Phase] < podPhaseToOrdinal[s.Pods[j].Status.Phase]
}
// 3. condition ready为false优先被删除,Not ready < ready
// If only one of the pods is not ready, the not ready one is smaller
if podutil.IsPodReady(s.Pods[i]) != podutil.IsPodReady(s.Pods[j]) {
return !podutil.IsPodReady(s.Pods[i])
}
// 4. k8s v1.21版本提供了controller.kubernetes.io/pod-deletion-cost注解作为featuregate,较低的值优先被删除,默认开启,lower pod-deletion-cost < higher pod-deletion cost
if utilfeature.DefaultFeatureGate.Enabled(features.PodDeletionCost) {
pi, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[i].Annotations)
pj, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[j].Annotations)
if pi != pj {
return pi < pj
}
}
// 5. 在同一个节点上的较多的pod,优先被删除,Doubled up < not doubled up
if s.Rank[i] != s.Rank[j] {
return s.Rank[i] > s.Rank[j]
}
// 6. 如果pod都是ready,则ready时间较短的实例优先被删除 Been ready for empty time < less time < more time
if podutil.IsPodReady(s.Pods[i]) && podutil.IsPodReady(s.Pods[j]) {
readyTime1 := podReadyTime(s.Pods[i])
readyTime2 := podReadyTime(s.Pods[j])
if !readyTime1.Equal(readyTime2) {
// k8s v1.21版本提供了LogarithmicScaleDown的featuregate,默认开启,计算时间戳计算以2为底的对数即log2
if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
return afterOrZero(readyTime1, readyTime2)
} else {
if s.Now.IsZero() || readyTime1.IsZero() || readyTime2.IsZero() {
return afterOrZero(readyTime1, readyTime2)
}
rankDiff := logarithmicRankDiff(*readyTime1, *readyTime2, s.Now)
// 如果使用LogarithmicScaleDown的计算结果相同,则会比较pod的UID,UID的ASCII码较小的pod优先被删除,可以理解为随机
if rankDiff == 0 {
return s.Pods[i].UID < s.Pods[j].UID
}
return rankDiff < 0
}
}
}
// 7. container重启次数较多的pod优先被删除,多个container取最大值
if maxContainerRestarts(s.Pods[i]) != maxContainerRestarts(s.Pods[j]) {
return maxContainerRestarts(s.Pods[i]) > maxContainerRestarts(s.Pods[j])
}
// 8. 创建时间短的pod优先被删除
Empty creation time pods < newer pods < older pods
if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) {
// k8s v1.21版本提供了LogarithmicScaleDown的featuregate,默认开启,计算时间戳计算以2为底的对数即log2
if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
} else {
if s.Now.IsZero() || s.Pods[i].CreationTimestamp.IsZero() || s.Pods[j].CreationTimestamp.IsZero() {
return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
}
rankDiff := logarithmicRankDiff(s.Pods[i].CreationTimestamp, s.Pods[j].CreationTimestamp, s.Now)
// 如果使用LogarithmicScaleDown的计算结果相同,则会比较pod的UID,UID的ASCII码较小的pod优先被删除,可以理解为随机
if rankDiff == 0 {
return s.Pods[i].UID < s.Pods[j].UID
}
return rankDiff < 0
}
}
return false
}
在6和8中使用LogarithmicScaleDown来计算时间戳,该算法不是直接比较时间戳,而是比较自创建以来的经过时间和到当前时间为止的时间差,再以2为底计算对数,进行四舍五入。这样做的效果是,当两个时间差差距较小时,将其视为相等的时间。也就是说,create或ready了几纳秒或几毫秒的pod的优先级是相等的,但它们与运行了几秒钟或几天的pod优先级不同。
同时在6和8中如果使用LogarithmicScaleDown的计算结果相同,则会比较pod的UID,UID的ASCII码较小的pod优先被删除,可以理解为随机。
可以参考以10为底计算对数的例子,但实际k8s最终采用的是以2为底计算对数,参考enhancements/keps/sig-apps/2185-random-pod-select-on-replicaset-downscale at master · kubernetes/enhancements · GitHub
总而言之
1、未被调度到节点的pod优先被删除
2、pod状态pending优先于unknown优先于running被删除
3、pod condition ready为false优先被删除
4、pod注解pod-deletion-cost值较低的优先被删除
5、同一个节点上的较多的pod,优先被删除
6、如果pod都是ready,则ready时间较短的实例优先被删除,但ready时间差距较小也会认为是ready时间相同,若相同则此时会比较UID,UID小的优先被删除,可以理解为随机删除
7、pod都是非ready,则container重启次数较多的pod优先被删除,多个container取最大值
8、创建时间短的pod优先被删除,但创建时间差距较小也会认为是创建时间相同,若相同则此时会比较UID,UID小的优先被删除,可以理解为随机删除