k8s之ReplicaSet回收pod的优先级

ReplicaSet syncReplicaSet的逻辑,首先找到rs下相关的pod

    // 从informer中获取当前ns下所有的pod
    allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
    if err != nil {
        return err
    }
    // 忽略不健康的pod
    filteredPods := controller.FilterActivePods(allPods)
​
    // 找到被该rs的labelSelector和ownerRefControllerBy选中的pod
    filteredPods, err = rsc.claimPods(ctx, rs, selector, filteredPods)
    if err != nil {
        return err
    }

判断Pod是否处于活跃(active)状态

func IsPodActive(p *v1.Pod) bool {
   return v1.PodSucceeded != p.Status.Phase &&
      v1.PodFailed != p.Status.Phase &&
      p.DeletionTimestamp == nil
}

计算当前实例数与期望实例数的diff,进而判断扩缩实例数

func (rsc *ReplicaSetController) manageReplicas(ctx context.Context, filteredPods []*v1.Pod, rs *apps.ReplicaSet) error {
    // 计算当前实例数与期望实例数的diff
    diff := len(filteredPods) - int(*(rs.Spec.Replicas))
    rsKey, err := controller.KeyFunc(rs)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("couldn't get key for %v %#v: %v", rsc.Kind, rs, err))
        return nil
    }
    // 扩容,新建实例数
    if diff < 0 {
        diff *= -1
        // burstReplicas默认值为500
        if diff > rsc.burstReplicas {
            diff = rsc.burstReplicas
        }
        rsc.expectations.ExpectCreations(rsKey, diff)
        klog.V(2).InfoS("Too few replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "creating", diff)
        successfulCreations, err := slowStartBatch(diff, controller.SlowStartInitialBatchSize, func() error {
            err := rsc.podControl.CreatePods(ctx, rs.Namespace, &rs.Spec.Template, rs, metav1.NewControllerRef(rs, rsc.GroupVersionKind))
            if err != nil {
                if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
                    return nil
                }
            }
            return err
        })
        if skippedPods := diff - successfulCreations; skippedPods > 0 {
            klog.V(2).Infof("Slow-start failure. Skipping creation of %d pods, decrementing expectations for %v %v/%v", skippedPods, rsc.Kind, rs.Namespace, rs.Name)
            for i := 0; i < skippedPods; i++ {
                // Decrement the expected number of creates because the informer won't observe this pod
                rsc.expectations.CreationObserved(rsKey)
            }
        }
        return err
    } else if diff > 0 {
        // 缩容,删除实例
        // burstReplicas默认值为500
        if diff > rsc.burstReplicas {
            diff = rsc.burstReplicas
        }
        klog.V(2).InfoS("Too many replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "deleting", diff)
        // 获取deployment ownerRefControllerBy的所有rs的所有pod
        relatedPods, err := rsc.getIndirectlyRelatedPods(rs)
        utilruntime.HandleError(err)
​
        // 找到需要被删除的pod列表
        podsToDelete := getPodsToDelete(filteredPods, relatedPods, diff)
​
        // 在cache中记录需要被删除的podkey
        rsc.expectations.ExpectDeletions(rsKey, getPodKeys(podsToDelete))
​
        errCh := make(chan error, diff)
        var wg sync.WaitGroup
        wg.Add(diff)
        for _, pod := range podsToDelete {
            go func(targetPod *v1.Pod) {
                defer wg.Done()
                // 删除pod
                if err := rsc.podControl.DeletePod(ctx, rs.Namespace, targetPod.Name, rs); err != nil {
                    // 从cache中移除已被删除的podkey
                    podKey := controller.PodKey(targetPod)
                    rsc.expectations.DeletionObserved(rsKey, podKey)
                    if !apierrors.IsNotFound(err) {
                        klog.V(2).Infof("Failed to delete %v, decremented expectations for %v %s/%s", podKey, rsc.Kind, rs.Namespace, rs.Name)
                        errCh <- err
                    }
                }
            }(pod)
        }
        wg.Wait()
​
        select {
        case err := <-errCh:
            // all errors have been reported before and they're likely to be the same, so we'll only return the first one we hit.
            if err != nil {
                return err
            }
        default:
        }
    }
​
    return nil
}

着重看一下getPodsToDelete方法

func getPodsToDelete(filteredPods, relatedPods []*v1.Pod, diff int) []*v1.Pod {
   // 如果需要删除的实例数 小于 当前rs的活跃实例数,则对实例数删除优先级进行排序
   // 如果需要删除的实例数 大于或等于 当前rs的活跃实例数,则不需要排序,直接返回所有实例数
   if diff < len(filteredPods) {
      // 对该deployment下所有rs的所有pod所在node进行计数,并将node对应的计数值作为到当前rs下pod的rank值
      podsWithRanks := getPodsRankedByRelatedPodsOnSameNode(filteredPods, relatedPods)
      // 参考rank值进行排序
      sort.Sort(podsWithRanks)
      reportSortingDeletionAgeRatioMetric(filteredPods, diff)
   }
   return filteredPods[:diff]
}

getPodsRankedByRelatedPodsOnSameNode按照在同一node上的pod数量设置对应的rank值,并返回ActivePodsWithRanks方便进行排序

func getPodsRankedByRelatedPodsOnSameNode(podsToRank, relatedPods []*v1.Pod) controller.ActivePodsWithRanks {
   // 对该deployment下所有rs的所有pod所在node进行计数
   podsOnNode := make(map[string]int)
   for _, pod := range relatedPods {
      if controller.IsPodActive(pod) {
         podsOnNode[pod.Spec.NodeName]++
      }
   }
   // 并将node对应的计数值作为到当前rs下pod的rank值
   ranks := make([]int, len(podsToRank))
   for i, pod := range podsToRank {
      ranks[i] = podsOnNode[pod.Spec.NodeName]
   }
   // 以ActivePodsWithRanks格式返回,方便进行排序
   return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks, Now: metav1.Now()}
}

看一下ActivePodsWithRanks结构体

type ActivePodsWithRanks struct {
   // Pods is a list of pods.
   Pods []*v1.Pod
​
   // Rank is a ranking of pods.  This ranking is used during sorting when
   // comparing two pods that are both scheduled, in the same phase, and
   // having the same ready status.
   Rank []int
​
   // Now is a reference timestamp for doing logarithmic timestamp comparisons.
   // If zero, comparison happens without scaling.
   Now metav1.Time
}

排序算法

func (s ActivePodsWithRanks) Less(i, j int) bool {
   // 1. 未被调度到节点的pod优先被删除,Unassigned < assigned
   // If only one of the pods is unassigned, the unassigned one is smaller
   if s.Pods[i].Spec.NodeName != s.Pods[j].Spec.NodeName && (len(s.Pods[i].Spec.NodeName) == 0 || len(s.Pods[j].Spec.NodeName) == 0) {
      return len(s.Pods[i].Spec.NodeName) == 0
   }
   // 2. pending优先于unknown优先于running被删除, PodPending < PodUnknown < PodRunning
   if podPhaseToOrdinal[s.Pods[i].Status.Phase] != podPhaseToOrdinal[s.Pods[j].Status.Phase] {
      return podPhaseToOrdinal[s.Pods[i].Status.Phase] < podPhaseToOrdinal[s.Pods[j].Status.Phase]
   }
   // 3. condition ready为false优先被删除,Not ready < ready
   // If only one of the pods is not ready, the not ready one is smaller
   if podutil.IsPodReady(s.Pods[i]) != podutil.IsPodReady(s.Pods[j]) {
      return !podutil.IsPodReady(s.Pods[i])
   }
​
   // 4. k8s v1.21版本提供了controller.kubernetes.io/pod-deletion-cost注解作为featuregate,较低的值优先被删除,默认开启,lower pod-deletion-cost < higher pod-deletion cost
   if utilfeature.DefaultFeatureGate.Enabled(features.PodDeletionCost) {
      pi, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[i].Annotations)
      pj, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[j].Annotations)
      if pi != pj {
         return pi < pj
      }
   }
​
   // 5. 在同一个节点上的较多的pod,优先被删除,Doubled up < not doubled up
   if s.Rank[i] != s.Rank[j] {
      return s.Rank[i] > s.Rank[j]
   }
​
   // 6. 如果pod都是ready,则ready时间较短的实例优先被删除 Been ready for empty time < less time < more time
   if podutil.IsPodReady(s.Pods[i]) && podutil.IsPodReady(s.Pods[j]) {
      readyTime1 := podReadyTime(s.Pods[i])
      readyTime2 := podReadyTime(s.Pods[j])
      if !readyTime1.Equal(readyTime2) {
         // k8s v1.21版本提供了LogarithmicScaleDown的featuregate,默认开启,计算时间戳计算以2为底的对数即log2
         if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
            return afterOrZero(readyTime1, readyTime2)
         } else {
            if s.Now.IsZero() || readyTime1.IsZero() || readyTime2.IsZero() {
               return afterOrZero(readyTime1, readyTime2)
            }
            rankDiff := logarithmicRankDiff(*readyTime1, *readyTime2, s.Now)
            // 如果使用LogarithmicScaleDown的计算结果相同,则会比较pod的UID,UID的ASCII码较小的pod优先被删除,可以理解为随机
            if rankDiff == 0 {
               return s.Pods[i].UID < s.Pods[j].UID
            }
            return rankDiff < 0
         }
      }
   }
   // 7. container重启次数较多的pod优先被删除,多个container取最大值
   if maxContainerRestarts(s.Pods[i]) != maxContainerRestarts(s.Pods[j]) {
      return maxContainerRestarts(s.Pods[i]) > maxContainerRestarts(s.Pods[j])
   }
   // 8. 创建时间短的pod优先被删除
    Empty creation time pods < newer pods < older pods
   if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) {
      // k8s v1.21版本提供了LogarithmicScaleDown的featuregate,默认开启,计算时间戳计算以2为底的对数即log2
      if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
         return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
      } else {
         if s.Now.IsZero() || s.Pods[i].CreationTimestamp.IsZero() || s.Pods[j].CreationTimestamp.IsZero() {
            return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
         }
         rankDiff := logarithmicRankDiff(s.Pods[i].CreationTimestamp, s.Pods[j].CreationTimestamp, s.Now)
         // 如果使用LogarithmicScaleDown的计算结果相同,则会比较pod的UID,UID的ASCII码较小的pod优先被删除,可以理解为随机
         if rankDiff == 0 {
            return s.Pods[i].UID < s.Pods[j].UID
         }
         return rankDiff < 0
      }
   }
   return false
}

在6和8中使用LogarithmicScaleDown来比较时间戳:该算法不是直接比较时间戳,而是先计算每个pod从创建(或ready)时刻到当前时间所经过的时长,再对该时长取以2为底的对数并向下取整,最后比较取整后的结果。这样做的效果是,当两个时长落在同一个2的幂次区间(取整后的对数值相同)时,将其视为相等的时间。也就是说,时长相近的pod(例如都只create或ready了几纳秒或几毫秒)往往被视为优先级相等,但它们与已经运行了几秒钟或几天的pod优先级不同。

同时在6和8中如果使用LogarithmicScaleDown的计算结果相同,则会比较pod的UID,UID的ASCII码较小的pod优先被删除,可以理解为随机

可以参考下图中以10为底计算对数的例子,但实际k8s最终采用的是以2为底计算对数,详见KEP-2185(random-pod-select-on-replicaset-downscale):https://github.com/kubernetes/enhancements/tree/master/keps/sig-apps/2185-random-pod-select-on-replicaset-downscale

k8s之ReplicaSet回收pod的优先级_第1张图片

总而言之

1、未被调度到节点的pod优先被删除

2、pod状态pending优先于unknown优先于running被删除

3、pod condition ready为false优先被删除

4、pod注解pod-deletion-cost值较低的优先被删除

5、同一个节点上的较多的pod,优先被删除

6、如果pod都是ready,则ready时间较短的实例优先被删除,但ready时间差距较小也会认为是ready时间相同,若相同则此时会比较UID,UID小的优先被删除,可以理解为随机删除

7、若前面的条件都无法区分(两个pod都是非ready,或都是ready且ready时间相同),则container重启次数较多的pod优先被删除,多个container取最大值

8、创建时间短的pod优先被删除,但创建时间差距较小也会认为是创建时间相同,若相同则此时会比较UID,UID小的优先被删除,可以理解为随机删除

你可能感兴趣的:(kubernetes,rs,downscale)