深入分析kube-batch（4）——actions

action是真正的调度过程，顺序是reclaim -> allocate -> backfill -> preempt

interface

// Action is the interface of scheduler action.
type Action interface {
   // Name returns the unique name of the Action.
   Name() string

   // Initialize initializes the allocator plugins.
   Initialize()

   // Execute allocates the cluster's resources into each queue,
   // operating on the given scheduling session.
   Execute(ssn *Session)

   // UnInitialize un-initializes the allocator plugins.
   UnInitialize()
}

重点关注Execute实现

reclaim

kube-batch\pkg\scheduler\actions\reclaim\reclaim.go

// Execute runs the reclaim action against one scheduling session.
// NOTE(review): this listing is an abridged excerpt; the function's closing
// brace and several declarations are not shown.
func (alloc *reclaimAction) Execute(ssn *framework.Session) {
   // Priority queue of queues, ordered by the session's QueueOrderFn.
   queues := util.NewPriorityQueue(ssn.QueueOrderFn)

   // Per-queue priority queue of jobs, keyed by queue ID.
   preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
   // Per-job priority queue of tasks, keyed by job ID.
   preemptorTasks := map[api.JobID]*util.PriorityQueue{}

   for _, job := range ssn.Jobs {
      // NOTE(review): `queue` is not declared anywhere in this excerpt;
      // presumably it is resolved from job.Queue — confirm against upstream.
      queues.Push(queue)

      // Only jobs that still have pending tasks are recorded as preemptors.
      if len(job.TaskStatusIndex[api.Pending]) != 0 {
         // NOTE(review): nothing in this excerpt stores an entry into either
         // map, so these lookups yield nil queues as written; the creation of
         // the per-key queues appears to have been elided — TODO confirm.
         preemptorsMap[job.Queue].Push(job)
         
         // Every pending task of the job is queued under the job's UID.
         for _, task := range job.TaskStatusIndex[api.Pending] {
            preemptorTasks[job.UID].Push(task)
         }
      }
   }

根据优先级排序queue
将待调度的task保存为抢占者

// Main reclaim loop: keeps taking queues off the priority queue until it is empty.
// NOTE(review): abridged excerpt — `job` and `task` used below are not declared
// in the visible code, and `jobs`, `tasks`, `found` and `assigned` are never read.
for {
   if queues.Empty() {
      break
   }

   // Take the next queue and look up the job/task queues recorded for it.
   queue := queues.Pop().(*api.QueueInfo)
   jobs, found := preemptorsMap[queue.UID]
   tasks, found := preemptorTasks[job.UID]

   // resreq is a clone of the task's request, so the Sub below does not
   // modify the task itself; reclaimed accumulates what has been evicted.
   resreq := task.Resreq.Clone()
   reclaimed := api.EmptyResource()

   assigned := false

   for _, n := range ssn.Nodes {
      // Skip nodes rejected by the session's predicate function.
      if err := ssn.PredicateFn(task, n); err != nil {
         continue
      }

      // Candidate victims: clones of the tasks currently Running on this node.
      var reclaimees []*api.TaskInfo
      for _, task := range n.Tasks {
         if task.Status != api.Running {
            continue
         }

         reclaimees = append(reclaimees, task.Clone())
      }
      // Ask the session which of the candidates may be reclaimed for the task.
      victims := ssn.Reclaimable(task, reclaimees)

      if len(victims) == 0 {
         continue
      }

      // If not enough resource, continue
      allRes := api.EmptyResource()
      for _, v := range victims {
         allRes.Add(v.Resreq)
      }
      if allRes.Less(resreq) {
         continue
      }

      // Reclaim victims for tasks.
      for _, reclaimee := range victims {
         ssn.Evict(reclaimee, "reclaim")
         
         reclaimed.Add(reclaimee.Resreq)
         // Stop evicting once this victim alone covers the remaining request;
         // otherwise deduct its resources and move on to the next victim.
         if resreq.LessEqual(reclaimee.Resreq) {
            break
         }
         resreq.Sub(reclaimee.Resreq)
      }       
      // Victims are evicted from at most one node: leave the node loop here.
      break
   }

}

找到优先级最高的queue，job，task
遍历node，首先过预选函数，很奇怪，没有PodFitsResources，应该是kube-batch自己管理资源
找到node上正在运行的pod
找到受害者
如果受害者资源总量小于pod申请资源总量，就跳过
驱逐受害者，调用删除接口
如果释放足够的资源，就跳出驱逐

reclaim过程目前还没遇到过，回收函数也不是很理解。我觉得回收不是很必要，驱逐逻辑不应该在这里做，kubelet已经有了驱逐逻辑，不是很明白reclaim的必要性。而且不是每次都需要回收，应该先判断node是否资源不足。我会在配置中移除reclaim的action，还能提高性能。

allocate

        // Drain the task queue, placing each task on the first node that both
        // passes the predicate and has enough idle resources for its request.
        for !tasks.Empty() {
            pending := tasks.Pop().(*api.TaskInfo)

            for _, candidate := range ssn.Nodes {
                // Nodes rejected by the predicate are not considered.
                if ssn.PredicateFn(pending, candidate) != nil {
                    continue
                }

                // Nodes whose idle resources cannot hold the request are skipped.
                if !pending.Resreq.LessEqual(candidate.Idle) {
                    continue
                }

                // Allocate idle resource to the task and stop searching.
                ssn.Allocate(pending, candidate.Name)
                break
            }
        }

分配过程只看最核心的部分：

过一遍预选函数，
比较pod资源申请和node空闲资源
bind

这里解决了上面的疑问，预选函数中没有PodFitsResources，是因为在这里实现了类似功能；不过又多了一个疑问，这里如果node满足pod要求，那么就直接bind了？没有优选过程吗？那soft亲和性怎么办？

backfill

// Execute runs the backfill action: every pending task whose resource request
// is empty is allocated to the first node that passes the session's predicate.
func (alloc *backfillAction) Execute(ssn *framework.Session) {
   for _, job := range ssn.Jobs {
      for _, task := range job.TaskStatusIndex[api.Pending] {
         // Only tasks that request no resources are handled by backfill.
         if !task.Resreq.IsEmpty() {
            continue
         }

         for _, node := range ssn.Nodes {
            // The predicate result must be honored: a node that fails it is
            // skipped instead of receiving the task unconditionally.
            if err := ssn.PredicateFn(task, node); err != nil {
               continue
            }

            ssn.Allocate(task, node.Name)
            break
         }
      }
   }
}

backfill是为了处理BestEffort类型的pod（即没有申请任何资源、Resreq为空的task）而后加的action，相关issue。

preempt

抢占逻辑一直没有很理解，这里先暂时不分析了，会另外开一篇文章专门介绍抢占，不过kube-batch的抢占跟K8S的不太一样。

总结

学习了马达老师的kube-batch为我打开了一个新思路，不过对于我的项目可能有点重，不是很需要回收和抢占逻辑，还有貌似也不支持优选逻辑，没有优选就没有soft亲和性，这个是个非常致命的问题，所以后期应该准备参考default-scheduler和kube-batch自己写一个调度器了。

深入分析kube-batch（4）——actions

深入分析kube-batch（4）——actions

interface

reclaim

allocate

backfill

preempt

总结

你可能感兴趣的:(深入分析kube-batch（4）——actions)