Bluejoy Jing

kube-scheduler源码解读（5）

在kube-scheduler源码解读的第四篇文章中，我们完成了对scheduler组件启动过程中的所有前期准备流程。
本篇文章我们将分析sched.scheduleOne函数的执行流程，Pod的调度就是通过这个函数进行的。让我们开始pod的调度之旅吧。

sched.scheduleOne函数的定义如下：

// scheduleOne does the entire scheduling workflow for a single pod.  It is serialized on the scheduling algorithm's host fitting.
func (sched *Scheduler) scheduleOne(ctx context.Context) {
    // 1. 获取需要调度的pod
	podInfo := sched.NextPod()
	// pod could be nil when schedulerQueue is closed
	if podInfo == nil || podInfo.Pod == nil {
		return
	}

    // 2. 获取当前待调度pod对应的scheduler框架的profile
	pod := podInfo.Pod
	prof, err := sched.profileForPod(pod)
	if err != nil {
		// This shouldn't happen, because we only accept for scheduling the pods
		// which specify a scheduler name that matches one of the profiles.
		klog.Error(err)
		return
	}

       // 3. 判断是否可以忽略当前pod的调度
	if sched.skipPodSchedule(prof, pod) {
		return
	}

	klog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name)

    // 4. 尝试给pod查找一个合适的节点
	// Synchronously attempt to find a fit for the pod.
	start := time.Now()
	state := framework.NewCycleState()
	state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent)
	schedulingCycleCtx, cancel := context.WithCancel(ctx)
	defer cancel()
	scheduleResult, err := sched.Algorithm.Schedule(schedulingCycleCtx, prof, state, pod)
	if err != nil {
		// Schedule() may have failed because the pod would not fit on any host, so we try to
		// preempt, with the expectation that the next time the pod is tried for scheduling it
		// will fit due to the preemption. It is also possible that a different pod will schedule
		// into the resources that were preempted, but this is harmless.
		if fitError, ok := err.(*core.FitError); ok {
			if sched.DisablePreemption {
				klog.V(3).Infof("Pod priority feature is not enabled or preemption is disabled by scheduler configuration." +
					" No preemption is performed.")
			} else {
				preemptionStartTime := time.Now()
				sched.preempt(schedulingCycleCtx, prof, state, pod, fitError)
				metrics.PreemptionAttempts.Inc()
				metrics.SchedulingAlgorithmPreemptionEvaluationDuration.Observe(metrics.SinceInSeconds(preemptionStartTime))
				metrics.DeprecatedSchedulingDuration.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime))
			}
			// Pod did not fit anywhere, so it is counted as a failure. If preemption
			// succeeds, the pod should get counted as a success the next time we try to
			// schedule it. (hopefully)
			metrics.PodScheduleFailures.Inc()
		} else {
			klog.Errorf("error selecting node for pod: %v", err)
			metrics.PodScheduleErrors.Inc()
		}
		sched.recordSchedulingFailure(prof, podInfo.DeepCopy(), err, v1.PodReasonUnschedulable, err.Error())
		return
	}
	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))

    // 5. 更新cache来假设pod已经运行到了某个节点上
	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
	// This allows us to keep scheduling without waiting on binding to occur.

    // 5.1 获取pod信息
	assumedPodInfo := podInfo.DeepCopy()
	assumedPod := assumedPodInfo.Pod

    // 5.2 在assume pod之前先assume volume
	// Assume volumes first before assuming the pod.
	//
	// If all volumes are completely bound, then allBound is true and binding will be skipped.
	//
	// Otherwise, binding of volumes is started after the pod is assumed, but before pod binding.
	//
	// This function modifies 'assumedPod' if volume binding is required.
	allBound, err := sched.VolumeBinder.AssumePodVolumes(assumedPod, scheduleResult.SuggestedHost)
	if err != nil {
		sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError,
			fmt.Sprintf("AssumePodVolumes failed: %v", err))
		metrics.PodScheduleErrors.Inc()
		return
	}

    // 5.3 运行reserve插件
	// Run "reserve" plugins.
	if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
		sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, sts.Message())
		metrics.PodScheduleErrors.Inc()
		return
	}

    // 5.4 assume pod
	// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
	err = sched.assume(assumedPod, scheduleResult.SuggestedHost)
	if err != nil {
		// This is most probably result of a BUG in retrying logic.
		// We report an error here so that pod scheduling can be retried.
		// This relies on the fact that Error will check if the pod has been bound
		// to a node and if so will not add it back to the unscheduled pods queue
		// (otherwise this would cause an infinite loop).
		sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, fmt.Sprintf("AssumePod failed: %v", err))
		metrics.PodScheduleErrors.Inc()
		// trigger un-reserve plugins to clean up state associated with the reserved Pod
		prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		return
	}

    // 6. 运行permit插件
	// Run "permit" plugins.
	runPermitStatus := prof.RunPermitPlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
	if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
		var reason string
		if runPermitStatus.IsUnschedulable() {
			metrics.PodScheduleFailures.Inc()
			reason = v1.PodReasonUnschedulable
		} else {
			metrics.PodScheduleErrors.Inc()
			reason = SchedulerError
		}
		if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
			klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
		}
		// One of the plugins returned status different than success or wait.
		prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		sched.recordSchedulingFailure(prof, assumedPodInfo, runPermitStatus.AsError(), reason, runPermitStatus.Message())
		return
	}

    // 7 异步的绑定pod到host上
	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
	go func() {
		bindingCycleCtx, cancel := context.WithCancel(ctx)
		defer cancel()
		metrics.SchedulerGoroutines.WithLabelValues("binding").Inc()
		defer metrics.SchedulerGoroutines.WithLabelValues("binding").Dec()

		waitOnPermitStatus := prof.WaitOnPermit(bindingCycleCtx, assumedPod)
		if !waitOnPermitStatus.IsSuccess() {
			var reason string
			if waitOnPermitStatus.IsUnschedulable() {
				metrics.PodScheduleFailures.Inc()
				reason = v1.PodReasonUnschedulable
			} else {
				metrics.PodScheduleErrors.Inc()
				reason = SchedulerError
			}
			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
				klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
			}
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			sched.recordSchedulingFailure(prof, assumedPodInfo, waitOnPermitStatus.AsError(), reason, waitOnPermitStatus.Message())
			return
		}

		// Bind volumes first before Pod
		if !allBound {
			err := sched.bindVolumes(assumedPod)
			if err != nil {
				sched.recordSchedulingFailure(prof, assumedPodInfo, err, "VolumeBindingFailed", err.Error())
				metrics.PodScheduleErrors.Inc()
				// trigger un-reserve plugins to clean up state associated with the reserved Pod
				prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
				return
			}
		}

		// Run "prebind" plugins.
		preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		if !preBindStatus.IsSuccess() {
			var reason string
			metrics.PodScheduleErrors.Inc()
			reason = SchedulerError
			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
				klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
			}
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			sched.recordSchedulingFailure(prof, assumedPodInfo, preBindStatus.AsError(), reason, preBindStatus.Message())
			return
		}

		err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
		metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
		if err != nil {
			metrics.PodScheduleErrors.Inc()
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, fmt.Sprintf("Binding rejected: %v", err))
		} else {
			// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
			if klog.V(2) {
				klog.Infof("pod %v/%v is bound successfully on node %q, %d nodes evaluated, %d nodes were found feasible.", assumedPod.Namespace, assumedPod.Name, scheduleResult.SuggestedHost, scheduleResult.EvaluatedNodes, scheduleResult.FeasibleNodes)
			}

			metrics.PodScheduleSuccesses.Inc()
			metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
			metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))

			// Run "postbind" plugins.
			prof.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		}
	}()
}

主要流程如下：

获取需要调度的pod
获取当前待调度pod对应的scheduler框架的profile
判断是否可以忽略当前pod的调度
尝试给pod查找一个合适的节点
更新cache来assume pod已经运行到了某个节点上
5.1 在assume pod之前先assume volume
5.2 运行reserve插件
5.3 assume pod
运行permit插件
异步的绑定pod到host上

1. 获取需要调度的pod

sched.NextPod函数的定义是通过如下流程进行配置的：
pkg/scheduler/scheduler.go里的New函数 ->
configurator.createFromProvider函数或者 configurator.createFromConfig函数 ->
c.create函数
其中c.create函数的定义如下所示：

// create a scheduler from a set of registered plugins.
func (c *Configurator) create() (*Scheduler, error) {
	var extenders []core.SchedulerExtender
	var ignoredExtendedResources []string
	if len(c.extenders) != 0 {
		var ignorableExtenders []core.SchedulerExtender
		for ii := range c.extenders {
			klog.V(2).Infof("Creating extender with config %+v", c.extenders[ii])
			extender, err := core.NewHTTPExtender(&c.extenders[ii])
			if err != nil {
				return nil, err
			}
			if !extender.IsIgnorable() {
				extenders = append(extenders, extender)
			} else {
				ignorableExtenders = append(ignorableExtenders, extender)
			}
			for _, r := range c.extenders[ii].ManagedResources {
				if r.IgnoredByScheduler {
					ignoredExtendedResources = append(ignoredExtendedResources, r.Name)
				}
			}
		}
		// place ignorable extenders to the tail of extenders
		extenders = append(extenders, ignorableExtenders...)
	}

	// If there are any extended resources found from the Extenders, append them to the pluginConfig for each profile.
	// This should only have an effect on ComponentConfig v1alpha2, where it is possible to configure Extenders and
	// plugin args (and in which case the extender ignored resources take precedence).
	// For earlier versions, using both policy and custom plugin config is disallowed, so this should be the only
	// plugin config for this plugin.
	if len(ignoredExtendedResources) > 0 {
		for i := range c.profiles {
			prof := &c.profiles[i]
			prof.PluginConfig = append(prof.PluginConfig,
				frameworkplugins.NewPluginConfig(
					noderesources.FitName,
					noderesources.FitArgs{IgnoredResources: ignoredExtendedResources},
				),
			)
		}
	}

	profiles, err := profile.NewMap(c.profiles, c.buildFramework, c.recorderFactory)
	if err != nil {
		return nil, fmt.Errorf("initializing profiles: %v", err)
	}
	if len(profiles) == 0 {
		return nil, errors.New("at least one profile is required")
	}
	// Profiles are required to have equivalent queue sort plugins.
	lessFn := profiles[c.profiles[0].SchedulerName].Framework.QueueSortFunc()
	podQueue := internalqueue.NewSchedulingQueue(
		lessFn,
		internalqueue.WithPodInitialBackoffDuration(time.Duration(c.podInitialBackoffSeconds)*time.Second),
		internalqueue.WithPodMaxBackoffDuration(time.Duration(c.podMaxBackoffSeconds)*time.Second),
	)

	// Setup cache debugger.
	debugger := cachedebugger.New(
		c.informerFactory.Core().V1().Nodes().Lister(),
		c.podInformer.Lister(),
		c.schedulerCache,
		podQueue,
	)
	debugger.ListenForSignal(c.StopEverything)

	algo := core.NewGenericScheduler(
		c.schedulerCache,
		podQueue,
		c.nodeInfoSnapshot,
		extenders,
		c.informerFactory.Core().V1().PersistentVolumeClaims().Lister(),
		GetPodDisruptionBudgetLister(c.informerFactory),
		c.disablePreemption,
		c.percentageOfNodesToScore,
		c.enableNonPreempting,
	)

	return &Scheduler{
		SchedulerCache:  c.schedulerCache,
		Algorithm:       algo,
		Profiles:        profiles,
		NextPod:         internalqueue.MakeNextPodFunc(podQueue),
		Error:           MakeDefaultErrorFunc(c.client, podQueue, c.schedulerCache),
		StopEverything:  c.StopEverything,
		VolumeBinder:    c.volumeBinder,
		SchedulingQueue: podQueue,
	}, nil
}

MakeNextPodFunc函数的定义如下所示：

// MakeNextPodFunc returns a function to retrieve the next pod from a given
// scheduling queue
func MakeNextPodFunc(queue SchedulingQueue) func() *framework.PodInfo {
	return func() *framework.PodInfo {
		podInfo, err := queue.Pop()
		if err == nil {
			klog.V(4).Infof("About to try and schedule pod %v/%v", podInfo.Pod.Namespace, podInfo.Pod.Name)
			return podInfo
		}
		klog.Errorf("Error while retrieving next pod from scheduling queue: %v", err)
		return nil
	}
}

2. 获取当前待调度pod对应的scheduler框架的profile

func (sched *Scheduler) profileForPod(pod *v1.Pod) (*profile.Profile, error) {
	prof, ok := sched.Profiles[pod.Spec.SchedulerName]
	if !ok {
		return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName)
	}
	return prof, nil
}

3. 判断是否可以忽略当前pod的调度

主要在以下两种情况下忽略调此次调度：
1）pod已经被更新为要删除
2）已经执行了assumed的pod，发生了一些可以忽略的更新操作，比如：ResourceVersion， Spec.NodeName and/or Annotations

skipPodSchedule函数的定义如下：

// skipPodSchedule returns true if we could skip scheduling the pod for specified cases.
func (sched *Scheduler) skipPodSchedule(prof *profile.Profile, pod *v1.Pod) bool {
	// Case 1: pod is being deleted.
	if pod.DeletionTimestamp != nil {
		prof.Recorder.Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		klog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		return true
	}

	// Case 2: pod has been assumed and pod updates could be skipped.
	// An assumed pod can be added again to the scheduling queue if it got an update event
	// during its previous scheduling cycle but before getting assumed.
	if sched.skipPodUpdate(pod) {
		return true
	}

	return false
}

skipPodUpdate函数的定义如下：

// skipPodUpdate checks whether the specified pod update should be ignored.
// This function will return true if
//   - The pod has already been assumed, AND
//   - The pod has only its ResourceVersion, Spec.NodeName and/or Annotations
//     updated.
func (sched *Scheduler) skipPodUpdate(pod *v1.Pod) bool {
	// Non-assumed pods should never be skipped.
	isAssumed, err := sched.SchedulerCache.IsAssumedPod(pod)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err))
		return false
	}
	if !isAssumed {
		return false
	}

	// Gets the assumed pod from the cache.
	assumedPod, err := sched.SchedulerCache.GetPod(pod)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("failed to get assumed pod %s/%s from cache: %v", pod.Namespace, pod.Name, err))
		return false
	}

	// Compares the assumed pod in the cache with the pod update. If they are
	// equal (with certain fields excluded), this pod update will be skipped.
	f := func(pod *v1.Pod) *v1.Pod {
		p := pod.DeepCopy()
		// ResourceVersion must be excluded because each object update will
		// have a new resource version.
		p.ResourceVersion = ""
		// Spec.NodeName must be excluded because the pod assumed in the cache
		// is expected to have a node assigned while the pod update may nor may
		// not have this field set.
		p.Spec.NodeName = ""
		// Annotations must be excluded for the reasons described in
		// https://github.com/kubernetes/kubernetes/issues/52914.
		p.Annotations = nil
		return p
	}
	assumedPodCopy, podCopy := f(assumedPod), f(pod)
	if !reflect.DeepEqual(assumedPodCopy, podCopy) {
		return false
	}
	klog.V(3).Infof("Skipping pod %s/%s update", pod.Namespace, pod.Name)
	return true
}

4. 尝试给pod查找一个合适的节点

1）通过Schedule函数来选择一个合适的节点
2）如果成功则继续执行；如果不成功则判断是否开启了抢占调度机制，执行抢占调度流程。
Schedule函数的定义如下所示：

// Schedule tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError error with reasons.
func (g *genericScheduler) Schedule(ctx context.Context, prof *profile.Profile, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
	defer trace.LogIfLong(100 * time.Millisecond)

    // 1. 检查pvc
	if err := podPassesBasicChecks(pod, g.pvcLister); err != nil {
		return result, err
	}
	trace.Step("Basic checks done")

    // 2. 快照scheduler cache 和 node infos，以备fit和priority函数使用
	if err := g.snapshot(); err != nil {
		return result, err
	}
	trace.Step("Snapshotting scheduler cache and node infos done")

    // 3. 检查可用节点的个数，如果为0则直接返回ErrNoNodesAvailable错误
	if g.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}

    // 4. 运行prefilter插件
	// Run "prefilter" plugins.
	preFilterStatus := prof.RunPreFilterPlugins(ctx, state, pod)
	if !preFilterStatus.IsSuccess() {
		return result, preFilterStatus.AsError()
	}
	trace.Step("Running prefilter plugins done")

    // 5. 查找满足条件的节点
	startPredicateEvalTime := time.Now()
	filteredNodes, filteredNodesStatuses, err := g.findNodesThatFitPod(ctx, prof, state, pod)
	if err != nil {
		return result, err
	}
	trace.Step("Computing predicates done")

    // 6. 判断满足条件的节点是否为0
	if len(filteredNodes) == 0 {
		return result, &FitError{
			Pod:                   pod,
			NumAllNodes:           g.nodeInfoSnapshot.NumNodes(),
			FilteredNodesStatuses: filteredNodesStatuses,
		}
	}

    // 7. 运行prescore插件
	// Run "prescore" plugins.
	prescoreStatus := prof.RunPreScorePlugins(ctx, state, pod, filteredNodes)
	if !prescoreStatus.IsSuccess() {
		return result, prescoreStatus.AsError()
	}
	trace.Step("Running prescore plugins done")

	metrics.DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration.Observe(metrics.SinceInSeconds(startPredicateEvalTime))
	metrics.DeprecatedSchedulingDuration.WithLabelValues(metrics.PredicateEvaluation).Observe(metrics.SinceInSeconds(startPredicateEvalTime))

    // 8. 如果预选过后，只有一个节点的话，直接使用这个节点
	startPriorityEvalTime := time.Now()
	// When only one node after predicate, just use it.
	if len(filteredNodes) == 1 {
		metrics.DeprecatedSchedulingAlgorithmPriorityEvaluationSecondsDuration.Observe(metrics.SinceInSeconds(startPriorityEvalTime))
		return ScheduleResult{
			SuggestedHost:  filteredNodes[0].Name,
			EvaluatedNodes: 1 + len(filteredNodesStatuses),
			FeasibleNodes:  1,
		}, nil
	}

    // 9. 给预选过的节点打分
	priorityList, err := g.prioritizeNodes(ctx, prof, state, pod, filteredNodes)
	if err != nil {
		return result, err
	}

	metrics.DeprecatedSchedulingAlgorithmPriorityEvaluationSecondsDuration.Observe(metrics.SinceInSeconds(startPriorityEvalTime))
	metrics.DeprecatedSchedulingDuration.WithLabelValues(metrics.PriorityEvaluation).Observe(metrics.SinceInSeconds(startPriorityEvalTime))

    // 10. 选择一个得分最高的节点
	host, err := g.selectHost(priorityList)
	trace.Step("Prioritizing done")

    // 11. 返回选择的节点
	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: len(filteredNodes) + len(filteredNodesStatuses),
		FeasibleNodes:  len(filteredNodes),
	}, err
}

抢占调度函数preemt的定义如下：

// preempt tries to create room for a pod that has failed to schedule, by preempting lower priority pods if possible.
// If it succeeds, it adds the name of the node where preemption has happened to the pod spec.
// It returns the node name and an error if any.
func (sched *Scheduler) preempt(ctx context.Context, prof *profile.Profile, state *framework.CycleState, preemptor *v1.Pod, scheduleErr error) (string, error) {
	preemptor, err := sched.podPreemptor.getUpdatedPod(preemptor)
	if err != nil {
		klog.Errorf("Error getting the updated preemptor pod object: %v", err)
		return "", err
	}

	node, victims, nominatedPodsToClear, err := sched.Algorithm.Preempt(ctx, prof, state, preemptor, scheduleErr)
	if err != nil {
		klog.Errorf("Error preempting victims to make room for %v/%v: %v", preemptor.Namespace, preemptor.Name, err)
		return "", err
	}
	var nodeName = ""
	if node != nil {
		nodeName = node.Name
		// Update the scheduling queue with the nominated pod information. Without
		// this, there would be a race condition between the next scheduling cycle
		// and the time the scheduler receives a Pod Update for the nominated pod.
		sched.SchedulingQueue.UpdateNominatedPodForNode(preemptor, nodeName)

		// Make a call to update nominated node name of the pod on the API server.
		err = sched.podPreemptor.setNominatedNodeName(preemptor, nodeName)
		if err != nil {
			klog.Errorf("Error in preemption process. Cannot set 'NominatedPod' on pod %v/%v: %v", preemptor.Namespace, preemptor.Name, err)
			sched.SchedulingQueue.DeleteNominatedPodIfExists(preemptor)
			return "", err
		}

		for _, victim := range victims {
			if err := sched.podPreemptor.deletePod(victim); err != nil {
				klog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err)
				return "", err
			}
			// If the victim is a WaitingPod, send a reject message to the PermitPlugin
			if waitingPod := prof.GetWaitingPod(victim.UID); waitingPod != nil {
				waitingPod.Reject("preempted")
			}
			prof.Recorder.Eventf(victim, preemptor, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by %v/%v on node %v", preemptor.Namespace, preemptor.Name, nodeName)

		}
		metrics.PreemptionVictims.Observe(float64(len(victims)))
	}
	// Clearing nominated pods should happen outside of "if node != nil". Node could
	// be nil when a pod with nominated node name is eligible to preempt again,
	// but preemption logic does not find any node for it. In that case Preempt()
	// function of generic_scheduler.go returns the pod itself for removal of
	// the 'NominatedPod' field.
	for _, p := range nominatedPodsToClear {
		rErr := sched.podPreemptor.removeNominatedNodeName(p)
		if rErr != nil {
			klog.Errorf("Cannot remove 'NominatedPod' field of pod: %v", rErr)
			// We do not return as this error is not critical.
		}
	}
	return nodeName, err
}

5. 更新cache来assume pod已经运行到了某个节点上

5.1 获取pod信息

5.2 在assume pod之前先assume volume

调用AssumepodVolumes函数更新volume相关的cache，如：pvCache，pvcCache,podBindingCache
AssumepodVolumes函数定义如下：

// AssumePodVolumes will take the cached matching PVs and PVCs to provision
// in podBindingCache for the chosen node, and:
// 1. Update the pvCache with the new prebound PV.
// 2. Update the pvcCache with the new PVCs with annotations set
// 3. Update podBindingCache again with cached API updates for PVs and PVCs.
func (b *volumeBinder) AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (allFullyBound bool, err error) {
	podName := getPodName(assumedPod)

	klog.V(4).Infof("AssumePodVolumes for pod %q, node %q", podName, nodeName)
	start := time.Now()
	defer func() {
		metrics.VolumeSchedulingStageLatency.WithLabelValues("assume").Observe(time.Since(start).Seconds())
		if err != nil {
			metrics.VolumeSchedulingStageFailed.WithLabelValues("assume").Inc()
		}
	}()

	if allBound := b.arePodVolumesBound(assumedPod); allBound {
		klog.V(4).Infof("AssumePodVolumes for pod %q, node %q: all PVCs bound and nothing to do", podName, nodeName)
		return true, nil
	}

	assumedPod.Spec.NodeName = nodeName

	claimsToBind := b.podBindingCache.GetBindings(assumedPod, nodeName)
	claimsToProvision := b.podBindingCache.GetProvisionedPVCs(assumedPod, nodeName)

	// Assume PV
	newBindings := []*bindingInfo{}
	for _, binding := range claimsToBind {
		newPV, dirty, err := pvutil.GetBindVolumeToClaim(binding.pv, binding.pvc)
		klog.V(5).Infof("AssumePodVolumes: GetBindVolumeToClaim for pod %q, PV %q, PVC %q.  newPV %p, dirty %v, err: %v",
			podName,
			binding.pv.Name,
			binding.pvc.Name,
			newPV,
			dirty,
			err)
		if err != nil {
			b.revertAssumedPVs(newBindings)
			return false, err
		}
		// TODO: can we assume everytime?
		if dirty {
			err = b.pvCache.Assume(newPV)
			if err != nil {
				b.revertAssumedPVs(newBindings)
				return false, err
			}
		}
		newBindings = append(newBindings, &bindingInfo{pv: newPV, pvc: binding.pvc})
	}

	// Assume PVCs
	newProvisionedPVCs := []*v1.PersistentVolumeClaim{}
	for _, claim := range claimsToProvision {
		// The claims from method args can be pointing to watcher cache. We must not
		// modify these, therefore create a copy.
		claimClone := claim.DeepCopy()
		metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, pvutil.AnnSelectedNode, nodeName)
		err = b.pvcCache.Assume(claimClone)
		if err != nil {
			b.revertAssumedPVs(newBindings)
			b.revertAssumedPVCs(newProvisionedPVCs)
			return
		}

		newProvisionedPVCs = append(newProvisionedPVCs, claimClone)
	}

	// Update cache with the assumed pvcs and pvs
	// Even if length is zero, update the cache with an empty slice to indicate that no
	// operations are needed
	b.podBindingCache.UpdateBindings(assumedPod, nodeName, newBindings, newProvisionedPVCs)

	return
}

5.3 运行reserve插件

Reserve插件是为给定的Pod保留节点上的资源，维护运行时状态的插件（也称为stateful plugins）应使用此扩展点由调度程序通知。这是在调度程序实际将Pod绑定到节点之前发生的，并且它存在是为了防止竞争条件，同时调度程序会等待绑定成功。
这是调度周期的最后一步。一旦Pod处于保留状态，它将在绑定周期结束时触发Unreserve插件（失败）或 Post-bind插件（成功）。

// RunReservePlugins runs the set of configured reserve plugins. If any of these
// plugins returns an error, it does not continue running the remaining ones and
// returns the error. In such case, pod will not be scheduled.
func (f *framework) RunReservePlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) (status *Status) {
	startTime := time.Now()
	defer func() {
		metrics.FrameworkExtensionPointDuration.WithLabelValues(reserve, status.Code().String()).Observe(metrics.SinceInSeconds(startTime))
	}()
	for _, pl := range f.reservePlugins {
		status = f.runReservePlugin(ctx, pl, state, pod, nodeName)
		if !status.IsSuccess() {
			msg := fmt.Sprintf("error while running %q reserve plugin for pod %q: %v", pl.Name(), pod.Name, status.Message())
			klog.Error(msg)
			return NewStatus(Error, msg)
		}
	}
	return nil
}

func (f *framework) runReservePlugin(ctx context.Context, pl ReservePlugin, state *CycleState, pod *v1.Pod, nodeName string) *Status {
	if !state.ShouldRecordPluginMetrics() {
		return pl.Reserve(ctx, state, pod, nodeName)
	}
	startTime := time.Now()
	status := pl.Reserve(ctx, state, pod, nodeName)
	f.metricsRecorder.observePluginDurationAsync(reserve, pl.Name(), status, metrics.SinceInSeconds(startTime))
	return status
}

5.4 assume pod

assume函数定义如下：

// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`.
func (sched *Scheduler) assume(assumed *v1.Pod, host string) error {
	// Optimistically assume that the binding will succeed and send it to apiserver
	// in the background.
	// If the binding fails, scheduler will release resources allocated to assumed pod
	// immediately.
	assumed.Spec.NodeName = host

	if err := sched.SchedulerCache.AssumePod(assumed); err != nil {
		klog.Errorf("scheduler cache AssumePod failed: %v", err)
		return err
	}
	// if "assumed" is a nominated pod, we should remove it from internal cache
	if sched.SchedulingQueue != nil {
		sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed)
	}

	return nil
}

6. 运行permit插件

Permit插件用于防止或延迟Pod的绑定。Permit插件可以做以下三件事之一。
1）approve：所有许可插件都批准Pod后，将其发送以进行绑定。
2）deny：如果任何许可插件拒绝Pod，则将其返回到调度队列。这将触发Unreserve插件。
3）wait (with a timeout)：如果许可插件返回wait，则Pod将保持在许可阶段，直到插件批准它为止。如果发生超时，wait将变为拒绝，并且Pod将返回到调度队列，从而触发 Unreserve插件。

// RunPermitPlugins runs the set of configured permit plugins. If any of these
// plugins returns a status other than "Success" or "Wait", it does not continue
// running the remaining plugins and returns an error. Otherwise, if any of the
// plugins returns "Wait", then this function will create and add waiting pod
// to a map of currently waiting pods and return status with "Wait" code.
// Pod will remain waiting pod for the minimum duration returned by the permit plugins.
func (f *framework) RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) (status *Status) {
	startTime := time.Now()
	defer func() {
		metrics.FrameworkExtensionPointDuration.WithLabelValues(permit, status.Code().String()).Observe(metrics.SinceInSeconds(startTime))
	}()
	pluginsWaitTime := make(map[string]time.Duration)
	statusCode := Success
	for _, pl := range f.permitPlugins {
		status, timeout := f.runPermitPlugin(ctx, pl, state, pod, nodeName)
		if !status.IsSuccess() {
			if status.IsUnschedulable() {
				msg := fmt.Sprintf("rejected pod %q by permit plugin %q: %v", pod.Name, pl.Name(), status.Message())
				klog.V(4).Infof(msg)
				return NewStatus(status.Code(), msg)
			}
			if status.Code() == Wait {
				// Not allowed to be greater than maxTimeout.
				if timeout > maxTimeout {
					timeout = maxTimeout
				}
				pluginsWaitTime[pl.Name()] = timeout
				statusCode = Wait
			} else {
				msg := fmt.Sprintf("error while running %q permit plugin for pod %q: %v", pl.Name(), pod.Name, status.Message())
				klog.Error(msg)
				return NewStatus(Error, msg)
			}
		}
	}
	if statusCode == Wait {
		waitingPod := newWaitingPod(pod, pluginsWaitTime)
		f.waitingPods.add(waitingPod)
		msg := fmt.Sprintf("one or more plugins asked to wait and no plugin rejected pod %q", pod.Name)
		klog.V(4).Infof(msg)
		return NewStatus(Wait, msg)
	}
	return nil
}

func (f *framework) runPermitPlugin(ctx context.Context, pl PermitPlugin, state *CycleState, pod *v1.Pod, nodeName string) (*Status, time.Duration) {
	if !state.ShouldRecordPluginMetrics() {
		return pl.Permit(ctx, state, pod, nodeName)
	}
	startTime := time.Now()
	status, timeout := pl.Permit(ctx, state, pod, nodeName)
	f.metricsRecorder.observePluginDurationAsync(permit, pl.Name(), status, metrics.SinceInSeconds(startTime))
	return status, timeout
}

7. 异步的绑定pod到host上

异步绑定pod的流程是通过启动一个goroutine来实现的，具体定义如下：

    // bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
	go func() {
		bindingCycleCtx, cancel := context.WithCancel(ctx)
		defer cancel()
		metrics.SchedulerGoroutines.WithLabelValues("binding").Inc()
		defer metrics.SchedulerGoroutines.WithLabelValues("binding").Dec()

        // 1. 等待permit插件的许可
		waitOnPermitStatus := prof.WaitOnPermit(bindingCycleCtx, assumedPod)
		if !waitOnPermitStatus.IsSuccess() {
			var reason string
			if waitOnPermitStatus.IsUnschedulable() {
				metrics.PodScheduleFailures.Inc()
				reason = v1.PodReasonUnschedulable
			} else {
				metrics.PodScheduleErrors.Inc()
				reason = SchedulerError
			}
			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
				klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
			}
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			sched.recordSchedulingFailure(prof, assumedPodInfo, waitOnPermitStatus.AsError(), reason, waitOnPermitStatus.Message())
			return
		}

        // 2. 绑定volume
		// Bind volumes first before Pod
		if !allBound {
			err := sched.bindVolumes(assumedPod)
			if err != nil {
				sched.recordSchedulingFailure(prof, assumedPodInfo, err, "VolumeBindingFailed", err.Error())
				metrics.PodScheduleErrors.Inc()
				// trigger un-reserve plugins to clean up state associated with the reserved Pod
				prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
				return
			}
		}

        // 3. 运行prebind插件
		// Run "prebind" plugins.
		preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		if !preBindStatus.IsSuccess() {
			var reason string
			metrics.PodScheduleErrors.Inc()
			reason = SchedulerError
			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
				klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
			}
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			sched.recordSchedulingFailure(prof, assumedPodInfo, preBindStatus.AsError(), reason, preBindStatus.Message())
			return
		}

        // 4. 绑定pod
		err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
		metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
		if err != nil {
			metrics.PodScheduleErrors.Inc()
			// trigger un-reserve plugins to clean up state associated with the reserved Pod
			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
			sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, fmt.Sprintf("Binding rejected: %v", err))
		} else {
			// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
			if klog.V(2) {
				klog.Infof("pod %v/%v is bound successfully on node %q, %d nodes evaluated, %d nodes were found feasible.", assumedPod.Namespace, assumedPod.Name, scheduleResult.SuggestedHost, scheduleResult.EvaluatedNodes, scheduleResult.FeasibleNodes)
			}

			metrics.PodScheduleSuccesses.Inc()
			metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
			metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))

            // 5. 运行postbind插件
			// Run "postbind" plugins.
			prof.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		}
	}()

异步绑定pod的主要流程如下：
1）等待permit插件的许可
2）绑定volume
3）运行prebind插件
4）绑定pod，如果失败执行unreserve插件
5）运行postbind插件

总结：

至此，通过5篇文章，我们完成了kube-scheduler组件源码执行流程分析，在分析过程中如果有任何问题或不正确的地方，还望各位大佬不吝赐教。

你可能感兴趣的:(Kubernetes)

ABP VNext + RediSearch：微服务级全文检索 Kookoos Abp vNext .net 微服务全文检索架构 ABP vNext Redis
ABPVNext+RediSearch：微服务级全文检索目录ABPVNext+RediSearch：微服务级全文检索一、背景与动机️二、环境与依赖2.1DockerCompose启动RedisStack2.2Kubernetes部署（示例Manifest）2.3ABPVNext&NuGet包️三、架构与流程图️四、索引模型与依赖注入4.1模型定义4.2服务注册️五、IndexService&Sea
Kubernetes 资源调度中标签（Label）和选择器（Selector）深入理解 pengdott 云原生 kubernetes java 容器
目录前言：一、什么是标签（Label）二、什么是选择器（Selector）三、标签和选择器的应用四、最佳实践五、总结前言：在Kubernetes中，标签（Label）和选择器（Selector）是资源调度中非常重要的概念。它们帮助我们组织、分类和选择集群中的资源对象。通过标签和选择器，Kubernetes可以轻松地管理和调度Pods、服务（Services）以及其他资源对象。本文将深入探讨Kube
在Amazon EKS中应用Amazon Fargate的Serverless容器化实践 AWS官方合作商 serverless 云原生 aws
本文深度解析如何通过Fargate实现EKS集群的Serverless节点管理，大幅降低K8s运维复杂度一、为什么选择Fargate+EKS？AmazonEKS（ElasticKubernetesService）提供托管式K8s集群，而Fargate作为无服务器计算引擎，二者结合可解决以下痛点：运维简化无需管理WorkerNode（EC2实例）自动处理节点扩缩容/打补丁/安全加固成本优化按Pod资
Kubernetes基于helm部署jenkins lldhsds kubernetes 云计算 kubernetes jenkins devops
Kubernetes基于helm安装jenkinsjenkins支持war包、docker镜像、系统安装包、helm安装等。在Kubernetes上使用Helm安装Jenkins可以简化安装和管理Jenkins的过程。同时借助Kubernetes，jenkins可以实现工作节点的动态调用伸缩，更好的提高资源利用率。通过Jenkins的kubernetes-plugin来实现将Jenkins运行在K
容器与 Kubernetes 基本概念与架构木鱼时刻软件开发 kubernetes 架构容器
文章目录1.典型环境层次结构2.Kubernetes生态三大类2.1核心组件2.2集群管理工具2.3生态辅助工具2.4资源管理关系3.Docker容器技术与实践3.1镜像拉取加速3.2认证与登录3.3常用命令3.4存储挂载方式对比3.5docker-compose启动3.6容器化应用部署示例4.kind快速启动5.参考资料1.典型环境层次结构物理机/宿主机：运行虚拟化或容器化环境的基础硬件。虚拟机
Java容器化核弹级优化：Kubernetes资源调度与性能飙升指南——让Java应用在容器中跑出0.01秒响应！墨夶 Java学习资料5 java kubernetes 开发语言
在云原生时代，Java应用的容器化部署已成为标配，但如何让Java在容器编排中实现资源利用率提升400%、响应时间缩短至毫秒级？本文将揭秘10大核心优化策略，通过**20000行代码级深度解析一、Java容器化资源调度的核心挑战1.1资源争夺的“死亡螺旋”//未优化的Java容器典型问题publicclassResourceStarvation{publicstaticvoidmain(<
ElasticCTR：一键部署的分布式CTR预估解决方案萧桔格Wilbur
ElasticCTR：一键部署的分布式CTR预估解决方案ElasticCTRElasticCTR，即飞桨弹性计算推荐系统，是基于Kubernetes的企业级推荐系统开源解决方案。该方案融合了百度业务场景下持续打磨的高精度CTR模型、飞桨开源框架的大规模分布式训练能力、工业级稀疏参数弹性调度服务，帮助用户在Kubernetes环境中一键完成推荐系统部署，具备高性能、工业级部署、端到端体验的特点，并且
containerd
一、理论Containerd是容器底层运行时，c/s架构。docker运行需要containerd作为容器底层运行时。kubernetes1.24版本之前（不包含1.24版本）支持docker、containerd等容器底层运行时，1.24版本之后（包含1.24版本）默认容器底层运行时就是containerd。containerd由storage、metadata、runtimes三大组件组成st
大规模分布式数据库读写分离架构：一致性、可用性与性能的权衡实践
目录1引言：数据库架构的核心三角2原创架构设计2.1读写分离系统架构2.2读写核心流程3企业级实现代码3.1Python路由服务核心代码3.2TypeScript复制状态监控3.3Kubernetes部署YAML示例4性能对比量化分析5生产级部署与安全方案5.1高可用部署架构5.2安全审计方案6技术前瞻性分析6.1演进路线图6.2关键趋势解读7附录：完整技术图谱结论1引言：数据库架构的核心三角在大
Spring Cloud（微服务部署与监控）白仑色 Spring系列 spring cloud 微服务 spring 微服务部署服务监控健康检查
摘要在微服务架构中，随着服务数量的增长和部署复杂度的提升，如何高效部署、持续监控、快速定位问题并实现自动化运维成为保障系统稳定性的关键。本文将围绕SpringCloud微服务的部署与监控展开，深入讲解：微服务打包与部署方式（JAR/Docker/Kubernetes）如何构建CI/CD流水线服务健康检查与自动恢复机制Prometheus+Grafana实现指标可视化监控ELK实现日志集中管理Sky
Kubernetes Pod 调度基础眠修 kubernetes 容器云原生
目录一、ReplicationController和ReplicaSet1、ReplicationController（复制控制器，RC）（1）编辑ReplicationController文件（2）创建ReplicationController（3）删除一个pod并立即查看pod状态（4）删除ReplicationController2、标签与标签选择器（1）标签（2）标签选择器基于等式的选择器
Kubernetes Pod常见的几种调度方式 Seal^_^ 【云原生】容器化与编排技术持续集成 #Kubernetes kubernetes 容器云原生 K8s Pod Pod的几种调度方式面试
KubernetesPod常见的几种调度方式1、Deployment或ReplicationController(RC)2、NodeSelector（定向调度）3、NodeAffinity（亲和性调度）4、Taints和Tolerations（污点和容忍）TheBegin点点关注，收藏不迷路1、Deployment或ReplicationController(RC)功能：自动部署容器应用的多份副本
Kubernetes Pod调度基础别骂我h 个人笔记容器
目录一、ReplicationController和ReplicaSet1.ReplicationControllerReplicationController的使用示例2.标签与标签选择器标签标签选择器标签与标签选择器举例3.ReplicaSet定义ReplicaSet实例二、无状态应用管理Deployment1.什么是无状态2.无状态服务特点3.无状态服务的应用场景4.创建Deployment
Spring Boot + ONNX Runtime模型部署
文章目录前言一、模型导出二、Java推理引擎选型三、SpringBoot实战3.1核心架构3.2分层架构详细实现1.Controller层-请求入口2.Service层-核心业务流程3.关键组件深度优化四、云原生部署：Docker+Kubernetes总结前言在AI浪潮席卷全球的今天，Java工程师如何守住后端主战场？模型部署正是Java工程师融入AI领域的方向。为什么Java工程师必须掌握模型部
多容器应用与编排——AI教你学Docker LuckyLay AI教你学Docker 人工智能 docker 容器
2.2多容器应用与编排现代应用通常由多个服务（如Web、数据库、缓存等）组成，每个服务运行在独立的容器里。如何高效管理、协调、扩展、升级这些多容器应用，成为容器化实践的核心。容器编排工具（如DockerCompose、Swarm、Kubernetes）正是为此而生。一、Compose：本地/开发环境多容器编排1.概述DockerCompose是用于定义和运行多容器Docker应用的工具。通过doc
低代码平台架构设计 LINGYI_WEN 低代码前端开发语言
1.整体架构概述1.1技术栈选择前端：React+Redux/Vue+Vuex后端：Node.js+Express/SpringBoot数据库：MySQL/PostgreSQL/MongoDB云服务：AWS/Azure/GoogleCloud容器化：Docker+Kubernetes1.2模块划分前端模块：可视化编辑器：用于拖拽和配置组件预览器：实时预览页面效果发布器：将设计好的页面发布到生产环境
【Kubernetes】ReplicaSet 如何选择要删除的 Pod - 缩容优先级深度解析 showyoui 云原生开源 kubernetes 容器云原生
文章目录概述核心问题：控制器如何在自己的Pod中做选择？ReplicaSet的删除优先级排序特殊情况：StatefulSet决策流程图关键应用：使用`pod-deletion-cost`总结概述当您缩减一个Deployment或ReplicaSet的副本数时，控制器必须从其管理的众多Pod中做出选择：删除哪一个？这是一个在应用更新和弹性伸缩中频繁发生的操作。与因节点资源不足而引发的"被动"驱逐不同
Kubernetes第八章--存储类型运维小贺 kubernetes 容器云计算云原生运维
k8s存储概述在Kubernetes（K8s）中，存储系统是一个关键的组成部分，用于管理容器化应用的数据持久性和共享性。K8s的存储分类可以从多个维度进行理解，但主要分为两大类：临时存储和持久存储。关于元数据和真实数据的分类，虽然这两个概念在存储系统中普遍存在，但在K8s的存储分类中，它们并不是直接用于分类存储类型的标准。不过，可以从K8s存储类型如何管理和使用这些数据的角度来探讨。k8s支持的卷
Kubernetes第七章--Service详解 (纯干货) 运维小贺 kubernetes 容器云原生 docker etcd
Service存在的意义？引入Service主要是解决Pod的动态变化，通过创建Service，可以为一组具有相同功能的容器应用提供一个统一的入口地址，并且将请求负载分发到后端的各个容器应用上。若提供服务的容器应用是分布式，所以存在多个pod副本，而Pod副本数量可能在运行过程中动态改变，比如水平扩缩容，或者服务器发生故障Pod的IP地址也有可能发生变化。当pod的地址端口发生改变后，客户端再想连
Kubernetes Pod 调度基础
目录一、ReplicationController与ReplicaSet：Pod副本数的守护者1.1ReplicationController：确保Pod副本数的基础机制1.1.1ReplicationController实践示例1.2标签与标签选择器：Kubernetes对象管理的核心机制1.2.1标签（Label）的定义与规范1.2.2标签选择器（LabelSelector）的类型与用法1.2
16.6 《3分钟扩容20实例！LanguageMentor容器化部署实战：高并发下的负载均衡与自动扩展方案》少林码僧负载均衡运维人工智能语言模型机器学习 langchain llama
LanguageMentorAgent容器化部署与发布：高并发场景下的负载均衡与自动扩展关键词：KubernetesHPA,AWSAutoScaling,会话亲和性,监控指标,滚动更新1.高并发场景下的架构挑战LanguageMentor作为对话式Agent需要处理多用户同时在线会话，容器化部署需解决两个核心问题：
如何在宝塔面板中配置SSL证书？奔跑吧邓邓子高效运维 ssl 服务器网络协议
提示：“奔跑吧邓邓子”的高效运维专栏聚焦于各类运维场景中的实际操作与问题解决。内容涵盖服务器硬件（如IBMSystem3650M5）、云服务平台（如腾讯云、华为云）、服务器软件（如Nginx、Apache、GitLab、Redis、Elasticsearch、Kubernetes、Docker等）、开发工具（如Git、HBuilder）以及网络安全（如挖矿病毒排查、SSL证书配置）等多个方面。无论
一文读懂Kubernetes：架构、优势与应用 t0_54program 大数据与人工智能 kubernetes 架构容器个人开发
在当今的云原生计算领域，容器和Kubernetes的应用极为广泛。尽管Kubernetes是一项相对较新的技术，但众多全球企业已在生产环境中用它来管理关键业务应用程序。它之所以广受欢迎，得益于其一系列强大的功能，如增强的安全性、更出色的微服务管理能力、更高的可观测性，以及更高效的扩展和资源利用。什么是Kubernetes？Kubernetes，常简称为k8s，是谷歌实验室于2014年开发的开源容器
一文读懂Kubernetes之 K8s 概述野熊佩骑 Linux系统应用运维 kubernetes 容器云原生 docker 微服务 kubelet devops
目录一、Kubernetes集群组件(一)、控制平面组件(ControlPlaneComponents)1、kube-apiserver2、etcd3、kube-scheduler4、kube-controller-manager5、cloud-controller-manager(可选的)(二)、节点组件1、kubelet2、kube-proxy(可选的)3、容器运行时(Containerrun
云上游戏服务器架构全解析你一身傲骨怎能输架构设计游戏服务器架构
文章摘要本文提出了一套现代化、可落地的云上游戏服务器架构方案，针对FPS、MOBA、MMO等游戏类型的高并发、低延迟需求。该架构采用微服务设计，包含全球接入层、API网关、匹配/大厅服务、对局服务器、业务微服务等组件，通过Kubernetes实现弹性伸缩，支持百万级玩家同时在线。关键技术包括：多地域部署降低延迟、WebSocket/UDP实时通信、帧同步/状态同步机制、Saga分布式事务处理以及完
Spring Boot和Spring Cloud微服务架构实战指南 Javen Fang
本文还有配套的精品资源，点击获取简介：本文介绍微服务架构的基本概念及其与SpringBoot和SpringCloud的关系。SpringBoot简化了Spring应用的初始搭建和开发流程，而SpringCloud提供了一系列微服务解决方案，如服务发现、配置中心等。通过实例说明如何搭建和配置微服务，并包含脚本配置的使用，如Docker和Kubernetes来管理微服务部署。文档和具体项目文件如"se
K8S必问面试题之：K8S架构中每个组件的作用运维爱背锅 K8S面试题 kubernetes 架构容器 K8S面试题面试 devops 运维
微信关注运维爱背锅，用通俗易懂的方式教你运维K8S面试题：K8S架构中每个组件的作用大家好！今天我们来聊聊Kubernetes（简称K8S）中各个组件的作用，这是一道必问的面试题——各个组件就像一支分工明确的足球队，有人守门、有人射门，还有人负责喊战术。下面咱们就用“人话”拆解一下这些组件的职责。1.etcd：集群的“八卦的小本本”作用：分布式K-V（键值）存储数据库，专门记录集群的所有“秘密”，
容器化与微服务何遇mirror 服务器容器微服务
目录编辑第一节：容器化与微服务第二节：Docker与Kubernetes的介绍第三节：容器与传统虚拟化的对比第四节：微服务架构与虚拟化实际案例分析第一节：容器化与微服务容器化与微服务概述容器化是一种轻量级的虚拟化技术，它允许开发者将应用程序及其依赖项打包成一个可移植的容器。微服务架构则是一种将大型应用程序分解为小的、独立的服务的方法，这些服务可以独立部署、扩展和维护。容器化的优势轻量级：容器使用共
云原生灰度方案对比：服务网格灰度（Istio ）与 K8s Ingress 灰度（Nginx Ingress ）大手你不懂微服务-云原生 Java Java项目实战云原生 istio kubernetes 微服务
服务网格灰度与KubernetesIngress灰度是云原生环境下两种主流的灰度发布方案，它们在架构定位、实现方式和适用场景上存在显著差异。以下从多个维度对比分析，并给出选型建议：一、核心区别对比维度服务网格灰度（以Istio为例）K8sIngress灰度（以NginxIngress为例）架构层级网络层（L7），工作在服务间通信层面边缘网关层，工作在集群入口处流量控制范围服务间的全链路流量集群外部
如何设计一个高并发系统？从哪些方面考虑？真IT布道者架构性能优化分布式
核心观点：高并发系统设计需要从架构分层、资源扩展、性能优化、容错机制四个维度综合考量，通过分布式架构和异步化等手段实现系统弹性。一、架构分层设计1.分层解耦接入层：使用Nginx/LVS实现负载均衡，采用DNS轮询或Anycast进行流量分发服务层：微服务架构（如SpringCloud或Kubernetes），服务按功能垂直拆分数据层：读写分离（MySQL主从）+分库分表（ShardingSphe
mongodb3.03开启认证 21jhf mongodb
下载了最新mongodb3.03版本，当使用--auth 参数命令行开启mongodb用户认证时遇到很多问题，现总结如下：（百度上搜到的基本都是老版本的，看到db.addUser的就是，请忽略） Windows下我做了一个bat文件，用来启动mongodb，命令行如下： mongod --dbpath db\data --port 27017 --directoryperdb --logp
【Spark103】Task not serializable bit1129 Serializable
Task not serializable是Spark开发过程最令人头疼的问题之一，这里记录下出现这个问题的两个实例，一个是自己遇到的，另一个是stackoverflow上看到。等有时间了再仔细探究出现Task not serialiazable的各种原因以及出现问题后如何快速定位问题的所在，至少目前阶段碰到此类问题，没有什么章法 1. package spark.exampl
你所熟知的 LRU(最近最少使用) dalan_123 java
关于LRU这个名词在很多地方或听说，或使用，接下来看下lru缓存回收的实现 1、大体的想法 a、查询出最近最晚使用的项 b、给最近的使用的项做标记通过使用链表就可以完成这两个操作，关于最近最少使用的项只需要返回链表的尾部；标记最近使用的项，只需要将该项移除并放置到头部，那么难点就出现你如何能够快速在链表定位对应的该项？这时候多
Javascript 跨域周凡杨 JavaScript jsonp 跨域 cross-domain
linux下安装apache服务器 g21121 apache
安装apache 下载windows版本apache，下载地址：http://httpd.apache.org/download.cgi 1.windows下安装apache Windows下安装apache比较简单，注意选择路径和端口即可，这里就不再赘述了。 2.linux下安装apache：下载之后上传到linux的相关目录，这里指定为/home/apach
FineReport的JS编辑框和URL地址栏语法简介老A不折腾 finereport web报表报表软件语法总结
JS编辑框： 1.FineReport的js。作为一款BS产品，browser端的JavaScript是必不可少的。 FineReport中的js是已经调用了finereport.js的。大家知道，预览报表时，报表servlet会将cpt模板转为html，在这个html的head头部中会引入FineReport的js，这个finereport.js中包含了许多内置的fun
根据STATUS信息对MySQL进行优化墙头上一根草 status
mysql 查看当前正在执行的操作，即正在执行的sql语句的方法为: show processlist 命令 mysql> show global status;可以列出MySQL服务器运行各种状态值，我个人较喜欢的用法是show status like '查询值%';一、慢查询mysql> show variab
我的spring学习笔记7-Spring的Bean配置文件给Bean定义别名 aijuans Spring 3
本文介绍如何给Spring的Bean配置文件的Bean定义别名？原始的 <bean id="business" class="onlyfun.caterpillar.device.Business"> <property name="writer"> <ref b
高性能mysql 之性能剖析 annan211 性能 mysql mysql 性能剖析剖析
1 定义性能优化 mysql服务器性能，此处定义为响应时间。在解释性能优化之前，先来消除一个误解，很多人认为，性能优化就是降低cpu的利用率或者减少对资源的使用。这是一个陷阱。资源时用来消耗并用来工作的，所以有时候消耗更多的资源能够加快查询速度，保持cpu忙绿，这是必要的。很多时候发现编译进了新版本的InnoDB之后，cpu利用率上升的很厉害，这并不
主外键和索引唯一性约束百合不是茶索引唯一性约束主外键约束联机删除
目标;第一步;创建两张表用户表和文章表第二步;发表文章 1,建表; ---用户表 BlogUsers --userID唯一的 --userName --pwd --sex create
线程的调度 bijian1013 java 多线程 thread 线程的调度 java多线程
1. Java提供一个线程调度程序来监控程序中启动后进入可运行状态的所有线程。线程调度程序按照线程的优先级决定应调度哪些线程来执行。 2. 多数线程的调度是抢占式的（即我想中断程序运行就中断，不需要和将被中断的程序协商） a)
查看日志常用命令 bijian1013 linux 命令 unix
一.日志查找方法，可以用通配符查某台主机上的所有服务器grep "关键字" /wls/applogs/custom-*/error.log 二.查看日志常用命令1.grep '关键字' error.log：在error.log中搜索'关键字'2.grep -C10 '关键字' error.log：显示关键字前后10行记录3.grep '关键字' error.l
【持久化框架MyBatis3一】MyBatis版HelloWorld bit1129 helloworld
MyBatis这个系列的文章，主要参考《Java Persistence with MyBatis 3》。样例数据本文以MySQL数据库为例，建立一个STUDENTS表，插入两条数据，然后进行单表的增删改查 CREATE TABLE STUDENTS ( stud_id int(11) NOT NULL AUTO_INCREMENT,
【Hadoop十五】Hadoop Counter bit1129 hadoop
1. 只有Map任务的Map Reduce Job File System Counters FILE: Number of bytes read=3629530 FILE: Number of bytes written=98312 FILE: Number of read operations=0 FILE: Number of lar
解决Tomcat数据连接池无法释放 ronin47 tomcat 连接池　优化
近段时间，公司的检测中心报表系统(SMC)的开发人员时不时找到我，说用户老是出现无法登录的情况。前些日子因为手头上有Jboss集群的测试工作，发现用户不能登录时，都是在Tomcat中将这个项目Reload一下就好了，不过只是治标而已，因为大概几个小时之后又会再次出现无法登录的情况。今天上午，开发人员小毛又找到我，要我协助将这个问题根治一下，拖太久用户难保不投诉。简单分析了一
java-75-二叉树两结点的最低共同父结点 bylijinnan java
import java.util.LinkedList; import java.util.List; import ljn.help.*; public class BTreeLowestParentOfTwoNodes { public static void main(String[] args) { /* * node data is stored in
行业垂直搜索引擎网页抓取项目 carlwu Lucene Nutch Heritrix Solr
公司有一个搜索引擎项目，希望各路高人有空来帮忙指导，谢谢！这是详细需求：（1）通过提供的网站地址(大概100-200个网站)，网页抓取程序能不断抓取网页和其它类型的文件（如Excel、PDF、Word、ppt及zip类型），并且程序能够根据事先提供的规则，过滤掉不相干的下载内容。（2）程序能够搜索这些抓取的内容，并能对这些抓取文件按照油田名进行分类，然后放到服务器不同的目录中。
[通讯与服务]在总带宽资源没有大幅增加之前,不适宜大幅度降低资费 comsci 资源
降低通讯服务资费，就意味着有更多的用户进入，就意味着通讯服务提供商要接待和服务更多的用户，在总体运维成本没有由于技术升级而大幅下降的情况下，这种降低资费的行为将导致每个用户的平均带宽不断下降，而享受到的服务质量也在下降，这对用户和服务商都是不利的。。。。。。。。 &nbs
Java时区转换及时间格式 Cwind java
本文介绍Java API 中 Date, Calendar, TimeZone和DateFormat的使用，以及不同时区时间相互转化的方法和原理。问题描述：向处于不同时区的服务器发请求时需要考虑时区转换的问题。譬如，服务器位于东八区（北京时间，GMT+8:00），而身处东四区的用户想要查询当天的销售记录。则需把东四区的“今天”这个时间范围转换为服务器所在时区的时间范围。
readonly,只读，不可用 dashuaifu js jsp disable readOnly readOnly
readOnly 和 readonly 不同，在做js开发时一定要注意函数大小写和jsp黄线的警告！！！我就经历过这么一件事：使用readOnly在某些浏览器或同一浏览器不同版本有的可以实现“只读”功能，有的就不行，而且函数readOnly有黄线警告！！！就这样被折磨了不短时间！！！（期间使用过disable函数，但是发现disable函数之后后台接收不到前台的的数据！！！）
LABjs、RequireJS、SeaJS 介绍 dcj3sjt126com js Web
LABjs 的核心是 LAB（Loading and Blocking）：Loading 指异步并行加载，Blocking 是指同步等待执行。LABjs 通过优雅的语法（script 和 wait）实现了这两大特性，核心价值是性能优化。LABjs 是一个文件加载器。RequireJS 和 SeaJS 则是模块加载器，倡导的是一种模块化开发理念，核心价值是让 JavaScript 的模块化开发变得更
[应用结构]入口脚本 dcj3sjt126com PHP yii2
入口脚本入口脚本是应用启动流程中的第一环，一个应用（不管是网页应用还是控制台应用）只有一个入口脚本。终端用户的请求通过入口脚本实例化应用并将将请求转发到应用。 Web 应用的入口脚本必须放在终端用户能够访问的目录下，通常命名为 index.php，也可以使用 Web 服务器能定位到的其他名称。控制台应用的入口脚本一般在应用根目录下命名为 yii（后缀为.php），该文
haoop shell命令 eksliang hadoop hadoop shell
cat chgrp chmod chown copyFromLocal copyToLocal cp du dus expunge get getmerge ls lsr mkdir movefromLocal mv put rm rmr setrep stat tail test text
MultiStateView不同的状态下显示不同的界面 gundumw100 android
只要将指定的view放在该控件里面，可以该view在不同的状态下显示不同的界面，这对ListView很有用，比如加载界面，空白界面，错误界面。而且这些见面由你指定布局，非常灵活。 PS：ListView虽然可以设置一个EmptyView，但使用起来不方便，不灵活，有点累赘。 <com.kennyc.view.MultiStateView xmlns:android=&qu
jQuery实现页面内锚点平滑跳转 ini JavaScript html jquery html5 css
平时我们做导航滚动到内容都是通过锚点来做，刷的一下就直接跳到内容了，没有一丝的滚动效果，而且 url 链接最后会有“小尾巴”，就像#keleyi，今天我就介绍一款 jquery 做的滚动的特效，既可以设置滚动速度，又可以在 url 链接上没有“小尾巴”。效果体验：http://keleyi.com/keleyi/phtml/jqtexiao/37.htmHTML文件代码： &
kafka offset迁移 kane_xie kafka
在早前的kafka版本中（0.8.0），offset是被存储在zookeeper中的。到当前版本（0.8.2）为止，kafka同时支持offset存储在zookeeper和offset manager（broker）中。从官方的说明来看，未来offset的zookeeper存储将会被弃用。因此现有的基于kafka的项目如果今后计划保持更新的话，可以考虑在合适
android > 搭建 cordova 环境 mft8899 android
1 , 安装 node.js http://nodejs.org node -v 查看版本 2, 安装 npm 可以先从 https://github.com/isaacs/npm/tags 下载源码解压到
java封装的比较器，比较是否全相同，获取不同字段名字 qifeifei
非常实用的java比较器，贴上代码： import java.util.HashSet; import java.util.List; import java.util.Set; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import net.sf.json.JsonConfig; i
记录一些函数用法 .Aky. 位运算 PHP 数据库函数 IP
高手们照旧忽略。想弄个全天朝IP段数据库，找了个今天最新更新的国内所有运营商IP段，copy到文件，用文件函数，字符串函数把玩下。分割出startIp和endIp这样格式写入.txt文件，直接用phpmyadmin导入.csv文件的形式导入。（生命在于折腾，也许你们觉得我傻X，直接下载人家弄好的导入不就可以，做自己的菜鸟，让别人去说吧）当然用到了ip2long()函数把字符串转为整型数
sublime text 3 rust wudixiaotie Sublime Text
1.sublime text 3 => install package => Rust 2.cd ~/.config/sublime-text-3/Packages 3.mkdir rust 4.git clone https://github.com/sp0/rust-style 5.cd rust-style 6.cargo build --release 7.ctrl