kube-scheduler Source Code Analysis --- Initialization Process

Table of Contents

Framework

The kube-scheduler cmd directory layout

The main function

Core code extracted

NewOptions

Options.Config

The config object

Informer initialization

The Run function

NewSchedulerConfig analysis

schedulerCache initialization and structure

Scheduler queue implementation

InformerFactory.Start

WaitForCacheSync

InformerFactory.WaitForCacheSync

controller.WaitForCacheSync

LeaderElection

Scheduler.Run

Summary


The following code analysis is based on Kubernetes v1.12.4.

Framework

The source code analysis follows this overall outline:

  1. The initialization process: which data structures are used and how they are initialized
  2. The overall scheduling flow
  3. The predicate (filtering) algorithms
  4. The priority (scoring) algorithms
  5. Preemption scheduling

The kube-scheduler cmd directory layout

kube-scheduler
├── BUILD
├── OWNERS
├── app # parameters and configuration objects for running the scheduler
│   ├── BUILD
│   ├── config
│   │   ├── BUILD
│   │   └── config.go # the scheduler's Config object
│   ├── options  # scheduler startup options
│   │   ├── BUILD
│   │   ├── configfile.go
│   │   ├── deprecated.go
│   │   ├── deprecated_test.go
│   │   ├── insecure_serving.go
│   │   ├── insecure_serving_test.go
│   │   ├── options.go # NewOptions, Options and their methods
│   │   └── options_test.go
│   └── server.go   # mainly NewSchedulerCommand, NewSchedulerConfig and the Run function
└── scheduler.go    # main entry point: builds and runs the cobra command and initializes logging

The main function

The kube-scheduler entry point, the main function, follows the same style as the other components and uses the Cobra command-line framework.

Code location: /cmd/kube-scheduler/scheduler.go

func main() {
	// build the schedulerCommand cobra command
	command := app.NewSchedulerCommand()
	// execute the command
	if err := command.Execute(); err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
}

Code location: /cmd/kube-scheduler/app/server.go

// NewSchedulerCommand creates a *cobra.Command object with default parameters
func NewSchedulerCommand() *cobra.Command {
	opts, err := options.NewOptions()
	if err != nil {
		glog.Fatalf("unable to initialize command options: %v", err)
	}

	cmd := &cobra.Command{
		Use: "kube-scheduler",
		Long: `The Kubernetes scheduler is a policy-rich, topology-aware,
workload-specific function that significantly impacts availability, performance,
and capacity. The scheduler needs to take into account individual and collective
resource requirements, quality of service requirements, hardware/software/policy
constraints, affinity and anti-affinity specifications, data locality, inter-workload
interference, deadlines, and so on. Workload-specific requirements will be exposed
through the API as necessary.`,
		Run: func(cmd *cobra.Command, args []string) {
			verflag.PrintAndExitIfRequested()
			utilflag.PrintFlags(cmd.Flags())

			if len(args) != 0 {
				fmt.Fprint(os.Stderr, "arguments are not supported\n")
			}

			if errs := opts.Validate(); len(errs) > 0 {
				fmt.Fprintf(os.Stderr, "%v\n", utilerrors.NewAggregate(errs))
				os.Exit(1)
			}

			if len(opts.WriteConfigTo) > 0 {
				if err := options.WriteConfigFile(opts.WriteConfigTo, &opts.ComponentConfig); err != nil {
					fmt.Fprintf(os.Stderr, "%v\n", err)
					os.Exit(1)
				}
				glog.Infof("Wrote configuration to: %s\n", opts.WriteConfigTo)
				return
			}

			c, err := opts.Config()
			if err != nil {
				fmt.Fprintf(os.Stderr, "%v\n", err)
				os.Exit(1)
			}

			stopCh := make(chan struct{})
			if err := Run(c.Complete(), stopCh); err != nil {
				fmt.Fprintf(os.Stderr, "%v\n", err)
				os.Exit(1)
			}
		},
	}

	opts.AddFlags(cmd.Flags())
	cmd.MarkFlagFilename("config", "yaml", "yml", "json")

	return cmd
}

Core code extracted

// build the Options with default parameters
opts, err := options.NewOptions()
// build the scheduler Config object from the options
c, err := opts.Config()
// run scheduler
stopCh := make(chan struct{})
err := Run(c.Complete(), stopCh)
// register the command-line flags
opts.AddFlags(cmd.Flags())

NewOptions

NewOptions is mainly used to build the parameters and context used by the scheduler server.

  • The Options struct
// Options holds everything needed to run the scheduler
type Options struct {

	// Default scheduler component configuration; overridden when ConfigFile is specified
	ComponentConfig kubeschedulerconfig.KubeSchedulerConfiguration

	SecureServing           *apiserveroptions.SecureServingOptions
	// options for the insecure healthz and metrics endpoints
	CombinedInsecureServing *CombinedInsecureServingOptions
	// authentication and authorization options
	Authentication          *apiserveroptions.DelegatingAuthenticationOptions
	Authorization           *apiserveroptions.DelegatingAuthorizationOptions
	// deprecated flags
	Deprecated              *DeprecatedOptions

	// ConfigFile is the path to the scheduler configuration file.
	ConfigFile string

	// WriteConfigTo is the path where the default configuration will be written.
	WriteConfigTo string
	// Master is the address of the Kubernetes API server
	Master string
}

// KubeSchedulerConfiguration configures a scheduler
type KubeSchedulerConfiguration struct {
	metav1.TypeMeta

	// SchedulerName is the name of this scheduler
	SchedulerName string
	// AlgorithmSource selects the source of the scheduling algorithms: 1. a provider, or 2. a policy file/ConfigMap; the two are mutually exclusive
	AlgorithmSource SchedulerAlgorithmSource

	// Weight of the implicit preferred affinity rule implied by every RequiredDuringScheduling
	// pod-affinity rule. The default value is 1.
	HardPodAffinitySymmetricWeight int32

	// Leader election for high availability (not analyzed here)
	LeaderElection KubeSchedulerLeaderElectionConfiguration

	// Client configuration for talking to the API server: kubeconfig credentials, master address, QPS and Burst
	ClientConnection apimachineryconfig.ClientConnectionConfiguration

	// Default health-check address of the scheduler, 0.0.0.0:10251
	HealthzBindAddress string
	// Metrics address, also 0.0.0.0:10251
	MetricsBindAddress string

	// DebuggingConfiguration holds configuration for Debugging related features
	// TODO: We might wanna make this a substruct like Debugging apiserverconfig.DebuggingConfiguration
	apiserverconfig.DebuggingConfiguration

	// DisablePreemption disables the pod preemption feature when set to true
	DisablePreemption bool

	// Percentage of nodes that, once found feasible, is enough for the scheduler to stop searching;
	// 0 means use the default of 50%. For example, with 500 nodes and a value of 30 the scheduler
	// stops after finding 150 feasible nodes, which improves scheduling performance
	// (see the worked sketch after NewOptions below).
	PercentageOfNodesToScore int32

	FailureDomains string

	// Timeout, in seconds, for the bind call to the API server
	BindTimeoutSeconds *int64
}
// NewOptions returns default scheduler app options.
func NewOptions() (*Options, error) {
	// set defaults for kubeschedulerconfig.KubeSchedulerConfiguration
	cfg, err := newDefaultComponentConfig()
	if err != nil {
		return nil, err
	}

	hhost, hport, err := splitHostIntPort(cfg.HealthzBindAddress)
	if err != nil {
		return nil, err
	}

	o := &Options{
		ComponentConfig: *cfg,
		SecureServing:   nil, // TODO: enable with apiserveroptions.NewSecureServingOptions()
		CombinedInsecureServing: &CombinedInsecureServingOptions{
			Healthz: &apiserveroptions.DeprecatedInsecureServingOptions{
				BindNetwork: "tcp",
			},
			Metrics: &apiserveroptions.DeprecatedInsecureServingOptions{
				BindNetwork: "tcp",
			},
			BindPort:    hport,
			BindAddress: hhost,
		},
		Authentication: nil, // TODO: enable with apiserveroptions.NewDelegatingAuthenticationOptions()
		Authorization:  nil, // TODO: enable with apiserveroptions.NewDelegatingAuthorizationOptions()
		Deprecated: &DeprecatedOptions{
			UseLegacyPolicyConfig:    false,
			PolicyConfigMapNamespace: metav1.NamespaceSystem,
		},
	}

	return o, nil
}
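Before moving on, here is a minimal standalone sketch of what the PercentageOfNodesToScore comment above means in practice. The helper name numFeasibleNodes and the 50% fallback are assumptions made for this illustration; the real logic lives in the generic scheduler, not in options.go.

package main

import "fmt"

// numFeasibleNodes is a hypothetical helper that mirrors the idea behind
// PercentageOfNodesToScore: given the cluster size and the configured
// percentage, return how many feasible nodes the scheduler needs to find
// before it can stop searching. 0 is treated as the assumed default of 50%.
func numFeasibleNodes(numAllNodes int, percentage int32) int {
	if percentage <= 0 || percentage > 100 {
		percentage = 50 // assumed default, matching the comment above
	}
	n := numAllNodes * int(percentage) / 100
	if n < 1 {
		n = 1
	}
	return n
}

func main() {
	// 500 nodes with PercentageOfNodesToScore=30: stop after 150 feasible nodes.
	fmt.Println(numFeasibleNodes(500, 30)) // 150
	// 0 falls back to the assumed 50% default: 250 nodes.
	fmt.Println(numFeasibleNodes(500, 0)) // 250
}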
opts.AddFlags(cmd.Flags()) registers and parses the command-line flags:
// AddFlags adds flags for the scheduler options.
func (o *Options) AddFlags(fs *pflag.FlagSet) {
	fs.StringVar(&o.ConfigFile, "config", o.ConfigFile, "The path to the configuration file. Flags override values in this file.")
	fs.StringVar(&o.WriteConfigTo, "write-config-to", o.WriteConfigTo, "If set, write the configuration values to this file and exit.")
	fs.StringVar(&o.Master, "master", o.Master, "The address of the Kubernetes API server (overrides any value in kubeconfig)")
	fs.Int32Var(&ipam.IpamConfig.Port, "ipam-port", ipam.IpamConfig.Port, "The port that the ipam's http service runs on")
	fs.StringVar(&ipam.IpamConfig.Address, "ipam-address", ipam.IpamConfig.Address, "The IP address that the ipam's http service runs on")
	fs.StringVar(&ipam.IpamConfig.Token, "ipam-token", ipam.IpamConfig.Token, "The token that the ipam's http service runs on")
	fs.Float64Var(&ipam.IpamConfig.RequestLimitCPU, "cpu-limit", ipam.IpamConfig.RequestLimitCPU, "The cpu limit of each node")
	fs.Float64Var(&ipam.IpamConfig.RequestLimitMemory, "memory-limit", ipam.IpamConfig.RequestLimitMemory, "The memory limit of each node")

	o.SecureServing.AddFlags(fs)
	o.CombinedInsecureServing.AddFlags(fs)
	o.Authentication.AddFlags(fs)
	o.Authorization.AddFlags(fs)
	o.Deprecated.AddFlags(fs, &o.ComponentConfig)

	leaderelectionconfig.BindFlags(&o.ComponentConfig.LeaderElection.LeaderElectionConfiguration, fs)
	// feature gates
	utilfeature.DefaultFeatureGate.AddFlag(fs)
}

The default flag values can be read from the startup log:

I0508 18:24:50.931827  121176 feature_gate.go:206] feature gates: &{map[TaintNodesByCondition:false PodPriority:false ScheduleDaemonSetPods:false]}
I0508 18:24:50.932099  121176 flags.go:33] FLAG: --address="0.0.0.0"
I0508 18:24:50.932106  121176 flags.go:33] FLAG: --algorithm-provider=""  // scheduling algorithm provider
I0508 18:24:50.932109  121176 flags.go:33] FLAG: --alsologtostderr="false"
I0508 18:24:50.932113  121176 flags.go:33] FLAG: --config="" // the config file is empty by default
I0508 18:24:50.932115  121176 flags.go:33] FLAG: --contention-profiling="false"
I0508 18:24:50.932119  121176 flags.go:33] FLAG: --cpu-limit="85"
I0508 18:24:50.932124  121176 flags.go:33] FLAG: --failure-domains="kubernetes.io/hostname,failure-domain.beta.kubernetes.io/zone,failure-domain.beta.kubernetes.io/region"     // reported and set by the kubelet
I0508 18:24:50.932129  121176 flags.go:33] FLAG: --feature-gates="PodPriority=false,ScheduleDaemonSetPods=false,TaintNodesByCondition=false"
// feature gates
I0508 18:24:50.932141  121176 flags.go:33] FLAG: --hard-pod-affinity-symmetric-weight="1"
I0508 18:24:50.932145  121176 flags.go:33] FLAG: --help="false"
I0508 18:24:50.932148  121176 flags.go:33] FLAG: --ipam-address=""
I0508 18:24:50.932151  121176 flags.go:33] FLAG: --ipam-port=""
I0508 18:24:50.932153  121176 flags.go:33] FLAG: --ipam-token=""
I0508 18:24:50.932157  121176 flags.go:33] FLAG: --kube-api-burst="100"
I0508 18:24:50.932159  121176 flags.go:33] FLAG: --kube-api-content-type="application/vnd.kubernetes.protobuf"
I0508 18:24:50.932162  121176 flags.go:33] FLAG: --kube-api-qps="50"
I0508 18:24:50.932166  121176 flags.go:33] FLAG: --kubeconfig=""   // not provided; --master is set instead
I0508 18:24:50.932169  121176 flags.go:33] FLAG: --leader-elect="true"
I0508 18:24:50.932171  121176 flags.go:33] FLAG: --leader-elect-lease-duration="15s"
I0508 18:24:50.932175  121176 flags.go:33] FLAG: --leader-elect-renew-deadline="10s"
I0508 18:24:50.932178  121176 flags.go:33] FLAG: --leader-elect-resource-lock="endpoints"
I0508 18:24:50.932180  121176 flags.go:33] FLAG: --leader-elect-retry-period="2s"
I0508 18:24:50.932183  121176 flags.go:33] FLAG: --lock-object-name="kube-scheduler"
I0508 18:24:50.932186  121176 flags.go:33] FLAG: --lock-object-namespace="kube-system"
I0508 18:24:50.932188  121176 flags.go:33] FLAG: --log-backtrace-at=":0"
I0508 18:24:50.932192  121176 flags.go:33] FLAG: --log-dir="/var/log/kubernetes"
I0508 18:24:50.932195  121176 flags.go:33] FLAG: --log-flush-frequency="5s"
I0508 18:24:50.932197  121176 flags.go:33] FLAG: --logtostderr="false"
I0508 18:24:50.932200  121176 flags.go:33] FLAG: --master="http://127.0.0.1:8080" // API server address
I0508 18:24:50.932203  121176 flags.go:33] FLAG: --memory-limit="85"
I0508 18:24:50.932206  121176 flags.go:33] FLAG: --policy-config-file="" // algorithm policy file for the policy source
I0508 18:24:50.932208  121176 flags.go:33] FLAG: --policy-configmap=""
I0508 18:24:50.932210  121176 flags.go:33] FLAG: --policy-configmap-namespace="kube-system"
I0508 18:24:50.932213  121176 flags.go:33] FLAG: --port="10251"
I0508 18:24:50.932216  121176 flags.go:33] FLAG: --profiling="false"
I0508 18:24:50.932219  121176 flags.go:33] FLAG: --scheduler-name="default-scheduler" // scheduler name
I0508 18:24:50.932221  121176 flags.go:33] FLAG: --stderrthreshold="3"
I0508 18:24:50.932224  121176 flags.go:33] FLAG: --use-legacy-policy-config="false"
I0508 18:24:50.932226  121176 flags.go:33] FLAG: --v="8"
I0508 18:24:50.932229  121176 flags.go:33] FLAG: --version="false"
I0508 18:24:50.932233  121176 flags.go:33] FLAG: --vmodule=""
I0508 18:24:50.932236  121176 flags.go:33] FLAG: --write-config-to="" // no config file written out

Options.Config

Options.Config builds the scheduler's configuration object.

// Config return a scheduler config object
func (o *Options) Config() (*schedulerappconfig.Config, error) {
	// the scheduler Config object
	c := &schedulerappconfig.Config{}
	// convert the options into the Config object the scheduler runs with
	if err := o.ApplyTo(c); err != nil {
		return nil, err
	}

	// prepare kube clients.
	// create the cluster clients via client-go, using --master or the kubeconfig
	client, leaderElectionClient, eventClient, err := createClients(c.ComponentConfig.ClientConnection, o.Master, c.ComponentConfig.LeaderElection.RenewDeadline.Duration)
	if err != nil {
		return nil, err
	}

	// Prepare event clients.
	eventBroadcaster := record.NewBroadcaster()
	recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, corev1.EventSource{Component: c.ComponentConfig.SchedulerName})

	// Set up leader election if enabled.
	var leaderElectionConfig *leaderelection.LeaderElectionConfig
	if c.ComponentConfig.LeaderElection.LeaderElect {
		leaderElectionConfig, err = makeLeaderElectionConfig(c.ComponentConfig.LeaderElection, leaderElectionClient, recorder)
		if err != nil {
			return nil, err
		}
	}

	c.Client = client
	c.InformerFactory = informers.NewSharedInformerFactory(client, 0)
	c.PodInformer = factory.NewPodInformer(client, 0)
	c.EventClient = eventClient
	c.Recorder = recorder
	c.Broadcaster = eventBroadcaster
	c.LeaderElection = leaderElectionConfig

	return c, nil
}

The Config function mainly performs the following steps:

  • converts the options into the scheduler's Config object
  • builds the scheduler client, the leaderElectionClient and the eventClient
  • creates the event recorder (a usage sketch follows this list)
  • sets up leader election if enabled
  • creates the informer objects, mainly via NewSharedInformerFactory and NewPodInformer
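As a side note on the recorder created above, the following is a minimal sketch of how an event broadcaster and recorder are typically wired up with client-go and used to emit an event. The component name, the pod object and the event reason are made up for the example.

package main

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/record"
)

func newRecorder(client kubernetes.Interface) record.EventRecorder {
	// The broadcaster fans events out to registered sinks; here it sends them
	// to the API server through the events client, the same thing Run() does
	// with StartRecordingToSink.
	broadcaster := record.NewBroadcaster()
	broadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{
		Interface: client.CoreV1().Events(""),
	})
	// The recorder stamps every event with the component name.
	return broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "my-scheduler"})
}

func main() {
	cfg, err := rest.InClusterConfig() // assumes the process runs inside a cluster
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)
	recorder := newRecorder(client)

	// Emit an event against a (hypothetical) pod object.
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}}
	recorder.Eventf(pod, v1.EventTypeNormal, "Scheduled", "example event from the recorder sketch")
}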

The config object

// Config has all the context to run a Scheduler
type Config struct {
	// the scheduler component configuration.
	ComponentConfig kubeschedulerconfig.KubeSchedulerConfiguration

	InsecureServing        *apiserver.DeprecatedInsecureServingInfo // nil will disable serving on an insecure port
	InsecureMetricsServing *apiserver.DeprecatedInsecureServingInfo // non-nil if metrics should be served independently
	Authentication         apiserver.AuthenticationInfo
	Authorization          apiserver.AuthorizationInfo
	SecureServing          *apiserver.SecureServingInfo

	Client          clientset.Interface // cluster client
	InformerFactory informers.SharedInformerFactory // informer factory
	PodInformer     coreinformers.PodInformer // pod informer
	EventClient     v1core.EventsGetter
	Recorder        record.EventRecorder
	Broadcaster     record.EventBroadcaster

	// LeaderElection is optional.
	LeaderElection *leaderelection.LeaderElectionConfig
}
// ApplyTo applies the options onto the scheduler Config object
func (o *Options) ApplyTo(c *schedulerappconfig.Config) error {
	// no --config file was provided
	if len(o.ConfigFile) == 0 {
		c.ComponentConfig = o.ComponentConfig

		// apply the legacy deprecated flags
		if err := o.Deprecated.ApplyTo(&c.ComponentConfig); err != nil {
			return err
		}
		if err := o.CombinedInsecureServing.ApplyTo(c, &c.ComponentConfig); err != nil {
			return err
		}
	} else {
		// load the configuration from the config file
		cfg, err := loadConfigFromFile(o.ConfigFile)
		if err != nil {
			return err
		}

		// assign the configuration loaded from the file to the Config object
		c.ComponentConfig = *cfg

		if err := o.CombinedInsecureServing.ApplyToFromLoadedConfig(c, &c.ComponentConfig); err != nil {
			return err
		}
	}
	// common configuration
	if err := o.SecureServing.ApplyTo(&c.SecureServing); err != nil {
		return err
	}
	if err := o.Authentication.ApplyTo(&c.Authentication, c.SecureServing, nil); err != nil {
		return err
	}
	return o.Authorization.ApplyTo(&c.Authorization)
}

Informer initialization

Let's briefly look at which data structures are involved in informer initialization and what it does.

c.PodInformer = factory.NewPodInformer(client, 0)
// NewPodInformer creates a shared index informer that returns only non-terminal pods.
func NewPodInformer(client clientset.Interface, resyncPeriod time.Duration) coreinformers.PodInformer {
	// the selector restricts list/watch to non-terminal pods: Pending, Running, Unknown
	selector := fields.ParseSelectorOrDie(
		"status.phase!=" + string(v1.PodSucceeded) +
			",status.phase!=" + string(v1.PodFailed))
	// list/watch client for the pods resource; namespace "" means all namespaces
	// provides the listFunc and watchFunc
	lw := cache.NewListWatchFromClient(client.CoreV1().RESTClient(), string(v1.ResourcePods), metav1.NamespaceAll, selector)
	// the podInformer is actually initialized through cache.NewSharedIndexInformer
	return &podInformer{
		informer: cache.NewSharedIndexInformer(lw, &v1.Pod{}, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}),
	}
}

// NewSharedIndexInformer creates a new instance for the listwatcher.
func NewSharedIndexInformer(lw ListerWatcher, objType runtime.Object, defaultEventHandlerResyncPeriod time.Duration, indexers Indexers) SharedIndexInformer {
	realClock := &clock.RealClock{}
	sharedIndexInformer := &sharedIndexInformer{
		// event processor
		processor:                       &sharedProcessor{clock: realClock},
		// indexed local cache (store) and its index functions
		indexer:                         NewIndexer(DeletionHandlingMetaNamespaceKeyFunc, indexers),
		listerWatcher:                   lw,
		objectType:                      objType,
		resyncCheckPeriod:               defaultEventHandlerResyncPeriod,
		defaultEventHandlerResyncPeriod: defaultEventHandlerResyncPeriod,
		cacheMutationDetector:           NewCacheMutationDetector(fmt.Sprintf("%T", objType)),
		clock: realClock,
	}
	return sharedIndexInformer
}
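For readers less familiar with client-go, the sketch below exercises the same informer machinery outside the scheduler: build a SharedInformerFactory, register an event handler, start the factory and wait for the cache to sync. The kubeconfig path and the handler body are assumptions for the example.

package main

import (
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a client from a local kubeconfig (path is an assumption).
	cfg, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config")
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	// A shared factory with a 0 resync period, just like Options.Config above.
	factory := informers.NewSharedInformerFactory(client, 0)
	podInformer := factory.Core().V1().Pods().Informer()

	// Register an add callback, analogous to the handlers NewConfigFactory
	// registers further below.
	podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			pod := obj.(*v1.Pod)
			fmt.Printf("pod added: %s/%s\n", pod.Namespace, pod.Name)
		},
	})

	stopCh := make(chan struct{})
	defer close(stopCh)

	// Start all informers created so far and wait for their caches to sync,
	// the same two steps Run() performs before scheduling.
	factory.Start(stopCh)
	factory.WaitForCacheSync(stopCh)

	time.Sleep(10 * time.Second) // keep the process alive briefly for the demo
}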

The Run function

err := Run(c.Complete(), stopCh)

Run is the long-running routine that keeps the scheduler's work going.

// Run runs the Scheduler.
func Run(c schedulerserverconfig.CompletedConfig, stopCh <-chan struct{}) error {
	// Apply algorithms based on feature gates.
	// apply scheduling algorithms based on feature gates
	algorithmprovider.ApplyFeatureGates()

	// Build a scheduler config from the provided algorithm source.
	// NewSchedulerConfig builds the scheduler config, from which the Scheduler struct is then created.
	schedulerConfig, err := NewSchedulerConfig(c)
	if err != nil {
		return err
	}

	// Create the scheduler.
	sched := scheduler.NewFromConfig(schedulerConfig)

	// Prepare the event broadcaster.
	if c.Broadcaster != nil && c.EventClient != nil {
		c.Broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: c.EventClient.Events("")})
	}

	// Start all informers.
	// start the podInformer and the informerFactory (client-go's informer machinery)
	go c.PodInformer.Informer().Run(stopCh)
	c.InformerFactory.Start(stopCh)

	// Wait for all caches to sync before scheduling.
	// wait for all local stores to sync before scheduling
	// InformerFactory.WaitForCacheSync waits for the caches of all started informers to sync, keeping the local stores consistent with the latest state in etcd.
	c.InformerFactory.WaitForCacheSync(stopCh)
	controller.WaitForCacheSync("scheduler", stopCh, c.PodInformer.Informer().HasSynced)

	// Prepare a reusable run function.
	run := func(ctx context.Context) {
		sched.Run()
		<-ctx.Done()
	}

	ctx, cancel := context.WithCancel(context.TODO()) // TODO once Run() accepts a context, it should be used here
	defer cancel()

	go func() {
		select {
		case <-stopCh:
			cancel()
		case <-ctx.Done():
		}
	}()

	// If leader election is enabled, run via LeaderElector until done and exit.
	// if leader election is enabled, run via LeaderElector until done and then exit.
	if c.LeaderElection != nil {
		c.LeaderElection.Callbacks = leaderelection.LeaderCallbacks{
			OnStartedLeading: run,
			OnStoppedLeading: func() {
				utilruntime.HandleError(fmt.Errorf("lost master"))
			},
		}
		leaderElector, err := leaderelection.NewLeaderElector(*c.LeaderElection)
		if err != nil {
			return fmt.Errorf("couldn't create leader elector: %v", err)
		}

		leaderElector.Run(ctx)

		return fmt.Errorf("lost lease")
	}

	// Leader election is disabled, so run inline until done.
	run(ctx)
	return fmt.Errorf("finished without leader elect")
}

The Run function mainly does the following:

  • creates the Scheduler struct from the scheduler config
  • runs the event broadcaster, the healthz server and the metrics server
  • starts all informers and waits for their caches to sync before scheduling (the key step)
  • calls sched.Run() to run the scheduling loop
  • if multiple schedulers run with LeaderElect enabled, performs leader election

NewSchedulerConfig analysis

// NewSchedulerConfig creates the scheduler configuration. This is exposed for use by tests.
func NewSchedulerConfig(s schedulerserverconfig.CompletedConfig) (*scheduler.Config, error) {
	var storageClassInformer storageinformers.StorageClassInformer
	// if the VolumeScheduling feature gate is enabled, create a storageClassInformer
	if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
		storageClassInformer = s.InformerFactory.Storage().V1().StorageClasses()
	}

	// Set up the configurator which can create schedulers from configs.
	// the scheduler configurator
	// factory.NewConfigFactory is analyzed below; it mainly sets up the cache, the queues and other structures used during initialization
	configurator := factory.NewConfigFactory(&factory.ConfigFactoryArgs{
		SchedulerName:                  s.ComponentConfig.SchedulerName,
		Client:                         s.Client,
		NodeInformer:                   s.InformerFactory.Core().V1().Nodes(),
		PodInformer:                    s.PodInformer,
		PvInformer:                     s.InformerFactory.Core().V1().PersistentVolumes(),
		PvcInformer:                    s.InformerFactory.Core().V1().PersistentVolumeClaims(),
		ReplicationControllerInformer:  s.InformerFactory.Core().V1().ReplicationControllers(),
		ReplicaSetInformer:             s.InformerFactory.Apps().V1().ReplicaSets(),
		StatefulSetInformer:            s.InformerFactory.Apps().V1().StatefulSets(),
		ServiceInformer:                s.InformerFactory.Core().V1().Services(),
		PdbInformer:                    s.InformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
		StorageClassInformer:           storageClassInformer,
		HardPodAffinitySymmetricWeight: s.ComponentConfig.HardPodAffinitySymmetricWeight,
		// https://my.oschina.net/jxcdwangtao/blog/1813858
		EnableEquivalenceClassCache:    utilfeature.DefaultFeatureGate.Enabled(features.EnableEquivalenceClassCache),
		DisablePreemption:              s.ComponentConfig.DisablePreemption,
		PercentageOfNodesToScore:       s.ComponentConfig.PercentageOfNodesToScore,
		BindTimeoutSeconds:             *s.ComponentConfig.BindTimeoutSeconds,
	})
	// pick the algorithm source from the configuration and fill in the scheduler config
	// algorithm registration and the whole scheduler config flow are analyzed in the next section
	source := s.ComponentConfig.AlgorithmSource
	var config *scheduler.Config
	// ..... predicate/priority algorithm registration and the scheduler.Config construction are omitted here
	// Additional tweaks to the config produced by the configurator.
	config.Recorder = s.Recorder
	// whether pod preemption is disabled
	config.DisablePreemption = s.ComponentConfig.DisablePreemption
	return config, nil
}

// NewConfigFactory initializes the default implementation of a Configurator To encourage eventual privatization of the struct type, we only
// return the interface.
func NewConfigFactory(args *ConfigFactoryArgs) scheduler.Configurator {
	stopEverything := make(chan struct{})
	// the scheduler cache
	schedulerCache := schedulercache.New(30*time.Second, stopEverything)

	// storageClassInformer is only enabled through VolumeScheduling feature gate
	var storageClassLister storagelisters.StorageClassLister
	if args.StorageClassInformer != nil {
		storageClassLister = args.StorageClassInformer.Lister()
	}
	c := &configFactory{
		client:                         args.Client, // cluster client
		podLister:                      schedulerCache, // the cache also serves as the pod lister
		podQueue:                       core.NewSchedulingQueue(), // the scheduling queue
		pVLister:                       args.PvInformer.Lister(),
		pVCLister:                      args.PvcInformer.Lister(),  // lister interfaces
		serviceLister:                  args.ServiceInformer.Lister(),
		controllerLister:               args.ReplicationControllerInformer.Lister(),
		replicaSetLister:               args.ReplicaSetInformer.Lister(),
		statefulSetLister:              args.StatefulSetInformer.Lister(),
		pdbLister:                      args.PdbInformer.Lister(),
		storageClassLister:             storageClassLister,
		schedulerCache:                 schedulerCache,
		StopEverything:                 stopEverything, // channel used to stop background goroutines
		schedulerName:                  args.SchedulerName,
		hardPodAffinitySymmetricWeight: args.HardPodAffinitySymmetricWeight,
		enableEquivalenceClassCache:    args.EnableEquivalenceClassCache,
		disablePreemption:              args.DisablePreemption,
		percentageOfNodesToScore:       args.PercentageOfNodesToScore,
	}
	// HasSynced returns true if the first batch of items has been popped
	c.scheduledPodsHasSynced = args.PodInformer.Informer().HasSynced
	// scheduled pod cache
	// register resource event handlers on the podInformer
	args.PodInformer.Informer().AddEventHandler(
		cache.FilteringResourceEventHandler{
			// filter function
			FilterFunc: func(obj interface{}) bool {
				switch t := obj.(type) {
				case *v1.Pod:
					// pods that are already assigned (scheduled) and not terminated
					return assignedNonTerminatedPod(t)
				// a watched pod delete event whose final state is unknown
				case cache.DeletedFinalStateUnknown:
					if pod, ok := t.Obj.(*v1.Pod); ok {
						return assignedNonTerminatedPod(pod)
					}
					runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
					return false
				default:
					runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
					return false
				}
			},
			// handler callbacks for add, update and delete events
			Handler: cache.ResourceEventHandlerFuncs{
				AddFunc:    c.addPodToCache,
				UpdateFunc: c.updatePodInCache,
				DeleteFunc: c.deletePodFromCache,
			},
		},
	)
	// unscheduled pod queue
	args.PodInformer.Informer().AddEventHandler(
		cache.FilteringResourceEventHandler{
			FilterFunc: func(obj interface{}) bool {
				switch t := obj.(type) {
				case *v1.Pod:
					return unassignedNonTerminatedPod(t) && responsibleForPod(t, args.SchedulerName)
				case cache.DeletedFinalStateUnknown:
					if pod, ok := t.Obj.(*v1.Pod); ok {
						return unassignedNonTerminatedPod(pod) && responsibleForPod(pod, args.SchedulerName)
					}
					runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
					return false
				default:
					runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
					return false
				}
			},
			Handler: cache.ResourceEventHandlerFuncs{
				AddFunc:    c.addPodToSchedulingQueue,
				UpdateFunc: c.updatePodInSchedulingQueue,
				DeleteFunc: c.deletePodFromSchedulingQueue,
			},
		},
	)
	// ScheduledPodLister is something we provide to plug-in functions that
	// they may need to call.
	c.scheduledPodLister = assignedPodLister{args.PodInformer.Lister()}

	args.NodeInformer.Informer().AddEventHandler(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    c.addNodeToCache,
			UpdateFunc: c.updateNodeInCache,
			DeleteFunc: c.deleteNodeFromCache,
		},
	)
	c.nodeLister = args.NodeInformer.Lister()

	args.PdbInformer.Informer().AddEventHandler(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    c.addPDBToCache,
			UpdateFunc: c.updatePDBInCache,
			DeleteFunc: c.deletePDBFromCache,
		},
	)
	c.pdbLister = args.PdbInformer.Lister()

	// On add and delete of PVs, it will affect equivalence cache items
	// related to persistent volume
	args.PvInformer.Informer().AddEventHandler(
		cache.ResourceEventHandlerFuncs{
			// MaxPDVolumeCountPredicate: since it relies on the counts of PV.
			AddFunc:    c.onPvAdd,
			UpdateFunc: c.onPvUpdate,
			DeleteFunc: c.onPvDelete,
		},
	)
	c.pVLister = args.PvInformer.Lister()

	// This is for MaxPDVolumeCountPredicate: add/delete PVC will affect counts of PV when it is bound.
	args.PvcInformer.Informer().AddEventHandler(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    c.onPvcAdd,
			UpdateFunc: c.onPvcUpdate,
			DeleteFunc: c.onPvcDelete,
		},
	)
	c.pVCLister = args.PvcInformer.Lister()

	// This is for ServiceAffinity: affected by the selector of the service is updated.
	// Also, if new service is added, equivalence cache will also become invalid since
	// existing pods may be "captured" by this service and change this predicate result.
	args.ServiceInformer.Informer().AddEventHandler(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    c.onServiceAdd,
			UpdateFunc: c.onServiceUpdate,
			DeleteFunc: c.onServiceDelete,
		},
	)
	c.serviceLister = args.ServiceInformer.Lister()

	// Existing equivalence cache should not be affected by add/delete RC/Deployment etc,
	// it only make sense when pod is scheduled or deleted

	if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
		// Setup volume binder
		c.volumeBinder = volumebinder.NewVolumeBinder(args.Client, args.PvcInformer, args.PvInformer, args.StorageClassInformer, time.Duration(args.BindTimeoutSeconds)*time.Second)

		args.StorageClassInformer.Informer().AddEventHandler(
			cache.ResourceEventHandlerFuncs{
				AddFunc:    c.onStorageClassAdd,
				DeleteFunc: c.onStorageClassDelete,
			},
		)
	}

	// Setup cache comparer
	// set up the cache comparer
	comparer := &cacheComparer{
		podLister:  args.PodInformer.Lister(),
		nodeLister: args.NodeInformer.Lister(),
		pdbLister:  args.PdbInformer.Lister(),
		cache:      c.schedulerCache,
		podQueue:   c.podQueue,
	}

	ch := make(chan os.Signal, 1)
	signal.Notify(ch, compareSignal)
	// when the signal fires, compare the nodes and pods in the cluster against the scheduler's queue and cache (see the signal sketch after this function)
	go func() {
		for {
			select {
			case <-c.StopEverything:
				return
			case <-ch:
				comparer.Compare()
			}
		}
	}()
	// return the configFactory
	return c
}
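The comparer goroutine above is a debugging aid: when the process receives compareSignal (SIGUSR2 on Linux, as far as I can tell for this version), the scheduler compares the nodes and pods in the cluster against its own cache and queue. The signal wiring itself is plain os/signal usage; below is a standalone sketch of that pattern, with the signal choice treated as an assumption.

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	stopCh := make(chan struct{})
	ch := make(chan os.Signal, 1)
	// Register interest in SIGUSR2, mirroring how NewConfigFactory registers compareSignal.
	signal.Notify(ch, syscall.SIGUSR2)

	go func() {
		for {
			select {
			case <-stopCh:
				return
			case <-ch:
				// In the scheduler this is where comparer.Compare() would run.
				fmt.Println("got SIGUSR2: comparing cache and queue against cluster state")
			}
		}
	}()

	fmt.Printf("send the signal with: kill -USR2 %d\n", os.Getpid())
	select {} // block forever; this is only a demo
}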

schedulerCache initialization and structure

The schedulerCache and its related structures are defined as follows:
type schedulerCache struct {
	stop   <-chan struct{}
	ttl    time.Duration
	period time.Duration

	// This mutex guards all fields within this cache struct.
	mu sync.RWMutex
	// a set of assumed pod keys. podUID
	assumedPods map[string]bool
	// a map from pod key to podState.
	podStates map[string]*podState
	nodes     map[string]*NodeInfo
	// nodes grouped by zone label
	nodeTree  *NodeTree
	// PodDisruptionBudgets, protecting pods against voluntary eviction
	pdbs      map[string]*policy.PodDisruptionBudget
	// A map from image name to its imageState.
	imageStates map[string]*imageState
}

type podState struct {
	pod *v1.Pod
	// deadline after which the assumed pod expires
	deadline *time.Time
	// whether binding has finished
	bindingFinished bool
}

// NodeInfo aggregates information at the node level
type NodeInfo struct {
	// Overall node information.
	node *v1.Node

	pods             []*v1.Pod
	podsWithAffinity []*v1.Pod
	usedPorts        util.HostPortInfo

	// Total requested resource of all pods on this node.
	// It includes assumed pods which scheduler sends binding to apiserver but
	// didn't get it as scheduled yet.
	requestedResource *Resource
	nonzeroRequest    *Resource
	// We store allocatedResources (which is Node.Status.Allocatable.*) explicitly
	// as int64, to avoid conversions and accessing map.
	allocatableResource *Resource

	// Cached taints of the node for faster lookup.
	taints    []v1.Taint
	taintsErr error

	imageStates map[string]*ImageStateSummary
	TransientInfo *transientSchedulerInfo

	// Cached conditions of node for faster lookup.
	memoryPressureCondition v1.ConditionStatus
	diskPressureCondition   v1.ConditionStatus
	pidPressureCondition    v1.ConditionStatus

	// Whenever NodeInfo changes, generation is bumped.
	// This is used to avoid cloning it if the object didn't change.
	generation int64
}

// stop: signal for stopping the background goroutine; ttl: how long an assumed pod lives before it expires
func New(ttl time.Duration, stop <-chan struct{}) Cache {
	cache := newSchedulerCache(ttl, cleanAssumedPeriod, stop)
	cache.run()
	return cache
}
// start a goroutine that periodically runs cache.cleanupExpiredAssumedPods
func (cache *schedulerCache) run() {
	go wait.Until(cache.cleanupExpiredAssumedPods, cache.period, cache.stop)
}

// cleanupAssumedPods exists for making test deterministic by taking time as input argument.
func (cache *schedulerCache) cleanupAssumedPods(now time.Time) {
	cache.mu.Lock()
	defer cache.mu.Unlock()

	// The size of assumedPods should be small
	for key := range cache.assumedPods {
		ps, ok := cache.podStates[key]
		if !ok {
			panic("Key found in assumed set but not in podStates. Potentially a logical error.")
		}
		// binding has not finished yet
		if !ps.bindingFinished {
			glog.V(3).Infof("Couldn't expire cache for pod %v/%v. Binding is still in progress.",
				ps.pod.Namespace, ps.pod.Name)
			continue
		}
		// binding has finished and the deadline has passed: remove the pod from the cache
		if now.After(*ps.deadline) {
			glog.Warningf("Pod %s/%s expired", ps.pod.Namespace, ps.pod.Name)
			if err := cache.expirePod(key, ps); err != nil {
				glog.Errorf("ExpirePod failed for %s: %v", key, err)
			}
		}
	}
}
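To tie the pieces together, here is a rough, self-contained sketch of the assume / finish-binding / expire lifecycle that cleanupAssumedPods protects. The toyCache type and its methods are invented for the illustration; the real cache exposes this flow through methods such as AssumePod, FinishBinding and ForgetPod.

package main

import (
	"fmt"
	"sync"
	"time"
)

// toyCache mimics the assumed-pod bookkeeping of schedulerCache: a pod is
// "assumed" once the scheduler picks a node for it, a deadline is set when
// binding finishes, and a periodic cleaner expires entries past the deadline.
type toyCache struct {
	mu       sync.Mutex
	ttl      time.Duration
	deadline map[string]time.Time // pod key -> expiry deadline (zero while binding is in progress)
}

func (c *toyCache) assume(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.deadline[key] = time.Time{} // binding not finished yet, so no deadline
}

func (c *toyCache) finishBinding(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.deadline[key] = time.Now().Add(c.ttl) // start the expiry clock
}

func (c *toyCache) cleanup(now time.Time) {
	c.mu.Lock()
	defer c.mu.Unlock()
	for key, d := range c.deadline {
		if !d.IsZero() && now.After(d) {
			fmt.Printf("pod %s expired from the assumed set\n", key)
			delete(c.deadline, key)
		}
	}
}

func main() {
	c := &toyCache{ttl: 30 * time.Second, deadline: map[string]time.Time{}}
	c.assume("default/nginx")
	c.finishBinding("default/nginx")
	c.cleanup(time.Now().Add(time.Minute)) // pretend a minute has passed
}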

Scheduler queue implementation

  • FIFO queue
// if the PodPriority feature is enabled, use the heap-based priority queue
// otherwise fall back to a FIFO queue
func NewSchedulingQueue() SchedulingQueue {
	if util.PodPriorityEnabled() {
		return NewPriorityQueue()
	}
	return NewFIFO()
}

// the FIFO queue itself was analyzed in an earlier post
// NewFIFO creates a FIFO object. MetaNamespaceKeyFunc maps an object to its key, i.e. <namespace>/<name>, e.g. default/<pod-name>
func NewFIFO() *FIFO {
	return &FIFO{FIFO: cache.NewFIFO(cache.MetaNamespaceKeyFunc)}
}

// NewFIFO returns a Store which can be used to queue up items to
// process.
func NewFIFO(keyFunc KeyFunc) *FIFO {
	f := &FIFO{
		items:   map[string]interface{}{},
		queue:   []string{},
		keyFunc: keyFunc,
	}
	f.cond.L = &f.lock
	return f
}
  • Priority queue

// PriorityQueue. The head of the queue is the pending pod with the highest priority.
// activeQ is a heap holding the pods that are about to be scheduled.
// unschedulableQ holds pods that have been tried and determined to be unschedulable.
type PriorityQueue struct {
	lock sync.RWMutex // read-write lock
	cond sync.Cond    // condition variable used to signal activeQ

	activeQ *Heap
	unschedulableQ *UnschedulablePodsMap

	// pods nominated to run on a node, keyed by node name
	nominatedPods map[string][]*v1.Pod

	// receivedMoveRequest is set to true when a request to move pods from unschedulableQ to activeQ arrives,
	// and set back to false when a pod is popped from activeQ. It indicates that a move request was received
	// while we were trying to schedule a pod; in that case an unschedulable pod is put back into activeQ.
	receivedMoveRequest bool
}

// NewPriorityQueue creates a PriorityQueue object.
func NewPriorityQueue() *PriorityQueue {
	pq := &PriorityQueue{
		activeQ:        newHeap(cache.MetaNamespaceKeyFunc, activeQComp),
		unschedulableQ: newUnschedulablePodsMap(),
		nominatedPods:  map[string][]*v1.Pod{},
	}
	pq.cond.L = &pq.lock
	return pq
}

// comparison function: higher priority first; equal priorities are ordered by timestamp (a standalone ordering sketch follows this code)
func activeQComp(pod1, pod2 interface{}) bool {
	p1 := pod1.(*v1.Pod)
	p2 := pod2.(*v1.Pod)
	prio1 := util.GetPodPriority(p1)
	prio2 := util.GetPodPriority(p2)
	return (prio1 > prio2) || (prio1 == prio2 && podTimestamp(p1).Before(podTimestamp(p2)))
}

// newHeap returns a Heap which can be used to queue up items to process.
// the priority queue is implemented on top of a heap
func newHeap(keyFn KeyFunc, lessFn LessFunc) *Heap {
	return &Heap{
		data: &heapData{
			items:    map[string]*heapItem{},
			queue:    []string{},
			keyFunc:  keyFn,
			lessFunc: lessFn,
		},
	}
}

// newUnschedulablePodsMap initializes a new object of UnschedulablePodsMap.
func newUnschedulablePodsMap() *UnschedulablePodsMap {
	return &UnschedulablePodsMap{
		pods:    make(map[string]*v1.Pod),
		keyFunc: util.GetPodFullName,
	}
}
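The ordering enforced by activeQComp (higher priority first, earlier timestamp wins on ties) can be reproduced with a small standalone container/heap sketch; the item struct and the sample values are invented for the example.

package main

import (
	"container/heap"
	"fmt"
	"time"
)

// item stands in for a pod: a priority plus the timestamp used for tie-breaking.
type item struct {
	name     string
	priority int32
	ts       time.Time
}

// podHeap orders items the same way activeQComp orders pods:
// higher priority first, and for equal priorities the older timestamp first.
type podHeap []item

func (h podHeap) Len() int { return len(h) }
func (h podHeap) Less(i, j int) bool {
	if h[i].priority != h[j].priority {
		return h[i].priority > h[j].priority
	}
	return h[i].ts.Before(h[j].ts)
}
func (h podHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *podHeap) Push(x interface{}) { *h = append(*h, x.(item)) }
func (h *podHeap) Pop() interface{} {
	old := *h
	n := len(old)
	it := old[n-1]
	*h = old[:n-1]
	return it
}

func main() {
	now := time.Now()
	h := &podHeap{
		{name: "low-prio-old", priority: 0, ts: now.Add(-time.Minute)},
		{name: "high-prio-new", priority: 100, ts: now},
		{name: "high-prio-old", priority: 100, ts: now.Add(-time.Hour)},
	}
	heap.Init(h)
	for h.Len() > 0 {
		fmt.Println(heap.Pop(h).(item).name)
	}
	// Pops in order: high-prio-old, high-prio-new, low-prio-old
}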

InformerFactory.Start

Start the PodInformer and then the InformerFactory. This part is client-go's informer machinery.

// Start all informers.
go c.PodInformer.Informer().Run(stopCh)
c.InformerFactory.Start(stopCh)

WaitForCacheSync

Wait for the caches to sync before scheduling.

// Wait for all caches to sync before scheduling.
c.InformerFactory.WaitForCacheSync(stopCh)
controller.WaitForCacheSync("scheduler", stopCh, c.PodInformer.Informer().HasSynced)

InformerFactory.WaitForCacheSync

InformerFactory.WaitForCacheSync waits for the caches of all started informers to sync, keeping the local stores consistent with the latest state in etcd.

// WaitForCacheSync waits for all started informers' cache were synced.
func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool {
    informers := func() map[reflect.Type]cache.SharedIndexInformer {
        f.lock.Lock()
        defer f.lock.Unlock()

        informers := map[reflect.Type]cache.SharedIndexInformer{}
        for informerType, informer := range f.informers {
            if f.startedInformers[informerType] {
                informers[informerType] = informer
            }
        }
        return informers
    }()

    res := map[reflect.Type]bool{}
    // wait until every started informer has finished syncing
    for informType, informer := range informers {
        res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced)
    }
    return res
}

This in turn calls cache.WaitForCacheSync:

// WaitForCacheSync waits for caches to populate.  It returns true if it was successful, false
// if the controller should shutdown
func WaitForCacheSync(stopCh <-chan struct{}, cacheSyncs ...InformerSynced) bool {
    // poll until every informer reports that it has synced
    err := wait.PollUntil(syncedPollPeriod,
        func() (bool, error) {
            for _, syncFunc := range cacheSyncs {
                if !syncFunc() {
                    return false, nil
                }
            }
            return true, nil
        },
        stopCh)
    if err != nil {
        glog.V(2).Infof("stop requested")
        return false
    }

    glog.V(4).Infof("caches populated")
    return true
}
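wait.PollUntil, used above, simply re-evaluates a condition function at a fixed interval until it returns true, returns an error, or the stop channel closes. A tiny standalone sketch (the interval and the condition are arbitrary):

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stopCh := make(chan struct{})
	start := time.Now()

	// The condition flips to true after two seconds; until then PollUntil keeps
	// polling every 100ms, the same way WaitForCacheSync polls the HasSynced functions.
	err := wait.PollUntil(100*time.Millisecond, func() (bool, error) {
		return time.Since(start) > 2*time.Second, nil
	}, stopCh)
	if err != nil {
		fmt.Println("stopped before the condition was met:", err)
		return
	}
	fmt.Println("condition met after", time.Since(start))
}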

controller.WaitForCacheSync

controller.WaitForCacheSync is a thin wrapper around cache.WaitForCacheSync that uses the controller name to log which controller is waiting for its caches to sync.

controller.WaitForCacheSync("scheduler", stop, s.PodInformer.Informer().HasSynced)

The code of controller.WaitForCacheSync is as follows:

// WaitForCacheSync is a wrapper around cache.WaitForCacheSync that generates log messages
// indicating that the controller identified by controllerName is waiting for syncs, followed by
// either a successful or failed sync.
func WaitForCacheSync(controllerName string, stopCh <-chan struct{}, cacheSyncs ...cache.InformerSynced) bool {
    glog.Infof("Waiting for caches to sync for %s controller", controllerName)

    if !cache.WaitForCacheSync(stopCh, cacheSyncs...) {
        utilruntime.HandleError(fmt.Errorf("Unable to sync caches for %s controller", controllerName))
        return false
    }

    glog.Infof("Caches are synced for %s controller", controllerName)
    return true
}

LeaderElection

If multiple schedulers are running and leader election is enabled, the LeaderElector runs until the election ends or the process exits.

// If leader election is enabled, run via LeaderElector until done and exit.
if c.LeaderElection != nil {
    c.LeaderElection.Callbacks = leaderelection.LeaderCallbacks{
        OnStartedLeading: run,
        OnStoppedLeading: func() {
            utilruntime.HandleError(fmt.Errorf("lost master"))
        },
    }
    leaderElector, err := leaderelection.NewLeaderElector(*c.LeaderElection)
    if err != nil {
        return fmt.Errorf("couldn't create leader elector: %v", err)
    }

    leaderElector.Run(ctx)

    return fmt.Errorf("lost lease")
}

Scheduler.Run

// Prepare a reusable run function.
run := func(ctx context.Context) {
    sched.Run()
    <-ctx.Done()
}

ctx, cancel := context.WithCancel(context.TODO()) // TODO once Run() accepts a context, it should be used here
defer cancel()

go func() {
    select {
    case <-stopCh:
        cancel()
    case <-ctx.Done():
    }
}()
...
run(ctx)

Scheduler.Run first waits for the cache to sync and then starts the goroutine that runs the scheduling logic.

The code of Scheduler.Run is as follows:

// Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately.
func (sched *Scheduler) Run() {
    if !sched.config.WaitForCacheSync() {
        return
    }

    go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything)
}
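Note that wait.Until with a period of 0 re-runs sched.scheduleOne as soon as it returns, one scheduling cycle at a time, until StopEverything is closed. A tiny sketch of that pattern, where the sleep stands in for one scheduling cycle:

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stopCh := make(chan struct{})
	i := 0

	// With period 0, wait.Until calls the function again as soon as it returns,
	// which is exactly how the scheduler loops over scheduleOne.
	go wait.Until(func() {
		i++
		fmt.Println("scheduling iteration", i)
		time.Sleep(200 * time.Millisecond) // stand-in for one scheduling cycle
	}, 0, stopCh)

	time.Sleep(time.Second)
	close(stopCh) // closing the channel ends the loop
	time.Sleep(300 * time.Millisecond)
}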

Summary

The above covers the scheduler's initialization starting from /cmd/kube-scheduler/scheduler.go. The next section analyzes algorithm registration and the meaning of each field produced while building the scheduler config.
