Table of Contents
Framework
kube-scheduler cmd directory structure
The main function
Core code extraction
NewOptions
Options.Config
The Config object
Informer initialization
The Run function
NewSchedulerConfig analysis
schedulerCache initialization and structure
Scheduler queue implementation
InformerFactory.Start
WaitForCacheSync
InformerFactory.WaitForCacheSync
controller.WaitForCacheSync
LeaderElection
Scheduler.Run
Summary
The following code analysis is based on version v1.12.4.
The source walkthrough follows the framework diagram above.
kube-scheduler
├── BUILD
├── OWNERS
├── app # parameters and configuration objects for running the scheduler
│ ├── BUILD
│ ├── config
│ │ ├── BUILD
│ │ └── config.go # the scheduler's Config object
│ ├── options # scheduler startup options
│ │ ├── BUILD
│ │ ├── configfile.go
│ │ ├── deprecated.go
│ │ ├── deprecated_test.go
│ │ ├── insecure_serving.go
│ │ ├── insecure_serving_test.go
│ │ ├── options.go # NewOptions, Options and their methods
│ │ └── options_test.go
│ └── server.go # mainly NewSchedulerCommand, NewSchedulerConfig and the Run function
└── scheduler.go # main entry point: builds and runs the cobra command, initializes logging
The entry point of kube-scheduler, the main function, follows the same unified code style as the other components and uses the Cobra command-line framework.
Code location: /cmd/kube-scheduler/scheduler.go
func main() {
// build the schedulerCommand structure
command := app.NewSchedulerCommand()
// execute the command
if err := command.Execute(); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
}
Code location: /cmd/kube-scheduler/app/server.go
// NewSchedulerCommand creates a *cobra.Command object with default parameters
func NewSchedulerCommand() *cobra.Command {
opts, err := options.NewOptions()
if err != nil {
glog.Fatalf("unable to initialize command options: %v", err)
}
cmd := &cobra.Command{
Use: "kube-scheduler",
Long: `The Kubernetes scheduler is a policy-rich, topology-aware,
workload-specific function that significantly impacts availability, performance,
and capacity. The scheduler needs to take into account individual and collective
resource requirements, quality of service requirements, hardware/software/policy
constraints, affinity and anti-affinity specifications, data locality, inter-workload
interference, deadlines, and so on. Workload-specific requirements will be exposed
through the API as necessary.`,
Run: func(cmd *cobra.Command, args []string) {
verflag.PrintAndExitIfRequested()
utilflag.PrintFlags(cmd.Flags())
if len(args) != 0 {
fmt.Fprint(os.Stderr, "arguments are not supported\n")
}
if errs := opts.Validate(); len(errs) > 0 {
fmt.Fprintf(os.Stderr, "%v\n", utilerrors.NewAggregate(errs))
os.Exit(1)
}
if len(opts.WriteConfigTo) > 0 {
if err := options.WriteConfigFile(opts.WriteConfigTo, &opts.ComponentConfig); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
glog.Infof("Wrote configuration to: %s\n", opts.WriteConfigTo)
return
}
c, err := opts.Config()
if err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
stopCh := make(chan struct{})
if err := Run(c.Complete(), stopCh); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
},
}
opts.AddFlags(cmd.Flags())
cmd.MarkFlagFilename("config", "yaml", "yml", "json")
return cmd
}
// build the Options (command-line parameters)
opts, err := options.NewOptions()
// build the scheduler Config object from the Options
c, err := opts.Config()
// run the scheduler
stopCh := make(chan struct{})
err := Run(c.Complete(), stopCh)
// register the command-line flags
opts.AddFlags(cmd.Flags())
NewOptions mainly constructs the parameters and context used by the scheduler server.
// Options holds everything needed to run the scheduler.
type Options struct {
// default scheduler configuration values; overridden when ConfigFile is set
ComponentConfig kubeschedulerconfig.KubeSchedulerConfiguration
SecureServing *apiserveroptions.SecureServingOptions
// options for the healthz and metrics endpoints
CombinedInsecureServing *CombinedInsecureServingOptions
// authentication and authorization options
Authentication *apiserveroptions.DelegatingAuthenticationOptions
Authorization *apiserveroptions.DelegatingAuthorizationOptions
// deprecated options
Deprecated *DeprecatedOptions
// ConfigFile is the path to the configuration file.
ConfigFile string
// WriteConfigTo is the path where the default configuration will be written.
WriteConfigTo string
// Master is the address of the Kubernetes API server
Master string
}
// KubeSchedulerConfiguration configures a scheduler
type KubeSchedulerConfiguration struct {
metav1.TypeMeta
// scheduler name
SchedulerName string
// source of the scheduling algorithms: either a provider or a policy file/ConfigMap; the two are mutually exclusive
AlgorithmSource SchedulerAlgorithmSource
// weight of the implicit preferred inter-pod affinity rule implied by each RequiredDuringScheduling affinity term; defaults to 1
HardPodAffinitySymmetricWeight int32
// leader election for HA (not analyzed here)
LeaderElection KubeSchedulerLeaderElectionConfiguration
// client settings for talking to the apiserver: kubeconfig credentials, master address, QPS, Burst
ClientConnection apimachineryconfig.ClientConnectionConfiguration
// default health-check address of the scheduler, 0.0.0.0:10251
HealthzBindAddress string
// metrics address, 0.0.0.0:10251
MetricsBindAddress string
// DebuggingConfiguration holds configuration for Debugging related features
// TODO: We might wanna make this a substruct like Debugging apiserverconfig.DebuggingConfiguration
apiserverconfig.DebuggingConfiguration
// whether pod preemption is disabled
DisablePreemption bool
// when 0, a default of 50% is used
// e.g. with 500 nodes and a value of 30, only 150 nodes need to be evaluated, which helps scheduler performance
PercentageOfNodesToScore int32
FailureDomains string
// timeout, in seconds, for the bind call to the apiserver
BindTimeoutSeconds *int64
}
// NewOptions returns default scheduler app options.
func NewOptions() (*Options, error) {
// default values for kubeschedulerconfig.KubeSchedulerConfiguration
cfg, err := newDefaultComponentConfig()
if err != nil {
return nil, err
}
hhost, hport, err := splitHostIntPort(cfg.HealthzBindAddress)
if err != nil {
return nil, err
}
o := &Options{
ComponentConfig: *cfg,
SecureServing: nil, // TODO: enable with apiserveroptions.NewSecureServingOptions()
CombinedInsecureServing: &CombinedInsecureServingOptions{
Healthz: &apiserveroptions.DeprecatedInsecureServingOptions{
BindNetwork: "tcp",
},
Metrics: &apiserveroptions.DeprecatedInsecureServingOptions{
BindNetwork: "tcp",
},
BindPort: hport,
BindAddress: hhost,
},
Authentication: nil, // TODO: enable with apiserveroptions.NewDelegatingAuthenticationOptions()
Authorization: nil, // TODO: enable with apiserveroptions.NewDelegatingAuthorizationOptions()
Deprecated: &DeprecatedOptions{
UseLegacyPolicyConfig: false,
PolicyConfigMapNamespace: metav1.NamespaceSystem,
},
}
return o, nil
}
opts.AddFlags(cmd.Flags()) registers the command-line flags:
// AddFlags adds flags for the scheduler options.
func (o *Options) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&o.ConfigFile, "config", o.ConfigFile, "The path to the configuration file. Flags override values in this file.")
fs.StringVar(&o.WriteConfigTo, "write-config-to", o.WriteConfigTo, "If set, write the configuration values to this file and exit.")
fs.StringVar(&o.Master, "master", o.Master, "The address of the Kubernetes API server (overrides any value in kubeconfig)")
fs.Int32Var(&ipam.IpamConfig.Port, "ipam-port", ipam.IpamConfig.Port, "The port that the ipam's http service runs on")
fs.StringVar(&ipam.IpamConfig.Address, "ipam-address", ipam.IpamConfig.Address, "The IP address that the ipam's http service runs on")
fs.StringVar(&ipam.IpamConfig.Token, "ipam-token", ipam.IpamConfig.Token, "The token that the ipam's http service runs on")
fs.Float64Var(&ipam.IpamConfig.RequestLimitCPU, "cpu-limit", ipam.IpamConfig.RequestLimitCPU, "The cpu limit of each node")
fs.Float64Var(&ipam.IpamConfig.RequestLimitMemory, "memory-limit", ipam.IpamConfig.RequestLimitMemory, "The memory limit of each node")
o.SecureServing.AddFlags(fs)
o.CombinedInsecureServing.AddFlags(fs)
o.Authentication.AddFlags(fs)
o.Authorization.AddFlags(fs)
o.Deprecated.AddFlags(fs, &o.ComponentConfig)
leaderelectionconfig.BindFlags(&o.ComponentConfig.LeaderElection.LeaderElectionConfiguration, fs)
// feature gates
utilfeature.DefaultFeatureGate.AddFlag(fs)
}
The default flag values can be read off the startup log:
I0508 18:24:50.931827 121176 feature_gate.go:206] feature gates: &{map[TaintNodesByCondition:false PodPriority:false ScheduleDaemonSetPods:false]}
I0508 18:24:50.932099 121176 flags.go:33] FLAG: --address="0.0.0.0"
I0508 18:24:50.932106 121176 flags.go:33] FLAG: --algorithm-provider="" // scheduling algorithm provider
I0508 18:24:50.932109 121176 flags.go:33] FLAG: --alsologtostderr="false"
I0508 18:24:50.932113 121176 flags.go:33] FLAG: --config="" // config file, empty by default
I0508 18:24:50.932115 121176 flags.go:33] FLAG: --contention-profiling="false"
I0508 18:24:50.932119 121176 flags.go:33] FLAG: --cpu-limit="85"
I0508 18:24:50.932124 121176 flags.go:33] FLAG: --failure-domains="kubernetes.io/hostname,failure-domain.beta.kubernetes.io/zone,failure-domain.beta.kubernetes.io/region" // reported and set by the kubelet
I0508 18:24:50.932129 121176 flags.go:33] FLAG: --feature-gates="PodPriority=false,ScheduleDaemonSetPods=false,TaintNodesByCondition=false"
// feature gates
I0508 18:24:50.932141 121176 flags.go:33] FLAG: --hard-pod-affinity-symmetric-weight="1"
I0508 18:24:50.932145 121176 flags.go:33] FLAG: --help="false"
I0508 18:24:50.932148 121176 flags.go:33] FLAG: --ipam-address=""
I0508 18:24:50.932151 121176 flags.go:33] FLAG: --ipam-port=""
I0508 18:24:50.932153 121176 flags.go:33] FLAG: --ipam-token=""
I0508 18:24:50.932157 121176 flags.go:33] FLAG: --kube-api-burst="100"
I0508 18:24:50.932159 121176 flags.go:33] FLAG: --kube-api-content-type="application/vnd.kubernetes.protobuf"
I0508 18:24:50.932162 121176 flags.go:33] FLAG: --kube-api-qps="50"
I0508 18:24:50.932166 121176 flags.go:33] FLAG: --kubeconfig="" // not provided; --master is used instead
I0508 18:24:50.932169 121176 flags.go:33] FLAG: --leader-elect="true"
I0508 18:24:50.932171 121176 flags.go:33] FLAG: --leader-elect-lease-duration="15s"
I0508 18:24:50.932175 121176 flags.go:33] FLAG: --leader-elect-renew-deadline="10s"
I0508 18:24:50.932178 121176 flags.go:33] FLAG: --leader-elect-resource-lock="endpoints"
I0508 18:24:50.932180 121176 flags.go:33] FLAG: --leader-elect-retry-period="2s"
I0508 18:24:50.932183 121176 flags.go:33] FLAG: --lock-object-name="kube-scheduler"
I0508 18:24:50.932186 121176 flags.go:33] FLAG: --lock-object-namespace="kube-system"
I0508 18:24:50.932188 121176 flags.go:33] FLAG: --log-backtrace-at=":0"
I0508 18:24:50.932192 121176 flags.go:33] FLAG: --log-dir="/var/log/kubernetes"
I0508 18:24:50.932195 121176 flags.go:33] FLAG: --log-flush-frequency="5s"
I0508 18:24:50.932197 121176 flags.go:33] FLAG: --logtostderr="false"
I0508 18:24:50.932200 121176 flags.go:33] FLAG: --master="http://127.0.0.1:8080" // apiserver address
I0508 18:24:50.932203 121176 flags.go:33] FLAG: --memory-limit="85"
I0508 18:24:50.932206 121176 flags.go:33] FLAG: --policy-config-file="" // algorithm configuration from a policy file
I0508 18:24:50.932208 121176 flags.go:33] FLAG: --policy-configmap=""
I0508 18:24:50.932210 121176 flags.go:33] FLAG: --policy-configmap-namespace="kube-system"
I0508 18:24:50.932213 121176 flags.go:33] FLAG: --port="10251"
I0508 18:24:50.932216 121176 flags.go:33] FLAG: --profiling="false"
I0508 18:24:50.932219 121176 flags.go:33] FLAG: --scheduler-name="default-scheduler" // scheduler name
I0508 18:24:50.932221 121176 flags.go:33] FLAG: --stderrthreshold="3"
I0508 18:24:50.932224 121176 flags.go:33] FLAG: --use-legacy-policy-config="false"
I0508 18:24:50.932226 121176 flags.go:33] FLAG: --v="8"
I0508 18:24:50.932229 121176 flags.go:33] FLAG: --version="false"
I0508 18:24:50.932233 121176 flags.go:33] FLAG: --vmodule=""
I0508 18:24:50.932236 121176 flags.go:33] FLAG: --write-config-to="" // no config dump requested
Options.Config initializes the scheduler's configuration object.
// Config return a scheduler config object
func (o *Options) Config() (*schedulerappconfig.Config, error) {
// the scheduler Config object
c := &schedulerappconfig.Config{}
// convert the Options into the Config object the scheduler runs with
if err := o.ApplyTo(c); err != nil {
return nil, err
}
// prepare kube clients.
// create the cluster clients with client-go, via --master or kubeconfig
client, leaderElectionClient, eventClient, err := createClients(c.ComponentConfig.ClientConnection, o.Master, c.ComponentConfig.LeaderElection.RenewDeadline.Duration)
if err != nil {
return nil, err
}
// Prepare event clients.
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, corev1.EventSource{Component: c.ComponentConfig.SchedulerName})
// Set up leader election if enabled.
var leaderElectionConfig *leaderelection.LeaderElectionConfig
if c.ComponentConfig.LeaderElection.LeaderElect {
leaderElectionConfig, err = makeLeaderElectionConfig(c.ComponentConfig.LeaderElection, leaderElectionClient, recorder)
if err != nil {
return nil, err
}
}
c.Client = client
c.InformerFactory = informers.NewSharedInformerFactory(client, 0)
c.PodInformer = factory.NewPodInformer(client, 0)
c.EventClient = eventClient
c.Recorder = recorder
c.Broadcaster = eventBroadcaster
c.LeaderElection = leaderElectionConfig
return c, nil
}
The Config function mainly performs the following steps: it converts the Options into the scheduler Config object (ApplyTo), creates the clients used to talk to the apiserver (createClients), prepares the event broadcaster and recorder, builds the leader-election config when leader election is enabled, and finally initializes the informers via NewSharedInformerFactory and NewPodInformer.
// Config has all the context to run a Scheduler
type Config struct {
// the scheduler's component configuration.
ComponentConfig kubeschedulerconfig.KubeSchedulerConfiguration
InsecureServing *apiserver.DeprecatedInsecureServingInfo // nil will disable serving on an insecure port
InsecureMetricsServing *apiserver.DeprecatedInsecureServingInfo // non-nil if metrics should be served independently
Authentication apiserver.AuthenticationInfo
Authorization apiserver.AuthorizationInfo
SecureServing *apiserver.SecureServingInfo
Client clientset.Interface // cluster client
InformerFactory informers.SharedInformerFactory // informer factory
PodInformer coreinformers.PodInformer // pod informer
EventClient v1core.EventsGetter
Recorder record.EventRecorder
Broadcaster record.EventBroadcaster
// LeaderElection is optional.
LeaderElection *leaderelection.LeaderElectionConfig
}
// ApplyTo applies the Options to the scheduler Config object
func (o *Options) ApplyTo(c *schedulerappconfig.Config) error {
// no config file was given via --config
if len(o.ConfigFile) == 0 {
c.ComponentConfig = o.ComponentConfig
// legacy path: apply the deprecated flags
if err := o.Deprecated.ApplyTo(&c.ComponentConfig); err != nil {
return err
}
if err := o.CombinedInsecureServing.ApplyTo(c, &c.ComponentConfig); err != nil {
return err
}
} else {
// load the contents of the config file
cfg, err := loadConfigFromFile(o.ConfigFile)
if err != nil {
return err
}
// populate the config object from the configuration loaded from the file
c.ComponentConfig = *cfg
if err := o.CombinedInsecureServing.ApplyToFromLoadedConfig(c, &c.ComponentConfig); err != nil {
return err
}
}
// common configuration
if err := o.SecureServing.ApplyTo(&c.SecureServing); err != nil {
return err
}
if err := o.Authentication.ApplyTo(&c.Authentication, c.SecureServing, nil); err != nil {
return err
}
return o.Authorization.ApplyTo(&c.Authorization)
}
Let's take a brief look at which data structures informer initialization involves and what it does.
c.PodInformer = factory.NewPodInformer(client, 0)
// NewPodInformer creates a shared index informer that returns only non-terminal pods.
func NewPodInformer(client clientset.Interface, resyncPeriod time.Duration) coreinformers.PodInformer {
// the selector restricts list/watch to non-terminal pods (Pending, Running, Unknown)
selector := fields.ParseSelectorOrDie(
"status.phase!=" + string(v1.PodSucceeded) +
",status.phase!=" + string(v1.PodFailed))
// list/watch client for the pods resource; namespace "" means all namespaces
// supplies the ListFunc and WatchFunc
lw := cache.NewListWatchFromClient(client.CoreV1().RESTClient(), string(v1.ResourcePods), metav1.NamespaceAll, selector)
// as can be seen, podInformer is really initialized through cache.NewSharedIndexInformer
return &podInformer{
informer: cache.NewSharedIndexInformer(lw, &v1.Pod{}, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}),
}
}
// NewSharedIndexInformer creates a new instance for the listwatcher.
func NewSharedIndexInformer(lw ListerWatcher, objType runtime.Object, defaultEventHandlerResyncPeriod time.Duration, indexers Indexers) SharedIndexInformer {
realClock := &clock.RealClock{}
sharedIndexInformer := &sharedIndexInformer{
// event processor
processor: &sharedProcessor{clock: realClock},
// local indexed cache (store) with its index keys
indexer: NewIndexer(DeletionHandlingMetaNamespaceKeyFunc, indexers),
listerWatcher: lw,
objectType: objType,
resyncCheckPeriod: defaultEventHandlerResyncPeriod,
defaultEventHandlerResyncPeriod: defaultEventHandlerResyncPeriod,
cacheMutationDetector: NewCacheMutationDetector(fmt.Sprintf("%T", objType)),
clock: realClock,
}
return sharedIndexInformer
}
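For context, the sketch below shows how such a shared informer is normally consumed with client-go: handlers are registered, the informers are started, and the caller waits for the caches to sync before relying on the local store, mirroring what the scheduler's Run function does. The kubeconfig path is a placeholder.
// Minimal, self-contained sketch of the client-go informer pattern used by the
// scheduler: build a clientset, create a shared informer factory, register
// event handlers, start the informers and wait for the caches to sync.
// The kubeconfig path is a placeholder.
package main

import (
	"fmt"
	"time"

	"k8s.io/api/core/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	stopCh := make(chan struct{})
	defer close(stopCh)

	factory := informers.NewSharedInformerFactory(client, 30*time.Second)
	podInformer := factory.Core().V1().Pods().Informer()
	podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			pod := obj.(*v1.Pod)
			fmt.Printf("pod added: %s/%s\n", pod.Namespace, pod.Name)
		},
		DeleteFunc: func(obj interface{}) {
			fmt.Println("pod deleted")
		},
	})

	// Start every informer created through the factory and wait for their
	// local stores to be populated, just like InformerFactory.Start and
	// WaitForCacheSync in the scheduler's Run function.
	factory.Start(stopCh)
	cache.WaitForCacheSync(stopCh, podInformer.HasSynced)

	select {} // block; the handlers keep running in the background
}
In the scheduler these handlers feed the scheduling queue and the scheduler cache, as shown in NewConfigFactory below.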
err := Run(c.Complete(), stopCh)
Run is the long-running routine that performs the scheduler's work.
// Run runs the Scheduler.
func Run(c schedulerserverconfig.CompletedConfig, stopCh <-chan struct{}) error {
// Apply algorithms based on feature gates.
// apply scheduling algorithms based on feature gates
algorithmprovider.ApplyFeatureGates()
// Build a scheduler config from the provided algorithm source.
// NewSchedulerConfig builds the scheduler Config, from which the Scheduler struct is created.
schedulerConfig, err := NewSchedulerConfig(c)
if err != nil {
return err
}
// Create the scheduler.
sched := scheduler.NewFromConfig(schedulerConfig)
// Prepare the event broadcaster.
if c.Broadcaster != nil && c.EventClient != nil {
c.Broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: c.EventClient.Events("")})
}
// Start all informers.
// start the pod informer and the informer factory (client-go's informer machinery)
go c.PodInformer.Informer().Run(stopCh)
c.InformerFactory.Start(stopCh)
// Wait for all caches to sync before scheduling.
// wait for all the local stores to finish syncing before scheduling
// InformerFactory.WaitForCacheSync waits for the caches of every started informer to sync, keeping the local stores consistent with etcd.
c.InformerFactory.WaitForCacheSync(stopCh)
controller.WaitForCacheSync("scheduler", stopCh, c.PodInformer.Informer().HasSynced)
// Prepare a reusable run function.
run := func(ctx context.Context) {
sched.Run()
<-ctx.Done()
}
ctx, cancel := context.WithCancel(context.TODO()) // TODO once Run() accepts a context, it should be used here
defer cancel()
go func() {
select {
case <-stopCh:
cancel()
case <-ctx.Done():
}
}()
// If leader election is enabled, run via LeaderElector until done and exit.
// if leader election is enabled, run via the LeaderElector until done, then exit.
if c.LeaderElection != nil {
c.LeaderElection.Callbacks = leaderelection.LeaderCallbacks{
OnStartedLeading: run,
OnStoppedLeading: func() {
utilruntime.HandleError(fmt.Errorf("lost master"))
},
}
leaderElector, err := leaderelection.NewLeaderElector(*c.LeaderElection)
if err != nil {
return fmt.Errorf("couldn't create leader elector: %v", err)
}
leaderElector.Run(ctx)
return fmt.Errorf("lost lease")
}
// Leader election is disabled, so run inline until done.
run(ctx)
return fmt.Errorf("finished without leader elect")
}
The Run function mainly does the following: it applies feature-gated algorithm providers, builds the scheduler configuration via NewSchedulerConfig and creates the scheduler from it, starts the event broadcaster and the informers, waits for all caches to sync, and then calls sched.Run() to execute the scheduling logic. If LeaderElect is enabled, leader election is performed and the scheduling loop runs only on the leader.
// NewSchedulerConfig creates the scheduler configuration. This is exposed for use by tests.
func NewSchedulerConfig(s schedulerserverconfig.CompletedConfig) (*scheduler.Config, error) {
var storageClassInformer storageinformers.StorageClassInformer
// if the VolumeScheduling feature is enabled, set up the StorageClass informer
if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
storageClassInformer = s.InformerFactory.Storage().V1().StorageClasses()
}
// Set up the configurator which can create schedulers from configs.
// the scheduler configurator
// factory.NewConfigFactory (analyzed below) mainly initializes the cache, the scheduling queue, and so on
configurator := factory.NewConfigFactory(&factory.ConfigFactoryArgs{
SchedulerName: s.ComponentConfig.SchedulerName,
Client: s.Client,
NodeInformer: s.InformerFactory.Core().V1().Nodes(),
PodInformer: s.PodInformer,
PvInformer: s.InformerFactory.Core().V1().PersistentVolumes(),
PvcInformer: s.InformerFactory.Core().V1().PersistentVolumeClaims(),
ReplicationControllerInformer: s.InformerFactory.Core().V1().ReplicationControllers(),
ReplicaSetInformer: s.InformerFactory.Apps().V1().ReplicaSets(),
StatefulSetInformer: s.InformerFactory.Apps().V1().StatefulSets(),
ServiceInformer: s.InformerFactory.Core().V1().Services(),
PdbInformer: s.InformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
StorageClassInformer: storageClassInformer,
HardPodAffinitySymmetricWeight: s.ComponentConfig.HardPodAffinitySymmetricWeight,
// https://my.oschina.net/jxcdwangtao/blog/1813858
EnableEquivalenceClassCache: utilfeature.DefaultFeatureGate.Enabled(features.EnableEquivalenceClassCache),
DisablePreemption: s.ComponentConfig.DisablePreemption,
PercentageOfNodesToScore: s.ComponentConfig.PercentageOfNodesToScore,
BindTimeoutSeconds: *s.ComponentConfig.BindTimeoutSeconds,
})
// take the algorithm source from the configuration and use it to fill in the scheduler config
// algorithm registration and the full scheduler config construction are analyzed in the next section
source := s.ComponentConfig.AlgorithmSource
var config *scheduler.Config
..... omitted: registration of the predicates/priorities algorithms and construction of the scheduler object's config
// Additional tweaks to the config produced by the configurator.
config.Recorder = s.Recorder
// whether the preemption scheduling policy is disabled
config.DisablePreemption = s.ComponentConfig.DisablePreemption
return config, nil
}
// NewConfigFactory initializes the default implementation of a Configurator To encourage eventual privatization of the struct type, we only
// return the interface.
func NewConfigFactory(args *ConfigFactoryArgs) scheduler.Configurator {
stopEverything := make(chan struct{})
// scheduler cache
schedulerCache := schedulercache.New(30*time.Second, stopEverything)
// storageClassInformer is only enabled through VolumeScheduling feature gate
var storageClassLister storagelisters.StorageClassLister
if args.StorageClassInformer != nil {
storageClassLister = args.StorageClassInformer.Lister()
}
c := &configFactory{
client: args.Client, // cluster client
podLister: schedulerCache, // cache
podQueue: core.NewSchedulingQueue(), // scheduling queue
pVLister: args.PvInformer.Lister(),
pVCLister: args.PvcInformer.Lister(), // lister interfaces
serviceLister: args.ServiceInformer.Lister(),
controllerLister: args.ReplicationControllerInformer.Lister(),
replicaSetLister: args.ReplicaSetInformer.Lister(),
statefulSetLister: args.StatefulSetInformer.Lister(),
pdbLister: args.PdbInformer.Lister(),
storageClassLister: storageClassLister,
schedulerCache: schedulerCache,
StopEverything: stopEverything, // channel used to stop background goroutines
schedulerName: args.SchedulerName,
hardPodAffinitySymmetricWeight: args.HardPodAffinitySymmetricWeight,
enableEquivalenceClassCache: args.EnableEquivalenceClassCache,
disablePreemption: args.DisablePreemption,
percentageOfNodesToScore: args.PercentageOfNodesToScore,
}
// HasSynced returns true if the first batch of items has been popped
c.scheduledPodsHasSynced = args.PodInformer.Informer().HasSynced
// scheduled pod cache
// the pod informer registers event-handler callbacks for resource updates
args.PodInformer.Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
// 过滤函数
FilterFunc: func(obj interface{}) bool {
switch t := obj.(type) {
case *v1.Pod:
// pods that are already assigned to a node (scheduled and running)
return assignedNonTerminatedPod(t)
// a delete event observed with an unknown final state
case cache.DeletedFinalStateUnknown:
if pod, ok := t.Obj.(*v1.Pod); ok {
return assignedNonTerminatedPod(pod)
}
runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
return false
default:
runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
return false
}
},
// handler callbacks for add, update and delete events
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: c.addPodToCache,
UpdateFunc: c.updatePodInCache,
DeleteFunc: c.deletePodFromCache,
},
},
)
// unscheduled pod queue
args.PodInformer.Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch t := obj.(type) {
case *v1.Pod:
return unassignedNonTerminatedPod(t) && responsibleForPod(t, args.SchedulerName)
case cache.DeletedFinalStateUnknown:
if pod, ok := t.Obj.(*v1.Pod); ok {
return unassignedNonTerminatedPod(pod) && responsibleForPod(pod, args.SchedulerName)
}
runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, c))
return false
default:
runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: c.addPodToSchedulingQueue,
UpdateFunc: c.updatePodInSchedulingQueue,
DeleteFunc: c.deletePodFromSchedulingQueue,
},
},
)
// ScheduledPodLister is something we provide to plug-in functions that
// they may need to call.
c.scheduledPodLister = assignedPodLister{args.PodInformer.Lister()}
args.NodeInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: c.addNodeToCache,
UpdateFunc: c.updateNodeInCache,
DeleteFunc: c.deleteNodeFromCache,
},
)
c.nodeLister = args.NodeInformer.Lister()
args.PdbInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: c.addPDBToCache,
UpdateFunc: c.updatePDBInCache,
DeleteFunc: c.deletePDBFromCache,
},
)
c.pdbLister = args.PdbInformer.Lister()
// On add and delete of PVs, it will affect equivalence cache items
// related to persistent volume
args.PvInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
// MaxPDVolumeCountPredicate: since it relies on the counts of PV.
AddFunc: c.onPvAdd,
UpdateFunc: c.onPvUpdate,
DeleteFunc: c.onPvDelete,
},
)
c.pVLister = args.PvInformer.Lister()
// This is for MaxPDVolumeCountPredicate: add/delete PVC will affect counts of PV when it is bound.
args.PvcInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: c.onPvcAdd,
UpdateFunc: c.onPvcUpdate,
DeleteFunc: c.onPvcDelete,
},
)
c.pVCLister = args.PvcInformer.Lister()
// This is for ServiceAffinity: affected by the selector of the service is updated.
// Also, if new service is added, equivalence cache will also become invalid since
// existing pods may be "captured" by this service and change this predicate result.
args.ServiceInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: c.onServiceAdd,
UpdateFunc: c.onServiceUpdate,
DeleteFunc: c.onServiceDelete,
},
)
c.serviceLister = args.ServiceInformer.Lister()
// Existing equivalence cache should not be affected by add/delete RC/Deployment etc,
// it only make sense when pod is scheduled or deleted
if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
// Setup volume binder
c.volumeBinder = volumebinder.NewVolumeBinder(args.Client, args.PvcInformer, args.PvInformer, args.StorageClassInformer, time.Duration(args.BindTimeoutSeconds)*time.Second)
args.StorageClassInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: c.onStorageClassAdd,
DeleteFunc: c.onStorageClassDelete,
},
)
}
// Setup cache comparer
// set up the cache comparer
comparer := &cacheComparer{
podLister: args.PodInformer.Lister(),
nodeLister: args.NodeInformer.Lister(),
pdbLister: args.PdbInformer.Lister(),
cache: c.schedulerCache,
podQueue: c.podQueue,
}
ch := make(chan os.Signal, 1)
signal.Notify(ch, compareSignal)
// when the signal fires, compare the nodes and pods in the cluster against the scheduler's queue and cache
go func() {
for {
select {
case <-c.StopEverything:
return
case <-ch:
comparer.Compare()
}
}
}()
// return the configurator
return c
}
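The comparer above is driven by an OS signal (compareSignal; on Unix this is SIGUSR2 in this version, which the sketch below assumes). The pattern itself is the standard signal.Notify loop, shown here in isolation with a placeholder action:
// Sketch of the signal-triggered debug hook used by configFactory: register a
// signal, then run an action every time it arrives until the stop channel is
// closed. SIGUSR2 is an assumption; the action stands in for comparer.Compare().
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	stop := make(chan struct{})
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGUSR2)

	go func() {
		for {
			select {
			case <-stop:
				return
			case <-ch:
				// The scheduler would call comparer.Compare() here, logging
				// differences between cluster state and its cache/queue.
				fmt.Println("compare signal received")
			}
		}
	}()

	select {} // keep the process alive so the signal can be delivered
}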
schedulerCache initialization and structure
type schedulerCache struct {
stop <-chan struct{}
ttl time.Duration
period time.Duration
// This mutex guards all fields within this cache struct.
mu sync.RWMutex
// a set of assumed pod keys. podUID
assumedPods map[string]bool
// a map from pod key to podState.
podStates map[string]*podState
nodes map[string]*NodeInfo
// nodes organized by zone label
nodeTree *NodeTree
// PodDisruptionBudgets (protection against voluntary disruption)
pdbs map[string]*policy.PodDisruptionBudget
// A map from image name to its imageState.
imageStates map[string]*imageState
}
type podState struct {
pod *v1.Pod
// time at which the assumed pod expires
deadline *time.Time
// whether binding has finished
bindingFinished bool
}
// NodeInfo aggregates node-level information
type NodeInfo struct {
// Overall node information.
node *v1.Node
pods []*v1.Pod
podsWithAffinity []*v1.Pod
usedPorts util.HostPortInfo
// Total requested resource of all pods on this node.
// It includes assumed pods which scheduler sends binding to apiserver but
// didn't get it as scheduled yet.
requestedResource *Resource
nonzeroRequest *Resource
// We store allocatedResources (which is Node.Status.Allocatable.*) explicitly
// as int64, to avoid conversions and accessing map.
allocatableResource *Resource
// Cached taints of the node for faster lookup.
taints []v1.Taint
taintsErr error
imageStates map[string]*ImageStateSummary
TransientInfo *transientSchedulerInfo
// Cached conditions of node for faster lookup.
memoryPressureCondition v1.ConditionStatus
diskPressureCondition v1.ConditionStatus
pidPressureCondition v1.ConditionStatus
// Whenever NodeInfo changes, generation is bumped.
// This is used to avoid cloning it if the object didn't change.
generation int64
}
// stop is the signal for stopping background goroutines; ttl is how long an assumed pod may live before expiring
func New(ttl time.Duration, stop <-chan struct{}) Cache {
cache := newSchedulerCache(ttl, cleanAssumedPeriod, stop)
cache.run()
return cache
}
// start a goroutine that runs cache.cleanupExpiredAssumedPods periodically
func (cache *schedulerCache) run() {
go wait.Until(cache.cleanupExpiredAssumedPods, cache.period, cache.stop)
}
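wait.Until (from k8s.io/apimachinery/pkg/util/wait) calls the given function repeatedly at the given period until the stop channel is closed; the cleanup goroutine above is driven exactly this way. A minimal usage sketch:
// Minimal demonstration of wait.Until: run a function once per second until
// the stop channel is closed, the same primitive behind schedulerCache.run().
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stop := make(chan struct{})
	go wait.Until(func() {
		fmt.Println("cleanup tick")
	}, time.Second, stop)

	time.Sleep(3500 * time.Millisecond)
	close(stop) // ends the loop
	time.Sleep(time.Second)
}
The cleanup function it drives in the scheduler cache looks like this: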
// cleanupAssumedPods exists for making test deterministic by taking time as input argument.
func (cache *schedulerCache) cleanupAssumedPods(now time.Time) {
cache.mu.Lock()
defer cache.mu.Unlock()
// The size of assumedPods should be small
for key := range cache.assumedPods {
ps, ok := cache.podStates[key]
if !ok {
panic("Key found in assumed set but not in podStates. Potentially a logical error.")
}
// binding has not finished yet
if !ps.bindingFinished {
glog.V(3).Infof("Couldn't expire cache for pod %v/%v. Binding is still in progress.",
ps.pod.Namespace, ps.pod.Name)
continue
}
// binding finished and the deadline has passed: evict the pod from the cache
if now.After(*ps.deadline) {
glog.Warningf("Pod %s/%s expired", ps.pod.Namespace, ps.pod.Name)
if err := cache.expirePod(key, ps); err != nil {
glog.Errorf("ExpirePod failed for %s: %v", key, err)
}
}
}
}
// if the pod priority feature is enabled, use the heap-based priority queue;
// otherwise use a FIFO queue
func NewSchedulingQueue() SchedulingQueue {
if util.PodPriorityEnabled() {
return NewPriorityQueue()
}
return NewFIFO()
}
// FIFO queue (analyzed in an earlier post)
// NewFIFO creates a FIFO object. MetaNamespaceKeyFunc maps an object to its key, e.g. "default/<name>"
func NewFIFO() *FIFO {
return &FIFO{FIFO: cache.NewFIFO(cache.MetaNamespaceKeyFunc)}
}
// NewFIFO returns a Store which can be used to queue up items to
// process.
func NewFIFO(keyFunc KeyFunc) *FIFO {
f := &FIFO{
items: map[string]interface{}{},
queue: []string{},
keyFunc: keyFunc,
}
f.cond.L = &f.lock
return f
}
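The FIFO keys its items with cache.MetaNamespaceKeyFunc, which produces "<namespace>/<name>" (or just "<name>" for cluster-scoped objects). A small sketch of adding a pod to such a FIFO and popping it back out:
// Sketch of the FIFO used when pod priority is disabled: objects are keyed by
// cache.MetaNamespaceKeyFunc ("namespace/name") and popped in insertion order.
package main

import (
	"fmt"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/cache"
)

func main() {
	f := cache.NewFIFO(cache.MetaNamespaceKeyFunc)

	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "nginx"}}
	key, _ := cache.MetaNamespaceKeyFunc(pod)
	fmt.Println(key) // default/nginx

	_ = f.Add(pod)
	popped := cache.Pop(f).(*v1.Pod) // blocks until an item is available
	fmt.Println(popped.Name)         // nginx
}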
The priority queue
// PriorityQueue: the head of the queue is the highest-priority pending pod.
// activeQ is a heap holding the pods that are about to be scheduled.
// unschedulableQ holds pods that have been tried and determined to be unschedulable.
type PriorityQueue struct {
lock sync.RWMutex // read-write lock
cond sync.Cond // condition variable used to wake up consumers of activeQ
activeQ *Heap
unschedulableQ *UnschedulablePodsMap
// pods nominated to run on a node, keyed by node name
nominatedPods map[string][]*v1.Pod
// receivedMoveRequest is set to true when a request to move pods from unschedulableQ to activeQ is received, and reset to false when a pod is popped from activeQ. It indicates that a move request arrived while we were trying to schedule a pod; in that case the unschedulable pod is put back into activeQ.
receivedMoveRequest bool
}
// NewPriorityQueue creates a PriorityQueue object.
func NewPriorityQueue() *PriorityQueue {
pq := &PriorityQueue{
activeQ: newHeap(cache.MetaNamespaceKeyFunc, activeQComp),
unschedulableQ: newUnschedulablePodsMap(),
nominatedPods: map[string][]*v1.Pod{},
}
pq.cond.L = &pq.lock
return pq
}
// comparison function: higher priority first; equal priorities are ordered by timestamp
func activeQComp(pod1, pod2 interface{}) bool {
p1 := pod1.(*v1.Pod)
p2 := pod2.(*v1.Pod)
prio1 := util.GetPodPriority(p1)
prio2 := util.GetPodPriority(p2)
return (prio1 > prio2) || (prio1 == prio2 && podTimestamp(p1).Before(podTimestamp(p2)))
}
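The standalone sketch below reproduces this comparison without the scheduler's util helpers, to show which of two pods would be popped from activeQ first. The priority extraction mirrors util.GetPodPriority; the tie-break here uses the creation timestamp, whereas the real podTimestamp prefers the PodScheduled condition's transition time.
// Illustration of the activeQ ordering: higher Spec.Priority wins; on a tie
// the older pod goes first (creation timestamp used here for simplicity).
package main

import (
	"fmt"
	"time"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func podPriority(p *v1.Pod) int32 {
	if p.Spec.Priority != nil {
		return *p.Spec.Priority
	}
	return 0 // same default as util.GetPodPriority
}

func higherPriorityFirst(p1, p2 *v1.Pod) bool {
	prio1, prio2 := podPriority(p1), podPriority(p2)
	return prio1 > prio2 ||
		(prio1 == prio2 && p1.CreationTimestamp.Time.Before(p2.CreationTimestamp.Time))
}

func main() {
	high, low := int32(1000), int32(10)
	p1 := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "critical", CreationTimestamp: metav1.NewTime(time.Now())},
		Spec:       v1.PodSpec{Priority: &high},
	}
	p2 := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "batch", CreationTimestamp: metav1.NewTime(time.Now().Add(-time.Hour))},
		Spec:       v1.PodSpec{Priority: &low},
	}
	fmt.Println(higherPriorityFirst(p1, p2)) // true: "critical" would be popped first
}
The heap underlying activeQ is built as follows: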
// newHeap returns a Heap which can be used to queue up items to process.
// a priority queue implemented on top of a heap
func newHeap(keyFn KeyFunc, lessFn LessFunc) *Heap {
return &Heap{
data: &heapData{
items: map[string]*heapItem{},
queue: []string{},
keyFunc: keyFn,
lessFunc: lessFn,
},
}
}
// newUnschedulablePodsMap initializes a new object of UnschedulablePodsMap.
func newUnschedulablePodsMap() *UnschedulablePodsMap {
return &UnschedulablePodsMap{
pods: make(map[string]*v1.Pod),
keyFunc: util.GetPodFullName,
}
}
Run the PodInformer and start the InformerFactory. This part is client-go's informer machinery.
// Start all informers.
go c.PodInformer.Informer().Run(stopCh)
c.InformerFactory.Start(stopCh)
Wait for the caches to sync before scheduling.
// Wait for all caches to sync before scheduling.
c.InformerFactory.WaitForCacheSync(stopCh)
controller.WaitForCacheSync("scheduler", stopCh, c.PodInformer.Informer().HasSynced)
InformerFactory.WaitForCacheSync waits for the caches of every started informer to sync, so that the local stores stay consistent with the latest state in etcd.
// WaitForCacheSync waits for all started informers' cache were synced.
func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool {
informers := func() map[reflect.Type]cache.SharedIndexInformer {
f.lock.Lock()
defer f.lock.Unlock()
informers := map[reflect.Type]cache.SharedIndexInformer{}
for informerType, informer := range f.informers {
if f.startedInformers[informerType] {
informers[informerType] = informer
}
}
return informers
}()
res := map[reflect.Type]bool{}
// wait for every started informer to finish syncing
for informType, informer := range informers {
res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced)
}
return res
}
It then calls cache.WaitForCacheSync.
// WaitForCacheSync waits for caches to populate. It returns true if it was successful, false
// if the controller should shutdown
func WaitForCacheSync(stopCh <-chan struct{}, cacheSyncs ...InformerSynced) bool {
// poll until the informers have finished syncing
err := wait.PollUntil(syncedPollPeriod,
func() (bool, error) {
for _, syncFunc := range cacheSyncs {
if !syncFunc() {
return false, nil
}
}
return true, nil
},
stopCh)
if err != nil {
glog.V(2).Infof("stop requested")
return false
}
glog.V(4).Infof("caches populated")
return true
}
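cache.WaitForCacheSync is built on wait.PollUntil, which calls a condition function at a fixed interval until it returns true or the stop channel is closed. A small usage sketch:
// Minimal demonstration of wait.PollUntil, the primitive behind
// cache.WaitForCacheSync: poll a condition every 100ms until it reports true
// or the stop channel is closed.
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stop := make(chan struct{})
	start := time.Now()

	err := wait.PollUntil(100*time.Millisecond, func() (bool, error) {
		// Stand-in for informer.HasSynced: report success after one second.
		return time.Since(start) > time.Second, nil
	}, stop)

	fmt.Println("synced, err =", err)
}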
controller.WaitForCacheSync is a thin wrapper around cache.WaitForCacheSync that, keyed by controller name, logs which controller is waiting for its caches to sync and whether the sync succeeded.
controller.WaitForCacheSync("scheduler", stop, s.PodInformer.Informer().HasSynced)
The code of controller.WaitForCacheSync is as follows:
// WaitForCacheSync is a wrapper around cache.WaitForCacheSync that generates log messages
// indicating that the controller identified by controllerName is waiting for syncs, followed by
// either a successful or failed sync.
func WaitForCacheSync(controllerName string, stopCh <-chan struct{}, cacheSyncs ...cache.InformerSynced) bool {
glog.Infof("Waiting for caches to sync for %s controller", controllerName)
if !cache.WaitForCacheSync(stopCh, cacheSyncs...) {
utilruntime.HandleError(fmt.Errorf("Unable to sync caches for %s controller", controllerName))
return false
}
glog.Infof("Caches are synced for %s controller", controllerName)
return true
}
If there are multiple schedulers and leader election is enabled, the LeaderElector runs until leadership is lost or the process exits.
// If leader election is enabled, run via LeaderElector until done and exit.
if c.LeaderElection != nil {
c.LeaderElection.Callbacks = leaderelection.LeaderCallbacks{
OnStartedLeading: run,
OnStoppedLeading: func() {
utilruntime.HandleError(fmt.Errorf("lost master"))
},
}
leaderElector, err := leaderelection.NewLeaderElector(*c.LeaderElection)
if err != nil {
return fmt.Errorf("couldn't create leader elector: %v", err)
}
leaderElector.Run(ctx)
return fmt.Errorf("lost lease")
}
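makeLeaderElectionConfig is not shown above; roughly, it builds an endpoints-based resource lock (the defaults in the flag dump are --lock-object-namespace=kube-system and --lock-object-name=kube-scheduler) plus the timing parameters. The sketch below follows the client-go leaderelection package of this era; treat the exact types and fields as an approximation rather than the scheduler's verbatim code.
// Rough sketch of what a scheduler-style leader-election config looks like:
// an Endpoints resource lock plus lease/renew/retry timings and callbacks.
// This is an approximation, not kube-scheduler's makeLeaderElectionConfig.
package main

import (
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
	"k8s.io/client-go/tools/record"
)

func newLeaderElectionConfig(client kubernetes.Interface, recorder record.EventRecorder, id string) *leaderelection.LeaderElectionConfig {
	lock := &resourcelock.EndpointsLock{
		EndpointsMeta: metav1.ObjectMeta{Namespace: "kube-system", Name: "kube-scheduler"},
		Client:        client.CoreV1(),
		LockConfig: resourcelock.ResourceLockConfig{
			Identity:      id, // typically hostname plus a unique suffix
			EventRecorder: recorder,
		},
	}
	return &leaderelection.LeaderElectionConfig{
		Lock:          lock,
		LeaseDuration: 15 * time.Second, // --leader-elect-lease-duration
		RenewDeadline: 10 * time.Second, // --leader-elect-renew-deadline
		RetryPeriod:   2 * time.Second,  // --leader-elect-retry-period
		// Callbacks (OnStartedLeading / OnStoppedLeading) are filled in by
		// the Run function shown above.
	}
}

func main() { _ = newLeaderElectionConfig }
The reusable run function passed to OnStartedLeading is prepared earlier in Run: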
// Prepare a reusable run function.
run := func(ctx context.Context) {
sched.Run()
<-ctx.Done()
}
ctx, cancel := context.WithCancel(context.TODO()) // TODO once Run() accepts a context, it should be used here
defer cancel()
go func() {
select {
case <-stopCh:
cancel()
case <-ctx.Done():
}
}()
...
run(ctx)
Scheduler.Run first waits for the cache to sync, then starts the goroutine that runs the scheduling logic. The code of Scheduler.Run is as follows:
// Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately.
func (sched *Scheduler) Run() {
if !sched.config.WaitForCacheSync() {
return
}
go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything)
}
The above covers the initialization of the scheduler starting from /cmd/kube-scheduler/scheduler.go. The next section analyzes algorithm registration and the meaning of each field produced for the scheduler config.