前面文章一部分已有介绍Pod亲和性相关的调度策略算法分析,接下来我们继续Pod相关调度算法分析:
三 POD亲和性
Pod亲和性基础描述:
yml配置实例sample:
---
apiVersion: apps/v1beta1
kind: Deployment
metadata:
  name: affinity
  labels:
    app: affinity
spec:
  replicas: 3
  template:
    metadata:
      labels:
        app: affinity
        role: lab-web
    spec:
      containers:
      - name: nginx
        image: nginx:1.9.0
        ports:
        - containerPort: 80
          # Port names must be a valid IANA_SVC_NAME: lowercase alphanumerics and '-', at most 15 characters.
          name: nginx-web-lab
      affinity:
        # For high availability the three replicas should land on different Nodes, so the
        # anti-affinity selector has to match this Deployment's own pod label (app=affinity).
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - affinity
            topologyKey: kubernetes.io/hostname
3.1 Pod亲和性预选策略MatchInterPodAffinityPred
策略说明:
对需被调度的Pod进行亲和/反亲和配置匹配检测目标Pods,然后获取满足亲和条件的Pods所运行的Nodes
的 TopologyKey的值(亲和性pod定义topologyKey)与目标 Nodes进行一一匹配是否符合条件.
适用PodAffinity配置项:
PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
预选策略源码分析:
- 策略注册:defaultPredicates()注册了一条名为“MatchInterPodAffinity”预选策略项.
!FILENAME pkg/scheduler/algorithmprovider/defaults/defaults.go:143
// defaultPredicates is shown here only as an excerpt: the "..." lines stand for code the
// article elided. The visible part registers the inter-pod affinity fit predicate.
func defaultPredicates() sets.String {
...
// Register the predicate factory under the name predicates.MatchInterPodAffinityPred.
factory.RegisterFitPredicateFactory(
predicates.MatchInterPodAffinityPred,
// The factory builds the predicate from the NodeInfo and PodLister plugin arguments.
func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
return predicates.NewPodAffinityPredicate(args.NodeInfo, args.PodLister)
},
...
}
- 策略Func: checker.InterPodAffinityMatches()
Func是通过NewPodAffinityPredicate()实例化PodAffinityChecker类对象后返回。
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1138
// PodAffinityChecker bundles the dependencies used by the inter-pod affinity predicate.
type PodAffinityChecker struct {
// info is used to look up a node by name (see the c.info.GetNodeInfo calls).
info NodeInfo
// podLister is used to list existing pods (see the c.podLister.FilteredList calls).
podLister algorithm.PodLister
}
// NewPodAffinityPredicate builds a PodAffinityChecker from the given node info and pod
// lister and returns its InterPodAffinityMatches method as the fit predicate.
func NewPodAffinityPredicate(info NodeInfo, podLister algorithm.PodLister) algorithm.FitPredicate {
	affinityChecker := PodAffinityChecker{info: info, podLister: podLister}
	// The method value binds the checker, so callers receive a plain predicate func.
	return (&affinityChecker).InterPodAffinityMatches
}
InterPodAffinityMatches()
检测一个pod是否满足调度到特定的(符合pod亲和或反亲和配置)Node上。
- satisfiesExistingPodsAntiAffinity() 满足存在的Pods反亲和配置.
- satisfiesPodsAffinityAntiAffinity() 满足Pods亲和与反亲和配置.
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1155
// InterPodAffinityMatches reports whether pod may be placed on the node described by
// nodeInfo with respect to inter-pod affinity and anti-affinity.
//
// It returns (true, nil, nil) when the node is acceptable. Otherwise it returns false and
// a failure-reason list that starts with ErrPodAffinityNotMatch followed by the specific
// reason, together with any error produced by the failing check. A nodeInfo without a
// node yields an error.
func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
	node := nodeInfo.Node()
	if node == nil {
		return false, nil, fmt.Errorf("node not found")
	}

	// ① Placing pod here must not violate the anti-affinity rules of existing pods.
	// The locals are named reason/err so the predeclared identifier `error` is not shadowed.
	if reason, err := c.satisfiesExistingPodsAntiAffinity(pod, meta, nodeInfo); reason != nil {
		return false, []algorithm.PredicateFailureReason{ErrPodAffinityNotMatch, reason}, err
	}

	// Now check if requirements will be satisfied on this node.
	affinity := pod.Spec.Affinity
	if affinity == nil || (affinity.PodAffinity == nil && affinity.PodAntiAffinity == nil) {
		// The pod declares no pod (anti-)affinity of its own: nothing more to check.
		return true, nil, nil
	}

	// ② The pod's own affinity and anti-affinity rules must hold on this node.
	if reason, err := c.satisfiesPodsAffinityAntiAffinity(pod, meta, nodeInfo, affinity); reason != nil {
		return false, []algorithm.PredicateFailureReason{ErrPodAffinityNotMatch, reason}, err
	}
	return true, nil, nil
}
① satisfiesExistingPodsAntiAffinity()
检测当pod被调度到目标node上是否触犯了其它pods所定义的反亲和配置.
即:当调度一个pod到目标Node上,而某个或某些Pod定义了反亲和配置与被
调度的Pod相匹配(触犯),那么就不应该将此Node加入到可选的潜在调度Nodes列表内.
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1293
// satisfiesExistingPodsAntiAffinity checks that scheduling pod onto the node in nodeInfo
// would not violate the anti-affinity rules declared by already existing pods.
//
// It returns (nil, nil) when the node is acceptable, and
// ErrExistingPodsAntiAffinityRulesNotMatch (plus an error when one occurred) otherwise.
func (c *PodAffinityChecker) satisfiesExistingPodsAntiAffinity(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (algorithm.PredicateFailureReason, error) {
	node := nodeInfo.Node()
	if node == nil {
		return ErrExistingPodsAntiAffinityRulesNotMatch, fmt.Errorf("Node is nil")
	}

	var topologyMaps *topologyPairsMaps
	predicateMeta, hasMeta := meta.(*predicateMetadata)
	if hasMeta {
		// Precomputed metadata is available: reuse its anti-affinity topology pairs.
		topologyMaps = predicateMeta.topologyPairsAntiAffinityPodsMap
	} else {
		// No precomputed metadata: list the existing pods through nodeInfo.Filter.
		// Per the original note, the filter drops pods whose nodeName equals this node's
		// name but which are absent from nodeInfo; pods running on other nodes pass.
		filteredPods, err := c.podLister.FilteredList(nodeInfo.Filter, labels.Everything())
		if err != nil {
			errMessage := fmt.Sprintf("Failed to get all pods, %+v", err)
			klog.Error(errMessage)
			return ErrExistingPodsAntiAffinityRulesNotMatch, errors.New(errMessage)
		}

		// Collect the topology pairs of existing pods whose anti-affinity terms match pod.
		topologyMaps, err = c.getMatchingAntiAffinityTopologyPairsOfPods(pod, filteredPods)
		if err != nil {
			errMessage := fmt.Sprintf("Failed to get all terms that pod %+v matches, err: %+v", podName(pod), err)
			klog.Error(errMessage)
			return ErrExistingPodsAntiAffinityRulesNotMatch, errors.New(errMessage)
		}
	}

	// Any node label that equals one of the collected anti-affinity topology pairs
	// disqualifies this node.
	for labelKey, labelValue := range node.Labels {
		pair := topologyPair{key: labelKey, value: labelValue}
		if topologyMaps.topologyPairToPods[pair] == nil {
			continue
		}
		klog.V(10).Infof("Cannot schedule pod %+v onto node %v", podName(pod), node.Name)
		return ErrExistingPodsAntiAffinityRulesNotMatch, nil
	}
	return nil, nil
}
getMatchingAntiAffinityTopologyPairsOfPods()
获取被调度Pod与其它存在反亲和配置的Pods匹配的topologyMaps
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1270
// getMatchingAntiAffinityTopologyPairsOfPods merges, over all existingPods, the topology
// pairs of anti-affinity terms that match pod. Existing pods whose node cannot be found
// are logged and skipped; any other lookup error aborts the computation.
func (c *PodAffinityChecker) getMatchingAntiAffinityTopologyPairsOfPods(pod *v1.Pod, existingPods []*v1.Pod) (*topologyPairsMaps, error) {
	merged := newTopologyPairsMaps()

	for _, existingPod := range existingPods {
		// Resolve the node the existing pod is running on.
		hostNode, err := c.info.GetNodeInfo(existingPod.Spec.NodeName)
		switch {
		case err == nil:
			// Node resolved; continue below.
		case apierrors.IsNotFound(err):
			klog.Errorf("Node not found, %v", existingPod.Spec.NodeName)
			continue
		default:
			return nil, err
		}

		// Topology pairs contributed by this existing pod on its node (the per-pod
		// helper getMatchingAntiAffinityTopologyPairsOfPod is described below).
		pairs, err := getMatchingAntiAffinityTopologyPairsOfPod(pod, existingPod, hostNode)
		if err != nil {
			return nil, err
		}
		merged.appendMaps(pairs)
	}
	return merged, nil
}
// getMatchingAntiAffinityTopologyPairsOfPod works in two steps:
//  1. If existingPod declares no anti-affinity, it returns (nil, nil) immediately.
//  2. Otherwise, for every anti-affinity term of existingPod that matches newPod
//     (namespace and label selector), it records the pair made of the term's
//     TopologyKey and the value of that key in node's labels, when node has that label.
func getMatchingAntiAffinityTopologyPairsOfPod(newPod *v1.Pod, existingPod *v1.Pod, node *v1.Node) (*topologyPairsMaps, error) {
	existingAffinity := existingPod.Spec.Affinity
	if existingAffinity == nil || existingAffinity.PodAntiAffinity == nil {
		return nil, nil
	}

	pairs := newTopologyPairsMaps()
	antiTerms := GetPodAntiAffinityTerms(existingAffinity.PodAntiAffinity)
	for i := range antiTerms {
		term := &antiTerms[i]

		namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(existingPod, term)
		selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
		if err != nil {
			return nil, err
		}
		if !priorityutil.PodMatchesTermsNamespaceAndSelector(newPod, namespaces, selector) {
			continue
		}

		// Only nodes that actually carry the topology label contribute a pair.
		topologyValue, labelled := node.Labels[term.TopologyKey]
		if !labelled {
			continue
		}
		pairs.addTopologyPair(topologyPair{key: term.TopologyKey, value: topologyValue}, existingPod)
	}
	return pairs, nil
}
② satisfiesPodsAffinityAntiAffinity()
满足Pods亲和与反亲和配置.
我们先看一下代码结构,我将共分为两个部分if{}部分,else{}部分,依赖于是否指定了预处理的预选metadata.
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1367
// satisfiesPodsAffinityAntiAffinity checks the pod's own affinity and anti-affinity rules
// against the node in nodeInfo. This is a structural skeleton: the two "..." bodies are
// elided here and shown separately as partI and partII.
// It returns (nil, nil) when no rule is violated.
func (c *PodAffinityChecker) satisfiesPodsAffinityAntiAffinity(pod *v1.Pod,
meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo,
affinity *v1.Affinity) (algorithm.PredicateFailureReason, error) {
node := nodeInfo.Node()
if node == nil {
return ErrPodAffinityRulesNotMatch, fmt.Errorf("Node is nil")
}
// The branch taken depends on whether meta is a *predicateMetadata.
if predicateMeta, ok := meta.(*predicateMetadata); ok {
... //partI
} else {
... //partII
}
return nil, nil
}
partI
if{...}
- 如果指定了预处理metadata,则使用此逻辑,否则跳至else{...}
- 获取所有pod亲和性定义AffinityTerms,如果存在亲和性定义,基于指定的metadata判断AffinityTerms所定义的node topologyKey与值是否全部存在于metadata.topologyPairsPotentialAffinityPods之内(潜在匹配亲和定义的pod list)。
- 获取所有pod反亲和性定义AntiAffinityTerms,如果存在反亲和定义,基于指定的metadata判断AntiAffinityTerms所定义的node topologyKey与值是否有任意一个存在于metadata.topologyPairsPotentialAntiAffinityPods之内(潜在匹配anti反亲和定义的pod list)。
// partI: path taken when meta is a *predicateMetadata (precomputed metadata available).
if predicateMeta, ok := meta.(*predicateMetadata); ok {
// Check all affinity terms.
topologyPairsPotentialAffinityPods := predicateMeta.topologyPairsPotentialAffinityPods
if affinityTerms := GetPodAffinityTerms(affinity.PodAffinity); len(affinityTerms) > 0 {
matchExists := c.nodeMatchesAllTopologyTerms(pod, topologyPairsPotentialAffinityPods, nodeInfo, affinityTerms)
if !matchExists {
// The node is still accepted when there are no potential affinity pods at all
// and the pod matches its own affinity terms; otherwise it is rejected.
if !(len(topologyPairsPotentialAffinityPods.topologyPairToPods) == 0 && targetPodMatchesAffinityOfPod(pod, pod)) {
klog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinity",
podName(pod), node.Name)
return ErrPodAffinityRulesNotMatch, nil
}
}
}
// Check all anti-affinity terms.
topologyPairsPotentialAntiAffinityPods := predicateMeta.topologyPairsPotentialAntiAffinityPods
if antiAffinityTerms := GetPodAntiAffinityTerms(affinity.PodAntiAffinity); len(antiAffinityTerms) > 0 {
matchExists := c.nodeMatchesAnyTopologyTerm(pod, topologyPairsPotentialAntiAffinityPods, nodeInfo, antiAffinityTerms)
// A single matching anti-affinity term is enough to reject the node.
if matchExists {
klog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAntiAffinity",
podName(pod), node.Name)
return ErrPodAntiAffinityRulesNotMatch, nil
}
}
}
以下说明继续if{…}内所用的各个子逻辑函数分析(按代码位置的先后顺序):
GetPodAffinityTerms()
如果存在podAffinity硬性配置(RequiredDuringSchedulingIgnoredDuringExecution),获取所有"匹配必要条件"Terms
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1217
// GetPodAffinityTerms returns the required (hard) pod affinity terms of podAffinity,
// or nil when podAffinity is nil or declares no such terms.
func GetPodAffinityTerms(podAffinity *v1.PodAffinity) (terms []v1.PodAffinityTerm) {
	if podAffinity == nil {
		return nil
	}
	required := podAffinity.RequiredDuringSchedulingIgnoredDuringExecution
	if len(required) == 0 {
		return nil
	}
	return required
}
nodeMatchesAllTopologyTerms()
判断目标Node是否匹配所有亲和性配置的定义Terms的topology值.
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1336
// nodeMatchesAllTopologyTerms reports whether the node in nodeInfo satisfies every term:
// for each term the node must carry the term's TopologyKey as a label, and the resulting
// (key, value) pair must be present in topologyPairs.
// Note: on this path topologyPairs comes from precomputed metadata
// (topologyPairsPotentialAffinityPods at the call site).
func (c *PodAffinityChecker) nodeMatchesAllTopologyTerms(pod *v1.Pod, topologyPairs *topologyPairsMaps, nodeInfo *schedulercache.NodeInfo, terms []v1.PodAffinityTerm) bool {
	node := nodeInfo.Node()
	for i := range terms {
		key := terms[i].TopologyKey

		// The node must be labelled with the term's topology key at all.
		value, labelled := node.Labels[key]
		if !labelled {
			return false
		}
		// The node's (key, value) must be one of the known topology pairs.
		if _, known := topologyPairs.topologyPairToPods[topologyPair{key: key, value: value}]; !known {
			return false // a single unmet term fails the whole check
		}
	}
	return true
}
// topologyPairsMaps keeps two maps relating topology pairs and pods.
type topologyPairsMaps struct {
// topologyPairToPods maps a topology pair to a set of pods.
topologyPairToPods map[topologyPair]podSet
// podToTopologyPairs maps a string key to a set of topology pairs
// (presumably the key identifies a pod — TODO confirm against the full source).
podToTopologyPairs map[string]topologyPairSet
}
targetPodMatchesAffinityOfPod()
根据pod的亲和定义检测目标pod的NameSpace是否符合条件以及 Labels.selector条件表达式是否匹配.
!FILENAME pkg/scheduler/algorithm/predicates/metadata.go:498
// targetPodMatchesAffinityOfPod reports whether targetPod matches all required affinity
// terms (namespaces and label selector) declared by pod. It returns false when pod has no
// pod affinity or when the term properties cannot be built.
func targetPodMatchesAffinityOfPod(pod, targetPod *v1.Pod) bool {
	podAffinitySpec := pod.Spec.Affinity
	if podAffinitySpec == nil || podAffinitySpec.PodAffinity == nil {
		return false
	}

	requiredTerms := GetPodAffinityTerms(podAffinitySpec.PodAffinity)
	termProps, err := getAffinityTermProperties(pod, requiredTerms) // ①
	if err != nil {
		klog.Errorf("error in getting affinity properties of Pod %v", pod.Name)
		return false
	}
	return podMatchesAllAffinityTermProperties(targetPod, termProps) // ②
}
// ① getAffinityTermProperties converts each affinity term into an affinityTermProperties
// entry holding the term's {namespaces, selector}. It returns nil properties when terms
// is empty, and a nil slice plus the error when a label selector cannot be converted.
func getAffinityTermProperties(pod *v1.Pod, terms []v1.PodAffinityTerm) (properties []*affinityTermProperties, err error) {
	for i := range terms {
		term := &terms[i]

		namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(pod, term)
		// Build a labels.Selector from the term's label selector definition.
		selector, convErr := metav1.LabelSelectorAsSelector(term.LabelSelector)
		if convErr != nil {
			return nil, convErr
		}
		properties = append(properties, &affinityTermProperties{namespaces: namespaces, selector: selector})
	}
	return properties, nil
}
// GetNamespacesFromPodAffinityTerm returns the set of namespaces a term applies to:
// the namespaces listed in the term, or the namespace of pod when the term lists none.
func GetNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.String {
	namespaces := sets.String{}
	if explicit := podAffinityTerm.Namespaces; len(explicit) > 0 {
		namespaces.Insert(explicit...)
		return namespaces
	}
	// No namespaces on the term: fall back to the pod's own namespace.
	namespaces.Insert(pod.Namespace)
	return namespaces
}
// ② podMatchesAllAffinityTermProperties reports whether pod matches the namespaces and
// selector of every entry in properties, checking each one through
// PodMatchesTermsNamespaceAndSelector. An empty properties list never matches.
func podMatchesAllAffinityTermProperties(pod *v1.Pod, properties []*affinityTermProperties) bool {
	matched := len(properties) > 0
	// Stop at the first property the pod fails to match.
	for i := 0; matched && i < len(properties); i++ {
		property := properties[i]
		matched = priorityutil.PodMatchesTermsNamespaceAndSelector(pod, property.namespaces, property.selector)
	}
	return matched
}
// PodMatchesTermsNamespaceAndSelector reports whether pod lies in one of the given
// namespaces and its labels are matched by selector.
//   - The namespace is checked first; a pod outside namespaces never matches.
//   - Only then are the pod's labels tested against selector.
func PodMatchesTermsNamespaceAndSelector(pod *v1.Pod, namespaces sets.String, selector labels.Selector) bool {
	return namespaces.Has(pod.Namespace) && selector.Matches(labels.Set(pod.Labels))
}
GetPodAntiAffinityTerms()
获取pod反亲和配置所有的必要条件Terms
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1231
// GetPodAntiAffinityTerms returns the required (hard) pod anti-affinity terms of
// podAntiAffinity, or nil when podAntiAffinity is nil or declares no such terms.
func GetPodAntiAffinityTerms(podAntiAffinity *v1.PodAntiAffinity) (terms []v1.PodAffinityTerm) {
	if podAntiAffinity == nil {
		return nil
	}
	required := podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
	if len(required) == 0 {
		return nil
	}
	return required
}
nodeMatchesAnyTopologyTerm()
判断目标Node是否匹配了任意一条反亲和定义Terms的topology值.
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1353
// nodeMatchesAnyTopologyTerm reports whether the node in nodeInfo matches at least one
// of the given anti-affinity terms: the node carries the term's TopologyKey as a label
// and the resulting (key, value) pair is present in topologyPairs.
// The logic mirrors nodeMatchesAllTopologyTerms, except that one match is sufficient.
func (c *PodAffinityChecker) nodeMatchesAnyTopologyTerm(pod *v1.Pod, topologyPairs *topologyPairsMaps, nodeInfo *schedulercache.NodeInfo, terms []v1.PodAffinityTerm) bool {
	node := nodeInfo.Node()
	for i := range terms {
		key := terms[i].TopologyKey

		value, labelled := node.Labels[key]
		if !labelled {
			continue
		}
		if _, known := topologyPairs.topologyPairToPods[topologyPair{key: key, value: value}]; known {
			return true // one satisfied term is enough
		}
	}
	return false
}
partII
else{...}
- 如果没有预处理的Metadata,则通过指定podFilter过滤器获取满足条件的pod列表
- 获取所有亲和配置定义,如果存在,则通过PodAffinity所定义的所有namespaces和标签条件表达式去匹配"目标pod",完全符合则获取此目标pod的运行node的topologykey(此为affinity指定的topologykey)的值,和"潜在Node"的topologykey的值比对是否一致。
- 与上类似,获取所有anti反亲和配置定义,如果存在,则通过PodAntiAffinity所定义的所有namespaces和标签条件表达式去匹配"目标pod",完全符合则获取此目标pod的运行node的topologykey(此为AntiAffinity指定的topologykey)的值,和"潜在Node"的topologykey的值比对是否一致。
// partII: slow path taken when no precomputed *predicateMetadata is available.
else {
// We don't have precomputed metadata. We have to follow a slow path to check affinity terms.
filteredPods, err := c.podLister.FilteredList(nodeInfo.Filter, labels.Everything())
if err != nil {
return ErrPodAffinityRulesNotMatch, err
}
// Fetch the required ("hard") terms of the affinity and anti-affinity definitions.
affinityTerms := GetPodAffinityTerms(affinity.PodAffinity)
antiAffinityTerms := GetPodAntiAffinityTerms(affinity.PodAntiAffinity)
// matchFound: some target pod satisfied all affinity terms including topology.
// termsSelectorMatchFound: some target pod matched the terms' namespace/selector.
matchFound, termsSelectorMatchFound := false, false
for _, targetPod := range filteredPods {
// For every target pod, check the affinity terms until a full match is found.
if !matchFound && len(affinityTerms) > 0 {
// podMatchesPodAffinityTerms() matches the target pod against the terms'
// namespaces and label selectors (explained in detail further below).
affTermsMatch, termsSelectorMatch, err := c.podMatchesPodAffinityTerms(pod, targetPod, nodeInfo, affinityTerms)
if err != nil {
errMessage := fmt.Sprintf("Cannot schedule pod %+v onto node %v, because of PodAffinity, err: %v", podName(pod), node.Name, err)
klog.Error(errMessage)
return ErrPodAffinityRulesNotMatch, errors.New(errMessage)
}
if termsSelectorMatch {
termsSelectorMatchFound = true
}
if affTermsMatch {
matchFound = true
}
}
// Likewise, check the anti-affinity terms against every target pod.
if len(antiAffinityTerms) > 0 {
antiAffTermsMatch, _, err := c.podMatchesPodAffinityTerms(pod, targetPod, nodeInfo, antiAffinityTerms)
// Both an error and a match reject the node; the error is only logged, not returned.
if err != nil || antiAffTermsMatch {
klog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAntiAffinityTerm, err: %v",
podName(pod), node.Name, err)
return ErrPodAntiAffinityRulesNotMatch, nil
}
}
}
if !matchFound && len(affinityTerms) > 0 {
// Some pod matched the selector but not the topology: the affinity cannot be met here.
if termsSelectorMatchFound {
klog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinity",
podName(pod), node.Name)
return ErrPodAffinityRulesNotMatch, nil
}
// Check if pod matches its own affinity properties (namespace and label selector).
if !targetPodMatchesAffinityOfPod(pod, pod) {
klog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinity",
podName(pod), node.Name)
return ErrPodAffinityRulesNotMatch, nil
}
}
}
以下说明继续else{…}内所用的子逻辑函数分析:
podMatchesPodAffinityTerms()
通过获取亲和配置定义的所有namespaces和标签条件表达式进行匹配目标pod,完全符合则获取此目标pod的运行node的topologykey(此为affinity指定的topologykey)的值,和潜在Node的topologykey的值比对是否一致.
!FILENAME pkg/scheduler/algorithm/predicates/predicates.go:1189
// podMatchesPodAffinityTerms checks targetPod against the given (anti-)affinity terms
// of pod. The two booleans are:
//   - first: targetPod matches namespace/selector of all terms AND the node in nodeInfo
//     shares every term's topology key value with the node running targetPod;
//   - second: targetPod matches namespace/selector of all terms (topology aside).
//
// An error is returned for an empty terms slice, an unconvertible selector, a failed
// node lookup, or a term with an empty TopologyKey.
func (c *PodAffinityChecker) podMatchesPodAffinityTerms(pod, targetPod *v1.Pod, nodeInfo *schedulercache.NodeInfo, terms []v1.PodAffinityTerm) (bool, bool, error) {
	if len(terms) == 0 {
		return false, false, fmt.Errorf("terms array is empty")
	}

	// Build the {namespaces, selector} list for the terms.
	termProps, err := getAffinityTermProperties(pod, terms)
	if err != nil {
		return false, false, err
	}
	if selectorsMatch := podMatchesAllAffinityTermProperties(targetPod, termProps); !selectorsMatch {
		return false, false, nil
	}

	// Namespace and selector of the terms have matched. Now we check topology of the terms.
	targetNode, err := c.info.GetNodeInfo(targetPod.Spec.NodeName)
	if err != nil {
		return false, false, err
	}
	for i := range terms {
		key := terms[i].TopologyKey
		switch {
		case len(key) == 0:
			return false, false, fmt.Errorf("empty topologyKey is not allowed except for PreferredDuringScheduling pod anti-affinity")
		case !priorityutil.NodesHaveSameTopologyKey(nodeInfo.Node(), targetNode, key):
			// Selector matched, but the candidate node is in a different topology domain.
			return false, true, nil
		}
	}
	return true, true, nil
}
priorityutil.NodesHaveSameTopologyKey() 真正的topologyKey比较实现的逻辑代码块。
从此代码可以看出deployment的yml中topologyKey的设定可以支持自定义的Node Label key。
!FILENAME pkg/scheduler/algorithm/priorities/util/topologies.go:53
// NodesHaveSameTopologyKey reports whether nodeA and nodeB both carry the label
// topologyKey and hold the same value for it. It returns false for an empty
// topologyKey or when either node has no labels.
func NodesHaveSameTopologyKey(nodeA, nodeB *v1.Node, topologyKey string) bool {
	if len(topologyKey) == 0 || nodeA.Labels == nil || nodeB.Labels == nil {
		return false
	}
	// The topology key is simply a node label key; compare its value on both nodes.
	valueA, foundA := nodeA.Labels[topologyKey]
	valueB, foundB := nodeB.Labels[topologyKey]
	return foundA && foundB && valueA == valueB
}
3.2 Pod亲和性优选策略InterPodAffinityPriority
策略说明:
并发遍历所有潜在的目标Nodes,对其上已存在的Pods与需被调度Pod进行亲和性与反亲和性检测,亲和性匹配则加分,反亲和性
匹配则减分,最终对每个Node进行分数统计。
适用PodAffinity配置项:
PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
优选策略源码分析:
- 策略注册:defaultPriorities()注册了一条名为“InterPodAffinityPriority”优选策略项.
!FILENAME pkg/scheduler/algorithmprovider/defaults/defaults.go:145
// k8s.io/kubernetes/pkg/scheduler/algorithmprovider/defaults/defaults.go
// defaultPriorities is shown here only as an excerpt: the "..." lines stand for code the
// article elided. The visible part registers the "InterPodAffinityPriority" priority.
func defaultPriorities() sets.String {
...
factory.RegisterPriorityConfigFactory(
"InterPodAffinityPriority",
factory.PriorityConfigFactory{
// Function builds the priority function from the plugin arguments.
Function: func(args factory.PluginFactoryArgs) algorithm.PriorityFunction {
return priorities.NewInterPodAffinityPriority(args.NodeInfo, args.NodeLister, args.PodLister, args.HardPodAffinitySymmetricWeight)
},
// Weight given to this priority in the registration.
Weight: 1,
},
),
...
}
- 策略Func: interPodAffinity.CalculateInterPodAffinityPriority()
通过NewInterPodAffinityPriority()实例化InterPodAffinity类对象,并返回其CalculateInterPodAffinityPriority()方法作为策略Func。
!FILENAME pkg/scheduler/algorithm/priorities/interpod_affinity.go:45
// NewInterPodAffinityPriority builds an InterPodAffinity from the given node info,
// listers and hard pod affinity weight, and returns its
// CalculateInterPodAffinityPriority method as the priority function.
func NewInterPodAffinityPriority(
	info predicates.NodeInfo,
	nodeLister algorithm.NodeLister,
	podLister algorithm.PodLister,
	hardPodAffinityWeight int32) algorithm.PriorityFunction {
	scorer := InterPodAffinity{
		info:                  info,
		nodeLister:            nodeLister,
		podLister:             podLister,
		hardPodAffinityWeight: hardPodAffinityWeight,
	}
	// The method value binds the scorer, so callers receive a plain priority func.
	return (&scorer).CalculateInterPodAffinityPriority
}
CalculateInterPodAffinityPriority()
基于pod亲和性配置匹配"必要条件项”Terms,并发处理所有目标nodes,为其目标node统计亲和weight得分.
我们先来看一下它的代码结构:
- processPod := func(existingPod *v1.Pod) error {… pm.processTerms() }
- processNode := func(i int) {…}
- workqueue.ParallelizeUntil(context.TODO(), 16, len(allNodeNames), processNode)
- fScore = float64(schedulerapi.MaxPriority) * ((pm.counts[node.Name] - minCount) / (maxCount - minCount))
此代码逻辑需理解几个定义:
pod 一个"需被调度的Pod"
hasAffinityConstraints "被调度的pod"是否有定义亲和配置
hasAntiAffinityConstraints "被调度的pod"是否有定义反亲和配置
existingPod 一个待处理的"亲和目标pod"
existingPodNode 运行此“亲和目标pod”的节点--“目标Node”
existingHasAffinityConstraints "亲和目标pod"是否存在亲和约束
existingHasAntiAffinityConstraints "亲和目标pod"是否存在反亲和约束
!FILENAME pkg/scheduler/algorithm/priorities/interpod_affinity.go:119
// CalculateInterPodAffinityPriority scores every node in nodes for pod according to
// inter-pod affinity and anti-affinity, and returns one HostPriority per node.
//
// This is an abridged listing: each "..." marks code elided by the article; the bodies
// of processPod and processNode are shown in full in their own listings.
func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*v1.Node) (schedulerapi.HostPriorityList, error) {
	affinity := pod.Spec.Affinity
	// Whether the pod being scheduled declares affinity / anti-affinity constraints.
	hasAffinityConstraints := affinity != nil && affinity.PodAffinity != nil
	hasAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil

	allNodeNames := make([]string, 0, len(nodeNameToInfo))
	for name := range nodeNameToInfo {
		allNodeNames = append(allNodeNames, name)
	}

	// Both start at zero, so the normalization range always includes 0.
	var maxCount float64
	var minCount float64

	pm := newPodAffinityPriorityMap(nodes)

	// ② processPod accumulates the affinity / anti-affinity weights contributed by one
	// existing pod; it delegates the per-term work to processTerms().
	processPod := func(existingPod *v1.Pod) error {
		...
		// ① affinity check
		pm.processTerms(terms, pod, existingPod, existingPodNode, 1)
		...
	}

	// ③ processNode chooses, depending on whether the pod has affinity constraints,
	// which existing pods of node i are handed to processPod().
	processNode := func(i int) {
		...
		if err := processPod(existingPod); err != nil {
			pm.setError(err)
		}
		...
	}

	// Run processNode over all nodes with up to 16 parallel workers.
	workqueue.ParallelizeUntil(context.TODO(), 16, len(allNodeNames), processNode)
	...

	// Find the largest and smallest accumulated weight over the candidate nodes.
	for _, node := range nodes {
		if pm.counts[node.Name] > maxCount {
			maxCount = pm.counts[node.Name]
		}
		if pm.counts[node.Name] < minCount {
			minCount = pm.counts[node.Name]
		}
	}

	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	for _, node := range nodes {
		fScore := float64(0)
		if (maxCount - minCount) > 0 { // ④ normalize the weight into [0, MaxPriority]
			fScore = float64(schedulerapi.MaxPriority) * ((pm.counts[node.Name] - minCount) / (maxCount - minCount))
		}
		result = append(result, schedulerapi.HostPriority{
			Host:  node.Name,
			Score: int(fScore),
		})
	}
	return result, nil
}
① processTerms()
给定Pod和此Pod定义的亲和性配置(podAffinityTerm)、被测目标pod、运行被测目标pod的Node信息,对所有潜在可被调度的Nodes列表进行一一检测,并根据检测结果为node进行weight累计。
流程如下:
1. “被测Pod”的namespace是否与“给定的pod”的namespaces一致;
2. “被测Pod”的labels是否与“给定的pod”的podAffinityTerm定义匹配;
3. 如果前两条件都为True,则对运行“被测的pod”的node的TopologyKey的值与所有潜在可被调度的Node进行遍历检测 TopologyKey的值是否一致,true则累计weight值.
逻辑理解:
1与2实现了找出在同一个namespace下满足被调pod所配置podAffinityTerm的pods;
3则实现获取topologyKey的值与潜在被调度的Node进行匹配检测。此处则可清楚的理解pod亲和性配置匹配的内在含义与逻辑。
!FILENAME pkg/scheduler/algorithm/priorities/interpod_affinity.go:107
// processTerms applies processTerm to every weighted term, using the term's weight
// multiplied by multiplier as the weight passed on.
func (p *podAffinityPriorityMap) processTerms(terms []v1.WeightedPodAffinityTerm, podDefiningAffinityTerm, podToCheck *v1.Pod, fixedNode *v1.Node, multiplier int) {
	for idx := range terms {
		weighted := &terms[idx]
		signedWeight := float64(weighted.Weight * int32(multiplier))
		p.processTerm(&weighted.PodAffinityTerm, podDefiningAffinityTerm, podToCheck, fixedNode, signedWeight)
	}
}
// processTerm adds weight to the count of every node in p.nodes that shares the term's
// topology key value with fixedNode, provided podToCheck matches the term's namespaces
// and label selector. A selector conversion error is recorded through p.setError.
func (p *podAffinityPriorityMap) processTerm(term *v1.PodAffinityTerm, podDefiningAffinityTerm, podToCheck *v1.Pod, fixedNode *v1.Node, weight float64) {
	// Namespaces come from the term, or from the defining pod when the term lists none;
	// the selector is built from the term's label selector definition.
	namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(podDefiningAffinityTerm, term)
	selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
	if err != nil {
		p.setError(err)
		return
	}

	// Nothing to accumulate unless the checked pod matches namespace and selector.
	if !priorityutil.PodMatchesTermsNamespaceAndSelector(podToCheck, namespaces, selector) {
		return
	}

	p.Lock()
	defer p.Unlock()
	// Compare the node running the matched pod with every candidate node; candidates
	// sharing the same topology key value get the weight added.
	for _, candidate := range p.nodes {
		if priorityutil.NodesHaveSameTopologyKey(candidate, fixedNode, term.TopologyKey) {
			p.counts[candidate.Name] += weight
		}
	}
}
GetNamespacesFromPodAffinityTerm()
返回Namespaces列表(如果term未指定Namespace则使用被调度pod的Namespace)
!FILENAME pkg/scheduler/algorithm/priorities/util/topologies.go:28
// GetNamespacesFromPodAffinityTerm returns the set of namespaces a term applies to:
// the namespaces listed in the term, or the namespace of pod when the term lists none.
func GetNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.String {
	namespaces := sets.String{}
	if explicit := podAffinityTerm.Namespaces; len(explicit) > 0 {
		namespaces.Insert(explicit...)
		return namespaces
	}
	// No namespaces on the term: fall back to the pod's own namespace.
	namespaces.Insert(pod.Namespace)
	return namespaces
}
PodMatchesTermsNamespaceAndSelector()
检测NameSpace一致性和Labels.selector是否匹配.
!FILENAME pkg/scheduler/algorithm/priorities/util/topologies.go:40
// PodMatchesTermsNamespaceAndSelector reports whether pod lies in one of the given
// namespaces and its labels are matched by selector. The namespace is checked first.
func PodMatchesTermsNamespaceAndSelector(pod *v1.Pod, namespaces sets.String, selector labels.Selector) bool {
	return namespaces.Has(pod.Namespace) && selector.Matches(labels.Set(pod.Labels))
}
② **processPod()** 处理亲和和反亲和逻辑层,调用processTerms()进行检测与统计权重值。
!FILENAME pkg/scheduler/algorithm/priorities/interpod_affinity.go:136
// processPod accumulates into pm the weights that one existing pod contributes, in both
// directions: the scheduled pod's terms checked against existingPod, and existingPod's
// terms checked against the scheduled pod. It is a closure over pod, affinity,
// hasAffinityConstraints, hasAntiAffinityConstraints, ipa and pm of the enclosing function.
processPod := func(existingPod *v1.Pod) error {
existingPodNode, err := ipa.info.GetNodeInfo(existingPod.Spec.NodeName)
if err != nil {
// A missing node is logged and the pod is skipped without failing the run.
if apierrors.IsNotFound(err) {
klog.Errorf("Node not found, %v", existingPod.Spec.NodeName)
return nil
}
return err
}
existingPodAffinity := existingPod.Spec.Affinity
existingHasAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAffinity != nil
existingHasAntiAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAntiAffinity != nil
// The scheduled pod has affinity constraints: check its preferred terms against the
// existing pod and its node; matches add the term weight (multiplier 1).
if hasAffinityConstraints {
terms := affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
pm.processTerms(terms, pod, existingPod, existingPodNode, 1)
}
// The scheduled pod has anti-affinity constraints: same check, but matches subtract
// the term weight (multiplier -1).
if hasAntiAffinityConstraints {
terms := affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
pm.processTerms(terms, pod, existingPod, existingPodNode, -1)
}
// The existing pod has affinity constraints: check them the other way round, against
// the scheduled pod; matches add weight.
if existingHasAffinityConstraints {
// Required terms of the existing pod count with hardPodAffinityWeight, when it is positive.
if ipa.hardPodAffinityWeight > 0 {
terms := existingPodAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
for _, term := range terms {
pm.processTerm(&term, existingPod, pod, existingPodNode, float64(ipa.hardPodAffinityWeight))
}
}
terms := existingPodAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
pm.processTerms(terms, existingPod, pod, existingPodNode, 1)
}
// The existing pod has anti-affinity constraints: check them against the scheduled
// pod; matches subtract weight (multiplier -1).
if existingHasAntiAffinityConstraints {
terms := existingPodAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
pm.processTerms(terms, existingPod, pod, existingPodNode, -1)
}
return nil
}
③ **processNode()** 如果"被调度pod"定义了亲和或反亲和配置,则处理该Node上的所有Pods;否则只处理该Node上自身定义了亲和配置的Pods(PodsWithAffinity)。
!FILENAME pkg/scheduler/algorithm/priorities/interpod_affinity.go:193
// processNode handles the i-th entry of allNodeNames: it feeds the existing pods of that
// node to processPod and records any error through pm.setError. It is a closure over
// nodeNameToInfo, allNodeNames, the two constraint flags, processPod and pm.
processNode := func(i int) {
nodeInfo := nodeNameToInfo[allNodeNames[i]]
if nodeInfo.Node() != nil {
if hasAffinityConstraints || hasAntiAffinityConstraints {
// We need to process all the nodes.
for _, existingPod := range nodeInfo.Pods() {
if err := processPod(existingPod); err != nil {
pm.setError(err)
}
}
} else {
// The scheduled pod has no constraints of its own, so only the pods returned by
// PodsWithAffinity() are processed.
for _, existingPod := range nodeInfo.PodsWithAffinity() {
if err := processPod(existingPod); err != nil {
pm.setError(err)
}
}
}
}
}
④ 最后的得分fscore计算公式:
// 10 * (node权重累计值 - 最小权重得分值) / (最大权重得分值 - 最小权重得分值)
fScore = float64(schedulerapi.MaxPriority) * ((pm.counts[node.Name] - minCount) / (maxCount - minCount))
// MaxPriority defines the max priority value.
const MaxPriority = 10
End
...未完,请参看后续三service亲和性.
文章及内容转发请署名XiaoYang