kubernetes v1.12.1
The kubelet evicts certain Pods on the node. Which Pods get evicted was tied to the QoS mechanism (in 1.8); for 1.9 and later, see the breakdown below.
Eviction only happens when the node's memory or disk is under pressure, and its purpose is to reclaim the node's resources.
Soft Eviction Thresholds
The soft eviction mechanism means that once the node's memory/disk usage crosses a threshold, the kubelet observes it for a while: if usage drops back below the threshold, nothing is evicted; if it stays above the threshold for the whole period, eviction is triggered.
The thresholds are configured with --eviction-soft, and the observation window with --eviction-soft-grace-period, e.g. 1m30s.
The parameter --eviction-max-pod-grace-period caps the termination time given to an evicted pod, i.e. how long its containers get to shut down.
Drawbacks of the kubelet relying on the OOM Killer for reclamation:
- System OOM events are only recorded once the OOM has already completed
- After the OOM Killer kills containers, the Scheduler may place new Pods on that node (or the containers simply restart on the node), which triggers the node's OOM Killer again; this can loop endlessly
Relying entirely on oom_kill is not a good solution: it does nothing for containers that are CPU-hungry, and simply killing a pod does not fundamentally resolve the situation. For example, if a pod occupies most of a node's memory and, after being killed, is scheduled back onto the same node, the OOM recurs. So the kubelet added an eviction mechanism on top (quoted; it sounds reasonable).
The kubelet's eviction mainly reclaims two kinds of resources: memory and disk.
--eviction-hard="imagefs.available<15%,memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%"
--eviction-max-pod-grace-period="0"
--eviction-minimum-reclaim=""
--eviction-pressure-transition-period="5m0s"
--eviction-soft=""
--eviction-soft-grace-period=""
Note: thresholds come in two flavors, eviction-soft and eviction-hard. When a soft threshold is crossed, the pod gets some time to exit gracefully; a hard threshold kills the pod immediately with no chance of a graceful exit.
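To make the distinction concrete, here is a rough sketch of what a parsed hard vs. soft threshold looks like (field and constant names from pkg/kubelet/eviction/api; the values are illustrative, not defaults):
// assumed imports: "time", resource "k8s.io/apimachinery/pkg/api/resource",
// evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
hard := evictionapi.Threshold{
	Signal:   evictionapi.SignalMemoryAvailable,
	Operator: evictionapi.OpLessThan,
	Value:    evictionapi.ThresholdValue{Quantity: resource.NewQuantity(100*1024*1024, resource.BinarySI)},
	// GracePeriod is zero: a hard threshold kills the pod immediately
}
soft := evictionapi.Threshold{
	Signal:      evictionapi.SignalMemoryAvailable,
	Operator:    evictionapi.OpLessThan,
	Value:       evictionapi.ThresholdValue{Quantity: resource.NewQuantity(500*1024*1024, resource.BinarySI)},
	GracePeriod: 90 * time.Second, // a soft threshold must stay crossed this long before eviction
}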
- memory.available := node.status.capacity[memory] - node.stats.memory.workingSet
- nodefs.available := node.stats.fs.available
- nodefs.inodesFree := node.stats.fs.inodesFree
- imagefs.available := node.stats.runtime.imagefs.available
- imagefs.inodesFree := node.stats.runtime.imagefs.inodesFree
- allocatableMemory.available: memory still available within the node's allocatable for pods, derived from the "pods" system container stats (see 3.3.2 below)
Note:
- nodefs: the node's own filesystem, which stores things like logs of running workloads
- imagefs: the filesystem dockerd uses for images and containers' writable layers
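A quick worked example of the memory.available formula against the default 100Mi hard threshold (the numbers are made up):
// assumed import: "fmt"
capacity := int64(8 * 1024 * 1024 * 1024)      // node.status.capacity[memory]: 8Gi
workingSet := capacity - 80*1024*1024          // node.stats.memory.workingSet: all but 80Mi is in use
available := capacity - workingSet             // memory.available = 80Mi
if available < 100*1024*1024 {                 // --eviction-hard=memory.available<100Mi
	fmt.Println("memory.available is below 100Mi: MemoryPressure is set and eviction kicks in")
}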
// managerImpl implements Manager
type managerImpl struct {
// used to track time
clock clock.Clock
// config is how the manager is configured
config Config
// the function to invoke to kill a pod
killPodFunc KillPodFunc
// the interface that knows how to do image gc
imageGC ImageGC
// the interface that knows how to do container gc
containerGC ContainerGC
// protects access to internal state
sync.RWMutex
// node conditions are the set of conditions present
nodeConditions []v1.NodeConditionType
// captures when a node condition was last observed based on a threshold being met
nodeConditionsLastObservedAt nodeConditionsObservedAt
// nodeRef is a reference to the node
nodeRef *v1.ObjectReference
// used to record events about the node
recorder record.EventRecorder
// used to measure usage stats on system
summaryProvider stats.SummaryProvider
// records when a threshold was first observed
thresholdsFirstObservedAt thresholdsObservedAt
// records the set of thresholds that have been met (including graceperiod) but not yet resolved
thresholdsMet []evictionapi.Threshold
// signalToRankFunc maps a resource to ranking function for that resource.
signalToRankFunc map[evictionapi.Signal]rankFunc
// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
// last observations from synchronize
lastObservations signalObservations
// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
dedicatedImageFs *bool
// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
thresholdNotifiers []ThresholdNotifier
// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
thresholdsLastUpdated time.Time
}
Path: pkg/kubelet/kubelet.go
The kubelet parses the eviction defaults listed above at startup:
thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim)
if err != nil {
return nil, err
}
evictionConfig := eviction.Config{
PressureTransitionPeriod: kubeCfg.EvictionPressureTransitionPeriod.Duration,
MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod),
Thresholds: thresholds,
KernelMemcgNotification: experimentalKernelMemcgNotification,
PodCgroupRoot: kubeDeps.ContainerManager.GetPodCgroupRoot(),
}
// setup eviction manager
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock)
klet.evictionManager = evictionManager
klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
This is buried fairly deep.
Path: pkg/kubelet/eviction/eviction_manager.go
Start runs the main loop that calls synchronize; if there is nothing to evict, it sleeps for the monitoring interval (10s, it seems).
// Start starts the control loop to observe and response to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
thresholdHandler := func(message string) {
glog.Infof(message)
m.synchronize(diskInfoProvider, podFunc)
}
if m.config.KernelMemcgNotification {
for _, threshold := range m.config.Thresholds {
glog.Infof("zzlin managerImpl Start threshold: %#v m.config.PodCgroupRoot: %v", threshold, m.config.PodCgroupRoot)
if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
if err != nil {
glog.Warningf("eviction manager: failed to create memory threshold notifier: %v", err)
} else {
go notifier.Start()
m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
}
}
}
}
// start the eviction manager monitoring
go func() {
for {
if evictedPods := m.synchronize(diskInfoProvider, podFunc); evictedPods != nil {
glog.Infof("eviction manager: pods %s evicted, waiting for pod to be cleaned up", format.Pods(evictedPods))
m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
} else {
time.Sleep(monitoringInterval)
}
}
}()
}
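For reference, the kubelet calls Start when its runtime-dependent modules are initialized, roughly as below (a sketch from memory of pkg/kubelet/kubelet.go; evictionMonitoringPeriod is 10s, which matches the sleep above):
// the eviction manager must start after cadvisor so it knows whether the runtime has a dedicated imagefs
kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.podResourcesAreReclaimed, evictionMonitoringPeriod)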
The flow of evicting a pod
The HasDedicatedImageFs function
// build the ranking functions (if not yet known)
// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
if m.dedicatedImageFs == nil {
hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs()
if ok != nil {
return nil
}
m.dedicatedImageFs = &hasImageFs
m.signalToRankFunc = buildSignalToRankFunc(hasImageFs)
m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
}
3.1.1 The buildSignalToRankFunc function
Registers the ranking function for each resource used when evicting Pods. The registered eviction signals are:
- memory.available
- allocatableMemory.available
- nodefs.available
- nodefs.inodesFree
- imagefs.available
- imagefs.inodesFree
// buildSignalToRankFunc returns ranking functions associated with resources
func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc {
signalToRankFunc := map[evictionapi.Signal]rankFunc{
evictionapi.SignalMemoryAvailable: rankMemoryPressure,
evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
// with an imagefs, imagefs pod rank func for eviction only includes rootfs
signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
} else {
// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
// since imagefs and nodefs share a common device, they share common ranking functions.
signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
}
return signalToRankFunc
}
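The disk ranking entries above are built by rankDiskPressureFunc, which closes over the set of fs stats to measure; a sketch of its shape:
// rankDiskPressureFunc returns a rankFunc that measures the given fs stats for the given disk resource (sketch)
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) rankFunc {
	return func(pods []*v1.Pod, stats statsFunc) {
		// order by: exceeds-disk-requests first, then pod priority, then disk usage
		orderedBy(exceedDiskRequests(stats, fsStatsToMeasure, diskResource), priority, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
	}
}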
Path: pkg/kubelet/server/stats/summary.go
activePods := podFunc()
updateStats := true
summary, err := m.summaryProvider.Get(updateStats)
if err != nil {
glog.Errorf("eviction manager: failed to get get summary stats: %v", err)
return nil
}
makeSignalObservations
Builds the eviction signal observations and the per-pod StatsFunc from the cAdvisor summary.
The observed signals cover the following resources:
// makeSignalObservations derives observations using the specified summary provider.
func makeSignalObservations(summary *statsapi.Summary) (signalObservations, statsFunc) {
// build the function to work against for pod stats
statsFunc := cachedStatsFunc(summary.Pods)
// build an evaluation context for current eviction signals
result := signalObservations{}
3.3.1 memory.available
available is the node's memory.AvailableBytes; capacity is the node's memory.AvailableBytes + memory.WorkingSetBytes.
if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
result[evictionapi.SignalMemoryAvailable] = signalObservation{
available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
time: memory.Time,
}
}
3.3.2 allocatableMemory.available
if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil {
klog.Errorf("eviction manager: failed to construct signal: %q error: %v", evictionapi.SignalAllocatableMemoryAvailable, err)
} else {
if memory := allocatableContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
time: memory.Time,
}
}
}
3.3.3 nodefs.available and nodefs.inodesFree
if nodeFs := summary.Node.Fs; nodeFs != nil {
if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
result[evictionapi.SignalNodeFsAvailable] = signalObservation{
available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
time: nodeFs.Time,
}
}
if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI),
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI),
time: nodeFs.Time,
}
}
}
The observations obtained from cAdvisor and the configured thresholds are run through thresholdsMet to compute the set of thresholds crossed in this round.
The getThresholdQuantity function returns the threshold's absolute quantity if one is configured; otherwise it returns observed capacity * threshold.Percentage. If eviction-minimum-reclaim is set, that amount is added on top as well.
Thresholds recorded earlier but not yet resolved are then merged with this round's thresholds.
// determine the set of thresholds met independent of grace period
thresholds = thresholdsMet(thresholds, observations, false)
debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)
klog.Infof("zzlin synchronize thresholds: %v", thresholds)
// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
if len(m.thresholdsMet) > 0 {
thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
}
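The getThresholdQuantity helper mentioned above looks roughly like this (a sketch; resource is k8s.io/apimachinery/pkg/api/resource):
// getThresholdQuantity returns the configured absolute quantity if present, otherwise percentage * observed capacity (sketch)
func getThresholdQuantity(value evictionapi.ThresholdValue, capacity *resource.Quantity) *resource.Quantity {
	if value.Quantity != nil {
		return value.Quantity.Copy()
	}
	return resource.NewQuantity(int64(float64(capacity.Value())*float64(value.Percentage)), resource.BinarySI)
}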
thresholdsFirstObservedAt records when each eviction signal was first observed (if a signal already has a recorded time, the earlier time is kept; otherwise it is set to now).
// thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when active set of thresholds were initially met.
func thresholdsFirstObservedAt(thresholds []evictionapi.Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt {
results := thresholdsObservedAt{}
for i := range thresholds {
observedAt, found := lastObservedAt[thresholds[i]]
if !found {
observedAt = now
}
results[thresholds[i]] = observedAt
}
return results
}
The node conditions corresponding to the crossed thresholds' eviction signals are looked up from the registered mapping; for example, memory.available maps to MemoryPressure, as shown below.
// map eviction signals to node conditions
signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{}
signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure
signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure
signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalPIDAvailable] = v1.NodePIDPressure
// nodeConditions returns the set of node conditions associated with a threshold
func nodeConditions(thresholds []evictionapi.Threshold) []v1.NodeConditionType {
results := []v1.NodeConditionType{}
for _, threshold := range thresholds {
if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
if !hasNodeCondition(results, nodeCondition) {
results = append(results, nodeCondition)
}
}
}
return results
}
This round's node conditions are merged with the previous observation, with the most recent observation time winning.
// nodeConditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
func nodeConditionsLastObservedAt(nodeConditions []v1.NodeConditionType, lastObservedAt nodeConditionsObservedAt, now time.Time) nodeConditionsObservedAt {
results := nodeConditionsObservedAt{}
// the input conditions were observed "now"
for i := range nodeConditions {
results[nodeConditions[i]] = now
}
// the conditions that were not observed now are merged in with their old time
for key, value := range lastObservedAt {
_, found := results[key]
if !found {
results[key] = value
}
}
return results
}
The nodeConditionsObservedSince function filters out node conditions that were last observed more than eviction-pressure-transition-period ago (default 5m0s: how long the node waits before transitioning out of a pressure condition).
// nodeConditionsObservedSince returns the set of conditions that have been observed within the specified period
func nodeConditionsObservedSince(observedAt nodeConditionsObservedAt, period time.Duration, now time.Time) []v1.NodeConditionType {
results := []v1.NodeConditionType{}
for nodeCondition, at := range observedAt {
duration := now.Sub(at)
if duration < period {
results = append(results, nodeCondition)
}
}
return results
}
This is where the soft thresholds' grace periods (eviction-soft-grace-period) are enforced: a threshold is only added to the result set once it has been met for longer than its grace period.
// thresholdsMetGracePeriod returns the set of thresholds that have satisfied associated grace period
func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []evictionapi.Threshold {
results := []evictionapi.Threshold{}
for threshold, at := range observedAt {
duration := now.Sub(at)
if duration < threshold.GracePeriod {
klog.V(2).Infof("eviction manager: eviction criteria not yet met for %v, duration: %v", formatThreshold(threshold), duration)
continue
}
results = append(results, threshold)
}
return results
}
// update internal state
m.Lock()
m.nodeConditions = nodeConditions
m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
m.thresholdsMet = thresholds
Determine which of the crossed thresholds are backed by stats that have actually been updated since the last observation:
func thresholdsUpdatedStats(thresholds []evictionapi.Threshold, observations, lastObservations signalObservations) []evictionapi.Threshold {
results := []evictionapi.Threshold{}
for i := range thresholds {
threshold := thresholds[i]
observed, found := observations[threshold.Signal]
if !found {
klog.Warningf("eviction manager: no observation found for eviction signal %v", threshold.Signal)
continue
}
last, found := lastObservations[threshold.Signal]
if !found || observed.time.IsZero() || observed.time.After(last.time.Time) {
results = append(results, threshold)
}
}
return results
}
if len(thresholds) == 0 {
klog.V(3).Infof("eviction manager: no resources are starved")
return nil
}
// rank the thresholds by eviction priority
sort.Sort(byEvictionPriority(thresholds))
thresholdToReclaim := thresholds[0]
resourceToReclaim, found := signalToResource[thresholdToReclaim.Signal]
if !found {
klog.V(3).Infof("eviction manager: threshold %s was crossed, but reclaim is not implemented for this threshold.", thresholdToReclaim.Signal)
return nil
}
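byEvictionPriority simply puts the memory signal ahead of everything else; roughly (a sketch of the 1.12-era sort.Interface):
type byEvictionPriority []evictionapi.Threshold

func (a byEvictionPriority) Len() int      { return len(a) }
func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less ranks memory.available before all other resource signals
func (a byEvictionPriority) Less(i, j int) bool {
	return a[i].Signal == evictionapi.SignalMemoryAvailable
}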
Call reclaimNodeLevelResources to reclaim node-level resources, then observe again; if the fresh observation no longer exceeds any configured threshold, this round ends without evicting a pod.
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{}
signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{}
// with an imagefs, imagefs pressure should delete unused images
signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
}
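When there is no dedicated imagefs, the else branch (not shown above) also wires container and image GC to the nodefs signals, roughly:
// without an imagefs, nodefs and imagefs share a device, so nodefs pressure also triggers container/image GC (sketch)
signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}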
// reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
for _, nodeReclaimFunc := range nodeReclaimFuncs {
// attempt to reclaim the pressured resource.
if err := nodeReclaimFunc(); err != nil {
klog.Warningf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
}
}
if len(nodeReclaimFuncs) > 0 {
summary, err := m.summaryProvider.Get(true)
if err != nil {
klog.Errorf("eviction manager: failed to get summary stats after resource reclaim: %v", err)
return false
}
// make observations and get a function to derive pod usage stats relative to those observations.
observations, _ := makeSignalObservations(summary)
debugLogObservations("observations after resource reclaim", observations)
// determine the set of thresholds met independent of grace period
thresholds := thresholdsMet(m.config.Thresholds, observations, false)
debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
if len(thresholds) == 0 {
return true
}
}
// rank the pods for eviction
rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
if !ok {
klog.Errorf("eviction manager: no ranking function for signal %s", thresholdToReclaim.Signal)
return nil
}
// the only candidates viable for eviction are those pods that had anything running.
if len(activePods) == 0 {
klog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
return nil
}
The ranking rule: pods are ordered first by whether their usage exceeds their requests (up to 1.8 the ordering was based on QoS; from 1.9 it uses exceedMemoryRequests), then by pod priority (also introduced in 1.9), and finally by the amount of memory used above requests.
// rankMemoryPressure orders the input pods for eviction in response to memory pressure.
// It ranks by whether or not the pod's usage exceeds its requests, then by priority, and
// finally by memory usage above requests.
func rankMemoryPressure(pods []*v1.Pod, stats statsFunc) {
orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
}
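The first comparator, exceedMemoryRequests, is roughly the following (a sketch; helper names such as memoryUsage, podRequest, and cmpBool are recalled from the same helpers file and may differ slightly):
// exceedMemoryRequests prefers evicting pods whose memory usage exceeds their requests (sketch)
func exceedMemoryRequests(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}
		p1Memory := memoryUsage(p1Stats.Memory)
		p2Memory := memoryUsage(p2Stats.Memory)
		p1ExceedsRequests := p1Memory.Cmp(podRequest(p1, v1.ResourceMemory)) == 1
		p2ExceedsRequests := p2Memory.Cmp(podRequest(p2, v1.ResourceMemory)) == 1
		// prioritize evicting the pod which exceeds its requests
		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
	}
}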
At most one pod is evicted per round. For a soft eviction the pod is given a termination grace period, capped by eviction-max-pod-grace-period; a hard eviction uses a grace period of 0.
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
gracePeriodOverride := int64(0)
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
}
message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc)
if m.evictPod(pod, gracePeriodOverride, message, annotations) {
return []*v1.Pod{pod}
}
}
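The gracePeriodOverride above stays at 0 for hard thresholds; isHardEvictionThreshold is just a check for a zero grace period (sketch):
// isHardEvictionThreshold: a threshold with no grace period is a hard threshold (sketch)
func isHardEvictionThreshold(threshold evictionapi.Threshold) bool {
	return threshold.GracePeriod == time.Duration(0)
}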
thresholdsMet: computes the set of thresholds met in this round; it is also used to find the previously recorded but still unresolved thresholds, which are then merged with this round's set.
https://kubernetes.io/docs/tasks/administer-cluster/out-of-resource