[kubernetes/k8s source code analysis] kubelet cgroup resource reservation

kubernetes 1.13

 

WHY

      By default, a pod can use all of a node's available resources. If an application inside a user pod starts consuming memory without bound, the pod will compete for resources with the node's system daemons and Kubernetes components, causing a resource shortage on the node and serious problems such as the node going NotReady.

 

WHAT

 Node Capacity
---------------------------
|     kube-reserved       |
|-------------------------|
|     system-reserved     |
|-------------------------|
|    eviction-threshold   |
|-------------------------|
|                         |
|      allocatable        |
|   (available for pods)  |
|                         |
|                         |
---------------------------

     NodeAllocatable = [NodeCapacity] - [kube-reserved] - [system-reserved] - [eviction-threshold]

  • Node Capacity: all hardware resources of the node
  • kube-reserved: resources reserved for the kubelet; from the code this only covers the kubelet itself
  • system-reserved: resources reserved for system processes, including processes started by systemd
  • eviction-threshold: the threshold settings for kubelet eviction
  • Allocatable: the value the scheduler actually uses when placing pods (it guarantees that the sum of the resource requests of all pods on the node does not exceed Allocatable)

 

--cgroups-per-qos: defaults to true; enables the new cgroup hierarchy, i.e. the QoS- and pod-level cgroups under kubepods

 

HOW

       Specifying kube-reserved and system-reserved values lets the kubelet enforce them. Note that to enforce kube-reserved or system-reserved you must also specify --kube-reserved-cgroup or --system-reserved-cgroup respectively. An example flag set follows, with a worked Allocatable calculation after it.

 

--enforce-node-allocatable=pods,kube-reserved,system-reserved
--kube-reserved-cgroup=/system.slice/kubelet.service
--system-reserved-cgroup=/system.slice
--kube-reserved=cpu=200m,memory=250Mi
--system-reserved=cpu=1200m,memory=1250Mi
--eviction-hard=memory.available<5%,nodefs.available<10%,imagefs.available<10%
--eviction-soft=memory.available<10%,nodefs.available<15%,imagefs.available<15%
--eviction-soft-grace-period=memory.available=2m,nodefs.available=2m,imagefs.available=2m
--eviction-max-pod-grace-period=30
--eviction-minimum-reclaim=memory.available=0Mi,nodefs.available=500Mi,imagefs.available=500Mi
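
     As a worked example, assume a node with 16 CPUs and 64Gi (65536Mi) of memory (the node size is an assumption; the flags are the ones above, and only memory has a hard-eviction threshold here):

     Allocatable CPU    = 16000m - 200m (kube-reserved) - 1200m (system-reserved)                                  = 14600m
     Allocatable memory = 65536Mi - 250Mi (kube-reserved) - 1250Mi (system-reserved) - 3276.8Mi (5% eviction-hard) ≈ 60759Mi ≈ 59.3Gi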

 

    Path: cmd/kubelet/app/server.go

func run(s *options.KubeletServer, kubeDeps *kubelet.Dependencies, stopCh <-chan struct{}) (err error)

 

 

1. Collect the cgroup roots, including kubepods and docker (the resulting kubepods path is illustrated after the snippet)

cgroupRoots = append(cgroupRoots, cm.NodeAllocatableRoot(s.CgroupRoot, s.CgroupDriver))
kubeletCgroup, err := cm.GetKubeletContainer(s.KubeletCgroups)
if err != nil {
	return fmt.Errorf("failed to get the kubelet's cgroup: %v", err)
}
if kubeletCgroup != "" {
	cgroupRoots = append(cgroupRoots, kubeletCgroup)
}

runtimeCgroup, err := cm.GetRuntimeContainer(s.ContainerRuntime, s.RuntimeCgroups)
if err != nil {
	return fmt.Errorf("failed to get the container runtime's cgroup: %v", err)
}
if runtimeCgroup != "" {
	// RuntimeCgroups is optional, so ignore if it isn't specified
	cgroupRoots = append(cgroupRoots, runtimeCgroup)
}

if s.SystemCgroups != "" {
	// SystemCgroups is optional, so ignore if it isn't specified
	cgroupRoots = append(cgroupRoots, s.SystemCgroups)
}
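
For reference, cm.NodeAllocatableRoot appends the node-allocatable cgroup name (kubepods) to --cgroup-root and renders it for the configured --cgroup-driver; with --cgroup-root=/ the resulting root is roughly (illustrative):

    cgroupfs driver: /kubepods
    systemd driver:  /kubepods.slice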

 

kubeDeps.ContainerManager, err = cm.NewContainerManager(
	kubeDeps.Mounter,
	kubeDeps.CAdvisorInterface,
	cm.NodeConfig{
		RuntimeCgroupsName:    s.RuntimeCgroups,
		SystemCgroupsName:     s.SystemCgroups,
		KubeletCgroupsName:    s.KubeletCgroups,
		ContainerRuntime:      s.ContainerRuntime,
		CgroupsPerQOS:         s.CgroupsPerQOS,
		CgroupRoot:            s.CgroupRoot,
		CgroupDriver:          s.CgroupDriver,
		KubeletRootDir:        s.RootDirectory,
		ProtectKernelDefaults: s.ProtectKernelDefaults,
		NodeAllocatableConfig: cm.NodeAllocatableConfig{
			KubeReservedCgroupName:   s.KubeReservedCgroup,
			SystemReservedCgroupName: s.SystemReservedCgroup,
			EnforceNodeAllocatable:   sets.NewString(s.EnforceNodeAllocatable...),
			KubeReserved:             kubeReserved,
			SystemReserved:           systemReserved,
			HardEvictionThresholds:   hardEvictionThresholds,
		},
		QOSReserved:                           *experimentalQOSReserved,
		ExperimentalCPUManagerPolicy:          s.CPUManagerPolicy,
		ExperimentalCPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration,
		ExperimentalPodPidsLimit:              s.PodPidsLimit,
		EnforceCPULimits:                      s.CPUCFSQuota,
		CPUCFSQuotaPeriod:                     s.CPUCFSQuotaPeriod.Duration,
	},
	s.FailSwapOn,
	devicePluginEnabled,
	kubeDeps.Recorder)

 

2. NewContainerManager

    GetCgroupSubsystems discovers the mounted cgroup subsystems by reading /proc/self/mountinfo and /proc/self/cgroup

# cat /proc/self/mountinfo 
17 60 0:16 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
18 60 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
19 60 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=1410096k,nr_inodes=352524,mode=755
20 17 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
21 19 0:17 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
22 19 0:11 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
23 60 0:18 / /run rw,nosuid,nodev shared:23 - tmpfs tmpfs rw,seclabel,mode=755
24 17 0:19 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755
25 24 0:20 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
26 17 0:21 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
27 24 0:22 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,memory
28 24 0:23 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,pids
29 24 0:24 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,freezer
30 24 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,cpuacct,cpu
31 24 0:26 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
32 24 0:27 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,net_prio,net_cls
33 24 0:28 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,devices
34 24 0:29 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,perf_event
35 24 0:30 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,cpuset
36 24 0:31 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,blkio
58 17 0:32 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
60 1 253:0 / / rw,relatime shared:1 - xfs /dev/mapper/centos-root rw,seclabel,attr2,inode64,noquota
37 17 0:14 / /sys/fs/selinux rw,relatime shared:22 - selinuxfs selinuxfs rw
38 19 0:13 / /dev/mqueue rw,relatime shared:24 - mqueue mqueue rw,seclabel
39 18 0:33 / /proc/sys/fs/binfmt_misc rw,relatime shared:25 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=12066
40 17 0:6 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw
41 19 0:34 / /dev/hugepages rw,relatime shared:27 - hugetlbfs hugetlbfs rw,seclabel
42 18 0:35 / /proc/fs/nfsd rw,relatime shared:28 - nfsd nfsd rw
74 60 8:1 / /boot rw,relatime shared:29 - xfs /dev/sda1 rw,seclabel,attr2,inode64,noquota
76 60 253:2 / /var/lib/rook rw,relatime shared:30 - ext4 /dev/mapper/vgdata-lvdata1 rw,seclabel,data=ordered
78 60 0:36 / /var/lib/nfs/rpc_pipefs rw,relatime shared:31 - rpc_pipefs sunrpc rw
153 23 0:37 / /run/user/0 rw,nosuid,nodev,relatime shared:103 - tmpfs tmpfs rw,seclabel,size=284204k,mode=700
 

# cat /proc/self/cgroup
11:blkio:/user.slice
10:cpuset:/
9:perf_event:/
8:devices:/user.slice
7:net_prio,net_cls:/
6:hugetlb:/
5:cpuacct,cpu:/user.slice
4:freezer:/
3:pids:/
2:memory:/user.slice
1:name=systemd:/user.slice/user-0.slice/session-4.scope

// TODO(vmarmol): Add limits to the system containers.
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
	subsystems, err := GetCgroupSubsystems()
	if err != nil {
		return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
	}

 

    2.1 If --fail-swap-on is set to true

# cat /proc/swaps 
Filename                Type        Size    Used    Priority
/dev/dm-1                               partition    2621436    186300    -1 

if failSwapOn {
	// Check whether swap is enabled. The Kubelet does not support running with swap enabled.
	swapData, err := ioutil.ReadFile("/proc/swaps")
	if err != nil {
		return nil, err
	}
	swapData = bytes.TrimSpace(swapData) // extra trailing \n
	swapLines := strings.Split(string(swapData), "\n")

	// If there is more than one line (table headers) in /proc/swaps, swap is enabled and we should
	// error out unless --fail-swap-on is set to false.
	if len(swapLines) > 1 {
		return nil, fmt.Errorf("Running with swap on is not supported, please disable swap! or set --fail-swap-on flag to false. /proc/swaps contained: %v", swapLines)
	}
}

    2.2 Instantiate NewCgroupManager

     The default driver is cgroupfs; cgroupManagerImpl implements the CgroupManager interface. A sketch of how the two drivers render cgroup names differently follows the snippet.

// NewCgroupManager is a factory method that returns a CgroupManager
func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
	managerType := libcontainerCgroupfs
	if cgroupDriver == string(libcontainerSystemd) {
		managerType = libcontainerSystemd
	}
	return &cgroupManagerImpl{
		subsystems: cs,
		adapter:    newLibcontainerAdapter(managerType),
	}
}
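
    As a rough sketch (an illustration, not the kubelet implementation; the real conversion lives in cgroupManagerImpl and its libcontainer adapter), the two drivers render the same hierarchical cgroup name differently:

package main

import (
	"fmt"
	"path"
	"strings"
)

// toCgroupfs joins the name components into a literal cgroupfs path,
// e.g. ["kubepods", "burstable"] -> "/kubepods/burstable".
func toCgroupfs(components []string) string {
	return "/" + path.Join(components...)
}

// toSystemd renders the name as nested systemd slices,
// e.g. ["kubepods", "burstable"] -> "/kubepods.slice/kubepods-burstable.slice".
func toSystemd(components []string) string {
	if len(components) == 0 {
		return "/"
	}
	parts := make([]string, 0, len(components))
	for i := range components {
		parts = append(parts, strings.Join(components[:i+1], "-")+".slice")
	}
	return "/" + path.Join(parts...)
}

func main() {
	name := []string{"kubepods", "burstable"}
	fmt.Println(toCgroupfs(name)) // /kubepods/burstable
	fmt.Println(toSystemd(name))  // /kubepods.slice/kubepods-burstable.slice
}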

    2.3 If --cgroups-per-qos is set to true,

       the QoS cgroup hierarchy is enabled: true means the top-level QoS and pod cgroups are created (default true).

       The Exists function checks all whitelisted subsystems: cpu, cpuacct, cpuset, memory, and systemd.

       kubepods is then appended to the cgroup root.

// Check if Cgroup-root actually exists on the node
if nodeConfig.CgroupsPerQOS {
	// this does default to / when enabled, but this tests against regressions.
	if nodeConfig.CgroupRoot == "" {
		return nil, fmt.Errorf("invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root")
	}

	// we need to check that the cgroup root actually exists for each subsystem
	// of note, we always use the cgroupfs driver when performing this check since
	// the input is provided in that format.
	// this is important because we do not want any name conversion to occur.
	if !cgroupManager.Exists(cgroupRoot) {
		return nil, fmt.Errorf("invalid configuration: cgroup-root %q doesn't exist", cgroupRoot)
	}
	klog.Infof("container manager verified user specified cgroup-root exists: %v", cgroupRoot)
	// Include the top level cgroup for enforcing node allocatable into cgroup-root.
	// This way, all sub modules can avoid having to understand the concept of node allocatable.
	cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName)
}

 

Instantiate qosContainerManagerImpl; noted here, to be analyzed later

func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
	if !nodeConfig.CgroupsPerQOS {
		return &qosContainerManagerNoop{
			cgroupRoot: cgroupRoot,
		}, nil
	}

	return &qosContainerManagerImpl{
		subsystems:    subsystems,
		cgroupManager: cgroupManager,
		cgroupRoot:    cgroupRoot,
		qosReserved:   nodeConfig.QOSReserved,
	}, nil
}

    Instantiate the CPU manager

    Controlled by --feature-gates (mapStringBool): CPUManager=true|false (BETA - default=true). A sketch of how the CPU reservation turns into reserved cores follows the snippet.

// Initialize CPU manager
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
	cm.cpuManager, err = cpumanager.NewManager(
		nodeConfig.ExperimentalCPUManagerPolicy,
		nodeConfig.ExperimentalCPUManagerReconcilePeriod,
		machineInfo,
		cm.GetNodeAllocatableReservation(),
		nodeConfig.KubeletRootDir,
	)
	if err != nil {
		klog.Errorf("failed to initialize cpu manager: %v", err)
		return nil, err
	}
}
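
    A rough sketch of the idea behind the static policy (an assumption about the policy's behaviour, not the kubelet code): the CPU part of the node allocatable reservation, expressed in millicores, is rounded up to whole cores, and those cores are kept out of the pool handed to exclusive (Guaranteed, integer-CPU) pods:

package main

import (
	"fmt"
	"math"
)

// reservedCPUCount rounds a millicore reservation up to whole cores.
func reservedCPUCount(reservedMilliCPU int64) int {
	return int(math.Ceil(float64(reservedMilliCPU) / 1000.0))
}

func main() {
	// With --kube-reserved=cpu=200m and --system-reserved=cpu=1200m: 1400m -> 2 reserved cores.
	fmt.Println(reservedCPUCount(200 + 1200))
}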

 

func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate)
    --> updateRuntimeUp
        --> initializeRuntimeDependentModules
            --> kl.containerManager.Start

 

3. The Start function

    Path: pkg/kubelet/cm/container_manager_linux.go

func (cm *containerManagerImpl) Start(node *v1.Node,
	activePods ActivePodsFunc,
	sourcesReady config.SourcesReady,
	podStatusProvider status.PodStatusProvider,
	runtimeService internalapi.RuntimeService) error {

    3.1 The CPU manager was instantiated earlier; here it is started

// Initialize CPU manager
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
	cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), podStatusProvider, runtimeService)
}

    3.2 Local storage capacity isolation is still behind a feature gate

    Controlled by --feature-gates (mapStringBool): LocalStorageCapacityIsolation=true|false (BETA - default=true)

if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.LocalStorageCapacityIsolation) {
	rootfs, err := cm.cadvisorInterface.RootFsInfo()
	if err != nil {
		return fmt.Errorf("failed to get rootfs info: %v", err)
	}
	for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
		cm.capacity[rName] = rCap
	}
}

    3.3 The validateNodeAllocatable function

      The reserved resources must not exceed the node's total capacity

// validateNodeAllocatable ensures that the user specified Node Allocatable Configuration doesn't reserve more than the node capacity.
// Returns error if the configuration is invalid, nil otherwise.
func (cm *containerManagerImpl) validateNodeAllocatable() error {
	var errors []string
	nar := cm.GetNodeAllocatableReservation()
	for k, v := range nar {
		value := cm.capacity[k].DeepCopy()
		value.Sub(v)

		if value.Sign() < 0 {
			errors = append(errors, fmt.Sprintf("Resource %q has an allocatable of %v, capacity of %v", k, v, value))
		}
	}

	if len(errors) > 0 {
		return fmt.Errorf("Invalid Node Allocatable configuration. %s", strings.Join(errors, " "))
	}
	return nil
}

    3.4 The setupNode function

     validateSystemRequirements checks that the cgroup subsystems are mounted, including cpu, cpuacct, cpuset, and memory.

     It also checks that the cpu subsystem exposes cpu.cfs_period_us and cpu.cfs_quota_us; these two files implement proportional CPU usage, and their quotient corresponds to a number of CPU cores (see the example after the snippet).

func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
	f, err := validateSystemRequirements(cm.mountUtil)
	if err != nil {
		return err
	}
	if !f.cpuHardcapping {
		cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
	}
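
     For example (illustrative values), if cpu.cfs_period_us = 100000 (a 100ms scheduling period) and cpu.cfs_quota_us = 200000, the cgroup may consume 200ms of CPU time per 100ms period, i.e. quota / period = 2 cores' worth of CPU.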

    3.4.1 setupKernelTunables

     Checks whether the kernel tunables are set as expected:

  • vm/overcommit_memory: the memory allocation (overcommit) policy, explained in detail below
  • vm/panic_on_oom: when 0, the kernel triggers the OOM killer to kill the most memory-hungry process when memory is exhausted
  • kernel/panic: how long (in seconds) the kernel waits before rebooting after a kernel panic; 0 means no automatic reboot on a kernel panic
  • kernel/panic_on_oops: what to do when the kernel hits an oops or BUG; 0: keep running, 1: let klog record the oops output and then panic; if kernel.panic is non-zero, wait that long and then reboot

// setupKernelTunables validates kernel tunable flags are set as expected
// depending upon the specified option, it will either warn, error, or modify the kernel tunable flags
func setupKernelTunables(option KernelTunableBehavior) error {
	desiredState := map[string]int{
		utilsysctl.VmOvercommitMemory: utilsysctl.VmOvercommitMemoryAlways,
		utilsysctl.VmPanicOnOOM:       utilsysctl.VmPanicOnOOMInvokeOOMKiller,
		utilsysctl.KernelPanic:        utilsysctl.KernelPanicRebootTimeout,
		utilsysctl.KernelPanicOnOops:  utilsysctl.KernelPanicOnOopsAlways,
		utilsysctl.RootMaxKeys:        utilsysctl.RootMaxKeysSetting,
		utilsysctl.RootMaxBytes:       utilsysctl.RootMaxBytesSetting,
	}

    3.4.2 The kubepods cgroup

     Analyzed in detail in this post: https://blog.csdn.net/zhonglinzhang/article/details/92994123

     The resulting hierarchy is illustrated after the snippet.

// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm.NodeConfig.CgroupsPerQOS {
	if err := cm.createNodeAllocatableCgroups(); err != nil {
		return err
	}
	err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
	if err != nil {
		return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
	}
}
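
    With CgroupsPerQOS enabled and the cgroupfs driver, the resulting hierarchy typically looks like this (illustrative; Guaranteed pods sit directly under kubepods, Burstable and BestEffort pods under their QoS cgroup):

/sys/fs/cgroup/cpu/kubepods/
/sys/fs/cgroup/cpu/kubepods/burstable/
/sys/fs/cgroup/cpu/kubepods/besteffort/
/sys/fs/cgroup/memory/kubepods/
/sys/fs/cgroup/memory/kubepods/burstable/
/sys/fs/cgroup/memory/kubepods/besteffort/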

    3.4.3 enforceNodeAllocatableCgroups

     Computes the node's allocatable resources by subtracting the reserved resources (a worked example follows the snippet)

// enforceNodeAllocatableCgroups enforce Node Allocatable Cgroup settings.
func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
	nc := cm.NodeConfig.NodeAllocatableConfig

	// We need to update limits on node allocatable cgroup no matter what because
	// default cpu shares on cgroups are low and can cause cpu starvation.
	nodeAllocatable := cm.capacity
	// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
	if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
		nodeAllocatable = cm.getNodeAllocatableAbsolute()
	}
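
    With the sample flags above on an assumed 64Gi node, the limit applied to the kubepods cgroup would be roughly 65536Mi - 250Mi (kube-reserved) - 1250Mi (system-reserved) = 64036Mi. Note that the hard-eviction threshold is included in the reservation reported to the scheduler (GetNodeAllocatableReservation), whereas getNodeAllocatableAbsolute, used for this cgroup limit, only subtracts kube-reserved and system-reserved.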

    3.4.4 Enforce the system reserved resources

    enforceExistingCgroup operates on the system.slice cgroup here, which must already exist for the cgroup subsystems cpu, cpuacct, cpuset, memory, and systemd.

    It calls the cgroup manager interface to update the cgroup; the illustrative effect is shown after the snippet.

// Now apply kube reserved and system reserved limits if required.
if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
	klog.V(2).Infof("Enforcing System reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved)
	if err := enforceExistingCgroup(cm.cgroupManager, ParseCgroupfsToCgroupName(nc.SystemReservedCgroupName), nc.SystemReserved); err != nil {
		message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
		cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
		return fmt.Errorf(message)
	}
	cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
}
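
    Illustrative effect with --system-reserved=cpu=1200m,memory=1250Mi (cgroupfs paths; the CPU reservation is converted to shares, roughly milliCPU * 1024 / 1000, so the exact numbers are an assumption):

/sys/fs/cgroup/cpu/system.slice/cpu.shares                 ≈ 1228
/sys/fs/cgroup/memory/system.slice/memory.limit_in_bytes   = 1310720000   (1250Mi)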

    3.4.5 Enforce the kube reserved resources in the same way

if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
	klog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved)
	if err := enforceExistingCgroup(cm.cgroupManager, ParseCgroupfsToCgroupName(nc.KubeReservedCgroupName), nc.KubeReserved); err != nil {
		message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
		cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
		return fmt.Errorf(message)
	}
	cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
}

   3.5 When the container runtime is docker

if cm.ContainerRuntime == "docker" {
	// With the docker-CRI integration, dockershim will manage the cgroups
	// and oom score for the docker processes.
	// In the future, NodeSpec should mandate the cgroup that the
	// runtime processes need to be in. For now, we still check the
	// cgroup for docker periodically, so that kubelet can recognize
	// the cgroup for docker and serve stats for the runtime.
	// TODO(#27097): Fix this after NodeSpec is clearly defined.
	cm.periodicTasks = append(cm.periodicTasks, func() {
		klog.V(4).Infof("[ContainerManager]: Adding periodic tasks for docker CRI integration")
		cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
		if err != nil {
			klog.Error(err)
			return
		}
		klog.V(2).Infof("[ContainerManager]: Discovered runtime cgroups name: %s", cont)
		cm.Lock()
		defer cm.Unlock()
		cm.RuntimeCgroupsName = cont
	})
}

 

 

Note:

    At startup the kubelet checks that the following cgroup subsystems exist: cpu, cpuacct, cpuset, memory, and systemd. The directories listed below therefore need to be created in advance.

// Exists checks if all subsystem cgroups already exist
func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
	// Get map of all cgroup paths on the system for the particular cgroup
	cgroupPaths := m.buildCgroupPaths(name)

	// the presence of alternative control groups not known to runc confuses
	// the kubelet existence checks.
	// ideally, we would have a mechanism in runc to support Exists() logic
	// scoped to the set control groups it understands.  this is being discussed
	// in https://github.com/opencontainers/runc/issues/1440
	// once resolved, we can remove this code.
	whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd")

mkdir -p /sys/fs/cgroup/cpu/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/cpuacct/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/cpuset/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/memory/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/systemd/system.slice/kubelet.service

     A once-and-for-all approach is to add the following to the kubelet.service unit file:

Documentation=https://github.com/GoogleCloudPlatform/kubernetes
After=docker.service
Requires=docker.service
 
[Service]
EnvironmentFile=-/etc/etcd/env
WorkingDirectory=/var/lib/kubelet
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/memory/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/cpu/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/cpuacct/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/hugetlb/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/systemd/system.slice/kubelet.service

ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/cpuset/system.slice/kubelet.service

 

Kernel parameter overcommit_memory

       Run grep -i commit /proc/meminfo

  • CommitLimit is the memory allocation ceiling: CommitLimit = physical memory * overcommit_ratio (default 50, i.e. 50%) + swap size (worked example below)
  • Committed_AS is the amount of memory already committed
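
    Worked example (assuming 4Gi of physical memory, 2Gi of swap, and the default overcommit_ratio=50):

    CommitLimit = 4Gi * 50% + 2Gi = 4Gi

    CommitLimit is only actually enforced when vm.overcommit_memory=2.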

    Memory allocation policy; possible values: 0, 1, 2.
0: the kernel checks whether enough free memory is available for the application; if so the allocation succeeds, otherwise it fails and the error is returned to the application.
1: the kernel allows all allocations regardless of the current memory state (overcommit without checking).
2: the kernel refuses to overcommit; total committed memory may not exceed CommitLimit (swap plus overcommit_ratio percent of physical memory).

    What are Overcommit and OOM?

        Linux answers "yes" to most memory allocation requests so that it can run more and bigger programs, because memory that has been requested is usually not used right away. This technique is called overcommit. When Linux finds that memory is running out, the OOM killer (OOM = out-of-memory) kicks in and kills some processes (user-space processes, not kernel threads) to free memory.
        When the OOM killer fires, which processes does Linux choose to kill? The selection is done by the oom_badness function (in mm/oom_kill.c), which computes a score (0 to 1000) for each process: the higher the score, the more likely the process is to be killed. Each process's score depends on oom_score_adj, which can be tuned (-1000 lowest, 1000 highest).

      Fix:

     Simply follow the hint and set vm.overcommit_memory to 1.

     There are three ways to change the kernel parameter (root required):

 (1) Edit /etc/sysctl.conf, set vm.overcommit_memory=1, then run sysctl -p to apply the file

 (2) sysctl vm.overcommit_memory=1

 (3) echo 1 > /proc/sys/vm/overcommit_memory

   Reference: a detailed explanation of the Linux vm.overcommit_memory memory allocation parameter

 

Kernel parameter vm/panic_on_oom

    Defaults to 0, which keeps the OOM killer enabled; setting it to 1 disables this behaviour and makes the kernel panic on OOM instead.

    When it is 0 and memory is exhausted, the kernel triggers the OOM killer to kill the most memory-hungry process.

    When the OOM killer runs, it scores every current process automatically; the score is exposed as /proc/[pid]/oom_score, and the higher the score, the more likely the process is to be killed.

     The score is mainly derived from /proc/[pid]/oom_adj, which ranges from -17 to 15; a value of -17 means the process will never be killed by the OOM killer (useful for e.g. mysql).

     /proc/[pid]/oom_adj: the weight of this process for the OOM killer, in [-17, 15]; a higher weight means it is more likely to be chosen, and -17 means it can never be killed.

     /proc/[pid]/oom_score: the current kill score of this process; the higher it is, the more likely the process is to be killed. It is derived from oom_adj and is the OOM killer's main reference. Illustrative commands follow.
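
     Illustrative commands (mysqld is just an example process, so the pidof target is an assumption):

cat /proc/$(pidof mysqld)/oom_score       # current badness score used by the OOM killer
echo -17 > /proc/$(pidof mysqld)/oom_adj  # exempt the process from the OOM killer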

 

Summary:

     The reserved resources are ultimately written as limits into the corresponding cgroup directories (e.g. cpu.shares and memory.limit_in_bytes).
