Kubernetes 1.13
By default, pods can consume all of a node's available resources. If an application in a user pod aggressively consumes memory, the pod will compete for resources with the system daemons and Kubernetes components on the node, causing resource starvation and serious problems such as the node going NotReady.
Node Capacity
---------------------------
| kube-reserved |
|-------------------------|
| system-reserved |
|-------------------------|
| eviction-threshold |
|-------------------------|
| |
| allocatable |
| (available for pods) |
| |
| |
---------------------------
NodeAllocatable = [NodeCapacity] - [kube-reserved] - [system-reserved] - [eviction-threshold]
- Node Capacity: all hardware resources of the node
- kube-reserved: resources reserved for the kubelet; from the code, this only covers the kubelet itself
- system-reserved: resources reserved for system processes, including processes started by systemd
- eviction-threshold: the kubelet's eviction thresholds
- Allocatable: the value the scheduler actually uses when placing pods (it guarantees that the sum of the resource requests of all pods on the node does not exceed Allocatable)
--cgroups-per-qos: defaults to true; enables the new cgroup hierarchy, creating QoS- and pod-level cgroups under kubepods.
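With the default cgroupfs driver, the resulting hierarchy looks roughly like this (a sketch; pod UIDs are placeholders):
/sys/fs/cgroup/memory/kubepods/
├── pod<uid>              (Guaranteed pods live directly under kubepods)
├── burstable/
│   └── pod<uid>
└── besteffort/
    └── pod<uid>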
Specifying kube-reserved and system-reserved values enables the kubelet to enforce kube-reserved and system-reserved (via --enforce-node-allocatable). Note that enforcing kube-reserved or system-reserved additionally requires --kube-reserved-cgroup or --system-reserved-cgroup to be specified, respectively:
--enforce-node-allocatable=pods,kube-reserved,system-reserved
--kube-reserved-cgroup=/system.slice/kubelet.service
--system-reserved-cgroup=/system.slice
--kube-reserved=cpu=200m,memory=250Mi
--system-reserved=cpu=1200m,memory=1250Mi
--eviction-hard=memory.available<5%,nodefs.available<10%,imagefs.available<10%
--eviction-soft=memory.available<10%,nodefs.available<15%,imagefs.available<15%
--eviction-soft-grace-period=memory.available=2m,nodefs.available=2m,imagefs.available=2m
--eviction-max-pod-grace-period=30
--eviction-minimum-reclaim=memory.available=0Mi,nodefs.available=500Mi,imagefs.available=500Mi
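For example (a worked calculation, assuming a hypothetical node with 8 CPUs and 16Gi of memory), these flags yield roughly:
Allocatable CPU    = 8000m - 200m (kube-reserved) - 1200m (system-reserved) = 6600m   (eviction thresholds do not apply to CPU)
Allocatable memory = 16384Mi - 250Mi (kube-reserved) - 1250Mi (system-reserved) - 819Mi (5% hard eviction) ≈ 14065Mi
This Allocatable is what the node reports and what the scheduler uses when placing pods.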
Path: cmd/kubelet/app/server.go
func run(s *options.KubeletServer, kubeDeps *kubelet.Dependencies, stopCh <-chan struct{}) (err error)
cgroupRoots = append(cgroupRoots, cm.NodeAllocatableRoot(s.CgroupRoot, s.CgroupDriver))
kubeletCgroup, err := cm.GetKubeletContainer(s.KubeletCgroups)
if err != nil {
return fmt.Errorf("failed to get the kubelet's cgroup: %v", err)
}
if kubeletCgroup != "" {
cgroupRoots = append(cgroupRoots, kubeletCgroup)
}
runtimeCgroup, err := cm.GetRuntimeContainer(s.ContainerRuntime, s.RuntimeCgroups)
if err != nil {
return fmt.Errorf("failed to get the container runtime's cgroup: %v", err)
}
if runtimeCgroup != "" {
// RuntimeCgroups is optional, so ignore if it isn't specified
cgroupRoots = append(cgroupRoots, runtimeCgroup)
}
if s.SystemCgroups != "" {
// SystemCgroups is optional, so ignore if it isn't specified
cgroupRoots = append(cgroupRoots, s.SystemCgroups)
}
kubeDeps.ContainerManager, err = cm.NewContainerManager(
kubeDeps.Mounter,
kubeDeps.CAdvisorInterface,
cm.NodeConfig{
RuntimeCgroupsName: s.RuntimeCgroups,
SystemCgroupsName: s.SystemCgroups,
KubeletCgroupsName: s.KubeletCgroups,
ContainerRuntime: s.ContainerRuntime,
CgroupsPerQOS: s.CgroupsPerQOS,
CgroupRoot: s.CgroupRoot,
CgroupDriver: s.CgroupDriver,
KubeletRootDir: s.RootDirectory,
ProtectKernelDefaults: s.ProtectKernelDefaults,
NodeAllocatableConfig: cm.NodeAllocatableConfig{
KubeReservedCgroupName: s.KubeReservedCgroup,
SystemReservedCgroupName: s.SystemReservedCgroup,
EnforceNodeAllocatable: sets.NewString(s.EnforceNodeAllocatable...),
KubeReserved: kubeReserved,
SystemReserved: systemReserved,
HardEvictionThresholds: hardEvictionThresholds,
},
QOSReserved: *experimentalQOSReserved,
ExperimentalCPUManagerPolicy: s.CPUManagerPolicy,
ExperimentalCPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration,
ExperimentalPodPidsLimit: s.PodPidsLimit,
EnforceCPULimits: s.CPUCFSQuota,
CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration,
},
s.FailSwapOn,
devicePluginEnabled,
kubeDeps.Recorder)
The cgroup subsystems are discovered by reading /proc/self/mountinfo and /proc/self/cgroup:
# cat /proc/self/mountinfo
17 60 0:16 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
18 60 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
19 60 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=1410096k,nr_inodes=352524,mode=755
20 17 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
21 19 0:17 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
22 19 0:11 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
23 60 0:18 / /run rw,nosuid,nodev shared:23 - tmpfs tmpfs rw,seclabel,mode=755
24 17 0:19 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755
25 24 0:20 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
26 17 0:21 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
27 24 0:22 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,memory
28 24 0:23 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,pids
29 24 0:24 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,freezer
30 24 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,cpuacct,cpu
31 24 0:26 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
32 24 0:27 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,net_prio,net_cls
33 24 0:28 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,devices
34 24 0:29 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,perf_event
35 24 0:30 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,cpuset
36 24 0:31 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,blkio
58 17 0:32 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
60 1 253:0 / / rw,relatime shared:1 - xfs /dev/mapper/centos-root rw,seclabel,attr2,inode64,noquota
37 17 0:14 / /sys/fs/selinux rw,relatime shared:22 - selinuxfs selinuxfs rw
38 19 0:13 / /dev/mqueue rw,relatime shared:24 - mqueue mqueue rw,seclabel
39 18 0:33 / /proc/sys/fs/binfmt_misc rw,relatime shared:25 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=12066
40 17 0:6 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw
41 19 0:34 / /dev/hugepages rw,relatime shared:27 - hugetlbfs hugetlbfs rw,seclabel
42 18 0:35 / /proc/fs/nfsd rw,relatime shared:28 - nfsd nfsd rw
74 60 8:1 / /boot rw,relatime shared:29 - xfs /dev/sda1 rw,seclabel,attr2,inode64,noquota
76 60 253:2 / /var/lib/rook rw,relatime shared:30 - ext4 /dev/mapper/vgdata-lvdata1 rw,seclabel,data=ordered
78 60 0:36 / /var/lib/nfs/rpc_pipefs rw,relatime shared:31 - rpc_pipefs sunrpc rw
153 23 0:37 / /run/user/0 rw,nosuid,nodev,relatime shared:103 - tmpfs tmpfs rw,seclabel,size=284204k,mode=700
# cat /proc/self/cgroup
11:blkio:/user.slice
10:cpuset:/
9:perf_event:/
8:devices:/user.slice
7:net_prio,net_cls:/
6:hugetlb:/
5:cpuacct,cpu:/user.slice
4:freezer:/
3:pids:/
2:memory:/user.slice
1:name=systemd:/user.slice/user-0.slice/session-4.scope
// TODO(vmarmol): Add limits to the system containers.
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
subsystems, err := GetCgroupSubsystems()
if err != nil {
return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
}
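The actual discovery goes through GetCgroupSubsystems, which relies on the libcontainer/cadvisor helpers. A minimal, self-contained sketch (illustrative only, not the kubelet code) of parsing /proc/self/mountinfo to list cgroup controller mount points:

package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

func main() {
	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	mounts := map[string]string{} // controller -> mount point
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		for i, field := range fields {
			// Fields after the "-" separator are: fstype, source, super options.
			if field == "-" && i+3 < len(fields) && fields[i+1] == "cgroup" {
				mountPoint := fields[4] // field 5 of mountinfo is the mount point
				// The super options list the controllers, e.g. "rw,cpuacct,cpu".
				for _, opt := range strings.Split(fields[i+3], ",") {
					switch {
					case opt == "rw" || opt == "ro" || opt == "seclabel" || opt == "xattr":
						continue
					case strings.HasPrefix(opt, "release_agent="):
						continue
					}
					mounts[strings.TrimPrefix(opt, "name=")] = mountPoint
				}
			}
		}
	}
	for ctrl, mp := range mounts {
		fmt.Printf("%-10s -> %s\n", ctrl, mp)
	}
}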
# cat /proc/swaps
Filename Type Size Used Priority
/dev/dm-1 partition 2621436 186300 -1
if failSwapOn {
// Check whether swap is enabled. The Kubelet does not support running with swap enabled.
swapData, err := ioutil.ReadFile("/proc/swaps")
if err != nil {
return nil, err
}
swapData = bytes.TrimSpace(swapData) // extra trailing \n
swapLines := strings.Split(string(swapData), "\n")
// If there is more than one line (table headers) in /proc/swaps, swap is enabled and we should
// error out unless --fail-swap-on is set to false.
if len(swapLines) > 1 {
return nil, fmt.Errorf("Running with swap on is not supported, please disable swap! or set --fail-swap-on flag to false. /proc/swaps contained: %v", swapLines)
}
}
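As the error message indicates, the kubelet refuses to start while swap is enabled unless --fail-swap-on=false is set; the usual fix is to disable swap with swapoff -a and remove (or comment out) the swap entry in /etc/fstab so it stays off after a reboot.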
The default cgroup driver is cgroupfs; cgroupManagerImpl implements the CgroupManager interface:
// NewCgroupManager is a factory method that returns a CgroupManager
func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
managerType := libcontainerCgroupfs
if cgroupDriver == string(libcontainerSystemd) {
managerType = libcontainerSystemd
}
return &cgroupManagerImpl{
subsystems: cs,
adapter: newLibcontainerAdapter(managerType),
}
}
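The main difference between the two drivers is how a CgroupName is rendered: with cgroupfs, a name such as [kubepods, burstable] maps to the literal path /kubepods/burstable, while the systemd driver converts it to slice notation, roughly /kubepods.slice/kubepods-burstable.slice (a simplified description of the ToCgroupfs/ToSystemd conversions).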
Enabling the QoS cgroup hierarchy (CgroupsPerQOS, default true) means the top-level QoS and pod cgroups are created.
The Exists function checks all whitelisted subsystems, including cpu, cpuacct, cpuset, memory and systemd.
The kubepods cgroup is then appended to the cgroup root:
// Check if Cgroup-root actually exists on the node
if nodeConfig.CgroupsPerQOS {
// this does default to / when enabled, but this tests against regressions.
if nodeConfig.CgroupRoot == "" {
return nil, fmt.Errorf("invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root")
}
// we need to check that the cgroup root actually exists for each subsystem
// of note, we always use the cgroupfs driver when performing this check since
// the input is provided in that format.
// this is important because we do not want any name conversion to occur.
if !cgroupManager.Exists(cgroupRoot) {
return nil, fmt.Errorf("invalid configuration: cgroup-root %q doesn't exist", cgroupRoot)
}
klog.Infof("container manager verified user specified cgroup-root exists: %v", cgroupRoot)
// Include the top level cgroup for enforcing node allocatable into cgroup-root.
// This way, all sub modules can avoid having to understand the concept of node allocatable.
cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName)
}
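defaultNodeAllocatableCgroupName is "kubepods", so with the default --cgroup-root of / the node allocatable enforcement cgroup (and the parent of all pod cgroups) becomes /kubepods.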
Instantiate qosContainerManagerImpl (recorded here, analyzed later):
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
if !nodeConfig.CgroupsPerQOS {
return &qosContainerManagerNoop{
cgroupRoot: cgroupRoot,
}, nil
}
return &qosContainerManagerImpl{
subsystems: subsystems,
cgroupManager: cgroupManager,
cgroupRoot: cgroupRoot,
qosReserved: nodeConfig.QOSReserved,
}, nil
}
Instantiate the CPU manager, gated by the CPUManager feature gate (--feature-gates CPUManager=true|false, BETA - default=true):
// Initialize CPU manager
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
cm.cpuManager, err = cpumanager.NewManager(
nodeConfig.ExperimentalCPUManagerPolicy,
nodeConfig.ExperimentalCPUManagerReconcilePeriod,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
)
if err != nil {
klog.Errorf("failed to initialize cpu manager: %v", err)
return nil, err
}
}
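nodeConfig.ExperimentalCPUManagerPolicy comes from --cpu-manager-policy; the supported values are none (the default, no CPU pinning) and static, which gives exclusive cores to Guaranteed pods that request integer CPUs.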
The container manager is started from the kubelet's Run function:
func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate)
--> updateRuntimeUp
--> initializeRuntimeDependentModules
--> kl.containerManager.Start
Path: pkg/kubelet/cm/container_manager_linux.go
func (cm *containerManagerImpl) Start(node *v1.Node,
activePods ActivePodsFunc,
sourcesReady config.SourcesReady,
podStatusProvider status.PodStatusProvider,
runtimeService internalapi.RuntimeService) error {
// Initialize CPU manager
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), podStatusProvider, runtimeService)
}
Gated by the LocalStorageCapacityIsolation feature gate (--feature-gates LocalStorageCapacityIsolation=true|false, BETA - default=true), the root filesystem info is used to populate the ephemeral-storage capacity:
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.LocalStorageCapacityIsolation) {
rootfs, err := cm.cadvisorInterface.RootFsInfo()
if err != nil {
return fmt.Errorf("failed to get rootfs info: %v", err)
}
for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
cm.capacity[rName] = rCap
}
}
The reserved resources must not exceed the node's total capacity:
// validateNodeAllocatable ensures that the user specified Node Allocatable Configuration doesn't reserve more than the node capacity.
// Returns error if the configuration is invalid, nil otherwise.
func (cm *containerManagerImpl) validateNodeAllocatable() error {
var errors []string
nar := cm.GetNodeAllocatableReservation()
for k, v := range nar {
value := cm.capacity[k].DeepCopy()
value.Sub(v)
if value.Sign() < 0 {
errors = append(errors, fmt.Sprintf("Resource %q has an allocatable of %v, capacity of %v", k, v, value))
}
}
if len(errors) > 0 {
return fmt.Errorf("Invalid Node Allocatable configuration. %s", strings.Join(errors, " "))
}
return nil
}
validateSystemRequirements checks that the required cgroup subsystems (cpu, cpuacct, cpuset, memory) are mounted, and that the cpu subsystem exposes cpu.cfs_period_us and cpu.cfs_quota_us. These two files implement proportional CPU hard capping: the quota divided by the period is the number of CPUs the cgroup may use (for example, quota=50000 with period=100000 caps the cgroup at half a CPU).
func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
f, err := validateSystemRequirements(cm.mountUtil)
if err != nil {
return err
}
if !f.cpuHardcapping {
cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
}
3.4.1 setupKernelTunables
Checks whether the kernel tunable parameters are set as expected:
// setupKernelTunables validates kernel tunable flags are set as expected
// depending upon the specified option, it will either warn, error, or modify the kernel tunable flags
func setupKernelTunables(option KernelTunableBehavior) error {
desiredState := map[string]int{
utilsysctl.VmOvercommitMemory: utilsysctl.VmOvercommitMemoryAlways,
utilsysctl.VmPanicOnOOM: utilsysctl.VmPanicOnOOMInvokeOOMKiller,
utilsysctl.KernelPanic: utilsysctl.KernelPanicRebootTimeout,
utilsysctl.KernelPanicOnOops: utilsysctl.KernelPanicOnOopsAlways,
utilsysctl.RootMaxKeys: utilsysctl.RootMaxKeysSetting,
utilsysctl.RootMaxBytes: utilsysctl.RootMaxBytesSetting,
}
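Based on the constants in pkg/util/sysctl (worth verifying against the exact release), this desired state corresponds roughly to the following kernel settings; depending on the KernelTunableBehavior option, the kubelet will warn, error out, or rewrite values that differ:
vm/overcommit_memory = 1
vm/panic_on_oom = 0
kernel/panic = 10
kernel/panic_on_oops = 1
kernel/keys/root_maxkeys = 1000000
kernel/keys/root_maxbytes = 25000000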
3.4.2 kubepods cgroup
Analyzed in this article: https://blog.csdn.net/zhonglinzhang/article/details/92994123
// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm.NodeConfig.CgroupsPerQOS {
if err := cm.createNodeAllocatableCgroups(); err != nil {
return err
}
err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
if err != nil {
return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
}
}
3.4.3 enforceNodeAllocatableCgroups
Computes the node's allocatable resources by subtracting the reserved resources:
// enforceNodeAllocatableCgroups enforce Node Allocatable Cgroup settings.
func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
nc := cm.NodeConfig.NodeAllocatableConfig
// We need to update limits on node allocatable cgroup no matter what because
// default cpu shares on cgroups are low and can cause cpu starvation.
nodeAllocatable := cm.capacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
nodeAllocatable = cm.getNodeAllocatableAbsolute()
}
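For reference, a sketch of getNodeAllocatableAbsolute, paraphrasing the 1.13 code: it subtracts only kube-reserved and system-reserved from capacity, while the hard eviction thresholds are accounted for separately in GetNodeAllocatableReservation (which is what gets reported as the node's Allocatable).

func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
	result := make(v1.ResourceList)
	for k, v := range cm.capacity {
		value := v.DeepCopy()
		// Subtract the reservations; eviction thresholds are intentionally not
		// subtracted here, so the kubepods cgroup limit stays above Allocatable.
		if cm.NodeConfig.SystemReserved != nil {
			value.Sub(cm.NodeConfig.SystemReserved[k])
		}
		if cm.NodeConfig.KubeReserved != nil {
			value.Sub(cm.NodeConfig.KubeReserved[k])
		}
		if value.Sign() < 0 {
			// Negative allocatable resources don't make sense.
			value.Set(0)
		}
		result[k] = value
	}
	return result
}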
3.4.4 Enforcing the system reserved resources
enforceExistingCgroup operates on the existing system.slice cgroup, which must already exist under the cpu, cpuacct, cpuset, memory and systemd subsystems.
It calls the cgroup manager interface to update the cgroup limits:
// Now apply kube reserved and system reserved limits if required.
if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
klog.V(2).Infof("Enforcing System reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved)
if err := enforceExistingCgroup(cm.cgroupManager, ParseCgroupfsToCgroupName(nc.SystemReservedCgroupName), nc.SystemReserved); err != nil {
message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return fmt.Errorf(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
}
3.4.5 Enforcing the kube reserved resources, in the same way:
if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
klog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved)
if err := enforceExistingCgroup(cm.cgroupManager, ParseCgroupfsToCgroupName(nc.KubeReservedCgroupName), nc.KubeReserved); err != nil {
message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return fmt.Errorf(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
}
if cm.ContainerRuntime == "docker" {
// With the docker-CRI integration, dockershim will manage the cgroups
// and oom score for the docker processes.
// In the future, NodeSpec should mandate the cgroup that the
// runtime processes need to be in. For now, we still check the
// cgroup for docker periodically, so that kubelet can recognize
// the cgroup for docker and serve stats for the runtime.
// TODO(#27097): Fix this after NodeSpec is clearly defined.
cm.periodicTasks = append(cm.periodicTasks, func() {
klog.V(4).Infof("[ContainerManager]: Adding periodic tasks for docker CRI integration")
cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
if err != nil {
klog.Error(err)
return
}
klog.V(2).Infof("[ContainerManager]: Discovered runtime cgroups name: %s", cont)
cm.Lock()
defer cm.Unlock()
cm.RuntimeCgroupsName = cont
})
}
On startup, the kubelet checks for the existence of the following cgroup subsystems: cpu, cpuacct, cpuset, memory and systemd. The corresponding directories (the mkdir commands after the code below) need to be created in advance:
// Exists checks if all subsystem cgroups already exist
func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
// Get map of all cgroup paths on the system for the particular cgroup
cgroupPaths := m.buildCgroupPaths(name)
// the presence of alternative control groups not known to runc confuses
// the kubelet existence checks.
// ideally, we would have a mechanism in runc to support Exists() logic
// scoped to the set control groups it understands. this is being discussed
// in https://github.com/opencontainers/runc/issues/1440
// once resolved, we can remove this code.
whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd")
mkdir -p /sys/fs/cgroup/cpu/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/cpuacct/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/cpuset/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/memory/system.slice/kubelet.service
mkdir -p /sys/fs/cgroup/systemd/system.slice/kubelet.service
A once-and-for-all approach is to add the mkdir commands to the kubelet.service unit file:
[Unit]
Documentation=https://github.com/GoogleCloudPlatform/kubernetes
After=docker.service
Requires=docker.service
[Service]
EnvironmentFile=-/etc/etcd/env
WorkingDirectory=/var/lib/kubelet
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/memory/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/cpu/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/cpuacct/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/hugetlb/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/systemd/system.slice/kubelet.service
ExecStartPost=/usr/bin/mkdir -p /sys/fs/cgroup/cpuset/system.slice/kubelet.service
Run grep -i commit /proc/meminfo to check the current commit limit and the amount of committed memory.
vm.overcommit_memory controls the memory allocation (overcommit) policy; the possible values are 0, 1 and 2:
0: the kernel uses a heuristic to check whether enough memory is available for the request; if so the allocation is allowed, otherwise it fails and an error is returned to the application.
1: the kernel always allows the allocation, regardless of the current memory state.
2: the kernel refuses to overcommit; the total committed address space is limited to swap plus a configurable percentage (vm.overcommit_ratio) of physical memory.
What are overcommit and OOM?
Linux answers "yes" to most memory allocation requests so that it can run more and larger programs, because allocated memory is usually not used immediately; this technique is called overcommit. When Linux runs out of memory, the OOM killer (OOM = out of memory) is triggered and kills some processes (user-space processes, not kernel threads) to free memory.
Which processes does the OOM killer choose? The selection is done by the oom_badness function (in mm/oom_kill.c), which computes a score between 0 and 1000 for each process; the higher the score, the more likely the process is to be killed. The score depends on oom_score_adj, which can be set per process (-1000 lowest, 1000 highest).
Fix:
Simply follow the hint and set vm.overcommit_memory to 1.
There are three ways to change the kernel parameter (root privileges required):
(1) Edit /etc/sysctl.conf, set vm.overcommit_memory=1, then run sysctl -p to reload the configuration.
(2) sysctl vm.overcommit_memory=1
(3) echo 1 > /proc/sys/vm/overcommit_memory
Reference: a detailed explanation of the vm.overcommit_memory memory allocation parameter on Linux.
vm.panic_on_oom defaults to 0 (the OOM killer is enabled); setting it to 1 disables this behavior (the kernel panics instead of killing processes).
When it is 0 and memory is exhausted, the kernel triggers the OOM killer, which kills the most memory-hungry process.
When the OOM killer runs, it computes a score for each running process, exposed as /proc/[pid]/oom_score: the higher the score, the more likely the process is to be killed. The score is mainly derived from /proc/[pid]/oom_adj, whose value ranges from -17 to 15; the higher the value, the more likely the process is to be chosen, and -17 means the process will never be killed by the OOM killer (useful for processes such as mysql). /proc/[pid]/oom_score is computed from oom_adj and is the OOM killer's main reference when choosing a victim.
Finally, the resource reservations are written to the limit files of the corresponding cgroups.
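For example (a rough sketch, assuming the flag values above and the MilliCPUToShares conversion the kubelet uses), enforcing --system-reserved=cpu=1200m,memory=1250Mi on /system.slice would end up writing roughly:
/sys/fs/cgroup/cpu/system.slice/cpu.shares = 1228                          (1200m * 1024 / 1000)
/sys/fs/cgroup/memory/system.slice/memory.limit_in_bytes = 1310720000      (1250Mi)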