OOM stands for Out Of Memory. The Linux kernel's memory management has many mechanisms (reclaiming from caches, swapping out, and so on) to satisfy user space's demands for virtual memory, but when the system is misconfigured and a small horse is asked to pull a big cart, the kernel runs very slowly and may, at some point, fail to allocate a page frame because memory is exhausted. The first responder to this situation is the root user (the system administrator), who can add memory to the system; the kernel itself, however, reacts to OOM according to the OOM-related parameters. One of the mechanisms involved is the well-known OOM killer (Out Of Memory killer): it monitors the processes in the system, identifies those that consume too much memory (especially those whose memory consumption grows very quickly), and kills one of them in order to keep memory from being exhausted.
This parameter controls whether the kernel panics when it hits OOM.
When the value is 0, the kernel invokes the OOM killer on OOM: the selected processes are killed, memory is freed, and the system recovers. When the value is 1, an OOM that happens under a cpuset/memory policy/memcg constraint does not have to panic: the OOM killer is started instead and picks a victim among the constrained processes, so the system can still recover; an OOM without such constraints panics directly. When the value is 2, the kernel always panics on OOM. The default value is 0.
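Tuning the parameter is an ordinary procfs write. Below is a minimal user-space sketch (it must run as root) that reads the current value and writes the default 0 back; the equivalent shell commands are cat /proc/sys/vm/panic_on_oom and echo 0 > /proc/sys/vm/panic_on_oom.
/* Minimal sketch: read and reset /proc/sys/vm/panic_on_oom (run as root). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/panic_on_oom", "r+");
	int val;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("panic_on_oom = %d\n", val);
	rewind(f);
	fprintf(f, "0\n");	/* 0: never panic, always run the OOM killer */
	fclose(f);
	return 0;
}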
//include/linux/oom.h
enum oom_constraint {
	CONSTRAINT_NONE,
	CONSTRAINT_CPUSET,
	CONSTRAINT_MEMORY_POLICY,
	CONSTRAINT_MEMCG,
};
Let's look more closely at the constraints mentioned above; the kernel definition is shown above. On a UMA system, oom_constraint is always CONSTRAINT_NONE, meaning the OOM is caused by a genuine memory shortage rather than by any of the constraints above. Note that mobile phones are UMA systems. On a NUMA system, one of these constraints may cause the OOM even though the system as a whole still has plenty of memory.
CONSTRAINT_CPUSET: cpusets are a kernel mechanism that restricts a group of processes to run only on certain CPUs and to allocate only from certain memory nodes. cpuset is implemented on top of the cgroup subsystem (see the kernel documentation Documentation/cgroups/cgroups.txt) and is one of the controllers in the cgroup filesystem. With cpusets, a system administrator can dynamically adjust which CPUs and memory nodes a process may use. If a process under this constraint hits OOM, it only means that the memory nodes serving this process are short of memory; the system has many other memory nodes, and some of them may still have plenty of free memory.
CONSTRAINT_MEMORY_POLICY: a memory policy is the mechanism that controls how allocations are distributed across the memory nodes of a NUMA system. User-space (NUMA-aware) programs can use the memory policy API to set policies system-wide, or per VMA of a specific process. Under this constraint, an OOM may be caused by the memory policy rather than by an actual system-wide memory shortage.
CONSTRAINT_MEMCG: memcg is the memory control group, the cgroup controller that manages memory resources; put simply, it caps the memory usage of a group of processes within a given range. When the group's usage exceeds its limit, an OOM of type CONSTRAINT_MEMCG occurs; obviously this does not necessarily mean the whole system is out of memory.
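To observe a CONSTRAINT_MEMCG OOM in isolation, one can cap a group's memory and then exceed the cap. The sketch below is illustrative only: it assumes a cgroup-v1 memory controller mounted at /sys/fs/cgroup/memory, uses a made-up group name oom_demo, and must run as root. The memcg OOM killer should kill this task even though the system as a whole is not short of memory.
/* Sketch: provoke a memcg-constrained OOM (cgroup v1, illustrative paths). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(buf, f);
	fclose(f);
}

int main(void)
{
	char pid[32];

	mkdir("/sys/fs/cgroup/memory/oom_demo", 0755);
	/* cap the group at 64MB */
	write_file("/sys/fs/cgroup/memory/oom_demo/memory.limit_in_bytes", "67108864");
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_file("/sys/fs/cgroup/memory/oom_demo/tasks", pid);
	/* touch well over 64MB: the memcg OOM killer should pick this task */
	for (;;) {
		char *p = malloc(1 << 20);

		if (!p)
			break;
		memset(p, 1, 1 << 20);
	}
	return 0;
}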
As in "Linux kernel内存管理之overcommit相关参数", the sysctl_panic_on_oom variable backs /proc/sys/vm/panic_on_oom. Its initial value, the sysctl plumbing, and the main decision logic, the check_panic_on_oom function, are shown below:
//mm/oom_kill.c
//the default initial value here is 0
int sysctl_panic_on_oom = IS_ENABLED(CONFIG_DEBUG_PANIC_ON_OOM) ? 2 : 0;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
//kernel/sysctl.c
//this file defines the handlers for the parameters under /proc/sys/vm
......
static struct ctl_table vm_table[] = {
	{
		.procname	= "overcommit_memory",
		.data		= &sysctl_overcommit_memory,
		.maxlen		= sizeof(sysctl_overcommit_memory),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &two,
	},
	{
		.procname	= "panic_on_oom",
		.data		= &sysctl_panic_on_oom,
		.maxlen		= sizeof(sysctl_panic_on_oom),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &two,
	},
	{
		.procname	= "oom_kill_allocating_task",
		.data		= &sysctl_oom_kill_allocating_task,
		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "oom_dump_tasks",
		.data		= &sysctl_oom_dump_tasks,
		.maxlen		= sizeof(sysctl_oom_dump_tasks),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "overcommit_ratio",
		.data		= &sysctl_overcommit_ratio,
		.maxlen		= sizeof(sysctl_overcommit_ratio),
		.mode		= 0644,
		.proc_handler	= overcommit_ratio_handler,
	},
	{
		.procname	= "overcommit_kbytes",
		.data		= &sysctl_overcommit_kbytes,
		.maxlen		= sizeof(sysctl_overcommit_kbytes),
		.mode		= 0644,
		.proc_handler	= overcommit_kbytes_handler,
	},
	......
};
......
//mm/oom_kill.c
/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
//based on the panic_on_oom parameter, decide whether to panic or to take the OOM killer path
static void check_panic_on_oom(struct oom_control *oc,
			       enum oom_constraint constraint)
{
	//if sysctl_panic_on_oom is 0, never panic: take the OOM killer path and return directly
	if (likely(!sysctl_panic_on_oom))
		return;
	//here sysctl_panic_on_oom is 1, so the decision depends on the constraint
	if (sysctl_panic_on_oom != 2) {
		/*
		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
		 * does not panic for cpuset, mempolicy, or memcg allocation
		 * failures.
		 */
		//panic happens only when constraint is CONSTRAINT_NONE and panic_on_oom == 1;
		//for CONSTRAINT_CPUSET, CONSTRAINT_MEMORY_POLICY or CONSTRAINT_MEMCG, even with
		//panic_on_oom == 1, take the OOM killer path and return
		if (constraint != CONSTRAINT_NONE)
			return;
	}
	//from here on sysctl_panic_on_oom is 2, or it is 1 with an unconstrained OOM
	/* Do not panic for oom kills triggered by sysrq */
	//special cases: if the OOM was triggered via sysrq (oc->order is -1), or this OOM context
	//only considers tasks with a positive adj, take the OOM killer path and return
	if (is_sysrq_oom(oc) || oc->only_positive_adj)
		return;
	//trigger the panic: dump the stack and other state, and the device goes down
	dump_header(oc, NULL);
	panic("Out of memory: %s panic_on_oom is enabled\n",
	      sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
Phones use the default value: panic_on_oom is 0.
User:/proc/sys/vm # cat panic_on_oom
cat panic_on_oom
0
This parameter decides, once OOM happens and the OOM killer path is taken, which process should be killed.
When the value is 0, a score is computed for every candidate and the highest-scoring process is killed. When the value is 1, the process that triggered the OOM is killed directly, provided it is a user-space process (kernel threads cannot be killed), it is not an unkillable task (such as the init process), and its oom_score_adj does not forbid killing it. The implementation involves part of out_of_memory and the whole of oom_kill_process.
Here is how and in what order it is invoked inside out_of_memory:
//mm/oom_kill.c
int sysctl_panic_on_oom = IS_ENABLED(CONFIG_DEBUG_PANIC_ON_OOM) ? 2 : 0;
int sysctl_oom_kill_allocating_task;//the default initial value here is 0
int sysctl_oom_dump_tasks = 1;
bool out_of_memory(struct oom_control *oc)
{
	......
	check_panic_on_oom(oc, constraint);
	//OOM happened and we take the OOM killer path: decide which process to kill and how
	//this if statement handles oom_kill_allocating_task == 1: check a few facts about the
	//current task to see whether it can be killed directly; if so, call oom_kill_process,
	//otherwise fall through
	/*kill the current task if all of the following hold:
	  (1) oc->memcg is NULL, i.e. no CONSTRAINT_MEMCG constraint, a genuine memory shortage;
	  (2) the oom_kill_allocating_task parameter is 1, so no score computation is needed;
	  (3) the current task still has a memory descriptor (mm);
	  (4) the current task is killable;
	  (5) its oom_score_adj is not -1000 (OOM_SCORE_ADJ_MIN)
	*/
	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		//take a reference on current, because it is about to be killed
		get_task_struct(current);
		//oc->chosen points at the current task, marking it as the victim
		oc->chosen = current;
		//now actually kill the current task
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)",
				 false);
		return true;
	}
	//when oom_kill_allocating_task is 0 or the current task cannot be killed,
	//select_bad_process picks the "worst" process (the largest memory consumer), and
	//oom_kill_process is then called to kill it
	select_bad_process(oc);
	......
}
static void oom_kill_process(struct oom_control *oc, const char *message,
			     bool quiet)
{
	struct task_struct *p = oc->chosen;//p points at the task to be killed
	unsigned int points = oc->chosen_points;//the score of that task
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t;
	struct mem_cgroup *oom_group;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly
	 */
	task_lock(p);
	//task_will_free_mem checks whether the task is already exiting; if so, there is no need
	//to kill it, and the function returns true
	if (task_will_free_mem(p)) {
		//mark the task with TIF_MEMDIE; it does not actually need to be killed since it
		//is already exiting; this also increments oom_victims
		mark_oom_victim(p);
		//wake up the oom reaper kernel thread to quickly reclaim the exiting task's
		//anonymous memory, non-VM_SHARED memory, swapped-out memory, etc.
		wake_oom_reaper(p);
		task_unlock(p);
		//drop the reference held via oc->chosen (wake_oom_reaper took its own), then
		//return, ending the OOM killer path
		put_task_struct(p);
		return;
	}
	task_unlock(p);
	//rate-limit the kernel log: __ratelimit returns 0 when printing is too frequent
	//(duplicate logs) and suppresses it; otherwise dump_header prints the memory state
	//(dump_header is covered in detail in part four below)
	if (!quiet && __ratelimit(&oom_rs))
		dump_header(oc, p);
	//print the message, pid, process name and score
	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
	       message, task_pid_nr(p), p->comm, points);
	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest oom_badness() score is sacrificed for its
	 * parent. This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	read_lock(&tasklist_lock);
	/*
	 * The task 'p' might have already exited before reaching here. The
	 * put_task_struct() will free task_struct 'p' while the loop still try
	 * to access the field of 'p', so, get an extra reference.
	 */
	get_task_struct(p);
	//if a child of p has a different mm and the highest score, the child is killed in place
	//of the parent p, so that less accumulated work is lost
	//iterate over p's children
	for_each_thread(p, t) {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;
			//if the child shares its mm with the parent p, skip it
			if (process_shares_mm(child, p->mm))
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			//otherwise compute the child's score with oom_badness (covered in
			//detail in part five)
			child_points = oom_badness(child,
				oc->memcg, oc->nodemask, oc->totalpages,
				oc->only_positive_adj);
			//if the child scores higher than the current victim, update victim and
			//victim_points: the child becomes the task to be killed
			if (child_points > victim_points) {
				put_task_struct(victim);
				victim = child;
				victim_points = child_points;
				get_task_struct(victim);
			}
		}
	}
	put_task_struct(p);
	read_unlock(&tasklist_lock);
	/*
	 * Do we need to kill the entire memory cgroup?
	 * Or even one of the ancestor memory cgroups?
	 * Check this out before killing the victim task.
	 */
	//check whether victim and oc->memcg resolve to a common memcg that must die as a group;
	//if so, all tasks in that memcg are killed below
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
	/*
	 * If ->only_positive_adj = true in oom context,
	 * consider them as kill from ulmk.
	 */
	//if only_positive_adj is true, record the timestamp of this user-space
	//lowmemorykiller-style kill
	if (oc->only_positive_adj)
		ulmk_update_last_kill();
	//the core function: this is where the victim is actually killed
	__oom_kill_process(victim);
	/*
	 * If necessary, kill all tasks in the selected memory cgroup.
	 */
	//as noted above, kill every task in oom_group if necessary
	if (oom_group) {
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
		mem_cgroup_put(oom_group);
	}
}
/*
 * Checks whether the given task is dying or exiting and likely to
 * release its address space. This means that all threads and processes
 * sharing the same mm have to be killed or exiting.
 * Caller has to make sure that task->mm is stable (hold task_lock or
 * it operates on the current).
 */
static bool task_will_free_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct task_struct *p;
	bool ret = true;
	/*
	 * Skip tasks without mm because it might have passed its exit_mm and
	 * exit_oom_victim. oom_reaper could have rescued that but do not rely
	 * on that for now. We can consider find_lock_task_mm in future.
	 */
	//the task's memory descriptor is NULL, so no judgement can be made: return false
	if (!mm)
		return false;
	//judge by the task's signal state; __task_will_free_mem returns true for an exiting task
	if (!__task_will_free_mem(task))
		return false;
	/*
	 * This task has already been drained by the oom reaper so there are
	 * only small chances it will free some more
	 */
	//if bit MMF_OOM_SKIP is set in mm->flags, test_bit returns 1 and we return false;
	//MMF_OOM_SKIP set means this mm is of no further interest to the OOM killer
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return false;
	//if at most one user (process or thread) holds this mm, treat the task as exiting and
	//return true: no kill is needed
	if (atomic_read(&mm->mm_users) <= 1)
		return true;
	/*
	 * Make sure that all tasks which share the mm with the given tasks
	 * are dying as well to make sure that a) nobody pins its mm and
	 * b) the task is also reapable by the oom reaper.
	 */
	//check whether every process sharing this mm with the task is exiting as well
	rcu_read_lock();
	//iterate over all processes in the system with the local variable p
	for_each_process(p) {
		//process_shares_mm returns true when some thread of p uses the same mm as the
		//task, i.e. p shares the address space; otherwise skip p
		if (!process_shares_mm(p, mm))
			continue;
		//skip p if it belongs to the same thread group (checked via task_struct->signal)
		if (same_thread_group(task, p))
			continue;
		//check whether p is exiting too, again with __task_will_free_mem; if yes, ret
		//stays true and the loop continues; if not, ret becomes false and we break: the
		//task's memory will not be freed soon
		ret = __task_will_free_mem(p);
		if (!ret)
			break;
	}
	rcu_read_unlock();
	return ret;
}
static inline bool __task_will_free_mem(struct task_struct *task)
{
	struct signal_struct *sig = task->signal;
	/*
	 * A coredumping process may sleep for an extended period in exit_mm(),
	 * so the oom killer cannot assume that the process will promptly exit
	 * and release memory.
	 */
	//SIGNAL_GROUP_COREDUMP means the process is coredumping; such a process may sleep for a
	//long time in exit_mm, so it cannot be assumed to be exiting: return false
	if (sig->flags & SIGNAL_GROUP_COREDUMP)
		return false;
	//SIGNAL_GROUP_EXIT means the process is exiting
	if (sig->flags & SIGNAL_GROUP_EXIT)
		return true;
	//a single-threaded task with PF_EXITING set is exiting as well
	if (thread_group_empty(task) && (task->flags & PF_EXITING))
		return true;
	//in all other cases the process is not exiting
	return false;
}
#define K(x) ((x) << (PAGE_SHIFT-10)) //x is in units of pages; this converts pages to KB (with 4K pages, multiply by 4)
static void __oom_kill_process(struct task_struct *victim)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;
	/*walk the chosen task's thread group and check whether a task_struct->mm still exists;
	 * if not (the process may have exited or released its mm by now), there is no point in
	 * killing it and p is NULL; if it exists, p points at the thread that holds it
	 */
	p = find_lock_task_mm(victim);//this takes task_lock(victim)
	if (!p) {
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		//if the task found differs from the one passed in, update victim and drop the
		//reference on the original task
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}
	/* Get a reference to safely compare mm after task_unlock(victim) */
	//keep mm alive: mmgrab takes a reference, because after task_unlock(victim) there is no
	//lock protection any more
	mm = victim->mm;
	mmgrab(mm);
	/* Raise event before sending signal: task reaper must see this */
	//raise the OOM_KILL event before sending the signal so the oom reaper kernel thread sees it
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	//send SIGKILL to the victim process
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
	//mark the victim with TIF_MEMDIE, because it is about to be killed
	mark_oom_victim(victim);
	//print the killed process's pid, name, oom_score_adj and memory usage
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB oom_score_adj=%hd\n",
	       task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
	       K(get_mm_counter(victim->mm, MM_ANONPAGES)),
	       K(get_mm_counter(victim->mm, MM_FILEPAGES)),
	       K(get_mm_counter(victim->mm, MM_SHMEMPAGES)),
	       p->signal->oom_score_adj);
	task_unlock(victim);
	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any. They don't get access to memory reserves, though, to avoid
	 * depletion of all memory. This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * its contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	/*
	 * Walk all processes in the system looking for those (kernel threads excluded) that
	 * share the victim's mm, i.e. share its address space (for example, parent and child
	 * share the mm between fork and exec); to reclaim the memory, every process sharing
	 * the mm must be killed.
	 */
	for_each_process(p) {
		//not sharing: process_shares_mm returns false, skip
		if (!process_shares_mm(p, mm))
			continue;
		//same thread group: skip
		if (same_thread_group(p, victim))
			continue;
		//is_global_init checks via the tgid whether p is init; process 1 must never be
		//killed, so skip it
		if (is_global_init(p)) {
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
				task_pid_nr(victim), victim->comm,
				task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No use_mm() user needs to read from the userspace so we are
		 * ok to reap it.
		 */
		//a kernel thread sharing the mm does not need to be killed, skip
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		//reaching here means p must be killed too: send SIGKILL to it, as above
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();
	//if the mm is shared with init, can_oom_reap is false and the victim's memory cannot be
	//reaped; otherwise wake up the oom reaper kernel thread to reap it
	if (can_oom_reap)
		wake_oom_reaper(victim);
	//paired with mmgrab above: drop the reference now that we are done with mm
	mmdrop(mm);
	put_task_struct(victim);
}
#undef K //the K macro is no longer needed from here on
//this function deserves a closer look; see https://blog.csdn.net/sinat_22338935/article/details/118409427
static void wake_oom_reaper(struct task_struct *tsk)
{
	/*
	 * Move the lock here to avoid scenario of queuing
	 * the same task by both OOM killer and any other SIGKILL
	 * path.
	 */
	//take the lock to keep the same task from being queued twice, once by the OOM killer and
	//once by some other SIGKILL path
	spin_lock(&oom_reaper_lock);
	/* mm is already queued? */
	//check whether the task is already queued for the oom reaper; if so, unlock and return,
	//since the oom reaper kernel thread will reap it anyway
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) {
		spin_unlock(&oom_reaper_lock);
		return;
	}
	get_task_struct(tsk);
	tsk->oom_reaper_list = oom_reaper_list;
	//oom_reaper_list holds the processes waiting to be reaped
	oom_reaper_list = tsk;
	spin_unlock(&oom_reaper_lock);
	//record this reap request in ftrace
	trace_wake_reaper(tsk->pid);
	//the oom_reaper thread sleeps on the oom_reaper_wait wait queue; wake it up here when an
	//OOM happens, so it can take the pending task from oom_reaper_list
	wake_up(&oom_reaper_wait);
}
Through a chain of calls, select_bad_process essentially relies on oom_badness to compute each process's score; it keeps comparing each score against the best one seen so far, and ends up with the highest-scoring process stored in oc->chosen and its score in oc->chosen_points.
/*
 * Simple selection loop. We choose the process with the highest number of
 * 'points'. In case scan was aborted, oc->chosen is set to -1.
 */
static void select_bad_process(struct oom_control *oc)
{
	//oc->memcg is not NULL: a CONSTRAINT_MEMCG constraint is in effect, so walk the tree
	//rooted at this memcg, visit every task in every memcg, and run oom_evaluate_task on
	//each to find the highest-scoring "bad" process
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		//oc->memcg is NULL: no CONSTRAINT_MEMCG constraint, a genuine memory shortage;
		//walk all user processes in the system to find the highest-scoring "bad"
		//process, updating oc->chosen and oc->chosen_points along the way
		struct task_struct *p;
		rcu_read_lock();
		for_each_process(p)
			if (oom_evaluate_task(p, oc))
				break;
		rcu_read_unlock();
	}
	//normalize the chosen_points value
	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	unsigned long points;
	//if the task is unkillable, return 0 right away; oom_unkillable_task is covered in
	//detail later
	if (oom_unkillable_task(task, NULL, oc->nodemask))
		goto next;
	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves unless
	 * the task has MMF_OOM_SKIP because chances that it would release
	 * any memory is quite low.
	 */
	//check whether this OOM was triggered via sysrq and whether the task is already an OOM
	//victim (tsk->signal->oom_mm); enter the if when it was not sysrq-triggered and the
	//task is already a victim
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		//if MMF_OOM_SKIP is set, the task will not be killed again: return 0
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		//otherwise the task is already a victim about to die, so jump to abort, where
		//the reference on the previously selected oc->chosen is dropped
		goto abort;
	}
	/*
	 * If task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	//if the task is marked to be killed first when it triggers an OOM
	//(task->signal->oom_flag_origin), select it directly with the maximum unsigned long score
	if (oom_task_origin(task)) {
		points = ULONG_MAX;
		goto select;
	}
	//none of the special cases above applied, so compute the task's points with oom_badness
	//(described in detail in part five)
	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages,
			     oc->only_positive_adj);
	//if points is 0 or lower than the current chosen's oc->chosen_points, return 0 and keep
	//scanning
	if (!points || points < oc->chosen_points)
		goto next;
	/* Prefer thread group leaders for display purposes */
	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
		goto next;
select:
	//if oc->chosen is not NULL, drop the reference on the previously selected task
	if (oc->chosen)
		put_task_struct(oc->chosen);
	//take a reference on the newly selected task and update chosen and chosen_points
	get_task_struct(task);
	oc->chosen = task;
	oc->chosen_points = points;
next:
	return 0;
abort:
	//if oc->chosen is not NULL, drop the reference on the previously selected task
	if (oc->chosen)
		put_task_struct(oc->chosen);
	//then point oc->chosen at a non-existent task (-1) to abort the scan
	oc->chosen = (void *)-1UL;
	return 1;
}
Phones use the default value: oom_kill_allocating_task is 0, so scores are computed and the highest-scoring process is killed.
User:/proc/sys/vm # cat oom_kill_allocating_task
cat oom_kill_allocating_task
0
This parameter controls whether, when OOM happens, regardless of whether it ends in a panic or in the OOM killer path, the memory state of the system's processes is dumped. The information helps locate the cause of the OOM. The dump covers, for every user-space process in the system: its identification, the total virtual memory it uses, the physical memory it actually uses (the RSS, Resident Set Size, which covers not only the program's own pages but also the shared libraries it maps), its page table usage, and so on.
When the value is 0, none of this per-task memory state is printed; on a large system with thousands of processes, dumping the memory state of every task can itself become a performance problem, especially when the system is already OOM. When the value is 1, dump_header->dump_tasks prints the memory state of all tasks in three situations: first, an OOM-triggered panic, where check_panic_on_oom calls dump_header; second, the OOM killer path when no suitable "bad" process was found, where out_of_memory calls dump_header directly; third, the OOM killer path when a victim was found, where oom_kill_process makes the call.
//mm/oom_kill.c
int sysctl_panic_on_oom = IS_ENABLED(CONFIG_DEBUG_PANIC_ON_OOM) ? 2 : 0;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;//the default initial value here is 1
Let's walk through the implementation of the dump_header and dump_tasks functions.
//a few words about CONFIG_COMPACTION
/*
//mm/Kconfig
Memory compaction in the kernel is similar to disk defragmentation: scattered pages are
migrated and packed into one contiguous region, leaving behind a contiguous stretch of free
memory. The main cost is the page migration itself.
Enabling CONFIG_COMPACTION turns compaction on; note that it can only move movable pages.
It is enabled by default, and keeping it enabled is recommended.
#
# support for memory compaction
config COMPACTION
	bool "Allow for memory compaction"
	def_bool y
	select MIGRATION
	depends on MMU
	help
	  Compaction is the only memory management component to form
	  high order (larger physically contiguous) memory blocks
	  reliably. The page allocator relies on compaction heavily and
	  the lack of the feature can lead to unexpected OOM killer
	  invocations for high order memory requests. You shouldn't
	  disable this option unless there really is a strong reason for
	  it and then we would be really interested to hear about that at
	  [email protected].
*/
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	//show which process invoked the OOM killer, along with its allocation context
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask,
		nodemask_pr_args(oc->nodemask), oc->order,
		current->signal->oom_score_adj);
	//as noted above, compaction is enabled by default and disabling it is discouraged, so
	//this warning is normally skipped
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");
	//print the current task's cpuset information: the cpuset name and which memory nodes the
	//task may use (mems_allowed)
	cpuset_print_current_mems_allowed();
	//print the current task's info and its stack
	dump_stack();
	//check whether oc->memcg is NULL; it records the cgroup-related memory usage
	if (is_memcg_oom(oc))
		//not NULL: print the memcg-related OOM information
		mem_cgroup_print_oom_info(oc->memcg, p);
	else {
		//NULL: print the memory usage of the whole system
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
		//check whether unreclaimable slab memory exceeds all user memory (LRU pages)
		if (is_dump_unreclaim_slabs())
			//if so, print the unreclaimable slab information
			dump_unreclaimable_slab();
		show_mem_call_notifiers();
	}
	//if oom_dump_tasks is enabled, dump_tasks dumps the per-task memory state of user-space
	//processes described earlier
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc->memcg, oc->nodemask);
}
/**
 * dump_tasks - dump current memory state of all system tasks
 * @memcg: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * Dumps the current memory state of all eligible tasks. Tasks not in the same
 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss,
 * pgtables_bytes, swapents, oom_score_adj value, and name.
 */
void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	struct task_struct *p;
	struct task_struct *task;
	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
	rcu_read_lock();
	//iterate over all processes in the system
	for_each_process(p) {
		//skip unkillable processes and keep looping; their memory state is not printed
		//(oom_unkillable_task is covered in a later part)
		if (oom_unkillable_task(p, memcg, nodemask))
			continue;
		//find a thread of p that still holds an mm and lock it; if found, task points at it
		task = find_lock_task_mm(p);
		//task is NULL for kernel threads or when all of p's threads have already detached
		//their mm; skip them, and their memory state is not printed either
		if (!task) {
			/*
			 * This is a kthread or all of p's threads have already
			 * detached their mm's. There's no need to report
			 * them; they can't be oom killed anyway.
			 */
			continue;
		}
		//after the checks above, print the eligible task's information (pid, uid, tgid,
		//vm size, rss, pgtables_bytes, swapents, oom_score_adj value, name)
		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
			task->pid, from_kuid(&init_user_ns, task_uid(task)),
			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
			mm_pgtables_bytes(task->mm),
			get_mm_counter(task->mm, MM_SWAPENTS),
			task->signal->oom_score_adj, task->comm);
		//release the lock taken inside find_lock_task_mm
		task_unlock(task);
	}
	rcu_read_unlock();
}
These three parameters feed the per-process score computation; the highest scorer is the process that gets killed. They come into play when oom_kill_allocating_task is 0 or when the current process cannot be killed. Because all three are strongly tied to a specific process, they live under /proc/<PID>/.
Note that oom_score is read-only, with a range of 0~1000: 0 means never kill, 1000 means always kill, and the higher the value, the more likely the process is to be selected. The process's score (oom_score) consists of two parts: a system score, computed from the process's physical memory consumption, and a user score, which is the oom_score_adj value.
Users adjust oom_score_adj to influence the final oom_score. oom_score_adj ranges from -1000 to 1000 and is readable and writable by root. The default value 0 means the user does not adjust the oom_score. The value -1000 (OOM_SCORE_ADJ_MIN) drives the oom_score to 0 or below, which forbids the OOM killer from killing the process. In addition, root processes used to get a 3% memory-usage bonus (this is not present in the kernel-4.19 code), which was subtracted when computing the final oom_score.
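Adjusting the value is an ordinary procfs write. A minimal sketch follows; raising one's own oom_score_adj is always allowed, while lowering it below the current value requires root or CAP_SYS_RESOURCE.
/* Minimal sketch: make the current process a more likely OOM victim. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "500\n");	/* 500: a strong penalty, half of the maximum */
	fclose(f);
	/* ...run the workload; observe the effect via /proc/<pid>/oom_score */
	return 0;
}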
oom_adj is a legacy interface parameter with a function similar to oom_score_adj; it is kept for compatibility. It ranges from -16 to 15, is readable and writable by root, and defaults to 0. When this parameter is written, the kernel converts it to oom_score_adj using the formula: oom_score_adj = OOM_SCORE_ADJ_MAX * oom_adj / (-OOM_DISABLE). It also has a special value, -17, meaning OOM_DISABLE (OOM killing disabled for the process). In the implementation, the call chain out_of_memory->select_bad_process->oom_evaluate_task->oom_badness ends in oom_badness, which computes the score; the highest scorer is then killed.
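As a worked example of that conversion, the user-space snippet below reproduces the arithmetic (constants as defined in include/uapi/linux/oom.h; this is not kernel code). For instance, oom_adj = 15 maps to 1000 * 15 / 17 = 882, and the special value -17 maps to OOM_SCORE_ADJ_MIN (-1000).
/* Worked example: the legacy oom_adj -> oom_score_adj mapping. */
#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

int main(void)
{
	for (int oom_adj = OOM_DISABLE; oom_adj <= OOM_ADJUST_MAX; oom_adj++) {
		int oom_score_adj = (oom_adj == OOM_DISABLE)
			? -1000	/* OOM_SCORE_ADJ_MIN: OOM killing disabled */
			: OOM_SCORE_ADJ_MAX * oom_adj / -OOM_DISABLE;
		printf("oom_adj=%3d -> oom_score_adj=%5d\n", oom_adj, oom_score_adj);
	}
	return 0;
}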
Let's go through the implementation of the oom_badness function in detail.
//include/linux/oom.h
/*
 * Details of the page allocation that triggered the oom killer that are used to
 * determine what should be killed.
 */
struct oom_control {
	/* Used to determine cpuset */
	struct zonelist *zonelist;
	/* Used to determine mempolicy */
	nodemask_t *nodemask;
	/* Memory cgroup in which oom is invoked, or NULL for global oom */
	struct mem_cgroup *memcg;
	/* Used to determine cpuset and node locality requirement */
	const gfp_t gfp_mask;
	/*
	 * order == -1 means the oom kill is required by sysrq, otherwise only
	 * for display purposes.
	 */
	const int order;
	/*
	 * Only kill positive adj tasks. Used to behave more like Android's
	 * lowmemorykiller.
	 */
	const bool only_positive_adj;
	/* Used by oom implementation, do not set */
	unsigned long totalpages;
	struct task_struct *chosen;
	unsigned long chosen_points;
};
//include/linux/sched/coredump.h
#define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of which task we should calculate
 * @totalpages: total present RAM allowed for page allocation
 * @memcg: task's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible. The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 */
//the star of the show......
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages,
			  bool only_positive_adj)
{
	long points;
	long adj;
	//if process p is unkillable, oom_unkillable_task returns true: return 0 directly
	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;
	//find a thread of p that still holds an mm and lock it; p then points at that thread
	p = find_lock_task_mm(p);
	//if no memory descriptor mm exists, p is NULL: return 0, nothing to kill
	if (!p)
		return 0;
	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable or have been already oom reaped or the are in
	 * the middle of vfork
	 */
	//fetch the user score, adj
	adj = (long)p->signal->oom_score_adj;
	//if any of these four conditions holds, return 0 without computing a score; the process
	//is not killed: (1) oom_score_adj is -1000 (OOM_SCORE_ADJ_MIN); (2) only tasks with a
	//positive adj may be killed, but adj < 0; (3) the mm's flags carry MMF_OOM_SKIP;
	//(4) the process is in the middle of a vfork and cannot be killed
	if (adj == OOM_SCORE_ADJ_MIN ||
	    (only_positive_adj && adj < 0) ||
	    test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
	    in_vfork(p)) {
		//release the lock taken inside find_lock_task_mm
		task_unlock(p);
		return 0;
	}
	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	//as noted earlier, the system score is based on physical memory consumption, made up of
	//three parts: the RSS, the memory occupied on swap files or swap devices, and the
	//memory used by page tables
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;//PAGE_SIZE = 4K
	task_unlock(p);
	/* Normalize to oom_score_adj units */
	/*
	 As described above for oom_score_adj: 0 means the user does not adjust the oom_score, a
	 negative value applies a discount to the raw score, and a positive value penalizes the
	 task, raising its oom_score.
	 In practice the adjustment is computed against the memory this allocation may draw from
	 (with no allocation constraint, that is all usable memory in the system; with cpusets,
	 it is the cpuset's actual quota); the totalpages parameter holds that upper bound.
	 The actual points are adjusted by oom_score_adj: for example, oom_score_adj = -500
	 means a 50% discount (with totalpages as the base), i.e. half of the allocatable upper
	 bound is subtracted from the memory the task actually uses.
	 */
	adj *= totalpages / 1000;
	points += adj;
	/*
	 * Never return 0 for an eligible task regardless of the root bonus and
	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	//apart from unkillable processes, which score 0, an eligible task scores at least 1:
	//all negative scores are converted to 1
	return points > 0 ? points : 1;
}
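To make the arithmetic concrete, here is a hypothetical worked example of the scoring above, with made-up numbers (plain user-space C reproducing the computation, not kernel code). With these values the -500 discount, roughly half of totalpages, outweighs the 1GB of actual usage, so the eligible task is clamped to the minimum score of 1.
/* Hypothetical worked example of the badness arithmetic (numbers made up). */
#include <stdio.h>

int main(void)
{
	long totalpages = 1048576;	/* 4GB of 4K pages available to this allocation */
	long rss = 262144;		/* 1GB resident */
	long swapents = 25600;		/* 100MB swapped out */
	long pgtables = 512;		/* page-table usage, already in pages */
	long adj = -500;		/* user discount of roughly 50% */
	long points;

	/* base score: rss + swap entries + page tables */
	points = rss + swapents + pgtables;
	/* normalize adj to pages, same integer arithmetic as oom_badness */
	adj *= totalpages / 1000;
	points += adj;
	/* an eligible task never scores below 1 */
	printf("badness = %ld\n", points > 0 ? points : 1);
	/* select_bad_process later normalizes: score = badness * 1000 / totalpages */
	return 0;
}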
//include/linux/sched.h
#define PF_KTHREAD		0x00200000	/* I am a kernel thread */
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
		struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	//is_global_init checks via the tgid whether p is init; process 1 must not be killed,
	//so return true
	if (is_global_init(p))
		return true;
	//a kernel thread is never killed: return true
	if (p->flags & PF_KTHREAD)
		return true;
	//under CONSTRAINT_MEMCG, a process that is not a member of the memcg being squeezed does
	//not need to be killed: return true
	//(note that, judging by the call sites, memcg is often passed as NULL)
	/* When mem_cgroup_out_of_memory() and p is not member of the group */
	if (memcg && !task_in_mem_cgroup(p, memcg))
		return true;
	//on a NUMA system, if the shortage comes from a CONSTRAINT_MEMORY_POLICY constraint and
	//the process has no freeable memory on the allowed nodes, it does not need to be
	//killed: return true
	/* p may not have freeable memory in nodemask */
	if (!has_intersects_mems_allowed(p, nodemask))
		return true;
	return false;
}
As shown below, process 1, the init process, is never allowed to be killed by the OOM killer:
User:/proc/1 # cat oom_score
cat oom_score
0
User:/proc/1 # cat oom_score_adj
cat oom_score_adj
-1000
User:/proc/1 # cat oom_adj
cat oom_adj
-17
User:/proc/1 #
As described in "Linux kernel内存管理之overcommit相关参数", when overcommit_memory is OVERCOMMIT_GUESS (0), the Linux kernel allows user processes to overcommit their memory requests; when the memory is actually used, __vm_enough_memory judges whether to grant the allocation. That check is not very precise, so some requests slip through. A user process can therefore request more virtual memory than it needs, in the expectation that other processes will not actually use everything they requested. This creates a situation where the current process's virtual allocation passed the check, yet when the memory is actually touched there is nothing left. Memory runs short, OOM is triggered, and the panic_on_oom parameter then decides between panicking and the OOM killer path.
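The sketch below makes this visible: under OVERCOMMIT_GUESS the large virtual reservation usually succeeds, and the shortage only materializes when the pages are actually touched, which is where the OOM killer may fire. The 16GB size is illustrative; do not run this on a machine you care about.
/* Sketch: overcommit a large anonymous mapping, then touch it. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t sz = 16UL << 30;	/* 16GB of virtual memory */
	char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");	/* the heuristic in __vm_enough_memory refused */
		return 1;
	}
	printf("reserved %zu bytes of virtual memory\n", sz);
	/* touching the pages allocates real page frames; OOM may happen here */
	memset(p, 1, sz);
	return 0;
}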
The sections above covered each module involved in an OOM in detail. Now let's look at the implementation of the top-level function, out_of_memory, which is what the kernel calls to handle an OOM.
/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;
	//constraint defaults to CONSTRAINT_NONE
	enum oom_constraint constraint = CONSTRAINT_NONE;
	//check whether OOM handling is enabled; if disabled, bail out immediately. It is
	//usually disabled (the flag is set) while freezing processes
	if (oom_killer_disabled)
		return false;
	//NUMA-only handling for hot-pluggable memory; phones are UMA systems
	if (try_online_one_block(numa_node_id())) {
		/* Got some memory back */
		WARN(1, "OOM killer had to online a memory block\n");
		return true;
	}
	//when oc->memcg is NULL, i.e. there is no CONSTRAINT_MEMCG constraint, enter the if
	if (!is_memcg_oom(oc)) {
		//the callbacks on the oom_notify_list notifier chain try to reclaim memory;
		//freed tells us whether anything was actually reclaimed
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0)
			/* Got some memory back in the last second. */
			return true;
	}
	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it. The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	//if the current task already has a pending SIGKILL or is exiting, select it automatically
	if (task_will_free_mem(current)) {
		//mark the current task with TIF_MEMDIE
		mark_oom_victim(current);
		//wake up the oom reaper kernel thread to reap its memory
		wake_oom_reaper(current);
		return true;
	}
	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory lost its gfp context so we have to
	 * make sure exclude 0 mask - all other users should have at least
	 * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
	 * invoke the OOM killer even if it is a GFP_NOFS allocation.
	 */
	//if gfp_mask is non-zero but lacks __GFP_FS, and there is no CONSTRAINT_MEMCG
	//constraint, return true without killing anything
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
		return true;
	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA and memcg) that may require different handling.
	 */
	//mainly for NUMA systems (UMA is always CONSTRAINT_NONE): determine which constraint,
	//if any, applies
	constraint = constrained_alloc(oc);
	//outside of CONSTRAINT_MEMORY_POLICY, nodemask must be cleared; this field exists
	//precisely to carry the memory policy
	if (constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	//based on panic_on_oom, decide whether to panic or to take the OOM killer path
	check_panic_on_oom(oc, constraint);
	//OOM killer path: decide which process to kill and how
	//this if statement handles oom_kill_allocating_task == 1: check a few facts about the
	//current task to see whether it can be killed directly; if so, call oom_kill_process,
	//otherwise fall through
	/*kill the current task if all of the following hold:
	  (1) oc->memcg is NULL, i.e. no CONSTRAINT_MEMCG constraint, a genuine memory shortage;
	  (2) the oom_kill_allocating_task parameter is 1, so no score computation is needed;
	  (3) the current task still has a memory descriptor (mm);
	  (4) the current task is killable;
	  (5) its oom_score_adj is not -1000 (OOM_SCORE_ADJ_MIN)
	*/
	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		//take a reference on current, because it is about to be killed
		get_task_struct(current);
		//oc->chosen points at the current task, the victim to be killed
		oc->chosen = current;
		//now actually kill the current task, noting oom_kill_allocating_task in the message
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)",
				 false);
		return true;
	}
	//when oom_kill_allocating_task is 0 or the current task cannot be killed,
	//select_bad_process picks the "worst" process (the largest memory consumer), and
	//oom_kill_process is then called to kill it
	select_bad_process(oc);
	/* Found nothing?!?! */
	//oc->chosen is NULL: no killable process was found; dump the memory state of all the
	//system's processes, then consider panicking
	if (!oc->chosen) {
		//dump the memory state and other information
		dump_header(oc, NULL);
		//report that we are out of memory with no killable processes
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * If we got here due to an actual allocation at the
		 * system level, we cannot survive this and will enter
		 * an endless loop in the allocator. Bail out now.
		 */
		//if the OOM was not sysrq-triggered, there is no CONSTRAINT_MEMCG constraint,
		//and only_positive_adj is false (meaning all processes were considered,
		//including those with a negative oom_score_adj), panic outright to avoid an
		//endless allocation loop
		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc) &&
		    !oc->only_positive_adj)
			panic("System is deadlocked on memory\n");
	}
	//oc->chosen is neither NULL nor -1 (-1 is assigned by select_bad_process in its abort
	//case): select_bad_process found a process to kill; is_memcg_oom selects the message
	//for an unconstrained versus a memcg-constrained OOM
	//CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER is configured in msm-4.19/mm/Kconfig; when
	//enabled, low memory triggered from user space is first handled by ULMK, with the OOM
	//killer only as a backstop; if the OOM killer is reached at all, the memory pressure
	//on the system is usually already severe
	/*On Android this is usually enabled, in the form of the lmkd process.
	 *Android is based on Linux, and Linux has a similar policy, the OOM killer, which fires
	 *when an allocation cannot be satisfied and kills the highest-scoring process. lmk
	 *instead checks periodically: when the remaining free memory is low, it kills processes
	 *of different priorities depending on the free-memory watermark, rather than waiting
	 *for a real OOM. By the time a real OOM happens the system may already be in an
	 *abnormal state; the preferred approach is to act early and kill some low-priority
	 *processes while memory is merely low, so that subsequent operations go smoothly.
	 */
	if (oc->chosen && oc->chosen != (void *)-1UL)
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory",
				 IS_ENABLED(CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER));
	return !!oc->chosen;//return false if oc->chosen is NULL, true otherwise
}