Chipset: MSM8X25Q
Codebase: Android4.1
Kernel: 3.4.0
OOM killer,即 out of memory killer,是 Linux 在内存耗尽时采用的一种处理机制。当可用内存过少时,OOM killer 会遍历整个进程链表,根据各进程的内存使用情况及其 oom score 值找出得分最高的进程,然后向其发送 kill 信号将其杀掉。
伙伴系统在分配内存时会做判断,当内存不足时,会调用核心函数 out_of_memory(),该函数位于文件 kernel/mm/oom_kill.c。
下面先分析out_of_memory()。
/*
 * out_of_memory - choose a task and kill it when an allocation runs out of
 * memory.
 *
 * This is an excerpt: the two "~~snip" markers below stand for code the
 * article left out.  NOTE(review): 'totalpages' and 'mpol_mask' are never
 * assigned in the visible portion, so they are presumably set inside the
 * elided code -- confirm against the full kernel source.  'freed' and
 * 'constraint' are only initialised here; any further use is likewise elided.
 *
 * @zonelist:   not referenced in the visible portion
 * @gfp_mask:   forwarded to check_panic_on_oom(), oom_kill_process() and
 *              dump_header()
 * @order:      forwarded to the same helpers as @gfp_mask
 * @nodemask:   forwarded to oom_unkillable_task() and oom_kill_process()
 * @force_kill: forwarded to select_bad_process()
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
		int order, nodemask_t *nodemask, bool force_kill)
{
	const nodemask_t *mpol_mask;
	struct task_struct *p;
	unsigned long totalpages;
	unsigned long freed = 0;
	unsigned int points;
	enum oom_constraint constraint = CONSTRAINT_NONE;
	int killed = 0;

	/* Elision marker from the article (not C code). */
	~~snip

	/*
	 * If the current task already has a fatal signal pending, just flag
	 * it with TIF_MEMDIE and return: the whole point of the OOM kill is
	 * to free memory by delivering a kill signal, and this task is
	 * already going to die.
	 */
	if (fatal_signal_pending(current)) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	/* Elision marker from the article (not C code). */
	~~snip

	/*
	 * Author's note: user space can change the OOM behaviour through
	 * /proc/sys/vm/panic_on_oom -- 1 means panic immediately on OOM,
	 * 0 means only kill the "best" process and let the system carry on.
	 */
	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);

	read_lock(&tasklist_lock);

	/*
	 * Likewise, when /proc/sys/vm/oom_kill_allocating_task is true, the
	 * task that is currently allocating is killed directly (provided it
	 * is killable and has an mm) instead of scanning for a victim.
	 */
	if (sysctl_oom_kill_allocating_task &&
	    !oom_unkillable_task(current, NULL, nodemask) &&
	    current->mm) {
		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
				 nodemask,
				 "Out of memory (oom_kill_allocating_task)");
		goto out;
	}

	/*
	 * Pick the task with the highest score, computed from each task's
	 * memory usage and OOM score information.
	 */
	p = select_bad_process(&points, totalpages, NULL, mpol_mask,
			       force_kill);
	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!p) {
		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
		read_unlock(&tasklist_lock);
		panic("Out of memory and no killable processes...\n");
	}
	/*
	 * select_bad_process() returns ERR_PTR(-1UL) when it aborted the
	 * scan; in that case nothing is killed here.
	 */
	if (PTR_ERR(p) != -1UL) {
		/* The selected task gets killed here. */
		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
				 nodemask, "Out of memory");
		killed = 1;
	}
out:
	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory unless "p" is current
	 */
	if (killed && !test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}
/*
 * select_bad_process - walk every thread and return the one with the
 * highest badness score.
 *
 * @ppoints:    output; reset to 0 on entry, then raised to the score of the
 *              task that ends up being returned
 * @totalpages: forwarded to oom_badness()
 * @memcg:      forwarded to oom_unkillable_task() and oom_badness()
 * @nodemask:   forwarded to oom_unkillable_task() and oom_badness()
 * @force_kill: when true, the two "abort the scan" exits below are disabled
 *
 * Returns the selected task, NULL when no task scored above 0, or
 * ERR_PTR(-1UL) when the scan was aborted because some task is already on
 * its way out.
 */
static struct task_struct *select_bad_process(unsigned int *ppoints,
		unsigned long totalpages, struct mem_cgroup *memcg,
		const nodemask_t *nodemask, bool force_kill)
{
	struct task_struct *leader, *tsk;
	struct task_struct *worst = NULL;

	*ppoints = 0;

	do_each_thread(leader, tsk) {
		unsigned int score;

		/*
		 * Skip tasks that have an exit state set and tasks that
		 * oom_unkillable_task() rejects.
		 */
		if (tsk->exit_state ||
		    oom_unkillable_task(tsk, memcg, nodemask))
			continue;

		/*
		 * A task already flagged TIF_MEMDIE is being OOM-killed.
		 * Thaw it if it is frozen, and unless a kill is forced,
		 * abort the scan rather than selecting another victim.
		 */
		if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) {
			if (unlikely(frozen(tsk)))
				__thaw_task(tsk);
			if (!force_kill)
				return ERR_PTR(-1UL);
		}

		/* Tasks without an mm are not candidates. */
		if (!tsk->mm)
			continue;

		if (tsk->flags & PF_EXITING) {
			if (tsk == current) {
				/* The exiting caller gets the maximum score. */
				worst = tsk;
				*ppoints = 1000;
			} else if (!force_kill &&
				   !(tsk->group_leader->ptrace & PT_TRACE_EXIT)) {
				/*
				 * Another task is exiting and is not being
				 * ptraced on exit: wait for it to finish
				 * instead of killing something else
				 * unnecessarily.
				 */
				return ERR_PTR(-1UL);
			}
		}

		/* Score this task and remember it if it beats the best so far. */
		score = oom_badness(tsk, memcg, nodemask, totalpages);
		if (score > *ppoints) {
			worst = tsk;
			*ppoints = score;
		}
	} while_each_thread(leader, tsk);

	return worst;
}
unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); if (!p) return 0; /*oom_score_adj为-1000的不做处理,此值可以通过/proc/pid_num/oom_score_adj设置,范围为-1000 ~ 1000,值越大越容易被oom kill掉。*/ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. */ if (!totalpages) totalpages = 1; /* get_mm_rss获取当前用户空间使用文件和匿名页占有内存数,nr_ptes 获取 当前保存页表使用的内存。*/ points = get_mm_rss(p->mm) + p->mm->nr_ptes; /*获取交换内存使用的内存数*/ points += get_mm_counter(p->mm, MM_SWAPENTS); /*每个task同等计算,可不管。*/ points *= 1000; points /= totalpages; task_unlock(p); /*当该进程具有CAP_SYS_ADMIN能力,那么Point降低,因为具有ADMIN权限的 Task是被认为表现良好的。 */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) points -= 30; /*加上oom_score_adj,范围从-1000 ~ 1000. */ points += p->signal->oom_score_adj; /* * Never return 0 for an eligible task that may be killed since it's * possible that no single user task uses more than 0.1% of memory and * no single admin tasks uses more than 3.0%. */ if (points <= 0) return 1; /*1000封顶*/ return (points < 1000) ? points : 1000; }
/*
 * oom_kill_process - kill the chosen task, or one of its children instead.
 *
 * @p:          task selected by the caller
 * @gfp_mask:   forwarded to dump_header()
 * @order:      forwarded to dump_header()
 * @points:     score of @p, only used in the log message
 * @totalpages: forwarded to oom_badness() when scoring children
 * @memcg:      forwarded to dump_header() and oom_badness()
 * @nodemask:   forwarded to dump_header() and oom_badness()
 * @message:    prefix for the first log line
 *
 * If @p is already exiting it is only flagged TIF_MEMDIE.  Otherwise the
 * children of every thread of @p are scored, and the highest-scoring child
 * with a different mm (if any scores above 0) becomes the victim in place
 * of @p.  Every other process sharing the victim's mm is sent SIGKILL too,
 * then the victim is flagged TIF_MEMDIE and sent SIGKILL.
 */
static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
			     unsigned int points, unsigned long totalpages,
			     struct mem_cgroup *memcg, nodemask_t *nodemask,
			     const char *message)
{
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t = p;
	struct mm_struct *mm;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just set TIF_MEMDIE so it can die quickly
	 */
	if (p->flags & PF_EXITING) {
		set_tsk_thread_flag(p, TIF_MEMDIE);
		return;
	}

	if (__ratelimit(&oom_rs))
		dump_header(p, gfp_mask, order, memcg, nodemask);

	task_lock(p);
	/* 'points' is unsigned int, so it is printed with %u. */
	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);
	task_unlock(p);

	/*
	 * Look through the children of every thread of p.  A child whose mm
	 * differs from p's and which has the highest score replaces p as
	 * the victim.  So when a process has several memory-hungry children,
	 * a child may be killed while the parent survives.
	 */
	do {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			if (child->mm == p->mm)
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			child_points = oom_badness(child, memcg, nodemask,
						   totalpages);
			if (child_points > victim_points) {
				victim = child;
				victim_points = child_points;
			}
		}
	} while_each_thread(p, t);

	victim = find_lock_task_mm(victim);
	if (!victim)
		return;

	/* mm cannot safely be dereferenced after task_unlock(victim) */
	mm = victim->mm;
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)));
	task_unlock(victim);

	/*
	 * Any process that shares the victim's mm is killed along with it.
	 * Threads of the victim's own group, kernel threads, and processes
	 * whose oom_score_adj is OOM_SCORE_ADJ_MIN are left alone.
	 */
	for_each_process(p) {
		if (p->mm == mm && !same_thread_group(p, victim) &&
		    !(p->flags & PF_KTHREAD)) {
			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
				continue;

			task_lock(p);	/* Protect ->comm from prctl() */
			pr_err("Kill process %d (%s) sharing same memory\n",
				task_pid_nr(p), p->comm);
			task_unlock(p);
			/* Deliver SIGKILL. */
			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
		}
	}

	set_tsk_thread_flag(victim, TIF_MEMDIE);
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
}
所以,out_of_memory()做的任务就是遍历系统全部进程,然后根据内存使用情况以及oom_score_adj的值计算得到一个point, 最终将最高point的task给kill掉。
1. Malloc会引起OOM killer,可参考:
http://blog.dccmx.com/2011/04/oom-killer-on-linux
2. OOM killer 只管理、计算 low memory 部分:即使 high memory 还有很多空闲内存,low memory 不足时仍可能触发 OOM killer。
3. 进程rss的计算可参考此文:
http://filwmm1314.blog.163.com/blog/static/2182591920121016541582/
4. 影响到oom killer行为的文件有:
/proc/sys/vm/overcommit_memory
/proc/sys/vm/panic_on_oom
/proc/sys/vm/oom_kill_allocating_task
/proc/pid_xxx/oom_score_adj
2013/04/27