Linux Memory Management (14) - Analyzing How an OOM Is Triggered

On the memory allocation path, when memory runs short the kernel wakes kswapd or kicks off memory compaction to obtain more memory; in the extreme case it falls back to the OOM killer.
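
Before walking the code, a quick way to see this whole path in action: the sketch below is a deliberately hostile toy program (run it only in a throwaway VM) that keeps allocating and touching memory until either the allocation fails or the OOM killer sends it SIGKILL; the resulting OOM report can then be read with dmesg.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const size_t chunk = 100UL << 20;	//100 MiB per step
	size_t total = 0;

	for (;;) {
		char *p = malloc(chunk);

		if (!p) {	//with strict overcommit, malloc itself fails first
			fprintf(stderr, "malloc failed after %zu MiB\n", total >> 20);
			return 1;
		}
		memset(p, 0xa5, chunk);	//touch every page so RAM is really committed
		total += chunk;
		printf("allocated %zu MiB\n", total >> 20);
	}
}
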
When memory reclaim has failed, the kernel resorts to OOM. Its entry point is __alloc_pages_may_oom, in mm/page_alloc.c:

static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
	const struct alloc_context *ac, unsigned long *did_some_progress)
{
	struct oom_control oc = {//OOM control parameters
		.zonelist = ac->zonelist,
		.nodemask = ac->nodemask,
		.memcg = NULL,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	struct page *page;

	*did_some_progress = 0;

	/*
	 * Acquire the oom lock.  If that fails, somebody else is
	 * making progress for us.
	 */
	//try to take the oom lock; if it is already held, someone else is making progress for us, so back off
	if (!mutex_trylock(&oom_lock)) {
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}

	/*
	 * Go through the zonelist yet one more time, keep very high watermark
	 * here, this is only to catch a parallel oom killing, we must fail if
	 * we're still under heavy pressure. But make sure that this reclaim
	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
	 * allocation which will never fail due to oom_lock already held.
	 */
	//retry the allocation once against the high watermark, to catch a parallel OOM kill before starting our own
	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
				      ~__GFP_DIRECT_RECLAIM, order,
				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
	if (page)
		goto out;

	/* Coredumps can quickly deplete all memory reserves */
	if (current->flags & PF_DUMPCORE)
		goto out;
	/* The OOM killer will not help higher order allocs */
	//the OOM killer does not help allocations with order above PAGE_ALLOC_COSTLY_ORDER (order 3)
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;
	/*
	 * We have already exhausted all our reclaim opportunities without any
	 * success so it is time to admit defeat. We will skip the OOM killer
	 * because it is very likely that the caller has a more reasonable
	 * fallback than shooting a random task.
	 */
	//__GFP_RETRY_MAYFAIL callers can tolerate failure, so fail the allocation instead of invoking the OOM killer
	if (gfp_mask & __GFP_RETRY_MAYFAIL)
		goto out;
	/* The OOM killer does not needlessly kill tasks for lowmem */
	//the OOM killer is not started for lowmem allocations (zones below ZONE_NORMAL)
	if (ac->high_zoneidx < ZONE_NORMAL)
		goto out;
	if (pm_suspended_storage())
		goto out;
	/*
	 * XXX: GFP_NOFS allocations should rather fail than rely on
	 * other request to make a forward progress.
	 * We are in an unfortunate situation where out_of_memory cannot
	 * do much for this context but let's try it to at least get
	 * access to memory reserved if the current task is killed (see
	 * out_of_memory). Once filesystems are ready to handle allocation
	 * failures more gracefully we should just bail out here.
	 */

	/* The OOM killer may not free memory on a specific node */
	//the OOM killer cannot guarantee freeing memory on one specific node
	if (gfp_mask & __GFP_THISNODE)
		goto out;

	/* Exhausted what can be done so it's blame time */
	//all the bail-out conditions passed, so it is time to invoke the OOM killer: call out_of_memory()
	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
		*did_some_progress = 1;

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves
		 */
		//help __GFP_NOFAIL (must-not-fail) allocations by retrying with the watermarks ignored entirely
		if (gfp_mask & __GFP_NOFAIL)
			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
					ALLOC_NO_WATERMARKS, ac);
	}
out:
	mutex_unlock(&oom_lock);
	return page;
}

__alloc_pages_may_oom first sets up the oom_control parameters and retries the allocation once against the high watermark; on success it simply returns the page. On failure it checks the GFP flags and other conditions to decide whether an OOM kill is appropriate, and only then calls out_of_memory.
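
The watermarks behind ALLOC_WMARK_HIGH can be inspected from userspace. Here is a small sketch (assuming only the standard /proc/zoneinfo layout; values are in pages) that prints each zone together with its min/low/high watermarks:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/zoneinfo", "r");
	char line[256];

	if (!f) {
		perror("/proc/zoneinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Node", 4) == 0)	//e.g. "Node 0, zone Normal"
			printf("%s", line);
		else if (strstr(line, "min ") || strstr(line, "low ") ||
			 strstr(line, "high "))	//per-zone watermarks, in pages
			printf("%s", line);
	}
	fclose(f);
	return 0;
}

With the entry checks out of the way, here is out_of_memory, the heart of the OOM killer: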

bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;
	enum oom_constraint constraint = CONSTRAINT_NONE;

	//freeze_processes() sets oom_killer_disabled, i.e. forbids OOM, because OOM must not run while tasks are being frozen
	if (oom_killer_disabled)
		return false;

	if (!is_memcg_oom(oc)) {//for a global (non-memcg) OOM, let the oom_notify_list notifiers try to free memory first
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0)
			/* Got some memory back in the last second. */
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	//if the current task is already exiting or about to free its memory
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);//mark current as the OOM victim
		wake_oom_reaper(current);//wake the OOM reaper to harvest and free its memory
		return true;//current is dying on its own, so the scoring and killing below can be skipped
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory lost its gfp context so we have to
	 * make sure exclude 0 mask - all other users should have at least
	 * ___GFP_DIRECT_RECLAIM to get here.
	 */
	//if the allocation mask is non-zero but lacks __GFP_FS, bail out: the OOM killer does not compensate for IO-less reclaim
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
		return true;

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA and memcg) that may require different handling.
	 */
	//check whether the allocation was constrained (NUMA policy or memcg), which may require different handling
	constraint = constrained_alloc(oc);
	if (constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	//honor sysctl_panic_on_oom, and whether the OOM was triggered via sysrq, to decide whether to panic
	check_panic_on_oom(oc, constraint);

	//if sysctl_oom_kill_allocating_task is set, kill the task that is currently trying to allocate (if it is killable)
	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);//walk all processes and their threads to find a suitable victim
	/* Found nothing?!?! */
	if (!oc->chosen) {//no killable candidate was found; unless the OOM came from sysrq or a memcg, panic
		dump_header(oc, NULL);
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * If we got here due to an actual allocation at the
		 * system level, we cannot survive this and will enter
		 * an endless loop in the allocator. Bail out now.
		 */
		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
			panic("System is deadlocked on memory\n");
	}
	if (oc->chosen && oc->chosen != (void *)-1UL)
		//kill the chosen process
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory");
	return !!oc->chosen;
}

out_of_memory is the core of the OOM machinery. It does three things (the sysctl knobs that steer this flow are shown right after the list):

  1. Call select_bad_process to pick the 'baddest' process
  2. If no suitable process is found, call dump_header to print the OOM report and expose the cause of the OOM
  3. Call oom_kill_process to kill the process chosen in step 1
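
Three vm sysctls seen in the code above influence this flow: vm.panic_on_oom (consulted by check_panic_on_oom), vm.oom_kill_allocating_task, and vm.oom_dump_tasks (used later by dump_header). A trivial reader, just to make the mapping to procfs concrete:

#include <stdio.h>

//print one integer sysctl from its /proc/sys path
static void show(const char *path)
{
	FILE *f = fopen(path, "r");
	int val;

	if (f && fscanf(f, "%d", &val) == 1)
		printf("%-40s = %d\n", path, val);
	if (f)
		fclose(f);
}

int main(void)
{
	show("/proc/sys/vm/panic_on_oom");
	show("/proc/sys/vm/oom_kill_allocating_task");
	show("/proc/sys/vm/oom_dump_tasks");
	return 0;
}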

Let's first look at how select_bad_process chooses the worst process:

static void select_bad_process(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		struct task_struct *p;

		rcu_read_lock();
		
		for_each_process(p)//iterate over every process in the system
			if (oom_evaluate_task(p, oc))//score each candidate
				break;
		rcu_read_unlock();
	}

	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}

select_bad_process walks every process in the system (or, for a memcg OOM, only the tasks of that cgroup) and calls oom_evaluate_task to score each one:

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	unsigned long points;

	//skip init (pid 1), kernel threads and any other task that must not be killed
	if (oom_unkillable_task(task, NULL, oc->nodemask))
		goto next;

	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves unless
	 * the task has MMF_OOM_SKIP because chances that it would release
	 * any memory is quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		goto abort;
	}

	/*
	 * If task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	//if the task was flagged (oom_task_origin) as a heavy allocator to be killed first on OOM, select it right away
	if (oom_task_origin(task)) {
		points = ULONG_MAX;
		goto select;
	}

	//heuristic that scores the task, to decide which candidate to kill
	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
	//keep only the highest score seen so far, so the top scorer ends up selected; everything else is skipped
	if (!points || points < oc->chosen_points)
		goto next;

	/* Prefer thread group leaders for display purposes */
	//on a tie, prefer the already-chosen thread group leader (for display purposes)
	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
		goto next;
select:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;//record the newly selected task and the current top score
	oc->chosen_points = points;
next:
	return 0;
abort:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	oc->chosen = (void *)-1UL;
	return 1;
}

oom_evaluate_task first skips tasks that must not be killed. If a task has been flagged via oom_task_origin as having allocated large amounts of memory, it is selected immediately with the maximum score and no further evaluation. Otherwise the task is scored by oom_badness, and the returned score is compared with the one stored in the oom_control struct; the higher score and its task are kept there. oom_badness, the function that actually hands out scores, is the core of the core.
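
The kernel exposes each task's current badness value as /proc/<pid>/oom_score, so a rough userspace analogue of select_bad_process can be written against procfs (a sketch, assuming only standard procfs files; races with exiting tasks are simply ignored):

#include <ctype.h>
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	DIR *proc = opendir("/proc");
	struct dirent *de;
	long best_score = -1;
	int best_pid = -1;

	if (!proc) {
		perror("/proc");
		return 1;
	}
	while ((de = readdir(proc))) {
		char path[64];
		FILE *f;
		long score;

		if (!isdigit((unsigned char)de->d_name[0]))
			continue;	//not a pid directory
		snprintf(path, sizeof(path), "/proc/%s/oom_score", de->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;	//task exited in the meantime
		if (fscanf(f, "%ld", &score) == 1 && score > best_score) {
			best_score = score;	//track the top scorer, like oc->chosen
			best_pid = atoi(de->d_name);
		}
		fclose(f);
	}
	closedir(proc);
	printf("most likely OOM victim: pid %d (oom_score %ld)\n",
	       best_pid, best_score);
	return 0;
}

Back in the kernel, here is oom_badness itself: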

unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long points;
	long adj;

	//unkillable tasks are skipped with a score of 0
	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	p = find_lock_task_mm(p);//find a thread of p that still owns an mm and take task_lock() on it
	if (!p)
		return 0;

	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable or have been already oom reaped or the are in
	 * the middle of vfork
	 */
	//fetch the task's oom_score_adj
	adj = (long)p->signal->oom_score_adj;
	//OOM_SCORE_ADJ_MIN means the task has opted out of the 'bad' contest entirely, so it scores 0
	if (adj == OOM_SCORE_ADJ_MIN ||
			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
			in_vfork(p)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	//compute the score: each task's rss, swap entries and page tables, expressed in pages of RAM
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	task_unlock(p);//drop the task lock

	/* Normalize to oom_score_adj units */
	//normalize oom_score_adj: each adj unit is worth totalpages/1000 pages, which shows how the knob biases the final score
	adj *= totalpages / 1000;
	//add the normalized adj to the baseline to obtain the task's final score
	points += adj;

	/*
	 * Never return 0 for an eligible task regardless of the root bonus and
	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	return points > 0 ? points : 1;
}

oom_badness first skips unkillable tasks, then takes the task lock and reads the task's oom_score_adj. A task whose oom_score_adj is the minimum value never enters the contest (setting this parameter is how we make specific processes immune to the OOM killer); tasks whose mm->flags carry MMF_OOM_SKIP, and tasks in the middle of vfork(), are excluded as well. Finally the score is computed: the memory the task consumes (rss, swap entries and page tables, in pages) plus the normalized oom_score_adj.
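
To make the formula concrete, here is a rough userspace re-computation of the score for the current process. It is only an approximation: rss comes from /proc/self/statm, oom_score_adj from procfs, sysconf(_SC_PHYS_PAGES) stands in for oc->totalpages, and the swap and page-table terms are omitted for brevity:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long rss_pages = 0, adj = 0;
	long totalpages = sysconf(_SC_PHYS_PAGES);	//stand-in for oc->totalpages
	FILE *f;

	f = fopen("/proc/self/statm", "r");
	if (f) {
		fscanf(f, "%*ld %ld", &rss_pages);	//2nd field: resident pages
		fclose(f);
	}
	f = fopen("/proc/self/oom_score_adj", "r");
	if (f) {
		fscanf(f, "%ld", &adj);
		fclose(f);
	}

	long points = rss_pages + adj * (totalpages / 1000);	//points += normalized adj
	if (points < 1)
		points = 1;	//the kernel never returns 0 for an eligible task
	printf("rss=%ld pages, oom_score_adj=%ld, estimated badness=%ld\n",
	       rss_pages, adj, points);
	return 0;
}
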
That covers how select_bad_process picks the 'baddest' process. Now let's see how dump_header prints the OOM report:

static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	//report which task invoked the oom-killer
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask,
		nodemask_pr_args(oc->nodemask), oc->order,
			current->signal->oom_score_adj);
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");

	cpuset_print_current_mems_allowed();
	dump_stack();//dump the call stack at the OOM site
	if (is_memcg_oom(oc))
		mem_cgroup_print_oom_info(oc->memcg, p);
	else {
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);//dump memory usage of the whole system
		if (is_dump_unreclaim_slabs())
			dump_unreclaimable_slab();
	}
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc->memcg, oc->nodemask);//dump per-task memory usage for every process in the system
}

static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	struct task_struct *p;
	struct task_struct *task;

	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
	rcu_read_lock();
	for_each_process(p) {
		if (oom_unkillable_task(p, memcg, nodemask))//unkillable tasks are not listed
			continue;

		task = find_lock_task_mm(p);//kernel threads have no mm of their own and cannot be OOM-killed, so they are not listed either
		if (!task) {
			/*
			 * This is a kthread or all of p's threads have already
			 * detached their mm's.  There's no need to report
			 * them; they can't be oom killed anyway.
			 */
			continue;
		}

		//print pid, kuid, tgid, total_vm, rss and related counters
		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
			task->pid, from_kuid(&init_user_ns, task_uid(task)),
			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
			mm_pgtables_bytes(task->mm),
			get_mm_counter(task->mm, MM_SWAPENTS),
			task->signal->oom_score_adj, task->comm);
		task_unlock(task);
	}
	rcu_read_unlock();
}

dump_header prints the following kinds of information:

  • which task invoked the OOM killer
  • the call stack at the OOM site
  • memory usage of the whole system, plus per-task memory usage when sysctl_oom_dump_tasks is set
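
The oom_score_adj column in that task dump can be read and written per process through procfs, and the kernel also exposes its live badness value as /proc/<pid>/oom_score. A short sketch that prints the current score and then opts the process out of OOM selection by writing OOM_SCORE_ADJ_MIN (-1000); note that lowering oom_score_adj below its current value requires CAP_SYS_RESOURCE:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score", "r");
	int score = -1;

	if (f) {
		fscanf(f, "%d", &score);	//the kernel's live oom_badness value
		fclose(f);
	}
	printf("current oom_score: %d\n", score);

	f = fopen("/proc/self/oom_score_adj", "w");
	if (!f || fprintf(f, "-1000\n") < 0)	//OOM_SCORE_ADJ_MIN: exempt from OOM
		perror("oom_score_adj");
	if (f)
		fclose(f);
	return 0;
}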

Now for oom_kill_process, the function that kills the process chosen in step one:

static void oom_kill_process(struct oom_control *oc, const char *message)
{
	struct task_struct *p = oc->chosen;
	unsigned int points = oc->chosen_points;
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t;
	struct mem_cgroup *oom_group;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly
	 */
	task_lock(p);
	
	//if p is already exiting or about to free its memory anyway
	if (task_will_free_mem(p)) {
		mark_oom_victim(p);//mark p as the OOM victim
		wake_oom_reaper(p);//wake the OOM reaper to harvest and free its memory
		task_unlock(p);//drop the task lock
		put_task_struct(p);//drop the reference taken on p; the task_struct is freed once the last reference is gone
		return;
	}
	task_unlock(p);

	if (__ratelimit(&oom_rs))
		dump_header(oc, p);//print the call stack, memory info and per-task memory consumption

	//log the name, pid and score of the process about to be killed
	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);

	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest oom_badness() score is sacrificed for its
	 * parent.  This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	read_lock(&tasklist_lock);

	/*
	 * The task 'p' might have already exited before reaching here. The
	 * put_task_struct() will free task_struct 'p' while the loop still try
	 * to access the field of 'p', so, get an extra reference.
	 */
	get_task_struct(p);
	//iterate over p's threads and each thread's children
	for_each_thread(p, t) {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			//children sharing p's mm free nothing by dying, so skip them; a child with its own mm may be sacrificed in place of its parent
			if (process_shares_mm(child, p->mm))
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			//score the child
			child_points = oom_badness(child,
				oc->memcg, oc->nodemask, oc->totalpages);
			//track the highest scorer as victim, with its score in victim_points
			if (child_points > victim_points) {
				put_task_struct(victim);//drop the reference held on the previous victim
				victim = child;
				victim_points = child_points;
				get_task_struct(victim);
			}
		}
	}
	put_task_struct(p);//drop the extra reference taken on p above
	read_unlock(&tasklist_lock);

	/*
	 * Do we need to kill the entire memory cgroup?
	 * Or even one of the ancestor memory cgroups?
	 * Check this out before killing the victim task.
	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

	__oom_kill_process(victim);

	/*
	 * If necessary, kill all tasks in the selected memory cgroup.
	 */
	if (oom_group) {
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
		mem_cgroup_put(oom_group);
	}
}

oom_kill_process first checks whether the chosen process is already exiting or about to free its memory; if so it merely marks it as the OOM victim and wakes the OOM reaper to harvest it. Otherwise it walks the children of every thread in the process, scores each child that has its own mm with oom_badness, and promotes the highest-scoring child to victim in place of the parent, so as to lose as little completed work as possible. The victim is then handed to __oom_kill_process:

static void __oom_kill_process(struct task_struct *victim)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;

	p = find_lock_task_mm(victim);
	if (!p) {
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Raise event before sending signal: task reaper must see this */
	//raise the OOM_KILL event before sending the signal so the task reaper is guaranteed to see it
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	//send SIGKILL to the victim
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
	mark_oom_victim(victim);//set TIF_MEMDIE to mark the task as killed by the OOM killer
	
	//log the memory footprint of the process being killed
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any.  They don't get access to memory reserves, though, to avoid
	 * depletion of all memory.  This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * its contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	//walk every process to deal with mm sharing
	for_each_process(p) {
		if (!process_shares_mm(p, mm))//skip processes that do not share the victim's mm
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
					task_pid_nr(victim), victim->comm,
					task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No use_mm() user needs to read from the userspace so we are
		 * ok to reap it.
		 */
		//skip kernel threads
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		wake_oom_reaper(victim);//wake the oom_reaper kernel thread to harvest the victim

	mmdrop(mm);//drop the mm reference taken above; the mm (page tables, mm_struct, etc.) goes away with its last reference
	put_task_struct(victim);//drop the victim's task_struct reference, releasing it (cgroup links and all) once the count hits zero
}

__oom_kill_process first raises the OOM_KILL event so the task reaper will see it, then sends SIGKILL to the victim, marks it as an OOM victim, and logs its memory footprint. Next it walks every process in the system and SIGKILLs user processes in other thread groups that share the victim's mm (without granting them access to memory reserves); if the mm is also shared with init, it is pinned and cannot be reaped at all. Finally, if reaping is possible, wake_oom_reaper wakes the OOM Reaper kernel thread. Let's see how wake_oom_reaper performs that wake-up:

static void wake_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	get_task_struct(tsk);

	spin_lock(&oom_reaper_lock);
	tsk->oom_reaper_list = oom_reaper_list;
	oom_reaper_list = tsk;
	spin_unlock(&oom_reaper_lock);
	trace_wake_reaper(tsk->pid);
	wake_up(&oom_reaper_wait);
}

wake_oom_reaper first checks whether the victim's mm is already queued for reaping; if not, it pushes the task onto oom_reaper_list under oom_reaper_lock and wakes the oom_reaper_wait queue so the victim gets harvested. But what exactly is oom_reaper_wait? Digging around, it turns out the kernel sets all of this up in oom_init, which runs at boot because it is registered via subsys_initcall. Here are the OOM-reaper globals and oom_init:

static struct task_struct *oom_reaper_th;//task_struct of the oom_reaper kernel thread
//the OOM Reaper wait queue: the oom_reaper thread sleeps here until an OOM occurs, then pulls victims off oom_reaper_list
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;//list of tasks waiting to be reaped
static DEFINE_SPINLOCK(oom_reaper_lock);//lock protecting oom_reaper_list

static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
	return 0;
}
subsys_initcall(oom_init)

oom_reaper_wait is statically initialized with DECLARE_WAIT_QUEUE_HEAD, and oom_init spawns a kernel thread that runs the oom_reaper function:

static int oom_reaper(void *unused)
{
	while (true) {
		struct task_struct *tsk = NULL;
		
		//oom_reaper sleeps here until an OOM occurs and wake_oom_reaper() wakes it
		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);//reap the victim the OOM killer selected
	}

	return 0;
}

oom_reaper sleeps on the oom_reaper_wait wait queue (a wait queue, not a workqueue), which is exactly the queue wake_oom_reaper wakes. Once woken, it takes oom_reaper_lock, pops the next victim off oom_reaper_list, drops the lock, and calls oom_reap_task on the victim.
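
The shape here, a lock-protected list plus a wait queue with wake_oom_reaper as producer and oom_reaper as consumer, maps directly onto a userspace mutex/condition-variable pair. A minimal sketch of the same pattern (hypothetical names, build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct work {
	int victim_pid;		//stand-in for the task to reap
	struct work *next;
};

static struct work *reap_list;	//like oom_reaper_list
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	//like oom_reaper_lock
static pthread_cond_t wait_q = PTHREAD_COND_INITIALIZER;	//like oom_reaper_wait

//like wake_oom_reaper(): queue the victim and wake the reaper
static void wake_reaper(int pid)
{
	struct work *w = malloc(sizeof(*w));

	w->victim_pid = pid;
	pthread_mutex_lock(&lock);
	w->next = reap_list;
	reap_list = w;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&wait_q);
}

//like oom_reaper(): sleep until work is queued, then pop and process it
static void *reaper(void *unused)
{
	for (;;) {
		struct work *w;

		pthread_mutex_lock(&lock);
		while (!reap_list)	//like wait_event_freezable(...)
			pthread_cond_wait(&wait_q, &lock);
		w = reap_list;
		reap_list = w->next;
		pthread_mutex_unlock(&lock);

		printf("reaping fake victim %d\n", w->victim_pid);
		free(w);
	}
	return NULL;
}

int main(void)
{
	pthread_t th;

	pthread_create(&th, NULL, reaper, NULL);	//like kthread_run() in oom_init()
	wake_reaper(1234);
	wake_reaper(5678);
	sleep(1);	//let the reaper drain the list
	return 0;
}

Back in the kernel, oom_reap_task is what the reaper runs for each victim: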

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* Retry the down_read_trylock(mmap_sem) a few times */
	//retry the reap up to 10 times, 100 ms apart
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	if (attempts <= MAX_OOM_REAP_RETRIES ||
	    test_bit(MMF_OOM_SKIP, &mm->flags))
		goto done;

	//reaping failed: report it and dump the locks held in the system
	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
		task_pid_nr(tsk), tsk->comm);
	debug_show_all_locks();

done:
	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from OOM killer because it has been either reaped or
	 * somebody can't call up_write(mmap_sem).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop a reference taken by wake_oom_reaper */
	put_task_struct(tsk);
}

oom_reap_task retries up to 10 times, 100 ms apart (roughly one second in total), calling oom_reap_task_mm on each attempt; on success it finishes quietly, and if every attempt fails it logs an error together with all locks currently held. Now let's see how oom_reap_task_mm reaps:

static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = true;

	if (!down_read_trylock(&mm->mmap_sem)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_sem for reading because it serializes against the
	 * down_write();up_write() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		trace_skip_task_reaping(tsk->pid);
		goto out_unlock;
	}

	trace_start_task_reaping(tsk->pid);

	/* failed to reap part of the address space. Try again later */
	//the function that does the actual reaping
	ret = __oom_reap_task_mm(mm);
	if (!ret)
		goto out_finish;

	//log the victim's remaining memory footprint after the reaper has run
	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
	trace_finish_task_reaping(tsk->pid);
out_unlock:
	up_read(&mm->mmap_sem);

	return ret;
}

oom_reap_task_mm takes mmap_sem for reading and delegates the actual reaping to __oom_reap_task_mm:

bool __oom_reap_task_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	bool ret = true;

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable. No barriers really needed because unmapping
	 * should imply barriers already and the reader would hit a page fault
	 * if it stumbled over a reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	//walk every VMA of the process
	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
		//skip VMAs that are locked, hugetlb-backed or PFN-mapped
		if (!can_madv_dontneed_vma(vma))
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * which are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
		 */
		//anonymous, non-shared mappings
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
			const unsigned long start = vma->vm_start;
			const unsigned long end = vma->vm_end;
			struct mmu_gather tlb;

			tlb_gather_mmu(&tlb, mm, start, end);//set up the mmu_gather used for the page-table teardown

			//tell MMU notifiers that the range is about to be invalidated
			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
				tlb_finish_mmu(&tlb, start, end);//release the mmu_gather
				ret = false;
				continue;
			}
			unmap_page_range(&tlb, vma, start, end, NULL);//unmap the range

			//tell MMU notifiers that the invalidation is complete
			mmu_notifier_invalidate_range_end(mm, start, end);
			tlb_finish_mmu(&tlb, start, end);//release the mmu_gather
		}
	}

	return ret;
}

static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}

__oom_reap_task_mm walks every VMA of the victim. It skips VMAs that are locked, hugetlb-backed or PFN-mapped, and for each anonymous, non-shared VMA it performs the following steps (see the sketch after this list):

  1. Set up an mmu_gather structure for the page-table teardown
  2. Notify MMU notifiers that the VMA range is about to be invalidated
  3. Unmap the VMA's page range
  4. Notify MMU notifiers that the invalidation is complete
  5. Release the mmu_gather structure
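
This treatment of anonymous, non-shared mappings is essentially what madvise(MADV_DONTNEED) does from userspace, which is exactly what the helper name can_madv_dontneed_vma hints at: the mapping stays valid, but the backing pages are dropped, and the next read faults in a fresh zero page. A small demo:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	//64 MiB anonymous private mapping
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xff, len);	//fault every page in: RSS grows by 64 MiB
	printf("before: p[0]=%#x\n", (unsigned char)p[0]);

	if (madvise(p, len, MADV_DONTNEED)) {	//drop the backing pages
		perror("madvise");
		return 1;
	}
	printf("after:  p[0]=%#x (zero-filled on the next fault)\n",
	       (unsigned char)p[0]);

	munmap(p, len);
	return 0;
}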

At this point the OOM reaper has harvested the victim and its memory has been freed, which concludes the OOM path.
