Linux Memory Controller (Part 1)

1. memory_cgrp_subsys

// cftype: defines and describes one interface (control) file of a control group
// cftype->private: encodes the resource type and the resource attribute
// dfl_cftypes and legacy_cftypes both point to arrays of cftype
struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	// interface files for the default hierarchy (cgroup v2)
	.dfl_cftypes = memory_files,
	// interface files for the legacy hierarchy (cgroup v1)
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

2. dfl_cftypes

static struct cftype memory_files[] = {
	{
		// current memory usage of the control group and all of its descendants
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		// memory usage low boundary (best-effort protection)
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		// memory usage high boundary
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		// memory usage hard limit
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		// memory event counters
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		// detailed memory usage statistics
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_stat_show,
	},
	{ }	/* terminate */
};

static struct cftype swap_files[] = {
	{
		// current swap usage of the control group and all of its descendants
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		// swap usage hard limit
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{ }	/* terminate */
};

3. legacy_cftypes

// The root control group puts no limit on resource usage, and usage limits cannot be configured on it.
// Processes belong to the root control group by default; a child process inherits the control group its parent has joined.
static struct cftype mem_cgroup_legacy_files[] = {
	{
		// current memory usage
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		// see Section 5
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum memory usage
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		// see Section 7
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// memory usage hard limit
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		// see Section 6
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// memory usage soft limit
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// memory usage statistics
		.name = "stat",
		.seq_show = memcg_stat_show,
	},
	{
		.name = "force_empty",
		.write = mem_cgroup_force_empty_write,
	},
	{
		// hierarchical accounting: when enabled, memory usage of every memory control group in the subtree is also charged to this control group
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		// register memory monitoring events
		.name = "cgroup.event_control",		/* XXX: for compat */
		.write = memcg_write_event_control,
		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		// whether the OOM killer is disabled for this control group
		.name = "oom_control",
		.seq_show = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
	},
	{
		// memory pressure level notifications
		.name = "pressure_level",
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memcg_numa_stat_show,
	},
#endif
	{
		// kernel memory usage hard limit
		.name = "kmem.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// kernel memory usage
		.name = "kmem.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.failcnt",
		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum kernel memory usage
		.name = "kmem.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
	{
		.name = "kmem.slabinfo",
		.seq_start = memcg_slab_start,
		.seq_next = memcg_slab_next,
		.seq_stop = memcg_slab_stop,
		.seq_show = memcg_slab_show,
	},
#endif
	{
		// TCP buffer memory usage hard limit
		.name = "kmem.tcp.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// TCP buffer memory usage
		.name = "kmem.tcp.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.failcnt",
		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum TCP buffer memory usage
		.name = "kmem.tcp.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static struct cftype memsw_cgroup_files[] = {
	{
		// memory + swap usage
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum memory + swap usage
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// memory + swap usage hard limit
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

4. mem_cgroup

struct mem_cgroup {
	// base of all resource controllers (the embedded cgroup_subsys_state)
	struct cgroup_subsys_state css;

	/* Private memcg ID. Used to ID objects that outlive the cgroup */
	struct mem_cgroup_id id;

	/* Accounted resources */
	// _MEM page counter: tracks the memory limit and current usage [see Section 4.1]
	struct page_counter memory;
	struct page_counter swap;

	/* Legacy consumer-oriented counters */
	// _MEMSWAP page counter: tracks the memory + swap limit and current usage
	struct page_counter memsw;
	// _KMEM page counter: tracks the kernel memory limit and current usage
	struct page_counter kmem;
	// _TCP page counter: tracks the TCP buffer memory limit and current usage
	struct page_counter tcpmem;

	/* Normal memory consumption range */
	// memory usage low boundary
	unsigned long low;
	// memory usage high boundary
	unsigned long high;

	/* Range enforcement for interrupt charges */
	struct work_struct high_work;
    
	// memory usage soft limit
	unsigned long soft_limit;

    /* vmpressure notifications */
	struct vmpressure vmpressure;

	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	// whether hierarchical accounting is used
	bool use_hierarchy;
	/* protected by memcg_oom_lock */
	bool		oom_lock;
	int		under_oom;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* handle for "memory.events" */
	struct cgroup_file events_file;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t		moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t		move_lock;
	struct task_struct	*move_lock_task;
	unsigned long		move_lock_flags;
	/*
	 * percpu counter.
	 */
	// per-CPU statistics of the memory control group (memory usage and memory events) [see Section 4.3]
	struct mem_cgroup_stat_cpu __percpu *stat;

	unsigned long		socket_pressure;

	/* Legacy tcp memory accounting */
	bool			tcpmem_active;
	int			tcpmem_pressure;

#ifndef CONFIG_SLOB
        /* Index in the kmem_cache->memcg_params.memcg_caches array */
	int kmemcg_id;
	enum memcg_kmem_state kmem_state;
	struct list_head kmem_caches;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
	struct list_head cgwb_list;
	struct wb_domain cgwb_domain;
#endif

	/* List of events which userspace want to receive */
	struct list_head event_list;
	spinlock_t event_list_lock;

	// one mem_cgroup_per_node instance per NUMA node [see Section 4.2]
	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};
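
Because css is the first member of struct mem_cgroup, the read/write handlers discussed below convert a cgroup_subsys_state pointer back to the owning mem_cgroup with a container_of wrapper. A sketch of mem_cgroup_from_css (the exact form varies slightly between kernel versions):

// sketch: recover the mem_cgroup that embeds the given css
static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct mem_cgroup, css) : NULL;
}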

4.1 page_counter

// page counter
struct page_counter {
	// current count, in pages
	atomic_long_t count;
	// hard limit
	unsigned long limit;
	// if the parent control group has use_hierarchy enabled, parent points to the
	// parent's page counter; otherwise it is NULL
	struct page_counter *parent;

	/* legacy */
	// historical maximum of count (the watermark)
	unsigned long watermark;
	// number of times the limit was hit
	unsigned long failcnt;
};
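
page_counter_read, used by all of the read handlers below, simply loads the atomic count; the *_in_bytes interface files then multiply the returned page count by PAGE_SIZE:

// from include/linux/page_counter.h: return the current count in pages
static inline unsigned long page_counter_read(struct page_counter *counter)
{
	return atomic_long_read(&counter->count);
}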

4.2 mem_cgroup_per_node

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_node {
	// the memory control group's private LRU lists:
	// once a process joins a memory control group, pages allocated to it are no longer
	// put on the node's LRU lists but on the control group's private LRU lists
	struct lruvec		lruvec;
	struct lruvec_stat __percpu *lruvec_stat;
	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	// amount by which memory usage exceeds the soft limit = mem_cgroup.memory.count - mem_cgroup.soft_limit
	unsigned long		usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	// whether this memory control group is on the soft-limit tree;
	// when memory usage exceeds the soft limit, the mem_cgroup_per_node instance is
	// inserted into the soft-limit tree via tree_node
	bool			on_tree;
	// points back to the memory control group this mem_cgroup_per_node belongs to
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};
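
The nodeinfo array at the end of struct mem_cgroup is indexed directly by node id; a sketch of the per-node lookup helper:

// sketch: fetch the per-node state of a memory control group
static struct mem_cgroup_per_node *
mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
{
	return memcg->nodeinfo[nid];
}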

4.3 mem_cgroup_stat_cpu

struct mem_cgroup_stat_cpu {
	// per-state page counts for the control group [see Section 4.4]
	long count[MEMCG_NR_STAT];
	// number of times each event type occurred in the control group [see Section 4.5]
	unsigned long events[MEMCG_NR_EVENTS];
	unsigned long nr_page_events;
	// event counts for the control group's sampling targets [see Section 4.6]
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

4.4 memcg_stat_item

// memcg-specific page states, defined on top of the generic node stat items
enum memcg_stat_item {
	// page cache
	MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
	// anonymous memory
	MEMCG_RSS,
	// anonymous huge pages
	MEMCG_RSS_HUGE,
	// swap usage
	MEMCG_SWAP,
	MEMCG_SOCK,
	/* XXX: why are these zone and not node counters? */
	MEMCG_KERNEL_STACK_KB,
	MEMCG_NR_STAT,
};

4.5 memcg_event_item

/* Cgroup-specific events, on top of universal VM events */
enum memcg_event_item {
	MEMCG_LOW = NR_VM_EVENT_ITEMS,
	MEMCG_HIGH,
	MEMCG_MAX,
	MEMCG_OOM,
	MEMCG_NR_EVENTS,
};

4.6 mem_cgroup_events_target

enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};

5. mem_cgroup_read_u64

// resource types
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

// resource attributes
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};
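
The MEMFILE_* macros pack these two enums into cftype->private and unpack them again in the handlers below (as defined in mm/memcontrol.c): the resource type occupies the high 16 bits and the attribute the low 16 bits.

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)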

static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	// decode the resource type and pick the matching page counter from the mem_cgroup
	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	// decode the resource attribute
	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		// read usage_in_bytes [see Section 5.1]
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		// read memsw.usage_in_bytes
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		// read *.limit_in_bytes, i.e. the usage hard limit
		return (u64)counter->limit * PAGE_SIZE;
	case RES_MAX_USAGE:
		// read *.max_usage_in_bytes, i.e. the recorded maximum usage
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		// read *.failcnt
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		// read the soft limit
		return (u64)memcg->soft_limit * PAGE_SIZE;
	default:
		BUG();
	}
}

5.1 mem_cgroup_usage

static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val = 0;

	// the root control group
	if (mem_cgroup_is_root(memcg)) {
		struct mem_cgroup *iter;

		// walk the root control group and every control group in its subtree
		for_each_mem_cgroup_tree(iter, memcg) {
			// number of page-cache pages in the control group [see Section 5.2]
			val += memcg_page_state(iter, MEMCG_CACHE);
			// number of anonymous pages in the control group
			val += memcg_page_state(iter, MEMCG_RSS);
			// when swap is included, also add the swap usage
			if (swap)
				val += memcg_page_state(iter, MEMCG_SWAP);
		}
	} else {
		if (!swap)
			// read the count of the _MEM page counter
			val = page_counter_read(&memcg->memory);
		else
			// read the count of the _MEMSWAP page counter
			val = page_counter_read(&memcg->memsw);
	}
	return val;
}
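
for_each_mem_cgroup_tree walks the given root and its entire subtree using the memcg css iterator; roughly:

// mm/memcontrol.c: iterate over @root and all of its descendants
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))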

5.2 memcg_page_state

/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
					     int idx)
{
	long val = 0;
	int cpu;

	// iterate over every CPU's mem_cgroup_stat_cpu and sum the pages in the
	// state selected by idx for this control group
	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);

	if (val < 0)
		val = 0;

	return val;    
}

6. mem_cgroup_write

static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	// find the control group that owns this interface file
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	buf = strstrip(buf);
	// parse the written value (optionally suffixed with K, M, G, T, P or E) into a number of pages; "-1" means unlimited
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	// decode the resource attribute
	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		// as noted above, no usage limit can be set on the root control group
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		// decode the resource type; writing a *.limit_in_bytes file sets the hard limit
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			// set the memory usage hard limit [see Section 6.1]
			ret = mem_cgroup_resize_limit(memcg, nr_pages);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
			break;
		case _KMEM:
			ret = memcg_update_kmem_limit(memcg, nr_pages);
			break;
		case _TCP:
			ret = memcg_update_tcp_limit(memcg, nr_pages);
			break;
		}
		break;
	// set the soft limit
	case RES_SOFT_LIMIT:
		memcg->soft_limit = nr_pages;
		ret = 0;
		break;
	}
	return ret ?: nbytes;
}
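
page_counter_memparse, called above, turns the user string into a page count; a sketch of what it does (mm/page_counter.c), where the string passed as max ("-1" here) selects "no limit":

int page_counter_memparse(const char *buf, const char *max,
			  unsigned long *nr_pages)
{
	char *end;
	u64 bytes;

	// the literal max string ("-1") maps to PAGE_COUNTER_MAX, i.e. unlimited
	if (!strcmp(buf, max)) {
		*nr_pages = PAGE_COUNTER_MAX;
		return 0;
	}

	// memparse() understands the K/M/G/T/P/E suffixes; convert bytes to pages
	bytes = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);

	return 0;
}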

6.1 mem_cgroup_resize_limit

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	unsigned long curusage;
	unsigned long oldusage;
	bool enlarge = false;
	int retry_count;
	int ret;

	/*
	 * For keeping hierarchical_reclaim simple, how long we should retry
	 * is depends on callers. We set our retry-count to be function
	 * of # of children which we should visit in this loop.
	 */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
		      mem_cgroup_count_children(memcg);

	// read the current memory usage
	oldusage = page_counter_read(&memcg->memory);

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_limit_mutex);
		// the new limit must not exceed the memory + swap limit
		if (limit > memcg->memsw.limit) {
			mutex_unlock(&memcg_limit_mutex);
			ret = -EINVAL;
			break;
		}
		// the hard limit is being raised
		if (limit > memcg->memory.limit)
			enlarge = true;
		// update the hard limit
		ret = page_counter_limit(&memcg->memory, limit);
		mutex_unlock(&memcg_limit_mutex);

		// if the update succeeded, break out of the loop
		if (!ret)
			break;

		// otherwise the current usage still exceeds the new limit, so reclaim memory from this control group
		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);

		// re-read the memory usage after reclaim
		curusage = page_counter_read(&memcg->memory);
		/* Usage is reduced ? */
		// if the usage did not drop, consume one retry
		if (curusage >= oldusage)
			retry_count--;
		else
			// otherwise remember the lower usage and try again
			oldusage = curusage;
	} while (retry_count);

	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}
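
page_counter_limit, called in the loop above, only installs the new limit when the current count does not exceed it; a simplified sketch (the real function in mm/page_counter.c relies on xchg() and a retry loop to stay coherent with concurrent charges):

int page_counter_limit(struct page_counter *counter, unsigned long limit)
{
	for (;;) {
		unsigned long old;
		long count;

		// refuse the new limit while usage is still above it; the caller
		// (mem_cgroup_resize_limit) then reclaims memory and retries
		count = atomic_long_read(&counter->count);
		if (count > limit)
			return -EBUSY;

		old = xchg(&counter->limit, limit);

		// if no concurrent charge raced past the snapshot, the limit stands;
		// otherwise restore the old limit and try again
		if (atomic_long_read(&counter->count) <= count)
			return 0;

		counter->limit = old;
	}
}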

7. mem_cgroup_reset

static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *counter;

	// decode the resource type
	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	// decode the resource attribute
	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	// writing *.max_usage_in_bytes (any value) resets the recorded maximum
	case RES_MAX_USAGE:
		// reset the watermark to the current usage [see Section 7.1]
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}

7.1 page_counter_reset_watermark

static inline void page_counter_reset_watermark(struct page_counter *counter)
{
	// reset the watermark (recorded maximum) to the current usage
	counter->watermark = page_counter_read(counter);
}
