1. memory_cgrp_subsys
/*
 * Subsystem descriptor for the memory controller.  It ties together the
 * css lifecycle hooks, the task-attach hooks and the two control-file
 * tables (memory_files and mem_cgroup_legacy_files).
 */
struct cgroup_subsys memory_cgrp_subsys = {
	/* control-file tables */
	.dfl_cftypes	= memory_files,
	.legacy_cftypes	= mem_cgroup_legacy_files,

	/* css lifecycle hooks */
	.css_alloc	= mem_cgroup_css_alloc,
	.css_online	= mem_cgroup_css_online,
	.css_offline	= mem_cgroup_css_offline,
	.css_released	= mem_cgroup_css_released,
	.css_free	= mem_cgroup_css_free,
	.css_reset	= mem_cgroup_css_reset,

	/* task attach hooks */
	.can_attach	= mem_cgroup_can_attach,
	.cancel_attach	= mem_cgroup_cancel_attach,
	.post_attach	= mem_cgroup_move_task,

	.bind		= mem_cgroup_bind,
	.early_init	= 0,
};
2. dfl_cftypes
/*
 * Control-file table installed as memory_cgrp_subsys.dfl_cftypes.
 * Every entry is flagged CFTYPE_NOT_ON_ROOT.  Entries with only a read
 * handler are read-only; the array is closed by an empty sentinel entry.
 */
static struct cftype memory_files[] = {
	{
		.name		= "current",
		.read_u64	= memory_current_read,
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{
		.name		= "low",
		.seq_show	= memory_low_show,
		.write		= memory_low_write,
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{
		.name		= "high",
		.seq_show	= memory_high_show,
		.write		= memory_high_write,
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{
		.name		= "max",
		.seq_show	= memory_max_show,
		.write		= memory_max_write,
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{
		/* file_offset points at the events_file member of mem_cgroup */
		.name		= "events",
		.seq_show	= memory_events_show,
		.file_offset	= offsetof(struct mem_cgroup, events_file),
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{
		.name		= "stat",
		.seq_show	= memory_stat_show,
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
/*
 * Swap control files: a read-only "swap.current" and a read/write
 * "swap.max".  Both are flagged CFTYPE_NOT_ON_ROOT; the array is closed
 * by an empty sentinel entry.
 */
static struct cftype swap_files[] = {
	{
		.name		= "swap.current",
		.read_u64	= swap_current_read,
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{
		.name		= "swap.max",
		.seq_show	= swap_max_show,
		.write		= swap_max_write,
		.flags		= CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
3. legacy_cftypes
/*
 * Control-file table installed as memory_cgrp_subsys.legacy_cftypes.
 *
 * Counter-backed files share three handlers (mem_cgroup_read_u64,
 * mem_cgroup_write, mem_cgroup_reset).  Each such entry packs a resource
 * type (_MEM, _KMEM, _TCP) and an attribute (RES_*) into .private with
 * MEMFILE_PRIVATE(); the handlers unpack them again with MEMFILE_TYPE()
 * and MEMFILE_ATTR().  The array is closed by an empty sentinel entry.
 */
static struct cftype mem_cgroup_legacy_files[] = {
	/* --- main memory counter --- */
	{
		.name		= "usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64	= mem_cgroup_read_u64,
	},
	{
		.name		= "max_usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},
	{
		.name		= "limit_in_bytes",
		.private	= MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_write,
	},
	{
		.name		= "soft_limit_in_bytes",
		.private	= MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_write,
	},
	{
		.name		= "failcnt",
		.private	= MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},

	/* --- files with dedicated handlers --- */
	{
		.name		= "stat",
		.seq_show	= memcg_stat_show,
	},
	{
		.name		= "force_empty",
		.write		= mem_cgroup_force_empty_write,
	},
	{
		.name		= "use_hierarchy",
		.read_u64	= mem_cgroup_hierarchy_read,
		.write_u64	= mem_cgroup_hierarchy_write,
	},
	{
		.name		= "cgroup.event_control",
		.flags		= CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
		.write		= memcg_write_event_control,
	},
	{
		.name		= "swappiness",
		.read_u64	= mem_cgroup_swappiness_read,
		.write_u64	= mem_cgroup_swappiness_write,
	},
	{
		.name		= "move_charge_at_immigrate",
		.read_u64	= mem_cgroup_move_charge_read,
		.write_u64	= mem_cgroup_move_charge_write,
	},
	{
		.name		= "oom_control",
		.private	= MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
		.seq_show	= mem_cgroup_oom_control_read,
		.write_u64	= mem_cgroup_oom_control_write,
	},
	{
		/* no read or write handler is wired for this file here */
		.name		= "pressure_level",
	},
#ifdef CONFIG_NUMA
	{
		.name		= "numa_stat",
		.seq_show	= memcg_numa_stat_show,
	},
#endif

	/* --- kmem counter --- */
	{
		.name		= "kmem.limit_in_bytes",
		.private	= MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_write,
	},
	{
		.name		= "kmem.usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64	= mem_cgroup_read_u64,
	},
	{
		.name		= "kmem.failcnt",
		.private	= MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},
	{
		.name		= "kmem.max_usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
	{
		.name		= "kmem.slabinfo",
		.seq_start	= memcg_slab_start,
		.seq_next	= memcg_slab_next,
		.seq_stop	= memcg_slab_stop,
		.seq_show	= memcg_slab_show,
	},
#endif

	/* --- tcpmem counter --- */
	{
		.name		= "kmem.tcp.limit_in_bytes",
		.private	= MEMFILE_PRIVATE(_TCP, RES_LIMIT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_write,
	},
	{
		.name		= "kmem.tcp.usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_TCP, RES_USAGE),
		.read_u64	= mem_cgroup_read_u64,
	},
	{
		.name		= "kmem.tcp.failcnt",
		.private	= MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},
	{
		.name		= "kmem.tcp.max_usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},
	{ },	/* terminate */
};
/*
 * Control files for the memsw counter (type _MEMSWAP).  They reuse the
 * shared handlers mem_cgroup_read_u64, mem_cgroup_write and
 * mem_cgroup_reset; the attribute is selected through .private.
 * The array is closed by an empty sentinel entry.
 */
static struct cftype memsw_cgroup_files[] = {
	{
		.name		= "memsw.usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64	= mem_cgroup_read_u64,
	},
	{
		.name		= "memsw.max_usage_in_bytes",
		.private	= MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},
	{
		.name		= "memsw.limit_in_bytes",
		.private	= MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_write,
	},
	{
		.name		= "memsw.failcnt",
		.private	= MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.read_u64	= mem_cgroup_read_u64,
		.write		= mem_cgroup_reset,
	},
	{ },	/* terminate */
};
4. mem_cgroup
/*
 * Per-cgroup state of the memory controller.
 *
 * The control-file handlers obtain this structure from a
 * cgroup_subsys_state via mem_cgroup_from_css().  The structure ends in a
 * flexible array of per-node pointers, so nodeinfo must stay the last
 * member.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;

	struct mem_cgroup_id id;

	/*
	 * Resource counters.  memory, memsw, kmem and tcpmem are the
	 * counters selected by the _MEM, _MEMSWAP, _KMEM and _TCP file
	 * types in mem_cgroup_read_u64() and mem_cgroup_reset().
	 */
	struct page_counter memory;
	struct page_counter swap;
	struct page_counter memsw;
	struct page_counter kmem;
	struct page_counter tcpmem;

	/* NOTE(review): presumably the "low"/"high" file settings - confirm */
	unsigned long low;
	unsigned long high;

	struct work_struct high_work;

	/*
	 * Stored in pages: mem_cgroup_write() assigns the parsed page count
	 * and mem_cgroup_read_u64() multiplies by PAGE_SIZE when reporting.
	 */
	unsigned long soft_limit;

	struct vmpressure vmpressure;

	bool use_hierarchy;

	/* OOM-related state */
	bool oom_lock;
	int under_oom;

	int swappiness;
	int oom_kill_disable;

	/* located via offsetof() by the "events" entry of memory_files */
	struct cgroup_file events_file;

	/* usage thresholds; thresholds_lock presumably guards both sets */
	struct mutex thresholds_lock;
	struct mem_cgroup_thresholds thresholds;
	struct mem_cgroup_thresholds memsw_thresholds;

	struct list_head oom_notify;

	/* charge-moving state */
	unsigned long move_charge_at_immigrate;
	atomic_t moving_account;
	spinlock_t move_lock;
	struct task_struct *move_lock_task;
	unsigned long move_lock_flags;

	/* per-cpu statistics; memcg_page_state() sums count[] over all CPUs */
	struct mem_cgroup_stat_cpu __percpu *stat;

	unsigned long socket_pressure;

	bool tcpmem_active;
	int tcpmem_pressure;

#ifndef CONFIG_SLOB
	int kmemcg_id;
	enum memcg_kmem_state kmem_state;
	struct list_head kmem_caches;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t scan_nodes;
	atomic_t numainfo_events;
	atomic_t numainfo_updating;
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
	struct list_head cgwb_list;
	struct wb_domain cgwb_domain;
#endif

	struct list_head event_list;
	spinlock_t event_list_lock;

	/*
	 * C99 flexible array member (was the GNU zero-length form "[0]").
	 * The structure size and layout are unchanged.  Must be last.
	 */
	struct mem_cgroup_per_node *nodeinfo[];
};
4.1 page_counter
/*
 * Resource counter used by the memory controller.  limit and watermark are
 * kept in pages: mem_cgroup_read_u64() multiplies them by PAGE_SIZE before
 * reporting them.
 */
struct page_counter {
/* current usage, stored atomically */
atomic_long_t count;
/* reported for RES_LIMIT */
unsigned long limit;
/* NOTE(review): presumably the counter of the parent group - confirm */
struct page_counter *parent;
/*
 * Reported for RES_MAX_USAGE; page_counter_reset_watermark() sets it to
 * the current page_counter_read() value.
 */
unsigned long watermark;
/* reported unscaled for RES_FAILCNT; mem_cgroup_reset() zeroes it */
unsigned long failcnt;
};
4.2 mem_cgroup_per_node
/*
 * Per-node part of a mem_cgroup.  struct mem_cgroup holds pointers to
 * these in its trailing nodeinfo[] array.
 */
struct mem_cgroup_per_node {
struct lruvec lruvec;
/* per-cpu data */
struct lruvec_stat __percpu *lruvec_stat;
/* one size per (zone, LRU list) pair */
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
/* one iterator slot for each of the DEF_PRIORITY + 1 array entries */
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
/*
 * NOTE(review): tree_node, usage_in_excess and on_tree look like they
 * belong together (membership in an rb-tree keyed by excess usage) -
 * confirm against the tree code.
 */
struct rb_node tree_node;
unsigned long usage_in_excess;
bool on_tree;
/* the mem_cgroup this per-node structure is associated with */
struct mem_cgroup *memcg;
};
4.3 mem_cgroup_stat_cpu
/*
 * Statistics block referenced through the __percpu pointer
 * mem_cgroup.stat.
 */
struct mem_cgroup_stat_cpu {
/*
 * Signed counters with MEMCG_NR_STAT slots.  memcg_page_state() sums one
 * slot over all possible CPUs and clamps a negative total to 0.
 */
long count[MEMCG_NR_STAT];
/* MEMCG_NR_EVENTS slots */
unsigned long events[MEMCG_NR_EVENTS];
unsigned long nr_page_events;
/* one slot per enum mem_cgroup_events_target value */
unsigned long targets[MEM_CGROUP_NTARGETS];
};
4.4 memcg_stat_item
/*
 * Statistic indices specific to the memory controller.  Numbering starts
 * at NR_VM_NODE_STAT_ITEMS rather than 0.
 */
enum memcg_stat_item {
/* MEMCG_CACHE and MEMCG_RSS are summed by mem_cgroup_usage() for the root */
MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
MEMCG_RSS,
MEMCG_RSS_HUGE,
/* added to the root usage only when swap accounting is requested */
MEMCG_SWAP,
MEMCG_SOCK,
MEMCG_KERNEL_STACK_KB,
/* not a statistic: sizes mem_cgroup_stat_cpu.count[] */
MEMCG_NR_STAT,
};
4.5 memcg_event_item
/*
 * Event indices specific to the memory controller.  Numbering starts at
 * NR_VM_EVENT_ITEMS rather than 0.
 */
enum memcg_event_item {
MEMCG_LOW = NR_VM_EVENT_ITEMS,
MEMCG_HIGH,
MEMCG_MAX,
MEMCG_OOM,
/* not an event: sizes mem_cgroup_stat_cpu.events[] */
MEMCG_NR_EVENTS,
};
4.6 mem_cgroup_events_target
/* Indices into mem_cgroup_stat_cpu.targets[]. */
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
MEM_CGROUP_TARGET_NUMAINFO,
/* not a target: number of entries, sizes targets[] */
MEM_CGROUP_NTARGETS,
};
5. mem_cgroup_read_u64
enum res_type {
_MEM,
_MEMSWAP,
_OOM_TYPE,
_KMEM,
_TCP,
};
/*
 * Attribute stored in cftype.private via MEMFILE_PRIVATE() and read back
 * with MEMFILE_ATTR(); selects which property of a counter a file exposes.
 */
enum {
/* current usage */
RES_USAGE,
/* page_counter.limit */
RES_LIMIT,
/* page_counter.watermark */
RES_MAX_USAGE,
/* page_counter.failcnt */
RES_FAILCNT,
/* mem_cgroup.soft_limit (not a page_counter field) */
RES_SOFT_LIMIT,
};
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct page_counter *counter;
switch (MEMFILE_TYPE(cft->private)) {
case _MEM:
counter = &memcg->memory;
break;
case _MEMSWAP:
counter = &memcg->memsw;
break;
case _KMEM:
counter = &memcg->kmem;
break;
case _TCP:
counter = &memcg->tcpmem;
break;
default:
BUG();
}
switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
if (counter == &memcg->memory)
return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
if (counter == &memcg->memsw)
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
return (u64)counter->limit * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
return counter->failcnt;
case RES_SOFT_LIMIT:
return (u64)memcg->soft_limit * PAGE_SIZE;
default:
BUG();
}
}
5.1 mem_cgroup_usage
/*
 * Return the usage of @memcg.
 *
 * @memcg: group to query
 * @swap:  true to report memory+swap, false to report memory only
 *
 * For a non-root group this is a plain read of the memsw or memory
 * counter.  For the root group the value is instead built by walking the
 * tree under @memcg and adding up MEMCG_CACHE and MEMCG_RSS (plus
 * MEMCG_SWAP when @swap is set) for every group visited.
 */
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup *pos;
	unsigned long total = 0;

	if (!mem_cgroup_is_root(memcg))
		return page_counter_read(swap ? &memcg->memsw
					      : &memcg->memory);

	for_each_mem_cgroup_tree(pos, memcg) {
		total += memcg_page_state(pos, MEMCG_CACHE);
		total += memcg_page_state(pos, MEMCG_RSS);
		if (swap)
			total += memcg_page_state(pos, MEMCG_SWAP);
	}

	return total;
}
5.2 memcg_page_state
/*
 * Sum one per-cpu statistic of @memcg over all possible CPUs.
 *
 * @memcg: group whose statistic is read
 * @idx:   index into mem_cgroup_stat_cpu.count[]
 *
 * Returns the total, never less than 0.
 */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
					     int idx)
{
	/* Signed accumulator: count[] is long and the sum may go negative. */
	long val = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);

	/*
	 * Clamp before the conversion to unsigned long, otherwise a negative
	 * sum would be returned as a huge positive value.
	 */
	if (val < 0)
		val = 0;

	return val;
}
6. mem_cgroup_write
/*
 * Shared write handler for the limit and soft-limit control files.
 *
 * @of:     open file; identifies the group and the cftype being written
 * @buf:    user-supplied text, stripped of surrounding whitespace here
 * @nbytes: length of the write
 * @off:    unused
 *
 * The text is parsed into a page count by page_counter_memparse(), which
 * is also handed the string "-1" (NOTE(review): presumably the spelling of
 * "no limit" - confirm).  RES_SOFT_LIMIT stores the value directly;
 * RES_LIMIT is refused with -EINVAL on the root group and otherwise
 * forwarded to the resize/update helper of the file's resource type.
 *
 * Returns @nbytes on success or a negative errno.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "-1", &nr_pages);
	if (err)
		return err;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_SOFT_LIMIT:
		memcg->soft_limit = nr_pages;
		err = 0;
		break;
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) {
			err = -EINVAL;
			break;
		}
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _TCP:
			err = memcg_update_tcp_limit(memcg, nr_pages);
			break;
		case _KMEM:
			err = memcg_update_kmem_limit(memcg, nr_pages);
			break;
		case _MEMSWAP:
			err = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
			break;
		case _MEM:
			err = mem_cgroup_resize_limit(memcg, nr_pages);
			break;
		}
		break;
	}

	if (err)
		return err;
	return nbytes;
}
6.1 mem_cgroup_resize_limit
/*
 * Change the limit of @memcg's memory counter to @limit (in pages).
 *
 * Each attempt is made under memcg_limit_mutex.  A @limit above the
 * current memsw limit is rejected with -EINVAL.  When page_counter_limit()
 * fails, reclaim is triggered on the group and the attempt is repeated.
 * The retry budget is only consumed by rounds in which usage did not drop.
 * A pending signal aborts the loop with -EINTR.
 *
 * If the limit was raised successfully, memcg_oom_recover() is called.
 *
 * Returns 0 on success, -EINTR, -EINVAL, or the last error from
 * page_counter_limit().
 */
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	int retries_left = MEM_CGROUP_RECLAIM_RETRIES *
			   mem_cgroup_count_children(memcg);
	unsigned long prev_usage = page_counter_read(&memcg->memory);
	unsigned long now_usage;
	bool enlarge = false;
	int ret;

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_limit_mutex);
		if (limit > memcg->memsw.limit) {
			mutex_unlock(&memcg_limit_mutex);
			ret = -EINVAL;
			break;
		}
		if (limit > memcg->memory.limit)
			enlarge = true;
		ret = page_counter_limit(&memcg->memory, limit);
		mutex_unlock(&memcg_limit_mutex);

		if (ret == 0)
			break;

		/* Setting the limit failed: reclaim, then try again. */
		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);

		now_usage = page_counter_read(&memcg->memory);
		if (now_usage < prev_usage)
			prev_usage = now_usage;	/* progress: keep budget */
		else
			retries_left--;
	} while (retries_left);

	if (ret == 0 && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}
7. mem_cgroup_reset
/*
 * Shared write handler for the "max_usage" and "failcnt" files.
 *
 * @of:     open file; identifies the group and the cftype being written
 * @buf:    ignored
 * @nbytes: length of the write, returned unchanged
 * @off:    ignored
 *
 * Any write resets the selected property of the selected counter:
 * RES_MAX_USAGE resets the watermark, RES_FAILCNT zeroes the fail count.
 * An unknown type or attribute hits BUG().
 */
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *target;

	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _TCP:
		target = &memcg->tcpmem;
		break;
	case _KMEM:
		target = &memcg->kmem;
		break;
	case _MEMSWAP:
		target = &memcg->memsw;
		break;
	case _MEM:
		target = &memcg->memory;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_FAILCNT:
		target->failcnt = 0;
		break;
	case RES_MAX_USAGE:
		page_counter_reset_watermark(target);
		break;
	default:
		BUG();
	}

	return nbytes;
}
7.1 page_counter_reset_watermark
/*
 * Restart max-usage tracking for @counter: the watermark is set to the
 * value page_counter_read() currently returns.
 */
static inline void page_counter_reset_watermark(struct page_counter *counter)
{
	unsigned long usage_now = page_counter_read(counter);

	counter->watermark = usage_now;
}