Linux Memory Management (3): SLUB

Table of Contents

1. Overview

2. SLUB

2.1 Data Structures

2.2 Initialization

2.2.1 Static Bootstrap

2.3 API

2.3.1 kmem_cache_create

2.3.2 kmem_cache_alloc

2.3.3 kmem_cache_free

2.3.4 kmalloc/kfree

3. References


1. Overview

The smallest unit the buddy system can allocate is a page, which is wasteful for small requests, so the slab layer is built on top of it. A slab is a collection of memory blocks of the same size. In its concrete design, SLUB takes the node-to-CPU hierarchy of NUMA machines into account and is organized as a three-level cache [1]:

[Figure 1]

  • Level 1: one per CPU, lock-free
  • Level 2: one per node, serialized by a lock shared among that node's CPUs
  • Level 3: one for the whole NUMA system, serialized by locks across nodes

As in any cache hierarchy, the lower the level the higher the cost; each level exists to speed up allocation and freeing.

2. SLUB

As said above, a slab can be viewed as a collection of small memory blocks of the same size; its geometry is described by struct kmem_cache.

2.1 Data Structures

[include/linux/slub_def.h]

struct kmem_cache {
	struct kmem_cache_cpu __percpu *cpu_slab;

	/* Used for retriving partial slabs etc */
	slab_flags_t flags;
	unsigned long min_partial;
	unsigned int size;	/* The size of an object including meta data */
	unsigned int object_size;/* The size of an object without meta data */
	unsigned int offset;	/* Free pointer offset. */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	/* Number of per cpu partial objects to keep around */
	unsigned int cpu_partial;
#endif
	struct kmem_cache_order_objects oo;

	/* Allocation and freeing of slabs */
	struct kmem_cache_order_objects max;
	struct kmem_cache_order_objects min;
	gfp_t allocflags;	/* gfp flags to use on each alloc */
	int refcount;		/* Refcount for slab cache destroy */
	void (*ctor)(void *);
	unsigned int inuse;		/* Offset to metadata */
	unsigned int align;		/* Alignment */
	unsigned int reserved;		/* Reserved bytes at the end of slabs */
	unsigned int red_left_pad;	/* Left redzone padding size */
	const char *name;	/* Name (only for display!) */
	struct list_head list;	/* List of slab caches */
	unsigned int useroffset;	/* Usercopy region offset */
	unsigned int usersize;		/* Usercopy region size */

	struct kmem_cache_node *node[MAX_NUMNODES];
};

The fields describing the slab geometry are:

  • object_size      the actual size of one object
  • size                  the space one object really occupies in the slab (object plus metadata and alignment)
  • offset                offset to the free pointer
  • oo                     encodes the slab's page order and the number of objects per slab
  • inuse                offset to the metadata
  • align                 alignment
  • reserved           bytes deliberately reserved at the end of each slab; they are subtracted when computing how many objects fit

A simple illustration of a slab:

[Figure 2]

  • A slab occupies 2^order pages and holds nr_obj objects
  • Free objects are linked into a free list; the free pointer is the void * shown in the figure above (it can also overlay the object itself); a sketch of the chaining follows
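The chaining can be pictured with simplified versions of the kernel's get_freepointer()/set_freepointer() helpers (the real ones additionally support freelist pointer hardening, CONFIG_SLAB_FREELIST_HARDENED): the "next free object" pointer is stored inside the free object itself, s->offset bytes from its start.

static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	/* read the embedded "next free object" pointer */
	return *(void **)((char *)object + s->offset);
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	/* link this free object to the next free object (or NULL for the last one) */
	*(void **)((char *)object + s->offset) = fp;
}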

Now the structures that back the three cache levels:

struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
};

struct kmem_cache_node {
	spinlock_t list_lock;
	unsigned long nr_partial;
	struct list_head partial;
};

The relationship between these structures is shown below:

[Figure 3]

  • The thresholds of the per-cpu partial and per-node partial caches are controlled by cpu_partial and min_partial respectively (their default values are sketched below)
  • kmem_cache_node is really an array, node[MAX_NUMNODES], which the figure does not show
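For reference, the defaults for these two thresholds are computed elsewhere in mm/slub.c roughly as follows (a sketch based on the same kernel era; exact values depend on configuration):

/*
 * min_partial = clamp(ilog2(s->size) / 2, MIN_PARTIAL, MAX_PARTIAL)
 *               i.e. between 5 and 10 partial slabs are kept per node
 * cpu_partial = 30  if s->size <  256
 *               13  if s->size <  1024
 *                6  if s->size <  PAGE_SIZE
 *                2  otherwise
 *               (upper bound on free objects cached on the per-cpu partial list)
 */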

2.2 Initialization

mm_init->kmem_cache_init

2.2.1 Static Bootstrap

Since struct kmem_cache, the structure that manages slabs, must itself eventually be allocated from a slab, kmem_cache_init bootstraps it from statically allocated instances:

[Figure 4]

[mm/slub.c]

void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node;

	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	create_boot_cache(kmem_cache_node, "kmem_cache_node",
		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);

	register_hotmemory_notifier(&slab_memory_callback_nb);

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
		       SLAB_HWCACHE_ALIGN, 0, 0);

	kmem_cache = bootstrap(&boot_kmem_cache);

	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	setup_kmalloc_cache_index_table();
	create_kmalloc_caches(0);

	/* Setup random freelists for each cache */
	init_freelist_randomization();

	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);

	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
		cache_line_size(),
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}

create_boot_cache creates the kmem_cache for struct kmem_cache_node; its core is __kmem_cache_create. Along the way SLUB has to compute several quantities, shown below:

unsigned long calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}
  • calculate_alignment computes the cache's alignment (a worked example follows)
  • With SLAB_HWCACHE_ALIGN the object is packed into as little cache-line space as possible (the alignment is halved for small objects), but the result is never smaller than the requested align
  • The final alignment is at least word aligned, so objects never straddle a word boundary
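A quick worked example (hypothetical numbers, assuming cache_line_size() == 64, sizeof(void *) == 8 and align passed in as 0):

/*
 * size = 24:  ralign = 64 -> 24 <= 32, halve to 32 -> 24 <= 16? no, stop
 *             align = max(0, 32) = 32; >= ARCH_SLAB_MINALIGN and already 8-byte aligned
 * size = 100: ralign = 64 -> 100 <= 32? no, stop -> align = 64
 *
 * Small objects therefore get half or quarter cache-line alignment instead of
 * each one wasting a full 64-byte slot.
 */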
static inline int slab_order(int size, int min_objects,
				int max_order, int fract_leftover, int reserved)
{
	int order;
	int rem;
	int min_order = slub_min_order;

	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
		return get_order(size * MAX_OBJS_PER_PAGE) - 1;

	for (order = max(min_order, get_order(min_objects * size + reserved));
			order <= max_order; order++) {

		unsigned long slab_size = PAGE_SIZE << order;

		rem = (slab_size - reserved) % size;

		if (rem <= slab_size / fract_leftover)
			break;
	}

	return order;
}
  • slab_order computes how many pages a slab needs for objects of the given size, expressed as an order in the range [slub_min_order, slub_max_order] (worked example below)
  • The space left over must not exceed 1/fract_leftover of the slab, i.e. rem <= slab_size / fract_leftover
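For example (hypothetical numbers, assuming PAGE_SIZE == 4096, reserved == 0 and slub_min_order == 0):

/*
 * size = 700, min_objects = 8, fract_leftover = 16:
 *   starting order = get_order(8 * 700) = get_order(5600) = 1        (an 8 KB slab)
 *   order 1: slab_size = 8192, rem = 8192 % 700 = 492
 *            492 <= 8192 / 16 (= 512), so order 1 is accepted:
 *            the slab holds 11 objects and wastes 492 bytes.
 */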
static inline int calculate_order(int size, int reserved)
{
	int order;
	int min_objects;
	int fraction;
	int max_objects;

	min_objects = slub_min_objects;
	if (!min_objects)
		min_objects = 4 * (fls(nr_cpu_ids) + 1);
	max_objects = order_objects(slub_max_order, size, reserved);
	min_objects = min(min_objects, max_objects);

	while (min_objects > 1) {
		fraction = 16;
		while (fraction >= 4) {
			order = slab_order(size, min_objects,
					slub_max_order, fraction, reserved);
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
		min_objects--;
	}

	order = slab_order(size, 1, slub_max_order, 1, reserved);
	if (order <= slub_max_order)
		return order;

	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
	if (order < MAX_ORDER)
		return order;
	return -ENOSYS;
}
  • Finds a reasonable order under the min_objects constraint
  • If the object is too large the constraints are relaxed (min_objects down to 1, or slub_max_order up to MAX_ORDER)
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
	unsigned long flags = s->flags;
	size_t size = s->object_size;
	int order;

	size = ALIGN(size, sizeof(void *));
	s->inuse = size;

	size = ALIGN(size, s->align);
	s->size = size;
	if (forced_order >= 0)
		order = forced_order;
	else
		order = calculate_order(size, s->reserved);

	if (order < 0)
		return 0;

	s->allocflags = 0;
	if (order)
		s->allocflags |= __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
		s->allocflags |= GFP_DMA;

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		s->allocflags |= __GFP_RECLAIMABLE;

	s->oo = oo_make(order, size, s->reserved);
	s->min = oo_make(get_order(size), size, s->reserved);
	if (oo_objects(s->oo) > oo_objects(s->max))
		s->max = s->oo;

	return !!oo_objects(s->oo);
}
  • calculate_sizes computes the space an object actually occupies and the number of pages to allocate (2^order); a worked example follows the two helpers below
  • Note the distinction between s->object_size, s->inuse and s->size (raw object size, word-aligned size, size rounded up to s->align)
  • s->oo records the slab's final order and the number of objects it holds
static inline int order_objects(int order, unsigned long size, int reserved)
{
	return ((PAGE_SIZE << order) - reserved) / size;
}

static inline struct kmem_cache_order_objects oo_make(int order,
		unsigned long size, int reserved)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size, reserved)
	};

	return x;
}
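Putting the pieces together for a hypothetical cache (object_size = 100, SLAB_HWCACHE_ALIGN, 64-byte cache lines, 8-byte pointers, no debug flags, reserved == 0):

/*
 * s->inuse = ALIGN(100, 8)  = 104     word-aligned size, offset to the metadata
 * s->align = 64                       from calculate_alignment()
 * s->size  = ALIGN(104, 64) = 128     per-object footprint inside the slab
 * order    = 0                        a 4 KB slab already holds 4096 / 128 = 32 objects
 * s->oo    = (0 << OO_SHIFT) + 32     i.e. order 0, 32 objects per slab
 */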

With all of the slab's parameters determined, memory can now be allocated: ultimately pages of the computed order are requested from the buddy system. The interesting part here is the handling of struct page, so first look at the page fields used by SLUB:

 

struct page {
	unsigned long flags;							
	union {
	};
	union {		
		void *freelist;			
	};
	union {		
		struct {
			unsigned inuse:16;
			unsigned objects:15;
			unsigned frozen:1;
		};
	};
	union {
		struct {		
			struct page *next;	
			int pages;	
			int pobjects;	
		};
	};
	union {
		struct kmem_cache *slab_cache;	
	};
};
  • freelist       head of this slab's free list
  • inuse          number of objects currently in use in this slab
  • objects        total number of objects in this slab
  • frozen         the slab is frozen, i.e. currently owned by a per-cpu cache; inuse/objects/frozen share storage with the page's counters word, so freelist and counters can be updated together atomically with cmpxchg_double, as the code below does
  • slab_cache     points back to the kmem_cache this slab belongs to

Before the kmem_cache machinery is fully up, kmem_cache_node structures are allocated with early_kmem_cache_node_alloc. The interesting part is how a slab gets allocated along the way:

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	page = alloc_slab_page(s, alloc_gfp, node, oo);

	page->objects = oo_objects(oo);

	order = compound_order(page);
	page->slab_cache = s;
	__SetPageSlab(page);
	if (page_is_pfmemalloc(page))
		SetPageSlabPfmemalloc(page);

	start = page_address(page);

	shuffle = shuffle_freelist(s, page);

	if (!shuffle) {
		for_each_object_idx(p, idx, s, start, page->objects) {
			setup_object(s, page, p);
			if (likely(idx < page->objects))
				set_freepointer(s, p, p + s->size);
			else
				set_freepointer(s, p, NULL);
		}
		page->freelist = fixup_red_left(s, start);
	}

	page->inuse = page->objects;
	page->frozen = 1;

out:
	if (gfpflags_allow_blocking(flags))
		local_irq_disable();
	if (!page)
		return NULL;

	mod_lruvec_page_state(page,
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		1 << oo_order(oo));

	inc_slabs_node(s, page_to_nid(page), page->objects);

	return page;
}
  • alloc_slab_page allocates pages of the corresponding order
  • The pages are marked as slab pages: __SetPageSlab(page)
  • The freelist is initialized by linking all objects together

A freshly allocated slab looks like this:

[Figure 5]

Now let us see how this slab is hooked into the management structures:

static void early_kmem_cache_node_alloc(int node)
{
	struct page *page;
	struct kmem_cache_node *n;

	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);

	n = page->freelist;
	BUG_ON(!n);
	page->freelist = get_freepointer(kmem_cache_node, n);
	page->inuse = 1;
	page->frozen = 0;
	kmem_cache_node->node[node] = n;

	init_kmem_cache_node(n);
	inc_slabs_node(kmem_cache_node, node, page->objects);

	/*
	 * No locks need to be taken here as it has just been
	 * initialized and there is no concurrent access.
	 */
	__add_partial(n, page, DEACTIVATE_TO_HEAD);
}
  • Note that what is allocated here is a struct kmem_cache_node, so kmem_cache_node->node[node] = n points the cache's node slot directly at the first object carved out of the new slab

[Figure 6]

Next the per-cpu structures are allocated:

static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
				     2 * sizeof(void *));

	if (!s->cpu_slab)
		return 0;

	init_kmem_cache_cpus(s);

	return 1;
}
  • This simply allocates and initializes the kmem_cache_cpu structures

At this point the static "kmem_cache_node" cache is up and slab_state becomes PARTIAL, so the kmem_cache_node structures needed by the following static "kmem_cache" cache can already be taken from the slab just built.

Now that the kmem_cache for both kmem_cache and kmem_cache_node exists, the standard interfaces can be used, so both caches are migrated from the static bootstrap copies into regular, dynamically allocated kmem_cache management. The flow is:

[Figure 7]

static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
	int node;
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	struct kmem_cache_node *n;

	memcpy(s, static_cache, kmem_cache->object_size);

	__flush_cpu_slab(s, smp_processor_id());
	for_each_kmem_cache_node(s, node, n) {
		struct page *p;

		list_for_each_entry(p, &n->partial, lru)
			p->slab_cache = s;
    }
	slab_init_memcg_params(s);
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);
	return s;
}
  • First a kmem_cache is allocated from the kmem_cache cache itself and the static copy is turned into a dynamic one with memcpy; the allocation path is kmem_cache_zalloc->kmem_cache_alloc->slab_alloc->slab_alloc_node, analyzed in section 2.3.2

Then __flush_cpu_slab flushes the kmem_cache_cpu caches; the key function is:

static void deactivate_slab(struct kmem_cache *s, struct page *page,
				void *freelist)
{
	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
	int lock = 0;
	enum slab_modes l = M_NONE, m = M_NONE;
	void *nextfree;
	int tail = DEACTIVATE_TO_HEAD;
	struct page new;
	struct page old;

	if (page->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Free all available per cpu objects back
	 * to the page freelist while it is still frozen. Leave the
	 * last one.
	 *
	 * There is no need to take the list->lock because the page
	 * is still frozen.
	 */
	while (freelist && (nextfree = get_freepointer(s, freelist))) {
		void *prior;
		unsigned long counters;

		do {
			prior = page->freelist;
			counters = page->counters;
			set_freepointer(s, freelist, prior);
			new.counters = counters;
			new.inuse--;
			VM_BUG_ON(!new.frozen);

		} while (!__cmpxchg_double_slab(s, page,
			prior, counters,
			freelist, new.counters,
			"drain percpu freelist"));

		freelist = nextfree;
	}
    ...
}
  • Stage one, as the comment says, returns the free objects on the kmem_cache_cpu freelist to the page's own freelist while the slab is still frozen, keeping only the last free object in hand (frozen is still 1 at this point)

Now stage two:

	/*
	 * Stage two: Ensure that the page is unfrozen while the
	 * list presence reflects the actual number of objects
	 * during unfreeze.
	 *
	 * We setup the list membership and then perform a cmpxchg
	 * with the count. If there is a mismatch then the page
	 * is not unfrozen but the page is on the wrong list.
	 *
	 * Then we restart the process which may have to remove
	 * the page from the list that we just put it on again
	 * because the number of objects in the slab may have
	 * changed.
	 */
redo:

	old.freelist = page->freelist;
	old.counters = page->counters;
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist) {
		new.inuse--;
		set_freepointer(s, freelist, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

	if (!new.inuse && n->nr_partial >= s->min_partial)
		m = M_FREE;
	else if (new.freelist) {
		m = M_PARTIAL;
		if (!lock) {
			lock = 1;
			/*
			 * Taking the spinlock removes the possiblity
			 * that acquire_slab() will see a slab page that
			 * is frozen
			 */
			spin_lock(&n->list_lock);
		}
	} else {
		m = M_FULL;
		if (kmem_cache_debug(s) && !lock) {
			lock = 1;
			/*
			 * This also ensures that the scanning of full
			 * slabs from diagnostic functions will not see
			 * any frozen slabs.
			 */
			spin_lock(&n->list_lock);
		}
	}

	if (l != m) {

		if (l == M_PARTIAL)

			remove_partial(n, page);

		else if (l == M_FULL)

			remove_full(s, n, page);

		if (m == M_PARTIAL) {

			add_partial(n, page, tail);
			stat(s, tail);

		} else if (m == M_FULL) {

			stat(s, DEACTIVATE_FULL);
			add_full(s, n, page);

		}
	}

	l = m;
	if (!__cmpxchg_double_slab(s, page,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"))
		goto redo;

	if (lock)
		spin_unlock(&n->list_lock);

	if (m == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
	}

2.3 API

  • struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *))
  • void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
  • void kmem_cache_free(struct kmem_cache *s, void *x)
  • void *kmalloc(size_t size, gfp_t flags)
  • void kfree(const void *x)
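A minimal usage sketch of these interfaces (struct foo, foo_cache and foo_example are hypothetical; module boilerplate and full error handling are omitted):

#include <linux/slab.h>

struct foo {
	int id;
	char name[32];
};

static struct kmem_cache *foo_cache;

static int foo_example(void)
{
	struct foo *f;
	char *buf;

	/* one cache per object type: "foo_cache" holds sizeof(struct foo) objects */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)
		return -ENOMEM;

	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);	/* fastpath: per-cpu freelist */
	if (f)
		kmem_cache_free(foo_cache, f);

	buf = kmalloc(100, GFP_KERNEL);			/* served from the kmalloc-128 cache */
	kfree(buf);

	kmem_cache_destroy(foo_cache);			/* all objects must already be freed */
	return 0;
}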

2.3.1 kmem_cache_create

First kmem_cache_create: it calls create_cache->__kmem_cache_create, and the flow is the same as above. The difference is that init_kmem_cache_nodes now calls kmem_cache_alloc_node:

static int init_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_state(node, N_NORMAL_MEMORY) {
		struct kmem_cache_node *n;

		if (slab_state == DOWN) {
			early_kmem_cache_node_alloc(node);
			continue;
		}
		n = kmem_cache_alloc_node(kmem_cache_node,
						GFP_KERNEL, node);
    ...
}

Its flow is:

[Figure 8]

This path is the same as kmem_cache_alloc and is analyzed in the next section.

2.3.2 kmem_cache_alloc

The state of each cache level during a kmem_cache_alloc is illustrated below:

[Figure 9]

First, the allocation flow:

  1. L1 hit: the local kmem_cache_cpu has free objects (c->freelist != NULL); take one from the head of the freelist, the so-called fastpath
  2. L1 miss, L2 hit (c->partial != NULL): take a slab from L2 and install it as L1 (c->page)
  3. L1 miss, L2 miss, L3 hit (n->partial != NULL): take a slab from L3 and install it as L1 (c->page), then move further slabs onto L2 (c->partial) until roughly cpu_partial / 2 free objects are cached there
  4. L1 miss, L2 miss, L3 miss: allocate a new slab from the buddy system and install it as L1 (c->page)
  5. If even the buddy system fails, it is an OOM

Now walk through kmem_cache_alloc with these steps in mind: kmem_cache_alloc->slab_alloc->slab_alloc_node

static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void *object;
	struct kmem_cache_cpu *c;
	struct page *page;
	unsigned long tid;

	s = slab_pre_alloc_hook(s, gfpflags);
	if (!s)
		return NULL;
redo:
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
	} while (IS_ENABLED(CONFIG_PREEMPT) &&
		 unlikely(tid != READ_ONCE(c->tid)));

	barrier();

	object = c->freelist;
	page = c->page;
	if (unlikely(!object || !node_match(page, node))) {
		object = __slab_alloc(s, gfpflags, node, addr, c);
		stat(s, ALLOC_SLOWPATH);
	} else {
		void *next_object = get_freepointer_safe(s, object);

		if (unlikely(!this_cpu_cmpxchg_double(
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
				next_object, next_tid(tid)))) {

			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

	return object;
}

First the case where the local cache is not empty (c->freelist != NULL), the fastpath: it boils down to a single atomic operation that advances the freelist pointer.

Now the slowpath:

 

static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *freelist;
	struct page *page;

	page = c->page;
	if (!page)
		goto new_slab;
    ...
load_freelist:
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work.
	 */
	VM_BUG_ON(!c->page->frozen);
	c->freelist = get_freepointer(s, freelist);
	c->tid = next_tid(c->tid);
	return freelist;

new_slab:

	if (slub_percpu_partial(c)) {
		page = c->page = slub_percpu_partial(c);
		slub_set_percpu_partial(c, page);
		stat(s, CPU_PARTIAL_ALLOC);
		goto redo;
	}

	freelist = new_slab_objects(s, gfpflags, node, &c);

	if (unlikely(!freelist)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	page = c->page;
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
		goto load_freelist;
}
  • If c->page is NULL the local cache is empty and new_slab_objects is executed; that function first tries to find a suitable slab on the kmem_cache_node partial lists via get_partial (see the figure above); acquire_slab is analyzed below

If the cpu partial list is empty as well, the node level is consulted, i.e. new_slab_objects->get_partial:

static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
		struct kmem_cache_cpu *c)
{
	void *object;
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();
	else if (!node_present_pages(node))
		searchnode = node_to_mem_node(node);

	object = get_partial_node(s, get_node(s, searchnode), c, flags);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, flags, c);
}
  • get_partial first looks at the partial list of the kmem_cache_node for searchnode; if that cannot satisfy the request, get_any_partial walks the other nodes' partial lists in distance order

First get_partial_node:

static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
				struct kmem_cache_cpu *c, gfp_t flags)
{
	struct page *page, *page2;
	void *object = NULL;
	int available = 0;
	int objects;

	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);
	list_for_each_entry_safe(page, page2, &n->partial, lru) {
		void *t;

		if (!pfmemalloc_match(page, flags))
			continue;

		t = acquire_slab(s, n, page, object == NULL, &objects);
		if (!t)
			break;

		available += objects;
		if (!object) {
			c->page = page;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
			put_cpu_partial(s, page, 0);
			stat(s, CPU_PARTIAL_NODE);
		}
		if (!kmem_cache_has_cpu_partial(s)
			|| available > s->cpu_partial / 2)
			break;

	}
	spin_unlock(&n->list_lock);
	return object;
}

get_partial_node is what happens on a node-level hit; the main steps are:

  1. Walk the partial list; acquire_slab takes a slab off the list and reports how many free objects it holds. The first slab acquired is installed as L1 (c->page)
  2. Subsequent slabs are put on L2 (c->partial) via put_cpu_partial, until the accumulated number of free objects exceeds half of cpu_partial
static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct page *page,
		int mode, int *objects)
{
	void *freelist;
	unsigned long counters;
	struct page new;

	freelist = page->freelist;
	counters = page->counters;
	new.counters = counters;
	*objects = new.objects - new.inuse;
	if (mode) {
		new.inuse = page->objects;
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}

	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	if (!__cmpxchg_double_slab(s, page,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, page);
	return freelist;
}
  • acquire_slab's main job is to take the slab off the kmem_cache_node partial list; note that the first slab acquired will be installed as L1, so it is taken over completely:
		new.inuse = page->objects;
		new.freelist = NULL;
  • remove_partial does the actual unlinking

If L3 misses as well, new_slab is executed to allocate from the buddy system:

	page = new_slab(s, flags, node);
	if (page) {
		c = raw_cpu_ptr(s->cpu_slab);
		if (c->page)
			flush_slab(s, c);

		/*
		 * No other reference to the page yet so we can
		 * muck around with it freely without cmpxchg
		 */
		freelist = page->freelist;
		page->freelist = NULL;

		stat(s, ALLOC_SLAB);
		c->page = page;
		*pc = c;
	} else
		freelist = NULL;

The newly allocated slab is installed as L1 and the relevant fields are fixed up, as in the diagram at the beginning of the article. If even the buddy system fails, the only option left is OOM.

In summary, whenever L1 misses, any slab obtained from L2, L3 or the buddy system must end up in the following state:

page->freelist = NULL;
c->page = page;

This is easy to understand: once a slab is handed to the per-cpu cache its remaining free objects are tracked through c->freelist rather than page->freelist. Finally, control reaches:

load_freelist:
    VM_BUG_ON(!c->page->frozen);
    c->freelist = get_freepointer(s, freelist);  // skip the first object; the slowpath returns the original freelist head, which is handed out directly

2.3.3 kmem_cache_free

Now kmem_cache_free; its call flow is roughly:

[Figure 10]

The free flow can be followed with the diagram below, taken from [2]:

[Figure 11]

  1. If the slab is already in L1, L2 or L3, the object is simply returned to that slab's freelist
  2. If the slab was full, it is added to L2 (the cpu partial list); in this case the total number of free objects held in L2 is checked against cpu_partial, and if the threshold is exceeded the L2 slabs are unfrozen and moved to L3
  3. If the slab becomes empty after the free and it sits in L3 while the node already has nr_partial >= min_partial slabs, the slab is given back to the buddy system

Looking at __slab_free in pieces:

static void __slab_free(struct kmem_cache *s, struct page *page,
			void *head, void *tail, int cnt, unsigned long addr)
{
	void *prior;
	int was_frozen;
	struct page new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
	unsigned long uninitialized_var(flags);

	stat(s, FREE_SLOWPATH);

	do {
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
		prior = page->freelist;
		counters = page->counters;
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		if ((!new.inuse || !prior) && !was_frozen) {

			if (kmem_cache_has_cpu_partial(s) && !prior) {

    			new.frozen = 1;

			} else { /* Needs to be taken off a list */

				n = get_node(s, page_to_nid(page));

				spin_lock_irqsave(&n->list_lock, flags);

			}
		}

	} while (!cmpxchg_double_slab(s, page,
		prior, counters,
		head, new.counters,
		"__slab_free"));

If the page is not in the kmem_cache_cpu cache, there are three possibilities:

  • it is on a cpu partial list, with frozen = 1
  • the slab is full (page->freelist == NULL) and on no list at all, with frozen = 0
  • it is on a node partial list, with frozen = 0

The fragment above returns the object(s) to the slab's freelist and has to handle the following cases:

  1. If the slab was full (!prior) it is on no list, so it needs to be added to the cpu partial list; hence frozen is set to 1
  2. If the slab is on a node partial list and becomes empty after freeing the objects, the node must be looked up (and its list_lock taken) so the slab can be removed from the list

In case 1, freeing into a previously full slab, the slab is put onto the cpu partial list with put_cpu_partial, which also checks whether the free-object threshold cpu_partial has been exceeded:

	if (likely(!n)) {

		/*
		 * If we just froze the page then put it onto the
		 * per cpu partial list.
		 */
		if (new.frozen && !was_frozen) {
			put_cpu_partial(s, page, 1);
			stat(s, CPU_PARTIAL_FREE);
		}
		/*
		 * The list lock was not taken therefore no list
		 * activity can be necessary.
		 */
		if (was_frozen)
			stat(s, FREE_FROZEN);
		return;
	}

In case 2 the code checks whether the slab is now empty and whether the node already holds min_partial or more partial slabs; if so the slab is removed from the node partial list (or the full list) and handed back to the buddy allocator with discard_slab:

	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

slab_empty:
	if (prior) {
		/*
		 * Slab on the partial list.
		 */
		remove_partial(n, page);
		stat(s, FREE_REMOVE_PARTIAL);
	} else {
		/* Slab must be on the full list */
		remove_full(s, n, page);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, page);

2.3.4 kmalloc/kfree

kmalloc is built on top of kmem_cache. The function below creates a slab cache for every size class and stores them in kmalloc_caches:

void __init create_kmalloc_caches(unsigned long flags)
{
	int i;

	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
		if (!kmalloc_caches[i])
			new_kmalloc_cache(i, flags);

		/*
		 * Caches that are not of the two-to-the-power-of size.
		 * These have to be created immediately after the
		 * earlier power of two caches
		 */
		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
			new_kmalloc_cache(1, flags);
		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
			new_kmalloc_cache(2, flags);
	}

	/* Kmalloc array is now usable */
	slab_state = UP;

#ifdef CONFIG_ZONE_DMA
	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		struct kmem_cache *s = kmalloc_caches[i];

		if (s) {
			int size = kmalloc_size(i);
			char *n = kasprintf(GFP_NOWAIT,
				 "dma-kmalloc-%d", size);

			BUG_ON(!n);
			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
				size, SLAB_CACHE_DMA | flags);
		}
	}
#endif
}

On my machine KMALLOC_SHIFT_LOW is 3 and KMALLOC_SHIFT_HIGH is 13, and kmalloc_info looks like this:

const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	{NULL,                      0},		{"kmalloc-96",             96},
	{"kmalloc-192",           192},		{"kmalloc-8",               8},
	{"kmalloc-16",             16},		{"kmalloc-32",             32},
	{"kmalloc-64",             64},		{"kmalloc-128",           128},
	{"kmalloc-256",           256},		{"kmalloc-512",           512},
	{"kmalloc-1024",         1024},		{"kmalloc-2048",         2048},
	{"kmalloc-4096",         4096},		{"kmalloc-8192",         8192},
	{"kmalloc-16384",       16384},		{"kmalloc-32768",       32768},
	{"kmalloc-65536",       65536},		{"kmalloc-131072",     131072},
	{"kmalloc-262144",     262144},		{"kmalloc-524288",     524288},
	{"kmalloc-1048576",   1048576},		{"kmalloc-2097152",   2097152},
	{"kmalloc-4194304",   4194304},		{"kmalloc-8388608",   8388608},
	{"kmalloc-16777216", 16777216},		{"kmalloc-33554432", 33554432},
	{"kmalloc-67108864", 67108864}
};

So the code above creates the kmalloc-8 through kmalloc-8192 caches from this table, plus dma-kmalloc-8 through dma-kmalloc-8192.

Finally, a look at kmalloc itself:

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	if (__builtin_constant_p(size)) {
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
		if (!(flags & GFP_DMA)) {
			int index = kmalloc_index(size);

			if (!index)
				return ZERO_SIZE_PTR;

			return kmem_cache_alloc_trace(kmalloc_caches[index],
					flags, size);
		}
#endif
	}
	return __kmalloc(size, flags);
}
  • If size is larger than KMALLOC_MAX_CACHE_SIZE (8192 bytes here, i.e. two 4 KB pages), kmalloc_large allocates directly from the buddy system
  • Otherwise, if GFP_DMA is not set, kmalloc_index(size) finds the index of the matching cache in kmalloc_caches (examples below)
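Two quick examples of how a request maps onto these caches (the sizes are just for illustration):

/*
 * kmalloc(100, GFP_KERNEL):
 *   100 <= KMALLOC_MAX_CACHE_SIZE (8192) and GFP_DMA is not set
 *   kmalloc_index(100) = 7  ->  kmalloc_caches[7] is "kmalloc-128"
 *   the object comes out of the kmalloc-128 slab, so 28 bytes go unused
 *
 * kmalloc(64 * 1024, GFP_KERNEL):
 *   65536 > 8192  ->  kmalloc_large(): the pages come straight from the buddy system
 */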

The functions involved in kfree have all been covered above, so it is skipped here.

3. References

[1] 多核心Linux内核路径优化的不二法门之-slab与伙伴系统

[2] 图解slub
