这个kmem_cache_create()函数是一个和cpu结构有关系的函数,所以在公用函数中找不到(3.10.98内核版本中),我选择的是 arch/x86/kernel/ 下的实现。
说明下参数:
const char *name :slab的名称
size_t size :每个对象的大小
size_t align :每个对象的对齐
unsigned long flags :SLAB 标志(例如 SLAB_HWCACHE_ALIGN、SLAB_POISON、SLAB_RED_ZONE),控制 cache 的创建方式;注意它不是申请内存时使用的 gfp 标识
void (*ctor)(void *):构造函数
/*
 * kmem_cache_create - create a slab cache (3.10.98 entry point).
 * @name:  name of the cache
 * @size:  size of each object
 * @align: alignment of each object
 * @flags: SLAB flags
 * @ctor:  constructor for the objects
 *
 * Thin wrapper: every argument is forwarded unchanged to
 * kmem_cache_create_memcg(), with NULL passed as its first and last
 * arguments, and that function's result is returned.
 * NOTE(review): the two NULLs presumably mean "no memcg / no parent
 * cache" - confirm against kmem_cache_create_memcg().
 */
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
		  unsigned long flags, void (*ctor)(void *))
{
	return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
}
看了下 3.10.98的kmem_cache_create()发现有很大的出入,还是先看看2.6.32版本的吧
/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
 * therefore applications must manage it themselves.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 *
 * Overview of the steps below (2.6.32 version): validate the arguments,
 * take cache_chain_mutex once the slab allocator is available, reject
 * duplicate names, compute the final alignment and object size, allocate
 * the cache descriptor from cache_cache, choose on-slab or off-slab
 * management, compute the slab order and colouring, set up the per-cpu
 * caches and finally link the new cache into cache_chain.
 */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void *))
{
	size_t left_over, slab_size, ralign;
	struct kmem_cache *cachep = NULL, *pc;
	gfp_t gfp;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	/*
	 * Routine argument checks. Interrupt context is rejected: the code
	 * below takes cache_chain_mutex and may allocate with GFP_KERNEL.
	 */
	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
	    size > KMALLOC_MAX_SIZE) {
		printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
				name);
		BUG();
	}

	/*
	 * We use cache_chain_mutex to ensure a consistent view of
	 * cpu_online_mask as well. Please see cpuup_callback
	 */
	/*
	 * Lock only once the slab allocator is available.
	 * NOTE(review): presumably early boot initialises slab on a single
	 * CPU, so no locking is needed before that point - confirm.
	 */
	if (slab_is_available()) {
		get_online_cpus();
		mutex_lock(&cache_chain_mutex);
	}

	/* Walk every cache already linked on the global cache_chain list. */
	list_for_each_entry(pc, &cache_chain, next) {
		char tmp;
		int res;

		/*
		 * This happens when the module gets unloaded and doesn't
		 * destroy its slab cache and no-one else reuses the vmalloc
		 * area of the module. Print a warning.
		 */
		/* Check that this cache's name pointer is still readable. */
		res = probe_kernel_address(pc->name, tmp);
		if (res) {
			/* Report the cache whose name is gone and skip it. */
			printk(KERN_ERR
			       "SLAB: cache with size %d has lost its name\n",
			       pc->buffer_size);
			continue;
		}

		/* Reject a name that is already used by a cache on the list. */
		if (!strcmp(pc->name, name)) {
			printk(KERN_ERR
			       "kmem_cache_create: duplicate cache %s\n", name);
			dump_stack();
			goto oops;
		}
	}

#if DEBUG
	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
						2 * sizeof(unsigned long long)))
		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif
	/*
	 * Always checks flags, a caller might be expecting debug support which
	 * isn't available.
	 */
	BUG_ON(flags & ~CREATE_MASK);

	/*
	 * Check that size is in terms of words. This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	/*
	 * Round size up to a word boundary. (Article author's question: why
	 * not write size = (size + (BYTES_PER_WORD - 1)) &
	 * ~(BYTES_PER_WORD - 1) directly? The result is the same; the test
	 * only skips the arithmetic when size is already aligned.)
	 */
	if (size & (BYTES_PER_WORD - 1)) {
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1);
	}

	/* calculate the final buffer alignment: */

	/* 1) arch recommendation: can be overridden for debug */
	if (flags & SLAB_HWCACHE_ALIGN) {	/* cache-line alignment requested */
		/*
		 * Default alignment: as specified by the arch code. Except if
		 * an object is really small, then squeeze multiple objects into
		 * one cacheline.
		 */
		ralign = cache_line_size();	/* starting alignment value */
		/* Small objects: halve the alignment so several share a line. */
		while (size <= ralign / 2)
			ralign /= 2;
	} else {
		ralign = BYTES_PER_WORD;	/* default: word alignment */
	}

	/*
	 * Redzoning and user store require word alignment or possibly larger.
	 * Note this will be overridden by architecture or caller mandated
	 * alignment if either is greater than BYTES_PER_WORD.
	 */
	if (flags & SLAB_STORE_USER)
		ralign = BYTES_PER_WORD;

	if (flags & SLAB_RED_ZONE) {
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		size += REDZONE_ALIGN - 1;
		size &= ~(REDZONE_ALIGN - 1);
	}
	/* The two adjustments above only apply to the debugging flags. */

	/* 2) arch mandated alignment */
	if (ralign < ARCH_SLAB_MINALIGN) {
		ralign = ARCH_SLAB_MINALIGN;
	}
	/* 3) caller mandated alignment */
	if (ralign < align) {	/* honour a larger alignment from the caller */
		ralign = align;
	}
	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it.
	 */
	align = ralign;

	if (slab_is_available())	/* slab is up: the allocation may sleep */
		gfp = GFP_KERNEL;
	else				/* early initialisation: must not sleep */
		gfp = GFP_NOWAIT;

	/* Get cache's description obj. */
	/*
	 * The struct kmem_cache descriptor itself is allocated (zeroed) from
	 * cache_cache, the cache that holds cache descriptors.
	 */
	cachep = kmem_cache_zalloc(&cache_cache, gfp);
	if (!cachep)
		goto oops;

#if DEBUG
	cachep->obj_size = size;

	/*
	 * Both debugging options require word-alignment which is calculated
	 * into align above.
	 */
	if (flags & SLAB_RED_ZONE) {
		/* add space for red zone words */
		cachep->obj_offset += sizeof(unsigned long long);
		size += 2 * sizeof(unsigned long long);
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires one word storage behind the end of
		 * the real object. But if the second red zone needs to be
		 * aligned to 64 bits, we must allow that much space.
		 */
		if (flags & SLAB_RED_ZONE)
			size += REDZONE_ALIGN;
		else
			size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
	    && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
		cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
		size = PAGE_SIZE;
	}
#endif
#endif

	/*
	 * Determine if the slab management is 'on' or 'off' slab.
	 * (bootstrapping cannot cope with offslab caches so don't do
	 * it too early on.)
	 */
	/*
	 * Decide where the slab management header lives: inside the slab or
	 * somewhere outside it. Objects of at least PAGE_SIZE/8 bytes (512
	 * with 4 KiB pages) get an off-slab header, except while
	 * slab_early_init is set, when the header always stays on-slab.
	 */
	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;	/* mark the header as off-slab */

	size = ALIGN(size, align);	/* round size up to the final alignment */

	/*
	 * Compute the leftover (fragment) bytes per slab;
	 * calculate_slab_order() also fills in cachep->num and
	 * cachep->gfporder.
	 */
	left_over = calculate_slab_order(cachep, size, align, flags);

	if (!cachep->num) {	/* no object fits in any slab order: fail */
		printk(KERN_ERR
		       "kmem_cache_create: couldn't create cache %s.\n", name);
		kmem_cache_free(&cache_cache, cachep);
		cachep = NULL;
		goto oops;
	}
	/* Aligned size of the slab header: struct slab plus one
	 * kmem_bufctl_t per object. */
	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
			  + sizeof(struct slab), align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	/* Make use of the leftover: if the header fits there, keep it on-slab. */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;	/* header is now on-slab */
		left_over -= slab_size;		/* leftover shrinks accordingly */
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		/* The aligned header did not fit on-slab, so it stays
		 * off-slab and its size is left unaligned. */
		slab_size =
		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

#ifdef CONFIG_PAGE_POISONING
		/* If we're going to use the generic kernel_map_pages()
		 * poisoning, then it's going to smash the contents of
		 * the redzone and userword anyhow, so switch them off.
		 */
		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
	}

	/* Colour offset starts out as the cache line size. */
	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < align)
		cachep->colour_off = align;
	/* Number of colours = leftover bytes divided by the colour offset. */
	cachep->colour = left_over / cachep->colour_off;
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->gfpflags = 0;
	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
		cachep->gfpflags |= GFP_DMA;
	cachep->buffer_size = size;
	cachep->reciprocal_buffer_size = reciprocal_value(size);

	if (flags & CFLGS_OFF_SLAB) {
		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
		/*
		 * This is a possibility for one of the malloc_sizes caches.
		 * But since we go off slab only for object size greater than
		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
		 * this should not happen at all.
		 * But leave a BUG_ON for some lucky dude.
		 */
		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
	}
	cachep->ctor = ctor;
	/* The name pointer is stored as-is (not copied), which is why @name
	 * must stay valid until the cache is destroyed. */
	cachep->name = name;

	if (setup_cpu_cache(cachep, gfp)) {
		__kmem_cache_destroy(cachep);
		cachep = NULL;
		goto oops;
	}

	/* cache setup completed, link it into the list */
	list_add(&cachep->next, &cache_chain);
oops:
	/* Common exit for success and failure: cachep is NULL on failure. */
	if (!cachep && (flags & SLAB_PANIC))
		panic("kmem_cache_create(): failed to create slab `%s'\n",
		      name);
	/* Release the lock under the same condition it was taken. */
	if (slab_is_available()) {
		mutex_unlock(&cache_chain_mutex);
		put_online_cpus();
	}
	return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
碎片计算函数分析
left_over = calculate_slab_order(cachep, size, align, flags);
/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent. For now, try to avoid using
 * high order pages for slabs. When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 *
 * Tries page orders from 0 upwards. Each acceptable order is recorded in
 * cachep->num and cachep->gfporder, and the search stops at the first
 * order that satisfies one of the break conditions below. Returns the
 * left-over bytes of the last recorded order, or 0 if none was recorded
 * (cachep->num is then not written by this function).
 */
static size_t calculate_slab_order(struct kmem_cache *cachep,
			size_t size, size_t align, unsigned long flags)
{
	unsigned long offslab_limit;
	size_t left_over = 0;
	int gfporder;

	/* Try every page order from 0 to KMALLOC_MAX_ORDER inclusive. */
	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
		unsigned int num;
		size_t remainder;

		cache_estimate(gfporder, size, align, flags, &remainder, &num);
		/*
		 * Object too large: 2^gfporder pages cannot hold even one
		 * object, so move on to the next (larger) order.
		 */
		if (!num)
			continue;

		if (flags & CFLGS_OFF_SLAB) {
			/*
			 * Max number of objs-per-slab for caches which
			 * use off-slab slabs. Needed to avoid a possible
			 * looping condition in cache_grow().
			 */
			/*
			 * Article author's reading: the limit is the number of
			 * kmem_bufctl_t entries that fit in one object-sized
			 * buffer after a struct slab.
			 */
			offslab_limit = size - sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);

			if (num > offslab_limit)	/* too many objects */
				break;
		}

		/* Found something acceptable - save it away */
		cachep->num = num;	/* record the result in the cache */
		cachep->gfporder = gfporder;
		left_over = remainder;

		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		/* Reclaimable caches take the first order that works. */
		if (flags & SLAB_RECLAIM_ACCOUNT)
			break;

		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		/* Stop once the order reaches slab_break_gfp_order. */
		if (gfporder >= slab_break_gfp_order)
			break;

		/*
		 * Acceptable internal fragmentation?
		 */
		/* Stop when at most 1/8 of (PAGE_SIZE << gfporder) is wasted. */
		if (left_over * 8 <= (PAGE_SIZE << gfporder))
			break;
	}
	return left_over;
}
注释已经说明了:对于给定大小的 slab(PAGE_SIZE << gfporder),计算其中可以容纳多少个对象,以及剩余多少碎片字节
cache_estimate(gfporder, size, align, flags, &remainder, &num);
/* * Calculate the number of objects and left-over bytes for a given buffer size. */ static void cache_estimate(unsigned long gfporder, size_t buffer_size, size_t align, int flags, size_t *left_over, unsigned int *num) { int nr_objs; size_t mgmt_size; size_t slab_size = PAGE_SIZE << gfporder;//分配的内存页 /* * The slab management structure can be either off the slab or * on it. For the latter case, the memory allocated for a * slab is used for: * * - The struct slab * - One kmem_bufctl_t for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. */ if (flags & CFLGS_OFF_SLAB) {//slab结构体外挂,这就比较简单 mgmt_size = 0; nr_objs = slab_size / buffer_size;//直接整除每个对象的大小 if (nr_objs > SLAB_LIMIT)//对象的限制 nr_objs = SLAB_LIMIT; } else {//slab结构体内置,会麻烦点 /* * Ignore padding for the initial guess. The padding * is at most @align-1 bytes, and @buffer_size is at * least @align. In the worst case, this result will * be one greater than the number of objects that fit * into the memory allocation when taking the padding * into account. *///内置,struct slab只有一个,而kmem_bufctl_t 就和对象一样多了,因为kmem_bufctl_t 就是用来查看对象是否空闲的 nr_objs = (slab_size - sizeof(struct slab)) / (buffer_size + sizeof(kmem_bufctl_t)); /* * This calculated number will be either the right * amount, or one greater than what we want. */ if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size > slab_size)//上面是没有对齐的比较,这里对齐后比较下是否越界,越界了就少一个对象 nr_objs--; if (nr_objs > SLAB_LIMIT) nr_objs = SLAB_LIMIT; mgmt_size = slab_mgmt_size(nr_objs, align);//这是对齐后的,struct slab + nr_objs * sizeof(kmem_bufctl_t)的值 } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size;//所有的大小 - 所有对齐的对象大小 - 对齐对象的结构体和其他值 }
slab着色问题理解:http://blog.csdn.net/zqy2000zqy/article/details/1137895