A new cache is created through kmem_cache_create(); the flow is as follows:

1. Obtain a cache descriptor from the global cache_cache by calling kmem_cache_zalloc(&cache_cache, gfp). cache_cache is initialised with an object size equal to sizeof(struct kmem_cache), so the returned pointer can be used directly as a kmem_cache.
2. Work out the slab layout and the leftover (fragmentation) space in each slab; this is done by calculate_slab_order().
3. Compute and initialise the remaining attributes of the cache. For an off-slab cache, kmem_find_general_cachep(slab_size, 0u) selects cachep->slabp_cache, the general cache that will hold the struct slab and the kmem_bufctl_t[] array.
4. Set up the per-CPU local caches with setup_cpu_cache().
5. Once the cache is fully set up, link it into the global list of slab caches.
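Before walking through the implementation, a minimal usage sketch may help fix the API in mind. Everything in it (the my_record type, the cache name, the module boilerplate) is hypothetical and only illustrates the call sequence whose internals are analysed below.

/* Hypothetical example of using the API analysed below; the struct,
 * the cache name and the module are made up for illustration only. */
#include <linux/module.h>
#include <linux/slab.h>

struct my_record {                      /* hypothetical object type */
    int id;
    char payload[120];
};

static struct kmem_cache *my_cache;

static int __init my_init(void)
{
    /* steps 1-5 above all happen inside this call */
    my_cache = kmem_cache_create("my_record_cache",
                                 sizeof(struct my_record), 0,
                                 SLAB_HWCACHE_ALIGN, NULL);
    if (!my_cache)
        return -ENOMEM;

    /* objects now come from my_cache instead of the general kmalloc caches */
    {
        struct my_record *r = kmem_cache_alloc(my_cache, GFP_KERNEL);
        if (r)
            kmem_cache_free(my_cache, r);
    }
    return 0;
}

static void __exit my_exit(void)
{
    /* every object must have been freed before this call */
    kmem_cache_destroy(my_cache);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

The exported prototype and the internal implementation follow.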
struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *))
struct kmem_cache *
__kmem_cache_create(const char *name, size_t size, size_t align,
                    unsigned long flags, void (*ctor)(void *))
{
    size_t left_over, slab_size, ralign;
    struct kmem_cache *cachep = NULL;
    gfp_t gfp;

#if DEBUG
#if FORCED_DEBUG
    /*
     * Enable redzoning and last user accounting, except for caches with
     * large objects, if the increased size would increase the object size
     * above the next power of two: caches with object sizes just above a
     * power of two have a significant amount of internal fragmentation.
     */
    if (size < 4096 || fls(size - 1) == fls(size - 1 + REDZONE_ALIGN +
                                            2 * sizeof(unsigned long long)))
        flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
    if (!(flags & SLAB_DESTROY_BY_RCU))
        flags |= SLAB_POISON;
#endif
    if (flags & SLAB_DESTROY_BY_RCU)
        BUG_ON(flags & SLAB_POISON);
#endif
    /*
     * Always checks flags, a caller might be expecting debug support which
     * isn't available.
     */
    BUG_ON(flags & ~CREATE_MASK);

    /*
     * Check that size is in terms of words.  This is needed to avoid
     * unaligned accesses for some archs when redzoning is used, and makes
     * sure any on-slab bufctl's are also correctly aligned.
     */
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);
    }

    /* calculate the final buffer alignment: */

    /* 1) arch recommendation: can be overridden for debug */
    if (flags & SLAB_HWCACHE_ALIGN) {
        /*
         * Default alignment: as specified by the arch code.  Except if
         * an object is really small, then squeeze multiple objects into
         * one cacheline.
         */
        ralign = cache_line_size();
        while (size <= ralign / 2)
            ralign /= 2;
    } else {
        ralign = BYTES_PER_WORD;
    }

    /*
     * Redzoning and user store require word alignment or possibly larger.
     * Note this will be overridden by architecture or caller mandated
     * alignment if either is greater than BYTES_PER_WORD.
     */
    if (flags & SLAB_STORE_USER)
        ralign = BYTES_PER_WORD;

    if (flags & SLAB_RED_ZONE) {
        ralign = REDZONE_ALIGN;
        /* If redzoning, ensure that the second redzone is suitably
         * aligned, by adjusting the object size accordingly. */
        size += REDZONE_ALIGN - 1;
        size &= ~(REDZONE_ALIGN - 1);
    }

    /* 2) arch mandated alignment */
    if (ralign < ARCH_SLAB_MINALIGN) {
        ralign = ARCH_SLAB_MINALIGN;
    }
    /* 3) caller mandated alignment */
    if (ralign < align) {
        ralign = align;
    }
    /* disable debug if necessary */
    if (ralign > __alignof__(unsigned long long))
        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
    /*
     * 4) Store it.
     */
    align = ralign;

    if (slab_is_available())    /* is the slab allocator itself usable yet? */
        gfp = GFP_KERNEL;
    else
        gfp = GFP_NOWAIT;

    /* Get cache's description obj. */
    /* Allocate the struct kmem_cache object itself.  Why does an object
     * taken from cache_cache come back as a struct kmem_cache?  Because
     * the global cache_cache is initialised with an object size of exactly
     * sizeof(struct kmem_cache). */
    cachep = kmem_cache_zalloc(&cache_cache, gfp);
    if (!cachep)
        return NULL;

    cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
    cachep->object_size = size;
    cachep->align = align;
#if DEBUG
    /*
     * Both debugging options require word-alignment which is calculated
     * into align above.
     */
    if (flags & SLAB_RED_ZONE) {
        /* add space for red zone words */
        cachep->obj_offset += sizeof(unsigned long long);
        size += 2 * sizeof(unsigned long long);
    }
    if (flags & SLAB_STORE_USER) {
        /* user store requires one word storage behind the end of
         * the real object. But if the second red zone needs to be
         * aligned to 64 bits, we must allow that much space.
         */
        if (flags & SLAB_RED_ZONE)
            size += REDZONE_ALIGN;
        else
            size += BYTES_PER_WORD;
    }
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
    if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
        && cachep->object_size > cache_line_size()
        && ALIGN(size, align) < PAGE_SIZE) {
        cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
        size = PAGE_SIZE;
    }
#endif
#endif

    /*
     * Determine if the slab management is 'on' or 'off' slab.
     * (bootstrapping cannot cope with offslab caches so don't do
     * it too early on. Always use on-slab management when
     * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
     */
    /* Decide whether the slab management object lives inside the slab
     * (on-slab) or outside it (off-slab).  Off-slab is normally used once
     * the object size reaches PAGE_SIZE/8 (512 bytes with 4 KB pages);
     * during early bootstrap on-slab management is always used. */
    if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
        !(flags & SLAB_NOLEAKTRACE))
        /*
         * Size is large, assume best to place the slab management obj
         * off-slab (should allow better packing of objs).
         */
        flags |= CFLGS_OFF_SLAB;

    size = ALIGN(size, align);

    /* pick the slab order and get back the leftover (fragmentation) size */
    left_over = calculate_slab_order(cachep, size, align, flags);

    if (!cachep->num) {
        /* cachep->num is the number of objects per slab; zero means no
         * cache could be built for objects of this size */
        printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name);
        kmem_cache_free(&cache_cache, cachep);
        return NULL;
    }
    slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
                      + sizeof(struct slab), align);

    /*
     * If the slab has been placed off-slab, and we have enough space then
     * move it on-slab. This is at the expense of any extra colouring.
     */
    /* If this was going to be an off-slab slab but the leftover space is
     * at least as large as the management object, move the management
     * object into the slab and turn it back into an on-slab slab. */
    if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
        flags &= ~CFLGS_OFF_SLAB;
        left_over -= slab_size;
    }

    /* align applies to the objects in the slab; a management object stored
     * off-slab does not influence where the objects that follow it are
     * placed, so it needs no manual alignment. */
    if (flags & CFLGS_OFF_SLAB) {
        /* really off slab. No need for manual alignment */
        slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

#ifdef CONFIG_PAGE_POISONING
        /* If we're going to use the generic kernel_map_pages()
         * poisoning, then it's going to smash the contents of
         * the redzone and userword anyhow, so switch them off.
         */
        if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
            flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
    }

    cachep->colour_off = cache_line_size();    /* base unit of a colouring block */
    /* Offset must be a multiple of the alignment. */
    if (cachep->colour_off < align)
        cachep->colour_off = align;    /* the colour offset must be a multiple of the object alignment */
    cachep->colour = left_over / cachep->colour_off;    /* how many colour offsets fit into the leftover space */
    cachep->slab_size = slab_size;    /* size of the slab management object */
    cachep->flags = flags;
    cachep->allocflags = 0;
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->allocflags |= GFP_DMA;
    cachep->size = size;    /* size of a slab object */
    cachep->reciprocal_buffer_size = reciprocal_value(size);

    if (flags & CFLGS_OFF_SLAB) {
        /* For an off-slab cache, find a general cache whose objects are
         * slab_size bytes and record it in slabp_cache.  When a slab is
         * created, the struct slab and the kmem_bufctl_t[] array are
         * allocated from that cache.  For an on-slab cache this pointer
         * stays NULL. */
        cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
        /*
         * This is a possibility for one of the malloc_sizes caches.
         * But since we go off slab only for object size greater than
         * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
         * this should not happen at all.
         * But leave a BUG_ON for some lucky dude.
         */
        BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
    }
    cachep->ctor = ctor;
    cachep->name = name;

    if (setup_cpu_cache(cachep, gfp)) {    /* set up each cpu's local cache */
        __kmem_cache_destroy(cachep);
        return NULL;
    }

    if (flags & SLAB_DEBUG_OBJECTS) {
        /*
         * Would deadlock through slab_destroy()->call_rcu()->
         * debug_object_activate()->kmem_cache_alloc().
         */
        WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);

        slab_set_debugobj_lock_classes(cachep);
    }

    /* cache setup completed: link it into the global list of slab caches */
    list_add(&cachep->list, &slab_caches);
    return cachep;
}
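To make the management-object arithmetic above concrete, here is a small userspace sketch. It assumes 4 KB pages, a 4-byte kmem_bufctl_t, a 32-byte struct slab, a hypothetical 128-byte object and 64-byte cache-line alignment; all of these are illustrative assumptions, the real sizes depend on the architecture and kernel configuration.

/* Userspace sketch of the slab_size computation in __kmem_cache_create().
 * All sizes are assumptions for illustration only. Since 128 < PAGE_SIZE/8,
 * this cache would keep its management object on-slab. */
#include <stdio.h>

#define PAGE_SIZE          4096UL
#define SIZEOF_KMEM_BUFCTL 4UL      /* assumed sizeof(kmem_bufctl_t) */
#define SIZEOF_STRUCT_SLAB 32UL     /* assumed sizeof(struct slab)   */

static unsigned long align_up(unsigned long x, unsigned long a)
{
    return (x + a - 1) & ~(a - 1);
}

int main(void)
{
    unsigned long size  = 128;  /* hypothetical object size */
    unsigned long align = 64;   /* hypothetical cache-line alignment */
    unsigned long num   = 30;   /* pretend calculate_slab_order() found 30 objects */

    /* on-slab: struct slab + kmem_bufctl_t[num], aligned like the objects */
    unsigned long slab_size = align_up(num * SIZEOF_KMEM_BUFCTL +
                                       SIZEOF_STRUCT_SLAB, align);

    printf("management object needs %lu bytes\n", slab_size);
    printf("objects + management: %lu of %lu bytes\n",
           num * size + slab_size, PAGE_SIZE);
    return 0;
}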
/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
/* Work out how many pages a slab is made of and how many objects fit in it. */
static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    unsigned long offslab_limit;
    size_t left_over = 0;
    int gfporder;

    for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
        unsigned int num;
        size_t remainder;

        /* how many objects fit in a slab of this order */
        cache_estimate(gfporder, size, align, flags, &remainder, &num);
        /* zero objects means not even one fits at this order: try the next one */
        if (!num)
            continue;

        if (flags & CFLGS_OFF_SLAB) {
            /*
             * Max number of objs-per-slab for caches which
             * use off-slab slabs. Needed to avoid a possible
             * looping condition in cache_grow().
             */
            /* When an off-slab slab is created, its management object
             * (struct slab plus the kmem_bufctl_t array) is allocated
             * through the ordinary object-allocation path:
             *   kmem_cache_alloc -> __cache_alloc -> __do_cache_alloc ->
             *   ____cache_alloc -> cache_alloc_refill -> cache_grow ->
             *   alloc_slabmgmt -> kmem_cache_alloc_node -> kmem_cache_alloc
             * which can loop.  The key is alloc_slabmgmt(): the loop only
             * appears when the management object is itself stored off-slab,
             * and that happens when a slab holds so many objects that the
             * kmem_bufctl_t array, and with it the whole management object,
             * becomes large.  So the number of kmem_bufctl_t entries must be
             * bounded.  The bound below is deliberately rough: objects of
             * this size are stored off-slab, so assume the management object
             * is about this size too and compute how many kmem_bufctl_t
             * entries would certainly push it off-slab.  "Rough" because an
             * array smaller than this limit does not guarantee an on-slab
             * management object - but that causes no harm, since
             * slab_break_gfp_order (usually 1, i.e. at most two pages per
             * slab) also caps the slab size, and off-slab caches hold large
             * objects (> 512 bytes), so a slab never contains that many of
             * them and the kmem_bufctl_t array stays small.  A rough check
             * is enough. */
            offslab_limit = size - sizeof(struct slab);
            offslab_limit /= sizeof(kmem_bufctl_t);

            /* more objects than the limit: stop, and do not try higher
             * orders, to keep the number of objects per slab bounded.
             * The values computed so far remain valid. */
            if (num > offslab_limit)
                break;
        }

        /* Found something acceptable - save it away */
        cachep->num = num;            /* objects per slab */
        cachep->gfporder = gfporder;  /* pages per slab, as an order */
        left_over = remainder;        /* leftover (fragmentation) in the slab */

        /*
         * A VFS-reclaimable slab tends to have most allocations
         * as GFP_NOFS and we really don't want to have to be allocating
         * higher-order pages when we are unable to shrink dcache.
         */
        /* SLAB_RECLAIM_ACCOUNT marks the slab's pages as reclaimable: they
         * are counted when the kernel checks whether enough pages are
         * available for user-space demand, and kmem_freepages() can give
         * the page frames back.  Being reclaimable, they need none of the
         * fragmentation checks below. */
        if (flags & SLAB_RECLAIM_ACCOUNT)
            break;

        /*
         * Large number of objects is good, but very large slabs are
         * currently bad for the gfp()s.
         */
        /* slab_break_gfp_order caps the number of pages per slab: once it
         * is reached, higher orders are not tried no matter how much
         * fragmentation is left. */
        if (gfporder >= slab_break_gfp_order)
            break;

        /*
         * Acceptable internal fragmentation?
         */
        /* the slab is at least eight times larger than the leftover, i.e.
         * page utilisation is good enough: accept this order */
        if (left_over * 8 <= (PAGE_SIZE << gfporder))
            break;
    }
    /* return the leftover (fragmentation) size */
    return left_over;
}
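cache_estimate() itself is not listed in this post. The sketch below approximates what it computes for the on-slab case (each object costs size plus one kmem_bufctl_t, plus one struct slab per slab), using the same assumed sizes as the previous sketch and ignoring the alignment correction the real function applies.

/* Rough userspace approximation of cache_estimate() for the on-slab case.
 * Sizes of kmem_bufctl_t and struct slab are illustrative assumptions. */
#include <stdio.h>

#define PAGE_SIZE          4096UL
#define SIZEOF_KMEM_BUFCTL 4UL
#define SIZEOF_STRUCT_SLAB 32UL

static void estimate(int gfporder, unsigned long size,
                     unsigned long *left_over, unsigned long *num)
{
    unsigned long slab_bytes = PAGE_SIZE << gfporder;

    /* each object costs size bytes plus one kmem_bufctl_t entry,
     * and each slab carries one struct slab header */
    *num = (slab_bytes - SIZEOF_STRUCT_SLAB) /
           (size + SIZEOF_KMEM_BUFCTL);
    *left_over = slab_bytes -
                 (*num * (size + SIZEOF_KMEM_BUFCTL) + SIZEOF_STRUCT_SLAB);
}

int main(void)
{
    unsigned long left, num;
    int order;

    for (order = 0; order <= 2; order++) {
        estimate(order, 128, &left, &num);
        printf("order %d: %lu objects, %lu bytes left over\n",
               order, num, left);
    }
    return 0;
}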
Setting up the per-CPU local cache:

/* Configure the local caches and the three slab lists (full/partial/free). */
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    /* the general caches are fully initialised: configure every cpu's local cache */
    if (g_cpucache_up == FULL)
        return enable_cpucache(cachep, gfp);

    /* We are still in system initialisation.  g_cpucache_up records how far
     * the general caches have come: PARTIAL_AC means the cache holding
     * struct array_cache has been created, PARTIAL_L3 means the cache
     * holding struct kmem_list3 has been created (note the order in which
     * these two are created).  During this phase only the boot cpu's local
     * cache and slab lists need to be set up. */
    if (g_cpucache_up == NONE) {
        /*
         * Note: the first kmem_cache_create must create the cache
         * that's used by kmalloc(24), otherwise the creation of
         * further caches will BUG().
         */
        /* We get here while creating the cache that will hold
         * struct array_cache; that general cache does not exist yet, so
         * the statically allocated initarray_generic is used as the local
         * cache. */
        cachep->array[smp_processor_id()] = &initarray_generic.cache;

        /*
         * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
         * the first cache, then we need to set up all its list3s,
         * otherwise the creation of further caches will BUG().
         */
        /* The cache for struct kmem_list3 is created after the one for
         * struct array_cache, so it cannot exist yet either: use the
         * static (global) lists. */
        set_up_list3s(cachep, SIZE_AC);

        /* At this point the cache holding struct array_cache is complete.
         * If struct kmem_list3 and struct array_cache live in the same
         * general cache it will not be created again, and g_cpucache_up
         * advances one step further. */
        if (INDEX_AC == INDEX_L3)
            g_cpucache_up = PARTIAL_L3;
        else
            g_cpucache_up = PARTIAL_AC;
    } else {
        /* g_cpucache_up is at least PARTIAL_AC here: the general cache for
         * struct array_cache exists, so it can be allocated with kmalloc. */
        cachep->array[smp_processor_id()] =
            kmalloc(sizeof(struct arraycache_init), gfp);

        if (g_cpucache_up == PARTIAL_AC) {
            /* the cache for struct kmem_list3 is still not ready:
             * keep using the global static slab lists */
            set_up_list3s(cachep, SIZE_L3);
            /* kmem_cache_init() (analysed later) only reaches this branch
             * while creating the cache for struct kmem_list3; once the
             * code above has run, that cache is usable, so advance
             * g_cpucache_up. */
            g_cpucache_up = PARTIAL_L3;
        } else {
            int node;
            for_each_online_node(node) {
                /* allocate the struct kmem_list3 with kmalloc */
                cachep->nodelists[node] =
                    kmalloc_node(sizeof(struct kmem_list3), gfp, node);
                BUG_ON(!cachep->nodelists[node]);
                /* initialise the three slab lists */
                kmem_list3_init(cachep->nodelists[node]);
            }
        }
    }
    /* set the time of the next periodic reap */
    cachep->nodelists[numa_node_id()]->next_reap =
            jiffies + REAPTIMEOUT_LIST3 +
            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
}
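The bootstrap progression that setup_cpu_cache() branches on can be summarised as below. This is only an illustration of the states described in the comments above, not the literal kernel definition (some versions contain further intermediate states).

/* Simplified illustration of the bootstrap progression tracked by
 * g_cpucache_up; the enum name here is hypothetical. */
enum cpucache_progress {
    NONE,        /* no general cache yet: use static array_cache and list3s */
    PARTIAL_AC,  /* the cache holding struct array_cache exists */
    PARTIAL_L3,  /* the cache holding struct kmem_list3 also exists */
    FULL         /* all general caches ready: enable_cpucache() can run */
};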
/**
 * kmem_cache_destroy - delete a cache
 * @cachep: the cache to destroy
 *
 * Remove a &struct kmem_cache object from the slab cache.
 *
 * It is expected this function will be called by a module when it is
 * unloaded.  This will remove the cache completely, and avoid a duplicate
 * cache being allocated each time a module is loaded and unloaded, if the
 * module doesn't have persistent in-kernel storage across loads and unloads.
 *
 * The cache must be empty before calling this function.
 *
 * The caller must guarantee that no one will allocate memory from the cache
 * during the kmem_cache_destroy().
 */
void kmem_cache_destroy(struct kmem_cache *cachep)
{
    BUG_ON(!cachep || in_interrupt());

    /* Find the cache in the chain of caches. */
    get_online_cpus();
    mutex_lock(&slab_mutex);
    /*
     * the chain is never empty, cache_cache is never destroyed
     */
    list_del(&cachep->list);    /* unlink the cache from the cache chain */

    /* __cache_shrink() releases the free list.  If slabs remain on the full
     * or partial lists, objects are still allocated, so this cache must not
     * be destroyed. */
    if (__cache_shrink(cachep)) {
        slab_error(cachep, "Can't free all objects");
        list_add(&cachep->list, &slab_caches);    /* put the cache back on the chain */
        mutex_unlock(&slab_mutex);
        put_online_cpus();
        return;
    }

    if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
        rcu_barrier();

    /* free the objects backing every descriptor this cache uses */
    __kmem_cache_destroy(cachep);
    mutex_unlock(&slab_mutex);
    put_online_cpus();
}
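The __cache_shrink() check above is what catches a caller that destroys a cache while objects are still allocated. A hypothetical sequence (reusing the my_cache from the earlier sketch; the names are made up) would be rejected like this:

/* Hypothetical: if an object is still allocated when kmem_cache_destroy()
 * runs, __cache_shrink() finds a non-empty full/partial list, the error
 * "Can't free all objects" is logged and the cache is re-added to
 * slab_caches instead of being freed. */
#include <linux/slab.h>

static void *leaked;

static void broken_teardown(struct kmem_cache *my_cache)
{
    leaked = kmem_cache_alloc(my_cache, GFP_KERNEL);  /* never freed */
    kmem_cache_destroy(my_cache);  /* refuses: objects still in use */
}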
static int __cache_shrink(struct kmem_cache *cachep)
{
    int ret = 0, i = 0;
    struct kmem_list3 *l3;

    /* give the free objects held in the per-cpu local caches, the shared
     * local cache and the l3->alien caches back to their slabs */
    drain_cpu_caches(cachep);

    check_irq_on();
    for_each_online_node(i) {
        l3 = cachep->nodelists[i];
        if (!l3)
            continue;

        /* destroy the slabs on the free list */
        drain_freelist(cachep, l3, l3->free_objects);

        /* if either the full or the partial list is non-empty, ret becomes non-zero */
        ret += !list_empty(&l3->slabs_full) ||
            !list_empty(&l3->slabs_partial);
    }
    return (ret ? 1 : 0);
}
static void __kmem_cache_destroy(struct kmem_cache *cachep)
{
    int i;
    struct kmem_list3 *l3;

    /* free the struct array_cache objects used as the per-cpu local caches;
     * note that only online cpus are walked here, arrays belonging to
     * offline cpus are not freed */
    for_each_online_cpu(i)
        kfree(cachep->array[i]);

    /* NUMA: free the list3 structures */
    for_each_online_node(i) {    /* for every online node */
        l3 = cachep->nodelists[i];
        if (l3) {
            /* free the struct array_cache used as the shared local cache */
            kfree(l3->shared);
            free_alien_cache(l3->alien);
            kfree(l3);    /* free the three slab lists */
        }
    }
    /* the cache descriptor itself is an object of cache_cache,
     * so release it through the object-free path */
    kmem_cache_free(&cache_cache, cachep);
}
static void drain_cpu_caches(struct kmem_cache *cachep)
{
    struct kmem_list3 *l3;
    int node;

    on_each_cpu(do_drain, cachep, 1);
    check_irq_on();
    for_each_online_node(node) {
        l3 = cachep->nodelists[node];
        if (l3 && l3->alien)
            drain_alien_cache(cachep, l3->alien);    /* drain the alien array caches */
    }

    for_each_online_node(node) {
        l3 = cachep->nodelists[node];
        if (l3)
            drain_array(cachep, l3, l3->shared, 1, node);    /* drain the shared array cache */
    }
}
/*
 * Remove slabs from the list of free slabs.
 * Specify the number of slabs to drain in tofree.
 *
 * Returns the actual number of slabs released.
 */
static int drain_freelist(struct kmem_cache *cache,
            struct kmem_list3 *l3, int tofree)
{
    struct list_head *p;
    int nr_freed;
    struct slab *slabp;

    nr_freed = 0;
    /* keep going while fewer than tofree slabs have been freed
     * and the free list is not empty */
    while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {

        spin_lock_irq(&l3->list_lock);
        p = l3->slabs_free.prev;
        if (p == &l3->slabs_free) {    /* the list has no entries left */
            spin_unlock_irq(&l3->list_lock);
            goto out;
        }

        slabp = list_entry(p, struct slab, list);    /* take one slab off the free list */
#if DEBUG
        BUG_ON(slabp->inuse);
#endif
        list_del(&slabp->list);    /* unlink it */
        /*
         * Safe to drop the lock. The slab is no longer linked
         * to the cache.
         */
        l3->free_objects -= cache->num;    /* the node's free-object count drops by num */
        spin_unlock_irq(&l3->list_lock);
        slab_destroy(cache, slabp);    /* destroy the slab; see the slab-destruction analysis later */
        nr_freed++;
    }
out:
    return nr_freed;
}
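drain_freelist() is not only used on the destruction path. The periodic reaper also calls it, but bounds tofree to a fraction of the node's free limit instead of passing l3->free_objects as __cache_shrink() does. The sketch below follows the spirit of cache_reap(); the exact expression may differ between kernel versions, and the wrapper function is hypothetical.

/* Hypothetical wrapper showing how a reaper-style caller bounds its work:
 * it only asks drain_freelist() for a fraction of the free limit per pass,
 * rather than draining every free slab. */
static void reap_one_node(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
    int tofree = (l3->free_limit + 5 * cachep->num - 1) / (5 * cachep->num);

    drain_freelist(cachep, l3, tofree);
}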