水平有限,描述不当之处还请之处,转载请注明出处http://blog.csdn.net/vanbreaker/article/details/7670272
创建新的缓存必须通过kmem_cache_create()函数来完成,原型如下
struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *))
kmem_cache_create()的实际工作就是为新的缓存申请缓存描述符,array_cache描述符和kmem_list3描述符,并根据接收的参数对这三个结构中的变量进行相应的初始化。新创建的缓存是空的,不包含slab。
struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; gfp_t gfp; /* * Sanity checks... these are all serious usage bugs. */ /*做一些必要的检查,以下情况都是不合法的: 1.缓存名为空 2.处于中断环境中 3.缓存中的对象大小小于处理器的字长 4.缓存中的对象大小大于普通缓存的最大长度*/ if (!name || in_interrupt() || (size < BYTES_PER_WORD) || size > KMALLOC_MAX_SIZE) { printk(KERN_ERR "%s: Early error in slab %s\n", __func__, name); BUG(); } /* * We use cache_chain_mutex to ensure a consistent view of * cpu_online_mask as well. Please see cpuup_callback */ if (slab_is_available()) { get_online_cpus(); mutex_lock(&cache_chain_mutex); } list_for_each_entry(pc, &cache_chain, next) { char tmp; int res; /* * This happens when the module gets unloaded and doesn't * destroy its slab cache and no-one else reuses the vmalloc * area of the module. Print a warning. */ res = probe_kernel_address(pc->name, tmp); if (res) { printk(KERN_ERR "SLAB: cache with size %d has lost its name\n", pc->buffer_size); continue; } if (!strcmp(pc->name, name)) { printk(KERN_ERR "kmem_cache_create: duplicate cache %s\n", name); dump_stack(); goto oops; } } #if DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with * large objects, if the increased size would increase the object size * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif /* * Always checks flags, a caller might be expecting debug support which * isn't available. */ BUG_ON(flags & ~CREATE_MASK); /* * Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ /*如果缓存对象大小没有对齐到处理器字长,则对齐之*/ if (size & (BYTES_PER_WORD - 1)) { size += (BYTES_PER_WORD - 1); size &= ~(BYTES_PER_WORD - 1); } /* calculate the final buffer alignment: */ /* 1) arch recommendation: can be overridden for debug */ /*要求按照体系结构对齐*/ if (flags & SLAB_HWCACHE_ALIGN) { /* * Default alignment: as specified by the arch code. Except if * an object is really small, then squeeze multiple objects into * one cacheline. */ ralign = cache_line_size();/*对齐值取L1缓存行的大小*/ /*如果对象大小足够小,则不断压缩对齐值以保证能将足够多的对象装入一个缓存行*/ while (size <= ralign / 2) ralign /= 2; } else { ralign = BYTES_PER_WORD; /*对齐值取处理器字长*/ } /* * Redzoning and user store require word alignment or possibly larger. * Note this will be overridden by architecture or caller mandated * alignment if either is greater than BYTES_PER_WORD. */ /*如果开启了DEBUG,则按需要进行相应的对齐*/ if (flags & SLAB_STORE_USER) ralign = BYTES_PER_WORD; if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably * aligned, by adjusting the object size accordingly. */ size += REDZONE_ALIGN - 1; size &= ~(REDZONE_ALIGN - 1); } /* 2) arch mandated alignment */ if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; } /* 3) caller mandated alignment */ if (ralign < align) { ralign = align; } /* disable debug if necessary */ if (ralign > __alignof__(unsigned long long)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. */ align = ralign; if (slab_is_available()) gfp = GFP_KERNEL; else gfp = GFP_NOWAIT; /* Get cache's description obj. */ /*从cache_cache中分配一个高速缓存描述符*/ cachep = kmem_cache_zalloc(&cache_cache, gfp); if (!cachep) goto oops; #if DEBUG cachep->obj_size = size; /* * Both debugging options require word-alignment which is calculated * into align above. */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ cachep->obj_offset += sizeof(unsigned long long); size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of * the real object. But if the second red zone needs to be * aligned to 64 bits, we must allow that much space. */ if (flags & SLAB_RED_ZONE) size += REDZONE_ALIGN; else size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); size = PAGE_SIZE; } #endif #endif /* * Determine if the slab management is 'on' or 'off' slab. * (bootstrapping cannot cope with offslab caches so don't do * it too early on.) */ /*如果缓存对象的大小不小于页面大小的1/8并且不处于slab初始化阶段, 则选择将slab描述符放在slab外部以腾出更多的空间给对象*/ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). */ flags |= CFLGS_OFF_SLAB; /*将对象大小按之前确定的align对齐*/ size = ALIGN(size, align); /*计算slab的对象数,分配给slab的页框阶数并返回slab的剩余空间,即碎片大小*/ left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name); kmem_cache_free(&cache_cache, cachep); cachep = NULL; goto oops; } /*将slab管理区的大小按align进行对齐*/ slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), align); /* * If the slab has been placed off-slab, and we have enough space then * move it on-slab. This is at the expense of any extra colouring. */ /*如果之前确定将slab管理区放在slab外部,但是碎片空间大于slab管理区大小, 这时改变策略将slab管理区放在slab内部,这样可以节省外部空间,但是会牺牲 着色的颜色个数*/ if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { flags &= ~CFLGS_OFF_SLAB; left_over -= slab_size; } /*如果的确要将slab管理区放在外部,则不需按照该slab的对齐方式进行对齐了, 重新计算slab_size*/ if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() * poisoning, then it's going to smash the contents of * the redzone and userword anyhow, so switch them off. */ if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); #endif } /*着色偏移区L1缓存行的大小*/ cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ if (cachep->colour_off < align)/*着色偏移小于align的话则要取对齐值*/ cachep->colour_off = align; /*计算着色的颜色数目*/ cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) cachep->gfpflags |= GFP_DMA; cachep->buffer_size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); /* * This is a possibility for one of the malloc_sizes caches. * But since we go off slab only for object size greater than * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } cachep->ctor = ctor; cachep->name = name; if (setup_cpu_cache(cachep, gfp)) { __kmem_cache_destroy(cachep); cachep = NULL; goto oops; } /* cache setup completed, link it into the list */ /*将该高速缓存描述符添加进cache_chain*/ list_add(&cachep->next, &cache_chain); oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); if (slab_is_available()) { mutex_unlock(&cache_chain_mutex); put_online_cpus(); } return cachep; }
再来看看两个辅助函数calculate_slab_order()和setup_cpu_cache()
static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { unsigned long offslab_limit; size_t left_over = 0; int gfporder; for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { unsigned int num; size_t remainder; /*根据gfporder计算对象数和剩余空间*/ cache_estimate(gfporder, size, align, flags, &remainder, &num); if (!num)/*如果计算出来的对象数为0则要增大分配给slab的页框阶数再进行计算*/ continue; if (flags & CFLGS_OFF_SLAB) { /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ /*offslab_limit记录了在外部存储slab描述符时所允许的slab最大对象数*/ offslab_limit = size - sizeof(struct slab); offslab_limit /= sizeof(kmem_bufctl_t); /*如果前面计算出的对象数num要大于允许的最大对象数,则不合法*/ if (num > offslab_limit) break; } /* Found something acceptable - save it away */ cachep->num = num; cachep->gfporder = gfporder; left_over = remainder; /* * A VFS-reclaimable slab tends to have most allocations * as GFP_NOFS and we really don't want to have to be allocating * higher-order pages when we are unable to shrink dcache. */ if (flags & SLAB_RECLAIM_ACCOUNT) break; /* * Large number of objects is good, but very large slabs are * currently bad for the gfp()s. */ if (gfporder >= slab_break_gfp_order) break; /* * Acceptable internal fragmentation? */ if (left_over * 8 <= (PAGE_SIZE << gfporder)) break; } return left_over; }
static void cache_estimate(unsigned long gfporder, size_t buffer_size, size_t align, int flags, size_t *left_over, unsigned int *num) { int nr_objs; size_t mgmt_size; size_t slab_size = PAGE_SIZE << gfporder; /* * The slab management structure can be either off the slab or * on it. For the latter case, the memory allocated for a * slab is used for: * * - The struct slab * - One kmem_bufctl_t for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. */ /*如果slab描述符存储在slab外部,则slab的对象数即为slab_size/buffer_size*/ if (flags & CFLGS_OFF_SLAB) { mgmt_size = 0; nr_objs = slab_size / buffer_size; if (nr_objs > SLAB_LIMIT) nr_objs = SLAB_LIMIT; } else {/*否则先减去slab管理区的大小再进行计算*/ /* * Ignore padding for the initial guess. The padding * is at most @align-1 bytes, and @buffer_size is at * least @align. In the worst case, this result will * be one greater than the number of objects that fit * into the memory allocation when taking the padding * into account. */ nr_objs = (slab_size - sizeof(struct slab)) / (buffer_size + sizeof(kmem_bufctl_t)); /* * This calculated number will be either the right * amount, or one greater than what we want. */ if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size > slab_size) nr_objs--; if (nr_objs > SLAB_LIMIT) nr_objs = SLAB_LIMIT; /*计算slab管理区的大小*/ mgmt_size = slab_mgmt_size(nr_objs, align); } /*保存slab对象数*/ *num = nr_objs; /*计算并保存slab的剩余空间*/ *left_over = slab_size - nr_objs*buffer_size - mgmt_size; }
在slab初始化完成后,也就是g_cpucache_up变量的值为FULL后,setup_cpu_cache()函数等价于setup_cpu_cache()-->enable_cpucache()
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; int limit, shared; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm * - reduce the number of spinlock operations. * - reduce the number of linked list operations on the slab and * bufctl chains: array operations are cheaper. * The numbers are guessed, we should auto-tune as described by * Bonwick. */ /*根据对象的大小来确定本地高速缓存中的空闲对象上限*/ if (cachep->buffer_size > 131072) limit = 1; else if (cachep->buffer_size > PAGE_SIZE) limit = 8; else if (cachep->buffer_size > 1024) limit = 24; else if (cachep->buffer_size > 256) limit = 54; else limit = 120; /* * CPU bound tasks (e.g. network routing) can exhibit cpu bound * allocation behaviour: Most allocs on one cpu, most free operations * on another cpu. For these cases, an efficient object passing between * cpus is necessary. This is provided by a shared array. The array * replaces Bonwick's magazine layer. * On uniprocessor, it's functionally equivalent (but less efficient) * to a larger limit. Thus disabled by default. */ shared = 0; if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; #if DEBUG /* * With debugging enabled, large batchcount lead to excessively long * periods with disabled local interrupts. Limit the batchcount */ if (limit > 32) limit = 32; #endif err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); return err; }
static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; int i; /*申请一个ccupdate_struct*/ new = kzalloc(sizeof(*new), gfp); if (!new) return -ENOMEM; /*为每个CPU申请array_cache和用来跟踪本地CPU空闲对象的指针数组*/ for_each_online_cpu(i) { new->new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount, gfp); if (!new->new[i]) { for (i--; i >= 0; i--) kfree(new->new[i]); kfree(new); return -ENOMEM; } } new->cachep = cachep; /*将cachep和array_cache进关联*/ on_each_cpu(do_ccupdate_local, (void *)new, 1); check_irq_on(); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; for_each_online_cpu(i) { struct array_cache *ccold = new->new[i]; if (!ccold) continue; spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); kfree(ccold); } kfree(new); /*申请kmem_list3*/ return alloc_kmemlist(cachep, gfp); }