两个月前解决了lvm在线扩容的bug,当时未得空,现在得空记录一下,内核版本基于3.10.33。
线上万客云lvm在线扩容导致lvresize卡死,log 如下:
[47572.794000] BUG kmalloc-192(0:docker_cloud) (Tainted: P B O): Objects remaining in kmalloc-192(0:4dacd92fd73c5563ea9a5348c7e4955b07c3e6d61058caea00fb471bed88274b
[47572.813702] -----------------------------------------------------------------------------
\x000a
[47572.823675] INFO: Slab 0xc126155c objects=21 used=16 fp=0xc8c5f600 flags=0x0080
[47572.830920] CPU: 2 PID: 12715 Comm: lvresize Tainted: P B O 3.10.33 #57
[47572.838397] [] (unwind_backtrace+0x0/0xec) from [] (show_stack+0x10/0x14)
[47572.846992] [] (show_stack+0x10/0x14) from [] (slab_err+0x74/0x84)
[47572.855011] [] (slab_err+0x74/0x84) from [] (free_partial+0xd8/0x220)
[47572.863292] [] (free_partial+0xd8/0x220) from [] (__kmem_cache_shutdown+0x40/0xcc)
[47572.872700] [] (__kmem_cache_shutdown+0x40/0xcc) from [] (kmem_cache_destroy+0x64/0xf8)
[47572.882541] [] (kmem_cache_destroy+0x64/0xf8) from [] (kmem_cache_destroy_memcg_children+0x84/0x98)
[47572.893433] [] (kmem_cache_destroy_memcg_children+0x84/0x98) from [] (kmem_cache_destroy+0x14/0xf8)
[47572.904310] [] (kmem_cache_destroy+0x14/0xf8) from [] (bioset_free+0xe8/0x114)
[47572.913357] [] (bioset_free+0xe8/0x114) from [] (dm_swap_table+0x1bc/0x320)
[47572.922201] [] (dm_swap_table+0x1bc/0x320) from [] (dev_suspend+0x12c/0x268)
[47572.931251] [] (dev_suspend+0x12c/0x268) from [] (ctl_ioctl+0x4b0/0x4d0)
[47572.939581] [] (ctl_ioctl+0x4b0/0x4d0) from [] (do_vfs_ioctl+0x55c/0x5b0)
[47572.948179] [] (do_vfs_ioctl+0x55c/0x5b0) from [] (SyS_ioctl+0x50/0x7c)
[47572.956633] [] (SyS_ioctl+0x50/0x7c) from [] (ret_fast_syscall+0x0/0x30)
[47572.965162] INFO: Object 0xc8c5f000 @offset=0
[47572.969638] INFO: Object 0xc8c5f0c0 @offset=192
[47572.974286] INFO: Object 0xc8c5f240 @offset=576
[47572.978921] INFO: Object 0xc8c5f300 @offset=768
[47572.983592] INFO: Object 0xc8c5f480 @offset=1152
[47572.988314] INFO: Object 0xc8c5f780 @offset=1920
[47572.993096] INFO: Object 0xc8c5f840 @offset=2112
[47572.997813] INFO: Object 0xc8c5f900 @offset=2304
[47573.002585] INFO: Object 0xc8c5f9c0 @offset=2496
[47573.007298] INFO: Object 0xc8c5fa80 @offset=2688
[47573.012039] INFO: Object 0xc8c5fb40 @offset=2880
[47573.016796] INFO: Object 0xc8c5fc00 @offset=3072
[47573.021518] INFO: Object 0xc8c5fcc0 @offset=3264
[47573.026293] INFO: Object 0xc8c5fd80 @offset=3456
[47573.031005] INFO: Object 0xc8c5fe40 @offset=3648
[47573.035797] INFO: Object 0xc8c5ff00 @offset=3840
[47573.040508] =============================================================================
lvresize通过ioctl系统调用进入内核后调用do_resume(调用栈里显示的是dev_suspend,do_resume应是被内联到了其中),do_resume函数调用dm_swap_table用新的映射表替换老的映射表,该函数还会调用__bind_mempools为md设备绑定新的内存pool,通过bioset_free销毁老的内存pool,然后调用kmem_cache_destroy函数释放pool使用的kmem_cache。
该bug主要是lvm在线扩容调用lvresize命令触发,按照当时总结的复现文档,整个复现过程如下:
采用最新的内核,并且docker环境是配置好的,lvm在线扩容复现bug步骤如下:
(1)在u盘上创建两个1g的大文件
dd if=/dev/zero of=/media/sda1/lvm0.img bs=1M count=1024
dd if=/dev/zero of=/media/sda1/lvm1.img bs=1M count=1024
(2) 设置lvm环境变量
cd /root/lvm-plugin&&source lvm.sh
(3)清理上次创建的逻辑卷、卷组、loop设备
losetup -d /dev/loop0
losetup -d /dev/loop1
killall lvmetad
rm /run/lvm/archive/myvg*
rm /run/lvm/backup/myvg*
/root/lvm-plugin/bin/lvmetad -f &
dmsetup remove -f /dev/myvg/mylv
vgremove -y myvg
vgreduce --removemissing myvg
pvremove /dev/loop0 /dev/loop1
(4)创建loop设备、物理卷、卷组、逻辑卷,并格式化lvm为ext4,并挂载
losetup /dev/loop0 /media/sda1/lvm0.img
losetup /dev/loop1 /media/sda1/lvm1.img
pvcreate -ff -y /dev/loop0
vgcreate -y myvg /dev/loop0
lvcreate -y -L 1000M -n mylv myvg
mke2fs -t ext4 -F /dev/myvg/mylv
#mkdir /tmp/dcdn_base
mount -t ext4 -o rw,noatime,nodiratime,barrier=0,nosuid,nodev,data=ordered /dev/mapper/myvg-mylv /tmp/dcdn_base;
(5)在lvm目录运行docker任务
ulimit -v unlimited; /app/system/miner.plugin-dockerd.ipk/bin/docker run --rwlayer-dir=/tmp/ram3 --rwlayer-size=30M --log-opt max-size=100M --log-opt log-path=/tmp/dcdn_base/ --log-opt max-file=3 --log-driver json-file --ulimit core=999614896 --cpu-quota 200000 -v /tmp/dcdn_base/:/storage --name 1_ac912705_65c381a393cbc79e92c58b5bda4d1a5c --network=host --memory 200M -d 1/galaxyhapp:v5.5
(6)lvm扩容
pvcreate /dev/loop1 -ff
vgextend myvg /dev/loop1
lvresize -L 2000M /dev/myvg/mylv
#e2fsck -y -f /dev/myvg/mylv
resize2fs /dev/myvg/mylv
未修复的内核,会在lvresize这一步卡死,内核爆出crash。
<一>lvm io 处理流程
在分析log之前,有必要先看看lvm设备的io 处理流程。
(1)入口函数为lvm设备的make_request_fn函数,具体为dm_request函数,代码如下:
/*
 * make_request_fn entry point of a device-mapper device.
 *
 * Request-based devices are passed to blk_queue_bio(); bio-based devices
 * are handled by _dm_request().
 */
static void dm_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	if (dm_request_based(md))
		blk_queue_bio(q, bio);
	else
		_dm_request(q, bio);
}
如果是基于块设备驱动层请求的设备,走blk_queue_bio分支,如果基于通用块层请求的映射设备,调用_dm_request函数,lvm是基于通用块层的映射设备,走_dm_request函数。
/*
 * Bio-based request path: account the I/O on the disk statistics, then
 * either process the bio immediately or, while the device is blocked for
 * suspend, defer it (read-ahead bios are failed instead of being queued).
 */
static void _dm_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;
	int dir = bio_data_dir(bio);
	int stat_cpu;

	down_read(&md->io_lock);

	/* Per-disk accounting: one more I/O and its sector count. */
	stat_cpu = part_stat_lock();
	part_stat_inc(stat_cpu, &dm_disk(md)->part0, ios[dir]);
	part_stat_add(stat_cpu, &dm_disk(md)->part0, sectors[dir], bio_sectors(bio));
	part_stat_unlock();

	/* if we're suspended, we have to queue this io for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
		up_read(&md->io_lock);

		if (bio_rw(bio) == READA)
			bio_io_error(bio);
		else
			queue_io(md, bio);
		return;
	}

	/* Normal path: split/map/submit while still holding io_lock for read. */
	__split_and_process_bio(md, bio);
	up_read(&md->io_lock);
}
lvresize会先suspend设备,然后再resume设备。当suspend设备时,会设置DMF_BLOCK_IO_FOR_SUSPEND标志。该标志设置的话,会调用queue_io函数将bio添加到md->deferred延迟链表里。md->deferred里的bio的提交工作交给md->work工作队列处理。具体的工作队列处理函数dm_wq_work最终还是会调用__split_and_process_bio函数进一步处理。
如果没有设置DMF_BLOCK_IO_FOR_SUSPEND标志,直接调用__split_and_process_bio函数处理。__split_and_process_bio源码如下:
/*
 * Process one original bio against the live table: allocate the dm_io that
 * tracks it, then clone (and, if necessary, split) it and hand the clones
 * to the targets.  A missing live table fails the bio with an I/O error.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
struct clone_info ci;
int error = 0;
/* Paired with the dm_table_put() at the end of this function. */
ci.map = dm_get_live_table(md);
if (unlikely(!ci.map)) {
bio_io_error(bio);
return;
}
ci.md = md;
/* NOTE(review): result is used unchecked; presumably alloc_io() is mempool-backed and cannot fail (confirm). */
ci.io = alloc_io(md);
ci.io->error = 0;
/* The initial count of 1 is the "extra reference" dropped by dec_pending() below. */
atomic_set(&ci.io->io_count, 1);
ci.io->bio = bio;
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
/* Cursor into the original bio: current sector and current bvec index. */
ci.sector = bio->bi_sector;
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
if (bio->bi_rw & REQ_FLUSH) {
/* Flush: send the device's preallocated flush_bio, with no data sectors. */
ci.bio = &ci.md->flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
} else {
ci.bio = bio;
ci.sector_count = bio_sectors(bio);
/* Each call consumes part of the bio; stop when it is all issued or on error. */
while (ci.sector_count && !error)
error = __split_and_process_non_flush(&ci);
}
/* drop the extra reference count */
dec_pending(ci.io, error);
dm_table_put(ci.map);
}
13行,从md->io_pool里面申请dm_io结构体,15行设置dm_io的count计数为1。32-33行循环调用__split_and_process_non_flush函数split bio,如果不要split就调用一次。
37行调用dec_pending函数减少dm_io的count计数,保持计数平衡。
看下__split_and_process_non_flush函数:
/*
 * Issue the next piece of a non-flush bio described by @ci.
 *
 * Looks up the target covering ci->sector, computes how many sectors may
 * be sent to it in one go, and then clones/maps either the whole remainder,
 * a run of whole bvecs, or a piece of a single bvec.
 *
 * Returns 0 on success, -EIO if no valid target covers the sector, or the
 * return value of the discard / write-same / split helpers.
 */
static int __split_and_process_non_flush(struct clone_info *ci)
{
struct bio *bio = ci->bio;
struct dm_target *ti;
sector_t len, max;
int idx;
/* Discard and write-same requests have dedicated helpers. */
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __send_discard(ci);
else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
return __send_write_same(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
/* Upper bound on the sectors that can be issued to this target from ci->sector. */
max = max_io_len(ci->sector, ti);
/*
 * Optimise for the simple case where we can do all of
 * the remaining io with a single clone.
 */
if (ci->sector_count <= max) {
__clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
ci->idx, bio->bi_vcnt - ci->idx, 0,
ci->sector_count, 0);
ci->sector_count = 0;
return 0;
}
/*
 * There are some bvecs that don't span targets.
 * Do as many of these as possible.
 */
if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
len = __len_within_target(ci, max, &idx);
__clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
ci->idx, idx - ci->idx, 0, len, 0);
/* Advance the cursor past the bvecs just issued. */
ci->sector += len;
ci->sector_count -= len;
ci->idx = idx;
return 0;
}
/*
 * Handle a bvec that must be split between two or more targets.
 */
return __split_bvec_across_targets(ci, ti, max);
}
该函数是比较核心的函数,负责处理bio 的split ,map 和提交。
(1)首先调用dm_table_find_target函数,在md设备映射表里根据bio扇区号查找对应的映射条目dm_target,映射条目dm_target是以起始扇区号排序插入dm设备的映射表里的。
(2) 计算bio 在对应的dm_target上可以下发的io请求上限max。bio在dm_target对应的下层设备上下发的最大io上限取决于两个因素:不能跨越dm_target边界,不能超过下层设备最大IO限制(ti->max_io_len),二者取其小。
(3) 处理实际的bio split工作。可分三种情况:1)整个bio大小小于io请求上限,不需要split。2)当前bvec小于io请求上限,但bio的整体大小大于io请求上限,那么就以max单位split bio。3)当前bvec的大小就大于io请求上限,并且有可能跨越多个dm_target,这种情况需要调用__split_bvec_across_targets函数处理。
(4)映射提交bio。无论bio是否split,最后均会调用__clone_and_map_data_bio函数来处理。
/*
 * Clone (part of) the original bio for target @ti and map each clone.
 *
 * For writes, a target providing num_write_bios() decides how many copies
 * of the bio it receives; otherwise exactly one clone is issued.  With
 * @split_bvec set the clone covers a piece of a single bvec, otherwise it
 * covers @bv_count whole bvecs starting at @idx.
 */
static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
				     sector_t sector, int nr_iovecs,
				     unsigned short idx, unsigned short bv_count,
				     unsigned offset, unsigned len,
				     unsigned split_bvec)
{
	struct bio *orig = ci->bio;
	unsigned nr_copies = 1;
	unsigned copy;

	/*
	 * Does the target want to receive duplicate copies of the bio?
	 */
	if (bio_data_dir(orig) == WRITE && ti->num_write_bios)
		nr_copies = ti->num_write_bios(ti, orig);

	for (copy = 0; copy < nr_copies; copy++) {
		struct dm_target_io *tio = alloc_tio(ci, ti, nr_iovecs, copy);

		if (!split_bvec)
			clone_bio(tio, orig, sector, idx, bv_count, len);
		else
			clone_split_bio(tio, orig, sector, idx, offset, len);

		__map_bio(tio);
	}
}
__clone_and_map_data_bio函数会为每个bio 申请dm_target_io,
/*
 * Per-clone bookkeeping.  The clone bio is embedded as the last member, so
 * a dm_target_io and its bio are allocated together.
 */
struct dm_target_io {
/* dm_io tracking the original bio this clone was made from. */
struct dm_io *io;
/* Target the clone is mapped through. */
struct dm_target *ti;
/* NOTE(review): presumably per-target private data; not used in the code shown here. */
union map_info info;
/* Index of this clone when a target receives several copies of one bio. */
unsigned target_bio_nr;
/* The bio actually submitted to the lower device (not the original bio). */
struct bio clone;
};
该函数使用dm_target_io里指向的dm_io结构体的计数来跟踪多个split bio的完成情况,并将原始bio clone到dm_target_io里的clone成员里,最后提交给下层设备的是clone的bio,而不是原始bio。最后调用函数__map_bio映射clone的bio。看下这个函数:
/*
 * Remap one clone through its target's map() method and, if the target
 * asks for it, dispatch the clone to the lower device.
 */
static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
struct mapped_device *md;
struct bio *clone = &tio->clone;
struct dm_target *ti = tio->ti;
/* Completion of the clone is reported to clone_endio() with the tio as context. */
clone->bi_end_io = clone_endio;
clone->bi_private = tio;
/*
 * Map the clone. If r == 0 we don't need to do
 * anything, the target has assumed ownership of
 * this io.
 */
/* One count per clone, taken before map() and dropped via dec_pending(). */
atomic_inc(&tio->io->io_count);
/* Remember the pre-remap sector for the trace event below. */
sector = clone->bi_sector;
r = ti->type->map(ti, clone);
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev, sector);
generic_make_request(clone);
} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
/* error the io and bail out, or requeue it if needed */
/* md is read before dec_pending(), which may drop the last count on tio->io. */
md = tio->io->md;
dec_pending(tio->io, r);
free_tio(md, tio);
} else if (r) {
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
}
__map_bio函数主要完成clone bio的重定向工作,主要通过dm_target->type->map函数来完成。由于lvm采用的是linear映射方式,map函数为linear_map函数。linear_map函数实现比较简单,bio的扇区等于dm_target在下层设备的起始扇区加上bio在dm_target映射条目里的offset。
函数最后调用generic_make_request函数向下层设备提交clone后的bio。
最后说下lvm的bio完成回调机制。这个和bcache设备的回调机制很类似。
假如bio分裂成bio1和bio2,每次提交会为原始bio 申请一个dm_io结构体:
/*
 * One dm_io is allocated per original bio; it collects the completion of
 * all clones made from that bio.
 */
struct dm_io {
/* Device the original bio was submitted to. */
struct mapped_device *md;
/* Error status of the I/O; initialised to 0 when the dm_io is set up. */
int error;
/* Outstanding references: one held by the submitter plus one per mapped clone. */
atomic_t io_count;
/* The original bio. */
struct bio *bio;
/* NOTE(review): presumably recorded by start_io_acct() for accounting; confirm. */
unsigned long start_time;
/* NOTE(review): presumably serialises error updates in the end-io path; confirm. */
spinlock_t endio_lock;
};
同时为clone bio 也就是bio1和bio2申请struct dm_target_io结构体,dm_target_io->io = dm_io。同时bio1、bio2的bi_end_io为clone_endio函数,bi_private为dm_target_io结构体。来看clone_endio:
/*
 * bi_end_io callback of every clone bio.
 *
 * Gives the target's end_io hook (if any) a chance to inspect or take over
 * the completion, then frees the clone's dm_target_io and drops the clone's
 * count on the dm_io that tracks the original bio.
 */
static void clone_endio(struct bio *bio, int error)
{
int r = 0;
struct dm_target_io *tio = bio->bi_private;
/* io and md are cached here because tio is freed before dec_pending() runs. */
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
/* A bio that is not up to date but carries no error code is reported as -EIO. */
if (!bio_flagged(bio, BIO_UPTODATE) && !error)
error = -EIO;
if (endio) {
r = endio(tio->ti, bio, error);
if (r < 0 || r == DM_ENDIO_REQUEUE)
/*
 * error and requeue request are handled
 * in dec_pending().
 */
error = r;
else if (r == DM_ENDIO_INCOMPLETE)
/* The target will handle the io */
return;
else if (r) {
DMWARN("unimplemented target endio return value: %d", r);
BUG();
}
}
free_tio(md, tio);
dec_pending(io, error);
}
当bio1或者bio2的io完成时,调用free_tio释放其对应的bio内存,同时也释放其对应的dm_target_io。调用dec_pending减少dm_io的io_count计数,当计数为0时,调用原始bio的回调函数,并释放dm_io结构体。
<二> bug log分析
在lvresize系统调用里调用kmem_cache_destroy销毁专有高速缓存爆出的crash。爆出crash的调用路径为:
kmem_cache_destroy
kmem_cache_destroy_memcg_children
kmem_cache_destroy
kmem_cache_close
free_partial
list_slab_objects
slab_err
|->slub_debug
|->print_page_info
根据slab_err和print_page_info函数的打印情况来看,可作出如下分析:
kmalloc-192(0:docker_cloud)为kmem_cache的name,其中kmalloc-192为root kmem_cache的name;括号里的(0:docker_cloud)中,0为kmem_cache所在mem_cgroup的kmemcg_id,docker_cloud为所属mem_cgroup所在cgroup的name。
log第3行打印如下:
[47572.823675] INFO: Slab 0xc126155c objects=21 used=16 fp=0xc8c5f600 flags=0x0080
该打印来自print_page_info函数。根据调用栈可知,这里打印的是调用free_partial 释放kmem_cache_node里的partial链表里的slab出错,然后通过print_page_info函数打印出的出错slab的信息。
Slab 0xc126155c为出错slab的地址,objects=21,说明一个完整的slab包含21个object,used=16,说明page->inuse为16,还在使用的object为16,fp=0xc8c5f600为slab的freelist链表首地址。
page->inuse为16这个有问题。page->inuse的含义是指当前slab正在被使用的object的数目,不过它还包括kmem_cache_cpu->freelist中的object的数目(如果kmem_cache_cpu->page为此slab的话)。真正分配出去的object数目是page->inuse 减去kmem_cache_cpu->freelist中的object数目。
现在 kmalloc-192(0:docker_cloud)的中的slab的inuse为16,那么有没有可能这16个object位于kmem_cache_cpu的freelist上呢。
kmem_cache_close会调用flush_all,flush_all调用flush_cpu_slab函数,flush_cpu_slab函数调用flush_slab,flush_slab函数最终调用deactivate_slab函数清空kmem_cache_cpu的freelist和page,并根据该slab的使用情况决定是直接释放该slab,还是将其加入kmem_cache_node的partial链表。
由此可见,page->inuse为16,说明调用kmem_cache_destroy函数的时候,其中的slab还有16个object被分配出去没有释放,这是memory leak了么。
在调用栈中bioset_free调用kmem_cache_destroy销毁的是md->bs->bio_slab,而md->bs来源于当前使用的dm_table的dm_table->mempools->bs。来看看dm_table->mempools->bs是在哪创建的。
在加载dm_table的时候调用dm_alloc_md_mempools函数为dm_table分配内存池,dm_alloc_md_mempools函数代码如下:
/*
 * Allocate the memory pools (io_pool and bioset) used by a mapped device
 * of the given @type.
 *
 * @type:              DM_TYPE_BIO_BASED or DM_TYPE_REQUEST_BASED; any other
 *                     value fails.
 * @integrity:         if non-zero, also create the bioset's integrity pool.
 * @per_bio_data_size: extra per-bio data for bio-based devices; expected to
 *                     be 0 for request-based ones.
 *
 * Returns the new pools, or NULL on allocation failure or unknown type.
 */
struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
{
struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
struct kmem_cache *cachep;
unsigned int pool_size;
unsigned int front_pad;
if (!pools)
return NULL;
if (type == DM_TYPE_BIO_BASED) {
cachep = _io_cache;
pool_size = 16;
/*
 * Space reserved in front of each bio: the per-bio data (rounded up to
 * dm_target_io alignment) plus the part of dm_target_io that precedes
 * its embedded clone bio.
 */
front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
} else if (type == DM_TYPE_REQUEST_BASED) {
cachep = _rq_tio_cache;
pool_size = MIN_IOS;
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_bio_data_size is not used. See __bind_mempools(). */
WARN_ON(per_bio_data_size != 0);
} else
goto out;
pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
if (!pools->io_pool)
goto out;
/* The bioset's backing slab is sized from front_pad. */
pools->bs = bioset_create(pool_size, front_pad);
if (!pools->bs)
goto out;
if (integrity && bioset_integrity_create(pools->bs, pool_size))
goto out;
return pools;
out:
/* pools was zero-allocated, so members not yet created are still NULL here. */
dm_free_md_mempools(pools);
return NULL;
}
值得注意,14行,front_pad的大小为per_bio_data_size(按dm_target_io的对齐要求向上取整)加上dm_target_io结构体中clone成员之前那部分的大小,即offsetof(struct dm_target_io, clone)。
/*
 * Per-clone bookkeeping.  Everything before "clone" is what
 * offsetof(struct dm_target_io, clone) counts into the bioset's front_pad;
 * the bio itself sits at the end of the structure.
 */
struct dm_target_io {
/* dm_io tracking the original bio this clone was made from. */
struct dm_io *io;
/* Target the clone is mapped through. */
struct dm_target *ti;
/* NOTE(review): presumably per-target private data; not used in the code shown here. */
union map_info info;
/* Index of this clone when a target receives several copies of one bio. */
unsigned target_bio_nr;
/* The bio actually submitted to the lower device. */
struct bio clone;
};
dm_target_io结构体的末尾内嵌了bio结构体(clone成员)。前面说过,在处理原始bio的时候,不会下发原始bio,会克隆原始bio,并为它分配dm_target_io结构体。 在24行,调用bioset_create函数,bioset_create函数最终调用kmem_cache_create函数创建专用高速缓存,其object大小为sizeof(struct bio)加上extra_size(其中包含front_pad),足以容纳一个完整的dm_target_io。这样可以一次一体分配bio和dm_target_io结构体。
bioset_create函数调用bio_find_or_create_slab函数来创建kmem_cache。来看这个函数:
static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
{
unsigned int sz = sizeof(struct bio) + extra_size;
struct kmem_cache *slab = NULL;
struct bio_slab *bslab, *new_bio_slabs;
unsigned int new_bio_slab_max;
unsigned int i, entry = -1;
mutex_lock(&bio_slab_lock);
i = 0;
while (i < bio_slab_nr) {
bslab = &bio_slabs[i];
if (!bslab->slab && entry == -1)
entry = i;
else if (bslab->slab_size == sz) {
slab = bslab->slab;
bslab->slab_ref++;
break;
}
i++;
}
if (slab)
goto out_unlock;
if (bio_slab_nr == bio_slab_max && entry == -1) {
new_bio_slab_max = bio_slab_max << 1;
new_bio_slabs = krealloc(bio_slabs,
new_bio_slab_max * sizeof(struct bio_slab),
GFP_KERNEL);
if (!new_bio_slabs)
goto out_unlock;
bio_slab_max = new_bio_slab_max;
bio_slabs = new_bio_slabs;
}
if (entry == -1)
entry = bio_slab_nr++;
bslab = &bio_slabs[entry];
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
if (!slab)
goto out_unlock;
printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
dump_stack();
printk("kmem_cache->name %s!\n",slab->name);
bslab->slab = slab;
bslab->slab_ref = 1;
bslab->slab_size = sz;
out_unlock:
mutex_unlock(&bio_slab_lock);
return slab;
}
该函数在117行创建bio 的kmem_cache。但kmem_cache_create的第一个参数name是"bio-%d",不是应该是"kmalloc-192"么。
为此,我添加了49-50行打印,结果打印如下:
[ 109.616022] bio: create slab <bio-2> at 2
[ 109.616034] CPU: 1 PID: 2643 Comm: lvcreate Tainted: P O 3.10.33 #60
[ 109.623402] [] (unwind_backtrace+0x0/0xec) from [] (show_stack+0x10/0x14)
[ 109.632083] [] (show_stack+0x10/0x14) from [] (bioset_create+0x1a8/0x2a8)
[ 109.640704] [] (bioset_create+0x1a8/0x2a8) from [] (dm_alloc_md_mempools+0xdc/0xf4)
[ 109.650232] [] (dm_alloc_md_mempools+0xdc/0xf4) from [] (dm_table_alloc_md_mempools+0x78/0x8c)
[ 109.660579] [] (dm_table_alloc_md_mempools+0x78/0x8c) from [] (dm_table_complete+0x2d8/0x318)
[ 109.670990] [] (dm_table_complete+0x2d8/0x318) from [] (table_load+0x108/0x2c8)
[ 109.680460] [] (table_load+0x108/0x2c8) from [] (ctl_ioctl+0x4b0/0x4d0)
[ 109.688550] [] (ctl_ioctl+0x4b0/0x4d0) from [] (do_vfs_ioctl+0x55c/0x5b0)
[ 109.697131] [] (do_vfs_ioctl+0x55c/0x5b0) from [] (SyS_ioctl+0x50/0x7c)
[ 109.705626] [] (SyS_ioctl+0x50/0x7c) from [] (ret_fast_syscall+0x0/0x30)
[ 109.714164] kmem_cache->name kmalloc-192!
输出结果第一行和最后一行。本来要创建name为"bio-2"的kmem_cache,结果创建的kmem_cache的name为“kmalloc-192”。很明显kmalloc-192是kmalloc的通用高速缓存。
<三>kmem_cache的alias特性
之所以会出现上面的情况是因为kmem_cache的alias特性,或者叫kmem_cache重名、kmem_cache别名。指在kmem_cache_create创建kmem_cache的时候,会尝试复用slub中已经存在的kmem_cache,复用的基本条件是创建size与已存在的kmem_cache的size比较接近,且小于等于后者。
kmem_cache的alias特性的开启是本文bug的根本原因。
当调用kmem_cache_create函数的时候,会调用__kmem_cache_alias函数来尝试复用已经存在的kmem_cache,只有不能复用,才会真正创建。
__kmem_cache_alias函数调用find_mergeable函数来完成查找匹配工作。源码如下:
/*
 * Look for an existing cache that a new cache request can be merged into
 * (aliased with) instead of creating a new one.
 *
 * Returns the first cache on slab_caches that passes all checks below, or
 * NULL if merging is disabled, a constructor is given, or nothing matches.
 */
static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
size_t align, unsigned long flags, const char *name,
void (*ctor)(void *))
{
struct kmem_cache *s;
/* Merging disabled globally, or the requested flags forbid it. */
if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
return NULL;
/* Caches with a constructor are never merged. */
if (ctor)
return NULL;
/* Normalise the requested size: pointer-aligned, then aligned to the computed alignment. */
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
flags = kmem_cache_flags(size, flags, name, NULL);
list_for_each_entry(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
/* Candidate objects must be at least as large as the request. */
if (size > s->size)
continue;
/* The flag bits covered by SLUB_MERGE_SAME must be identical. */
if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
continue;
/*
 * Check if alignment is compatible.
 * Courtesy of Adrian Drzewiecki
 */
if ((s->size & ~(align - 1)) != s->size)
continue;
/* Reject candidates that would waste sizeof(void *) bytes or more per object. */
if (s->size - size >= sizeof(void *))
continue;
if (!cache_match_memcg(s, memcg))
continue;
return s;
}
return NULL;
}
主要的匹配条件是创建size要不大于已存在的kmem_cache的size,而且两者size要足够接近(size按照cache_line_size 64字节对齐之后,和候选的kmem_cache的size的差值必须小于sizeof(void*),此例子中我们申请的size 154,按照cache_line_size 64字节对齐之后是192字节,正好等于kmalloc-192的object大小)。除此之外,还要看两者cgroup的匹配程度,具体来说,当调用kmem_cache_create函数的时候,此时只能匹配root kmem_cache,在其他路径上面(比如kmem_cache_alloc路径上),要求用户所在的mem_cgroup和候选的kmem_cache同属于一个mem_cgroup才算匹配成功。
总之,在为lvm的md设备clone的bio内存创建kmem_cache专用高速缓存的时候,会复用kmalloc-192的kmem_cache,md设备申请bio内存就会从kmalloc-192里去申请,同时kmalloc函数申请129-192范围内的内存也会从kmalloc-192里去申请。
属于docker_cloud这个cgroup的进程在lvm设备上读写的时候,会调用kmem_cache_alloc(kmalloc-192)来为bio申请内存。在__memcg_kmem_get_cache函数中,如果发现还没有为docker创建kmalloc-192在cgroup中的实例,还需要创建。创建之后的名字为kmalloc-192(0:docker_cloud),创建之后放入kmalloc-192的memcg_params->memcg_caches[idx]数组中,其中,idx为memcg的id。kmalloc-192是root kmem_cache,kmalloc-192(0:docker_cloud)是kmalloc-192的child kmem_cache。最多的情况下,每个mem_cgroup都可能在kmalloc-192 下创建cgroup实例。
__kmem_cache_alias函数在复用原有的kmem_cache成功之后,会将被复用的kmem_cache的refcount加1(s->refcount++),refcount反映kmem_cache的复用次数。
调用kmem_cache_alloc的时候,只会从自己的cgroup kmem_cache实例中去分配。而非cgroup用户从root kmem_cache里去分配。
<四> kmem_cache_destroy流程
以本例子中的kmem_cache_destroy流程为例。
dm设备调用kmem_cache_destroy释放bio的kmem_cache。
kmem_cache_destroy释放的是kmalloc-192。kmem_cache_destroy源码如下:
/*
 * Drop one reference on cache @s and tear it down when that was the last.
 *
 * NOTE(review): the memcg child caches are destroyed unconditionally, before
 * s->refcount is examined.  When @s is a merged (aliased) root cache that
 * other users still hold, its children are destroyed even though the root
 * itself survives; this ordering is what the surrounding analysis points to.
 */
void kmem_cache_destroy(struct kmem_cache *s)
{
/* Destroy all the children caches if we aren't a memcg cache */
kmem_cache_destroy_memcg_children(s);
get_online_cpus();
mutex_lock(&slab_mutex);
s->refcount--;
if (!s->refcount) {
/* Last user: unlink the cache and try to shut it down. */
list_del(&s->list);
if (!__kmem_cache_shutdown(s)) {
mutex_unlock(&slab_mutex);
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
memcg_release_cache(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
} else {
/* Shutdown failed because objects are still allocated: put the cache back on the list. */
list_add(&s->list, &slab_caches);
mutex_unlock(&slab_mutex);
printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
s->name);
dump_stack();
}
} else {
/* Other users still reference the cache; only the count was dropped. */
mutex_unlock(&slab_mutex);
}
put_online_cpus();
}
首先调用kmem_cache_destroy_memcg_children释放kmalloc_192的所有cgroup实例,对属于每个cgroup的kmem_cache实例调用kmem_cache_destroy函数进行释放(递归调用)。
对每个cgroup 实例调用kmem_cache_destroy的时候,kmem_cache_destroy_memcg_children函数会马上返回,因为cgroup的实例本身没有child,是非root kmem_cache,只有root kmem_cache有child。
判断cgroup的实例refcount(一般为1),只有root kmem_cache的refcount为2,因为复用了。
refcount减1(变为0),进入__kmem_cache_shutdown流程,这是真正的释放函数。
也就是说 kmalloc-192(0:docker_cloud)执行__kmem_cache_shutdown流程。调用flush_all函数将per cpu的kmem_cache_cpu上的freelist和partial上的slab刷回kmem_cache_node的partial,如果slab完全空闲的话,直接释放到伙伴系统。最后调用free_partial释放每个kmem_cache_node的partial上的slab内存到伙伴系统。
在调用free_partial函数的时候爆出文章开始的crash。
前面分析,crash log的原因是slab的object没有释放。有没有可能是此时的bio还没有完成io、导致bio内存还没释放,就销毁了kmem_cache。
这个不会。
在do_suspend函数中首先会设置md设备的DMF_BLOCK_IO_FOR_SUSPEND标志,该标志会阻止_dm_request直接向下层设备提交bio,而是加入md->deferred延迟链表里面。md->deferred延迟队列的bio是由工作队列dm_wq_work函数处理的。DMF_BLOCK_IO_FOR_SUSPEND置位的话,同样会阻止dm_wq_work函数向下层提交bio,导致dm_wq_work函数直接返回。
也就是DMF_BLOCK_IO_FOR_SUSPEND可以持续接受上层设备发下来的bio,但阻止向下层提交,全部累积到md->deferred延迟链表。注意累积的是上层下发的原始bio不会导致md的bio分配。
同时在do_suspend中,会调用dm_wait_for_completion(md, TASK_INTERRUPTIBLE)函数等待in flight的bio全部完成,也就是在设置该标志之前就已经处于in flight的bio,等待他们全部完成。
所以在调用kmem_cache_destroy销毁kmalloc-192的时候,所有md的bio内存就已经全部回收。
问题在于,此时的kmalloc-192是通用kmalloc和md的bio高速缓存共有的,kmem_cache->refcount为2。假如此前cgroup docker_cloud中的进程也调用了kmalloc函数申请129-192字节的内存,它必然会从 kmalloc-192(0:docker_cloud)的kmem_cache里申请,而此时, kmalloc-192(0:docker_cloud)被md设备调用kmem_cache_destroy销毁,所以就导致上面的crash。
我们可以验证下,是否有容器cgroup docker_cloud中的进程从kmalloc-192里申请了内存。在__kmalloc函数中添加打印如下:
/*
 * kmalloc slow path with temporary debug instrumentation.
 *
 * Besides the normal allocation, every request that is served from the
 * "kmalloc-192" cache on behalf of a task in the "docker_cloud" cgroup is
 * logged together with a stack dump, the requested size and the slab page
 * the object came from.
 *
 * Returns the allocated object, NULL on failure, or the ZERO_OR_NULL_PTR
 * value handed back by kmalloc_slab().
 */
void *__kmalloc(size_t size, gfp_t flags)
{
	struct kmem_cache *s;
	void *ret;
	struct mem_cgroup *memcg = NULL;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp = NULL;
	struct page *page = NULL;
	int need_printk = 0;

	/*
	 * Debug only: find the cgroup of the current task.
	 * NOTE(review): the cast assumes the css is the first member of
	 * struct mem_cgroup, and the lookup as well as cgrp->name are
	 * presumably RCU-protected; acceptable for a debug patch, confirm
	 * before keeping.
	 */
	memcg = mem_cgroup_from_task(current);
	css = (struct cgroup_subsys_state *)memcg;
	if (css && css->cgroup)
		cgrp = css->cgroup;

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return kmalloc_large(size, flags);

	s = kmalloc_slab(size, flags);
	if (unlikely(ZERO_OR_NULL_PTR(s)))
		return s;

	if (s->name && !strcmp(s->name, "kmalloc-192") &&
	    cgrp && !strcmp(cgrp->name->name, "docker_cloud")) {
		printk("kmalloc 192!\n");
		need_printk = 1;
	}

	ret = slab_alloc(s, flags, _RET_IP_);

	if (need_printk) {
		/* The allocation may have failed; only translate a valid address. */
		if (ret)
			page = virt_to_head_page(ret);
		printk("------------------------------------!\n");
		dump_stack();
		printk("------------------------------------!\n");
		/* size is a size_t, so it needs %zu rather than %d. */
		printk("size %zu page %p!\n", size, page);
	}

	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
	return ret;
}
EXPORT_SYMBOL(__kmalloc);
其中5-9行,11-14行,22-25行,28-35为添加打印。mem_cgroup_from_task用于从task获取进程所属的mem_cgroup。根据进程所在的cgroup名字和kmem_cache的name来过滤打印。
添加打印后,dmesg信息如下:
[ 7705.030324] ------------------------------------!
[ 7705.035167] CPU: 2 PID: 27317 Comm: runc:[2:INIT] Tainted: P O 3.10.33 #63
[ 7705.043019] [] (unwind_backtrace+0x0/0xec) from [] (show_stack+0x10/0x14)
[ 7705.051662] [] (show_stack+0x10/0x14) from [] (__kmalloc+0x2a4/0x38c)
[ 7705.059944] [] (__kmalloc+0x2a4/0x38c) from [] (blkg_alloc+0x134/0x148)
[ 7705.068405] [] (blkg_alloc+0x134/0x148) from [] (blkg_create+0x4c/0x250)
[ 7705.076923] [] (blkg_create+0x4c/0x250) from [] (blkg_lookup_create+0xb8/0xf0)
[ 7705.085998] [] (blkg_lookup_create+0xb8/0xf0) from [] (blk_throtl_bio+0x668/0x690)
[ 7705.095396] [] (blk_throtl_bio+0x668/0x690) from [] (generic_make_request_checks+0x320/0x448)
[ 7705.105737] [] (generic_make_request_checks+0x320/0x448) from [] (generic_make_request+0x10/0xd0)
[ 7705.116650] [] (generic_make_request+0x10/0xd0) from [] (submit_bio+0x18c/0x1e0)
[ 7705.125669] [] (submit_bio+0x18c/0x1e0) from [] (mpage_readpages+0x150/0x174)
[ 7705.134654] [] (mpage_readpages+0x150/0x174) from [] (__do_page_cache_readahead+0x214/0x348)
[ 7705.144884] [] (__do_page_cache_readahead+0x214/0x348) from [] (page_cache_sync_readahead+0x58/0x60)
[ 7705.155848] [] (page_cache_sync_readahead+0x58/0x60) from [] (generic_file_aio_read+0x2cc/0x750)
[ 7705.166456] [] (generic_file_aio_read+0x2cc/0x750) from [] (do_sync_read+0x78/0x9c)
[ 7705.175950] [] (do_sync_read+0x78/0x9c) from [] (vfs_read+0xa8/0x1ac)
[ 7705.184220] [] (vfs_read+0xa8/0x1ac) from [] (vfsub_read_u+0xc/0x28)
[ 7705.192413] [] (vfsub_read_u+0xc/0x28) from [] (aufs_read+0x90/0x100)
[ 7705.200677] [] (aufs_read+0x90/0x100) from [] (vfs_read+0xa8/0x1ac)
[ 7705.208783] [] (vfs_read+0xa8/0x1ac) from [] (kernel_read+0x38/0x44)
[ 7705.216975] [] (kernel_read+0x38/0x44) from [] (prepare_binprm+0x108/0x118)
[ 7705.225774] [] (prepare_binprm+0x108/0x118) from [] (do_execve+0x360/0x520)
[ 7705.234571] [] (do_execve+0x360/0x520) from [] (SyS_execve+0x30/0x44)
[ 7705.242853] [] (SyS_execve+0x30/0x44) from [] (ret_fast_syscall+0x0/0x30)
[ 7705.251470] ------------------------------------!
[ 7705.256292] size 144 page c131554c!
[ 7705.535555] kmalloc 192!
log打印的路径是同步读触发预读时的io限流流程。blk_throtl_bio是blkio流控的核心函数,主要根据用户的blkio参数设置,来限制io的bps和iops。如果在blkcg的radix tree里根据queue->id没有找到blkgq,会调用blkg_create,再由blkg_alloc通过kmalloc申请blkgq结构体(见上面的调用栈),这一般发生在属于blkcg的用户第一次向块设备发起io的时候。每个块设备在同一blk cgroup里都有一份实例,就是blkgq,并且以基数树(radix tree)的形式组织到blkcg->blkg_tree里。
在blk_throtl_bio里,根据bio的css获取所属的blkcg(blk cgroup),然后根据被提交设备的queue->id,在blkcg的基数树里查找本块设备对应的blkgq结构体,如果是第一次io,就创建blkgq,最后根据blkgq找到throtl_grp,blkgq和throtl_grp是一一对应的。throtl_grp里保存的是io的限速策略和限速参数。
在log里kmalloc申请的是144字节,处于129-192区间,所以选择kmalloc-192的通用kmem_cache,同时当前进程属于docker_cloud的cgroup,所以最终会从kmalloc-192(0:docker_cloud)里进行分配。blkgq的内存一般在cgroup删除或者umount的时候释放。
所以在md设备调用kmem_cache_destroy销毁kmalloc-192(0:docker_cloud)的时候,尚有未释放的object在使用。
当一个kmem_cache被复用的时候,root kmem_cache的所有child kmem_cache(memcg对应的实例)是各个使用者共享的。所以一个使用者调用kmem_cache_destroy销毁复用的kmem_cache的时候,不仅root kmem_cache的销毁要等到refcount为0,root kmem_cache下辖的child kmem_cache的销毁同样应该等到root的refcount为0。即在kmem_cache_destroy中,kmem_cache_destroy_memcg_children的执行逻辑应该放到root kmem_cache 的refcount判断之后。
<五> bug修正
commit:b8529907ba35d已经修正了这个bug,是在3.15内核修正的。因为3.10.33到3.15有些跨度,函数和变量均有改变,backport有些难度。并且不能简单把kmem_cache_destroy_memcg_children函数放到refcount的判断之后,主要是因为kmem_cache_destroy的递归调用和slab_mutex的使用。
由于时间关系,当时选择关闭kmem_cache的alias特性,来解决这个问题。在灰度修正后的版本几个月里,线上再也没有出现过这个问题。