linux内核奇遇记之md源代码解读之四
转载请注明出处:http://blog.csdn.net/liumangxiong
运行阵列意味着阵列经历从无到有,建立了作为一个raid应有的属性(如同步重建),并为随后的读写做好了铺垫。那么运行阵列的时候到底做了哪些事情,让原来的磁盘像变形金刚一样组成一个新的巨无霸?现在就来看阵列运行处理流程:
5158 static int do_md_run(struct mddev *mddev)
5159 {
5160 int err;
5161
5162 err = md_run(mddev);
5163 if (err)
5164 goto out;
5165 err = bitmap_load(mddev);
5166 if (err) {
5167 bitmap_destroy(mddev);
5168 goto out;
5169 }
5170
5171 md_wakeup_thread(mddev->thread);
5172 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5173
5174 set_capacity(mddev->gendisk, mddev->array_sectors);
5175 revalidate_disk(mddev->gendisk);
5176 mddev->changed = 1;
5177 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5178 out:
5179 return err;
5180 }
如果说运行阵列的过程是一本书,那么这个函数就是这本书的目录,每一个目录中都隐含着一个深刻的故事。
5162行,md_run运行阵列,这个函数比较长,我们按一段一段来分析:
4956 int md_run(struct mddev *mddev)
4957 {
4958 int err;
4959 struct md_rdev *rdev;
4960 struct md_personality *pers;
4961
4962 if (list_empty(&mddev->disks))
4963 /* cannot run an array with no devices.. */
4964 return -EINVAL;
4965
4966 if (mddev->pers)
4967 return -EBUSY;
4968 /* Cannot run until previous stop completes properly */
4969 if (mddev->sysfs_active)
4970 return -EBUSY;
4971
4972 /*
4973 * Analyze all RAID superblock(s)
4974 */
4975 if (!mddev->raid_disks) {
4976 if (!mddev->persistent)
4977 return -EINVAL;
4978 analyze_sbs(mddev);
4979 }
4962-4969行检查,阵列还没运行,所以直接到4978行。
4978行,analyze_sbs,分析超级块,依次分析每一个磁盘的超级块,不符合阵列需求的磁盘将会被踢出阵列。
3310 static void analyze_sbs(struct mddev * mddev)
3311 {
3312 int i;
3313 struct md_rdev *rdev, *freshest, *tmp;
3314 char b[BDEVNAME_SIZE];
3315
3316 freshest = NULL;
3317 rdev_for_each_safe(rdev, tmp, mddev)
3318 switch (super_types[mddev->major_version].
3319 load_super(rdev, freshest, mddev->minor_version)) {
3320 case 1:
3321 freshest = rdev;
3322 break;
3323 case 0:
3324 break;
3325 default:
3326 printk( KERN_ERR \
3327 "md: fatal superblock inconsistency in %s"
3328 " -- removing from array\n",
3329 bdevname(rdev->bdev,b));
3330 kick_rdev_from_array(rdev);
3331 }
3332
3333
3334 super_types[mddev->major_version].
3335 validate_super(mddev, freshest);
3336
3337 i = 0;
3338 rdev_for_each_safe(rdev, tmp, mddev) {
3339 if (mddev->max_disks &&
3340 (rdev->desc_nr >= mddev->max_disks ||
3341 i > mddev->max_disks)) {
3342 printk(KERN_WARNING
3343 "md: %s: %s: only %d devices permitted\n",
3344 mdname(mddev), bdevname(rdev->bdev, b),
3345 mddev->max_disks);
3346 kick_rdev_from_array(rdev);
3347 continue;
3348 }
3349 if (rdev != freshest)
3350 if (super_types[mddev->major_version].
3351 validate_super(mddev, rdev)) {
3352 printk(KERN_WARNING "md: kicking non-fresh %s"
3353 " from array!\n",
3354 bdevname(rdev->bdev,b));
3355 kick_rdev_from_array(rdev);
3356 continue;
3357 }
3358 if (mddev->level == LEVEL_MULTIPATH) {
3359 rdev->desc_nr = i++;
3360 rdev->raid_disk = rdev->desc_nr;
3361 set_bit(In_sync, &rdev->flags);
3362 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3363 rdev->raid_disk = -1;
3364 clear_bit(In_sync, &rdev->flags);
3365 }
3366 }
3367 }
3316-3331行,依次对阵列中每一个磁盘加载超级块,如果是最新的超级块则保存对应的struct md_rdev在freshest指针中,如果是不符合条件的超级块,将会踢出阵列。
3319行,我们用1.2版本的超级块,那么对应这里load_super为super_1_load函数,这个函数就是把超级块信息从磁盘读出来,然后保存在md_rdev->sb_page中。然而这个函数还额外做了一件事情,就是比较哪个磁盘的超级块最新,看函数原型:
1433 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
第一个参数就是要加载超级块的磁盘,第二个参数是目前为止拥有最新超级块的磁盘,第一次比较时为空。当返回值为1时表示rdev为最新,当返回为0时表示refdev仍然为最新超级块,小于0表示非法超级块。
3330行,将非法超级块的磁盘踢出阵列。
3334行,对应的validate_super函数为super_1_validate,这个函数根据最新超级块信息初始化了阵列struct mddev信息,这里代码省略了不相关的if分支:
1600 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1601 {
1602 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1603 __u64 ev1 = le64_to_cpu(sb->events);
1604
1605 rdev->raid_disk = -1;
1606 clear_bit(Faulty, &rdev->flags);
1607 clear_bit(In_sync, &rdev->flags);
1608 clear_bit(WriteMostly, &rdev->flags);
1609
1610 if (mddev->raid_disks == 0) {
1611 mddev->major_version = 1;
1612 mddev->patch_version = 0;
1613 mddev->external = 0;
1614 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1615 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1616 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1617 mddev->level = le32_to_cpu(sb->level);
1618 mddev->clevel[0] = 0;
1619 mddev->layout = le32_to_cpu(sb->layout);
1620 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1621 mddev->dev_sectors = le64_to_cpu(sb->size);
1622 mddev->events = ev1;
1623 mddev->bitmap_info.offset = 0;
1624 mddev->bitmap_info.space = 0;
1625 /* Default location for bitmap is 1K after superblock
1626 * using 3K - total of 4K
1627 */
1628 mddev->bitmap_info.default_offset = 1024 >> 9;
1629 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1630 mddev->reshape_backwards = 0;
1631
1632 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1633 memcpy(mddev->uuid, sb->set_uuid, 16);
1634
1635 mddev->max_disks = (4096-256)/2;
1636
1637 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1638 mddev->bitmap_info.file == NULL) {
1639 mddev->bitmap_info.offset =
1640 (__s32)le32_to_cpu(sb->bitmap_offset);
1641 /* Metadata doesn't record how much space is available.
1642 * For 1.0, we assume we can use up to the superblock
1643 * if before, else to 4K beyond superblock.
1644 * For others, assume no change is possible.
1645 */
1646 if (mddev->minor_version > 0)
1647 mddev->bitmap_info.space = 0;
1648 else if (mddev->bitmap_info.offset > 0)
1649 mddev->bitmap_info.space =
1650 8 - mddev->bitmap_info.offset;
1651 else
1652 mddev->bitmap_info.space =
1653 -mddev->bitmap_info.offset;
1654 }
1655
1656 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1657 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1658 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1659 mddev->new_level = le32_to_cpu(sb->new_level);
1660 mddev->new_layout = le32_to_cpu(sb->new_layout);
1661 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1662 if (mddev->delta_disks < 0 ||
1663 (mddev->delta_disks == 0 &&
1664 (le32_to_cpu(sb->feature_map)
1665 & MD_FEATURE_RESHAPE_BACKWARDS)))
1666 mddev->reshape_backwards = 1;
1667 } else {
1668 mddev->reshape_position = MaxSector;
1669 mddev->delta_disks = 0;
1670 mddev->new_level = mddev->level;
1671 mddev->new_layout = mddev->layout;
1672 mddev->new_chunk_sectors = mddev->chunk_sectors;
1673 }
1674
1675 }
...
1695 if (mddev->level != LEVEL_MULTIPATH) {
1696 int role;
1697 if (rdev->desc_nr < 0 ||
1698 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1699 role = 0xffff;
1700 rdev->desc_nr = -1;
1701 } else
1702 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1703 switch(role) {
1704 case 0xffff: /* spare */
1705 break;
1706 case 0xfffe: /* faulty */
1707 set_bit(Faulty, &rdev->flags);
1708 break;
1709 default:
1710 if ((le32_to_cpu(sb->feature_map) &
1711 MD_FEATURE_RECOVERY_OFFSET))
1712 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1713 else
1714 set_bit(In_sync, &rdev->flags);
1715 rdev->raid_disk = role;
1716 break;
1717 }
1718 if (sb->devflags & WriteMostly1)
1719 set_bit(WriteMostly, &rdev->flags);
1720 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1721 set_bit(Replacement, &rdev->flags);
1722 } else /* MULTIPATH are always insync */
1723 set_bit(In_sync, &rdev->flags);
1724
1725 return 0;
1726 }
1602行,获取磁盘对应的超级块信息。
1610行,if分支成立,进入初始化struct mddev结构体,就是将阵列磁盘中最新超级块信息赋给struct mddev。
1695行,设置rdev->raid_disk和rdev->recovery_offset信息,注意这里的role有几个特殊值,0xffff表示热备盘,0xfffe表示faulty盘。recovery_offset顾名思义就是已重建偏移,In_sync表示磁盘在同步状态,WriteMostly表示该磁盘以写为主,读请求会尽量优先发往其他磁盘,这个标志只用于raid1阵列。
又回到analyze_sbs函数中,
3338行,这个循环遍历阵列所有磁盘,依次validate每一个磁盘。这里的作用就是给每一个磁盘定一个身份,到底是数据盘啊还是热备盘,当然还有些磁盘超级块信息检查不合格,要淘汰出阵列的。
3350行,再一次进入validate_super函数,不过上一次主要作用是初始化struct mddev信息,这一次主要鉴定磁盘身份信息。
1600 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1601 {
1602 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1603 __u64 ev1 = le64_to_cpu(sb->events);
1604
1605 rdev->raid_disk = -1;
1606 clear_bit(Faulty, &rdev->flags);
1607 clear_bit(In_sync, &rdev->flags);
1608 clear_bit(WriteMostly, &rdev->flags);
1609
1610 if (mddev->raid_disks == 0) {
...
1675 } else if (mddev->pers == NULL) {
1676 /* Insist of good event counter while assembling, except for
1677 * spares (which don't need an event count) */
1678 ++ev1;
1679 if (rdev->desc_nr >= 0 &&
1680 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1681 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1682 if (ev1 < mddev->events)
1683 return -EINVAL;
1684 }
1695 if (mddev->level != LEVEL_MULTIPATH) {
1696 int role;
1697 if (rdev->desc_nr < 0 ||
1698 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1699 role = 0xffff;
1700 rdev->desc_nr = -1;
1701 } else
1702 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1703 switch(role) {
1704 case 0xffff: /* spare */
1705 break;
1706 case 0xfffe: /* faulty */
1707 set_bit(Faulty, &rdev->flags);
1708 break;
1709 default:
1710 if ((le32_to_cpu(sb->feature_map) &
1711 MD_FEATURE_RECOVERY_OFFSET))
1712 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1713 else
1714 set_bit(In_sync, &rdev->flags);
1715 rdev->raid_disk = role;
1716 break;
1717 }
1718 if (sb->devflags & WriteMostly1)
1719 set_bit(WriteMostly, &rdev->flags);
1720 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1721 set_bit(Replacement, &rdev->flags);
1722 } else /* MULTIPATH are always insync */
1723 set_bit(In_sync, &rdev->flags);
1724
1725 return 0;
1726 }
1610行,经过上一次struct mddev的初始化,这时mddev->raid_disks已经不为0了。
1675行,阵列还未运行起来,if成立进入分支。
1679行,先判断rdev->desc_nr是否合法,再判断是否为数据盘。
1682行,如果为数据盘,则判断超级块的事件计数(events)是否为最新(1678行先将ev1加1,即允许比最新值落后1),事件计数落后说明不是最新的超级块,数据也不是最新的,就不能继续留在阵列中了。
1695行,设置rdev->raid_disk和rdev->recovery_offset信息。
analyze_sbs函数已经完成,返回到md_run函数中继续往下看:
4981 if (mddev->level != LEVEL_NONE)
4982 request_module("md-level-%d", mddev->level);
4983 else if (mddev->clevel[0])
4984 request_module("md-%s", mddev->clevel);
4985
4986 /*
4987 * Drop all container device buffers, from now on
4988 * the only valid external interface is through the md
4989 * device.
4990 */
4991 rdev_for_each(rdev, mddev) {
4992 if (test_bit(Faulty, &rdev->flags))
4993 continue;
4994 sync_blockdev(rdev->bdev);
4995 invalidate_bdev(rdev->bdev);
4996
4997 /* perform some consistency tests on the device.
4998 * We don't want the data to overlap the metadata,
4999 * Internal Bitmap issues have been handled elsewhere.
5000 */
5001 if (rdev->meta_bdev) {
5002 /* Nothing to check */;
5003 } else if (rdev->data_offset < rdev->sb_start) {
5004 if (mddev->dev_sectors &&
5005 rdev->data_offset + mddev->dev_sectors
5006 > rdev->sb_start) {
5007 printk("md: %s: data overlaps metadata\n",
5008 mdname(mddev));
5009 return -EINVAL;
5010 }
5011 } else {
5012 if (rdev->sb_start + rdev->sb_size/512
5013 > rdev->data_offset) {
5014 printk("md: %s: metadata overlaps data\n",
5015 mdname(mddev));
5016 return -EINVAL;
5017 }
5018 }
5019 sysfs_notify_dirent_safe(rdev->sysfs_state);
5020 }
4981-4984行,用于请求内核模块加载,因为linux内核模块可以按需加载,只有在需要该模块的时候再加载这样比较节约资源。
4991行,首先看注释,丢掉原磁盘设备的缓存,从现在开始这些磁盘只能由md访问了。就好像一个人要去当兵了,进入部队之后原来的身份证作废,新发了一个军人证,并且这个人以后只归部队管了,地方政府法庭不能管。
4992行,判断为faulty盘,坏盘就不用多费心思了。
4994行,刷磁盘buffer。
4995行,注销掉原来的身份证。
4997行,看注释,基本检查,看磁盘上数据部分与超级块是否overlap。rdev->data_offset表示磁盘上数据区开始偏移,rdev->sb_start表示超级块开始偏移,mddev->dev_sectors表示磁盘用于阵列的空间,rdev->sb_size表示超级块大小。
5019行,更新sysfs文件中磁盘state状态。
5022 if (mddev->bio_set == NULL)
5023 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
5024
5025 spin_lock(&pers_lock);
5026 pers = find_pers(mddev->level, mddev->clevel);
5027 if (!pers || !try_module_get(pers->owner)) {
5028 spin_unlock(&pers_lock);
5029 if (mddev->level != LEVEL_NONE)
5030 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
5031 mddev->level);
5032 else
5033 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
5034 mddev->clevel);
5035 return -EINVAL;
5036 }
5037 mddev->pers = pers;
5038 spin_unlock(&pers_lock);
5039 if (mddev->level != pers->level) {
5040 mddev->level = pers->level;
5041 mddev->new_level = pers->level;
5042 }
5043 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5044
5045 if (mddev->reshape_position != MaxSector &&
5046 pers->start_reshape == NULL) {
5047 /* This personality cannot handle reshaping... */
5048 mddev->pers = NULL;
5049 module_put(pers->owner);
5050 return -EINVAL;
5051 }
5052
5053 if (pers->sync_request) {
5054 /* Warn if this is a potentially silly
5055 * configuration.
5056 */
5057 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5058 struct md_rdev *rdev2;
5059 int warned = 0;
5060
5061 rdev_for_each(rdev, mddev)
5062 rdev_for_each(rdev2, mddev) {
5063 if (rdev < rdev2 &&
5064 rdev->bdev->bd_contains ==
5065 rdev2->bdev->bd_contains) {
5066 printk(KERN_WARNING
5067 "%s: WARNING: %s appears to be"
5068 " on the same physical disk as"
5069 " %s.\n",
5070 mdname(mddev),
5071 bdevname(rdev->bdev,b),
5072 bdevname(rdev2->bdev,b2));
5073 warned = 1;
5074 }
5075 }
5076
5077 if (warned)
5078 printk(KERN_WARNING
5079 "True protection against single-disk"
5080 " failure might be compromised.\n");
5081 }
5082
5083 mddev->recovery = 0;
5084 /* may be over-ridden by personality */
5085 mddev->resync_max_sectors = mddev->dev_sectors;
5086
5087 mddev->ok_start_degraded = start_dirty_degraded;
5088
5089 if (start_readonly && mddev->ro == 0)
5090 mddev->ro = 2; /* read-only, but switch on first write */
5022行,创建bio内存池,用于读写时克隆保存原始bio。
5026行,查找对应阵列级别的struct md_personality是否存在,经过我们在4982行的request_module之后,新加载的模块会调用register_md_personality函数注册struct md_personality结构体,所以这里可以找到需要的pers。
5037行,将找到的pers赋值给mddev->pers。
5053行,这个if分支用于检查阵列中是否有两个struct md_rdev位于同一物理磁盘上。因为创建阵列可以用分区来创建,所以这里需要检查一下。如果两个struct md_rdev位于同一物理磁盘上,不仅会导致阵列性能很差,而且正如警告信息所说,一块物理磁盘故障就可能同时影响多个成员盘,阵列对单盘故障的保护会大打折扣。既然要玩raid就没有必要那么小气嘛,直接用整个磁盘,没有必要用磁盘分区。
5083行,初始化阵列sync标记。
5085行,初始化阵列最大同步偏移。
5087行,是否自动运行降级的脏阵列。可别小看了简简单单的一行代码,却代表了一个raid5阵列很复杂的问题。当一个raid5/6为脏并且降级时,就可能有数据错误的风险。为脏就是校验盘数据未经过同步,再加上降级就表示这一条带数据无法通过重建来恢复。所以md就不直接去运行阵列,而是由系统管理员手动运行。然而如果根文件系统是建立在raid上的时候,就会导致系统无法启动,所以就提供一个内核模块参数start_dirty_degraded来控制强制运行这样的阵列。
但实际上情况并没有看起来那么严重,例如在一个raid5阵列上建立一个ext4文件系统,为脏部分代表阵列还没有同步,而没有同步的条带是没有文件存储在条带上的(如果存储代表已经写过,写过的条带是同步的),所以这个时候强制运行降级的脏阵列是没有问题的。
5089行,在很多用户的环境里,经常会遇到一个问题,就是系统重启之后用cat /proc/mdstat查看时,阵列处于resync=PENDING状态,解决这个问题有两个方法,一是使用命令mdadm --readwrite /dev/md*,另一个是设置模块参数/sys/module/md_mod/parameters/start_ro为0。那么为什么要设置这样一个状态呢?代码作者Neil Brown这样设计,是为了解决Debian系统启动时要做的一件重要的事情,所以让阵列进入这个临时状态(mddev->ro = 2,即只读,但在第一次写时自动切换)。还好只要对阵列有写操作就会自动解除这个临时状态,对于正常使用没有影响。
5092 err = mddev->pers->run(mddev);
5093 if (err)
5094 printk(KERN_ERR "md: pers->run() failed ...\n");
5095 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
5096 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
5097 " but 'external_size' not in effect?\n", __func__);
5098 printk(KERN_ERR
5099 "md: invalid array_size %llu > default size %llu\n",
5100 (unsigned long long)mddev->array_sectors / 2,
5101 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
5102 err = -EINVAL;
5103 mddev->pers->stop(mddev);
5104 }
5105 if (err == 0 && mddev->pers->sync_request &&
5106 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5107 err = bitmap_create(mddev);
5108 if (err) {
5109 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
5110 mdname(mddev), err);
5111 mddev->pers->stop(mddev);
5112 }
5113 }
5114 if (err) {
5115 module_put(mddev->pers->owner);
5116 mddev->pers = NULL;
5117 bitmap_destroy(mddev);
5118 return err;
5119 }
5092行,毫无疑问一看函数名就知道这一行是重中之重。这里选择raid1的run作示例,因为raid1是比较简单的,raid5和raid10在后面小节单独讲解。在讲run之前先简要说明一下mddev->pers->run是怎么调用到各个模块的run函数的?
首先每个模块初始化的时候都会调用到register_md_personality函数,向md模块注册各自的struct md_personality结构,
7158 int register_md_personality(struct md_personality *p)
7159 {
7160 spin_lock(&pers_lock);
7161 list_add_tail(&p->list, &pers_list);
7162 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
7163 spin_unlock(&pers_lock);
7164 return 0;
7165 }
在md_run函数中根据mddev->level初始化mddev->pers,如果level为1,这里pers就指向raid1的struct md_personality raid1_personality,那么这里调用的run函数也就是raid1中的run函数。接着看raid1中的run函数:
2769 static int run(struct mddev *mddev)
2770 {
2771 struct r1conf *conf;
2772 int i;
2773 struct md_rdev *rdev;
2774 int ret;
2775 bool discard_supported = false;
2776
2777 if (mddev->level != 1) {
2778 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
2779 mdname(mddev), mddev->level);
2780 return -EIO;
2781 }
2782 if (mddev->reshape_position != MaxSector) {
2783 printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
2784 mdname(mddev));
2785 return -EIO;
2786 }
2787 /*
2788 * copy the already verified devices into our private RAID1
2789 * bookkeeping area. [whatever we allocate in run(),
2790 * should be freed in stop()]
2791 */
2792 if (mddev->private == NULL)
2793 conf = setup_conf(mddev);
2794 else
2795 conf = mddev->private;
2796
2797 if (IS_ERR(conf))
2798 return PTR_ERR(conf);
2799
2800 if (mddev->queue)
2801 blk_queue_max_write_same_sectors(mddev->queue, 0);
2802
2803 rdev_for_each(rdev, mddev) {
2804 if (!mddev->gendisk)
2805 continue;
2806 disk_stack_limits(mddev->gendisk, rdev->bdev,
2807 rdev->data_offset << 9);
2808 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
2809 discard_supported = true;
2810 }
2811
2812 mddev->degraded = 0;
2813 for (i=0; i < conf->raid_disks; i++)
2814 if (conf->mirrors[i].rdev == NULL ||
2815 !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2816 test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2817 mddev->degraded++;
2818
2819 if (conf->raid_disks - mddev->degraded == 1)
2820 mddev->recovery_cp = MaxSector;
2821
2822 if (mddev->recovery_cp != MaxSector)
2823 printk(KERN_NOTICE "md/raid1:%s: not clean"
2824 " -- starting background reconstruction\n",
2825 mdname(mddev));
2826 printk(KERN_INFO
2827 "md/raid1:%s: active with %d out of %d mirrors\n",
2828 mdname(mddev), mddev->raid_disks - mddev->degraded,
2829 mddev->raid_disks);
2830
2831 /*
2832 * Ok, everything is just fine now
2833 */
2834 mddev->thread = conf->thread;
2835 conf->thread = NULL;
2836 mddev->private = conf;
2837
2838 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2839
2840 if (mddev->queue) {
2841 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2842 mddev->queue->backing_dev_info.congested_data = mddev;
2843 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2844
2845 if (discard_supported)
2846 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
2847 mddev->queue);
2848 else
2849 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
2850 mddev->queue);
2851 }
2852
2853 ret = md_integrity_register(mddev);
2854 if (ret)
2855 stop(mddev);
2856 return ret;
2857 }
2777-2786行,基本检查。
2792行,域private未赋值,进入if分支。
2793行,配置raid1环境。俗话说,国有国法,家有家规。如果说struct mddev是国法,那么setup_conf要建立的struct r1conf就是家规了,同样对于raid5和raid10都有自己的家规struct r5conf和struct r10conf。struct mddev存放的是所有阵列共同的属性,而各自struct r*conf存放的是私有的属性,而这些私有属性就是为了管理好各自管辖的磁盘。进入setup_conf函数:
2648 static struct r1conf *setup_conf(struct mddev *mddev)
2649 {
2650 struct r1conf *conf;
2651 int i;
2652 struct raid1_info *disk;
2653 struct md_rdev *rdev;
2654 int err = -ENOMEM;
2655
2656 conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
2657 if (!conf)
2658 goto abort;
2659
2660 conf->mirrors = kzalloc(sizeof(struct raid1_info)
2661 * mddev->raid_disks * 2,
2662 GFP_KERNEL);
2663 if (!conf->mirrors)
2664 goto abort;
2665
2666 conf->tmppage = alloc_page(GFP_KERNEL);
2667 if (!conf->tmppage)
2668 goto abort;
2669
2670 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
2671 if (!conf->poolinfo)
2672 goto abort;
2673 conf->poolinfo->raid_disks = mddev->raid_disks * 2;
2674 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2675 r1bio_pool_free,
2676 conf->poolinfo);
2677 if (!conf->r1bio_pool)
2678 goto abort;
2679
2680 conf->poolinfo->mddev = mddev;
2681
2682 err = -EINVAL;
2683 spin_lock_init(&conf->device_lock);
2684 rdev_for_each(rdev, mddev) {
2685 struct request_queue *q;
2686 int disk_idx = rdev->raid_disk;
2687 if (disk_idx >= mddev->raid_disks
2688 || disk_idx < 0)
2689 continue;
2690 if (test_bit(Replacement, &rdev->flags))
2691 disk = conf->mirrors + mddev->raid_disks + disk_idx;
2692 else
2693 disk = conf->mirrors + disk_idx;
2694
2695 if (disk->rdev)
2696 goto abort;
2697 disk->rdev = rdev;
2698 q = bdev_get_queue(rdev->bdev);
2699 if (q->merge_bvec_fn)
2700 mddev->merge_check_needed = 1;
2701
2702 disk->head_position = 0;
2703 disk->seq_start = MaxSector;
2704 }
2705 conf->raid_disks = mddev->raid_disks;
2706 conf->mddev = mddev;
2707 INIT_LIST_HEAD(&conf->retry_list);
2708
2709 spin_lock_init(&conf->resync_lock);
2710 init_waitqueue_head(&conf->wait_barrier);
2711
2712 bio_list_init(&conf->pending_bio_list);
2713 conf->pending_count = 0;
2714 conf->recovery_disabled = mddev->recovery_disabled - 1;
2715
2716 err = -EIO;
2717 for (i = 0; i < conf->raid_disks * 2; i++) {
2718
2719 disk = conf->mirrors + i;
2720
2721 if (i < conf->raid_disks &&
2722 disk[conf->raid_disks].rdev) {
2723 /* This slot has a replacement. */
2724 if (!disk->rdev) {
2725 /* No original, just make the replacement
2726 * a recovering spare
2727 */
2728 disk->rdev =
2729 disk[conf->raid_disks].rdev;
2730 disk[conf->raid_disks].rdev = NULL;
2731 } else if (!test_bit(In_sync, &disk->rdev->flags))
2732 /* Original is not in_sync - bad */
2733 goto abort;
2734 }
2735
2736 if (!disk->rdev ||
2737 !test_bit(In_sync, &disk->rdev->flags)) {
2738 disk->head_position = 0;
2739 if (disk->rdev &&
2740 (disk->rdev->saved_raid_disk < 0))
2741 conf->fullsync = 1;
2742 }
2743 }
2744
2745 err = -ENOMEM;
2746 conf->thread = md_register_thread(raid1d, mddev, "raid1");
2747 if (!conf->thread) {
2748 printk(KERN_ERR
2749 "md/raid1:%s: couldn't allocate thread\n",
2750 mdname(mddev));
2751 goto abort;
2752 }
2753
2754 return conf;
2656-2680行,申请与读写相关的资源,后面讲读写的时候再深入。
2684行,对每个阵列中数据盘,在struct r1conf中建立关联,读写时用到。
2697行,建立struct r1conf到struct md_rdev关联。
2717行,磁盘replacement机制,这是阵列的高级特性,这里先不关注。
2746行,注册阵列处理线程。每个运行阵列都有这样的一个主线程,主要负责检查同步重建(只检查由另一线程负责具体处理),数据流处理。
小结一下,setup_conf函数主要作用是初始化struct r1conf,建立阵列数据流处理的上下文环境。
继续回到raid1的run函数中。
2803行,对阵列中每一个磁盘调用disk_stack_limits叠加struct queue_limits,每个块设备都有一个struct queue_limits,表示块设备队列物理特性。这里主要作用是让阵列(上层设备)的请求队列根据各成员磁盘请求队列的限制调整请求块大小和对齐。
2812-2817行,计算阵列降级磁盘数。
2834行,设置mddev->thread。
2836行,设置mddev->private为struct r1conf。
2838行,设置阵列大小。
2840-2851行,设置拥塞处理函数和请求合并函数。
2853行,块设备integrity,有兴趣可查看内核文档的integrity说明。
run函数就结束了,小结一下,run函数的主要作用是建立阵列读写的上下文环境,包括struct r1conf,阵列主线程等等。
继续回到md_run函数中。
5107行,创建阵列bitmap,具体过程在bitmap章节里再详细阅读。
接下来就是一些sysfs的显示和链接,最有欣赏价值的是mddev->safemode,什么是安全模式呢?没有写(包括同步和重建写)的时候就是安全模式,反之正在写的时候就不安全。因为对于有数据冗余的阵列来说,每一份数据都至少要写入两个物理磁盘中,在写的过程中程序异常或者系统掉电异常都会导致数据不一致,为了保证数据一致性,必须要在系统重启之后做全盘同步。然而全盘同步需要花费很长时间,bitmap的出现在一定程度上解决了这个问题,但却对阵列性能产生一定的消极作用。
经过了这么长的跋山涉水,终于又回到do_md_run的温暖怀抱了。这个函数不长,我们不厌其烦地再贴一次代码:
5158 static int do_md_run(struct mddev *mddev)
5159 {
5160 int err;
5161
5162 err = md_run(mddev);
5163 if (err)
5164 goto out;
5165 err = bitmap_load(mddev);
5166 if (err) {
5167 bitmap_destroy(mddev);
5168 goto out;
5169 }
5170
5171 md_wakeup_thread(mddev->thread);
5172 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5173
5174 set_capacity(mddev->gendisk, mddev->array_sectors);
5175 revalidate_disk(mddev->gendisk);
5176 mddev->changed = 1;
5177 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5178 out:
5179 return err;
5180 }
5165行,加载bitmap,同样留到bitmap章节再详解。
5171行,唤醒阵列主线程。
5172行,唤醒阵列同步线程。
5174行,设置虚拟gendisk磁盘大小。
5175行,重新校验磁盘(revalidate),使块设备大小与上一行设置的容量保持一致,让磁盘以正确的大小为系统可见。
5176行,设置md改变标志。
5177行,上报磁盘信息到udev。
do_md_run完成,RUN_ARRAY命令也就执行完成了。
小结一下,do_md_run函数的作用就是向上虚拟一个块设备,向下包装磁盘,建立读写请求的通道,将对md设备的请求能够转发到磁盘上去。