Device mapper 中这三个对象和 target driver 插件一起构成了一个可迭代的设备树。在该树型结构中的顶层根节点是最终作为逻辑设备向外提供的 mapped device,叶子节点是 target device 所表示的底层物理设备。最小的设备树由单个 mapped device 和 target device 组成。每个 target device 都是被mapped device 独占的,只能被一个 mapped device 使用。一个 mapped device 可以映射到一个或者多个 target device 上,而一个 mapped device 又可以作为它上层 mapped device的 target device 被使用,该层次在理论上可以在 device mapper 架构下无限迭代下去。如下图所示:
在上图中我们可以看到 mapped device 1 通过映射表和 a、b、c 三个 target device 建立了映射关系,而 target device a 又是通过 mapped device 2 演化过来,mapped device 2 通过映射表和 target device d 建立映射关系,target device d 又可以通过其他的映射关系演化过来。
我们进一步看一下上述三个对象在代码中的具体实现,dm.c 文件定义的 mapped_device 结构用于表示 mapped device,它主要包括该 mapped device 相关的锁,注册的请求队列和一些内存池以及指向它所对应映射表的指针等域。
struct mapped_device{ struct rw_semaphore io_lock; struct mutex suspend_lock; rwlock_t map_lock; atomic_t holders; atomic_t open_count; unsigned long flags; struct request_queue *queue; struct gendisk *disk; char name[16]; void *interface_ptr; /* * A list of ios that arrived while we were suspended. */ atomic_t pending[2]; wait_queue_head_t wait; struct work_struct work; struct bio_list deferred; spinlock_t deferred_lock; /* * An error from the barrier request currently being processed. */ int barrier_error; /* * Processing queue (flush/barriers) */ struct workqueue_struct *wq; /* * The current mapping. */ struct dm_table *map; /* * io objects are allocated from here. */ mempool_t *io_pool; mempool_t *tio_pool; struct bio_set *bs; /* * Event handling. */ atomic_t event_nr; wait_queue_head_t eventq; atomic_t uevent_seq; struct list_head uevent_list; spinlock_t uevent_lock; /* Protect access to uevent_list */ /* * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; /* marker of flush suspend for request-based dm */ struct request suspend_rq; /* For saving the address of __make_request for request based dm */ make_request_fn *saved_make_request_fn; /* sysfs handle */ struct kobject kobj; /* zero-length barrier that will be cloned and submitted to targets */ struct bio barrier_bio; }
/*
 * struct dm_table - the mapping table of a mapped_device (from
 * dm-table.c). Maps the device's sector range onto its dm_targets via
 * a btree indexed by sector.
 */
struct dm_table {
	struct mapped_device *md;	/* owning mapped device */
	atomic_t holders;
	unsigned type;

	/* btree table */
	unsigned int depth;
	unsigned int counts[MAX_DEPTH];	/* in nodes */
	sector_t *index[MAX_DEPTH];

	unsigned int num_targets;
	unsigned int num_allocated;
	sector_t *highs;		/* highest sector of each target */
	struct dm_target *targets;

	/*
	 * Indicates the rw permissions for the new logical
	 * device. This should be a combination of FMODE_READ
	 * and FMODE_WRITE.
	 */
	fmode_t mode;

	/* a list of devices used by this table */
	struct list_head devices;

	/* events get handed up using this callback */
	void (*event_fn)(void *);
	void *event_context;

	struct dm_md_mempools *mempools;
};
/*
 * struct dm_target - one entry of a dm_table: a [begin, begin+len)
 * sector range of the mapped device, handled by the target driver
 * identified by ->type. ->private holds the driver's per-target state
 * (for flashcache, a struct cache_c).
 */
struct dm_target {
	struct dm_table *table;		/* table this target belongs to */
	struct target_type *type;	/* target driver (ctr/dtr/map/...) */

	/* target limits */
	sector_t begin;
	sector_t len;

	/* Always a power of 2 */
	sector_t split_io;

	/*
	 * A number of zero-length barrier requests that will be submitted
	 * to the target for the purpose of flushing cache.
	 *
	 * The request number will be placed in union map_info->flush_request.
	 * It is a responsibility of the target driver to remap these requests
	 * to the real underlying devices.
	 */
	unsigned num_flush_requests;

	/* target specific data */
	void *private;

	/* Used to provide an error string from the ctr */
	char *error;
};
struct cache_c{ struct dm_target *tgt; /* dm_target描述了一个设备,这个块设备映射为mapped_device中的某一段 它是映射设备的基本构成单元 */ struct dm_dev *disk_dev; /* Source device */ struct dm_dev *cache_dev; /* Cache device */ int on_ssd_version; spinlock_t cache_spin_lock;//为临界资源设置的锁,保证并发条件下的数据一致性 struct cacheblock *cache; /* Hash table for cache blocks cacheblock是在内存中的保存的cache信息,每一个SSD的块都对应一个cacheblock */ struct cache_set *cache_sets;//每一个SSD中的set都对应一个cache_set struct cache_md_block_head *md_blocks_buf;//更新SSD上元数据信息时需要用到这个结构 unsigned int md_block_size; /* Metadata block size in sectors 存放元数据信息的块大小,包含多少个扇区 */ sector_t size; /* Cache size cache中块的数量*/ unsigned int assoc; /* Cache associativity 每个set默认的block数量为512*/ unsigned int block_size; /* Cache block size 每个块中包含的扇区个数*/ unsigned int block_shift; /* Cache block size in bits */ unsigned int block_mask; /* Cache block mask */ unsigned int assoc_shift; /* Consecutive blocks size in bits */ unsigned int num_sets; /* Number of cache sets */ int cache_mode;//back、through、around wait_queue_head_t destroyq; /* Wait queue for I/O completion */ /* wait_queue_head_t,让进程休眠, 当你在用户空间需要读写一大片数据的时候,这个就用上了。 1、定义:wait_queue_head_t my_queue; 2、初始化 init_waitqueue_head(&my_queue); 3、在一个函数里面等待:wait_event(queue, condition) ;(别在中断里面搞) 4、在另一个函数里面唤醒:wake_up(wait_queue_head_t *queue); (这个可以在中断调用,去唤醒别的进程,特别是dma操作类的) */ /* XXX - Updates of nr_jobs should happen inside the lock. 
But doing it outside is OK since the filesystem is unmounted at this point */ atomic_t nr_jobs; /* Number of I/O jobs */ #define SLOW_REMOVE 1 #define FAST_REMOVE 2 atomic_t remove_in_prog;/*该逻辑设备是否正处于被删除状态,以及以一种什么方式删除*/ int dirty_thresh_set; /* Per set dirty threshold to start cleaning 此处是脏块数*/ int max_clean_ios_set; /* Max cleaning IOs per set 块数*/ int max_clean_ios_total; /* Total max cleaning IOs 块数*/ int clean_inprog;//程序中需要回写的块数 int sync_index;//同步操作时,搜索的第一个块号 int nr_dirty;//处于DIRTY状态的块的数量 unsigned long cached_blocks; /* Number of cached blocks */ unsigned long pending_jobs_count;//整个逻辑设备中所有块对应的等待job的总和数量 int md_blocks; /* Numbers of metadata blocks, including header */ /* Stats */ struct flashcache_stats flashcache_stats;//记录虚拟出的flashcache设备的状态 /* Errors */ struct flashcache_errors flashcache_errors;//记录虚拟出的flashcache设备的一些错误状态 #define IO_LATENCY_GRAN_USECS 250//描述IO延迟的直方图的粒度为250us #define IO_LATENCY_MAX_US_TRACK 10000 /* 10 ms *///跟踪的最大的IO延迟为10ms #define IO_LATENCY_BUCKETS (IO_LATENCY_MAX_US_TRACK / IO_LATENCY_GRAN_USECS) unsigned long latency_hist[IO_LATENCY_BUCKETS];//不超过10ms的请求分别统计 unsigned long latency_hist_10ms;//超过10ms的请求一起统计 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) struct work_struct delayed_clean;//Every pending function is represented by a work_struct #else struct delayed_work delayed_clean; /* To ensure that work queued will be executed after a specified time interval has passed since submission,the work_struct needs to be extended with a timer. The solution is as obvious as can be: struct delayed_work { struct work_struct work; struct timer_list timer; }; */ #endif unsigned long pid_expire_check; /* In "cache everything" mode: 1.If the pid of the process issuing the IO is in the blacklist, do not cache the IO. ELSE, 2.If the tgid is in the blacklist, don't cache this IO. UNLESS The particular pid is marked as an exception (and entered in the whitelist, which makes the IO cacheable). 
3.Finally, even if IO is cacheable up to this point, skip sequential IO if configured by the sysctl. Conversely, in "cache nothing" mode: 1.If the pid of the process issuing the IO is in the whitelist, cache the IO. ELSE, 2.If the tgid is in the whitelist, cache this IO. UNLESS The particular pid is marked as an exception (and entered in the blacklist, which makes the IO non-cacheable). 4.Anything whitelisted is cached, regardless of sequential or random IO. */ struct flashcache_cachectl_pid *blacklist_head, *blacklist_tail; struct flashcache_cachectl_pid *whitelist_head, *whitelist_tail; int num_blacklist_pids, num_whitelist_pids; unsigned long blacklist_expire_check, whitelist_expire_check;//以上是与进程的 //黑名单列表与白名单列表的有关变量 #define PENDING_JOB_HASH_SIZE 32 struct pending_job *pending_job_hashbuckets[PENDING_JOB_HASH_SIZE]; struct cache_c *next_cache; void *sysctl_handle; // DM virtual device name, stored in superblock and restored on load char dm_vdevname[DEV_PATHLEN]; // real device names are now stored as UUIDs char cache_devname[DEV_PATHLEN]; char disk_devname[DEV_PATHLEN]; /* * If the SSD returns errors, in WRITETHRU and WRITEAROUND modes, * bypass the cache completely. If the SSD dies or is removed, * we want to continue sending requests to the device. *(这个device应该是指的整个虚拟出来的flashcache设备) */ int bypass_cache; /* Per device sysctls */ int sysctl_io_latency_hist;//这个变量置为1,才会画IO请求时间的直方图 /* Compute IO latencies and plot these out on a histogram. The scale is 250 usecs. This is disabled by default since internally flashcache uses gettimeofday() to compute latency and this can get expensive depending on the clock source used. 根据时钟源的不同,使用gettimeofday() 来计算时延可能会产生很大开销。 Setting this to 1 enables computation of IO latencies. The IO latency histogram is appended to 'dmsetup status'. */ int sysctl_do_sync; /* it is for write back dev.flashcache.<cachedev>.do_sync = 0 Schedule cleaning of all dirty blocks in the cache. 
*/ int sysctl_stop_sync; /* it is for write back dev.flashcache.<cachedev>.stop_sync = 0 Stop the sync in progress. */ int sysctl_dirty_thresh;//这个是脏块比例 /* it is for write back dev.flashcache.<cachedev>.dirty_thresh_pct = 20 Flashcache will attempt to keep the dirty blocks in each set under this %. A lower dirty threshold increases disk writes, and reduces block overwrites, but increases the blocks available for read caching. (一个更低的脏页阀值,会增加磁盘的写操作, 为什么会降低块的重写率呢?) */ int sysctl_pid_do_expiry;//Enable expiry on the list of pids in the white/black lists. int sysctl_max_pids;//Maximum number of pids in the white/black lists. int sysctl_pid_expiry_secs;//Set the expiry on the pid white/black lists. int sysctl_reclaim_policy; /* Defaults to FIFO. Can be switched at runtime. FIFO (0) vs LRU (1) vs LFU(2) */ int sysctl_zerostats;//Zero stats (once). int sysctl_error_inject; int sysctl_fast_remove; /* it is for write back Don't sync dirty blocks when removing cache. On a reload both DIRTY and CLEAN blocks persist in the cache. This option can be used to do a quick cache remove. CAUTION: The cache still has uncommitted (to disk) dirty blocks after a fast_remove. */ int sysctl_cache_all; /* Global caching mode to cache everything or cache nothing. See section on Caching Controls. Defaults to "cache everything". */ int sysctl_fallow_clean_speed; /* 默认15分钟清理一次,也有不理想的地方,加大了回写的概率,相应的加大了刷盘 的数量, 增加后备慢速磁盘的负担。于是它引入另外参数 fallow_clean_speed 控制每次回刷的强度。 it is for write back The maximum number of "fallow clean" disk writes per set per second. Defaults to 2. */ int sysctl_fallow_delay; /* it is for write back In seconds. Clean dirty blocks that have been "idle" (not read or written) for fallow_delay seconds. Default is 15 minutes. Setting this to 0 disables idle cleaning completely. */ int sysctl_skip_seq_thresh_kb; /* Skip (don't cache) sequential IO larger than this number (in kb). 0 (default) means cache all IO, both sequential and random. 
Sequential IO can only be determined 'after the fact', so this much of each sequential I/O will be cached before we skip the rest. Does not affect searching for IO in an existing cache. */ /* Sequential I/O spotter */ struct sequential_io seq_recent_ios[SEQUENTIAL_TRACKER_QUEUE_DEPTH]; struct sequential_io *seq_io_head; struct sequential_io *seq_io_tail; }
/*
 * Registration descriptor for the "flashcache" dm target type; device
 * mapper calls these hooks for every mapped device built on this target.
 */
static struct target_type flashcache_target = {
	.name = "flashcache",
	.version= {1, 0, 4},
	.module = THIS_MODULE,
	.ctr = flashcache_ctr,		/* construct a target device */
	.dtr = flashcache_dtr,		/* destroy a target device */
	.map = flashcache_map,		/* map an IO request onto the target */
	.status = flashcache_status,	/* report the target device's current status */
	.ioctl = flashcache_ioctl,	/* lets users tune flashcache parameters at runtime */
};
/*
 * flashcache_map - the dm target ->map hook: decide how each incoming
 * bio is serviced (cached read/write, or uncached IO straight to the
 * backing disk). Returns DM_MAPIO_SUBMITTED since the IO is dispatched
 * here rather than remapped for the caller to submit.
 */
int flashcache_map(struct dm_target *ti, struct bio *bio, union map_info *map_context){
	struct cache_c *dmc = (struct cache_c *) ti->private;
	int sectors = to_sector(bio->bi_size);	/* bi_size is bytes; convert to sectors */
	int queued;

	/* Histogram of bio request sizes; only sizes up to 32 sectors (16KB) are recorded. */
	if (sectors <= 32)
		size_hist[sectors]++;

	/*
	 * A barrier bio would insert a serialization point in the IO queue,
	 * forcing previously submitted IO to complete before this one is
	 * issued. Not supported here.
	 */
	if (bio_barrier(bio))
		return -EOPNOTSUPP;

	VERIFY(to_sector(bio->bi_size) <= dmc->block_size);

	/* flashcache_stats tracks the whole logical device. */
	if (bio_data_dir(bio) == READ)
		dmc->flashcache_stats.reads++;
	else
		dmc->flashcache_stats.writes++;

	/* Disable local interrupts and take the cache state lock. */
	spin_lock_irq(&dmc->cache_spin_lock);
	/*
	 * If pid expiry is enabled and either pid list is non-empty, drop
	 * expired pids from the white/black lists now.
	 */
	if (unlikely(dmc->sysctl_pid_do_expiry &&
		     (dmc->whitelist_head || dmc->blacklist_head)))
		flashcache_pid_expiry_all_locked(dmc);
	/*
	 * Uncacheable cases:
	 * 1. bypass_cache is explicitly set;
	 * 2. the bio size differs from the cache block size;
	 * 3. the bio is a WRITE and either the cache mode is write-around
	 *    or the issuing process is marked uncacheable.
	 * Only WRITEs matter for case 3: an uncacheable READ is still served
	 * from SSD or disk depending on whether it hits the cache — only the
	 * post-read handling differs (cacheable reads populate the cache,
	 * uncacheable ones don't). For WRITEs the path diverges immediately:
	 * cacheable writes go to the SSD, uncacheable writes go to the disk.
	 */
	if (unlikely(dmc->bypass_cache) ||
	    (to_sector(bio->bi_size) != dmc->block_size) ||
	    (bio_data_dir(bio) == WRITE &&
	     (dmc->cache_mode == FLASHCACHE_WRITE_AROUND || flashcache_uncacheable(dmc, bio)))) {
		/*
		 * Invalidate any cache blocks overlapping this bio (cleaning
		 * them first if necessary). Returns:
		 *  1  - an overlapping block exists that is DIRTY, busy, or
		 *       VALID with pending requests (invalidation queued);
		 *  0  - no such overlapping block;
		 * -ENOMEM (<0) - could not allocate a job.
		 */
		queued = flashcache_inval_blocks(dmc, bio);
		spin_unlock_irq(&dmc->cache_spin_lock);
		if (queued) {
			/*
			 * Can't do uncached IO yet: servicing the bio on disk
			 * would make the overlapping cached copy stale, and a
			 * DIRTY block hasn't been written back while a busy/
			 * pending block is still being accessed. On allocation
			 * failure, fail the bio (this also folds its service
			 * time into the latency histogram).
			 */
			if (unlikely(queued < 0))
				flashcache_bio_endio(bio, -EIO, dmc, NULL);
		} else {
			/* No conflicting cache blocks: start uncached IO. */
			flashcache_start_uncached_io(dmc, bio);
		}
	} else {
		/* Cacheable: dispatch by access type. */
		spin_unlock_irq(&dmc->cache_spin_lock);
		if (bio_data_dir(bio) == READ)
			flashcache_read(dmc, bio);
		else
			flashcache_write(dmc, bio);
	}
	return DM_MAPIO_SUBMITTED;
}
四、内核中建立一个mapped device的过程:
1、根据内核向用户空间提供的 ioctl 接口传来的参数,用 dm-ioctl.c 文件中的 dev_create 函数创建相应的 mapped device 结构。这个过程很简单,主要是向内核申请必要的内存资源,包括 mapped device 和为进行 IO 操作预申请的内存池,并通过内核提供的 blk_queue_make_request 函数为该 mapped device 的请求队列注册请求处理函数 dm_request,最后将该 mapped device 作为磁盘块设备注册到内核中。
2、调用dm_hash_insert将创建好的mapped device插入到device mapper中的一个全局hash表中,该表中保存了内核中当前创建的所有mapped device。
3、用户空间命令通过ioctl调用table_load函数,该函数根据用户空间传来的参数构建指定mapped device的映射表和所映射的target device。该函数先构建相应的dm_table、dm_target结构,再调用dm-table.c中的dm_table_add_target函数根据用户传入的参数初始化这些结构,并且根据参数所指定的target类型,调用相应的target类型的构建函数ctr在内存中构建target device对应的结构,然后再根据所建立的dm_target结构更新dm_table中维护的B树。上述过程完毕后,再将建立好的dm_table添加到mapped device的全局hash表对应的hash_cell结构中。
4、最后通过ioctl调用do_resume函数建立mapped device和映射表之间的绑定关系,事实上该过程就是通过dm_swap_table函数将当前dm_table结构指针值赋予mapped_device相应的map域中,然后再修改mapped_device表示当前状态的域。
通过上述的4个主要步骤,device mapper在内核中就建立一个可以提供给用户使用的mapped device逻辑块设备。