linux内核源码阅读之facebook硬盘加速flashcache之三

上一节讲到在刷缓存的时候会调用new_kcached_job创建kcached_job,由此我们也可以看到cache数据块与磁盘数据的对应关系。上一篇:http://blog.csdn.net/liumangxiong/article/details/11726651
现在继续从new_kcached_job函数中挖掘有用的信息。那就是cache块跟磁盘上的扇区是怎么对应起来的?即329行为什么要把disk.sector设置成后面这个值呢?
job->disk.sector = dmc->cache[index].dbn;
到这里是时候揭开变量dmc也就是结构体struct cache_c的真面目了。dmc可以理解成device mapper context或者device mapper cache。先看struct cache_c
[cpp] view plain copy print ?
  1. 134struct cache_c {
  2. 135 struct dm_target *tgt;
  3. 136
  4. 137 struct dm_dev *disk_dev; /* Source device */
  5. 138 struct dm_dev *cache_dev; /* Cache device */
  6. 139
  7. 140#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
  8. 141 struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
  9. 142#else
  10. 143 struct dm_kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
  11. 144 struct dm_io_client *io_client; /* Client memory pool*/
  12. 145#endif
  13. 146
  14. 147 spinlock_t cache_spin_lock;
  15. 148
  16. 149 struct cacheblock *cache; /* Hash table for cache blocks */
  17. 150 struct cache_set *cache_sets;
  18. 151 struct cache_md_sector_head *md_sectors_buf;
  19. 152
  20. 153 sector_t size; /* Cache size */
  21. 154 unsigned int assoc; /* Cache associativity */
  22. 155 unsigned int block_size; /* Cache block size */
  23. 156 unsigned int block_shift; /* Cache block size in bits */
  24. 157 unsigned int block_mask; /* Cache block mask */
  25. 158 unsigned int consecutive_shift; /* Consecutive blocks size in bits */
  26. 159
  27. 160 wait_queue_head_t destroyq; /* Wait queue for I/O completion */
  28. 161 /* XXX - Updates of nr_jobs should happen inside the lock. But doing it outside
  29. 162 is OK since the filesystem is unmounted at this point */
  30. 163 atomic_t nr_jobs; /* Number of I/O jobs */
  31. 164 atomic_t fast_remove_in_prog;
  32. 165
  33. 166 int dirty_thresh_set; /* Per set dirty threshold to start cleaning */
  34. 167 int max_clean_ios_set; /* Max cleaning IOs per set */
  35. 168 int max_clean_ios_total; /* Total max cleaning IOs */
  36. 169 int clean_inprog;
  37. 170 int sync_index;
  38. 171 int nr_dirty;
  39. 172
  40. 173 int md_sectors; /* Numbers of metadata sectors, including header */
  41. 174
  42. 175 /* Stats */
  43. 176 unsigned long reads; /* Number of reads */
  44. 177 unsigned long writes; /* Number of writes */
  45. 178 unsigned long read_hits; /* Number of cache hits */
  46. 179 unsigned long write_hits; /* Number of write hits (includes dirty write hits) */
  47. 180 unsigned long dirty_write_hits; /* Number of "dirty" write hits */
  48. 181 unsigned long replace; /* Number of cache replacements */
  49. 182 unsigned long wr_replace;
  50. 183 unsigned long wr_invalidates; /* Number of write invalidations */
  51. 184 unsigned long rd_invalidates; /* Number of read invalidations */
  52. 185 unsigned long pending_inval; /* Invalidations due to concurrent ios on same block */
  53. 186 unsigned long cached_blocks; /* Number of cached blocks */
  54. 187#ifdef FLASHCACHE_DO_CHECKSUMS
  55. 188 unsigned long checksum_store;
  56. 189 unsigned long checksum_valid;
  57. 190 unsigned long checksum_invalid;
  58. 191#endif
  59. 192 unsigned long enqueues; /* enqueues on pending queue */
  60. 193 unsigned long cleanings;
  61. 194 unsigned long noroom; /* No room in set */
  62. 195 unsigned long md_write_dirty; /* Metadata sector writes dirtying block */
  63. 196 unsigned long md_write_clean; /* Metadata sector writes cleaning block */
  64. 197 unsigned long pid_drops;
  65. 198 unsigned long pid_adds;
  66. 199 unsigned long pid_dels;
  67. 200 unsigned long expiry;
  68. 201 unsigned long front_merge, back_merge; /* Write Merging */
  69. 202 unsigned long uncached_reads, uncached_writes;
  70. 203 unsigned long disk_reads, disk_writes;
  71. 204 unsigned long ssd_reads, ssd_writes;
  72. 205 unsigned long ssd_readfills, ssd_readfill_unplugs;
  73. 206
  74. 207 unsigned long clean_set_calls;
  75. 208 unsigned long clean_set_less_dirty;
  76. 209 unsigned long clean_set_fails;
  77. 210 unsigned long clean_set_ios;
  78. 211 unsigned long set_limit_reached;
  79. 212 unsigned long total_limit_reached;
  80. 213
  81. 214 /* Errors */
  82. 215 int disk_read_errors;
  83. 216 int disk_write_errors;
  84. 217 int ssd_read_errors;
  85. 218 int ssd_write_errors;
  86. 219 int memory_alloc_errors;
  87. 220
  88. 221#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
  89. 222 struct work_struct delayed_clean;
  90. 223#else
  91. 224 struct delayed_work delayed_clean;
  92. 225#endif
  93. 226
  94. 227 /* State for doing readfills (batch writes to ssd) */
  95. 228 int readfill_in_prog;
  96. 229 struct kcached_job *readfill_queue;
  97. 230 struct work_struct readfill_wq;
  98. 231
  99. 232 unsigned long pid_expire_check;
  100. 233
  101. 234 struct flashcache_cachectl_pid *blacklist_head, *blacklist_tail;
  102. 235 struct flashcache_cachectl_pid *whitelist_head, *whitelist_tail;
  103. 236 int num_blacklist_pids, num_whitelist_pids;
  104. 237 unsigned long blacklist_expire_check, whitelist_expire_check;
  105. 238
  106. 239 struct cache_c *next_cache;
  107. 240
  108. 241 char cache_devname[DEV_PATHLEN];
  109. 242 char disk_devname[DEV_PATHLEN];
  110. 243};
134struct cache_c {
135	struct dm_target	*tgt;
136	
137	struct dm_dev 		*disk_dev;   /* Source device */
138	struct dm_dev 		*cache_dev; /* Cache device */
139
140#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
141	struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
142#else
143	struct dm_kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
144	struct dm_io_client *io_client; /* Client memory pool*/
145#endif
146
147	spinlock_t		cache_spin_lock;
148
149	struct cacheblock	*cache;	/* Hash table for cache blocks */
150	struct cache_set	*cache_sets;
151	struct cache_md_sector_head *md_sectors_buf;
152	
153	sector_t size;			/* Cache size */
154	unsigned int assoc;		/* Cache associativity */
155	unsigned int block_size;	/* Cache block size */
156	unsigned int block_shift;	/* Cache block size in bits */
157	unsigned int block_mask;	/* Cache block mask */
158	unsigned int consecutive_shift;	/* Consecutive blocks size in bits */
159
160	wait_queue_head_t destroyq;	/* Wait queue for I/O completion */
161	/* XXX - Updates of nr_jobs should happen inside the lock. But doing it outside
162	   is OK since the filesystem is unmounted at this point */
163	atomic_t nr_jobs;		/* Number of I/O jobs */
164	atomic_t fast_remove_in_prog;
165
166	int	dirty_thresh_set;	/* Per set dirty threshold to start cleaning */
167	int	max_clean_ios_set;	/* Max cleaning IOs per set */
168	int	max_clean_ios_total;	/* Total max cleaning IOs */
169	int	clean_inprog;
170	int	sync_index;
171	int	nr_dirty;
172
173	int	md_sectors;		/* Numbers of metadata sectors, including header */
174
175	/* Stats */
176	unsigned long reads;		/* Number of reads */
177	unsigned long writes;		/* Number of writes */
178	unsigned long read_hits;	/* Number of cache hits */
179	unsigned long write_hits;	/* Number of write hits (includes dirty write hits) */
180	unsigned long dirty_write_hits;	/* Number of "dirty" write hits */
181	unsigned long replace;		/* Number of cache replacements */
182	unsigned long wr_replace;
183	unsigned long wr_invalidates;	/* Number of write invalidations */
184	unsigned long rd_invalidates;	/* Number of read invalidations */
185	unsigned long pending_inval;	/* Invalidations due to concurrent ios on same block */
186	unsigned long cached_blocks;	/* Number of cached blocks */
187#ifdef FLASHCACHE_DO_CHECKSUMS
188	unsigned long checksum_store;
189	unsigned long checksum_valid;
190	unsigned long checksum_invalid;
191#endif
192	unsigned long enqueues;		/* enqueues on pending queue */
193	unsigned long cleanings;
194	unsigned long noroom;		/* No room in set */
195	unsigned long md_write_dirty;	/* Metadata sector writes dirtying block */
196	unsigned long md_write_clean;	/* Metadata sector writes cleaning block */
197	unsigned long pid_drops;
198	unsigned long pid_adds;
199	unsigned long pid_dels;
200	unsigned long expiry;
201	unsigned long front_merge, back_merge;	/* Write Merging */
202	unsigned long uncached_reads, uncached_writes;
203	unsigned long disk_reads, disk_writes;
204	unsigned long ssd_reads, ssd_writes;
205	unsigned long ssd_readfills, ssd_readfill_unplugs;
206
207	unsigned long clean_set_calls;
208	unsigned long clean_set_less_dirty;
209	unsigned long clean_set_fails;
210	unsigned long clean_set_ios;
211	unsigned long set_limit_reached;
212	unsigned long total_limit_reached;
213
214	/* Errors */
215	int	disk_read_errors;
216	int	disk_write_errors;
217	int	ssd_read_errors;
218	int	ssd_write_errors;
219	int	memory_alloc_errors;
220
221#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
222	struct work_struct delayed_clean;
223#else
224	struct delayed_work delayed_clean;
225#endif
226
227	/* State for doing readfills (batch writes to ssd) */
228	int readfill_in_prog;
229	struct kcached_job *readfill_queue;
230	struct work_struct readfill_wq;
231
232	unsigned long pid_expire_check;
233
234	struct flashcache_cachectl_pid *blacklist_head, *blacklist_tail;
235	struct flashcache_cachectl_pid *whitelist_head, *whitelist_tail;
236	int num_blacklist_pids, num_whitelist_pids;
237	unsigned long blacklist_expire_check, whitelist_expire_check;
238	
239	struct cache_c	*next_cache;
240
241	char cache_devname[DEV_PATHLEN];
242	char disk_devname[DEV_PATHLEN];
243};
这么多field,如果一个挨一个看一遍,估计我都要睡着了。就像看书一样,如果从第一章看到最后一章,看过之后脑子里总是一片空白。如果是先看目录,带着疑问找自己感兴趣的地方,不时回味一下为什么是这样子,不失为一种愉快并且有效的阅读方式。
那么在看这个数据结构之前,头脑风暴一下产生了以下的疑问:
1)源设备和目的设备分别是什么?映射后数据流是怎么样的?
2)缓存大小是多少?块大小多少?块是怎么组织的
3)脏数据刷新机制是什么样的?水位线是多少
第137,138行表示的是磁盘和SSD盘,即源设备(被缓存的磁盘)和缓存设备。这里必须十分清楚缓存的概念,一般情况下讲到缓存都是在内存中的,但flashcache中提到缓存块的时候要记住是写在SSD盘上的。数据流的变化就是多了一层flashcache device,在命中情况下直接返回,不到磁盘层。
缓存大小由153行size表示,但要注意的是,这里的size既不是以字节为单位,也不是以sector为单位,而是以cache数据块为单位的。每个cache数据块为block_size大小。cache块的组织是以集合为单位,每个集合有assoc个块,简单地理解为二维数组,第一维得到的是一个集合,第二维得到集合内块数据。为了理解这个结构,来看函数flashcache_lookup,
[cpp] view plain copy print ?
  1. 543/*
  2. 544 * dbn is the starting sector, io_size is the number of sectors.
  3. 545 */
  4. 546static int
  5. 547flashcache_lookup(struct cache_c *dmc, struct bio *bio, int *index)
  6. 548{
  7. 549 sector_t dbn = bio->bi_sector;
  8. 550#if DMC_DEBUG
  9. 551 int io_size = to_sector(bio->bi_size);
  10. 552#endif
  11. 553 unsigned long set_number = hash_block(dmc, dbn);
  12. 554 int invalid, oldest_clean = -1;
  13. 555 int start_index;
  14. 556
  15. 557 start_index = dmc->assoc * set_number;
  16. 558 DPRINTK("Cache lookup : dbn %llu(%lu), set = %d",
  17. 559 dbn, io_size, set_number);
  18. 560 find_valid_dbn(dmc, dbn, start_index, index);
  19. 561 if (*index > 0) {
  20. 562 DPRINTK("Cache lookup HIT: Block %llu(%lu): VALID index %d",
  21. 563 dbn, io_size, *index);
  22. 564 /* We found the exact range of blocks we are looking for */
  23. 565 return VALID;
  24. 566 }
  25. 567 invalid = find_invalid_dbn(dmc, start_index);
  26. 568 if (invalid == -1) {
  27. 569 /* We didn't find an invalid entry, search for oldest valid entry */
  28. 570 find_reclaim_dbn(dmc, start_index, &oldest_clean);
  29. 571 }
  30. 572 /*
  31. 573 * Cache miss :
  32. 574 * We can't choose an entry marked INPROG, but choose the oldest
  33. 575 * INVALID or the oldest VALID entry.
  34. 576 */
  35. 577 *index = start_index + dmc->assoc;
  36. 578 if (invalid != -1) {
  37. 579 DPRINTK("Cache lookup MISS (INVALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
  38. 580 dbn, io_size, set_number, invalid, start_index);
  39. 581 *index = invalid;
  40. 582 } else if (oldest_clean != -1) {
  41. 583 DPRINTK("Cache lookup MISS (VALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
  42. 584 dbn, io_size, set_number, oldest_clean, start_index);
  43. 585 *index = oldest_clean;
  44. 586 } else {
  45. 587 DPRINTK_LITE("Cache read lookup MISS (NOROOM): dbn %llu(%lu), set = %d",
  46. 588 dbn, io_size, set_number);
  47. 589 }
  48. 590 if (*index < (start_index + dmc->assoc))
  49. 591 return INVALID;
  50. 592 else {
  51. 593 dmc->noroom++;
  52. 594 return -1;
  53. 595 }
  54. 596}
543/* 
544 * dbn is the starting sector, io_size is the number of sectors.
545 */
546static int 
547flashcache_lookup(struct cache_c *dmc, struct bio *bio, int *index)
548{
549	sector_t dbn = bio->bi_sector;
550#if DMC_DEBUG
551	int io_size = to_sector(bio->bi_size);
552#endif
553	unsigned long set_number = hash_block(dmc, dbn);
554	int invalid, oldest_clean = -1;
555	int start_index;
556
557	start_index = dmc->assoc * set_number;
558	DPRINTK("Cache lookup : dbn %llu(%lu), set = %d",
559		dbn, io_size, set_number);
560	find_valid_dbn(dmc, dbn, start_index, index);
561	if (*index > 0) {
562		DPRINTK("Cache lookup HIT: Block %llu(%lu): VALID index %d",
563			     dbn, io_size, *index);
564		/* We found the exact range of blocks we are looking for */
565		return VALID;
566	}
567	invalid = find_invalid_dbn(dmc, start_index);
568	if (invalid == -1) {
569		/* We didn't find an invalid entry, search for oldest valid entry */
570		find_reclaim_dbn(dmc, start_index, &oldest_clean);
571	}
572	/* 
573	 * Cache miss :
574	 * We can't choose an entry marked INPROG, but choose the oldest
575	 * INVALID or the oldest VALID entry.
576	 */
577	*index = start_index + dmc->assoc;
578	if (invalid != -1) {
579		DPRINTK("Cache lookup MISS (INVALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
580			     dbn, io_size, set_number, invalid, start_index);
581		*index = invalid;
582	} else if (oldest_clean != -1) {
583		DPRINTK("Cache lookup MISS (VALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
584			     dbn, io_size, set_number, oldest_clean, start_index);
585		*index = oldest_clean;
586	} else {
587		DPRINTK_LITE("Cache read lookup MISS (NOROOM): dbn %llu(%lu), set = %d",
588			dbn, io_size, set_number);
589	}
590	if (*index < (start_index + dmc->assoc))
591		return INVALID;
592	else {
593		dmc->noroom++;
594		return -1;
595	}
596}
第549行dbn是bio的起始扇区,第553行set_number就是这个扇区映射到的缓存集合,简单地看一下hash_block:
[cpp] view plain copy print ?
  1. 444/*
  2. 445 * Map a block from the source device to a block in the cache device.
  3. 446 */
  4. 447static unsigned long
  5. 448hash_block(struct cache_c *dmc, sector_t dbn)
  6. 449{
  7. 450 unsigned long set_number, value;
  8. 451
  9. 452 value = (unsigned long)
  10. 453 (dbn >> (dmc->block_shift + dmc->consecutive_shift));
  11. 454 set_number = value % (dmc->size >> dmc->consecutive_shift);
  12. 455 DPRINTK("Hash: %llu(%lu)->%lu", dbn, value, set_number);
  13. 456 return set_number;
  14. 457}
444/*
445 * Map a block from the source device to a block in the cache device.
446 */
447static unsigned long 
448hash_block(struct cache_c *dmc, sector_t dbn)
449{
450	unsigned long set_number, value;
451
452	value = (unsigned long)
453		(dbn >> (dmc->block_shift + dmc->consecutive_shift));
454	set_number = value % (dmc->size >> dmc->consecutive_shift);
455	DPRINTK("Hash: %llu(%lu)->%lu", dbn, value, set_number);
456	return set_number;
457}
看注释,源设备块映射到cache设备块。看452行,1<<dmc->block_shift就是块大小,1<<dmc->consecutive_shift就是每个集合大小,所以得到的value就是这个扇区在哪个集合上。但直接返回这个值还不行,一般情况下源设备比缓存大得多,所以源设备上多处位置会映射到缓存的一个集合上。所以有了454行,源设备的多个集合映射到缓存的同一个集合上,(dmc->size >> dmc->consecutive_shift)就表示集合的个数。
继续flashcache_lookup第557行,start_index就是这个集合第一个cache块的下标,560行find_valid_dbn就是查找缓存是否命中,如果命中的话,由index返回,如果不命中,返回-1。第561行就是判断缓存命中,如果命中就直接返回;不命中的话就继续567行查找可用的缓存块。第578行是找到可用缓存块,582就是找到干净的回收缓存块,586就是没有找到可用的缓存块。
回到cache_c结构中来,接着讲刷新。刷新是由第224行工作队列控制的struct delayed_work delayed_clean;这个队列为什么是delayed_work,搜下这个队列的调用,在函数flashcache_clean_set中,
if (do_delayed_clean)
schedule_delayed_work(&dmc->delayed_clean, 1*HZ);
那为什么是延迟1秒调用,看do_delayed_clean
if (dmc->cache_sets[set].nr_dirty > dmc->dirty_thresh_set)
do_delayed_clean = 1;
这里的意思就是超过阈值的时候延迟1秒再检查一遍,为什么不立即做而要延迟呢?这个函数再往回看就知道了,原来下发的请求已经超过某一个阈值,这个时候就不再下发。
除了这个队列之外,还需要有一些阈值来控制。从166行到171行就是这些相关的设置。
nr_dirty 是整个缓存中脏cache块的个数(每个集合各自的脏块数记录在cache_sets[set].nr_dirty中)
dirty_thresh_set 是每个集合的脏块阈值,超过这个界限就要开始将脏数据写回磁盘
max_clean_ios_set 是单个集合允许同时下发的写数据块请求的最大个数
max_clean_ios_total 是整个缓存允许同时下发的写数据块请求的最大个数
clean_inprog 是已经下发的写数据块的请求个数
到这里再回去扫描一下cache_c结构,还有一些IO统计和错误统计的field。
每一场戏都有精彩好戏在后头,cache_c也不例外,接着请三巨头隆重上场:
struct cacheblock *cache; /* Hash table for cache blocks */
struct cache_set *cache_sets;
struct cache_md_sector_head *md_sectors_buf;
第一个结构是cache块在内存中的表示,对应SSD上的是flash_cacheblock。第二个cache_set就是之前一直提到的集合。第三个用于flash_cacheblock刷新,即管理结构从内存cacheblock写到SSD的flash_cacheblock。下面逐一来看这三个结构体:
[cpp] view plain copy print ?
  1. 111/* Cache block metadata structure */
  2. 112struct cacheblock {
  3. 113 u_int16_t cache_state;
  4. 114 int16_t nr_queued; /* jobs in pending queue */
  5. 115 u_int16_t lru_prev, lru_next;
  6. 116 sector_t dbn; /* Sector number of the cached block */
  7. 117#ifdef FLASHCACHE_DO_CHECKSUMS
  8. 118 u_int64_t checksum;
  9. 119#endif
  10. 120 struct pending_job *head;
  11. 121};
111/* Cache block metadata structure */
112struct cacheblock {
113	u_int16_t	cache_state;
114	int16_t 	nr_queued;	/* jobs in pending queue */
115	u_int16_t	lru_prev, lru_next;
116	sector_t 	dbn;	/* Sector number of the cached block */
117#ifdef FLASHCACHE_DO_CHECKSUMS
118	u_int64_t 	checksum;
119#endif
120	struct pending_job *head;
121};

cache_state; cache块的状态
nr_queued; /* jobs in pending queue */ 等待工作个数
lru_prev, lru_next; 按LRU排序,指向前一个和后一个,注意这里是下标
dbn; /* Sector number of the cached block */ 对应磁盘的扇区
checksum; 校验
struct pending_job *head; 等待工作
第二个数据结构:
[cpp] view plain copy print ?
  1. 123struct cache_set {
  2. 124 u_int32_t set_fifo_next;
  3. 125 u_int32_t set_clean_next;
  4. 126 u_int32_t clean_inprog;
  5. 127 u_int32_t nr_dirty;
  6. 128 u_int16_t lru_head, lru_tail;
  7. 129};
123struct cache_set {
124	u_int32_t		set_fifo_next;
125	u_int32_t		set_clean_next;
126	u_int32_t		clean_inprog;
127	u_int32_t		nr_dirty;
128	u_int16_t		lru_head, lru_tail;
129};

第三个数据结构:
[cpp] view plain copy print ?
  1. 344/*
  2. 345 * We have one of these for *every* cache metadata sector, to keep track
  3. 346 * of metadata ios in progress for blocks covered in this sector. Only
  4. 347 * one metadata IO per sector can be in progress at any given point in
  5. 348 * time
  6. 349 */
  7. 350struct cache_md_sector_head {
  8. 351 u_int32_t nr_in_prog;
  9. 352 struct kcached_job *pending_jobs, *md_io_inprog;
  10. 353};
344/* 
345 * We have one of these for *every* cache metadata sector, to keep track
346 * of metadata ios in progress for blocks covered in this sector. Only
347 * one metadata IO per sector can be in progress at any given point in 
348 * time
349 */
350struct cache_md_sector_head {
351	u_int32_t		nr_in_prog;
352	struct kcached_job	*pending_jobs, *md_io_inprog;
353};
看注释,每一个cache metadata扇区对应一个struct cache_md_sector_head结构,用以追踪这个扇区上的IO,这个扇区的IO来自该扇区对应的每一个cache块的状态变化。每一次只允许一个IO在下发。在初始化时,nr_in_prog为0,两个队列也都为空。其中的一个cache块发生变化并且状态要更新到SSD中,这时创建一个job并挂入到pending_jobs,下发时将nr_in_prog置为1,并将job从pending_jobs移到md_io_inprog,如果job下发过程中又有其他job下发,就挂到pending_jobs,等md_io_inprog处理完成再继续下一次下发过程。
到这里,我们把flashcache重要的数据结构都过了一遍。下一节开始介绍flashcache的数据流。

你可能感兴趣的:(linux内核源码阅读之facebook硬盘加速flashcache之三)