linux内核源码阅读之facebook硬盘加速flashcache之八

前面我们的分析中重点关注正常的数据流程,这一小节关注如果有异常,那么流程是怎么走完的呢?
1)创建新任务时kcached_job申请不到
2)读写命中时cache块为忙
3)系统关机时处理,系统开机时处理,系统异常掉电后的处理
首先来看第1种情况。这里实际跟踪的是找不到可用cache块的路径:对cache块的查找是在函数flashcache_lookup中完成的(该函数本身并不申请kcached_job),
[cpp] view plain copy print ?
  1. 543/*
  2. 544 * dbn is the starting sector, io_size is the number of sectors.
  3. 545 */
  4. 546static int
  5. 547flashcache_lookup(struct cache_c *dmc, struct bio *bio, int *index)
  6. 548{
  7. 549 sector_t dbn = bio->bi_sector;
  8. 550#if DMC_DEBUG
  9. 551 int io_size = to_sector(bio->bi_size);
  10. 552#endif
  11. 553 unsigned long set_number = hash_block(dmc, dbn);
  12. 554 int invalid, oldest_clean = -1;
  13. 555 int start_index;
  14. 556
  15. 557 start_index = dmc->assoc * set_number;
  16. 558 DPRINTK("Cache lookup : dbn %llu(%lu), set = %d",
  17. 559 dbn, io_size, set_number);
  18. 560 find_valid_dbn(dmc, dbn, start_index, index);
  19. 561 if (*index > 0) {
  20. 562 DPRINTK("Cache lookup HIT: Block %llu(%lu): VALID index %d",
  21. 563 dbn, io_size, *index);
  22. 564 /* We found the exact range of blocks we are looking for */
  23. 565 return VALID;
  24. 566 }
  25. 567 invalid = find_invalid_dbn(dmc, start_index);
  26. 568 if (invalid == -1) {
  27. 569 /* We didn't find an invalid entry, search for oldest valid entry */
  28. 570 find_reclaim_dbn(dmc, start_index, &oldest_clean);
  29. 571 }
  30. 572 /*
  31. 573 * Cache miss :
  32. 574 * We can't choose an entry marked INPROG, but choose the oldest
  33. 575 * INVALID or the oldest VALID entry.
  34. 576 */
  35. 577 *index = start_index + dmc->assoc;
  36. 578 if (invalid != -1) {
  37. 579 DPRINTK("Cache lookup MISS (INVALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
  38. 580 dbn, io_size, set_number, invalid, start_index);
  39. 581 *index = invalid;
  40. 582 } else if (oldest_clean != -1) {
  41. 583 DPRINTK("Cache lookup MISS (VALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
  42. 584 dbn, io_size, set_number, oldest_clean, start_index);
  43. 585 *index = oldest_clean;
  44. 586 } else {
  45. 587 DPRINTK_LITE("Cache read lookup MISS (NOROOM): dbn %llu(%lu), set = %d",
  46. 588 dbn, io_size, set_number);
  47. 589 }
  48. 590 if (*index < (start_index + dmc->assoc))
  49. 591 return INVALID;
  50. 592 else {
  51. 593 dmc->noroom++;
  52. 594 return -1;
  53. 595 }
  54. 596}
543/* 
544 * dbn is the starting sector, io_size is the number of sectors.
545 */
546static int 
547flashcache_lookup(struct cache_c *dmc, struct bio *bio, int *index)
548{
549     sector_t dbn = bio->bi_sector;
550#if DMC_DEBUG
551     int io_size = to_sector(bio->bi_size);
552#endif
553     unsigned long set_number = hash_block(dmc, dbn);
554     int invalid, oldest_clean = -1;
555     int start_index;
556
557     start_index = dmc->assoc * set_number;
558     DPRINTK("Cache lookup : dbn %llu(%lu), set = %d",
559          dbn, io_size, set_number);
560     find_valid_dbn(dmc, dbn, start_index, index);
561     if (*index > 0) {
562          DPRINTK("Cache lookup HIT: Block %llu(%lu): VALID index %d",
563                    dbn, io_size, *index);
564          /* We found the exact range of blocks we are looking for */
565          return VALID;
566     }
567     invalid = find_invalid_dbn(dmc, start_index);
568     if (invalid == -1) {
569          /* We didn't find an invalid entry, search for oldest valid entry */
570          find_reclaim_dbn(dmc, start_index, &oldest_clean);
571     }
572     /* 
573     * Cache miss :
574     * We can't choose an entry marked INPROG, but choose the oldest
575     * INVALID or the oldest VALID entry.
576     */
577     *index = start_index + dmc->assoc;
578     if (invalid != -1) {
579          DPRINTK("Cache lookup MISS (INVALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
580                    dbn, io_size, set_number, invalid, start_index);
581          *index = invalid;
582     } else if (oldest_clean != -1) {
583          DPRINTK("Cache lookup MISS (VALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
584                    dbn, io_size, set_number, oldest_clean, start_index);
585          *index = oldest_clean;
586     } else {
587          DPRINTK_LITE("Cache read lookup MISS (NOROOM): dbn %llu(%lu), set = %d",
588               dbn, io_size, set_number);
589     }
590     if (*index < (start_index + dmc->assoc))
591          return INVALID;
592     else {
593          dmc->noroom++;
594          return -1;
595     }
596}

直接看返回值有三种情况:valid, invalid, -1
valid 命中,invalid 找到空闲块,-1 没有可用cache块,为什么会没有可用cache块呢?逐一来看代码。
553行,hash_block返回当前dbn所在的集合下标。
557行,start_index为当前集合第1个cache块下标。
560行,在当前集合里查找dbn是否命中,如果命中index返回cache块下标,否则置index=-1。
561行,命中,返回VALID,cache块下标通过index参数带回。
567行,在当前集合里查找一个可用的cache块,找到返回cache块下标,否则返回-1。
568行,找不到可用的cache块。继续查看可否回收一个cache块。
578行,找到可用cache块。
582行,回收了一个cache块。
590行,不管是找到还是回收的,反正已经有cache块了,返回invalid
594行,没有cache块可用。
回到flashcache_read函数,1234行返回的值为-1,接着到1252行将相交的cache块置为无效。如果说设置无效也失败的话,那这个请求就不能下发了,因为下发到磁盘之后,后面缓存中cache块往磁盘回写,这块数据就被覆盖了。所以就来到了1255行直接返回-EIO错误。
将相交cache块设置无效之后,来到1264行,尝试先刷一些脏cache块。最后到1267行将数据直接下发到磁盘。
直接下发到磁盘的回调函数是flashcache_uncached_io_callback
[cpp] view plain copy print ?
  1. 1864static void
  2. 1865flashcache_uncached_io_callback(unsigned long error, void *context)
  3. 1866{
  4. 1867 struct kcached_job *job = (struct kcached_job *) context;
  5. 1868
  6. 1869 VERIFY(job->index == -1);
  7. 1870 push_uncached_io_complete(job);
  8. 1871 schedule_work(&_kcached_wq);
  9. 1872}
1864static void 
1865flashcache_uncached_io_callback(unsigned long error, void *context)
1866{
1867     struct kcached_job *job = (struct kcached_job *) context;
1868
1869     VERIFY(job->index == -1);
1870     push_uncached_io_complete(job);
1871     schedule_work(&_kcached_wq);
1872}

_kcached_wq调用到函数flashcache_uncached_io_complete,
[cpp] view plain copy print ?
  1. 1805/*
  2. 1806 * We handle uncached IOs ourselves to deal with the problem of out of ordered
  3. 1807 * IOs corrupting the cache. Consider the case where we get 2 concurent IOs
  4. 1808 * for the same block Write-Read (or a Write-Write). Consider the case where
  5. 1809 * the first Write is uncacheable and the second IO is cacheable. If the
  6. 1810 * 2 IOs are out-of-ordered below flashcache, then we will cache inconsistent
  7. 1811 * data in flashcache (persistently).
  8. 1812 *
  9. 1813 * We do invalidations before launching uncacheable IOs to disk. But in case
  10. 1814 * of out of ordering the invalidations before launching the IOs does not help.
  11. 1815 * We need to invalidate after the IO completes.
  12. 1816 *
  13. 1817 * Doing invalidations after the completion of an uncacheable IO will cause
  14. 1818 * any overlapping dirty blocks in the cache to be written out and the IO
  15. 1819 * relaunched. If the overlapping blocks are busy, the IO is relaunched to
  16. 1820 * disk also (post invalidation). In these 2 cases, we will end up sending
  17. 1821 * 2 disk IOs for the block. But this is a rare case.
  18. 1822 *
  19. 1823 * When 2 IOs for the same block are sent down (by un co-operating processes)
  20. 1824 * the storage stack is allowed to re-order the IOs at will. So the applications
  21. 1825 * cannot expect any ordering at all.
  22. 1826 *
  23. 1827 * What we try to avoid here is inconsistencies between disk and the ssd cache.
  24. 1828 */
1805/*
1806 * We handle uncached IOs ourselves to deal with the problem of out of ordered
1807 * IOs corrupting the cache. Consider the case where we get 2 concurent IOs
1808 * for the same block Write-Read (or a Write-Write). Consider the case where
1809 * the first Write is uncacheable and the second IO is cacheable. If the 
1810 * 2 IOs are out-of-ordered below flashcache, then we will cache inconsistent
1811 * data in flashcache (persistently).
1812 * 
1813 * We do invalidations before launching uncacheable IOs to disk. But in case
1814 * of out of ordering the invalidations before launching the IOs does not help.
1815 * We need to invalidate after the IO completes.
1816 * 
1817 * Doing invalidations after the completion of an uncacheable IO will cause 
1818 * any overlapping dirty blocks in the cache to be written out and the IO 
1819 * relaunched. If the overlapping blocks are busy, the IO is relaunched to 
1820 * disk also (post invalidation). In these 2 cases, we will end up sending
1821 * 2 disk IOs for the block. But this is a rare case.
1822 * 
1823 * When 2 IOs for the same block are sent down (by un co-operating processes)
1824 * the storage stack is allowed to re-order the IOs at will. So the applications
1825 * cannot expect any ordering at all.
1826 * 
1827 * What we try to avoid here is inconsistencies between disk and the ssd cache.
1828 */

首先看注释。uncached的IO由于其返回顺序无法预测,可能引起cache数据错误。例如,有2个对同一个块的并发IO,一个是写,另一个是读。写IO直接下发到磁盘,读IO找到可用的cache块,读先回来而写后回来,读回来时将cache块设置为读出的数据,而实际上这个时候该数据已经不是最新的了。
在下发uncached IO时已经将相关cache块设置为invalid。但是如果出现了前面讲的乱序IO,下发之前做的invalid是没有用的。因此还需要在IO结束的时候再次invalid cache块。
做invalid cache块将可能触发脏块写回磁盘,然后重发这个IO。如果这个要写的脏块忙,需要等到空闲再发起,然后再次启动这个uncached IO。在这两种情况下,都要发起两次IO,一次是写脏块,一次是重新发uncached IO。
两个IO同时操作同一块数据时(由不相关进程下发),是有可能按任意次序完成的,所以上层应用不能预测其次序。
我们所能做的只是保持缓存和磁盘的一致。
其实上面说了半天,就是用来解释为什么在uncached IO结束的时候也要调用一下1844行的flashcache_inval_blocks。
[cpp] view plain copy print ?
  1. 1829void
  2. 1830flashcache_uncached_io_complete(struct kcached_job *job)
  3. 1831{
  4. 1832 struct cache_c *dmc = job->dmc;
  5. 1833 unsigned long flags;
  6. 1834 int queued;
  7. 1835 int error = job->error;
  8. 1836
  9. 1837 if (unlikely(error)) {
  10. 1838 if (bio_data_dir(job->bio) == WRITE)
  11. 1839 dmc->disk_write_errors++;
  12. 1840 else
  13. 1841 dmc->disk_read_errors++;
  14. 1842 }
  15. 1843 spin_lock_irqsave(&dmc->cache_spin_lock, flags);
  16. 1844 queued = flashcache_inval_blocks(dmc, job->bio);
  17. 1845 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
  18. 1846 if (queued) {
  19. 1847 if (unlikely(queued < 0))
  20. 1848 flashcache_bio_endio(job->bio, -EIO);
  21. 1849 /*
  22. 1850 * The IO will be re-executed.
  23. 1851 * The do_pending logic will re-launch the
  24. 1852 * disk IO post-invalidation calling start_uncached_io.
  25. 1853 * This should be a rare occurrence though.
  26. 1854 * XXX - We should track this.
  27. 1855 */
  28. 1856 } else {
  29. 1857 flashcache_bio_endio(job->bio, error);
  30. 1858 }
  31. 1859 flashcache_free_cache_job(job);
  32. 1860 if (atomic_dec_and_test(&dmc->nr_jobs))
  33. 1861 wake_up(&dmc->destroyq);
  34. 1862}
1829void 
1830flashcache_uncached_io_complete(struct kcached_job *job)
1831{
1832     struct cache_c *dmc = job->dmc;
1833     unsigned long flags;
1834     int queued;
1835     int error = job->error;
1836
1837     if (unlikely(error)) {
1838          if (bio_data_dir(job->bio) == WRITE)
1839               dmc->disk_write_errors++;
1840          else
1841               dmc->disk_read_errors++;
1842     }
1843     spin_lock_irqsave(&dmc->cache_spin_lock, flags);
1844     queued = flashcache_inval_blocks(dmc, job->bio);
1845     spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
1846     if (queued) {
1847          if (unlikely(queued < 0))
1848               flashcache_bio_endio(job->bio, -EIO);
1849          /* 
1850          * The IO will be re-executed.
1851          * The do_pending logic will re-launch the 
1852          * disk IO post-invalidation calling start_uncached_io.
1853          * This should be a rare occurrence though.
1854          * XXX - We should track this.
1855          */
1856     } else {
1857          flashcache_bio_endio(job->bio, error);
1858     }
1859     flashcache_free_cache_job(job);
1860     if (atomic_dec_and_test(&dmc->nr_jobs))
1861          wake_up(&dmc->destroyq);
1862}

1837行,uncached IO失败,做下统计。
1846行,看简简单单就一个queued判断,其背后的哲学可真不小。从1847行我们看出queued不仅有可能是小于0的,还有可能是大于0的,等于0的情况下最简单:到1857行返回IO。
先看queued小于0的情况,小于0表示申请pending_job失败,到1848行,返回失败,但这时候悲剧就出现了,就像注释里描述的一样,磁盘的数据是uncached IO写回的数据,但缓存里却是另一份数据。
如果queued大于0,实际上在这个函数里就什么都没做,到1859行释放kcached_job。
但这只是表面现象,就好像看到别人成功很容易,却不曾知道别人在背后下了多少苦功。queued大于0在这里没有做什么事情。但在背后默默努力工作着。所以现实中你看到的都是片面的,你听到的都是不可靠的,就连孔老夫子也曾经感慨说,自己亲眼看到的事情都不一定是事情的真相。所以有一句话叫做谣言止于智者,缺少思考的人只会成为他人利用的对象。所以下一次再看到一篇没有证实的微博、一段评论、一则小道消息时,如果对他人会产生伤害就不要再随意转发了。
为了追踪到queued何时返回大于0,我们跟到flashcache_inval_blocks,再继续跟到flashcache_inval_block_set:
[cpp] view plain copy print ?
  1. 1288/*
  2. 1289 * Invalidate any colliding blocks if they are !BUSY and !DIRTY. If the colliding
  3. 1290 * block is DIRTY, we need to kick off a write. In both cases, we need to wait
  4. 1291 * until the underlying IO is finished, and then proceed with the invalidation.
  5. 1292 */
  6. 1293static int
  7. 1294flashcache_inval_block_set(struct cache_c *dmc, int set, struct bio *bio, int rw,
  8. 1295 struct pending_job *pjob)
  9. 1296{
  10. 1297 sector_t io_start = bio->bi_sector;
  11. 1298 sector_t io_end = bio->bi_sector + (to_sector(bio->bi_size) - 1);
  12. 1299 int start_index, end_index, i;
  13. 1300 struct cacheblock *cacheblk;
  14. 1301
  15. 1302 start_index = dmc->assoc * set;
  16. 1303 end_index = start_index + dmc->assoc;
  17. 1304 for (i = start_index ; i < end_index ; i++) {
  18. 1305 sector_t start_dbn = dmc->cache[i].dbn;
  19. 1306 sector_t end_dbn = start_dbn + dmc->block_size;
  20. 1307
  21. 1308 cacheblk = &dmc->cache[i];
  22. 1309 if (cacheblk->cache_state & INVALID)
  23. 1310 continue;
  24. 1311 if ((io_start >= start_dbn && io_start < end_dbn) ||
  25. 1312 (io_end >= start_dbn && io_end < end_dbn)) {
  26. 1313 /* We have a match */
  27. 1314 if (rw == WRITE)
  28. 1315 dmc->wr_invalidates++;
  29. 1316 else
  30. 1317 dmc->rd_invalidates++;
  31. 1318 if (!(cacheblk->cache_state & (BLOCK_IO_INPROG | DIRTY)) &&
  32. 1319 (cacheblk->head == NULL)) {
  33. 1320 dmc->cached_blocks--;
  34. 1321 DPRINTK("Cache invalidate (!BUSY): Block %llu %lx",
  35. 1322 start_dbn, cacheblk->cache_state);
  36. 1323 cacheblk->cache_state = INVALID;
  37. 1324 continue;
  38. 1325 }
  39. 1326 /*
  40. 1327 * The conflicting block has either IO in progress or is
  41. 1328 * Dirty. In all cases, we need to add ourselves to the
  42. 1329 * pending queue. Then if the block is dirty, we kick off
  43. 1330 * an IO to clean the block.
  44. 1331 * Note that if the block is dirty and IO is in progress
  45. 1332 * on it, the do_pending handler will clean the block
  46. 1333 * and then process the pending queue.
  47. 1334 */
  48. 1335 flashcache_enq_pending(dmc, bio, i, INVALIDATE, pjob);
  49. 1336 if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
  50. 1337 /*
  51. 1338 * Kick off block write.
  52. 1339 * We can't kick off the write under the spinlock.
  53. 1340 * Instead, we mark the slot DISKWRITEINPROG, drop
  54. 1341 * the spinlock and kick off the write. A block marked
  55. 1342 * DISKWRITEINPROG cannot change underneath us.
  56. 1343 * to enqueue ourselves onto it's pending queue.
  57. 1344 *
  58. 1345 * XXX - The dropping of the lock here can be avoided if
  59. 1346 * we punt the cleaning of the block to the worker thread,
  60. 1347 * at the cost of a context switch.
  61. 1348 */
  62. 1349 cacheblk->cache_state |= DISKWRITEINPROG;
  63. 1350 spin_unlock_irq(&dmc->cache_spin_lock);
  64. 1351 flashcache_dirty_writeback(dmc, i); /* Must inc nr_jobs */
  65. 1352 spin_lock_irq(&dmc->cache_spin_lock);
  66. 1353 }
  67. 1354 return 1;
  68. 1355 }
  69. 1356 }
  70. 1357 return 0;
  71. 1358}
1288/*
1289 * Invalidate any colliding blocks if they are !BUSY and !DIRTY. If the colliding
1290 * block is DIRTY, we need to kick off a write. In both cases, we need to wait 
1291 * until the underlying IO is finished, and then proceed with the invalidation.
1292 */
1293static int
1294flashcache_inval_block_set(struct cache_c *dmc, int set, struct bio *bio, int rw,
1295                  struct pending_job *pjob)
1296{
1297     sector_t io_start = bio->bi_sector;
1298     sector_t io_end = bio->bi_sector + (to_sector(bio->bi_size) - 1);
1299     int start_index, end_index, i;
1300     struct cacheblock *cacheblk;
1301     
1302     start_index = dmc->assoc * set;
1303     end_index = start_index + dmc->assoc;
1304     for (i = start_index ; i < end_index ; i++) {
1305          sector_t start_dbn = dmc->cache[i].dbn;
1306          sector_t end_dbn = start_dbn + dmc->block_size;
1307          
1308          cacheblk = &dmc->cache[i];
1309          if (cacheblk->cache_state & INVALID)
1310               continue;
1311          if ((io_start >= start_dbn && io_start < end_dbn) ||
1312              (io_end >= start_dbn && io_end < end_dbn)) {
1313               /* We have a match */
1314               if (rw == WRITE)
1315                    dmc->wr_invalidates++;
1316               else
1317                    dmc->rd_invalidates++;
1318               if (!(cacheblk->cache_state & (BLOCK_IO_INPROG | DIRTY)) &&
1319                   (cacheblk->head == NULL)) {
1320                    dmc->cached_blocks--;               
1321                    DPRINTK("Cache invalidate (!BUSY): Block %llu %lx",
1322                         start_dbn, cacheblk->cache_state);
1323                    cacheblk->cache_state = INVALID;
1324                    continue;
1325               }
1326               /*
1327               * The conflicting block has either IO in progress or is 
1328               * Dirty. In all cases, we need to add ourselves to the 
1329               * pending queue. Then if the block is dirty, we kick off
1330               * an IO to clean the block. 
1331               * Note that if the block is dirty and IO is in progress
1332               * on it, the do_pending handler will clean the block
1333               * and then process the pending queue.
1334               */
1335               flashcache_enq_pending(dmc, bio, i, INVALIDATE, pjob);
1336               if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
1337                    /* 
1338                    * Kick off block write.
1339                    * We can't kick off the write under the spinlock.
1340                    * Instead, we mark the slot DISKWRITEINPROG, drop 
1341                    * the spinlock and kick off the write. A block marked
1342                    * DISKWRITEINPROG cannot change underneath us. 
1343                    * to enqueue ourselves onto it's pending queue.
1344                    *
1345                    * XXX - The dropping of the lock here can be avoided if
1346                    * we punt the cleaning of the block to the worker thread,
1347                    * at the cost of a context switch.
1348                    */
1349                    cacheblk->cache_state |= DISKWRITEINPROG;
1350                    spin_unlock_irq(&dmc->cache_spin_lock);
1351                    flashcache_dirty_writeback(dmc, i); /* Must inc nr_jobs */
1352                    spin_lock_irq(&dmc->cache_spin_lock);
1353               }
1354               return 1;
1355          }
1356     }
1357     return 0;
1358}

我们直接找返回大于0的地方就在1354行,再继续往回找是1311行if里面,这个if语句就表示bio跟cache块有交集。
1318行,如果cache块不为脏且不忙的话直接设置invalid,并continue。
接着看1327行注释,冲突块可能是忙或者脏,在这两种情况下,都需要加入pending队列。如果只是脏,立即触发一次写回磁盘。如果同时是脏和忙,那么do_pending处理函数会先将脏块写回然后再继续处理。作者真是费了苦心来写这一大堆注释,但如果没有这些注释,后面在do_pending的处理也确实不大好看懂。
到这里故事还没有结束,因为在1335行插入了一个pending_job,那么这个任务什么时候执行呢?
在flashcache_md_write_done里会看到调用到flashcache_do_pending,
[cpp] view plain copy print ?
  1. 359void
  2. 360flashcache_do_pending(struct kcached_job *job)
  3. 361{
  4. 362 if (job->error)
  5. 363 flashcache_do_pending_error(job);
  6. 364 else
  7. 365 flashcache_do_pending_noerror(job);
  8. 366}
359void
360flashcache_do_pending(struct kcached_job *job)
361{
362     if (job->error)
363          flashcache_do_pending_error(job);
364     else
365          flashcache_do_pending_noerror(job);
366}

362行,IO返回错误,跟进去看看错误处理
[cpp] view plain copy print ?
  1. 262/*
  2. 263 * Common error handling for everything.
  3. 264 * 1) If the block isn't dirty, invalidate it.
  4. 265 * 2) Error all pending IOs that totally or partly overlap this block.
  5. 266 * 3) Free the job.
  6. 267 */
  7. 268static void
  8. 269flashcache_do_pending_error(struct kcached_job *job)
  9. 270{
  10. 271 struct cache_c *dmc = job->dmc;
  11. 272 unsigned long flags;
  12. 273 struct cacheblock *cacheblk = &dmc->cache[job->index];
  13. 274
  14. 275 DMERR("flashcache_do_pending_error: error %d block %lu action %d",
  15. 276 -job->error, job->disk.sector, job->action);
  16. 277 spin_lock_irqsave(&dmc->cache_spin_lock, flags);
  17. 278 VERIFY(cacheblk->cache_state & VALID);
  18. 279 /* Invalidate block if possible */
  19. 280 if ((cacheblk->cache_state & DIRTY) == 0) {
  20. 281 dmc->cached_blocks--;
  21. 282 dmc->pending_inval++;
  22. 283 cacheblk->cache_state &= ~VALID;
  23. 284 cacheblk->cache_state |= INVALID;
  24. 285 }
  25. 286 flashcache_free_pending_jobs(dmc, cacheblk, job->error);
  26. 287 cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
  27. 288 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
  28. 289 flashcache_free_cache_job(job);
  29. 290 if (atomic_dec_and_test(&dmc->nr_jobs))
  30. 291 wake_up(&dmc->destroyq);
  31. 292}
262/* 
263 * Common error handling for everything.
264 * 1) If the block isn't dirty, invalidate it.
265 * 2) Error all pending IOs that totally or partly overlap this block.
266 * 3) Free the job.
267 */
268static void
269flashcache_do_pending_error(struct kcached_job *job)
270{
271     struct cache_c *dmc = job->dmc;
272     unsigned long flags;
273     struct cacheblock *cacheblk = &dmc->cache[job->index];
274
275     DMERR("flashcache_do_pending_error: error %d block %lu action %d", 
276           -job->error, job->disk.sector, job->action);
277     spin_lock_irqsave(&dmc->cache_spin_lock, flags);
278     VERIFY(cacheblk->cache_state & VALID);
279     /* Invalidate block if possible */
280     if ((cacheblk->cache_state & DIRTY) == 0) {
281          dmc->cached_blocks--;
282          dmc->pending_inval++;
283          cacheblk->cache_state &= ~VALID;
284          cacheblk->cache_state |= INVALID;
285     }
286     flashcache_free_pending_jobs(dmc, cacheblk, job->error);
287     cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
288     spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
289     flashcache_free_cache_job(job);
290     if (atomic_dec_and_test(&dmc->nr_jobs))
291          wake_up(&dmc->destroyq);
292}

如果cache块不为脏,则直接设置为invalid。将pending IO都返回错误,释放kcached_job。
但我们更关心flashcache_do_pending_noerror
[cpp] view plain copy print ?
  1. 294static void
  2. 295flashcache_do_pending_noerror(struct kcached_job *job)
  3. 296{
  4. 297 struct cache_c *dmc = job->dmc;
  5. 298 int index = job->index;
  6. 299 unsigned long flags;
  7. 300 struct pending_job *pending_job;
  8. 301 int queued;
  9. 302 struct cacheblock *cacheblk = &dmc->cache[index];
  10. 303
  11. 304 spin_lock_irqsave(&dmc->cache_spin_lock, flags);
  12. 305 if (cacheblk->cache_state & DIRTY) {
  13. 306 cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
  14. 307 cacheblk->cache_state |= DISKWRITEINPROG;
  15. 308 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
  16. 309 flashcache_dirty_writeback(dmc, index);
  17. 310 goto out;
  18. 311 }
  19. 312 DPRINTK("flashcache_do_pending: Index %d %lx",
  20. 313 index, cacheblk->cache_state);
  21. 314 VERIFY(cacheblk->cache_state & VALID);
  22. 315 dmc->cached_blocks--;
  23. 316 dmc->pending_inval++;
  24. 317 cacheblk->cache_state &= ~VALID;
  25. 318 cacheblk->cache_state |= INVALID;
  26. 319 while (cacheblk->head) {
  27. 320 VERIFY(!(cacheblk->cache_state & DIRTY));
  28. 321 pending_job = cacheblk->head;
  29. 322 cacheblk->head = pending_job->next;
  30. 323 VERIFY(cacheblk->nr_queued > 0);
  31. 324 cacheblk->nr_queued--;
  32. 325 if (pending_job->action == INVALIDATE) {
  33. 326 DPRINTK("flashcache_do_pending: INVALIDATE %llu",
  34. 327 next_job->bio->bi_sector);
  35. 328 VERIFY(pending_job->bio != NULL);
  36. 329 queued = flashcache_inval_blocks(dmc, pending_job->bio);
  37. 330 if (queued) {
  38. 331 if (unlikely(queued < 0)) {
  39. 332 /*
  40. 333 * Memory allocation failure inside inval_blocks.
  41. 334 * Fail this io.
  42. 335 */
  43. 336 flashcache_bio_endio(pending_job->bio, -EIO);
  44. 337 }
  45. 338 flashcache_free_pending_job(pending_job);
  46. 339 continue;
  47. 340 }
  48. 341 }
  49. 342 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
  50. 343 DPRINTK("flashcache_do_pending: Sending down IO %llu",
  51. 344 pending_job->bio->bi_sector);
  52. 345 /* Start uncached IO */
  53. 346 flashcache_start_uncached_io(dmc, pending_job->bio);
  54. 347 flashcache_free_pending_job(pending_job);
  55. 348 spin_lock_irqsave(&dmc->cache_spin_lock, flags);
  56. 349 }
  57. 350 VERIFY(cacheblk->nr_queued == 0);
  58. 351 cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
  59. 352 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
  60. 353out:
  61. 354 flashcache_free_cache_job(job);
  62. 355 if (atomic_dec_and_test(&dmc->nr_jobs))
  63. 356 wake_up(&dmc->destroyq);
  64. 357}
294static void
295flashcache_do_pending_noerror(struct kcached_job *job)
296{
297     struct cache_c *dmc = job->dmc;
298     int index = job->index;
299     unsigned long flags;
300     struct pending_job *pending_job;
301     int queued;
302     struct cacheblock *cacheblk = &dmc->cache[index];
303
304     spin_lock_irqsave(&dmc->cache_spin_lock, flags);
305     if (cacheblk->cache_state & DIRTY) {
306          cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
307          cacheblk->cache_state |= DISKWRITEINPROG;
308          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
309          flashcache_dirty_writeback(dmc, index);
310          goto out;
311     }
312     DPRINTK("flashcache_do_pending: Index %d %lx",
313          index, cacheblk->cache_state);
314     VERIFY(cacheblk->cache_state & VALID);
315     dmc->cached_blocks--;
316     dmc->pending_inval++;
317     cacheblk->cache_state &= ~VALID;
318     cacheblk->cache_state |= INVALID;
319     while (cacheblk->head) {
320          VERIFY(!(cacheblk->cache_state & DIRTY));
321          pending_job = cacheblk->head;
322          cacheblk->head = pending_job->next;
323          VERIFY(cacheblk->nr_queued > 0);
324          cacheblk->nr_queued--;
325          if (pending_job->action == INVALIDATE) {
326               DPRINTK("flashcache_do_pending: INVALIDATE  %llu",
327                    next_job->bio->bi_sector);
328               VERIFY(pending_job->bio != NULL);
329               queued = flashcache_inval_blocks(dmc, pending_job->bio);
330               if (queued) {
331                    if (unlikely(queued < 0)) {
332                         /*
333                         * Memory allocation failure inside inval_blocks.
334                         * Fail this io.
335                         */
336                         flashcache_bio_endio(pending_job->bio, -EIO);
337                    }
338                    flashcache_free_pending_job(pending_job);
339                    continue;
340               }
341          }
342          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
343          DPRINTK("flashcache_do_pending: Sending down IO %llu",
344               pending_job->bio->bi_sector);
345          /* Start uncached IO */
346          flashcache_start_uncached_io(dmc, pending_job->bio);
347          flashcache_free_pending_job(pending_job);
348          spin_lock_irqsave(&dmc->cache_spin_lock, flags);
349     }
350     VERIFY(cacheblk->nr_queued == 0);
351     cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
352     spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
353out:
354     flashcache_free_cache_job(job);
355     if (atomic_dec_and_test(&dmc->nr_jobs))
356          wake_up(&dmc->destroyq);
357}

这个函数分为两部分,第一部分是如果cache块为脏,则下发脏块写回后立即返回,等待第二次调用,第二次调用才真正处理pending_job,记得我们在上文中插入的pending_job是INVALIDATE的,那么这里也正如前面注释里所说下发了两次IO,一次是写回脏块,后一次是下发uncached IO。
319行,循环取出挂在cache块上的pending_job(这里是action为INVALIDATE的那个)
320行,确认非dirty,因为第一次调用的时候已经写回了
325行,if语句成立,
329行,因为cache块已写回,就不脏不忙了,flashcache_inval_blocks只要设置invalid就可以返回成功
346行,下发uncached IO
至此uncached IO之旅告一个段落了。
接下来讲第2种情况读写命中但cache块忙的情况下是怎么处理的。
读写IO在cache块忙的情况下做出的表现是惊人的一致,那就是创建pending_job并挂入cache块队列中。这对我们来说已经是轻车熟路,不过我们这一次要跟踪的是读写IO的情况。经过前面的分析我们知道,pending_job是在flashcache_do_pending_noerror函数中处理的。同样如果为脏块要刷一次脏块,第二次进入到319行循环,由于Action为READCACHE或者WRITECACHE,直接到346行下发uncached IO。第2种情况的处理也就宣告结束了。似乎显得仓促,现实就是这样的,永远别想像电影里那样大起大落,只要你内心够从容,平平淡淡才是真。
第3种情况就留给大家自己分析,如果对这几个小节都已经熟悉,那就已经是小case了。
至此,flashcache源码的分析也就结束了,我也非常高兴能够坚持写完,因为这确实是一个非常耗时间的过程。如果你阅读之后能有所收获,那将是我最大的欢喜了。

你可能感兴趣的:(linux内核源码阅读之facebook硬盘加速flashcache之八)