Reading the Linux kernel source: Facebook's disk-acceleration tool flashcache, part 1

I have never written a source-code walkthrough before, and the urge to do one has kept growing. My prose is nothing special, but I have resolved to sit down and write one properly this time.
For the source download, see my previous post "flashcache之我见": http://blog.csdn.net/liumangxiong/article/details/11643473
The code below corresponds to the 1.0 release under the repository's tags.

Reading a kernel module starts at its entry point: open flashcache_init. At barely a hundred lines, it is nothing to fear.
int __init
flashcache_init(void)
{
	int r;

	r = flashcache_jobs_init();
	if (r)
		return r;
	atomic_set(&nr_cache_jobs, 0);
	atomic_set(&nr_pending_jobs, 0);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
	INIT_WORK(&_kcached_wq, do_work, NULL);
#else
	INIT_WORK(&_kcached_wq, do_work);
#endif
	for (r = 0 ; r < 33 ; r++)
		size_hist[r] = 0;
	r = dm_register_target(&flashcache_target);
	if (r < 0) {
		DMERR("cache: register failed %d", r);
	}
#ifdef CONFIG_PROC_FS
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	flashcache_table_header =
		register_sysctl_table(flashcache_root_table, 1);
#else
	flashcache_table_header =
		register_sysctl_table(flashcache_root_table);
#endif
	{
		struct proc_dir_entry *entry;

		entry = create_proc_entry("flashcache_stats", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_stats_operations;
		entry = create_proc_entry("flashcache_errors", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_errors_operations;
		entry = create_proc_entry("flashcache_iosize_hist", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_iosize_hist_operations;
		entry = create_proc_entry("flashcache_pidlists", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_pidlists_operations;
		entry = create_proc_entry("flashcache_version", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_version_operations;
	}
#endif
	flashcache_control = (struct flashcache_control_s *)
		kmalloc(sizeof(struct flashcache_control_s *), GFP_KERNEL);
	flashcache_control->synch_flags = 0;
	register_reboot_notifier(&flashcache_notifier);
	return r;
}

A first glance: flashcache_jobs_init() allocates the job memory structures; INIT_WORK sets up the work item; the proc calls are obviously creating files under /proc; after that a flashcache_control_s management structure is allocated, and finally a reboot notifier callback is registered.
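Everything this init registers needs a matching unregister at module exit. As a memory aid, here is a minimal sketch of what the paired flashcache_exit presumably looks like, reconstructed from the init sequence above rather than quoted from the tree (flashcache_jobs_exit is the assumed counterpart of flashcache_jobs_init):

static void __exit
flashcache_exit(void)
{
	/* Unwind flashcache_init in reverse order (a sketch; the real
	 * function may differ in ordering and error handling). */
	unregister_reboot_notifier(&flashcache_notifier);
	kfree(flashcache_control);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("flashcache_version", NULL);
	remove_proc_entry("flashcache_pidlists", NULL);
	remove_proc_entry("flashcache_iosize_hist", NULL);
	remove_proc_entry("flashcache_errors", NULL);
	remove_proc_entry("flashcache_stats", NULL);
	unregister_sysctl_table(flashcache_table_header);
#endif
	dm_unregister_target(&flashcache_target);
	flashcache_jobs_exit();	/* assumed: destroys the pools and caches */
}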
That, then, is the whirlwind tour of flashcache_init — hardly fair to whoever wrote the code.
Ask yourself again: what does flashcache actually do? Still a blank. So next let us dig into each function and find out.
static int
flashcache_jobs_init(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_job_cache = kmem_cache_create("kcached-jobs",
	                               sizeof(struct kcached_job),
	                               __alignof__(struct kcached_job),
	                               0, NULL, NULL);
#else
	_job_cache = kmem_cache_create("kcached-jobs",
	                               sizeof(struct kcached_job),
	                               __alignof__(struct kcached_job),
	                               0, NULL);
#endif
	if (!_job_cache)
		return -ENOMEM;

	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
	                           mempool_free_slab, _job_cache);
	if (!_job_pool) {
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_pending_job_cache = kmem_cache_create("pending-jobs",
					       sizeof(struct pending_job),
					       __alignof__(struct pending_job),
					       0, NULL, NULL);
#else
	_pending_job_cache = kmem_cache_create("pending-jobs",
					       sizeof(struct pending_job),
					       __alignof__(struct pending_job),
					       0, NULL);
#endif
	if (!_pending_job_cache) {
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	_pending_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
					   mempool_free_slab, _pending_job_cache);
	if (!_pending_job_pool) {
		kmem_cache_destroy(_pending_job_cache);
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	return 0;
}



First, flashcache_jobs_init(). It creates two kinds of job and two mempools; like twins they look identical, but they are not the same.
_job_pool => flashcache_alloc_cache_job => new_kcached_job. new_kcached_job has many callers: flashcache_dirty_writeback, flashcache_read_hit, flashcache_read_miss, flashcache_write_miss, flashcache_write_hit, flashcache_dirty_writeback_sync and flashcache_start_uncached_io. Look closely at those names: they are precisely the basic operations and transitions of a write cache — writeback, writethrough, hit and miss.
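Why pair a kmem_cache with a mempool at all? The mempool keeps a reserve of MIN_JOBS preallocated objects, so allocation on the IO path can always make progress even under memory pressure. A minimal sketch of how flashcache_alloc_cache_job presumably draws from _job_pool (the bookkeeping details are illustrative, not quoted from the tree):

static struct kcached_job *
flashcache_alloc_cache_job(void)
{
	struct kcached_job *job;

	/* GFP_NOIO: we are on the IO path, so the allocation must not
	 * recurse into the block layer; mempool_alloc falls back to its
	 * MIN_JOBS reserve when the slab allocator comes up empty. */
	job = mempool_alloc(_job_pool, GFP_NOIO);
	if (likely(job))
		atomic_inc(&nr_cache_jobs);
	return job;
}

static void
flashcache_free_cache_job(struct kcached_job *job)
{
	mempool_free(job, _job_pool);
	atomic_dec(&nr_cache_jobs);
}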
Now take flashcache_dirty_writeback as an example and see what role the kcached_job actually plays.
static void
flashcache_dirty_writeback(struct cache_c *dmc, int index)
{
	struct kcached_job *job;
	unsigned long flags;
	struct cacheblock *cacheblk = &dmc->cache[index];
	int device_removal = 0;

	DPRINTK("flashcache_dirty_writeback: Index %d", index);
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	VERIFY((cacheblk->cache_state & BLOCK_IO_INPROG) == DISKWRITEINPROG);
	VERIFY(cacheblk->cache_state & DIRTY);
	dmc->cache_sets[index / dmc->assoc].clean_inprog++;
	dmc->clean_inprog++;
	spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	job = new_kcached_job(dmc, NULL, index);
	if (unlikely(sysctl_flashcache_error_inject & DIRTY_WRITEBACK_JOB_ALLOC_FAIL)) {
		if (job)
			flashcache_free_cache_job(job);
		job = NULL;
		sysctl_flashcache_error_inject &= ~DIRTY_WRITEBACK_JOB_ALLOC_FAIL;
	}
	/*
	 * If the device is being (fast) removed, do not kick off any more cleanings.
	 */
	if (unlikely(atomic_read(&dmc->fast_remove_in_prog))) {
		DMERR("flashcache: Dirty Writeback (for set cleaning) aborted for device removal, block %lu",
		      cacheblk->dbn);
		if (job)
			flashcache_free_cache_job(job);
		job = NULL;
		device_removal = 1;
	}
	if (unlikely(job == NULL)) {
		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
		dmc->cache_sets[index / dmc->assoc].clean_inprog--;
		dmc->clean_inprog--;
		flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
		cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		if (device_removal == 0)
			DMERR("flashcache: Dirty Writeback (for set cleaning) failed ! Can't allocate memory, block %lu",
			      cacheblk->dbn);
	} else {
		job->bio = NULL;
		job->action = WRITEDISK;
		atomic_inc(&dmc->nr_jobs);
		dmc->ssd_reads++;
		dmc->disk_writes++;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
		kcopyd_copy(dmc->kcp_client, &job->cache, 1, &job->disk, 0,
			    flashcache_kcopyd_callback, job);
#else
		dm_kcopyd_copy(dmc->kcp_client, &job->cache, 1, &job->disk, 0,
			       (dm_kcopyd_notify_fn) flashcache_kcopyd_callback,
			       (void *)job);
#endif
	}
}

First, new_kcached_job allocates a kcached_job. Next comes the check of dmc->fast_remove_in_prog, the flag that marks the flashcache device as being removed: if the device is about to disappear, there is obviously no point issuing further commands. Then job is tested for NULL, and the else branch is where the real work happens. The single most important line is job->action = WRITEDISK; — one of the basic write-cache operations described earlier. The action field can be read as a state machine, with the following states:
/* kcached/pending job states */
#define READCACHE	1
#define WRITECACHE	2
#define READDISK	3
#define WRITEDISK	4
#define READFILL	5	/* Read Cache Miss Fill */
#define INVALIDATE	6
#define WRITEDISK_SYNC	7

The action set here is WRITEDISK — write to disk. Write from where? From the write cache. And where does the write-cache data live? The SSD serves as the write cache, so the copy goes from the SSD to the disk. Does that mean a lot of work — first read from the SSD, then write to the disk? Yes, but very little of it is ours, because the kernel has the famous kcopyd thread; we just hand the tedious work to kcopyd. The interface is:

int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
		   unsigned int num_dests, struct dm_io_region *dests,
		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)

The first parameter is the kcopyd_client, created in flashcache_ctr, the constructor run when a flashcache device is created — so every flashcache device has its own kcopyd_client. Why create this structure at all? Think of it simply as a handle for using the kcopyd service. The second parameter is the source; the third is the number of destinations; the fourth is the destinations to write; the fifth is extra flags, here all zero. The sixth, fn, is the completion callback: if it is set, the call is asynchronous and does not block, while passing NULL makes the call wait synchronously. The last parameter, context, is handed back to the callback — and here it is exactly the job we care most about.
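For concreteness, here is how job->cache (the source, on the SSD) and job->disk (the destination) are presumably filled in before the call. The dbn field is taken from the code quoted above; INDEX_TO_CACHE_SECTOR is a hypothetical helper and the sector arithmetic is illustrative:

/* Sketch: one source region on the SSD, one destination on the disk. */
job->cache.bdev   = dmc->cache_dev->bdev;		/* SSD */
job->cache.sector = INDEX_TO_CACHE_SECTOR(dmc, index);	/* hypothetical */
job->cache.count  = dmc->block_size;
job->disk.bdev    = dmc->disk_dev->bdev;		/* backing disk */
job->disk.sector  = dmc->cache[index].dbn;		/* disk block number */
job->disk.count   = dmc->block_size;

/* One source, one destination, no flags, asynchronous completion: */
dm_kcopyd_copy(dmc->kcp_client, &job->cache, 1, &job->disk, 0,
	       (dm_kcopyd_notify_fn) flashcache_kcopyd_callback, (void *)job);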
We have dispatched the kcached_job; now let us see when it comes back, what it does on return, and how it is finally destroyed.
The callback set in dm_kcopyd_copy is flashcache_kcopyd_callback.
static void
flashcache_kcopyd_callback(int read_err, unsigned int write_err, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;
	struct cache_c *dmc = job->dmc;
	int index = job->index;
	unsigned long flags;

	VERIFY(!in_interrupt());
	DPRINTK("kcopyd_callback: Index %d", index);
	VERIFY(job->bio == NULL);
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY));
	if (unlikely(sysctl_flashcache_error_inject & KCOPYD_CALLBACK_ERROR)) {
		read_err = -EIO;
		sysctl_flashcache_error_inject &= ~KCOPYD_CALLBACK_ERROR;
	}
	if (likely(read_err == 0 && write_err == 0)) {
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write(job);
	} else {
		/* Disk write failed. We can not purge this block from flash */
		DMERR("flashcache: Disk writeback failed ! read error %d write error %d block %lu",
		      -read_err, -write_err, job->disk.sector);
		VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
		VERIFY(dmc->clean_inprog > 0);
		dmc->cache_sets[index / dmc->assoc].clean_inprog--;
		dmc->clean_inprog--;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		/* Set the error in the job and let do_pending() handle the error */
		if (read_err) {
			dmc->ssd_read_errors++;
			job->error = read_err;
		} else {
			dmc->disk_write_errors++;
			job->error = write_err;
		}
		flashcache_do_pending(job);
		flashcache_clean_set(dmc, index / dmc->assoc); /* Kick off more cleanings */
		dmc->cleanings++;
	}
}

Reaching this callback means the copy of write-cache data to disk has completed. First the result is checked; if both the read and the write succeeded, flashcache_md_write is called.
/*
 * Kick off a cache metadata update (called from workqueue).
 * Cache metadata update IOs to a given metadata sector are serialized using the
 * nr_in_prog bit in the md sector bufhead.
 * If a metadata IO is already in progress, we queue up incoming metadata updates
 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we
 * cluster all these pending updates and do all of them as 1 flash write (that
 * logic is in md_write_kickoff), where it switches out the entire pending_jobs
 * list and does all of those updates.
 */
void
flashcache_md_write(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	unsigned long flags;

	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
	       job->action == WRITEDISK_SYNC);
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/* If a write is in progress for this metadata sector, queue this update up */
	if (md_sector_head->nr_in_prog != 0) {
		struct kcached_job **nodepp;

		/* A MD update is already in progress, queue this one up for later */
		nodepp = &md_sector_head->pending_jobs;
		while (*nodepp != NULL)
			nodepp = &((*nodepp)->next);
		job->next = NULL;
		*nodepp = job;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	} else {
		md_sector_head->nr_in_prog = 1;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write_kickoff(job);
	}
}

If a function has a comment, read it carefully. In my experience the people who write Linux kernel code treat words like gold: when one of them bothers to write a comment, reading it is more important and more rewarding than reading the code, and when there is documentation, the documentation matters most of all. Finding a comment here is a real delight — after reading it you hardly need the code. But a rookie like me cannot always grasp the master's full intent, so I keep reading the code as well.
/*
 * Kick off a cache metadata update (called from workqueue).
 * Cache metadata update IOs to a given metadata sector are serialized using the
 * nr_in_prog bit in the md sector bufhead.
 * If a metadata IO is already in progress, we queue up incoming metadata updates
 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we
 * cluster all these pending updates and do all of them as 1 flash write (that
 * logic is in md_write_kickoff), where it switches out the entire pending_jobs
 * list and does all of those updates.
 */

Kick off a cache metadata update (called from a workqueue — we arrived here via the kcopyd callback, so a friendly reminder: always be aware of the context you are called in; it is required reading for kernel code and often the foundation for cracking hard bugs). Updates to cache metadata are ordered by the nr_in_prog field of struct cache_md_sector_head; that is, updates to a given metadata sector are serialized, and while one is in flight, later ones queue up. The queued kcached_jobs hang off the pending_jobs list of the cache_md_sector_head. When the in-flight update completes, everything accumulated on pending_jobs is dispatched in one go (all of those updates target flashcache management structures in the same sector).
It does not matter if this is not yet fully clear — we have not covered flashcache's data layout. But be clear on one thing: flashcache_dirty_writeback flushed dirty data from the SSD write cache to the disk, and the job here is to flush that block's metadata from memory to the SSD, so that after an unexpected power loss the metadata can be recovered from the SSD.
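What does one such on-SSD metadata record look like? A plausible reconstruction from the fields flashcache_md_write_kickoff copies below (dbn, optional checksum, cache_state); the exact layout in the 1.0 tree may differ:

struct flash_cacheblock {
	sector_t	dbn;		/* disk block number this slot caches */
#ifdef FLASHCACHE_DO_CHECKSUMS
	u_int64_t	checksum;
#endif
	u_int32_t	cache_state;	/* VALID | INVALID | DIRTY */
};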
At this point the kcached_job has still not been destroyed, so we keep following the trail: flashcache_md_write => flashcache_md_write_kickoff.
static void
flashcache_md_write_kickoff(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct flash_cacheblock *md_sector;
	int md_sector_ix;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	struct io_region where;
#else
	struct dm_io_region where;
#endif
	int i;
	struct cache_md_sector_head *md_sector_head;
	struct kcached_job *orig_job = job;
	unsigned long flags;

	if (flashcache_alloc_md_sector(job)) {
		DMERR("flashcache: %d: Cache metadata write failed, cannot alloc page ! block %lu",
		      job->action, job->disk.sector);
		flashcache_md_write_callback(-EIO, job);
		return;
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/*
	 * Transfer whatever is on the pending queue to the md_io_inprog queue.
	 */
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	md_sector_head->md_io_inprog = md_sector_head->pending_jobs;
	md_sector_head->pending_jobs = NULL;
	md_sector = job->md_sector;
	md_sector_ix = INDEX_TO_MD_SECTOR(job->index) * MD_BLOCKS_PER_SECTOR;
	/* First copy out the entire sector */
	for (i = 0 ;
	     i < MD_BLOCKS_PER_SECTOR && md_sector_ix < dmc->size ;
	     i++, md_sector_ix++) {
		md_sector[i].dbn = dmc->cache[md_sector_ix].dbn;
#ifdef FLASHCACHE_DO_CHECKSUMS
		md_sector[i].checksum = dmc->cache[md_sector_ix].checksum;
#endif
		md_sector[i].cache_state =
			dmc->cache[md_sector_ix].cache_state & (VALID | INVALID | DIRTY);
	}
	/* Then set/clear the DIRTY bit for the "current" index */
	if (job->action == WRITECACHE) {
		/* DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state =
			(VALID | DIRTY);
	} else { /* job->action == WRITEDISK* */
		/* un-DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
	}

	for (job = md_sector_head->md_io_inprog ;
	     job != NULL ;
	     job = job->next) {
		if (job->action == WRITECACHE) {
			/* DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state =
				(VALID | DIRTY);
		} else { /* job->action == WRITEDISK* */
			/* un-DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
		}
	}
	spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	where.bdev = dmc->cache_dev->bdev;
	where.count = 1;
	where.sector = 1 + INDEX_TO_MD_SECTOR(orig_job->index);
	dmc->ssd_writes++;
	dm_io_async_bvec(1, &where, WRITE,
			 &orig_job->md_io_bvec,
			 flashcache_md_write_callback, orig_job);
	flashcache_unplug_device(dmc->cache_dev->bdev);
}

Here the cacheblock information is packed into the page behind job->md_io_bvec, and dm_io_async_bvec writes it out to the SSD. Its prototype:
static int dm_io_async_bvec(unsigned int num_regions, 
			    struct dm_io_region *where, int rw, 
			    struct bio_vec *bvec, io_notify_fn fn, 
			    void *context)

This function is much like dm_kcopyd_copy. The parameter we care most about is where — life's most important questions: who are you, and where are you going?
where.bdev is the target device, where.sector the starting sector, and where.count the number of sectors to write. So this call packs the dmc->cache management structures into job->md_io_bvec and writes them to the matching location on the SSD.
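Judging by how the kickoff code uses them, the index-to-metadata-sector macros presumably expand along these lines (the real definitions live in the flashcache headers; this is a reconstruction):

/* One 512-byte metadata sector holds MD_BLOCKS_PER_SECTOR flash_cacheblock
 * records, and the metadata region starts at sector 1 of the SSD (sector 0
 * presumably holds the superblock) -- hence the "1 +" in kickoff:
 *	where.sector = 1 + INDEX_TO_MD_SECTOR(orig_job->index);
 */
#define MD_BLOCKS_PER_SECTOR		 (512 / sizeof(struct flash_cacheblock))
#define INDEX_TO_MD_SECTOR(index)	 ((index) / MD_BLOCKS_PER_SECTOR)
#define INDEX_TO_MD_SECTOR_OFFSET(index) ((index) % MD_BLOCKS_PER_SECTOR)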
Next, when the SSD write completes, flashcache_md_write_callback runs:
void
flashcache_md_write_callback(unsigned long error, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;

	job->error = error;
	push_md_complete(job);
	schedule_work(&_kcached_wq);
}

This function just records the return value in the job, pushes it onto the _md_complete_jobs list, and pokes the workqueue. Why not do the processing right here instead of deferring it? Every company has a pretty receptionist: when the courier delivers big boxes of material, she certainly will not haul them herself — one charming word and a crowd of engineers fights for the job. This function is a write-completion callback invoked in softirq context, and softirq context, like the receptionist, cannot do heavy lifting; it only signs for the delivery, and the workqueue does the rest.
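A minimal sketch of the handoff that push_md_complete and the later pop presumably implement. The list head, lock, and the list member in kcached_job are illustrative; the essential point is the spinlock, since push runs in softirq context while pop runs in process context on the workqueue:

static LIST_HEAD(_md_complete_jobs);
static DEFINE_SPINLOCK(_job_lock);

static void
push(struct list_head *jobs, struct kcached_job *job)
{
	unsigned long flags;

	/* irqsave flavor: this side can run in softirq context */
	spin_lock_irqsave(&_job_lock, flags);
	list_add_tail(&job->list, jobs);	/* assumes a list member */
	spin_unlock_irqrestore(&_job_lock, flags);
}

static void
push_md_complete(struct kcached_job *job)
{
	push(&_md_complete_jobs, job);
}

static struct kcached_job *
pop(struct list_head *jobs)
{
	struct kcached_job *job = NULL;
	unsigned long flags;

	spin_lock_irqsave(&_job_lock, flags);
	if (!list_empty(jobs)) {
		job = list_entry(jobs->next, struct kcached_job, list);
		list_del(&job->list);
	}
	spin_unlock_irqrestore(&_job_lock, flags);
	return job;
}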
To continue the trail, we must ask where this workqueue comes from, what it does — or rather, what it does to the job:
flashcache_init => INIT_WORK(&_kcached_wq, do_work) => do_work => process_jobs(&_md_complete_jobs, flashcache_md_write_done)
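That chain suggests do_work drains the completion lists roughly like this (a sketch; the real do_work also services the other job lists, and on kernels before 2.6.20 the handler takes a void * argument instead, as the #if in flashcache_init shows):

static void
do_work(struct work_struct *unused)
{
	/* Drain completed metadata writes; other lists elided. */
	process_jobs(&_md_complete_jobs, flashcache_md_write_done);
}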
Look at process_jobs first:
static void
process_jobs(struct list_head *jobs,
	     void (*fn) (struct kcached_job *))
{
	struct kcached_job *job;

	while ((job = pop(jobs)))
		(void)fn(job);
}

It simply pops the jobs the receptionist just signed for off the queue and calls fn on each — fn being the flashcache_md_write_done registered here.
The name ends in done, like 5:30 in the afternoon when a day's toil can finally stop — though your unlucky author currently puts in 72 hours of overtime a month; perhaps you can find your own happiness in my misfortune.
void
flashcache_md_write_done(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	int index;
	unsigned long flags;
	struct kcached_job *job_list;
	int error = job->error;
	struct kcached_job *next;
	struct cacheblock *cacheblk;

	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
	       job->action == WRITEDISK_SYNC);
	flashcache_free_md_sector(job);
	job->md_sector = NULL;
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	job_list = job;
	job->next = md_sector_head->md_io_inprog;
	md_sector_head->md_io_inprog = NULL;
	for (job = job_list ; job != NULL ; job = next) {
		next = job->next;
		job->error = error;
		index = job->index;
		cacheblk = &dmc->cache[index];
		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
		if (job->action == WRITECACHE) {
			if (unlikely(sysctl_flashcache_error_inject & WRITECACHE_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITECACHE_MD_ERROR;
			}
			if (likely(job->error == 0)) {
				if ((cacheblk->cache_state & DIRTY) == 0) {
					dmc->cache_sets[index / dmc->assoc].nr_dirty++;
					dmc->nr_dirty++;
				}
				dmc->md_write_dirty++;
				cacheblk->cache_state |= DIRTY;
			} else
				dmc->ssd_write_errors++;
			flashcache_bio_endio(job->bio, job->error);
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: WRITE: Cache metadata write failed ! error %d block %lu",
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
			}
		} else {
			int action = job->action;

			if (unlikely(sysctl_flashcache_error_inject & WRITEDISK_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITEDISK_MD_ERROR;
			}
			/*
			 * If we have an error on a WRITEDISK*, no choice but to preserve the
			 * dirty block in cache. Fail any IOs for this block that occurred while
			 * the block was being cleaned.
			 */
			if (likely(job->error == 0)) {
				dmc->md_write_clean++;
				cacheblk->cache_state &= ~DIRTY;
				VERIFY(dmc->cache_sets[index / dmc->assoc].nr_dirty > 0);
				VERIFY(dmc->nr_dirty > 0);
				dmc->cache_sets[index / dmc->assoc].nr_dirty--;
				dmc->nr_dirty--;
			} else
				dmc->ssd_write_errors++;
			VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
			VERIFY(dmc->clean_inprog > 0);
			dmc->cache_sets[index / dmc->assoc].clean_inprog--;
			dmc->clean_inprog--;
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: CLEAN: Cache metadata write failed ! error %d block %lu",
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			}
			dmc->cleanings++;
			if (action == WRITEDISK_SYNC)
				flashcache_update_sync_progress(dmc);
		}
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	if (md_sector_head->pending_jobs != NULL) {
		/* peel off the first job from the pending queue and kick that off */
		job = md_sector_head->pending_jobs;
		md_sector_head->pending_jobs = job->next;
		job->next = NULL;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
		       job->action == WRITEDISK_SYNC);
		flashcache_md_write_kickoff(job);
	} else {
		md_sector_head->nr_in_prog = 0;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	}
}

First comes flashcache_free_md_sector, which simply frees the page that was allocated earlier to hold the cacheblock records. Which "earlier"? The flashcache_alloc_md_sector call in flashcache_md_write_kickoff — so this function sends you back to reread kickoff. Context again: kickoff is the "before" and done is the "after", and whatever cause the before plants, the after harvests. The before allocated a page, so the after frees it; the before dispatched every kcached_job on md_io_inprog of dmc->md_sectors_buf[] in a single write, which is exactly why the after has a for loop. The careful reader will ask: why may these kcached_jobs be dispatched together? To answer that, first look at what they are. From the structure:
/* 
 * We have one of these for *every* cache metadata sector, to keep track
 * of metadata ios in progress for blocks covered in this sector. Only
 * one metadata IO per sector can be in progress at any given point in 
 * time
 */
struct cache_md_sector_head {
	u_int32_t		nr_in_prog;
	struct kcached_job	*pending_jobs, *md_io_inprog;
};

As usual, read the comment first: every cache metadata sector has exactly one cache_md_sector_head, used to synchronize the in-memory cacheblock metadata out to that metadata sector, and at most one metadata IO per sector may be in flight at any time — tracked by cache_md_sector_head->nr_in_prog. That answers the question above: these kcached_jobs are all metadata writes for the same sector, so they can be merged. The sector in question is the one on the SSD that stores the flash_cacheblock structures.
Back in flashcache_md_write_done: in our trace, job->action in the for loop is WRITEDISK, so we go straight to the else branch, where another comment greets us — on a WRITEDISK* error there is no choice but to keep the cacheblock's DIRTY flag. Next, if there was an error or pending jobs still hang on the cacheblock, more IO is issued; otherwise the in-progress bits are cleared, and we finally see the kcached_job complete its mission: flashcache_free_cache_job returns it to the mempool.
It is tempting to end the kcached_job story here with "and they lived happily ever after". But returning to the pool also means rebirth: next, if action == WRITEDISK, flashcache_clean_set is called to flush cache blocks above the dirty watermark back to disk. That is, every time a disk write completes, the workqueue checks the dirty watermark and, if it is exceeded, keeps flushing — which brings us right back to flashcache_dirty_writeback at the top of this article. Cause and effect, link by link: the kcached_job is reborn not for itself but for the rebirth of cacheblocks. None of us lives for ourselves alone; each is just one element among ten thousand cycles, entering the wheel of rebirth to fulfill the others.
The next post will analyze flashcache's data structures and storage design.
