int __init
flashcache_init(void)
{
	int r;

	r = flashcache_jobs_init();
	if (r)
		return r;
	atomic_set(&nr_cache_jobs, 0);
	atomic_set(&nr_pending_jobs, 0);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
	INIT_WORK(&_kcached_wq, do_work, NULL);
#else
	INIT_WORK(&_kcached_wq, do_work);
#endif
	for (r = 0 ; r < 33 ; r++)
		size_hist[r] = 0;
	r = dm_register_target(&flashcache_target);
	if (r < 0) {
		DMERR("cache: register failed %d", r);
	}
#ifdef CONFIG_PROC_FS
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	flashcache_table_header =
		register_sysctl_table(flashcache_root_table, 1);
#else
	flashcache_table_header =
		register_sysctl_table(flashcache_root_table);
#endif
	{
		struct proc_dir_entry *entry;

		entry = create_proc_entry("flashcache_stats", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_stats_operations;
		entry = create_proc_entry("flashcache_errors", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_errors_operations;
		entry = create_proc_entry("flashcache_iosize_hist", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_iosize_hist_operations;
		entry = create_proc_entry("flashcache_pidlists", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_pidlists_operations;
		entry = create_proc_entry("flashcache_version", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_version_operations;
	}
#endif
	flashcache_control = (struct flashcache_control_s *)
		kmalloc(sizeof(struct flashcache_control_s), GFP_KERNEL);
	flashcache_control->synch_flags = 0;
	register_reboot_notifier(&flashcache_notifier);
	return r;
}
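For reference, a minimal sketch of what the matching teardown path would look like, undoing the init steps above in reverse order. This is an illustration only, not the module's actual exit routine; flashcache_jobs_exit() is an assumed counterpart to flashcache_jobs_init() (sketched further below).

void __exit
flashcache_exit(void)
{
	/* Illustrative teardown mirroring flashcache_init() above (sketch only) */
	unregister_reboot_notifier(&flashcache_notifier);
	dm_unregister_target(&flashcache_target);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("flashcache_version", NULL);
	remove_proc_entry("flashcache_pidlists", NULL);
	remove_proc_entry("flashcache_iosize_hist", NULL);
	remove_proc_entry("flashcache_errors", NULL);
	remove_proc_entry("flashcache_stats", NULL);
	unregister_sysctl_table(flashcache_table_header);
#endif
	flashcache_jobs_exit();		/* assumed counterpart to flashcache_jobs_init() */
	kfree(flashcache_control);
}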
static int
flashcache_jobs_init(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_job_cache = kmem_cache_create("kcached-jobs",
				       sizeof(struct kcached_job),
				       __alignof__(struct kcached_job),
				       0, NULL, NULL);
#else
	_job_cache = kmem_cache_create("kcached-jobs",
				       sizeof(struct kcached_job),
				       __alignof__(struct kcached_job),
				       0, NULL);
#endif
	if (!_job_cache)
		return -ENOMEM;

	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
				   mempool_free_slab, _job_cache);
	if (!_job_pool) {
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_pending_job_cache = kmem_cache_create("pending-jobs",
					       sizeof(struct pending_job),
					       __alignof__(struct pending_job),
					       0, NULL, NULL);
#else
	_pending_job_cache = kmem_cache_create("pending-jobs",
					       sizeof(struct pending_job),
					       __alignof__(struct pending_job),
					       0, NULL);
#endif
	if (!_pending_job_cache) {
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	_pending_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
					   mempool_free_slab, _pending_job_cache);
	if (!_pending_job_pool) {
		kmem_cache_destroy(_pending_job_cache);
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	return 0;
}
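A matching flashcache_jobs_exit()-style teardown (the name is assumed; sketched here only for completeness) would destroy each mempool before the slab cache that backs it:

static void
flashcache_jobs_exit(void)
{
	/* Sketch: release pools first, then the slab caches behind them */
	mempool_destroy(_pending_job_pool);
	kmem_cache_destroy(_pending_job_cache);
	mempool_destroy(_job_pool);
	kmem_cache_destroy(_job_cache);
}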
/* kcached/pending job states */
#define READCACHE	1
#define WRITECACHE	2
#define READDISK	3
#define WRITEDISK	4
#define READFILL	5	/* Read Cache Miss Fill */
#define INVALIDATE	6
#define WRITEDISK_SYNC	7
int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
		   unsigned int num_dests, struct dm_io_region *dests,
		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context);
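Tying the prototype to the callback that follows: a dirty-block writeback would be issued as a one-source, one-destination copy from the SSD to the disk, with the kcached_job passed as the notify context. A minimal sketch, assuming a kcopyd client named flashcache_kcp_client and a job->cache source region (job->disk is the destination region referenced in the callback):

	/* Sketch: copy one dirty cache block back to disk, then run the callback */
	dm_kcopyd_copy(flashcache_kcp_client,	/* assumed kcopyd client */
		       &job->cache,		/* from: the block on the SSD (assumed field) */
		       1, &job->disk,		/* to: one region on the backing disk */
		       0,			/* no flags */
		       flashcache_kcopyd_callback, job);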
static void
flashcache_kcopyd_callback(int read_err, unsigned int write_err, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;
	struct cache_c *dmc = job->dmc;
	int index = job->index;
	unsigned long flags;

	VERIFY(!in_interrupt());
	DPRINTK("kcopyd_callback: Index %d", index);
	VERIFY(job->bio == NULL);
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY));
	if (unlikely(sysctl_flashcache_error_inject & KCOPYD_CALLBACK_ERROR)) {
		read_err = -EIO;
		sysctl_flashcache_error_inject &= ~KCOPYD_CALLBACK_ERROR;
	}
	if (likely(read_err == 0 && write_err == 0)) {
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write(job);
	} else {
		/* Disk write failed. We can not purge this block from flash */
		DMERR("flashcache: Disk writeback failed ! read error %d write error %d block %lu",
		      -read_err, -write_err, job->disk.sector);
		VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
		VERIFY(dmc->clean_inprog > 0);
		dmc->cache_sets[index / dmc->assoc].clean_inprog--;
		dmc->clean_inprog--;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		/* Set the error in the job and let do_pending() handle the error */
		if (read_err) {
			dmc->ssd_read_errors++;
			job->error = read_err;
		} else {
			dmc->disk_write_errors++;
			job->error = write_err;
		}
		flashcache_do_pending(job);
		flashcache_clean_set(dmc, index / dmc->assoc); /* Kick off more cleanings */
		dmc->cleanings++;
	}
}
/*
 * Kick off a cache metadata update (called from workqueue).
 * Cache metadata update IOs to a given metadata sector are serialized using the
 * nr_in_prog bit in the md sector bufhead.
 * If a metadata IO is already in progress, we queue up incoming metadata updates
 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we
 * cluster all these pending updates and do all of them as 1 flash write (that
 * logic is in md_write_kickoff), where it switches out the entire pending_jobs
 * list and does all of those updates.
 */
void
flashcache_md_write(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	unsigned long flags;

	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
	       job->action == WRITEDISK_SYNC);
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/* If a write is in progress for this metadata sector, queue this update up */
	if (md_sector_head->nr_in_prog != 0) {
		struct kcached_job **nodepp;

		/* A MD update is already in progress, queue this one up for later */
		nodepp = &md_sector_head->pending_jobs;
		while (*nodepp != NULL)
			nodepp = &((*nodepp)->next);
		job->next = NULL;
		*nodepp = job;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	} else {
		md_sector_head->nr_in_prog = 1;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write_kickoff(job);
	}
}
static void
flashcache_md_write_kickoff(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct flash_cacheblock *md_sector;
	int md_sector_ix;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	struct io_region where;
#else
	struct dm_io_region where;
#endif
	int i;
	struct cache_md_sector_head *md_sector_head;
	struct kcached_job *orig_job = job;
	unsigned long flags;

	if (flashcache_alloc_md_sector(job)) {
		DMERR("flashcache: %d: Cache metadata write failed, cannot alloc page ! block %lu",
		      job->action, job->disk.sector);
		flashcache_md_write_callback(-EIO, job);
		return;
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/*
	 * Transfer whatever is on the pending queue to the md_io_inprog queue.
	 */
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	md_sector_head->md_io_inprog = md_sector_head->pending_jobs;
	md_sector_head->pending_jobs = NULL;
	md_sector = job->md_sector;
	md_sector_ix = INDEX_TO_MD_SECTOR(job->index) * MD_BLOCKS_PER_SECTOR;
	/* First copy out the entire sector */
	for (i = 0 ;
	     i < MD_BLOCKS_PER_SECTOR && md_sector_ix < dmc->size ;
	     i++, md_sector_ix++) {
		md_sector[i].dbn = dmc->cache[md_sector_ix].dbn;
#ifdef FLASHCACHE_DO_CHECKSUMS
		md_sector[i].checksum = dmc->cache[md_sector_ix].checksum;
#endif
		md_sector[i].cache_state =
			dmc->cache[md_sector_ix].cache_state & (VALID | INVALID | DIRTY);
	}
	/* Then set/clear the DIRTY bit for the "current" index */
	if (job->action == WRITECACHE) {
		/* DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state =
			(VALID | DIRTY);
	} else { /* job->action == WRITEDISK* */
		/* un-DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
	}

	for (job = md_sector_head->md_io_inprog ;
	     job != NULL ;
	     job = job->next) {
		if (job->action == WRITECACHE) {
			/* DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state =
				(VALID | DIRTY);
		} else { /* job->action == WRITEDISK* */
			/* un-DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
		}
	}
	spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	where.bdev = dmc->cache_dev->bdev;
	where.count = 1;
	where.sector = 1 + INDEX_TO_MD_SECTOR(orig_job->index);
	dmc->ssd_writes++;
	dm_io_async_bvec(1, &where, WRITE,
			 &orig_job->md_io_bvec,
			 flashcache_md_write_callback, orig_job);
	flashcache_unplug_device(dmc->cache_dev->bdev);
}
static int dm_io_async_bvec(unsigned int num_regions, struct dm_io_region *where,
			    int rw, struct bio_vec *bvec, io_notify_fn fn,
			    void *context)
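dm_io_async_bvec() is a thin compatibility wrapper around dm_io(). A minimal sketch of how the body could look on ~2.6.26+ kernels, assuming a dm_io_client (flashcache_io_client here) created elsewhere with dm_io_client_create():

static struct dm_io_client *flashcache_io_client;	/* assumed, created at init time */

static int
dm_io_async_bvec(unsigned int num_regions, struct dm_io_region *where,
		 int rw, struct bio_vec *bvec, io_notify_fn fn, void *context)
{
	struct dm_io_request iorq;

	iorq.bi_rw = rw;			/* READ or WRITE */
	iorq.mem.type = DM_IO_BVEC;		/* payload described by a bio_vec array */
	iorq.mem.ptr.bvec = bvec;
	iorq.notify.fn = fn;			/* async completion callback */
	iorq.notify.context = context;
	iorq.client = flashcache_io_client;
	return dm_io(&iorq, num_regions, where, NULL);
}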
void
flashcache_md_write_callback(unsigned long error, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;

	job->error = error;
	push_md_complete(job);
	schedule_work(&_kcached_wq);
}
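push_md_complete() only has to park the completed job on a global list under a spinlock; the real work is deferred to the workqueue. A sketch of plausible helpers (the list, the lock, and the list_head member in kcached_job are all assumed names):

static LIST_HEAD(_md_complete_jobs);	/* assumed: completed metadata-write jobs */
static DEFINE_SPINLOCK(_job_lock);	/* assumed: protects the global job lists */

static void
push(struct list_head *jobs, struct kcached_job *job)
{
	unsigned long flags;

	spin_lock_irqsave(&_job_lock, flags);
	list_add_tail(&job->list, jobs);	/* assumes a list_head member in kcached_job */
	spin_unlock_irqrestore(&_job_lock, flags);
}

static void
push_md_complete(struct kcached_job *job)
{
	push(&_md_complete_jobs, job);
}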
static void
process_jobs(struct list_head *jobs,
	     void (*fn) (struct kcached_job *))
{
	struct kcached_job *job;

	while ((job = pop(jobs)))
		(void)fn(job);
}
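do_work(), scheduled from the callback above, would then drain each job list with the matching handler via process_jobs(). A sketch under the >= 2.6.20 INIT_WORK() signature, with only the metadata-completion list shown (other lists assumed but elided):

static struct kcached_job *
pop(struct list_head *jobs)
{
	struct kcached_job *job = NULL;
	unsigned long flags;

	spin_lock_irqsave(&_job_lock, flags);
	if (!list_empty(jobs)) {
		/* assumes the same list_head member used by push() above */
		job = list_entry(jobs->next, struct kcached_job, list);
		list_del(&job->list);
	}
	spin_unlock_irqrestore(&_job_lock, flags);
	return job;
}

static void
do_work(struct work_struct *unused)
{
	/* Drain completed metadata writes in process context (sketch) */
	process_jobs(&_md_complete_jobs, flashcache_md_write_done);
	/* ... the pending-job and IO lists would be drained here as well ... */
}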
void
flashcache_md_write_done(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	int index;
	unsigned long flags;
	struct kcached_job *job_list;
	int error = job->error;
	struct kcached_job *next;
	struct cacheblock *cacheblk;

	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
	       job->action == WRITEDISK_SYNC);
	flashcache_free_md_sector(job);
	job->md_sector = NULL;
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	job_list = job;
	job->next = md_sector_head->md_io_inprog;
	md_sector_head->md_io_inprog = NULL;
	for (job = job_list ; job != NULL ; job = next) {
		next = job->next;
		job->error = error;
		index = job->index;
		cacheblk = &dmc->cache[index];
		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
		if (job->action == WRITECACHE) {
			if (unlikely(sysctl_flashcache_error_inject & WRITECACHE_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITECACHE_MD_ERROR;
			}
			if (likely(job->error == 0)) {
				if ((cacheblk->cache_state & DIRTY) == 0) {
					dmc->cache_sets[index / dmc->assoc].nr_dirty++;
					dmc->nr_dirty++;
				}
				dmc->md_write_dirty++;
				cacheblk->cache_state |= DIRTY;
			} else
				dmc->ssd_write_errors++;
			flashcache_bio_endio(job->bio, job->error);
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: WRITE: Cache metadata write failed ! error %d block %lu",
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
			}
		} else {
			int action = job->action;

			if (unlikely(sysctl_flashcache_error_inject & WRITEDISK_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITEDISK_MD_ERROR;
			}
			/*
			 * If we have an error on a WRITEDISK*, no choice but to preserve the
			 * dirty block in cache. Fail any IOs for this block that occurred while
			 * the block was being cleaned.
			 */
			if (likely(job->error == 0)) {
				dmc->md_write_clean++;
				cacheblk->cache_state &= ~DIRTY;
				VERIFY(dmc->cache_sets[index / dmc->assoc].nr_dirty > 0);
				VERIFY(dmc->nr_dirty > 0);
				dmc->cache_sets[index / dmc->assoc].nr_dirty--;
				dmc->nr_dirty--;
			} else
				dmc->ssd_write_errors++;
			VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
			VERIFY(dmc->clean_inprog > 0);
			dmc->cache_sets[index / dmc->assoc].clean_inprog--;
			dmc->clean_inprog--;
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: CLEAN: Cache metadata write failed ! error %d block %lu",
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			}
			dmc->cleanings++;
			if (action == WRITEDISK_SYNC)
				flashcache_update_sync_progress(dmc);
		}
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	if (md_sector_head->pending_jobs != NULL) {
		/* peel off the first job from the pending queue and kick that off */
		job = md_sector_head->pending_jobs;
		md_sector_head->pending_jobs = job->next;
		job->next = NULL;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
		       job->action == WRITEDISK_SYNC);
		flashcache_md_write_kickoff(job);
	} else {
		md_sector_head->nr_in_prog = 0;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	}
}
/*
 * We have one of these for *every* cache metadata sector, to keep track
 * of metadata ios in progress for blocks covered in this sector. Only
 * one metadata IO per sector can be in progress at any given point in
 * time
 */
struct cache_md_sector_head {
	u_int32_t		nr_in_prog;
	struct kcached_job	*pending_jobs, *md_io_inprog;
};
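For completeness, the INDEX_TO_MD_SECTOR()/INDEX_TO_MD_SECTOR_OFFSET() macros used throughout the metadata path map a cache block index to its on-flash metadata sector and to its slot within that sector. A sketch of plausible definitions, assuming 512-byte metadata sectors densely packed with struct flash_cacheblock entries:

/* Sketch only: the exact sector size and packing are assumptions */
#define MD_BLOCKS_PER_SECTOR			(512 / sizeof(struct flash_cacheblock))
#define INDEX_TO_MD_SECTOR(index)		((index) / MD_BLOCKS_PER_SECTOR)
#define INDEX_TO_MD_SECTOR_OFFSET(index)	((index) % MD_BLOCKS_PER_SECTOR)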