Let me post the annotated code first; I'll write up a walkthrough of the processing flow when I have time:
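(A quick note on usage before the listing, since the constructor arguments are only documented much further down at cache_ctr(): the target registers itself under the name "cache" and takes, in order, the source device, the cache device, a persistence flag, and optionally the cache block size in sectors, the cache size in blocks, the set associativity and the write policy. With the usual device-mapper table format of "start length target args", creating a mapping should therefore look roughly like

    0 <size of source device in sectors> cache /dev/<source> /dev/<ssd> 0 8 65536 1024 1

piped into dmsetup create. The device names are placeholders, and the trailing numbers simply echo the defaults in the code, 8-sector blocks, a 65536-block cache and 1024-way sets, with the final 1 selecting write-back instead of the default write-through.)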
/**************************************************************************** * dm-cache.c * Device mapper target for block-level disk caching * * Copyright (C) International Business Machines Corp., 2006 * Copyright (C) Ming Zhao, Florida International University, 2007-2009 * * Authors: Ming Zhao, Stephen Bromfield, Douglas Otstott, * Dulcardo Clavijo ([email protected]) * Other contributors: * Eric Van Hensbergen, Reng Zeng * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * ****************************************************************************/ #include <linux/blk_types.h> #include <linux/atomic.h> #include <asm/checksum.h> #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/pagemap.h> #include "dm.h" #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> #define DMC_DEBUG 0 #define DM_MSG_PREFIX "cache" #define DMC_PREFIX "dm-cache: " #if DMC_DEBUG #define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg) #else #define DPRINTK( s, arg... ) #endif /* Default cache parameters 默认cache参数*/ #define DEFAULT_CACHE_SIZE 65536 #define DEFAULT_CACHE_ASSOC 1024 #define DEFAULT_BLOCK_SIZE 8 #define CONSECUTIVE_BLOCKS 512 /* Write policy 写策略*/ #define WRITE_THROUGH 0 #define WRITE_BACK 1 #define DEFAULT_WRITE_POLICY WRITE_THROUGH /* Number of pages for I/O */ #define DMCACHE_COPY_PAGES 1024 //默认用于IO的Page数为1024 /* States of a cache block 缓存块状态(4位)*/ #define INVALID 0 // --1- 1- 1- 1-- #define VALID 1 /* Valid */ #define RESERVED 2 /* Allocated but data not in place yet */ #define DIRTY 4 /* Locally modified */ #define WRITEBACK 8 /* In the process of write back */ #define is_state(x, y) (x & y) #define set_state(x, y) (x |= y) #define clear_state(x, y) (x &= ~y) /* * Cache context 总控制结构 */ struct cache_c { struct dm_dev *src_dev; /* 原始设备,磁盘 */ struct dm_dev *cache_dev; /*缓存设备,SSD设备*/ struct dm_kcopyd_client *kcp_client; /* kcopyd客户端. */ struct cacheblock *cache; /* 连续的cacheblock数组,元数个数为size) */ sector_t size; /* cache大小,单位为block ,默为65536*/ unsigned int bits; /* cache大小位掩码,默认为16*/ unsigned int assoc; /* set大小,默认为1024 */ unsigned int block_size; /* cacheblock块大小,单位为sector,默认为8. */ unsigned int block_shift; /* cacheblock块大小位掩码3 */ unsigned int block_mask; /* cacheblock块大小掩码7 */ unsigned int consecutive_shift; /* 连续块大小位掩码,默认为10*/ unsigned long counter; /* 最后访问时间 */ unsigned int write_policy; /* 写策略,默认为0 write_through */ sector_t dirty_blocks; /* 脏块数量 */ spinlock_t lock; /* 页保护锁*/ struct page_list *pages; /* 用于IO的page_list列表,默认1024项.{page_list *next,page*page}*/ unsigned int nr_pages; /* 总Page数,默认1024.*/ unsigned int nr_free_pages; /* 空闭Page数目.*/ wait_queue_head_t destroyq; /* 等待队列头. 
*/ atomic_t nr_jobs; /* io job数 Number of I/O jobs */ struct dm_io_client *io_client; /* dm-io客户端*/ /* Stats 静态状态*/ unsigned long reads; /* Number of reads,读次数 */ unsigned long writes; /* Number of writes,写次数 */ unsigned long cache_hits; /* Number of cache hits ,命中次数*/ unsigned long replace; /* Number of cache replacements 替换次数*/ unsigned long writeback; /* Number of replaced dirty blocks 替换脏块的数量*/ unsigned long dirty; /* Number of submitted dirty blocks 提交脏块的数量*/ }; /* cacheblock元数据结构 */ struct cacheblock { spinlock_t lock; /* 保护锁 */ sector_t block; /* 此cacheblock对应的起始扇区号 */ unsigned short state; /* cacheblock的状态*/ unsigned long counter; /* 最后访问时间(访问次数) */ struct bio_list bios; /* 阻塞的bio请求链表 */ //{bio*head,bio*tail,} }; /* Structure for a kcached job */ struct kcached_job { // job 工作结构 struct list_head list; struct cache_c *dmc; struct bio *bio; /* 原始BIO,Original bio */ struct dm_io_region src; // 原始设备IO区域 struct dm_io_region dest; // 目标设备IO区域 struct cacheblock *cacheblock; //cache块 int rw; //读写标志 /* * When the original bio is not aligned with cache blocks, * we need extra bvecs and pages for padding. */ // 原始BIO与cache块不对齐时,需要补充额外的bio向量和page struct bio_vec *bvec; unsigned int nr_pages; struct page_list *pages; }; /**************************************************************************** * Wrapper functions for using the new dm_io API ****************************************************************************/ static int dm_io_sync_vm(unsigned int num_regions, struct dm_io_region *where, int rw, void *data, unsigned long *error_bits, struct cache_c *dmc) { struct dm_io_request iorq; //同步虚拟内存io iorq.bi_rw= rw; iorq.mem.type = DM_IO_VMA; iorq.mem.ptr.vma = data; iorq.notify.fn = NULL; iorq.client = dmc->io_client; return dm_io(&iorq, num_regions, where, error_bits); } static int dm_io_async_bvec(unsigned int num_regions, struct dm_io_region *where, int rw, struct bio_vec *bvec, io_notify_fn fn, void *context) { struct kcached_job *job = (struct kcached_job *)context; struct cache_c *dmc = job->dmc; struct dm_io_request iorq; //异步bio向量io iorq.bi_rw = (rw | (1 << REQ_SYNC)); iorq.mem.type = DM_IO_BVEC; iorq.mem.ptr.bvec = bvec; iorq.notify.fn = fn; iorq.notify.context = context; iorq.client = dmc->io_client; return dm_io(&iorq, num_regions, where, NULL); } /**************************************************************************** * Functions and data structures for implementing a kcached to handle async * I/O. Code for page and queue handling is borrowed from kcopyd.c. ****************************************************************************/ /* * Functions for handling pages used by async I/O. * The data asked by a bio request may not be aligned with cache blocks, in * which case additional pages are required for the request that is forwarded * to the server. A pool of pages are reserved for this purpose. 
*/ //分配page_list {page_list *next,page*page} static struct page_list *alloc_pl(void) { struct page_list *pl; pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) return NULL; pl->page = alloc_page(GFP_KERNEL); if (!pl->page) { kfree(pl); return NULL; } return pl; } //释放page_list static void free_pl(struct page_list *pl) { __free_page(pl->page); kfree(pl); } //释放page_list链表 static void drop_pages(struct page_list *pl) { struct page_list *next; while (pl) { next = pl->next; free_pl(pl); pl = next; } } static int kcached_get_pages(struct cache_c *dmc, unsigned int nr, struct page_list **pages) { struct page_list *pl; spin_lock(&dmc->lock); if (dmc->nr_free_pages < nr) { DPRINTK("kcached_get_pages: No free pages: %u<%u", dmc->nr_free_pages, nr); spin_unlock(&dmc->lock); return -ENOMEM; } dmc->nr_free_pages -= nr; for (*pages = pl = dmc->pages; --nr; pl = pl->next) ; dmc->pages = pl->next; pl->next = NULL; spin_unlock(&dmc->lock); return 0; } static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl) { struct page_list *cursor; spin_lock(&dmc->lock); for (cursor = pl; cursor->next; cursor = cursor->next) dmc->nr_free_pages++; dmc->nr_free_pages++; cursor->next = dmc->pages; dmc->pages = pl; spin_unlock(&dmc->lock); } static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr) { unsigned int i; struct page_list *pl = NULL, *next; for (i = 0; i < nr; i++) { next = alloc_pl(); if (!next) { if (pl) drop_pages(pl); return -ENOMEM; } next->next = pl; pl = next; } kcached_put_pages(dmc, pl); dmc->nr_pages += nr; return 0; } static void free_bio_pages(struct cache_c *dmc) { BUG_ON(dmc->nr_free_pages != dmc->nr_pages); drop_pages(dmc->pages); dmc->pages = NULL; dmc->nr_free_pages = dmc->nr_pages = 0; } static struct workqueue_struct *_kcached_wq; //工作队列 static struct work_struct _kcached_work; //工作 static inline void wake(void) { //将工作(_kcached_work)加入工作队列(_kcached_wq) queue_work(_kcached_wq, &_kcached_work); } #define MIN_JOBS 1024 //最少job数目 static struct kmem_cache *_job_cache; //job缓存cache static mempool_t *_job_pool; //job池 static DEFINE_SPINLOCK(_job_lock); //链表保护自旋锁 static LIST_HEAD(_complete_jobs); //等待完成IO的job链表 static LIST_HEAD(_io_jobs); //等待执行io的链表 static LIST_HEAD(_pages_jobs); //等待分配page的IO链表 static int jobs_init(void) { //创建job 缓存cache _job_cache = kmem_cache_create("kcached-jobs", sizeof(struct kcached_job), __alignof__(struct kcached_job), 0, NULL); if (!_job_cache) return -ENOMEM; //创建job内存池 _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, mempool_free_slab, _job_cache); if (!_job_pool) { kmem_cache_destroy(_job_cache); return -ENOMEM; } return 0; } static void jobs_exit(void) { BUG_ON(!list_empty(&_complete_jobs)); BUG_ON(!list_empty(&_io_jobs)); BUG_ON(!list_empty(&_pages_jobs)); mempool_destroy(_job_pool); kmem_cache_destroy(_job_cache); _job_pool = NULL; _job_cache = NULL; } /* * Functions to push and pop a job onto the head of a given job list. 
*/ static inline struct kcached_job *pop(struct list_head *jobs) { struct kcached_job *job = NULL; unsigned long flags; spin_lock_irqsave(&_job_lock, flags); if (!list_empty(jobs)) { job = list_entry(jobs->next, struct kcached_job, list); list_del(&job->list); } spin_unlock_irqrestore(&_job_lock, flags); return job; } static inline void push(struct list_head *jobs, struct kcached_job *job) { unsigned long flags; spin_lock_irqsave(&_job_lock, flags); list_add_tail(&job->list, jobs); spin_unlock_irqrestore(&_job_lock, flags); } /**************************************************************************** * Functions for asynchronously fetching data from source device and storing * data in cache device. Because the requested data may not align with the * cache blocks, extra handling is required to pad a block request and extract * the requested data from the results. ****************************************************************************/ static void io_callback(unsigned long error, void *context) { struct kcached_job *job = (struct kcached_job *) context; if (error) { /* TODO */ DMERR("io_callback: io error"); return; } if (job->rw == READ) { job->rw = WRITE; push(&_io_jobs, job); } else push(&_complete_jobs, job); wake(); } /* * Fetch data from the source device asynchronously. * For a READ bio, if a cache block is larger than the requested data, then * additional data are prefetched. Larger cache block size enables more * aggressive read prefetching, which is useful for read-mostly usage. * For a WRITE bio, if a cache block is larger than the requested data, the * entire block needs to be fetched, and larger block size incurs more overhead. * In scenaros where writes are frequent, 4KB is a good cache block size. */ static int do_fetch(struct kcached_job *job) { int r = 0, i, j; struct bio *bio = job->bio; struct cache_c *dmc = job->dmc; unsigned int offset, head, tail, remaining, nr_vecs, idx = 0; struct bio_vec *bvec; struct page_list *pl; printk("do_fetch"); offset = (unsigned int) (bio->bi_sector & dmc->block_mask); head = to_bytes(offset); tail = to_bytes(dmc->block_size) - bio->bi_size - head; DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u", bio->bi_sector, job->src.sector, job->dest.sector, job->src.count, head, tail); if (bio_data_dir(bio) == READ) { /* The original request is a READ */ if (0 == job->nr_pages) { /* The request is aligned to cache block */ r = dm_io_async_bvec(1, &job->src, READ, bio->bi_io_vec + bio->bi_idx, io_callback, job); return r; } nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages; bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO); if (!bvec) { DMERR("do_fetch: No memory"); return 1; } pl = job->pages; i = 0; while (head) { bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE); bvec[i].bv_offset = 0; bvec[i].bv_page = pl->page; head -= bvec[i].bv_len; pl = pl->next; i++; } remaining = bio->bi_size; j = bio->bi_idx; while (remaining) { bvec[i] = bio->bi_io_vec[j]; remaining -= bvec[i].bv_len; i++; j++; } while (tail) { bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE); bvec[i].bv_offset = 0; bvec[i].bv_page = pl->page; tail -= bvec[i].bv_len; pl = pl->next; i++; } job->bvec = bvec; r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job); return r; } else { /* The original request is a WRITE */ pl = job->pages; if (head && tail) { /* Special case */ bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL); if (!bvec) { DMERR("do_fetch: No memory"); return 1; } for (i=0; i<job->nr_pages; i++) { bvec[i].bv_len = PAGE_SIZE; 
bvec[i].bv_offset = 0; bvec[i].bv_page = pl->page; pl = pl->next; } job->bvec = bvec; r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job); return r; } bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx) * sizeof(*bvec), GFP_KERNEL); if (!bvec) { DMERR("do_fetch: No memory"); return 1; } i = 0; while (head) { bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE); bvec[i].bv_offset = 0; bvec[i].bv_page = pl->page; head -= bvec[i].bv_len; pl = pl->next; i++; } remaining = bio->bi_size; j = bio->bi_idx; while (remaining) { bvec[i] = bio->bi_io_vec[j]; remaining -= bvec[i].bv_len; i++; j++; } if (tail) { idx = i; bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) & (PAGE_SIZE - 1); bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset; bvec[i].bv_page = pl->page; tail -= bvec[i].bv_len; pl = pl->next; i++; while (tail) { bvec[i].bv_len = PAGE_SIZE; bvec[i].bv_offset = 0; bvec[i].bv_page = pl->page; tail -= bvec[i].bv_len; pl = pl->next; i++; } } job->bvec = bvec; r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx, io_callback, job); printk("do_fetch end"); return r; } } /* * Store data to the cache source device asynchronously. * For a READ bio request, the data fetched from the source device are returned * to kernel and stored in cache at the same time. * For a WRITE bio request, the data are written to the cache and source device * at the same time. */ static int do_store(struct kcached_job *job) { int i, j, r = 0; struct bio *bio = job->bio ; struct cache_c *dmc = job->dmc; unsigned int offset, head, tail, remaining, nr_vecs; struct bio_vec *bvec; offset = (unsigned int) (bio->bi_sector & dmc->block_mask); head = to_bytes(offset); tail = to_bytes(dmc->block_size) - bio->bi_size - head; DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u", bio->bi_sector, job->src.sector, job->dest.sector, job->src.count, head, tail); if (0 == job->nr_pages) /* Original request is aligned with cache blocks */ r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx, io_callback, job); else { if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) { DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail); nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx; if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++; DPRINTK("Create %u new vecs", nr_vecs); bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL); if (!bvec) { DMERR("do_store: No memory"); return 1; } i = 0; while (head) { bvec[i].bv_len = min(head, job->bvec[i].bv_len); bvec[i].bv_offset = 0; bvec[i].bv_page = job->bvec[i].bv_page; head -= bvec[i].bv_len; i++; } remaining = bio->bi_size; j = bio->bi_idx; while (remaining) { bvec[i] = bio->bi_io_vec[j]; remaining -= bvec[i].bv_len; i++; j++; } j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE; bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) - j * PAGE_SIZE; bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset; bvec[i].bv_page = job->bvec[j].bv_page; tail -= bvec[i].bv_len; i++; j++; while (tail) { bvec[i] = job->bvec[j]; tail -= bvec[i].bv_len; i++; j++; } kfree(job->bvec); job->bvec = bvec; } r = dm_io_async_bvec(1, &job->dest, WRITE, job->bvec, io_callback, job); } return r; } static int do_io(struct kcached_job *job) { int r = 0; if (job->rw == READ) { /* Read from source device */ r = do_fetch(job); } else { /* Write to cache device */ r = do_store(job); } return r; } static int do_pages(struct kcached_job *job) { int r = 0; r = kcached_get_pages(job->dmc, job->nr_pages, &job->pages); if (r == -ENOMEM) /* can't complete now */ return 1; 
/* this job is ready for io */ push(&_io_jobs, job); return 0; } /* * Flush the bios that are waiting for this cache insertion or write back. */ static void flush_bios(struct cacheblock *cacheblock) { struct bio *bio; struct bio *n; spin_lock(&cacheblock->lock); bio = bio_list_get(&cacheblock->bios); if (is_state(cacheblock->state, WRITEBACK)) { /* Write back finished */ cacheblock->state = VALID; } else { /* Cache insertion finished */ set_state(cacheblock->state, VALID); clear_state(cacheblock->state, RESERVED); } spin_unlock(&cacheblock->lock); while (bio) { n = bio->bi_next; bio->bi_next = NULL; DPRINTK("Flush bio: %llu->%llu (%u bytes)", cacheblock->block, bio->bi_sector, bio->bi_size); generic_make_request(bio); bio = n; } } static int do_complete(struct kcached_job *job) { int r = 0; struct bio *bio = job->bio; DPRINTK("do_complete: %llu", bio->bi_sector); bio_endio(bio, 0); if (job->nr_pages > 0) { kfree(job->bvec); kcached_put_pages(job->dmc, job->pages); } flush_bios(job->cacheblock); mempool_free(job, _job_pool); if (atomic_dec_and_test(&job->dmc->nr_jobs)) wake_up(&job->dmc->destroyq); return r; } /* * Run through a list for as long as possible. Returns the count * of successful jobs. */ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcached_job *)) { struct kcached_job *job; int r, count = 0; while ((job = pop(jobs))) { r = fn(job); if (r < 0) { /* error this rogue job */ DMERR("process_jobs: Job processing error"); } if (r > 0) { /* * We couldn't service this job ATM, so * push this job back onto the list. */ push(jobs, job); break; } count++; } return count; } static void do_work(struct work_struct *ignored) { process_jobs(&_complete_jobs, do_complete); process_jobs(&_pages_jobs, do_pages); process_jobs(&_io_jobs, do_io); } static void queue_job(struct kcached_job *job) { atomic_inc(&job->dmc->nr_jobs); if (job->nr_pages > 0) /* Request pages */ push(&_pages_jobs, job); else /* Go ahead to do I/O */ push(&_io_jobs, job); wake(); } static int kcached_init(struct cache_c *dmc) { int r; spin_lock_init(&dmc->lock); dmc->pages = NULL; dmc->nr_pages = dmc->nr_free_pages = 0; r = alloc_bio_pages(dmc, DMCACHE_COPY_PAGES); if (r) { DMERR("kcached_init: Could not allocate bio pages"); return r; } init_waitqueue_head(&dmc->destroyq); atomic_set(&dmc->nr_jobs, 0); return 0; } void kcached_client_destroy(struct cache_c *dmc) { /* Wait for completion of all jobs submitted by this client. */ wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs)); free_bio_pages(dmc); } /**************************************************************************** * Functions for writing back dirty blocks. * We leverage kcopyd to write back dirty blocks because it is convenient to * use and it is not reasonble to reimplement the same function here. But we * need to reserve pages for both kcached and kcopyd. TODO: dynamically change * the number of reserved pages. 
****************************************************************************/ static void copy_callback(int read_err, unsigned int write_err, void *context) { struct cacheblock *cacheblock = (struct cacheblock *) context; flush_bios(cacheblock); } static void copy_block(struct cache_c *dmc, struct dm_io_region src, struct dm_io_region dest, struct cacheblock *cacheblock) { DPRINTK("Copying: %llu:%llu->%llu:%llu", src.sector, src.count * 512, dest.sector, dest.count * 512); dm_kcopyd_copy(dmc->kcp_client, &src, 1, &dest, 0, \ (dm_kcopyd_notify_fn) copy_callback, (void *)cacheblock); } static void write_back(struct cache_c *dmc, sector_t index, unsigned int length) { struct dm_io_region src, dest; struct cacheblock *cacheblock = &dmc->cache[index]; unsigned int i; DPRINTK("Write back block %llu(%llu, %u)", index, cacheblock->block, length); src.bdev = dmc->cache_dev->bdev; src.sector = index << dmc->block_shift; src.count = dmc->block_size * length; dest.bdev = dmc->src_dev->bdev; dest.sector = cacheblock->block; dest.count = dmc->block_size * length; for (i=0; i<length; i++) set_state(dmc->cache[index+i].state, WRITEBACK); dmc->dirty_blocks -= length; copy_block(dmc, src, dest, cacheblock); } /**************************************************************************** * Functions for implementing the various cache operations. ****************************************************************************/ /* * Map a block from the source device to a block in the cache device. */ static unsigned long hash_block(struct cache_c *dmc, sector_t block) { unsigned long set_number, value; value = (unsigned long)(block >> (dmc->block_shift + dmc->consecutive_shift)); set_number = hash_long(value, dmc->bits) / dmc->assoc; return set_number; } /* * Reset the LRU counters (the cache's global counter and each cache block's * counter). This seems to be a naive implementaion. However, consider the * rareness of this event, it might be more efficient that other more complex * schemes. TODO: a more elegant solution. */ static void cache_reset_counter(struct cache_c *dmc) { sector_t i; struct cacheblock *cache = dmc->cache; DPRINTK("Reset LRU counters"); for (i=0; i<dmc->size; i++) cache[i].counter = 0; dmc->counter = 0; } /* * lookup cache block * * return value : * 1: cache hit (cache_block stores the index of the matched block) * 0: cache miss but frame is allocated for insertion; cache_block stores the * frame's index: * If there are empty frames, then the first encounted is used. * If there are clean frames, then the LRU clean block is replaced. * 2: cache miss and frame is not allocated; cache_block stores the LRU dirty * block's index: * This happens when the entire set is dirty. * -1: cache miss and no room for insertion: * This happens when the entire set in transition modes (RESERVED or * WRITEBACK). 
* */ static int cache_lookup(struct cache_c *dmc, sector_t block, sector_t *cache_block) { unsigned long set_number = hash_block(dmc, block); sector_t index; int i, res; unsigned int cache_assoc = dmc->assoc; struct cacheblock *cache = dmc->cache; int invalid = -1, oldest = -1, oldest_clean = -1; unsigned long counter = ULONG_MAX, clean_counter = ULONG_MAX; index=set_number * cache_assoc; for (i=0; i<cache_assoc; i++, index++) { if (is_state(cache[index].state, VALID) || is_state(cache[index].state, RESERVED)) { if (cache[index].block == block) { *cache_block = index; /* Reset all counters if the largest one is going to overflow */ if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc); cache[index].counter = ++dmc->counter; break; } else { /* Don't consider blocks that are in the middle of copying */ if (!is_state(cache[index].state, RESERVED) && !is_state(cache[index].state, WRITEBACK)) { if (!is_state(cache[index].state, DIRTY) && cache[index].counter < clean_counter) { clean_counter = cache[index].counter; oldest_clean = i; } if (cache[index].counter < counter) { counter = cache[index].counter; oldest = i; } } } } else { if (-1 == invalid) invalid = i; } } res = i < cache_assoc ? 1 : 0; if (!res) { /* Cache miss */ if (invalid != -1) /* 第一个空闭可用的块 Choose the first empty frame */ *cache_block = set_number * cache_assoc + invalid; else if (oldest_clean != -1) /* LRU算法求干净的块 Choose the LRU clean block to replace */ *cache_block = set_number * cache_assoc + oldest_clean; else if (oldest != -1) { /* LRU算法求脏块 Choose the LRU dirty block to evict */ res = 2; *cache_block = set_number * cache_assoc + oldest; } else { res = -1; } } if (-1 == res) DPRINTK("Cache lookup: Block %llu(%lu):%s", block, set_number, "NO ROOM"); else DPRINTK("Cache lookup: Block %llu(%lu):%llu(%s)", block, set_number, *cache_block, 1 == res ? "HIT" : (0 == res ? "MISS" : "WB NEEDED")); return res; } /* * Insert a block into the cache (in the frame specified by cache_block). */ static int cache_insert(struct cache_c *dmc, sector_t block, sector_t cache_block) { struct cacheblock *cache = dmc->cache; /* Mark the block as RESERVED because although it is allocated, the data are not in place until kcopyd finishes its job. */ cache[cache_block].block = block; cache[cache_block].state = RESERVED; if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc); cache[cache_block].counter = ++dmc->counter; return 1; } /* * Invalidate a block (specified by cache_block) in the cache. */ static void cache_invalidate(struct cache_c *dmc, sector_t cache_block) { struct cacheblock *cache = dmc->cache; DPRINTK("Cache invalidate: Block %llu(%llu)", cache_block, cache[cache_block].block); clear_state(cache[cache_block].state, VALID); } /* * Handle a cache hit: * For READ, serve the request from cache is the block is ready; otherwise, * queue the request for later processing. * For write, invalidate the cache block if write-through. If write-back, * serve the request from cache if the block is ready, or queue the request * for later processing if otherwise. 
*/ static int cache_hit(struct cache_c *dmc, struct bio* bio, sector_t cache_block) { unsigned int offset = (unsigned int)(bio->bi_sector & dmc->block_mask); struct cacheblock *cache = dmc->cache; dmc->cache_hits++; if (bio_data_dir(bio) == READ) { /* READ hit */ bio->bi_bdev = dmc->cache_dev->bdev; bio->bi_sector = (cache_block << dmc->block_shift) + offset; spin_lock(&cache[cache_block].lock); if (is_state(cache[cache_block].state, VALID)) { /* Valid cache block */ spin_unlock(&cache[cache_block].lock); return 1; } /* Cache block is not ready yet */ DPRINTK("Add to bio list %s(%llu)", dmc->cache_dev->name, bio->bi_sector); bio_list_add(&cache[cache_block].bios, bio); spin_unlock(&cache[cache_block].lock); return 0; } else { /* WRITE hit */ if (dmc->write_policy == WRITE_THROUGH) { /* Invalidate cached data */ cache_invalidate(dmc, cache_block); bio->bi_bdev = dmc->src_dev->bdev; return 1; } /* Write delay */ if (!is_state(cache[cache_block].state, DIRTY)) { set_state(cache[cache_block].state, DIRTY); dmc->dirty_blocks++; } spin_lock(&cache[cache_block].lock); /* In the middle of write back */ if (is_state(cache[cache_block].state, WRITEBACK)) { /* Delay this write until the block is written back */ bio->bi_bdev = dmc->src_dev->bdev; DPRINTK("Add to bio list %s(%llu)", dmc->src_dev->name, bio->bi_sector); bio_list_add(&cache[cache_block].bios, bio); spin_unlock(&cache[cache_block].lock); return 0; } /* Cache block not ready yet */ if (is_state(cache[cache_block].state, RESERVED)) { bio->bi_bdev = dmc->cache_dev->bdev; bio->bi_sector = (cache_block << dmc->block_shift) + offset; DPRINTK("Add to bio list %s(%llu)", dmc->cache_dev->name, bio->bi_sector); bio_list_add(&cache[cache_block].bios, bio); spin_unlock(&cache[cache_block].lock); return 0; } /* Serve the request from cache */ bio->bi_bdev = dmc->cache_dev->bdev; bio->bi_sector = (cache_block << dmc->block_shift) + offset; spin_unlock(&cache[cache_block].lock); return 1; } } static struct kcached_job *new_kcached_job(struct cache_c *dmc, struct bio* bio, sector_t request_block, sector_t cache_block) { struct dm_io_region src, dest; struct kcached_job *job; src.bdev = dmc->src_dev->bdev; src.sector = request_block; src.count = dmc->block_size; dest.bdev = dmc->cache_dev->bdev; dest.sector = cache_block << dmc->block_shift; dest.count = src.count; job = mempool_alloc(_job_pool, GFP_NOIO); job->dmc = dmc; job->bio = bio; job->src = src; job->dest = dest; job->cacheblock = &dmc->cache[cache_block]; return job; } /* * Handle a read cache miss: * Update the metadata; fetch the necessary block from source device; * store data to cache device. 
*/ static int cache_read_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) { struct cacheblock *cache = dmc->cache; unsigned int offset, head, tail; struct kcached_job *job; sector_t request_block, left; offset = (unsigned int)(bio->bi_sector & dmc->block_mask); request_block = bio->bi_sector - offset; if (cache[cache_block].state & VALID) { DPRINTK("Replacing %llu->%llu", cache[cache_block].block, request_block); dmc->replace++; } else DPRINTK("Insert block %llu at empty frame %llu", request_block, cache_block); cache_insert(dmc, request_block, cache_block); /* Update metadata first */ job = new_kcached_job(dmc, bio, request_block, cache_block); head = to_bytes(offset); left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block; if (left < dmc->block_size) { tail = to_bytes(left) - bio->bi_size - head; job->src.count = left; job->dest.count = left; } else tail = to_bytes(dmc->block_size) - bio->bi_size - head; /* Requested block is aligned with a cache block */ if (0 == head && 0 == tail) job->nr_pages= 0; else /* Need new pages to store extra data */ job->nr_pages = dm_div_up(head, PAGE_SIZE) + dm_div_up(tail, PAGE_SIZE); job->rw = READ; /* Fetch data from the source device */ DPRINTK("Queue job for %llu (need %u pages)", bio->bi_sector, job->nr_pages); queue_job(job); return 0; } /* * Handle a write cache miss: * If write-through, forward the request to source device. * If write-back, update the metadata; fetch the necessary block from source * device; write to cache device. */ static int cache_write_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) { struct cacheblock *cache = dmc->cache; unsigned int offset, head, tail; struct kcached_job *job; sector_t request_block, left; if (dmc->write_policy == WRITE_THROUGH) { /* Forward request to souuce */ bio->bi_bdev = dmc->src_dev->bdev; return 1; } offset = (unsigned int)(bio->bi_sector & dmc->block_mask); request_block = bio->bi_sector - offset; if (cache[cache_block].state & VALID) { DPRINTK("Replacing %llu->%llu", cache[cache_block].block, request_block); dmc->replace++; } else DPRINTK("Insert block %llu at empty frame %llu", request_block, cache_block); /* Write delay */ cache_insert(dmc, request_block, cache_block); /* Update metadata first */ set_state(cache[cache_block].state, DIRTY); dmc->dirty_blocks++; job = new_kcached_job(dmc, bio, request_block, cache_block); head = to_bytes(offset); left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block; if (left < dmc->block_size) { tail = to_bytes(left) - bio->bi_size - head; job->src.count = left; job->dest.count = left; } else tail = to_bytes(dmc->block_size) - bio->bi_size - head; if (0 == head && 0 == tail) { /* Requested is aligned with a cache block */ job->nr_pages = 0; job->rw = WRITE; } else if (head && tail){ /* Special case: need to pad both head and tail */ job->nr_pages = dm_div_up(to_bytes(job->src.count), PAGE_SIZE); job->rw = READ; } else { if (head) { /* Fetch only head */ job->src.count = to_sector(head); job->nr_pages = dm_div_up(head, PAGE_SIZE); } else { /* Fetch only tail */ job->src.sector = bio->bi_sector + to_sector(bio->bi_size); job->src.count = to_sector(tail); job->nr_pages = dm_div_up(tail, PAGE_SIZE); } job->rw = READ; } queue_job(job); return 0; } /* Handle cache misses */ static int cache_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) { if (bio_data_dir(bio) == READ) return cache_read_miss(dmc, bio, cache_block); else return cache_write_miss(dmc, bio, cache_block); } 
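To make the head/tail padding used by the miss handlers above (and by do_fetch()/do_store() earlier) concrete, here is a minimal standalone sketch of the same arithmetic. It is plain userspace C, not part of dm-cache.c, and the sector and size values are invented for illustration:

#include <stdio.h>

int main(void)
{
	unsigned int block_size = 8;               /* sectors per cache block (DEFAULT_BLOCK_SIZE) */
	unsigned int block_mask = block_size - 1;  /* = 7, as in dmc->block_mask */
	unsigned long long bi_sector = 1003ULL;    /* made-up bio start sector   */
	unsigned int bi_size = 1024;               /* made-up bio length in bytes */

	unsigned int offset = (unsigned int)(bi_sector & block_mask);  /* 3 sectors into the block  */
	unsigned int head = offset * 512;                              /* 1536 bytes before the bio */
	unsigned int tail = block_size * 512 - bi_size - head;         /* 1536 bytes after the bio  */

	/* head != 0 && tail != 0 is the "special case" branch: extra pages
	 * are needed on both sides of the bio's own data. */
	printf("offset=%u sectors, head=%u bytes, tail=%u bytes\n", offset, head, tail);
	return 0;
}

With head and tail both non-zero, do_fetch() fills in the data around the bio from the source device (in the write special case it simply reads the whole block into the reserved pages), and do_store() then writes the reassembled full block to the cache device.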
/**************************************************************************** * Functions for implementing the operations on a cache mapping. ****************************************************************************/ /* * Decide the mapping and perform necessary cache operations for a bio request. */ static int cache_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { struct cache_c *dmc = (struct cache_c *) ti->private; sector_t request_block, cache_block = 0, offset; int res; offset = bio->bi_sector & dmc->block_mask; request_block = bio->bi_sector - offset; DPRINTK("Got a %s for %llu ((%llu:%llu), %u bytes)", bio_rw(bio) == WRITE ? "WRITE" : (bio_rw(bio) == READ ? "READ":"READA"), bio->bi_sector, request_block, offset, bio->bi_size); if (bio_data_dir(bio) == READ) dmc->reads++; else dmc->writes++; res = cache_lookup(dmc, request_block, &cache_block); if (1 == res) /* Cache hit; server request from cache */ return cache_hit(dmc, bio, cache_block); else if (0 == res) /* Cache miss; replacement block is found */ return cache_miss(dmc, bio, cache_block); else if (2 == res) { /* Entire cache set is dirty; initiate a write-back */ write_back(dmc, cache_block, 1); dmc->writeback++; } /* Forward to source device */ bio->bi_bdev = dmc->src_dev->bdev; return 1; } struct meta_dmc { sector_t size; unsigned int block_size; unsigned int assoc; unsigned int write_policy; unsigned int chksum; }; /* Load metadata stored by previous session from disk. */ static int load_metadata(struct cache_c *dmc) { struct dm_io_region where; unsigned long bits; sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9; sector_t meta_size, *meta_data, i, j, index = 0, limit, order; struct meta_dmc *meta_dmc; unsigned int chksum = 0, chksum_sav, consecutive_blocks; meta_dmc = (struct meta_dmc *)vmalloc(512); if (!meta_dmc) { DMERR("load_metadata: Unable to allocate memory"); return 1; } where.bdev = dmc->cache_dev->bdev; where.sector = dev_size - 1; where.count = 1; dm_io_sync_vm(1, &where, READ, meta_dmc, &bits, dmc); DPRINTK("Loaded cache conf: block size(%u), cache size(%llu), " \ "associativity(%u), write policy(%u), chksum(%u)", meta_dmc->block_size, meta_dmc->size, meta_dmc->assoc, meta_dmc->write_policy, meta_dmc->chksum); dmc->block_size = meta_dmc->block_size; dmc->block_shift = ffs(dmc->block_size) - 1; dmc->block_mask = dmc->block_size - 1; dmc->size = meta_dmc->size; dmc->bits = ffs(dmc->size) - 1; dmc->assoc = meta_dmc->assoc; consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ? dmc->assoc : CONSECUTIVE_BLOCKS; dmc->consecutive_shift = ffs(consecutive_blocks) - 1; dmc->write_policy = meta_dmc->write_policy; chksum_sav = meta_dmc->chksum; vfree((void *)meta_dmc); order = dmc->size * sizeof(struct cacheblock); DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache" \ "(capacity:%lluMB, associativity:%u, block size:%u " \ "sectors(%uKB), %s)", (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock), (unsigned long long) dmc->size, (unsigned long long) dmc->size * dmc->block_size >> (20-SECTOR_SHIFT), dmc->assoc, dmc->block_size, dmc->block_size >> (10-SECTOR_SHIFT), dmc->write_policy ? "write-back" : "write-through"); dmc->cache = (struct cacheblock *)vmalloc(order); if (!dmc->cache) { DMERR("load_metadata: Unable to allocate memory"); return 1; } meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512); /* When requesting a new bio, the number of requested bvecs has to be less than BIO_MAX_PAGES. Otherwise, null is returned. 
In dm-io.c, this return value is not checked and kernel Oops may happen. We set the limit here to avoid such situations. (2 additional bvecs are required by dm-io for bookeeping.) */ limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT); meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit))); if (!meta_data) { DMERR("load_metadata: Unable to allocate memory"); vfree((void *)dmc->cache); return 1; } while(index < meta_size) { where.sector = dev_size - 1 - meta_size + index; where.count = min(meta_size - index, limit); dm_io_sync_vm(1, &where, READ, meta_data, &bits, dmc); for (i=to_bytes(index)/sizeof(sector_t), j=0; j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size; i++, j++) { if(meta_data[j]) { dmc->cache[i].block = meta_data[j]; dmc->cache[i].state = 1; } else dmc->cache[i].state = 0; } chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum); index += where.count; } vfree((void *)meta_data); if (chksum != chksum_sav) { /* Check the checksum of the metadata */ DPRINTK("Cache metadata loaded from disk is corrupted"); vfree((void *)dmc->cache); return 1; } DMINFO("Cache metadata loaded from disk (offset %llu)", (unsigned long long) dev_size - 1 - (unsigned long long) meta_size);; return 0; } /* Store metadata onto disk. */ static int dump_metadata(struct cache_c *dmc) { struct dm_io_region where; unsigned long bits; sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9; sector_t meta_size, i, j, index = 0, limit, *meta_data; struct meta_dmc *meta_dmc; unsigned int chksum = 0; meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512); limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT); meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit))); if (!meta_data) { DMERR("dump_metadata: Unable to allocate memory"); return 1; } where.bdev = dmc->cache_dev->bdev; while(index < meta_size) { where.sector = dev_size - 1 - meta_size + index; where.count = min(meta_size - index, limit); for (i=to_bytes(index)/sizeof(sector_t), j=0; j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size; i++, j++) { /* Assume all invalid cache blocks store 0. We lose the block that * is actually mapped to offset 0. */ meta_data[j] = dmc->cache[i].state ? dmc->cache[i].block : 0; } chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum); dm_io_sync_vm(1, &where, WRITE, meta_data, &bits, dmc); index += where.count; } vfree((void *)meta_data); meta_dmc = (struct meta_dmc *)vmalloc(512); if (!meta_dmc) { DMERR("dump_metadata: Unable to allocate memory"); return 1; } meta_dmc->block_size = dmc->block_size; meta_dmc->size = dmc->size; meta_dmc->assoc = dmc->assoc; meta_dmc->write_policy = dmc->write_policy; meta_dmc->chksum = chksum; DPRINTK("Store metadata to disk: block size(%u), cache size(%llu), " \ "associativity(%u), write policy(%u), checksum(%u)", meta_dmc->block_size, (unsigned long long) meta_dmc->size, meta_dmc->assoc, meta_dmc->write_policy, meta_dmc->chksum); where.sector = dev_size - 1; where.count = 1; dm_io_sync_vm(1, &where, WRITE, meta_dmc, &bits, dmc); vfree((void *)meta_dmc); DMINFO("Cache metadata saved to disk (offset %llu)", (unsigned long long) dev_size - 1 - (unsigned long long) meta_size); return 0; } /* * Construct a cache mapping. * arg[0]: path to source device * arg[1]: path to cache device * arg[2]: cache persistence (if set, cache conf is loaded from disk) * Cache configuration parameters (if not set, default values are used. 
* arg[3]: cache block size (in sectors) * arg[4]: cache size (in blocks) * arg[5]: cache associativity * arg[6]: write caching policy */ static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct cache_c *dmc; unsigned int consecutive_blocks, persistence = 0; sector_t localsize, i, order; sector_t data_size, meta_size, dev_size; unsigned long long cache_size; int r = -EINVAL; if (argc < 2) { ti->error = "dm-cache: Need at least 2 arguments (src dev and cache dev)"; goto bad; } dmc = kmalloc(sizeof(*dmc), GFP_KERNEL); if (dmc == NULL) { ti->error = "dm-cache: Failed to allocate cache context"; r = ENOMEM; goto bad; } r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dmc->src_dev); if (r) { ti->error = "dm-cache: Source device lookup failed"; goto bad1; } r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dmc->cache_dev); if (r) { ti->error = "dm-cache: Cache device lookup failed"; goto bad2; } dmc->io_client = dm_io_client_create(); if (IS_ERR(dmc->io_client)) { r = PTR_ERR(dmc->io_client); ti->error = "Failed to create io client\n"; goto bad3; } dmc->kcp_client = dm_kcopyd_client_create(); if (dmc->kcp_client == NULL) { ti->error = "Failed to initialize kcopyd client\n"; goto bad4; } r = kcached_init(dmc); if (r) { ti->error = "Failed to initialize kcached"; goto bad5; } if (argc >= 3) { if (sscanf(argv[2], "%u", &persistence) != 1) { ti->error = "dm-cache: Invalid cache persistence"; r = -EINVAL; goto bad6; } } if (1 == persistence) { if (load_metadata(dmc)) { ti->error = "dm-cache: Invalid cache configuration"; r = -EINVAL; goto bad6; } goto init; /* Skip reading cache parameters from command line */ } else if (persistence != 0) { ti->error = "dm-cache: Invalid cache persistence"; r = -EINVAL; goto bad6; } if (argc >= 4) { if (sscanf(argv[3], "%u", &dmc->block_size) != 1) { ti->error = "dm-cache: Invalid block size"; r = -EINVAL; goto bad6; } if (!dmc->block_size || (dmc->block_size & (dmc->block_size - 1))) { ti->error = "dm-cache: Invalid block size"; r = -EINVAL; goto bad6; } } else dmc->block_size = DEFAULT_BLOCK_SIZE; dmc->block_shift = ffs(dmc->block_size) - 1; dmc->block_mask = dmc->block_size - 1; if (argc >= 5) { if (sscanf(argv[4], "%llu", &cache_size) != 1) { ti->error = "dm-cache: Invalid cache size"; r = -EINVAL; goto bad6; } dmc->size = (sector_t) cache_size; if (!dmc->size || (dmc->size & (dmc->size - 1))) { ti->error = "dm-cache: Invalid cache size"; r = -EINVAL; goto bad6; } } else dmc->size = DEFAULT_CACHE_SIZE; localsize = dmc->size; dmc->bits = ffs(dmc->size) - 1; if (argc >= 6) { if (sscanf(argv[5], "%u", &dmc->assoc) != 1) { ti->error = "dm-cache: Invalid cache associativity"; r = -EINVAL; goto bad6; } if (!dmc->assoc || (dmc->assoc & (dmc->assoc - 1)) || dmc->size < dmc->assoc) { ti->error = "dm-cache: Invalid cache associativity"; r = -EINVAL; goto bad6; } } else dmc->assoc = DEFAULT_CACHE_ASSOC; DMINFO("%lld", dmc->cache_dev->bdev->bd_inode->i_size); dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9; data_size = dmc->size * dmc->block_size; meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512) + 1; if ((data_size + meta_size) > dev_size) { DMERR("Requested cache size exeeds the cache device's capacity" \ "(%llu+%llu>%llu)", (unsigned long long) data_size, (unsigned long long) meta_size, (unsigned long long) dev_size); ti->error = "dm-cache: Invalid cache size"; r = -EINVAL; goto bad6; } consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ? 
dmc->assoc : CONSECUTIVE_BLOCKS; dmc->consecutive_shift = ffs(consecutive_blocks) - 1; if (argc >= 7) { if (sscanf(argv[6], "%u", &dmc->write_policy) != 1) { ti->error = "dm-cache: Invalid cache write policy"; r = -EINVAL; goto bad6; } if (dmc->write_policy != 0 && dmc->write_policy != 1) { ti->error = "dm-cache: Invalid cache write policy"; r = -EINVAL; goto bad6; } } else dmc->write_policy = DEFAULT_WRITE_POLICY; order = dmc->size * sizeof(struct cacheblock); localsize = data_size >> 11; DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache" \ "(capacity:%lluMB, associativity:%u, block size:%u " \ "sectors(%uKB), %s)", (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock), (unsigned long long) dmc->size, (unsigned long long) data_size >> (20-SECTOR_SHIFT), dmc->assoc, dmc->block_size, dmc->block_size >> (10-SECTOR_SHIFT), dmc->write_policy ? "write-back" : "write-through"); dmc->cache = (struct cacheblock *)vmalloc(order); if (!dmc->cache) { ti->error = "Unable to allocate memory"; r = -ENOMEM; goto bad6; } init: /* Initialize the cache structs */ for (i=0; i<dmc->size; i++) { bio_list_init(&dmc->cache[i].bios); if(!persistence) dmc->cache[i].state = 0; dmc->cache[i].counter = 0; spin_lock_init(&dmc->cache[i].lock); } dmc->counter = 0; dmc->dirty_blocks = 0; dmc->reads = 0; dmc->writes = 0; dmc->cache_hits = 0; dmc->replace = 0; dmc->writeback = 0; dmc->dirty = 0; ti->split_io = dmc->block_size; ti->private = dmc; return 0; bad6: kcached_client_destroy(dmc); bad5: dm_kcopyd_client_destroy(dmc->kcp_client); bad4: dm_io_client_destroy(dmc->io_client); bad3: dm_put_device(ti, dmc->cache_dev); bad2: dm_put_device(ti, dmc->src_dev); bad1: kfree(dmc); bad: return r; } static void cache_flush(struct cache_c *dmc) { struct cacheblock *cache = dmc->cache; sector_t i = 0; unsigned int j; DMINFO("Flush dirty blocks (%llu) ...", (unsigned long long) dmc->dirty_blocks); while (i< dmc->size) { j = 1; if (is_state(cache[i].state, DIRTY)) { while ((i+j) < dmc->size && is_state(cache[i+j].state, DIRTY) && (cache[i+j].block == cache[i].block + j * dmc->block_size)) { j++; } dmc->dirty += j; write_back(dmc, i, j); } i += j; } } /* * Destroy the cache mapping. */ static void cache_dtr(struct dm_target *ti) { struct cache_c *dmc = (struct cache_c *) ti->private; if (dmc->dirty_blocks > 0) cache_flush(dmc); kcached_client_destroy(dmc); dm_kcopyd_client_destroy(dmc->kcp_client); if (dmc->reads + dmc->writes > 0) DMINFO("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu)," \ "replacement(%lu), replaced dirty blocks(%lu), " \ "flushed dirty blocks(%lu)", dmc->reads, dmc->writes, dmc->cache_hits, dmc->cache_hits * 100 / (dmc->reads + dmc->writes), dmc->replace, dmc->writeback, dmc->dirty); dump_metadata(dmc); /* Always dump metadata to disk before exit */ vfree((void *)dmc->cache); dm_io_client_destroy(dmc->io_client); dm_put_device(ti, dmc->src_dev); dm_put_device(ti, dmc->cache_dev); kfree(dmc); } /* * Report cache status: * Output cache stats upon request of device status; * Output cache configuration upon request of table status. */ static int cache_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct cache_c *dmc = (struct cache_c *) ti->private; int sz = 0; switch (type) { case STATUSTYPE_INFO: DMEMIT("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu)," \ "replacement(%lu), replaced dirty blocks(%lu)", dmc->reads, dmc->writes, dmc->cache_hits, (dmc->reads + dmc->writes) > 0 ? 
\ dmc->cache_hits * 100 / (dmc->reads + dmc->writes) : 0, dmc->replace, dmc->writeback); break; case STATUSTYPE_TABLE: DMEMIT("conf: capacity(%lluM), associativity(%u), block size(%uK), %s", (unsigned long long) dmc->size * dmc->block_size >> 11, dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT), dmc->write_policy ? "write-back":"write-through"); break; } return 0; } /**************************************************************************** * Functions for manipulating a cache target. ****************************************************************************/ static struct target_type cache_target = { .name = "cache", .version= {1, 0, 1}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, .map = cache_map, .status = cache_status, }; /* * Initiate a cache target. */ int __init dm_cache_init(void) { int r; r = jobs_init();// _job_cache ,_job_pool 初始化 if (r) return r; //创建工作队列 _kcached_wq = create_singlethread_workqueue("kcached"); if (!_kcached_wq) { DMERR("failed to start kcached"); return -ENOMEM; } INIT_WORK(&_kcached_work, do_work); //创建工作 r = dm_register_target(&cache_target); //注册dm-cache target if (r < 0) { DMERR("cache: register failed %d", r); destroy_workqueue(_kcached_wq); } return r; } /* * Destroy a cache target. */ static void __exit dm_cache_exit(void) { dm_unregister_target(&cache_target); jobs_exit(); destroy_workqueue(_kcached_wq); } module_init(dm_cache_init); module_exit(dm_cache_exit); MODULE_DESCRIPTION(DM_NAME " cache target"); MODULE_AUTHOR("Ming Zhao <[email protected]>"); MODULE_LICENSE("GPL");
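Finally, a worked example of the set mapping, since the shift-and-hash arithmetic in hash_block() and cache_lookup() is easy to misread. This is a standalone userspace sketch using the default parameters; hash_long_sketch() is only a stand-in for the kernel's hash_long() (different constant, so the actual set numbers will differ), and the request sector is an arbitrary example:

#include <stdio.h>

/* Stand-in for the kernel's hash_long(); same shape, different constant. */
static unsigned long hash_long_sketch(unsigned long long v, unsigned int bits)
{
	return (unsigned long)((v * 0x9E3779B97F4A7C15ULL) >> (64 - bits));
}

int main(void)
{
	unsigned int block_shift = 3;        /* log2(DEFAULT_BLOCK_SIZE = 8 sectors)    */
	unsigned int consecutive_shift = 9;  /* log2(CONSECUTIVE_BLOCKS = 512)          */
	unsigned int bits = 16;              /* log2(DEFAULT_CACHE_SIZE = 65536 blocks) */
	unsigned int assoc = 1024;           /* DEFAULT_CACHE_ASSOC                     */
	unsigned long long request_sector = 123456789ULL;   /* arbitrary example */

	/* hash_block(): 512 consecutive cache-block-sized extents share one
	 * hash value, so sequential data stays within a single set. */
	unsigned long long value = request_sector >> (block_shift + consecutive_shift);
	unsigned long set = hash_long_sketch(value, bits) / assoc;   /* 0..63 here */
	unsigned long first_frame = set * (unsigned long)assoc;      /* start index in dmc->cache[] */

	printf("sector %llu -> set %lu (frames %lu..%lu)\n",
	       request_sector, set, first_frame, first_frame + assoc - 1);
	return 0;
}

cache_lookup() then scans only those assoc frames: a matching VALID/RESERVED block is a hit; otherwise it prefers the first INVALID frame, then the least-recently-used clean frame, and only then (return value 2) the least-recently-used dirty frame so that cache_map() can kick off write_back() while forwarding the request to the source device. Mapping CONSECUTIVE_BLOCKS-sized runs into one set is what gives the run-coalescing in cache_flush()/write_back() a chance to merge adjacent dirty blocks into a single kcopyd copy.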