soft raid5阅读笔记之七--MD中的bitmap

本节主要介绍MD中bitmap的机制,该机制主要用于减少不必要的同步操作。在真正的数据IO写操作之前,先将该chunk在内存bitmap中对应的bit位设置为1,并写入磁盘上的bitmap文件;在真正的数据写完成之后,再将bitmap文件中的该bit位清零。这样,每进行一次IO写操作,就多了两次磁盘写操作,势必影响IO的效率。因此,linux内核对这部分做了两个方面的优化:1)批量写入;2)延迟清除。这样bitmap的操作先在缓存中进行,必要时再写入磁盘。
在进入主题之前,先看看这部分涉及的几个主要的数据结构:
1)超级块位于bitmap磁盘文件开头的256个字节,用于记录bitmap文件的管理信息,主要的域为chunksize(bitmap文件中一个bit所对应的chunk的大小)。
/*
 * Superblock of the bitmap file. The number at the start of each field
 * comment is the field's byte offset: the named fields cover bytes 0-63 and
 * pad[] extends the structure to 256 bytes.
 */
typedef struct bitmap_super_s {
     __le32 magic;        /*  0  BITMAP_MAGIC */
     __le32 version;      /*  4  the bitmap major for now, could change... */
     __u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
     __le64 events;       /* 24  event counter for the bitmap (1)*/
     __le64 events_cleared;/*32  event counter when last bit cleared (2) */
     __le64 sync_size;    /* 40  the size of the md device's sync range(3) */
     __le32 state;        /* 48  bitmap state information */
     __le32 chunksize;    /* 52  the bitmap chunk size in bytes */
     __le32 daemon_sleep; /* 56  seconds between disk flushes */
     __le32 write_behind; /* 60  number of outstanding write-behind writes */

     __u8  pad[256 - 64]; /* set to zero; fills the superblock up to 256 bytes */
} bitmap_super_t;


2)从注释可以看出,该结构体代表了bitmap在内存中的页;
/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
     /*
     * map points to the actual memory page
     */
     char *map;
     /*
     * in emergencies (when map cannot be alloced), hijack the map
     * pointer and use it as two counters itself. The accompanying notes
     * explain this works because one counter is 16 bits wide, so the
     * pointer's storage can hold two of them.
     */
     unsigned int hijacked:1;
     /*
     * count of dirty bits on the page
     */
     unsigned int  count:31;
};


3)bitmap在磁盘中的文件表现,每个mddev(磁盘阵列)包含一个bitmap文件。
/* the main bitmap structure - one per mddev */
struct bitmap {
     struct bitmap_page *bp;     /* array of in-memory pages backing the bitmap */
     unsigned long pages; /* total number of pages in the bitmap (pages the bitmap occupies in memory) */
     unsigned long missing_pages; /* number of pages not yet allocated */

     mddev_t *mddev; /* the md device that the bitmap is for */

     int counter_bits; /* how many bits per block counter */

     /* bitmap chunksize -- how much data does each bit represent? */
     unsigned long chunksize;
     unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
     unsigned long chunks; /* total number of data chunks for the array */

     /* We hold a count on the chunk currently being synced, and drop
     * it when the last block is started.  If the resync is aborted
     * midway, we need to be able to drop that count, so we remember
     * the counted chunk..
     */
     unsigned long syncchunk;

     __u64     events_cleared;
     int need_sync;

     /* bitmap spinlock */
     spinlock_t lock;
     /*
     * The bitmap is stored in one of two places:
     * 1) outside the MD device, in which case file refers to the bitmap file;
     * 2) inside the MD device, in which case offset is the distance of the
     *    bitmap from the superblock.
     */
     long offset; /* offset from superblock if file is NULL */
     struct file *file; /* backing disk file */
     struct page *sb_page; /* cached copy of the bitmap file superblock (the memory page holding it) */
     struct page **filemap; /* list of cache pages for the file (array of pointers to the pages the file is mapped into) */
     unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
     unsigned long file_pages; /* number of pages in the file */
     int last_page_size; /* bytes in the last page */

     unsigned long flags;

     int allclean;

     unsigned long max_write_behind; /* write-behind mode */
     atomic_t behind_writes;

     /*
     * the bitmap daemon - periodically wakes up and sweeps the bitmap
     * file, cleaning up bits and flushing out pages to disk as necessary
     */
     unsigned long daemon_lastrun; /* jiffies of last run */
     unsigned long daemon_sleep; /* how many seconds between updates? */
     unsigned long last_end_sync; /* when we last called end_sync to
                          * update bitmap with resync progress */

     atomic_t pending_writes; /* pending writes to the bitmap file */
     wait_queue_head_t write_wait;
     wait_queue_head_t overflow_wait;

};
/*
 * in-memory bitmap:
 * (内存中的bitmap:使用16位的块计数器来跟踪每个chunk上挂起的写请求的数量;
 *  高两位用于特殊的目的:第一位表示是否需要同步,第二位表示同步是否处于激活状态)
 *
 * Use 16 bit block counters to track pending writes to each "chunk".
 * The 2 high order bits are special-purpose, the first is a flag indicating
 * whether a resync is needed.  The second is a flag indicating whether a
 * resync is active.
 * This means that the counter is actually 14 bits:
 *
 * +--------+--------+------------------------------------------------+
 * | resync | resync |               counter                          |
 * | needed | active |                                                |
 * |  (0-1) |  (0-1) |              (0-16383)                         |
 * +--------+--------+------------------------------------------------+
 */


下面我们来看看相关的一些宏定义,有助于理解bitmap的实现:
/* number of bits in one page; with a 4KB page: 4KB * 8 = 32K bits */
#define PAGE_BITS (PAGE_SIZE << 3)
/* shift-count form of PAGE_BITS; with PAGE_SHIFT = 12: 12 + 3 = 15 */
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)

typedef __u16 bitmap_counter_t;
#define COUNTER_BITS 16          /* number of bits in one counter */
#define COUNTER_BIT_SHIFT 4     /* shift-count form of COUNTER_BITS (16 = 1 << 4) */
/* bytes per counter: 16 / 8 = 2 */
#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
/* shift-count form of COUNTER_BYTE_RATIO: 4 - 3 = 1 */
#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)

/*
 * Layout of the pending-write counter: the top bit is the "resync needed"
 * flag, the next bit is the "resync active" flag, and the remaining
 * COUNTER_BITS - 2 bits hold the count (so COUNTER_MAX is RESYNC_MASK - 1).
 */
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
/* the argument is parenthesised so that an expression argument is cast as a whole */
#define NEEDED(x) (((bitmap_counter_t) (x)) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) (x)) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) (x)) & COUNTER_MAX)

/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)     /* number of counters held by one page */
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)     /* shift-count form of the counters-per-page ratio */
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)          /* mask form of the counters-per-page ratio */

#define BITMAP_BLOCK_SIZE 512     /* bitmap block size; the notes equate it with the sector size */
#define BITMAP_BLOCK_SHIFT 9     /* shift-count form of the bitmap block size (512 = 1 << 9) */

/* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)     /* number of blocks in one chunk */
#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)     /* the same, as a shift count */
#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)                    /* the same, as a mask */

/* when hijacked, the counters and bits represent even larger "chunks" */
/* there will be 1024 chunks represented by each counter in the page pointers */
/*
 * The ratio is the per-chunk block count scaled by half the number of
 * counters in a page (shift left by PAGE_COUNTER_SHIFT, then right by 1).
 */
#define PAGEPTR_BLOCK_RATIO(bitmap) \
               (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
#define PAGEPTR_BLOCK_SHIFT(bitmap) \
               (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)

/*
* on-disk bitmap:
*
* Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
* file a page at a time. There's a superblock at the start of the file.
*/

/* map chunks (bits) to file pages - offset by the size of the superblock */
#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))     /* bit offset of a chunk: chunk number plus the superblock size in bits */


在介绍函数调用关系之前,先介绍一下bitmap内存页的状态标志:
/* Attribute flags kept for each page of the bitmap file. */
enum bitmap_page_attr {
     BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced (to disk); per the notes, set before the data write
     BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared; per the notes, set after the data write completes
     BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced (to disk)
};


下面重点分析函数的调用关系,主要包含以下几个部分:
1)设置 BITMAP_PAGE_DIRTY:在发送写请求make_request()中调用add_stripe_bio(),进而调用bitmap_startwrite(),该函数就是通过调用set_page_attr()函数来 设置 BITMAP_PAGE_DIRTY;
soft raid5阅读笔记之七--MD中的bitmap_第1张图片
2)在守护线程中,将标志位 BITMAP_PAGE_DIRTY的内存页写入到磁盘文件中:

3)在写操作完成后,调用bitmap_endwrite()函数,完成对 BITMAP_PAGE_CLEAN状态信息的设置:

4)在守护线程中,调用bitmap_daemon_work()完成对bitmap文件的磁盘写操作:
soft raid5阅读笔记之七--MD中的bitmap_第2张图片
在这里,我们重点分析下面几个函数:
1)bitmap_daemon_work():该函数的主要功能是清除bit位,并将bitmap内存页写入到磁盘文件中;但是过程比较难理解,需要三次调用该函数才能完成:
  1. 第一次进入,先清除BITMAP_PAGE_CLEAN状态信息,而此时的*bmc=2,做*bmc--后重新设置BITMAP_PAGE_CLEAN状态信息;
  2. 第二次进入,*bmc--后清除掉BITMAP_PAGE_CLEAN状态信息,同时设置BITMAP_PAGE_NEEDWRITE状态信息;
  3. 第三次进入,才是真正调用write_page()完成对bitmap磁盘文件的写入操作;

/*
 * bitmap_startwrite - update the bitmap write counters for a new write.
 *
 * Walks the sector range in steps reported by bitmap_get_counter() and
 * increments the counter found for each step. Per the accompanying notes it
 * is called from add_stripe_bio() when conf->mddev->bitmap && firstwrite
 * (a bitmap exists and this is the first write).
 *
 * @bitmap:  in-memory bitmap structure; NULL makes the call a no-op
 * @offset:  starting sector of the write
 * @sectors: number of sectors in the write request
 * @behind:  non-zero if this is a write-behind request
 *
 * Returns 0 on every path.
 */
int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
{
     if (!bitmap) return 0;

     if (behind) {
          atomic_inc(&bitmap->behind_writes);
          /* max_write_behind is unsigned long, hence %lu */
          PRINTK(KERN_DEBUG "inc write-behind count %d/%lu\n",
            atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
     }

     while (sectors) {     /* advance through the range, updating one counter per step */
          int blocks;
          bitmap_counter_t *bmc;

          spin_lock_irq(&bitmap->lock);
          bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);     /* counter for the current sector; also fills in blocks */
          if (!bmc) {
               spin_unlock_irq(&bitmap->lock);
               return 0;
          }

          if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) {     /* counter is saturated: sleep, then retry this step */
               DEFINE_WAIT(__wait);
               /* note that it is safe to do the prepare_to_wait
               * after the test as long as we do it before dropping
               * the spinlock.
               */
               prepare_to_wait(&bitmap->overflow_wait, &__wait,
                         TASK_UNINTERRUPTIBLE);
               spin_unlock_irq(&bitmap->lock);
               blk_unplug(bitmap->mddev->queue);
               schedule();
               finish_wait(&bitmap->overflow_wait, &__wait);
               continue;
          }

          switch(*bmc) {
          case 0:           /* counter is currently 0 */
               /* per the notes: sets the chunk's bit in the bitmap page and
                * marks that page BITMAP_PAGE_DIRTY */
               bitmap_file_set_bit(bitmap, offset);
               bitmap_count_page(bitmap,offset, 1);     /* per the notes: bumps the bitmap page's counter */
               blk_plug_device_unlocked(bitmap->mddev->queue);     /* plug the queue (the notes: to gather more writes) */
               /* fall through */
          case 1:
               /* a counter of 0 or 1 is raised to 2 here, so it reads 3
                * after the increment below */
               *bmc = 2;
          }

          (*bmc)++;

          spin_unlock_irq(&bitmap->lock);

          offset += blocks;
          if (sectors > blocks)
               sectors -= blocks;
          else sectors = 0;
     }
     bitmap->allclean = 0;
     return 0;
}

/*
 * Called when the md device is ready to unplug its underlying (slave)
 * device queues. Before any write is let through, the dirty pages of the
 * bitmap file have to be synced to disk.
 */
void bitmap_unplug(struct bitmap *bitmap)
{
     unsigned long idx, irq_flags;
     int must_wait = 0;

     if (!bitmap)
          return;

     /* Walk every page of the bitmap file and write out the ones flagged
      * BITMAP_PAGE_DIRTY or BITMAP_PAGE_NEEDWRITE. */
     for (idx = 0; idx < bitmap->file_pages; idx++) {
          struct page *pg;
          int was_dirty, was_needwrite;

          spin_lock_irqsave(&bitmap->lock, irq_flags);
          if (!bitmap->filemap) {
               /* no page array: release the lock and stop */
               spin_unlock_irqrestore(&bitmap->lock, irq_flags);
               return;
          }
          pg = bitmap->filemap[idx];
          /* read both attributes, then clear them, all under the lock */
          was_dirty = test_page_attr(bitmap, pg, BITMAP_PAGE_DIRTY);
          was_needwrite = test_page_attr(bitmap, pg, BITMAP_PAGE_NEEDWRITE);
          clear_page_attr(bitmap, pg, BITMAP_PAGE_DIRTY);
          clear_page_attr(bitmap, pg, BITMAP_PAGE_NEEDWRITE);
          spin_unlock_irqrestore(&bitmap->lock, irq_flags);

          /* only a dirty page makes us wait for completion further down */
          if (was_dirty)
               must_wait = 1;

          if (!was_dirty && !was_needwrite)
               continue;
          write_page(bitmap, pg, 0);
     }

     if (must_wait) {
          /* a dirty page was written: wait for the bitmap writes to finish */
          if (!bitmap->file) {
               /* no backing file: the bitmap lives on the MD device itself */
               md_super_wait(bitmap->mddev);
          } else {
               /* bitmap is kept in a separate backing file */
               wait_event(bitmap->write_wait,
                       atomic_read(&bitmap->pending_writes)==0);
          }
     }

     if (bitmap->flags & BITMAP_WRITE_ERROR)
          bitmap_file_kick(bitmap);
}
/*
 * Ask, in a loop, whether the region starting at @offset needs to be synced.
 * The per-call decision is made by __bitmap_start_sync(); the notes say a
 * region needs sync when it is flagged dirty or need-write.
 *
 * On return *blocks holds the total number of blocks covered, and the return
 * value is the bitwise OR of the individual __bitmap_start_sync() results.
 */
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
                int degraded)
{
     /* bitmap_start_sync must always report on multiples of whole
     * pages, otherwise resync (which is very PAGE_SIZE based) will
     * get confused.
     * So call __bitmap_start_sync repeatedly (if needed) until
     * At least PAGE_SIZE>>9 blocks are covered.
     * Return the 'or' of the result.
     */
     int combined = 0;
     int step;

     for (*blocks = 0; *blocks < (PAGE_SIZE>>9); *blocks += step) {
          combined |= __bitmap_start_sync(bitmap, offset, &step, degraded);
          offset += step;
     }

     return combined;
}


你可能感兴趣的:(linux内核之软RAID)