Linux I/O Block--递交I/O请求

      在通用块层中,bio用来描述单一的I/O请求,它记录了一次I/O操作所必需的相关信息,如用于I/O操作的数据缓存位置,I/O操作的块设备起始扇区,是读操作还是写操作等等。struct bio的定义如下

struct bio {
	sector_t		bi_sector;	/* device address in 512 byte
						   sectors */
	struct bio		*bi_next;	/* request queue link */
	struct block_device	*bi_bdev;
	unsigned long		bi_flags;	/* status, command, etc */
	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
						 * top bits priority
						 */

	unsigned short		bi_vcnt;	/* how many bio_vec's */
	unsigned short		bi_idx;		/* current index into bvl_vec */

	/* Number of segments in this BIO after
	 * physical address coalescing is performed.
	 */
	unsigned int		bi_phys_segments;

	unsigned int		bi_size;	/* residual I/O count */

	/*
	 * To keep track of the max segment size, we account for the
	 * sizes of the first and last mergeable segments in this bio.
	 */
	unsigned int		bi_seg_front_size;
	unsigned int		bi_seg_back_size;

	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */

	unsigned int		bi_comp_cpu;	/* completion CPU */

	atomic_t		bi_cnt;		/* pin count */

	struct bio_vec		*bi_io_vec;	/* the actual vec list */

	bio_end_io_t		*bi_end_io;

	void			*bi_private;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	struct bio_integrity_payload *bi_integrity;  /* data integrity */
#endif

	bio_destructor_t	*bi_destructor;	/* destructor */

	/*
	 * We can inline a number of vecs at the end of the bio, to avoid
	 * double allocations for a small number of bio_vecs. This member
	 * MUST obviously be kept at the very end of the bio.
	 */
	struct bio_vec		bi_inline_vecs[0];
};


bi_sector:该I/O操作的起始扇区号

bi_rw:指明了读写方向

bi_vcnt:该I/O操作中涉及到了多少个缓存向量,每个缓存向量由[page,offset,len]来描述

bi_idx:指示当前的缓存向量

bi_io_vec:缓存向量数组

缓存向量的定义:

struct bio_vec {
	struct page	*bv_page;
	unsigned int	bv_len;
	unsigned int	bv_offset;
};


 

struct request用于描述提交给块设备的I/O请求,bio会动态地添加进request,因此一个request往往会包含若干相邻的bio。

struct request {
	struct list_head queuelist;
	struct call_single_data csd;
	int cpu;

	struct request_queue *q;

	unsigned int cmd_flags;
	enum rq_cmd_type_bits cmd_type;
	unsigned long atomic_flags;

	/* the following two fields are internal, NEVER access directly */
	sector_t __sector;		/* sector cursor */
	unsigned int __data_len;	/* total data len */

	struct bio *bio;
	struct bio *biotail;

	struct hlist_node hash;	/* merge hash */
	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		void *completion_data;
	};

	/*
	 * two pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.
	 */
	void *elevator_private;
	void *elevator_private2;

	struct gendisk *rq_disk;
	unsigned long start_time;

	/* Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;

	unsigned short ioprio;

	void *special;		/* opaque pointer available for LLD use */
	char *buffer;		/* kaddr of the current segment if available */

	int tag;
	int errors;

	int ref_count;

	/*
	 * when request is used as a packet command carrier
	 */
	unsigned short cmd_len;
	unsigned char __cmd[BLK_MAX_CDB];
	unsigned char *cmd;

	unsigned int extra_len;	/* length of alignment and padding */
	unsigned int sense_len;
	unsigned int resid_len;	/* residual count */
	void *sense;

	unsigned long deadline;
	struct list_head timeout_list;
	unsigned int timeout;
	int retries;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;

	/* for bidi */
	struct request *next_rq;
};

queuelist:用于将request链入请求队列的链表元素

q:指向所属的请求队列

__sector:下一个要传输的bio的起始扇区号

__data_len:request要传输的数据字节数

bio,biotail:用于维护request中的bio链表

 

在之前介绍的gendisk结构中,可以看到每个块设备(或分区)都对应了一个request_queue的结构,该结构用来容纳request,并且包含了相应的递交request以及I/O调度的方法

递交一个bio的主要工作是从generic_make_request()函数开始的,我们以此为入口来分析一个bio的递交过程。在每个进程的task_struct中,都包含有两个变量----struct bio *bio_list, **bio_tail,generic_make_request()的主要工作就是用这两个变量来维护当前待添加的bio链表,实际的提交操作会由generic_make_request()调用__generic_make_request()函数完成。而在__generic_make_request()中,会调用到queue_list中定义的make_request_fn函数,也就是特定于设备的提交请求函数来完成后续的工作。在这里便会有一些问题,大部分设备的make_request_fn都可以直接定义为内核实现的__make_request函数,而一些设备需要使用自己的make_request_fn,而自行实现的make_request_fn有可能会递归调用gerneric_make_request(),由于内核的堆栈十分有限,因此在generic_make_request()的实现中,玩了一些小把戏,使得递归的深度不会超过一层。我们注意到bio_tail是一个二级指针,这个值最初是NULL,当有bio添加进来,bio_tail将会指向bio->bi_next(如果bio全都递交上去了,则bio_tail将会指向bio_list),也就是说除了第一次调用外,其他每次递归调用generic_make_request()函数都会出现bio_tail不为NULL的情形,因此当bio_tail不为NULL时,则只将bio添加到由bio_list和bio_tail维护的链表中,然后直接返回,而不调用__generic_make_request(),这样便防止了多重递归的产生

void generic_make_request(struct bio *bio)
{
	if (current->bio_tail) {//current->bio_tail不为空则表明有bio正在提交,也就是说是处于递归调用
		/* make_request is active */
		bio->bi_next = NULL;
		/*这里current->tail有两种情况,当current的bio链表为空时,bio_tail指向的是bio_list
		当current的bio链表不为空时,bio_tail指向的是最后一个bio的bi_next指针,因此
		这句的实际作用就是将bio添加到了current的bio链表的尾部*/
		*(current->bio_tail) = bio;
		current->bio_tail = &bio->bi_next;
		/*这里直接返回,遍历并且提交bio的工作永远都是交给最先调用的generic_make_request来处理的,避免了多重递归*/
		return;
	}
	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to the next (which is NULL) and bio_tail
	 * to &bio_list, thus initialising the bio_list of new bios to be
	 * added.  __generic_make_request may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * of the top of the list (no pretending) and so fixup bio_list and
	 * bio_tail or bi_next, and call into __generic_make_request again.
	 *
	 * The loop was structured like this to make only one call to
	 * __generic_make_request (which is important as it is large and
	 * inlined) and to keep the structure simple.
	 */
	BUG_ON(bio->bi_next);
	do {
		current->bio_list = bio->bi_next;//这里取current的待提交bio链表的下一个bio
		if (bio->bi_next == NULL)//bi_next为空,也就是说待提交链表已经空了,只剩下最后一个bio了
			current->bio_tail = ¤t->bio_list;//bio_tail指向bio_list
		else
			bio->bi_next = NULL;//否则将bio提取出来
		__generic_make_request(bio);//提交bio
		bio = current->bio_list;//取新的待提交bio
	} while (bio);
	current->bio_tail = NULL; /* deactivate */
}



__generic_make_request()首先由bio对应的block_device获取等待队列q,然后要检查对应的设备是不是分区,如果是分区的话要将扇区地址进行重新计算,最后调用make_request_fn完成bio的递交

static inline void __generic_make_request(struct bio *bio)
{
	struct request_queue *q;
	sector_t old_sector;
	int ret, nr_sectors = bio_sectors(bio);//提取bio的大小,以扇区为单位
	dev_t old_dev;
	int err = -EIO;

	might_sleep();

	//这里检查bio的传输起始扇区是否超过设备的最大扇区,并且两者之间的差不能小于nr_sector
	if (bio_check_eod(bio, nr_sectors))
		goto end_io;

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 *
	 * NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	old_sector = -1;
	old_dev = 0;
	do {
		char b[BDEVNAME_SIZE];

		q = bdev_get_queue(bio->bi_bdev);//获取对应设备的请求队列
		if (unlikely(!q)) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access "
				"nonexistent block-device %s (%Lu)\n",
				bdevname(bio->bi_bdev, b),
				(long long) bio->bi_sector);
			goto end_io;
		}
		
		/*下面做一些必要的检查*/
		if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
			     nr_sectors > queue_max_hw_sectors(q))) {
			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
			       bdevname(bio->bi_bdev, b),
			       bio_sectors(bio),
			       queue_max_hw_sectors(q));
			goto end_io;
		}

		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
			goto end_io;

		if (should_fail_request(bio))
			goto end_io;

		/*
		 * If this device has partitions, remap block n
		 * of partition p to block n+start(p) of the disk.
		 */
		 //如果bio指定的是一个分区,则传输点要重新进行计算
		blk_partition_remap(bio);

		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
			goto end_io;

		if (old_sector != -1)
			trace_block_remap(q, bio, old_dev, old_sector);

		old_sector = bio->bi_sector;
		old_dev = bio->bi_bdev->bd_dev;

		if (bio_check_eod(bio, nr_sectors))
			goto end_io;

		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
		    !blk_queue_discard(q)) {
			err = -EOPNOTSUPP;
			goto end_io;
		}

		trace_block_bio_queue(q, bio);

		ret = q->make_request_fn(q, bio);//这里是关键,调用请求队列中的make_request_fn函数处理请求
	} while (ret);

	return;

end_io:
	bio_endio(bio, err);
}

辅助函数blk_partition_remap():

static inline void blk_partition_remap(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;

	/*首先要保证传输的大小不能小于1个扇区并且bdev确实是分区*/
	if (bio_sectors(bio) && bdev != bdev->bd_contains) {
		struct hd_struct *p = bdev->bd_part;//获取分区信息

		bio->bi_sector += p->start_sect;//在传输起点的原基础上加上分区的起始扇区号
		bio->bi_bdev = bdev->bd_contains;//将bio的bdev置为主设备

		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
				    bdev->bd_dev,
				    bio->bi_sector - p->start_sect);
	}
}

可以看到这里将bio的参考对象设置为了主设备,而不是分区,因此对应的扇区起始号也要计算为扇区的绝对值。

大多数的make_request_fn函数都可以直接定义为__make_request(),我们通过这个函数来分析递交bio的关键操作

static int __make_request(struct request_queue *q, struct bio *bio)
{
	struct request *req;
	int el_ret;
	unsigned int bytes = bio->bi_size;
	const unsigned short prio = bio_prio(bio);
	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
	int rw_flags;

	/*如果BIO_RW_BARRIER被置位(表示必须得让请求队列中的所有bio传递完毕才处理自己),
	  但是不支持hardbarrier,不能进行bio的提交*/
	if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
		bio_endio(bio, -EOPNOTSUPP);
		return 0;
	}
	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	spin_lock_irq(q->queue_lock);

	//如果BIO_RW_BARRIER被置位或者请求队列为空,则情况比较简单,不用进行bio的合并,跳转到get_rq处处理
	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
		goto get_rq;

	/**请求队列不为空**/
	
	/*elv_merge()试图寻找一个已存在的request,将bio并入其中*/
	el_ret = elv_merge(q, &req, bio);
	
	switch (el_ret) {
	case ELEVATOR_BACK_MERGE:
		BUG_ON(!rq_mergeable(req));

		/*相关检查*/
		if (!ll_back_merge_fn(q, req, bio))
			break;

		trace_block_bio_backmerge(q, bio);

		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
			blk_rq_set_mixed_merge(req);

		/*这里将bio插入到request尾部*/
		req->biotail->bi_next = bio;
		req->biotail = bio;
		req->__data_len += bytes;
		req->ioprio = ioprio_best(req->ioprio, prio);
		if (!blk_rq_cpu_valid(req))
			req->cpu = bio->bi_comp_cpu;
		drive_stat_acct(req, 0);
		if (!attempt_back_merge(q, req))
			elv_merged_request(q, req, el_ret);
		goto out;

	case ELEVATOR_FRONT_MERGE:
		BUG_ON(!rq_mergeable(req));

		if (!ll_front_merge_fn(q, req, bio))
			break;

		trace_block_bio_frontmerge(q, bio);

		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
			blk_rq_set_mixed_merge(req);
			req->cmd_flags &= ~REQ_FAILFAST_MASK;
			req->cmd_flags |= ff;
		}

		/*这里将bio插入到request的头部*/
		bio->bi_next = req->bio;
		req->bio = bio;

		/*
		 * may not be valid. if the low level driver said
		 * it didn't need a bounce buffer then it better
		 * not touch req->buffer either...
		 */
		req->buffer = bio_data(bio);
		req->__sector = bio->bi_sector;
		req->__data_len += bytes;
		req->ioprio = ioprio_best(req->ioprio, prio);
		if (!blk_rq_cpu_valid(req))
			req->cpu = bio->bi_comp_cpu;
		drive_stat_acct(req, 0);
		if (!attempt_front_merge(q, req))
			elv_merged_request(q, req, el_ret);
		goto out;

	/* ELV_NO_MERGE: elevator says don't/can't merge. */
	default:
		;
	}

get_rq:/**下面的代码对应请求队列为空的情况,需要先分配一个request,再将bio插入***/
	/*
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);//确定读写标识
	if (sync)
		rw_flags |= REQ_RW_SYNC;

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request_wait(q, rw_flags, bio);//分配一个新的request

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	 //根据bio初始化新分配的request,并将bio插入到request中
	init_request_from_bio(req, bio);

	spin_lock_irq(q->queue_lock);
	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
	    bio_flagged(bio, BIO_CPU_AFFINE))
		req->cpu = blk_cpu_to_group(smp_processor_id());
	if (queue_should_plug(q) && elv_queue_empty(q))
		blk_plug_device(q);
	add_request(q, req);//将request插入到请求队列
out:
	if (unplug || !queue_should_plug(q))
		__generic_unplug_device(q);
	spin_unlock_irq(q->queue_lock);
	return 0;
}


elv_merge()是执行合并的关键所在,执行完后会有三种情况:

1.bio添加到了一个request的bio链表尾部

2.bio添加到了一个request的bio链表首部

3.未能找到一个request可以添加,将重新分配一个request

int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;
	int ret;

	/*
	 * First try one-hit cache.
	 */
	//last_merge指向了最近进行合并操作的request,最先试图将bio合并到该request中
	if (q->last_merge) {
		ret = elv_try_merge(q->last_merge, bio);
		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_nomerges(q))//请求队列不允许合并请求,则返回NO_MERGE
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	 //根据bio的起始扇区号,通过rq的哈希表寻找一个request,可以将bio合并到request的尾部
	__rq = elv_rqhash_find(q, bio->bi_sector);
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	/*如果以上的方法不成功,则调用特定于io调度器的elevator_merge_fn函数寻找一个合适的request*/
	if (e->ops->elevator_merge_fn)
		return e->ops->elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

elevator_merge_fn是特定于I/O调度器的方式,至此,递交I/O请求的通用层部分也就分析完了。

你可能感兴趣的:(Linux I/O Block--递交I/O请求)