QQ群:852283276
微信:arm80x86
微信公众号:青儿创客基地
B站:主页 https://space.bilibili.com/208826118
Linux那些事儿之我是Block层(11)传说中的内存映射(上)
Linux那些事儿之我是Block层(12)传说中的内存映射(下)
linux内核分析:read过程分析
write每次最大能写多少字节的数据
The iov_iter interface
Linux使用blk_rq_map_user
来完成用户态内存和bio的映射,blk_rq_map_user
会填充struct request
的struct bio
结构体,
int blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned long len, gfp_t gfp_mask)
{
struct iovec iov;
struct iov_iter i;
int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i);
if (unlikely(ret < 0))
return ret;
return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask);
}
EXPORT_SYMBOL(blk_rq_map_user);
函数import_single_range
中,MAX_RW_COUNT是一个宏:INT_MAX & PAGE_MASK,INT_MAX是2^31
,理论上每次write可写的buff大小是2^31-2^12=2147479552
,然后使用了迭代器,
int import_single_range(int rw, void __user *buf, size_t len,
struct iovec *iov, struct iov_iter *i)
{
if (len > MAX_RW_COUNT)
len = MAX_RW_COUNT;
if (unlikely(!access_ok(!rw, buf, len)))
return -EFAULT;
iov->iov_base = buf;
iov->iov_len = len;
iov_iter_init(i, rw, iov, 1, len);
return 0;
}
EXPORT_SYMBOL(import_single_range);
blk_rq_map_user_iov调用__blk_rq_map_user_iov
/**
* blk_rq_map_user_iov - map user data to a request, for passthrough requests
* @q: request queue where request should be inserted
* @rq: request to map data to
* @map_data: pointer to the rq_map_data holding pages (if necessary)
* @iter: iovec iterator
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly for zero copy I/O, if possible. Otherwise
* a kernel bounce buffer is used.
*
* A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
* before being submitted to the device, as pages mapped may be out of
* reach. It's the callers responsibility to make sure this happens. The
* original bio must be passed back in to blk_rq_unmap_user() for proper
* unmapping.
*/
int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data,
const struct iov_iter *iter, gfp_t gfp_mask)
{
bool copy = false;
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
int ret;
if (!iter_is_iovec(iter))
goto fail;
if (map_data)
copy = true;
else if (iov_iter_alignment(iter) & align)
copy = true;
else if (queue_virt_boundary(q))
copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
i = *iter;
do {
ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
if (ret)
goto unmap_rq;
if (!bio)
bio = rq->bio;
} while (iov_iter_count(&i));
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->rq_flags |= RQF_COPY_USER;
return 0;
unmap_rq:
__blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
return -EINVAL;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
函数bio_add_pc_page
把page添加到bio,
/**
* bio_add_pc_page - attempt to add page to bio
* @q: the target queue
* @bio: destination bio
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
*
* Attempt to add a page to the bio_vec maplist. This can fail for a
* number of reasons, such as the bio being full or target block device
* limitations. The target block device must allow bio's up to PAGE_SIZE,
* so it is always possible to add a single page to an empty bio.
*
* This should only be used by REQ_PC bios.
*/
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
*page, unsigned int len, unsigned int offset)
{
int retried_segments = 0;
struct bio_vec *bvec;
/*
* cloned bio must not modify vec list
*/
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
return 0;
/*
* For filesystems with a blocksize smaller than the pagesize
* we will often be called with the same page as last time and
* a consecutive offset. Optimize this special case.
*/
if (bio->bi_vcnt > 0) {
struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
prev->bv_len += len;
bio->bi_iter.bi_size += len;
goto done;
}
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
if (bvec_gap_to_prev(q, prev, offset))
return 0;
}
if (bio->bi_vcnt >= bio->bi_max_vecs)
return 0;
/*
* setup the new entry, we might clear it again later if we
* cannot add the page
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt];
bvec->bv_page = page;
bvec->bv_len = len;
bvec->bv_offset = offset;
bio->bi_vcnt++;
bio->bi_phys_segments++;
bio->bi_iter.bi_size += len;
/*
* Perform a recount if the number of segments is greater
* than queue_max_segments(q).
*/
while (bio->bi_phys_segments > queue_max_segments(q)) {
if (retried_segments)
goto failed;
retried_segments = 1;
blk_recount_segments(q, bio);
}
/* If we may be able to merge these biovecs, force a recount */
if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
bio_clear_flag(bio, BIO_SEG_VALID);
done:
return len;
failed:
bvec->bv_page = NULL;
bvec->bv_len = 0;
bvec->bv_offset = 0;
bio->bi_vcnt--;
bio->bi_iter.bi_size -= len;
blk_recount_segments(q, bio);
return 0;
}
EXPORT_SYMBOL(bio_add_pc_page);
使用blk_rq_map_sg
把request映射到一个scatterlist,
//petalinux2015.2.1, kernel 3.19
blk_rq_map_sg //block\blk-merge.c
__blk_bios_map_sg
__blk_segment_map_sg
在__blk_segment_map_sg
中,BIOVEC_PHYS_MERGEABLE
检测相邻bio_vec是否可以合并,BIOVEC_SEG_BOUNDARY
用于检测是否超过边界,比如某些硬件不能跨64MB边界传输,其单次发起DMA的最大大小就是64MB,可以通过queue_max_segment_size
来控制scatterlist的表项。
static inline void
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
struct scatterlist *sglist, struct bio_vec *bvprv,
struct scatterlist **sg, int *nsegs, int *cluster)
{
int nbytes = bvec->bv_len;
if (*sg && *cluster) {
if ((*sg)->length + nbytes > queue_max_segment_size(q))
goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
goto new_segment;
(*sg)->length += nbytes;
} else {
new_segment:
if (!*sg)
*sg = sglist;
else {
/*
* If the driver previously mapped a shorter
* list, we could see a termination bit
* prematurely unless it fully inits the sg
* table on each mapping. We KNOW that there
* must be more entries here or the driver
* would be buggy, so force clear the
* termination bit to avoid doing a full
* sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
*sg = sg_next(*sg);
}
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
(*nsegs)++;
}
*bvprv = *bvec;
}