blk-mq编程主要需要调用两个函数完成初始化工作,blk_mq_init_queue是其中的第二个。该函数先申请一个struct request_queue结构体,再调用blk_mq_init_allocated_queue对它进行初始化;这个请求队列后面会赋值给磁盘结构体(通常是struct gendisk)的相应成员。
/*
 * blk_mq_init_queue - allocate a request queue and initialize it for blk-mq
 * @set: tag set the new queue is built from (its numa_node picks the node
 *       the queue is allocated on)
 *
 * Returns the initialized queue on success, ERR_PTR(-ENOMEM) when the queue
 * itself cannot be allocated, or the error pointer handed back by
 * blk_mq_init_allocated_queue() after the raw queue has been cleaned up.
 */
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
	struct request_queue *raw_q;
	struct request_queue *ready_q;

	/* Step 1: allocate and pre-initialize the bare struct request_queue. */
	raw_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
	if (!raw_q)
		return ERR_PTR(-ENOMEM);

	/*
	 * Step 2: the blk-mq specific setup. This allocates the per-CPU
	 * software queues and the hardware queues and then maps the former
	 * onto the latter.
	 */
	ready_q = blk_mq_init_allocated_queue(set, raw_q);
	if (!IS_ERR(ready_q))
		return ready_q;

	/* Setup failed: release the raw queue and pass the error on. */
	blk_cleanup_queue(raw_q);
	return ready_q;
}
EXPORT_SYMBOL(blk_mq_init_queue);
下面这个函数一眼望过去确实有点复杂,不过多看几遍就好了。
它主要就是给struct request_queue *q结构体里面的成员变量赋值,简单的变量赋值就不分析了,下面重点分析它调用的几个函数。
/*
 * blk_mq_init_allocated_queue - set up an already allocated request_queue for blk-mq
 * @set: tag set supplying the ops, CPU-to-queue map, queue depth, timeout and flags
 * @q:   request queue to initialize
 *
 * Returns @q on success. Returns ERR_PTR(-ENOMEM) when an allocation fails or
 * when no hardware queue could be set up, and ERR_PTR(ret) when
 * elevator_init_mq() fails.
 */
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
/* Poll statistics callback; a NULL result is handled as an allocation failure. */
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, blk_mq_poll_stats_bkt, BLK_MQ_POLL_STATS_BKTS, q);
if (!q->poll_cb)
goto err_exit;
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);// per-CPU variable: the software queues
if (!q->queue_ctx)
goto err_exit;
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q); // mainly initializes the kobject members
// Array of pointers to the hardware queue contexts, nr_cpu_ids slots, zeroed.
q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, set->numa_node);
if (!q->queue_hw_ctx)
goto err_percpu;
// Share the tag set's mq_map; per the original author's note this array
// holds the hardware queue number for each CPU.
q->mq_map = set->mq_map;
/* Allocate the hardware queue contexts; q->nr_hw_queues is 0 if none could be set up. */
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
/* Set up the timeout work item; the request timeout falls back to 30 * HZ when set->timeout is 0. */
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->nr_queues = nr_cpu_ids;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
if (!(set->flags & BLK_MQ_F_SG_MERGE))
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
q->sg_reserved_size = INT_MAX;
/* Requeue machinery: delayed work, list head and the lock protecting the list. */
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
blk_queue_make_request(q, blk_mq_make_request);
/* Only install the poll entry point when the driver's ops provide ->poll. */
if (q->mq_ops->poll)
q->poll_fn = blk_mq_poll;
/*
* Do this after blk_queue_make_request() overrides it...
*/
q->nr_requests = set->queue_depth; // restore the tag set's depth so the default does not stick
/*
* Default to classic polling
*/
q->poll_nsec = -1;
/* Install the driver's completion handler, if it has one. */
if (set->ops->complete)
blk_queue_softirq_done(q, set->ops->complete);
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
int ret;
ret = elevator_init_mq(q);
/*
* NOTE(review): this path returns without unwinding anything set up
* above; presumably the caller's queue cleanup covers it - confirm.
*/
if (ret)
return ERR_PTR(ret);
}
return q;
/*
* Error unwinding: each label releases what was allocated before the failing
* step and falls through to the next one.
* NOTE(review): none of these labels releases q->poll_cb - confirm it is
* freed elsewhere when the queue is torn down.
*/
err_hctxs:
kfree(q->queue_hw_ctx);
err_percpu:
free_percpu(q->queue_ctx);
err_exit:
q->mq_ops = NULL;
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
这个函数没有太多需要分析的地方:它申请一个struct blk_stat_callback以及其中的stat数组和percpu的cpu_stat,保存传入的回调函数和参数,并初始化定时器;返回值在上面被赋给了q->poll_cb,传入的回调也都是blk-mq提供的默认函数。
/*
 * blk_stat_alloc_callback - allocate and fill in a statistics callback
 * @timer_fn:  stored in the callback's timer_fn member
 * @bucket_fn: stored in the callback's bucket_fn member
 * @buckets:   number of struct blk_rq_stat entries to allocate, both for the
 *             stat array and for each CPU's cpu_stat area
 * @data:      opaque pointer stored in the callback's data member
 *
 * Returns the new callback, or NULL when any of the three allocations fails
 * (everything allocated up to that point is released first).
 */
struct blk_stat_callback *
blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
			int (*bucket_fn)(const struct request *), unsigned int buckets, void *data)
{
	struct blk_stat_callback *cb = kmalloc(sizeof(*cb), GFP_KERNEL);

	if (!cb)
		goto fail;

	cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), GFP_KERNEL);
	if (!cb->stat)
		goto fail_free_cb;

	cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
				      __alignof__(struct blk_rq_stat));
	if (!cb->cpu_stat)
		goto fail_free_stat;

	/* All memory is in place; record the caller's parameters. */
	cb->timer_fn = timer_fn;
	cb->bucket_fn = bucket_fn;
	cb->data = data;
	cb->buckets = buckets;
	timer_setup(&cb->timer, blk_stat_timer_fn, 0);

	return cb;

fail_free_stat:
	kfree(cb->stat);
fail_free_cb:
	kfree(cb);
fail:
	return NULL;
}
EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
接着看这个函数,做的也是变量的初始化工作:先初始化struct request_queue里面的kobject变量(q->mq_kobj),再取出每一个cpu上的q->queue_ctx结构体,对其成员kobject变量进行初始化。
void blk_mq_sysfs_init(struct request_queue *q)
{
struct blk_mq_ctx *ctx;
int cpu;
kobject_init(&q->mq_kobj, &blk_mq_ktype);
for_each_possible_cpu(cpu) {
ctx = per_cpu_ptr(q->queue_ctx, cpu);//返回每个cpu上的q->queue_ctx变量的首地址
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
}
}
这个函数相对来说比较重要:它为每个硬件队列分配并初始化struct blk_mq_hw_ctx,并释放掉多余的旧硬件队列,最后更新q->nr_hw_queues。更细节的分析后面再补充。
/*
 * blk_mq_realloc_hw_ctxs - (re)build the hardware queue contexts of a queue
 * @set: tag set; set->nr_hw_queues is the number of contexts wanted
 * @q:   queue whose queue_hw_ctx array is filled in
 *
 * Empty slots of q->queue_hw_ctx below set->nr_hw_queues are allocated and
 * initialized; slots that are already populated are kept. Entries from the
 * index where the first loop stopped up to the previous q->nr_hw_queues are
 * torn down. On return q->nr_hw_queues holds the index where the first loop
 * stopped, i.e. set->nr_hw_queues unless an allocation or init step failed.
 */
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q)
{
int i, j;
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
blk_mq_sysfs_unregister(q);
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int node;
/* Slot already holds a context: keep it. */
if (hctxs[i])
continue;
/*
* Allocate one struct blk_mq_hw_ctx (hardware queue context) per
* iteration, set->nr_hw_queues in total, on the node that
* blk_mq_hw_queue_to_node() returns for this queue index.
*/
node = blk_mq_hw_queue_to_node(q->mq_map, i);
hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set), GFP_KERNEL, node);
if (!hctxs[i])
break;
/*
* The following steps initialize the members of the new
* struct blk_mq_hw_ctx: pointer members get memory allocated,
* plain members are simply assigned.
*/
if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, node)) {
kfree(hctxs[i]);
hctxs[i] = NULL;
break;
}
atomic_set(&hctxs[i]->nr_active, 0);
hctxs[i]->numa_node = node;
hctxs[i]->queue_num = i;
/*
* As the name suggests, this initializes most of the remaining
* members of hctxs[i]; on failure undo this slot and stop.
*/
if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
free_cpumask_var(hctxs[i]->cpumask);
kfree(hctxs[i]);
hctxs[i] = NULL;
break;
}
blk_mq_hctx_kobj_init(hctxs[i]);
}
/*
* Tear down the contexts at indices [i, previous q->nr_hw_queues). This has
* work to do when the loop above stopped early, or when the queue previously
* had more hardware queues than set->nr_hw_queues; on a queue that had none
* before (q->nr_hw_queues == 0) the loop body never runs.
*/
for (j = i; j < q->nr_hw_queues; j++) {
struct blk_mq_hw_ctx *hctx = hctxs[j];
if (hctx) {
if (hctx->tags)
blk_mq_free_map_and_requests(set, j);
blk_mq_exit_hctx(q, set, hctx, j);
kobject_put(&hctx->kobj);
hctxs[j] = NULL;
}
}
/* Record how many hardware queues are usable now. */
q->nr_hw_queues = i;
mutex_unlock(&q->sysfs_lock);
blk_mq_sysfs_register(q);
}
这个函数也是给q的成员赋默认值的,先大概熟悉一下;如果以后实际调试分析时需要了解某个参数的值,到时再回来看。
/*
 * blk_queue_make_request - install a make_request function and queue defaults
 * @q:   queue to configure
 * @mfn: function stored in q->make_request_fn
 *
 * Besides storing @mfn this resets several queue parameters to their
 * defaults, including q->nr_requests and q->limits.
 */
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
/*
* set defaults
*/
q->nr_requests = BLKDEV_MAX_RQ; // default depth; callers may overwrite it afterwards (blk_mq_init_allocated_queue does)
q->make_request_fn = mfn;
/* NOTE(review): 511 is presumably an alignment mask for 512-byte alignment - confirm. */
blk_queue_dma_alignment(q, 511);
blk_queue_congestion_threshold(q);
q->nr_batching = BLK_BATCH_REQ;
blk_set_default_limits(&q->limits);
}
EXPORT_SYMBOL(blk_queue_make_request);
这个函数同样只是赋值:把回调函数fn保存到q->softirq_done_fn,IO操作完成时会调用这个回调函数。
/*
 * blk_queue_softirq_done - store the completion callback of a queue
 * @q:  queue to configure
 * @fn: function stored in q->softirq_done_fn
 *
 * Presumably @fn is what gets invoked when an I/O request completes; this
 * function itself only records the pointer.
 */
void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
{
q->softirq_done_fn = fn;
}
EXPORT_SYMBOL(blk_queue_softirq_done);
/*
 * blk_mq_init_cpu_queues - initialize the per-CPU software queue contexts
 * @q:            queue whose per-CPU queue_ctx instances are initialized
 * @nr_hw_queues: number of hardware queues; a NUMA node is only assigned to
 *                a hardware context when this is greater than one
 *
 * The loop walks every possible CPU (not the hardware queues).
 */
static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		/* This CPU's instance of the per-CPU q->queue_ctx. */
		struct blk_mq_ctx *ctx = per_cpu_ptr(q->queue_ctx, cpu);
		struct blk_mq_hw_ctx *hctx;

		/* Basic member setup of the software queue context. */
		ctx->queue = q;
		ctx->cpu = cpu;
		spin_lock_init(&ctx->lock);
		INIT_LIST_HEAD(&ctx->rq_list);

		/* Hardware queue context this CPU is mapped to. */
		hctx = blk_mq_map_queue(q, cpu);

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues <= 1)
			continue;
		if (hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = local_memory_node(cpu_to_node(cpu));
	}
}