nvme 驱动详解 之1

按照老的套路,在分析一个driver时,我们首先看这个driver相关的kconfig及Makefile文件,察看相关的源代码文件.

在开始阅读一个driver时,通常都是从module_init或subsys_initcall等初始化入口函数看起。

 

下面让我们开始nvme的旅程吧。

首先打开drivers/block下的Kconfig文件,其中定义了BLK_DEV_NVME这个config,如下。

# Kconfig entry for the NVMe block driver (tristate, depends on PCI).
config BLK_DEV_NVME
	tristate "NVM Express block device"
	depends on PCI
	---help---
	  The NVM Express driver is for solid state drives directly
	  connected to the PCI or PCI Express bus.  If you know you
	  don't have one of these, it is safe to answer N.

	  To compile this driver as a module, choose M here: the
	  module will be called nvme.

通过console,输入make menuconfig,搜索BLK_DEV_NVME得到如下依赖关系。

Symbol: BLK_DEV_NVME [=m]                                                                      

  | Type : tristate                                                                                

  | Prompt: NVM Express block device                                                                

  |  Location:                                                                                     

  |    -> Device Drivers                                                                            

  | (1)  -> Block devices (BLK_DEV [=y])                                                           

  |  Defined at drivers/block/Kconfig:313                                                          

  |   Depends on: BLK_DEV [=y] && PCI [=y]

可以看到nvme依赖于BLK_DEV和PCI。

打开drivers/block/Makefile,搜索NVME,可以看到:

obj-$(CONFIG_BLK_DEV_NVME)    += nvme.o

nvme-y              := nvme-core.o nvme-scsi.o

 

关于和BLK相关的文件,打开block/Makefile:

# Objects (and the partitions/ subdirectory) selected by CONFIG_BLOCK.
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
			partitions/

哇塞,是不是很多?不要担心,NVME也只是用了BLOCK层的一些函数而已,不用把所有与BLOCK相关的文件都看了,除非你有精力去研究。

 

好了,到目前为止,我们知道了要看哪些文件了,nvme-core.c和nvme-scsi.c是必须的,剩下的就是当我们的driver调用到block层哪些函数时再去研究。

 

打开nvme-core.c,查看入口函数:module_init(nvme_init);

/*
 * Module entry point: prepare the kthread wait queue and the "nvme"
 * workqueue, register the block major and finally the PCI driver.
 *
 * Returns 0 on success or a negative errno; anything registered before the
 * failing step is released again.
 */
static int __init nvme_init(void)
{
	int ret;

	/* Wait queue head used around the nvme kthread. */
	init_waitqueue_head(&nvme_kthread_wait);

	/* Single-threaded workqueue for the driver. */
	nvme_workq = create_singlethread_workqueue("nvme");
	if (nvme_workq == NULL)
		return -ENOMEM;

	/* Register the block device; a positive return is stored as the major. */
	ret = register_blkdev(nvme_major, "nvme");
	if (ret < 0)
		goto err_destroy_workq;
	if (ret > 0)
		nvme_major = ret;

	/* Register the PCI driver. */
	ret = pci_register_driver(&nvme_driver);
	if (ret != 0)
		goto err_unregister_blkdev;

	return 0;

err_unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
err_destroy_workq:
	destroy_workqueue(nvme_workq);
	return ret;
}

注册pci driver后,会调用nvme_driver中的probe函数。发现开始总是美好的,函数是如此的简洁,不要高兴的太早,痛苦的经历正在逼近。

static int nvme_probe(struct pci_dev*pdev, const struct pci_device_id *id)

{

         int node, result = -ENOMEM;

         struct nvme_dev *dev;

 

         node = dev_to_node(&pdev->dev);//获取node节点,与NUMA系统有关。

         if (node == NUMA_NO_NODE)

                   set_dev_node(&pdev->dev, 0);

 

         dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);

         if (!dev)

                   return -ENOMEM;

         dev->entry = kzalloc_node(num_possible_cpus() *sizeof(*dev->entry),//分配msix-entry

                                                                 GFP_KERNEL,node);

         if (!dev->entry)

                   goto free;

         dev->queues = kzalloc_node((num_possible_cpus() + 1) *sizeof(void *),//分配queues 资源,

                                                                 GFP_KERNEL,node);//这里之所以多1,是因为有admin-queues

         if (!dev->queues)

                   goto free;

 

         INIT_LIST_HEAD(&dev->namespaces);//初始化namespaces链表。

         dev->reset_workfn = nvme_reset_failed_dev;

         INIT_WORK(&dev->reset_work, nvme_reset_workfn);

         dev->pci_dev = pci_dev_get(pdev);

         pci_set_drvdata(pdev, dev);

         result = nvme_set_instance(dev);//设置pci设备的句柄instance,代表该设备。

         if (result)

                   goto put_pci;

 

         result = nvme_setup_prp_pools(dev);//设置dma需要的prp内存池。

         if (result)

                   goto release;

 

         kref_init(&dev->kref);

         result = nvme_dev_start(dev);//创建admin queue io queue request irq

         if (result)

                   goto release_pools;

 

         if (dev->online_queues > 1)

                   result = nvme_dev_add(dev);//初始化mq,并增加一个实际可用的nvme dev,并且admin_queue可以发送cmd

         if (result)

                   goto shutdown;

 

         scnprintf(dev->name, sizeof(dev->name),"nvme%d", dev->instance);

         dev->miscdev.minor = MISC_DYNAMIC_MINOR;

         dev->miscdev.parent = &pdev->dev;

         dev->miscdev.name = dev->name;

         dev->miscdev.fops = &nvme_dev_fops;

         result = misc_register(&dev->miscdev);//注册一个misc设备

         if (result)

                   goto remove;

 

         nvme_set_irq_hints(dev);

 

         dev->initialized = 1;

         return 0;

 

 remove:

         nvme_dev_remove(dev);

         nvme_dev_remove_admin(dev);

         nvme_free_namespaces(dev);

 shutdown:

         nvme_dev_shutdown(dev);

 release_pools:

         nvme_free_queues(dev, 0);

         nvme_release_prp_pools(dev);

 release:

         nvme_release_instance(dev);

 put_pci:

         pci_dev_put(dev->pci_dev);

 free:

         kfree(dev->queues);

         kfree(dev->entry);

         kfree(dev);

         return result;

}

上面每一个主要功能的函数都简单地注释了一下,描述了做了哪些工作,下面具体看看这些函数是怎么实现的。

static int nvme_set_instance(structnvme_dev *dev)

{

         int instance, error;

 

         do {

                   if (!ida_pre_get(&nvme_instance_ida,GFP_KERNEL))

                            return -ENODEV;

 

                   spin_lock(&dev_list_lock);

                   error = ida_get_new(&nvme_instance_ida,&instance);

                   spin_unlock(&dev_list_lock);

         } while (error == -EAGAIN);

 

         if (error)

                   return -ENODEV;

 

         dev->instance = instance;//该函数获得设备的instance,相当于该设备的id,代表着该设备。

         return 0;

}

 

nvme_setup_prp_pools用来创建DMA时所用的内存池,prp_page_pool是虚拟内核地址。

static int nvme_setup_prp_pools(structnvme_dev *dev)

{

         struct device *dmadev = &dev->pci_dev->dev;

         dev->prp_page_pool = dma_pool_create("prp listpage", dmadev,

                                                        PAGE_SIZE,PAGE_SIZE, 0);

         if (!dev->prp_page_pool)

                   return -ENOMEM;

 

         /* Optimisation for I/Os between 4k and 128k */

         dev->prp_small_pool = dma_pool_create("prp list256", dmadev,

                                                        256, 256, 0);

         if (!dev->prp_small_pool) {

                   dma_pool_destroy(dev->prp_page_pool);

                   return -ENOMEM;

         }

         return 0;

}

 

下面是重量级的函数之一:nvme_dev_start。

static intnvme_dev_start(struct nvme_dev *dev)

{

         int result;

         bool start_thread = false;

 

         result = nvme_dev_map(dev);

         if (result)

                   return result;

 

         result = nvme_configure_admin_queue(dev);//配置adminsubmit queue 和complete queue,64 depth

         if (result)

                   goto unmap;

 

         spin_lock(&dev_list_lock);

         if (list_empty(&dev_list) &&IS_ERR_OR_NULL(nvme_thread)) {

                   start_thread = true;

                   nvme_thread = NULL;

         }

         list_add(&dev->node, &dev_list);

         spin_unlock(&dev_list_lock);

 

         if (start_thread) {

                   nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");

                   wake_up_all(&nvme_kthread_wait);

         } else

                   wait_event_killable(nvme_kthread_wait,nvme_thread);

 

         if (IS_ERR_OR_NULL(nvme_thread)) {

                   result = nvme_thread ? PTR_ERR(nvme_thread) :-EINTR;

                   goto disable;

         }

 

         nvme_init_queue(dev->queues[0],0);//始化queue,并online_queues++

         result = nvme_alloc_admin_tags(dev);

         if (result)

                   goto disable;

 

         result = nvme_setup_io_queues(dev);

         if (result)

                   goto free_tags;

 

         nvme_set_irq_hints(dev);

 

         return result;

 

 free_tags:

         nvme_dev_remove_admin(dev);

 disable:

         nvme_disable_queue(dev, 0);

         nvme_dev_list_remove(dev);

 unmap:

         nvme_dev_unmap(dev);

         return result;

}

首先看nvme_configure_admin_queue(dev) 这个函数。

static intnvme_configure_admin_queue(struct nvme_dev *dev)

{

         int result;

         u32 aqa;

         u64 cap = readq(&dev->bar->cap);//读cap寄存器

         struct nvme_queue *nvmeq;

         unsigned page_shift = PAGE_SHIFT;

         unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;

         unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;

 

         if (page_shift < dev_page_min) {

                   dev_err(&dev->pci_dev->dev,

                                     "Minimum device page size(%u) too large for "

                                     "host (%u)\n", 1<< dev_page_min,

                                     1 << page_shift);

                   return -ENODEV;

         }

         if (page_shift > dev_page_max) {

                   dev_info(&dev->pci_dev->dev,

                                     "Device maximum page size(%u) smaller than "

                                     "host (%u); enablingwork-around\n",

                                     1 << dev_page_max, 1<< page_shift);

                   page_shift = dev_page_max;

         }

 

         result = nvme_disable_ctrl(dev, cap);//disable controller

         if (result < 0)

                   return result;

 

         nvmeq = dev->queues[0];

         if (!nvmeq) {

                   nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);//如果nvmeq==null,就创建nvmeq

                   if (!nvmeq)

                            return -ENOMEM;

         }

 

         aqa = nvmeq->q_depth - 1;

         aqa |= aqa << 16;

 

         dev->page_size = 1 << page_shift;

 

         dev->ctrl_config = NVME_CC_CSS_NVM;

         dev->ctrl_config |= (page_shift - 12) <

         dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;

         dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

 

         writel(aqa, &dev->bar->aqa);

         writeq(nvmeq->sq_dma_addr, &dev->bar->asq);

         writeq(nvmeq->cq_dma_addr, &dev->bar->acq); //该语句是创建nvmeqsubmit queuecomplete queue

 

         result = nvme_enable_ctrl(dev, cap);

         if (result)

                   goto free_nvmeq;

 

         nvmeq->cq_vector = 0;

         result = queue_request_irq(dev, nvmeq, nvmeq->irqname);//注册中断

         if (result)

                   goto free_nvmeq;

 

         return result;

 

 free_nvmeq:

         nvme_free_queues(dev, 0);

         return result;

}

下面看一下在nvme_alloc_queue函数中作了什么。

static struct nvme_queue *nvme_alloc_queue(structnvme_dev *dev, int qid,

                                                                 intdepth)

{

         struct device *dmadev = &dev->pci_dev->dev;

         struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq),GFP_KERNEL);

         if (!nvmeq)

                   return NULL;

 

         nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),

                                                 &nvmeq->cq_dma_addr, GFP_KERNEL); //分配complete queue cmds空间,深度为depth个。

         if (!nvmeq->cqes)

                   goto free_nvmeq;

 

         nvmeq->sq_cmds = dma_alloc_coherent(dmadev,SQ_SIZE(depth),

                                               &nvmeq->sq_dma_addr,GFP_KERNEL);//分配submit queuecmds空间,深度为depth个。

         if (!nvmeq->sq_cmds)

                   goto free_cqdma;

 

         nvmeq->q_dmadev = dmadev;

         nvmeq->dev = dev;

         snprintf(nvmeq->irqname, sizeof(nvmeq->irqname),"nvme%dq%d",

                            dev->instance, qid);//设置nvmeqirqname

         spin_lock_init(&nvmeq->q_lock);

         nvmeq->cq_head = 0;

         nvmeq->cq_phase = 1;

         nvmeq->q_db = &dev->dbs[qid * 2 *dev->db_stride];

         nvmeq->q_depth = depth;

         nvmeq->qid = qid;

         dev->queue_count++;

         dev->queues[qid] = nvmeq;//将分配的nvmeq保存在dev->queues[qid]位置

 

         return nvmeq;//返回得到的nvmeq

 

 free_cqdma:

         dma_free_coherent(dmadev, CQ_SIZE(depth), (void*)nvmeq->cqes,

                                                                 nvmeq->cq_dma_addr);

 free_nvmeq:

         kfree(nvmeq);

         return NULL;

}

 

到此,我们完成了admin queue的complete queue和submit queue的创建和中断的注册。下面一句是nvme_kthread守护进程的创建,这个我们稍后再讲。我们先看一下下面的函数。

/*
 * Reset the software state of @nvmeq (tail, head, phase, doorbell pointer),
 * zero its completion queue memory and count it as online. All of this is
 * done under q_lock with interrupts disabled.
 */
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;	/* one more queue is online */
	spin_unlock_irq(&nvmeq->q_lock);
}

 

下面的函数是nvme使用mq的核心。

static int nvme_alloc_admin_tags(structnvme_dev *dev)

{

         if (!dev->admin_q) {//初始化admin_qnull,故进入if分支

                   dev->admin_tagset.ops = &nvme_mq_admin_ops;//初始化blk_mq_tag_set结构体,nvme_mq_admin_opsrun request会用到

                   dev->admin_tagset.nr_hw_queues = 1;//hardware queue个数为1

                   dev->admin_tagset.queue_depth = NVME_AQ_DEPTH -1;

                   dev->admin_tagset.timeout = ADMIN_TIMEOUT;

                   dev->admin_tagset.numa_node =dev_to_node(&dev->pci_dev->dev);

                   dev->admin_tagset.cmd_size = sizeof(structnvme_cmd_info);

                   dev->admin_tagset.driver_data = dev;

 

                   if (blk_mq_alloc_tag_set(&dev->admin_tagset))//分配一个tag set与一个或多个request queues关联。

                            return -ENOMEM;

 

                   dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);//初始化request_queue

                   if (IS_ERR(dev->admin_q)) {

                            blk_mq_free_tag_set(&dev->admin_tagset);

                            return -ENOMEM;

                   }

                   if (!blk_get_queue(dev->admin_q)){

                            nvme_dev_remove_admin(dev);

                            return -ENODEV;

                   }

         } else

                   blk_mq_unfreeze_queue(dev->admin_q);

 

         return 0;

}

下面依次介绍blk_mq中相关的函数。

先看张图,一个mq的schedule。

blk_mq_alloc_tag_set(&dev->admin_tagset)这个函数所做工作可以用下图简单概括.

/*

 * Alloc a tag set to be associated with one ormore request queues.

 * May fail with EINVAL for various errorconditions. May adjust the

 * requested depth down, if if it too large. Inthat case, the set

 * value will be stored in set->queue_depth.

 */

int blk_mq_alloc_tag_set(struct blk_mq_tag_set*set)

{

         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 <

 

         if (!set->nr_hw_queues)

                   return -EINVAL;

         if (!set->queue_depth)

                   return -EINVAL;

         if (set->queue_depth < set->reserved_tags +BLK_MQ_TAG_MIN)

                   return -EINVAL;

 

         if (!set->nr_hw_queues || !set->ops->queue_rq ||!set->ops->map_queue)

                   return -EINVAL;

 

         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {

                   pr_info("blk-mq: reduced tag depth to%u\n",

                            BLK_MQ_MAX_DEPTH);

                   set->queue_depth = BLK_MQ_MAX_DEPTH;

         }

 

         /*

          * If a crashdump isactive, then we are potentially in a very

          * memory constrainedenvironment. Limit us to 1 queue and

          * 64 tags to preventusing too much memory.

          */

         if (is_kdump_kernel()) {

                   set->nr_hw_queues = 1;

                   set->queue_depth = min(64U,set->queue_depth);

         }

 

         set->tags = kmalloc_node(set->nr_hw_queues *   //在这里给tags分配与nr_hw_queues个空间

                                      sizeof(struct blk_mq_tags *),

                                      GFP_KERNEL, set->numa_node);

         if (!set->tags)

                   return -ENOMEM;

 

         if (blk_mq_alloc_rq_maps(set))

                   goto enomem;

 

         mutex_init(&set->tag_list_lock);

         INIT_LIST_HEAD(&set->tag_list);

 

         return 0;

enomem:

         kfree(set->tags);

         set->tags = NULL;

         return -ENOMEM;

}

 

/*

 * Allocate the request maps associated withthis tag_set. Note that this

 * may reduce the depth asked for, if memory istight. set->queue_depth

 * will be updated to reflect the allocateddepth.

 */

static int blk_mq_alloc_rq_maps(structblk_mq_tag_set *set)

{

         unsigned int depth;

         int err;

 

         depth = set->queue_depth;

         do {

                   err = __blk_mq_alloc_rq_maps(set);//如果成功,则跳出,set->tags[xxx]等资源初始化完毕,否则,将queue_depth减半,创建。

                   if (!err)

                            break;

 

                   set->queue_depth >>= 1;

                   if (set->queue_depth < set->reserved_tags+ BLK_MQ_TAG_MIN) {

                            err = -ENOMEM;

                            break;

                   }

         } while (set->queue_depth);

 

         if (!set->queue_depth || err) {

                   pr_err("blk-mq: failed to allocate requestmap\n");

                   return -ENOMEM;

         }

 

         if (depth != set->queue_depth)

                   pr_info("blk-mq: reduced tag depth (%u ->%u)\n",

                                                        depth,set->queue_depth);

 

         return 0;

}

 

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set*set)

{

         int i;

 

         for (i = 0; i nr_hw_queues; i++) {//根据nr_hw_queues循环

                   set->tags[i] = blk_mq_init_rq_map(set, i);//初始化tag[i]

                   if (!set->tags[i])

                            goto out_unwind;

         }

 

         return 0;

 

out_unwind:

         while (--i >= 0)

                   blk_mq_free_rq_map(set,set->tags[i], i);

 

         return -ENOMEM;

}

 

static structblk_mq_tags *blk_mq_init_rq_map(structblk_mq_tag_set *set,

                   unsigned int hctx_idx)

{

         struct blk_mq_tags *tags;

         unsigned int i, j, entries_per_page,max_order = 4;

         size_t rq_size, left;

 

         tags = blk_mq_init_tags(set->queue_depth,set->reserved_tags,

                                     set->numa_node);//初始化tags

         if (!tags)

                   return NULL;

 

         INIT_LIST_HEAD(&tags->page_list);

 

         tags->rqs =kzalloc_node(set->queue_depth * sizeof(struct request *),

                                      GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,

                                      set->numa_node);//分配requests资源,每个queuequeue_depth

         if (!tags->rqs) {

                   blk_mq_free_tags(tags);

                   return NULL;

         }

 

         /*

          * rq_size is the size of the request plusdriver payload, rounded

          * to the cacheline size

          */

         rq_size = round_up(sizeof(structrequest) + set->cmd_size,

                                     cache_line_size());//设置request的大小,request大小为request 结构体与cmd_set结构体之和

         left = rq_size * set->queue_depth;

 

         for (i = 0; i < set->queue_depth;) {

                   int this_order = max_order;

                   struct page *page;

                   int to_do;

                   void *p;

 

                   while (left

                            this_order--;

 

                   do {

                            page =alloc_pages_node(set->numa_node,

                                     GFP_KERNEL| __GFP_NOWARN | __GFP_NORETRY,

                                     this_order);

                            if (page)

                                     break;

                            if (!this_order--)

                                     break;

                            if(order_to_size(this_order) < rq_size)

                                     break;

                   } while (1);

 

                   if (!page)

                            goto fail;

 

                   page->private =this_order;

                   list_add_tail(&page->lru,&tags->page_list);

 

                   p = page_address(page);

                   entries_per_page =order_to_size(this_order) / rq_size;

                   to_do = min(entries_per_page,set->queue_depth - i);

                   left -= to_do * rq_size;

                   for (j = 0; j < to_do;j++) {

                            tags->rqs[i] = p;

                            tags->rqs[i]->atomic_flags= 0;

                            tags->rqs[i]->cmd_flags= 0;

                            if(set->ops->init_request) {

                                     if(set->ops->init_request(set->driver_data,

                                                        tags->rqs[i],hctx_idx, i,

                                                        set->numa_node)){//这里调用init_request初始化request

                                               tags->rqs[i]= NULL;

                                               gotofail;

                                     }

                            }

 

                            p += rq_size;

                            i++;

                   }

         }

 

         return tags;

 

fail:

         blk_mq_free_rq_map(set, tags,hctx_idx);

         return NULL;

}

 

structblk_mq_tags *blk_mq_init_tags(unsignedint total_tags,

                                          unsigned int reserved_tags, int node)

{

         struct blk_mq_tags *tags;

 

         if (total_tags > BLK_MQ_TAG_MAX) {

                   pr_err("blk-mq: tagdepth too large\n");

                   return NULL;

         }

 

         tags = kzalloc_node(sizeof(*tags),GFP_KERNEL, node);//分配tags资源

         if (!tags)

                   return NULL;

 

         tags->nr_tags = total_tags;

         tags->nr_reserved_tags =reserved_tags;

 

         return blk_mq_init_bitmap_tags(tags,node);//初始化bitmap tags

}

 

static structblk_mq_tags *blk_mq_init_bitmap_tags(structblk_mq_tags *tags,

                                                           int node)

{

         unsigned int depth = tags->nr_tags -tags->nr_reserved_tags;//depth为总共的tags-保留的tags数。

 

         if (bt_alloc(&tags->bitmap_tags,depth, node, false))//初始化bitmap_tags

                   goto enomem;

         if(bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))//初始化breserved_tags

                   goto enomem;

 

         return tags;

enomem:

         bt_free(&tags->bitmap_tags);

         kfree(tags);

         return NULL;

}

 

 

static intbt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,

                            int node, boolreserved)

{

         int i;

 

         bt->bits_per_word = ilog2(BITS_PER_LONG);//BITS_PER_LONG 定义为64,则bits_per_word=6

 

         /*

          * Depth can be zero for reserved tags, that'snot a failure

          * condition.

          */

         if (depth) {//此处depth=64

                   unsigned int nr,tags_per_word;

 

                   tags_per_word = (1 <bits_per_word);

 

                   /*

                    * If the tag space is small, shrink the numberof tags

                    * per word so we spread over a few cachelines,at least.

                    * If less than 4 tags, just forget about it,it's not

                    * going to work optimally anyway.

                    */

                   if (depth >= 4) {

                            while (tags_per_word* 4 > depth) {

                                     bt->bits_per_word--;

                                     tags_per_word= (1 << bt->bits_per_word);

                            }

                   }

 

                   nr = ALIGN(depth,tags_per_word) / tags_per_word;//align函数是以tags_per_word整数倍对齐,每个word记录的tags数为tags_per_word,这样depthtags需要的word数为nr

                   bt->map = kzalloc_node(nr* sizeof(struct blk_align_bitmap),

                                                        GFP_KERNEL,node);//于是分配nrmap来记录这个tags

                   if (!bt->map)

                            return -ENOMEM;

 

                   bt->map_nr = nr;

         }

 

         bt->bs = kzalloc(BT_WAIT_QUEUES *sizeof(*bt->bs), GFP_KERNEL);

         if (!bt->bs) {

                   kfree(bt->map);

                   return -ENOMEM;

         }

 

         bt_update_count(bt, depth);//更新map中的depth

 

         for (i = 0; i < BT_WAIT_QUEUES; i++){

                   init_waitqueue_head(&bt->bs[i].wait);

                   atomic_set(&bt->bs[i].wait_cnt,bt->wake_cnt);

         }

 

         return 0;

}

 

/*

         初始化requestqueue

*/

structrequest_queue *blk_mq_init_queue(structblk_mq_tag_set *set)

{

         struct blk_mq_hw_ctx **hctxs;

         struct blk_mq_ctx __percpu *ctx;

         struct request_queue *q;

         unsigned int *map;

         int i;

 

         ctx = alloc_percpu(struct blk_mq_ctx);//分配ctx结构体空间

         if (!ctx)

                   return ERR_PTR(-ENOMEM);

 

         hctxs =kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,//分配nr hw queue hctxs结构体空间

                            set->numa_node);

 

         if (!hctxs)

                   goto err_percpu;

 

         map = blk_mq_make_queue_map(set);//得到cpuhwQueued映射map

         if (!map)

                   goto err_map;

 

         for (i = 0; i < set->nr_hw_queues;i++) {

                   int node =blk_mq_hw_queue_to_node(map, i);

 

                   hctxs[i] =kzalloc_node(sizeof(struct blk_mq_hw_ctx),//hctxs[i]的一些属性进行赋值

                                               GFP_KERNEL,node);

                   if (!hctxs[i])

                            goto err_hctxs;

 

                   if(!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,

                                                        node))

                            goto err_hctxs;

 

                   atomic_set(&hctxs[i]->nr_active,0);

                   hctxs[i]->numa_node =node;

                   hctxs[i]->queue_num = i;

         }

 

         q = blk_alloc_queue_node(GFP_KERNEL,set->numa_node);//分配一个request_queue资源,并初始化

         if (!q)

                   goto err_hctxs;

 

         /*

          *Init percpu_ref in atomic mode so that it's faster to shutdown.

          * See blk_register_queue() for details.

          */

         if(percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,

                                PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))

                   goto err_map;

 

         setup_timer(&q->timeout,blk_mq_rq_timer, (unsigned long) q);

         blk_queue_rq_timeout(q, 30000);

 

         q->nr_queues = nr_cpu_ids;

         q->nr_hw_queues =set->nr_hw_queues;

         q->mq_map = map;

 

         q->queue_ctx = ctx;

         q->queue_hw_ctx = hctxs;

 

         q->mq_ops = set->ops;

         q->queue_flags |=QUEUE_FLAG_MQ_DEFAULT;

 

         if (!(set->flags &BLK_MQ_F_SG_MERGE))

                   q->queue_flags |= 1<< QUEUE_FLAG_NO_SG_MERGE;

 

         q->sg_reserved_size = INT_MAX;

 

         INIT_WORK(&q->requeue_work,blk_mq_requeue_work);

         INIT_LIST_HEAD(&q->requeue_list);

         spin_lock_init(&q->requeue_lock);

 

         if (q->nr_hw_queues > 1)

                   blk_queue_make_request(q,blk_mq_make_request);//设置make_request函数指针

         else

                   blk_queue_make_request(q,blk_sq_make_request);

 

         if (set->timeout)

                   blk_queue_rq_timeout(q,set->timeout);

 

         /*

          * Do this after blk_queue_make_request()overrides it...

          */

         q->nr_requests =set->queue_depth;

 

         if (set->ops->complete)

                   blk_queue_softirq_done(q,set->ops->complete);

 

         blk_mq_init_cpu_queues(q,set->nr_hw_queues);//初始化sw queue

 

         if (blk_mq_init_hw_queues(q, set))//初始化hw queue

                   goto err_hw;

 

         mutex_lock(&all_q_mutex);

         list_add_tail(&q->all_q_node,&all_q_list);

         mutex_unlock(&all_q_mutex);

 

         blk_mq_add_queue_tag_set(set, q);

 

         blk_mq_map_swqueue(q);//映射sw queue

 

         return q;

 

err_hw:

         blk_cleanup_queue(q);

err_hctxs:

         kfree(map);

         for (i = 0; i nr_hw_queues; i++) {

                   if (!hctxs[i])

                            break;

                   free_cpumask_var(hctxs[i]->cpumask);

                   kfree(hctxs[i]);

         }

err_map:

         kfree(hctxs);

err_percpu:

         free_percpu(ctx);

         return ERR_PTR(-ENOMEM);

}

 

该函数用于设置并启动io queue。这时候admin queue已经初始化完毕,可以给admin queue下发cmd来创建io queue。

static int nvme_setup_io_queues(structnvme_dev *dev)

{

         struct nvme_queue *adminq =dev->queues[0];

         struct pci_dev *pdev = dev->pci_dev;

         int result, i, vecs, nr_io_queues,size;

 

         nr_io_queues = num_possible_cpus();//得到cpu num

         result = set_queue_count(dev,nr_io_queues);//发送set feature cmd 创建io queue

         if (result <= 0)

                   return result;

         if (result < nr_io_queues)

                   nr_io_queues = result;

 

         size = db_bar_size(dev, nr_io_queues);//计算所需要的bar size,如果>8192,则需要重新分配bar size 空间。

         if (size > 8192) {

                   iounmap(dev->bar);

                   do {

                            dev->bar =ioremap(pci_resource_start(pdev, 0), size);

                            if (dev->bar)

                                     break;

                            if (!--nr_io_queues)

                                     return-ENOMEM;

                            size =db_bar_size(dev, nr_io_queues);

                   } while (1);

                   dev->dbs = ((void __iomem*)dev->bar) + 4096;

                   adminq->q_db =dev->dbs;

         }

 

         /* Deregister the admin queue'sinterrupt */

         free_irq(dev->entry[0].vector,adminq);

 

         /*

          * If we enable msix early due to not intx,disable it again before

          * setting up the full range we need.

          */

         if (!pdev->irq)

                   pci_disable_msix(pdev);

 

         for (i = 0; i < nr_io_queues; i++)

                   dev->entry[i].entry = i;

         vecs = pci_enable_msix_range(pdev,dev->entry, 1, nr_io_queues);//请求msix 范围

        

         if (vecs < 0) {

                   vecs =pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));

                   if (vecs < 0) {

                            vecs = 1;

                   } else {

                            for (i = 0; i

                                     dev->entry[i].vector= i + pdev->irq;

                   }

         }

 

         /*

          * Should investigate if there's a performancewin from allocating

          * more queues than interrupt vectors; it mightallow the submission

          * path to scale better, even if the receivepath is limited by the

          * number of interrupts.

          */

         nr_io_queues = vecs;

         dev->max_qid = nr_io_queues;

 

         result = queue_request_irq(dev, adminq,adminq->irqname);

         if (result)

                   goto free_queues;

 

         /* Free previously allocated queuesthat are no longer usable */

         nvme_free_queues(dev, nr_io_queues +1);

         nvme_create_io_queues(dev);//创建io queues

 

         return 0;

 

 free_queues:

         nvme_free_queues(dev, 1);

         return result;

}

 

static void nvme_create_io_queues(structnvme_dev *dev)

{

         unsigned i;

 

         for (i = dev->queue_count; i <=dev->max_qid; i++)

                   if (!nvme_alloc_queue(dev, i,dev->q_depth))//分配nvmeq结构体,并记录到dev->queues[]数组中,并分配submit queue complete queue所需要的空间。

                            break;

 

         for (i = dev->online_queues; i <=dev->queue_count - 1; i++)

                   if(nvme_create_queue(dev->queues[i], i))//admin queue发送cmd创建cq sq

                            break;

}

 

 

static int nvme_dev_add(structnvme_dev *dev)

{

         struct pci_dev *pdev = dev->pci_dev;

         int res;

         unsigned nn, i;

         struct nvme_ns *ns;

         struct nvme_id_ctrl *ctrl;

         struct nvme_id_ns *id_ns;

         void *mem;

         dma_addr_t dma_addr;

         int shift =NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;//最大page size

 

         mem =dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);//申请8192大小的dma 空间,来存放identify data

         if (!mem)

                   return -ENOMEM;

 

         res = nvme_identify(dev, 0, 1, dma_addr);//发送identify cmd

         if (res) {

                   dev_err(&pdev->dev,"Identify Controller failed (%d)\n", res);

                   res = -EIO;

                   goto out;

         }

 

         ctrl = mem;

         nn = le32_to_cpup(&ctrl->nn);//获得namespace number

         dev->oncs =le16_to_cpup(&ctrl->oncs);

         dev->abort_limit = ctrl->acl + 1;

         dev->vwc = ctrl->vwc;

         dev->event_limit = min(ctrl->aerl+ 1, 8);

         memcpy(dev->serial, ctrl->sn,sizeof(ctrl->sn));

         memcpy(dev->model, ctrl->mn,sizeof(ctrl->mn));

         memcpy(dev->firmware_rev,ctrl->fr, sizeof(ctrl->fr));

         if (ctrl->mdts)

                   dev->max_hw_sectors = 1<< (ctrl->mdts + shift - 9);

         if ((pdev->vendor ==PCI_VENDOR_ID_INTEL) &&

                            (pdev->device ==0x0953) && ctrl->vs[3]) {

                   unsigned int max_hw_sectors;

 

                   dev->stripe_size = 1<< (ctrl->vs[3] + shift);

                   max_hw_sectors =dev->stripe_size >> (shift - 9);

                   if (dev->max_hw_sectors) {

                            dev->max_hw_sectors= min(max_hw_sectors,

                                                                 dev->max_hw_sectors);

                   } else

                            dev->max_hw_sectors= max_hw_sectors;

         }

 

         dev->tagset.ops = &nvme_mq_ops;

         dev->tagset.nr_hw_queues =dev->online_queues - 1;

         dev->tagset.timeout =NVME_IO_TIMEOUT;

         dev->tagset.numa_node =dev_to_node(&dev->pci_dev->dev);

         dev->tagset.queue_depth =

                                     min_t(int,dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;

         dev->tagset.cmd_size = sizeof(structnvme_cmd_info);

         dev->tagset.flags =BLK_MQ_F_SHOULD_MERGE;

         dev->tagset.driver_data = dev;

 

         if(blk_mq_alloc_tag_set(&dev->tagset))//设置tagset,为io queue用

                   goto out;

 

         id_ns = mem;

         for (i = 1; i <= nn; i++) {

                   res = nvme_identify(dev, i,0, dma_addr);//发送identify cmd

                   if (res)

                            continue;

 

                   if (id_ns->ncap == 0)

                            continue;

 

                   res = nvme_get_features(dev,NVME_FEAT_LBA_RANGE, i,

                                                                 dma_addr+ 4096, NULL);

                   if (res)

                            memset(mem + 4096,0, 4096);

 

                   ns = nvme_alloc_ns(dev, i,mem, mem + 4096);//为每个namespace 分配资源

                   if (ns)

                            list_add_tail(&ns->list,&dev->namespaces);

         }

         list_for_each_entry(ns,&dev->namespaces, list)

                   add_disk(ns->disk);//注册分区,这里注册namespce更恰当,在user layer表现为一个分区

         res = 0;

 

 out:

         dma_free_coherent(&dev->pci_dev->dev,8192, mem, dma_addr);

         return res;

}

 

 

static structnvme_ns *nvme_alloc_ns(structnvme_dev *dev, unsigned nsid,

                            struct nvme_id_ns*id, struct nvme_lba_range_type *rt)

{

         struct nvme_ns *ns;

         struct gendisk *disk;

         int node =dev_to_node(&dev->pci_dev->dev);

         int lbaf;

 

         if (rt->attributes &NVME_LBART_ATTRIB_HIDE)

                   return NULL;

 

         ns = kzalloc_node(sizeof(*ns),GFP_KERNEL, node);

         if (!ns)

                   return NULL;

         ns->queue =blk_mq_init_queue(&dev->tagset);//初始化request queue

         if (IS_ERR(ns->queue))

                   goto out_free_ns;

         queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES,ns->queue);

         queue_flag_set_unlocked(QUEUE_FLAG_NONROT,ns->queue);

         queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,ns->queue);

         ns->dev = dev;

         ns->queue->queuedata = ns;

 

         disk = alloc_disk_node(0, node);//分配disk 资源

         if (!disk)

                   goto out_free_queue;

 

         ns->ns_id = nsid;

         ns->disk = disk;

         lbaf = id->flbas & 0xf;

         ns->lba_shift =id->lbaf[lbaf].ds;

         ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);

         blk_queue_logical_block_size(ns->queue,1 << ns->lba_shift);

         if (dev->max_hw_sectors)

                   blk_queue_max_hw_sectors(ns->queue,dev->max_hw_sectors);

         if (dev->stripe_size)

                   blk_queue_chunk_sectors(ns->queue,dev->stripe_size >> 9);

         if (dev->vwc &NVME_CTRL_VWC_PRESENT)

                   blk_queue_flush(ns->queue,REQ_FLUSH | REQ_FUA);

 

         disk->major = nvme_major;

         disk->first_minor = 0;

         disk->fops = &nvme_fops;

         disk->private_data = ns;

         disk->queue = ns->queue;

         disk->driverfs_dev =&dev->pci_dev->dev;

         disk->flags = GENHD_FL_EXT_DEVT;

         sprintf(disk->disk_name,"nvme%dn%d", dev->instance, nsid);

         set_capacity(disk,le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

 

         if (dev->oncs &NVME_CTRL_ONCS_DSM)

                   nvme_config_discard(ns);

 

         return ns;

 

 out_free_queue:

         blk_cleanup_queue(ns->queue);

 out_free_ns:

         kfree(ns);

         return NULL;

}

 

至此,整个 nvme 初始化过程分析完毕。

你可能感兴趣的:(Linux,device,driver)