驱动 | Linux | NVMe | 2. nvme_probe

本文主要参考这里 [1][2] 的解析和 linux 源码 [3]。
此处推荐一个可以便捷查看 linux 源码的网站 bootlin [4]。

更新:2022 / 02 / 19


目录

  • nvme_pci_alloc_dev
    • nvme_reset_work
      • nvme_pci_enable
        • pci_alloc_irq_vectors
          • pci_alloc_irq_vectors_affinity
        • nvme_pci_configure_admin_queue
          • 1. 读取 NVM Subsystem Reset 寄存器
          • 2&5. nvme_disable_ctrl / nvme_enable_ctrl
            • nvme_wait_ready
          • 3. nvme_alloc_queue
          • 4. lo_hi_writeq
          • 6. nvme_init_queue
          • 7. queue_request_irq
    • nvme_init_ctrl
      • dev_set_name
        • kobject_set_name_vargs
  • nvme_dev_map
    • pci_request_mem_regions
        • pci_select_bars
      • pci_request_selected_regions
    • nvme_remap_bar
  • nvme_setup_prp_pools
  • nvme_pci_enable
      • pci_alloc_irq_vectors
        • pci_alloc_irq_vectors_affinity
      • pci_enable_pcie_error_reporting
        • pcie_aer_is_native
        • pcie_capability_set_word
        • pcibios_err_to_errno
      • nvme_pci_configure_admin_queue
          • 1. 读取 NVM Subsystem Reset 寄存器
          • 2&5. nvme_disable_ctrl / nvme_enable_ctrl
            • nvme_wait_ready
          • 3. nvme_alloc_queue
          • 4. lo_hi_writeq
          • 6. nvme_init_queue
          • 7. queue_request_irq
  • nvme_alloc_admin_tag_set
    • blk_mq_alloc_tag_set
    • blk_mq_init_queue
  • nvme_init_ctrl_finish
    • nvme_init_identify
      • nvme_identify_ctrl
        • c.identify
        • __nvme_submit_sync_cmd
          • 1. blk_mq_alloc_request
          • 2. blk_rq_map_kern
          • 3. nvme_execute_rq
  • nvme_setup_io_queues
      • nvme_set_queue_count
        • nvme_set_features
          • c.features
            • cpu_to_le32
          • __nvme_submit_sync_cmd
      • nvme_create_io_queues
        • nvme_create_queue
          • adapter_alloc_cq / adapter_alloc_sq
  • nvme_start_ctrl
      • nvme_queue_scan
        • insert_work
  • flush_work
      • start_flush_work
  • 参考链接


先来回忆一下 nvme_probe 的定义:

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct nvme_dev *dev;
	int result = -ENOMEM;
	
	// 1.
	dev = nvme_pci_alloc_dev(pdev, id);
	if (!dev)
		return -ENOMEM;
		
	// 2. 获得PCI Bar的虚拟地址
	result = nvme_dev_map(dev);
	if (result)
		goto out_uninit_ctrl;
	
	// 3. 设置 DMA 需要的 PRP 内存池
	result = nvme_setup_prp_pools(dev);
	if (result)
		goto out_dev_unmap;
	
	// 4. 配置prp和sgl
	result = nvme_pci_alloc_iod_mempool(dev);
	if (result)
		goto out_release_prp_pools;
	
	// 打印出被识别到的dev是什么pci function
	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
		
	// 5.
	result = nvme_pci_enable(dev);
	if (result)
		goto out_release_iod_mempool;
	
	// 6. 调用 nvme_alloc_admin_tag_set 初始化 admin tag set,在这个过程中特别提一下 ops 的赋值(后续会用到)。
	result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
				&nvme_mq_admin_ops, sizeof(struct nvme_iod));
	if (result)
		goto out_disable;

	/*
	 * Mark the controller as connecting before sending admin commands to
	 * allow the timeout handler to do the right thing.
	 */
	// 在发送 admin commands 前将 nvme ctrl 的状态标记为 CONNECTING;若标记失败,说明 nvme ctrl 正忙,返回 -EBUSY 并走 out_disable 流程。
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		result = -EBUSY;
		goto out_disable;
	}
		
	// 7. 完成 NVMe Controller 的初始化(读取 identify 等信息)
	result = nvme_init_ctrl_finish(&dev->ctrl, false);
	if (result)
		goto out_disable;
	
	// 8. allocate dma for dbbuf
	nvme_dbbuf_dma_alloc(dev);
	
	// 9. 配置host memory buffer
	result = nvme_setup_host_mem(dev);
	if (result < 0)
		goto out_disable;
	
	// 10. 配置io queues
	result = nvme_setup_io_queues(dev);
	if (result)
		goto out_disable;

	// 11. 调用 nvme_alloc_io_tag_set 初始化 IO tag set
	if (dev->online_queues > 1) {
		nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
				nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
		nvme_dbbuf_set(dev);
	}

	// 如果 ctrl.tagset 未被设置,打印 IO queues 未创建的警告。
	if (!dev->ctrl.tagset)
		dev_warn(dev->ctrl.device, "IO queues not created\n");

	// 如果无法将 nvme ctrl 标记为 LIVE 状态,打印警告并返回 -ENODEV。
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller live state\n");
		result = -ENODEV;
		goto out_disable;
	}
	
	// 12. 为设备设置私有数据指针 
	pci_set_drvdata(pdev, dev);
	
	// 13. 设置队列并针对不同的cpu进行队列重整
	nvme_start_ctrl(&dev->ctrl);
	// 14. 递减引用次数
	nvme_put_ctrl(&dev->ctrl);
	// 15. 等待队列中最后一个实例完成执行;
	flush_work(&dev->ctrl.scan_work);
	return 0;

out_disable:
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_dev_disable(dev, true);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_dbbuf_dma_free(dev);
	nvme_free_queues(dev, 0);
out_release_iod_mempool:
	mempool_destroy(dev->iod_mempool);
out_release_prp_pools:
	nvme_release_prp_pools(dev);
out_dev_unmap:
	nvme_dev_unmap(dev);
out_uninit_ctrl:
	nvme_uninit_ctrl(&dev->ctrl);
	return result;
}

接下来对 nvme_probe 函数的过程进行结构整理,各函数及其作用如下所示:

nvme_probe
  • nvme_pci_alloc_dev:初始化 nvme ctrl
    • dev_to_node:获得这个 pci_dev 的 numa 节点
    • kzalloc_node:为节点配置空间和 workqueue
    • INIT_WORK:初始化 work 变量
    • get_device:增加设备对象引用次数
    • nvme_init_ctrl:初始化 nvme ctrl 结构
  • nvme_dev_map:获得 PCI Bar 的虚拟地址
    • pci_request_mem_regions:申请内存空间
    • nvme_remap_bar:将 IO 地址空间映射到内核的虚拟空间中
  • nvme_setup_prp_pools:设置 DMA 需要的 PRP 内存池
    • dma_pool_create:设置 2 个 prp list 长度不同的 dma pool
  • nvme_pci_alloc_iod_mempool:配置 prp 和 sgl
  • nvme_pci_enable:初始化 pci function
    • pci_enable_device_mem:使能 nvme 设备的内存空间 iomem,即之前映射的 bar 空间
    • pci_set_master:设置设备具有获得总线的能力,即调用这个函数使设备具备申请使用 PCI 总线的能力
    • dma_set_mask_and_coherent:设定这个 nvme 设备的 DMA 区域大小
    • NVME_REG_CSTS:读取寄存器获取 controller 的状态
    • pci_alloc_irq_vectors:为设备分配中断请求
    • nvme_map_cmb:匹配主机端 CMB 映射表到 controller 端并配置 CMB size
    • pci_enable_pcie_error_reporting:使能 pcie 错误报告功能,将标准的 PCI 错误码转换为预定义的错误码
    • pci_save_state:在 suspend 之前保存设备当前的 PCI Configuration Space
    • nvme_pci_configure_admin_queue:构建 admin queue
  • nvme_alloc_admin_tag_set:初始化 tag set 结构体
  • nvme_init_ctrl_finish:结束 nvme ctrl 的初始化
  • nvme_dbbuf_dma_alloc:为 dbbuf 配置 dma
  • nvme_setup_host_mem:配置 host memory buffer
  • nvme_setup_io_queues:配置 io queues
    • nvme_set_queue_count:构建 queue count
    • nvme_create_io_queues:创建 queue
  • nvme_alloc_io_tag_set:初始化 io tag set 结构体
  • pci_set_drvdata:为设备设置私有数据指针
  • nvme_start_ctrl:设置队列并针对不同的 cpu 进行队列重整
  • nvme_put_ctrl:递减引用次数
  • flush_work:等待队列中最后一个实例完成执行

再来对 nvme_probe 过程中的一些函数的使用进行进一步分析,往下看:


nvme_pci_alloc_dev

nvme_probe 中使用 nvme_pci_alloc_dev

static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
		const struct pci_device_id *id)
{
	unsigned long quirks = id->driver_data;
	
	// 1. 通过调用 dev_to_node 得到这个 pci_dev 的 numa 节点。
	int node = dev_to_node(&pdev->dev);
	struct nvme_dev *dev;
	int ret = -ENOMEM;
	
	// 如果没有指定的话,默认用 first_memory_node,也就是第一个 numa 节点,并调用 set_dev_node 来设置。
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);
	
	// 2. 为 nvme dev 节点分配空间
	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return NULL;
		
	// 3. 初始化 reset_work 变量,之后排入 workqueue 中执行
	// 4. 其处理函数为 nvme_reset_work,负责 reset 操作
	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	// 初始化互斥锁
	mutex_init(&dev->shutdown_lock);
	
	// 5. 分配queue
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
	if (!dev->queues)
		goto out_free_dev;

	// 6. 增加设备对象的引用计数
	dev->dev = get_device(&pdev->dev);

	quirks |= check_vendor_combination_bug(pdev);
	if (!noacpi && acpi_storage_d3(&pdev->dev)) {
		/*
		 * Some systems use a bios work around to ask for D3 on
		 * platforms that support kernel managed suspend.
		 */
		dev_info(&pdev->dev,
			 "platform quirk: setting simple suspend\n");
		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
	}
	// 初始化 NVMe Controller 结构
	ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			     quirks);
	if (ret)
		goto out_put_device;
	
	dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
	dma_set_max_seg_size(&pdev->dev, 0xffffffff);

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9);
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	/*
	 * There is no support for SGLs for metadata (yet), so we are limited to
	 * a single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;
	return dev;

out_put_device:
	put_device(dev->dev);
	kfree(dev->queues);
out_free_dev:
	kfree(dev);
	return ERR_PTR(ret);
}

nvme_pci_alloc_dev 做了什么呢?概括起来:按 numa 节点分配并初始化 nvme_dev 结构,初始化 reset_work 与 shutdown_lock,分配 nvme_queue 数组,增加 pci_dev 的引用计数,调用 nvme_init_ctrl 初始化 NVMe Controller 结构,最后设置 DMA 对齐以及 max_hw_sectors / max_segments 等传输限制。


nvme_reset_work

nvme_pci_alloc_dev 中通过 INIT_WORK 将 nvme_reset_work 注册为 reset_work 的处理函数,其定义如下:

static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result;
		
	// 1. 检查NVME_CTRL_RESETTING标志,来确保nvme_reset_work不会被重复进入.
	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
			 dev->ctrl.state);
		return;
	}

	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);
	nvme_sync_queues(&dev->ctrl);

	mutex_lock(&dev->shutdown_lock);
	// 2. 
	result = nvme_pci_enable(dev);
	if (result)
		goto out_unlock;
	// 3. 
	nvme_unquiesce_admin_queue(&dev->ctrl);
	mutex_unlock(&dev->shutdown_lock);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		result = -EBUSY;
		goto out;
	}

	result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
	if (result)
		goto out;

	nvme_dbbuf_dma_alloc(dev);

	result = nvme_setup_host_mem(dev);
	if (result < 0)
		goto out;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Freeze and update the number of I/O queues as those might have
	 * changed.  If there are no I/O queues left after this reset, keep the
	 * controller around but remove all namespaces.
	 */
	if (dev->online_queues > 1) {
		nvme_unquiesce_io_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		nvme_pci_update_nr_queues(dev);
		nvme_dbbuf_set(dev);
		nvme_unfreeze(&dev->ctrl);
	} else {
		dev_warn(dev->ctrl.device, "IO queues lost\n");
		nvme_mark_namespaces_dead(&dev->ctrl);
		nvme_unquiesce_io_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		nvme_free_tagset(dev);
	}

	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller live state\n");
		result = -ENODEV;
		goto out;
	}

	nvme_start_ctrl(&dev->ctrl);
	return;

 out_unlock:
	mutex_unlock(&dev->shutdown_lock);
 out:
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
		 result);
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_dev_disable(dev, true);
	nvme_mark_namespaces_dead(&dev->ctrl);
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}

nvme_pci_enable

nvme_reset_work 中有调用 nvme_pci_enable,其定义如下:

static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int dma_address_bits = 64;
	
	// 1. 使能nvme设备的内存空间iomem,也就是之前映射的bar空间。
	if (pci_enable_device_mem(pdev))
		return result;
	
	// 设置设备具有获得总线的能力,即调用这个函数,使设备具备申请使用PCI总线的能力。
	pci_set_master(pdev);

	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
		dma_address_bits = 48;
	// 设定这个nvme设备的DMA区域大小,64 bits或者48 bits
	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
		goto disable;
	
	// 读取Controller寄存器NVME_REG_CSTS,判断Controller的状态
	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	// 为设备分配中断请求。nvme设备支持三种中断模式:INTx/MSI/MSI-X.
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		goto disable;
	
	// 获取设备64位的Controller Capabilities(CAP)
	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	// 设置Doorbell地址,这里的4096来自SQ Tail DB的起始地址0x1000
	dev->dbs = dev->bar + 4096;

	/*
	 * Some Apple controllers require a non-standard SQE size.
	 * Interestingly they also seem to ignore the CC:IOSQES register
	 * so we don't bother updating it here.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
		dev->io_sqes = 7;
	else
		dev->io_sqes = NVME_NVM_IOSQES;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
                        "set queue depth=%u\n", dev->q_depth);
	}

	/*
	 * Controllers with the shared tags quirk need the IO queue to be
	 * big enough so that we get 32 tags for the admin queue
	 */
	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
		dev->q_depth = NVME_AQ_DEPTH + 2;
		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
			 dev->q_depth);
	}
	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */

	nvme_map_cmb(dev);
	
	// 错误处理
	pci_enable_pcie_error_reporting(pdev);
	// Suspend之前保存设备当下的状态
	pci_save_state(pdev);

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto free_irq;
	return result;

 free_irq:
	pci_free_irq_vectors(pdev);
 disable:
	pci_disable_device(pdev);
	return result;
}
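
上面的 q_depth、db_stride 都是从 64 位的 CAP 寄存器中解出来的(后文的超时时间也来自 CAP.TO)。下面给出一个可独立编译的小程序(仅为示意,其中的 CAP 取值是假设的),按 NVMe 规范的位域演示这些计算,其含义与内核中 NVME_CAP_MQES / NVME_CAP_STRIDE / NVME_CAP_TIMEOUT / NVME_CAP_NSSRC 等宏一致:

#include <stdio.h>
#include <stdint.h>

/* 按 NVMe 规范的位域定义解码 CAP */
static unsigned cap_mqes(uint64_t cap)  { return cap & 0xffff; }        /* bit[15:0]            */
static unsigned cap_to(uint64_t cap)    { return (cap >> 24) & 0xff; }  /* bit[31:24],单位 500ms */
static unsigned cap_dstrd(uint64_t cap) { return (cap >> 32) & 0xf; }   /* bit[35:32]           */
static unsigned cap_nssrs(uint64_t cap) { return (cap >> 36) & 0x1; }   /* bit[36]              */

int main(void)
{
	uint64_t cap = 0x00000010203c03ffULL;	/* 假设的 CAP 值,仅作演示 */

	unsigned q_depth   = cap_mqes(cap) + 1;     /* MQES 是 0's based,驱动中还会和 io_queue_depth 取 min */
	unsigned db_stride = 1u << cap_dstrd(cap);  /* 对应 dev->db_stride,单位是 4 字节(u32) */
	unsigned timeout_s = (cap_to(cap) + 1) / 2; /* 500ms 单位折算成秒 */

	printf("q_depth=%u db_stride=%u timeout=%us nssrs=%u\n",
	       q_depth, db_stride, timeout_s, cap_nssrs(cap));
	return 0;
}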

pci_alloc_irq_vectors

nvme_pci_enable 中有调用 pci_alloc_irq_vectors,其定义如下:

/**
 * pci_alloc_irq_vectors() - Allocate multiple device interrupt vectors
 * @dev:      the PCI device to operate on
 * @min_vecs: minimum required number of vectors (must be >= 1)
 * @max_vecs: maximum desired number of vectors
 * @flags:    One or more of:
 *
 *            * %PCI_IRQ_MSIX      Allow trying MSI-X vector allocations
 *            * %PCI_IRQ_MSI       Allow trying MSI vector allocations
 *
 *            * %PCI_IRQ_LEGACY    Allow trying legacy INTx interrupts, if
 *              and only if @min_vecs == 1
 *
 *            * %PCI_IRQ_AFFINITY  Auto-manage IRQs affinity by spreading
 *              the vectors around available CPUs
 *
 * Allocate up to @max_vecs interrupt vectors on device. MSI-X irq
 * vector allocation has a higher precedence over plain MSI, which has a
 * higher precedence over legacy INTx emulation.
 *
 * Upon a successful allocation, the caller should use pci_irq_vector()
 * to get the Linux IRQ number to be passed to request_threaded_irq().
 * The driver must call pci_free_irq_vectors() on cleanup.
 *
 * Return: number of allocated vectors (which might be smaller than
 * @max_vecs), -ENOSPC if less than @min_vecs interrupt vectors are
 * available, other errnos otherwise.
 */
int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
			  unsigned int max_vecs, unsigned int flags)
{
	return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs,
					      flags, NULL);
}
EXPORT_SYMBOL(pci_alloc_irq_vectors);

pci_alloc_irq_vectors_affinity
/**
 * pci_alloc_irq_vectors_affinity() - Allocate multiple device interrupt
 *                                    vectors with affinity requirements
 * @dev:      the PCI device to operate on
 * @min_vecs: minimum required number of vectors (must be >= 1)
 * @max_vecs: maximum desired number of vectors
 * @flags:    allocation flags, as in pci_alloc_irq_vectors()
 * @affd:     affinity requirements (can be %NULL).
 *
 * Same as pci_alloc_irq_vectors(), but with the extra @affd parameter.
 * Check that function docs, and &struct irq_affinity, for more details.
 */
int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
				   unsigned int max_vecs, unsigned int flags,
				   struct irq_affinity *affd)
{
	struct irq_affinity msi_default_affd = {0};
	int nvecs = -ENOSPC;
	
	// 当IRQ为Affinity自动分配时,IRQ中断会分配给所有CPUs。
	// 在nvme_pci_enable过程中调用时,*affd=NULL
	if (flags & PCI_IRQ_AFFINITY) {
		if (!affd)
			affd = &msi_default_affd;
	} else {
		if (WARN_ON(affd))
			affd = NULL;
	}
	
	// 分配MSI-X中断,配置MSI-X capability structure
	if (flags & PCI_IRQ_MSIX) {
		nvecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
						affd, flags);
		if (nvecs > 0)
			return nvecs;
	}
	
	// 分配MSI中断,配置MSI capability structure
	if (flags & PCI_IRQ_MSI) {
		nvecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, affd);
		if (nvecs > 0)
			return nvecs;
	}

	/* use legacy IRQ if allowed */
	// 分配 INTx 中断
	if (flags & PCI_IRQ_LEGACY) {
		if (min_vecs == 1 && dev->irq) {
			/*
			 * Invoke the affinity spreading logic to ensure that
			 * the device driver can adjust queue configuration
			 * for the single interrupt case.
			 */
			if (affd)
				irq_create_affinity_masks(1, affd);
			pci_intx(dev, 1);
			return 1;
		}
	}

	return nvecs;
}
EXPORT_SYMBOL(pci_alloc_irq_vectors_affinity);

这三种中断不能同时 enable,比如要采用 MSI-X 中断,那就必须把 INTx 和 MSI 中断 disable。
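
作为一个假设性的示意(并非 nvme 驱动里的真实代码,demo_ 前缀的函数和向量数目 32 都是举例),驱动可以通过 flags 限定只尝试某一类中断,失败后再退回其它类型:

#include <linux/pci.h>

/* 示意:按 MSI-X -> MSI -> INTx 的优先级为设备申请中断向量 */
static int demo_alloc_vectors(struct pci_dev *pdev)
{
	int nr;

	/* 先只尝试 MSI-X,最多 32 个向量(假设值) */
	nr = pci_alloc_irq_vectors(pdev, 1, 32, PCI_IRQ_MSIX);
	if (nr < 0)
		/* 退回 MSI,再不行就退回 legacy INTx(此时只可能有 1 个向量) */
		nr = pci_alloc_irq_vectors(pdev, 1, 1,
					   PCI_IRQ_MSI | PCI_IRQ_LEGACY);
	if (nr < 0)
		return nr;

	/* 成功后用 pci_irq_vector(pdev, i) 取第 i 个向量对应的 Linux IRQ 号 */
	return nr;
}

这与 nvme_pci_enable 中用 PCI_IRQ_ALL_TYPES 先申请 1 个向量的做法是一致的,只是把允许的中断类型收窄了。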


nvme_pci_configure_admin_queue

nvme_pci_enable 中有调用 nvme_pci_configure_admin_queue,其定义如下:

static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;
	
	// 1. 从CAP寄存器中判断对Subsystem Reset的支持情况
	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	/*
	 * If the device has been passed off to us in an enabled state, just
	 * clear the enabled bit.  The spec says we should set the 'shutdown
	 * notification bits', but doing so may cause the device to complete
	 * commands to the admin queue ... and we don't know what memory that
	 * might be pointing at!
	 */
	// 2. 调用nvme_disable_ctrl
	result = nvme_disable_ctrl(&dev->ctrl, false);
	if (result < 0)
		return result;
	
	// 3. 调用nvme_alloc_queue
	// 设备disable之后第一次调用nvmeq, 此时值为Null。这时需要调用nvme_alloc_queue分配NVMe queue.
	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
	if (result)
		return result;

	dev->ctrl.numa_node = dev_to_node(dev->dev);

	nvmeq = &dev->queues[0];
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	// 4. nvme_alloc_queue分配NVMe queue后,就要将nvme admin queue的属性以及已经分配的admin SQ/CQ内存地址写入寄存器。
	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	// 5. 对admin queue分配内存之后,调用nvme_enable_ctrl将设备enable。这个函数与nvme_disable_ctrl函数类似,只是过程相反。
	result = nvme_enable_ctrl(&dev->ctrl);
	if (result)
		return result;
	
	// 6. 对之前申请的queue进行初始化操作
	nvmeq->cq_vector = 0;
	nvme_init_queue(nvmeq, 0);
	
	// 7. 调用queue_request_irq申请中断。这个函数主要的工作是设置中断处理函数,默认情况下不使用线程化的中断处理,而是使用中断上下文的中断处理。
	result = queue_request_irq(nvmeq);
	if (result) {
		dev->online_queues--;
		return result;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;
}

从上面的代码,我们可以了解到 nvme_pci_configure_admin_queue 中大致的步骤如下:

    1. 从 CAP 寄存器中判断对 Subsystem Reset 的支持情况;
    2. 调用 nvme_disable_ctrl
    3. 调用 nvme_alloc_queue
    4. 调用 lo_hi_writeq
    5. 调用 nvme_enable_ctrl
    6. 调用 nvme_init_queue
    7. 调用 queue_request_irq

对于上面的步骤,挨个逐步分析:


1. 读取 NVM Subsystem Reset 寄存器

Controller 的 CAP 寄存器 bit[36] 定义了对 NVM subsystem reset 是否支持,如下图:

(图:CAP 寄存器中 NSSRS(bit 36)的定义)
一般情况下,NVM subsystem 就是一块 SSD 了,由 Controller、NAND 以及接口组成一个 NVM subsystem。
NVM Subsystem Reset 是 Controller Level Reset 的一种。


2&5. nvme_disable_ctrl / nvme_enable_ctrl

在对 NVMe controller 进行操作时需要通过 nvme_disable_ctrl 将设备 disable,完成后再调用 nvme_enable_ctrl 将设备 enable

int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	if (shutdown)
		ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
	else
		ctrl->ctrl_config &= ~NVME_CC_ENABLE;
	
	// 这里的ctrl->ops就是之前nvme_probe函数中nvme_init_ctrl时传进去的nvme_pci_ctrl_ops
	// reg_write32通过NVME_REG_CC寄存器disable设备。
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	if (shutdown) {
		return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
				       NVME_CSTS_SHST_CMPLT,
				       ctrl->shutdown_timeout, "shutdown");
	}
	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
		msleep(NVME_QUIRK_DELAY_AMOUNT);
	// 在函数最后,通过读取状态寄存器NVME_REG_CSTS来等待设备真正停止。
	// 超时上限是根据CAP寄存器Bit[31:24]的Timeout域来计算出来的,每个单位代表500ms。
	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
			       (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
// 通过读取状态寄存器NVME_REG_CSTS来等待设备真正停止。超时上限是根据CAP寄存器Bit[31:24]的Timeout域来计算出来的,每个单位代表500ms。
#define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)

(图:CAP 寄存器中 TO(Timeout)域的定义)


nvme_wait_ready
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
		u32 timeout, const char *op)
{
	unsigned long timeout_jiffies = jiffies + timeout * HZ;
	u32 csts;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if (csts == ~0)
			return -ENODEV;
		if ((csts & mask) == val)
			break;

		usleep_range(1000, 2000);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout_jiffies)) {
			dev_err(ctrl->device,
				"Device not ready; aborting %s, CSTS=0x%x\n",
				op, csts);
			return -ENODEV;
		}
	}

	return ret;
}
// 同 nvme_disable_ctrl 的过程基本相反
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
	unsigned dev_page_min;
	u32 timeout;
	int ret;

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
	if (ret) {
		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
		return ret;
	}
	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;

	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
		dev_err(ctrl->device,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
		return -ENODEV;
	}

	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
		ctrl->ctrl_config = NVME_CC_CSS_CSI;
	else
		ctrl->ctrl_config = NVME_CC_CSS_NVM;

	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
		u32 crto;

		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
		if (ret) {
			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
				ret);
			return ret;
		}

		if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
			ctrl->ctrl_config |= NVME_CC_CRIME;
			timeout = NVME_CRTO_CRIMT(crto);
		} else {
			timeout = NVME_CRTO_CRWMT(crto);
		}
	} else {
		timeout = NVME_CAP_TIMEOUT(ctrl->cap);
	}

	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	// 这里的ctrl->ops就是之前nvme_probe函数中nvme_init_ctrl时传进去的nvme_pci_ctrl_ops
	// reg_write32先把新的配置写入NVME_REG_CC,随后下面再置位NVME_CC_ENABLE真正使能设备
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	/* Flush write to device (required if transport is PCI) */
	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
	if (ret)
		return ret;

	ctrl->ctrl_config |= NVME_CC_ENABLE;
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
			       (timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);

3. nvme_alloc_queue

设备 disable 之后需要调用 nvme_alloc_queue 分配 NVMe queue

static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid)
		return 0;

	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
	nvmeq->q_depth = depth;
	
	// 1. 调用 dma_alloc_coherent 为 completion queue 分配内存以供 DMA 使用。nvmeq->cqes为申请到的内存的虚拟地址,供内核使用。
	// 而nvmeq->cq_dma_addr就是这块内存的物理地址,供DMA控制器使用。
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	
	// 2. 调用 nvme_alloc_sq_cmds 来处理 submission queue,假如nvme版本是1.2或者以上的,并且cmb支持 submission queue,那就使用 cmb。
	// 否则的话,和 completion queue 一样使用dma_alloc_coherent来分配内存。
	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
		goto free_cqdma;

	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->sq_lock);
	spin_lock_init(&nvmeq->cq_poll_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->qid = qid;
	dev->ctrl.queue_count++;

	return 0;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
 free_nvmeq:
	return -ENOMEM;
}

设备 disable 之后第一次用到 admin queue 时,dev->queues[0] 还没有被初始化(queue_count 为 0),
这时需要调用 nvme_alloc_queue 为其分配 NVMe queue。


4. lo_hi_writeq

nvme_alloc_queue 分配 NVMe queue 后,就要将 nvme admin queue 的属性以及已经分配的admin SQ/CQ 内存地址写入寄存器。

static inline void lo_hi_writeq(__u64 val, volatile void __iomem *addr)
{
	writel(val, addr);
	writel(val >> 32, addr + 4);
}
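
下面用一个可独立编译的小程序演示 lo_hi_writeq 的拆分方式:把 64 位的 DMA 地址拆成低 32 位和高 32 位,分别写到 addr 和 addr + 4(这里用普通内存模拟寄存器,地址取值为假设值,小端机器上读回即原值):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* 模拟 lo_hi_writeq:先写低 32 位,再写高 32 位 */
static void demo_lo_hi_writeq(uint64_t val, uint8_t *addr)
{
	uint32_t lo = (uint32_t)val;
	uint32_t hi = (uint32_t)(val >> 32);

	memcpy(addr, &lo, 4);	  /* 对应 writel(val, addr) */
	memcpy(addr + 4, &hi, 4); /* 对应 writel(val >> 32, addr + 4) */
}

int main(void)
{
	uint8_t regs[8] = {0};			      /* 用普通内存模拟 ASQ 寄存器 */
	uint64_t sq_dma_addr = 0x12345678abcdef00ULL; /* 假设的 admin SQ DMA 地址 */
	uint64_t readback;

	demo_lo_hi_writeq(sq_dma_addr, regs);
	memcpy(&readback, regs, 8);
	printf("readback=0x%llx\n", (unsigned long long)readback);
	return 0;
}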

6. nvme_init_queue
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	nvmeq->sq_tail = 0;
	nvmeq->last_sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}

在这个过程中,对 SQ Tail、CQ Head 以及 CQ Phase 等变量进行初始化赋值,然后通过 q_db 指向 Doorbell 寄存器。
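
结合上面 q_db = &dev->dbs[qid * 2 * dev->db_stride] 的赋值,可以用一个小程序验证 doorbell 偏移的计算(DSTRD、qid 均为假设值):dev->dbs 指向 BAR0 偏移 0x1000 处,qid 对应的 SQ Tail Doorbell 在 dbs[qid * 2 * db_stride],CQ Head Doorbell 再偏移一个 db_stride:

#include <stdio.h>

int main(void)
{
	unsigned dstrd = 0;		  /* 假设 CAP.DSTRD = 0 */
	unsigned db_stride = 1u << dstrd; /* 对应 dev->db_stride,单位是 4 字节(u32) */
	unsigned qid;

	for (qid = 0; qid <= 2; qid++) {
		/* dev->dbs 是 u32 *,起点在 BAR0 + 0x1000 */
		unsigned long sq_tail_off = 0x1000 + 4ul * (qid * 2 * db_stride);
		unsigned long cq_head_off = sq_tail_off + 4ul * db_stride;

		printf("qid=%u SQ Tail DB @ 0x%lx, CQ Head DB @ 0x%lx\n",
		       qid, sq_tail_off, cq_head_off);
	}
	return 0;
}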


7. queue_request_irq
static int queue_request_irq(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
}

调用 queue_request_irq 申请中断。
默认情况下不使用线程化的中断处理,而是使用中断上下文的中断处理。
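
pci_request_irq 底层最终也是走通用的 request_threaded_irq 接口。下面是一个假设性的示意(demo_ 前缀的函数和设备名均为举例),说明 primary handler 跑在中断上下文、thread_fn 跑在内核线程中,thread_fn 传 NULL 时则全部在中断上下文完成:

#include <linux/interrupt.h>

/* 假设的 primary handler:在中断上下文中只做快速检查 */
static irqreturn_t demo_irq_check(int irq, void *data)
{
	/* 需要更多处理时唤醒线程化 handler */
	return IRQ_WAKE_THREAD;
}

/* 假设的线程化 handler:在内核线程中完成可睡眠的耗时处理 */
static irqreturn_t demo_irq_thread(int irq, void *data)
{
	return IRQ_HANDLED;
}

static int demo_request(int irq, void *data)
{
	/* 若把 demo_irq_thread 换成 NULL,则所有处理都留在中断上下文 */
	return request_threaded_irq(irq, demo_irq_check, demo_irq_thread,
				    0, "demo-nvme-irq", data);
}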

线程化与中断上下文的概念:

中断线程化是实现 Linux 实时性的一个重要步骤,在 linux 标准内核中,中断是最高优先级的执行单元,不管内核当时处理什么,只要有中断事件,系统将立即响应该事件并执行相应的中断处理代码,除非当时中断关闭。因此,如果系统有严重的网络或 I/O 负载,中断将非常频繁,后发生的实时任务将很难有机会运行,也就是说,毫无实时性可言。

中断线程化之后,中断将作为内核线程运行而且赋予不同的实时优先级,实时任务可以有比中断线程更高的优先级,这样,实时任务就可以作为最高优先级的执行单元来运行,即使在严重负载下仍有实时性保证。

内核空间和用户空间是操作系统理论的基础之一,即内核功能模块运行在内核空间,而应用程序运行在用户空间。现代的 CPU 都具有不同的操作模式,代表不同的级别,不同的级别具有不同的功能,在较低的级别中将禁止某些操作。

Linux 系统设计时利用了这种硬件特性,使用了两个级别,最高级别和最低级别,内核运行在最高级别(内核态),这个级别可以进行所有操作,而应用程序运行在较低级别(用户态),在这个级别,处理器控制着对硬件的直接访问以及对内存的非授权访问。

内核态和用户态有自己的内存映射,即自己的地址空间。

正是有了不同运行状态的划分,才有了上下文的概念。用户空间的应用程序,如果想要请求系统服务,比如操作一个物理设备,或者映射一段设备空间的地址到用户空间,就必须通过系统调用来(操作系统提供给用户空间的接口函数)实现。通过系统调用,用户空间的应用程序就会进入内核空间,由内核代表该进程运行于内核空间,这就涉及到上下文的切换,用户空间和内核空间具有不同的地址映射,通用或专用的寄存器组。而用户空间的进程要传递很多变量、参数给内核,内核也要保存用户进程的一些寄存器、变量等,以便系统调用结束后回到用户空间继续执行。所谓的进程上下文,就是一个进程在执行的时候,CPU 的所有寄存器中的值、进程的状态以及堆栈中的内容,当内核需要切换到另一个进程时,它需要保存当前进程的所有状态,即保存当前进程的进程上下文,以便再次执行该进程时,能够恢复切换时的状态,继续执行。

同理,硬件通过触发信号,导致内核调用中断处理程序,进入内核空间。这个过程中,硬件的一些变量和参数也要传递给内核,内核通过这些参数进行中断处理,中断上下文就可以理解为硬件传递过来的这些参数和内核需要保存的一些环境,主要是被中断的进程的环境。


nvme_init_ctrl

nvme_pci_alloc_dev 中有调用 nvme_init_ctrl,再看一下其定义,如下:

/*
 * Initialize a NVMe controller structures.  This needs to be called during
 * earliest initialization so that we have the initialized structured around
 * during probing.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
		const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
	int ret;

	ctrl->state = NVME_CTRL_NEW;
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	spin_lock_init(&ctrl->lock);
	mutex_init(&ctrl->scan_lock);
	INIT_LIST_HEAD(&ctrl->namespaces);
	xa_init(&ctrl->cels);
	init_rwsem(&ctrl->namespaces_rwsem);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;
	ctrl->numa_node = NUMA_NO_NODE;
	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
	init_waitqueue_head(&ctrl->state_wq);

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;

	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
			PAGE_SIZE);
	ctrl->discard_page = alloc_page(GFP_KERNEL);
	if (!ctrl->discard_page) {
		ret = -ENOMEM;
		goto out;
	}

	ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
	if (ret < 0)
		goto out;
	ctrl->instance = ret;

	device_initialize(&ctrl->ctrl_device);
	ctrl->device = &ctrl->ctrl_device;
	ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
			ctrl->instance);
	ctrl->device->class = nvme_class;
	ctrl->device->parent = ctrl->dev;
	if (ops->dev_attr_groups)
		ctrl->device->groups = ops->dev_attr_groups;
	else
		ctrl->device->groups = nvme_dev_attr_groups;
	ctrl->device->release = nvme_free_ctrl;
	dev_set_drvdata(ctrl->device, ctrl);
	// 1. set device name with nvme%d
	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
	if (ret)
		goto out_release_instance;

	nvme_get_ctrl(ctrl);
	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ctrl->cdev.owner = ops->module;
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
	if (ret)
		goto out_free_name;

	/*
	 * Initialize latency tolerance controls.  The sysfs files won't
	 * be visible to userspace unless the device actually supports APST.
	 */
	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
		min(default_ps_max_latency_us, (unsigned long)S32_MAX));

	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
	nvme_mpath_init_ctrl(ctrl);
	ret = nvme_auth_init_ctrl(ctrl);
	if (ret)
		goto out_free_cdev;

	return 0;
out_free_cdev:
	cdev_device_del(&ctrl->cdev, ctrl->device);
out_free_name:
	nvme_put_ctrl(ctrl);
	kfree_const(ctrl->device->kobj.name);
out_release_instance:
	ida_free(&nvme_instance_ida, ctrl->instance);
out:
	if (ctrl->discard_page)
		__free_page(ctrl->discard_page);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);

dev_set_name

nvme_init_ctrl 中有调用 dev_set_name 为控制器设备设置形如 nvmeX 的名字(对应的字符设备随后由 cdev_device_add 创建),其定义如下:

/**
 * dev_set_name - set a device name
 * @dev: device
 * @fmt: format string for the device's name
 */
int dev_set_name(struct device *dev, const char *fmt, ...)
{
	va_list vargs;
	int err;

	va_start(vargs, fmt);
	err = kobject_set_name_vargs(&dev->kobj, fmt, vargs);
	va_end(vargs);
	return err;
}
EXPORT_SYMBOL_GPL(dev_set_name);

这个 nvmeX 中的 X 就是前面用 ida_alloc 从 nvme_instance_ida 分配到的唯一索引 ctrl->instance,dev_set_name 再借助 kobject_set_name_vargs 把它格式化进设备名。
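
下面用一个假设性的片段(demo_ 前缀均为举例)示意这两步的配合:先用 ida_alloc 取得唯一的 instance 号,再用 dev_set_name 格式化出 nvme%d 形式的名字:

#include <linux/idr.h>
#include <linux/device.h>

static DEFINE_IDA(demo_instance_ida);

/* 示意:先从 ida 拿到唯一的 instance 号,再格式化成 "nvme%d" 的设备名 */
static int demo_set_ctrl_name(struct device *dev)
{
	int instance = ida_alloc(&demo_instance_ida, GFP_KERNEL);

	if (instance < 0)
		return instance;

	/* dev_set_name 内部调用 kobject_set_name_vargs 完成格式化 */
	return dev_set_name(dev, "nvme%d", instance);
}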


kobject_set_name_vargs

dev_set_name 中有调用 kobject_set_name_vargs,其定义如下:

/**
 * kobject_set_name_vargs() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 * @vargs: vargs to format the string.
 */
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
				  va_list vargs)
{
	const char *s;

	if (kobj->name && !fmt)
		return 0;

	s = kvasprintf_const(GFP_KERNEL, fmt, vargs);
	if (!s)
		return -ENOMEM;

	/*
	 * ewww... some of these buggers have '/' in the name ... If
	 * that's the case, we need to make sure we have an actual
	 * allocated copy to modify, since kvasprintf_const may have
	 * returned something from .rodata.
	 */
	if (strchr(s, '/')) {
		char *t;

		t = kstrdup(s, GFP_KERNEL);
		kfree_const(s);
		if (!t)
			return -ENOMEM;
		strreplace(t, '/', '!');
		s = t;
	}
	kfree_const(kobj->name);
	kobj->name = s;

	return 0;
}


nvme_dev_map

nvme_probe 中使用 nvme_dev_map 来获得 PCI Bar 的虚拟地址,

static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_request_mem_regions(pdev, "nvme"))
		return -ENODEV;
	
	// NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */
	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
		goto release;

	return 0;
  release:
	pci_release_mem_regions(pdev);
	return -ENODEV;
}

pci_request_mem_regions

nvme_dev_map 中有调用 pci_request_mem_regions,其定义如下:

static inline int
pci_request_mem_regions(struct pci_dev *pdev, const char *name)
{
	return pci_request_selected_regions(pdev,
			    pci_select_bars(pdev, IORESOURCE_MEM), name);
}

pci_select_bars

pci_request_mem_regions 中有调用 pci_select_bars,其定义如下:

/**
 * pci_select_bars - Make BAR mask from the type of resource
 * @dev: the PCI device for which BAR mask is made
 * @flags: resource type mask to be selected
 *
 * This helper routine makes bar mask from the type of resource.
 */
int pci_select_bars(struct pci_dev *dev, unsigned long flags)
{
	int i, bars = 0;
	for (i = 0; i < PCI_NUM_RESOURCES; i++)
		if (pci_resource_flags(dev, i) & flags)
			bars |= (1 << i);
	return bars;
}
EXPORT_SYMBOL(pci_select_bars);

调用 pci_select_bars,其返回值为 mask。因为 pci 设备的 header 配置空间中有 6 个 32 位的 BAR 寄存器,所以 mask 中的每一位就代表对应的 BAR 是否属于所选的资源类型:

(图:PCI 配置空间 Header 中的 6 个 BAR 寄存器)
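
下面用一个可独立编译的小程序模拟 pci_select_bars 构造 mask 的过程(各 BAR 的资源类型为假设值):

#include <stdio.h>

#define DEMO_IORESOURCE_IO   0x0100
#define DEMO_IORESOURCE_MEM  0x0200
#define DEMO_NUM_BARS        6

int main(void)
{
	/* 假设的 6 个 BAR 的资源类型:BAR0/BAR2 是 MEM,BAR4 是 IO,其余未实现 */
	unsigned flags[DEMO_NUM_BARS] = {
		DEMO_IORESOURCE_MEM, 0, DEMO_IORESOURCE_MEM, 0, DEMO_IORESOURCE_IO, 0,
	};
	int i, bars = 0;

	for (i = 0; i < DEMO_NUM_BARS; i++)
		if (flags[i] & DEMO_IORESOURCE_MEM)
			bars |= (1 << i);

	printf("mem bar mask = 0x%x\n", bars);	/* 输出 0x5,即 BAR0 和 BAR2 */
	return 0;
}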

pci_request_selected_regions

pci_request_mem_regions 中有调用 pci_request_selected_regions,其定义如下:

/**
 * pci_request_selected_regions - Reserve selected PCI I/O and memory resources
 * @pdev: PCI device whose resources are to be reserved
 * @bars: Bitmask of BARs to be requested
 * @res_name: Name to be associated with resource
 */
int pci_request_selected_regions(struct pci_dev *pdev, int bars,
				 const char *res_name)
{
	return __pci_request_selected_regions(pdev, bars, res_name, 0);
}
EXPORT_SYMBOL(pci_request_selected_regions);

调用 pci_request_selected_regions,它的 bars 参数就是之前 pci_select_bars 返回的 mask 值,作用是把对应的这几个 BAR 资源保留(request)起来,不让其他驱动使用。


nvme_remap_bar

nvme_dev_map 中有调用 nvme_remap_bar,其定义如下:

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	// 将一个IO地址空间映射到内核的虚拟地址空间上去
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

调用 ioremap,在 linux 中我们无法直接访问物理地址,需要映射到虚拟地址,ioremap 就是这个作用。
映射完后,我们访问 dev->bar 就可以直接操作 nvme 设备上的寄存器了。但是代码中,并没有根据 pci_select_bars 的返回值来决定映射哪个 bar,而是直接映射 bar0,原因是 nvme 协议中强制规定了 bar0 就是内存映射的基址。

(图:NVMe 控制器寄存器空间映射在 BAR0)
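
作为示意(demo_ 前缀的函数和 8KB 的映射大小为假设值,VS 寄存器偏移 0x08 来自 NVMe 规范),映射 BAR0 之后就可以用 readl 直接访问控制器寄存器,例如读出版本号:

#include <linux/pci.h>
#include <linux/io.h>

/* 示意:映射 BAR0 的前 8KB 并读出 NVMe VS 寄存器(偏移 0x08) */
static int demo_read_nvme_version(struct pci_dev *pdev)
{
	void __iomem *bar = ioremap(pci_resource_start(pdev, 0), 8192);
	u32 vs;

	if (!bar)
		return -ENOMEM;

	vs = readl(bar + 0x08);		/* NVME_REG_VS */
	dev_info(&pdev->dev, "NVMe version: %u.%u\n",
		 (vs >> 16) & 0xffff, (vs >> 8) & 0xff);

	iounmap(bar);
	return 0;
}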


nvme_setup_prp_pools

nvme_probe 中使用 nvme_setup_prp_pools 来设置 DMA 需要的 PRP 内存池,

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						NVME_CTRL_PAGE_SIZE,
						NVME_CTRL_PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

nvme_setup_prp_pools 主要是创建了两个 dma pool,后面就可以通过其他 dma 函数从 dma pool 中获得 memory 了。

  • prp_page_pool 提供的是块大小为 NVME_CTRL_PAGE_SIZE(4KB)的内存,主要是为了对不一样长度的 prp list 做优化。
  • prp_small_pool 里提供的是块大小为 256 字节的内存;从这两个 pool 中取用/归还内存的配对用法见下面的示意。
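
pool 建好之后,构造 PRP list 时就可以按需从中取内存。下面是一个假设性的片段(demo_ 前缀的函数均为举例),演示 dma_pool_alloc / dma_pool_free 的配对用法:

#include <linux/dmapool.h>
#include <linux/types.h>

/* 示意:从 prp_small_pool 中取一个 256 字节的块来存放较短的 PRP list */
static __le64 *demo_get_prp_list(struct dma_pool *prp_small_pool,
				 dma_addr_t *prp_dma)
{
	/* 返回虚拟地址给内核填表,prp_dma 是给控制器用的 DMA 地址 */
	return dma_pool_alloc(prp_small_pool, GFP_KERNEL, prp_dma);
}

static void demo_put_prp_list(struct dma_pool *prp_small_pool,
			      __le64 *prp_list, dma_addr_t prp_dma)
{
	dma_pool_free(prp_small_pool, prp_list, prp_dma);
}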

nvme_pci_enable

nvme_probe 中使用 nvme_pci_enable

static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int dma_address_bits = 64;
	
	// 1. 使能nvme设备的内存空间iomem,也就是之前映射的bar空间。
	if (pci_enable_device_mem(pdev))
		return result;
	
	// 2. 设置设备具有获得总线的能力,即调用这个函数,使设备具备申请使用PCI总线的能力。
	pci_set_master(pdev);

	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
		dma_address_bits = 48;
	// 3. 设定这个nvme设备的DMA区域大小,64 bits或者48 bits
	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
		goto disable;
	
	// 4. 读取Controller寄存器NVME_REG_CSTS,判断Controller的状态
	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	// 5. 为设备分配中断请求。nvme设备支持三种中断模式:INTx/MSI/MSI-X.
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		goto disable;
	
	// 获取设备64位的Controller Capabilities(CAP)
	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	// 设置Doorbell地址,这里的4096来自SQ Tail DB的起始地址0x1000
	dev->dbs = dev->bar + 4096;

	/*
	 * Some Apple controllers require a non-standard SQE size.
	 * Interestingly they also seem to ignore the CC:IOSQES register
	 * so we don't bother updating it here.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
		dev->io_sqes = 7;
	else
		dev->io_sqes = NVME_NVM_IOSQES;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
                        "set queue depth=%u\n", dev->q_depth);
	}

	/*
	 * Controllers with the shared tags quirk need the IO queue to be
	 * big enough so that we get 32 tags for the admin queue
	 */
	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
		dev->q_depth = NVME_AQ_DEPTH + 2;
		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
			 dev->q_depth);
	}
	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
	
	// 6. 将主机端host的CMB(Controller Memory Buffer) 映射表配置到nvme controller以及CMB size等
	nvme_map_cmb(dev);
	
	// 7. 使能pcie错误报告功能
	pci_enable_pcie_error_reporting(pdev);
	
	// 8. suspend之前保存设备当下的PCI Configuration Space
	pci_save_state(pdev);

	// 9. 构建admin q
	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto free_irq;
	return result;

 free_irq:
	pci_free_irq_vectors(pdev);
 disable:
	pci_disable_device(pdev);
	return result;
}

pci_alloc_irq_vectors

nvme_pci_enable 中有调用 pci_alloc_irq_vectors,其定义如下:

/**
 * pci_alloc_irq_vectors() - Allocate multiple device interrupt vectors
 * @dev:      the PCI device to operate on
 * @min_vecs: minimum required number of vectors (must be >= 1)
 * @max_vecs: maximum desired number of vectors
 * @flags:    One or more of:
 *
 *            * %PCI_IRQ_MSIX      Allow trying MSI-X vector allocations
 *            * %PCI_IRQ_MSI       Allow trying MSI vector allocations
 *
 *            * %PCI_IRQ_LEGACY    Allow trying legacy INTx interrupts, if
 *              and only if @min_vecs == 1
 *
 *            * %PCI_IRQ_AFFINITY  Auto-manage IRQs affinity by spreading
 *              the vectors around available CPUs
 *
 * Allocate up to @max_vecs interrupt vectors on device. MSI-X irq
 * vector allocation has a higher precedence over plain MSI, which has a
 * higher precedence over legacy INTx emulation.
 *
 * Upon a successful allocation, the caller should use pci_irq_vector()
 * to get the Linux IRQ number to be passed to request_threaded_irq().
 * The driver must call pci_free_irq_vectors() on cleanup.
 *
 * Return: number of allocated vectors (which might be smaller than
 * @max_vecs), -ENOSPC if less than @min_vecs interrupt vectors are
 * available, other errnos otherwise.
 */
int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
			  unsigned int max_vecs, unsigned int flags)
{
	return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs,
					      flags, NULL);
}
EXPORT_SYMBOL(pci_alloc_irq_vectors);

pci_alloc_irq_vectors_affinity

/**
 * pci_alloc_irq_vectors_affinity() - Allocate multiple device interrupt
 *                                    vectors with affinity requirements
 * @dev:      the PCI device to operate on
 * @min_vecs: minimum required number of vectors (must be >= 1)
 * @max_vecs: maximum desired number of vectors
 * @flags:    allocation flags, as in pci_alloc_irq_vectors()
 * @affd:     affinity requirements (can be %NULL).
 *
 * Same as pci_alloc_irq_vectors(), but with the extra @affd parameter.
 * Check that function docs, and &struct irq_affinity, for more details.
 */
int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
				   unsigned int max_vecs, unsigned int flags,
				   struct irq_affinity *affd)
{
	struct irq_affinity msi_default_affd = {0};
	int nvecs = -ENOSPC;
	
	// 当IRQ为Affinity自动分配时,IRQ中断会分配给所有CPUs。
	// 在nvme_pci_enable过程中调用时,*affd=NULL
	if (flags & PCI_IRQ_AFFINITY) {
		if (!affd)
			affd = &msi_default_affd;
	} else {
		if (WARN_ON(affd))
			affd = NULL;
	}
	
	// 分配MSI-X中断,配置MSI-X capability structure
	if (flags & PCI_IRQ_MSIX) {
		nvecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
						affd, flags);
		if (nvecs > 0)
			return nvecs;
	}
	
	// 分配MSI中断,配置MSI capability structure
	if (flags & PCI_IRQ_MSI) {
		nvecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, affd);
		if (nvecs > 0)
			return nvecs;
	}

	/* use legacy IRQ if allowed */
	// 分配 INTx 中断
	if (flags & PCI_IRQ_LEGACY) {
		if (min_vecs == 1 && dev->irq) {
			/*
			 * Invoke the affinity spreading logic to ensure that
			 * the device driver can adjust queue configuration
			 * for the single interrupt case.
			 */
			if (affd)
				irq_create_affinity_masks(1, affd);
			pci_intx(dev, 1);
			return 1;
		}
	}

	return nvecs;
}
EXPORT_SYMBOL(pci_alloc_irq_vectors_affinity);

这三种中断不能同时 enable,比如要采用 MSI-X 中断,那就必须把 INTx 和 MSI 中断 disable。


pci_enable_pcie_error_reporting

nvme_pci_enable 中有调用 pci_enable_pcie_error_reporting,其定义如下:

int pci_enable_pcie_error_reporting(struct pci_dev *dev)
{
	int rc;
	
	// 如果 OS 不能以原生(native)方式管理该设备的 AER,
	if (!pcie_aer_is_native(dev))
	 	// EIO, I/O error
		return -EIO;
	
	rc = pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS);
	return pcibios_err_to_errno(rc);
}
EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting);

pcie_aer_is_native

pci_enable_pcie_error_reporting 中有调用 pcie_aer_is_native 来判断 AER 是否由 OS 原生(native)管理,如下:

int pcie_aer_is_native(struct pci_dev *dev)
{
	// 找到设备所在 bus 对应的 pci_host_bridge
	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
	
	// aer_cap, AER capability offset
	if (!dev->aer_cap)
		return 0;
	
	// native_aer=1, OS may use PCIe AER
	return pcie_ports_native || host->native_aer;
}

如果设备没有 AER capability,返回 0;否则返回 pcie_ports_native || host->native_aer,即只要内核强制接管 PCIe 服务或固件把 AER 的控制权交给了 OS,就认为 AER 处于原生(native)管理状态。


pcie_capability_set_word

pci_enable_pcie_error_reporting 中有调用 pcie_capability_set_word,在 PCI_EXP_DEVCTL 寄存器中置位 PCI_EXP_AER_FLAGS;它实际是对下面 pcie_capability_clear_and_set_word(clear 为 0)的封装:

int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos,
				       u16 clear, u16 set)
{
	int ret;
	u16 val;
	
	// 先读出 pcie capability 对应位置的值,清掉 clear 指定的位、置上 set 指定的位,再用 pcie_capability_write_word 写回。
	ret = pcie_capability_read_word(dev, pos, &val);
	if (!ret) {
		val &= ~clear;
		val |= set;
		ret = pcie_capability_write_word(dev, pos, val);
	}

	return ret;
}
EXPORT_SYMBOL(pcie_capability_clear_and_set_word);

pcibios_err_to_errno

pci_enable_pcie_error_reporting 中有调用 pcibios_err_to_errno,将 PCI 函数可能返回的 PCIBIOS_* 错误码 “翻译” 为通用的 errno(相应定义见 include/uapi/asm-generic/errno-base.h),如下:

/* Error values that may be returned by PCI functions */
#define PCIBIOS_SUCCESSFUL			0x00
#define PCIBIOS_FUNC_NOT_SUPPORTED	0x81
#define PCIBIOS_BAD_VENDOR_ID		0x83
#define PCIBIOS_DEVICE_NOT_FOUND	0x86
#define PCIBIOS_BAD_REGISTER_NUMBER	0x87
#define PCIBIOS_SET_FAILED			0x88
#define PCIBIOS_BUFFER_TOO_SMALL	0x89

/* Translate above to generic errno for passing back through non-PCI code */
static inline int pcibios_err_to_errno(int err)
{
	if (err <= PCIBIOS_SUCCESSFUL)
		return err; /* Assume already errno */

	switch (err) {
	case PCIBIOS_FUNC_NOT_SUPPORTED:
		return -ENOENT;
	case PCIBIOS_BAD_VENDOR_ID:
		return -ENOTTY;
	case PCIBIOS_DEVICE_NOT_FOUND:
		return -ENODEV;
	case PCIBIOS_BAD_REGISTER_NUMBER:
		return -EFAULT;
	case PCIBIOS_SET_FAILED:
		return -EIO;
	case PCIBIOS_BUFFER_TOO_SMALL:
		return -ENOSPC;
	}

	return -ERANGE;
}

nvme_pci_configure_admin_queue

nvme_pci_enable 中有调用 nvme_pci_configure_admin_queue,其定义如下:

static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;
	
	// 1. 将bar空间映射到内核的虚拟地址空间上去
	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;
	
	// 2. 从CAP寄存器中读取NSSRC判断对Subsystem Reset的支持情况
	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
	
	// 若发生过 NVM Subsystem Reset,向 CSTS 寄存器写 NVME_CSTS_NSSRO 将该标志清除
	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	/*
	 * If the device has been passed off to us in an enabled state, just
	 * clear the enabled bit.  The spec says we should set the 'shutdown
	 * notification bits', but doing so may cause the device to complete
	 * commands to the admin queue ... and we don't know what memory that
	 * might be pointing at!
	 */
	// 3. 调用nvme_disable_ctrl
	result = nvme_disable_ctrl(&dev->ctrl, false);
	if (result < 0)
		return result;
	
	// 4. 调用nvme_alloc_queue
	// 设备disable之后第一次调用nvmeq, 此时值为Null。这时需要调用nvme_alloc_queue分配NVMe queue.
	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
	if (result)
		return result;
	
	// 获得这个dev所对应的numa节点
	dev->ctrl.numa_node = dev_to_node(dev->dev);

	nvmeq = &dev->queues[0];
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	// 5. nvme_alloc_queue分配NVMe queue后,就要将nvme admin queue的属性以及已经分配的admin SQ/CQ内存地址写入寄存器。
	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	// 6. 对admin queue分配内存之后,调用nvme_enable_ctrl将设备enable。这个函数与nvme_disable_ctrl函数类似,只是过程相反。
	result = nvme_enable_ctrl(&dev->ctrl);
	if (result)
		return result;
	
	// 7. 对之前申请的queue进行初始化操作
	nvmeq->cq_vector = 0;
	nvme_init_queue(nvmeq, 0);
	
	// 8. 调用queue_request_irq申请中断。这个函数主要的工作是设置中断处理函数,默认情况下不使用线程化的中断处理,而是使用中断上下文的中断处理。
	result = queue_request_irq(nvmeq);
	if (result) {
		dev->online_queues--;
		return result;
	}
	
	// 使能 NVMe Queue(置位 NVMEQ_ENABLED 标志)
	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;
}

从上面的代码,我们可以了解到 nvme_pci_configure_admin_queue 中大致的步骤如下:

    1. 从 CAP 寄存器中判断对 Subsystem Reset 的支持情况;
    2. 调用 nvme_disable_ctrl
    3. 调用 nvme_alloc_queue
    4. 调用 lo_hi_writeq
    5. 调用 nvme_enable_ctrl
    6. 调用 nvme_init_queue
    7. 调用 queue_request_irq

对于上面的步骤,挨个逐步分析:


1. 读取 NVM Subsystem Reset 寄存器

Controller 的 CAP 寄存器 bit[36] 定义了对 NVM subsystem reset 是否支持,如下图:

(图:CAP 寄存器中 NSSRS(bit 36)的定义)
一般情况下,NVM subsystem 就是一块 SSD 了,由 Controller、NAND 以及接口组成一个 NVM subsystem。
NVM Subsystem Reset 是 Controller Level Reset 的一种。


2&5. nvme_disable_ctrl / nvme_enable_ctrl

在对 NVMe controller 进行操作时需要通过 nvme_disable_ctrl 将设备 disable,完成后再调用 nvme_enable_ctrl 将设备 enable

int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	if (shutdown)
		ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
	else
		ctrl->ctrl_config &= ~NVME_CC_ENABLE;
	
	// 这里的ctrl->ops就是之前nvme_probe函数中nvme_init_ctrl时传进去的nvme_pci_ctrl_ops
	// reg_write32通过NVME_REG_CC寄存器disable设备。
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	if (shutdown) {
		return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
				       NVME_CSTS_SHST_CMPLT,
				       ctrl->shutdown_timeout, "shutdown");
	}
	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
		msleep(NVME_QUIRK_DELAY_AMOUNT);
	// 在函数最后,通过读取状态寄存器NVME_REG_CSTS来等待设备真正停止。
	// 超时上限是根据CAP寄存器Bit[31:24]的Timeout域来计算出来的,每个单位代表500ms。
	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
			       (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
// 通过读取状态寄存器NVME_REG_CSTS来等待设备真正停止。超时上限是根据CAP寄存器Bit[31:24]的Timeout域来计算出来的,每个单位代表500ms。
#define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)

(图:CAP 寄存器中 TO(Timeout)域的定义)


nvme_wait_ready
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
		u32 timeout, const char *op)
{
	unsigned long timeout_jiffies = jiffies + timeout * HZ;
	u32 csts;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if (csts == ~0)
			return -ENODEV;
		if ((csts & mask) == val)
			break;

		usleep_range(1000, 2000);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout_jiffies)) {
			dev_err(ctrl->device,
				"Device not ready; aborting %s, CSTS=0x%x\n",
				op, csts);
			return -ENODEV;
		}
	}

	return ret;
}
// 同 nvme_disable_ctrl 的过程基本相反
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
	unsigned dev_page_min;
	u32 timeout;
	int ret;

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
	if (ret) {
		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
		return ret;
	}
	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;

	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
		dev_err(ctrl->device,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
		return -ENODEV;
	}

	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
		ctrl->ctrl_config = NVME_CC_CSS_CSI;
	else
		ctrl->ctrl_config = NVME_CC_CSS_NVM;

	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
		u32 crto;

		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
		if (ret) {
			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
				ret);
			return ret;
		}

		if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
			ctrl->ctrl_config |= NVME_CC_CRIME;
			timeout = NVME_CRTO_CRIMT(crto);
		} else {
			timeout = NVME_CRTO_CRWMT(crto);
		}
	} else {
		timeout = NVME_CAP_TIMEOUT(ctrl->cap);
	}

	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	// 这里的ctrl->ops就是之前nvme_probe函数中nvme_init_ctrl时传进去的nvme_pci_ctrl_ops
	// reg_write32先把新的配置写入NVME_REG_CC,随后下面再置位NVME_CC_ENABLE真正使能设备
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	/* Flush write to device (required if transport is PCI) */
	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
	if (ret)
		return ret;

	ctrl->ctrl_config |= NVME_CC_ENABLE;
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
			       (timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);

3. nvme_alloc_queue

设备 disable 之后需要调用 nvme_alloc_queue 分配 NVMe queue

static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid)
		return 0;

	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
	nvmeq->q_depth = depth;
	
	// 1. 调用 dma_alloc_coherent 为 completion queue 分配内存以供 DMA 使用。nvmeq->cqes为申请到的内存的虚拟地址,供内核使用。
	// 而nvmeq->cq_dma_addr就是这块内存的物理地址,供DMA控制器使用。
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	
	// 2. 调用 nvme_alloc_sq_cmds 来处理 submission queue,假如nvme版本是1.2或者以上的,并且cmb支持 submission queue,那就使用 cmb。
	// 否则的话,和 completion queue 一样使用dma_alloc_coherent来分配内存。
	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
		goto free_cqdma;

	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->sq_lock);
	spin_lock_init(&nvmeq->cq_poll_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->qid = qid;
	dev->ctrl.queue_count++;

	return 0;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
 free_nvmeq:
	return -ENOMEM;
}

设备 disable 之后第一次用到 admin queue 时,dev->queues[0] 还没有被初始化(queue_count 为 0),
这时需要调用 nvme_alloc_queue 为其分配 NVMe queue。


4. lo_hi_writeq

nvme_alloc_queue 分配 NVMe queue 后,就要将 nvme admin queue 的属性以及已经分配的admin SQ/CQ 内存地址写入寄存器。

static inline void lo_hi_writeq(__u64 val, volatile void __iomem *addr)
{
	writel(val, addr);
	writel(val >> 32, addr + 4);
}

6. nvme_init_queue
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	nvmeq->sq_tail = 0;
	nvmeq->last_sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}

在这个过程中,对 SQ Tail、CQ Head 以及 CQ Phase 等变量进行初始化赋值,然后通过 q_db 指向 Doorbell 寄存器。


7. queue_request_irq
static int queue_request_irq(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
}

调用 queue_request_irq 申请中断。
默认情况下不使用线程化的中断处理,而是使用中断上下文的中断处理。

线程化与中断上下文的概念:

中断线程化是实现 Linux 实时性的一个重要步骤,在 linux 标准内核中,中断是最高优先级的执行单元,不管内核当时处理什么,只要有中断事件,系统将立即响应该事件并执行相应的中断处理代码,除非当时中断关闭。因此,如果系统有严重的网络或 I/O 负载,中断将非常频繁,后发生的实时任务将很难有机会运行,也就是说,毫无实时性可言。

中断线程化之后,中断将作为内核线程运行而且赋予不同的实时优先级,实时任务可以有比中断线程更高的优先级,这样,实时任务就可以作为最高优先级的执行单元来运行,即使在严重负载下仍有实时性保证。

内核空间和用户空间是操作系统理论的基础之一,即内核功能模块运行在内核空间,而应用程序运行在用户空间。现代的 CPU 都具有不同的操作模式,代表不同的级别,不同的级别具有不同的功能,在较低的级别中将禁止某些操作。

Linux 系统设计时利用了这种硬件特性,使用了两个级别,最高级别和最低级别,内核运行在最高级别(内核态),这个级别可以进行所有操作,而应用程序运行在较低级别(用户态),在这个级别,处理器控制着对硬件的直接访问以及对内存的非授权访问。

内核态和用户态有自己的内存映射,即自己的地址空间。

正是有了不同运行状态的划分,才有了上下文的概念。用户空间的应用程序,如果想要请求系统服务,比如操作一个物理设备,或者映射一段设备空间的地址到用户空间,就必须通过系统调用来(操作系统提供给用户空间的接口函数)实现。通过系统调用,用户空间的应用程序就会进入内核空间,由内核代表该进程运行于内核空间,这就涉及到上下文的切换,用户空间和内核空间具有不同的地址映射,通用或专用的寄存器组。而用户空间的进程要传递很多变量、参数给内核,内核也要保存用户进程的一些寄存器、变量等,以便系统调用结束后回到用户空间继续执行。所谓的进程上下文,就是一个进程在执行的时候,CPU 的所有寄存器中的值、进程的状态以及堆栈中的内容,当内核需要切换到另一个进程时,它需要保存当前进程的所有状态,即保存当前进程的进程上下文,以便再次执行该进程时,能够恢复切换时的状态,继续执行。

同理,硬件通过触发信号,导致内核调用中断处理程序,进入内核空间。这个过程中,硬件的一些变量和参数也要传递给内核,内核通过这些参数进行中断处理,中断上下文就可以理解为硬件传递过来的这些参数和内核需要保存的一些环境,主要是被中断的进程的环境。


nvme_alloc_admin_tag_set

nvme_probe 中把 nvme_mq_admin_ops 作为 ops 参数传给 nvme_alloc_admin_tag_set 进行初始化,先看 nvme_mq_admin_ops 的定义:

static const struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.init_hctx	= nvme_admin_init_hctx,
	.init_request	= nvme_pci_init_request,
	.timeout	= nvme_timeout,
};

再回归到 nvme_alloc_admin_tag_set,如下:

int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int cmd_size)
{
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
	if (ctrl->ops->flags & NVME_F_FABRICS)
		set->reserved_tags = NVMF_RESERVED_TAGS;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_NO_SCHED;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
		set->flags |= BLK_MQ_F_BLOCKING;
	set->cmd_size = cmd_size;
	set->driver_data = ctrl;
	set->nr_hw_queues = 1;
	set->timeout = NVME_ADMIN_TIMEOUT;

	// 1. 分配tag set并与request queue关联
	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;
	
	// 2. 对hardware queue和software queues进行初始化,并配置两者之间的mapping关系,最后将返回值传递给dev->ctrl.admin_q。
	ctrl->admin_q = blk_mq_init_queue(set);
	if (IS_ERR(ctrl->admin_q)) {
		ret = PTR_ERR(ctrl->admin_q);
		goto out_free_tagset;
	}

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		ctrl->fabrics_q = blk_mq_init_queue(set);
		if (IS_ERR(ctrl->fabrics_q)) {
			ret = PTR_ERR(ctrl->fabrics_q);
			goto out_cleanup_admin_q;
		}
	}

	ctrl->admin_tagset = set;
	return 0;

out_cleanup_admin_q:
	blk_mq_destroy_queue(ctrl->admin_q);
	blk_put_queue(ctrl->admin_q);
out_free_tagset:
	blk_mq_free_tag_set(set);
	ctrl->admin_q = NULL;
	ctrl->fabrics_q = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);

这个函数是 NVMe 设备采用 Multi-Queue(MQ)机制的核心函数之一,所以在展开解析这个函数之前,我们先聊聊 Linux Multi-Queue Block Layer。

多队列、原生异步、无锁是 NVMe 的最大特色,这些为高性能而生的设计迫使 Linux Kernel 在 3.19 抛弃了老的单队列 Block Layer 而转向 Multi-Queue Block Layer。

这个 Multi-Queue Block Layer 的架构直接对应于 NVMe 的多队列设计,如下图:

(图:NVMe 多队列设计与 Multi-Queue Block Layer 的对应关系)

所谓的 Multi-Queue 机制,就是在多核 CPU 的情况下,将不同的 block 层提交队列分配到不同的 CPU 核上,以更好地平衡 IO 的工作负载,大幅提高 SSD 等存储设备的 IO 效率。

Multi-Queue Block Layer 长啥样子呢?如下图所示:

驱动 | Linux | NVMe | 2. nvme_probe_第8张图片
The Multi-Queue Block Layer is split into two layers: Software Queues and Hardware Dispatch Queues.

Software Queues are per-core. How many queues there can be depends on the protocol; NVMe, for example, allows up to 64K I/O SQ/CQ pairs. The responsibilities of the Software Queues layer are the ones labeled in the figure above.

The number of Hardware Queues is decided by the low-level device driver and can be one or more. The maximum is normally the same as the maximum number of MSI-X interrupts, up to 2K. The driver maintains the mapping between Software Queues and Hardware Queues through its map_queue callback.

One point worth stressing: the numbers of Hardware Queues and Software Queues are not necessarily equal; the 1:1 mapping in the figure above is the ideal case. A minimal sketch of such a mapping follows.
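
To make the mapping concrete, here is a minimal userspace sketch (not kernel code) that spreads one software context per CPU over a smaller set of hardware queues with a simple modulo rule; the kernel's real helpers (e.g. blk_mq_map_queues) additionally try to keep topologically close CPUs on the same hardware queue.

#include <stdio.h>

/*
 * Toy software-queue -> hardware-queue map: every CPU owns a software
 * context, and contexts are spread round-robin over the hardware queues.
 */
static void build_mq_map(unsigned int *mq_map, unsigned int nr_cpus,
			 unsigned int nr_hw_queues)
{
	for (unsigned int cpu = 0; cpu < nr_cpus; cpu++)
		mq_map[cpu] = cpu % nr_hw_queues;
}

int main(void)
{
	unsigned int map[8];

	build_mq_map(map, 8, 3);	/* 8 CPUs, 3 hardware queues */
	for (unsigned int cpu = 0; cpu < 8; cpu++)
		printf("cpu%u -> hctx%u\n", cpu, map[cpu]);
	return 0;
}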

That wraps up the quick tour of the Multi-Queue Block Layer; now back to nvme_alloc_admin_tag_set.


blk_mq_alloc_tag_set

nvme_alloc_admin_tag_set calls blk_mq_alloc_tag_set:

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it's too large. In that case, the set
 * value will be stored in set->queue_depth.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
	int i, ret;

	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

	if (!set->nr_hw_queues)
		return -EINVAL;
	if (!set->queue_depth)
		return -EINVAL;
	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
		return -EINVAL;

	if (!set->ops->queue_rq)
		return -EINVAL;

	if (!set->ops->get_budget ^ !set->ops->put_budget)
		return -EINVAL;

	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_info("blk-mq: reduced tag depth to %u\n",
			BLK_MQ_MAX_DEPTH);
		set->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	if (!set->nr_maps)
		set->nr_maps = 1;
	else if (set->nr_maps > HCTX_MAX_TYPES)
		return -EINVAL;

	/*
	 * If a crashdump is active, then we are potentially in a very
	 * memory constrained environment. Limit us to 1 queue and
	 * 64 tags to prevent using too much memory.
	 */
	if (is_kdump_kernel()) {
		set->nr_hw_queues = 1;
		set->nr_maps = 1;
		set->queue_depth = min(64U, set->queue_depth);
	}
	/*
	 * There is no use for more h/w queues than cpus if we just have
	 * a single map
	 */
	if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
		set->nr_hw_queues = nr_cpu_ids;

	if (set->flags & BLK_MQ_F_BLOCKING) {
		set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
		if (!set->srcu)
			return -ENOMEM;
		ret = init_srcu_struct(set->srcu);
		if (ret)
			goto out_free_srcu;
	}

	ret = -ENOMEM;
	set->tags = kcalloc_node(set->nr_hw_queues,
				 sizeof(struct blk_mq_tags *), GFP_KERNEL,
				 set->numa_node);
	if (!set->tags)
		goto out_cleanup_srcu;

	for (i = 0; i < set->nr_maps; i++) {
		set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
						  sizeof(set->map[i].mq_map[0]),
						  GFP_KERNEL, set->numa_node);
		if (!set->map[i].mq_map)
			goto out_free_mq_map;
		set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
	}

	blk_mq_update_queue_map(set);

	ret = blk_mq_alloc_set_map_and_rqs(set);
	if (ret)
		goto out_free_mq_map;

	mutex_init(&set->tag_list_lock);
	INIT_LIST_HEAD(&set->tag_list);

	return 0;

out_free_mq_map:
	for (i = 0; i < set->nr_maps; i++) {
		kfree(set->map[i].mq_map);
		set->map[i].mq_map = NULL;
	}
	kfree(set->tags);
	set->tags = NULL;
out_cleanup_srcu:
	if (set->flags & BLK_MQ_F_BLOCKING)
		cleanup_srcu_struct(set->srcu);
out_free_srcu:
	if (set->flags & BLK_MQ_F_BLOCKING)
		kfree(set->srcu);
	return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

blk_mq_init_queue

nvme_alloc_admin_tag_set then calls blk_mq_init_queue:

static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
		void *queuedata)
{
	struct request_queue *q;
	int ret;

	q = blk_alloc_queue(set->numa_node);
	if (!q)
		return ERR_PTR(-ENOMEM);
	q->queuedata = queuedata;
	ret = blk_mq_init_allocated_queue(set, q);
	if (ret) {
		blk_put_queue(q);
		return ERR_PTR(ret);
	}
	return q;
}

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
	return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q)
{
	/* mark the queue as mq asap */
	q->mq_ops = set->ops;

	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
					     blk_mq_poll_stats_bkt,
					     BLK_MQ_POLL_STATS_BKTS, q);
	if (!q->poll_cb)
		goto err_exit;

	if (blk_mq_alloc_ctxs(q))
		goto err_poll;

	/* init q->mq_kobj and sw queues' kobjects */
	blk_mq_sysfs_init(q);

	INIT_LIST_HEAD(&q->unused_hctx_list);
	spin_lock_init(&q->unused_hctx_lock);

	xa_init(&q->hctx_table);

	blk_mq_realloc_hw_ctxs(set, q);
	if (!q->nr_hw_queues)
		goto err_hctxs;

	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

	q->tag_set = set;

	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
	blk_mq_update_poll_flag(q);

	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
	INIT_LIST_HEAD(&q->requeue_list);
	spin_lock_init(&q->requeue_lock);

	q->nr_requests = set->queue_depth;

	/*
	 * Default to classic polling
	 */
	q->poll_nsec = BLK_MQ_POLL_CLASSIC;

	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
	blk_mq_add_queue_tag_set(set, q);
	blk_mq_map_swqueue(q);
	return 0;

err_hctxs:
	blk_mq_release(q);
err_poll:
	blk_stat_free_callback(q->poll_cb);
	q->poll_cb = NULL;
err_exit:
	q->mq_ops = NULL;
	return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
						struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i, j;

	/* protect against switching io scheduler  */
	mutex_lock(&q->sysfs_lock);
	for (i = 0; i < set->nr_hw_queues; i++) {
		int old_node;
		int node = blk_mq_get_hctx_node(set, i);
		struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);

		if (old_hctx) {
			old_node = old_hctx->numa_node;
			blk_mq_exit_hctx(q, set, old_hctx, i);
		}

		if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
			if (!old_hctx)
				break;
			pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
					node, old_node);
			hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
			WARN_ON_ONCE(!hctx);
		}
	}
	/*
	 * Increasing nr_hw_queues fails. Free the newly allocated
	 * hctxs and keep the previous q->nr_hw_queues.
	 */
	if (i != set->nr_hw_queues) {
		j = q->nr_hw_queues;
	} else {
		j = i;
		q->nr_hw_queues = set->nr_hw_queues;
	}

	xa_for_each_start(&q->hctx_table, j, hctx, j)
		blk_mq_exit_hctx(q, set, hctx, j);
	mutex_unlock(&q->sysfs_lock);
}

nvme_init_ctrl_finish

nvme_probe calls nvme_init_ctrl_finish to finish initializing the NVMe controller structure:

/*
 * Initialize the cached copies of the Identify data and various controller
 * register in our nvme_ctrl structure.  This should be called as soon as
 * the admin queue is fully up and running.
 */
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
{
	int ret;
	
	// Read the controller's NVMe version (VS register)
	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
	if (ret) {
		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
		return ret;
	}

	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
	
	// NVMe 1.1 and later support NVM Subsystem Reset
	if (ctrl->vs >= NVME_VS(1, 1, 0))
		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
	
	// 1. Read the Identify data
	ret = nvme_init_identify(ctrl);
	if (ret)
		return ret;

	ret = nvme_configure_apst(ctrl);
	if (ret < 0)
		return ret;

	ret = nvme_configure_timestamp(ctrl);
	if (ret < 0)
		return ret;

	ret = nvme_configure_host_options(ctrl);
	if (ret < 0)
		return ret;

	nvme_configure_opal(ctrl, was_suspended);

	if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
		/*
		 * Do not return errors unless we are in a controller reset,
		 * the controller works perfectly fine without hwmon.
		 */
		ret = nvme_hwmon_init(ctrl);
		if (ret == -EINTR)
			return ret;
	}

	ctrl->identified = true;

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);

nvme_init_identify

nvme_init_ctrl_finish calls nvme_init_identify:

static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u32 max_hw_sectors;
	bool prev_apst_enabled;
	int ret;
    
    // 1. Call nvme_identify_ctrl to read the Identify data
	ret = nvme_identify_ctrl(ctrl, &id);
	if (ret) {
		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
		return -EIO;
	}

	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
		if (ret < 0)
			goto out_free;
	}

	if (!(ctrl->ops->flags & NVME_F_FABRICS))
		ctrl->cntlid = le16_to_cpu(id->cntlid);

	if (!ctrl->identified) {
		unsigned int i;

		/*
		 * Check for quirks.  Quirk can depend on firmware version,
		 * so, in principle, the set of quirks present can change
		 * across a reset.  As a possible future enhancement, we
		 * could re-scan for quirks every time we reinitialize
		 * the device, but we'd have to make sure that the driver
		 * behaves intelligently if the quirks change.
		 */
		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
			if (quirk_matches(id, &core_quirks[i]))
				ctrl->quirks |= core_quirks[i].quirks;
		}

		ret = nvme_init_subsystem(ctrl, id);
		if (ret)
			goto out_free;
	}
	memcpy(ctrl->subsys->firmware_rev, id->fr,
	       sizeof(ctrl->subsys->firmware_rev));

	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
	}

	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
	ctrl->crdt[2] = le16_to_cpu(id->crdt3);

	ctrl->oacs = le16_to_cpu(id->oacs);
	ctrl->oncs = le16_to_cpu(id->oncs);
	ctrl->mtfa = le16_to_cpu(id->mtfa);
	ctrl->oaes = le32_to_cpu(id->oaes);
	ctrl->wctemp = le16_to_cpu(id->wctemp);
	ctrl->cctemp = le16_to_cpu(id->cctemp);

	atomic_set(&ctrl->abort_limit, id->acl + 1);
	ctrl->vwc = id->vwc;
	if (id->mdts)
		max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
	else
		max_hw_sectors = UINT_MAX;
	ctrl->max_hw_sectors =
		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
	
	// 2. Apply the controller's request-queue limits (e.g. max_hw_sectors) to the admin queue via nvme_set_queue_limits
	nvme_set_queue_limits(ctrl, ctrl->admin_q);
	ctrl->sgls = le32_to_cpu(id->sgls);
	ctrl->kas = le16_to_cpu(id->kas);
	ctrl->max_namespaces = le32_to_cpu(id->mnan);
	ctrl->ctratt = le32_to_cpu(id->ctratt);

	ctrl->cntrltype = id->cntrltype;
	ctrl->dctype = id->dctype;

	if (id->rtd3e) {
		/* us -> s */
		u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;

		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
						 shutdown_timeout, 60);

		if (ctrl->shutdown_timeout != shutdown_timeout)
			dev_info(ctrl->device,
				 "Shutdown timeout set to %u seconds\n",
				 ctrl->shutdown_timeout);
	} else
		ctrl->shutdown_timeout = shutdown_timeout;
	
	// 3. Initialize the APST (Autonomous Power State Transition) configuration
	ctrl->npss = id->npss;
	ctrl->apsta = id->apsta;
	prev_apst_enabled = ctrl->apst_enabled;
	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
		if (force_apst && id->apsta) {
			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
			ctrl->apst_enabled = true;
		} else {
			ctrl->apst_enabled = false;
		}
	} else {
		ctrl->apst_enabled = id->apsta;
	}
	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		ctrl->icdoff = le16_to_cpu(id->icdoff);
		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
		ctrl->maxcmd = le16_to_cpu(id->maxcmd);

		/*
		 * In fabrics we need to verify the cntlid matches the
		 * admin connect
		 */
		if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
			dev_err(ctrl->device,
				"Mismatching cntlid: Connect %u vs Identify "
				"%u, rejecting\n",
				ctrl->cntlid, le16_to_cpu(id->cntlid));
			ret = -EINVAL;
			goto out_free;
		}

		if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
			dev_err(ctrl->device,
				"keep-alive support is mandatory for fabrics\n");
			ret = -EINVAL;
			goto out_free;
		}
	} else {
		ctrl->hmpre = le32_to_cpu(id->hmpre);
		ctrl->hmmin = le32_to_cpu(id->hmmin);
		ctrl->hmminds = le32_to_cpu(id->hmminds);
		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
	}
	
	// 4. For multi-port devices, initialize multipath support
	ret = nvme_mpath_init_identify(ctrl, id);
	if (ret < 0)
		goto out_free;
	
	if (ctrl->apst_enabled && !prev_apst_enabled)
		dev_pm_qos_expose_latency_tolerance(ctrl->device);
	else if (!ctrl->apst_enabled && prev_apst_enabled)
		dev_pm_qos_hide_latency_tolerance(ctrl->device);

out_free:
	kfree(id);
	return ret;
}

Summing up, the function does the following:

  1. calls nvme_identify_ctrl to read the Identify data;
  2. calls nvme_set_queue_limits to apply the controller's limits to the admin request queue;
  3. initializes the APST configuration;
  4. for multi-port devices, initializes multipath support.
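
One conversion in the code above deserves a concrete example: MDTS from the Identify data caps the maximum data transfer size in units of the controller's minimum memory page size (CAP.MPSMIN), with 0 meaning no limit. A minimal userspace sketch of that arithmetic, with made-up field values:

#include <stdio.h>
#include <stdint.h>

/* Max transfer size in 512-byte sectors, following the spec's definition:
 * bytes = (1 << MDTS) * (1 << (12 + CAP.MPSMIN)); MDTS == 0 means unlimited. */
static uint32_t mdts_to_sectors(uint8_t mdts, uint8_t mpsmin)
{
	unsigned int page_shift = 12 + mpsmin;	/* minimum page size = 1 << page_shift bytes */

	if (!mdts)
		return UINT32_MAX;		/* no limit reported */
	return 1u << (mdts + page_shift - 9);	/* convert bytes to 512-byte sectors */
}

int main(void)
{
	/* e.g. MDTS = 5, MPSMIN = 0 (4 KiB pages) -> 128 KiB -> 256 sectors */
	printf("max transfer: %u sectors\n", mdts_to_sectors(5, 0));
	return 0;
}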

nvme_identify_ctrl

static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;
	
	// 1. Build the Identify command
	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = NVME_ID_CNS_CTRL;

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;
	
	// 2. Submit the Identify command on the admin queue
	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}

c.identify

nvme_identify_ctrl first builds an Identify command (opcode 0x06), whose layout is:

struct nvme_identify {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__u64			rsvd2[2];
	union nvme_data_ptr	dptr;
	__u8			cns;
	__u8			rsvd3;
	__le16			ctrlid;
	__u8			rsvd11[3];
	__u8			csi;
	__u32			rsvd12[4];
};

The opcode is defined in the NVMe specification as follows:

[Figure: admin command opcodes table from the NVMe specification (Identify = 0x06)]
And its definition in the source:

enum nvme_admin_opcode {
	...
	nvme_admin_identify		= 0x06,
	...

Whether you look at the NVMe specification or at the source, the opcode of the Identify command is the same.

Once an Identify command completes it returns a 4 KB Identify Data Structure. That data structure can describe either the controller or a namespace; which one depends on the CNS (Controller or Namespace Structure) byte.

A quick reminder of the CNS values defined in the NVMe spec:

  • CNS = 0x00: the Identify Namespace data structure
  • CNS = 0x01: the Identify Controller data structure
  • CNS = 0x02: the active Namespace ID list

And NVME_ID_CNS_CTRL in the source:

enum {
	...
	NVME_ID_CNS_CTRL		= 0x01,
  	...
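
Putting the pieces together, here is a small userspace sketch that fills in the same 64-byte Identify Controller command nvme_identify_ctrl builds; the types and constants are re-declared locally for illustration rather than taken from the kernel headers.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Local stand-ins for the kernel definitions shown above. */
#define NVME_ADMIN_IDENTIFY	0x06
#define NVME_ID_CNS_CTRL	0x01

struct identify_cmd {			/* 64-byte submission queue entry (simplified) */
	uint8_t  opcode;
	uint8_t  flags;
	uint16_t command_id;
	uint32_t nsid;
	uint64_t rsvd2[2];
	uint64_t dptr[2];		/* PRP1/PRP2 or SGL, filled in by the block layer */
	uint8_t  cns;
	uint8_t  rsvd3;
	uint16_t ctrlid;
	uint8_t  rsvd11[3];
	uint8_t  csi;
	uint32_t rsvd12[4];
};

int main(void)
{
	struct identify_cmd c;

	memset(&c, 0, sizeof(c));
	c.opcode = NVME_ADMIN_IDENTIFY;	/* Identify */
	c.cns = NVME_ID_CNS_CTRL;	/* ask for the Controller data structure */

	printf("sqe size = %zu bytes, opcode = 0x%02x, cns = 0x%02x\n",
	       sizeof(c), c.opcode, c.cns);
	return 0;
}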

__nvme_submit_sync_cmd

nvme_identify_ctrl has built the Identify command; how does the driver actually submit this admin command?

The submission goes through nvme_submit_sync_cmd, which ultimately lands in __nvme_submit_sync_cmd:

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		union nvme_result *result, void *buffer, unsigned bufflen,
		int qid, int at_head, blk_mq_req_flags_t flags)
{
	struct request *req;
	int ret;
	
	// 1. Allocate a request from the queue and initialize it
	if (qid == NVME_QID_ANY)
		req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
	else
		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
						qid - 1);

	if (IS_ERR(req))
		return PTR_ERR(req);
	nvme_init_request(req, cmd);
	
	// 2. blk_rq_map_kern attaches a bio to the request and maps the kernel buffer into it
	if (buffer && bufflen) {
		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
		if (ret)
			goto out;
	}
	
	// 3. nvme_execute_rq (via blk_execute_rq) finally issues the command
	ret = nvme_execute_rq(req, at_head);
	if (result && ret >= 0)
		*result = nvme_req(req)->result;
 out:
	blk_mq_free_request(req);
	return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);

1. blk_mq_alloc_request

__nvme_submit_sync_cmd calls blk_mq_alloc_request (or blk_mq_alloc_request_hctx when a specific hardware context is requested) to allocate a request. The operation flags derived from cmd, here the Identify command, are handed down through blk_mq_alloc_cached_request and blk_mq_rq_cache_fill, and nvme_init_request then associates the command with the request.
The allocation and its initialization look like this:

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags)
{
	struct request *rq;
	
	// Try the plug's cached requests first; the queue and opf are handed to blk_mq_alloc_cached_request
	rq = blk_mq_alloc_cached_request(q, opf, flags);
	if (!rq) {
		struct blk_mq_alloc_data data = {
			.q		= q,
			.flags		= flags,
			.cmd_flags	= opf,
			.nr_tags	= 1,
		};
		int ret;

		ret = blk_queue_enter(q, flags);
		if (ret)
			return ERR_PTR(ret);

		rq = __blk_mq_alloc_requests(&data);
		if (!rq)
			goto out_queue_exit;
	}
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
						   blk_opf_t opf,
						   blk_mq_req_flags_t flags)
{
	struct blk_plug *plug = current->plug;
	struct request *rq;

	if (!plug)
		return NULL;

	if (rq_list_empty(plug->cached_rq)) {
		if (plug->nr_ios == 1)
			return NULL;
		// Cache empty: blk_mq_rq_cache_fill allocates a batch of requests for this queue and opf
		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
		if (!rq)
			return NULL;
	} else {
		rq = rq_list_peek(&plug->cached_rq);
		if (!rq || rq->q != q)
			return NULL;

		if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
			return NULL;
		if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
			return NULL;

		plug->cached_rq = rq_list_next(rq);
	}

	rq->cmd_flags = opf;
	INIT_LIST_HEAD(&rq->queuelist);
	return rq;
}
static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
					    struct blk_plug *plug,
					    blk_opf_t opf,
					    blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= opf,
		.nr_tags	= plug->nr_ios,
		.cached_rq	= &plug->cached_rq,
	};
	struct request *rq;

	if (blk_queue_enter(q, flags))
		return NULL;

	plug->nr_ios = 1;

	rq = __blk_mq_alloc_requests(&data);
	if (unlikely(!rq))
		blk_queue_exit(q);
	return rq;
}

2. blk_rq_map_kern

If buffer and bufflen are both non-zero, this admin command carries data. Moving data requires a bio, so blk_rq_map_kern is called to attach a bio to the request and to map the kernel buffer into that bio; after all, the block layer knows nothing about kernel- or user-space buffers as such, it only understands bios.

/**
 * blk_rq_map_kern - map kernel data to a request, for passthrough requests
 * @q:		request queue where request should be inserted
 * @rq:		request to fill
 * @kbuf:	the kernel buffer
 * @len:	length of user data
 * @gfp_mask:	memory allocation flags
 *
 * Description:
 *    Data will be mapped directly if possible. Otherwise a bounce
 *    buffer is used. Can be called multiple times to append multiple
 *    buffers.
 */
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
		    unsigned int len, gfp_t gfp_mask)
{
	int reading = rq_data_dir(rq) == READ;
	unsigned long addr = (unsigned long) kbuf;
	struct bio *bio;
	int ret;
	
	// Two cases in which the buffer/length is invalid: longer than the queue's max transfer size, or zero length / NULL buffer
	if (len > (queue_max_hw_sectors(q) << 9))
		return -EINVAL;
	if (!len || !kbuf)
		return -EINVAL;
	
	// Tie the bio to the kernel buffer (copying through a bounce bio when it cannot be mapped directly)
	if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
	    blk_queue_may_bounce(q))
		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
	else
		bio = bio_map_kern(q, kbuf, len, gfp_mask);

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_opf &= ~REQ_OP_MASK;
	bio->bi_opf |= req_op(rq);

	ret = blk_rq_append_bio(rq, bio);
	if (unlikely(ret)) {
		bio_uninit(bio);
		kfree(bio);
	}
	return ret;
}
EXPORT_SYMBOL(blk_rq_map_kern);

3. nvme_execute_rq
/*
 * Return values:
 * 0:  success
 * >0: nvme controller's cqe status response
 * <0: kernel error in lieu of controller response
 */
static int nvme_execute_rq(struct request *rq, bool at_head)
{
	blk_status_t status;
	
	// blk_execute_rq inserts the request into the dispatch queue and waits for completion
	status = blk_execute_rq(rq, at_head);
	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
		// #define	EINTR		 4	/* Interrupted system call */
		return -EINTR;
	if (nvme_req(rq)->status)
		return nvme_req(rq)->status;
	return blk_status_to_errno(status);
}

blk_execute_rq inserts the request at the head or the tail of the dispatch queue and waits for it to finish; the caller then gets back a status of 0, >0 or <0 with the meanings given in the comment above.

How is the request inserted and waited on? As follows:

/**
 * blk_execute_rq - insert a request into queue for execution
 * @rq:		request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Insert a fully prepared request at the back of the I/O scheduler queue
 *    for execution and wait for completion.
 * Return: The blk_status_t result provided to blk_mq_end_request().
 */
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{
	struct blk_rq_wait wait = {
		.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
	};

	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));
	
	/* Set up the on-stack completion that blk_end_sync_rq will signal */
	rq->end_io_data = &wait;
	rq->end_io = blk_end_sync_rq;

	blk_account_io_start(rq);
	blk_mq_sched_insert_request(rq, at_head, true, false);

	if (blk_rq_is_poll(rq)) {
		blk_rq_poll_completion(rq, &wait.done);
	} else {
		/*
		 * Prevent hang_check timer from firing at us during very long
		 * I/O
		 */
		unsigned long hang_check = sysctl_hung_task_timeout_secs;

		if (hang_check)
			while (!wait_for_completion_io_timeout(&wait.done,
					hang_check * (HZ/2)))
				;
		else
			wait_for_completion_io(&wait.done);
	}

	return wait.ret;
}
EXPORT_SYMBOL(blk_execute_rq);

nvme_setup_io_queues

nvme_probe calls nvme_setup_io_queues to set up the I/O queues:

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = &dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	unsigned int nr_io_queues;
	unsigned long size;
	int result;

	/*
	 * Sample the module parameters once at reset time so that we have
	 * stable values to work with.
	 */
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
	
	// Each nvme_dev is one PCI function; everything in nr_allocated_queues except the admin queue becomes the I/O queue budget
	nr_io_queues = dev->nr_allocated_queues - 1;
	// 1. Send a Set Features command asking the controller for nr_io_queues I/O queues
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	/*
	 * Free IRQ resources as soon as NVMEQ_ENABLED bit transitions
	 * from set to unset. If there is a window to it is truely freed,
	 * pci_free_irq_vectors() jumping into this window will crash.
	 * And take lock to avoid racing with pci_free_irq_vectors() in
	 * nvme_dev_disable() path.
	 */
	result = nvme_setup_io_queues_trylock(dev);
	if (result)
		return result;
	if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
		pci_free_irq(pdev, 0, adminq);

	if (dev->cmb_use_sqes) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0) {
			dev->q_depth = result;
			dev->ctrl.sqsize = result - 1;
		} else {
			dev->cmb_use_sqes = false;
		}
	}

	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues) {
			result = -ENOMEM;
			goto out_unlock;
		}
	} while (1);
	adminq->q_db = dev->dbs;

 retry:
	/* Deregister the admin queue's interrupt */
	if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
		pci_free_irq(pdev, 0, adminq);

	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
	pci_free_irq_vectors(pdev);

	result = nvme_setup_irqs(dev, nr_io_queues);
	if (result <= 0) {
		result = -EIO;
		goto out_unlock;
	}

	dev->num_vecs = result;
	result = max(result - 1, 1);
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	result = queue_request_irq(adminq);
	if (result)
		goto out_unlock;
	set_bit(NVMEQ_ENABLED, &adminq->flags);
	mutex_unlock(&dev->shutdown_lock);
	
	// 2. With the I/O queue count settled, nvme_create_io_queues does the real work of creating them
	result = nvme_create_io_queues(dev);
	if (result || dev->online_queues < 2)
		return result;

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_delete_io_queues(dev);
		result = nvme_setup_io_queues_trylock(dev);
		if (result)
			return result;
		nvme_suspend_io_queues(dev);
		goto retry;
	}
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
	return 0;
out_unlock:
	mutex_unlock(&dev->shutdown_lock);
	return result;
}

The flow boils down to two steps:

    1. call nvme_set_queue_count, which sends a Set Features command to negotiate the number of I/O queues;
    2. once the count is settled, call nvme_create_io_queues to actually create the I/O queues.
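
Between those two steps the code above also re-maps the controller BAR so that it covers one doorbell pair per queue. A small userspace sketch of that size calculation, following the spec's doorbell layout (doorbells start at offset 0x1000; each queue has a 4-byte SQ tail and a 4-byte CQ head doorbell, spaced by the stride derived from CAP.DSTRD):

#include <stdio.h>
#include <stddef.h>

/* Bytes of BAR needed to reach the doorbells of queues 0..nr_io_queues
 * (queue 0 is the admin queue).  stride = 1 << CAP.DSTRD. */
static size_t db_bar_size(unsigned int nr_io_queues, unsigned int db_stride)
{
	return 0x1000 + (size_t)(nr_io_queues + 1) * 8 * db_stride;
}

int main(void)
{
	/* 30 I/O queues, default stride (DSTRD = 0) -> 4 KiB + 31 * 8 bytes */
	printf("BAR bytes needed: %zu\n", db_bar_size(30, 1));
	return 0;
}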

nvme_set_queue_count

nvme_setup_io_queues calls nvme_set_queue_count to negotiate the number of I/O queues:

int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
	u32 q_count = (*count - 1) | ((*count - 1) << 16);
	u32 result;
	int status, nr_io_queues;
	
	// Send a Set Features command with FID NVME_FEAT_NUM_QUEUES and q_count as Dword 11, and report the completion status
	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
			&result);
	if (status < 0)
		return status;

	/*
	 * Degraded controllers might return an error when setting the queue
	 * count.  We still want to be able to bring them online and offer
	 * access to the admin queue, as that might be only way to fix them up.
	 */
	if (status > 0) {
		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
		*count = 0;
	} else {
		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
		*count = min(*count, nr_io_queues);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
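
The encode/decode around this call is easy to get wrong, so here is a standalone sketch: the requested count minus one goes into both halves of Dword 11 (NSQR in bits 15:0, NCQR in bits 31:16), and the controller's completion reports the numbers actually allocated in the same two halves, also 0-based, of which the driver takes the minimum.

#include <stdio.h>
#include <stdint.h>

/* Number of Queues (FID 0x07): both fields are 0-based. */
static uint32_t encode_num_queues(unsigned int want)
{
	return (want - 1) | ((want - 1) << 16);
}

static unsigned int decode_num_queues(uint32_t result)
{
	unsigned int nsqa = result & 0xffff;	/* SQs allocated - 1 */
	unsigned int ncqa = result >> 16;	/* CQs allocated - 1 */

	return (nsqa < ncqa ? nsqa : ncqa) + 1;
}

int main(void)
{
	uint32_t dw11 = encode_num_queues(64);
	/* pretend the controller granted 32 SQs and 32 CQs */
	uint32_t completion_dw0 = (31u << 16) | 31u;

	printf("dword11 = 0x%08x, usable I/O queues = %u\n",
	       dw11, decode_num_queues(completion_dw0));
	return 0;
}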

nvme_set_features

c.features

nvme_set_queue_count builds the Set Features command through nvme_set_features:

int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
		      unsigned int dword11, void *buffer, size_t buflen,
		      u32 *result)
{
	return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
			     buflen, result);
}
EXPORT_SYMBOL_GPL(nvme_set_features);
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
	union nvme_result res = { 0 };
	struct nvme_command c = { };
	int ret;

	c.features.opcode = op;
	// cpu_to_le32 converts from host byte order to little-endian so the code stays portable
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);
	
	// Submitted the same way as described above for nvme_identify_ctrl and nvme_submit_sync_cmd
	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
			buffer, buflen, NVME_QID_ANY, 0, 0);
	if (ret >= 0 && result)
		*result = le32_to_cpu(res.u32);
	return ret;
}

nvme_set_features builds the command by calling nvme_features with nvme_admin_set_features (the opcode), NVME_FEAT_NUM_QUEUES (the FID) and q_count (Dword 11).

The opcode is defined in the specification as follows:
[Figure: admin command opcodes table from the NVMe specification (Set Features = 0x09)]
And in the source:

enum nvme_admin_opcode {
	...
	nvme_admin_set_features		= 0x09,
	...
};

Again, the specification and the source agree on the Set Features opcode.


The number of I/O queues is configured through the Set Features command with Feature ID 0x07, shown below:

[Figure: Set Features, Feature Identifiers table (Number of Queues = 0x07)]
The requested number of I/O queues is carried in Command Dword 11, shown below:

[Figure: Number of Queues feature, Command Dword 11 layout (NSQR / NCQR)]
The FID is defined in the source as:

enum {
	...
	NVME_FEAT_NUM_QUEUES	= 0x07,
	...

cpu_to_le32

Notice that c.features.fid and c.features.dword11 are assigned through cpu_to_le32. The structures the NVMe specification defines are little-endian, but the host may be a little-endian x86 or a big-endian ARM or something else entirely; these helpers convert between host byte order and little-endian so the same code runs correctly on every platform, which is one of the strengths of Linux.

Its definition reduces to a plain cast on little-endian hosts and a byte swap on big-endian ones.
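
A minimal userspace sketch of the same idea (not the kernel macro itself, which lives under include/uapi/linux/byteorder/):

#include <stdio.h>
#include <stdint.h>

/* Store a host-order 32-bit value as little-endian, byte by byte,
 * so the result is the same regardless of the host's endianness. */
static void put_le32(uint8_t *dst, uint32_t val)
{
	dst[0] = val & 0xff;
	dst[1] = (val >> 8) & 0xff;
	dst[2] = (val >> 16) & 0xff;
	dst[3] = (val >> 24) & 0xff;
}

int main(void)
{
	uint8_t dword11[4];

	put_le32(dword11, 0x003f003f);	/* the q_count value from the sketch above */
	printf("bytes on the wire: %02x %02x %02x %02x\n",
	       dword11[0], dword11[1], dword11[2], dword11[3]);
	return 0;
}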


__nvme_submit_sync_cmd

With the three key parameters of the Set Features command (opcode, FID, Dword 11) filled in, __nvme_submit_sync_cmd is called to execute it, completing the negotiation of the I/O queue count.

See the earlier walkthrough for how __nvme_submit_sync_cmd executes.


nvme_create_io_queues

nvme_setup_io_queues calls nvme_create_io_queues to create the I/O queues:

static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
	 	// 1. nvme_alloc_queue allocates the memory backing each SQ/CQ pair
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
	} else {
		rw_queues = max;
	}

	for (i = dev->online_queues; i <= max; i++) {
		bool polled = i > rw_queues;
		
		// 2. nvme_create_queue actually creates the SQ/CQ pair on the controller
		ret = nvme_create_queue(&dev->queues[i], i, polled);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

The code above shows that nvme_create_io_queues proceeds in two steps:

  • call nvme_alloc_queue to allocate the memory each SQ/CQ needs (see the earlier discussion of nvme_alloc_queue);
  • call nvme_create_queue to actually create the SQ/CQ on the controller.

nvme_create_queue

nvme_create_io_queues calls nvme_create_queue to actually create each SQ/CQ pair:

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
	u16 vector = 0;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
		set_bit(NVMEQ_POLLED, &nvmeq->flags);
	
	// 1. adapter_alloc_cq issues the Create I/O CQ admin command
	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
	if (result)
		return result;
	
	// adapter_alloc_sq issues the Create I/O SQ admin command
	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		return result;
	if (result)
		goto release_cq;

	nvmeq->cq_vector = vector;

	result = nvme_setup_io_queues_trylock(dev);
	if (result)
		return result;
	// 2. nvme_init_queue initializes the driver-side state of the newly created CQ/SQ
	nvme_init_queue(nvmeq, qid);
	if (!polled) {
		// 3. queue_request_irq requests the interrupt for this queue
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	mutex_unlock(&dev->shutdown_lock);
	return result;

release_sq:
	dev->online_queues--;
	mutex_unlock(&dev->shutdown_lock);
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

adapter_alloc_cq / adapter_alloc_sq

nvme_create_queue uses adapter_alloc_cq and adapter_alloc_sq to issue the actual queue-creation commands:

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command c = { };
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command c = { };
	int flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

In the Create I/O SQ command, opcode, prp1, sqid and qsize mirror their counterparts in the Create I/O CQ command; the two fields worth a closer look are sq_flags and cqid.

sq_flags here is the PC flag OR'd with the QPRIO flag; with the quirk above applied it becomes:

int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

[Figure: Create I/O Submission Queue, Command Dword 11 (PC and QPRIO fields)]
QPRIO (Queue Priority) is the arbitration priority of the commands in an SQ. The NVMe spec does not dictate the order in which commands placed into an SQ are executed, and a controller may fetch several commands at a time for batch processing. Execution order is not fixed within a single SQ, nor across SQs, which is where the arbitration mechanisms defined by the NVMe spec come in.

Compared with Create I/O CQ, the Create I/O SQ command carries one extra field, cqid, because SQs and CQs are paired: I/O SQs and I/O CQs can map one-to-one, or several SQs can share a single CQ.

Once the six key parameters of the Create I/O Submission Queue command (opcode, PRP1, sqid, qsize, sq_flags, cqid) are filled in, __nvme_submit_sync_cmd is called to execute it and the queue is created; see the earlier description of how __nvme_submit_sync_cmd works.
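
To make the flag packing concrete, here is a small userspace sketch; the numeric values mirror the Create I/O SQ Command Dword 11 layout (PC in bit 0, QPRIO in bits 2:1), and the macro names are local stand-ins for the kernel's enum rather than the kernel headers themselves:

#include <stdio.h>
#include <stdint.h>

/* Assumed layout: PC = bit 0, QPRIO = bits 2:1 of Command Dword 11. */
#define QUEUE_PHYS_CONTIG	(1 << 0)
#define SQ_PRIO_URGENT		(0 << 1)
#define SQ_PRIO_HIGH		(1 << 1)
#define SQ_PRIO_MEDIUM		(2 << 1)
#define SQ_PRIO_LOW		(3 << 1)

int main(void)
{
	uint16_t sq_flags = QUEUE_PHYS_CONTIG | SQ_PRIO_MEDIUM;

	printf("sq_flags = 0x%04x (PC=%u, QPRIO=%u)\n",
	       sq_flags, sq_flags & 1, (sq_flags >> 1) & 3);
	return 0;
}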


nvme_start_ctrl

nvme_probe calls nvme_start_ctrl to put the controller to work:

void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_start_keep_alive(ctrl);
	
	// 1. Enable AEN (Asynchronous Event Notification)
	nvme_enable_aen(ctrl);

	/*
	 * persistent discovery controllers need to send indication to userspace
	 * to re-read the discovery log page to learn about possible changes
	 * that were missed. We identify persistent discovery controllers by
	 * checking that they started once before, hence are reconnecting back.
	 */
	// 2. A persistent discovery controller that has started before is reconnecting; tell userspace to re-read the discovery log page
	if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
	    nvme_discovery_ctrl(ctrl))
		nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
	
	if (ctrl->queue_count > 1) {
		// 3. With I/O queues present, schedule the namespace scan and unquiesce the I/O queues
		nvme_queue_scan(ctrl);
		nvme_unquiesce_io_queues(ctrl);
		nvme_mpath_update(ctrl);
	}

	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);

nvme_queue_scan

nvme_start_ctrl calls nvme_queue_scan to schedule the namespace scan work:

void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
	/*
	 * Only new queue scan work when admin and IO queues are both alive
	 */
	// Only queue the scan work when both the admin and I/O queues are alive (controller LIVE and tag set present)
	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
		queue_work(nvme_wq, &ctrl->scan_work);
}
/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns %false if @work was already on a queue, %true otherwise.
 *
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
 *
 * Memory-ordering properties:  If it returns %true, guarantees that all stores
 * preceding the call to queue_work() in the program order will be visible from
 * the CPU which will execute @work by the time such work executes, e.g.,
 *
 * { x is initially 0 }
 *
 *   CPU0				CPU1
 *
 *   WRITE_ONCE(x, 1);			[ @work is being executed ]
 *   r0 = queue_work(wq, work);		  r1 = READ_ONCE(x);
 *
 * Forbids: r0 == true && r1 == 0
 */
static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.  Callers that fail to ensure that the specified
 * CPU cannot go away will execute on a randomly chosen CPU.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
		   struct work_struct *work)
{
	bool ret = false;
	unsigned long flags;
	
	// Disable local interrupts, saving the current flags
	local_irq_save(flags);

	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		__queue_work(cpu, wq, work);
		ret = true;
	}
	
	// Restore the saved interrupt flags
	local_irq_restore(flags);
	return ret;
}
EXPORT_SYMBOL(queue_work_on);
static void __queue_work(int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct pool_workqueue *pwq;
	struct worker_pool *last_pool;
	struct list_head *worklist;
	unsigned int work_flags;
	unsigned int req_cpu = cpu;

	/*
	 * While a work item is PENDING && off queue, a task trying to
	 * steal the PENDING will busy-loop waiting for it to either get
	 * queued or lose PENDING.  Grabbing PENDING and queueing should
	 * happen with IRQ disabled.
	 */
	lockdep_assert_irqs_disabled();


	/* if draining, only works from the same workqueue are allowed */
	if (unlikely(wq->flags & __WQ_DRAINING) &&
	    WARN_ON_ONCE(!is_chained_work(wq)))
		return;
	rcu_read_lock();
retry:
	/* pwq which will be used unless @work is executing elsewhere */
	if (wq->flags & WQ_UNBOUND) {
		if (req_cpu == WORK_CPU_UNBOUND)
			cpu = wq_select_unbound_cpu(raw_smp_processor_id());
		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
	} else {
		if (req_cpu == WORK_CPU_UNBOUND)
			cpu = raw_smp_processor_id();
		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
	}

	/*
	 * If @work was previously on a different pool, it might still be
	 * running there, in which case the work needs to be queued on that
	 * pool to guarantee non-reentrancy.
	 */
	last_pool = get_work_pool(work);
	if (last_pool && last_pool != pwq->pool) {
		struct worker *worker;

		raw_spin_lock(&last_pool->lock);

		worker = find_worker_executing_work(last_pool, work);

		if (worker && worker->current_pwq->wq == wq) {
			pwq = worker->current_pwq;
		} else {
			/* meh... not running there, queue here */
			raw_spin_unlock(&last_pool->lock);
			raw_spin_lock(&pwq->pool->lock);
		}
	} else {
		raw_spin_lock(&pwq->pool->lock);
	}

	/*
	 * pwq is determined and locked.  For unbound pools, we could have
	 * raced with pwq release and it could already be dead.  If its
	 * refcnt is zero, repeat pwq selection.  Note that pwqs never die
	 * without another pwq replacing it in the numa_pwq_tbl or while
	 * work items are executing on it, so the retrying is guaranteed to
	 * make forward-progress.
	 */
	if (unlikely(!pwq->refcnt)) {
		if (wq->flags & WQ_UNBOUND) {
			raw_spin_unlock(&pwq->pool->lock);
			cpu_relax();
			goto retry;
		}
		/* oops */
		WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
			  wq->name, cpu);
	}

	/* pwq determined, queue */
	trace_workqueue_queue_work(req_cpu, pwq, work);

	if (WARN_ON(!list_empty(&work->entry)))
		goto out;

	pwq->nr_in_flight[pwq->work_color]++;
	work_flags = work_color_to_flags(pwq->work_color);

	if (likely(pwq->nr_active < pwq->max_active)) {
		trace_workqueue_activate_work(work);
		pwq->nr_active++;
		worklist = &pwq->pool->worklist;
		if (list_empty(worklist))
			pwq->pool->watchdog_ts = jiffies;
	} else {
		work_flags |= WORK_STRUCT_INACTIVE;
		worklist = &pwq->inactive_works;
	}

	debug_work_activate(work);
	// Insert the work item into the selected pool's worklist
	insert_work(pwq, work, worklist, work_flags);

out:
	raw_spin_unlock(&pwq->pool->lock);
	rcu_read_unlock();
}

insert_work

__queue_work, reached from nvme_queue_scan via queue_work, uses insert_work to put the work item onto a pool:

/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
 * work_struct flags.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
			struct list_head *head, unsigned int extra_flags)
{
	struct worker_pool *pool = pwq->pool;

	/* record the work call stack in order to print it in KASAN reports */
	kasan_record_aux_stack_noalloc(work);

	/* we own @work, set data and link */
	set_work_pwq(work, pwq, extra_flags);
	list_add_tail(&work->entry, head);
	get_pwq(pwq);

	if (__need_more_worker(pool))
		wake_up_worker(pool);
}
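
To tie nvme_queue_scan's use of queue_work together with the flush_work discussed next, here is a minimal self-contained workqueue sketch; it is a generic kernel-module example with hypothetical names (demo_wq, demo_work), not NVMe code:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

/* Runs later in process context on a worker thread of demo_wq. */
static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo work executing\n");
}

static int __init demo_init(void)
{
	demo_wq = alloc_workqueue("demo_wq", 0, 0);
	if (!demo_wq)
		return -ENOMEM;

	INIT_WORK(&demo_work, demo_work_fn);
	queue_work(demo_wq, &demo_work);	/* same pattern as nvme_queue_scan */
	flush_work(&demo_work);			/* wait for it to finish, like the probe path below */
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");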

flush_work

nvme_probe uses flush_work to wait for the last queued instance of a work item to finish executing:

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
 * Wait until @work has finished execution.  @work is guaranteed to be idle
 * on return if it hasn't been requeued since flush started.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
	return __flush_work(work, false);
}
EXPORT_SYMBOL_GPL(flush_work);

which in turn calls __flush_work:

static bool __flush_work(struct work_struct *work, bool from_cancel)
{
	struct wq_barrier barr;

	if (WARN_ON(!wq_online))
		return false;

	if (WARN_ON(!work->func))
		return false;

	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);

	if (start_flush_work(work, &barr, from_cancel)) {
		wait_for_completion(&barr.done);
		destroy_work_on_stack(&barr.work);
		return true;
	} else {
		return false;
	}
}

start_flush_work

__flush_work calls start_flush_work:

static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
			     bool from_cancel)
{
	struct worker *worker = NULL;
	struct worker_pool *pool;
	struct pool_workqueue *pwq;

	might_sleep();

	rcu_read_lock();
	pool = get_work_pool(work);
	if (!pool) {
		rcu_read_unlock();
		return false;
	}

	raw_spin_lock_irq(&pool->lock);
	/* see the comment in try_to_grab_pending() with the same code */
	pwq = get_work_pwq(work);
	if (pwq) {
		if (unlikely(pwq->pool != pool))
			goto already_gone;
	} else {
		worker = find_worker_executing_work(pool, work);
		if (!worker)
			goto already_gone;
		pwq = worker->current_pwq;
	}

	check_flush_dependency(pwq->wq, work);

	insert_wq_barrier(pwq, barr, work, worker);
	raw_spin_unlock_irq(&pool->lock);

	/*
	 * Force a lock recursion deadlock when using flush_work() inside a
	 * single-threaded or rescuer equipped workqueue.
	 *
	 * For single threaded workqueues the deadlock happens when the work
	 * is after the work issuing the flush_work(). For rescuer equipped
	 * workqueues the deadlock happens when the rescuer stalls, blocking
	 * forward progress.
	 */
	if (!from_cancel &&
	    (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
		lock_map_acquire(&pwq->wq->lockdep_map);
		lock_map_release(&pwq->wq->lockdep_map);
	}
	rcu_read_unlock();
	return true;
already_gone:
	raw_spin_unlock_irq(&pool->lock);
	rcu_read_unlock();
	return false;
}

References


  1. NVMe驱动系列文章

  2. Linux NVMe Driver学习笔记之1:概述与nvme_core_init函数解析

  3. linux

  4. bootlin
