This article is mainly based on the analyses here [1][2] and on the Linux kernel source [3].
A convenient site for browsing the kernel source, bootlin [4], is also recommended.
Updated: 2022/02/19
Let's first recall the definition of nvme_probe:
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct nvme_dev *dev;
int result = -ENOMEM;
// 1.
dev = nvme_pci_alloc_dev(pdev, id);
if (!dev)
return -ENOMEM;
// 2. Map the PCI BAR and obtain its virtual address
result = nvme_dev_map(dev);
if (result)
goto out_uninit_ctrl;
// 3. Set up the PRP memory pools needed for DMA
result = nvme_setup_prp_pools(dev);
if (result)
goto out_dev_unmap;
// 4. Allocate the mempool used for PRP/SGL iod allocations
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
goto out_release_prp_pools;
// Print which PCI function the detected device is
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
// 5.
result = nvme_pci_enable(dev);
if (result)
goto out_release_iod_mempool;
// 6. Initialize the admin tag set via nvme_alloc_admin_tag_set; note the ops assignment here (it will be used later).
result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
&nvme_mq_admin_ops, sizeof(struct nvme_iod));
if (result)
goto out_disable;
/*
* Mark the controller as connecting before sending admin commands to
* allow the timeout handler to do the right thing.
*/
// Mark the controller CONNECTING before sending admin commands; if the state change fails the controller is treated as busy (-EBUSY) and gets disabled.
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(dev->ctrl.device,
"failed to mark controller CONNECTING\n");
result = -EBUSY;
goto out_disable;
}
// 7. Finish initializing the NVMe controller structure
result = nvme_init_ctrl_finish(&dev->ctrl, false);
if (result)
goto out_disable;
// 8. allocate dma for dbbuf
nvme_dbbuf_dma_alloc(dev);
// 9. Set up the host memory buffer (HMB)
result = nvme_setup_host_mem(dev);
if (result < 0)
goto out_disable;
// 10. Set up the I/O queues
result = nvme_setup_io_queues(dev);
if (result)
goto out_disable;
// 11. Initialize the I/O tag set via nvme_alloc_io_tag_set
if (dev->online_queues > 1) {
nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
nvme_dbbuf_set(dev);
}
// If the tagset was not set up, warn that no I/O queues were created.
if (!dev->ctrl.tagset)
dev_warn(dev->ctrl.device, "IO queues not created\n");
// If the controller cannot be marked LIVE, warn and bail out.
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device,
"failed to mark controller live state\n");
result = -ENODEV;
goto out_disable;
}
// 12. Set the driver-private data pointer for the PCI device
pci_set_drvdata(pdev, dev);
// 13. Start the controller (keep-alive, async events, namespace scan work)
nvme_start_ctrl(&dev->ctrl);
// 14. Drop the reference taken during allocation
nvme_put_ctrl(&dev->ctrl);
// 15. Wait for the initial namespace scan work to finish
flush_work(&dev->ctrl.scan_work);
return 0;
out_disable:
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_dev_disable(dev, true);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
out_release_prp_pools:
nvme_release_prp_pools(dev);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
nvme_uninit_ctrl(&dev->ctrl);
return result;
}
The overall flow of nvme_probe is summarized in the figure below:
Next, let's look more closely at some of the functions used during nvme_probe:
nvme_probe first calls nvme_pci_alloc_dev:
static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
const struct pci_device_id *id)
{
unsigned long quirks = id->driver_data;
// 1. Get the NUMA node of this pci_dev via dev_to_node().
int node = dev_to_node(&pdev->dev);
struct nvme_dev *dev;
int ret = -ENOMEM;
// If no node is specified, default to first_memory_node (the first NUMA node with memory) and record it via set_dev_node().
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, first_memory_node);
// 2. Allocate the nvme_dev structure on that node
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return NULL;
// 3. Initialize the reset work item (queued to a workqueue later)
// 4. Its handler is nvme_reset_work, which performs the controller reset
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
// Initialize the shutdown mutex
mutex_init(&dev->shutdown_lock);
// 5. Allocate the nvme_queue array
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
dev->queues = kcalloc_node(dev->nr_allocated_queues,
sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
goto out_free_dev;
// 6. Take a reference on the underlying device object
dev->dev = get_device(&pdev->dev);
quirks |= check_vendor_combination_bug(pdev);
if (!noacpi && acpi_storage_d3(&pdev->dev)) {
/*
* Some systems use a bios work around to ask for D3 on
* platforms that support kernel managed suspend.
*/
dev_info(&pdev->dev,
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
// Initialize the generic NVMe controller structure
ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks);
if (ret)
goto out_put_device;
dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
dma_set_max_seg_size(&pdev->dev, 0xffffffff);
/*
* Limit the max command size to prevent iod->sg allocations going
* over a single page.
*/
dev->ctrl.max_hw_sectors = min_t(u32,
NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
/*
* There is no support for SGLs for metadata (yet), so we are limited to
* a single integrity segment for the separate metadata pointer.
*/
dev->ctrl.max_integrity_segments = 1;
return dev;
out_put_device:
put_device(dev->dev);
kfree(dev->queues);
out_free_dev:
kfree(dev);
return ERR_PTR(ret);
}
What does nvme_pci_alloc_dev do? The steps are annotated in the listing above.
nvme_pci_alloc_dev registers nvme_reset_work as the controller's reset work handler; its definition is:
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev =
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
// 1. Check for the NVME_CTRL_RESETTING state to make sure nvme_reset_work is not re-entered.
if (dev->ctrl.state != NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
dev->ctrl.state);
return;
}
/*
* If we're called to reset a live controller first shut it down before
* moving on.
*/
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
nvme_sync_queues(&dev->ctrl);
mutex_lock(&dev->shutdown_lock);
// 2.
result = nvme_pci_enable(dev);
if (result)
goto out_unlock;
// 3.
nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);
/*
* Introduce CONNECTING state from nvme-fc/rdma transports to mark the
* initializing procedure here.
*/
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(dev->ctrl.device,
"failed to mark controller CONNECTING\n");
result = -EBUSY;
goto out;
}
result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
if (result)
goto out;
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev);
if (result < 0)
goto out;
result = nvme_setup_io_queues(dev);
if (result)
goto out;
/*
* Freeze and update the number of I/O queues as thos might have
* changed. If there are no I/O queues left after this reset, keep the
* controller around but remove all namespaces.
*/
if (dev->online_queues > 1) {
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
nvme_pci_update_nr_queues(dev);
nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
} else {
dev_warn(dev->ctrl.device, "IO queues lost\n");
nvme_mark_namespaces_dead(&dev->ctrl);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_free_tagset(dev);
}
/*
* If only admin queue live, keep it to do further investigation or
* recovery.
*/
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device,
"failed to mark controller live state\n");
result = -ENODEV;
goto out;
}
nvme_start_ctrl(&dev->ctrl);
return;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
out:
/*
* Set state to deleting now to avoid blocking nvme_wait_reset(), which
* may be holding this pci_dev's device lock.
*/
dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
result);
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_dev_disable(dev, true);
nvme_mark_namespaces_dead(&dev->ctrl);
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}
nvme_reset_work calls nvme_pci_enable, which is defined as follows:
static int nvme_pci_enable(struct nvme_dev *dev)
{
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
int dma_address_bits = 64;
// 1. Enable the device's memory space, i.e. the BAR region mapped earlier.
if (pci_enable_device_mem(pdev))
return result;
// Enable bus mastering so the device can initiate transactions (DMA) on the PCI bus.
pci_set_master(pdev);
if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
dma_address_bits = 48;
// Set the DMA addressing capability of the device: 64 bits or 48 bits
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
goto disable;
// Read the controller status register (CSTS) to check that the controller is reachable
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
result = -ENODEV;
goto disable;
}
/*
* Some devices and/or platforms don't advertise or work with INTx
* interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
* adjust this later.
*/
// Allocate an interrupt vector. NVMe devices support three interrupt modes: INTx/MSI/MSI-X.
result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
if (result < 0)
goto disable;
// Read the 64-bit Controller Capabilities (CAP) register
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
// Set the doorbell base address; 4096 is the 0x1000 offset of the SQ 0 Tail doorbell
dev->dbs = dev->bar + 4096;
/*
* Some Apple controllers require a non-standard SQE size.
* Interestingly they also seem to ignore the CC:IOSQES register
* so we don't bother updating it here.
*/
if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
dev->io_sqes = 7;
else
dev->io_sqes = NVME_NVM_IOSQES;
/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
dev->q_depth = 2;
dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
"set queue depth=%u to work around controller resets\n",
dev->q_depth);
} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
(pdev->device == 0xa821 || pdev->device == 0xa822) &&
NVME_CAP_MQES(dev->ctrl.cap) == 0) {
dev->q_depth = 64;
dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
"set queue depth=%u\n", dev->q_depth);
}
/*
* Controllers with the shared tags quirk need the IO queue to be
* big enough so that we get 32 tags for the admin queue
*/
if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
(dev->q_depth < (NVME_AQ_DEPTH + 2))) {
dev->q_depth = NVME_AQ_DEPTH + 2;
dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
dev->q_depth);
}
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
nvme_map_cmb(dev);
// Enable PCIe AER error reporting
pci_enable_pcie_error_reporting(pdev);
// Save the current PCI config space state (used across suspend/reset)
pci_save_state(pdev);
result = nvme_pci_configure_admin_queue(dev);
if (result)
goto free_irq;
return result;
free_irq:
pci_free_irq_vectors(pdev);
disable:
pci_disable_device(pdev);
return result;
}
nvme_pci_enable calls pci_alloc_irq_vectors, defined as follows:
/**
* pci_alloc_irq_vectors() - Allocate multiple device interrupt vectors
* @dev: the PCI device to operate on
* @min_vecs: minimum required number of vectors (must be >= 1)
* @max_vecs: maximum desired number of vectors
* @flags: One or more of:
*
* * %PCI_IRQ_MSIX Allow trying MSI-X vector allocations
* * %PCI_IRQ_MSI Allow trying MSI vector allocations
*
* * %PCI_IRQ_LEGACY Allow trying legacy INTx interrupts, if
* and only if @min_vecs == 1
*
* * %PCI_IRQ_AFFINITY Auto-manage IRQs affinity by spreading
* the vectors around available CPUs
*
* Allocate up to @max_vecs interrupt vectors on device. MSI-X irq
* vector allocation has a higher precedence over plain MSI, which has a
* higher precedence over legacy INTx emulation.
*
* Upon a successful allocation, the caller should use pci_irq_vector()
* to get the Linux IRQ number to be passed to request_threaded_irq().
* The driver must call pci_free_irq_vectors() on cleanup.
*
* Return: number of allocated vectors (which might be smaller than
* @max_vecs), -ENOSPC if less than @min_vecs interrupt vectors are
* available, other errnos otherwise.
*/
int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
unsigned int max_vecs, unsigned int flags)
{
return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs,
flags, NULL);
}
EXPORT_SYMBOL(pci_alloc_irq_vectors);
/**
* pci_alloc_irq_vectors_affinity() - Allocate multiple device interrupt
* vectors with affinity requirements
* @dev: the PCI device to operate on
* @min_vecs: minimum required number of vectors (must be >= 1)
* @max_vecs: maximum desired number of vectors
* @flags: allocation flags, as in pci_alloc_irq_vectors()
* @affd: affinity requirements (can be %NULL).
*
* Same as pci_alloc_irq_vectors(), but with the extra @affd parameter.
* Check that function docs, and &struct irq_affinity, for more details.
*/
int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
unsigned int max_vecs, unsigned int flags,
struct irq_affinity *affd)
{
struct irq_affinity msi_default_affd = {0};
int nvecs = -ENOSPC;
// With PCI_IRQ_AFFINITY the vectors are spread across the available CPUs.
// When called from nvme_pci_enable, affd is NULL.
if (flags & PCI_IRQ_AFFINITY) {
if (!affd)
affd = &msi_default_affd;
} else {
if (WARN_ON(affd))
affd = NULL;
}
// Try MSI-X first and configure the MSI-X capability structure
if (flags & PCI_IRQ_MSIX) {
nvecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
affd, flags);
if (nvecs > 0)
return nvecs;
}
// Fall back to MSI and configure the MSI capability structure
if (flags & PCI_IRQ_MSI) {
nvecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, affd);
if (nvecs > 0)
return nvecs;
}
/* use legacy IRQ if allowed */
// Finally fall back to legacy INTx
if (flags & PCI_IRQ_LEGACY) {
if (min_vecs == 1 && dev->irq) {
/*
* Invoke the affinity spreading logic to ensure that
* the device driver can adjust queue configuration
* for the single interrupt case.
*/
if (affd)
irq_create_affinity_masks(1, affd);
pci_intx(dev, 1);
return 1;
}
}
return nvecs;
}
EXPORT_SYMBOL(pci_alloc_irq_vectors_affinity);
These three interrupt modes cannot be enabled at the same time: to use MSI-X, for example, INTx and MSI must be disabled.
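As a hedged sketch (not the exact nvme code) of how a driver typically uses this API, where MY_MAX_VECS, my_dev_handler and data are made-up names for illustration:
/* Sketch: request up to MY_MAX_VECS vectors, letting the PCI core try
 * MSI-X, then MSI, then INTx, and spread them across the CPUs. */
int nvecs = pci_alloc_irq_vectors(pdev, 1, MY_MAX_VECS,
				  PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
if (nvecs < 0)
	return nvecs;
/* pci_irq_vector() maps a vector index to the Linux IRQ number */
ret = request_irq(pci_irq_vector(pdev, 0), my_dev_handler, 0, "my_dev", data);
/* ... on teardown: free_irq() each requested vector, then pci_free_irq_vectors(pdev) */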
nvme_pci_enable also calls nvme_pci_configure_admin_queue, defined as follows:
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
int result;
u32 aqa;
struct nvme_queue *nvmeq;
result = nvme_remap_bar(dev, db_bar_size(dev, 0));
if (result < 0)
return result;
// 1. Check the CAP register for NVM Subsystem Reset support
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
if (dev->subsystem &&
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
/*
* If the device has been passed off to us in an enabled state, just
* clear the enabled bit. The spec says we should set the 'shutdown
* notification bits', but doing so may cause the device to complete
* commands to the admin queue ... and we don't know what memory that
* might be pointing at!
*/
// 2. Disable the controller via nvme_disable_ctrl
result = nvme_disable_ctrl(&dev->ctrl, false);
if (result < 0)
return result;
// 3. Call nvme_alloc_queue.
// The admin queue has not been set up yet at this point, so nvme_alloc_queue allocates its SQ/CQ memory.
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
if (result)
return result;
dev->ctrl.numa_node = dev_to_node(dev->dev);
nvmeq = &dev->queues[0];
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;
// 4. After allocation, write the admin queue attributes (AQA) and the admin SQ/CQ DMA addresses (ASQ/ACQ) into the controller registers.
writel(aqa, dev->bar + NVME_REG_AQA);
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
// 5. With the admin queue memory in place, re-enable the controller via nvme_enable_ctrl (essentially the reverse of nvme_disable_ctrl).
result = nvme_enable_ctrl(&dev->ctrl);
if (result)
return result;
// 6. Initialize the freshly allocated queue
nvmeq->cq_vector = 0;
nvme_init_queue(nvmeq, 0);
// 7. Request the interrupt via queue_request_irq; it installs the interrupt handler, by default in hard-IRQ context rather than as a threaded handler.
result = queue_request_irq(nvmeq);
if (result) {
dev->online_queues--;
return result;
}
set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
}
From the code above, the main steps of nvme_pci_configure_admin_queue are roughly:
1. check the CAP register for NVM Subsystem Reset support;
2. nvme_disable_ctrl;
3. nvme_alloc_queue;
4. write AQA/ASQ/ACQ via lo_hi_writeq;
5. nvme_enable_ctrl;
6. nvme_init_queue;
7. queue_request_irq.
Let's go through these steps one by one:
Bit [36] of the controller's CAP register (NSSRS) indicates whether NVM Subsystem Reset is supported, as shown in the figure below:
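For reference, the kernel extracts this bit from the 64-bit CAP value with a simple shift-and-mask macro; in include/linux/nvme.h it is essentially:
#define NVME_CAP_NSSRC(cap)	(((cap) >> 36) & 0x1)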
In most cases the NVM subsystem is simply an SSD: the controller, the NAND and the interface together form an NVM subsystem.
NVM Subsystem Reset is one kind of Controller Level Reset.
Before reprogramming the NVMe controller it must first be disabled with nvme_disable_ctrl; once the admin queue is set up it is re-enabled with nvme_enable_ctrl.
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
int ret;
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
if (shutdown)
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
else
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
// ctrl->ops is the nvme_pci_ctrl_ops passed to nvme_init_ctrl back in the probe path.
// reg_write32 clears the enable bit in the CC register to disable the device.
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
if (shutdown) {
return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
NVME_CSTS_SHST_CMPLT,
ctrl->shutdown_timeout, "shutdown");
}
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
// Finally, poll the CSTS status register until the device has actually stopped.
// The timeout comes from the TO field in CAP bits [31:24]; each unit represents 500 ms.
return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
(NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
// nvme_wait_ready polls the CSTS register until the expected state is reached; the timeout is derived from CAP.TO (bits [31:24], 500 ms units).
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
u32 timeout, const char *op)
{
unsigned long timeout_jiffies = jiffies + timeout * HZ;
u32 csts;
int ret;
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if (csts == ~0)
return -ENODEV;
if ((csts & mask) == val)
break;
usleep_range(1000, 2000);
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout_jiffies)) {
dev_err(ctrl->device,
"Device not ready; aborting %s, CSTS=0x%x\n",
op, csts);
return -ENODEV;
}
}
return ret;
}
// Essentially the reverse of nvme_disable_ctrl
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
unsigned dev_page_min;
u32 timeout;
int ret;
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
if (ret) {
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
return ret;
}
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
return -ENODEV;
}
if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
ctrl->ctrl_config = NVME_CC_CSS_CSI;
else
ctrl->ctrl_config = NVME_CC_CSS_NVM;
if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
u32 crto;
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
if (ret) {
dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
ret);
return ret;
}
if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
ctrl->ctrl_config |= NVME_CC_CRIME;
timeout = NVME_CRTO_CRIMT(crto);
} else {
timeout = NVME_CRTO_CRWMT(crto);
}
} else {
timeout = NVME_CAP_TIMEOUT(ctrl->cap);
}
ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
// ctrl->ops is again the nvme_pci_ctrl_ops from nvme_init_ctrl.
// reg_write32 writes the new configuration to the CC register (the enable bit is set further below).
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
/* Flush write to device (required if transport is PCI) */
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
if (ret)
return ret;
ctrl->ctrl_config |= NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
(timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
After the controller has been disabled, nvme_alloc_queue is called to allocate the NVMe queue.
static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
struct nvme_queue *nvmeq = &dev->queues[qid];
if (dev->ctrl.queue_count > qid)
return 0;
nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
nvmeq->q_depth = depth;
// 1. Allocate DMA-coherent memory for the completion queue with dma_alloc_coherent. nvmeq->cqes is the kernel virtual address,
// while nvmeq->cq_dma_addr is the DMA (bus) address handed to the controller.
nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
// 2. Allocate the submission queue via nvme_alloc_sq_cmds: if the controller (NVMe 1.2+) exposes a CMB that supports SQs, the SQ lives in the CMB;
// otherwise it is allocated with dma_alloc_coherent just like the CQ.
if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
goto free_cqdma;
nvmeq->dev = dev;
spin_lock_init(&nvmeq->sq_lock);
spin_lock_init(&nvmeq->cq_poll_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->qid = qid;
dev->ctrl.queue_count++;
return 0;
free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
return -ENOMEM;
}
When nvme_alloc_queue is called here, it is the first time the queue is touched after the controller was disabled, so its SQ/CQ memory has not been allocated yet; nvme_alloc_queue allocates it.
Once nvme_alloc_queue has allocated the queue, the admin queue attributes and the DMA addresses of the already-allocated admin SQ/CQ are written into the AQA/ASQ/ACQ registers:
static inline void lo_hi_writeq(__u64 val, volatile void __iomem *addr)
{
writel(val, addr);
writel(val >> 32, addr + 4);
}
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
struct nvme_dev *dev = nvmeq->dev;
nvmeq->sq_tail = 0;
nvmeq->last_sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
wmb(); /* ensure the first interrupt sees the initialization */
}
During this step, SQ Tail, CQ Head, CQ phase and related variables are initialized, and q_db is pointed at the queue's doorbell registers.
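As a hedged sketch of where q_db ends up pointing: dev->dbs is BAR0 + 0x1000 and dev->db_stride is the doorbell stride in dword units (1 << CAP.DSTRD), so following the NVMe register layout:
/* Sketch: SQ tail / CQ head doorbells of queue qid inside BAR0 */
u32 __iomem *sq_tail_db = &dev->dbs[qid * 2 * dev->db_stride];
u32 __iomem *cq_head_db = &dev->dbs[(qid * 2 + 1) * dev->db_stride];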
static int queue_request_irq(struct nvme_queue *nvmeq)
{
struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
int nr = nvmeq->dev->ctrl.instance;
if (use_threaded_interrupts) {
return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
} else {
return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
}
}
queue_request_irq requests the interrupt.
By default the driver does not use threaded interrupt handling; the handler runs in interrupt context.
A quick recap of threaded interrupts versus interrupt context:
Interrupt threading is an important step toward real-time behaviour in Linux. In the standard kernel, interrupts are the highest-priority execution unit: whatever the kernel is doing, as soon as an interrupt arrives the system responds immediately and runs the corresponding handler (unless interrupts are disabled). Under heavy network or I/O load, interrupts fire very frequently and real-time tasks that arrive later hardly get a chance to run, so real-time guarantees are lost.
With threaded interrupts, each interrupt runs as a kernel thread with its own real-time priority. A real-time task can then be given a higher priority than the interrupt thread, so the task remains the highest-priority execution unit and keeps its real-time behaviour even under heavy load.
Kernel space versus user space is one of the foundations of operating-system design: kernel code runs in kernel space while applications run in user space. Modern CPUs provide different operating modes (privilege levels) with different capabilities, and Linux uses two of them: the kernel runs at the highest level (kernel mode), where every operation is allowed, while applications run at a lower level (user mode), where direct hardware access and unauthorized memory access are restricted.
Kernel mode and user mode each have their own memory mappings, i.e. their own address spaces.
This separation of execution states is what gives rise to the notion of a context. When a user-space application wants a system service, for example to operate a physical device or to map device memory into user space, it must go through a system call (the interface the OS exposes to user space). The system call takes the application into kernel space, where the kernel runs on behalf of that process; this involves a context switch, because user space and kernel space have different address mappings and register sets. The process passes arguments to the kernel, and the kernel saves the process's registers and state so execution can resume in user space when the call returns. The process context is therefore the CPU register values, process state and stack contents at the time the process runs; when the kernel switches to another process it saves all of this, so the process can later resume exactly where it left off.
Similarly, when hardware raises an interrupt the kernel enters an interrupt handler in kernel space. The hardware hands some state over to the kernel, which the kernel uses to service the interrupt; the interrupt context can be understood as this hardware-provided state plus the environment the kernel must save, mainly that of the interrupted process.
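To make the threaded model concrete, here is a hedged sketch of registering a threaded interrupt with request_threaded_irq(); the handler and device names are hypothetical, and pci_request_irq() used above is a PCI convenience wrapper around the same mechanism:
/* Sketch: quick_check runs in hard-IRQ context and only decides whether
 * to wake the handler thread; slow_work runs in a kernel thread. */
static irqreturn_t quick_check(int irq, void *data)
{
	return IRQ_WAKE_THREAD;		/* defer the real work to slow_work() */
}
static irqreturn_t slow_work(int irq, void *data)
{
	/* process completions here; sleeping is allowed */
	return IRQ_HANDLED;
}
/* somewhere in setup code: */
ret = request_threaded_irq(irq, quick_check, slow_work, 0, "my_dev", data);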
nvme_pci_alloc_dev also calls nvme_init_ctrl; let's look at its definition:
/*
* Initialize a NVMe controller structures. This needs to be called during
* earliest initialization so that we have the initialized structured around
* during probing.
*/
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
int ret;
ctrl->state = NVME_CTRL_NEW;
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
xa_init(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
ctrl->numa_node = NUMA_NO_NODE;
INIT_WORK(&ctrl->scan_work, nvme_scan_work);
INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
init_waitqueue_head(&ctrl->state_wq);
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
PAGE_SIZE);
ctrl->discard_page = alloc_page(GFP_KERNEL);
if (!ctrl->discard_page) {
ret = -ENOMEM;
goto out;
}
ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
if (ret < 0)
goto out;
ctrl->instance = ret;
device_initialize(&ctrl->ctrl_device);
ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
ctrl->instance);
ctrl->device->class = nvme_class;
ctrl->device->parent = ctrl->dev;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
else
ctrl->device->groups = nvme_dev_attr_groups;
ctrl->device->release = nvme_free_ctrl;
dev_set_drvdata(ctrl->device, ctrl);
// 1. set device name with nvme%d
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
if (ret)
goto out_release_instance;
nvme_get_ctrl(ctrl);
cdev_init(&ctrl->cdev, &nvme_dev_fops);
ctrl->cdev.owner = ops->module;
ret = cdev_device_add(&ctrl->cdev, ctrl->device);
if (ret)
goto out_free_name;
/*
* Initialize latency tolerance controls. The sysfs files won't
* be visible to userspace unless the device actually supports APST.
*/
ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
dev_pm_qos_update_user_latency_tolerance(ctrl->device,
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
nvme_mpath_init_ctrl(ctrl);
ret = nvme_auth_init_ctrl(ctrl);
if (ret)
goto out_free_cdev;
return 0;
out_free_cdev:
cdev_device_del(&ctrl->cdev, ctrl->device);
out_free_name:
nvme_put_ctrl(ctrl);
kfree_const(ctrl->device->kobj.name);
out_release_instance:
ida_free(&nvme_instance_ida, ctrl->instance);
out:
if (ctrl->discard_page)
__free_page(ctrl->discard_page);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
nvme_init_ctrl calls dev_set_name to name the character device nvmeX; dev_set_name is defined as follows:
/**
* dev_set_name - set a device name
* @dev: device
* @fmt: format string for the device's name
*/
int dev_set_name(struct device *dev, const char *fmt, ...)
{
va_list vargs;
int err;
va_start(vargs, fmt);
err = kobject_set_name_vargs(&dev->kobj, fmt, vargs);
va_end(vargs);
return err;
}
EXPORT_SYMBOL_GPL(dev_set_name);
The X in nvmeX is the unique instance index allocated earlier from nvme_instance_ida via ida_alloc (stored in ctrl->instance); dev_set_name simply formats it into the kobject name through kobject_set_name_vargs.
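A hedged recap of the naming path inside nvme_init_ctrl (simplified from the listing above):
ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);	/* unique id: 0, 1, 2, ... */
ctrl->instance = ret;
dev_set_name(ctrl->device, "nvme%d", ctrl->instance);	/* -> "nvme0", "nvme1", ... */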
dev_set_name in turn calls kobject_set_name_vargs, defined as follows:
/**
* kobject_set_name_vargs() - Set the name of a kobject.
* @kobj: struct kobject to set the name of
* @fmt: format string used to build the name
* @vargs: vargs to format the string.
*/
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
va_list vargs)
{
const char *s;
if (kobj->name && !fmt)
return 0;
s = kvasprintf_const(GFP_KERNEL, fmt, vargs);
if (!s)
return -ENOMEM;
/*
* ewww... some of these buggers have '/' in the name ... If
* that's the case, we need to make sure we have an actual
* allocated copy to modify, since kvasprintf_const may have
* returned something from .rodata.
*/
if (strchr(s, '/')) {
char *t;
t = kstrdup(s, GFP_KERNEL);
kfree_const(s);
if (!t)
return -ENOMEM;
strreplace(t, '/', '!');
s = t;
}
kfree_const(kobj->name);
kobj->name = s;
return 0;
}
Back in nvme_probe, nvme_dev_map is used to obtain a virtual address for the PCI BAR:
static int nvme_dev_map(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pci_request_mem_regions(pdev, "nvme"))
return -ENODEV;
// NVME_REG_DBS = 0x1000  /* SQ 0 Tail Doorbell */
if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
goto release;
return 0;
release:
pci_release_mem_regions(pdev);
return -ENODEV;
}
nvme_dev_map calls pci_request_mem_regions, defined as follows:
static inline int
pci_request_mem_regions(struct pci_dev *pdev, const char *name)
{
return pci_request_selected_regions(pdev,
pci_select_bars(pdev, IORESOURCE_MEM), name);
}
pci_request_mem_regions calls pci_select_bars, defined as follows:
/**
* pci_select_bars - Make BAR mask from the type of resource
* @dev: the PCI device for which BAR mask is made
* @flags: resource type mask to be selected
*
* This helper routine makes bar mask from the type of resource.
*/
int pci_select_bars(struct pci_dev *dev, unsigned long flags)
{
int i, bars = 0;
for (i = 0; i < PCI_NUM_RESOURCES; i++)
if (pci_resource_flags(dev, i) & flags)
bars |= (1 << i);
return bars;
}
EXPORT_SYMBOL(pci_select_bars);
pci_select_bars returns a bitmask. Since a PCI device's configuration header has six 32-bit BAR registers, each bit of the mask indicates whether the corresponding BAR matches the requested resource type:
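A hedged sketch of how the returned mask can be interpreted (bit i is set when BAR i is a memory resource):
int bars = pci_select_bars(pdev, IORESOURCE_MEM);
int i;
for (i = 0; i < 6; i++)		/* only the six standard BARs */
	if (bars & (1 << i))
		dev_info(&pdev->dev, "BAR%d is a memory BAR\n", i);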
pci_request_mem_regions then calls pci_request_selected_regions, defined as follows:
/**
* pci_request_selected_regions - Reserve selected PCI I/O and memory resources
* @pdev: PCI device whose resources are to be reserved
* @bars: Bitmask of BARs to be requested
* @res_name: Name to be associated with resource
*/
int pci_request_selected_regions(struct pci_dev *pdev, int bars,
const char *res_name)
{
return __pci_request_selected_regions(pdev, bars, res_name, 0);
}
EXPORT_SYMBOL(pci_request_selected_regions);
pci_request_selected_regions takes the mask returned by pci_select_bars and reserves the corresponding BAR regions so that no other driver can claim them.
nvme_dev_map also calls nvme_remap_bar, defined as follows:
static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (size <= dev->bar_mapped_size)
return 0;
if (size > pci_resource_len(pdev, 0))
return -ENOMEM;
if (dev->bar)
iounmap(dev->bar);
// Map the I/O (BAR) address range into the kernel's virtual address space
dev->bar = ioremap(pci_resource_start(pdev, 0), size);
if (!dev->bar) {
dev->bar_mapped_size = 0;
return -ENOMEM;
}
dev->bar_mapped_size = size;
dev->dbs = dev->bar + NVME_REG_DBS;
return 0;
}
ioremap is used because in Linux we cannot access physical addresses directly; they must be mapped into virtual addresses, and that is exactly what ioremap does.
Once the mapping exists, accessing dev->bar lets us operate directly on the NVMe controller's registers. Note that the code does not use the pci_select_bars return value to decide which BAR to map; it simply maps BAR0, because the NVMe specification mandates that BAR0 is the base of the memory-mapped controller registers.
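A hedged sketch of what this mapping enables, e.g. reading the Version (VS) register at offset 0x08 of BAR0:
u32 vs = readl(dev->bar + NVME_REG_VS);		/* NVME_REG_VS == 0x08 */
dev_info(dev->ctrl.device, "NVMe spec version %u.%u\n",
	 (vs >> 16) & 0xffff, (vs >> 8) & 0xff);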
nvme_probe then calls nvme_setup_prp_pools to set up the PRP memory pools needed for DMA:
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
NVME_CTRL_PAGE_SIZE,
NVME_CTRL_PAGE_SIZE, 0);
if (!dev->prp_page_pool)
return -ENOMEM;
/* Optimisation for I/Os between 4k and 128k */
dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
256, 256, 0);
if (!dev->prp_small_pool) {
dma_pool_destroy(dev->prp_page_pool);
return -ENOMEM;
}
return 0;
}
nvme_setup_prp_pools mainly creates two DMA pools, from which memory can later be obtained through the dma_pool_* helpers.
prp_page_pool serves blocks of NVME_CTRL_PAGE_SIZE (e.g. 4 KB), while prp_small_pool serves 256-byte blocks; having both is an optimization for PRP lists of different lengths (the "I/Os between 4k and 128k" case mentioned in the comment).
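As a hedged sketch of how such a pool is used later in the I/O path (roughly what the driver's PRP-mapping code does), prp_dma receives the DMA address that goes into the command's PRP2 entry:
dma_addr_t prp_dma;
__le64 *prp_list = dma_pool_alloc(dev->prp_page_pool, GFP_ATOMIC, &prp_dma);
if (!prp_list)
	return -ENOMEM;
/* ... fill prp_list[] with the page-aligned DMA addresses of the data buffer ... */
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);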
nvme_probe next calls nvme_pci_enable (step 5 in the listing above):
static int nvme_pci_enable(struct nvme_dev *dev)
{
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
int dma_address_bits = 64;
// 1. Enable the device's memory space, i.e. the BAR region mapped earlier.
if (pci_enable_device_mem(pdev))
return result;
// 2. Enable bus mastering so the device can initiate transactions (DMA) on the PCI bus.
pci_set_master(pdev);
if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
dma_address_bits = 48;
// 3. Set the DMA addressing capability: 64 bits or 48 bits
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
goto disable;
// 4. Read the CSTS register to check the controller state
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
result = -ENODEV;
goto disable;
}
/*
* Some devices and/or platforms don't advertise or work with INTx
* interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
* adjust this later.
*/
// 5. Allocate an interrupt vector. NVMe devices support three interrupt modes: INTx/MSI/MSI-X.
result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
if (result < 0)
goto disable;
// Read the 64-bit Controller Capabilities (CAP) register
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
// Set the doorbell base address; 4096 is the 0x1000 offset of the SQ 0 Tail doorbell
dev->dbs = dev->bar + 4096;
/*
* Some Apple controllers require a non-standard SQE size.
* Interestingly they also seem to ignore the CC:IOSQES register
* so we don't bother updating it here.
*/
if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
dev->io_sqes = 7;
else
dev->io_sqes = NVME_NVM_IOSQES;
/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
dev->q_depth = 2;
dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
"set queue depth=%u to work around controller resets\n",
dev->q_depth);
} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
(pdev->device == 0xa821 || pdev->device == 0xa822) &&
NVME_CAP_MQES(dev->ctrl.cap) == 0) {
dev->q_depth = 64;
dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
"set queue depth=%u\n", dev->q_depth);
}
/*
* Controllers with the shared tags quirk need the IO queue to be
* big enough so that we get 32 tags for the admin queue
*/
if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
(dev->q_depth < (NVME_AQ_DEPTH + 2))) {
dev->q_depth = NVME_AQ_DEPTH + 2;
dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
dev->q_depth);
}
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
// 6. Map the controller's CMB (Controller Memory Buffer), if present, into the host address space
nvme_map_cmb(dev);
// 7. Enable PCIe error reporting (AER)
pci_enable_pcie_error_reporting(pdev);
// 8. Save the current PCI configuration space (used across suspend/reset)
pci_save_state(pdev);
// 9. Set up the admin queue
result = nvme_pci_configure_admin_queue(dev);
if (result)
goto free_irq;
return result;
free_irq:
pci_free_irq_vectors(pdev);
disable:
pci_disable_device(pdev);
return result;
}
nvme_pci_enable again calls pci_alloc_irq_vectors here; that function (and why the three interrupt modes are mutually exclusive) was already analyzed above and is not repeated.
nvme_pci_enable also calls pci_enable_pcie_error_reporting, defined as follows:
int pci_enable_pcie_error_reporting(struct pci_dev *dev)
{
int rc;
// If native PCIe AER handling is not available,
if (!pcie_aer_is_native(dev))
// EIO, I/O error
return -EIO;
rc = pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS);
return pcibios_err_to_errno(rc);
}
EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting);
pci_enable_pcie_error_reporting calls pcie_aer_is_native to check whether PCIe AER is natively handled by the OS:
int pcie_aer_is_native(struct pci_dev *dev)
{
// Find the pci_host_bridge this device sits under
struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
// aer_cap, AER capability offset
if (!dev->aer_cap)
return 0;
// native_aer=1, OS may use PCIe AER
return pcie_ports_native || host->native_aer;
}
If the device has no AER capability, it returns 0. Otherwise it returns non-zero when either pcie_ports_native is set or the host bridge grants native AER control to the OS (host->native_aer), i.e. when the OS rather than the firmware owns AER handling.
pci_enable_pcie_error_reporting then calls pcie_capability_set_word to set the PCI_EXP_AER_FLAGS bits in the PCI_EXP_DEVCTL register; it is built on pcie_capability_clear_and_set_word:
int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos,
u16 clear, u16 set)
{
int ret;
u16 val;
// Read the current value at the given offset in the PCIe capability, clear/set the requested bits, and write the result back with pcie_capability_write_word.
ret = pcie_capability_read_word(dev, pos, &val);
if (!ret) {
val &= ~clear;
val |= set;
ret = pcie_capability_write_word(dev, pos, val);
}
return ret;
}
EXPORT_SYMBOL(pcie_capability_clear_and_set_word);
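pcie_capability_set_word itself is essentially a thin inline wrapper over the function above, clearing nothing and setting only the requested bits:
static inline int pcie_capability_set_word(struct pci_dev *dev, int pos,
					   u16 set)
{
	return pcie_capability_clear_and_set_word(dev, pos, 0, set);
}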
Finally, pci_enable_pcie_error_reporting calls pcibios_err_to_errno to translate PCI-BIOS style error values into generic errno codes (defined in include/uapi/asm-generic/errno-base.h):
/* Error values that may be returned by PCI functions */
#define PCIBIOS_SUCCESSFUL 0x00
#define PCIBIOS_FUNC_NOT_SUPPORTED 0x81
#define PCIBIOS_BAD_VENDOR_ID 0x83
#define PCIBIOS_DEVICE_NOT_FOUND 0x86
#define PCIBIOS_BAD_REGISTER_NUMBER 0x87
#define PCIBIOS_SET_FAILED 0x88
#define PCIBIOS_BUFFER_TOO_SMALL 0x89
/* Translate above to generic errno for passing back through non-PCI code */
static inline int pcibios_err_to_errno(int err)
{
if (err <= PCIBIOS_SUCCESSFUL)
return err; /* Assume already errno */
switch (err) {
case PCIBIOS_FUNC_NOT_SUPPORTED:
return -ENOENT;
case PCIBIOS_BAD_VENDOR_ID:
return -ENOTTY;
case PCIBIOS_DEVICE_NOT_FOUND:
return -ENODEV;
case PCIBIOS_BAD_REGISTER_NUMBER:
return -EFAULT;
case PCIBIOS_SET_FAILED:
return -EIO;
case PCIBIOS_BUFFER_TOO_SMALL:
return -ENOSPC;
}
return -ERANGE;
}
nvme_pci_enable finally calls nvme_pci_configure_admin_queue, whose (annotated) definition is:
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
int result;
u32 aqa;
struct nvme_queue *nvmeq;
// 1. Remap the BAR into the kernel's virtual address space (large enough to cover the doorbells)
result = nvme_remap_bar(dev, db_bar_size(dev, 0));
if (result < 0)
return result;
// 2. Read NSSRS from the CAP register to determine NVM Subsystem Reset support
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
// 3. Clear the NSSRO bit by writing it back to the CSTS register
if (dev->subsystem &&
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
/*
* If the device has been passed off to us in an enabled state, just
* clear the enabled bit. The spec says we should set the 'shutdown
* notification bits', but doing so may cause the device to complete
* commands to the admin queue ... and we don't know what memory that
* might be pointing at!
*/
// 4. Disable the controller via nvme_disable_ctrl
result = nvme_disable_ctrl(&dev->ctrl, false);
if (result < 0)
return result;
// 5. Call nvme_alloc_queue.
// The admin queue has not been set up yet after the controller was disabled, so nvme_alloc_queue allocates its SQ/CQ memory.
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
if (result)
return result;
// Record the NUMA node this device belongs to
dev->ctrl.numa_node = dev_to_node(dev->dev);
nvmeq = &dev->queues[0];
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;
// 6. After allocation, write the admin queue attributes (AQA) and the admin SQ/CQ DMA addresses (ASQ/ACQ) into the controller registers.
writel(aqa, dev->bar + NVME_REG_AQA);
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
// 7. With the admin queue memory in place, re-enable the controller via nvme_enable_ctrl (essentially the reverse of nvme_disable_ctrl).
result = nvme_enable_ctrl(&dev->ctrl);
if (result)
return result;
// 8. Initialize the freshly allocated queue
nvmeq->cq_vector = 0;
nvme_init_queue(nvmeq, 0);
// 9. Request the interrupt via queue_request_irq; it installs the handler, by default in hard-IRQ context rather than as a threaded handler.
result = queue_request_irq(nvmeq);
if (result) {
dev->online_queues--;
return result;
}
// Mark the NVMe queue as enabled
set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
}
The step-by-step analysis of nvme_pci_configure_admin_queue (the CAP NSSRS bit, nvme_disable_ctrl/nvme_enable_ctrl, nvme_alloc_queue, writing AQA/ASQ/ACQ, nvme_init_queue and queue_request_irq, plus the threaded-interrupt discussion) is identical to the walkthrough given earlier and is not repeated here.
Back in nvme_probe, nvme_mq_admin_ops is passed to nvme_alloc_admin_tag_set for the admin tag set initialization:
static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
.init_request = nvme_pci_init_request,
.timeout = nvme_timeout,
};
Now back to nvme_alloc_admin_tag_set itself:
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
int ret;
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
if (ctrl->ops->flags & NVME_F_FABRICS)
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
set->flags = BLK_MQ_F_NO_SCHED;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
// 1. Allocate the tag set that will later be associated with request queues
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ret;
// 2. Initialize the hardware and software queues, set up the mapping between them, and store the resulting request queue in ctrl->admin_q.
ctrl->admin_q = blk_mq_init_queue(set);
if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->fabrics_q = blk_mq_init_queue(set);
if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q;
}
}
ctrl->admin_tagset = set;
return 0;
out_cleanup_admin_q:
blk_mq_destroy_queue(ctrl->admin_q);
blk_put_queue(ctrl->admin_q);
out_free_tagset:
blk_mq_free_tag_set(set);
ctrl->admin_q = NULL;
ctrl->fabrics_q = NULL;
return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
This function is at the heart of how the NVMe driver uses the Multi-Queue (MQ) block infrastructure, so before digging into it, let's talk about the Linux Multi-Queue Block Layer.
Multiple queues, native asynchrony and lock-free submission are NVMe's defining features. These performance-oriented designs pushed the Linux kernel (starting around 3.19 for NVMe) away from the old single-queue block layer toward the Multi-Queue Block Layer, whose architecture maps directly onto NVMe's multi-queue design, as shown below:
The Multi-Queue mechanism assigns separate block-layer submission queues to different CPU cores on multi-core systems, balancing the I/O workload across cores and greatly improving I/O efficiency for devices such as SSDs.
What does the Multi-Queue Block Layer look like? See the figure below:
The Multi-Queue Block Layer consists of two levels: Software Queues and Hardware Dispatch Queues.
Software Queues are per-core. Their number also depends on the protocol; NVMe, for example, allows up to 64K I/O SQ/CQ pairs. The responsibilities of the software-queue level are those marked in the figure above.
The number of Hardware Queues is decided by the low-level device driver: one or many. The maximum usually matches the maximum number of MSI-X vectors, up to 2K. The driver maintains the mapping between Software Queues and Hardware Queues through its map_queues callback.
Note that the numbers of Hardware Queues and Software Queues are not necessarily equal; the 1:1 mapping in the figure above is the ideal case.
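To tie this back to the code: a blk-mq driver fills a blk_mq_tag_set and then creates request queues from it, which is exactly the pattern nvme_alloc_admin_tag_set follows. A minimal hedged sketch, where my_mq_ops and struct my_iod are hypothetical driver-side names:
struct blk_mq_tag_set set = { };
struct request_queue *q;
int ret;

set.ops		 = &my_mq_ops;		/* queue_rq/complete/timeout callbacks */
set.nr_hw_queues = 4;			/* hardware dispatch queues */
set.queue_depth	 = 128;			/* tags (in-flight requests) per hw queue */
set.cmd_size	 = sizeof(struct my_iod); /* per-request driver private data */
set.numa_node	 = NUMA_NO_NODE;

ret = blk_mq_alloc_tag_set(&set);	/* allocate tags and per-cpu maps */
if (ret)
	return ret;
q = blk_mq_init_queue(&set);		/* software queues + sw->hw mapping */
if (IS_ERR(q)) {
	blk_mq_free_tag_set(&set);
	return PTR_ERR(q);
}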
That wraps up the quick review of the Multi-Queue Block Layer; now back to nvme_alloc_admin_tag_set.
nvme_alloc_admin_tag_set first uses blk_mq_alloc_tag_set:
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
* requested depth down, if it's too large. In that case, the set
* value will be stored in set->queue_depth.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
int i, ret;
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
if (!set->nr_hw_queues)
return -EINVAL;
if (!set->queue_depth)
return -EINVAL;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
return -EINVAL;
if (!set->ops->queue_rq)
return -EINVAL;
if (!set->ops->get_budget ^ !set->ops->put_budget)
return -EINVAL;
if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
pr_info("blk-mq: reduced tag depth to %u\n",
BLK_MQ_MAX_DEPTH);
set->queue_depth = BLK_MQ_MAX_DEPTH;
}
if (!set->nr_maps)
set->nr_maps = 1;
else if (set->nr_maps > HCTX_MAX_TYPES)
return -EINVAL;
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->nr_maps = 1;
set->queue_depth = min(64U, set->queue_depth);
}
/*
* There is no use for more h/w queues than cpus if we just have
* a single map
*/
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
if (set->flags & BLK_MQ_F_BLOCKING) {
set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
if (!set->srcu)
return -ENOMEM;
ret = init_srcu_struct(set->srcu);
if (ret)
goto out_free_srcu;
}
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues,
sizeof(struct blk_mq_tags *), GFP_KERNEL,
set->numa_node);
if (!set->tags)
goto out_cleanup_srcu;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
GFP_KERNEL, set->numa_node);
if (!set->map[i].mq_map)
goto out_free_mq_map;
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
}
blk_mq_update_queue_map(set);
ret = blk_mq_alloc_set_map_and_rqs(set);
if (ret)
goto out_free_mq_map;
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
out_free_mq_map:
for (i = 0; i < set->nr_maps; i++) {
kfree(set->map[i].mq_map);
set->map[i].mq_map = NULL;
}
kfree(set->tags);
set->tags = NULL;
out_cleanup_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(set->srcu);
out_free_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
kfree(set->srcu);
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
nvme_alloc_admin_tag_set then uses blk_mq_init_queue:
static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
{
struct request_queue *q;
int ret;
q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
blk_put_queue(q);
return ERR_PTR(ret);
}
return q;
}
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
blk_mq_poll_stats_bkt,
BLK_MQ_POLL_STATS_BKTS, q);
if (!q->poll_cb)
goto err_exit;
if (blk_mq_alloc_ctxs(q))
goto err_poll;
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
xa_init(&q->hctx_table);
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->tag_set = set;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
q->nr_requests = set->queue_depth;
/*
* Default to classic polling
*/
q->poll_nsec = BLK_MQ_POLL_CLASSIC;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
return 0;
err_hctxs:
blk_mq_release(q);
err_poll:
blk_stat_free_callback(q->poll_cb);
q->poll_cb = NULL;
err_exit:
q->mq_ops = NULL;
return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i, j;
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
if (old_hctx) {
old_node = old_hctx->numa_node;
blk_mq_exit_hctx(q, set, old_hctx, i);
}
if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
if (!old_hctx)
break;
pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
node, old_node);
hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
WARN_ON_ONCE(!hctx);
}
}
/*
* Increasing nr_hw_queues fails. Free the newly allocated
* hctxs and keep the previous q->nr_hw_queues.
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
} else {
j = i;
q->nr_hw_queues = set->nr_hw_queues;
}
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
mutex_unlock(&q->sysfs_lock);
}
Back in nvme_probe, nvme_init_ctrl_finish is used to finish initializing the NVMe controller structure:
/*
* Initialize the cached copies of the Identify data and various controller
* register in our nvme_ctrl structure. This should be called as soon as
* the admin queue is fully up and running.
*/
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
{
int ret;
// Read the NVMe version (VS register)
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
if (ret) {
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
}
ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
// NVMe 1.1 and later may support NVM Subsystem Reset
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
// 1. Read the Identify data
ret = nvme_init_identify(ctrl);
if (ret)
return ret;
ret = nvme_configure_apst(ctrl);
if (ret < 0)
return ret;
ret = nvme_configure_timestamp(ctrl);
if (ret < 0)
return ret;
ret = nvme_configure_host_options(ctrl);
if (ret < 0)
return ret;
nvme_configure_opal(ctrl, was_suspended);
if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
/*
* Do not return errors unless we are in a controller reset,
* the controller works perfectly fine without hwmon.
*/
ret = nvme_hwmon_init(ctrl);
if (ret == -EINTR)
return ret;
}
ctrl->identified = true;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
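As a side note on the VS check above: the NVMe Version register packs the major, minor and tertiary version numbers as (major << 16) | (minor << 8) | tertiary, which is also how the kernel's NVME_VS() macro is built. A small stand-alone illustration (the 1.4.0 value is just an example, not read from a real controller):
#include <stdio.h>

/* Same packing as the kernel's NVME_VS() macro: MJR[31:16] MNR[15:8] TER[7:0] */
#define NVME_VS(major, minor, tertiary) \
	(((major) << 16) | ((minor) << 8) | (tertiary))

int main(void)
{
	unsigned int vs = NVME_VS(1, 4, 0);   /* hypothetical 1.4.0 controller */

	printf("VS = 0x%08x -> NVMe %u.%u.%u\n", vs,
	       (vs >> 16) & 0xffff, (vs >> 8) & 0xff, vs & 0xff);

	/* Any controller reporting vs >= NVME_VS(1, 1, 0) then has its NSSRC
	 * capability bit checked, as in nvme_init_ctrl_finish() above. */
	return 0;
}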
nvme_init_ctrl_finish in turn calls nvme_init_identify:
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
bool prev_apst_enabled;
int ret;
// 1. Call nvme_identify_ctrl to read the Identify data
ret = nvme_identify_ctrl(ctrl, &id);
if (ret) {
dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
return -EIO;
}
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
if (ret < 0)
goto out_free;
}
if (!(ctrl->ops->flags & NVME_F_FABRICS))
ctrl->cntlid = le16_to_cpu(id->cntlid);
if (!ctrl->identified) {
unsigned int i;
/*
* Check for quirks. Quirk can depend on firmware version,
* so, in principle, the set of quirks present can change
* across a reset. As a possible future enhancement, we
* could re-scan for quirks every time we reinitialize
* the device, but we'd have to make sure that the driver
* behaves intelligently if the quirks change.
*/
for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
if (quirk_matches(id, &core_quirks[i]))
ctrl->quirks |= core_quirks[i].quirks;
}
ret = nvme_init_subsystem(ctrl, id);
if (ret)
goto out_free;
}
memcpy(ctrl->subsys->firmware_rev, id->fr,
sizeof(ctrl->subsys->firmware_rev));
if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
}
ctrl->crdt[0] = le16_to_cpu(id->crdt1);
ctrl->crdt[1] = le16_to_cpu(id->crdt2);
ctrl->crdt[2] = le16_to_cpu(id->crdt3);
ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->oncs = le16_to_cpu(id->oncs);
ctrl->mtfa = le16_to_cpu(id->mtfa);
ctrl->oaes = le32_to_cpu(id->oaes);
ctrl->wctemp = le16_to_cpu(id->wctemp);
ctrl->cctemp = le16_to_cpu(id->cctemp);
atomic_set(&ctrl->abort_limit, id->acl + 1);
ctrl->vwc = id->vwc;
if (id->mdts)
max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
else
max_hw_sectors = UINT_MAX;
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
// 2. Call nvme_set_queue_limits to apply the controller's limits (e.g. max_hw_sectors) to the admin queue
nvme_set_queue_limits(ctrl, ctrl->admin_q);
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
ctrl->ctratt = le32_to_cpu(id->ctratt);
ctrl->cntrltype = id->cntrltype;
ctrl->dctype = id->dctype;
if (id->rtd3e) {
/* us -> s */
u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
shutdown_timeout, 60);
if (ctrl->shutdown_timeout != shutdown_timeout)
dev_info(ctrl->device,
"Shutdown timeout set to %u seconds\n",
ctrl->shutdown_timeout);
} else
ctrl->shutdown_timeout = shutdown_timeout;
// 3. Initialize the APST (Autonomous Power State Transition) configuration
ctrl->npss = id->npss;
ctrl->apsta = id->apsta;
prev_apst_enabled = ctrl->apst_enabled;
if (ctrl->quirks & NVME_QUIRK_NO_APST) {
if (force_apst && id->apsta) {
dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
ctrl->apst_enabled = true;
} else {
ctrl->apst_enabled = false;
}
} else {
ctrl->apst_enabled = id->apsta;
}
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->icdoff = le16_to_cpu(id->icdoff);
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
/*
* In fabrics we need to verify the cntlid matches the
* admin connect
*/
if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
dev_err(ctrl->device,
"Mismatching cntlid: Connect %u vs Identify "
"%u, rejecting\n",
ctrl->cntlid, le16_to_cpu(id->cntlid));
ret = -EINVAL;
goto out_free;
}
if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
dev_err(ctrl->device,
"keep-alive support is mandatory for fabrics\n");
ret = -EINVAL;
goto out_free;
}
} else {
ctrl->hmpre = le32_to_cpu(id->hmpre);
ctrl->hmmin = le32_to_cpu(id->hmmin);
ctrl->hmminds = le32_to_cpu(id->hmminds);
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
}
// 4. For multi-port devices, initialize multipath state
ret = nvme_mpath_init_identify(ctrl, id);
if (ret < 0)
goto out_free;
if (ctrl->apst_enabled && !prev_apst_enabled)
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
out_free:
kfree(id);
return ret;
}
From the above, nvme_init_identify does four main things:
1. calls nvme_identify_ctrl to read the Identify data;
2. calls nvme_set_queue_limits to apply the controller's limits to the admin queue;
3. initializes the APST configuration;
4. for multi-port devices, initializes multipath via nvme_mpath_init_identify.
nvme_identify_ctrl looks like this:
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
struct nvme_command c = { };
int error;
// 1. Build the Identify command
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_CTRL;
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
if (!*id)
return -ENOMEM;
// 2. Submit the Identify command on the admin queue
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl));
if (error)
kfree(*id);
return error;
}
nvme_identify_ctrl first builds the Identify command (opcode 0x06), whose layout is:
struct nvme_identify {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
union nvme_data_ptr dptr;
__u8 cns;
__u8 rsvd3;
__le16 ctrlid;
__u8 rsvd11[3];
__u8 csi;
__u32 rsvd12[4];
};
Its opcode is fixed by the NVMe specification; in the kernel source it is defined as:
enum nvme_admin_opcode {
...
nvme_admin_identify = 0x06,
...
};
This matches the opcode value (0x06) that the NVMe specification assigns to Identify.
After an Identify command completes, the controller returns a 4 KB Identify data structure. That structure can describe either the controller or a namespace; which one depends on the CNS (Controller or Namespace Structure) byte.
Recall the main CNS values defined in the NVMe specification:
1. CNS = 0x00: Identify Namespace data structure;
2. CNS = 0x01: Identify Controller data structure;
3. CNS = 0x02: Active Namespace ID list.
NVME_ID_CNS_CTRL is defined in the source as:
enum {
...
NVME_ID_CNS_CTRL = 0x01,
...
};
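As an aside, the same Identify Controller command (opcode 0x06, CNS 0x01) can also be issued from user space through the NVMe admin passthrough ioctl. The sketch below is only an illustration of the command layout, not part of the driver; it assumes a controller node such as /dev/nvme0 exists and that the caller has the required privileges:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	void *buf;
	struct nvme_admin_cmd cmd;
	int fd = open("/dev/nvme0", O_RDONLY);       /* assumed controller node */

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	memset(&cmd, 0, sizeof(cmd));
	memset(buf, 0, 4096);
	cmd.opcode   = 0x06;                         /* nvme_admin_identify          */
	cmd.cdw10    = 0x01;                         /* CNS = 1: Identify Controller */
	cmd.addr     = (unsigned long)buf;
	cmd.data_len = 4096;                         /* 4 KB Identify data structure */

	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) == 0)
		/* the model number lives at byte offset 24 of struct nvme_id_ctrl */
		printf("MN: %.40s\n", (char *)buf + 24);

	close(fd);
	free(buf);
	return 0;
}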
nvme_identify_ctrl has now built the Identify command; how does the driver submit this admin command?
The submission goes through nvme_submit_sync_cmd, which ultimately lands in __nvme_submit_sync_cmd, shown below:
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
int qid, int at_head, blk_mq_req_flags_t flags)
{
struct request *req;
int ret;
// 1. Allocate a struct request from the request_queue, then attach the NVMe command to it via nvme_init_request
if (qid == NVME_QID_ANY)
req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
else
req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
qid - 1);
if (IS_ERR(req))
return PTR_ERR(req);
nvme_init_request(req, cmd);
// 2. blk_rq_map_kern ties the request to a bio and the bio to the kernel-space buffer
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
if (ret)
goto out;
}
// 3. nvme_execute_rq -> blk_execute_rq issues the request and waits for completion
ret = nvme_execute_rq(req, at_head);
if (result && ret >= 0)
*result = nvme_req(req)->result;
out:
blk_mq_free_request(req);
return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
__nvme_submit_sync_cmd first calls blk_mq_alloc_request (or blk_mq_alloc_request_hctx when a specific hardware queue is requested) to allocate a struct request from the request_queue; the allocation may be satisfied from the current plug's cached requests via blk_mq_alloc_cached_request and blk_mq_rq_cache_fill. The Identify command itself is then copied into the request by nvme_init_request.
The allocation path looks like this:
struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
blk_mq_req_flags_t flags)
{
struct request *rq;
// First try to take a request from the plug's cached list; otherwise fall back to a fresh allocation below
rq = blk_mq_alloc_cached_request(q, opf, flags);
if (!rq) {
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = opf,
.nr_tags = 1,
};
int ret;
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
rq = __blk_mq_alloc_requests(&data);
if (!rq)
goto out_queue_exit;
}
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
blk_opf_t opf,
blk_mq_req_flags_t flags)
{
struct blk_plug *plug = current->plug;
struct request *rq;
if (!plug)
return NULL;
if (rq_list_empty(plug->cached_rq)) {
if (plug->nr_ios == 1)
return NULL;
// Refill the plug's request cache (allocates plug->nr_ios requests in one go) and return one of them
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
if (!rq)
return NULL;
} else {
rq = rq_list_peek(&plug->cached_rq);
if (!rq || rq->q != q)
return NULL;
if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
return NULL;
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
plug->cached_rq = rq_list_next(rq);
}
rq->cmd_flags = opf;
INIT_LIST_HEAD(&rq->queuelist);
return rq;
}
static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
struct blk_plug *plug,
blk_opf_t opf,
blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = opf,
.nr_tags = plug->nr_ios,
.cached_rq = &plug->cached_rq,
};
struct request *rq;
if (blk_queue_enter(q, flags))
return NULL;
plug->nr_ios = 1;
rq = __blk_mq_alloc_requests(&data);
if (unlikely(!rq))
blk_queue_exit(q);
return rq;
}
If buffer and bufflen are non-zero, this admin command transfers data. Data transfer needs a bio, so blk_rq_map_kern is called to tie the request to a bio and the bio to the kernel-space buffer; after all, the block layer only understands bios, not raw kernel or user addresses.
/**
* blk_rq_map_kern - map kernel data to a request, for passthrough requests
* @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
* @len: length of user data
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly if possible. Otherwise a bounce
* buffer is used. Can be called multiple times to append multiple
* buffers.
*/
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
unsigned int len, gfp_t gfp_mask)
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
struct bio *bio;
int ret;
// Reject the buffer in two cases: it exceeds the queue's maximum transfer size, or the length/pointer is zero/NULL
if (len > (queue_max_hw_sectors(q) << 9))
return -EINVAL;
if (!len || !kbuf)
return -EINVAL;
// Tie a bio to the kernel buffer: bounce/copy it if it is misaligned or on the stack, otherwise map it directly
if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
blk_queue_may_bounce(q))
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
ret = blk_rq_append_bio(rq, bio);
if (unlikely(ret)) {
bio_uninit(bio);
kfree(bio);
}
return ret;
}
EXPORT_SYMBOL(blk_rq_map_kern);
/*
* Return values:
* 0: success
* >0: nvme controller's cqe status response
* <0: kernel error in lieu of controller response
*/
static int nvme_execute_rq(struct request *rq, bool at_head)
{
blk_status_t status;
// blk_execute_rq inserts the request into the queue and waits for it to complete
status = blk_execute_rq(rq, at_head);
if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
// #define EINTR 4 /* Interrupted system call */
return -EINTR;
if (nvme_req(rq)->status)
return nvme_req(rq)->status;
return blk_status_to_errno(status);
}
blk_execute_rq inserts the request at the head or tail of the queue and waits for it to complete; nvme_execute_rq then maps the outcome onto the 0 / >0 / <0 convention described in the comment above.
So how is the request inserted and waited on? As follows:
/**
* blk_execute_rq - insert a request into queue for execution
* @rq: request to insert
* @at_head: insert request at head or tail of queue
*
* Description:
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution and wait for completion.
* Return: The blk_status_t result provided to blk_mq_end_request().
*/
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{
struct blk_rq_wait wait = {
.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
};
WARN_ON(irqs_disabled());
WARN_ON(!blk_rq_is_passthrough(rq));
/* hook up a synchronous on-stack completion that blk_end_sync_rq() will signal */
rq->end_io_data = &wait;
rq->end_io = blk_end_sync_rq;
blk_account_io_start(rq);
blk_mq_sched_insert_request(rq, at_head, true, false);
if (blk_rq_is_poll(rq)) {
blk_rq_poll_completion(rq, &wait.done);
} else {
/*
* Prevent hang_check timer from firing at us during very long
* I/O
*/
unsigned long hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&wait.done,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&wait.done);
}
return wait.ret;
}
EXPORT_SYMBOL(blk_execute_rq);
Back in nvme_probe, nvme_setup_io_queues sets up the I/O queue structures:
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
unsigned int nr_io_queues;
unsigned long size;
int result;
/*
* Sample the module parameters once at reset time so that we have
* stable values to work with.
*/
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
// Each nvme_dev is one PCI function; one of the allocated queues is the admin queue, so the I/O queue budget is nr_allocated_queues - 1
nr_io_queues = dev->nr_allocated_queues - 1;
// 1. Send a Set Features command asking for nr_io_queues I/O queues
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
if (nr_io_queues == 0)
return 0;
/*
* Free IRQ resources as soon as NVMEQ_ENABLED bit transitions
* from set to unset. If there is a window to it is truely freed,
* pci_free_irq_vectors() jumping into this window will crash.
* And take lock to avoid racing with pci_free_irq_vectors() in
* nvme_dev_disable() path.
*/
result = nvme_setup_io_queues_trylock(dev);
if (result)
return result;
if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
pci_free_irq(pdev, 0, adminq);
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
if (result > 0) {
dev->q_depth = result;
dev->ctrl.sqsize = result - 1;
} else {
dev->cmb_use_sqes = false;
}
}
do {
size = db_bar_size(dev, nr_io_queues);
result = nvme_remap_bar(dev, size);
if (!result)
break;
if (!--nr_io_queues) {
result = -ENOMEM;
goto out_unlock;
}
} while (1);
adminq->q_db = dev->dbs;
retry:
/* Deregister the admin queue's interrupt */
if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
pci_free_irq(pdev, 0, adminq);
/*
* If we enable msix early due to not intx, disable it again before
* setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
result = nvme_setup_irqs(dev, nr_io_queues);
if (result <= 0) {
result = -EIO;
goto out_unlock;
}
dev->num_vecs = result;
result = max(result - 1, 1);
dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
/*
* Should investigate if there's a performance win from allocating
* more queues than interrupt vectors; it might allow the submission
* path to scale better, even if the receive path is limited by the
* number of interrupts.
*/
result = queue_request_irq(adminq);
if (result)
goto out_unlock;
set_bit(NVMEQ_ENABLED, &adminq->flags);
mutex_unlock(&dev->shutdown_lock);
// 2. With the number of I/O queues settled, nvme_create_io_queues does the real work of creating them
result = nvme_create_io_queues(dev);
if (result || dev->online_queues < 2)
return result;
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
nvme_delete_io_queues(dev);
result = nvme_setup_io_queues_trylock(dev);
if (result)
return result;
nvme_suspend_io_queues(dev);
goto retry;
}
dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
dev->io_queues[HCTX_TYPE_DEFAULT],
dev->io_queues[HCTX_TYPE_READ],
dev->io_queues[HCTX_TYPE_POLL]);
return 0;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
return result;
}
The flow has two main steps:
1. nvme_set_queue_count sends a Set Features command to negotiate the number of I/O queues;
2. once that number is settled, nvme_create_io_queues does the real work of creating them.
Within nvme_setup_io_queues, nvme_set_queue_count negotiates the number of I/O queues:
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
u32 q_count = (*count - 1) | ((*count - 1) << 16);
u32 result;
int status, nr_io_queues;
// Send a Set Features command with fid = NVME_FEAT_NUM_QUEUES and dword11 = q_count, and return its status
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
&result);
if (status < 0)
return status;
/*
* Degraded controllers might return an error when setting the queue
* count. We still want to be able to bring them online and offer
* access to the admin queue, as that might be only way to fix them up.
*/
if (status > 0) {
dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
*count = 0;
} else {
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
*count = min(*count, nr_io_queues);
}
return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
nvme_set_queue_count builds the Set Features command through nvme_set_features, shown below:
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
u32 *result)
{
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
buflen, result);
}
EXPORT_SYMBOL_GPL(nvme_set_features);
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
union nvme_result res = { 0 };
struct nvme_command c = { };
int ret;
c.features.opcode = op;
// cpu_to_le32 converts from host byte order to little-endian, keeping the code portable across architectures
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
// See the earlier description of nvme_submit_sync_cmd under nvme_identify_ctrl
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
buffer, buflen, NVME_QID_ANY, 0, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
}
nvme_set_features is a thin wrapper around nvme_features, passing in nvme_admin_set_features (the opcode), NVME_FEAT_NUM_QUEUES (the Feature ID) and q_count (Dword 11).
The opcode is defined in the source as:
enum nvme_admin_opcode {
...
nvme_admin_set_features = 0x09,
...
};
This matches the opcode value (0x09) that the NVMe specification assigns to Set Features.
The number of I/O queues is selected by Feature ID 0x07 (Number of Queues), and the requested counts are carried in Dword 11; in the source the Feature ID is defined as:
enum {
...
NVME_FEAT_NUM_QUEUES = 0x07,
...
};
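As a worked example of that Dword 11 encoding (the queue count below is made up): the Number of Queues feature carries NSQR in bits 15:0 and NCQR in bits 31:16, both as 0's-based values, which is exactly the (*count - 1) arithmetic in nvme_set_queue_count above:
#include <stdio.h>

int main(void)
{
	unsigned int count = 8;                               /* hypothetical: ask for 8 I/O queues */
	unsigned int q_count = (count - 1) | ((count - 1) << 16);

	/* NSQR = NCQR = 7 (0's based), i.e. dword11 = 0x00070007 */
	printf("dword11 = 0x%08x\n", q_count);

	/* The completion reports the allocated counts (NSQA/NCQA) in the same
	 * layout; the driver then uses min(NSQA, NCQA) + 1 usable I/O queues. */
	return 0;
}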
Note that c.features.fid and c.features.dword11 are assigned through cpu_to_le32. The NVMe specification lays out command fields in little-endian order, but the host running the driver may be little-endian x86 or a big-endian machine, so these helpers convert between host byte order and little-endian and keep the code portable; this is one of the conveniences Linux provides.
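A minimal sketch of what this conversion means, using glibc's htole32() as a user-space stand-in for the kernel's cpu_to_le32() (illustration only; the FID value is the Number of Queues feature from above):
#include <stdio.h>
#include <stdint.h>
#include <endian.h>

int main(void)
{
	uint32_t fid = 0x07;                     /* NVME_FEAT_NUM_QUEUES */

	/* On a little-endian host this is a no-op; on a big-endian host the
	 * bytes are swapped, so the value is stored in memory in the
	 * little-endian layout the NVMe command format requires. */
	uint32_t wire = htole32(fid);
	unsigned char *p = (unsigned char *)&wire;

	printf("bytes in memory: %02x %02x %02x %02x\n", p[0], p[1], p[2], p[3]);
	return 0;
}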
With the three key parameters of the Set Features command (opcode, FID, Dword 11) filled in, __nvme_submit_sync_cmd is called to execute it, completing the negotiation of the I/O queue count; see the description of __nvme_submit_sync_cmd above.
nvme_setup_io_queues then calls nvme_create_io_queues to create the I/O queues:
static int nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i, max, rw_queues;
int ret = 0;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
// 1. nvme_alloc_queue allocates the memory needed for the SQ/CQ
if (nvme_alloc_queue(dev, i, dev->q_depth)) {
ret = -ENOMEM;
break;
}
}
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
dev->io_queues[HCTX_TYPE_READ];
} else {
rw_queues = max;
}
for (i = dev->online_queues; i <= max; i++) {
bool polled = i > rw_queues;
// 2. nvme_create_queue actually creates the SQ/CQ on the controller
ret = nvme_create_queue(&dev->queues[i], i, polled);
if (ret)
break;
}
/*
* Ignore failing Create SQ/CQ commands, we can continue with less
* than the desired amount of queues, and even a controller without
* I/O queues can still be used to issue admin commands. This might
* be useful to upgrade a buggy firmware for example.
*/
return ret >= 0 ? 0 : ret;
}
As the code above shows, nvme_create_io_queues creates the I/O queues in two steps:
1. calls nvme_alloc_queue to allocate the memory needed for the SQ/CQ (see the earlier description of nvme_alloc_queue);
2. calls nvme_create_queue to actually create the SQ/CQ on the controller.
nvme_create_queue is where the SQ/CQ creation really happens:
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
u16 vector = 0;
clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
/*
* A queue's vector matches the queue identifier unless the controller
* has only one vector available.
*/
if (!polled)
vector = dev->num_vecs == 1 ? 0 : qid;
else
set_bit(NVMEQ_POLLED, &nvmeq->flags);
// 1. adapter_alloc_cq creates the completion queue
result = adapter_alloc_cq(dev, qid, nvmeq, vector);
if (result)
return result;
// adapter_alloc_sq creates the submission queue
result = adapter_alloc_sq(dev, qid, nvmeq);
if (result < 0)
return result;
if (result)
goto release_cq;
nvmeq->cq_vector = vector;
result = nvme_setup_io_queues_trylock(dev);
if (result)
return result;
// 2. nvme_init_queue initializes the newly created CQ/SQ
nvme_init_queue(nvmeq, qid);
if (!polled) {
// 3. queue_request_irq requests the interrupt for this queue
result = queue_request_irq(nvmeq);
if (result < 0)
goto release_sq;
}
set_bit(NVMEQ_ENABLED, &nvmeq->flags);
mutex_unlock(&dev->shutdown_lock);
return result;
release_sq:
dev->online_queues--;
mutex_unlock(&dev->shutdown_lock);
adapter_delete_sq(dev, qid);
release_cq:
adapter_delete_cq(dev, qid);
return result;
}
Within nvme_create_queue, adapter_alloc_cq and adapter_alloc_sq are what actually create the CQ and SQ:
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq, s16 vector)
{
struct nvme_command c = { };
int flags = NVME_QUEUE_PHYS_CONTIG;
if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
flags |= NVME_CQ_IRQ_ENABLED;
/*
* Note: we (ab)use the fact that the prp fields survive if no data
* is attached to the request.
*/
c.create_cq.opcode = nvme_admin_create_cq;
c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
c.create_cq.cqid = cpu_to_le16(qid);
c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_cq.cq_flags = cpu_to_le16(flags);
c.create_cq.irq_vector = cpu_to_le16(vector);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq)
{
struct nvme_ctrl *ctrl = &dev->ctrl;
struct nvme_command c = { };
int flags = NVME_QUEUE_PHYS_CONTIG;
/*
* Some drives have a bug that auto-enables WRRU if MEDIUM isn't
* set. Since URGENT priority is zeroes, it makes all queues
* URGENT.
*/
if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
flags |= NVME_SQ_PRIO_MEDIUM;
/*
* Note: we (ab)use the fact that the prp fields survive if no data
* is attached to the request.
*/
c.create_sq.opcode = nvme_admin_create_sq;
c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
c.create_sq.sqid = cpu_to_le16(qid);
c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_sq.sq_flags = cpu_to_le16(flags);
c.create_sq.cqid = cpu_to_le16(qid);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
The opcode, prp1, sqid and qsize fields of the Create I/O SQ command are similar to those of the Create I/O CQ command; the two worth highlighting are sq_flags and cqid.
sq_flags here is the PC (physically contiguous) flag OR'd with the QPRIO (queue priority) flag:
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
QPRIO (Queue Priority) describes the arbitration priority of the commands in an SQ. The NVMe specification does not fix the order in which commands placed into an SQ are executed; the controller may fetch several commands at once and process them in batches. Execution order is not fixed within a single SQ, nor across multiple SQs, which is where the command arbitration mechanisms defined by the NVMe specification come in.
Compared with the Create I/O CQ command, the Create I/O SQ command carries one extra field, cqid, because SQs and CQs are paired: I/O SQs and I/O CQs may be mapped one-to-one or many-to-one.
Once the six key parameters of the Create I/O Submission Queue command (opcode, PRP1, sqid, qsize, sq_flags, cqid) are filled in, __nvme_submit_sync_cmd is called to execute it and the queue is created; the execution path of __nvme_submit_sync_cmd was described above.
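A small, hedged worked example of the two fields just discussed (all values are made up): QSIZE is a 0's-based field, so a 1024-entry queue is encoded as 1023, and CQID names the completion queue this SQ posts to, here a 1:1 pairing:
#include <stdio.h>

int main(void)
{
	unsigned int qid     = 1;                 /* hypothetical I/O queue pair #1 */
	unsigned int entries = 1024;              /* hypothetical queue depth       */

	unsigned int qsize = entries - 1;         /* 0's based: 1024 entries -> 0x03ff */
	unsigned int cqid  = qid;                 /* SQ 1 completes into CQ 1 (1:1)    */

	printf("create-sq: sqid=%u qsize=0x%04x cqid=%u\n", qid, qsize, cqid);
	return 0;
}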
Back in nvme_probe, nvme_start_ctrl starts the controller:
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
nvme_start_keep_alive(ctrl);
// 1. Enable AEN (Asynchronous Event Notification)
nvme_enable_aen(ctrl);
/*
* persistent discovery controllers need to send indication to userspace
* to re-read the discovery log page to learn about possible changes
* that were missed. We identify persistent discovery controllers by
* checking that they started once before, hence are reconnecting back.
*/
// 2. For persistent discovery controllers that are reconnecting, tell userspace to re-read the discovery log page
if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
nvme_discovery_ctrl(ctrl))
nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
if (ctrl->queue_count > 1) {
// 3. Kick off namespace scanning and unquiesce the I/O queues
nvme_queue_scan(ctrl);
nvme_unquiesce_io_queues(ctrl);
nvme_mpath_update(ctrl);
}
nvme_change_uevent(ctrl, "NVME_EVENT=connected");
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
nvme_start_ctrl uses nvme_queue_scan to schedule the namespace scan:
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
/*
* Only new queue scan work when admin and IO queues are both alive
*/
// Queue scan_work on nvme_wq only if the controller is LIVE and its I/O tagset has been set up
if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
queue_work(nvme_wq, &ctrl->scan_work);
}
/**
* queue_work - queue work on a workqueue
* @wq: workqueue to use
* @work: work to queue
*
* Returns %false if @work was already on a queue, %true otherwise.
*
* We queue the work to the CPU on which it was submitted, but if the CPU dies
* it can be processed by another CPU.
*
* Memory-ordering properties: If it returns %true, guarantees that all stores
* preceding the call to queue_work() in the program order will be visible from
* the CPU which will execute @work by the time such work executes, e.g.,
*
* { x is initially 0 }
*
* CPU0 CPU1
*
* WRITE_ONCE(x, 1); [ @work is being executed ]
* r0 = queue_work(wq, work); r1 = READ_ONCE(x);
*
* Forbids: r0 == true && r1 == 0
*/
static inline bool queue_work(struct workqueue_struct *wq,
struct work_struct *work)
{
return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
/**
* queue_work_on - queue work on specific cpu
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @work: work to queue
*
* We queue the work to a specific CPU, the caller must ensure it
* can't go away. Callers that fail to ensure that the specified
* CPU cannot go away will execute on a randomly chosen CPU.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
bool ret = false;
unsigned long flags;
// Save the interrupt flags and disable local interrupts
local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_work(cpu, wq, work);
ret = true;
}
// Restore the saved interrupt flags
local_irq_restore(flags);
return ret;
}
EXPORT_SYMBOL(queue_work_on);
static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
struct worker_pool *last_pool;
struct list_head *worklist;
unsigned int work_flags;
unsigned int req_cpu = cpu;
/*
* While a work item is PENDING && off queue, a task trying to
* steal the PENDING will busy-loop waiting for it to either get
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
lockdep_assert_irqs_disabled();
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
rcu_read_lock();
retry:
/* pwq which will be used unless @work is executing elsewhere */
if (wq->flags & WQ_UNBOUND) {
if (req_cpu == WORK_CPU_UNBOUND)
cpu = wq_select_unbound_cpu(raw_smp_processor_id());
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
} else {
if (req_cpu == WORK_CPU_UNBOUND)
cpu = raw_smp_processor_id();
pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
}
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
last_pool = get_work_pool(work);
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
worker = find_worker_executing_work(last_pool, work);
if (worker && worker->current_pwq->wq == wq) {
pwq = worker->current_pwq;
} else {
/* meh... not running there, queue here */
raw_spin_unlock(&last_pool->lock);
raw_spin_lock(&pwq->pool->lock);
}
} else {
raw_spin_lock(&pwq->pool->lock);
}
/*
* pwq is determined and locked. For unbound pools, we could have
* raced with pwq release and it could already be dead. If its
* refcnt is zero, repeat pwq selection. Note that pwqs never die
* without another pwq replacing it in the numa_pwq_tbl or while
* work items are executing on it, so the retrying is guaranteed to
* make forward-progress.
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
raw_spin_unlock(&pwq->pool->lock);
cpu_relax();
goto retry;
}
/* oops */
WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
wq->name, cpu);
}
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
if (WARN_ON(!list_empty(&work->entry)))
goto out;
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
if (likely(pwq->nr_active < pwq->max_active)) {
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
if (list_empty(worklist))
pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_INACTIVE;
worklist = &pwq->inactive_works;
}
debug_work_activate(work);
// Insert the work item into the selected pool's worklist
insert_work(pwq, work, worklist, work_flags);
out:
raw_spin_unlock(&pwq->pool->lock);
rcu_read_unlock();
}
Down this path, __queue_work finally calls insert_work to put the work item on a worker pool's list:
/**
* insert_work - insert a work into a pool
* @pwq: pwq @work belongs to
* @work: work to insert
* @head: insertion point
* @extra_flags: extra WORK_STRUCT_* flags to set
*
* Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
* work_struct flags.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
{
struct worker_pool *pool = pwq->pool;
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack_noalloc(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
get_pwq(pwq);
if (__need_more_worker(pool))
wake_up_worker(pool);
}
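To summarize the workqueue API that scan_work rides on, here is a minimal, self-contained kernel-module sketch of the same INIT_WORK / queue_work / flush_work pattern (a toy example under assumed names such as demo_fn, not driver code):
#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work)
{
	pr_info("demo work executed\n");
}

/* statically initialized work item, equivalent to INIT_WORK() at runtime */
static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	queue_work(system_wq, &demo_work);   /* mark PENDING, hand it to a worker pool */
	flush_work(&demo_work);              /* wait for the queued instance to finish */
	return 0;
}

static void __exit demo_exit(void)
{
	flush_work(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");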
Back in nvme_probe, flush_work waits for the last queued instance of scan_work to finish executing:
/**
* flush_work - wait for a work to finish executing the last queueing instance
* @work: the work to flush
*
* Wait until @work has finished execution. @work is guaranteed to be idle
* on return if it hasn't been requeued since flush started.
*
* Return:
* %true if flush_work() waited for the work to finish execution,
* %false if it was already idle.
*/
bool flush_work(struct work_struct *work)
{
return __flush_work(work, false);
}
EXPORT_SYMBOL_GPL(flush_work);
flush_work in turn calls __flush_work:
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
struct wq_barrier barr;
if (WARN_ON(!wq_online))
return false;
if (WARN_ON(!work->func))
return false;
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
if (start_flush_work(work, &barr, from_cancel)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
return true;
} else {
return false;
}
}
__flush_work relies on start_flush_work:
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
bool from_cancel)
{
struct worker *worker = NULL;
struct worker_pool *pool;
struct pool_workqueue *pwq;
might_sleep();
rcu_read_lock();
pool = get_work_pool(work);
if (!pool) {
rcu_read_unlock();
return false;
}
raw_spin_lock_irq(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
if (unlikely(pwq->pool != pool))
goto already_gone;
} else {
worker = find_worker_executing_work(pool, work);
if (!worker)
goto already_gone;
pwq = worker->current_pwq;
}
check_flush_dependency(pwq->wq, work);
insert_wq_barrier(pwq, barr, work, worker);
raw_spin_unlock_irq(&pool->lock);
/*
* Force a lock recursion deadlock when using flush_work() inside a
* single-threaded or rescuer equipped workqueue.
*
* For single threaded workqueues the deadlock happens when the work
* is after the work issuing the flush_work(). For rescuer equipped
* workqueues the deadlock happens when the rescuer stalls, blocking
* forward progress.
*/
if (!from_cancel &&
(pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
}
rcu_read_unlock();
return true;
already_gone:
raw_spin_unlock_irq(&pool->lock);
rcu_read_unlock();
return false;
}
NVMe驱动系列文章 ↩︎
Linux NVMe Driver学习笔记之1:概述与nvme_core_init函数解析 ↩︎
linux ↩︎
bootlin ↩︎