Kernel version 3.13.
static int nvme_dev_start(struct nvme_dev *dev)
{
    int result;

    result = nvme_dev_map(dev); /* map the PCIe BAR space */
    if (result)
        return result;

    result = nvme_configure_admin_queue(dev); /* set up the admin queue */
    if (result)
        goto unmap;

    spin_lock(&dev_list_lock);
    list_add(&dev->node, &dev_list); /* add the device to the global dev_list (see note below) */
    spin_unlock(&dev_list_lock);

    result = nvme_setup_io_queues(dev); /* set up the I/O queues */
    if (result && result != -EBUSY)
        goto disable;

    return result;

 disable:
    spin_lock(&dev_list_lock);
    list_del_init(&dev->node);
    spin_unlock(&dev_list_lock);
 unmap:
    nvme_dev_unmap(dev);
    return result;
}
This function does four main things:
1. nvme_dev_map: the usual PCI BAR programming routine
2. configuring the admin queue
3. adding the device to dev_list: a global list declared in the same driver file (drivers/block/nvme-core.c, via LIST_HEAD); the nvme kernel thread walks this list to poll each device's completion queues
4. configuring the I/O queues
static int nvme_dev_map(struct nvme_dev *dev)
{
    int bars, result = -ENOMEM;
    struct pci_dev *pdev = dev->pci_dev;

    if (pci_enable_device_mem(pdev))
        return result;

    dev->entry[0].vector = pdev->irq;
    pci_set_master(pdev);
    bars = pci_select_bars(pdev, IORESOURCE_MEM);
    if (pci_request_selected_regions(pdev, bars, "nvme"))
        goto disable_pci;

    if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
        dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
        goto disable;

    pci_set_drvdata(pdev, dev);
    dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); /* BAR0: 4KB of registers plus 4KB of doorbells */
    if (!dev->bar)
        goto disable;

    dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap)); /* doorbell stride (DSTRD), CAP bits 35:32 */
    dev->dbs = ((void __iomem *)dev->bar) + 4096; /* doorbells start at offset 0x1000; the first is the admin SQ tail */
    return 0;

disable:
    pci_release_regions(pdev);
disable_pci:
    pci_disable_device(pdev);
    return result;
}
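For reference, dev->bar overlays the controller's register file at the start of BAR0. The layout below is struct nvme_bar as defined in include/linux/nvme.h in 3.13 (quoted from memory, worth double-checking against the tree); the doorbell registers begin at offset 0x1000, which is why dev->dbs is computed as bar + 4096.

struct nvme_bar {
    __u64   cap;    /* Controller Capabilities */
    __u32   vs;     /* Version */
    __u32   intms;  /* Interrupt Mask Set */
    __u32   intmc;  /* Interrupt Mask Clear */
    __u32   cc;     /* Controller Configuration */
    __u32   rsvd1;  /* Reserved */
    __u32   csts;   /* Controller Status */
    __u32   rsvd2;  /* Reserved */
    __u32   aqa;    /* Admin Queue Attributes */
    __u64   asq;    /* Admin SQ Base Address */
    __u64   acq;    /* Admin CQ Base Address */
};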
There is not much else to this function; the corresponding unmap is:
static void nvme_dev_unmap(struct nvme_dev *dev)
{
    if (dev->pci_dev->msi_enabled)
        pci_disable_msi(dev->pci_dev);
    else if (dev->pci_dev->msix_enabled)
        pci_disable_msix(dev->pci_dev);

    if (dev->bar) {
        iounmap(dev->bar);
        dev->bar = NULL;
    }

    pci_release_regions(dev->pci_dev);
    if (pci_is_enabled(dev->pci_dev))
        pci_disable_device(dev->pci_dev);
}
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
    int result;
    u32 aqa;
    u64 cap = readq(&dev->bar->cap);
    struct nvme_queue *nvmeq;

    result = nvme_disable_ctrl(dev, cap); /* per the spec, disable the controller before configuring the admin queue */
    if (result < 0)
        return result;

    nvmeq = dev->queues[0];
    if (!nvmeq) {
        nvmeq = nvme_alloc_queue(dev, 0, 64, 0); /* admin queue, depth 64 */
        if (!nvmeq)
            return -ENOMEM;
        dev->queues[0] = nvmeq; /* remember the admin queue */
    }

    aqa = nvmeq->q_depth - 1;
    aqa |= aqa << 16;

    /* enable bit plus NVM command set */
    dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
    /* memory page size, encoded as 2^(12 + MPS) */
    dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
    dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
    /* CQE and SQE sizes: 2^4 = 16 bytes and 2^6 = 64 bytes */
    dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

    writel(aqa, &dev->bar->aqa); /* admin SQ and CQ depths (0's based) */
    writeq(nvmeq->sq_dma_addr, &dev->bar->asq); /* DMA base address of the admin SQ */
    writeq(nvmeq->cq_dma_addr, &dev->bar->acq); /* DMA base address of the admin CQ */

    /*
     * CC (Controller Configuration) layout:
     *   31-24 : Reserved
     *   23-20 : I/O Completion Queue Entry Size (IOCQES)
     *   19-16 : I/O Submission Queue Entry Size (IOSQES)
     *   15-14 : Shutdown Notification (SHN)
     *   13-11 : Arbitration Mechanism Selected (AMS)
     *   10-7  : Memory Page Size (MPS)
     *   6-4   : I/O Command Set Selected (CSS)
     *   3-1   : Reserved
     *   0     : Enable (EN)
     */
    writel(dev->ctrl_config, &dev->bar->cc); /* this write enables the controller */

    /* poll CSTS.RDY (bit 0 of Controller Status) until the controller reports ready */
    result = nvme_enable_ctrl(dev, cap);
    if (result)
        return result;

    result = queue_request_irq(dev, nvmeq, "nvme admin");
    if (result)
        return result;

    spin_lock(&nvmeq->q_lock);
    nvme_init_queue(nvmeq, 0); /* initialize the admin queue state */
    spin_unlock(&nvmeq->q_lock);
    return result;
}
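The NVME_CC_* constants OR'd into ctrl_config above come from include/linux/nvme.h. Their 3.13 values, quoted from memory, are:

enum {
    NVME_CC_ENABLE    = 1 << 0,  /* CC.EN */
    NVME_CC_CSS_NVM   = 0 << 4,  /* select the NVM command set */
    NVME_CC_MPS_SHIFT = 7,       /* page size = 2^(12 + MPS) */
    NVME_CC_ARB_RR    = 0 << 11, /* round-robin arbitration */
    NVME_CC_SHN_NONE  = 0 << 14, /* no shutdown notification */
    NVME_CC_IOSQES    = 6 << 16, /* SQ entry size 2^6 = 64 bytes */
    NVME_CC_IOCQES    = 4 << 20, /* CQ entry size 2^4 = 16 bytes */
};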
The inline comments cover most of it; here is the code for the related helpers. The enable/disable helpers are really just writes and polls of a few register bits.
static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
    unsigned long timeout;
    u32 bit = enabled ? NVME_CSTS_RDY : 0;

    /* CAP.TO is in 500 ms units, so the deadline is (TO + 1) * HZ / 2 jiffies away */
    timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

    while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
        msleep(100);
        if (fatal_signal_pending(current)) /* the wait can be interrupted by fatal signals */
            return -EINTR;
        if (time_after(jiffies, timeout)) { /* true once jiffies has passed the deadline */
            dev_err(&dev->pci_dev->dev, "Device not ready; aborting initialisation\n");
            return -ENODEV;
        }
    }
    return 0;
}
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
    u32 cc = readl(&dev->bar->cc);

    if (cc & NVME_CC_ENABLE)
        writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); /* clear the enable bit */
    return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
    return nvme_wait_ready(dev, cap, true);
}
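The CAP field accessors and CSTS bits used by these helpers are also in include/linux/nvme.h; from memory, the relevant 3.13 definitions are the following. CAP.TO being in 500 ms units is exactly where the "* HZ / 2" in nvme_wait_ready comes from.

#define NVME_CAP_MQES(cap)    ((cap) & 0xffff)        /* max queue entries, 0's based */
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)  /* CSTS.RDY timeout, 500 ms units */
#define NVME_CAP_STRIDE(cap)  (((cap) >> 32) & 0xf)   /* doorbell stride (DSTRD) */

enum {
    NVME_CSTS_RDY = 1 << 0, /* controller ready */
    NVME_CSTS_CFS = 1 << 1, /* controller fatal status */
};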
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
                            int depth, int vector)
{
    struct device *dmadev = &dev->pci_dev->dev;
    unsigned extra = nvme_queue_extra(depth);
    struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);

    if (!nvmeq)
        return NULL;

    nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
                    &nvmeq->cq_dma_addr, GFP_KERNEL);
    if (!nvmeq->cqes)
        goto free_nvmeq;
    memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

    nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
                    &nvmeq->sq_dma_addr, GFP_KERNEL);
    if (!nvmeq->sq_cmds)
        goto free_cqdma;

    nvmeq->q_dmadev = dmadev;
    nvmeq->dev = dev;
    spin_lock_init(&nvmeq->q_lock); /* per-queue spinlock */
    nvmeq->cq_head = 0; /* CQ head index */
    nvmeq->cq_phase = 1; /* phase bit: tells whether the entry at cq_head is a new completion */
    init_waitqueue_head(&nvmeq->sq_full); /* wait-queue head, used when the SQ is full */
    init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); /* wait-queue entry for the nvme kernel thread */
    bio_list_init(&nvmeq->sq_cong); /* singly linked list of congested bios */
    nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; /* doorbell base for queue qid (SQ tail, then CQ head) */
    nvmeq->q_depth = depth; /* queue depth */
    nvmeq->cq_vector = vector; /* interrupt vector for this queue */
    nvmeq->q_suspended = 1; /* not initialized yet, so mark it suspended */
    dev->queue_count++; /* one more queue on this device */
    return nvmeq;

free_cqdma:
    dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
                    nvmeq->cq_dma_addr);
free_nvmeq:
    kfree(nvmeq);
    return NULL;
}
static unsigned nvme_queue_extra(int depth)
{
    /*
     * DIV_ROUND_UP(depth, 8): a bitmap with one bit per command id
     * (depth bits, rounded up to whole bytes), followed by one
     * struct nvme_cmd_info per command id.
     */
    return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
}
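These extra bytes live in the flexible cmdid_data[] array at the tail of struct nvme_queue: first the command-id bitmap, then the nvme_cmd_info array. The accessor in the same file (quoted from memory) skips past the bitmap, rounded up to whole unsigned longs, to reach the info array. For the admin queue's depth of 64, the bitmap is one unsigned long (8 bytes, matching DIV_ROUND_UP(64, 8)), followed by 64 nvme_cmd_info entries.

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
    /* the info array starts right after the bitmap of q_depth bits */
    return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}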
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
    struct nvme_dev *dev = nvmeq->dev;
    unsigned extra = nvme_queue_extra(nvmeq->q_depth);

    nvmeq->sq_tail = 0;
    nvmeq->cq_head = 0;
    nvmeq->cq_phase = 1;
    nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
    memset(nvmeq->cmdid_data, 0, extra);
    memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
    nvme_cancel_ios(nvmeq, false);
    nvmeq->q_suspended = 0;
}
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
    int depth = nvmeq->q_depth - 1;
    struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
    unsigned long now = jiffies;
    int cmdid;

    /* walk the command-id bitmap, visiting every set bit, i.e. every outstanding command */
    for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
        void *ctx;
        nvme_completion_fn fn;
        static struct nvme_completion cqe = {
            .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), /* status code 0x07, shifted past the phase bit */
        };

        /*
         * With timeout == false (as in nvme_init_queue), every outstanding
         * command is cancelled unconditionally. With timeout == true, a
         * command is cancelled only once time_after(now, info[cmdid].timeout)
         * returns true, i.e. it has actually expired.
         */
        if (timeout && !time_after(now, info[cmdid].timeout))
            continue;
        if (info[cmdid].ctx == CMD_CTX_CANCELLED) /* already cancelled; its bit just has not been cleared yet */
            continue;

        dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
        ctx = cancel_cmdid(nvmeq, cmdid, &fn);
        fn(nvmeq->dev, ctx, &cqe);
    }
}
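cancel_cmdid, in the same file, retrieves the saved context and completion callback for the slot and poisons it as cancelled; a sketch from memory (special_completion and CMD_CTX_CANCELLED are the driver's handler and poison value for such slots):

static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
                            nvme_completion_fn *fn)
{
    void *ctx;
    struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

    if (fn)
        *fn = info[cmdid].fn;
    ctx = info[cmdid].ctx;
    info[cmdid].fn = special_completion;  /* handler for aborted/cancelled slots */
    info[cmdid].ctx = CMD_CTX_CANCELLED;  /* poison value marking the slot cancelled */
    return ctx;
}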
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
    struct pci_dev *pdev = dev->pci_dev;
    int result, cpu, i, vecs, nr_io_queues, size, q_depth;

    nr_io_queues = num_online_cpus(); /* ask for one I/O queue per online CPU */
    result = set_queue_count(dev, nr_io_queues);
    if (result < 0)
        return result;
    if (result < nr_io_queues)
        nr_io_queues = result; /* the controller granted fewer queues than requested */

    size = db_bar_size(dev, nr_io_queues);
    if (size > 8192) { /* nvme_dev_map only mapped 8192 bytes; a larger doorbell area needs a remap */
        iounmap(dev->bar); /* drop the old mapping first */
        do {
            dev->bar = ioremap(pci_resource_start(pdev, 0), size); /* remap at the new size */
            if (dev->bar) /* mapped on the first try in the common case */
                break;
            if (!--nr_io_queues) /* otherwise shrink nr_io_queues until the mapping succeeds */
                return -ENOMEM;
            size = db_bar_size(dev, nr_io_queues); /* recompute the size for the reduced queue count */
        } while (1);
        /* the admin queue's doorbell pointers must be recomputed against the new mapping */
        dev->dbs = ((void __iomem *)dev->bar) + 4096;
        dev->queues[0]->q_db = dev->dbs;
    }

    /* release the admin queue's interrupt; it is re-requested below on the final vector */
    free_irq(dev->entry[0].vector, dev->queues[0]);

    vecs = nr_io_queues;
    /* initialize the MSI-X entries */
    for (i = 0; i < vecs; i++)
        dev->entry[i].entry = i;

    for (;;) {
        /*
         * Try to enable vecs MSI-X vectors. Zero means success; a
         * positive return is the number of vectors actually available,
         * so retry with that.
         */
        result = pci_enable_msix(pdev, dev->entry, vecs);
        if (result <= 0)
            break;
        vecs = result;
    }

    if (result < 0) { /* MSI-X failed outright; fall back to MSI, capped at 32 vectors */
        vecs = nr_io_queues;
        if (vecs > 32)
            vecs = 32;
        for (;;) {
            result = pci_enable_msi_block(pdev, vecs);
            if (result == 0) {
                for (i = 0; i < vecs; i++)
                    dev->entry[i].vector = i + pdev->irq;
                break;
            } else if (result < 0) {
                vecs = 1; /* not even MSI; fall back to a single legacy vector */
                break;
            }
            vecs = result;
        }
    }

    /*
     * Should investigate whether allocating more queues than interrupt
     * vectors is a performance win: it might let the submission path
     * scale better even though the completion path is limited by the
     * number of vectors.
     */
    nr_io_queues = vecs;

    result = queue_request_irq(dev, dev->queues[0], "nvme admin");
    if (result) {
        dev->queues[0]->q_suspended = 1;
        goto free_queues;
    }

    /*
     * Free previously allocated queues that are no longer usable, i.e.
     * those with qid > nr_io_queues. During nvme_dev_start only the admin
     * queue exists at this point, so this loop should not normally run.
     */
    spin_lock(&dev_list_lock);
    for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
        struct nvme_queue *nvmeq = dev->queues[i];

        spin_lock(&nvmeq->q_lock);
        nvme_cancel_ios(nvmeq, false);
        spin_unlock(&nvmeq->q_lock);

        nvme_free_queue(nvmeq);
        dev->queue_count--;
        dev->queues[i] = NULL;
    }
    spin_unlock(&dev_list_lock);

    /* pin each queue's vector to a CPU, round-robin over the online CPUs */
    /* see https://zhuanlan.zhihu.com/p/163850501 for background on IRQ affinity */
    cpu = cpumask_first(cpu_online_mask);
    for (i = 0; i < nr_io_queues; i++) {
        irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
        cpu = cpumask_next(cpu, cpu_online_mask);
    }

    /*
     * Maximum Queue Entries Supported (MQES), NVMe spec page 45: the
     * largest individual queue size the controller supports. For
     * PCIe-based NVMe this applies to host-created I/O submission and
     * completion queues alike; for NVMe over Fabrics it applies only to
     * I/O submission queues. This is a 0's-based value; the minimum is
     * 1h, i.e. two entries.
     */
    q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, NVME_Q_DEPTH);

    for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
        dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
        if (!dev->queues[i + 1]) {
            result = -ENOMEM;
            goto free_queues;
        }
    }

    /*
     * Map any remaining CPUs onto the existing queues.
     * rounddown_pow_of_two rounds its argument down to the nearest
     * power of two.
     */
    for (; i < num_possible_cpus(); i++) {
        int target = i % rounddown_pow_of_two(dev->queue_count - 1);
        dev->queues[i + 1] = dev->queues[target + 1];
    }

    /* create the I/O queues on the controller; qids start at 1 */
    for (i = 1; i < dev->queue_count; i++) {
        result = nvme_create_queue(dev->queues[i], i);
        if (result) {
            for (--i; i > 0; i--)
                nvme_disable_queue(dev, i);
            goto free_queues;
        }
    }
    return 0;

free_queues:
    nvme_free_queues(dev);
    return result;
}
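db_bar_size, used above to decide whether BAR0 must be remapped, accounts for the 4KB register area plus one SQ-tail and one CQ-head doorbell per queue (the admin queue plus nr_io_queues), each doorbell being (4 << DSTRD) bytes. From memory it is roughly:

static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
    /* 4KB of registers + a pair of (4 << DSTRD)-byte doorbells per queue */
    return 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
}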
static int set_queue_count(struct nvme_dev *dev, int count)
{
    int status;
    u32 result, q_count = (count - 1) | ((count - 1) << 16);

    /*
     * Set the number of queues (excluding the admin queue) via Set
     * Features, NVMe spec page 212:
     *   bits 15:0  - Number of I/O Submission Queues Requested (NSQR)
     *   bits 31:16 - Number of I/O Completion Queues Requested (NCQR)
     * Both fields are 0's based, so q_count == 0 requests one queue of
     * each kind. The raw value 65535 is an error; the maximum that may
     * be specified is 65534, i.e. 65535 queues.
     */
    status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, &result);
    if (status)
        return status < 0 ? -EIO : -EBUSY;
    /* the controller returns the granted counts in the same 0's-based format */
    return min(result & 0xffff, result >> 16) + 1;
}
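set_queue_count is a thin wrapper around the Set Features admin command. nvme_set_features, in the same file, builds and submits it on the admin queue; a sketch from memory:

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
                    dma_addr_t dma_addr, u32 *result)
{
    struct nvme_command c;

    memset(&c, 0, sizeof(c));
    c.features.opcode = nvme_admin_set_features;
    c.features.prp1 = cpu_to_le64(dma_addr);
    c.features.fid = cpu_to_le32(fid);         /* NVME_FEAT_NUM_QUEUES here */
    c.features.dword11 = cpu_to_le32(dword11); /* NSQR | (NCQR << 16) */

    /* synchronous submission on the admin queue; *result gets CQE dword 0 */
    return nvme_submit_admin_cmd(dev, &c, result);
}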
set_queue_count itself needs no further explanation; the comments cover it. That wraps up this walkthrough of nvme_dev_start.