A virtio device can be implemented on top of different transports; this article covers virtio-pci, the PCI-based implementation. Taking virtio-blk as the example, we first walk through the PCI configuration space and the hardware foundation of virtio-pci, the PCI capability, and then analyze PCI device initialization followed by virtio-pci device initialization.
The capabilities pointer field (offset 0x34) holds the starting address of the additional register groups; the address is expressed as an offset within the device's PCI configuration space. The capabilities list bit of the status field marks that the device has additional register groups beyond the 64-byte predefined configuration space, in which case the capabilities pointer stores the head of the linked list of register groups, again as an offset within the configuration space.

A capabilities list entry has the following format: the first byte holds the capability ID, identifying which kind of capability the following configuration space implements, and the second byte holds the offset of the next capability. Capability IDs are listed in Appendix H of PCI spec 3.0. virtio-blk implements two kinds of capability: MSI-X (Message Signaled Interrupts - eXtension), ID 0x11, and Vendor Specific, ID 0x09. The latter exists so that vendors can implement functionality of their own, and virtio-blk builds its implementation on it.

In the capability layout, each capability points to a region of physical address space (shown on the left of the layout figure). The core functionality of a virtio-pci device, namely initialization, front-end/back-end notification and data transfer, is implemented through these five capabilities.

The most important structure here is virtio_pci_common_cfg, the main bridge between the virtio front end and back end. The common config is split into two parts: the first is used for device configuration, the second for virtqueue operation. During initialization the virtio driver uses the first part to negotiate with the back end, for example the supported features (guest_feature), the device status during initialization (device_status), and the number of virtqueues the device offers (num_queues). The second part is used for data transfer between front end and back end. The role both parts play in virtio initialization and data transfer is covered in detail later. The virtio_pci_common_cfg data structure is shown below.
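For reference, a sketch of the layout as defined in the Linux uapi header include/uapi/linux/virtio_pci.h (field names follow the virtio 1.0 spec; the exact definition may differ slightly between kernel versions):

struct virtio_pci_common_cfg {
    /* About the whole device. */
    __le32 device_feature_select;   /* read-write */
    __le32 device_feature;          /* read-only */
    __le32 guest_feature_select;    /* read-write */
    __le32 guest_feature;           /* read-write */
    __le16 msix_config;             /* read-write */
    __le16 num_queues;              /* read-only */
    __u8 device_status;             /* read-write */
    __u8 config_generation;         /* read-only */

    /* About a specific virtqueue. */
    __le16 queue_select;            /* read-write */
    __le16 queue_size;              /* read-write, power of 2 */
    __le16 queue_msix_vector;       /* read-write */
    __le16 queue_enable;            /* read-write */
    __le16 queue_notify_off;        /* read-only */
    __le32 queue_desc_lo;           /* read-write */
    __le32 queue_desc_hi;           /* read-write */
    __le32 queue_avail_lo;          /* read-write */
    __le32 queue_avail_hi;          /* read-write */
    __le32 queue_used_lo;           /* read-write */
    __le32 queue_used_hi;           /* read-write */
};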
PCI configuration space is accessed by first writing the device's bus/slot/function address to the IO port at 0xCF8, then reading or writing the data through the IO port at 0xCFC. The initialization of the 0xCF8 address space happens in pci_arch_init (a minimal sketch of this access method follows the call chain below). The PCI subsystem then initializes and enumerates the bus tree along the following path:

pci_subsys_init
    x86_init.pci.init => x86_default_pci_init
        pci_legacy_init
            pcibios_scan_root
                x86_pci_root_bus_resources // allocate resources for the Host bridge; normally the 64K IO space and the memory space are carved up here
                pci_scan_root_bus // enumerate the devices on the bus tree
                    pci_create_root_bus // create the Host bridge
                    pci_scan_child_bus // scan every device on the bus tree, recursing into any PCI bridge found
                        pci_scan_slot
                            pci_scan_single_device // scan the device, read its vendor id and device id
                                pci_scan_device
                                    pci_setup_device
                                        pci_read_bases
                                            __pci_read_base // read the BAR space size
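As mentioned above, config-space accesses go through the legacy 0xCF8/0xCFC ports (PCI configuration mechanism #1). Below is a minimal sketch of a config dword read in that style, covering standard (non-extended) registers only; conf1_read_dword is a made-up name for illustration, the real x86 routine being pci_conf1_read() in arch/x86/pci/direct.c:

/* Sketch of PCI configuration mechanism #1: select the target register by
 * writing bus/devfn/register to port 0xCF8, then read the data from 0xCFC. */
#define PCI_CONF1_ADDR(bus, devfn, reg) \
    (0x80000000 | ((bus) << 16) | ((devfn) << 8) | ((reg) & 0xFC))

static u32 conf1_read_dword(u8 bus, u8 devfn, u8 reg)
{
    outl(PCI_CONF1_ADDR(bus, devfn, reg), 0xCF8); /* bus/slot/function + register offset */
    return inl(0xCFC);                            /* read the selected config dword */
}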
/*
 * Resources are tree-like, allowing
 * nesting etc..
 */
struct resource {
    resource_size_t start;
    resource_size_t end;
    const char *name;
    unsigned long flags;
    unsigned long desc;
    struct resource *parent, *sibling, *child;
};
A struct resource represents one resource, either an IO address range or a memory address range. Each time a device is enumerated on the bus tree, the Host bridge allocates a suitably sized resource to that PCI device according to the size of its BAR space; the resource here is a physical address range in IO or memory space, and the value later programmed into the PCI device's BAR register comes from this allocation. The request flow is as follows:
pci_read_bases
    /* Iterate over every BAR register, read it, and request a physical address range for it */
    for (pos = 0; pos < howmany; pos++) {
        struct resource *res = &dev->resource[pos]; // the requested address range is stored here
        reg = PCI_BASE_ADDRESS_0 + (pos << 2);
        pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
    }

    region.start = l64;
    region.end = l64 + sz64;
    /* Request the resource; the result goes into res, while region holds the BAR's PCI bus address range */
    pcibios_bus_to_resource(dev->bus, res, &region);
Looking at the resource lookup function: it first obtains the Host bridge the PCI device sits under; the pci_host_bridge.windows list holds all the resource windows managed by that Host bridge. The function walks the windows list, finds the window that covers the BAR region, and uses it to fill in the resource handed to the PCI device.
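A simplified sketch of that window walk, modeled on pcibios_bus_to_resource() in drivers/pci/host-bridge.c (error handling trimmed); on x86 the window offset is 0, so the PCI bus address and the CPU address come out identical:

void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res,
                             struct pci_bus_region *region)
{
    struct pci_host_bridge *bridge = pci_find_host_bridge(bus);
    struct resource_entry *window;
    resource_size_t offset = 0;

    /* Walk the host bridge's windows and find the one covering this BAR region */
    resource_list_for_each_entry(window, &bridge->windows) {
        struct pci_bus_region bus_region;

        if (resource_type(res) != resource_type(window->res))
            continue;

        /* Express the window in PCI bus addresses */
        bus_region.start = window->res->start - window->offset;
        bus_region.end = window->res->end - window->offset;

        if (bus_region.start <= region->start && bus_region.end >= region->end) {
            offset = window->offset;
            break;
        }
    }

    /* Translate the PCI bus address range into a CPU-domain resource */
    res->start = region->start + offset;
    res->end = region->end + offset;
}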
At this point the PCI device has a physical address in the PCI domain, recorded in its struct pci_dev (shown below). Once scanning is finished, the kernel configures this physical address into each PCI device in turn.
/*
 * The pci_dev structure is used to describe PCI devices.
 */
struct pci_dev {
    struct list_head bus_list;      /* node in per-bus list */
    struct pci_bus *bus;            /* bus this device is on */
    struct pci_bus *subordinate;    /* bus this device bridges to */
    void *sysdata;                  /* hook for sys-specific extension */
    struct proc_dir_entry *procent; /* device entry in /proc/bus/pci */
    struct pci_slot *slot;          /* Physical slot this device is in */
    unsigned int devfn;             /* encoded device & function index */
    unsigned short vendor;
    unsigned short device;
    unsigned short subsystem_vendor;
    unsigned short subsystem_device;
    unsigned int class;             /* 3 bytes: (base,sub,prog-if) */
    u8 revision;                    /* PCI revision, low byte of class word */
    u8 hdr_type;                    /* PCI header type (`multi' flag masked out) */
#ifdef CONFIG_PCIEAER
    u16 aer_cap;                    /* AER capability offset */
#endif
    u8 pcie_cap;                    /* PCIe capability offset */
    u8 msi_cap;                     /* MSI capability offset */
    u8 msix_cap;                    /* MSI-X capability offset */
    u8 pcie_mpss:3;                 /* PCIe Max Payload Size Supported */
    u8 rom_base_reg;                /* which config register controls the ROM */
    u8 pin;                         /* which interrupt pin this device uses */
    u16 pcie_flags_reg;             /* cached PCIe Capabilities Register */
    unsigned long *dma_alias_mask;  /* mask of enabled devfn aliases */
    struct pci_driver *driver;      /* which driver has allocated this device */
    u64 dma_mask;                   /* Mask of the bits of bus address this
                                       device implements. Normally this is
                                       0xffffffff. You only need to change
                                       this if your device has broken DMA
                                       or supports 64-bit transfers. */
    ......
};
It does so by writing the start of the allocated address range into the device's BAR registers, which completes the configuration. The flow is as follows:

pci_subsys_init
    pcibios_resource_survey
        pcibios_allocate_bus_resources(&pci_root_buses); // first split the resources into per-bus ranges
        pcibios_allocate_resources(0); // check that the resources are consistent and conflict-free
        pcibios_allocate_resources(1);
        pcibios_assign_resources(); // write the addresses into the BAR registers
            pci_assign_resource
                _pci_assign_resource
                    __pci_assign_resource
                        pci_bus_alloc_resource
                pci_update_resource
                    pci_std_update_resource
                        pci_write_config_dword(dev, reg, new) // write the start address into the BAR register
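A condensed sketch of that final BAR write, loosely following pci_std_update_resource(); write_bar is a made-up helper name, and ROM BARs, error checks and the decode-disable handling are omitted:

static void write_bar(struct pci_dev *dev, int resno)
{
    struct resource *res = &dev->resource[resno];
    struct pci_bus_region region;
    int reg = PCI_BASE_ADDRESS_0 + 4 * resno;

    /* Convert the CPU-domain resource back into a PCI bus address */
    pcibios_resource_to_bus(dev->bus, &region, res);

    /* Low 32 bits of the start address go into the BAR itself */
    pci_write_config_dword(dev, reg, lower_32_bits(region.start));

    /* A 64-bit memory BAR stores its upper half in the following dword */
    if (res->flags & IORESOURCE_MEM_64)
        pci_write_config_dword(dev, reg + 4, upper_32_bits(region.start));
}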
The address written into the BAR register looks at first glance like a system physical address, but it is in fact a PCI-domain physical address, which is not the same thing as a CPU-domain physical address. Addresses in the two domains have to be translated by the Host bridge; it just happens that on x86 the Host bridge takes the lazy route and maps the two one-to-one, so the two address spaces look identical. On other architectures (e.g. PowerPC) the addresses differ.
The allocated IO and memory ranges, and the guest's PCI devices as seen by QEMU, can be inspected with:

cat /proc/ioports
cat /proc/iomem
virsh qemu-monitor-command vm --hmp info pci
In the Linux device driver model the PCI bus is represented by pci_bus_type:

struct bus_type pci_bus_type = {
    .name   = "pci",
    .match  = pci_bus_match,
    .uevent = pci_uevent,
    .probe  = pci_device_probe,
    .remove = pci_device_remove,
    ......
};
pci_driver_init
    bus_register(&pci_bus_type) // register the pci bus data structure
        priv->subsys.kobj.kset = bus_kset; // point at the kset representing the top-level bus directory
        priv->devices_kset = kset_create_and_add("devices", NULL, &priv->subsys.kobj);
        priv->drivers_kset = kset_create_and_add("drivers", NULL, &priv->subsys.kobj);
The resulting pci directory looks like this:
The devices and drivers directories created under it look like this:
The virtio-pci driver is defined and registered as follows:

static struct pci_driver virtio_pci_driver = {
    .name     = "virtio-pci",
    .id_table = virtio_pci_id_table,
    .probe    = virtio_pci_probe,
    .remove   = virtio_pci_remove,
    ......
};

module_pci_driver(virtio_pci_driver);

pci_register_driver
    __pci_register_driver
int __pci_register_driver(struct pci_driver *drv, struct module *owner,
                          const char *mod_name)
{
    /* initialize common driver fields */
    drv->driver.name = drv->name;
    drv->driver.bus = &pci_bus_type;   // point the driver's bus at pci_bus_type
    drv->driver.owner = owner;
    drv->driver.mod_name = mod_name;
    drv->driver.groups = drv->groups;

    spin_lock_init(&drv->dynids.lock);
    INIT_LIST_HEAD(&drv->dynids.list);

    /* register with core */
    return driver_register(&drv->driver);   // hand the driver over to the driver core
}
driver_register
    driver_find // check whether the same driver already exists on the bus, preventing double registration
    bus_add_driver
        driver_create_file(drv, &driver_attr_uevent) // create the uevent attribute file under the virtio-pci directory
        add_bind_files(drv) // create the bind/unbind attribute files under the virtio-pci directory
/* The driver_attr_uevent variable is defined via the macro below; the other driver attribute files are similar */
static DRIVER_ATTR_WO(uevent);

#define DRIVER_ATTR_WO(_name) \
    struct driver_attribute driver_attr_##_name = __ATTR_WO(_name)
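For completeness, __ATTR_WO roughly expands as below (from include/linux/sysfs.h; the exact mode constant varies between kernel versions), so DRIVER_ATTR_WO(uevent) produces a driver_attr_uevent whose store callback is uevent_store:

#define __ATTR_WO(_name) {                                        \
    .attr  = { .name = __stringify(_name), .mode = S_IWUSR },     \
    .store = _name##_store,                                       \
}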
The unbind attribute file deserves a closer look: it gives user space an interface for detaching a driver. When the user writes a PCI device address (for example "0000:00:04.0") into the unbind file, the kernel unbinds that device from its driver, the inverse of binding. The corresponding unbind handler is unbind_store, shown below:
/* Manually detach a device from its associated driver. */
static ssize_t unbind_store(struct device_driver *drv, const char *buf,
                            size_t count)
{
    struct bus_type *bus = bus_get(drv->bus);      // the bus this driver is registered on
    struct device *dev;
    int err = -ENODEV;

    dev = bus_find_device_by_name(bus, NULL, buf); // look up the kernel device whose name is stored in buf
    if (dev && dev->driver == drv) {               // make sure the device is bound to this very driver
        if (dev->parent)    /* Needed for USB */
            device_lock(dev->parent);
        device_release_driver(dev);                // unbind!!!
        if (dev->parent)
            device_unlock(dev->parent);
        err = count;
    }
    put_device(dev);
    bus_put(bus);
    return err;
}

static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, S_IWUSR, NULL, unbind_store);

#define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
    struct driver_attribute driver_attr_##_name = \
        __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)
Coming back to virtio-pci: once the driver registration flow has completed, sysfs contains a new directory and attribute files for the virtio-pci driver, as follows:
When a device is later added to the bus, the driver core walks all drivers registered on that bus and asks the bus to match each of them against the device, using:

int bus_for_each_drv(struct bus_type *bus, struct device_driver *start,
                     void *data, int (*fn)(struct device_driver *, void *));
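The per-driver callback reaches the bus-specific matching logic through driver_match_device() in the driver core, which for PCI ends up in pci_bus_match; shown here essentially as it appears in drivers/base/base.h:

static inline int driver_match_device(struct device_driver *drv,
                                      struct device *dev)
{
    /* A bus without a match callback accepts every device */
    return drv->bus->match ? drv->bus->match(dev, drv) : 1;
}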
pci_bus_match
    pci_match_device
        pci_match_id
            pci_match_one_device
static inline const struct pci_device_id *
pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
{
    if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
        (id->device == PCI_ANY_ID || id->device == dev->device) &&
        (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) &&
        (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) &&
        !((id->class ^ dev->class) & id->class_mask))
        return id;
    return NULL;
}
In pci_match_one_device, the first argument is the ID structure hard-coded when the driver was registered and the second is the PCI device. When the PCI driver specifies PCI_ANY_ID for a field, any value matches. Looking at the virtio_pci_id_table set up when virtio_pci_driver registers (below), the driver only pins the vendor id, so any device whose vendor id is 0x1af4 matches. The vendor id has already been read from the PCI device's configuration space during enumeration, so every virtio device, whatever its type, can bind to the virtio-pci driver:

static const struct pci_device_id virtio_pci_id_table[] = {
    { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
    { 0 }
};
#define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4
#define PCI_DEVICE(vend,dev) \
    .vendor = (vend), .device = (dev), \
    .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
Once the PCI bus has matched a device with a driver, the driver core points the device structure's driver pointer at that driver, tying the two together, and then calls the probe function in device_driver to probe the PCI device. Here that is virtio_pci_probe, the function specified by virtio_pci_driver. The main steps of probe are:
virtio_pci_driver.probe
    virtio_pci_probe
        pci_enable_device

pci_enable_device
    pci_enable_device_flags(dev, IORESOURCE_MEM | IORESOURCE_IO) // enable memory and IO access
        do_pci_enable_device
            pcibios_enable_device
                pci_enable_resources
                    pci_write_config_word(dev, PCI_COMMAND, cmd) // set the enable bits in the COMMAND register
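What pci_enable_resources() ultimately does is turn on IO and memory decoding in the COMMAND register for the BARs in use; a trimmed-down sketch (renamed, with the checks for unassigned or bridge resources dropped):

static int pci_enable_resources_sketch(struct pci_dev *dev, int mask)
{
    u16 cmd, old_cmd;
    int i;

    pci_read_config_word(dev, PCI_COMMAND, &cmd);
    old_cmd = cmd;

    for (i = 0; i < PCI_NUM_RESOURCES; i++) {
        struct resource *r = &dev->resource[i];

        if (!(mask & (1 << i)) || !r->flags)
            continue;
        if (r->flags & IORESOURCE_IO)
            cmd |= PCI_COMMAND_IO;      /* enable IO space decoding */
        if (r->flags & IORESOURCE_MEM)
            cmd |= PCI_COMMAND_MEMORY;  /* enable memory space decoding */
    }

    if (cmd != old_cmd)
        pci_write_config_word(dev, PCI_COMMAND, cmd); /* the write seen in the call chain above */
    return 0;
}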
The capability probing entry point is virtio_pci_modern_probe; for a legacy-mode device the entry point is virtio_pci_legacy_probe. Here we follow the modern probe.
virtio_pci_probe
    virtio_pci_modern_probe
        virtio_pci_find_capability
            pci_find_capability(dev, PCI_CAP_ID_VNDR)
                pos = __pci_bus_find_cap_start // determine the list entry point: for an ordinary pci device this is 0x34, where the offset of the capability list head is stored
                pos = __pci_find_next_cap // walk the capability list and return the configuration-space offset of the first cap of type PCI_CAP_ID_VNDR
                    __pci_find_next_cap_ttl
virtio_pci_find_capability first calls pci_find_capability to look for a capability of type PCI_CAP_ID_VNDR (0x09), the vendor-specific capability type defined by the PCI spec. Before the search it determines the entry point of the capability list in the configuration space: it checks whether the PCI device implements capabilities at all, and if so, for an ordinary device or a PCI bridge the list pointer sits at configuration-space offset 0x34, while for a CardBus device it sits at offset 0x14. Starting from that entry point, it walks the list, checking each cap's type field, until it finds a cap of type PCI_CAP_ID_VNDR, and returns that cap's offset within the configuration space. The key code and a diagram of the whole process follow:
/**
 * virtio_pci_find_capability - walk capabilities to find device info.
 * @dev: the pci device
 * @cfg_type: the VIRTIO_PCI_CAP_* value we seek
 * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO.
 *
 * Returns offset of the capability, or 0.
 */
static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
                                             u32 ioresource_types, int *bars)
{
    int pos;

    /* walk the vendor-specific caps; pos is each cap's offset in the configuration space */
    for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
         pos > 0;
         pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
        u8 type, bar;

        /* read the cfg_type member of the virtio_pci_cap structure */
        pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, cfg_type), &type);
        /* read the bar member of the virtio_pci_cap structure */
        pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, bar), &bar);

        /* Ignore structures with reserved BAR values */
        if (bar > 0x5)
            continue;

        /* if this is the type we want, return the cap's offset in the configuration space */
        if (type == cfg_type) {
            if (pci_resource_len(dev, bar) &&
                pci_resource_flags(dev, bar) & ioresource_types) {
                *bars |= (1 << bar);
                return pos;
            }
        }
    }
    return 0;
}
/* This is the PCI capability header: */
struct virtio_pci_cap {
    __u8 cap_vndr;    /* Generic PCI field: PCI_CAP_ID_VNDR */
    __u8 cap_next;    /* Generic PCI field: next ptr. */
    __u8 cap_len;     /* Generic PCI field: capability length */
    __u8 cfg_type;    /* Identifies the structure. */
    __u8 bar;         /* Where to find it. */
    __u8 padding[3];  /* Pad to full dword. */
    __le32 offset;    /* Offset within bar. */
    __le32 length;    /* Length of the structure, in bytes. */
};
For virtio-blk, the device-specific configuration capability (VIRTIO_PCI_CAP_DEVICE_CFG) points at the virtio_blk_config structure. virtio_pci_modern_probe looks up each capability in turn:
common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG,
                                    IORESOURCE_IO | IORESOURCE_MEM,
                                    &vp_dev->modern_bars);
isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
                                 IORESOURCE_IO | IORESOURCE_MEM,
                                 &vp_dev->modern_bars);
notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG,
                                    IORESOURCE_IO | IORESOURCE_MEM,
                                    &vp_dev->modern_bars);

/* Device capability is only mandatory for devices that have
 * device-specific configuration.
 */
device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG,
                                    IORESOURCE_IO | IORESOURCE_MEM,
                                    &vp_dev->modern_bars);
map_capability maps the BAR space into the kernel's virtual address space (the 3G-4G kernel range on 32-bit x86):

vp_dev->common = map_capability(pci_dev, common,
                                sizeof(struct virtio_pci_common_cfg), 4,
                                0, sizeof(struct virtio_pci_common_cfg),
                                NULL);
vp_dev->device = map_capability(pci_dev, device, 0, 4,
                                0, PAGE_SIZE,
                                &vp_dev->device_len);
map_capability
    pci_iomap_range(dev, bar, offset, length)
        if (flags & IORESOURCE_IO) // the BAR implements IO space: map it into the CPU's IO address space
            return __pci_ioport_map(dev, start, len);
        if (flags & IORESOURCE_MEM) // the BAR implements memory space: map it into the CPU's memory address space
            return ioremap(start, len);
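Before calling pci_iomap_range, map_capability first reads the bar/offset/length fields out of the virtio_pci_cap it was handed; a reduced sketch (the overflow and alignment checks of the real function in virtio_pci_modern.c are omitted, and the helper is renamed):

static void __iomem *map_capability_sketch(struct pci_dev *dev, int off,
                                           u32 start, u32 size, size_t *len)
{
    u8 bar;
    u32 offset, length;

    /* Which BAR the structure lives in, and where inside that BAR */
    pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap, bar), &bar);
    pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset), &offset);
    pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length), &length);

    offset += start;            /* caller-requested offset inside the structure */
    if (size && length > size)
        length = size;          /* never map more than the caller asked for */
    if (len)
        *len = length;

    /* Map that window of the BAR into kernel virtual address space */
    return pci_iomap_range(dev, bar, offset, length);
}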
/* Again, we don't know how much we should map, but PAGE_SIZE
 * is more than enough for all existing devices.
 */
if (device) {
    vp_dev->device = map_capability(pci_dev, device, 0, 4,
                                    0, PAGE_SIZE,
                                    &vp_dev->device_len);
    if (!vp_dev->device)
        goto err_map_device;

    vp_dev->vdev.config = &virtio_pci_config_ops;   // register the configuration space operations
} else {
    vp_dev->vdev.config = &virtio_pci_config_nodev_ops;
}

vp_dev->config_vector = vp_config_vector;
vp_dev->setup_vq = setup_vq;    // register the virtqueue setup function
vp_dev->del_vq = del_vq;
register_virtio_device registers the device on the virtio bus, which can trigger the match operation on that bus and then the probing of the virtio device. Taking virtio-blk as the example, the flow is:

virtio_pci_probe
    pci_enable_device
    virtio_pci_modern_probe
    register_virtio_device
        dev->dev.bus = &virtio_bus // set virtio_device.dev.bus to the virtio bus!!!
        dev->config->reset(dev) // reset the virtio device
        virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE) // set the device status to ACKNOWLEDGE: we have noticed this virtio device
        device_register(&dev->dev) // register the device on the virtio bus, triggering the bus match
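The match operation on the virtio bus is much simpler than the PCI one: virtio_dev_match() in drivers/virtio/virtio.c just compares the virtio device ID against the driver's id_table, essentially:

static inline int virtio_id_match(const struct virtio_device *dev,
                                  const struct virtio_device_id *id)
{
    if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID)
        return 0;

    return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor;
}

static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
{
    unsigned int i;
    struct virtio_device *dev = dev_to_virtio(_dv);
    const struct virtio_device_id *ids = drv_to_virtio(_dr)->id_table;

    /* A zero device ID terminates the driver's id_table */
    for (i = 0; ids[i].device; i++)
        if (virtio_id_match(dev, &ids[i]))
            return 1;
    return 0;
}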
For reference, the virtio PCI device IDs. Modern devices use device ID 0x1040 plus the virtio device type:

1af4:1041 network device (modern)
1af4:1042 block device (modern)
1af4:1043 console device (modern)
1af4:1044 entropy generator device (modern)
1af4:1045 balloon device (modern)
1af4:1048 SCSI host bus adapter device (modern)
1af4:1049 9p filesystem device (modern)
1af4:1050 virtio gpu device (modern)
1af4:1052 virtio input device (modern)
Legacy (transitional) device IDs:
#define PCI_DEVICE_ID_VIRTIO_NET 0x1000
#define PCI_DEVICE_ID_VIRTIO_BLOCK 0x1001
#define PCI_DEVICE_ID_VIRTIO_BALLOON 0x1002
#define PCI_DEVICE_ID_VIRTIO_CONSOLE 0x1003
#define PCI_DEVICE_ID_VIRTIO_SCSI 0x1004
#define PCI_DEVICE_ID_VIRTIO_RNG 0x1005
#define PCI_DEVICE_ID_VIRTIO_9P 0x1009
#define PCI_DEVICE_ID_VIRTIO_VSOCK 0x1012
#define PCI_DEVICE_ID_VIRTIO_PMEM 0x1013
#define PCI_DEVICE_ID_VIRTIO_IOMMU 0x1014
#define PCI_DEVICE_ID_VIRTIO_MEM 0x1015