Linux PCI Express
配置空间读写内核实现
PCI-E是用来互联如计算和通信平台应用中外围设备的第三代高性能I/O总线。PCI-E采用了与PCI相同的使用模型和读写(load-store)通信模型,支持各种常见的事务,如存储器读/写、IO读/写和配置读/写事务。其存储器、IO和配置地址空间与PCI的地址空间相同。PCI Express与PCI系统是软件向后兼容的。
PCI-E的配置空间大小为4096字节,如图1所示。其中前256字节是与PCI兼容的配置寄存器,该区域可以用以下两种机制访问:
· PCI配置访问机制。
· PCI Express增强型配置机制。
图1 PCI-E配置空间
Memory-mapped I/O (MMIO)与Port I/O
MMIO和port I/O(也称为port-mapped I/O或PMIO)是两种CPU与外设之间进行I/O操作的方式。
Port I/O是通过特殊的CPU指令来进行I/O操作,在x86架构上,可以通过指令in和out在特定的端口上进行I/O读写。I/O设备拥有与内存不同的地址空间,实现的方式是通过在CPU上额外的I/O pin或者将整个总线赋予端口。
MMIO即内存映射I/O,它是PCI规范的一部分,I/O设备被放置在内存空间而不是I/O空间。从处理器角度看,内存映射I/O后系统设备访问起来和内存一样。这样访问AGP/PCI-E显卡上的帧缓存、BIOS、PCI设备就可以使用与读写内存一样的汇编指令完成,降低了程序设计的难度和接口的复杂性。
对软件人员来说,MMIO比Port I/O更方便使用。
在用户空间,可以使用lspci和setpci两个命令来查看/修改PCI及PCI-E配置空间。用户命令的执行结果,最终是由内核来决定的。那么我们关心一个问题:内核是如何真正去读取和修改配置空间的?
Linux内核提供了以下PCI/PCI-E配置空间访问接口,在驱动编写过程中,我们可以直接使用下面这些函数:
· pci_{read,write}_config_byte()
· pci_{read,write}_config_word()
· pci_{read,write}_config_dword()
函数的定义在文件include/linux/pci.h中。
00513: static inline int pci_read_config_byte(struct pci_dev *dev, int where,
00513: u8 *val)
00514: {
00515: return pci_bus_read_config_byte (dev- >bus, dev- >devfn, where, val);
00516: }
00517: static inline int pci_read_config_word(struct pci_dev *dev, int where,
00517: u16 *val)
00518: {
00519: return pci_bus_read_config_word (dev- >bus, dev- >devfn, where, val);
00520: }
00521: static inline int pci_read_config_dword(struct pci_dev *dev,
00521: int where, u32*val)
00522: {
00523: return pci_bus_read_config_dword (dev- >bus, dev- >devfn, where, val);
00524: }
00525: static inline int pci_write_config_byte(struct pci_dev *dev, int where,
00525: u8 val)
00526: {
00527: return pci_bus_write_config_byte(dev- >bus, dev- >devfn, where, val);
00528: }
00529: static inline int pci_write_config_word(struct pci_dev *dev,
00529: int where, u16 val)
00530: {
00531: return pci_bus_write_config_word (dev- >bus, dev- >devfn, where, val);
00532: }
00533: static inline int pci_write_config_dword(struct pci_dev *dev,
00533: int where,u32 val)
00534: {
00535: return pci_bus_write_config_dword(dev- >bus, dev- >devfn, where, val);
00536: }
在PCI/PCI-E配置空间读写API接口中,我们看到它们是对pci_bus_{read,write}_config_{byte, word, dword}的封装。这些函数在drivers/pci/access.c中以宏的方式定义。
00024: #define PCI_OP_READ(size,type,len) \
00025: int pci_bus_read_config_##size \
00026: (struct pci_bus *bus , unsigned int devfn, int pos , type *value) \
00027: { \
00028: int res ; \
00029: unsigned long flags ; \
00030: u32 data = 0; \
00031: if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \
00032: spin_lock_irqsave(&pci_lock, flags ); \
00033: res = bus - >ops- >read(bus , devfn, pos , len, &data ); \
00034: *value = (type)data ; \
00035: spin_unlock_irqrestore(&pci_lock, flags ); \
00036: return res ; \
00037: }
00038:
00039: #define PCI_OP_WRITE(size,type,len) \
00040: int pci_bus_write_config_##size \
00041: (struct pci_bus *bus , unsigned int devfn, int pos , type value) \
00042: { \
00043: int res ; \
00044: unsigned long flags ; \
00045: if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \
00046: spin_lock_irqsave(&pci_lock, flags ); \
00047: res = bus - >ops- >write(bus , devfn, pos , len, value); \
00048: spin_unlock_irqrestore(&pci_lock, flags ); \
00049: return res ; \
00050: }
00059: EXPORT_SYMBOL(pci_bus_read_config_byte);
00060: EXPORT_SYMBOL(pci_bus_read_config_word);
00061: EXPORT_SYMBOL(pci_bus_read_config_dword);
00062: EXPORT_SYMBOL(pci_bus_write_config_byte);
00063: EXPORT_SYMBOL(pci_bus_write_config_word);
00064: EXPORT_SYMBOL(pci_bus_write_config_dword);
pci_bus_{read,write}_config_{byte, word, dword}()等函数,调用的是bus->ops->write、bus->ops->read方法。显然,现在的bus总线是PCI/PCI-E,我们就关注内核定义PCI/PCI-E总线的读写操作方法。
注:Linux内核没有专门将PCI-E列为一种总线,而是将PCI-E合并到PCI总线中。
PCI总线读写方法为pci_root_ops,对应的读写函数分别为pci_read()、pci_write()。实现在文件arch/i386/pci/common.c中。
00036: static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32
00036: *value)
00037: {
00038: return raw_pci_ops - >read(pci_domain_nr(bus), bus- >number,
00039: devfn, where, size, value);
00040: }
00041:
00042: static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size,
00042: u32 value)
00043: {
00044: return raw_pci_ops - >write(pci_domain_nr(bus), bus- >number,
00045: devfn, where, size, value);
00046: }
00047:
00048: struct pci_ops pci_root_ops = {
00049: .read = pci_read,
00050: .write = pci_write,
00051: };
pci_read()、pci_write()依赖于raw_pci_ops全局变量。
内核在启动时,会执行pci_access_init()函数,在文件arch/i386/pci/init.c中。该函数中,确定了raw_pci_ops值。
00005: / * arch_initcall has too randomordering, so call theinitializers
00006: in the right sequence from here. */
00007: static __init int pci_access_init(void)
00008: {
00009: #ifdef CONFIG_PCI_MMCONFIG
00010: pci_mmcfg_init();
00011: #endif
00012: dmi_check_pciprobe();
00013:
00014: if (raw_pci_ops )
00015: return 0;
00016:
00017: #ifdef CONFIG_PCI_BIOS
00018: pci_pcbios_init();
00019: #endif
00020: / *
00021: * don't check for raw_pci_ops here because we want pcbios as last
00022: * fallback, yet it'sneeded to run first to set pcibios_last_bus
00023: * in case legacy PCI probingis used. otherwise detecting peer busses
00024: * fails.
00025: */
00026: #ifdef CONFIG_PCI_DIRECT
00027: pci_direct_init();
00028: #endif
00029: return 0;
00030: } ? end pci_access_init ?
00031: arch_initcall(pci_access_init);
对于访问PCI空间,通过Port I/O方式则可以实现完全访问。但要访问全部的PCI-E配置空间,则需要MMIO方式。MMIO方式访问,则需要Linux内核支持。在编译内核时,选中以下选项即可。
Bus options (PCI etc.) --->
--- PCI support
[*] Support mmconfig PCI config space access
即需要选中“Support mmconfig PCI config space access”。若没有选中该项,则用户通过lspci或setpci命令,访问不到PCI-E的扩展配置空间(第256~4095字节)。
为了访问PCI-E扩展配置空间,pci_access_init()函数会调用pci_mmcfg_init()。于是将raw_pci_ops的值设为pci_mmcfg,代码都在文件arch/i386/pci/mmconfig.c中。
00152: void __init pci_mmcfg_init(void)
00153: {
... ...
00173: raw_pci_ops = &pci_mmcfg ;
... ...
00176: } ? end pci_mmcfg_init ?
00147: static struct pci_raw_ops pci_mmcfg = {
00148: .read = pci_mmcfg_read,
00149: .write = pci_mmcfg_write,
00150: };
00151:
若内核中没有选中“Support mmconfig PCI config space access”,则raw_pci_ops方法为:pci_direct_conf1或pci_direct_conf2。通常情况下,使用pci_direct_conf1。代码在文件arch/i386/pci/direct.c中。
00257: void __init pci_direct_init(void)
00258: {
00259: struct resource *region, *region2;
......
00267: if (pci_check_type1()) {
00268: printk(KERN_INFO "PCI: Using configuration type 1\n");
00269: raw_pci_ops = &pci_direct_conf1;
00270: return;
00271: }
... ...
00284: if (pci_check_type2()) {
00285: printk(KERN_INFO "PCI: Using configuration type 2\n");
00286: raw_pci_ops = &pci_direct_conf2 ;
00287: return;
00288: }
00293: } ? end pci_direct_init ?
00079: struct pci_raw_ops pci_direct_conf1 = {
00080: .read = pci_conf1_read,
00081: .write = pci_conf1_write,
00082: };
00171: #undef PCI_CONF2_ADDRESS
00172:
00173: static struct pci_raw_ops pci_direct_conf2 = {
00174: .read = pci_conf2_read,
00175: .write = pci_conf2_write,
00176: };
Port I/O方式也称为直接方式访问。
PCI规范规定,直接操作port读取PCI配置信息时,通过CONFIG_ADDRESS和CONFIG_DATA两个寄存器进行。CONFIG_ADDRESS的端口地址为0xCF8,CONFIG_DATA的端口地址为0xCFC,两个寄存器都为32bit。这两个地址就是x86架构中对应的I/O端口号。图2为CONFIG_ADDRESS寄存器格式。
图2 CONFIG_ADDRESS寄存器格式
bit31是使能对PCI Bus CONFIG_DATA的访问;
bit 30~24为保留,为只读,访问时返回值为0;
bit 23~16是Bus号;
bit 15~11是设备号;
bit 10~8是功能号;
bit 7~2是配置空间中的寄存器,单位为DWORD。
bit 1~0为只读,读取时返回为0。
这样直接访问PCI配置空间时,分为两步:
第一步是向CONFIG_ADDRESS寄存器(端口0xCF8)写入要读/写的位置;
第二步是从CONFIG_DATA寄存器(端口0xCFC)读/写所需要数据。
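Linux内核对PCI配置空间直接访问的实现函数分别为pci_conf1_read()/pci_conf1_write()和pci_conf2_read()/pci_conf2_write(),分别对应配置访问机制1和机制2(即内核启动信息中的“configuration type 1”和“configuration type 2”,注意它们与配置空间头的Type 0/Type 1不是同一个概念)。对于我们的PCI-E外设来说,是Type 0型配置空间,并且通常使用机制1访问。这里我们只关注机制1,即pci_conf1_read()和pci_conf1_write()。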
函数pci_conf1_read()和pci_conf1_write()函数在文件arch/i386/pci/direct.c中。
00017: int pci_conf1_read(unsigned int seg, unsigned int bus,
00018: unsigned int devfn, int reg, int len, u32 *value)
00019: {
00020: unsigned long flags;
00021:
00022: if ((bus > 255) || (devfn > 255) || (reg > 255)) {
00023: *value = - 1;
00024: return - EINVAL;
00025: }
00026:
00027: spin_lock_irqsave(&pci_config_lock , flags);
00028:
00029: outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
00030:
00031: switch (len) {
00032: case 1:
00033: *value = inb(0xCFC + (reg & 3));
00034: break;
00035: case 2:
00036: *value = inw(0xCFC + (reg & 2));
00037: break;
00038: case 4:
00039: *value = inl(0xCFC);
00040: break;
00041: }
00042:
00043: spin_unlock_irqrestore(&pci_config_lock , flags);
00044:
00045: return 0;
00046: } ? end pci_conf1_read ?
00047:
00048: int pci_conf1_write(unsigned int seg, unsigned int bus,
00049: unsigned int devfn, int reg, int len, u32 value)
00050: {
00051: unsigned long flags;
00052:
00053: if ((bus > 255) || (devfn > 255) || (reg > 255))
00054: return - EINVAL;
00056: spin_lock_irqsave(&pci_config_lock , flags);
00057:
00058: outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
00059:
00060: switch (len) {
00061: case 1:
00062: outb((u8)value, 0xCFC + (reg & 3));
00063: break;
00064: case 2:
00065: outw((u16)value, 0xCFC + (reg & 2));
00066: break;
00067: case 4:
00068: outl((u32)value, 0xCFC);
00069: break;
00070: }
00071:
00072: spin_unlock_irqrestore(&pci_config_lock , flags);
00073:
00074: return 0;
00075: } ? end pci_conf1_write ?
00076:
Port I/O方式只能访问PCI配置空间(前256字节),而不能访问PCI-E扩展配置空间(第256~4095字节),后者只能通过MMIO方式访问。Linux内核中的MMIO读/写实现分别对应函数pci_mmcfg_read()和pci_mmcfg_write()。函数在文件arch/i386/pci/mmconfig.c中。
00071: static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
00072: unsigned int devfn, int reg, int len, u32 *value)
00073: {
00074: unsigned long flags;
00075: u32 base;
00076:
00077: if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
00078: err: *value = - 1;
00079: return - EINVAL;
00080: }
00081:
00082: if (reg < 256)
00083: return pci_conf1_read(seg,bus,devfn,reg,len,value);
00084:
00085: base = get_base_addr(seg, bus, devfn);
00086: if (! base)
00087: goto err;
00088:
00089: spin_lock_irqsave(&pci_config_lock , flags);
00090:
00091: pci_exp_set_dev_base(base, bus, devfn);
00092:
00093: switch (len) {
00094: case 1:
00095: *value = mmio_config_readb(mmcfg_virt_addr + reg);
00096: break;
00097: case 2:
00098: *value = mmio_config_readw(mmcfg_virt_addr + reg);
00099: break;
00100: case 4:
00101: *value = mmio_config_readl(mmcfg_virt_addr + reg);
00102: break;
00103: }
00104:
00105: spin_unlock_irqrestore(&pci_config_lock , flags);
00106:
00107: return 0;
00108: } ? end pci_mmcfg_read ?
00110: static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
00111: unsigned int devfn, int reg, int len, u32 value)
00112: {
00113: unsigned long flags;
00114: u32 base;
00115:
00116: if ((bus > 255) || (devfn > 255) || (reg > 4095))
00117: return - EINVAL;
00118:
00119: if (reg < 256)
00120: return pci_conf1_write(seg,bus,devfn,reg,len,value);
00121:
00122: base = get_base_addr(seg, bus, devfn);
00123: if (! base)
00124: return - EINVAL;
00125:
00126: spin_lock_irqsave(&pci_config_lock , flags);
00127:
00128: pci_exp_set_dev_base(base, bus, devfn);
00129:
00130: switch (len) {
00131: case 1:
00132: mmio_config_writeb(mmcfg_virt_addr + reg, value);
00133: break;
00134: case 2:
00135: mmio_config_writew(mmcfg_virt_addr + reg, value);
00136: break;
00137: case 4:
00138: mmio_config_writel(mmcfg_virt_addr + reg, value);
00139: break;
00140: }
00141:
00142: spin_unlock_irqrestore(&pci_config_lock , flags);
00143:
00144: return 0;
00145: } ? end pci_mmcfg_write ?
若访问的配置空间在前面256字节范围内,则直接调用直接访问方式(Port I/O)。若访问PCI-E扩展配置空间,则首先通过get_base_addr()函数获取设备对应的内存空间物理地址,然后通过pci_exp_set_dev_base()函数将物理地址映射到一个线性地址,最后通过mmio_config_{read, write}{b, w, l}执行真正的读写。
00028/ *
00029: * Functions for accessing PCI configuration space with MMCONFIGaccesses
00030: */
00031: static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
00032: {
00033: int cfg_num = - 1;
00034: struct acpi_table_mcfg_config*cfg;
00035:
00036: while(1) {
00037: ++cfg_num;
00038: if (cfg_num >= pci_mmcfg_config_num ) {
00039: break;
00040: }
00041: cfg = &pci_mmcfg_config[cfg_num];
00042: if (cfg- >pci_segment_group_number ! = seg)
00043: continue;
00044: if ((cfg- >start_bus_number <= bus) &&
00045: (cfg- >end_bus_number >= bus))
00046: return cfg- >base_address;
00047: }
00048:
00049: / * Handle more broken MCFG tableson Asus etc.
00050: They only contain a single entryfor bus 0- 0. Assume
00051: this applies to all busses. */
00052: cfg = &pci_mmcfg_config [0];
00053: if (pci_mmcfg_config_num == 1 &&
00054: cfg- >pci_segment_group_number == 0 &&
00055: (cfg- >start_bus_number | cfg- >end_bus_number) == 0)
00056: return cfg- >base_address;
00057:
全局变量pci_mmcfg_config是所有PCI/PCI-E设备的MMIO映射表,MMIO映射表是内核根据BIOS POST结构初始化PCI总线时设置好,内核读取分配的值即可。
通过get_base_addr()获取到的地址是物理地址,为了能读取,还需通过函数pci_exp_set_dev_base(base, bus, devfn)将物理地址转换为逻辑地址。
00062: static inline void pci_exp_set_dev_base(unsigned int base, int bus, int
00062: devfn)
00063: {
00064: u32 dev_base = base| (bus << 20) | (devfn << 12);
00065: if (dev_base != mmcfg_last_accessed_device ) {
00066: mmcfg_last_accessed_device = dev_base;
00067: set_fixmap_nocache(FIX_PCIE_MCFG, dev_base);
00068: }
00069: }
文件include/asm-i386/fixmap.h。
00100: / *
00101: * Some hardwarewants to get fixmapped withoutcaching.
00102: */
00103: #define set_fixmap_nocache(idx, phys) \
00104: __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
00105:
文件arch/i386/mm/pgtable.c。
00140: void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t
00140: flags)
00141: {
00142: unsigned long address = __fix_to_virt(idx);
00143:
00144: if (idx >= __end_of_fixed_addresses) {
00145: BUG();
00146: return;
00147: }
00148: set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
00149: }
00150:
00023: #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
前面我们已经提到可以通过lspci和setpci命令来读写PCI/PCI-E配置。而这些命令的实现是基于内核提供的/sysfs接口或/proc接口。
内核为PCI/PCI-E总线提供的/sysfs读写方法如下,文件drivers/pci/pci-sysfs.c。
00510: static struct bin_attribute pci_config_attr = {
00511: .attr = {
00512: .name = "config",
00513: .mode = S_IRUGO | S_IWUSR,
00514: .owner = THIS_MODULE,
00515: },
00516: .size =256,
00517: .read = pci_read_config,
00518: .write = pci_write_config,
00519: };
00520:
00521: static struct bin_attribute pcie_config_attr = {
00522: .attr = {
00523: .name = "config",
00524: .mode = S_IRUGO | S_IWUSR,
00525: .owner = THIS_MODULE,
00526: },
00527: .size =4096,
00528: .read = pci_read_config,
00529: .write = pci_write_config,
00530: };
00531:
pci_read_config()和pci_write_config()函数进而调用pci_user_{read,write}_config_{dword, word, byte}()。我们来看一下setpci命令执行时(图3)和lspci命令执行时(图4)的内核栈信息。
由栈信息我们可以看出,函数最终调用pci_conf1_write()函数。也就是/sysfs提供的读写接口,也最终是使用Port I/O和MMIO方式。
图3 pci_conf1_write()函数调用栈
图4 pci_mmcfg_read()函数调用栈
原文:http://www.ilinuxkernel.com/files/5/Linux_PCI_Express_Kernel_RW.htm