PCIe在调试过程中,经常会出现扫描不到对端EP设备的问题,在问题定位过程中,了解内核中pcie枚举流程至关重要。
PCIe枚举过程一般分为三步:
1.创建根节点
2.扫描根节点下设备
3.为根节点下设备分配资源
从总线扫描pcie设备的函数pci_scan_child_bus开始分析
unsigned int pci_scan_child_bus(struct pci_bus *bus)
{
unsigned int devfn, pass, max = bus->busn_res.start;
struct pci_dev *dev;
dev_dbg(&bus->dev, "scanning bus\n");
/* Go find them, Rover! */
for (devfn = 0; devfn < 0x100; devfn += 8)
pci_scan_slot(bus, devfn);
/* Reserve buses for SR-IOV capability. */
max += pci_iov_bus_range(bus);
/*
* After performing arch-dependent fixup of the bus, look behind
* all PCI-to-PCI bridges on this bus.
*/
if (!bus->is_added) {
dev_dbg(&bus->dev, "fixups for bus\n");
pcibios_fixup_bus(bus);
bus->is_added = 1;
}
for (pass = 0; pass < 2; pass++)
list_for_each_entry(dev, &bus->devices, bus_list) {
if (pci_is_bridge(dev))
max = pci_scan_bridge(bus, dev, max, pass);
}
/*
* Make sure a hotplug bridge has at least the minimum requested
* number of buses.
*/
if (bus->self && bus->self->is_hotplug_bridge && pci_hotplug_bus_size) {
if (max - bus->busn_res.start < pci_hotplug_bus_size - 1)
max = bus->busn_res.start + pci_hotplug_bus_size - 1;
}
/*
* We've scanned the bus and so we know all about what's on
* the other side of any bridges that may be on this bus plus
* any devices.
*
* Return how far we've got finding sub-buses.
*/
dev_dbg(&bus->dev, "bus scan returning with max=%02x\n", max);
return max;
}
EXPORT_SYMBOL_GPL(pci_scan_child_bus);
该函数的核心代码为
for (devfn = 0; devfn < 0x100; devfn += 8)
pci_scan_slot(bus, devfn);
这里的bus变量来源于pci_create_root_bus,也就是创建的根总线的总线号
devfn : 设备和功能号。
这里使用的是穷举法,把所有的dev和function都尝试一次。
pci_scan_slot函数:
int pci_scan_slot(struct pci_bus *bus, int devfn)
{
unsigned fn, nr = 0;
struct pci_dev *dev;
if (only_one_child(bus) && (devfn > 0))
return 0; /* Already scanned the entire slot */
dev = pci_scan_single_device(bus, devfn);
if (!dev)
return 0;
if (!dev->is_added)
nr++;
for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
dev = pci_scan_single_device(bus, devfn + fn);
if (dev) {
if (!dev->is_added)
nr++;
dev->multifunction = 1;
}
}
/* only one slot has pcie device */
if (bus->self && nr)
pcie_aspm_init_link_state(bus->self);
return nr;
}
pci_scan_single_device:
struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
{
struct pci_dev *dev;
dev = pci_get_slot(bus, devfn);
if (dev) {
pci_dev_put(dev);
return dev;
}
dev = pci_scan_device(bus, devfn);
if (!dev)
return NULL;
pci_device_add(dev, bus);
return dev;
}
核心函数为pci_scan_device:
static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
{
struct pci_dev *dev;
u32 l;
if (!pci_bus_read_dev_vendor_id(bus, devfn, &l, 60*1000))
return NULL;
dev = pci_alloc_dev(bus);
if (!dev)
return NULL;
dev->devfn = devfn;
dev->vendor = l & 0xffff;
dev->device = (l >> 16) & 0xffff;
pci_set_of_node(dev);
if (pci_setup_device(dev)) {
pci_bus_put(dev->bus);
kfree(dev);
return NULL;
}
return dev;
}
pci_bus_read_dev_vendor_id会去读设备的devid和venderid,如果读不到,说明设备不存在了,如果读到了,就会创建一个pdev。
所以能不能扫描对端设备,就看能不能读到设备的vendorid。
bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l,
int crs_timeout)
{
int delay = 1;
if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, l))
return false;
/* some broken boards return 0 or ~0 if a slot is empty: */
if (*l == 0xffffffff || *l == 0x00000000 ||
*l == 0x0000ffff || *l == 0xffff0000)
return false;
/*
* Configuration Request Retry Status. Some root ports return the
* actual device ID instead of the synthetic ID (0xFFFF) required
* by the PCIe spec. Ignore the device ID and only check for
* (vendor id == 1).
*/
while ((*l & 0xffff) == 0x0001) {
if (!crs_timeout)
return false;
msleep(delay);
delay *= 2;
if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, l))
return false;
/* Card hasn't responded in 60 seconds? Must be stuck. */
if (delay > crs_timeout) {
printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not responding\n",
pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
PCI_FUNC(devfn));
return false;
}
}
return true;
}
while循环里每隔一段时间就会进行一次venderid的配置读写,从上述代码中可以看出,退出循环的原因会有两个
1. config配置读写失败了 (说明链路不通)
2. 超时没有得到响应(预留时间为60s,已经非常长了,第一点失败的可能性更大)
所以扫描不到对端设备时:
1.确认建链是否成功,建链失败,肯定扫描不到对端
2.确认对端的配置空间是否可写(对端的pcie模块是否处于解复位状态)
3.确认type0,type1,iatu等参数配置是否正确,如果正确了,确认配置访问的地址空间大小是否足够。