DPC的全称是downstream port containment,是针对root port和pcie switch检测到不可恢复的错误时,就会通知下游端口的业务,以防止数据损坏的扩散.
其代码在drivers/pci/pcie/dpc.c
static struct pcie_port_service_driver dpcdriver = {
.name = "dpc",
.port_type = PCIE_ANY_PORT,
.service = PCIE_PORT_SERVICE_DPC,
.probe = dpc_probe,
.remove = dpc_remove,
.reset_link = dpc_reset_link,
};
我们重点看看其probe函数
static int dpc_probe(struct pcie_device *dev)
{
struct dpc_dev *dpc;
struct pci_dev *pdev = dev->port;
struct device *device = &dev->device;
int status;
u16 ctl, cap;
if (pcie_aer_get_firmware_first(pdev))
return -ENOTSUPP;
dpc = devm_kzalloc(device, sizeof(*dpc), GFP_KERNEL);
if (!dpc)
return -ENOMEM;
#检查是否支持这个feature
dpc->cap_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC);
dpc->dev = dev;
set_service_data(dev, dpc);
#申请一个中断,这里用到了中断线程化,
status = devm_request_threaded_irq(device, dev->irq, dpc_irq,
dpc_handler, IRQF_SHARED,
"pcie-dpc", dpc);
if (status) {
pci_warn(pdev, "request IRQ%d failed: %d\n", dev->irq,
status);
return status;
}
}
我们看看中断处理函数的处理
中断处理函数只是清掉中断,然后返回IRQ_HANDLED,就会直接调用dpc_handler来处理
static irqreturn_t dpc_irq(int irq, void *context)
{
struct dpc_dev *dpc = (struct dpc_dev *)context;
struct pci_dev *pdev = dpc->dev->port;
u16 cap = dpc->cap_pos, status;
pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
if (!(status & PCI_EXP_DPC_STATUS_INTERRUPT) || status == (u16)(~0))
return IRQ_NONE;
pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS,
PCI_EXP_DPC_STATUS_INTERRUPT);
if (status & PCI_EXP_DPC_STATUS_TRIGGER)
return IRQ_WAKE_THREAD;
return IRQ_HANDLED;
}
static irqreturn_t dpc_handler(int irq, void *context)
{
struct aer_err_info info;
struct dpc_dev *dpc = context;
struct pci_dev *pdev = dpc->dev->port;
u16 cap = dpc->cap_pos, status, source, reason, ext_reason;
pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source);
pci_info(pdev, "containment event, status:%#06x source:%#06x\n",
status, source);
reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1;
#打印log 来表示当前检测到错误类型
ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5;
pci_warn(pdev, "%s detected\n",
(reason == 0) ? "unmasked uncorrectable error" :
(reason == 1) ? "ERR_NONFATAL" :
(reason == 2) ? "ERR_FATAL" :
(ext_reason == 0) ? "RP PIO error" :
(ext_reason == 1) ? "software trigger" :
"reserved error");
/* We configure DPC so it only triggers on ERR_FATAL */
#广播检测到的错误
pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC);
return IRQ_HANDLED;
}
void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
u32 service)
{
pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
struct pci_bus *bus;
#广播接收到的错误
pci_dbg(dev, "broadcast error_detected message\n");
if (state == pci_channel_io_frozen)
pci_walk_bus(bus, report_frozen_detected, &status);
else
pci_walk_bus(bus, report_normal_detected, &status);
#如果只这这两种错误,则不用广播,也无法尝试aer修复
if (state == pci_channel_io_frozen &&
reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED)
goto failed;
#广播错误
if (status == PCI_ERS_RESULT_CAN_RECOVER) {
status = PCI_ERS_RESULT_RECOVERED;
pci_dbg(dev, "broadcast mmio_enabled message\n");
pci_walk_bus(bus, report_mmio_enabled, &status);
}
#广播错误
if (status == PCI_ERS_RESULT_NEED_RESET) {
/*
* TODO: Should call platform-specific
* functions to reset slot before calling
* drivers' slot_reset callbacks?
*/
status = PCI_ERS_RESULT_RECOVERED;
pci_dbg(dev, "broadcast slot_reset message\n");
pci_walk_bus(bus, report_slot_reset, &status);
}
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
pci_dbg(dev, "broadcast resume message\n");
pci_walk_bus(bus, report_resume, &status);
pci_aer_clear_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
pci_info(dev, "AER: Device recovery successful\n");
return;
failed:
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
/* TODO: Should kernel panic here? */
pci_info(dev, "AER: Device recovery failed\n");
}