1、x86平台主要使用的中断类型有pic、apic及msi中断,在多核系统下的apic结构图如下所示,每个cpu有一个lapic,外部中断通过ioapic转发到lapic,如果是msi中断,则绕过ioapic直接发给lapic。
2、kvm初始化过程为每个虚拟机维护一个pic主控制器、一个pic从控制器以及一个ioapic控制器,每个vcpu维护一个lapic控制器。同时每个虚拟机有一张中断路由表(kvm_irq_routing_table)。中断路由表里的chip二维数组以(irqchip, pin)为下标保存非msi中断的gsi号,每个中断都有自己的routing_entry,routing_entry保存了中断的类型(pic、ioapic、msi)、中断号以及set触发函数,所有的routing_entry以gsi为索引挂接到route_table的map链表里(同一个gsi可能同时关联pic、ioapic两种irqchip)。
ioapic里还维护了一张中断重定向表(redirtbl),负责为每个ioapic引脚(总共24个引脚)收到的中断选择路由到哪个lapic,每个vcpu的lapic控制器则模拟了主要的apic寄存器(IRR、ISR、EOI等)。
3、中断路由表初始化过程
kvm创建好pic、ioapic控制器后,会先使用default_routing(arch/x86/kvm/irq_comm.c)安装默认的中断路由表。
kvm_arch_vm_ioctl
kvm_create_pic
kvm_ioapic_init
kvm_setup_default_irq_routing
kvm_set_irq_routing
setup_routing_entry
static int setup_routing_entry(struct kvm *kvm,
struct kvm_irq_routing_table *rt,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
struct kvm_kernel_irq_routing_entry *ei;
/*
* Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and non-irqchip routing.
*/
hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return r;
e->gsi = ue->gsi;
e->type = ue->type;
//设置每个routing_entry信息
r = kvm_set_routing_entry(kvm, e, ue);
if (r)
goto out;
if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi;
//将routing_entry连接到route_table的map链表
hlist_add_head(&e->link, &rt->map[e->gsi]);
r = 0;
out:
return r;
}
/*
 * Translate a user-space routing entry into its kernel representation.
 *
 * @kvm: VM the route belongs to; only passed to kvm_msi_route_invalid().
 * @e:   kernel entry to fill in (set callback plus irqchip or MSI data).
 * @ue:  user-space routing entry to translate.
 *
 * Returns 0 on success and -EINVAL for an unknown route type, an unknown
 * irqchip, an out-of-range pin, or an MSI route rejected by
 * kvm_msi_route_invalid().
 */
int kvm_set_routing_entry(struct kvm *kvm,
			  struct kvm_kernel_irq_routing_entry *e,
			  const struct kvm_irq_routing_entry *ue)
{
	int r = -EINVAL;
	int delta;		/* offset added to the user pin (8 for the slave PIC) */
	unsigned max_pin;	/* exclusive upper bound for the resulting pin */

	switch (ue->type) {
	case KVM_IRQ_ROUTING_IRQCHIP:
		delta = 0;
		switch (ue->u.irqchip.irqchip) {
		case KVM_IRQCHIP_PIC_MASTER:
			e->set = kvm_set_pic_irq;
			max_pin = PIC_NUM_PINS;
			break;
		case KVM_IRQCHIP_PIC_SLAVE:
			e->set = kvm_set_pic_irq;
			max_pin = PIC_NUM_PINS;
			delta = 8;
			break;
		case KVM_IRQCHIP_IOAPIC:
			max_pin = KVM_IOAPIC_NUM_PINS;
			e->set = kvm_set_ioapic_irq;
			break;
		default:
			goto out;
		}
		e->irqchip.irqchip = ue->u.irqchip.irqchip;
		e->irqchip.pin = ue->u.irqchip.pin + delta;
		/*
		 * Range-check the raw user-supplied pin rather than the sum:
		 * the pin is an unsigned 32-bit value in the KVM uapi, so
		 * "pin + delta" can wrap around to a small number and slip
		 * past a check made after the addition.  For every pin that
		 * does not wrap this is the same test as before.
		 */
		if (ue->u.irqchip.pin >= max_pin - delta)
			goto out;
		break;
	case KVM_IRQ_ROUTING_MSI:
		e->set = kvm_set_msi;
		e->msi.address_lo = ue->u.msi.address_lo;
		e->msi.address_hi = ue->u.msi.address_hi;
		e->msi.data = ue->u.msi.data;
		if (kvm_msi_route_invalid(kvm, e))
			goto out;
		break;
	default:
		goto out;
	}
	r = 0;
out:
	return r;
}
setup_routing_entry的ue参数即为default_routing中的各个表项,以上的流程主要就是将default_routing定义的路由信息保存到routing_table里。default_routing定义了0-23号(共24个)中断的基本信息,如中断type(都是非msi的IRQCHIP类型,其中0-15号同时关联pic和ioapic,16-23号只关联ioapic)、中断gsi号等。中断路由表除了初始化安装外,还可以通过KVM_SET_GSI_ROUTING重新安装。
/* Irqchip route: GSI <irq> is wired to the IOAPIC pin with the same number. */
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
/* One route per GSI: IOAPIC only. */
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
/*
 * Irqchip route: GSI <irq> is wired to pin (irq % 8) of the PIC chosen by
 * SELECT_PIC(irq).  SELECT_PIC is defined outside this excerpt; presumably
 * it picks the master PIC for 0-7 and the slave PIC for 8-15 -- TODO confirm.
 */
#define PIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
/* Two routes per GSI: one through the IOAPIC and one through a PIC. */
#define ROUTING_ENTRY2(irq) \
IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
/*
 * Default routing table: GSIs 0-15 are routed to both the IOAPIC and a PIC,
 * GSIs 16-23 are routed to the IOAPIC only.
 */
static const struct kvm_irq_routing_entry default_routing[] = {
ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
};
4、中断触发流程
当vfio或vhost等后端通过eventfd唤醒kvm中断处理函数后,会进入irqfd_inject,然后调用kvm_set_irq,kvm_set_irq主要是查找中断路由表,找到中断对应的routing_entry,然后调用其set触发函数,如果是ioapic类型的中断,则会调用kvm_set_ioapic_irq,最后进入ioapic_service处理函数。ioapic_service主要是根据引脚号找到中断重定向表(redirtbl)中对应的表项,然后取出中断的目的地信息并转发到对应vcpu的lapic去处理。
/*
 * Deliver the interrupt pending on IOAPIC pin @irq to the local APIC(s)
 * selected by that pin's redirection table entry.
 *
 * @ioapic:      emulated IOAPIC owning the redirection table.
 * @irq:         pin number, used as index into redirtbl.
 * @line_status: together with irq == RTC_GSI selects the RTC bookkeeping path.
 *
 * Returns -1 when the pin is masked, otherwise the value returned by
 * kvm_irq_delivery_to_apic().
 */
static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
{
	union kvm_ioapic_redirect_entry *rte = &ioapic->redirtbl[irq];
	struct kvm_lapic_irq msg;
	int delivered;

	/* A masked pin delivers nothing. */
	if (rte->fields.mask)
		return -1;

	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
		     "vector=%x trig_mode=%x\n",
		     rte->fields.dest_id, rte->fields.dest_mode,
		     rte->fields.delivery_mode, rte->fields.vector,
		     rte->fields.trig_mode);

	/* Build the LAPIC message from the redirection table entry. */
	msg.dest_id = rte->fields.dest_id;
	msg.vector = rte->fields.vector;
	msg.dest_mode = rte->fields.dest_mode;
	msg.trig_mode = rte->fields.trig_mode;
	msg.delivery_mode = rte->fields.delivery_mode << 8;
	msg.level = 1;
	msg.shorthand = 0;
	msg.msi_redir_hint = false;

	/* Remember that this edge-triggered pin has been delivered. */
	if (msg.trig_mode == IOAPIC_EDGE_TRIG)
		ioapic->irr_delivered |= 1 << irq;

	if (irq != RTC_GSI || !line_status) {
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &msg,
						     NULL);
	} else {
		/*
		 * pending_eoi cannot ever become negative (see
		 * rtc_status_pending_eoi_check_valid) and the caller
		 * ensures that it is only called if it is >= zero, namely
		 * if rtc_irq_check_coalesced returns false).
		 */
		BUG_ON(ioapic->rtc_status.pending_eoi != 0);
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &msg,
						     &ioapic->rtc_status.dest_map);
		/* A negative delivery result is recorded as zero pending EOIs. */
		ioapic->rtc_status.pending_eoi = (delivered < 0 ? 0 : delivered);
	}

	/* A level-triggered interrupt with a non-zero result sets remote IRR. */
	if (delivered && msg.trig_mode == IOAPIC_LEVEL_TRIG)
		rte->fields.remote_irr = 1;

	return delivered;
}
lapic收到中断后,会根据不同的delivery_mode调用不同的处理函数,以常见的APIC_DM_FIXED为例,处理函数还会判断是否启用apicv功能,使用apicv和不使用apicv走不同的触发流程。
/*
 * Abridged excerpt of __apic_accept_irq(): only the APIC_DM_FIXED case is
 * shown.  The enclosing switch, the declaration of `vcpu` and the return
 * statement are omitted from these notes, so this fragment does not compile
 * on its own.
 */
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
struct dest_map *dest_map)
{
case APIC_DM_FIXED:
// Bring the vector's TMR bit in line with the requested trigger mode
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
else
apic_clear_vector(vector, apic->regs + APIC_TMR);
}
// Is APICv active for this vcpu?
if (vcpu->arch.apicv_active)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
// Set the vector's bit in the IRR
kvm_lapic_set_irr(vector, apic);
// Flag a pending interrupt event on the vcpu
kvm_make_request(KVM_REQ_EVENT, vcpu);
// Kick the vcpu (per the original notes: pull it back to the host)
kvm_vcpu_kick(vcpu);
}
break;
}
1)、如果使能了apicv,最终调用vmx_deliver_posted_interrupt,使用中断posting的方式来通知vcpu处理中断。
/*
 * Deliver @vector to @vcpu through the posted-interrupt descriptor.
 *
 * The nested-virtualization helper is tried first; if it returns 0 nothing
 * more is done here.  Otherwise the vector is recorded in the PIR, the
 * descriptor's ON flag is set, KVM_REQ_EVENT is requested, and the vcpu is
 * either notified through kvm_vcpu_trigger_posted_interrupt() or kicked.
 */
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int on_was_set;

	/* Nested virtualization case: a return of 0 means it was handled there. */
	if (!vmx_deliver_nested_posted_interrupt(vcpu, vector))
		return;

	/* Set the vector's bit in pi_desc; stop if it was already set. */
	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	/* Set pi_desc.on to signal that there is an interrupt to process. */
	on_was_set = pi_test_and_set_on(&vmx->pi_desc);
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/* ON was already set beforehand: just kick the vcpu. */
	if (on_was_set) {
		kvm_vcpu_kick(vcpu);
		return;
	}

	/*
	 * Per the original notes: if the vcpu is running in guest mode it is
	 * sent the POSTED_INTR_VECTOR IPI, which it can handle in non-root
	 * mode without a VM exit.  If that is not possible the vcpu is kicked
	 * instead, so that it notices the pending interrupt on its next
	 * VM entry.
	 */
	if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
		kvm_vcpu_kick(vcpu);
}
2)、如果没有使能apicv功能,则标记lapic的IRR寄存器,通过kvm_make_request标记vcpu有中断请求事件,然后通过kvm_vcpu_kick触发vcpu vm-exit。当vcpu重新进入Guest模式之前,会检查是否有中断请求事件,如果有,则设置ISR、PPR等寄存器信息。
vcpu_enter_guest
inject_pending_event
kvm_cpu_get_interrupt
kvm_get_apic_interrupt
kvm_queue_interrupt
/*
 * Fetch the vector reported by kvm_apic_has_interrupt() for @vcpu and move
 * it from "requested" to "in service" in the local APIC state.
 *
 * Returns the vector, or -1 when kvm_apic_has_interrupt() reports none.
 */
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	/* Per the original notes: the highest-priority vector pending in the IRR. */
	int vector = kvm_apic_has_interrupt(vcpu);

	if (vector != -1) {
		/*
		 * We get here even with APIC virtualization enabled, if doing
		 * nested virtualization and L1 runs with the "acknowledge
		 * interrupt on exit" mode. Then we cannot inject the interrupt
		 * via RVI, because the process would deliver it through the IDT.
		 */
		/* Mark the vector in the ISR: the vcpu is now servicing it. */
		apic_set_isr(vector, apic);
		/* Update the PPR after the ISR change. */
		apic_update_ppr(apic);
		/* The request has been taken, so drop it from the IRR. */
		apic_clear_irr(vector, apic);
	}

	return vector;
}
最后再调用vmx_inject_irq将之前保存在kvm_queued_interrupt的中断信息写到vmcs的VM_ENTRY_INTR_INFO_FIELD,等vcpu执行vm_entry时,就能感知到该中断的存在。
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
trace_kvm_inj_virq(irq);
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (vcpu->arch.interrupt.soft)
inc_eip = vcpu->arch.event_exit_inst_len;
if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
intr = irq | INTR_INFO_VALID_MASK;
if (vcpu->arch.interrupt.soft) {
intr |= INTR_TYPE_SOFT_INTR;
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);