From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jan Kiszka Subject: [PATCH] KVM: VMX: Fix race between pending IRQ and NMI Date: Mon, 10 Nov 2008 16:52:40 +0100 Message-ID: <491858C8.2040401@siemens.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-15 Content-Transfer-Encoding: 7bit Cc: "Xu, Jiajun" , "Yang, Sheng" , Avi Kivity To: kvm-devel Return-path: Received: from gecko.sbs.de ([194.138.37.40]:23287 "EHLO gecko.sbs.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751347AbYKJPxX (ORCPT ); Mon, 10 Nov 2008 10:53:23 -0500 Sender: kvm-owner@vger.kernel.org List-ID: This patch addresses item #2215532 in the kvm bug tracker, but the issue was eventually also visible with other Linux guests that use the NMI watchdog: There is a subtle race in kvm-intel between a pending IRQ and an NMI arriving shortly afterwards (e.g. from the watchdog). If the IRQ was injected but the guest exited again on injection due to some page fault, the flag interrupt.pending remained true. If now some NMI just happened to be pended as well, that one overruled the IRQ and was re-injected instead (which is OK!). But during the next run of vmx_complete_interrupts the originally pending IRQ fell on the floor and was forgotten. That means we dequeued some IRQ from the [A]PIC, but never delivered it, effectively causing a stall of IRQ deliveries. You may guess that it took me a while to understand this... The patch below addresses the issue by turning interrupt.pending into a three-state variable: NONE, QUEUED (but not currently injected), and INJECTED. If we overwrite some IRQ injection with an NMI, the state gets properly updated. Moreover, we only transition from INJECTED to NONE to avoid losing IRQs. To simplify review and maintenance, the patch aligns the decision pattern in vmx_intr_assist with do_interrupt_requests. 
Signed-off-by: Jan Kiszka --- arch/x86/include/asm/kvm_host.h | 6 +++ arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++------------- arch/x86/kvm/x86.h | 4 +- 3 files changed, 49 insertions(+), 22 deletions(-) Index: b/arch/x86/include/asm/kvm_host.h =================================================================== --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -301,7 +301,11 @@ struct kvm_vcpu_arch { } exception; struct kvm_queued_interrupt { - bool pending; + enum { + KVMIRQ_NONE, + KVMIRQ_QUEUED, + KVMIRQ_INJECTED + } pending; u8 nr; } interrupt; Index: b/arch/x86/kvm/vmx.c =================================================================== --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1037,7 +1037,7 @@ static int set_guest_debug(struct kvm_vc static int vmx_get_irq(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.interrupt.pending) + if (vcpu->arch.interrupt.pending == KVMIRQ_NONE) return -1; return vcpu->arch.interrupt.nr; } @@ -2487,9 +2487,16 @@ static void do_interrupt_requests(struct } if (vcpu->arch.nmi_injected) { vmx_inject_nmi(vcpu); + if (vcpu->arch.interrupt.pending == KVMIRQ_INJECTED) + /* + * Degrade pending state, we will properly reinject + * after the NMI. 
+ */ + vcpu->arch.interrupt.pending = KVMIRQ_QUEUED; if (vcpu->arch.nmi_pending || kvm_run->request_nmi_window) enable_nmi_window(vcpu); - else if (vcpu->arch.irq_summary + else if (vcpu->arch.interrupt.pending != KVMIRQ_NONE + || vcpu->arch.irq_summary || kvm_run->request_interrupt_window) enable_irq_window(vcpu); return; @@ -2498,14 +2505,18 @@ static void do_interrupt_requests(struct enable_nmi_window(vcpu); if (vcpu->arch.interrupt_window_open) { - if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) + if (vcpu->arch.irq_summary && + vcpu->arch.interrupt.pending == KVMIRQ_NONE) kvm_do_inject_irq(vcpu); - if (vcpu->arch.interrupt.pending) + if (vcpu->arch.interrupt.pending != KVMIRQ_NONE) { + vcpu->arch.interrupt.pending = KVMIRQ_INJECTED; vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + } } if (!vcpu->arch.interrupt_window_open && - (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) + (vcpu->arch.irq_summary || kvm_run->request_interrupt_window + || vcpu->arch.interrupt.pending != KVMIRQ_NONE)) enable_irq_window(vcpu); } @@ -2624,7 +2635,8 @@ static int handle_exception(struct kvm_v cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); - if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) + if (vcpu->arch.interrupt.pending != KVMIRQ_NONE + || vcpu->arch.exception.pending) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } @@ -3244,7 +3256,8 @@ static void vmx_complete_interrupts(stru GUEST_INTR_STATE_NMI); else vmx->vcpu.arch.nmi_injected = false; - } + } else if (vmx->vcpu.arch.interrupt.pending == KVMIRQ_INJECTED) + kvm_clear_interrupt_queue(&vmx->vcpu); kvm_clear_exception_queue(&vmx->vcpu); if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { @@ -3253,9 +3266,7 @@ static void vmx_complete_interrupts(stru } else kvm_queue_exception(&vmx->vcpu, vector); 
vmx->idt_vectoring_info = 0; - } - kvm_clear_interrupt_queue(&vmx->vcpu); - if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { + } else if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { kvm_queue_interrupt(&vmx->vcpu, vector); vmx->idt_vectoring_info = 0; } @@ -3278,22 +3289,34 @@ static void vmx_intr_assist(struct kvm_v } if (vcpu->arch.nmi_injected) { vmx_inject_nmi(vcpu); + if (vcpu->arch.interrupt.pending == KVMIRQ_INJECTED) + /* + * Degrade pending state, we will properly reinject + * after the NMI. + */ + vcpu->arch.interrupt.pending = KVMIRQ_QUEUED; if (vcpu->arch.nmi_pending) enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu)) + else if (vcpu->arch.interrupt.pending != KVMIRQ_NONE + || kvm_cpu_has_interrupt(vcpu)) enable_irq_window(vcpu); return; } - if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { - if (vcpu->arch.interrupt_window_open) + if (vcpu->arch.interrupt_window_open) { + if (vcpu->arch.interrupt.pending == KVMIRQ_NONE + && kvm_cpu_has_interrupt(vcpu)) kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); - else - enable_irq_window(vcpu); - } - if (vcpu->arch.interrupt.pending) { - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + + if (vcpu->arch.interrupt.pending != KVMIRQ_NONE) { + vcpu->arch.interrupt.pending = KVMIRQ_INJECTED; + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + } } + if (!vcpu->arch.interrupt_window_open + && (vcpu->arch.interrupt.pending != KVMIRQ_NONE + || kvm_cpu_has_interrupt(vcpu))) + enable_irq_window(vcpu); } /* Index: b/arch/x86/kvm/x86.h =================================================================== --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -10,13 +10,13 @@ static inline void kvm_clear_exception_q static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) { - vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.pending = KVMIRQ_QUEUED; 
vcpu->arch.interrupt.nr = vector; } static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) { - vcpu->arch.interrupt.pending = false; + vcpu->arch.interrupt.pending = KVMIRQ_NONE; } #endif -- Siemens AG, Corporate Technology, CT SE 2 ES-OS Corporate Competence Center Embedded Linux