From: Avi Kivity <avi@redhat.com>
To: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Jan Kiszka <jan.kiszka@siemens.com>
Subject: [PATCH 14/45] KVM: VMX: work around lacking VNMI support
Date: Mon, 8 Dec 2008 13:36:25 +0200 [thread overview]
Message-ID: <1228736216-15787-15-git-send-email-avi@redhat.com> (raw)
In-Reply-To: <1228736216-15787-1-git-send-email-avi@redhat.com>
From: Jan Kiszka <jan.kiszka@siemens.com>
Older VMX supporting CPUs do not provide the "Virtual NMI" feature for
tracking the NMI-blocked state after injecting such events. So far,
KVM has been unable to inject NMIs on those CPUs.
Derived from Sheng Yang's suggestion to use the IRQ window notification
for detecting the end of NMI handlers, this patch implements virtual
NMI support without impact on the host's ability to receive real NMIs.
The downside is that the given approach requires some heuristics that
can cause NMI nesting in very rare corner cases.
The approach works as follows:
- inject NMI and set a software-based NMI-blocked flag
- arm the IRQ window start notification whenever an NMI window is
requested
- if the guest exits due to an opening IRQ window, clear the emulated
NMI-blocked flag
- if the guest net execution time with NMI-blocked but without an IRQ
window exceeds 1 second, force NMI-blocked reset and inject anyway
This approach covers most practical scenarios:
- successive NMIs are separated by at least one open IRQ window
- the guest may spin with IRQs disabled (e.g. due to a bug), but
leaving the NMI handler takes much less time than one second
- the guest does not rely on strict ordering or timing of NMIs
(would be problematic in virtualized environments anyway)
Successfully tested with the 'nmi n' monitor command, the kgdbts
testsuite on smp guests (additional patches required to add debug
register support to kvm) + the kernel's nmi_watchdog=1, and a Siemens-
specific board emulation (+ guest) that comes with its own NMI
watchdog mechanism.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/vmx.c | 174 ++++++++++++++++++++++++++++++++++------------------
1 files changed, 115 insertions(+), 59 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f16a62c..2180109 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -90,6 +90,11 @@ struct vcpu_vmx {
} rmode;
int vpid;
bool emulation_required;
+
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -2230,6 +2235,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.rmode.active = 0;
+ vmx->soft_vnmi_blocked = 0;
+
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2342,29 @@ out:
return ret;
}
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ if (!cpu_has_virtual_nmis()) {
+ enable_irq_window(vcpu);
+ return;
+ }
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2360,6 +2390,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->soft_vnmi_blocked = 1;
+ vmx->vnmi_blocked_time = 0;
+ }
+
++vcpu->stat.nmi_injections;
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
@@ -2384,6 +2427,8 @@ static void vmx_update_window_states(struct kvm_vcpu *vcpu)
!(guest_intr & (GUEST_INTR_STATE_STI |
GUEST_INTR_STATE_MOV_SS |
GUEST_INTR_STATE_NMI));
+ if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+ vcpu->arch.nmi_window_open = 0;
vcpu->arch.interrupt_window_open =
((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2403,55 +2448,31 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
kvm_queue_interrupt(vcpu, irq);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- if (!cpu_has_virtual_nmis())
- return;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
static void do_interrupt_requests(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
vmx_update_window_states(vcpu);
- if (cpu_has_virtual_nmis()) {
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vcpu->arch.nmi_window_open) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_nmi_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- if (vcpu->arch.nmi_pending
- || kvm_run->request_nmi_window)
- enable_nmi_window(vcpu);
- else if (vcpu->arch.irq_summary
- || kvm_run->request_interrupt_window)
- enable_irq_window(vcpu);
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vcpu->arch.nmi_window_open) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_nmi_window(vcpu);
return;
}
- if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
+ }
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending || kvm_run->request_nmi_window)
enable_nmi_window(vcpu);
+ else if (vcpu->arch.irq_summary
+ || kvm_run->request_interrupt_window)
+ enable_irq_window(vcpu);
+ return;
}
+ if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
+ enable_nmi_window(vcpu);
if (vcpu->arch.interrupt_window_open) {
if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
@@ -3097,6 +3118,37 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
printk(KERN_WARNING "%s: unexpected, valid vectoring info "
"(0x%x) and exit reason is 0x%x\n",
__func__, vectoring_info, exit_reason);
+
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+ if (vcpu->arch.interrupt_window_open) {
+ vmx->soft_vnmi_blocked = 0;
+ vcpu->arch.nmi_window_open = 1;
+ } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+ (kvm_run->request_nmi_window || vcpu->arch.nmi_pending)) {
+ /*
+ * This CPU doesn't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->soft_vnmi_blocked = 0;
+ vmx->vcpu.arch.nmi_window_open = 1;
+ }
+
+ /*
+ * If the user space waits to inject an NMI, exit ASAP
+ */
+ if (vcpu->arch.nmi_window_open && kvm_run->request_nmi_window
+ && !vcpu->arch.nmi_pending) {
+ kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
+ ++vcpu->stat.nmi_window_exits;
+ return 0;
+ }
+ }
+
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3146,7 +3198,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
if (unblock_nmi && vector != DF_VECTOR)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
- }
+ } else if (unlikely(vmx->soft_vnmi_blocked))
+ vmx->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
idt_vectoring_info = vmx->idt_vectoring_info;
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3186,27 +3240,25 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
vmx_update_window_states(vcpu);
- if (cpu_has_virtual_nmis()) {
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vcpu->arch.interrupt.pending) {
- enable_nmi_window(vcpu);
- } else if (vcpu->arch.nmi_window_open) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_nmi_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- if (vcpu->arch.nmi_pending)
- enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu))
- enable_irq_window(vcpu);
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vcpu->arch.interrupt.pending) {
+ enable_nmi_window(vcpu);
+ } else if (vcpu->arch.nmi_window_open) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_nmi_window(vcpu);
return;
}
}
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending)
+ enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu))
+ enable_irq_window(vcpu);
+ return;
+ }
if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
if (vcpu->arch.interrupt_window_open)
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
@@ -3255,6 +3307,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info;
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+ vmx->entry_time = ktime_get();
+
/* Handle invalid guest state instead of entering VMX */
if (vmx->emulation_required && emulate_invalid_guest_state) {
handle_invalid_guest_state(vcpu, kvm_run);
--
1.6.0.3
next prev parent reply other threads:[~2008-12-08 11:42 UTC|newest]
Thread overview: 49+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-12-08 11:36 [PATCH 00/45] KVM Updates for 2.6.29 (Part 1 of 3) Avi Kivity
2008-12-08 11:36 ` [PATCH 01/45] KVM: x86 emulator: consolidate push reg Avi Kivity
2008-12-08 11:36 ` [PATCH 02/45] KVM: VMX: include all IRQ window exits in statistics Avi Kivity
2008-12-08 11:36 ` [PATCH 03/45] KVM: VMX: Use INTR_TYPE_NMI_INTR instead of magic value Avi Kivity
2008-12-08 11:36 ` [PATCH 04/45] KVM: VMX: Support for NMI task gates Avi Kivity
2008-12-08 11:36 ` [PATCH 05/45] KVM: x86: Reset pending/inject NMI state on CPU reset Avi Kivity
2008-12-08 11:36 ` [PATCH 06/45] KVM: VMX: refactor/fix IRQ and NMI injectability determination Avi Kivity
2008-12-08 11:36 ` [PATCH 07/45] KVM: VMX: refactor IRQ and NMI window enabling Avi Kivity
2008-12-08 11:36 ` [PATCH 08/45] KVM: VMX: fix real-mode NMI support Avi Kivity
2008-12-08 11:36 ` [PATCH 09/45] KVM: x86: Enable NMI Watchdog via in-kernel PIT source Avi Kivity
2008-12-08 11:36 ` [PATCH 10/45] KVM: x86: VCPU with pending NMI is runnabled Avi Kivity
2008-12-08 11:36 ` [PATCH 11/45] KVM: Kick NMI receiving VCPU Avi Kivity
2008-12-08 11:36 ` [PATCH 12/45] KVM: x86: Support for user space injected NMIs Avi Kivity
2008-12-08 12:07 ` Jan Kiszka
2008-12-10 8:46 ` Avi Kivity
2008-12-10 9:16 ` Jan Kiszka
2008-12-08 11:36 ` [PATCH 13/45] KVM: VMX: Provide support " Avi Kivity
2008-12-08 11:36 ` Avi Kivity [this message]
2008-12-08 11:36 ` [PATCH 15/45] KVM: call kvm_arch_vcpu_reset() instead of the kvm_x86_ops callback Avi Kivity
2008-12-08 11:36 ` [PATCH 16/45] x86: Rename mtrr_state struct and macro names Avi Kivity
2008-12-08 11:36 ` [PATCH 17/45] x86: Export some definition of MTRR Avi Kivity
2008-12-08 11:36 ` [PATCH 18/45] KVM: Improve MTRR structure Avi Kivity
2008-12-08 11:36 ` [PATCH 19/45] KVM: VMX: Add PAT support for EPT Avi Kivity
2008-12-08 11:36 ` [PATCH 20/45] KVM: Add local get_mtrr_type() to support MTRR Avi Kivity
2008-12-08 11:36 ` [PATCH 21/45] KVM: Enable MTRR for EPT Avi Kivity
2008-12-08 11:36 ` [PATCH 22/45] KVM: Clean up kvm_x86_emulate.h Avi Kivity
2008-12-08 11:36 ` [PATCH 23/45] KVM: MMU: Extend kvm_mmu_page->slot_bitmap size Avi Kivity
2008-12-08 11:36 ` [PATCH 24/45] KVM: VMX: Move private memory slot position Avi Kivity
2008-12-08 11:36 ` [PATCH 25/45] KVM: x86 emulator: Add decode entries for 0x04 and 0x05 opcodes (add acc, imm) Avi Kivity
2008-12-08 11:36 ` [PATCH 26/45] KVM: x86: Fix and refactor NMI watchdog emulation Avi Kivity
2008-12-08 11:36 ` [PATCH 27/45] KVM: x86: Optimize NMI watchdog delivery Avi Kivity
2008-12-08 11:36 ` [PATCH 28/45] KVM: IRQ ACK notifier should be used with in-kernel irqchip Avi Kivity
2008-12-08 11:36 ` [PATCH 29/45] KVM: x86: Fix typo in function name Avi Kivity
2008-12-08 11:36 ` [PATCH 30/45] KVM: SVM: Set the 'g' bit of the cs selector for cross-vendor migration Avi Kivity
2008-12-08 11:36 ` [PATCH 31/45] KVM: SVM: Set the 'busy' flag of the TR selector Avi Kivity
2008-12-08 11:36 ` [PATCH 32/45] KVM: allow emulator to adjust rip for emulated pio instructions Avi Kivity
2008-12-08 11:36 ` [PATCH 33/45] KVM: VMX: Handle mmio emulation when guest state is invalid Avi Kivity
2008-12-08 11:36 ` [PATCH 34/45] KVM: ia64: Re-organize data sturure of guests' data area Avi Kivity
2008-12-08 11:36 ` [PATCH 35/45] KVM: ia64: Remove lock held by halted vcpu Avi Kivity
2008-12-08 11:36 ` [PATCH 36/45] KVM: Enable Function Level Reset for assigned device Avi Kivity
2008-12-08 11:36 ` [PATCH 37/45] KVM: MMU: Fix aliased gfns treated as unaliased Avi Kivity
2008-12-08 11:36 ` [PATCH 38/45] KVM: ppc: Move 440-specific TLB code into 44x_tlb.c Avi Kivity
2008-12-08 11:36 ` [PATCH 39/45] KVM: ppc: Rename "struct tlbe" to "struct kvmppc_44x_tlbe" Avi Kivity
2008-12-08 11:36 ` [PATCH 40/45] KVM: ppc: combine booke_guest.c and booke_host.c Avi Kivity
2008-12-08 11:36 ` [PATCH 41/45] KVM: ppc: Refactor powerpc.c to relocate 440-specific code Avi Kivity
2008-12-08 11:36 ` [PATCH 42/45] ppc: Create disassemble.h to extract instruction fields Avi Kivity
2008-12-08 11:36 ` [PATCH 43/45] KVM: ppc: refactor instruction emulation into generic and core-specific pieces Avi Kivity
2008-12-08 11:36 ` [PATCH 44/45] KVM: ppc: Move the last bits of 44x code out of booke.c Avi Kivity
2008-12-08 11:36 ` [PATCH 45/45] KVM: ppc: create struct kvm_vcpu_44x and introduce container_of() accessor Avi Kivity
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1228736216-15787-15-git-send-email-avi@redhat.com \
--to=avi@redhat.com \
--cc=jan.kiszka@siemens.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox