* Patch "x86/kvm/vmx: Defer TR reload after VM exit" has been added to the 4.9-stable tree
@ 2017-12-21 8:41 gregkh
0 siblings, 0 replies; only message in thread
From: gregkh @ 2017-12-21 8:41 UTC (permalink / raw)
To: luto, edubezval, eduval, gregkh, pbonzini; +Cc: stable, stable-commits
This is a note to let you know that I've just added the patch titled
x86/kvm/vmx: Defer TR reload after VM exit
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
x86-kvm-vmx-defer-tr-reload-after-vm-exit.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
>From b7ffc44d5b2ea163899d09289ca7743d5c32e926 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 20 Feb 2017 08:56:14 -0800
Subject: x86/kvm/vmx: Defer TR reload after VM exit
From: Andy Lutomirski <luto@kernel.org>
commit b7ffc44d5b2ea163899d09289ca7743d5c32e926 upstream.
Intel's VMX is daft and resets the hidden TSS limit register to 0x67
on VMX reload, and the 0x67 is not configurable. KVM currently
reloads TR using the LTR instruction on every exit, but this is quite
slow because LTR is serializing.
The 0x67 limit is entirely harmless unless ioperm() is in use, so
defer the reload until a task using ioperm() is actually running.
Here's some poorly done benchmarking using kvm-unit-tests:
Before:
cpuid 1313
vmcall 1195
mov_from_cr8 11
mov_to_cr8 17
inl_from_pmtimer 6770
inl_from_qemu 6856
inl_from_kernel 2435
outl_to_kernel 1402
After:
cpuid 1291
vmcall 1181
mov_from_cr8 11
mov_to_cr8 16
inl_from_pmtimer 6457
inl_from_qemu 6209
inl_from_kernel 2339
outl_to_kernel 1391
Signed-off-by: Andy Lutomirski <luto@kernel.org>
[Force-reload TR in invalidate_tss_limit. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Eduardo Valentin <eduval@amazon.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/include/asm/desc.h | 48 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/ioport.c | 5 ++++
arch/x86/kernel/process.c | 10 +++++++++
arch/x86/kvm/vmx.c | 23 ++++++++-------------
4 files changed, 72 insertions(+), 14 deletions(-)
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -213,6 +213,54 @@ static inline void native_load_tr_desc(v
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}
+static inline void force_reload_TR(void)
+{
+ struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
+ tss_desc tss;
+
+ memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
+
+ /*
+ * LTR requires an available TSS, and the TSS is currently
+ * busy. Make it be available so that LTR will work.
+ */
+ tss.type = DESC_TSS;
+ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
+
+ load_TR_desc();
+}
+
+DECLARE_PER_CPU(bool, need_tr_refresh);
+
+static inline void refresh_TR(void)
+{
+ WARN_ON(preemptible());
+
+ if (unlikely(this_cpu_read(need_tr_refresh))) {
+ force_reload_TR();
+ this_cpu_write(need_tr_refresh, false);
+ }
+}
+
+/*
+ * If you do something evil that corrupts the cached TSS limit (I'm looking
+ * at you, VMX exits), call this function.
+ *
+ * The optimization here is that the TSS limit only matters for Linux if the
+ * IO bitmap is in use. If the TSS limit gets forced to its minimum value,
+ * everything works except that IO bitmap will be ignored and all CPL 3 IO
+ * instructions will #GP, which is exactly what we want for normal tasks.
+ */
+static inline void invalidate_tss_limit(void)
+{
+ WARN_ON(preemptible());
+
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
+ force_reload_TR();
+ else
+ this_cpu_write(need_tr_refresh, true);
+}
+
static inline void native_load_gdt(const struct desc_ptr *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/bitmap.h>
#include <asm/syscalls.h>
+#include <asm/desc.h>
/*
* this changes the io permissions bitmap in the current task.
@@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
set_thread_flag(TIF_IO_BITMAP);
+
+ preempt_disable();
+ refresh_TR();
+ preempt_enable();
}
/*
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -33,6 +33,7 @@
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
+#include <asm/desc.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -82,6 +83,9 @@ void idle_notifier_unregister(struct not
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
+DEFINE_PER_CPU(bool, need_tr_refresh);
+EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);
+
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
@@ -227,6 +231,12 @@ void __switch_to_xtra(struct task_struct
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
max(prev->io_bitmap_max, next->io_bitmap_max));
+
+ /*
+ * Make sure that the TSS limit is correct for the CPU
+ * to notice the IO bitmap.
+ */
+ refresh_TR();
} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
/*
* Clear any possible leftover bits:
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1959,19 +1959,6 @@ static void add_atomic_switch_msr(struct
m->host[i].value = host_val;
}
-static void reload_tss(void)
-{
- /*
- * VT restores TR but not its size. Useless.
- */
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- struct desc_struct *descs;
-
- descs = (void *)gdt->address;
- descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
- load_TR_desc();
-}
-
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
u64 guest_efer = vmx->vcpu.arch.efer;
@@ -2141,7 +2128,7 @@ static void __vmx_load_host_state(struct
loadsegment(es, vmx->host_state.es_sel);
}
#endif
- reload_tss();
+ invalidate_tss_limit();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
@@ -2265,6 +2252,14 @@ static void vmx_vcpu_load(struct kvm_vcp
vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
+ /*
+ * VM exits change the host TR limit to 0x67 after a VM
+ * exit. This is okay, since 0x67 covers everything except
+ * the IO bitmap and have have code to handle the IO bitmap
+ * being lost after a VM exit.
+ */
+ BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
+
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
Patches currently in stable-queue which might be from luto@kernel.org are
queue-4.9/x86-mm-refactor-flush_tlb_mm_range-to-merge-local-and-remote-cases.patch
queue-4.9/x86-mm-pass-flush_tlb_info-to-flush_tlb_others-etc.patch
queue-4.9/x86-mm-rework-lazy-tlb-to-track-the-actual-loaded-mm.patch
queue-4.9/x86-mm-kvm-teach-kvm-s-vmx-code-that-cr3-isn-t-a-constant.patch
queue-4.9/x86-mm-use-new-merged-flush-logic-in-arch_tlbbatch_flush.patch
queue-4.9/x86-kvm-vmx-simplify-segment_base.patch
queue-4.9/x86-entry-unwind-create-stack-frames-for-saved-interrupt-registers.patch
queue-4.9/x86-mm-reduce-indentation-in-flush_tlb_func.patch
queue-4.9/x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch
queue-4.9/x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch
queue-4.9/mm-x86-mm-make-the-batched-unmap-tlb-flush-api-more-generic.patch
queue-4.9/x86-kvm-vmx-defer-tr-reload-after-vm-exit.patch
queue-4.9/x86-mm-change-the-leave_mm-condition-for-local-tlb-flushes.patch
queue-4.9/x86-mm-be-more-consistent-wrt-page_shift-vs-page_size-in-tlb-flush-code.patch
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2017-12-21 8:44 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-12-21 8:41 Patch "x86/kvm/vmx: Defer TR reload after VM exit" has been added to the 4.9-stable tree gregkh
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).