[MODERATED] [PATCH 0/2] L1TF KVM v2 0

All of lore.kernel.org
 help / color / mirror / Atom feed

* [MODERATED] [PATCH 0/2] L1TF KVM v2 0
@ 2018-06-20 16:42 Paolo Bonzini
  2018-06-20 16:42 ` [MODERATED] [PATCH 1/2] L1TF KVM v2 1 Paolo Bonzini
  2018-06-20 16:42 ` [MODERATED] [PATCH 2/2] L1TF KVM v2 2 Paolo Bonzini
  0 siblings, 2 replies; 14+ messages in thread
From: Paolo Bonzini @ 2018-06-20 16:42 UTC (permalink / raw)
  To: speck

Here is at last the respin (almost rewrite) of the L1 terminal fault KVM
mitigation patches, adding a TLB flush on vmentry.  They take into account
Thomas and Peter's review.

Thanks,

Paolo

Paolo Bonzini (2):
  kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
  kvm: x86: add support for L1D flush MSR

 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  7 +++-
 arch/x86/include/asm/msr-index.h   |  3 ++
 arch/x86/kvm/mmu.c                 |  1 +
 arch/x86/kvm/svm.c                 |  3 +-
 arch/x86/kvm/vmx.c                 | 46 +++++++++++++++++++++++++-
 arch/x86/kvm/x86.c                 | 66 ++++++++++++++++++++++++++++++++++++--
 7 files changed, 122 insertions(+), 5 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 16:42 [MODERATED] [PATCH 0/2] L1TF KVM v2 0 Paolo Bonzini
@ 2018-06-20 16:42 ` Paolo Bonzini
  2018-06-20 20:13   ` [MODERATED] " Andi Kleen
                     ` (2 more replies)
  2018-06-20 16:42 ` [MODERATED] [PATCH 2/2] L1TF KVM v2 2 Paolo Bonzini
  1 sibling, 3 replies; 14+ messages in thread
From: Paolo Bonzini @ 2018-06-20 16:42 UTC (permalink / raw)
  To: speck

This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".

"vmentry_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
"vmentry_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
in the kind of code that they execute.  The idea is based on Intel's
patches, but I am treating all vmexits as confined unless they execute
specific code that is considered unsafe.  In other words there is
no hardcoded list of "safe" exit reasons; vmexits are considered safe
unless they trigger the emulator, which could be a good target for
other speculative execution-based threats, or the MMU, which can bring
host page tables in the L1 cache.  In addition, executing userspace or
another process will trigger a flush.

The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
Coffee Lake (for the latter, this is independent of whether microcode
or the generic flush code are used).

The mitigation does not in any way try to do anything about hyperthreading;
it is possible for a sibling thread to read data from the cache during a
vmexit, before the host completes the flush, or to read data from the cache
while a sibling runs.  The suggestion there is to disable hyperthreading
unless you've configured your system to dedicate each core to a specific
guest.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  7 ++++-
 arch/x86/kvm/mmu.c              |  1 +
 arch/x86/kvm/svm.c              |  3 ++-
 arch/x86/kvm/vmx.c              | 46 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              | 59 +++++++++++++++++++++++++++++++++++++++--
 5 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0ebe659f2802..863c52f6f1f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+	
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
 };
 
 struct kvm_lpage_info {
@@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -939,7 +943,7 @@ struct kvm_x86_ops {
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
-	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
@@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f440d43c8d5a..9e8b954d5a94 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3839,6 +3839,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d9305f1723f5..715a2f34e815 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5407,8 +5407,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 		svm->asid_generation--;
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
 {
+	*need_l1d_flush = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 48989f78be60..c16d55489f95 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmentry_l1d_flush = 1;
+module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
+{
+	vmx_save_host_state(vcpu);
+	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		*need_l1d_flush = false;
+		return;
+	}
+
+	switch (vmentry_l1d_flush) {
+	case 0:
+		*need_l1d_flush = false;
+		break;
+	case 1:
+		/*
+		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
+		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
+		 * following cases:
+		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
+		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
+		 *   on every nested entry.  Nested hypervisors do not bother clearing
+		 *   the cache.
+		 * - anything that runs the emulator (the slow paths for EPT misconfig
+		 *   or I/O instruction)
+		 * - anything that can cause get_user_pages (EPT violation, and again
+		 *   the slow paths for EPT misconfig or I/O instruction)
+		 * - anything that can run code outside KVM (external interrupt,
+		 *   which can run interrupt handlers or irqs; or the sched_in
+		 *   preempt notifier)
+		 */
+		break;
+	case 2:
+	default:
+		*need_l1d_flush = true;
+		break;
+	}
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9753,6 +9793,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		vcpu->arch.vcpu_unconfined = true;
 	}
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -11803,6 +11844,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return ret;
 	}
 
+	/* Hide L1D cache contents from the nested guest.  */
+	vmx->vcpu.arch.vcpu_unconfined = true;
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12920,7 +12964,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 06dd4cdb2ca8..8bf8f672cece 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6055,6 +6056,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6538,10 +6541,45 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 };
 #endif
 
+
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
+	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+	asm volatile(
+		/* First ensure the pages are in the TLB */
+		"xorl %%eax, %%eax\n\t"
+		"11: \n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $4096, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		/* Now fill the cache */
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $64, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		: : "r" (empty_zero_pages), "r" (size)
+		: "eax", "ebx", "ecx", "edx");
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6561,10 +6599,15 @@ int kvm_arch_init(void *opaque)
 	}
 
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6595,6 +6638,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6619,6 +6664,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7262,6 +7308,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_cpu_accept_dm_intr(vcpu);
 
 	bool req_immediate_exit = false;
+	bool need_l1d_flush;
 
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -7401,7 +7448,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
-	kvm_x86_ops->prepare_guest_switch(vcpu);
+	need_l1d_flush = vcpu->arch.vcpu_unconfined;
+	vcpu->arch.vcpu_unconfined = false;
+	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
+	if (need_l1d_flush) {
+		vcpu->stat.l1d_flush++;
+		kvm_l1d_flush();
+	}
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7588,6 +7641,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8704,6 +8758,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [MODERATED] [PATCH 2/2] L1TF KVM v2 2
  2018-06-20 16:42 [MODERATED] [PATCH 0/2] L1TF KVM v2 0 Paolo Bonzini
  2018-06-20 16:42 ` [MODERATED] [PATCH 1/2] L1TF KVM v2 1 Paolo Bonzini
@ 2018-06-20 16:42 ` Paolo Bonzini
  2018-06-20 20:25   ` [MODERATED] " Jiri Kosina
  1 sibling, 1 reply; 14+ messages in thread
From: Paolo Bonzini @ 2018-06-20 16:42 UTC (permalink / raw)
  To: speck

Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
If it is active, the displacement flush is unnecessary.  Tested on
a Coffee Lake machine.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/include/asm/msr-index.h   | 3 +++
 arch/x86/kvm/x86.c                 | 9 ++++++++-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 578793e97431..aebf89c4175d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -333,6 +333,7 @@
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D		(18*32+28) /* IA32_FLUSH_L1D MSR */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
 
 /*
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53d5b1b9255e..f43bd9f23053 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -65,6 +65,9 @@
 
 #define MSR_MTRRcap			0x000000fe
 
+#define MSR_IA32_FLUSH_L1D             0x10b
+#define MSR_IA32_FLUSH_L1D_VALUE       0x00000001
+
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8bf8f672cece..c88ed554bb66 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6551,8 +6551,15 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 
 void kvm_l1d_flush(void)
 {
+	int size;
+
+	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+		wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE);
+		return;
+	}
+
 	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
-	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+	size = PAGE_SIZE << L1D_CACHE_ORDER;
 	asm volatile(
 		/* First ensure the pages are in the TLB */
 		"xorl %%eax, %%eax\n\t"
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 16:42 ` [MODERATED] [PATCH 1/2] L1TF KVM v2 1 Paolo Bonzini
@ 2018-06-20 20:13   ` Andi Kleen
  2018-06-20 20:16     ` Thomas Gleixner
  2018-06-20 20:24     ` Jiri Kosina
  2018-06-20 21:01   ` Konrad Rzeszutek Wilk
  2018-06-21  8:15   ` Peter Zijlstra
  2 siblings, 2 replies; 14+ messages in thread
From: Andi Kleen @ 2018-06-20 20:13 UTC (permalink / raw)
  To: speck

> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;

I thought we agreed to remove this? We should be only using the MSR based flush
methods, as microcode updates are needed in any case for other things.

-Andi

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 20:13   ` [MODERATED] " Andi Kleen
@ 2018-06-20 20:16     ` Thomas Gleixner
  2018-06-20 20:31       ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-06-20 20:24     ` Jiri Kosina
  1 sibling, 1 reply; 14+ messages in thread
From: Thomas Gleixner @ 2018-06-20 20:16 UTC (permalink / raw)
  To: speck

On Wed, 20 Jun 2018, speck for Andi Kleen wrote:
> > +
> > +/*
> > + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> > + * 64 KiB because the replacement algorithm is not exactly LRU.
> > + */
> > +#define L1D_CACHE_ORDER 4
> > +static void *__read_mostly empty_zero_pages;
> 
> I thought we agreed to remove this? We should be only using the MSR based flush
> methods, as microcode updates are needed in any case for other things.

And when are those microcode updates materializing? As far as I know they
are still in limbo, but you might have a better answer.

If that stuff goes public early and micro code updates are not available
then we have absolutely nothing.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 20:13   ` [MODERATED] " Andi Kleen
  2018-06-20 20:16     ` Thomas Gleixner
@ 2018-06-20 20:24     ` Jiri Kosina
  1 sibling, 0 replies; 14+ messages in thread
From: Jiri Kosina @ 2018-06-20 20:24 UTC (permalink / raw)
  To: speck

On Wed, 20 Jun 2018, speck for Andi Kleen wrote:

> > + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> > + * 64 KiB because the replacement algorithm is not exactly LRU.
> > + */
> > +#define L1D_CACHE_ORDER 4
> > +static void *__read_mostly empty_zero_pages;
> 
> I thought we agreed to remove this? We should be only using the MSR based flush
> methods, as microcode updates are needed in any case for other things.

For spectre v4, we rushed pretty hard to have the kernel code shipped to 
our users by the CRD, and here we are weeks after the CRD, and the ucode 
is still not released. Therefore I think it's reasonable to be a bit more 
precautious if we want to keep the users safe from day 1.

What exactly is the problem with doing this only on systems that don't 
have CPUID[7,0].EDX[28] feature?

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 2/2] L1TF KVM v2 2
  2018-06-20 16:42 ` [MODERATED] [PATCH 2/2] L1TF KVM v2 2 Paolo Bonzini
@ 2018-06-20 20:25   ` Jiri Kosina
  2018-06-20 20:32     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 14+ messages in thread
From: Jiri Kosina @ 2018-06-20 20:25 UTC (permalink / raw)
  To: speck

On Wed, 20 Jun 2018, speck for Paolo Bonzini wrote:

> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 578793e97431..aebf89c4175d 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -333,6 +333,7 @@
>  #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
>  #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
>  #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
> +#define X86_FEATURE_FLUSH_L1D		(18*32+28) /* IA32_FLUSH_L1D MSR */
>  #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
>  
>  /*
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 53d5b1b9255e..f43bd9f23053 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -65,6 +65,9 @@
>  
>  #define MSR_MTRRcap			0x000000fe
>  
> +#define MSR_IA32_FLUSH_L1D             0x10b
> +#define MSR_IA32_FLUSH_L1D_VALUE       0x00000001
> +
>  #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
>  #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
>  #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 8bf8f672cece..c88ed554bb66 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6551,8 +6551,15 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
>  
>  void kvm_l1d_flush(void)
>  {
> +	int size;
> +
> +	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {

Umm, where exactly do you set this feature bit?

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 20:16     ` Thomas Gleixner
@ 2018-06-20 20:31       ` Konrad Rzeszutek Wilk
  2018-06-20 20:49         ` Thomas Gleixner
  2018-06-20 20:52         ` [MODERATED] " Jiri Kosina
  0 siblings, 2 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-20 20:31 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 10:16:33PM +0200, speck for Thomas Gleixner wrote:
> On Wed, 20 Jun 2018, speck for Andi Kleen wrote:
> > > +
> > > +/*
> > > + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> > > + * 64 KiB because the replacement algorithm is not exactly LRU.
> > > + */
> > > +#define L1D_CACHE_ORDER 4
> > > +static void *__read_mostly empty_zero_pages;
> > 
> > I thought we agreed to remove this? We should be only using the MSR based flush
> > methods, as microcode updates are needed in any case for other things.
> 
> And when are those microcode updates materializing? As far as I know they
> are still in limbo, but you might have a better answer.

The microcode is out already. The GPZv4 micrcode also had L1 CMD FLUSH implemented in it.
> 
> If that stuff goes public early and micro code updates are not available
> then we have absolutely nothing.
> 
> Thanks,
> 
> 	tglx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 2/2] L1TF KVM v2 2
  2018-06-20 20:25   ` [MODERATED] " Jiri Kosina
@ 2018-06-20 20:32     ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-20 20:32 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 10:25:45PM +0200, speck for Jiri Kosina wrote:
> On Wed, 20 Jun 2018, speck for Paolo Bonzini wrote:
> 
> > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> > index 578793e97431..aebf89c4175d 100644
> > --- a/arch/x86/include/asm/cpufeatures.h
> > +++ b/arch/x86/include/asm/cpufeatures.h
> > @@ -333,6 +333,7 @@
> >  #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
> >  #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
> >  #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
> > +#define X86_FEATURE_FLUSH_L1D		(18*32+28) /* IA32_FLUSH_L1D MSR */
> >  #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
> >  
> >  /*
> > diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> > index 53d5b1b9255e..f43bd9f23053 100644
> > --- a/arch/x86/include/asm/msr-index.h
> > +++ b/arch/x86/include/asm/msr-index.h
> > @@ -65,6 +65,9 @@
> >  
> >  #define MSR_MTRRcap			0x000000fe
> >  
> > +#define MSR_IA32_FLUSH_L1D             0x10b
> > +#define MSR_IA32_FLUSH_L1D_VALUE       0x00000001
> > +
> >  #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
> >  #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
> >  #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 8bf8f672cece..c88ed554bb66 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -6551,8 +6551,15 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
> >  
> >  void kvm_l1d_flush(void)
> >  {
> > +	int size;
> > +
> > +	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> 
> Umm, where exactly do you set this feature bit?

CPUID.7 array ends up being set  automatically in cpu/common.c.

> 
> Thanks,
> 
> -- 
> Jiri Kosina
> SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 20:31       ` [MODERATED] " Konrad Rzeszutek Wilk
@ 2018-06-20 20:49         ` Thomas Gleixner
  2018-06-20 20:52         ` [MODERATED] " Jiri Kosina
  1 sibling, 0 replies; 14+ messages in thread
From: Thomas Gleixner @ 2018-06-20 20:49 UTC (permalink / raw)
  To: speck

On Wed, 20 Jun 2018, speck for Konrad Rzeszutek Wilk wrote:

> On Wed, Jun 20, 2018 at 10:16:33PM +0200, speck for Thomas Gleixner wrote:
> > On Wed, 20 Jun 2018, speck for Andi Kleen wrote:
> > > > +
> > > > +/*
> > > > + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> > > > + * 64 KiB because the replacement algorithm is not exactly LRU.
> > > > + */
> > > > +#define L1D_CACHE_ORDER 4
> > > > +static void *__read_mostly empty_zero_pages;
> > > 
> > > I thought we agreed to remove this? We should be only using the MSR based flush
> > > methods, as microcode updates are needed in any case for other things.
> > 
> > And when are those microcode updates materializing? As far as I know they
> > are still in limbo, but you might have a better answer.
> 
> The microcode is out already. The GPZv4 micrcode also had L1 CMD FLUSH implemented in it.

I'm not talking about the stuff Intel shipped behind the scenes in
homoeopathic doses. The official download site is still on the 20180425
Release which has nothing of that AFAICT.

   0x00000007 0x00: eax=0x00000000 ebx=0x000027ab ecx=0x00000000 edx=0x0c000000

Can you spot bit 28 being set in EDX?

[    0.000000] microcode: microcode updated early to revision 0x11, date = 2018-01-22

That's the version published early this year...

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 20:31       ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-06-20 20:49         ` Thomas Gleixner
@ 2018-06-20 20:52         ` Jiri Kosina
  1 sibling, 0 replies; 14+ messages in thread
From: Jiri Kosina @ 2018-06-20 20:52 UTC (permalink / raw)
  To: speck

On Wed, 20 Jun 2018, speck for Konrad Rzeszutek Wilk wrote:

> The microcode is out already.

Isn't the latest publicly available one still 20180425?

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 16:42 ` [MODERATED] [PATCH 1/2] L1TF KVM v2 1 Paolo Bonzini
  2018-06-20 20:13   ` [MODERATED] " Andi Kleen
@ 2018-06-20 21:01   ` Konrad Rzeszutek Wilk
  2018-06-20 21:43     ` Konrad Rzeszutek Wilk
  2018-06-21  8:15   ` Peter Zijlstra
  2 siblings, 1 reply; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-20 21:01 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 06:42:06PM +0200, speck for Paolo Bonzini wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH v2 1/2] kvm: x86: mitigation for L1 cache terminal fault
>  vulnerabilities


Going to comment here as well here in case Paolo you are focused only
on this patch and not looking at others.
> 
> This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
> fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".
> 
> "vmentry_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
> "vmentry_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
> in the kind of code that they execute.  The idea is based on Intel's
> patches, but I am treating all vmexits as confined unless they execute
> specific code that is considered unsafe.  In other words there is
> no hardcoded list of "safe" exit reasons; vmexits are considered safe
> unless they trigger the emulator, which could be a good target for
> other speculative execution-based threats, or the MMU, which can bring
> host page tables in the L1 cache.  In addition, executing userspace or
> another process will trigger a flush.
> 
> The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
> is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
> Coffee Lake (for the latter, this is independent of whether microcode
> or the generic flush code are used).

What about Haswell? Is the generic flush better or worst than using the microcode?
Also what about Skylake? If there are differences should we add some switch
table to pick the best choice?

See below.
> 
> The mitigation does not in any way try to do anything about hyperthreading;
> it is possible for a sibling thread to read data from the cache during a
> vmexit, before the host completes the flush, or to read data from the cache
> while a sibling runs.  The suggestion there is to disable hyperthreading
> unless you've configured your system to dedicate each core to a specific
> guest.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 ++++-
>  arch/x86/kvm/mmu.c              |  1 +
>  arch/x86/kvm/svm.c              |  3 ++-
>  arch/x86/kvm/vmx.c              | 46 +++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              | 59 +++++++++++++++++++++++++++++++++++++++--
>  5 files changed, 111 insertions(+), 5 deletions(-)

Also there should be a Documentation/.. patch.

> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 0ebe659f2802..863c52f6f1f2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
>  
>  	/* be preempted when it's in kernel-mode(cpl=0) */
>  	bool preempted_in_kernel;
> +	

Giant space above can be removed.
> +	/* for L1 terminal fault vulnerability */
> +	bool vcpu_unconfined;
>  };
>  
>  struct kvm_lpage_info {
> @@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
>  	u64 signal_exits;
>  	u64 irq_window_exits;
>  	u64 nmi_window_exits;
> +	u64 l1d_flush;
>  	u64 halt_exits;
>  	u64 halt_successful_poll;
>  	u64 halt_attempted_poll;
> @@ -939,7 +943,7 @@ struct kvm_x86_ops {
>  	void (*vcpu_free)(struct kvm_vcpu *vcpu);
>  	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
>  
> -	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
> +	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
>  	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
>  	void (*vcpu_put)(struct kvm_vcpu *vcpu);
>  
> @@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  
>  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>  		     struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>  
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index f440d43c8d5a..9e8b954d5a94 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3839,6 +3839,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
>  {
>  	int r = 1;
>  
> +	vcpu->arch.vcpu_unconfined = true;
>  	switch (vcpu->arch.apf.host_apf_reason) {
>  	default:
>  		trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d9305f1723f5..715a2f34e815 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5407,8 +5407,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
>  		svm->asid_generation--;
>  }
>  
> -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
>  {
> +	*need_l1d_flush = false;
>  }
>  
>  static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 48989f78be60..c16d55489f95 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@
>  };
>  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>  
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
> +
>  static bool __read_mostly enable_vpid = 1;
>  module_param_named(vpid, enable_vpid, bool, 0444);
>  
> @@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  				   vmx->guest_msrs[i].mask);
>  }
>  
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> +{
> +	vmx_save_host_state(vcpu);
> +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {

|| !(boot_cpu_has(X86_BUG_L1TF);
> +		*need_l1d_flush = false;
> +		return;
> +	}
> +
> +	switch (vmentry_l1d_flush) {
> +	case 0:
> +		*need_l1d_flush = false;
> +		break;
> +	case 1:
> +		/*
> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> +		 * following cases:
> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> +		 *   the cache.
> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> +		 *   or I/O instruction)
> +		 * - anything that can cause get_user_pages (EPT violation, and again
> +		 *   the slow paths for EPT misconfig or I/O instruction)
> +		 * - anything that can run code outside KVM (external interrupt,
> +		 *   which can run interrupt handlers or irqs; or the sched_in
> +		 *   preempt notifier)
> +		 */
> +		break;
> +	case 2:
> +	default:
> +		*need_l1d_flush = true;
> +		break;
> +	}
> +}
> +
>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
>  {
>  	if (!vmx->host_state.loaded)
> @@ -9753,6 +9793,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  			[ss]"i"(__KERNEL_DS),
>  			[cs]"i"(__KERNEL_CS)
>  			);
> +		vcpu->arch.vcpu_unconfined = true;
>  	}
>  }
>  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -11803,6 +11844,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  		return ret;
>  	}
>  
> +	/* Hide L1D cache contents from the nested guest.  */
> +	vmx->vcpu.arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>  	 * by event injection, halt vcpu.
> @@ -12920,7 +12964,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
>  	.vcpu_free = vmx_free_vcpu,
>  	.vcpu_reset = vmx_vcpu_reset,
>  
> -	.prepare_guest_switch = vmx_save_host_state,
> +	.prepare_guest_switch = vmx_prepare_guest_switch,
>  	.vcpu_load = vmx_vcpu_load,
>  	.vcpu_put = vmx_vcpu_put,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 06dd4cdb2ca8..8bf8f672cece 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  	{ "irq_injections", VCPU_STAT(irq_injections) },
>  	{ "nmi_injections", VCPU_STAT(nmi_injections) },
>  	{ "req_event", VCPU_STAT(req_event) },
> +	{ "l1d_flush", VCPU_STAT(l1d_flush) },
>  	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
>  	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
>  	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6055,6 +6056,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
>  	bool writeback = true;
>  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>  
> +	vcpu->arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * Clear write_fault_to_shadow_pgtable here to ensure it is
>  	 * never reused.
> @@ -6538,10 +6541,45 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
>  };
>  #endif
>  
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> +	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
> +	int size = PAGE_SIZE << L1D_CACHE_ORDER;

ASSERT(boot_cpu_has(X86_BUG_L1TF)) ?

But back to the description - perhaps we should have a switch statement
if we want to pick this code instead of using the microcode provided one?

> +	asm volatile(
> +		/* First ensure the pages are in the TLB */
> +		"xorl %%eax, %%eax\n\t"
> +		"11: \n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $4096, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"
> +		/* Now fill the cache */
> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		: : "r" (empty_zero_pages), "r" (size)
> +		: "eax", "ebx", "ecx", "edx");
> +}
> +
>  int kvm_arch_init(void *opaque)
>  {
>  	int r;
>  	struct kvm_x86_ops *ops = opaque;
> +	struct page *page;
>  
>  	if (kvm_x86_ops) {
>  		printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6561,10 +6599,15 @@ int kvm_arch_init(void *opaque)
>  	}
>  
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6595,6 +6638,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);
>  out:
> @@ -6619,6 +6664,7 @@ void kvm_arch_exit(void)
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  	free_percpu(shared_msrs);
>  }
>  
> @@ -7262,6 +7308,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  		kvm_cpu_accept_dm_intr(vcpu);
>  
>  	bool req_immediate_exit = false;
> +	bool need_l1d_flush;
>  
>  	if (kvm_request_pending(vcpu)) {
>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> @@ -7401,7 +7448,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> -	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	need_l1d_flush = vcpu->arch.vcpu_unconfined;
> +	vcpu->arch.vcpu_unconfined = false;
> +	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
> +	if (need_l1d_flush) {
> +		vcpu->stat.l1d_flush++;
> +		kvm_l1d_flush();
> +	}

Would it be more safer to move the kvm_l1d_flush in the vmx_vcpu_run?
(And also the equivalant for SVM)?


>  
>  	/*
>  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
> @@ -7588,6 +7641,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>  	struct kvm *kvm = vcpu->kvm;
>  
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> +	vcpu->arch.vcpu_unconfined = true;
>  
>  	for (;;) {
>  		if (kvm_vcpu_running(vcpu)) {
> @@ -8704,6 +8758,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  
>  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  {
> +	vcpu->arch.vcpu_unconfined = true;
>  	kvm_x86_ops->sched_in(vcpu, cpu);
>  }
>  
> -- 
> 1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 21:01   ` Konrad Rzeszutek Wilk
@ 2018-06-20 21:43     ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-20 21:43 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 05:01:44PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Wed, Jun 20, 2018 at 06:42:06PM +0200, speck for Paolo Bonzini wrote:
> > From: Paolo Bonzini <pbonzini@redhat.com>
> > Subject: [PATCH v2 1/2] kvm: x86: mitigation for L1 cache terminal fault
> >  vulnerabilities
> 
> 
> Going to comment here as well here in case Paolo you are focused only
> on this patch and not looking at others.

Here inline is your patch with some fixes I enumerated.. I also updated the
commit description a bit.


From 9f67d58b6972e0c28c9c8b655593e0701b8d91cf Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Jun 2018 15:49:44 -0400
Subject: [PATCH] kvm: x86: mitigation for L1 cache terminal fault
 vulnerabilities

We add two mitigation modes for CVE-2018-3620, aka L1 terminal
fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".

"vmentry_l1d_flush=2" is simply doing an L1 cache flush on every VMENTER.
"vmentry_l1d_flush=1" is trying to avoid so many L1 cache flueshes on
VMENTER and instead only does if the reason for entering the hypervisor
is based on the type of code that is executed. The idea is based on Intel's
patches, but we treat all vmexits as safe unless they execute
specific code that is considered unsafe. There is no hardcoded list of
"safe" exit reasons; but vmexits are considered safe unless:
 - They trigger the emulator, which could be a good target for
   other speculative execution-based threats,
 - or the MMU, which can bring host page tables in the L1 cache.
 - In addition, executing userspace or another process will trigger a flush.

The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
Coffee Lake (for the latter, this is independent of whether microcode
or the generic flush code are used).

The mitigation does not in any way try to do anything about hyperthreading;
it is possible for a sibling thread to read data from the cache during a
vmexit, before the host completes the flush, or to read data from the cache
while a sibling runs.  The suggestion there is to disable hyperthreading
unless you've configured your system to dedicate each core to a specific
guest.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v2: Add checks for X86_BUG_L1TF
   - Rework the commit description.
---
 Documentation/admin-guide/kernel-parameters.txt | 11 +++++
 arch/x86/include/asm/kvm_host.h                 |  7 ++-
 arch/x86/kvm/mmu.c                              |  1 +
 arch/x86/kvm/svm.c                              |  3 +-
 arch/x86/kvm/vmx.c                              | 48 ++++++++++++++++++-
 arch/x86/kvm/x86.c                              | 62 ++++++++++++++++++++++++-
 6 files changed, 127 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4e56e6d0f124..dd1db796a463 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1923,6 +1923,17 @@
 			SMT (aka Hyper-Threading) enabled then don't load KVM module.
 			Default is 0 (allow module to be loaded).
 
+	kvm.vmentry_l1d_flush=[KVM] Mitigation for L1 Terminal Fault CVE.
+			Valid arguments: 0, 1, 2
+
+			2 does an L1 cache flush on every VMENTER.
+			1 tries to avoid so many L1 cache flush on VMENTERs and instead
+			do it only if the kind of code that is executed would lead to
+			leaking host memory.
+			0 disables the mitigation
+
+			Default is 1 (do L1 cache flush in specific instances)
+
 	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
 			KVM MMU at runtime.
 			Default is 0 (off)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..16da2973f939 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
 };
 
 struct kvm_lpage_info {
@@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -939,7 +943,7 @@ struct kvm_x86_ops {
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
-	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
@@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..4d4e3dc2494e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..302afb2643fa 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 		svm->asid_generation--;
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
 {
+	*need_l1d_flush = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 559a12b6184d..b4b2af1ed223 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmentry_l1d_flush = 1;
+module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -2618,6 +2621,45 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
+{
+	vmx_save_host_state(vcpu);
+
+	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+	    !static_cpu_has(X86_BUG_L1TF)) {
+		*need_l1d_flush = false;
+		return;
+	}
+
+	switch (vmentry_l1d_flush) {
+	case 0:
+		*need_l1d_flush = false;
+		break;
+	case 1:
+		/*
+		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
+		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
+		 * following cases:
+		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
+		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
+		 *   on every nested entry.  Nested hypervisors do not bother clearing
+		 *   the cache.
+		 * - anything that runs the emulator (the slow paths for EPT misconfig
+		 *   or I/O instruction)
+		 * - anything that can cause get_user_pages (EPT violation, and again
+		 *   the slow paths for EPT misconfig or I/O instruction)
+		 * - anything that can run code outside KVM (external interrupt,
+		 *   which can run interrupt handlers or irqs; or the sched_in
+		 *   preempt notifier)
+		 */
+		break;
+	case 2:
+	default:
+		*need_l1d_flush = true;
+		break;
+	}
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9751,6 +9793,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		vcpu->arch.vcpu_unconfined = true;
 	}
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -11811,6 +11854,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return ret;
 	}
 
+	/* Hide L1D cache contents from the nested guest.  */
+	vmx->vcpu.arch.vcpu_unconfined = true;
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12928,7 +12974,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1065d4e7c5fd..08df49e2c2d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6537,10 +6540,48 @@ static struct notifier_block pvclock_gtod_notifier = {
 };
 #endif
 
+
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
+	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+	ASSERT(boot_cpu_has(X86_BUG_L1TF));
+
+	asm volatile(
+		/* First ensure the pages are in the TLB */
+		"xorl %%eax, %%eax\n\t"
+		"11: \n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $4096, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		/* Now fill the cache */
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $64, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		: : "r" (empty_zero_pages), "r" (size)
+		: "eax", "ebx", "ecx", "edx");
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6569,10 +6610,15 @@ int kvm_arch_init(void *opaque)
 				"being able to snoop the host memory!");
 	}
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6603,6 +6649,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6627,6 +6675,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7266,6 +7315,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_cpu_accept_dm_intr(vcpu);
 
 	bool req_immediate_exit = false;
+	bool need_l1d_flush;
 
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -7405,7 +7455,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
-	kvm_x86_ops->prepare_guest_switch(vcpu);
+	need_l1d_flush = vcpu->arch.vcpu_unconfined;
+	vcpu->arch.vcpu_unconfined = false;
+	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
+	if (need_l1d_flush) {
+		vcpu->stat.l1d_flush++;
+		kvm_l1d_flush();
+	}
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7592,6 +7648,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8711,6 +8768,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
2.13.4

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM v2 1
  2018-06-20 16:42 ` [MODERATED] [PATCH 1/2] L1TF KVM v2 1 Paolo Bonzini
  2018-06-20 20:13   ` [MODERATED] " Andi Kleen
  2018-06-20 21:01   ` Konrad Rzeszutek Wilk
@ 2018-06-21  8:15   ` Peter Zijlstra
  2 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2018-06-21  8:15 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 06:42:06PM +0200, speck for Paolo Bonzini wrote:
> "vmentry_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
   ^^^^^^^                                                         ^^^^^^

Pick one :-) The code seems to do vmentry (as makes sense).

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-06-21  8:15 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-06-20 16:42 [MODERATED] [PATCH 0/2] L1TF KVM v2 0 Paolo Bonzini
2018-06-20 16:42 ` [MODERATED] [PATCH 1/2] L1TF KVM v2 1 Paolo Bonzini
2018-06-20 20:13   ` [MODERATED] " Andi Kleen
2018-06-20 20:16     ` Thomas Gleixner
2018-06-20 20:31       ` [MODERATED] " Konrad Rzeszutek Wilk
2018-06-20 20:49         ` Thomas Gleixner
2018-06-20 20:52         ` [MODERATED] " Jiri Kosina
2018-06-20 20:24     ` Jiri Kosina
2018-06-20 21:01   ` Konrad Rzeszutek Wilk
2018-06-20 21:43     ` Konrad Rzeszutek Wilk
2018-06-21  8:15   ` Peter Zijlstra
2018-06-20 16:42 ` [MODERATED] [PATCH 2/2] L1TF KVM v2 2 Paolo Bonzini
2018-06-20 20:25   ` [MODERATED] " Jiri Kosina
2018-06-20 20:32     ` Konrad Rzeszutek Wilk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.