[MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5

All of lore.kernel.org
 help / color / mirror / Atom feed

* [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
@ 2018-06-20 20:43 konrad.wilk
  2018-06-20 20:57 ` [MODERATED] " Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 5+ messages in thread
From: konrad.wilk @ 2018-06-20 20:43 UTC (permalink / raw)
  To: speck

Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
If it is active, the displacement flush is unnecessary.  Tested on
a Coffee Lake machine.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |  7 ++++-
 arch/x86/kvm/mmu.c              |  1 +
 arch/x86/kvm/svm.c              |  3 ++-
 arch/x86/kvm/vmx.c              | 46 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              | 59 +++++++++++++++++++++++++++++++++++++++--
 5 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..7d7626cffc21 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+	
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
 };
 
 struct kvm_lpage_info {
@@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -939,7 +943,7 @@ struct kvm_x86_ops {
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
-	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
@@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..4d4e3dc2494e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..302afb2643fa 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 		svm->asid_generation--;
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
 {
+	*need_l1d_flush = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 559a12b6184d..ba83d0cb9b03 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmentry_l1d_flush = 1;
+module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
+{
+	vmx_save_host_state(vcpu);
+	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		*need_l1d_flush = false;
+		return;
+	}
+
+	switch (vmentry_l1d_flush) {
+	case 0:
+		*need_l1d_flush = false;
+		break;
+	case 1:
+		/*
+		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
+		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
+		 * following cases:
+		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
+		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
+		 *   on every nested entry.  Nested hypervisors do not bother clearing
+		 *   the cache.
+		 * - anything that runs the emulator (the slow paths for EPT misconfig
+		 *   or I/O instruction)
+		 * - anything that can cause get_user_pages (EPT violation, and again
+		 *   the slow paths for EPT misconfig or I/O instruction)
+		 * - anything that can run code outside KVM (external interrupt,
+		 *   which can run interrupt handlers or irqs; or the sched_in
+		 *   preempt notifier)
+		 */
+		break;
+	case 2:
+	default:
+		*need_l1d_flush = true;
+		break;
+	}
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		vcpu->arch.vcpu_unconfined = true;
 	}
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return ret;
 	}
 
+	/* Hide L1D cache contents from the nested guest.  */
+	vmx->vcpu.arch.vcpu_unconfined = true;
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1065d4e7c5fd..c0079aae38ba 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = {
 };
 #endif
 
+
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
+	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+	asm volatile(
+		/* First ensure the pages are in the TLB */
+		"xorl %%eax, %%eax\n\t"
+		"11: \n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $4096, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		/* Now fill the cache */
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $64, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		: : "r" (empty_zero_pages), "r" (size)
+		: "eax", "ebx", "ecx", "edx");
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque)
 				"being able to snoop the host memory!");
 	}
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6627,6 +6672,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_cpu_accept_dm_intr(vcpu);
 
 	bool req_immediate_exit = false;
+	bool need_l1d_flush;
 
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
-	kvm_x86_ops->prepare_guest_switch(vcpu);
+	need_l1d_flush = vcpu->arch.vcpu_unconfined;
+	vcpu->arch.vcpu_unconfined = false;
+	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
+	if (need_l1d_flush) {
+		vcpu->stat.l1d_flush++;
+		kvm_l1d_flush();
+	}
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-20 20:43 [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5 konrad.wilk
@ 2018-06-20 20:57 ` Konrad Rzeszutek Wilk
  2018-06-21  3:17   ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 5+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-20 20:57 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> 
> Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> If it is active, the displacement flush is unnecessary.  Tested on
> a Coffee Lake machine.

ooops, that nice commit description you wrote Paolo got messed up. Sorry!

> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 ++++-
>  arch/x86/kvm/mmu.c              |  1 +
>  arch/x86/kvm/svm.c              |  3 ++-
>  arch/x86/kvm/vmx.c              | 46 +++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              | 59 +++++++++++++++++++++++++++++++++++++++--
>  5 files changed, 111 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c13cd28d9d1b..7d7626cffc21 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
>  
>  	/* be preempted when it's in kernel-mode(cpl=0) */
>  	bool preempted_in_kernel;
> +	

There is a giant space right above here.. And I believe Paolo's original patch had it too.

> +	/* for L1 terminal fault vulnerability */
> +	bool vcpu_unconfined;
>  };
>  
>  struct kvm_lpage_info {
> @@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
>  	u64 signal_exits;
>  	u64 irq_window_exits;
>  	u64 nmi_window_exits;
> +	u64 l1d_flush;
>  	u64 halt_exits;
>  	u64 halt_successful_poll;
>  	u64 halt_attempted_poll;
> @@ -939,7 +943,7 @@ struct kvm_x86_ops {
>  	void (*vcpu_free)(struct kvm_vcpu *vcpu);
>  	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
>  
> -	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
> +	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
>  	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
>  	void (*vcpu_put)(struct kvm_vcpu *vcpu);
>  
> @@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  
>  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>  		     struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>  
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d594690d8b95..4d4e3dc2494e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
>  {
>  	int r = 1;
>  
> +	vcpu->arch.vcpu_unconfined = true;
>  	switch (vcpu->arch.apf.host_apf_reason) {
>  	default:
>  		trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f059a73f0fd0..302afb2643fa 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
>  		svm->asid_generation--;
>  }
>  
> -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
>  {
> +	*need_l1d_flush = false;
>  }
>  
>  static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..ba83d0cb9b03 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
>  };
>  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>  
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
> +
>  static bool __read_mostly enable_vpid = 1;
>  module_param_named(vpid, enable_vpid, bool, 0444);
>  
> @@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  				   vmx->guest_msrs[i].mask);
>  }
>  
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> +{
> +	vmx_save_host_state(vcpu);
> +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {

and 
	|| !boot_cpu_has(X86_BUG_L1TF)

> +		*need_l1d_flush = false;
> +		return;
> +	}
> +

> +	switch (vmentry_l1d_flush) {
> +	case 0:
> +		*need_l1d_flush = false;
> +		break;
> +	case 1:
> +		/*
> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> +		 * following cases:
> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> +		 *   the cache.
> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> +		 *   or I/O instruction)
> +		 * - anything that can cause get_user_pages (EPT violation, and again
> +		 *   the slow paths for EPT misconfig or I/O instruction)
> +		 * - anything that can run code outside KVM (external interrupt,
> +		 *   which can run interrupt handlers or irqs; or the sched_in
> +		 *   preempt notifier)
> +		 */
> +		break;
> +	case 2:
> +	default:
> +		*need_l1d_flush = true;
> +		break;
> +	}
> +}
> +
>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
>  {
>  	if (!vmx->host_state.loaded)
> @@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  			[ss]"i"(__KERNEL_DS),
>  			[cs]"i"(__KERNEL_CS)
>  			);
> +		vcpu->arch.vcpu_unconfined = true;
>  	}
>  }
>  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  		return ret;
>  	}
>  
> +	/* Hide L1D cache contents from the nested guest.  */
> +	vmx->vcpu.arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>  	 * by event injection, halt vcpu.
> @@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.vcpu_free = vmx_free_vcpu,
>  	.vcpu_reset = vmx_vcpu_reset,
>  
> -	.prepare_guest_switch = vmx_save_host_state,
> +	.prepare_guest_switch = vmx_prepare_guest_switch,
>  	.vcpu_load = vmx_vcpu_load,
>  	.vcpu_put = vmx_vcpu_put,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1065d4e7c5fd..c0079aae38ba 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  	{ "irq_injections", VCPU_STAT(irq_injections) },
>  	{ "nmi_injections", VCPU_STAT(nmi_injections) },
>  	{ "req_event", VCPU_STAT(req_event) },
> +	{ "l1d_flush", VCPU_STAT(l1d_flush) },
>  	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
>  	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
>  	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
>  	bool writeback = true;
>  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>  
> +	vcpu->arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * Clear write_fault_to_shadow_pgtable here to ensure it is
>  	 * never reused.
> @@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = {
>  };
>  #endif
>  
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> +	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
> +	int size = PAGE_SIZE << L1D_CACHE_ORDER;

I am thinking we should have:

	ASSERT(boot_cpu_has(X86_BUG_L1TF));

or such?

> +	asm volatile(
> +		/* First ensure the pages are in the TLB */
> +		"xorl %%eax, %%eax\n\t"
> +		"11: \n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $4096, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"
> +		/* Now fill the cache */
> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		: : "r" (empty_zero_pages), "r" (size)
> +		: "eax", "ebx", "ecx", "edx");
> +}
> +
>  int kvm_arch_init(void *opaque)
>  {
>  	int r;
>  	struct kvm_x86_ops *ops = opaque;
> +	struct page *page;
>  
>  	if (kvm_x86_ops) {
>  		printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque)
>  				"being able to snoop the host memory!");
>  	}
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);
>  out:
> @@ -6627,6 +6672,7 @@ void kvm_arch_exit(void)
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  	free_percpu(shared_msrs);
>  }
>  
> @@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  		kvm_cpu_accept_dm_intr(vcpu);
>  
>  	bool req_immediate_exit = false;
> +	bool need_l1d_flush;
>  
>  	if (kvm_request_pending(vcpu)) {
>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> @@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> -	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	need_l1d_flush = vcpu->arch.vcpu_unconfined;
> +	vcpu->arch.vcpu_unconfined = false;
> +	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
> +	if (need_l1d_flush) {
> +		vcpu->stat.l1d_flush++;
> +		kvm_l1d_flush();
> +	}

I was thinking that this kvm_l1d_flush should really happen right before 
you an VMENTER. I was thinking to change your patch for this exact reason
but I am curious why you put it here?

>  
>  	/*
>  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
> @@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>  	struct kvm *kvm = vcpu->kvm;
>  
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> +	vcpu->arch.vcpu_unconfined = true;
>  
>  	for (;;) {
>  		if (kvm_vcpu_running(vcpu)) {
> @@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  
>  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  {
> +	vcpu->arch.vcpu_unconfined = true;
>  	kvm_x86_ops->sched_in(vcpu, cpu);
>  }
>  
> -- 
> 2.14.3
> 
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-20 20:57 ` [MODERATED] " Konrad Rzeszutek Wilk
@ 2018-06-21  3:17   ` Konrad Rzeszutek Wilk
  2018-06-21 14:04     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 5+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-21  3:17 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 04:57:02PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> > 
> > Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> > If it is active, the displacement flush is unnecessary.  Tested on
> > a Coffee Lake machine.
> 
> ooops, that nice commit description you wrote Paolo got messed up. Sorry!

Ended up fixing this up to be:

From a2029479d4c83c8e3476d8b37d288fb441576b43 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Jun 2018 15:49:44 -0400
Subject: [PATCH v2.2 05/12] kvm: x86: mitigation for L1 cache terminal fault
 vulnerabilities

We add two mitigation modes for CVE-2018-3620, aka L1 terminal
fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".

"vmentry_l1d_flush=2" is simply doing an L1 cache flush on every VMENTER.
"vmentry_l1d_flush=1" is trying to avoid so many L1 cache flueshes on
VMENTER and instead only does if the reason for entering the hypervisor
is based on the type of code that is executed. The idea is based on Intel's
patches, but we treat all vmexits as safe unless they execute
specific code that is considered unsafe. There is no hardcoded list of
"safe" exit reasons; but vmexits are considered safe unless:
 - They trigger the emulator, which could be a good target for
   other speculative execution-based threats,
 - or the MMU, which can bring host page tables in the L1 cache.
 - In addition, executing userspace or another process will trigger a flush.

The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
Coffee Lake (for the latter, this is independent of whether microcode
or the generic flush code are used).

The mitigation does not in any way try to do anything about hyperthreading;
it is possible for a sibling thread to read data from the cache during a
vmexit, before the host completes the flush, or to read data from the cache
while a sibling runs.  The suggestion there is to disable hyperthreading
unless you've configured your system to dedicate each core to a specific
guest.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v2: Add checks for X86_BUG_L1TF
   - Rework the commit description.
v3: change module parameter to S_IRUGO
    move the kvm_l1d_flush closer to VMENTER
---
 Documentation/admin-guide/kernel-parameters.txt | 11 +++++
 arch/x86/include/asm/kvm_host.h                 |  8 ++++
 arch/x86/kvm/mmu.c                              |  1 +
 arch/x86/kvm/svm.c                              |  1 +
 arch/x86/kvm/vmx.c                              | 51 +++++++++++++++++++++-
 arch/x86/kvm/x86.c                              | 58 ++++++++++++++++++++++++-
 6 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4e56e6d0f124..dd1db796a463 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1923,6 +1923,17 @@
 			SMT (aka Hyper-Threading) enabled then don't load KVM module.
 			Default is 0 (allow module to be loaded).
 
+	kvm.vmentry_l1d_flush=[KVM] Mitigation for L1 Terminal Fault CVE.
+			Valid arguments: 0, 1, 2
+
+			2 does an L1 cache flush on every VMENTER.
+			1 tries to avoid so many L1 cache flush on VMENTERs and instead
+			do it only if the kind of code that is executed would lead to
+			leaking host memory.
+			0 disables the mitigation
+
+			Default is 1 (do L1 cache flush in specific instances)
+
 	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
 			KVM MMU at runtime.
 			Default is 0 (off)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..78748925a370 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -713,6 +713,12 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
+
+	/* must flush the L1 Data cache */
+	bool flush_cache_req;
 };
 
 struct kvm_lpage_info {
@@ -881,6 +887,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -1449,6 +1456,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..4d4e3dc2494e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..fffc447f5410 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5437,6 +5437,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.flush_cache_req = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 559a12b6184d..d8ef111b8f5f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmentry_l1d_flush = 1;
+module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, S_IRUGO);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -2618,6 +2621,45 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+	vmx_save_host_state(vcpu);
+
+	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+	    !static_cpu_has(X86_BUG_L1TF)) {
+		vcpu->arch.flush_cache_req = false;
+		return;
+	}
+
+	switch (vmentry_l1d_flush) {
+	case 0:
+		vcpu->arch.flush_cache_req = false;
+		break;
+	case 1:
+		/*
+		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
+		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
+		 * following cases:
+		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
+		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
+		 *   on every nested entry.  Nested hypervisors do not bother clearing
+		 *   the cache.
+		 * - anything that runs the emulator (the slow paths for EPT misconfig
+		 *   or I/O instruction)
+		 * - anything that can cause get_user_pages (EPT violation, and again
+		 *   the slow paths for EPT misconfig or I/O instruction)
+		 * - anything that can run code outside KVM (external interrupt,
+		 *   which can run interrupt handlers or irqs; or the sched_in
+		 *   preempt notifier)
+		 */
+		break;
+	case 2:
+	default:
+		vcpu->arch.flush_cache_req = true;
+		break;
+	}
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9751,6 +9793,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		vcpu->arch.vcpu_unconfined = true;
 	}
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -10008,6 +10051,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
 		(unsigned long)&current_evmcs->host_rsp : 0;
 
+	if (vcpu->arch.flush_cache_req)
+		kvm_l1d_flush();
+
 	asm(
 		/* Store host registers */
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -11811,6 +11857,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return ret;
 	}
 
+	/* Hide L1D cache contents from the nested guest.  */
+	vmx->vcpu.arch.vcpu_unconfined = true;
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12928,7 +12977,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1065d4e7c5fd..b3365883ccaf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6537,10 +6540,49 @@ static struct notifier_block pvclock_gtod_notifier = {
 };
 #endif
 
+
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
+	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+	ASSERT(boot_cpu_has(X86_BUG_L1TF));
+
+	asm volatile(
+		/* First ensure the pages are in the TLB */
+		"xorl %%eax, %%eax\n\t"
+		"11: \n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $4096, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		/* Now fill the cache */
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $64, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		: : "r" (empty_zero_pages), "r" (size)
+		: "eax", "ebx", "ecx", "edx");
+}
+EXPORT_SYMBOL_GPL(kvm_l1d_flush);
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6569,10 +6611,15 @@ int kvm_arch_init(void *opaque)
 				"being able to snoop the host memory!");
 	}
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6603,6 +6650,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6627,6 +6676,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7405,7 +7455,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
+	vcpu->arch.flush_cache_req = vcpu->arch.vcpu_unconfined;
 	kvm_x86_ops->prepare_guest_switch(vcpu);
+	vcpu->arch.vcpu_unconfined = false;
+	if (vcpu->arch.flush_cache_req)
+		vcpu->stat.l1d_flush++;
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7592,6 +7646,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8711,6 +8766,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
2.13.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-21  3:17   ` Konrad Rzeszutek Wilk
@ 2018-06-21 14:04     ` Konrad Rzeszutek Wilk
  2018-06-21 17:29       ` Paolo Bonzini
  0 siblings, 1 reply; 5+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-21 14:04 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 11:17:59PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Wed, Jun 20, 2018 at 04:57:02PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> > On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > > kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> > > 
> > > Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> > > If it is active, the displacement flush is unnecessary.  Tested on
> > > a Coffee Lake machine.
> > 
> > ooops, that nice commit description you wrote Paolo got messed up. Sorry!
> 
> Ended up fixing this up to be:
> 
> >From a2029479d4c83c8e3476d8b37d288fb441576b43 Mon Sep 17 00:00:00 2001
> From: Paolo Bonzini <pbonzini@redhat.com>
> Date: Wed, 20 Jun 2018 15:49:44 -0400
> Subject: [PATCH v2.2 05/12] kvm: x86: mitigation for L1 cache terminal fault
>  vulnerabilities
> 
> We add two mitigation modes for CVE-2018-3620, aka L1 terminal
> fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".
> 
> "vmentry_l1d_flush=2" is simply doing an L1 cache flush on every VMENTER.
> "vmentry_l1d_flush=1" is trying to avoid so many L1 cache flueshes on
> VMENTER and instead only does if the reason for entering the hypervisor
> is based on the type of code that is executed. The idea is based on Intel's
> patches, but we treat all vmexits as safe unless they execute
> specific code that is considered unsafe. There is no hardcoded list of
> "safe" exit reasons; but vmexits are considered safe unless:
>  - They trigger the emulator, which could be a good target for
>    other speculative execution-based threats,
>  - or the MMU, which can bring host page tables in the L1 cache.
>  - In addition, executing userspace or another process will trigger a flush.

What about softirqs? Won't that happen on every VMEXIT? And how about
interrupts? They can happen too, any way to figure out whether an interrupt
happend between VMEXIT -> VMENTER? 

Also would it make sense to flip this around - that is everything is considered
a blacklist (aka always flush), except those routines which we believe
are OK and hence can be whitelisted?

And then what about the situation where say in a month somebody fixes a bug
or adds a new feature that brings in the host page table (alloc_page?)
and now we have to move it from whitelist to blacklist - there is not much
of an automatic way to detect this I think?

Or maybe there is? What are some of the calls that KVM can make that would
bring in the host page tables? I am assuming any call that hits 'page_alloc'
or 'kzalloc', 'vzalloc' or such? But others?

Is there a good way to figure this out except running trace on an VMEXIT->VMENTER
using the different kvm-unit tests and seeing this?


> 
> The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
> is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
> Coffee Lake (for the latter, this is independent of whether microcode
> or the generic flush code are used).
> 
> The mitigation does not in any way try to do anything about hyperthreading;
> it is possible for a sibling thread to read data from the cache during a
> vmexit, before the host completes the flush, or to read data from the cache
> while a sibling runs.  The suggestion there is to disable hyperthreading
> unless you've configured your system to dedicate each core to a specific
> guest.

I vaguely recall folks saying that they tried the "trigger IPI on VMEXIT" - was
there any performance metrix compared to say having SMT disabled/enabled?
And are there any patches for this floating around that have been ported against
upstream kernel?

(Asking as I am thinking to throw to the performance team an obfuscated kernel
with various knobs and see which sucks less).

> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
> v2: Add checks for X86_BUG_L1TF
>    - Rework the commit description.
> v3: change module parameter to S_IRUGO
>     move the kvm_l1d_flush closer to VMENTER
> ---
>  Documentation/admin-guide/kernel-parameters.txt | 11 +++++
>  arch/x86/include/asm/kvm_host.h                 |  8 ++++
>  arch/x86/kvm/mmu.c                              |  1 +
>  arch/x86/kvm/svm.c                              |  1 +
>  arch/x86/kvm/vmx.c                              | 51 +++++++++++++++++++++-
>  arch/x86/kvm/x86.c                              | 58 ++++++++++++++++++++++++-
>  6 files changed, 128 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 4e56e6d0f124..dd1db796a463 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1923,6 +1923,17 @@
>  			SMT (aka Hyper-Threading) enabled then don't load KVM module.
>  			Default is 0 (allow module to be loaded).
>  
> +	kvm.vmentry_l1d_flush=[KVM] Mitigation for L1 Terminal Fault CVE.
> +			Valid arguments: 0, 1, 2
> +
> +			2 does an L1 cache flush on every VMENTER.
> +			1 tries to avoid so many L1 cache flush on VMENTERs and instead
> +			do it only if the kind of code that is executed would lead to
> +			leaking host memory.
> +			0 disables the mitigation
> +
> +			Default is 1 (do L1 cache flush in specific instances)
> +
>  	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
>  			KVM MMU at runtime.
>  			Default is 0 (off)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c13cd28d9d1b..78748925a370 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,12 @@ struct kvm_vcpu_arch {
>  
>  	/* be preempted when it's in kernel-mode(cpl=0) */
>  	bool preempted_in_kernel;
> +
> +	/* for L1 terminal fault vulnerability */
> +	bool vcpu_unconfined;
> +
> +	/* must flush the L1 Data cache */
> +	bool flush_cache_req;
>  };
>  
>  struct kvm_lpage_info {
> @@ -881,6 +887,7 @@ struct kvm_vcpu_stat {
>  	u64 signal_exits;
>  	u64 irq_window_exits;
>  	u64 nmi_window_exits;
> +	u64 l1d_flush;
>  	u64 halt_exits;
>  	u64 halt_successful_poll;
>  	u64 halt_attempted_poll;
> @@ -1449,6 +1456,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  
>  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>  		     struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>  
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d594690d8b95..4d4e3dc2494e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
>  {
>  	int r = 1;
>  
> +	vcpu->arch.vcpu_unconfined = true;
>  	switch (vcpu->arch.apf.host_apf_reason) {
>  	default:
>  		trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f059a73f0fd0..fffc447f5410 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5437,6 +5437,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
>  
>  static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
>  {
> +	vcpu->arch.flush_cache_req = false;
>  }
>  
>  static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..d8ef111b8f5f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
>  };
>  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>  
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, S_IRUGO);
> +
>  static bool __read_mostly enable_vpid = 1;
>  module_param_named(vpid, enable_vpid, bool, 0444);
>  
> @@ -2618,6 +2621,45 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  				   vmx->guest_msrs[i].mask);
>  }
>  
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +{
> +	vmx_save_host_state(vcpu);
> +
> +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
> +	    !static_cpu_has(X86_BUG_L1TF)) {
> +		vcpu->arch.flush_cache_req = false;
> +		return;
> +	}
> +
> +	switch (vmentry_l1d_flush) {
> +	case 0:
> +		vcpu->arch.flush_cache_req = false;
> +		break;
> +	case 1:
> +		/*
> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> +		 * following cases:
> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> +		 *   the cache.
> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> +		 *   or I/O instruction)
> +		 * - anything that can cause get_user_pages (EPT violation, and again
> +		 *   the slow paths for EPT misconfig or I/O instruction)
> +		 * - anything that can run code outside KVM (external interrupt,
> +		 *   which can run interrupt handlers or irqs; or the sched_in
> +		 *   preempt notifier)
> +		 */
> +		break;
> +	case 2:
> +	default:
> +		vcpu->arch.flush_cache_req = true;
> +		break;
> +	}
> +}
> +
>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
>  {
>  	if (!vmx->host_state.loaded)
> @@ -9751,6 +9793,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  			[ss]"i"(__KERNEL_DS),
>  			[cs]"i"(__KERNEL_CS)
>  			);
> +		vcpu->arch.vcpu_unconfined = true;
>  	}
>  }
>  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -10008,6 +10051,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
>  		(unsigned long)&current_evmcs->host_rsp : 0;
>  
> +	if (vcpu->arch.flush_cache_req)
> +		kvm_l1d_flush();
> +
>  	asm(
>  		/* Store host registers */
>  		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
> @@ -11811,6 +11857,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  		return ret;
>  	}
>  
> +	/* Hide L1D cache contents from the nested guest.  */
> +	vmx->vcpu.arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>  	 * by event injection, halt vcpu.
> @@ -12928,7 +12977,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.vcpu_free = vmx_free_vcpu,
>  	.vcpu_reset = vmx_vcpu_reset,
>  
> -	.prepare_guest_switch = vmx_save_host_state,
> +	.prepare_guest_switch = vmx_prepare_guest_switch,
>  	.vcpu_load = vmx_vcpu_load,
>  	.vcpu_put = vmx_vcpu_put,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1065d4e7c5fd..b3365883ccaf 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  	{ "irq_injections", VCPU_STAT(irq_injections) },
>  	{ "nmi_injections", VCPU_STAT(nmi_injections) },
>  	{ "req_event", VCPU_STAT(req_event) },
> +	{ "l1d_flush", VCPU_STAT(l1d_flush) },
>  	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
>  	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
>  	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
>  	bool writeback = true;
>  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>  
> +	vcpu->arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * Clear write_fault_to_shadow_pgtable here to ensure it is
>  	 * never reused.
> @@ -6537,10 +6540,49 @@ static struct notifier_block pvclock_gtod_notifier = {
>  };
>  #endif
>  
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> +	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
> +	int size = PAGE_SIZE << L1D_CACHE_ORDER;
> +
> +	ASSERT(boot_cpu_has(X86_BUG_L1TF));
> +
> +	asm volatile(
> +		/* First ensure the pages are in the TLB */
> +		"xorl %%eax, %%eax\n\t"
> +		"11: \n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $4096, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"
> +		/* Now fill the cache */
> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		: : "r" (empty_zero_pages), "r" (size)
> +		: "eax", "ebx", "ecx", "edx");
> +}
> +EXPORT_SYMBOL_GPL(kvm_l1d_flush);
> +
>  int kvm_arch_init(void *opaque)
>  {
>  	int r;
>  	struct kvm_x86_ops *ops = opaque;
> +	struct page *page;
>  
>  	if (kvm_x86_ops) {
>  		printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6569,10 +6611,15 @@ int kvm_arch_init(void *opaque)
>  				"being able to snoop the host memory!");
>  	}
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6603,6 +6650,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);
>  out:
> @@ -6627,6 +6676,7 @@ void kvm_arch_exit(void)
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  	free_percpu(shared_msrs);
>  }
>  
> @@ -7405,7 +7455,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> +	vcpu->arch.flush_cache_req = vcpu->arch.vcpu_unconfined;
>  	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	vcpu->arch.vcpu_unconfined = false;
> +	if (vcpu->arch.flush_cache_req)
> +		vcpu->stat.l1d_flush++;
>  
>  	/*
>  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
> @@ -7592,6 +7646,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>  	struct kvm *kvm = vcpu->kvm;
>  
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> +	vcpu->arch.vcpu_unconfined = true;
>  
>  	for (;;) {
>  		if (kvm_vcpu_running(vcpu)) {
> @@ -8711,6 +8766,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  
>  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  {
> +	vcpu->arch.vcpu_unconfined = true;
>  	kvm_x86_ops->sched_in(vcpu, cpu);
>  }
>  
> -- 
> 2.13.4
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-21 14:04     ` Konrad Rzeszutek Wilk
@ 2018-06-21 17:29       ` Paolo Bonzini
  0 siblings, 0 replies; 5+ messages in thread
From: Paolo Bonzini @ 2018-06-21 17:29 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 2651 bytes --]

On 21/06/2018 16:04, speck for Konrad Rzeszutek Wilk wrote:
>>  - They trigger the emulator, which could be a good target for
>>    other speculative execution-based threats,
>>  - or the MMU, which can bring host page tables in the L1 cache.
>>  - In addition, executing userspace or another process will trigger a flush.
> What about softirqs? Won't that happen on every VMEXIT?

Not on every, but this is covered.  The code has a more verbose comment

	* - anything that can run code outside KVM (external interrupt,
	*   which can run interrupt handlers or irqs; or the sched_in
	*   preempt notifier)

> And how about
> interrupts? They can happen too, any way to figure out whether an interrupt
> happend between VMEXIT -> VMENTER? 

Nope.  A kernel-only vmexit (of the kind that isn't considered
"unconfined" anyway) is less than a microsecond so it's not that likely,
but it is going to happen if the guest does a busy cpuid loop and uses
rdtsc to detect outliers.

We could add a generation count for softirqs, and possibly blacklist HLT
vmexits in case it does kvm->idle->kvm with no actual context switch.

> Also would it make sense to flip this around - that is everything is considered
> a blacklist (aka always flush), except those routines which we believe
> are OK and hence can be whitelisted?

I did this in v1, but in the end when reviewing the list I ended up
putting everything on the whitelist.

vmexit is a very fast path.  This means that 1) it's unlikely to do
expensive things 2) you don't really want to add overhead there.  I'm
okay with blacklisting more nested virt operations (vmclear, vmptrld);
that would hit nested VMware Workstation and possibly VirtualBox pretty
badly, but honestly who cares.

> And then what about the situation where say in a month somebody fixes a bug
> or adds a new feature that brings in the host page table (alloc_page?)
> and now we have to move it from whitelist to blacklist - there is not much
> of an automatic way to detect this I think?
> 
> Or maybe there is? What are some of the calls that KVM can make that would
> bring in the host page tables? I am assuming any call that hits 'page_alloc'
> or 'kzalloc', 'vzalloc' or such? But others?

That's what I did in this patch.

> I vaguely recall folks saying that they tried the "trigger IPI on VMEXIT" - was
> there any performance metrix compared to say having SMT disabled/enabled?
> And are there any patches for this floating around that have been ported against
> upstream kernel?

No, they were just too ugly, and the benefit of hyperthreading is debatable.

Paolo

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-06-21 17:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-06-20 20:43 [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5 konrad.wilk
2018-06-20 20:57 ` [MODERATED] " Konrad Rzeszutek Wilk
2018-06-21  3:17   ` Konrad Rzeszutek Wilk
2018-06-21 14:04     ` Konrad Rzeszutek Wilk
2018-06-21 17:29       ` Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.