From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail.linutronix.de (146.0.238.70:993) by crypto-ml.lab.linutronix.de with IMAP4-SSL for ; 20 Jun 2018 20:57:13 -0000 Received: from aserp2130.oracle.com ([141.146.126.79]) by Galois.linutronix.de with esmtps (TLS1.2:RSA_AES_256_CBC_SHA256:256) (Exim 4.80) (envelope-from ) id 1fVkAB-00059j-4S for speck@linutronix.de; Wed, 20 Jun 2018 22:57:11 +0200 Received: from pps.filterd (aserp2130.oracle.com [127.0.0.1]) by aserp2130.oracle.com (8.16.0.22/8.16.0.22) with SMTP id w5KKsBHA115729 for ; Wed, 20 Jun 2018 20:57:04 GMT Received: from userv0022.oracle.com (userv0022.oracle.com [156.151.31.74]) by aserp2130.oracle.com with ESMTP id 2jmr2mpcc0-1 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK) for ; Wed, 20 Jun 2018 20:57:04 +0000 Received: from aserv0122.oracle.com (aserv0122.oracle.com [141.146.126.236]) by userv0022.oracle.com (8.14.4/8.14.4) with ESMTP id w5KKv3Nh001541 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK) for ; Wed, 20 Jun 2018 20:57:03 GMT Received: from abhmp0012.oracle.com (abhmp0012.oracle.com [141.146.116.18]) by aserv0122.oracle.com (8.14.4/8.14.4) with ESMTP id w5KKv3FS002454 for ; Wed, 20 Jun 2018 20:57:03 GMT Date: Wed, 20 Jun 2018 16:57:02 -0400 From: Konrad Rzeszutek Wilk Subject: [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5 Message-ID: <20180620205701.GD28309@char.us.oracle.com> References: <20180620204352.066357420@localhost.localdomain> MIME-Version: 1.0 In-Reply-To: <20180620204352.066357420@localhost.localdomain> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit To: speck@linutronix.de List-ID: On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote: > kvm: x86: mitigation for L1 cache terminal fault vulnerabilities > > Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28]. > If it is active, the displacement flush is unnecessary. Tested on > a Coffee Lake machine. ooops, that nice commit description you wrote Paolo got messed up. Sorry! > > Signed-off-by: Paolo Bonzini > Signed-off-by: Konrad Rzeszutek Wilk > --- > arch/x86/include/asm/kvm_host.h | 7 ++++- > arch/x86/kvm/mmu.c | 1 + > arch/x86/kvm/svm.c | 3 ++- > arch/x86/kvm/vmx.c | 46 +++++++++++++++++++++++++++++++- > arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++++++++++-- > 5 files changed, 111 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index c13cd28d9d1b..7d7626cffc21 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -713,6 +713,9 @@ struct kvm_vcpu_arch { > > /* be preempted when it's in kernel-mode(cpl=0) */ > bool preempted_in_kernel; > + There is a giant space right above here.. And I believe Paolo's original patch had it too. > + /* for L1 terminal fault vulnerability */ > + bool vcpu_unconfined; > }; > > struct kvm_lpage_info { > @@ -881,6 +884,7 @@ struct kvm_vcpu_stat { > u64 signal_exits; > u64 irq_window_exits; > u64 nmi_window_exits; > + u64 l1d_flush; > u64 halt_exits; > u64 halt_successful_poll; > u64 halt_attempted_poll; > @@ -939,7 +943,7 @@ struct kvm_x86_ops { > void (*vcpu_free)(struct kvm_vcpu *vcpu); > void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); > > - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); > + void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush); > void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); > void (*vcpu_put)(struct kvm_vcpu *vcpu); > > @@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, > > void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, > struct kvm_lapic_irq *irq); > +void kvm_l1d_flush(void); > > static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) > { > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index d594690d8b95..4d4e3dc2494e 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, > { > int r = 1; > > + vcpu->arch.vcpu_unconfined = true; > switch (vcpu->arch.apf.host_apf_reason) { > default: > trace_kvm_page_fault(fault_address, error_code); > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index f059a73f0fd0..302afb2643fa 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) > svm->asid_generation--; > } > > -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) > +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush) > { > + *need_l1d_flush = false; > } > > static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 559a12b6184d..ba83d0cb9b03 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = { > }; > MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); > > +static int __read_mostly vmentry_l1d_flush = 1; > +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644); > + > static bool __read_mostly enable_vpid = 1; > module_param_named(vpid, enable_vpid, bool, 0444); > > @@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) > vmx->guest_msrs[i].mask); > } > > +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush) > +{ > + vmx_save_host_state(vcpu); > + if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) { and || !boot_cpu_has(X86_BUG_L1TF) > + *need_l1d_flush = false; > + return; > + } > + > + switch (vmentry_l1d_flush) { > + case 0: > + *need_l1d_flush = false; > + break; > + case 1: > + /* > + * If vmentry_l1d_flush is 1, each vmexit handler is responsible for > + * setting vcpu->arch.vcpu_unconfined. Currently this happens in the > + * following cases: > + * - vmlaunch/vmresume: we do not want the cache to be cleared by a > + * nested hypervisor *and* by KVM on bare metal, so we just do it > + * on every nested entry. Nested hypervisors do not bother clearing > + * the cache. > + * - anything that runs the emulator (the slow paths for EPT misconfig > + * or I/O instruction) > + * - anything that can cause get_user_pages (EPT violation, and again > + * the slow paths for EPT misconfig or I/O instruction) > + * - anything that can run code outside KVM (external interrupt, > + * which can run interrupt handlers or irqs; or the sched_in > + * preempt notifier) > + */ > + break; > + case 2: > + default: > + *need_l1d_flush = true; > + break; > + } > +} > + > static void __vmx_load_host_state(struct vcpu_vmx *vmx) > { > if (!vmx->host_state.loaded) > @@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) > [ss]"i"(__KERNEL_DS), > [cs]"i"(__KERNEL_CS) > ); > + vcpu->arch.vcpu_unconfined = true; > } > } > STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); > @@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > return ret; > } > > + /* Hide L1D cache contents from the nested guest. */ > + vmx->vcpu.arch.vcpu_unconfined = true; > + > /* > * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken > * by event injection, halt vcpu. > @@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { > .vcpu_free = vmx_free_vcpu, > .vcpu_reset = vmx_vcpu_reset, > > - .prepare_guest_switch = vmx_save_host_state, > + .prepare_guest_switch = vmx_prepare_guest_switch, > .vcpu_load = vmx_vcpu_load, > .vcpu_put = vmx_vcpu_put, > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 1065d4e7c5fd..c0079aae38ba 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { > { "irq_injections", VCPU_STAT(irq_injections) }, > { "nmi_injections", VCPU_STAT(nmi_injections) }, > { "req_event", VCPU_STAT(req_event) }, > + { "l1d_flush", VCPU_STAT(l1d_flush) }, > { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, > { "mmu_pte_write", VM_STAT(mmu_pte_write) }, > { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, > @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, > bool writeback = true; > bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; > > + vcpu->arch.vcpu_unconfined = true; > + > /* > * Clear write_fault_to_shadow_pgtable here to ensure it is > * never reused. > @@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = { > }; > #endif > > + > +/* > + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in > + * 64 KiB because the replacement algorithm is not exactly LRU. > + */ > +#define L1D_CACHE_ORDER 4 > +static void *__read_mostly empty_zero_pages; > + > +void kvm_l1d_flush(void) > +{ > + /* FIXME: could this be boot_cpu_data.x86_cache_size * 2? */ > + int size = PAGE_SIZE << L1D_CACHE_ORDER; I am thinking we should have: ASSERT(boot_cpu_has(X86_BUG_L1TF)); or such? > + asm volatile( > + /* First ensure the pages are in the TLB */ > + "xorl %%eax, %%eax\n\t" > + "11: \n\t" > + "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t" > + "addl $4096, %%eax\n\t" > + "cmpl %%eax, %1\n\t" > + "jne 11b\n\t" > + "xorl %%eax, %%eax\n\t" > + "cpuid\n\t" > + /* Now fill the cache */ > + "xorl %%eax, %%eax\n\t" > + "12:\n\t" > + "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t" > + "addl $64, %%eax\n\t" > + "cmpl %%eax, %1\n\t" > + "jne 12b\n\t" > + "lfence\n\t" > + : : "r" (empty_zero_pages), "r" (size) > + : "eax", "ebx", "ecx", "edx"); > +} > + > int kvm_arch_init(void *opaque) > { > int r; > struct kvm_x86_ops *ops = opaque; > + struct page *page; > > if (kvm_x86_ops) { > printk(KERN_ERR "kvm: already loaded the other module\n"); > @@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque) > "being able to snoop the host memory!"); > } > r = -ENOMEM; > + page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER); > + if (!page) > + goto out; > + empty_zero_pages = page_address(page); > + > shared_msrs = alloc_percpu(struct kvm_shared_msrs); > if (!shared_msrs) { > printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); > - goto out; > + goto out_free_zero_pages; > } > > r = kvm_mmu_module_init(); > @@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque) > > return 0; > > +out_free_zero_pages: > + free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER); > out_free_percpu: > free_percpu(shared_msrs); > out: > @@ -6627,6 +6672,7 @@ void kvm_arch_exit(void) > #endif > kvm_x86_ops = NULL; > kvm_mmu_module_exit(); > + free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER); > free_percpu(shared_msrs); > } > > @@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > kvm_cpu_accept_dm_intr(vcpu); > > bool req_immediate_exit = false; > + bool need_l1d_flush; > > if (kvm_request_pending(vcpu)) { > if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) > @@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > > preempt_disable(); > > - kvm_x86_ops->prepare_guest_switch(vcpu); > + need_l1d_flush = vcpu->arch.vcpu_unconfined; > + vcpu->arch.vcpu_unconfined = false; > + kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush); > + if (need_l1d_flush) { > + vcpu->stat.l1d_flush++; > + kvm_l1d_flush(); > + } I was thinking that this kvm_l1d_flush should really happen right before you an VMENTER. I was thinking to change your patch for this exact reason but I am curious why you put it here? > > /* > * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt > @@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) > struct kvm *kvm = vcpu->kvm; > > vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); > + vcpu->arch.vcpu_unconfined = true; > > for (;;) { > if (kvm_vcpu_running(vcpu)) { > @@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) > > void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) > { > + vcpu->arch.vcpu_unconfined = true; > kvm_x86_ops->sched_in(vcpu, cpu); > } > > -- > 2.14.3 > >