From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <konrad.wilk@oracle.com>
Received: from mail.linutronix.de (146.0.238.70:993) by
  crypto-ml.lab.linutronix.de with IMAP4-SSL for <speck@linutronix.de>; 20 Jun
  2018 20:57:13 -0000
Received: from aserp2130.oracle.com ([141.146.126.79])
	by Galois.linutronix.de with esmtps (TLS1.2:RSA_AES_256_CBC_SHA256:256)
	(Exim 4.80)
	(envelope-from <konrad.wilk@oracle.com>)
	id 1fVkAB-00059j-4S
	for speck@linutronix.de; Wed, 20 Jun 2018 22:57:11 +0200
Received: from pps.filterd (aserp2130.oracle.com [127.0.0.1])
	by aserp2130.oracle.com (8.16.0.22/8.16.0.22) with SMTP id w5KKsBHA115729
	for <speck@linutronix.de>; Wed, 20 Jun 2018 20:57:04 GMT
Received: from userv0022.oracle.com (userv0022.oracle.com [156.151.31.74])
	by aserp2130.oracle.com with ESMTP id 2jmr2mpcc0-1
	(version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK)
	for <speck@linutronix.de>; Wed, 20 Jun 2018 20:57:04 +0000
Received: from aserv0122.oracle.com (aserv0122.oracle.com [141.146.126.236])
	by userv0022.oracle.com (8.14.4/8.14.4) with ESMTP id w5KKv3Nh001541
	(version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK)
	for <speck@linutronix.de>; Wed, 20 Jun 2018 20:57:03 GMT
Received: from abhmp0012.oracle.com (abhmp0012.oracle.com [141.146.116.18])
	by aserv0122.oracle.com (8.14.4/8.14.4) with ESMTP id w5KKv3FS002454
	for <speck@linutronix.de>; Wed, 20 Jun 2018 20:57:03 GMT
Date: Wed, 20 Jun 2018 16:57:02 -0400
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Subject: [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
Message-ID: <20180620205701.GD28309@char.us.oracle.com>
References: <20180620204352.066357420@localhost.localdomain>
MIME-Version: 1.0
In-Reply-To: <20180620204352.066357420@localhost.localdomain>
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit
To: speck@linutronix.de
List-ID: <speck.linutronix.de>

On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> 
> Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> If it is active, the displacement flush is unnecessary.  Tested on
> a Coffee Lake machine.

ooops, that nice commit description you wrote Paolo got messed up. Sorry!

> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 ++++-
>  arch/x86/kvm/mmu.c              |  1 +
>  arch/x86/kvm/svm.c              |  3 ++-
>  arch/x86/kvm/vmx.c              | 46 +++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              | 59 +++++++++++++++++++++++++++++++++++++++--
>  5 files changed, 111 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c13cd28d9d1b..7d7626cffc21 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
>  
>  	/* be preempted when it's in kernel-mode(cpl=0) */
>  	bool preempted_in_kernel;
> +	

There is a giant space right above here.. And I believe Paolo's original patch had it too.

> +	/* for L1 terminal fault vulnerability */
> +	bool vcpu_unconfined;
>  };
>  
>  struct kvm_lpage_info {
> @@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
>  	u64 signal_exits;
>  	u64 irq_window_exits;
>  	u64 nmi_window_exits;
> +	u64 l1d_flush;
>  	u64 halt_exits;
>  	u64 halt_successful_poll;
>  	u64 halt_attempted_poll;
> @@ -939,7 +943,7 @@ struct kvm_x86_ops {
>  	void (*vcpu_free)(struct kvm_vcpu *vcpu);
>  	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
>  
> -	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
> +	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
>  	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
>  	void (*vcpu_put)(struct kvm_vcpu *vcpu);
>  
> @@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  
>  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>  		     struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>  
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d594690d8b95..4d4e3dc2494e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
>  {
>  	int r = 1;
>  
> +	vcpu->arch.vcpu_unconfined = true;
>  	switch (vcpu->arch.apf.host_apf_reason) {
>  	default:
>  		trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f059a73f0fd0..302afb2643fa 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
>  		svm->asid_generation--;
>  }
>  
> -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
>  {
> +	*need_l1d_flush = false;
>  }
>  
>  static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..ba83d0cb9b03 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
>  };
>  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>  
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
> +
>  static bool __read_mostly enable_vpid = 1;
>  module_param_named(vpid, enable_vpid, bool, 0444);
>  
> @@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  				   vmx->guest_msrs[i].mask);
>  }
>  
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> +{
> +	vmx_save_host_state(vcpu);
> +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {

and 
	|| !boot_cpu_has(X86_BUG_L1TF)

> +		*need_l1d_flush = false;
> +		return;
> +	}
> +

> +	switch (vmentry_l1d_flush) {
> +	case 0:
> +		*need_l1d_flush = false;
> +		break;
> +	case 1:
> +		/*
> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> +		 * following cases:
> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> +		 *   the cache.
> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> +		 *   or I/O instruction)
> +		 * - anything that can cause get_user_pages (EPT violation, and again
> +		 *   the slow paths for EPT misconfig or I/O instruction)
> +		 * - anything that can run code outside KVM (external interrupt,
> +		 *   which can run interrupt handlers or irqs; or the sched_in
> +		 *   preempt notifier)
> +		 */
> +		break;
> +	case 2:
> +	default:
> +		*need_l1d_flush = true;
> +		break;
> +	}
> +}
> +
>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
>  {
>  	if (!vmx->host_state.loaded)
> @@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  			[ss]"i"(__KERNEL_DS),
>  			[cs]"i"(__KERNEL_CS)
>  			);
> +		vcpu->arch.vcpu_unconfined = true;
>  	}
>  }
>  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  		return ret;
>  	}
>  
> +	/* Hide L1D cache contents from the nested guest.  */
> +	vmx->vcpu.arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>  	 * by event injection, halt vcpu.
> @@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.vcpu_free = vmx_free_vcpu,
>  	.vcpu_reset = vmx_vcpu_reset,
>  
> -	.prepare_guest_switch = vmx_save_host_state,
> +	.prepare_guest_switch = vmx_prepare_guest_switch,
>  	.vcpu_load = vmx_vcpu_load,
>  	.vcpu_put = vmx_vcpu_put,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1065d4e7c5fd..c0079aae38ba 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  	{ "irq_injections", VCPU_STAT(irq_injections) },
>  	{ "nmi_injections", VCPU_STAT(nmi_injections) },
>  	{ "req_event", VCPU_STAT(req_event) },
> +	{ "l1d_flush", VCPU_STAT(l1d_flush) },
>  	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
>  	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
>  	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
>  	bool writeback = true;
>  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>  
> +	vcpu->arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * Clear write_fault_to_shadow_pgtable here to ensure it is
>  	 * never reused.
> @@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = {
>  };
>  #endif
>  
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> +	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
> +	int size = PAGE_SIZE << L1D_CACHE_ORDER;

I am thinking we should have:

	ASSERT(boot_cpu_has(X86_BUG_L1TF));

or such?

> +	asm volatile(
> +		/* First ensure the pages are in the TLB */
> +		"xorl %%eax, %%eax\n\t"
> +		"11: \n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $4096, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"
> +		/* Now fill the cache */
> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		: : "r" (empty_zero_pages), "r" (size)
> +		: "eax", "ebx", "ecx", "edx");
> +}
> +
>  int kvm_arch_init(void *opaque)
>  {
>  	int r;
>  	struct kvm_x86_ops *ops = opaque;
> +	struct page *page;
>  
>  	if (kvm_x86_ops) {
>  		printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque)
>  				"being able to snoop the host memory!");
>  	}
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);
>  out:
> @@ -6627,6 +6672,7 @@ void kvm_arch_exit(void)
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  	free_percpu(shared_msrs);
>  }
>  
> @@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  		kvm_cpu_accept_dm_intr(vcpu);
>  
>  	bool req_immediate_exit = false;
> +	bool need_l1d_flush;
>  
>  	if (kvm_request_pending(vcpu)) {
>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> @@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> -	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	need_l1d_flush = vcpu->arch.vcpu_unconfined;
> +	vcpu->arch.vcpu_unconfined = false;
> +	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
> +	if (need_l1d_flush) {
> +		vcpu->stat.l1d_flush++;
> +		kvm_l1d_flush();
> +	}

I was thinking that this kvm_l1d_flush should really happen right before 
you an VMENTER. I was thinking to change your patch for this exact reason
but I am curious why you put it here?

>  
>  	/*
>  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
> @@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>  	struct kvm *kvm = vcpu->kvm;
>  
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> +	vcpu->arch.vcpu_unconfined = true;
>  
>  	for (;;) {
>  		if (kvm_vcpu_running(vcpu)) {
> @@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  
>  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  {
> +	vcpu->arch.vcpu_unconfined = true;
>  	kvm_x86_ops->sched_in(vcpu, cpu);
>  }
>  
> -- 
> 2.14.3
> 
>