[PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
@ 2010-06-30  4:25 Sheng Yang
  2010-06-30  4:25 ` [PATCH v6 2/2] KVM: Using workqueue for WBINVD Sheng Yang
  2010-06-30 16:34 ` [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Marcelo Tosatti
  0 siblings, 2 replies; 4+ messages in thread
From: Sheng Yang @ 2010-06-30  4:25 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Jan Kiszka, Joerg Roedel, kvm, Sheng Yang, Yaozu (Eddie) Dong

Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
CLFLUSH, we need to maintain data consistency either by:
1: flushing cache (wbinvd) when the guest is scheduled out if there is no
wbinvd exit, or
2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.

Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |    6 +++++
 arch/x86/kvm/emulate.c          |    5 +++-
 arch/x86/kvm/svm.c              |    7 ++++++
 arch/x86/kvm/vmx.c              |   10 ++++++++-
 arch/x86/kvm/x86.c              |   41 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a57cdea..2bda624 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
+#include <linux/cpumask.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -358,6 +359,8 @@ struct kvm_vcpu_arch {
 
 	/* fields used by HYPER-V emulation */
 	u64 hv_vapic;
+
+	cpumask_var_t wbinvd_dirty_mask;
 };
 
 struct kvm_arch {
@@ -514,6 +517,8 @@ struct kvm_x86_ops {
 
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
+	bool (*has_wbinvd_exit)(void);
+
 	const struct trace_print_flags *exit_reasons_str;
 };
 
@@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_clts(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abb8cec..e8bdddc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3138,8 +3138,11 @@ twobyte_insn:
 		emulate_clts(ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		break;
-	case 0x08:		/* invd */
 	case 0x09:		/* wbinvd */
+		kvm_emulate_wbinvd(ctxt->vcpu);
+		c->dst.type = OP_NONE;
+		break;
+	case 0x08:		/* invd */
 	case 0x0d:		/* GrpP (prefetch) */
 	case 0x18:		/* Grp16 (prefetch/nop) */
 		c->dst.type = OP_NONE;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 587b99d..56c9b6b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
 	return false;
 }
 
+static bool svm_has_wbinvd_exit(void)
+{
+	return true;
+}
+
 static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.rdtscp_supported = svm_rdtscp_supported,
 
 	.set_supported_cpuid = svm_set_supported_cpuid,
+
+	.has_wbinvd_exit = svm_has_wbinvd_exit,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e565689..806ab12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void)
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_WBINVD_EXITING;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -3400,7 +3406,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 	skip_emulated_instruction(vcpu);
-	/* TODO: Add support for VT-d/pass-through device */
+	kvm_emulate_wbinvd(vcpu);
 	return 1;
 }
 
@@ -4350,6 +4356,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.rdtscp_supported = vmx_rdtscp_supported,
 
 	.set_supported_cpuid = vmx_set_supported_cpuid,
+
+	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0b9252..eea75f5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1783,8 +1783,28 @@ out:
 	return r;
 }
 
+static void wbinvd_ipi(void *garbage)
+{
+	wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.iommu_domain &&
+		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	/* Address WBINVD may be executed by guest */
+	if (need_emulate_wbinvd(vcpu)) {
+		if (kvm_x86_ops->has_wbinvd_exit())
+			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+			smp_call_function_single(vcpu->cpu,
+					wbinvd_ipi, NULL, 1);
+	}
+
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
 	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
 		unsigned long khz = cpufreq_quick_get(cpu);
@@ -3650,6 +3670,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 	return X86EMUL_CONTINUE;
 }
 
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+	if (!need_emulate_wbinvd(vcpu))
+		return X86EMUL_CONTINUE;
+
+	if (kvm_x86_ops->has_wbinvd_exit()) {
+		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+				wbinvd_ipi, NULL, 1);
+		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+	}
+	wbinvd();
+	return X86EMUL_CONTINUE;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
 int emulate_clts(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
@@ -5255,6 +5290,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 		vcpu->arch.time_page = NULL;
 	}
 
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
@@ -5384,7 +5420,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+		goto fail_free_mce_banks;
+
 	return 0;
+fail_free_mce_banks:
+	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
 	kvm_free_lapic(vcpu);
 fail_mmu_destroy:
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH v6 2/2] KVM: Using workqueue for WBINVD
  2010-06-30  4:25 [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Sheng Yang
@ 2010-06-30  4:25 ` Sheng Yang
  2010-06-30 16:41   ` Marcelo Tosatti
  2010-06-30 16:34 ` [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Marcelo Tosatti
  1 sibling, 1 reply; 4+ messages in thread
From: Sheng Yang @ 2010-06-30  4:25 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Jan Kiszka, Joerg Roedel, kvm, Sheng Yang

It would buy us the ability to schedule compared to smp_call_function().

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---

But I am not sure if it worth the complexity. Anyway WBINVD itself can't be
interrupted, so the benefit should come to the caller cpu I think. And that
would extended the waiting time for the caller cpu, for we still need to wait
other cpus to complete their works.

 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/x86.c              |   35 ++++++++++++++++++++++++++++-------
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2bda624..6e0b793 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,6 +16,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
 #include <linux/cpumask.h>
+#include <linux/workqueue.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eea75f5..13f7c88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -153,6 +153,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
+static struct workqueue_struct *wbinvd_wq;
+static DEFINE_PER_CPU(struct work_struct, wbinvd_work);
+
 static inline u32 bit(int bitno)
 {
 	return 1 << (bitno & 31);
@@ -1783,7 +1786,7 @@ out:
 	return r;
 }
 
-static void wbinvd_ipi(void *garbage)
+static void wbinvd_do_work(struct work_struct *work)
 {
 	wbinvd();
 }
@@ -1800,9 +1803,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (need_emulate_wbinvd(vcpu)) {
 		if (kvm_x86_ops->has_wbinvd_exit())
 			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
-		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
-			smp_call_function_single(vcpu->cpu,
-					wbinvd_ipi, NULL, 1);
+		else if (vcpu->cpu != -1 && vcpu->cpu != cpu) {
+			queue_work_on(vcpu->cpu, wbinvd_wq,
+					&per_cpu(wbinvd_work, vcpu->cpu));
+			flush_workqueue(wbinvd_wq);
+		}
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
@@ -3672,12 +3677,16 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
+	int cpu;
+
 	if (!need_emulate_wbinvd(vcpu))
 		return X86EMUL_CONTINUE;
 
 	if (kvm_x86_ops->has_wbinvd_exit()) {
-		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
-				wbinvd_ipi, NULL, 1);
+		for_each_cpu(cpu, vcpu->arch.wbinvd_dirty_mask)
+			queue_work_on(cpu, wbinvd_wq,
+					&per_cpu(wbinvd_work, cpu));
+		flush_workqueue(wbinvd_wq);
 		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
 	}
 	wbinvd();
@@ -4179,7 +4188,7 @@ EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
 
 int kvm_arch_init(void *opaque)
 {
-	int r;
+	int r, cpu;
 	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
 
 	if (kvm_x86_ops) {
@@ -4218,6 +4227,16 @@ int kvm_arch_init(void *opaque)
 	if (cpu_has_xsave)
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
+	for_each_possible_cpu(cpu) {
+		INIT_WORK(&per_cpu(wbinvd_work, cpu), wbinvd_do_work);
+	}
+
+	wbinvd_wq = create_workqueue("kvm_wbinvd");
+	if (!wbinvd_wq) {
+		r = -ENOMEM;
+		goto out;
+	}
+
 	return 0;
 
 out:
@@ -4226,6 +4245,8 @@ out:
 
 void kvm_arch_exit(void)
 {
+	destroy_workqueue(wbinvd_wq);
+
 	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
 
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v6 2/2] KVM: Using workqueue for WBINVD
  2010-06-30  4:25 ` [PATCH v6 2/2] KVM: Using workqueue for WBINVD Sheng Yang
@ 2010-06-30 16:41   ` Marcelo Tosatti
  0 siblings, 0 replies; 4+ messages in thread
From: Marcelo Tosatti @ 2010-06-30 16:41 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Jan Kiszka, Joerg Roedel, kvm

On Wed, Jun 30, 2010 at 12:25:16PM +0800, Sheng Yang wrote:
> It would buy us the ability to schedule compared to smp_call_function().
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
> 
> But I am not sure if it worth the complexity. Anyway WBINVD itself can't be
> interrupted, so the benefit should come to the caller cpu I think. And that
> would extended the waiting time for the caller cpu, for we still need to wait
> other cpus to complete their works.

Alright, can be done later in case it shows to be necessary.

>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/kvm/x86.c              |   35 ++++++++++++++++++++++++++++-------
>  2 files changed, 29 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2bda624..6e0b793 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -16,6 +16,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/tracepoint.h>
>  #include <linux/cpumask.h>
> +#include <linux/workqueue.h>
>  
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eea75f5..13f7c88 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -153,6 +153,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  
>  u64 __read_mostly host_xcr0;
>  
> +static struct workqueue_struct *wbinvd_wq;
> +static DEFINE_PER_CPU(struct work_struct, wbinvd_work);
> +
>  static inline u32 bit(int bitno)
>  {
>  	return 1 << (bitno & 31);
> @@ -1783,7 +1786,7 @@ out:
>  	return r;
>  }
>  
> -static void wbinvd_ipi(void *garbage)
> +static void wbinvd_do_work(struct work_struct *work)
>  {
>  	wbinvd();
>  }
> @@ -1800,9 +1803,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	if (need_emulate_wbinvd(vcpu)) {
>  		if (kvm_x86_ops->has_wbinvd_exit())
>  			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
> -		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
> -			smp_call_function_single(vcpu->cpu,
> -					wbinvd_ipi, NULL, 1);
> +		else if (vcpu->cpu != -1 && vcpu->cpu != cpu) {
> +			queue_work_on(vcpu->cpu, wbinvd_wq,
> +					&per_cpu(wbinvd_work, vcpu->cpu));
> +			flush_workqueue(wbinvd_wq);
> +		}

Can't schedule here since preemption must be disabled during vcpu_load.

>  	}
>  
>  	kvm_x86_ops->vcpu_load(vcpu, cpu);
> @@ -3672,12 +3677,16 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
>  
>  int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
>  {
> +	int cpu;
> +
>  	if (!need_emulate_wbinvd(vcpu))
>  		return X86EMUL_CONTINUE;
>  
>  	if (kvm_x86_ops->has_wbinvd_exit()) {
> -		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
> -				wbinvd_ipi, NULL, 1);
> +		for_each_cpu(cpu, vcpu->arch.wbinvd_dirty_mask)
> +			queue_work_on(cpu, wbinvd_wq,
> +					&per_cpu(wbinvd_work, cpu));
> +		flush_workqueue(wbinvd_wq);
>  		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
>  	}
>  	wbinvd();

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
  2010-06-30  4:25 [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Sheng Yang
  2010-06-30  4:25 ` [PATCH v6 2/2] KVM: Using workqueue for WBINVD Sheng Yang
@ 2010-06-30 16:34 ` Marcelo Tosatti
  1 sibling, 0 replies; 4+ messages in thread
From: Marcelo Tosatti @ 2010-06-30 16:34 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Jan Kiszka, Joerg Roedel, kvm, Yaozu (Eddie) Dong

On Wed, Jun 30, 2010 at 12:25:15PM +0800, Sheng Yang wrote:
> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> CLFLUSH, we need to maintain data consistency either by:
> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> wbinvd exit, or
> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
> 
> Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    6 +++++
>  arch/x86/kvm/emulate.c          |    5 +++-
>  arch/x86/kvm/svm.c              |    7 ++++++
>  arch/x86/kvm/vmx.c              |   10 ++++++++-
>  arch/x86/kvm/x86.c              |   41 +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 67 insertions(+), 2 deletions(-)

Applied, thanks.
.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-06-30 17:03 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-06-30  4:25 [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Sheng Yang
2010-06-30  4:25 ` [PATCH v6 2/2] KVM: Using workqueue for WBINVD Sheng Yang
2010-06-30 16:41   ` Marcelo Tosatti
2010-06-30 16:34 ` [PATCH v6 1/2] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Marcelo Tosatti

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.