* [PATCH] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
@ 2010-06-25 7:57 Sheng Yang
2010-06-25 8:54 ` Jan Kiszka
0 siblings, 1 reply; 3+ messages in thread
From: Sheng Yang @ 2010-06-25 7:57 UTC (permalink / raw)
To: Avi Kivity, Marcelo Tosatti; +Cc: kvm, Sheng Yang, Yaozu (Eddie) Dong
Some guest device drivers may leverage the "Non-Snoop" I/O, and explicitly
WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
CLFLUSH, we need to maintain data consistency either by:
1: flushing cache (wbinvd) when the guest is scheduled out if there is no
wbinvd exit, or
2: executing wbinvd on all dirty physical CPUs when guest wbinvd exits.
For wbinvd VMExit capable processors, we issue IPIs to all physical CPUs to
do wbinvd, for we can't easily tell which physical CPUs are "dirty".
Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
arch/x86/include/asm/kvm_host.h | 3 +++
arch/x86/kvm/emulate.c | 5 ++++-
arch/x86/kvm/svm.c | 6 ++++++
arch/x86/kvm/vmx.c | 27 ++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 6 ++++++
5 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a57cdea..1c392c9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -514,6 +514,8 @@ struct kvm_x86_ops {
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+ void (*execute_wbinvd)(struct kvm_vcpu *vcpu);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -571,6 +573,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
+int emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abb8cec..085dcb7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3138,8 +3138,11 @@ twobyte_insn:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
break;
- case 0x08: /* invd */
case 0x09: /* wbinvd */
+ emulate_wbinvd(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
c->dst.type = OP_NONE;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 587b99d..6929da1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3424,6 +3424,10 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static void svm_execute_wbinvd(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3508,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
+
+ .execute_wbinvd = svm_execute_wbinvd,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e565689..063002c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool cpu_has_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -874,6 +880,11 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
preempt_enable();
}
+static void wbinvd_ipi(void *opaque)
+{
+ wbinvd();
+}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -905,6 +916,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
&per_cpu(vcpus_on_cpu, cpu));
local_irq_enable();
+ /* Issue WBINVD in case guest has executed it */
+ if (!cpu_has_wbinvd_exit() && vcpu->kvm->arch.iommu_domain &&
+ vcpu->cpu != -1)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+
vcpu->cpu = cpu;
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
@@ -3397,10 +3414,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
return 1;
}
+static void vmx_execute_wbinvd(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->kvm->arch.iommu_domain)
+ smp_call_function(wbinvd_ipi, NULL, 1);
+}
+
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
+ vmx_execute_wbinvd(vcpu);
return 1;
}
@@ -4350,6 +4373,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.rdtscp_supported = vmx_rdtscp_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .execute_wbinvd = vmx_execute_wbinvd,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0b9252..eba3a2b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3650,6 +3650,12 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
+int emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->execute_wbinvd(vcpu);
+ return X86EMUL_CONTINUE;
+}
+
int emulate_clts(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
--
1.7.0.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-25 7:57 [PATCH] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Sheng Yang
@ 2010-06-25 8:54 ` Jan Kiszka
2010-06-25 10:20 ` Sheng Yang
0 siblings, 1 reply; 3+ messages in thread
From: Jan Kiszka @ 2010-06-25 8:54 UTC (permalink / raw)
To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Yaozu (Eddie) Dong
Sheng Yang wrote:
> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> CLFLUSH, we need to maintain data consistency either by:
> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> wbinvd exit, or
> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>
> For wbinvd VMExit capable processors, we issue IPIs to all physical CPUs to
> do wbinvd, for we can't easily tell which physical CPUs are "dirty".
wbinvd is a heavy weapon in the hands of a guest. Even if it is limited
to pass-through scenarios, do we really need to bother all physical host
CPUs with potential multi-millisecond stalls? Think of VMs only running
on a subset of CPUs (e.g. to isolate latency sources). I would suggest
to track the physical CPU usage of VCPUs between two wbinvd requests and
only send the wbinvd IPI to that set.
Also, I think the code is still too much vmx-focused. Only the trapping
should be vendor specific, the rest generic.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-25 8:54 ` Jan Kiszka
@ 2010-06-25 10:20 ` Sheng Yang
0 siblings, 0 replies; 3+ messages in thread
From: Sheng Yang @ 2010-06-25 10:20 UTC (permalink / raw)
To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Yaozu (Eddie) Dong
On Friday 25 June 2010 16:54:19 Jan Kiszka wrote:
> Sheng Yang wrote:
> > Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> > WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD
> > or CLFLUSH, we need to maintain data consistency either by:
> > 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> > wbinvd exit, or
> > 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
> >
> > For wbinvd VMExit capable processors, we issue IPIs to all physical CPUs
> > to do wbinvd, for we can't easily tell which physical CPUs are "dirty".
>
> wbinvd is a heavy weapon in the hands of a guest. Even if it is limited
> to pass-through scenarios, do we really need to bother all physical host
> CPUs with potential multi-millisecond stalls? Think of VMs only running
> on a subset of CPUs (e.g. to isolate latency sources). I would suggest
> to track the physical CPU usage of VCPUs between two wbinvd requests and
> only send the wbinvd IPI to that set.
OK, would try to make it more specific (and complex)...
>
> Also, I think the code is still too much vmx-focused. Only the trapping
> should be vendor specific, the rest generic.
OK, would consider it.
--
regards
Yang, Sheng
>
> Jan
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2010-06-25 10:21 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-06-25 7:57 [PATCH] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Sheng Yang
2010-06-25 8:54 ` Jan Kiszka
2010-06-25 10:20 ` Sheng Yang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox