* [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
@ 2010-06-28 3:36 Sheng Yang
2010-06-28 3:56 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-06-28 3:36 UTC (permalink / raw)
To: Avi Kivity
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm, Sheng Yang,
Yaozu (Eddie) Dong
Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
CLFLUSH, we need to maintain data consistency either by:
1: flushing cache (wbinvd) when the guest is scheduled out if there is no
wbinvd exit, or
2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
OK, I've checked AMD's spec, they do intercept WBINVD. So we can do it like
this.
arch/x86/include/asm/kvm_host.h | 6 ++++++
arch/x86/kvm/emulate.c | 5 ++++-
arch/x86/kvm/svm.c | 7 +++++++
arch/x86/kvm/vmx.c | 10 +++++++++-
arch/x86/kvm/x86.c | 29 +++++++++++++++++++++++++++++
5 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a57cdea..b385d6f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -358,6 +359,8 @@ struct kvm_vcpu_arch {
/* fields used by HYPER-V emulation */
u64 hv_vapic;
+
+ cpumask_t wbinvd_dirty_mask;
};
struct kvm_arch {
@@ -514,6 +517,8 @@ struct kvm_x86_ops {
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+ bool (*has_wbinvd_exit)(void);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abb8cec..e8bdddc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3138,8 +3138,11 @@ twobyte_insn:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
break;
- case 0x08: /* invd */
case 0x09: /* wbinvd */
+ kvm_emulate_wbinvd(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
c->dst.type = OP_NONE;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 587b99d..56c9b6b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e565689..806ab12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -3400,7 +3406,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
+ kvm_emulate_wbinvd(vcpu);
return 1;
}
@@ -4350,6 +4356,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.rdtscp_supported = vmx_rdtscp_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0b9252..cfb6fad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1783,8 +1783,22 @@ out:
return r;
}
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ /* Address WBINVD may be executed by guest */
+ if (vcpu->kvm->arch.iommu_domain) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpu_set(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
unsigned long khz = cpufreq_quick_get(cpu);
@@ -3650,6 +3664,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->kvm->arch.iommu_domain)
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ smp_call_function_many(&vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ cpus_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
+ return X86EMUL_CONTINUE;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
int emulate_clts(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
--
1.7.0.1
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 3:36 [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Sheng Yang
@ 2010-06-28 3:56 ` Avi Kivity
2010-06-28 6:42 ` Sheng Yang
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-28 3:56 UTC (permalink / raw)
To: Sheng Yang
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On 06/28/2010 06:36 AM, Sheng Yang wrote:
> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> CLFLUSH, we need to maintain data consistency either by:
>
Don't we always force enable snooping? Or is that only for the
processor, and you're worried about devices?
> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> wbinvd exit, or
> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>
>
>
> /* fields used by HYPER-V emulation */
> u64 hv_vapic;
> +
> + cpumask_t wbinvd_dirty_mask;
> };
>
>
Need alloc_cpumask_var()/free_cpumask_var() for very large hosts.
>
> +static void wbinvd_ipi(void *garbage)
> +{
> + wbinvd();
> +}
>
Like Jan mentioned, this is quite heavy. What about a clflush() loop
instead? That may take more time, but at least it's preemptible. Of
course, it isn't preemptible in an IPI.
> +
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> {
> + /* Address WBINVD may be executed by guest */
> + if (vcpu->kvm->arch.iommu_domain) {
> + if (kvm_x86_ops->has_wbinvd_exit())
> + cpu_set(cpu, vcpu->arch.wbinvd_dirty_mask);
> + else if (vcpu->cpu != -1)
> + smp_call_function_single(vcpu->cpu,
> + wbinvd_ipi, NULL, 1);
>
Is there any point to doing this if !has_wbinvd_exit()? The vcpu might
not have migrated in time, so the cache is flushed too late.
> + }
> +
> kvm_x86_ops->vcpu_load(vcpu, cpu);
> if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
> unsigned long khz = cpufreq_quick_get(cpu);
> @@ -3650,6 +3664,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
> return X86EMUL_CONTINUE;
> }
>
> +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
> +{
> + if (!vcpu->kvm->arch.iommu_domain)
> + return X86EMUL_CONTINUE;
> +
> + if (kvm_x86_ops->has_wbinvd_exit()) {
> + smp_call_function_many(&vcpu->arch.wbinvd_dirty_mask,
> + wbinvd_ipi, NULL, 1);
> + cpus_clear(vcpu->arch.wbinvd_dirty_mask);
>
Race - a migration may set a new bit in wbinvd_dirty_mask after the
s_c_f_m().
However, it's probably benign, since we won't be entering the guest in
that period.
> + } else
> + wbinvd();
> + return X86EMUL_CONTINUE;
> +}
> +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
> +
> int emulate_clts(struct kvm_vcpu *vcpu)
> {
> kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
>
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 3:56 ` Avi Kivity
@ 2010-06-28 6:42 ` Sheng Yang
2010-06-28 6:56 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-06-28 6:42 UTC (permalink / raw)
To: Avi Kivity
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On Monday 28 June 2010 11:56:08 Avi Kivity wrote:
> On 06/28/2010 06:36 AM, Sheng Yang wrote:
> > Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> > WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD
> > or
>
> > CLFLUSH, we need to maintain data consistency either by:
> Don't we always force enable snooping? Or is that only for the
> processor, and you're worried about devices?
We only force enabling snooping for a capable VT-d engine (with the
KVM_IOMMU_CACHE_COHERENCY flag, on most recent server boards). And you're right,
with a snooping-capable VT-d engine we don't need to do all this. I would address
it in the next version.
> > 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> > wbinvd exit, or
> > 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
> >
> > /* fields used by HYPER-V emulation */
> > u64 hv_vapic;
> >
> > +
> > + cpumask_t wbinvd_dirty_mask;
> >
> > };
>
> Need alloc_cpumask_var()/free_cpumask_var() for very large hosts.
OK.
>
> > +static void wbinvd_ipi(void *garbage)
> > +{
> > + wbinvd();
> > +}
>
> Like Jan mentioned, this is quite heavy. What about a clflush() loop
> instead? That may take more time, but at least it's preemptible. Of
> course, it isn't preemptible in an IPI.
I think this kind of behavior happens rarely, and most recent processors should
have WBINVD exit, which means it's an IPI... So I think it's maybe acceptable here.
> > +
> >
> > void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> > {
> >
> > + /* Address WBINVD may be executed by guest */
> > + if (vcpu->kvm->arch.iommu_domain) {
> > + if (kvm_x86_ops->has_wbinvd_exit())
> > + cpu_set(cpu, vcpu->arch.wbinvd_dirty_mask);
> > + else if (vcpu->cpu != -1)
> > + smp_call_function_single(vcpu->cpu,
> > + wbinvd_ipi, NULL, 1);
>
> Is there any point to doing this if !has_wbinvd_exit()? The vcpu might
> not have migrated in time, so the cache is flushed too late.
For the !has_wbinvd_exit() case, the instruction would be executed by the guest and flush
the current processor immediately. And we can ensure that it's clean on the last
CPU, so we're fine.
> > + }
> > +
> >
> > kvm_x86_ops->vcpu_load(vcpu, cpu);
> > if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
> >
> > unsigned long khz = cpufreq_quick_get(cpu);
> >
> > @@ -3650,6 +3664,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t
> > address)
> >
> > return X86EMUL_CONTINUE;
> >
> > }
> >
> > +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
> > +{
> > + if (!vcpu->kvm->arch.iommu_domain)
> > + return X86EMUL_CONTINUE;
> > +
> > + if (kvm_x86_ops->has_wbinvd_exit()) {
> > + smp_call_function_many(&vcpu->arch.wbinvd_dirty_mask,
> > + wbinvd_ipi, NULL, 1);
> > + cpus_clear(vcpu->arch.wbinvd_dirty_mask);
>
> Race - a migration may set a new bit in wbinvd_dirty_mask after the
> s_c_f_m().
>
> However, it's probably benign, since we won't be entering the guest in
> that period.
Yes. :)
--
regards
Yang, Sheng
>
> > + } else
> > + wbinvd();
> > + return X86EMUL_CONTINUE;
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
> > +
> >
> > int emulate_clts(struct kvm_vcpu *vcpu)
> > {
> >
> > kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 6:56 ` Avi Kivity
@ 2010-06-28 6:56 ` Sheng Yang
2010-06-28 7:08 ` Avi Kivity
2010-06-28 7:30 ` [PATCH v3] " Dong, Eddie
1 sibling, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-06-28 6:56 UTC (permalink / raw)
To: Avi Kivity
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On Monday 28 June 2010 14:56:38 Avi Kivity wrote:
> On 06/28/2010 09:42 AM, Sheng Yang wrote:
> >>> +static void wbinvd_ipi(void *garbage)
> >>> +{
> >>> + wbinvd();
> >>> +}
> >>
> >> Like Jan mentioned, this is quite heavy. What about a clflush() loop
> >> instead? That may take more time, but at least it's preemptible. Of
> >> course, it isn't preemptible in an IPI.
> >
> > I think this kind of behavior happened rarely, and most recent processor
> > should have WBINVD exit which means it's an IPI... So I think it's maybe
> > acceptable here.
>
> Several milliseconds of non-responsiveness may not be acceptable for
> some applications. So I think queue_work_on() and a clflush loop is
> better than an IPI and wbinvd.
OK... Would update it in the next version.
--
regards
Yang, Sheng
>
> >>> +
> >>>
> >>> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>> {
> >>>
> >>> + /* Address WBINVD may be executed by guest */
> >>> + if (vcpu->kvm->arch.iommu_domain) {
> >>> + if (kvm_x86_ops->has_wbinvd_exit())
> >>> + cpu_set(cpu, vcpu->arch.wbinvd_dirty_mask);
> >>> + else if (vcpu->cpu != -1)
> >>> + smp_call_function_single(vcpu->cpu,
> >>> + wbinvd_ipi, NULL, 1);
> >>
> >> Is there any point to doing this if !has_wbinvd_exit()? The vcpu might
> >> not have migrated in time, so the cache is flushed too late.
> >
> > For the !has_wbinvd_exit(), the instruction would be executed by guest
> > and flush the current processor immediately. And we can ensure that it's
> > clean in the last CPU, so we're fine.
>
> Ah, yes.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 6:42 ` Sheng Yang
@ 2010-06-28 6:56 ` Avi Kivity
2010-06-28 6:56 ` Sheng Yang
2010-06-28 7:30 ` [PATCH v3] " Dong, Eddie
0 siblings, 2 replies; 32+ messages in thread
From: Avi Kivity @ 2010-06-28 6:56 UTC (permalink / raw)
To: Sheng Yang
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On 06/28/2010 09:42 AM, Sheng Yang wrote:
>>> +static void wbinvd_ipi(void *garbage)
>>> +{
>>> + wbinvd();
>>> +}
>>>
>> Like Jan mentioned, this is quite heavy. What about a clflush() loop
>> instead? That may take more time, but at least it's preemptible. Of
>> course, it isn't preemptible in an IPI.
>>
>
> I think this kind of behavior happened rarely, and most recent processor should
> have WBINVD exit which means it's an IPI... So I think it's maybe acceptable here.
>
>
Several milliseconds of non-responsiveness may not be acceptable for
some applications. So I think queue_work_on() and a clflush loop is
better than an IPI and wbinvd.
>>> +
>>>
>>> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>> {
>>>
>>> + /* Address WBINVD may be executed by guest */
>>> + if (vcpu->kvm->arch.iommu_domain) {
>>> + if (kvm_x86_ops->has_wbinvd_exit())
>>> + cpu_set(cpu, vcpu->arch.wbinvd_dirty_mask);
>>> + else if (vcpu->cpu != -1)
>>> + smp_call_function_single(vcpu->cpu,
>>> + wbinvd_ipi, NULL, 1);
>>>
>> Is there any point to doing this if !has_wbinvd_exit()? The vcpu might
>> not have migrated in time, so the cache is flushed too late.
>>
> For the !has_wbinvd_exit(), the instruction would be executed by guest and flush
> the current processor immediately. And we can ensure that it's clean in the last
> CPU, so we're fine.
>
Ah, yes.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 6:56 ` Sheng Yang
@ 2010-06-28 7:08 ` Avi Kivity
2010-06-28 7:41 ` Sheng Yang
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-28 7:08 UTC (permalink / raw)
To: Sheng Yang
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On 06/28/2010 09:56 AM, Sheng Yang wrote:
> On Monday 28 June 2010 14:56:38 Avi Kivity wrote:
>
>> On 06/28/2010 09:42 AM, Sheng Yang wrote:
>>
>>>>> +static void wbinvd_ipi(void *garbage)
>>>>> +{
>>>>> + wbinvd();
>>>>> +}
>>>>>
>>>> Like Jan mentioned, this is quite heavy. What about a clflush() loop
>>>> instead? That may take more time, but at least it's preemptible. Of
>>>> course, it isn't preemptible in an IPI.
>>>>
>>> I think this kind of behavior happened rarely, and most recent processor
>>> should have WBINVD exit which means it's an IPI... So I think it's maybe
>>> acceptable here.
>>>
>> Several milliseconds of non-responsiveness may not be acceptable for
>> some applications. So I think queue_work_on() and a clflush loop is
>> better than an IPI and wbinvd.
>>
> OK... Would update it in the next version.
>
>
Hm, the manual says (regarding clflush):
> Invalidates the cache line that contains the linear address specified
> with the source
> operand from all levels of the processor cache hierarchy (data and
> instruction). The
> invalidation is broadcast throughout the cache coherence domain. If,
> at any level of
> the cache hierarchy, the line is inconsistent with memory (dirty) it
> is written to
> memory before invalidation.
So I don't think you need to queue_work_on(), instead you can work in
vcpu thread context. But better check with someone that really knows.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 32+ messages in thread
* RE: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 6:56 ` Avi Kivity
2010-06-28 6:56 ` Sheng Yang
@ 2010-06-28 7:30 ` Dong, Eddie
2010-06-28 8:04 ` Avi Kivity
1 sibling, 1 reply; 32+ messages in thread
From: Dong, Eddie @ 2010-06-28 7:30 UTC (permalink / raw)
To: Avi Kivity, Sheng Yang
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm@vger.kernel.org,
Dong, Eddie
Avi Kivity wrote:
> On 06/28/2010 09:42 AM, Sheng Yang wrote:
>>>> +static void wbinvd_ipi(void *garbage)
>>>> +{
>>>> + wbinvd();
>>>> +}
>>>>
>>> Like Jan mentioned, this is quite heavy. What about a clflush()
>>> loop instead? That may take more time, but at least it's
>>> preemptible. Of course, it isn't preemptible in an IPI.
>>>
>>
>> I think this kind of behavior happened rarely, and most recent
>> processor should have WBINVD exit which means it's an IPI... So I
>> think it's maybe acceptable here.
>>
>>
>
> Several milliseconds of non-responsiveness may not be acceptable for
> some applications. So I think queue_work_on() and a clflush loop is
> better than an IPI and wbinvd.
>
Probably we should make it configurable. For RT usage models, we do care about responsiveness more than performance, but for a typical server usage model, we'd better focus on performance in this issue. WBINVD may perform much much better than CLFLUSH, and a malicious guest repeatedly issuing wbinvd may greatly impact the system performance.
thx, eddie
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 7:08 ` Avi Kivity
@ 2010-06-28 7:41 ` Sheng Yang
2010-06-28 8:07 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-06-28 7:41 UTC (permalink / raw)
To: Avi Kivity
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On Monday 28 June 2010 15:08:56 Avi Kivity wrote:
> On 06/28/2010 09:56 AM, Sheng Yang wrote:
> > On Monday 28 June 2010 14:56:38 Avi Kivity wrote:
> >> On 06/28/2010 09:42 AM, Sheng Yang wrote:
> >>>>> +static void wbinvd_ipi(void *garbage)
> >>>>> +{
> >>>>> + wbinvd();
> >>>>> +}
> >>>>
> >>>> Like Jan mentioned, this is quite heavy. What about a clflush() loop
> >>>> instead? That may take more time, but at least it's preemptible. Of
> >>>> course, it isn't preemptible in an IPI.
> >>>
> >>> I think this kind of behavior happened rarely, and most recent
> >>> processor should have WBINVD exit which means it's an IPI... So I
> >>> think it's maybe acceptable here.
> >>
> >> Several milliseconds of non-responsiveness may not be acceptable for
> >> some applications. So I think queue_work_on() and a clflush loop is
> >> better than an IPI and wbinvd.
> >
> > OK... Would update it in the next version.
>
> Hm, the manual says (regarding clflush):
> > Invalidates the cache line that contains the linear address specified
> > with the source
> > operand from all levels of the processor cache hierarchy (data and
> > instruction). The
> > invalidation is broadcast throughout the cache coherence domain. If,
> > at any level of
> > the cache hierarchy, the line is inconsistent with memory (dirty) it
> > is written to
> > memory before invalidation.
>
> So I don't think you need to queue_work_on(), instead you can work in
> vcpu thread context. But better check with someone that really knows.
Yeah, I've just checked the instruction as well. Since it would be broadcast,
it seems we don't even need (and can't have) a dirty bitmap. So the overhead on a
large machine should be big.
And I've calculated the number of times we need to execute clflush for the whole guest memory. If
I calculate it right, for a 64bit guest, clflush can only cover 64 bytes at a time,
so for a typical 4G guest, we would need to execute the command 4G / 64 = 64M
times. The cycles used by clflush can vary; suppose it would use 10 cycles each
(which sounds impossible, for involving broadcast and writeback, and not including
cache refill time for all processors), it would cost more than 0.2 seconds each time
on a 3.2GHz machine...
--
regards
Yang, Sheng
--
regards
Yang, Sheng
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 7:30 ` [PATCH v3] " Dong, Eddie
@ 2010-06-28 8:04 ` Avi Kivity
2010-06-28 8:16 ` Dong, Eddie
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-28 8:04 UTC (permalink / raw)
To: Dong, Eddie
Cc: Sheng Yang, Jan Kiszka, Marcelo Tosatti, Joerg Roedel,
kvm@vger.kernel.org
On 06/28/2010 10:30 AM, Dong, Eddie wrote:
>>
>> Several milliseconds of non-responsiveness may not be acceptable for
>> some applications. So I think queue_work_on() and a clflush loop is
>> better than an IPI and wbinvd.
>>
>>
> Probably we should make it configurable. For RT usage models, we do care about responsiveness more than performance, but for typical server useg model, we'd better focus on performance in this issue. WBINVD may perform much much better than CLFLUSH, and a mallicious guest repeatedly issuing wbinvd may greatly impact the system performance.
>
I'm not even sure clflush can work. I thought you could loop on just
the cache size, but it appears you'll need to loop over the entire guest
address space, which could take ages.
So I guess we'll have to settle for wbinvd, just avoiding it when the
hardware allows us to.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 7:41 ` Sheng Yang
@ 2010-06-28 8:07 ` Avi Kivity
2010-06-28 8:42 ` [PATCH v4] " Sheng Yang
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-28 8:07 UTC (permalink / raw)
To: Sheng Yang
Cc: Jan Kiszka, Marcelo Tosatti, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On 06/28/2010 10:41 AM, Sheng Yang wrote:
>
>> Hm, the manual says (regarding clflush):
>>
>>> Invalidates the cache line that contains the linear address specified
>>> with the source
>>> operand from all levels of the processor cache hierarchy (data and
>>> instruction). The
>>> invalidation is broadcast throughout the cache coherence domain. If,
>>> at any level of
>>> the cache hierarchy, the line is inconsistent with memory (dirty) it
>>> is written to
>>> memory before invalidation.
>>>
>> So I don't think you need to queue_work_on(), instead you can work in
>> vcpu thread context. But better check with someone that really knows.
>>
> Yeah, I've just checked the instruction as well. For it would be boardcasted,
> seems we even don't need(and can't have) a dirty bitmap. So the overhead on the
> large machine should be big.
>
> And I've calculated the times we need to execute clflush for whole guest memory. If
> I calculate it right, for a 64bit guest, clflush can only cover 64 bytes one time,
> so for a typical 4G guest, we would need to execute the command for 4G / 64 = 64M
> times. The cycles used by clflush can be vary, suppose it would use 10 cycles each
> (which sounds impossible, for involving boardcast and writeback, and not including
> cache refill time for all processors), it would cost more than 0.2 seconds one time
> on an 3.2Ghz machine...
>
Right, so clflush can't be made to work.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* RE: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 8:04 ` Avi Kivity
@ 2010-06-28 8:16 ` Dong, Eddie
2010-06-28 8:45 ` Jan Kiszka
0 siblings, 1 reply; 32+ messages in thread
From: Dong, Eddie @ 2010-06-28 8:16 UTC (permalink / raw)
To: Avi Kivity
Cc: Sheng Yang, Jan Kiszka, Marcelo Tosatti, Joerg Roedel,
kvm@vger.kernel.org, Dong, Eddie
Avi Kivity wrote:
> On 06/28/2010 10:30 AM, Dong, Eddie wrote:
>>>
>>> Several milliseconds of non-responsiveness may not be acceptable for
>>> some applications. So I think queue_work_on() and a clflush loop is
>>> better than an IPI and wbinvd.
>>>
>>>
>> Probably we should make it configurable. For RT usage models, we do
>> care about responsiveness more than performance, but for typical
>> server useg model, we'd better focus on performance in this issue.
>> WBINVD may perform much much better than CLFLUSH, and a mallicious
>> guest repeatedly issuing wbinvd may greatly impact the system
>> performance.
>>
>
> I'm not even sure clflush can work. I thought you could loop on just
> the cache size, but it appears you'll need to loop over the entire
> guest address space, which could take ages.
If the RT usage model comes into reality, we may have to do it this way, though paying a huge overhead :)
Is there any RT customers here?
>
> So I guess we'll have to settle for wbinvd, just avoiding it when the
> hardware allows us to.
Yes, for now I agree we can just use wbinvd to emulate wbinvd :)
Thx, Eddie
^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH v4] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 8:07 ` Avi Kivity
@ 2010-06-28 8:42 ` Sheng Yang
2010-06-28 9:27 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-06-28 8:42 UTC (permalink / raw)
To: Avi Kivity
Cc: Marcelo Tosatti, Jan Kiszka, kvm, Sheng Yang, Yaozu (Eddie) Dong
Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
CLFLUSH, we need to maintain data consistency either by:
1: flushing cache (wbinvd) when the guest is scheduled out if there is no
wbinvd exit, or
2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
arch/x86/include/asm/kvm_host.h | 6 +++++
arch/x86/kvm/emulate.c | 5 +++-
arch/x86/kvm/svm.c | 7 ++++++
arch/x86/kvm/vmx.c | 10 ++++++++-
arch/x86/kvm/x86.c | 44 ++++++++++++++++++++++++++++++++++++++-
5 files changed, 69 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a57cdea..2bda624 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -358,6 +359,8 @@ struct kvm_vcpu_arch {
/* fields used by HYPER-V emulation */
u64 hv_vapic;
+
+ cpumask_var_t wbinvd_dirty_mask;
};
struct kvm_arch {
@@ -514,6 +517,8 @@ struct kvm_x86_ops {
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+ bool (*has_wbinvd_exit)(void);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abb8cec..e8bdddc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3138,8 +3138,11 @@ twobyte_insn:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
break;
- case 0x08: /* invd */
case 0x09: /* wbinvd */
+ kvm_emulate_wbinvd(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
c->dst.type = OP_NONE;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 587b99d..56c9b6b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e565689..806ab12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -3400,7 +3406,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
+ kvm_emulate_wbinvd(vcpu);
return 1;
}
@@ -4350,6 +4356,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.rdtscp_supported = vmx_rdtscp_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0b9252..f3fcf0d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1783,8 +1783,28 @@ out:
return r;
}
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
unsigned long khz = cpufreq_quick_get(cpu);
@@ -3650,6 +3670,18 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ if (need_emulate_wbinvd(vcpu) && kvm_x86_ops->has_wbinvd_exit()) {
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
+ return X86EMUL_CONTINUE;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
int emulate_clts(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
@@ -5255,6 +5287,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5262,7 +5295,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
- return kvm_x86_ops->vcpu_create(kvm, id);
+ struct kvm_vcpu *vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+ if (IS_ERR(vcpu))
+ return vcpu;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+ kvm_x86_ops->vcpu_free(vcpu);
+ return ERR_PTR(-ENOMEM);
+ }
+ return vcpu;
}
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
--
1.7.0.1
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 8:16 ` Dong, Eddie
@ 2010-06-28 8:45 ` Jan Kiszka
0 siblings, 0 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-06-28 8:45 UTC (permalink / raw)
To: Dong, Eddie
Cc: Avi Kivity, Sheng Yang, Marcelo Tosatti, Joerg Roedel,
kvm@vger.kernel.org
Dong, Eddie wrote:
> Avi Kivity wrote:
>> On 06/28/2010 10:30 AM, Dong, Eddie wrote:
>>>> Several milliseconds of non-responsiveness may not be acceptable for
>>>> some applications. So I think queue_work_on() and a clflush loop is
>>>> better than an IPI and wbinvd.
>>>>
>>>>
>>> Probably we should make it configurable. For RT usage models, we do
>>> care about responsiveness more than performance, but for typical
>>> server useg model, we'd better focus on performance in this issue.
>>> WBINVD may perform much much better than CLFLUSH, and a mallicious
>>> guest repeatedly issuing wbinvd may greatly impact the system
>>> performance.
>>>
>> I'm not even sure clflush can work. I thought you could loop on just
>> the cache size, but it appears you'll need to loop over the entire
>> guest address space, which could take ages.
>
> If RT usage model comes into reality, we may have to do in this way though pay with huge overhead :)
> Is there any RT customers here?
Yes, I know of a few (RT host + The Typical non-RT guest).
One case already has to deal with wbinvd because it runs on older HW
without trapping support. The issue is mitigated here by CPU isolation.
But shared caches remain problematic, also with this new approach here
(wbinvd not only flushes KVM's memory...).
>
>> So I guess we'll have to settle for wbinvd, just avoiding it when the
>> hardware allows us to.
>
> Yes, for now I agree we can just use wbinvd to emulate wbinvd :)
Having a switch even for the case the guest may have a potential need
would be useful. Maybe controllable by user space, maybe something like
emulate / skip / skip+report.
There might be guests issuing wbinvd from drivers of devices that aren't
passed through (I'm thinking of graphic adapters) while they don't for
the passed-through ones. In that case, ignoring should be fine.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v4] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 8:42 ` [PATCH v4] " Sheng Yang
@ 2010-06-28 9:27 ` Avi Kivity
2010-06-28 9:31 ` Gleb Natapov
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-28 9:27 UTC (permalink / raw)
To: Sheng Yang; +Cc: Marcelo Tosatti, Jan Kiszka, kvm, Yaozu (Eddie) Dong
On 06/28/2010 11:42 AM, Sheng Yang wrote:
> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> CLFLUSH, we need to maintain data consistency either by:
> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> wbinvd exit, or
> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>
>
> +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
> +{
> + if (need_emulate_wbinvd(vcpu) && kvm_x86_ops->has_wbinvd_exit()) {
> + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
> + wbinvd_ipi, NULL, 1);
> + cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
> + } else
> + wbinvd();
> + return X86EMUL_CONTINUE;
> +}
> +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
>
Why check for has_wbinvd_exit()? If it's false, we don't get here anyway.
If !need_emulate_wbinvd(), why call wbinvd()?
> +
> int emulate_clts(struct kvm_vcpu *vcpu)
> {
> kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
> @@ -5255,6 +5287,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> vcpu->arch.time_page = NULL;
> }
>
> + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
> fx_free(vcpu);
> kvm_x86_ops->vcpu_free(vcpu);
> }
> @@ -5262,7 +5295,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
> unsigned int id)
> {
> - return kvm_x86_ops->vcpu_create(kvm, id);
> + struct kvm_vcpu *vcpu = kvm_x86_ops->vcpu_create(kvm, id);
> +
> + if (IS_ERR(vcpu))
> + return vcpu;
> +
> + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
> + kvm_x86_ops->vcpu_free(vcpu);
> + return ERR_PTR(-ENOMEM);
> + }
> + return vcpu;
> }
>
>
Better place is kvm_arch_vcpu_init().
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v4] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 9:27 ` Avi Kivity
@ 2010-06-28 9:31 ` Gleb Natapov
2010-06-28 9:35 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Gleb Natapov @ 2010-06-28 9:31 UTC (permalink / raw)
To: Avi Kivity
Cc: Sheng Yang, Marcelo Tosatti, Jan Kiszka, kvm, Yaozu (Eddie) Dong
On Mon, Jun 28, 2010 at 12:27:22PM +0300, Avi Kivity wrote:
> On 06/28/2010 11:42 AM, Sheng Yang wrote:
> >Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> >WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> >CLFLUSH, we need to maintain data consistency either by:
> >1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> >wbinvd exit, or
> >2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
> >
> >
> >+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
> >+{
> >+ if (need_emulate_wbinvd(vcpu) && kvm_x86_ops->has_wbinvd_exit()) {
> >+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
> >+ wbinvd_ipi, NULL, 1);
> >+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
> >+ } else
> >+ wbinvd();
> >+ return X86EMUL_CONTINUE;
> >+}
> >+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
>
> Why check for has_wbinvd_exit()? If it's false, we don't get here anyway.
>
> If !need_emulate_wbinvd(), why call wbinvd()?
>
>
The function is called from emulator too. I guess that is why.
> >+
> > int emulate_clts(struct kvm_vcpu *vcpu)
> > {
> > kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
> >@@ -5255,6 +5287,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> > vcpu->arch.time_page = NULL;
> > }
> >
> >+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
> > fx_free(vcpu);
> > kvm_x86_ops->vcpu_free(vcpu);
> > }
> >@@ -5262,7 +5295,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> > struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
> > unsigned int id)
> > {
> >- return kvm_x86_ops->vcpu_create(kvm, id);
> >+ struct kvm_vcpu *vcpu = kvm_x86_ops->vcpu_create(kvm, id);
> >+
> >+ if (IS_ERR(vcpu))
> >+ return vcpu;
> >+
> >+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
> >+ kvm_x86_ops->vcpu_free(vcpu);
> >+ return ERR_PTR(-ENOMEM);
> >+ }
> >+ return vcpu;
> > }
> >
>
> Better place is kvm_arch_vcpu_init().
>
> --
> error compiling committee.c: too many arguments to function
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Gleb.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v4] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 9:31 ` Gleb Natapov
@ 2010-06-28 9:35 ` Avi Kivity
2010-06-29 3:16 ` [PATCH v5] " Sheng Yang
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-28 9:35 UTC (permalink / raw)
To: Gleb Natapov
Cc: Sheng Yang, Marcelo Tosatti, Jan Kiszka, kvm, Yaozu (Eddie) Dong
On 06/28/2010 12:31 PM, Gleb Natapov wrote:
> On Mon, Jun 28, 2010 at 12:27:22PM +0300, Avi Kivity wrote:
>
>> nOn 06/28/2010 11:42 AM, Sheng Yang wrote:
>>
>>> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
>>> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
>>> CLFLUSH, we need to maintain data consistency either by:
>>> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
>>> wbinvd exit, or
>>> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>>>
>>>
>>> +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
>>> +{
>>> + if (need_emulate_wbinvd(vcpu) && kvm_x86_ops->has_wbinvd_exit()) {
>>> + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
>>> + wbinvd_ipi, NULL, 1);
>>> + cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
>>> + } else
>>> + wbinvd();
>>> + return X86EMUL_CONTINUE;
>>> +}
>>> +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
>>>
>> Why check for has_wbinvd_exit()? If it's false, we don't get here anyway.
>>
>> If !need_emulate_wbinvd(), why call wbinvd()?
>>
>>
>>
> The function is called from emulator too. I guess that is why.
>
Ah, yes. But it needs to return if !n_e_wbinvd(), not call wbinvd().
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-28 9:35 ` Avi Kivity
@ 2010-06-29 3:16 ` Sheng Yang
2010-06-29 9:39 ` Avi Kivity
` (2 more replies)
0 siblings, 3 replies; 32+ messages in thread
From: Sheng Yang @ 2010-06-29 3:16 UTC (permalink / raw)
To: Avi Kivity
Cc: Marcelo Tosatti, Jan Kiszka, Joerg Roedel, kvm, Sheng Yang,
Yaozu (Eddie) Dong
Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
CLFLUSH, we need to maintain data consistency either by:
1: flushing cache (wbinvd) when the guest is scheduled out if there is no
wbinvd exit, or
2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
arch/x86/include/asm/kvm_host.h | 6 +++++
arch/x86/kvm/emulate.c | 5 +++-
arch/x86/kvm/svm.c | 7 ++++++
arch/x86/kvm/vmx.c | 10 ++++++++-
arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++
5 files changed, 67 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a57cdea..2bda624 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -358,6 +359,8 @@ struct kvm_vcpu_arch {
/* fields used by HYPER-V emulation */
u64 hv_vapic;
+
+ cpumask_var_t wbinvd_dirty_mask;
};
struct kvm_arch {
@@ -514,6 +517,8 @@ struct kvm_x86_ops {
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+ bool (*has_wbinvd_exit)(void);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abb8cec..e8bdddc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3138,8 +3138,11 @@ twobyte_insn:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
break;
- case 0x08: /* invd */
case 0x09: /* wbinvd */
+ kvm_emulate_wbinvd(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
c->dst.type = OP_NONE;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 587b99d..56c9b6b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e565689..806ab12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -3400,7 +3406,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
+ kvm_emulate_wbinvd(vcpu);
return 1;
}
@@ -4350,6 +4356,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.rdtscp_supported = vmx_rdtscp_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0b9252..9a400ae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1783,8 +1783,28 @@ out:
return r;
}
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
unsigned long khz = cpufreq_quick_get(cpu);
@@ -3650,6 +3670,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
+ return X86EMUL_CONTINUE;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
int emulate_clts(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
@@ -5255,6 +5290,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5384,7 +5420,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+ goto fail_free_mce_banks;
+
return 0;
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy:
--
1.7.0.1
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 3:16 ` [PATCH v5] " Sheng Yang
@ 2010-06-29 9:39 ` Avi Kivity
2010-06-29 10:32 ` Jan Kiszka
2010-06-29 10:14 ` Roedel, Joerg
2010-06-29 13:25 ` Marcelo Tosatti
2 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-29 9:39 UTC (permalink / raw)
To: Sheng Yang
Cc: Marcelo Tosatti, Jan Kiszka, Joerg Roedel, kvm,
Yaozu (Eddie) Dong
On 06/29/2010 06:16 AM, Sheng Yang wrote:
> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> CLFLUSH, we need to maintain data consistency either by:
> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> wbinvd exit, or
> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>
>
Looks good.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 3:16 ` [PATCH v5] " Sheng Yang
2010-06-29 9:39 ` Avi Kivity
@ 2010-06-29 10:14 ` Roedel, Joerg
2010-06-29 10:44 ` Avi Kivity
2010-06-29 13:25 ` Marcelo Tosatti
2 siblings, 1 reply; 32+ messages in thread
From: Roedel, Joerg @ 2010-06-29 10:14 UTC (permalink / raw)
To: Sheng Yang
Cc: Avi Kivity, Marcelo Tosatti, Jan Kiszka, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On Mon, Jun 28, 2010 at 11:16:59PM -0400, Sheng Yang wrote:
> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> CLFLUSH, we need to maintain data consistency either by:
> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> wbinvd exit, or
> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
Sorry for jumping in late. This code is not required on AMD platforms
because the io-page-tables for the AMD IOMMU have a FC bit (force
coherent) that must just be set. The current code does not set this bit
but I will prepare a patch for that. This wbinvd emulation code should
be avoided where possible.
Joerg
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 9:39 ` Avi Kivity
@ 2010-06-29 10:32 ` Jan Kiszka
2010-06-29 10:42 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Jan Kiszka @ 2010-06-29 10:32 UTC (permalink / raw)
To: Avi Kivity
Cc: Sheng Yang, Marcelo Tosatti, Joerg Roedel, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
Avi Kivity wrote:
> On 06/29/2010 06:16 AM, Sheng Yang wrote:
>> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
>> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
>> CLFLUSH, we need to maintain data consistency either by:
>> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
>> wbinvd exit, or
>> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>>
>>
>
> Looks good.
>
There is just the question remaining if we want to add some disable
knob, maybe as an option in the device assignment configuration.
I wonder what the performance impact of this feature is when using CPUs
without wbinvd exiting. Can we afford to enable it unconditionally (in
the presence of pass-through) even if the guest doesn't need it?
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 10:32 ` Jan Kiszka
@ 2010-06-29 10:42 ` Avi Kivity
2010-06-29 12:32 ` Roedel, Joerg
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-29 10:42 UTC (permalink / raw)
To: Jan Kiszka
Cc: Sheng Yang, Marcelo Tosatti, Joerg Roedel, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On 06/29/2010 01:32 PM, Jan Kiszka wrote:
> Avi Kivity wrote:
>
>> On 06/29/2010 06:16 AM, Sheng Yang wrote:
>>
>>> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
>>> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
>>> CLFLUSH, we need to maintain data consistency either by:
>>> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
>>> wbinvd exit, or
>>> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>>>
>>>
>>>
>> Looks good.
>>
>>
> There is just the question remaining if we want to add some disable
> knob, maybe as an option in the device assignment configuration.
>
Patches welcome. Also an alternative implementation that uses clflush
could also work.
> I wonder what the performance impact of this feature is when using CPUs
> without wbinvd exiting. Can we afford to enable it unconditionally (in
> the presence of pass-through) even if the guest doesn't need it?
>
Correctness is more important than performance. Since we don't know
whether the guest needs it or not, we have to enable it. The user may
disable it if he likes.
Maybe we should emit a warning that performance may degrade when pci
snooping is not available.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 10:14 ` Roedel, Joerg
@ 2010-06-29 10:44 ` Avi Kivity
2010-06-29 12:28 ` Roedel, Joerg
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-29 10:44 UTC (permalink / raw)
To: Roedel, Joerg
Cc: Sheng Yang, Marcelo Tosatti, Jan Kiszka, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On 06/29/2010 01:14 PM, Roedel, Joerg wrote:
> On Mon, Jun 28, 2010 at 11:16:59PM -0400, Sheng Yang wrote:
>
>> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
>> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
>> CLFLUSH, we need to maintain data consistency either by:
>> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
>> wbinvd exit, or
>> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>>
> Sorry for jumping in late. This code is not required on AMD platforms
> because the io-page-tables for the AMD IOMMU have a FC bit (force
> coherent) that must just be set. The current code does not set this bit
> but I will prepare a patch for that. This wbinvd emulation code should
> be avoided where possible.
>
Do you mean, KVM_IOMMU_CACHE_COHERENCY should be set for the AMD IOMMU?
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 10:44 ` Avi Kivity
@ 2010-06-29 12:28 ` Roedel, Joerg
2010-06-29 12:35 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Roedel, Joerg @ 2010-06-29 12:28 UTC (permalink / raw)
To: Avi Kivity
Cc: Sheng Yang, Marcelo Tosatti, Jan Kiszka, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On Tue, Jun 29, 2010 at 06:44:25AM -0400, Avi Kivity wrote:
> On 06/29/2010 01:14 PM, Roedel, Joerg wrote:
> > On Mon, Jun 28, 2010 at 11:16:59PM -0400, Sheng Yang wrote:
> >
> >> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> >> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> >> CLFLUSH, we need to maintain data consistency either by:
> >> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> >> wbinvd exit, or
> >> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
> >>
> > Sorry for jumping in late. This code is not required on AMD platforms
> > because the io-page-tables for the AMD IOMMU have a FC bit (force
> > coherent) that must just be set. The current code does not set this bit
> > but I will prepare a patch for that. This wbinvd emulation code should
> > be avoided where possible.
> >
>
> Do you mean, KVM_IOMMU_CACHE_COHERENCY should be set for the AMD IOMMU?
No, as far as I understand it the KVM_IOMMU_CACHE_COHERENCY flag is only
there because there are VT-d IOMMUs that do not support the snoop
force bit. In the AMD IOMMU case all hardware has this feature, the
IOMMU driver just has to use it for IOMMU-API page-tables too. This is
currently not the case. So this is only an IOMMU driver change.
Joerg
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 10:42 ` Avi Kivity
@ 2010-06-29 12:32 ` Roedel, Joerg
2010-06-29 12:37 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Roedel, Joerg @ 2010-06-29 12:32 UTC (permalink / raw)
To: Avi Kivity
Cc: Jan Kiszka, Sheng Yang, Marcelo Tosatti, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On Tue, Jun 29, 2010 at 06:42:57AM -0400, Avi Kivity wrote:
> On 06/29/2010 01:32 PM, Jan Kiszka wrote:
> Correctness is more important than performance. Since we don't know
> whether the guest needs it or not, we have to enable it. The user may
> disable it if he likes.
Can't this code only be enabled if VT-d hardware is detected that does
not support the snoop force bit? So the user does not have to struggle
with configuration options that are hard to understand for a
non-developer.
Joerg
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 12:28 ` Roedel, Joerg
@ 2010-06-29 12:35 ` Avi Kivity
2010-06-29 13:34 ` Roedel, Joerg
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-29 12:35 UTC (permalink / raw)
To: Roedel, Joerg
Cc: Sheng Yang, Marcelo Tosatti, Jan Kiszka, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On 06/29/2010 03:28 PM, Roedel, Joerg wrote:
>
>> Do you mean, KVM_IOMMU_CACHE_COHERENCY should be set for the AMD IOMMU?
>>
> No, as far as I understand it the KVM_IOMMU_CACHE_COHERENCY flag is only
> there because there are VT-d IOMMUs that does not support the snoop
> force bit. In the AMD IOMMU case all hardware has this feature, the
> IOMMU driver just has to use it for IOMMU-API page-tables too. This is
> currently not the case. So this is only an IOMMU driver change.
>
The flag indicates to kvm that it doesn't need to worry about iommu
cache coherency issues (for example, it can ignore wbinvd), so it needs
to be set. The following code
if (iommu_domain_has_cap(kvm->arch.iommu_domain,
IOMMU_CAP_CACHE_COHERENCY))
kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
does this, so it looks like you need to return true for
iommu_domain_has_cap() after the change. So far only the intel iommu
supports it.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 12:32 ` Roedel, Joerg
@ 2010-06-29 12:37 ` Avi Kivity
0 siblings, 0 replies; 32+ messages in thread
From: Avi Kivity @ 2010-06-29 12:37 UTC (permalink / raw)
To: Roedel, Joerg
Cc: Jan Kiszka, Sheng Yang, Marcelo Tosatti, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On 06/29/2010 03:32 PM, Roedel, Joerg wrote:
> On Tue, Jun 29, 2010 at 06:42:57AM -0400, Avi Kivity wrote:
>
>> On 06/29/2010 01:32 PM, Jan Kiszka wrote:
>>
>
>> Correctness is more important than performance. Since we don't know
>> whether the guest needs it or not, we have to enable it. The user may
>> disable it if he likes.
>>
> Can't this code only be enabled if VT-d hardware is detected that does
> not support the snoop force bit? So the user does not have to struggle
> with configuration options that are hard to understand for a
> non-developer.
>
>
It's already this way, that's why we test for
KVM_IOMMU_CACHE_COHERENCY. Jan wants an override for older hardware
which doesn't support a coherent iommu (the wbinvd ipi on task
migrations, on processors that don't support wbinvd exits, is
particularly expensive).
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 3:16 ` [PATCH v5] " Sheng Yang
2010-06-29 9:39 ` Avi Kivity
2010-06-29 10:14 ` Roedel, Joerg
@ 2010-06-29 13:25 ` Marcelo Tosatti
2010-06-29 13:28 ` Avi Kivity
2 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2010-06-29 13:25 UTC (permalink / raw)
To: Sheng Yang; +Cc: Avi Kivity, Jan Kiszka, Joerg Roedel, kvm, Yaozu (Eddie) Dong
On Tue, Jun 29, 2010 at 11:16:59AM +0800, Sheng Yang wrote:
> Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
> WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
> CLFLUSH, we need to maintain data consistency either by:
> 1: flushing cache (wbinvd) when the guest is scheduled out if there is no
> wbinvd exit, or
> 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.
>
> Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
> arch/x86/include/asm/kvm_host.h | 6 +++++
> arch/x86/kvm/emulate.c | 5 +++-
> arch/x86/kvm/svm.c | 7 ++++++
> arch/x86/kvm/vmx.c | 10 ++++++++-
> arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++
> 5 files changed, 67 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index a57cdea..2bda624 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -15,6 +15,7 @@
> #include <linux/mm.h>
> #include <linux/mmu_notifier.h>
> #include <linux/tracepoint.h>
> +#include <linux/cpumask.h>
>
> #include <linux/kvm.h>
> #include <linux/kvm_para.h>
> @@ -358,6 +359,8 @@ struct kvm_vcpu_arch {
>
> /* fields used by HYPER-V emulation */
> u64 hv_vapic;
> +
> + cpumask_var_t wbinvd_dirty_mask;
> };
>
> struct kvm_arch {
> @@ -514,6 +517,8 @@ struct kvm_x86_ops {
>
> void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
>
> + bool (*has_wbinvd_exit)(void);
> +
> const struct trace_print_flags *exit_reasons_str;
> };
>
> @@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
> int kvm_emulate_halt(struct kvm_vcpu *vcpu);
> int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
> int emulate_clts(struct kvm_vcpu *vcpu);
> +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
>
> void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
> int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index abb8cec..e8bdddc 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -3138,8 +3138,11 @@ twobyte_insn:
> emulate_clts(ctxt->vcpu);
> c->dst.type = OP_NONE;
> break;
> - case 0x08: /* invd */
> case 0x09: /* wbinvd */
> + kvm_emulate_wbinvd(ctxt->vcpu);
> + c->dst.type = OP_NONE;
> + break;
> + case 0x08: /* invd */
> case 0x0d: /* GrpP (prefetch) */
> case 0x18: /* Grp16 (prefetch/nop) */
> c->dst.type = OP_NONE;
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 587b99d..56c9b6b 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
> return false;
> }
>
> +static bool svm_has_wbinvd_exit(void)
> +{
> + return true;
> +}
> +
> static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
> .rdtscp_supported = svm_rdtscp_supported,
>
> .set_supported_cpuid = svm_set_supported_cpuid,
> +
> + .has_wbinvd_exit = svm_has_wbinvd_exit,
> };
>
> static int __init svm_init(void)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index e565689..806ab12 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void)
> return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
> }
>
> +static inline bool cpu_has_vmx_wbinvd_exit(void)
> +{
> + return vmcs_config.cpu_based_2nd_exec_ctrl &
> + SECONDARY_EXEC_WBINVD_EXITING;
> +}
> +
> static inline bool report_flexpriority(void)
> {
> return flexpriority_enabled;
> @@ -3400,7 +3406,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
> static int handle_wbinvd(struct kvm_vcpu *vcpu)
> {
> skip_emulated_instruction(vcpu);
> - /* TODO: Add support for VT-d/pass-through device */
> + kvm_emulate_wbinvd(vcpu);
> return 1;
> }
>
> @@ -4350,6 +4356,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
> .rdtscp_supported = vmx_rdtscp_supported,
>
> .set_supported_cpuid = vmx_set_supported_cpuid,
> +
> + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
> };
>
> static int __init vmx_init(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d0b9252..9a400ae 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1783,8 +1783,28 @@ out:
> return r;
> }
>
> +static void wbinvd_ipi(void *garbage)
> +{
> + wbinvd();
> +}
> +
> +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
> +{
> + return vcpu->kvm->arch.iommu_domain &&
> + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
> +}
> +
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> {
> + /* Address WBINVD may be executed by guest */
> + if (need_emulate_wbinvd(vcpu)) {
> + if (kvm_x86_ops->has_wbinvd_exit())
> + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
> + else if (vcpu->cpu != -1)
&& vcpu->cpu != cpu
> + smp_call_function_single(vcpu->cpu,
> + wbinvd_ipi, NULL, 1);
> + }
> +
> kvm_x86_ops->vcpu_load(vcpu, cpu);
> if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
> unsigned long khz = cpufreq_quick_get(cpu);
> @@ -3650,6 +3670,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
> return X86EMUL_CONTINUE;
> }
>
> +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
> +{
> + if (!need_emulate_wbinvd(vcpu))
> + return X86EMUL_CONTINUE;
> +
> + if (kvm_x86_ops->has_wbinvd_exit()) {
> + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
> + wbinvd_ipi, NULL, 1);
work_on_cpu() loop instead of smp_call_function_many(), to avoid executing
wbinvd with interrupts disabled.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 13:25 ` Marcelo Tosatti
@ 2010-06-29 13:28 ` Avi Kivity
2010-06-29 13:35 ` Marcelo Tosatti
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-29 13:28 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Sheng Yang, Jan Kiszka, Joerg Roedel, kvm, Yaozu (Eddie) Dong
On 06/29/2010 04:25 PM, Marcelo Tosatti wrote:
>
>> + smp_call_function_single(vcpu->cpu,
>> + wbinvd_ipi, NULL, 1);
>> + }
>> +
>> kvm_x86_ops->vcpu_load(vcpu, cpu);
>> if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
>> unsigned long khz = cpufreq_quick_get(cpu);
>> @@ -3650,6 +3670,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
>> return X86EMUL_CONTINUE;
>> }
>>
>> +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
>> +{
>> + if (!need_emulate_wbinvd(vcpu))
>> + return X86EMUL_CONTINUE;
>> +
>> + if (kvm_x86_ops->has_wbinvd_exit()) {
>> + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
>> + wbinvd_ipi, NULL, 1);
>>
> work_on_cpu() loop instead of smp_call_function_many(), to avoid executing
> wbinvd with interrupts disabled.
>
Why? wbinvd is not interruptible.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 12:35 ` Avi Kivity
@ 2010-06-29 13:34 ` Roedel, Joerg
0 siblings, 0 replies; 32+ messages in thread
From: Roedel, Joerg @ 2010-06-29 13:34 UTC (permalink / raw)
To: Avi Kivity
Cc: Sheng Yang, Marcelo Tosatti, Jan Kiszka, kvm@vger.kernel.org,
Yaozu (Eddie) Dong
On Tue, Jun 29, 2010 at 08:35:35AM -0400, Avi Kivity wrote:
> On 06/29/2010 03:28 PM, Roedel, Joerg wrote:
>
> The flag indicates to kvm that it doesn't need to worry about iommu
> cache coherency issues (for example, it can ignore wbinvd), so it needs
> to be set. THe following code
>
> if (iommu_domain_has_cap(kvm->arch.iommu_domain,
> IOMMU_CAP_CACHE_COHERENCY))
> kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
>
> does this, so it looks like you need to return true for
> iommu_domain_has_cap() after the change. So far only the intel iommu
> supports it.
Ah ok, good to know. Thanks.
Joerg
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 13:28 ` Avi Kivity
@ 2010-06-29 13:35 ` Marcelo Tosatti
2010-06-29 13:50 ` Avi Kivity
0 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2010-06-29 13:35 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sheng Yang, Jan Kiszka, Joerg Roedel, kvm, Yaozu (Eddie) Dong
On Tue, Jun 29, 2010 at 04:28:35PM +0300, Avi Kivity wrote:
> On 06/29/2010 04:25 PM, Marcelo Tosatti wrote:
> >
> >>+ smp_call_function_single(vcpu->cpu,
> >>+ wbinvd_ipi, NULL, 1);
> >>+ }
> >>+
> >> kvm_x86_ops->vcpu_load(vcpu, cpu);
> >> if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
> >> unsigned long khz = cpufreq_quick_get(cpu);
> >>@@ -3650,6 +3670,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
> >> return X86EMUL_CONTINUE;
> >> }
> >>
> >>+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
> >>+{
> >>+ if (!need_emulate_wbinvd(vcpu))
> >>+ return X86EMUL_CONTINUE;
> >>+
> >>+ if (kvm_x86_ops->has_wbinvd_exit()) {
> >>+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
> >>+ wbinvd_ipi, NULL, 1);
> >work_on_cpu() loop instead of smp_call_function_many(), to avoid executing
> >wbinvd with interrupts disabled.
>
> Why? wbinvd is not interruptible.
Right. But still, smp_call_function_many() is going to busy-spin until
the target CPUs finish their work, while work_on_cpu() will schedule.
Also the IPI request has to be handled immediately, bypassing the
scheduler.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 13:35 ` Marcelo Tosatti
@ 2010-06-29 13:50 ` Avi Kivity
2010-06-29 14:31 ` Marcelo Tosatti
0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-06-29 13:50 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Sheng Yang, Jan Kiszka, Joerg Roedel, kvm, Yaozu (Eddie) Dong
On 06/29/2010 04:35 PM, Marcelo Tosatti wrote:
>>>
>>> work_on_cpu() loop instead of smp_call_function_many(), to avoid executing
>>> wbinvd with interrupts disabled.
>>>
>> Why? wbinvd is not interruptible.
>>
> Right. But still, smp_call_function_many() is going to busy-spin until
> the target CPUs finish their work, while work_on_cpu() will schedule.
>
>
Good point. So in the worst case we double the hit.
> Also the IPI request has to handled immediately, bypassing the
> scheduler.
>
We're screwed on that point in any case, once wbinvd starts the
scheduler is bypassed.
But work_on_cpu() has its own problems. It creates a kthread (perhaps
not too bad). Maybe queue_work_on()?
btw, smp_call_function_many() appears not to run the function on the
current cpu.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH v5] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices
2010-06-29 13:50 ` Avi Kivity
@ 2010-06-29 14:31 ` Marcelo Tosatti
0 siblings, 0 replies; 32+ messages in thread
From: Marcelo Tosatti @ 2010-06-29 14:31 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sheng Yang, Jan Kiszka, Joerg Roedel, kvm, Yaozu (Eddie) Dong
On Tue, Jun 29, 2010 at 04:50:49PM +0300, Avi Kivity wrote:
> On 06/29/2010 04:35 PM, Marcelo Tosatti wrote:
> >>>
> >>>work_on_cpu() loop instead of smp_call_function_many(), to avoid executing
> >>>wbinvd with interrupts disabled.
> >>Why? wbinvd is not interruptible.
> >Right. But still, smp_call_function_many() is going to busy-spin until
> >the target CPUs finish their work, while work_on_cpu() will schedule.
> >
>
> Good point. So in the worst case we double the hit.
>
> >Also the IPI request has to handled immediately, bypassing the
> >scheduler.
>
> We're screwed on that point in any case, once wbinvd starts the
> scheduler is bypassed.
>
> But work_on_cpu() has its own problems. It creates a kthread
> (perhaps not too bad). Maybe queue_work_on()?
queue_work_on + flush_work should be equivalent to work_on_cpu (without
thread creation).
^ permalink raw reply [flat|nested] 32+ messages in thread
end of thread, other threads:[~2010-06-29 18:12 UTC | newest]
Thread overview: 32+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-06-28 3:36 [PATCH v3] KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Sheng Yang
2010-06-28 3:56 ` Avi Kivity
2010-06-28 6:42 ` Sheng Yang
2010-06-28 6:56 ` Avi Kivity
2010-06-28 6:56 ` Sheng Yang
2010-06-28 7:08 ` Avi Kivity
2010-06-28 7:41 ` Sheng Yang
2010-06-28 8:07 ` Avi Kivity
2010-06-28 8:42 ` [PATCH v4] " Sheng Yang
2010-06-28 9:27 ` Avi Kivity
2010-06-28 9:31 ` Gleb Natapov
2010-06-28 9:35 ` Avi Kivity
2010-06-29 3:16 ` [PATCH v5] " Sheng Yang
2010-06-29 9:39 ` Avi Kivity
2010-06-29 10:32 ` Jan Kiszka
2010-06-29 10:42 ` Avi Kivity
2010-06-29 12:32 ` Roedel, Joerg
2010-06-29 12:37 ` Avi Kivity
2010-06-29 10:14 ` Roedel, Joerg
2010-06-29 10:44 ` Avi Kivity
2010-06-29 12:28 ` Roedel, Joerg
2010-06-29 12:35 ` Avi Kivity
2010-06-29 13:34 ` Roedel, Joerg
2010-06-29 13:25 ` Marcelo Tosatti
2010-06-29 13:28 ` Avi Kivity
2010-06-29 13:35 ` Marcelo Tosatti
2010-06-29 13:50 ` Avi Kivity
2010-06-29 14:31 ` Marcelo Tosatti
2010-06-28 7:30 ` [PATCH v3] " Dong, Eddie
2010-06-28 8:04 ` Avi Kivity
2010-06-28 8:16 ` Dong, Eddie
2010-06-28 8:45 ` Jan Kiszka
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox