* [PATCH 1/4] KVM: MMU: Consolidate two guest pte reads in kvm_mmu_pte_write()
2010-02-15 12:48 [PATCH 0/4] Fix some mmu/emulator atomicity issues Avi Kivity
@ 2010-02-15 12:48 ` Avi Kivity
2010-02-15 12:48 ` [PATCH 2/4] KVM: Make locked operations truly atomic Avi Kivity
` (2 subsequent siblings)
3 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-02-15 12:48 UTC (permalink / raw)
To: Marcelo Tosatti, kvm
kvm_mmu_pte_write() reads guest ptes in two different occasions:
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/mmu.c | 69 +++++++++++++++++++++++----------------------------
1 files changed, 31 insertions(+), 38 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 741373e..086025e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2558,36 +2558,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
}
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+ u64 gpte)
{
gfn_t gfn;
- int r;
- u64 gpte = 0;
pfn_t pfn;
- if (bytes != 4 && bytes != 8)
- return;
-
- /*
- * Assume that the pte write on a page table of the same type
- * as the current vcpu paging mode. This is nearly always true
- * (might be false while changing modes). Note it is verified later
- * by update_pte().
- */
- if (is_pae(vcpu)) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- if ((bytes == 4) && (gpa % 4 == 0)) {
- r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
- if (r)
- return;
- memcpy((void *)&gpte + (gpa % 8), new, 4);
- } else if ((bytes == 8) && (gpa % 8 == 0)) {
- memcpy((void *)&gpte, new, 8);
- }
- } else {
- if ((bytes == 4) && (gpa % 4 == 0))
- memcpy((void *)&gpte, new, 4);
- }
if (!is_present_gpte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2638,7 +2613,34 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int r;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+
+ switch (bytes) {
+ case 4:
+ gentry = *(const u32 *)new;
+ break;
+ case 8:
+ gentry = *(const u64 *)new;
+ break;
+ default:
+ gentry = 0;
+ break;
+ }
+
+ /*
+ * Assume that the pte write on a page table of the same type
+ * as the current vcpu paging mode. This is nearly always true
+ * (might be false while changing modes). Note it is verified later
+ * by update_pte().
+ */
+ if (is_pae(vcpu) && bytes == 4) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ gpa &= ~(gpa_t)7;
+ r = kvm_read_guest(vcpu->kvm, gpa, &gentry, 8);
+ if (r)
+ gentry = 0;
+ }
+
+ mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
@@ -2703,20 +2705,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
continue;
}
spte = &sp->spt[page_offset / sizeof(*spte)];
- if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
- gentry = 0;
- r = kvm_read_guest_atomic(vcpu->kvm,
- gpa & ~(u64)(pte_size - 1),
- &gentry, pte_size);
- new = (const void *)&gentry;
- if (r < 0)
- new = NULL;
- }
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (new)
- mmu_pte_write_new_pte(vcpu, sp, spte, new);
+ if (gentry)
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
--
1.6.5.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 2/4] KVM: Make locked operations truly atomic
2010-02-15 12:48 [PATCH 0/4] Fix some mmu/emulator atomicity issues Avi Kivity
2010-02-15 12:48 ` [PATCH 1/4] KVM: MMU: Consolidate two guest pte reads in kvm_mmu_pte_write() Avi Kivity
@ 2010-02-15 12:48 ` Avi Kivity
2010-02-15 12:48 ` [PATCH 3/4] KVM: Don't follow an atmoic operation by a non-atomic one Avi Kivity
2010-02-15 12:48 ` [PATCH 4/4] KVM: MMU: Reinstate pte prefetch on invlpg Avi Kivity
3 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-02-15 12:48 UTC (permalink / raw)
To: Marcelo Tosatti, kvm
Once upon a time, locked operations were emulated while holding the mmu mutex.
Since mmu pages were write protected, it was safe to emulate the writes in
a non-atomic manner, since there could be no other writer, either in the
guest or in the kernel.
These days emulation takes place without holding the mmu spinlock, so the
write could be preempted by an unshadowing event, which exposes the page
to writes by the guest. This may cause corruption of guest page tables.
Fix by using an atomic cmpxchg for these operations.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/x86.c | 69 ++++++++++++++++++++++++++++++++++++----------------
1 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 86b739f..38344be 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3272,41 +3272,68 @@ int emulator_write_emulated(unsigned long addr,
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+# define CMPXCHG64(ptr, old, new) \
+ (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u *)(new)) == *(u64 *)(old))
+#endif
+
static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-#ifndef CONFIG_X86_64
- /* guests cmpxchg8b have to be emulated atomically */
- if (bytes == 8) {
- gpa_t gpa;
- struct page *page;
- char *kaddr;
- u64 val;
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ bool exchanged;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
- if (gpa == UNMAPPED_GVA ||
- (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto emul_write;
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
- goto emul_write;
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
- val = *(u64 *)new;
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- kaddr = kmap_atomic(page, KM_USER0);
- set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
- kunmap_atomic(kaddr, KM_USER0);
- kvm_release_page_dirty(page);
+ kaddr = kmap_atomic(page, KM_USER0);
+ kaddr += offset_in_page(gpa);
+ switch (bytes) {
+ case 1:
+ exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ break;
+ case 2:
+ exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ break;
+ case 4:
+ exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ break;
+ case 8:
+ exchanged = CMPXCHG64(kaddr, old, new);
+ break;
+ default:
+ BUG();
}
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
+
+ if (!exchanged)
+ return X86EMUL_CMPXCHG_FAILED;
+
emul_write:
-#endif
+ printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
return emulator_write_emulated(addr, new, bytes, vcpu);
}
--
1.6.5.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 3/4] KVM: Don't follow an atmoic operation by a non-atomic one
2010-02-15 12:48 [PATCH 0/4] Fix some mmu/emulator atomicity issues Avi Kivity
2010-02-15 12:48 ` [PATCH 1/4] KVM: MMU: Consolidate two guest pte reads in kvm_mmu_pte_write() Avi Kivity
2010-02-15 12:48 ` [PATCH 2/4] KVM: Make locked operations truly atomic Avi Kivity
@ 2010-02-15 12:48 ` Avi Kivity
2010-02-15 12:48 ` [PATCH 4/4] KVM: MMU: Reinstate pte prefetch on invlpg Avi Kivity
3 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-02-15 12:48 UTC (permalink / raw)
To: Marcelo Tosatti, kvm
Currently emulated atomic operations are immediately followed by a non-atomic
operation, so that kvm_mmu_pte_write() can be invoked. This updates the mmu
but undoes the whole point of doing things atomically.
Fix by only performing the atomic operation and the mmu update, and avoiding
the non-atomic write.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/x86.c | 32 +++++++++++++++++++++++++-------
1 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38344be..0b1f0a0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3215,7 +3215,8 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ struct kvm_vcpu *vcpu,
+ bool mmu_only)
{
gpa_t gpa;
u32 error_code;
@@ -3231,6 +3232,10 @@ static int emulator_write_emulated_onepage(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
+ if (mmu_only) {
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+ return X86EMUL_CONTINUE;
+ }
if (emulator_write_phys(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
@@ -3251,24 +3256,35 @@ mmio:
return X86EMUL_CONTINUE;
}
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+static int __emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu,
+ bool mmu_only)
{
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+ rc = emulator_write_emulated_onepage(addr, val, now, vcpu,
+ mmu_only);
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+ return emulator_write_emulated_onepage(addr, val, bytes, vcpu,
+ mmu_only);
+}
+
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ return __emulator_write_emulated(addr, val, bytes, vcpu, false);
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);
@@ -3332,6 +3348,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
+ return __emulator_write_emulated(addr, new, bytes, vcpu, true);
+
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
--
1.6.5.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 4/4] KVM: MMU: Reinstate pte prefetch on invlpg
2010-02-15 12:48 [PATCH 0/4] Fix some mmu/emulator atomicity issues Avi Kivity
` (2 preceding siblings ...)
2010-02-15 12:48 ` [PATCH 3/4] KVM: Don't follow an atmoic operation by a non-atomic one Avi Kivity
@ 2010-02-15 12:48 ` Avi Kivity
2010-02-16 8:40 ` Avi Kivity
3 siblings, 1 reply; 6+ messages in thread
From: Avi Kivity @ 2010-02-15 12:48 UTC (permalink / raw)
To: Marcelo Tosatti, kvm
Commit fb341f57 removed the pte prefetch on guest invlpg, citing guest races.
However, the SDM is adamant that prefetch is allowed:
"The processor may create entries in paging-structure caches for
translations required for prefetches and for accesses that are a
result of speculative execution that would never actually occur
in the executed code path."
And, in fact, there was a race in the prefetch code: we picked up the pte
without the mmu lock held, so an older invlpg could install the pte over
a newer invlpg.
Reinstate the prefetch logic, but this time note whether another invlpg has
executed using a counter. If a race occured, do not install the pte.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 37 +++++++++++++++++++++++--------------
arch/x86/kvm/paging_tmpl.h | 15 +++++++++++++++
3 files changed, 39 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f9a2f66..ded4ed7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -389,6 +389,7 @@ struct kvm_arch {
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_alloc_mmu_pages;
+ atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 086025e..e821609 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2611,20 +2611,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int flooded = 0;
int npte;
int r;
+ int invlpg_counter;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- switch (bytes) {
- case 4:
- gentry = *(const u32 *)new;
- break;
- case 8:
- gentry = *(const u64 *)new;
- break;
- default:
- gentry = 0;
- break;
- }
+ invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
/*
* Assume that the pte write on a page table of the same type
@@ -2632,16 +2623,34 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
* (might be false while changing modes). Note it is verified later
* by update_pte().
*/
- if (is_pae(vcpu) && bytes == 4) {
+ if ((is_pae(vcpu) && bytes == 4) || !new) {
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- gpa &= ~(gpa_t)7;
- r = kvm_read_guest(vcpu->kvm, gpa, &gentry, 8);
+ if (is_pae(vcpu)) {
+ gpa &= ~(gpa_t)7;
+ bytes = 8;
+ }
+ r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
if (r)
gentry = 0;
+ new = (const u8 *)&gentry;
+ }
+
+ switch (bytes) {
+ case 4:
+ gentry = *(const u32 *)new;
+ break;
+ case 8:
+ gentry = *(const u64 *)new;
+ break;
+ default:
+ gentry = 0;
+ break;
}
mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
spin_lock(&vcpu->kvm->mmu_lock);
+ if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+ gentry = 0;
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a..0628b94 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -457,6 +457,7 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
+ gpa_t pte_gpa = -1;
int level;
u64 *sptep;
int need_flush = 0;
@@ -470,6 +471,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (level == PT_PAGE_TABLE_LEVEL ||
((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ pte_gpa = (sp->gfn << PAGE_SHIFT);
+ pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +492,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (need_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
+
+ atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (pte_gpa == -1)
+ return;
+
+ if (mmu_topup_memory_caches(vcpu))
+ return;
+ kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
--
1.6.5.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* Re: [PATCH 4/4] KVM: MMU: Reinstate pte prefetch on invlpg
2010-02-15 12:48 ` [PATCH 4/4] KVM: MMU: Reinstate pte prefetch on invlpg Avi Kivity
@ 2010-02-16 8:40 ` Avi Kivity
0 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-02-16 8:40 UTC (permalink / raw)
To: Marcelo Tosatti, kvm
On 02/15/2010 02:48 PM, Avi Kivity wrote:
> Commit fb341f57 removed the pte prefetch on guest invlpg, citing guest races.
> However, the SDM is adamant that prefetch is allowed:
>
> "The processor may create entries in paging-structure caches for
> translations required for prefetches and for accesses that are a
> result of speculative execution that would never actually occur
> in the executed code path."
>
> And, in fact, there was a race in the prefetch code: we picked up the pte
> without the mmu lock held, so an older invlpg could install the pte over
> a newer invlpg.
>
> Reinstate the prefetch logic, but this time note whether another invlpg has
> executed using a counter. If a race occured, do not install the pte.
>
>
This patch is broken (UP windows XP won't start), I'm debugging it.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 6+ messages in thread