* [PATCH v3 0/4] Fix accessed bit tracking
From: Avi Kivity @ 2010-07-05 12:39 UTC (permalink / raw)
To: kvm; +Cc: Marcelo Tosatti
The KVM MMU synchronizes shadow ptes using the mmu lock; however, the CPU
will happily ignore the lock when setting the accessed bit. This can cause
the accessed bit to be lost. Luckily, this only results in incorrect page
selection for swap.
This patchset fixes the problem by updating the spte atomically when
needed, taking care not to lose the accessed bit.
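To illustrate the race (a standalone userspace sketch, not kvm code; the
bit position and helper names are made up for the example):

#include <stdint.h>
#include <stdio.h>

#define ACCESSED (1ull << 5)    /* set by hardware, outside any software lock */

/* Racy drop: a bit the cpu sets between the read and the write is lost. */
static uint64_t drop_racy(uint64_t *sptep)
{
        uint64_t old = *sptep;  /* cpu may set ACCESSED right here...   */

        *sptep = 0;             /* ...and this store throws it away     */
        return old;
}

/* Atomic drop: whatever the cpu managed to set is handed back to us. */
static uint64_t drop_atomic(uint64_t *sptep)
{
        return __atomic_exchange_n(sptep, 0, __ATOMIC_SEQ_CST);
}

int main(void)
{
        uint64_t spte = 0x1000 | ACCESSED | 1;
        uint64_t old = drop_atomic(&spte);

        if (old & ACCESSED)
                printf("accessed bit survived the drop\n");
        (void)drop_racy;        /* kept only for contrast with drop_atomic() */
        return 0;
}

The patches below apply the same idea to the real spte helpers.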
v3: fix i386 pte exchange code in patch 3
add missing __set_spte() to drop_spte() in patch 2
v2: fix incorrect code transformations in first patch
Avi Kivity (4):
KVM: MMU: Introduce drop_spte()
KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to
drop_spte()
KVM: MMU: Atomically check for accessed bit when dropping an spte
KVM: MMU: Don't drop accessed bit while updating an spte
arch/x86/kvm/mmu.c | 88 +++++++++++++++++++++++++++++++------------
arch/x86/kvm/paging_tmpl.h | 13 +++---
2 files changed, 69 insertions(+), 32 deletions(-)
* [PATCH v3 1/4] KVM: MMU: Introduce drop_spte()
From: Avi Kivity @ 2010-07-05 12:39 UTC (permalink / raw)
To: kvm; +Cc: Marcelo Tosatti
When we call rmap_remove(), we (almost) always immediately follow it with
an __set_spte() to a nonpresent pte.  Since we need to perform the two
operations atomically to avoid losing the dirty and accessed bits, introduce
a helper drop_spte() and convert all call sites.
The operation is still nonatomic at this point.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/mmu.c | 30 +++++++++++++++++-------------
arch/x86/kvm/paging_tmpl.h | 13 ++++++-------
2 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c515753..5b211dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -658,6 +658,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
}
}
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+ rmap_remove(kvm, sptep);
+ __set_spte(sptep, new_spte);
+}
+
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
@@ -722,9 +728,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- rmap_remove(kvm, spte);
+ drop_spte(kvm, spte,
+ shadow_trap_nonpresent_pte);
--kvm->stat.lpages;
- __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
@@ -744,8 +750,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -767,8 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -1464,7 +1468,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
} else {
if (is_large_pte(ent))
--kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
}
}
pt[i] = shadow_trap_nonpresent_pte;
@@ -1868,9 +1873,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- rmap_remove(vcpu->kvm, sptep);
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ goto done;
}
spte |= PT_WRITABLE_MASK;
@@ -1902,6 +1906,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
__set_spte(sptep, spte);
+done:
return ret;
}
@@ -1938,8 +1943,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*sptep), pfn);
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
@@ -2591,7 +2595,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- rmap_remove(vcpu->kvm, spte);
+ drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3350c02..dfb2720 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -353,8 +353,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
if (is_large_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
}
@@ -515,12 +514,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
+ drop_spte(vcpu->kvm, sptep,
+ shadow_trap_nonpresent_pte);
need_flush = 1;
- }
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ } else
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
@@ -636,12 +636,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
!is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
- rmap_remove(vcpu->kvm, &sp->spt[i]);
if (is_present_gpte(gpte) || !clear_unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
- __set_spte(&sp->spt[i], nonpresent);
+ drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
continue;
}
--
1.7.1
* [PATCH v3 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte()
From: Avi Kivity @ 2010-07-05 12:39 UTC (permalink / raw)
To: kvm; +Cc: Marcelo Tosatti
Since we need to make the accessed/dirty bit checks atomic with the spte
update, move them to the place that sets the new spte.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/mmu.c | 19 +++++++++++--------
1 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5b211dc..e1e6967 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -612,19 +612,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- pfn_t pfn;
gfn_t gfn;
unsigned long *rmapp;
int i;
- if (!is_rmap_spte(*spte))
- return;
sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(pfn);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
if (!*rmapp) {
@@ -660,6 +652,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
+ pfn_t pfn;
+
+ if (!is_rmap_spte(*sptep)) {
+ __set_spte(sptep, new_spte);
+ return;
+ }
+ pfn = spte_to_pfn(*sptep);
+ if (*sptep & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (is_writable_pte(*sptep))
+ kvm_set_pfn_dirty(pfn);
rmap_remove(kvm, sptep);
__set_spte(sptep, new_spte);
}
--
1.7.1
* [PATCH v3 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte
From: Avi Kivity @ 2010-07-05 12:39 UTC (permalink / raw)
To: kvm; +Cc: Marcelo Tosatti
Currently, in the window between the check for the accessed bit and actually
dropping the spte, a vcpu can access the page through the spte and set the
bit, which the mmu will then miss.
Fix by using an exchange operation to atomically fetch the spte and drop it.
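As an aside on the 32-bit path: i386 has no 64-bit xchg instruction, so the
exchange is emulated with a cmpxchg64() (cmpxchg8b) loop.  A standalone
sketch of the same pattern, using the gcc builtin rather than the kernel
primitive (the function name here is made up):

static uint64_t xchg64_via_cmpxchg(uint64_t *p, uint64_t new_val)
{
        uint64_t old = *p;

        /* On failure the builtin reloads 'old' with the current value,
         * so we simply retry until nobody raced with us. */
        while (!__atomic_compare_exchange_n(p, &old, new_val, 0,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_SEQ_CST))
                ;
        return old;
}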
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/mmu.c | 28 +++++++++++++++++++++-------
1 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e1e6967..b744fbc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -288,6 +288,21 @@ static void __set_spte(u64 *sptep, u64 spte)
#endif
}
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+ return xchg(sptep, new_spte);
+#else
+ u64 old_spte;
+
+ do {
+ old_spte = *sptep;
+ } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+ return old_spte;
+#endif
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -653,18 +668,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
pfn_t pfn;
+ u64 old_spte;
- if (!is_rmap_spte(*sptep)) {
- __set_spte(sptep, new_spte);
+ old_spte = __xchg_spte(sptep, new_spte);
+ if (!is_rmap_spte(old_spte))
return;
- }
- pfn = spte_to_pfn(*sptep);
- if (*sptep & shadow_accessed_mask)
+ pfn = spte_to_pfn(old_spte);
+ if (old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*sptep))
+ if (is_writable_pte(old_spte))
kvm_set_pfn_dirty(pfn);
rmap_remove(kvm, sptep);
- __set_spte(sptep, new_spte);
}
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
--
1.7.1
* [PATCH v3 4/4] KVM: MMU: Don't drop accessed bit while updating an spte
From: Avi Kivity @ 2010-07-05 12:39 UTC (permalink / raw)
To: kvm; +Cc: Marcelo Tosatti
__set_spte() will happily replace an spte that has the accessed bit set with
one that has the accessed bit clear.  Add a helper update_spte() which checks
for this condition and updates the page's accessed flag if needed.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++----
1 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b744fbc..104756b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -303,6 +303,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
#endif
}
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte;
+
+ if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
+ __set_spte(sptep, new_spte);
+ } else {
+ old_spte = __xchg_spte(sptep, new_spte);
+ if (old_spte & shadow_accessed_mask)
+ mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+ }
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -721,7 +734,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ update_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -777,7 +790,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
{
int need_flush = 0;
- u64 *spte, new_spte;
+ u64 *spte, new_spte, old_spte;
pte_t *ptep = (pte_t *)data;
pfn_t new_pfn;
@@ -797,9 +810,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
+ new_spte &= ~shadow_accessed_mask;
if (is_writable_pte(*spte))
kvm_set_pfn_dirty(spte_to_pfn(*spte));
- __set_spte(spte, new_spte);
+ old_spte = __xchg_spte(spte, new_spte);
+ if (is_shadow_present_pte(old_spte)
+ && (old_spte & shadow_accessed_mask))
+ mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -1922,7 +1939,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- __set_spte(sptep, spte);
+ update_spte(sptep, spte);
done:
return ret;
}
--
1.7.1
* Re: [PATCH v3 0/4] Fix accessed bit tracking
From: Marcelo Tosatti @ 2010-07-05 13:45 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm
On Mon, Jul 05, 2010 at 03:39:09PM +0300, Avi Kivity wrote:
> The KVM MMU synchronizes shadow ptes using the mmu lock; however, the CPU
> will happily ignore the lock when setting the accessed bit. This can cause
> the accessed bit to be lost. Luckily, this only results in incorrect page
> selection for swap.
>
> This patchset fixes the problem by updating the spte atomically when
> needed, taking care not to lose the accessed bit.
>
> v3: fix i386 pte exchange code in patch 3
> add missing __set_spte() to drop_spte() in patch 2
>
> v2: fix incorrect code transformations in first patch
>
> Avi Kivity (4):
> KVM: MMU: Introduce drop_spte()
> KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to
> drop_spte()
> KVM: MMU: Atomically check for accessed bit when dropping an spte
> KVM: MMU: Don't drop accessed bit while updating an spte
>
> arch/x86/kvm/mmu.c | 88 +++++++++++++++++++++++++++++++------------
> arch/x86/kvm/paging_tmpl.h | 13 +++---
> 2 files changed, 69 insertions(+), 32 deletions(-)
Looks good to me.