public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] Fix accessed bit tracking
@ 2010-06-06 13:06 Avi Kivity
  2010-06-06 13:06 ` [PATCH 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Avi Kivity @ 2010-06-06 13:06 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

The kvm mmu synchronizes shadow ptes using the mmu lock, however the cpu
will happily ignore the lock when setting the accessed bit.  This can cause
the accessed bit to be lost.  Luckily this only results in incorrect page
selection for swap.

This patchset fixes the problem by atomically updating the spte when
needed while taking care of the accessed bit.

Avi Kivity (4):
  KVM: MMU: Introduce drop_spte()
  KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to
    drop_spte()
  KVM: MMU: Atomically check for accessed bit when dropping an spte
  KVM: MMU: Don't drop accessed bit while updating an spte

 arch/x86/kvm/mmu.c         |   91 +++++++++++++++++++++++++++++++------------
 arch/x86/kvm/paging_tmpl.h |   13 +++---
 2 files changed, 71 insertions(+), 33 deletions(-)


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/4] KVM: MMU: Introduce drop_spte()
  2010-06-06 13:06 [PATCH 0/4] Fix accessed bit tracking Avi Kivity
@ 2010-06-06 13:06 ` Avi Kivity
  2010-06-06 14:33   ` Avi Kivity
  2010-06-06 13:06 ` [PATCH 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Avi Kivity
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 6+ messages in thread
From: Avi Kivity @ 2010-06-06 13:06 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

When we call rmap_remove(), we (almost) always immediately follow it by
an __set_spte() to a nonpresent pte.  Since we need to perform the two
operations atomically, to avoid losing the dirty and accessed bits, introduce
a helper drop_spte() and convert all call sites.

The operation is still nonatomic at this point.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         |   33 +++++++++++++++++++--------------
 arch/x86/kvm/paging_tmpl.h |   13 ++++++-------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6b2c644..17331c2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -666,6 +666,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	}
 }
 
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+	rmap_remove(kvm, sptep);
+	__set_spte(sptep, new_spte);
+}
+
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
@@ -731,9 +737,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
-				rmap_remove(kvm, spte);
+				drop_spte(kvm, spte,
+					  shadow_trap_nonpresent_pte);
 				--kvm->stat.lpages;
-				__set_spte(spte, shadow_trap_nonpresent_pte);
 				spte = NULL;
 				write_protected = 1;
 			}
@@ -753,8 +759,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		rmap_remove(kvm, spte);
-		__set_spte(spte, shadow_trap_nonpresent_pte);
+		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 		need_tlb_flush = 1;
 	}
 	return need_tlb_flush;
@@ -776,8 +781,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
-			rmap_remove(kvm, spte);
-			__set_spte(spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 			spte = rmap_next(kvm, rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -1498,13 +1502,14 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 				ent &= PT64_BASE_ADDR_MASK;
 				mmu_page_remove_parent_pte(page_header(ent),
 							   &pt[i]);
+				pt[i] = shadow_trap_nonpresent_pte;
 			} else {
 				if (is_large_pte(ent))
 					--kvm->stat.lpages;
-				rmap_remove(kvm, &pt[i]);
+				drop_spte(kvm, &pt[i],
+					  shadow_trap_nonpresent_pte);
 			}
 		}
-		pt[i] = shadow_trap_nonpresent_pte;
 	}
 }
 
@@ -1902,9 +1907,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
-			rmap_remove(vcpu->kvm, sptep);
-			spte = shadow_trap_nonpresent_pte;
-			goto set_pte;
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			goto done;
 		}
 
 		spte |= PT_WRITABLE_MASK;
@@ -1936,6 +1940,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 set_pte:
 	__set_spte(sptep, spte);
+done:
 	return ret;
 }
 
@@ -1972,7 +1977,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);
-			rmap_remove(vcpu->kvm, sptep);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 		} else
 			was_rmapped = 1;
 	}
@@ -2623,13 +2628,13 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level))
-			rmap_remove(vcpu->kvm, spte);
+			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, spte);
+			__set_spte(spte, shadow_trap_nonpresent_pte);
 		}
 	}
-	__set_spte(spte, shadow_trap_nonpresent_pte);
 	if (is_large_pte(pte))
 		--vcpu->kvm->stat.lpages;
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 8f1ef87..105176d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -329,8 +329,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			continue;
 
 		if (is_large_pte(*sptep)) {
-			rmap_remove(vcpu->kvm, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		}
 
@@ -491,12 +490,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
-				rmap_remove(vcpu->kvm, sptep);
 				if (is_large_pte(*sptep))
 					--vcpu->kvm->stat.lpages;
+				drop_spte(vcpu->kvm, sptep,
+					  shadow_trap_nonpresent_pte);
 				need_flush = 1;
-			}
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			} else
+				__set_spte(sptep, shadow_trap_nonpresent_pte);
 			break;
 		}
 
@@ -612,12 +612,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
-			rmap_remove(vcpu->kvm, &sp->spt[i]);
 			if (is_present_gpte(gpte))
 				nonpresent = shadow_trap_nonpresent_pte;
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
-			__set_spte(&sp->spt[i], nonpresent);
+			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
 			continue;
 		}
 
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte()
  2010-06-06 13:06 [PATCH 0/4] Fix accessed bit tracking Avi Kivity
  2010-06-06 13:06 ` [PATCH 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
@ 2010-06-06 13:06 ` Avi Kivity
  2010-06-06 13:06 ` [PATCH 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte Avi Kivity
  2010-06-06 13:06 ` [PATCH 4/4] KVM: MMU: Don't drop accessed bit while updating " Avi Kivity
  3 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-06-06 13:06 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Since we need to make the check atomic, move it to the place that will
set the new spte.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c |   17 +++++++++--------
 1 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 17331c2..f93948d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -620,19 +620,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *prev_desc;
 	struct kvm_mmu_page *sp;
-	pfn_t pfn;
 	gfn_t gfn;
 	unsigned long *rmapp;
 	int i;
 
-	if (!is_rmap_spte(*spte))
-		return;
 	sp = page_header(__pa(spte));
-	pfn = spte_to_pfn(*spte);
-	if (*spte & shadow_accessed_mask)
-		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(*spte))
-		kvm_set_pfn_dirty(pfn);
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
@@ -668,6 +660,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 
 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 {
+	pfn_t pfn;
+
+	if (!is_rmap_spte(*sptep))
+		return;
+	pfn = spte_to_pfn(*sptep);
+	if (*sptep & shadow_accessed_mask)
+		kvm_set_pfn_accessed(pfn);
+	if (is_writable_pte(*sptep))
+		kvm_set_pfn_dirty(pfn);
 	rmap_remove(kvm, sptep);
 	__set_spte(sptep, new_spte);
 }
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte
  2010-06-06 13:06 [PATCH 0/4] Fix accessed bit tracking Avi Kivity
  2010-06-06 13:06 ` [PATCH 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
  2010-06-06 13:06 ` [PATCH 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Avi Kivity
@ 2010-06-06 13:06 ` Avi Kivity
  2010-06-06 13:06 ` [PATCH 4/4] KVM: MMU: Don't drop accessed bit while updating " Avi Kivity
  3 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-06-06 13:06 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Currently, in the window between the check for the accessed bit, and actually
dropping the spte, a vcpu can access the page through the spte and set the bit,
which will be ignored by the mmu.

Fix by using an exchange operation to atomically fetch the spte and drop it.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c |   26 +++++++++++++++++++++-----
 1 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f93948d..b565a14 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -290,6 +290,21 @@ static void __set_spte(u64 *sptep, u64 spte)
 #endif
 }
 
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+	return xchg(sptep, new_spte);	/* returns the spte that was replaced */
+#else	/* !CONFIG_X86_64: emulate the exchange with a cmpxchg64 retry loop */
+	u64 old_spte;
+
+	do {
+		old_spte = *sptep;	/* snapshot; retried if it changes */
+	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+	return old_spte;	/* the value new_spte replaced */
+#endif	/* CONFIG_X86_64 */
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -661,16 +676,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 {
 	pfn_t pfn;
+	u64 old_spte;
 
-	if (!is_rmap_spte(*sptep))
+	old_spte = __xchg_spte(sptep, new_spte);
+	if (!is_rmap_spte(old_spte))
 		return;
-	pfn = spte_to_pfn(*sptep);
-	if (*sptep & shadow_accessed_mask)
+	pfn = spte_to_pfn(old_spte);
+	if (old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(*sptep))
+	if (is_writable_pte(old_spte))
 		kvm_set_pfn_dirty(pfn);
 	rmap_remove(kvm, sptep);
-	__set_spte(sptep, new_spte);
 }
 
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/4] KVM: MMU: Don't drop accessed bit while updating an spte
  2010-06-06 13:06 [PATCH 0/4] Fix accessed bit tracking Avi Kivity
                   ` (2 preceding siblings ...)
  2010-06-06 13:06 ` [PATCH 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte Avi Kivity
@ 2010-06-06 13:06 ` Avi Kivity
  3 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-06-06 13:06 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

__set_spte() will happily replace an spte with the accessed bit set with
one that has the accessed bit clear.  Add a helper update_spte() which checks
for this condition and updates the page flag if needed.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c |   25 +++++++++++++++++++++----
 1 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b565a14..4c98726 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -305,6 +305,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
 #endif
 }
 
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+	u64 old_spte;
+
+	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
+		__set_spte(sptep, new_spte);
+	} else {
+		old_spte = __xchg_spte(sptep, new_spte);
+		if (old_spte & shadow_accessed_mask)
+			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+	}
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -730,7 +743,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
-			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
 		spte = rmap_next(kvm, rmapp, spte);
@@ -786,7 +799,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 *spte, new_spte, old_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
@@ -806,9 +819,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
+			new_spte &= ~shadow_accessed_mask;
 			if (is_writable_pte(*spte))
 				kvm_set_pfn_dirty(spte_to_pfn(*spte));
-			__set_spte(spte, new_spte);
+			old_spte = __xchg_spte(spte, new_spte);
+			if (is_shadow_present_pte(old_spte)
+			    && (old_spte & shadow_accessed_mask))
+				mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 	}
@@ -1956,7 +1973,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	__set_spte(sptep, spte);
+	update_spte(sptep, spte);
 done:
 	return ret;
 }
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/4] KVM: MMU: Introduce drop_spte()
  2010-06-06 13:06 ` [PATCH 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
@ 2010-06-06 14:33   ` Avi Kivity
  0 siblings, 0 replies; 6+ messages in thread
From: Avi Kivity @ 2010-06-06 14:33 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

On 06/06/2010 04:06 PM, Avi Kivity wrote:
> When we call rmap_remove(), we (almost) always immediately follow it by
> an __set_spte() to a nonpresent pte.  Since we need to perform the two
> operations atomically, to avoid losing the dirty and accessed bits, introduce
> a helper drop_spte() and convert all call sites.
>
> The operation is still nonatomic at this point.
>
>
> @@ -1498,13 +1502,14 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
>   				ent&= PT64_BASE_ADDR_MASK;
>   				mmu_page_remove_parent_pte(page_header(ent),
>   							&pt[i]);
> +				pt[i] = shadow_trap_nonpresent_pte;
>   			} else {
>   				if (is_large_pte(ent))
>   					--kvm->stat.lpages;
> -				rmap_remove(kvm,&pt[i]);
> +				drop_spte(kvm,&pt[i],
> +					  shadow_trap_nonpresent_pte);
>   			}
>   		}
> -		pt[i] = shadow_trap_nonpresent_pte;
>   	}
>   }
>    

Autotest points out that this transformation (and an identical one in 
zap_pte) does not preserve the semantics; if the outer if () fails, the 
new code does not update pt[i].

With the original line after the if () retained, autotest is happier.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2010-06-06 14:33 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-06 13:06 [PATCH 0/4] Fix accessed bit tracking Avi Kivity
2010-06-06 13:06 ` [PATCH 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
2010-06-06 14:33   ` Avi Kivity
2010-06-06 13:06 ` [PATCH 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Avi Kivity
2010-06-06 13:06 ` [PATCH 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte Avi Kivity
2010-06-06 13:06 ` [PATCH 4/4] KVM: MMU: Don't drop accessed bit while updating " Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox