public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/4] Fix accessed bit tracking
@ 2010-06-07  7:10 Avi Kivity
  2010-06-07  7:10 ` [PATCH v2 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Avi Kivity @ 2010-06-07  7:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

The kvm mmu synchronizes shadow ptes using the mmu lock, however the cpu
will happily ignore the lock when setting the accessed bit.  This can cause
the accessed bit to be lost.  Luckily this only results in incorrect page
selection for swap.

This patchset fixes the problem by atomically updating the spte when
needed while taking care of the accessed bit.

v2: fix incorrect code transformations in first patch

Avi Kivity (4):
  KVM: MMU: Introduce drop_spte()
  KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to
    drop_spte()
  KVM: MMU: Atomically check for accessed bit when dropping an spte
  KVM: MMU: Don't drop accessed bit while updating an spte

 arch/x86/kvm/mmu.c         |   87 ++++++++++++++++++++++++++++++++------------
 arch/x86/kvm/paging_tmpl.h |   13 +++----
 2 files changed, 69 insertions(+), 31 deletions(-)


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 1/4] KVM: MMU: Introduce drop_spte()
  2010-06-07  7:10 [PATCH v2 0/4] Fix accessed bit tracking Avi Kivity
@ 2010-06-07  7:10 ` Avi Kivity
  2010-06-07  7:10 ` [PATCH v2 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Avi Kivity
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Avi Kivity @ 2010-06-07  7:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

When we call rmap_remove(), we (almost) always immediately follow it by
an __set_spte() to a nonpresent pte.  Since we need to perform the two
operations atomically, to avoid losing the dirty and accessed bits, introduce
a helper drop_spte() and convert all call sites.

The operation is still nonatomic at this point.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         |   29 +++++++++++++++++------------
 arch/x86/kvm/paging_tmpl.h |   13 ++++++-------
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6b2c644..16cedc9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -666,6 +666,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	}
 }
 
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+	rmap_remove(kvm, sptep);
+	__set_spte(sptep, new_spte);
+}
+
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
@@ -731,9 +737,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
-				rmap_remove(kvm, spte);
+				drop_spte(kvm, spte,
+					  shadow_trap_nonpresent_pte);
 				--kvm->stat.lpages;
-				__set_spte(spte, shadow_trap_nonpresent_pte);
 				spte = NULL;
 				write_protected = 1;
 			}
@@ -753,8 +759,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		rmap_remove(kvm, spte);
-		__set_spte(spte, shadow_trap_nonpresent_pte);
+		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 		need_tlb_flush = 1;
 	}
 	return need_tlb_flush;
@@ -776,8 +781,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
-			rmap_remove(kvm, spte);
-			__set_spte(spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 			spte = rmap_next(kvm, rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -1501,7 +1505,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 			} else {
 				if (is_large_pte(ent))
 					--kvm->stat.lpages;
-				rmap_remove(kvm, &pt[i]);
+				drop_spte(kvm, &pt[i],
+					  shadow_trap_nonpresent_pte);
 			}
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
@@ -1902,9 +1907,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
-			rmap_remove(vcpu->kvm, sptep);
-			spte = shadow_trap_nonpresent_pte;
-			goto set_pte;
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			goto done;
 		}
 
 		spte |= PT_WRITABLE_MASK;
@@ -1936,6 +1940,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 set_pte:
 	__set_spte(sptep, spte);
+done:
 	return ret;
 }
 
@@ -1972,7 +1977,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);
-			rmap_remove(vcpu->kvm, sptep);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 		} else
 			was_rmapped = 1;
 	}
@@ -2623,7 +2628,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level))
-			rmap_remove(vcpu->kvm, spte);
+			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, spte);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 8f1ef87..105176d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -329,8 +329,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			continue;
 
 		if (is_large_pte(*sptep)) {
-			rmap_remove(vcpu->kvm, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		}
 
@@ -491,12 +490,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
-				rmap_remove(vcpu->kvm, sptep);
 				if (is_large_pte(*sptep))
 					--vcpu->kvm->stat.lpages;
+				drop_spte(vcpu->kvm, sptep,
+					  shadow_trap_nonpresent_pte);
 				need_flush = 1;
-			}
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			} else
+				__set_spte(sptep, shadow_trap_nonpresent_pte);
 			break;
 		}
 
@@ -612,12 +612,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
-			rmap_remove(vcpu->kvm, &sp->spt[i]);
 			if (is_present_gpte(gpte))
 				nonpresent = shadow_trap_nonpresent_pte;
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
-			__set_spte(&sp->spt[i], nonpresent);
+			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
 			continue;
 		}
 
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte()
  2010-06-07  7:10 [PATCH v2 0/4] Fix accessed bit tracking Avi Kivity
  2010-06-07  7:10 ` [PATCH v2 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
@ 2010-06-07  7:10 ` Avi Kivity
  2010-06-07  8:16   ` Lai Jiangshan
  2010-06-07  7:10 ` [PATCH v2 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte Avi Kivity
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: Avi Kivity @ 2010-06-07  7:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Since we need to make the check atomic, move it to the place that will
set the new spte.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c |   17 +++++++++--------
 1 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 16cedc9..b5a2d3d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -620,19 +620,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *prev_desc;
 	struct kvm_mmu_page *sp;
-	pfn_t pfn;
 	gfn_t gfn;
 	unsigned long *rmapp;
 	int i;
 
-	if (!is_rmap_spte(*spte))
-		return;
 	sp = page_header(__pa(spte));
-	pfn = spte_to_pfn(*spte);
-	if (*spte & shadow_accessed_mask)
-		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(*spte))
-		kvm_set_pfn_dirty(pfn);
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
@@ -668,6 +660,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 
 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 {
+	pfn_t pfn;
+
+	if (!is_rmap_spte(*sptep))
+		return;
+	pfn = spte_to_pfn(*sptep);
+	if (*sptep & shadow_accessed_mask)
+		kvm_set_pfn_accessed(pfn);
+	if (is_writable_pte(*sptep))
+		kvm_set_pfn_dirty(pfn);
 	rmap_remove(kvm, sptep);
 	__set_spte(sptep, new_spte);
 }
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte
  2010-06-07  7:10 [PATCH v2 0/4] Fix accessed bit tracking Avi Kivity
  2010-06-07  7:10 ` [PATCH v2 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
  2010-06-07  7:10 ` [PATCH v2 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Avi Kivity
@ 2010-06-07  7:10 ` Avi Kivity
  2010-06-08  2:07   ` Xiao Guangrong
  2010-06-07  7:10 ` [PATCH v2 4/4] KVM: MMU: Don't drop accessed bit while updating " Avi Kivity
  2010-06-07  8:43 ` [PATCH v2 0/4] Fix accessed bit tracking Lai Jiangshan
  4 siblings, 1 reply; 16+ messages in thread
From: Avi Kivity @ 2010-06-07  7:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Currently, in the window between the check for the accessed bit, and actually
dropping the spte, a vcpu can access the page through the spte and set the bit,
which will be ignored by the mmu.

Fix by using an exchange operation to atomically fetch the spte and drop it.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c |   26 +++++++++++++++++++++-----
 1 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b5a2d3d..f5bb959 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -290,6 +290,21 @@ static void __set_spte(u64 *sptep, u64 spte)
 #endif
 }
 
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+	return xchg(sptep, new_spte);
+#else
+	u64 old_spte;
+
+	do {
+		old_spte = *sptep;
+	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+	return old;
+#endif
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -661,16 +676,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 {
 	pfn_t pfn;
+	u64 old_spte;
 
-	if (!is_rmap_spte(*sptep))
+	old_spte = __xchg_spte(sptep, new_spte);
+	if (!is_rmap_spte(old_spte))
 		return;
-	pfn = spte_to_pfn(*sptep);
-	if (*sptep & shadow_accessed_mask)
+	pfn = spte_to_pfn(old_spte);
+	if (old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(*sptep))
+	if (is_writable_pte(old_spte))
 		kvm_set_pfn_dirty(pfn);
 	rmap_remove(kvm, sptep);
-	__set_spte(sptep, new_spte);
 }
 
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 4/4] KVM: MMU: Don't drop accessed bit while updating an spte
  2010-06-07  7:10 [PATCH v2 0/4] Fix accessed bit tracking Avi Kivity
                   ` (2 preceding siblings ...)
  2010-06-07  7:10 ` [PATCH v2 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte Avi Kivity
@ 2010-06-07  7:10 ` Avi Kivity
  2010-06-07  8:43 ` [PATCH v2 0/4] Fix accessed bit tracking Lai Jiangshan
  4 siblings, 0 replies; 16+ messages in thread
From: Avi Kivity @ 2010-06-07  7:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

__set_spte() will happily replace an spte with the accessed bit set with
one that has the accessed bit clear.  Add a helper update_spte() which checks
for this condition and updates the page flag if needed.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c |   25 +++++++++++++++++++++----
 1 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f5bb959..68e8923 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -305,6 +305,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
 #endif
 }
 
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+	u64 old_spte;
+
+	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
+		__set_spte(sptep, new_spte);
+	} else {
+		old_spte = __xchg_spte(sptep, new_spte);
+		if (old_spte & shadow_accessed_mask)
+			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+	}
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -730,7 +743,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
-			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
 		spte = rmap_next(kvm, rmapp, spte);
@@ -786,7 +799,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 *spte, new_spte, old_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
@@ -806,9 +819,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
+			new_spte &= ~shadow_accessed_mask;
 			if (is_writable_pte(*spte))
 				kvm_set_pfn_dirty(spte_to_pfn(*spte));
-			__set_spte(spte, new_spte);
+			old_spte = __xchg_spte(spte, new_spte);
+			if (is_shadow_present_pte(old_spte)
+			    && (old_spte & shadow_accessed_mask))
+				mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 	}
@@ -1956,7 +1973,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	__set_spte(sptep, spte);
+	update_spte(sptep, spte);
 done:
 	return ret;
 }
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte()
  2010-06-07  7:10 ` [PATCH v2 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Avi Kivity
@ 2010-06-07  8:16   ` Lai Jiangshan
  2010-06-07  9:01     ` Avi Kivity
  0 siblings, 1 reply; 16+ messages in thread
From: Lai Jiangshan @ 2010-06-07  8:16 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

Avi Kivity wrote:
> Since we need to make the check atomic, move it to the place that will
> set the new spte.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  arch/x86/kvm/mmu.c |   17 +++++++++--------
>  1 files changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 16cedc9..b5a2d3d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -620,19 +620,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>  	struct kvm_rmap_desc *desc;
>  	struct kvm_rmap_desc *prev_desc;
>  	struct kvm_mmu_page *sp;
> -	pfn_t pfn;
>  	gfn_t gfn;
>  	unsigned long *rmapp;
>  	int i;
>  
> -	if (!is_rmap_spte(*spte))
> -		return;
>  	sp = page_header(__pa(spte));
> -	pfn = spte_to_pfn(*spte);
> -	if (*spte & shadow_accessed_mask)
> -		kvm_set_pfn_accessed(pfn);
> -	if (is_writable_pte(*spte))
> -		kvm_set_pfn_dirty(pfn);
>  	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
>  	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
>  	if (!*rmapp) {
> @@ -668,6 +660,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>  
>  static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
>  {
> +	pfn_t pfn;
> +
> +	if (!is_rmap_spte(*sptep))
> +		return;

Missing __set_spte(sptep, new_spte); ?

> +	pfn = spte_to_pfn(*sptep);
> +	if (*sptep & shadow_accessed_mask)
> +		kvm_set_pfn_accessed(pfn);
> +	if (is_writable_pte(*sptep))
> +		kvm_set_pfn_dirty(pfn);
>  	rmap_remove(kvm, sptep);
>  	__set_spte(sptep, new_spte);
>  }


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 0/4] Fix accessed bit tracking
  2010-06-07  7:10 [PATCH v2 0/4] Fix accessed bit tracking Avi Kivity
                   ` (3 preceding siblings ...)
  2010-06-07  7:10 ` [PATCH v2 4/4] KVM: MMU: Don't drop accessed bit while updating " Avi Kivity
@ 2010-06-07  8:43 ` Lai Jiangshan
  2010-06-07  9:00   ` Avi Kivity
  4 siblings, 1 reply; 16+ messages in thread
From: Lai Jiangshan @ 2010-06-07  8:43 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

Avi Kivity wrote:
> The kvm mmu synchronizes shadow ptes using the mmu lock, however the cpu
> will happily ignore the lock when setting the accessed bit.  This can cause
> the accessed bit to be lost.  Luckily this only results in incorrect page
> selection for swap.
> 

Atomic operations are heavy and slow; they hurt performance.
Incorrect page selection for swap also hurts performance.

I think the races that cause the accessed bit to be lost happen
very rarely. Since losing the accessed bit produces no incorrect
result, is this problem an over-concern?

Lai

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 0/4] Fix accessed bit tracking
  2010-06-07  8:43 ` [PATCH v2 0/4] Fix accessed bit tracking Lai Jiangshan
@ 2010-06-07  9:00   ` Avi Kivity
  2010-06-08  2:35     ` Xiao Guangrong
  0 siblings, 1 reply; 16+ messages in thread
From: Avi Kivity @ 2010-06-07  9:00 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Marcelo Tosatti, kvm

On 06/07/2010 11:43 AM, Lai Jiangshan wrote:
> Avi Kivity wrote:
>    
>> The kvm mmu synchronizes shadow ptes using the mmu lock, however the cpu
>> will happily ignore the lock when setting the accessed bit.  This can cause
>> the accessed bit to be lost.  Luckily this only results in incorrect page
>> selection for swap.
>>
>>      
> Atomic operation is heavy and slow, it hurts performance.
> Incorrect page selection for swap also hurts performance.
>    

We can avoid the exchange in most cases, for example if the new spte has 
the accessed bit set (already in the patch set) or if the page is 
already marked as accessed, or if we see the old spte has the accessed 
bit set (so no race can occur).  I'll update the patches to avoid 
atomics when possible.

I don't think atomics are that expensive, though, ~20 cycles on modern 
processors?

> I think there are very rare competitions happened and cause
> the accessed bit to be lost. Since there is no incorrect result
> when the accessed bit is lost. Is this problem over concern?
>    

The real concern is when we start using the dirty bit.  I'd like to 
fault read accesses with writeable sptes, but with the dirty bit clear.  
This way we can allow a guest to write to a page without a fault, but 
not cause it to swap too soon.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte()
  2010-06-07  8:16   ` Lai Jiangshan
@ 2010-06-07  9:01     ` Avi Kivity
  0 siblings, 0 replies; 16+ messages in thread
From: Avi Kivity @ 2010-06-07  9:01 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Marcelo Tosatti, kvm

On 06/07/2010 11:16 AM, Lai Jiangshan wrote:
> Avi Kivity wrote:
>    
>> Since we need to make the check atomic, move it to the place that will
>> set the new spte.
>>
>> Signed-off-by: Avi Kivity<avi@redhat.com>
>> ---
>>   arch/x86/kvm/mmu.c |   17 +++++++++--------
>>   1 files changed, 9 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 16cedc9..b5a2d3d 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -620,19 +620,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>>   	struct kvm_rmap_desc *desc;
>>   	struct kvm_rmap_desc *prev_desc;
>>   	struct kvm_mmu_page *sp;
>> -	pfn_t pfn;
>>   	gfn_t gfn;
>>   	unsigned long *rmapp;
>>   	int i;
>>
>> -	if (!is_rmap_spte(*spte))
>> -		return;
>>   	sp = page_header(__pa(spte));
>> -	pfn = spte_to_pfn(*spte);
>> -	if (*spte&  shadow_accessed_mask)
>> -		kvm_set_pfn_accessed(pfn);
>> -	if (is_writable_pte(*spte))
>> -		kvm_set_pfn_dirty(pfn);
>>   	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
>>   	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
>>   	if (!*rmapp) {
>> @@ -668,6 +660,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>>
>>   static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
>>   {
>> +	pfn_t pfn;
>> +
>> +	if (!is_rmap_spte(*sptep))
>> +		return;
>>      
> Missing __set_spte(sptep, new_spte); ?
>    

Yup, will fix.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte
  2010-06-07  7:10 ` [PATCH v2 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte Avi Kivity
@ 2010-06-08  2:07   ` Xiao Guangrong
  2010-06-08  5:51     ` Avi Kivity
  0 siblings, 1 reply; 16+ messages in thread
From: Xiao Guangrong @ 2010-06-08  2:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm



Avi Kivity wrote:
 
> +static u64 __xchg_spte(u64 *sptep, u64 new_spte)
> +{
> +#ifdef CONFIG_X86_64
> +	return xchg(sptep, new_spte);
> +#else
> +	u64 old_spte;
> +
> +	do {
> +		old_spte = *sptep;
> +	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
> +
> +	return old;

it's a typo: 'old' -> 'old_spte' :-)

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 0/4] Fix accessed bit tracking
  2010-06-07  9:00   ` Avi Kivity
@ 2010-06-08  2:35     ` Xiao Guangrong
  2010-06-08  5:24       ` Avi Kivity
  0 siblings, 1 reply; 16+ messages in thread
From: Xiao Guangrong @ 2010-06-08  2:35 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Lai Jiangshan, Marcelo Tosatti, kvm



Avi Kivity wrote:
> On 06/07/2010 11:43 AM, Lai Jiangshan wrote:
>> Avi Kivity wrote:
>>   
>>> The kvm mmu synchronizes shadow ptes using the mmu lock, however the cpu
>>> will happily ignore the lock when setting the accessed bit.  This can
>>> cause
>>> the accessed bit to be lost.  Luckily this only results in incorrect
>>> page
>>> selection for swap.
>>>
>>>      
>> Atomic operation is heavy and slow, it hurts performance.
>> Incorrect page selection for swap also hurts performance.
>>    
> 
> We can avoid the exchange in most cases, for example if the new spte has
> the accessed bit set (already in the patch set) or if the page is
> already marked as accessed, or if we see the old spte has the accessed
> bit set (so no race can occur).  I'll update the patches to avoid
> atomics when possible.

Umm, the reason that we need atomics here is to prevent a vcpu from updating the spte while we read the A bit
from it, so perhaps we can use the way below to avoid atomics completely:

- set reserved bit in spte
- get A bit form spte
- set new spte

the worst case is that it causes a vcpu #PF here, but that doesn't matter since the old mapping is already invalid;
a remote tlb flush is also needed later.

> 
> I don't think atomics are that expensive, though, ~20 cycles on modern
> processors?
> 

Yes, but atomics are "LOCK" instructions; they can stop multiple cpus from running in parallel.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 0/4] Fix accessed bit tracking
  2010-06-08  2:35     ` Xiao Guangrong
@ 2010-06-08  5:24       ` Avi Kivity
  2010-06-08  6:53         ` Xiao Guangrong
  0 siblings, 1 reply; 16+ messages in thread
From: Avi Kivity @ 2010-06-08  5:24 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Lai Jiangshan, Marcelo Tosatti, kvm

On 06/08/2010 05:35 AM, Xiao Guangrong wrote:
>
>> We can avoid the exchange in most cases, for example if the new spte has
>> the accessed bit set (already in the patch set) or if the page is
>> already marked as accessed, or if we see the old spte has the accessed
>> bit set (so no race can occur).  I'll update the patches to avoid
>> atomics when possible.
>>      
> Umm, the reason that we need atomics here is to avoid vcpu to update spte when we read A bit
> form it, so, perhaps we can use below way to avoid atomics completely:
>
> - set reserved bit in spte
> - get A bit form spte
> - set new spte
>
> the worst case is cause vcpu #PF here, but it doesn't matter since the old mapping is already invalid,
> also need a remote tlb flush later.
>    

To set the reserved bit in the spte, you need an atomic operation (well, 
unless you use a sub-word access to set a reserved bit in the high 32 
bits).

>> I don't think atomics are that expensive, though, ~20 cycles on modern
>> processors?
>>
>>      
> Yes, but atomics are "LOCK" instructions, it can stop multiple cpus runing in parallel.
>    

Only if those cpus are accessing the same word you're accessing.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte
  2010-06-08  2:07   ` Xiao Guangrong
@ 2010-06-08  5:51     ` Avi Kivity
  0 siblings, 0 replies; 16+ messages in thread
From: Avi Kivity @ 2010-06-08  5:51 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, kvm

On 06/08/2010 05:07 AM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>    
>> +static u64 __xchg_spte(u64 *sptep, u64 new_spte)
>> +{
>> +#ifdef CONFIG_X86_64
>> +	return xchg(sptep, new_spte);
>> +#else
>> +	u64 old_spte;
>> +
>> +	do {
>> +		old_spte = *sptep;
>> +	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
>> +
>> +	return old;
>>      
> it's a typo: 'old' ->  'old_spte' :-)
>    

Thanks.  Reminds me I need to test i386 too.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 0/4] Fix accessed bit tracking
  2010-06-08  5:24       ` Avi Kivity
@ 2010-06-08  6:53         ` Xiao Guangrong
  2010-06-08  7:54           ` Avi Kivity
  0 siblings, 1 reply; 16+ messages in thread
From: Xiao Guangrong @ 2010-06-08  6:53 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Lai Jiangshan, Marcelo Tosatti, kvm



Avi Kivity wrote:
> On 06/08/2010 05:35 AM, Xiao Guangrong wrote:
>>
>>> We can avoid the exchange in most cases, for example if the new spte has
>>> the accessed bit set (already in the patch set) or if the page is
>>> already marked as accessed, or if we see the old spte has the accessed
>>> bit set (so no race can occur).  I'll update the patches to avoid
>>> atomics when possible.
>>>      
>> Umm, the reason that we need atomics here is to avoid vcpu to update
>> spte when we read A bit
>> form it, so, perhaps we can use below way to avoid atomics completely:
>>
>> - set reserved bit in spte
>> - get A bit form spte
>> - set new spte
>>
>> the worst case is cause vcpu #PF here, but it doesn't matter since the
>> old mapping is already invalid,
>> also need a remote tlb flush later.
>>    
> 
> To set the reserved bit in the spte, you need an atomic operation (well,
> unless you use a sub-word-acccess to set a reserved bit in the high 32
> bits).

I think we don't need an atomic here; for example, we can do it like this:

*spte |= RSVD_BIT
[ maybe need a write barrier here? ]

After this statement completes, we can ensure that the vcpu can not update the A bit
in the spte, so we can read the A bit safely.

> 
>>> I don't think atomics are that expensive, though, ~20 cycles on modern
>>> processors?
>>>
>>>      
>> Yes, but atomics are "LOCK" instructions, it can stop multiple cpus
>> runing in parallel.
>>    
> 
> Only if those cpus are accessing the same word you're accessing.
> 

Oh, you are right, the LOCK only locks the memory defined by the destination operand,
but I also recall that page table accesses can pass LOCK instructions; the description below
is from Intel's spec Vol. 3 7-5:

Locked operations are atomic with respect to all other memory operations and all externally
visible events. Only instruction fetch and page table accesses can pass locked instructions.
Locked instructions can be used to synchronize data written by one processor and read by another
processor.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 0/4] Fix accessed bit tracking
  2010-06-08  6:53         ` Xiao Guangrong
@ 2010-06-08  7:54           ` Avi Kivity
  2010-06-08  8:30             ` Xiao Guangrong
  0 siblings, 1 reply; 16+ messages in thread
From: Avi Kivity @ 2010-06-08  7:54 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Lai Jiangshan, Marcelo Tosatti, kvm

On 06/08/2010 09:53 AM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>    
>> On 06/08/2010 05:35 AM, Xiao Guangrong wrote:
>>      
>>>        
>>>> We can avoid the exchange in most cases, for example if the new spte has
>>>> the accessed bit set (already in the patch set) or if the page is
>>>> already marked as accessed, or if we see the old spte has the accessed
>>>> bit set (so no race can occur).  I'll update the patches to avoid
>>>> atomics when possible.
>>>>
>>>>          
>>> Umm, the reason that we need atomics here is to avoid vcpu to update
>>> spte when we read A bit
>>> form it, so, perhaps we can use below way to avoid atomics completely:
>>>
>>> - set reserved bit in spte
>>> - get A bit form spte
>>> - set new spte
>>>
>>> the worst case is cause vcpu #PF here, but it doesn't matter since the
>>> old mapping is already invalid,
>>> also need a remote tlb flush later.
>>>
>>>        
>> To set the reserved bit in the spte, you need an atomic operation (well,
>> unless you use a sub-word-acccess to set a reserved bit in the high 32
>> bits).
>>      
> I think we no need atomic here, for example, we can do it like this:
>
> *spte |= RSVD_BIT
> [ maybe need a write barrier here? ]
>    

That can drop an A bit.  If *spte starts out with A cleared, we can have

cpu0                        cpu1

fetch *spte (A=0)
                             set A bit
write *spte (A=0, RSVD=1)


> After this sentence completed, we can ensure that the spte can not updated A bit
> by vcpu, so we can get A bit safely.
>    

You also need a remote tlb flush...

>>> Yes, but atomics are "LOCK" instructions, it can stop multiple cpus
>>> runing in parallel.
>>>
>>>        
>> Only if those cpus are accessing the same word you're accessing.
>>
>>      
> Oh, you are right, the LOCK only locked the memory defined by the destination operand,
> but i also recall that page table access can pass LOCK instruction, below description
> is form intel' spec Vol. 3 7-5:
>
> Locked operations are atomic with respect to all other memory operations and all externally
> visible events. Only instruction fetch and page table accesses can pass locked instructions.
> Locked instructions can be used to synchronize data written by one processor and read by another
> processor.
>    

But actually setting the A bit will use LOCK itself.  So in the 
following sequence


    write pte (A=0)
    test_and_clear_bit(A, pte)
    access memory through pte

the test_and_clear_bit can return A=1 due to speculation and the 
paragraph above, but setting the A bit by the processor will happen 
with a bus lock, so it won't lose information.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 0/4] Fix accessed bit tracking
  2010-06-08  7:54           ` Avi Kivity
@ 2010-06-08  8:30             ` Xiao Guangrong
  0 siblings, 0 replies; 16+ messages in thread
From: Xiao Guangrong @ 2010-06-08  8:30 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Lai Jiangshan, Marcelo Tosatti, kvm



Avi Kivity wrote:
> On 06/08/2010 09:53 AM, Xiao Guangrong wrote:
>>
>> Avi Kivity wrote:
>>   
>>> On 06/08/2010 05:35 AM, Xiao Guangrong wrote:
>>>     
>>>>       
>>>>> We can avoid the exchange in most cases, for example if the new
>>>>> spte has
>>>>> the accessed bit set (already in the patch set) or if the page is
>>>>> already marked as accessed, or if we see the old spte has the accessed
>>>>> bit set (so no race can occur).  I'll update the patches to avoid
>>>>> atomics when possible.
>>>>>
>>>>>          
>>>> Umm, the reason that we need atomics here is to prevent the vcpu from
>>>> updating the spte while we read the A bit
>>>> from it, so perhaps we can use the way below to avoid atomics completely:
>>>>
>>>> - set reserved bit in spte
>>>> - get A bit form spte
>>>> - set new spte
>>>>
>>>> the worst case is cause vcpu #PF here, but it doesn't matter since the
>>>> old mapping is already invalid,
>>>> also need a remote tlb flush later.
>>>>
>>>>        
>>> To set the reserved bit in the spte, you need an atomic operation (well,
>>> unless you use a sub-word access to set a reserved bit in the high 32
>>> bits).
>>>      
>> I think we do not need an atomic here; for example, we can do it like this:
>>
>> *spte |= RSVD_BIT
>> [ maybe need a write barrier here? ]
>>    
> 
> That can drop an A bit.  If *spte starts out with A cleared, we can have
> 
> cpu0                        cpu1
> 
> fetch *spte (A=0)
>                             set A bit
> write *spte (A=0, RSVD=1)
> 

Yes, you are right, I forgot about that :-(; we can avoid it by only touching the higher
32 bits as you say.

> 
>> After this statement has completed, we can ensure that the vcpu can not
>> update the A bit in the spte,
>> so we can get the A bit safely.
>>    
> 
> You also need a remote tlb flush...

Maybe it is not needed, since we only need to get the A bit here; after:

set reserved bit in spte

if the spte is in the tlb, the A bit must be 1, and we can get it correctly later;
otherwise, if the cpu tries to access the 'spte' mapping, it will cause a #PF

> 
>>>> Yes, but atomics are "LOCK" instructions, which can stop multiple cpus
>>>> running in parallel.
>>>>
>>>>        
>>> Only if those cpus are accessing the same word you're accessing.
>>>
>>>      
>> Oh, you are right, the LOCK only locks the memory defined by the
>> destination operand,
>> but I also recall that page table accesses can pass LOCK instructions;
>> the description below
>> is from Intel's spec Vol. 3 7-5:
>>
>> Locked operations are atomic with respect to all other memory
>> operations and all externally
>> visible events. Only instruction fetch and page table accesses can
>> pass locked instructions.
>> Locked instructions can be used to synchronize data written by one
>> processor and read by another
>> processor.
>>    
> 
> But actually setting the A bit will use LOCK itself.  So in the
> following sequence
> 
> 
>    write pte (A=0)
>    test_and_clear_bit(A, pte)
>    access memory through pte
> 
> the test_and_clear_bit can return A=1 due to speculation and the
> paragraph above, but setting the A bit by the processor will happen
> with a bus lock, so it won't lose information.
> 

OH, sorry for my fault, thanks a lot, Avi

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2010-06-08  8:33 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-06-07  7:10 [PATCH v2 0/4] Fix accessed bit tracking Avi Kivity
2010-06-07  7:10 ` [PATCH v2 1/4] KVM: MMU: Introduce drop_spte() Avi Kivity
2010-06-07  7:10 ` [PATCH v2 2/4] KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Avi Kivity
2010-06-07  8:16   ` Lai Jiangshan
2010-06-07  9:01     ` Avi Kivity
2010-06-07  7:10 ` [PATCH v2 3/4] KVM: MMU: Atomically check for accessed bit when dropping an spte Avi Kivity
2010-06-08  2:07   ` Xiao Guangrong
2010-06-08  5:51     ` Avi Kivity
2010-06-07  7:10 ` [PATCH v2 4/4] KVM: MMU: Don't drop accessed bit while updating " Avi Kivity
2010-06-07  8:43 ` [PATCH v2 0/4] Fix accessed bit tracking Lai Jiangshan
2010-06-07  9:00   ` Avi Kivity
2010-06-08  2:35     ` Xiao Guangrong
2010-06-08  5:24       ` Avi Kivity
2010-06-08  6:53         ` Xiao Guangrong
2010-06-08  7:54           ` Avi Kivity
2010-06-08  8:30             ` Xiao Guangrong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox