* [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache
@ 2024-08-21 20:28 David Woodhouse
2024-08-21 20:28 ` [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() David Woodhouse
` (4 more replies)
0 siblings, 5 replies; 8+ messages in thread
From: David Woodhouse @ 2024-08-21 20:28 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Hussain, Mushahid
Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm,
linux-kernel, Mingwei Zhang, Maxim Levitsky
From: David Woodhouse <dwmw@amazon.co.uk>
This will be used to allow hva_to_pfn_retry() to be more selective about
its retry loop, which is currently extremely pessimistic.
It allows for invalidations to occur even while the PFN is being mapped
(which happens with the lock dropped), before the GPC is fully valid.
No functional change yet, as the existing mmu_notifier_retry_cache()
function will still return true in all cases where the invalidation
may have triggered.
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
include/linux/kvm_types.h | 1 +
virt/kvm/pfncache.c | 29 ++++++++++++++++++++++++-----
2 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10..4d8fbd87c320 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -69,6 +69,7 @@ struct gfn_to_pfn_cache {
void *khva;
kvm_pfn_t pfn;
bool active;
+ bool needs_invalidation;
bool valid;
};
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f0039efb9e1e..7007d32d197a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -32,7 +32,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
read_lock_irq(&gpc->lock);
/* Only a single page so no need to care about length */
- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+ if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
gpc->uhva >= start && gpc->uhva < end) {
read_unlock_irq(&gpc->lock);
@@ -45,9 +45,11 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
*/
write_lock_irq(&gpc->lock);
- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
- gpc->uhva >= start && gpc->uhva < end)
+ if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
+ gpc->uhva >= start && gpc->uhva < end) {
+ gpc->needs_invalidation = false;
gpc->valid = false;
+ }
write_unlock_irq(&gpc->lock);
continue;
}
@@ -93,6 +95,9 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
if (!gpc->valid)
return false;
+ /* If it's valid, it needs invalidation! */
+ WARN_ON_ONCE(!gpc->needs_invalidation);
+
return true;
}
@@ -175,6 +180,17 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
mmu_seq = gpc->kvm->mmu_invalidate_seq;
smp_rmb();
+ /*
+ * The translation made by hva_to_pfn() below could be made
+ * invalid as soon as it's mapped. But the uhva is already
+ * known and that's all that gfn_to_pfn_cache_invalidate()
+ * looks at. So set the 'needs_invalidation' flag to allow
+ * the GPC to be marked invalid from the moment the lock is
+ * dropped, before the corresponding PFN is even found (and,
+ * more to the point, immediately afterwards).
+ */
+ gpc->needs_invalidation = true;
+
write_unlock_irq(&gpc->lock);
/*
@@ -224,7 +240,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
* attempting to refresh.
*/
WARN_ON_ONCE(gpc->valid);
- } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+ } while (!gpc->needs_invalidation ||
+ mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
gpc->valid = true;
gpc->pfn = new_pfn;
@@ -339,6 +356,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
*/
if (ret) {
gpc->valid = false;
+ gpc->needs_invalidation = false;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->khva = NULL;
}
@@ -383,7 +401,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->gpa = INVALID_GPA;
gpc->uhva = KVM_HVA_ERR_BAD;
- gpc->active = gpc->valid = false;
+ gpc->active = gpc->valid = gpc->needs_invalidation = false;
}
static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
@@ -453,6 +471,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
write_lock_irq(&gpc->lock);
gpc->active = false;
gpc->valid = false;
+ gpc->needs_invalidation = false;
/*
* Leave the GPA => uHVA cache intact, it's protected by the
--
2.44.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() 2024-08-21 20:28 [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse @ 2024-08-21 20:28 ` David Woodhouse 2024-10-16 0:04 ` Sean Christopherson 2024-08-21 20:28 ` [PATCH v3 3/5] KVM: pfncache: Wait for pending invalidations instead of spinning David Woodhouse ` (3 subsequent siblings) 4 siblings, 1 reply; 8+ messages in thread From: David Woodhouse @ 2024-08-21 20:28 UTC (permalink / raw) To: Sean Christopherson, Paolo Bonzini, Hussain, Mushahid Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm, linux-kernel, Mingwei Zhang, Maxim Levitsky From: David Woodhouse <dwmw@amazon.co.uk> The existing retry loop in hva_to_pfn_retry() is extremely pessimistic. If there are any concurrent invalidations running, it's effectively just a complex busy wait loop because its local mmu_notifier_retry_cache() function will always return true. Since multiple invalidations can be running in parallel, this can result in a situation where hva_to_pfn_retry() just backs off and keep retrying for ever, not making any progress. Solve this by being a bit more selective about when to retry. In addition to the ->needs_invalidation flag added in a previous commit, check before calling hva_to_pfn() if there is any pending invalidation (i.e. between invalidate_range_start() and invalidate_range_end()) which affects the uHVA of the GPC being updated. This catches the case where hva_to_pfn() would return a soon-to-be-stale mapping of a range for which the invalidate_range_start() hook had already been called before the uHVA was even set in the GPC and the ->needs_invalidation flag set. An invalidation which *starts* after the lock is dropped in the loop is fine because it will clear the ->needs_revalidation flag and also trigger a retry. 
This is slightly more complex than it needs to be; moving the invalidation to occur in the invalidate_range_end() hook will make life simpler, in a subsequent commit. Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 20 ++++++++++++++ virt/kvm/pfncache.c | 60 +++++++++++++++++++++------------------- 3 files changed, 54 insertions(+), 28 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 79a6b1a63027..1bfe2e8d52cd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -770,6 +770,8 @@ struct kvm { /* For management / invalidation of gfn_to_pfn_caches */ spinlock_t gpc_lock; struct list_head gpc_list; + u64 mmu_gpc_invalidate_range_start; + u64 mmu_gpc_invalidate_range_end; /* * created_vcpus is protected by kvm->lock, and is incremented diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 92901656a0d4..84eb1ebb6f47 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, */ spin_lock(&kvm->mn_invalidate_lock); kvm->mn_active_invalidate_count++; + if (likely(kvm->mn_active_invalidate_count == 1)) { + kvm->mmu_gpc_invalidate_range_start = range->start; + kvm->mmu_gpc_invalidate_range_end = range->end; + } else { + /* + * Fully tracking multiple concurrent ranges has diminishing + * returns. Keep things simple and just find the minimal range + * which includes the current and new ranges. As there won't be + * enough information to subtract a range after its invalidate + * completes, any ranges invalidated concurrently will + * accumulate and persist until all outstanding invalidates + * complete. 
+ */ + kvm->mmu_gpc_invalidate_range_start = + min(kvm->mmu_gpc_invalidate_range_start, range->start); + kvm->mmu_gpc_invalidate_range_end = + max(kvm->mmu_gpc_invalidate_range_end, range->end); + } spin_unlock(&kvm->mn_invalidate_lock); /* @@ -1164,6 +1182,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); + kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD; + kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD; INIT_LIST_HEAD(&kvm->devices); kvm->max_vcpus = KVM_MAX_VCPUS; diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 7007d32d197a..eeb9bf43c04a 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -129,32 +129,17 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva) #endif } -static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq) +static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc) { /* - * mn_active_invalidate_count acts for all intents and purposes - * like mmu_invalidate_in_progress here; but the latter cannot - * be used here because the invalidation of caches in the - * mmu_notifier event occurs _before_ mmu_invalidate_in_progress - * is elevated. - * - * Note, it does not matter that mn_active_invalidate_count - * is not protected by gpc->lock. It is guaranteed to - * be elevated before the mmu_notifier acquires gpc->lock, and - * isn't dropped until after mmu_invalidate_seq is updated. + * No need for locking on GPC here because these fields are protected + * by gpc->refresh_lock. */ - if (kvm->mn_active_invalidate_count) - return true; + guard(spinlock)(&gpc->kvm->mn_invalidate_lock); - /* - * Ensure mn_active_invalidate_count is read before - * mmu_invalidate_seq. This pairs with the smp_wmb() in - * mmu_notifier_invalidate_range_end() to guarantee either the - * old (non-zero) value of mn_active_invalidate_count or the - * new (incremented) value of mmu_invalidate_seq is observed. 
- */ - smp_rmb(); - return kvm->mmu_invalidate_seq != mmu_seq; + return unlikely(gpc->kvm->mn_active_invalidate_count) && + (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) && + (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva); } static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) @@ -163,7 +148,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; - unsigned long mmu_seq; lockdep_assert_held(&gpc->refresh_lock); @@ -177,9 +161,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) gpc->valid = false; do { - mmu_seq = gpc->kvm->mmu_invalidate_seq; - smp_rmb(); - /* * The translation made by hva_to_pfn() below could be made * invalid as soon as it's mapped. But the uhva is already @@ -193,6 +174,29 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) write_unlock_irq(&gpc->lock); + /* + * Invalidation occurs from the invalidate_range_start() hook, + * which could already have happened before __kvm_gpc_refresh() + * (or the previous turn around this loop) took gpc->lock(). + * If so, and if the corresponding invalidate_range_end() hook + * hasn't happened yet, hva_to_pfn() could return a mapping + * which is about to be stale and which should not be used. So + * check if there are any currently-running invalidations which + * affect the uHVA of this GPC, and retry if there are. Any + * invalidation which starts after gpc->needs_invalidation is + * set is fine, because it will clear that flag and trigger a + * retry. And any invalidation which *completes* by having its + * invalidate_range_end() hook called immediately prior to this + * check is also fine, because the page tables are guaranteed + * to have been changted already, so hva_to_pfn() won't return + * a stale mapping in that case anyway. 
+ */ + while (gpc_invalidations_pending(gpc)) { + cond_resched(); + write_lock_irq(&gpc->lock); + continue; + } + /* * If the previous iteration "failed" due to an mmu_notifier * event, release the pfn and unmap the kernel virtual address @@ -233,6 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) goto out_error; } + write_lock_irq(&gpc->lock); /* @@ -240,8 +245,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); - } while (!gpc->needs_invalidation || - mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); + } while (!gpc->needs_invalidation); gpc->valid = true; gpc->pfn = new_pfn; -- 2.44.0 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() 2024-08-21 20:28 ` [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() David Woodhouse @ 2024-10-16 0:04 ` Sean Christopherson 0 siblings, 0 replies; 8+ messages in thread From: Sean Christopherson @ 2024-10-16 0:04 UTC (permalink / raw) To: David Woodhouse Cc: Paolo Bonzini, Mushahid Hussain, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm, linux-kernel, Mingwei Zhang, Maxim Levitsky [-- Attachment #1: Type: text/plain, Size: 6230 bytes --] On Wed, Aug 21, 2024, David Woodhouse wrote: > From: David Woodhouse <dwmw@amazon.co.uk> > > The existing retry loop in hva_to_pfn_retry() is extremely pessimistic. > If there are any concurrent invalidations running, it's effectively just > a complex busy wait loop because its local mmu_notifier_retry_cache() > function will always return true. > > Since multiple invalidations can be running in parallel, this can result > in a situation where hva_to_pfn_retry() just backs off and keep retrying > for ever, not making any progress. > > Solve this by being a bit more selective about when to retry. In > addition to the ->needs_invalidation flag added in a previous commit, > check before calling hva_to_pfn() if there is any pending invalidation > (i.e. between invalidate_range_start() and invalidate_range_end()) which > affects the uHVA of the GPC being updated. This catches the case where > hva_to_pfn() would return a soon-to-be-stale mapping of a range for which > the invalidate_range_start() hook had already been called before the uHVA > was even set in the GPC and the ->needs_invalidation flag set. > > An invalidation which *starts* after the lock is dropped in the loop is > fine because it will clear the ->needs_revalidation flag and also trigger > a retry. 
> > This is slightly more complex than it needs to be; moving the > invalidation to occur in the invalidate_range_end() hook will make life > simpler, in a subsequent commit. I'm pretty sure that's still more complex than it needs to be. And even if the gpc implementation isn't any more/less complex, diverging from the page fault side of things makes KVM overall more complex. Maybe I'm just not seeing some functionality that needs_invalidation provides, but I still don't understand why it's necessary. Omitting context to keep this short, I think the below is what we can end up at; see attached (very lightly tested) patches for the full idea. So long as either the invalidation side marks the gpc invalid, or the refresh side is guaranteed to check for an invalidation, I don't see why needs_invalidation is required. If the cache is invalid, or the uhva is changing, then the event doesn't need to do anything because refresh() will hva_to_pfn_retry() to re-resolve the pfn. So at that point, KVM just needs to ensure either hva_to_pfn_retry() observes the in-progress range, or the event sees valid==true. Note, the attached patches include the "wait" idea, but I'm not convinced that's actually a good idea. I'll respond more to your last patch. gfn_to_pfn_cache_invalidate_start(): ... /* * Ensure that either each cache sees the to-be-invalidated range and * retries if necessary, or that this task sees the cache's valid flag * and invalidates the cache before completing the mmu_notifier event. * Note, gpc->uhva must be set before gpc->valid, and if gpc->uhva is * modified the cache must be re-validated. Pairs with the smp_mb() in * hva_to_pfn_retry(). 
*/ smp_mb__before_atomic(); spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { if (!gpc->valid || gpc->uhva < start || gpc->uhva >= end) continue; write_lock_irq(&gpc->lock); /* * Verify the cache still needs to be invalidated after * acquiring gpc->lock, to avoid an unnecessary invalidation * in the unlikely scenario the cache became valid with a * different userspace virtual address. */ if (gpc->valid && gpc->uhva >= start && gpc->uhva < end) gpc->valid = false; write_unlock_irq(&gpc->lock); } spin_unlock(&kvm->gpc_lock); hva_to_pfn_retry(): do { /* * Invalidate the cache prior to dropping gpc->lock, the * gpa=>uhva assets have already been updated and so a check() * from a different task may not fail the gpa/uhva/generation * checks, i.e. must observe valid==false. */ gpc->valid = false; write_unlock_irq(&gpc->lock); ... write_lock_irq(&gpc->lock); /* * Other tasks must wait for _this_ refresh to complete before * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); gpc->valid = true; /* * Ensure valid is visible before checking if retry is needed. * Pairs with the smp_mb__before_atomic() in * gfn_to_pfn_cache_invalidate(). */ smp_mb(); } while (gpc_invalidate_retry_hva(gpc, mmu_seq)); > @@ -193,6 +174,29 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > > write_unlock_irq(&gpc->lock); > > + /* > + * Invalidation occurs from the invalidate_range_start() hook, > + * which could already have happened before __kvm_gpc_refresh() > + * (or the previous turn around this loop) took gpc->lock(). > + * If so, and if the corresponding invalidate_range_end() hook > + * hasn't happened yet, hva_to_pfn() could return a mapping > + * which is about to be stale and which should not be used. So > + * check if there are any currently-running invalidations which > + * affect the uHVA of this GPC, and retry if there are. 
Any > + * invalidation which starts after gpc->needs_invalidation is > + * set is fine, because it will clear that flag and trigger a > + * retry. And any invalidation which *completes* by having its > + * invalidate_range_end() hook called immediately prior to this > + * check is also fine, because the page tables are guaranteed > + * to have been changted already, so hva_to_pfn() won't return s/changted/changed > + * a stale mapping in that case anyway. > + */ > + while (gpc_invalidations_pending(gpc)) { > + cond_resched(); > + write_lock_irq(&gpc->lock); > + continue; Heh, our testcase obviously isn't as robust as we hope it is. Or maybe only the "wait" version was tested. The "continue" here will continue the while(gpc_invalidations_pending) loop, not the outer while loop. That will cause deadlock due to trying acquiring gpc->lock over and over. > + } > + > /* > * If the previous iteration "failed" due to an mmu_notifier > * event, release the pfn and unmap the kernel virtual address > @@ -233,6 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > goto out_error; > } > > + Spurious whitespace. > write_lock_irq(&gpc->lock); > > /* [-- Attachment #2: 0001-KVM-pfncache-Snapshot-mmu_invalidate_seq-immediately.patch --] [-- Type: text/x-diff, Size: 1691 bytes --] From 4e34801acab175913b6fd5b6c7c4aa1350d5e571 Mon Sep 17 00:00:00 2001 From: Sean Christopherson <seanjc@google.com> Date: Tue, 15 Oct 2024 16:35:15 -0700 Subject: [PATCH 1/5] KVM: pfncache: Snapshot mmu_invalidate_seq immediately before hva_to_pfn() Grab the snapshot of the mmu_notifier sequence counter immediately before calling hva_to_pfn() when refreching a gpc, as there's no requirement that the snapshot be taken while holding gpc->lock; the sequence counter is completely independent of locking. 
This will allow waiting on in-progress invalidations to complete, instead of actively trying to resolve a pfn that KVM is guaranteed to discard (because either the invalidation will still be in-progress, or it will have completed and bumped the sequence counter). Signed-off-by: Sean Christopherson <seanjc@google.com> --- virt/kvm/pfncache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index f0039efb9e1e..4afbc1262e3f 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -172,9 +172,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) gpc->valid = false; do { - mmu_seq = gpc->kvm->mmu_invalidate_seq; - smp_rmb(); - write_unlock_irq(&gpc->lock); /* @@ -197,6 +194,9 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) cond_resched(); } + mmu_seq = gpc->kvm->mmu_invalidate_seq; + smp_rmb(); + /* We always request a writeable mapping */ new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); if (is_error_noslot_pfn(new_pfn)) base-commit: 8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b -- 2.47.0.rc1.288.g06298d1525-goog [-- Attachment #3: 0002-KVM-pfncache-Wait-for-in-progress-invalidations-to-c.patch --] [-- Type: text/x-diff, Size: 1607 bytes --] From b023d3ada9c856cbfe38839068861e5f2e19489f Mon Sep 17 00:00:00 2001 From: Sean Christopherson <seanjc@google.com> Date: Tue, 15 Oct 2024 16:33:58 -0700 Subject: [PATCH 2/5] KVM: pfncache: Wait for in-progress invalidations to complete during refresh When refreshing a gpc, wait for in-progress invalidations to complete before reading the sequence counter and resolving the pfn. Resolving the pfn when there is an in-progress invalidation is worse than pointless, as the pfn is guaranteed to be discarded, and trying to resolve the pfn could contended for resources, e.g. mmap_lock, and make the invalidation and thus refresh take longer to complete. 
Suggested-by: David Woodhouse <dwmw@amazon.co.uk> Signed-off-by: Sean Christopherson <seanjc@google.com> --- virt/kvm/pfncache.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 4afbc1262e3f..957f739227ab 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -194,6 +194,18 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) cond_resched(); } + /* + * Wait for in-progress invalidations to complete if the user + * already being invalidated. Unlike the page fault path, this + * task _must_ complete the refresh, i.e. there's no value in + * trying to race ahead in the hope that a different task makes + * the cache valid. + */ + while (READ_ONCE(gpc->kvm->mn_active_invalidate_count)) { + if (!cond_resched()) + cpu_relax(); + } + mmu_seq = gpc->kvm->mmu_invalidate_seq; smp_rmb(); -- 2.47.0.rc1.288.g06298d1525-goog [-- Attachment #4: 0003-KVM-pfncache-Implement-range-based-invalidation-chec.patch --] [-- Type: text/x-diff, Size: 8693 bytes --] From 98de4aad0c442958b95ca62e0a9ff1a736b71bb7 Mon Sep 17 00:00:00 2001 From: David Woodhouse <dwmw@amazon.co.uk> Date: Tue, 15 Oct 2024 12:49:06 -0700 Subject: [PATCH 3/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() The existing retry loop in hva_to_pfn_retry() is extremely pessimistic. If there are any concurrent invalidations running, it's effectively just a complex busy wait loop because its local mmu_notifier_retry_cache() function will always return true. Even worse, caches are forced to wait even if the invalidations do not affect the cache's virtual address range. Since multiple invalidations can be running in parallel, this can result in a situation where hva_to_pfn_retry() just backs off and keeps retrying forever, not making any progress. Mitigate the badness by being a bit more selective about when to retry. Specifically, retry only if an in-progress invalidation range affects the cache, i.e. 
implement range-based invalidation for caches, similar to how KVM imlements range-based invalidation for page faults. Note, like the existing range-based invalidation, this approach doesn't completely prevent false positives since the in-progress range never shrinks. E.g. if userspace is spamming invalidations *and* simultaneously invalidates a cache, the cache will still get stuck until the spam stops. However, that problem already exists with page faults, and precisely tracking in-progress ranges would add significant complexity. Defer trying to address that issue until it actually becomes a problem, which in all likelihood will never happen. Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 7 ++-- virt/kvm/pfncache.c | 75 ++++++++++++++++++++++++++++------------ 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index db567d26f7b9..2c0ed735f0f4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -785,6 +785,8 @@ struct kvm { /* For management / invalidation of gfn_to_pfn_caches */ spinlock_t gpc_lock; struct list_head gpc_list; + u64 mmu_gpc_invalidate_range_start; + u64 mmu_gpc_invalidate_range_end; /* * created_vcpus is protected by kvm->lock, and is incremented diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 05cbb2548d99..b9223ecab2ca 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -775,7 +775,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, */ spin_lock(&kvm->mn_invalidate_lock); kvm->mn_active_invalidate_count++; - spin_unlock(&kvm->mn_invalidate_lock); /* * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e. 
@@ -784,10 +783,10 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * any given time, and the caches themselves can check for hva overlap, * i.e. don't need to rely on memslot overlap checks for performance. * Because this runs without holding mmu_lock, the pfn caches must use - * mn_active_invalidate_count (see above) instead of - * mmu_invalidate_in_progress. + * mn_active_invalidate_count instead of mmu_invalidate_in_progress. */ gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); + spin_unlock(&kvm->mn_invalidate_lock); /* * If one or more memslots were found and thus zapped, notify arch code @@ -1164,6 +1163,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); + kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD; + kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD; INIT_LIST_HEAD(&kvm->devices); kvm->max_vcpus = KVM_MAX_VCPUS; diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 957f739227ab..3d5bc9bab3d9 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -27,6 +27,27 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, { struct gfn_to_pfn_cache *gpc; + lockdep_assert_held(&kvm->mn_invalidate_lock); + + if (likely(kvm->mn_active_invalidate_count == 1)) { + kvm->mmu_gpc_invalidate_range_start = start; + kvm->mmu_gpc_invalidate_range_end = end; + } else { + /* + * Fully tracking multiple concurrent ranges has diminishing + * returns. Keep things simple and just find the minimal range + * which includes the current and new ranges. As there won't be + * enough information to subtract a range after its invalidate + * completes, any ranges invalidated concurrently will + * accumulate and persist until all outstanding invalidates + * complete. 
+ */ + kvm->mmu_gpc_invalidate_range_start = + min(kvm->mmu_gpc_invalidate_range_start, start); + kvm->mmu_gpc_invalidate_range_end = + max(kvm->mmu_gpc_invalidate_range_end, end); + } + spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { read_lock_irq(&gpc->lock); @@ -74,6 +95,8 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); + lockdep_assert_held(&gpc->lock); + if (!gpc->active) return false; @@ -124,21 +147,26 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva) #endif } -static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq) +static bool gpc_invalidate_in_progress_hva(struct gfn_to_pfn_cache *gpc) { + struct kvm *kvm = gpc->kvm; + /* - * mn_active_invalidate_count acts for all intents and purposes - * like mmu_invalidate_in_progress here; but the latter cannot - * be used here because the invalidation of caches in the - * mmu_notifier event occurs _before_ mmu_invalidate_in_progress - * is elevated. - * - * Note, it does not matter that mn_active_invalidate_count - * is not protected by gpc->lock. It is guaranteed to - * be elevated before the mmu_notifier acquires gpc->lock, and - * isn't dropped until after mmu_invalidate_seq is updated. + * Ensure the count and range are always read from memory so that + * checking for retry in a loop won't get false negatives on the range, + * and won't result in an infinite retry loop. Note, the range never + * shrinks, only grows, and so the key to avoiding infinite retry loops + * is observing the 1=>0 transition of the count. 
*/ - if (kvm->mn_active_invalidate_count) + return unlikely(READ_ONCE(kvm->mn_active_invalidate_count)) && + READ_ONCE(kvm->mmu_gpc_invalidate_range_start) <= gpc->uhva && + READ_ONCE(kvm->mmu_gpc_invalidate_range_end) > gpc->uhva; +} + +static bool gpc_invalidate_retry_hva(struct gfn_to_pfn_cache *gpc, + unsigned long mmu_seq) +{ + if (gpc_invalidate_in_progress_hva(gpc)) return true; /* @@ -149,7 +177,7 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s * new (incremented) value of mmu_invalidate_seq is observed. */ smp_rmb(); - return kvm->mmu_invalidate_seq != mmu_seq; + return gpc->kvm->mmu_invalidate_seq != mmu_seq; } static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) @@ -164,14 +192,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) lockdep_assert_held_write(&gpc->lock); - /* - * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva - * assets have already been updated and so a concurrent check() from a - * different task may not fail the gpa/uhva/generation checks. - */ - gpc->valid = false; - do { + /* + * Invalidate the cache prior to dropping gpc->lock, the + * gpa=>uhva assets have already been updated and so a check() + * from a different task may not fail the gpa/uhva/generation + * checks, i.e. must observe valid==false. + */ + gpc->valid = false; + write_unlock_irq(&gpc->lock); /* @@ -201,7 +230,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * trying to race ahead in the hope that a different task makes * the cache valid. */ - while (READ_ONCE(gpc->kvm->mn_active_invalidate_count)) { + while (gpc_invalidate_in_progress_hva(gpc)) { if (!cond_resched()) cpu_relax(); } @@ -236,9 +265,9 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * attempting to refresh. 
*/ WARN_ON_ONCE(gpc->valid); - } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); + gpc->valid = true; + } while (gpc_invalidate_retry_hva(gpc, mmu_seq)); - gpc->valid = true; gpc->pfn = new_pfn; gpc->khva = new_khva + offset_in_page(gpc->uhva); -- 2.47.0.rc1.288.g06298d1525-goog [-- Attachment #5: 0004-KVM-pfncache-Add-lockless-checking-of-cache-invalida.patch --] [-- Type: text/x-diff, Size: 3837 bytes --] From 423be552d22d83a49ce4d0ad7865b8f4e3f1eeb0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson <seanjc@google.com> Date: Tue, 15 Oct 2024 13:57:54 -0700 Subject: [PATCH 4/5] KVM: pfncache: Add lockless checking of cache invalidation events When handling an mmu_notifier invalidation event, perform the initial check for overlap with a valid gpc_to_pfn_cache without acquiring the cache's lock, and instead ensure either the invalidation event observes a valid cache, or the cache observes the invalidation event. Locklessly checking gpc->valid and gpc->uhva relies on several existing invariants: - Changing gpc->uhva requires performing a refresh() - A cache can be made valid only during refresh() - Only one task can execute refresh() at a time - Tasks must check() a cache after a successful refresh() - check() must hold gpc->lock (usually for read) The existing invariants means that if KVM observes an invalid gpc, or if the uhva is unstable, then a refresh() is in-progress or will be performed prior to consuming the new uhva. And so KVM simply needs to ensure that refresh() sees the invalidation and retries, or that the invalidation sees the fully valid gpc. 
Signed-off-by: Sean Christopherson <seanjc@google.com> --- virt/kvm/pfncache.c | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 3d5bc9bab3d9..2163bb6b899c 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -48,32 +48,32 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, max(kvm->mmu_gpc_invalidate_range_end, end); } + /* + * Ensure that either each cache sees the to-be-invalidated range and + * retries if necessary, or that this task sees the cache's valid flag + * and invalidates the cache before completing the mmu_notifier event. + * Note, gpc->uhva must be set before gpc->valid, and if gpc->uhva is + * modified the cache must be re-validated. Pairs with the smp_mb() in + * hva_to_pfn_retry(). + */ + smp_mb__before_atomic(); + spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { - read_lock_irq(&gpc->lock); - - /* Only a single page so no need to care about length */ - if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && - gpc->uhva >= start && gpc->uhva < end) { - read_unlock_irq(&gpc->lock); - - /* - * There is a small window here where the cache could - * be modified, and invalidation would no longer be - * necessary. Hence check again whether invalidation - * is still necessary once the write lock has been - * acquired. - */ - - write_lock_irq(&gpc->lock); - if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && - gpc->uhva >= start && gpc->uhva < end) - gpc->valid = false; - write_unlock_irq(&gpc->lock); + if (!gpc->valid || gpc->uhva < start || gpc->uhva >= end) continue; - } - read_unlock_irq(&gpc->lock); + write_lock_irq(&gpc->lock); + + /* + * Verify the cache still needs to be invalidated after + * acquiring gpc->lock, to avoid an unnecessary invalidation + * in the unlikely scenario the cache became valid with a + * different userspace virtual address. 
+ */ + if (gpc->valid && gpc->uhva >= start && gpc->uhva < end) + gpc->valid = false; + write_unlock_irq(&gpc->lock); } spin_unlock(&kvm->gpc_lock); } @@ -266,6 +266,13 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) */ WARN_ON_ONCE(gpc->valid); gpc->valid = true; + + /* + * Ensure valid is visible before checking if retry is needed. + * Pairs with the smp_mb__before_atomic() in + * gfn_to_pfn_cache_invalidate(). + */ + smp_mb(); } while (gpc_invalidate_retry_hva(gpc, mmu_seq)); gpc->pfn = new_pfn; -- 2.47.0.rc1.288.g06298d1525-goog [-- Attachment #6: 0005-KVM-pfncache-Wait-for-pending-invalidations-instead-.patch --] [-- Type: text/x-diff, Size: 3904 bytes --] From 6df3fbb08269acfbf28b2cf2fab12c1a0e7c52f7 Mon Sep 17 00:00:00 2001 From: David Woodhouse <dwmw@amazon.co.uk> Date: Tue, 15 Oct 2024 14:55:49 -0700 Subject: [PATCH 5/5] KVM: pfncache: Wait for pending invalidations instead of spinning The busy loop in hva_to_pfn_retry() is worse than a normal page fault retry loop because it spins even while it's waiting for the invalidation to complete. It isn't just that a page might get faulted out again before it's actually accessed. Introduce a wait queue to be woken when kvm->mn_active_invalidate_count reaches zero, and wait on it if there is any pending invalidation which affects the GPC being refreshed. 
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> [sean: massage comment as part of rebasing] Signed-off-by: Sean Christopherson <seanjc@google.com> --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 9 ++++++--- virt/kvm/pfncache.c | 30 ++++++++++++++++++++++++++---- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c0ed735f0f4..a9d7b2200b6f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -787,6 +787,7 @@ struct kvm { struct list_head gpc_list; u64 mmu_gpc_invalidate_range_start; u64 mmu_gpc_invalidate_range_end; + wait_queue_head_t gpc_invalidate_wq; /* * created_vcpus is protected by kvm->lock, and is incremented diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b9223ecab2ca..3ba6d109a941 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -849,11 +849,13 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, spin_unlock(&kvm->mn_invalidate_lock); /* - * There can only be one waiter, since the wait happens under - * slots_lock. + * There can only be one memslots waiter, since the wait happens under + * slots_lock, but there can be multiple gpc waiters. 
*/ - if (wake) + if (wake) { + wake_up(&kvm->gpc_invalidate_wq); rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); + } } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, @@ -1163,6 +1165,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); + init_waitqueue_head(&kvm->gpc_invalidate_wq); kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD; kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD; diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 2163bb6b899c..77cc5633636a 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -180,6 +180,30 @@ static bool gpc_invalidate_retry_hva(struct gfn_to_pfn_cache *gpc, return gpc->kvm->mmu_invalidate_seq != mmu_seq; } +static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc) +{ + struct kvm *kvm = gpc->kvm; + + spin_lock(&kvm->mn_invalidate_lock); + if (gpc_invalidate_in_progress_hva(gpc)) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&kvm->gpc_invalidate_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (!gpc_invalidate_in_progress_hva(gpc)) + break; + + spin_unlock(&kvm->mn_invalidate_lock); + schedule(); + spin_lock(&kvm->mn_invalidate_lock); + } + finish_wait(&kvm->gpc_invalidate_wq, &wait); + } + spin_unlock(&kvm->mn_invalidate_lock); +} + static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) { /* Note, the new page offset may be different than the old! */ @@ -230,10 +254,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * trying to race ahead in the hope that a different task makes * the cache valid. */ - while (gpc_invalidate_in_progress_hva(gpc)) { - if (!cond_resched()) - cpu_relax(); - } + while (gpc_invalidate_in_progress_hva(gpc)) + gpc_wait_for_invalidations(gpc); mmu_seq = gpc->kvm->mmu_invalidate_seq; smp_rmb(); -- 2.47.0.rc1.288.g06298d1525-goog ^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH v3 3/5] KVM: pfncache: Wait for pending invalidations instead of spinning 2024-08-21 20:28 [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse 2024-08-21 20:28 ` [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() David Woodhouse @ 2024-08-21 20:28 ` David Woodhouse 2024-08-21 20:28 ` [PATCH v3 4/5] KVM: pfncache: Move invalidation to invalidate_range_end hook David Woodhouse ` (2 subsequent siblings) 4 siblings, 0 replies; 8+ messages in thread From: David Woodhouse @ 2024-08-21 20:28 UTC (permalink / raw) To: Sean Christopherson, Paolo Bonzini, Hussain, Mushahid Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm, linux-kernel, Mingwei Zhang, Maxim Levitsky From: David Woodhouse <dwmw@amazon.co.uk> The busy loop in hva_to_pfn_retry() is worse than a normal page fault retry loop because it spins even while it's waiting for the invalidation to complete. It isn't just that a page might get faulted out again before it's actually accessed. Introduce a wait queue to be woken when kvm->mn_active_invalidate_count reaches zero, and wait on it if there is any pending invalidation which affects the GPC being refreshed. 
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 5 ++++- virt/kvm/pfncache.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bfe2e8d52cd..a0739c343da5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -772,6 +772,7 @@ struct kvm { struct list_head gpc_list; u64 mmu_gpc_invalidate_range_start; u64 mmu_gpc_invalidate_range_end; + wait_queue_head_t gpc_invalidate_wq; /* * created_vcpus is protected by kvm->lock, and is incremented diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 84eb1ebb6f47..e04eb700448b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -871,8 +871,10 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, * There can only be one waiter, since the wait happens under * slots_lock. */ - if (wake) + if (wake) { + wake_up(&kvm->gpc_invalidate_wq); rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); + } } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, @@ -1182,6 +1184,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); + init_waitqueue_head(&kvm->gpc_invalidate_wq); kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD; kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD; diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index eeb9bf43c04a..fa494eb3d924 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -135,13 +135,38 @@ static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc) * No need for locking on GPC here because these fields are protected * by gpc->refresh_lock. 
*/ - guard(spinlock)(&gpc->kvm->mn_invalidate_lock); - return unlikely(gpc->kvm->mn_active_invalidate_count) && (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) && (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva); } +static bool gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc) +{ + bool waited = false; + + spin_lock(&gpc->kvm->mn_invalidate_lock); + if (gpc_invalidations_pending(gpc)) { + DEFINE_WAIT(wait); + + waited = true; + for (;;) { + prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (!gpc_invalidations_pending(gpc)) + break; + + spin_unlock(&gpc->kvm->mn_invalidate_lock); + schedule(); + spin_lock(&gpc->kvm->mn_invalidate_lock); + } + finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait); + } + spin_unlock(&gpc->kvm->mn_invalidate_lock); + return waited; +} + + static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) { /* Note, the new page offset may be different than the old! */ @@ -191,8 +216,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * to have been changted already, so hva_to_pfn() won't return * a stale mapping in that case anyway. */ - while (gpc_invalidations_pending(gpc)) { - cond_resched(); + if (gpc_wait_for_invalidations(gpc)) { write_lock_irq(&gpc->lock); continue; } -- 2.44.0 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH v3 4/5] KVM: pfncache: Move invalidation to invalidate_range_end hook 2024-08-21 20:28 [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse 2024-08-21 20:28 ` [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() David Woodhouse 2024-08-21 20:28 ` [PATCH v3 3/5] KVM: pfncache: Wait for pending invalidations instead of spinning David Woodhouse @ 2024-08-21 20:28 ` David Woodhouse 2024-10-16 0:17 ` Sean Christopherson 2024-08-21 20:28 ` [PATCH v3 5/5] DO NOT MERGE: Hack-a-test to verify gpc invalidation+refresh David Woodhouse 2024-09-19 11:13 ` [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse 4 siblings, 1 reply; 8+ messages in thread From: David Woodhouse @ 2024-08-21 20:28 UTC (permalink / raw) To: Sean Christopherson, Paolo Bonzini, Hussain, Mushahid Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm, linux-kernel, Mingwei Zhang, Maxim Levitsky From: David Woodhouse <dwmw@amazon.co.uk> By performing the invalidation from the 'end' hook, the process is a lot cleaner and simpler because it is guaranteed that ->needs_invalidation will be cleared after hva_to_pfn() has been called, so the only thing that hva_to_pfn_retry() needs to do is *wait* for there to be no pending invalidations. Another false positive on the range based checks is thus removed as well. 
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> --- virt/kvm/kvm_main.c | 20 +++++++---------- virt/kvm/kvm_mm.h | 12 +++++----- virt/kvm/pfncache.c | 55 ++++++++++++++++++++------------------- 3 files changed, 38 insertions(+), 49 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e04eb700448b..cca870f1a78c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -795,18 +795,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, } spin_unlock(&kvm->mn_invalidate_lock); - /* - * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e. - * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring - * each cache's lock. There are relatively few caches in existence at - * any given time, and the caches themselves can check for hva overlap, - * i.e. don't need to rely on memslot overlap checks for performance. - * Because this runs without holding mmu_lock, the pfn caches must use - * mn_active_invalidate_count (see above) instead of - * mmu_invalidate_in_progress. - */ - gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); - /* * If one or more memslots were found and thus zapped, notify arch code * that guest memory has been reclaimed. This needs to be done *after* @@ -860,6 +848,14 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, __kvm_handle_hva_range(kvm, &hva_range); + /* + * It's safe to invalidate the gfn_to_pfn_caches from this 'end' + * hook, because hva_to_pfn_retry() will wait until no active + * invalidations could be affecting the corresponding uHVA + * before allowing a newly mapped GPC to become valid. + */ + gfn_to_pfn_cache_invalidate(kvm, range->start, range->end); + /* Pairs with the increment in range_start(). 
*/ spin_lock(&kvm->mn_invalidate_lock); if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count)) diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 715f19669d01..34e4e67f09f8 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -24,13 +24,13 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, bool *async, bool write_fault, bool *writable); #ifdef CONFIG_HAVE_KVM_PFNCACHE -void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, - unsigned long start, - unsigned long end); +void gfn_to_pfn_cache_invalidate(struct kvm *kvm, + unsigned long start, + unsigned long end); #else -static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, - unsigned long start, - unsigned long end) +static inline void gfn_to_pfn_cache_invalidate(struct kvm *kvm, + unsigned long start, + unsigned long end) { } #endif /* HAVE_KVM_PFNCACHE */ diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index fa494eb3d924..3f48df8cd6e5 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -20,10 +20,15 @@ #include "kvm_mm.h" /* - * MMU notifier 'invalidate_range_start' hook. + * MMU notifier 'invalidate_range_end' hook. The hva_to_pfn_retry() function + * below may look up a PFN just before it is zapped, and may be mapping it + * concurrently with the actual invalidation (with the GPC lock dropped). By + * using a separate 'needs_invalidation' flag, the concurrent invalidation + * can handle this case, causing hva_to_pfn_retry() to drop its result and + * retry correctly. 
*/ -void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, - unsigned long end) +void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start, + unsigned long end) { struct gfn_to_pfn_cache *gpc; @@ -140,15 +145,12 @@ static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc) (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva); } -static bool gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc) +static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc) { - bool waited = false; - spin_lock(&gpc->kvm->mn_invalidate_lock); if (gpc_invalidations_pending(gpc)) { DEFINE_WAIT(wait); - waited = true; for (;;) { prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait, TASK_UNINTERRUPTIBLE); @@ -163,10 +165,8 @@ static bool gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc) finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait); } spin_unlock(&gpc->kvm->mn_invalidate_lock); - return waited; } - static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) { /* Note, the new page offset may be different than the old! */ @@ -199,28 +199,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) write_unlock_irq(&gpc->lock); - /* - * Invalidation occurs from the invalidate_range_start() hook, - * which could already have happened before __kvm_gpc_refresh() - * (or the previous turn around this loop) took gpc->lock(). - * If so, and if the corresponding invalidate_range_end() hook - * hasn't happened yet, hva_to_pfn() could return a mapping - * which is about to be stale and which should not be used. So - * check if there are any currently-running invalidations which - * affect the uHVA of this GPC, and retry if there are. Any - * invalidation which starts after gpc->needs_invalidation is - * set is fine, because it will clear that flag and trigger a - * retry. 
And any invalidation which *completes* by having its - * invalidate_range_end() hook called immediately prior to this - * check is also fine, because the page tables are guaranteed - * to have been changted already, so hva_to_pfn() won't return - * a stale mapping in that case anyway. - */ - if (gpc_wait_for_invalidations(gpc)) { - write_lock_irq(&gpc->lock); - continue; - } - /* * If the previous iteration "failed" due to an mmu_notifier * event, release the pfn and unmap the kernel virtual address @@ -261,6 +239,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) goto out_error; } + /* + * Avoid populating a GPC with a user address which is already + * being invalidated, if the invalidate_range_start() notifier + * has already been called. The actual invalidation happens in + * the invalidate_range_end() callback, so wait until there are + * no active invalidations (for the relevant HVA). + */ + gpc_wait_for_invalidations(gpc); write_lock_irq(&gpc->lock); @@ -269,6 +255,13 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); + + /* + * Since gfn_to_pfn_cache_invalidate() is called from the + * kvm_mmu_notifier_invalidate_range_end() callback, it can + * invalidate the GPC the moment after hva_to_pfn() returned + * a valid PFN. If that happens, retry. + */ } while (!gpc->needs_invalidation); gpc->valid = true; -- 2.44.0 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH v3 4/5] KVM: pfncache: Move invalidation to invalidate_range_end hook 2024-08-21 20:28 ` [PATCH v3 4/5] KVM: pfncache: Move invalidation to invalidate_range_end hook David Woodhouse @ 2024-10-16 0:17 ` Sean Christopherson 0 siblings, 0 replies; 8+ messages in thread From: Sean Christopherson @ 2024-10-16 0:17 UTC (permalink / raw) To: David Woodhouse Cc: Paolo Bonzini, Mushahid Hussain, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm, linux-kernel, Mingwei Zhang, Maxim Levitsky On Wed, Aug 21, 2024, David Woodhouse wrote: > From: David Woodhouse <dwmw@amazon.co.uk> > > By performing the invalidation from the 'end' hook, the process is a lot > cleaner and simpler because it is guaranteed that ->needs_invalidation > will be cleared after hva_to_pfn() has been called, so the only thing > that hva_to_pfn_retry() needs to do is *wait* for there to be no pending > invalidations. > > Another false positive on the range based checks is thus removed as well. ... > @@ -261,6 +239,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > goto out_error; > } > > + /* > + * Avoid populating a GPC with a user address which is already > + * being invalidated, if the invalidate_range_start() notifier > + * has already been called. The actual invalidation happens in > + * the invalidate_range_end() callback, so wait until there are > + * no active invalidations (for the relevant HVA). > + */ > + gpc_wait_for_invalidations(gpc); I'm not convinced scheduling out the vCPU is actually a good idea. At first blush, it sounds good, e.g. why consume CPU cycles when forward progress is blocked? But scheduling out the vCPU will likely negatively impact latency, and KVM can't predict when the invalidation will complete. E.g. the refresh() could have come along at the start of the invalidation, but it also could have arrived at the tail end. 
And if the vCPU is the only runnable task on the CPU, and the system is under enough load to trigger an invalidation, then scheduling out won't provide any benefit because odds are good the processor won't be able to get into a deep enough sleep state to allow other logical CPUs to hit higher turbo bins. The wait+schedule() logic for the memslots update is purely about being deliberately _unfair_ to avoid a deadlock (the original idea was to simply use a r/w semapahore to block memslot updates, but lock fairness lead to a possible deadlock). If we want to not busy wait, then we should probably do something along the lines of halt-polling, e.g. busy wait for a bit and then schedule() out. But even that would require tuning, and I highly doubt that there will be enough colliding invalidations and retries to build sufficient state to allow KVM to make a "good" decision. If you (or anyone else) have numbers to show that blocking until the invalidation goes away provides meaningful value in most cases, then by all means. But without numbers, I don't think we should go this route as it's not a clear win. > write_lock_irq(&gpc->lock); > > @@ -269,6 +255,13 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > * attempting to refresh. > */ > WARN_ON_ONCE(gpc->valid); > + > + /* > + * Since gfn_to_pfn_cache_invalidate() is called from the > + * kvm_mmu_notifier_invalidate_range_end() callback, it can > + * invalidate the GPC the moment after hva_to_pfn() returned > + * a valid PFN. If that happens, retry. > + */ > } while (!gpc->needs_invalidation); > > gpc->valid = true; > -- > 2.44.0 > ^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v3 5/5] DO NOT MERGE: Hack-a-test to verify gpc invalidation+refresh 2024-08-21 20:28 [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse ` (2 preceding siblings ...) 2024-08-21 20:28 ` [PATCH v3 4/5] KVM: pfncache: Move invalidation to invalidate_range_end hook David Woodhouse @ 2024-08-21 20:28 ` David Woodhouse 2024-09-19 11:13 ` [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse 4 siblings, 0 replies; 8+ messages in thread From: David Woodhouse @ 2024-08-21 20:28 UTC (permalink / raw) To: Sean Christopherson, Paolo Bonzini, Hussain, Mushahid Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm, linux-kernel, Mingwei Zhang, Maxim Levitsky From: Sean Christopherson <seanjc@google.com> Add a VM-wide gfn=>pfn cache and a fake MSR to let userspace control the cache. On writes, reflect the value of the MSR into the backing page of a gfn=>pfn cache so that userspace can detect if a value was written to the wrong page, i.e. to a stale mapping. Spin up 16 vCPUs (arbitrary) to use/refresh the cache, and another thread to trigger mmu_notifier events and memslot updates. 
Co-authored-by: David Woodhouse <dwmw@amazon.co.uk> Not-signed-off-by: Sean Christopherson <seanjc@google.com> Not-signed-off-by: David Woodhouse <dwmw@amazon.co.uk> --- arch/x86/kvm/x86.c | 28 ++++ include/linux/kvm_host.h | 2 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/gpc_test.c | 215 +++++++++++++++++++++++++ 4 files changed, 246 insertions(+) create mode 100644 tools/testing/selftests/kvm/gpc_test.c diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e406987..910fef96409a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3767,6 +3767,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { + case 0xdeadbeefu: { + struct gfn_to_pfn_cache *gpc = &vcpu->kvm->test_cache; + + if (kvm_gpc_activate(gpc, data, 8)) + break; + + read_lock(&gpc->lock); + if (kvm_gpc_check(gpc, 8)) + *(u64 *)(gpc->khva) = data; + read_unlock(&gpc->lock); + break; + } + case MSR_AMD64_NB_CFG: case MSR_IA32_UCODE_WRITE: case MSR_VM_HSAVE_PA: @@ -4206,6 +4219,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { switch (msr_info->index) { + case 0xdeadbeefu: { + struct gfn_to_pfn_cache *gpc = &vcpu->kvm->test_cache; + + read_lock(&gpc->lock); + if (kvm_gpc_check(gpc, 8)) + msr_info->data = gpc->gpa; + else + msr_info->data = 0xdeadbeefu; + read_unlock(&gpc->lock); + return 0; + } + case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_LASTBRANCHFROMIP: @@ -12693,6 +12718,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); + kvm_gpc_init(&kvm->test_cache, kvm); + return 0; out_uninit_mmu: @@ -12840,6 +12867,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_page_track_cleanup(kvm); kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); + kvm_gpc_deactivate(&kvm->test_cache); } static void 
memslot_rmap_free(struct kvm_memory_slot *slot) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a0739c343da5..d4d155091bfd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -741,6 +741,8 @@ struct kvm { struct mutex slots_lock; + struct gfn_to_pfn_cache test_cache; + /* * Protects the arch-specific fields of struct kvm_memory_slots in * use by the VM. To be used under the slots_lock (above) or in a diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb..1c9f601d7f7d 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -65,6 +65,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh # Compiled test targets TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test +TEST_GEN_PROGS_x86_64 += gpc_test TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features diff --git a/tools/testing/selftests/kvm/gpc_test.c b/tools/testing/selftests/kvm/gpc_test.c new file mode 100644 index 000000000000..611d2342d7d4 --- /dev/null +++ b/tools/testing/selftests/kvm/gpc_test.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> +#include <syscall.h> +#include <sys/ioctl.h> +#include <sys/sysinfo.h> +#include <asm/barrier.h> +#include <linux/atomic.h> +#include <linux/rseq.h> +#include <linux/unistd.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define NR_VCPUS 16 + +#define NR_ITERATIONS 1000 + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +static const uint64_t gpa_base = (4ull * (1 << 30)); + +static struct kvm_vm *vm; + +static pthread_t memory_thread; +static 
pthread_t vcpu_threads[NR_VCPUS]; +struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + +static bool fight; + +static uint64_t per_vcpu_gpa_aligned(int vcpu_id) +{ + return gpa_base + (vcpu_id * PAGE_SIZE); +} + +static uint64_t per_vcpu_gpa(int vcpu_id) +{ + return per_vcpu_gpa_aligned(vcpu_id) + vcpu_id; +} + +static void guest_code(int vcpu_id) +{ + uint64_t this_vcpu_gpa; + int i; + + this_vcpu_gpa = per_vcpu_gpa(vcpu_id); + + for (i = 0; i < NR_ITERATIONS; i++) + wrmsr(0xdeadbeefu, this_vcpu_gpa); + GUEST_SYNC(0); +} + +static void *memory_worker(void *ign) +{ + int i, x, r, k; + uint64_t *hva; + uint64_t gpa; + void *mem; + + while (!READ_ONCE(fight)) + cpu_relax(); + + for (k = 0; k < 50; k++) { + i = (unsigned int)random() % NR_VCPUS; + + gpa = per_vcpu_gpa_aligned(i); + hva = (void *)gpa; + + x = (unsigned int)random() % 5; + switch (x) { + case 0: + r = munmap(hva, PAGE_SIZE); + TEST_ASSERT(!r, "Failed to mumap (hva = %lx), errno = %d (%s)", + (unsigned long)hva, errno, strerror(errno)); + + mem = mmap(hva, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + TEST_ASSERT(mem != MAP_FAILED || mem != hva, + "Failed to mmap (hva = %lx), errno = %d (%s)", + (unsigned long)hva, errno, strerror(errno)); + break; + case 1: + vm_set_user_memory_region(vm, i + 1, KVM_MEM_LOG_DIRTY_PAGES, + gpa, PAGE_SIZE, hva); + vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE, hva); + break; + case 2: + r = mprotect(hva, PAGE_SIZE, PROT_NONE); + TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)", + (unsigned long)hva, errno, strerror(errno)); + + r = mprotect(hva, PAGE_SIZE, PROT_READ | PROT_WRITE); + TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)", + (unsigned long)hva, errno, strerror(errno)); + break; + case 3: + r = mprotect(hva, PAGE_SIZE, PROT_READ); + TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)", + (unsigned long)hva, errno, strerror(errno)); + + r = mprotect(hva, PAGE_SIZE, 
PROT_READ | PROT_WRITE); + TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)", + (unsigned long)hva, errno, strerror(errno)); + break; + case 4: + vm_set_user_memory_region(vm, i + 1, 0, gpa, 0, 0); + vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE, + (void *)per_vcpu_gpa_aligned(NR_VCPUS)); + vm_set_user_memory_region(vm, i + 1, 0, gpa, 0, 0); + vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE, hva); + break; + } + } + return NULL; +} + +static void sync_guest(int vcpu_id) +{ + struct ucall uc; + + switch (get_ucall(vcpus[vcpu_id], &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == 0, + "Unexpected sync ucall, got %lx", uc.args[1]); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx", + (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3]); + break; + default: + TEST_FAIL("Unexpected userspace exit, reason = %s\n", + exit_reason_str(vcpus[vcpu_id]->run->exit_reason)); + break; + } +} + +static void *vcpu_worker(void *data) +{ + int vcpu_id = (unsigned long)data; + + vcpu_args_set(vcpus[vcpu_id], 1, vcpu_id); + + while (!READ_ONCE(fight)) + cpu_relax(); + + usleep(10); + + vcpu_run(vcpus[vcpu_id]); + + sync_guest(vcpu_id); + + return NULL; +} + +int main(int argc, char *argv[]) +{ + uint64_t *hva; + uint64_t gpa; + void *r; + int i; + + srandom(time(0)); + + vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); + + pthread_create(&memory_thread, NULL, memory_worker, 0); + + for (i = 0; i < NR_VCPUS; i++) { + pthread_create(&vcpu_threads[i], NULL, vcpu_worker, (void *)(unsigned long)i); + + gpa = per_vcpu_gpa_aligned(i); + hva = (void *)gpa; + r = mmap(hva, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + TEST_ASSERT(r != MAP_FAILED, "mmap() '%lx' failed, errno = %d (%s)", + gpa, errno, strerror(errno)); + + vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE, hva); + } + + WRITE_ONCE(fight, true); + + for (i = 0; i < NR_VCPUS; i++) + 
pthread_join(vcpu_threads[i], NULL); + + pthread_join(memory_thread, NULL); + + for (i = 0; i < NR_VCPUS; i++) { + gpa = per_vcpu_gpa(i); + hva = (void *)gpa; + + TEST_ASSERT(*hva == 0 || *hva == gpa, + "Want '0' or '%lx', got '%lx'\n", gpa, *hva); + } + + gpa = vcpu_get_msr(vcpus[0], 0xdeadbeefu); + hva = (void *)gpa; + if (gpa != 0xdeadbeefu) + TEST_ASSERT(*hva == gpa, "Want '%lx', got '%lx'\n", gpa, *hva); + + kvm_vm_free(vm); + + return 0; +} -- 2.44.0 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache 2024-08-21 20:28 [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse ` (3 preceding siblings ...) 2024-08-21 20:28 ` [PATCH v3 5/5] DO NOT MERGE: Hack-a-test to verify gpc invalidation+refresh David Woodhouse @ 2024-09-19 11:13 ` David Woodhouse 4 siblings, 0 replies; 8+ messages in thread From: David Woodhouse @ 2024-09-19 11:13 UTC (permalink / raw) To: Sean Christopherson, Paolo Bonzini, Hussain, Mushahid Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel, kvm, linux-kernel, Mingwei Zhang, Maxim Levitsky On 21 August 2024 22:28:09 CEST, David Woodhouse <dwmw2@infradead.org> wrote: >From: David Woodhouse <dwmw@amazon.co.uk> > >This will be used to allow hva_to_pfn_retry() to be more selective about >its retry loop, which is currently extremely pessimistic. > >It allows for invalidations to occur even while the PFN is being mapped >(which happens with the lock dropped), before the GPC is fully valid. > >No functional change yet, as the existing mmu_notifier_retry_cache() >function will still return true in all cases where the invalidation >may have triggered. 
> >Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> >--- > include/linux/kvm_types.h | 1 + > virt/kvm/pfncache.c | 29 ++++++++++++++++++++++++----- > 2 files changed, 25 insertions(+), 5 deletions(-) > >diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h >index 827ecc0b7e10..4d8fbd87c320 100644 >--- a/include/linux/kvm_types.h >+++ b/include/linux/kvm_types.h >@@ -69,6 +69,7 @@ struct gfn_to_pfn_cache { > void *khva; > kvm_pfn_t pfn; > bool active; >+ bool needs_invalidation; > bool valid; > }; > >diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c >index f0039efb9e1e..7007d32d197a 100644 >--- a/virt/kvm/pfncache.c >+++ b/virt/kvm/pfncache.c >@@ -32,7 +32,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, > read_lock_irq(&gpc->lock); > > /* Only a single page so no need to care about length */ >- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && >+ if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) && > gpc->uhva >= start && gpc->uhva < end) { > read_unlock_irq(&gpc->lock); > >@@ -45,9 +45,11 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, > */ > > write_lock_irq(&gpc->lock); >- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && >- gpc->uhva >= start && gpc->uhva < end) >+ if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) && >+ gpc->uhva >= start && gpc->uhva < end) { >+ gpc->needs_invalidation = false; > gpc->valid = false; >+ } > write_unlock_irq(&gpc->lock); > continue; > } >@@ -93,6 +95,9 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) > if (!gpc->valid) > return false; > >+ /* If it's valid, it needs invalidation! */ >+ WARN_ON_ONCE(!gpc->needs_invalidation); >+ > return true; > } > >@@ -175,6 +180,17 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > mmu_seq = gpc->kvm->mmu_invalidate_seq; > smp_rmb(); > >+ /* >+ * The translation made by hva_to_pfn() below could be made >+ * invalid as soon as it's mapped. 
But the uhva is already >+ * known and that's all that gfn_to_pfn_cache_invalidate() >+ * looks at. So set the 'needs_invalidation' flag to allow >+ * the GPC to be marked invalid from the moment the lock is >+ * dropped, before the corresponding PFN is even found (and, >+ * more to the point, immediately afterwards). >+ */ >+ gpc->needs_invalidation = true; >+ > write_unlock_irq(&gpc->lock); > > /* >@@ -224,7 +240,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) > * attempting to refresh. > */ > WARN_ON_ONCE(gpc->valid); >- } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); >+ } while (!gpc->needs_invalidation || >+ mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); > > gpc->valid = true; > gpc->pfn = new_pfn; >@@ -339,6 +356,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l > */ > if (ret) { > gpc->valid = false; >+ gpc->needs_invalidation = false; > gpc->pfn = KVM_PFN_ERR_FAULT; > gpc->khva = NULL; > } >@@ -383,7 +401,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) > gpc->pfn = KVM_PFN_ERR_FAULT; > gpc->gpa = INVALID_GPA; > gpc->uhva = KVM_HVA_ERR_BAD; >- gpc->active = gpc->valid = false; >+ gpc->active = gpc->valid = gpc->needs_invalidation = false; > } > > static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, >@@ -453,6 +471,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) > write_lock_irq(&gpc->lock); > gpc->active = false; > gpc->valid = false; >+ gpc->needs_invalidation = false; > > /* > * Leave the GPA => uHVA cache intact, it's protected by the Ping? ^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2024-10-16 0:17 UTC | newest] Thread overview: 8+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2024-08-21 20:28 [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse 2024-08-21 20:28 ` [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry() David Woodhouse 2024-10-16 0:04 ` Sean Christopherson 2024-08-21 20:28 ` [PATCH v3 3/5] KVM: pfncache: Wait for pending invalidations instead of spinning David Woodhouse 2024-08-21 20:28 ` [PATCH v3 4/5] KVM: pfncache: Move invalidation to invalidate_range_end hook David Woodhouse 2024-10-16 0:17 ` Sean Christopherson 2024-08-21 20:28 ` [PATCH v3 5/5] DO NOT MERGE: Hack-a-test to verify gpc invalidation+refresh David Woodhouse 2024-09-19 11:13 ` [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache David Woodhouse
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox