[PATCH v2 20/20] KVM: x86: Use gfn_to_pfn_cache for record_steal_time

The Linux Kernel Mailing List
 help / color / mirror / Atom feed

* [PATCH v2 20/20] KVM: x86: Use gfn_to_pfn_cache for record_steal_time
  2026-05-29 16:50 [PATCH v2 00/20] KVM: x86/xen: Fix Xen/GP/PREEMPT_RT issues with rwlock_t Sean Christopherson
@ 2026-05-29 16:51 ` Sean Christopherson
  0 siblings, 0 replies; 3+ messages in thread
From: Sean Christopherson @ 2026-05-29 16:51 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini, David Woodhouse, Paul Durrant,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng
  Cc: Waiman Long, kvm, linux-kernel, David Woodhouse,
	Sebastian Andrzej Siewior, syzbot+208f7f3e5f59c11aeb90,
	Carsten Stollmaier

From: Carsten Stollmaier <stollmc@amazon.com>

This largely reverts commit 7e2175ebd695 ("KVM: x86: Fix recording of
guest steal time / preempted status"), which dropped the use of the
gfn_to_pfn_cache because it was not integrated with the MMU notifiers
at the time. That shortcoming has long since been addressed, making
the GPC work correctly for this use case.

Aside from cleaning up the last open-coded assembler access to user
addresses and associated explicit asm exception fixups, moving back
to the now-functional GPC also resolves an issue with contention on
the mmap_lock with userfaultfd. The contention issue is as follows:

On vcpu_run, before entering the guest, the update of the steal time
information causes a page-fault if the page is not present. In our
scenario, this gets handled by do_user_addr_fault() and subsequently
handle_userfault() because the region is registered with userfaultfd.

Since handle_userfault() uses TASK_INTERRUPTIBLE, it is interruptible
by signals. But do_user_addr_fault() then busy-retries if the pending
signal is non-fatal, which leads to heavy contention of the mmap_lock.

By restoring the use of GPC for accessing the guest steal time, the
contention is avoided and refreshing the GPC happens when the vCPU is
next scheduled.

Since the gfn_to_pfn_cache gives a kernel mapping rather than a
userspace HVA, accesses are now plain C instead of unsafe_put_user()
et al. Use READ_ONCE()/WRITE_ONCE() to prevent the compiler from
reordering or tearing the accesses, and add an smp_wmb() before the
final version increment to ensure the data writes are ordered before
the seqcount update — the old unsafe_put_user() inline assembly acted
as an implicit compiler barrier.

In kvm_steal_time_set_preempted(), use read_trylock() instead of
read_lock_irqsave() since this is called from the scheduler path
where rwlock_t is not safe on PREEMPT_RT (it becomes sleepable).
Since we only trylock and bail on failure, there is no risk of
deadlock with an interrupt handler, so no need to disable interrupts
at all. Setting the preempted flag is best-effort anyway.

Signed-off-by: Carsten Stollmaier <stollmc@amazon.com>
Co-developed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/x86.c              | 122 ++++++++++++++------------------
 2 files changed, 54 insertions(+), 70 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ae7d539af90..9f652dcdda93 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -983,7 +983,7 @@ struct kvm_vcpu_arch {
 		u8 preempted;
 		u64 msr_val;
 		u64 last_steal;
-		struct gfn_to_hva_cache cache;
+		struct gfn_to_pfn_cache cache;
 	} st;
 
 	u64 l1_tsc_offset;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ea10ed4ab06f..1b27dd9ba0aa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3728,10 +3728,8 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests);
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
-	struct kvm_steal_time __user *st;
-	struct kvm_memslots *slots;
-	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+	struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+	struct kvm_steal_time *st;
 	u64 steal;
 	u32 version;
 
@@ -3746,42 +3744,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
 		return;
 
-	slots = kvm_memslots(vcpu->kvm);
+	/* We rely on the fact that it fits in a single page. */
+	BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
 
-	if (unlikely(slots->generation != ghc->generation ||
-		     gpa != ghc->gpa ||
-		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
-		/* We rely on the fact that it fits in a single page. */
-		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+	CLASS(gpc_map_local, st_map)(gpc, sizeof(*st));
+	if (IS_ERR(st_map))
+		return;
 
-		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
-		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
-			return;
-	}
-
-	st = (struct kvm_steal_time __user *)ghc->hva;
+	st = *st_map;
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
 	 * expensive IPIs.
 	 */
 	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
-		u8 st_preempted = 0;
-		int err = -EFAULT;
-
-		if (!user_access_begin(st, sizeof(*st)))
-			return;
-
-		asm volatile("1: xchgb %0, %2\n"
-			     "xor %1, %1\n"
-			     "2:\n"
-			     _ASM_EXTABLE_UA(1b, 2b)
-			     : "+q" (st_preempted),
-			       "+&r" (err),
-			       "+m" (st->preempted));
-		if (err)
-			goto out;
-
-		user_access_end();
+		u8 st_preempted = xchg(&st->preempted, 0);
 
 		vcpu->arch.st.preempted = 0;
 
@@ -3789,39 +3765,30 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 				       st_preempted & KVM_VCPU_FLUSH_TLB);
 		if (st_preempted & KVM_VCPU_FLUSH_TLB)
 			kvm_vcpu_flush_tlb_guest(vcpu);
-
-		if (!user_access_begin(st, sizeof(*st)))
-			goto dirty;
 	} else {
-		if (!user_access_begin(st, sizeof(*st)))
-			return;
-
-		unsafe_put_user(0, &st->preempted, out);
+		WRITE_ONCE(st->preempted, 0);
 		vcpu->arch.st.preempted = 0;
 	}
 
-	unsafe_get_user(version, &st->version, out);
+	version = READ_ONCE(st->version);
 	if (version & 1)
 		version += 1;  /* first time write, random junk */
 
 	version += 1;
-	unsafe_put_user(version, &st->version, out);
+	WRITE_ONCE(st->version, version);
 
 	smp_wmb();
 
-	unsafe_get_user(steal, &st->steal, out);
+	steal = READ_ONCE(st->steal);
 	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
-	unsafe_put_user(steal, &st->steal, out);
+	WRITE_ONCE(st->steal, steal);
+
+	smp_wmb();
 
 	version += 1;
-	unsafe_put_user(version, &st->version, out);
-
- out:
-	user_access_end();
- dirty:
-	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+	WRITE_ONCE(st->version, version);
 }
 
 /*
@@ -4162,8 +4129,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		vcpu->arch.st.msr_val = data;
 
-		if (!(data & KVM_MSR_ENABLED))
-			break;
+		if (data & KVM_MSR_ENABLED)
+			kvm_gpc_activate(&vcpu->arch.st.cache, data & ~KVM_MSR_ENABLED,
+					sizeof(struct kvm_steal_time));
+		else
+			kvm_gpc_deactivate(&vcpu->arch.st.cache);
 
 		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 
@@ -5231,11 +5201,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
-	struct kvm_steal_time __user *st;
-	struct kvm_memslots *slots;
-	static const u8 preempted = KVM_VCPU_PREEMPTED;
-	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+	struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+	struct kvm_steal_time *st;
 
 	/*
 	 * The vCPU can be marked preempted if and only if the VM-Exit was on
@@ -5260,20 +5227,32 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	if (unlikely(current->mm != vcpu->kvm->mm))
 		return;
 
-	slots = kvm_memslots(vcpu->kvm);
-
-	if (unlikely(slots->generation != ghc->generation ||
-		     gpa != ghc->gpa ||
-		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+	/*
+	 * Use a trylock as this is called from the scheduler path (via
+	 * kvm_sched_out), where rwlock_t is not safe on PREEMPT_RT (it
+	 * becomes sleepable).  Setting preempted is best-effort anyway;
+	 * the old HVA-based code used copy_to_user_nofault() which could
+	 * also silently fail.
+	 *
+	 * Since we only trylock and bail on failure, there is no risk of
+	 * deadlock with an interrupt handler, so no need to disable
+	 * interrupts.
+	 */
+	CLASS(gpc_try_map_local, st_map)(gpc, sizeof(st->preempted));
+	if (IS_ERR(st_map))
 		return;
 
-	st = (struct kvm_steal_time __user *)ghc->hva;
-	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+	st = *st_map;
+	WRITE_ONCE(st->preempted, KVM_VCPU_PREEMPTED);
+	vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+}
 
-	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
-		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
-
-	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+static void kvm_steal_time_reset(struct kvm_vcpu *vcpu)
+{
+	kvm_gpc_deactivate(&vcpu->arch.st.cache);
+	vcpu->arch.st.preempted = 0;
+	vcpu->arch.st.msr_val = 0;
+	vcpu->arch.st.last_steal = 0;
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -12819,6 +12798,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
 	kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
 
+	kvm_gpc_init(&vcpu->arch.st.cache, vcpu->kvm);
+
 	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
 		kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 	else
@@ -12926,6 +12907,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_clear_async_pf_completion_queue(vcpu);
 	kvm_mmu_unload(vcpu);
 
+	kvm_steal_time_reset(vcpu);
+
 	kvmclock_reset(vcpu);
 
 	for_each_possible_cpu(cpu)
@@ -13046,7 +13029,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.apf.msr_en_val = 0;
 	vcpu->arch.apf.msr_int_val = 0;
-	vcpu->arch.st.msr_val = 0;
+
+	kvm_steal_time_reset(vcpu);
 
 	kvmclock_reset(vcpu);
 
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2 20/20] KVM: x86: Use gfn_to_pfn_cache for record_steal_time
       [not found] <20260530061932.7849D1F00893@smtp.kernel.org>
@ 2026-06-02 12:29 ` David Woodhouse
  2026-06-09  0:45   ` Sean Christopherson
  0 siblings, 1 reply; 3+ messages in thread
From: David Woodhouse @ 2026-06-02 12:29 UTC (permalink / raw)
  To: sashiko-bot
  Cc: seanjc, pbonzini, tglx, mingo, bp, dave.hansen, x86, hpa, kvm,
	linux-kernel, sashiko-reviews, stollmc, dwmw

[-- Attachment #1: Type: text/plain, Size: 2108 bytes --]

On Sat, 30 May 2026 06:19:32 +0000, sashiko-bot@kernel.org wrote:
> [Severity: High]
> Does this introduce a scheduling while atomic bug on non-PREEMPT_RT kernels?
>
> The CLASS(gpc_map_local, st_map) macro acquires a read_lock on gpc->lock,
> which disables preemption. While this lock is held, if the guest supports
> PV TLB flush, the code calls kvm_vcpu_flush_tlb_guest().
>
> If TDP is disabled (shadow paging), kvm_vcpu_flush_tlb_guest() calls
> kvm_mmu_sync_roots() and eventually mmu_sync_children(). This path can yield
> via cond_resched_rwlock_write(). Yielding while preemption is disabled by
> the gpc read lock will trigger a BUG.

Ah, that issue exists in the previous versions too, but it's simple
enough to fix. There's no particular timing constraint for flushing the
TLB; it just has to be done before this vCPU ever runs again. It can
just be moved to the end of the function after the lock is dropped.

That does mean record_steal_time() should use the explicit
gpc_map_local_lock()/gpc_map_local_unlock() instead of the CLASS()
macro, but that's easy enough.

> [Severity: High]
> Can this cause an ABBA deadlock in the scheduler on PREEMPT_RT?
>
> kvm_steal_time_set_preempted() is called during a vCPU context switch
> via kvm_sched_out(), which is a preempt notifier hook. This runs with
> the local CPU's runqueue lock (rq->lock) held.
>
> While the code correctly uses a trylock to avoid sleeping when acquiring
> the lock, the CLASS macro destructor will implicitly call read_unlock()
> at the end of the function. On PREEMPT_RT kernels, unlocking an rt_rwlock
> can wake up a waiting writer, such as a concurrent kvm_gpc_refresh().

This one is more fun, and the problem already exists on RT for the Xen
runstate update so it's not even really new with this patch.

I think the gpc lock needs to be a raw_rwlock_t, so that it remains a
true spinning lock even on PREEMPT_RT. The writer side
(kvm_gpc_refresh) only holds the lock briefly to swap in the new
pfn/khva after doing the heavyweight work, so it should be fine as a
raw lock.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2 20/20] KVM: x86: Use gfn_to_pfn_cache for record_steal_time
  2026-06-02 12:29 ` [PATCH v2 20/20] KVM: x86: Use gfn_to_pfn_cache for record_steal_time David Woodhouse
@ 2026-06-09  0:45   ` Sean Christopherson
  0 siblings, 0 replies; 3+ messages in thread
From: Sean Christopherson @ 2026-06-09  0:45 UTC (permalink / raw)
  To: David Woodhouse
  Cc: sashiko-bot, pbonzini, tglx, mingo, bp, dave.hansen, x86, hpa,
	kvm, linux-kernel, sashiko-reviews, stollmc, dwmw

On Tue, Jun 02, 2026, David Woodhouse wrote:
> On Sat, 30 May 2026 06:19:32 +0000, sashiko-bot@kernel.org wrote:
> > [Severity: High]
> > Does this introduce a scheduling while atomic bug on non-PREEMPT_RT kernels?
> >
> > The CLASS(gpc_map_local, st_map) macro acquires a read_lock on gpc->lock,
> > which disables preemption. While this lock is held, if the guest supports
> > PV TLB flush, the code calls kvm_vcpu_flush_tlb_guest().
> >
> > If TDP is disabled (shadow paging), kvm_vcpu_flush_tlb_guest() calls
> > kvm_mmu_sync_roots() and eventually mmu_sync_children(). This path can yield
> > via cond_resched_rwlock_write(). Yielding while preemption is disabled by
> > the gpc read lock will trigger a BUG.
> 
> Ah, that issue exists in the previous versions too, but it's simple
> enough to fix. There's no particular timing constraint for flushing the
> > TLB; it just has to be done before this vCPU ever runs again. It can
> just be moved to the end of the function after the lock is dropped.
> 
> That does mean record_steal_time() should use the explicit
> gpc_map_local_lock()/gpc_map_local_unlock() instead of the CLASS()
> macro, but that's easy enough.

Actually, we can use KVM_REQ_TLB_FLUSH_GUEST and "optimize" the code for the rare
case where KVM already has a TLB flush queued for the vCPU.  E.g. over two
patches (so that changing the order of the request processing is isolated):

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b27dd9ba0aa..48234eeb246b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3764,7 +3764,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
                trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
                                       st_preempted & KVM_VCPU_FLUSH_TLB);
                if (st_preempted & KVM_VCPU_FLUSH_TLB)
-                       kvm_vcpu_flush_tlb_guest(vcpu);
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
        } else {
                WRITE_ONCE(st->preempted, 0);
                vcpu->arch.st.preempted = 0;
@@ -11165,6 +11165,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        if (unlikely(r))
                                goto out;
                }
+               if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+                       record_steal_time(vcpu);
                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
                        kvm_mmu_sync_roots(vcpu);
                if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
@@ -11214,8 +11216,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        r = 1;
                        goto out;
                }
-               if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
-                       record_steal_time(vcpu);
                if (kvm_check_request(KVM_REQ_PMU, vcpu))
                        kvm_pmu_handle_event(vcpu);
                if (kvm_check_request(KVM_REQ_PMI, vcpu))


KVM needs to ensure the RMW on st->preempted is atomic, to avoid re-introducing
the bug fixed by commit b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag
is not missed"), but AFAICT there's nothing that requires KVM to complete the
TLB flush before bumping the version; KVM just needs to service the flush before
entering the guest on that vCPU.

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-09  0:45 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20260530061932.7849D1F00893@smtp.kernel.org>
2026-06-02 12:29 ` [PATCH v2 20/20] KVM: x86: Use gfn_to_pfn_cache for record_steal_time David Woodhouse
2026-06-09  0:45   ` Sean Christopherson
2026-05-29 16:50 [PATCH v2 00/20] KVM: x86/xen: Fix Xen/GP/PREEMPT_RT issues with rwlock_t Sean Christopherson
2026-05-29 16:51 ` [PATCH v2 20/20] KVM: x86: Use gfn_to_pfn_cache for record_steal_time Sean Christopherson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox