public inbox for linux-kernel@vger.kernel.org
* [PATCH] KVM: x86/xen: Fix sleeping lock in hard IRQ context on PREEMPT_RT
@ 2026-03-29 13:15 shaikh.kamal
  2026-03-30 14:18 ` Steven Rostedt
  0 siblings, 1 reply; 16+ messages in thread
From: shaikh.kamal @ 2026-03-29 13:15 UTC (permalink / raw)
  To: H. Peter Anvin, David Woodhouse, Paul Durrant,
	Sean Christopherson, kvm, linux-kernel, linux-rt-devel
  Cc: skhan, me, shaikh.kamal, syzbot+919877893c9d28162dc2

On PREEMPT_RT, kvm_xen_set_evtchn_fast() acquires a sleeping lock
(gpc->lock) from hard IRQ context (xen_timer_callback), triggering:

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
  in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 0, name: swapper/5
  preempt_count: 10100, expected: 0
  RCU nest depth: 0, expected: 0
  4 locks held by swapper/5/0:
  INFO: lockdep is turned off.
  irq event stamp: 1766
  hardirqs last  enabled at (1765): [<ffffffff81678fd4>] tick_nohz_idle_got_tick+0x84/0x90
  hardirqs last disabled at (1766): [<ffffffff8b665051>] sysvec_apic_timer_interrupt+0x11/0xd0
  softirqs last  enabled at (0): [<ffffffff81289e76>] copy_process+0x1586/0x58b0
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  Preempt disabled at:
  [<ffffffff8b6650bc>] sysvec_apic_timer_interrupt+0x7c/0xd0
  CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.13.0-rc1-syzkaller-00026-g2d5404caa8c7 #0
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
  Call Trace:
   <IRQ>
   __dump_stack lib/dump_stack.c:94 [inline]
   dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120
   __might_resched+0x30d/0x8f0 kernel/sched/core.c:10318
   rt_spin_lock+0x70/0x130 kernel/locking/spinlock_rt.c:48
   kvm_xen_set_evtchn_fast+0x20b/0xa40 arch/x86/kvm/xen.c:1820
   xen_timer_callback+0x91/0x1a0 arch/x86/kvm/xen.c:142
   __run_hrtimer kernel/time/hrtimer.c:1739 [inline]
   __hrtimer_run_queues+0x20b/0xa00 kernel/time/hrtimer.c:1803

The Xen timer uses HRTIMER_MODE_ABS_HARD for latency-sensitive event
delivery (see commit 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen
timer delivery")), so its callback always expires in hard IRQ context.
On PREEMPT_RT, spinlock_t and rwlock_t are sleeping locks and must not
be acquired from that context.
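
For illustration only (not part of the patch), a minimal sketch of why
the splat is RT-specific: spinlock_t (and rwlock_t) becomes an
rt_mutex-based sleeping lock on PREEMPT_RT, while raw_spinlock_t keeps
the classic non-sleeping behaviour and stays safe in hard IRQ context:

  #include <linux/spinlock.h>

  static DEFINE_SPINLOCK(sleeping_on_rt);       /* rt_mutex-based on RT */
  static DEFINE_RAW_SPINLOCK(always_spinning);  /* real spinlock, even on RT */

  static void example_from_hardirq(void)
  {
          raw_spin_lock(&always_spinning);      /* fine from hard IRQ context */
          raw_spin_unlock(&always_spinning);

          spin_lock(&sleeping_on_rt);           /* triggers the BUG above on RT */
          spin_unlock(&sleeping_on_rt);
  }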

Use irq_work to defer event injection to a context where sleeping locks
are permitted on PREEMPT_RT. This preserves the hard IRQ timer precision
on non-RT kernels while avoiding the lock context violation on RT.

The approach follows the existing pvclock_irq_work pattern in
arch/x86/kvm/x86.c.
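
For reference, a minimal sketch of the two irq_work flavours (the names
below are illustrative, not the actual x86.c symbols): items initialised
with plain init_irq_work()/IRQ_WORK_INIT() are executed by a per-CPU
kthread on PREEMPT_RT, where sleeping locks are allowed, while
IRQ_WORK_INIT_HARD() items still run from the IPI in hard IRQ context:

  #include <linux/irq_work.h>

  static void deferred_fn(struct irq_work *work)
  {
          /* On PREEMPT_RT this runs in the irq_work kthread; spinlock_t is OK. */
  }

  static void hard_fn(struct irq_work *work)
  {
          /* Runs in hard IRQ context even on RT; non-sleeping locks only. */
  }

  static struct irq_work deferred_work = IRQ_WORK_INIT(deferred_fn);
  static struct irq_work hard_work = IRQ_WORK_INIT_HARD(hard_fn);

  /* Queue from the hrtimer callback; both calls return immediately. */
  static void queue_examples(void)
  {
          irq_work_queue(&deferred_work);
          irq_work_queue(&hard_work);
  }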

Tested on a PREEMPT_RT kernel (CONFIG_PREEMPT_RT=y) with the syzbot C
reproducer: no crash observed after 30+ minutes of continuous execution.
Also tested on a non-RT kernel (CONFIG_PREEMPT_RT=n) to verify there is
no regression in the fast path.

Reported-by: syzbot+919877893c9d28162dc2@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=919877893c9d28162dc2
Fixes: 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery")
Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/xen.c              | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a3bfa293e8b..533b45289d53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -746,6 +746,7 @@ struct kvm_vcpu_xen {
 	u64 timer_expires; /* In guest epoch */
 	atomic_t timer_pending;
 	struct hrtimer timer;
+	struct irq_work timer_inject_irqwork;
 	int poll_evtchn;
 	struct timer_list poll_timer;
 	struct kvm_hypervisor_cpuid cpuid;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index d6b2a665b499..01fa7b165355 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -122,6 +122,24 @@ void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void xen_timer_inject_irqwork(struct irq_work *work)
+{
+	struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen,
+						timer_inject_irqwork);
+	struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen);
+	struct kvm_xen_evtchn e;
+	int rc;
+
+	e.vcpu_id = vcpu->vcpu_id;
+	e.vcpu_idx = vcpu->vcpu_idx;
+	e.port = vcpu->arch.xen.timer_virq;
+	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+	rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
+	if (rc != -EWOULDBLOCK)
+		vcpu->arch.xen.timer_expires = 0;
+}
+
 static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 {
 	struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
@@ -132,6 +150,17 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 	if (atomic_read(&vcpu->arch.xen.timer_pending))
 		return HRTIMER_NORESTART;
 
+	/*
+	 * On PREEMPT_RT, this callback runs in hard IRQ context where
+	 * kvm_xen_set_evtchn_fast() cannot acquire sleeping locks
+	 * (specifically gpc->lock). Defer to irq_work which runs in
+	 * thread context on RT.
+	 */
+	if (in_hardirq()) {
+		irq_work_queue(&vcpu->arch.xen.timer_inject_irqwork);
+		return HRTIMER_NORESTART;
+	}
+
 	e.vcpu_id = vcpu->vcpu_id;
 	e.vcpu_idx = vcpu->vcpu_idx;
 	e.port = vcpu->arch.xen.timer_virq;
@@ -2303,6 +2332,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
 	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
 	hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC,
 		      HRTIMER_MODE_ABS_HARD);
+	init_irq_work(&vcpu->arch.xen.timer_inject_irqwork,
+		      xen_timer_inject_irqwork);
 
 	kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
 	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread
* Re: [PATCH] KVM: mmu_notifier: make mn_invalidate_lock non-sleeping for non-blocking invalidations
@ 2026-03-30 11:24 Paolo Bonzini
  2026-04-30 14:17 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
  0 siblings, 1 reply; 16+ messages in thread
From: Paolo Bonzini @ 2026-03-30 11:24 UTC (permalink / raw)
  To: shaikh kamaluddin
  Cc: Sean Christopherson, Sebastian Andrzej Siewior, kvm,
	Kernel Mailing List, Linux, linux-rt-devel, Shuah Khan, me

[-- Attachment #1: Type: text/plain, Size: 1406 bytes --]

On Sat, Mar 28, 2026 at 3:50 PM shaikh kamaluddin
<shaikhkamal2012@gmail.com> wrote:
> +void __mmu_notifier_oom_enter(struct mm_struct *mm)
> +{
> +       struct mmu_notifier *subscription;
> +       int id;
> +       pr_info("Entering :func:%s\n", __func__);
> +       if (!mm->notifier_subscriptions)
> +               return;
> +
> +       id = srcu_read_lock(&srcu);
> +       hlist_for_each_entry_rcu(subscription,
> +                                &mm->notifier_subscriptions->list, hlist,
> +                                srcu_read_lock_held(&srcu)) {
> +               if (subscription->ops->oom_enter)
> +                       subscription->ops->oom_enter(subscription, mm);
> +
> +       }
> +       srcu_read_unlock(&srcu, id);
> +       pr_info("Done:%s\n", __func__);

Yeah, calling mmu_notifier_unregister() won't work from within this function.

One possibility is for the new method to be something like this:

       void (*after_oom_unregister)(struct mmu_notifier *subscription);

So it only has to do

kvm->mn_registered = false; /* or xchg, it's the same */
WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
if (kvm->mn_active_invalidate_count)
    kvm->mn_active_invalidate_count = 0;
else
    WARN_ON(kvm->mmu_invalidate_in_progress);

or something like that. See the attached sketch, feel free to reuse it
as you see fit.

Paolo

[-- Attachment #2: mm.patch --]
[-- Type: text/x-patch, Size: 4393 bytes --]

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3b670ee4eb26..7b14d8099cc1 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1973,12 +1973,15 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
 				      struct x86_exception *exception,
 				      u64 pte_access)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 
 	BUG_ON(!mmu_is_nested(vcpu));
 
-	/* NPT walks are always user-walks */
-	access |= PFERR_USER_MASK;
+	/* Non-GMET walks are always user-walks */
+	if (!(svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_GMET_ENABLE))
+		access |= PFERR_USER_MASK;
+
 	return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
 }
 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e4cb317807ab..4a1c1f5297c4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7444,6 +7444,15 @@ static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 
 	BUG_ON(!mmu_is_nested(vcpu));
+
+	/*
+	 * MBEC differentiates based on the effective U/S bit of
+	 * the guest page tables; not the processor CPL.
+	 */
+	access &= ~PFERR_USER_MASK;
+	if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK))
+		access |= PFERR_USER_MASK;
+
 	return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
 }
 
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 8450e18a87c2..3c67ec15c09c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -212,6 +212,14 @@ struct mmu_notifier_ops {
 	 */
 	struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
 	void (*free_notifier)(struct mmu_notifier *subscription);
+
+	/*
+	 * Any mmu notifier that defines this is automatically unregistered
+	 * when its mm is the subject of an OOM kill.  after_oom_unregister()
+	 * is invoked after all other outstanding callbacks have terminated.
+	 */
+	void (*after_oom_unregister)(struct mmu_notifier *subscription,
+				     struct mm_struct *mm);
 };
 
 /*
@@ -287,6 +295,7 @@ mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
 }
 void mmu_notifier_put(struct mmu_notifier *subscription);
 void mmu_notifier_synchronize(void);
+void mmu_notifier_oom_enter(struct mm_struct *mm);
 
 extern int mmu_notifier_register(struct mmu_notifier *subscription,
 				 struct mm_struct *mm);
@@ -661,6 +670,10 @@ static inline void mmu_notifier_synchronize(void)
 {
 }
 
+static inline void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+}
+
 #endif /* CONFIG_MMU_NOTIFIER */
 
 #endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a6cdf3674bdc..deba056468b1 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -835,6 +835,56 @@ void mmu_notifier_unregister(struct mmu_notifier *subscription,
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
 
+void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+	struct mmu_notifier_subscriptions *subscriptions = mm->notifier_subscriptions;
+	struct mmu_notifier *subscription;
+	struct hlist_node *tmp;
+	HLIST_HEAD(oom_list);
+	int id;
+
+	id = srcu_read_lock(&srcu);
+
+	/*
+	 * Prevent further calls to the MMU notifier, except for
+	 * release and after_oom_unregister.
+	 */
+	spin_lock(&subscriptions->lock);
+	hlist_for_each_entry_safe(subscription, tmp, &subscriptions->list, hlist) {
+		if (!subscription->ops->after_oom_unregister)
+			continue;
+
+		/*
+		 * after_oom_unregister and alloc_notifier are incompatible,
+		 * because there could be other references to allocated
+		 * notifiers.
+		 */
+		if (WARN_ON(subscription->ops->alloc_notifier))
+			continue;
+
+		hlist_del_init_rcu(&subscription->hlist);
+		hlist_add_head(&subscription->hlist, &oom_list);
+	}
+	spin_unlock(&subscriptions->lock);
+
+	hlist_for_each_entry(subscription, &oom_list, hlist)
+		if (subscription->ops->release)
+			subscription->ops->release(subscription, mm);
+	srcu_read_unlock(&srcu, id);
+
+	if (hlist_empty(&oom_list))
+		return;
+
+	synchronize_srcu(&srcu);
+
+	hlist_for_each_entry_safe(subscription, tmp, &oom_list, hlist) {
+		subscription->ops->after_oom_unregister(subscription, mm);
+
+		BUG_ON(atomic_read(&mm->mm_count) <= 0);
+		mmdrop(mm);
+	}
+}
+
 static void mmu_notifier_free_rcu(struct rcu_head *rcu)
 {
 	struct mmu_notifier *subscription =

^ permalink raw reply related	[flat|nested] 16+ messages in thread
* [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
@ 2026-04-30  4:48 shaikh.kamal
  0 siblings, 0 replies; 16+ messages in thread
From: shaikh.kamal @ 2026-04-30  4:48 UTC (permalink / raw)
  To: Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, David Rientjes, Shakeel Butt,
	linux-mm, linux-kernel, kvm, linux-rt-devel
  Cc: pbonzini, skhan, me, shaikh.kamal, syzbot+c3178b6b512446632bac

When an mm undergoes OOM kill, the OOM reaper unmaps memory while
holding the mmap_lock. MMU notifier subscribers (notably KVM) need
to be informed so they can tear down their secondary mappings. The
current synchronous unregister path can deadlock on PREEMPT_RT
because synchronize_srcu() is called from contexts that cannot
safely sleep.

Implement the asynchronous cleanup design proposed by Paolo Bonzini in
the v1 review: a new optional after_oom_unregister callback in struct
mmu_notifier_ops, invoked after the SRCU grace period via call_srcu()
so that no readers can still reference the subscription when cleanup
runs.
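
As background, a generic sketch of the call_srcu()/srcu_barrier() idiom
the design relies on (a standalone example, not the patch code): cleanup
is deferred past an SRCU grace period, and srcu_barrier() waits for every
pending callback before anything they touch is torn down:

  #include <linux/srcu.h>
  #include <linux/slab.h>

  DEFINE_STATIC_SRCU(my_srcu);

  struct deferred {
          struct rcu_head rcu;
          void *payload;
  };

  static void deferred_free(struct rcu_head *rcu)
  {
          struct deferred *d = container_of(rcu, struct deferred, rcu);

          kfree(d);                       /* no SRCU reader can still see it */
  }

  static void schedule_cleanup(struct deferred *d)
  {
          call_srcu(&my_srcu, &d->rcu, deferred_free);  /* returns immediately */
  }

  static void flush_before_teardown(void)
  {
          srcu_barrier(&my_srcu);         /* wait for all pending callbacks */
  }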

The flow is:

  1. The OOM reaper calls mmu_notifier_oom_enter() from
     __oom_reap_task_mm().
  2. mmu_notifier_oom_enter() walks the subscription list and, for
     each subscriber that provides after_oom_unregister, detaches
     the subscription from the active list and schedules a
     call_srcu() callback.
  3. The deferred callback invokes after_oom_unregister once the
     grace period has elapsed and all in-flight readers have
     finished.
  4. Subsystems waiting to free structures referenced by the
     callback can call the new mmu_notifier_barrier() helper, which
     wraps srcu_barrier() to wait for all outstanding callbacks
     scheduled this way.

after_oom_unregister is mutually exclusive with alloc_notifier
because allocated notifiers can have additional outstanding
references that the OOM path cannot safely drop.

KVM is updated to provide after_oom_unregister, which clears
mn_active_invalidate_count, and to detect via hlist_unhashed() in
kvm_destroy_vm() when its subscription was already detached by the
OOM path; in that case it calls mmu_notifier_barrier() and drops
the mm reference rather than calling mmu_notifier_unregister().
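
A distilled subscriber-side sketch of that contract (hypothetical names,
mirroring the KVM hunk below): provide after_oom_unregister in the ops,
and at teardown check whether the OOM path already detached the
subscription before choosing between the barrier and a normal
unregister:

  #include <linux/mmu_notifier.h>
  #include <linux/sched/mm.h>

  struct my_ctx {
          struct mmu_notifier mn;
          struct mm_struct *mm;
  };

  static void my_after_oom_unregister(struct mmu_notifier *mn,
                                      struct mm_struct *mm)
  {
          /* Runs from the call_srcu() callback once all SRCU readers are
           * done; reset whatever state the aborted invalidations left. */
  }

  static const struct mmu_notifier_ops my_ops = {
          .after_oom_unregister   = my_after_oom_unregister,
          /* .alloc_notifier must not be set together with this callback. */
  };

  static void my_teardown(struct my_ctx *ctx)
  {
          if (hlist_unhashed(&ctx->mn.hlist)) {
                  /* Detached by the OOM path: wait for the async callback,
                   * then drop the reference taken at registration time. */
                  mmu_notifier_barrier();
                  mmdrop(ctx->mm);
          } else {
                  mmu_notifier_unregister(&ctx->mn, ctx->mm);
          }
  }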

Reported-by: syzbot+c3178b6b512446632bac@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c3178b6b512446632bac
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/all/20260209161527.31978-1-shaikhkamal2012@gmail.com/
Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
---
 include/linux/mmu_notifier.h |  10 +++
 mm/mmu_notifier.c            | 123 +++++++++++++++++++++++++++++++++++
 mm/oom_kill.c                |   3 +
 virt/kvm/kvm_main.c          |  27 +++++++-
 4 files changed, 162 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 07a2bbaf86e9..0ccd590f55d3 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -88,6 +88,14 @@ struct mmu_notifier_ops {
 	void (*release)(struct mmu_notifier *subscription,
 			struct mm_struct *mm);

+	/*
+	 * Any mmu notifier that defines this is automatically unregistered
+	 * when its mm is the subject of an OOM kill.  after_oom_unregister()
+	 * is invoked after all other outstanding callbacks have terminated.
+	 */
+	void (*after_oom_unregister)(struct mmu_notifier *subscription,
+				     struct mm_struct *mm);
+
 	/*
 	 * clear_flush_young is called after the VM is
 	 * test-and-clearing the young/accessed bitflag in the
@@ -375,6 +383,8 @@ mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,

 extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
+void mmu_notifier_oom_enter(struct mm_struct *mm);
+extern void mmu_notifier_barrier(void);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long start,
 					  unsigned long end);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a6cdf3674bdc..b8fa58fe6b7d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -49,6 +49,37 @@ struct mmu_notifier_subscriptions {
 	struct hlist_head deferred_list;
 };

+/*
+ * Callback structure for asynchronous OOM cleanup.
+ * Used with call_srcu() to defer after_oom_unregister callbacks
+ * until after SRCU grace period completes.
+ */
+struct mmu_notifier_oom_callback {
+	struct rcu_head rcu;
+	struct mmu_notifier *subscription;
+	struct mm_struct *mm;
+};
+
+/*
+ * Callback function invoked after SRCU grace period.
+ * Safely calls after_oom_unregister once all readers have finished.
+ */
+static void mmu_notifier_oom_callback_fn(struct rcu_head *rcu)
+{
+	struct mmu_notifier_oom_callback *cb =
+		container_of(rcu, struct mmu_notifier_oom_callback, rcu);
+
+	/* Safe - all SRCU readers have finished */
+	cb->subscription->ops->after_oom_unregister(cb->subscription, cb->mm);
+
+	/* Release mm reference taken when callback was scheduled */
+	WARN_ON_ONCE(atomic_read(&cb->mm->mm_count) <= 0);
+	mmdrop(cb->mm);
+
+	/* Free callback structure */
+	kfree(cb);
+}
+
 /*
  * This is a collision-retry read-side/write-side 'lock', a lot like a
  * seqcount, however this allows multiple write-sides to hold it at
@@ -359,6 +390,85 @@ void __mmu_notifier_release(struct mm_struct *mm)
 		mn_hlist_release(subscriptions, mm);
 }

+void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+	struct mmu_notifier_subscriptions *subscriptions =
+						mm->notifier_subscriptions;
+	struct mmu_notifier *subscription;
+	struct hlist_node *tmp;
+	HLIST_HEAD(oom_list);
+	int id;
+
+	if (!subscriptions)
+		return;
+
+	id = srcu_read_lock(&srcu);
+
+	/*
+	 * Prevent further calls to the MMU notifier, except for
+	 * release and after_oom_unregister.
+	 */
+	spin_lock(&subscriptions->lock);
+	hlist_for_each_entry_safe(subscription, tmp,
+				  &subscriptions->list, hlist) {
+		if (!subscription->ops->after_oom_unregister)
+			continue;
+
+		/*
+		 * after_oom_unregister and alloc_notifier are incompatible,
+		 * because there could be other references to allocated
+		 * notifiers.
+		 */
+		if (WARN_ON(subscription->ops->alloc_notifier))
+			continue;
+
+		hlist_del_init_rcu(&subscription->hlist);
+		hlist_add_head(&subscription->hlist, &oom_list);
+	}
+	spin_unlock(&subscriptions->lock);
+	hlist_for_each_entry(subscription, &oom_list, hlist)
+		if (subscription->ops->release)
+			subscription->ops->release(subscription, mm);
+
+	srcu_read_unlock(&srcu, id);
+
+	if (hlist_empty(&oom_list))
+		return;
+
+	hlist_for_each_entry_safe(subscription, tmp,
+				  &oom_list, hlist) {
+		struct mmu_notifier_oom_callback *cb;
+		/*
+		 * Remove from stack-based oom_list and reset hlist to unhashed state.
+		 * This sets subscription->hlist.pprev = NULL, so future callers of
+		 * mmu_notifier_unregister() (e.g. kvm_destroy_vm) will see
+		 * hlist_unhashed() == true and take the safe path, avoiding
+		 * use-after-free on the stack-allocated oom_list head.
+		 */
+		hlist_del_init(&subscription->hlist);
+
+		/*
+		 * GFP_ATOMIC failure is exceedingly rare. We cannot sleep
+		 * here (would reintroduce the deadlock this patch fixes)
+		 * and cannot call after_oom_unregister synchronously
+		 * without first waiting for SRCU readers. The subscriber
+		 * will not receive after_oom_unregister but cleanup will
+		 * eventually happen via the unregister path.
+		 */
+		cb = kmalloc(sizeof(*cb), GFP_ATOMIC);
+		if (!cb)
+			continue;
+
+		cb->subscription = subscription;
+		cb->mm = mm;
+		mmgrab(mm);
+
+		/* Schedule callback - returns immediately */
+		call_srcu(&srcu, &cb->rcu, mmu_notifier_oom_callback_fn);
+	}
+
+}
+
 /*
  * If no young bitflag is supported by the hardware, ->clear_flush_young can
  * unmap the address and return 1 or 0 depending if the mapping previously
@@ -1096,3 +1206,16 @@ void mmu_notifier_synchronize(void)
 	synchronize_srcu(&srcu);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+/**
+ * mmu_notifier_barrier - Wait for all pending MMU notifier callbacks
+ *
+ * Waits for all call_srcu() callbacks scheduled by mmu_notifier_oom_enter()
+ * to complete. Used by subsystems during cleanup to prevent use-after-free
+ * when destroying structures accessed by the callbacks.
+ */
+void mmu_notifier_barrier(void)
+{
+	srcu_barrier(&srcu);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_barrier);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5c6c95c169ee..029e041afc57 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -519,6 +519,9 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
 	bool ret = true;
 	MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);

+	/* Notify MMU notifiers about the OOM event */
+	mmu_notifier_oom_enter(mm);
+
 	/*
 	 * Tell all users of get_user/copy_from_user etc... that the content
 	 * is no longer stable. No barriers really needed because unmapping
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1bc1da66b4b0..a2df83d3b413 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -885,6 +885,24 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 }

+static void kvm_mmu_notifier_after_oom_unregister(struct mmu_notifier *mn,
+					struct mm_struct *mm)
+{
+	struct kvm *kvm;
+
+	kvm = mmu_notifier_to_kvm(mn);
+
+	/*
+	 * At this point the unregister has completed and all other callbacks
+	 * have terminated. Clean up any unbalanced invalidation counts.
+	 */
+	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+	if (kvm->mn_active_invalidate_count)
+		kvm->mn_active_invalidate_count = 0;
+	else
+		WARN_ON(kvm->mmu_invalidate_in_progress);
+}
+
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
@@ -892,6 +910,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
 	.release		= kvm_mmu_notifier_release,
+	.after_oom_unregister	= kvm_mmu_notifier_after_oom_unregister,
 };

 static int kvm_init_mmu_notifier(struct kvm *kvm)
@@ -1280,7 +1299,13 @@ static void kvm_destroy_vm(struct kvm *kvm)
 		kvm->buses[i] = NULL;
 	}
 	kvm_coalesced_mmio_free(kvm);
-	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	if (hlist_unhashed(&kvm->mmu_notifier.hlist)) {
+		/* Subscription removed by OOM. Wait for async callback. */
+		mmu_notifier_barrier();
+		mmdrop(kvm->mm);
+	} else {
+		mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	}
 	/*
 	 * At this point, pending calls to invalidate_range_start()
 	 * have completed but no more MMU notifiers will run, so
--
2.43.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2026-05-03  3:27 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-29 13:15 [PATCH] KVM: x86/xen: Fix sleeping lock in hard IRQ context on PREEMPT_RT shaikh.kamal
2026-03-30 14:18 ` Steven Rostedt
2026-03-30 14:51   ` Woodhouse, David
2026-04-01 15:40     ` Sean Christopherson
2026-04-02  1:30       ` [PATCH v2 0/1] KVM: x86/xen: Fix PREEMPT_RT sleeping lock bug shaikh.kamal
2026-04-02  1:31       ` [PATCH v2 1/1] KVM: x86/xen: Use trylock for fast path event channel delivery shaikh.kamal
2026-04-02  6:36         ` Sebastian Andrzej Siewior
2026-04-02 22:40           ` Sean Christopherson
2026-04-02  6:42       ` [PATCH] KVM: x86/xen: Fix sleeping lock in hard IRQ context on PREEMPT_RT Sebastian Andrzej Siewior
2026-04-02 22:23         ` Sean Christopherson
2026-04-29 22:25       ` [PATCH v2 0/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
2026-04-29 22:25       ` [PATCH v2 1/1] " shaikh.kamal
2026-05-03  3:26         ` kernel test robot
2026-05-03  3:26         ` kernel test robot
  -- strict thread matches above, loose matches on Subject: below --
2026-03-30 11:24 [PATCH] KVM: mmu_notifier: make mn_invalidate_lock non-sleeping for non-blocking invalidations Paolo Bonzini
2026-04-30 14:17 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
2026-04-30  4:48 shaikh.kamal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox