* [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
@ 2026-04-30 4:48 shaikh.kamal
0 siblings, 0 replies; 5+ messages in thread
From: shaikh.kamal @ 2026-04-30 4:48 UTC (permalink / raw)
To: Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, David Rientjes, Shakeel Butt,
linux-mm, linux-kernel, kvm, linux-rt-devel
Cc: pbonzini, skhan, me, shaikh.kamal, syzbot+c3178b6b512446632bac
When an mm undergoes OOM kill, the OOM reaper unmaps memory while
holding the mmap_lock. MMU notifier subscribers (notably KVM) need
to be informed so they can tear down their secondary mappings. The
current synchronous unregister path can deadlock on PREEMPT_RT
because synchronize_srcu() is called from contexts that cannot
safely sleep.
This patch implements the asynchronous cleanup design proposed by
Paolo Bonzini in v1 review: a new optional after_oom_unregister
callback in struct mmu_notifier_ops, invoked after the SRCU grace
period via call_srcu() so that no readers can still reference the
subscription when cleanup runs.
The flow is:
1. The OOM reaper calls mmu_notifier_oom_enter() from
__oom_reap_task_mm().
2. mmu_notifier_oom_enter() walks the subscription list and, for
each subscriber that provides after_oom_unregister, detaches
the subscription from the active list and schedules a
call_srcu() callback.
3. The deferred callback invokes after_oom_unregister once the
grace period has elapsed and all in-flight readers have
finished.
4. Subsystems waiting to free structures referenced by the
callback can call the new mmu_notifier_barrier() helper, which
wraps srcu_barrier() to wait for all outstanding callbacks
scheduled this way.
after_oom_unregister is mutually exclusive with alloc_notifier
because allocated notifiers can have additional outstanding
references that the OOM path cannot safely drop.
KVM is updated to provide after_oom_unregister, which clears
mn_active_invalidate_count, and to detect via hlist_unhashed() in
kvm_destroy_vm() when its subscription was already detached by the
OOM path; in that case it calls mmu_notifier_barrier() and drops
the mm reference rather than calling mmu_notifier_unregister().
Reported-by: syzbot+c3178b6b512446632bac@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c3178b6b512446632bac
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/all/20260209161527.31978-1-shaikhkamal2012@gmail.com/
Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
---
include/linux/mmu_notifier.h | 10 +++
mm/mmu_notifier.c | 123 +++++++++++++++++++++++++++++++++++
mm/oom_kill.c | 3 +
virt/kvm/kvm_main.c | 27 +++++++-
4 files changed, 162 insertions(+), 1 deletion(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 07a2bbaf86e9..0ccd590f55d3 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -88,6 +88,14 @@ struct mmu_notifier_ops {
void (*release)(struct mmu_notifier *subscription,
struct mm_struct *mm);
+ /*
+ * Any mmu notifier that defines this is automatically unregistered
+ * when its mm is the subject of an OOM kill. after_oom_unregister()
+ * is invoked after all other outstanding callbacks have terminated.
+ */
+ void (*after_oom_unregister)(struct mmu_notifier *subscription,
+ struct mm_struct *mm);
+
/*
* clear_flush_young is called after the VM is
* test-and-clearing the young/accessed bitflag in the
@@ -375,6 +383,8 @@ mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
+void mmu_notifier_oom_enter(struct mm_struct *mm);
+extern void mmu_notifier_barrier(void);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a6cdf3674bdc..b8fa58fe6b7d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -49,6 +49,37 @@ struct mmu_notifier_subscriptions {
struct hlist_head deferred_list;
};
+/*
+ * Callback structure for asynchronous OOM cleanup.
+ * Used with call_srcu() to defer after_oom_unregister callbacks
+ * until after SRCU grace period completes.
+ */
+struct mmu_notifier_oom_callback {
+ struct rcu_head rcu;
+ struct mmu_notifier *subscription;
+ struct mm_struct *mm;
+};
+
+/*
+ * Callback function invoked after SRCU grace period.
+ * Safely calls after_oom_unregister once all readers have finished.
+ */
+static void mmu_notifier_oom_callback_fn(struct rcu_head *rcu)
+{
+ struct mmu_notifier_oom_callback *cb =
+ container_of(rcu, struct mmu_notifier_oom_callback, rcu);
+
+ /* Safe - all SRCU readers have finished */
+ cb->subscription->ops->after_oom_unregister(cb->subscription, cb->mm);
+
+ /* Release mm reference taken when callback was scheduled */
+ WARN_ON_ONCE(atomic_read(&cb->mm->mm_count) <= 0);
+ mmdrop(cb->mm);
+
+ /* Free callback structure */
+ kfree(cb);
+}
+
/*
* This is a collision-retry read-side/write-side 'lock', a lot like a
* seqcount, however this allows multiple write-sides to hold it at
@@ -359,6 +390,85 @@ void __mmu_notifier_release(struct mm_struct *mm)
mn_hlist_release(subscriptions, mm);
}
+void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
+ struct mmu_notifier *subscription;
+ struct hlist_node *tmp;
+ HLIST_HEAD(oom_list);
+ int id;
+
+ if (!subscriptions)
+ return;
+
+ id = srcu_read_lock(&srcu);
+
+ /*
+ * Prevent further calls to the MMU notifier, except for
+ * release and after_oom_unregister.
+ */
+ spin_lock(&subscriptions->lock);
+ hlist_for_each_entry_safe(subscription, tmp,
+ &subscriptions->list, hlist) {
+ if (!subscription->ops->after_oom_unregister)
+ continue;
+
+ /*
+ * after_oom_unregister and alloc_notifier are incompatible,
+ * because there could be other references to allocated
+ * notifiers.
+ */
+ if (WARN_ON(subscription->ops->alloc_notifier))
+ continue;
+
+ hlist_del_init_rcu(&subscription->hlist);
+ hlist_add_head(&subscription->hlist, &oom_list);
+ }
+ spin_unlock(&subscriptions->lock);
+ hlist_for_each_entry(subscription, &oom_list, hlist)
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
+
+ srcu_read_unlock(&srcu, id);
+
+ if (hlist_empty(&oom_list))
+ return;
+
+ hlist_for_each_entry_safe(subscription, tmp,
+ &oom_list, hlist) {
+ struct mmu_notifier_oom_callback *cb;
+ /*
+ * Remove from stack-based oom_list and reset hlist to unhashed state.
+ * This sets subscription->hlist.pprev = NULL, so future callers of
+ * mmu_notifier_unregister() (e.g. kvm_destroy_vm) will see
+ * hlist_unhashed() == true and take the safe path, avoiding
+ * use-after-free on the stack-allocated oom_list head.
+ */
+ hlist_del_init(&subscription->hlist);
+
+ /*
+ * GFP_ATOMIC failure is exceedingly rare. We cannot sleep
+ * here (would reintroduce the deadlock this patch fixes)
+ * and cannot call after_oom_unregister synchronously
+ * without first waiting for SRCU readers. The subscriber
+ * will not receive after_oom_unregister but cleanup will
+ * eventually happen via the unregister path.
+ */
+ cb = kmalloc(sizeof(*cb), GFP_ATOMIC);
+ if (!cb)
+ continue;
+
+ cb->subscription = subscription;
+ cb->mm = mm;
+ mmgrab(mm);
+
+ /* Schedule callback - returns immediately */
+ call_srcu(&srcu, &cb->rcu, mmu_notifier_oom_callback_fn);
+ }
+
+}
+
/*
* If no young bitflag is supported by the hardware, ->clear_flush_young can
* unmap the address and return 1 or 0 depending if the mapping previously
@@ -1096,3 +1206,16 @@ void mmu_notifier_synchronize(void)
synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+/**
+ * mmu_notifier_barrier - Wait for all pending MMU notifier callbacks
+ *
+ * Waits for all call_srcu() callbacks scheduled by mmu_notifier_oom_enter()
+ * to complete. Used by subsystems during cleanup to prevent use-after-free
+ * when destroying structures accessed by the callbacks.
+ */
+void mmu_notifier_barrier(void)
+{
+ srcu_barrier(&srcu);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_barrier);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5c6c95c169ee..029e041afc57 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -519,6 +519,9 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
bool ret = true;
MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
+ /* Notify MMU notifiers about the OOM event */
+ mmu_notifier_oom_enter(mm);
+
/*
* Tell all users of get_user/copy_from_user etc... that the content
* is no longer stable. No barriers really needed because unmapping
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1bc1da66b4b0..a2df83d3b413 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -885,6 +885,24 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
srcu_read_unlock(&kvm->srcu, idx);
}
+static void kvm_mmu_notifier_after_oom_unregister(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct kvm *kvm;
+
+ kvm = mmu_notifier_to_kvm(mn);
+
+ /*
+ * At this point the unregister has completed and all other callbacks
+ * have terminated. Clean up any unbalanced invalidation counts.
+ */
+ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+ if (kvm->mn_active_invalidate_count)
+ kvm->mn_active_invalidate_count = 0;
+ else
+ WARN_ON(kvm->mmu_invalidate_in_progress);
+}
+
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
@@ -892,6 +910,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.clear_young = kvm_mmu_notifier_clear_young,
.test_young = kvm_mmu_notifier_test_young,
.release = kvm_mmu_notifier_release,
+ .after_oom_unregister = kvm_mmu_notifier_after_oom_unregister,
};
static int kvm_init_mmu_notifier(struct kvm *kvm)
@@ -1280,7 +1299,13 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm->buses[i] = NULL;
}
kvm_coalesced_mmio_free(kvm);
- mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+ if (hlist_unhashed(&kvm->mmu_notifier.hlist)) {
+ /* Subscription removed by OOM. Wait for async callback. */
+ mmu_notifier_barrier();
+ mmdrop(kvm->mm);
+ } else {
+ mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+ }
/*
* At this point, pending calls to invalidate_range_start()
* have completed but no more MMU notifiers will run, so
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread

* Re: [PATCH] KVM: x86/xen: Fix sleeping lock in hard IRQ context on PREEMPT_RT
@ 2026-04-01 15:40 Sean Christopherson
2026-04-29 22:25 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
0 siblings, 1 reply; 5+ messages in thread
From: Sean Christopherson @ 2026-04-01 15:40 UTC (permalink / raw)
To: David Woodhouse
Cc: rostedt@goodmis.org, shaikhkamal2012@gmail.com,
syzbot+919877893c9d28162dc2@syzkaller.appspotmail.com,
me@brighamcampbell.com, linux-rt-devel@lists.linux.dev,
hpa@zytor.com, linux-kernel@vger.kernel.org, paul@xen.org,
kvm@vger.kernel.org, skhan@linuxfoundation.org
On Mon, Mar 30, 2026, David Woodhouse wrote:
> On Mon, 2026-03-30 at 10:18 -0400, Steven Rostedt wrote:
> >
> > > +static void xen_timer_inject_irqwork(struct irq_work *work)
> > > +{
> > > + struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen,
> > > + timer_inject_irqwork);
> > > + struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen);
> > > + struct kvm_xen_evtchn e;
> > > + int rc;
> > > +
> > > + e.vcpu_id = vcpu->vcpu_id;
> > > + e.vcpu_idx = vcpu->vcpu_idx;
> > > + e.port = vcpu->arch.xen.timer_virq;
> > > + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
> > > +
> > > + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
> > > + if (rc != -EWOULDBLOCK)
> > > + vcpu->arch.xen.timer_expires = 0;
> > > +}
> >
> > Why duplicate this code and not simply make a static inline helper
> > function that is used in both places?
>
> It's already duplicating the functionality; the original
> xen_timer_callback() will already fall back to injecting the IRQ in
> process context when it needs to (by setting vcpu-
> >arch.xen.timer_pending and then setting KVM_REQ_UNBLOCK).
>
> All you had to do was make kvm_xen_set_evtchn_fast() return
> -EWOULDBLOCK in the in_hardirq() case in order to use the existing
> fallback, surely?
>
> Better still, can't kvm_xen_set_evtchn_fast() just use read_trylock()
> instead?
Re-reading through the thread where you proposed using trylock, and through
commit bbe17c625d68 ("KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest()"),
I think I agree with using trylock for "fast" paths.
Though I would prefer to not make it unconditional for the "fast" helper instead
of conditional based on in_interrupt(). And before we start doing surgery to
"fix" a setup no one uses, and also before we use gpcs more broadly, I think we
should try to up-level the gpc APIs to reduce the amount of duplicate, boilerplate
code. kvm_xen_update_runstate_guest() and maybe kvm_xen_set_evtchn() will likely
need to open code some amount of logic, but the remaining flows should be able
to share the common acquire/release helpers.
Side topic, looks like kvm_xen_shared_info_init() is buggy in that it fails to
mark the slot as dirty.
E.g. sans the API implementations, I think we can and should end up with code
like this:
---
arch/x86/kvm/x86.c | 14 ++---
arch/x86/kvm/xen.c | 127 ++++++++++++---------------------------------
2 files changed, 37 insertions(+), 104 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b5d48e75b65..65bad25fd9d4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3274,15 +3274,8 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ if (kvm_gpc_acquire(gpc))
+ return;
guest_hv_clock = (void *)(gpc->khva + offset);
@@ -3305,8 +3298,7 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
guest_hv_clock->version = ++hv_clock.version;
- kvm_gpc_mark_dirty_in_slot(gpc);
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_dirty(gpc);
trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 91fd3673c09a..a97fd88ee99c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -42,19 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
- int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
+ int ret;
- read_lock_irq(&gpc->lock);
- while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
- read_unlock_irq(&gpc->lock);
-
- ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
- if (ret)
- goto out;
-
- read_lock_irq(&gpc->lock);
- }
+ ret = kvm_gpc_acquire(gpc);
+ if (ret)
+ goto out;
/*
* This code mirrors kvm_write_wall_clock() except that it writes
@@ -96,7 +89,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
smp_wmb();
wc->version = wc_version + 1;
- read_unlock_irq(&gpc->lock);
+ kvm_gpc_release_dirty(gpc);
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
@@ -155,22 +148,14 @@ static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
struct gfn_to_pfn_cache *gpc,
unsigned int offset)
{
- unsigned long flags;
int r;
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
- if (r)
- return r;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ r = kvm_gpc_acquire(gpc);
+ if (r)
+ return r;
memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_clean(gpc);
/*
* Sanity check TSC shift+multiplier to verify the guest's view of time
@@ -420,27 +405,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* Attempt to obtain the GPC lock on *both* (if there are two)
* gfn_to_pfn caches that cover the region.
*/
- if (atomic) {
- local_irq_save(flags);
- if (!read_trylock(&gpc1->lock)) {
- local_irq_restore(flags);
- return;
- }
- } else {
- read_lock_irqsave(&gpc1->lock, flags);
- }
- while (!kvm_gpc_check(gpc1, user_len1)) {
- read_unlock_irqrestore(&gpc1->lock, flags);
-
- /* When invoked from kvm_sched_out() we cannot sleep */
- if (atomic)
- return;
-
- if (kvm_gpc_refresh(gpc1, user_len1))
- return;
-
- read_lock_irqsave(&gpc1->lock, flags);
- }
+ if (__kvm_gpc_acquire(gpc, atomic))
+ return;
if (likely(!user_len2)) {
/*
@@ -465,6 +431,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* gpc1 lock to make lockdep shut up about it.
*/
lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+
if (atomic) {
if (!read_trylock(&gpc2->lock)) {
read_unlock_irqrestore(&gpc1->lock, flags);
@@ -575,13 +542,10 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}
- if (user_len2) {
- kvm_gpc_mark_dirty_in_slot(gpc2);
- read_unlock(&gpc2->lock);
- }
+ if (user_len2)
+ kvm_gpc_release_dirty(gpc2);
- kvm_gpc_mark_dirty_in_slot(gpc1);
- read_unlock_irqrestore(&gpc1->lock, flags);
+ kvm_gpc_release_dirty(gpc1);
}
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -645,20 +609,8 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
if (!evtchn_pending_sel)
return;
- /*
- * Yes, this is an open-coded loop. But that's just what put_user()
- * does anyway. Page it in and retry the instruction. We're just a
- * little more honest about it.
- */
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ if (kvm_gpc_acquire(gpc))
+ return;
/* Now gpc->khva is a valid kernel address for the vcpu_info */
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
@@ -686,8 +638,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
- kvm_gpc_mark_dirty_in_slot(gpc);
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_dirty(gpc);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
@@ -697,8 +648,8 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
- unsigned long flags;
u8 rc = 0;
+ int r;
/*
* If the global upcall vector (HVMIRQ_callback_vector) is set and
@@ -713,33 +664,23 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- /*
- * This function gets called from kvm_vcpu_block() after setting the
- * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
- * from a HLT. So we really mustn't sleep. If the page ended up absent
- * at that point, just return 1 in order to trigger an immediate wake,
- * and we'll end up getting called again from a context where we *can*
- * fault in the page and wait for it.
- */
- if (in_atomic() || !task_is_running(current))
- return 1;
-
- if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
- /*
- * If this failed, userspace has screwed up the
- * vcpu_info mapping. No interrupts for you.
- */
- return 0;
- }
- read_lock_irqsave(&gpc->lock, flags);
- }
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ *
+ * If acquiring the cache fails completely, then userspace has screwed
+ * up the vcpu_info mapping. No interrupts for you.
+ */
+ r = __kvm_gpc_acquire(gpc, in_atomic() || !task_is_running(current));
+ if (r)
+ return r == -EWOULDBLOCK ? 1 : 0;
rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_clean(gpc);
return rc;
}
base-commit: 3d6cdcc8883b5726513d245eef0e91cabfc397f7
--
[*] https://lore.kernel.org/all/76c61e1cb86e04df892d74c10976597700fe4cb5.camel@infradead.org
^ permalink raw reply related [flat|nested] 5+ messages in thread

* [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
2026-04-01 15:40 [PATCH] KVM: x86/xen: Fix sleeping lock in hard IRQ context on PREEMPT_RT Sean Christopherson
@ 2026-04-29 22:25 ` shaikh.kamal
2026-05-03 3:26 ` kernel test robot
2026-05-03 3:26 ` kernel test robot
0 siblings, 2 replies; 5+ messages in thread
From: shaikh.kamal @ 2026-04-29 22:25 UTC (permalink / raw)
To: Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, David Rientjes, Shakeel Butt,
linux-mm, linux-kernel, kvm, linux-rt-devel
Cc: pbonzini, skhan, me, syzbot+c3178b6b512446632bac, shaikh.kamal
When an mm undergoes OOM kill, the OOM reaper unmaps memory while
holding the mmap_lock. MMU notifier subscribers (notably KVM) need
to be informed so they can tear down their secondary mappings. The
current synchronous unregister path can deadlock on PREEMPT_RT
because synchronize_srcu() is called from contexts that cannot
safely sleep.
This patch implements the asynchronous cleanup design proposed by
Paolo Bonzini in v1 review: a new optional after_oom_unregister
callback in struct mmu_notifier_ops, invoked after the SRCU grace
period via call_srcu() so that no readers can still reference the
subscription when cleanup runs.
The flow is:
1. The OOM reaper calls mmu_notifier_oom_enter() from
__oom_reap_task_mm().
2. mmu_notifier_oom_enter() walks the subscription list and, for
each subscriber that provides after_oom_unregister, detaches
the subscription from the active list and schedules a
call_srcu() callback.
3. The deferred callback invokes after_oom_unregister once the
grace period has elapsed and all in-flight readers have
finished.
4. Subsystems waiting to free structures referenced by the
callback can call the new mmu_notifier_barrier() helper, which
wraps srcu_barrier() to wait for all outstanding callbacks
scheduled this way.
after_oom_unregister is mutually exclusive with alloc_notifier
because allocated notifiers can have additional outstanding
references that the OOM path cannot safely drop.
KVM is updated to provide after_oom_unregister, which clears
mn_active_invalidate_count, and to detect via hlist_unhashed() in
kvm_destroy_vm() when its subscription was already detached by the
OOM path; in that case it calls mmu_notifier_barrier() and drops
the mm reference rather than calling mmu_notifier_unregister().
Reported-by: syzbot+c3178b6b512446632bac@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c3178b6b512446632bac
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/all/20260209161527.31978-1-shaikhkamal2012@gmail.com/
Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
---
include/linux/mmu_notifier.h | 10 +++
mm/mmu_notifier.c | 123 +++++++++++++++++++++++++++++++++++
mm/oom_kill.c | 3 +
virt/kvm/kvm_main.c | 27 +++++++-
4 files changed, 162 insertions(+), 1 deletion(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 07a2bbaf86e9..0ccd590f55d3 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -88,6 +88,14 @@ struct mmu_notifier_ops {
void (*release)(struct mmu_notifier *subscription,
struct mm_struct *mm);
+ /*
+ * Any mmu notifier that defines this is automatically unregistered
+ * when its mm is the subject of an OOM kill. after_oom_unregister()
+ * is invoked after all other outstanding callbacks have terminated.
+ */
+ void (*after_oom_unregister)(struct mmu_notifier *subscription,
+ struct mm_struct *mm);
+
/*
* clear_flush_young is called after the VM is
* test-and-clearing the young/accessed bitflag in the
@@ -375,6 +383,8 @@ mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
+void mmu_notifier_oom_enter(struct mm_struct *mm);
+extern void mmu_notifier_barrier(void);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a6cdf3674bdc..b8fa58fe6b7d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -49,6 +49,37 @@ struct mmu_notifier_subscriptions {
struct hlist_head deferred_list;
};
+/*
+ * Callback structure for asynchronous OOM cleanup.
+ * Used with call_srcu() to defer after_oom_unregister callbacks
+ * until after SRCU grace period completes.
+ */
+struct mmu_notifier_oom_callback {
+ struct rcu_head rcu;
+ struct mmu_notifier *subscription;
+ struct mm_struct *mm;
+};
+
+/*
+ * Callback function invoked after SRCU grace period.
+ * Safely calls after_oom_unregister once all readers have finished.
+ */
+static void mmu_notifier_oom_callback_fn(struct rcu_head *rcu)
+{
+ struct mmu_notifier_oom_callback *cb =
+ container_of(rcu, struct mmu_notifier_oom_callback, rcu);
+
+ /* Safe - all SRCU readers have finished */
+ cb->subscription->ops->after_oom_unregister(cb->subscription, cb->mm);
+
+ /* Release mm reference taken when callback was scheduled */
+ WARN_ON_ONCE(atomic_read(&cb->mm->mm_count) <= 0);
+ mmdrop(cb->mm);
+
+ /* Free callback structure */
+ kfree(cb);
+}
+
/*
* This is a collision-retry read-side/write-side 'lock', a lot like a
* seqcount, however this allows multiple write-sides to hold it at
@@ -359,6 +390,85 @@ void __mmu_notifier_release(struct mm_struct *mm)
mn_hlist_release(subscriptions, mm);
}
+void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
+ struct mmu_notifier *subscription;
+ struct hlist_node *tmp;
+ HLIST_HEAD(oom_list);
+ int id;
+
+ if (!subscriptions)
+ return;
+
+ id = srcu_read_lock(&srcu);
+
+ /*
+ * Prevent further calls to the MMU notifier, except for
+ * release and after_oom_unregister.
+ */
+ spin_lock(&subscriptions->lock);
+ hlist_for_each_entry_safe(subscription, tmp,
+ &subscriptions->list, hlist) {
+ if (!subscription->ops->after_oom_unregister)
+ continue;
+
+ /*
+ * after_oom_unregister and alloc_notifier are incompatible,
+ * because there could be other references to allocated
+ * notifiers.
+ */
+ if (WARN_ON(subscription->ops->alloc_notifier))
+ continue;
+
+ hlist_del_init_rcu(&subscription->hlist);
+ hlist_add_head(&subscription->hlist, &oom_list);
+ }
+ spin_unlock(&subscriptions->lock);
+ hlist_for_each_entry(subscription, &oom_list, hlist)
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
+
+ srcu_read_unlock(&srcu, id);
+
+ if (hlist_empty(&oom_list))
+ return;
+
+ hlist_for_each_entry_safe(subscription, tmp,
+ &oom_list, hlist) {
+ struct mmu_notifier_oom_callback *cb;
+ /*
+ * Remove from stack-based oom_list and reset hlist to unhashed state.
+ * This sets subscription->hlist.pprev = NULL, so future callers of
+ * mmu_notifier_unregister() (e.g. kvm_destroy_vm) will see
+ * hlist_unhashed() == true and take the safe path, avoiding
+ * use-after-free on the stack-allocated oom_list head.
+ */
+ hlist_del_init(&subscription->hlist);
+
+ /*
+ * GFP_ATOMIC failure is exceedingly rare. We cannot sleep
+ * here (would reintroduce the deadlock this patch fixes)
+ * and cannot call after_oom_unregister synchronously
+ * without first waiting for SRCU readers. The subscriber
+ * will not receive after_oom_unregister but cleanup will
+ * eventually happen via the unregister path.
+ */
+ cb = kmalloc(sizeof(*cb), GFP_ATOMIC);
+ if (!cb)
+ continue;
+
+ cb->subscription = subscription;
+ cb->mm = mm;
+ mmgrab(mm);
+
+ /* Schedule callback - returns immediately */
+ call_srcu(&srcu, &cb->rcu, mmu_notifier_oom_callback_fn);
+ }
+
+}
+
/*
* If no young bitflag is supported by the hardware, ->clear_flush_young can
* unmap the address and return 1 or 0 depending if the mapping previously
@@ -1096,3 +1206,16 @@ void mmu_notifier_synchronize(void)
synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+/**
+ * mmu_notifier_barrier - Wait for all pending MMU notifier callbacks
+ *
+ * Waits for all call_srcu() callbacks scheduled by mmu_notifier_oom_enter()
+ * to complete. Used by subsystems during cleanup to prevent use-after-free
+ * when destroying structures accessed by the callbacks.
+ */
+void mmu_notifier_barrier(void)
+{
+ srcu_barrier(&srcu);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_barrier);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5c6c95c169ee..029e041afc57 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -519,6 +519,9 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
bool ret = true;
MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
+ /* Notify MMU notifiers about the OOM event */
+ mmu_notifier_oom_enter(mm);
+
/*
* Tell all users of get_user/copy_from_user etc... that the content
* is no longer stable. No barriers really needed because unmapping
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1bc1da66b4b0..a2df83d3b413 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -885,6 +885,24 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
srcu_read_unlock(&kvm->srcu, idx);
}
+static void kvm_mmu_notifier_after_oom_unregister(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct kvm *kvm;
+
+ kvm = mmu_notifier_to_kvm(mn);
+
+ /*
+ * At this point the unregister has completed and all other callbacks
+ * have terminated. Clean up any unbalanced invalidation counts.
+ */
+ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+ if (kvm->mn_active_invalidate_count)
+ kvm->mn_active_invalidate_count = 0;
+ else
+ WARN_ON(kvm->mmu_invalidate_in_progress);
+}
+
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
@@ -892,6 +910,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.clear_young = kvm_mmu_notifier_clear_young,
.test_young = kvm_mmu_notifier_test_young,
.release = kvm_mmu_notifier_release,
+ .after_oom_unregister = kvm_mmu_notifier_after_oom_unregister,
};
static int kvm_init_mmu_notifier(struct kvm *kvm)
@@ -1280,7 +1299,13 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm->buses[i] = NULL;
}
kvm_coalesced_mmio_free(kvm);
- mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+ if (hlist_unhashed(&kvm->mmu_notifier.hlist)) {
+ /* Subscription removed by OOM. Wait for async callback. */
+ mmu_notifier_barrier();
+ mmdrop(kvm->mm);
+ } else {
+ mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+ }
/*
* At this point, pending calls to invalidate_range_start()
* have completed but no more MMU notifiers will run, so
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread

* Re: [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
2026-04-29 22:25 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
@ 2026-05-03 3:26 ` kernel test robot
2026-05-03 3:26 ` kernel test robot
1 sibling, 0 replies; 5+ messages in thread
From: kernel test robot @ 2026-05-03 3:26 UTC (permalink / raw)
To: shaikh.kamal, Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka,
Mike Rapoport, Suren Baghdasaryan, Michal Hocko, David Rientjes,
Shakeel Butt, linux-mm, linux-kernel, kvm, linux-rt-devel
Cc: llvm, oe-kbuild-all, pbonzini, skhan, me,
syzbot+c3178b6b512446632bac, shaikh.kamal
Hi shaikh.kamal,
kernel test robot noticed the following build errors:
[auto build test ERROR on v7.0]
[cannot apply to akpm-mm/mm-everything kvm/queue kvm/next kvm/linux-next v7.1-rc1 linus/master next-20260430]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/shaikh-kamal/mm-mmu_notifier-Add-async-OOM-cleanup-via-call_srcu/20260430-202943
base: v7.0
patch link: https://lore.kernel.org/r/20260429222548.25475-1-shaikhkamal2012%40gmail.com
patch subject: [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
config: hexagon-allnoconfig (https://download.01.org/0day-ci/archive/20260503/202605031115.qmkkOLQc-lkp@intel.com/config)
compiler: clang version 23.0.0git (https://github.com/llvm/llvm-project 5bac06718f502014fade905512f1d26d578a18f3)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260503/202605031115.qmkkOLQc-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605031115.qmkkOLQc-lkp@intel.com/
All errors (new ones prefixed by >>):
>> mm/oom_kill.c:523:2: error: call to undeclared function 'mmu_notifier_oom_enter'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
523 | mmu_notifier_oom_enter(mm);
| ^
mm/oom_kill.c:523:2: note: did you mean 'mmu_notifier_release'?
include/linux/mmu_notifier.h:610:20: note: 'mmu_notifier_release' declared here
610 | static inline void mmu_notifier_release(struct mm_struct *mm)
| ^
mm/oom_kill.c:511:28: warning: variable 'oom_reaper_th' set but not used [-Wunused-but-set-global]
511 | static struct task_struct *oom_reaper_th;
| ^
1 warning and 1 error generated.
vim +/mmu_notifier_oom_enter +523 mm/oom_kill.c
515
516 static bool __oom_reap_task_mm(struct mm_struct *mm)
517 {
518 struct vm_area_struct *vma;
519 bool ret = true;
520 MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
521
522 /* Notify MMU notifiers about the OOM event */
> 523 mmu_notifier_oom_enter(mm);
524
525 /*
526 * Tell all users of get_user/copy_from_user etc... that the content
527 * is no longer stable. No barriers really needed because unmapping
528 * should imply barriers already and the reader would hit a page fault
529 * if it stumbled over a reaped memory.
530 */
531 mm_flags_set(MMF_UNSTABLE, mm);
532
533 /*
534 * It might start racing with the dying task and compete for shared
535 * resources - e.g. page table lock contention has been observed.
536 * Reduce those races by reaping the oom victim from the other end
537 * of the address space.
538 */
539 mas_for_each_rev(&mas, vma, 0) {
540 if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
541 continue;
542
543 /*
544 * Only anonymous pages have a good chance to be dropped
545 * without additional steps which we cannot afford as we
546 * are OOM already.
547 *
548 * We do not even care about fs backed pages because all
549 * which are reclaimable have already been reclaimed and
550 * we do not want to block exit_mmap by keeping mm ref
551 * count elevated without a good reason.
552 */
553 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
554 struct mmu_notifier_range range;
555 struct mmu_gather tlb;
556
557 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
558 mm, vma->vm_start,
559 vma->vm_end);
560 tlb_gather_mmu(&tlb, mm);
561 if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
562 tlb_finish_mmu(&tlb);
563 ret = false;
564 continue;
565 }
566 unmap_page_range(&tlb, vma, range.start, range.end, NULL);
567 mmu_notifier_invalidate_range_end(&range);
568 tlb_finish_mmu(&tlb);
569 }
570 }
571
572 return ret;
573 }
574
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 5+ messages in thread

* Re: [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
2026-04-29 22:25 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
2026-05-03 3:26 ` kernel test robot
@ 2026-05-03 3:26 ` kernel test robot
1 sibling, 0 replies; 5+ messages in thread
From: kernel test robot @ 2026-05-03 3:26 UTC (permalink / raw)
To: shaikh.kamal, Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka,
Mike Rapoport, Suren Baghdasaryan, Michal Hocko, David Rientjes,
Shakeel Butt, linux-mm, linux-kernel, kvm, linux-rt-devel
Cc: oe-kbuild-all, pbonzini, skhan, me, syzbot+c3178b6b512446632bac,
shaikh.kamal
Hi shaikh.kamal,
kernel test robot noticed the following build errors:
[auto build test ERROR on v7.0]
[cannot apply to akpm-mm/mm-everything kvm/queue kvm/next kvm/linux-next v7.1-rc1 linus/master next-20260430]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/shaikh-kamal/mm-mmu_notifier-Add-async-OOM-cleanup-via-call_srcu/20260430-202943
base: v7.0
patch link: https://lore.kernel.org/r/20260429222548.25475-1-shaikhkamal2012%40gmail.com
patch subject: [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
config: arc-allnoconfig (https://download.01.org/0day-ci/archive/20260503/202605031109.uxckW5L3-lkp@intel.com/config)
compiler: arc-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260503/202605031109.uxckW5L3-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605031109.uxckW5L3-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/oom_kill.c: In function '__oom_reap_task_mm':
>> mm/oom_kill.c:523:9: error: implicit declaration of function 'mmu_notifier_oom_enter'; did you mean 'mmu_notifier_release'? [-Wimplicit-function-declaration]
523 | mmu_notifier_oom_enter(mm);
| ^~~~~~~~~~~~~~~~~~~~~~
| mmu_notifier_release
vim +523 mm/oom_kill.c
515
516 static bool __oom_reap_task_mm(struct mm_struct *mm)
517 {
518 struct vm_area_struct *vma;
519 bool ret = true;
520 MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
521
522 /* Notify MMU notifiers about the OOM event */
> 523 mmu_notifier_oom_enter(mm);
524
525 /*
526 * Tell all users of get_user/copy_from_user etc... that the content
527 * is no longer stable. No barriers really needed because unmapping
528 * should imply barriers already and the reader would hit a page fault
529 * if it stumbled over a reaped memory.
530 */
531 mm_flags_set(MMF_UNSTABLE, mm);
532
533 /*
534 * It might start racing with the dying task and compete for shared
535 * resources - e.g. page table lock contention has been observed.
536 * Reduce those races by reaping the oom victim from the other end
537 * of the address space.
538 */
539 mas_for_each_rev(&mas, vma, 0) {
540 if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
541 continue;
542
543 /*
544 * Only anonymous pages have a good chance to be dropped
545 * without additional steps which we cannot afford as we
546 * are OOM already.
547 *
548 * We do not even care about fs backed pages because all
549 * which are reclaimable have already been reclaimed and
550 * we do not want to block exit_mmap by keeping mm ref
551 * count elevated without a good reason.
552 */
553 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
554 struct mmu_notifier_range range;
555 struct mmu_gather tlb;
556
557 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
558 mm, vma->vm_start,
559 vma->vm_end);
560 tlb_gather_mmu(&tlb, mm);
561 if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
562 tlb_finish_mmu(&tlb);
563 ret = false;
564 continue;
565 }
566 unmap_page_range(&tlb, vma, range.start, range.end, NULL);
567 mmu_notifier_invalidate_range_end(&range);
568 tlb_finish_mmu(&tlb);
569 }
570 }
571
572 return ret;
573 }
574
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] KVM: mmu_notifier: make mn_invalidate_lock non-sleeping for non-blocking invalidations
@ 2026-03-30 11:24 Paolo Bonzini
2026-04-30 14:17 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
0 siblings, 1 reply; 5+ messages in thread
From: Paolo Bonzini @ 2026-03-30 11:24 UTC (permalink / raw)
To: shaikh kamaluddin
Cc: Sean Christopherson, Sebastian Andrzej Siewior, kvm,
Kernel Mailing List, Linux, linux-rt-devel, Shuah Khan, me
[-- Attachment #1: Type: text/plain, Size: 1406 bytes --]
On Sat, Mar 28, 2026 at 3:50 PM shaikh kamaluddin
<shaikhkamal2012@gmail.com> wrote:
> +void __mmu_notifier_oom_enter(struct mm_struct *mm)
> +{
> + struct mmu_notifier *subscription;
> + int id;
> + pr_info("Entering :func:%s\n", __func__);
> + if (!mm->notifier_subscriptions)
> + return;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(subscription,
> + &mm->notifier_subscriptions->list, hlist,
> + rcu_read_lock_held(&srcu)) {
> + if(subscription->ops->oom_enter)
> + subscription->ops->oom_enter(subscription, mm);
> +
> + }
> + srcu_read_unlock(&srcu, id);
> + pr_info("Done:%s\n", __func__);
Yeah, calling mmu_notifier_unregister() won't work from within this function.
One possibility is for the new method to be something like this:
void (*after_oom_unregister)(struct mmu_notifier *subscription);
So it only has to do
kvm->mn_registered = false; /* or xchg, it's the same */
WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
if (kvm->mn_active_invalidate_count)
kvm->mn_active_invalidate_count = 0;
else
WARN_ON(kvm->mmu_invalidate_in_progress);
or something like that. See the attached sketch, feel free to reuse it
as you see fit.
Paolo
[-- Attachment #2: mm.patch --]
[-- Type: text/x-patch, Size: 4393 bytes --]
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3b670ee4eb26..7b14d8099cc1 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1973,12 +1973,15 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
struct x86_exception *exception,
u64 pte_access)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_mmu *mmu = vcpu->arch.mmu;
BUG_ON(!mmu_is_nested(vcpu));
- /* NPT walks are always user-walks */
- access |= PFERR_USER_MASK;
+ /* Non-GMET walks are always user-walks */
+ if (!(svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_GMET_ENABLE))
+ access |= PFERR_USER_MASK;
+
return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e4cb317807ab..4a1c1f5297c4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7444,6 +7444,15 @@ static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
struct kvm_mmu *mmu = vcpu->arch.mmu;
BUG_ON(!mmu_is_nested(vcpu));
+
+ /*
+ * MBEC differentiates based on the effective U/S bit of
+ * the guest page tables; not the processor CPL.
+ */
+ access &= ~PFERR_USER_MASK;
+ if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK))
+ access |= PFERR_USER_MASK;
+
return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
}
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 8450e18a87c2..3c67ec15c09c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -212,6 +212,14 @@ struct mmu_notifier_ops {
*/
struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
void (*free_notifier)(struct mmu_notifier *subscription);
+
+ /*
+ * Any mmu notifier that defines this is automatically unregistered
+ * when its mm is the subject of an OOM kill. after_oom_unregister()
+ * is invoked after all other outstanding callbacks have terminated.
+ */
+ void (*after_oom_unregister)(struct mmu_notifier *subscription,
+ struct mm_struct *mm);
};
/*
@@ -287,6 +295,7 @@ mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
}
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);
+void mmu_notifier_oom_enter(struct mm_struct *mm);
extern int mmu_notifier_register(struct mmu_notifier *subscription,
struct mm_struct *mm);
@@ -661,6 +670,10 @@ static inline void mmu_notifier_synchronize(void)
{
}
+static inline void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+}
+
#endif /* CONFIG_MMU_NOTIFIER */
#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a6cdf3674bdc..deba056468b1 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -835,6 +835,56 @@ void mmu_notifier_unregister(struct mmu_notifier *subscription,
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+ struct mmu_notifier_subscriptions *subscriptions = mm->notifier_subscriptions;
+ struct mmu_notifier *subscription;
+ struct hlist_node *tmp;
+ HLIST_HEAD(oom_list);
+ int id;
+
+ id = srcu_read_lock(&srcu);
+
+ /*
+ * Prevent further calls to the MMU notifier, except for
+ * release and after_oom_unregister.
+ */
+ spin_lock(&subscriptions->lock);
+ hlist_for_each_entry_safe(subscription, tmp, &subscriptions->list, hlist) {
+ if (!subscription->ops->after_oom_unregister)
+ continue;
+
+ /*
+ * after_oom_unregister and alloc_notifier are incompatible,
+ * because there could be other references to allocated
+ * notifiers.
+ */
+ if (WARN_ON(subscription->ops->alloc_notifier))
+ continue;
+
+ hlist_del_init_rcu(&subscription->hlist);
+ hlist_add_head(&subscription->hlist, &oom_list);
+ }
+ spin_unlock(&subscriptions->lock);
+
+ hlist_for_each_entry(subscription, &oom_list, hlist)
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
+ srcu_read_unlock(&srcu, id);
+
+ if (hlist_empty(&oom_list))
+ return;
+
+ synchronize_srcu(&srcu);
+
+ hlist_for_each_entry_safe(subscription, tmp, &oom_list, hlist) {
+ subscription->ops->after_oom_unregister(subscription, mm);
+
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+ mmdrop(mm);
+ }
+}
+
static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
struct mmu_notifier *subscription =
^ permalink raw reply related [flat|nested] 5+ messages in thread

* [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu()
2026-03-30 11:24 [PATCH] KVM: mmu_notifier: make mn_invalidate_lock non-sleeping for non-blocking invalidations Paolo Bonzini
@ 2026-04-30 14:17 ` shaikh.kamal
0 siblings, 0 replies; 5+ messages in thread
From: shaikh.kamal @ 2026-04-30 14:17 UTC (permalink / raw)
To: Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, David Rientjes, Shakeel Butt,
linux-mm, linux-kernel, kvm, linux-rt-devel
Cc: pbonzini, skhan, me, shaikh.kamal, syzbot+c3178b6b512446632bac
When an mm undergoes OOM kill, the OOM reaper unmaps memory while
holding the mmap_lock. MMU notifier subscribers (notably KVM) need
to be informed so they can tear down their secondary mappings. The
current synchronous unregister path can deadlock on PREEMPT_RT
because synchronize_srcu() is called from contexts that cannot
safely sleep.
This patch implements the asynchronous cleanup design proposed by
Paolo Bonzini in v1 review: a new optional after_oom_unregister
callback in struct mmu_notifier_ops, invoked after the SRCU grace
period via call_srcu() so that no readers can still reference the
subscription when cleanup runs.
The flow is:
1. The OOM reaper calls mmu_notifier_oom_enter() from
__oom_reap_task_mm().
2. mmu_notifier_oom_enter() walks the subscription list and, for
each subscriber that provides after_oom_unregister, detaches
the subscription from the active list and schedules a
call_srcu() callback.
3. The deferred callback invokes after_oom_unregister once the
grace period has elapsed and all in-flight readers have
finished.
4. Subsystems waiting to free structures referenced by the
callback can call the new mmu_notifier_barrier() helper, which
wraps srcu_barrier() to wait for all outstanding callbacks
scheduled this way.
after_oom_unregister is mutually exclusive with alloc_notifier
because allocated notifiers can have additional outstanding
references that the OOM path cannot safely drop.
KVM is updated to provide after_oom_unregister, which clears
mn_active_invalidate_count, and to detect via hlist_unhashed() in
kvm_destroy_vm() when its subscription was already detached by the
OOM path; in that case it calls mmu_notifier_barrier() and drops
the mm reference rather than calling mmu_notifier_unregister().
Reported-by: syzbot+c3178b6b512446632bac@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c3178b6b512446632bac
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/all/20260209161527.31978-1-shaikhkamal2012@gmail.com/
Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
---
include/linux/mmu_notifier.h | 10 +++
mm/mmu_notifier.c | 123 +++++++++++++++++++++++++++++++++++
mm/oom_kill.c | 3 +
virt/kvm/kvm_main.c | 27 +++++++-
4 files changed, 162 insertions(+), 1 deletion(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 07a2bbaf86e9..0ccd590f55d3 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -88,6 +88,14 @@ struct mmu_notifier_ops {
void (*release)(struct mmu_notifier *subscription,
struct mm_struct *mm);
+ /*
+ * Any mmu notifier that defines this is automatically unregistered
+ * when its mm is the subject of an OOM kill. after_oom_unregister()
+ * is invoked after all other outstanding callbacks have terminated.
+ */
+ void (*after_oom_unregister)(struct mmu_notifier *subscription,
+ struct mm_struct *mm);
+
/*
* clear_flush_young is called after the VM is
* test-and-clearing the young/accessed bitflag in the
@@ -375,6 +383,8 @@ mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
+void mmu_notifier_oom_enter(struct mm_struct *mm);
+extern void mmu_notifier_barrier(void);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a6cdf3674bdc..b8fa58fe6b7d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -49,6 +49,37 @@ struct mmu_notifier_subscriptions {
struct hlist_head deferred_list;
};
+/*
+ * Callback structure for asynchronous OOM cleanup.
+ * Used with call_srcu() to defer after_oom_unregister callbacks
+ * until after SRCU grace period completes.
+ */
+struct mmu_notifier_oom_callback {
+ struct rcu_head rcu;
+ struct mmu_notifier *subscription;
+ struct mm_struct *mm;
+};
+
+/*
+ * Callback function invoked after SRCU grace period.
+ * Safely calls after_oom_unregister once all readers have finished.
+ */
+static void mmu_notifier_oom_callback_fn(struct rcu_head *rcu)
+{
+ struct mmu_notifier_oom_callback *cb =
+ container_of(rcu, struct mmu_notifier_oom_callback, rcu);
+
+ /* Safe - all SRCU readers have finished */
+ cb->subscription->ops->after_oom_unregister(cb->subscription, cb->mm);
+
+ /* Release mm reference taken when callback was scheduled */
+ WARN_ON_ONCE(atomic_read(&cb->mm->mm_count) <= 0);
+ mmdrop(cb->mm);
+
+ /* Free callback structure */
+ kfree(cb);
+}
+
/*
* This is a collision-retry read-side/write-side 'lock', a lot like a
* seqcount, however this allows multiple write-sides to hold it at
@@ -359,6 +390,85 @@ void __mmu_notifier_release(struct mm_struct *mm)
mn_hlist_release(subscriptions, mm);
}
+void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
+ struct mmu_notifier *subscription;
+ struct hlist_node *tmp;
+ HLIST_HEAD(oom_list);
+ int id;
+
+ if (!subscriptions)
+ return;
+
+ id = srcu_read_lock(&srcu);
+
+ /*
+ * Prevent further calls to the MMU notifier, except for
+ * release and after_oom_unregister.
+ */
+ spin_lock(&subscriptions->lock);
+ hlist_for_each_entry_safe(subscription, tmp,
+ &subscriptions->list, hlist) {
+ if (!subscription->ops->after_oom_unregister)
+ continue;
+
+ /*
+ * after_oom_unregister and alloc_notifier are incompatible,
+ * because there could be other references to allocated
+ * notifiers.
+ */
+ if (WARN_ON(subscription->ops->alloc_notifier))
+ continue;
+
+ hlist_del_init_rcu(&subscription->hlist);
+ hlist_add_head(&subscription->hlist, &oom_list);
+ }
+ spin_unlock(&subscriptions->lock);
+ hlist_for_each_entry(subscription, &oom_list, hlist)
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
+
+ srcu_read_unlock(&srcu, id);
+
+ if (hlist_empty(&oom_list))
+ return;
+
+ hlist_for_each_entry_safe(subscription, tmp,
+ &oom_list, hlist) {
+ struct mmu_notifier_oom_callback *cb;
+ /*
+ * Remove from stack-based oom_list and reset hlist to unhashed state.
+ * This sets subscription->hlist.pprev = NULL, so future callers of
+ * mmu_notifier_unregister() (e.g. kvm_destroy_vm) will see
+ * hlist_unhashed() == true and take the safe path, avoiding
+ * use-after-free on the stack-allocated oom_list head.
+ */
+ hlist_del_init(&subscription->hlist);
+
+ /*
+ * GFP_ATOMIC failure is exceedingly rare. We cannot sleep
+ * here (would reintroduce the deadlock this patch fixes)
+ * and cannot call after_oom_unregister synchronously
+ * without first waiting for SRCU readers. The subscriber
+ * will not receive after_oom_unregister but cleanup will
+ * eventually happen via the unregister path.
+ */
+ cb = kmalloc(sizeof(*cb), GFP_ATOMIC);
+ if (!cb)
+ continue;
+
+ cb->subscription = subscription;
+ cb->mm = mm;
+ mmgrab(mm);
+
+ /* Schedule callback - returns immediately */
+ call_srcu(&srcu, &cb->rcu, mmu_notifier_oom_callback_fn);
+ }
+
+}
+
/*
* If no young bitflag is supported by the hardware, ->clear_flush_young can
* unmap the address and return 1 or 0 depending if the mapping previously
@@ -1096,3 +1206,16 @@ void mmu_notifier_synchronize(void)
synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+/**
+ * mmu_notifier_barrier - Wait for all pending MMU notifier callbacks
+ *
+ * Waits for all call_srcu() callbacks scheduled by mmu_notifier_oom_enter()
+ * to complete. Used by subsystems during cleanup to prevent use-after-free
+ * when destroying structures accessed by the callbacks.
+ */
+void mmu_notifier_barrier(void)
+{
+ srcu_barrier(&srcu);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_barrier);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5c6c95c169ee..029e041afc57 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -519,6 +519,9 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
bool ret = true;
MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
+ /* Notify MMU notifiers about the OOM event */
+ mmu_notifier_oom_enter(mm);
+
/*
* Tell all users of get_user/copy_from_user etc... that the content
* is no longer stable. No barriers really needed because unmapping
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1bc1da66b4b0..a2df83d3b413 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -885,6 +885,24 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
srcu_read_unlock(&kvm->srcu, idx);
}
+static void kvm_mmu_notifier_after_oom_unregister(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct kvm *kvm;
+
+ kvm = mmu_notifier_to_kvm(mn);
+
+ /*
+ * At this point the unregister has completed and all other callbacks
+ * have terminated. Clean up any unbalanced invalidation counts.
+ */
+ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+ if (kvm->mn_active_invalidate_count)
+ kvm->mn_active_invalidate_count = 0;
+ else
+ WARN_ON(kvm->mmu_invalidate_in_progress);
+}
+
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
@@ -892,6 +910,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.clear_young = kvm_mmu_notifier_clear_young,
.test_young = kvm_mmu_notifier_test_young,
.release = kvm_mmu_notifier_release,
+ .after_oom_unregister = kvm_mmu_notifier_after_oom_unregister,
};
static int kvm_init_mmu_notifier(struct kvm *kvm)
@@ -1280,7 +1299,13 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm->buses[i] = NULL;
}
kvm_coalesced_mmio_free(kvm);
- mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+ if (hlist_unhashed(&kvm->mmu_notifier.hlist)) {
+ /* Subscription removed by OOM. Wait for async callback. */
+ mmu_notifier_barrier();
+ mmdrop(kvm->mm);
+ } else {
+ mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+ }
/*
* At this point, pending calls to invalidate_range_start()
* have completed but no more MMU notifiers will run, so
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-05-03 3:27 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-30 4:48 [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
-- strict thread matches above, loose matches on Subject: below --
2026-04-01 15:40 [PATCH] KVM: x86/xen: Fix sleeping lock in hard IRQ context on PREEMPT_RT Sean Christopherson
2026-04-29 22:25 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
2026-05-03 3:26 ` kernel test robot
2026-05-03 3:26 ` kernel test robot
2026-03-30 11:24 [PATCH] KVM: mmu_notifier: make mn_invalidate_lock non-sleeping for non-blocking invalidations Paolo Bonzini
2026-04-30 14:17 ` [PATCH v2 1/1] mm/mmu_notifier: Add async OOM cleanup via call_srcu() shaikh.kamal
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox