From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Xu <peterx@redhat.com>,
David Hildenbrand <david@kernel.org>,
Lorenzo Stoakes <ljs@kernel.org>, Mike Rapoport <rppt@kernel.org>,
Suren Baghdasaryan <surenb@google.com>,
Vlastimil Babka <vbabka@kernel.org>,
"Liam R . Howlett" <Liam.Howlett@oracle.com>,
Zi Yan <ziy@nvidia.com>, Jonathan Corbet <corbet@lwn.net>,
Shuah Khan <skhan@linuxfoundation.org>,
Sean Christopherson <seanjc@google.com>,
Paolo Bonzini <pbonzini@redhat.com>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
kvm@vger.kernel.org, "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Subject: [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
Date: Tue, 14 Apr 2026 15:23:44 +0100 [thread overview]
Message-ID: <20260414142354.1465950-11-kas@kernel.org> (raw)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
Add UFFDIO_SET_MODE ioctl to toggle UFFD_FEATURE_MINOR_ASYNC at
runtime. Takes mmap_write_lock for serialization against all in-flight
faults. On sync-to-async transition, wake threads blocked in
handle_userfault() so they retry and auto-resolve.
Since ctx->features can now be modified concurrently, add
userfaultfd_features() helper that wraps READ_ONCE() and convert
all ctx->features reads to use it.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
fs/userfaultfd.c | 95 ++++++++++++++++++++++++++++----
include/uapi/linux/userfaultfd.h | 13 +++++
2 files changed, 96 insertions(+), 12 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 43064238fd8d..0edb33599491 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -79,24 +79,33 @@ struct userfaultfd_wake_range {
/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED (1u << 31)
+/*
+ * Read ctx->features with READ_ONCE() since UFFDIO_SET_MODE can
+ * modify it concurrently.
+ */
+static unsigned int userfaultfd_features(struct userfaultfd_ctx *ctx)
+{
+ return READ_ONCE(ctx->features);
+}
+
static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
- return ctx->features & UFFD_FEATURE_INITIALIZED;
+ return userfaultfd_features(ctx) & UFFD_FEATURE_INITIALIZED;
}
static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
{
- return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+ return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_WP_ASYNC);
}
static bool userfaultfd_minor_anon_ctx(struct userfaultfd_ctx *ctx)
{
- return ctx && (ctx->features & UFFD_FEATURE_MINOR_ANON);
+ return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ANON);
}
static bool userfaultfd_minor_async_ctx(struct userfaultfd_ctx *ctx)
{
- return ctx && (ctx->features & UFFD_FEATURE_MINOR_ASYNC);
+ return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ASYNC);
}
static unsigned int userfaultfd_ctx_flags(struct userfaultfd_ctx *ctx)
@@ -122,7 +131,7 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
if (!ctx)
return false;
- return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+ return userfaultfd_features(ctx) & UFFD_FEATURE_WP_UNPOPULATED;
}
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
@@ -435,7 +444,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
/* 0 or > 1 flags set is a bug; we expect exactly 1. */
VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
- if (ctx->features & UFFD_FEATURE_SIGBUS)
+ if (userfaultfd_features(ctx) & UFFD_FEATURE_SIGBUS)
goto out;
if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
goto out;
@@ -506,7 +515,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
- reason, ctx->features);
+ reason, userfaultfd_features(ctx));
uwq.ctx = ctx;
uwq.waken = false;
@@ -668,7 +677,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
if (!octx)
return 0;
- if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ if (!(userfaultfd_features(octx) & UFFD_FEATURE_EVENT_FORK)) {
userfaultfd_reset_ctx(vma);
return 0;
}
@@ -774,7 +783,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (!ctx)
return;
- if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
+ if (userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
@@ -824,7 +833,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
struct userfaultfd_wait_queue ewq;
ctx = vma->vm_userfaultfd_ctx.ctx;
- if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
+ if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMOVE))
return true;
userfaultfd_ctx_get(ctx);
@@ -863,7 +872,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
struct userfaultfd_unmap_ctx *unmap_ctx;
struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
- if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
+ if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_UNMAP) ||
has_unmap_ctx(ctx, unmaps, start, end))
return 0;
@@ -1826,6 +1835,65 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
return ret;
}
+/*
+ * Features that can be toggled at runtime via UFFDIO_SET_MODE.
+ * Only async features that were enabled at UFFDIO_API time may be toggled.
+ */
+#define UFFD_FEATURE_TOGGLEABLE (UFFD_FEATURE_MINOR_ASYNC)
+
+static int userfaultfd_set_mode(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct uffdio_set_mode mode;
+ struct mm_struct *mm = ctx->mm;
+
+ if (copy_from_user(&mode, (void __user *)arg, sizeof(mode)))
+ return -EFAULT;
+
+ /* enable and disable must not overlap */
+ if (mode.enable & mode.disable)
+ return -EINVAL;
+
+ /* only toggleable features are allowed */
+ if ((mode.enable | mode.disable) & ~UFFD_FEATURE_TOGGLEABLE)
+ return -EINVAL;
+
+ if (!mmget_not_zero(mm))
+ return -ESRCH;
+
+ /*
+ * mmap_write_lock serializes against all page faults.
+ * After we release, no in-flight faults from the old mode exist.
+ */
+ {
+ unsigned int new_features;
+
+ mmap_write_lock(mm);
+ new_features = userfaultfd_features(ctx);
+ new_features |= mode.enable;
+ new_features &= ~mode.disable;
+ WRITE_ONCE(ctx->features, new_features);
+ mmap_write_unlock(mm);
+ }
+
+ /*
+ * If switching to async, wake threads blocked in handle_userfault().
+ * They will retry the fault and auto-resolve under the new mode.
+ * len=0 means wake all pending faults on this context.
+ */
+ if (mode.enable & UFFD_FEATURE_MINOR_ASYNC) {
+ struct userfaultfd_wake_range range = { .len = 0 };
+
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
+ &range);
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+ }
+
+ mmput(mm);
+ return 0;
+}
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
@@ -2150,6 +2218,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_DEACTIVATE:
ret = userfaultfd_deactivate(ctx, arg);
break;
+ case UFFDIO_SET_MODE:
+ ret = userfaultfd_set_mode(ctx, arg);
+ break;
}
return ret;
}
@@ -2177,7 +2248,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
* protocols: aa:... bb:...
*/
seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
- pending, total, UFFD_API, ctx->features,
+ pending, total, UFFD_API, userfaultfd_features(ctx),
UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 775825da2596..f0f14f9db06c 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -84,6 +84,7 @@
#define _UFFDIO_CONTINUE (0x07)
#define _UFFDIO_POISON (0x08)
#define _UFFDIO_DEACTIVATE (0x09)
+#define _UFFDIO_SET_MODE (0x0A)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -110,6 +111,8 @@
struct uffdio_poison)
#define UFFDIO_DEACTIVATE _IOR(UFFDIO, _UFFDIO_DEACTIVATE, \
struct uffdio_range)
+#define UFFDIO_SET_MODE _IOW(UFFDIO, _UFFDIO_SET_MODE, \
+ struct uffdio_set_mode)
/* read() structure */
struct uffd_msg {
@@ -395,6 +398,16 @@ struct uffdio_move {
__s64 move;
};
+struct uffdio_set_mode {
+ /*
+ * Toggle async mode for features at runtime.
+ * Supported: UFFD_FEATURE_MINOR_ASYNC.
+ * Setting a bit in both enable and disable is invalid.
+ */
+ __u64 enable;
+ __u64 disable;
+};
+
/*
* Flags for the userfaultfd(2) system call itself.
*/
--
2.51.2
next prev parent reply other threads:[~2026-04-14 14:24 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-14 14:23 [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 08/12] userfaultfd: enable UFFD_FEATURE_MINOR_ANON Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 09/12] mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` Kiryl Shutsemau (Meta) [this message]
2026-04-15 15:08 ` [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle Usama Arif
2026-04-16 13:27 ` Kiryl Shutsemau
2026-04-14 14:23 ` [RFC, PATCH 11/12] selftests/mm: add userfaultfd anonymous minor fault tests Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 12/12] Documentation/userfaultfd: document working set tracking Kiryl Shutsemau (Meta)
2026-04-14 15:28 ` [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Peter Xu
2026-04-14 17:08 ` Kiryl Shutsemau
2026-04-14 17:45 ` Peter Xu
2026-04-14 15:37 ` David Hildenbrand (Arm)
2026-04-14 17:10 ` Kiryl Shutsemau
2026-04-16 13:49 ` Kiryl Shutsemau
2026-04-16 18:32 ` David Hildenbrand (Arm)
2026-04-16 20:25 ` Kiryl Shutsemau
2026-04-17 11:02 ` Kiryl Shutsemau
2026-04-17 11:43 ` David Hildenbrand (Arm)
2026-04-17 12:26 ` Kiryl Shutsemau
2026-04-19 14:33 ` Kiryl Shutsemau
2026-04-21 13:03 ` David Hildenbrand (Arm)
2026-04-21 14:33 ` Kiryl Shutsemau
2026-04-22 9:27 ` Kiryl Shutsemau
2026-04-22 18:27 ` David Hildenbrand (Arm)
2026-04-22 18:39 ` David Hildenbrand (Arm)
2026-04-23 14:27 ` Kiryl Shutsemau
2026-04-23 14:50 ` Peter Xu
2026-04-23 18:08 ` Kiryl Shutsemau
2026-04-23 18:57 ` Peter Xu
2026-04-23 19:25 ` David Hildenbrand (Arm)
2026-04-23 20:10 ` Peter Xu
2026-04-24 11:37 ` Kiryl Shutsemau
2026-04-24 12:59 ` Peter Xu
2026-04-25 5:56 ` David Hildenbrand (Arm)
2026-04-24 0:26 ` SeongJae Park
2026-04-24 11:55 ` Peter Xu
2026-04-24 23:59 ` SeongJae Park
2026-04-24 10:34 ` Kiryl Shutsemau
2026-04-24 11:51 ` Peter Xu
2026-04-24 13:49 ` Kiryl Shutsemau
2026-04-24 15:55 ` Peter Xu
2026-04-24 16:09 ` Peter Xu
2026-04-27 10:52 ` Kiryl Shutsemau
2026-04-25 6:05 ` David Hildenbrand (Arm)
2026-04-27 10:23 ` Kiryl Shutsemau
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260414142354.1465950-11-kas@kernel.org \
--to=kas@kernel.org \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=corbet@lwn.net \
--cc=david@kernel.org \
--cc=kvm@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=rppt@kernel.org \
--cc=seanjc@google.com \
--cc=skhan@linuxfoundation.org \
--cc=surenb@google.com \
--cc=vbabka@kernel.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.