All of lore.kernel.org
 help / color / mirror / Atom feed
From: Usama Arif <usama.arif@linux.dev>
To: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Cc: Usama Arif <usama.arif@linux.dev>,
	Andrew Morton <akpm@linux-foundation.org>,
	Peter Xu <peterx@redhat.com>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>, Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Vlastimil Babka <vbabka@kernel.org>,
	"Liam R . Howlett" <Liam.Howlett@oracle.com>,
	Zi Yan <ziy@nvidia.com>, Jonathan Corbet <corbet@lwn.net>,
	Shuah Khan <skhan@linuxfoundation.org>,
	Sean Christopherson <seanjc@google.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
	kvm@vger.kernel.org
Subject: Re: [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
Date: Wed, 15 Apr 2026 08:08:59 -0700	[thread overview]
Message-ID: <20260415150900.3660575-1-usama.arif@linux.dev> (raw)
In-Reply-To: <20260414142354.1465950-11-kas@kernel.org>

On Tue, 14 Apr 2026 15:23:44 +0100 "Kiryl Shutsemau (Meta)" <kas@kernel.org> wrote:

> Add UFFDIO_SET_MODE ioctl to toggle UFFD_FEATURE_MINOR_ASYNC at
> runtime. Takes mmap_write_lock for serialization against all in-flight
> faults. On sync-to-async transition, wake threads blocked in
> handle_userfault() so they retry and auto-resolve.
> 
> Since ctx->features can now be modified concurrently, add
> userfaultfd_features() helper that wraps READ_ONCE() and convert
> all ctx->features reads to use it.
> 
> Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
> Assisted-by: Claude:claude-opus-4-6
> ---
>  fs/userfaultfd.c                 | 95 ++++++++++++++++++++++++++++----
>  include/uapi/linux/userfaultfd.h | 13 +++++
>  2 files changed, 96 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 43064238fd8d..0edb33599491 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -79,24 +79,33 @@ struct userfaultfd_wake_range {
>  /* internal indication that UFFD_API ioctl was successfully executed */
>  #define UFFD_FEATURE_INITIALIZED		(1u << 31)
>  
> +/*
> + * Read ctx->features with READ_ONCE() since UFFDIO_SET_MODE can
> + * modify it concurrently.
> + */
> +static unsigned int userfaultfd_features(struct userfaultfd_ctx *ctx)
> +{
> +	return READ_ONCE(ctx->features);
> +}
> +
>  static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
>  {
> -	return ctx->features & UFFD_FEATURE_INITIALIZED;
> +	return userfaultfd_features(ctx) & UFFD_FEATURE_INITIALIZED;
>  }
>  
>  static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
>  {
> -	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
> +	return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_WP_ASYNC);
>  }
>  
>  static bool userfaultfd_minor_anon_ctx(struct userfaultfd_ctx *ctx)
>  {
> -	return ctx && (ctx->features & UFFD_FEATURE_MINOR_ANON);
> +	return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ANON);
>  }
>  
>  static bool userfaultfd_minor_async_ctx(struct userfaultfd_ctx *ctx)
>  {
> -	return ctx && (ctx->features & UFFD_FEATURE_MINOR_ASYNC);
> +	return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ASYNC);
>  }
>  
>  static unsigned int userfaultfd_ctx_flags(struct userfaultfd_ctx *ctx)
> @@ -122,7 +131,7 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
>  	if (!ctx)
>  		return false;
>  
> -	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
> +	return userfaultfd_features(ctx) & UFFD_FEATURE_WP_UNPOPULATED;
>  }
>  
>  static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
> @@ -435,7 +444,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
>  	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
>  	VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
>  
> -	if (ctx->features & UFFD_FEATURE_SIGBUS)
> +	if (userfaultfd_features(ctx) & UFFD_FEATURE_SIGBUS)
>  		goto out;
>  	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
>  		goto out;
> @@ -506,7 +515,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
>  	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
>  	uwq.wq.private = current;
>  	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
> -				reason, ctx->features);
> +				reason, userfaultfd_features(ctx));
>  	uwq.ctx = ctx;
>  	uwq.waken = false;
>  
> @@ -668,7 +677,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
>  	if (!octx)
>  		return 0;
>  
> -	if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> +	if (!(userfaultfd_features(octx) & UFFD_FEATURE_EVENT_FORK)) {
>  		userfaultfd_reset_ctx(vma);
>  		return 0;
>  	}
> @@ -774,7 +783,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
>  	if (!ctx)
>  		return;
>  
> -	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
> +	if (userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMAP) {
>  		vm_ctx->ctx = ctx;
>  		userfaultfd_ctx_get(ctx);
>  		down_write(&ctx->map_changing_lock);
> @@ -824,7 +833,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
>  	struct userfaultfd_wait_queue ewq;
>  
>  	ctx = vma->vm_userfaultfd_ctx.ctx;
> -	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
> +	if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMOVE))
>  		return true;
>  
>  	userfaultfd_ctx_get(ctx);
> @@ -863,7 +872,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
>  	struct userfaultfd_unmap_ctx *unmap_ctx;
>  	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
>  
> -	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
> +	if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_UNMAP) ||
>  	    has_unmap_ctx(ctx, unmaps, start, end))
>  		return 0;
>  
> @@ -1826,6 +1835,65 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
>  	return ret;
>  }
>  
> +/*
> + * Features that can be toggled at runtime via UFFDIO_SET_MODE.
> + * Only async features that were enabled at UFFDIO_API time may be toggled.
> + */
> +#define UFFD_FEATURE_TOGGLEABLE	(UFFD_FEATURE_MINOR_ASYNC)
> +
> +static int userfaultfd_set_mode(struct userfaultfd_ctx *ctx,
> +				  unsigned long arg)
> +{
> +	struct uffdio_set_mode mode;
> +	struct mm_struct *mm = ctx->mm;
> +
> +	if (copy_from_user(&mode, (void __user *)arg, sizeof(mode)))
> +		return -EFAULT;
> +
> +	/* enable and disable must not overlap */
> +	if (mode.enable & mode.disable)
> +		return -EINVAL;
> +
> +	/* only toggleable features are allowed */
> +	if ((mode.enable | mode.disable) & ~UFFD_FEATURE_TOGGLEABLE)
> +		return -EINVAL;

The comment above UFFD_FEATURE_TOGGLEABLE states "Only async features
that were enabled at UFFDIO_API time may be toggled."  However, the code
only checks that the requested bits are within UFFD_FEATURE_TOGGLEABLE;
it never compares them against the features enabled at UFFDIO_API time.

Is it intentional that a user who set up a uffd without
UFFD_FEATURE_MINOR_ASYNC in UFFDIO_API can still enable it later via
UFFDIO_SET_MODE?

> +
> +	if (!mmget_not_zero(mm))
> +		return -ESRCH;
> +
> +	/*
> +	 * mmap_write_lock serializes against all page faults.
> +	 * After we release, no in-flight faults from the old mode exist.
> +	 */
> +	{
> +		unsigned int new_features;
> +
> +		mmap_write_lock(mm);
> +		new_features = userfaultfd_features(ctx);
> +		new_features |= mode.enable;
> +		new_features &= ~mode.disable;
> +		WRITE_ONCE(ctx->features, new_features);
> +		mmap_write_unlock(mm);
> +	}
> +
> +	/*
> +	 * If switching to async, wake threads blocked in handle_userfault().
> +	 * They will retry the fault and auto-resolve under the new mode.
> +	 * len=0 means wake all pending faults on this context.
> +	 */
> +	if (mode.enable & UFFD_FEATURE_MINOR_ASYNC) {
> +		struct userfaultfd_wake_range range = { .len = 0 };
> +
> +		spin_lock_irq(&ctx->fault_pending_wqh.lock);
> +		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
> +				     &range);
> +		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
> +		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
> +	}
> +
> +	mmput(mm);
> +	return 0;
> +}
>  
>  static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  {
> @@ -2150,6 +2218,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
>  	case UFFDIO_DEACTIVATE:
>  		ret = userfaultfd_deactivate(ctx, arg);
>  		break;
> +	case UFFDIO_SET_MODE:
> +		ret = userfaultfd_set_mode(ctx, arg);
> +		break;
>  	}
>  	return ret;
>  }
> @@ -2177,7 +2248,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
>  	 *	protocols: aa:... bb:...
>  	 */
>  	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
> -		   pending, total, UFFD_API, ctx->features,
> +		   pending, total, UFFD_API, userfaultfd_features(ctx),
>  		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
>  }
>  #endif
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 775825da2596..f0f14f9db06c 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -84,6 +84,7 @@
>  #define _UFFDIO_CONTINUE		(0x07)
>  #define _UFFDIO_POISON			(0x08)
>  #define _UFFDIO_DEACTIVATE		(0x09)
> +#define _UFFDIO_SET_MODE		(0x0A)
>  #define _UFFDIO_API			(0x3F)
>  
>  /* userfaultfd ioctl ids */
> @@ -110,6 +111,8 @@
>  				      struct uffdio_poison)
>  #define UFFDIO_DEACTIVATE	_IOR(UFFDIO, _UFFDIO_DEACTIVATE,	\
>  				     struct uffdio_range)
> +#define UFFDIO_SET_MODE		_IOW(UFFDIO, _UFFDIO_SET_MODE,	\
> +				     struct uffdio_set_mode)
>  
>  /* read() structure */
>  struct uffd_msg {
> @@ -395,6 +398,16 @@ struct uffdio_move {
>  	__s64 move;
>  };
>  
> +struct uffdio_set_mode {
> +	/*
> +	 * Toggle async mode for features at runtime.
> +	 * Supported: UFFD_FEATURE_MINOR_ASYNC.
> +	 * Setting a bit in both enable and disable is invalid.
> +	 */
> +	__u64 enable;
> +	__u64 disable;
> +};
> +
>  /*
>   * Flags for the userfaultfd(2) system call itself.
>   */
> -- 
> 2.51.2
> 
> 

  reply	other threads:[~2026-04-15 15:09 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-14 14:23 [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 08/12] userfaultfd: enable UFFD_FEATURE_MINOR_ANON Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 09/12] mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle Kiryl Shutsemau (Meta)
2026-04-15 15:08   ` Usama Arif [this message]
2026-04-16 13:27     ` Kiryl Shutsemau
2026-04-14 14:23 ` [RFC, PATCH 11/12] selftests/mm: add userfaultfd anonymous minor fault tests Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 12/12] Documentation/userfaultfd: document working set tracking Kiryl Shutsemau (Meta)
2026-04-14 15:28 ` [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Peter Xu
2026-04-14 17:08   ` Kiryl Shutsemau
2026-04-14 17:45     ` Peter Xu
2026-04-14 15:37 ` David Hildenbrand (Arm)
2026-04-14 17:10   ` Kiryl Shutsemau
2026-04-16 13:49     ` Kiryl Shutsemau
2026-04-16 18:32       ` David Hildenbrand (Arm)
2026-04-16 20:25         ` Kiryl Shutsemau
2026-04-17 11:02           ` Kiryl Shutsemau
2026-04-17 11:43           ` David Hildenbrand (Arm)
2026-04-17 12:26             ` Kiryl Shutsemau
2026-04-19 14:33               ` Kiryl Shutsemau
2026-04-21 13:03                 ` David Hildenbrand (Arm)
2026-04-21 14:33                   ` Kiryl Shutsemau
2026-04-22  9:27                     ` Kiryl Shutsemau
2026-04-22 18:27                       ` David Hildenbrand (Arm)
2026-04-22 18:39                     ` David Hildenbrand (Arm)
2026-04-23 14:27                       ` Kiryl Shutsemau
2026-04-23 14:50                         ` Peter Xu
2026-04-23 18:08                           ` Kiryl Shutsemau
2026-04-23 18:57                             ` Peter Xu
2026-04-23 19:25                               ` David Hildenbrand (Arm)
2026-04-23 20:10                                 ` Peter Xu
2026-04-24 11:37                                   ` Kiryl Shutsemau
2026-04-24 12:59                                     ` Peter Xu
2026-04-25  5:56                                   ` David Hildenbrand (Arm)
2026-04-24  0:26                               ` SeongJae Park
2026-04-24 11:55                                 ` Peter Xu
2026-04-24 23:59                                   ` SeongJae Park
2026-04-24 10:34                               ` Kiryl Shutsemau
2026-04-24 11:51                                 ` Peter Xu
2026-04-24 13:49                                   ` Kiryl Shutsemau
2026-04-24 15:55                                     ` Peter Xu
2026-04-24 16:09                                       ` Peter Xu
2026-04-27 10:52                                       ` Kiryl Shutsemau
2026-04-25  6:05                                     ` David Hildenbrand (Arm)
2026-04-27 10:23                                       ` Kiryl Shutsemau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260415150900.3660575-1-usama.arif@linux.dev \
    --to=usama.arif@linux.dev \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=corbet@lwn.net \
    --cc=david@kernel.org \
    --cc=kas@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=rppt@kernel.org \
    --cc=seanjc@google.com \
    --cc=skhan@linuxfoundation.org \
    --cc=surenb@google.com \
    --cc=vbabka@kernel.org \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.