Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* Re: [PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB
From: mkf @ 2026-04-14 15:37 UTC (permalink / raw)
  To: Jiayuan Chen, bpf
  Cc: Quan Sun, Yinhao Hu, Kaiyan Mei, Dongliang Mu, Eric Dumazet,
	Neal Cardwell, Kuniyuki Iwashima, David S. Miller, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	David Ahern, netdev, linux-doc, linux-kernel
In-Reply-To: <20260414105702.248310-1-jiayuan.chen@linux.dev>

On Tue, 2026-04-14 at 18:57 +0800, Jiayuan Chen wrote:
> A BPF_PROG_TYPE_SOCK_OPS program can set BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG
> to inject custom TCP header options. When the kernel builds a TCP packet,
> it calls tcp_established_options() to calculate the header size, which
> invokes bpf_skops_hdr_opt_len() to trigger the BPF_SOCK_OPS_HDR_OPT_LEN_CB
> callback.
> 
> If the BPF program calls bpf_setsockopt(TCP_NODELAY) inside this callback,
> __tcp_sock_set_nodelay() will call tcp_push_pending_frames(), which calls
> tcp_current_mss(), which calls tcp_established_options() again,
> re-triggering the same BPF callback. This creates an infinite recursion
> that exhausts the kernel stack and causes a panic.
> 
> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>   -> bpf_setsockopt(TCP_NODELAY)
> 	-> tcp_push_pending_frames()
> 	  -> tcp_current_mss()
> 		-> tcp_established_options()
> 		  -> bpf_skops_hdr_opt_len()
>                            /* infinite recursion */
> 			-> BPF_SOCK_OPS_HDR_OPT_LEN_CB
> 
> A similar reentrancy issue exists for TCP congestion control, which is
> guarded by tp->bpf_chg_cc_inprogress. Adopt the same approach: introduce
> tp->bpf_hdr_opt_len_cb_inprogress, set it before invoking the callback in
> bpf_skops_hdr_opt_len(), and check it in sol_tcp_sockopt() to reject
> bpf_setsockopt(TCP_NODELAY) calls that would trigger
> tcp_push_pending_frames() and cause the recursion.
> 
> Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn>
> Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
> Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
> Reported-by: Dongliang Mu <dzm91@hust.edu.cn>
> Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/
> Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option")
> Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
> ---
>  Documentation/networking/net_cachelines/tcp_sock.rst |  1 +
>  include/linux/tcp.h                                  | 11 ++++++++++-
>  net/core/filter.c                                    |  4 ++++
>  net/ipv4/tcp_minisocks.c                             |  1 +
>  net/ipv4/tcp_output.c                                |  3 +++
>  5 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst
> b/Documentation/networking/net_cachelines/tcp_sock.rst
> index 563daea10d6c..07d3226d90cc 100644
> --- a/Documentation/networking/net_cachelines/tcp_sock.rst
> +++ b/Documentation/networking/net_cachelines/tcp_sock.rst
> @@ -152,6 +152,7 @@ unsigned_int                  keepalive_intvl
>  int                           linger2
>  u8                            bpf_sock_ops_cb_flags
>  u8:1                          bpf_chg_cc_inprogress
> +u8:1                          bpf_hdr_opt_len_cb_inprogress
>  u16                           timeout_rehash
>  u32                           rcv_ooopack
>  u32                           rcv_rtt_last_tsecr
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index f72eef31fa23..2bfb73cf922e 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -475,12 +475,21 @@ struct tcp_sock {
>  	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
>  					 * values defined in uapi/linux/tcp.h
>  					 */
> -	u8	bpf_chg_cc_inprogress:1; /* In the middle of
> +	u8	bpf_chg_cc_inprogress:1, /* In the middle of
>  					  * bpf_setsockopt(TCP_CONGESTION),
>  					  * it is to avoid the bpf_tcp_cc->init()
>  					  * to recur itself by calling
>  					  * bpf_setsockopt(TCP_CONGESTION, "itself").
>  					  */
> +		bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
> +						  * callback so that a nested
> +						  * bpf_setsockopt(TCP_NODELAY) or
> +						  * bpf_setsockopt(TCP_CORK) cannot
> +						  * trigger tcp_push_pending_frames(),
> +						  * which would call tcp_current_mss()
> +						  * -> bpf_skops_hdr_opt_len(), causing
> +						  * infinite recursion.
> +						  */
>  #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
>  #else
>  #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 78b548158fb0..518699429a7a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5483,6 +5483,10 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
>  	if (sk->sk_protocol != IPPROTO_TCP)
>  		return -EINVAL;
>  
> +	if ((optname == TCP_NODELAY || optname == TCP_CORK) &&
> +	    tcp_sk(sk)->bpf_hdr_opt_len_cb_inprogress)
> +		return -EBUSY;
> +
TCP_CORK is not supported in sol_tcp_sockopt(); it returns -EINVAL by default. Also, putting
the check here would prevent us from calling getsockopt(TCP_NODELAY) below.

>  	switch (optname) {
>  	case TCP_NODELAY:
>  	case TCP_MAXSEG:
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index dafb63b923d0..fb06c464ac16 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -663,6 +663,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
>  	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
>  
>  	newtp->bpf_chg_cc_inprogress = 0;
> +	newtp->bpf_hdr_opt_len_cb_inprogress = 0;
>  	tcp_bpf_clone(sk, newsk);
>  
>  	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 326b58ff1118..c9654e690e1a 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -475,6 +475,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
>  				  unsigned int *remaining)
>  {
>  	struct bpf_sock_ops_kern sock_ops;
> +	struct tcp_sock *tp = tcp_sk(sk);
>  	int err;
>  
>  	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
> @@ -519,7 +520,9 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
>  	if (skb)
>  		bpf_skops_init_skb(&sock_ops, skb, 0);
>  
> +	tp->bpf_hdr_opt_len_cb_inprogress = 1;
We check BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG before calling BPF_CGROUP_RUN_PROG_SOCK_OPS_SK.
Could this flag be used for the same purpose, so that we don't need to add an extra field?

	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
					   BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
	    !*remaining)
		return;
>  	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
> +	tp->bpf_hdr_opt_len_cb_inprogress = 0;
>  
>  	if (err || sock_ops.remaining_opt_len == *remaining)
>  		return;

-- 
Thanks,
KaFai


^ permalink raw reply

* Re: [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory
From: David Hildenbrand (Arm) @ 2026-04-14 15:37 UTC (permalink / raw)
  To: Kiryl Shutsemau (Meta), Andrew Morton
  Cc: Peter Xu, Lorenzo Stoakes, Mike Rapoport, Suren Baghdasaryan,
	Vlastimil Babka, Liam R . Howlett, Zi Yan, Jonathan Corbet,
	Shuah Khan, Sean Christopherson, Paolo Bonzini, linux-mm,
	linux-kernel, linux-doc, linux-kselftest, kvm
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

On 4/14/26 16:23, Kiryl Shutsemau (Meta) wrote:
> This series adds userfaultfd support for tracking the working set of
> VM guest memory, enabling VMMs to identify cold pages and evict them
> to tiered or remote storage.
> 
> == Problem ==
> 
> VMMs managing guest memory need to:
> 1. Track which pages are actively used (working set detection)
> 2. Safely evict cold pages to slower storage
> 3. Fetch pages back on demand when accessed again
> 
> For shmem-backed guest memory, working set tracking partially works
> today: MADV_DONTNEED zaps PTEs while pages stay in page cache, and
> re-access auto-resolves from cache. But safe eviction still requires
> synchronous fault interception to prevent data loss races.
> 
> For anonymous guest memory (needed for KSM cross-VM deduplication),
> there is no mechanism at all — clearing a PTE loses the page.
> 
> == Solution ==
> 
> The series introduces a unified userfaultfd interface that works
> across both anonymous and shmem-backed memory:
> 
> UFFD_FEATURE_MINOR_ANON: extends MODE_MINOR registration to anonymous
> private memory. Uses the PROT_NONE hinting mechanism (same as NUMA
> balancing) to make pages inaccessible without freeing them.

I would rather tackle this from the other direction: it's another form
of protection (like WP), not really a "minor" mode.

Could we add a UFFDIO_REGISTER_MODE_RWP (or however we would call it)
and support it for anon+shmem, avoiding the zapping for shmem completely?

-- 
Cheers,

David

^ permalink raw reply

* Re: [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory
From: Peter Xu @ 2026-04-14 15:28 UTC (permalink / raw)
  To: Kiryl Shutsemau (Meta)
  Cc: Andrew Morton, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	James Houghton, Andrea Arcangeli
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Hi, Kiryl,

On Tue, Apr 14, 2026 at 03:23:34PM +0100, Kiryl Shutsemau (Meta) wrote:
> This series adds userfaultfd support for tracking the working set of
> VM guest memory, enabling VMMs to identify cold pages and evict them
> to tiered or remote storage.

Thanks for sharing this work, it looks very interesting to me.

Personally I am also looking at some kind of VMM memtiering issues.  I'm
not sure if you saw my lsfmm proposal, it mentioned the challenge we're
facing, it's slightly different but still a bit relevant:

https://lore.kernel.org/all/aYuad2k75iD9bnBE@x1.local/

Unfortunately, that proposal was rejected upstream.

For us, it's so far more about migration and how the migration process can
introduce zero impact on guest workloads, especially on hotness.  I'm not
sure if we have any shared goals in that respect.

> 
> == Problem ==
> 
> VMMs managing guest memory need to:
> 1. Track which pages are actively used (working set detection)
> 2. Safely evict cold pages to slower storage
> 3. Fetch pages back on demand when accessed again
> 
> For shmem-backed guest memory, working set tracking partially works
> today: MADV_DONTNEED zaps PTEs while pages stay in page cache, and
> re-access auto-resolves from cache. But safe eviction still requires
> synchronous fault interception to prevent data loss races.
> 
> For anonymous guest memory (needed for KSM cross-VM deduplication),
> there is no mechanism at all — clearing a PTE loses the page.
> 
> == Solution ==
> 
> The series introduces a unified userfaultfd interface that works
> across both anonymous and shmem-backed memory:
> 
> UFFD_FEATURE_MINOR_ANON: extends MODE_MINOR registration to anonymous
> private memory. Uses the PROT_NONE hinting mechanism (same as NUMA
> balancing) to make pages inaccessible without freeing them.
> 
> UFFD_FEATURE_MINOR_ASYNC: auto-resolves minor faults without handler
> involvement. The kernel restores PTE permissions immediately and the
> faulting thread continues. Works for anonymous, shmem, and hugetlbfs.
> 
> UFFDIO_DEACTIVATE: marks pages as deactivated. For anonymous memory,
> sets PROT_NONE on PTEs (pages stay resident). For shmem/hugetlbfs,
> zaps PTEs (pages stay in page cache).
> 
> UFFDIO_SET_MODE: toggles MINOR_ASYNC at runtime, synchronized via
> mmap_write_lock. Enables the VMM workflow: async mode for lightweight
> detection, sync mode for race-free eviction.
> 
> PAGE_IS_UFFD_DEACTIVATED: PAGEMAP_SCAN category flag for efficient
> batch detection of cold (still-deactivated) anonymous pages.
> 
> == VMM Workflow ==

AFAIU, this workflow provides two functionalities:

> 
>     UFFDIO_DEACTIVATE(all)            -- async, no vCPU stalls
>     sleep(interval)
>     PAGEMAP_SCAN                      -- find cold pages

Until here it's only about page hotness tracking.  I am curious whether you
evaluated idle page tracking.  Is it because of perf overheads on rmap?  To
me, your solution (until here.. on the hotness sampling) reads more like a
more efficient way to do idle page tracking but only per-mm, not per-folio.

That is also something I would like to benefit from if QEMU decides to
do full userspace swap.  I think that's our last resort; I'll likely start
with something that makes QEMU work together with Linux on swapping
(e.g. we're happy to make MGLRU or any reclaim logic that Linux mm
currently uses, as long as efficient) then QEMU only cares about the rest,
which is what the migration problem is about.

The other issue about idle page tracking to us is, I believe MGLRU
currently doesn't work well with it (due to ignoring IDLE bits) where the
old LRU algo works.  I'm not sure how much you evaluated above, so it'll be
great to share from that perspective too.  I also mentioned some of these
challenges in the lsfmm proposal link above.

>     UFFDIO_SET_MODE(sync)             -- block faults for eviction
>     pwrite + MADV_DONTNEED cold pages -- safe, faults block
>     UFFDIO_SET_MODE(async)            -- resume tracking

These operations are the 2nd function.  It's, IMHO, a full userspace swap
system based on userfaultfd.

Have you thought about directly relying on userfaultfd-wp to do this work?
The relevant question is, why do we need to block guest reads on pages
being evicted by the userapp?  Can we still allow that to happen, which
seems to be more efficient?  IIUC, only writes / updates matter in such
a swap system.

Also, I'm not sure if you're aware of LLNL's umap library:

https://github.com/llnl/umap

That implemented the swap system using userfaultfd write-protect mode only,
so no new kernel API is needed.

Thanks,

> 
> The same workflow applies to shmem, with a different PAGEMAP_SCAN mask
> (!PAGE_IS_PRESENT instead of PAGE_IS_UFFD_DEACTIVATED).
> 
> == NUMA Balancing ==
> 
> NUMA balancing scanning is skipped on anonymous VM_UFFD_MINOR VMAs to
> avoid protnone conflicts. NUMA locality stats are fed from the uffd
> fault path via task_numa_fault() so the scheduler retains placement
> data. Shmem VMAs are unaffected (UFFDIO_DEACTIVATE zaps PTEs there,
> no protnone involved).
> 
> == Testing ==
> 
> The series includes 6 new selftests covering async/sync modes,
> PAGEMAP_SCAN cold detection, GUP through protnone, UFFDIO_SET_MODE
> toggling, and cleanup on close. All 73 uffd unit tests pass
> (including hugetlb) across defconfig, allnoconfig, allmodconfig,
> and randomized configs.
> 
> Kiryl Shutsemau (Meta) (12):
>   userfaultfd: define UAPI constants for anonymous minor faults
>   userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support
>   userfaultfd: implement UFFDIO_DEACTIVATE ioctl
>   userfaultfd: UFFDIO_CONTINUE for anonymous memory
>   mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs
>   userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async
>     mode
>   sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs
>   userfaultfd: enable UFFD_FEATURE_MINOR_ANON
>   mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN
>   userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
>   selftests/mm: add userfaultfd anonymous minor fault tests
>   Documentation/userfaultfd: document working set tracking
> 
>  Documentation/admin-guide/mm/userfaultfd.rst | 141 ++++-
>  fs/proc/task_mmu.c                           |  11 +-
>  fs/userfaultfd.c                             | 184 +++++-
>  include/linux/huge_mm.h                      |   6 +
>  include/linux/mm.h                           |   2 +
>  include/linux/sched/numa_balancing.h         |   1 +
>  include/linux/userfaultfd_k.h                |  21 +-
>  include/trace/events/sched.h                 |   3 +-
>  include/uapi/linux/fs.h                      |   1 +
>  include/uapi/linux/userfaultfd.h             |  40 +-
>  kernel/sched/fair.c                          |  13 +
>  mm/huge_memory.c                             |  33 +-
>  mm/hugetlb.c                                 |   3 +-
>  mm/memory.c                                  |  51 +-
>  mm/mprotect.c                                |   9 +-
>  mm/shmem.c                                   |   3 +-
>  mm/userfaultfd.c                             | 164 +++++-
>  tools/testing/selftests/mm/uffd-unit-tests.c | 458 +++++++++++++++
>  18 files changed, 1096 insertions(+), 48 deletions(-)
> 
> Kiryl Shutsemau (Meta) (12):
>   userfaultfd: define UAPI constants for anonymous minor faults
>   userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support
>   userfaultfd: implement UFFDIO_DEACTIVATE ioctl
>   userfaultfd: UFFDIO_CONTINUE for anonymous memory
>   mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs
>   userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async
>     mode
>   sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs
>   userfaultfd: enable UFFD_FEATURE_MINOR_ANON
>   mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN
>   userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
>   selftests/mm: add userfaultfd anonymous minor fault tests
>   Documentation/userfaultfd: document working set tracking
> 
>  Documentation/admin-guide/mm/userfaultfd.rst | 141 +++++-
>  fs/proc/task_mmu.c                           |  11 +-
>  fs/userfaultfd.c                             | 184 +++++++-
>  include/linux/huge_mm.h                      |   6 +
>  include/linux/mm.h                           |   2 +
>  include/linux/sched/numa_balancing.h         |   1 +
>  include/linux/userfaultfd_k.h                |  21 +-
>  include/trace/events/sched.h                 |   3 +-
>  include/uapi/linux/fs.h                      |   1 +
>  include/uapi/linux/userfaultfd.h             |  40 +-
>  kernel/sched/fair.c                          |  13 +
>  mm/huge_memory.c                             |  33 +-
>  mm/hugetlb.c                                 |   3 +-
>  mm/memory.c                                  |  51 ++-
>  mm/mprotect.c                                |   9 +-
>  mm/shmem.c                                   |   3 +-
>  mm/userfaultfd.c                             | 164 ++++++-
>  tools/testing/selftests/mm/uffd-unit-tests.c | 458 +++++++++++++++++++
>  18 files changed, 1096 insertions(+), 48 deletions(-)
> 
> -- 
> 2.51.2
> 
> 

-- 
Peter Xu


^ permalink raw reply

* Re: [PATCH V10 00/10] famfs: port into fuse
From: John Groves @ 2026-04-14 15:23 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Joanne Koong, Bernd Schubert, John Groves, Dan Williams,
	Bernd Schubert, Alison Schofield, John Groves, Jonathan Corbet,
	Shuah Khan, Vishal Verma, Dave Jiang, Matthew Wilcox, Jan Kara,
	Alexander Viro, David Hildenbrand, Christian Brauner,
	Darrick J . Wong, Randy Dunlap, Jeff Layton, Amir Goldstein,
	Jonathan Cameron, Stefan Hajnoczi, Josef Bacik, Bagas Sanjaya,
	Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
	Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
	Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org, djbw
In-Reply-To: <CAJfpegsCoMMg-Ux3CbBh0d1uqDNg3Fu_8YE-LubwrQ6A-2Cggw@mail.gmail.com>

On 26/04/14 04:18PM, Miklos Szeredi wrote:
> On Tue, 14 Apr 2026 at 15:41, John Groves <John@groves.net> wrote:
> 
> > My short response: Noooooooooo!!!!!!
> 
> :) Seems like this is a highly emotional topic...  I suggest that we
> go ahead with bpf experiments, then discuss results and path forward
> at LSM.
> 
> Thanks,
> Miklos

I think we need to try to emergency-add a session at LSFMM on this, with
fs/mm/bpf people. Any ideas on how to do this?

John


^ permalink raw reply

* Re: [PATCH v7 5/6] iio: adc: ad4691: add oversampling support
From: David Lechner @ 2026-04-14 15:02 UTC (permalink / raw)
  To: Sabau, Radu bogdan, Jonathan Cameron
  Cc: Lars-Peter Clausen, Hennerich, Michael, Sa, Nuno, Andy Shevchenko,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Uwe Kleine-König, Liam Girdwood, Mark Brown, Linus Walleij,
	Bartosz Golaszewski, Philipp Zabel, Jonathan Corbet, Shuah Khan,
	linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
	linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <LV9PR03MB8414E0A68C5676302909E220F7252@LV9PR03MB8414.namprd03.prod.outlook.com>

On 4/14/26 9:25 AM, Sabau, Radu bogdan wrote:
> 
> 
>> -----Original Message-----
>> From: Jonathan Cameron <jic23@kernel.org>
>> Sent: Sunday, April 12, 2026 8:58 PM
>> To: David Lechner <dlechner@baylibre.com>
>> Cc: Sabau, Radu bogdan <Radu.Sabau@analog.com>; Lars-Peter Clausen
>> <lars@metafoo.de>; Hennerich, Michael <Michael.Hennerich@analog.com>;
>> Sa, Nuno <Nuno.Sa@analog.com>; Andy Shevchenko <andy@kernel.org>;
>> Rob Herring <robh@kernel.org>; Krzysztof Kozlowski <krzk+dt@kernel.org>;
>> Conor Dooley <conor+dt@kernel.org>; Uwe Kleine-König
>> <ukleinek@kernel.org>; Liam Girdwood <lgirdwood@gmail.com>; Mark Brown
>> <broonie@kernel.org>; Linus Walleij <linusw@kernel.org>; Bartosz
>> Golaszewski <brgl@kernel.org>; Philipp Zabel <p.zabel@pengutronix.de>;
>> Jonathan Corbet <corbet@lwn.net>; Shuah Khan
>> <skhan@linuxfoundation.org>; linux-iio@vger.kernel.org;
>> devicetree@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
>> pwm@vger.kernel.org; linux-gpio@vger.kernel.org; linux-doc@vger.kernel.org
>> Subject: Re: [PATCH v7 5/6] iio: adc: ad4691: add oversampling support
>>
>> [External]
>>
>> On Fri, 10 Apr 2026 16:15:20 -0500
>> David Lechner <dlechner@baylibre.com> wrote:
>>
>>> On 4/9/26 10:28 AM, Radu Sabau via B4 Relay wrote:
>>>> From: Radu Sabau <radu.sabau@analog.com>
>>>>
>>>> Add per-channel oversampling ratio (OSR) support for CNV burst mode.
>>>> The accumulator depth register (ACC_DEPTH_IN) is programmed with the
>>>> selected OSR at buffer enable time and before each single-shot read.
>>>>
>>>> Supported OSR values: 1, 2, 4, 8, 16, 32.
>>>>
>>>> Introduce AD4691_MANUAL_CHANNEL() for manual mode channels,
>> which do
>>>> not expose the oversampling ratio attribute since OSR is not applicable
>>>> in that mode. A separate manual_channels array is added to
>>>> struct ad4691_channel_info and selected at probe time; offload paths
>>>> reuse the same arrays with num_channels capping access before the soft
>>>> timestamp entry.
>>>>
>>>> The reported sampling frequency accounts for the active OSR:
>>>> effective_freq = oscillator_freq / osr
>>>
>>> Technically, the way this is implemented is fine according to IIO ABI
>>> rules. Writing any attribute can cause others to change. It does
>>> introduce a potential pitfall though. Currently, changing the OSR will
>>> change the sampling frequency, so you have to always write
>> oversampling_ratio
>>> first, then write sampling_frequency to get what you asked for. If you want
>>> to change the OSR and keep the same sample rate, you still have to write
>> both
>>> attributes again.
>>>
>>> In other drivers, I've implemented it so that the requested sampling
>> frequency
>>> is stored any you always get the closest sampling frequency available based
>> on
>>> the oversampling ratio. This way, it doesn't matter which order you write
>>> the attributes. In that case, the actual periodic trigger source isn't set up
>>> until we actually start sampling.
>>>
>> Agreed. This is more intuitive. Now generally the userspace should
>> be sanity checking the value anyway as limitations may mean the new
>> sampling frequency is not particularly close to the original one but
>> at least it increases the chances of getting the expected value somewhat!
>>
>> So to me this is a nice useability improvement given the code to implement
>> it tends not to be too complex.
>>
> 
> Hi David, Jonathan,
> 
> What I understand from this is that the osr should be taken into account when writing
> the sampling frequency as well, right? Here's what I understand:
> 
> If the user wants a 125kHz freq with 4 OSR, then when internal osc will be written
> to 500kHz before single-shot read, buffer preenable/postenable.
> However, if the user wants a 500kHz frequency with 4 OSR, that would mean a 2MHz
> Internal osc freq, which is impossible.

It is up to the user to request something that is legal. They should know this
from reading the datasheet.

> 
> More than this, if the OSR is 32 the maximum effective rate would be 31250, so 25kHz
> would make it the closes available one. If the user would select 1MHz from the available
> list it would be weird I would say. So perhaps a solution for this is to display the avail list
> depending on the set OSR value.

Yes, the available list should reflect the current state of any other attributes
that affect it.

> 
> Linking the two together is perhaps wrong to begin with from my end, since in this
> driver's case, the per-channel sampling frequency is controlled by the internal oscillator
> which has static available values. So perhaps sampling frequency should be separate, and
> OSR separate as well, which would make everything cleaner.
> 
> Indeed, the effective rate is changed by OSR, but perhaps that is something the user
> should be aware of, since the sampling frequency is the rate at which the channel samples
> (1 sample per period) and OSR is how many times the channel samples upon a final sample
> is to be read. The user already has to take this into account when setting the buffer
> sampling frequency, so it would make sense to take this into account here too.

We can't change the definition of the IIO ABI just to make one driver simpler
to implement. The OSR and sample rate can't be completely independent.

If you want to leave it the way it is currently implemented though, that is fine.

> 
> Please let me know you thoughts on this,
> Radu


^ permalink raw reply

* Re: [PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB
From: Alexei Starovoitov @ 2026-04-14 14:33 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: bpf, Quan Sun, Yinhao Hu, Kaiyan Mei, Dongliang Mu, Eric Dumazet,
	Neal Cardwell, Kuniyuki Iwashima, David S. Miller, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	David Ahern, Network Development, open list:DOCUMENTATION, LKML
In-Reply-To: <20260414105702.248310-1-jiayuan.chen@linux.dev>

On Tue, Apr 14, 2026 at 3:57 AM Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
>
> A BPF_PROG_TYPE_SOCK_OPS program can set BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG
> to inject custom TCP header options. When the kernel builds a TCP packet,
> it calls tcp_established_options() to calculate the header size, which
> invokes bpf_skops_hdr_opt_len() to trigger the BPF_SOCK_OPS_HDR_OPT_LEN_CB
> callback.
>
> If the BPF program calls bpf_setsockopt(TCP_NODELAY) inside this callback,
> __tcp_sock_set_nodelay() will call tcp_push_pending_frames(), which calls
> tcp_current_mss(), which calls tcp_established_options() again,
> re-triggering the same BPF callback. This creates an infinite recursion
> that exhausts the kernel stack and causes a panic.
>
> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>   -> bpf_setsockopt(TCP_NODELAY)
>         -> tcp_push_pending_frames()
>           -> tcp_current_mss()
>                 -> tcp_established_options()
>                   -> bpf_skops_hdr_opt_len()
>                            /* infinite recursion */
>                         -> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>
> A similar reentrancy issue exists for TCP congestion control, which is
> guarded by tp->bpf_chg_cc_inprogress. Adopt the same approach: introduce
> tp->bpf_hdr_opt_len_cb_inprogress, set it before invoking the callback in
> bpf_skops_hdr_opt_len(), and check it in sol_tcp_sockopt() to reject
> bpf_setsockopt(TCP_NODELAY) calls that would trigger
> tcp_push_pending_frames() and cause the recursion.
>
> Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn>
> Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
> Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
> Reported-by: Dongliang Mu <dzm91@hust.edu.cn>
> Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/
> Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option")
> Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
> ---
>  Documentation/networking/net_cachelines/tcp_sock.rst |  1 +
>  include/linux/tcp.h                                  | 11 ++++++++++-
>  net/core/filter.c                                    |  4 ++++
>  net/ipv4/tcp_minisocks.c                             |  1 +
>  net/ipv4/tcp_output.c                                |  3 +++
>  5 files changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
> index 563daea10d6c..07d3226d90cc 100644
> --- a/Documentation/networking/net_cachelines/tcp_sock.rst
> +++ b/Documentation/networking/net_cachelines/tcp_sock.rst
> @@ -152,6 +152,7 @@ unsigned_int                  keepalive_intvl
>  int                           linger2
>  u8                            bpf_sock_ops_cb_flags
>  u8:1                          bpf_chg_cc_inprogress
> +u8:1                          bpf_hdr_opt_len_cb_inprogress
>  u16                           timeout_rehash
>  u32                           rcv_ooopack
>  u32                           rcv_rtt_last_tsecr
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index f72eef31fa23..2bfb73cf922e 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -475,12 +475,21 @@ struct tcp_sock {
>         u8      bpf_sock_ops_cb_flags;  /* Control calling BPF programs
>                                          * values defined in uapi/linux/tcp.h
>                                          */
> -       u8      bpf_chg_cc_inprogress:1; /* In the middle of
> +       u8      bpf_chg_cc_inprogress:1, /* In the middle of
>                                           * bpf_setsockopt(TCP_CONGESTION),
>                                           * it is to avoid the bpf_tcp_cc->init()
>                                           * to recur itself by calling
>                                           * bpf_setsockopt(TCP_CONGESTION, "itself").
>                                           */
> +               bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
> +                                                 * callback so that a nested
> +                                                 * bpf_setsockopt(TCP_NODELAY) or
> +                                                 * bpf_setsockopt(TCP_CORK) cannot
> +                                                 * trigger tcp_push_pending_frames(),
> +                                                 * which would call tcp_current_mss()
> +                                                 * -> bpf_skops_hdr_opt_len(), causing
> +                                                 * infinite recursion.

Let's not add new bits.
Reuse existing and test/check all in one place,
like commit 061ff040710e9 did.

pw-bot: cr

^ permalink raw reply

* Re: maintainer profiles
From: Mauro Carvalho Chehab @ 2026-04-14 14:32 UTC (permalink / raw)
  To: Dan Williams
  Cc: Jonathan Corbet, Randy Dunlap, Linux Documentation,
	Linux Kernel Mailing List, Linux Kernel Workflows
In-Reply-To: <20260414143733.6cbd6d62@localhost>

On Tue, 14 Apr 2026 14:37:33 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> On Mon, 13 Apr 2026 14:39:37 -0700
> Dan Williams <djbw@kernel.org> wrote:
> 
> > Jonathan Corbet wrote:  
> > > Randy Dunlap <rdunlap@infradead.org> writes:
> > >     
> > > > Hi,
> > > >
> > > > Is there supposed to be a difference (or distinction) in the contents of
> > > >
> > > > Documentation/process/maintainer-handbooks.rst
> > > > and
> > > > Documentation/maintainer/maintainer-entry-profile.rst
> > > > ?
> > > >
> > > > Can they be combined into one location?    
> > > 
> > > Late to the party, sorry ... the original idea, I believe, was that
> > > maintainer-handbooks.rst would be for developers looking for a guidebook
> > > for a specific subsystem, while maintainer-entry-profile.rst was about
> > > how maintainers themselves should write their subsystem guide.
> > > Doubtless things have drifted since then...  But the intended audiences
> > > were different, so it might be good to think about bringing them back
> > > into focus.    
> > 
> > Right, I think something (roughly / hand-wavy) like the below is the
> > intent. However, as I write that I notice that the combined list is a
> > bit of a mess. I also notice that there are more "P:" entries in
> > MAINTAINERS than there are entries in this maintainer-handbooks.rst
> > list.
> > 
> > So this probably wants to be a script that can build Documentation links
> > from MAINTAINERS, or otherwise provide a script for developers to query
> > a kernel tree for additional submission guides. It is probably not as
> > important for the built docs to link all guides as it is for developers
> > (or their agents) to live query a tree they are developing against.  
> 
> There is already a Python script which parses MAINTAINERS file
> (Documentation/sphinx/maintainers_include.py).
> 
> Currently, it expects a Sphinx meta-tag inside
> Documentation/process/maintainers.rst:
> 
> 	.. maintainers-include::
> 
> I guess it shouldn't be hard to add support there for a
> 
> 	.. maintainers-profile::
> 
> Making it creating a set of cross-references is probably easy. Not
> sure how easy/hard would be to create a TOC tree, though.

It was actually easier than I expected ;-)

Just submitted a patch series doing that:

https://lore.kernel.org/linux-doc/cover.1776176108.git.mchehab+huawei@kernel.org/T/#t

> > diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst
> > index 6020d188e13d..58e2af333692 100644

...

If you transform this diff into a patch, it would make sense to
add it together with the next version of my RFC ;-)

-- 
Thanks,
Mauro

^ permalink raw reply

* [PATCH RFC 4/4] docs: auto-generate maintainer entry profile links
From: Mauro Carvalho Chehab @ 2026-04-14 14:29 UTC (permalink / raw)
  To: Jonathan Corbet, Linux Doc Mailing List
  Cc: Mauro Carvalho Chehab, linux-kernel, linux-riscv, workflows,
	Albert Ou, Alexandre Ghiti, Dan Williams, Palmer Dabbelt,
	Paul Walmsley, Randy Dunlap, Shuah Khan
In-Reply-To: <cover.1776176108.git.mchehab+huawei@kernel.org>

Instead of manually creating a TOC tree for them, use the new
tag to auto-generate its TOC.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 .../maintainer/maintainer-entry-profile.rst     | 17 ++---------------
 Documentation/process/maintainer-handbooks.rst  | 10 +---------
 2 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst
index 6020d188e13d..48ecabd4ce13 100644
--- a/Documentation/maintainer/maintainer-entry-profile.rst
+++ b/Documentation/maintainer/maintainer-entry-profile.rst
@@ -98,18 +98,5 @@ Existing profiles
 For now, existing maintainer profiles are listed here; we will likely want
 to do something different in the near future.
 
-.. toctree::
-   :maxdepth: 1
-
-   ../doc-guide/maintainer-profile
-   ../nvdimm/maintainer-entry-profile
-   ../arch/riscv/patch-acceptance
-   ../process/maintainer-soc
-   ../process/maintainer-soc-clean-dts
-   ../driver-api/media/maintainer-entry-profile
-   ../process/maintainer-netdev
-   ../driver-api/vfio-pci-device-specific-driver-acceptance
-   ../nvme/feature-and-quirk-policy
-   ../filesystems/nfs/nfsd-maintainer-entry-profile
-   ../filesystems/xfs/xfs-maintainer-entry-profile
-   ../mm/damon/maintainer-profile
+See Documentation/process/maintainer-handbooks.rst for subsystem-specific
+profiles.
diff --git a/Documentation/process/maintainer-handbooks.rst b/Documentation/process/maintainer-handbooks.rst
index 3d72ad25fc6a..d3d74c719018 100644
--- a/Documentation/process/maintainer-handbooks.rst
+++ b/Documentation/process/maintainer-handbooks.rst
@@ -9,12 +9,4 @@ which is supplementary to the general development process handbook
 
 Contents:
 
-.. toctree::
-   :numbered:
-   :maxdepth: 2
-
-   maintainer-netdev
-   maintainer-soc
-   maintainer-soc-clean-dts
-   maintainer-tip
-   maintainer-kvm-x86
+.. maintainers-profile-toc::
-- 
2.52.0


^ permalink raw reply related

* [PATCH RFC 3/4] MAINTAINERS: add maintainer-tip.rst to X86
From: Mauro Carvalho Chehab @ 2026-04-14 14:29 UTC (permalink / raw)
  To: Jonathan Corbet, Linux Doc Mailing List
  Cc: Mauro Carvalho Chehab, linux-kernel, linux-riscv, workflows,
	Dan Williams, Randy Dunlap
In-Reply-To: <cover.1776176108.git.mchehab+huawei@kernel.org>

While the maintainer's profile for tip is there, it is not
listed in the X86 maintainers entry.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 620219e48f98..a85fcae5f56e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -28560,6 +28560,7 @@ M:	Ingo Molnar <mingo@redhat.com>
 M:	Borislav Petkov <bp@alien8.de>
 M:	Dave Hansen <dave.hansen@linux.intel.com>
 M:	x86@kernel.org
+P:	Documentation/process/maintainer-tip.rst
 R:	"H. Peter Anvin" <hpa@zytor.com>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
-- 
2.52.0


^ permalink raw reply related

* [PATCH RFC 2/4] MAINTAINERS: add an entry for media maintainers profile
From: Mauro Carvalho Chehab @ 2026-04-14 14:29 UTC (permalink / raw)
  To: Jonathan Corbet, Linux Doc Mailing List
  Cc: Mauro Carvalho Chehab, linux-kernel, linux-riscv, workflows,
	Dan Williams, Randy Dunlap
In-Reply-To: <cover.1776176108.git.mchehab+huawei@kernel.org>

While media has a maintainer entry profile, its entry is
missing from MAINTAINERS.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f0b106a4dd96..620219e48f98 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16115,6 +16115,7 @@ S:	Maintained
 W:	https://linuxtv.org
 Q:	http://patchwork.kernel.org/project/linux-media/list/
 T:	git git://linuxtv.org/media.git
+P:	Documentation/driver-api/media/maintainer-entry-profile.rst
 F:	Documentation/admin-guide/media/
 F:	Documentation/devicetree/bindings/media/
 F:	Documentation/driver-api/media/
-- 
2.52.0


^ permalink raw reply related

* [PATCH RFC 1/4] docs: maintainers_include: auto-generate maintainer profile TOC
From: Mauro Carvalho Chehab @ 2026-04-14 14:29 UTC (permalink / raw)
  To: Jonathan Corbet, Linux Doc Mailing List, Mauro Carvalho Chehab
  Cc: Mauro Carvalho Chehab, linux-kernel, linux-riscv, workflows,
	Dan Williams, Randy Dunlap, Shuah Khan
In-Reply-To: <cover.1776176108.git.mchehab+huawei@kernel.org>

Add a feature to allow auto-generating a TOC of maintainer entry profiles
from the corresponding field (P:) inside MAINTAINERS file(s).

Suggested-by: Dan Williams <djbw@kernel.org>
Closes: https://lore.kernel.org/linux-doc/69dd6299440be_147c801005b@djbw-dev.notmuch/
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 Documentation/sphinx/maintainers_include.py | 93 +++++++++++++++++----
 1 file changed, 76 insertions(+), 17 deletions(-)

diff --git a/Documentation/sphinx/maintainers_include.py b/Documentation/sphinx/maintainers_include.py
index 519ad18685b2..1dac83bf1a65 100755
--- a/Documentation/sphinx/maintainers_include.py
+++ b/Documentation/sphinx/maintainers_include.py
@@ -21,6 +21,8 @@ import sys
 import re
 import os.path
 
+from textwrap import indent
+
 from docutils import statemachine
 from docutils.parsers.rst import Directive
 from docutils.parsers.rst.directives.misc import Include
@@ -30,20 +32,11 @@ def ErrorString(exc):  # Shamelessly stolen from docutils
 
 __version__  = '1.0'
 
-def setup(app):
-    app.add_directive("maintainers-include", MaintainersInclude)
-    return dict(
-        version = __version__,
-        parallel_read_safe = True,
-        parallel_write_safe = True
-    )
+class MaintainersParser:
+    """Parse MAINTAINERS file(s) content"""
 
-class MaintainersInclude(Include):
-    """MaintainersInclude (``maintainers-include``) directive"""
-    required_arguments = 0
-
-    def parse_maintainers(self, path):
-        """Parse all the MAINTAINERS lines into ReST for human-readability"""
+    def __init__(self, base_path, path):
+        self.profiles = list()
 
         result = list()
         result.append(".. _maintainers:")
@@ -78,6 +71,12 @@ class MaintainersInclude(Include):
             # Drop needless input whitespace.
             line = line.rstrip()
 
+            match = re.match(r"P:\s*(Documentation/\S+)\.rst", line)
+            if match:
+                fname = os.path.relpath(match.group(1), base_path)
+                if fname not in self.profiles:
+                    self.profiles.append(fname)
+
             # Linkify all non-wildcard refs to ReST files in Documentation/.
             pat = r'(Documentation/([^\s\?\*]*)\.rst)'
             m = re.search(pat, line)
@@ -165,12 +164,23 @@ class MaintainersInclude(Include):
             for separated in field_content.split('\n'):
                 result.append(separated)
 
-        output = "\n".join(result)
+        self.output = "\n".join(result)
+
+        # Create a TOC class
+
+class MaintainersInclude(Include):
+    """MaintainersInclude (``maintainers-include``) directive"""
+    required_arguments = 0
+
+    def emit(self, base_path, path):
+        """Parse all the MAINTAINERS lines into ReST for human-readability"""
+
+        output = MaintainersParser(base_path, path).output
+
         # For debugging the pre-rendered results...
         #print(output, file=open("/tmp/MAINTAINERS.rst", "w"))
 
-        self.state_machine.insert_input(
-          statemachine.string2lines(output), path)
+        self.state_machine.insert_input(statemachine.string2lines(output), path)
 
     def run(self):
         """Include the MAINTAINERS file as part of this reST file."""
@@ -186,12 +196,61 @@ class MaintainersInclude(Include):
 
         # Append "MAINTAINERS"
         path = os.path.join(path, "MAINTAINERS")
+        base_path = os.path.dirname(self.state.document.document.current_source)
 
         try:
             self.state.document.settings.record_dependencies.add(path)
-            lines = self.parse_maintainers(path)
+            lines = self.emit(base_path, path)
         except IOError as error:
             raise self.severe('Problems with "%s" directive path:\n%s.' %
                       (self.name, ErrorString(error)))
 
         return []
+
+class MaintainersProfile(Include):
+    required_arguments = 0
+
+    def emit(self, base_path, path):
+        """Parse all the MAINTAINERS lines looking for profile entries"""
+
+        profiles = MaintainersParser(base_path, path).profiles
+
+        output  = ".. toctree::\n"
+        output += "   :maxdepth: 2\n\n"
+        output += indent("\n".join(profiles), "   ")
+
+        self.state_machine.insert_input(statemachine.string2lines(output), path)
+
+    def run(self):
+        """Include the MAINTAINERS file as part of this reST file."""
+        if not self.state.document.settings.file_insertion_enabled:
+            raise self.warning('"%s" directive disabled.' % self.name)
+
+        # Walk up source path directories to find Documentation/../
+        path = self.state_machine.document.attributes['source']
+        path = os.path.realpath(path)
+        tail = path
+        while tail != "Documentation" and tail != "":
+            (path, tail) = os.path.split(path)
+
+        # Append "MAINTAINERS"
+        path = os.path.join(path, "MAINTAINERS")
+        base_path = os.path.dirname(self.state.document.document.current_source)
+
+        try:
+            self.state.document.settings.record_dependencies.add(path)
+            lines = self.emit(base_path, path)
+        except IOError as error:
+            raise self.severe('Problems with "%s" directive path:\n%s.' %
+                      (self.name, ErrorString(error)))
+
+        return []
+
+def setup(app):
+    app.add_directive("maintainers-include", MaintainersInclude)
+    app.add_directive("maintainers-profile-toc", MaintainersProfile)
+    return dict(
+        version = __version__,
+        parallel_read_safe = True,
+        parallel_write_safe = True
+    )
-- 
2.52.0


^ permalink raw reply related

* [PATCH RFC 0/4] Auto-generate maintainer profile entries
From: Mauro Carvalho Chehab @ 2026-04-14 14:29 UTC (permalink / raw)
  To: Albert Ou, Jonathan Corbet, Dan Williams, Mauro Carvalho Chehab,
	Palmer Dabbelt, Paul Walmsley
  Cc: Mauro Carvalho Chehab, Randy Dunlap, linux-doc, linux-kernel,
	linux-riscv, workflows, Alexandre Ghiti, Shuah Khan

Hi Dan/Jon,

This small patch series changes the way maintainer entry profile links
are added to the documentation. Instead of having an entry for
each of them in a ReST file, get them from the MAINTAINERS content.

That should make them easier to maintain, as there will be a single
point to place all such profiles.

I made this as an RFC. The goal is mostly to start a discussion
about how this should be implemented.

Also, it should be noted that I'm not incorporating the diff
content from Dan's suggestion, as it was just an e-mail reply without
a proper patch title/description/SoB.

Some points on this RFC:

1. some P: entries are links to web pages. The current approach
   ignores them;

2. the current logic doesn't use glob. So, if one would add an
   entry like:

	P: Documentation/foo/profiles-*.rst

   it will generate an entry like "../foo/profiles-*".

   This probably works, as toc trees accept glob.

3. entries are placed in the order they occur in the MAINTAINERS
   file (duplicates are listed only once);

4. as Randy mentioned, if an entry there is inside another TOC
   using numbering, those entries will be numbered as well;

5. the approach I took on patch 1 was a little bit lazy, as it
   ends up processing MAINTAINERS twice, and there is some code
   duplication across classes to handle the path. I opted to do it
   this way to minimize the differences, but it makes sense to
   clean up the code in newer versions of this series or after
   applying it;

6. patches 2 and 3 can be applied independently of this approach.
   They just add two missing "P:" entries to MAINTAINERS.

Suggested-by: Dan Williams <djbw@kernel.org>
Closes: https://lore.kernel.org/linux-doc/69dd6299440be_147c801005b@djbw-dev.notmuch/

Mauro Carvalho Chehab (4):
  docs: maintainers_include: auto-generate maintainer profile TOC
  MAINTAINERS: add an entry for media maintainers profile
  MAINTAINERS: add maintainer-tip.rst to X86
  docs: auto-generate maintainer entry profile links

 .../maintainer/maintainer-entry-profile.rst   | 17 +---
 .../process/maintainer-handbooks.rst          | 10 +-
 Documentation/sphinx/maintainers_include.py   | 93 +++++++++++++++----
 MAINTAINERS                                   |  2 +
 4 files changed, 81 insertions(+), 41 deletions(-)

-- 
2.52.0

^ permalink raw reply

* RE: [PATCH v7 5/6] iio: adc: ad4691: add oversampling support
From: Sabau, Radu bogdan @ 2026-04-14 14:25 UTC (permalink / raw)
  To: Jonathan Cameron, David Lechner
  Cc: Lars-Peter Clausen, Hennerich, Michael, Sa, Nuno, Andy Shevchenko,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Uwe Kleine-König, Liam Girdwood, Mark Brown, Linus Walleij,
	Bartosz Golaszewski, Philipp Zabel, Jonathan Corbet, Shuah Khan,
	linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
	linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <20260412185821.739e477f@jic23-huawei>



> -----Original Message-----
> From: Jonathan Cameron <jic23@kernel.org>
> Sent: Sunday, April 12, 2026 8:58 PM
> To: David Lechner <dlechner@baylibre.com>
> Cc: Sabau, Radu bogdan <Radu.Sabau@analog.com>; Lars-Peter Clausen
> <lars@metafoo.de>; Hennerich, Michael <Michael.Hennerich@analog.com>;
> Sa, Nuno <Nuno.Sa@analog.com>; Andy Shevchenko <andy@kernel.org>;
> Rob Herring <robh@kernel.org>; Krzysztof Kozlowski <krzk+dt@kernel.org>;
> Conor Dooley <conor+dt@kernel.org>; Uwe Kleine-König
> <ukleinek@kernel.org>; Liam Girdwood <lgirdwood@gmail.com>; Mark Brown
> <broonie@kernel.org>; Linus Walleij <linusw@kernel.org>; Bartosz
> Golaszewski <brgl@kernel.org>; Philipp Zabel <p.zabel@pengutronix.de>;
> Jonathan Corbet <corbet@lwn.net>; Shuah Khan
> <skhan@linuxfoundation.org>; linux-iio@vger.kernel.org;
> devicetree@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
> pwm@vger.kernel.org; linux-gpio@vger.kernel.org; linux-doc@vger.kernel.org
> Subject: Re: [PATCH v7 5/6] iio: adc: ad4691: add oversampling support
> 
> [External]
> 
> On Fri, 10 Apr 2026 16:15:20 -0500
> David Lechner <dlechner@baylibre.com> wrote:
> 
> > On 4/9/26 10:28 AM, Radu Sabau via B4 Relay wrote:
> > > From: Radu Sabau <radu.sabau@analog.com>
> > >
> > > Add per-channel oversampling ratio (OSR) support for CNV burst mode.
> > > The accumulator depth register (ACC_DEPTH_IN) is programmed with the
> > > selected OSR at buffer enable time and before each single-shot read.
> > >
> > > Supported OSR values: 1, 2, 4, 8, 16, 32.
> > >
> > > Introduce AD4691_MANUAL_CHANNEL() for manual mode channels,
> which do
> > > not expose the oversampling ratio attribute since OSR is not applicable
> > > in that mode. A separate manual_channels array is added to
> > > struct ad4691_channel_info and selected at probe time; offload paths
> > > reuse the same arrays with num_channels capping access before the soft
> > > timestamp entry.
> > >
> > > The reported sampling frequency accounts for the active OSR:
> > > effective_freq = oscillator_freq / osr
> >
> > Technically, the way this is implemented is fine according to IIO ABI
> > rules. Writing any attribute can cause others to change. It does
> > introduce a potential pitfall though. Currently, changing the OSR will
> > change the sampling frequency, so you have to always write
> oversampling_ratio
> > first, then write sampling_frequency to get what you asked for. If you want
> > to change the OSR and keep the same sample rate, you still have to write
> both
> > attributes again.
> >
> > In other drivers, I've implemented it so that the requested sampling
> frequency
> > is stored any you always get the closest sampling frequency available based
> on
> > the oversampling ratio. This way, it doesn't matter which order you write
> > the attributes. In that case, the actual periodic trigger source isn't set up
> > until we actually start sampling.
> >
> Agreed. This is more intuitive. Now generally the userspace should
> be sanity checking the value anyway as limitations may mean the new
> sampling frequency is not particularly close to the original one but
> at least it increases the chances of getting the expected value somewhat!
> 
> So to me this is a nice useability improvement given the code to implement
> it tends not to be too complex.
> 

Hi David, Jonathan,

What I understand from this is that the osr should be taken into account when writing
the sampling frequency as well, right? Here's what I understand:

If the user wants a 125kHz freq with 4 OSR, then the internal osc will be written
to 500kHz before single-shot read, buffer preenable/postenable.
However, if the user wants a 500kHz frequency with 4 OSR, that would mean a 2MHz
internal osc freq, which is impossible.

More than this, if the OSR is 32 the maximum effective rate would be 31250, so 25kHz
would be the closest available one. If the user selected 1MHz from the available
list it would be weird I would say. So perhaps a solution for this is to display the avail list
depending on the set OSR value.

Linking the two together is perhaps wrong to begin with from my end, since in this
driver's case, the per-channel sampling frequency is controlled by the internal oscillator
which has static available values. So perhaps sampling frequency should be separate, and
OSR separate as well, which would make everything cleaner.

Indeed, the effective rate is changed by OSR, but perhaps that is something the user
should be aware of, since the sampling frequency is the rate at which the channel samples
(1 sample per period) and OSR is how many times the channel samples before a final sample
is ready to be read. The user already has to take this into account when setting the buffer
sampling frequency, so it would make sense to take this into account here too.

Please let me know your thoughts on this,
Radu

^ permalink raw reply

* [RFC, PATCH 12/12] Documentation/userfaultfd: document working set tracking
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Document the new userfaultfd capabilities for VM working set tracking:

- UFFD_FEATURE_MINOR_ANON and UFFD_FEATURE_MINOR_ASYNC for anonymous
  minor fault interception using the PROT_NONE hinting mechanism.
- UFFDIO_DEACTIVATE for marking pages as inaccessible while keeping
  them resident.
- Sync and async fault resolution modes, and UFFDIO_SET_MODE for
  runtime toggling between them.
- PAGEMAP_SCAN with PAGE_IS_UFFD_DEACTIVATED for cold page detection.
- Cleanup semantics on unregister and close.
- NUMA balancing interaction on anonymous VMAs.
- Complete VMM workflow example for the cold page eviction lifecycle,
  with a note on shmem applicability.

Update the feature flag descriptions at the top of the guide to
reference the new section.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 Documentation/admin-guide/mm/userfaultfd.rst | 141 ++++++++++++++++++-
 1 file changed, 140 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index e5cc8848dcb3..fc89e029060c 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -111,7 +111,11 @@ events, except page fault notifications, may be generated:
 - ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports
   ``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory
   areas. ``UFFD_FEATURE_MINOR_SHMEM`` is the analogous feature indicating
-  support for shmem virtual memory areas.
+  support for shmem virtual memory areas. ``UFFD_FEATURE_MINOR_ANON``
+  extends minor fault support to anonymous private memory using
+  PROT_NONE hinting; see the `Anonymous Minor Faults`_ section.
+  ``UFFD_FEATURE_MINOR_ASYNC`` enables asynchronous auto-resolution for
+  anonymous minor faults (requires ``UFFD_FEATURE_MINOR_ANON``).
 
 - ``UFFD_FEATURE_MOVE`` indicates that the kernel supports moving an
   existing page contents from userspace.
@@ -297,6 +301,141 @@ transparent to the guest, we want that same address range to act as if it was
 still poisoned, even though it's on a new physical host which ostensibly
 doesn't have a memory error in the exact same spot.
 
+Anonymous Minor Faults
+----------------------
+
+``UFFD_FEATURE_MINOR_ANON`` enables ``UFFDIO_REGISTER_MODE_MINOR`` on
+anonymous private memory. Unlike shmem/hugetlbfs minor faults (where a page
+exists in the page cache but has no PTE), anonymous minor faults use the
+PROT_NONE hinting mechanism: pages remain resident in memory with their PFNs
+preserved in the PTEs, but access permissions are removed so the next access
+triggers a fault.
+
+This is designed for VM memory managers that need to track the working set of
+anonymous guest memory for cold page eviction to tiered or remote storage.
+
+**Setup:**
+
+1. Open a userfaultfd and enable ``UFFD_FEATURE_MINOR_ANON`` (and optionally
+   ``UFFD_FEATURE_MINOR_ASYNC``) via ``UFFDIO_API``.
+
+2. Register the guest memory range with ``UFFDIO_REGISTER_MODE_MINOR``
+   (and ``UFFDIO_REGISTER_MODE_MISSING`` if evicted pages will need to be
+   fetched back from storage).
+
+**Deactivation:**
+
+Use ``UFFDIO_DEACTIVATE`` to mark pages as inaccessible. This ioctl takes a
+``struct uffdio_range`` and sets PROT_NONE on all present PTEs in the range,
+using the same mechanism as NUMA balancing. Pages stay resident and their
+physical frames are preserved — only access permissions are removed.
+
+**Fault Handling:**
+
+When a deactivated page is accessed:
+
+- **Sync mode** (default): The faulting thread blocks and a
+  ``UFFD_PAGEFAULT_FLAG_MINOR`` message is delivered to the userfaultfd
+  handler. The handler resolves the fault with ``UFFDIO_CONTINUE``, which
+  restores the PTE permissions and wakes the faulting thread.
+
+- **Async mode** (``UFFD_FEATURE_MINOR_ASYNC``): The kernel automatically
+  restores PTE permissions and the thread continues without blocking. No
+  message is delivered to the handler.
+
+**Cold Page Detection with PAGEMAP_SCAN:**
+
+After deactivating a range and letting the application run, use the
+``PAGEMAP_SCAN`` ioctl on ``/proc/pid/pagemap`` with the
+``PAGE_IS_UFFD_DEACTIVATED`` category flag to efficiently find pages that were
+never re-accessed (cold pages)::
+
+    struct pm_scan_arg arg = {
+        .size = sizeof(arg),
+        .start = guest_mem_start,
+        .end = guest_mem_end,
+        .vec = (uint64_t)regions,
+        .vec_len = regions_len,
+        .category_mask = PAGE_IS_UFFD_DEACTIVATED,
+        .return_mask = PAGE_IS_UFFD_DEACTIVATED,
+    };
+    long n = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
+
+The returned ``page_region`` array contains contiguous cold ranges that can
+then be evicted.
+
+**Cleanup:**
+
+When the userfaultfd is closed or the range is unregistered, all protnone
+PTEs are automatically restored to their normal VMA permissions. This
+prevents pages from becoming permanently inaccessible.
+
+**Interaction with NUMA Balancing:**
+
+NUMA balancing is automatically disabled on anonymous VMAs registered with
+``UFFDIO_REGISTER_MODE_MINOR``, since both mechanisms use PROT_NONE PTEs
+as access hints and would interfere with each other. Shmem VMAs are not
+affected since ``UFFDIO_DEACTIVATE`` zaps PTEs there instead of using
+PROT_NONE.
+
+**VMM Working Set Tracking Workflow:**
+
+A typical VMM lifecycle for cold page eviction to tiered storage::
+
+    /* One-time setup */
+    uffd = userfaultfd(O_CLOEXEC | O_NONBLOCK);
+    ioctl(uffd, UFFDIO_API, &(struct uffdio_api){
+        .api = UFFD_API,
+        .features = UFFD_FEATURE_MINOR_ANON |
+                    UFFD_FEATURE_MINOR_ASYNC,
+    });
+    ioctl(uffd, UFFDIO_REGISTER, &(struct uffdio_register){
+        .range = { guest_mem, guest_size },
+        .mode = UFFDIO_REGISTER_MODE_MINOR |
+                UFFDIO_REGISTER_MODE_MISSING,
+    });
+
+    /* Tracking loop */
+    while (vm_running) {
+        /* 1. Detection phase (async — no vCPU stalls) */
+        ioctl(uffd, UFFDIO_DEACTIVATE, &full_range);
+        sleep(tracking_interval);
+
+        /* 2. Find cold pages */
+        ioctl(pagemap_fd, PAGEMAP_SCAN, &(struct pm_scan_arg){
+            .category_mask = PAGE_IS_UFFD_DEACTIVATED,
+            ...
+        });
+
+        /* 3. Switch to sync for safe eviction */
+        ioctl(uffd, UFFDIO_SET_MODE,
+              &(struct uffdio_set_mode){
+                  .disable = UFFD_FEATURE_MINOR_ASYNC });
+
+        /* 4. Evict cold pages (vCPU faults block in handler) */
+        for each cold range:
+            pwrite(storage_fd, cold_addr, len, offset);
+            madvise(cold_addr, len, MADV_DONTNEED);
+
+        /* 5. Resume async tracking */
+        ioctl(uffd, UFFDIO_SET_MODE,
+              &(struct uffdio_set_mode){
+                  .enable = UFFD_FEATURE_MINOR_ASYNC });
+    }
+
+During step 4, if a vCPU accesses a cold page being evicted, it blocks
+with a ``UFFD_PAGEFAULT_FLAG_MINOR`` fault. The handler can either let it
+wait (the eviction completes, ``MADV_DONTNEED`` fires, the fault retries as
+``MISSING`` and is resolved with ``UFFDIO_COPY`` from storage) or resolve
+it immediately with ``UFFDIO_CONTINUE``.
+
+The same workflow applies to shmem-backed guest memory
+(``UFFD_FEATURE_MINOR_SHMEM``). The only difference is the
+``PAGEMAP_SCAN`` mask for cold page detection: use
+``!PAGE_IS_PRESENT`` instead of ``PAGE_IS_UFFD_DEACTIVATED``, since
+``UFFDIO_DEACTIVATE`` zaps PTEs on shmem (pages stay in page cache)
+rather than setting PROT_NONE.
+
 QEMU/KVM
 ========
 
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 11/12] selftests/mm: add userfaultfd anonymous minor fault tests
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Add tests for UFFD_FEATURE_MINOR_ANON, UFFD_FEATURE_MINOR_ASYNC,
UFFDIO_DEACTIVATE, UFFDIO_SET_MODE, and PAGE_IS_UFFD_DEACTIVATED:

- minor-anon-async: populate pages, register MODE_MINOR with
  MINOR_ASYNC, deactivate via UFFDIO_DEACTIVATE, re-access and verify
  content is preserved with no faults delivered to the handler.

- minor-anon-sync: same setup but without MINOR_ASYNC. Verify that
  each deactivated page access delivers a MINOR fault to the handler,
  and UFFDIO_CONTINUE resolves it. Exercises both PTE and THP paths.

- minor-anon-pagemap: deactivate a range, touch first half, use
  PAGEMAP_SCAN with PAGE_IS_UFFD_DEACTIVATED to verify the untouched
  second half is reported as cold.

- minor-anon-gup: write() from a deactivated page into a pipe to
  exercise GUP resolution through protnone PTEs via async auto-restore.

- minor-anon-async-toggle: full detection-to-eviction cycle using
  UFFDIO_SET_MODE. Start async (detection), flip to sync (eviction
  of cold pages), flip back to async.

- minor-anon-close: deactivate pages, close the uffd fd, verify all
  pages are accessible again (protnone PTEs restored on cleanup).

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 tools/testing/selftests/mm/uffd-unit-tests.c | 458 +++++++++++++++++++
 1 file changed, 458 insertions(+)

diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index 6f5e404a446c..8bd5a642bd5a 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -7,6 +7,7 @@
 
 #include "uffd-common.h"
 
+#include <linux/fs.h>
 #include "../../../../mm/gup_test.h"
 
 #ifdef __NR_userfaultfd
@@ -623,6 +624,423 @@ void uffd_minor_collapse_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *
 	uffd_minor_test_common(gopts, true, false);
 }
 
+static void deactivate_range(int uffd, __u64 start, __u64 len)
+{
+	struct uffdio_range range = { .start = start, .len = len };
+
+	if (ioctl(uffd, UFFDIO_DEACTIVATE, &range))
+		err("UFFDIO_DEACTIVATE failed");
+}
+
+static void set_async_mode(int uffd, bool enable)
+{
+	struct uffdio_set_mode mode = { };
+
+	if (enable)
+		mode.enable = UFFD_FEATURE_MINOR_ASYNC;
+	else
+		mode.disable = UFFD_FEATURE_MINOR_ASYNC;
+
+	if (ioctl(uffd, UFFDIO_SET_MODE, &mode))
+		err("UFFDIO_SET_MODE failed");
+}
+
+/*
+ * Test async minor faults on anonymous memory.
+ * Populate pages, register MODE_MINOR with MINOR_ASYNC,
+ * deactivate, re-access, verify content preserved and no faults delivered.
+ */
+static void uffd_minor_anon_async_test(uffd_global_test_opts_t *gopts,
+				       uffd_test_args_t *args)
+{
+	unsigned long nr_pages = gopts->nr_pages;
+	unsigned long page_size = gopts->page_size;
+	unsigned long p;
+
+	/* Populate all pages with known content */
+	for (p = 0; p < nr_pages; p++)
+		memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+	/* Register MODE_MINOR (uffd was opened with MINOR_ANON | MINOR_ASYNC) */
+	if (uffd_register(gopts->uffd, gopts->area_dst,
+			  nr_pages * page_size,
+			  false, false, true))
+		err("register failure");
+
+	/* Deactivate all pages — sets protnone */
+	deactivate_range(gopts->uffd, (uint64_t)gopts->area_dst,
+			 nr_pages * page_size);
+
+	/* Access all pages — should auto-resolve, no faults */
+	for (p = 0; p < nr_pages; p++) {
+		unsigned char *page = (unsigned char *)gopts->area_dst +
+				      p * page_size;
+		unsigned char expected = p % 255 + 1;
+
+		if (page[0] != expected) {
+			uffd_test_fail("page %lu content mismatch: %u != %u",
+				       p, page[0], expected);
+			return;
+		}
+	}
+
+	uffd_test_pass();
+}
+
+/*
+ * Custom fault handler for anon minor — just UFFDIO_CONTINUE, no content
+ * modification (the page is protnone so we can't access it from here).
+ */
+static void uffd_handle_minor_anon(uffd_global_test_opts_t *gopts,
+				   struct uffd_msg *msg,
+				   struct uffd_args *uargs)
+{
+	struct uffdio_continue req;
+
+	if (!(msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR))
+		err("expected minor fault, got 0x%llx",
+		    msg->arg.pagefault.flags);
+
+	req.range.start = msg->arg.pagefault.address;
+	req.range.len = gopts->page_size;
+	req.mode = 0;
+	if (ioctl(gopts->uffd, UFFDIO_CONTINUE, &req)) {
+		/*
+		 * THP races with khugepaged collapse/split:
+		 * EAGAIN: PMD changed under us
+		 * EEXIST: THP present but already resolved
+		 * In both cases the page is accessible — the faulting
+		 * thread retries and succeeds.
+		 */
+		if (errno != EEXIST && errno != EAGAIN)
+			err("UFFDIO_CONTINUE failed");
+	}
+
+	uargs->minor_faults++;
+}
+
+/*
+ * Test sync minor faults on anonymous memory.
+ * Populate pages, register MODE_MINOR (sync), deactivate,
+ * access from worker thread, verify fault delivered, UFFDIO_CONTINUE resolves.
+ */
+static void uffd_minor_anon_sync_test(uffd_global_test_opts_t *gopts,
+				      uffd_test_args_t *args)
+{
+	unsigned long nr_pages = gopts->nr_pages;
+	unsigned long page_size = gopts->page_size;
+	pthread_t uffd_mon;
+	struct uffd_args uargs = { };
+	char c = '\0';
+	unsigned long p;
+
+	uargs.gopts = gopts;
+	uargs.handle_fault = uffd_handle_minor_anon;
+
+	/* Populate all pages */
+	for (p = 0; p < nr_pages; p++)
+		memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+	/* Register MODE_MINOR (uffd opened with MINOR_ANON, no MINOR_ASYNC) */
+	if (uffd_register(gopts->uffd, gopts->area_dst,
+			  nr_pages * page_size,
+			  false, false, true))
+		err("register failure");
+
+	/* Deactivate all pages */
+	deactivate_range(gopts->uffd, (uint64_t)gopts->area_dst,
+			 nr_pages * page_size);
+
+	/* Start fault handler thread */
+	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &uargs))
+		err("uffd_poll_thread create");
+
+	/* Access all pages — triggers sync minor faults, handler does CONTINUE */
+	for (p = 0; p < nr_pages; p++) {
+		unsigned char *page = (unsigned char *)gopts->area_dst +
+				      p * page_size;
+
+		if (page[0] != (p % 255 + 1)) {
+			uffd_test_fail("page %lu content mismatch", p);
+			goto out;
+		}
+	}
+
+	if (uargs.minor_faults == 0) {
+		uffd_test_fail("expected minor faults, got 0");
+		goto out;
+	}
+
+	uffd_test_pass();
+out:
+	if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c))
+		err("pipe write");
+	if (pthread_join(uffd_mon, NULL))
+		err("join() failed");
+}
+
+/*
+ * Test PAGEMAP_SCAN detection of deactivated (cold) pages.
+ */
+static void uffd_minor_anon_pagemap_test(uffd_global_test_opts_t *gopts,
+					  uffd_test_args_t *args)
+{
+	unsigned long nr_pages = gopts->nr_pages;
+	unsigned long page_size = gopts->page_size;
+	unsigned long p;
+	struct page_region regions[16];
+	struct pm_scan_arg pm_arg;
+	int pagemap_fd;
+	long ret;
+
+	/* Need at least 4 pages */
+	if (nr_pages < 4) {
+		uffd_test_skip("need at least 4 pages");
+		return;
+	}
+
+	/* Populate all pages */
+	for (p = 0; p < nr_pages; p++)
+		memset(gopts->area_dst + p * page_size, 0xab, page_size);
+
+	/* Register and deactivate */
+	if (uffd_register(gopts->uffd, gopts->area_dst,
+			  nr_pages * page_size,
+			  false, false, true))
+		err("register failure");
+
+	deactivate_range(gopts->uffd, (uint64_t)gopts->area_dst,
+			 nr_pages * page_size);
+
+	/* Touch first half of pages to re-activate them (async auto-resolve) */
+	for (p = 0; p < nr_pages / 2; p++) {
+		volatile char *page = gopts->area_dst + p * page_size;
+		(void)*page;
+	}
+
+	/* Scan for cold (still deactivated) pages */
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		err("open pagemap");
+
+	memset(&pm_arg, 0, sizeof(pm_arg));
+	pm_arg.size = sizeof(pm_arg);
+	pm_arg.start = (uint64_t)gopts->area_dst;
+	pm_arg.end = (uint64_t)gopts->area_dst + nr_pages * page_size;
+	pm_arg.vec = (uint64_t)regions;
+	pm_arg.vec_len = 16;
+	pm_arg.category_mask = PAGE_IS_UFFD_DEACTIVATED;
+	pm_arg.return_mask = PAGE_IS_UFFD_DEACTIVATED;
+
+	ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &pm_arg);
+	close(pagemap_fd);
+
+	if (ret < 0) {
+		uffd_test_fail("PAGEMAP_SCAN failed: %s", strerror(errno));
+		return;
+	}
+
+	/*
+	 * The second half of pages should be reported as deactivated.
+	 * They may be coalesced into one region.
+	 */
+	if (ret < 1) {
+		uffd_test_fail("expected cold pages, got %ld regions", ret);
+		return;
+	}
+
+	/* Verify the cold region covers the second half */
+	uint64_t cold_start = regions[0].start;
+	uint64_t expected_start = (uint64_t)gopts->area_dst +
+				  (nr_pages / 2) * page_size;
+
+	if (cold_start != expected_start) {
+		uffd_test_fail("cold region starts at 0x%lx, expected 0x%lx",
+			       (unsigned long)cold_start,
+			       (unsigned long)expected_start);
+		return;
+	}
+
+	uffd_test_pass();
+}
+
+/*
+ * Test that GUP resolves through protnone PTEs (async mode).
+ * Deactivate pages, then use a pipe to exercise GUP on the deactivated
+ * memory. write() from deactivated pages triggers GUP which must fault
+ * through the protnone PTE.
+ */
+static void uffd_minor_anon_gup_test(uffd_global_test_opts_t *gopts,
+				     uffd_test_args_t *args)
+{
+	unsigned long page_size = gopts->page_size;
+	char *buf;
+	int pipefd[2];
+
+	buf = malloc(page_size);
+	if (!buf)
+		err("malloc");
+
+	/* Populate first page with known content */
+	memset(gopts->area_dst, 0xCD, page_size);
+
+	if (uffd_register(gopts->uffd, gopts->area_dst, page_size,
+			  false, false, true))
+		err("register failure");
+
+	deactivate_range(gopts->uffd, (uint64_t)gopts->area_dst, page_size);
+
+	if (pipe(pipefd))
+		err("pipe");
+
+	/*
+	 * write() from the deactivated page into the pipe.
+	 * This triggers GUP on the protnone PTE. In async mode the
+	 * kernel auto-restores permissions and GUP succeeds.
+	 */
+	if (write(pipefd[1], gopts->area_dst, page_size) != page_size) {
+		uffd_test_fail("write from deactivated page failed: %s",
+			       strerror(errno));
+		goto out;
+	}
+
+	if (read(pipefd[0], buf, page_size) != page_size) {
+		uffd_test_fail("read from pipe failed");
+		goto out;
+	}
+
+	if (memcmp(buf, "\xCD", 1) != 0) {
+		uffd_test_fail("content mismatch: got 0x%02x, expected 0xCD",
+			       (unsigned char)buf[0]);
+		goto out;
+	}
+
+	uffd_test_pass();
+out:
+	close(pipefd[0]);
+	close(pipefd[1]);
+	free(buf);
+}
+
+/*
+ * Test runtime toggle between async and sync modes.
+ * Start in async mode (detection), flip to sync (eviction), verify faults
+ * block, resolve them, flip back to async.
+ */
+static void uffd_minor_anon_async_toggle_test(uffd_global_test_opts_t *gopts,
+					      uffd_test_args_t *args)
+{
+	unsigned long nr_pages = gopts->nr_pages;
+	unsigned long page_size = gopts->page_size;
+	struct uffd_args uargs = { };
+	pthread_t uffd_mon;
+	char c = '\0';
+	unsigned long p;
+
+	uargs.gopts = gopts;
+	uargs.handle_fault = uffd_handle_minor_anon;
+
+	/* Populate */
+	for (p = 0; p < nr_pages; p++)
+		memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+	if (uffd_register(gopts->uffd, gopts->area_dst,
+			  nr_pages * page_size,
+			  false, false, true))
+		err("register failure");
+
+	/* Phase 1: async detection — deactivate, access first half */
+	deactivate_range(gopts->uffd, (uint64_t)gopts->area_dst,
+			 nr_pages * page_size);
+
+	for (p = 0; p < nr_pages / 2; p++) {
+		volatile char *page = gopts->area_dst + p * page_size;
+		(void)*page;  /* auto-resolves in async mode */
+	}
+
+	/* Phase 2: flip to sync for eviction */
+	set_async_mode(gopts->uffd, false);
+
+	/* Start handler — will receive faults for cold pages */
+	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &uargs))
+		err("uffd_poll_thread create");
+
+	/* Access second half (cold pages) — should trigger sync faults */
+	for (p = nr_pages / 2; p < nr_pages; p++) {
+		unsigned char *page = (unsigned char *)gopts->area_dst +
+				      p * page_size;
+		if (page[0] != (p % 255 + 1)) {
+			uffd_test_fail("page %lu content mismatch", p);
+			goto out;
+		}
+	}
+
+	if (uargs.minor_faults == 0) {
+		uffd_test_fail("expected sync faults, got 0");
+		goto out;
+	}
+
+	/* Phase 3: flip back to async */
+	set_async_mode(gopts->uffd, true);
+
+	/* Deactivate and access again — should auto-resolve */
+	deactivate_range(gopts->uffd, (uint64_t)gopts->area_dst,
+			 nr_pages * page_size);
+
+	for (p = 0; p < nr_pages; p++) {
+		volatile char *page = gopts->area_dst + p * page_size;
+		(void)*page;
+	}
+
+	uffd_test_pass();
+out:
+	if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c))
+		err("pipe write");
+	if (pthread_join(uffd_mon, NULL))
+		err("join() failed");
+}
+
+/*
+ * Test that deactivated pages become accessible after closing uffd.
+ */
+static void uffd_minor_anon_close_test(uffd_global_test_opts_t *gopts,
+				       uffd_test_args_t *args)
+{
+	unsigned long nr_pages = gopts->nr_pages;
+	unsigned long page_size = gopts->page_size;
+	unsigned long p;
+
+	/* Populate */
+	for (p = 0; p < nr_pages; p++)
+		memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+	if (uffd_register(gopts->uffd, gopts->area_dst,
+			  nr_pages * page_size,
+			  false, false, true))
+		err("register failure");
+
+	deactivate_range(gopts->uffd, (uint64_t)gopts->area_dst,
+			 nr_pages * page_size);
+
+	/* Close uffd — should restore protnone PTEs */
+	close(gopts->uffd);
+	gopts->uffd = -1;
+
+	/* All pages should be accessible with original content */
+	for (p = 0; p < nr_pages; p++) {
+		unsigned char *page = (unsigned char *)gopts->area_dst +
+				      p * page_size;
+		unsigned char expected = p % 255 + 1;
+
+		if (page[0] != expected) {
+			uffd_test_fail("page %lu not accessible after close", p);
+			return;
+		}
+	}
+
+	uffd_test_pass();
+}
+
 static sigjmp_buf jbuf, *sigbuf;
 
 static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
@@ -1625,6 +2043,46 @@ uffd_test_case_t uffd_tests[] = {
 		/* We can't test MADV_COLLAPSE, so try our luck */
 		.uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM,
 	},
+	{
+		.name = "minor-anon-async",
+		.uffd_fn = uffd_minor_anon_async_test,
+		.mem_targets = MEM_ANON,
+		.uffd_feature_required =
+		UFFD_FEATURE_MINOR_ANON | UFFD_FEATURE_MINOR_ASYNC,
+	},
+	{
+		.name = "minor-anon-sync",
+		.uffd_fn = uffd_minor_anon_sync_test,
+		.mem_targets = MEM_ANON,
+		.uffd_feature_required = UFFD_FEATURE_MINOR_ANON,
+	},
+	{
+		.name = "minor-anon-pagemap",
+		.uffd_fn = uffd_minor_anon_pagemap_test,
+		.mem_targets = MEM_ANON,
+		.uffd_feature_required =
+		UFFD_FEATURE_MINOR_ANON | UFFD_FEATURE_MINOR_ASYNC,
+	},
+	{
+		.name = "minor-anon-gup",
+		.uffd_fn = uffd_minor_anon_gup_test,
+		.mem_targets = MEM_ANON,
+		.uffd_feature_required =
+		UFFD_FEATURE_MINOR_ANON | UFFD_FEATURE_MINOR_ASYNC,
+	},
+	{
+		.name = "minor-anon-async-toggle",
+		.uffd_fn = uffd_minor_anon_async_toggle_test,
+		.mem_targets = MEM_ANON,
+		.uffd_feature_required =
+		UFFD_FEATURE_MINOR_ANON | UFFD_FEATURE_MINOR_ASYNC,
+	},
+	{
+		.name = "minor-anon-close",
+		.uffd_fn = uffd_minor_anon_close_test,
+		.mem_targets = MEM_ANON,
+		.uffd_feature_required = UFFD_FEATURE_MINOR_ANON,
+	},
 	{
 		.name = "sigbus",
 		.uffd_fn = uffd_sigbus_test,
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Add UFFDIO_SET_MODE ioctl to toggle UFFD_FEATURE_MINOR_ASYNC at
runtime. Takes mmap_write_lock for serialization against all in-flight
faults. On sync-to-async transition, wake threads blocked in
handle_userfault() so they retry and auto-resolve.

Since ctx->features can now be modified concurrently, add
userfaultfd_features() helper that wraps READ_ONCE() and convert
all ctx->features reads to use it.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c                 | 95 ++++++++++++++++++++++++++++----
 include/uapi/linux/userfaultfd.h | 13 +++++
 2 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 43064238fd8d..0edb33599491 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -79,24 +79,33 @@ struct userfaultfd_wake_range {
 /* internal indication that UFFD_API ioctl was successfully executed */
 #define UFFD_FEATURE_INITIALIZED		(1u << 31)
 
+/*
+ * Read ctx->features with READ_ONCE() since UFFDIO_SET_MODE can
+ * modify it concurrently.
+ */
+static unsigned int userfaultfd_features(struct userfaultfd_ctx *ctx)
+{
+	return READ_ONCE(ctx->features);
+}
+
 static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
 {
-	return ctx->features & UFFD_FEATURE_INITIALIZED;
+	return userfaultfd_features(ctx) & UFFD_FEATURE_INITIALIZED;
 }
 
 static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
 {
-	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+	return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_WP_ASYNC);
 }
 
 static bool userfaultfd_minor_anon_ctx(struct userfaultfd_ctx *ctx)
 {
-	return ctx && (ctx->features & UFFD_FEATURE_MINOR_ANON);
+	return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ANON);
 }
 
 static bool userfaultfd_minor_async_ctx(struct userfaultfd_ctx *ctx)
 {
-	return ctx && (ctx->features & UFFD_FEATURE_MINOR_ASYNC);
+	return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ASYNC);
 }
 
 static unsigned int userfaultfd_ctx_flags(struct userfaultfd_ctx *ctx)
@@ -122,7 +131,7 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
 	if (!ctx)
 		return false;
 
-	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+	return userfaultfd_features(ctx) & UFFD_FEATURE_WP_UNPOPULATED;
 }
 
 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
@@ -435,7 +444,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
 	VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
 
-	if (ctx->features & UFFD_FEATURE_SIGBUS)
+	if (userfaultfd_features(ctx) & UFFD_FEATURE_SIGBUS)
 		goto out;
 	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
 		goto out;
@@ -506,7 +515,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
 	uwq.wq.private = current;
 	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
-				reason, ctx->features);
+				reason, userfaultfd_features(ctx));
 	uwq.ctx = ctx;
 	uwq.waken = false;
 
@@ -668,7 +677,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 	if (!octx)
 		return 0;
 
-	if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+	if (!(userfaultfd_features(octx) & UFFD_FEATURE_EVENT_FORK)) {
 		userfaultfd_reset_ctx(vma);
 		return 0;
 	}
@@ -774,7 +783,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	if (!ctx)
 		return;
 
-	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
+	if (userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMAP) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
 		down_write(&ctx->map_changing_lock);
@@ -824,7 +833,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
 	struct userfaultfd_wait_queue ewq;
 
 	ctx = vma->vm_userfaultfd_ctx.ctx;
-	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
+	if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMOVE))
 		return true;
 
 	userfaultfd_ctx_get(ctx);
@@ -863,7 +872,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
 	struct userfaultfd_unmap_ctx *unmap_ctx;
 	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
 
-	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
+	if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_UNMAP) ||
 	    has_unmap_ctx(ctx, unmaps, start, end))
 		return 0;
 
@@ -1826,6 +1835,65 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+/*
+ * Features that can be toggled at runtime via UFFDIO_SET_MODE.
+ * Intended for features enabled at UFFDIO_API time (not enforced below).
+ */
+#define UFFD_FEATURE_TOGGLEABLE	(UFFD_FEATURE_MINOR_ASYNC)
+
+static int userfaultfd_set_mode(struct userfaultfd_ctx *ctx,
+				  unsigned long arg)
+{
+	struct uffdio_set_mode mode;
+	struct mm_struct *mm = ctx->mm;
+
+	if (copy_from_user(&mode, (void __user *)arg, sizeof(mode)))
+		return -EFAULT;
+
+	/* enable and disable must not overlap */
+	if (mode.enable & mode.disable)
+		return -EINVAL;
+
+	/* only toggleable features are allowed */
+	if ((mode.enable | mode.disable) & ~UFFD_FEATURE_TOGGLEABLE)
+		return -EINVAL;
+
+	if (!mmget_not_zero(mm))
+		return -ESRCH;
+
+	/*
+	 * mmap_write_lock serializes against all page faults.
+	 * After we release, no in-flight faults from the old mode exist.
+	 */
+	{
+		unsigned int new_features;
+
+		mmap_write_lock(mm);
+		new_features = userfaultfd_features(ctx);
+		new_features |= mode.enable;
+		new_features &= ~mode.disable;
+		WRITE_ONCE(ctx->features, new_features);
+		mmap_write_unlock(mm);
+	}
+
+	/*
+	 * If switching to async, wake threads blocked in handle_userfault().
+	 * They will retry the fault and auto-resolve under the new mode.
+	 * len=0 means wake all pending faults on this context.
+	 */
+	if (mode.enable & UFFD_FEATURE_MINOR_ASYNC) {
+		struct userfaultfd_wake_range range = { .len = 0 };
+
+		spin_lock_irq(&ctx->fault_pending_wqh.lock);
+		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
+				     &range);
+		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
+		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+	}
+
+	mmput(mm);
+	return 0;
+}
 
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
@@ -2150,6 +2218,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_DEACTIVATE:
 		ret = userfaultfd_deactivate(ctx, arg);
 		break;
+	case UFFDIO_SET_MODE:
+		ret = userfaultfd_set_mode(ctx, arg);
+		break;
 	}
 	return ret;
 }
@@ -2177,7 +2248,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	 *	protocols: aa:... bb:...
 	 */
 	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
-		   pending, total, UFFD_API, ctx->features,
+		   pending, total, UFFD_API, userfaultfd_features(ctx),
 		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
 }
 #endif
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 775825da2596..f0f14f9db06c 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -84,6 +84,7 @@
 #define _UFFDIO_CONTINUE		(0x07)
 #define _UFFDIO_POISON			(0x08)
 #define _UFFDIO_DEACTIVATE		(0x09)
+#define _UFFDIO_SET_MODE		(0x0A)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -110,6 +111,8 @@
 				      struct uffdio_poison)
 #define UFFDIO_DEACTIVATE	_IOR(UFFDIO, _UFFDIO_DEACTIVATE,	\
 				     struct uffdio_range)
+#define UFFDIO_SET_MODE		_IOW(UFFDIO, _UFFDIO_SET_MODE,	\
+				     struct uffdio_set_mode)
 
 /* read() structure */
 struct uffd_msg {
@@ -395,6 +398,16 @@ struct uffdio_move {
 	__s64 move;
 };
 
+struct uffdio_set_mode {
+	/*
+	 * Toggle async mode for features at runtime.
+	 * Supported: UFFD_FEATURE_MINOR_ASYNC.
+	 * Setting a bit in both enable and disable is invalid.
+	 */
+	__u64 enable;
+	__u64 disable;
+};
+
 /*
  * Flags for the userfaultfd(2) system call itself.
  */
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 09/12] mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

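Report deactivated anonymous pages in PAGEMAP_SCAN results via a new
PAGE_IS_UFFD_DEACTIVATED category. The flag is only set on anonymous
VMAs registered for userfaultfd minor faults; on shmem, UFFDIO_DEACTIVATE
zaps the PTEs, so cold pages show up as !PAGE_IS_PRESENT instead.
Both the PTE and PMD (THP) levels are handled.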

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/proc/task_mmu.c      | 11 ++++++++++-
 include/uapi/linux/fs.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e091931d7ca1..fc42cfd5720a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2329,7 +2329,7 @@ static int pagemap_release(struct inode *inode, struct file *file)
 				 PAGE_IS_FILE |	PAGE_IS_PRESENT |	\
 				 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |	\
 				 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY |	\
-				 PAGE_IS_GUARD)
+				 PAGE_IS_GUARD | PAGE_IS_UFFD_DEACTIVATED)
 #define PM_SCAN_FLAGS		(PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
 
 struct pagemap_scan_private {
@@ -2354,6 +2354,10 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 
 		categories = PAGE_IS_PRESENT;
 
+		if (pte_protnone(pte) && vma_is_accessible(vma) &&
+		    vma_is_anonymous(vma) && userfaultfd_minor(vma))
+			categories |= PAGE_IS_UFFD_DEACTIVATED;
+
 		if (!pte_uffd_wp(pte))
 			categories |= PAGE_IS_WRITTEN;
 
@@ -2422,6 +2426,11 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 		struct page *page;
 
 		categories |= PAGE_IS_PRESENT;
+
+		if (pmd_protnone(pmd) && vma_is_accessible(vma) &&
+		    vma_is_anonymous(vma) && userfaultfd_minor(vma))
+			categories |= PAGE_IS_UFFD_DEACTIVATED;
+
 		if (!pmd_uffd_wp(pmd))
 			categories |= PAGE_IS_WRITTEN;
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 70b2b661f42c..af5b28901800 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -455,6 +455,7 @@ typedef int __bitwise __kernel_rwf_t;
 #define PAGE_IS_HUGE		(1 << 6)
 #define PAGE_IS_SOFT_DIRTY	(1 << 7)
 #define PAGE_IS_GUARD		(1 << 8)
+#define PAGE_IS_UFFD_DEACTIVATED (1 << 9)
 
 /*
  * struct page_region - Page region with flags
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 08/12] userfaultfd: enable UFFD_FEATURE_MINOR_ANON
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Add UFFD_FEATURE_MINOR_ANON, UFFD_FEATURE_MINOR_ASYNC to
UFFD_API_FEATURES and UFFDIO_DEACTIVATE to UFFD_API_RANGE_IOCTLS.
The feature is now available to userspace.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 include/uapi/linux/userfaultfd.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 336d07e1b6de..775825da2596 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -42,7 +42,9 @@
 			   UFFD_FEATURE_WP_UNPOPULATED |	\
 			   UFFD_FEATURE_POISON |		\
 			   UFFD_FEATURE_WP_ASYNC |		\
-			   UFFD_FEATURE_MOVE)
+			   UFFD_FEATURE_MOVE |			\
+			   UFFD_FEATURE_MINOR_ANON |		\
+			   UFFD_FEATURE_MINOR_ASYNC)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -54,13 +56,15 @@
 	 (__u64)1 << _UFFDIO_MOVE |		\
 	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
 	 (__u64)1 << _UFFDIO_CONTINUE |		\
-	 (__u64)1 << _UFFDIO_POISON)
+	 (__u64)1 << _UFFDIO_POISON |		\
+	 (__u64)1 << _UFFDIO_DEACTIVATE)
 #define UFFD_API_RANGE_IOCTLS_BASIC		\
 	((__u64)1 << _UFFDIO_WAKE |		\
 	 (__u64)1 << _UFFDIO_COPY |		\
 	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
 	 (__u64)1 << _UFFDIO_CONTINUE |		\
-	 (__u64)1 << _UFFDIO_POISON)
+	 (__u64)1 << _UFFDIO_POISON |		\
+	 (__u64)1 << _UFFDIO_DEACTIVATE)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

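Skip NUMA balancing scans of anonymous VMAs registered for userfaultfd
minor faults. Both NUMA balancing and uffd use protnone PTEs on anonymous
memory, so let uffd own them. Shmem is unaffected: UFFDIO_DEACTIVATE zaps
its PTEs instead of making them protnone, so NUMA scanning proceeds
normally. NUMA locality stats for the skipped VMAs are fed from the uffd
fault path instead.

Add the NUMAB_SKIP_UFFD_MINOR trace reason.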

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 include/linux/sched/numa_balancing.h |  1 +
 include/trace/events/sched.h         |  3 ++-
 kernel/sched/fair.c                  | 13 +++++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 52b22c5c396d..5668074a4271 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -23,6 +23,7 @@ enum numa_vmaskip_reason {
 	NUMAB_SKIP_PID_INACTIVE,
 	NUMAB_SKIP_IGNORE_PID,
 	NUMAB_SKIP_SEQ_COMPLETED,
+	NUMAB_SKIP_UFFD_MINOR,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..02e79b56db28 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -728,7 +728,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
 	EM( NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )	\
 	EM( NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )		\
-	EMe(NUMAB_SKIP_SEQ_COMPLETED,		"seq_completed" )
+	EM( NUMAB_SKIP_SEQ_COMPLETED,		"seq_completed" )	\
+	EMe(NUMAB_SKIP_UFFD_MINOR,		"uffd_minor" )
 
 /* Redefine for export. */
 #undef EM
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab4114712be7..57beb04562cf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -25,6 +25,7 @@
 #include <linux/hugetlb_inline.h>
 #include <linux/jiffies.h>
 #include <linux/mm_api.h>
+#include <linux/userfaultfd_k.h>
 #include <linux/highmem.h>
 #include <linux/spinlock_api.h>
 #include <linux/cpumask_api.h>
@@ -3459,6 +3460,18 @@ static void task_numa_work(struct callback_head *work)
 			continue;
 		}
 
+		/*
+		 * Skip anonymous VMAs registered for userfaultfd minor faults.
+		 * Both NUMA balancing and uffd use protnone PTEs on anonymous
+		 * memory — let uffd own the hinting. For shmem, UFFDIO_DEACTIVATE
+		 * zaps PTEs entirely (no protnone conflict), so NUMA scanning
+		 * can proceed normally.
+		 */
+		if (vma_is_anonymous(vma) && userfaultfd_minor(vma)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UFFD_MINOR);
+			continue;
+		}
+
 		/*
 		 * Shared library pages mapped by multiple processes are not
 		 * migrated as it is expected they are cache replicated. Avoid
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

When UFFD_FEATURE_MINOR_ASYNC is enabled, skip handle_userfault() in
the shmem and hugetlbfs minor fault paths. The normal fault path
installs the PTE from page cache directly.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 mm/hugetlb.c | 3 ++-
 mm/shmem.c   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 327eaa4074d3..c10d2432768c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5847,7 +5847,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
 		}
 
 		/* Check for page in userfault range. */
-		if (userfaultfd_minor(vma)) {
+		if (userfaultfd_minor(vma) &&
+		    !userfaultfd_minor_async(vma)) {
 			folio_unlock(folio);
 			folio_put(folio);
 			/* See comment in userfaultfd_missing() block above */
diff --git a/mm/shmem.c b/mm/shmem.c
index b40f3cd48961..ce47e77fc090 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2489,7 +2489,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 	fault_mm = vma ? vma->vm_mm : NULL;
 
 	folio = filemap_get_entry(inode->i_mapping, index);
-	if (folio && vma && userfaultfd_minor(vma)) {
+	if (folio && vma && userfaultfd_minor(vma) &&
+	    !userfaultfd_minor_async(vma)) {
 		if (!xa_is_value(folio))
 			folio_put(folio);
 		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

When a protnone PTE/PMD fault occurs on a VMA with VM_UFFD_MINOR,
dispatch to the userfaultfd minor fault path instead of NUMA balancing.
In async mode, restore the permissions inline; in sync mode, deliver
the fault via handle_userfault().

Feed NUMA locality stats from the fault path via task_numa_fault()
so the scheduler retains placement data even though NUMA scanning
is skipped on these VMAs.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 include/linux/huge_mm.h |  6 +++++
 mm/huge_memory.c        | 24 +++++++++++++++++++
 mm/memory.c             | 51 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a4d9f964dfde..a900bb530998 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -519,6 +519,7 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 }
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
+vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf);
 
 vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
 
@@ -707,6 +708,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	return 0;
 }
 
+static inline vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf)
+{
+	return 0;
+}
+
 static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
 {
 	return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2ad736ff007c..264c646a8573 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2181,6 +2181,30 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
 	return pmd_dirty(pmd);
 }
 
+vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+
+	if (userfaultfd_minor_async(vma)) {
+		pmd_t pmd;
+
+		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+		if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
+			spin_unlock(vmf->ptl);
+			return 0;
+		}
+		pmd = pmd_modify(vmf->orig_pmd, vma->vm_page_prot);
+		pmd = pmd_mkyoung(pmd);
+		set_pmd_at(vma->vm_mm, vmf->address & HPAGE_PMD_MASK,
+			   vmf->pmd, pmd);
+		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+		spin_unlock(vmf->ptl);
+		return 0;
+	}
+
+	return handle_userfault(vmf, VM_UFFD_MINOR);
+}
+
 /* NUMA hinting page fault entry point for trans huge pmds */
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
diff --git a/mm/memory.c b/mm/memory.c
index c65e82c86fed..f068ff4027e8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6045,6 +6045,47 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
 	}
 }
 
+static void uffd_minor_feed_numa_fault(struct vm_fault *vmf)
+{
+	struct folio *folio;
+
+	folio = vm_normal_folio(vmf->vma, vmf->address, vmf->orig_pte);
+	if (folio) {
+		int nid = folio_nid(folio);
+		int flags = 0;
+
+		if (nid == numa_node_id())
+			flags |= TNF_FAULT_LOCAL;
+		task_numa_fault(folio_last_cpupid(folio), nid, 1, flags);
+	}
+}
+
+static vm_fault_t do_uffd_minor_anon(struct vm_fault *vmf)
+{
+	/* Feed NUMA stats even though we skip NUMA scanning on this VMA */
+	uffd_minor_feed_numa_fault(vmf);
+
+	if (userfaultfd_minor_async(vmf->vma)) {
+		pte_t pte;
+
+		spin_lock(vmf->ptl);
+		if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			return 0;
+		}
+		pte = pte_modify(vmf->orig_pte, vmf->vma->vm_page_prot);
+		pte = pte_mkyoung(pte);
+		set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, pte);
+		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		return 0;
+	}
+
+	/* Sync mode: unmap PTE and deliver to userfaultfd handler */
+	pte_unmap(vmf->pte);
+	return handle_userfault(vmf, VM_UFFD_MINOR);
+}
+
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -6319,8 +6360,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	if (!pte_present(vmf->orig_pte))
 		return do_swap_page(vmf);
 
-	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+		if (userfaultfd_minor(vmf->vma))
+			return do_uffd_minor_anon(vmf);
 		return do_numa_page(vmf);
+	}
 
 	spin_lock(vmf->ptl);
 	entry = vmf->orig_pte;
@@ -6434,8 +6478,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		return 0;
 	}
 	if (pmd_trans_huge(vmf.orig_pmd)) {
-		if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+		if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) {
+			if (userfaultfd_minor(vma))
+				return do_huge_pmd_uffd_minor(&vmf);
 			return do_huge_pmd_numa_page(&vmf);
+		}
 
 		if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
 		    !pmd_write(vmf.orig_pmd)) {
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Allow UFFDIO_CONTINUE on anonymous VMAs with VM_UFFD_MINOR. For shmem,
CONTINUE installs a PTE from page cache. For anonymous memory, the
page is already mapped via a protnone PTE — CONTINUE restores the
original VMA permissions.

PTE level: mfill_atomic_pte_continue_anon() walks to the PTE, verifies
protnone, restores permissions. Rename the shmem path to
mfill_atomic_pte_continue_shmem() for clarity.

PMD/THP level: mfill_atomic_pmd_continue_anon() restores protnone PMD
permissions in place without splitting. If the PMD changed since it
was read, it returns -EAGAIN and the mfill_atomic() loop re-reads the
PMD and retries.

Add protnone PTE/PMD checks in userfaultfd_must_wait() so sync minor
faults properly block until resolved.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c |  9 +++++-
 mm/userfaultfd.c | 82 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b317c9854b86..43064238fd8d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -340,8 +340,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	if (!pmd_present(_pmd))
 		return false;
 
-	if (pmd_trans_huge(_pmd))
+	if (pmd_trans_huge(_pmd)) {
+		if (pmd_protnone(_pmd) && (reason & VM_UFFD_MINOR))
+			return true;
 		return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+	}
 
 	pte = pte_offset_map(pmd, address);
 	if (!pte)
@@ -366,6 +369,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	 */
 	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
 		goto out;
+	/* PTE is still protnone (deactivated), wait for userspace to resolve. */
+	if (pte_protnone(ptent) && (reason & VM_UFFD_MINOR))
+		goto out;
 
 	ret = false;
 out:
@@ -1820,6 +1826,7 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
 	__s64 ret;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3373b11b9d83..4c52fa5d1608 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -380,8 +380,61 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 	return ret;
 }
 
-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
+static int mfill_atomic_pte_continue_anon(pmd_t *dst_pmd,
+					  struct vm_area_struct *dst_vma,
+					  unsigned long dst_addr,
+					  uffd_flags_t flags)
+{
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	int ret = -EFAULT;
+
+	ptep = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+	if (!ptep)
+		return ret;
+
+	pte = ptep_get(ptep);
+	if (!pte_protnone(pte))
+		goto out_unlock;
+
+	pte = pte_modify(pte, dst_vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
+	if (flags & MFILL_ATOMIC_WP)
+		pte = pte_wrprotect(pte);
+	set_pte_at(dst_vma->vm_mm, dst_addr, ptep, pte);
+	update_mmu_cache(dst_vma, dst_addr, ptep);
+	ret = 0;
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+	return ret;
+}
+
+static int mfill_atomic_pmd_continue_anon(struct mm_struct *mm,
+					  struct vm_area_struct *vma,
+					  unsigned long addr,
+					  pmd_t *pmd, pmd_t orig_pmd,
+					  uffd_flags_t flags)
+{
+	spinlock_t *ptl;
+	pmd_t entry;
+
+	ptl = pmd_lock(mm, pmd);
+	if (unlikely(!pmd_same(pmdp_get(pmd), orig_pmd))) {
+		spin_unlock(ptl);
+		return -EAGAIN;
+	}
+
+	entry = pmd_modify(orig_pmd, vma->vm_page_prot);
+	entry = pmd_mkyoung(entry);
+	if (flags & MFILL_ATOMIC_WP)
+		entry = pmd_wrprotect(entry);
+	set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, entry);
+	update_mmu_cache_pmd(vma, addr, pmd);
+	spin_unlock(ptl);
+	return 0;
+}
+
+static int mfill_atomic_pte_continue_shmem(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr,
 				     uffd_flags_t flags)
@@ -667,7 +720,10 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 	ssize_t err;
 
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
-		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
+		if (vma_is_anonymous(dst_vma))
+			return mfill_atomic_pte_continue_anon(dst_pmd, dst_vma,
+							      dst_addr, flags);
+		return mfill_atomic_pte_continue_shmem(dst_pmd, dst_vma,
 						 dst_addr, flags);
 	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
 		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
@@ -802,11 +858,25 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			break;
 		}
 		/*
-		 * If the dst_pmd is THP don't override it and just be strict.
-		 * (This includes the case where the PMD used to be THP and
-		 * changed back to none after __pte_alloc().)
+		 * THP PMD: for anon CONTINUE, restore protnone PMD
+		 * permissions in place. For other operations, reject.
 		 */
 		if (unlikely(pmd_trans_huge(dst_pmdval))) {
+			if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
+			    vma_is_anonymous(dst_vma) &&
+			    pmd_protnone(dst_pmdval)) {
+				err = mfill_atomic_pmd_continue_anon(
+					dst_mm, dst_vma, dst_addr,
+					dst_pmd, dst_pmdval, flags);
+				if (err == -EAGAIN)
+					continue; /* PMD changed, re-read it */
+				if (err)
+					break;
+				dst_addr += HPAGE_PMD_SIZE;
+				src_addr += HPAGE_PMD_SIZE;
+				copied += HPAGE_PMD_SIZE;
+				continue;
+			}
 			err = -EEXIST;
 			break;
 		}
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

UFFDIO_DEACTIVATE marks pages as deactivated within a VM_UFFD_MINOR
range:

- Anonymous memory: set protnone via change_protection(MM_CP_UFFD_DEACTIVATE).
  Pages stay resident with PFNs preserved, only permissions removed.
  MM_CP_UFFD_DEACTIVATE is handled independently from MM_CP_PROT_NUMA,
  bypassing folio_can_map_prot_numa() and CONFIG_NUMA_BALANCING guards.

- Shared shmem/hugetlbfs: zap PTEs via zap_page_range_single().
  Pages stay in page cache.

- Private hugetlb: rejected with -EINVAL (zapping would destroy content).

Cleanup on unregister/close: restore protnone PTEs to normal permissions
in userfaultfd_clear_vma(), preventing permanently inaccessible pages.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c              | 35 ++++++++++++++++
 include/linux/mm.h            |  2 +
 include/linux/userfaultfd_k.h |  2 +
 mm/huge_memory.c              |  9 ++--
 mm/mprotect.c                 |  9 +++-
 mm/userfaultfd.c              | 78 +++++++++++++++++++++++++++++++++--
 6 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d508ad19e89..b317c9854b86 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1441,6 +1441,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
 			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
 
+		/* DEACTIVATE is only supported for MINOR ranges. */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_DEACTIVATE);
+
 		/*
 		 * Now that we scanned all vmas we can already tell
 		 * userland which ioctls methods are guaranteed to
@@ -1788,6 +1792,34 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
+				  unsigned long arg)
+{
+	int ret;
+	struct uffdio_range uffdio_range;
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	if (copy_from_user(&uffdio_range, (void __user *)arg,
+			   sizeof(uffdio_range)))
+		return -EFAULT;
+
+	ret = validate_range(ctx->mm, uffdio_range.start, uffdio_range.len);
+	if (ret)
+		return ret;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mdeactivate_range(ctx, uffdio_range.start,
+					uffdio_range.len);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	return ret;
+}
+
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
 	__s64 ret;
@@ -2108,6 +2140,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_POISON:
 		ret = userfaultfd_poison(ctx, arg);
 		break;
+	case UFFDIO_DEACTIVATE:
+		ret = userfaultfd_deactivate(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..fc2841264d56 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3036,6 +3036,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 #define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
 #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
 					    MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd deactivation */
+#define  MM_CP_UFFD_DEACTIVATE             (1UL << 4)
 
 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d1d4ed4a08b0..c94b5c5b5f24 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -130,6 +130,8 @@ extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
 			       unsigned long len, bool enable_wp);
 extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
+extern int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			     unsigned long len);
 
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b298cba853ab..2ad736ff007c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2563,6 +2563,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pmd_t oldpmd, entry;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int ret = 1;
@@ -2582,8 +2583,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		goto unlock;
 	}
 
-	if (prot_numa) {
+	/* Already protnone — nothing to do for either NUMA or uffd */
+	if ((prot_numa || uffd_deactivate) && pmd_protnone(*pmd))
+		goto unlock;
 
+	if (prot_numa) {
 		/*
 		 * Avoid trapping faults against the zero page. The read-only
 		 * data is likely to be read-cached on the local CPU and
@@ -2592,9 +2596,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (is_huge_zero_pmd(*pmd))
 			goto unlock;
 
-		if (pmd_protnone(*pmd))
-			goto unlock;
-
 		if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
 					     vma_is_single_threaded_private(vma)))
 			goto unlock;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..7c612a680014 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -220,6 +220,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 	long pages = 0;
 	bool is_private_single_threaded;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int nr_ptes;
@@ -245,7 +246,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			pte_t ptent;
 
 			/* Already in the desired state. */
-			if (prot_numa && pte_protnone(oldpte))
+			if ((prot_numa || uffd_deactivate) &&
+			    pte_protnone(oldpte))
 				continue;
 
 			page = vm_normal_page(vma, addr, oldpte);
@@ -255,6 +257,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			/*
 			 * Avoid trapping faults against the zero or KSM
 			 * pages. See similar comment in change_huge_pmd.
+			 * Skip this filter for uffd deactivation which
+			 * must set protnone regardless of NUMA placement.
 			 */
 			if (prot_numa &&
 			    !folio_can_map_prot_numa(folio, vma,
@@ -651,6 +655,9 @@ long change_protection(struct mmu_gather *tlb,
 	WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
 #endif
 
+	if (cp_flags & MM_CP_UFFD_DEACTIVATE)
+		newprot = PAGE_NONE;
+
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot,
 						  cp_flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index dba1ea26fdfe..3373b11b9d83 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -775,7 +775,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) &&
+	if (!vma_is_shmem(dst_vma) && !vma_is_anonymous(dst_vma) &&
 	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
 		goto out_unlock;
 
@@ -797,13 +797,16 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			break;
 		}
 		dst_pmdval = pmdp_get_lockless(dst_pmd);
+		if (unlikely(!pmd_present(dst_pmdval))) {
+			err = -EEXIST;
+			break;
+		}
 		/*
 		 * If the dst_pmd is THP don't override it and just be strict.
 		 * (This includes the case where the PMD used to be THP and
 		 * changed back to none after __pte_alloc().)
 		 */
-		if (unlikely(!pmd_present(dst_pmdval) ||
-				pmd_trans_huge(dst_pmdval))) {
+		if (unlikely(pmd_trans_huge(dst_pmdval))) {
 			err = -EEXIST;
 			break;
 		}
@@ -996,6 +999,65 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
 	return err;
 }
 
+int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+		      unsigned long len)
+{
+	struct mm_struct *dst_mm = ctx->mm;
+	unsigned long end = start + len;
+	struct vm_area_struct *dst_vma;
+	long err;
+	VMA_ITERATOR(vmi, dst_mm, start);
+
+	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(start + len <= start);
+
+	guard(mmap_read_lock)(dst_mm);
+	guard(rwsem_read)(&ctx->map_changing_lock);
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	err = -ENOENT;
+	for_each_vma_range(vmi, dst_vma, end) {
+		unsigned long vma_start = max(dst_vma->vm_start, start);
+		unsigned long vma_end = min(dst_vma->vm_end, end);
+
+		if (!userfaultfd_minor(dst_vma)) {
+			err = -ENOENT;
+			break;
+		}
+
+		/*
+		 * Private hugetlb has no page cache to fall back on —
+		 * zapping PTEs would destroy page content.
+		 */
+		if (is_vm_hugetlb_page(dst_vma) &&
+		    !(dst_vma->vm_flags & VM_SHARED)) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (vma_is_anonymous(dst_vma)) {
+			/* Anonymous: set protnone, pages stay resident */
+			struct mmu_gather tlb;
+
+			tlb_gather_mmu(&tlb, dst_mm);
+			err = change_protection(&tlb, dst_vma, vma_start,
+						vma_end,
+						MM_CP_UFFD_DEACTIVATE);
+			tlb_finish_mmu(&tlb);
+			if (err < 0)
+				break;
+		} else {
+			/* Shared shmem/hugetlb: zap PTEs, pages stay in page cache */
+			zap_page_range_single(dst_vma, vma_start,
+					      vma_end - vma_start, NULL);
+		}
+		err = 0;
+	}
+	return err;
+}
 
 void double_pt_lock(spinlock_t *ptl1,
 		    spinlock_t *ptl2)
@@ -1988,6 +2050,16 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
 	if (userfaultfd_wp(vma))
 		uffd_wp_range(vma, start, end - start, false);
 
+	/* Restore protnone PTEs to normal permissions */
+	if (userfaultfd_minor(vma) && vma_is_anonymous(vma)) {
+		struct mmu_gather tlb;
+
+		tlb_gather_mmu(&tlb, vma->vm_mm);
+		change_protection(&tlb, vma, start, end,
+				  MM_CP_TRY_CHANGE_WRITABLE);
+		tlb_finish_mmu(&tlb);
+	}
+
 	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
 				    vma->vm_flags & ~__VM_UFFD_FLAGS,
 				    NULL_VM_UFFD_CTX, give_up_on_oom);
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Allow UFFDIO_REGISTER_MODE_MINOR on anonymous VMAs when the
UFFD_FEATURE_MINOR_ANON feature is enabled.

Replace the bool wp_async parameter in vma_can_userfault() and
userfaultfd_register_range() with an extensible ctx_flags bitmap.
Add UFFD_CTX_WP_ASYNC and UFFD_CTX_MINOR_ANON flags, and
userfaultfd_ctx_flags() to build the bitmap from ctx->features.

Add userfaultfd_minor_async() helper for checking async minor mode
from the fault path.

Gate UFFD_FEATURE_MINOR_ANON and UFFD_FEATURE_MINOR_ASYNC on
CONFIG_HAVE_ARCH_USERFAULTFD_MINOR. Reject MINOR_ASYNC in UFFDIO_API
with -EINVAL unless at least one minor feature (MINOR_ANON,
MINOR_HUGETLBFS or MINOR_SHMEM) is also requested.

Not yet visible to userspace (not in UFFD_API_FEATURES).

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c              | 49 ++++++++++++++++++++++++++++++-----
 include/linux/userfaultfd_k.h | 19 +++++++++++---
 mm/userfaultfd.c              |  4 +--
 3 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bdc84e5219cd..8d508ad19e89 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -89,6 +89,27 @@ static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
 	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
 }
 
+static bool userfaultfd_minor_anon_ctx(struct userfaultfd_ctx *ctx)
+{
+	return ctx && (ctx->features & UFFD_FEATURE_MINOR_ANON);
+}
+
+static bool userfaultfd_minor_async_ctx(struct userfaultfd_ctx *ctx)
+{
+	return ctx && (ctx->features & UFFD_FEATURE_MINOR_ASYNC);
+}
+
+static unsigned int userfaultfd_ctx_flags(struct userfaultfd_ctx *ctx)
+{
+	unsigned int flags = 0;
+
+	if (userfaultfd_wp_async_ctx(ctx))
+		flags |= UFFD_CTX_WP_ASYNC;
+	if (userfaultfd_minor_anon_ctx(ctx))
+		flags |= UFFD_CTX_MINOR_ANON;
+	return flags;
+}
+
 /*
  * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
  * meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -1271,7 +1292,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	bool basic_ioctls;
 	unsigned long start, end;
 	struct vma_iterator vmi;
-	bool wp_async = userfaultfd_wp_async_ctx(ctx);
+	unsigned int ctx_flags = userfaultfd_ctx_flags(ctx);
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
 
@@ -1345,7 +1366,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
 		/* check not compatible vmas */
 		ret = -EINVAL;
-		if (!vma_can_userfault(cur, vm_flags, wp_async))
+		if (!vma_can_userfault(cur, vm_flags, ctx_flags))
 			goto out_unlock;
 
 		/*
@@ -1398,7 +1419,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	VM_WARN_ON_ONCE(!found);
 
 	ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
-					 wp_async);
+					 ctx_flags);
 
 out_unlock:
 	mmap_write_unlock(mm);
@@ -1443,7 +1464,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	unsigned long start, end, vma_end;
 	const void __user *buf = (void __user *)arg;
 	struct vma_iterator vmi;
-	bool wp_async = userfaultfd_wp_async_ctx(ctx);
+	unsigned int ctx_flags = userfaultfd_ctx_flags(ctx);
 
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1505,7 +1526,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * provides for more strict behavior to notice
 		 * unregistration errors.
 		 */
-		if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
+		if (!vma_can_userfault(cur, cur->vm_flags, ctx_flags))
 			goto out_unlock;
 
 		found = true;
@@ -1526,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			goto skip;
 
 		VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
-		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
+		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, ctx_flags));
 		VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
 
 		if (vma->vm_start > start)
@@ -1890,6 +1911,11 @@ bool userfaultfd_wp_async(struct vm_area_struct *vma)
 	return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
 }
 
+bool userfaultfd_minor_async(struct vm_area_struct *vma)
+{
+	return userfaultfd_minor_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
 	/*
@@ -1993,11 +2019,20 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	if (features & UFFD_FEATURE_WP_ASYNC)
 		features |= UFFD_FEATURE_WP_UNPOPULATED;
 
+	ret = -EINVAL;
+	/* MINOR_ASYNC requires at least one minor feature */
+	if ((features & UFFD_FEATURE_MINOR_ASYNC) &&
+	    !(features & (UFFD_FEATURE_MINOR_ANON |
+			  UFFD_FEATURE_MINOR_HUGETLBFS |
+			  UFFD_FEATURE_MINOR_SHMEM)))
+		goto err_out;
+
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
 	uffdio_api.features &=
-		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM |
+		  UFFD_FEATURE_MINOR_ANON | UFFD_FEATURE_MINOR_ASYNC);
 #endif
 	if (!pgtable_supports_uffd_wp())
 		uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index fd5f42765497..d1d4ed4a08b0 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -208,9 +208,13 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
+/* Flags for vma_can_userfault() describing uffd context capabilities */
+#define UFFD_CTX_WP_ASYNC	(1 << 0)
+#define UFFD_CTX_MINOR_ANON	(1 << 1)
+
 static inline bool vma_can_userfault(struct vm_area_struct *vma,
 				     vm_flags_t vm_flags,
-				     bool wp_async)
+				     unsigned int ctx_flags)
 {
 	vm_flags &= __VM_UFFD_FLAGS;
 
@@ -218,14 +222,15 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 		return false;
 
 	if ((vm_flags & VM_UFFD_MINOR) &&
-	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
+	    !is_vm_hugetlb_page(vma) && !vma_is_shmem(vma) &&
+	    !(vma_is_anonymous(vma) && (ctx_flags & UFFD_CTX_MINOR_ANON)))
 		return false;
 
 	/*
 	 * If wp async enabled, and WP is the only mode enabled, allow any
 	 * memory type.
 	 */
-	if (wp_async && (vm_flags == VM_UFFD_WP))
+	if ((ctx_flags & UFFD_CTX_WP_ASYNC) && (vm_flags == VM_UFFD_WP))
 		return true;
 
 	/*
@@ -270,6 +275,7 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 				       struct list_head *uf);
 extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
 extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
+extern bool userfaultfd_minor_async(struct vm_area_struct *vma);
 
 void userfaultfd_reset_ctx(struct vm_area_struct *vma);
 
@@ -283,7 +289,7 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
 			       struct vm_area_struct *vma,
 			       vm_flags_t vm_flags,
 			       unsigned long start, unsigned long end,
-			       bool wp_async);
+			       unsigned int ctx_flags);
 
 void userfaultfd_release_new(struct userfaultfd_ctx *ctx);
 
@@ -446,6 +452,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool userfaultfd_minor_async(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
 {
 	return false;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 927086bb4a3c..dba1ea26fdfe 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -2008,7 +2008,7 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
 			       struct vm_area_struct *vma,
 			       vm_flags_t vm_flags,
 			       unsigned long start, unsigned long end,
-			       bool wp_async)
+			       unsigned int ctx_flags)
 {
 	VMA_ITERATOR(vmi, ctx->mm, start);
 	struct vm_area_struct *prev = vma_prev(&vmi);
@@ -2021,7 +2021,7 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
 	for_each_vma_range(vmi, vma, end) {
 		cond_resched();
 
-		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
+		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, ctx_flags));
 		VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
 				vma->vm_userfaultfd_ctx.ctx != ctx);
 		VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
-- 
2.51.2


^ permalink raw reply related

* [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
	Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
	linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
	Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Add UAPI definitions for userfaultfd working set tracking on anonymous
memory:

- UFFD_FEATURE_MINOR_ANON: minor fault support for anonymous memory
- UFFD_FEATURE_MINOR_ASYNC: auto-resolve minor faults without handler
- UFFDIO_DEACTIVATE: mark pages as deactivated (protnone or PTE zap)

Not yet added to UFFD_API_FEATURES or UFFD_API_RANGE_IOCTLS.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 include/uapi/linux/userfaultfd.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 2841e4ea8f2c..336d07e1b6de 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -79,6 +79,7 @@
 #define _UFFDIO_WRITEPROTECT		(0x06)
 #define _UFFDIO_CONTINUE		(0x07)
 #define _UFFDIO_POISON			(0x08)
+#define _UFFDIO_DEACTIVATE		(0x09)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -103,6 +104,8 @@
 				      struct uffdio_continue)
 #define UFFDIO_POISON		_IOWR(UFFDIO, _UFFDIO_POISON, \
 				      struct uffdio_poison)
+#define UFFDIO_DEACTIVATE	_IOR(UFFDIO, _UFFDIO_DEACTIVATE,	\
+				     struct uffdio_range)
 
 /* read() structure */
 struct uffd_msg {
@@ -230,6 +233,18 @@ struct uffdio_api {
 	 *
 	 * UFFD_FEATURE_MOVE indicates that the kernel supports moving an
 	 * existing page contents from userspace.
+	 *
+	 * UFFD_FEATURE_MINOR_ANON indicates that minor fault interception
+	 * is supported for anonymous private memory.  Pages are made
+	 * inaccessible via UFFDIO_DEACTIVATE (sets PROT_NONE while
+	 * preserving the page) and faults are delivered when the pages
+	 * are re-accessed.
+	 *
+	 * UFFD_FEATURE_MINOR_ASYNC indicates asynchronous minor fault
+	 * mode.  When set, faults on deactivated pages are auto-resolved
+	 * by the kernel (PTE permissions restored immediately) without
+	 * delivering a message to the userfaultfd handler.  Use
+	 * PAGEMAP_SCAN to find pages that were not re-accessed.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -248,6 +263,8 @@ struct uffdio_api {
 #define UFFD_FEATURE_POISON			(1<<14)
 #define UFFD_FEATURE_WP_ASYNC			(1<<15)
 #define UFFD_FEATURE_MOVE			(1<<16)
+#define UFFD_FEATURE_MINOR_ANON			(1<<17)
+#define UFFD_FEATURE_MINOR_ASYNC		(1<<18)
 	__u64 features;
 
 	__u64 ioctls;
-- 
2.51.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox