Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [PATCH v2 1/3] init: remove deprecated "load_ramdisk" and "prompt_ramdisk" command line parameters
From: Askar Safin @ 2025-10-13  6:05 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: linux-fsdevel, linux-kernel, Linus Torvalds, Greg Kroah-Hartman,
	Christian Brauner, Al Viro, Jan Kara, Christoph Hellwig,
	Jens Axboe, Aleksa Sarai, Thomas Weißschuh, Julian Stecklina,
	Gao Xiang, Art Nikpal, Andrew Morton, Alexander Graf, Rob Landley,
	Lennart Poettering, linux-arch, linux-block, initramfs, linux-api,
	linux-doc, Michal Simek, Luis Chamberlain, Kees Cook,
	Thorsten Blum, Heiko Carstens, Arnd Bergmann, Dave Young,
	Christophe Leroy, Krzysztof Kozlowski, Borislav Petkov,
	Jessica Clarke, Nicolas Schichan, David Disseldorp, patches
In-Reply-To: <CAHp75VeJM_OoCWDX20FhphRi6e7rG9Z4X6zkjx9vFF12n7Ef7A@mail.gmail.com>

On Fri, Oct 10, 2025 at 6:02 PM Andy Shevchenko
<andy.shevchenko@gmail.com> wrote:
> 1) often the last period is missing in the commit messages;
I will fix in v3.

> 2) in this change it's unclear for how long (years) the feature was
> deprecated, i.e. the other patch states that 2020 for something else.
> I wonder if this one has the similar order of oldness.

These two commits were done in 2020, too. I will fix in v3.

--
Askar Safin

^ permalink raw reply

* [PATCH v2] vdso: Remove struct getcpu_cache
From: Thomas Weißschuh @ 2025-10-13  9:20 UTC (permalink / raw)
  To: Huacai Chen, WANG Xuerui, Heiko Carstens, Vasily Gorbik,
	Alexander Gordeev, Christian Borntraeger, Sven Schnelle,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Richard Weinberger,
	Anton Ivanov, Johannes Berg, Vincenzo Frascino, Shuah Khan
  Cc: loongarch, linux-kernel, linux-s390, linux-um, linux-api,
	linux-kselftest, Thomas Weißschuh

The cache parameter of getcpu() is not used by the kernel and no user
ever passes it in anyways.

Remove the struct and its header.

As a side-effect we get rid of an unwanted inclusion of the linux/
header namespace from vDSO code.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
---
Changes in v2:
- Rebase on v6.18-rc1
- Link to v1: https://lore.kernel.org/r/20250826-getcpu_cache-v1-1-8748318f6141@linutronix.de
---
We could also completely remove the parameter, but I am not sure if
that is a good idea for syscalls and vDSO entrypoints.
---
 arch/loongarch/vdso/vgetcpu.c                   |  5 ++---
 arch/s390/kernel/vdso64/getcpu.c                |  3 +--
 arch/s390/kernel/vdso64/vdso.h                  |  4 +---
 arch/x86/entry/vdso/vgetcpu.c                   |  5 ++---
 arch/x86/include/asm/vdso/processor.h           |  4 +---
 arch/x86/um/vdso/um_vdso.c                      |  7 +++----
 include/linux/getcpu.h                          | 19 -------------------
 include/linux/syscalls.h                        |  3 +--
 kernel/sys.c                                    |  4 +---
 tools/testing/selftests/vDSO/vdso_test_getcpu.c |  4 +---
 10 files changed, 13 insertions(+), 45 deletions(-)

diff --git a/arch/loongarch/vdso/vgetcpu.c b/arch/loongarch/vdso/vgetcpu.c
index 5301cd9d0f839eb0fd7b73a1d36e80aaa75d5e76..aefba899873ed035d70766a95b0b6fea881e94df 100644
--- a/arch/loongarch/vdso/vgetcpu.c
+++ b/arch/loongarch/vdso/vgetcpu.c
@@ -4,7 +4,6 @@
  */
 
 #include <asm/vdso.h>
-#include <linux/getcpu.h>
 
 static __always_inline int read_cpu_id(void)
 {
@@ -20,8 +19,8 @@ static __always_inline int read_cpu_id(void)
 }
 
 extern
-int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
-int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
+int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused);
+int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused)
 {
 	int cpu_id;
 
diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c
index 5c5d4a848b7669436e73df8e3b711e5b876eb3db..1e17665616c5fa766ca66c8de276b212528934bd 100644
--- a/arch/s390/kernel/vdso64/getcpu.c
+++ b/arch/s390/kernel/vdso64/getcpu.c
@@ -2,11 +2,10 @@
 /* Copyright IBM Corp. 2020 */
 
 #include <linux/compiler.h>
-#include <linux/getcpu.h>
 #include <asm/timex.h>
 #include "vdso.h"
 
-int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused)
 {
 	union tod_clock clk;
 
diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h
index 9e5397e7b590a23c149ccc6043d0c0b0d5ea8457..cadd307d7a365cabf53f5c8d313be3718625533d 100644
--- a/arch/s390/kernel/vdso64/vdso.h
+++ b/arch/s390/kernel/vdso64/vdso.h
@@ -4,9 +4,7 @@
 
 #include <vdso/datapage.h>
 
-struct getcpu_cache;
-
-int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused);
 int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
 int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
 int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts);
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index e4640306b2e3c95d74d73037ab6b09294b8e1d6c..6381b472b7c52487bccf3cbf0664c3d7a0e59699 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -6,17 +6,16 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/getcpu.h>
 #include <asm/segment.h>
 #include <vdso/processor.h>
 
 notrace long
-__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+__vdso_getcpu(unsigned *cpu, unsigned *node, void *unused)
 {
 	vdso_read_cpunode(cpu, node);
 
 	return 0;
 }
 
-long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+long getcpu(unsigned *cpu, unsigned *node, void *tcache)
 	__attribute__((weak, alias("__vdso_getcpu")));
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index 7000aeb59aa287e2119c3d43ab3eaf82befb59c4..93e0e24e5cb47f7b0056c13f2a7f2304ed4a0595 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -18,9 +18,7 @@ static __always_inline void cpu_relax(void)
 	native_pause();
 }
 
-struct getcpu_cache;
-
-notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, void *unused);
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c
index cbae2584124fd0ff0f9d240c33fefb8d213c84cd..9aa2c62cce6b7a07bbaf8441014d347162d1950d 100644
--- a/arch/x86/um/vdso/um_vdso.c
+++ b/arch/x86/um/vdso/um_vdso.c
@@ -10,14 +10,13 @@
 #define DISABLE_BRANCH_PROFILING
 
 #include <linux/time.h>
-#include <linux/getcpu.h>
 #include <asm/unistd.h>
 
 /* workaround for -Wmissing-prototypes warnings */
 int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts);
 int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
 __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
-long __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
+long __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused);
 
 int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts)
 {
@@ -60,7 +59,7 @@ __kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
 __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));
 
 long
-__vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
+__vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused)
 {
 	/*
 	 * UML does not support SMP, we can cheat here. :)
@@ -74,5 +73,5 @@ __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused
 	return 0;
 }
 
-long getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *tcache)
+long getcpu(unsigned int *cpu, unsigned int *node, void *tcache)
 	__attribute__((weak, alias("__vdso_getcpu")));
diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h
deleted file mode 100644
index c304dcdb4eac2a9117080e6a14f4e3f28d07fd56..0000000000000000000000000000000000000000
--- a/include/linux/getcpu.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_GETCPU_H
-#define _LINUX_GETCPU_H 1
-
-/* Cache for getcpu() to speed it up. Results might be a short time
-   out of date, but will be faster.
-
-   User programs should not refer to the contents of this structure.
-   I repeat they should not refer to it. If they do they will break
-   in future kernels.
-
-   It is only a private cache for vgetcpu(). It will change in future kernels.
-   The user program must store this information per thread (__thread)
-   If you want 100% accurate information pass NULL instead. */
-struct getcpu_cache {
-	unsigned long blob[128 / sizeof(long)];
-};
-
-#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 66c06fcdfe19e27b99eb9a187c22e022e260802f..403488e5eba906ecf40975fc3cb29ed0402491f2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -59,7 +59,6 @@ struct compat_stat;
 struct old_timeval32;
 struct robust_list_head;
 struct futex_waitv;
-struct getcpu_cache;
 struct old_linux_dirent;
 struct perf_event_attr;
 struct file_handle;
@@ -714,7 +713,7 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
 asmlinkage long sys_umask(int mask);
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
-asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
+asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, void __user *cache);
 asmlinkage long sys_gettimeofday(struct __kernel_old_timeval __user *tv,
 				struct timezone __user *tz);
 asmlinkage long sys_settimeofday(struct __kernel_old_timeval __user *tv,
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b58eece4e580b883d19bb1336aff627ae783a4d..f1780ab132a3fbce6aac937ade5b9a35d9837f13 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,7 +31,6 @@
 #include <linux/tty.h>
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
-#include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/seccomp.h>
 #include <linux/cpu.h>
@@ -2876,8 +2875,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	return error;
 }
 
-SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
-		struct getcpu_cache __user *, unused)
+SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, void __user *, unused)
 {
 	int err = 0;
 	int cpu = raw_smp_processor_id();
diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
index cdeaed45fb26c61f6314c58fe1b71fa0be3c0108..994ce569dc37c6689b1a3c79156e3dfc8bf27f22 100644
--- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c
+++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
@@ -16,9 +16,7 @@
 #include "vdso_config.h"
 #include "vdso_call.h"
 
-struct getcpu_cache;
-typedef long (*getcpu_t)(unsigned int *, unsigned int *,
-			 struct getcpu_cache *);
+typedef long (*getcpu_t)(unsigned int *, unsigned int *, void *);
 
 int main(int argc, char **argv)
 {

---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20250825-getcpu_cache-3abcd2e65437

Best regards,
-- 
Thomas Weißschuh <thomas.weissschuh@linutronix.de>


^ permalink raw reply related

* Re: [PATCH v2 2/3] initrd: remove deprecated code path (linuxrc)
From: Askar Safin @ 2025-10-13  9:59 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: linux-fsdevel, linux-kernel, Linus Torvalds, Greg Kroah-Hartman,
	Christian Brauner, Al Viro, Jan Kara, Christoph Hellwig,
	Jens Axboe, Aleksa Sarai, Thomas Weißschuh, Julian Stecklina,
	Gao Xiang, Art Nikpal, Andrew Morton, Alexander Graf, Rob Landley,
	Lennart Poettering, linux-arch, linux-block, initramfs, linux-api,
	linux-doc, Michal Simek, Luis Chamberlain, Kees Cook,
	Thorsten Blum, Heiko Carstens, Arnd Bergmann, Dave Young,
	Christophe Leroy, Krzysztof Kozlowski, Borislav Petkov,
	Jessica Clarke, Nicolas Schichan, David Disseldorp, patches
In-Reply-To: <CAHp75VezkZ7A1VOP8cBH8h0DKVumP66jjUbepMCP87wGOrh+MQ@mail.gmail.com>

On Fri, Oct 10, 2025 at 6:05 PM Andy Shevchenko
<andy.shevchenko@gmail.com> wrote:
> > -       noinitrd        [RAM] Tells the kernel not to load any configured
> > +       noinitrd        [Deprecated,RAM] Tells the kernel not to load any configured
> >                         initial RAM disk.
>
> How one is supposed to run this when just having a kernel is enough?
> At least (ex)colleague of mine was a heavy user of this option for
> testing kernel builds on the real HW.

This option applies to initrd only, not to initramfs.
Except for EFI mode, when it applies to both.

I will remove this option when I remove initrd.

In EFI mode it is easy just not to pass initramfs, so all is okay.

Also I will clarify docs in v3.

Also, please, answer here:
https://lore.kernel.org/regressions/20250918183336.5633-1-safinaskar@gmail.com/

-- 
Askar Safin

^ permalink raw reply

* Re: [PATCH v2 2/3] initrd: remove deprecated code path (linuxrc)
From: Askar Safin @ 2025-10-13 10:29 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: linux-fsdevel, linux-kernel, Linus Torvalds, Greg Kroah-Hartman,
	Christian Brauner, Al Viro, Jan Kara, Christoph Hellwig,
	Jens Axboe, Andy Shevchenko, Aleksa Sarai, Thomas Weißschuh,
	Julian Stecklina, Gao Xiang, Art Nikpal, Andrew Morton,
	Alexander Graf, Rob Landley, Lennart Poettering, linux-arch,
	linux-block, initramfs, linux-api, linux-doc, Michal Simek,
	Luis Chamberlain, Kees Cook, Thorsten Blum, Heiko Carstens,
	Arnd Bergmann, Dave Young, Christophe Leroy, Krzysztof Kozlowski,
	Borislav Petkov, Jessica Clarke, Nicolas Schichan,
	David Disseldorp, patches
In-Reply-To: <07ae142e-4266-44a3-9aa1-4b2acbd72c1b@infradead.org>

On Fri, Oct 10, 2025 at 10:31 PM Randy Dunlap <rdunlap@infradead.org> wrote:
> There are more places in Documentation/ that refer to "linuxrc".
> Should those also be removed or fixed?
>
> accounting/delay-accounting.rst
> admin-guide/initrd.rst
> driver-api/early-userspace/early_userspace_support.rst
> power/swsusp-dmcrypt.rst
> translations/zh_CN/accounting/delay-accounting.rst

Yes, they should be removed.
I made this patchset minimal to be sure it is easy to revert.
I will remove these linuxrc mentions in cleanup patchset.

-- 
Askar Safin

^ permalink raw reply

* Re: [PATCH v2] vdso: Remove struct getcpu_cache
From: Dave Hansen @ 2025-10-13 14:06 UTC (permalink / raw)
  To: Thomas Weißschuh, Huacai Chen, WANG Xuerui, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Richard Weinberger, Anton Ivanov, Johannes Berg,
	Vincenzo Frascino, Shuah Khan
  Cc: loongarch, linux-kernel, linux-s390, linux-um, linux-api,
	linux-kselftest
In-Reply-To: <20251013-getcpu_cache-v2-1-880fbfa3b7cc@linutronix.de>

On 10/13/25 02:20, Thomas Weißschuh wrote:
> -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
> -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
> +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused);
> +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused)
>  {
>  	int cpu_id;

It would ideally be nice to have a _bit_ more history on this about
how it became unused any why there is such high confidence that
userspace never tries to use it.

Let's say someone comes along in a few years and wants to use this
'unused' parameter. Could they?

^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Pratyush Yadav @ 2025-10-13 15:23 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Pratyush Yadav, jasonmiu, graf, changyuanl, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, zhangguopeng, linux, linux-kernel, linux-doc, linux-mm,
	gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, lennart, brauner, linux-api, linux-fsdevel,
	saeedm, ajayachandra, jgg, parav, leonro, witu, hughd, skhawaja,
	chrisl, steven.sistare
In-Reply-To: <CA+CK2bB6F634HCw_N5z9E5r_LpbGJrucuFb_5fL4da5_W99e4Q@mail.gmail.com>

On Thu, Oct 09 2025, Pasha Tatashin wrote:

> On Thu, Oct 9, 2025 at 6:58 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>>
>> On Tue, Oct 07 2025, Pasha Tatashin wrote:
>>
>> > On Sun, Sep 28, 2025 at 9:03 PM Pasha Tatashin
>> > <pasha.tatashin@soleen.com> wrote:
>> >>
>> [...]
>> > 4. New File-Lifecycle-Bound Global State
>> > ----------------------------------------
>> > A new mechanism for managing global state was proposed, designed to be
>> > tied to the lifecycle of the preserved files themselves. This would
>> > allow a file owner (e.g., the IOMMU subsystem) to save and retrieve
>> > global state that is only relevant when one or more of its FDs are
>> > being managed by LUO.
>>
>> Is this going to replace LUO subsystems? If yes, then why? The global
>> state will likely need to have its own lifecycle just like the FDs, and
>> subsystems are a simple and clean abstraction to control that. I get the
>> idea of only "activating" a subsystem when one or more of its FDs are
>> participating in LUO, but we can do that while keeping subsystems
>> around.
>
> Thanks for the feedback. The FLB Global State is not replacing the LUO
> subsystems. On the contrary, it's a higher-level abstraction that is
> itself implemented as a LUO subsystem. The goal is to provide a
> solution for a pattern that emerged during the PCI and IOMMU
> discussions.

Okay, makes sense then. I thought we were removing the subsystems idea.
I didn't follow the PCI and IOMMU discussions that closely.

Side note: I see a dependency between subsystems forming. For example,
the FLB subsystem probably wants to make sure all its dependent
subsystems (like LUO files) go through their callbacks before getting
its callback. Maybe in the current implementation doing it in any order
works, but in general, if it manages data of other subsystems, it should
be serialized after them.

Same with the hugetlb subsystem for example. On prepare or freeze time,
it would probably be a good idea if the files callbacks finish first. I
would imagine most subsystems would want to go after files.

With the current registration mechanism, the order depends on when the
subsystem is registered, which is hard to control. Maybe we should have
a global list of subsystems and can manually specify the order? Not sure
if that is a good idea, just throwing it out there off the top of my
head.

>
> You can see the WIP implementation here, which shows it registering as
> a subsystem named "luo-fh-states-v1-struct":
> https://github.com/soleen/linux/commit/94e191aab6b355d83633718bc4a1d27dda390001
>
> The existing subsystem API is a low-level tool that provides for the
> preservation of a raw 8-byte handle. It doesn't provide locking, nor
> is it explicitly tied to the lifecycle of any higher-level object like
> a file handler. The new API is designed to solve a more specific
> problem: allowing global components (like IOMMU or PCI) to
> automatically track when resources relevant to them are added to or
> removed from preservation. If HugeTLB requires a subsystem, it can
> still use it, but I suspect it might benefit from FLB Global State as
> well.

Hmm, right. Let me see how I can make use of it.

>
>> Here is how I imagine the proposed API would compare against subsystems
>> with hugetlb as an example (hugetlb support is still WIP, so I'm still
>> not clear on specifics, but this is how I imagine it will work):
>>
>> - Hugetlb subsystem needs to track its huge page pools and which pages
>>   are allocated and free. This is its global state. The pools get
>>   reconstructed after kexec. Post-kexec, the free pages are ready for
>>   allocation from other "regular" files and the pages used in LUO files
>>   are reserved.
>>
>> - Pre-kexec, when a hugetlb FD is preserved, it marks that as preserved
>>   in hugetlb's global data structure tracking this. This is runtime data
>>   (say xarray), and _not_ serialized data. Reason being, there are
>>   likely more FDs to come so no point in wasting time serializing just
>>   yet.
>>
>>   This can look something like:
>>
>>   hugetlb_luo_preserve_folio(folio, ...);
>>
>>   Nice and simple.
>>
>>   Compare this with the new proposed API:
>>
>>   liveupdate_fh_global_state_get(h, &hugetlb_data);
>>   // This will have update serialized state now.
>>   hugetlb_luo_preserve_folio(hugetlb_data, folio, ...);
>>   liveupdate_fh_global_state_put(h);
>>
>>   We do the same thing but in a very complicated way.
>>
>> - When the system-wide preserve happens, the hugetlb subsystem gets a
>>   callback to serialize. It converts its runtime global state to
>>   serialized state since now it knows no more FDs will be added.
>>
>>   With the new API, this doesn't need to be done since each FD prepare
>>   already updates serialized state.
>>
>> - If there are no hugetlb FDs, then the hugetlb subsystem doesn't put
>>   anything in LUO. This is same as new API.
>>
>> - If some hugetlb FDs are not restored after liveupdate and the finish
>>   event is triggered, the subsystem gets its finish() handler called and
>>   it can free things up.
>>
>>   I don't get how that would work with the new API.
>
> The new API isn't more complicated; It codifies the common pattern of
> "create on first use, destroy on last use" into a reusable helper,
> saving each file handler from having to reinvent the same reference
> counting and locking scheme. But, as you point out, subsystems provide
> more control, specifically they handle full creation/free instead of
> relying on file-handlers for that.
>
>> My point is, I see subsystems working perfectly fine here and I don't
>> get how the proposed API is any better.
>>
>> Am I missing something?
>
> No, I don't think you are. Your analysis is correct that this is
> achievable with subsystems. The goal of the new API is to make that
> specific, common use case simpler.

Right. Thanks for clarifying.

>
> Pasha

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v2] vdso: Remove struct getcpu_cache
From: H. Peter Anvin @ 2025-10-13 16:14 UTC (permalink / raw)
  To: Dave Hansen, Thomas Weißschuh, Huacai Chen, WANG Xuerui,
	Heiko Carstens, Vasily Gorbik, Alexander Gordeev,
	Christian Borntraeger, Sven Schnelle, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	Richard Weinberger, Anton Ivanov, Johannes Berg,
	Vincenzo Frascino, Shuah Khan
  Cc: loongarch, linux-kernel, linux-s390, linux-um, linux-api,
	linux-kselftest
In-Reply-To: <e95dc212-6fd3-43e3-aeb7-bf55917e0cd4@intel.com>

On October 13, 2025 7:06:55 AM PDT, Dave Hansen <dave.hansen@intel.com> wrote:
>On 10/13/25 02:20, Thomas Weißschuh wrote:
>> -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
>> -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
>> +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused);
>> +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused)
>>  {
>>  	int cpu_id;
>
>It would ideally be nice to have a _bit_ more history on this about
>how it became unused any why there is such high confidence that
>userspace never tries to use it.
>
>Let's say someone comes along in a few years and wants to use this
>'unused' parameter. Could they?

I believe it was a private storage area for the kernel to use... which it doesn't. Not doing anything at all with the pointer is perfectly legitimate.

^ permalink raw reply

* Re: [PATCH v2] vdso: Remove struct getcpu_cache
From: Andy Lutomirski @ 2025-10-13 17:14 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Thomas Weißschuh, Huacai Chen, WANG Xuerui, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Richard Weinberger, Anton Ivanov, Johannes Berg,
	Vincenzo Frascino, Shuah Khan, loongarch, linux-kernel,
	linux-s390, linux-um, linux-api, linux-kselftest
In-Reply-To: <e95dc212-6fd3-43e3-aeb7-bf55917e0cd4@intel.com>

On Mon, Oct 13, 2025 at 7:07 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 10/13/25 02:20, Thomas Weißschuh wrote:
> > -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
> > -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
> > +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused);
> > +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused)
> >  {
> >       int cpu_id;
>
> It would ideally be nice to have a _bit_ more history on this about
> how it became unused any why there is such high confidence that
> userspace never tries to use it.

The theory is that people thought that getcpu was going to be kind of
slow, so userspace would allocate a little cache (IIRC per-thread) and
pass it in, and the vDSO would do, well, something clever to return
the right value.  The something clever was probably based on the idea
that you can't actually tell (in general) if the return value from
getcpu is stale, since you might well get migrated right as the
function returns anyway, so the cache could be something silly like
(jiffies, cpu).

I don't actually remember whether the kernel ever used this.  It's
possible that there are ancient kernels where passing a wild, non-null
pointer would blow up.  But it's certainly safe to pass null, and it's
certainly safe for the kernel to ignore the parameter.

--Andy

>
> Let's say someone comes along in a few years and wants to use this
> 'unused' parameter. Could they?
>

^ permalink raw reply

* Re: [PATCH v2] vdso: Remove struct getcpu_cache
From: H. Peter Anvin @ 2025-10-13 19:44 UTC (permalink / raw)
  To: Andy Lutomirski, Dave Hansen
  Cc: Thomas Weißschuh, Huacai Chen, WANG Xuerui, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Richard Weinberger, Anton Ivanov, Johannes Berg,
	Vincenzo Frascino, Shuah Khan, loongarch, linux-kernel,
	linux-s390, linux-um, linux-api, linux-kselftest
In-Reply-To: <CALCETrV2W3cZEJ2yy7F-F9=e_8HLP84ZWrOJCzUYn_ASb0+M6A@mail.gmail.com>

On 2025-10-13 10:14, Andy Lutomirski wrote:
> 
> I don't actually remember whether the kernel ever used this.  It's
> possible that there are ancient kernels where passing a wild, non-null
> pointer would blow up.  But it's certainly safe to pass null, and it's
> certainly safe for the kernel to ignore the parameter.
> 

One could imagine an architecture which would have to execute an actual system
call wanting to use this, but on x86 it is pointless -- even the LSL trick is
much faster than a system call, and once you account for whatever hassle you
would have to deal with do make the cache make sense (probably having a global
generation number and/or a timestamp to expire it) it well and truly makes no
sense.

	-hpa


^ permalink raw reply

* Re: [PATCH v2] vdso: Remove struct getcpu_cache
From: Andy Lutomirski @ 2025-10-13 20:32 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, Dave Hansen, Thomas Weißschuh, Huacai Chen,
	WANG Xuerui, Heiko Carstens, Vasily Gorbik, Alexander Gordeev,
	Christian Borntraeger, Sven Schnelle, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	Richard Weinberger, Anton Ivanov, Johannes Berg,
	Vincenzo Frascino, Shuah Khan, loongarch, linux-kernel,
	linux-s390, linux-um, linux-api, linux-kselftest
In-Reply-To: <494caf29-8755-4bc6-a2c3-b9d0b3e9b78d@zytor.com>

On Mon, Oct 13, 2025 at 12:45 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> On 2025-10-13 10:14, Andy Lutomirski wrote:
> >
> > I don't actually remember whether the kernel ever used this.  It's
> > possible that there are ancient kernels where passing a wild, non-null
> > pointer would blow up.  But it's certainly safe to pass null, and it's
> > certainly safe for the kernel to ignore the parameter.
> >
>
> One could imagine an architecture which would have to execute an actual system
> call wanting to use this, but on x86 it is pointless -- even the LSL trick is
> much faster than a system call, and once you account for whatever hassle you
> would have to deal with do make the cache make sense (probably having a global
> generation number and/or a timestamp to expire it) it well and truly makes no
> sense.

The global timestamp would just be some field in the vvar area, which
we have plenty of anyway.

But I agree, accelerating getcpu is pointless.  In any case, anything
that historically thought it really really wanted accelerated getcpu
can, and probably does, use rseq these days.

--Andy

^ permalink raw reply

* Re: [PATCH v2] vdso: Remove struct getcpu_cache
From: H. Peter Anvin @ 2025-10-13 21:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Dave Hansen, Thomas Weißschuh, Huacai Chen, WANG Xuerui,
	Heiko Carstens, Vasily Gorbik, Alexander Gordeev,
	Christian Borntraeger, Sven Schnelle, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	Richard Weinberger, Anton Ivanov, Johannes Berg,
	Vincenzo Frascino, Shuah Khan, loongarch, linux-kernel,
	linux-s390, linux-um, linux-api, linux-kselftest
In-Reply-To: <CALCETrVuW_MmksnkprK5Ljm-5RBSS=FUA8R8fgGMhZ3BxW15Sw@mail.gmail.com>

On 2025-10-13 13:32, Andy Lutomirski wrote:
> 
> The global timestamp would just be some field in the vvar area, which
> we have plenty of anyway.
> 
> But I agree, accelerating getcpu is pointless.  In any case, anything
> that historically thought it really really wanted accelerated getcpu
> can, and probably does, use rseq these days.
> 

Indeed. And with RDPID it is fast enough that the bulk of the cost is probably
in the vdso call.

	-hpa


^ permalink raw reply

* Re: [PATCH v2] vdso: Remove struct getcpu_cache
From: Florian Weimer @ 2025-10-14  8:56 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Dave Hansen, Thomas Weißschuh, Huacai Chen, WANG Xuerui,
	Heiko Carstens, Vasily Gorbik, Alexander Gordeev,
	Christian Borntraeger, Sven Schnelle, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Richard Weinberger, Anton Ivanov, Johannes Berg,
	Vincenzo Frascino, Shuah Khan, loongarch, linux-kernel,
	linux-s390, linux-um, linux-api, linux-kselftest
In-Reply-To: <CALCETrV2W3cZEJ2yy7F-F9=e_8HLP84ZWrOJCzUYn_ASb0+M6A@mail.gmail.com>

* Andy Lutomirski:

> The theory is that people thought that getcpu was going to be kind of
> slow, so userspace would allocate a little cache (IIRC per-thread) and
> pass it in, and the vDSO would do, well, something clever to return
> the right value.  The something clever was probably based on the idea
> that you can't actually tell (in general) if the return value from
> getcpu is stale, since you might well get migrated right as the
> function returns anyway, so the cache could be something silly like
> (jiffies, cpu).

It probably had to do something with per-CPU or per-node mappings of the
vDSO.  Or may some non-coherent cache line in the vDSO.  As far as I
understand it, the cache has zero chance of working with the way vDSO
data is currently implemented.

We have the CPU ID and node ID in the restartable sequences area now
(although glibc does not use the node ID yet).  It's not a cache.  So
this clearly supersedes whatever struct getcpu_cache tried to achieve.

Thanks,
Florian


^ permalink raw reply

* Re: [PATCH v3 20/20] mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT)
From: Wei Yang @ 2025-10-14 12:23 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-kernel, linux-doc, cgroups, linux-mm, linux-fsdevel,
	linux-api, Andrew Morton, Matthew Wilcox (Oracle), Tejun Heo,
	Zefan Li, Johannes Weiner, Michal Koutný, Jonathan Corbet,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, Muchun Song, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Jann Horn
In-Reply-To: <20250303163014.1128035-21-david@redhat.com>

On Mon, Mar 03, 2025 at 05:30:13PM +0100, David Hildenbrand wrote:
[...]
>@@ -1678,6 +1726,22 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> 		break;
> 	case RMAP_LEVEL_PMD:
> 	case RMAP_LEVEL_PUD:
>+		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
>+			last = atomic_add_negative(-1, &folio->_entire_mapcount);
>+			if (level == RMAP_LEVEL_PMD && last)
>+				nr_pmdmapped = folio_large_nr_pages(folio);
>+			nr = folio_dec_return_large_mapcount(folio, vma);
>+			if (!nr) {
>+				/* Now completely unmapped. */
>+				nr = folio_large_nr_pages(folio);
>+			} else {
>+				partially_mapped = last &&
>+						   nr < folio_large_nr_pages(folio);

Hi, David

Do you think this is better to be?

	partially_mapped = last && nr < nr_pmdmapped;

As commit 349994cf61e6 mentioned, we don't support partially mapped PUD-sized
folio yet.

Not sure what I missed here.

>+				nr = 0;
>+			}
>+			break;
>+		}
>+
> 		folio_dec_large_mapcount(folio, vma);
> 		last = atomic_add_negative(-1, &folio->_entire_mapcount);
> 		if (last) {
>-- 
>2.48.1
>

-- 
Wei Yang
Help you, Help me

^ permalink raw reply

* Re: [PATCH v3 20/20] mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT)
From: David Hildenbrand @ 2025-10-14 12:59 UTC (permalink / raw)
  To: Wei Yang
  Cc: linux-kernel, linux-doc, cgroups, linux-mm, linux-fsdevel,
	linux-api, Andrew Morton, Matthew Wilcox (Oracle), Tejun Heo,
	Zefan Li, Johannes Weiner, Michal Koutný, Jonathan Corbet,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, Muchun Song, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Jann Horn
In-Reply-To: <20251014122335.dpyk5advbkioojnm@master>

On 14.10.25 14:23, Wei Yang wrote:
> On Mon, Mar 03, 2025 at 05:30:13PM +0100, David Hildenbrand wrote:
> [...]
>> @@ -1678,6 +1726,22 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>> 		break;
>> 	case RMAP_LEVEL_PMD:
>> 	case RMAP_LEVEL_PUD:
>> +		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
>> +			last = atomic_add_negative(-1, &folio->_entire_mapcount);
>> +			if (level == RMAP_LEVEL_PMD && last)
>> +				nr_pmdmapped = folio_large_nr_pages(folio);
>> +			nr = folio_dec_return_large_mapcount(folio, vma);
>> +			if (!nr) {
>> +				/* Now completely unmapped. */
>> +				nr = folio_large_nr_pages(folio);
>> +			} else {
>> +				partially_mapped = last &&
>> +						   nr < folio_large_nr_pages(folio);
> 
> Hi, David

Hi!

> 
> Do you think this is better to be?
> 
> 	partially_mapped = last && nr < nr_pmdmapped;

I see what you mean, it would be similar to the CONFIG_PAGE_MAPCOUNT 
case below.

But probably it could then be

	partially_mapped = nr < nr_pmdmapped;

because nr_pmdmapped is only set when "last = true".

I'm not sure if there is a good reason to change it at this point 
though. Smells like a micro-optimization for PUD, which we probably 
shouldn't worry about.

> 
> As commit 349994cf61e6 mentioned, we don't support partially mapped PUD-sized
> folio yet.

We do support partially mapped PUD-sized folios I think, but not 
anonymous PUD-sized folios.

So consequently the partially_mapped variable will never really be used 
later on, because the folio_test_anon() will never hit in the PUD case.

-- 
Cheers

David / dhildenb


^ permalink raw reply

* Re: [PATCH v4 00/30] Live Update Orchestrator
From: Pratyush Yadav @ 2025-10-14 13:29 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Pasha Tatashin, Pratyush Yadav, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, parav, leonro, witu, hughd,
	skhawaja, chrisl, steven.sistare
In-Reply-To: <20251010150116.GC3901471@nvidia.com>

On Fri, Oct 10 2025, Jason Gunthorpe wrote:

> On Thu, Oct 09, 2025 at 07:50:12PM -0400, Pasha Tatashin wrote:
>> >   This can look something like:
>> >
>> >   hugetlb_luo_preserve_folio(folio, ...);
>> >
>> >   Nice and simple.
>> >
>> >   Compare this with the new proposed API:
>> >
>> >   liveupdate_fh_global_state_get(h, &hugetlb_data);
>> >   // This will have update serialized state now.
>> >   hugetlb_luo_preserve_folio(hugetlb_data, folio, ...);
>> >   liveupdate_fh_global_state_put(h);
>> >
>> >   We do the same thing but in a very complicated way.
>> >
>> > - When the system-wide preserve happens, the hugetlb subsystem gets a
>> >   callback to serialize. It converts its runtime global state to
>> >   serialized state since now it knows no more FDs will be added.
>> >
>> >   With the new API, this doesn't need to be done since each FD prepare
>> >   already updates serialized state.
>> >
>> > - If there are no hugetlb FDs, then the hugetlb subsystem doesn't put
>> >   anything in LUO. This is same as new API.
>> >
>> > - If some hugetlb FDs are not restored after liveupdate and the finish
>> >   event is triggered, the subsystem gets its finish() handler called and
>> >   it can free things up.
>> >
>> >   I don't get how that would work with the new API.
>> 
>> The new API isn't more complicated; It codifies the common pattern of
>> "create on first use, destroy on last use" into a reusable helper,
>> saving each file handler from having to reinvent the same reference
>> counting and locking scheme. But, as you point out, subsystems provide
>> more control, specifically they handle full creation/free instead of
>> relying on file-handlers for that.
>
> I'd say hugetlb *should* be doing the more complicated thing. We
> should not have global static data for luo floating around the kernel,
> this is too easily abused in bad ways.

Not sure how much difference this makes in practice, but I get your
point.

>
> The above "complicated" sequence forces the caller to have a fd
> session handle, and "hides" the global state inside luo so the
> subsystem can't just randomly reach into it whenever it likes.
>
> This is a deliberate and violent way to force clean coding practices
> and good layering.
>
> Not sure why hugetlb pools would need another xarray??

Not sure myself either. I used it to demonstrate my point of having
runtime state and serialized state separate from each other.

>
> 1) Use a vmalloc and store a list of the PFNs in the pool. Pool becomes
>    frozen, can't add/remove PFNs.

Doesn't that circumvent LUO's state machine? The idea with the state
machine was to have clear points in time when the system goes into the
"limited capacity"/"frozen" state, which is the LIVEUPDATE_PREPARE
event. With what you propose, the first FD being preserved implicitly
triggers the prepare event. Same thing for unprepare/cancel operations.

I am wondering if it is better to do it the other way round: prepare all
files first, and then prepare the hugetlb subsystem at
LIVEUPDATE_PREPARE event. At that point it already knows which pages to
mark preserved so the serialization can be done in one go.

> 2) Require the users of hugetlb memory, like memfd, to
>    preserve/restore the folios they are using (using their hugetlb order)
> 3) Just before kexec run over the PFN list and mark a bit if the folio
>    was preserved by KHO or not. Make sure everything gets KHO
>    preserved.

"just before kexec" would need a callback from LUO. I suppose a
subsystem is the place for that callback. I wrote my email under the
(wrong) impression that we were replacing subsystems.

That makes me wonder: how is the subsystem-level callback supposed to
access the global data? I suppose it can use the liveupdate_file_handler
directly, but it is kind of strange since technically the subsystem and
file handler are two different entities.

Also as Pasha mentioned, 1G pages for guest_memfd will use hugetlb, and
I'm not sure how that would map with this shared global data. memfd and
guest_memfd will likely have different liveupdate_file_handler but would
share data from the same subsystem. Maybe that's a problem to solve for
later...

>
> Restore puts the PFNs that were not preserved directly in the free
> pool, the end user of the folio like the memfd restores and eventually
> normally frees the other folios.

Yeah, on the restore side this idea works fine I think.

>
> It is simple and fits nicely into the infrastructure here, where the
> first time you trigger a global state it does the pfn list and
> freezing, and the lifecycle and locking for this operation is directly
> managed by luo.
>
> The memfd, when it knows it has hugetlb folios inside it, would
> trigger this.
>
> Jason

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 20/20] mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT)
From: Wei Yang @ 2025-10-14 13:31 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Wei Yang, linux-kernel, linux-doc, cgroups, linux-mm,
	linux-fsdevel, linux-api, Andrew Morton, Matthew Wilcox (Oracle),
	Tejun Heo, Zefan Li, Johannes Weiner, Michal Koutný,
	Jonathan Corbet, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, Muchun Song, Liam R. Howlett,
	Lorenzo Stoakes, Vlastimil Babka, Jann Horn
In-Reply-To: <71380b43-c23c-42b5-8aab-f158bb37bc75@redhat.com>

On Tue, Oct 14, 2025 at 02:59:30PM +0200, David Hildenbrand wrote:
>On 14.10.25 14:23, Wei Yang wrote:
>> On Mon, Mar 03, 2025 at 05:30:13PM +0100, David Hildenbrand wrote:
>> [...]
>> > @@ -1678,6 +1726,22 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>> > 		break;
>> > 	case RMAP_LEVEL_PMD:
>> > 	case RMAP_LEVEL_PUD:
>> > +		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
>> > +			last = atomic_add_negative(-1, &folio->_entire_mapcount);
>> > +			if (level == RMAP_LEVEL_PMD && last)
>> > +				nr_pmdmapped = folio_large_nr_pages(folio);
>> > +			nr = folio_dec_return_large_mapcount(folio, vma);
>> > +			if (!nr) {
>> > +				/* Now completely unmapped. */
>> > +				nr = folio_large_nr_pages(folio);
>> > +			} else {
>> > +				partially_mapped = last &&
>> > +						   nr < folio_large_nr_pages(folio);
>> 
>> Hi, David
>
>Hi!
>
>> 
>> Do you think this is better to be?
>> 
>> 	partially_mapped = last && nr < nr_pmdmapped;
>
>I see what you mean, it would be similar to the CONFIG_PAGE_MAPCOUNT case
>below.
>
>But probably it could then be
>
>	partially_mapped = nr < nr_pmdmapped;
>
>because nr_pmdmapped is only set when "last = true".
>
>I'm not sure if there is a good reason to change it at this point though.
>Smells like a micro-optimization for PUD, which we probably shouldn't worry
>about.
>
>> 
>> As commit 349994cf61e6 mentioned, we don't support partially mapped PUD-sized
>> folio yet.
>
>We do support partially mapped PUD-sized folios I think, but not anonymous
>PUD-sized folios.
>
>So consequently the partially_mapped variable will never really be used later
>on, because the folio_test_anon() will never hit in the PUD case.
>

Ok, folio_test_anon() takes care of it. We won't add it to defer list by
accident.

>-- 
>Cheers
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me

^ permalink raw reply

* Re: [PATCH v3 20/20] mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT)
From: Matthew Wilcox @ 2025-10-14 14:32 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Wei Yang, linux-kernel, linux-doc, cgroups, linux-mm,
	linux-fsdevel, linux-api, Andrew Morton, Tejun Heo, Zefan Li,
	Johannes Weiner, Michal Koutný, Jonathan Corbet,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, Muchun Song, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Jann Horn
In-Reply-To: <71380b43-c23c-42b5-8aab-f158bb37bc75@redhat.com>

On Tue, Oct 14, 2025 at 02:59:30PM +0200, David Hildenbrand wrote:
> > As commit 349994cf61e6 mentioned, we don't support partially mapped PUD-sized
> > folio yet.
> 
> We do support partially mapped PUD-sized folios I think, but not anonymous
> PUD-sized folios.

I don't think so?  The only mechanism I know of to allocate PUD-sized
chunks of memory is hugetlb, and that doesn't permit partial mappings.

^ permalink raw reply

* Re: [PATCH v3 20/20] mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT)
From: David Hildenbrand @ 2025-10-14 14:38 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Wei Yang, linux-kernel, linux-doc, cgroups, linux-mm,
	linux-fsdevel, linux-api, Andrew Morton, Tejun Heo, Zefan Li,
	Johannes Weiner, Michal Koutný, Jonathan Corbet,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, Muchun Song, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Jann Horn
In-Reply-To: <aO5fCT62gZZw9-wQ@casper.infradead.org>

On 14.10.25 16:32, Matthew Wilcox wrote:
> On Tue, Oct 14, 2025 at 02:59:30PM +0200, David Hildenbrand wrote:
>>> As commit 349994cf61e6 mentioned, we don't support partially mapped PUD-sized
>>> folio yet.
>>
>> We do support partially mapped PUD-sized folios I think, but not anonymous
>> PUD-sized folios.
> 
> I don't think so?  The only mechanism I know of to allocate PUD-sized
> chunks of memory is hugetlb, and that doesn't permit partial mappings.

Greetings from the latest DAX rework :)

-- 
Cheers

David / dhildenb


^ permalink raw reply

* Re: [PATCH v3 20/20] mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT)
From: Wei Yang @ 2025-10-15  0:45 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Matthew Wilcox, Wei Yang, linux-kernel, linux-doc, cgroups,
	linux-mm, linux-fsdevel, linux-api, Andrew Morton, Tejun Heo,
	Zefan Li, Johannes Weiner, Michal Koutný, Jonathan Corbet,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, Muchun Song, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Jann Horn
In-Reply-To: <f9d19f72-58f7-4694-ae18-1d944238a3e7@redhat.com>

On Tue, Oct 14, 2025 at 04:38:38PM +0200, David Hildenbrand wrote:
>On 14.10.25 16:32, Matthew Wilcox wrote:
>> On Tue, Oct 14, 2025 at 02:59:30PM +0200, David Hildenbrand wrote:
>> > > As commit 349994cf61e6 mentioned, we don't support partially mapped PUD-sized
>> > > folio yet.
>> > 
>> > We do support partially mapped PUD-sized folios I think, but not anonymous
>> > PUD-sized folios.
>> 
>> I don't think so?  The only mechanism I know of to allocate PUD-sized
>> chunks of memory is hugetlb, and that doesn't permit partial mappings.
>
>Greetings from the latest DAX rework :)

After a re-think, do you think it's better to align the behavior between
CONFIG_NO_PAGE_MAPCOUNT and CONFIG_PAGE_MAPCOUNT?

It looks we treat a PUD-sized folio partially_mapped if CONFIG_NO_PAGE_MAPCOUNT,
but !partially_mapped if CONFIG_PAGE_MAPCOUNT, if my understanding is correct.

>
>-- 
>Cheers
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me

^ permalink raw reply

* [PATCH v22 0/8] fork: Support shadow stacks in clone3()
From: Mark Brown @ 2025-10-15 12:49 UTC (permalink / raw)
  To: Rick P. Edgecombe, Deepak Gupta, H.J. Lu, Florian Weimer,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Christian Brauner, Shuah Khan
  Cc: linux-kernel, Catalin Marinas, Will Deacon, jannh, bsegall,
	Andrew Morton, Yury Khrustalev, H.J. Lu, Adhemerval Zanella Netto,
	Wilco Dijkstra, CarlosO'Donell, Florian Weimer, Rich Felker,
	linux-kselftest, linux-api, Mark Brown, Kees Cook, Kees Cook,
	Shuah Khan

At this point I think everyone in the on the kernel side is happy with
this but there were some questions from the glibc side about the value
of controlling the shadow stack placement and size, especially with the
current inability to reuse the shadow stack for an exited thread.  With
support for reuse it would be possible to have a cache of shadow stacks
as is currently supported for the normal stack.

Since the discussion petered out I'm resending this in order to give
people something work with while prototyping.  It should be possible to
prototype any potential kernel features to help build out shadow stack
support in userspace by enabling shadow stack writes, as suggested by
Rick Edgecombe this may end up being required anyway for supporting more
exotic scenarios.  On all current architectures with the feature writes
to shadow stack require specific instructions so there are still
security benefits even with writes enabled.

I did send a change implementing a feature writing a token on thread
exit to allow reuse:

   https://lore.kernel.org/r/20250921-arm64-gcs-exit-token-v1-0-45cf64e648d5@kernel.org

but wasn't planning to refresh it without some indication from the
userspace side that that'd be useful.

Non-process cover letter:

The kernel has added support for shadow stacks, currently x86 only using
their CET feature but both arm64 and RISC-V have equivalent features
(GCS and Zicfiss respectively), I am actively working on GCS[1].  With
shadow stacks the hardware maintains an additional stack containing only
the return addresses for branch instructions which is not generally
writeable by userspace and ensures that any returns are to the recorded
addresses.  This provides some protection against ROP attacks and making
it easier to collect call stacks.  These shadow stacks are allocated in
the address space of the userspace process.

Our API for shadow stacks does not currently offer userspace any
flexiblity for managing the allocation of shadow stacks for newly
created threads, instead the kernel allocates a new shadow stack with
the same size as the normal stack whenever a thread is created with the
feature enabled.  The stacks allocated in this way are freed by the
kernel when the thread exits or shadow stacks are disabled for the
thread.  This lack of flexibility and control isn't ideal, in the vast
majority of cases the shadow stack will be over allocated and the
implicit allocation and deallocation is not consistent with other
interfaces.  As far as I can tell the interface is done in this manner
mainly because the shadow stack patches were in development since before
clone3() was implemented.

Since clone3() is readily extensible let's add support for specifying a
shadow stack when creating a new thread or process, keeping the current
implicit allocation behaviour if one is not specified either with
clone3() or through the use of clone().  The user must provide a shadow
stack pointer, this must point to memory mapped for use as a shadow
stackby map_shadow_stack() with an architecture specified shadow stack
token at the top of the stack.

Yuri Khrustalev has raised questions from the libc side regarding
discoverability of extended clone3() structure sizes[2], this seems like
a general issue with clone3().  There was a suggestion to add a hwcap on
arm64 which isn't ideal but is doable there, though architecture
specific mechanisms would also be needed for x86 (and RISC-V if it's
support gets merged before this does).  The idea has, however, had
strong pushback from the architecture maintainers and it is possible to
detect support for this in clone3() by attempting a call with a
misaligned shadow stack pointer specified so no hwcap has been added.

[1] https://lore.kernel.org/linux-arm-kernel/20241001-arm64-gcs-v13-0-222b78d87eee@kernel.org/T/#mc58f97f27461749ccf400ebabf6f9f937116a86b
[2] https://lore.kernel.org/r/aCs65ccRQtJBnZ_5@arm.com

Signed-off-by: Mark Brown <broonie@kernel.org>
---
Changes in v22:
- Rebase onto v6.18-rc1.
- Cover letter updates.
- Link to v21: https://lore.kernel.org/r/20250916-clone3-shadow-stack-v21-0-910493527013@kernel.org

Changes in v21:
- Rebase onto https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git kernel-6.18.clone3
- Rename shadow_stack_token to shstk_token, since it's a simple rename I've
  kept the acks and reviews but I dropped the tested-bys just to be safe.
- Link to v20: https://lore.kernel.org/r/20250902-clone3-shadow-stack-v20-0-4d9fff1c53e7@kernel.org

Changes in v20:
- Comment fixes and clarifications in x86 arch_shstk_validate_clone()
  from Rick Edgecombe.
- Spelling fix in documentation.
- Link to v19: https://lore.kernel.org/r/20250819-clone3-shadow-stack-v19-0-bc957075479b@kernel.org

Changes in v19:
- Rebase onto v6.17-rc1.
- Link to v18: https://lore.kernel.org/r/20250702-clone3-shadow-stack-v18-0-7965d2b694db@kernel.org

Changes in v18:
- Rebase onto v6.16-rc3.
- Thanks to pointers from Yuri Khrustalev this version has been tested
  on x86 so I have removed the RFT tag.
- Clarify clone3_shadow_stack_valid() comment about the Kconfig check.
- Remove redundant GCSB DSYNCs in arm64 code.
- Fix token validation on x86.
- Link to v17: https://lore.kernel.org/r/20250609-clone3-shadow-stack-v17-0-8840ed97ff6f@kernel.org

Changes in v17:
- Rebase onto v6.16-rc1.
- Link to v16: https://lore.kernel.org/r/20250416-clone3-shadow-stack-v16-0-2ffc9ca3917b@kernel.org

Changes in v16:
- Rebase onto v6.15-rc2.
- Roll in fixes from x86 testing from Rick Edgecombe.
- Rework so that the argument is shadow_stack_token.
- Link to v15: https://lore.kernel.org/r/20250408-clone3-shadow-stack-v15-0-3fa245c6e3be@kernel.org

Changes in v15:
- Rebase onto v6.15-rc1.
- Link to v14: https://lore.kernel.org/r/20250206-clone3-shadow-stack-v14-0-805b53af73b9@kernel.org

Changes in v14:
- Rebase onto v6.14-rc1.
- Link to v13: https://lore.kernel.org/r/20241203-clone3-shadow-stack-v13-0-93b89a81a5ed@kernel.org

Changes in v13:
- Rebase onto v6.13-rc1.
- Link to v12: https://lore.kernel.org/r/20241031-clone3-shadow-stack-v12-0-7183eb8bee17@kernel.org

Changes in v12:
- Add the regular prctl() to the userspace API document since arm64
  support is queued in -next.
- Link to v11: https://lore.kernel.org/r/20241005-clone3-shadow-stack-v11-0-2a6a2bd6d651@kernel.org

Changes in v11:
- Rebase onto arm64 for-next/gcs, which is based on v6.12-rc1, and
  integrate arm64 support.
- Rework the interface to specify a shadow stack pointer rather than a
  base and size like we do for the regular stack.
- Link to v10: https://lore.kernel.org/r/20240821-clone3-shadow-stack-v10-0-06e8797b9445@kernel.org

Changes in v10:
- Integrate fixes & improvements for the x86 implementation from Rick
  Edgecombe.
- Require that the shadow stack be VM_WRITE.
- Require that the shadow stack base and size be sizeof(void *) aligned.
- Clean up trailing newline.
- Link to v9: https://lore.kernel.org/r/20240819-clone3-shadow-stack-v9-0-962d74f99464@kernel.org

Changes in v9:
- Pull token validation earlier and report problems with an error return
  to parent rather than signal delivery to the child.
- Verify that the top of the supplied shadow stack is VM_SHADOW_STACK.
- Rework token validation to only do the page mapping once.
- Drop no longer needed support for testing for signals in selftest.
- Fix typo in comments.
- Link to v8: https://lore.kernel.org/r/20240808-clone3-shadow-stack-v8-0-0acf37caf14c@kernel.org

Changes in v8:
- Fix token verification with user specified shadow stack.
- Don't track user managed shadow stacks for child processes.
- Link to v7: https://lore.kernel.org/r/20240731-clone3-shadow-stack-v7-0-a9532eebfb1d@kernel.org

Changes in v7:
- Rebase onto v6.11-rc1.
- Typo fixes.
- Link to v6: https://lore.kernel.org/r/20240623-clone3-shadow-stack-v6-0-9ee7783b1fb9@kernel.org

Changes in v6:
- Rebase onto v6.10-rc3.
- Ensure we don't try to free the parent shadow stack in error paths of
  x86 arch code.
- Spelling fixes in userspace API document.
- Additional cleanups and improvements to the clone3() tests to support
  the shadow stack tests.
- Link to v5: https://lore.kernel.org/r/20240203-clone3-shadow-stack-v5-0-322c69598e4b@kernel.org

Changes in v5:
- Rebase onto v6.8-rc2.
- Rework ABI to have the user allocate the shadow stack memory with
  map_shadow_stack() and a token.
- Force inlining of the x86 shadow stack enablement.
- Move shadow stack enablement out into a shared header for reuse by
  other tests.
- Link to v4: https://lore.kernel.org/r/20231128-clone3-shadow-stack-v4-0-8b28ffe4f676@kernel.org

Changes in v4:
- Formatting changes.
- Use a define for minimum shadow stack size and move some basic
  validation to fork.c.
- Link to v3: https://lore.kernel.org/r/20231120-clone3-shadow-stack-v3-0-a7b8ed3e2acc@kernel.org

Changes in v3:
- Rebase onto v6.7-rc2.
- Remove stale shadow_stack in internal kargs.
- If a shadow stack is specified unconditionally use it regardless of
  CLONE_ parameters.
- Force enable shadow stacks in the selftest.
- Update changelogs for RISC-V feature rename.
- Link to v2: https://lore.kernel.org/r/20231114-clone3-shadow-stack-v2-0-b613f8681155@kernel.org

Changes in v2:
- Rebase onto v6.7-rc1.
- Remove ability to provide preallocated shadow stack, just specify the
  desired size.
- Link to v1: https://lore.kernel.org/r/20231023-clone3-shadow-stack-v1-0-d867d0b5d4d0@kernel.org

---
Mark Brown (8):
      arm64/gcs: Return a success value from gcs_alloc_thread_stack()
      Documentation: userspace-api: Add shadow stack API documentation
      selftests: Provide helper header for shadow stack testing
      fork: Add shadow stack support to clone3()
      selftests/clone3: Remove redundant flushes of output streams
      selftests/clone3: Factor more of main loop into test_clone3()
      selftests/clone3: Allow tests to flag if -E2BIG is a valid error code
      selftests/clone3: Test shadow stack support

 Documentation/userspace-api/index.rst             |   1 +
 Documentation/userspace-api/shadow_stack.rst      |  44 +++++
 arch/arm64/include/asm/gcs.h                      |   8 +-
 arch/arm64/kernel/process.c                       |   8 +-
 arch/arm64/mm/gcs.c                               |  55 +++++-
 arch/x86/include/asm/shstk.h                      |  11 +-
 arch/x86/kernel/process.c                         |   2 +-
 arch/x86/kernel/shstk.c                           |  53 ++++-
 include/asm-generic/cacheflush.h                  |  11 ++
 include/linux/sched/task.h                        |  17 ++
 include/uapi/linux/sched.h                        |   9 +-
 kernel/fork.c                                     |  93 +++++++--
 tools/testing/selftests/clone3/clone3.c           | 226 ++++++++++++++++++----
 tools/testing/selftests/clone3/clone3_selftests.h |  65 ++++++-
 tools/testing/selftests/ksft_shstk.h              |  98 ++++++++++
 15 files changed, 620 insertions(+), 81 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20231019-clone3-shadow-stack-15d40d2bf536

Best regards,
--  
Mark Brown <broonie@kernel.org>


^ permalink raw reply

* [PATCH v22 1/8] arm64/gcs: Return a success value from gcs_alloc_thread_stack()
From: Mark Brown @ 2025-10-15 12:49 UTC (permalink / raw)
  To: Rick P. Edgecombe, Deepak Gupta, H.J. Lu, Florian Weimer,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Christian Brauner, Shuah Khan
  Cc: linux-kernel, Catalin Marinas, Will Deacon, jannh, bsegall,
	Andrew Morton, Yury Khrustalev, H.J. Lu, Adhemerval Zanella Netto,
	Wilco Dijkstra, CarlosO'Donell, Florian Weimer, Rich Felker,
	linux-kselftest, linux-api, Mark Brown, Kees Cook
In-Reply-To: <20251015-clone3-shadow-stack-v22-0-a8c8da011427@kernel.org>

Currently as a result of templating from x86 code gcs_alloc_thread_stack()
returns a pointer as an unsigned int however on arm64 we don't actually use
this pointer value as anything other than a pass/fail flag. Simplify the
interface to just return an int with 0 on success and a negative error code
on failure.

Acked-by: Deepak Gupta <debug@rivosinc.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/arm64/include/asm/gcs.h | 8 ++++----
 arch/arm64/kernel/process.c  | 8 ++++----
 arch/arm64/mm/gcs.c          | 8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index 8fa0707069e8..534ea5ae9281 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -64,8 +64,8 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task)
 void gcs_set_el0_mode(struct task_struct *task);
 void gcs_free(struct task_struct *task);
 void gcs_preserve_current_state(void);
-unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
-				     const struct kernel_clone_args *args);
+int gcs_alloc_thread_stack(struct task_struct *tsk,
+			   const struct kernel_clone_args *args);
 
 static inline int gcs_check_locked(struct task_struct *task,
 				   unsigned long new_val)
@@ -171,8 +171,8 @@ static inline void put_user_gcs(unsigned long val, unsigned long __user *addr,
 				int *err) { }
 static inline void push_user_gcs(unsigned long val, int *err) { }
 
-static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
-						   const struct kernel_clone_args *args)
+static inline int gcs_alloc_thread_stack(struct task_struct *tsk,
+					 const struct kernel_clone_args *args)
 {
 	return -ENOTSUPP;
 }
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index fba7ca102a8c..4dadc70df16b 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -299,7 +299,7 @@ static void flush_gcs(void)
 static int copy_thread_gcs(struct task_struct *p,
 			   const struct kernel_clone_args *args)
 {
-	unsigned long gcs;
+	int ret;
 
 	if (!system_supports_gcs())
 		return 0;
@@ -310,9 +310,9 @@ static int copy_thread_gcs(struct task_struct *p,
 	p->thread.gcs_el0_mode = current->thread.gcs_el0_mode;
 	p->thread.gcs_el0_locked = current->thread.gcs_el0_locked;
 
-	gcs = gcs_alloc_thread_stack(p, args);
-	if (IS_ERR_VALUE(gcs))
-		return PTR_ERR((void *)gcs);
+	ret = gcs_alloc_thread_stack(p, args);
+	if (ret != 0)
+		return ret;
 
 	return 0;
 }
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 6e93f78de79b..3abcbf9adb5c 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -38,8 +38,8 @@ static unsigned long gcs_size(unsigned long size)
 	return max(PAGE_SIZE, size);
 }
 
-unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
-				     const struct kernel_clone_args *args)
+int gcs_alloc_thread_stack(struct task_struct *tsk,
+			   const struct kernel_clone_args *args)
 {
 	unsigned long addr, size;
 
@@ -59,13 +59,13 @@ unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
 	size = gcs_size(size);
 	addr = alloc_gcs(0, size);
 	if (IS_ERR_VALUE(addr))
-		return addr;
+		return PTR_ERR((void *)addr);
 
 	tsk->thread.gcs_base = addr;
 	tsk->thread.gcs_size = size;
 	tsk->thread.gcspr_el0 = addr + size - sizeof(u64);
 
-	return addr;
+	return 0;
 }
 
 SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)

-- 
2.47.2


^ permalink raw reply related

* [PATCH v22 2/8] Documentation: userspace-api: Add shadow stack API documentation
From: Mark Brown @ 2025-10-15 12:49 UTC (permalink / raw)
  To: Rick P. Edgecombe, Deepak Gupta, H.J. Lu, Florian Weimer,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Christian Brauner, Shuah Khan
  Cc: linux-kernel, Catalin Marinas, Will Deacon, jannh, bsegall,
	Andrew Morton, Yury Khrustalev, H.J. Lu, Adhemerval Zanella Netto,
	Wilco Dijkstra, CarlosO'Donell, Florian Weimer, Rich Felker,
	linux-kselftest, linux-api, Mark Brown, Kees Cook, Kees Cook,
	Shuah Khan
In-Reply-To: <20251015-clone3-shadow-stack-v22-0-a8c8da011427@kernel.org>

There are a number of architectures with shadow stack features which we are
presenting to userspace with as consistent an API as we can (though there
are some architecture specifics). Especially given that there are some
important considerations for userspace code interacting directly with the
feature let's provide some documentation covering the common aspects.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Tested-by: Kees Cook <kees@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Yury Khrustalev <yury.khrustalev@arm.com>
Reviewed-by: Deepak Gupta <debug@rivosinc.com>
Tested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/userspace-api/index.rst        |  1 +
 Documentation/userspace-api/shadow_stack.rst | 44 ++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index b8c73be4fb11..0167e59b541e 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -62,6 +62,7 @@ Everything else
 
    ELF
    netlink/index
+   shadow_stack
    sysfs-platform_profile
    vduse
    futex2
diff --git a/Documentation/userspace-api/shadow_stack.rst b/Documentation/userspace-api/shadow_stack.rst
new file mode 100644
index 000000000000..42617d0470ba
--- /dev/null
+++ b/Documentation/userspace-api/shadow_stack.rst
@@ -0,0 +1,44 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Shadow Stacks
+=============
+
+Introduction
+============
+
+Several architectures have features which provide backward edge
+control flow protection through a hardware maintained stack, only
+writable by userspace through very limited operations.  This feature
+is referred to as shadow stacks on Linux, on x86 it is part of Intel
+Control Enforcement Technology (CET), on arm64 it is Guarded Control
+Stacks feature (FEAT_GCS) and for RISC-V it is the Zicfiss extension.
+It is expected that this feature will normally be managed by the
+system dynamic linker and libc in ways broadly transparent to
+application code, this document covers interfaces and considerations.
+
+
+Enabling
+========
+
+Shadow stacks default to disabled when a userspace process is
+executed, they can be enabled for the current thread with a syscall:
+
+ - For x86 the ARCH_SHSTK_ENABLE arch_prctl()
+ - For other architectures the PR_SET_SHADOW_STACK_ENABLE prctl()
+
+It is expected that this will normally be done by the dynamic linker.
+Any new threads created by a thread with shadow stacks enabled will
+themselves have shadow stacks enabled.
+
+
+Enablement considerations
+=========================
+
+- Returning from the function that enables shadow stacks without first
+  disabling them will cause a shadow stack exception.  This includes
+  any syscall wrapper or other library functions, the syscall will need
+  to be inlined.
+- A lock feature allows userspace to prevent disabling of shadow stacks.
+- Those that change the stack context like longjmp() or use of ucontext
+  changes on signal return will need support from libc.

-- 
2.47.2


^ permalink raw reply related

* [PATCH v22 3/8] selftests: Provide helper header for shadow stack testing
From: Mark Brown @ 2025-10-15 12:49 UTC (permalink / raw)
  To: Rick P. Edgecombe, Deepak Gupta, H.J. Lu, Florian Weimer,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Christian Brauner, Shuah Khan
  Cc: linux-kernel, Catalin Marinas, Will Deacon, jannh, bsegall,
	Andrew Morton, Yury Khrustalev, H.J. Lu, Adhemerval Zanella Netto,
	Wilco Dijkstra, CarlosO'Donell, Florian Weimer, Rich Felker,
	linux-kselftest, linux-api, Mark Brown, Kees Cook, Kees Cook,
	Shuah Khan
In-Reply-To: <20251015-clone3-shadow-stack-v22-0-a8c8da011427@kernel.org>

While almost all users of shadow stacks should be relying on the dynamic
linker and libc to enable the feature there are several low level test
programs where it is useful to enable without any libc support, allowing
testing without full system enablement. This low level testing is helpful
during bringup of the support itself, and also in enabling coverage by
automated testing without needing all system components in the target root
filesystems to have enablement.

Provide a header with helpers for this purpose, intended for use only by
test programs directly exercising shadow stack interfaces.

Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Tested-by: Kees Cook <kees@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 tools/testing/selftests/ksft_shstk.h | 98 ++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/tools/testing/selftests/ksft_shstk.h b/tools/testing/selftests/ksft_shstk.h
new file mode 100644
index 000000000000..fecf91218ea5
--- /dev/null
+++ b/tools/testing/selftests/ksft_shstk.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Helpers for shadow stack enablement, this is intended to only be
+ * used by low level test programs directly exercising interfaces for
+ * working with shadow stacks.
+ *
+ * Copyright (C) 2024 ARM Ltd.
+ */
+
+#ifndef __KSFT_SHSTK_H
+#define __KSFT_SHSTK_H
+
+#include <asm/mman.h>
+
+/* This is currently only defined for x86 */
+#ifndef SHADOW_STACK_SET_TOKEN
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0)
+#endif
+
+static bool shadow_stack_enabled;
+
+#ifdef __x86_64__
+#define ARCH_SHSTK_ENABLE	0x5001
+#define ARCH_SHSTK_SHSTK	(1ULL <<  0)
+
+#define ARCH_PRCTL(arg1, arg2)					\
+({								\
+	long _ret;						\
+	register long _num  asm("eax") = __NR_arch_prctl;	\
+	register long _arg1 asm("rdi") = (long)(arg1);		\
+	register long _arg2 asm("rsi") = (long)(arg2);		\
+								\
+	asm volatile (						\
+		"syscall\n"					\
+		: "=a"(_ret)					\
+		: "r"(_arg1), "r"(_arg2),			\
+		  "0"(_num)					\
+		: "rcx", "r11", "memory", "cc"			\
+	);							\
+	_ret;							\
+})
+
+#define ENABLE_SHADOW_STACK
+static __always_inline void enable_shadow_stack(void)
+{
+	int ret = ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
+	if (ret == 0)
+		shadow_stack_enabled = true;
+}
+
+#endif
+
+#ifdef __aarch64__
+#define PR_SET_SHADOW_STACK_STATUS      75
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+
+#define my_syscall2(num, arg1, arg2)                                          \
+({                                                                            \
+	register long _num  __asm__ ("x8") = (num);                           \
+	register long _arg1 __asm__ ("x0") = (long)(arg1);                    \
+	register long _arg2 __asm__ ("x1") = (long)(arg2);                    \
+	register long _arg3 __asm__ ("x2") = 0;                               \
+	register long _arg4 __asm__ ("x3") = 0;                               \
+	register long _arg5 __asm__ ("x4") = 0;                               \
+									      \
+	__asm__  volatile (                                                   \
+		"svc #0\n"                                                    \
+		: "=r"(_arg1)                                                 \
+		: "r"(_arg1), "r"(_arg2),                                     \
+		  "r"(_arg3), "r"(_arg4),                                     \
+		  "r"(_arg5), "r"(_num)					      \
+		: "memory", "cc"                                              \
+	);                                                                    \
+	_arg1;                                                                \
+})
+
+#define ENABLE_SHADOW_STACK
+static __always_inline void enable_shadow_stack(void)
+{
+	int ret;
+
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+			  PR_SHADOW_STACK_ENABLE);
+	if (ret == 0)
+		shadow_stack_enabled = true;
+}
+
+#endif
+
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 453
+#endif
+
+#ifndef ENABLE_SHADOW_STACK
+static inline void enable_shadow_stack(void) { }
+#endif
+
+#endif

-- 
2.47.2


^ permalink raw reply related

* [PATCH v22 4/8] fork: Add shadow stack support to clone3()
From: Mark Brown @ 2025-10-15 12:49 UTC (permalink / raw)
  To: Rick P. Edgecombe, Deepak Gupta, H.J. Lu, Florian Weimer,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Christian Brauner, Shuah Khan
  Cc: linux-kernel, Catalin Marinas, Will Deacon, jannh, bsegall,
	Andrew Morton, Yury Khrustalev, H.J. Lu, Adhemerval Zanella Netto,
	Wilco Dijkstra, CarlosO'Donell, Florian Weimer, Rich Felker,
	linux-kselftest, linux-api, Mark Brown, Kees Cook
In-Reply-To: <20251015-clone3-shadow-stack-v22-0-a8c8da011427@kernel.org>

Unlike with the normal stack there is no API for configuring the shadow
stack for a new thread, instead the kernel will dynamically allocate a
new shadow stack with the same size as the normal stack. This appears to
be due to the shadow stack series having been in development since
before the more extensible clone3() was added rather than anything more
deliberate.

Add a parameter to clone3() specifying a shadow stack pointer to use
for the new thread, this is inconsistent with the way we specify the
normal stack but during review concerns were expressed about having to
identify where the shadow stack pointer should be placed especially in
cases where the shadow stack has been previously active.  If no shadow
stack is specified then the existing implicit allocation behaviour is
maintained.

If a shadow stack pointer is specified then it is required to have an
architecture defined token placed on the stack, this will be consumed by
the new task, the shadow stack is specified by pointing to this token.  If
no valid token is present then this will be reported with -EINVAL.  This
token prevents new threads being created pointing at the shadow stack of
an existing running thread.  On architectures with support for userspace
pivoting of shadow stacks it is expected that the same format and placement
of tokens will be used, this is the case for arm64 and x86.

If the architecture does not support shadow stacks the shadow stack
pointer must be not be specified, architectures that do support the
feature are expected to enforce the same requirement on individual
systems that lack shadow stack support.

Update the existing arm64 and x86 implementations to pay attention to
the newly added arguments, in order to maintain compatibility we use the
existing behaviour if no shadow stack is specified. Since we are now
using more fields from the kernel_clone_args we pass that into the
shadow stack code rather than individual fields.

Portions of the x86 architecture code were written by Rick Edgecombe.

Acked-by: Yury Khrustalev <yury.khrustalev@arm.com>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/arm64/mm/gcs.c              | 47 +++++++++++++++++++-
 arch/x86/include/asm/shstk.h     | 11 +++--
 arch/x86/kernel/process.c        |  2 +-
 arch/x86/kernel/shstk.c          | 53 ++++++++++++++++++++---
 include/asm-generic/cacheflush.h | 11 +++++
 include/linux/sched/task.h       | 17 ++++++++
 include/uapi/linux/sched.h       |  9 ++--
 kernel/fork.c                    | 93 ++++++++++++++++++++++++++++++++++------
 8 files changed, 217 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 3abcbf9adb5c..fd1d5a6655de 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -43,8 +43,23 @@ int gcs_alloc_thread_stack(struct task_struct *tsk,
 {
 	unsigned long addr, size;
 
-	if (!system_supports_gcs())
+	if (!system_supports_gcs()) {
+		if (args->shstk_token)
+			return -EINVAL;
+
 		return 0;
+	}
+
+	/*
+	 * If the user specified a GCS then use it, otherwise fall
+	 * back to a default allocation strategy. Validation is done
+	 * in arch_shstk_validate_clone().
+	 */
+	if (args->shstk_token) {
+		tsk->thread.gcs_base = 0;
+		tsk->thread.gcs_size = 0;
+		return 0;
+	}
 
 	if (!task_gcs_el0_enabled(tsk))
 		return 0;
@@ -68,6 +83,36 @@ int gcs_alloc_thread_stack(struct task_struct *tsk,
 	return 0;
 }
 
+static bool gcs_consume_token(struct vm_area_struct *vma, struct page *page,
+			      unsigned long user_addr)
+{
+	u64 expected = GCS_CAP(user_addr);
+	u64 *token = page_address(page) + offset_in_page(user_addr);
+
+	if (!cmpxchg_to_user_page(vma, page, user_addr, token, expected, 0))
+		return false;
+	set_page_dirty_lock(page);
+
+	return true;
+}
+
+int arch_shstk_validate_clone(struct task_struct *tsk,
+			      struct vm_area_struct *vma,
+			      struct page *page,
+			      struct kernel_clone_args *args)
+{
+	unsigned long gcspr_el0;
+	int ret = 0;
+
+	gcspr_el0 = args->shstk_token;
+	if (!gcs_consume_token(vma, page, gcspr_el0))
+		return -EINVAL;
+
+	tsk->thread.gcspr_el0 = gcspr_el0 + sizeof(u64);
+
+	return ret;
+}
+
 SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
 {
 	unsigned long alloc_size;
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index fc7dcec58fd4..42a5f03a51e2 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -6,6 +6,7 @@
 #include <linux/types.h>
 
 struct task_struct;
+struct kernel_clone_args;
 struct ksignal;
 
 #ifdef CONFIG_X86_USER_SHADOW_STACK
@@ -16,8 +17,8 @@ struct thread_shstk {
 
 long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
 void reset_thread_features(void);
-unsigned long shstk_alloc_thread_stack(struct task_struct *p, u64 clone_flags,
-				       unsigned long stack_size);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+				       const struct kernel_clone_args *args);
 void shstk_free(struct task_struct *p);
 int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
@@ -30,8 +31,10 @@ static inline long shstk_prctl(struct task_struct *task, int option,
 			       unsigned long arg2) { return -EINVAL; }
 static inline void reset_thread_features(void) {}
 static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
-						     u64 clone_flags,
-						     unsigned long stack_size) { return 0; }
+						     const struct kernel_clone_args *args)
+{
+	return 0;
+}
 static inline void shstk_free(struct task_struct *p) {}
 static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4c718f8adc59..449da29a9f92 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -219,7 +219,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	 * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
 	 * update it.
 	 */
-	new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+	new_ssp = shstk_alloc_thread_stack(p, args);
 	if (IS_ERR_VALUE(new_ssp))
 		return PTR_ERR((void *)new_ssp);
 
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 978232b6d48d..d66c866274d0 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -191,18 +191,61 @@ void reset_thread_features(void)
 	current->thread.features_locked = 0;
 }
 
-unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
-				       unsigned long stack_size)
+int arch_shstk_validate_clone(struct task_struct *t,
+			      struct vm_area_struct *vma,
+			      struct page *page,
+			      struct kernel_clone_args *args)
+{
+	void *maddr = page_address(page);
+	unsigned long token;
+	int offset;
+	u64 expected;
+
+	/*
+	 * kernel_clone_args() verification assures token address is 8
+	 * byte aligned.
+	 */
+	token = args->shstk_token;
+	expected = (token + SS_FRAME_SIZE) | BIT(0);
+	offset = offset_in_page(token);
+
+	if (!cmpxchg_to_user_page(vma, page, token, (unsigned long *)(maddr + offset),
+				  expected, 0))
+		return -EINVAL;
+	set_page_dirty_lock(page);
+
+	return 0;
+}
+
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
+				       const struct kernel_clone_args *args)
 {
 	struct thread_shstk *shstk = &tsk->thread.shstk;
+	u64 clone_flags = args->flags;
 	unsigned long addr, size;
 
 	/*
 	 * If shadow stack is not enabled on the new thread, skip any
-	 * switch to a new shadow stack.
+	 * implicit switch to a new shadow stack and reject attempts to
+	 * explicitly specify one.
 	 */
-	if (!features_enabled(ARCH_SHSTK_SHSTK))
+	if (!features_enabled(ARCH_SHSTK_SHSTK)) {
+		if (args->shstk_token)
+			return (unsigned long)ERR_PTR(-EINVAL);
+
 		return 0;
+	}
+
+	/*
+	 * If the user specified a shadow stack then use it, otherwise
+	 * fall back to a default allocation strategy. Validation is
+	 * done in arch_shstk_validate_clone().
+	 */
+	if (args->shstk_token) {
+		shstk->base = 0;
+		shstk->size = 0;
+		return args->shstk_token + 8;
+	}
 
 	/*
 	 * For CLONE_VFORK the child will share the parents shadow stack.
@@ -222,7 +265,7 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
 	if (!(clone_flags & CLONE_VM))
 		return 0;
 
-	size = adjust_shstk_size(stack_size);
+	size = adjust_shstk_size(args->stack_size);
 	addr = alloc_shstk(0, size, 0, false);
 	if (IS_ERR_VALUE(addr))
 		return addr;
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index 7ee8a179d103..96cc0c7a5c90 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -124,4 +124,15 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
 	} while (0)
 #endif
 
+#ifndef cmpxchg_to_user_page
+#define cmpxchg_to_user_page(vma, page, vaddr, ptr, old, new)  \
+({							  \
+	bool ret;						  \
+								  \
+	ret = try_cmpxchg(ptr, &old, new);			  \
+	flush_icache_user_page(vma, page, vaddr, sizeof(*ptr));	  \
+	ret;							  \
+})
+#endif
+
 #endif /* _ASM_GENERIC_CACHEFLUSH_H */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 525aa2a632b2..7f860edc58da 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -16,6 +16,7 @@ struct task_struct;
 struct rusage;
 union thread_union;
 struct css_set;
+struct vm_area_struct;
 
 /* All the bits taken by the old clone syscall. */
 #define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -44,6 +45,7 @@ struct kernel_clone_args {
 	struct cgroup *cgrp;
 	struct css_set *cset;
 	unsigned int kill_seq;
+	unsigned long shstk_token;
 };
 
 /*
@@ -225,4 +227,19 @@ static inline void task_unlock(struct task_struct *p)
 
 DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
 
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
+int arch_shstk_validate_clone(struct task_struct *p,
+			      struct vm_area_struct *vma,
+			      struct page *page,
+			      struct kernel_clone_args *args);
+#else
+static inline int arch_shstk_validate_clone(struct task_struct *p,
+					    struct vm_area_struct *vma,
+					    struct page *page,
+					    struct kernel_clone_args *args)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_SCHED_TASK_H */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 359a14cc76a4..7e18e7b3df3a 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -84,6 +84,7 @@
  *                kernel's limit of nested PID namespaces.
  * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
  *                a file descriptor for the cgroup.
+ * @shstk_token:  Pointer to shadow stack token at top of stack.
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
@@ -101,12 +102,14 @@ struct clone_args {
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
 	__aligned_u64 cgroup;
+	__aligned_u64 shstk_token;
 };
 #endif
 
-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0  64 /* sizeof first published struct */
+#define CLONE_ARGS_SIZE_VER1  80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2  88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER3  96 /* sizeof fourth published struct */
 
 /*
  * Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..a0b8eeeb1d27 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1915,6 +1915,51 @@ static bool need_futex_hash_allocate_default(u64 clone_flags)
 	return true;
 }
 
+static int shstk_validate_clone(struct task_struct *p,
+				struct kernel_clone_args *args)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	unsigned long addr;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK))
+		return 0;
+
+	if (!args->shstk_token)
+		return 0;
+
+	mm = get_task_mm(p);
+	if (!mm)
+		return -EFAULT;
+
+	mmap_read_lock(mm);
+
+	addr = untagged_addr_remote(mm, args->shstk_token);
+	page = get_user_page_vma_remote(mm, addr, FOLL_FORCE | FOLL_WRITE,
+					&vma);
+	if (IS_ERR(page)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (!(vma->vm_flags & VM_SHADOW_STACK) ||
+	    !(vma->vm_flags & VM_WRITE)) {
+		ret = -EFAULT;
+		goto out_page;
+	}
+
+	ret = arch_shstk_validate_clone(p, vma, page, args);
+
+out_page:
+	put_page(page);
+out:
+	mmap_read_unlock(mm);
+	mmput(mm);
+	return ret;
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -2188,6 +2233,9 @@ __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(p, args);
+	if (retval)
+		goto bad_fork_cleanup_io;
+	retval = shstk_validate_clone(p, args);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
@@ -2765,7 +2813,9 @@ static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		     CLONE_ARGS_SIZE_VER1);
 	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
 		     CLONE_ARGS_SIZE_VER2);
-	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+	BUILD_BUG_ON(offsetofend(struct clone_args, shstk_token) !=
+		     CLONE_ARGS_SIZE_VER3);
+	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER3);
 
 	if (unlikely(usize > PAGE_SIZE))
 		return -E2BIG;
@@ -2798,16 +2848,17 @@ static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		return -EINVAL;
 
 	*kargs = (struct kernel_clone_args){
-		.flags		= args.flags,
-		.pidfd		= u64_to_user_ptr(args.pidfd),
-		.child_tid	= u64_to_user_ptr(args.child_tid),
-		.parent_tid	= u64_to_user_ptr(args.parent_tid),
-		.exit_signal	= args.exit_signal,
-		.stack		= args.stack,
-		.stack_size	= args.stack_size,
-		.tls		= args.tls,
-		.set_tid_size	= args.set_tid_size,
-		.cgroup		= args.cgroup,
+		.flags			= args.flags,
+		.pidfd			= u64_to_user_ptr(args.pidfd),
+		.child_tid		= u64_to_user_ptr(args.child_tid),
+		.parent_tid		= u64_to_user_ptr(args.parent_tid),
+		.exit_signal		= args.exit_signal,
+		.stack			= args.stack,
+		.stack_size		= args.stack_size,
+		.tls			= args.tls,
+		.set_tid_size		= args.set_tid_size,
+		.cgroup			= args.cgroup,
+		.shstk_token		= args.shstk_token,
 	};
 
 	if (args.set_tid &&
@@ -2848,6 +2899,24 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
 	return true;
 }
 
+/**
+ * clone3_shadow_stack_valid - check and prepare shadow stack
+ * @kargs: kernel clone args
+ *
+ * Verify that shadow stacks are only enabled if supported.
+ */
+static inline bool clone3_shadow_stack_valid(struct kernel_clone_args *kargs)
+{
+	if (!kargs->shstk_token)
+		return true;
+
+	if (!IS_ALIGNED(kargs->shstk_token, sizeof(void *)))
+		return false;
+
+	/* Fail if the kernel wasn't built with shadow stacks */
+	return IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK);
+}
+
 static bool clone3_args_valid(struct kernel_clone_args *kargs)
 {
 	/* Verify that no unknown flags are passed along. */
@@ -2870,7 +2939,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
 	    kargs->exit_signal)
 		return false;
 
-	if (!clone3_stack_valid(kargs))
+	if (!clone3_stack_valid(kargs) || !clone3_shadow_stack_valid(kargs))
 		return false;
 
 	return true;

-- 
2.47.2


^ permalink raw reply related

* [PATCH v22 5/8] selftests/clone3: Remove redundant flushes of output streams
From: Mark Brown @ 2025-10-15 12:49 UTC (permalink / raw)
  To: Rick P. Edgecombe, Deepak Gupta, H.J. Lu, Florian Weimer,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, Christian Brauner, Shuah Khan
  Cc: linux-kernel, Catalin Marinas, Will Deacon, jannh, bsegall,
	Andrew Morton, Yury Khrustalev, H.J. Lu, Adhemerval Zanella Netto,
	Wilco Dijkstra, CarlosO'Donell, Florian Weimer, Rich Felker,
	linux-kselftest, linux-api, Mark Brown, Kees Cook, Kees Cook,
	Shuah Khan
In-Reply-To: <20251015-clone3-shadow-stack-v22-0-a8c8da011427@kernel.org>

Since there were widespread issues with output not being flushed the
kselftest framework was modified to explicitly set the output streams
unbuffered in commit 58e2847ad2e6 ("selftests: line buffer test
program's stdout") so there is no need to explicitly flush in the clone3
tests.

Reviewed-by: Kees Cook <kees@kernel.org>
Tested-by: Kees Cook <kees@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 tools/testing/selftests/clone3/clone3_selftests.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h
index eeca8005723f..939b26c86d42 100644
--- a/tools/testing/selftests/clone3/clone3_selftests.h
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -35,8 +35,6 @@ struct __clone_args {
 
 static pid_t sys_clone3(struct __clone_args *args, size_t size)
 {
-	fflush(stdout);
-	fflush(stderr);
 	return syscall(__NR_clone3, args, size);
 }
 

-- 
2.47.2


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox