Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* Re: [RFC][CFT] selftest for permission checks in mount propagation changes
From: Al Viro @ 2025-08-14  6:37 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Tycho Andersen, Andrei Vagin, Andrei Vagin, Christian Brauner,
	Pavel Tikhomirov, LKML, criu, Linux API, stable
In-Reply-To: <20250814055702.GO222315@ZenIV>

> void do_unshare(void)
> {
> 	FILE *f;
> 	uid_t uid = geteuid();
> 	gid_t gid = getegid();
> 	unshare(CLONE_NEWNS|CLONE_NEWUSER);
> 	f = fopen("/proc/self/uid_map", "w");
> 	fprintf(f, "0 %d 1", uid);
> 	fclose(f);
> 	f = fopen("/proc/self/setgroups", "w");
> 	fprintf(f, "deny");
> 	fclose(f);
> 	f = fopen("/proc/self/gid_map", "w");
> 	fprintf(f, "0 %d 1", gid);
> 	fclose(f);
> 	mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL);
> }

This obviously needs error checking - in this form it won't do
anything good without userns enabled (coredump on the first
fprintf() in there, since there won't be /proc/self/uid_map);
should probably just report CLONE_NEWUSER failure, warn about
skipped tests, fall back to unshare(CLONE_NEWNS) and skip
everything in in_child()...

^ permalink raw reply

* [RFC][CFT] selftest for permission checks in mount propagation changes
From: Al Viro @ 2025-08-14  5:57 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Tycho Andersen, Andrei Vagin, Andrei Vagin, Christian Brauner,
	Pavel Tikhomirov, LKML, criu, Linux API, stable
In-Reply-To: <20250814055142.GN222315@ZenIV>

// link with -lcap, can run both as root and as regular user
#define _GNU_SOURCE
#include <sched.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <stdbool.h>

_Bool drop_caps(void)
{
        cap_value_t cap_value[] = { CAP_SYS_ADMIN };
        cap_t cap = cap_get_proc();
        if (!cap) {
		perror("cap_get_proc");
		return false;
	}
	return true;
}

void do_unshare(void)
{
	FILE *f;
	uid_t uid = geteuid();
	gid_t gid = getegid();
	unshare(CLONE_NEWNS|CLONE_NEWUSER);
	f = fopen("/proc/self/uid_map", "w");
	fprintf(f, "0 %d 1", uid);
	fclose(f);
	f = fopen("/proc/self/setgroups", "w");
	fprintf(f, "deny");
	fclose(f);
	f = fopen("/proc/self/gid_map", "w");
	fprintf(f, "0 %d 1", gid);
	fclose(f);
	mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL);
}

void bind(char *p)
{
	mount(p, p, NULL, MS_BIND, NULL);
}

void change_type(char *p, int type)
{
	errno = 0;
	mount(NULL, p, NULL, type, NULL);
}

void set_group(int fd1, char *p1, int fd2, char *p2)
{
	int flags = MOVE_MOUNT_SET_GROUP;
	int n;

	if (!p1 || !*p1) {
		p1 = "";	// be kind to old kernels
		flags |= MOVE_MOUNT_F_EMPTY_PATH;
	}
	if (!p2 || !*p2) {
		p2 = "";	// be kind to old kernels
		flags |= MOVE_MOUNT_T_EMPTY_PATH;
	}
	errno = 0;
	move_mount(fd1, p1, fd2, p2, flags);
}

_Bool result(int expected)
{
	if (expected != errno) {
		printf(" failed: %d != %d\n", expected, errno);
		return false;
	}
	printf(" OK\n");
	return true;
}

int fd1[2], fd2[2];

void in_child(void)
{
	printf("from should be someplace we have permissions for");
	set_group(AT_FDCWD, "mnt/a", AT_FDCWD, "/mnt/a/private");
	result(EPERM);
	printf("to should be someplace we have permissions for");
	set_group(AT_FDCWD, "/mnt/a", AT_FDCWD, "mnt/a/private");
	result(EPERM);
	printf("change_type: should have permissions for target");
	change_type("mnt/locked", MS_PRIVATE);
	result(EPERM);
}

void in_parent(void)
{
	printf("from should be mounted (pipes are not)");
	set_group(fd1[0], NULL, AT_FDCWD, "/mnt/a/private");
	result(EINVAL);

	printf("to should be mounted (pipes are not)");
	set_group(AT_FDCWD, "/mnt", fd1[0], NULL);
	result(EINVAL);

	printf("from should be someplace we have permissions for");
	set_group(AT_FDCWD, "mnt/a", AT_FDCWD, "/mnt/a/private");
	if (result(0))
		change_type("/mnt/a/private", MS_PRIVATE);

	printf("to should be someplace we have permissions for");
	set_group(AT_FDCWD, "/mnt", AT_FDCWD, "mnt/a/private");
	if (result(0))
		change_type("mnt/a/private", MS_PRIVATE);

	printf("from should be mountpoint");
	set_group(AT_FDCWD, "/mnt/a", AT_FDCWD, "/mnt/a/private");
	result(EINVAL);

	printf("to should be mountpoint");
	set_group(AT_FDCWD, "/mnt/a", AT_FDCWD, "/mnt/a/private/b");
	result(EINVAL);

	printf("to and from should be on the same filesystem");
	mount("none", "/mnt/no-locked", "tmpfs", 0, NULL);
	set_group(AT_FDCWD, "/mnt/a/shared", AT_FDCWD, "/mnt/no-locked");
	result(EINVAL);
	umount("/mnt/no-locked");

	printf("from should contain the counterpart of to");
	set_group(AT_FDCWD, "/mnt/a/shared", AT_FDCWD, "/mnt/no-locked");
	result(EINVAL);

	printf("from should not have anything locked in counterpart of to");
	set_group(AT_FDCWD, "mnt", AT_FDCWD, "/mnt/no-locked");
	if (result(0))
		change_type("/mnt/no-locked", MS_PRIVATE);

	printf("change_type: should have permissions for target");
	change_type("mnt/a/private", MS_PRIVATE);
	result(0);

	printf("change_type: target should be a mountpoint");
	change_type("/mnt/a", MS_PRIVATE);
	result(EINVAL);

	chdir("/mnt/a/private");
	umount2("/mnt/a/private", MNT_DETACH);
	printf("change_type: target should be mounted");
	change_type(".", MS_PRIVATE);
	result(EINVAL);
}

int main()
{
	char path[40];
	pid_t child;
	int root_fd;
	char c;

	if (pipe(fd1) < 0 || pipe(fd2) < 0) {
		perror("pipe");
		return -1;
	}
	if (!drop_caps())
		return -1;
	do_unshare();

	root_fd = open("/", O_PATH);

	errno = 0;
	mount("none", "/mnt", "tmpfs", 0, NULL);
	mkdir("/mnt/a", 0777);
	mkdir("/mnt/a/private", 0777);
	mkdir("/mnt/a/private/b", 0777);
	mkdir("/mnt/a/shared", 0777);
	mkdir("/mnt/a/slave", 0777);
	mkdir("/mnt/a/shared-slave", 0777);
	mkdir("/mnt/locked", 0777);
	mkdir("/mnt/no-locked", 0777);
	bind("/mnt/locked");

	child = fork();
	if (child < 0) {
		perror("fork");
		return -1;
	} else if (child == 0) {
		do_unshare();
		change_type("/mnt/", MS_SHARED);
		bind("/mnt/a");
		bind("/mnt/a/private");
		change_type("/mnt/a/private", MS_PRIVATE);
		write(fd1[1], &c, 1);
		read(fd2[0], &c, 1);

		fchdir(root_fd);
		in_child();

		write(fd1[1], &c, 1);
		return 0;
	}
	read(fd1[0], &c, 1);
	sprintf(path, "/proc/%d/root", child);
	chdir(path);

	change_type("/mnt", MS_SHARED);
	bind("/mnt/a/private");
	bind("/mnt/a/shared");
	bind("/mnt/a/slave");
	bind("/mnt/a/slave-shared");
	bind("/mnt/no-locked");
	change_type("/mnt/a/private", MS_PRIVATE);
	change_type("/mnt/a/slave", MS_SLAVE);
	change_type("/mnt/a/shared-slave", MS_SLAVE);
	change_type("/mnt/a/shared-slave", MS_SHARED);
	change_type("/mnt/no-locked", MS_PRIVATE);

	in_parent();

	fflush(stdout);
	write(fd2[1], &c, 1);
	read(fd1[0], &c, 1);
	return 0;
}

^ permalink raw reply

* [PATCH][RFC][CFT] use uniform permission checks for all mount propagation changes
From: Al Viro @ 2025-08-14  5:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Tycho Andersen, Andrei Vagin, Andrei Vagin, Christian Brauner,
	Pavel Tikhomirov, LKML, criu, Linux API, stable
In-Reply-To: <20250814044239.GM222315@ZenIV>

[this should fix userland regression from do_change_type() fix last cycle;
we have too little self-test coverage in the area, unfortunately.  First
approximation for selftest in the followup to this posting.  Review and
testing of both the patch and test would be very welcome]

do_change_type() and do_set_group() are operating on different
aspects of the same thing - propagation graph.  The latter
asks for mounts involved to be mounted in namespace(s) the caller
has CAP_SYS_ADMIN for.  The former is a mess - originally it
didn't even check that mount *is* mounted.  That got fixed,
but the resulting check turns out to be too strict for userland -
in effect, we check that mount is in our namespace, having already
checked that we have CAP_SYS_ADMIN there.

What we really need (in both cases) is
	* we only touch mounts that are mounted.  Hard requirement,
data corruption if that's get violated.
	* we don't allow to mess with a namespace unless you already
have enough permissions to do so (i.e. CAP_SYS_ADMIN in its userns).

That's an equivalent of what do_set_group() does; let's extract that
into a helper (may_change_propagation()) and use it in both
do_set_group() and do_change_type().

Fixes: 12f147ddd6de "do_change_type(): refuse to operate on unmounted/not ours mounts"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/namespace.c b/fs/namespace.c
index ddfd4457d338..a191c6519e36 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2862,6 +2862,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
 	return attach_recursive_mnt(mnt, p, mp);
 }
 
+static int may_change_propagation(const struct mount *m)
+{
+        struct mnt_namespace *ns = m->mnt_ns;
+
+	 // it must be mounted in some namespace
+	 if (IS_ERR_OR_NULL(ns))         // is_mounted()
+		 return -EINVAL;
+	 // and the caller must be admin in userns of that namespace
+	 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		 return -EPERM;
+	 return 0;
+}
+
 /*
  * Sanity check the flags to change_mnt_propagation.
  */
@@ -2898,10 +2911,10 @@ static int do_change_type(struct path *path, int ms_flags)
 		return -EINVAL;
 
 	namespace_lock();
-	if (!check_mnt(mnt)) {
-		err = -EINVAL;
+	err = may_change_propagation(mnt);
+	if (err)
 		goto out_unlock;
-	}
+
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
 		if (err)
@@ -3347,18 +3360,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 
 	namespace_lock();
 
-	err = -EINVAL;
-	/* To and From must be mounted */
-	if (!is_mounted(&from->mnt))
-		goto out;
-	if (!is_mounted(&to->mnt))
-		goto out;
-
-	err = -EPERM;
-	/* We should be allowed to modify mount namespaces of both mounts */
-	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+	err = may_change_propagation(from);
+	if (err)
 		goto out;
-	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+	err = may_change_propagation(to);
+	if (err)
 		goto out;
 
 	err = -EINVAL;

^ permalink raw reply related

* Re: do_change_type(): refuse to operate on unmounted/not ours mounts
From: Al Viro @ 2025-08-14  4:42 UTC (permalink / raw)
  To: Pavel Tikhomirov
  Cc: Tycho Andersen, Andrei Vagin, Andrei Vagin, Christian Brauner,
	linux-fsdevel, LKML, criu, Linux API, stable
In-Reply-To: <CAE1zp77jmFD=rySJVLf6yU+JKZnUpjkBagC3qQHrxPotrccEbQ@mail.gmail.com>

On Thu, Aug 14, 2025 at 12:08:49PM +0800, Pavel Tikhomirov wrote:

> Yes, selftest is very simple and is not covering userns checks.

FWIW, see below for what I've got here at the moment for MOVE_MOUNT_SET_GROUP;
no tests for cross-filesystem and not-a-subtree yet.  At least it does catch
that braino when run on a kernel that doesn't have it fixed ;-)
No do_change_type() tests either yet...

// link with -lcap, assumes userns enabled
// can run both as root and as regular user
#define _GNU_SOURCE
#include <sched.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <stdbool.h>

_Bool drop_caps(void)
{
        cap_value_t cap_value[] = { CAP_SYS_ADMIN };
        cap_t cap = cap_get_proc();
        if (!cap) {
		perror("cap_get_proc");
		return false;
	}
	return true;
}

void do_unshare(void)
{
	FILE *f;
	uid_t uid = geteuid();
	gid_t gid = getegid();
	unshare(CLONE_NEWNS|CLONE_NEWUSER);
	f = fopen("/proc/self/uid_map", "w");
	fprintf(f, "0 %d 1", uid);
	fclose(f);
	f = fopen("/proc/self/setgroups", "w");
	fprintf(f, "deny");
	fclose(f);
	f = fopen("/proc/self/gid_map", "w");
	fprintf(f, "0 %d 1", gid);
	fclose(f);
	mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL);
}

void bind(char *p)
{
	mount(p, p, NULL, MS_BIND, NULL);
}

void test_it(int fd1, char *p1, int fd2, char *p2, int expected)
{
	int flags = MOVE_MOUNT_SET_GROUP;
	int n;

	if (!p1) {
		p1 = "";
		flags |= MOVE_MOUNT_F_EMPTY_PATH;
	}
	if (!p2) {
		p2 = "";
		flags |= MOVE_MOUNT_T_EMPTY_PATH;
	}
	n = move_mount(fd1, p1, fd2, p2, flags);
	if (!n)
		errno = 0;
	if (expected != errno)
		printf(" failed: %d != %d\n", expected, errno);
	else
		printf(" OK\n");
}

int main()
{
	int pipe1[2], pipe2[2];
	char path[40];
	pid_t child;
	int root_fd;
	char c;

	if (pipe(pipe1) < 0 || pipe(pipe2) < 0) {
		perror("pipe");
		return -1;
	}
	if (!drop_caps())
		return -1;
	do_unshare();

	root_fd = open("/", O_PATH);

	errno = 0;
	mount("none", "/mnt", "tmpfs", 0, NULL);
	mkdir("/mnt/a", 0777);
	mkdir("/mnt/a/private", 0777);
	mkdir("/mnt/a/private/b", 0777);
	mkdir("/mnt/a/shared", 0777);
	mkdir("/mnt/a/slave", 0777);
	mkdir("/mnt/a/shared-slave", 0777);
	mkdir("/mnt/locked", 0777);
	mkdir("/mnt/no-locked", 0777);
	bind("/mnt/locked");

	child = fork();
	if (child < 0) {
		perror("fork");
		return -1;
	} else if (child == 0) {
		do_unshare();
		mount(NULL, "/mnt/", NULL, MS_SHARED, NULL);
		bind("/mnt/a");
		write(pipe1[1], &c, 1);
		fchdir(root_fd);
		read(pipe2[0], &c, 1);
		printf("from should be someplace we have permissions for");
		test_it(AT_FDCWD, "mnt/a", AT_FDCWD, "/mnt/a/private", EPERM);
		printf("to should be someplace we have permissions for");
		test_it(AT_FDCWD, "/mnt/a", AT_FDCWD, "mnt/a/private", EPERM);
		write(pipe1[1], &c, 1);
		return 0;
	}
	read(pipe1[0], &c, 1);
	sprintf(path, "/proc/%d/root", child);
	chdir(path);

	mount(NULL, "/mnt", NULL, MS_SHARED, NULL);
	bind("/mnt/a/private");
	bind("/mnt/a/shared");
	bind("/mnt/a/slave");
	bind("/mnt/a/slave-shared");
	bind("/mnt/no-locked");
	mount(NULL, "/mnt/a/private", NULL, MS_PRIVATE, NULL);
	mount(NULL, "/mnt/a/slave", NULL, MS_SLAVE, NULL);
	mount(NULL, "/mnt/a/shared-slave", NULL, MS_SLAVE, NULL);
	mount(NULL, "/mnt/a/shared-slave", NULL, MS_SHARED, NULL);
	mount(NULL, "/mnt/no-locked", NULL, MS_PRIVATE, NULL);

	printf("from should be mounted (pipes are not)");
	test_it(pipe1[0], NULL, AT_FDCWD, "/mnt/a/private", EINVAL);

	printf("to should be mounted (pipes are not)");
	test_it(AT_FDCWD, "/mnt", pipe1[0], NULL, EINVAL);

	printf("from should be someplace we have permissions for");
	test_it(AT_FDCWD, "mnt/a", AT_FDCWD, "/mnt/a/private", 0);
	mount(NULL, "/mnt/a/private", NULL, MS_PRIVATE, NULL);

	printf("from should be mountpoint");
	test_it(AT_FDCWD, "/mnt/a", AT_FDCWD, "/mnt/a/private", EINVAL);

	printf("to should be mountpoint");
	test_it(AT_FDCWD, "/mnt/a", AT_FDCWD, "/mnt/a/private/b", EINVAL);

	printf("from should not have anything locked in counterpart of to");
	test_it(AT_FDCWD, "mnt", AT_FDCWD, "/mnt/locked", EINVAL);

	printf("from should not have anything locked in counterpart of to");
	test_it(AT_FDCWD, "mnt", AT_FDCWD, "/mnt/no-locked", 0);
	mount(NULL, "/mnt/no-locked", NULL, MS_PRIVATE, NULL);

	fflush(stdout);
	write(pipe2[1], &c, 1);
	read(pipe1[0], &c, 1);
	return 0;
}

^ permalink raw reply

* Re: do_change_type(): refuse to operate on unmounted/not ours mounts
From: Pavel Tikhomirov @ 2025-08-14  4:08 UTC (permalink / raw)
  To: Al Viro
  Cc: Tycho Andersen, Andrei Vagin, Andrei Vagin, Christian Brauner,
	linux-fsdevel, LKML, criu, Linux API, stable
In-Reply-To: <20250813194145.GK222315@ZenIV>

On Thu, Aug 14, 2025 at 3:41 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Wed, Aug 13, 2025 at 01:09:27PM -0600, Tycho Andersen wrote:
> > On Wed, Aug 13, 2025 at 07:56:01PM +0100, Al Viro wrote:
> > > @@ -3347,18 +3360,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
> > >
> > >     namespace_lock();
> > >
> > > -   err = -EINVAL;
> > > -   /* To and From must be mounted */
> > > -   if (!is_mounted(&from->mnt))
> > > -           goto out;
> > > -   if (!is_mounted(&to->mnt))
> > > -           goto out;
> > > -
> > > -   err = -EPERM;
> > > -   /* We should be allowed to modify mount namespaces of both mounts */
> > > -   if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
> > > +   err = may_change_propagation(from);
> > > +   if (err)
> > >             goto out;
> > > -   if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
> > > +   err = may_change_propagation(from);
> >
> > Just driving by, but I guess you mean "to" here.
>
> D'oh...  Yes, of course.  Fun question: would our selftests have caught
> that?
> [checks]
> move_mount_set_group_test.c doesn't have anything in that area, nothing in
> LTP or xfstests either, AFAICS...

Yes, selftest is very simple and is not covering userns checks.

>  And I don't see anything in
> https://github.com/checkpoint-restore/criu
> either - there are uses of MOVE_MOUNT_SET_GROUP, but they are well-buried
> and I don't see anything in their tests that would even try to poke into
> that thing...
>
> Before we go and try to cobble something up, does anybody know of a place
> where regression tests for MOVE_MOUNT_SET_GROUP could be picked from?
>

Basically each CRIU test that is run by zdtm (if it is in ns/uns
flavor (which are most of them)), tests mounts checkpoint/restore. And
each test which has shared/slave moutns leads to MOVE_MOUNT_SET_GROUP
being used and thus tested. We have a mountinfo comparison in zdtm
which checks that propagation is topologically the same after c/r.

But, yes, we do not cover userns checks, as in CRIU case, CRIU is
expected to run in userns which has all capabilities over restored
container, and should always pass those checks.

JFYI:

The use of MOVE_MOUNT_SET_GROUP in CRIU is well-buried in:

https://github.com/checkpoint-restore/criu/blob/116e56ba46382c05066d33a8bbadcc495dbdb644/criu/mount-v2.c#L896

  +-< move_mount_set_group
    +-< restore_one_sharing
      +-< restore_one_sharing_group
        +-< restore_mount_sharing_options
          +-< prepare_mnt_ns_v2

This stack already has a set of precreated mounts and walks over their
sharing groups saved in CRIU image files and assigns them accordingly.

And we have a bunch of tests with different sharing configurations to
test propagation c/r specifically:

git grep -l "SHARING\|SLAVE" test/zdtm/static
test/zdtm/static/mnt_ext_auto.c
test/zdtm/static/mnt_ext_master.c
test/zdtm/static/mnt_ext_multiple.c
test/zdtm/static/mnt_root_ext.c
test/zdtm/static/mntns_overmount.c
test/zdtm/static/mntns_shared_bind03.c
test/zdtm/static/mount_complex_sharing.c
test/zdtm/static/mountpoints.c
test/zdtm/static/shared_slave_mount_children.c

It should be enough to run a zdtm test-suit to check that change does
not break something for CRIU (will do).

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Jason Gunthorpe @ 2025-08-13 20:03 UTC (permalink / raw)
  To: Greg KH
  Cc: Pratyush Yadav, Vipin Sharma, Pasha Tatashin, jasonmiu, graf,
	changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <2025081334-rotten-visible-517a@gregkh>

On Wed, Aug 13, 2025 at 03:00:08PM +0200, Greg KH wrote:
> > In this case if kho_unpreserve_folio() fails in this call chain it
> > means some error unwind is wrongly happening out of sequence, and we
> > are now forced to leak memory. Unwind is not something that userspace
> > should be controlling, so of course we want a WARN_ON here.
> 
> "should be" is the key here.  And it's not obvious from this patch if
> that's true or not, which is why I mentioned it.
> 
> I will keep bringing this up, given the HUGE number of CVEs I keep
> assigning each week for when userspace hits WARN_ON() calls until that
> flow starts to die out either because we don't keep adding new calls, OR
> we finally fix them all.  Both would be good...

WARN or not, userspace triggering permanently leaking kernel memory is
a CVE worthy bug in of itself.

So even if userspace triggers this I'd rather have the warn than the
difficult to find leak.

I don't know what your CVEs are, but I get a decent number of
userspace hits a WARN bug from with syzkaller, and they are all bugs
in the kernel. Bugs that should probably get CVEs even without the
crash on WARN issue anyhow. The WARN made them discoverable cheaply.

The most recent was a userspace triggerable arthimetic overflow
corrupted a datastructure and a WARN caught it, syzkaller found it,
and we fixed it before it became a splashy exploit with a web
site.

Removing bug catching to reduce CVEs because we don't find the bugs
anymore seems like the wrong direction to me.

Jason

^ permalink raw reply

* Re: do_change_type(): refuse to operate on unmounted/not ours mounts
From: Al Viro @ 2025-08-13 19:41 UTC (permalink / raw)
  To: Tycho Andersen
  Cc: Andrei Vagin, Andrei Vagin, Christian Brauner, linux-fsdevel,
	LKML, criu, Linux API, stable
In-Reply-To: <aJzi506tGJb8CzA3@tycho.pizza>

On Wed, Aug 13, 2025 at 01:09:27PM -0600, Tycho Andersen wrote:
> On Wed, Aug 13, 2025 at 07:56:01PM +0100, Al Viro wrote:
> > @@ -3347,18 +3360,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
> >  
> >  	namespace_lock();
> >  
> > -	err = -EINVAL;
> > -	/* To and From must be mounted */
> > -	if (!is_mounted(&from->mnt))
> > -		goto out;
> > -	if (!is_mounted(&to->mnt))
> > -		goto out;
> > -
> > -	err = -EPERM;
> > -	/* We should be allowed to modify mount namespaces of both mounts */
> > -	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
> > +	err = may_change_propagation(from);
> > +	if (err)
> >  		goto out;
> > -	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
> > +	err = may_change_propagation(from);
> 
> Just driving by, but I guess you mean "to" here.

D'oh...  Yes, of course.  Fun question: would our selftests have caught
that?
[checks]
move_mount_set_group_test.c doesn't have anything in that area, nothing in
LTP or xfstests either, AFAICS...  And I don't see anything in
https://github.com/checkpoint-restore/criu
either - there are uses of MOVE_MOUNT_SET_GROUP, but they are well-buried
and I don't see anything in their tests that would even try to poke into
that thing...

Before we go and try to cobble something up, does anybody know of a place
where regression tests for MOVE_MOUNT_SET_GROUP could be picked from?

^ permalink raw reply

* Re: do_change_type(): refuse to operate on unmounted/not ours mounts
From: Tycho Andersen @ 2025-08-13 19:09 UTC (permalink / raw)
  To: Al Viro
  Cc: Andrei Vagin, Andrei Vagin, Christian Brauner, linux-fsdevel,
	LKML, criu, Linux API, stable
In-Reply-To: <20250813185601.GJ222315@ZenIV>

On Wed, Aug 13, 2025 at 07:56:01PM +0100, Al Viro wrote:
> @@ -3347,18 +3360,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
>  
>  	namespace_lock();
>  
> -	err = -EINVAL;
> -	/* To and From must be mounted */
> -	if (!is_mounted(&from->mnt))
> -		goto out;
> -	if (!is_mounted(&to->mnt))
> -		goto out;
> -
> -	err = -EPERM;
> -	/* We should be allowed to modify mount namespaces of both mounts */
> -	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
> +	err = may_change_propagation(from);
> +	if (err)
>  		goto out;
> -	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
> +	err = may_change_propagation(from);

Just driving by, but I guess you mean "to" here.

Tycho

^ permalink raw reply

* Re: do_change_type(): refuse to operate on unmounted/not ours mounts
From: Al Viro @ 2025-08-13 18:56 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: Andrei Vagin, Christian Brauner, linux-fsdevel, LKML, criu,
	Linux API, stable
In-Reply-To: <CAEWA0a6jgj8vQhrijSJXUHBnCTtz0HEV66tmaVKPe83ng=3feQ@mail.gmail.com>

On Sat, Jul 26, 2025 at 02:01:20PM -0700, Andrei Vagin wrote:

> > For a very mild example of fun to be had there:
> >         mount("none", "/mnt", "tmpfs", 0, "");
> >         chdir("/mnt");
> >         umount2(".", MNT_DETACH);
> >         mount(NULL, ".", NULL, MS_SHARED, NULL);
> > Repeat in a loop, watch mount group id leak.  That's a trivial example
> > of violating the assertion ("a mount that had been through umount_tree()
> > is out of propagation graph and related data structures for good").
> 
> I wasn't referring to detached mounts. CRIU modifies mounts from
> non-current namespaces.
> 
> >
> > As for the "CAP_SYS_ADMIN within the mount user namespace" - which
> > userns do you have in mind?
> >
> 
> The user namespace of the target mount:
> ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)

To bring that thread back: how about the following?  If nobody objects,
I'm going to throw it into viro/vfs.git #fixes...

[PATCH] use uniform permission checks for all mount propagation changes

do_change_type() and do_set_group() are operating on different
aspects of the same thing - propagation graph.  The latter
asks for mounts involved to be mounted in namespace(s) the caller
has CAP_SYS_ADMIN for.  The former is a mess - originally it
didn't even check that mount *is* mounted.  That got fixed,
but the resulting check turns out to be too strict for userland -
in effect, we check that mount is in our namespace, having already
checked that we have CAP_SYS_ADMIN there.

What we really need (in both cases) is
	* we only touch mounts that are mounted.  Hard requirement,
data corruption if that's get violated.
	* we don't allow to mess with a namespace unless you already
have enough permissions to do so (i.e. CAP_SYS_ADMIN in its userns).

That's an equivalent of what do_set_group() does; let's extract that
into a helper (may_change_propagation()) and use it in both
do_set_group() and do_change_type().

Fixes: 12f147ddd6de "do_change_type(): refuse to operate on unmounted/not ours mounts"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/namespace.c b/fs/namespace.c
index ddfd4457d338..e7d9b23f1e9e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2862,6 +2862,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
 	return attach_recursive_mnt(mnt, p, mp);
 }
 
+static int may_change_propagation(const struct mount *m)
+{
+        struct mnt_namespace *ns = m->mnt_ns;
+
+	 // it must be mounted in some namespace
+	 if (IS_ERR_OR_NULL(ns))         // is_mounted()
+		 return -EINVAL;
+	 // and the caller must be admin in userns of that namespace
+	 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		 return -EPERM;
+	 return 0;
+}
+
 /*
  * Sanity check the flags to change_mnt_propagation.
  */
@@ -2898,10 +2911,10 @@ static int do_change_type(struct path *path, int ms_flags)
 		return -EINVAL;
 
 	namespace_lock();
-	if (!check_mnt(mnt)) {
-		err = -EINVAL;
+	err = may_change_propagation(mnt);
+	if (err)
 		goto out_unlock;
-	}
+
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
 		if (err)
@@ -3347,18 +3360,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 
 	namespace_lock();
 
-	err = -EINVAL;
-	/* To and From must be mounted */
-	if (!is_mounted(&from->mnt))
-		goto out;
-	if (!is_mounted(&to->mnt))
-		goto out;
-
-	err = -EPERM;
-	/* We should be allowed to modify mount namespaces of both mounts */
-	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+	err = may_change_propagation(from);
+	if (err)
 		goto out;
-	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+	err = may_change_propagation(from);
+	if (err)
 		goto out;
 
 	err = -EINVAL;

^ permalink raw reply related

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pratyush Yadav @ 2025-08-13 13:55 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Pratyush Yadav, Vipin Sharma, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <CA+CK2bCmQ3hY+ACnLrVZ1qwiTiVvxEBCDNFmAHn_uVRagvshhw@mail.gmail.com>

On Wed, Aug 13 2025, Pasha Tatashin wrote:

> On Wed, Aug 13, 2025 at 12:29 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>>
>> Hi Vipin,
>>
>> Thanks for the review.
>>
>> On Tue, Aug 12 2025, Vipin Sharma wrote:
>>
>> > On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> >> From: Pratyush Yadav <ptyadav@amazon.de>
>> >> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> >> +                                    unsigned int nr_folios)
>> >> +{
>> >> +    unsigned int i;
>> >> +
>> >> +    for (i = 0; i < nr_folios; i++) {
>> >> +            const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> >> +            struct folio *folio;
>> >> +
>> >> +            if (!pfolio->foliodesc)
>> >> +                    continue;
>> >> +
>> >> +            folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> >> +
>> >> +            kho_unpreserve_folio(folio);
>> >
>> > This one is missing WARN_ON_ONCE() similar to the one in
>> > memfd_luo_preserve_folios().
>>
>> Right, will add.
>>
>> >
>> >> +            unpin_folio(folio);
>>
>> Looking at this code caught my eye. This can also be called from LUO's
>> finish callback if no one claimed the memfd after live update. In that
>> case, unpin_folio() is going to underflow the pincount or refcount on
>> the folio since after the kexec, the folio is no longer pinned. We
>> should only be doing folio_put().
>>
>> I think this function should take a argument to specify which of these
>> cases it is dealing with.
>>
>> >> +    }
>> >> +}
>> >> +
>> >> +static void *memfd_luo_create_fdt(unsigned long size)
>> >> +{
>> >> +    unsigned int order = get_order(size);
>> >> +    struct folio *fdt_folio;
>> >> +    int err = 0;
>> >> +    void *fdt;
>> >> +
>> >> +    if (order > MAX_PAGE_ORDER)
>> >> +            return NULL;
>> >> +
>> >> +    fdt_folio = folio_alloc(GFP_KERNEL, order);
>> >
>> > __GFP_ZERO should also be used here. Otherwise this can lead to
>> > unintentional passing of old kernel memory.
>>
>> fdt_create() zeroes out the buffer so this should not be a problem.
>
> You are right, fdt_create() zeroes the whole buffer, however, I wonder
> if it could be `optimized` to only clear only the header part of FDT,
> not the rest and this could potentially lead us to send an FDT buffer
> that contains both a valid FDT and the trailing bits contain data from
> old kernel.

Fair enough. At least the API documentation does not say anything about
the state of the buffer. My main concern was around performance since
the FDT can be multiple megabytes long for big memfds. Anyway, this
isn't in the blackout window so perhaps we can live with it. Will add
the GFP_ZERO.

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Greg KH @ 2025-08-13 13:53 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: Pratyush Yadav, Jason Gunthorpe, Vipin Sharma, jasonmiu, graf,
	changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <CA+CK2bDs9prKNSo=Ris-L7T43ZFU7ji3cBH3KD1=FxXg7hFbFA@mail.gmail.com>

On Wed, Aug 13, 2025 at 01:41:51PM +0000, Pasha Tatashin wrote:
> On Wed, Aug 13, 2025 at 1:37 PM Pratyush Yadav <pratyush@kernel.org> wrote:
> >
> > On Wed, Aug 13 2025, Greg KH wrote:
> >
> > > On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> > [...]
> > >> Use the warn ons. Make sure they can't be triggered by userspace. Use
> > >> them to detect corruption/malfunction in the kernel.
> > >>
> > >> In this case if kho_unpreserve_folio() fails in this call chain it
> > >> means some error unwind is wrongly happening out of sequence, and we
> > >> are now forced to leak memory. Unwind is not something that userspace
> > >> should be controlling, so of course we want a WARN_ON here.
> > >
> > > "should be" is the key here.  And it's not obvious from this patch if
> > > that's true or not, which is why I mentioned it.
> > >
> > > I will keep bringing this up, given the HUGE number of CVEs I keep
> > > assigning each week for when userspace hits WARN_ON() calls until that
> > > flow starts to die out either because we don't keep adding new calls, OR
> > > we finally fix them all.  Both would be good...
> >
> > Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> > I'd guess one reason is overwhelming system console which can cause a
> > denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?
> 
> My understanding that it is vulnerability only if it can be triggered
> from userspace, otherwise it is a preferred method to give a notice
> that something is very wrong.
> 
> Given the large number of machines that have panic_on_warn, a reliable
> kernel crash that is triggered from userspace is a vulnerability(?).

Yes, and so is a unreliable one :)

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Greg KH @ 2025-08-13 13:53 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Jason Gunthorpe, Vipin Sharma, Pasha Tatashin, jasonmiu, graf,
	changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <mafs07bz7wdfk.fsf@kernel.org>

On Wed, Aug 13, 2025 at 03:37:03PM +0200, Pratyush Yadav wrote:
> On Wed, Aug 13 2025, Greg KH wrote:
> 
> > On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> [...]
> >> Use the warn ons. Make sure they can't be triggered by userspace. Use
> >> them to detect corruption/malfunction in the kernel.
> >> 
> >> In this case if kho_unpreserve_folio() fails in this call chain it
> >> means some error unwind is wrongly happening out of sequence, and we
> >> are now forced to leak memory. Unwind is not something that userspace
> >> should be controlling, so of course we want a WARN_ON here.
> >
> > "should be" is the key here.  And it's not obvious from this patch if
> > that's true or not, which is why I mentioned it.
> >
> > I will keep bringing this up, given the HUGE number of CVEs I keep
> > assigning each week for when userspace hits WARN_ON() calls until that
> > flow starts to die out either because we don't keep adding new calls, OR
> > we finally fix them all.  Both would be good...
> 
> Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> I'd guess one reason is overwhelming system console which can cause a
> denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

If panic_on_warn is set, this will cause the machine to crash/reboot,
which is considered a "vulnerability" by the CVE.org definition.  If a
user can trigger this, it gets a CVE assigned to it.

hope this helps,

greg k-h

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pasha Tatashin @ 2025-08-13 13:49 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Vipin Sharma, jasonmiu, graf, changyuanl, rppt, dmatlack,
	rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, zhangguopeng, linux, linux-kernel, linux-doc, linux-mm,
	gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, lennart, brauner, linux-api, linux-fsdevel,
	saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <mafs0wm77wgjx.fsf@kernel.org>

On Wed, Aug 13, 2025 at 12:29 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>
> Hi Vipin,
>
> Thanks for the review.
>
> On Tue, Aug 12 2025, Vipin Sharma wrote:
>
> > On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> >> From: Pratyush Yadav <ptyadav@amazon.de>
> >> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> >> +                                    unsigned int nr_folios)
> >> +{
> >> +    unsigned int i;
> >> +
> >> +    for (i = 0; i < nr_folios; i++) {
> >> +            const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> >> +            struct folio *folio;
> >> +
> >> +            if (!pfolio->foliodesc)
> >> +                    continue;
> >> +
> >> +            folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> >> +
> >> +            kho_unpreserve_folio(folio);
> >
> > This one is missing WARN_ON_ONCE() similar to the one in
> > memfd_luo_preserve_folios().
>
> Right, will add.
>
> >
> >> +            unpin_folio(folio);
>
> Looking at this code caught my eye. This can also be called from LUO's
> finish callback if no one claimed the memfd after live update. In that
> case, unpin_folio() is going to underflow the pincount or refcount on
> the folio since after the kexec, the folio is no longer pinned. We
> should only be doing folio_put().
>
> I think this function should take a argument to specify which of these
> cases it is dealing with.
>
> >> +    }
> >> +}
> >> +
> >> +static void *memfd_luo_create_fdt(unsigned long size)
> >> +{
> >> +    unsigned int order = get_order(size);
> >> +    struct folio *fdt_folio;
> >> +    int err = 0;
> >> +    void *fdt;
> >> +
> >> +    if (order > MAX_PAGE_ORDER)
> >> +            return NULL;
> >> +
> >> +    fdt_folio = folio_alloc(GFP_KERNEL, order);
> >
> > __GFP_ZERO should also be used here. Otherwise this can lead to
> > unintentional passing of old kernel memory.
>
> fdt_create() zeroes out the buffer so this should not be a problem.

You are right, fdt_create() zeroes the whole buffer, however, I wonder
if it could be `optimized` to only clear only the header part of FDT,
not the rest and this could potentially lead us to send an FDT buffer
that contains both a valid FDT and the trailing bits contain data from
old kernel.

Pasha

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pasha Tatashin @ 2025-08-13 13:41 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Greg KH, Jason Gunthorpe, Vipin Sharma, jasonmiu, graf,
	changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <mafs07bz7wdfk.fsf@kernel.org>

On Wed, Aug 13, 2025 at 1:37 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>
> On Wed, Aug 13 2025, Greg KH wrote:
>
> > On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> [...]
> >> Use the warn ons. Make sure they can't be triggered by userspace. Use
> >> them to detect corruption/malfunction in the kernel.
> >>
> >> In this case if kho_unpreserve_folio() fails in this call chain it
> >> means some error unwind is wrongly happening out of sequence, and we
> >> are now forced to leak memory. Unwind is not something that userspace
> >> should be controlling, so of course we want a WARN_ON here.
> >
> > "should be" is the key here.  And it's not obvious from this patch if
> > that's true or not, which is why I mentioned it.
> >
> > I will keep bringing this up, given the HUGE number of CVEs I keep
> > assigning each week for when userspace hits WARN_ON() calls until that
> > flow starts to die out either because we don't keep adding new calls, OR
> > we finally fix them all.  Both would be good...
>
> Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> I'd guess one reason is overwhelming system console which can cause a
> denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

My understanding that it is vulnerability only if it can be triggered
from userspace, otherwise it is a preferred method to give a notice
that something is very wrong.

Given the large number of machines that have panic_on_warn, a reliable
kernel crash that is triggered from userspace is a vulnerability(?).

Pasha

>
> --
> Regards,
> Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pratyush Yadav @ 2025-08-13 13:37 UTC (permalink / raw)
  To: Greg KH
  Cc: Jason Gunthorpe, Pratyush Yadav, Vipin Sharma, Pasha Tatashin,
	jasonmiu, graf, changyuanl, rppt, dmatlack, rientjes, corbet,
	rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm,
	tj, yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <2025081334-rotten-visible-517a@gregkh>

On Wed, Aug 13 2025, Greg KH wrote:

> On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
[...]
>> Use the warn ons. Make sure they can't be triggered by userspace. Use
>> them to detect corruption/malfunction in the kernel.
>> 
>> In this case if kho_unpreserve_folio() fails in this call chain it
>> means some error unwind is wrongly happening out of sequence, and we
>> are now forced to leak memory. Unwind is not something that userspace
>> should be controlling, so of course we want a WARN_ON here.
>
> "should be" is the key here.  And it's not obvious from this patch if
> that's true or not, which is why I mentioned it.
>
> I will keep bringing this up, given the HUGE number of CVEs I keep
> assigning each week for when userspace hits WARN_ON() calls until that
> flow starts to die out either because we don't keep adding new calls, OR
> we finally fix them all.  Both would be good...

Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
I'd guess one reason is overwhelming system console which can cause a
denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pratyush Yadav @ 2025-08-13 13:31 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Greg KH, Pratyush Yadav, Vipin Sharma, Pasha Tatashin, jasonmiu,
	graf, changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <20250813124140.GA699432@nvidia.com>

On Wed, Aug 13 2025, Jason Gunthorpe wrote:

> On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
>> On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
>> > On Wed, Aug 13 2025, Greg KH wrote:
>> > 
>> > > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
>> > >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> > >> > From: Pratyush Yadav <ptyadav@amazon.de>
>> > >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> > >> > +					unsigned int nr_folios)
>> > >> > +{
>> > >> > +	unsigned int i;
>> > >> > +
>> > >> > +	for (i = 0; i < nr_folios; i++) {
>> > >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> > >> > +		struct folio *folio;
>> > >> > +
>> > >> > +		if (!pfolio->foliodesc)
>> > >> > +			continue;
>> > >> > +
>> > >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> > >> > +
>> > >> > +		kho_unpreserve_folio(folio);
>> > >> 
>> > >> This one is missing WARN_ON_ONCE() similar to the one in
>> > >> memfd_luo_preserve_folios().
>> > >
>> > > So you really want to cause a machine to reboot and get a CVE issued for
>> > > this, if it could be triggered?  That's bold :)
>> > >
>> > > Please don't.  If that can happen, handle the issue and move on, don't
>> > > crash boxes.
>> > 
>> > Why would a WARN() crash the machine? That is what BUG() does, not
>> > WARN().
>> 
>> See 'panic_on_warn' which is enabled in a few billion Linux systems
>> these days :(
>
> This has been discussed so many times already:
>
> https://lwn.net/Articles/969923/
>
> When someone tried to formalize this "don't use WARN_ON" position 
> in the coding-style.rst it was NAK'd:
>
> https://lwn.net/ml/linux-kernel/10af93f8-83f2-48ce-9bc3-80fe4c60082c@redhat.com/
>
> Based on Linus's opposition to the idea:
>
> https://lore.kernel.org/all/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com/
>
> Use the warn ons. Make sure they can't be triggered by userspace. Use
> them to detect corruption/malfunction in the kernel.
>
> In this case if kho_unpreserve_folio() fails in this call chain it
> means some error unwind is wrongly happening out of sequence, and we
> are now forced to leak memory. Unwind is not something that userspace
> should be controlling, so of course we want a WARN_ON here.

Yep. And if we are saying WARN() should never be used then doesn't that
make panic_on_warn a no-op? What is even the point of that option then?

Here, we are unable to unpreserve a folio that we have preserved. This
isn't a normal error that we expect to happen. This should _not_ happen
unless something has gone horribly wrong.

For example, the calls to kho_preserve_folio() don't WARN(), since that
can fail for various reasons. They just return the error up the call
chain. As an analogy, allocating a page can fail, and it is quite
reasonable to expect the code to not throw out WARN()s for that. But if
for some reason you can't free a page that you allocated, this is very
unexpected and should WARN(). Of course, in Linux the page free APIs
don't even return a status, but I hope you get my point.

If I were a system administrator who sets panic_on_warn, I would _want_
the system to crash so no further damage happens and I can collect
logs/crash dumps to investigate later. Without the WARN(), I never get a
chance to debug and my system breaks silently. For all others, the
kernel goes on with some possibly corrupted/broken state.

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Greg KH @ 2025-08-13 13:00 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Pratyush Yadav, Vipin Sharma, Pasha Tatashin, jasonmiu, graf,
	changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <20250813124140.GA699432@nvidia.com>

On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
> > On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> > > On Wed, Aug 13 2025, Greg KH wrote:
> > > 
> > > > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> > > >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> > > >> > From: Pratyush Yadav <ptyadav@amazon.de>
> > > >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> > > >> > +					unsigned int nr_folios)
> > > >> > +{
> > > >> > +	unsigned int i;
> > > >> > +
> > > >> > +	for (i = 0; i < nr_folios; i++) {
> > > >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> > > >> > +		struct folio *folio;
> > > >> > +
> > > >> > +		if (!pfolio->foliodesc)
> > > >> > +			continue;
> > > >> > +
> > > >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> > > >> > +
> > > >> > +		kho_unpreserve_folio(folio);
> > > >> 
> > > >> This one is missing WARN_ON_ONCE() similar to the one in
> > > >> memfd_luo_preserve_folios().
> > > >
> > > > So you really want to cause a machine to reboot and get a CVE issued for
> > > > this, if it could be triggered?  That's bold :)
> > > >
> > > > Please don't.  If that can happen, handle the issue and move on, don't
> > > > crash boxes.
> > > 
> > > Why would a WARN() crash the machine? That is what BUG() does, not
> > > WARN().
> > 
> > See 'panic_on_warn' which is enabled in a few billion Linux systems
> > these days :(
> 
> This has been discussed so many times already:
> 
> https://lwn.net/Articles/969923/
> 
> When someone tried to formalize this "don't use WARN_ON" position 
> in the coding-style.rst it was NAK'd:
> 
> https://lwn.net/ml/linux-kernel/10af93f8-83f2-48ce-9bc3-80fe4c60082c@redhat.com/
> 
> Based on Linus's opposition to the idea:
> 
> https://lore.kernel.org/all/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com/
> 
> Use the warn ons. Make sure they can't be triggered by userspace. Use
> them to detect corruption/malfunction in the kernel.
> 
> In this case if kho_unpreserve_folio() fails in this call chain it
> means some error unwind is wrongly happening out of sequence, and we
> are now forced to leak memory. Unwind is not something that userspace
> should be controlling, so of course we want a WARN_ON here.

"should be" is the key here.  And it's not obvious from this patch if
that's true or not, which is why I mentioned it.

I will keep bringing this up, given the HUGE number of CVEs I keep
assigning each week for when userspace hits WARN_ON() calls until that
flow starts to die out either because we don't keep adding new calls, OR
we finally fix them all.  Both would be good...

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pratyush Yadav @ 2025-08-13 12:44 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: pratyush, jasonmiu, graf, changyuanl, rppt, dmatlack, rientjes,
	corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, zhangguopeng, linux, linux-kernel, linux-doc, linux-mm,
	gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, lennart, brauner, linux-api, linux-fsdevel,
	saeedm, ajayachandra, jgg, parav, leonro, witu, jrhilke
In-Reply-To: <CA+CK2bAP-PWkYtZbw8ofhTgDaW3qoQkNob30wWSjidxEUTV4pg@mail.gmail.com>

On Fri, Aug 08 2025, Pasha Tatashin wrote:

>> +static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
>> +                                    struct folio **folios,
>> +                                    unsigned int nr_folios)
>> +{
>> +       unsigned int i;
>
> Should be 'long i'
>
> Otherwise in err_unpreserve we get into an infinite loop. Thank you
> Josh Hilke for noticing this.

Good catch! Will fix.
>
>> +       int err;
>> +
>> +       for (i = 0; i < nr_folios; i++) {
>> +               struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> +               struct folio *folio = folios[i];
>> +               unsigned int flags = 0;
>> +               unsigned long pfn;
>> +
>> +               err = kho_preserve_folio(folio);
>> +               if (err)
>> +                       goto err_unpreserve;
>> +
>> +               pfn = folio_pfn(folio);
>> +               if (folio_test_dirty(folio))
>> +                       flags |= PRESERVED_FLAG_DIRTY;
>> +               if (folio_test_uptodate(folio))
>> +                       flags |= PRESERVED_FLAG_UPTODATE;
>> +
>> +               pfolio->foliodesc = PRESERVED_FOLIO_MKDESC(pfn, flags);
>> +               pfolio->index = folio->index;
>> +       }
>> +
>> +       return 0;
>> +
>> +err_unpreserve:
>> +       i--;
>> +       for (; i >= 0; i--)
>> +               WARN_ON_ONCE(kho_unpreserve_folio(folios[i]));
>> +       return err;
>> +}
>> +

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 26/30] mm: shmem: use SHMEM_F_* flags instead of VM_* flags
From: Pratyush Yadav @ 2025-08-13 12:42 UTC (permalink / raw)
  To: Vipin Sharma
  Cc: Pasha Tatashin, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <20250811231107.GA2328988.vipinsh@google.com>

On Mon, Aug 11 2025, Vipin Sharma wrote:

> On 2025-08-07 01:44:32, Pasha Tatashin wrote:
>> From: Pratyush Yadav <ptyadav@amazon.de>
>> @@ -3123,7 +3123,9 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
>>  	spin_lock_init(&info->lock);
>>  	atomic_set(&info->stop_eviction, 0);
>>  	info->seals = F_SEAL_SEAL;
>> -	info->flags = flags & VM_NORESERVE;
>> +	info->flags = 0;
>
> This is not needed as the 'info' is being set to 0 just above
> spin_lock_init.
>
>> +	if (flags & VM_NORESERVE)
>> +		info->flags |= SHMEM_F_NORESERVE;
>
> As info->flags will be 0, this can be just direct assignment '='.

I think it is a bit more readable this way.

Anyway, I don't have a strong opinion, so if you insist, I'll change
this.

>
>>  	info->i_crtime = inode_get_mtime(inode);
>>  	info->fsflags = (dir == NULL) ? 0 :
>>  		SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
>> @@ -5862,8 +5864,10 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
>>  /* common code */
>>  
>>  static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
>> -			loff_t size, unsigned long flags, unsigned int i_flags)
>> +				       loff_t size, unsigned long vm_flags,
>> +				       unsigned int i_flags)
>
> Nit: Might be just my editor, but this alignment seems off.

Looks fine for me:
https://gist.github.com/prati0100/a06229ca99cac5aae795fb962bb24ac5

Checkpatch also doesn't complain. Can you double-check? And if it still
looks off, can you describe what's wrong?

>
>>  {
>> +	unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
>>  	struct inode *inode;
>>  	struct file *res;
>>  
>> @@ -5880,7 +5884,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
>>  		return ERR_PTR(-ENOMEM);
>>  
>>  	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
>> -				S_IFREG | S_IRWXUGO, 0, flags);
>> +				S_IFREG | S_IRWXUGO, 0, vm_flags);
>>  	if (IS_ERR(inode)) {
>>  		shmem_unacct_size(flags, size);
>>  		return ERR_CAST(inode);
>> -- 
>> 2.50.1.565.gc32cd1483b-goog
>> 

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Jason Gunthorpe @ 2025-08-13 12:41 UTC (permalink / raw)
  To: Greg KH
  Cc: Pratyush Yadav, Vipin Sharma, Pasha Tatashin, jasonmiu, graf,
	changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, parav,
	leonro, witu
In-Reply-To: <2025081351-tinsel-sprinkler-af77@gregkh>

On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
> On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> > On Wed, Aug 13 2025, Greg KH wrote:
> > 
> > > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> > >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> > >> > From: Pratyush Yadav <ptyadav@amazon.de>
> > >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> > >> > +					unsigned int nr_folios)
> > >> > +{
> > >> > +	unsigned int i;
> > >> > +
> > >> > +	for (i = 0; i < nr_folios; i++) {
> > >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> > >> > +		struct folio *folio;
> > >> > +
> > >> > +		if (!pfolio->foliodesc)
> > >> > +			continue;
> > >> > +
> > >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> > >> > +
> > >> > +		kho_unpreserve_folio(folio);
> > >> 
> > >> This one is missing WARN_ON_ONCE() similar to the one in
> > >> memfd_luo_preserve_folios().
> > >
> > > So you really want to cause a machine to reboot and get a CVE issued for
> > > this, if it could be triggered?  That's bold :)
> > >
> > > Please don't.  If that can happen, handle the issue and move on, don't
> > > crash boxes.
> > 
> > Why would a WARN() crash the machine? That is what BUG() does, not
> > WARN().
> 
> See 'panic_on_warn' which is enabled in a few billion Linux systems
> these days :(

This has been discussed so many times already:

https://lwn.net/Articles/969923/

When someone tried to formalize this "don't use WARN_ON" position 
in the coding-style.rst it was NAK'd:

https://lwn.net/ml/linux-kernel/10af93f8-83f2-48ce-9bc3-80fe4c60082c@redhat.com/

Based on Linus's opposition to the idea:

https://lore.kernel.org/all/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com/

Use the warn ons. Make sure they can't be triggered by userspace. Use
them to detect corruption/malfunction in the kernel.

In this case if kho_unpreserve_folio() fails in this call chain it
means some error unwind is wrongly happening out of sequence, and we
are now forced to leak memory. Unwind is not something that userspace
should be controlling, so of course we want a WARN_ON here.

Jason

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pratyush Yadav @ 2025-08-13 12:29 UTC (permalink / raw)
  To: Vipin Sharma
  Cc: Pasha Tatashin, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, gregkh, tglx, mingo, bp, dave.hansen, x86,
	hpa, rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <20250813063407.GA3182745.vipinsh@google.com>

Hi Vipin,

Thanks for the review.

On Tue, Aug 12 2025, Vipin Sharma wrote:

> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> From: Pratyush Yadav <ptyadav@amazon.de>
>> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> +					unsigned int nr_folios)
>> +{
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < nr_folios; i++) {
>> +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> +		struct folio *folio;
>> +
>> +		if (!pfolio->foliodesc)
>> +			continue;
>> +
>> +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> +
>> +		kho_unpreserve_folio(folio);
>
> This one is missing WARN_ON_ONCE() similar to the one in
> memfd_luo_preserve_folios().

Right, will add.

>
>> +		unpin_folio(folio);

Looking at this code caught my eye. This can also be called from LUO's
finish callback if no one claimed the memfd after live update. In that
case, unpin_folio() is going to underflow the pincount or refcount on
the folio since after the kexec, the folio is no longer pinned. We
should only be doing folio_put().

I think this function should take a argument to specify which of these
cases it is dealing with.

>> +	}
>> +}
>> +
>> +static void *memfd_luo_create_fdt(unsigned long size)
>> +{
>> +	unsigned int order = get_order(size);
>> +	struct folio *fdt_folio;
>> +	int err = 0;
>> +	void *fdt;
>> +
>> +	if (order > MAX_PAGE_ORDER)
>> +		return NULL;
>> +
>> +	fdt_folio = folio_alloc(GFP_KERNEL, order);
>
> __GFP_ZERO should also be used here. Otherwise this can lead to
> unintentional passing of old kernel memory.

fdt_create() zeroes out the buffer so this should not be a problem.

>
>> +static int memfd_luo_prepare(struct liveupdate_file_handler *handler,
>> +			     struct file *file, u64 *data)
>> +{
>> +	struct memfd_luo_preserved_folio *preserved_folios;
>> +	struct inode *inode = file_inode(file);
>> +	unsigned int max_folios, nr_folios = 0;
>> +	int err = 0, preserved_size;
>> +	struct folio **folios;
>> +	long size, nr_pinned;
>> +	pgoff_t offset;
>> +	void *fdt;
>> +	u64 pos;
>> +
>> +	if (WARN_ON_ONCE(!shmem_file(file)))
>> +		return -EINVAL;
>
> This one is only check for shmem_file, whereas in
> memfd_luo_can_preserve() there is check for inode->i_nlink also. Is that
> not needed here?

Actually, this should never happen since the LUO can_preserve() callback
should make sure of this. I think it would be perfectly fine to just
drop this check. I only added it because I was being extra careful.

>
>> +
>> +	inode_lock(inode);
>> +	shmem_i_mapping_freeze(inode, true);
>> +
>> +	size = i_size_read(inode);
>> +	if ((PAGE_ALIGN(size) / PAGE_SIZE) > UINT_MAX) {
>> +		err = -E2BIG;
>> +		goto err_unlock;
>> +	}
>> +
>> +	/*
>> +	 * Guess the number of folios based on inode size. Real number might end
>> +	 * up being smaller if there are higher order folios.
>> +	 */
>> +	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
>> +	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
>
> __GFP_ZERO?

Why? This is only used in this function and gets freed on return. And
the function only looks at the elements that get initialized by
memfd_pin_folios().

>
>> +static int memfd_luo_freeze(struct liveupdate_file_handler *handler,
>> +			    struct file *file, u64 *data)
>> +{
>> +	u64 pos = file->f_pos;
>> +	void *fdt;
>> +	int err;
>> +
>> +	if (WARN_ON_ONCE(!*data))
>> +		return -EINVAL;
>> +
>> +	fdt = phys_to_virt(*data);
>> +
>> +	/*
>> +	 * The pos or size might have changed since prepare. Everything else
>> +	 * stays the same.
>> +	 */
>> +	err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos));
>> +	if (err)
>> +		return err;
>
> Comment is talking about pos and size but code is only updating pos. 

Right. Comment is out of date. size can no longer change since prepare.
So will update the comment.

>
>> +static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data,
>> +			      struct file **file_p)
>> +{
>> +	const struct memfd_luo_preserved_folio *pfolios;
>> +	int nr_pfolios, len, ret = 0, i = 0;
>> +	struct address_space *mapping;
>> +	struct folio *folio, *fdt_folio;
>> +	const u64 *pos, *size;
>> +	struct inode *inode;
>> +	struct file *file;
>> +	const void *fdt;
>> +
>> +	fdt_folio = memfd_luo_get_fdt(data);
>> +	if (!fdt_folio)
>> +		return -ENOENT;
>> +
>> +	fdt = page_to_virt(folio_page(fdt_folio, 0));
>> +
>> +	pfolios = fdt_getprop(fdt, 0, "folios", &len);
>> +	if (!pfolios || len % sizeof(*pfolios)) {
>> +		pr_err("invalid 'folios' property\n");
>
> Print should clearly state that error is because fields is not found or
> len is not multiple of sizeof(*pfolios).

Eh, there is already too much boilerplate one has to write (and read)
for parsing the FDT. Is there really a need for an extra 3-4 lines of
code for _each_ property that is parsed?

Long term, I think we shouldn't be doing this manually anyway. I think
the maintainable path forward is to define a schema for the serialized
data and have a parser that takes in the schema and gives out a parsed
struct, doing all sorts of checks in the process.

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Greg KH @ 2025-08-13 12:14 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Vipin Sharma, Pasha Tatashin, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, tglx, mingo, bp, dave.hansen, x86, hpa,
	rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <mafs01ppfxwe8.fsf@kernel.org>

On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> On Wed, Aug 13 2025, Greg KH wrote:
> 
> > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> >> > From: Pratyush Yadav <ptyadav@amazon.de>
> >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> >> > +					unsigned int nr_folios)
> >> > +{
> >> > +	unsigned int i;
> >> > +
> >> > +	for (i = 0; i < nr_folios; i++) {
> >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> >> > +		struct folio *folio;
> >> > +
> >> > +		if (!pfolio->foliodesc)
> >> > +			continue;
> >> > +
> >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> >> > +
> >> > +		kho_unpreserve_folio(folio);
> >> 
> >> This one is missing WARN_ON_ONCE() similar to the one in
> >> memfd_luo_preserve_folios().
> >
> > So you really want to cause a machine to reboot and get a CVE issued for
> > this, if it could be triggered?  That's bold :)
> >
> > Please don't.  If that can happen, handle the issue and move on, don't
> > crash boxes.
> 
> Why would a WARN() crash the machine? That is what BUG() does, not
> WARN().

See 'panic_on_warn' which is enabled in a few billion Linux systems
these days :(

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Pratyush Yadav @ 2025-08-13 12:02 UTC (permalink / raw)
  To: Greg KH
  Cc: Vipin Sharma, Pasha Tatashin, pratyush, jasonmiu, graf,
	changyuanl, rppt, dmatlack, rientjes, corbet, rdunlap,
	ilpo.jarvinen, kanie, ojeda, aliceryhl, masahiroy, akpm, tj,
	yoann.congal, mmaurer, roman.gushchin, chenridong, axboe,
	mark.rutland, jannh, vincent.guittot, hannes, dan.j.williams,
	david, joel.granados, rostedt, anna.schumaker, song, zhangguopeng,
	linux, linux-kernel, linux-doc, linux-mm, tglx, mingo, bp,
	dave.hansen, x86, hpa, rafael, dakr, bartosz.golaszewski,
	cw00.choi, myungjoo.ham, yesanishhere, Jonathan.Cameron,
	quic_zijuhu, aleksander.lobakin, ira.weiny, andriy.shevchenko,
	leon, lukas, bhelgaas, wagi, djeffery, stuart.w.hayes, lennart,
	brauner, linux-api, linux-fsdevel, saeedm, ajayachandra, jgg,
	parav, leonro, witu
In-Reply-To: <2025081310-custodian-ashamed-3104@gregkh>

On Wed, Aug 13 2025, Greg KH wrote:

> On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
>> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> > From: Pratyush Yadav <ptyadav@amazon.de>
>> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> > +					unsigned int nr_folios)
>> > +{
>> > +	unsigned int i;
>> > +
>> > +	for (i = 0; i < nr_folios; i++) {
>> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> > +		struct folio *folio;
>> > +
>> > +		if (!pfolio->foliodesc)
>> > +			continue;
>> > +
>> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> > +
>> > +		kho_unpreserve_folio(folio);
>> 
>> This one is missing WARN_ON_ONCE() similar to the one in
>> memfd_luo_preserve_folios().
>
> So you really want to cause a machine to reboot and get a CVE issued for
> this, if it could be triggered?  That's bold :)
>
> Please don't.  If that can happen, handle the issue and move on, don't
> crash boxes.

Why would a WARN() crash the machine? That is what BUG() does, not
WARN().

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Greg KH @ 2025-08-13  7:09 UTC (permalink / raw)
  To: Vipin Sharma
  Cc: Pasha Tatashin, pratyush, jasonmiu, graf, changyuanl, rppt,
	dmatlack, rientjes, corbet, rdunlap, ilpo.jarvinen, kanie, ojeda,
	aliceryhl, masahiroy, akpm, tj, yoann.congal, mmaurer,
	roman.gushchin, chenridong, axboe, mark.rutland, jannh,
	vincent.guittot, hannes, dan.j.williams, david, joel.granados,
	rostedt, anna.schumaker, song, zhangguopeng, linux, linux-kernel,
	linux-doc, linux-mm, tglx, mingo, bp, dave.hansen, x86, hpa,
	rafael, dakr, bartosz.golaszewski, cw00.choi, myungjoo.ham,
	yesanishhere, Jonathan.Cameron, quic_zijuhu, aleksander.lobakin,
	ira.weiny, andriy.shevchenko, leon, lukas, bhelgaas, wagi,
	djeffery, stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <20250813063407.GA3182745.vipinsh@google.com>

On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> > From: Pratyush Yadav <ptyadav@amazon.de>
> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> > +					unsigned int nr_folios)
> > +{
> > +	unsigned int i;
> > +
> > +	for (i = 0; i < nr_folios; i++) {
> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> > +		struct folio *folio;
> > +
> > +		if (!pfolio->foliodesc)
> > +			continue;
> > +
> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> > +
> > +		kho_unpreserve_folio(folio);
> 
> This one is missing WARN_ON_ONCE() similar to the one in
> memfd_luo_preserve_folios().

So you really want to cause a machine to reboot and get a CVE issued for
this, if it could be triggered?  That's bold :)

Please don't.  If that can happen, handle the issue and move on, don't
crash boxes.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v3 29/30] luo: allow preserving memfd
From: Vipin Sharma @ 2025-08-13  6:34 UTC (permalink / raw)
  To: Pasha Tatashin
  Cc: pratyush, jasonmiu, graf, changyuanl, rppt, dmatlack, rientjes,
	corbet, rdunlap, ilpo.jarvinen, kanie, ojeda, aliceryhl,
	masahiroy, akpm, tj, yoann.congal, mmaurer, roman.gushchin,
	chenridong, axboe, mark.rutland, jannh, vincent.guittot, hannes,
	dan.j.williams, david, joel.granados, rostedt, anna.schumaker,
	song, zhangguopeng, linux, linux-kernel, linux-doc, linux-mm,
	gregkh, tglx, mingo, bp, dave.hansen, x86, hpa, rafael, dakr,
	bartosz.golaszewski, cw00.choi, myungjoo.ham, yesanishhere,
	Jonathan.Cameron, quic_zijuhu, aleksander.lobakin, ira.weiny,
	andriy.shevchenko, leon, lukas, bhelgaas, wagi, djeffery,
	stuart.w.hayes, ptyadav, lennart, brauner, linux-api,
	linux-fsdevel, saeedm, ajayachandra, jgg, parav, leonro, witu
In-Reply-To: <20250807014442.3829950-30-pasha.tatashin@soleen.com>

On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> From: Pratyush Yadav <ptyadav@amazon.de>
> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> +					unsigned int nr_folios)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < nr_folios; i++) {
> +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> +		struct folio *folio;
> +
> +		if (!pfolio->foliodesc)
> +			continue;
> +
> +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> +
> +		kho_unpreserve_folio(folio);

This one is missing WARN_ON_ONCE() similar to the one in
memfd_luo_preserve_folios().

> +		unpin_folio(folio);
> +	}
> +}
> +
> +static void *memfd_luo_create_fdt(unsigned long size)
> +{
> +	unsigned int order = get_order(size);
> +	struct folio *fdt_folio;
> +	int err = 0;
> +	void *fdt;
> +
> +	if (order > MAX_PAGE_ORDER)
> +		return NULL;
> +
> +	fdt_folio = folio_alloc(GFP_KERNEL, order);

__GFP_ZERO should also be used here. Otherwise this can lead to
unintentional passing of old kernel memory.

> +static int memfd_luo_prepare(struct liveupdate_file_handler *handler,
> +			     struct file *file, u64 *data)
> +{
> +	struct memfd_luo_preserved_folio *preserved_folios;
> +	struct inode *inode = file_inode(file);
> +	unsigned int max_folios, nr_folios = 0;
> +	int err = 0, preserved_size;
> +	struct folio **folios;
> +	long size, nr_pinned;
> +	pgoff_t offset;
> +	void *fdt;
> +	u64 pos;
> +
> +	if (WARN_ON_ONCE(!shmem_file(file)))
> +		return -EINVAL;

This one is only check for shmem_file, whereas in
memfd_luo_can_preserve() there is check for inode->i_nlink also. Is that
not needed here?

> +
> +	inode_lock(inode);
> +	shmem_i_mapping_freeze(inode, true);
> +
> +	size = i_size_read(inode);
> +	if ((PAGE_ALIGN(size) / PAGE_SIZE) > UINT_MAX) {
> +		err = -E2BIG;
> +		goto err_unlock;
> +	}
> +
> +	/*
> +	 * Guess the number of folios based on inode size. Real number might end
> +	 * up being smaller if there are higher order folios.
> +	 */
> +	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
> +	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);

__GFP_ZERO?

> +static int memfd_luo_freeze(struct liveupdate_file_handler *handler,
> +			    struct file *file, u64 *data)
> +{
> +	u64 pos = file->f_pos;
> +	void *fdt;
> +	int err;
> +
> +	if (WARN_ON_ONCE(!*data))
> +		return -EINVAL;
> +
> +	fdt = phys_to_virt(*data);
> +
> +	/*
> +	 * The pos or size might have changed since prepare. Everything else
> +	 * stays the same.
> +	 */
> +	err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos));
> +	if (err)
> +		return err;

Comment is talking about pos and size but code is only updating pos. 

> +static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data,
> +			      struct file **file_p)
> +{
> +	const struct memfd_luo_preserved_folio *pfolios;
> +	int nr_pfolios, len, ret = 0, i = 0;
> +	struct address_space *mapping;
> +	struct folio *folio, *fdt_folio;
> +	const u64 *pos, *size;
> +	struct inode *inode;
> +	struct file *file;
> +	const void *fdt;
> +
> +	fdt_folio = memfd_luo_get_fdt(data);
> +	if (!fdt_folio)
> +		return -ENOENT;
> +
> +	fdt = page_to_virt(folio_page(fdt_folio, 0));
> +
> +	pfolios = fdt_getprop(fdt, 0, "folios", &len);
> +	if (!pfolios || len % sizeof(*pfolios)) {
> +		pr_err("invalid 'folios' property\n");

Print should clearly state that error is because fields is not found or
len is not multiple of sizeof(*pfolios).


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox