Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v3 1/2] clone3: add CLONE_CLEAR_SIGHAND
From: Christian Brauner @ 2019-10-14 10:45 UTC (permalink / raw)
  To: linux-kernel, Oleg Nesterov, Florian Weimer, Arnd Bergmann,
	libc-alpha
  Cc: David Howells, Jann Horn, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Shuah Khan, Andrew Morton, Michal Hocko,
	Elena Reshetova, Thomas Gleixner, Roman Gushchin,
	Andrea Arcangeli, Al Viro, Aleksa Sarai, Dmitry V. Levin,
	linux-kselftest

Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL.
Mutually exclusive with CLONE_SIGHAND to not disturb other thread's
signal handler.

In the spirit of closer cooperation between glibc developers and kernel
developers (cf. [2]) this patchset came out of a discussion on the glibc
mailing list for improving posix_spawn() (cf. [1], [3], [4]). Kernel
support for this feature has been explicitly requested by glibc and I
see no reason not to help them with this.

The child helper process on Linux posix_spawn must ensure that no signal
handlers are enabled, so the signal disposition must be either SIG_DFL
or SIG_IGN. However, it requires a sigprocmask to obtain the current
signal mask and at least _NSIG sigaction calls to reset the signal
handlers for each posix_spawn call or complex state tracking that might
lead to data corruption in glibc. Adding this flags lets glibc avoid
these problems.

[1]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00149.html
[3]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00158.html
[4]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00160.html
[2]: https://lwn.net/Articles/799331/
     '[...] by asking for better cooperation with the C-library projects
     in general. They should be copied on patches containing ABI
     changes, for example. I noted that there are often times where
     C-library developers wish the kernel community had done things
     differently; how could those be avoided in the future? Members of
     the audience suggested that more glibc developers should perhaps
     join the linux-api list. The other suggestion was to "copy Florian
     on everything".'
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: libc-alpha@sourceware.org
Cc: linux-api@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
Link: https://lore.kernel.org/r/20191010133518.5420-1-christian.brauner@ubuntu.com

/* v2 */
Link: https://lore.kernel.org/r/20191011102537.27502-1-christian.brauner@ubuntu.com
- Florian Weimer <fweimer@redhat.com>:
  - update comment in clone3_args_valid()

/* v3 */
- "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>:
  - s/CLONE3_CLEAR_SIGHAND/CLONE_CLEAR_SIGHAND/g
---
 include/uapi/linux/sched.h |  3 +++
 kernel/fork.c              | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 99335e1f4a27..1d500ed03c63 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -33,6 +33,9 @@
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
 #define CLONE_IO		0x80000000	/* Clone io context */
 
+/* Flags for the clone3() syscall. */
+#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
+
 #ifndef __ASSEMBLY__
 /**
  * struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/fork.c b/kernel/fork.c
index 1f6c45f6a734..aa5b5137f071 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 	spin_lock_irq(&current->sighand->siglock);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	spin_unlock_irq(&current->sighand->siglock);
+
+	/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
+	if (clone_flags & CLONE_CLEAR_SIGHAND)
+		flush_signal_handlers(tsk, 0);
+
 	return 0;
 }
 
@@ -2563,11 +2568,8 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 
 static bool clone3_args_valid(const struct kernel_clone_args *kargs)
 {
-	/*
-	 * All lower bits of the flag word are taken.
-	 * Verify that no other unknown flags are passed along.
-	 */
-	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+	/* Verify that no unknown flags are passed along. */
+	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
 		return false;
 
 	/*
@@ -2577,6 +2579,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
 	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
 		return false;
 
+	if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
+	    (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
+		return false;
+
 	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
 	    kargs->exit_signal)
 		return false;
-- 
2.23.0

^ permalink raw reply related

* [PATCH v3 2/2] tests: test CLONE_CLEAR_SIGHAND
From: Christian Brauner @ 2019-10-14 10:45 UTC (permalink / raw)
  To: linux-kernel, Oleg Nesterov, Florian Weimer, Arnd Bergmann,
	libc-alpha
  Cc: David Howells, Jann Horn, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Shuah Khan, Andrew Morton, Michal Hocko,
	Elena Reshetova, Thomas Gleixner, Roman Gushchin,
	Andrea Arcangeli, Al Viro, Aleksa Sarai, Dmitry V. Levin,
	linux-kselftest
In-Reply-To: <20191014104538.3096-1-christian.brauner@ubuntu.com>

Test that CLONE_CLEAR_SIGHAND resets signal handlers to SIG_DFL for the
child process and that CLONE_CLEAR_SIGHAND and CLONE_SIGHAND are
mutually exclusive.

Cc: Florian Weimer <fweimer@redhat.com>
Cc: libc-alpha@sourceware.org
Cc: linux-api@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
Link: https://lore.kernel.org/r/20191010133518.5420-2-christian.brauner@ubuntu.com

/* v2 */
Link: https://lore.kernel.org/r/20191011102537.27502-2-christian.brauner@ubuntu.com
- Christian Brauner <christian.brauner@ubuntu.com>:
  - remove unused variable
  - reuse variable in child process instead od declaring a new one
  - move check for mutual exclusivity of CLONE_SIGHAND and
    CLONE3_CLEAR_SIGHAND to top of test before setting up signal
    handlers
  - rename variables

/* v3 */
- Christian Brauner <christian.brauner@ubuntu.com>:
  - s/CLONE3_CLEAR_SIGHAND/CLONE_CLEAR_SIGHAND/g
---
 MAINTAINERS                                   |   1 +
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/clone3/.gitignore     |   1 +
 tools/testing/selftests/clone3/Makefile       |   7 +
 .../selftests/clone3/clone3_clear_sighand.c   | 172 ++++++++++++++++++
 5 files changed, 182 insertions(+)
 create mode 100644 tools/testing/selftests/clone3/.gitignore
 create mode 100644 tools/testing/selftests/clone3/Makefile
 create mode 100644 tools/testing/selftests/clone3/clone3_clear_sighand.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 55199ef7fa74..582275d85607 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12828,6 +12828,7 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
 F:	samples/pidfd/
 F:	tools/testing/selftests/pidfd/
+F:	tools/testing/selftests/clone3/
 K:	(?i)pidfd
 K:	(?i)clone3
 K:	\b(clone_args|kernel_clone_args)\b
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index c3feccb99ff5..6bf7aeb47650 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += bpf
 TARGETS += breakpoints
 TARGETS += capabilities
 TARGETS += cgroup
+TARGETS += clone3
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore
new file mode 100644
index 000000000000..6c9f98097774
--- /dev/null
+++ b/tools/testing/selftests/clone3/.gitignore
@@ -0,0 +1 @@
+clone3_clear_sighand
diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile
new file mode 100644
index 000000000000..3ecd56ebc99d
--- /dev/null
+++ b/tools/testing/selftests/clone3/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -g -I../../../../usr/include/
+
+TEST_GEN_PROGS := clone3_clear_sighand
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c
new file mode 100644
index 000000000000..0d957be1bdc5
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+
+#ifndef CLONE_CLEAR_SIGHAND
+#define CLONE_CLEAR_SIGHAND 0x100000000ULL
+#endif
+
+#ifndef __NR_clone3
+#define __NR_clone3 -1
+struct clone_args {
+	__aligned_u64 flags;
+	__aligned_u64 pidfd;
+	__aligned_u64 child_tid;
+	__aligned_u64 parent_tid;
+	__aligned_u64 exit_signal;
+	__aligned_u64 stack;
+	__aligned_u64 stack_size;
+	__aligned_u64 tls;
+};
+#endif
+
+static pid_t sys_clone3(struct clone_args *args, size_t size)
+{
+	return syscall(__NR_clone3, args, size);
+}
+
+static void test_clone3_supported(void)
+{
+	pid_t pid;
+	struct clone_args args = {};
+
+	if (__NR_clone3 < 0)
+		ksft_exit_skip("clone3() syscall is not supported\n");
+
+	/* Set to something that will always cause EINVAL. */
+	args.exit_signal = -1;
+	pid = sys_clone3(&args, sizeof(args));
+	if (!pid)
+		exit(EXIT_SUCCESS);
+
+	if (pid > 0) {
+		wait(NULL);
+		ksft_exit_fail_msg(
+			"Managed to create child process with invalid exit_signal\n");
+	}
+
+	if (errno == ENOSYS)
+		ksft_exit_skip("clone3() syscall is not supported\n");
+
+	ksft_print_msg("clone3() syscall supported\n");
+}
+
+static void nop_handler(int signo)
+{
+}
+
+static int wait_for_pid(pid_t pid)
+{
+	int status, ret;
+
+again:
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		if (errno == EINTR)
+			goto again;
+
+		return -1;
+	}
+
+	if (!WIFEXITED(status))
+		return -1;
+
+	return WEXITSTATUS(status);
+}
+
+static void test_clone3_clear_sighand(void)
+{
+	int ret;
+	pid_t pid;
+	struct clone_args args = {};
+	struct sigaction act;
+
+	/*
+	 * Check that CLONE_CLEAR_SIGHAND and CLONE_SIGHAND are mutually
+	 * exclusive.
+	 */
+	args.flags |= CLONE_CLEAR_SIGHAND | CLONE_SIGHAND;
+	args.exit_signal = SIGCHLD;
+	pid = sys_clone3(&args, sizeof(args));
+	if (pid > 0)
+		ksft_exit_fail_msg(
+			"clone3(CLONE_CLEAR_SIGHAND | CLONE_SIGHAND) succeeded\n");
+
+	act.sa_handler = nop_handler;
+	ret = sigemptyset(&act.sa_mask);
+	if (ret < 0)
+		ksft_exit_fail_msg("%s - sigemptyset() failed\n",
+				   strerror(errno));
+
+	act.sa_flags = 0;
+
+	/* Register signal handler for SIGUSR1 */
+	ret = sigaction(SIGUSR1, &act, NULL);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s - sigaction(SIGUSR1, &act, NULL) failed\n",
+			strerror(errno));
+
+	/* Register signal handler for SIGUSR2 */
+	ret = sigaction(SIGUSR2, &act, NULL);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s - sigaction(SIGUSR2, &act, NULL) failed\n",
+			strerror(errno));
+
+	/* Check that CLONE_CLEAR_SIGHAND works. */
+	args.flags = CLONE_CLEAR_SIGHAND;
+	pid = sys_clone3(&args, sizeof(args));
+	if (pid < 0)
+		ksft_exit_fail_msg("%s - clone3(CLONE_CLEAR_SIGHAND) failed\n",
+				   strerror(errno));
+
+	if (pid == 0) {
+		ret = sigaction(SIGUSR1, NULL, &act);
+		if (ret < 0)
+			exit(EXIT_FAILURE);
+
+		if (act.sa_handler != SIG_DFL)
+			exit(EXIT_FAILURE);
+
+		ret = sigaction(SIGUSR2, NULL, &act);
+		if (ret < 0)
+			exit(EXIT_FAILURE);
+
+		if (act.sa_handler != SIG_DFL)
+			exit(EXIT_FAILURE);
+
+		exit(EXIT_SUCCESS);
+	}
+
+	ret = wait_for_pid(pid);
+	if (ret)
+		ksft_exit_fail_msg(
+			"Failed to clear signal handler for child process\n");
+
+	ksft_test_result_pass("Cleared signal handlers for child process\n");
+}
+
+int main(int argc, char **argv)
+{
+	ksft_print_header();
+	ksft_set_plan(1);
+
+	test_clone3_supported();
+	test_clone3_clear_sighand();
+
+	return ksft_exit_pass();
+}
-- 
2.23.0

^ permalink raw reply related

* Re: [PATCH] pidfd: add NSpid entries to fdinfo
From: Jann Horn @ 2019-10-14 15:09 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andrea Arcangeli, Andrew Morton, Christian Kellner,
	Christian Kellner, Aleksa Sarai, Elena Reshetova, Roman Gushchin,
	Dmitry V. Levin, Linux API, kernel list, Michal Hocko,
	Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Al Viro
In-Reply-To: <20191012101922.24168-1-christian.brauner@ubuntu.com>

On Sat, Oct 12, 2019 at 12:19 PM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
> Currently, the fdinfo file of contains the field Pid:

nit: something missing after "of"?

> It contains the pid a given pidfd refers to in the pid namespace of the
> opener's procfs instance.

s/opener's // here and in various places below? "the opener's" is
kinda misleading since it sounds as if it has something to do with
task identity.

> If the pid namespace of the process is not a descendant of the pid
> namespace of the procfs instance 0 will be shown as its pid. This is
> similar to calling getppid() on a process who's parent is out of it's

nit: s/who's/whose/

nit: s/it's/its/

> pid namespace (e.g. when moving a process into a sibling pid namespace
> via setns()).

You can't move a process into a PID namespace via setns(), you can
only set the PID namespace for its children; and even then, you can't
set that to a sibling PID namespace, you can only move into descendant
PID namespaces.

> Add an NSpid field for easy retrieval of the pid in all descendant pid
> namespaces:
> If pid namespaces are supported this field will contain the pid a given
> pidfd refers to for all descendant pid namespaces starting from the
> current pid namespace of the opener's procfs instance, i.e. the first

s/current // - neither tasks nor procfs instances can change which pid
namespace they're associated with

> pid entry for Pid and NSpid will be identical.

*the Pid field and the first entry in the NSpid field?

> If the pid namespace of the process is not a descendant of the pid
> namespace of the procfs instance 0 will be shown as its first NSpid and
> no other NSpid entries will be shown.
> Note that this differs from the Pid and NSpid fields in
> /proc/<pid>/status where Pid and NSpid are always shown relative to the
> pid namespace of the opener's procfs instace. The difference becomes
> obvious when sending around a pidfd between pid namespaces from
> different trees, i.e. where no ancestoral relation is present between
> the pid namespaces:
> 1. sending around pidfd:
> - create two new pid namespaces ns1 and ns2 in the initial pid namespace
>   (Also take care to create new mount namespaces in the new pid
>   namespace and mount procfs.)
> - create a process with a pidfd in ns1
> - send pidfd from ns1 to ns2
> - read /proc/self/fdinfo/<pidfd> and observe that Pid and NSpid entry
>   are 0
> - create a process with a pidfd in

truncated phrase?

> - open a pidfd for a process in the initial pid namespace
> 2. sending around /proc/<pid>/status fd:
> - create two new pid namespaces ns1 and ns2 in the initial pid namespace
>   (Also take care to create new mount namespaces in the new pid
>   namespace and mount procfs.)
> - create a process in ns1
> - open /proc/<pid>/status in the initial pid namespace for the process
>   you created in ns1
> - send statusfd from initial pid namespace to ns2
> - read statusfd and observe:
>   - that Pid will contain the pid of the process as seen from the init
>   - that NSpid will contain the pids of the process for all descendant
>     pid namespaces starting from the initial pid namespace

You don't need fd passing for case 2, you can just look at the procfs
instance that's mounted in the initial mount namespace. Using fd
passing in this example kinda obfuscates what's going on, in my
opinion.


> +/**
> + * pidfd_show_fdinfo - print information about a pidfd
> + * @m: proc fdinfo file
> + * @f: file referencing a pidfd
> + *
> + * Pid:
> + * This function will print the pid a given pidfd refers to in the pid
> + * namespace of the opener's procfs instance.
> + * If the pid namespace of the process is not a descendant of the pid
> + * namespace of the procfs instance 0 will be shown as its pid. This is
> + * similar to calling getppid() on a process who's parent is out of it's
> + * pid namespace (e.g. when moving a process into a sibling pid namespace
> + * via setns()).
> + * NSpid:
> + * If pid namespaces are supported then this function will also print the
> + * pid a given pidfd refers to for all descendant pid namespaces starting
> + * from the current pid namespace of the opener's procfs instance, i.e. the
> + * first pid entry for Pid and NSpid will be identical.
> + * If the pid namespace of the process is not a descendant of the pid
> + * namespace of the procfs instance 0 will be shown as its first NSpid and
> + * no other NSpid entries will be shown.
> + * Note that this differs from the Pid and NSpid fields in
> + * /proc/<pid>/status where Pid and NSpid are always shown relative to the
> + * pid namespace of the opener's procfs instace. The difference becomes
> + * obvious when sending around a pidfd between pid namespaces from
> + * different trees, i.e. where no ancestoral relation is present between
> + * the pid namespaces:
> + * 1. sending around pidfd:
> + * - create two new pid namespaces ns1 and ns2 in the initial pid namespace
> + *   (Also take care to create new mount namespaces in the new pid
> + *   namespace and mount procfs.)
> + * - create a process with a pidfd in ns1
> + * - send pidfd from ns1 to ns2
> + * - read /proc/self/fdinfo/<pidfd> and observe that Pid and NSpid entry
> + *   are 0
> + * - create a process with a pidfd in
> + * - open a pidfd for a process in the initial pid namespace
> + * 2. sending around /proc/<pid>/status fd:
> + * - create two new pid namespaces ns1 and ns2 in the initial pid namespace
> + *   (Also take care to create new mount namespaces in the new pid
> + *   namespace and mount procfs.)
> + * - create a process in ns1
> + * - open /proc/<pid>/status in the initial pid namespace for the process
> + *   you created in ns1
> + * - send statusfd from initial pid namespace to ns2
> + * - read statusfd and observe:
> + *   - that Pid will contain the pid of the process as seen from the init
> + *   - that NSpid will contain the pids of the process for all descendant
> + *     pid namespaces starting from the initial pid namespace
> + */

See above, same problems as in the commit message. Actually, since you
have a big comment block with this text, there's no reason to repeat
it in the commit message, right?

(By the way, only slightly related to this patch: At the moment, if
you have a pidfd to a task that has already been reaped, and the PID
has been reallocated to another task, the pidfd will still show up in
/proc/*/fdinfo/* as if it referred to a non-dead process, right? Might
be worth considering whether you want to probe ->tasks[PIDTYPE_PID] or
->tasks[PIDTYPE_TGID] for NULL-ness when printing fdinfo, or something
like that.)

^ permalink raw reply

* Re: [PATCH] pidfd: add NSpid entries to fdinfo
From: Jann Horn @ 2019-10-14 15:10 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Christian Kellner, Andrea Arcangeli, Andrew Morton, Aleksa Sarai,
	Elena Reshetova, Roman Gushchin, Dmitry V. Levin, Linux API,
	kernel list, Michal Hocko, Ingo Molnar, Peter Zijlstra,
	Thomas Gleixner, Al Viro
In-Reply-To: <20191014103157.h2wph2ujjidsrhyw@wittgenstein>

On Mon, Oct 14, 2019 at 12:32 PM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
> On Mon, Oct 14, 2019 at 11:43:01AM +0200, Christian Kellner wrote:
> > On Sat, 2019-10-12 at 12:21 +0200, Christian Brauner wrote:
> > > I tried to think of cases where the first entry of Pid is not
> > > identical
> > > to the first entry of NSpid but I came up with none. Maybe you do,
> > > Jann?
> > Yeah, I don't think that can be the case. By looking at the source of
> > 'pid_nr_ns(pid, ns)' a non-zero return means that a) 'pid' valid, ie.
> > non-null and b) 'ns' is in the pid namespace hierarchy of 'pid' (at
> > pid->level, i.e. "pid->numbers[ns->level].ns == ns").

Agreed.

> You could probably do:
>
> #ifdef CONFIG_PID_NS
> seq_put_decimal_ull(m, "\nNSpid:\t", nr);
> for (i = ns->level + 1; i <= pid->level && nr; i++)
>         seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
> #endif

Personally, I dislike hiding the precondition for running the loop in
the loop statement like that. While it makes the code more concise, it
somewhat obfuscates the high-level logic at a first glance.

^ permalink raw reply

* Re: [PATCH] pidfd: add NSpid entries to fdinfo
From: Christian Kellner @ 2019-10-14 15:20 UTC (permalink / raw)
  To: Jann Horn, Christian Brauner
  Cc: Andrea Arcangeli, Andrew Morton, Aleksa Sarai, Elena Reshetova,
	Roman Gushchin, Dmitry V. Levin, Linux API, kernel list,
	Michal Hocko, Ingo Molnar, Peter Zijlstra, Thomas Gleixner,
	Al Viro
In-Reply-To: <CAG48ez1Yf05j2irdZxT2TA=2n3xo25KN48nRQdp_F8j14K36rA@mail.gmail.com>

On Mon, 2019-10-14 at 17:10 +0200, Jann Horn wrote:
> On Mon, Oct 14, 2019 at 12:32 PM Christian Brauner wrote:
> > On Mon, Oct 14, 2019 at 11:43:01AM +0200, Christian Kellner wrote:
> > You could probably do:
> > 
> > #ifdef CONFIG_PID_NS
> > seq_put_decimal_ull(m, "\nNSpid:\t", nr);
> > for (i = ns->level + 1; i <= pid->level && nr; i++)
> >         seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
> > #endif
> 
> Personally, I dislike hiding the precondition for running the loop in
> the loop statement like that. While it makes the code more concise,
> it somewhat obfuscates the high-level logic at a first glance.
I agree and it has the side-effect of needing another #ifdef at the end
of the variable block for "i". I think I will go with:

if (nr) {
	int i;

	/* If nr is non-zero it means that 'pid' is valid and that
	 * ns, i.e. the pid namespace associated with the procfs
	 * instance, is in the pid namespace hierarchy of pid.
	 * Start at one below the already printed level.
	 */
	for (i = ns->level + 1; i <= pid->level; i++)
		seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
}

I will re-work the comment block and then send a new version of
the patch.

Thanks,
Christian

^ permalink raw reply

* Re: [PATCH 1/7] Add a new flags-accepting interface for anonymous inodes
From: Jann Horn @ 2019-10-14 15:38 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Linux API, kernel list, Lokesh Gidra, Nick Kralevich, nosh,
	Tim Murray
In-Reply-To: <20191012191602.45649-2-dancol@google.com>

On Sat, Oct 12, 2019 at 9:16 PM Daniel Colascione <dancol@google.com> wrote:
> Add functions forwarding from the old names to the new ones so we
> don't need to change any callers.

This patch does more than the commit message says; it also refactors
the body of the function. (I would've moved that refactoring over into
patch 2, but I guess this works, too.)

[...]
> -struct file *anon_inode_getfile(const char *name,
> -                               const struct file_operations *fops,
> -                               void *priv, int flags)
> +struct file *anon_inode_getfile2(const char *name,
> +                                const struct file_operations *fops,
> +                                void *priv, int flags, int anon_inode_flags)

(AFAIK, normal kernel style is to slap a "__" prefix in front of the
function name instead of appending a digit, but I guess it doesn't
really matter.)

>  {
> +       struct inode *inode;
>         struct file *file;
>
> -       if (IS_ERR(anon_inode_inode))
> -               return ERR_PTR(-ENODEV);
> -
> -       if (fops->owner && !try_module_get(fops->owner))
> -               return ERR_PTR(-ENOENT);
> +       if (anon_inode_flags)
> +               return ERR_PTR(-EINVAL);
>
> +       inode = anon_inode_inode;
> +       if (IS_ERR(inode))
> +               return ERR_PTR(-ENODEV);
>         /*
> -        * We know the anon_inode inode count is always greater than zero,
> -        * so ihold() is safe.
> +        * We know the anon_inode inode count is always
> +        * greater than zero, so ihold() is safe.
>          */

This looks like maybe you started editing the comment, then un-did the
change, but left the modified line wrapping in your patch? Please
avoid that - code changes with no real reason make "git blame" output
more annoying and create trouble when porting patches between kernel
versions.

[...]
>  EXPORT_SYMBOL_GPL(anon_inode_getfile);
> +EXPORT_SYMBOL_GPL(anon_inode_getfile2);
[...]
>  EXPORT_SYMBOL_GPL(anon_inode_getfd);
> +EXPORT_SYMBOL_GPL(anon_inode_getfd2);

Since anon_inode_getfd() is now a static inline function in
include/linux/anon_inodes.h, exporting it doesn't make sense anymore.
Same for anon_inode_getfile().

[...]
> +#define ANON_INODE_SECURE 1

That #define belongs in a later patch, right?

^ permalink raw reply

* Re: [PATCH 3/7] Add a UFFD_SECURE flag to the userfaultfd API.
From: Jann Horn @ 2019-10-14 16:04 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Colascione, Linus Torvalds, Andrea Arcangeli,
	Pavel Emelyanov, Linux API, LKML, Lokesh Gidra, Nick Kralevich,
	Nosh Minwalla, Tim Murray
In-Reply-To: <CALCETrUyq=J37gU-MYXqLdoi7uH7iNNVRjvcGUT11JA1QuTFyg@mail.gmail.com>

On Sun, Oct 13, 2019 at 3:14 AM Andy Lutomirski <luto@kernel.org> wrote:
> [adding more people because this is going to be an ABI break, sigh]
> On Sat, Oct 12, 2019 at 5:52 PM Daniel Colascione <dancol@google.com> wrote:
> > On Sat, Oct 12, 2019 at 4:10 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > On Sat, Oct 12, 2019 at 12:16 PM Daniel Colascione <dancol@google.com> wrote:
> > > > The new secure flag makes userfaultfd use a new "secure" anonymous
> > > > file object instead of the default one, letting security modules
> > > > supervise userfaultfd use.
> > > >
> > > > Requiring that users pass a new flag lets us avoid changing the
> > > > semantics for existing callers.
> > >
> > > Is there any good reason not to make this be the default?
> > >
> > >
> > > The only downside I can see is that it would increase the memory usage
> > > of userfaultfd(), but that doesn't seem like such a big deal.  A
> > > lighter-weight alternative would be to have a single inode shared by
> > > all userfaultfd instances, which would require a somewhat different
> > > internal anon_inode API.
> >
> > I'd also prefer to just make SELinux use mandatory, but there's a
> > nasty interaction with UFFD_EVENT_FORK. Adding a new UFFD_SECURE mode
> > which blocks UFFD_EVENT_FORK sidesteps this problem. Maybe you know a
> > better way to deal with it.
[...]
> Now that you've pointed this mechanism out, it is utterly and
> completely broken and should be removed from the kernel outright or at
> least severely restricted.  A .read implementation MUST NOT ACT ON THE
> CALLING TASK.  Ever.  Just imagine the effect of passing a userfaultfd
> as stdin to a setuid program.
>
> So I think the right solution might be to attempt to *remove*
> UFFD_EVENT_FORK.  Maybe the solution is to say that, unless the
> creator of a userfaultfd() has global CAP_SYS_ADMIN, then it cannot
> use UFFD_FEATURE_EVENT_FORK) and print a warning (once) when
> UFFD_FEATURE_EVENT_FORK is allowed.  And, after some suitable
> deprecation period, just remove it.  If it's genuinely useful, it
> needs an entirely new API based on ioctl() or a syscall.  Or even
> recvmsg() :)

FWIW, <https://codesearch.debian.net/search?q=UFFD_FEATURE_EVENT_FORK&literal=1>
just shows the kernel, kernel selftests, and strace code for decoding
syscall arguments. CRIU uses it though (probably for postcopy live
migration / lazy migration?), I guess that code isn't in debian for
some reason.

^ permalink raw reply

* [PATCH v4 1/2] pidfd: add NSpid entries to fdinfo
From: Christian Kellner @ 2019-10-14 16:20 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, Jann Horn, Christian Kellner, Christian Brauner,
	Christian Brauner, Andrew Morton, Peter Zijlstra (Intel),
	Ingo Molnar, Michal Hocko, Thomas Gleixner, Elena Reshetova,
	Roman Gushchin, Andrea Arcangeli, Al Viro, Aleksa Sarai,
	Dmitry V. Levin
In-Reply-To: <20191011122323.7770-1-ckellner@redhat.com>

From: Christian Kellner <christian@kellner.me>

Currently, the fdinfo file contains the Pid field which shows the
pid a given pidfd refers to in the pid namespace of the procfs
instance. If pid namespaces are configured, also show an NSpid field
for easy retrieval of the pid in all descendant pid namespaces. If
the pid namespace of the process is not a descendant of the pid
namespace of the procfs instance 0 will be shown as its first NSpid
entry and no other entries will be shown. Add a block comment to
pidfd_show_fdinfo with a detailed explanation of Pid and NSpid fields.

Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Kellner <christian@kellner.me>
---
Changes in v4:
- Reworked to properly handle the case where the pidfd is from a
  different branch in the pid namespace hierarchy; also add block
  comment with an in-depth explanation (Christian Brauner)

 kernel/fork.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index bcdf53125210..782986962d47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1695,12 +1695,63 @@ static int pidfd_release(struct inode *inode, struct file *file)
 }
 
 #ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid of a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the  pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestoral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ *   namespace (also take care to create new mount namespaces in the
+ *   new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ *   have exactly one entry, which is 0
+ */
 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
 	struct pid *pid = f->private_data;
+	pid_t nr = pid_nr_ns(pid, ns);
+
+	seq_put_decimal_ull(m, "Pid:\t", nr);
 
-	seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
+#ifdef CONFIG_PID_NS
+	seq_put_decimal_ull(m, "\nNSpid:\t", nr);
+	if (nr) {
+		int i;
+
+		/* If nr is non-zero it means that 'pid' is valid and that
+		 * ns, i.e. the pid namespace associated with the procfs
+		 * instance, is in the pid namespace hierarchy of pid.
+		 * Start at one below the already printed level.
+		 */
+		for (i = ns->level + 1; i <= pid->level; i++)
+			seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
+	}
+#endif
 	seq_putc(m, '\n');
 }
 #endif
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 2/2] pidfd: add tests for NSpid info in fdinfo
From: Christian Kellner @ 2019-10-14 16:20 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, Jann Horn, Christian Kellner, Christian Brauner,
	Shuah Khan, Andrew Morton, Peter Zijlstra (Intel), Ingo Molnar,
	Michal Hocko, Thomas Gleixner, Elena Reshetova, Roman Gushchin,
	Andrea Arcangeli, Al Viro, Aleksa Sarai, Dmitry V. Levin,
	linux-kselftest
In-Reply-To: <20191014162034.2185-1-ckellner@redhat.com>

From: Christian Kellner <christian@kellner.me>

Add a test that checks that if pid namespaces are configured the fdinfo
file of a pidfd contains an NSpid: entry containing the process id in
the current and additionally all nested namespaces. In the case that
a pidfd is from a pid namespace not in the same namespace hierarchy as
the process accessing the fdinfo file, ensure the 'NSpid' shows 0 for
that pidfd, analogous to the 'Pid' entry.

Signed-off-by: Christian Kellner <christian@kellner.me>
---
Changes in v4:
- Rework to test include a the situation where the fdinfo for a pidfd
  of a process in a sibling pid namespace is being read and ensure
  the NSpid field only contains one entry, being 0.

 tools/testing/selftests/pidfd/Makefile        |   2 +-
 .../selftests/pidfd/pidfd_fdinfo_test.c       | 265 ++++++++++++++++++
 2 files changed, 266 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/pidfd/pidfd_fdinfo_test.c

diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index 7550f08822a3..43db1b98e845 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 CFLAGS += -g -I../../../../usr/include/ -pthread
 
-TEST_GEN_PROGS := pidfd_test pidfd_open_test pidfd_poll_test pidfd_wait
+TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
new file mode 100644
index 000000000000..3721be994abd
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+struct error {
+	int  code;
+	char msg[512];
+};
+
+static int error_set(struct error *err, int code, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	if (code == PIDFD_PASS || !err || err->code != PIDFD_PASS)
+		return code;
+
+	err->code = code;
+	va_start(args, fmt);
+	r = vsnprintf(err->msg, sizeof(err->msg), fmt, args);
+	assert((size_t)r < sizeof(err->msg));
+	va_end(args);
+
+	return code;
+}
+
+static void error_report(struct error *err, const char *test_name)
+{
+	switch (err->code) {
+	case PIDFD_ERROR:
+		ksft_exit_fail_msg("%s test: Fatal: %s\n", test_name, err->msg);
+		break;
+
+	case PIDFD_FAIL:
+		/* will be: not ok %d # error %s test: %s */
+		ksft_test_result_error("%s test: %s\n", test_name, err->msg);
+		break;
+
+	case PIDFD_SKIP:
+		/* will be: not ok %d # SKIP %s test: %s */
+		ksft_test_result_skip("%s test: %s\n", test_name, err->msg);
+		break;
+
+	case PIDFD_XFAIL:
+		ksft_test_result_pass("%s test: Expected failure: %s\n",
+				      test_name, err->msg);
+		break;
+
+	case PIDFD_PASS:
+		ksft_test_result_pass("%s test: Passed\n");
+		break;
+
+	default:
+		ksft_exit_fail_msg("%s test: Unknown code: %d %s\n",
+				   test_name, err->code, err->msg);
+		break;
+	}
+}
+
+static inline int error_check(struct error *err, const char *test_name)
+{
+	/* In case of error we bail out and terminate the test program */
+	if (err->code == PIDFD_ERROR)
+		error_report(err, test_name);
+
+	return err->code;
+}
+
+struct child {
+	pid_t pid;
+	int   fd;
+};
+
+static struct child clone_newns(int (*fn)(void *), void *args,
+				struct error *err)
+{
+	static int flags = CLONE_PIDFD | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD;
+	size_t stack_size = 1024;
+	char *stack[1024] = { 0 };
+	struct child ret;
+
+	if (!(flags & CLONE_NEWUSER) && geteuid() != 0)
+		flags |= CLONE_NEWUSER;
+
+#ifdef __ia64__
+	ret.pid = __clone2(fn, stack, stack_size, flags, args, &ret.fd);
+#else
+	ret.pid = clone(fn, stack + stack_size, flags, args, &ret.fd);
+#endif
+
+	if (ret.pid < 0) {
+		error_set(err, PIDFD_ERROR, "clone failed (ret %d, errno %d)",
+			  ret.fd, errno);
+		return ret;
+	}
+
+	ksft_print_msg("New child: %d, fd: %d\n", ret.pid, ret.fd);
+
+	return ret;
+}
+
+static inline int child_join(struct child *child, struct error *err)
+{
+	int r;
+
+	(void)close(child->fd);
+	r = wait_for_pid(child->pid);
+	if (r < 0)
+		error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)",
+			  r, errno);
+	else if (r > 0)
+		error_set(err, r, "child %d reported: %d", child->pid, r);
+
+	return r;
+}
+
+static inline void trim_newline(char *str)
+{
+	char *pos = strrchr(str, '\n');
+
+	if (pos)
+		*pos = '\0';
+}
+
+static int verify_fdinfo_nspid(int pidfd, struct error *err,
+			       const char *expect, ...)
+{
+	char buffer[512] = {0, };
+	char path[512] = {0, };
+	va_list args;
+	FILE *f;
+	char *line = NULL;
+	size_t n = 0;
+	int found = 0;
+	int r;
+
+	va_start(args, expect);
+	r = vsnprintf(buffer, sizeof(buffer), expect, args);
+	assert((size_t)r < sizeof(buffer));
+	va_end(args);
+
+	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+	f = fopen(path, "re");
+	if (!f)
+		return error_set(err, PIDFD_ERROR, "fdinfo open failed for %d",
+				 pidfd);
+
+	while (getline(&line, &n, f) != -1) {
+		if (strncmp(line, "NSpid:", 6))
+			continue;
+
+		found = 1;
+
+		r = strcmp(line + 6, buffer);
+		if (r != 0) {
+			trim_newline(line);
+			trim_newline(buffer);
+			error_set(err, PIDFD_FAIL, "NSpid: '%s' != '%s'",
+				  line + 6, buffer);
+		}
+		break;
+	}
+
+	free(line);
+	fclose(f);
+
+	if (found == 0)
+		return error_set(err, PIDFD_FAIL, "NSpid not found for fd %d",
+				 pidfd);
+
+	return PIDFD_PASS;
+}
+
+static int child_fdinfo_nspid_test(void *args)
+{
+	struct error err;
+	int pidfd;
+	int r;
+
+	/* if we got no fd for the sibling, we are done */
+	if (!args)
+		return PIDFD_PASS;
+
+	/* verify that we can not resolve the pidfd for a process
+	 * in a sibling pid namespace, i.e. a pid namespace it is
+	 * not in our or a descended namespace
+	 */
+	r = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
+	if (r < 0) {
+		ksft_print_msg("Failed to remount / private\n");
+		return PIDFD_ERROR;
+	}
+
+	(void)umount2("/proc", MNT_DETACH);
+	r = mount("proc", "/proc", "proc", 0, NULL);
+	if (r < 0) {
+		ksft_print_msg("Failed to remount /proc\n");
+		return PIDFD_ERROR;
+	}
+
+	pidfd = *(int *)args;
+	r = verify_fdinfo_nspid(pidfd, &err, "\t0\n");
+
+	if (r != PIDFD_PASS)
+		ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg);
+
+	return r;
+}
+
+static void test_pidfd_fdinfo_nspid(void)
+{
+	struct child a, b;
+	struct error err = {0, };
+	const char *test_name = "pidfd check for NSpid in fdinfo";
+
+	/* Create a new child in a new pid and mount namespace */
+	a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
+	error_check(&err, test_name);
+
+	/* Pass the pidfd representing the first child to the
+	 * second child, which will be in a sibling pid namespace,
+	 * which means that the fdinfo NSpid entry for the pidfd
+	 * should only contain '0'.
+	 */
+	b = clone_newns(child_fdinfo_nspid_test, &a.fd, &err);
+	error_check(&err, test_name);
+
+	/* The children will have pid 1 in the new pid namespace,
+	 * so the line must be 'NSPid:\t<pid>\t1'.
+	 */
+	verify_fdinfo_nspid(a.fd, &err, "\t%d\t%d\n", a.pid, 1);
+	verify_fdinfo_nspid(b.fd, &err, "\t%d\t%d\n", b.pid, 1);
+
+	/* wait for the process, check the exit status and set
+	 * 'err' accordingly, if it is not already set.
+	 */
+	child_join(&a, &err);
+	child_join(&b, &err);
+
+	error_report(&err, test_name);
+}
+
+int main(int argc, char **argv)
+{
+	ksft_print_header();
+	ksft_set_plan(1);
+
+	test_pidfd_fdinfo_nspid();
+
+	return ksft_exit_pass();
+}
-- 
2.21.0

^ permalink raw reply related

* Re: [PATCH] pidfd: add NSpid entries to fdinfo
From: Christian Brauner @ 2019-10-14 17:06 UTC (permalink / raw)
  To: Jann Horn
  Cc: Andrea Arcangeli, Andrew Morton, Christian Kellner,
	Christian Kellner, Aleksa Sarai, Elena Reshetova, Roman Gushchin,
	Dmitry V. Levin, Linux API, kernel list, Michal Hocko,
	Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Al Viro
In-Reply-To: <CAG48ez0SG3LLeLtqmf5Q8aZL3J8b5XwgNbgDr72jSnq2QBac8Q@mail.gmail.com>

On October 14, 2019 5:09:58 PM GMT+02:00, Jann Horn <jannh@google.com> wrote:
>On Sat, Oct 12, 2019 at 12:19 PM Christian Brauner
><christian.brauner@ubuntu.com> wrote:
>> Currently, the fdinfo file of contains the field Pid:
>
>nit: something missing after "of"?
>
>> It contains the pid a given pidfd refers to in the pid namespace of
>the
>> opener's procfs instance.
>
>s/opener's // here and in various places below? "the opener's" is
>kinda misleading since it sounds as if it has something to do with
>task identity.
>
>> If the pid namespace of the process is not a descendant of the pid
>> namespace of the procfs instance 0 will be shown as its pid. This is
>> similar to calling getppid() on a process who's parent is out of it's
>
>nit: s/who's/whose/
>
>nit: s/it's/its/
>
>> pid namespace (e.g. when moving a process into a sibling pid
>namespace
>> via setns()).
>
>You can't move a process into a PID namespace via setns(), you can
>only set the PID namespace for its children; and even then, you can't
>set that to a sibling PID namespace, you can only move into descendant
>PID namespaces.

Yes, I know. This was sloppy changelogging on my part.

>
>> Add an NSpid field for easy retrieval of the pid in all descendant
>pid
>> namespaces:
>> If pid namespaces are supported this field will contain the pid a
>given
>> pidfd refers to for all descendant pid namespaces starting from the
>> current pid namespace of the opener's procfs instance, i.e. the first
>
>s/current // - neither tasks nor procfs instances can change which pid
>namespace they're associated with

Yes.

>
>> pid entry for Pid and NSpid will be identical.
>
>*the Pid field and the first entry in the NSpid field?

Yes.

>
>> If the pid namespace of the process is not a descendant of the pid
>> namespace of the procfs instance 0 will be shown as its first NSpid
>and
>> no other NSpid entries will be shown.
>> Note that this differs from the Pid and NSpid fields in
>> /proc/<pid>/status where Pid and NSpid are always shown relative to
>the
>> pid namespace of the opener's procfs instace. The difference becomes
>> obvious when sending around a pidfd between pid namespaces from
>> different trees, i.e. where no ancestoral relation is present between
>> the pid namespaces:
>> 1. sending around pidfd:
>> - create two new pid namespaces ns1 and ns2 in the initial pid
>namespace
>>   (Also take care to create new mount namespaces in the new pid
>>   namespace and mount procfs.)
>> - create a process with a pidfd in ns1
>> - send pidfd from ns1 to ns2
>> - read /proc/self/fdinfo/<pidfd> and observe that Pid and NSpid entry
>>   are 0
>> - create a process with a pidfd in
>
>truncated phrase?

Yeah, as I said this was really just a rough draft for Christian (the other one :)).

>
>> - open a pidfd for a process in the initial pid namespace
>> 2. sending around /proc/<pid>/status fd:
>> - create two new pid namespaces ns1 and ns2 in the initial pid
>namespace
>>   (Also take care to create new mount namespaces in the new pid
>>   namespace and mount procfs.)
>> - create a process in ns1
>> - open /proc/<pid>/status in the initial pid namespace for the
>process
>>   you created in ns1
>> - send statusfd from initial pid namespace to ns2
>> - read statusfd and observe:
>>   - that Pid will contain the pid of the process as seen from the
>init
>>   - that NSpid will contain the pids of the process for all
>descendant
>>     pid namespaces starting from the initial pid namespace
>
>You don't need fd passing for case 2, you can just look at the procfs
>instance that's mounted in the initial mount namespace. Using fd
>passing in this example kinda obfuscates what's going on, in my
>opinion.

My goal was to illustrate the same mechanism leading to different results.
But I don't care much about how this is described as long as I illustrates the difference.

>
>
>> +/**
>> + * pidfd_show_fdinfo - print information about a pidfd
>> + * @m: proc fdinfo file
>> + * @f: file referencing a pidfd
>> + *
>> + * Pid:
>> + * This function will print the pid a given pidfd refers to in the
>pid
>> + * namespace of the opener's procfs instance.
>> + * If the pid namespace of the process is not a descendant of the
>pid
>> + * namespace of the procfs instance 0 will be shown as its pid. This
>is
>> + * similar to calling getppid() on a process who's parent is out of
>it's
>> + * pid namespace (e.g. when moving a process into a sibling pid
>namespace
>> + * via setns()).
>> + * NSpid:
>> + * If pid namespaces are supported then this function will also
>print the
>> + * pid a given pidfd refers to for all descendant pid namespaces
>starting
>> + * from the current pid namespace of the opener's procfs instance,
>i.e. the
>> + * first pid entry for Pid and NSpid will be identical.
>> + * If the pid namespace of the process is not a descendant of the
>pid
>> + * namespace of the procfs instance 0 will be shown as its first
>NSpid and
>> + * no other NSpid entries will be shown.
>> + * Note that this differs from the Pid and NSpid fields in
>> + * /proc/<pid>/status where Pid and NSpid are always shown relative
>to the
>> + * pid namespace of the opener's procfs instace. The difference
>becomes
>> + * obvious when sending around a pidfd between pid namespaces from
>> + * different trees, i.e. where no ancestoral relation is present
>between
>> + * the pid namespaces:
>> + * 1. sending around pidfd:
>> + * - create two new pid namespaces ns1 and ns2 in the initial pid
>namespace
>> + *   (Also take care to create new mount namespaces in the new pid
>> + *   namespace and mount procfs.)
>> + * - create a process with a pidfd in ns1
>> + * - send pidfd from ns1 to ns2
>> + * - read /proc/self/fdinfo/<pidfd> and observe that Pid and NSpid
>entry
>> + *   are 0
>> + * - create a process with a pidfd in
>> + * - open a pidfd for a process in the initial pid namespace
>> + * 2. sending around /proc/<pid>/status fd:
>> + * - create two new pid namespaces ns1 and ns2 in the initial pid
>namespace
>> + *   (Also take care to create new mount namespaces in the new pid
>> + *   namespace and mount procfs.)
>> + * - create a process in ns1
>> + * - open /proc/<pid>/status in the initial pid namespace for the
>process
>> + *   you created in ns1
>> + * - send statusfd from initial pid namespace to ns2
>> + * - read statusfd and observe:
>> + *   - that Pid will contain the pid of the process as seen from the
>init
>> + *   - that NSpid will contain the pids of the process for all
>descendant
>> + *     pid namespaces starting from the initial pid namespace
>> + */
>
>See above, same problems as in the commit message. Actually, since you
>have a big comment block with this text, there's no reason to repeat
>it in the commit message, right?

If the comment gets modified or the logic changes I'd still like to have the actual context recorded in the changelog too.
I think that's good practice.

>
>(By the way, only slightly related to this patch: At the moment, if
>you have a pidfd to a task that has already been reaped, and the PID
>has been reallocated to another task, the pidfd will still show up in
>/proc/*/fdinfo/* as if it referred to a non-dead process, right? Might
>be worth considering whether you want to probe ->tasks[PIDTYPE_PID] or
>->tasks[PIDTYPE_TGID] for NULL-ness when printing fdinfo, or something
>like that.)

Not a big deal but yes, let me put this on my list.
Or do you feel in the mood for a patch? ;)

Christian

^ permalink raw reply

* Re: [PATCH 1/7] Add a new flags-accepting interface for anonymous inodes
From: Daniel Colascione @ 2019-10-14 18:15 UTC (permalink / raw)
  To: Jann Horn
  Cc: Linux API, kernel list, Lokesh Gidra, Nick Kralevich,
	Nosh Minwalla, Tim Murray
In-Reply-To: <CAG48ez3yOPAC3mTJdQ5_8aARQPe+siid5jaa8U+aMtfj-bUJ2g@mail.gmail.com>

Thanks for taking a look

On Mon, Oct 14, 2019 at 8:39 AM Jann Horn <jannh@google.com> wrote:
>
> On Sat, Oct 12, 2019 at 9:16 PM Daniel Colascione <dancol@google.com> wrote:
> > Add functions forwarding from the old names to the new ones so we
> > don't need to change any callers.
>
> This patch does more than the commit message says; it also refactors
> the body of the function. (I would've moved that refactoring over into
> patch 2, but I guess this works, too.)
>
> [...]
> > -struct file *anon_inode_getfile(const char *name,
> > -                               const struct file_operations *fops,
> > -                               void *priv, int flags)
> > +struct file *anon_inode_getfile2(const char *name,
> > +                                const struct file_operations *fops,
> > +                                void *priv, int flags, int anon_inode_flags)
>
> (AFAIK, normal kernel style is to slap a "__" prefix in front of the
> function name instead of appending a digit, but I guess it doesn't
> really matter.)

I thought prefixing "_" was for signaling "this is an implementation
detail and you probably don't want to call it unless you know what
you're doing", not "here's a new version that does a new thing".

> >  {
> > +       struct inode *inode;
> >         struct file *file;
> >
> > -       if (IS_ERR(anon_inode_inode))
> > -               return ERR_PTR(-ENODEV);
> > -
> > -       if (fops->owner && !try_module_get(fops->owner))
> > -               return ERR_PTR(-ENOENT);
> > +       if (anon_inode_flags)
> > +               return ERR_PTR(-EINVAL);
> >
> > +       inode = anon_inode_inode;
> > +       if (IS_ERR(inode))
> > +               return ERR_PTR(-ENODEV);
> >         /*
> > -        * We know the anon_inode inode count is always greater than zero,
> > -        * so ihold() is safe.
> > +        * We know the anon_inode inode count is always
> > +        * greater than zero, so ihold() is safe.
> >          */
>
> This looks like maybe you started editing the comment, then un-did the
> change, but left the modified line wrapping in your patch? Please
> avoid that - code changes with no real reason make "git blame" output
> more annoying and create trouble when porting patches between kernel
> versions.

I'll fix it.

>
> [...]
> >  EXPORT_SYMBOL_GPL(anon_inode_getfile);
> > +EXPORT_SYMBOL_GPL(anon_inode_getfile2);
> [...]
> >  EXPORT_SYMBOL_GPL(anon_inode_getfd);
> > +EXPORT_SYMBOL_GPL(anon_inode_getfd2);
>
> Since anon_inode_getfd() is now a static inline function in
> include/linux/anon_inodes.h, exporting it doesn't make sense anymore.
> Same for anon_inode_getfile().

I didn't want to break modules unnecessarily. Declaring the function
inline and also exporting it gives us an efficiency win while avoiding
an ABI break, right?

> [...]
> > +#define ANON_INODE_SECURE 1
>
> That #define belongs in a later patch, right?

Yep.

^ permalink raw reply

* Re: [PATCH 1/7] Add a new flags-accepting interface for anonymous inodes
From: Jann Horn @ 2019-10-14 18:30 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Linux API, kernel list, Lokesh Gidra, Nick Kralevich,
	Nosh Minwalla, Tim Murray
In-Reply-To: <CAKOZuet4VM-P_xm9R7cJO2_f60eUcqt5wHG8+khJedhctfEEhw@mail.gmail.com>

On Mon, Oct 14, 2019 at 8:16 PM Daniel Colascione <dancol@google.com> wrote:
> On Mon, Oct 14, 2019 at 8:39 AM Jann Horn <jannh@google.com> wrote:
> >
> > On Sat, Oct 12, 2019 at 9:16 PM Daniel Colascione <dancol@google.com> wrote:
> > > Add functions forwarding from the old names to the new ones so we
> > > don't need to change any callers.
> >
> > This patch does more than the commit message says; it also refactors
> > the body of the function. (I would've moved that refactoring over into
> > patch 2, but I guess this works, too.)
> >
> > [...]
> > > -struct file *anon_inode_getfile(const char *name,
> > > -                               const struct file_operations *fops,
> > > -                               void *priv, int flags)
> > > +struct file *anon_inode_getfile2(const char *name,
> > > +                                const struct file_operations *fops,
> > > +                                void *priv, int flags, int anon_inode_flags)
> >
> > (AFAIK, normal kernel style is to slap a "__" prefix in front of the
> > function name instead of appending a digit, but I guess it doesn't
> > really matter.)
>
> I thought prefixing "_" was for signaling "this is an implementation
> detail and you probably don't want to call it unless you know what
> you're doing", not "here's a new version that does a new thing".

Ah, I guess that might be true.

^ permalink raw reply

* Re: [PATCHv7 15/33] posix-timers: Make clock_nanosleep() time namespace aware
From: Andrey Vagin @ 2019-10-14 19:58 UTC (permalink / raw)
  To: kbuild test robot
  Cc: Dmitry Safonov, kbuild-all, LKML, Dmitry Safonov, Adrian Reber,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	Thomas Gleixner, Vincenzo Frascino, Linux Containers, crml
In-Reply-To: <201910141209.9Vv7oLGL%lkp@intel.com>

On Sun, Oct 13, 2019 at 9:28 PM kbuild test robot <lkp@intel.com> wrote:
>
> Hi Dmitry,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on linus/master]
> [cannot apply to v5.4-rc2 next-20191011]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
>
> url:    https://github.com/0day-ci/linux/commits/Dmitry-Safonov/kernel-Introduce-Time-Namespace/20191014-075119
> config: x86_64-randconfig-s1-201941 (attached as .config)
> compiler: gcc-4.9 (Debian 4.9.2-10+deb8u1) 4.9.2
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=x86_64
>
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot <lkp@intel.com>
>
> All errors (new ones prefixed by >>):
>
>    kernel//time/posix-stubs.c: In function '__do_sys_clock_nanosleep':
> >> kernel//time/posix-stubs.c:153:31: error: 'clockid' undeclared (first use in this function)
>       texp = timens_ktime_to_host(clockid, texp);

This is my fault. I forgot to compile it without CONFIG_POSIX_TIMERS.
Will fix this shortly. Sorry for inconvenience.

>                                   ^
>    kernel//time/posix-stubs.c:153:31: note: each undeclared identifier is reported only once for each function it appears in
>    kernel//time/posix-stubs.c: In function '__do_sys_clock_nanosleep_time32':
> >> kernel//time/posix-stubs.c:222:2: error: unknown type name 'ktime'
>      ktime texp;
>      ^
>    kernel//time/posix-stubs.c:243:31: error: 'clockid' undeclared (first use in this function)
>       texp = timens_ktime_to_host(clockid, texp);
>                                   ^
>
> vim +/clockid +153 kernel//time/posix-stubs.c
>
>    126
>    127  SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
>    128                  const struct __kernel_timespec __user *, rqtp,
>    129                  struct __kernel_timespec __user *, rmtp)
>    130  {
>    131          struct timespec64 t;
>    132          ktime_t texp;
>    133
>    134          switch (which_clock) {
>    135          case CLOCK_REALTIME:
>    136          case CLOCK_MONOTONIC:
>    137          case CLOCK_BOOTTIME:
>    138                  break;
>    139          default:
>    140                  return -EINVAL;
>    141          }
>    142
>    143          if (get_timespec64(&t, rqtp))
>    144                  return -EFAULT;
>    145          if (!timespec64_valid(&t))
>    146                  return -EINVAL;
>    147          if (flags & TIMER_ABSTIME)
>    148                  rmtp = NULL;
>    149          current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
>    150          current->restart_block.nanosleep.rmtp = rmtp;
>    151          texp = timespec64_to_ktime(t);
>    152          if (flags & TIMER_ABSTIME)
>  > 153                  texp = timens_ktime_to_host(clockid, texp);
>    154          return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
>    155                                   HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
>    156                                   which_clock);
>    157  }
>    158
>    159  #ifdef CONFIG_COMPAT
>    160  COMPAT_SYS_NI(timer_create);
>    161  COMPAT_SYS_NI(getitimer);
>    162  COMPAT_SYS_NI(setitimer);
>    163  #endif
>    164
>    165  #ifdef CONFIG_COMPAT_32BIT_TIME
>    166  SYS_NI(timer_settime32);
>    167  SYS_NI(timer_gettime32);
>    168
>    169  SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
>    170                  struct old_timespec32 __user *, tp)
>    171  {
>    172          struct timespec64 new_tp;
>    173
>    174          if (which_clock != CLOCK_REALTIME)
>    175                  return -EINVAL;
>    176          if (get_old_timespec32(&new_tp, tp))
>    177                  return -EFAULT;
>    178
>    179          return do_sys_settimeofday64(&new_tp, NULL);
>    180  }
>    181
>    182  SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
>    183                  struct old_timespec32 __user *, tp)
>    184  {
>    185          int ret;
>    186          struct timespec64 kernel_tp;
>    187
>    188          ret = do_clock_gettime(which_clock, &kernel_tp);
>    189          if (ret)
>    190                  return ret;
>    191
>    192          if (put_old_timespec32(&kernel_tp, tp))
>    193                  return -EFAULT;
>    194          return 0;
>    195  }
>    196
>    197  SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
>    198                  struct old_timespec32 __user *, tp)
>    199  {
>    200          struct timespec64 rtn_tp = {
>    201                  .tv_sec = 0,
>    202                  .tv_nsec = hrtimer_resolution,
>    203          };
>    204
>    205          switch (which_clock) {
>    206          case CLOCK_REALTIME:
>    207          case CLOCK_MONOTONIC:
>    208          case CLOCK_BOOTTIME:
>    209                  if (put_old_timespec32(&rtn_tp, tp))
>    210                          return -EFAULT;
>    211                  return 0;
>    212          default:
>    213                  return -EINVAL;
>    214          }
>    215  }
>    216
>    217  SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
>    218                  struct old_timespec32 __user *, rqtp,
>    219                  struct old_timespec32 __user *, rmtp)
>    220  {
>    221          struct timespec64 t;
>  > 222          ktime texp;
>
> ---
> 0-DAY kernel test infrastructure                Open Source Technology Center
> https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* Re: [PATCH 1/7] Add a new flags-accepting interface for anonymous inodes
From: Christoph Hellwig @ 2019-10-15  8:08 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: linux-api, linux-kernel, lokeshgidra, nnk, nosh, timmurray
In-Reply-To: <20191012191602.45649-2-dancol@google.com>

On Sat, Oct 12, 2019 at 12:15:56PM -0700, Daniel Colascione wrote:
> Add functions forwarding from the old names to the new ones so we
> don't need to change any callers.

Independent of the usefulness of the interface (I'll let other comment,
but you defintively want to talk to Al Viro), adding a second interface
for only 17 callers total is a bad idea.  Just switch the existing
interface to pass flags.

^ permalink raw reply

* Re: [PATCH 2/7] Add a concept of a "secure" anonymous file
From: Christoph Hellwig @ 2019-10-15  8:08 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: linux-api, linux-kernel, lokeshgidra, nnk, nosh, timmurray
In-Reply-To: <20191012191602.45649-3-dancol@google.com>

On Sat, Oct 12, 2019 at 12:15:57PM -0700, Daniel Colascione wrote:
> A secure anonymous file is one we hooked up to its own inode (as
> opposed to the shared inode we use for non-secure anonymous files). A
> new selinux hook gives security modules a chance to initialize, label,
> and veto the creation of these secure anonymous files. Security
> modules had limit ability to interact with non-secure anonymous files
> due to all of these files sharing a single inode.

Again please add Al.  Also explain what the problem would be to always
use a separate inode.

^ permalink raw reply

* Re: [PATCH v4 1/2] pidfd: add NSpid entries to fdinfo
From: Christian Brauner @ 2019-10-15  9:40 UTC (permalink / raw)
  To: Christian Kellner
  Cc: linux-kernel, linux-api, Jann Horn, Christian Kellner,
	Christian Brauner, Andrew Morton, Peter Zijlstra (Intel),
	Ingo Molnar, Michal Hocko, Thomas Gleixner, Elena Reshetova,
	Roman Gushchin, Andrea Arcangeli, Al Viro, Aleksa Sarai,
	Dmitry V. Levin
In-Reply-To: <20191014162034.2185-1-ckellner@redhat.com>

On Mon, Oct 14, 2019 at 06:20:32PM +0200, Christian Kellner wrote:
> From: Christian Kellner <christian@kellner.me>
> 
> Currently, the fdinfo file contains the Pid field which shows the
> pid a given pidfd refers to in the pid namespace of the procfs
> instance. If pid namespaces are configured, also show an NSpid field
> for easy retrieval of the pid in all descendant pid namespaces. If
> the pid namespace of the process is not a descendant of the pid
> namespace of the procfs instance 0 will be shown as its first NSpid
> entry and no other entries will be shown. Add a block comment to
> pidfd_show_fdinfo with a detailed explanation of Pid and NSpid fields.
> 
> Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com>
> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
> Signed-off-by: Christian Kellner <christian@kellner.me>

Thanks!
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>

> ---
> Changes in v4:
> - Reworked to properly handle the case where the pidfd is from a
>   different branch in the pid namespace hierarchy; also add block
>   comment with an in-depth explanation (Christian Brauner)
> 
>  kernel/fork.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index bcdf53125210..782986962d47 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1695,12 +1695,63 @@ static int pidfd_release(struct inode *inode, struct file *file)
>  }
>  
>  #ifdef CONFIG_PROC_FS
> +/**
> + * pidfd_show_fdinfo - print information about a pidfd
> + * @m: proc fdinfo file
> + * @f: file referencing a pidfd
> + *
> + * Pid:
> + * This function will print the pid that a given pidfd refers to in the
> + * pid namespace of the procfs instance.
> + * If the pid namespace of the process is not a descendant of the pid
> + * namespace of the procfs instance 0 will be shown as its pid. This is
> + * similar to calling getppid() on a process whose parent is outside of
> + * its pid namespace.
> + *
> + * NSpid:
> + * If pid namespaces are supported then this function will also print
> + * the pid of a given pidfd refers to for all descendant pid namespaces
> + * starting from the current pid namespace of the instance, i.e. the
> + * Pid field and the first entry in the NSpid field will be identical.
> + * If the pid namespace of the process is not a descendant of the pid
> + * namespace of the procfs instance 0 will be shown as its first NSpid
> + * entry and no others will be shown.
> + * Note that this differs from the Pid and NSpid fields in
> + * /proc/<pid>/status where Pid and NSpid are always shown relative to
> + * the  pid namespace of the procfs instance. The difference becomes
> + * obvious when sending around a pidfd between pid namespaces from a
> + * different branch of the tree, i.e. where no ancestoral relation is
> + * present between the pid namespaces:
> + * - create two new pid namespaces ns1 and ns2 in the initial pid
> + *   namespace (also take care to create new mount namespaces in the
> + *   new pid namespace and mount procfs)
> + * - create a process with a pidfd in ns1
> + * - send pidfd from ns1 to ns2
> + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
> + *   have exactly one entry, which is 0
> + */
>  static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>  {
>  	struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
>  	struct pid *pid = f->private_data;
> +	pid_t nr = pid_nr_ns(pid, ns);
> +
> +	seq_put_decimal_ull(m, "Pid:\t", nr);
>  
> -	seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
> +#ifdef CONFIG_PID_NS
> +	seq_put_decimal_ull(m, "\nNSpid:\t", nr);
> +	if (nr) {
> +		int i;
> +
> +		/* If nr is non-zero it means that 'pid' is valid and that

Nit: multiline kernel comment style is usually

/* 
 * bla
 * bla
 */

but I'll just fix this up when applying. No need to resend.

> +		 * ns, i.e. the pid namespace associated with the procfs
> +		 * instance, is in the pid namespace hierarchy of pid.
> +		 * Start at one below the already printed level.
> +		 */
> +		for (i = ns->level + 1; i <= pid->level; i++)
> +			seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
> +	}
> +#endif
>  	seq_putc(m, '\n');
>  }
>  #endif
> -- 
> 2.21.0
> 

^ permalink raw reply

* Re: [PATCH v4 2/2] pidfd: add tests for NSpid info in fdinfo
From: Christian Brauner @ 2019-10-15 10:07 UTC (permalink / raw)
  To: Christian Kellner
  Cc: linux-kernel, linux-api, Jann Horn, Christian Kellner,
	Christian Brauner, Shuah Khan, Andrew Morton,
	Peter Zijlstra (Intel), Ingo Molnar, Michal Hocko,
	Thomas Gleixner, Elena Reshetova, Roman Gushchin,
	Andrea Arcangeli, Al Viro, Aleksa Sarai, Dmitry V. Levin,
	linux-kselftest
In-Reply-To: <20191014162034.2185-2-ckellner@redhat.com>

On Mon, Oct 14, 2019 at 06:20:33PM +0200, Christian Kellner wrote:
> From: Christian Kellner <christian@kellner.me>
> 
> Add a test that checks that if pid namespaces are configured the fdinfo
> file of a pidfd contains an NSpid: entry containing the process id in
> the current and additionally all nested namespaces. In the case that
> a pidfd is from a pid namespace not in the same namespace hierarchy as
> the process accessing the fdinfo file, ensure the 'NSpid' shows 0 for
> that pidfd, analogous to the 'Pid' entry.
> 
> Signed-off-by: Christian Kellner <christian@kellner.me>

That looks reasonable to me.

Acked-by: Christian Brauner <christian.brauner@ubuntu.com>

^ permalink raw reply

* [PATCH 1/2] pidfd: verify task is alive when printing fdinfo
From: Christian Brauner @ 2019-10-15 14:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Shuah Khan, Andrew Morton, Peter Zijlstra (Intel), Ingo Molnar,
	Michal Hocko, Thomas Gleixner, Elena Reshetova, Christian Kellner,
	Roman Gushchin, Andrea Arcangeli, Al Viro, Aleksa Sarai,
	Dmitry V. Levin, linux-kselftest, Christian Brauner, Jann Horn,
	Oleg Nesterov, linux-api

Currently, when a task is dead we still print the pid it used to use in
the fdinfo files of its pidfds. This doesn't make much sense since the
pid may have already been reused. So verify that the task is still
alive. If the task is not alive anymore, we will print -1. This allows
us to differentiate between a task not being present in a given pid
namespace - in which case we already print 0 - and a task having been
reaped.

Note that this uses PIDTYPE_PID for the check. Technically, we could've
checked PIDTYPE_TGID since pidfds currently only refer to thread-group
leaders but if they won't anymore in the future then this check becomes
problematic without it being immediately obvious to non-experts imho. If
a thread is created via clone(CLONE_THREAD) than struct pid has a single
non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a
PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even
though the thread-group leader might still be very much alive. We could
be more complicated and do something like:

bool alive = false;
rcu_read_lock();
struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
if (tsk && task_tgid(tsk))
	alive = true;
rcu_read_unlock();

but it's really not worth it. We already have created a pidfd and we
thus know it refers to a thread-group leader. Checking PIDTYPE_PID is
fine and is easier to maintain should we ever allow pidfds to refer to
threads.

Cc: Jann Horn <jannh@google.com>
Cc: Christian Kellner <christian@kellner.me>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: linux-api@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 kernel/fork.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 782986962d47..a67944a5e542 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1695,6 +1695,18 @@ static int pidfd_release(struct inode *inode, struct file *file)
 }

 #ifdef CONFIG_PROC_FS
+static inline bool task_alive(struct pid *pid)
+{
+	bool alive = true;
+
+	rcu_read_lock();
+	if (!pid_task(pid, PIDTYPE_PID))
+		alive = false;
+	rcu_read_unlock();
+
+	return alive;
+}
+
 /**
  * pidfd_show_fdinfo - print information about a pidfd
  * @m: proc fdinfo file
@@ -1732,15 +1744,20 @@ static int pidfd_release(struct inode *inode, struct file *file)
  */
 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
-	struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
 	struct pid *pid = f->private_data;
-	pid_t nr = pid_nr_ns(pid, ns);
+	struct pid_namespace *ns;
+	pid_t nr = -1;
+
+	if (likely(task_alive(pid))) {
+		ns = proc_pid_ns(file_inode(m->file));
+		nr = pid_nr_ns(pid, ns);
+	}

-	seq_put_decimal_ull(m, "Pid:\t", nr);
+	seq_put_decimal_ll(m, "Pid:\t", nr);

 #ifdef CONFIG_PID_NS
-	seq_put_decimal_ull(m, "\nNSpid:\t", nr);
-	if (nr) {
+	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+	if (nr > 0) {
 		int i;

 		/* If nr is non-zero it means that 'pid' is valid and that
@@ -1749,7 +1766,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 		 * Start at one below the already printed level.
 		 */
 		for (i = ns->level + 1; i <= pid->level; i++)
-			seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
+			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
 	}
 #endif
 	seq_putc(m, '\n');
-- 
2.23.0

^ permalink raw reply related

* Re: [PATCH 1/2] pidfd: verify task is alive when printing fdinfo
From: Oleg Nesterov @ 2019-10-15 14:43 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-kernel, Shuah Khan, Andrew Morton, Peter Zijlstra (Intel),
	Ingo Molnar, Michal Hocko, Thomas Gleixner, Elena Reshetova,
	Christian Kellner, Roman Gushchin, Andrea Arcangeli, Al Viro,
	Aleksa Sarai, Dmitry V. Levin, linux-kselftest, Jann Horn,
	linux-api
In-Reply-To: <20191015141332.4055-1-christian.brauner@ubuntu.com>

On 10/15, Christian Brauner wrote:
>
> +static inline bool task_alive(struct pid *pid)
> +{
> +	bool alive = true;
> +
> +	rcu_read_lock();
> +	if (!pid_task(pid, PIDTYPE_PID))
> +		alive = false;
> +	rcu_read_unlock();
> +
> +	return alive;
> +}

Well, the usage of rcu_read_lock/unlock looks confusing to me...

I mean, this helper does not need rcu lock at all. Except
rcu_dereference_check() will complain.

	static inline bool task_alive(struct pid *pid)
	{
		bool alive;

		/* shut up rcu_dereference_check() */
		rcu_lock_acquire(&rcu_lock_map);
		alive = !!pid_task(pid, PIDTYPE_PID));
		rcu_lock_release(&rcu_lock_map);

		return alive;
	}

looks more clear imo.

But in fact I'd suggest to simply use !hlist_empty(&pid->tasks[PIDTYPE_PID])
in pidfd_show_fdinfo() and do not add a new helper.

Oleg.

^ permalink raw reply

* Re: [PATCH 1/2] pidfd: verify task is alive when printing fdinfo
From: Christian Brauner @ 2019-10-15 14:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-kernel, Shuah Khan, Andrew Morton, Peter Zijlstra (Intel),
	Ingo Molnar, Michal Hocko, Thomas Gleixner, Elena Reshetova,
	Christian Kellner, Roman Gushchin, Andrea Arcangeli, Al Viro,
	Aleksa Sarai, Dmitry V. Levin, linux-kselftest, Jann Horn,
	linux-api
In-Reply-To: <20191015144356.GA16978@redhat.com>

On Tue, Oct 15, 2019 at 04:43:57PM +0200, Oleg Nesterov wrote:
> On 10/15, Christian Brauner wrote:
> >
> > +static inline bool task_alive(struct pid *pid)
> > +{
> > +	bool alive = true;
> > +
> > +	rcu_read_lock();
> > +	if (!pid_task(pid, PIDTYPE_PID))
> > +		alive = false;
> > +	rcu_read_unlock();
> > +
> > +	return alive;
> > +}
> 
> Well, the usage of rcu_read_lock/unlock looks confusing to me...
> 
> I mean, this helper does not need rcu lock at all. Except
> rcu_dereference_check() will complain.

Yep, I think we have another codepath were the rcu locks might be purely
cosmetic so I thought it's not a big deal (see below).

> 
> 	static inline bool task_alive(struct pid *pid)
> 	{
> 		bool alive;
> 
> 		/* shut up rcu_dereference_check() */
> 		rcu_lock_acquire(&rcu_lock_map);
> 		alive = !!pid_task(pid, PIDTYPE_PID));
> 		rcu_lock_release(&rcu_lock_map);
> 
> 		return alive;
> 	}
> 
> looks more clear imo.
> 
> But in fact I'd suggest to simply use !hlist_empty(&pid->tasks[PIDTYPE_PID])
> in pidfd_show_fdinfo() and do not add a new helper.

Sounds good to me. But can't we then just do something similar just with
!hlist_empty(&pid->tasks[PIDTYPE_TGID])

in v5.4-rc3:kernel/pid.c:pidfd_open():514-517 ?

or would this be problematic because of de_thread()?

Thanks!
Christian

^ permalink raw reply

* Re: [PATCH 1/2] pidfd: verify task is alive when printing fdinfo
From: Oleg Nesterov @ 2019-10-15 15:43 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-kernel, Shuah Khan, Andrew Morton, Peter Zijlstra (Intel),
	Ingo Molnar, Michal Hocko, Thomas Gleixner, Elena Reshetova,
	Christian Kellner, Roman Gushchin, Andrea Arcangeli, Al Viro,
	Aleksa Sarai, Dmitry V. Levin, linux-kselftest, Jann Horn,
	linux-api
In-Reply-To: <20191015145646.72eqrw6j52ehvfn2@wittgenstein>

On 10/15, Christian Brauner wrote:
>
> On Tue, Oct 15, 2019 at 04:43:57PM +0200, Oleg Nesterov wrote:
> > But in fact I'd suggest to simply use !hlist_empty(&pid->tasks[PIDTYPE_PID])
> > in pidfd_show_fdinfo() and do not add a new helper.
>
> Sounds good to me. But can't we then just do something similar just with
> !hlist_empty(&pid->tasks[PIDTYPE_TGID])
>
> in v5.4-rc3:kernel/pid.c:pidfd_open():514-517 ?

Agreed. Actually, it seems to me I suggested to use rcu_lock_acquire() rather
than rcu_read_lock() in pidfd_open() too.

But hlist_empty(pid->tasks[type]) looks even better.

If you decide to add a new helper, you can also change do_wait() which checks
hlist_empty(&wo->wo_pid->tasks[wo->wo_type]). May be even __change_pid().

Oleg.

^ permalink raw reply

* [PATCH man-pages] Document encoded I/O
From: Omar Sandoval @ 2019-10-15 18:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs
  Cc: Dave Chinner, Jann Horn, linux-api, kernel-team
In-Reply-To: <cover.1571164762.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

This adds a new page, rwf_encoded(7), providing an overview of encoded
I/O and updates fcntl(2), open(2), and preadv2(2)/pwritev2(2) to
reference it.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 man2/fcntl.2       |  10 +-
 man2/open.2        |  13 ++
 man2/readv.2       |  46 +++++++
 man7/rwf_encoded.7 | 297 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 man7/rwf_encoded.7

diff --git a/man2/fcntl.2 b/man2/fcntl.2
index fce4f4c2b..76fe9cc6f 100644
--- a/man2/fcntl.2
+++ b/man2/fcntl.2
@@ -222,8 +222,9 @@ On Linux, this command can change only the
 .BR O_ASYNC ,
 .BR O_DIRECT ,
 .BR O_NOATIME ,
+.BR O_NONBLOCK ,
 and
-.B O_NONBLOCK
+.B O_ENCODED
 flags.
 It is not possible to change the
 .BR O_DSYNC
@@ -1803,6 +1804,13 @@ Attempted to clear the
 flag on a file that has the append-only attribute set.
 .TP
 .B EPERM
+Attempted to set the
+.B O_ENCODED
+flag and the calling process did not have the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B EPERM
 .I cmd
 was
 .BR F_ADD_SEALS ,
diff --git a/man2/open.2 b/man2/open.2
index b0f485b41..cdd3c549c 100644
--- a/man2/open.2
+++ b/man2/open.2
@@ -421,6 +421,14 @@ was followed by a call to
 .BR fdatasync (2)).
 .IR "See NOTES below" .
 .TP
+.B O_ENCODED
+Open the file with encoded I/O permissions;
+see
+.BR rwf_encoded (7).
+The caller must have the
+.B CAP_SYS_ADMIN
+capabilty.
+.TP
 .B O_EXCL
 Ensure that this call creates the file:
 if this flag is specified in conjunction with
@@ -1168,6 +1176,11 @@ did not match the owner of the file and the caller was not privileged.
 The operation was prevented by a file seal; see
 .BR fcntl (2).
 .TP
+.B EPERM
+The
+.B O_ENCODED
+flag was specified, but the caller was not privileged.
+.TP
 .B EROFS
 .I pathname
 refers to a file on a read-only filesystem and write access was
diff --git a/man2/readv.2 b/man2/readv.2
index af27aa63e..aa60b980a 100644
--- a/man2/readv.2
+++ b/man2/readv.2
@@ -265,6 +265,11 @@ the data is always appended to the end of the file.
 However, if the
 .I offset
 argument is \-1, the current file offset is updated.
+.TP
+.BR RWF_ENCODED " (since Linux 5.6)"
+Read or write encoded (e.g., compressed) data.
+See
+.BR rwf_encoded (7).
 .SH RETURN VALUE
 On success,
 .BR readv (),
@@ -284,6 +289,13 @@ than requested (see
 and
 .BR write (2)).
 .PP
+If
+.B
+RWF_ENCODED
+was specified in
+.IR flags ,
+then the return value is the number of encoded bytes.
+.PP
 On error, \-1 is returned, and \fIerrno\fP is set appropriately.
 .SH ERRORS
 The errors are as given for
@@ -314,6 +326,40 @@ is less than zero or greater than the permitted maximum.
 .TP
 .B EOPNOTSUPP
 An unknown flag is specified in \fIflags\fP.
+.TP
+.B EOPNOTSUPP
+.B RWF_ENCODED
+is specified in
+.I flags
+and the filesystem does not implement encoded I/O.
+.TP
+.B EPERM
+.B RWF_ENCODED
+is specified in
+.I flags
+and the file was not opened with the
+.B O_ENCODED
+flag.
+.PP
+.BR preadv2 ()
+can fail for the following reasons:
+.TP
+.B EFBIG
+.B RWF_ENCODED
+is specified in
+.I flags
+and buffers in
+.I iov
+were not big enough to return the encoded data.
+.PP
+.BR pwritev2 ()
+can fail for the following reasons:
+.TP
+.B EINVAL
+.B RWF_ENCODED
+is specified in
+.I flags
+and the alignment and/or size requirements are not met.
 .SH VERSIONS
 .BR preadv ()
 and
diff --git a/man7/rwf_encoded.7 b/man7/rwf_encoded.7
new file mode 100644
index 000000000..90f5292e2
--- /dev/null
+++ b/man7/rwf_encoded.7
@@ -0,0 +1,297 @@
+.\" Copyright (c) 2019 by Omar Sandoval <osandov@fb.com>
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.\"
+.TH RWF_ENCODED  7 2019-10-14 "Linux" "Linux Programmer's Manual"
+.SH NAME
+rwf_encoded \- overview of encoded I/O
+.SH DESCRIPTION
+Several filesystems (e.g., Btrfs) support transparent encoding
+(e.g., compression, encryption) of data on disk:
+written data is encoded by the kernel before it is written to disk,
+and read data is decoded before being returned to the user.
+In some cases, it is useful to skip this encoding step.
+For example, the user may want to read the compressed contents of a file
+or write pre-compressed data directly to a file.
+This is referred to as "encoded I/O".
+.SS Encoded I/O API
+Encoded I/O is specified with the
+.B RWF_ENCODED
+flag to
+.BR preadv2 (2)
+and
+.BR pwritev2 (2).
+If
+.B RWF_ENCODED
+is specified, then
+.I iov[0].iov_base
+points to an
+.I
+encoded_iov
+structure, defined in
+.I <linux/fs.h>
+as:
+.PP
+.in +4n
+.EX
+struct encoded_iov {
+    __u64 len;
+    __u64 unencoded_len;
+    __u64 unencoded_offset;
+    __u32 compression;
+    __u32 encryption;
+
+};
+.EE
+.in
+.PP
+.I iov[0].iov_len
+must be set to
+.IR "sizeof(struct\ encoded_iov)" .
+The remaining buffers contain the encoded data.
+.PP
+.I compression
+and
+.I encryption
+are the encoding fields.
+.I compression
+is one of
+.B ENCODED_IOV_COMPRESSION_NONE
+(zero),
+.BR ENCODED_IOV_COMPRESSION_ZLIB ,
+.BR ENCODED_IOV_COMPRESSION_LZO ,
+or
+.BR ENCODED_IOV_COMPRESSION_ZSTD .
+.I encryption
+is currently always
+.B ENCODED_IOV_ENCRYPTION_NONE
+(zero).
+.PP
+.I unencoded_len
+is the length of the unencoded (i.e., decrypted and decompressed) data.
+.I unencoded_offset
+is the offset into the unencoded data where the data in the file begins
+(strictly less than
+.IR unencoded_len ).
+.I len
+is the length of the data in the file.
+.PP
+In most cases,
+.I len
+is equal to
+.I unencoded_len
+and
+.I unencoded_offset
+is zero.
+However, it may be necessary to refer to a subset of the unencoded data,
+usually because a read occurred in the middle of an encoded extent,
+because part of an extent was overwritten or deallocated in some
+way (e.g., with
+.BR write (2),
+.BR truncate (2),
+or
+.BR fallocate (2))
+or because part of an extent was added to the file (e.g., with
+.BR ioctl_ficlonerange (2)
+or
+.BR ioctl_fideduperange (2)).
+For example, if
+.I len
+is 300,
+.I unencoded_len
+is 1000,
+and
+.I unencoded_offset
+is 600,
+then the encoded data is 1000 bytes long when decoded,
+of which only the 300 bytes starting at offset 600 are used;
+the first 600 and last 100 bytes should be ignored.
+.PP
+Additionally,
+.I len
+may be greater than
+.I unencoded_len
+-
+.IR unencoded_offset;
+in this case, the data in the file is longer than the unencoded data,
+and the difference is zero-filled.
+.PP
+If the unencoded data is actually longer than
+.IR unencoded_len ,
+then it is truncated;
+if it is shorter, then it is extended with zeroes.
+.PP
+For
+.BR pwritev2 (),
+the metadata should be specified in
+.IR iov[0] ,
+and the encoded data should be passed in the remaining buffers.
+This returns the number of encoded bytes written (that is, the sum of
+.I iov[n].iov_len
+for 1 <=
+.I n
+<
+.IR iovcnt ;
+partial writes will not occur).
+If the
+.I offset
+argument to
+.BR pwritev2 ()
+is -1, then the file offset is incremented by
+.IR len .
+At least one encoding field must be non-zero.
+Note that the encoded data is not validated when it is written;
+if it is not valid (e.g., it cannot be decompressed),
+then a subsequent read may result in an error.
+.PP
+For
+.BR preadv2 (),
+the metadata is returned in
+.IR iov[0] ,
+and the encoded data is returned in the remaining buffers.
+This returns the number of encoded bytes read.
+Note that a return value of zero does not indicate end of file;
+one should refer to
+.I len
+(for example, a hole in the file has a non-zero
+.I len
+but a zero return value).
+A
+.I len
+of zero indicates end of file.
+If the
+.I offset
+argument to
+.BR preadv2 ()
+is -1, then the file offset is incremented by
+.IR len .
+If the provided buffers are not large enough to return an entire encoded
+extent,
+then this returns -1 and sets
+.I errno
+to
+.BR EFBIG .
+This will only return one encoded extent per call.
+This can also read data which is not encoded;
+all encoding fields will be zero in that case.
+.SS Security
+Encoded I/O creates the potential for some security issues:
+.IP * 3
+Encoded writes allow writing arbitrary data which the kernel will decode on
+a subsequent read. Decompression algorithms are complex and may have bugs
+which can be exploited by malicous data.
+.IP *
+Encoded reads may return data which is not logically present in the file
+(see the discussion of
+.I len
+vs.
+.I unencoded_len
+above).
+It may not be intended for this data to be readable.
+.PP
+Therefore, encoded I/O requires privilege.
+Namely, the
+.B RWF_ENCODED
+flag may only be used when the file was opened with the
+.B O_ENCODED
+flag to
+.BR open (2),
+which requires the
+.B CAP_SYS_ADMIN
+capability.
+.B O_ENCODED
+may be set and cleared with
+.BR fcntl (2).
+Note that it is not cleared on
+.BR fork (2)
+or
+.BR execve (2);
+one may wish to use
+.B O_CLOEXEC
+with
+.BR O_ENCODED .
+.SS Filesystem support
+Encoded I/O is supported on the following filesystems:
+.TP
+Btrfs (since Linux 5.6)
+.IP
+Btrfs supports encoded reads and writes of compressed data.
+The data is encoded as follows:
+.RS
+.IP * 3
+If
+.I compression
+is
+.BR ENCODED_IOV_COMPRESSION_ZLIB ,
+then the encoded data is a single zlib stream.
+.IP *
+If
+.I compression
+is
+.BR ENCODED_IOV_COMPRESSION_LZO ,
+then the encoded data is compressed page by page with LZO1X
+and wrapped in the format described in the Linux kernel source file
+.IR fs/btrfs/lzo.c .
+.IP *
+If
+.I compression
+is
+.BR ENCODED_IOV_COMPRESSION_ZSTD ,
+then the encoded data is a single zstd frame compressed with the
+.I windowLog
+compression parameter set to no more than 17.
+.RE
+.IP
+Additionally, there are some restrictions on
+.BR pwritev2 ():
+.RS
+.IP * 3
+.I offset
+(or the current file offset if
+.I offset
+is -1) must be aligned to the sector size of the filesystem.
+.IP *
+.I len
+must be aligned to the sector size of the filesystem
+unless the data ends at or beyond the current end of the file.
+.IP *
+.I unencoded_len
+and the length of the encoded data must each be no more than 128 KiB.
+This limit may increase in the future.
+.IP *
+The length of the encoded data rounded up to the nearest sector must be
+less than
+.I unencoded_len
+rounded up to the nearest sector.
+.IP *
+Referring to a subset of unencoded data is not yet implemented; i.e.,
+.I len
+must equal
+.I unencoded_len
+and
+.I unencoded_offset
+must be zero.
+.IP *
+Writing compressed inline extents is not yet implemented.
+.RE
-- 
2.23.0

^ permalink raw reply related

* [RFC PATCH v2 0/5] fs: interface for directly reading/writing compressed data
From: Omar Sandoval @ 2019-10-15 18:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs
  Cc: Dave Chinner, Jann Horn, linux-api, kernel-team
In-Reply-To: <c7e8f93596fee7bb818dc0edf29f484036be1abb.1571164851.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

Hello,

This series adds an API for reading compressed data on a filesystem
without decompressing it as well as support for writing compressed data
directly to the filesystem. It is based on my previous series which
added a Btrfs-specific ioctl [1], but it is now an extension to
preadv2()/pwritev2() as suggested by Dave Chinner [2]. I've included a
man page patch describing the API in detail. Test cases and examples
programs are available [3].

The use case that I have in mind is Btrfs send/receive: currently, when
sending data from one compressed filesystem to another, the sending side
decompresses the data and the receiving side recompresses it before
writing it out. This is wasteful and can be avoided if we can just send
and write compressed extents. The send part will be implemented in a
separate series, as this API can stand alone.

Patches 1 and 2 add the VFS support. Patch 3 is a Btrfs prep patch.
Patch 4 implements encoded reads for Btrfs, and patch 5 implements
encoded writes.

Changes from v1 [4]:

- Encoded reads are now also implemented.
- The encoded_iov structure now includes metadata for referring to a
  subset of decoded data. This is required to handle certain cases where
  a compressed extent is truncated, hole punched, or otherwise sliced up
  and Btrfs chooses to reflect this in metadata instead of decompressing
  the whole extent and rewriting the pieces. We call these "bookend
  extents" in Btrfs, but any filesystem supporting transparent encoding
  is likely to have a similar concept.
- The behavior of the filesystem when the decompressed data is longer
  than or shorter than expected is more strictly defined (truncate and
  zero extend, respectively).
- As pointed out by Jann Horn [5], the capability check done at
  read/write time in v1 was incorrect; v2 adds an explicit open flag
  (which can be changed with fcntl()). As this can be trivially combined
  with O_CLOEXEC, I did not add any sort of automatic clearing on exec.

I wanted to get the ball rolling on reviewing the interface, so the
Btrfs implementation has a couple of smaller todos:

- Encoded reads do not yet implement repair for disk/checksum failures.
- Encoded writes do not yet support inline extents or bookend extents.

This is based on v5.4-rc3

Please share any comments on the API or implementation. Thanks!

1: https://lore.kernel.org/linux-fsdevel/cover.1567623877.git.osandov@fb.com/
2: https://lore.kernel.org/linux-fsdevel/20190906212710.GI7452@vader/
3: https://github.com/osandov/xfstests/tree/rwf-encoded
4: https://lore.kernel.org/linux-btrfs/cover.1568875700.git.osandov@fb.com/
5: https://lore.kernel.org/linux-btrfs/CAG48ez2GKv15Uj6Wzv0sG5v2bXyrSaCtRTw5Ok_ovja_CiO_fQ@mail.gmail.com/

Omar Sandoval (5):
  fs: add O_ENCODED open flag
  fs: add RWF_ENCODED for reading/writing compressed data
  btrfs: generalize btrfs_lookup_bio_sums_dio()
  btrfs: implement RWF_ENCODED reads
  btrfs: implement RWF_ENCODED writes

 fs/btrfs/compression.c           |   6 +-
 fs/btrfs/compression.h           |   5 +-
 fs/btrfs/ctree.h                 |   9 +-
 fs/btrfs/file-item.c             |  18 +-
 fs/btrfs/file.c                  |  52 ++-
 fs/btrfs/inode.c                 | 663 ++++++++++++++++++++++++++++++-
 fs/fcntl.c                       |  10 +-
 fs/namei.c                       |   4 +
 include/linux/fcntl.h            |   2 +-
 include/linux/fs.h               |  14 +
 include/uapi/asm-generic/fcntl.h |   4 +
 include/uapi/linux/fs.h          |  26 +-
 mm/filemap.c                     |  82 +++-
 13 files changed, 851 insertions(+), 44 deletions(-)

-- 
2.23.0

^ permalink raw reply

* [RFC PATCH v2 1/5] fs: add O_ENCODED open flag
From: Omar Sandoval @ 2019-10-15 18:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs
  Cc: Dave Chinner, Jann Horn, linux-api, kernel-team
In-Reply-To: <cover.1571164762.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

The upcoming RWF_ENCODED operation introduces some security concerns:

1. Compressed writes will pass arbitrary data to decompression
   algorithms in the kernel.
2. Compressed reads can leak truncated/hole punched data.

Therefore, we need to require privilege for RWF_ENCODED. It's not
possible to do the permissions checks at the time of the read or write
because, e.g., io_uring submits IO from a worker thread. So, add an open
flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
fcntl(). The flag is not cleared in any way on fork or exec; it should
probably be used with O_CLOEXEC in most cases.

Note that the usual issue that unknown open flags are ignored doesn't
really matter for O_ENCODED; if the kernel doesn't support O_ENCODED,
then it doesn't support RWF_ENCODED, either.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/fcntl.c                       | 10 ++++++++--
 fs/namei.c                       |  4 ++++
 include/linux/fcntl.h            |  2 +-
 include/uapi/asm-generic/fcntl.h |  4 ++++
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3d40771e8e7c..45ebc6df078e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -30,7 +30,8 @@
 #include <asm/siginfo.h>
 #include <linux/uaccess.h>
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME | \
+		    O_ENCODED)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -49,6 +50,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 
+	/* O_ENCODED can only be set by superuser */
+	if ((arg & O_ENCODED) && !(filp->f_flags & O_ENCODED) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* required for strict SunOS emulation */
 	if (O_NONBLOCK != O_NDELAY)
 	       if (arg & O_NDELAY)
@@ -1031,7 +1037,7 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+	BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ !=
 		HWEIGHT32(
 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
 			__FMODE_EXEC | __FMODE_NONOTIFY));
diff --git a/fs/namei.c b/fs/namei.c
index 671c3c1a3425..ae86b125888a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2978,6 +2978,10 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
 		return -EPERM;
 
+	/* O_ENCODED can only be set by superuser */
+	if ((flag & O_ENCODED) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	return 0;
 }
 
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index d019df946cb2..5fac02479639 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -9,7 +9,7 @@
 	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
 	 O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \
 	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
-	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_ENCODED)
 
 #ifndef force_o_largefile
 #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 9dc0bf0c5a6e..8c5cbd5942e3 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -97,6 +97,10 @@
 #define O_NDELAY	O_NONBLOCK
 #endif
 
+#ifndef O_ENCODED
+#define O_ENCODED	040000000
+#endif
+
 #define F_DUPFD		0	/* dup */
 #define F_GETFD		1	/* get close_on_exec */
 #define F_SETFD		2	/* set/clear close_on_exec */
-- 
2.23.0

^ permalink raw reply related

* [RFC PATCH v2 2/5] fs: add RWF_ENCODED for reading/writing compressed data
From: Omar Sandoval @ 2019-10-15 18:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs
  Cc: Dave Chinner, Jann Horn, linux-api, kernel-team
In-Reply-To: <cover.1571164762.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

Btrfs supports transparent compression: data written by the user can be
compressed when written to disk and decompressed when read back.
However, we'd like to add an interface to write pre-compressed data
directly to the filesystem, and the matching interface to read
compressed data without decompressing it. This adds support for
so-called "encoded I/O" via preadv2() and pwritev2().

A new RWF_ENCODED flags indicates that a read or write is "encoded". If
this flag is set, iov[0].iov_base points to a struct encoded_iov which
is used for metadata: namely, the compression algorithm, unencoded
(i.e., decompressed) length, and what subrange of the unencoded data
should be used (needed for truncated or hole-punched extents and when
reading in the middle of an extent). For reads, the filesystem returns
this information; for writes, the caller provides it to the filesystem.
iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
used to extend the interface in the future. The remaining iovecs contain
the encoded extent.

Filesystems must indicate that they support encoded writes by setting
FMODE_ENCODED_IO in ->file_open().

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 include/linux/fs.h      | 14 +++++++
 include/uapi/linux/fs.h | 26 ++++++++++++-
 mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
 3 files changed, 108 insertions(+), 14 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e0d909d35763..54681f21e05e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
+/* File supports encoded IO */
+#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are
@@ -314,6 +317,7 @@ enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_ENCODED		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+struct encoded_iov;
+extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
+extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
+extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
+				struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *count, unsigned int remap_flags);
@@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 			return -EOPNOTSUPP;
 		ki->ki_flags |= IOCB_NOWAIT;
 	}
+	if (flags & RWF_ENCODED) {
+		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
+			return -EOPNOTSUPP;
+		ki->ki_flags |= IOCB_ENCODED;
+	}
 	if (flags & RWF_HIPRI)
 		ki->ki_flags |= IOCB_HIPRI;
 	if (flags & RWF_DSYNC)
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 379a612f8f1d..ed92a8a257cb 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -284,6 +284,27 @@ struct fsxattr {
 
 typedef int __bitwise __kernel_rwf_t;
 
+enum {
+	ENCODED_IOV_COMPRESSION_NONE,
+	ENCODED_IOV_COMPRESSION_ZLIB,
+	ENCODED_IOV_COMPRESSION_LZO,
+	ENCODED_IOV_COMPRESSION_ZSTD,
+	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
+};
+
+enum {
+	ENCODED_IOV_ENCRYPTION_NONE,
+	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
+};
+
+struct encoded_iov {
+	__u64 len;
+	__u64 unencoded_len;
+	__u64 unencoded_offset;
+	__u32 compression;
+	__u32 encryption;
+};
+
 /* high priority request, poll if possible */
 #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
 
@@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* encoded (e.g., compressed or encrypted) IO */
+#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_ENCODED)
 
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 1146fcfa3215..d2e6d9caf353 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
 	return 0;
 }
 
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	loff_t count;
-	int ret;
 
 	if (IS_SWAPFILE(inode))
 		return -ETXTBSY;
 
-	if (!iov_iter_count(from))
+	if (!*count)
 		return 0;
 
 	/* FIXME: this is for backwards compatibility with 2.4 */
@@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 		return -EINVAL;
 
-	count = iov_iter_count(from);
-	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+}
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns a negative errno or the new number of bytes to write.
+ */
+inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	loff_t count = iov_iter_count(from);
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
 	if (ret)
 		return ret;
 
@@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int generic_encoded_write_checks(struct kiocb *iocb,
+				 struct encoded_iov *encoded)
+{
+	loff_t count = encoded->unencoded_len;
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
+	if (ret)
+		return ret;
+
+	if (count != encoded->unencoded_len) {
+		/*
+		 * The write got truncated by generic_write_checks_common(). We
+		 * can't do a partial encoded write.
+		 */
+		return -EFBIG;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_encoded_write_checks);
+
+ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+	if (!(iocb->ki_filp->f_flags & O_ENCODED))
+		return -EPERM;
+	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
+		return -EINVAL;
+	return iov_iter_count(iter) - sizeof(struct encoded_iov);
+}
+EXPORT_SYMBOL(check_encoded_read);
+
+int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
+			 struct iov_iter *from)
+{
+	if (!(iocb->ki_filp->f_flags & O_ENCODED))
+		return -EPERM;
+	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
+		return -EINVAL;
+	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
+		return -EFAULT;
+	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
+	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
+		return -EINVAL;
+	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
+	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
+		return -EINVAL;
+	if (encoded->unencoded_offset >= encoded->unencoded_len)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(import_encoded_write);
+
 /*
  * Performs necessary checks before doing a clone.
  *
-- 
2.23.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox