From: Kees Cook <keescook@chromium.org>
To: jeffxu@chromium.org
Cc: skhan@linuxfoundation.org, akpm@linux-foundation.org,
dmitry.torokhov@gmail.com, dverkamp@chromium.org,
hughd@google.com, jeffxu@google.com, jorgelo@chromium.org,
linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org,
linux-mm@kvack.org, jannh@google.com,
linux-hardening@vger.kernel.org,
kernel test robot <lkp@intel.com>
Subject: Re: [PATCH v6 3/6] mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC
Date: Thu, 8 Dec 2022 08:27:00 -0800 [thread overview]
Message-ID: <202212080821.5AE7EE99@keescook> (raw)
In-Reply-To: <20221207154939.2532830-4-jeffxu@google.com>
On Wed, Dec 07, 2022 at 03:49:36PM +0000, jeffxu@chromium.org wrote:
> From: Jeff Xu <jeffxu@google.com>
>
> The new MFD_NOEXEC_SEAL and MFD_EXEC flags allows application to
> set executable bit at creation time (memfd_create).
>
> When MFD_NOEXEC_SEAL is set, memfd is created without executable bit
> (mode:0666), and sealed with F_SEAL_EXEC, so it can't be chmod to
> be executable (mode: 0777) after creation.
>
> when MFD_EXEC flag is set, memfd is created with executable bit
> (mode:0777), this is the same as the old behavior of memfd_create.
>
> The new pid namespaced sysctl vm.memfd_noexec has 3 values:
> 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like
> MFD_EXEC was set.
> 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like
> MFD_NOEXEC_SEAL was set.
> 2: memfd_create() without MFD_NOEXEC_SEAL will be rejected.
>
> The sysctl allows finer control of memfd_create for old-software
> that doesn't set the executable bit, for example, a container with
> vm.memfd_noexec=1 means the old-software will create non-executable
> memfd by default. Also, the value of memfd_noexec is passed to child
> namespace at creation time. For example, if the init namespace has
> vm.memfd_noexec=2, all its children namespaces will be created with 2.
>
> Signed-off-by: Jeff Xu <jeffxu@google.com>
> Co-developed-by: Daniel Verkamp <dverkamp@chromium.org>
> Signed-off-by: Daniel Verkamp <dverkamp@chromium.org>
> Reported-by: kernel test robot <lkp@intel.com>
Please rearrange these tags, and add a link to the lkp report:
Reported-by: kernel test robot <lkp@intel.com>
Link: ...url.to.lkp.lore.email...
Co-developed-by: Daniel Verkamp <dverkamp@chromium.org>
Signed-off-by: Daniel Verkamp <dverkamp@chromium.org>
Signed-off-by: Jeff Xu <jeffxu@google.com>
> ---
> include/linux/pid_namespace.h | 19 +++++++++++
> include/uapi/linux/memfd.h | 4 +++
> kernel/pid_namespace.c | 5 +++
> kernel/pid_sysctl.h | 59 +++++++++++++++++++++++++++++++++++
> mm/memfd.c | 48 ++++++++++++++++++++++++++--
> 5 files changed, 133 insertions(+), 2 deletions(-)
> create mode 100644 kernel/pid_sysctl.h
>
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 07481bb87d4e..a4789a7b34a9 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -16,6 +16,21 @@
>
> struct fs_pin;
>
> +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> +/*
> + * sysctl for vm.memfd_noexec
> + * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
> + * acts like MFD_EXEC was set.
> + * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
> + * acts like MFD_NOEXEC_SEAL was set.
> + * 2: memfd_create() without MFD_NOEXEC_SEAL will be
> + * rejected.
> + */
> +#define MEMFD_NOEXEC_SCOPE_EXEC 0
> +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1
> +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2
These don't align? I think a tab is missing on MEMFD_NOEXEC_SCOPE_EXEC.
> +#endif
> +
> struct pid_namespace {
> struct idr idr;
> struct rcu_head rcu;
> @@ -31,6 +46,10 @@ struct pid_namespace {
> struct ucounts *ucounts;
> int reboot; /* group exit code if this pidns was rebooted */
> struct ns_common ns;
> +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> + /* sysctl for vm.memfd_noexec */
> + int memfd_noexec_scope;
> +#endif
> } __randomize_layout;
>
> extern struct pid_namespace init_pid_ns;
> diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
> index 7a8a26751c23..273a4e15dfcf 100644
> --- a/include/uapi/linux/memfd.h
> +++ b/include/uapi/linux/memfd.h
> @@ -8,6 +8,10 @@
> #define MFD_CLOEXEC 0x0001U
> #define MFD_ALLOW_SEALING 0x0002U
> #define MFD_HUGETLB 0x0004U
> +/* not executable and sealed to prevent changing to executable. */
> +#define MFD_NOEXEC_SEAL 0x0008U
> +/* executable */
> +#define MFD_EXEC 0x0010U
>
> /*
> * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index f4f8cb0435b4..8a98b1af9376 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -23,6 +23,7 @@
> #include <linux/sched/task.h>
> #include <linux/sched/signal.h>
> #include <linux/idr.h>
> +#include "pid_sysctl.h"
>
> static DEFINE_MUTEX(pid_caches_mutex);
> static struct kmem_cache *pid_ns_cachep;
> @@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> ns->ucounts = ucounts;
> ns->pid_allocated = PIDNS_ADDING;
>
> + initialize_memfd_noexec_scope(ns);
> +
> return ns;
>
> out_free_idr:
> @@ -455,6 +458,8 @@ static __init int pid_namespaces_init(void)
> #ifdef CONFIG_CHECKPOINT_RESTORE
> register_sysctl_paths(kern_path, pid_ns_ctl_table);
> #endif
> +
> + register_pid_ns_sysctl_table_vm();
> return 0;
> }
>
> diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
> new file mode 100644
> index 000000000000..5986d6493b5b
> --- /dev/null
> +++ b/kernel/pid_sysctl.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef LINUX_PID_SYSCTL_H
> +#define LINUX_PID_SYSCTL_H
> +
> +#include <linux/pid_namespace.h>
> +
> +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
> +{
> + ns->memfd_noexec_scope =
> + task_active_pid_ns(current)->memfd_noexec_scope;
> +}
> +
> +static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
> + int write, void *buf, size_t *lenp, loff_t *ppos)
> +{
> + struct pid_namespace *ns = task_active_pid_ns(current);
> + struct ctl_table table_copy;
> +
> + if (write && !capable(CAP_SYS_ADMIN))
> + return -EPERM;
Should this be CAP_SYS_ADMIN within the userns, rather than the global
init_task CAP_SYS_ADMIN?
> +
> + table_copy = *table;
> + if (ns != &init_pid_ns)
> + table_copy.data = &ns->memfd_noexec_scope;
> +
> + /*
> + * set minimum to current value, the effect is only bigger
> + * value is accepted.
> + */
> + if (*(int *)table_copy.data > *(int *)table_copy.extra1)
> + table_copy.extra1 = table_copy.data;
> +
> + return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
> +}
> +
> +static struct ctl_table pid_ns_ctl_table_vm[] = {
> + {
> + .procname = "memfd_noexec",
> + .data = &init_pid_ns.memfd_noexec_scope,
> + .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
> + .mode = 0644,
> + .proc_handler = pid_mfd_noexec_dointvec_minmax,
> + .extra1 = SYSCTL_ZERO,
> + .extra2 = SYSCTL_TWO,
> + },
> + { }
> +};
> +static struct ctl_path vm_path[] = { { .procname = "vm", }, { } };
> +static inline void register_pid_ns_sysctl_table_vm(void)
> +{
> + register_sysctl_paths(vm_path, pid_ns_ctl_table_vm);
> +}
> +#else
> +static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {}
> +static inline void register_pid_ns_ctl_table_vm(void) {}
> +#endif
> +
> +#endif /* LINUX_PID_SYSCTL_H */
> diff --git a/mm/memfd.c b/mm/memfd.c
> index 4ebeab94aa74..ec70675a7069 100644
> --- a/mm/memfd.c
> +++ b/mm/memfd.c
> @@ -18,6 +18,7 @@
> #include <linux/hugetlb.h>
> #include <linux/shmem_fs.h>
> #include <linux/memfd.h>
> +#include <linux/pid_namespace.h>
> #include <uapi/linux/memfd.h>
>
> /*
> @@ -263,12 +264,14 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
> #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
> #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
>
> -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
> +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
>
> SYSCALL_DEFINE2(memfd_create,
> const char __user *, uname,
> unsigned int, flags)
> {
> + char comm[TASK_COMM_LEN];
I'm fine with using "comm", but technically, it's not needed: task->comm
will always be %NUL terminated.
> + struct pid_namespace *ns;
> unsigned int *file_seals;
> struct file *file;
> int fd, error;
> @@ -285,6 +288,39 @@ SYSCALL_DEFINE2(memfd_create,
> return -EINVAL;
> }
>
> + /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
> + if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
> + return -EINVAL;
> +
> + if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
> +#ifdef CONFIG_SYSCTL
> + int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
> +
> + ns = task_active_pid_ns(current);
> + if (ns)
> + sysctl = ns->memfd_noexec_scope;
> +
> + switch (sysctl) {
> + case MEMFD_NOEXEC_SCOPE_EXEC:
> + flags |= MFD_EXEC;
> + break;
> + case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
> + flags |= MFD_NOEXEC_SEAL;
> + break;
> + default:
> + pr_warn_ratelimited(
> + "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
> + task_pid_nr(current), get_task_comm(comm, current));
> + return -EINVAL;
> + }
> +#else
> + flags |= MFD_EXEC;
> +#endif
> + pr_warn_ratelimited(
> + "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n",
> + task_pid_nr(current), get_task_comm(comm, current));
> + }
> +
> /* length includes terminating zero */
> len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
> if (len <= 0)
> @@ -328,7 +364,15 @@ SYSCALL_DEFINE2(memfd_create,
> file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> file->f_flags |= O_LARGEFILE;
>
> - if (flags & MFD_ALLOW_SEALING) {
> + if (flags & MFD_NOEXEC_SEAL) {
> + struct inode *inode = file_inode(file);
> +
> + inode->i_mode &= ~0111;
> + file_seals = memfd_file_seals_ptr(file);
> + *file_seals &= ~F_SEAL_SEAL;
> + *file_seals |= F_SEAL_EXEC;
> + } else if (flags & MFD_ALLOW_SEALING) {
> + /* MFD_EXEC and MFD_ALLOW_SEALING are set */
> file_seals = memfd_file_seals_ptr(file);
> *file_seals &= ~F_SEAL_SEAL;
> }
> --
> 2.39.0.rc0.267.gcb52ba06e7-goog
>
Otherwise looks good!
--
Kees Cook
next prev parent reply other threads:[~2022-12-08 16:27 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-12-07 15:49 [PATCH v6 0/6] mm/memfd: introduce MFD_NOEXEC_SEAL and MFD_EXEC jeffxu
2022-12-07 15:49 ` [PATCH v6 1/6] mm/memfd: add F_SEAL_EXEC jeffxu
2022-12-08 16:16 ` Kees Cook
2022-12-07 15:49 ` [PATCH v6 2/6] selftests/memfd: add tests for F_SEAL_EXEC jeffxu
2022-12-08 16:21 ` Kees Cook
2022-12-07 15:49 ` [PATCH v6 3/6] mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC jeffxu
2022-12-08 16:27 ` Kees Cook [this message]
2022-12-08 22:55 ` Jeff Xu
2022-12-16 15:46 ` Peter Xu
2022-12-16 17:15 ` Jeff Xu
2022-12-16 17:42 ` Andrew Morton
2022-12-16 18:11 ` Jeff Xu
2022-12-16 20:35 ` Kees Cook
2022-12-16 21:46 ` Jeff Xu
2022-12-16 22:06 ` Andrew Morton
2022-12-16 23:40 ` Jeff Xu
2022-12-20 16:55 ` Shuah Khan
2022-12-23 18:06 ` Jeff Xu
2022-12-07 15:49 ` [PATCH v6 4/6] mm/memfd: Add write seals when apply SEAL_EXEC to executable memfd jeffxu
2022-12-08 16:27 ` Kees Cook
2022-12-07 15:49 ` [PATCH v6 5/6] selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC jeffxu
2022-12-08 16:30 ` Kees Cook
2022-12-07 15:49 ` [PATCH v6 6/6] mm/memfd: security hook for memfd_create jeffxu
2022-12-08 16:32 ` Kees Cook
2022-12-08 16:46 ` Kees Cook
2022-12-08 16:13 ` [PATCH v6 0/6] mm/memfd: introduce MFD_NOEXEC_SEAL and MFD_EXEC Kees Cook
2022-12-08 18:33 ` Jeff Xu
2022-12-08 20:55 ` Kees Cook
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=202212080821.5AE7EE99@keescook \
--to=keescook@chromium.org \
--cc=akpm@linux-foundation.org \
--cc=dmitry.torokhov@gmail.com \
--cc=dverkamp@chromium.org \
--cc=hughd@google.com \
--cc=jannh@google.com \
--cc=jeffxu@chromium.org \
--cc=jeffxu@google.com \
--cc=jorgelo@chromium.org \
--cc=linux-hardening@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lkp@intel.com \
--cc=skhan@linuxfoundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.