From: Cong Wang <xiyou.wangcong@gmail.com>
To: Kees Cook <kees@kernel.org>, linux-kernel@vger.kernel.org
Cc: Andy Lutomirski <luto@amacapital.net>,
Will Drewry <wad@chromium.org>,
Christian Brauner <brauner@kernel.org>,
Cong Wang <cwang@multikernel.io>
Subject: [RFC PATCH 1/3] seccomp: add SECCOMP_IOCTL_NOTIF_PIN_ARGS to close the unotify TOCTOU race
Date: Sun, 3 May 2026 18:12:05 -0700 [thread overview]
Message-ID: <20260504011207.539408-2-xiyou.wangcong@gmail.com> (raw)
In-Reply-To: <20260504011207.539408-1-xiyou.wangcong@gmail.com>
From: Cong Wang <cwang@multikernel.io>
seccomp_unotify(2) leaves a documented TOCTOU window for unprivileged
supervisors: a sibling thread or CLONE_VM peer can mutate pointer-arg
buffers between the supervisor's process_vm_readv() and the kernel's
re-read on SECCOMP_USER_NOTIF_FLAG_CONTINUE. ptrace()/proc/pid/mem are
not available to unprivileged supervisors, so today there is no
race-free path for argument-content policy on CONTINUE.
This patch adds SECCOMP_IOCTL_NOTIF_PIN_ARGS, which atomically copies
designated pointer-arg payloads from the trapped task's address space
into kernel-owned buffers and binds those buffers to the task's next
syscall execution. On SECCOMP_USER_NOTIF_FLAG_CONTINUE_PINNED, the
syscall-body fetch points consume from the kernel buffer instead of
re-reading user memory; mutations after PIN_ARGS returns have no
effect.
Three v1 shapes are supported: a fixed-size copy (sockaddr, single-
buffer write/read content), a NUL-bounded C string (paths), and a
NULL-terminated array of C strings (argv/envp). Each per-arg
descriptor caps copy size; total cumulative bytes per request are
bounded at a hardcoded 1 MiB. Pinned-buffer allocations are tagged
GFP_KERNEL_ACCOUNT so the trapped task's memcg pays the cost.
Pin orchestration uses a three-phase lock dance: validate the notif
and snapshot register args under the filter notify lock, walk the
trapped task's mm without locks, then re-validate and attach the
snapshot. The pin is one-shot: a task_work clears it on the next
return-to-userspace after the resumed syscall body completes, with
fallback paths for task exit, listener release, and explicit discard
(CONTINUE without CONTINUE_PINNED). The syscall number is captured at
pin time and verified at consumption so a signal-handler-issued
syscall during -ERESTART* resolution will not consume the pin.
Assisted-by: Claude:claude-opus-4.6
Signed-off-by: Cong Wang <cwang@multikernel.io>
---
MAINTAINERS | 2 +
fs/exec.c | 63 +++++
fs/namei.c | 19 ++
fs/read_write.c | 8 +-
include/linux/mm.h | 2 +-
include/linux/seccomp.h | 35 +++
include/linux/seccomp_types.h | 33 +++
include/uapi/linux/seccomp.h | 73 ++++++
kernel/Makefile | 1 +
kernel/exit.c | 1 +
kernel/fork.c | 5 +
kernel/seccomp.c | 189 +++++++++++++-
kernel/seccomp_pin.c | 453 ++++++++++++++++++++++++++++++++++
kernel/seccomp_pin.h | 109 ++++++++
lib/iov_iter.c | 22 ++
mm/memory.c | 4 +-
mm/nommu.c | 4 +-
net/socket.c | 16 ++
18 files changed, 1026 insertions(+), 13 deletions(-)
create mode 100644 kernel/seccomp_pin.c
create mode 100644 kernel/seccomp_pin.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 882214b0e7db..d7904e8989ca 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24086,6 +24086,8 @@ F: Documentation/userspace-api/seccomp_filter.rst
F: include/linux/seccomp.h
F: include/uapi/linux/seccomp.h
F: kernel/seccomp.c
+F: kernel/seccomp_pin.c
+F: kernel/seccomp_pin.h
F: tools/testing/selftests/kselftest_harness.h
F: tools/testing/selftests/kselftest_harness/
F: tools/testing/selftests/seccomp/*
diff --git a/fs/exec.c b/fs/exec.c
index ba12b4c466f6..99d4a3daaeeb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -38,6 +38,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
+#include <linux/seccomp.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
@@ -445,6 +446,63 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* processes's memory to the new process's stack. The call to get_user_pages()
* ensures the destination page is created and not swapped out.
*/
+/*
+ * If a seccomp PIN_ARGS snapshot covers this argv/envp pointer table,
+ * push each pinned string onto the bprm stack directly via
+ * copy_string_kernel(), bypassing the per-string strnlen_user() and
+ * copy_from_user() that would otherwise re-read mutated user memory.
+ *
+ * Returns 0 on success, a negative errno on failure, or +1 if no pin
+ * applied and the caller should run the normal user-memory walk.
+ */
+static int copy_strings_from_pin(struct user_arg_ptr argv,
+ struct linux_binprm *bprm)
+{
+ const struct seccomp_pinned_arg *pin;
+ const u32 *header;
+ const char *strings;
+ u32 count, i;
+ u64 user_argv;
+
+#ifdef CONFIG_COMPAT
+ user_argv = (u64)(uintptr_t)(argv.is_compat ?
+ (const void __user *)argv.ptr.compat :
+ (const void __user *)argv.ptr.native);
+#else
+ user_argv = (u64)(uintptr_t)argv.ptr.native;
+#endif
+ if (!user_argv)
+ return 1;
+
+ pin = seccomp_pin_lookup_current(user_argv);
+ if (!pin || pin->kind != SECCOMP_PIN_CSTRING_ARRAY)
+ return 1;
+
+ /*
+ * Packed CSTRING_ARRAY layout (see kernel/seccomp_pin.c):
+ * [u32 count][u32 offsets[count]][u8 strings[]]
+ * @header and @strings deliberately alias the same buffer; each
+ * offset is relative to the buffer start, so strings + off lands
+ * on a NUL-terminated string written by the pin walker.
+ */
+ header = pin->data;
+ count = header[0];
+ strings = (const char *)pin->data;
+
+ /*
+ * NOTE(review): @count is taken from the kernel snapshot, but the
+ * caller's argc was computed by count()'s own (separate) walk of
+ * user memory. If the two disagree, bprm->argc will not match the
+ * number of strings actually pushed here — confirm the
+ * copy_strings() caller reconciles argc with the pinned count.
+ */
+
+ /*
+ * copy_strings() processes argv backwards (highest index first)
+ * because it grows the bprm stack downward. Match that ordering
+ * so the resulting stack layout is identical.
+ */
+ for (i = count; i-- > 0; ) {
+ u32 off = header[1 + i];
+ int ret;
+
+ /* Defensive: offsets are kernel-built, but never index past
+ * the snapshot. (The string's own NUL bound is guaranteed by
+ * the walker — presumably within pin->size; TODO confirm.)
+ */
+ if (off >= pin->size)
+ return -EINVAL;
+ ret = copy_string_kernel(strings + off, bprm);
+ if (ret < 0)
+ return ret;
+ if (fatal_signal_pending(current))
+ return -ERESTARTNOHAND;
+ cond_resched();
+ }
+ return 0;
+}
+
static int copy_strings(int argc, struct user_arg_ptr argv,
struct linux_binprm *bprm)
{
@@ -453,6 +511,11 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
unsigned long kpos = 0;
int ret;
+ ret = copy_strings_from_pin(argv, bprm);
+ if (ret <= 0)
+ return ret;
+ /* No pin matched; continue with the normal user-memory walk. */
+
while (argc-- > 0) {
const char __user *str;
int len;
diff --git a/fs/namei.c b/fs/namei.c
index c7fac83c9a85..ee86f4c91cae 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -30,6 +30,7 @@
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
+#include <linux/seccomp.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
@@ -222,6 +223,24 @@ do_getname(const char __user *filename, int flags, bool incomplete)
struct filename *
getname_flags(const char __user *filename, int flags)
{
+ const struct seccomp_pinned_arg *pin;
+
+ /*
+ * If a seccomp supervisor pinned this path via PIN_ARGS and sent
+ * CONTINUE_PINNED, build the struct filename from the kernel-side
+ * snapshot instead of re-reading user memory. The pinned buffer
+ * is NUL-terminated by copy_remote_vm_str() in the walker, so
+ * getname_kernel() can consume it directly.
+ *
+ * The empty-path-with-LOOKUP_EMPTY policy is handled here because
+ * getname_kernel() does not reject empty strings.
+ */
+ pin = seccomp_pin_lookup_current((u64)(uintptr_t)filename);
+ if (pin && pin->kind == SECCOMP_PIN_CSTRING) {
+ if (pin->size <= 1 && !(flags & LOOKUP_EMPTY))
+ return ERR_PTR(-ENOENT);
+ /*
+ * NOTE(review): the getname_kernel() path presumably skips
+ * audit_getname() and leaves filename->uptr NULL, so pinned
+ * paths may be missing from audit records and fault
+ * reporting sees a kernel name — confirm this is acceptable
+ * for audited workloads.
+ */
+ return getname_kernel(pin->data);
+ }
 return do_getname(filename, flags, false);
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 50bff7edc91f..59877e8422a8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -488,7 +488,9 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, ITER_DEST, buf, len);
+ ret = import_ubuf(ITER_DEST, buf, len, &iter);
+ if (unlikely(ret))
+ return ret;
ret = filp->f_op->read_iter(&kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -590,7 +592,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
+ ret = import_ubuf(ITER_SOURCE, (void __user *)buf, len, &iter);
+ if (unlikely(ret))
+ return ret;
ret = filp->f_op->write_iter(&kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index af23453e9dbd..b0116e8ed407 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3187,7 +3187,7 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
-#ifdef CONFIG_BPF_SYSCALL
+#if defined(CONFIG_BPF_SYSCALL) || defined(CONFIG_SECCOMP_FILTER)
extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
#endif
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 9b959972bf4a..fcc369d3dfca 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -75,6 +75,35 @@ static inline int seccomp_mode(struct seccomp *s)
#ifdef CONFIG_SECCOMP_FILTER
extern void seccomp_filter_release(struct task_struct *tsk);
extern void get_seccomp_filter(struct task_struct *tsk);
+extern void seccomp_clear_pinned_args(struct task_struct *tsk);
+
+/**
+ * seccomp_pin_lookup_current - find a live PIN_ARGS snapshot for current().
+ * @user_addr: the userspace address the syscall body is about to read.
+ *
+ * Called from syscall fetch points (getname_flags, copy_strings,
+ * move_addr_to_kernel, import_ubuf). Returns a pinned-arg entry whose
+ * @data / @size the caller may consume in place of re-reading user
+ * memory, or NULL if there is no live snapshot, the current syscall
+ * does not match the one captured at pin time, or no entry matches
+ * @user_addr.
+ *
+ * Safe to call lockless: current owns its seccomp.pinned_args field
+ * once the PIN_ARGS orchestrator has installed it via WRITE_ONCE.
+ */
+const struct seccomp_pinned_arg *seccomp_pin_lookup_current(u64 user_addr);
+
+/**
+ * seccomp_pin_kvec_for - return a stable kvec for the given pin entry.
+ * @pin: a pin returned by seccomp_pin_lookup_current(); must belong
+ * to the current task.
+ *
+ * The returned pointer references kvec storage that outlives the pin
+ * (freed at syscall exit), suitable for iov_iter_kvec() callers whose
+ * iov_iter consumes after the wrapping function returns.
+ */
+struct kvec;
+const struct kvec *seccomp_pin_kvec_for(const struct seccomp_pinned_arg *pin);
#else /* CONFIG_SECCOMP_FILTER */
static inline void seccomp_filter_release(struct task_struct *tsk)
{
@@ -84,6 +113,12 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
{
return;
}
+static inline void seccomp_clear_pinned_args(struct task_struct *tsk) { }
+static inline const struct seccomp_pinned_arg *
+seccomp_pin_lookup_current(u64 user_addr) { return NULL; }
+struct kvec;
+static inline const struct kvec *
+seccomp_pin_kvec_for(const struct seccomp_pinned_arg *pin) { return NULL; }
#endif /* CONFIG_SECCOMP_FILTER */
#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
diff --git a/include/linux/seccomp_types.h b/include/linux/seccomp_types.h
index cf0a0355024f..bd3fe17e659a 100644
--- a/include/linux/seccomp_types.h
+++ b/include/linux/seccomp_types.h
@@ -7,6 +7,34 @@
#ifdef CONFIG_SECCOMP
struct seccomp_filter;
+struct seccomp_pinned_args;
+
+#define SECCOMP_PIN_MAX_ARGS 6
+
+/**
+ * struct seccomp_pinned_arg - one kernel-owned snapshot of a user-pointer arg.
+ * @user_addr: the original userspace address (key for lookup at consumption).
+ * @size: bytes actually populated in @data.
+ * @arg_idx: syscall register slot 0..5.
+ * @kind: one of SECCOMP_PIN_*.
+ * @data: kvmalloc'd buffer holding the snapshotted bytes.
+ *
+ * Consumption sites (getname_flags, copy_strings, move_addr_to_kernel,
+ * import_ubuf) inspect @data and @size after a successful
+ * seccomp_pin_lookup_current(). For sites that need a stable kvec
+ * pointer outliving the call (import_ubuf -> vfs_write iter),
+ * seccomp_pin_kvec_for() returns a kvec stored alongside the pin
+ * with matching lifetime.
+ */
+struct seccomp_pinned_arg {
+ u64 user_addr;
+ u32 size;
+ u8 arg_idx;
+ u8 kind;
+ u16 _pad;
+ void *data;
+};
+
/**
* struct seccomp - the state of a seccomp'ed process
*
@@ -18,11 +46,16 @@ struct seccomp_filter;
*
* @filter must only be accessed from the context of current as there
* is no read locking.
+ * @pinned_args: NULL except during a PIN_ARGS window. Owned by the trapped
+ * task itself; populated by SECCOMP_IOCTL_NOTIF_PIN_ARGS, consumed
+ * on CONTINUE_PINNED, freed at syscall exit, listener release, or
+ * task exit. See kernel/seccomp_pin.c.
*/
struct seccomp {
int mode;
atomic_t filter_count;
struct seccomp_filter *filter;
+ struct seccomp_pinned_args *pinned_args;
};
#else
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index dbfc9b37fcae..51cf081cbc5a 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -154,4 +154,77 @@ struct seccomp_notif_addfd {
#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
+/*
+ * SECCOMP_IOCTL_NOTIF_PIN_ARGS — atomically snapshot the trapped child's
+ * pointer-arg payloads into kernel buffers, populate the supervisor's
+ * byte buffer, and bind the snapshot to the child for re-execution.
+ *
+ * On NOTIF_SEND with SECCOMP_USER_NOTIF_FLAG_CONTINUE_PINNED, the kernel
+ * consumes from the pinned buffers instead of re-reading user memory,
+ * closing the documented TOCTOU race in seccomp_unotify(2).
+ */
+
+/* Shape of a pointer-arg to be pinned. */
+#define SECCOMP_PIN_FIXED 0 /* exactly max_bytes from user_addr */
+#define SECCOMP_PIN_CSTRING 1 /* walk to NUL, capped at max_bytes */
+#define SECCOMP_PIN_CSTRING_ARRAY 2 /* NULL-term array of CSTRINGs */
+#define SECCOMP_PIN_KIND_MAX 2
+
+/* New NOTIF_SEND response flag (paired with CONTINUE). */
+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE_PINNED (1UL << 1)
+
+/* Bits for seccomp_pin_arg.truncated. */
+#define SECCOMP_PIN_TRUNCATED_BYTES (1U << 0)
+#define SECCOMP_PIN_TRUNCATED_ENTRIES (1U << 1)
+
+/**
+ * struct seccomp_pin_arg - per-arg pin descriptor (in/out).
+ * @arg_idx: syscall register slot (0..5).
+ * @kind: one of SECCOMP_PIN_*.
+ * @max_bytes: hard cap on bytes copied for this arg; kernel may copy less.
+ * @max_entries: hard cap on pointer-table entries (CSTRING_ARRAY only).
+ * @actual_size: bytes the kernel actually populated for this arg (out).
+ * @actual_entries: entries actually walked (CSTRING_ARRAY only, out).
+ * @truncated: bitmask of SECCOMP_PIN_TRUNCATED_* (out).
+ * @user_addr: the userspace address the kernel snapshotted (out, echoed).
+ * @buf_offset: offset into the supervisor's buf where this arg's bytes
+ * begin (out).
+ */
+struct seccomp_pin_arg {
+ /* in */
+ __u8 arg_idx;
+ __u8 kind;
+ __u16 _reserved;
+ __u32 max_bytes;
+ __u32 max_entries;
+ __u32 _reserved2;
+ /* out */
+ __u32 actual_size;
+ __u32 actual_entries;
+ __u32 truncated;
+ __u32 _reserved3;
+ __u64 user_addr;
+ __u64 buf_offset;
+};
+
+/**
+ * struct seccomp_notif_pin_args - PIN_ARGS ioctl payload (in/out).
+ * @id: notification id from NOTIF_RECV.
+ * @nr_args: count of valid entries in @args (1..6).
+ * @buf_size: size in bytes of @buf.
+ * @buf: user pointer to the bulk byte buffer; the kernel writes
+ * copied bytes here, indexed by args[i].buf_offset.
+ * @args: per-arg descriptors; only args[0..nr_args-1] are read/written.
+ */
+struct seccomp_notif_pin_args {
+ __u64 id;
+ __u32 nr_args;
+ __u32 buf_size;
+ __u64 buf;
+ struct seccomp_pin_arg args[6];
+};
+
+#define SECCOMP_IOCTL_NOTIF_PIN_ARGS SECCOMP_IOWR(5, \
+ struct seccomp_notif_pin_args)
+
#endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 6785982013dc..7fb35fa1b43a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o
obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o
obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_SECCOMP_FILTER) += seccomp_pin.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 25e9cb6de7e7..5d1c54000405 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -917,6 +917,7 @@ void __noreturn do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
seccomp_filter_release(tsk);
+ seccomp_clear_pinned_args(tsk);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5f3fdfdb14c7..a5b7dbf21932 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1763,6 +1763,11 @@ static void copy_seccomp(struct task_struct *p)
/* Ref-count the new filter user, and assign it. */
get_seccomp_filter(current);
p->seccomp = current->seccomp;
+ /*
+ * pinned_args is a per-trapped-task transient that belongs to the
+ * outstanding notification on the parent (if any). Don't inherit it.
+ */
+ p->seccomp.pinned_args = NULL;
/*
* Explicitly enable no_new_privs here in case it got set
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 066909393c38..66b7a8e4fcab 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -44,6 +44,8 @@
#include <linux/anon_inodes.h>
#include <linux/lockdep.h>
+#include "seccomp_pin.h"
+
/*
* When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
* wrong direction flag in the ioctl number. This is the broken one,
@@ -97,6 +99,13 @@ struct seccomp_knotif {
/* outstanding addfd requests */
struct list_head addfd;
+
+ /*
+ * A SECCOMP_IOCTL_NOTIF_PIN_ARGS for this notification is mid-walk
+ * (i.e. inside Phase B's lockless mm scan). Concurrent PIN_ARGS
+ * ioctls for the same id bail with -EBUSY rather than racing.
+ */
+ bool pin_in_progress;
};
/**
@@ -1475,6 +1484,13 @@ static void seccomp_notify_detach(struct seccomp_filter *filter)
knotif->error = -ENOSYS;
knotif->val = 0;
+ /*
+ * Drop any PIN_ARGS snapshot held on the trapped task; the
+ * supervisor that owned this notif fd is gone, so the pin
+ * can never be consumed via CONTINUE_PINNED.
+ */
+ seccomp_clear_pinned_args(knotif->task);
+
/*
* We do not need to wake up any pending addfd messages, as
* the notifier will do that for us, as this just looks
@@ -1498,7 +1514,7 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
/* must be called with notif_lock held */
static inline struct seccomp_knotif *
-find_notification(struct seccomp_filter *filter, u64 id)
+seccomp_find_notification(struct seccomp_filter *filter, u64 id)
{
struct seccomp_knotif *cur;
@@ -1607,7 +1623,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
* sure it's still around.
*/
mutex_lock(&filter->notify_lock);
- knotif = find_notification(filter, unotif.id);
+ knotif = seccomp_find_notification(filter, unotif.id);
if (knotif) {
/* Reset the process to make sure it's not stuck */
if (should_sleep_killable(filter, knotif))
@@ -1632,18 +1648,27 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
if (copy_from_user(&resp, buf, sizeof(resp)))
return -EFAULT;
- if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ if (resp.flags & ~(SECCOMP_USER_NOTIF_FLAG_CONTINUE |
+ SECCOMP_USER_NOTIF_FLAG_CONTINUE_PINNED))
return -EINVAL;
if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
(resp.error || resp.val))
return -EINVAL;
+ /*
+ * CONTINUE_PINNED is only valid alongside CONTINUE, and is a no-op
+ * until the consumption-side hooks land in subsequent patches.
+ */
+ if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE_PINNED) &&
+ !(resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE))
+ return -EINVAL;
+
ret = mutex_lock_interruptible(&filter->notify_lock);
if (ret < 0)
return ret;
- knotif = find_notification(filter, resp.id);
+ knotif = seccomp_find_notification(filter, resp.id);
if (!knotif) {
ret = -ENOENT;
goto out;
@@ -1660,6 +1685,37 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->error = resp.error;
knotif->val = resp.val;
knotif->flags = resp.flags;
+
+ /*
+ * If CONTINUE_PINNED was set, arm the snapshot so that the
+ * syscall-body fetch points consume from kernel buffers instead of
+ * re-reading user memory. If CONTINUE was set without PINNED, the
+ * supervisor explicitly opted out of the snapshot and we discard
+ * it (re-read from user memory as today).
+ */
+ if (resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) {
+ struct seccomp_pinned_args *kpa =
+ READ_ONCE(knotif->task->seccomp.pinned_args);
+
+ if (kpa && (resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE_PINNED)) {
+ WRITE_ONCE(kpa->live, true);
+ /*
+ * Schedule a one-shot clear that fires when the
+ * trapped task next returns to user mode (after the
+ * resumed syscall body completes). Failure here
+ * means the task is exiting; cleanup happens via
+ * seccomp_filter_release / do_exit instead.
+ */
+ seccomp_pin_queue_clear(knotif->task);
+ } else if (kpa) {
+ seccomp_clear_pinned_args(knotif->task);
+ }
+ } else if (resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE_PINNED) {
+ /* Already rejected at the top of this function, but be defensive. */
+ ret = -EINVAL;
+ goto out;
+ }
+
if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
complete_on_current_cpu(&knotif->ready);
else
@@ -1683,7 +1739,7 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
if (ret < 0)
return ret;
- knotif = find_notification(filter, id);
+ knotif = seccomp_find_notification(filter, id);
if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
ret = 0;
else
@@ -1751,7 +1807,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
if (ret < 0)
goto out;
- knotif = find_notification(filter, addfd.id);
+ knotif = seccomp_find_notification(filter, addfd.id);
if (!knotif) {
ret = -ENOENT;
goto out_unlock;
@@ -1823,6 +1879,125 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
return ret;
}
+/*
+ * SECCOMP_IOCTL_NOTIF_PIN_ARGS handler. Three phases:
+ * A) under notify_lock: validate the notif, snapshot register args,
+ * take a task ref, set pin_in_progress (concurrent PIN_ARGS for
+ * the same id get -EBUSY);
+ * B) lockless: walk the trapped task's mm and fill the supervisor's
+ * buffer (seccomp_pin_args_walk());
+ * C) under notify_lock: re-validate the notif is still SENT and
+ * attach the snapshot to the trapped task.
+ *
+ * Returns 0 on success or a negative errno; on any failure the
+ * snapshot is freed and pin_in_progress is cleared (best effort).
+ */
+static long seccomp_notif_pin_args(struct seccomp_filter *filter,
+ struct seccomp_notif_pin_args __user *uargs)
+{
+ struct seccomp_notif_pin_args kargs;
+ struct seccomp_pinned_args *kpa = NULL;
+ struct seccomp_knotif *knotif;
+ struct task_struct *task = NULL;
+ void __user *user_buf;
+ u64 args[6];
+ int syscall_nr = 0;
+ u8 seen_idx = 0;
+ int i;
+ long ret;
+
+ if (copy_from_user(&kargs, uargs, sizeof(kargs)))
+ return -EFAULT;
+ if (kargs.nr_args == 0 || kargs.nr_args > SECCOMP_PIN_MAX_ARGS)
+ return -EINVAL;
+ if (kargs.buf_size > SECCOMP_PIN_MAX_TOTAL_BYTES)
+ return -E2BIG;
+
+ /* Validate descriptor inputs before any allocation. */
+ for (i = 0; i < kargs.nr_args; i++) {
+ struct seccomp_pin_arg *d = &kargs.args[i];
+
+ if (d->arg_idx >= 6)
+ return -EINVAL;
+ /*
+ * Reject duplicate arg slots: two descriptors pinning the
+ * same register would make the consumption-side lookup by
+ * user_addr ambiguous.
+ */
+ if (seen_idx & (1U << d->arg_idx))
+ return -EINVAL;
+ seen_idx |= 1U << d->arg_idx;
+ /*
+ * Reserved input fields must be zero so they can be given
+ * meaning later without breaking old userspace (standard
+ * UAPI forward-compatibility rule).
+ */
+ if (d->_reserved || d->_reserved2)
+ return -EINVAL;
+ if (d->kind > SECCOMP_PIN_KIND_MAX)
+ return -EINVAL;
+ if (d->max_bytes == 0)
+ return -EINVAL;
+ if (d->max_bytes > SECCOMP_PIN_MAX_TOTAL_BYTES)
+ return -E2BIG;
+ }
+
+ user_buf = (void __user *)(uintptr_t)kargs.buf;
+ if (kargs.buf_size && !user_buf)
+ return -EINVAL;
+
+ /*
+ * Phase A: validate notif state, snapshot the args we need under
+ * the lock, take task ref, mark pin_in_progress so a concurrent
+ * PIN_ARGS for the same id bails with -EBUSY.
+ */
+ mutex_lock(&filter->notify_lock);
+ knotif = seccomp_find_notification(filter, kargs.id);
+ if (!knotif) {
+ ret = -ENOENT;
+ goto unlock_a;
+ }
+ if (knotif->state != SECCOMP_NOTIFY_SENT) {
+ ret = -EINPROGRESS;
+ goto unlock_a;
+ }
+ if (knotif->task->seccomp.pinned_args) {
+ ret = -EEXIST;
+ goto unlock_a;
+ }
+ if (knotif->pin_in_progress) {
+ ret = -EBUSY;
+ goto unlock_a;
+ }
+ knotif->pin_in_progress = true;
+ memcpy(args, knotif->data->args, sizeof(args));
+ syscall_nr = knotif->data->nr;
+ task = get_task_struct(knotif->task);
+ mutex_unlock(&filter->notify_lock);
+
+ /* Phase B: lockless mm walk + supervisor copy. */
+ ret = seccomp_pin_args_walk(task, &kargs, args, syscall_nr,
+ user_buf, kargs.buf_size, &kpa);
+ if (ret)
+ goto cleanup;
+
+ if (copy_to_user(uargs, &kargs, sizeof(kargs))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ /*
+ * Phase C: re-validate (the notif may have been replied to or the
+ * supervisor may have released the listener) and attach the
+ * snapshot.
+ */
+ mutex_lock(&filter->notify_lock);
+ knotif = seccomp_find_notification(filter, kargs.id);
+ if (!knotif || knotif->state != SECCOMP_NOTIFY_SENT) {
+ mutex_unlock(&filter->notify_lock);
+ ret = -ENOENT;
+ goto cleanup;
+ }
+ WRITE_ONCE(task->seccomp.pinned_args, kpa);
+ knotif->pin_in_progress = false;
+ kpa = NULL; /* ownership transferred to task */
+ mutex_unlock(&filter->notify_lock);
+ put_task_struct(task);
+ return 0;
+
+cleanup:
+ /*
+ * Best-effort: clear pin_in_progress so a subsequent PIN_ARGS can
+ * proceed. The notif may already be gone, in which case there is
+ * nothing to clear.
+ */
+ mutex_lock(&filter->notify_lock);
+ knotif = seccomp_find_notification(filter, kargs.id);
+ if (knotif)
+ knotif->pin_in_progress = false;
+ mutex_unlock(&filter->notify_lock);
+
+ seccomp_free_pinned_args(kpa);
+ if (task)
+ put_task_struct(task);
+ return ret;
+
+unlock_a:
+ mutex_unlock(&filter->notify_lock);
+ return ret;
+}
+
static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -1840,6 +2015,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
return seccomp_notify_id_valid(filter, buf);
case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
return seccomp_notify_set_flags(filter, arg);
+ case SECCOMP_IOCTL_NOTIF_PIN_ARGS:
+ return seccomp_notif_pin_args(filter, buf);
}
/* Extensible Argument ioctls */
diff --git a/kernel/seccomp_pin.c b/kernel/seccomp_pin.c
new file mode 100644
index 000000000000..a206fde3d806
--- /dev/null
+++ b/kernel/seccomp_pin.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Pin-args lifecycle and walker for SECCOMP_IOCTL_NOTIF_PIN_ARGS.
+ *
+ * The supervisor calls PIN_ARGS to atomically copy designated pointer-arg
+ * payloads of a trapped child into kernel-owned buffers, then sends
+ * NOTIF_SEND with CONTINUE | CONTINUE_PINNED. The kernel re-executes the
+ * syscall using the pinned bytes instead of re-reading user memory,
+ * closing the documented seccomp_unotify(2) TOCTOU race.
+ *
+ * The lock-and-validate dance lives in kernel/seccomp.c (where
+ * struct seccomp_knotif and filter->notify_lock are defined). This file
+ * owns the per-arg walker (Phase B) and the lifecycle primitives.
+ *
+ * Only SECCOMP_PIN_FIXED is implemented in v1's first cut; CSTRING and
+ * CSTRING_ARRAY arrive in subsequent patches.
+ */
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task_stack.h>
+#include <linux/seccomp.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/task_work.h>
+#include <linux/uaccess.h>
+
+#include <asm/syscall.h>
+
+#include "seccomp_pin.h"
+
+/*
+ * Allocate a zeroed pinned-args container for @nr_args entries,
+ * memcg-charged to the caller (GFP_KERNEL_ACCOUNT). Returns the
+ * container or an ERR_PTR; free with seccomp_free_pinned_args().
+ *
+ * NOTE(review): kzalloc_obj() is a very recent helper — confirm the
+ * target tree has it, else use kzalloc(sizeof(*kpa), ...).
+ */
+struct seccomp_pinned_args *seccomp_alloc_pinned_args(u8 nr_args)
+{
+ struct seccomp_pinned_args *kpa;
+
+ if (nr_args == 0 || nr_args > SECCOMP_PIN_MAX_ARGS)
+ return ERR_PTR(-EINVAL);
+
+ kpa = kzalloc_obj(*kpa, GFP_KERNEL_ACCOUNT);
+ if (!kpa)
+ return ERR_PTR(-ENOMEM);
+ kpa->nr_args = nr_args;
+ return kpa;
+}
+
+/*
+ * Free a pinned-args container and every per-arg data buffer it owns.
+ * NULL is a no-op. Buffers were kvmalloc'd, hence kvfree().
+ */
+void seccomp_free_pinned_args(struct seccomp_pinned_args *kpa)
+{
+ int i;
+
+ if (!kpa)
+ return;
+ for (i = 0; i < kpa->nr_args; i++)
+ kvfree(kpa->args[i].data);
+ kfree(kpa);
+}
+
+/*
+ * Detach and free @task's PIN_ARGS snapshot, if any. Callable from the
+ * task itself (task_work), the listener-release path, and do_exit().
+ */
+void seccomp_clear_pinned_args(struct task_struct *task)
+{
+ struct seccomp_pinned_args *kpa;
+
+ /*
+ * Atomically claim ownership of the kpa: this can be called
+ * concurrently from the task's own task_work callback (returning
+ * to userspace after a CONTINUE_PINNED'd syscall), from a
+ * listener-release path on the supervisor side, and from task
+ * exit. Only the xchg winner frees.
+ */
+ kpa = xchg(&task->seccomp.pinned_args, NULL);
+ if (!kpa)
+ return;
+ /*
+ * Cancel any queued post-syscall clear; its callback_head lives
+ * inside @kpa and would otherwise dangle. If task_work_cancel
+ * returns false the callback has already started running on @task,
+ * but it does its work via current->seccomp.pinned_args (already
+ * NULL) so the in-flight callback observes nothing-to-do.
+ *
+ * NOTE(review): clear_queued is written by seccomp_pin_queue_clear()
+ * and read here without READ_ONCE/WRITE_ONCE; if the exit path can
+ * race the queueing path (which holds notify_lock, this may not),
+ * a false-negative here would skip the cancel and leave the queued
+ * work pointing into freed memory — confirm the lock coverage.
+ */
+ if (kpa->clear_queued)
+ task_work_cancel(task, &kpa->clear_work);
+ seccomp_free_pinned_args(kpa);
+}
+
+/*
+ * task_work callback: runs on the trapped task when it returns to user
+ * mode after the resumed syscall body has completed. The pin is single-
+ * shot; subsequent traps must call PIN_ARGS again.
+ */
+static void seccomp_pin_clear_cb(struct callback_head *cb)
+{
+ /* @cb is embedded in the kpa being freed; never touch it after this. */
+ seccomp_clear_pinned_args(current);
+}
+
+/*
+ * Arm the one-shot clear: queue a task_work on @task that frees the
+ * pin when the task next returns to user mode (TWA_RESUME), i.e.
+ * after the resumed syscall body completes. Idempotent while a clear
+ * is already queued. Returns 0, or task_work_add()'s error if the
+ * task is exiting (cleanup then falls to the exit/release paths).
+ *
+ * Called from seccomp_notify_send() under filter->notify_lock.
+ *
+ * NOTE(review): clear_queued is set only after task_work_add()
+ * succeeds; a concurrent seccomp_clear_pinned_args() (e.g. do_exit)
+ * in that window would see clear_queued == false, skip the cancel,
+ * and free @kpa while the queued work still references it — confirm
+ * the clear paths cannot run concurrently with this function.
+ */
+int seccomp_pin_queue_clear(struct task_struct *task)
+{
+ struct seccomp_pinned_args *kpa = task->seccomp.pinned_args;
+ int ret;
+
+ if (!kpa || kpa->clear_queued)
+ return 0;
+ init_task_work(&kpa->clear_work, seccomp_pin_clear_cb);
+ ret = task_work_add(task, &kpa->clear_work, TWA_RESUME);
+ if (ret == 0)
+ kpa->clear_queued = true;
+ return ret;
+}
+
+/* Snapshot SECCOMP_PIN_FIXED: copy exactly @desc->max_bytes from @user_addr
+ * in the trapped child's mm into a freshly-allocated kernel buffer.
+ *
+ * On success, @out is populated and @desc->actual_size / .truncated are
+ * filled. The caller is responsible for chaining the bytes into the
+ * supervisor's bulk buffer.
+ */
+static long pin_one_fixed(struct task_struct *task, u64 user_addr,
+ struct seccomp_pin_arg *desc,
+ struct seccomp_pinned_arg *out)
+{
+ struct mm_struct *mm;
+ void *kbuf;
+ int read;
+
+ kbuf = kvmalloc(desc->max_bytes, GFP_KERNEL_ACCOUNT);
+ if (!kbuf)
+ return -ENOMEM;
+
+ /* get_task_mm() fails for kernel threads or an exiting task. */
+ mm = get_task_mm(task);
+ if (!mm) {
+ kvfree(kbuf);
+ return -ESRCH;
+ }
+
+ read = access_remote_vm(mm, user_addr, kbuf, desc->max_bytes, 0);
+ mmput(mm);
+
+ /* 0 bytes readable at user_addr is reported as -EFAULT. */
+ if (read <= 0) {
+ kvfree(kbuf);
+ return read ? read : -EFAULT;
+ }
+
+ out->user_addr = user_addr;
+ out->size = read;
+ out->arg_idx = desc->arg_idx;
+ out->kind = SECCOMP_PIN_FIXED;
+ out->data = kbuf;
+
+ /*
+ * NOTE(review): a short read (read < max_bytes) is reported as
+ * success with SECCOMP_PIN_TRUNCATED_BYTES, but the UAPI comment
+ * describes SECCOMP_PIN_FIXED as "exactly max_bytes from
+ * user_addr" — confirm partial-copy success is the intended
+ * contract (read > 0 here, so the signed/unsigned compare below
+ * is safe).
+ */
+ desc->actual_size = read;
+ desc->truncated = (read < desc->max_bytes) ?
+ SECCOMP_PIN_TRUNCATED_BYTES : 0;
+ return 0;
+}
+
+/* MAX_ARG_STRINGS is fs/exec.c-private; redefine our own ceiling. */
+#define SECCOMP_PIN_DEFAULT_MAX_ENTRIES 0x7FFFFFFF
+
+/*
+ * Packed CSTRING_ARRAY layout:
+ *
+ * [u32 count][u32 offsets[count]][u8 strings[]]
+ *
+ * Each offset is from the start of the buffer; each string at
+ * data + offsets[i] is NUL-terminated.
+ */
+
+/* Snapshot SECCOMP_PIN_CSTRING: NUL-bounded copy from the trapped child's
+ * mm via the existing copy_remote_vm_str() primitive. The result is
+ * always NUL-terminated; truncation is reported when the byte cap was
+ * hit before the source NUL.
+ */
+static long pin_one_cstring(struct task_struct *task, u64 user_addr,
+ struct seccomp_pin_arg *desc,
+ struct seccomp_pinned_arg *out)
+{
+ void *kbuf;
+ int copied;
+
+ kbuf = kvmalloc(desc->max_bytes, GFP_KERNEL_ACCOUNT);
+ if (!kbuf)
+ return -ENOMEM;
+
+ copied = copy_remote_vm_str(task, user_addr, kbuf, desc->max_bytes, 0);
+ if (copied < 0) {
+ kvfree(kbuf);
+ return copied;
+ }
+
+ /*
+ * copy_remote_vm_str() returns bytes not including the trailing NUL,
+ * which it always writes on success. If we filled the buffer all the
+ * way (copied == max_bytes - 1) the source NUL may not have been
+ * reached; flag that as truncation.
+ *
+ * NOTE(review): the return convention above (NUL excluded) must be
+ * verified against mm/memory.c — if copy_remote_vm_str() instead
+ * returns the count *including* the NUL, both @size and
+ * @actual_size below are off by one and the truncation test is
+ * wrong. TODO confirm before merging.
+ */
+ out->user_addr = user_addr;
+ out->size = copied + 1; /* include the trailing NUL */
+ out->arg_idx = desc->arg_idx;
+ out->kind = SECCOMP_PIN_CSTRING;
+ out->data = kbuf;
+
+ desc->actual_size = copied + 1;
+ desc->truncated = (copied == desc->max_bytes - 1) ?
+ SECCOMP_PIN_TRUNCATED_BYTES : 0;
+ return 0;
+}
+
+/*
+ * Snapshot SECCOMP_PIN_CSTRING_ARRAY: walk the NULL-terminated pointer
+ * table at @user_addr in the trapped child's mm; for each non-NULL ptr,
+ * copy its NUL-bounded string into a packed kernel buffer. Format:
+ *
+ *   [u32 count][u32 offsets[count]][u8 strings[]]
+ *
+ * Caps on both byte total (@desc->max_bytes) and entry count
+ * (@desc->max_entries; 0 means default cap). The pointer table is
+ * walked first to determine count, *before* any string copy, so a
+ * hostile child can't tie up the kernel walking a giant table.
+ *
+ * v1: native pointer width only. Compat (32-bit pointer table read by
+ * a native supervisor) is a TODO.
+ */
+static long pin_one_cstring_array(struct task_struct *task, u64 user_addr,
+				  struct seccomp_pin_arg *desc,
+				  struct seccomp_pinned_arg *out)
+{
+	struct mm_struct *mm;
+	void *kbuf = NULL;
+	u32 max_entries;
+	u32 *header;
+	u32 count = 0;
+	u32 byte_off;
+	u32 truncated = 0;
+	u32 i;
+	long ret;
+
+	/* Too small to hold even the count header. */
+	if (desc->max_bytes < sizeof(u32))
+		return -EINVAL;
+
+	max_entries = desc->max_entries ?: SECCOMP_PIN_DEFAULT_MAX_ENTRIES;
+	/* Cap entries by what fits in the supervisor's max_bytes assuming
+	 * the smallest per-entry footprint: 4 (offset slot) + 1 (NUL) = 5
+	 * bytes. The leading count u32 is ignored here, making the cap
+	 * slightly generous; the exact header-fit check below catches the
+	 * remainder.
+	 */
+	if (max_entries > (desc->max_bytes / 5))
+		max_entries = desc->max_bytes / 5;
+
+	/*
+	 * Zeroed allocation on purpose: if the phase-2 copy loop bails
+	 * out early (byte-cap truncation shrinks @count below the
+	 * phase-1 value), the offset slots reserved for the dropped
+	 * entries are never written, yet they lie below the final
+	 * @byte_off and are later copy_to_user()'d to the supervisor.
+	 * kvzalloc() keeps that from leaking uninitialized kernel memory.
+	 */
+	kbuf = kvzalloc(desc->max_bytes, GFP_KERNEL_ACCOUNT);
+	if (!kbuf)
+		return -ENOMEM;
+
+	mm = get_task_mm(task);
+	if (!mm) {
+		ret = -ESRCH;
+		goto err_free;
+	}
+
+	/* Phase 1: count entries by walking the pointer table. */
+	for (i = 0; i < max_entries; i++) {
+		unsigned long ptr;
+		int got;
+
+		got = access_remote_vm(mm, user_addr + i * sizeof(ptr),
+				       &ptr, sizeof(ptr), 0);
+		if (got != sizeof(ptr)) {
+			mmput(mm);
+			ret = -EFAULT;
+			goto err_free;
+		}
+		if (ptr == 0)
+			break;
+		count++;
+	}
+	if (i == max_entries) {
+		/* Hit the entry cap before the NULL terminator: still report
+		 * what we have, flag truncation.
+		 */
+		truncated |= SECCOMP_PIN_TRUNCATED_ENTRIES;
+	}
+
+	/* Header layout fits in max_bytes? */
+	if ((u64)sizeof(u32) + (u64)count * sizeof(u32) > desc->max_bytes) {
+		mmput(mm);
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	header = kbuf;
+	header[0] = count;
+	byte_off = sizeof(u32) + count * sizeof(u32);
+
+	/* Phase 2: copy each string into the packed area. The pointer
+	 * table is deliberately re-read here; a racing sibling may have
+	 * changed a slot since phase 1, in which case we snapshot (or
+	 * fault on) the new value — the resulting pin is still an
+	 * internally consistent snapshot.
+	 */
+	for (i = 0; i < count; i++) {
+		unsigned long ptr;
+		u32 remaining;
+		int got, copied;
+
+		if (access_remote_vm(mm, user_addr + i * sizeof(ptr),
+				     &ptr, sizeof(ptr), 0) != sizeof(ptr)) {
+			mmput(mm);
+			ret = -EFAULT;
+			goto err_free;
+		}
+		if (byte_off >= desc->max_bytes) {
+			truncated |= SECCOMP_PIN_TRUNCATED_BYTES;
+			count = i;
+			header[0] = count;
+			break;
+		}
+		remaining = desc->max_bytes - byte_off;
+		copied = copy_remote_vm_str(task, ptr,
+					    (char *)kbuf + byte_off,
+					    remaining, 0);
+		if (copied < 0) {
+			mmput(mm);
+			ret = copied;
+			goto err_free;
+		}
+		header[1 + i] = byte_off;
+		got = copied + 1; /* include the NUL written by helper */
+		if (got >= remaining)
+			truncated |= SECCOMP_PIN_TRUNCATED_BYTES;
+		byte_off += got;
+	}
+	mmput(mm);
+
+	out->user_addr = user_addr;
+	out->size = byte_off;
+	out->arg_idx = desc->arg_idx;
+	out->kind = SECCOMP_PIN_CSTRING_ARRAY;
+	out->data = kbuf;
+
+	desc->actual_size = byte_off;
+	desc->actual_entries = count;
+	desc->truncated = truncated;
+	return 0;
+
+err_free:
+	kvfree(kbuf);
+	return ret;
+}
+
+/* Map a pinned-arg slot back to its stable kvec in the same per-task
+ * pin record. Returns NULL if no pin record is attached to current or
+ * if @pin does not point into its args[] array.
+ */
+const struct kvec *seccomp_pin_kvec_for(const struct seccomp_pinned_arg *pin)
+{
+	struct seccomp_pinned_args *kpa = READ_ONCE(current->seccomp.pinned_args);
+	ptrdiff_t slot;
+
+	if (!kpa)
+		return NULL;
+
+	slot = pin - kpa->args;
+	if (slot < 0 || slot >= kpa->nr_args)
+		return NULL;
+
+	return &kpa->arg_kvecs[slot];
+}
+
+/* Find the live pinned-arg slot for @user_addr on the current task, or
+ * NULL when no live pin exists, the running syscall is not the one the
+ * pin was taken for, or no slot matches the address.
+ */
+const struct seccomp_pinned_arg *seccomp_pin_lookup_current(u64 user_addr)
+{
+	struct seccomp_pinned_args *kpa = READ_ONCE(current->seccomp.pinned_args);
+	const struct seccomp_pinned_arg *slot, *end;
+
+	if (!kpa || !kpa->live)
+		return NULL;
+
+	/*
+	 * If the current syscall doesn't match the one snapshotted at pin
+	 * time, return NULL so the caller reads user memory. This guards
+	 * against a signal handler issuing an unrelated syscall during
+	 * -ERESTART* resolution — that syscall has its own user pointers
+	 * and must not be served from the pin.
+	 */
+	if (syscall_get_nr(current, task_pt_regs(current)) != kpa->syscall_nr)
+		return NULL;
+
+	for (slot = kpa->args, end = slot + kpa->nr_args; slot < end; slot++) {
+		if (slot->user_addr == user_addr)
+			return slot;
+	}
+
+	return NULL;
+}
+
+/* Phase B of PIN_ARGS (see seccomp_pin.h for the full kernel-doc):
+ * snapshot each requested arg from @task's mm into a fresh pin record
+ * and mirror the bytes into the supervisor's @user_buf. Runs with no
+ * seccomp locks held. On success *@out owns the record; on failure it
+ * is freed here. NOTE(review): the cumulative SECCOMP_PIN_MAX_TOTAL_BYTES
+ * cap is presumably enforced by the Phase A validation in seccomp.c —
+ * confirm, it is not checked in this walker.
+ */
+long seccomp_pin_args_walk(struct task_struct *task,
+			   struct seccomp_notif_pin_args *kargs,
+			   const u64 *args, int syscall_nr,
+			   void __user *user_buf, u32 user_buf_size,
+			   struct seccomp_pinned_args **out)
+{
+	struct seccomp_pinned_args *kpa;
+	u32 buf_off = 0;
+	int i;
+	long ret;
+
+	kpa = seccomp_alloc_pinned_args(kargs->nr_args);
+	if (IS_ERR(kpa))
+		return PTR_ERR(kpa);
+	kpa->notif_id = kargs->id;
+	kpa->syscall_nr = syscall_nr;
+
+	for (i = 0; i < kargs->nr_args; i++) {
+		struct seccomp_pin_arg *d = &kargs->args[i];
+		u64 user_addr;
+
+		/*
+		 * @args is the 6-slot seccomp_data.args register snapshot
+		 * and @d->arg_idx is supervisor-controlled; bound it here
+		 * even if the ioctl entry path validates it too, so this
+		 * walker can never index out of the snapshot on its own.
+		 */
+		if (d->arg_idx >= 6) {
+			ret = -EINVAL;
+			goto err_free;
+		}
+		user_addr = args[d->arg_idx];
+
+		d->user_addr = user_addr;
+		d->actual_size = 0;
+		d->actual_entries = 0;
+		d->truncated = 0;
+		d->buf_offset = buf_off;
+
+		/* NULL pointers (e.g. execveat with AT_EMPTY_PATH): record
+		 * a zero-size pin and move on without faulting.
+		 */
+		if (user_addr == 0)
+			continue;
+
+		switch (d->kind) {
+		case SECCOMP_PIN_FIXED:
+			ret = pin_one_fixed(task, user_addr, d, &kpa->args[i]);
+			break;
+		case SECCOMP_PIN_CSTRING:
+			ret = pin_one_cstring(task, user_addr, d, &kpa->args[i]);
+			break;
+		case SECCOMP_PIN_CSTRING_ARRAY:
+			ret = pin_one_cstring_array(task, user_addr, d,
+						    &kpa->args[i]);
+			break;
+		default:
+			ret = -EOPNOTSUPP;
+			break;
+		}
+		if (ret < 0)
+			goto err_free;
+
+		/* Stable kvec for iov_iter_kvec consumers (import_ubuf). */
+		kpa->arg_kvecs[i].iov_base = kpa->args[i].data;
+		kpa->arg_kvecs[i].iov_len = kpa->args[i].size;
+
+		/* buf_off <= user_buf_size is invariant here, so the u32
+		 * subtraction cannot underflow.
+		 */
+		if (kpa->args[i].size > user_buf_size - buf_off) {
+			ret = -ENOSPC;
+			goto err_free;
+		}
+		if (copy_to_user(user_buf + buf_off,
+				 kpa->args[i].data, kpa->args[i].size)) {
+			ret = -EFAULT;
+			goto err_free;
+		}
+		/* d->buf_offset was already set above and buf_off has not
+		 * changed since, so no second assignment is needed.
+		 */
+		buf_off += kpa->args[i].size;
+	}
+
+	*out = kpa;
+	return 0;
+
+err_free:
+	seccomp_free_pinned_args(kpa);
+	return ret;
+}
diff --git a/kernel/seccomp_pin.h b/kernel/seccomp_pin.h
new file mode 100644
index 000000000000..ea699bc09645
--- /dev/null
+++ b/kernel/seccomp_pin.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Internal interfaces for SECCOMP_IOCTL_NOTIF_PIN_ARGS.
+ *
+ * The pin lifecycle and walker live in kernel/seccomp_pin.c to keep
+ * kernel/seccomp.c focused on the existing notify machinery.
+ */
+#ifndef _KERNEL_SECCOMP_PIN_H
+#define _KERNEL_SECCOMP_PIN_H
+
+#include <linux/types.h>
+#include <uapi/linux/seccomp.h>
+
+#include <linux/seccomp_types.h> /* struct seccomp_pinned_arg, SECCOMP_PIN_MAX_ARGS */
+#include <linux/uio.h> /* struct kvec */
+#include <linux/task_work.h> /* struct callback_head */
+
+struct task_struct;
+struct seccomp_filter;
+struct seccomp_knotif;
+struct seccomp_notif_pin_args;
+
+/*
+ * Maximum cumulative bytes a single PIN_ARGS request may snapshot on
+ * behalf of one notification. Defensive bound only — typical pins are
+ * a few KiB (one PATH_MAX path; argv up to MAX_ARG_STRLEN). Hardcoded
+ * rather than a sysctl: there is no legitimate use case for runtime
+ * tuning. Smaller is always reachable via desc->max_bytes; larger
+ * indicates a policy bug.
+ */
+#define SECCOMP_PIN_MAX_TOTAL_BYTES (1UL << 20) /* 1 MiB */
+
+/**
+ * struct seccomp_pinned_args - the per-task pin record.
+ * @notif_id: id of the outstanding notification this pin belongs to.
+ * @syscall_nr: syscall number captured at pin time; consumption checks this
+ * against current to skip pinned data on a mismatched syscall
+ * (e.g. one issued from a signal handler during restart).
+ * @nr_args: number of populated entries in @args.
+ * @live: false during the pin-decision window, set to true on
+ * CONTINUE_PINNED so consumption hooks know to use the snapshot.
+ * @clear_queued: true once @clear_work has been handed to task_work_add();
+ * guards against queueing the one-shot clear twice.
+ * @clear_work: one-shot task_work that clears this pin when the task
+ * next returns to userspace (see seccomp_pin_queue_clear()).
+ * @args: per-slot pinned data; only the first @nr_args entries are valid.
+ * @arg_kvecs: per-arg stable kvec storage mirroring @args; see the
+ * inline comment below for lifetime requirements.
+ */
+struct seccomp_pinned_args {
+	u64 notif_id;
+	int syscall_nr;
+	u8 nr_args;
+	bool live;
+	bool clear_queued; /* clear_work has been task_work_add()'d */
+	struct callback_head clear_work;
+	struct seccomp_pinned_arg args[SECCOMP_PIN_MAX_ARGS];
+	/*
+	 * Per-arg stable kvec storage. Populated by the walker for kinds
+	 * whose consumption hooks build an iov_iter (currently FIXED ->
+	 * import_ubuf). The kvec must outlive the iter; this struct lives
+	 * until syscall exit, which is after the iter is fully consumed.
+	 */
+	struct kvec arg_kvecs[SECCOMP_PIN_MAX_ARGS];
+};
+
+#ifdef CONFIG_SECCOMP_FILTER
+
+struct seccomp_pinned_args *seccomp_alloc_pinned_args(u8 nr_args);
+void seccomp_free_pinned_args(struct seccomp_pinned_args *kpa);
+void seccomp_clear_pinned_args(struct task_struct *task);
+
+/*
+ * Queue a one-shot task_work that will clear @task's pinned_args when
+ * @task next returns to userspace, i.e. after the trapped-and-resumed
+ * syscall body has completed. Called from NOTIF_SEND on CONTINUE_PINNED.
+ */
+int seccomp_pin_queue_clear(struct task_struct *task);
+
+/**
+ * seccomp_pin_args_walk - per-arg snapshot phase (no seccomp locks).
+ * @task: the trapped child whose mm we're reading; caller must hold a
+ * reference (via get_task_struct).
+ * @kargs: in/out ioctl payload; the walker reads .nr_args / .args[i] inputs
+ * and writes back .args[i] outputs (actual_size, truncated, etc.).
+ * @args: syscall register args (knotif->data->args).
+ * @syscall_nr: syscall number captured at notif time.
+ * @user_buf: the supervisor's bulk byte buffer (user pointer).
+ * @user_buf_size: capacity of @user_buf.
+ * @out: on success, *@out is a freshly-allocated kpa with the snapshot;
+ * caller takes ownership and must seccomp_free_pinned_args() if
+ * the attach step fails.
+ *
+ * Return: 0 on success, negative errno on failure.
+ *
+ * Phase B of PIN_ARGS: this runs without seccomp locks held. Phase A (notif
+ * validation) and Phase C (attach) live in kernel/seccomp.c.
+ */
+long seccomp_pin_args_walk(struct task_struct *task,
+ struct seccomp_notif_pin_args *kargs,
+ const u64 *args, int syscall_nr,
+ void __user *user_buf, u32 user_buf_size,
+ struct seccomp_pinned_args **out);
+
+/* seccomp_pin_lookup_current() lives in include/linux/seccomp.h; it is
+ * called from consumption sites outside kernel/seccomp/ (fs/, net/, lib/).
+ */
+
+#else
+
+static inline void seccomp_clear_pinned_args(struct task_struct *task) { }
+
+#endif /* CONFIG_SECCOMP_FILTER */
+
+#endif /* _KERNEL_SECCOMP_PIN_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 243662af1af7..e0b038b54ce9 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/seccomp.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>
@@ -1444,8 +1445,29 @@ EXPORT_SYMBOL(import_iovec);
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
+ const struct seccomp_pinned_arg *pin;
+ const struct kvec *kvec;
+
if (len > MAX_RW_COUNT)
len = MAX_RW_COUNT;
+
+ /*
+ * Pinned by a seccomp PIN_ARGS supervisor on this task? Build the
+ * iov_iter over the kernel snapshot rather than re-reading user
+ * memory. The kvec storage is owned by current->seccomp.pinned_args
+ * and lives until syscall exit, so it outlasts @i's consumption.
+ */
+ pin = seccomp_pin_lookup_current((u64)(uintptr_t)buf);
+ if (pin && pin->kind == SECCOMP_PIN_FIXED) {
+ kvec = seccomp_pin_kvec_for(pin);
+ if (kvec) {
+ size_t n = min_t(size_t, len, pin->size);
+
+ iov_iter_kvec(i, rw, kvec, 1, n);
+ return 0;
+ }
+ }
+
if (unlikely(!access_ok(buf, len)))
return -EFAULT;
diff --git a/mm/memory.c b/mm/memory.c
index ea6568571131..766ea403d983 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -7168,7 +7168,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
}
EXPORT_SYMBOL_GPL(access_process_vm);
-#ifdef CONFIG_BPF_SYSCALL
+#if defined(CONFIG_BPF_SYSCALL) || defined(CONFIG_SECCOMP_FILTER)
/*
* Copy a string from another process's address space as given in mm.
* If there is any error return -EFAULT.
@@ -7286,7 +7286,7 @@ int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
return ret;
}
EXPORT_SYMBOL_GPL(copy_remote_vm_str);
-#endif /* CONFIG_BPF_SYSCALL */
+#endif /* CONFIG_BPF_SYSCALL || CONFIG_SECCOMP_FILTER */
/*
* Print the name of a VMA.
diff --git a/mm/nommu.c b/mm/nommu.c
index ed3934bc2de4..4c14ed97d661 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1711,7 +1711,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
EXPORT_SYMBOL_GPL(access_process_vm);
-#ifdef CONFIG_BPF_SYSCALL
+#if defined(CONFIG_BPF_SYSCALL) || defined(CONFIG_SECCOMP_FILTER)
/*
* Copy a string from another process's address space as given in mm.
* If there is any error return -EFAULT.
@@ -1788,7 +1788,7 @@ int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
return ret;
}
EXPORT_SYMBOL_GPL(copy_remote_vm_str);
-#endif /* CONFIG_BPF_SYSCALL */
+#endif /* CONFIG_BPF_SYSCALL || CONFIG_SECCOMP_FILTER */
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
diff --git a/net/socket.c b/net/socket.c
index 22a412fdec07..6e3af6114a60 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -82,6 +82,7 @@
#include <linux/compat.h>
#include <linux/kmod.h>
#include <linux/audit.h>
+#include <linux/seccomp.h>
#include <linux/wireless.h>
#include <linux/nsproxy.h>
#include <linux/magic.h>
@@ -248,10 +249,25 @@ static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
+ const struct seccomp_pinned_arg *pin;
+
if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
return -EINVAL;
if (ulen == 0)
return 0;
+
+ /* If a seccomp supervisor pinned this sockaddr via PIN_ARGS and
+ * sent CONTINUE_PINNED, consume from the kernel snapshot instead
+ * of re-reading user memory. Closes the unotify TOCTOU.
+ */
+ pin = seccomp_pin_lookup_current((u64)(uintptr_t)uaddr);
+ if (pin) {
+ size_t n = min_t(size_t, (size_t)ulen, pin->size);
+
+ memcpy(kaddr, pin->data, n);
+ return audit_sockaddr(ulen, kaddr);
+ }
+
if (copy_from_user(kaddr, uaddr, ulen))
return -EFAULT;
return audit_sockaddr(ulen, kaddr);
--
2.43.0
next prev parent reply other threads:[~2026-05-04 1:12 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-04 1:12 [RFC PATCH 0/3] seccomp: SECCOMP_IOCTL_NOTIF_PIN_ARGS for race-free unotify Cong Wang
2026-05-04 1:12 ` Cong Wang [this message]
2026-05-04 12:51 ` [RFC PATCH 1/3] seccomp: add SECCOMP_IOCTL_NOTIF_PIN_ARGS to close the unotify TOCTOU race Christian Brauner
2026-05-06 5:00 ` Cong Wang
2026-05-04 1:12 ` [RFC PATCH 2/3] selftests/seccomp: add seccomp_pin_args end-to-end coverage Cong Wang
2026-05-04 1:12 ` [RFC PATCH 3/3] Documentation: seccomp: document SECCOMP_IOCTL_NOTIF_PIN_ARGS Cong Wang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260504011207.539408-2-xiyou.wangcong@gmail.com \
--to=xiyou.wangcong@gmail.com \
--cc=brauner@kernel.org \
--cc=cwang@multikernel.io \
--cc=kees@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=luto@amacapital.net \
--cc=wad@chromium.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox