linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
@ 2025-09-03 20:38 Tom Hromatka
  2025-09-03 20:45 ` Alexei Starovoitov
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Tom Hromatka @ 2025-09-03 20:38 UTC (permalink / raw)
  To: kees, luto, wad, sargun, corbet, shuah, brauner, tom.hromatka
  Cc: linux-doc, linux-kernel, linux-kselftest, bpf

Add an operation, SECCOMP_CLONE_FILTER, that can copy the seccomp filters
from another process to the current process.

I roughly reproduced the Docker seccomp filter [1] and timed how long it
takes to build it (via libseccomp) and attach it to a process.  After
1000 runs, on average it took 3,740,000 TSC ticks (or ~1440 microseconds)
on an AMD EPYC 9J14 running at 2596 MHz.  The median build/load time was
3,715,000 TSC ticks.

On the same system, I preloaded the above Docker seccomp filter onto a
process.  (Note that I opened a pidfd to the reference process and left
the pidfd open for the entire run.)  I then cloned the filter using the
feature in this patch to 1000 new processes.  On average, it took 9,300
TSC ticks (or ~3.6 microseconds) to copy the filter to the new processes.
The median clone time was 9,048 TSC ticks.

This is approximately a 400x performance improvement for those container
managers that are using the exact same seccomp filter across all of their
containers.

[1] https://raw.githubusercontent.com/moby/moby/refs/heads/master/profiles/seccomp/default.json

Signed-off-by: Tom Hromatka <tom.hromatka@oracle.com>
---
 .../userspace-api/seccomp_filter.rst          |  10 ++
 include/uapi/linux/seccomp.h                  |   1 +
 kernel/seccomp.c                              |  48 ++++++
 samples/seccomp/Makefile                      |   2 +-
 samples/seccomp/clone-filter.c                | 143 ++++++++++++++++++
 tools/include/uapi/linux/seccomp.h            |   1 +
 tools/testing/selftests/seccomp/seccomp_bpf.c |  71 +++++++++
 7 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 samples/seccomp/clone-filter.c

diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index cff0fa7f3175..ef1797d093f6 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -289,6 +289,16 @@ above in this document: all arguments being read from the tracee's memory
 should be read into the tracer's memory before any policy decisions are made.
 This allows for an atomic decision on syscall arguments.
 
+Cloning an Existing Seccomp Filter
+==================================
+
+Constructing and loading a complex seccomp filter can often take a non-trivial
+amount of time. If a user wants to use the same seccomp filter across more
+than one process, it can be cloned to new processes via the
+``SECCOMP_CLONE_FILTER`` operation. Note that the clone will only succeed if
+the destination process does not have any seccomp filters already applied to
+it. See ``samples/seccomp/clone-filter.c`` for an example.
+
 Sysctls
 =======
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index dbfc9b37fcae..b0917e333b4b 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
 #define SECCOMP_SET_MODE_FILTER		1
 #define SECCOMP_GET_ACTION_AVAIL	2
 #define SECCOMP_GET_NOTIF_SIZES		3
+#define SECCOMP_CLONE_FILTER		4
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 41aa761c7738..b726e0d6715d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2081,6 +2081,49 @@ static long seccomp_get_notif_sizes(void __user *usizes)
 	return 0;
 }
 
+static long seccomp_clone_filter(void __user *upidfd)
+{
+	struct task_struct *task;
+	unsigned int flags;
+	pid_t pidfd;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (atomic_read(&current->seccomp.filter_count) > 0)
+		return -EINVAL;
+
+	if (copy_from_user(&pidfd, upidfd, sizeof(pid_t)))
+		return -EFAULT;
+
+	task = pidfd_get_task(pidfd, &flags);
+	if (IS_ERR(task))
+		return -ESRCH;
+
+	spin_lock_irq(&current->sighand->siglock);
+	spin_lock_irq(&task->sighand->siglock);
+
+	if (atomic_read(&task->seccomp.filter_count) == 0) {
+		spin_unlock_irq(&task->sighand->siglock);
+		spin_unlock_irq(&current->sighand->siglock);
+		put_task_struct(task);
+		return -EINVAL;
+	}
+
+	get_seccomp_filter(task);
+	current->seccomp = task->seccomp;
+
+	spin_unlock_irq(&task->sighand->siglock);
+
+	set_task_syscall_work(current, SECCOMP);
+
+	spin_unlock_irq(&current->sighand->siglock);
+
+	put_task_struct(task);
+
+	return 0;
+}
+
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
 		       void __user *uargs)
@@ -2102,6 +2145,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
 			return -EINVAL;
 
 		return seccomp_get_notif_sizes(uargs);
+	case SECCOMP_CLONE_FILTER:
+		if (flags != 0)
+			return -EINVAL;
+
+		return seccomp_clone_filter(uargs);
 	default:
 		return -EINVAL;
 	}
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
index c85ae0ed8342..d38977f41b86 100644
--- a/samples/seccomp/Makefile
+++ b/samples/seccomp/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-userprogs-always-y += bpf-fancy dropper bpf-direct user-trap
+userprogs-always-y += bpf-fancy dropper bpf-direct user-trap clone-filter
 
 bpf-fancy-objs := bpf-fancy.o bpf-helper.o
 
diff --git a/samples/seccomp/clone-filter.c b/samples/seccomp/clone-filter.c
new file mode 100644
index 000000000000..d26e1375b9dc
--- /dev/null
+++ b/samples/seccomp/clone-filter.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp filter example for cloning a filter
+ *
+ * Copyright (c) 2025 Oracle and/or its affiliates.
+ * Author: Tom Hromatka <tom.hromatka@oracle.com>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications that reuse the same seccomp filter
+ * across many processes.
+ */
+#include <linux/seccomp.h>
+#include <linux/filter.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	errno = 0;
+	return syscall(__NR_seccomp, op, flags, args);
+}
+
+static int install_filter(void)
+{
+	struct sock_filter deny_filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog deny_prog = {
+		.len = (unsigned short)ARRAY_SIZE(deny_filter),
+		.filter = deny_filter,
+	};
+
+	return seccomp(SECCOMP_SET_MODE_FILTER, 0, &deny_prog);
+}
+
+static int clone_filter(pid_t ref_pid)
+{
+	int ref_pidfd, ret;
+
+	ref_pidfd = syscall(SYS_pidfd_open, ref_pid, 0);
+	if (ref_pidfd < 0)
+		return -errno;
+
+	ret = seccomp(SECCOMP_CLONE_FILTER, 0, &ref_pidfd);
+
+	close(ref_pidfd);
+
+	return ret;
+}
+
+static void do_ref_filter(void)
+{
+	int ret;
+
+	ret = install_filter();
+	if (ret) {
+		perror("Failed to install ref filter\n");
+		exit(1);
+	}
+
+	while (true)
+		sleep(1);
+}
+
+static void do_child_process(pid_t ref_pid)
+{
+	pid_t res;
+	int ret;
+
+	ret = clone_filter(ref_pid);
+	if (ret != 0) {
+		perror("Failed to clone filter. Installing filter from scratch\n");
+
+		ret = install_filter();
+		if (ret != 0) {
+			perror("Filter install failed\n");
+			exit(ret);
+		}
+	}
+
+	res = syscall(__NR_getpid);
+	if (res < 0) {
+		perror("getpid() unexpectedly failed\n");
+		exit(errno);
+	}
+
+	res = syscall(__NR_getppid);
+	if (res > 0) {
+		perror("getppid() unexpectedly succeeded\n");
+		exit(1);
+	}
+
+	exit(0);
+}
+
+int main(void)
+{
+	pid_t ref_pid = -1, child_pid = -1;
+	int ret, status;
+
+	ref_pid = fork();
+	if (ref_pid < 0)
+		exit(errno);
+	else if (ref_pid == 0)
+		do_ref_filter();
+
+	child_pid = fork();
+	if (child_pid < 0)
+		goto out;
+	else if (child_pid == 0)
+		do_child_process(ref_pid);
+
+	waitpid(child_pid, &status, 0);
+	if (WEXITSTATUS(status) != 0) {
+		perror("child process failed");
+		ret = WEXITSTATUS(status);
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	if (ref_pid != -1)
+		kill(ref_pid, SIGKILL);
+	if (child_pid != -1)
+		kill(child_pid, SIGKILL);
+
+	exit(ret);
+}
diff --git a/tools/include/uapi/linux/seccomp.h b/tools/include/uapi/linux/seccomp.h
index dbfc9b37fcae..b0917e333b4b 100644
--- a/tools/include/uapi/linux/seccomp.h
+++ b/tools/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
 #define SECCOMP_SET_MODE_FILTER		1
 #define SECCOMP_GET_ACTION_AVAIL	2
 #define SECCOMP_GET_NOTIF_SIZES		3
+#define SECCOMP_CLONE_FILTER		4
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 61acbd45ffaa..df5e0f615da0 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -177,6 +177,10 @@ struct seccomp_data {
 #define SECCOMP_GET_NOTIF_SIZES 3
 #endif
 
+#ifndef SECCOMP_CLONE_FILTER
+#define SECCOMP_CLONE_FILTER 4
+#endif
+
 #ifndef SECCOMP_FILTER_FLAG_TSYNC
 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
 #endif
@@ -5090,6 +5094,73 @@ TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
 	ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
+TEST(clone_filter)
+{
+	struct sock_filter deny_filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog deny_prog = {
+		.len = (unsigned short)ARRAY_SIZE(deny_filter),
+		.filter = deny_filter,
+	};
+	struct timespec ts = {
+		.tv_sec = 0,
+		.tv_nsec = 100000000,
+	};
+
+	pid_t child_pid, self_pid, res;
+	int child_pidfd, ret;
+
+	/* Only real root can copy a filter. */
+	if (geteuid()) {
+		SKIP(return, "clone_filter requires real root");
+		return;
+	}
+
+	self_pid = getpid();
+
+	child_pid = fork();
+	ASSERT_LE(0, child_pid);
+
+	if (child_pid == 0) {
+		ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+		ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &deny_prog));
+
+		while (true)
+			EXPECT_EQ(0, syscall(__NR_nanosleep, &ts, NULL));
+	}
+
+	/* wait for the child pid to create its seccomp filter */
+	ASSERT_EQ(0, syscall(__NR_nanosleep, &ts, NULL));
+
+	child_pidfd = syscall(SYS_pidfd_open, child_pid, 0);
+	EXPECT_LE(0, child_pidfd);
+
+	/* Invalid flag provided */
+	ret = seccomp(SECCOMP_CLONE_FILTER, 1, &child_pidfd);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(errno, EINVAL);
+
+	errno = 0;
+	ret = seccomp(SECCOMP_CLONE_FILTER, 0, &child_pidfd);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(errno, 0);
+
+	res = syscall(__NR_getppid);
+	EXPECT_EQ(res, -1);
+	EXPECT_EQ(errno, ESRCH);
+
+	res = syscall(__NR_getpid);
+	EXPECT_EQ(res, self_pid);
+
+	close(child_pidfd);
+	kill(child_pid, SIGKILL);
+}
+
 /*
  * TODO:
  * - expand NNP testing
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 20:38 [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation Tom Hromatka
@ 2025-09-03 20:45 ` Alexei Starovoitov
  2025-09-03 20:51   ` Tom Hromatka
  2025-09-04 11:53 ` kernel test robot
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Alexei Starovoitov @ 2025-09-03 20:45 UTC (permalink / raw)
  To: Tom Hromatka
  Cc: Kees Cook, Andy Lutomirski, Will Drewry, Sargun Dhillon,
	Jonathan Corbet, Shuah Khan, Christian Brauner,
	open list:DOCUMENTATION, LKML,
	open list:KERNEL SELFTEST FRAMEWORK, bpf

On Wed, Sep 3, 2025 at 1:38 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
>
> +
> +       spin_lock_irq(&current->sighand->siglock);
> +       spin_lock_irq(&task->sighand->siglock);
> +
> +       if (atomic_read(&task->seccomp.filter_count) == 0) {
> +               spin_unlock_irq(&task->sighand->siglock);
> +               spin_unlock_irq(&current->sighand->siglock);

did you copy this pattern from somewhere ?
It's obviously buggy.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 20:45 ` Alexei Starovoitov
@ 2025-09-03 20:51   ` Tom Hromatka
  2025-09-03 22:44     ` Alexei Starovoitov
  0 siblings, 1 reply; 13+ messages in thread
From: Tom Hromatka @ 2025-09-03 20:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Kees Cook, Andy Lutomirski, Will Drewry, Sargun Dhillon,
	Jonathan Corbet, Shuah Khan, Christian Brauner,
	open list:DOCUMENTATION, LKML,
	open list:KERNEL SELFTEST FRAMEWORK, bpf

On 9/3/25 2:45 PM, Alexei Starovoitov wrote:
> On Wed, Sep 3, 2025 at 1:38 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
>>
>> +
>> +       spin_lock_irq(&current->sighand->siglock);
>> +       spin_lock_irq(&task->sighand->siglock);
>> +
>> +       if (atomic_read(&task->seccomp.filter_count) == 0) {
>> +               spin_unlock_irq(&task->sighand->siglock);
>> +               spin_unlock_irq(&current->sighand->siglock);
> 
> did you copy this pattern from somewhere ?
> It's obviously buggy.

I tried to mimic the logic in copy_seccomp() in kernel/fork.c,
but as you point out, I probably messed it up :).

Do you have recommendations for a better design pattern?

Thanks!

Tom

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 20:51   ` Tom Hromatka
@ 2025-09-03 22:44     ` Alexei Starovoitov
  2025-09-04 12:08       ` Tom Hromatka
  2025-09-04 14:26       ` Tom Hromatka
  0 siblings, 2 replies; 13+ messages in thread
From: Alexei Starovoitov @ 2025-09-03 22:44 UTC (permalink / raw)
  To: Tom Hromatka
  Cc: Kees Cook, Andy Lutomirski, Will Drewry, Sargun Dhillon,
	Jonathan Corbet, Shuah Khan, Christian Brauner,
	open list:DOCUMENTATION, LKML,
	open list:KERNEL SELFTEST FRAMEWORK, bpf

On Wed, Sep 3, 2025 at 1:52 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
>
> On 9/3/25 2:45 PM, Alexei Starovoitov wrote:
> > On Wed, Sep 3, 2025 at 1:38 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
> >>
> >> +
> >> +       spin_lock_irq(&current->sighand->siglock);
> >> +       spin_lock_irq(&task->sighand->siglock);
> >> +
> >> +       if (atomic_read(&task->seccomp.filter_count) == 0) {
> >> +               spin_unlock_irq(&task->sighand->siglock);
> >> +               spin_unlock_irq(&current->sighand->siglock);
> >
> > did you copy this pattern from somewhere ?
> > It's obviously buggy.
>
> I tried to mimic the logic in copy_seccomp() in kernel/fork.c,
> but as you point out, I probably messed it up :).
>
> Do you have recommendations for a better design pattern?

Several things look wrong here.
Double _irq() is one obvious bug.
Grabbing spin_lock to do atomic_read() is another oddity.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 20:38 [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation Tom Hromatka
  2025-09-03 20:45 ` Alexei Starovoitov
@ 2025-09-04 11:53 ` kernel test robot
  2025-09-04 16:26 ` Kees Cook
  2025-09-04 17:51 ` Al Viro
  3 siblings, 0 replies; 13+ messages in thread
From: kernel test robot @ 2025-09-04 11:53 UTC (permalink / raw)
  To: Tom Hromatka, kees, luto, wad, sargun, corbet, shuah, brauner
  Cc: oe-kbuild-all, linux-doc, linux-kernel, linux-kselftest, bpf

Hi Tom,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kees/for-next/seccomp]
[also build test WARNING on linus/master v6.17-rc4 next-20250904]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Tom-Hromatka/seccomp-Add-SECCOMP_CLONE_FILTER-operation/20250904-043943
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/seccomp
patch link:    https://lore.kernel.org/r/20250903203805.1335307-1-tom.hromatka%40oracle.com
patch subject: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
config: x86_64-randconfig-074-20250904 (https://download.01.org/0day-ci/archive/20250904/202509041931.XtQcy27H-lkp@intel.com/config)
compiler: gcc-13 (Debian 13.3.0-16) 13.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250904/202509041931.XtQcy27H-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509041931.XtQcy27H-lkp@intel.com/

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   samples/seccomp/clone-filter.c: In function 'main':
>> samples/seccomp/clone-filter.c:142:9: warning: 'ret' may be used uninitialized [-Wmaybe-uninitialized]
     142 |         exit(ret);
         |         ^~~~~~~~~
   samples/seccomp/clone-filter.c:113:13: note: 'ret' was declared here
     113 |         int ret, status;
         |             ^~~


vim +/ret +142 samples/seccomp/clone-filter.c

   109	
   110	int main(void)
   111	{
   112		pid_t ref_pid = -1, child_pid = -1;
   113		int ret, status;
   114	
   115		ref_pid = fork();
   116		if (ref_pid < 0)
   117			exit(errno);
   118		else if (ref_pid == 0)
   119			do_ref_filter();
   120	
   121		child_pid = fork();
   122		if (child_pid < 0)
   123			goto out;
   124		else if (child_pid == 0)
   125			do_child_process(ref_pid);
   126	
   127		waitpid(child_pid, &status, 0);
   128		if (WEXITSTATUS(status) != 0) {
   129			perror("child process failed");
   130			ret = WEXITSTATUS(status);
   131			goto out;
   132		}
   133	
   134		ret = 0;
   135	
   136	out:
   137		if (ref_pid != -1)
   138			kill(ref_pid, SIGKILL);
   139		if (child_pid != -1)
   140			kill(child_pid, SIGKILL);
   141	
 > 142		exit(ret);

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 22:44     ` Alexei Starovoitov
@ 2025-09-04 12:08       ` Tom Hromatka
  2025-09-04 14:26       ` Tom Hromatka
  1 sibling, 0 replies; 13+ messages in thread
From: Tom Hromatka @ 2025-09-04 12:08 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Kees Cook, Andy Lutomirski, Will Drewry, Sargun Dhillon,
	Jonathan Corbet, Shuah Khan, Christian Brauner,
	open list:DOCUMENTATION, LKML,
	open list:KERNEL SELFTEST FRAMEWORK, bpf

On 9/3/25 4:44 PM, Alexei Starovoitov wrote:
> On Wed, Sep 3, 2025 at 1:52 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
>>
>> On 9/3/25 2:45 PM, Alexei Starovoitov wrote:
>>> On Wed, Sep 3, 2025 at 1:38 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
>>>>
>>>> +
>>>> +       spin_lock_irq(&current->sighand->siglock);
>>>> +       spin_lock_irq(&task->sighand->siglock);
>>>> +
>>>> +       if (atomic_read(&task->seccomp.filter_count) == 0) {
>>>> +               spin_unlock_irq(&task->sighand->siglock);
>>>> +               spin_unlock_irq(&current->sighand->siglock);
>>>
>>> did you copy this pattern from somewhere ?
>>> It's obviously buggy.
>>
>> I tried to mimic the logic in copy_seccomp() in kernel/fork.c,
>> but as you point out, I probably messed it up :).
>>
>> Do you have recommendations for a better design pattern?
> 
> Several things look wrong here.

Thanks so much for weighing in.

> Double _irq() is one obvious bug.

Makes sense.  I'll look through the kernel code to see if I can
find another place where two task structs are being locked at
the same time.  I've never had to do that before.

> Grabbing spin_lock to do atomic_read() is another oddity.

That would indeed be strange.

The spin_lock is needed to ensure that the source and target's
seccomp filters don't change out from underneath me.  Once I
read the target's seccomp filter count, I don't want another
thread to make any changes before I've updated the target's
filters.

Thanks!

Tom

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 22:44     ` Alexei Starovoitov
  2025-09-04 12:08       ` Tom Hromatka
@ 2025-09-04 14:26       ` Tom Hromatka
  2025-09-04 14:54         ` Al Viro
  1 sibling, 1 reply; 13+ messages in thread
From: Tom Hromatka @ 2025-09-04 14:26 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Kees Cook, Andy Lutomirski, Will Drewry, Sargun Dhillon,
	Jonathan Corbet, Shuah Khan, Christian Brauner,
	open list:DOCUMENTATION, LKML,
	open list:KERNEL SELFTEST FRAMEWORK, bpf

On 9/3/25 4:44 PM, Alexei Starovoitov wrote:
> On Wed, Sep 3, 2025 at 1:52 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
>>
>> On 9/3/25 2:45 PM, Alexei Starovoitov wrote:
>>> On Wed, Sep 3, 2025 at 1:38 PM Tom Hromatka <tom.hromatka@oracle.com> wrote:
>>>>
>>>> +
>>>> +       spin_lock_irq(&current->sighand->siglock);
>>>> +       spin_lock_irq(&task->sighand->siglock);
>>>> +
>>>> +       if (atomic_read(&task->seccomp.filter_count) == 0) {
>>>> +               spin_unlock_irq(&task->sighand->siglock);
>>>> +               spin_unlock_irq(&current->sighand->siglock);
>>>
>>> did you copy this pattern from somewhere ?
>>> It's obviously buggy.
>>
>> I tried to mimic the logic in copy_seccomp() in kernel/fork.c,
>> but as you point out, I probably messed it up :).
>>
>> Do you have recommendations for a better design pattern?
> 
> Several things look wrong here.
> Double _irq() is one obvious bug.

This snippet addresses the double irq issue.  I also added a
check to make sure that task != current.  (A user shouldn't
do that but who knows what they'll actually do.)

         if (task == current) {
                 put_task_struct(task);
                 return -EINVAL;
         }

         spin_lock_irq(&current->sighand->siglock);
         spin_lock(&task->sighand->siglock);

         if (atomic_read(&task->seccomp.filter_count) == 0) {
                 spin_unlock(&task->sighand->siglock);
                 spin_unlock_irq(&current->sighand->siglock);
                 put_task_struct(task);
                 return -EINVAL;
         }

         get_seccomp_filter(task);
         current->seccomp = task->seccomp;

         spin_unlock(&task->sighand->siglock);

         set_task_syscall_work(current, SECCOMP);

         spin_unlock_irq(&current->sighand->siglock);

Let me know if there are other fixes I need to add.

Thanks so much!

Tom

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-04 14:26       ` Tom Hromatka
@ 2025-09-04 14:54         ` Al Viro
  2025-09-04 18:10           ` Tom Hromatka
  0 siblings, 1 reply; 13+ messages in thread
From: Al Viro @ 2025-09-04 14:54 UTC (permalink / raw)
  To: Tom Hromatka
  Cc: Alexei Starovoitov, Kees Cook, Andy Lutomirski, Will Drewry,
	Sargun Dhillon, Jonathan Corbet, Shuah Khan, Christian Brauner,
	open list:DOCUMENTATION, LKML,
	open list:KERNEL SELFTEST FRAMEWORK, bpf

On Thu, Sep 04, 2025 at 08:26:30AM -0600, Tom Hromatka wrote:

> This snippet addresses the double irq issue.  I also added a
> check to make sure that task != current.  (A user shouldn't
> do that but who knows what they'll actually do.)
> 
>         if (task == current) {
>                 put_task_struct(task);
>                 return -EINVAL;
>         }
> 
>         spin_lock_irq(&current->sighand->siglock);
>         spin_lock(&task->sighand->siglock);

What do you expect to happen if two tasks do that to each other
at the same time?  Or, for that matter, if task has been spawned
by current with CLONE_VM | CLONE_SIGHAND?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 20:38 [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation Tom Hromatka
  2025-09-03 20:45 ` Alexei Starovoitov
  2025-09-04 11:53 ` kernel test robot
@ 2025-09-04 16:26 ` Kees Cook
  2025-09-04 18:19   ` Tom Hromatka
  2025-09-04 17:51 ` Al Viro
  3 siblings, 1 reply; 13+ messages in thread
From: Kees Cook @ 2025-09-04 16:26 UTC (permalink / raw)
  To: Tom Hromatka
  Cc: luto, wad, sargun, corbet, shuah, brauner, linux-doc,
	linux-kernel, linux-kselftest, bpf

On Wed, Sep 03, 2025 at 08:38:03PM +0000, Tom Hromatka wrote:
> Add an operation, SECCOMP_CLONE_FILTER, that can copy the seccomp filters
> from another process to the current process.
> 
> I roughly reproduced the Docker seccomp filter [1] and timed how long it
> takes to build it (via libseccomp) and attach it to a process.  After
> 1000 runs, on average it took 3,740,000 TSC ticks (or ~1440 microseconds)
> on an AMD EPYC 9J14 running at 2596 MHz.  The median build/load time was
> 3,715,000 TSC ticks.
> 
> On the same system, I preloaded the above Docker seccomp filter onto a
> process.  (Note that I opened a pidfd to the reference process and left
> the pidfd open for the entire run.)  I then cloned the filter using the
> feature in this patch to 1000 new processes.  On average, it took 9,300
> TSC ticks (or ~3.6 microseconds) to copy the filter to the new processes.
> The median clone time was 9,048 TSC ticks.
> 
> This is approximately a 400x performance improvement for those container
> managers that are using the exact same seccomp filter across all of their
> containers.

This is a nice speedup, but with devil's advocate hat on, are launchers
spawning at rates high enough that this makes a difference?

> 
> [1] https://raw.githubusercontent.com/moby/moby/refs/heads/master/profiles/seccomp/default.json
> 
> Signed-off-by: Tom Hromatka <tom.hromatka@oracle.com>
> ---
>  .../userspace-api/seccomp_filter.rst          |  10 ++
>  include/uapi/linux/seccomp.h                  |   1 +
>  kernel/seccomp.c                              |  48 ++++++
>  samples/seccomp/Makefile                      |   2 +-
>  samples/seccomp/clone-filter.c                | 143 ++++++++++++++++++
>  tools/include/uapi/linux/seccomp.h            |   1 +
>  tools/testing/selftests/seccomp/seccomp_bpf.c |  71 +++++++++
>  7 files changed, 275 insertions(+), 1 deletion(-)
>  create mode 100644 samples/seccomp/clone-filter.c
> 
> diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
> index cff0fa7f3175..ef1797d093f6 100644
> --- a/Documentation/userspace-api/seccomp_filter.rst
> +++ b/Documentation/userspace-api/seccomp_filter.rst
> @@ -289,6 +289,16 @@ above in this document: all arguments being read from the tracee's memory
>  should be read into the tracer's memory before any policy decisions are made.
>  This allows for an atomic decision on syscall arguments.
>  
> +Cloning an Existing Seccomp Filter
> +==================================
> +
> +Constructing and loading a complex seccomp filter can often take a non-trivial
> +amount of time. If a user wants to use the same seccomp filter across more
> +than one process, it can be cloned to new processes via the
> +``SECCOMP_CLONE_FILTER`` operation. Note that the clone will only succeed if
> +the destination process does not have any seccomp filters already applied to
> +it. See ``samples/seccomp/clone-filter.c`` for an example.

Is "does not have any seccomp filters" a reasonable expectation? I mean,
I'm fine with it, but will this feature be generally useful with that
restriction? (i.e. becomes unusable under Docker, in several systemd
contexts, etc)

> +static long seccomp_clone_filter(void __user *upidfd)
> +{
> +	struct task_struct *task;
> +	unsigned int flags;
> +	pid_t pidfd;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;

Again, I'm fine with this being CAP_SYS_ADMIN, but the normal seccomp
filter restriction is NNP || CAP_SYS_ADMIN. Would it be safe to do
clones with non-ADMIN? Hmmm.

> +	if (atomic_read(&current->seccomp.filter_count) > 0)
> +		return -EINVAL;

I'd wait to test this until you're under &current->sighand->siglock,
and just test current->seccomp.filter.

> +
> +	if (copy_from_user(&pidfd, upidfd, sizeof(pid_t)))
> +		return -EFAULT;
> +
> +	task = pidfd_get_task(pidfd, &flags);
> +	if (IS_ERR(task))
> +		return -ESRCH;

Is this the right way to pass in a pidfd? Shouldn't this be an int, not
a pointer? What is idiomatic for pidfd interfaces?

> +
> +	spin_lock_irq(&current->sighand->siglock);
> +	spin_lock_irq(&task->sighand->siglock);

As others mentioned, you can't do this. ;) I'm pretty sure you can just
take references progressively:

- task = pidfd_get_task
- mutex_lock_killable(&task->signal->cred_guard_mutex)
- spin_lock_irq(&task->sighand->siglock);
- check seccomp mode for filter mode, e.g. see seccomp_may_assign_mode()
- get_seccomp_filter(task)
- struct seccomp copy = task->seccomp (copies filter pointer, count,
  and mode)
- release siglock
- release cred_guard_mutex
- put task

Now you have the seccomp copy! :) Any errors from here mean you need
to use __seccomp_filter_release to "put" the filter, if I'm reading
things correctly. (We might have issues with USER_NOTIF, but I haven't
looked closely yet.)

And applying it should be:

- take current->signal->cred_guard_mutex
- take current->sighand->siglock
- make sure task->seccomp.filter == NULL
- current->seccomp = copy
- release siglock
- release cred_guard_mutex

I *think* the above is correct. I may have forgotten some details, but I
was mostly trying to combine TSYNC and regular application of a filter
to current.

> +
> +	if (atomic_read(&task->seccomp.filter_count) == 0) {
> +		spin_unlock_irq(&task->sighand->siglock);
> +		spin_unlock_irq(&current->sighand->siglock);
> +		put_task_struct(task);
> +		return -EINVAL;
> +	}
> +
> +	get_seccomp_filter(task);
> +	current->seccomp = task->seccomp;
> +
> +	spin_unlock_irq(&task->sighand->siglock);
> +
> +	set_task_syscall_work(current, SECCOMP);
> +
> +	spin_unlock_irq(&current->sighand->siglock);
> +
> +	put_task_struct(task);
> +
> +	return 0;
> +}
> +
>  /* Common entry point for both prctl and syscall. */
>  static long do_seccomp(unsigned int op, unsigned int flags,
>  		       void __user *uargs)
> @@ -2102,6 +2145,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
>  			return -EINVAL;
>  
>  		return seccomp_get_notif_sizes(uargs);
> +	case SECCOMP_CLONE_FILTER:
> +		if (flags != 0)
> +			return -EINVAL;
> +
> +		return seccomp_clone_filter(uargs);
>  	default:
>  		return -EINVAL;
>  	}
> diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
> index c85ae0ed8342..d38977f41b86 100644
> --- a/samples/seccomp/Makefile
> +++ b/samples/seccomp/Makefile
> @@ -1,5 +1,5 @@
>  # SPDX-License-Identifier: GPL-2.0
> -userprogs-always-y += bpf-fancy dropper bpf-direct user-trap
> +userprogs-always-y += bpf-fancy dropper bpf-direct user-trap clone-filter
>  
>  bpf-fancy-objs := bpf-fancy.o bpf-helper.o
>  
> diff --git a/samples/seccomp/clone-filter.c b/samples/seccomp/clone-filter.c
> new file mode 100644
> index 000000000000..d26e1375b9dc
> --- /dev/null
> +++ b/samples/seccomp/clone-filter.c
> @@ -0,0 +1,143 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Seccomp filter example for cloning a filter
> + *
> + * Copyright (c) 2025 Oracle and/or its affiliates.
> + * Author: Tom Hromatka <tom.hromatka@oracle.com>
> + *
> + * The code may be used by anyone for any purpose,
> + * and can serve as a starting point for developing
> + * applications that reuse the same seccomp filter
> + * across many processes.
> + */
> +#include <linux/seccomp.h>
> +#include <linux/filter.h>
> +#include <sys/syscall.h>
> +#include <sys/wait.h>
> +#include <stdbool.h>
> +#include <signal.h>
> +#include <stddef.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <errno.h>
> +
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
> +
> +static int seccomp(unsigned int op, unsigned int flags, void *args)
> +{
> +	errno = 0;
> +	return syscall(__NR_seccomp, op, flags, args);
> +}
> +
> +static int install_filter(void)
> +{
> +	struct sock_filter deny_filter[] = {
> +		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
> +			offsetof(struct seccomp_data, nr)),
> +		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
> +		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH),
> +		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
> +	};
> +	struct sock_fprog deny_prog = {
> +		.len = (unsigned short)ARRAY_SIZE(deny_filter),
> +		.filter = deny_filter,
> +	};
> +
> +	return seccomp(SECCOMP_SET_MODE_FILTER, 0, &deny_prog);
> +}
> +
> +static int clone_filter(pid_t ref_pid)
> +{
> +	int ref_pidfd, ret;
> +
> +	ref_pidfd = syscall(SYS_pidfd_open, ref_pid, 0);
> +	if (ref_pidfd < 0)
> +		return -errno;
> +
> +	ret = seccomp(SECCOMP_CLONE_FILTER, 0, &ref_pidfd);
> +
> +	close(ref_pidfd);
> +
> +	return ret;
> +}
> +
> +static void do_ref_filter(void)
> +{
> +	int ret;
> +
> +	ret = install_filter();
> +	if (ret) {
> +		perror("Failed to install ref filter\n");
> +		exit(1);
> +	}
> +
> +	while (true)
> +		sleep(1);
> +}
> +
> +static void do_child_process(pid_t ref_pid)
> +{
> +	pid_t res;
> +	int ret;
> +
> +	ret = clone_filter(ref_pid);
> +	if (ret != 0) {
> +		perror("Failed to clone filter. Installing filter from scratch\n");
> +
> +		ret = install_filter();
> +		if (ret != 0) {
> +			perror("Filter install failed\n");
> +			exit(ret);
> +		}
> +	}
> +
> +	res = syscall(__NR_getpid);
> +	if (res < 0) {
> +		perror("getpid() unexpectedly failed\n");
> +		exit(errno);
> +	}
> +
> +	res = syscall(__NR_getppid);
> +	if (res > 0) {
> +		perror("getppid() unexpectedly succeeded\n");
> +		exit(1);
> +	}
> +
> +	exit(0);
> +}
> +
> +int main(void)
> +{
> +	pid_t ref_pid = -1, child_pid = -1;
> +	int ret, status;
> +
> +	ref_pid = fork();
> +	if (ref_pid < 0)
> +		exit(errno);
> +	else if (ref_pid == 0)
> +		do_ref_filter();
> +
> +	child_pid = fork();
> +	if (child_pid < 0)
> +		goto out;
> +	else if (child_pid == 0)
> +		do_child_process(ref_pid);
> +
> +	waitpid(child_pid, &status, 0);
> +	if (WEXITSTATUS(status) != 0) {
> +		perror("child process failed");
> +		ret = WEXITSTATUS(status);
> +		goto out;
> +	}
> +
> +	ret = 0;
> +
> +out:
> +	if (ref_pid != -1)
> +		kill(ref_pid, SIGKILL);
> +	if (child_pid != -1)
> +		kill(child_pid, SIGKILL);
> +
> +	exit(ret);
> +}
> diff --git a/tools/include/uapi/linux/seccomp.h b/tools/include/uapi/linux/seccomp.h
> index dbfc9b37fcae..b0917e333b4b 100644
> --- a/tools/include/uapi/linux/seccomp.h
> +++ b/tools/include/uapi/linux/seccomp.h
> @@ -16,6 +16,7 @@
>  #define SECCOMP_SET_MODE_FILTER		1
>  #define SECCOMP_GET_ACTION_AVAIL	2
>  #define SECCOMP_GET_NOTIF_SIZES		3
> +#define SECCOMP_CLONE_FILTER		4
>  
>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
>  #define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 61acbd45ffaa..df5e0f615da0 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -177,6 +177,10 @@ struct seccomp_data {
>  #define SECCOMP_GET_NOTIF_SIZES 3
>  #endif
>  
> +#ifndef SECCOMP_CLONE_FILTER
> +#define SECCOMP_CLONE_FILTER 4
> +#endif
> +
>  #ifndef SECCOMP_FILTER_FLAG_TSYNC
>  #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
>  #endif
> @@ -5090,6 +5094,73 @@ TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
>  	ASSERT_EQ(0, run_probed_with_filter(&prog));
>  }
>  
> +TEST(clone_filter)

Yay tests!

> +{
> +	struct sock_filter deny_filter[] = {
> +		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
> +			offsetof(struct seccomp_data, nr)),
> +		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
> +		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH),
> +		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
> +	};
> +	struct sock_fprog deny_prog = {
> +		.len = (unsigned short)ARRAY_SIZE(deny_filter),
> +		.filter = deny_filter,
> +	};
> +	struct timespec ts = {
> +		.tv_sec = 0,
> +		.tv_nsec = 100000000,
> +	};
> +
> +	pid_t child_pid, self_pid, res;
> +	int child_pidfd, ret;
> +
> +	/* Only real root can copy a filter. */
> +	if (geteuid()) {

That's not true: uid==0 is not CAP_SYS_ADMIN. :) Look in this test for:

        cap_get_flag(cap, CAP_SYS_ADMIN, CAP_EFFECTIVE, &is_cap_sys_admin);

which is how to test this correctly.

> +		SKIP(return, "clone_filter requires real root");
> +		return;
> +	}
> +
> +	self_pid = getpid();
> +
> +	child_pid = fork();
> +	ASSERT_LE(0, child_pid);

Uh, isn't that supposed to be GE? This should have failed your test
immediately every time. Does this test pass for you? :P

> +
> +	if (child_pid == 0) {
> +		ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
> +		ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &deny_prog));

I'd assert that get_ppid gives ESRCH here just for completeness.

> +
> +		while (true)
> +			EXPECT_EQ(0, syscall(__NR_nanosleep, &ts, NULL));
> +	}
> +
> +	/* wait for the child pid to create its seccomp filter */
> +	ASSERT_EQ(0, syscall(__NR_nanosleep, &ts, NULL));

Please look at the other tests to see how to synchronize without a
non-deterministic sleep "guess". :) It's bad form to induce needless
delays in tests.

> +
> +	child_pidfd = syscall(SYS_pidfd_open, child_pid, 0);
> +	EXPECT_LE(0, child_pidfd);
> +
> +	/* Invalid flag provided */
> +	ret = seccomp(SECCOMP_CLONE_FILTER, 1, &child_pidfd);

I'd set all the bits, not just 1.

This is also missing a EFAULT test (i.e. a NULL child_pidfd). (Though I
still want to know if this is idiomatic for pidfd interfaces -- I'd not
expect this to be passed as a pointer.)

And it's missing a ESRCH test.

And a "this is not a pidfd" test.

> +	EXPECT_EQ(-1, ret);
> +	EXPECT_EQ(errno, EINVAL);
> +
> +	errno = 0;
> +	ret = seccomp(SECCOMP_CLONE_FILTER, 0, &child_pidfd);
> +	EXPECT_EQ(0, ret);
> +	EXPECT_EQ(errno, 0);
> +
> +	res = syscall(__NR_getppid);
> +	EXPECT_EQ(res, -1);
> +	EXPECT_EQ(errno, ESRCH);

I would validate that getppid succeeds before the CLONE_FILTER, just for
robustness.

> +
> +	res = syscall(__NR_getpid);
> +	EXPECT_EQ(res, self_pid);
> +
> +	close(child_pidfd);
> +	kill(child_pid, SIGKILL);

I'm not a fan of using "kill" for child sync in tests. I'd rather see a
blocking read or similar (so the child doesn't have to spin, even if
it's in sleep). But at the very least I'd want to see a
waitpid for this kill.

> +}

Please also check for the "I already have a filter attached"
failure path.

> +
>  /*
>   * TODO:
>   * - expand NNP testing
> -- 
> 2.47.3
> 

Thanks for working on this! It'd be a nice speed-up for sure.

-Kees

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-03 20:38 [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation Tom Hromatka
                   ` (2 preceding siblings ...)
  2025-09-04 16:26 ` Kees Cook
@ 2025-09-04 17:51 ` Al Viro
  2025-09-05 21:03   ` YiFei Zhu
  3 siblings, 1 reply; 13+ messages in thread
From: Al Viro @ 2025-09-04 17:51 UTC (permalink / raw)
  To: Tom Hromatka
  Cc: kees, luto, wad, sargun, corbet, shuah, brauner, linux-doc,
	linux-kernel, linux-kselftest, bpf, YiFei Zhu

On Wed, Sep 03, 2025 at 08:38:03PM +0000, Tom Hromatka wrote:

> +static long seccomp_clone_filter(void __user *upidfd)
> +{
> +	struct task_struct *task;
> +	unsigned int flags;
> +	pid_t pidfd;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;

OK...

> +	if (atomic_read(&current->seccomp.filter_count) > 0)
> +		return -EINVAL;

If it's atomic, then presumably there's something that can change
it asynchronously, right?  If so, what's there to prevent
invalidation of the result of this test right after you've
decided everything's fine?

Let's check...
; git grep -n -w filter_count
<64 lines of output, most clearly unrelated to that>
; git grep -n -w -c filter_count
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c:1
drivers/net/ethernet/intel/i40e/i40e_common.c:18
drivers/net/ethernet/intel/i40e/i40e_prototype.h:4
drivers/net/ethernet/qlogic/qede/qede_filter.c:13
drivers/net/ipa/ipa.h:2
drivers/net/ipa/ipa_cmd.c:1
drivers/net/ipa/ipa_table.c:6
fs/proc/array.c:1
include/linux/seccomp_types.h:2
init/init_task.c:1
kernel/seccomp.c:3
lib/kunit/executor.c:7
lib/kunit/executor_test.c:5

Sod drivers and lib/kunit, these are irrelevant.  Removing those
hits yields this:
; git grep -n -w filter_count|grep -v '[^dl]'
fs/proc/array.c:340:                        atomic_read(&p->seccomp.filter_count));
include/linux/seccomp_types.h:15: * @filter_count: number of seccomp filters
include/linux/seccomp_types.h:24:       atomic_t filter_count;
init/init_task.c:208:   .seccomp        = { .filter_count = ATOMIC_INIT(0) },
kernel/seccomp.c:631:           atomic_set(&thread->seccomp.filter_count,
kernel/seccomp.c:632:                      atomic_read(&caller->seccomp.filter_count));
kernel/seccomp.c:932:   atomic_inc(&current->seccomp.filter_count);

Aha.  We have a reader in fs/proc/array.c, definition of that thing in
seccomp_types.h, initialization in init_task.c and two places in
seccomp.c, one around line 631 copying the value from one thread to
another (seccomp_sync_threads()) and one at line 932 incrementing
the damn thing (seccomp_attach_filter()).

Humm...  OK, former is copying the filter_count of current (caller is
set to current there) to other threads in the same thread group and
apparently that's serialized on ->signal->cred_guard_mutex of that
bunch, as well as on current->sighand->siglock (and since all threads
in the group are going to share ->sighand, it's the same thing
as their ->sighand->siglock).

The latter increments that thing for current, under ->sighand->siglock.

So
	* atomic_t for filter_count looks like cargo-culting (and unless I'm
missing something, the only thing that cares about it is /proc/*/status;
rudiment of some sort?)
	* looks like the test can be invalidated by another thread hitting
that seccomp_sync_threads() thing (from a quick look, SECCOMP_SET_MODE_FILTER
with SECCOMP_FILTER_FLAG_TSYNC in flags).

> +	if (copy_from_user(&pidfd, upidfd, sizeof(pid_t)))
> +		return -EFAULT;

OK...

> +	task = pidfd_get_task(pidfd, &flags);
> +	if (IS_ERR(task))
> +		return -ESRCH;

OK...

> +	spin_lock_irq(&current->sighand->siglock);
> +	spin_lock_irq(&task->sighand->siglock);

WTF?  You are apparently trying to lock both the current and the task you
want to copy from, but... you are nesting two locks of the same sort
here, with not even preventing the self-deadlock (task and current sharing
->sighand - e.g. by belonging to the same thread group), let alone preventing
the same from two threads trying to take the same couple of locks in the
opposite orders.

More to the point, why do you need both at once?

> +	if (atomic_read(&task->seccomp.filter_count) == 0) {

OK...  from the earlier digging it looks like this actually stands for
"if task has no filter attached, piss off - nothing to copy".

> +		spin_unlock_irq(&task->sighand->siglock);
> +		spin_unlock_irq(&current->sighand->siglock);
> +		put_task_struct(task);
> +		return -EINVAL;
> +	}
> +
> +	get_seccomp_filter(task);

Umm...
void get_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;
	if (!orig)
		return;
	__get_seccomp_filter(orig);
	refcount_inc(&orig->users);
}
and
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
	refcount_inc(&filter->refs);
}

So you are taking task->seccomp.filter and bumping refcounts on
it, presumably allowing to unlock the task->sighand->siglock?

> +	current->seccomp = task->seccomp;

wait, what?  You are copying all fields at once, but... from the look
at what seccomp_sync_threads() was doing, it not quite that simple.
OK, atomic for filter_count is worthless, so plain copy will do,
but what about ->seccomp.filter?  This
                /* Make our new filter tree visible. */
		smp_store_release(&thread->seccomp.filter,
				  caller->seccomp.filter);
is potentially more serious.  Granted, in this case we are doing store
to our own thread's ->seccomp.filter, so the barrier implications
might be unimportant - if all reads are either under ->sighand->siglock
or done to current->seccomp.filter, we should be fine, but that needs
to be verified _AND_ commented upon.  Memory barriers are subtle
enough...

Looks like the only lockless reader is
	struct seccomp_filter *f =
			READ_ONCE(current->seccomp.filter);
in seccomp_run_filters(), so unless I've missed something (and "filter"
is not a search-friendly name ;-/) we should be fine; that READ_ONCE()
is there to handle *other* threads doing stores (with that
smp_store_release() in seccomp_sync_threads()).  Incidentally, this
	if (!lock_task_sighand(task, &flags))
		return -ESRCH;

	f = READ_ONCE(task->seccomp.filter);
in proc_pid_seccomp_cache() looks cargo-culted - it's *not* a lockless
access, so this READ_ONCE() is confusing.

Anyway, that copying needs a comment.  What's more, this
		__seccomp_filter_release(thread->seccomp.filter);
just prior to smp_store_release() is more serious - it drops the old
reference.  Yes, you count upon no old reference being there - that's
what your check of current->seccomp.filter_count was supposed to
guarantee, but... it could've appeared after the test.

> +	spin_unlock_irq(&task->sighand->siglock);

OK, finally unlocked the source...

> +	set_task_syscall_work(current, SECCOMP);

... marked current as "we have filters"

> +	spin_unlock_irq(&current->sighand->siglock);

... and unlocked the current.

So basically you have

	verify that current has no filters
	lock current
	lock source
	verify that source has filters
	grab reference to that
	store it in current, assuming that it still has no filters
	unlock source
	mark current as having filters
	unlock current

For one thing, the first check is done before we locked current,
making it racy.  For another, this "hold two locks at once" is
asking for trouble - it could be dealt with, but do we really
need both at once?  We do need the source locked for checking
if it has filters and grabbing a reference, but we don't need
current locked for that - the only thing this lock would give is
protection against filters appearing, but it's done too late to
guarantee that.  And the lock on source shouldn't be needed after
we'd got its filters and grabbed the reference.  So... something
along the lines of

	lock source
	verify that source has filters
	grab reference to that
	store it in local variable, along with filter_count and mode
	unlock source
	lock current
	verify that current has no filters
	copy the stuff we'd stashed into our local variabl to current
	mark current as having filters
	unlock current

That would avoid all problems with nested locks, by virtue of never
taking more than one at a time, but now we grab the reference(s)
to source filters before checking that current has none.  Which
means that we need to undo that on failure.  From the way an old
reference is dropped by seccomp_sync_threads(), that would be
__seccomp_filter_release(filters)...

So something like this:
	spin_lock_irq(&task->sighand->siglock);
	if (atomic_read(&task->seccomp.filter_count) == 0) {
		spin_unlock_irq(&task->sighand->siglock);
		put_task_struct(task);
		return -EINVAL;
	}
	get_seccomp_filter(task);
	new_seccomp = task->seccomp;
	spin_unlock_irq(&task->sighand->siglock);
	spin_lock_irq(&current->sighand->siglock);
	if (atomic_read(&current->seccomp.filter_count) > 0) {
		spin_unlock_irq(&current->sighand->siglock);
		__seccomp_filter_release(new_seccomp.filter);
		put_task_struct(task);
		return -EINVAL;
	}
	// no barriers - only current->seccomp.filter is read locklessly
	current->seccomp = new_seccomp;
	set_task_syscall_work(current, SECCOMP);
	spin_unlock_irq(&current->sighand->siglock);
	put_task_struct(task);
	return 0;

and I would suggest asking whoever had come up with that atomic for
filter_count if it's needed (or ever had been, for that matter).
Who was it, anyway?  Kees, unless I'm misreading the history,
and AFAICS on Cc in this thread, so...

Kees, is there any reason not to make it a plain int?  And what is
that READ_ONCE() doing in proc_pid_seccomp_cache(), while we are
at it...  That's 0d8315dddd28 "seccomp/cache: Report cache data
through /proc/pid/seccomp_cache", by YiFei Zhu <yifeifz2@illinois.edu>,
AFAICS.  Looks like YiFei Zhu <zhuyifei@google.com> is the current
address [Cc'd]...

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-04 14:54         ` Al Viro
@ 2025-09-04 18:10           ` Tom Hromatka
  0 siblings, 0 replies; 13+ messages in thread
From: Tom Hromatka @ 2025-09-04 18:10 UTC (permalink / raw)
  To: Al Viro
  Cc: Alexei Starovoitov, Kees Cook, Andy Lutomirski, Will Drewry,
	Sargun Dhillon, Jonathan Corbet, Shuah Khan, Christian Brauner,
	open list:DOCUMENTATION, LKML,
	open list:KERNEL SELFTEST FRAMEWORK, bpf

On 9/4/25 8:54 AM, Al Viro wrote:
> On Thu, Sep 04, 2025 at 08:26:30AM -0600, Tom Hromatka wrote:
> 
>> This snippet addresses the double irq issue.  I also added a
>> check to make sure that task != current.  (A user shouldn't
>> do that but who knows what they'll actually do.)
>>
>>          if (task == current) {
>>                  put_task_struct(task);
>>                  return -EINVAL;
>>          }
>>
>>          spin_lock_irq(&current->sighand->siglock);
>>          spin_lock(&task->sighand->siglock);
> 
> What do you expect to happen if two tasks do that to each other
> at the same time? 

As written, they'll deadlock sooner or later :(.

But that should be easy to fix by adding two checks prior to
grabbing locks:
1.  Check that the source has 1 or more seccomp filters
2.  Check that the target has 0 seccomp filters.

This would ensure that for the same two processes, there's
only one way the locks could be grabbed.


> Or, for that matter, if task has been spawned
> by current with CLONE_VM | CLONE_SIGHAND?

Don't know right off hand.  I'll look into it.

Thanks for the help!

Tom

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-04 16:26 ` Kees Cook
@ 2025-09-04 18:19   ` Tom Hromatka
  0 siblings, 0 replies; 13+ messages in thread
From: Tom Hromatka @ 2025-09-04 18:19 UTC (permalink / raw)
  To: Kees Cook
  Cc: luto, wad, sargun, corbet, shuah, brauner, linux-doc,
	linux-kernel, linux-kselftest, bpf

On 9/4/25 10:26 AM, Kees Cook wrote:
> On Wed, Sep 03, 2025 at 08:38:03PM +0000, Tom Hromatka wrote:
>> Add an operation, SECCOMP_CLONE_FILTER, that can copy the seccomp filters
>> from another process to the current process.
>>
>> I roughly reproduced the Docker seccomp filter [1] and timed how long it
>> takes to build it (via libseccomp) and attach it to a process.  After
>> 1000 runs, on average it took 3,740,000 TSC ticks (or ~1440 microseconds)
>> on an AMD EPYC 9J14 running at 2596 MHz.  The median build/load time was
>> 3,715,000 TSC ticks.
>>
>> On the same system, I preloaded the above Docker seccomp filter onto a
>> process.  (Note that I opened a pidfd to the reference process and left
>> the pidfd open for the entire run.)  I then cloned the filter using the
>> feature in this patch to 1000 new processes.  On average, it took 9,300
>> TSC ticks (or ~3.6 microseconds) to copy the filter to the new processes.
>> The median clone time was 9,048 TSC ticks.
>>
>> This is approximately a 400x performance improvement for those container
>> managers that are using the exact same seccomp filter across all of their
>> containers.
> 

Thanks for looking it over.  I'll make the technical changes in a v2 in
the next week or two.

> This is a nice speedup, but with devil's advocate hat on, are launchers
> spawning at rates high enough that this makes a difference?

For users that launch VMs that last hours or more, you are correct, this
change doesn't matter to them.

But there are a small subset of users that launch containers at a very
high rate and startup times are critical.

FWIW, easyseccomp [1] was created a few years ago in part because
generating filters with libseccomp can be challenging and somewhat
slow.

Thanks!

Tom

[1] https://github.com/giuseppe/easyseccomp

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
  2025-09-04 17:51 ` Al Viro
@ 2025-09-05 21:03   ` YiFei Zhu
  0 siblings, 0 replies; 13+ messages in thread
From: YiFei Zhu @ 2025-09-05 21:03 UTC (permalink / raw)
  To: Al Viro
  Cc: Tom Hromatka, kees, luto, wad, sargun, corbet, shuah, brauner,
	linux-doc, linux-kernel, linux-kselftest, bpf

On Thu, Sep 4, 2025 at 10:51 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> Looks like the only lockless reader is
>         struct seccomp_filter *f =
>                         READ_ONCE(current->seccomp.filter);
> in seccomp_run_filters(), so unless I've missed something (and "filter"
> is not a search-friendly name ;-/) we should be fine; that READ_ONCE()
> is there to handle *other* threads doing stores (with that
> smp_store_release() in seccomp_sync_threads()).  Incidentally, this
>         if (!lock_task_sighand(task, &flags))
>                 return -ESRCH;
>
>         f = READ_ONCE(task->seccomp.filter);
> in proc_pid_seccomp_cache() looks cargo-culted - it's *not* a lockless
> access, so this READ_ONCE() is confusing.

> Kees, is there any reason not to make it a plain int?  And what is
> that READ_ONCE() doing in proc_pid_seccomp_cache(), while we are
> at it...  That's 0d8315dddd28 "seccomp/cache: Report cache data
> through /proc/pid/seccomp_cache", by YiFei Zhu <yifeifz2@illinois.edu>,
> AFAICS.  Looks like YiFei Zhu <zhuyifei@google.com> is the current
> address [Cc'd]...

I don't remember the context, but looking at the lore [1], AFAICT, it
was initially incorrectly lockless, then locking was added; READ_ONCE
was a missed leftover.

Can send a patch to remove it.

YiFei Zhu

[1] https://lore.kernel.org/all/CAG48ez0whaSTobwnoJHW+Eyqg5a8H4JCO-KHrgsuNiEg0qbD3w@mail.gmail.com/

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2025-09-05 21:03 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-03 20:38 [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation Tom Hromatka
2025-09-03 20:45 ` Alexei Starovoitov
2025-09-03 20:51   ` Tom Hromatka
2025-09-03 22:44     ` Alexei Starovoitov
2025-09-04 12:08       ` Tom Hromatka
2025-09-04 14:26       ` Tom Hromatka
2025-09-04 14:54         ` Al Viro
2025-09-04 18:10           ` Tom Hromatka
2025-09-04 11:53 ` kernel test robot
2025-09-04 16:26 ` Kees Cook
2025-09-04 18:19   ` Tom Hromatka
2025-09-04 17:51 ` Al Viro
2025-09-05 21:03   ` YiFei Zhu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).