* [PATCH 1/4] ns: add bpf hooks
2026-02-20 0:38 [PATCH 0/4] bpf: add a few hooks for sandboxing Christian Brauner
@ 2026-02-20 0:38 ` Christian Brauner
2026-02-23 10:36 ` Matt Bobrowski
` (4 more replies)
2026-02-20 0:38 ` [PATCH 2/4] cgroup: add bpf hook for attach Christian Brauner
` (2 subsequent siblings)
3 siblings, 5 replies; 28+ messages in thread
From: Christian Brauner @ 2026-02-20 0:38 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo
Cc: KP Singh, bpf, linux-kernel, cgroups, Lennart Poettering,
Christian Brauner
Add the three namespace lifecycle hooks and make them available to bpf
lsm program types. This allows bpf to supervise namespace creation. I'm
in the process of adding various "universal truth" bpf programs to
systemd that will make use of this. This, e.g., allows locking a
program into a given set of namespaces.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
kernel/nscommon.c | 9 ++++++++-
kernel/nsproxy.c | 7 +++++++
4 files changed, 61 insertions(+), 1 deletion(-)
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 643809cc78c3..5ae438fdf567 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -12,6 +12,9 @@
#include <linux/bpf_verifier.h>
#include <linux/lsm_hooks.h>
+struct ns_common;
+struct nsset;
+
#ifdef CONFIG_BPF_LSM
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
@@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
struct bpf_retval_range *range);
+
+int bpf_lsm_namespace_alloc(struct ns_common *ns);
+void bpf_lsm_namespace_free(struct ns_common *ns);
+int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
+
int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
const struct bpf_dynptr *value_p, int flags);
int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
@@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
{
return false;
}
+
+static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
+{
+ return 0;
+}
+static inline void bpf_lsm_namespace_free(struct ns_common *ns)
+{
+}
+static inline int bpf_lsm_namespace_install(struct nsset *nsset,
+ struct ns_common *ns)
+{
+ return 0;
+}
#endif /* CONFIG_BPF_LSM */
#endif /* _LINUX_BPF_LSM_H */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0c4a0c8e6f70..f6378db46220 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
+__bpf_hook_start();
+
+__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
+{
+ return 0;
+}
+
+__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
+{
+}
+
+__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
+ struct ns_common *ns)
+{
+ return 0;
+}
+
+__bpf_hook_end();
+
#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
BTF_SET_START(bpf_lsm_hooks)
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
+BTF_ID(func, bpf_lsm_namespace_alloc)
+BTF_ID(func, bpf_lsm_namespace_free)
+BTF_ID(func, bpf_lsm_namespace_install)
BTF_SET_END(bpf_lsm_hooks)
BTF_SET_START(bpf_lsm_disabled_hooks)
@@ -383,6 +405,8 @@ BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
+BTF_ID(func, bpf_lsm_namespace_alloc)
+BTF_ID(func, bpf_lsm_namespace_install)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
@@ -395,6 +419,7 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_task_free)
+BTF_ID(func, bpf_lsm_namespace_free)
BTF_SET_END(untrusted_lsm_hooks)
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index bdc3c86231d3..c3613cab3d41 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+#include <linux/bpf_lsm.h>
#include <linux/ns_common.h>
#include <linux/nstree.h>
#include <linux/proc_ns.h>
@@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
ret = proc_alloc_inum(&ns->inum);
if (ret)
return ret;
+
/*
* Tree ref starts at 0. It's incremented when namespace enters
* active use (installed in nsproxy) and decremented when all
@@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
atomic_set(&ns->__ns_ref_active, 1);
else
atomic_set(&ns->__ns_ref_active, 0);
- return 0;
+
+ ret = bpf_lsm_namespace_alloc(ns);
+ if (ret && !inum)
+ proc_free_inum(ns->inum);
+ return ret;
}
void __ns_common_free(struct ns_common *ns)
{
+ bpf_lsm_namespace_free(ns);
proc_free_inum(ns->inum);
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 259c4b4f1eeb..5742f9664dbb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -9,6 +9,7 @@
* Pavel Emelianov <xemul@openvz.org>
*/
+#include <linux/bpf_lsm.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
@@ -379,6 +380,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
{
+ int ret;
+
+ ret = bpf_lsm_namespace_install(nsset, ns);
+ if (ret)
+ return ret;
+
return ns->ops->install(nsset, ns);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-20 0:38 ` [PATCH 1/4] ns: add bpf hooks Christian Brauner
@ 2026-02-23 10:36 ` Matt Bobrowski
2026-02-23 11:12 ` Christian Brauner
2026-02-23 12:44 ` Djalal Harouni
` (3 subsequent siblings)
4 siblings, 1 reply; 28+ messages in thread
From: Matt Bobrowski @ 2026-02-23 10:36 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> Add the three namespace lifecycle hooks and make them available to bpf
> lsm program types. This allows bpf to supervise namespace creation. I'm
> in the process of adding various "universal truth" bpf programs to
> systemd that will make use of this. This e.g., allows to lock in a
> program into a given set of namespaces.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> kernel/nscommon.c | 9 ++++++++-
> kernel/nsproxy.c | 7 +++++++
> 4 files changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> index 643809cc78c3..5ae438fdf567 100644
> --- a/include/linux/bpf_lsm.h
> +++ b/include/linux/bpf_lsm.h
> @@ -12,6 +12,9 @@
> #include <linux/bpf_verifier.h>
> #include <linux/lsm_hooks.h>
>
> +struct ns_common;
> +struct nsset;
> +
> #ifdef CONFIG_BPF_LSM
>
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
>
> int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> struct bpf_retval_range *range);
> +
> +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> +void bpf_lsm_namespace_free(struct ns_common *ns);
> +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> +
> int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> const struct bpf_dynptr *value_p, int flags);
> int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> {
> return false;
> }
> +
> +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
> +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> #endif /* CONFIG_BPF_LSM */
>
> #endif /* _LINUX_BPF_LSM_H */
> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> index 0c4a0c8e6f70..f6378db46220 100644
> --- a/kernel/bpf/bpf_lsm.c
> +++ b/kernel/bpf/bpf_lsm.c
> @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
>
> +__bpf_hook_start();
> +
> +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
> +
> +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__bpf_hook_end();
> +
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
> BTF_SET_START(bpf_lsm_hooks)
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_free)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(bpf_lsm_hooks)
>
> BTF_SET_START(bpf_lsm_disabled_hooks)
> @@ -383,6 +405,8 @@ BTF_ID(func, bpf_lsm_task_prctl)
> BTF_ID(func, bpf_lsm_task_setscheduler)
> BTF_ID(func, bpf_lsm_task_to_inode)
> BTF_ID(func, bpf_lsm_userns_create)
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(sleepable_lsm_hooks)
>
> BTF_SET_START(untrusted_lsm_hooks)
> @@ -395,6 +419,7 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
> BTF_ID(func, bpf_lsm_sk_free_security)
> #endif /* CONFIG_SECURITY_NETWORK */
> BTF_ID(func, bpf_lsm_task_free)
> +BTF_ID(func, bpf_lsm_namespace_free)
> BTF_SET_END(untrusted_lsm_hooks)
>
> bool bpf_lsm_is_sleepable_hook(u32 btf_id)
> diff --git a/kernel/nscommon.c b/kernel/nscommon.c
> index bdc3c86231d3..c3613cab3d41 100644
> --- a/kernel/nscommon.c
> +++ b/kernel/nscommon.c
> @@ -1,6 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0-only
> /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/ns_common.h>
> #include <linux/nstree.h>
> #include <linux/proc_ns.h>
> @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> ret = proc_alloc_inum(&ns->inum);
> if (ret)
> return ret;
> +
> /*
> * Tree ref starts at 0. It's incremented when namespace enters
> * active use (installed in nsproxy) and decremented when all
> @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> atomic_set(&ns->__ns_ref_active, 1);
> else
> atomic_set(&ns->__ns_ref_active, 0);
> - return 0;
> +
> + ret = bpf_lsm_namespace_alloc(ns);
> + if (ret && !inum)
> + proc_free_inum(ns->inum);
> + return ret;
> }
>
> void __ns_common_free(struct ns_common *ns)
> {
> + bpf_lsm_namespace_free(ns);
> proc_free_inum(ns->inum);
> }
>
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 259c4b4f1eeb..5742f9664dbb 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -9,6 +9,7 @@
> * Pavel Emelianov <xemul@openvz.org>
> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/slab.h>
> #include <linux/export.h>
> #include <linux/nsproxy.h>
> @@ -379,6 +380,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
>
> static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> {
> + int ret;
> +
> + ret = bpf_lsm_namespace_install(nsset, ns);
> + if (ret)
> + return ret;
> +
> return ns->ops->install(nsset, ns);
> }
What's the reason for not adding these new hook points to the generic
set of hooks that are currently being exposed directly by the LSM
framework? Honestly, it seems a little odd to be providing
declarations/definitions for a set of new hook points which are to be
exclusively siloed to BPF LSM implementations only. I'd argue that
some other LSM implementations could very well find namespace
lifecycle events possibly interesting.
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-23 10:36 ` Matt Bobrowski
@ 2026-02-23 11:12 ` Christian Brauner
2026-02-24 0:15 ` Matt Bobrowski
0 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-02-23 11:12 UTC (permalink / raw)
To: Matt Bobrowski
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Mon, Feb 23, 2026 at 10:36:19AM +0000, Matt Bobrowski wrote:
> On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> > Add the three namespace lifecycle hooks and make them available to bpf
> > lsm program types. This allows bpf to supervise namespace creation. I'm
> > in the process of adding various "universal truth" bpf programs to
> > systemd that will make use of this. This e.g., allows to lock in a
> > program into a given set of namespaces.
> >
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> > include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> > kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> > kernel/nscommon.c | 9 ++++++++-
> > kernel/nsproxy.c | 7 +++++++
> > 4 files changed, 61 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > index 643809cc78c3..5ae438fdf567 100644
> > --- a/include/linux/bpf_lsm.h
> > +++ b/include/linux/bpf_lsm.h
> > @@ -12,6 +12,9 @@
> > #include <linux/bpf_verifier.h>
> > #include <linux/lsm_hooks.h>
> >
> > +struct ns_common;
> > +struct nsset;
> > +
> > #ifdef CONFIG_BPF_LSM
> >
> > #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> > @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
> >
> > int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> > struct bpf_retval_range *range);
> > +
> > +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> > +void bpf_lsm_namespace_free(struct ns_common *ns);
> > +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> > +
> > int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> > const struct bpf_dynptr *value_p, int flags);
> > int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> > @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> > {
> > return false;
> > }
> > +
> > +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> > +{
> > +}
> > +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> > + struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > #endif /* CONFIG_BPF_LSM */
> >
> > #endif /* _LINUX_BPF_LSM_H */
> > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > index 0c4a0c8e6f70..f6378db46220 100644
> > --- a/kernel/bpf/bpf_lsm.c
> > +++ b/kernel/bpf/bpf_lsm.c
> > @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> > #include <linux/lsm_hook_defs.h>
> > #undef LSM_HOOK
> >
> > +__bpf_hook_start();
> > +
> > +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +
> > +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> > +{
> > +}
> > +
> > +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> > + struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +
> > +__bpf_hook_end();
> > +
> > #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
> > BTF_SET_START(bpf_lsm_hooks)
> > #include <linux/lsm_hook_defs.h>
> > #undef LSM_HOOK
> > +BTF_ID(func, bpf_lsm_namespace_alloc)
> > +BTF_ID(func, bpf_lsm_namespace_free)
> > +BTF_ID(func, bpf_lsm_namespace_install)
> > BTF_SET_END(bpf_lsm_hooks)
> >
> > BTF_SET_START(bpf_lsm_disabled_hooks)
> > @@ -383,6 +405,8 @@ BTF_ID(func, bpf_lsm_task_prctl)
> > BTF_ID(func, bpf_lsm_task_setscheduler)
> > BTF_ID(func, bpf_lsm_task_to_inode)
> > BTF_ID(func, bpf_lsm_userns_create)
> > +BTF_ID(func, bpf_lsm_namespace_alloc)
> > +BTF_ID(func, bpf_lsm_namespace_install)
> > BTF_SET_END(sleepable_lsm_hooks)
> >
> > BTF_SET_START(untrusted_lsm_hooks)
> > @@ -395,6 +419,7 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
> > BTF_ID(func, bpf_lsm_sk_free_security)
> > #endif /* CONFIG_SECURITY_NETWORK */
> > BTF_ID(func, bpf_lsm_task_free)
> > +BTF_ID(func, bpf_lsm_namespace_free)
> > BTF_SET_END(untrusted_lsm_hooks)
> >
> > bool bpf_lsm_is_sleepable_hook(u32 btf_id)
> > diff --git a/kernel/nscommon.c b/kernel/nscommon.c
> > index bdc3c86231d3..c3613cab3d41 100644
> > --- a/kernel/nscommon.c
> > +++ b/kernel/nscommon.c
> > @@ -1,6 +1,7 @@
> > // SPDX-License-Identifier: GPL-2.0-only
> > /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
> >
> > +#include <linux/bpf_lsm.h>
> > #include <linux/ns_common.h>
> > #include <linux/nstree.h>
> > #include <linux/proc_ns.h>
> > @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> > ret = proc_alloc_inum(&ns->inum);
> > if (ret)
> > return ret;
> > +
> > /*
> > * Tree ref starts at 0. It's incremented when namespace enters
> > * active use (installed in nsproxy) and decremented when all
> > @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> > atomic_set(&ns->__ns_ref_active, 1);
> > else
> > atomic_set(&ns->__ns_ref_active, 0);
> > - return 0;
> > +
> > + ret = bpf_lsm_namespace_alloc(ns);
> > + if (ret && !inum)
> > + proc_free_inum(ns->inum);
> > + return ret;
> > }
> >
> > void __ns_common_free(struct ns_common *ns)
> > {
> > + bpf_lsm_namespace_free(ns);
> > proc_free_inum(ns->inum);
> > }
> >
> > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> > index 259c4b4f1eeb..5742f9664dbb 100644
> > --- a/kernel/nsproxy.c
> > +++ b/kernel/nsproxy.c
> > @@ -9,6 +9,7 @@
> > * Pavel Emelianov <xemul@openvz.org>
> > */
> >
> > +#include <linux/bpf_lsm.h>
> > #include <linux/slab.h>
> > #include <linux/export.h>
> > #include <linux/nsproxy.h>
> > @@ -379,6 +380,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
> >
> > static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> > {
> > + int ret;
> > +
> > + ret = bpf_lsm_namespace_install(nsset, ns);
> > + if (ret)
> > + return ret;
> > +
> > return ns->ops->install(nsset, ns);
> > }
>
> What's the reason for not adding these new hook points to the generic
> set of hooks that are currently being exposed directly by the LSM
> framework? Honestly, it seems a little odd to be providing
> declarations/definitions for a set of new hook points which are to be
> exclusively siloed to BPF LSM implementations only. I'd argue that
> some other LSM implementations could very well find namespace
> lifecycle events possibly interesting.
The LSM layer is of the opinion that adding new security hooks is only
acceptable if an implementation for an in-tree LSM is provided alongside
it (cf. [1]). IOW, your bpf lsm needs are not sufficient justification
for adding new security hooks. So if you want to add security hooks that
a bpf lsm makes use of then you need to come up with an implementation
for another in-tree LSM.
However, a subsystem is free to add as much bpf support as it wants:
none, some, flamethrower mode. Cgroupfs has traditionally been very bpf
friendly. I maintain namespaces and rewrote the infra allowing me to
manage them uniformly now. bpf literally just needs an attach point. I
could also just add fmodret tracepoints and achieve the same result.
The same way you add bpf kfuncs to support access to functionality that
puts you way past what an in-tree use would be able to do. The question is
whether you want such capabilities to be bounded by in-tree users as
well.
Either a bpf lsm is an inextensible fixture bound to the scope of
security_* or you allow subsystems open to it to add functionality just
like adding a kfunc is.
[1]: https://patch.msgid.link/20260216-work-security-namespace-v1-1-075c28758e1f@kernel.org
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-23 11:12 ` Christian Brauner
@ 2026-02-24 0:15 ` Matt Bobrowski
0 siblings, 0 replies; 28+ messages in thread
From: Matt Bobrowski @ 2026-02-24 0:15 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Mon, Feb 23, 2026 at 12:12:28PM +0100, Christian Brauner wrote:
> On Mon, Feb 23, 2026 at 10:36:19AM +0000, Matt Bobrowski wrote:
> > On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> > > Add the three namespace lifecycle hooks and make them available to bpf
> > > lsm program types. This allows bpf to supervise namespace creation. I'm
> > > in the process of adding various "universal truth" bpf programs to
> > > systemd that will make use of this. This e.g., allows to lock in a
> > > program into a given set of namespaces.
> > >
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > ---
> > > include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> > > kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> > > kernel/nscommon.c | 9 ++++++++-
> > > kernel/nsproxy.c | 7 +++++++
> > > 4 files changed, 61 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > > index 643809cc78c3..5ae438fdf567 100644
> > > --- a/include/linux/bpf_lsm.h
> > > +++ b/include/linux/bpf_lsm.h
> > > @@ -12,6 +12,9 @@
> > > #include <linux/bpf_verifier.h>
> > > #include <linux/lsm_hooks.h>
> > >
> > > +struct ns_common;
> > > +struct nsset;
> > > +
> > > #ifdef CONFIG_BPF_LSM
> > >
> > > #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> > > @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
> > >
> > > int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> > > struct bpf_retval_range *range);
> > > +
> > > +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> > > +void bpf_lsm_namespace_free(struct ns_common *ns);
> > > +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> > > +
> > > int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> > > const struct bpf_dynptr *value_p, int flags);
> > > int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> > > @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> > > {
> > > return false;
> > > }
> > > +
> > > +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> > > +{
> > > +}
> > > +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> > > + struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > #endif /* CONFIG_BPF_LSM */
> > >
> > > #endif /* _LINUX_BPF_LSM_H */
> > > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > > index 0c4a0c8e6f70..f6378db46220 100644
> > > --- a/kernel/bpf/bpf_lsm.c
> > > +++ b/kernel/bpf/bpf_lsm.c
> > > @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> > > #include <linux/lsm_hook_defs.h>
> > > #undef LSM_HOOK
> > >
> > > +__bpf_hook_start();
> > > +
> > > +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +
> > > +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> > > +{
> > > +}
> > > +
> > > +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> > > + struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +
> > > +__bpf_hook_end();
> > > +
> > > #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
> > > BTF_SET_START(bpf_lsm_hooks)
> > > #include <linux/lsm_hook_defs.h>
> > > #undef LSM_HOOK
> > > +BTF_ID(func, bpf_lsm_namespace_alloc)
> > > +BTF_ID(func, bpf_lsm_namespace_free)
> > > +BTF_ID(func, bpf_lsm_namespace_install)
> > > BTF_SET_END(bpf_lsm_hooks)
> > >
> > > BTF_SET_START(bpf_lsm_disabled_hooks)
> > > @@ -383,6 +405,8 @@ BTF_ID(func, bpf_lsm_task_prctl)
> > > BTF_ID(func, bpf_lsm_task_setscheduler)
> > > BTF_ID(func, bpf_lsm_task_to_inode)
> > > BTF_ID(func, bpf_lsm_userns_create)
> > > +BTF_ID(func, bpf_lsm_namespace_alloc)
> > > +BTF_ID(func, bpf_lsm_namespace_install)
> > > BTF_SET_END(sleepable_lsm_hooks)
> > >
> > > BTF_SET_START(untrusted_lsm_hooks)
> > > @@ -395,6 +419,7 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
> > > BTF_ID(func, bpf_lsm_sk_free_security)
> > > #endif /* CONFIG_SECURITY_NETWORK */
> > > BTF_ID(func, bpf_lsm_task_free)
> > > +BTF_ID(func, bpf_lsm_namespace_free)
> > > BTF_SET_END(untrusted_lsm_hooks)
> > >
> > > bool bpf_lsm_is_sleepable_hook(u32 btf_id)
> > > diff --git a/kernel/nscommon.c b/kernel/nscommon.c
> > > index bdc3c86231d3..c3613cab3d41 100644
> > > --- a/kernel/nscommon.c
> > > +++ b/kernel/nscommon.c
> > > @@ -1,6 +1,7 @@
> > > // SPDX-License-Identifier: GPL-2.0-only
> > > /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
> > >
> > > +#include <linux/bpf_lsm.h>
> > > #include <linux/ns_common.h>
> > > #include <linux/nstree.h>
> > > #include <linux/proc_ns.h>
> > > @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> > > ret = proc_alloc_inum(&ns->inum);
> > > if (ret)
> > > return ret;
> > > +
> > > /*
> > > * Tree ref starts at 0. It's incremented when namespace enters
> > > * active use (installed in nsproxy) and decremented when all
> > > @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> > > atomic_set(&ns->__ns_ref_active, 1);
> > > else
> > > atomic_set(&ns->__ns_ref_active, 0);
> > > - return 0;
> > > +
> > > + ret = bpf_lsm_namespace_alloc(ns);
> > > + if (ret && !inum)
> > > + proc_free_inum(ns->inum);
> > > + return ret;
> > > }
> > >
> > > void __ns_common_free(struct ns_common *ns)
> > > {
> > > + bpf_lsm_namespace_free(ns);
> > > proc_free_inum(ns->inum);
> > > }
> > >
> > > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> > > index 259c4b4f1eeb..5742f9664dbb 100644
> > > --- a/kernel/nsproxy.c
> > > +++ b/kernel/nsproxy.c
> > > @@ -9,6 +9,7 @@
> > > * Pavel Emelianov <xemul@openvz.org>
> > > */
> > >
> > > +#include <linux/bpf_lsm.h>
> > > #include <linux/slab.h>
> > > #include <linux/export.h>
> > > #include <linux/nsproxy.h>
> > > @@ -379,6 +380,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
> > >
> > > static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> > > {
> > > + int ret;
> > > +
> > > + ret = bpf_lsm_namespace_install(nsset, ns);
> > > + if (ret)
> > > + return ret;
> > > +
> > > return ns->ops->install(nsset, ns);
> > > }
> >
> > What's the reason for not adding these new hook points to the generic
> > set of hooks that are currently being exposed directly by the LSM
> > framework? Honestly, it seems a little odd to be providing
> > declarations/definitions for a set of new hook points which are to be
> > exclusively siloed to BPF LSM implementations only. I'd argue that
> > some other LSM implementations could very well find namespace
> > lifecycle events possibly interesting.
>
> The LSM layer is of the opinion that adding new security hooks is only
> acceptable if an implementation for an in-tree LSM is provided alongside
> it (cf. [1]). IOW, your bpf lsm needs are not sufficient justification
> for adding new security hooks. So if you want to add security hooks that
> a bpf lsm makes use of then you need to come up with an implementation
> for another in-tree LSM.
I apologize. I didn't realize that adding these as new generic LSM
hooks points had already been proposed and discussed with the LSM
maintainers. I just wanted to make sure that we weren't
unintentionally side-stepping.
> However, a subsystem is free to add as much bpf support as it wants:
> none, some, flamethrower mode. Cgroupfs has traditionally been very bpf
> friendly. I maintain namespaces and rewrote the infra allowing me to
> manage them uniformly now. bpf literally just needs an attach point. I
> could also just add fmodret tracepoints and achieve the same result.
>
> The same way you add bpf kfuncs to support access to functionality that
> put you way past what an in-tree use would be able do. The question is
> whether you want such capabilities to be bounded by in-tree users as
> well.
>
> Either a bpf lsm is an inextensible fixture bound to the scope of
> security_* or you allow subsystems open to it to add functionality just
> like adding a kfuncs is.
Adding these dedicated BPF LSM hooks is OK with me, especially knowing
that I have agreement from you that you'll also be maintaining their
call sites.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-20 0:38 ` [PATCH 1/4] ns: add bpf hooks Christian Brauner
2026-02-23 10:36 ` Matt Bobrowski
@ 2026-02-23 12:44 ` Djalal Harouni
2026-02-27 11:04 ` Christian Brauner
2026-02-24 1:16 ` Matt Bobrowski
` (2 subsequent siblings)
4 siblings, 1 reply; 28+ messages in thread
From: Djalal Harouni @ 2026-02-23 12:44 UTC (permalink / raw)
To: Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Martin KaFai Lau, Tejun Heo
Cc: KP Singh, bpf, linux-kernel, cgroups, Lennart Poettering
On 2/20/26 01:38, Christian Brauner wrote:
> Add the three namespace lifecycle hooks and make them available to bpf
> lsm program types. This allows bpf to supervise namespace creation. I'm
> in the process of adding various "universal truth" bpf programs to
> systemd that will make use of this. This e.g., allows to lock in a
> program into a given set of namespaces.
Thank you Christian, so if this feature is added we will also
use it.
The commit log says lock in a given set of namespaces where I see
only setns path am I right? would it make sense to also have the
check around some callers of create_new_namespaces() where
appropriate before nsproxy switch if we don't want to go deep, but
allow a bit of control or easy checks around
CLONE_NEWNS/mount/pivot_root fs combinations?
Or deferring the combination checks to userspace makes more sense?
The other clone flags are presumably nested so safe, for userns
there is already a check, and cgroup+sb you added in the other
patch is great!
Thank you!
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> kernel/nscommon.c | 9 ++++++++-
> kernel/nsproxy.c | 7 +++++++
> 4 files changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> index 643809cc78c3..5ae438fdf567 100644
> --- a/include/linux/bpf_lsm.h
> +++ b/include/linux/bpf_lsm.h
> @@ -12,6 +12,9 @@
> #include <linux/bpf_verifier.h>
> #include <linux/lsm_hooks.h>
>
> +struct ns_common;
> +struct nsset;
> +
> #ifdef CONFIG_BPF_LSM
>
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
>
> int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> struct bpf_retval_range *range);
> +
> +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> +void bpf_lsm_namespace_free(struct ns_common *ns);
> +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> +
> int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> const struct bpf_dynptr *value_p, int flags);
> int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> {
> return false;
> }
> +
> +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
> +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> #endif /* CONFIG_BPF_LSM */
>
> #endif /* _LINUX_BPF_LSM_H */
> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> index 0c4a0c8e6f70..f6378db46220 100644
> --- a/kernel/bpf/bpf_lsm.c
> +++ b/kernel/bpf/bpf_lsm.c
> @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
>
> +__bpf_hook_start();
> +
> +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
> +
> +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__bpf_hook_end();
> +
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
> BTF_SET_START(bpf_lsm_hooks)
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_free)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(bpf_lsm_hooks)
>
> BTF_SET_START(bpf_lsm_disabled_hooks)
> @@ -383,6 +405,8 @@ BTF_ID(func, bpf_lsm_task_prctl)
> BTF_ID(func, bpf_lsm_task_setscheduler)
> BTF_ID(func, bpf_lsm_task_to_inode)
> BTF_ID(func, bpf_lsm_userns_create)
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(sleepable_lsm_hooks)
>
> BTF_SET_START(untrusted_lsm_hooks)
> @@ -395,6 +419,7 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
> BTF_ID(func, bpf_lsm_sk_free_security)
> #endif /* CONFIG_SECURITY_NETWORK */
> BTF_ID(func, bpf_lsm_task_free)
> +BTF_ID(func, bpf_lsm_namespace_free)
> BTF_SET_END(untrusted_lsm_hooks)
>
> bool bpf_lsm_is_sleepable_hook(u32 btf_id)
> diff --git a/kernel/nscommon.c b/kernel/nscommon.c
> index bdc3c86231d3..c3613cab3d41 100644
> --- a/kernel/nscommon.c
> +++ b/kernel/nscommon.c
> @@ -1,6 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0-only
> /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/ns_common.h>
> #include <linux/nstree.h>
> #include <linux/proc_ns.h>
> @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> ret = proc_alloc_inum(&ns->inum);
> if (ret)
> return ret;
> +
> /*
> * Tree ref starts at 0. It's incremented when namespace enters
> * active use (installed in nsproxy) and decremented when all
> @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> atomic_set(&ns->__ns_ref_active, 1);
> else
> atomic_set(&ns->__ns_ref_active, 0);
> - return 0;
> +
> + ret = bpf_lsm_namespace_alloc(ns);
> + if (ret && !inum)
> + proc_free_inum(ns->inum);
> + return ret;
> }
>
> void __ns_common_free(struct ns_common *ns)
> {
> + bpf_lsm_namespace_free(ns);
> proc_free_inum(ns->inum);
> }
>
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 259c4b4f1eeb..5742f9664dbb 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -9,6 +9,7 @@
> * Pavel Emelianov <xemul@openvz.org>
> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/slab.h>
> #include <linux/export.h>
> #include <linux/nsproxy.h>
> @@ -379,6 +380,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
>
> static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> {
> + int ret;
> +
> + ret = bpf_lsm_namespace_install(nsset, ns);
> + if (ret)
> + return ret;
> +
> return ns->ops->install(nsset, ns);
> }
>
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-23 12:44 ` Djalal Harouni
@ 2026-02-27 11:04 ` Christian Brauner
0 siblings, 0 replies; 28+ messages in thread
From: Christian Brauner @ 2026-02-27 11:04 UTC (permalink / raw)
To: Djalal Harouni
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Mon, Feb 23, 2026 at 01:44:23PM +0100, Djalal Harouni wrote:
> On 2/20/26 01:38, Christian Brauner wrote:
> > Add the three namespace lifecycle hooks and make them available to bpf
> > lsm program types. This allows bpf to supervise namespace creation. I'm
> > in the process of adding various "universal truth" bpf programs to
> > systemd that will make use of this. This e.g., allows to lock in a
> > program into a given set of namespaces.
>
> Thank you Christian, so if this feature is added we will also
> use it.
>
> The commit log says lock in a given set of namespaces where I see
> only setns path am I right? would it make sense to also have the
Yes.
> check around some callers of create_new_namespaces() where
> appropriate before nsproxy switch if we don't want to go deep, but
> allow a bit of control or easy checks around
> CLONE_NEWNS/mount/pivot_root fs combinations?
Yes, I have planned that but we will massage that codepath quite a bit
this cycle to deal with some races so I'd rather push this out for this
reason and also...
... I need to think about how exactly we should hook into that. Probably
when we already have assembled the new namespace set but then I want to
pass it to the hook in a way that I can guarantee KF_TRUSTED_ARGS so
callers can use the macros I have to cast from struct ns_common to
actual namespace type.
We will need additional per-ns type hooks in the future as well. Like,
One would very likely want to supervise writes of idmappings to a userns
and so we need to add hooks for that into /proc/<pid>/{g,u}id_map as
well... and setgroups now come to think of it.
And fwiw, I'm replacing pivot_root() this cycle and I expect userspace to
fade it out eventually. It's an insane system call that holds tasklist
lock to walk _all tasks_ on the system each time you switch the
container's rootfs just to mess with the pwd and root. That creates all
kinds of races and no container setup actually needs to do the pwd/root
replacement.
So it's really unneeded unless you do weird stuff like switching out the
rootfs in init_mnt_ns post early boot. Which is insane and can't work
for a lot of other reasons and the pwd/root rewrite doesn't solve
pinning via fds anyway so really that all needs to be Michael Myers'ed.
Next release MOVE_MOUNT_BENEATH will take over that job by making it
work with locked mounts and the rootfs.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-20 0:38 ` [PATCH 1/4] ns: add bpf hooks Christian Brauner
2026-02-23 10:36 ` Matt Bobrowski
2026-02-23 12:44 ` Djalal Harouni
@ 2026-02-24 1:16 ` Matt Bobrowski
2026-02-27 10:33 ` Christian Brauner
2026-02-24 13:35 ` Matt Bobrowski
2026-02-24 23:04 ` Song Liu
4 siblings, 1 reply; 28+ messages in thread
From: Matt Bobrowski @ 2026-02-24 1:16 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> Add the three namespace lifecycle hooks and make them available to bpf
> lsm program types. This allows bpf to supervise namespace creation. I'm
> in the process of adding various "universal truth" bpf programs to
> systemd that will make use of this. This e.g., allows to lock in a
> program into a given set of namespaces.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> kernel/nscommon.c | 9 ++++++++-
> kernel/nsproxy.c | 7 +++++++
> 4 files changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> index 643809cc78c3..5ae438fdf567 100644
> --- a/include/linux/bpf_lsm.h
> +++ b/include/linux/bpf_lsm.h
> @@ -12,6 +12,9 @@
> #include <linux/bpf_verifier.h>
> #include <linux/lsm_hooks.h>
>
> +struct ns_common;
> +struct nsset;
> +
> #ifdef CONFIG_BPF_LSM
>
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
>
> int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> struct bpf_retval_range *range);
> +
> +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> +void bpf_lsm_namespace_free(struct ns_common *ns);
> +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> +
> int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> const struct bpf_dynptr *value_p, int flags);
> int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> {
> return false;
> }
> +
> +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
> +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> #endif /* CONFIG_BPF_LSM */
>
> #endif /* _LINUX_BPF_LSM_H */
> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> index 0c4a0c8e6f70..f6378db46220 100644
> --- a/kernel/bpf/bpf_lsm.c
> +++ b/kernel/bpf/bpf_lsm.c
> @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
>
> +__bpf_hook_start();
> +
> +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
> +
> +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__bpf_hook_end();
Is the usage of __bpf_hook_start()/__bpf_hook_end() strictly necessary
here? If so, why is that? My understanding was that they're only
needed in situations where public function prototypes don't exist
(e.g., BPF kfuncs).
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
> BTF_SET_START(bpf_lsm_hooks)
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_free)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(bpf_lsm_hooks)
>
> BTF_SET_START(bpf_lsm_disabled_hooks)
> @@ -383,6 +405,8 @@ BTF_ID(func, bpf_lsm_task_prctl)
> BTF_ID(func, bpf_lsm_task_setscheduler)
> BTF_ID(func, bpf_lsm_task_to_inode)
> BTF_ID(func, bpf_lsm_userns_create)
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(sleepable_lsm_hooks)
>
> BTF_SET_START(untrusted_lsm_hooks)
> @@ -395,6 +419,7 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
> BTF_ID(func, bpf_lsm_sk_free_security)
> #endif /* CONFIG_SECURITY_NETWORK */
> BTF_ID(func, bpf_lsm_task_free)
> +BTF_ID(func, bpf_lsm_namespace_free)
> BTF_SET_END(untrusted_lsm_hooks)
>
> bool bpf_lsm_is_sleepable_hook(u32 btf_id)
> diff --git a/kernel/nscommon.c b/kernel/nscommon.c
> index bdc3c86231d3..c3613cab3d41 100644
> --- a/kernel/nscommon.c
> +++ b/kernel/nscommon.c
> @@ -1,6 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0-only
> /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/ns_common.h>
> #include <linux/nstree.h>
> #include <linux/proc_ns.h>
> @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> ret = proc_alloc_inum(&ns->inum);
> if (ret)
> return ret;
> +
> /*
> * Tree ref starts at 0. It's incremented when namespace enters
> * active use (installed in nsproxy) and decremented when all
> @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> atomic_set(&ns->__ns_ref_active, 1);
> else
> atomic_set(&ns->__ns_ref_active, 0);
> - return 0;
> +
> + ret = bpf_lsm_namespace_alloc(ns);
> + if (ret && !inum)
> + proc_free_inum(ns->inum);
> + return ret;
> }
>
> void __ns_common_free(struct ns_common *ns)
> {
> + bpf_lsm_namespace_free(ns);
> proc_free_inum(ns->inum);
> }
>
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 259c4b4f1eeb..5742f9664dbb 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -9,6 +9,7 @@
> * Pavel Emelianov <xemul@openvz.org>
> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/slab.h>
> #include <linux/export.h>
> #include <linux/nsproxy.h>
> @@ -379,6 +380,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
>
> static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> {
> + int ret;
> +
> + ret = bpf_lsm_namespace_install(nsset, ns);
> + if (ret)
> + return ret;
> +
> return ns->ops->install(nsset, ns);
> }
>
>
> --
> 2.47.3
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-24 1:16 ` Matt Bobrowski
@ 2026-02-27 10:33 ` Christian Brauner
2026-03-24 5:10 ` Matt Bobrowski
0 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-02-27 10:33 UTC (permalink / raw)
To: Matt Bobrowski
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Tue, Feb 24, 2026 at 01:16:01AM +0000, Matt Bobrowski wrote:
> On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> > Add the three namespace lifecycle hooks and make them available to bpf
> > lsm program types. This allows bpf to supervise namespace creation. I'm
> > in the process of adding various "universal truth" bpf programs to
> > systemd that will make use of this. This e.g., allows to lock in a
> > program into a given set of namespaces.
> >
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> > include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> > kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> > kernel/nscommon.c | 9 ++++++++-
> > kernel/nsproxy.c | 7 +++++++
> > 4 files changed, 61 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > index 643809cc78c3..5ae438fdf567 100644
> > --- a/include/linux/bpf_lsm.h
> > +++ b/include/linux/bpf_lsm.h
> > @@ -12,6 +12,9 @@
> > #include <linux/bpf_verifier.h>
> > #include <linux/lsm_hooks.h>
> >
> > +struct ns_common;
> > +struct nsset;
> > +
> > #ifdef CONFIG_BPF_LSM
> >
> > #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> > @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
> >
> > int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> > struct bpf_retval_range *range);
> > +
> > +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> > +void bpf_lsm_namespace_free(struct ns_common *ns);
> > +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> > +
> > int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> > const struct bpf_dynptr *value_p, int flags);
> > int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> > @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> > {
> > return false;
> > }
> > +
> > +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> > +{
> > +}
> > +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> > + struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > #endif /* CONFIG_BPF_LSM */
> >
> > #endif /* _LINUX_BPF_LSM_H */
> > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > index 0c4a0c8e6f70..f6378db46220 100644
> > --- a/kernel/bpf/bpf_lsm.c
> > +++ b/kernel/bpf/bpf_lsm.c
> > @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> > #include <linux/lsm_hook_defs.h>
> > #undef LSM_HOOK
> >
> > +__bpf_hook_start();
> > +
> > +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +
> > +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> > +{
> > +}
> > +
> > +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> > + struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +
> > +__bpf_hook_end();
>
> Is the usage of __bpf_hook_start()/__bpf_hook_end() strictly necessary
> here? If so, why is that? My understanding was that they're only
> needed in situations where public function prototypes don't exist
> (e.g., BPF kfuncs).
I don't know. I just went by other sites that added bpf specific
functions. Seems like bpf specific functions I'm adding so I used the
hook annotation. If unneeded I happily drop it. I just need someone to
tell whether that's right and I can't infer from your "my understanding
[...]" phrasing whether that's an authoritative statement or an
expression of doubt.
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-27 10:33 ` Christian Brauner
@ 2026-03-24 5:10 ` Matt Bobrowski
0 siblings, 0 replies; 28+ messages in thread
From: Matt Bobrowski @ 2026-03-24 5:10 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 27, 2026 at 11:33:56AM +0100, Christian Brauner wrote:
> On Tue, Feb 24, 2026 at 01:16:01AM +0000, Matt Bobrowski wrote:
> > On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> > > Add the three namespace lifecycle hooks and make them available to bpf
> > > lsm program types. This allows bpf to supervise namespace creation. I'm
> > > in the process of adding various "universal truth" bpf programs to
> > > systemd that will make use of this. This e.g., allows to lock in a
> > > program into a given set of namespaces.
> > >
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > ---
> > > include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> > > kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> > > kernel/nscommon.c | 9 ++++++++-
> > > kernel/nsproxy.c | 7 +++++++
> > > 4 files changed, 61 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > > index 643809cc78c3..5ae438fdf567 100644
> > > --- a/include/linux/bpf_lsm.h
> > > +++ b/include/linux/bpf_lsm.h
> > > @@ -12,6 +12,9 @@
> > > #include <linux/bpf_verifier.h>
> > > #include <linux/lsm_hooks.h>
> > >
> > > +struct ns_common;
> > > +struct nsset;
> > > +
> > > #ifdef CONFIG_BPF_LSM
> > >
> > > #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> > > @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
> > >
> > > int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> > > struct bpf_retval_range *range);
> > > +
> > > +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> > > +void bpf_lsm_namespace_free(struct ns_common *ns);
> > > +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> > > +
> > > int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> > > const struct bpf_dynptr *value_p, int flags);
> > > int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> > > @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> > > {
> > > return false;
> > > }
> > > +
> > > +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> > > +{
> > > +}
> > > +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> > > + struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > #endif /* CONFIG_BPF_LSM */
> > >
> > > #endif /* _LINUX_BPF_LSM_H */
> > > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > > index 0c4a0c8e6f70..f6378db46220 100644
> > > --- a/kernel/bpf/bpf_lsm.c
> > > +++ b/kernel/bpf/bpf_lsm.c
> > > @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> > > #include <linux/lsm_hook_defs.h>
> > > #undef LSM_HOOK
> > >
> > > +__bpf_hook_start();
> > > +
> > > +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +
> > > +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> > > +{
> > > +}
> > > +
> > > +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> > > + struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +
> > > +__bpf_hook_end();
> >
> > Is the usage of __bpf_hook_start()/__bpf_hook_end() strictly necessary
> > here? If so, why is that? My understanding was that they're only
> > needed in situations where public function prototypes don't exist
> > (e.g., BPF kfuncs).
>
> I don't know. I just went by other sites that added bpf specific
> functions. Seems like bpf specific functions I'm adding so I used the
> hook annotation. If unneeded I happily drop it. I just need someone to
> tell whether that's right and I can't infer from your "my understanding
> [...]" phrasing whether that's an authoritative statement or an
> expression of doubt.
Truly apologies about the delay here Christian, I've been out of
office the last few weeks.
Initially an expression of doubt, but now an authoritative
statement. You do not need your new BPF LSM specific hooks wrapped
within __bpf_hook_start() and __bpf_hook_end(). Those are technically
for BPF kfuncs which are global functions, but are often only called
from a BPF program. The default BPF LSM hook definitions provided by
the LSM_HOOK() macro also aren't wrapped in __bpf_hook_start() and
__bpf_hook_end().
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-20 0:38 ` [PATCH 1/4] ns: add bpf hooks Christian Brauner
` (2 preceding siblings ...)
2026-02-24 1:16 ` Matt Bobrowski
@ 2026-02-24 13:35 ` Matt Bobrowski
2026-02-27 14:33 ` Christian Brauner
2026-02-24 23:04 ` Song Liu
4 siblings, 1 reply; 28+ messages in thread
From: Matt Bobrowski @ 2026-02-24 13:35 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> Add the three namespace lifecycle hooks and make them available to bpf
> lsm program types. This allows bpf to supervise namespace creation. I'm
> in the process of adding various "universal truth" bpf programs to
> systemd that will make use of this. This e.g., allows to lock in a
> program into a given set of namespaces.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> kernel/nscommon.c | 9 ++++++++-
> kernel/nsproxy.c | 7 +++++++
> 4 files changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> index 643809cc78c3..5ae438fdf567 100644
> --- a/include/linux/bpf_lsm.h
> +++ b/include/linux/bpf_lsm.h
> @@ -12,6 +12,9 @@
> #include <linux/bpf_verifier.h>
> #include <linux/lsm_hooks.h>
>
> +struct ns_common;
> +struct nsset;
> +
> #ifdef CONFIG_BPF_LSM
>
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
>
> int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> struct bpf_retval_range *range);
> +
> +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> +void bpf_lsm_namespace_free(struct ns_common *ns);
> +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> +
> int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> const struct bpf_dynptr *value_p, int flags);
> int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> {
> return false;
> }
> +
> +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
> +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> #endif /* CONFIG_BPF_LSM */
>
> #endif /* _LINUX_BPF_LSM_H */
> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> index 0c4a0c8e6f70..f6378db46220 100644
> --- a/kernel/bpf/bpf_lsm.c
> +++ b/kernel/bpf/bpf_lsm.c
> @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
>
> +__bpf_hook_start();
> +
> +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> +{
> +}
I'm wondering how you foresee this hook functioning in a scenario
where the BPF LSM program is attached to this new hook point, although
with its attachment type being set to BPF_LSM_CGROUP instead of
BPF_LSM_MAC? You probably wouldn't want to utilize something like
BPF_LSM_CGROUP for your specific use case, but as things stand
currently I don't believe there's anything preventing you from using
BPF_LSM_CGROUP with a hook like bpf_lsm_namespace_free().
Notably, the BPF_LSM_CGROUP infrastructure is designed to execute BPF
programs based on the cgroup of the currently executing task. There
could be some surprises if the bpf_lsm_namespace_free() hook were to
ever be called from a context (e.g, kworker) other than the one
specified whilst attaching the BPF LSM program with type
BPF_LSM_CGROUP.
> +__weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
> + struct ns_common *ns)
> +{
> + return 0;
> +}
> +
> +__bpf_hook_end();
> +
> #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
> BTF_SET_START(bpf_lsm_hooks)
> #include <linux/lsm_hook_defs.h>
> #undef LSM_HOOK
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_free)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(bpf_lsm_hooks)
>
> BTF_SET_START(bpf_lsm_disabled_hooks)
> @@ -383,6 +405,8 @@ BTF_ID(func, bpf_lsm_task_prctl)
> BTF_ID(func, bpf_lsm_task_setscheduler)
> BTF_ID(func, bpf_lsm_task_to_inode)
> BTF_ID(func, bpf_lsm_userns_create)
> +BTF_ID(func, bpf_lsm_namespace_alloc)
> +BTF_ID(func, bpf_lsm_namespace_install)
> BTF_SET_END(sleepable_lsm_hooks)
>
> BTF_SET_START(untrusted_lsm_hooks)
> @@ -395,6 +419,7 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
> BTF_ID(func, bpf_lsm_sk_free_security)
> #endif /* CONFIG_SECURITY_NETWORK */
> BTF_ID(func, bpf_lsm_task_free)
> +BTF_ID(func, bpf_lsm_namespace_free)
> BTF_SET_END(untrusted_lsm_hooks)
>
> bool bpf_lsm_is_sleepable_hook(u32 btf_id)
> diff --git a/kernel/nscommon.c b/kernel/nscommon.c
> index bdc3c86231d3..c3613cab3d41 100644
> --- a/kernel/nscommon.c
> +++ b/kernel/nscommon.c
> @@ -1,6 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0-only
> /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/ns_common.h>
> #include <linux/nstree.h>
> #include <linux/proc_ns.h>
> @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> ret = proc_alloc_inum(&ns->inum);
> if (ret)
> return ret;
> +
> /*
> * Tree ref starts at 0. It's incremented when namespace enters
> * active use (installed in nsproxy) and decremented when all
> @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> atomic_set(&ns->__ns_ref_active, 1);
> else
> atomic_set(&ns->__ns_ref_active, 0);
> - return 0;
> +
> + ret = bpf_lsm_namespace_alloc(ns);
> + if (ret && !inum)
> + proc_free_inum(ns->inum);
> + return ret;
> }
>
> void __ns_common_free(struct ns_common *ns)
> {
> + bpf_lsm_namespace_free(ns);
> proc_free_inum(ns->inum);
> }
>
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 259c4b4f1eeb..5742f9664dbb 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -9,6 +9,7 @@
> * Pavel Emelianov <xemul@openvz.org>
> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/slab.h>
> #include <linux/export.h>
> #include <linux/nsproxy.h>
> @@ -379,6 +380,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
>
> static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
> {
> + int ret;
> +
> + ret = bpf_lsm_namespace_install(nsset, ns);
> + if (ret)
> + return ret;
> +
> return ns->ops->install(nsset, ns);
> }
>
>
> --
> 2.47.3
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-24 13:35 ` Matt Bobrowski
@ 2026-02-27 14:33 ` Christian Brauner
2026-03-24 5:27 ` Matt Bobrowski
0 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-02-27 14:33 UTC (permalink / raw)
To: Matt Bobrowski
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Tue, Feb 24, 2026 at 01:35:11PM +0000, Matt Bobrowski wrote:
> On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> > Add the three namespace lifecycle hooks and make them available to bpf
> > lsm program types. This allows bpf to supervise namespace creation. I'm
> > in the process of adding various "universal truth" bpf programs to
> > systemd that will make use of this. This e.g., allows to lock in a
> > program into a given set of namespaces.
> >
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> > include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> > kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> > kernel/nscommon.c | 9 ++++++++-
> > kernel/nsproxy.c | 7 +++++++
> > 4 files changed, 61 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > index 643809cc78c3..5ae438fdf567 100644
> > --- a/include/linux/bpf_lsm.h
> > +++ b/include/linux/bpf_lsm.h
> > @@ -12,6 +12,9 @@
> > #include <linux/bpf_verifier.h>
> > #include <linux/lsm_hooks.h>
> >
> > +struct ns_common;
> > +struct nsset;
> > +
> > #ifdef CONFIG_BPF_LSM
> >
> > #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> > @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
> >
> > int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> > struct bpf_retval_range *range);
> > +
> > +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> > +void bpf_lsm_namespace_free(struct ns_common *ns);
> > +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> > +
> > int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> > const struct bpf_dynptr *value_p, int flags);
> > int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> > @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> > {
> > return false;
> > }
> > +
> > +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> > +{
> > +}
> > +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> > + struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > #endif /* CONFIG_BPF_LSM */
> >
> > #endif /* _LINUX_BPF_LSM_H */
> > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > index 0c4a0c8e6f70..f6378db46220 100644
> > --- a/kernel/bpf/bpf_lsm.c
> > +++ b/kernel/bpf/bpf_lsm.c
> > @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> > #include <linux/lsm_hook_defs.h>
> > #undef LSM_HOOK
> >
> > +__bpf_hook_start();
> > +
> > +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > +{
> > + return 0;
> > +}
> > +
> > +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> > +{
> > +}
>
> I'm wondering how you foresee this hook functioning in a scenario
> where the BPF LSM program is attached to this new hook point, although
> with its attachment type being set to BPF_LSM_CGROUP instead of
> BPF_LSM_MAC? You probably wouldn't want to utilize something like
> BPF_LSM_CGROUP for your specific use case, but as things stand
> currently I don't believe there's anyhthing preventing you from using
> BPF_LSM_CGROUP with a hook like bpf_lsm_namespace_free().
Oh, I very much would like this to be attachable to cgroups.
> Notably, the BPF_LSM_CGROUP infrastructure is designed to execute BPF
> programs based on the cgroup of the currently executing task. There
> could be some surprises if the bpf_lsm_namespace_free() hook were to
> ever be called from a context (e.g, kworker) other than the one
> specified whilst attaching the BPF LSM program with type
> BPF_LSM_CGROUP.
But isn't this then a generic problem? What about:
# RCU callbacks
security_cred_free
security_task_free
security_inode_free_security_rcu
security_bpf_prog_free
security_xfrm_policy_free_security
security_msg_queue_free_security
security_shm_free_security
security_sem_free_security
security_audit_rule_free
security_bdev_free_security
security_sk_free_security
# Workqueues
security_bpf_map_free
security_bpf_token_free
security_sb_free_security
security_file_free_security
security_file_release
security_xfrm_state_free_security
ignoring softirq/hardirq for now.
So the only real problem I can see is that someone wants to do something
from a *_free() hook that isn't actually freeing but actual policy based
on the cgroup of @current? I find that hard to believe tbh. Fwiw,
bpf_lsm_namespace_free() is classified as untrusted because at that
point the outer namespace might already be blown away partially.
Effectively alloc() and free() hooks are mostly notification mechanisms
of creation/destructions. If you want to do actual policy you might have
to defer it until an actual operation is done.
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-27 14:33 ` Christian Brauner
@ 2026-03-24 5:27 ` Matt Bobrowski
0 siblings, 0 replies; 28+ messages in thread
From: Matt Bobrowski @ 2026-03-24 5:27 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 27, 2026 at 03:33:21PM +0100, Christian Brauner wrote:
> On Tue, Feb 24, 2026 at 01:35:11PM +0000, Matt Bobrowski wrote:
> > On Fri, Feb 20, 2026 at 01:38:29AM +0100, Christian Brauner wrote:
> > > Add the three namespace lifecycle hooks and make them available to bpf
> > > lsm program types. This allows bpf to supervise namespace creation. I'm
> > > in the process of adding various "universal truth" bpf programs to
> > > systemd that will make use of this. This e.g., allows to lock in a
> > > program into a given set of namespaces.
> > >
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > ---
> > > include/linux/bpf_lsm.h | 21 +++++++++++++++++++++
> > > kernel/bpf/bpf_lsm.c | 25 +++++++++++++++++++++++++
> > > kernel/nscommon.c | 9 ++++++++-
> > > kernel/nsproxy.c | 7 +++++++
> > > 4 files changed, 61 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > > index 643809cc78c3..5ae438fdf567 100644
> > > --- a/include/linux/bpf_lsm.h
> > > +++ b/include/linux/bpf_lsm.h
> > > @@ -12,6 +12,9 @@
> > > #include <linux/bpf_verifier.h>
> > > #include <linux/lsm_hooks.h>
> > >
> > > +struct ns_common;
> > > +struct nsset;
> > > +
> > > #ifdef CONFIG_BPF_LSM
> > >
> > > #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
> > > @@ -48,6 +51,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func)
> > >
> > > int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> > > struct bpf_retval_range *range);
> > > +
> > > +int bpf_lsm_namespace_alloc(struct ns_common *ns);
> > > +void bpf_lsm_namespace_free(struct ns_common *ns);
> > > +int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
> > > +
> > > int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
> > > const struct bpf_dynptr *value_p, int flags);
> > > int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
> > > @@ -104,6 +112,19 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
> > > {
> > > return false;
> > > }
> > > +
> > > +static inline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +static inline void bpf_lsm_namespace_free(struct ns_common *ns)
> > > +{
> > > +}
> > > +static inline int bpf_lsm_namespace_install(struct nsset *nsset,
> > > + struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > #endif /* CONFIG_BPF_LSM */
> > >
> > > #endif /* _LINUX_BPF_LSM_H */
> > > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > > index 0c4a0c8e6f70..f6378db46220 100644
> > > --- a/kernel/bpf/bpf_lsm.c
> > > +++ b/kernel/bpf/bpf_lsm.c
> > > @@ -30,10 +30,32 @@ __weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
> > > #include <linux/lsm_hook_defs.h>
> > > #undef LSM_HOOK
> > >
> > > +__bpf_hook_start();
> > > +
> > > +__weak noinline int bpf_lsm_namespace_alloc(struct ns_common *ns)
> > > +{
> > > + return 0;
> > > +}
> > > +
> > > +__weak noinline void bpf_lsm_namespace_free(struct ns_common *ns)
> > > +{
> > > +}
> >
> > I'm wondering how you foresee this hook functioning in a scenario
> > where the BPF LSM program is attached to this new hook point, although
> > with its attachment type being set to BPF_LSM_CGROUP instead of
> > BPF_LSM_MAC? You probably wouldn't want to utilize something like
> > BPF_LSM_CGROUP for your specific use case, but as things stand
> > currently I don't believe there's anyhthing preventing you from using
> > BPF_LSM_CGROUP with a hook like bpf_lsm_namespace_free().
>
> Oh, I very much would like this to be attachable to cgroups.
>
> > Notably, the BPF_LSM_CGROUP infrastructure is designed to execute BPF
> > programs based on the cgroup of the currently executing task. There
> > could be some surprises if the bpf_lsm_namespace_free() hook were to
> > ever be called from a context (e.g, kworker) other than the one
> > specified whilst attaching the BPF LSM program with type
> > BPF_LSM_CGROUP.
>
> But isn't this then a generic problem? What about:
>
> # RCU callbacks
> security_cred_free
> security_task_free
> security_inode_free_security_rcu
> security_bpf_prog_free
> security_xfrm_policy_free_security
> security_msg_queue_free_security
> security_shm_free_security
> security_sem_free_security
> security_audit_rule_free
> security_bdev_free_security
> security_sk_free_security
>
> # Workqueues
> security_bpf_map_free
> security_bpf_token_free
> security_sb_free_security
> security_file_free_security
> security_file_release
> security_xfrm_state_free_security
>
> ignoring sofirq/hardirq for now.
I'd need to take another deep look, but yeah, from what I can tell
this is a broader general issue for BPF LSM programs which happen to
also make use of the BPF_LSM_CGROUP attachment type.
> So the only real problem I can see is that someone wants to do something
> from a *_free() hook that isn't actually freeing but actual policy based
> on the cgroup of @current? I find that hard to believe tbh.
> Fwiw, bpf_lsm_namespace_free() is classified as untrusted because at
> that point the outer namespace might already be blown away
> partially. Effectively alloc() and free() hooks are mostly
> notification mechanisms of creation/destructions. If you want to do
> actual policy you might have to defer it until an actual operation
> is done.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-20 0:38 ` [PATCH 1/4] ns: add bpf hooks Christian Brauner
` (3 preceding siblings ...)
2026-02-24 13:35 ` Matt Bobrowski
@ 2026-02-24 23:04 ` Song Liu
2026-02-27 10:28 ` Christian Brauner
4 siblings, 1 reply; 28+ messages in thread
From: Song Liu @ 2026-02-24 23:04 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Thu, Feb 19, 2026 at 4:38 PM Christian Brauner <brauner@kernel.org> wrote:
[...]
> @@ -1,6 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0-only
> /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
>
> +#include <linux/bpf_lsm.h>
> #include <linux/ns_common.h>
> #include <linux/nstree.h>
> #include <linux/proc_ns.h>
> @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> ret = proc_alloc_inum(&ns->inum);
> if (ret)
> return ret;
> +
> /*
> * Tree ref starts at 0. It's incremented when namespace enters
> * active use (installed in nsproxy) and decremented when all
> @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> atomic_set(&ns->__ns_ref_active, 1);
> else
> atomic_set(&ns->__ns_ref_active, 0);
> - return 0;
> +
> + ret = bpf_lsm_namespace_alloc(ns);
> + if (ret && !inum)
> + proc_free_inum(ns->inum);
> + return ret;
> }
If we change the hook as
bpf_lsm_namespace_alloc(ns, inum);
We can move it to the beginning of __ns_common_init().
This change allows blocking __ns_common_init() before
it makes any changes to the ns. Is this a better approach?
Thanks,
Song
[...]
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-24 23:04 ` Song Liu
@ 2026-02-27 10:28 ` Christian Brauner
2026-02-27 16:38 ` Song Liu
0 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-02-27 10:28 UTC (permalink / raw)
To: Song Liu
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Tue, Feb 24, 2026 at 03:04:43PM -0800, Song Liu wrote:
> On Thu, Feb 19, 2026 at 4:38 PM Christian Brauner <brauner@kernel.org> wrote:
> [...]
> > @@ -1,6 +1,7 @@
> > // SPDX-License-Identifier: GPL-2.0-only
> > /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
> >
> > +#include <linux/bpf_lsm.h>
> > #include <linux/ns_common.h>
> > #include <linux/nstree.h>
> > #include <linux/proc_ns.h>
> > @@ -77,6 +78,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> > ret = proc_alloc_inum(&ns->inum);
> > if (ret)
> > return ret;
> > +
> > /*
> > * Tree ref starts at 0. It's incremented when namespace enters
> > * active use (installed in nsproxy) and decremented when all
> > @@ -86,11 +88,16 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
> > atomic_set(&ns->__ns_ref_active, 1);
> > else
> > atomic_set(&ns->__ns_ref_active, 0);
> > - return 0;
> > +
> > + ret = bpf_lsm_namespace_alloc(ns);
> > + if (ret && !inum)
> > + proc_free_inum(ns->inum);
> > + return ret;
> > }
>
> If we change the hook as
>
> bpf_lsm_namespace_alloc(ns, inum);
>
> We can move it to the beginning of __ns_common_init().
> This change allows blocking __ns_common_init() before
> it makes any changes to the ns. Is this a better approach?
I don't think it matters tbh. We have no control when exactly
__ns_common_init() is called. That's up to the containing namespace. We
can't rely on the namespace to have been correctly set up at this time.
My main goal was to have struct ns_common be fully initialized
already so that direct access to its fields already makes sense.
The containing namespace may already have to roll back a bunch of stuff
anyway.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-27 10:28 ` Christian Brauner
@ 2026-02-27 16:38 ` Song Liu
2026-03-02 9:46 ` Christian Brauner
0 siblings, 1 reply; 28+ messages in thread
From: Song Liu @ 2026-02-27 16:38 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 27, 2026 at 2:28 AM Christian Brauner <brauner@kernel.org> wrote:
[...]
> >
> > If we change the hook as
> >
> > bpf_lsm_namespace_alloc(ns, inum);
> >
> > We can move it to the beginning of __ns_common_init().
> > This change allows blocking __ns_common_init() before
> > it makes any changes to the ns. Is this a better approach?
>
> I don't think it matters tbh. We have no control when exactly
> __ns_common_init() is called. That's up to the containing namespace. We
> can't rely on the namespace to have been correctly set up at this time.
> My main goal was to have struct ns_common to be fully initialized
> already so that direct access to it's field already makes sense.
Good point on having ns_common initialized. Besides inum, we
should also pass ns_type and ops into the hook.
OTOH, shall we have the hook before proc_alloc_inum()? With
this change, the hook can block the operation before it causes
any contention on proc_inum_ida. IOW, how about we have:
@@ -71,6 +71,10 @@ int __ns_common_init(struct ns_common *ns, u32
ns_type, const struct proc_ns_ope
ns_debug(ns, ops);
#endif
+ ret = bpf_lsm_namespace_alloc(ns, inum);
+ if (ret)
+ return ret;
+
if (inum)
ns->inum = inum;
else
With this change, ns is already initialized, except the inum.
WDYT?
Thanks,
Song
> The containing namespace my already have to rollback a bunch of stuff
> anyway.
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] ns: add bpf hooks
2026-02-27 16:38 ` Song Liu
@ 2026-03-02 9:46 ` Christian Brauner
2026-03-03 16:44 ` Song Liu
0 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-03-02 9:46 UTC (permalink / raw)
To: Song Liu
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 27, 2026 at 08:38:48AM -0800, Song Liu wrote:
> On Fri, Feb 27, 2026 at 2:28 AM Christian Brauner <brauner@kernel.org> wrote:
> [...]
> > >
> > > If we change the hook as
> > >
> > > bpf_lsm_namespace_alloc(ns, inum);
> > >
> > > We can move it to the beginning of __ns_common_init().
> > > This change allows blocking __ns_common_init() before
> > > it makes any changes to the ns. Is this a better approach?
> >
> > I don't think it matters tbh. We have no control when exactly
> > __ns_common_init() is called. That's up to the containing namespace. We
> > can't rely on the namespace to have been correctly set up at this time.
> > My main goal was to have struct ns_common to be fully initialized
> > already so that direct access to it's field already makes sense.
>
> Good point on having ns_common initialized. Besides inum, we
> should also pass ns_type and ops into the hook.
But why? The struct ns_common is already fully initialized when it is
passed to bpf_lsm_namespace_alloc() including ops, inum, ns_type etc.
>
> OTOH, shall we have the hook before proc_alloc_inum()? With
> this change, the hook can block the operation before it causes
> any contention on proc_inum_ida. IOW, how about we have:
I think that contention is meaningless and I'd rather have struct
ns_common fully set up so that all fields can be accessed.
>
> @@ -71,6 +71,10 @@ int __ns_common_init(struct ns_common *ns, u32
> ns_type, const struct proc_ns_ope
> ns_debug(ns, ops);
> #endif
>
> + ret = bpf_lsm_namespace_alloc(ns, inum);
> + if (ret)
> + return ret;
> +
> if (inum)
> ns->inum = inum;
> else
>
> With this change, ns is already initialized, except the inum.
>
> WDYT?
>
> Thanks,
> Song
>
> > The containing namespace my already have to rollback a bunch of stuff
> > anyway.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] ns: add bpf hooks
2026-03-02 9:46 ` Christian Brauner
@ 2026-03-03 16:44 ` Song Liu
0 siblings, 0 replies; 28+ messages in thread
From: Song Liu @ 2026-03-03 16:44 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Mon, Mar 2, 2026 at 1:46 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Feb 27, 2026 at 08:38:48AM -0800, Song Liu wrote:
> > On Fri, Feb 27, 2026 at 2:28 AM Christian Brauner <brauner@kernel.org> wrote:
> > [...]
> > > >
> > > > If we change the hook as
> > > >
> > > > bpf_lsm_namespace_alloc(ns, inum);
> > > >
> > > > We can move it to the beginning of __ns_common_init().
> > > > This change allows blocking __ns_common_init() before
> > > > it makes any changes to the ns. Is this a better approach?
> > >
> > > I don't think it matters tbh. We have no control when exactly
> > > __ns_common_init() is called. That's up to the containing namespace. We
> > > can't rely on the namespace to have been correctly set up at this time.
> > > My main goal was to have struct ns_common to be fully initialized
> > > already so that direct access to it's field already makes sense.
> >
> > Good point on having ns_common initialized. Besides inum, we
> > should also pass ns_type and ops into the hook.
>
> But why? The struct ns_common is already fully initialized when it is
> passed to bpf_lsm_namespace_alloc() including ops, inum, ns_type etc.
I meant if we pull bpf_lsm_namespace_alloc() to the beginning of
__ns_common_init(), we need ns_type etc. because ns_common
is not fully initialized. IOW, I agree with your earlier comment.
> >
> > OTOH, shall we have the hook before proc_alloc_inum()? With
> > this change, the hook can block the operation before it causes
> > any contention on proc_inum_ida. IOW, how about we have:
>
> I think that contention is meaningless and I'd rather have struct
> ns_common fully set up so that all fields can be accessed.
If contention is not a concern, which I believe you know better
than I do, I think this patch works fine. So
Acked-by: Song Liu <song@kernel.org>
Thanks,
Song
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 2/4] cgroup: add bpf hook for attach
2026-02-20 0:38 [PATCH 0/4] bpf: add a few hooks for sandboxing Christian Brauner
2026-02-20 0:38 ` [PATCH 1/4] ns: add bpf hooks Christian Brauner
@ 2026-02-20 0:38 ` Christian Brauner
2026-02-20 15:16 ` Tejun Heo
2026-02-23 15:47 ` Michal Koutný
2026-02-20 0:38 ` [PATCH 3/4] selftests/bpf: add ns hook selftest Christian Brauner
2026-02-20 0:38 ` [PATCH 4/4] selftests/bpf: add cgroup attach selftests Christian Brauner
3 siblings, 2 replies; 28+ messages in thread
From: Christian Brauner @ 2026-02-20 0:38 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo
Cc: KP Singh, bpf, linux-kernel, cgroups, Lennart Poettering,
Christian Brauner
Add a hook to manage attaching tasks to cgroup. I'm in the process of
adding various "universal truth" bpf programs to systemd that will make
use of this.
This has been a long-standing request (cf. [1] and [2]). It will allow us to
enforce cgroup migrations and ensure that services can never escape their
cgroups. This is just one of many use-cases.
Link: https://github.com/systemd/systemd/issues/6356 [1]
Link: https://github.com/systemd/systemd/issues/22874 [2]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/bpf_lsm.h | 15 +++++++++++++++
kernel/bpf/bpf_lsm.c | 12 ++++++++++++
kernel/cgroup/cgroup.c | 18 +++++++++++-------
3 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 5ae438fdf567..bc1d35b271f5 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -12,8 +12,11 @@
#include <linux/bpf_verifier.h>
#include <linux/lsm_hooks.h>
+struct cgroup;
+struct cgroup_namespace;
struct ns_common;
struct nsset;
+struct super_block;
#ifdef CONFIG_BPF_LSM
@@ -55,6 +58,9 @@ int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
int bpf_lsm_namespace_alloc(struct ns_common *ns);
void bpf_lsm_namespace_free(struct ns_common *ns);
int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
+int bpf_lsm_cgroup_attach(struct task_struct *task, struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp, struct super_block *sb,
+ bool threadgroup, struct cgroup_namespace *ns);
int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
const struct bpf_dynptr *value_p, int flags);
@@ -125,6 +131,15 @@ static inline int bpf_lsm_namespace_install(struct nsset *nsset,
{
return 0;
}
+static inline int bpf_lsm_cgroup_attach(struct task_struct *task,
+ struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb,
+ bool threadgroup,
+ struct cgroup_namespace *ns)
+{
+ return 0;
+}
#endif /* CONFIG_BPF_LSM */
#endif /* _LINUX_BPF_LSM_H */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index f6378db46220..1da5585082fa 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -47,6 +47,16 @@ __weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
return 0;
}
+__weak noinline int bpf_lsm_cgroup_attach(struct task_struct *task,
+ struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb,
+ bool threadgroup,
+ struct cgroup_namespace *ns)
+{
+ return 0;
+}
+
__bpf_hook_end();
#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
@@ -56,6 +66,7 @@ BTF_SET_START(bpf_lsm_hooks)
BTF_ID(func, bpf_lsm_namespace_alloc)
BTF_ID(func, bpf_lsm_namespace_free)
BTF_ID(func, bpf_lsm_namespace_install)
+BTF_ID(func, bpf_lsm_cgroup_attach)
BTF_SET_END(bpf_lsm_hooks)
BTF_SET_START(bpf_lsm_disabled_hooks)
@@ -407,6 +418,7 @@ BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
BTF_ID(func, bpf_lsm_namespace_alloc)
BTF_ID(func, bpf_lsm_namespace_install)
+BTF_ID(func, bpf_lsm_cgroup_attach)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8af4351536cf..16535349b22f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,6 +28,7 @@
#include "cgroup-internal.h"
#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -5334,7 +5335,8 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
return 0;
}
-static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+static int cgroup_attach_permissions(struct task_struct *task,
+ struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
struct super_block *sb, bool threadgroup,
struct cgroup_namespace *ns)
@@ -5350,9 +5352,9 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
return ret;
if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
- ret = -EOPNOTSUPP;
+ return -EOPNOTSUPP;
- return ret;
+ return bpf_lsm_cgroup_attach(task, src_cgrp, dst_cgrp, sb, threadgroup, ns);
}
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
@@ -5384,7 +5386,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* inherited fd attacks.
*/
scoped_with_creds(of->file->f_cred)
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ ret = cgroup_attach_permissions(task, src_cgrp, dst_cgrp,
of->file->f_path.dentry->d_sb,
threadgroup, ctx->ns);
if (ret)
@@ -6669,6 +6671,7 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
/**
* cgroup_css_set_fork - find or create a css_set for a child process
+ * @task: the task to be attached
* @kargs: the arguments passed to create the child process
*
* This functions finds or creates a new css_set which the child
@@ -6683,7 +6686,8 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
* to the target cgroup.
*/
-static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+static int cgroup_css_set_fork(struct task_struct *task,
+ struct kernel_clone_args *kargs)
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
int ret;
@@ -6752,7 +6756,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
* cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
* to always use the caller's credentials.
*/
- ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+ ret = cgroup_attach_permissions(task, cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD),
current->nsproxy->cgroup_ns);
if (ret)
@@ -6824,7 +6828,7 @@ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
struct cgroup_subsys *ss;
int i, j, ret;
- ret = cgroup_css_set_fork(kargs);
+ ret = cgroup_css_set_fork(child, kargs);
if (ret)
return ret;
--
2.47.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 2/4] cgroup: add bpf hook for attach
2026-02-20 0:38 ` [PATCH 2/4] cgroup: add bpf hook for attach Christian Brauner
@ 2026-02-20 15:16 ` Tejun Heo
2026-02-21 17:57 ` Christian Brauner
2026-02-23 15:47 ` Michal Koutný
1 sibling, 1 reply; 28+ messages in thread
From: Tejun Heo @ 2026-02-20 15:16 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
Hello,
On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner wrote:
> Add a hook to manage attaching tasks to cgroup. I'm in the process of
> adding various "universal truth" bpf programs to systemd that will make
> use of this.
>
> This has been a long-standing request (cf. [1] and [2]). It will allow us to
> enforce cgroup migrations and ensure that services can never escape their
> cgroups. This is just one of many use-cases.
From cgroup POV, this looks fine to me but I'm curious whether something
dumber would also work. With CLONE_INTO_CGROUP, cgroup migration isn't
necessary at all. Would something dumber like a mount option disabling
cgroup migrations completely work too or would that be too restrictive?
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/4] cgroup: add bpf hook for attach
2026-02-20 15:16 ` Tejun Heo
@ 2026-02-21 17:57 ` Christian Brauner
0 siblings, 0 replies; 28+ messages in thread
From: Christian Brauner @ 2026-02-21 17:57 UTC (permalink / raw)
To: Tejun Heo
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Fri, Feb 20, 2026 at 05:16:13AM -1000, Tejun Heo wrote:
> Hello,
>
> On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner wrote:
> > Add a hook to manage attaching tasks to cgroup. I'm in the process of
> > adding various "universal truth" bpf programs to systemd that will make
> > use of this.
> >
> > This has been a long-standing request (cf. [1] and [2]). It will allow us to
> > enforce cgroup migrations and ensure that services can never escape their
> > cgroups. This is just one of many use-cases.
>
> >From cgroup POV, this looks fine to me but I'm curious whether something
> dumber would also work. With CLONE_INTO_CGROUP, cgroup migration isn't
> necessary at all. Would something dumber like a mount option disabling
> cgroup migrations completely work too or would that be too restrictive?
It would be too restrictive. I've played with various policies. For
example, a small set of tasks (like PID 1 or the session manager) are
allowed to move processes between cgroups (detectable via e.g., xattrs).
No other task is allowed. But that's already too restrictive because it
fscks over delegated subcgroups where tasks need to be moved around
(container managers etc.). IOW, any policy must be quite modular and
dynamic so a simple mount option wouldn't cover it.
As a sidenote, there would be other mount options that would be useful
but that currently aren't that easy to support/implement because of the
way cgroupfs (for historical reasons ofc) is architected where it shares
a single superblock.
I have a series (from quite some time ago) that makes cgroupfs truly
multi-instance. It would effectively behave just like tmpfs does. A new
mount gets you a new superblock. But once you have that you can e.g.,
simplify cgroup namespaces as well. I've done that work originally to
support idmapped mounts with cgroupfs but I can't find that branch
anymore.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/4] cgroup: add bpf hook for attach
2026-02-20 0:38 ` [PATCH 2/4] cgroup: add bpf hook for attach Christian Brauner
2026-02-20 15:16 ` Tejun Heo
@ 2026-02-23 15:47 ` Michal Koutný
2026-02-27 13:44 ` Christian Brauner
1 sibling, 1 reply; 28+ messages in thread
From: Michal Koutný @ 2026-02-23 15:47 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
[-- Attachment #1: Type: text/plain, Size: 1546 bytes --]
Hi.
On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner <brauner@kernel.org> wrote:
> Add a hook to manage attaching tasks to cgroup. I'm in the process of
> adding various "universal truth" bpf programs to systemd that will make
> use of this.
>
> This has been a long-standing request (cf. [1] and [2]). It will allow us to
> enforce cgroup migrations and ensure that services can never escape their
> cgroups. This is just one of many use-cases.
>
> Link: https://github.com/systemd/systemd/issues/6356 [1]
> Link: https://github.com/systemd/systemd/issues/22874 [2]
These two issues are misconfigured/misunderstood PAM configs. I don't
think those warrant introduction of another permissions mechanism,
furthermore they're relatively old and I estimate many of such configs
must have been fixed in the course of time.
As for services escaping their cgroups -- they needn't run as root, do
they? And if you seek a mechanism how to prevent even root from
migrations, there are cgroupnses for that. (BTW what would prevent a
root detaching/disabling these hook progs anyway?)
I think that the cgroup file permissions are sufficient for many use
cases and this BPF hook is too tempting in unnecessary cases (like
masking other issues).
Could you please expand more about some other reasonable use cases not
covered by those?
(BTW I notice there's already a very similar BPF hook in sched_ext's
cgroup_prep_move. It'd be nicer to have only one generic approach to
these checks.)
Regards,
Michal
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/4] cgroup: add bpf hook for attach
2026-02-23 15:47 ` Michal Koutný
@ 2026-02-27 13:44 ` Christian Brauner
2026-03-09 16:45 ` Michal Koutný
0 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-02-27 13:44 UTC (permalink / raw)
To: Michal Koutný
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
On Mon, Feb 23, 2026 at 04:47:11PM +0100, Michal Koutný wrote:
> Hi.
>
> On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner <brauner@kernel.org> wrote:
> > Add a hook to manage attaching tasks to cgroup. I'm in the process of
> > adding various "universal truth" bpf programs to systemd that will make
> > use of this.
> >
> > This has been a long-standing request (cf. [1] and [2]). It will allow us to
> > enforce cgroup migrations and ensure that services can never escape their
> > cgroups. This is just one of many use-cases.
> >
> > Link: https://github.com/systemd/systemd/issues/6356 [1]
> > Link: https://github.com/systemd/systemd/issues/22874 [2]
>
> These two issues are misconfigured/misunderstood PAM configs. I don't
> think those warrant introduction of another permissions mechanism,
> furthermore they're relatively old and I estimate many of such configs
> must have been fixed in the course of time.
logind has to allow cgroup migrations but for say Docker this shouldn't
be allowed. So calling this misconfiguration is like taking a shortcut
by simply pointing to a different destination. But fine, let's say you
insist on this not being valid.
> As for services escaping their cgroups -- they needn't run as root, do
> they? And if you seek a mechanism how to prevent even root from
> migrations, there are cgroupnses for that. (BTW what would prevent a
A bunch of tools that do cgroup migrations don't use cgroup namespaces
and there's no requirement or way to enforce that they do. Plus, there's
no requirement to only do cgroup management via systemd or its APIs.
Frankly, I can't even blame userspace for not having widely adopted
cgroup namespaces. The implementation atop of a single superblock like
cgroupfs is questionable.
But in general the point is that there's no mechanism to enforce cgroup
tree policy currently in a sufficiently flexible manner.
> root detaching/disabling these hook progs anyway?)
I cannot help but read this as you asking me "What if you're too dumb to
write a security policy that isn't self-defeating?" :)
bpf has security hooks for itself including security_bpf(). First thing
that comes to mind is to have security.bpf.* or trusted.* xattrs on
selected processes like PID 1 that mark it as eligible for modifying BPF
state or BPF LSM programs supervising link/prog detach, update etc and
then designating only PID 1 as handing out those magical xattrs. Can be
as fine-grained as needed and that tells everyone else to go away and do
something else.
There's more fine-grained mechanisms to deal with this. IOW, it's a
solvable problem.
> I think that the cgroup file permissions are sufficient for many use
> cases and this BPF hook is too tempting in unnecessary cases (like
> masking other issues).
> Could you please expand more about some other reasonable use cases not
> covered by those?
systemd will gain the ability to implement policy to control cgroup tree
modifications in as much details as it needs without having the kernel
in need to be aware of it. This can take various forms by marking only
select processes as being eligible for managing cgroup migrations or
even just locking down specific cgroups.
The policy needs to be flexible so it can be live-updated, switched into
auditing mode, and loosened or tightened on-demand as needed.
> (BTW I notice there's already a very similar BPF hook in sched_ext's
> cgroup_prep_move. It'd be nicer to have only one generic approach to
> these checks.)
This feels a bit like a wild goose chase. But fine, I'll look at it.
/me goes off
Ok, let's start with cgroup_can_fork(). The sched ext hook isn't a
generic permission check. It's called way after
cgroup_attach_permissions() and is a per cgroup controller check that is
only called for some cgroup controllers. So effectively useless to pull
up (Notice also, how some controllers like cpuset call additional
security hooks already.).
The same problem applies to writes for cgroup.procs and for subtree
control. The sched ext hooks are per cgroup controller, not generically
called.
And they happen to be called in cgroup_migrate_execute() which is way
deep in the callchain. When cgroup_attach_permissions() fails it's
effectively free. If migrate_execute() fails it must put/free css sets,
it must splice back task on mg_tasks, it must call cancel_attach()
callbacks, thus it must call the sched-ext cancel callbacks for each
already prepped task, it must uncharge pids for each already prepped
task, it needs to unlock a bunch of stuff.
On top of that this looks like a category mistake imho. The callbacks
are a dac-like permission mechanism whereas the hook is actual mac
permission checking. I'm not sure lumping this together with
per-cgroup-controller migration preparations will be very clean. I think
it will end up looking rather confusing. But that's best left to you
cgroup maintainers, I think.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/4] cgroup: add bpf hook for attach
2026-02-27 13:44 ` Christian Brauner
@ 2026-03-09 16:45 ` Michal Koutný
0 siblings, 0 replies; 28+ messages in thread
From: Michal Koutný @ 2026-03-09 16:45 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo, KP Singh, bpf, linux-kernel, cgroups,
Lennart Poettering
Hello.
On Fri, Feb 27, 2026 at 02:44:27PM +0100, Christian Brauner <brauner@kernel.org> wrote:
> So calling this misconfiguration is like taking a shortcut
> by simply pointing to a different destination. But fine, let's say you
> insist on this not being valid.
I understand this in analogy with filesystem organization -- there's the
package manager that ensures files are put in right places,
non-conflicting and trackable. Subtrees may be delegated (e.g.
/usr/local). If root (or whoever has perms for it), decides to
manipulate the files, its up to them what they end up with.
> The implementation atop of a single superblock like cgroupfs is
> questionable.
(This is an interesting topic which I'd like to discuss some other
time not to diverge here.)
> But in general the point is that there's no mechanism to enforce cgroup
> tree policy currently in a sufficiently flexible manner.
>
> > root detaching/disabling these hook progs anyway?)
>
> I cannot help but read this as you asking me "What if you're too dumb to
> write a security policy that isn't self-defeating?" :)
I was just trying to express that there's only one level of root (user).
(Cautionary example are "containers" that executed as (host) root.
Lockdown neglected.)
> bpf has security hooks for itself including security_bpf(). First thing
> that comes to mind is to have security.bpf.* or trusted.* xattrs on
> selected processes like PID 1 that mark it as eligible for modifying BPF
> state or BPF LSM programs supervising link/prog detach, update etc and
> then designating only PID 1 as handing out those magical xattrs. Can be
> as fine-grained as needed and that tells everyone else to go away and do
> something else.
(These are too many new concepts for me, I must skip it now. I may catch
up after more study.)
> systemd will gain the ability to implement policy to control cgroup tree
> modifications in as much details as it needs without having the kernel
> in need to be aware of it. This can take various forms by marking only
> select processes as being eligible for managing cgroup migrations or
> even just locking down specific cgroups.
This is how I understand the goal could be expressed in current terms:
a) allowlisting processes that can do migrations
# common ancestor of all + access to each dst
chown -R :grA $root_cgroup/cgroup.procs
chmod -R g+w $root_cgroup/cgroup.procs
# static:
usermod -G grA user_of_pid
(re)start pid
# or in spawner:
fork
setgroups([grA])
exec
b) rules that are specific to cgroup (subtree)
# applying same like above but to a $lower_group
$ setfacl -R -m g:grB:w $lower_group/cgroup.procs
setfacl: cgroup.procs: Operation not supported
# here I notice why current impl isn't sufficient
Also, if I understand this correctly you seem to move from the semantics
where users (UIDs) are subjects to a different one where it's bound to
processes (PIDs).
> The policy needs to be flexible so it can be live-updated, switched into
> auditing mode, and losened, tightened on-demand as needed.
OK.
(I'd add that policy should be also easily debuggable/troubleshootable.)
> Ok, let's start with cgroup_can_fork(). The sched ext hook isn't a
> generic permission check. It's called way after
> cgroup_attach_permissions() and is a per cgroup controller check that is
> only called for some cgroup controllers. So effectively useless to pull
> up (Notice also, how some controllers like cpuset call additional
> security hooks already.).
There could be one BPF predicate (on the cgroup level) and potentially
pass per-controller data, so that function could employ (or not) those.
It's true that semantics would be a bit different because of implicit
migrations happening with controller en-/disablement.
What I don't like about the multiple hooks is that there'd be several
places to check when one is trying to figure out why a migration failed.
> On top of that this looks like a category mistake imho. The callbacks
> are a dac-like permission mechanism whereas the hooks is actual mac
> permission checking. I'm not sure lumping this together with
> per-cgroup-controller migration preparations will be very clean. I think
> it will end up looking rather confusing. But that's best left to you
> cgroup maintainers, I think.
This paragraph hinted me at (yet) another mechanism in the kernel (and
you also mentioned it with cpuset) -- the LSM hooks. Namely, if this was
security_cgroup_attach() hook, the logic could be expressed with other
existing modules, IIUC, one of them is BPF. Would that fulfil the
behaviors you're missing?
(I'm proposing this as potentially less confusing/known "evil" approach
to the scenarios considered above.)
Thanks,
Michal
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 3/4] selftests/bpf: add ns hook selftest
2026-02-20 0:38 [PATCH 0/4] bpf: add a few hooks for sandboxing Christian Brauner
2026-02-20 0:38 ` [PATCH 1/4] ns: add bpf hooks Christian Brauner
2026-02-20 0:38 ` [PATCH 2/4] cgroup: add bpf hook for attach Christian Brauner
@ 2026-02-20 0:38 ` Christian Brauner
2026-03-05 17:36 ` Alan Maguire
2026-02-20 0:38 ` [PATCH 4/4] selftests/bpf: add cgroup attach selftests Christian Brauner
3 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-02-20 0:38 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo
Cc: KP Singh, bpf, linux-kernel, cgroups, Lennart Poettering,
Christian Brauner
Add a BPF LSM selftest that implements a "lock on entry" namespace
sandbox policy.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
.../testing/selftests/bpf/prog_tests/ns_sandbox.c | 99 ++++++++++++++++++++++
.../testing/selftests/bpf/progs/test_ns_sandbox.c | 91 ++++++++++++++++++++
2 files changed, 190 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/ns_sandbox.c b/tools/testing/selftests/bpf/prog_tests/ns_sandbox.c
new file mode 100644
index 000000000000..0ac2acfb6365
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/ns_sandbox.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+
+/*
+ * Test BPF LSM namespace sandbox: once you enter, you stay.
+ *
+ * The parent creates a tracked namespace, then forks a child.
+ * The child enters the tracked namespace (allowed) and is then locked
+ * out of any further setns().
+ */
+
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include "test_ns_sandbox.skel.h"
+
+void test_ns_sandbox(void)
+{
+ int orig_utsns = -1, new_utsns = -1;
+ struct test_ns_sandbox *skel = NULL;
+ int err, status;
+ pid_t child;
+
+ /* Save FD to current (host) namespace */
+ orig_utsns = open("/proc/self/ns/uts", O_RDONLY);
+ if (!ASSERT_OK_FD(orig_utsns, "open orig utsns"))
+ goto close_fds;
+
+ skel = test_ns_sandbox__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
+ goto close_fds;
+
+ err = test_ns_sandbox__attach(skel);
+ if (!ASSERT_OK(err, "skel attach"))
+ goto destroy;
+
+ skel->bss->monitor_pid = getpid();
+
+ /*
+ * Create a sandbox namespace. The alloc hook records its
+ * inum because this task's pid matches monitor_pid.
+ */
+ err = unshare(CLONE_NEWUTS);
+ if (!ASSERT_OK(err, "unshare sandbox"))
+ goto destroy;
+
+ new_utsns = open("/proc/self/ns/uts", O_RDONLY);
+ if (!ASSERT_OK_FD(new_utsns, "open sandbox utsns"))
+ goto restore;
+
+ /*
+ * Return parent to host namespace. The host namespace is not
+ * in the map so the install hook lets us through.
+ */
+ err = setns(orig_utsns, CLONE_NEWUTS);
+ if (!ASSERT_OK(err, "parent setns host utsns"))
+ goto restore;
+
+ /*
+ * Fork a child that:
+ * 1. Enters the sandbox UTS namespace — succeeds and locks it.
+ * 2. Tries to switch to host UTS — denied (locked).
+ */
+ child = fork();
+ if (child == 0) {
+ /* Enter tracked namespace — allowed, we get locked */
+ if (setns(new_utsns, CLONE_NEWUTS) != 0)
+ _exit(1);
+
+ /* Locked: switching to host must fail */
+ if (setns(orig_utsns, CLONE_NEWUTS) != -1 ||
+ errno != EPERM)
+ _exit(2);
+
+ _exit(0);
+ }
+ if (!ASSERT_GE(child, 0, "fork child"))
+ goto restore;
+
+ err = waitpid(child, &status, 0);
+ ASSERT_GT(err, 0, "waitpid child");
+ ASSERT_TRUE(WIFEXITED(status), "child exited");
+ ASSERT_EQ(WEXITSTATUS(status), 0, "child locked in");
+
+ goto destroy;
+
+restore:
+ setns(orig_utsns, CLONE_NEWUTS);
+destroy:
+ test_ns_sandbox__destroy(skel);
+close_fds:
+ if (new_utsns >= 0)
+ close(new_utsns);
+ if (orig_utsns >= 0)
+ close(orig_utsns);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_ns_sandbox.c b/tools/testing/selftests/bpf/progs/test_ns_sandbox.c
new file mode 100644
index 000000000000..75c3493932a1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ns_sandbox.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+
+/*
+ * BPF LSM namespace sandbox: once you enter, you stay.
+ *
+ * A designated process creates namespaces (tracked via alloc). When
+ * any other process joins one of those namespaces it gets recorded in
+ * locked_tasks. From that point on that process cannot setns() into
+ * any other namespace — it is locked in. Task local storage is
+ * automatically freed when the task exits.
+ */
+
+#include "vmlinux.h"
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/*
+ * Namespaces created by the monitored process.
+ * Key: namespace inode number.
+ * Value: namespace type (CLONE_NEW* flag).
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 64);
+ __type(key, __u32);
+ __type(value, __u32);
+} known_namespaces SEC(".maps");
+
+/* PID of the process whose namespace creations are tracked. */
+int monitor_pid;
+
+/*
+ * Task local storage: marks tasks that have entered a tracked namespace
+ * and are now locked.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, __u8);
+} locked_tasks SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+/* Only the monitored process's namespace creations are tracked. */
+SEC("lsm.s/namespace_alloc")
+int BPF_PROG(ns_alloc, struct ns_common *ns)
+{
+ __u32 inum, ns_type;
+
+ if ((bpf_get_current_pid_tgid() >> 32) != monitor_pid)
+ return 0;
+
+ inum = ns->inum;
+ ns_type = ns->ns_type;
+ bpf_map_update_elem(&known_namespaces, &inum, &ns_type, BPF_ANY);
+
+ return 0;
+}
+
+/*
+ * Enforce the lock-in policy for all tasks:
+ * - Already locked? Deny any setns.
+ * - Entering a tracked namespace? Lock the task and allow.
+ * - Everything else passes through.
+ */
+SEC("lsm.s/namespace_install")
+int BPF_PROG(ns_install, struct nsset *nsset, struct ns_common *ns)
+{
+ struct task_struct *task = bpf_get_current_task_btf();
+ __u32 inum = ns->inum;
+
+ if (bpf_task_storage_get(&locked_tasks, task, 0, 0))
+ return -EPERM;
+
+ if (bpf_map_lookup_elem(&known_namespaces, &inum))
+ bpf_task_storage_get(&locked_tasks, task, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+
+ return 0;
+}
+
+SEC("lsm/namespace_free")
+void BPF_PROG(ns_free, struct ns_common *ns)
+{
+ __u32 inum = ns->inum;
+
+ bpf_map_delete_elem(&known_namespaces, &inum);
+}
--
2.47.3
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [PATCH 3/4] selftests/bpf: add ns hook selftest
2026-02-20 0:38 ` [PATCH 3/4] selftests/bpf: add ns hook selftest Christian Brauner
@ 2026-03-05 17:36 ` Alan Maguire
0 siblings, 0 replies; 28+ messages in thread
From: Alan Maguire @ 2026-03-05 17:36 UTC (permalink / raw)
To: Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Martin KaFai Lau, Tejun Heo
Cc: KP Singh, bpf, linux-kernel, cgroups, Lennart Poettering
On 20/02/2026 00:38, Christian Brauner wrote:
> Add a BPF LSM selftest that implements a "lock on entry" namespace
> sandbox policy.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
one small thing below...
> ---
> .../testing/selftests/bpf/prog_tests/ns_sandbox.c | 99 ++++++++++++++++++++++
> .../testing/selftests/bpf/progs/test_ns_sandbox.c | 91 ++++++++++++++++++++
> 2 files changed, 190 insertions(+)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/ns_sandbox.c b/tools/testing/selftests/bpf/prog_tests/ns_sandbox.c
> new file mode 100644
> index 000000000000..0ac2acfb6365
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/ns_sandbox.c
> @@ -0,0 +1,99 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
> +
> +/*
> + * Test BPF LSM namespace sandbox: once you enter, you stay.
> + *
> + * The parent creates a tracked namespace, then forks a child.
> + * The child enters the tracked namespace (allowed) and is then locked
> + * out of any further setns().
> + */
> +
> +#define _GNU_SOURCE
> +#include <test_progs.h>
> +#include <sched.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include <sys/wait.h>
> +#include "test_ns_sandbox.skel.h"
> +
> +void test_ns_sandbox(void)
> +{
> + int orig_utsns = -1, new_utsns = -1;
> + struct test_ns_sandbox *skel = NULL;
> + int err, status;
> + pid_t child;
> +
> + /* Save FD to current (host) namespace */
> + orig_utsns = open("/proc/self/ns/uts", O_RDONLY);
> + if (!ASSERT_OK_FD(orig_utsns, "open orig utsns"))
> + goto close_fds;
> +
> + skel = test_ns_sandbox__open_and_load();
> + if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
> + goto close_fds;
> +
> + err = test_ns_sandbox__attach(skel);
> + if (!ASSERT_OK(err, "skel attach"))
> + goto destroy;
> +
> + skel->bss->monitor_pid = getpid();
> +
> + /*
> + * Create a sandbox namespace. The alloc hook records its
> + * inum because this task's pid matches monitor_pid.
> + */
> + err = unshare(CLONE_NEWUTS);
> + if (!ASSERT_OK(err, "unshare sandbox"))
> + goto destroy;
> +
> + new_utsns = open("/proc/self/ns/uts", O_RDONLY);
> + if (!ASSERT_OK_FD(new_utsns, "open sandbox utsns"))
> + goto restore;
> +
> + /*
> + * Return parent to host namespace. The host namespace is not
> + * in the map so the install hook lets us through.
> + */
> + err = setns(orig_utsns, CLONE_NEWUTS);
> + if (!ASSERT_OK(err, "parent setns host utsns"))
> + goto restore;
> +
> + /*
> + * Fork a child that:
> + * 1. Enters the sandbox UTS namespace — succeeds and locks it.
> + * 2. Tries to switch to host UTS — denied (locked).
> + */
> + child = fork();
> + if (child == 0) {
> + /* Enter tracked namespace — allowed, we get locked */
> + if (setns(new_utsns, CLONE_NEWUTS) != 0)
> + _exit(1);
> +
> + /* Locked: switching to host must fail */
> + if (setns(orig_utsns, CLONE_NEWUTS) != -1 ||
> + errno != EPERM)
> + _exit(2);
> +
> + _exit(0);
> + }
> + if (!ASSERT_GE(child, 0, "fork child"))
should be ASSERT_GT() I think since we deal with the child == 0 path above.
> + goto restore;
> +
> + err = waitpid(child, &status, 0);
> + ASSERT_GT(err, 0, "waitpid child");
> + ASSERT_TRUE(WIFEXITED(status), "child exited");
> + ASSERT_EQ(WEXITSTATUS(status), 0, "child locked in");
> +
> + goto destroy;
> +
> +restore:
> + setns(orig_utsns, CLONE_NEWUTS);
> +destroy:
> + test_ns_sandbox__destroy(skel);
> +close_fds:
> + if (new_utsns >= 0)
> + close(new_utsns);
> + if (orig_utsns >= 0)
> + close(orig_utsns);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_ns_sandbox.c b/tools/testing/selftests/bpf/progs/test_ns_sandbox.c
> new file mode 100644
> index 000000000000..75c3493932a1
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_ns_sandbox.c
> @@ -0,0 +1,91 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
> +
> +/*
> + * BPF LSM namespace sandbox: once you enter, you stay.
> + *
> + * A designated process creates namespaces (tracked via alloc). When
> + * any other process joins one of those namespaces it gets recorded in
> + * locked_tasks. From that point on that process cannot setns() into
> + * any other namespace — it is locked in. Task local storage is
> + * automatically freed when the task exits.
> + */
> +
> +#include "vmlinux.h"
> +#include <errno.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +/*
> + * Namespaces created by the monitored process.
> + * Key: namespace inode number.
> + * Value: namespace type (CLONE_NEW* flag).
> + */
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __uint(max_entries, 64);
> + __type(key, __u32);
> + __type(value, __u32);
> +} known_namespaces SEC(".maps");
> +
> +/* PID of the process whose namespace creations are tracked. */
> +int monitor_pid;
> +
> +/*
> + * Task local storage: marks tasks that have entered a tracked namespace
> + * and are now locked.
> + */
> +struct {
> + __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> + __uint(map_flags, BPF_F_NO_PREALLOC);
> + __type(key, int);
> + __type(value, __u8);
> +} locked_tasks SEC(".maps");
> +
> +char _license[] SEC("license") = "GPL";
> +
> +/* Only the monitored process's namespace creations are tracked. */
> +SEC("lsm.s/namespace_alloc")
> +int BPF_PROG(ns_alloc, struct ns_common *ns)
> +{
> + __u32 inum, ns_type;
> +
> + if ((bpf_get_current_pid_tgid() >> 32) != monitor_pid)
> + return 0;
> +
> + inum = ns->inum;
> + ns_type = ns->ns_type;
> + bpf_map_update_elem(&known_namespaces, &inum, &ns_type, BPF_ANY);
> +
> + return 0;
> +}
> +
> +/*
> + * Enforce the lock-in policy for all tasks:
> + * - Already locked? Deny any setns.
> + * - Entering a tracked namespace? Lock the task and allow.
> + * - Everything else passes through.
> + */
> +SEC("lsm.s/namespace_install")
> +int BPF_PROG(ns_install, struct nsset *nsset, struct ns_common *ns)
> +{
> + struct task_struct *task = bpf_get_current_task_btf();
> + __u32 inum = ns->inum;
> +
> + if (bpf_task_storage_get(&locked_tasks, task, 0, 0))
> + return -EPERM;
> +
> + if (bpf_map_lookup_elem(&known_namespaces, &inum))
> + bpf_task_storage_get(&locked_tasks, task, 0,
> + BPF_LOCAL_STORAGE_GET_F_CREATE);
> +
> + return 0;
> +}
> +
> +SEC("lsm/namespace_free")
> +void BPF_PROG(ns_free, struct ns_common *ns)
> +{
> + __u32 inum = ns->inum;
> +
> + bpf_map_delete_elem(&known_namespaces, &inum);
> +}
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 4/4] selftests/bpf: add cgroup attach selftests
2026-02-20 0:38 [PATCH 0/4] bpf: add a few hooks for sandboxing Christian Brauner
` (2 preceding siblings ...)
2026-02-20 0:38 ` [PATCH 3/4] selftests/bpf: add ns hook selftest Christian Brauner
@ 2026-02-20 0:38 ` Christian Brauner
2026-03-05 17:43 ` Alan Maguire
3 siblings, 1 reply; 28+ messages in thread
From: Christian Brauner @ 2026-02-20 0:38 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Tejun Heo
Cc: KP Singh, bpf, linux-kernel, cgroups, Lennart Poettering,
Christian Brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
.../selftests/bpf/prog_tests/cgroup_attach.c | 362 +++++++++++++++++++++
.../selftests/bpf/progs/test_cgroup_attach.c | 85 +++++
2 files changed, 447 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach.c
new file mode 100644
index 000000000000..05addf93af46
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+
+/*
+ * Test the bpf_lsm_cgroup_attach hook.
+ *
+ * Verifies that a BPF LSM program can supervise cgroup migration
+ * through both the cgroup.procs write path and the clone3 +
+ * CLONE_INTO_CGROUP path.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "test_cgroup_attach.skel.h"
+
+/* Must match the definition in progs/test_cgroup_attach.c */
+struct attach_event {
+ __u32 task_pid;
+ __u64 src_cgrp_id;
+ __u64 dst_cgrp_id;
+ __u8 threadgroup;
+ __u32 hook_count;
+};
+
+#ifndef CLONE_INTO_CGROUP
+#define CLONE_INTO_CGROUP 0x200000000ULL
+#endif
+
+#ifndef __NR_clone3
+#define __NR_clone3 435
+#endif
+
+struct __clone_args {
+ __aligned_u64 flags;
+ __aligned_u64 pidfd;
+ __aligned_u64 child_tid;
+ __aligned_u64 parent_tid;
+ __aligned_u64 exit_signal;
+ __aligned_u64 stack;
+ __aligned_u64 stack_size;
+ __aligned_u64 tls;
+ __aligned_u64 set_tid;
+ __aligned_u64 set_tid_size;
+ __aligned_u64 cgroup;
+};
+
+static pid_t do_clone3(int cgroup_fd)
+{
+ struct __clone_args args = {
+ .flags = CLONE_INTO_CGROUP,
+ .exit_signal = SIGCHLD,
+ .cgroup = cgroup_fd,
+ };
+
+ return syscall(__NR_clone3, &args, sizeof(args));
+}
+
+/*
+ * Subtest: deny_migration
+ *
+ * Verify that the BPF hook can deny cgroup migration through cgroup.procs
+ * and that detaching the BPF program removes enforcement.
+ */
+static void test_deny_migration(void)
+{
+ struct test_cgroup_attach *skel = NULL;
+ int allowed_fd = -1, denied_fd = -1;
+ unsigned long long denied_cgid;
+ int err, status;
+ __u64 key;
+ __u8 val = 1;
+ pid_t child;
+
+ if (!ASSERT_OK(setup_cgroup_environment(), "setup_cgroup_env"))
+ return;
+
+ allowed_fd = create_and_get_cgroup("/allowed");
+ if (!ASSERT_GE(allowed_fd, 0, "create /allowed"))
+ goto cleanup;
+
+ denied_fd = create_and_get_cgroup("/denied");
+ if (!ASSERT_GE(denied_fd, 0, "create /denied"))
+ goto cleanup;
+
+ skel = test_cgroup_attach__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
+ goto cleanup;
+
+ err = test_cgroup_attach__attach(skel);
+ if (!ASSERT_OK(err, "skel attach"))
+ goto cleanup;
+
+ skel->bss->monitored_pid = getpid();
+
+ denied_cgid = get_cgroup_id("/denied");
+ if (!ASSERT_NEQ(denied_cgid, 0ULL, "get denied cgroup id"))
+ goto cleanup;
+
+ key = denied_cgid;
+ err = bpf_map__update_elem(skel->maps.denied_cgroups,
+ &key, sizeof(key), &val, sizeof(val), 0);
+ if (!ASSERT_OK(err, "add denied cgroup"))
+ goto cleanup;
+
+ /*
+ * Forked children must use join_parent_cgroup() because the
+ * cgroup workdir was created under the parent's PID and
+ * join_cgroup() constructs paths using getpid().
+ */
+
+ /* Child migrating to /allowed should succeed */
+ child = fork();
+ if (!ASSERT_GE(child, 0, "fork child allowed"))
+ goto cleanup;
+ if (child == 0) {
+ if (join_parent_cgroup("/allowed"))
+ _exit(1);
+ _exit(0);
+ }
+ err = waitpid(child, &status, 0);
+ ASSERT_GT(err, 0, "waitpid allowed");
+ ASSERT_TRUE(WIFEXITED(status), "allowed child exited");
+ ASSERT_EQ(WEXITSTATUS(status), 0, "allowed migration succeeds");
+
+ /* Child migrating to /denied should fail */
+ child = fork();
+ if (!ASSERT_GE(child, 0, "fork child denied"))
+ goto cleanup;
+ if (child == 0) {
+ if (join_parent_cgroup("/denied") == 0)
+ _exit(1); /* Should have failed */
+ if (errno != EPERM)
+ _exit(2); /* Wrong errno */
+ _exit(0);
+ }
+ err = waitpid(child, &status, 0);
+ ASSERT_GT(err, 0, "waitpid denied");
+ ASSERT_TRUE(WIFEXITED(status), "denied child exited");
+ ASSERT_EQ(WEXITSTATUS(status), 0, "denied migration blocked");
+
+ /* Detach BPF — /denied should now be accessible */
+ test_cgroup_attach__detach(skel);
+
+ child = fork();
+ if (!ASSERT_GE(child, 0, "fork child post-detach"))
+ goto cleanup;
+ if (child == 0) {
+ if (join_parent_cgroup("/denied"))
+ _exit(1);
+ _exit(0);
+ }
+ err = waitpid(child, &status, 0);
+ ASSERT_GT(err, 0, "waitpid post-detach");
+ ASSERT_TRUE(WIFEXITED(status), "post-detach child exited");
+ ASSERT_EQ(WEXITSTATUS(status), 0, "post-detach migration free");
+
+cleanup:
+ if (skel)
+ test_cgroup_attach__destroy(skel);
+ if (allowed_fd >= 0)
+ close(allowed_fd);
+ if (denied_fd >= 0)
+ close(denied_fd);
+ cleanup_cgroup_environment();
+}
+
+/*
+ * Subtest: verify_hook_args
+ *
+ * Verify that the hook receives correct src_cgrp, dst_cgrp, task pid,
+ * and threadgroup values.
+ */
+static void test_verify_hook_args(void)
+{
+ struct test_cgroup_attach *skel = NULL;
+ struct attach_event evt = {};
+ unsigned long long src_cgid, dst_cgid;
+ int src_fd = -1, dst_fd = -1;
+ __u32 map_key = 0;
+ char pid_str[32];
+ int err;
+
+ if (!ASSERT_OK(setup_cgroup_environment(), "setup_cgroup_env"))
+ return;
+
+ src_fd = create_and_get_cgroup("/src");
+ if (!ASSERT_GE(src_fd, 0, "create /src"))
+ goto cleanup;
+
+ dst_fd = create_and_get_cgroup("/dst");
+ if (!ASSERT_GE(dst_fd, 0, "create /dst"))
+ goto cleanup;
+
+ /* Move ourselves to /src first */
+ if (!ASSERT_OK(join_cgroup("/src"), "join /src"))
+ goto cleanup;
+
+ skel = test_cgroup_attach__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
+ goto cleanup;
+
+ err = test_cgroup_attach__attach(skel);
+ if (!ASSERT_OK(err, "skel attach"))
+ goto cleanup;
+
+ skel->bss->monitored_pid = getpid();
+
+ src_cgid = get_cgroup_id("/src");
+ dst_cgid = get_cgroup_id("/dst");
+ if (!ASSERT_NEQ(src_cgid, 0ULL, "get src cgroup id"))
+ goto cleanup;
+ if (!ASSERT_NEQ(dst_cgid, 0ULL, "get dst cgroup id"))
+ goto cleanup;
+
+ /* Migrate self to /dst via cgroup.procs (threadgroup=true) */
+ snprintf(pid_str, sizeof(pid_str), "%d", getpid());
+ if (!ASSERT_OK(write_cgroup_file("/dst", "cgroup.procs", pid_str),
+ "migrate to /dst"))
+ goto cleanup;
+
+ /* Read the recorded event */
+ err = bpf_map__lookup_elem(skel->maps.last_event,
+ &map_key, sizeof(map_key),
+ &evt, sizeof(evt), 0);
+ if (!ASSERT_OK(err, "read last_event"))
+ goto cleanup;
+
+ ASSERT_EQ(evt.src_cgrp_id, src_cgid, "src_cgrp_id matches");
+ ASSERT_EQ(evt.dst_cgrp_id, dst_cgid, "dst_cgrp_id matches");
+ ASSERT_EQ(evt.task_pid, (__u32)getpid(), "task_pid matches");
+ ASSERT_EQ(evt.threadgroup, 1, "threadgroup is true for cgroup.procs");
+ ASSERT_GE(evt.hook_count, (__u32)1, "hook fired at least once");
+
+cleanup:
+ if (skel)
+ test_cgroup_attach__destroy(skel);
+ if (src_fd >= 0)
+ close(src_fd);
+ if (dst_fd >= 0)
+ close(dst_fd);
+ cleanup_cgroup_environment();
+}
+
+/*
+ * Subtest: clone_into_cgroup
+ *
+ * Verify the hook fires on the clone3(CLONE_INTO_CGROUP) path and can
+ * deny spawning a child directly into a cgroup.
+ */
+static void test_clone_into_cgroup(void)
+{
+ struct test_cgroup_attach *skel = NULL;
+ int allowed_fd = -1, denied_fd = -1;
+ unsigned long long denied_cgid, allowed_cgid;
+ struct attach_event evt = {};
+ __u32 map_key = 0;
+ __u64 key;
+ __u8 val = 1;
+ int err, status;
+ pid_t child;
+
+ if (!ASSERT_OK(setup_cgroup_environment(), "setup_cgroup_env"))
+ return;
+
+ allowed_fd = create_and_get_cgroup("/clone_allowed");
+ if (!ASSERT_GE(allowed_fd, 0, "create /clone_allowed"))
+ goto cleanup;
+
+ denied_fd = create_and_get_cgroup("/clone_denied");
+ if (!ASSERT_GE(denied_fd, 0, "create /clone_denied"))
+ goto cleanup;
+
+ skel = test_cgroup_attach__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
+ goto cleanup;
+
+ err = test_cgroup_attach__attach(skel);
+ if (!ASSERT_OK(err, "skel attach"))
+ goto cleanup;
+
+ skel->bss->monitored_pid = getpid();
+
+ denied_cgid = get_cgroup_id("/clone_denied");
+ allowed_cgid = get_cgroup_id("/clone_allowed");
+ if (!ASSERT_NEQ(denied_cgid, 0ULL, "get denied cgroup id"))
+ goto cleanup;
+ if (!ASSERT_NEQ(allowed_cgid, 0ULL, "get allowed cgroup id"))
+ goto cleanup;
+
+ key = denied_cgid;
+ err = bpf_map__update_elem(skel->maps.denied_cgroups,
+ &key, sizeof(key), &val, sizeof(val), 0);
+ if (!ASSERT_OK(err, "add denied cgroup"))
+ goto cleanup;
+
+ /* clone3 into denied cgroup should fail */
+ child = do_clone3(denied_fd);
+ if (child >= 0) {
+ waitpid(child, NULL, 0);
+ ASSERT_LT(child, 0, "clone3 into denied should fail");
+ goto cleanup;
+ }
+ if (errno == ENOSYS || errno == E2BIG) {
+ test__skip();
+ goto cleanup;
+ }
+ ASSERT_EQ(errno, EPERM, "clone3 denied errno");
+
+ /* clone3 into allowed cgroup should succeed */
+ child = do_clone3(allowed_fd);
+ if (!ASSERT_GE(child, 0, "clone3 into allowed"))
+ goto cleanup;
+ if (child == 0)
+ _exit(0);
+
+ err = waitpid(child, &status, 0);
+ ASSERT_GT(err, 0, "waitpid clone3 allowed");
+ ASSERT_TRUE(WIFEXITED(status), "clone3 child exited");
+ ASSERT_EQ(WEXITSTATUS(status), 0, "clone3 child ok");
+
+ /* Verify the hook recorded the allowed clone */
+ err = bpf_map__lookup_elem(skel->maps.last_event,
+ &map_key, sizeof(map_key),
+ &evt, sizeof(evt), 0);
+ if (!ASSERT_OK(err, "read last_event"))
+ goto cleanup;
+
+ ASSERT_EQ(evt.dst_cgrp_id, allowed_cgid, "clone3 dst_cgrp_id");
+
+cleanup:
+ if (skel)
+ test_cgroup_attach__destroy(skel);
+ if (allowed_fd >= 0)
+ close(allowed_fd);
+ if (denied_fd >= 0)
+ close(denied_fd);
+ cleanup_cgroup_environment();
+}
+
+void test_cgroup_attach(void)
+{
+ if (test__start_subtest("deny_migration"))
+ test_deny_migration();
+ if (test__start_subtest("verify_hook_args"))
+ test_verify_hook_args();
+ if (test__start_subtest("clone_into_cgroup"))
+ test_clone_into_cgroup();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_cgroup_attach.c b/tools/testing/selftests/bpf/progs/test_cgroup_attach.c
new file mode 100644
index 000000000000..90915d1d7d64
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cgroup_attach.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+
+/*
+ * BPF LSM cgroup attach policy: supervise cgroup migration.
+ *
+ * A designated process populates a denied_cgroups map with cgroup IDs
+ * that should reject migration. The cgroup_attach hook checks every
+ * migration and returns -EPERM when the destination cgroup is denied.
+ * It also records the last hook invocation into last_event for the
+ * userspace test to verify arguments.
+ */
+
+#include "vmlinux.h"
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+struct attach_event {
+ __u32 task_pid;
+ __u64 src_cgrp_id;
+ __u64 dst_cgrp_id;
+ __u8 threadgroup;
+ __u32 hook_count;
+};
+
+/*
+ * Cgroups that should reject migration.
+ * Key: cgroup kn->id (u64).
+ * Value: unused marker.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 16);
+ __type(key, __u64);
+ __type(value, __u8);
+} denied_cgroups SEC(".maps");
+
+/*
+ * Record the last hook invocation for argument verification.
+ * Key: 0.
+ * Value: struct attach_event.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, struct attach_event);
+} last_event SEC(".maps");
+
+__u32 monitored_pid;
+
+char _license[] SEC("license") = "GPL";
+
+SEC("lsm.s/cgroup_attach")
+int BPF_PROG(cgroup_attach, struct task_struct *task,
+ struct cgroup *src_cgrp, struct cgroup *dst_cgrp,
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
+{
+ struct task_struct *current = bpf_get_current_task_btf();
+ struct attach_event *evt;
+ __u64 dst_id;
+ __u32 key = 0;
+
+ dst_id = BPF_CORE_READ(dst_cgrp, kn, id);
+
+ if (bpf_map_lookup_elem(&denied_cgroups, &dst_id))
+ return -EPERM;
+
+ if (!monitored_pid || current->tgid != monitored_pid)
+ return 0;
+
+ evt = bpf_map_lookup_elem(&last_event, &key);
+ if (evt) {
+ evt->task_pid = task->pid;
+ evt->src_cgrp_id = BPF_CORE_READ(src_cgrp, kn, id);
+ evt->dst_cgrp_id = dst_id;
+ evt->threadgroup = threadgroup ? 1 : 0;
+ evt->hook_count++;
+ }
+
+ return 0;
+}
--
2.47.3
^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/4] selftests/bpf: add cgroup attach selftests
2026-02-20 0:38 ` [PATCH 4/4] selftests/bpf: add cgroup attach selftests Christian Brauner
@ 2026-03-05 17:43 ` Alan Maguire
0 siblings, 0 replies; 28+ messages in thread
From: Alan Maguire @ 2026-03-05 17:43 UTC (permalink / raw)
To: Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Martin KaFai Lau, Tejun Heo
Cc: KP Singh, bpf, linux-kernel, cgroups, Lennart Poettering
On 20/02/2026 00:38, Christian Brauner wrote:
> Signed-off-by: Christian Brauner <brauner@kernel.org>
one optional suggestion below, but
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
> ---
> .../selftests/bpf/prog_tests/cgroup_attach.c | 362 +++++++++++++++++++++
> .../selftests/bpf/progs/test_cgroup_attach.c | 85 +++++
> 2 files changed, 447 insertions(+)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach.c
> new file mode 100644
> index 000000000000..05addf93af46
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach.c
> @@ -0,0 +1,362 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
> +
> +/*
> + * Test the bpf_lsm_cgroup_attach hook.
> + *
> + * Verifies that a BPF LSM program can supervise cgroup migration
> + * through both the cgroup.procs write path and the clone3 +
> + * CLONE_INTO_CGROUP path.
> + */
> +
> +#define _GNU_SOURCE
> +#include <errno.h>
> +#include <fcntl.h>
> +#include <linux/sched.h>
> +#include <linux/types.h>
> +#include <sched.h>
> +#include <signal.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <syscall.h>
> +#include <sys/wait.h>
> +#include <unistd.h>
> +
> +#include <test_progs.h>
> +#include "cgroup_helpers.h"
> +#include "test_cgroup_attach.skel.h"
> +
> +/* Must match the definition in progs/test_cgroup_attach.c */
> +struct attach_event {
> + __u32 task_pid;
> + __u64 src_cgrp_id;
> + __u64 dst_cgrp_id;
> + __u8 threadgroup;
> + __u32 hook_count;
> +};
> +
> +#ifndef CLONE_INTO_CGROUP
> +#define CLONE_INTO_CGROUP 0x200000000ULL
> +#endif
> +
> +#ifndef __NR_clone3
> +#define __NR_clone3 435
> +#endif
> +
> +struct __clone_args {
> + __aligned_u64 flags;
> + __aligned_u64 pidfd;
> + __aligned_u64 child_tid;
> + __aligned_u64 parent_tid;
> + __aligned_u64 exit_signal;
> + __aligned_u64 stack;
> + __aligned_u64 stack_size;
> + __aligned_u64 tls;
> + __aligned_u64 set_tid;
> + __aligned_u64 set_tid_size;
> + __aligned_u64 cgroup;
> +};
> +
> +static pid_t do_clone3(int cgroup_fd)
> +{
> + struct __clone_args args = {
> + .flags = CLONE_INTO_CGROUP,
> + .exit_signal = SIGCHLD,
> + .cgroup = cgroup_fd,
> + };
> +
> + return syscall(__NR_clone3, &args, sizeof(args));
> +}
> +
> +/*
> + * Subtest: deny_migration
> + *
> + * Verify that the BPF hook can deny cgroup migration through cgroup.procs
> + * and that detaching the BPF program removes enforcement.
> + */
> +static void test_deny_migration(void)
> +{
> + struct test_cgroup_attach *skel = NULL;
> + int allowed_fd = -1, denied_fd = -1;
> + unsigned long long denied_cgid;
> + int err, status;
> + __u64 key;
> + __u8 val = 1;
> + pid_t child;
> +
> + if (!ASSERT_OK(setup_cgroup_environment(), "setup_cgroup_env"))
> + return;
> +
> + allowed_fd = create_and_get_cgroup("/allowed");
> + if (!ASSERT_GE(allowed_fd, 0, "create /allowed"))
> + goto cleanup;
> +
> + denied_fd = create_and_get_cgroup("/denied");
> + if (!ASSERT_GE(denied_fd, 0, "create /denied"))
> + goto cleanup;
> +
> + skel = test_cgroup_attach__open_and_load();
> + if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
> + goto cleanup;
> +
> + err = test_cgroup_attach__attach(skel);
> + if (!ASSERT_OK(err, "skel attach"))
> + goto cleanup;
> +
> + skel->bss->monitored_pid = getpid();
> +
> + denied_cgid = get_cgroup_id("/denied");
> + if (!ASSERT_NEQ(denied_cgid, 0ULL, "get denied cgroup id"))
> + goto cleanup;
> +
> + key = denied_cgid;
> + err = bpf_map__update_elem(skel->maps.denied_cgroups,
> + &key, sizeof(key), &val, sizeof(val), 0);
> + if (!ASSERT_OK(err, "add denied cgroup"))
> + goto cleanup;
> +
> + /*
> + * Forked children must use join_parent_cgroup() because the
> + * cgroup workdir was created under the parent's PID and
> + * join_cgroup() constructs paths using getpid().
> + */
> +
> + /* Child migrating to /allowed should succeed */
> + child = fork();
> + if (!ASSERT_GE(child, 0, "fork child allowed"))
> + goto cleanup;
> + if (child == 0) {
> + if (join_parent_cgroup("/allowed"))
> + _exit(1);
> + _exit(0);
> + }
> + err = waitpid(child, &status, 0);
> + ASSERT_GT(err, 0, "waitpid allowed");
> + ASSERT_TRUE(WIFEXITED(status), "allowed child exited");
> + ASSERT_EQ(WEXITSTATUS(status), 0, "allowed migration succeeds");
> +
> + /* Child migrating to /denied should fail */
> + child = fork();
> + if (!ASSERT_GE(child, 0, "fork child denied"))
> + goto cleanup;
> + if (child == 0) {
> + if (join_parent_cgroup("/denied") == 0)
> + _exit(1); /* Should have failed */
> + if (errno != EPERM)
> + _exit(2); /* Wrong errno */
> + _exit(0);
> + }
> + err = waitpid(child, &status, 0);
> + ASSERT_GT(err, 0, "waitpid denied");
> + ASSERT_TRUE(WIFEXITED(status), "denied child exited");
> + ASSERT_EQ(WEXITSTATUS(status), 0, "denied migration blocked");
> +
> + /* Detach BPF — /denied should now be accessible */
> + test_cgroup_attach__detach(skel);
> +
> + child = fork();
> + if (!ASSERT_GE(child, 0, "fork child post-detach"))
> + goto cleanup;
> + if (child == 0) {
> + if (join_parent_cgroup("/denied"))
> + _exit(1);
> + _exit(0);
> + }
> + err = waitpid(child, &status, 0);
> + ASSERT_GT(err, 0, "waitpid post-detach");
> + ASSERT_TRUE(WIFEXITED(status), "post-detach child exited");
> + ASSERT_EQ(WEXITSTATUS(status), 0, "post-detach migration free");
> +
> +cleanup:
> + if (skel)
> + test_cgroup_attach__destroy(skel);
> + if (allowed_fd >= 0)
> + close(allowed_fd);
> + if (denied_fd >= 0)
> + close(denied_fd);
> + cleanup_cgroup_environment();
> +}
> +
> +/*
> + * Subtest: verify_hook_args
> + *
> + * Verify that the hook receives correct src_cgrp, dst_cgrp, task pid,
> + * and threadgroup values.
> + */
> +static void test_verify_hook_args(void)
> +{
> + struct test_cgroup_attach *skel = NULL;
> + struct attach_event evt = {};
> + unsigned long long src_cgid, dst_cgid;
> + int src_fd = -1, dst_fd = -1;
> + __u32 map_key = 0;
> + char pid_str[32];
> + int err;
> +
> + if (!ASSERT_OK(setup_cgroup_environment(), "setup_cgroup_env"))
> + return;
> +
> + src_fd = create_and_get_cgroup("/src");
> + if (!ASSERT_GE(src_fd, 0, "create /src"))
> + goto cleanup;
> +
> + dst_fd = create_and_get_cgroup("/dst");
> + if (!ASSERT_GE(dst_fd, 0, "create /dst"))
> + goto cleanup;
> +
> + /* Move ourselves to /src first */
> + if (!ASSERT_OK(join_cgroup("/src"), "join /src"))
> + goto cleanup;
> +
> + skel = test_cgroup_attach__open_and_load();
> + if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
> + goto cleanup;
> +
> + err = test_cgroup_attach__attach(skel);
> + if (!ASSERT_OK(err, "skel attach"))
> + goto cleanup;
> +
> + skel->bss->monitored_pid = getpid();
> +
> + src_cgid = get_cgroup_id("/src");
> + dst_cgid = get_cgroup_id("/dst");
> + if (!ASSERT_NEQ(src_cgid, 0ULL, "get src cgroup id"))
> + goto cleanup;
> + if (!ASSERT_NEQ(dst_cgid, 0ULL, "get dst cgroup id"))
> + goto cleanup;
> +
> + /* Migrate self to /dst via cgroup.procs (threadgroup=true) */
> + snprintf(pid_str, sizeof(pid_str), "%d", getpid());
> + if (!ASSERT_OK(write_cgroup_file("/dst", "cgroup.procs", pid_str),
> + "migrate to /dst"))
> + goto cleanup;
> +
> + /* Read the recorded event */
> + err = bpf_map__lookup_elem(skel->maps.last_event,
> + &map_key, sizeof(map_key),
> + &evt, sizeof(evt), 0);
could just add a last_event struct to skel->bss and save the map
storage/lookup, but not a big deal.
> + if (!ASSERT_OK(err, "read last_event"))
> + goto cleanup;
> +
> + ASSERT_EQ(evt.src_cgrp_id, src_cgid, "src_cgrp_id matches");
> + ASSERT_EQ(evt.dst_cgrp_id, dst_cgid, "dst_cgrp_id matches");
> + ASSERT_EQ(evt.task_pid, (__u32)getpid(), "task_pid matches");
> + ASSERT_EQ(evt.threadgroup, 1, "threadgroup is true for cgroup.procs");
> + ASSERT_GE(evt.hook_count, (__u32)1, "hook fired at least once");
> +
> +cleanup:
> + if (skel)
> + test_cgroup_attach__destroy(skel);
> + if (src_fd >= 0)
> + close(src_fd);
> + if (dst_fd >= 0)
> + close(dst_fd);
> + cleanup_cgroup_environment();
> +}
> +
> +/*
> + * Subtest: clone_into_cgroup
> + *
> + * Verify the hook fires on the clone3(CLONE_INTO_CGROUP) path and can
> + * deny spawning a child directly into a cgroup.
> + */
> +static void test_clone_into_cgroup(void)
> +{
> + struct test_cgroup_attach *skel = NULL;
> + int allowed_fd = -1, denied_fd = -1;
> + unsigned long long denied_cgid, allowed_cgid;
> + struct attach_event evt = {};
> + __u32 map_key = 0;
> + __u64 key;
> + __u8 val = 1;
> + int err, status;
> + pid_t child;
> +
> + if (!ASSERT_OK(setup_cgroup_environment(), "setup_cgroup_env"))
> + return;
> +
> + allowed_fd = create_and_get_cgroup("/clone_allowed");
> + if (!ASSERT_GE(allowed_fd, 0, "create /clone_allowed"))
> + goto cleanup;
> +
> + denied_fd = create_and_get_cgroup("/clone_denied");
> + if (!ASSERT_GE(denied_fd, 0, "create /clone_denied"))
> + goto cleanup;
> +
> + skel = test_cgroup_attach__open_and_load();
> + if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
> + goto cleanup;
> +
> + err = test_cgroup_attach__attach(skel);
> + if (!ASSERT_OK(err, "skel attach"))
> + goto cleanup;
> +
> + skel->bss->monitored_pid = getpid();
> +
> + denied_cgid = get_cgroup_id("/clone_denied");
> + allowed_cgid = get_cgroup_id("/clone_allowed");
> + if (!ASSERT_NEQ(denied_cgid, 0ULL, "get denied cgroup id"))
> + goto cleanup;
> + if (!ASSERT_NEQ(allowed_cgid, 0ULL, "get allowed cgroup id"))
> + goto cleanup;
> +
> + key = denied_cgid;
> + err = bpf_map__update_elem(skel->maps.denied_cgroups,
> + &key, sizeof(key), &val, sizeof(val), 0);
> + if (!ASSERT_OK(err, "add denied cgroup"))
> + goto cleanup;
> +
> + /* clone3 into denied cgroup should fail */
> + child = do_clone3(denied_fd);
> + if (child >= 0) {
> + waitpid(child, NULL, 0);
> + ASSERT_LT(child, 0, "clone3 into denied should fail");
> + goto cleanup;
> + }
> + if (errno == ENOSYS || errno == E2BIG) {
> + test__skip();
> + goto cleanup;
> + }
> + ASSERT_EQ(errno, EPERM, "clone3 denied errno");
> +
> + /* clone3 into allowed cgroup should succeed */
> + child = do_clone3(allowed_fd);
> + if (!ASSERT_GE(child, 0, "clone3 into allowed"))
> + goto cleanup;
> + if (child == 0)
> + _exit(0);
> +
> + err = waitpid(child, &status, 0);
> + ASSERT_GT(err, 0, "waitpid clone3 allowed");
> + ASSERT_TRUE(WIFEXITED(status), "clone3 child exited");
> + ASSERT_EQ(WEXITSTATUS(status), 0, "clone3 child ok");
> +
> + /* Verify the hook recorded the allowed clone */
> + err = bpf_map__lookup_elem(skel->maps.last_event,
> + &map_key, sizeof(map_key),
> + &evt, sizeof(evt), 0);
> + if (!ASSERT_OK(err, "read last_event"))
> + goto cleanup;
> +
> + ASSERT_EQ(evt.dst_cgrp_id, allowed_cgid, "clone3 dst_cgrp_id");
> +
> +cleanup:
> + if (skel)
> + test_cgroup_attach__destroy(skel);
> + if (allowed_fd >= 0)
> + close(allowed_fd);
> + if (denied_fd >= 0)
> + close(denied_fd);
> + cleanup_cgroup_environment();
> +}
> +
> +void test_cgroup_attach(void)
> +{
> + if (test__start_subtest("deny_migration"))
> + test_deny_migration();
> + if (test__start_subtest("verify_hook_args"))
> + test_verify_hook_args();
> + if (test__start_subtest("clone_into_cgroup"))
> + test_clone_into_cgroup();
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_cgroup_attach.c b/tools/testing/selftests/bpf/progs/test_cgroup_attach.c
> new file mode 100644
> index 000000000000..90915d1d7d64
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_cgroup_attach.c
> @@ -0,0 +1,85 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
> +
> +/*
> + * BPF LSM cgroup attach policy: supervise cgroup migration.
> + *
> + * A designated process populates a denied_cgroups map with cgroup IDs
> + * that should reject migration. The cgroup_attach hook checks every
> + * migration and returns -EPERM when the destination cgroup is denied.
> + * It also records the last hook invocation into last_event for the
> + * userspace test to verify arguments.
> + */
> +
> +#include "vmlinux.h"
> +#include <errno.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_core_read.h>
> +
> +struct attach_event {
> + __u32 task_pid;
> + __u64 src_cgrp_id;
> + __u64 dst_cgrp_id;
> + __u8 threadgroup;
> + __u32 hook_count;
> +};
> +
> +/*
> + * Cgroups that should reject migration.
> + * Key: cgroup kn->id (u64).
> + * Value: unused marker.
> + */
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __uint(max_entries, 16);
> + __type(key, __u64);
> + __type(value, __u8);
> +} denied_cgroups SEC(".maps");
> +
> +/*
> + * Record the last hook invocation for argument verification.
> + * Key: 0.
> + * Value: struct attach_event.
> + */
> +struct {
> + __uint(type, BPF_MAP_TYPE_ARRAY);
> + __uint(max_entries, 1);
> + __type(key, __u32);
> + __type(value, struct attach_event);
> +} last_event SEC(".maps");
> +
> +__u32 monitored_pid;
> +
> +char _license[] SEC("license") = "GPL";
> +
> +SEC("lsm.s/cgroup_attach")
> +int BPF_PROG(cgroup_attach, struct task_struct *task,
> + struct cgroup *src_cgrp, struct cgroup *dst_cgrp,
> + struct super_block *sb, bool threadgroup,
> + struct cgroup_namespace *ns)
> +{
> + struct task_struct *current = bpf_get_current_task_btf();
> + struct attach_event *evt;
> + __u64 dst_id;
> + __u32 key = 0;
> +
> + dst_id = BPF_CORE_READ(dst_cgrp, kn, id);
> +
> + if (bpf_map_lookup_elem(&denied_cgroups, &dst_id))
> + return -EPERM;
> +
> + if (!monitored_pid || current->tgid != monitored_pid)
> + return 0;
> +
> + evt = bpf_map_lookup_elem(&last_event, &key);
> + if (evt) {
> + evt->task_pid = task->pid;
> + evt->src_cgrp_id = BPF_CORE_READ(src_cgrp, kn, id);
> + evt->dst_cgrp_id = dst_id;
> + evt->threadgroup = threadgroup ? 1 : 0;
> + evt->hook_count++;
> + }
> +
> + return 0;
> +}
>
^ permalink raw reply [flat|nested] 28+ messages in thread