Linux Security Modules development

Linux Security Modules development
 help / color / mirror / Atom feed

* [PATCH 10/11] hornet: scripts: harden scripts to handle trailing whitespace
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

Trailing whitespace after the semicolon in the header files could
corrupt the extracted binary payload: the sed expression is anchored at
end of line, so a semicolon followed by whitespace was never stripped.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 scripts/hornet/extract-insn.sh | 2 +-
 scripts/hornet/extract-map.sh  | 2 +-
 scripts/hornet/extract-skel.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/hornet/extract-insn.sh b/scripts/hornet/extract-insn.sh
index e136932275aa5..3e7bed049acb6 100755
--- a/scripts/hornet/extract-insn.sh
+++ b/scripts/hornet/extract-insn.sh
@@ -23,5 +23,5 @@ if [ $ARGC -ne $EXPECTED_ARGS ] ; then
     usage
 else
     printf $(gcc -E $1 | grep "opts_insn" | \
-		 awk -F"=" '{print $2}' | sed 's/;\+$//' | sed 's/\"//g')
+		 awk -F"=" '{print $2}' | sed 's/[[:space:];]*$//' | sed 's/\"//g')
 fi
diff --git a/scripts/hornet/extract-map.sh b/scripts/hornet/extract-map.sh
index 058ac1b32d743..1d92ebe1a04b5 100755
--- a/scripts/hornet/extract-map.sh
+++ b/scripts/hornet/extract-map.sh
@@ -23,5 +23,5 @@ if [ $ARGC -ne $EXPECTED_ARGS ] ; then
     usage
 else
     printf $(gcc -E $1 | grep "opts_data" | \
-		 awk -F"=" '{print $2}' | sed 's/;\+$//' | sed 's/\"//g')
+		 awk -F"=" '{print $2}' | sed 's/[[:space:];]*$//' | sed 's/\"//g')
 fi
diff --git a/scripts/hornet/extract-skel.sh b/scripts/hornet/extract-skel.sh
index abc435e2bcd4e..e115f4b7fdf74 100755
--- a/scripts/hornet/extract-skel.sh
+++ b/scripts/hornet/extract-skel.sh
@@ -23,5 +23,5 @@ if [ $ARGC -ne $EXPECTED_ARGS ] ; then
     usage
 else
     printf $(gcc -E $1 | grep "static const char opts_$2" | \
-		 awk -F"=" '{print $2}' | sed 's/;\+$//' | sed 's/\"//g')
+		 awk -F"=" '{print $2}' | sed 's/[[:space:];]*$//' | sed 's/\"//g')
 fi
-- 
2.53.0


^ permalink raw reply related

* [PATCH 09/11] hornet: scripts: set a non-zero error code for usage
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

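The usage function exited with status 0, so build scripts could
continue even when arguments were missing. Exit with a non-zero status
and print the usage text to stderr instead.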

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 scripts/hornet/extract-insn.sh | 10 +++++-----
 scripts/hornet/extract-map.sh  | 10 +++++-----
 scripts/hornet/extract-skel.sh | 10 +++++-----
 scripts/hornet/write-sig.sh    | 10 +++++-----
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/scripts/hornet/extract-insn.sh b/scripts/hornet/extract-insn.sh
index 52338f057ff6b..e136932275aa5 100755
--- a/scripts/hornet/extract-insn.sh
+++ b/scripts/hornet/extract-insn.sh
@@ -8,11 +8,11 @@
 # License as published by the Free Software Foundation.
 
 function usage() {
-    echo "Sample script for extracting instructions"
-    echo "autogenerated eBPF lskel headers"
-    echo ""
-    echo "USAGE: header_file"
-    exit
+    echo "Sample script for extracting instructions" >&2
+    echo "autogenerated eBPF lskel headers" >&2
+    echo "" >&2
+    echo "USAGE: header_file" >&2
+    exit 1
 }
 
 ARGC=$#
diff --git a/scripts/hornet/extract-map.sh b/scripts/hornet/extract-map.sh
index c309f505c6238..058ac1b32d743 100755
--- a/scripts/hornet/extract-map.sh
+++ b/scripts/hornet/extract-map.sh
@@ -8,11 +8,11 @@
 # License as published by the Free Software Foundation.
 
 function usage() {
-    echo "Sample script for extracting instructions"
-    echo "autogenerated eBPF lskel headers"
-    echo ""
-    echo "USAGE: header_file"
-    exit
+    echo "Sample script for extracting instructions" >&2
+    echo "autogenerated eBPF lskel headers" >&2
+    echo "" >&2
+    echo "USAGE: header_file" >&2
+    exit 1
 }
 
 ARGC=$#
diff --git a/scripts/hornet/extract-skel.sh b/scripts/hornet/extract-skel.sh
index 6550a86b89917..abc435e2bcd4e 100755
--- a/scripts/hornet/extract-skel.sh
+++ b/scripts/hornet/extract-skel.sh
@@ -8,11 +8,11 @@
 # License as published by the Free Software Foundation.
 
 function usage() {
-    echo "Sample script for extracting instructions and map data out of"
-    echo "autogenerated eBPF lskel headers"
-    echo ""
-    echo "USAGE: header_file field"
-    exit
+    echo "Sample script for extracting instructions and map data out of" >&2
+    echo "autogenerated eBPF lskel headers" >&2
+    echo "" >&2
+    echo "USAGE: header_file field" >&2
+    exit 1
 }
 
 ARGC=$#
diff --git a/scripts/hornet/write-sig.sh b/scripts/hornet/write-sig.sh
index 7eaabe3bab9aa..ad2b65761c282 100755
--- a/scripts/hornet/write-sig.sh
+++ b/scripts/hornet/write-sig.sh
@@ -8,11 +8,11 @@
 # License as published by the Free Software Foundation.
 
 function usage() {
-    echo "Sample for rewriting an autogenerated eBPF lskel headers"
-    echo "with a new signature"
-    echo ""
-    echo "USAGE: header_file sig"
-    exit
+    echo "Sample for rewriting an autogenerated eBPF lskel headers" >&2
+    echo "with a new signature" >&2
+    echo "" >&2
+    echo "USAGE: header_file sig" >&2
+    exit 1
 }
 
 ARGC=$#
-- 
2.53.0


^ permalink raw reply related

* [PATCH 08/11] hornet: gen_sig: fix missing command line switches
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

D was missing from the getopt list. Additionally, we were missing the
help option handler.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 scripts/hornet/gen_sig.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/hornet/gen_sig.c b/scripts/hornet/gen_sig.c
index fb9ae1934206a..19ae9af006853 100644
--- a/scripts/hornet/gen_sig.c
+++ b/scripts/hornet/gen_sig.c
@@ -295,7 +295,7 @@ int main(int argc, char **argv)
 	int i;
 	int opt;
 
-	const char *short_opts = "C:K:P:O:A:Sh";
+	const char *short_opts = "C:K:P:O:D:A:Sh";
 
 	static const struct option long_opts[] = {
 		{"cert", required_argument, 0, 'C'},
@@ -332,6 +332,9 @@ int main(int argc, char **argv)
 			}
 			hashes[hash_count++].file = optarg;
 			break;
+		case 'h':
+			usage(argv[0]);
+			return EXIT_SUCCESS;
 		default:
 			usage(argv[0]);
 			return EXIT_FAILURE;
-- 
2.53.0


^ permalink raw reply related

* [PATCH 07/11] hornet: gen_sig: check for bad allocations
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

There were a few sites where gen_sig failed to check for bad return
values after allocations. Error out appropriately as needed.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 scripts/hornet/gen_sig.c | 40 ++++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/scripts/hornet/gen_sig.c b/scripts/hornet/gen_sig.c
index 647bc3a257dd0..fb9ae1934206a 100644
--- a/scripts/hornet/gen_sig.c
+++ b/scripts/hornet/gen_sig.c
@@ -248,13 +248,25 @@ static int sha256(const char *path, unsigned char out[SHA256_LEN], unsigned int
 	return rc;
 }
 
-static void add_hash(MAP_SET *set, unsigned char *buffer, int buffer_len)
+static int add_hash(MAP_SET *set, unsigned char *buffer, int buffer_len)
 {
-	HORNET_MAP *map = NULL;
+	HORNET_MAP *map;
 
 	map = HORNET_MAP_new();
-	ASN1_OCTET_STRING_set(map->hash, buffer, buffer_len);
-	sk_HORNET_MAP_push(set->maps, map);
+	if (!map)
+		return -1;
+
+	if (ASN1_OCTET_STRING_set(map->hash, buffer, buffer_len) != 1) {
+		HORNET_MAP_free(map);
+		return -1;
+	}
+
+	if (sk_HORNET_MAP_push(set->maps, map) <= 0) {
+		HORNET_MAP_free(map);
+		return -1;
+	}
+
+	return 0;
 }
 
 int main(int argc, char **argv)
@@ -353,13 +365,18 @@ int main(int argc, char **argv)
 	ERR(!si, "add signer failed");
 
 	set = MAP_SET_new();
+	ERR(!set, "alloc MAP_SET failed");
 	set->maps = sk_HORNET_MAP_new_null();
+	ERR(!set->maps, "alloc HORNET_MAP stack failed");
 
 	for (i = 0; i < hash_count; i++) {
 		if (sha256(hashes[i].file, hash_buffer, &hash_len) != 0) {
 			DIE("failed to hash input");
 		}
-		add_hash(set, hash_buffer, hash_len);
+		if (add_hash(set, hash_buffer, hash_len) != 0) {
+			ERR_print_errors_fp(stderr);
+			DIE("failed to add hash to map set");
+		}
 	}
 
 	oid = OBJ_txt2obj("2.25.316487325684022475439036912669789383960", 1);
@@ -380,7 +397,18 @@ int main(int argc, char **argv)
 	b_out = bio_open_wr(out_path);
 	ERR(!b_out, "opening output path failed");
 
-	i2d_CMS_bio_stream(b_out, cms_out, NULL, 0);
+	err = i2d_CMS_bio_stream(b_out, cms_out, NULL, 0);
+	ERR(!err, "writing CMS signature to %s failed", out_path);
+
+	/*
+	 * File BIOs wrap stdio, which buffers writes; small payloads will
+	 * report success from BIO_write even when the underlying file is
+	 * full or otherwise un-writable. Force a flush and check it before
+	 * the BIO is freed, otherwise gen_sig could exit successfully with
+	 * a truncated or empty signature file (e.g. ENOSPC on /dev/full).
+	 */
+	err = BIO_flush(b_out);
+	ERR(err <= 0, "flushing %s failed", out_path);
 
 	BIO_free(data_in);
 	BIO_free(b_out);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 06/11] hornet: gen_sig: fix error string allocations
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

The sha256 function was allocating/freeing its own error strings,
which could cause further errors to only return their error number.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 scripts/hornet/gen_sig.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/scripts/hornet/gen_sig.c b/scripts/hornet/gen_sig.c
index 4e8caad22f381..647bc3a257dd0 100644
--- a/scripts/hornet/gen_sig.c
+++ b/scripts/hornet/gen_sig.c
@@ -200,8 +200,6 @@ static int sha256(const char *path, unsigned char out[SHA256_LEN], unsigned int
 		return -2;
 	}
 
-	ERR_load_crypto_strings();
-
 	rc = -3;
 	ctx = EVP_MD_CTX_new();
 	if (!ctx) {
@@ -247,7 +245,6 @@ static int sha256(const char *path, unsigned char out[SHA256_LEN], unsigned int
 done:
 	EVP_MD_CTX_free(ctx);
 	fclose(f);
-	ERR_free_strings();
 	return rc;
 }
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 05/11] hornet: gen_sig: fix off-by-one check for used maps
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

A logic bug limited the maximum number of map hashes to
MAX_HASHES-1.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 scripts/hornet/gen_sig.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/hornet/gen_sig.c b/scripts/hornet/gen_sig.c
index b4f983ab24bcd..4e8caad22f381 100644
--- a/scripts/hornet/gen_sig.c
+++ b/scripts/hornet/gen_sig.c
@@ -317,11 +317,11 @@ int main(int argc, char **argv)
 			data_path = optarg;
 			break;
 		case 'A':
-			hashes[hash_count].file = optarg;
-			if (++hash_count >= MAX_HASHES) {
+			if (hash_count >= MAX_HASHES) {
 				usage(argv[0]);
 				return EXIT_FAILURE;
 			}
+			hashes[hash_count++].file = optarg;
 			break;
 		default:
 			usage(argv[0]);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 04/11] selftests: hornet: handle cross compilation and test skipping
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

There were a few spots in the hornet selftest makefile where some host
resources were assumed to be used. Additionally add proper skip
detection for scenarios where the autogenerated signing keys don't
exist.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 tools/testing/selftests/hornet/Makefile | 114 ++++++++++++++++++------
 1 file changed, 89 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/hornet/Makefile b/tools/testing/selftests/hornet/Makefile
index 316364f95f28c..460adab35e238 100644
--- a/tools/testing/selftests/hornet/Makefile
+++ b/tools/testing/selftests/hornet/Makefile
@@ -5,59 +5,123 @@ include ../../../scripts/Makefile.include
 
 CLANG ?= clang
 CFLAGS := -g -O2 -Wall
+TOOLSDIR := $(abspath ../../..)
 BPFTOOL ?= $(TOOLSDIR)/bpf/bpftool/bpftool
 SCRIPTSDIR := $(abspath ../../../../scripts/hornet)
-TOOLSDIR := $(abspath ../../..)
 LIBDIR := $(TOOLSDIR)/lib
 BPFDIR := $(LIBDIR)/bpf
 TOOLSINCDIR := $(TOOLSDIR)/include
 APIDIR := $(TOOLSINCDIR)/uapi
 CERTDIR := $(abspath ../../../../certs)
-PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config
+HOSTPKG_CONFIG ?= pkg-config
+
+SIGNING_KEY  := $(CERTDIR)/signing_key.pem
+SIGNING_CERT := $(CERTDIR)/signing_key.x509
+
+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)				\
+		     $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)	\
+		     ../../../../vmlinux				\
+		     /sys/kernel/btf/vmlinux				\
+		     /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+
+# The hornet selftest needs the kernel module signing key/cert (generated when
+# the kernel is built with CONFIG_MODULE_SIG=y), a bpftool binary, and a
+# vmlinux with BTF for trivial.bpf.o. If any of those are missing (cross-build
+# without artifacts, container CI, CONFIG_MODULE_SIG disabled, etc.) skip the
+# targets rather than failing the global selftests build.
+hornet_skip_reason :=
+ifeq ($(wildcard $(SIGNING_KEY)),)
+hornet_skip_reason := module signing key not found at $(SIGNING_KEY) (build the kernel with CONFIG_MODULE_SIG=y first)
+else ifeq ($(wildcard $(SIGNING_CERT)),)
+hornet_skip_reason := module signing cert not found at $(SIGNING_CERT)
+else ifeq ($(wildcard $(BPFTOOL)),)
+hornet_skip_reason := bpftool not found at $(BPFTOOL) (build it under tools/bpf/bpftool first)
+else ifeq ($(VMLINUX_BTF),)
+hornet_skip_reason := no vmlinux with BTF found; tried $(VMLINUX_BTF_PATHS) (build the kernel with CONFIG_DEBUG_INFO_BTF=y or set VMLINUX_BTF=)
+endif
+
+ifneq ($(hornet_skip_reason),)
+$(warning Skipping hornet selftests: $(hornet_skip_reason))
+TEST_GEN_PROGS :=
+TEST_GEN_FILES :=
+
+include ../lib.mk
+
+else
 
 TEST_GEN_PROGS := loader
 TEST_GEN_FILES := vmlinux.h loader.h trivial.bpf.o map.bin sig.bin insn.bin signed_loader.h
-$(TEST_GEN_PROGS): LDLIBS += -lbpf
-$(TEST_GEN_PROGS): $(TEST_GEN_FILES)
 
 include ../lib.mk
 
-BPF_CFLAGS := -target bpf \
-	-D__TARGET_ARCH_$(ARCH) \
-	-I/usr/include/$(shell uname -m)-linux-gnu \
+define get_sys_includes
+$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') \
+$(shell $(1) $(2) -dM -E - </dev/null | grep '__loongarch_grlen ' | awk '{printf("-D__BITS_PER_LONG=%d", $$3)}') \
+$(shell $(1) $(2) -dM -E - </dev/null | grep -E 'MIPS(EL|EB)|_MIPS_SZ(PTR|LONG) |_MIPS_SIM |_ABI(O32|N32|64) ' | awk '{printf("-D%s=%s ", $$2, $$3)}')
+endef
+
+ifneq ($(CROSS_COMPILE),)
+CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
+
+IS_LITTLE_ENDIAN := $(shell $(CC) -dM -E - </dev/null | \
+			grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+BPF_TARGET_ENDIAN := $(if $(IS_LITTLE_ENDIAN),--target=bpfel,--target=bpfeb)
+
+BPF_CFLAGS := $(BPF_TARGET_ENDIAN) \
+	-D__TARGET_ARCH_$(SRCARCH) \
+	$(CLANG_SYS_INCLUDES) \
 	$(KHDR_INCLUDES)
 
-vmlinux.h:
-	$(BPFTOOL) btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
+$(OUTPUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
+	$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
 
-trivial.bpf.o: trivial.bpf.c vmlinux.h
-	$(CLANG) $(CFLAGS) $(BPF_CFLAGS) -c $< -o $@
+$(OUTPUT)/trivial.bpf.o: trivial.bpf.c $(OUTPUT)/vmlinux.h
+	$(CLANG) $(CFLAGS) $(BPF_CFLAGS) -I$(OUTPUT) -c $< -o $@
 
-loader.h: trivial.bpf.o
-	$(BPFTOOL) gen skeleton -S -k $(CERTDIR)/signing_key.pem -i $(CERTDIR)/signing_key.x509 \
+$(OUTPUT)/loader.h: $(OUTPUT)/trivial.bpf.o
+	$(BPFTOOL) gen skeleton -S -k $(SIGNING_KEY) -i $(SIGNING_CERT) \
 		-L $< name trivial > $@
 
-insn.bin: loader.h
+$(OUTPUT)/insn.bin: $(OUTPUT)/loader.h
 	$(SCRIPTSDIR)/extract-insn.sh $< > $@
 
-map.bin: loader.h
+$(OUTPUT)/map.bin: $(OUTPUT)/loader.h
 	$(SCRIPTSDIR)/extract-map.sh $< > $@
 
 $(OUTPUT)/gen_sig: ../../../../scripts/hornet/gen_sig.c
 	$(call msg,GEN_SIG,,$@)
-	$(Q)$(CC) $(shell $(PKG_CONFIG) --cflags libcrypto 2> /dev/null) \
+	$(Q)$(HOSTCC) $(shell $(HOSTPKG_CONFIG) --cflags libcrypto 2> /dev/null) \
 		  $< -o $@ \
-		  $(shell $(PKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto)
+		  $(shell $(HOSTPKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto)
+
+$(OUTPUT)/sig.bin: $(OUTPUT)/insn.bin $(OUTPUT)/map.bin $(OUTPUT)/gen_sig
+	$(OUTPUT)/gen_sig --key $(SIGNING_KEY) --cert $(SIGNING_CERT) \
+		--data $(OUTPUT)/insn.bin --add $(OUTPUT)/map.bin --out $@
+
+$(OUTPUT)/signed_loader.h: $(OUTPUT)/sig.bin $(OUTPUT)/loader.h
+	$(SCRIPTSDIR)/write-sig.sh $(OUTPUT)/loader.h $(OUTPUT)/sig.bin > $@
+
+BPFOBJ := $(OUTPUT)/libbpf/libbpf.a
+
+$(OUTPUT)/libbpf:
+	$(Q)mkdir -p $@
 
-sig.bin: insn.bin map.bin $(OUTPUT)/gen_sig
-	$(OUTPUT)/gen_sig --key $(CERTDIR)/signing_key.pem --cert $(CERTDIR)/signing_key.x509 \
-		--data insn.bin --add map.bin --out sig.bin
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
+	   $(APIDIR)/linux/bpf.h | $(OUTPUT)/libbpf
+	$(Q)$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/libbpf/ \
+		    DESTDIR=$(OUTPUT) prefix= \
+		    $(BPFOBJ) install_headers
 
-signed_loader.h: sig.bin
-	$(SCRIPTSDIR)/write-sig.sh loader.h sig.bin > $@
+$(OUTPUT)/loader: loader.c $(OUTPUT)/signed_loader.h $(BPFOBJ)
+	$(CC) $(CFLAGS) -I$(LIBDIR) -I$(APIDIR) -I$(OUTPUT) \
+		$< $(BPFOBJ) -o $@ -lelf -lz
 
-loader: loader.c signed_loader.h
-	$(CC) $(CFLAGS) -I$(LIBDIR) -I$(APIDIR) $< -o $@ -lbpf
 
+EXTRA_CLEAN = $(OUTPUT)/gen_sig $(OUTPUT)/libbpf
 
-EXTRA_CLEAN = $(OUTPUT)/gen_sig
+endif
-- 
2.53.0


^ permalink raw reply related

* [PATCH 03/11] hornet: fix off-by-one bug in max used maps check
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

Sashiko correctly reported an off-by-one logic error checking against
the maximum number of used maps.  Removing the index constraint allows
us to simplify the check logic.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 security/hornet/hornet_lsm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/security/hornet/hornet_lsm.c b/security/hornet/hornet_lsm.c
index 35d9522d6bc72..eeb422db1092d 100644
--- a/security/hornet/hornet_lsm.c
+++ b/security/hornet/hornet_lsm.c
@@ -49,8 +49,7 @@ int hornet_next_map(void *context, size_t hdrlen,
 {
 	struct hornet_parse_context *ctx = (struct hornet_parse_context *)context;
 
-	if (++ctx->security->signed_hash_count >= MAX_USED_MAPS)
-		return -EINVAL;
+	ctx->security->signed_hash_count++;
 	return 0;
 }
 
@@ -63,6 +62,8 @@ int hornet_map_hash(void *context, size_t hdrlen,
 
 	if (vlen != SHA256_DIGEST_SIZE && vlen != 0)
 		return -EINVAL;
+	if (ctx->security->signed_hash_count >= MAX_USED_MAPS)
+		return -EINVAL;
 
 	memcpy(&ctx->security->signed_hashes[ctx->security->signed_hash_count * SHA256_DIGEST_SIZE],
 	       value, vlen);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 02/11] hornet: invert map set check logic
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

In a multi-map hash verification scenario, a logic bug may have
allowed an attacker to provide duplicate maps to satisfy the hash
check count. Instead, invert the logic to verify each map discretely.

Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 security/hornet/hornet_lsm.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/security/hornet/hornet_lsm.c b/security/hornet/hornet_lsm.c
index 516038413f321..35d9522d6bc72 100644
--- a/security/hornet/hornet_lsm.c
+++ b/security/hornet/hornet_lsm.c
@@ -191,7 +191,6 @@ static int hornet_check_prog_maps(struct bpf_prog *prog)
 	struct bpf_map *map;
 	int i, j;
 	bool found;
-	int covered_count = 0;
 
 	security = hornet_bpf_prog_security(prog);
 
@@ -200,18 +199,18 @@ static int hornet_check_prog_maps(struct bpf_prog *prog)
 
 	mutex_lock(&prog->aux->used_maps_mutex);
 
-	/* Verify every used_map has a matching signed hash */
-	for (j = 0; j < prog->aux->used_map_cnt; j++) {
-		map = prog->aux->used_maps[j];
+	/* Verify every signed map exists in used_maps */
+	for (i = 0; i < security->signed_hash_count; i++) {
+		found = false;
+		for (j = 0; j < prog->aux->used_map_cnt; j++) {
+			map = prog->aux->used_maps[j];
 
-		if (!READ_ONCE(map->frozen) || !map->ops->map_get_hash)
-			continue;
+			if (!READ_ONCE(map->frozen) || !map->ops->map_get_hash)
+				continue;
 
-		if (map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, hash))
-			continue;
+			if (map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, hash))
+				continue;
 
-		found = false;
-		for (i = 0; i < security->signed_hash_count; i++) {
 			if (memcmp(hash,
 				   &security->signed_hashes[i * SHA256_DIGEST_SIZE],
 				   SHA256_DIGEST_SIZE) == 0) {
@@ -223,15 +222,10 @@ static int hornet_check_prog_maps(struct bpf_prog *prog)
 			mutex_unlock(&prog->aux->used_maps_mutex);
 			return -EPERM;
 		}
-		covered_count++;
 	}
 
 	mutex_unlock(&prog->aux->used_maps_mutex);
 
-	/* Ensure all signed hashes were accounted for */
-	if (covered_count != security->signed_hash_count)
-		return -EPERM;
-
 	return 0;
 }
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 01/11] hornet: fix TOCTOU in signed program verification
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module
In-Reply-To: <20260528030915.2654994-1-bboscaccy@linux.microsoft.com>

The signature verification path was vulnerable to a time-of-check vs
time-of-use race at both the program load and program run hook sites:
between the moment a map's contents were hashed for signature
verification and the moment the program run hook re-verified them, an
attacker with sufficient privileges could swap or mutate the map
contents.

Close the race by snapshotting the map hashes during program load,
attaching them to the program, and re-verifying them from the
security_bpf_prog hook against prog->aux->used_maps. Because used_maps
is the same map set the verifier and runtime resolve against, there is
no longer a window in which the verified set and the executed set can
diverge.

Since we are no longer targeting the fd_array passed in, drop the map
index data entirely and check for whether or not the set of requested
map hashes is a subset of prog->aux->used_maps.

Reported-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Blaise Boscaccy <bboscaccy@linux.microsoft.com>
---
 Documentation/admin-guide/LSM/Hornet.rst |  39 +++-----
 scripts/hornet/gen_sig.c                 |  17 +---
 security/hornet/hornet.asn1              |   1 -
 security/hornet/hornet_lsm.c             | 121 +++--------------------
 tools/testing/selftests/hornet/Makefile  |   2 +-
 5 files changed, 35 insertions(+), 145 deletions(-)

diff --git a/Documentation/admin-guide/LSM/Hornet.rst b/Documentation/admin-guide/LSM/Hornet.rst
index 0ade4c17374c6..a369bc11408f4 100644
--- a/Documentation/admin-guide/LSM/Hornet.rst
+++ b/Documentation/admin-guide/LSM/Hornet.rst
@@ -86,15 +86,14 @@ Hornet protects against the following threats:
 
 - **Tampering with map data**: When map hashes are included in the
   signature, Hornet verifies that frozen BPF maps match their expected
-  SHA-256 hashes at load time. Maps are also re-verified before program
-  execution via ``BPF_PROG_RUN``.
+  SHA-256 hashes at load time after the program is publicly exposed.
 
 Hornet does **not** protect against:
 
 - Compromise of the signing key itself.
 - Attacks that occur after a program has been loaded and verified.
 - Programs loaded by the kernel itself (kernel-internal loads bypass
-  the ``BPF_PROG_RUN`` map check).
+  the map check).
 
 Known Limitations
 =================
@@ -117,6 +116,10 @@ Known Limitations
   data. It does not guarantee positional binding of maps to specific
   fd_array slots.
 
+- Map hash verification does not enforce any ordering. It simply asserts
+  that the set of map hashes requested to be verified exist in the used
+  array.
+
 - BPF_MAP_TYPE_PROG_ARRAY maps must be frozen for Hornet to verify
   them. Unfrozen prog array maps are not covered by verification.
 
@@ -159,24 +162,19 @@ The following describes what happens when a userspace program calls
 5. Hornet extracts the authenticated attribute identified by
    ``OID_hornet_data`` (OID ``2.25.316487325684022475439036912669789383960``)
    from the PKCS#7 message. This attribute contains an ASN.1-encoded set
-   of map index/hash pairs.
+   of map hashes.
 
-6. For each map hash entry, Hornet retrieves the corresponding BPF map
-   via its file descriptor, confirms it is frozen, computes its SHA-256
-   hash, and compares it against the signed hash.
+6. For each map hash entry, Hornet stores the target map hash in
+   the program's LSM blob.
 
 7. The resulting integrity verdict is passed to the
    ``bpf_prog_load_post_integrity`` hook so that downstream LSMs can
    enforce policy.
 
-Runtime Map Verification
-------------------------
-
-When ``bpf(BPF_PROG_RUN, ...)`` is called from userspace, Hornet
-re-verifies the hashes of all maps associated with the program. This
-ensures that map contents have not been modified between program load
-and execution. If any map hash no longer matches, the ``BPF_PROG_RUN``
-command is denied.
+8. After the verifier processes the program, once it's ready to be published,
+   Hornet intercepts the ``bpf_prog`` hook, and verifies that the set of
+   required hashes exists in the program's used maps. If any map hash
+   cannot be found, the command is denied.
 
 Userspace Interface
 -------------------
@@ -199,14 +197,10 @@ the following ASN.1 schema::
   HornetData ::= SET OF Map
 
   Map ::= SEQUENCE {
-      index   INTEGER,
       sha     OCTET STRING
   }
 
-Each ``Map`` entry contains the index of the map in the program's
-``fd_array`` and its expected SHA-256 hash. A zero-length ``sha`` field
-indicates that the map at that index should be skipped during
-verification.
+Each ``Map`` entry contains an expected SHA-256 hash.
 
 Tooling
 =======
@@ -229,7 +223,7 @@ Usage::
           --key <signer.key> \
           [--pass <passphrase>] \
           --out <signature.p7b> \
-          [--add <mapfile.bin>:<index> ...]
+          [--add <mapfile.bin> ...]
 
 ``--data``
   Path to the binary file containing eBPF program instructions to sign.
@@ -248,8 +242,7 @@ Usage::
 
 ``--add``
   Attach a map hash as a signed attribute. The argument is a path to a
-  binary map file followed by a colon and the map's index in the
-  ``fd_array``. This option may be specified multiple times.
+  binary map file. This option may be specified multiple times.
 
 extract-skel.sh
 ---------------
diff --git a/scripts/hornet/gen_sig.c b/scripts/hornet/gen_sig.c
index 8dd9ed66346a2..b4f983ab24bcd 100644
--- a/scripts/hornet/gen_sig.c
+++ b/scripts/hornet/gen_sig.c
@@ -55,7 +55,6 @@
 
 struct hash_spec {
 	char *file;
-	int index;
 };
 
 typedef struct {
@@ -66,7 +65,6 @@ typedef struct {
 
 DECLARE_ASN1_FUNCTIONS(HORNET_MAP)
 ASN1_SEQUENCE(HORNET_MAP) = {
-	ASN1_SIMPLE(HORNET_MAP, index, ASN1_INTEGER),
 	ASN1_SIMPLE(HORNET_MAP, hash, ASN1_OCTET_STRING)
 } ASN1_SEQUENCE_END(HORNET_MAP);
 
@@ -253,12 +251,11 @@ static int sha256(const char *path, unsigned char out[SHA256_LEN], unsigned int
 	return rc;
 }
 
-static void add_hash(MAP_SET *set, unsigned char *buffer, int buffer_len, int index)
+static void add_hash(MAP_SET *set, unsigned char *buffer, int buffer_len)
 {
 	HORNET_MAP *map = NULL;
 
 	map = HORNET_MAP_new();
-	ASN1_INTEGER_set(map->index, index);
 	ASN1_OCTET_STRING_set(map->hash, buffer, buffer_len);
 	sk_HORNET_MAP_push(set->maps, map);
 }
@@ -320,14 +317,8 @@ int main(int argc, char **argv)
 			data_path = optarg;
 			break;
 		case 'A':
-			if (strchr(optarg, ':')) {
-				hashes[hash_count].file = strsep(&optarg, ":");
-				hashes[hash_count].index = atoi(optarg);
-				if (++hash_count >= MAX_HASHES) {
-					usage(argv[0]);
-					return EXIT_FAILURE;
-				}
-			} else {
+			hashes[hash_count].file = optarg;
+			if (++hash_count >= MAX_HASHES) {
 				usage(argv[0]);
 				return EXIT_FAILURE;
 			}
@@ -371,7 +362,7 @@ int main(int argc, char **argv)
 		if (sha256(hashes[i].file, hash_buffer, &hash_len) != 0) {
 			DIE("failed to hash input");
 		}
-		add_hash(set, hash_buffer, hash_len, hashes[i].index);
+		add_hash(set, hash_buffer, hash_len);
 	}
 
 	oid = OBJ_txt2obj("2.25.316487325684022475439036912669789383960", 1);
diff --git a/security/hornet/hornet.asn1 b/security/hornet/hornet.asn1
index e60abf451ae23..3cf50379f5e7c 100644
--- a/security/hornet/hornet.asn1
+++ b/security/hornet/hornet.asn1
@@ -7,6 +7,5 @@
 HornetData ::= SET OF Map
 
 Map ::= SEQUENCE {
-	index			INTEGER ({ hornet_map_index }),
 	sha			OCTET STRING ({ hornet_map_hash })
 } ({ hornet_next_map })
diff --git a/security/hornet/hornet_lsm.c b/security/hornet/hornet_lsm.c
index a4d11fa5b0889..516038413f321 100644
--- a/security/hornet/hornet_lsm.c
+++ b/security/hornet/hornet_lsm.c
@@ -21,26 +21,18 @@
 
 #define MAX_USED_MAPS 64
 
-struct hornet_maps {
-	bpfptr_t fd_array;
-};
-
 /* The only hashing algorithm available is SHA256 due to it be hardcoded
  * in the bpf subsystem.
  */
-
-struct hornet_parse_context {
-	int indexes[MAX_USED_MAPS];
-	bool skips[MAX_USED_MAPS];
-	unsigned char hashes[SHA256_DIGEST_SIZE * MAX_USED_MAPS];
-	int hash_count;
-};
-
 struct hornet_prog_security_struct {
 	int signed_hash_count;
 	unsigned char signed_hashes[SHA256_DIGEST_SIZE * MAX_USED_MAPS];
 };
 
+struct hornet_parse_context {
+	struct hornet_prog_security_struct *security;
+};
+
 struct lsm_blob_sizes hornet_blob_sizes __ro_after_init = {
 	.lbs_bpf_prog = sizeof(struct hornet_prog_security_struct),
 };
@@ -51,79 +43,17 @@ hornet_bpf_prog_security(struct bpf_prog *prog)
 	return prog->aux->security + hornet_blob_sizes.lbs_bpf_prog;
 }
 
-static int hornet_verify_hashes(struct hornet_maps *maps,
-				struct hornet_parse_context *ctx,
-				struct bpf_prog *prog)
-{
-	int map_fd;
-	u32 i;
-	struct bpf_map *map;
-	int err = 0;
-	unsigned char hash[SHA256_DIGEST_SIZE];
-	struct hornet_prog_security_struct *security = hornet_bpf_prog_security(prog);
-
-	for (i = 0; i < ctx->hash_count; i++) {
-		if (ctx->skips[i])
-			continue;
-
-		err = copy_from_bpfptr_offset(&map_fd, maps->fd_array,
-					      ctx->indexes[i] * sizeof(map_fd),
-					      sizeof(map_fd));
-		if (err != 0)
-			return LSM_INT_VERDICT_FAULT;
-
-		CLASS(fd, f)(map_fd);
-		if (fd_empty(f))
-			return LSM_INT_VERDICT_FAULT;
-		if (unlikely(fd_file(f)->f_op != &bpf_map_fops))
-			return LSM_INT_VERDICT_FAULT;
-
-		map = fd_file(f)->private_data;
-		if (!READ_ONCE(map->frozen))
-			return LSM_INT_VERDICT_FAULT;
-
-		if (!map->ops->map_get_hash)
-			return LSM_INT_VERDICT_FAULT;
-
-		if (map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, hash))
-			return LSM_INT_VERDICT_FAULT;
-
-		err = memcmp(hash, &ctx->hashes[i * SHA256_DIGEST_SIZE],
-			      SHA256_DIGEST_SIZE);
-		if (err)
-			return LSM_INT_VERDICT_UNEXPECTED;
-
-		memcpy(&security->signed_hashes[security->signed_hash_count * SHA256_DIGEST_SIZE],
-		       &ctx->hashes[i * SHA256_DIGEST_SIZE], SHA256_DIGEST_SIZE);
-		security->signed_hash_count++;
-	}
-	return LSM_INT_VERDICT_OK;
-}
-
 int hornet_next_map(void *context, size_t hdrlen,
 		     unsigned char tag,
 		     const void *value, size_t vlen)
 {
 	struct hornet_parse_context *ctx = (struct hornet_parse_context *)context;
 
-	if (++ctx->hash_count >= MAX_USED_MAPS)
+	if (++ctx->security->signed_hash_count >= MAX_USED_MAPS)
 		return -EINVAL;
 	return 0;
 }
 
-int hornet_map_index(void *context, size_t hdrlen,
-		     unsigned char tag,
-		     const void *value, size_t vlen)
-{
-	struct hornet_parse_context *ctx = (struct hornet_parse_context *)context;
-
-	if (vlen != 1)
-		return -EINVAL;
-
-	ctx->indexes[ctx->hash_count] = *(u8 *)value;
-	return 0;
-}
-
 int hornet_map_hash(void *context, size_t hdrlen,
 		    unsigned char tag,
 		    const void *value, size_t vlen)
@@ -134,11 +64,8 @@ int hornet_map_hash(void *context, size_t hdrlen,
 	if (vlen != SHA256_DIGEST_SIZE && vlen != 0)
 		return -EINVAL;
 
-	if (vlen) {
-		ctx->skips[ctx->hash_count] = false;
-		memcpy(&ctx->hashes[ctx->hash_count * SHA256_DIGEST_SIZE], value, vlen);
-	} else
-		ctx->skips[ctx->hash_count] = true;
+	memcpy(&ctx->security->signed_hashes[ctx->security->signed_hash_count * SHA256_DIGEST_SIZE],
+	       value, vlen);
 
 	return 0;
 }
@@ -147,7 +74,6 @@ static int hornet_check_program(struct bpf_prog *prog, union bpf_attr *attr,
 				struct bpf_token *token, bool is_kernel,
 				enum lsm_integrity_verdict *verdict)
 {
-	struct hornet_maps maps = {0};
 	bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
 	struct pkcs7_message *msg;
 	struct hornet_parse_context *ctx;
@@ -172,7 +98,8 @@ static int hornet_check_program(struct bpf_prog *prog, union bpf_attr *attr,
 	if (!ctx)
 		return -ENOMEM;
 
-	maps.fd_array = make_bpfptr(attr->fd_array, is_kernel);
+	ctx->security = hornet_bpf_prog_security(prog);
+
 	sig = kzalloc(attr->signature_size, GFP_KERNEL);
 	if (!sig) {
 		err = -ENOMEM;
@@ -225,7 +152,7 @@ static int hornet_check_program(struct bpf_prog *prog, union bpf_attr *attr,
 		goto cleanup_msg;
 	}
 
-	*verdict = hornet_verify_hashes(&maps, ctx, prog);
+	*verdict = LSM_INT_VERDICT_OK;
 	err = 0;
 
 cleanup_msg:
@@ -257,10 +184,8 @@ static int hornet_bpf_prog_load_integrity(struct bpf_prog *prog, union bpf_attr
 						     &hornet_lsmid, verdict);
 }
 
-static int hornet_check_prog_maps(u32 ufd)
+static int hornet_check_prog_maps(struct bpf_prog *prog)
 {
-	CLASS(fd, f)(ufd);
-	struct bpf_prog *prog;
 	struct hornet_prog_security_struct *security;
 	unsigned char hash[SHA256_DIGEST_SIZE];
 	struct bpf_map *map;
@@ -268,12 +193,6 @@ static int hornet_check_prog_maps(u32 ufd)
 	bool found;
 	int covered_count = 0;
 
-	if (fd_empty(f))
-		return -EBADF;
-	if (fd_file(f)->f_op != &bpf_prog_fops)
-		return -EINVAL;
-
-	prog = fd_file(f)->private_data;
 	security = hornet_bpf_prog_security(prog);
 
 	if (!security->signed_hash_count)
@@ -316,26 +235,14 @@ static int hornet_check_prog_maps(u32 ufd)
 	return 0;
 }
 
-static int hornet_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel)
+static int hornet_bpf_prog(struct bpf_prog *prog)
 {
-	/* in horent_bpf(), anything that had originated from kernel space we assume
-	 * has already been checked, in some form or another, so we don't bother
-	 * checking the intergity of any maps. In hornet_bpf_prog_load_integrity(),
-	 * hornet doesn't make any opinion on that and delegates that to the downstream
-	 * policy enforcement.
-	 */
-
-	if (cmd != BPF_PROG_RUN)
-		return 0;
-	if (kernel)
-		return 0;
-
-	return hornet_check_prog_maps(attr->test.prog_fd);
+	return hornet_check_prog_maps(prog);
 }
 
 static struct security_hook_list hornet_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf_prog_load_integrity, hornet_bpf_prog_load_integrity),
-	LSM_HOOK_INIT(bpf, hornet_bpf),
+	LSM_HOOK_INIT(bpf_prog, hornet_bpf_prog),
 };
 
 static int __init hornet_init(void)
diff --git a/tools/testing/selftests/hornet/Makefile b/tools/testing/selftests/hornet/Makefile
index 432bce59f54e7..316364f95f28c 100644
--- a/tools/testing/selftests/hornet/Makefile
+++ b/tools/testing/selftests/hornet/Makefile
@@ -51,7 +51,7 @@ $(OUTPUT)/gen_sig: ../../../../scripts/hornet/gen_sig.c
 
 sig.bin: insn.bin map.bin $(OUTPUT)/gen_sig
 	$(OUTPUT)/gen_sig --key $(CERTDIR)/signing_key.pem --cert $(CERTDIR)/signing_key.x509 \
-		--data insn.bin --add map.bin:0 --out sig.bin
+		--data insn.bin --add map.bin --out sig.bin
 
 signed_loader.h: sig.bin
 	$(SCRIPTSDIR)/write-sig.sh loader.h sig.bin > $@
-- 
2.53.0


^ permalink raw reply related

* [PATCH 00/11] hornet: security, tooling and selftest fixes
From: Blaise Boscaccy @ 2026-05-28  3:08 UTC (permalink / raw)
  To: Jonathan Corbet, Shuah Khan, Paul Moore, James Morris,
	Serge E. Hallyn, Eric Biggers, Fan Wu, James.Bottomley,
	Blaise Boscaccy, linux-security-module

Patch 1 closes a TOCTOU race in signature verification. Map
contents were hashed at the program-load hook and re-hashed at
the program-run hook, leaving a window in which a sufficiently
privileged attacker could mutate a map between the two checks
and run a program whose maps no longer matched what was signed.
The fix records the verified hashes on the prog at load time
and, in security_bpf_prog, checks them against
prog->aux->used_maps — the same map set the verifier and
runtime resolve against — so the verified and executed sets
cannot diverge. The per-map index in the signature format is no
longer needed and is dropped; the check becomes a subset test.
Reported by Eric Biggers.

Patches 2-3 fix two counting bugs in the same area: duplicate maps
could satisfy the required hash count, and an off-by-one capped
accepted maps at MAX_USED_MAPS.

Patches 4-11 are in response to sashiko feedback found here:
https://sashiko.dev/#/patchset/20260507191416.2984054-1-bboscaccy%40linux.microsoft.com

They provide some correctness fixes in the hornet tooling along with
making the selftest behave under cross-compilation and skip cleanly
when signing keys / bpftool / vmlinux BTF are unavailable, instead of
breaking the global selftest build.

Blaise Boscaccy (11):
  hornet: fix TOCTOU in signed program verification
  hornet: invert map set check logic
  hornet: fix off-by-one bug in max used maps check
  selftests: hornet: handle cross compilation and test skipping
  hornet: gen_sig: fix off-by-one check for used maps
  hornet: gen_sig: fix error string allocations
  hornet: gen_sig: check for bad allocations
  hornet: gen_sig: fix missing command line switches
  hornet: scripts: set a non-zero error code for usage
  hornet: scripts: harden scripts to handle trailing whitespace
  hornet: scripts: Improve argument handling and error messages

 Documentation/admin-guide/LSM/Hornet.rst |  39 +++---
 scripts/hornet/extract-insn.sh           |  24 ++--
 scripts/hornet/extract-map.sh            |  25 ++--
 scripts/hornet/extract-skel.sh           |  35 ++++--
 scripts/hornet/gen_sig.c                 |  61 ++++++----
 scripts/hornet/write-sig.sh              |  10 +-
 security/hornet/hornet.asn1              |   1 -
 security/hornet/hornet_lsm.c             | 148 ++++-------------------
 tools/testing/selftests/hornet/Makefile  | 114 +++++++++++++----
 9 files changed, 235 insertions(+), 222 deletions(-)

-- 
2.53.0

^ permalink raw reply

* [PATCH net v2] netlabel: validate unlabeled mask attribute length
From: Chenguang Zhao @ 2026-05-28  1:59 UTC (permalink / raw)
  To: Paul Moore, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Chenguang Zhao, Simon Horman, netdev, linux-security-module

netlbl_unlabel_addrinfo_get() checked the address length
but allowed shorter mask attributes to pass through to
fixed-size address reads.

netlbl_unlabel_addrinfo_get() only rejected a mask
length mismatch when the address attribute length
was also invalid.  A crafted Generic Netlink request
could therefore provide a valid IPv4/IPv6 address
attribute with a shorter mask attribute.

NLA_BINARY policy lengths are maximum lengths,
not exact lengths, so the short mask can pass
policy validation.  The mask is later read as
a full struct in_addr or struct in6_addr.
Require both address and mask attributes to
have the exact expected size.

Fixes: 8cc44579d1bd ("NetLabel: Introduce static network labels for unlabeled connections")
Signed-off-by: Chenguang Zhao <zhaochenguang@kylinos.cn>
---
v2:
 - Adjust commit message.
 - Add Fixes and 'net' subject prefix.
v1:
 https://lore.kernel.org/all/20260522054521.1169755-1-zhaochenguang@kylinos.cn/
---
 net/netlabel/netlabel_unlabeled.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index ca7a9e2a3de7..c1b7e0061886 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -762,8 +762,9 @@ static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
 	if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
 	    info->attrs[NLBL_UNLABEL_A_IPV4MASK]) {
 		addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
-		if (addr_len != sizeof(struct in_addr) &&
-		    addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
+		if (addr_len != sizeof(struct in_addr) ||
+		    nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]) !=
+		    sizeof(struct in_addr))
 			return -EINVAL;
 		*len = addr_len;
 		*addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
@@ -771,8 +772,9 @@ static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
 		return 0;
 	} else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
 		addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
-		if (addr_len != sizeof(struct in6_addr) &&
-		    addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
+		if (addr_len != sizeof(struct in6_addr) ||
+		    nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]) !=
+		    sizeof(struct in6_addr))
 			return -EINVAL;
 		*len = addr_len;
 		*addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v3] security: Expand task_setscheduler LSM hook to include CPU affinity mask
From: Aaron Tomlin @ 2026-05-28  1:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tsbogend, paul, jmorris, serge, mingo, juri.lelli,
	vincent.guittot, stephen.smalley.work, casey, longman, tj, hannes,
	mkoutny, chenridong, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, kprateek.nayak, omosnace, kees, neelx, sean, chjohnst,
	steve, mproche, nick.lange, cgroups, linux-mips, linux-fsdevel,
	linux-security-module, selinux, linux-kernel
In-Reply-To: <20260527195858.GC3493090@noisy.programming.kicks-ass.net>

[-- Attachment #1: Type: text/plain, Size: 3314 bytes --]

On Wed, May 27, 2026 at 09:58:58PM +0200, Peter Zijlstra wrote:
> On Wed, May 27, 2026 at 01:41:52PM -0400, Aaron Tomlin wrote:
> 
> > > > The actual use case here is multi-tenant workload isolation and visibility.
> > > > Passing the evaluated cpumask to the BPF LSM allows operators to write a
> > > > simple eBPF program to detect spatial boundary overlaps (e.g., logging an
> > > > event if a requested mask intersects with platform-reserved cores).
> 
> Why isn't cgroups good enough to enforce this? If you create a cgroup
> hierarchy per tenant, and constrain them using the cpuset controller,
> they should not be able to escape, rendering this event impossible.

Hi Peter,

You raise a very fair point. The cpuset cgroup controller is indeed the
kernel's primary vehicle for spatial enforcement, and under normal
circumstances, it successfully prevents a tenant from escaping their
designated cores.

The cpuset controller does govern resource limits, but does not audit
intent. When __sched_setaffinity() is invoked, the kernel compares the
requested in_mask against the task's allowed cpuset. If there is only a
partial intersection, the kernel silently truncates the requested mask to
fit the cpuset, without raising any alarm.

The BPF LSM hook, conversely, receives the raw, untruncated in_mask,
affording operators the visibility to detect, audit, and even reject these
violations of intent before the kernel silently sanitises the input.

This patch does not seek to replace the cpuset controller, but rather to
complement it by providing auditing capabilities.

> > We are not creating a bespoke BPF hook here; rather, we are rectifying a
> > historical blind spot within the API. The existing LSM hook is invoked
> > during sched_setaffinity(), yet it presently receives only the task_struct
> > pointer. Consequently, the security module is essentially asked, "Should
> > Process A be permitted to alter Process B's affinity?" without being
> > informed of the proposed affinity itself. Providing in_mask simply
> > furnishes the existing hook with the requisite payload to make an informed
> > decision.
> 
> It occurs to me that this same argument would require to also pass in
> the new sched_attr, no? That way the LSM can inspect the new policy
> before it becomes effective.

I agree, the underlying logic does indeed extend perfectly to sched_attr.

Presently, the LSM is equally oblivious as to whether a process is
requesting a benign transition to SCHED_BATCH, or attempting to escalate
its privileges by requesting a real-time policy such as SCHED_FIFO with
maximum priority. Just as with the CPU mask, providing the sched_attr
payload would rectify this parallel blind spot, allowing BPF policies to
inspect and mediate scheduling attributes before they become effective.

If you are amenable, I should be more than happy to expand the scope of the
forthcoming patch to include this. Alternatively, we could address the
sched_attr expansion in a separate, subsequent patch. Personally, I would
favour the latter approach, but please do let me know your preference.

I very much look forward to hearing Paul's thoughts on whether this aligns
with the broader LSM vision.

Thank you.

Kind regards,
-- 
Aaron Tomlin

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH 3/3] apparmor: replace get_zeroed_page() with kzalloc()
From: Paul Moore @ 2026-05-27 23:42 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft), James Morris, John Johansen,
	Ondrej Mosnacek, Serge E. Hallyn, Stephen Smalley
  Cc: Mike Rapoport, apparmor, selinux, linux-kernel, linux-mm,
	linux-security-module
In-Reply-To: <20260520-security-v1-3-831bd8e21dd0@kernel.org>

On May 20, 2026 "Mike Rapoport (Microsoft)" <rppt@kernel.org> wrote:
> 
> multi_transaction_new() allocates memory with get_zeroed_page() and uses
> it as struct multi_transaction.
> 
> The usage of that structure does not require struct page access and it is
> better to allocate multi_transaction objects with kzalloc() that provides
> better scalability and more debugging possibilities.
> 
> Replace use of get_zeroed_page() with kzalloc().
> 
> Link: https://lore.kernel.org/all/635405e4-9423-4a25-a6e7-e03c8ea0bcbe@redhat.com
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  security/apparmor/apparmorfs.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)

I'll leave this for John and/or Georgia to review and ultimately decide
to merge, but it looks okay to me.

Reviewed-by: Paul Moore <paul@paul-moore.com>

--
paul-moore.com

^ permalink raw reply

* Re: [PATCH 2/3] selinux: hooks: use __getname() to allocate path  buffer
From: Paul Moore @ 2026-05-27 23:42 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft), James Morris, John Johansen,
	Ondrej Mosnacek, Serge E. Hallyn, Stephen Smalley
  Cc: Mike Rapoport, apparmor, selinux, linux-kernel, linux-mm,
	linux-security-module
In-Reply-To: <20260520-security-v1-2-831bd8e21dd0@kernel.org>

On May 20, 2026 "Mike Rapoport (Microsoft)" <rppt@kernel.org> wrote:
> 
> selinux_genfs_get_sid() allocates memory for a path with __get_free_page()
> although there is a dedicated helper for allocation of file paths:
> __getname().
> 
> Replace __get_free_page() for allocation of a path buffer with __getname().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  security/selinux/hooks.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

Merged into selinux/dev, thanks.

--
paul-moore.com

^ permalink raw reply

* Re: [PATCH 1/3] selinux: use k[mz]alloc() to allocate temporary  buffers
From: Paul Moore @ 2026-05-27 23:42 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft), James Morris, John Johansen,
	Ondrej Mosnacek, Serge E. Hallyn, Stephen Smalley
  Cc: Mike Rapoport, apparmor, selinux, linux-kernel, linux-mm,
	linux-security-module
In-Reply-To: <20260520-security-v1-1-831bd8e21dd0@kernel.org>

On May 20, 2026 "Mike Rapoport (Microsoft)" <rppt@kernel.org> wrote:
> 
> Several functions in selinuxfs.c allocate temporary buffers using
> __get_free_page() or get_zeroed_page().
> 
> These buffers are used either to store a string generated by snprintf() (in
> sel_make_bools()) or to copy data from user (sel_read_avc_hash_stats() and
> sel_read_sidtab_hash_stats()).
> 
> Such usage does not require struct page access and it is better to allocate
> these buffers with kzalloc()/kmalloc() that provide better scalability and
> more debugging possibilities.
> 
> Replace use of get_zeroed_page() with kzalloc() and usage of
> __get_free_page() with kmalloc().
> 
> Link: https://lore.kernel.org/all/635405e4-9423-4a25-a6e7-e03c8ea0bcbe@redhat.com
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  security/selinux/selinuxfs.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)

I suspect if we look closer we can probably also trim some of those
allocations to less than a page, but that can be work for another day.

Merged into selinux/dev, thanks Mike.

--
paul-moore.com

^ permalink raw reply

* Re: [PATCH v4 1/7] lsm: Add granular mount hooks to replace security_sb_mount
From: Song Liu @ 2026-05-27 21:08 UTC (permalink / raw)
  To: Christian Brauner, paul
  Cc: linux-security-module, linux-fsdevel, selinux, apparmor, jmorris,
	serge, viro, jack, john.johansen, stephen.smalley.work, omosnace,
	mic, gnoack, takedakn, penguin-kernel, herton, kernel-team
In-Reply-To: <20260527-landen-bahnfahren-eckpfeiler-c1e1e9cb73aa@brauner>

On Wed, May 27, 2026 at 5:17 AM Christian Brauner <brauner@kernel.org> wrote:
[...]
> > 1/7 adds new hooks:
> >   lsm: Add granular mount hooks to replace security_sb_mount
> > 2/7 through 6/7 migrate LSMs from old hooks to new hooks:
> >   apparmor: Remove redundant MS_MGC_MSK stripping in apparmor_sb_mount
> >   apparmor: Convert from sb_mount to granular mount hooks
> >   selinux: Convert from sb_mount to granular mount hooks
> >   landlock: Convert from sb_mount to granular mount hooks
> >   tomoyo: Convert from sb_mount to granular mount hooks
> > 7/7 removes old hooks:
> >   lsm: Remove security_sb_mount and security_move_mount
> >
> > Some ideas to change this:
>
> My thought had been:
>
> * Add the new hooks to security/.
> * add the individual lsm implementations.
> * Now replace the old hooks with the new hooks in fs/namespace.c
> * Delete the old hooks in security/
>
> IOW, why the migration step? It is a full replacement anyway.

I think having a migration like this doesn't really make
review more difficult. But I am OK refactoring the patches
as requested.

Paul, do you have a strong preference either way?

Thanks,
Song

^ permalink raw reply

* Re: [PATCH v3] security: Expand task_setscheduler LSM hook to include CPU affinity mask
From: Peter Zijlstra @ 2026-05-27 19:58 UTC (permalink / raw)
  To: Aaron Tomlin
  Cc: tsbogend, paul, jmorris, serge, mingo, juri.lelli,
	vincent.guittot, stephen.smalley.work, casey, longman, tj, hannes,
	mkoutny, chenridong, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, kprateek.nayak, omosnace, kees, neelx, sean, chjohnst,
	steve, mproche, nick.lange, cgroups, linux-mips, linux-fsdevel,
	linux-security-module, selinux, linux-kernel
In-Reply-To: <ov33cu2wosubbfufcmfyoinfatecskjgmkvqyit33komlcla2d@2qgj45724bql>

[-- Attachment #1: Type: text/plain, Size: 2572 bytes --]

On Wed, May 27, 2026 at 01:41:52PM -0400, Aaron Tomlin wrote:

> > > The actual use case here is multi-tenant workload isolation and visibility.
> > > Passing the evaluated cpumask to the BPF LSM allows operators to write a
> > > simple eBPF program to detect spatial boundary overlaps (e.g., logging an
> > > event if a requested mask intersects with platform-reserved cores).

Why isn't cgroups good enough to enforce this? If you create a cgroup
hierarchy per tenant, and constrain them using the cpuset controller,
they should not be able to escape, rendering this event impossible.

> > > If this justification makes more sense, I will focus strictly on the
> > > seccomp pointer limitations and multi-tenant workload isolation.
> > 
> > I suppose it does, my only remaining question is if that is indeed
> > proper use of LSM -- I really don't know much about that.
> > 
> 
> We are not creating a bespoke BPF hook here; rather, we are rectifying a
> historical blind spot within the API. The existing LSM hook is invoked
> during sched_setaffinity(), yet it presently receives only the task_struct
> pointer. Consequently, the security module is essentially asked, "Should
> Process A be permitted to alter Process B's affinity?" without being
> informed of the proposed affinity itself. Providing in_mask simply
> furnishes the existing hook with the requisite payload to make an informed
> decision.

It occurs to me that this same argument would also require passing in
the new sched_attr, no? That way the LSM can inspect the new policy
before it becomes effective.

> Were the objective solely one of observability, a tracepoint would indeed
> be the most suitable mechanism. However, if the aim within multi-tenant
> environments is active enforcement (namely, safely returning -EPERM to deny
> the pinning request before the scheduler applies it), the LSM layer remains
> the standard, architecturally supported gateway for returning syscall
> errors in accordance with administrative policy.

Indeed; but being constrained in a cpuset cgroup would result in the
same, no?

> I shall defer to Paul Moore and the LSM maintainers for their final
> blessing on the LSM API semantics.

Yes, I think that this is an interesting test-case of the LSM purpose.

You seem to be mostly aiming at resource control, something that is
traditionally done elsewhere.

> Thank you once again for the thorough review and for keeping the
> architectural boundaries honest.

No problem, just trying to understand myself ;-)

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH v2 7/9] selftests/landlock: Add capability restriction tests
From: Mickaël Salaün @ 2026-05-27 18:11 UTC (permalink / raw)
  To: Christian Brauner, Günther Noack, Paul Moore,
	Serge E . Hallyn
  Cc: Mickaël Salaün, Daniel Durning, Jonathan Corbet,
	Justin Suess, Lennart Poettering, Mikhail Ivanov,
	Nicolas Bouchinet, Shervin Oloumi, Tingmao Wang, kernel-team,
	linux-fsdevel, linux-kernel, linux-security-module
In-Reply-To: <20260527181127.879771-1-mic@digikod.net>

Add tests to exercise LANDLOCK_PERM_CAPABILITY_USE enforcement.  A
sandboxed process is denied a handled capability when no rule grants it,
and an explicit rule restores the capability.  Unknown capability values
above CAP_LAST_CAP are silently accepted at rule-add time but have no
runtime effect, so deny-by-default still applies once the domain is
enforced.  Stacking variants cover the three per-layer combinations that
exercise distinct walker paths (allow/deny, allow/allow, deny/allow)
plus a mixed-layer case where one layer does not handle
LANDLOCK_PERM_CAPABILITY_USE, forcing the walker to skip it.  Invalid
rule attributes (unknown flags, out-of-range values) return the expected
errors.

Two tests exercise non-standard capability gain paths.  The first
enforces a domain via CAP_SYS_ADMIN (no_new_privs is not set) and
verifies that denied capabilities are blocked even when still in the
effective set.  The second creates a user namespace under a Landlock
domain to verify that capabilities gained through the kernel's user
namespace ownership bypass (cap_capable_helper) are still restricted by
the domain's rules.

Audit tests verify that a denied capability produces the expected audit
record with the capability number, and that an allowed capability
generates no denial record.

Test coverage for security/landlock is 91.6% of 2398 lines according to
LLVM 22.

Cc: Christian Brauner <brauner@kernel.org>
Cc: Günther Noack <gnoack@google.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---

Changes since v1:
https://lore.kernel.org/r/20260312100444.2609563-8-mic@digikod.net
- Reflow comments after check-linux.sh comment fixes.
- Rename LANDLOCK_PERM_NAMESPACE_ENTER references to
  LANDLOCK_PERM_NAMESPACE_USE and bump the abi_version expectation
  to 11 (companion changes to the introducing commit).
- Add add_rule_unknown_no_runtime_effect: assert that a rule listing
  only unknown capability bits is accepted at rule-add time but has
  no runtime effect, so an actual CAP_* exercise (sethostname with
  CAP_SYS_ADMIN) is still denied by deny-by-default once the domain
  is enforced.
- Add cap_stacking parent_denies variant covering the inverse
  direction of stacking: layer 1 denies CAP_SYS_ADMIN, layer 2
  allows, capability still denied.  Completes the per-layer walker
  direction coverage.
- Assert records.domain == 0 in cap_audit.allowed so the test also
  checks that no domain-allocation record is emitted when nothing
  is denied.
---
 tools/testing/selftests/landlock/base_test.c |  18 +
 tools/testing/selftests/landlock/cap_test.c  | 673 +++++++++++++++++++
 2 files changed, 691 insertions(+)
 create mode 100644 tools/testing/selftests/landlock/cap_test.c

diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 6c8113c2ded1..2329513d1765 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -142,6 +142,24 @@ TEST(errata)
 	ASSERT_EQ(EINVAL, errno);
 }
 
+#define PERM_LAST LANDLOCK_PERM_CAPABILITY_USE
+
+TEST(ruleset_with_unknown_perm)
+{
+	__u64 perm_mask;
+
+	for (perm_mask = 1ULL << 63; perm_mask != PERM_LAST; perm_mask >>= 1) {
+		struct landlock_ruleset_attr ruleset_attr = {
+			.handled_perm = perm_mask,
+		};
+
+		/* Unknown handled_perm values must be rejected. */
+		ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr,
+						      sizeof(ruleset_attr), 0));
+		ASSERT_EQ(EINVAL, errno);
+	}
+}
+
 /* Tests ordering of syscall argument checks. */
 TEST(create_ruleset_checks_ordering)
 {
diff --git a/tools/testing/selftests/landlock/cap_test.c b/tools/testing/selftests/landlock/cap_test.c
new file mode 100644
index 000000000000..317dbf9d1962
--- /dev/null
+++ b/tools/testing/selftests/landlock/cap_test.c
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - Capability restriction
+ *
+ * Copyright © 2026 Cloudflare
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/capability.h>
+#include <linux/landlock.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "audit.h"
+#include "common.h"
+
+static int create_cap_ruleset(void)
+{
+	const struct landlock_ruleset_attr attr = {
+		.handled_perm = LANDLOCK_PERM_CAPABILITY_USE,
+	};
+
+	return landlock_create_ruleset(&attr, sizeof(attr), 0);
+}
+
+static int add_cap_rule(int ruleset_fd, __u64 cap)
+{
+	const struct landlock_capability_attr attr = {
+		.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE,
+		.capabilities = (1ULL << cap),
+	};
+
+	return landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY, &attr,
+				 0);
+}
+
+TEST(add_rule_bad_attr)
+{
+	const struct landlock_ruleset_attr ns_only_attr = {
+		.handled_perm = LANDLOCK_PERM_NAMESPACE_USE,
+	};
+	int ruleset_fd;
+	struct landlock_capability_attr attr = {};
+
+	ruleset_fd = create_cap_ruleset();
+	ASSERT_LE(0, ruleset_fd);
+
+	/* Empty allowed_perm returns ENOMSG (useless deny rule). */
+	attr.allowed_perm = 0;
+	attr.capabilities = (1ULL << CAP_NET_RAW);
+	ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+					&attr, 0));
+	ASSERT_EQ(ENOMSG, errno);
+
+	/* Useless rule: empty capabilities bitmask. */
+	attr.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE;
+	attr.capabilities = 0;
+	ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+					&attr, 0));
+	ASSERT_EQ(ENOMSG, errno);
+
+	/* allowed_perm with unhandled bit. */
+	attr.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE |
+			    LANDLOCK_PERM_NAMESPACE_USE;
+	attr.capabilities = (1ULL << CAP_NET_RAW);
+	ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+					&attr, 0));
+	ASSERT_EQ(EINVAL, errno);
+
+	/* allowed_perm with wrong type. */
+	attr.allowed_perm = LANDLOCK_PERM_NAMESPACE_USE;
+	attr.capabilities = (1ULL << CAP_NET_RAW);
+	ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+					&attr, 0));
+	ASSERT_EQ(EINVAL, errno);
+
+	/*
+	 * Unknown capability bits (e.g. bit 63) are silently accepted for
+	 * forward compatibility.  Only known bits are stored.
+	 */
+	attr.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE;
+	attr.capabilities = 1ULL << 63;
+	ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+				       &attr, 0));
+
+	/* Non-zero flags must be rejected. */
+	attr.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE;
+	attr.capabilities = (1ULL << CAP_NET_RAW);
+	ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+					&attr, 1));
+	ASSERT_EQ(EINVAL, errno);
+
+	EXPECT_EQ(0, close(ruleset_fd));
+
+	/*
+	 * Ruleset handles PERM_NAMESPACE_USE but not PERM_CAPABILITY_USE:
+	 * adding a capability rule must be rejected.
+	 */
+	ruleset_fd =
+		landlock_create_ruleset(&ns_only_attr, sizeof(ns_only_attr), 0);
+	ASSERT_LE(0, ruleset_fd);
+	attr.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE;
+	attr.capabilities = (1ULL << CAP_NET_RAW);
+	ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+					&attr, 0));
+	ASSERT_EQ(EINVAL, errno);
+	EXPECT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * Unknown capability values above CAP_LAST_CAP are silently accepted
+ * (allow-list: they have no effect since the kernel never checks them).
+ */
+TEST(add_rule_unknown)
+{
+	int ruleset_fd;
+	struct landlock_capability_attr attr = {
+		.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE,
+	};
+
+	ruleset_fd = create_cap_ruleset();
+	ASSERT_LE(0, ruleset_fd);
+
+	/* Just above CAP_LAST_CAP should succeed. */
+	attr.capabilities = (1ULL << (CAP_LAST_CAP + 1));
+	ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+				       &attr, 0));
+
+	/* High values (below bit 63) should succeed. */
+	attr.capabilities = (1ULL << 62);
+	ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+				       &attr, 0));
+
+	EXPECT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * A rule that lists only capability bits unknown to the running kernel is
+ * accepted by landlock_add_rule() but has no runtime effect: once the domain is
+ * enforced, any actual CAP_* capability is still denied by the per-category
+ * deny-by-default behaviour.  This documents the forward-compatibility
+ * contract: unknown bits are silently accepted so the same policy can be loaded
+ * across kernels, but they never grant a capability that the running kernel
+ * knows nothing about.
+ */
+TEST(add_rule_unknown_no_runtime_effect)
+{
+	const struct landlock_ruleset_attr ruleset_attr = {
+		.handled_perm = LANDLOCK_PERM_CAPABILITY_USE,
+	};
+	struct landlock_capability_attr attr = {
+		.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE,
+		/* Only unknown bits above CAP_LAST_CAP. */
+		.capabilities = (1ULL << (CAP_LAST_CAP + 1)) | (1ULL << 62),
+	};
+	int ruleset_fd;
+
+	disable_caps(_metadata);
+
+	ruleset_fd =
+		landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
+	ASSERT_LE(0, ruleset_fd);
+
+	ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+				       &attr, 0));
+
+	enforce_ruleset(_metadata, ruleset_fd);
+	EXPECT_EQ(0, close(ruleset_fd));
+
+	/*
+	 * CAP_SYS_ADMIN is a real, known capability but was not authorised by
+	 * the rule above; deny-by-default applies.  sethostname(2) requires
+	 * CAP_SYS_ADMIN.
+	 */
+	set_cap(_metadata, CAP_SYS_ADMIN);
+	EXPECT_EQ(-1, sethostname("test", 4));
+	EXPECT_EQ(EPERM, errno);
+	clear_cap(_metadata, CAP_SYS_ADMIN);
+}
+
+/* clang-format off */
+FIXTURE(cap_enforce) {};
+/* clang-format on */
+
+FIXTURE_VARIANT(cap_enforce)
+{
+	const bool is_sandboxed;
+	const bool handle_caps;
+	const __u64 allowed_cap;
+	const int expected_sysadmin;
+	const int expected_chroot;
+};
+
+/*
+ * Unsandboxed baseline: no Landlock domain is enforced.  Both capabilities
+ * should work normally.
+ */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_enforce, unsandboxed) {
+	/* clang-format on */
+	.is_sandboxed = false,	.handle_caps = false, .allowed_cap = 0,
+	.expected_sysadmin = 0, .expected_chroot = 0,
+};
+
+/*
+ * Denied: capabilities are handled but no rule allows them.  All capability
+ * checks must be denied by Landlock even if the capability is effective.
+ */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_enforce, denied) {
+	/* clang-format on */
+	.is_sandboxed = true,	    .handle_caps = true,      .allowed_cap = 0,
+	.expected_sysadmin = EPERM, .expected_chroot = EPERM,
+};
+
+/*
+ * Allowed: CAP_SYS_ADMIN is allowed by rule, CAP_SYS_CHROOT is not.  Only the
+ * explicitly allowed capability should succeed.
+ */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_enforce, allowed) {
+	/* clang-format on */
+	.is_sandboxed = true,	      .handle_caps = true,
+	.allowed_cap = CAP_SYS_ADMIN, .expected_sysadmin = 0,
+	.expected_chroot = EPERM,
+};
+
+/*
+ * Unhandled: the ruleset does not handle LANDLOCK_PERM_CAPABILITY_USE at all
+ * (only handles FS access).  Both capabilities should work since the domain
+ * does not restrict them.
+ */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_enforce, unhandled) {
+	/* clang-format on */
+	.is_sandboxed = true,	.handle_caps = false, .allowed_cap = 0,
+	.expected_sysadmin = 0, .expected_chroot = 0,
+};
+
+FIXTURE_SETUP(cap_enforce)
+{
+	disable_caps(_metadata);
+}
+
+FIXTURE_TEARDOWN(cap_enforce)
+{
+}
+
+/*
+ * Capability enforcement: tests the four fundamental enforcement scenarios
+ * (unsandboxed baseline, denied, allowed, unhandled) using two independent
+ * capability checks (sethostname for CAP_SYS_ADMIN, chroot for CAP_SYS_CHROOT).
+ */
+TEST_F(cap_enforce, use)
+{
+	int ruleset_fd;
+
+	/* Isolate hostname changes from other tests. */
+	set_cap(_metadata, CAP_SYS_ADMIN);
+	ASSERT_EQ(0, unshare(CLONE_NEWUTS));
+	clear_cap(_metadata, CAP_SYS_ADMIN);
+
+	if (variant->is_sandboxed) {
+		if (variant->handle_caps) {
+			ruleset_fd = create_cap_ruleset();
+		} else {
+			const struct landlock_ruleset_attr attr = {
+				.handled_access_fs =
+					LANDLOCK_ACCESS_FS_READ_FILE,
+			};
+
+			ruleset_fd =
+				landlock_create_ruleset(&attr, sizeof(attr), 0);
+		}
+		ASSERT_LE(0, ruleset_fd);
+
+		if (variant->allowed_cap)
+			ASSERT_EQ(0, add_cap_rule(ruleset_fd,
+						  variant->allowed_cap));
+
+		enforce_ruleset(_metadata, ruleset_fd);
+		EXPECT_EQ(0, close(ruleset_fd));
+	}
+
+	/* Test CAP_SYS_ADMIN via sethostname. */
+	set_cap(_metadata, CAP_SYS_ADMIN);
+	if (variant->expected_sysadmin) {
+		EXPECT_EQ(-1, sethostname("test", 4));
+		EXPECT_EQ(variant->expected_sysadmin, errno);
+	} else {
+		EXPECT_EQ(0, sethostname("test", 4));
+	}
+	clear_cap(_metadata, CAP_SYS_ADMIN);
+
+	/* Test CAP_SYS_CHROOT via chroot. */
+	set_cap(_metadata, CAP_SYS_CHROOT);
+	if (variant->expected_chroot) {
+		EXPECT_EQ(-1, chroot("/"));
+		EXPECT_EQ(variant->expected_chroot, errno);
+	} else {
+		EXPECT_EQ(0, chroot("/"));
+	}
+}
+
+/*
+ * Layer stacking: both layers must allow CAP_SYS_ADMIN for the capability to be
+ * exercisable.  Variants cover the three per-layer combinations that exercise
+ * distinct walker paths (allow/deny, allow/allow, deny/allow), an unsandboxed
+ * baseline, and a mixed-layer case where one layer does not handle
+ * PERM_CAPABILITY_USE at all.
+ */
+/* clang-format off */
+FIXTURE(cap_stacking) {};
+/* clang-format on */
+
+FIXTURE_VARIANT(cap_stacking)
+{
+	const bool is_sandboxed;
+	const bool first_layer_allows;
+	const bool second_layer_allows;
+	const bool second_layer_is_fs_only;
+	const int expected_sysadmin;
+	const int expected_chroot;
+};
+
+/*
+ * Unsandboxed baseline: no Landlock layers are stacked.  Both capabilities
+ * should work normally.
+ */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_stacking, unsandboxed) {
+	/* clang-format on */
+	.is_sandboxed = false,	      .first_layer_allows = false,
+	.second_layer_allows = false, .expected_sysadmin = 0,
+	.expected_chroot = 0,
+};
+
+/* Layer 1 allows CAP_SYS_ADMIN, layer 2 denies -> denied. */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_stacking, deny) {
+	/* clang-format on */
+	.is_sandboxed = true,	      .first_layer_allows = true,
+	.second_layer_allows = false, .expected_sysadmin = EPERM,
+	.expected_chroot = EPERM,
+};
+
+/* Both layers allow CAP_SYS_ADMIN -> sysadmin succeeds, chroot still denied. */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_stacking, allow) {
+	/* clang-format on */
+	.is_sandboxed = true,	     .first_layer_allows = true,
+	.second_layer_allows = true, .expected_sysadmin = 0,
+	.expected_chroot = EPERM,
+};
+
+/*
+ * Layer 1 denies CAP_SYS_ADMIN, layer 2 allows -> still denied: a child layer
+ * cannot grant what an ancestor layer withheld.  Complements the
+ * parent-allows/child-denies variant; together they verify the walker checks
+ * both layers and accepts only the (allow, allow) cell.
+ */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_stacking, parent_denies) {
+	/* clang-format on */
+	.is_sandboxed = true,	     .first_layer_allows = false,
+	.second_layer_allows = true, .expected_sysadmin = EPERM,
+	.expected_chroot = EPERM,
+};
+
+/*
+ * Mixed layers: first layer handles PERM_CAPABILITY_USE (denies all caps),
+ * second layer is FS-only (does not handle it).  The perm walker iterates from
+ * youngest (layer 1) to oldest (layer 0) and must skip the FS-only layer to
+ * find the denying layer beneath.
+ */
+/* clang-format off */
+FIXTURE_VARIANT_ADD(cap_stacking, mixed_layers) {
+	/* clang-format on */
+	.is_sandboxed = true,
+	.first_layer_allows = false,
+	.second_layer_is_fs_only = true,
+	.expected_sysadmin = EPERM,
+	.expected_chroot = EPERM,
+};
+
+FIXTURE_SETUP(cap_stacking)
+{
+	disable_caps(_metadata);
+}
+
+FIXTURE_TEARDOWN(cap_stacking)
+{
+}
+
+TEST_F(cap_stacking, two_layers)
+{
+	int ruleset_fd;
+
+	if (variant->is_sandboxed) {
+		/* First layer: handles PERM_CAPABILITY_USE; rule added per variant. */
+		ruleset_fd = create_cap_ruleset();
+		ASSERT_LE(0, ruleset_fd);
+		if (variant->first_layer_allows)
+			ASSERT_EQ(0, add_cap_rule(ruleset_fd, CAP_SYS_ADMIN));
+
+		enforce_ruleset(_metadata, ruleset_fd);
+		EXPECT_EQ(0, close(ruleset_fd));
+
+		if (variant->second_layer_is_fs_only) {
+			/*
+			 * Second layer: FS-only (does not handle
+			 * PERM_CAPABILITY_USE).  The perm walker must skip this
+			 * layer.
+			 */
+			const struct landlock_ruleset_attr fs_attr = {
+				.handled_access_fs =
+					LANDLOCK_ACCESS_FS_READ_FILE,
+			};
+
+			ruleset_fd = landlock_create_ruleset(
+				&fs_attr, sizeof(fs_attr), 0);
+		} else {
+			/* Second layer: cap allow or deny. */
+			ruleset_fd = create_cap_ruleset();
+			if (variant->second_layer_allows)
+				ASSERT_EQ(0, add_cap_rule(ruleset_fd,
+							  CAP_SYS_ADMIN));
+		}
+		ASSERT_LE(0, ruleset_fd);
+		enforce_ruleset(_metadata, ruleset_fd);
+		EXPECT_EQ(0, close(ruleset_fd));
+	}
+
+	/* Test CAP_SYS_ADMIN via sethostname. */
+	set_cap(_metadata, CAP_SYS_ADMIN);
+	if (variant->expected_sysadmin) {
+		EXPECT_EQ(-1, sethostname("test", 4));
+		EXPECT_EQ(variant->expected_sysadmin, errno);
+	} else {
+		EXPECT_EQ(0, sethostname("test", 4));
+	}
+	clear_cap(_metadata, CAP_SYS_ADMIN);
+
+	/* Test CAP_SYS_CHROOT via chroot. */
+	set_cap(_metadata, CAP_SYS_CHROOT);
+	if (variant->expected_chroot) {
+		EXPECT_EQ(-1, chroot("/"));
+		EXPECT_EQ(variant->expected_chroot, errno);
+	} else {
+		EXPECT_EQ(0, chroot("/"));
+	}
+	clear_cap(_metadata, CAP_SYS_CHROOT);
+}
+
+/*
+ * Verify that LANDLOCK_PERM_CAPABILITY_USE is enforced when the domain is
+ * applied without no_new_privs, using CAP_SYS_ADMIN to authorize
+ * landlock_restrict_self() instead.  Privileged processes (e.g. container
+ * managers) can sandbox themselves this way.
+ */
+TEST(cap_without_nnp)
+{
+	int ruleset_fd;
+
+	disable_caps(_metadata);
+
+	ruleset_fd = create_cap_ruleset();
+	ASSERT_LE(0, ruleset_fd);
+
+	/* Allow CAP_SYS_CHROOT but not CAP_SYS_ADMIN. */
+	ASSERT_EQ(0, add_cap_rule(ruleset_fd, CAP_SYS_CHROOT));
+
+	/*
+	 * Enforce WITHOUT NNP: landlock_restrict_self() succeeds when the
+	 * caller has CAP_SYS_ADMIN (checked before the new domain takes
+	 * effect).
+	 */
+	set_cap(_metadata, CAP_SYS_ADMIN);
+	ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0));
+	EXPECT_EQ(0, close(ruleset_fd));
+
+	/*
+	 * CAP_SYS_ADMIN is still in effective set but Landlock denies it:
+	 * cap_capable() returns 0, then hook_capable() returns -EPERM.
+	 */
+	EXPECT_EQ(-1, sethostname("test", 4));
+	EXPECT_EQ(EPERM, errno);
+
+	/* CAP_SYS_CHROOT is allowed by the rule. */
+	set_cap(_metadata, CAP_SYS_CHROOT);
+	EXPECT_EQ(0, chroot("/"));
+}
+
+/*
+ * Verify that capabilities gained through user namespace ownership are still
+ * restricted by LANDLOCK_PERM_CAPABILITY_USE.  When a process creates a user
+ * namespace, the kernel grants CAP_FULL_SET in the new namespace via
+ * cap_capable_helper()'s ownership bypass.  Landlock's hook_capable() must
+ * still deny capabilities not in the allowed set, ensuring that user namespace
+ * creation cannot be used to escape capability restrictions.
+ */
+TEST(cap_userns_ownership_bypass)
+{
+	pid_t child;
+	int status;
+
+	child = fork();
+	ASSERT_LE(0, child);
+	if (child == 0) {
+		int ruleset_fd;
+
+		disable_caps(_metadata);
+
+		ruleset_fd = create_cap_ruleset();
+		ASSERT_LE(0, ruleset_fd);
+
+		/* Allow CAP_SYS_ADMIN only. */
+		ASSERT_EQ(0, add_cap_rule(ruleset_fd, CAP_SYS_ADMIN));
+		enforce_ruleset(_metadata, ruleset_fd);
+		EXPECT_EQ(0, close(ruleset_fd));
+
+		/*
+		 * Create a user namespace.  This is unprivileged and does not
+		 * require capabilities.  LANDLOCK_PERM_NAMESPACE_USE is not
+		 * handled so namespace creation is unrestricted.
+		 */
+		ASSERT_EQ(0, unshare(CLONE_NEWUSER));
+
+		/*
+		 * After unshare(CLONE_NEWUSER), the kernel set cap_effective =
+		 * CAP_FULL_SET in the new namespace.  Create a UTS namespace
+		 * (requires CAP_SYS_ADMIN in the new user NS).  Landlock allows
+		 * CAP_SYS_ADMIN.
+		 */
+		ASSERT_EQ(0, unshare(CLONE_NEWUTS))
+		{
+			TH_LOG("unshare(CLONE_NEWUTS): %s", strerror(errno));
+		}
+
+		/*
+		 * sethostname checks against uts_ns->user_ns, which is now the
+		 * new user NS.  CAP_SYS_ADMIN is allowed.
+		 */
+		EXPECT_EQ(0, sethostname("test", 4));
+
+		/*
+		 * chroot checks against current_user_ns(), which is the new
+		 * user NS.  The process has CAP_SYS_CHROOT in cap_effective
+		 * (from user NS creation), so cap_capable() returns 0.  But
+		 * Landlock denies because no rule allows CAP_SYS_CHROOT.
+		 */
+		EXPECT_EQ(-1, chroot("/"));
+		EXPECT_EQ(EPERM, errno);
+
+		_exit(_metadata->exit_code);
+		return;
+	}
+
+	ASSERT_EQ(child, waitpid(child, &status, 0));
+	if (WIFSIGNALED(status) || !WIFEXITED(status) ||
+	    WEXITSTATUS(status) != EXIT_SUCCESS)
+		_metadata->exit_code = KSFT_FAIL;
+}
+
+/* Audit tests */
+
+static int matches_log_cap(int audit_fd, int cap_number)
+{
+	static const char log_template[] = REGEX_LANDLOCK_PREFIX
+		" blockers=perm\\.capability_use capability=%d $";
+	char log_match[sizeof(log_template) + 10];
+	int log_match_len;
+
+	log_match_len = snprintf(log_match, sizeof(log_match), log_template,
+				 cap_number);
+	if (log_match_len >= sizeof(log_match))
+		return -E2BIG;
+
+	return audit_match_record(audit_fd, AUDIT_LANDLOCK_ACCESS, log_match,
+				  NULL);
+}
+
+FIXTURE(cap_audit)
+{
+	struct audit_filter audit_filter;
+	int audit_fd;
+};
+
+FIXTURE_SETUP(cap_audit)
+{
+	ASSERT_TRUE(is_in_init_user_ns());
+
+	disable_caps(_metadata);
+
+	set_cap(_metadata, CAP_AUDIT_CONTROL);
+	self->audit_fd = audit_init_with_exe_filter(&self->audit_filter);
+	EXPECT_LE(0, self->audit_fd);
+	clear_cap(_metadata, CAP_AUDIT_CONTROL);
+}
+
+FIXTURE_TEARDOWN(cap_audit)
+{
+	set_cap(_metadata, CAP_AUDIT_CONTROL);
+	EXPECT_EQ(0, audit_cleanup(self->audit_fd, &self->audit_filter));
+}
+
+/*
+ * Verifies that a denied capability produces the expected audit record with the
+ * correct capability number and blocker string.
+ */
+TEST_F(cap_audit, denied)
+{
+	struct audit_records records;
+	int ruleset_fd;
+
+	/* Baseline: chroot works before Landlock. */
+	set_cap(_metadata, CAP_SYS_CHROOT);
+	ASSERT_EQ(0, chroot("/"));
+	clear_cap(_metadata, CAP_SYS_CHROOT);
+
+	ruleset_fd = create_cap_ruleset();
+	ASSERT_LE(0, ruleset_fd);
+	/* Allow CAP_AUDIT_CONTROL for child-side audit cleanup. */
+	ASSERT_EQ(0, add_cap_rule(ruleset_fd, CAP_AUDIT_CONTROL));
+	enforce_ruleset(_metadata, ruleset_fd);
+	EXPECT_EQ(0, close(ruleset_fd));
+
+	/* Deny CAP_SYS_CHROOT (no allow rule). */
+	set_cap(_metadata, CAP_SYS_CHROOT);
+	EXPECT_EQ(-1, chroot("/"));
+	EXPECT_EQ(EPERM, errno);
+	clear_cap(_metadata, CAP_SYS_CHROOT);
+
+	EXPECT_EQ(0, matches_log_cap(self->audit_fd, CAP_SYS_CHROOT));
+
+	/*
+	 * No extra access records: the denial was already consumed by
+	 * matches_log_cap above.  One domain allocation record, emitted in the
+	 * same event as the first access denial for this domain.
+	 */
+	EXPECT_EQ(0, audit_count_records(self->audit_fd, &records));
+	EXPECT_EQ(0, records.access);
+	EXPECT_EQ(1, records.domain);
+}
+
+TEST_F(cap_audit, allowed)
+{
+	struct audit_records records;
+	int ruleset_fd;
+
+	ruleset_fd = create_cap_ruleset();
+	ASSERT_LE(0, ruleset_fd);
+	ASSERT_EQ(0, add_cap_rule(ruleset_fd, CAP_SYS_ADMIN));
+	/* Allow CAP_AUDIT_CONTROL for child-side audit cleanup. */
+	ASSERT_EQ(0, add_cap_rule(ruleset_fd, CAP_AUDIT_CONTROL));
+	enforce_ruleset(_metadata, ruleset_fd);
+	EXPECT_EQ(0, close(ruleset_fd));
+
+	set_cap(_metadata, CAP_SYS_ADMIN);
+	EXPECT_EQ(0, sethostname("test", 4));
+
+	/* No records: allowed operations never trigger audit logging. */
+	EXPECT_EQ(0, audit_count_records(self->audit_fd, &records));
+	EXPECT_EQ(0, records.access);
+	EXPECT_EQ(0, records.domain);
+}
+
+TEST_HARNESS_MAIN
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 5/9] landlock: Enforce capability restrictions
From: Mickaël Salaün @ 2026-05-27 18:11 UTC (permalink / raw)
  To: Christian Brauner, Günther Noack, Paul Moore,
	Serge E . Hallyn
  Cc: Mickaël Salaün, Daniel Durning, Jonathan Corbet,
	Justin Suess, Lennart Poettering, Mikhail Ivanov,
	Nicolas Bouchinet, Shervin Oloumi, Tingmao Wang, kernel-team,
	linux-fsdevel, linux-kernel, linux-security-module
In-Reply-To: <20260527181127.879771-1-mic@digikod.net>

Add Landlock enforcement for capability use via the LSM capable hook.
This lets a sandboxed process restrict which Linux capabilities it can
exercise, using LANDLOCK_PERM_CAPABILITY_USE and per-capability rules.

The capable hook is purely restrictive: commoncap is registered with
LSM_ORDER_FIRST so cap_capable() always runs first, which means Landlock
can deny capabilities that commoncap would allow but never grant
capabilities that commoncap denied.

Add hook_capable() that uses landlock_perm_is_denied() to perform a pure
bitmask check: if the capability is not in the layer's allowed set, the
check is denied.  No domain ancestry bypass, no cross-namespace
discriminant, just a flat per-layer allowed-caps bitmask, matching the
same pattern used by LANDLOCK_PERM_NAMESPACE_USE.

Adding the 41-bit capability bitfield to struct perm_masks brings it to
49 out of 64 bits used (41 caps + 8 namespace types, 15 bits padding),
keeping struct layer_config at 16 bytes (8 bytes perm_masks + 4 bytes
access_masks + 4 bytes tail padding) and the layers[] array at 256 bytes
maximum.  The caps bitfield is placed first in struct perm_masks (before
the ns bitfield) because capabilities use a direct BIT_ULL(cap) mapping
that benefits from starting at bit 0 of the storage unit.  An explicit
static_assert documents the LANDLOCK_NUM_PERM_CAP + LANDLOCK_NUM_PERM_NS
<= BITS_PER_TYPE(u64) invariant alongside the existing sizeof guard.

Non-user namespace operations require both LANDLOCK_PERM_NAMESPACE_USE
(type allowed) and LANDLOCK_PERM_CAPABILITY_USE (CAP_SYS_ADMIN allowed)
when both permissions are handled.  This follows naturally from the
kernel calling capable(CAP_SYS_ADMIN) before namespace operations: both
hooks fire independently and audit logs identify which permission was
denied.

The enforcement is purely at exercise time via the capable hook, not by
modifying the credential's capability sets.  Stripping denied
capabilities would give processes an accurate capget(2) view of their
usable capabilities, but no LSM other than commoncap modifies capability
sets; Landlock follows this convention and restricts use without
altering what the process holds.  A sandboxed process inside a user
namespace will see all capabilities via capget(2) but will receive
-EPERM when attempting to use any denied capability.

Cc: Christian Brauner <brauner@kernel.org>
Cc: Günther Noack <gnoack@google.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Reviewed-by: Günther Noack <gnoack@google.com>
Reviewed-by: Tingmao Wang <m@maowtm.org>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---

Changes since v1:
https://lore.kernel.org/r/20260312100444.2609563-7-mic@digikod.net
- Add Reviewed-by: Tingmao Wang.
- Rename internal struct perm_rules to perm_masks (companion change
  to the preceding commit).
- Rename LANDLOCK_PERM_NAMESPACE_ENTER references to
  LANDLOCK_PERM_NAMESPACE_USE (companion change to the introducing
  commit).
- Rename struct layer_rights to struct layer_config (companion
  change to the introducing commit).
- Clarify in the commit body and hook_capable() kdoc that commoncap
  (not Landlock) is registered with LSM_ORDER_FIRST.
- Surface the empty-check semantics in the
  landlock_capability_attr.capabilities kdoc: a rule that sets only
  bits unknown to the running kernel (above CAP_LAST_CAP) succeeds
  but has no runtime effect.
- Add explicit static_assert that LANDLOCK_NUM_PERM_CAP +
  LANDLOCK_NUM_PERM_NS fits in a u64, complementing the existing
  implicit sizeof guard on struct perm_masks.
- Add Reviewed-by: Günther Noack.
---
 include/uapi/linux/landlock.h |  35 +++++++++
 security/landlock/Makefile    |   3 +-
 security/landlock/access.h    |  18 ++++-
 security/landlock/audit.c     |   4 +
 security/landlock/audit.h     |   1 +
 security/landlock/cap.c       | 141 ++++++++++++++++++++++++++++++++++
 security/landlock/cap.h       |  49 ++++++++++++
 security/landlock/cred.h      |   3 +
 security/landlock/limits.h    |   4 +-
 security/landlock/setup.c     |   2 +
 security/landlock/syscalls.c  |  58 +++++++++++++-
 11 files changed, 309 insertions(+), 9 deletions(-)
 create mode 100644 security/landlock/cap.c
 create mode 100644 security/landlock/cap.h

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index 233594482aa5..93fea9f0c5e2 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -168,6 +168,11 @@ enum landlock_rule_type {
 	 * @LANDLOCK_RULE_NAMESPACE: Type of a &struct landlock_namespace_attr .
 	 */
 	LANDLOCK_RULE_NAMESPACE,
+	/**
+	 * @LANDLOCK_RULE_CAPABILITY: Type of a &struct
+	 * landlock_capability_attr .
+	 */
+	LANDLOCK_RULE_CAPABILITY,
 };
 
 /**
@@ -242,6 +247,28 @@ struct landlock_namespace_attr {
 	__u64 namespace_types;
 };
 
+/**
+ * struct landlock_capability_attr - Capability definition
+ *
+ * Argument of sys_landlock_add_rule() with %LANDLOCK_RULE_CAPABILITY.
+ */
+struct landlock_capability_attr {
+	/**
+	 * @allowed_perm: Must be set to %LANDLOCK_PERM_CAPABILITY_USE.
+	 */
+	__u64 allowed_perm;
+	/**
+	 * @capabilities: Bitmask of capabilities (``1ULL << CAP_*``) to allow
+	 * under this rule.  Must be non-zero (otherwise the call returns
+	 * ``-ENOMSG``); the non-zero check runs on the raw input before
+	 * unknown-bit masking, so a rule that sets only bits unknown to the
+	 * running kernel (above ``CAP_LAST_CAP``) succeeds but has no runtime
+	 * effect.  Bits above ``CAP_LAST_CAP`` are silently ignored for forward
+	 * compatibility.
+	 */
+	__u64 capabilities;
+};
+
 /**
  * DOC: fs_access
  *
@@ -488,9 +515,17 @@ struct landlock_namespace_attr {
  *   process in a Landlock domain that handles this permission is denied
  *   from using namespace types that are not explicitly allowed by a
  *   %LANDLOCK_RULE_NAMESPACE rule.
+ * - %LANDLOCK_PERM_CAPABILITY_USE: Restrict the use of specific Linux
+ *   capabilities.  A process in a Landlock domain that handles this
+ *   permission is denied from exercising capabilities that are not
+ *   explicitly allowed by a %LANDLOCK_RULE_CAPABILITY rule.  This
+ *   permission is purely restrictive: it can deny capabilities that the
+ *   kernel would otherwise grant, but it can never grant capabilities
+ *   that the kernel already denied.
  */
 /* clang-format off */
 #define LANDLOCK_PERM_NAMESPACE_USE			(1ULL << 0)
+#define LANDLOCK_PERM_CAPABILITY_USE			(1ULL << 1)
 /* clang-format on */
 
 #endif /* _UAPI_LINUX_LANDLOCK_H */
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index cacfba075dec..1927b81fea93 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -9,7 +9,8 @@ landlock-y := \
 	task.o \
 	fs.o \
 	tsync.o \
-	ns.o
+	ns.o \
+	cap.o
 
 landlock-$(CONFIG_INET) += net.o
 
diff --git a/security/landlock/access.h b/security/landlock/access.h
index 42229eea6d7e..28c40f8ad5b5 100644
--- a/security/landlock/access.h
+++ b/security/landlock/access.h
@@ -72,6 +72,13 @@ static_assert(sizeof(typeof_member(union access_masks_all, masks)) ==
  * storage unit.
  */
 struct perm_masks {
+	/**
+	 * @caps: Allowed capabilities.  Each bit corresponds to a ``CAP_*``
+	 * value (e.g. ``CAP_NET_RAW`` = bit 13).  Bits are stored directly
+	 * (sequential mapping) and masked with ``CAP_VALID_MASK`` at rule-add
+	 * time.
+	 */
+	u64 caps : LANDLOCK_NUM_PERM_CAP;
 	/**
 	 * @ns: Allowed namespace types.  Each bit corresponds to a sequential
 	 * index assigned by the ``_LANDLOCK_NS_*`` enum (derived from
@@ -83,6 +90,9 @@ struct perm_masks {
 } __packed __aligned(sizeof(u64));
 
 static_assert(sizeof(struct perm_masks) == sizeof(u64));
+/* All perm_masks bitfields must fit in a single u64. */
+static_assert(LANDLOCK_NUM_PERM_CAP + LANDLOCK_NUM_PERM_NS <=
+	      BITS_PER_TYPE(u64));
 
 /**
  * struct layer_config - Per-layer access configuration
@@ -91,10 +101,10 @@ static_assert(sizeof(struct perm_masks) == sizeof(u64));
  * This is the element type of the &struct landlock_ruleset.layers FAM.
  *
  * Unlike filesystem and network access rights, which are tracked per-object in
- * red-black trees, namespace types use a flat bitmask because their keyspace is
- * small and bounded (~8 namespace types).  A single rule adds to the allowed
- * set via bitwise OR; at enforcement time each layer is checked directly (no
- * tree lookup needed).
+ * red-black trees, namespace types and capabilities use flat bitmasks because
+ * their keyspaces are small and bounded (~8 namespace types, 41 capabilities).
+ * A single rule adds to the allowed set via bitwise OR; at enforcement time
+ * each layer is checked directly (no tree lookup needed).
  */
 struct layer_config {
 	/**
diff --git a/security/landlock/audit.c b/security/landlock/audit.c
index eca447ec281d..e7926d464981 100644
--- a/security/landlock/audit.c
+++ b/security/landlock/audit.c
@@ -86,6 +86,10 @@ get_blocker(const enum landlock_request_type type,
 	case LANDLOCK_REQUEST_NAMESPACE:
 		WARN_ON_ONCE(access_bit != -1);
 		return "perm.namespace_use";
+
+	case LANDLOCK_REQUEST_CAPABILITY:
+		WARN_ON_ONCE(access_bit != -1);
+		return "perm.capability_use";
 	}
 
 	WARN_ON_ONCE(1);
diff --git a/security/landlock/audit.h b/security/landlock/audit.h
index e9e52fb628f5..fe5d701ea45d 100644
--- a/security/landlock/audit.h
+++ b/security/landlock/audit.h
@@ -22,6 +22,7 @@ enum landlock_request_type {
 	LANDLOCK_REQUEST_SCOPE_ABSTRACT_UNIX_SOCKET,
 	LANDLOCK_REQUEST_SCOPE_SIGNAL,
 	LANDLOCK_REQUEST_NAMESPACE,
+	LANDLOCK_REQUEST_CAPABILITY,
 };
 
 /*
diff --git a/security/landlock/cap.c b/security/landlock/cap.c
new file mode 100644
index 000000000000..d54bd32297b7
--- /dev/null
+++ b/security/landlock/cap.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock - Capability hooks
+ *
+ * Copyright © 2026 Cloudflare
+ */
+
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/lsm_audit.h>
+#include <linux/lsm_hooks.h>
+#include <uapi/linux/landlock.h>
+
+#include "audit.h"
+#include "cap.h"
+#include "cred.h"
+#include "limits.h"
+#include "ruleset.h"
+#include "setup.h"
+
+static const struct access_masks cap_perm = {
+	.perm = LANDLOCK_PERM_CAPABILITY_USE,
+};
+
+/**
+ * hook_capable - Deny capability use for Landlock-sandboxed processes
+ *
+ * @cred: Credentials being checked.
+ * @ns: User namespace for the capability check.
+ * @cap: Capability number (CAP_*).
+ * @opts: Capability check options.  CAP_OPT_NOAUDIT suppresses audit logging.
+ *
+ * Pure bitmask check: denies the capability if it is not in the layer's allowed
+ * set.  This hook is purely restrictive: commoncap is registered with
+ * LSM_ORDER_FIRST so cap_capable() always runs first, which means Landlock can
+ * deny capabilities that commoncap would allow, but never grant capabilities
+ * that commoncap denied.
+ *
+ * Return: 0 if allowed, -EPERM if capability use is denied.
+ */
+static int hook_capable(const struct cred *cred, struct user_namespace *ns,
+			int cap, unsigned int opts)
+{
+	const struct landlock_cred_security *subject;
+	size_t denied_layer;
+
+	subject = landlock_get_applicable_subject(cred, cap_perm, NULL);
+	if (!subject)
+		return 0;
+
+	denied_layer = landlock_perm_is_denied(subject->domain,
+					       LANDLOCK_PERM_CAPABILITY_USE,
+					       landlock_cap_to_bit(cap));
+	if (!denied_layer)
+		return 0;
+
+	/*
+	 * Respects CAP_OPT_NOAUDIT to suppress audit records for capability
+	 * probes (e.g., ns_capable_noaudit(), has_capability_noaudit()).
+	 */
+	if (!(opts & CAP_OPT_NOAUDIT))
+		landlock_log_denial(subject,
+				    &(struct landlock_request){
+					    .type = LANDLOCK_REQUEST_CAPABILITY,
+					    .audit.type = LSM_AUDIT_DATA_CAP,
+					    .audit.u.cap = cap,
+					    .layer_plus_one = denied_layer,
+				    });
+
+	return -EPERM;
+}
+
+static struct security_hook_list landlock_hooks[] __ro_after_init = {
+	LSM_HOOK_INIT(capable, hook_capable),
+};
+
+__init void landlock_add_cap_hooks(void)
+{
+	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+			   &landlock_lsmid);
+}
+
+#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST
+
+#include <kunit/test.h>
+
+static void test_cap_to_bit(struct kunit *const test)
+{
+	KUNIT_EXPECT_EQ(test, BIT_ULL(0), landlock_cap_to_bit(0));
+	KUNIT_EXPECT_EQ(test, BIT_ULL(CAP_NET_RAW),
+			landlock_cap_to_bit(CAP_NET_RAW));
+	KUNIT_EXPECT_EQ(test, BIT_ULL(CAP_SYS_ADMIN),
+			landlock_cap_to_bit(CAP_SYS_ADMIN));
+	KUNIT_EXPECT_EQ(test, BIT_ULL(CAP_LAST_CAP),
+			landlock_cap_to_bit(CAP_LAST_CAP));
+}
+
+static void test_cap_to_bit_invalid(struct kunit *const test)
+{
+	KUNIT_EXPECT_EQ(test, 0ULL, landlock_cap_to_bit(-1));
+	KUNIT_EXPECT_EQ(test, 0ULL, landlock_cap_to_bit(CAP_LAST_CAP + 1));
+}
+
+static void test_caps_to_bits_valid(struct kunit *const test)
+{
+	KUNIT_EXPECT_EQ(test, (u64)CAP_VALID_MASK,
+			landlock_caps_to_bits(CAP_VALID_MASK));
+	KUNIT_EXPECT_EQ(test, BIT_ULL(CAP_NET_RAW),
+			landlock_caps_to_bits(BIT_ULL(CAP_NET_RAW)));
+}
+
+static void test_caps_to_bits_unknown(struct kunit *const test)
+{
+	KUNIT_EXPECT_EQ(test, 0ULL,
+			landlock_caps_to_bits(BIT_ULL(CAP_LAST_CAP + 1)));
+}
+
+static void test_caps_to_bits_zero(struct kunit *const test)
+{
+	KUNIT_EXPECT_EQ(test, 0ULL, landlock_caps_to_bits(0));
+}
+
+static struct kunit_case test_cases[] = {
+	/* clang-format off */
+	KUNIT_CASE(test_cap_to_bit),
+	KUNIT_CASE(test_cap_to_bit_invalid),
+	KUNIT_CASE(test_caps_to_bits_valid),
+	KUNIT_CASE(test_caps_to_bits_unknown),
+	KUNIT_CASE(test_caps_to_bits_zero),
+	{}
+	/* clang-format on */
+};
+
+static struct kunit_suite test_suite = {
+	.name = "landlock_cap",
+	.test_cases = test_cases,
+};
+
+kunit_test_suite(test_suite);
+
+#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */
diff --git a/security/landlock/cap.h b/security/landlock/cap.h
new file mode 100644
index 000000000000..67ac3d0c3ad3
--- /dev/null
+++ b/security/landlock/cap.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock - Capability hooks
+ *
+ * Copyright © 2026 Cloudflare
+ */
+
+#ifndef _SECURITY_LANDLOCK_CAP_H
+#define _SECURITY_LANDLOCK_CAP_H
+
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/capability.h>
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+/**
+ * landlock_cap_to_bit - Convert a capability number to a compact bitmask
+ *
+ * @cap: Capability number (CAP_*).
+ *
+ * Return: BIT_ULL(@cap), or 0 if @cap is invalid (with a WARN).
+ */
+static inline __attribute_const__ u64 landlock_cap_to_bit(const int cap)
+{
+	if (WARN_ON_ONCE(!cap_valid(cap)))
+		return 0;
+
+	return BIT_ULL(cap);
+}
+
+/**
+ * landlock_caps_to_bits - Validate and mask a capability bitmask
+ *
+ * @capabilities: Bitmask of capabilities (e.g. from user space).
+ *
+ * Return: @capabilities masked to known capabilities.  Warns if unknown bits
+ * are present (callers must pre-mask for user input).
+ */
+static inline __attribute_const__ u64
+landlock_caps_to_bits(const u64 capabilities)
+{
+	WARN_ON_ONCE(capabilities & ~CAP_VALID_MASK);
+	return capabilities & CAP_VALID_MASK;
+}
+
+__init void landlock_add_cap_hooks(void);
+
+#endif /* _SECURITY_LANDLOCK_CAP_H */
diff --git a/security/landlock/cred.h b/security/landlock/cred.h
index 0172345fa86f..d04323a5eb05 100644
--- a/security/landlock/cred.h
+++ b/security/landlock/cred.h
@@ -191,6 +191,9 @@ landlock_perm_is_denied(const struct landlock_ruleset *const domain,
 		case LANDLOCK_PERM_NAMESPACE_USE:
 			allowed = domain->layers[layer].allowed.ns;
 			break;
+		case LANDLOCK_PERM_CAPABILITY_USE:
+			allowed = domain->layers[layer].allowed.caps;
+			break;
 		default:
 			WARN_ON_ONCE(1);
 			return layer + 1;
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
index e51122668fd3..01b0b693d0fb 100644
--- a/security/landlock/limits.h
+++ b/security/landlock/limits.h
@@ -11,6 +11,7 @@
 #define _SECURITY_LANDLOCK_LIMITS_H
 
 #include <linux/bitops.h>
+#include <linux/capability.h>
 #include <linux/limits.h>
 #include <linux/ns/ns_common_types.h>
 #include <uapi/linux/landlock.h>
@@ -32,11 +33,12 @@
 #define LANDLOCK_MASK_SCOPE		((LANDLOCK_LAST_SCOPE << 1) - 1)
 #define LANDLOCK_NUM_SCOPE		__const_hweight64(LANDLOCK_MASK_SCOPE)
 
-#define LANDLOCK_LAST_PERM		LANDLOCK_PERM_NAMESPACE_USE
+#define LANDLOCK_LAST_PERM		LANDLOCK_PERM_CAPABILITY_USE
 #define LANDLOCK_MASK_PERM		((LANDLOCK_LAST_PERM << 1) - 1)
 #define LANDLOCK_NUM_PERM		__const_hweight64(LANDLOCK_MASK_PERM)
 
 #define LANDLOCK_NUM_PERM_NS		__const_hweight64((u64)(CLONE_NS_ALL))
+#define LANDLOCK_NUM_PERM_CAP		(CAP_LAST_CAP + 1)
 
 #define LANDLOCK_LAST_RESTRICT_SELF	LANDLOCK_RESTRICT_SELF_TSYNC
 #define LANDLOCK_MASK_RESTRICT_SELF	((LANDLOCK_LAST_RESTRICT_SELF << 1) - 1)
diff --git a/security/landlock/setup.c b/security/landlock/setup.c
index a7ed776b41b4..971419d663bb 100644
--- a/security/landlock/setup.c
+++ b/security/landlock/setup.c
@@ -11,6 +11,7 @@
 #include <linux/lsm_hooks.h>
 #include <uapi/linux/lsm.h>
 
+#include "cap.h"
 #include "common.h"
 #include "cred.h"
 #include "errata.h"
@@ -70,6 +71,7 @@ static int __init landlock_init(void)
 	landlock_add_fs_hooks();
 	landlock_add_net_hooks();
 	landlock_add_ns_hooks();
+	landlock_add_cap_hooks();
 	landlock_init_id();
 	landlock_initialized = true;
 	pr_info("Up and running.\n");
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index b5bbeedc6825..6e99cda3d511 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -30,6 +30,7 @@
 #include <linux/uaccess.h>
 #include <uapi/linux/landlock.h>
 
+#include "cap.h"
 #include "cred.h"
 #include "domain.h"
 #include "fs.h"
@@ -98,8 +99,9 @@ static void build_check_abi(void)
 	struct landlock_path_beneath_attr path_beneath_attr;
 	struct landlock_net_port_attr net_port_attr;
 	struct landlock_namespace_attr namespace_attr;
+	struct landlock_capability_attr capability_attr;
 	size_t ruleset_size, path_beneath_size, net_port_size;
-	size_t namespace_size;
+	size_t namespace_size, capability_size;
 
 	/*
 	 * For each user space ABI structures, first checks that there is no
@@ -127,6 +129,11 @@ static void build_check_abi(void)
 	namespace_size += sizeof(namespace_attr.namespace_types);
 	BUILD_BUG_ON(sizeof(namespace_attr) != namespace_size);
 	BUILD_BUG_ON(sizeof(namespace_attr) != 16);
+
+	capability_size = sizeof(capability_attr.allowed_perm);
+	capability_size += sizeof(capability_attr.capabilities);
+	BUILD_BUG_ON(sizeof(capability_attr) != capability_size);
+	BUILD_BUG_ON(sizeof(capability_attr) != 16);
 }
 
 /* Ruleset handling */
@@ -449,14 +456,57 @@ static int add_rule_namespace(struct landlock_ruleset *const ruleset,
 	return 0;
 }
 
+static int add_rule_capability(struct landlock_ruleset *const ruleset,
+			       const void __user *const rule_attr)
+{
+	struct landlock_capability_attr cap_attr;
+	int res;
+	access_mask_t mask;
+
+	/* Copies raw user space buffer. */
+	res = copy_from_user(&cap_attr, rule_attr, sizeof(cap_attr));
+	if (res)
+		return -EFAULT;
+
+	/* Informs about useless rule: empty allowed_perm. */
+	if (!cap_attr.allowed_perm)
+		return -ENOMSG;
+
+	/* The allowed_perm must match LANDLOCK_PERM_CAPABILITY_USE. */
+	if (cap_attr.allowed_perm != LANDLOCK_PERM_CAPABILITY_USE)
+		return -EINVAL;
+
+	/* Checks that allowed_perm matches the @ruleset constraints. */
+	mask = landlock_get_perm_mask(ruleset, 0);
+	if (!(mask & LANDLOCK_PERM_CAPABILITY_USE))
+		return -EINVAL;
+
+	/* Informs about useless rule: empty capabilities. */
+	if (!cap_attr.capabilities)
+		return -ENOMSG;
+
+	/*
+	 * Stores only the capabilities this kernel knows about.  Unknown bits
+	 * are silently accepted for forward compatibility: user space compiled
+	 * against newer headers can pass new CAP_* bits without getting EINVAL
+	 * on older kernels.  Unknown bits have no effect because no hook checks
+	 * them.
+	 */
+	mutex_lock(&ruleset->lock);
+	ruleset->layers[0].allowed.caps |=
+		landlock_caps_to_bits(cap_attr.capabilities & CAP_VALID_MASK);
+	mutex_unlock(&ruleset->lock);
+	return 0;
+}
+
 /**
  * sys_landlock_add_rule - Add a new rule to a ruleset
  *
  * @ruleset_fd: File descriptor tied to the ruleset that should be extended
  *		with the new rule.
  * @rule_type: Identify the structure type pointed to by @rule_attr:
- *             %LANDLOCK_RULE_PATH_BENEATH, %LANDLOCK_RULE_NET_PORT, or
- *             %LANDLOCK_RULE_NAMESPACE.
+ *             %LANDLOCK_RULE_PATH_BENEATH, %LANDLOCK_RULE_NET_PORT,
+ *             %LANDLOCK_RULE_NAMESPACE, or %LANDLOCK_RULE_CAPABILITY.
  * @rule_attr: Pointer to a rule (matching the @rule_type).
  * @flags: Must be 0.
  *
@@ -508,6 +558,8 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
 		return add_rule_net_port(ruleset, rule_attr);
 	case LANDLOCK_RULE_NAMESPACE:
 		return add_rule_namespace(ruleset, rule_attr);
+	case LANDLOCK_RULE_CAPABILITY:
+		return add_rule_capability(ruleset, rule_attr);
 	default:
 		return -EINVAL;
 	}
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 0/9] Landlock: Namespace and capability control
From: Mickaël Salaün @ 2026-05-27 18:11 UTC (permalink / raw)
  To: Christian Brauner, Günther Noack, Paul Moore,
	Serge E . Hallyn
  Cc: Mickaël Salaün, Daniel Durning, Jonathan Corbet,
	Justin Suess, Lennart Poettering, Mikhail Ivanov,
	Nicolas Bouchinet, Shervin Oloumi, Tingmao Wang, kernel-team,
	linux-fsdevel, linux-kernel, linux-security-module

Namespaces are a fundamental building block for containers and
application sandboxes, but user namespace creation significantly widens
the kernel attack surface.  CVE-2026-43284 / CVE-2026-43500 ("Dirty
Frag"), CVE-2026-46300 ("Fragnesia"), CVE-2023-32233 and CVE-2022-25636
(netfilter), CVE-2022-0492 (cgroup v1 release_agent), and CVE-2022-0185
(filesystem mount parsing) all demonstrate vulnerabilities exploitable
only through capabilities gained via user namespaces.  Advisories for
the 2026 CVEs recommend disabling unprivileged user-namespace creation
as a temporary mitigation.  Some distributions (e.g. Debian and Arch's
linux-hardened kernel via the kernel.unprivileged_userns_clone sysctl)
block user namespace creation entirely, but this removes a useful
isolation primitive.  Fine-grained control allows trusted programs to
use namespaces while preventing unnecessary exposure for programs that
do not need them.

Existing mechanisms (user.max_*_namespaces sysctls, userns_create LSM
hook, PR_SET_NO_NEW_PRIVS, and capset) each address part of this threat
but none provides per-process, fine-grained control over both namespace
types and capabilities.  Container runtimes resort to seccomp-based
clone/unshare filtering, but seccomp cannot dereference clone3's flag
structure, forcing runtimes to block clone3 entirely.

Landlock's composable layer model enables several patterns: a user
session manager can restrict namespace types and capabilities broadly
while allowing trusted programs to create the namespaces they need, and
each deeper layer can further restrict the allowed set.  Container
runtimes can similarly deny namespace creation inside managed
containers.

Two permissions are needed because the controlled operations sit on
different LSM hook sites (namespace_init/namespace_install vs
capable) and address independent threat axes: capability acquisition
via user-namespace creation, and capability exercise after acquisition.
Collapsing them into a single permission would conflate hook semantics.
LANDLOCK_PERM_NAMESPACE_USE intentionally covers every kernel path
that grants access to a namespace (creation, entry, and fd reference)
because each path widens the kernel attack surface for that namespace
type; splitting it into finer create/enter/fd-reference permissions
would add UAPI surface without isolating a distinct attack axis.

This series adds two new permission categories to Landlock:

- LANDLOCK_PERM_NAMESPACE_USE: Restricts which namespace types a
  sandboxed process can use: creation (unshare/clone), entry (setns),
  and fd reference (open_tree, fsmount).  User namespace creation has
  no capability check in the kernel, so this is the only enforcement
  mechanism for that path.

- LANDLOCK_PERM_CAPABILITY_USE: Restricts which Linux capabilities a
  sandboxed process can use, regardless of how they were obtained
  (including through user namespace creation).

Both use new handled_perm and LANDLOCK_RULE_* constants following the
existing allow-list model.  The UAPI uses raw CAP_* and CLONE_NEW*
values directly; unknown values are silently accepted for forward
compatibility (the allow-list denies them by default).  This series is
planned to merge in the same kernel version as the UDP series, which
already bumped the Landlock ABI to 10; no second bump is needed.

The handled_perm infrastructure is designed to be reusable by future
permission categories.  The last patch documents the design rationale
for the permission model and the criteria for choosing between
handled_access_*, handled_perm, and scoped.  A patch series to add
socket creation control is under review [2]; it could benefit from the
same permission model to achieve complete deny-by-default coverage of
socket creation.

This series builds on Christian Brauner's namespace LSM blob RFC [1],
included as patch 1.  The FOR_EACH_NS_TYPE patch from v1 has been
merged in master: commit 935a04923ad2 ("nsproxy: Add FOR_EACH_NS_TYPE()
X-macro and CLONE_NS_ALL").

Paul, could you please review patches 1 and 2?  The first adds the new LSM
hooks and the second adds LSM_AUDIT_DATA_NS, a new audit record type
that logs namespace_type and ns_id for namespace-related LSM denials.

All of these example vulnerabilities follow the same pattern: an
unprivileged user creates a user namespace to obtain capabilities, then
creates a second namespace to exercise them against vulnerable code.
LANDLOCK_PERM_NAMESPACE_USE prevents this by denying the user
namespace (eliminating the capability grant) or the specific namespace
type needed to exercise it.  LANDLOCK_PERM_CAPABILITY_USE independently
prevents it by denying the required capability.

Namespace restriction is enforced at two hook sites: namespace_init
(unshare/clone) and namespace_install (setns).  Together, these ensure a
process denied a namespace type cannot circumvent the restriction by
entering a pre-existing namespace via setns() on an inherited or passed
file descriptor.  When a domain handles both permissions, both must
independently allow the operation (e.g., unshare(CLONE_NEWNET) requires
both CAP_SYS_ADMIN to be allowed and CLONE_NEWNET to be allowed).

Design evolution:

The first approach added CAP_OPT flags to security_capable() to
distinguish namespace creation contexts.  This was too invasive and
would have required capability splitting (a dedicated CAP_NAMESPACE)
which does not help because the CAP_SYS_ADMIN fallback for backward
compatibility undermines the distinction.

The second stored the namespace creator's domain in the LSM blob and
used domain ancestry comparison in hook_capable() to bypass capability
checks for namespace management operations.  A SCOPE_NAMESPACE flag
restricted setns() by the namespace creator's domain, like SCOPE_SIGNAL.
Both were dropped: scopes should only concern Landlock properties
(domain relationships), not kernel namespace state; and the
cross-namespace heuristic (ns != cred->user_ns) did not accurately
identify namespace management operations.

The final design drops all of this.  The key insight is that
capabilities gained through user namespace creation are only exercisable
against namespaces of a specific type: creating a network namespace is
what makes CAP_NET_ADMIN exercisable.  LANDLOCK_PERM_NAMESPACE_USE
controls where capabilities are exercisable by restricting which
namespace types can be acquired.  LANDLOCK_PERM_CAPABILITY_USE controls
which capabilities are available, as a pure per-layer bitmask check with
no namespace awareness.  The two are independently enforced at their own
hook sites, with no interaction in hook_capable().  No scope flag is
added in this series.

When Landlock filesystem restrictions are in use, mount namespace
creation has an inherent limitation: all mount topology changes are
denied when any filesystem right is handled.  A dedicated mount
access control type is left for future work [3].

Per Paul Moore's review, no security_namespace_switch() post-hook is
added in this series: such a hook would only serve LSMs that maintain
per-task state derived from the active namespace set (SELinux-style
state tracking), and no current LSM (including this series) needs that.
Landlock enforces at namespace_install() and namespace_init(), before
the task-to-nsproxy switch.  The hook is left for a separate LSM
infrastructure proposal once a concrete user emerges.

https://lore.kernel.org/r/20260216-work-security-namespace-v1-1-075c28758e1f@kernel.org [1]
https://lore.kernel.org/r/20251118134639.3314803-1-ivanov.mikhail1@huawei-partners.com [2]
https://github.com/landlock-lsm/linux/issues/14 [3]

Changes since RFC v1:
https://lore.kernel.org/r/20260312100444.2609563-1-mic@digikod.net
- Move security_namespace_install() before ns->ops->install() in
  validate_ns() and fix proc_free_inum() error path when inum is
  caller-provided (patch 1, suggested by Christian Brauner).
- Replace inum with ns_id in namespace audit records: ns_id is the
  stable 64-bit namespace identifier, never recycled (patches 2, 4,
  6, 9; suggested by Christian Brauner).
- Fix user_denied.setns test to expect EPERM from Landlock instead
  of EINVAL from userns_install() after hook reordering (patch 6).
- Add __packed __aligned(sizeof(u64)) to struct perm_masks to fix
  m68k build failure where GCC packs bitfields at byte granularity,
  and add WARN_ON_ONCE guards for invalid perm_bit or request_value
  in landlock_perm_is_denied() (patch 4, suggested by Tingmao Wang).
- Fix anonymous mount namespace blob leak: make __ns_common_free()
  always call security_namespace_free() and conditionally call
  proc_free_inum() via MNT_NS_INO_SPECIAL_MAX, so free_mnt_ns()
  calls ns_common_free() unconditionally (patch 1, suggested by
  Christian Brauner, also reported by Daniel Durning).
- Unify hook_namespace_init() and hook_namespace_install() into a
  shared check_ns_type() helper and drop the redundant entry-level
  WARN_ON_ONCE (the downstream warns in landlock_ns_type_to_bit()
  and landlock_perm_is_denied() suffice; patch 4).
- Remove duplicate ns_audit.unshare_denied test (identical to
  ns_audit.create_denied; patch 6).
- Add sandboxed_allowed variant to setns_cross_process to cover
  allowed cross-process setns (patch 6).
- Rebase onto landlock-next (includes the resolve_unix and UDP
  series).  No ABI bump in v2: the series is planned to merge in
  the same kernel as the UDP series, which already bumped to 10.
- Drop three patches now upstream on landlock-next: the two
  audit-test fixes (filter dealloc records, default audit socket
  timeout) sent independently with Cc: stable, plus the
  allowed_access best-effort filtering demonstration patch.
- Rename LANDLOCK_PERM_NAMESPACE_ENTER to LANDLOCK_PERM_NAMESPACE_USE
  (and audit blocker perm.namespace_enter to perm.namespace_use) for
  semantic accuracy: the verb _ENTER fits setns/unshare/clone but
  misleads for open_tree and fsmount where the caller holds an fd
  reference without entering.  _USE covers both cases and mirrors
  LANDLOCK_PERM_CAPABILITY_USE.
- Add a Design philosophy section to
  Documentation/security/landlock.rst stating Landlock's principle:
  restrict access to data, other tasks, and kernel resources.
- Rewrite Documentation/security/landlock.rst Ruleset restriction
  models with the per-object (handled_access_*) versus per-category
  (handled_perm) framing in place of the previous chokepoints/
  gateways wording.
- Enumerate the seven syscall paths covered by
  LANDLOCK_PERM_NAMESPACE_USE in
  Documentation/userspace-api/landlock.rst (membership via
  unshare/clone/setns; fd reference via open_tree and fsmount).
- Document the deterministic-semantics rationale for accepting
  unknown category member values in rule bodies (per-category
  permissions section of Documentation/security/landlock.rst);
  range-checking against CAP_LAST_CAP is intentionally avoided.
- Address Günther Noack's nits in the layer_config wrapper patch:
  clarify that _LANDLOCK_ACCESS_FS_INITIALLY_DENIED is ORed with
  the .handled field of all ruleset->layers[] entries; rename
  landlock_upgrade_handled_access_masks() to
  landlock_upgrade_handled_layer_config() to match the parameter
  type; rewrap the @layers kdoc to greedy fill (eliminating v1's
  manual short "rulesets in a" line).
- Rename struct layer_rights to struct layer_config: "config" is
  the more general term for per-layer state.
- Rename internal struct perm_rules to struct perm_masks to parallel
  the sibling access_masks in struct layer_config.
- Collect Reviewed-by tags from Günther Noack on patches 2, 3, 4,
  and 5 from the v1 thread.  Patch 1 and patch 8 changed
  substantially since v1 (the mount-namespace blob leak fix and
  validate_ns() reordering for patch 1; the libcap migration for
  patch 8), so the Reviewed-by tags from reviewers who had not
  requested those changes are not carried forward; the affected
  reviewers are kept as Cc:.
- Rename security_namespace_alloc() to security_namespace_init()
  (and the LSM hook namespace_alloc -> namespace_init, plus
  Landlock's hook_namespace_alloc() -> hook_namespace_init())
  to match the caller-name convention and reflect that the hook
  initialises LSM state attached to a constructed ns_common rather
  than allocating it (patch 1, suggested by Paul Moore).
- Refine the security_namespace_free() kdoc to clarify that
  RCU-safe blob freeing is required only if an LSM exposes data
  within the blob to concurrent RCU readers, and document that
  the blob memory itself is released with kfree() after the
  namespace_free hooks return (patch 1, suggested by Paul Moore).
- Use cap_from_name(3) from libcap in the sandboxer; LL_CAP now
  takes colon-delimited capability names (e.g. "cap_sys_chroot")
  or numbers (libcap's numeric fallback), and the Makefile links
  libcap (patch 8, suggested by Günther Noack).
- Rename the sandboxer env var LL_CAPS to LL_CAP for consistency
  with the singular form used by all other LL_* sandboxer env vars
  (LL_NS, LL_FS_RO, LL_FS_RW, LL_TCP_BIND, LL_TCP_CONNECT,
  LL_SCOPED, LL_FORCE_LOG; patch 8).
- Add a bridging sentence in the per-category permissions section
  of Documentation/security/landlock.rst contrasting per-category
  permissions with per-object access rights (patch 9, suggested by
  Günther Noack).
- Disambiguate the orthogonality invariant in
  Documentation/security/landlock.rst ("all new scoped features"
  -> "all Landlock access controls") to avoid clash with the UAPI
  scoped field (patch 9, suggested by Justin Suess).
- Add an introductory paragraph in
  Documentation/userspace-api/landlock.rst contrasting
  LANDLOCK_PERM_CAPABILITY_USE with PR_SET_NO_NEW_PRIVS (patch 9,
  suggested by Justin Suess).
- Add an explicit static_assert that LANDLOCK_NUM_PERM_CAP +
  LANDLOCK_NUM_PERM_NS fits in u64, complementing the implicit
  sizeof guard on struct perm_masks (patch 5).
- Document that setns_cross_process exercises only CLONE_NEWUTS
  (patch 6).
- Add add_rule_unknown_no_runtime_effect tests asserting that a
  rule listing only unknown bits has no runtime effect (patches
  6, 7).
- Extend the cap/ns stacking tests with the parent-denies/child-
  allows variant to complete per-layer walker direction coverage
  (patches 6, 7).

Christian Brauner (1):
  security: add LSM blob and hooks for namespaces

Mickaël Salaün (8):
  security: Add LSM_AUDIT_DATA_NS for namespace audit records
  landlock: Wrap per-layer access masks in struct layer_config
  landlock: Enforce namespace use restrictions
  landlock: Enforce capability restrictions
  selftests/landlock: Add namespace restriction tests
  selftests/landlock: Add capability restriction tests
  samples/landlock: Add capability and namespace restriction support
  landlock: Add documentation for capability and namespace restrictions

 Documentation/admin-guide/LSM/landlock.rst   |   19 +-
 Documentation/security/landlock.rst          |  151 +-
 Documentation/userspace-api/landlock.rst     |  216 ++-
 fs/namespace.c                               |    3 +-
 include/linux/lsm_audit.h                    |    5 +
 include/linux/lsm_hook_defs.h                |    3 +
 include/linux/lsm_hooks.h                    |    1 +
 include/linux/ns/ns_common_types.h           |    3 +
 include/linux/security.h                     |   20 +
 include/uapi/linux/landlock.h                |   97 +-
 include/uapi/linux/nsfs.h                    |    1 +
 kernel/nscommon.c                            |   17 +-
 kernel/nsproxy.c                             |    6 +
 samples/landlock/Makefile                    |    1 +
 samples/landlock/sandboxer.c                 |  144 +-
 security/landlock/Makefile                   |    4 +-
 security/landlock/access.h                   |   77 +-
 security/landlock/audit.c                    |    8 +
 security/landlock/audit.h                    |    2 +
 security/landlock/cap.c                      |  141 ++
 security/landlock/cap.h                      |   49 +
 security/landlock/cred.h                     |   54 +-
 security/landlock/limits.h                   |    9 +
 security/landlock/ns.c                       |  156 ++
 security/landlock/ns.h                       |   73 +
 security/landlock/ruleset.c                  |   27 +-
 security/landlock/ruleset.h                  |   62 +-
 security/landlock/setup.c                    |    4 +
 security/landlock/syscalls.c                 |  122 +-
 security/lsm_audit.c                         |    4 +
 security/lsm_init.c                          |    2 +
 security/security.c                          |   77 +
 tools/testing/selftests/landlock/base_test.c |   18 +
 tools/testing/selftests/landlock/cap_test.c  |  673 +++++++
 tools/testing/selftests/landlock/common.h    |   23 +
 tools/testing/selftests/landlock/config      |    5 +
 tools/testing/selftests/landlock/fs_test.c   |   13 +-
 tools/testing/selftests/landlock/ns_test.c   | 1795 ++++++++++++++++++
 tools/testing/selftests/landlock/wrappers.h  |   29 +
 39 files changed, 4028 insertions(+), 86 deletions(-)
 create mode 100644 security/landlock/cap.c
 create mode 100644 security/landlock/cap.h
 create mode 100644 security/landlock/ns.c
 create mode 100644 security/landlock/ns.h
 create mode 100644 tools/testing/selftests/landlock/cap_test.c
 create mode 100644 tools/testing/selftests/landlock/ns_test.c

-- 
2.54.0

^ permalink raw reply

* [PATCH v2 1/9] security: add LSM blob and hooks for namespaces
From: Mickaël Salaün @ 2026-05-27 18:11 UTC (permalink / raw)
  To: Christian Brauner, Günther Noack, Paul Moore,
	Serge E . Hallyn
  Cc: Daniel Durning, Jonathan Corbet, Justin Suess, Lennart Poettering,
	Mickaël Salaün, Mikhail Ivanov, Nicolas Bouchinet,
	Shervin Oloumi, Tingmao Wang, kernel-team, linux-fsdevel,
	linux-kernel, linux-security-module
In-Reply-To: <20260527181127.879771-1-mic@digikod.net>

From: Christian Brauner <brauner@kernel.org>

All namespace types now share the same ns_common infrastructure. Extend
this to include a security blob so LSMs can start managing namespaces
uniformly without having to add one-off hooks or security fields to
every individual namespace type.

Add a ns_security pointer to ns_common and the corresponding lbs_ns blob
size to lsm_blob_sizes. Initialization and freeing hooks are called from
the common __ns_common_init() and __ns_common_free() paths so every
namespace type gets covered in one go. All information about the
namespace type and the appropriate casting helpers to get at the
containing namespace are available via ns_common, making it
straightforward for LSMs to differentiate when they need to.

A namespace_install hook is called from validate_ns() during setns(2),
giving LSMs a chance to enforce policy on namespace transitions.  The
LSM check runs before ns->ops->install() so the security module can deny
the operation before any type-specific installation effects.

Individual namespace types can still have their own specialized security
hooks when needed. This is just the common baseline that makes it easy
to track and manage namespaces from the security side without requiring
every namespace type to reinvent the wheel.

Cc: Günther Noack <gnoack@google.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20260216-work-security-namespace-v1-1-075c28758e1f@kernel.org
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---

Changes since v1:
https://lore.kernel.org/r/20260312100444.2609563-2-mic@digikod.net
- Move security_namespace_install() before ns->ops->install() in
  validate_ns() (suggested by Christian Brauner).
- Only call proc_free_inum() on security_namespace_alloc() failure
  when inum was allocated by this function (suggested by Christian
  Brauner).
- Fix anonymous mount namespace blob leak: move
  security_namespace_free() into __ns_common_free() and make
  proc_free_inum() conditional on dynamically allocated inums
  via MNT_NS_INO_SPECIAL_MAX, so free_mnt_ns() can call
  ns_common_free() unconditionally (suggested by Christian
  Brauner).  Also reported by Daniel Durning while working on
  SELinux support for these hooks:
  https://lore.kernel.org/r/20260318201747.4477-1-danieldurning.work@gmail.com
- Rename security_namespace_alloc() to security_namespace_init()
  to match the caller-name convention and reflect that the hook
  initialises LSM state attached to a constructed ns_common rather
  than allocating the ns_common itself (suggested by Paul Moore).
- Refine the security_namespace_free() kdoc to clarify that
  RCU-safe blob freeing is required only if an LSM exposes data
  within the blob to concurrent RCU readers, and document that
  the blob memory itself is released with kfree() after the
  namespace_free hooks return (suggested by Paul Moore).
- Günther Noack's v1 Reviewed-by is not carried forward to v2:
  the validate_ns() reordering and the anonymous-mount-namespace
  blob-leak fix are semantic changes that were not part of his
  review.  Cc'd instead.
---
 fs/namespace.c                     |  3 +-
 include/linux/lsm_hook_defs.h      |  3 ++
 include/linux/lsm_hooks.h          |  1 +
 include/linux/ns/ns_common_types.h |  3 ++
 include/linux/security.h           | 20 ++++++++
 include/uapi/linux/nsfs.h          |  1 +
 kernel/nscommon.c                  | 17 ++++++-
 kernel/nsproxy.c                   |  6 +++
 security/lsm_init.c                |  2 +
 security/security.c                | 77 ++++++++++++++++++++++++++++++
 10 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index fe919abd2f01..031ef3fafa48 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4179,8 +4179,7 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
 
 static void free_mnt_ns(struct mnt_namespace *ns)
 {
-	if (!is_anon_ns(ns))
-		ns_common_free(ns);
+	ns_common_free(ns);
 	dec_mnt_namespaces(ns->ucounts);
 	mnt_ns_tree_remove(ns);
 }
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 2b8dfb35caed..c389ea904392 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -265,6 +265,9 @@ LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2,
 LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
 	 struct inode *inode)
 LSM_HOOK(int, 0, userns_create, const struct cred *cred)
+LSM_HOOK(int, 0, namespace_init, struct ns_common *ns)
+LSM_HOOK(void, LSM_RET_VOID, namespace_free, struct ns_common *ns)
+LSM_HOOK(int, 0, namespace_install, const struct nsset *nsset, struct ns_common *ns)
 LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
 LSM_HOOK(void, LSM_RET_VOID, ipc_getlsmprop, struct kern_ipc_perm *ipcp,
 	 struct lsm_prop *prop)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index b4f8cad53ddb..5cff13069529 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -112,6 +112,7 @@ struct lsm_blob_sizes {
 	unsigned int lbs_ipc;
 	unsigned int lbs_key;
 	unsigned int lbs_msg_msg;
+	unsigned int lbs_ns;
 	unsigned int lbs_perf_event;
 	unsigned int lbs_task;
 	unsigned int lbs_xattr_count; /* num xattr slots in new_xattrs array */
diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
index ea45c54e4435..5cfe0ce3c881 100644
--- a/include/linux/ns/ns_common_types.h
+++ b/include/linux/ns/ns_common_types.h
@@ -116,6 +116,9 @@ struct ns_common {
 	struct dentry *stashed;
 	const struct proc_ns_operations *ops;
 	unsigned int inum;
+#ifdef CONFIG_SECURITY
+	void *ns_security;
+#endif
 	union {
 		struct ns_tree;
 		struct rcu_head ns_rcu;
diff --git a/include/linux/security.h b/include/linux/security.h
index 41d7367cf403..8865f46cc3a9 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -67,6 +67,7 @@ enum fs_value_type;
 struct watch;
 struct watch_notification;
 struct lsm_ctx;
+struct nsset;
 
 /* Default (no) options for the capable function */
 #define CAP_OPT_NONE 0x0
@@ -80,6 +81,7 @@ struct lsm_ctx;
 
 struct ctl_table;
 struct audit_krule;
+struct ns_common;
 struct user_namespace;
 struct timezone;
 
@@ -540,6 +542,9 @@ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
 void security_task_to_inode(struct task_struct *p, struct inode *inode);
 int security_create_user_ns(const struct cred *cred);
+int security_namespace_init(struct ns_common *ns);
+void security_namespace_free(struct ns_common *ns);
+int security_namespace_install(const struct nsset *nsset, struct ns_common *ns);
 int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
 void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop);
 int security_msg_msg_alloc(struct msg_msg *msg);
@@ -1430,6 +1435,21 @@ static inline int security_create_user_ns(const struct cred *cred)
 	return 0;
 }
 
+static inline int security_namespace_init(struct ns_common *ns)
+{
+	return 0;
+}
+
+static inline void security_namespace_free(struct ns_common *ns)
+{
+}
+
+static inline int security_namespace_install(const struct nsset *nsset,
+					     struct ns_common *ns)
+{
+	return 0;
+}
+
 static inline int security_ipc_permission(struct kern_ipc_perm *ipcp,
 					  short flag)
 {
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index a25e38d1c874..ea0f0267d90f 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -55,6 +55,7 @@ enum init_ns_ino {
 	MNT_NS_INIT_INO		= 0xEFFFFFF8U,
 #ifdef __KERNEL__
 	MNT_NS_ANON_INO		= 0xEFFFFFF7U,
+	MNT_NS_INO_SPECIAL_MAX	= MNT_NS_ANON_INO,
 #endif
 };
 
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index 3166c1fd844a..e72426bba29a 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -4,6 +4,7 @@
 #include <linux/ns_common.h>
 #include <linux/nstree.h>
 #include <linux/proc_ns.h>
+#include <linux/security.h>
 #include <linux/user_namespace.h>
 #include <linux/vfsdebug.h>
 
@@ -59,6 +60,9 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 
 	refcount_set(&ns->__ns_ref, 1);
 	ns->stashed = NULL;
+#ifdef CONFIG_SECURITY
+	ns->ns_security = NULL;
+#endif
 	ns->ops = ops;
 	ns->ns_id = 0;
 	ns->ns_type = ns_type;
@@ -77,6 +81,14 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 		ret = proc_alloc_inum(&ns->inum);
 	if (ret)
 		return ret;
+
+	ret = security_namespace_init(ns);
+	if (ret) {
+		if (!inum)
+			proc_free_inum(ns->inum);
+		return ret;
+	}
+
 	/*
 	 * Tree ref starts at 0. It's incremented when namespace enters
 	 * active use (installed in nsproxy) and decremented when all
@@ -91,7 +103,10 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 
 void __ns_common_free(struct ns_common *ns)
 {
-	proc_free_inum(ns->inum);
+	security_namespace_free(ns);
+
+	if (ns->inum > MNT_NS_INO_SPECIAL_MAX)
+		proc_free_inum(ns->inum);
 }
 
 struct ns_common *__must_check ns_owner(struct ns_common *ns)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d9d3d5973bf5..0f1b208d8eef 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -385,6 +385,12 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
 
 static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
 {
+	int ret;
+
+	ret = security_namespace_install(nsset, ns);
+	if (ret)
+		return ret;
+
 	return ns->ops->install(nsset, ns);
 }
 
diff --git a/security/lsm_init.c b/security/lsm_init.c
index 7c0fd17f1601..dcd2a228c4f6 100644
--- a/security/lsm_init.c
+++ b/security/lsm_init.c
@@ -303,6 +303,7 @@ static void __init lsm_prepare(struct lsm_info *lsm)
 	lsm_blob_size_update(&blobs->lbs_ipc, &blob_sizes.lbs_ipc);
 	lsm_blob_size_update(&blobs->lbs_key, &blob_sizes.lbs_key);
 	lsm_blob_size_update(&blobs->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
+	lsm_blob_size_update(&blobs->lbs_ns, &blob_sizes.lbs_ns);
 	lsm_blob_size_update(&blobs->lbs_perf_event,
 			     &blob_sizes.lbs_perf_event);
 	lsm_blob_size_update(&blobs->lbs_sock, &blob_sizes.lbs_sock);
@@ -450,6 +451,7 @@ int __init security_init(void)
 		lsm_pr("blob(ipc) size %d\n", blob_sizes.lbs_ipc);
 		lsm_pr("blob(key) size %d\n", blob_sizes.lbs_key);
 		lsm_pr("blob(msg_msg)_size %d\n", blob_sizes.lbs_msg_msg);
+		lsm_pr("blob(ns) size %d\n", blob_sizes.lbs_ns);
 		lsm_pr("blob(sock) size %d\n", blob_sizes.lbs_sock);
 		lsm_pr("blob(superblock) size %d\n", blob_sizes.lbs_superblock);
 		lsm_pr("blob(perf_event) size %d\n", blob_sizes.lbs_perf_event);
diff --git a/security/security.c b/security/security.c
index 4e999f023651..21cc45d4bbd0 100644
--- a/security/security.c
+++ b/security/security.c
@@ -26,6 +26,7 @@
 #include <linux/string.h>
 #include <linux/xattr.h>
 #include <linux/msg.h>
+#include <linux/ns_common.h>
 #include <linux/overflow.h>
 #include <linux/perf_event.h>
 #include <linux/fs.h>
@@ -381,6 +382,19 @@ static int lsm_superblock_alloc(struct super_block *sb)
 			      GFP_KERNEL);
 }
 
+/**
+ * lsm_ns_alloc - allocate a composite namespace blob
+ * @ns: the namespace that needs a blob
+ *
+ * Allocate the namespace blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_ns_alloc(struct ns_common *ns)
+{
+	return lsm_blob_alloc(&ns->ns_security, blob_sizes.lbs_ns, GFP_KERNEL);
+}
+
 /**
  * lsm_fill_user_ctx - Fill a user space lsm_ctx structure
  * @uctx: a userspace LSM context to be filled
@@ -3358,6 +3372,69 @@ int security_create_user_ns(const struct cred *cred)
 	return call_int_hook(userns_create, cred);
 }
 
+/**
+ * security_namespace_init() - Initialize LSM security data for a namespace
+ * @ns: the namespace being initialized
+ *
+ * Initialize the LSM security blob attached to the namespace. The namespace type
+ * is available via ns->ns_type, and the owning user namespace (if any)
+ * via ns->ops->owner(ns).
+ *
+ * Return: Returns 0 if successful, otherwise < 0 error code.
+ */
+int security_namespace_init(struct ns_common *ns)
+{
+	int rc;
+
+	rc = lsm_ns_alloc(ns);
+	if (unlikely(rc))
+		return rc;
+
+	rc = call_int_hook(namespace_init, ns);
+	if (unlikely(rc))
+		security_namespace_free(ns);
+
+	return rc;
+}
+
+/**
+ * security_namespace_free() - Release LSM security data from a namespace
+ * @ns: the namespace being freed
+ *
+ * Release security data attached to the namespace. Called before the
+ * namespace structure is freed.
+ *
+ * Note: If an LSM exposes data within the security blob to concurrent
+ * RCU readers, it must use RCU-safe freeing for that data.  The blob
+ * memory itself is released with kfree() after the namespace_free
+ * hooks return.
+ */
+void security_namespace_free(struct ns_common *ns)
+{
+	if (!ns->ns_security)
+		return;
+
+	call_void_hook(namespace_free, ns);
+
+	kfree(ns->ns_security);
+	ns->ns_security = NULL;
+}
+
+/**
+ * security_namespace_install() - Check permission to install a namespace
+ * @nsset: the target nsset being configured
+ * @ns: the namespace being installed
+ *
+ * Check permission before allowing a namespace to be installed into the
+ * process's set of namespaces via setns(2).
+ *
+ * Return: Returns 0 if permission is granted, otherwise < 0 error code.
+ */
+int security_namespace_install(const struct nsset *nsset, struct ns_common *ns)
+{
+	return call_int_hook(namespace_install, nsset, ns);
+}
+
 /**
  * security_ipc_permission() - Check if sysv ipc access is allowed
  * @ipcp: ipc permission structure
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 2/9] security: Add LSM_AUDIT_DATA_NS for namespace audit records
From: Mickaël Salaün @ 2026-05-27 18:11 UTC (permalink / raw)
  To: Christian Brauner, Günther Noack, Paul Moore,
	Serge E . Hallyn
  Cc: Mickaël Salaün, Daniel Durning, Jonathan Corbet,
	Justin Suess, Lennart Poettering, Mikhail Ivanov,
	Nicolas Bouchinet, Shervin Oloumi, Tingmao Wang, kernel-team,
	linux-fsdevel, linux-kernel, linux-security-module
In-Reply-To: <20260527181127.879771-1-mic@digikod.net>

Add a new LSM audit data type LSM_AUDIT_DATA_NS that logs namespace
information in audit records.  Two fields are provided:

- ns_type: the CLONE_NEW* flag identifying the namespace type, logged
  in hexadecimal as namespace_type=.

- ns_id: the unique 64-bit namespace identifier, logged in decimal as
  namespace_id= and retrievable from userspace via NS_GET_ID or
  listns(2).  Unlike the proc inode number (inum), ns_id is never
  recycled.  For namespace creation denials, ns_id is 0 because the
  namespace does not exist yet.

A new audit data type is needed because no existing LSM_AUDIT_DATA_*
type carries namespace information.  The closest alternatives (e.g.
LSM_AUDIT_DATA_TASK or LSM_AUDIT_DATA_NONE with custom strings) would
either lose the namespace type or require ad-hoc formatting that
bypasses the structured audit data union.

Cc: Günther Noack <gnoack@google.com>
Cc: Paul Moore <paul@paul-moore.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Günther Noack <gnoack@google.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---

Changes since v1:
https://lore.kernel.org/r/20260312100444.2609563-3-mic@digikod.net
- Replace inum with ns_id in the audit record: ns_id is the stable
  64-bit namespace identifier (never recycled), accessible to
  userspace via NS_GET_ID and listns(2) (suggested by Christian
  Brauner).
- Add Reviewed-by: Christian Brauner.
- Add Reviewed-by: Günther Noack.
---
 include/linux/lsm_audit.h | 5 +++++
 security/lsm_audit.c      | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 584db296e43b..526a8e7471c8 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -78,6 +78,7 @@ struct common_audit_data {
 #define LSM_AUDIT_DATA_NOTIFICATION 16
 #define LSM_AUDIT_DATA_ANONINODE	17
 #define LSM_AUDIT_DATA_NLMSGTYPE	18
+#define LSM_AUDIT_DATA_NS		19
 	union 	{
 		struct path path;
 		struct dentry *dentry;
@@ -100,6 +101,10 @@ struct common_audit_data {
 		int reason;
 		const char *anonclass;
 		u16 nlmsg_type;
+		struct {
+			u32 ns_type;
+			u64 ns_id;
+		} ns;
 	} u;
 	/* this union contains LSM specific data */
 	union {
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 737f5a263a8f..404ccbbbf94c 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -403,6 +403,10 @@ void audit_log_lsm_data(struct audit_buffer *ab,
 	case LSM_AUDIT_DATA_NLMSGTYPE:
 		audit_log_format(ab, " nl-msgtype=%hu", a->u.nlmsg_type);
 		break;
+	case LSM_AUDIT_DATA_NS:
+		audit_log_format(ab, " namespace_type=0x%x namespace_id=%llu",
+				 a->u.ns.ns_type, a->u.ns.ns_id);
+		break;
 	} /* switch (a->type) */
 }
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 9/9] landlock: Add documentation for capability and namespace restrictions
From: Mickaël Salaün @ 2026-05-27 18:11 UTC (permalink / raw)
  To: Christian Brauner, Günther Noack, Paul Moore,
	Serge E . Hallyn
  Cc: Mickaël Salaün, Daniel Durning, Jonathan Corbet,
	Justin Suess, Lennart Poettering, Mikhail Ivanov,
	Nicolas Bouchinet, Shervin Oloumi, Tingmao Wang, kernel-team,
	linux-fsdevel, linux-kernel, linux-security-module
In-Reply-To: <20260527181127.879771-1-mic@digikod.net>

Document the two new Landlock permission categories in the userspace API
guide, admin guide, and kernel security documentation.

The userspace API guide adds sections on capability restriction
(LANDLOCK_PERM_CAPABILITY_USE with LANDLOCK_RULE_CAPABILITY) and
namespace restriction (LANDLOCK_PERM_NAMESPACE_USE with
LANDLOCK_RULE_NAMESPACE, covering creation, entry, and fd-reference
acquisition), the backward-compatible degradation pattern for ABI < 10,
and the per-namespace-type capability requirements.

The admin guide adds the new perm.namespace_use and perm.capability_use
audit blocker names with their object identification fields
(namespace_type, namespace_id, capability).

The kernel security documentation adds a "Ruleset restriction models"
section defining the three models (handled_access_*, handled_perm,
scoped), their coverage and compatibility properties, and the criteria
for choosing between them for future features.  It also documents
composability with user namespaces and adds kernel-doc references for
the new capability and namespace headers.

Cc: Christian Brauner <brauner@kernel.org>
Cc: Günther Noack <gnoack@google.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---

Changes since v1:
https://lore.kernel.org/r/20260312100444.2609563-12-mic@digikod.net

The userspace API and security guides were revamped to match the v2
permission model: the previous chokepoints/gateways prose is replaced
with the per-object (handled_access_*) versus per-category
(handled_perm) framing, and a new Design philosophy section in the
security guide states Landlock's principle (data, processes, kernel
resources).

- Rename namespace_inum to namespace_id in audit field documentation
  to match the renamed audit field.
- Rename LANDLOCK_PERM_NAMESPACE_ENTER references to
  LANDLOCK_PERM_NAMESPACE_USE (companion change to the introducing
  commit), and enumerate the seven kernel paths it gates in the
  userspace API guide (membership via unshare/clone/clone3/setns; fd
  reference via open_tree/fsmount).
- Clarify that LANDLOCK_PERM_NAMESPACE_USE gates *acquisition* of
  namespace associations only (namespaces the process is already a
  member of when the domain is enforced are implicitly allowed) and
  that LANDLOCK_PERM_CAPABILITY_USE gates every exercise of a
  capability after the domain is enforced, regardless of how the
  capability was obtained.
- Document the rationale for accepting (rather than rejecting)
  unknown category member values in rule bodies: rejection would tie
  Landlock policy semantics to the running kernel's category-member
  set, making cross-kernel policies brittle.  Acceptance is fail-safe
  in both directions and lets a policy activate as written when a
  value becomes real on a future kernel.
- Replace handled_perm = 0 with a per-bit mask in the userspace API
  guide's ABI compat fall-through, so future ABI extensions adding
  new LANDLOCK_PERM_* bits do not get stripped on the path that
  drops the v10 bits.
- Add a bridging sentence in the per-category permissions section
  of Documentation/security/landlock.rst contrasting per-category
  permissions with per-object access rights: per-category gates the
  prerequisite operation itself rather than restricting specific
  operations on a single resource instance (suggested by Günther
  Noack).
- Disambiguate the orthogonality invariant in
  Documentation/security/landlock.rst from the UAPI scoped field
  ("all new scoped features" -> "all Landlock access controls";
  suggested by Justin Suess).
- Add an introductory paragraph in
  Documentation/userspace-api/landlock.rst contrasting
  LANDLOCK_PERM_CAPABILITY_USE with PR_SET_NO_NEW_PRIVS: NNP is the
  broader mechanism that blocks privilege acquisition via execve(2),
  while CAPABILITY_USE restricts the exercise of capabilities the
  process already holds (including those gained via CLONE_NEWUSER,
  which NNP does not block); sandboxes typically set both
  (suggested by Justin Suess).
- Disambiguate "category": object-side uses "object type" / "resource
  kind"; "category" stays for the per-category permissions model.
---
 Documentation/admin-guide/LSM/landlock.rst |  19 +-
 Documentation/security/landlock.rst        | 151 +++++++++++++-
 Documentation/userspace-api/landlock.rst   | 216 +++++++++++++++++++--
 3 files changed, 367 insertions(+), 19 deletions(-)

diff --git a/Documentation/admin-guide/LSM/landlock.rst b/Documentation/admin-guide/LSM/landlock.rst
index 9923874e2156..58ac5ae2f5f3 100644
--- a/Documentation/admin-guide/LSM/landlock.rst
+++ b/Documentation/admin-guide/LSM/landlock.rst
@@ -6,7 +6,7 @@ Landlock: system-wide management
 ================================
 
 :Author: Mickaël Salaün
-:Date: January 2026
+:Date: May 2026
 
 Landlock can leverage the audit framework to log events.
 
@@ -59,14 +59,25 @@ AUDIT_LANDLOCK_ACCESS
         - scope.abstract_unix_socket - Abstract UNIX socket connection denied
         - scope.signal - Signal sending denied
 
+    **perm.*** - Permission restrictions (ABI 10+):
+        - perm.namespace_use - Namespace entry was denied (creation via
+          :manpage:`unshare(2)` / :manpage:`clone(2)` or joining via
+          :manpage:`setns(2)`);
+          ``namespace_type`` indicates the type (hex CLONE_NEW* bitmask),
+          ``namespace_id`` identifies the target namespace for
+          :manpage:`setns(2)` operations
+        - perm.capability_use - Capability use was denied;
+          ``capability`` indicates the capability number
+
     Multiple blockers can appear in a single event (comma-separated) when
     multiple access rights are missing. For example, creating a regular file
     in a directory that lacks both ``make_reg`` and ``refer`` rights would show
     ``blockers=fs.make_reg,fs.refer``.
 
-    The object identification fields (path, dev, ino for filesystem; opid,
-    ocomm for signals) depend on the type of access being blocked and provide
-    context about what resource was involved in the denial.
+    The object identification fields depend on the type of access being blocked:
+    ``path``, ``dev``, ``ino`` for filesystem; ``opid``, ``ocomm`` for signals;
+    ``namespace_type`` and ``namespace_id`` for namespace operations;
+    ``capability`` for capability use.
 
 
 AUDIT_LANDLOCK_DOMAIN
diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst
index c5186526e76f..2b6e4be42893 100644
--- a/Documentation/security/landlock.rst
+++ b/Documentation/security/landlock.rst
@@ -7,7 +7,7 @@ Landlock LSM: kernel documentation
 ==================================
 
 :Author: Mickaël Salaün
-:Date: March 2026
+:Date: May 2026
 
 Landlock's goal is to create scoped access-control (i.e. sandboxing).  To
 harden a whole system, this feature should be available to any process,
@@ -129,6 +129,143 @@ The reasoning is:
   restrictions, because access within the same scope is already
   allowed based on ``LANDLOCK_ACCESS_FS_RESOLVE_UNIX``.
 
+Composability with user namespaces
+----------------------------------
+
+Landlock domain-based scoping and the kernel's user namespace-based capability
+scoping enforce isolation over independent hierarchies.  Landlock checks domain
+ancestry; the kernel's ``ns_capable()`` checks user namespace ancestry.  These
+hierarchies are orthogonal: Landlock enforcement is deterministic with respect
+to its own configuration, regardless of namespace or capability state, and vice
+versa.  This orthogonality is a design invariant that must hold for all Landlock
+access controls.
+
+Design philosophy
+-----------------
+
+Landlock's goal is to restrict a sandboxed process's access to three kinds of
+resources: data (files, sockets, pipes), other processes (signals, ptrace), and
+kernel-internal resources whose use widens the kernel attack surface
+(capabilities, namespace types).  Each access right or permission gates one or
+more operations that grant such access; restricting the operations is how
+Landlock restricts the underlying access.
+
+When designing a new access control, identify the protected resource kind
+first (data, processes, or kernel-internal resources).  The operation set
+follows from the protected resource: which kernel paths grant access to it, and
+at which moment those paths can be gated.  Do not design a permission around
+"restrict the unshare(2) syscall" or similar mechanism-centric framings; design
+it around "restrict the process from acquiring access to namespace types" (the
+protected resource), letting the operation set follow.
+
+Ruleset restriction models
+--------------------------
+
+Landlock provides three restriction models that differ in how rules identify the
+resource being restricted.
+
+Per-object access rights (``handled_access_*``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Per-object access rights control operations on a specific resource instance,
+identified in the rule key by a value drawn from an open-ended space: a file
+hierarchy referenced by ``parent_fd``, or a network port identified by its
+16-bit number.  Each ``handled_access_*`` field declares a set of access rights
+that the ruleset restricts.  The rule body declares which of the multiple
+distinct operations on that object instance are allowed (open, read, write,
+truncate; bind, connect).  New operations on an existing rule type extend the
+corresponding ``handled_access_*`` field (e.g. a new filesystem operation
+extends ``handled_access_fs``).  A new object type with multiple fine-grained
+operations would use a new ``handled_access_*`` field.
+
+Per-category permissions (``handled_perm``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Per-category permissions control the process's exercise of category members,
+where the category is a small kernel-defined enumeration (a Linux capability
+number ``CAP_*``, a namespace type ``CLONE_NEW*``).  Unlike per-object access
+rights, which restrict specific operations on a single resource instance,
+per-category permissions gate the prerequisite operation itself (exercising a
+capability, acquiring a namespace), so gating it transitively covers a broad set
+of downstream operations.  These category members are the LSM-level
+access-control objects (the entities the process is authorized against) even
+though they are enum values rather than externally-instantiated kernel data
+structures.  Per-category permissions apply where the controlled operation
+collapses to "may the process use this category member at all" (use a
+capability; acquire a namespace), so the rule body lists which category members
+the process may exercise; each ``LANDLOCK_PERM_*`` flag maps to its own rule
+type and covers every kernel path that exercises a member.  When a ruleset
+handles a permission, all uses of category members are denied unless explicitly
+allowed by a rule.  See Documentation/userspace-api/landlock.rst for the
+concrete syscall paths covered by each permission.
+
+The category enum is owned by the corresponding kernel subsystem (capabilities,
+namespaces, etc.).  Userspace policy authors query category member availability
+via the relevant non-Landlock interfaces:
+
+* For capabilities: ``<linux/capability.h>``,
+  ``/proc/sys/kernel/cap_last_cap``, ``prctl(PR_CAPBSET_READ)``.
+* For namespaces: ``<linux/sched.h>``, ``/proc/$$/ns/*``,
+  :manpage:`unshare(2)` runtime probe.
+
+The Landlock ABI version does not encode this availability; ABI versioning
+describes which Landlock features (rule types, access rights, scopes,
+permissions) the kernel implements, not which category members the kernel knows
+about.
+
+Forward compatibility for new category members follows a simple rule set:
+
+* New members in future kernels are automatically denied: rules allow only
+  specific values, and a member not in any rule is denied.
+* Kernel-side compatibility for split categories is handled by the owning
+  subsystem (e.g., when ``CAP_BPF`` was split from ``CAP_SYS_ADMIN``, the
+  kernel kept checking either capability, so a ruleset that allows neither
+  keeps denying operations gated by ``CAP_SYS_ADMIN || CAP_BPF`` patterns).
+* Unknown values in the rule body are silently accepted rather than rejected.
+  Rejecting them would tie Landlock policy semantics to the running kernel's
+  category-member set: a rule built against future headers would fail to load
+  on older kernels, forcing policy authors to know each kernel's enumeration.
+  Acceptance is fail-safe in both directions: a rule referring to a value the
+  running kernel does not yet know has no effect (deny-by-default still applies
+  to that operation), and a rule written against future headers loads
+  identically across kernels so the same policy keeps the same restrictions.
+  When a value becomes real on a future kernel, the policy activates as written
+  by the author.
+* In contrast, unknown ``LANDLOCK_PERM_*`` flags in ``handled_perm`` are
+  rejected (``-EINVAL``), since Landlock owns that bit space.
+
+Cross-domain scopes (``scoped``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Scopes restrict **cross-domain interactions** categorically, without rules.
+Setting a scope flag (e.g. ``LANDLOCK_SCOPE_SIGNAL``) denies the operation to
+targets outside the Landlock domain or its children.  Like per-category
+permissions, scopes provide complete coverage of the controlled operation.
+
+Choosing a model for a new feature
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* If the new feature controls operations on resource objects supplied by the
+  sandbox author, extend or add a per-object access right
+  (``handled_access_*``).
+* If the new feature controls a per-category operation gated by an enum (a
+  Linux capability, a namespace type, a socket family, etc.), use a
+  per-category permission (``handled_perm``).  When several such enums could
+  classify the operation, prefer the enum the originating subsystem already
+  uses for capability/access checks (e.g. ``CAP_*`` for ``capable()`` hooks,
+  ``CLONE_NEW*`` for namespace hooks).
+* When an operation is gated by multiple kernel-defined enums (a classic
+  example being ``CAP_SYS_ADMIN`` plus a ``CLONE_NEW*`` flag for non-user
+  namespace creation), define one per-category permission per enum dimension.
+  Sandbox authors handle each dimension's permission in ``handled_perm`` and
+  add rules for each; the kernel enforces each dimension at its own LSM hook.
+  ``LANDLOCK_PERM_NAMESPACE_USE`` and ``LANDLOCK_PERM_CAPABILITY_USE`` follow
+  this pattern.
+* If the new feature restricts a categorical cross-domain interaction with no
+  per-target granularity, use a cross-domain scope (``scoped``).
+* For all three models, confirm a single LSM hook (or small set of related
+  hooks) covers every kernel path that exercises the operation.
+
 Tests
 =====
 
@@ -150,6 +287,18 @@ Filesystem
 .. kernel-doc:: security/landlock/fs.h
     :identifiers:
 
+Namespace
+---------
+
+.. kernel-doc:: security/landlock/ns.h
+    :identifiers:
+
+Capability
+----------
+
+.. kernel-doc:: security/landlock/cap.h
+    :identifiers:
+
 Process credential
 ------------------
 
diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
index 45861fa75685..45548d1666fa 100644
--- a/Documentation/userspace-api/landlock.rst
+++ b/Documentation/userspace-api/landlock.rst
@@ -29,20 +29,29 @@ If Landlock is not currently supported, we need to
 Landlock rules
 ==============
 
-A Landlock rule describes an action on an object which the process intends to
-perform.  A set of rules is aggregated in a ruleset, which can then restrict
-the thread enforcing it, and its future children.
+A Landlock rule describes the actions a process is allowed to perform on a
+specific resource.  A set of rules is aggregated in a ruleset, which can then
+restrict the thread enforcing it, and its future children.
 
-The two existing types of rules are:
+The existing types of rules are:
 
 Filesystem rules
-    For these rules, the object is a file hierarchy,
-    and the related filesystem actions are defined with
-    `filesystem access rights`.
+    The rule key is a file hierarchy, and the actions it allows are
+    defined with `filesystem access rights`.
 
 Network rules (since ABI v4)
-    For these rules, the object is a TCP port,
-    and the related actions are defined with `network access rights`.
+    The rule key is a TCP port, and the actions it allows are defined with
+    `network access rights`.
+
+Capability rules (since ABI v10)
+    The rule body lists which members of the Linux capability category
+    the process may exercise; the action is defined with `permission
+    flags`.
+
+Namespace rules (since ABI v10)
+    The rule body lists which members of the namespace-type
+    category the process may use; the action is defined with `permission
+    flags`.
 
 Defining and enforcing a security policy
 ----------------------------------------
@@ -85,6 +94,9 @@ to be explicit about the denied-by-default access rights.
         .scoped =
             LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET |
             LANDLOCK_SCOPE_SIGNAL,
+        .handled_perm =
+            LANDLOCK_PERM_CAPABILITY_USE |
+            LANDLOCK_PERM_NAMESPACE_USE,
     };
 
 Because we may not know which kernel version an application will be executed
@@ -132,6 +144,11 @@ version, and only use the available subset of access rights:
     case 6 ... 8:
         /* Removes LANDLOCK_ACCESS_FS_RESOLVE_UNIX for ABI < 9 */
         ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_RESOLVE_UNIX;
+        __attribute__((fallthrough));
+    case 9:
+        /* Removes LANDLOCK_PERM_* for ABI < 10 */
+        ruleset_attr.handled_perm &= ~(LANDLOCK_PERM_NAMESPACE_USE |
+                                       LANDLOCK_PERM_CAPABILITY_USE);
     }
 
 This enables the creation of an inclusive ruleset that will contain our rules.
@@ -202,6 +219,53 @@ number for a specific action: HTTPS connections.
         err = landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT,
                                 &net_port, 0);
 
+Capability and namespace rules use a different attribute layout:
+``allowed_perm`` identifies the permission category (a single
+``LANDLOCK_PERM_*`` flag) and a type-specific value field carries the bitmask to
+allow within it.  See `Capability and namespace restrictions`_ for the model.
+
+For capability access-control, we can add rules that allow specific
+capabilities.  For instance, to allow ``CAP_SYS_CHROOT`` (so the sandboxed
+process can call :manpage:`chroot(2)` inside a user namespace):
+
+.. code-block:: c
+
+    struct landlock_capability_attr cap_attr = {
+        .allowed_perm = LANDLOCK_PERM_CAPABILITY_USE,
+        .capabilities = (1ULL << CAP_SYS_CHROOT),
+    };
+
+    cap_attr.allowed_perm &= ruleset_attr.handled_perm;
+    if (cap_attr.allowed_perm)
+        err = landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+                                &cap_attr, 0);
+
+For namespace access-control, we can add rules that allow entering specific
+namespace types (creating them via :manpage:`unshare(2)` / :manpage:`clone(2)` /
+:manpage:`clone3(2)`, joining them via :manpage:`setns(2)`, or acquiring an fd
+reference via :manpage:`open_tree(2)` / :manpage:`fsmount(2)`).  For instance,
+to allow creating user namespaces (which grants all capabilities inside the new
+namespace):
+
+.. code-block:: c
+
+    struct landlock_namespace_attr ns_attr = {
+        .allowed_perm = LANDLOCK_PERM_NAMESPACE_USE,
+        .namespace_types = CLONE_NEWUSER,
+    };
+
+    ns_attr.allowed_perm &= ruleset_attr.handled_perm;
+    if (ns_attr.allowed_perm)
+        err = landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NAMESPACE,
+                                &ns_attr, 0);
+
+Together, these two rules allow an unprivileged process to create a user
+namespace and call :manpage:`chroot(2)` inside it, while denying all other
+capabilities and namespace types.  User namespace creation is the one namespace
+operation that does not require ``CAP_SYS_ADMIN``, so no capability rule is
+needed for it.  See `Capability and namespace restrictions`_ for details on
+capability requirements.
+
 When passing a non-zero ``flags`` argument to ``landlock_restrict_self()``, a
 similar backwards compatibility check is needed for the restrict flags
 (see sys_landlock_restrict_self() documentation for available flags):
@@ -380,9 +444,115 @@ The operations which can be scoped are:
     A :manpage:`sendto(2)` on a socket which was previously connected will not
     be restricted.  This works for both datagram and stream sockets.
 
-IPC scoping does not support exceptions via :manpage:`landlock_add_rule(2)`.
-If an operation is scoped within a domain, no rules can be added to allow access
-to resources or processes outside of the scope.
+Scoping does not support exceptions via :manpage:`landlock_add_rule(2)`.  If an
+operation is scoped within a domain, no rules can be added to allow access to
+resources or processes outside of the scope.
+
+Capability and namespace restrictions
+-------------------------------------
+
+``handled_perm`` declares per-category permissions: each permission selects
+which members of a kernel-defined category (CAP_* capabilities, CLONE_NEW*
+namespace types) the process may use.  Unlike per-object access rights
+(``handled_access_*``) or cross-domain scopes (``scoped``), per-category
+permissions constrain the sandboxed process's own use of these enums; members
+not allowed by a rule are denied by default.
+
+``LANDLOCK_PERM_NAMESPACE_USE`` gates *acquisition* of namespace
+associations: creation via :manpage:`unshare(2)` / :manpage:`clone(2)`
+/ :manpage:`clone3(2)`, entry via :manpage:`setns(2)`, and fd-reference
+acquisition via :manpage:`open_tree(2)` / :manpage:`fsmount(2)`.  Namespaces
+the process is already a member of when the domain is enforced are implicitly
+allowed (the process could not continue running otherwise); rules describe which
+new namespace types the process may acquire.  ``LANDLOCK_PERM_CAPABILITY_USE``
+gates every exercise of a capability after the domain is enforced, regardless
+of how the capability was obtained (inherited credentials, ``CLONE_NEWUSER``
+grant, ``setuid``/file-cap-bearing :manpage:`execve(2)`, etc.).  Configuring
+both together restricts what privileges are available *and* the namespaces in
+which they take effect, which matters because user namespace creation has no
+capability check and grants all capabilities within the new namespace: gating
+only one of the two leaves open a path that widens the kernel attack surface.
+
+``LANDLOCK_PERM_CAPABILITY_USE`` complements :manpage:`prctl(2)`
+``PR_SET_NO_NEW_PRIVS`` but does not replace it.  ``PR_SET_NO_NEW_PRIVS``
+prevents privilege *acquisition* via :manpage:`execve(2)` (setuid, file
+capability xattrs, privilege-elevating LSM transitions) and is a prerequisite
+for unprivileged Landlock self-sandboxing.  ``LANDLOCK_PERM_CAPABILITY_USE``
+restricts *exercise* of capabilities the process already holds, including those
+gained via ``CLONE_NEWUSER`` which ``PR_SET_NO_NEW_PRIVS`` does not block.
+Sandboxes typically set both.
+
+Rules are added with ``LANDLOCK_RULE_CAPABILITY`` and &struct
+landlock_capability_attr (each rule lists ``CAP_*`` values to allow), and with
+``LANDLOCK_RULE_NAMESPACE`` and &struct landlock_namespace_attr (each rule
+lists ``CLONE_NEW*`` flags to allow).  Landlock is purely restrictive: it can
+only deny what the traditional check would have allowed, never grant additional
+privileges.
+
+Rule bodies silently accept values unknown to the current kernel (capabilities
+above ``CAP_LAST_CAP``, unrecognised ``CLONE_NEW*`` bits): they have no runtime
+effect, so a rule compiled against future kernel headers loads without error on
+older kernels.  Members added by future kernels are denied by default until a
+rule explicitly allows them.
+
+The single ``LANDLOCK_PERM_NAMESPACE_USE`` bit gates every kernel path that
+grants the calling process access to a namespace of the controlled types,
+whether by becoming a member of the namespace or by holding a file descriptor
+that references it.  The covered syscall paths are:
+
+* :manpage:`unshare(2)` with ``CLONE_NEW*``: the caller becomes a member of a
+  newly-created namespace.
+* :manpage:`clone(2)` (or :manpage:`clone3(2)`) with ``CLONE_NEW*``: the
+  child becomes a member of a newly-created namespace.
+* :manpage:`setns(2)`: the caller becomes a member of an existing namespace
+  referenced by file descriptor.
+* :manpage:`open_tree(2)` with ``OPEN_TREE_NAMESPACE``: the caller obtains a
+  file descriptor referring to a newly-created mount namespace.
+* :manpage:`open_tree(2)` with ``OPEN_TREE_CLONE``: the caller obtains a file
+  descriptor referring to a newly-created anonymous mount namespace.
+* :manpage:`fsmount(2)` with ``FSMOUNT_NAMESPACE``: the caller obtains a file
+  descriptor referring to a newly-created mount namespace.
+* :manpage:`fsmount(2)` (default): the caller obtains a file descriptor
+  referring to a newly-created anonymous mount namespace.
+
+Anonymous mount namespaces (created by ``open_tree(OPEN_TREE_CLONE)`` and the
+default :manpage:`fsmount(2)`) are intentionally covered by the bit even though
+the calling process does not become a member of them.  Without this coverage, a
+sandboxed process could combine ``open_tree(OPEN_TREE_CLONE)`` with
+:manpage:`move_mount(2)` to graft mounts from a freshly-allocated mount
+namespace into its current namespace, bypassing the policy.
+
+In practice, unprivileged processes first create a user namespace (which
+requires no capability and grants all capabilities within it), then use those
+capabilities to create other namespace types.  All non-user namespace types
+require ``CAP_SYS_ADMIN`` for both creation and :manpage:`setns(2)` entry; mount
+namespace entry additionally requires ``CAP_SYS_CHROOT``.  For
+:manpage:`setns(2)`, capabilities are checked relative to the target namespace,
+so a process in an ancestor user namespace naturally satisfies them; this
+includes joining user namespaces, which requires ``CAP_SYS_ADMIN``.  When
+``LANDLOCK_PERM_CAPABILITY_USE`` is also handled, each of these capabilities
+must be explicitly allowed by a rule.
+
+When combining ``CLONE_NEWUSER`` with other ``CLONE_NEW*`` flags in a single
+:manpage:`unshare(2)` call, the ``CAP_SYS_ADMIN`` check targets the newly
+created user namespace, which is handled by ``LANDLOCK_PERM_NAMESPACE_USE``
+independently from ``LANDLOCK_PERM_CAPABILITY_USE``.  Performing the user
+namespace creation and the additional namespace creation in two separate
+:manpage:`unshare(2)` calls requires a rule allowing ``CAP_SYS_ADMIN`` if the
+domain also handles ``LANDLOCK_PERM_CAPABILITY_USE``.
+
+When creating child user namespaces, it is recommended to also create a
+dedicated Landlock domain with restrictions relevant to each namespace context.
+
+Note that ``LANDLOCK_PERM_CAPABILITY_USE`` restricts the *use* of capabilities,
+not their presence in the process's credential.  Capability sets can change
+after a domain is enforced through user namespace entry or :manpage:`capset(2)`;
+privileged sandboxes that did not set ``PR_SET_NO_NEW_PRIVS`` may also gain
+capabilities through :manpage:`execve(2)` of binaries with file capabilities.
+In all cases, :manpage:`capget(2)` will report the credential's capability sets,
+but any operation requiring a denied capability will fail with ``EPERM``.  Do
+not rely on :manpage:`capget(2)` to determine whether the policy permits a given
+capability; a denial only shows up as ``EPERM`` from the actual operation.
 
 Truncating files
 ----------------
@@ -545,7 +715,7 @@ Access rights
 -------------
 
 .. kernel-doc:: include/uapi/linux/landlock.h
-    :identifiers: fs_access net_access scope
+    :identifiers: fs_access net_access scope perm
 
 Creating a new ruleset
 ----------------------
@@ -564,7 +734,8 @@ Extending a ruleset
 
 .. kernel-doc:: include/uapi/linux/landlock.h
     :identifiers: landlock_rule_type landlock_path_beneath_attr
-                  landlock_net_port_attr
+                  landlock_net_port_attr landlock_capability_attr
+                  landlock_namespace_attr
 
 Enforcing a ruleset
 -------------------
@@ -722,6 +893,23 @@ Starting with the Landlock ABI version 9, it is possible to restrict
 connections to pathname UNIX domain sockets (:manpage:`unix(7)`) using
 the new ``LANDLOCK_ACCESS_FS_RESOLVE_UNIX`` right.
 
+Capability restriction (ABI < 10)
+---------------------------------
+
+Starting with the Landlock ABI version 10, it is possible to restrict
+:manpage:`capabilities(7)` with the new ``LANDLOCK_PERM_CAPABILITY_USE``
+permission flag and ``LANDLOCK_RULE_CAPABILITY`` rule type.
+
+Namespace restriction (ABI < 10)
+--------------------------------
+
+Starting with the Landlock ABI version 10, it is possible to restrict namespace
+use across creation (:manpage:`unshare(2)`, :manpage:`clone(2)`,
+:manpage:`clone3(2)`), entry (:manpage:`setns(2)`), and fd-reference acquisition
+(:manpage:`open_tree(2)`, :manpage:`fsmount(2)`) with the new
+``LANDLOCK_PERM_NAMESPACE_USE`` permission flag and ``LANDLOCK_RULE_NAMESPACE``
+rule type.
+
 .. _kernel_support:
 
 Kernel support
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 8/9] samples/landlock: Add capability and namespace restriction support
From: Mickaël Salaün @ 2026-05-27 18:11 UTC (permalink / raw)
  To: Christian Brauner, Günther Noack, Paul Moore,
	Serge E . Hallyn
  Cc: Mickaël Salaün, Daniel Durning, Jonathan Corbet,
	Justin Suess, Lennart Poettering, Mikhail Ivanov,
	Nicolas Bouchinet, Shervin Oloumi, Tingmao Wang, kernel-team,
	linux-fsdevel, linux-kernel, linux-security-module
In-Reply-To: <20260527181127.879771-1-mic@digikod.net>

Extend the sandboxer sample to demonstrate the new Landlock capability
and namespace restriction features.  The LL_CAP environment variable
takes a colon-delimited list of allowed capabilities, parsed with
cap_from_name(3) from libcap.  Names (e.g. "cap_sys_chroot",
"CAP_SYS_ADMIN") are accepted; numeric strings (e.g. "18") work too via
cap_from_name's internal numeric fallback.  The LL_NS variable takes a
colon-delimited list of allowed namespace types by short name (e.g.
"user:uts:net").  Add best-effort degradation for older kernels that
predate the LANDLOCK_PERM_* features.

Allow creating user and UTS namespaces but deny network namespaces
(works as an unprivileged user).  All capabilities are available (LL_CAP
is not set), but namespace creation is still restricted to the types
listed in LL_NS.  The first command succeeds because user and UTS types
are in the allowed set, and sets the hostname inside the new UTS
namespace.  The second command fails because the network namespace type
is not allowed by the LANDLOCK_PERM_NAMESPACE_USE rule:

  LL_FS_RO=/ LL_FS_RW=/proc LL_NS="user:uts" \
    ./sandboxer /bin/sh -c \
    "unshare --user --uts --map-root-user hostname sandbox \
    && ! unshare --user --net true"

Allow only user namespace creation and CAP_SYS_CHROOT, denying all other
capabilities and namespace types (works as an unprivileged user).  An
unprivileged process creates a user namespace (no capability required)
and calls chroot inside it using the CAP_SYS_CHROOT granted within the
new namespace:

  LL_FS_RO=/ LL_FS_RW="" LL_NS="user" LL_CAP="cap_sys_chroot" \
    ./sandboxer /bin/sh -c \
    "unshare --user --keep-caps chroot / true"

Cc: Christian Brauner <brauner@kernel.org>
Cc: Günther Noack <gnoack@google.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Tingmao Wang <m@maowtm.org>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---

Changes since v1:
https://lore.kernel.org/r/20260312100444.2609563-11-mic@digikod.net
- Rename LANDLOCK_PERM_NAMESPACE_ENTER references to
  LANDLOCK_PERM_NAMESPACE_USE (companion change to the introducing
  commit).
- Replace handled_perm = 0 with a per-bit mask in the ABI compat
  fall-through, mirroring the doc example so future ABI extensions
  adding new LANDLOCK_PERM_* bits do not get stripped.
- Parse LL_CAP values with cap_from_name(3) from libcap so users
  can pass capability names (e.g. "cap_sys_chroot") in addition to
  numbers.  cap_from_name accepts both: the canonical name lookup
  is case-insensitive, and a numeric-string fallback maps "18" to
  CAP_SYS_CHROOT identically to the previous numeric-only path.
  Drop the BITS_PER_TYPE workaround and the manual numeric bound
  check (cap_from_name does the right thing in both cases).  Link
  the sandboxer against libcap by adding userldlibs += -lcap in
  samples/landlock/Makefile.  Update help text and example command
  to show capability names (suggested by Günther Noack).
- Rename the LL_CAPS env var to LL_CAP for consistency with the
  singular form of all other sandboxer env vars (LL_NS, LL_FS_RO,
  LL_FS_RW, LL_TCP_BIND, LL_TCP_CONNECT, LL_SCOPED, LL_FORCE_LOG).
  Internal symbols renamed accordingly: ENV_CAPS_NAME -> ENV_CAP_NAME,
  populate_ruleset_caps() -> populate_ruleset_cap().
- Tingmao Wang's v1 Reviewed-by is not carried forward to v2: the
  cap_from_name() / libcap migration is a material implementation
  change requested by Günther Noack that was not part of the v1
  review.  Cc'd instead.
---
 samples/landlock/Makefile    |   1 +
 samples/landlock/sandboxer.c | 144 ++++++++++++++++++++++++++++++++++-
 2 files changed, 142 insertions(+), 3 deletions(-)

diff --git a/samples/landlock/Makefile b/samples/landlock/Makefile
index 5d601e51c2eb..b30239c8a281 100644
--- a/samples/landlock/Makefile
+++ b/samples/landlock/Makefile
@@ -3,6 +3,7 @@
 userprogs-always-y := sandboxer
 
 userccflags += -I usr/include
+userldlibs += -lcap
 
 .PHONY: all clean
 
diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c
index 94e399e6b146..1582540f1a89 100644
--- a/samples/landlock/sandboxer.c
+++ b/samples/landlock/sandboxer.c
@@ -14,15 +14,17 @@
 #include <fcntl.h>
 #include <linux/landlock.h>
 #include <linux/socket.h>
+#include <sched.h>
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/capability.h>
 #include <sys/prctl.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <unistd.h>
-#include <stdbool.h>
 
 #if defined(__GLIBC__)
 #include <linux/prctl.h>
@@ -60,6 +62,8 @@ static inline int landlock_restrict_self(const int ruleset_fd,
 #define ENV_FS_RW_NAME "LL_FS_RW"
 #define ENV_TCP_BIND_NAME "LL_TCP_BIND"
 #define ENV_TCP_CONNECT_NAME "LL_TCP_CONNECT"
+#define ENV_CAP_NAME "LL_CAP"
+#define ENV_NS_NAME "LL_NS"
 #define ENV_SCOPED_NAME "LL_SCOPED"
 #define ENV_FORCE_LOG_NAME "LL_FORCE_LOG"
 #define ENV_UDP_BIND_NAME "LL_UDP_BIND"
@@ -229,6 +233,117 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd,
 	return ret;
 }
 
+static __u64 str2ns(const char *const name)
+{
+	static const struct {
+		const char *name;
+		__u64 value;
+	} ns_map[] = {
+		/* clang-format off */
+		{ "cgroup",	CLONE_NEWCGROUP },
+		{ "ipc",	CLONE_NEWIPC },
+		{ "mnt",	CLONE_NEWNS },
+		{ "net",	CLONE_NEWNET },
+		{ "pid",	CLONE_NEWPID },
+		{ "time",	CLONE_NEWTIME },
+		{ "user",	CLONE_NEWUSER },
+		{ "uts",	CLONE_NEWUTS },
+		/* clang-format on */
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(ns_map) / sizeof(ns_map[0]); i++) {
+		if (strcmp(name, ns_map[i].name) == 0)
+			return ns_map[i].value;
+	}
+	return 0;
+}
+
+static int populate_ruleset_cap(const char *const env_var, const int ruleset_fd)
+{
+	int ret = 1;
+	char *env_cap_name, *env_cap_name_next, *strcap;
+	struct landlock_capability_attr cap_attr = {
+		.allowed_perm = LANDLOCK_PERM_CAPABILITY_USE,
+	};
+
+	env_cap_name = getenv(env_var);
+	if (!env_cap_name)
+		return 0;
+	env_cap_name = strdup(env_cap_name);
+	unsetenv(env_var);
+
+	env_cap_name_next = env_cap_name;
+	while ((strcap = strsep(&env_cap_name_next, ENV_DELIMITER))) {
+		cap_value_t cap;
+
+		if (strcmp(strcap, "") == 0)
+			continue;
+
+		if (cap_from_name(strcap, &cap)) {
+			fprintf(stderr, "Failed to parse capability \"%s\"\n",
+				strcap);
+			goto out_free_name;
+		}
+		cap_attr.capabilities = 1ULL << cap;
+		if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_CAPABILITY,
+				      &cap_attr, 0)) {
+			fprintf(stderr,
+				"Failed to update the ruleset with capability \"%s\": %s\n",
+				strcap, strerror(errno));
+			goto out_free_name;
+		}
+	}
+	ret = 0;
+
+out_free_name:
+	free(env_cap_name);
+	return ret;
+}
+
+static int populate_ruleset_ns(const char *const env_var, const int ruleset_fd)
+{
+	int ret = 1;
+	char *env_ns_name, *env_ns_name_next, *strns;
+	struct landlock_namespace_attr ns_attr = {
+		.allowed_perm = LANDLOCK_PERM_NAMESPACE_USE,
+	};
+
+	env_ns_name = getenv(env_var);
+	if (!env_ns_name)
+		return 0;
+	env_ns_name = strdup(env_ns_name);
+	unsetenv(env_var);
+
+	env_ns_name_next = env_ns_name;
+	while ((strns = strsep(&env_ns_name_next, ENV_DELIMITER))) {
+		__u64 ns_type;
+
+		if (strcmp(strns, "") == 0)
+			continue;
+
+		ns_type = str2ns(strns);
+		if (!ns_type) {
+			fprintf(stderr, "Unknown namespace type \"%s\"\n",
+				strns);
+			goto out_free_name;
+		}
+		ns_attr.namespace_types = ns_type;
+		if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NAMESPACE,
+				      &ns_attr, 0)) {
+			fprintf(stderr,
+				"Failed to update the ruleset with namespace \"%s\": %s\n",
+				strns, strerror(errno));
+			goto out_free_name;
+		}
+	}
+	ret = 0;
+
+out_free_name:
+	free(env_ns_name);
+	return ret;
+}
+
 /* Returns true on error, false otherwise. */
 static bool check_ruleset_scope(const char *const env_var,
 				struct landlock_ruleset_attr *ruleset_attr)
@@ -330,6 +445,10 @@ static const char help[] =
 	"prepare to receive on port / client: set as source port)\n"
 	"* " ENV_UDP_CONNECT_SEND_NAME ": remote UDP ports allowed to connect "
 	"or sendmsg (client: use as destination port / server: receive only from it)\n"
+	"* " ENV_CAP_NAME ": capabilities allowed to use, as names "
+	"or numbers (e.g. cap_net_bind_service, cap_sys_admin, 18)\n"
+	"* " ENV_NS_NAME ": namespace types allowed to use "
+	"(cgroup, ipc, mnt, net, pid, time, user, uts)\n"
 	"* " ENV_SCOPED_NAME ": actions denied on the outside of the landlock domain\n"
 	"  - \"a\" to restrict opening abstract unix sockets\n"
 	"  - \"s\" to restrict sending signals\n"
@@ -343,6 +462,8 @@ static const char help[] =
 	ENV_TCP_BIND_NAME "=\"9418\" "
 	ENV_TCP_CONNECT_NAME "=\"80:443\" "
 	ENV_UDP_CONNECT_SEND_NAME "=\"53\" "
+	ENV_CAP_NAME "=\"cap_sys_admin\" "
+	ENV_NS_NAME "=\"user:uts:net\" "
 	ENV_SCOPED_NAME "=\"a:s\" "
 	"%1$s bash -i\n"
 	"\n"
@@ -368,6 +489,8 @@ int main(const int argc, char *const argv[], char *const *const envp)
 				      LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP,
 		.scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET |
 			  LANDLOCK_SCOPE_SIGNAL,
+		.handled_perm = LANDLOCK_PERM_CAPABILITY_USE |
+				LANDLOCK_PERM_NAMESPACE_USE,
 	};
 	int supported_restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON;
 	int set_restrict_flags = 0;
@@ -455,11 +578,12 @@ int main(const int argc, char *const argv[], char *const *const envp)
 			~LANDLOCK_ACCESS_FS_RESOLVE_UNIX;
 		__attribute__((fallthrough));
 	case 9:
-		/* Removes UDP support for ABI < 10 */
+		/* Removes UDP support and LANDLOCK_PERM_* for ABI < 10 */
 		ruleset_attr.handled_access_net &=
 			~(LANDLOCK_ACCESS_NET_BIND_UDP |
 			  LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP);
-
+		ruleset_attr.handled_perm &= ~(LANDLOCK_PERM_NAMESPACE_USE |
+					       LANDLOCK_PERM_CAPABILITY_USE);
 		/* Must be printed for any ABI < LANDLOCK_ABI_LAST. */
 		fprintf(stderr,
 			"Hint: You should update the running kernel "
@@ -504,6 +628,14 @@ int main(const int argc, char *const argv[], char *const *const envp)
 			~LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP;
 	}
 
+	/* Removes capability handling if not set by a user. */
+	if (!getenv(ENV_CAP_NAME))
+		ruleset_attr.handled_perm &= ~LANDLOCK_PERM_CAPABILITY_USE;
+
+	/* Removes namespace handling if not set by a user. */
+	if (!getenv(ENV_NS_NAME))
+		ruleset_attr.handled_perm &= ~LANDLOCK_PERM_NAMESPACE_USE;
+
 	if (check_ruleset_scope(ENV_SCOPED_NAME, &ruleset_attr))
 		return 1;
 
@@ -556,6 +688,12 @@ int main(const int argc, char *const argv[], char *const *const envp)
 		goto err_close_ruleset;
 	}
 
+	if (populate_ruleset_cap(ENV_CAP_NAME, ruleset_fd))
+		goto err_close_ruleset;
+
+	if (populate_ruleset_ns(ENV_NS_NAME, ruleset_fd))
+		goto err_close_ruleset;
+
 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
 		perror("Failed to restrict privileges");
 		goto err_close_ruleset;
-- 
2.54.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox