Live Patching
 help / color / mirror / Atom feed
* [PATCH v6 1/9] sframe: Allow kernelspace sframe sections
From: Dylan Hatch @ 2026-05-19  6:49 UTC (permalink / raw)
  To: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Mark Rutland, Jens Remus
  Cc: Dylan Hatch, Prasanna Kumar T S M, Puranjay Mohan, Song Liu,
	joe.lawrence, linux-toolchains, linux-kernel, live-patching,
	linux-arm-kernel, Randy Dunlap, Mostafa Saleh, Herbert Xu,
	David S. Miller
In-Reply-To: <20260519064950.493949-1-dylanbhatch@google.com>

Generalize the sframe lookup code to support kernelspace sections. This
is done by defining an UNWIND_SFRAME_LOOKUP option that can be activated
separately from HAVE_UNWIND_USER_SFRAME, as there will be other clients
of this library than just userspace unwind.

Sframe section location is now tracked in a separate sec_type field to
determine whether user-access functions are necessary to read the sframe
data. Relevant type declarations are moved and renamed to reflect the
non-user sframe support.

Reviewed-by: Jens Remus <jremus@linux.ibm.com>
Signed-off-by: Dylan Hatch <dylanbhatch@google.com>
---
 MAINTAINERS                                   |   2 +-
 arch/Kconfig                                  |   4 +
 .../{unwind_user_sframe.h => unwind_sframe.h} |   6 +-
 arch/x86/include/asm/unwind_user.h            |  12 +-
 include/linux/sframe.h                        |  48 ++--
 include/linux/unwind_types.h                  |  46 +++
 include/linux/unwind_user_types.h             |  41 ---
 kernel/unwind/Makefile                        |   2 +-
 kernel/unwind/sframe.c                        | 270 ++++++++++++------
 kernel/unwind/user.c                          |  45 +--
 10 files changed, 295 insertions(+), 181 deletions(-)
 rename arch/x86/include/asm/{unwind_user_sframe.h => unwind_sframe.h} (50%)
 create mode 100644 include/linux/unwind_types.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 6812f581d44b..54613c683fdb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27858,7 +27858,7 @@ F:	Documentation/driver-api/uio-howto.rst
 F:	drivers/uio/
 F:	include/linux/uio_driver.h
 
-USERSPACE STACK UNWINDING
+STACK UNWINDING
 M:	Josh Poimboeuf <jpoimboe@kernel.org>
 M:	Steven Rostedt <rostedt@goodmis.org>
 S:	Maintained
diff --git a/arch/Kconfig b/arch/Kconfig
index 78dad97bf2a4..6eeafd86347b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -486,6 +486,9 @@ config AS_SFRAME3
 	def_bool $(as-instr,.cfi_startproc\n.cfi_endproc,-Wa$(comma)--gsframe-3)
 	select AS_SFRAME
 
+config UNWIND_SFRAME_LOOKUP
+	bool
+
 config UNWIND_USER
 	bool
 
@@ -496,6 +499,7 @@ config HAVE_UNWIND_USER_FP
 config HAVE_UNWIND_USER_SFRAME
 	bool
 	select UNWIND_USER
+	select UNWIND_SFRAME_LOOKUP
 
 config SFRAME_VALIDATION
 	bool "Enable .sframe section debugging"
diff --git a/arch/x86/include/asm/unwind_user_sframe.h b/arch/x86/include/asm/unwind_sframe.h
similarity index 50%
rename from arch/x86/include/asm/unwind_user_sframe.h
rename to arch/x86/include/asm/unwind_sframe.h
index d828ae1a4aac..44d42e6ffde4 100644
--- a/arch/x86/include/asm/unwind_user_sframe.h
+++ b/arch/x86/include/asm/unwind_sframe.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_UNWIND_USER_SFRAME_H
-#define _ASM_X86_UNWIND_USER_SFRAME_H
+#ifndef _ASM_X86_UNWIND_SFRAME_H
+#define _ASM_X86_UNWIND_SFRAME_H
 
 #ifdef CONFIG_X86_64
 
@@ -9,4 +9,4 @@
 
 #endif
 
-#endif /* _ASM_X86_UNWIND_USER_SFRAME_H */
+#endif /* _ASM_X86_UNWIND_SFRAME_H */
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
index b80f0ec0f7a7..1c7e31ca5d8e 100644
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -54,30 +54,30 @@ static inline int unwind_user_get_reg(unsigned long *val, unsigned int regnum)
 
 #define ARCH_INIT_USER_FP_FRAME(ws)			\
 	.cfa		= {				\
-		.rule		= UNWIND_USER_CFA_RULE_FP_OFFSET,\
+		.rule		= UNWIND_CFA_RULE_FP_OFFSET,\
 		.offset		=  2*(ws),		\
 			},				\
 	.ra		= {				\
-		.rule		= UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
+		.rule		= UNWIND_RULE_CFA_OFFSET_DEREF,\
 		.offset		= -1*(ws),		\
 			},				\
 	.fp		= {				\
-		.rule		= UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
+		.rule		= UNWIND_RULE_CFA_OFFSET_DEREF,\
 		.offset		= -2*(ws),		\
 			},				\
 	.outermost	= false,
 
 #define ARCH_INIT_USER_FP_ENTRY_FRAME(ws)		\
 	.cfa		= {				\
-		.rule		= UNWIND_USER_CFA_RULE_SP_OFFSET,\
+		.rule		= UNWIND_CFA_RULE_SP_OFFSET,\
 		.offset		=  1*(ws),		\
 			},				\
 	.ra		= {				\
-		.rule		= UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
+		.rule		= UNWIND_RULE_CFA_OFFSET_DEREF,\
 		.offset		= -1*(ws),		\
 			},				\
 	.fp		= {				\
-		.rule		= UNWIND_USER_RULE_RETAIN,\
+		.rule		= UNWIND_RULE_RETAIN,\
 			},				\
 	.outermost	= false,
 
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
index b79c5ec09229..0cb2924367bc 100644
--- a/include/linux/sframe.h
+++ b/include/linux/sframe.h
@@ -3,37 +3,46 @@
 #define _LINUX_SFRAME_H
 
 #include <linux/mm_types.h>
+#include <linux/unwind_types.h>
 #include <linux/srcu.h>
-#include <linux/unwind_user_types.h>
 
-#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+#ifdef CONFIG_UNWIND_SFRAME_LOOKUP
+
+enum sframe_sec_type {
+	SFRAME_KERNEL,
+	SFRAME_USER,
+};
 
 struct sframe_section {
-	struct rcu_head	rcu;
+	struct rcu_head  rcu;
 #ifdef CONFIG_DYNAMIC_DEBUG
-	const char	*filename;
+	const char		*filename;
 #endif
-	unsigned long	sframe_start;
-	unsigned long	sframe_end;
-	unsigned long	text_start;
-	unsigned long	text_end;
-
-	unsigned long	fdes_start;
-	unsigned long	fres_start;
-	unsigned long	fres_end;
-	unsigned int	num_fdes;
-
-	signed char	ra_off;
-	signed char	fp_off;
+	enum sframe_sec_type	sec_type;
+	unsigned long		sframe_start;
+	unsigned long		sframe_end;
+	unsigned long		text_start;
+	unsigned long		text_end;
+
+	unsigned long		fdes_start;
+	unsigned long		fres_start;
+	unsigned long		fres_end;
+	unsigned int		num_fdes;
+
+	signed char		ra_off;
+	signed char		fp_off;
 };
 
+#endif /* CONFIG_UNWIND_SFRAME_LOOKUP */
+
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+
 #define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
 extern void sframe_free_mm(struct mm_struct *mm);
 
 extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
 			      unsigned long text_start, unsigned long text_end);
 extern int sframe_remove_section(unsigned long sframe_addr);
-extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame);
 
 static inline bool current_has_sframe(void)
 {
@@ -42,6 +51,8 @@ static inline bool current_has_sframe(void)
 	return mm && !mtree_empty(&mm->sframe_mt);
 }
 
+extern int sframe_find_user(unsigned long ip, struct unwind_frame *frame);
+
 #else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
 
 #define INIT_MM_SFRAME
@@ -52,9 +63,10 @@ static inline int sframe_add_section(unsigned long sframe_start, unsigned long s
 	return -ENOSYS;
 }
 static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
-static inline int sframe_find(unsigned long ip, struct unwind_user_frame *frame) { return -ENOSYS; }
 static inline bool current_has_sframe(void) { return false; }
 
+static inline int sframe_find_user(unsigned long ip, struct unwind_frame *frame) { return -ENOSYS; }
+
 #endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
 
 #endif /* _LINUX_SFRAME_H */
diff --git a/include/linux/unwind_types.h b/include/linux/unwind_types.h
new file mode 100644
index 000000000000..08bcb0aa04aa
--- /dev/null
+++ b/include/linux/unwind_types.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_TYPES_H
+#define _LINUX_UNWIND_TYPES_H
+
+#define UNWIND_RULE_DEREF			BIT(31)
+
+enum unwind_cfa_rule {
+	UNWIND_CFA_RULE_SP_OFFSET,		/* CFA = SP + offset */
+	UNWIND_CFA_RULE_FP_OFFSET,		/* CFA = FP + offset */
+	UNWIND_CFA_RULE_REG_OFFSET,	/* CFA = reg + offset */
+	/* DEREF variants */
+	UNWIND_CFA_RULE_REG_OFFSET_DEREF =	/* CFA = *(reg + offset) */
+		UNWIND_CFA_RULE_REG_OFFSET | UNWIND_RULE_DEREF,
+};
+
+struct unwind_cfa_rule_data {
+	enum unwind_cfa_rule rule;
+	s32 offset;
+	unsigned int regnum;
+};
+
+enum unwind_rule {
+	UNWIND_RULE_RETAIN,		/* entity = entity */
+	UNWIND_RULE_CFA_OFFSET,		/* entity = CFA + offset */
+	UNWIND_RULE_REG_OFFSET,		/* entity = register + offset */
+	/* DEREF variants */
+	UNWIND_RULE_CFA_OFFSET_DEREF =	/* entity = *(CFA + offset) */
+		UNWIND_RULE_CFA_OFFSET | UNWIND_RULE_DEREF,
+	UNWIND_RULE_REG_OFFSET_DEREF =	/* entity = *(register + offset) */
+		UNWIND_RULE_REG_OFFSET | UNWIND_RULE_DEREF,
+};
+
+struct unwind_rule_data {
+	enum unwind_rule rule;
+	s32 offset;
+	unsigned int regnum;
+};
+
+struct unwind_frame {
+	struct unwind_cfa_rule_data cfa;
+	struct unwind_rule_data ra;
+	struct unwind_rule_data fp;
+	bool outermost;
+};
+
+#endif /* _LINUX_UNWIND_TYPES_H */
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index 059e5c76f2f3..646e5fb774db 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -27,47 +27,6 @@ struct unwind_stacktrace {
 	unsigned long	*entries;
 };
 
-#define UNWIND_USER_RULE_DEREF			BIT(31)
-
-enum unwind_user_cfa_rule {
-	UNWIND_USER_CFA_RULE_SP_OFFSET,		/* CFA = SP + offset */
-	UNWIND_USER_CFA_RULE_FP_OFFSET,		/* CFA = FP + offset */
-	UNWIND_USER_CFA_RULE_REG_OFFSET,	/* CFA = reg + offset */
-	/* DEREF variants */
-	UNWIND_USER_CFA_RULE_REG_OFFSET_DEREF =	/* CFA = *(reg + offset) */
-		UNWIND_USER_CFA_RULE_REG_OFFSET | UNWIND_USER_RULE_DEREF,
-};
-
-struct unwind_user_cfa_rule_data {
-	enum unwind_user_cfa_rule rule;
-	s32 offset;
-	unsigned int regnum;
-};
-
-enum unwind_user_rule {
-	UNWIND_USER_RULE_RETAIN,		/* entity = entity */
-	UNWIND_USER_RULE_CFA_OFFSET,		/* entity = CFA + offset */
-	UNWIND_USER_RULE_REG_OFFSET,		/* entity = register + offset */
-	/* DEREF variants */
-	UNWIND_USER_RULE_CFA_OFFSET_DEREF =	/* entity = *(CFA + offset) */
-		UNWIND_USER_RULE_CFA_OFFSET | UNWIND_USER_RULE_DEREF,
-	UNWIND_USER_RULE_REG_OFFSET_DEREF =	/* entity = *(register + offset) */
-		UNWIND_USER_RULE_REG_OFFSET | UNWIND_USER_RULE_DEREF,
-};
-
-struct unwind_user_rule_data {
-	enum unwind_user_rule rule;
-	s32 offset;
-	unsigned int regnum;
-};
-
-struct unwind_user_frame {
-	struct unwind_user_cfa_rule_data cfa;
-	struct unwind_user_rule_data ra;
-	struct unwind_user_rule_data fp;
-	bool outermost;
-};
-
 struct unwind_user_state {
 	unsigned long				ip;
 	unsigned long				sp;
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 146038165865..c5f9f8124564 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1,2 +1,2 @@
  obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o
- obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME)	+= sframe.o
+ obj-$(CONFIG_UNWIND_SFRAME_LOOKUP)	+= sframe.o
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 5400f481b05d..a2ab9a3e07b4 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -13,8 +13,8 @@
 #include <linux/string_helpers.h>
 #include <linux/sframe.h>
 #include <linux/syscalls.h>
-#include <asm/unwind_user_sframe.h>
-#include <linux/unwind_user_types.h>
+#include <linux/unwind_types.h>
+#include <asm/unwind_sframe.h>
 #include <uapi/linux/stacktrace.h>
 
 #include "sframe.h"
@@ -46,8 +46,6 @@ struct sframe_fre_internal {
 	unsigned char	dw_size;
 };
 
-DEFINE_STATIC_SRCU(sframe_srcu);
-
 static __always_inline unsigned char fre_type_to_size(unsigned char fre_type)
 {
 	if (fre_type > 2)
@@ -62,6 +60,77 @@ static __always_inline unsigned char dataword_size_enum_to_size(unsigned char da
 	return 1 << dataword_size;
 }
 
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+
+DEFINE_STATIC_SRCU(sframe_srcu);
+
+#define UNSAFE_USER_COPY(to, from, size, label)				\
+	unsafe_copy_from_user(to, (void __user *)from, size, label)
+
+#define UNSAFE_USER_GET(to, from, type, label)				\
+	unsafe_get_user(to, (type __user *)from, label)
+
+#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+#define UNSAFE_USER_COPY(to, from, size, label) do {			\
+	(void)to; (void)from; (void)size;				\
+	goto label;							\
+} while (0)
+
+#define UNSAFE_USER_GET(to, from, type, label) do {			\
+	(void)to; (void)from;						\
+	goto label;							\
+} while (0)
+
+#endif /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+#ifdef CONFIG_HAVE_UNWIND_KERNEL_SFRAME
+
+#define KERNEL_COPY(to, from, size, label) memcpy(to, (void *)from, size)
+#define KERNEL_GET(to, from, type, label) ({ (to) = *(type *)(from); })
+
+#else /* !CONFIG_HAVE_UNWIND_KERNEL_SFRAME */
+
+#define KERNEL_COPY(to, from, size, label) do {				\
+	(void)(to); (void)(from); (void)size;				\
+	goto label;							\
+} while (0)
+
+#define KERNEL_GET(to, from, type, label) do {				\
+	(void)(to); (void)(from);					\
+	goto label;							\
+} while (0)
+
+#endif /* !CONFIG_HAVE_UNWIND_KERNEL_SFRAME */
+
+#define DATA_COPY(sec, to, from, size, label)			\
+({								\
+	switch (sec->sec_type) {				\
+	case SFRAME_KERNEL:					\
+		KERNEL_COPY(to, from, size, label);		\
+		break;						\
+	case SFRAME_USER:					\
+		UNSAFE_USER_COPY(to, from, size, label);	\
+		break;						\
+	default:						\
+		goto label;					\
+	}							\
+})
+
+#define DATA_GET(sec, to, from, type, label)			\
+({								\
+	switch (sec->sec_type) {				\
+	case SFRAME_KERNEL:					\
+		KERNEL_GET(to, from, type, label);		\
+		break;						\
+	case SFRAME_USER:					\
+		UNSAFE_USER_GET(to, from, type, label);		\
+		break;						\
+	default:						\
+		goto label;					\
+	}							\
+})
+
 static __always_inline int __read_fde(struct sframe_section *sec,
 				      unsigned int fde_num,
 				      struct sframe_fde_internal *fde)
@@ -71,8 +140,8 @@ static __always_inline int __read_fde(struct sframe_section *sec,
 	struct sframe_fda_v3 _fda;
 
 	fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde_v3));
-	unsafe_copy_from_user(&_fde, (void __user *)fde_addr,
-			      sizeof(struct sframe_fde_v3), Efault);
+	DATA_COPY(sec, &_fde, fde_addr,
+		  sizeof(struct sframe_fde_v3), Efault);
 
 	func_addr = fde_addr + _fde.func_start_off;
 	if (func_addr < sec->text_start || func_addr >= sec->text_end)
@@ -81,8 +150,8 @@ static __always_inline int __read_fde(struct sframe_section *sec,
 	fda_addr = sec->fres_start + _fde.fres_off;
 	if (fda_addr + sizeof(struct sframe_fda_v3) > sec->fres_end)
 		return -EINVAL;
-	unsafe_copy_from_user(&_fda, (void __user *)fda_addr,
-			      sizeof(struct sframe_fda_v3), Efault);
+	DATA_COPY(sec, &_fda, fda_addr,
+		  sizeof(struct sframe_fda_v3), Efault);
 
 	fde->func_addr	= func_addr;
 	fde->func_size	= _fde.func_size;
@@ -104,21 +173,21 @@ static __always_inline int __find_fde(struct sframe_section *sec,
 				      struct sframe_fde_internal *fde)
 {
 	unsigned long func_addr_low = 0, func_addr_high = ULONG_MAX;
-	struct sframe_fde_v3 __user *first, *low, *high, *found = NULL;
+	struct sframe_fde_v3 *first, *low, *high, *found = NULL;
 	int ret;
 
-	first = (void __user *)sec->fdes_start;
+	first = (void *)sec->fdes_start;
 	low = first;
 	high = first + sec->num_fdes - 1;
 
 	while (low <= high) {
-		struct sframe_fde_v3 __user *mid;
+		struct sframe_fde_v3 *mid;
 		s64 func_off;
 		unsigned long func_addr;
 
 		mid = low + ((high - low) / 2);
 
-		unsafe_get_user(func_off, (s64 __user *)mid, Efault);
+		DATA_GET(sec, func_off, mid, s64, Efault);
 		func_addr = (unsigned long)mid + func_off;
 
 		if (ip >= func_addr) {
@@ -156,47 +225,47 @@ static __always_inline int __find_fde(struct sframe_section *sec,
 	return -EFAULT;
 }
 
-#define ____UNSAFE_GET_USER_INC(to, from, type, label)			\
+#define ____GET_INC(sec, to, from, type, label)				\
 ({									\
 	type __to;							\
-	unsafe_get_user(__to, (type __user *)from, label);		\
+	DATA_GET(sec, __to, from, type, label);				\
 	from += sizeof(__to);						\
 	to = __to;							\
 })
 
-#define __UNSAFE_GET_USER_INC(to, from, size, label, u_or_s)		\
+#define __GET_INC(sec, to, from, size, label, u_or_s)			\
 ({									\
 	switch (size) {							\
 	case 1:								\
-		____UNSAFE_GET_USER_INC(to, from, u_or_s##8, label);	\
+		____GET_INC(sec, to, from, u_or_s##8, label);		\
 		break;							\
 	case 2:								\
-		____UNSAFE_GET_USER_INC(to, from, u_or_s##16, label);	\
+		____GET_INC(sec, to, from, u_or_s##16, label);		\
 		break;							\
 	case 4:								\
-		____UNSAFE_GET_USER_INC(to, from, u_or_s##32, label);	\
+		____GET_INC(sec, to, from, u_or_s##32, label);		\
 		break;							\
 	default:							\
 		return -EFAULT;						\
 	}								\
 })
 
-#define UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label)		\
-	__UNSAFE_GET_USER_INC(to, from, size, label, u)
+#define GET_UNSIGNED_INC(sec, to, from, size, label)			\
+	__GET_INC(sec, to, from, size, label, u)
 
-#define UNSAFE_GET_USER_SIGNED_INC(to, from, size, label)		\
-	__UNSAFE_GET_USER_INC(to, from, size, label, s)
+#define GET_SIGNED_INC(sec, to, from, size, label)			\
+	__GET_INC(sec, to, from, size, label, s)
 
-#define UNSAFE_GET_USER_INC(to, from, size, label)				\
-	_Generic(to,								\
-		 u8 :	UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label),	\
-		 u16 :	UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label),	\
-		 u32 :	UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label),	\
-		 u64 :	UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label),	\
-		 s8 :	UNSAFE_GET_USER_SIGNED_INC(to, from, size, label),	\
-		 s16 :	UNSAFE_GET_USER_SIGNED_INC(to, from, size, label),	\
-		 s32 :	UNSAFE_GET_USER_SIGNED_INC(to, from, size, label),	\
-		 s64 :	UNSAFE_GET_USER_SIGNED_INC(to, from, size, label))
+#define GET_INC(sec, to, from, size, label)				\
+	_Generic(to,							\
+		 u8 :	GET_UNSIGNED_INC(sec, to, from, size, label),	\
+		 u16 :	GET_UNSIGNED_INC(sec, to, from, size, label),	\
+		 u32 :	GET_UNSIGNED_INC(sec, to, from, size, label),	\
+		 u64 :	GET_UNSIGNED_INC(sec, to, from, size, label),	\
+		 s8 :	GET_SIGNED_INC(sec, to, from, size, label),	\
+		 s16 :	GET_SIGNED_INC(sec, to, from, size, label),	\
+		 s32 :	GET_SIGNED_INC(sec, to, from, size, label),	\
+		 s64 :	GET_SIGNED_INC(sec, to, from, size, label))
 
 static __always_inline int
 __read_default_fre_datawords(struct sframe_section *sec,
@@ -209,19 +278,19 @@ __read_default_fre_datawords(struct sframe_section *sec,
 	s32 cfa_off, ra_off, fp_off;
 	unsigned int cfa_regnum;
 
-	UNSAFE_GET_USER_INC(cfa_off, cur, dataword_size, Efault);
+	GET_INC(sec, cfa_off, cur, dataword_size, Efault);
 	dataword_count--;
 
 	ra_off = sec->ra_off;
 	if (!ra_off && dataword_count) {
 		dataword_count--;
-		UNSAFE_GET_USER_INC(ra_off, cur, dataword_size, Efault);
+		GET_INC(sec, ra_off, cur, dataword_size, Efault);
 	}
 
 	fp_off = sec->fp_off;
 	if (!fp_off && dataword_count) {
 		dataword_count--;
-		UNSAFE_GET_USER_INC(fp_off, cur, dataword_size, Efault);
+		GET_INC(sec, fp_off, cur, dataword_size, Efault);
 	}
 
 	if (dataword_count)
@@ -257,17 +326,17 @@ __read_flex_fde_fre_datawords(struct sframe_section *sec,
 
 	if (dataword_count < 2)
 		return -EFAULT;
-	UNSAFE_GET_USER_INC(cfa_ctl, cur, dataword_size, Efault);
-	UNSAFE_GET_USER_INC(cfa_off, cur, dataword_size, Efault);
+	GET_INC(sec, cfa_ctl, cur, dataword_size, Efault);
+	GET_INC(sec, cfa_off, cur, dataword_size, Efault);
 	dataword_count -= 2;
 
 	ra_off = sec->ra_off;
 	ra_ctl = ra_off ? 2 : 0; /* regnum=0, deref_p=(ra_off != 0), reg_p=0 */
 	if (dataword_count >= 2) {
-		UNSAFE_GET_USER_INC(ra_ctl, cur, dataword_size, Efault);
+		GET_INC(sec, ra_ctl, cur, dataword_size, Efault);
 		dataword_count--;
 		if (ra_ctl) {
-			UNSAFE_GET_USER_INC(ra_off, cur, dataword_size, Efault);
+			GET_INC(sec, ra_off, cur, dataword_size, Efault);
 			dataword_count--;
 		} else {
 			/* Padding RA location info */
@@ -278,10 +347,10 @@ __read_flex_fde_fre_datawords(struct sframe_section *sec,
 	fp_off = sec->fp_off;
 	fp_ctl = fp_off ? 2 : 0; /* regnum=0, deref_p=(fp_off != 0), reg_p=0 */
 	if (dataword_count >= 2) {
-		UNSAFE_GET_USER_INC(fp_ctl, cur, dataword_size, Efault);
+		GET_INC(sec, fp_ctl, cur, dataword_size, Efault);
 		dataword_count--;
 		if (fp_ctl) {
-			UNSAFE_GET_USER_INC(fp_off, cur, dataword_size, Efault);
+			GET_INC(sec, fp_off, cur, dataword_size, Efault);
 			dataword_count--;
 		} else {
 			/* Padding FP location info */
@@ -355,11 +424,11 @@ static __always_inline int __read_fre(struct sframe_section *sec,
 	if (fre_addr + addr_size + 1 > sec->fres_end)
 		return -EFAULT;
 
-	UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault);
+	GET_INC(sec, ip_off, cur, addr_size, Efault);
 	if (fde_pctype == SFRAME_FDE_PCTYPE_INC && ip_off > fde->func_size)
 		return -EFAULT;
 
-	UNSAFE_GET_USER_INC(info, cur, 1, Efault);
+	GET_INC(sec, info, cur, 1, Efault);
 	dataword_count = SFRAME_V3_FRE_DATAWORD_COUNT(info);
 	dataword_size  = dataword_size_enum_to_size(SFRAME_V3_FRE_DATAWORD_SIZE(info));
 	if (!dataword_size)
@@ -382,7 +451,7 @@ static __always_inline int __read_fre(struct sframe_section *sec,
 }
 
 static __always_inline int
-sframe_init_cfa_rule_data(struct unwind_user_cfa_rule_data *cfa_rule_data,
+sframe_init_cfa_rule_data(struct unwind_cfa_rule_data *cfa_rule_data,
 			  u32 ctlword, s32 offset)
 {
 	bool deref_p = SFRAME_V3_FLEX_FDE_CTRLWORD_DEREF_P(ctlword);
@@ -393,13 +462,13 @@ sframe_init_cfa_rule_data(struct unwind_user_cfa_rule_data *cfa_rule_data,
 
 		switch (regnum) {
 		case SFRAME_REG_SP:
-			cfa_rule_data->rule = UNWIND_USER_CFA_RULE_SP_OFFSET;
+			cfa_rule_data->rule = UNWIND_CFA_RULE_SP_OFFSET;
 			break;
 		case SFRAME_REG_FP:
-			cfa_rule_data->rule = UNWIND_USER_CFA_RULE_FP_OFFSET;
+			cfa_rule_data->rule = UNWIND_CFA_RULE_FP_OFFSET;
 			break;
 		default:
-			cfa_rule_data->rule = UNWIND_USER_CFA_RULE_REG_OFFSET;
+			cfa_rule_data->rule = UNWIND_CFA_RULE_REG_OFFSET;
 			cfa_rule_data->regnum = regnum;
 		}
 	} else {
@@ -407,7 +476,7 @@ sframe_init_cfa_rule_data(struct unwind_user_cfa_rule_data *cfa_rule_data,
 	}
 
 	if (deref_p)
-		cfa_rule_data->rule |= UNWIND_USER_RULE_DEREF;
+		cfa_rule_data->rule |= UNWIND_RULE_DEREF;
 
 	cfa_rule_data->offset = offset;
 
@@ -415,27 +484,27 @@ sframe_init_cfa_rule_data(struct unwind_user_cfa_rule_data *cfa_rule_data,
 }
 
 static __always_inline void
-sframe_init_rule_data(struct unwind_user_rule_data *rule_data,
+sframe_init_rule_data(struct unwind_rule_data *rule_data,
 		      u32 ctlword, s32 offset)
 {
 	bool deref_p = SFRAME_V3_FLEX_FDE_CTRLWORD_DEREF_P(ctlword);
 	bool reg_p = SFRAME_V3_FLEX_FDE_CTRLWORD_REG_P(ctlword);
 
 	if (!ctlword && !offset) {
-		rule_data->rule = UNWIND_USER_RULE_RETAIN;
+		rule_data->rule = UNWIND_RULE_RETAIN;
 		return;
 	}
 	if (reg_p) {
 		unsigned int regnum = SFRAME_V3_FLEX_FDE_CTRLWORD_REGNUM(ctlword);
 
-		rule_data->rule = UNWIND_USER_RULE_REG_OFFSET;
+		rule_data->rule = UNWIND_RULE_REG_OFFSET;
 		rule_data->regnum = regnum;
 	} else {
-		rule_data->rule = UNWIND_USER_RULE_CFA_OFFSET;
+		rule_data->rule = UNWIND_RULE_CFA_OFFSET;
 	}
 
 	if (deref_p)
-		rule_data->rule |= UNWIND_USER_RULE_DEREF;
+		rule_data->rule |= UNWIND_RULE_DEREF;
 
 	rule_data->offset = offset;
 }
@@ -443,7 +512,7 @@ sframe_init_rule_data(struct unwind_user_rule_data *rule_data,
 static __always_inline int __find_fre(struct sframe_section *sec,
 				      struct sframe_fde_internal *fde,
 				      unsigned long ip,
-				      struct unwind_user_frame *frame)
+				      struct unwind_frame *frame)
 {
 	unsigned char fde_pctype = SFRAME_V3_FDE_PCTYPE(fde->info);
 	struct sframe_fre_internal *fre, *prev_fre = NULL;
@@ -503,40 +572,18 @@ static __always_inline int __find_fre(struct sframe_section *sec,
 	return 0;
 }
 
-int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
+static __always_inline int __sframe_find(struct sframe_section *sec,
+					 unsigned long ip,
+					 struct unwind_frame *frame)
 {
-	struct mm_struct *mm = current->mm;
-	struct sframe_section *sec;
 	struct sframe_fde_internal fde;
 	int ret;
 
-	if (!mm)
-		return -EINVAL;
-
-	guard(srcu)(&sframe_srcu);
-
-	sec = mtree_load(&mm->sframe_mt, ip);
-	if (!sec)
-		return -EINVAL;
-
-	if (!user_read_access_begin((void __user *)sec->sframe_start,
-				    sec->sframe_end - sec->sframe_start))
-		return -EFAULT;
-
 	ret = __find_fde(sec, ip, &fde);
 	if (ret)
-		goto end;
-
-	ret = __find_fre(sec, &fde, ip, frame);
-end:
-	user_read_access_end();
-
-	if (ret == -EFAULT) {
-		dbg_sec("removing bad .sframe section\n");
-		WARN_ON_ONCE(sframe_remove_section(sec->sframe_start));
-	}
+		return ret;
 
-	return ret;
+	return __find_fre(sec, &fde, ip, frame);
 }
 
 #ifdef CONFIG_SFRAME_VALIDATION
@@ -661,20 +708,23 @@ static int sframe_validate_section(struct sframe_section *sec) { return 0; }
 #endif /* !CONFIG_SFRAME_VALIDATION */
 
 
-static void free_section(struct sframe_section *sec)
-{
-	dbg_free(sec);
-	kfree(sec);
-}
-
 static int sframe_read_header(struct sframe_section *sec)
 {
 	unsigned long header_end, fdes_start, fdes_end, fres_start, fres_end;
 	struct sframe_header shdr;
 	unsigned int num_fdes;
 
-	if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
-		dbg_sec("header usercopy failed\n");
+	switch (sec->sec_type) {
+	case SFRAME_USER:
+		if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
+			dbg_sec("header usercopy failed\n");
+			return -EFAULT;
+		}
+		break;
+	case SFRAME_KERNEL:
+		shdr = *(struct sframe_header *)sec->sframe_start;
+		break;
+	default:
 		return -EFAULT;
 	}
 
@@ -721,6 +771,45 @@ static int sframe_read_header(struct sframe_section *sec)
 	return 0;
 }
 
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+
+int sframe_find_user(unsigned long ip, struct unwind_frame *frame)
+{
+	struct mm_struct *mm = current->mm;
+	struct sframe_section *sec;
+	int ret;
+
+	if (!mm)
+		return -EINVAL;
+
+	guard(srcu)(&sframe_srcu);
+
+	sec = mtree_load(&mm->sframe_mt, ip);
+	if (!sec)
+		return -EINVAL;
+
+	if (!user_read_access_begin((void __user *)sec->sframe_start,
+				    sec->sframe_end - sec->sframe_start))
+		return -EFAULT;
+
+	ret = __sframe_find(sec, ip, frame);
+
+	user_read_access_end();
+
+	if (ret == -EFAULT) {
+		dbg_sec("removing bad .sframe section\n");
+		WARN_ON_ONCE(sframe_remove_section(sec->sframe_start));
+	}
+
+	return ret;
+}
+
+static void free_section(struct sframe_section *sec)
+{
+	dbg_free(sec);
+	kfree(sec);
+}
+
 int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
 		       unsigned long text_start, unsigned long text_end)
 {
@@ -757,6 +846,7 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
 	if (!sec)
 		return -ENOMEM;
 
+	sec->sec_type		= SFRAME_USER;
 	sec->sframe_start	= sframe_start;
 	sec->sframe_end		= sframe_end;
 	sec->text_start		= text_start;
@@ -877,3 +967,5 @@ SYSCALL_DEFINE5(stacktrace_setup, int, op, unsigned long, addr_start,
 	}
 	return -EINVAL;
 }
+
+#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 3d596da588d0..5670579e3990 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,6 +8,7 @@
 #include <linux/unwind_user.h>
 #include <linux/uaccess.h>
 #include <linux/sframe.h>
+#include <linux/unwind_types.h>
 
 #define for_each_user_frame(state) \
 	for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
@@ -28,7 +29,7 @@ get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws)
 }
 
 static int unwind_user_next_common(struct unwind_user_state *state,
-				   const struct unwind_user_frame *frame)
+				   const struct unwind_frame *frame)
 {
 	unsigned long cfa, fp, ra;
 
@@ -40,16 +41,16 @@ static int unwind_user_next_common(struct unwind_user_state *state,
 
 	/* Get the Canonical Frame Address (CFA) */
 	switch (frame->cfa.rule) {
-	case UNWIND_USER_CFA_RULE_SP_OFFSET:
+	case UNWIND_CFA_RULE_SP_OFFSET:
 		cfa = state->sp;
 		break;
-	case UNWIND_USER_CFA_RULE_FP_OFFSET:
+	case UNWIND_CFA_RULE_FP_OFFSET:
 		if (state->fp < state->sp)
 			return -EINVAL;
 		cfa = state->fp;
 		break;
-	case UNWIND_USER_CFA_RULE_REG_OFFSET:
-	case UNWIND_USER_CFA_RULE_REG_OFFSET_DEREF:
+	case UNWIND_CFA_RULE_REG_OFFSET:
+	case UNWIND_CFA_RULE_REG_OFFSET_DEREF:
 		if (!state->topmost || unwind_user_get_reg(&cfa, frame->cfa.regnum))
 			return -EINVAL;
 		break;
@@ -58,7 +59,7 @@ static int unwind_user_next_common(struct unwind_user_state *state,
 		return -EINVAL;
 	}
 	cfa += frame->cfa.offset;
-	if (frame->cfa.rule & UNWIND_USER_RULE_DEREF &&
+	if (frame->cfa.rule & UNWIND_RULE_DEREF &&
 	    get_user_word(&cfa, cfa, 0, state->ws))
 		return -EINVAL;
 
@@ -76,19 +77,19 @@ static int unwind_user_next_common(struct unwind_user_state *state,
 
 	/* Get the Return Address (RA) */
 	switch (frame->ra.rule) {
-	case UNWIND_USER_RULE_RETAIN:
+	case UNWIND_RULE_RETAIN:
 		if (!state->topmost || unwind_user_get_ra_reg(&ra))
 			return -EINVAL;
 		break;
 	/*
-	 * UNWIND_USER_RULE_CFA_OFFSET doesn't make sense for RA.
+	 * UNWIND_RULE_CFA_OFFSET doesn't make sense for RA.
 	 * A return address cannot legitimately be a stack address.
 	 */
-	case UNWIND_USER_RULE_CFA_OFFSET_DEREF:
+	case UNWIND_RULE_CFA_OFFSET_DEREF:
 		ra = cfa + frame->ra.offset;
 		break;
-	case UNWIND_USER_RULE_REG_OFFSET:
-	case UNWIND_USER_RULE_REG_OFFSET_DEREF:
+	case UNWIND_RULE_REG_OFFSET:
+	case UNWIND_RULE_REG_OFFSET_DEREF:
 		if (!state->topmost || unwind_user_get_reg(&ra, frame->ra.regnum))
 			return -EINVAL;
 		ra += frame->ra.offset;
@@ -97,24 +98,24 @@ static int unwind_user_next_common(struct unwind_user_state *state,
 		WARN_ON_ONCE(1);
 		return -EINVAL;
 	}
-	if (frame->ra.rule & UNWIND_USER_RULE_DEREF &&
+	if (frame->ra.rule & UNWIND_RULE_DEREF &&
 	    get_user_word(&ra, ra, 0, state->ws))
 		return -EINVAL;
 
 	/* Get the Frame Pointer (FP) */
 	switch (frame->fp.rule) {
-	case UNWIND_USER_RULE_RETAIN:
+	case UNWIND_RULE_RETAIN:
 		fp = state->fp;
 		break;
 	/*
-	 * UNWIND_USER_RULE_CFA_OFFSET is currently not used for FP
+	 * UNWIND_RULE_CFA_OFFSET is currently not used for FP
 	 * (e.g. SFrame cannot represent this rule).
 	 */
-	case UNWIND_USER_RULE_CFA_OFFSET_DEREF:
+	case UNWIND_RULE_CFA_OFFSET_DEREF:
 		fp = cfa + frame->fp.offset;
 		break;
-	case UNWIND_USER_RULE_REG_OFFSET:
-	case UNWIND_USER_RULE_REG_OFFSET_DEREF:
+	case UNWIND_RULE_REG_OFFSET:
+	case UNWIND_RULE_REG_OFFSET_DEREF:
 		if (!state->topmost || unwind_user_get_reg(&fp, frame->fp.regnum))
 			return -EINVAL;
 		fp += frame->fp.offset;
@@ -123,7 +124,7 @@ static int unwind_user_next_common(struct unwind_user_state *state,
 		WARN_ON_ONCE(1);
 		return -EINVAL;
 	}
-	if (frame->fp.rule & UNWIND_USER_RULE_DEREF &&
+	if (frame->fp.rule & UNWIND_RULE_DEREF &&
 	    get_user_word(&fp, fp, 0, state->ws))
 		return -EINVAL;
 
@@ -139,13 +140,13 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
 	struct pt_regs *regs = task_pt_regs(current);
 
 	if (state->topmost && unwind_user_at_function_start(regs)) {
-		const struct unwind_user_frame fp_entry_frame = {
+		const struct unwind_frame fp_entry_frame = {
 			ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws)
 		};
 		return unwind_user_next_common(state, &fp_entry_frame);
 	}
 
-	const struct unwind_user_frame fp_frame = {
+	const struct unwind_frame fp_frame = {
 		ARCH_INIT_USER_FP_FRAME(state->ws)
 	};
 	return unwind_user_next_common(state, &fp_frame);
@@ -153,10 +154,10 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
 
 static int unwind_user_next_sframe(struct unwind_user_state *state)
 {
-	struct unwind_user_frame frame;
+	struct unwind_frame frame;
 
 	/* sframe expects the frame to be local storage */
-	if (sframe_find(state->ip, &frame))
+	if (sframe_find_user(state->ip, &frame))
 		return -ENOENT;
 	return unwind_user_next_common(state, &frame);
 }
-- 
2.54.0.563.g4f69b47b94-goog


^ permalink raw reply related

* [PATCH v6 0/9] unwind, arm64: add sframe unwinder for kernel
From: Dylan Hatch @ 2026-05-19  6:49 UTC (permalink / raw)
  To: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Mark Rutland, Jens Remus
  Cc: Dylan Hatch, Prasanna Kumar T S M, Puranjay Mohan, Song Liu,
	joe.lawrence, linux-toolchains, linux-kernel, live-patching,
	linux-arm-kernel, Randy Dunlap, Mostafa Saleh, Herbert Xu,
	David S. Miller

Implement a generic kernel sframe-based [1] unwinder. The main goal is
to improve reliable stacktrace on arm64 by unwinding across exception
boundaries.

On x86, the ORC unwinder provides reliable stacktrace through similar
methodology, but arm64 lacks the necessary support from objtool to
create ORC unwind tables.

Currently, there's already a sframe unwinder proposed for userspace: [2].
To maintain common definitions and algorithms for sframe lookup, a
substantial portion of this patch series aims to refactor the sframe
lookup code to support both kernel and userspace sframe sections.

Currently, only GNU Binutils supports sframe. This series relies on the
Sframe V3 format, which is supported in binutils 2.46.

These patches are based on Steven Rostedt's sframe/core branch [3],
which is an aggregation of existing work done for x86 sframe userspace
unwind, and contains [2]. This branch is, in turn, based on Linux
v7.1-rc2. This full series (applied to the sframe/core branch) is
available on github: [4].

Ref:
[1]: https://sourceware.org/binutils/docs/sframe-spec.html
[2]: https://lore.kernel.org/all/20260505121718.3572346-1-jremus@linux.ibm.com/
[3]: https://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git/log/?h=sframe/core
[4]: https://github.com/dylanbhatch/linux/tree/sframe-v6

Changes since v5:
- Rebase on latest sframe/core branch [3] (based on v7.1-rc2).
- (Mark) Drop CFI annotations from el1*_64_* entry functions.
- (Mark) Add CFI annotations for leaf functions in lib/ and crypto/.
- (Jens) Sort module FDEs at load-time, drop linear search method.
- (Jens) Fix mistake in module SFrame validation where temp copy is not
  yet embedded within a struct module.
- (Jens) Initialize debug info for kernel .sframe sections.
- (Mark) Move kernel-specific unwind fields to struct kunwind_state.
- (Mark) Drop SP from unwind state.
- (Mark) Rename unwind_next_frame_sframe -> kunwind_next_regs_sframe,
  add checks to assert a correct KUNWIND_SOURCE_REGS_PC state.
- (Mark) Drop unused flexible FDE handling.
- (Mark) Check CFA alignment to 16 bytes instead of 8 bytes.
- (Mark) For non-KUNWIND_SOURCE_REGS_PC state, drop the fallback to
  SFrame unwind if FP unwind fails in kunwind_next().

Dylan Hatch (8):
  sframe: Allow kernelspace sframe sections
  arm64, unwind: build kernel with sframe V3 info
  arm64, crypto/lib: Annotate leaf functions with CFI info.
  sframe: Provide PC lookup for vmlinux .sframe section
  arm64/module, sframe: Add sframe support for modules
  sframe: Introduce in-kernel SFRAME_VALIDATION
  sframe: Initialize debug info for kernel sections
  unwind: arm64: Use sframe to unwind interrupt frames

Weinan Liu (1):
  arm64: entry: add unwind info for call_on_irq_stack()

 MAINTAINERS                                   |   4 +-
 Makefile                                      |   8 +
 arch/Kconfig                                  |  27 +-
 arch/arm64/Kconfig                            |   1 +
 arch/arm64/crypto/aes-ce-ccm-core.S           |  12 +-
 arch/arm64/crypto/aes-neonbs-core.S           |  40 +-
 arch/arm64/crypto/ghash-ce-core.S             |  20 +-
 arch/arm64/crypto/sm4-ce-ccm-core.S           |  16 +-
 arch/arm64/crypto/sm4-ce-cipher-core.S        |   4 +-
 arch/arm64/crypto/sm4-ce-core.S               |  44 +-
 arch/arm64/crypto/sm4-ce-gcm-core.S           |  16 +-
 arch/arm64/crypto/sm4-neon-core.S             |  12 +-
 arch/arm64/include/asm/linkage.h              |  30 ++
 arch/arm64/include/asm/module.h               |   6 +
 arch/arm64/include/asm/sections.h             |   1 +
 arch/arm64/include/asm/unwind_sframe.h        |  54 +++
 arch/arm64/kernel/entry.S                     |  14 +
 arch/arm64/kernel/module.c                    |   8 +
 arch/arm64/kernel/setup.c                     |   2 +
 arch/arm64/kernel/stacktrace.c                | 222 +++++++++-
 arch/arm64/kernel/vdso/Makefile               |   2 +-
 arch/arm64/kernel/vmlinux.lds.S               |   2 +
 arch/arm64/lib/clear_page.S                   |   4 +-
 arch/arm64/lib/clear_user.S                   |   4 +-
 arch/arm64/lib/copy_from_user.S               |   4 +-
 arch/arm64/lib/copy_page.S                    |   4 +-
 arch/arm64/lib/copy_to_user.S                 |   4 +-
 arch/arm64/lib/memchr.S                       |   4 +-
 arch/arm64/lib/memcmp.S                       |   4 +-
 arch/arm64/lib/memcpy.S                       |   8 +-
 arch/arm64/lib/memset.S                       |   8 +-
 arch/arm64/lib/mte.S                          |  28 +-
 arch/arm64/lib/strchr.S                       |   4 +-
 arch/arm64/lib/strcmp.S                       |   4 +-
 arch/arm64/lib/strlen.S                       |   4 +-
 arch/arm64/lib/strncmp.S                      |   4 +-
 arch/arm64/lib/strnlen.S                      |   4 +-
 arch/arm64/lib/tishift.S                      |  12 +-
 .../{unwind_user_sframe.h => unwind_sframe.h} |   6 +-
 arch/x86/include/asm/unwind_user.h            |  12 +-
 include/asm-generic/sections.h                |   4 +
 include/asm-generic/vmlinux.lds.h             |  15 +
 include/linux/sframe.h                        |  67 ++-
 include/linux/unwind_types.h                  |  46 ++
 include/linux/unwind_user_types.h             |  41 --
 kernel/unwind/Makefile                        |   2 +-
 kernel/unwind/sframe.c                        | 419 ++++++++++++++----
 kernel/unwind/sframe_debug.h                  |  13 +
 kernel/unwind/user.c                          |  45 +-
 49 files changed, 979 insertions(+), 340 deletions(-)
 create mode 100644 arch/arm64/include/asm/unwind_sframe.h
 rename arch/x86/include/asm/{unwind_user_sframe.h => unwind_sframe.h} (50%)
 create mode 100644 include/linux/unwind_types.h

-- 
2.54.0.563.g4f69b47b94-goog


^ permalink raw reply

* Re: [RFC PATCH 5/6] livepatch: Remove obsolete per-object callbacks
From: Yafang Shao @ 2026-05-19  6:46 UTC (permalink / raw)
  To: sashiko-reviews
  Cc: jikos, joe.lawrence, mbenes, song, pmladek, jpoimboe,
	live-patching
In-Reply-To: <20260514224000.8316FC2BCB3@smtp.kernel.org>

On Fri, May 15, 2026 at 6:40 AM <sashiko-bot@kernel.org> wrote:
>
> Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
> - [Critical] Out-of-bounds ELF relocations causing memory corruption
> - [High] Silent failure of livepatch state callbacks due to missing initialization
> - [Low] Build failures due to unmigrated selftests and samples
> --
>
> commit deb761bd65e9a96a95798282bde72ddb362e1c0d
> Author: Yafang Shao <laoar.shao@gmail.com>
>
> livepatch: Remove obsolete per-object callbacks
>
> This commit removes obsolete per-object callbacks from the livepatch framework
> in favor of per-state callbacks. It drops the callback fields from the klp
> object structures and updates objtool and the livepatch core scripts.
>
> > All selftests have been migrated to the new per-state callbacks, making the
> > per-object callbacks redundant.
>
> Is this description accurate?

Selftests are untouched in this RFC. I'll revise the commit log accordingly.

>
> The callbacks field was removed from struct klp_object, but if modules like
> samples/livepatch/livepatch-callbacks-demo.c or the selftests still use it
> in their klp_object declarations, won't this cause build failures?
>
> > diff --git a/scripts/livepatch/init.c b/scripts/livepatch/init.c
> > index 9265e0e504e0c..570366b9268c4 100644
> > --- a/scripts/livepatch/init.c
> > +++ b/scripts/livepatch/init.c
> > @@ -63,8 +63,6 @@ static int __init livepatch_mod_init(void)
> >
> >               obj->name = obj_ext->name;
> >               obj->funcs = funcs;
> > -
> > -             memcpy(&obj->callbacks, &obj_ext->callbacks, sizeof(struct klp_callbacks));
> >       }
> >
> >       patch->mod = THIS_MODULE;
>
> Will the state callbacks be executed when the patch is enabled?
>
> Looking at livepatch_mod_init() after these changes, patch->states is never
> initialized:
>
> scripts/livepatch/init.c:livepatch_mod_init() {
>         ...
>         patch->mod = THIS_MODULE;
>         patch->objs = objs;
>
>         /* TODO patch->states */
>
>         patch->replace_set = KLP_REPLACE_TAG;
>         return klp_enable_patch(patch);
> }
>
> When klp_states_pre_patch() iterates over patch->states, won't it silently
> skip the uninitialized callbacks?

Since these callbacks have been moved to klp_state, they must be
initialized first. Note that the KLP_*_PATCH_CALLBACK macros are
broken after patch #6; I will fix this in the next version.

>
> > diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c
> > index 4d1f9e9977eb9..6e2762de3d512 100644
> > --- a/tools/objtool/klp-diff.c
> > +++ b/tools/objtool/klp-diff.c
> [ ... ]
> > @@ -1566,8 +1566,8 @@ static int create_klp_sections(struct elfs *e)
> >               reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset);
> >
> >               if (!elf_create_reloc(e->out, obj_sec,
> > -                                   offsetof(struct klp_object_ext, callbacks) +
> > -                                   offsetof(struct klp_callbacks, pre_patch),
> > +                                   offsetof(struct klp_state_ext, callbacks) +
> > +                                   offsetof(struct klp_state_callbacks, pre_patch),
> >                                     reloc->sym, reloc_addend(reloc), R_ABS64))
> >                       return -1;
> >       }
>
> Does this result in out-of-bounds memory writes during ELF relocation?
>
> Earlier in create_klp_sections(), obj_sec is allocated using the size of
> struct klp_object_ext, which is 24 bytes:
>
> tools/objtool/klp-diff.c:create_klp_sections() {
>         size_t obj_size  = sizeof(struct klp_object_ext);
>         ...
>         obj_sec  = elf_create_section_pair(e->out, KLP_OBJECTS_SEC, obj_size, 0, 0);
>
> However, the new relocations are created using offsets from a completely
> different structure, struct klp_state_ext.
>
> Because offsetof(struct klp_state_ext, callbacks) is 16, and the callback
> offsets go up to 24, this applies relocations at offsets 16, 24, 32, and 40
> within obj_sec.
>
> Won't the relocation at offset 16 corrupt klp_object_ext.nr_funcs, and the
> subsequent relocations write entirely out of bounds of the 24-byte obj_sec
> buffer?

Nice catch. I will fix it in the next version.

-- 
Regards
Yafang

^ permalink raw reply

* Re: [PATCH v5 8/8] unwind: arm64: Use sframe to unwind interrupt frames
From: Dylan Hatch @ 2026-05-19  6:29 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Jens Remus, Prasanna Kumar T S M, Puranjay Mohan,
	Song Liu, joe.lawrence, linux-toolchains, linux-kernel,
	live-patching, linux-arm-kernel, Randy Dunlap
In-Reply-To: <afTYzAF_x41pyilu@J2N7QTR9R3>

Hi Mark,

I'm sending a v6 shortly that should address all/most of your
feedback, but I wanted to circle back on a question you had:

On Fri, May 1, 2026 at 9:46 AM Mark Rutland <mark.rutland@arm.com> wrote:
> > +     /*
> > +      * Consume RA and FP from the stack. The frame record puts FP at a lower
> > +      * address than RA, so we always read FP first.
> > +      */
> > +     if (frame.fp.rule & UNWIND_RULE_DEREF &&
> > +         !get_word(&state->common, &fp))
> > +             return -EINVAL;
>
> Why is this get_word() rather than get_consume_word()?

I use get_word() here because get_consume_word(), in calling
unwind_consume_stack() under the hood, consumes the stack up to the
given address+size such that another unwind step cannot consume it
again. If the subsequent call to get_consume_word() fails, the stack
needs to be in a state such that we can fall back on a frame pointer
unwind. But if we were to use get_consume_word() here, the fallback
call to kunwind_next_frame_record() would not be able to consume the
FP from the stack because it would already have been consumed by the
failed call to unwind_next_frame_sframe().

By only calling get_consume_word() on the RA at the end, we defer
making any changes to the underlying unwind state stack until we are
sure the SFrame unwind step will succeed.

>
> > +
> > +     if (frame.ra.rule & UNWIND_RULE_DEREF &&
> > +         get_consume_word(&state->common, &ra))
> > +             return -EINVAL;
> > +
> > +     state->common.pc = ra;
> > +     state->common.sp = cfa;

Please let me know if this reasoning seems sound.

Thanks,
Dylan

^ permalink raw reply

* Re: [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Sasha Levin @ 2026-05-19  0:22 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-kernel, linux-doc, linux-kselftest, bpf, live-patching,
	Greg Kroah-Hartman, Andrew Morton, Jonathan Corbet,
	Mathieu Desnoyers, Joshua Peisach, Florian Weimer, Breno Leitao,
	Anthony Iliopoulos, Michal Hocko, Jiri Olsa
In-Reply-To: <CAPhsuW44UX663Au=WwHz8MVwnQgLkjxOqpJSCKxNiv3=RpZvqw@mail.gmail.com>

On Mon, May 18, 2026 at 04:59:08PM -0700, Song Liu wrote:
>On Mon, May 18, 2026 at 6:33 AM Sasha Levin <sashal@kernel.org> wrote:
>>
>> On Sun, May 17, 2026 at 11:37:36PM -0700, Song Liu wrote:
>> >On Sun, May 17, 2026 at 6:49 AM Sasha Levin <sashal@kernel.org> wrote:
>> >> * fail_function (CONFIG_FUNCTION_ERROR_INJECTION) is disabled in
>> >>   most production kernels. Even where enabled, it only works on
>> >>   functions pre-annotated with ALLOW_ERROR_INJECTION() in source -
>> >>   no help for a freshly-disclosed CVE. The debugfs UI is blocked by
>> >>   lockdown=integrity and the override is probabilistic.
>> >>
>> >> * BPF override (bpf_override_return) honors the same
>> >>   ALLOW_ERROR_INJECTION() whitelist, and BPF itself is off in many
>> >>   production kernels. Even where on, the operator interface is
>> >>   "load a verified BPF program," not a one-line write.
>> >
>> >If it is OK for killswitch to attach to any kernel functions, do we still
>> >need ALLOW_ERROR_INJECTION() for fail_function and BPF
>> >override? Shall we instead also allow fail_function and BPF override
>> >to attach to any kernel functions?
>>
>> I don't think so. ALLOW_ERROR_INJECTION is not a security mechanism, it's an
>> integrity/safety mechanism for both bpf and fault injection.
>>
>> It protects against a "developer or CI script doing legitimate fault injection
>> accidentally panics the box" scenario, not an "attacker gets in" one.
>
>There really isn't a clear boundary between "security mechanism" and
>"non-security mechanism". As we are making killswitch available
>everywhere under root, users will soon learn to use it to do fault injection,
>and potentially much more scary things. (Think about agents with sudo
>access).

Wouldn't the same argument apply to /dev/mem? If you enable that, and you give
whatever tool/agent/etc access to the interface, you're bound to have a really
bad time unless you know what you're doing?

root can already load a killswitch equivalent module, right? There's nothing
really new with killswitch.

-- 
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Song Liu @ 2026-05-18 23:59 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, linux-doc, linux-kselftest, bpf, live-patching,
	Greg Kroah-Hartman, Andrew Morton, Jonathan Corbet,
	Mathieu Desnoyers, Joshua Peisach, Florian Weimer, Breno Leitao,
	Anthony Iliopoulos, Michal Hocko, Jiri Olsa
In-Reply-To: <agsVDqdALBoHEHlv@laps>

On Mon, May 18, 2026 at 6:33 AM Sasha Levin <sashal@kernel.org> wrote:
>
> On Sun, May 17, 2026 at 11:37:36PM -0700, Song Liu wrote:
> >On Sun, May 17, 2026 at 6:49 AM Sasha Levin <sashal@kernel.org> wrote:
> >> * fail_function (CONFIG_FUNCTION_ERROR_INJECTION) is disabled in
> >>   most production kernels. Even where enabled, it only works on
> >>   functions pre-annotated with ALLOW_ERROR_INJECTION() in source -
> >>   no help for a freshly-disclosed CVE. The debugfs UI is blocked by
> >>   lockdown=integrity and the override is probabilistic.
> >>
> >> * BPF override (bpf_override_return) honors the same
> >>   ALLOW_ERROR_INJECTION() whitelist, and BPF itself is off in many
> >>   production kernels. Even where on, the operator interface is
> >>   "load a verified BPF program," not a one-line write.
> >
> >If it is OK for killswitch to attach to any kernel functions, do we still
> >need ALLOW_ERROR_INJECTION() for fail_function and BPF
> >override? Shall we instead also allow fail_function and BPF override
> >to attach to any kernel functions?
>
> I don't think so. ALLOW_ERROR_INJECTION is not a security mechanism, it's an
> integrity/safety mechanism for both bpf and fault injection.
>
> It protects against a "developer or CI script doing legitimate fault injection
> accidentally panics the box" scenario, not an "attacker gets in" one.

There really isn't a clear boundary between "security mechanism" and
"non-security mechanism". As we are making killswitch available
everywhere under root, users will soon learn to use it to do fault injection,
and potentially much more scary things. (Think about agents with sudo
access).

Thanks,
Song

^ permalink raw reply

* Re: [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Song Liu @ 2026-05-18 23:52 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, linux-doc, linux-kselftest, bpf, live-patching,
	Greg Kroah-Hartman, Andrew Morton, Jonathan Corbet,
	Mathieu Desnoyers, Joshua Peisach, Florian Weimer, Breno Leitao,
	Anthony Iliopoulos, Michal Hocko, Jiri Olsa
In-Reply-To: <20260517134858.146569-1-sashal@kernel.org>

On Sun, May 17, 2026 at 6:49 AM Sasha Levin <sashal@kernel.org> wrote:
>
> When a kernel (security) issue goes public, fleets stay exposed until a patched
> kernel is built, distributed, and rebooted into.
>
> For many such issues the simplest mitigation is to stop calling the buggy
> function. Killswitch provides that. An admin writes:
>
>     echo "engage af_alg_sendmsg -1" \
>         > /sys/kernel/security/killswitch/control
>

With v3, we hit this with fentry and killswitch on the same function:

[root@(none) /]# bpftrace -e 'fentry:security_file_open {@count+=1;}' &
[1] 295
Attached 1 probe
[root@(none) /]# echo 'engage security_file_open 0' >
/sys/kernel/security/killswitch/control
[   97.112360] killswitch: engage security_file_open=0 uid=0
auid=4294967295 ses=4294967295 comm=bash
[   97.120766] BUG: unable to handle page fault for address: ffffffffb5855043
[   97.121212] #PF: supervisor read access in kernel mode
[   97.121517] #PF: error_code(0x0000) - not-present page
[   97.121710] PGD 4a76067 P4D 4a77067 PUD 4a78063 PMD 0
[   97.121710] Oops: Oops: 0000 [#1] SMP NOPTI
[   97.121710] CPU: 1 UID: 0 PID: 430 Comm: bash Tainted: G
     N H 7.1.0-rc4+ #195 PREEMPT(full)
[   97.121710] Tainted: [N]=TEST, [H]=KILLSWITCH
[   97.121710] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[   97.121710] RIP: 0010:fd_install+0x1c/0x220
[   97.121710] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f
1e fa 0f 1f 44 00 00 65 48 8b 15 47 a0 a4 04 41 54 55 53 48 8b 9a 70
0a 00 00 <f6> 46 43 01 0f 85 62 01 00 00 41 89 fc 48 89 f5 65 ff 05 3d
a0 a4
[   97.121710] RSP: 0018:ffa0000000f2fe70 EFLAGS: 00010286
[   97.121710] RAX: ffffffffb5855000 RBX: ff11000100911c40 RCX: 0000000000000000
[   97.121710] RDX: ff110001045349c0 RSI: ffffffffb5855000 RDI: 0000000000000003
[   97.121710] RBP: ff11000100be81c0 R08: 0000000000000001 R09: 0000000000000000
[   97.121710] R10: 0000000000000001 R11: 00000000000008c2 R12: 0000000000000003
[   97.121710] R13: 00000000ffffff9c R14: 0000000000000101 R15: 0000000000000000
[   97.121710] FS:  00007fb231d4d740(0000) GS:ff110001b5855000(0000)
knlGS:0000000000000000
[   97.121710] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   97.121710] CR2: ffffffffb5855043 CR3: 0000000114513002 CR4: 0000000000771ef0
[   97.121710] PKRU: 00000000
[   97.121710] Call Trace:
[   97.121710]  <TASK>
[   97.121710]  do_sys_openat2+0x7f/0xe0
[   97.121710]  __x64_sys_openat+0x56/0xa0
[   97.121710]  do_syscall_64+0xc4/0xf20
[   97.121710]  ? srso_alias_return_thunk+0x5/0xfbef5
[   97.121710]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   97.121710] RIP: 0033:0x7fb231e4ee1b
[   97.121710] Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25
18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00
00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 54 24 28 64 48 2b
14 25
[   97.121710] RSP: 002b:00007ffefe160770 EFLAGS: 00000246 ORIG_RAX:
0000000000000101
[   97.121710] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fb231e4ee1b
[   97.121710] RDX: 0000000000000000 RSI: 000055616f0411d0 RDI: 00000000ffffff9c
[   97.121710] RBP: 000055616f0411d0 R08: 000055616f046b60 R09: 0064692d656e6968
[   97.121710] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[   97.121710] R13: 000055616f03cb20 R14: 000055616f039310 R15: 0000000000000000
[   97.121710]  </TASK>
[   97.121710] Modules linked in:
[   97.121710] CR2: ffffffffb5855043
[   97.121710] ---[ end trace 0000000000000000 ]---
[   97.121710] RIP: 0010:fd_install+0x1c/0x220
[   97.121710] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f
1e fa 0f 1f 44 00 00 65 48 8b 15 47 a0 a4 04 41 54 55 53 48 8b 9a 70
0a 00 00 <f6> 46 43 01 0f 85 62 01 00 00 41 89 fc 48 89 f5 65 ff 05 3d
a0 a4
[   97.121710] RSP: 0018:ffa0000000f2fe70 EFLAGS: 00010286
[   97.121710] RAX: ffffffffb5855000 RBX: ff11000100911c40 RCX: 0000000000000000
[   97.121710] RDX: ff110001045349c0 RSI: ffffffffb5855000 RDI: 0000000000000003
[   97.121710] RBP: ff11000100be81c0 R08: 0000000000000001 R09: 0000000000000000
[   97.121710] R10: 0000000000000001 R11: 00000000000008c2 R12: 0000000000000003
[   97.121710] R13: 00000000ffffff9c R14: 0000000000000101 R15: 0000000000000000
[   97.121710] FS:  00007fb231d4d740(0000) GS:ff110001b5855000(0000)
knlGS:0000000000000000
[   97.121710] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   97.121710] CR2: ffffffffb5855043 CR3: 0000000114513002 CR4: 0000000000771ef0
[   97.121710] PKRU: 00000000
[   97.121710] Kernel panic - not syncing: Fatal exception
[   97.121710] Kernel Offset: disabled

^ permalink raw reply

* Re: [PATCH v5 3/8] arm64: entry: add unwind info for various kernel entries
From: Dylan Hatch @ 2026-05-18 22:41 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Jens Remus, Prasanna Kumar T S M, Puranjay Mohan,
	Song Liu, joe.lawrence, linux-toolchains, linux-kernel,
	live-patching, linux-arm-kernel, Randy Dunlap
In-Reply-To: <agbgMb6jrgiFFHRX@J2N7QTR9R3>

On Fri, May 15, 2026 at 1:58 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Thu, May 14, 2026 at 08:30:43PM -0700, Dylan Hatch wrote:
> > On Wed, Apr 29, 2026 at 8:26 AM Mark Rutland <mark.rutland@arm.com> wrote:
> > > On Tue, Apr 28, 2026 at 06:36:38PM +0000, Dylan Hatch wrote:
> > > > From: Weinan Liu <wnliu@google.com>
> > > >
> > > > DWARF CFI (Call Frame Information) specifies how to recover the return
> > > > address and callee-saved registers at each PC in a given function.
> > > > Compilers are able to generate the CFI annotations when they compile
> > > > the code to assembly language. For handcrafted assembly, we need to
> > > > annotate them by hand.
> > > >
> > > > Annotate minimal CFI to enable stacktracing using SFrame for kernel
> > > > exception entries through el1*_64_*() paths
> > >
> > > I thought we were only consuming SFrame when unwinding an exeption
> > > boundary?
> > >
> > > We shouldn't be taking exceptions _from_ the entry assembly functions
> > > unless something has gone horribly wrong, and so I don't see why we'd
> > > need CFI entries for the entry assembly functions.
> > >
> > > Am I missing some reason we need CFI entries for the entry assembly
> > > functions? I strongly suspect it is not necessary to add these, and I'd
> > > prefer to omit them.
> >
> > I believe the el1 entry functions are called in an exception, and are
> > called before call_on_irq_stack.
>
> Yes, but I don't think that matters. See below for more details.
>
> > Example stacktrace segment:
> >
> > [  262.119564]  handle_percpu_devid_irq+0xb4/0x348
> > [  262.119913]  handle_irq_desc+0x3c/0x68
> > [  262.120196]  generic_handle_domain_irq+0x20/0x40
> > [  262.120678]  gic_handle_irq+0x48/0xe0
> > [  262.121005]  call_on_irq_stack+0x30/0x48
> > [  262.121412]  do_interrupt_handler+0x88/0xa0
> > [  262.121779]  el1_interrupt+0x38/0x58
> > [  262.122089]  el1h_64_irq_handler+0x18/0x30
> > [  262.122617]  el1h_64_irq+0x6c/0x70
>
> The segment immediately above can be unwound using FP, as frame records
> were created consistently, and there are no exception boundaries. No CFI
> needed.

Ah right -- with the logic in stacktrace.c changed to use SFrame only
when recovering the next frame from a pt_regs, the FP alone is
sufficient if we know these entry functions won't take an exception.
This patch was originally implemented with an SFrame-only unwinder in
mind, so my mental model still hadn't back-propagated the new logic to
this patch :)

>
> It's legitimate to take an exception from parts of call_on_irq_stack(),
> so it makes sense for that to have CFI, but for the specific unwind
> segment above, CFI isn't necessary.
>
> Everything in the stacktrace segment above was executed *after* HW took
> the exception.
>
> << EXCEPTION BOUNDARY HERE >>
>
> Everything in the stacktrace segment(s) below was executed *before* HW
> took the exception.
>
> The unwinder knows that it has crossed this exception boundary by virtue
> of finding a FRAME_META_TYPE_PT_REGS frame record.
>
> > [  262.123159]  _raw_spin_unlock_irq+0x10/0x60 (P)
>
> The unwinder knows that the value of pt_regs::pc was *definitely* the PC
> at the time the exception was taken, so that entry is reliable. No CFI
> needed.
>
> > [  262.123720]  __filemap_add_folio+0x200/0x580 (L)
>
> The unwinder doesn't know whether the LR should be used, and needs CFI
> to determine that.
>
> After this point, an FP unwind can be used until we encounter the next
> exception boundary.

Right, and this is what is implemented in the final patch of this series.

>
> > [  262.124145]  filemap_add_folio+0xec/0x300
> > [  262.124674]  page_cache_ra_unbounded+0x128/0x368
> > [  262.125338]  do_page_cache_ra+0x70/0x98
> > [  262.125875]  page_cache_ra_order+0x460/0x4e0
>
> The segment immediately above can be unwound using FP. No CFI needed.
>
> > Here, el1h_64_irq is the last function that appears in the exception
> > stack before _raw_spin_unlock_irq and __filemap_add_folio are
> > recovered from the saved PC and LR, respectively. So we therefore need
> > the CFI annotations in order to unwind through the full exception
> > boundary.
> >
> > Is my interpretation here correct?
>
> Given you say "full exception boundary" here, I think we might be using
> the term "exception boundary" to mean different things.
>
> As per the example above, I'm using "exception boundary" to mean the a
> point between two entries in the stacktrace where HW took an exception.
>
> Did my comments on the example help?

I admit I may have been using the term "exception boundary" with a
vague definition, which was partly the source of my confusion. Thanks
for the example, it did help.

>
> > > > and irq entries through call_on_irq_stack()
> > >
> > > Needing some sort of unwind annotations for call_on_irq_stack() makes
> > > sense to me, but don't we need something for other assembly functions
> > > too?
> > >
> > > We can interrupt things like memset(); I assume we'll treat those as
> > > unreliable until annotated?
> >
> > While looking into adding these annotations, I noticed a pattern where
> > a sibling call is made to a local function:
> >
> > SYM_FUNC_START(__pi_memset)
> > alternative_if_not ARM64_HAS_MOPS
> >         b       __pi_memset_generic
> > alternative_else_nop_endif
> >
> >         mov     dst, dstin
> >         setp    [dst]!, count!, val_x
> >         setm    [dst]!, count!, val_x
> >         sete    [dst]!, count!, val_x
> >         ret
> > SYM_FUNC_END(__pi_memset)
> >
> > In this case, do we consider the stacktrace unreliable since
> > __pi_memset may not appear in the trace?
>
> This is a tail-call, and __pi_memset_generic() will not return to
> __pi_memset(). Once the branch to __pi_memset_generic() has been
> executed, it's fine for __pi_memset() to not show up in the trace.
>
> The key thing is that no more instructions from __pi_memset() itself
> will be executed unless it was called again (from its entry point).
>
> > Or is this not important because assembly functions cannot be directly
> > livepatched anyway?
>
> To the best of my knowledge, reliable stacktrace is only used to
> determine whether any thread is still within an old version of a
> patchable function (including where it's within a callee thereof).
>
> I am not aware of a case where we'd need to detect whether a thread is
> still within a non-patchable function, but I can't rule out that as a
> possibility.
>
> That's more of a question for the livepatching maintainers.
>
> Thanks,
> Mark.

Thanks,
Dylan

^ permalink raw reply

* Re: [RFC PATCH 1/6] livepatch: Support scoped atomic replace using replace set
From: Song Liu @ 2026-05-18 21:25 UTC (permalink / raw)
  To: Yafang Shao; +Cc: jpoimboe, jikos, mbenes, pmladek, joe.lawrence, live-patching
In-Reply-To: <20260513143321.26185-2-laoar.shao@gmail.com>

On Wed, May 13, 2026 at 7:34 AM Yafang Shao <laoar.shao@gmail.com> wrote:
>
> Convert the replace attribute from a boolean to a u32 to function as a
> "replace set." A newly loaded livepatch will now atomically replace
> existing patches that belong to the same set.
>
> This change currently supports function replacement only; support for
> state and shadow variables will be introduced in subsequent patches.
>
> Suggested-by: Song Liu <song@kernel.org>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  .../livepatch/cumulative-patches.rst          | 17 ++++++++------
>  Documentation/livepatch/livepatch.rst         | 23 +++++++++++--------
>  include/linux/livepatch.h                     |  5 ++--
>  kernel/livepatch/core.c                       | 16 ++++++++-----
>  kernel/livepatch/state.c                      | 17 +++++++-------
>  kernel/livepatch/transition.c                 | 10 ++++----
>  scripts/livepatch/init.c                      |  7 +-----
>  scripts/livepatch/klp-build                   | 14 +++++------
>  8 files changed, 59 insertions(+), 50 deletions(-)
>
> diff --git a/Documentation/livepatch/cumulative-patches.rst b/Documentation/livepatch/cumulative-patches.rst
> index 1931f318976a..6ef49748110e 100644
> --- a/Documentation/livepatch/cumulative-patches.rst
> +++ b/Documentation/livepatch/cumulative-patches.rst
> @@ -17,18 +17,20 @@ from all older livepatches and completely replace them in one transition.
>  Usage
>  -----
>
> -The atomic replace can be enabled by setting "replace" flag in struct klp_patch,
> -for example::
> +The "replace_set" attribute in ``struct klp_patch`` acts as a **replace set**,
> +defining the scope of the replacement. By default, the replace set is 1.
> +
> +For example::
>
>         static struct klp_patch patch = {
>                 .mod = THIS_MODULE,
>                 .objs = objs,
> -               .replace = true,
> +               .replace_set = 1,
>         };

I wonder whether we should have "replace_set = 0" means no replace.
This will simplify the transition for users of the existing replace=false
option. I would like to hear other folks' thoughts on this.

Thanks,
Song

^ permalink raw reply

* Re: [PATCH v5 0/8] unwind, arm64: add sframe unwinder for kernel
From: Dylan Hatch @ 2026-05-18 17:55 UTC (permalink / raw)
  To: Mostafa Saleh
  Cc: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Jens Remus, Mark Rutland, Prasanna Kumar T S M,
	Puranjay Mohan, Song Liu, joe.lawrence, linux-toolchains,
	linux-kernel, live-patching, linux-arm-kernel, Randy Dunlap
In-Reply-To: <agcEMEl-QR0g6DgF@google.com>

Hi Mostafa,

On Fri, May 15, 2026 at 4:32 AM Mostafa Saleh <smostafa@google.com> wrote:
>
> On Tue, Apr 28, 2026 at 06:36:35PM +0000, Dylan Hatch wrote:
> > Implement a generic kernel sframe-based [1] unwinder. The main goal is
> > to improve reliable stacktrace on arm64 by unwinding across exception
> > boundaries.
> >
> > On x86, the ORC unwinder provides reliable stacktrace through similar
> > methodology, but arm64 lacks the necessary support from objtool to
> > create ORC unwind tables.
> >
> > Currently, there's already a sframe unwinder proposed for userspace: [2].
> > To maintain common definitions and algorithms for sframe lookup, a
> > substantial portion of this patch series aims to refactor the sframe
> > lookup code to support both kernel and userspace sframe sections.
> >
> > Currently, only GNU Binutils support sframe. This series relies on the
> > Sframe V3 format, which is supported in binutils 2.46.
> >
> > These patches are based on Steven Rostedt's sframe/core branch [3],
> > which is and aggregation of existing work done for x86 sframe userspace
> > unwind, and contains [2]. This branch is, in turn, based on Linux
> > v7.0-rc3. This full series (applied to the sframe/core branch) is
> > available on github: [4].
> >
>
> Not sure if related, but after updating my toolchain
> (aarch64-linux-gnu-gcc (Debian 15.2.0-4) 15.2.0), I hit link errors:
> ld.lld: error: arch/arm64/kernel/vdso/vgettimeofday.o:(.sframe) is being placed in '.sframe'
> ld.lld: error: arch/arm64/kernel/vdso/vgetrandom.o:(.sframe) is being placed in '.sframe`

Previously when developing against the SFrame V2 format, I had fixed
these warnings with the VDSO Makefile change currently in this series:

diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 7dec05dd33b7..c60ef921956f 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -38,7 +38,7 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
 CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
                        $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \
                        $(GCC_PLUGINS_CFLAGS) \
-                       $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
+                       $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(CC_FLAGS_SFRAME) \
                        -Wmissing-prototypes -Wmissing-declarations

 CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables

But the warnings seem to have returned after upgrading my toolchain,
possibly due to SFrame V3 or some confounding change in GCC. The
--gsframe in the assembler should be set to 'no' by default, so
perhaps GCC is providing an override --gsframe internally?

>
> I applied this series hoping it would fix this, but it doesn't; so far I
> have this hack:
> diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
> index 52314be29191..53bdf757ee44 100644
> --- a/arch/arm64/kernel/vdso/vdso.lds.S
> +++ b/arch/arm64/kernel/vdso/vdso.lds.S
> @@ -77,7 +77,7 @@ SECTIONS
>         /DISCARD/       : {
>                 *(.data .data.* .gnu.linkonce.d.* .sdata*)
>                 *(.bss .sbss .dynbss .dynsbss)
> -               *(.eh_frame .eh_frame_hdr)
> +               *(.eh_frame .eh_frame_hdr .sframe)
>         }
>  }
>
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index 60c8c22fd3e4..759903acd6fc 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -1064,6 +1064,7 @@
>         /* ld.bfd warns about .gnu.version* even when not emitted */    \
>         *(.gnu.version*)                                                \
>         *(__tracepoint_check)                                           \
> +       *(.sframe)                                                      \
>
>  #define DISCARDS                                                       \
>         /DISCARD/ : {                                                   \

Since this series only handles kernel stacktrace, I believe it's
better to omit the .sframe section entirely in the case where only
ARCH_SUPPORTS_UNWIND_KERNEL_SFRAME is enabled. I think this hack may
work better for this purpose:

diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index c60ef921956f..29f802bfedb1 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -41,7 +41,7 @@ CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
                        $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(CC_FLAGS_SFRAME) \
                        -Wmissing-prototypes -Wmissing-declarations

-CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables
+CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables -Wa,--gsframe=no

 CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_REMOVE_VDSO)
 CFLAGS_REMOVE_vgetrandom.o = $(CC_FLAGS_REMOVE_VDSO)

Though, I don't understand why it is necessary to provide --gsframe=no
explicitly. If this approach seems ok to other folks/maintainers, I
can fold this into my series.

On the topic of SFrame for VDSO, Jens has a patch adding support for
this as part of a series to support userspace SFrame unwinding for
arm64:

https://lore.kernel.org/lkml/20260417150827.1183376-4-jremus@linux.ibm.com/

>
>
> Thanks,
> Mostafa
>

Thanks,
Dylan

^ permalink raw reply related

* Re: [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Sasha Levin @ 2026-05-18 13:33 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-kernel, linux-doc, linux-kselftest, bpf, live-patching,
	Greg Kroah-Hartman, Andrew Morton, Jonathan Corbet,
	Mathieu Desnoyers, Joshua Peisach, Florian Weimer, Breno Leitao,
	Anthony Iliopoulos, Michal Hocko, Jiri Olsa
In-Reply-To: <CAPhsuW4x8shWon8Moi5VgCq2n4E2EzaaauZ2HHpy42Rp1Y-J-g@mail.gmail.com>

On Sun, May 17, 2026 at 11:37:36PM -0700, Song Liu wrote:
>On Sun, May 17, 2026 at 6:49 AM Sasha Levin <sashal@kernel.org> wrote:
>> * fail_function (CONFIG_FUNCTION_ERROR_INJECTION) is disabled in
>>   most production kernels. Even where enabled, it only works on
>>   functions pre-annotated with ALLOW_ERROR_INJECTION() in source -
>>   no help for a freshly-disclosed CVE. The debugfs UI is blocked by
>>   lockdown=integrity and the override is probabilistic.
>>
>> * BPF override (bpf_override_return) honors the same
>>   ALLOW_ERROR_INJECTION() whitelist, and BPF itself is off in many
>>   production kernels. Even where on, the operator interface is
>>   "load a verified BPF program," not a one-line write.
>
>If it is OK for killswitch to attach to any kernel functions, do we still
>need ALLOW_ERROR_INJECTION() for fail_function and BPF
>override? Shall we instead also allow fail_function and BPF override
>to attach to any kernel functions?

I don't think so. ALLOW_ERROR_INJECTION is not a security mechanism, it's an
integrity/safety mechanism for both bpf and fault injection.

It protects against a "developer or CI script doing legitimate fault injection
accidentally panics the box" scenario, not an "attacker gets in" one.

-- 
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Song Liu @ 2026-05-18  6:37 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, linux-doc, linux-kselftest, bpf, live-patching,
	Greg Kroah-Hartman, Andrew Morton, Jonathan Corbet,
	Mathieu Desnoyers, Joshua Peisach, Florian Weimer, Breno Leitao,
	Anthony Iliopoulos, Michal Hocko, Jiri Olsa
In-Reply-To: <20260517134858.146569-1-sashal@kernel.org>

On Sun, May 17, 2026 at 6:49 AM Sasha Levin <sashal@kernel.org> wrote:
>
> When a kernel (security) issue goes public, fleets stay exposed until a patched
> kernel is built, distributed, and rebooted into.
>
> For many such issues the simplest mitigation is to stop calling the buggy
> function. Killswitch provides that. An admin writes:
>
>     echo "engage af_alg_sendmsg -1" \
>         > /sys/kernel/security/killswitch/control
>
> After this, af_alg_sendmsg() returns -EPERM on every call without
> running its body. The mitigation takes effect immediately, and is dropped on
> the next reboot -- by which point a patched kernel is hopefully in place.
>
> A lot of recent kernel issues sit in code paths most installs only have enabled
> to support a relative minority of users: AF_ALG, ksmbd, nf_tables, vsock, ax25,
> and friends.
>
> For most users, the cost of "this socket family stops working for the day" is
> much smaller than the cost of running a known vulnerable kernel until the fix
> lands.
>
> Why not an existing facility:
>
> * livepatch needs a built, signed, per-kernel-version module per CVE.
>   Under Secure Boot the operator can't sign their own, so they wait
>   for the vendor, and only a minority of vendors actually ship
>   livepatches. Killswitch covers the days before that module shows
>   up.
>
> * fail_function (CONFIG_FUNCTION_ERROR_INJECTION) is disabled in
>   most production kernels. Even where enabled, it only works on
>   functions pre-annotated with ALLOW_ERROR_INJECTION() in source -
>   no help for a freshly-disclosed CVE. The debugfs UI is blocked by
>   lockdown=integrity and the override is probabilistic.
>
> * BPF override (bpf_override_return) honors the same
>   ALLOW_ERROR_INJECTION() whitelist, and BPF itself is off in many
>   production kernels. Even where on, the operator interface is
>   "load a verified BPF program," not a one-line write.

If it is OK for killswitch to attach to any kernel functions, do we still
need ALLOW_ERROR_INJECTION() for fail_function and BPF
override? Shall we instead also allow fail_function and BPF override
to attach to any kernel functions?

Thanks,
Song

^ permalink raw reply

* Re: [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Greg Kroah-Hartman @ 2026-05-18  5:23 UTC (permalink / raw)
  To: Brandon Taylor
  Cc: Sasha Levin, linux-kernel, linux-doc, linux-kselftest, bpf,
	live-patching, Andrew Morton, Jonathan Corbet, Mathieu Desnoyers,
	Joshua Peisach, Florian Weimer, Breno Leitao, Anthony Iliopoulos,
	Michal Hocko, Jiri Olsa
In-Reply-To: <e4527b42-f2e6-447a-861d-580a521bde19@yahoo.com>

On Sun, May 17, 2026 at 02:19:35PM -0500, Brandon Taylor wrote:
> Have we learned NOTHING from just over 9 and a half years ago?!
> 
> I do not pretend to be a prophet of Linus, but I cannot for the life of me
> help but get flashbacks from kernel version 4.8 when Linus himself did not
> explain, but EXPLODED, in saying "there is NO F*CKING EXCUSE to knowingly
> kill the kernel."

That's not what this patchset does, sorry.

best of luck!

greg k-h

^ permalink raw reply

* Re: [PATCH] livepatch: Allow to handle lifetime of shadow variables using the livepatch state
From: Yafang Shao @ 2026-05-18  3:24 UTC (permalink / raw)
  To: sashiko-bot
  Cc: jikos, joe.lawrence, jpoimboe, live-patching, mbenes, pmladek,
	sashiko-reviews, song
In-Reply-To: <20260514220757.412F4C2BCB3@smtp.kernel.org>

> Will this callback run in an atomic context, unlike the other state callbacks?

> All existing state callbacks execute in standard process context and are
> permitted to sleep. However, looking at klp_shadow_free_all(), it invokes
> the destructor while holding klp_shadow_lock:

> kernel/livepatch/shadow.c:klp_shadow_free_all() {
> 	 ...
>	 spin_lock_irqsave(&klp_shadow_lock, flags);
>
>	 hash_for_each(klp_shadow_hash, i, shadow, node) {
>	 	if (klp_shadow_match(shadow, shadow->obj, id))
>	 		klp_shadow_free_struct(shadow, dtor);
>	 }
>
>	 spin_unlock_irqrestore(&klp_shadow_lock, flags);
> }

> If a user assumes this callback runs in process context like the other
> state callbacks and includes sleeping operations, couldn't this trigger
> a scheduling while atomic panic?

This problem wasn't introduced by this change; it already existed beforehand.
There shouldn't be any sleeping operations in klp_shadow_dtor_t. Maybe we
should rename it to klp_shadow_atomic_dtor_t.

[...]

> Does tying shadow variable lifetime to the global patch state cause leaks when
> dealing with loadable modules?

> If a livepatch applies to a loadable module, it may allocate shadow variables
> keyed to the module's dynamically allocated objects.

> When the target module is unloaded, klp_module_going() unpatches the module's
> objects. However, because the state is a global property of the patch, no
> state callbacks are invoked and the state is not considered obsolete. This
> leaves the shadow variables for the unloaded module orphaned in the global
> hash table.

> If the module is subsequently reloaded and allocates new objects at the same
> memory addresses, wouldn't klp_shadow_alloc() fail with a duplicate warning,
> or klp_shadow_get() incorrectly return stale data?

You're right, this is an issue. We need to ensure the associated shadow
variables are freed during klp_module_going().

[...]

> Can this permanently leak shadow variables and state during an atomic replace?

> Livepatch supports stacking, allowing multiple active patches (e.g., Patch A
> and Patch B) to define the same state. If an atomic replace patch (Patch C)
> is loaded, it replaces both A and B.

With the introduction of replace_set, it's no longer possible for different
patches to define the same state within the same set. So, I don't think this
is an issue.


^ permalink raw reply

* Re: [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Brandon Taylor @ 2026-05-17 19:19 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel
  Cc: linux-doc, linux-kselftest, bpf, live-patching,
	Greg Kroah-Hartman, Andrew Morton, Jonathan Corbet,
	Mathieu Desnoyers, Joshua Peisach, Florian Weimer, Breno Leitao,
	Anthony Iliopoulos, Michal Hocko, Jiri Olsa
In-Reply-To: <20260517134858.146569-1-sashal@kernel.org>

Have we learned NOTHING from just over 9 and a half years ago?!

I do not pretend to be a prophet of Linus, but I cannot for the life of 
me help but get flashbacks from kernel version 4.8 when Linus himself 
did not explain, but EXPLODED, in saying "there is NO F*CKING EXCUSE to 
knowingly kill the kernel."

So for me to hear about THIS from a YouTube video, the fact that we are 
still--STILL!--coming up with new ways to do something which we ought to 
KNOW to be ABSOLUTELY UNACCEPTABLE and DOWNRIGHT INTOLERABLE, BOILS MY 
BLOOD TO NO END.

You ought to consider yourself lucky that it's ME writing this and not 
Linus, because he'd be saying the exact same thing, and making it God 
knows how many times worse. He would break his foot off in somebody's 
BEHIND over this "killswitch" idiocy, and NEVER MIND that it was 
supposedly "designed" to prevent exploits like Fragnesia, Copy Fail, and 
Dirty Frag from creating havoc in Linux distributions, ESPECIALLY his 
go-to in Fedora!

Forgive me (especially you, Master Linus) for blowing my stack over 
this, but we all ought to take a lesson from the past:

Killing the Linux kernel is NOT an acceptable method to mitigate exploits.

I don't care HOW long it takes, but we HAVE TO PATCH THOSE 
VULNERABILITIES, and we HAVE to do it the RIGHT WAY, NOT just introduce 
some kernel-killing "failsafe" just because somebody doesn't know how to 
plug those holes.

I don't care--and neither will Linus--about the so-called "simplest 
mitigation," and neither should you. We should all care that we get the 
code RIGHT.

Brandon

On 5/17/2026 8:48 AM, Sasha Levin wrote:
> When a kernel (security) issue goes public, fleets stay exposed until a patched
> kernel is built, distributed, and rebooted into.
>
> For many such issues the simplest mitigation is to stop calling the buggy
> function. Killswitch provides that. An admin writes:
>
>      echo "engage af_alg_sendmsg -1" \
>          > /sys/kernel/security/killswitch/control
>
> After this, af_alg_sendmsg() returns -EPERM on every call without
> running its body. The mitigation takes effect immediately, and is dropped on
> the next reboot -- by which point a patched kernel is hopefully in place.
>
> A lot of recent kernel issues sit in code paths most installs only have enabled
> to support a relative minority of users: AF_ALG, ksmbd, nf_tables, vsock, ax25,
> and friends.
>
> For most users, the cost of "this socket family stops working for the day" is
> much smaller than the cost of running a known vulnerable kernel until the fix
> lands.
>
> Why not an existing facility:
>
> * livepatch needs a built, signed, per-kernel-version module per CVE.
>    Under Secure Boot the operator can't sign their own, so they wait
>    for the vendor, and only a minority of vendors actually ship
>    livepatches. Killswitch covers the days before that module shows
>    up.
>
> * fail_function (CONFIG_FUNCTION_ERROR_INJECTION) is disabled in
>    most production kernels. Even where enabled, it only works on
>    functions pre-annotated with ALLOW_ERROR_INJECTION() in source -
>    no help for a freshly-disclosed CVE. The debugfs UI is blocked by
>    lockdown=integrity and the override is probabilistic.
>
> * BPF override (bpf_override_return) honors the same
>    ALLOW_ERROR_INJECTION() whitelist, and BPF itself is off in many
>    production kernels. Even where on, the operator interface is
>    "load a verified BPF program," not a one-line write.
>
> * Module blacklist only helps when the bug is in a loadable module.
>
> Killswitch fills the gap: write a symbol to securityfs, function
> returns the chosen value until disengage or reboot.
>
> Assisted-by: Claude:claude-opus-4-7
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>
> Changes since v2:
> - Fix LLVM=1 build: gate __noipa__ on __has_attribute() (Breno)
> - Admin guide: do-not-engage list, pre-soak workflow, relation to
>    livepatch/fail_function/BPF (Michal, Mathieu, Joshua)
> - Add CVE-2026-43284 (esp_input) worked example + netns selftest
> - Drop unused [reason] token from Kconfig help and cmdline comment
> - Commit message: spell out why livepatch / fail_function / BPF
>    override / module-blacklist don't cover this window.
>
>   Documentation/admin-guide/index.rst           |   1 +
>   Documentation/admin-guide/killswitch.rst      | 229 +++++
>   Documentation/admin-guide/tainted-kernels.rst |   8 +
>   MAINTAINERS                                   |  11 +
>   include/linux/killswitch.h                    |  19 +
>   include/linux/panic.h                         |   3 +-
>   include/linux/security.h                      |   1 +
>   init/Kconfig                                  |   2 +
>   kernel/Kconfig.killswitch                     |  31 +
>   kernel/Makefile                               |   1 +
>   kernel/killswitch.c                           | 863 ++++++++++++++++++
>   kernel/panic.c                                |   1 +
>   lib/Kconfig.debug                             |  13 +
>   lib/Makefile                                  |   1 +
>   lib/test_killswitch.c                         |  85 ++
>   security/security.c                           |   1 +
>   tools/testing/selftests/Makefile              |   1 +
>   tools/testing/selftests/killswitch/.gitignore |   1 +
>   tools/testing/selftests/killswitch/Makefile   |   8 +
>   .../selftests/killswitch/cve_31431_test.c     | 162 ++++
>   .../selftests/killswitch/cve_43284_test.c     |  88 ++
>   .../selftests/killswitch/killswitch_test.sh   | 254 ++++++
>   22 files changed, 1783 insertions(+), 1 deletion(-)
>   create mode 100644 Documentation/admin-guide/killswitch.rst
>   create mode 100644 include/linux/killswitch.h
>   create mode 100644 kernel/Kconfig.killswitch
>   create mode 100644 kernel/killswitch.c
>   create mode 100644 lib/test_killswitch.c
>   create mode 100644 tools/testing/selftests/killswitch/.gitignore
>   create mode 100644 tools/testing/selftests/killswitch/Makefile
>   create mode 100644 tools/testing/selftests/killswitch/cve_31431_test.c
>   create mode 100644 tools/testing/selftests/killswitch/cve_43284_test.c
>   create mode 100755 tools/testing/selftests/killswitch/killswitch_test.sh
>
> diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
> index cd28dfe91b060..ca37dd70f108d 100644
> --- a/Documentation/admin-guide/index.rst
> +++ b/Documentation/admin-guide/index.rst
> @@ -70,6 +70,7 @@ problems and bugs in particular.
>      bug-hunting
>      bug-bisect
>      tainted-kernels
> +   killswitch
>      ramoops
>      dynamic-debug-howto
>      init
> diff --git a/Documentation/admin-guide/killswitch.rst b/Documentation/admin-guide/killswitch.rst
> new file mode 100644
> index 0000000000000..a524cc9ee23ca
> --- /dev/null
> +++ b/Documentation/admin-guide/killswitch.rst
> @@ -0,0 +1,229 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +..
> +.. Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> +
> +============
> +Killswitch
> +============
> +
> +Killswitch lets a privileged operator make a chosen kernel function
> +return a fixed value without executing its body, as a temporary
> +mitigation for a security bug while a real fix is being prepared.
> +
> +The function returns the operator-supplied value and nothing else
> +runs in its place. There is no allowlist, no return-type check; if
> +the kprobe layer accepts the symbol, killswitch engages it. Once
> +engaged, the change is in effect on every CPU until ``disengage`` is
> +written or the system reboots.
> +
> +Configuration
> +=============
> +
> +``CONFIG_KILLSWITCH``
> +  Enables the feature. Depends on ``SECURITYFS``, ``KPROBES`` (with
> +  ftrace support), and ``FUNCTION_ERROR_INJECTION``.
> +
> +The interface
> +=============
> +
> +::
> +
> +    /sys/kernel/security/killswitch/
> +        engaged                 RO  currently-engaged functions
> +        control                 WO  command sink
> +        taint                   RO  0 or 1
> +        fn/<name>/              per-function directory, created on engage
> +            retval              RW  return value
> +            hits                RO  per-cpu summed call count
> +
> +Three commands are accepted by ``control``::
> +
> +    engage <symbol> <retval>
> +    disengage <symbol>
> +    disengage_all
> +
> +Each engage and disengage emits a single ``KERN_WARNING`` line to
> +dmesg with the symbol, retval, hit count (on disengage), and the
> +operator's identity (uid/auid/sessionid/comm, or ``source=cmdline``).
> +
> +Engagement is rejected when:
> +
> +* the symbol is unknown, in a non-traceable section, on the kprobe
> +  blacklist, or otherwise refused by ``register_kprobe`` (the error
> +  from the kprobe layer is logged and returned to userspace);
> +* the symbol is already engaged (``-EBUSY``);
> +* the operator does not hold ``CAP_SYS_ADMIN``.
> +
> +Whatever value the operator writes is what the function returns.
> +Writing the wrong type or wrong value lands in the caller as-is.
> +
> +Boot parameter
> +==============
> +
> +``killswitch=fn1=<val>,fn2=<val>,...``
> +
> +Parsed early; engagements are applied at the end of kernel init
> +once the kprobe subsystem is up. Parse failures emit a warning and
> +skip the offending entry; they never panic.
> +
> +Useful for fleet rollout: when an issue drops, ship the mitigation
> +in the bootloader / PXE config and roll the fleet through reboots
> +while the real fix is being prepared.
> +
> +Tainting
> +========
> +
> +The first successful engagement (runtime or boot-time) sets
> +``TAINT_KILLSWITCH`` (bit 20, char ``H``). The taint persists across
> +``disengage`` until reboot, so an oops on a killswitch-modified
> +kernel is identifiable from the banner: ``Tainted: ... H`` tells a
> +maintainer to consult ``engaged`` before further triage.
> +
> +Module unload
> +=============
> +
> +If a module containing an engaged target is unloaded, killswitch
> +auto-disengages the entry and emits a ``KERN_WARNING`` so the loss
> +of mitigation is visible. Reloading the module does not silently
> +re-arm the killswitch; the operator re-engages explicitly.
> +
> +Choosing the right target
> +=========================
> +
> +A function that *looks* skippable may be relied on by callers for a
> +side effect (a lock the caller releases, a refcount the caller
> +drops, a scatterlist the caller consumes). The rule of thumb:
> +
> +  Pick the **highest-level** entry point that contains the bug.
> +
> +That gives callers no chance to dereference half-initialised state
> +from a function whose body was skipped. Two illustrative examples
> +from ``crypto/af_alg.c``:
> +
> +Anti-pattern: ``af_alg_count_tsgl``
> +-----------------------------------
> +
> +``af_alg_count_tsgl()`` returns ``unsigned int`` (the number of TX
> +SG entries). Engaging it with retval ``0`` causes the caller in
> +``algif_aead.c`` to allocate a 1-entry scatterlist (its
> +``if (!entries) entries = 1`` guard) and then walk the *real* TX
> +SGL into that undersized destination via ``af_alg_pull_tsgl``,
> +producing out-of-bounds writes. **Killswitching here introduces a
> +worse bug than the one being mitigated.**
> +
> +Anti-pattern: ``af_alg_pull_tsgl``
> +----------------------------------
> +
> +``af_alg_pull_tsgl()`` returns ``void``, so any retval is accepted.
> +But its caller depends on the per-request SGL being filled in.
> +Skipping the body leaves the per-request SGL with NULL pages; the
> +next-stage ``memcpy_sglist`` dereferences them and the kernel
> +oopses.
> +
> +Correct pattern: ``af_alg_sendmsg``
> +-----------------------------------
> +
> +``af_alg_sendmsg()`` is the highest-level entry into the AF_ALG
> +send path. Engaging it with retval ``-EPERM`` causes every send
> +attempt to return -EPERM to userspace; no caller ever sees
> +half-initialised state, and any AF_ALG-reachable bug downstream of
> +``sendmsg`` is unreachable until the killswitch is disengaged.
> +
> +The canonical pattern: pick a syscall-handler-shaped function whose
> +return value already encodes "this operation didn't happen", and
> +let userspace handle the error as it would any other failed
> +syscall.
> +
> +Correct pattern: ``esp_input`` (CVE-2026-43284)
> +-----------------------------------------------
> +
> +The IPsec ESP receive-path bug fixed by ``xfrm: esp: avoid in-place
> +decrypt on shared skb frags`` is reachable through ``esp_input()``
> +in ``net/ipv4/esp4.c`` (and ``esp6_input()`` for IPv6). Engage these
> +with retval ``-EINVAL``:
> +
> +::
> +
> +    echo "engage esp_input -22"  > /sys/kernel/security/killswitch/control
> +    echo "engage esp6_input -22" > /sys/kernel/security/killswitch/control
> +
> +Inbound ESP packets are then dropped before decapsulation, neutering
> +any bug downstream of the ESP receive path. IPsec tunnels stop
> +working; other networking is unaffected.
> +
> +Do not engage
> +=============
> +
> +Do not killswitch:
> +
> +* process or memory primitives the rest of the kernel needs to
> +  function: ``fork``, ``do_exit``, ``__alloc_pages``, ``kmalloc``,
> +  ``schedule``, anything in ``mm/`` reached by every allocation.
> +* hot paths in the scheduler, timekeeping, RCU, or interrupt entry.
> +* functions invoked from the killswitch path itself (``securityfs``,
> +  ``lockdown``, ``audit``, ``kprobe`` registration) -- the system
> +  may livelock or refuse to disengage.
> +* functions whose return value is read structurally (size, count,
> +  pointer-to-allocated-thing) rather than as success/failure.
> +  See the AF_ALG anti-patterns above for what goes wrong.
> +
> +When in doubt, measure first.
> +
> +Pre-soak before engaging
> +========================
> +
> +If the target's call rate is unknown, attach a counter for a few
> +seconds first. With perf::
> +
> +    perf probe --add 'esp_input'
> +    perf stat -a -e probe:esp_input -- sleep 5
> +
> +Or with bpftrace::
> +
> +    bpftrace -e 'kprobe:esp_input { @hits = count(); } interval:s:5 { exit(); }'
> +
> +A target with ten thousand hits per second is not a candidate -- the
> +kernel will not survive five seconds with that path returning a
> +fixed error.
> +
> +Relation to other facilities
> +============================
> +
> +* ``CONFIG_FUNCTION_ERROR_INJECTION`` provides the same architecture
> +  trampoline (``override_function_with_return``), which killswitch
> +  reuses. fail_function is debug-oriented: targets must be
> +  pre-annotated with ``ALLOW_ERROR_INJECTION()`` in source, the
> +  override is probabilistic, and the interface is on debugfs (blocked
> +  under ``lockdown=integrity``). Killswitch is the production cousin:
> +  no whitelist, deterministic, securityfs-visible under integrity
> +  lockdown, with audit and taint.
> +* livepatch can do everything killswitch can and more, at the cost
> +  of building, signing, and shipping a kernel module per mitigation.
> +  Killswitch is for the window before that module exists.
> +* BPF override (``bpf_override_return``) needs a BPF program and
> +  ``CONFIG_BPF_KPROBE_OVERRIDE``; appropriate when the policy is
> +  conditional, overkill for "always return -EPERM".
> +
> +Safety notes
> +============
> +
> +* In-flight calls during ``write()`` to ``control`` may run either
> +  the original body or the override. The override is ``return X``,
> +  which has no preconditions to violate.
> +* SMP visibility comes from ``text_poke_bp()``. ``write()`` to
> +  ``control`` returns only after every CPU sees the new path.
> +* The ftrace ops unregister waits for in-flight pre-handlers, so
> +  freeing the engagement attribute on disengage is safe.
> +* Inline functions, freed ``__init`` symbols, and anything compiled
> +  away cannot be killswitched. ``register_kprobe`` rejects them
> +  with whatever error the kprobe layer chooses.
> +
> +Diagnostics
> +===========
> +
> +Per-call hits are aggregated in a per-cpu counter readable at
> +``/sys/kernel/security/killswitch/fn/<name>/hits``. Per-hit logging
> +is not provided to avoid log storms on hot paths.
> +
> +A ``KILLSWITCH`` entry appears in the kernel taint vector once any
> +engagement succeeds (also visible as ``H`` in the oops banner).
> diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst
> index 9ead927a37c0f..71a6e3364eddc 100644
> --- a/Documentation/admin-guide/tainted-kernels.rst
> +++ b/Documentation/admin-guide/tainted-kernels.rst
> @@ -102,6 +102,7 @@ Bit  Log  Number  Reason that got the kernel tainted
>    17  _/T  131072  kernel was built with the struct randomization plugin
>    18  _/N  262144  an in-kernel test has been run
>    19  _/J  524288  userspace used a mutating debug operation in fwctl
> + 20  _/H 1048576  killswitch override engaged (function short-circuited)
>   ===  ===  ======  ========================================================
>   
>   Note: The character ``_`` is representing a blank in this table to make reading
> @@ -189,3 +190,10 @@ More detailed explanation for tainting
>    19) ``J`` if userspace opened /dev/fwctl/* and performed a FWTCL_RPC_DEBUG_WRITE
>        to use the devices debugging features. Device debugging features could
>        cause the device to malfunction in undefined ways.
> +
> + 20) ``H`` if the killswitch primitive (see
> +     Documentation/admin-guide/killswitch.rst) has been engaged on at least
> +     one function. The kernel is no longer running its source: at least one
> +     function has been short-circuited to return a fixed value. The taint
> +     persists across ``disengage`` until the next reboot — once the running
> +     image has been modified, oops triage must reflect that.
> diff --git a/MAINTAINERS b/MAINTAINERS
> index b2040011a3865..b4005b61d444f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -14350,6 +14350,17 @@ F:	lib/Kconfig.kmsan
>   F:	mm/kmsan/
>   F:	scripts/Makefile.kmsan
>   
> +KILLSWITCH (function short-circuit mitigation)
> +M:	Sasha Levin <sashal@kernel.org>
> +L:	linux-kernel@vger.kernel.org
> +S:	Maintained
> +F:	Documentation/admin-guide/killswitch.rst
> +F:	include/linux/killswitch.h
> +F:	kernel/Kconfig.killswitch
> +F:	kernel/killswitch.c
> +F:	lib/test_killswitch.c
> +F:	tools/testing/selftests/killswitch/
> +
>   KPROBES
>   M:	Naveen N Rao <naveen@kernel.org>
>   M:	"David S. Miller" <davem@davemloft.net>
> diff --git a/include/linux/killswitch.h b/include/linux/killswitch.h
> new file mode 100644
> index 0000000000000..3fad49e180ddf
> --- /dev/null
> +++ b/include/linux/killswitch.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> + */
> +#ifndef _LINUX_KILLSWITCH_H
> +#define _LINUX_KILLSWITCH_H
> +
> +#ifdef CONFIG_KILLSWITCH
> +int killswitch_engage(const char *symbol, long retval);
> +int killswitch_disengage(const char *symbol);
> +bool killswitch_is_engaged(const char *symbol);
> +#else
> +static inline int killswitch_engage(const char *symbol, long retval)
> +{ return -EOPNOTSUPP; }
> +static inline int killswitch_disengage(const char *symbol) { return -EOPNOTSUPP; }
> +static inline bool killswitch_is_engaged(const char *symbol) { return false; }
> +#endif
> +
> +#endif /* _LINUX_KILLSWITCH_H */
> diff --git a/include/linux/panic.h b/include/linux/panic.h
> index f1dd417e54b29..6699261a61f13 100644
> --- a/include/linux/panic.h
> +++ b/include/linux/panic.h
> @@ -88,7 +88,8 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
>   #define TAINT_RANDSTRUCT		17
>   #define TAINT_TEST			18
>   #define TAINT_FWCTL			19
> -#define TAINT_FLAGS_COUNT		20
> +#define TAINT_KILLSWITCH		20
> +#define TAINT_FLAGS_COUNT		21
>   #define TAINT_FLAGS_MAX			((1UL << TAINT_FLAGS_COUNT) - 1)
>   
>   struct taint_flag {
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 41d7367cf4036..038027c33ba1a 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -146,6 +146,7 @@ enum lockdown_reason {
>   	LOCKDOWN_DBG_WRITE_KERNEL,
>   	LOCKDOWN_RTAS_ERROR_INJECTION,
>   	LOCKDOWN_XEN_USER_ACTIONS,
> +	LOCKDOWN_KILLSWITCH,
>   	LOCKDOWN_INTEGRITY_MAX,
>   	LOCKDOWN_KCORE,
>   	LOCKDOWN_KPROBES,
> diff --git a/init/Kconfig b/init/Kconfig
> index 2937c4d308aec..5368dd4b5c65b 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -2278,6 +2278,8 @@ config ASN1
>   
>   source "kernel/Kconfig.locks"
>   
> +source "kernel/Kconfig.killswitch"
> +
>   config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
>   	bool
>   
> diff --git a/kernel/Kconfig.killswitch b/kernel/Kconfig.killswitch
> new file mode 100644
> index 0000000000000..a33f7ecb2861e
> --- /dev/null
> +++ b/kernel/Kconfig.killswitch
> @@ -0,0 +1,31 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Killswitch: per-function short-circuit mitigation primitive.
> +#
> +# Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> +#
> +
> +config KILLSWITCH
> +	bool "Killswitch: short-circuit a kernel function as a CVE mitigation"
> +	depends on SECURITYFS
> +	depends on KPROBES && HAVE_KPROBES_ON_FTRACE
> +	depends on HAVE_FUNCTION_ERROR_INJECTION
> +	select FUNCTION_ERROR_INJECTION
> +	help
> +	  Provide an admin-facing mechanism to make a chosen kernel function
> +	  return a fixed value without executing its body, as a temporary
> +	  mitigation for a security bug before a real fix is available.
> +
> +	  Operators write "engage <symbol> <retval>" to
> +	  /sys/kernel/security/killswitch/control. The function entry is
> +	  redirected via a kprobe whose pre-handler sets the chosen return
> +	  value and short-circuits the call. There is no allowlist,
> +	  denylist, or return-type validation: if the kprobe layer accepts
> +	  the symbol the engagement proceeds, otherwise its error is
> +	  returned to userspace.
> +
> +	  This is *not* livepatch: there is no replacement implementation,
> +	  the function simply returns the chosen value. Engaging a killswitch
> +	  taints the kernel (TAINT_KILLSWITCH, 'H'). Requires CAP_SYS_ADMIN.
> +
> +	  If unsure, say N.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 6785982013dce..b3e408d9f275e 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -100,6 +100,7 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/
>   obj-$(CONFIG_KCOV) += kcov.o
>   obj-$(CONFIG_KPROBES) += kprobes.o
>   obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
> +obj-$(CONFIG_KILLSWITCH) += killswitch.o
>   obj-$(CONFIG_KGDB) += debug/
>   obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
>   obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
> diff --git a/kernel/killswitch.c b/kernel/killswitch.c
> new file mode 100644
> index 0000000000000..7f509c62ea748
> --- /dev/null
> +++ b/kernel/killswitch.c
> @@ -0,0 +1,863 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Per-function short-circuit mitigation.
> + *
> + * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> + *
> + * Engaging a killswitch installs a kprobe at the function's entry
> + * whose pre-handler sets the return register and skips the body via
> + * override_function_with_return().  Operator interface lives at
> + * /sys/kernel/security/killswitch/.
> + */
> +
> +#include <linux/audit.h>
> +#include <linux/capability.h>
> +#include <linux/cred.h>
> +#include <linux/ctype.h>
> +#include <linux/error-injection.h>
> +#include <linux/init.h>
> +#include <linux/killswitch.h>
> +#include <linux/kprobes.h>
> +#include <linux/kref.h>
> +#include <linux/list.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <linux/panic.h>
> +#include <linux/percpu.h>
> +#include <linux/printk.h>
> +#include <linux/sched.h>
> +#include <linux/security.h>
> +#include <linux/seq_file.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +#include <linux/uaccess.h>
> +#include <linux/uidgid.h>
> +
> +struct ks_attr {
> +	struct list_head	list;
> +	struct kprobe		kp;
> +	/* atomic so a writer racing an in-flight call can't tear the long. */
> +	atomic_long_t		retval;
> +	/* false once disengaged; per-fn file ops then return -EIDRM. */
> +	bool			engaged;
> +	unsigned long __percpu	*hits;
> +	struct dentry		*dir;
> +	/* engaged_list holds one ref; each open per-fn fd holds one. */
> +	struct kref		refcnt;
> +};
> +
> +static DEFINE_MUTEX(ks_lock);
> +static LIST_HEAD(ks_engaged_list);
> +static struct dentry *ks_root_dir;
> +static struct dentry *ks_fn_dir;	/* parent for per-fn directories */
> +
> +/* ------------------------------------------------------------------ *
> + * Pre-handler: the actual override                                   *
> + * ------------------------------------------------------------------ */
> +
> +static int ks_kprobe_pre_handler(struct kprobe *kp, struct pt_regs *regs)
> +{
> +	struct ks_attr *attr = container_of(kp, struct ks_attr, kp);
> +
> +	this_cpu_inc(*attr->hits);
> +	regs_set_return_value(regs, (unsigned long)atomic_long_read(&attr->retval));
> +	override_function_with_return(regs);
> +	return 1;
> +}
> +NOKPROBE_SYMBOL(ks_kprobe_pre_handler);
> +
> +/* Defined non-NULL so the kprobe layer keeps the IPMODIFY ops. */
> +static void ks_kprobe_post_handler(struct kprobe *kp, struct pt_regs *regs,
> +				   unsigned long flags)
> +{
> +}
> +
> +/* ------------------------------------------------------------------ *
> + * Attribute lifecycle                                                *
> + * ------------------------------------------------------------------ */
> +
> +static struct ks_attr *ks_attr_lookup(const char *symbol)
> +{
> +	struct ks_attr *attr;
> +
> +	list_for_each_entry(attr, &ks_engaged_list, list)
> +		if (!strcmp(attr->kp.symbol_name, symbol))
> +			return attr;
> +	return NULL;
> +}
> +
> +static unsigned long ks_attr_hits(const struct ks_attr *attr)
> +{
> +	unsigned long total = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		total += *per_cpu_ptr(attr->hits, cpu);
> +	return total;
> +}
> +
> +static void ks_attr_destroy(struct ks_attr *attr)
> +{
> +	if (!attr)
> +		return;
> +	free_percpu(attr->hits);
> +	kfree(attr->kp.symbol_name);
> +	kfree(attr);
> +}
> +
> +static void ks_attr_kref_release(struct kref *kref)
> +{
> +	ks_attr_destroy(container_of(kref, struct ks_attr, refcnt));
> +}
> +
> +static void ks_attr_get(struct ks_attr *attr)
> +{
> +	kref_get(&attr->refcnt);
> +}
> +
> +static void ks_attr_put(struct ks_attr *attr)
> +{
> +	kref_put(&attr->refcnt, ks_attr_kref_release);
> +}
> +
> +static struct ks_attr *ks_attr_alloc(const char *symbol)
> +{
> +	struct ks_attr *attr;
> +
> +	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
> +	if (!attr)
> +		return NULL;
> +
> +	attr->kp.symbol_name = kstrdup(symbol, GFP_KERNEL);
> +	if (!attr->kp.symbol_name)
> +		goto err;
> +
> +	attr->hits = alloc_percpu(unsigned long);
> +	if (!attr->hits)
> +		goto err;
> +
> +	attr->kp.pre_handler = ks_kprobe_pre_handler;
> +	attr->kp.post_handler = ks_kprobe_post_handler;
> +	INIT_LIST_HEAD(&attr->list);
> +	kref_init(&attr->refcnt);
> +	return attr;
> +
> +err:
> +	ks_attr_destroy(attr);
> +	return NULL;
> +}
> +
> +/* ------------------------------------------------------------------ *
> + * Securityfs: per-fn attribute files                                 *
> + * ------------------------------------------------------------------ */
> +
> +/*
> + * Look up by symbol name (the parent dentry's basename) under
> + * ks_lock and confirm attr->dir is the file's parent dentry.  This
> + * binds the fd to the engagement it was opened against and avoids
> + * dereferencing inode->i_private, which a racing disengage may have
> + * freed.  d_parent is stable for the open's lifetime via the file's
> + * dentry reference.
> + */
> +static int ks_attr_open(struct inode *inode, struct file *file)
> +{
> +	struct dentry *parent = file->f_path.dentry->d_parent;
> +	const char *name = parent->d_name.name;
> +	struct ks_attr *attr;
> +
> +	mutex_lock(&ks_lock);
> +	attr = ks_attr_lookup(name);
> +	if (attr && attr->dir == parent)
> +		ks_attr_get(attr);
> +	else
> +		attr = NULL;
> +	mutex_unlock(&ks_lock);
> +	if (!attr)
> +		return -ENOENT;
> +	file->private_data = attr;
> +	return 0;
> +}
> +
> +static int ks_attr_release(struct inode *inode, struct file *file)
> +{
> +	ks_attr_put(file->private_data);
> +	file->private_data = NULL;
> +	return 0;
> +}
> +
> +/* Caller must hold ks_lock. */
> +static int ks_attr_check_live(const struct ks_attr *attr)
> +{
> +	return attr->engaged ? 0 : -EIDRM;
> +}
> +
> +static ssize_t ks_retval_read(struct file *file, char __user *ubuf,
> +			      size_t count, loff_t *ppos)
> +{
> +	struct ks_attr *attr = file->private_data;
> +	char buf[32];
> +	long val;
> +	int ret, len;
> +
> +	mutex_lock(&ks_lock);
> +	ret = ks_attr_check_live(attr);
> +	val = atomic_long_read(&attr->retval);
> +	mutex_unlock(&ks_lock);
> +	if (ret)
> +		return ret;
> +	len = scnprintf(buf, sizeof(buf), "%ld\n", val);
> +	return simple_read_from_buffer(ubuf, count, ppos, buf, len);
> +}
> +
> +static ssize_t ks_retval_write(struct file *file, const char __user *ubuf,
> +			       size_t count, loff_t *ppos)
> +{
> +	struct ks_attr *attr = file->private_data;
> +	char buf[32];
> +	long val;
> +	int ret;
> +
> +	if (count >= sizeof(buf))
> +		return -EINVAL;
> +	if (copy_from_user(buf, ubuf, count))
> +		return -EFAULT;
> +	buf[count] = '\0';
> +	strim(buf);
> +
> +	ret = kstrtol(buf, 0, &val);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&ks_lock);
> +	ret = ks_attr_check_live(attr);
> +	if (!ret)
> +		atomic_long_set(&attr->retval, val);
> +	mutex_unlock(&ks_lock);
> +
> +	return ret ? ret : count;
> +}
> +
> +static const struct file_operations ks_retval_fops = {
> +	.open		= ks_attr_open,
> +	.release	= ks_attr_release,
> +	.read		= ks_retval_read,
> +	.write	= ks_retval_write,
> +	.llseek	= default_llseek,
> +};
> +
> +static ssize_t ks_hits_read(struct file *file, char __user *ubuf,
> +			    size_t count, loff_t *ppos)
> +{
> +	struct ks_attr *attr = file->private_data;
> +	char buf[32];
> +	unsigned long hits;
> +	int ret, len;
> +
> +	mutex_lock(&ks_lock);
> +	ret = ks_attr_check_live(attr);
> +	hits = ks_attr_hits(attr);
> +	mutex_unlock(&ks_lock);
> +	if (ret)
> +		return ret;
> +	len = scnprintf(buf, sizeof(buf), "%lu\n", hits);
> +	return simple_read_from_buffer(ubuf, count, ppos, buf, len);
> +}
> +
> +static const struct file_operations ks_hits_fops = {
> +	.open		= ks_attr_open,
> +	.release	= ks_attr_release,
> +	.read		= ks_hits_read,
> +	.llseek		= default_llseek,
> +};
> +
> +static int ks_create_attr_dir(struct ks_attr *attr)
> +{
> +	struct dentry *d;
> +
> +	attr->dir = securityfs_create_dir(attr->kp.symbol_name, ks_fn_dir);
> +	if (IS_ERR(attr->dir))
> +		return PTR_ERR(attr->dir);
> +
> +	/* ks_attr_open looks the attr up by name; i_private is unused. */
> +	d = securityfs_create_file("retval", 0600, attr->dir,
> +				   NULL, &ks_retval_fops);
> +	if (IS_ERR(d))
> +		goto err;
> +	d = securityfs_create_file("hits", 0400, attr->dir,
> +				   NULL, &ks_hits_fops);
> +	if (IS_ERR(d))
> +		goto err;
> +	return 0;
> +err:
> +	securityfs_remove(attr->dir);
> +	attr->dir = NULL;
> +	return PTR_ERR(d);
> +}
> +
> +/* ------------------------------------------------------------------ *
> + * Engage / disengage                                                 *
> + * ------------------------------------------------------------------ */
> +
> +static int __ks_engage(const char *symbol, long retval, bool from_cmdline)
> +{
> +	struct ks_attr *attr;
> +	int ret;
> +
> +	if (!symbol || !*symbol)
> +		return -EINVAL;
> +
> +	if (!from_cmdline) {
> +		ret = security_locked_down(LOCKDOWN_KILLSWITCH);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	mutex_lock(&ks_lock);
> +
> +	if (ks_attr_lookup(symbol)) {
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +
> +	attr = ks_attr_alloc(symbol);
> +	if (!attr) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	atomic_long_set(&attr->retval, retval);
> +
> +	ret = register_kprobe(&attr->kp);
> +	if (ret) {
> +		pr_warn("killswitch: register_kprobe(%s) failed: %d\n",
> +			symbol, ret);
> +		ks_attr_put(attr);
> +		goto out_unlock;
> +	}
> +
> +	ret = ks_create_attr_dir(attr);
> +	if (ret) {
> +		unregister_kprobe(&attr->kp);
> +		ks_attr_put(attr);
> +		goto out_unlock;
> +	}
> +
> +	list_add_tail(&attr->list, &ks_engaged_list);
> +	attr->engaged = true;
> +	add_taint(TAINT_KILLSWITCH, LOCKDEP_STILL_OK);
> +
> +	if (from_cmdline) {
> +		pr_warn("killswitch: engage %s=%ld source=cmdline\n",
> +			symbol, retval);
> +	} else {
> +		pr_warn("killswitch: engage %s=%ld uid=%u auid=%u ses=%u comm=%s\n",
> +			symbol, retval,
> +			from_kuid(&init_user_ns, current_uid()),
> +			from_kuid(&init_user_ns, audit_get_loginuid(current)),
> +			audit_get_sessionid(current),
> +			current->comm);
> +	}
> +	ret = 0;
> +
> +out_unlock:
> +	mutex_unlock(&ks_lock);
> +	return ret;
> +}
> +
> +int killswitch_engage(const char *symbol, long retval)
> +{
> +	return __ks_engage(symbol, retval, false);
> +}
> +
> +static int __ks_disengage(const char *symbol)
> +{
> +	struct ks_attr *attr;
> +	unsigned long hits;
> +	int ret = 0;
> +
> +	mutex_lock(&ks_lock);
> +	attr = ks_attr_lookup(symbol);
> +	if (!attr) {
> +		ret = -ENOENT;
> +		goto out_unlock;
> +	}
> +
> +	unregister_kprobe(&attr->kp);
> +	attr->engaged = false;
> +	list_del(&attr->list);
> +	hits = ks_attr_hits(attr);
> +	securityfs_remove(attr->dir);
> +
> +	pr_warn("killswitch: disengage %s hits=%lu uid=%u auid=%u ses=%u comm=%s\n",
> +		symbol, hits,
> +		from_kuid(&init_user_ns, current_uid()),
> +		from_kuid(&init_user_ns, audit_get_loginuid(current)),
> +		audit_get_sessionid(current),
> +		current->comm);
> +
> +	/* unregister_kprobe() already waited out in-flight pre-handlers. */
> +	ks_attr_put(attr);
> +
> +out_unlock:
> +	mutex_unlock(&ks_lock);
> +	return ret;
> +}
> +
> +int killswitch_disengage(const char *symbol)
> +{
> +	return __ks_disengage(symbol);
> +}
> +
> +bool killswitch_is_engaged(const char *symbol)
> +{
> +	bool engaged;
> +
> +	mutex_lock(&ks_lock);
> +	engaged = ks_attr_lookup(symbol) != NULL;
> +	mutex_unlock(&ks_lock);
> +	return engaged;
> +}
> +
> +static void ks_disengage_all_locked(void)
> +{
> +	struct ks_attr *attr, *n;
> +
> +	list_for_each_entry_safe(attr, n, &ks_engaged_list, list) {
> +		unregister_kprobe(&attr->kp);
> +		attr->engaged = false;
> +		list_del(&attr->list);
> +		securityfs_remove(attr->dir);
> +		pr_warn("killswitch: disengage %s hits=%lu (disengage_all)\n",
> +			attr->kp.symbol_name, ks_attr_hits(attr));
> +		ks_attr_put(attr);
> +	}
> +}
> +
> +/* ------------------------------------------------------------------ *
> + * Module unload: drop engagements on functions in the going module   *
> + * ------------------------------------------------------------------ */
> +
> +static int ks_module_notify(struct notifier_block *nb, unsigned long action,
> +			    void *data)
> +{
> +	struct module *mod = data;
> +	struct ks_attr *attr, *n;
> +
> +	if (action != MODULE_STATE_GOING)
> +		return NOTIFY_DONE;
> +
> +	mutex_lock(&ks_lock);
> +	list_for_each_entry_safe(attr, n, &ks_engaged_list, list) {
> +		if (!attr->kp.addr ||
> +		    __module_address((unsigned long)attr->kp.addr) != mod)
> +			continue;
> +
> +		pr_warn("killswitch: %s mitigation lost: module %s unloading; re-engage after reload if still needed\n",
> +			attr->kp.symbol_name, mod->name);
> +		unregister_kprobe(&attr->kp);
> +		attr->engaged = false;
> +		list_del(&attr->list);
> +		securityfs_remove(attr->dir);
> +		ks_attr_put(attr);
> +	}
> +	mutex_unlock(&ks_lock);
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block ks_module_nb = {
> +	.notifier_call = ks_module_notify,
> +};
> +
> +/* ------------------------------------------------------------------ *
> + * Top-level securityfs files: control / engaged / taint              *
> + * ------------------------------------------------------------------ */
> +
> +static int ks_engaged_show(struct seq_file *m, void *v)
> +{
> +	struct ks_attr *attr;
> +
> +	mutex_lock(&ks_lock);
> +	list_for_each_entry(attr, &ks_engaged_list, list) {
> +		seq_printf(m, "%s retval=%ld hits=%lu\n",
> +			   attr->kp.symbol_name,
> +			   atomic_long_read(&attr->retval),
> +			   ks_attr_hits(attr));
> +	}
> +	mutex_unlock(&ks_lock);
> +	return 0;
> +}
> +
> +static int ks_engaged_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, ks_engaged_show, NULL);
> +}
> +
> +static const struct file_operations ks_engaged_fops = {
> +	.open		= ks_engaged_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= single_release,
> +};
> +
> +static ssize_t ks_taint_read(struct file *file, char __user *ubuf,
> +			     size_t count, loff_t *ppos)
> +{
> +	char buf[4];
> +	int len;
> +
> +	len = scnprintf(buf, sizeof(buf), "%d\n",
> +			test_taint(TAINT_KILLSWITCH) ? 1 : 0);
> +	return simple_read_from_buffer(ubuf, count, ppos, buf, len);
> +}
> +
> +static const struct file_operations ks_taint_fops = {
> +	.open	= simple_open,
> +	.read	= ks_taint_read,
> +	.llseek	= default_llseek,
> +};
> +
> +/*
> + * control: parse one of:
> + *   engage <symbol> <retval>
> + *   disengage <symbol>
> + *   disengage_all
> + */
> +static ssize_t ks_control_write(struct file *file, const char __user *ubuf,
> +				size_t count, loff_t *ppos)
> +{
> +	char *buf, *cur, *verb, *sym, *retstr;
> +	long retval = 0;
> +	int ret;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	if (count == 0 || count > 4096)
> +		return -EINVAL;
> +
> +	buf = memdup_user_nul(ubuf, count);
> +	if (IS_ERR(buf))
> +		return PTR_ERR(buf);
> +
> +	cur = strim(buf);
> +	verb = strsep(&cur, " \t\n");
> +	if (!verb || !*verb) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!strcmp(verb, "disengage_all")) {
> +		mutex_lock(&ks_lock);
> +		ks_disengage_all_locked();
> +		mutex_unlock(&ks_lock);
> +		ret = count;
> +		goto out;
> +	}
> +
> +	sym = strsep(&cur, " \t\n");
> +	if (!sym || !*sym) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!strcmp(verb, "disengage")) {
> +		ret = __ks_disengage(sym);
> +		ret = ret ? ret : count;
> +		goto out;
> +	}
> +
> +	if (strcmp(verb, "engage")) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	retstr = strsep(&cur, " \t\n");
> +	if (!retstr || !*retstr) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	if (kstrtol(retstr, 0, &retval)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	ret = killswitch_engage(sym, retval);
> +	if (!ret)
> +		ret = count;
> +
> +out:
> +	kfree(buf);
> +	return ret;
> +}
> +
> +static const struct file_operations ks_control_fops = {
> +	.open	= simple_open,
> +	.write	= ks_control_write,
> +	.llseek	= noop_llseek,
> +};
> +
> +/* ------------------------------------------------------------------ *
> + * Boot parameter:                                                    *
> + *   killswitch=fn1=-1,fn2=0,fn3=-22                                  *
> + * ------------------------------------------------------------------ */
> +
> +#define KS_BOOT_BUF 1024
> +static char ks_boot_buf[KS_BOOT_BUF] __initdata;
> +static bool ks_boot_present __initdata;
> +
> +static int __init ks_boot_setup(char *str)
> +{
> +	if (!str)
> +		return 0;
> +	strscpy(ks_boot_buf, str, sizeof(ks_boot_buf));
> +	ks_boot_present = true;
> +	return 1;
> +}
> +__setup("killswitch=", ks_boot_setup);
> +
> +static void __init ks_apply_boot_params(void)
> +{
> +	char *cur, *tok;
> +	long retval;
> +
> +	if (!ks_boot_present)
> +		return;
> +
> +	cur = ks_boot_buf;
> +	while ((tok = strsep(&cur, ",")) != NULL) {
> +		char *eq, *sym, *retstr;
> +
> +		if (!*tok)
> +			continue;
> +		eq = strchr(tok, '=');
> +		if (!eq) {
> +			pr_warn("killswitch: cmdline missing '=': %s\n", tok);
> +			continue;
> +		}
> +		*eq++ = '\0';
> +		sym = tok;
> +		retstr = eq;
> +
> +		if (kstrtol(retstr, 0, &retval)) {
> +			pr_warn("killswitch: cmdline bad retval %s=%s\n",
> +				sym, retstr);
> +			continue;
> +		}
> +
> +		if (__ks_engage(sym, retval, true))
> +			pr_warn("killswitch: cmdline engage %s failed\n", sym);
> +	}
> +}
> +
> +/* ------------------------------------------------------------------ *
> + * Init                                                               *
> + * ------------------------------------------------------------------ */
> +
> +static int __init killswitch_init(void)
> +{
> +	struct dentry *d;
> +
> +	ks_root_dir = securityfs_create_dir("killswitch", NULL);
> +	if (IS_ERR(ks_root_dir))
> +		return PTR_ERR(ks_root_dir);
> +
> +	d = securityfs_create_file("control", 0200, ks_root_dir,
> +				   NULL, &ks_control_fops);
> +	if (IS_ERR(d))
> +		goto err;
> +	d = securityfs_create_file("engaged", 0444, ks_root_dir,
> +				   NULL, &ks_engaged_fops);
> +	if (IS_ERR(d))
> +		goto err;
> +	d = securityfs_create_file("taint", 0444, ks_root_dir,
> +				   NULL, &ks_taint_fops);
> +	if (IS_ERR(d))
> +		goto err;
> +
> +	ks_fn_dir = securityfs_create_dir("fn", ks_root_dir);
> +	if (IS_ERR(ks_fn_dir)) {
> +		d = ks_fn_dir;
> +		goto err;
> +	}
> +
> +	register_module_notifier(&ks_module_nb);
> +	ks_apply_boot_params();
> +
> +	pr_info("killswitch: ready (sysfs at /sys/kernel/security/killswitch/)\n");
> +	return 0;
> +
> +err:
> +	securityfs_remove(ks_root_dir);
> +	return PTR_ERR(d);
> +}
> +late_initcall(killswitch_init);
> +
> +/* ------------------------------------------------------------------ *
> + * KUnit tests                                                        *
> + * ------------------------------------------------------------------ */
> +
> +#if IS_ENABLED(CONFIG_KUNIT)
> +#include <kunit/test.h>
> +
> +/* Non-static so kallsyms resolves them without CONFIG_KALLSYMS_ALL. */
> +int ks_kunit_target_int(int x);
> +void *ks_kunit_target_ptr(int x);
> +
> +#if __has_attribute(__noipa__)
> +# define KS_KUNIT_NOIPA __attribute__((__noipa__))
> +#else
> +# define KS_KUNIT_NOIPA noinline __noclone
> +#endif
> +
> +KS_KUNIT_NOIPA int ks_kunit_target_int(int x)
> +{
> +	return x + 1;
> +}
> +
> +KS_KUNIT_NOIPA void *ks_kunit_target_ptr(int x)
> +{
> +	return ERR_PTR(-EIO);
> +}
> +
> +static int ks_kunit_init(struct kunit *test)
> +{
> +	if (security_locked_down(LOCKDOWN_KILLSWITCH))
> +		kunit_skip(test, "integrity lockdown blocks killswitch_engage()");
> +	return 0;
> +}
> +
> +static int ks_kunit_init_lockdown(struct kunit *test)
> +{
> +	if (!security_locked_down(LOCKDOWN_KILLSWITCH))
> +		kunit_skip(test, "requires lockdown=integrity");
> +	return 0;
> +}
> +
> +static void ks_disengage_quiet(const char *sym)
> +{
> +	if (killswitch_is_engaged(sym))
> +		killswitch_disengage(sym);
> +}
> +
> +static void ks_test_engage_int(struct kunit *test)
> +{
> +	int ret;
> +
> +	ret = killswitch_engage("ks_kunit_target_int", -EPERM);
> +	KUNIT_EXPECT_EQ(test, ret, 0);
> +	KUNIT_EXPECT_EQ(test, ks_kunit_target_int(7), -EPERM);
> +	KUNIT_EXPECT_EQ(test, killswitch_disengage("ks_kunit_target_int"), 0);
> +	KUNIT_EXPECT_EQ(test, ks_kunit_target_int(7), 8);
> +}
> +
> +static void ks_test_double_engage(struct kunit *test)
> +{
> +	KUNIT_ASSERT_EQ(test,
> +		killswitch_engage("ks_kunit_target_int", 0), 0);
> +	KUNIT_EXPECT_EQ(test,
> +		killswitch_engage("ks_kunit_target_int", 0), -EBUSY);
> +	ks_disengage_quiet("ks_kunit_target_int");
> +}
> +
> +static void ks_test_disengage_unknown(struct kunit *test)
> +{
> +	KUNIT_EXPECT_EQ(test,
> +		killswitch_disengage("ks_kunit_target_int"), -ENOENT);
> +}
> +
> +static void ks_test_pointer_target(struct kunit *test)
> +{
> +	long retval = (long)(unsigned long)ERR_PTR(-EACCES);
> +
> +	KUNIT_ASSERT_EQ(test,
> +		killswitch_engage("ks_kunit_target_ptr", retval), 0);
> +	KUNIT_EXPECT_TRUE(test, IS_ERR(ks_kunit_target_ptr(0)));
> +	KUNIT_EXPECT_EQ(test, PTR_ERR(ks_kunit_target_ptr(0)), -EACCES);
> +	ks_disengage_quiet("ks_kunit_target_ptr");
> +}
> +
> +static void ks_test_taint_set(struct kunit *test)
> +{
> +	KUNIT_ASSERT_EQ(test,
> +		killswitch_engage("ks_kunit_target_int", 0), 0);
> +	KUNIT_EXPECT_TRUE(test, test_taint(TAINT_KILLSWITCH));
> +	ks_disengage_quiet("ks_kunit_target_int");
> +	/* taint must persist even after disengage */
> +	KUNIT_EXPECT_TRUE(test, test_taint(TAINT_KILLSWITCH));
> +}
> +
> +static void ks_test_hits_counter(struct kunit *test)
> +{
> +	struct ks_attr *attr;
> +	int i;
> +
> +	KUNIT_ASSERT_EQ(test,
> +		killswitch_engage("ks_kunit_target_int", 0), 0);
> +
> +	for (i = 0; i < 17; i++)
> +		(void)ks_kunit_target_int(i);
> +
> +	mutex_lock(&ks_lock);
> +	attr = ks_attr_lookup("ks_kunit_target_int");
> +	KUNIT_EXPECT_NOT_NULL(test, attr);
> +	if (attr)
> +		KUNIT_EXPECT_EQ(test, ks_attr_hits(attr), 17UL);
> +	mutex_unlock(&ks_lock);
> +
> +	ks_disengage_quiet("ks_kunit_target_int");
> +}
> +
> +static struct kunit_case ks_kunit_cases[] = {
> +	KUNIT_CASE(ks_test_engage_int),
> +	KUNIT_CASE(ks_test_double_engage),
> +	KUNIT_CASE(ks_test_disengage_unknown),
> +	KUNIT_CASE(ks_test_pointer_target),
> +	KUNIT_CASE(ks_test_taint_set),
> +	KUNIT_CASE(ks_test_hits_counter),
> +	{}
> +};
> +
> +static struct kunit_suite ks_kunit_suite = {
> +	.name = "killswitch",
> +	.init = ks_kunit_init,
> +	.test_cases = ks_kunit_cases,
> +};
> +
> +/*
> + * Lockdown suite. Skipped unless the kernel was booted with
> + * lockdown=integrity (or higher). Run together with
> + * killswitch=ks_kunit_target_int=... on the same cmdline to also
> + * exercise the cmdline-bypass and disengage-under-lockdown paths.
> + */
> +static void ks_test_lockdown_runtime_engage(struct kunit *test)
> +{
> +	KUNIT_EXPECT_EQ(test,
> +		killswitch_engage("ks_kunit_target_int", 0), -EPERM);
> +}
> +
> +static void ks_test_lockdown_cmdline_disengage(struct kunit *test)
> +{
> +	if (!killswitch_is_engaged("ks_kunit_target_int"))
> +		kunit_skip(test,
> +			   "requires killswitch=ks_kunit_target_int=... on cmdline");
> +	KUNIT_EXPECT_EQ(test,
> +		killswitch_disengage("ks_kunit_target_int"), 0);
> +}
> +
> +static struct kunit_case ks_kunit_lockdown_cases[] = {
> +	KUNIT_CASE(ks_test_lockdown_runtime_engage),
> +	KUNIT_CASE(ks_test_lockdown_cmdline_disengage),
> +	{}
> +};
> +
> +static struct kunit_suite ks_kunit_lockdown_suite = {
> +	.name = "killswitch_lockdown",
> +	.init = ks_kunit_init_lockdown,
> +	.test_cases = ks_kunit_lockdown_cases,
> +};
> +
> +kunit_test_suites(&ks_kunit_suite, &ks_kunit_lockdown_suite);
> +
> +#endif /* CONFIG_KUNIT */
> +
> diff --git a/kernel/panic.c b/kernel/panic.c
> index 20feada5319d4..8ee174c7b7dd0 100644
> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -825,6 +825,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
>   	TAINT_FLAG(RANDSTRUCT,			'T', ' '),
>   	TAINT_FLAG(TEST,			'N', ' '),
>   	TAINT_FLAG(FWCTL,			'J', ' '),
> +	TAINT_FLAG(KILLSWITCH,			'H', ' '),
>   };
>   
>   #undef TAINT_FLAG
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 8ff5adcfe1e0a..5770639c7b0ea 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -3349,6 +3349,19 @@ config TEST_HMM
>   
>   	  If unsure, say N.
>   
> +config TEST_KILLSWITCH
> +	tristate "Test module for the killswitch mitigation primitive"
> +	depends on KILLSWITCH && DEBUG_FS
> +	depends on m
> +	help
> +	  Build a module that exposes a deliberately-vulnerable function
> +	  ks_test_vuln() and a debugfs trigger /sys/kernel/debug/test_killswitch/fire.
> +	  The killswitch selftest in tools/testing/selftests/killswitch/
> +	  uses this to confirm engaging a killswitch suppresses the
> +	  -EBADMSG error the function would otherwise return.
> +
> +	  If unsure, say N.
> +
>   config TEST_FREE_PAGES
>   	tristate "Test freeing pages"
>   	help
> diff --git a/lib/Makefile b/lib/Makefile
> index f33a24bf1c19a..d763225340674 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -100,6 +100,7 @@ obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
>   obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
>   obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o
>   obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o
> +obj-$(CONFIG_TEST_KILLSWITCH) += test_killswitch.o
>   obj-$(CONFIG_TEST_HMM) += test_hmm.o
>   obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o
>   obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o
> diff --git a/lib/test_killswitch.c b/lib/test_killswitch.c
> new file mode 100644
> index 0000000000000..cc2584ad652ff
> --- /dev/null
> +++ b/lib/test_killswitch.c
> @@ -0,0 +1,85 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Test target for the killswitch selftest.  ks_test_vuln() returns
> + * -EBADMSG on a magic input, standing in for "the buggy path runs
> + * and produces a bad outcome".  Engaging killswitch on this function
> + * with retval 0 is the mitigation.
> + *
> + * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> + */
> +
> +#include <linux/debugfs.h>
> +#include <linux/fs.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/uaccess.h>
> +
> +#define KS_TEST_MAGIC	0xC0FFEEL
> +
> +int ks_test_vuln(long magic);
> +
> +/*
> + * Returns -EBADMSG on the magic input -- stands in for "the buggy
> + * path runs and produces a bad outcome".  Engaging a killswitch on
> + * this function with retval 0 represents the mitigation: even on
> + * the magic input, callers see success because the body never runs.
> + *
> + * noinline keeps the function out of line so the call actually
> + * reaches the kprobe-instrumented entry point.
> + */
> +noinline int ks_test_vuln(long magic)
> +{
> +	if (magic == KS_TEST_MAGIC)
> +		return -EBADMSG;
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(ks_test_vuln);
> +
> +static struct dentry *ks_test_dir;
> +
> +static ssize_t ks_test_fire_write(struct file *file, const char __user *ubuf,
> +				  size_t count, loff_t *ppos)
> +{
> +	char buf[32];
> +	long magic;
> +	int ret;
> +
> +	if (count == 0 || count >= sizeof(buf))
> +		return -EINVAL;
> +	if (copy_from_user(buf, ubuf, count))
> +		return -EFAULT;
> +	buf[count] = '\0';
> +
> +	ret = kstrtol(strim(buf), 0, &magic);
> +	if (ret)
> +		return ret;
> +
> +	ret = ks_test_vuln(magic);
> +	return ret ? ret : count;
> +}
> +
> +static const struct file_operations ks_test_fire_fops = {
> +	.write	= ks_test_fire_write,
> +	.open	= simple_open,
> +	.llseek	= noop_llseek,
> +};
> +
> +static int __init test_killswitch_init(void)
> +{
> +	ks_test_dir = debugfs_create_dir("test_killswitch", NULL);
> +	debugfs_create_file("fire", 0200, ks_test_dir, NULL,
> +			    &ks_test_fire_fops);
> +	pr_info("test_killswitch: loaded (magic=0x%lx)\n", KS_TEST_MAGIC);
> +	return 0;
> +}
> +module_init(test_killswitch_init);
> +
> +static void __exit test_killswitch_exit(void)
> +{
> +	debugfs_remove_recursive(ks_test_dir);
> +}
> +module_exit(test_killswitch_exit);
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("Deliberately-vulnerable target for killswitch selftest");
> diff --git a/security/security.c b/security/security.c
> index 4e999f0236516..bf700abc911a9 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -62,6 +62,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
>   	[LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
>   	[LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection",
>   	[LOCKDOWN_XEN_USER_ACTIONS] = "Xen guest user action",
> +	[LOCKDOWN_KILLSWITCH] = "engaging a killswitch",
>   	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
>   	[LOCKDOWN_KCORE] = "/proc/kcore access",
>   	[LOCKDOWN_KPROBES] = "use of kprobes",
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index 6e59b8f63e416..04c3f8c5ff229 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -53,6 +53,7 @@ TARGETS += ipc
>   TARGETS += ir
>   TARGETS += kcmp
>   TARGETS += kexec
> +TARGETS += killswitch
>   TARGETS += kselftest_harness
>   TARGETS += kvm
>   TARGETS += landlock
> diff --git a/tools/testing/selftests/killswitch/.gitignore b/tools/testing/selftests/killswitch/.gitignore
> new file mode 100644
> index 0000000000000..cbf204ce18615
> --- /dev/null
> +++ b/tools/testing/selftests/killswitch/.gitignore
> @@ -0,0 +1 @@
> +cve_31431_test
> diff --git a/tools/testing/selftests/killswitch/Makefile b/tools/testing/selftests/killswitch/Makefile
> new file mode 100644
> index 0000000000000..ccf41165cb73d
> --- /dev/null
> +++ b/tools/testing/selftests/killswitch/Makefile
> @@ -0,0 +1,8 @@
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> +TEST_GEN_PROGS := cve_31431_test cve_43284_test
> +TEST_PROGS := killswitch_test.sh
> +
> +CFLAGS += -O2 -g -std=gnu99 -Wall $(KHDR_INCLUDES)
> +
> +include ../lib.mk
> diff --git a/tools/testing/selftests/killswitch/cve_31431_test.c b/tools/testing/selftests/killswitch/cve_31431_test.c
> new file mode 100644
> index 0000000000000..1ff817c51d881
> --- /dev/null
> +++ b/tools/testing/selftests/killswitch/cve_31431_test.c
> @@ -0,0 +1,162 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * AF_ALG AEAD round-trip prober.  The killswitch selftest uses this
> + * to demonstrate that engaging a killswitch on af_alg_sendmsg
> + * neuters AF_ALG operations (sendmsg returns -EPERM), mitigating
> + * any AF_ALG-reachable bug whose exploit primitive runs from the
> + * send path.
> + *
> + * Exit codes:
> + *   0  AEAD round-trip succeeded (function intact)
> + *   1  AEAD round-trip refused (mitigation engaged)
> + *   2  setup error (no AF_ALG, missing aead/gcm(aes), etc.) -> SKIP
> + *
> + * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> + */
> +
> +#include <errno.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/socket.h>
> +#include <unistd.h>
> +#include <linux/if_alg.h>
> +
> +#define KEY_LEN		16
> +#define IV_LEN		12
> +#define AAD_LEN		16
> +#define PT_LEN		64
> +#define TAG_LEN		16
> +#define EXPECTED_LEN	(AAD_LEN + PT_LEN + TAG_LEN)
> +
> +#ifndef AF_ALG
> +#define AF_ALG		38
> +#endif
> +#ifndef SOL_ALG
> +#define SOL_ALG		279
> +#endif
> +
> +int main(void)
> +{
> +	struct sockaddr_alg sa = {
> +		.salg_family = AF_ALG,
> +		.salg_type   = "aead",
> +		.salg_name   = "gcm(aes)",
> +	};
> +	unsigned char key[KEY_LEN] = { 0 };
> +	unsigned char iv[IV_LEN]   = { 0 };
> +	unsigned char buf[1024]    = { 0 };
> +	struct msghdr msg = { 0 };
> +	struct iovec iov;
> +	struct cmsghdr *cmsg;
> +	struct af_alg_iv *aiv;
> +	char cbuf[256] = { 0 };
> +	int *p_op, *p_assoclen;
> +	int sk, opfd;
> +	ssize_t n;
> +
> +	sk = socket(AF_ALG, SOCK_SEQPACKET, 0);
> +	if (sk < 0) {
> +		fprintf(stderr, "AF_ALG socket: %s -- skip\n", strerror(errno));
> +		return 2;
> +	}
> +	if (bind(sk, (struct sockaddr *)&sa, sizeof(sa))) {
> +		fprintf(stderr, "bind aead/gcm(aes): %s -- skip\n",
> +			strerror(errno));
> +		close(sk);
> +		return 2;
> +	}
> +	if (setsockopt(sk, SOL_ALG, ALG_SET_KEY, key, KEY_LEN)) {
> +		fprintf(stderr, "ALG_SET_KEY: %s -- skip\n", strerror(errno));
> +		close(sk);
> +		return 2;
> +	}
> +	if (setsockopt(sk, SOL_ALG, ALG_SET_AEAD_AUTHSIZE, NULL, TAG_LEN)) {
> +		fprintf(stderr, "ALG_SET_AEAD_AUTHSIZE: %s -- skip\n",
> +			strerror(errno));
> +		close(sk);
> +		return 2;
> +	}
> +
> +	opfd = accept(sk, NULL, 0);
> +	if (opfd < 0) {
> +		fprintf(stderr, "accept: %s -- skip\n", strerror(errno));
> +		close(sk);
> +		return 2;
> +	}
> +
> +	/* control message: ENCRYPT op + IV + assoclen */
> +	msg.msg_control    = cbuf;
> +	msg.msg_controllen = CMSG_SPACE(sizeof(int))
> +			   + CMSG_SPACE(sizeof(*aiv) + IV_LEN)
> +			   + CMSG_SPACE(sizeof(int));
> +
> +	cmsg = CMSG_FIRSTHDR(&msg);
> +	cmsg->cmsg_level = SOL_ALG;
> +	cmsg->cmsg_type  = ALG_SET_OP;
> +	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
> +	p_op = (int *)CMSG_DATA(cmsg);
> +	*p_op = ALG_OP_ENCRYPT;
> +
> +	cmsg = CMSG_NXTHDR(&msg, cmsg);
> +	cmsg->cmsg_level = SOL_ALG;
> +	cmsg->cmsg_type  = ALG_SET_IV;
> +	cmsg->cmsg_len   = CMSG_LEN(sizeof(*aiv) + IV_LEN);
> +	aiv = (struct af_alg_iv *)CMSG_DATA(cmsg);
> +	aiv->ivlen = IV_LEN;
> +	memcpy(aiv->iv, iv, IV_LEN);
> +
> +	cmsg = CMSG_NXTHDR(&msg, cmsg);
> +	cmsg->cmsg_level = SOL_ALG;
> +	cmsg->cmsg_type  = ALG_SET_AEAD_ASSOCLEN;
> +	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
> +	p_assoclen = (int *)CMSG_DATA(cmsg);
> +	*p_assoclen = AAD_LEN;
> +
> +	/* AAD || plaintext */
> +	memset(buf, 0xaa, AAD_LEN);
> +	memset(buf + AAD_LEN, 0x55, PT_LEN);
> +	iov.iov_base = buf;
> +	iov.iov_len  = AAD_LEN + PT_LEN;
> +	msg.msg_iov    = &iov;
> +	msg.msg_iovlen = 1;
> +
> +	n = sendmsg(opfd, &msg, 0);
> +	if (n < 0) {
> +		/*
> +		 * sendmsg refused: this is exactly the killswitch
> +		 * af_alg_sendmsg=-EPERM mitigation outcome.  Distinct
> +		 * exit code from setup failure so the test script can
> +		 * tell them apart.
> +		 */
> +		fprintf(stderr, "sendmsg: %s -- mitigation engaged?\n",
> +			strerror(errno));
> +		close(opfd); close(sk);
> +		return 1;
> +	}
> +
> +	/* recv: AAD echoed, plus ciphertext + tag */
> +	memset(buf, 0, sizeof(buf));
> +	n = read(opfd, buf, EXPECTED_LEN);
> +	close(opfd); close(sk);
> +
> +	if (n == 0) {
> +		printf("AEAD returned 0 bytes -- killswitch mitigation engaged\n");
> +		return 1;
> +	}
> +	if (n != EXPECTED_LEN) {
> +		fprintf(stderr,
> +			"AEAD short read: got %zd, expected %d -- mitigated?\n",
> +			n, EXPECTED_LEN);
> +		return 1;
> +	}
> +
> +	/* sanity: ciphertext (after AAD) shouldn't equal the plaintext bytes */
> +	if (memcmp(buf + AAD_LEN, buf + AAD_LEN + 1, PT_LEN - 1) == 0) {
> +		fprintf(stderr, "AEAD output looks unencrypted\n");
> +		return 2;
> +	}
> +
> +	printf("AEAD round-trip OK (%zd bytes)\n", n);
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/killswitch/cve_43284_test.c b/tools/testing/selftests/killswitch/cve_43284_test.c
> new file mode 100644
> index 0000000000000..4771cb0957dc1
> --- /dev/null
> +++ b/tools/testing/selftests/killswitch/cve_43284_test.c
> @@ -0,0 +1,88 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * UDP loopback round-trip prober.  Wrapped by killswitch_test.sh with
> + * an IPsec ESP SA + policy pair on loopback, this demonstrates that
> + * engaging a killswitch on esp_input drops inbound ESP packets before
> + * decapsulation, mitigating CVE-2026-43284 ("Dirty Frag", upstream fix
> + * xfrm: esp: avoid in-place decrypt on shared skb frags).
> + *
> + * The binary itself knows nothing about ESP -- it sends one UDP
> + * datagram to itself and waits up to a second for delivery.
> + *
> + * Exit codes:
> + *   0  UDP round-trip succeeded (no mitigation in effect)
> + *   1  UDP recv timed out (mitigation engaged)
> + *   2  setup error -> SKIP
> + *
> + * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> + */
> +
> +#include <arpa/inet.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/socket.h>
> +#include <sys/time.h>
> +#include <unistd.h>
> +
> +#define UDP_PORT 53435
> +#define PROBE    "ks-43284-probe"
> +
> +int main(void)
> +{
> +	struct sockaddr_in addr = {
> +		.sin_family      = AF_INET,
> +		.sin_port        = htons(UDP_PORT),
> +		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
> +	};
> +	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
> +	char buf[64];
> +	int sk;
> +	ssize_t n;
> +
> +	sk = socket(AF_INET, SOCK_DGRAM, 0);
> +	if (sk < 0) {
> +		fprintf(stderr, "socket: %s -- skip\n", strerror(errno));
> +		return 2;
> +	}
> +	if (bind(sk, (struct sockaddr *)&addr, sizeof(addr))) {
> +		fprintf(stderr, "bind: %s -- skip\n", strerror(errno));
> +		close(sk);
> +		return 2;
> +	}
> +	if (setsockopt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
> +		fprintf(stderr, "SO_RCVTIMEO: %s -- skip\n", strerror(errno));
> +		close(sk);
> +		return 2;
> +	}
> +
> +	if (sendto(sk, PROBE, sizeof(PROBE) - 1, 0,
> +		   (struct sockaddr *)&addr, sizeof(addr)) < 0) {
> +		fprintf(stderr, "sendto: %s -- skip\n", strerror(errno));
> +		close(sk);
> +		return 2;
> +	}
> +
> +	memset(buf, 0, sizeof(buf));
> +	n = recvfrom(sk, buf, sizeof(buf), 0, NULL, NULL);
> +	close(sk);
> +
> +	if (n < 0) {
> +		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> +			fprintf(stderr,
> +				"recvfrom: timeout -- mitigation engaged?\n");
> +			return 1;
> +		}
> +		fprintf(stderr, "recvfrom: %s\n", strerror(errno));
> +		return 2;
> +	}
> +	if (n != (ssize_t)(sizeof(PROBE) - 1) ||
> +	    memcmp(buf, PROBE, sizeof(PROBE) - 1)) {
> +		fprintf(stderr, "recvfrom: bad payload (%zd bytes)\n", n);
> +		return 2;
> +	}
> +
> +	printf("UDP round-trip OK (%zd bytes)\n", n);
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/killswitch/killswitch_test.sh b/tools/testing/selftests/killswitch/killswitch_test.sh
> new file mode 100755
> index 0000000000000..ea3fd394a984f
> --- /dev/null
> +++ b/tools/testing/selftests/killswitch/killswitch_test.sh
> @@ -0,0 +1,254 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# End-to-end killswitch selftest.  Drives the test_killswitch module
> +# through an engage/disengage cycle and confirms each transition
> +# behaves as expected.  Also runs the AF_ALG and IPsec ESP mitigation proofs.
> +#
> +# Requirements (see Documentation/admin-guide/killswitch.rst):
> +#   - CONFIG_KILLSWITCH=y
> +#   - CONFIG_TEST_KILLSWITCH=m
> +#   - run as root (CAP_SYS_ADMIN)
> +#
> +# Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
> +#
> +
> +set -u
> +
> +KS=/sys/kernel/security/killswitch
> +TRIG=/sys/kernel/debug/test_killswitch/fire
> +
> +NOMOD=0
> +SKIP_RC=4
> +N=0
> +FAIL=0
> +
> +ksft_pass() { N=$((N+1));    echo "ok $N - $*"; }
> +ksft_fail() { N=$((N+1)); FAIL=$((FAIL+1)); echo "not ok $N - $*"; }
> +ksft_skip() { echo "ok 1 - SKIP $*"; echo "1..1"; exit $SKIP_RC; }
> +
> +[[ $EUID -eq 0 ]] || ksft_skip "must be root"
> +[[ -d $KS    ]] || ksft_skip "$KS not present (CONFIG_KILLSWITCH disabled?)"
> +
> +if ! modprobe test_killswitch 2>/dev/null; then
> +	NOMOD=1
> +fi
> +[[ -e $TRIG ]] || ksft_skip "$TRIG missing (test_killswitch.ko not installed?)"
> +
> +cleanup() {
> +	echo "disengage_all" > $KS/control 2>/dev/null || true
> +	[[ $NOMOD -eq 0 ]] && rmmod test_killswitch 2>/dev/null || true
> +}
> +trap cleanup EXIT
> +
> +# --- pre-engage: bad path runs, write fails with EBADMSG ---
> +if echo 0xC0FFEE > $TRIG 2>/dev/null; then
> +	ksft_fail "pre-engage: write should have failed (-EBADMSG)"
> +else
> +	[[ $? -ne 0 ]] && ksft_pass "pre-engage: bad path returns error" \
> +	             || ksft_fail "pre-engage: unexpected outcome"
> +fi
> +
> +# --- engage ---
> +echo "engage ks_test_vuln 0" > $KS/control
> +grep -q "^ks_test_vuln" $KS/engaged \
> +	&& ksft_pass "engage: ks_test_vuln in engaged list" \
> +	|| ksft_fail "engage: missing from engaged list"
> +
> +[[ $(cat $KS/taint) == 1 ]] \
> +	&& ksft_pass "engage: taint set" \
> +	|| ksft_fail "engage: taint not set"
> +
> +[[ -d $KS/fn/ks_test_vuln ]] \
> +	&& ksft_pass "engage: per-fn dir created" \
> +	|| ksft_fail "engage: per-fn dir missing"
> +
> +# --- post-engage: BUG suppressed; write returns successfully ---
> +if echo 0xC0FFEE > $TRIG 2>/dev/null; then
> +	ksft_pass "post-engage: BUG suppressed, write succeeded"
> +else
> +	ksft_fail "post-engage: write should succeed"
> +fi
> +
> +[[ $(cat $KS/fn/ks_test_vuln/hits) -ge 1 ]] \
> +	&& ksft_pass "post-engage: hits counter incremented" \
> +	|| ksft_fail "post-engage: hits counter did not move"
> +
> +# --- retval rewrite is a plain write (no validation) ---
> +echo 7 > $KS/fn/ks_test_vuln/retval
> +[[ $(cat $KS/fn/ks_test_vuln/retval) == 7 ]] \
> +	&& ksft_pass "retval rewrite round-trips" \
> +	|| ksft_fail "retval rewrite failed"
> +
> +# --- engage on a kprobe-rejected function fails ---
> +# warn_thunk_thunk is in /sys/kernel/debug/kprobes/blacklist;
> +# register_kprobe() refuses it.
> +KP_REJECT=warn_thunk_thunk
> +if echo "engage $KP_REJECT 0" > $KS/control 2>/dev/null; then
> +	ksft_fail "register_kprobe should have rejected $KP_REJECT"
> +	echo "disengage $KP_REJECT" > $KS/control
> +else
> +	ksft_pass "register_kprobe refuses blacklisted target"
> +fi
> +
> +# --- disengage ---
> +echo "disengage ks_test_vuln" > $KS/control
> +[[ -z "$(cat $KS/engaged)" ]] \
> +	&& ksft_pass "disengage: engaged list empty" \
> +	|| ksft_fail "disengage: engaged list not empty"
> +
> +[[ ! -d $KS/fn/ks_test_vuln ]] \
> +	&& ksft_pass "disengage: per-fn dir removed" \
> +	|| ksft_fail "disengage: per-fn dir still present"
> +
> +[[ $(cat $KS/taint) == 1 ]] \
> +	&& ksft_pass "disengage: taint persists" \
> +	|| ksft_fail "disengage: taint should persist"
> +
> +# --- post-disengage: bad path active again ---
> +if echo 0xC0FFEE > $TRIG 2>/dev/null; then
> +	ksft_fail "post-disengage: write should fail again"
> +else
> +	ksft_pass "post-disengage: bad path active again"
> +fi
> +
> +# ---- CVE-2026-31431 mitigation proof (AF_ALG aead via af_alg_sendmsg) ----
> +# Skip the whole block if AF_ALG / AEAD machinery isn't compiled in.
> +if [[ -x $(dirname "$0")/cve_31431_test ]]; then
> +	CVE=$(dirname "$0")/cve_31431_test
> +	$CVE >/dev/null 2>&1 && PRE=$? || PRE=$?
> +	if [[ $PRE -eq 0 ]]; then
> +		ksft_pass "cve-31431: pre-engage AEAD round-trip OK"
> +
> +		echo "engage af_alg_sendmsg -1" > $KS/control
> +		$CVE >/dev/null 2>&1 && POST=$? || POST=$?
> +		if [[ $POST -eq 1 ]]; then
> +			ksft_pass "cve-31431: post-engage AEAD refused (mitigated)"
> +		else
> +			ksft_fail "cve-31431: post-engage exit=$POST (expected 1)"
> +		fi
> +
> +		HITS=$(cat $KS/fn/af_alg_sendmsg/hits 2>/dev/null || echo 0)
> +		[[ $HITS -ge 1 ]] && ksft_pass "cve-31431: hits=$HITS recorded" \
> +			|| ksft_fail "cve-31431: hits not recorded"
> +
> +		echo "disengage af_alg_sendmsg" > $KS/control
> +		$CVE >/dev/null 2>&1 && POST2=$? || POST2=$?
> +		[[ $POST2 -eq 0 ]] && ksft_pass "cve-31431: post-disengage restored" \
> +			|| ksft_fail "cve-31431: post-disengage exit=$POST2"
> +	elif [[ $PRE -eq 2 ]]; then
> +		echo "# SKIP cve-31431 (AF_ALG/AEAD not available)"
> +	else
> +		ksft_fail "cve-31431: pre-engage exit=$PRE"
> +	fi
> +fi
> +
> +# ---- CVE-2026-43284 mitigation proof (IPsec ESP via esp_input) ----
> +# Engaging esp_input causes inbound ESP packets to be dropped before
> +# decapsulation, neutering any bug downstream of the ESP receive path.
> +# Two netns + veth so traffic actually traverses xfrm (single-netns
> +# 127.0.0.0/8 traffic short-circuits before xfrm policy lookup).
> +NS0=ks-esp-0
> +NS1=ks-esp-1
> +esp_setup_ok=0
> +esp_cleanup() {
> +	[[ $esp_setup_ok -eq 1 ]] || return 0
> +	ip netns del $NS0 2>/dev/null
> +	ip netns del $NS1 2>/dev/null
> +}
> +trap 'cleanup; esp_cleanup' EXIT
> +
> +# UDP probe in python3 (always present on Debian/Fedora minimal installs).
> +esp_round_trip() {
> +	# $1: source netns, $2: dest netns, $3: dest ip, $4: port
> +	local tmp rpid rc
> +	tmp=$(mktemp)
> +	ip netns exec "$2" python3 -c '
> +import socket
> +r = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
> +r.bind(("0.0.0.0", '"$4"'))
> +r.settimeout(2.0)
> +try:
> +    d,_ = r.recvfrom(64)
> +    print(d.decode(errors="replace"))
> +except socket.timeout:
> +    print("timeout")
> +' > "$tmp" 2>&1 &
> +	rpid=$!
> +	sleep 0.3
> +	ip netns exec "$1" python3 -c '
> +import socket
> +s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
> +s.sendto(b"ks-esp-probe", ("'"$3"'", '"$4"'))
> +' 2>/dev/null
> +	wait $rpid 2>/dev/null
> +	rc=1
> +	grep -q "ks-esp-probe" "$tmp" && rc=0
> +	rm -f "$tmp"
> +	return $rc
> +}
> +
> +if command -v ip >/dev/null 2>&1 && command -v python3 >/dev/null 2>&1; then
> +	KEY=0x0123456789abcdef0123456789abcdef01234567
> +
> +	if ip netns add $NS0 2>/dev/null && \
> +	   ip netns add $NS1 2>/dev/null && \
> +	   ip link add veth0 type veth peer name veth1 2>/dev/null && \
> +	   ip link set veth0 netns $NS0 2>/dev/null && \
> +	   ip link set veth1 netns $NS1 2>/dev/null && \
> +	   ip -n $NS0 addr add 10.99.0.1/24 dev veth0 2>/dev/null && \
> +	   ip -n $NS1 addr add 10.99.0.2/24 dev veth1 2>/dev/null && \
> +	   ip -n $NS0 link set veth0 up 2>/dev/null && \
> +	   ip -n $NS1 link set veth1 up 2>/dev/null && \
> +	   ip -n $NS0 link set lo up 2>/dev/null && \
> +	   ip -n $NS1 link set lo up 2>/dev/null && \
> +	   ip -n $NS0 xfrm state add src 10.99.0.1 dst 10.99.0.2 proto esp \
> +		spi 0x1000 mode transport reqid 0x100 \
> +		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
> +	   ip -n $NS0 xfrm state add src 10.99.0.2 dst 10.99.0.1 proto esp \
> +		spi 0x1001 mode transport reqid 0x100 \
> +		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
> +	   ip -n $NS1 xfrm state add src 10.99.0.1 dst 10.99.0.2 proto esp \
> +		spi 0x1000 mode transport reqid 0x100 \
> +		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
> +	   ip -n $NS1 xfrm state add src 10.99.0.2 dst 10.99.0.1 proto esp \
> +		spi 0x1001 mode transport reqid 0x100 \
> +		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
> +	   ip -n $NS0 xfrm policy add src 10.99.0.1 dst 10.99.0.2 \
> +		dir out tmpl src 10.99.0.1 dst 10.99.0.2 proto esp \
> +		reqid 0x100 mode transport 2>/dev/null && \
> +	   ip -n $NS1 xfrm policy add src 10.99.0.1 dst 10.99.0.2 \
> +		dir in tmpl src 10.99.0.1 dst 10.99.0.2 proto esp \
> +		reqid 0x100 mode transport 2>/dev/null; then
> +		esp_setup_ok=1
> +	fi
> +
> +	if [[ $esp_setup_ok -eq 1 ]] \
> +	   && esp_round_trip $NS0 $NS1 10.99.0.2 53435; then
> +		ksft_pass "cve-43284: pre-engage ESP round-trip OK"
> +
> +		echo "engage esp_input -22" > $KS/control
> +		if esp_round_trip $NS0 $NS1 10.99.0.2 53435; then
> +			ksft_fail "cve-43284: post-engage ESP should have been dropped"
> +		else
> +			ksft_pass "cve-43284: post-engage ESP refused (mitigated)"
> +		fi
> +
> +		ESP_HITS=$(cat $KS/fn/esp_input/hits 2>/dev/null || echo 0)
> +		[[ $ESP_HITS -ge 1 ]] \
> +			&& ksft_pass "cve-43284: hits=$ESP_HITS recorded" \
> +			|| ksft_fail "cve-43284: hits not recorded"
> +
> +		echo "disengage esp_input" > $KS/control
> +		if esp_round_trip $NS0 $NS1 10.99.0.2 53435; then
> +			ksft_pass "cve-43284: post-disengage restored"
> +		else
> +			ksft_fail "cve-43284: post-disengage ESP still dropped"
> +		fi
> +	else
> +		echo "# SKIP cve-43284 (netns/veth/XFRM/ESP setup failed)"
> +	fi
> +fi
> +
> +echo "1..$N"
> +exit $((FAIL > 0))

^ permalink raw reply

* [PATCH v3] killswitch: add per-function short-circuit mitigation primitive
From: Sasha Levin @ 2026-05-17 13:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-doc, linux-kselftest, bpf, live-patching,
	Greg Kroah-Hartman, Andrew Morton, Jonathan Corbet,
	Mathieu Desnoyers, Joshua Peisach, Florian Weimer, Breno Leitao,
	Anthony Iliopoulos, Michal Hocko, Jiri Olsa, Sasha Levin
In-Reply-To: <20260508195749.1885522-1-sashal@kernel.org>

When a kernel (security) issue goes public, fleets stay exposed until a patched
kernel is built, distributed, and rebooted into.

For many such issues the simplest mitigation is to stop calling the buggy
function. Killswitch provides that. An admin writes:

    echo "engage af_alg_sendmsg -1" \
        > /sys/kernel/security/killswitch/control

After this, af_alg_sendmsg() returns -EPERM on every call without
running its body. The mitigation takes effect immediately, and is dropped on
the next reboot -- by which point a patched kernel is hopefully in place.

A lot of recent kernel issues sit in code paths most installs only have enabled
to support a relative minority of users: AF_ALG, ksmbd, nf_tables, vsock, ax25,
and friends.

For most users, the cost of "this socket family stops working for the day" is
much smaller than the cost of running a known vulnerable kernel until the fix
lands.

Why not an existing facility:

* livepatch needs a built, signed, per-kernel-version module per CVE.
  Under Secure Boot the operator can't sign their own, so they wait
  for the vendor, and only a minority of vendors actually ship
  livepatches. Killswitch covers the days before that module shows
  up.

* fail_function (CONFIG_FUNCTION_ERROR_INJECTION) is disabled in
  most production kernels. Even where enabled, it only works on
  functions pre-annotated with ALLOW_ERROR_INJECTION() in source -
  no help for a freshly-disclosed CVE. The debugfs UI is blocked by
  lockdown=integrity and the override is probabilistic.

* BPF override (bpf_override_return) honors the same
  ALLOW_ERROR_INJECTION() whitelist, and BPF itself is off in many
  production kernels. Even where on, the operator interface is
  "load a verified BPF program," not a one-line write.

* Module blacklist only helps when the bug is in a loadable module.

Killswitch fills the gap: write a symbol to securityfs, and the function
returns the chosen value until disengage or reboot.

Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

Changes since v2:
- Fix LLVM=1 build: gate __noipa__ on __has_attribute() (Breno)
- Admin guide: do-not-engage list, pre-soak workflow, relation to
  livepatch/fail_function/BPF (Michal, Mathieu, Joshua)
- Add CVE-2026-43284 (esp_input) worked example + netns selftest
- Drop unused [reason] token from Kconfig help and cmdline comment
- Commit message: spell out why livepatch / fail_function / BPF
  override / module-blacklist don't cover this window.

 Documentation/admin-guide/index.rst           |   1 +
 Documentation/admin-guide/killswitch.rst      | 229 +++++
 Documentation/admin-guide/tainted-kernels.rst |   8 +
 MAINTAINERS                                   |  11 +
 include/linux/killswitch.h                    |  19 +
 include/linux/panic.h                         |   3 +-
 include/linux/security.h                      |   1 +
 init/Kconfig                                  |   2 +
 kernel/Kconfig.killswitch                     |  31 +
 kernel/Makefile                               |   1 +
 kernel/killswitch.c                           | 863 ++++++++++++++++++
 kernel/panic.c                                |   1 +
 lib/Kconfig.debug                             |  13 +
 lib/Makefile                                  |   1 +
 lib/test_killswitch.c                         |  85 ++
 security/security.c                           |   1 +
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/killswitch/.gitignore |   1 +
 tools/testing/selftests/killswitch/Makefile   |   8 +
 .../selftests/killswitch/cve_31431_test.c     | 162 ++++
 .../selftests/killswitch/cve_43284_test.c     |  88 ++
 .../selftests/killswitch/killswitch_test.sh   | 254 ++++++
 22 files changed, 1783 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/admin-guide/killswitch.rst
 create mode 100644 include/linux/killswitch.h
 create mode 100644 kernel/Kconfig.killswitch
 create mode 100644 kernel/killswitch.c
 create mode 100644 lib/test_killswitch.c
 create mode 100644 tools/testing/selftests/killswitch/.gitignore
 create mode 100644 tools/testing/selftests/killswitch/Makefile
 create mode 100644 tools/testing/selftests/killswitch/cve_31431_test.c
 create mode 100644 tools/testing/selftests/killswitch/cve_43284_test.c
 create mode 100755 tools/testing/selftests/killswitch/killswitch_test.sh

diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index cd28dfe91b060..ca37dd70f108d 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -70,6 +70,7 @@ problems and bugs in particular.
    bug-hunting
    bug-bisect
    tainted-kernels
+   killswitch
    ramoops
    dynamic-debug-howto
    init
diff --git a/Documentation/admin-guide/killswitch.rst b/Documentation/admin-guide/killswitch.rst
new file mode 100644
index 0000000000000..a524cc9ee23ca
--- /dev/null
+++ b/Documentation/admin-guide/killswitch.rst
@@ -0,0 +1,229 @@
+.. SPDX-License-Identifier: GPL-2.0
+..
+.. Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+
+============
+Killswitch
+============
+
+Killswitch lets a privileged operator make a chosen kernel function
+return a fixed value without executing its body, as a temporary
+mitigation for a security bug while a real fix is being prepared.
+
+The function returns the operator-supplied value and nothing else
+runs in its place. There is no allowlist, no return-type check; if
+the kprobe layer accepts the symbol, killswitch engages it. Once
+engaged, the change is in effect on every CPU until ``disengage`` is
+written or the system reboots.
+
+Configuration
+=============
+
+``CONFIG_KILLSWITCH``
+  Enables the feature. Depends on ``SECURITYFS``, ``KPROBES`` (with
+  ftrace support), and ``FUNCTION_ERROR_INJECTION``.
+
+The interface
+=============
+
+::
+
+    /sys/kernel/security/killswitch/
+        engaged                 RO  currently-engaged functions
+        control                 WO  command sink
+        taint                   RO  0 or 1
+        fn/<name>/              per-function directory, created on engage
+            retval              RW  return value
+            hits                RO  per-cpu summed call count
+
+Three commands are accepted by ``control``::
+
+    engage <symbol> <retval>
+    disengage <symbol>
+    disengage_all
+
+Each engage and disengage emits a single ``KERN_WARNING`` line to
+dmesg with the symbol, retval, hit count (on disengage), and the
+operator's identity (uid/auid/sessionid/comm, or ``source=cmdline``).
+
+Engagement is rejected when:
+
+* the symbol is unknown, in a non-traceable section, on the kprobe
+  blacklist, or otherwise refused by ``register_kprobe`` (the error
+  from the kprobe layer is logged and returned to userspace);
+* the symbol is already engaged (``-EBUSY``);
+* the operator does not hold ``CAP_SYS_ADMIN``.
+
+Whatever value the operator writes is what the function returns.
+Writing the wrong type or wrong value lands in the caller as-is.
+
+Boot parameter
+==============
+
+``killswitch=fn1=<val>,fn2=<val>,...``
+
+Parsed early; engagements are applied at the end of kernel init
+once the kprobe subsystem is up. Parse failures emit a warning and
+skip the offending entry; they never panic.
+
+Useful for fleet rollout: when an issue drops, ship the mitigation
+in the bootloader / PXE config and roll the fleet through reboots
+while the real fix is being prepared.
+
+Tainting
+========
+
+The first successful engagement (runtime or boot-time) sets
+``TAINT_KILLSWITCH`` (bit 20, char ``H``). The taint persists across
+``disengage`` until reboot, so an oops on a killswitch-modified
+kernel is identifiable from the banner: ``Tainted: ... H`` tells a
+maintainer to consult ``engaged`` before further triage.
+
+Module unload
+=============
+
+If a module containing an engaged target is unloaded, killswitch
+auto-disengages the entry and emits a ``KERN_WARNING`` so the loss
+of mitigation is visible. Reloading the module does not silently
+re-arm the killswitch; the operator re-engages explicitly.
+
+Choosing the right target
+=========================
+
+A function that *looks* skippable may be relied on by callers for a
+side effect (a lock the caller releases, a refcount the caller
+drops, a scatterlist the caller consumes). The rule of thumb:
+
+  Pick the **highest-level** entry point that contains the bug.
+
+That gives callers no chance to dereference half-initialised state
+from a function whose body was skipped. Two illustrative examples
+from ``crypto/af_alg.c``:
+
+Anti-pattern: ``af_alg_count_tsgl``
+-----------------------------------
+
+``af_alg_count_tsgl()`` returns ``unsigned int`` (the number of TX
+SG entries). Engaging it with retval ``0`` causes the caller in
+``algif_aead.c`` to allocate a 1-entry scatterlist (its
+``if (!entries) entries = 1`` guard) and then walk the *real* TX
+SGL into that undersized destination via ``af_alg_pull_tsgl``,
+producing out-of-bounds writes. **Killswitching here introduces a
+worse bug than the one being mitigated.**
+
+Anti-pattern: ``af_alg_pull_tsgl``
+----------------------------------
+
+``af_alg_pull_tsgl()`` returns ``void``, so any retval is accepted.
+But its caller depends on the per-request SGL being filled in.
+Skipping the body leaves the per-request SGL with NULL pages; the
+next-stage ``memcpy_sglist`` dereferences them and the kernel
+oopses.
+
+Correct pattern: ``af_alg_sendmsg``
+-----------------------------------
+
+``af_alg_sendmsg()`` is the highest-level entry into the AF_ALG
+send path. Engaging it with retval ``-EPERM`` causes every send
+attempt to return -EPERM to userspace; no caller ever sees
+half-initialised state, and any AF_ALG-reachable bug downstream of
+``sendmsg`` is unreachable until the killswitch is disengaged.
+
+The canonical pattern: pick a syscall-handler-shaped function whose
+return value already encodes "this operation didn't happen", and
+let userspace handle the error as it would any other failed
+syscall.
+
+Correct pattern: ``esp_input`` (CVE-2026-43284)
+-----------------------------------------------
+
+The IPsec ESP receive-path bug fixed by ``xfrm: esp: avoid in-place
+decrypt on shared skb frags`` is reachable through ``esp_input()``
+in ``net/ipv4/esp4.c`` (and ``esp6_input()`` for IPv6). Engage these
+with retval ``-EINVAL``:
+
+::
+
+    echo "engage esp_input -22"  > /sys/kernel/security/killswitch/control
+    echo "engage esp6_input -22" > /sys/kernel/security/killswitch/control
+
+Inbound ESP packets are then dropped before decapsulation, neutering
+any bug downstream of the ESP receive path. IPsec tunnels stop
+working; other networking is unaffected.
+
+Do not engage
+=============
+
+Do not killswitch:
+
+* process or memory primitives the rest of the kernel needs to
+  function: ``fork``, ``do_exit``, ``__alloc_pages``, ``kmalloc``,
+  ``schedule``, anything in ``mm/`` reached by every allocation.
+* hot paths in the scheduler, timekeeping, RCU, or interrupt entry.
+* functions invoked from the killswitch path itself (``securityfs``,
+  ``lockdown``, ``audit``, ``kprobe`` registration) -- the system
+  may livelock or refuse to disengage.
+* functions whose return value is read structurally (size, count,
+  pointer-to-allocated-thing) rather than as success/failure.
+  See the AF_ALG anti-patterns above for what goes wrong.
+
+When in doubt, measure first.
+
+Pre-soak before engaging
+========================
+
+If the target's call rate is unknown, attach a counter for a few
+seconds first. With perf::
+
+    perf probe --add 'esp_input'
+    perf stat -a -e probe:esp_input -- sleep 5
+
+Or with bpftrace::
+
+    bpftrace -e 'kprobe:esp_input { @hits = count(); } interval:s:5 { exit(); }'
+
+A target with ten thousand hits per second is not a candidate -- the
+kernel will not survive five seconds with that path returning a
+fixed error.
+
+Relation to other facilities
+============================
+
+* ``CONFIG_FUNCTION_ERROR_INJECTION`` provides the same architecture
+  trampoline (``override_function_with_return``), which killswitch
+  reuses. fail_function is debug-oriented: targets must be
+  pre-annotated with ``ALLOW_ERROR_INJECTION()`` in source, the
+  override is probabilistic, and the interface is on debugfs (blocked
+  under ``lockdown=integrity``). Killswitch is the production cousin:
+  no whitelist, deterministic, securityfs-visible under integrity
+  lockdown, with audit and taint.
+* livepatch can do everything killswitch can and more, at the cost
+  of building, signing, and shipping a kernel module per mitigation.
+  Killswitch is for the window before that module exists.
+* BPF override (``bpf_override_return``) needs a BPF program and
+  ``CONFIG_BPF_KPROBE_OVERRIDE``; appropriate when the policy is
+  conditional, overkill for "always return -EPERM".
+
+Safety notes
+============
+
+* In-flight calls during ``write()`` to ``control`` may run either
+  the original body or the override. The override is ``return X``,
+  which has no preconditions to violate.
+* SMP visibility comes from ``text_poke_bp()``. ``write()`` to
+  ``control`` returns only after every CPU sees the new path.
+* The ftrace ops unregister waits for in-flight pre-handlers, so
+  freeing the engagement attribute on disengage is safe.
+* Inline functions, freed ``__init`` symbols, and anything compiled
+  away cannot be killswitched. ``register_kprobe`` rejects them
+  with whatever error the kprobe layer chooses.
+
+Diagnostics
+===========
+
+Per-call hits are aggregated in a per-cpu counter readable at
+``/sys/kernel/security/killswitch/fn/<name>/hits``. Per-hit logging
+is not provided to avoid log storms on hot paths.
+
+A ``KILLSWITCH`` entry appears in the kernel taint vector once any
+engagement succeeds (also visible as ``H`` in the oops banner).
diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst
index 9ead927a37c0f..71a6e3364eddc 100644
--- a/Documentation/admin-guide/tainted-kernels.rst
+++ b/Documentation/admin-guide/tainted-kernels.rst
@@ -102,6 +102,7 @@ Bit  Log  Number  Reason that got the kernel tainted
  17  _/T  131072  kernel was built with the struct randomization plugin
  18  _/N  262144  an in-kernel test has been run
  19  _/J  524288  userspace used a mutating debug operation in fwctl
+ 20  _/H 1048576  killswitch override engaged (function short-circuited)
 ===  ===  ======  ========================================================
 
 Note: The character ``_`` is representing a blank in this table to make reading
@@ -189,3 +190,10 @@ More detailed explanation for tainting
  19) ``J`` if userspace opened /dev/fwctl/* and performed a FWTCL_RPC_DEBUG_WRITE
      to use the devices debugging features. Device debugging features could
      cause the device to malfunction in undefined ways.
+
+ 20) ``H`` if the killswitch primitive (see
+     Documentation/admin-guide/killswitch.rst) has been engaged on at least
+     one function. The kernel is no longer running its source: at least one
+     function has been short-circuited to return a fixed value. The taint
+     persists across ``disengage`` until the next reboot — once the running
+     image has been modified, oops triage must reflect that.
diff --git a/MAINTAINERS b/MAINTAINERS
index b2040011a3865..b4005b61d444f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14350,6 +14350,17 @@ F:	lib/Kconfig.kmsan
 F:	mm/kmsan/
 F:	scripts/Makefile.kmsan
 
+KILLSWITCH (function short-circuit mitigation)
+M:	Sasha Levin <sashal@kernel.org>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	Documentation/admin-guide/killswitch.rst
+F:	include/linux/killswitch.h
+F:	kernel/Kconfig.killswitch
+F:	kernel/killswitch.c
+F:	lib/test_killswitch.c
+F:	tools/testing/selftests/killswitch/
+
 KPROBES
 M:	Naveen N Rao <naveen@kernel.org>
 M:	"David S. Miller" <davem@davemloft.net>
diff --git a/include/linux/killswitch.h b/include/linux/killswitch.h
new file mode 100644
index 0000000000000..3fad49e180ddf
--- /dev/null
+++ b/include/linux/killswitch.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+ */
+#ifndef _LINUX_KILLSWITCH_H
+#define _LINUX_KILLSWITCH_H
+
+#ifdef CONFIG_KILLSWITCH
+int killswitch_engage(const char *symbol, long retval);
+int killswitch_disengage(const char *symbol);
+bool killswitch_is_engaged(const char *symbol);
+#else
+static inline int killswitch_engage(const char *symbol, long retval)
+{ return -EOPNOTSUPP; }
+static inline int killswitch_disengage(const char *symbol) { return -EOPNOTSUPP; }
+static inline bool killswitch_is_engaged(const char *symbol) { return false; }
+#endif
+
+#endif /* _LINUX_KILLSWITCH_H */
diff --git a/include/linux/panic.h b/include/linux/panic.h
index f1dd417e54b29..6699261a61f13 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -88,7 +88,8 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
 #define TAINT_RANDSTRUCT		17
 #define TAINT_TEST			18
 #define TAINT_FWCTL			19
-#define TAINT_FLAGS_COUNT		20
+#define TAINT_KILLSWITCH		20
+#define TAINT_FLAGS_COUNT		21
 #define TAINT_FLAGS_MAX			((1UL << TAINT_FLAGS_COUNT) - 1)
 
 struct taint_flag {
diff --git a/include/linux/security.h b/include/linux/security.h
index 41d7367cf4036..038027c33ba1a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -146,6 +146,7 @@ enum lockdown_reason {
 	LOCKDOWN_DBG_WRITE_KERNEL,
 	LOCKDOWN_RTAS_ERROR_INJECTION,
 	LOCKDOWN_XEN_USER_ACTIONS,
+	LOCKDOWN_KILLSWITCH,
 	LOCKDOWN_INTEGRITY_MAX,
 	LOCKDOWN_KCORE,
 	LOCKDOWN_KPROBES,
diff --git a/init/Kconfig b/init/Kconfig
index 2937c4d308aec..5368dd4b5c65b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2278,6 +2278,8 @@ config ASN1
 
 source "kernel/Kconfig.locks"
 
+source "kernel/Kconfig.killswitch"
+
 config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	bool
 
diff --git a/kernel/Kconfig.killswitch b/kernel/Kconfig.killswitch
new file mode 100644
index 0000000000000..a33f7ecb2861e
--- /dev/null
+++ b/kernel/Kconfig.killswitch
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Killswitch: per-function short-circuit mitigation primitive.
+#
+# Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+#
+
+config KILLSWITCH
+	bool "Killswitch: short-circuit a kernel function as a CVE mitigation"
+	depends on SECURITYFS
+	depends on KPROBES && HAVE_KPROBES_ON_FTRACE
+	depends on HAVE_FUNCTION_ERROR_INJECTION
+	select FUNCTION_ERROR_INJECTION
+	help
+	  Provide an admin-facing mechanism to make a chosen kernel function
+	  return a fixed value without executing its body, as a temporary
+	  mitigation for a security bug before a real fix is available.
+
+	  Operators write "engage <symbol> <retval>" to
+	  /sys/kernel/security/killswitch/control. The function entry is
+	  redirected via a kprobe whose pre-handler sets the chosen return
+	  value and short-circuits the call. There is no allowlist,
+	  denylist, or return-type validation: if the kprobe layer accepts
+	  the symbol the engagement proceeds, otherwise its error is
+	  returned to userspace.
+
+	  This is *not* livepatch: there is no replacement implementation,
+	  the function simply returns the chosen value. Engaging a killswitch
+	  taints the kernel (TAINT_KILLSWITCH, 'H'). Requires CAP_SYS_ADMIN.
+
+	  If unsure, say N.
diff --git a/kernel/Makefile b/kernel/Makefile
index 6785982013dce..b3e408d9f275e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
+obj-$(CONFIG_KILLSWITCH) += killswitch.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/killswitch.c b/kernel/killswitch.c
new file mode 100644
index 0000000000000..7f509c62ea748
--- /dev/null
+++ b/kernel/killswitch.c
@@ -0,0 +1,863 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per-function short-circuit mitigation.
+ *
+ * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+ *
+ * Engaging a killswitch installs a kprobe at the function's entry
+ * whose pre-handler sets the return register and skips the body via
+ * override_function_with_return().  Operator interface lives at
+ * /sys/kernel/security/killswitch/.
+ */
+
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/error-injection.h>
+#include <linux/init.h>
+#include <linux/killswitch.h>
+#include <linux/kprobes.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/panic.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/uidgid.h>
+
+struct ks_attr {
+	struct list_head	list;
+	struct kprobe		kp;
+	/* atomic so a writer racing an in-flight call can't tear the long. */
+	atomic_long_t		retval;
+	/* false once disengaged; per-fn file ops then return -EIDRM. */
+	bool			engaged;
+	unsigned long __percpu	*hits;
+	struct dentry		*dir;
+	/* engaged_list holds one ref; each open per-fn fd holds one. */
+	struct kref		refcnt;
+};
+
+static DEFINE_MUTEX(ks_lock);
+static LIST_HEAD(ks_engaged_list);
+static struct dentry *ks_root_dir;
+static struct dentry *ks_fn_dir;	/* parent for per-fn directories */
+
+/* ------------------------------------------------------------------ *
+ * Pre-handler: the actual override                                   *
+ * ------------------------------------------------------------------ */
+
+static int ks_kprobe_pre_handler(struct kprobe *kp, struct pt_regs *regs)
+{
+	struct ks_attr *attr = container_of(kp, struct ks_attr, kp);
+
+	this_cpu_inc(*attr->hits);
+	regs_set_return_value(regs, (unsigned long)atomic_long_read(&attr->retval));
+	override_function_with_return(regs);
+	return 1;
+}
+NOKPROBE_SYMBOL(ks_kprobe_pre_handler);
+
+/* Defined non-NULL so the kprobe layer keeps the IPMODIFY ops. */
+static void ks_kprobe_post_handler(struct kprobe *kp, struct pt_regs *regs,
+				   unsigned long flags)
+{
+}
+
+/* ------------------------------------------------------------------ *
+ * Attribute lifecycle                                                *
+ * ------------------------------------------------------------------ */
+
+static struct ks_attr *ks_attr_lookup(const char *symbol)
+{
+	struct ks_attr *attr;
+
+	list_for_each_entry(attr, &ks_engaged_list, list)
+		if (!strcmp(attr->kp.symbol_name, symbol))
+			return attr;
+	return NULL;
+}
+
+static unsigned long ks_attr_hits(const struct ks_attr *attr)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += *per_cpu_ptr(attr->hits, cpu);
+	return total;
+}
+
+static void ks_attr_destroy(struct ks_attr *attr)
+{
+	if (!attr)
+		return;
+	free_percpu(attr->hits);
+	kfree(attr->kp.symbol_name);
+	kfree(attr);
+}
+
+static void ks_attr_kref_release(struct kref *kref)
+{
+	ks_attr_destroy(container_of(kref, struct ks_attr, refcnt));
+}
+
+static void ks_attr_get(struct ks_attr *attr)
+{
+	kref_get(&attr->refcnt);
+}
+
+static void ks_attr_put(struct ks_attr *attr)
+{
+	kref_put(&attr->refcnt, ks_attr_kref_release);
+}
+
+static struct ks_attr *ks_attr_alloc(const char *symbol)
+{
+	struct ks_attr *attr;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return NULL;
+
+	attr->kp.symbol_name = kstrdup(symbol, GFP_KERNEL);
+	if (!attr->kp.symbol_name)
+		goto err;
+
+	attr->hits = alloc_percpu(unsigned long);
+	if (!attr->hits)
+		goto err;
+
+	attr->kp.pre_handler = ks_kprobe_pre_handler;
+	attr->kp.post_handler = ks_kprobe_post_handler;
+	INIT_LIST_HEAD(&attr->list);
+	kref_init(&attr->refcnt);
+	return attr;
+
+err:
+	ks_attr_destroy(attr);
+	return NULL;
+}
+
+/* ------------------------------------------------------------------ *
+ * Securityfs: per-fn attribute files                                 *
+ * ------------------------------------------------------------------ */
+
+/*
+ * Look up by symbol name (the parent dentry's basename) under
+ * ks_lock and confirm attr->dir is the file's parent dentry.  This
+ * binds the fd to the engagement it was opened against and avoids
+ * dereferencing inode->i_private, which a racing disengage may have
+ * freed.  d_parent is stable for the open's lifetime via the file's
+ * dentry reference.
+ */
+static int ks_attr_open(struct inode *inode, struct file *file)
+{
+	struct dentry *parent = file->f_path.dentry->d_parent;
+	const char *name = parent->d_name.name;
+	struct ks_attr *attr;
+
+	mutex_lock(&ks_lock);
+	attr = ks_attr_lookup(name);
+	if (attr && attr->dir == parent)
+		ks_attr_get(attr);
+	else
+		attr = NULL;
+	mutex_unlock(&ks_lock);
+	if (!attr)
+		return -ENOENT;
+	file->private_data = attr;
+	return 0;
+}
+
+static int ks_attr_release(struct inode *inode, struct file *file)
+{
+	ks_attr_put(file->private_data);
+	file->private_data = NULL;
+	return 0;
+}
+
+/* Caller must hold ks_lock. */
+static int ks_attr_check_live(const struct ks_attr *attr)
+{
+	return attr->engaged ? 0 : -EIDRM;
+}
+
+static ssize_t ks_retval_read(struct file *file, char __user *ubuf,
+			      size_t count, loff_t *ppos)
+{
+	struct ks_attr *attr = file->private_data;
+	char buf[32];
+	long val;
+	int ret, len;
+
+	mutex_lock(&ks_lock);
+	ret = ks_attr_check_live(attr);
+	val = atomic_long_read(&attr->retval);
+	mutex_unlock(&ks_lock);
+	if (ret)
+		return ret;
+	len = scnprintf(buf, sizeof(buf), "%ld\n", val);
+	return simple_read_from_buffer(ubuf, count, ppos, buf, len);
+}
+
+static ssize_t ks_retval_write(struct file *file, const char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	struct ks_attr *attr = file->private_data;
+	char buf[32];
+	long val;
+	int ret;
+
+	if (count >= sizeof(buf))
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, count))
+		return -EFAULT;
+	buf[count] = '\0';
+	strim(buf);
+
+	ret = kstrtol(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	mutex_lock(&ks_lock);
+	ret = ks_attr_check_live(attr);
+	if (!ret)
+		atomic_long_set(&attr->retval, val);
+	mutex_unlock(&ks_lock);
+
+	return ret ? ret : count;
+}
+
+static const struct file_operations ks_retval_fops = {
+	.open		= ks_attr_open,
+	.release	= ks_attr_release,
+	.read		= ks_retval_read,
+	.write	= ks_retval_write,
+	.llseek	= default_llseek,
+};
+
+static ssize_t ks_hits_read(struct file *file, char __user *ubuf,
+			    size_t count, loff_t *ppos)
+{
+	struct ks_attr *attr = file->private_data;
+	char buf[32];
+	unsigned long hits;
+	int ret, len;
+
+	mutex_lock(&ks_lock);
+	ret = ks_attr_check_live(attr);
+	hits = ks_attr_hits(attr);
+	mutex_unlock(&ks_lock);
+	if (ret)
+		return ret;
+	len = scnprintf(buf, sizeof(buf), "%lu\n", hits);
+	return simple_read_from_buffer(ubuf, count, ppos, buf, len);
+}
+
+static const struct file_operations ks_hits_fops = {
+	.open		= ks_attr_open,
+	.release	= ks_attr_release,
+	.read		= ks_hits_read,
+	.llseek		= default_llseek,
+};
+
+static int ks_create_attr_dir(struct ks_attr *attr)
+{
+	struct dentry *d;
+
+	attr->dir = securityfs_create_dir(attr->kp.symbol_name, ks_fn_dir);
+	if (IS_ERR(attr->dir))
+		return PTR_ERR(attr->dir);
+
+	/* ks_attr_open looks the attr up by name; i_private is unused. */
+	d = securityfs_create_file("retval", 0600, attr->dir,
+				   NULL, &ks_retval_fops);
+	if (IS_ERR(d))
+		goto err;
+	d = securityfs_create_file("hits", 0400, attr->dir,
+				   NULL, &ks_hits_fops);
+	if (IS_ERR(d))
+		goto err;
+	return 0;
+err:
+	securityfs_remove(attr->dir);
+	attr->dir = NULL;
+	return PTR_ERR(d);
+}
+
+/* ------------------------------------------------------------------ *
+ * Engage / disengage                                                 *
+ * ------------------------------------------------------------------ */
+
+static int __ks_engage(const char *symbol, long retval, bool from_cmdline)
+{
+	struct ks_attr *attr;
+	int ret;
+
+	if (!symbol || !*symbol)
+		return -EINVAL;
+
+	if (!from_cmdline) {
+		ret = security_locked_down(LOCKDOWN_KILLSWITCH);
+		if (ret)
+			return ret;
+	}
+
+	mutex_lock(&ks_lock);
+
+	if (ks_attr_lookup(symbol)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	attr = ks_attr_alloc(symbol);
+	if (!attr) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	atomic_long_set(&attr->retval, retval);
+
+	ret = register_kprobe(&attr->kp);
+	if (ret) {
+		pr_warn("killswitch: register_kprobe(%s) failed: %d\n",
+			symbol, ret);
+		ks_attr_put(attr);
+		goto out_unlock;
+	}
+
+	ret = ks_create_attr_dir(attr);
+	if (ret) {
+		unregister_kprobe(&attr->kp);
+		ks_attr_put(attr);
+		goto out_unlock;
+	}
+
+	list_add_tail(&attr->list, &ks_engaged_list);
+	attr->engaged = true;
+	add_taint(TAINT_KILLSWITCH, LOCKDEP_STILL_OK);
+
+	if (from_cmdline) {
+		pr_warn("killswitch: engage %s=%ld source=cmdline\n",
+			symbol, retval);
+	} else {
+		pr_warn("killswitch: engage %s=%ld uid=%u auid=%u ses=%u comm=%s\n",
+			symbol, retval,
+			from_kuid(&init_user_ns, current_uid()),
+			from_kuid(&init_user_ns, audit_get_loginuid(current)),
+			audit_get_sessionid(current),
+			current->comm);
+	}
+	ret = 0;
+
+out_unlock:
+	mutex_unlock(&ks_lock);
+	return ret;
+}
+
+int killswitch_engage(const char *symbol, long retval)
+{
+	return __ks_engage(symbol, retval, false);
+}
+
+static int __ks_disengage(const char *symbol)
+{
+	struct ks_attr *attr;
+	unsigned long hits;
+	int ret = 0;
+
+	mutex_lock(&ks_lock);
+	attr = ks_attr_lookup(symbol);
+	if (!attr) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	unregister_kprobe(&attr->kp);
+	attr->engaged = false;
+	list_del(&attr->list);
+	hits = ks_attr_hits(attr);
+	securityfs_remove(attr->dir);
+
+	pr_warn("killswitch: disengage %s hits=%lu uid=%u auid=%u ses=%u comm=%s\n",
+		symbol, hits,
+		from_kuid(&init_user_ns, current_uid()),
+		from_kuid(&init_user_ns, audit_get_loginuid(current)),
+		audit_get_sessionid(current),
+		current->comm);
+
+	/* unregister_kprobe() already waited out in-flight pre-handlers. */
+	ks_attr_put(attr);
+
+out_unlock:
+	mutex_unlock(&ks_lock);
+	return ret;
+}
+
+int killswitch_disengage(const char *symbol)
+{
+	return __ks_disengage(symbol);
+}
+
+bool killswitch_is_engaged(const char *symbol)
+{
+	bool engaged;
+
+	mutex_lock(&ks_lock);
+	engaged = ks_attr_lookup(symbol) != NULL;
+	mutex_unlock(&ks_lock);
+	return engaged;
+}
+
+static void ks_disengage_all_locked(void)
+{
+	struct ks_attr *attr, *n;
+
+	list_for_each_entry_safe(attr, n, &ks_engaged_list, list) {
+		unregister_kprobe(&attr->kp);
+		attr->engaged = false;
+		list_del(&attr->list);
+		securityfs_remove(attr->dir);
+		pr_warn("killswitch: disengage %s hits=%lu (disengage_all)\n",
+			attr->kp.symbol_name, ks_attr_hits(attr));
+		ks_attr_put(attr);
+	}
+}
+
+/* ------------------------------------------------------------------ *
+ * Module unload: drop engagements on functions in the going module   *
+ * ------------------------------------------------------------------ */
+
+static int ks_module_notify(struct notifier_block *nb, unsigned long action,
+			    void *data)
+{
+	struct module *mod = data;
+	struct ks_attr *attr, *n;
+
+	if (action != MODULE_STATE_GOING)
+		return NOTIFY_DONE;
+
+	mutex_lock(&ks_lock);
+	list_for_each_entry_safe(attr, n, &ks_engaged_list, list) {
+		if (!attr->kp.addr ||
+		    __module_address((unsigned long)attr->kp.addr) != mod)
+			continue;
+
+		pr_warn("killswitch: %s mitigation lost: module %s unloading; re-engage after reload if still needed\n",
+			attr->kp.symbol_name, mod->name);
+		unregister_kprobe(&attr->kp);
+		attr->engaged = false;
+		list_del(&attr->list);
+		securityfs_remove(attr->dir);
+		ks_attr_put(attr);
+	}
+	mutex_unlock(&ks_lock);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ks_module_nb = {
+	.notifier_call = ks_module_notify,
+};
+
+/* ------------------------------------------------------------------ *
+ * Top-level securityfs files: control / engaged / taint              *
+ * ------------------------------------------------------------------ */
+
+static int ks_engaged_show(struct seq_file *m, void *v)
+{
+	struct ks_attr *attr;
+
+	mutex_lock(&ks_lock);
+	list_for_each_entry(attr, &ks_engaged_list, list) {
+		seq_printf(m, "%s retval=%ld hits=%lu\n",
+			   attr->kp.symbol_name,
+			   atomic_long_read(&attr->retval),
+			   ks_attr_hits(attr));
+	}
+	mutex_unlock(&ks_lock);
+	return 0;
+}
+
+static int ks_engaged_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ks_engaged_show, NULL);
+}
+
+static const struct file_operations ks_engaged_fops = {
+	.open		= ks_engaged_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static ssize_t ks_taint_read(struct file *file, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	char buf[4];
+	int len;
+
+	len = scnprintf(buf, sizeof(buf), "%d\n",
+			test_taint(TAINT_KILLSWITCH) ? 1 : 0);
+	return simple_read_from_buffer(ubuf, count, ppos, buf, len);
+}
+
+static const struct file_operations ks_taint_fops = {
+	.open	= simple_open,
+	.read	= ks_taint_read,
+	.llseek	= default_llseek,
+};
+
+/*
+ * control: parse one of:
+ *   engage <symbol> <retval>
+ *   disengage <symbol>
+ *   disengage_all
+ */
+static ssize_t ks_control_write(struct file *file, const char __user *ubuf,
+				size_t count, loff_t *ppos)
+{
+	char *buf, *cur, *verb, *sym, *retstr;
+	long retval = 0;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (count == 0 || count > 4096)
+		return -EINVAL;
+
+	buf = memdup_user_nul(ubuf, count);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	cur = strim(buf);
+	verb = strsep(&cur, " \t\n");
+	if (!verb || !*verb) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!strcmp(verb, "disengage_all")) {
+		mutex_lock(&ks_lock);
+		ks_disengage_all_locked();
+		mutex_unlock(&ks_lock);
+		ret = count;
+		goto out;
+	}
+
+	sym = strsep(&cur, " \t\n");
+	if (!sym || !*sym) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!strcmp(verb, "disengage")) {
+		ret = __ks_disengage(sym);
+		ret = ret ? ret : count;
+		goto out;
+	}
+
+	if (strcmp(verb, "engage")) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	retstr = strsep(&cur, " \t\n");
+	if (!retstr || !*retstr) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (kstrtol(retstr, 0, &retval)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = killswitch_engage(sym, retval);
+	if (!ret)
+		ret = count;
+
+out:
+	kfree(buf);
+	return ret;
+}
+
+static const struct file_operations ks_control_fops = {
+	.open	= simple_open,
+	.write	= ks_control_write,
+	.llseek	= noop_llseek,
+};
+
+/* ------------------------------------------------------------------ *
+ * Boot parameter:                                                    *
+ *   killswitch=fn1=-1,fn2=0,fn3=-22                                  *
+ * ------------------------------------------------------------------ */
+
+#define KS_BOOT_BUF 1024
+static char ks_boot_buf[KS_BOOT_BUF] __initdata;
+static bool ks_boot_present __initdata;
+
+static int __init ks_boot_setup(char *str)
+{
+	if (!str)
+		return 0;
+	strscpy(ks_boot_buf, str, sizeof(ks_boot_buf));
+	ks_boot_present = true;
+	return 1;
+}
+__setup("killswitch=", ks_boot_setup);
+
+static void __init ks_apply_boot_params(void)
+{
+	char *cur, *tok;
+	long retval;
+
+	if (!ks_boot_present)
+		return;
+
+	cur = ks_boot_buf;
+	while ((tok = strsep(&cur, ",")) != NULL) {
+		char *eq, *sym, *retstr;
+
+		if (!*tok)
+			continue;
+		eq = strchr(tok, '=');
+		if (!eq) {
+			pr_warn("killswitch: cmdline missing '=': %s\n", tok);
+			continue;
+		}
+		*eq++ = '\0';
+		sym = tok;
+		retstr = eq;
+
+		if (kstrtol(retstr, 0, &retval)) {
+			pr_warn("killswitch: cmdline bad retval %s=%s\n",
+				sym, retstr);
+			continue;
+		}
+
+		if (__ks_engage(sym, retval, true))
+			pr_warn("killswitch: cmdline engage %s failed\n", sym);
+	}
+}
+
+/* ------------------------------------------------------------------ *
+ * Init                                                               *
+ * ------------------------------------------------------------------ */
+
+static int __init killswitch_init(void)
+{
+	struct dentry *d;
+
+	ks_root_dir = securityfs_create_dir("killswitch", NULL);
+	if (IS_ERR(ks_root_dir))
+		return PTR_ERR(ks_root_dir);
+
+	d = securityfs_create_file("control", 0200, ks_root_dir,
+				   NULL, &ks_control_fops);
+	if (IS_ERR(d))
+		goto err;
+	d = securityfs_create_file("engaged", 0444, ks_root_dir,
+				   NULL, &ks_engaged_fops);
+	if (IS_ERR(d))
+		goto err;
+	d = securityfs_create_file("taint", 0444, ks_root_dir,
+				   NULL, &ks_taint_fops);
+	if (IS_ERR(d))
+		goto err;
+
+	ks_fn_dir = securityfs_create_dir("fn", ks_root_dir);
+	if (IS_ERR(ks_fn_dir)) {
+		d = ks_fn_dir;
+		goto err;
+	}
+
+	register_module_notifier(&ks_module_nb);
+	ks_apply_boot_params();
+
+	pr_info("killswitch: ready (sysfs at /sys/kernel/security/killswitch/)\n");
+	return 0;
+
+err:
+	securityfs_remove(ks_root_dir);
+	return PTR_ERR(d);
+}
+late_initcall(killswitch_init);
+
+/* ------------------------------------------------------------------ *
+ * KUnit tests                                                        *
+ * ------------------------------------------------------------------ */
+
+#if IS_ENABLED(CONFIG_KUNIT)
+#include <kunit/test.h>
+
+/* Non-static so kallsyms resolves them without CONFIG_KALLSYMS_ALL. */
+int ks_kunit_target_int(int x);
+void *ks_kunit_target_ptr(int x);
+
+#if __has_attribute(__noipa__)
+# define KS_KUNIT_NOIPA __attribute__((__noipa__))
+#else
+# define KS_KUNIT_NOIPA noinline __noclone
+#endif
+
+KS_KUNIT_NOIPA int ks_kunit_target_int(int x)
+{
+	return x + 1;
+}
+
+KS_KUNIT_NOIPA void *ks_kunit_target_ptr(int x)
+{
+	return ERR_PTR(-EIO);
+}
+
+static int ks_kunit_init(struct kunit *test)
+{
+	if (security_locked_down(LOCKDOWN_KILLSWITCH))
+		kunit_skip(test, "integrity lockdown blocks killswitch_engage()");
+	return 0;
+}
+
+static int ks_kunit_init_lockdown(struct kunit *test)
+{
+	if (!security_locked_down(LOCKDOWN_KILLSWITCH))
+		kunit_skip(test, "requires lockdown=integrity");
+	return 0;
+}
+
+static void ks_disengage_quiet(const char *sym)
+{
+	if (killswitch_is_engaged(sym))
+		killswitch_disengage(sym);
+}
+
+static void ks_test_engage_int(struct kunit *test)
+{
+	int ret;
+
+	ret = killswitch_engage("ks_kunit_target_int", -EPERM);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+	KUNIT_EXPECT_EQ(test, ks_kunit_target_int(7), -EPERM);
+	KUNIT_EXPECT_EQ(test, killswitch_disengage("ks_kunit_target_int"), 0);
+	KUNIT_EXPECT_EQ(test, ks_kunit_target_int(7), 8);
+}
+
+static void ks_test_double_engage(struct kunit *test)
+{
+	KUNIT_ASSERT_EQ(test,
+		killswitch_engage("ks_kunit_target_int", 0), 0);
+	KUNIT_EXPECT_EQ(test,
+		killswitch_engage("ks_kunit_target_int", 0), -EBUSY);
+	ks_disengage_quiet("ks_kunit_target_int");
+}
+
+static void ks_test_disengage_unknown(struct kunit *test)
+{
+	KUNIT_EXPECT_EQ(test,
+		killswitch_disengage("ks_kunit_target_int"), -ENOENT);
+}
+
+static void ks_test_pointer_target(struct kunit *test)
+{
+	long retval = (long)(unsigned long)ERR_PTR(-EACCES);
+
+	KUNIT_ASSERT_EQ(test,
+		killswitch_engage("ks_kunit_target_ptr", retval), 0);
+	KUNIT_EXPECT_TRUE(test, IS_ERR(ks_kunit_target_ptr(0)));
+	KUNIT_EXPECT_EQ(test, PTR_ERR(ks_kunit_target_ptr(0)), -EACCES);
+	ks_disengage_quiet("ks_kunit_target_ptr");
+}
+
+static void ks_test_taint_set(struct kunit *test)
+{
+	KUNIT_ASSERT_EQ(test,
+		killswitch_engage("ks_kunit_target_int", 0), 0);
+	KUNIT_EXPECT_TRUE(test, test_taint(TAINT_KILLSWITCH));
+	ks_disengage_quiet("ks_kunit_target_int");
+	/* taint must persist even after disengage */
+	KUNIT_EXPECT_TRUE(test, test_taint(TAINT_KILLSWITCH));
+}
+
+static void ks_test_hits_counter(struct kunit *test)
+{
+	struct ks_attr *attr;
+	int i;
+
+	KUNIT_ASSERT_EQ(test,
+		killswitch_engage("ks_kunit_target_int", 0), 0);
+
+	for (i = 0; i < 17; i++)
+		(void)ks_kunit_target_int(i);
+
+	mutex_lock(&ks_lock);
+	attr = ks_attr_lookup("ks_kunit_target_int");
+	KUNIT_EXPECT_NOT_NULL(test, attr);
+	if (attr)
+		KUNIT_EXPECT_EQ(test, ks_attr_hits(attr), 17UL);
+	mutex_unlock(&ks_lock);
+
+	ks_disengage_quiet("ks_kunit_target_int");
+}
+
+static struct kunit_case ks_kunit_cases[] = {
+	KUNIT_CASE(ks_test_engage_int),
+	KUNIT_CASE(ks_test_double_engage),
+	KUNIT_CASE(ks_test_disengage_unknown),
+	KUNIT_CASE(ks_test_pointer_target),
+	KUNIT_CASE(ks_test_taint_set),
+	KUNIT_CASE(ks_test_hits_counter),
+	{}
+};
+
+static struct kunit_suite ks_kunit_suite = {
+	.name = "killswitch",
+	.init = ks_kunit_init,
+	.test_cases = ks_kunit_cases,
+};
+
+/*
+ * Lockdown suite. Skipped unless the kernel was booted with
+ * lockdown=integrity (or higher). Run together with
+ * killswitch=ks_kunit_target_int=... on the same cmdline to also
+ * exercise the cmdline-bypass and disengage-under-lockdown paths.
+ */
+static void ks_test_lockdown_runtime_engage(struct kunit *test)
+{
+	KUNIT_EXPECT_EQ(test,
+		killswitch_engage("ks_kunit_target_int", 0), -EPERM);
+}
+
+static void ks_test_lockdown_cmdline_disengage(struct kunit *test)
+{
+	if (!killswitch_is_engaged("ks_kunit_target_int"))
+		kunit_skip(test,
+			   "requires killswitch=ks_kunit_target_int=... on cmdline");
+	KUNIT_EXPECT_EQ(test,
+		killswitch_disengage("ks_kunit_target_int"), 0);
+}
+
+static struct kunit_case ks_kunit_lockdown_cases[] = {
+	KUNIT_CASE(ks_test_lockdown_runtime_engage),
+	KUNIT_CASE(ks_test_lockdown_cmdline_disengage),
+	{}
+};
+
+static struct kunit_suite ks_kunit_lockdown_suite = {
+	.name = "killswitch_lockdown",
+	.init = ks_kunit_init_lockdown,
+	.test_cases = ks_kunit_lockdown_cases,
+};
+
+kunit_test_suites(&ks_kunit_suite, &ks_kunit_lockdown_suite);
+
+#endif /* CONFIG_KUNIT */
+
diff --git a/kernel/panic.c b/kernel/panic.c
index 20feada5319d4..8ee174c7b7dd0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -825,6 +825,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
 	TAINT_FLAG(RANDSTRUCT,			'T', ' '),
 	TAINT_FLAG(TEST,			'N', ' '),
 	TAINT_FLAG(FWCTL,			'J', ' '),
+	TAINT_FLAG(KILLSWITCH,			'H', ' '),
 };
 
 #undef TAINT_FLAG
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8ff5adcfe1e0a..5770639c7b0ea 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -3349,6 +3349,19 @@ config TEST_HMM
 
 	  If unsure, say N.
 
+config TEST_KILLSWITCH
+	tristate "Test module for the killswitch mitigation primitive"
+	depends on KILLSWITCH && DEBUG_FS
+	depends on m
+	help
+	  Build a module that exposes a deliberately-vulnerable function
+	  ks_test_vuln() and a debugfs trigger /sys/kernel/debug/test_killswitch/fire.
+	  The killswitch selftest in tools/testing/selftests/killswitch/
+	  uses this to confirm engaging a killswitch suppresses the -EBADMSG
+	  error the function would otherwise return on its magic input.
+
+	  If unsure, say N.
+
 config TEST_FREE_PAGES
 	tristate "Test freeing pages"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index f33a24bf1c19a..d763225340674 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
 obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
 obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o
 obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o
+obj-$(CONFIG_TEST_KILLSWITCH) += test_killswitch.o
 obj-$(CONFIG_TEST_HMM) += test_hmm.o
 obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o
 obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o
diff --git a/lib/test_killswitch.c b/lib/test_killswitch.c
new file mode 100644
index 0000000000000..cc2584ad652ff
--- /dev/null
+++ b/lib/test_killswitch.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test target for the killswitch selftest.  ks_test_vuln() returns
+ * -EBADMSG on a magic input, standing in for "the buggy path runs
+ * and produces a bad outcome".  Engaging killswitch on this function
+ * with retval 0 is the mitigation.
+ *
+ * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#define KS_TEST_MAGIC	0xC0FFEEL
+
+int ks_test_vuln(long magic);
+
+/*
+ * Returns -EBADMSG on the magic input -- stands in for "the buggy
+ * path runs and produces a bad outcome".  Engaging a killswitch on
+ * this function with retval 0 represents the mitigation: even on
+ * the magic input, callers see success because the body never runs.
+ *
+ * noinline keeps the function out of line so the call actually
+ * reaches the kprobe-instrumented entry point.
+ */
+noinline int ks_test_vuln(long magic)
+{
+	if (magic == KS_TEST_MAGIC)
+		return -EBADMSG;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ks_test_vuln);
+
+static struct dentry *ks_test_dir;
+
+static ssize_t ks_test_fire_write(struct file *file, const char __user *ubuf,
+				  size_t count, loff_t *ppos)
+{
+	char buf[32];
+	long magic;
+	int ret;
+
+	if (count == 0 || count >= sizeof(buf))
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, count))
+		return -EFAULT;
+	buf[count] = '\0';
+
+	ret = kstrtol(strim(buf), 0, &magic);
+	if (ret)
+		return ret;
+
+	ret = ks_test_vuln(magic);
+	return ret ? ret : count;
+}
+
+static const struct file_operations ks_test_fire_fops = {
+	.write	= ks_test_fire_write,
+	.open	= simple_open,
+	.llseek	= noop_llseek,
+};
+
+static int __init test_killswitch_init(void)
+{
+	ks_test_dir = debugfs_create_dir("test_killswitch", NULL);
+	debugfs_create_file("fire", 0200, ks_test_dir, NULL,
+			    &ks_test_fire_fops);
+	pr_info("test_killswitch: loaded (magic=0x%lx)\n", KS_TEST_MAGIC);
+	return 0;
+}
+module_init(test_killswitch_init);
+
+static void __exit test_killswitch_exit(void)
+{
+	debugfs_remove_recursive(ks_test_dir);
+}
+module_exit(test_killswitch_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Deliberately-vulnerable target for killswitch selftest");
diff --git a/security/security.c b/security/security.c
index 4e999f0236516..bf700abc911a9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -62,6 +62,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
 	[LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
 	[LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection",
 	[LOCKDOWN_XEN_USER_ACTIONS] = "Xen guest user action",
+	[LOCKDOWN_KILLSWITCH] = "engaging a killswitch",
 	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
 	[LOCKDOWN_KCORE] = "/proc/kcore access",
 	[LOCKDOWN_KPROBES] = "use of kprobes",
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 6e59b8f63e416..04c3f8c5ff229 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -53,6 +53,7 @@ TARGETS += ipc
 TARGETS += ir
 TARGETS += kcmp
 TARGETS += kexec
+TARGETS += killswitch
 TARGETS += kselftest_harness
 TARGETS += kvm
 TARGETS += landlock
diff --git a/tools/testing/selftests/killswitch/.gitignore b/tools/testing/selftests/killswitch/.gitignore
new file mode 100644
index 0000000000000..cbf204ce18615
--- /dev/null
+++ b/tools/testing/selftests/killswitch/.gitignore
@@ -0,0 +1,2 @@
+cve_31431_test
+cve_43284_test
diff --git a/tools/testing/selftests/killswitch/Makefile b/tools/testing/selftests/killswitch/Makefile
new file mode 100644
index 0000000000000..ccf41165cb73d
--- /dev/null
+++ b/tools/testing/selftests/killswitch/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+TEST_GEN_PROGS := cve_31431_test cve_43284_test
+TEST_PROGS := killswitch_test.sh
+
+CFLAGS += -O2 -g -std=gnu99 -Wall $(KHDR_INCLUDES)
+
+include ../lib.mk
diff --git a/tools/testing/selftests/killswitch/cve_31431_test.c b/tools/testing/selftests/killswitch/cve_31431_test.c
new file mode 100644
index 0000000000000..1ff817c51d881
--- /dev/null
+++ b/tools/testing/selftests/killswitch/cve_31431_test.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AF_ALG AEAD round-trip prober.  The killswitch selftest uses this
+ * to demonstrate that engaging a killswitch on af_alg_sendmsg
+ * neuters AF_ALG operations (sendmsg returns -EPERM), mitigating
+ * any AF_ALG-reachable bug whose exploit primitive runs from the
+ * send path.
+ *
+ * Exit codes:
+ *   0  AEAD round-trip succeeded (function intact)
+ *   1  AEAD round-trip refused (mitigation engaged)
+ *   2  setup error (no AF_ALG, missing aead/gcm(aes), etc.) -> SKIP
+ *
+ * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <linux/if_alg.h>
+
+#define KEY_LEN		16
+#define IV_LEN		12
+#define AAD_LEN		16
+#define PT_LEN		64
+#define TAG_LEN		16
+#define EXPECTED_LEN	(AAD_LEN + PT_LEN + TAG_LEN)
+
+#ifndef AF_ALG
+#define AF_ALG		38
+#endif
+#ifndef SOL_ALG
+#define SOL_ALG		279
+#endif
+
+int main(void)
+{
+	struct sockaddr_alg sa = {
+		.salg_family = AF_ALG,
+		.salg_type   = "aead",
+		.salg_name   = "gcm(aes)",
+	};
+	unsigned char key[KEY_LEN] = { 0 };
+	unsigned char iv[IV_LEN]   = { 0 };
+	unsigned char buf[1024]    = { 0 };
+	struct msghdr msg = { 0 };
+	struct iovec iov;
+	struct cmsghdr *cmsg;
+	struct af_alg_iv *aiv;
+	char cbuf[256] = { 0 };
+	int *p_op, *p_assoclen;
+	int sk, opfd;
+	ssize_t n;
+
+	sk = socket(AF_ALG, SOCK_SEQPACKET, 0);
+	if (sk < 0) {
+		fprintf(stderr, "AF_ALG socket: %s -- skip\n", strerror(errno));
+		return 2;
+	}
+	if (bind(sk, (struct sockaddr *)&sa, sizeof(sa))) {
+		fprintf(stderr, "bind aead/gcm(aes): %s -- skip\n",
+			strerror(errno));
+		close(sk);
+		return 2;
+	}
+	if (setsockopt(sk, SOL_ALG, ALG_SET_KEY, key, KEY_LEN)) {
+		fprintf(stderr, "ALG_SET_KEY: %s -- skip\n", strerror(errno));
+		close(sk);
+		return 2;
+	}
+	if (setsockopt(sk, SOL_ALG, ALG_SET_AEAD_AUTHSIZE, NULL, TAG_LEN)) {
+		fprintf(stderr, "ALG_SET_AEAD_AUTHSIZE: %s -- skip\n",
+			strerror(errno));
+		close(sk);
+		return 2;
+	}
+
+	opfd = accept(sk, NULL, 0);
+	if (opfd < 0) {
+		fprintf(stderr, "accept: %s -- skip\n", strerror(errno));
+		close(sk);
+		return 2;
+	}
+
+	/* control message: ENCRYPT op + IV + assoclen */
+	msg.msg_control    = cbuf;
+	msg.msg_controllen = CMSG_SPACE(sizeof(int))
+			   + CMSG_SPACE(sizeof(*aiv) + IV_LEN)
+			   + CMSG_SPACE(sizeof(int));
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_ALG;
+	cmsg->cmsg_type  = ALG_SET_OP;
+	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
+	p_op = (int *)CMSG_DATA(cmsg);
+	*p_op = ALG_OP_ENCRYPT;
+
+	cmsg = CMSG_NXTHDR(&msg, cmsg);
+	cmsg->cmsg_level = SOL_ALG;
+	cmsg->cmsg_type  = ALG_SET_IV;
+	cmsg->cmsg_len   = CMSG_LEN(sizeof(*aiv) + IV_LEN);
+	aiv = (struct af_alg_iv *)CMSG_DATA(cmsg);
+	aiv->ivlen = IV_LEN;
+	memcpy(aiv->iv, iv, IV_LEN);
+
+	cmsg = CMSG_NXTHDR(&msg, cmsg);
+	cmsg->cmsg_level = SOL_ALG;
+	cmsg->cmsg_type  = ALG_SET_AEAD_ASSOCLEN;
+	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
+	p_assoclen = (int *)CMSG_DATA(cmsg);
+	*p_assoclen = AAD_LEN;
+
+	/* AAD || plaintext */
+	memset(buf, 0xaa, AAD_LEN);
+	memset(buf + AAD_LEN, 0x55, PT_LEN);
+	iov.iov_base = buf;
+	iov.iov_len  = AAD_LEN + PT_LEN;
+	msg.msg_iov    = &iov;
+	msg.msg_iovlen = 1;
+
+	n = sendmsg(opfd, &msg, 0);
+	if (n < 0) {
+		/*
+		 * sendmsg refused: this is exactly the killswitch
+		 * af_alg_sendmsg=-EPERM mitigation outcome.  Distinct
+		 * exit code from setup failure so the test script can
+		 * tell them apart.
+		 */
+		fprintf(stderr, "sendmsg: %s -- mitigation engaged?\n",
+			strerror(errno));
+		close(opfd); close(sk);
+		return 1;
+	}
+
+	/* recv: AAD echoed, plus ciphertext + tag */
+	memset(buf, 0, sizeof(buf));
+	n = read(opfd, buf, EXPECTED_LEN);
+	close(opfd); close(sk);
+
+	if (n == 0) {
+		printf("AEAD returned 0 bytes -- killswitch mitigation engaged\n");
+		return 1;
+	}
+	if (n != EXPECTED_LEN) {
+		fprintf(stderr,
+			"AEAD short read: got %zd, expected %d -- mitigated?\n",
+			n, EXPECTED_LEN);
+		return 1;
+	}
+
+	/* sanity: a constant-byte run after the AAD means no encryption */
+	if (memcmp(buf + AAD_LEN, buf + AAD_LEN + 1, PT_LEN - 1) == 0) {
+		fprintf(stderr, "AEAD output looks unencrypted\n");
+		return 2;
+	}
+
+	printf("AEAD round-trip OK (%zd bytes)\n", n);
+	return 0;
+}
diff --git a/tools/testing/selftests/killswitch/cve_43284_test.c b/tools/testing/selftests/killswitch/cve_43284_test.c
new file mode 100644
index 0000000000000..4771cb0957dc1
--- /dev/null
+++ b/tools/testing/selftests/killswitch/cve_43284_test.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * UDP loopback round-trip prober.  Wrapped by killswitch_test.sh with
+ * an IPsec ESP SA + policy pair on loopback, this demonstrates that
+ * engaging a killswitch on esp_input drops inbound ESP packets before
+ * decapsulation, mitigating CVE-2026-43284 ("Dirty Frag", upstream fix
+ * xfrm: esp: avoid in-place decrypt on shared skb frags).
+ *
+ * The binary itself knows nothing about ESP -- it sends one UDP
+ * datagram to itself and waits up to a second for delivery.
+ *
+ * Exit codes:
+ *   0  UDP round-trip succeeded (no mitigation in effect)
+ *   1  UDP recv timed out (mitigation engaged)
+ *   2  setup error -> SKIP
+ *
+ * Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+ */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#define UDP_PORT 53435
+#define PROBE    "ks-43284-probe"
+
+int main(void)
+{
+	struct sockaddr_in addr = {
+		.sin_family      = AF_INET,
+		.sin_port        = htons(UDP_PORT),
+		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+	};
+	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
+	char buf[64];
+	int sk;
+	ssize_t n;
+
+	sk = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sk < 0) {
+		fprintf(stderr, "socket: %s -- skip\n", strerror(errno));
+		return 2;
+	}
+	if (bind(sk, (struct sockaddr *)&addr, sizeof(addr))) {
+		fprintf(stderr, "bind: %s -- skip\n", strerror(errno));
+		close(sk);
+		return 2;
+	}
+	if (setsockopt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
+		fprintf(stderr, "SO_RCVTIMEO: %s -- skip\n", strerror(errno));
+		close(sk);
+		return 2;
+	}
+
+	if (sendto(sk, PROBE, sizeof(PROBE) - 1, 0,
+		   (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		fprintf(stderr, "sendto: %s -- skip\n", strerror(errno));
+		close(sk);
+		return 2;
+	}
+
+	memset(buf, 0, sizeof(buf));
+	n = recvfrom(sk, buf, sizeof(buf), 0, NULL, NULL);
+	close(sk);
+
+	if (n < 0) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			fprintf(stderr,
+				"recvfrom: timeout -- mitigation engaged?\n");
+			return 1;
+		}
+		fprintf(stderr, "recvfrom: %s\n", strerror(errno));
+		return 2;
+	}
+	if (n != (ssize_t)(sizeof(PROBE) - 1) ||
+	    memcmp(buf, PROBE, sizeof(PROBE) - 1)) {
+		fprintf(stderr, "recvfrom: bad payload (%zd bytes)\n", n);
+		return 2;
+	}
+
+	printf("UDP round-trip OK (%zd bytes)\n", n);
+	return 0;
+}
diff --git a/tools/testing/selftests/killswitch/killswitch_test.sh b/tools/testing/selftests/killswitch/killswitch_test.sh
new file mode 100755
index 0000000000000..ea3fd394a984f
--- /dev/null
+++ b/tools/testing/selftests/killswitch/killswitch_test.sh
@@ -0,0 +1,254 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# End-to-end killswitch selftest.  Drives the test_killswitch module
+# through an engage/disengage cycle and confirms each transition
+# behaves as expected.  Also runs the AF_ALG and IPsec ESP mitigation proofs.
+#
+# Requirements (see Documentation/admin-guide/killswitch.rst):
+#   - CONFIG_KILLSWITCH=y
+#   - CONFIG_TEST_KILLSWITCH=m
+#   - run as root (CAP_SYS_ADMIN)
+#
+# Copyright (C) 2026 Sasha Levin <sashal@kernel.org>
+#
+
+set -u
+
+KS=/sys/kernel/security/killswitch
+TRIG=/sys/kernel/debug/test_killswitch/fire
+
+NOMOD=0
+SKIP_RC=4
+N=0
+FAIL=0
+
+ksft_pass() { N=$((N+1));    echo "ok $N - $*"; }
+ksft_fail() { N=$((N+1)); FAIL=$((FAIL+1)); echo "not ok $N - $*"; }
+ksft_skip() { echo "ok 1 - SKIP $*"; echo "1..1"; exit $SKIP_RC; }
+
+[[ $EUID -eq 0 ]] || ksft_skip "must be root"
+[[ -d $KS    ]] || ksft_skip "$KS not present (CONFIG_KILLSWITCH disabled?)"
+
+if ! modprobe test_killswitch 2>/dev/null; then
+	NOMOD=1
+fi
+[[ -e $TRIG ]] || ksft_skip "$TRIG missing (test_killswitch.ko not installed?)"
+
+cleanup() {
+	echo "disengage_all" > $KS/control 2>/dev/null || true
+	[[ $NOMOD -eq 0 ]] && rmmod test_killswitch 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# --- pre-engage: bad path runs, write fails with EBADMSG ---
+if echo 0xC0FFEE > $TRIG 2>/dev/null; then
+	ksft_fail "pre-engage: write should have failed (-EBADMSG)"
+else
+	[[ $? -ne 0 ]] && ksft_pass "pre-engage: bad path returns error" \
+	             || ksft_fail "pre-engage: unexpected outcome"
+fi
+
+# --- engage ---
+echo "engage ks_test_vuln 0" > $KS/control
+grep -q "^ks_test_vuln" $KS/engaged \
+	&& ksft_pass "engage: ks_test_vuln in engaged list" \
+	|| ksft_fail "engage: missing from engaged list"
+
+[[ $(cat $KS/taint) == 1 ]] \
+	&& ksft_pass "engage: taint set" \
+	|| ksft_fail "engage: taint not set"
+
+[[ -d $KS/fn/ks_test_vuln ]] \
+	&& ksft_pass "engage: per-fn dir created" \
+	|| ksft_fail "engage: per-fn dir missing"
+
+# --- post-engage: BUG suppressed; write returns successfully ---
+if echo 0xC0FFEE > $TRIG 2>/dev/null; then
+	ksft_pass "post-engage: BUG suppressed, write succeeded"
+else
+	ksft_fail "post-engage: write should succeed"
+fi
+
+[[ $(cat $KS/fn/ks_test_vuln/hits) -ge 1 ]] \
+	&& ksft_pass "post-engage: hits counter incremented" \
+	|| ksft_fail "post-engage: hits counter did not move"
+
+# --- retval rewrite is a plain write (no validation) ---
+echo 7 > $KS/fn/ks_test_vuln/retval
+[[ $(cat $KS/fn/ks_test_vuln/retval) == 7 ]] \
+	&& ksft_pass "retval rewrite round-trips" \
+	|| ksft_fail "retval rewrite failed"
+
+# --- engage on a kprobe-rejected function fails ---
+# warn_thunk_thunk is in /sys/kernel/debug/kprobes/blacklist;
+# register_kprobe() refuses it.
+KP_REJECT=warn_thunk_thunk
+if echo "engage $KP_REJECT 0" > $KS/control 2>/dev/null; then
+	ksft_fail "register_kprobe should have rejected $KP_REJECT"
+	echo "disengage $KP_REJECT" > $KS/control
+else
+	ksft_pass "register_kprobe refuses blacklisted target"
+fi
+
+# --- disengage ---
+echo "disengage ks_test_vuln" > $KS/control
+[[ -z "$(cat $KS/engaged)" ]] \
+	&& ksft_pass "disengage: engaged list empty" \
+	|| ksft_fail "disengage: engaged list not empty"
+
+[[ ! -d $KS/fn/ks_test_vuln ]] \
+	&& ksft_pass "disengage: per-fn dir removed" \
+	|| ksft_fail "disengage: per-fn dir still present"
+
+[[ $(cat $KS/taint) == 1 ]] \
+	&& ksft_pass "disengage: taint persists" \
+	|| ksft_fail "disengage: taint should persist"
+
+# --- post-disengage: bad path active again ---
+if echo 0xC0FFEE > $TRIG 2>/dev/null; then
+	ksft_fail "post-disengage: write should fail again"
+else
+	ksft_pass "post-disengage: bad path active again"
+fi
+
+# ---- CVE-2026-31431 mitigation proof (AF_ALG aead via af_alg_sendmsg) ----
+# Skip the whole block if AF_ALG / AEAD machinery isn't compiled in.
+if [[ -x $(dirname "$0")/cve_31431_test ]]; then
+	CVE=$(dirname "$0")/cve_31431_test
+	$CVE >/dev/null 2>&1 && PRE=$? || PRE=$?
+	if [[ $PRE -eq 0 ]]; then
+		ksft_pass "cve-31431: pre-engage AEAD round-trip OK"
+
+		echo "engage af_alg_sendmsg -1" > $KS/control
+		$CVE >/dev/null 2>&1 && POST=$? || POST=$?
+		if [[ $POST -eq 1 ]]; then
+			ksft_pass "cve-31431: post-engage AEAD refused (mitigated)"
+		else
+			ksft_fail "cve-31431: post-engage exit=$POST (expected 1)"
+		fi
+
+		HITS=$(cat $KS/fn/af_alg_sendmsg/hits 2>/dev/null || echo 0)
+		[[ $HITS -ge 1 ]] && ksft_pass "cve-31431: hits=$HITS recorded" \
+			|| ksft_fail "cve-31431: hits not recorded"
+
+		echo "disengage af_alg_sendmsg" > $KS/control
+		$CVE >/dev/null 2>&1 && POST2=$? || POST2=$?
+		[[ $POST2 -eq 0 ]] && ksft_pass "cve-31431: post-disengage restored" \
+			|| ksft_fail "cve-31431: post-disengage exit=$POST2"
+	elif [[ $PRE -eq 2 ]]; then
+		echo "# SKIP cve-31431 (AF_ALG/AEAD not available)"
+	else
+		ksft_fail "cve-31431: pre-engage exit=$PRE"
+	fi
+fi
+
+# ---- CVE-2026-43284 mitigation proof (IPsec ESP via esp_input) ----
+# Engaging esp_input causes inbound ESP packets to be dropped before
+# decapsulation, neutering any bug downstream of the ESP receive path.
+# Two netns + veth so traffic actually traverses xfrm (single-netns
+# 127.0.0.0/8 traffic short-circuits before xfrm policy lookup).
+NS0=ks-esp-0
+NS1=ks-esp-1
+esp_setup_ok=0
+esp_cleanup() {
+	# Unconditional: a partial setup failure still leaves the netns behind.
+	ip netns del $NS0 2>/dev/null
+	ip netns del $NS1 2>/dev/null
+}
+trap 'cleanup; esp_cleanup' EXIT
+
+# UDP probe in python3 (always present on Debian/Fedora minimal installs).
+esp_round_trip() {
+	# $1: source netns, $2: dest netns, $3: dest ip, $4: port
+	local tmp rpid rc
+	tmp=$(mktemp)
+	ip netns exec "$2" python3 -c '
+import socket
+r = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+r.bind(("0.0.0.0", '"$4"'))
+r.settimeout(2.0)
+try:
+    d,_ = r.recvfrom(64)
+    print(d.decode(errors="replace"))
+except socket.timeout:
+    print("timeout")
+' > "$tmp" 2>&1 &
+	rpid=$!
+	sleep 0.3
+	ip netns exec "$1" python3 -c '
+import socket
+s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+s.sendto(b"ks-esp-probe", ("'"$3"'", '"$4"'))
+' 2>/dev/null
+	wait $rpid 2>/dev/null
+	rc=1
+	grep -q "ks-esp-probe" "$tmp" && rc=0
+	rm -f "$tmp"
+	return $rc
+}
+
+if command -v ip >/dev/null 2>&1 && command -v python3 >/dev/null 2>&1; then
+	KEY=0x0123456789abcdef0123456789abcdef01234567
+
+	if ip netns add $NS0 2>/dev/null && \
+	   ip netns add $NS1 2>/dev/null && \
+	   ip link add veth0 type veth peer name veth1 2>/dev/null && \
+	   ip link set veth0 netns $NS0 2>/dev/null && \
+	   ip link set veth1 netns $NS1 2>/dev/null && \
+	   ip -n $NS0 addr add 10.99.0.1/24 dev veth0 2>/dev/null && \
+	   ip -n $NS1 addr add 10.99.0.2/24 dev veth1 2>/dev/null && \
+	   ip -n $NS0 link set veth0 up 2>/dev/null && \
+	   ip -n $NS1 link set veth1 up 2>/dev/null && \
+	   ip -n $NS0 link set lo up 2>/dev/null && \
+	   ip -n $NS1 link set lo up 2>/dev/null && \
+	   ip -n $NS0 xfrm state add src 10.99.0.1 dst 10.99.0.2 proto esp \
+		spi 0x1000 mode transport reqid 0x100 \
+		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
+	   ip -n $NS0 xfrm state add src 10.99.0.2 dst 10.99.0.1 proto esp \
+		spi 0x1001 mode transport reqid 0x100 \
+		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
+	   ip -n $NS1 xfrm state add src 10.99.0.1 dst 10.99.0.2 proto esp \
+		spi 0x1000 mode transport reqid 0x100 \
+		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
+	   ip -n $NS1 xfrm state add src 10.99.0.2 dst 10.99.0.1 proto esp \
+		spi 0x1001 mode transport reqid 0x100 \
+		aead 'rfc4106(gcm(aes))' $KEY 128 2>/dev/null && \
+	   ip -n $NS0 xfrm policy add src 10.99.0.1 dst 10.99.0.2 \
+		dir out tmpl src 10.99.0.1 dst 10.99.0.2 proto esp \
+		reqid 0x100 mode transport 2>/dev/null && \
+	   ip -n $NS1 xfrm policy add src 10.99.0.1 dst 10.99.0.2 \
+		dir in tmpl src 10.99.0.1 dst 10.99.0.2 proto esp \
+		reqid 0x100 mode transport 2>/dev/null; then
+		esp_setup_ok=1
+	fi
+
+	if [[ $esp_setup_ok -eq 1 ]] \
+	   && esp_round_trip $NS0 $NS1 10.99.0.2 53435; then
+		ksft_pass "cve-43284: pre-engage ESP round-trip OK"
+
+		echo "engage esp_input -22" > $KS/control
+		if esp_round_trip $NS0 $NS1 10.99.0.2 53435; then
+			ksft_fail "cve-43284: post-engage ESP should have been dropped"
+		else
+			ksft_pass "cve-43284: post-engage ESP refused (mitigated)"
+		fi
+
+		ESP_HITS=$(cat $KS/fn/esp_input/hits 2>/dev/null || echo 0)
+		[[ $ESP_HITS -ge 1 ]] \
+			&& ksft_pass "cve-43284: hits=$ESP_HITS recorded" \
+			|| ksft_fail "cve-43284: hits not recorded"
+
+		echo "disengage esp_input" > $KS/control
+		if esp_round_trip $NS0 $NS1 10.99.0.2 53435; then
+			ksft_pass "cve-43284: post-disengage restored"
+		else
+			ksft_fail "cve-43284: post-disengage ESP still dropped"
+		fi
+	else
+		echo "# SKIP cve-43284 (netns/veth/XFRM/ESP setup failed)"
+	fi
+fi
+
+echo "1..$N"
+exit $((FAIL > 0))
-- 
2.53.0


^ permalink raw reply related

* Re: [RFC PATCH 1/6] livepatch: Support scoped atomic replace using replace set
From: Yafang Shao @ 2026-05-17 13:32 UTC (permalink / raw)
  To: sashiko-reviews
  Cc: mbenes, song, jikos, jpoimboe, pmladek, joe.lawrence,
	live-patching
In-Reply-To: <20260514205439.925F1C2BCB7@smtp.kernel.org>

On Fri, May 15, 2026 at 4:54 AM <sashiko-bot@kernel.org> wrote:
>
> Thank you for your contribution! Sashiko AI review found 4 potential issue(s) to consider:
> - [High] Module reference leak during forced transition.
> - [High] Silent semantic reversion for zero-initialized out-of-tree livepatches.
> - [High] Sysfs ABI breakage and incorrect type formatting for the `replace` attribute.
> - [Medium] Build failure in livepatch selftests.
> --
>
> commit 8a05e8a3ebd9241541926ac6aba19452a820c899
> Author: Yafang Shao <laoar.shao@gmail.com>
>
> livepatch: Support scoped atomic replace using replace set
>
> This commit converts the boolean replace attribute to a u32 replace_set
> to allow livepatches to atomically replace only existing patches within
> the same set. Patches with different replace sets can now remain active
> simultaneously.
>
> > diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
> > --- a/include/linux/livepatch.h
> > +++ b/include/linux/livepatch.h
> [ ... ]
> > @@ -137,7 +138,7 @@ struct klp_patch {
> >       struct module *mod;
> >       struct klp_object *objs;
> >       struct klp_state *states;
> > -     bool replace;
> > +     unsigned int replace_set;
>
> By removing the boolean replace flag, out-of-tree livepatches that
> zero-initialize their patch structures will now default to a replace_set of 0.
> Will this cause them to unintentionally replace all other zero-initialized
> patches instead of stacking safely?

As previously discussed, backward compatibility is not a concern.

>
> Additionally, does this rename break the livepatch selftests in
> tools/testing/selftests/livepatch/test_modules/ (such as test_klp_state2.c),
> which still explicitly set patch.replace = true?

Selftests are currently excluded from this RFC

>
> [ ... ]
> > diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
> > --- a/kernel/livepatch/core.c
> > +++ b/kernel/livepatch/core.c
> > @@ -454,7 +454,7 @@ static ssize_t replace_show(struct kobject *kobj,
> >       struct klp_patch *patch;
> >
> >       patch = container_of(kobj, struct klp_patch, kobj);
> > -     return sysfs_emit(buf, "%d\n", patch->replace);
> > +     return sysfs_emit(buf, "%d\n", patch->replace_set);
>
> Does changing the sysfs output from a boolean to a replace set ID break
> compatibility for userspace tools expecting strict 0 or 1 values from the
> replace attribute?
>
> Also, since replace_set is declared as an unsigned int, will using the
> signed %d format specifier cause large replace set IDs to overflow and
> display as negative numbers?

Should use "%u" instead.

>
> [ ... ]
> > diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
> > --- a/kernel/livepatch/transition.c
> > +++ b/kernel/livepatch/transition.c
> [ ... ]
> > @@ -720,11 +720,11 @@ void klp_force_transition(void)
> >               klp_update_patch_state(idle_task(cpu));
> >
> >       /* Set forced flag for patches being removed. */
> > -     if (klp_target_state == KLP_TRANSITION_UNPATCHED)
> > +     if (klp_target_state == KLP_TRANSITION_UNPATCHED) {
> >               klp_transition_patch->forced = true;
> > -     else if (klp_transition_patch->replace) {
> > +     } else {
> >               klp_for_each_patch(patch) {
> > -                     if (patch != klp_transition_patch)
> > +                     if (patch->replace_set == klp_transition_patch->replace_set)
> >                               patch->forced = true;
> >               }
> >       }
>
> Could this inadvertently set forced = true on the klp_transition_patch
> itself?
>
> Since klp_transition_patch was already added to the global list during
> klp_init_patch(), it will match its own replace_set here. If it is marked
> as forced during a patching transition, wouldn't we leak its module reference
> in klp_free_patch_finish() when it is cleanly disabled later?

Right. It should be:

                klp_for_each_patch(patch) {
-                       if (patch != klp_transition_patch)
+                       if (patch != klp_transition_patch &&
+                           patch->replace_set ==
+                           klp_transition_patch->replace_set)
                                patch->forced = true;
                }

-- 
Regards
Yafang

^ permalink raw reply

* [PATCH] livepatch: Improve the accuracy of symbol search
From: luhao @ 2026-05-16  8:08 UTC (permalink / raw)
  To: jpoimboe, jikos, mbenes, pmladek
  Cc: joe.lawrence, live-patching, linux-kernel, zhang.chunA,
	wang.shijie, lu.haoA

module_kallsyms_on_each_symbol, when the input parameter modname is not
empty, only searches for symbols within the current module. When
patching a kernel object (ko), if the patched function calls
functions from vmlinux or other ko modules, symbol lookup may fail.

When patching a ko, the current approach first searches for symbols
within the module itself. If not found, it uses
kallsyms_on_each_match_symbol to search in vmlinux. If still not
found, it calls module_kallsyms_on_each_symbol with modname set to
NULL to search across all ko modules. The reason for not searching
across all ko modules from the start is to avoid issues with
duplicate symbol names.

Reviewed-by: zhangchun <zhang.chunA@h3c.com>
Reviewed-by: wangshijie <wang.shijie@h3c.com>
Signed-off-by: luhao <lu.haoA@h3c.com>
---
 kernel/livepatch/core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 28d15ba58a26..9c587cc4896b 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -167,9 +167,14 @@ static int klp_find_object_symbol(const char *objname, const char *name,
                .pos = sympos,
        };

-       if (objname)
+       if (objname) {
                module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);
-       else
+
+               if (args.addr == 0)
+                       kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
+               if (args.addr == 0)
+                       module_kallsyms_on_each_symbol(NULL, klp_find_callback, &args);
+       } else
                kallsyms_on_each_match_symbol(klp_match_callback, name, &args);

        /*
--
2.51.0

-------------------------------------------------------------------------------------------------------------------------------------
本邮件及其附件含有新华三集团的保密信息，仅限于发送给上面地址中列出的个人或群组。
禁止任何其他人以任何形式使用（包括但不限于全部或部分地泄露、复制、或散发）本邮件中的信息。
如果您错收了本邮件，请您立即电话或邮件通知发件人并删除本邮件！
This e-mail and its attachments contain confidential information from New H3C, which is intended only for the person or entity whose address is listed above.
Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction, or dissemination) by persons other than the intended recipient(s) is prohibited.
If you receive this e-mail in error, please notify the sender by phone or email immediately and delete it!

^ permalink raw reply related

* Re: [PATCH v3 16/21] objtool/klp: Filter arm64 mapping symbols in find_symbol_by_offset()
From: Song Liu @ 2026-05-15 21:20 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: x86, linux-kernel, live-patching, Peter Zijlstra, Joe Lawrence,
	Catalin Marinas, Will Deacon, linux-arm-kernel, Mark Rutland,
	Miroslav Benes, Petr Mladek
In-Reply-To: <236050080db7b2462fdb13a03ed48a8efb2415a4.1778642120.git.jpoimboe@kernel.org>

On Tue, May 12, 2026 at 8:34 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> ARM64 ELF objects contain $d/$x mapping symbols (STT_NOTYPE) at offset 0
> in data/text sections.  These aren't "real" symbols so filter them from
> find_symbol_by_offset(), consistent with the existing section symbol
> filter.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>

Acked-by: Song Liu <song@kernel.org>

^ permalink raw reply

* Re: [PATCH v3 04/21] arm64: Rename TRAMP_VALIAS -> TRAMP_VALIAS_ASM in asm-offsets
From: Song Liu @ 2026-05-15 21:18 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: x86, linux-kernel, live-patching, Peter Zijlstra, Joe Lawrence,
	Catalin Marinas, Will Deacon, linux-arm-kernel, Mark Rutland,
	Miroslav Benes, Petr Mladek
In-Reply-To: <74623fad8c45d26a3da6c5420b00156d8f7c2150.1778642120.git.jpoimboe@kernel.org>

On Tue, May 12, 2026 at 8:34 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> Rename the asm-offsets TRAMP_VALIAS macro to TRAMP_VALIAS_ASM, following
> the naming convention already used by PIE_E0_ASM and PIE_E1_ASM.  This
> disambiguates the asm-offsets-generated constant from the C macro of the
> same name defined in fixmap.h and vectors.h.
>
> This is needed by a later patch which adds new includes to asm-offsets.c
> that would otherwise conflict with the C version.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>

Acked-by: Song Liu <song@kernel.org>

^ permalink raw reply

* Re: [PATCH v3 01/21] klp-build: Reject patches to init/*.c
From: Song Liu @ 2026-05-15 21:16 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: x86, linux-kernel, live-patching, Peter Zijlstra, Joe Lawrence,
	Catalin Marinas, Will Deacon, linux-arm-kernel, Mark Rutland,
	Miroslav Benes, Petr Mladek
In-Reply-To: <f32864b560d40894cdb70d613480d7c2ecdb55e0.1778642120.git.jpoimboe@kernel.org>

On Tue, May 12, 2026 at 8:35 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> init/Makefile hard-codes -fno-function-sections and -fno-data-sections,
> overriding the klp-build flags needed for patch generation.
>
> Don't allow any changes to those files; being init code they aren't
> really patchable anyway.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>

Acked-by: Song Liu <song@kernel.org>

^ permalink raw reply

* Re: [PATCH v5 0/8] unwind, arm64: add sframe unwinder for kernel
From: Mostafa Saleh @ 2026-05-15 11:32 UTC (permalink / raw)
  To: Dylan Hatch
  Cc: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Jens Remus, Mark Rutland, Prasanna Kumar T S M,
	Puranjay Mohan, Song Liu, joe.lawrence, linux-toolchains,
	linux-kernel, live-patching, linux-arm-kernel, Randy Dunlap
In-Reply-To: <20260428183643.3796063-1-dylanbhatch@google.com>

On Tue, Apr 28, 2026 at 06:36:35PM +0000, Dylan Hatch wrote:
> Implement a generic kernel sframe-based [1] unwinder. The main goal is
> to improve reliable stacktrace on arm64 by unwinding across exception
> boundaries.
> 
> On x86, the ORC unwinder provides reliable stacktrace through similar
> methodology, but arm64 lacks the necessary support from objtool to
> create ORC unwind tables.
> 
> Currently, there's already a sframe unwinder proposed for userspace: [2].
> To maintain common definitions and algorithms for sframe lookup, a
> substantial portion of this patch series aims to refactor the sframe
> lookup code to support both kernel and userspace sframe sections.
> 
> Currently, only GNU Binutils support sframe. This series relies on the
> Sframe V3 format, which is supported in binutils 2.46.
> 
> These patches are based on Steven Rostedt's sframe/core branch [3],
> which is an aggregation of existing work done for x86 sframe userspace
> unwind, and contains [2]. This branch is, in turn, based on Linux
> v7.0-rc3. This full series (applied to the sframe/core branch) is
> available on github: [4].
> 

Not sure if related, but after updating my toolchain
(aarch64-linux-gnu-gcc (Debian 15.2.0-4) 15.2.0), I hit link errors:
ld.lld: error: arch/arm64/kernel/vdso/vgettimeofday.o:(.sframe) is being placed in '.sframe'
ld.lld: error: arch/arm64/kernel/vdso/vgetrandom.o:(.sframe) is being placed in '.sframe'

I applied this series hoping it would fix that, but it doesn't; so far I
have this hack:
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index 52314be29191..53bdf757ee44 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -77,7 +77,7 @@ SECTIONS
        /DISCARD/       : {
                *(.data .data.* .gnu.linkonce.d.* .sdata*)
                *(.bss .sbss .dynbss .dynsbss)
-               *(.eh_frame .eh_frame_hdr)
+               *(.eh_frame .eh_frame_hdr .sframe)
        }
 }

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 60c8c22fd3e4..759903acd6fc 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -1064,6 +1064,7 @@
        /* ld.bfd warns about .gnu.version* even when not emitted */    \
        *(.gnu.version*)                                                \
        *(__tracepoint_check)                                           \
+       *(.sframe)                                                      \

 #define DISCARDS                                                       \
        /DISCARD/ : {                                                   \


Thanks,
Mostafa


^ permalink raw reply related

* Re: [PATCH v5 3/8] arm64: entry: add unwind info for various kernel entries
From: Mark Rutland @ 2026-05-15  8:58 UTC (permalink / raw)
  To: Dylan Hatch
  Cc: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Jens Remus, Prasanna Kumar T S M, Puranjay Mohan,
	Song Liu, joe.lawrence, linux-toolchains, linux-kernel,
	live-patching, linux-arm-kernel, Randy Dunlap
In-Reply-To: <CADBMgpxBeYUdA5X8BPgkgz=KQyN=NQ760bXygwXfvVRScNzgbA@mail.gmail.com>

On Thu, May 14, 2026 at 08:30:43PM -0700, Dylan Hatch wrote:
> On Wed, Apr 29, 2026 at 8:26 AM Mark Rutland <mark.rutland@arm.com> wrote:
> > On Tue, Apr 28, 2026 at 06:36:38PM +0000, Dylan Hatch wrote:
> > > From: Weinan Liu <wnliu@google.com>
> > >
> > > DWARF CFI (Call Frame Information) specifies how to recover the return
> > > address and callee-saved registers at each PC in a given function.
> > > Compilers are able to generate the CFI annotations when they compile
> > > the code to assembly language. For handcrafted assembly, we need to
> > > annotate them by hand.
> > >
> > > Annotate minimal CFI to enable stacktracing using SFrame for kernel
> > > exception entries through el1*_64_*() paths
> >
> > I thought we were only consuming SFrame when unwinding an exception
> > boundary?
> >
> > We shouldn't be taking exceptions _from_ the entry assembly functions
> > unless something has gone horribly wrong, and so I don't see why we'd
> > need CFI entries for the entry assembly functions.
> >
> > Am I missing some reason we need CFI entries for the entry assembly
> > functions? I strongly suspect it is not necessary to add these, and I'd
> > prefer to omit them.
> 
> I believe the el1 entry functions are called in an exception, and are
> called before call_on_irq_stack. 

Yes, but I don't think that matters. See below for more details.

> Example stacktrace segment:
> 
> [  262.119564]  handle_percpu_devid_irq+0xb4/0x348
> [  262.119913]  handle_irq_desc+0x3c/0x68
> [  262.120196]  generic_handle_domain_irq+0x20/0x40
> [  262.120678]  gic_handle_irq+0x48/0xe0
> [  262.121005]  call_on_irq_stack+0x30/0x48
> [  262.121412]  do_interrupt_handler+0x88/0xa0
> [  262.121779]  el1_interrupt+0x38/0x58
> [  262.122089]  el1h_64_irq_handler+0x18/0x30
> [  262.122617]  el1h_64_irq+0x6c/0x70

The segment immediately above can be unwound using FP, as frame records
were created consistently, and there are no exception boundaries. No CFI
needed.

It's legitimate to take an exception from parts of call_on_irq_stack(),
so it makes sense for that to have CFI, but for the specific unwind
segment above, CFI isn't necessary.

Everything in the stacktrace segment above was executed *after* HW took
the exception.

<< EXCEPTION BOUNDARY HERE >>

Everything in the stacktrace segment(s) below was executed *before* HW
took the exception.

The unwinder knows that it has crossed this exception boundary by virtue
of finding a FRAME_META_TYPE_PT_REGS frame record.

> [  262.123159]  _raw_spin_unlock_irq+0x10/0x60 (P)

The unwinder knows that the value of pt_regs::pc was *definitely* the PC
at the time the exception was taken, so that entry is reliable. No CFI
needed.

> [  262.123720]  __filemap_add_folio+0x200/0x580 (L)

The unwinder doesn't know whether the LR should be used, and needs CFI
to determine that.

After this point, an FP unwind can be used until we encounter the next
exception boundary.

> [  262.124145]  filemap_add_folio+0xec/0x300
> [  262.124674]  page_cache_ra_unbounded+0x128/0x368
> [  262.125338]  do_page_cache_ra+0x70/0x98
> [  262.125875]  page_cache_ra_order+0x460/0x4e0

The segment immediately above can be unwound using FP. No CFI needed.

> Here, el1h_64_irq is the last function that appears in the exception
> stack before _raw_spin_unlock_irq and __filemap_add_folio are
> recovered from the saved PC and LR, respectively. So we therefore need
> the CFI annotations in order to unwind through the full exception
> boundary.
> 
> Is my interpretation here correct?

Given you say "full exception boundary" here, I think we might be using
the term "exception boundary" to mean different things.

As per the example above, I'm using "exception boundary" to mean a
point between two entries in the stacktrace where HW took an exception.

Did my comments on the example help?

> > > and irq entries through call_on_irq_stack()
> >
> > Needing some sort of unwind annotations for call_on_irq_stack() makes
> > sense to me, but don't we need something for other assembly functions
> > too?
> >
> > We can interrupt things like memset(); I assume we'll treat those as
> > unreliable until annotated?
> 
> While looking into adding these annotations, I noticed a pattern where
> a sibling call is made to a local function:
> 
> SYM_FUNC_START(__pi_memset)
> alternative_if_not ARM64_HAS_MOPS
>         b       __pi_memset_generic
> alternative_else_nop_endif
> 
>         mov     dst, dstin
>         setp    [dst]!, count!, val_x
>         setm    [dst]!, count!, val_x
>         sete    [dst]!, count!, val_x
>         ret
> SYM_FUNC_END(__pi_memset)
> 
> In this case, do we consider the stacktrace unreliable since
> __pi_memset may not appear in the trace?

This is a tail-call, and __pi_memset_generic() will not return to
__pi_memset(). Once the branch to __pi_memset_generic() has been
executed, it's fine for __pi_memset() to not show up in the trace.

The key thing is that no more instructions from __pi_memset() itself
will be executed unless it was called again (from its entry point).

> Or is this not important because assembly functions cannot be directly
> livepatched anyway?

To the best of my knowledge, reliable stacktrace is only used to
determine whether any thread is still within an old version of a
patchable function (including where it's within a callee thereof).

I am not aware of a case where we'd need to detect whether a thread is
still within a non-patchable function, but I can't rule out that as a
possibility.

That's more of a question for the livepatching maintainers.

Thanks,
Mark.

^ permalink raw reply

* Re: [PATCH v5 3/8] arm64: entry: add unwind info for various kernel entries
From: Dylan Hatch @ 2026-05-15  3:30 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Roman Gushchin, Weinan Liu, Will Deacon, Josh Poimboeuf,
	Indu Bhagat, Peter Zijlstra, Steven Rostedt, Catalin Marinas,
	Jiri Kosina, Jens Remus, Prasanna Kumar T S M, Puranjay Mohan,
	Song Liu, joe.lawrence, linux-toolchains, linux-kernel,
	live-patching, linux-arm-kernel, Randy Dunlap
In-Reply-To: <afIjFLbUrdxWA6eR@J2N7QTR9R3.cambridge.arm.com>

On Wed, Apr 29, 2026 at 8:26 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> Hi Dylan,
>
> On Tue, Apr 28, 2026 at 06:36:38PM +0000, Dylan Hatch wrote:
> > From: Weinan Liu <wnliu@google.com>
> >
> > DWARF CFI (Call Frame Information) specifies how to recover the return
> > address and callee-saved registers at each PC in a given function.
> > Compilers are able to generate the CFI annotations when they compile
> > the code to assembly language. For handcrafted assembly, we need to
> > annotate them by hand.
> >
> > Annotate minimal CFI to enable stacktracing using SFrame for kernel
> > exception entries through el1*_64_*() paths
>
> I thought we were only consuming SFrame when unwinding an exception
> boundary?
>
> We shouldn't be taking exceptions _from_ the entry assembly functions
> unless something has gone horribly wrong, and so I don't see why we'd
> need CFI entries for the entry assembly functions.
>
> Am I missing some reason we need CFI entries for the entry assembly
> functions? I strongly suspect it is not necessary to add these, and I'd
> prefer to omit them.

I believe the el1 entry functions are called in an exception, and are
called before call_on_irq_stack. Example stacktrace segment:

[  262.119564]  handle_percpu_devid_irq+0xb4/0x348
[  262.119913]  handle_irq_desc+0x3c/0x68
[  262.120196]  generic_handle_domain_irq+0x20/0x40
[  262.120678]  gic_handle_irq+0x48/0xe0
[  262.121005]  call_on_irq_stack+0x30/0x48
[  262.121412]  do_interrupt_handler+0x88/0xa0
[  262.121779]  el1_interrupt+0x38/0x58
[  262.122089]  el1h_64_irq_handler+0x18/0x30
[  262.122617]  el1h_64_irq+0x6c/0x70
[  262.123159]  _raw_spin_unlock_irq+0x10/0x60 (P)
[  262.123720]  __filemap_add_folio+0x200/0x580 (L)
[  262.124145]  filemap_add_folio+0xec/0x300
[  262.124674]  page_cache_ra_unbounded+0x128/0x368
[  262.125338]  do_page_cache_ra+0x70/0x98
[  262.125875]  page_cache_ra_order+0x460/0x4e0

Here, el1h_64_irq is the last function that appears in the exception
stack before _raw_spin_unlock_irq and __filemap_add_folio are
recovered from the saved PC and LR, respectively. So we therefore need
the CFI annotations in order to unwind through the full exception
boundary.

Is my interpretation here correct?

>
> > and irq entries through call_on_irq_stack()
>
> Needing some sort of unwind annotations for call_on_irq_stack() makes
> sense to me, but don't we need something for other assembly functions
> too?
>
> We can interrupt things like memset(); I assume we'll treat those as
> unreliable until annotated?

While looking into adding these annotations, I noticed a pattern where
a sibling call is made to a local function:

SYM_FUNC_START(__pi_memset)
alternative_if_not ARM64_HAS_MOPS
        b       __pi_memset_generic
alternative_else_nop_endif

        mov     dst, dstin
        setp    [dst]!, count!, val_x
        setm    [dst]!, count!, val_x
        sete    [dst]!, count!, val_x
        ret
SYM_FUNC_END(__pi_memset)

In this case, do we consider the stacktrace unreliable since
__pi_memset may not appear in the trace? Or is this not important
because assembly functions cannot be directly livepatched anyway?

Thanks,
Dylan

^ permalink raw reply

* Re: [PATCH v3 21/21] klp-build: Add arm64 syscall patching macro
From: sashiko-bot @ 2026-05-15  2:44 UTC (permalink / raw)
  To: x86
  Cc: Miroslav Benes, live-patching, Peter Zijlstra, Will Deacon,
	Joe Lawrence, Catalin Marinas, linux-arm-kernel, Mark Rutland,
	Song Liu, linux-kernel, Petr Mladek
In-Reply-To: <3bf3cee43039fc4da84e83d0891a92e25f40f00f.1778642121.git.jpoimboe@kernel.org>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The arm64 __KLP_SYSCALL_DEFINEx macro omits the ALLOW_ERROR_INJECTION declaration, silently breaking BPF error injection for livepatched syscalls.
--

commit b4d1a9ea352f196a7fd3e7198fdade51440d08e5
Author: Josh Poimboeuf <jpoimboe@kernel.org>

klp-build: Add arm64 syscall patching macro

This commit adds arm64 support for KLP_SYSCALL_DEFINEx(). The commit
message states it is mirroring the arm64 __SYSCALL_DEFINEx() pattern from
arch/arm64/include/asm/syscall_wrapper.h.

> diff --git a/include/linux/livepatch_helpers.h b/include/linux/livepatch_helpers.h
> index 99d68d0773fa8..4b647b83865f9 100644
> --- a/include/linux/livepatch_helpers.h
> +++ b/include/linux/livepatch_helpers.h
> @@ -72,6 +72,25 @@
>  	}								\
>  	static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
>  
> +#elif defined(CONFIG_ARM64)
> +
> +#define __KLP_SYSCALL_DEFINEx(x, name, ...)				\
> +	static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
> +	static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
> +	asmlinkage long __arm64_sys##name(const struct pt_regs *regs);	\
> +	asmlinkage long __arm64_sys##name(const struct pt_regs *regs)	\
> +	{								\
> +		return __se_sys##name(SC_ARM64_REGS_TO_ARGS(x,__VA_ARGS__));\
> +	}								\

Does this macro definition omit the ALLOW_ERROR_INJECTION declaration?

The standard arm64 __SYSCALL_DEFINEx() macro explicitly registers the top-level
syscall handler with ALLOW_ERROR_INJECTION() to enable BPF error injection
workflows like bpf_override_return(). The x86 implementation correctly
preserves this functionality.

Because this macro is missing that declaration, it looks like it might
introduce a regression where BPF cannot inject errors into the replacement
syscalls in a livepatch module on arm64.

[ ... ]

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/cover.1778642120.git.jpoimboe@kernel.org?part=21

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox