* [PATCH v16 06/20] unwind_user/sframe: Detect .sframe sections in executables
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
When loading an ELF executable, automatically detect an .sframe section
and associate it with the mm_struct.
[ Jens Remus: Fix checkpatch warning "braces {} are not necessary for
single statement blocks". ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- Only add sframe for text that is PT_LOAD in addition to PF_X.
(Sashiko AI)
fs/binfmt_elf.c | 48 +++++++++++++++++++++++++++++++++++++---
include/uapi/linux/elf.h | 1 +
2 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 16a56b6b3f6c..980a9f229cd1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -48,6 +48,7 @@
#include <linux/uaccess.h>
#include <uapi/linux/rseq.h>
#include <linux/rseq.h>
+#include <linux/sframe.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -637,6 +638,21 @@ static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state,
return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp);
}
+static void elf_add_sframe(struct elf_phdr *text, struct elf_phdr *sframe,
+ unsigned long base_addr)
+{
+ unsigned long sframe_start, sframe_end, text_start, text_end;
+
+ sframe_start = base_addr + sframe->p_vaddr;
+ sframe_end = sframe_start + sframe->p_memsz;
+
+ text_start = base_addr + text->p_vaddr;
+ text_end = text_start + text->p_memsz;
+
+ /* Ignore return value, sframe section isn't critical */
+ sframe_add_section(sframe_start, sframe_end, text_start, text_end);
+}
+
/* This is much more generalized than the library routine read function,
so we keep this separate. Technically the library read function
is only provided so that we can read a.out libraries that have
@@ -647,7 +663,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
unsigned long no_base, struct elf_phdr *interp_elf_phdata,
struct arch_elf_state *arch_state)
{
- struct elf_phdr *eppnt;
+ struct elf_phdr *eppnt, *sframe_phdr = NULL;
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long error = ~0UL;
@@ -673,7 +689,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
eppnt = interp_elf_phdata;
for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
- if (eppnt->p_type == PT_LOAD) {
+ switch (eppnt->p_type) {
+ case PT_LOAD: {
int elf_type = MAP_PRIVATE;
int elf_prot = make_prot(eppnt->p_flags, arch_state,
true, true);
@@ -712,6 +729,19 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
error = -ENOMEM;
goto out;
}
+ break;
+ }
+ case PT_GNU_SFRAME:
+ sframe_phdr = eppnt;
+ break;
+ }
+ }
+
+ if (sframe_phdr) {
+ eppnt = interp_elf_phdata;
+ for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+ if (eppnt->p_flags & PF_X && eppnt->p_type == PT_LOAD)
+ elf_add_sframe(eppnt, sframe_phdr, load_addr);
}
}
@@ -836,7 +866,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
- struct elf_phdr *elf_property_phdata = NULL;
+ struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
unsigned long elf_brk;
bool brk_moved = false;
int retval, i;
@@ -945,6 +975,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
executable_stack = EXSTACK_DISABLE_X;
break;
+ case PT_GNU_SFRAME:
+ sframe_phdr = elf_ppnt;
+ break;
+
case PT_LOPROC ... PT_HIPROC:
retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
bprm->file, false,
@@ -1242,6 +1276,14 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_brk = k;
}
+ if (sframe_phdr) {
+ for (i = 0, elf_ppnt = elf_phdata;
+ i < elf_ex->e_phnum; i++, elf_ppnt++) {
+ if (elf_ppnt->p_flags & PF_X && elf_ppnt->p_type == PT_LOAD)
+ elf_add_sframe(elf_ppnt, sframe_phdr, load_bias);
+ }
+ }
+
e_entry = elf_ex->e_entry + load_bias;
phdr_addr += load_bias;
elf_brk += load_bias;
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index ee30dcd80901..e2a7dbed2e80 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -41,6 +41,7 @@ typedef __u16 Elf64_Versym;
#define PT_GNU_STACK (PT_LOOS + 0x474e551)
#define PT_GNU_RELRO (PT_LOOS + 0x474e552)
#define PT_GNU_PROPERTY (PT_LOOS + 0x474e553)
+#define PT_GNU_SFRAME (PT_LOOS + 0x474e554)
/* ARM MTE memory tag segment type */
--
2.51.0
^ permalink raw reply related
* [PATCH v16 16/20] unwind_user/sframe: Add support for SFrame V3 flexible FDEs
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
SFrame V3 introduces flexible FDEs in addition to the regular FDEs.
The key difference is that flexible FDEs encode the CFA, RA, and FP
tracking information using two FRE data words, a control word and an
offset, or a single padding data word of zero (e.g. to represent FP
without RA tracking information).
The control word contains the following information:
- reg_p: Whether to use the register contents (reg_p=1) specified
by regnum or the CFA (reg_p=0) as base.
- deref_p: Whether to dereference.
- regnum: A DWARF register number.
The offset is added to the base (i.e. CFA or register contents). Then
the resulting address may optionally be dereferenced.
This enables the following flexible CFA and FP/RA recovery rules:
- CFA = register + offset // reg_p=1, deref_p=0
- CFA = *(register + offset) // reg_p=1, deref_p=1
- FP/RA = *(CFA + offset) // reg_p=0, deref_p=0
- FP/RA = register + offset // reg_p=1, deref_p=0
- FP/RA = *(register + offset) // reg_p=1, deref_p=1
Note that for the CFA a rule with reg_p=0 is invalid, as the value of
the CFA cannot be described using itself as base. For FP/RA a rule with
reg_p=0 and deref_p=0 and regnum=0 is invalid, as that is equal to
the padding data word of zero.
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v16:
- __find_fre(): Move declaration of ret to function scope to resolve
compile error. (Sashiko AI)
Changes in v15:
- __read_flex_fde_fre_datawords(): Add comment on FRE dataword RA/FP
location info decoding logic. (Sashiko AI)
- Fix outermost frame (FRE without datawords) handling to not cause
sframe_init_cfa_rule_data() and ultimately sframe_find() to fail
with -EINVAL. (Sashiko AI)
- sframe_init_[cfa_]rule_data(): Reject FRE control word with
reserved_p=1. (Sashiko AI)
- __find_fre(): Return RC of sframe_init_[cfa_]rule_data() if bad RC.
- Normalize error code usage (.sframe is removed for all but ENOENT):
ENOENT: No sframe or no FDE for IP found
(FDE found but no FRE is EINVAL)
EFAULT: Bad address
EINVAL: Invalid input or sframe
Changes in v14:
- Rename __read_regular_fre_datawords() to
__read_default_fre_datawords() to align to SFrame V3 specification
(default FRE).
- Rename SFRAME_FDE_TYPE_FLEXIBLE to SFRAME_FDE_TYPE_FLEX to match
SFrame V3 specification and adjust to rename of SFRAME_FDE_TYPE_*.
- Rename SFRAME_V3_FLEX_FDE_CTLWORD_*() to
SFRAME_V3_FLEX_FDE_CTRLWORD_*() to match SFrame V3 reference
implementation.
- Add arch/*/include/asm/unwind_user_sframe.h to MAINTAINERS.
MAINTAINERS | 1 +
kernel/unwind/sframe.c | 287 +++++++++++++++++++++++++++++++++--------
kernel/unwind/sframe.h | 6 +
3 files changed, 238 insertions(+), 56 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index a9b42b67a88d..25f0b933511c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27875,6 +27875,7 @@ M: Josh Poimboeuf <jpoimboe@kernel.org>
M: Steven Rostedt <rostedt@goodmis.org>
S: Maintained
F: arch/*/include/asm/unwind_user.h
+F: arch/*/include/asm/unwind_user_sframe.h
F: include/asm-generic/unwind_user.h
F: include/linux/sframe.h
F: include/linux/unwind*.h
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index daa97d8b0231..b623dca072da 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/string_helpers.h>
#include <linux/sframe.h>
+#include <asm/unwind_user_sframe.h>
#include <linux/unwind_user_types.h>
#include "sframe.h"
@@ -31,8 +32,11 @@ struct sframe_fde_internal {
struct sframe_fre_internal {
unsigned int size;
u32 ip_off;
+ u32 cfa_ctl;
s32 cfa_off;
+ u32 ra_ctl;
s32 ra_off;
+ u32 fp_ctl;
s32 fp_off;
u8 info;
};
@@ -200,19 +204,160 @@ static __always_inline int __find_fde(struct sframe_section *sec,
s32 : UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \
s64 : UNSAFE_GET_USER_SIGNED_INC(to, from, size, label))
+static __always_inline int
+__read_default_fre_datawords(struct sframe_section *sec,
+ struct sframe_fde_internal *fde,
+ unsigned long cur,
+ unsigned char dataword_count,
+ unsigned char dataword_size,
+ struct sframe_fre_internal *fre)
+{
+ s32 cfa_off, ra_off, fp_off;
+ unsigned int cfa_regnum;
+
+ UNSAFE_GET_USER_INC(cfa_off, cur, dataword_size, Efault);
+ dataword_count--;
+
+ ra_off = sec->ra_off;
+ if (!ra_off && dataword_count) {
+ dataword_count--;
+ UNSAFE_GET_USER_INC(ra_off, cur, dataword_size, Efault);
+ }
+
+ fp_off = sec->fp_off;
+ if (!fp_off && dataword_count) {
+ dataword_count--;
+ UNSAFE_GET_USER_INC(fp_off, cur, dataword_size, Efault);
+ }
+
+ if (dataword_count)
+ return -EINVAL;
+
+ cfa_regnum =
+ (SFRAME_V3_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP) ?
+ SFRAME_REG_FP : SFRAME_REG_SP;
+
+ fre->cfa_ctl = (cfa_regnum << 3) | 1; /* regnum, deref_p=0, reg_p=1 */
+ fre->cfa_off = cfa_off;
+ fre->ra_ctl = ra_off ? 2 : 0; /* regnum=0, deref_p=(ra_off != 0), reg_p=0 */
+ fre->ra_off = ra_off;
+ fre->fp_ctl = fp_off ? 2 : 0; /* regnum=0, deref_p=(fp_off != 0), reg_p=0 */
+ fre->fp_off = fp_off;
+
+ return 0;
+
+Efault:
+ return -EFAULT;
+}
+
+static __always_inline int
+__read_flex_fde_fre_datawords(struct sframe_section *sec,
+ struct sframe_fde_internal *fde,
+ unsigned long cur,
+ unsigned char dataword_count,
+ unsigned char dataword_size,
+ struct sframe_fre_internal *fre)
+{
+ u32 cfa_ctl, ra_ctl, fp_ctl;
+ s32 cfa_off, ra_off, fp_off;
+
+ if (dataword_count < 2)
+ return -EINVAL;
+ UNSAFE_GET_USER_INC(cfa_ctl, cur, dataword_size, Efault);
+ UNSAFE_GET_USER_INC(cfa_off, cur, dataword_size, Efault);
+ dataword_count -= 2;
+
+ /*
+ * Each RA/FP location info consumes either two datawords
+ * (control word + offset) or one padding word substituting
+ * for that pair. Padding is only valid as substitution if
+ * followed by further non-padding location info. Therefore
+ * decoding only proceeds with at least two datawords. Any
+ * leftover trailing datawords are invalid and rejected by
+ * the final check.
+ */
+
+ ra_off = sec->ra_off;
+ ra_ctl = ra_off ? 2 : 0; /* regnum=0, deref_p=(ra_off != 0), reg_p=0 */
+ if (dataword_count >= 2) {
+ UNSAFE_GET_USER_INC(ra_ctl, cur, dataword_size, Efault);
+ dataword_count--;
+ if (ra_ctl) {
+ UNSAFE_GET_USER_INC(ra_off, cur, dataword_size, Efault);
+ dataword_count--;
+ } else {
+ /* Padding RA location info */
+ ra_ctl = ra_off ? 2 : 0; /* re-deduce (see above) */
+ }
+ }
+
+ fp_off = sec->fp_off;
+ fp_ctl = fp_off ? 2 : 0; /* regnum=0, deref_p=(fp_off != 0), reg_p=0 */
+ if (dataword_count >= 2) {
+ UNSAFE_GET_USER_INC(fp_ctl, cur, dataword_size, Efault);
+ dataword_count--;
+ if (fp_ctl) {
+ UNSAFE_GET_USER_INC(fp_off, cur, dataword_size, Efault);
+ dataword_count--;
+ } else {
+ /* Padding FP location info */
+ fp_ctl = fp_off ? 2 : 0; /* re-deduce (see above) */
+ }
+ }
+
+ /* Reject trailing padding or unknown extra datawords */
+ if (dataword_count)
+ return -EINVAL;
+
+ fre->cfa_ctl = cfa_ctl;
+ fre->cfa_off = cfa_off;
+ fre->ra_ctl = ra_ctl;
+ fre->ra_off = ra_off;
+ fre->fp_ctl = fp_ctl;
+ fre->fp_off = fp_off;
+
+ return 0;
+
+Efault:
+ return -EFAULT;
+}
+
+static __always_inline int
+__read_fre_datawords(struct sframe_section *sec,
+ struct sframe_fde_internal *fde,
+ unsigned long cur,
+ unsigned char dataword_count,
+ unsigned char dataword_size,
+ struct sframe_fre_internal *fre)
+{
+ unsigned char fde_type = SFRAME_V3_FDE_TYPE(fde->info2);
+
+ switch (fde_type) {
+ case SFRAME_FDE_TYPE_DEFAULT:
+ return __read_default_fre_datawords(sec, fde, cur,
+ dataword_count,
+ dataword_size,
+ fre);
+ case SFRAME_FDE_TYPE_FLEX:
+ return __read_flex_fde_fre_datawords(sec, fde, cur,
+ dataword_count,
+ dataword_size,
+ fre);
+ default:
+ return -EINVAL;
+ }
+}
+
static __always_inline int __read_fre(struct sframe_section *sec,
struct sframe_fde_internal *fde,
unsigned long fre_addr,
struct sframe_fre_internal *fre)
{
- unsigned char fde_type = SFRAME_V3_FDE_TYPE(fde->info2);
unsigned char fde_pctype = SFRAME_V3_FDE_PCTYPE(fde->info);
unsigned char fre_type = SFRAME_V3_FDE_FRE_TYPE(fde->info);
unsigned char dataword_count, dataword_size;
- s32 cfa_off, ra_off, fp_off;
unsigned long cur = fre_addr;
unsigned char addr_size;
- unsigned int fre_size;
u32 ip_off;
u8 info;
@@ -233,80 +378,105 @@ static __always_inline int __read_fre(struct sframe_section *sec,
dataword_size = dataword_size_enum_to_size(SFRAME_V3_FRE_DATAWORD_SIZE(info));
if (!dataword_size)
return -EINVAL;
- fre_size = addr_size + 1 + (dataword_count * dataword_size);
if (cur + (dataword_count * dataword_size) > sec->fres_end)
return -EFAULT;
- /* TODO: Support for flexible FDEs not implemented yet. */
- if (fde_type != SFRAME_FDE_TYPE_DEFAULT)
- return -EINVAL;
+ fre->size = addr_size + 1 + (dataword_count * dataword_size);
+ fre->ip_off = ip_off;
+ fre->info = info;
if (!dataword_count) {
/*
- * A FRE without data words indicates RA undefined /
- * outermost frame.
+ * A FRE without datawords indicates an outermost
+ * frame. Zero-initialize CFA, RA, and FP location
+ * info, except for the CFA control word, so that
+ * neither sframe_init_cfa_rule_data() nor
+ * sframe_init_rule_data() fail.
*/
- cfa_off = 0;
- ra_off = 0;
- fp_off = 0;
- goto done;
- }
-
- UNSAFE_GET_USER_INC(cfa_off, cur, dataword_size, Efault);
- dataword_count--;
-
- ra_off = sec->ra_off;
- if (!ra_off && dataword_count) {
- dataword_count--;
- UNSAFE_GET_USER_INC(ra_off, cur, dataword_size, Efault);
- }
+ fre->cfa_ctl = (SFRAME_REG_SP << 3) | 1; /* regnum=SP, deref_p=0, reg_p=1 */
+ fre->cfa_off = 0;
+ fre->ra_ctl = 0;
+ fre->ra_off = 0;
+ fre->fp_ctl = 0;
+ fre->fp_off = 0;
- fp_off = sec->fp_off;
- if (!fp_off && dataword_count) {
- dataword_count--;
- UNSAFE_GET_USER_INC(fp_off, cur, dataword_size, Efault);
+ return 0;
}
- if (dataword_count)
- return -EINVAL;
-
-done:
- fre->size = fre_size;
- fre->ip_off = ip_off;
- fre->cfa_off = cfa_off;
- fre->ra_off = ra_off;
- fre->fp_off = fp_off;
- fre->info = info;
-
- return 0;
+ return __read_fre_datawords(sec, fde, cur, dataword_count, dataword_size, fre);
Efault:
return -EFAULT;
}
-static __always_inline void
+static __always_inline int
sframe_init_cfa_rule_data(struct unwind_user_cfa_rule_data *cfa_rule_data,
- unsigned char fre_info,
- s32 offset)
+ u32 ctlword, s32 offset)
{
- if (SFRAME_V3_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP)
- cfa_rule_data->rule = UNWIND_USER_CFA_RULE_FP_OFFSET;
- else
+ bool deref_p = SFRAME_V3_FLEX_FDE_CTRLWORD_DEREF_P(ctlword);
+ bool reg_p = SFRAME_V3_FLEX_FDE_CTRLWORD_REG_P(ctlword);
+ bool reserved_p = SFRAME_V3_FLEX_FDE_CTRLWORD_RESERVED_P(ctlword);
+ unsigned int regnum = SFRAME_V3_FLEX_FDE_CTRLWORD_REGNUM(ctlword);
+
+ if (reserved_p)
+ return -EINVAL;
+
+ /* CFA recovery rule must be register-based */
+ if (!reg_p)
+ return -EINVAL;
+
+ switch (regnum) {
+ case SFRAME_REG_SP:
cfa_rule_data->rule = UNWIND_USER_CFA_RULE_SP_OFFSET;
+ break;
+ case SFRAME_REG_FP:
+ cfa_rule_data->rule = UNWIND_USER_CFA_RULE_FP_OFFSET;
+ break;
+ default:
+ cfa_rule_data->rule = UNWIND_USER_CFA_RULE_REG_OFFSET;
+ cfa_rule_data->regnum = regnum;
+ }
+
+ if (deref_p)
+ cfa_rule_data->rule |= UNWIND_USER_RULE_DEREF;
+
cfa_rule_data->offset = offset;
+
+ return 0;
}
-static __always_inline void
+static __always_inline int
sframe_init_rule_data(struct unwind_user_rule_data *rule_data,
- s32 offset)
+ u32 ctlword, s32 offset)
{
- if (offset) {
- rule_data->rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF;
- rule_data->offset = offset;
- } else {
+ bool deref_p = SFRAME_V3_FLEX_FDE_CTRLWORD_DEREF_P(ctlword);
+ bool reg_p = SFRAME_V3_FLEX_FDE_CTRLWORD_REG_P(ctlword);
+ bool reserved_p = SFRAME_V3_FLEX_FDE_CTRLWORD_RESERVED_P(ctlword);
+
+ if (!ctlword && !offset) {
rule_data->rule = UNWIND_USER_RULE_RETAIN;
+ return 0;
+ }
+
+ if (reserved_p)
+ return -EINVAL;
+
+ if (reg_p) {
+ unsigned int regnum = SFRAME_V3_FLEX_FDE_CTRLWORD_REGNUM(ctlword);
+
+ rule_data->rule = UNWIND_USER_RULE_REG_OFFSET;
+ rule_data->regnum = regnum;
+ } else {
+ rule_data->rule = UNWIND_USER_RULE_CFA_OFFSET;
}
+
+ if (deref_p)
+ rule_data->rule |= UNWIND_USER_RULE_DEREF;
+
+ rule_data->offset = offset;
+
+ return 0;
}
static __always_inline int __find_fre(struct sframe_section *sec,
@@ -321,6 +491,7 @@ static __always_inline int __find_fre(struct sframe_section *sec,
bool which = false;
unsigned int i;
u32 ip_off;
+ int ret;
ip_off = ip - fde->func_addr;
@@ -330,8 +501,6 @@ static __always_inline int __find_fre(struct sframe_section *sec,
fre_addr = sec->fres_start + fde->fres_off;
for (i = 0; i < fde->fres_num; i++) {
- int ret;
-
/*
* Alternate between the two fre_addr[] entries for 'fre' and
* 'prev_fre'.
@@ -358,9 +527,15 @@ static __always_inline int __find_fre(struct sframe_section *sec,
return -EINVAL;
fre = prev_fre;
- sframe_init_cfa_rule_data(&frame->cfa, fre->info, fre->cfa_off);
- sframe_init_rule_data(&frame->ra, fre->ra_off);
- sframe_init_rule_data(&frame->fp, fre->fp_off);
+ ret = sframe_init_cfa_rule_data(&frame->cfa, fre->cfa_ctl, fre->cfa_off);
+ if (ret)
+ return ret;
+ ret = sframe_init_rule_data(&frame->ra, fre->ra_ctl, fre->ra_off);
+ if (ret)
+ return ret;
+ ret = sframe_init_rule_data(&frame->fp, fre->fp_ctl, fre->fp_off);
+ if (ret)
+ return ret;
frame->outermost = SFRAME_V3_FRE_RA_UNDEFINED_P(fre->info);
return 0;
diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
index ed111fd0d702..1a2528e4b149 100644
--- a/kernel/unwind/sframe.h
+++ b/kernel/unwind/sframe.h
@@ -66,6 +66,7 @@ struct sframe_fda_v3 {
#define SFRAME_V3_AARCH64_FDE_PAUTH_KEY(info) (((info) >> 5) & 0x1)
#define SFRAME_FDE_TYPE_DEFAULT 0
+#define SFRAME_FDE_TYPE_FLEX 1
#define SFRAME_V3_FDE_TYPE_MASK 0x1f
#define SFRAME_V3_FDE_TYPE(info2) ((info2) & SFRAME_V3_FDE_TYPE_MASK)
@@ -79,4 +80,9 @@ struct sframe_fda_v3 {
#define SFRAME_V3_AARCH64_FRE_MANGLED_RA_P(info) (((info) >> 7) & 0x1)
#define SFRAME_V3_FRE_RA_UNDEFINED_P(info) (SFRAME_V3_FRE_DATAWORD_COUNT(info) == 0)
+#define SFRAME_V3_FLEX_FDE_CTRLWORD_REGNUM(data) (((data) >> 3) & 0x1f)
+#define SFRAME_V3_FLEX_FDE_CTRLWORD_RESERVED_P(data) (((data) >> 2) & 0x1)
+#define SFRAME_V3_FLEX_FDE_CTRLWORD_DEREF_P(data) (((data) >> 1) & 0x1)
+#define SFRAME_V3_FLEX_FDE_CTRLWORD_REG_P(data) ((data) & 0x1)
+
#endif /* _SFRAME_H */
--
2.51.0
^ permalink raw reply related
* [PATCH v16 11/20] unwind_user/sframe: Show file name in debug output
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
When debugging sframe issues, the error messages aren't all that helpful
without knowing what file a corresponding .sframe section belongs to.
Prefix debug output strings with the file name.
[ Jens Remus: Fix checkpatch error "space prohibited before that close
parenthesis ')'". ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- Use GFP_KERNEL_ACCOUNT instead of GFP_KERNEL (see
memory-allocation.rst, section "Get Free Page flags"). (Sashiko AI)
Changes in v14:
- Uppercase terms FDE and FRE in debug messages.
include/linux/sframe.h | 4 +++-
kernel/unwind/sframe.c | 23 ++++++++++--------
kernel/unwind/sframe_debug.h | 45 +++++++++++++++++++++++++++++++-----
3 files changed, 56 insertions(+), 16 deletions(-)
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
index 9a72209696f9..b79c5ec09229 100644
--- a/include/linux/sframe.h
+++ b/include/linux/sframe.h
@@ -10,7 +10,9 @@
struct sframe_section {
struct rcu_head rcu;
-
+#ifdef CONFIG_DYNAMIC_DEBUG
+ const char *filename;
+#endif
unsigned long sframe_start;
unsigned long sframe_end;
unsigned long text_start;
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index e0eb2adf5a07..2cfa274cd8dc 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -372,8 +372,10 @@ int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
}
end:
- if (ret && ret != -ENOENT)
+ if (ret && ret != -ENOENT) {
+ dbg_sec("removing bad .sframe section\n");
WARN_ON_ONCE(sframe_remove_section(sec->sframe_start));
+ }
return ret;
@@ -384,6 +386,7 @@ int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
static void free_section(struct sframe_section *sec)
{
+ dbg_free(sec);
kfree(sec);
}
@@ -403,7 +406,7 @@ static int sframe_read_header(struct sframe_section *sec)
BUILD_BUG_ON(!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS));
if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
- dbg("header usercopy failed\n");
+ dbg_sec("header usercopy failed\n");
return -EFAULT;
}
@@ -412,18 +415,18 @@ static int sframe_read_header(struct sframe_section *sec)
!(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
!(shdr.preamble.flags & SFRAME_F_FDE_FUNC_START_PCREL) ||
shdr.auxhdr_len) {
- dbg("bad/unsupported sframe header\n");
+ dbg_sec("bad/unsupported sframe header\n");
return -EINVAL;
}
if (!shdr.num_fdes || !shdr.num_fres) {
- dbg("no fde/fre entries\n");
+ dbg_sec("no FDE/FRE entries\n");
return -EINVAL;
}
header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
if (header_end >= sec->sframe_end) {
- dbg("header doesn't fit in section\n");
+ dbg_sec("header doesn't fit in section\n");
return -EINVAL;
}
@@ -435,7 +438,7 @@ static int sframe_read_header(struct sframe_section *sec)
fres_end = fres_start + shdr.fre_len;
if (fres_start < fdes_end || fres_end > sec->sframe_end) {
- dbg("inconsistent fde/fre offsets\n");
+ dbg_sec("inconsistent FDE/FRE offsets\n");
return -EINVAL;
}
@@ -491,6 +494,8 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
sec->text_start = text_start;
sec->text_end = text_end;
+ dbg_init(sec);
+
ret = sframe_read_header(sec);
if (ret) {
dbg_print_header(sec);
@@ -500,8 +505,8 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
ret = mtree_insert_range(sframe_mt, sec->text_start, sec->text_end - 1,
sec, GFP_KERNEL_ACCOUNT);
if (ret) {
- dbg("mtree_insert_range failed: text=%lx-%lx\n",
- sec->text_start, sec->text_end);
+ dbg_sec("mtree_insert_range failed: text=%lx-%lx\n",
+ sec->text_start, sec->text_end);
goto err_free;
}
@@ -523,7 +528,7 @@ static int __sframe_remove_section(struct mm_struct *mm,
struct sframe_section *sec)
{
if (!mtree_erase(&mm->sframe_mt, sec->text_start)) {
- dbg("mtree_erase failed: text=%lx\n", sec->text_start);
+ dbg_sec("mtree_erase failed: text=%lx\n", sec->text_start);
return -EINVAL;
}
diff --git a/kernel/unwind/sframe_debug.h b/kernel/unwind/sframe_debug.h
index 36352124cde8..a63e75cccc70 100644
--- a/kernel/unwind/sframe_debug.h
+++ b/kernel/unwind/sframe_debug.h
@@ -10,26 +10,59 @@
#define dbg(fmt, ...) \
pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__)
+#define dbg_sec(fmt, ...) \
+ dbg("%s: " fmt, sec->filename, ##__VA_ARGS__)
+
static __always_inline void dbg_print_header(struct sframe_section *sec)
{
unsigned long fdes_end;
fdes_end = sec->fdes_start + (sec->num_fdes * sizeof(struct sframe_fde_v3));
- dbg("SEC: sframe:0x%lx-0x%lx text:0x%lx-0x%lx "
- "fdes:0x%lx-0x%lx fres:0x%lx-0x%lx "
- "ra_off:%d fp_off:%d\n",
- sec->sframe_start, sec->sframe_end, sec->text_start, sec->text_end,
- sec->fdes_start, fdes_end, sec->fres_start, sec->fres_end,
- sec->ra_off, sec->fp_off);
+ dbg_sec("SEC: sframe:0x%lx-0x%lx text:0x%lx-0x%lx "
+ "fdes:0x%lx-0x%lx fres:0x%lx-0x%lx "
+ "ra_off:%d fp_off:%d\n",
+ sec->sframe_start, sec->sframe_end, sec->text_start, sec->text_end,
+ sec->fdes_start, fdes_end, sec->fres_start, sec->fres_end,
+ sec->ra_off, sec->fp_off);
+}
+
+static inline void dbg_init(struct sframe_section *sec)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ guard(mmap_read_lock)(mm);
+ vma = vma_lookup(mm, sec->sframe_start);
+ if (!vma)
+ sec->filename = kstrdup("(vma gone???)", GFP_KERNEL_ACCOUNT);
+ else if (vma->vm_file)
+ sec->filename = kstrdup_quotable_file(vma->vm_file, GFP_KERNEL_ACCOUNT);
+ else if (vma->vm_ops && vma->vm_ops->name)
+ sec->filename = kstrdup(vma->vm_ops->name(vma), GFP_KERNEL_ACCOUNT);
+ else if (arch_vma_name(vma))
+ sec->filename = kstrdup(arch_vma_name(vma), GFP_KERNEL_ACCOUNT);
+ else if (!vma->vm_mm)
+ sec->filename = kstrdup("(vdso)", GFP_KERNEL_ACCOUNT);
+ else
+ sec->filename = kstrdup("(anonymous)", GFP_KERNEL_ACCOUNT);
+}
+
+static inline void dbg_free(struct sframe_section *sec)
+{
+ kfree(sec->filename);
}
#else /* !CONFIG_DYNAMIC_DEBUG */
#define dbg(args...) no_printk(args)
+#define dbg_sec(args...) no_printk(args)
static inline void dbg_print_header(struct sframe_section *sec) {}
+static inline void dbg_init(struct sframe_section *sec) {}
+static inline void dbg_free(struct sframe_section *sec) {}
+
#endif /* !CONFIG_DYNAMIC_DEBUG */
#endif /* _SFRAME_DEBUG_H */
--
2.51.0
^ permalink raw reply related
* [PATCH v16 02/20] unwind_user/sframe: Add support for reading .sframe headers
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
In preparation for unwinding user space stacks with sframe, add basic
sframe compile infrastructure and support for reading the .sframe
section header.
sframe_add_section() reads the header and unconditionally returns an
error, so it's not very useful yet. A subsequent patch will improve
that.
Link: https://lore.kernel.org/all/f27e8463783febfa0dabb0432a3dd6be8ad98412.1737511963.git.jpoimboe@kernel.org/
[ Jens Remus: Add support for SFrame V3. Add support for PC-relative
FDE function start offset. Cleanup includes and indentation. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- Improve text/sframe section start/end validation. (Sashiko AI)
- Use GFP_KERNEL_ACCOUNT instead of GFP_KERNEL (see
memory-allocation.rst, section "Get Free Page flags"). (Sashiko AI)
Changes in v14:
- Rename SFRAME_FDE_TYPE_REGULAR to SFRAME_FDE_TYPE_DEFAULT to match
SFrame V3 specification. (Indu)
- Correct SFRAME_V3_FDE_TYPE_MASK value.
Changes in v13:
- Update to SFrame V3:
- Add and use SFRAME_VERSION_3 definition.
- Add helper macros to access SFrame V3 FDE type.
- Rename SFRAME_FUNC_*() macros to SFRAME_FDE_*().
- Rename SFRAME_FDE_TYPE_PC* defines to SFRAME_FDE_PCTYPE_* and
SFRAME_FUNC_FDE_TYPE() macro to SFRAME_V3_FDE_PCTYPE().
- Reword OFFSET to DATAWORD in SFRAME_FRE_OFFSET_{COUNT|SIZE}()
macros.
- Rename version-specific SFRAME_*() macros to SFRAME_V3_*().
- Update struct sframe_fde and rename to sframe_fde_v3:
- Change field start_addr from s32 to s64 and rename to
func_start_off.
- Change field fres_num from u32 to u16.
- New field u8 info2.
- Remove u16 padding field.
- Split FDE into function descriptor entry (struct sframe_fde_v3) and
attributes (struct sframe_fda_v3).
- Rename macro parameter "data" to "info" to hint at fde/fre info
word and wrap it in parentheses.
- Group SFRAME_* definitions so that related ones are together.
- Reword commit message (my changes).
MAINTAINERS | 1 +
arch/Kconfig | 3 +
include/linux/sframe.h | 37 +++++++++++
kernel/unwind/Makefile | 3 +-
kernel/unwind/sframe.c | 136 +++++++++++++++++++++++++++++++++++++++++
kernel/unwind/sframe.h | 81 ++++++++++++++++++++++++
6 files changed, 260 insertions(+), 1 deletion(-)
create mode 100644 include/linux/sframe.h
create mode 100644 kernel/unwind/sframe.c
create mode 100644 kernel/unwind/sframe.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 7434e9d7b33f..a9b42b67a88d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27876,6 +27876,7 @@ M: Steven Rostedt <rostedt@goodmis.org>
S: Maintained
F: arch/*/include/asm/unwind_user.h
F: include/asm-generic/unwind_user.h
+F: include/linux/sframe.h
F: include/linux/unwind*.h
F: kernel/unwind/
diff --git a/arch/Kconfig b/arch/Kconfig
index e86880045158..94b2d5e8e529 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -486,6 +486,9 @@ config HAVE_UNWIND_USER_FP
bool
select UNWIND_USER
+config HAVE_UNWIND_USER_SFRAME
+ bool
+
config HAVE_PERF_REGS
bool
help
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
new file mode 100644
index 000000000000..0642595534f9
--- /dev/null
+++ b/include/linux/sframe.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SFRAME_H
+#define _LINUX_SFRAME_H
+
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+
+struct sframe_section {
+ unsigned long sframe_start;
+ unsigned long sframe_end;
+ unsigned long text_start;
+ unsigned long text_end;
+
+ unsigned long fdes_start;
+ unsigned long fres_start;
+ unsigned long fres_end;
+ unsigned int num_fdes;
+
+ signed char ra_off;
+ signed char fp_off;
+};
+
+extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
+ unsigned long text_start, unsigned long text_end);
+extern int sframe_remove_section(unsigned long sframe_addr);
+
+#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
+ unsigned long text_start, unsigned long text_end)
+{
+ return -ENOSYS;
+}
+static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
+
+#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+#endif /* _LINUX_SFRAME_H */
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index eae37bea54fd..146038165865 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1,2 @@
- obj-$(CONFIG_UNWIND_USER) += user.o deferred.o
+ obj-$(CONFIG_UNWIND_USER) += user.o deferred.o
+ obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME) += sframe.o
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
new file mode 100644
index 000000000000..d24e9d4f8bef
--- /dev/null
+++ b/kernel/unwind/sframe.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace sframe access functions
+ */
+
+#define pr_fmt(fmt) "sframe: " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/string_helpers.h>
+#include <linux/sframe.h>
+#include <linux/unwind_user_types.h>
+
+#include "sframe.h"
+
+#define dbg(fmt, ...) \
+ pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__)
+
+static void free_section(struct sframe_section *sec)
+{
+ kfree(sec);
+}
+
+static int sframe_read_header(struct sframe_section *sec)
+{
+ unsigned long header_end, fdes_start, fdes_end, fres_start, fres_end;
+ struct sframe_header shdr;
+ unsigned int num_fdes;
+
+ if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
+ dbg("header usercopy failed\n");
+ return -EFAULT;
+ }
+
+ if (shdr.preamble.magic != SFRAME_MAGIC ||
+ shdr.preamble.version != SFRAME_VERSION_3 ||
+ !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
+ !(shdr.preamble.flags & SFRAME_F_FDE_FUNC_START_PCREL) ||
+ shdr.auxhdr_len) {
+ dbg("bad/unsupported sframe header\n");
+ return -EINVAL;
+ }
+
+ if (!shdr.num_fdes || !shdr.num_fres) {
+ dbg("no fde/fre entries\n");
+ return -EINVAL;
+ }
+
+ header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
+ if (header_end >= sec->sframe_end) {
+ dbg("header doesn't fit in section\n");
+ return -EINVAL;
+ }
+
+ num_fdes = shdr.num_fdes;
+ fdes_start = header_end + shdr.fdes_off;
+ fdes_end = fdes_start + (num_fdes * sizeof(struct sframe_fde_v3));
+
+ fres_start = header_end + shdr.fres_off;
+ fres_end = fres_start + shdr.fre_len;
+
+ if (fres_start < fdes_end || fres_end > sec->sframe_end) {
+ dbg("inconsistent fde/fre offsets\n");
+ return -EINVAL;
+ }
+
+ sec->num_fdes = num_fdes;
+ sec->fdes_start = fdes_start;
+ sec->fres_start = fres_start;
+ sec->fres_end = fres_end;
+
+ sec->ra_off = shdr.cfa_fixed_ra_offset;
+ sec->fp_off = shdr.cfa_fixed_fp_offset;
+
+ return 0;
+}
+
+int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
+ unsigned long text_start, unsigned long text_end)
+{
+ struct vm_area_struct *sframe_vma, *text_vma;
+ struct mm_struct *mm = current->mm;
+ struct sframe_section *sec;
+ int ret;
+
+ if (sframe_start >= sframe_end || text_start >= text_end) {
+ dbg("invalid sframe/text address\n");
+ return -EINVAL;
+ }
+
+ scoped_guard(mmap_read_lock, mm) {
+ sframe_vma = vma_lookup(mm, sframe_start);
+ if (!sframe_vma || sframe_end > sframe_vma->vm_end) {
+ dbg("bad sframe address (0x%lx - 0x%lx)\n",
+ sframe_start, sframe_end);
+ return -EINVAL;
+ }
+
+ text_vma = vma_lookup(mm, text_start);
+ if (!text_vma ||
+ !(text_vma->vm_flags & VM_EXEC) ||
+ text_end > text_vma->vm_end) {
+ dbg("bad text address (0x%lx - 0x%lx)\n",
+ text_start, text_end);
+ return -EINVAL;
+ }
+ }
+
+ sec = kzalloc(sizeof(*sec), GFP_KERNEL_ACCOUNT);
+ if (!sec)
+ return -ENOMEM;
+
+ sec->sframe_start = sframe_start;
+ sec->sframe_end = sframe_end;
+ sec->text_start = text_start;
+ sec->text_end = text_end;
+
+ ret = sframe_read_header(sec);
+ if (ret)
+ goto err_free;
+
+ /* TODO nowhere to store it yet - just free it and return an error */
+ ret = -ENOSYS;
+
+err_free:
+ free_section(sec);
+ return ret;
+}
+
+int sframe_remove_section(unsigned long sframe_start)
+{
+ return -ENOSYS;
+}
diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
new file mode 100644
index 000000000000..fc2908e92c7b
--- /dev/null
+++ b/kernel/unwind/sframe.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * From https://www.sourceware.org/binutils/docs/sframe-spec.html
+ */
+#ifndef _SFRAME_H
+#define _SFRAME_H
+
+#include <linux/types.h>
+
+#define SFRAME_VERSION_1 1
+#define SFRAME_VERSION_2 2
+#define SFRAME_VERSION_3 3
+#define SFRAME_MAGIC 0xdee2
+
+#define SFRAME_F_FDE_SORTED 0x1
+#define SFRAME_F_FRAME_POINTER 0x2
+#define SFRAME_F_FDE_FUNC_START_PCREL 0x4
+
+#define SFRAME_ABI_AARCH64_ENDIAN_BIG 1
+#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE 2
+#define SFRAME_ABI_AMD64_ENDIAN_LITTLE 3
+
+struct sframe_preamble {
+ u16 magic;
+ u8 version;
+ u8 flags;
+} __packed;
+
+struct sframe_header {
+ struct sframe_preamble preamble;
+ u8 abi_arch;
+ s8 cfa_fixed_fp_offset;
+ s8 cfa_fixed_ra_offset;
+ u8 auxhdr_len;
+ u32 num_fdes;
+ u32 num_fres;
+ u32 fre_len;
+ u32 fdes_off;
+ u32 fres_off;
+} __packed;
+
+#define SFRAME_HEADER_SIZE(header) \
+ ((sizeof(struct sframe_header) + (header).auxhdr_len))
+
+struct sframe_fde_v3 {
+ s64 func_start_off;
+ u32 func_size;
+ u32 fres_off;
+} __packed;
+
+struct sframe_fda_v3 {
+ u16 fres_num;
+ u8 info;
+ u8 info2;
+ u8 rep_size;
+} __packed;
+
+#define SFRAME_FDE_PCTYPE_INC 0
+#define SFRAME_FDE_PCTYPE_MASK 1
+
+#define SFRAME_AARCH64_PAUTH_KEY_A 0
+#define SFRAME_AARCH64_PAUTH_KEY_B 1
+
+#define SFRAME_V3_FDE_FRE_TYPE(info) ((info) & 0xf)
+#define SFRAME_V3_FDE_PCTYPE(info) (((info) >> 4) & 0x1)
+#define SFRAME_V3_AARCH64_FDE_PAUTH_KEY(info) (((info) >> 5) & 0x1)
+
+#define SFRAME_FDE_TYPE_DEFAULT 0
+
+#define SFRAME_V3_FDE_TYPE_MASK 0x1f
+#define SFRAME_V3_FDE_TYPE(info2) ((info2) & SFRAME_V3_FDE_TYPE_MASK)
+
+#define SFRAME_BASE_REG_FP 0
+#define SFRAME_BASE_REG_SP 1
+
+#define SFRAME_V3_FRE_CFA_BASE_REG_ID(info) ((info) & 0x1)
+#define SFRAME_V3_FRE_DATAWORD_COUNT(info) (((info) >> 1) & 0xf)
+#define SFRAME_V3_FRE_DATAWORD_SIZE(info) (((info) >> 5) & 0x3)
+#define SFRAME_V3_AARCH64_FRE_MANGLED_RA_P(info) (((info) >> 7) & 0x1)
+
+#endif /* _SFRAME_H */
--
2.51.0
^ permalink raw reply related
* [PATCH v16 10/20] unwind_user/sframe: Remove .sframe section on detected corruption
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
To avoid continued attempted use of a bad .sframe section, remove it
on demand when the first sign of corruption is detected.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- sframe_find(): Align to normalized error code usage and remove .sframe
for all but ENOENT. Also remove if user_read_access_begin() fails.
kernel/unwind/sframe.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 41ece3ca62a1..e0eb2adf5a07 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -371,10 +371,15 @@ int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
ret = __find_fre(sec, &fde, ip, frame);
}
+end:
+ if (ret && ret != -ENOENT)
+ WARN_ON_ONCE(sframe_remove_section(sec->sframe_start));
+
return ret;
Efault:
- return -EFAULT;
+ ret = -EFAULT;
+ goto end;
}
static void free_section(struct sframe_section *sec)
--
2.51.0
^ permalink raw reply related
* [PATCH v16 12/20] unwind_user/sframe: Add .sframe validation option
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Add a debug feature to validate all .sframe sections when first loading
the file rather than on demand.
[ Jens Remus: Add support for SFrame V3. Add support for PC-relative
FDE function start offset. Adjust to rename of struct sframe_fre to
sframe_fre_internal. Use %#x/%#lx format specifiers. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v16:
- sframe_validate_section(): Allow for a FDE[0] function start address
of zero. (Sashiko AI)
- sframe_validate_section(): Replace alternation between two FREs with
simpler logic used for FDE and use a prev_ip_off.
Changes in v15:
- sframe_validate_section(): Fix format specifier for number of FREs
in debug message. (Sashiko AI)
- Normalize error code usage (.sframe is removed for all but ENOENT):
ENOENT: No sframe or no FDE for IP found
(FDE found but no FRE is EINVAL)
EFAULT: Bad address
EINVAL: Invalid input or sframe
Changes in v14:
- Add debug message if safe_read_fde() fails.
- Update function names in debug messages.
- Uppercase terms FDE and FRE in debug messages.
Changes in v13:
- Update to SFrame V3:
- Print struct sframe_fde_internal fields fda_off and info2 in debug
message.
- Adjust to rename of struct sframe_fde_internal field func_start_addr
to func_addr.
- Use format strings "%#x" and "%#lx" instead of "0x%x" and "0x%lx".
- Reword commit message (my changes).
arch/Kconfig | 19 +++++++++
kernel/unwind/sframe.c | 92 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 111 insertions(+)
diff --git a/arch/Kconfig b/arch/Kconfig
index 37549832bd1f..132249d342a3 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -490,6 +490,25 @@ config HAVE_UNWIND_USER_SFRAME
bool
select UNWIND_USER
+config SFRAME_VALIDATION
+ bool "Enable .sframe section debugging"
+ depends on HAVE_UNWIND_USER_SFRAME
+ depends on DYNAMIC_DEBUG
+ help
+ When adding an .sframe section for a task, validate the entire
+ section immediately rather than on demand.
+
+ This is a debug feature which is helpful for rooting out .sframe
+ section issues. If the .sframe section is corrupt, it will fail to
+ load immediately, with more information provided in dynamic printks.
+
+ This has a significant page cache footprint due to its reading of the
+ entire .sframe section for every loaded executable and shared
+ library. Also, it's done for all processes, even those which don't
+ get stack traced by the kernel. Not recommended for general use.
+
+ If unsure, say N.
+
config HAVE_PERF_REGS
bool
help
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 2cfa274cd8dc..e6d66ae8e7ac 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -384,6 +384,94 @@ int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
goto end;
}
+#ifdef CONFIG_SFRAME_VALIDATION
+
+static int safe_read_fde(struct sframe_section *sec,
+ unsigned int fde_num, struct sframe_fde_internal *fde)
+{
+ int ret;
+
+ if (!user_read_access_begin((void __user *)sec->sframe_start,
+ sec->sframe_end - sec->sframe_start))
+ return -EFAULT;
+ ret = __read_fde(sec, fde_num, fde);
+ user_read_access_end();
+ return ret;
+}
+
+static int safe_read_fre(struct sframe_section *sec,
+ struct sframe_fde_internal *fde,
+ unsigned long fre_addr,
+ struct sframe_fre_internal *fre)
+{
+ int ret;
+
+ if (!user_read_access_begin((void __user *)sec->sframe_start,
+ sec->sframe_end - sec->sframe_start))
+ return -EFAULT;
+ ret = __read_fre(sec, fde, fre_addr, fre);
+ user_read_access_end();
+ return ret;
+}
+
+static int sframe_validate_section(struct sframe_section *sec)
+{
+ struct sframe_fde_internal fde;
+ unsigned long prev_func_addr;
+ unsigned int i;
+
+ for (i = 0; i < sec->num_fdes; i++) {
+ struct sframe_fre_internal fre;
+ unsigned long fre_addr;
+ u32 prev_ip_off;
+ unsigned int j;
+ int ret;
+
+ ret = safe_read_fde(sec, i, &fde);
+ if (ret) {
+ dbg_sec("safe_read_fde(%u) failed\n", i);
+ return ret;
+ }
+
+ if (i && fde.func_addr <= prev_func_addr) {
+ dbg_sec("FDE %u not sorted\n", i);
+ return -EINVAL;
+ }
+ prev_func_addr = fde.func_addr;
+
+ fre_addr = sec->fres_start + fde.fres_off;
+ for (j = 0; j < fde.fres_num; j++) {
+ ret = safe_read_fre(sec, &fde, fre_addr, &fre);
+ if (ret) {
+ dbg_sec("FDE %u: safe_read_fre(%u) failed\n", i, j);
+ dbg_sec("FDE: func_addr:%#lx func_size:%#x fda_off:%#x fres_off:%#x fres_num:%u info:%u info2:%u rep_size:%u\n",
+ fde.func_addr, fde.func_size,
+ fde.fda_off,
+ fde.fres_off, fde.fres_num,
+ fde.info, fde.info2,
+ fde.rep_size);
+ return ret;
+ }
+
+ if (j && fre.ip_off <= prev_ip_off) {
+ dbg_sec("FDE %u: FRE %u not sorted\n", i, j);
+ return -EINVAL;
+ }
+ prev_ip_off = fre.ip_off;
+
+ fre_addr += fre.size;
+ }
+ }
+
+ return 0;
+}
+
+#else /* !CONFIG_SFRAME_VALIDATION */
+
+static int sframe_validate_section(struct sframe_section *sec) { return 0; }
+
+#endif /* !CONFIG_SFRAME_VALIDATION */
+
static void free_section(struct sframe_section *sec)
{
dbg_free(sec);
@@ -502,6 +590,10 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
goto err_free;
}
+ ret = sframe_validate_section(sec);
+ if (ret)
+ goto err_free;
+
ret = mtree_insert_range(sframe_mt, sec->text_start, sec->text_end - 1,
sec, GFP_KERNEL_ACCOUNT);
if (ret) {
--
2.51.0
^ permalink raw reply related
* [PATCH v16 15/20] unwind_user: Flexible CFA recovery rules
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
To enable support for SFrame V3 flexible FDEs with a subsequent patch,
add support for the following flexible Canonical Frame Address (CFA)
recovery rules:
CFA = SP + offset
CFA = *(SP + offset)
CFA = FP + offset
CFA = *(FP + offset)
CFA = register + offset
CFA = *(register + offset)
Note that CFA recovery rules that use arbitrary register contents are
only valid when in the topmost frame, as their contents are otherwise
unknown.
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- enum unwind_user_cfa_rule, unwind_user_next_common(): Add support for
SP/FP-based CFA recovery rules with dereferencing. (Sashiko AI)
arch/x86/include/asm/unwind_user.h | 12 ++++++++----
include/linux/unwind_user_types.h | 22 ++++++++++++++++++++--
kernel/unwind/sframe.c | 15 +++++++++++++--
kernel/unwind/user.c | 24 ++++++++++++++++++++----
4 files changed, 61 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
index 9c3417be4283..f38f7c5ff1de 100644
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -20,7 +20,10 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
#ifdef CONFIG_HAVE_UNWIND_USER_FP
#define ARCH_INIT_USER_FP_FRAME(ws) \
- .cfa_off = 2*(ws), \
+ .cfa = { \
+ .rule = UNWIND_USER_CFA_RULE_FP_OFFSET,\
+ .offset = 2*(ws), \
+ }, \
.ra = { \
.rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
.offset = -1*(ws), \
@@ -29,11 +32,13 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
.rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
.offset = -2*(ws), \
}, \
- .use_fp = true, \
.outermost = false,
#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \
- .cfa_off = 1*(ws), \
+ .cfa = { \
+ .rule = UNWIND_USER_CFA_RULE_SP_OFFSET,\
+ .offset = 1*(ws), \
+ }, \
.ra = { \
.rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
.offset = -1*(ws), \
@@ -41,7 +46,6 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
.fp = { \
.rule = UNWIND_USER_RULE_RETAIN,\
}, \
- .use_fp = false, \
.outermost = false,
static inline bool unwind_user_at_function_start(struct pt_regs *regs)
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index 0d02714a1b5d..c18be5b7d586 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -29,6 +29,25 @@ struct unwind_stacktrace {
#define UNWIND_USER_RULE_DEREF BIT(31)
+enum unwind_user_cfa_rule {
+ UNWIND_USER_CFA_RULE_SP_OFFSET, /* CFA = SP + offset */
+ UNWIND_USER_CFA_RULE_FP_OFFSET, /* CFA = FP + offset */
+ UNWIND_USER_CFA_RULE_REG_OFFSET, /* CFA = register + offset */
+ /* DEREF variants */
+ UNWIND_USER_CFA_RULE_SP_OFFSET_DEREF = /* CFA = *(SP + offset) */
+ UNWIND_USER_CFA_RULE_SP_OFFSET | UNWIND_USER_RULE_DEREF,
+ UNWIND_USER_CFA_RULE_FP_OFFSET_DEREF = /* CFA = *(FP + offset) */
+ UNWIND_USER_CFA_RULE_FP_OFFSET | UNWIND_USER_RULE_DEREF,
+ UNWIND_USER_CFA_RULE_REG_OFFSET_DEREF = /* CFA = *(register + offset) */
+ UNWIND_USER_CFA_RULE_REG_OFFSET | UNWIND_USER_RULE_DEREF,
+};
+
+struct unwind_user_cfa_rule_data {
+ enum unwind_user_cfa_rule rule;
+ s32 offset;
+ unsigned int regnum;
+};
+
enum unwind_user_rule {
UNWIND_USER_RULE_RETAIN, /* entity = entity */
UNWIND_USER_RULE_CFA_OFFSET, /* entity = CFA + offset */
@@ -47,10 +66,9 @@ struct unwind_user_rule_data {
};
struct unwind_user_frame {
- s32 cfa_off;
+ struct unwind_user_cfa_rule_data cfa;
struct unwind_user_rule_data ra;
struct unwind_user_rule_data fp;
- bool use_fp;
bool outermost;
};
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 29a874a67f32..daa97d8b0231 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -285,6 +285,18 @@ static __always_inline int __read_fre(struct sframe_section *sec,
return -EFAULT;
}
+static __always_inline void
+sframe_init_cfa_rule_data(struct unwind_user_cfa_rule_data *cfa_rule_data,
+ unsigned char fre_info,
+ s32 offset)
+{
+ if (SFRAME_V3_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP)
+ cfa_rule_data->rule = UNWIND_USER_CFA_RULE_FP_OFFSET;
+ else
+ cfa_rule_data->rule = UNWIND_USER_CFA_RULE_SP_OFFSET;
+ cfa_rule_data->offset = offset;
+}
+
static __always_inline void
sframe_init_rule_data(struct unwind_user_rule_data *rule_data,
s32 offset)
@@ -346,10 +358,9 @@ static __always_inline int __find_fre(struct sframe_section *sec,
return -EINVAL;
fre = prev_fre;
- frame->cfa_off = fre->cfa_off;
+ sframe_init_cfa_rule_data(&frame->cfa, fre->info, fre->cfa_off);
sframe_init_rule_data(&frame->ra, fre->ra_off);
sframe_init_rule_data(&frame->fp, fre->fp_off);
- frame->use_fp = SFRAME_V3_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP;
frame->outermost = SFRAME_V3_FRE_RA_UNDEFINED_P(fre->info);
return 0;
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index c6a2abac78e0..447061b10613 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -53,14 +53,30 @@ static int unwind_user_next_common(struct unwind_user_state *state,
}
/* Get the Canonical Frame Address (CFA) */
- if (frame->use_fp) {
+ switch (frame->cfa.rule) {
+ case UNWIND_USER_CFA_RULE_SP_OFFSET:
+ case UNWIND_USER_CFA_RULE_SP_OFFSET_DEREF:
+ cfa = state->sp;
+ break;
+ case UNWIND_USER_CFA_RULE_FP_OFFSET:
+ case UNWIND_USER_CFA_RULE_FP_OFFSET_DEREF:
if (state->fp < state->sp)
return -EINVAL;
cfa = state->fp;
- } else {
- cfa = state->sp;
+ break;
+ case UNWIND_USER_CFA_RULE_REG_OFFSET:
+ case UNWIND_USER_CFA_RULE_REG_OFFSET_DEREF:
+ if (!state->topmost || unwind_user_get_reg(&cfa, frame->cfa.regnum))
+ return -EINVAL;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
- cfa += frame->cfa_off;
+ cfa += frame->cfa.offset;
+ if (frame->cfa.rule & UNWIND_USER_RULE_DEREF &&
+ get_user_word(&cfa, cfa, 0, state->ws))
+ return -EINVAL;
/*
* Make sure that stack is not going in wrong direction. Allow SP
--
2.51.0
^ permalink raw reply related
* [PATCH v16 19/20] unwind_user/sframe/x86: Enable sframe unwinding on x86
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
The x86 SFrame V3 implementation works fairly well, starting with
binutils 2.46. Enable it.
[ Jens Remus: Reword commit message for SFrame V3, starting with
binutils 2.46. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- unwind_user_get_reg(): Fail if !user_64bit_mode(). (Sashiko AI)
- unwind_user_get_reg(): Simplify guarding using CONFIG_X86_64.
- unwind_user_get_reg(): Add pr_debug_once() if unsupported register
number.
Changes in v14:
- Drop superfluous empty line in unwind_user_get_reg().
Changes in v13:
- Naive implementation of unwind_user_get_reg() to support SFrame V3
flexible FDEs (e.g. used to represent DRAP pattern).
- Define SFRAME_REG_SP and SFRAME_REG_FP to the respective x86-64
DWARF register numbers.
- Reword commit message for SFrame V3 and (upcoming) binutils 2.46.
arch/x86/Kconfig | 1 +
arch/x86/include/asm/unwind_user.h | 39 +++++++++++++++++++++++
arch/x86/include/asm/unwind_user_sframe.h | 12 +++++++
3 files changed, 52 insertions(+)
create mode 100644 arch/x86/include/asm/unwind_user_sframe.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f3f7cb01d69d..51286dfdb5f4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -302,6 +302,7 @@ config X86
select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_UNWIND_USER_FP if X86_64
+ select HAVE_UNWIND_USER_SFRAME if X86_64
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
select VDSO_GETRANDOM if X86_64
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
index f38f7c5ff1de..942475235431 100644
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -15,6 +15,45 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
return user_64bit_mode(regs) ? 8 : 4;
}
+#ifdef CONFIG_X86_64
+
+static inline int unwind_user_get_reg(unsigned long *val, unsigned int regnum)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+
+ /* SFrame only supports x86-64 */
+ if (!user_64bit_mode(regs))
+ return -EINVAL;
+
+ switch (regnum) {
+ /* DWARF register numbers 0..15 */
+ case 0: *val = regs->ax; break;
+ case 1: *val = regs->dx; break;
+ case 2: *val = regs->cx; break;
+ case 3: *val = regs->bx; break;
+ case 4: *val = regs->si; break;
+ case 5: *val = regs->di; break;
+ case 6: *val = regs->bp; break;
+ case 7: *val = regs->sp; break;
+ case 8: *val = regs->r8; break;
+ case 9: *val = regs->r9; break;
+ case 10: *val = regs->r10; break;
+ case 11: *val = regs->r11; break;
+ case 12: *val = regs->r12; break;
+ case 13: *val = regs->r13; break;
+ case 14: *val = regs->r14; break;
+ case 15: *val = regs->r15; break;
+ default:
+ pr_debug_once("%s (%d): unwind_user_get_reg(%u): unsupported register number\n",
+ current->comm, current->pid, regnum);
+ return -EINVAL;
+ }
+ return 0;
+}
+#define unwind_user_get_reg unwind_user_get_reg
+
+#endif /* CONFIG_X86_64 */
+
#endif /* CONFIG_UNWIND_USER */
#ifdef CONFIG_HAVE_UNWIND_USER_FP
diff --git a/arch/x86/include/asm/unwind_user_sframe.h b/arch/x86/include/asm/unwind_user_sframe.h
new file mode 100644
index 000000000000..d828ae1a4aac
--- /dev/null
+++ b/arch/x86/include/asm/unwind_user_sframe.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNWIND_USER_SFRAME_H
+#define _ASM_X86_UNWIND_USER_SFRAME_H
+
+#ifdef CONFIG_X86_64
+
+#define SFRAME_REG_SP 7
+#define SFRAME_REG_FP 6
+
+#endif
+
+#endif /* _ASM_X86_UNWIND_USER_SFRAME_H */
--
2.51.0
^ permalink raw reply related
* [PATCH v16 00/20] unwind_deferred: Implement sframe handling
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
This is the implementation of parsing the SFrame V3 stack trace information
from an .sframe section in an ELF file. It's a continuation of Josh's and
Steve's work that can be found here:
https://lore.kernel.org/all/cover.1737511963.git.jpoimboe@kernel.org/
https://lore.kernel.org/all/20250827201548.448472904@kernel.org/
Currently the only way to get a user space stack trace from a stack
walk (and not just copying a large amount of user stack into the kernel
ring buffer) is to use frame pointers. This has a few issues. The biggest
one is that compiling frame pointers into every application and library
has been shown to cause performance overhead.
Another issue is that the format of the frames may not always be consistent
between different compilers and some architectures (s390) have no defined
format to do a reliable stack walk. The only way to perform user space
profiling on these architectures is to copy the user stack into the kernel
buffer.
SFrame [1] is now supported in binutils (x86-64, ARM64, and s390). There are
discussions going on about supporting SFrame in LLVM. SFrame acts more like
ORC, and lives in the ELF executable file as its own section. Like ORC it
has two tables where the first table is sorted by instruction pointers (IP)
and using the current IP and finding its entry in the first table, it will
take you to the second table which will tell you where the return address
of the current function is located and then you can use that address to
look it up in the first table to find the return address of that function,
and so on. This performs a user space stack walk.
Now because the .sframe section lives in the ELF file it needs to be faulted
into memory when it is used. This means that walking the user space stack
requires being in a faultable context. As profilers like perf request a stack
trace in interrupt or NMI context, it cannot do the walking when it is
requested. Instead it must be deferred until it is safe to fault in user
space. One place this is known to be safe is when the task is about to return
back to user space.
This series makes the deferred unwind user code implement SFrame format V3
and enables it on x86-64.
[1]: https://sourceware.org/binutils/wiki/sframe
This series applies on top of v7.1-rc4 tag:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v7.1-rc4
User space programs (and libraries) that are to be stack-traced need to be
built with the recent SFrame stack trace information format V3, as
generated by binutils 2.46+ with assembler option --gsframe-3.
Namhyung Kim's related perf tools deferred callchain support can be used
for testing ("perf record --call-graph fp,defer" and "perf report/script").
Changes in v16 (see patch notes for details):
- Address Sashiko AI review feedback.
- Move SRCU definitions between patches.
- __read_fre(): Convert user read access to scope-based cleanup.
- sframe_validate_section(): Allow for a FDE[0] function start address
of zero.
- sframe_validate_section(): Replace alternation between two FREs with
simpler logic used for FDE and use a prev_ip_off.
- dup_mmap(): Drop unnecessary CONFIG_HAVE_UNWIND_USER_SFRAME #ifdefs.
- dup_mmap(): Call sframe_dup_mm() prior to arch_dup_mmap().
Changes in v15 (see patch notes for details):
- Rebase on v7.1-rc4.
- New patch to duplicate registered .sframe section data on clone/fork.
- Address Sashiko AI review feedback:
- Fix sframe end passed to mtree_insert_range().
- Fix outermost frame (FRE without datawords) handling.
- Use GFP_KERNEL_ACCOUNT instead of GFP_KERNEL.
- Improve text/sframe section start/end validation.
- Always use guard(srcu) when accessing struct sframe_section fields.
- Validate FDE repetition size for PCTYPE_MASK FDEs to be non-zero to
prevent division by zero.
- Only add sframe for text that is PT_LOAD in addition to PF_X.
- Use pr_debug_once() instead of WARN_ON_ONCE() to prevent user-
triggered warning/panic.
- Add support for SP/FP-based CFA recovery rules with
dereferencing.
- Reject FRE control word with reserved_p=1.
- x86-64: Fail unwind_user_get_reg() if !user_64bit_mode().
- Validate FDE PC type for supported values (i.e. PCTYPE_INC or
PCTYPE_MASK).
- Validate FDE function end against text end.
- Validate FDE's number of FREs to be less or equal to FDE's function
size, as each FRE must cover at least one byte. (Indu)
- Validate FRE function offset against FDE repetition size
for PCTYPE_MASK.
- Change type of struct sframe_fde_internal field fres_num to the one of
struct sframe_fda_v3 field fres_num.
- Return RC of sframe_init_[cfa_]rule_data() if bad RC.
- Normalize error code usage (.sframe is removed for all but ENOENT):
ENOENT: No sframe or no FDE for IP found
(FDE found but no FRE found is EINVAL)
EFAULT: Bad address
EINVAL: Invalid input or sframe
- Build-time checks for config options:
- 64BIT: SFrame V3 only supports 64-bit architectures.
- HAVE_EFFICIENT_UNALIGNED_ACCESS: Unaligned access to 16/32-bit
SFrame FRE fields and datawords using unsafe_get_user(). (Steven)
- Add pr_debug_once() when restoring CFA/FP/RA from an unsupported
register number.
Changes in v14 (see patch notes for details):
- Rebase on v7.1-rc2.
- Correct SFRAME_V3_FDE_TYPE_MASK value.
- Fix FDE function start address check in __read_fde().
- Rename SFrame V3 definitions according to final specification. (Indu)
- Improve comments on why UNWIND_USER_RULE_CFA_OFFSET is not
implemented. (Mark Rutland)
- Add/update/improve sframe debug messages.
- Add generic and arch-specific unwind_user.h to MAINTAINERS.
- Add arch-specific unwind_user_sframe.h to MAINTAINERS.
Changes in v13 (see patch notes for details):
- Add support for SFrame V3, including its new flexible FDEs. SFrame V2
is not supported.
Changes in v12 (see patch notes for details):
- Adjust to Peter's latest unwind user enhancements.
- Simplify logic by using an internal SFrame FDE representation, whose
FDE function start address field is an address instead of a PC-relative
offset (from FDE).
- Rename struct sframe_fre to sframe_fre_internal to align with
struct sframe_fde_internal.
- Remove unused pt_regs from unwind_user_next_common() and its
callers. (Peter)
- Simplify unwind_user_next_sframe(). (Peter)
- Fix a few checkpatch errors and warnings.
- Minor cleanups (e.g. move includes, fix indentation).
Changes in v11:
- Support for SFrame V2 PC-relative FDE function start address.
- Support for SFrame V2 representing RA undefined as indication for
outermost frames.
Patch 1 (new in v14), as a preparatory cleanup, adds the generic and
arch-specific unwind_user.h to MAINTAINERS.
Patches 2, 5, 12, and 19 have been updated to exclusively support the
latest SFrame V3 stack trace information format, that is generated by
binutils 2.46+. Old SFrame V2 sections get rejected with dynamic debug
message "bad/unsupported sframe header".
Patches 8 and 9 add support to unwind user (sframe) for outermost frames.
Patches 13-16 add support to unwind user (sframe) for the new SFrame V3
flexible FDEs.
Patch 17 improves the performance of searching the SFrame FRE for an IP.
Patch 18 (new in v15) duplicates registered .sframe section data on
clone/fork from the parent to the child process.
Patch 20 is for test purposes only and will get replaced by a new
syscall, that Steven is working on:
[RFC][PATCH] unwind: Add stacktrace_setup system call
https://lore.kernel.org/all/20260429114355.6c712e6a@gandalf.local.home/
Regards,
Jens
Jens Remus (9):
unwind_user: Add generic and arch-specific headers to MAINTAINERS
unwind_user: Stop when reaching an outermost frame
unwind_user/sframe: Add support for outermost frame indication
unwind_user: Enable archs that pass RA in a register
unwind_user: Flexible FP/RA recovery rules
unwind_user: Flexible CFA recovery rules
unwind_user/sframe: Add support for SFrame V3 flexible FDEs
unwind_user/sframe: Separate reading of FRE from reading of FRE data
words
unwind_user/sframe: Duplicate registered .sframe section data on
clone/fork
Josh Poimboeuf (11):
unwind_user/sframe: Add support for reading .sframe headers
unwind_user/sframe: Store .sframe section data in per-mm maple tree
x86/uaccess: Add unsafe_copy_from_user() implementation
unwind_user/sframe: Add support for reading .sframe contents
unwind_user/sframe: Detect .sframe sections in executables
unwind_user/sframe: Wire up unwind_user to sframe
unwind_user/sframe: Remove .sframe section on detected corruption
unwind_user/sframe: Show file name in debug output
unwind_user/sframe: Add .sframe validation option
unwind_user/sframe/x86: Enable sframe unwinding on x86
unwind_user/sframe: Add prctl() interface for registering .sframe
sections
MAINTAINERS | 4 +
arch/Kconfig | 23 +
arch/x86/Kconfig | 1 +
arch/x86/include/asm/mmu.h | 2 +-
arch/x86/include/asm/uaccess.h | 39 +-
arch/x86/include/asm/unwind_user.h | 74 +-
arch/x86/include/asm/unwind_user_sframe.h | 12 +
fs/binfmt_elf.c | 48 +-
include/linux/mm_types.h | 3 +
include/linux/sframe.h | 65 ++
include/linux/unwind_user.h | 20 +
include/linux/unwind_user_types.h | 50 +-
include/uapi/linux/elf.h | 1 +
include/uapi/linux/prctl.h | 4 +
kernel/fork.c | 10 +
kernel/sys.c | 9 +
kernel/unwind/Makefile | 3 +-
kernel/unwind/sframe.c | 938 ++++++++++++++++++++++
kernel/unwind/sframe.h | 88 ++
kernel/unwind/sframe_debug.h | 75 ++
kernel/unwind/user.c | 133 ++-
mm/init-mm.c | 2 +
mm/mmap.c | 5 +
23 files changed, 1571 insertions(+), 38 deletions(-)
create mode 100644 arch/x86/include/asm/unwind_user_sframe.h
create mode 100644 include/linux/sframe.h
create mode 100644 kernel/unwind/sframe.c
create mode 100644 kernel/unwind/sframe.h
create mode 100644 kernel/unwind/sframe_debug.h
--
2.51.0
^ permalink raw reply
* [PATCH v16 03/20] unwind_user/sframe: Store .sframe section data in per-mm maple tree
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Associate an .sframe section with its mm by adding it to a per-mm maple
tree which is indexed by the corresponding text address range. A single
.sframe section can be associated with multiple text ranges.
[ Jens Remus: Minor cleanups. Reword commit subject/message. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v16:
- Move SRCU definitions from patch "unwind_user/sframe: Add support for
reading .sframe contents" here. (Sashiko AI)
Changes in v15:
- Fix text section end passed to mtree_insert_range() to be inclusive.
(Sashiko AI)
- sframe_remove_section(): Add guard(srcu) to guard access to
sec->sframe_start. This also guards access to sec->filename
in __sframe_remove_section(). (Sashiko AI)
- Use GFP_KERNEL_ACCOUNT instead of GFP_KERNEL (see
memory-allocation.rst, section "Get Free Page flags"). (Sashiko AI)
arch/x86/include/asm/mmu.h | 2 +-
include/linux/mm_types.h | 3 ++
include/linux/sframe.h | 18 ++++++++++
kernel/fork.c | 10 ++++++
kernel/unwind/sframe.c | 68 ++++++++++++++++++++++++++++++++++++--
mm/init-mm.c | 2 ++
6 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0fe9c569d171..227a32899a59 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -87,7 +87,7 @@ typedef struct {
.context = { \
.ctx_id = 1, \
.lock = __MUTEX_INITIALIZER(mm.context.lock), \
- }
+ },
void leave_mm(void);
#define leave_mm leave_mm
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..c1505356b6fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1424,6 +1424,9 @@ struct mm_struct {
#ifdef CONFIG_MM_ID
mm_id_t mm_id;
#endif /* CONFIG_MM_ID */
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+ struct maple_tree sframe_mt;
+#endif
} __randomize_layout;
/*
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
index 0642595534f9..38047760e252 100644
--- a/include/linux/sframe.h
+++ b/include/linux/sframe.h
@@ -2,9 +2,14 @@
#ifndef _LINUX_SFRAME_H
#define _LINUX_SFRAME_H
+#include <linux/mm_types.h>
+#include <linux/srcu.h>
+
#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
struct sframe_section {
+ struct rcu_head rcu;
+
unsigned long sframe_start;
unsigned long sframe_end;
unsigned long text_start;
@@ -19,18 +24,31 @@ struct sframe_section {
signed char fp_off;
};
+#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
+extern void sframe_free_mm(struct mm_struct *mm);
+
extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
unsigned long text_start, unsigned long text_end);
extern int sframe_remove_section(unsigned long sframe_addr);
+static inline bool current_has_sframe(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ return mm && !mtree_empty(&mm->sframe_mt);
+}
+
#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
+#define INIT_MM_SFRAME
+static inline void sframe_free_mm(struct mm_struct *mm) {}
static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
unsigned long text_start, unsigned long text_end)
{
return -ENOSYS;
}
static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
+static inline bool current_has_sframe(void) { return false; }
#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
diff --git a/kernel/fork.c b/kernel/fork.c
index 5f3fdfdb14c7..8d8195561c95 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -110,6 +110,7 @@
#include <linux/tick.h>
#include <linux/unwind_deferred.h>
#include <linux/pgalloc.h>
+#include <linux/sframe.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -735,6 +736,7 @@ void __mmdrop(struct mm_struct *mm)
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+ sframe_free_mm(mm);
free_mm(mm);
}
@@ -1072,6 +1074,13 @@ static void mmap_init_lock(struct mm_struct *mm)
#endif
}
+static void mm_init_sframe(struct mm_struct *mm)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+ mt_init(&mm->sframe_mt);
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -1100,6 +1109,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
+ mm_init_sframe(mm);
hugetlb_count_init(mm);
mm_flags_clear_all(mm);
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index d24e9d4f8bef..46dba3cb016d 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -19,6 +19,8 @@
#define dbg(fmt, ...) \
pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__)
+DEFINE_STATIC_SRCU(sframe_srcu);
+
static void free_section(struct sframe_section *sec)
{
kfree(sec);
@@ -81,6 +83,7 @@ static int sframe_read_header(struct sframe_section *sec)
int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
unsigned long text_start, unsigned long text_end)
{
+ struct maple_tree *sframe_mt = &current->mm->sframe_mt;
struct vm_area_struct *sframe_vma, *text_vma;
struct mm_struct *mm = current->mm;
struct sframe_section *sec;
@@ -122,15 +125,74 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
if (ret)
goto err_free;
- /* TODO nowhere to store it yet - just free it and return an error */
- ret = -ENOSYS;
+ ret = mtree_insert_range(sframe_mt, sec->text_start, sec->text_end - 1,
+ sec, GFP_KERNEL_ACCOUNT);
+ if (ret) {
+ dbg("mtree_insert_range failed: text=%lx-%lx\n",
+ sec->text_start, sec->text_end);
+ goto err_free;
+ }
+
+ return 0;
err_free:
free_section(sec);
return ret;
}
+static void sframe_free_srcu(struct rcu_head *rcu)
+{
+ struct sframe_section *sec = container_of(rcu, struct sframe_section, rcu);
+
+ free_section(sec);
+}
+
+static int __sframe_remove_section(struct mm_struct *mm,
+ struct sframe_section *sec)
+{
+ if (!mtree_erase(&mm->sframe_mt, sec->text_start)) {
+ dbg("mtree_erase failed: text=%lx\n", sec->text_start);
+ return -EINVAL;
+ }
+
+ call_srcu(&sframe_srcu, &sec->rcu, sframe_free_srcu);
+
+ return 0;
+}
+
int sframe_remove_section(unsigned long sframe_start)
{
- return -ENOSYS;
+ struct mm_struct *mm = current->mm;
+ struct sframe_section *sec;
+ unsigned long index = 0;
+ bool found = false;
+ int ret = 0;
+
+ guard(srcu)(&sframe_srcu);
+
+ mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
+ if (sec->sframe_start == sframe_start) {
+ found = true;
+ ret |= __sframe_remove_section(mm, sec);
+ }
+ }
+
+ if (!found || ret)
+ return -EINVAL;
+
+ return 0;
+}
+
+void sframe_free_mm(struct mm_struct *mm)
+{
+ struct sframe_section *sec;
+ unsigned long index = 0;
+
+ if (!mm)
+ return;
+
+ mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
+ free_section(sec);
+
+ mtree_destroy(&mm->sframe_mt);
}
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c5556bb9d5f0..77909139162e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -11,6 +11,7 @@
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/iommu.h>
+#include <linux/sframe.h>
#include <asm/mmu.h>
#ifndef INIT_MM_CONTEXT
@@ -49,6 +50,7 @@ struct mm_struct init_mm = {
#endif
.flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT,
INIT_MM_CONTEXT(init_mm)
+ INIT_MM_SFRAME
};
void setup_initial_init_mm(void *start_code, void *end_code,
--
2.51.0
^ permalink raw reply related
* [PATCH v16 18/20] unwind_user/sframe: Duplicate registered .sframe section data on clone/fork
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
When duplicating a process' virtual memory mappings also duplicate all
of its registered .sframe sections stored in the per-mm maple tree to
enable stacktracing using sframe of the child process.
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v16:
- dup_mmap(): Drop unnecessary CONFIG_HAVE_UNWIND_USER_SFRAME #ifdefs.
(Sashiko AI)
- dup_mmap(): Call sframe_dup_mm() prior to arch_dup_mmap(), so that
comes last.
Changes in v15:
- New patch.
include/linux/sframe.h | 5 ++++
kernel/unwind/sframe.c | 48 ++++++++++++++++++++++++++++++++++++
kernel/unwind/sframe_debug.h | 7 ++++++
mm/mmap.c | 5 ++++
4 files changed, 65 insertions(+)
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
index b79c5ec09229..91889b4fe3dd 100644
--- a/include/linux/sframe.h
+++ b/include/linux/sframe.h
@@ -28,6 +28,7 @@ struct sframe_section {
};
#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
+extern int sframe_dup_mm(struct mm_struct *mm, struct mm_struct *oldmm);
extern void sframe_free_mm(struct mm_struct *mm);
extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
@@ -45,6 +46,10 @@ static inline bool current_has_sframe(void)
#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
#define INIT_MM_SFRAME
+static inline int sframe_dup_mm(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ return 0;
+}
static inline void sframe_free_mm(struct mm_struct *mm) {}
static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
unsigned long text_start, unsigned long text_end)
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 7f439600b0f0..db88d993dff1 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -875,6 +875,54 @@ int sframe_remove_section(unsigned long sframe_start)
return 0;
}
+static void __sframe_dup_section(struct sframe_section *sec, struct sframe_section *oldsec)
+{
+ sec->sframe_start = oldsec->sframe_start;
+ sec->sframe_end = oldsec->sframe_end;
+ sec->text_start = oldsec->text_start;
+ sec->text_end = oldsec->text_end;
+
+ sec->fdes_start = oldsec->fdes_start;
+ sec->fres_start = oldsec->fres_start;
+ sec->fres_end = oldsec->fres_end;
+ sec->num_fdes = oldsec->num_fdes;
+
+ sec->ra_off = oldsec->ra_off;
+ sec->fp_off = oldsec->fp_off;
+
+ dbg_dup(sec, oldsec);
+}
+
+int sframe_dup_mm(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct sframe_section *sec, *oldsec;
+ unsigned long index = 0;
+ int ret;
+
+ guard(srcu)(&sframe_srcu);
+
+ mt_for_each(&oldmm->sframe_mt, oldsec, index, ULONG_MAX) {
+ sec = kzalloc(sizeof(*sec), GFP_KERNEL_ACCOUNT);
+ if (!sec)
+ return -ENOMEM;
+
+ __sframe_dup_section(sec, oldsec);
+
+ ret = mtree_insert_range(&mm->sframe_mt,
+ sec->text_start,
+ sec->text_end - 1,
+ sec, GFP_KERNEL_ACCOUNT);
+ if (ret)
+ goto err_free;
+ }
+
+ return 0;
+
+err_free:
+ free_section(sec);
+ return ret;
+}
+
void sframe_free_mm(struct mm_struct *mm)
{
struct sframe_section *sec;
diff --git a/kernel/unwind/sframe_debug.h b/kernel/unwind/sframe_debug.h
index a63e75cccc70..2503972155e8 100644
--- a/kernel/unwind/sframe_debug.h
+++ b/kernel/unwind/sframe_debug.h
@@ -48,6 +48,12 @@ static inline void dbg_init(struct sframe_section *sec)
sec->filename = kstrdup("(anonymous)", GFP_KERNEL_ACCOUNT);
}
+static inline void dbg_dup(struct sframe_section *sec, struct sframe_section *oldsec)
+{
+ if (oldsec->filename)
+ sec->filename = kstrdup(oldsec->filename, GFP_KERNEL_ACCOUNT);
+}
+
static inline void dbg_free(struct sframe_section *sec)
{
kfree(sec->filename);
@@ -61,6 +67,7 @@ static inline void dbg_free(struct sframe_section *sec)
static inline void dbg_print_header(struct sframe_section *sec) {}
static inline void dbg_init(struct sframe_section *sec) {}
+static inline void dbg_dup(struct sframe_section *sec, struct sframe_section *oldsec) {}
static inline void dbg_free(struct sframe_section *sec) {}
#endif /* !CONFIG_DYNAMIC_DEBUG */
diff --git a/mm/mmap.c b/mm/mmap.c
index 5754d1c36462..8715be691488 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -48,6 +48,7 @@
#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/memfd.h>
+#include <linux/sframe.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -1844,6 +1845,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
goto loop_out;
}
}
+ retval = sframe_dup_mm(mm, oldmm);
+ if (retval)
+ goto loop_out;
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
@@ -1893,6 +1897,7 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
vm_unacct_memory(charge);
}
__mt_destroy(&mm->mm_mt);
+ sframe_free_mm(mm);
/*
* The mm_struct is going to exit, but the locks will be dropped
* first. Set the mm_struct as unstable is advisable as it is
--
2.51.0
^ permalink raw reply related
* [PATCH v16 09/20] unwind_user/sframe: Add support for outermost frame indication
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
SFrame may represent an undefined return address (RA) as SFrame FRE
without any offsets as indication for an outermost frame.
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
kernel/unwind/sframe.c | 15 ++++++++++++++-
kernel/unwind/sframe.h | 1 +
2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 2de29c836f6b..41ece3ca62a1 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -231,7 +231,7 @@ static __always_inline int __read_fre(struct sframe_section *sec,
UNSAFE_GET_USER_INC(info, cur, 1, Efault);
dataword_count = SFRAME_V3_FRE_DATAWORD_COUNT(info);
dataword_size = dataword_size_enum_to_size(SFRAME_V3_FRE_DATAWORD_SIZE(info));
- if (!dataword_count || !dataword_size)
+ if (!dataword_size)
return -EINVAL;
fre_size = addr_size + 1 + (dataword_count * dataword_size);
@@ -242,6 +242,17 @@ static __always_inline int __read_fre(struct sframe_section *sec,
if (fde_type != SFRAME_FDE_TYPE_DEFAULT)
return -EINVAL;
+ if (!dataword_count) {
+ /*
+ * A FRE without data words indicates RA undefined /
+ * outermost frame.
+ */
+ cfa_off = 0;
+ ra_off = 0;
+ fp_off = 0;
+ goto done;
+ }
+
UNSAFE_GET_USER_INC(cfa_off, cur, dataword_size, Efault);
dataword_count--;
@@ -262,6 +273,7 @@ static __always_inline int __read_fre(struct sframe_section *sec,
if (dataword_count)
return -EINVAL;
+done:
fre->size = fre_size;
fre->ip_off = ip_off;
fre->cfa_off = cfa_off;
@@ -328,6 +340,7 @@ static __always_inline int __find_fre(struct sframe_section *sec,
frame->ra_off = fre->ra_off;
frame->fp_off = fre->fp_off;
frame->use_fp = SFRAME_V3_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP;
+ frame->outermost = SFRAME_V3_FRE_RA_UNDEFINED_P(fre->info);
return 0;
}
diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
index fc2908e92c7b..ed111fd0d702 100644
--- a/kernel/unwind/sframe.h
+++ b/kernel/unwind/sframe.h
@@ -77,5 +77,6 @@ struct sframe_fda_v3 {
#define SFRAME_V3_FRE_DATAWORD_COUNT(info) (((info) >> 1) & 0xf)
#define SFRAME_V3_FRE_DATAWORD_SIZE(info) (((info) >> 5) & 0x3)
#define SFRAME_V3_AARCH64_FRE_MANGLED_RA_P(info) (((info) >> 7) & 0x1)
+#define SFRAME_V3_FRE_RA_UNDEFINED_P(info) (SFRAME_V3_FRE_DATAWORD_COUNT(info) == 0)
#endif /* _SFRAME_H */
--
2.51.0
^ permalink raw reply related
* [PATCH v16 05/20] unwind_user/sframe: Add support for reading .sframe contents
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
In preparation for using sframe to unwind user space stacks, add an
sframe_find() interface for finding the sframe information associated
with a given text address.
For performance, use user_read_access_begin() and the corresponding
unsafe_*() accessors. Note that use of pr_debug() in uaccess-enabled
regions would break noinstr validation, so there aren't any debug
messages yet. That will be added in a subsequent commit.
Link: https://lore.kernel.org/all/77c0d1ec143bf2a53d66c4ecb190e7e0a576fbfd.1737511963.git.jpoimboe@kernel.org/
Link: https://lore.kernel.org/all/b35ca3a3-8de5-4d32-8d30-d4e562f6b0de@linux.ibm.com/
[ Jens Remus: Add initial support for SFrame V3 (limited to default
FDEs). Add support for PC-relative FDE function start offset. Simplify
logic by using an internal FDE representation. Rename struct sframe_fre
to sframe_fre_internal to align with struct sframe_fde_internal.
Cleanup includes. Fix checkpatch errors "spaces required around that
':'". ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v16:
- __read_fre(): Compute FRE size before mutating dataword_count.
(Sashiko AI)
- __read_fre(): Convert user read access to scope-based cleanup.
(Sashiko AI)
- Move SRCU definitions to patch "unwind_user/sframe: Store .sframe
section data in per-mm maple tree". (Sashiko AI)
Changes in v15:
- __read_fde():
- Validate FDE repetition size for PCTYPE_MASK FDEs to be non-zero to
prevent division by zero. (Sashiko AI)
- Validate FDE PC type for supported values (i.e. PCTYPE_INC or
PCTYPE_MASK).
- Validate FDE function end against text end.
- Validate FDE's number of FREs to be less than or equal to FDE's function
size, as each FRE must cover at least one byte. (Indu)
- __read_fre(): Validate FRE function offset against FDE repetition size
for PCTYPE_MASK.
- Change type of struct sframe_fde_internal field fres_num to the one of
struct sframe_fda_v3 field fres_num.
- Normalize error code usage (.sframe is removed for all but ENOENT):
ENOENT: No sframe or no FDE for IP found
(FDE found but no FRE found is EINVAL)
EFAULT: Bad address
EINVAL: Invalid input or sframe
- Build-time checks for config options:
- 64BIT: SFrame V3 only supports 64-bit architectures.
- HAVE_EFFICIENT_UNALIGNED_ACCESS: Unaligned access to 16/32-bit
SFrame FRE fields and datawords using unsafe_get_user(). (Steven)
- Reword my changelog in commit message.
Changes in v14:
- Fix FDE function start address check in __read_fde().
- Adjust to rename of SFRAME_FDE_TYPE_*.
Changes in v13:
- Update to SFrame V3:
- Adjust to SFRAME_V3_*() macros and macro/define renames.
- Adjust to struct sframe_fde_v3 rename.
- Adjust to s64 FDE function start offset.
- Rename local variables fde_type to fde_pctype.
- Add and maintain struct sframe_fde_internal field u8 info2.
- Adjust to FDE split into function descriptor entry
(struct sframe_fde_v3) and attributes (struct sframe_fda_v3).
- Rename offset_count/offset_size to dataword_count/dataword_size.
- Limit __read_fre() to SFrame V3 regular FDEs (FDE_TYPE_REGULAR). A
subsequent patch will add support for flexible FDEs (FDE_TYPE_FLEX).
- Rename struct sframe_fde_internal field func_start_addr to func_addr.
- Add support u64/s64 in UNSAFE_GET_USER_INC() for s64 FDE function
start offset.
- Reduce indentation of assignments to fre.
- Reword commit message (my changes).
include/linux/sframe.h | 3 +
kernel/unwind/sframe.c | 362 ++++++++++++++++++++++++++++++++++-
kernel/unwind/sframe_debug.h | 35 ++++
3 files changed, 396 insertions(+), 4 deletions(-)
create mode 100644 kernel/unwind/sframe_debug.h
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
index 38047760e252..9a72209696f9 100644
--- a/include/linux/sframe.h
+++ b/include/linux/sframe.h
@@ -4,6 +4,7 @@
#include <linux/mm_types.h>
#include <linux/srcu.h>
+#include <linux/unwind_user_types.h>
#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
@@ -30,6 +31,7 @@ extern void sframe_free_mm(struct mm_struct *mm);
extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
unsigned long text_start, unsigned long text_end);
extern int sframe_remove_section(unsigned long sframe_addr);
+extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame);
static inline bool current_has_sframe(void)
{
@@ -48,6 +50,7 @@ static inline int sframe_add_section(unsigned long sframe_start, unsigned long s
return -ENOSYS;
}
static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
+static inline int sframe_find(unsigned long ip, struct unwind_user_frame *frame) { return -ENOSYS; }
static inline bool current_has_sframe(void) { return false; }
#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 46dba3cb016d..2de29c836f6b 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -15,12 +15,355 @@
#include <linux/unwind_user_types.h>
#include "sframe.h"
-
-#define dbg(fmt, ...) \
- pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__)
+#include "sframe_debug.h"
+
+struct sframe_fde_internal {
+ unsigned long func_addr;
+ u32 func_size;
+ u32 fda_off;
+ u32 fres_off;
+ u16 fres_num;
+ u8 info;
+ u8 info2;
+ u8 rep_size;
+};
+
+struct sframe_fre_internal {
+ unsigned int size;
+ u32 ip_off;
+ s32 cfa_off;
+ s32 ra_off;
+ s32 fp_off;
+ u8 info;
+};
DEFINE_STATIC_SRCU(sframe_srcu);
+static __always_inline unsigned char fre_type_to_size(unsigned char fre_type)
+{
+ if (fre_type > 2)
+ return 0;
+ return 1 << fre_type;
+}
+
+static __always_inline unsigned char dataword_size_enum_to_size(unsigned char dataword_size)
+{
+ if (dataword_size > 2)
+ return 0;
+ return 1 << dataword_size;
+}
+
+static __always_inline int __read_fde(struct sframe_section *sec,
+ unsigned int fde_num,
+ struct sframe_fde_internal *fde)
+{
+ unsigned long fde_addr, fda_addr, func_start, func_end;
+ struct sframe_fde_v3 _fde;
+ struct sframe_fda_v3 _fda;
+ unsigned char fde_pctype;
+
+ fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde_v3));
+ unsafe_copy_from_user(&_fde, (void __user *)fde_addr,
+ sizeof(struct sframe_fde_v3), Efault);
+
+ func_start = fde_addr + _fde.func_start_off;
+ func_end = func_start + _fde.func_size;
+ if (func_start < sec->text_start || func_end > sec->text_end)
+ return -EFAULT;
+
+ fda_addr = sec->fres_start + _fde.fres_off;
+ if (fda_addr + sizeof(struct sframe_fda_v3) > sec->fres_end)
+ return -EFAULT;
+ unsafe_copy_from_user(&_fda, (void __user *)fda_addr,
+ sizeof(struct sframe_fda_v3), Efault);
+
+ fde_pctype = SFRAME_V3_FDE_PCTYPE(_fda.info);
+ if (fde_pctype != SFRAME_FDE_PCTYPE_INC &&
+ fde_pctype != SFRAME_FDE_PCTYPE_MASK)
+ return -EINVAL;
+ if (fde_pctype == SFRAME_FDE_PCTYPE_MASK && !_fda.rep_size)
+ return -EINVAL;
+ if (_fda.fres_num > _fde.func_size)
+ return -EINVAL;
+
+ fde->func_addr = func_start;
+ fde->func_size = _fde.func_size;
+ fde->fda_off = _fde.fres_off;
+ fde->fres_off = _fde.fres_off + sizeof(struct sframe_fda_v3);
+ fde->fres_num = _fda.fres_num;
+ fde->info = _fda.info;
+ fde->info2 = _fda.info2;
+ fde->rep_size = _fda.rep_size;
+
+ return 0;
+
+Efault:
+ return -EFAULT;
+}
+
+static __always_inline int __find_fde(struct sframe_section *sec,
+ unsigned long ip,
+ struct sframe_fde_internal *fde)
+{
+ unsigned long func_addr_low = 0, func_addr_high = ULONG_MAX;
+ struct sframe_fde_v3 __user *first, *low, *high, *found = NULL;
+ int ret;
+
+ first = (void __user *)sec->fdes_start;
+ low = first;
+ high = first + sec->num_fdes - 1;
+
+ while (low <= high) {
+ struct sframe_fde_v3 __user *mid;
+ s64 func_off;
+ unsigned long func_addr;
+
+ mid = low + ((high - low) / 2);
+
+ unsafe_get_user(func_off, (s64 __user *)mid, Efault);
+ func_addr = (unsigned long)mid + func_off;
+
+ if (ip >= func_addr) {
+ if (func_addr < func_addr_low)
+ return -EINVAL;
+
+ func_addr_low = func_addr;
+
+ found = mid;
+ low = mid + 1;
+ } else {
+ if (func_addr > func_addr_high)
+ return -EINVAL;
+
+ func_addr_high = func_addr;
+
+ high = mid - 1;
+ }
+ }
+
+ if (!found)
+ return -ENOENT;
+
+ ret = __read_fde(sec, found - first, fde);
+ if (ret)
+ return ret;
+
+ /* make sure it's not in a gap */
+ if (ip < fde->func_addr || ip >= fde->func_addr + fde->func_size)
+ return -ENOENT;
+
+ return 0;
+
+Efault:
+ return -EFAULT;
+}
+
+#define ____UNSAFE_GET_USER_INC(to, from, type, label) \
+({ \
+ type __to; \
+ unsafe_get_user(__to, (type __user *)from, label); \
+ from += sizeof(__to); \
+ to = __to; \
+})
+
+#define __UNSAFE_GET_USER_INC(to, from, size, label, u_or_s) \
+({ \
+ switch (size) { \
+ case 1: \
+ ____UNSAFE_GET_USER_INC(to, from, u_or_s##8, label); \
+ break; \
+ case 2: \
+ ____UNSAFE_GET_USER_INC(to, from, u_or_s##16, label); \
+ break; \
+ case 4: \
+ ____UNSAFE_GET_USER_INC(to, from, u_or_s##32, label); \
+ break; \
+ default: \
+ return -EFAULT; \
+ } \
+})
+
+#define UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label) \
+ __UNSAFE_GET_USER_INC(to, from, size, label, u)
+
+#define UNSAFE_GET_USER_SIGNED_INC(to, from, size, label) \
+ __UNSAFE_GET_USER_INC(to, from, size, label, s)
+
+#define UNSAFE_GET_USER_INC(to, from, size, label) \
+ _Generic(to, \
+ u8 : UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \
+ u16 : UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \
+ u32 : UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \
+ u64 : UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \
+ s8 : UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \
+ s16 : UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \
+ s32 : UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \
+ s64 : UNSAFE_GET_USER_SIGNED_INC(to, from, size, label))
+
+static __always_inline int __read_fre(struct sframe_section *sec,
+ struct sframe_fde_internal *fde,
+ unsigned long fre_addr,
+ struct sframe_fre_internal *fre)
+{
+ unsigned char fde_type = SFRAME_V3_FDE_TYPE(fde->info2);
+ unsigned char fde_pctype = SFRAME_V3_FDE_PCTYPE(fde->info);
+ unsigned char fre_type = SFRAME_V3_FDE_FRE_TYPE(fde->info);
+ unsigned char dataword_count, dataword_size;
+ s32 cfa_off, ra_off, fp_off;
+ unsigned long cur = fre_addr;
+ unsigned char addr_size;
+ unsigned int fre_size;
+ u32 ip_off;
+ u8 info;
+
+ addr_size = fre_type_to_size(fre_type);
+ if (!addr_size)
+ return -EINVAL;
+
+ if (fre_addr + addr_size + 1 > sec->fres_end)
+ return -EFAULT;
+
+ UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault);
+ if ((fde_pctype == SFRAME_FDE_PCTYPE_INC && ip_off >= fde->func_size) ||
+ (fde_pctype == SFRAME_FDE_PCTYPE_MASK && ip_off >= fde->rep_size))
+ return -EINVAL;
+
+ UNSAFE_GET_USER_INC(info, cur, 1, Efault);
+ dataword_count = SFRAME_V3_FRE_DATAWORD_COUNT(info);
+ dataword_size = dataword_size_enum_to_size(SFRAME_V3_FRE_DATAWORD_SIZE(info));
+ if (!dataword_count || !dataword_size)
+ return -EINVAL;
+ fre_size = addr_size + 1 + (dataword_count * dataword_size);
+
+ if (cur + (dataword_count * dataword_size) > sec->fres_end)
+ return -EFAULT;
+
+ /* TODO: Support for flexible FDEs not implemented yet. */
+ if (fde_type != SFRAME_FDE_TYPE_DEFAULT)
+ return -EINVAL;
+
+ UNSAFE_GET_USER_INC(cfa_off, cur, dataword_size, Efault);
+ dataword_count--;
+
+ ra_off = sec->ra_off;
+ if (!ra_off) {
+ if (!dataword_count--)
+ return -EINVAL;
+
+ UNSAFE_GET_USER_INC(ra_off, cur, dataword_size, Efault);
+ }
+
+ fp_off = sec->fp_off;
+ if (!fp_off && dataword_count) {
+ dataword_count--;
+ UNSAFE_GET_USER_INC(fp_off, cur, dataword_size, Efault);
+ }
+
+ if (dataword_count)
+ return -EINVAL;
+
+ fre->size = fre_size;
+ fre->ip_off = ip_off;
+ fre->cfa_off = cfa_off;
+ fre->ra_off = ra_off;
+ fre->fp_off = fp_off;
+ fre->info = info;
+
+ return 0;
+
+Efault:
+ return -EFAULT;
+}
+
+static __always_inline int __find_fre(struct sframe_section *sec,
+ struct sframe_fde_internal *fde,
+ unsigned long ip,
+ struct unwind_user_frame *frame)
+{
+ unsigned char fde_pctype = SFRAME_V3_FDE_PCTYPE(fde->info);
+ struct sframe_fre_internal *fre, *prev_fre = NULL;
+ struct sframe_fre_internal fres[2];
+ unsigned long fre_addr;
+ bool which = false;
+ unsigned int i;
+ u32 ip_off;
+
+ ip_off = ip - fde->func_addr;
+
+ if (fde_pctype == SFRAME_FDE_PCTYPE_MASK)
+ ip_off %= fde->rep_size;
+
+ fre_addr = sec->fres_start + fde->fres_off;
+
+ for (i = 0; i < fde->fres_num; i++) {
+ int ret;
+
+ /*
+ * Alternate between the two fres[] entries for 'fre' and
+ * 'prev_fre'.
+ */
+ fre = which ? fres : fres + 1;
+ which = !which;
+
+ ret = __read_fre(sec, fde, fre_addr, fre);
+ if (ret)
+ return ret;
+
+ fre_addr += fre->size;
+
+ if (prev_fre && fre->ip_off <= prev_fre->ip_off)
+ return -EINVAL;
+
+ if (fre->ip_off > ip_off)
+ break;
+
+ prev_fre = fre;
+ }
+
+ if (!prev_fre)
+ return -EINVAL;
+ fre = prev_fre;
+
+ frame->cfa_off = fre->cfa_off;
+ frame->ra_off = fre->ra_off;
+ frame->fp_off = fre->fp_off;
+ frame->use_fp = SFRAME_V3_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP;
+
+ return 0;
+}
+
+int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
+{
+ struct mm_struct *mm = current->mm;
+ struct sframe_section *sec;
+ struct sframe_fde_internal fde;
+ void __user *sframe_start;
+ int ret;
+
+ if (!mm)
+ return -EINVAL;
+
+ guard(srcu)(&sframe_srcu);
+
+ sec = mtree_load(&mm->sframe_mt, ip);
+ if (!sec)
+ return -ENOENT;
+
+ sframe_start = (void __user *)sec->sframe_start;
+ scoped_user_read_access_size(sframe_start,
+ sec->sframe_end - sec->sframe_start,
+ Efault) {
+ ret = __find_fde(sec, ip, &fde);
+ if (!ret)
+ ret = __find_fre(sec, &fde, ip, frame);
+ }
+
+ return ret;
+
+Efault:
+ return -EFAULT;
+}
+
static void free_section(struct sframe_section *sec)
{
kfree(sec);
@@ -32,6 +375,15 @@ static int sframe_read_header(struct sframe_section *sec)
struct sframe_header shdr;
unsigned int num_fdes;
+ /* SFrame V3 is only supported on 64-bit architectures */
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_64BIT));
+
+ /*
+ * Unaligned access to 16/32-bit SFrame FRE fields and datawords
+ * using unsafe_get_user() via UNSAFE_GET_USER_INC()
+ */
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS));
+
if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
dbg("header usercopy failed\n");
return -EFAULT;
@@ -122,8 +474,10 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
sec->text_end = text_end;
ret = sframe_read_header(sec);
- if (ret)
+ if (ret) {
+ dbg_print_header(sec);
goto err_free;
+ }
ret = mtree_insert_range(sframe_mt, sec->text_start, sec->text_end - 1,
sec, GFP_KERNEL_ACCOUNT);
diff --git a/kernel/unwind/sframe_debug.h b/kernel/unwind/sframe_debug.h
new file mode 100644
index 000000000000..36352124cde8
--- /dev/null
+++ b/kernel/unwind/sframe_debug.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SFRAME_DEBUG_H
+#define _SFRAME_DEBUG_H
+
+#include <linux/sframe.h>
+#include "sframe.h"
+
+#ifdef CONFIG_DYNAMIC_DEBUG
+
+#define dbg(fmt, ...) \
+ pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__)
+
+static __always_inline void dbg_print_header(struct sframe_section *sec)
+{
+ unsigned long fdes_end;
+
+ fdes_end = sec->fdes_start + (sec->num_fdes * sizeof(struct sframe_fde_v3));
+
+ dbg("SEC: sframe:0x%lx-0x%lx text:0x%lx-0x%lx "
+ "fdes:0x%lx-0x%lx fres:0x%lx-0x%lx "
+ "ra_off:%d fp_off:%d\n",
+ sec->sframe_start, sec->sframe_end, sec->text_start, sec->text_end,
+ sec->fdes_start, fdes_end, sec->fres_start, sec->fres_end,
+ sec->ra_off, sec->fp_off);
+}
+
+#else /* !CONFIG_DYNAMIC_DEBUG */
+
+#define dbg(args...) no_printk(args)
+
+static inline void dbg_print_header(struct sframe_section *sec) {}
+
+#endif /* !CONFIG_DYNAMIC_DEBUG */
+
+#endif /* _SFRAME_DEBUG_H */
--
2.51.0
^ permalink raw reply related
* [PATCH v16 04/20] x86/uaccess: Add unsafe_copy_from_user() implementation
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Add an x86 implementation of unsafe_copy_from_user() similar to the
existing unsafe_copy_to_user().
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- unsafe_copy_from_user(): Use const void *__src. (Sashiko AI)
arch/x86/include/asm/uaccess.h | 39 +++++++++++++++++++++++++---------
1 file changed, 29 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 3a0dd3c2b233..235886106f31 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -598,7 +598,7 @@ _label: \
* We want the unsafe accessors to always be inlined and use
* the error labels - thus the macro games.
*/
-#define unsafe_copy_loop(dst, src, len, type, label) \
+#define unsafe_copy_to_user_loop(dst, src, len, type, label) \
while (len >= sizeof(type)) { \
unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \
dst += sizeof(type); \
@@ -606,15 +606,34 @@ _label: \
len -= sizeof(type); \
}
-#define unsafe_copy_to_user(_dst,_src,_len,label) \
-do { \
- char __user *__ucu_dst = (_dst); \
- const char *__ucu_src = (_src); \
- size_t __ucu_len = (_len); \
- unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \
- unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \
- unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \
- unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \
+#define unsafe_copy_to_user(_dst, _src, _len, label) \
+do { \
+ void __user *__dst = (_dst); \
+ const void *__src = (_src); \
+ size_t __len = (_len); \
+ unsafe_copy_to_user_loop(__dst, __src, __len, u64, label); \
+ unsafe_copy_to_user_loop(__dst, __src, __len, u32, label); \
+ unsafe_copy_to_user_loop(__dst, __src, __len, u16, label); \
+ unsafe_copy_to_user_loop(__dst, __src, __len, u8, label); \
+} while (0)
+
+#define unsafe_copy_from_user_loop(dst, src, len, type, label) \
+ while (len >= sizeof(type)) { \
+ unsafe_get_user(*(type *)(dst), (type __user *)(src), label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+#define unsafe_copy_from_user(_dst, _src, _len, label) \
+do { \
+ void *__dst = (_dst); \
+ const void __user *__src = (_src); \
+ size_t __len = (_len); \
+ unsafe_copy_from_user_loop(__dst, __src, __len, u64, label); \
+ unsafe_copy_from_user_loop(__dst, __src, __len, u32, label); \
+ unsafe_copy_from_user_loop(__dst, __src, __len, u16, label); \
+ unsafe_copy_from_user_loop(__dst, __src, __len, u8, label); \
} while (0)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
--
2.51.0
^ permalink raw reply related
* [PATCH v16 14/20] unwind_user: Flexible FP/RA recovery rules
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
To enable support for SFrame V3 flexible FDEs with a subsequent patch,
add support for the following flexible frame pointer (FP) and return
address (RA) recovery rules:
FP/RA = *(CFA + offset)
FP/RA = register + offset
FP/RA = *(register + offset)
Note that FP/RA recovery rules that use arbitrary register contents are
only valid when in the topmost frame, as their contents are otherwise
unknown.
This also enables unwinding of user space for architectures, such as
s390, that may save the frame pointer (FP) and/or return address (RA) in
other registers, for instance when in a leaf function.
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- Define dbg_once().
- unwind_user_get_reg(): Use pr_debug_once() instead of WARN_ON_ONCE()
to prevent user-triggered warning/panic. (Sashiko AI)
- unwind_user_next_common(): Handle UNWIND_USER_RULE_CFA_OFFSET for RA
and FP to use dbg_once() instead of WARN_ON_ONCE() to prevent user-
triggered warning/panic. (Sashiko AI)
Changes in v14:
- Improve comment on why UNWIND_USER_RULE_CFA_OFFSET is not implemented.
(Mark Rutland)
arch/x86/include/asm/unwind_user.h | 21 +++++++--
include/linux/unwind_user.h | 10 +++++
include/linux/unwind_user_types.h | 23 +++++++++-
kernel/unwind/sframe.c | 16 ++++++-
kernel/unwind/user.c | 70 +++++++++++++++++++++++++++---
5 files changed, 125 insertions(+), 15 deletions(-)
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
index 2dfb5ef11e36..9c3417be4283 100644
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -21,15 +21,26 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
#define ARCH_INIT_USER_FP_FRAME(ws) \
.cfa_off = 2*(ws), \
- .ra_off = -1*(ws), \
- .fp_off = -2*(ws), \
+ .ra = { \
+ .rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
+ .offset = -1*(ws), \
+ }, \
+ .fp = { \
+ .rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
+ .offset = -2*(ws), \
+ }, \
.use_fp = true, \
.outermost = false,
#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \
.cfa_off = 1*(ws), \
- .ra_off = -1*(ws), \
- .fp_off = 0, \
+ .ra = { \
+ .rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF,\
+ .offset = -1*(ws), \
+ }, \
+ .fp = { \
+ .rule = UNWIND_USER_RULE_RETAIN,\
+ }, \
.use_fp = false, \
.outermost = false,
@@ -41,4 +52,6 @@ static inline bool unwind_user_at_function_start(struct pt_regs *regs)
#endif /* CONFIG_HAVE_UNWIND_USER_FP */
+#include <asm-generic/unwind_user.h>
+
#endif /* _ASM_X86_UNWIND_USER_H */
diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h
index 7bf58f23aa64..6aca38f89ddd 100644
--- a/include/linux/unwind_user.h
+++ b/include/linux/unwind_user.h
@@ -33,6 +33,16 @@ static inline int unwind_user_get_ra_reg(unsigned long *val)
#define unwind_user_get_ra_reg unwind_user_get_ra_reg
#endif
+#ifndef unwind_user_get_reg
+static inline int unwind_user_get_reg(unsigned long *val, unsigned int regnum)
+{
+ pr_debug_once("%s (%d): unwind_user_get_reg(%u) not implemented\n",
+ current->comm, current->pid, regnum);
+ return -EINVAL;
+}
+#define unwind_user_get_reg unwind_user_get_reg
+#endif
+
int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries);
#endif /* _LINUX_UNWIND_USER_H */
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index 616cc5ee4586..0d02714a1b5d 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -27,10 +27,29 @@ struct unwind_stacktrace {
unsigned long *entries;
};
+#define UNWIND_USER_RULE_DEREF BIT(31)
+
+enum unwind_user_rule {
+ UNWIND_USER_RULE_RETAIN, /* entity = entity */
+ UNWIND_USER_RULE_CFA_OFFSET, /* entity = CFA + offset */
+ UNWIND_USER_RULE_REG_OFFSET, /* entity = register + offset */
+ /* DEREF variants */
+ UNWIND_USER_RULE_CFA_OFFSET_DEREF = /* entity = *(CFA + offset) */
+ UNWIND_USER_RULE_CFA_OFFSET | UNWIND_USER_RULE_DEREF,
+ UNWIND_USER_RULE_REG_OFFSET_DEREF = /* entity = *(register + offset) */
+ UNWIND_USER_RULE_REG_OFFSET | UNWIND_USER_RULE_DEREF,
+};
+
+struct unwind_user_rule_data {
+ enum unwind_user_rule rule;
+ s32 offset;
+ unsigned int regnum;
+};
+
struct unwind_user_frame {
s32 cfa_off;
- s32 ra_off;
- s32 fp_off;
+ struct unwind_user_rule_data ra;
+ struct unwind_user_rule_data fp;
bool use_fp;
bool outermost;
};
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index d573c2529926..29a874a67f32 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -285,6 +285,18 @@ static __always_inline int __read_fre(struct sframe_section *sec,
return -EFAULT;
}
+static __always_inline void
+sframe_init_rule_data(struct unwind_user_rule_data *rule_data,
+ s32 offset)
+{
+ if (offset) {
+ rule_data->rule = UNWIND_USER_RULE_CFA_OFFSET_DEREF;
+ rule_data->offset = offset;
+ } else {
+ rule_data->rule = UNWIND_USER_RULE_RETAIN;
+ }
+}
+
static __always_inline int __find_fre(struct sframe_section *sec,
struct sframe_fde_internal *fde,
unsigned long ip,
@@ -335,8 +347,8 @@ static __always_inline int __find_fre(struct sframe_section *sec,
fre = prev_fre;
frame->cfa_off = fre->cfa_off;
- frame->ra_off = fre->ra_off;
- frame->fp_off = fre->fp_off;
+ sframe_init_rule_data(&frame->ra, fre->ra_off);
+ sframe_init_rule_data(&frame->fp, fre->fp_off);
frame->use_fp = SFRAME_V3_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP;
frame->outermost = SFRAME_V3_FRE_RA_UNDEFINED_P(fre->info);
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index afa7c6f6d9b4..c6a2abac78e0 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -12,6 +12,17 @@
#include <linux/uaccess.h>
#include <linux/sframe.h>
+#ifdef CONFIG_DYNAMIC_DEBUG
+
+#define dbg_once(fmt, ...) \
+ pr_debug_once("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__)
+
+#else /* !CONFIG_DYNAMIC_DEBUG */
+
+#define dbg_once(args...) no_printk(args)
+
+#endif /* !CONFIG_DYNAMIC_DEBUG */
+
#define for_each_user_frame(state) \
for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
@@ -64,22 +75,67 @@ static int unwind_user_next_common(struct unwind_user_state *state,
return -EINVAL;
/* Get the Return Address (RA) */
- if (frame->ra_off) {
- if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
- return -EINVAL;
- } else {
+ switch (frame->ra.rule) {
+ case UNWIND_USER_RULE_RETAIN:
if (!state->topmost || unwind_user_get_ra_reg(&ra))
return -EINVAL;
+ break;
+ case UNWIND_USER_RULE_CFA_OFFSET:
+ /*
+ * RA = CFA + offset does not make sense.
+ * A return address cannot legitimately be a stack address.
+ */
+ dbg_once("UNWIND_USER_RULE_CFA_OFFSET invalid for RA\n");
+ return -EINVAL;
+ case UNWIND_USER_RULE_CFA_OFFSET_DEREF:
+ ra = cfa + frame->ra.offset;
+ break;
+ case UNWIND_USER_RULE_REG_OFFSET:
+ case UNWIND_USER_RULE_REG_OFFSET_DEREF:
+ if (!state->topmost || unwind_user_get_reg(&ra, frame->ra.regnum))
+ return -EINVAL;
+ ra += frame->ra.offset;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
+ if (frame->ra.rule & UNWIND_USER_RULE_DEREF &&
+ get_user_word(&ra, ra, 0, state->ws))
+ return -EINVAL;
/* Get the Frame Pointer (FP) */
- if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
+ switch (frame->fp.rule) {
+ case UNWIND_USER_RULE_RETAIN:
+ fp = state->fp;
+ break;
+ case UNWIND_USER_RULE_CFA_OFFSET:
+ /*
+ * FP = CFA + offset is currently not used for FP
+ * (e.g. SFrame cannot represent this rule).
+ */
+ dbg_once("UNWIND_USER_RULE_CFA_OFFSET unsupported for FP\n");
+ return -EINVAL;
+ case UNWIND_USER_RULE_CFA_OFFSET_DEREF:
+ fp = cfa + frame->fp.offset;
+ break;
+ case UNWIND_USER_RULE_REG_OFFSET:
+ case UNWIND_USER_RULE_REG_OFFSET_DEREF:
+ if (!state->topmost || unwind_user_get_reg(&fp, frame->fp.regnum))
+ return -EINVAL;
+ fp += frame->fp.offset;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ if (frame->fp.rule & UNWIND_USER_RULE_DEREF &&
+ get_user_word(&fp, fp, 0, state->ws))
return -EINVAL;
state->ip = ra;
state->sp = cfa;
- if (frame->fp_off)
- state->fp = fp;
+ state->fp = fp;
state->topmost = false;
return 0;
}
--
2.51.0
^ permalink raw reply related
* [PATCH v16 20/20] unwind_user/sframe: Add prctl() interface for registering .sframe sections
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
The kernel doesn't have direct visibility to the ELF contents of shared
libraries. Add some prctl() interfaces which allow glibc to tell the
kernel where to find .sframe sections.
[
This adds an interface for prctl() for testing loading of sframes for
libraries. But this interface should really be a system call. This patch
is for testing purposes only and should not be applied to mainline.
]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- Fix rebase error (missing break). (Sashiko AI)
Changes in v14:
- Bump PR_ADD_SFRAME and PR_REMOVE_SFRAME.
include/uapi/linux/prctl.h | 4 ++++
kernel/sys.c | 9 +++++++++
2 files changed, 13 insertions(+)
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b6ec6f693719..bd0bf828b033 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -416,4 +416,8 @@ struct prctl_mm_map {
# define PR_CFI_DISABLE _BITUL(1)
# define PR_CFI_LOCK _BITUL(2)
+/* SFRAME management */
+#define PR_ADD_SFRAME 82
+#define PR_REMOVE_SFRAME 83
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 62e842055cc9..b0a9b1e3ccd7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -65,6 +65,7 @@
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>
+#include <linux/sframe.h>
#include <linux/nospec.h>
@@ -2907,6 +2908,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg3 & PR_CFI_LOCK && !(arg3 & PR_CFI_DISABLE))
error = arch_prctl_lock_branch_landing_pad_state(me);
break;
+ case PR_ADD_SFRAME:
+ error = sframe_add_section(arg2, arg3, arg4, arg5);
+ break;
+ case PR_REMOVE_SFRAME:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = sframe_remove_section(arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
--
2.51.0
^ permalink raw reply related
* [PATCH v16 08/20] unwind_user: Stop when reaching an outermost frame
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
Add an indication for an outermost frame to the unwind user frame
structure and stop unwinding when reaching an outermost frame.
This will be used by unwind user sframe, as SFrame may represent an
undefined return address as indication for an outermost frame.
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
arch/x86/include/asm/unwind_user.h | 6 ++++--
include/linux/unwind_user_types.h | 1 +
kernel/unwind/user.c | 6 ++++++
3 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
index 6e469044e4de..2dfb5ef11e36 100644
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -23,13 +23,15 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
.cfa_off = 2*(ws), \
.ra_off = -1*(ws), \
.fp_off = -2*(ws), \
- .use_fp = true,
+ .use_fp = true, \
+ .outermost = false,
#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \
.cfa_off = 1*(ws), \
.ra_off = -1*(ws), \
.fp_off = 0, \
- .use_fp = false,
+ .use_fp = false, \
+ .outermost = false,
static inline bool unwind_user_at_function_start(struct pt_regs *regs)
{
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index 43e4b160883f..616cc5ee4586 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -32,6 +32,7 @@ struct unwind_user_frame {
s32 ra_off;
s32 fp_off;
bool use_fp;
+ bool outermost;
};
struct unwind_user_state {
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 1fb272419733..fdb1001e3750 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -32,6 +32,12 @@ static int unwind_user_next_common(struct unwind_user_state *state,
{
unsigned long cfa, fp, ra;
+ /* Stop unwinding when reaching an outermost frame. */
+ if (frame->outermost) {
+ state->done = true;
+ return 0;
+ }
+
/* Get the Canonical Frame Address (CFA) */
if (frame->use_fp) {
if (state->fp < state->sp)
--
2.51.0
^ permalink raw reply related
* [PATCH v16 13/20] unwind_user: Enable archs that pass RA in a register
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
Not all architectures/ABIs pass the return address (RA) on the stack on
function entry, like x86-64 does due to its CALL instruction pushing
the RA onto the stack. Architectures/ABIs, such as s390, also do not
require the RA to be saved on the stack in the function prologue. In
particular, the RA may never be saved to the stack at all, such as in
leaf functions. Unwinding must therefore not assume the presence of a
RA saved on stack for the topmost frame.
Treat a RA offset from CFA of zero as indication that the RA is not
saved (on the stack). For the topmost frame treat it as indication that
the RA is in the link/RA register, such as on arm64 and s390, and obtain
it from there. For non-topmost frames treat it as error, as the RA must
be saved.
Additionally allow the SP to be unchanged in the topmost frame, for
architectures where SP at function entry == SP at call site, such as
arm64 and s390.
Note that treating a RA offset from CFA of zero as indication that
the RA is not saved on the stack additionally allows for architectures,
such as s390, where the frame pointer (FP) may be saved without the RA
being saved as well, provided that such architectures represent this
in SFrame by encoding the "missing" RA offset using a padding RA offset
with a value of zero.
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- Define pr_fmt().
- unwind_user_get_ra_reg(): Use pr_debug_once() instead of
WARN_ON_ONCE() to prevent user-triggered warning/panic. (Sashiko AI)
- Reworded commit message. (Indu)
include/linux/unwind_user.h | 10 ++++++++++
kernel/unwind/sframe.c | 6 ++----
kernel/unwind/user.c | 20 ++++++++++++++++----
3 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h
index 64618618febd..7bf58f23aa64 100644
--- a/include/linux/unwind_user.h
+++ b/include/linux/unwind_user.h
@@ -23,6 +23,16 @@ static inline bool unwind_user_at_function_start(struct pt_regs *regs)
#define unwind_user_at_function_start unwind_user_at_function_start
#endif
+#ifndef unwind_user_get_ra_reg
+static inline int unwind_user_get_ra_reg(unsigned long *val)
+{
+ pr_debug_once("%s (%d): unwind_user_get_ra_reg() not implemented\n",
+ current->comm, current->pid);
+ return -EINVAL;
+}
+#define unwind_user_get_ra_reg unwind_user_get_ra_reg
+#endif
+
int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries);
#endif /* _LINUX_UNWIND_USER_H */
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index e6d66ae8e7ac..d573c2529926 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -257,10 +257,8 @@ static __always_inline int __read_fre(struct sframe_section *sec,
dataword_count--;
ra_off = sec->ra_off;
- if (!ra_off) {
- if (!dataword_count--)
- return -EINVAL;
-
+ if (!ra_off && dataword_count) {
+ dataword_count--;
UNSAFE_GET_USER_INC(ra_off, cur, dataword_size, Efault);
}
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index fdb1001e3750..afa7c6f6d9b4 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -2,6 +2,9 @@
/*
* Generic interfaces for unwinding user space
*/
+
+#define pr_fmt(fmt) "unwind_user: " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
@@ -48,8 +51,12 @@ static int unwind_user_next_common(struct unwind_user_state *state,
}
cfa += frame->cfa_off;
- /* Make sure that stack is not going in wrong direction */
- if (cfa <= state->sp)
+ /*
+ * Make sure that stack is not going in wrong direction. Allow SP
+ * to be unchanged for the topmost frame, by subtracting topmost,
+ * which is either 0 or 1.
+ */
+ if (cfa <= state->sp - state->topmost)
return -EINVAL;
/* Make sure that the address is word aligned */
@@ -57,8 +64,13 @@ static int unwind_user_next_common(struct unwind_user_state *state,
return -EINVAL;
/* Get the Return Address (RA) */
- if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
- return -EINVAL;
+ if (frame->ra_off) {
+ if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
+ return -EINVAL;
+ } else {
+ if (!state->topmost || unwind_user_get_ra_reg(&ra))
+ return -EINVAL;
+ }
/* Get the Frame Pointer (FP) */
if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
--
2.51.0
^ permalink raw reply related
* [PATCH v16 17/20] unwind_user/sframe: Separate reading of FRE from reading of FRE data words
From: Jens Remus @ 2026-05-21 14:25 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
In-Reply-To: <20260521142546.3908498-1-jremus@linux.ibm.com>
__find_fre() performs linear search for a matching SFrame FRE for a
given IP. For that purpose it uses __read_fre(), which reads the whole
FRE. That is the variable-size FRE structure as well as the trailing
variable-length array of variable-size data words. For the search logic
to skip over the FRE it would be sufficient to read the variable-size
FRE structure only, which includes the count and size of data words.
Add fields to struct sframe_fre_internal to store the FRE data word's
address, count, and size. Change __read_fre() to read the variable-
size FRE structure only and populate those new fields. Change
__read_fre_datawords() to use those new fields. Change __find_fre()
to use __read_fre_datawords() to read the FRE data words only after a
matching FRE has been found. Introduce safe_read_fre_datawords() and
use it in sframe_validate_section() to validate the FRE data words.
Reviewed-by: Indu Bhagat <ibhagatgnu@gmail.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v15:
- sframe_validate_section(): Fix format specifier for number of FREs
in debug message. (Sashiko AI)
Changes in v14:
- Adjust to rename of SFRAME_FDE_TYPE_* and
__read_default_fre_datawords().
- Update function name in debug message.
kernel/unwind/sframe.c | 99 +++++++++++++++++++++++++++---------------
1 file changed, 63 insertions(+), 36 deletions(-)
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index b623dca072da..7f439600b0f0 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -39,6 +39,9 @@ struct sframe_fre_internal {
u32 fp_ctl;
s32 fp_off;
u8 info;
+ unsigned long dw_addr;
+ unsigned char dw_count;
+ unsigned char dw_size;
};
DEFINE_STATIC_SRCU(sframe_srcu);
@@ -207,11 +210,11 @@ static __always_inline int __find_fde(struct sframe_section *sec,
static __always_inline int
__read_default_fre_datawords(struct sframe_section *sec,
struct sframe_fde_internal *fde,
- unsigned long cur,
- unsigned char dataword_count,
- unsigned char dataword_size,
struct sframe_fre_internal *fre)
{
+ unsigned char dataword_count = fre->dw_count;
+ unsigned char dataword_size = fre->dw_size;
+ unsigned long cur = fre->dw_addr;
s32 cfa_off, ra_off, fp_off;
unsigned int cfa_regnum;
@@ -253,11 +256,11 @@ __read_default_fre_datawords(struct sframe_section *sec,
static __always_inline int
__read_flex_fde_fre_datawords(struct sframe_section *sec,
struct sframe_fde_internal *fde,
- unsigned long cur,
- unsigned char dataword_count,
- unsigned char dataword_size,
struct sframe_fre_internal *fre)
{
+ unsigned char dataword_count = fre->dw_count;
+ unsigned char dataword_size = fre->dw_size;
+ unsigned long cur = fre->dw_addr;
u32 cfa_ctl, ra_ctl, fp_ctl;
s32 cfa_off, ra_off, fp_off;
@@ -325,24 +328,34 @@ __read_flex_fde_fre_datawords(struct sframe_section *sec,
static __always_inline int
__read_fre_datawords(struct sframe_section *sec,
struct sframe_fde_internal *fde,
- unsigned long cur,
- unsigned char dataword_count,
- unsigned char dataword_size,
struct sframe_fre_internal *fre)
{
unsigned char fde_type = SFRAME_V3_FDE_TYPE(fde->info2);
+ unsigned char dataword_count = fre->dw_count;
+
+ if (!dataword_count) {
+ /*
+ * A FRE without datawords indicates an outermost
+ * frame. Zero-initialize CFA, RA, and FP location
+ * info, except for the CFA control word, so that
+ * neither sframe_init_cfa_rule_data() nor
+ * sframe_init_rule_data() fail.
+ */
+ fre->cfa_ctl = (SFRAME_REG_SP << 3) | 1; /* regnum=SP, deref_p=0, reg_p=1 */
+ fre->cfa_off = 0;
+ fre->ra_ctl = 0;
+ fre->ra_off = 0;
+ fre->fp_ctl = 0;
+ fre->fp_off = 0;
+
+ return 0;
+ }
switch (fde_type) {
case SFRAME_FDE_TYPE_DEFAULT:
- return __read_default_fre_datawords(sec, fde, cur,
- dataword_count,
- dataword_size,
- fre);
+ return __read_default_fre_datawords(sec, fde, fre);
case SFRAME_FDE_TYPE_FLEX:
- return __read_flex_fde_fre_datawords(sec, fde, cur,
- dataword_count,
- dataword_size,
- fre);
+ return __read_flex_fde_fre_datawords(sec, fde, fre);
default:
return -EINVAL;
}
@@ -385,26 +398,11 @@ static __always_inline int __read_fre(struct sframe_section *sec,
fre->size = addr_size + 1 + (dataword_count * dataword_size);
fre->ip_off = ip_off;
fre->info = info;
+ fre->dw_addr = cur;
+ fre->dw_count = dataword_count;
+ fre->dw_size = dataword_size;
- if (!dataword_count) {
- /*
- * A FRE without datawords indicates an outermost
- * frame. Zero-initialize CFA, RA, and FP location
- * info, except for the CFA control word, so that
- * neither sframe_init_cfa_rule_data() nor
- * sframe_init_rule_data() fail.
- */
- fre->cfa_ctl = (SFRAME_REG_SP << 3) | 1; /* regnum=SP, deref_p=0, reg_p=1 */
- fre->cfa_off = 0;
- fre->ra_ctl = 0;
- fre->ra_off = 0;
- fre->fp_ctl = 0;
- fre->fp_off = 0;
-
- return 0;
- }
-
- return __read_fre_datawords(sec, fde, cur, dataword_count, dataword_size, fre);
+ return 0;
Efault:
return -EFAULT;
@@ -527,6 +525,10 @@ static __always_inline int __find_fre(struct sframe_section *sec,
return -EINVAL;
fre = prev_fre;
+ ret = __read_fre_datawords(sec, fde, fre);
+ if (ret)
+ return ret;
+
ret = sframe_init_cfa_rule_data(&frame->cfa, fre->cfa_ctl, fre->cfa_off);
if (ret)
return ret;
@@ -610,6 +612,20 @@ static int safe_read_fre(struct sframe_section *sec,
return ret;
}
+static int safe_read_fre_datawords(struct sframe_section *sec,
+ struct sframe_fde_internal *fde,
+ struct sframe_fre_internal *fre)
+{
+ int ret;
+
+ if (!user_read_access_begin((void __user *)sec->sframe_start,
+ sec->sframe_end - sec->sframe_start))
+ return -EFAULT;
+ ret = __read_fre_datawords(sec, fde, fre);
+ user_read_access_end();
+ return ret;
+}
+
static int sframe_validate_section(struct sframe_section *sec)
{
struct sframe_fde_internal fde;
@@ -648,6 +664,17 @@ static int sframe_validate_section(struct sframe_section *sec)
fde.rep_size);
return ret;
}
+ ret = safe_read_fre_datawords(sec, &fde, &fre);
+ if (ret) {
+ dbg_sec("FDE %u: safe_read_fre_datawords(%u) failed\n", i, j);
+ dbg_sec("FDE: func_addr:%#lx func_size:%#x fda_off:%#x fres_off:%#x fres_num:%u info:%u info2:%u rep_size:%u\n",
+ fde.func_addr, fde.func_size,
+ fde.fda_off,
+ fde.fres_off, fde.fres_num,
+ fde.info, fde.info2,
+ fde.rep_size);
+ return ret;
+ }
if (j && fre.ip_off <= prev_ip_off) {
dbg_sec("FDE %u: FRE %u not sorted\n", i, j);
--
2.51.0
^ permalink raw reply related
* Re: [PATCH v6 05/43] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
From: Ackerley Tng @ 2026-05-21 14:29 UTC (permalink / raw)
To: Sean Christopherson, Fuad Tabba
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka, kvm,
linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <ag8JIlHjohAOC3-g@google.com>
Sean Christopherson <seanjc@google.com> writes:
> On Thu, May 21, 2026, Fuad Tabba wrote:
>> On Wed, 20 May 2026 at 22:44, Ackerley Tng <ackerleytng@google.com> wrote:
>> >
>> > Fuad Tabba <tabba@google.com> writes:
>> >
>> > >
>> > > [...snip...]
>> > >
>> > >> +unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
>> > >> +{
>> > >> + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>> > >> + struct inode *inode;
>> > >> +
>> > >> + /*
>> > >> + * If this gfn has no associated memslot, there's no chance of the gfn
>> > >> + * being backed by private memory, since guest_memfd must be used for
>> > >> + * private memory, and guest_memfd must be associated with some memslot.
>> > >> + */
>> > >> + if (!slot)
>> > >> + return 0;
>> > >> +
>> > >> + CLASS(gmem_get_file, file)(slot);
>> > >> + if (!file)
>> > >> + return 0;
>> > >> +
>> > >> + inode = file_inode(file);
>> > >> +
>> > >> + /*
>> > >> + * Rely on the maple tree's internal RCU lock to ensure a
>> > >> + * stable result. This result can become stale as soon as the
>> > >> + * lock is dropped, so the caller _must_ still protect
>> > >> + * consumption of private vs. shared by checking
>> > >> + * mmu_invalidate_retry_gfn() under mmu_lock to serialize
>> > >> + * against ongoing attribute updates.
>> > >> + */
>> > >> + return kvm_gmem_get_attributes(inode, kvm_gmem_get_index(slot, gfn));
>> > >> +}
>> > >
>> > > Doesn't this imply that all consumers of kvm_mem_is_private() should
>> > > validate the result using mmu_lock and the invalidation sequence?
>> >
>> > Let me know how I can improve the comment.
>>
>> Given Sean's context, the comment is good I think. I would quibble
>> with the "_must_ still protect" phrasing being a bit too strict.
>>
>> Maybe just soften it slightly to acknowledge the exception? Something like:
>>
>> * lock is dropped, so callers that require a strict result _must_ protect
>> * consumption of private vs. shared by checking mmu_invalidate_retry_gfn()
>> * under mmu_lock to serialize against ongoing attribute updates. Callers
>> * doing lockless reads must be able to tolerate a stale result.
>>
>> That aligns the comment with how KVM is actually using it today. That
>> said, this is nitpicking. Feel free to use or ignore.
>
> Hmm, I wonder if we can figure out a way to consolidate some documentation,
> because this is _exactly_ the same pattern that x86's host_pfn_mapping_level()
> deals with (see its big comment below).
>
This would be great, are you thinking an actual comment or something in
Documentation/?
Perhaps we could iterate on this a little with me providing the newbie
perspective. Do you want me to take a stab at writing something up?
> There's also the stale comment in kvm_invalidate_memslot(), which, stating the
> obvious, speaks to the memslot+SRCU side of things.
>
> Maybe it makes sense to find a central location for one giant comment about
> how MMU notifier events and memslot+SRCU protections work? And then refer
> to that in paths where some asset needs to be tied into MMU notifiers and/or
> memslots+SRCU?
>
> [*] https://lore.kernel.org/all/agcbWe8s9lmPuJwG@google.com
>
> [...snip...]
>
^ permalink raw reply
* Re: [PATCH v4 1/2] tracing: Return ERR_PTR() from expr_str()
From: Steven Rostedt @ 2026-05-21 14:30 UTC (permalink / raw)
To: Pengpeng Hou
Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-trace-kernel,
linux-kernel
In-Reply-To: <20260521022817.38453-1-pengpeng@iscas.ac.cn>
On Thu, 21 May 2026 10:28:16 +0800
Pengpeng Hou <pengpeng@iscas.ac.cn> wrote:
> diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
> index 0dbbf6cca9bc..0b33bb8ef6f7 100644
> --- a/kernel/trace/trace_events_hist.c
> +++ b/kernel/trace/trace_events_hist.c
> @@ -1769,18 +1769,18 @@ static void expr_field_str(struct hist_field *field, char *expr)
>
> static char *expr_str(struct hist_field *field, unsigned int level)
> {
> - char *expr;
> + char *expr __free(kfree) = NULL;
Can you split this into two patches?
1. Change expr to use __free(kfree)
2. Update to use ERR_PTR()
as they are two distinct changes.
Thanks,
-- Steve
>
> if (level > 1)
> - return NULL;
> + return ERR_PTR(-EINVAL);
>
> expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
> if (!expr)
> - return NULL;
> + return ERR_PTR(-ENOMEM);
>
> if (!field->operands[0]) {
> expr_field_str(field, expr);
> - return expr;
> + return_ptr(expr);
> }
^ permalink raw reply
* Re: [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Wei Yang @ 2026-05-21 14:32 UTC (permalink / raw)
To: Vernon Yang
Cc: Wei Yang, Nico Pache, linux-doc, linux-kernel, linux-mm,
linux-trace-kernel, aarcange, akpm, anshuman.khandual, apopple,
baohua, baolin.wang, byungchul, catalin.marinas, cl, corbet,
dave.hansen, david, dev.jain, gourry, hannes, hughd, jack,
jackmanb, jannh, jglisse, joshua.hahnjy, kas, lance.yang, liam,
ljs, mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
pfalcato, rakie.kim, raquini, rdunlap, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <91015820-f39a-4b06-89de-b49e5ca465fd@gmail.com>
On Thu, May 21, 2026 at 01:11:18PM +0800, Vernon Yang wrote:
>On Thu, May 21, 2026 at 02:46:54AM +0000, Wei Yang wrote:
>> On Thu, May 21, 2026 at 10:36:15AM +0800, Vernon Yang wrote:
>> >On Mon, May 11, 2026 at 12:58:11PM -0600, Nico Pache wrote:
>> >> Enable khugepaged to collapse to mTHP orders. This patch implements the
>> >> main scanning logic using a bitmap to track occupied pages and a stack
>> >> structure that allows us to find optimal collapse sizes.
>> >>
>> >> Previous to this patch, PMD collapse had 3 main phases, a light weight
>> >> scanning phase (mmap_read_lock) that determines a potential PMD
>> >> collapse, an alloc phase (mmap unlocked), then finally heavier collapse
>> >> phase (mmap_write_lock).
>> >>
>> >> To enable mTHP collapse we make the following changes:
>> >>
>> >> During PMD scan phase, track occupied pages in a bitmap. When mTHP
>> >> orders are enabled, we remove the restriction of max_ptes_none during the
>> >> scan phase to avoid missing potential mTHP collapse candidates. Once we
>> >> have scanned the full PMD range and updated the bitmap to track occupied
>> >> pages, we use the bitmap to find the optimal mTHP size.
>> >>
>> >> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
>> >> and determine the best eligible order for the collapse. A stack structure
>> >> is used instead of traditional recursion to manage the search. This also
>> >> prevents a traditional recursive approach when the kernel stack struct is
>> >> limited. The algorithm recursively splits the bitmap into smaller chunks to
>> >> find the highest order mTHPs that satisfy the collapse criteria. We start
>> >> by attempting the PMD order, then move on to consecutively lower orders
>> >> (mTHP collapse). The stack maintains a pair of variables (offset, order),
>> >> indicating the number of PTEs from the start of the PMD, and the order of
>> >> the potential collapse candidate.
>> >>
>> >> The algorithm for consuming the bitmap works as such:
>> >> 1) push (0, HPAGE_PMD_ORDER) onto the stack
>> >> 2) pop the stack
>> >> 3) check if the number of set bits in that (offset,order) pair
>> >> satisfy the max_ptes_none threshold for that order
>> >> 4) if yes, attempt collapse
>> >> 5) if no (or collapse fails), push two new stack items representing
>> >> the left and right halves of the current bitmap range, at the
>> >> next lower order
>> >> 6) repeat at step (2) until stack is empty.
>> >>
>> >> Below is a diagram representing the algorithm and stack items:
>> >>
>> >> offset mid_offset
>> >> | |
>> >> | |
>> >> v v
>> >> ____________________________________
>> >> | PTE Page Table |
>> >> --------------------------------------
>> >> <-------><------->
>> >> order-1 order-1
>> >>
>> >> mTHP collapses reject regions containing swapped out or shared pages.
>> >> This is because adding new entries can lead to new none pages, and these
>> >> may lead to constant promotion into a higher order mTHP. A similar
>> >> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
>> >> introducing at least 2x the number of pages, and on a future scan will
>> >> satisfy the promotion condition once again. This issue is prevented via
>> >> the collapse_max_ptes_none() function which imposes the max_ptes_none
>> >> restrictions above.
>> >>
>> >> We currently only support mTHP collapse for max_ptes_none values of 0
>> >> and HPAGE_PMD_NR - 1, resulting in the following behavior:
>> >>
>> >> - max_ptes_none=0: Never introduce new empty pages during collapse
>> >> - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>> >> available mTHP order
>> >>
>> >> Any other max_ptes_none value will emit a warning and skip mTHP collapse
>> >> attempts. There should be no behavior change for PMD collapse.
>> >>
>> >> Once we determine which mTHP size fits best in that PMD range, a collapse
>> >> is attempted. A minimum collapse order of 2 is used as this is the lowest
>> >> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
>> >>
>> >> Currently madv_collapse is not supported and will only attempt PMD
>> >> collapse.
>> >>
>> >> We can also remove the check for is_khugepaged inside the PMD scan as
>> >> the collapse_max_ptes_none() function handles this logic now.
>> >>
>> >> Signed-off-by: Nico Pache <npache@redhat.com>
>> >> ---
>> >> mm/khugepaged.c | 182 +++++++++++++++++++++++++++++++++++++++++++++---
>> >> 1 file changed, 174 insertions(+), 8 deletions(-)
>> >>
>> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> >> index 3492b135d667..39bf7ea8a6e8 100644
>> >> --- a/mm/khugepaged.c
>> >> +++ b/mm/khugepaged.c
>> >> @@ -100,6 +100,30 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>> >>
>> >> static struct kmem_cache *mm_slot_cache __ro_after_init;
>> >>
>> >> +#define KHUGEPAGED_MIN_MTHP_ORDER 2
>> >> +/*
>> >> + * mthp_collapse() does an iterative DFS over a binary tree, from
>> >> + * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
>> >> + * size needed for a DFS on a binary tree is height + 1, where
>> >> + * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
>> >> + *
>> >> + * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
>> >> + * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
>> >> + */
>> >> +#define MTHP_STACK_SIZE (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
>> >> +
>> >> +/*
>> >> + * Defines a range of PTE entries in a PTE page table which are being
>> >> + * considered for mTHP collapse.
>> >> + *
>> >> + * @offset: the offset of the first PTE entry in a PMD range.
>> >> + * @order: the order of the PTE entries being considered for collapse.
>> >> + */
>> >> +struct mthp_range {
>> >> + u16 offset;
>> >> + u8 order;
>> >> +};
>> >> +
>> >> struct collapse_control {
>> >> bool is_khugepaged;
>> >>
>> >> @@ -111,6 +135,12 @@ struct collapse_control {
>> >>
>> >> /* nodemask for allocation fallback */
>> >> nodemask_t alloc_nmask;
>> >> +
>> >> + /* Each bit represents a single occupied (!none/zero) page. */
>> >> + DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
>> >> + /* A mask of the current range being considered for mTHP collapse. */
>> >> + DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
>> >> + struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
>> >> };
>> >>
>> >> /**
>> >> @@ -1404,20 +1434,140 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
>> >> return result;
>> >> }
>> >>
>> >> +static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
>> >> + u16 offset, u8 order)
>> >> +{
>> >> + const int size = *stack_size;
>> >> + struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
>> >> +
>> >> + VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
>> >> + stack->order = order;
>> >> + stack->offset = offset;
>> >> + (*stack_size)++;
>> >> +}
>> >> +
>> >> +static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
>> >> + int *stack_size)
>> >> +{
>> >> + const int size = *stack_size;
>> >> +
>> >> + VM_WARN_ON_ONCE(size <= 0);
>> >> + (*stack_size)--;
>> >> + return cc->mthp_bitmap_stack[size - 1];
>> >> +}
>> >> +
>> >> +static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
>> >> + u16 offset, unsigned int nr_ptes)
>> >> +{
>> >> + bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
>> >> + bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
>> >> + return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
>> >> +}
>> >> +
>> >> +/*
>> >> + * mthp_collapse() consumes the bitmap that is generated during
>> >> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
>> >> + *
>> >> + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
>> >> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
>> >> + * of the bitmap for collapse eligibility. The stack maintains a pair of
>> >> + * variables (offset, order), indicating the number of PTEs from the start of
>> >> + * the PMD, and the order of the potential collapse candidate respectively. We
>> >> + * start at the PMD order and check if it is eligible for collapse; if not, we
>> >> + * add two entries to the stack at a lower order to represent the left and right
>> >> + * halves of the PTE page table we are examining.
>> >> + *
>> >> + * offset mid_offset
>> >> + * | |
>> >> + * | |
>> >> + * v v
>> >> + * --------------------------------------
>> >> + * | cc->mthp_bitmap |
>> >> + * --------------------------------------
>> >> + * <-------><------->
>> >> + * order-1 order-1
>> >> + *
>> >> + * For each of these, we determine how many PTE entries are occupied in the
>> >> + * range of PTE entries we propose to collapse, then we compare this to a
>> >> + * threshold number of PTE entries which would need to be occupied for a
>> >> + * collapse to be permitted at that order (accounting for max_ptes_none).
>> >> + *
>> >> + * If a collapse is permitted, we attempt to collapse the PTE range into a
>> >> + * mTHP.
>> >> + */
>> >> +static int mthp_collapse(struct mm_struct *mm, unsigned long address,
>> >> + int referenced, int unmapped, struct collapse_control *cc,
>> >> + unsigned long enabled_orders)
>> >> +{
>> >> + unsigned int nr_occupied_ptes, nr_ptes;
>> >> + int max_ptes_none, collapsed = 0, stack_size = 0;
>> >> + unsigned long collapse_address;
>> >> + struct mthp_range range;
>> >> + u16 offset;
>> >> + u8 order;
>> >> +
>> >> + collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
>> >> +
>> >> + while (stack_size) {
>> >> + range = collapse_mthp_stack_pop(cc, &stack_size);
>> >> + order = range.order;
>> >> + offset = range.offset;
>> >> + nr_ptes = 1UL << order;
>> >> +
>> >> + if (!test_bit(order, &enabled_orders))
>> >> + goto next_order;
>> >> +
>> >> + max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
>> >> +
>> >> + if (max_ptes_none < 0)
>> >> + return collapsed;
>> >> +
>> >> + nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
>> >> + nr_ptes);
>> >> +
>> >> + if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
>> >> + int ret;
>> >> +
>> >> + collapse_address = address + offset * PAGE_SIZE;
>> >> + ret = collapse_huge_page(mm, collapse_address, referenced,
>> >> + unmapped, cc, order);
>> >> + if (ret == SCAN_SUCCEED) {
>> >> + collapsed += nr_ptes;
>> >> + continue;
>> >> + }
>> >> + }
>> >> +
>> >> +next_order:
>> >> + if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
>> >
>> >Hi Nico, thank you very much for your contributions to this series.
>> >
>> >I found a minor issue, for MADV_COLLAPSE, if collapse_huge_page() fails
>> >for some reason (e.g. allocate folio), it goes to next_order and
>> >continues splitting to the next small order. However, enabled_orders
>> >only supports HPAGE_PMD_ORDER, so it keeps running the split operations
>> >without any effective work until KHUGEPAGED_MIN_MTHP_ORDER is reached
>> >before exiting. For khugepaged, e.g. with only 2MB set to always, the
>> >same phenomenon occurs.
>>
>> Yes, but it does no actual work since the order is checked right after the pop.
>>
>> >
>> >This does not affect the overall functionality of mthp collapse, just
>> >redundant.
>> >
>> >The redundant operations can be easily skipped with the following
>> >modification. If I missed something, please let me know. Thanks!
>> >
>> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> >index 1a25af3d6d0f..fa407cce525c 100644
>> >--- a/mm/khugepaged.c
>> >+++ b/mm/khugepaged.c
>> >@@ -1574,7 +1574,7 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
>> > }
>> >
>> > next_order:
>> >- if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
>> >+ if ((BIT(order) - 1) & enabled_orders) {
>> > const u8 next_order = order - 1;
>> > const u16 mid_offset = offset + (nr_ptes / 2);
>> >
>>
>> This would stop the iteration if there are other lower enabled orders, right?
> ^^^^ ^^^^^^^^^^^^^^^^^^^
>
>NO :)
Got it. You are right.
The logic here is: if none of the lower bits are set, skip the rest.
>
>For more details, please refer to the following information.
>
>| Scenario | Old Behavior (order > 2) | New Behavior ((BIT(order)-1) & enabled_orders) |
>|-------------------------------------|--------------------------|------------------------------------------------|
>| MADV_COLLAPSE | Splits 9,8,7,...,3 | No split |
>| khugepaged, only 2MB enabled | Splits 9,8,7,...,3 | No split |
>| khugepaged, only 2MB + 64KB enabled | Splits 9,8,7,...,3 | Splits 9,8,7,...,5 |
>| khugepaged, only 32KB enabled | Splits 9,8,7,...,3 | Splits 9,8,7,...,4 |
>| khugepaged, only 16KB enabled | Splits 9,8,7,...,3 | Splits 9,8,7,...,3 |
>| khugepaged, all mTHP enabled | Splits 9,8,7,...,3 | Splits 9,8,7,...,3 |
>
>--
>Cheers,
>Vernon
--
Wei Yang
Help you, Help me
^ permalink raw reply
* [PATCH 0/3] Fix out-of-tree build of some tools
From: Ben Hutchings @ 2026-05-21 14:34 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Steven Rostedt, Tomas Glozar
Cc: linux-perf-users, linux-trace-kernel
[-- Attachment #1: Type: text/plain, Size: 593 bytes --]
perf and rtla currently don't fully support out-of-tree builds, as
they may still create files in their source directory. This series
fixes all the instances of this problem that I have found.
Ben.
Ben Hutchings (3):
rtla: Fix output files in source tree
perf tools: Put Python egg info in output directory
perf tools: Put Python bytecode in output directory
tools/perf/Makefile.perf | 9 ++++++++-
tools/tracing/rtla/Makefile | 31 ++++++++++++++++++-----------
tools/tracing/rtla/tests/timerlat.t | 4 ++--
3 files changed, 29 insertions(+), 15 deletions(-)
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* [PATCH 1/3] rtla: Fix output files in source tree
From: Ben Hutchings @ 2026-05-21 14:35 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Steven Rostedt, Tomas Glozar
Cc: linux-perf-users, linux-trace-kernel
In-Reply-To: <ag8X7gcDw6jpJsLq@decadent.org.uk>
[-- Attachment #1: Type: text/plain, Size: 4742 bytes --]
Some output files (src/timerlat.bpf.o, src/timerlat.skel.h,
example/timerlat_bpf_action.o, tests/bpf/bpf_action_map.o) are
currently generated in the source tree, preventing a fully out-of-tree
build. To fix this:
- Add $(OUTPUT) to their filenames in the relevant Makefile rules, and
create subdirectories as needed
- Add $(OUTPUT)src to the include path
- Add ${OUTPUT} to the BPF object filename in tests/timerlat.t
Fixes: e34293ddcebd ("rtla/timerlat: Add BPF skeleton to collect samples")
Fixes: 0304a3b7ec9a ("rtla/timerlat: Add example for BPF action program")
Fixes: 5525aebd4e0c ("rtla/tests: Test BPF action program")
Signed-off-by: Ben Hutchings <benh@debian.org>
---
tools/tracing/rtla/Makefile | 31 ++++++++++++++++++-----------
tools/tracing/rtla/tests/timerlat.t | 4 ++--
2 files changed, 21 insertions(+), 14 deletions(-)
diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 45690ee14544..f54da7be735d 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -66,30 +66,37 @@ ifeq ($(config),1)
include Makefile.config
endif
+INCLUDES = -I$(OUTPUT)src
+
CFLAGS += $(INCLUDES) $(LIB_INCLUDES)
export CFLAGS OUTPUT srctree
ifeq ($(BUILD_BPF_SKEL),1)
-src/timerlat.bpf.o: src/timerlat.bpf.c
+$(OUTPUT)src/timerlat.bpf.o: src/timerlat.bpf.c
+ mkdir -p $(@D)
$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
-src/timerlat.skel.h: src/timerlat.bpf.o
+$(OUTPUT)src/timerlat.skel.h: $(OUTPUT)src/timerlat.bpf.o
+ mkdir -p $(@D)
$(QUIET_GENSKEL)$(SYSTEM_BPFTOOL) gen skeleton $< > $@
-example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+$(OUTPUT)example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+ mkdir -p $(@D)
$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
-tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+$(OUTPUT)tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+ mkdir -p $(@D)
$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
else
-src/timerlat.skel.h:
- $(Q)echo '/* BPF skeleton is disabled */' > src/timerlat.skel.h
+$(OUTPUT)src/timerlat.skel.h:
+ mkdir -p $(@D)
+ $(Q)echo '/* BPF skeleton is disabled */' > $@
-example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+$(OUTPUT)example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
$(Q)echo "BPF skeleton support is disabled, skipping example/timerlat_bpf_action.o"
-tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+$(OUTPUT)tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
$(Q)echo "BPF skeleton support is disabled, skipping tests/bpf/bpf_action_map.o"
endif
@@ -103,7 +110,7 @@ static: $(RTLA_IN)
rtla.%: fixdep FORCE
make -f $(srctree)/tools/build/Makefile.build dir=. $@
-$(RTLA_IN): fixdep FORCE src/timerlat.skel.h
+$(RTLA_IN): fixdep FORCE $(OUTPUT)src/timerlat.skel.h
make $(build)=rtla
clean: doc_clean fixdep-clean
@@ -111,10 +118,10 @@ clean: doc_clean fixdep-clean
$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
$(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-*
$(Q)rm -rf feature
- $(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o
+ $(Q)rm -f $(OUTPUT)src/timerlat.bpf.o $(OUTPUT)src/timerlat.skel.h $(OUTPUT)example/timerlat_bpf_action.o
$(Q)rm -f $(UNIT_TESTS)
-check: $(RTLA) tests/bpf/bpf_action_map.o
+check: $(RTLA) $(OUTPUT)tests/bpf/bpf_action_map.o
RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f -v tests/
-examples: example/timerlat_bpf_action.o
+examples: $(OUTPUT)example/timerlat_bpf_action.o
.PHONY: FORCE clean check
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index fd4935fd7b49..e0f3fc4df655 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -74,12 +74,12 @@ then
# Test BPF action program properly in BPF mode
[ -z "$BPFTOOL" ] && BPFTOOL=bpftool
check "hist with BPF action program (BPF mode)" \
- "timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o --on-threshold shell,command='$BPFTOOL map dump name rtla_test_map'" \
+ "timerlat hist -T 2 --bpf-action ${OUTPUT}tests/bpf/bpf_action_map.o --on-threshold shell,command='$BPFTOOL map dump name rtla_test_map'" \
2 '"value": 42'
else
# Test BPF action program failure in non-BPF mode
check "hist with BPF action program (non-BPF mode)" \
- "timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o" \
+ "timerlat hist -T 2 --bpf-action ${OUTPUT}tests/bpf/bpf_action_map.o" \
1 "BPF actions are not supported in tracefs-only mode"
fi
done
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply related
* [PATCH 2/3] perf tools: Put Python egg info in output directory
From: Ben Hutchings @ 2026-05-21 14:35 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Steven Rostedt, Tomas Glozar
Cc: linux-perf-users, linux-trace-kernel
In-Reply-To: <ag8X7gcDw6jpJsLq@decadent.org.uk>
[-- Attachment #1: Type: text/plain, Size: 990 bytes --]
Installing the Python extension currently creates an egg-info
directory in the source tree, preventing a fully out-of-tree build.
Add the necessary runes to the setup.py command line to relocate the
egg-info directory in an out-of-tree build.
Signed-off-by: Ben Hutchings <benh@debian.org>
---
tools/perf/Makefile.perf | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index cee19c923c06..899a4249a42f 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1152,7 +1152,9 @@ install-bin: install-tools install-tests
install: install-bin try-install-man
install-python_ext:
- $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'
+ $(PYTHON_WORD) util/setup.py --quiet \
+ $(if $(OUTPUT),egg_info --egg-base $(OUTPUT)) \
+ install --root='/$(DESTDIR_SQ)'
# 'make install-doc' should call 'make -C Documentation install'
$(INSTALL_DOC_TARGETS):
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox