linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree
       [not found] <20250827201548.448472904@kernel.org>
@ 2025-08-27 20:15 ` Steven Rostedt
  2025-08-28  1:46   ` Liam R. Howlett
  2025-08-27 20:15 ` [PATCH v10 05/11] unwind_user/sframe: Detect .sframe sections in executables Steven Rostedt
  1 sibling, 1 reply; 6+ messages in thread
From: Steven Rostedt @ 2025-08-27 20:15 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel, bpf, x86
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Josh Poimboeuf,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Indu Bhagat,
	Jose E. Marchesi, Beau Belgrave, Jens Remus, Linus Torvalds,
	Andrew Morton, Florian Weimer, Sam James, Kees Cook,
	Carlos O'Donell, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, linux-mm

From: Josh Poimboeuf <jpoimboe@kernel.org>

Associate an sframe section with its mm by adding it to a per-mm maple
tree which is indexed by the corresponding text address range.  A single
sframe section can be associated with multiple text ranges.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: x86@kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/x86/include/asm/mmu.h |  2 +-
 include/linux/mm_types.h   |  3 +++
 include/linux/sframe.h     | 13 +++++++++
 kernel/fork.c              | 10 +++++++
 kernel/unwind/sframe.c     | 55 +++++++++++++++++++++++++++++++++++---
 mm/init-mm.c               |  2 ++
 6 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0fe9c569d171..227a32899a59 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -87,7 +87,7 @@ typedef struct {
 	.context = {							\
 		.ctx_id = 1,						\
 		.lock = __MUTEX_INITIALIZER(mm.context.lock),		\
-	}
+	},
 
 void leave_mm(void);
 #define leave_mm leave_mm
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 08bc2442db93..31fbd6663047 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1210,6 +1210,9 @@ struct mm_struct {
 #ifdef CONFIG_MM_ID
 		mm_id_t mm_id;
 #endif /* CONFIG_MM_ID */
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+		struct maple_tree sframe_mt;
+#endif
 	} __randomize_layout;
 
 	/*
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
index 0584f661f698..73bf6f0b30c2 100644
--- a/include/linux/sframe.h
+++ b/include/linux/sframe.h
@@ -22,18 +22,31 @@ struct sframe_section {
 	signed char	fp_off;
 };
 
+#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
+extern void sframe_free_mm(struct mm_struct *mm);
+
 extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
 			      unsigned long text_start, unsigned long text_end);
 extern int sframe_remove_section(unsigned long sframe_addr);
 
+static inline bool current_has_sframe(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	return mm && !mtree_empty(&mm->sframe_mt);
+}
+
 #else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
 
+#define INIT_MM_SFRAME
+static inline void sframe_free_mm(struct mm_struct *mm) {}
 static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
 				     unsigned long text_start, unsigned long text_end)
 {
 	return -ENOSYS;
 }
 static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
+static inline bool current_has_sframe(void) { return false; }
 
 #endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index af673856499d..496781b389bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -106,6 +106,7 @@
 #include <linux/pidfs.h>
 #include <linux/tick.h>
 #include <linux/unwind_deferred.h>
+#include <linux/sframe.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -690,6 +691,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_destroy_cid(mm);
 	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 	futex_hash_free(mm);
+	sframe_free_mm(mm);
 
 	free_mm(mm);
 }
@@ -1027,6 +1029,13 @@ static void mmap_init_lock(struct mm_struct *mm)
 #endif
 }
 
+static void mm_init_sframe(struct mm_struct *mm)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+	mt_init(&mm->sframe_mt);
+#endif
+}
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
@@ -1055,6 +1064,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->pmd_huge_pte = NULL;
 #endif
 	mm_init_uprobes_state(mm);
+	mm_init_sframe(mm);
 	hugetlb_count_init(mm);
 
 	if (current->mm) {
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 20287f795b36..fa7d87ffd00a 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -122,15 +122,64 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
 	if (ret)
 		goto err_free;
 
-	/* TODO nowhere to store it yet - just free it and return an error */
-	ret = -ENOSYS;
+	ret = mtree_insert_range(sframe_mt, sec->text_start, sec->text_end, sec, GFP_KERNEL);
+	if (ret) {
+		dbg("mtree_insert_range failed: text=%lx-%lx\n",
+		    sec->text_start, sec->text_end);
+		goto err_free;
+	}
+
+	return 0;
 
 err_free:
 	free_section(sec);
 	return ret;
 }
 
+static int __sframe_remove_section(struct mm_struct *mm,
+				   struct sframe_section *sec)
+{
+	if (!mtree_erase(&mm->sframe_mt, sec->text_start)) {
+		dbg("mtree_erase failed: text=%lx\n", sec->text_start);
+		return -EINVAL;
+	}
+
+	free_section(sec);
+
+	return 0;
+}
+
 int sframe_remove_section(unsigned long sframe_start)
 {
-	return -ENOSYS;
+	struct mm_struct *mm = current->mm;
+	struct sframe_section *sec;
+	unsigned long index = 0;
+	bool found = false;
+	int ret = 0;
+
+	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
+		if (sec->sframe_start == sframe_start) {
+			found = true;
+			ret |= __sframe_remove_section(mm, sec);
+		}
+	}
+
+	if (!found || ret)
+		return -EINVAL;
+
+	return 0;
+}
+
+void sframe_free_mm(struct mm_struct *mm)
+{
+	struct sframe_section *sec;
+	unsigned long index = 0;
+
+	if (!mm)
+		return;
+
+	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
+		free_section(sec);
+
+	mtree_destroy(&mm->sframe_mt);
 }
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4600e7605cab..b32fcf167cc2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -11,6 +11,7 @@
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
 #include <linux/iommu.h>
+#include <linux/sframe.h>
 #include <asm/mmu.h>
 
 #ifndef INIT_MM_CONTEXT
@@ -46,6 +47,7 @@ struct mm_struct init_mm = {
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 	INIT_MM_CONTEXT(init_mm)
+	INIT_MM_SFRAME
 };
 
 void setup_initial_init_mm(void *start_code, void *end_code,
-- 
2.50.1




^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v10 05/11] unwind_user/sframe: Detect .sframe sections in executables
       [not found] <20250827201548.448472904@kernel.org>
  2025-08-27 20:15 ` [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree Steven Rostedt
@ 2025-08-27 20:15 ` Steven Rostedt
  1 sibling, 0 replies; 6+ messages in thread
From: Steven Rostedt @ 2025-08-27 20:15 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel, bpf, x86
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Josh Poimboeuf,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Indu Bhagat,
	Jose E. Marchesi, Beau Belgrave, Jens Remus, Linus Torvalds,
	Andrew Morton, Florian Weimer, Sam James, Kees Cook,
	Carlos O'Donell, linux-mm

From: Josh Poimboeuf <jpoimboe@kernel.org>

When loading an ELF executable, automatically detect an .sframe section
and associate it with the mm_struct.

Cc: linux-mm@kvack.org
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/binfmt_elf.c          | 49 +++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/elf.h |  1 +
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 264fba0d44bd..1fd7623cf9a5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -47,6 +47,7 @@
 #include <linux/dax.h>
 #include <linux/uaccess.h>
 #include <linux/rseq.h>
+#include <linux/sframe.h>
 #include <asm/param.h>
 #include <asm/page.h>
 
@@ -622,6 +623,21 @@ static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state,
 	return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp);
 }
 
+static void elf_add_sframe(struct elf_phdr *text, struct elf_phdr *sframe,
+			   unsigned long base_addr)
+{
+	unsigned long sframe_start, sframe_end, text_start, text_end;
+
+	sframe_start = base_addr + sframe->p_vaddr;
+	sframe_end   = sframe_start + sframe->p_memsz;
+
+	text_start   = base_addr + text->p_vaddr;
+	text_end     = text_start + text->p_memsz;
+
+	/* Ignore return value, sframe section isn't critical */
+	sframe_add_section(sframe_start, sframe_end, text_start, text_end);
+}
+
 /* This is much more generalized than the library routine read function,
    so we keep this separate.  Technically the library read function
    is only provided so that we can read a.out libraries that have
@@ -632,7 +648,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		unsigned long no_base, struct elf_phdr *interp_elf_phdata,
 		struct arch_elf_state *arch_state)
 {
-	struct elf_phdr *eppnt;
+	struct elf_phdr *eppnt, *sframe_phdr = NULL;
 	unsigned long load_addr = 0;
 	int load_addr_set = 0;
 	unsigned long error = ~0UL;
@@ -658,7 +674,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
 	eppnt = interp_elf_phdata;
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
-		if (eppnt->p_type == PT_LOAD) {
+		switch (eppnt->p_type) {
+		case PT_LOAD: {
 			int elf_type = MAP_PRIVATE;
 			int elf_prot = make_prot(eppnt->p_flags, arch_state,
 						 true, true);
@@ -697,6 +714,20 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				error = -ENOMEM;
 				goto out;
 			}
+			break;
+		}
+		case PT_GNU_SFRAME:
+			sframe_phdr = eppnt;
+			break;
+		}
+	}
+
+	if (sframe_phdr) {
+		eppnt = interp_elf_phdata;
+		for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+			if (eppnt->p_flags & PF_X) {
+				elf_add_sframe(eppnt, sframe_phdr, load_addr);
+			}
 		}
 	}
 
@@ -821,7 +852,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	int first_pt_load = 1;
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
-	struct elf_phdr *elf_property_phdata = NULL;
+	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
 	unsigned long elf_brk;
 	bool brk_moved = false;
 	int retval, i;
@@ -930,6 +961,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 				executable_stack = EXSTACK_DISABLE_X;
 			break;
 
+		case PT_GNU_SFRAME:
+			sframe_phdr = elf_ppnt;
+			break;
+
 		case PT_LOPROC ... PT_HIPROC:
 			retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
 						  bprm->file, false,
@@ -1227,6 +1262,14 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			elf_brk = k;
 	}
 
+	if (sframe_phdr) {
+		for (i = 0, elf_ppnt = elf_phdata;
+		     i < elf_ex->e_phnum; i++, elf_ppnt++) {
+			if ((elf_ppnt->p_flags & PF_X))
+				elf_add_sframe(elf_ppnt, sframe_phdr, load_bias);
+		}
+	}
+
 	e_entry = elf_ex->e_entry + load_bias;
 	phdr_addr += load_bias;
 	elf_brk += load_bias;
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 819ded2d39de..92c16c94fca8 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -41,6 +41,7 @@ typedef __u16	Elf64_Versym;
 #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
 #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
 #define PT_GNU_PROPERTY	(PT_LOOS + 0x474e553)
+#define PT_GNU_SFRAME	(PT_LOOS + 0x474e554)
 
 
 /* ARM MTE memory tag segment type */
-- 
2.50.1




^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree
  2025-08-27 20:15 ` [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree Steven Rostedt
@ 2025-08-28  1:46   ` Liam R. Howlett
  2025-08-28 14:28     ` Steven Rostedt
  0 siblings, 1 reply; 6+ messages in thread
From: Liam R. Howlett @ 2025-08-28  1:46 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, linux-trace-kernel, bpf, x86, Masami Hiramatsu,
	Mathieu Desnoyers, Josh Poimboeuf, Peter Zijlstra, Ingo Molnar,
	Jiri Olsa, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Andrii Nakryiko, Indu Bhagat, Jose E. Marchesi,
	Beau Belgrave, Jens Remus, Linus Torvalds, Andrew Morton,
	Florian Weimer, Sam James, Kees Cook, Carlos O'Donell,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H. Peter Anvin,
	David Hildenbrand, Lorenzo Stoakes, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, linux-mm

* Steven Rostedt <rostedt@kernel.org> [250827 16:24]:
> From: Josh Poimboeuf <jpoimboe@kernel.org>
> 
> Associate an sframe section with its mm by adding it to a per-mm maple
> tree which is indexed by the corresponding text address range.  A single
> sframe section can be associated with multiple text ranges.
> 
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Mike Rapoport <rppt@kernel.org>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: x86@kernel.org
> Cc: linux-mm@kvack.org
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
>  arch/x86/include/asm/mmu.h |  2 +-
>  include/linux/mm_types.h   |  3 +++
>  include/linux/sframe.h     | 13 +++++++++
>  kernel/fork.c              | 10 +++++++
>  kernel/unwind/sframe.c     | 55 +++++++++++++++++++++++++++++++++++---
>  mm/init-mm.c               |  2 ++
>  6 files changed, 81 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 0fe9c569d171..227a32899a59 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -87,7 +87,7 @@ typedef struct {
>  	.context = {							\
>  		.ctx_id = 1,						\
>  		.lock = __MUTEX_INITIALIZER(mm.context.lock),		\
> -	}
> +	},
>  
>  void leave_mm(void);
>  #define leave_mm leave_mm
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 08bc2442db93..31fbd6663047 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1210,6 +1210,9 @@ struct mm_struct {
>  #ifdef CONFIG_MM_ID
>  		mm_id_t mm_id;
>  #endif /* CONFIG_MM_ID */
> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> +		struct maple_tree sframe_mt;
> +#endif
>  	} __randomize_layout;
>  
>  	/*
> diff --git a/include/linux/sframe.h b/include/linux/sframe.h
> index 0584f661f698..73bf6f0b30c2 100644
> --- a/include/linux/sframe.h
> +++ b/include/linux/sframe.h
> @@ -22,18 +22,31 @@ struct sframe_section {
>  	signed char	fp_off;
>  };
>  
> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
> +extern void sframe_free_mm(struct mm_struct *mm);
> +
>  extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
>  			      unsigned long text_start, unsigned long text_end);
>  extern int sframe_remove_section(unsigned long sframe_addr);
>  
> +static inline bool current_has_sframe(void)
> +{
> +	struct mm_struct *mm = current->mm;
> +
> +	return mm && !mtree_empty(&mm->sframe_mt);
> +}
> +
>  #else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
>  
> +#define INIT_MM_SFRAME
> +static inline void sframe_free_mm(struct mm_struct *mm) {}
>  static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
>  				     unsigned long text_start, unsigned long text_end)
>  {
>  	return -ENOSYS;
>  }
>  static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
> +static inline bool current_has_sframe(void) { return false; }
>  
>  #endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index af673856499d..496781b389bc 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -106,6 +106,7 @@
>  #include <linux/pidfs.h>
>  #include <linux/tick.h>
>  #include <linux/unwind_deferred.h>
> +#include <linux/sframe.h>
>  
>  #include <asm/pgalloc.h>
>  #include <linux/uaccess.h>
> @@ -690,6 +691,7 @@ void __mmdrop(struct mm_struct *mm)
>  	mm_destroy_cid(mm);
>  	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
>  	futex_hash_free(mm);
> +	sframe_free_mm(mm);
>  
>  	free_mm(mm);
>  }
> @@ -1027,6 +1029,13 @@ static void mmap_init_lock(struct mm_struct *mm)
>  #endif
>  }
>  
> +static void mm_init_sframe(struct mm_struct *mm)
> +{
> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> +	mt_init(&mm->sframe_mt);
> +#endif
> +}
> +
>  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	struct user_namespace *user_ns)
>  {
> @@ -1055,6 +1064,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	mm->pmd_huge_pte = NULL;
>  #endif
>  	mm_init_uprobes_state(mm);
> +	mm_init_sframe(mm);
>  	hugetlb_count_init(mm);
>  
>  	if (current->mm) {
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> index 20287f795b36..fa7d87ffd00a 100644
> --- a/kernel/unwind/sframe.c
> +++ b/kernel/unwind/sframe.c
> @@ -122,15 +122,64 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
>  	if (ret)
>  		goto err_free;
>  
> -	/* TODO nowhere to store it yet - just free it and return an error */
> -	ret = -ENOSYS;
> +	ret = mtree_insert_range(sframe_mt, sec->text_start, sec->text_end, sec, GFP_KERNEL);
> +	if (ret) {
> +		dbg("mtree_insert_range failed: text=%lx-%lx\n",
> +		    sec->text_start, sec->text_end);
> +		goto err_free;
> +	}
> +
> +	return 0;
>  
>  err_free:
>  	free_section(sec);
>  	return ret;
>  }
>  
> +static int __sframe_remove_section(struct mm_struct *mm,
> +				   struct sframe_section *sec)
> +{
> +	if (!mtree_erase(&mm->sframe_mt, sec->text_start)) {
> +		dbg("mtree_erase failed: text=%lx\n", sec->text_start);
> +		return -EINVAL;
> +	}
> +
> +	free_section(sec);
> +
> +	return 0;
> +}
> +
>  int sframe_remove_section(unsigned long sframe_start)
>  {
> -	return -ENOSYS;
> +	struct mm_struct *mm = current->mm;
> +	struct sframe_section *sec;
> +	unsigned long index = 0;
> +	bool found = false;
> +	int ret = 0;
> +
> +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
> +		if (sec->sframe_start == sframe_start) {
> +			found = true;
> +			ret |= __sframe_remove_section(mm, sec);
> +		}
> +	}

If you use the advanced interface you have to handle the locking, but it
will be faster.  I'm not sure how frequently you loop across many entries,
but you can do something like:

MA_STATE(mas, &mm->sframe_mt, index, index);

mas_lock(&mas);
mas_for_each(&mas, sec, ULONG_MAX) {
...
}
mas_unlock(&mas);

The maple state contains memory addresses of internal nodes, so you
cannot just edit the tree without it being either unlocked (which
negates the gains you would have) or by using it in the modification.

This seems like a good choice considering the __sframe_remove_section()
is called from only one place. You can pass the struct ma_state through
to the remove function and use it with mas_erase().

Actually, reading it again,  why are you starting a search at 0?  And
why are you deleting everything after the sframe_start to ULONG_MAX?
This seems incorrect.  Can you explain your plan a bit here?

> +
> +	if (!found || ret)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +void sframe_free_mm(struct mm_struct *mm)
> +{
> +	struct sframe_section *sec;
> +	unsigned long index = 0;
> +
> +	if (!mm)
> +		return;
> +
> +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
> +		free_section(sec);
> +
> +	mtree_destroy(&mm->sframe_mt);

The same goes for this function.  mt_for_each will start at the top of
the tree, lock, find your result, unlock.  Each search starts from the
top of tree because it was unlocked.  In the mas_ functions, the tree is
iterated in place which can be significantly faster depending on the
tree size.

Since you are not going to edit the tree you can use a maple state:

struct sframe_section *sec;
MA_STATE(mas, &mm->sframe_mt, 0, 0);

mas_lock(&mas);
mas_for_each(&mas, sec, ULONG_MAX)
        free_section(sec);

mas_unlock(&mas);
mtree_destroy(&mm->sframe_mt);


>  }
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 4600e7605cab..b32fcf167cc2 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -11,6 +11,7 @@
>  #include <linux/atomic.h>
>  #include <linux/user_namespace.h>
>  #include <linux/iommu.h>
> +#include <linux/sframe.h>
>  #include <asm/mmu.h>
>  
>  #ifndef INIT_MM_CONTEXT
> @@ -46,6 +47,7 @@ struct mm_struct init_mm = {
>  	.user_ns	= &init_user_ns,
>  	.cpu_bitmap	= CPU_BITS_NONE,
>  	INIT_MM_CONTEXT(init_mm)
> +	INIT_MM_SFRAME
>  };
>  
>  void setup_initial_init_mm(void *start_code, void *end_code,
> -- 
> 2.50.1
> 
> 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree
  2025-08-28  1:46   ` Liam R. Howlett
@ 2025-08-28 14:28     ` Steven Rostedt
  2025-08-28 15:27       ` Liam R. Howlett
  0 siblings, 1 reply; 6+ messages in thread
From: Steven Rostedt @ 2025-08-28 14:28 UTC (permalink / raw)
  To: Liam R. Howlett
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel, bpf, x86,
	Masami Hiramatsu, Mathieu Desnoyers, Josh Poimboeuf,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Indu Bhagat,
	Jose E. Marchesi, Beau Belgrave, Jens Remus, Linus Torvalds,
	Andrew Morton, Florian Weimer, Sam James, Kees Cook,
	Carlos O'Donell, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, David Hildenbrand, Lorenzo Stoakes,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	linux-mm

On Wed, 27 Aug 2025 21:46:01 -0400
"Liam R. Howlett" <Liam.Howlett@oracle.com> wrote:

> >  int sframe_remove_section(unsigned long sframe_start)
> >  {
> > -	return -ENOSYS;
> > +	struct mm_struct *mm = current->mm;
> > +	struct sframe_section *sec;
> > +	unsigned long index = 0;
> > +	bool found = false;
> > +	int ret = 0;
> > +
> > +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
> > +		if (sec->sframe_start == sframe_start) {
> > +			found = true;
> > +			ret |= __sframe_remove_section(mm, sec);
> > +		}
> > +	}  
> 

Josh should be able to answer this better than I can, as he wrote it, and
I'm not too familiar with how to use maple tree (reading the documentation
now).

> If you use the advanced interface you have to handle the locking, but it
> will be faster.  I'm not sure how frequent you loop across many entries,
> but you can do something like:
> 
> MA_STATE(mas, &mm->sframe_mt, index, index);
> 
> mas_lock(&mas);
> mas_for_each(&mas, sec, ULONG_MAX) {
> ...
> }
> mas_unlock(&mas);
> 
> The maple state contains memory addresses of internal nodes, so you
> cannot just edit the tree without it being either unlocked (which
> negates the gains you would have) or by using it in the modification.
> 
> This seems like a good choice considering the __sframe_remove_section()
> is called from only one place. You can pass the struct ma_state through
> to the remove function and use it with mas_erase().
> 
> Actually, reading it again,  why are you starting a search at 0?  And
> why are you deleting everything after the sframe_start to ULONG_MAX?
> This seems incorrect.  Can you explain your plan a bit here?

Let me give a brief overview of how and why maple trees are used for
sframes:

The sframe section is mapped to the user space address from the elf file
when the application starts. The dynamic library loader could also do a
system call to tell the kernel where the sframe is for some dynamically
loaded code. Since there can be more than one text section that has an
sframe associated with it, the mm->sframe_mt is used to hold the range of
text to find its corresponding sframe section. That is, there's one sframe
section for the code that was loaded during exec(), and then there may be a
separate sframe section for every library that is loaded. Note, it is
possible that the same sframe section may cover more than one range of text.

When doing stack walking, the instruction pointer is used as the key in the
maple tree to find its corresponding sframe section.

Now, if the sframe is determined to be corrupted, it must be removed from
the current->mm->sframe_mt. It also gets removed when the dynamic loader
removes some text from the application that has the code.

I'm guessing that the 0 to ULONG_MAX is to simply find and remove all the
associated sframe sections, as there may be more than one text range that a
single sframe section covers.

Does this make sense?

Thanks for reviewing!

-- Steve

> 
> > +
> > +	if (!found || ret)
> > +		return -EINVAL;
> > +
> > +	return 0;
> > +}
> > +
> > +void sframe_free_mm(struct mm_struct *mm)
> > +{
> > +	struct sframe_section *sec;
> > +	unsigned long index = 0;
> > +
> > +	if (!mm)
> > +		return;
> > +
> > +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
> > +		free_section(sec);
> > +
> > +	mtree_destroy(&mm->sframe_mt);  
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree
  2025-08-28 14:28     ` Steven Rostedt
@ 2025-08-28 15:27       ` Liam R. Howlett
  2025-08-28 15:51         ` Steven Rostedt
  0 siblings, 1 reply; 6+ messages in thread
From: Liam R. Howlett @ 2025-08-28 15:27 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel, bpf, x86,
	Masami Hiramatsu, Mathieu Desnoyers, Josh Poimboeuf,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Indu Bhagat,
	Jose E. Marchesi, Beau Belgrave, Jens Remus, Linus Torvalds,
	Andrew Morton, Florian Weimer, Sam James, Kees Cook,
	Carlos O'Donell, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, David Hildenbrand, Lorenzo Stoakes,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	linux-mm

* Steven Rostedt <rostedt@goodmis.org> [250828 10:28]:
> On Wed, 27 Aug 2025 21:46:01 -0400
> "Liam R. Howlett" <Liam.Howlett@oracle.com> wrote:
> 
> > >  int sframe_remove_section(unsigned long sframe_start)
> > >  {
> > > -	return -ENOSYS;
> > > +	struct mm_struct *mm = current->mm;
> > > +	struct sframe_section *sec;
> > > +	unsigned long index = 0;
> > > +	bool found = false;
> > > +	int ret = 0;
> > > +
> > > +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
> > > +		if (sec->sframe_start == sframe_start) {
> > > +			found = true;
> > > +			ret |= __sframe_remove_section(mm, sec);
> > > +		}
> > > +	}  
> > 
> 
> Josh should be able to answer this better than I can, as he wrote it, and
> I'm not too familiar with how to use maple tree (reading the documentation
> now).
> 
> > If you use the advanced interface you have to handle the locking, but it
> > will be faster.  I'm not sure how frequent you loop across many entries,
> > but you can do something like:
> > 
> > > MA_STATE(mas, &mm->sframe_mt, index, index);
> > 
> > mas_lock(&mas);
> > mas_for_each(&mas, sec, ULONG_MAX) {
> > ...
> > }
> > mas_unlock(&mas);
> > 
> > The maple state contains memory addresses of internal nodes, so you
> > cannot just edit the tree without it being either unlocked (which
> > negates the gains you would have) or by using it in the modification.
> > 
> > This seems like a good choice considering the __sframe_remove_section()
> > is called from only one place. You can pass the struct ma_state through
> > to the remove function and use it with mas_erase().
> > 
> > Actually, reading it again,  why are you starting a search at 0?  And
> > why are you deleting everything after the sframe_start to ULONG_MAX?
> > This seems incorrect.  Can you explain your plan a bit here?
> 
> Let me give a brief overview of how and why maple trees are used for
> sframes:
> 
> The sframe section is mapped to the user space address from the elf file
> when the application starts. The dynamic library loader could also do a
> system call to tell the kernel where the sframe is for some dynamically
> loaded code. Since there can be more than one text section that has an
> sframe associated to it, the mm->sframe_mt is used to hold the range of
> text to find its corresponding sframe section. That is, there's one sframe
> section for the code that was loaded during exec(), and then there may be a
> separate sframe section for every library that is loaded. Note, it is
> possible that the same sframe section may cover more than one range of text.
> 
> When doing stack walking, the instruction pointer is used as the key in the
> maple tree to find its corresponding sframe section.
> 
> Now, if the sframe is determined to be corrupted, it must be removed from
> the current->mm->sframe_mt. It also gets removed when the dynamic loader
> removes some text from the application that has the code.
> 
> I'm guessing that the 0 to ULONG_MAX is to simply find and remove all the
> associated sframe sections, as there may be more than one text range that a
> single sframe section covers.
> 
> Does this make sense?
> 

Perhaps it's the corruption part that I'm missing here.  If the sframe
is corrupt, you are iterating over all elements and checking the start
address passed in against the section start.

So if the section is corrupted then how can we depend on the
sec->sframe_start?

And is the maple tree corrupted?  I mean, the mapping of sframe_start
-> sec is still reliable, right?

Looking at the storing code, you store text_start - text_end to sec,
presumably the text_start cannot be smaller than the sframe_start?

Thanks,
Liam


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree
  2025-08-28 15:27       ` Liam R. Howlett
@ 2025-08-28 15:51         ` Steven Rostedt
  0 siblings, 0 replies; 6+ messages in thread
From: Steven Rostedt @ 2025-08-28 15:51 UTC (permalink / raw)
  To: Liam R. Howlett
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel, bpf, x86,
	Masami Hiramatsu, Mathieu Desnoyers, Josh Poimboeuf,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Indu Bhagat,
	Jose E. Marchesi, Beau Belgrave, Jens Remus, Linus Torvalds,
	Andrew Morton, Florian Weimer, Sam James, Kees Cook,
	Carlos O'Donell, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, David Hildenbrand, Lorenzo Stoakes,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	linux-mm

On Thu, 28 Aug 2025 11:27:00 -0400
"Liam R. Howlett" <Liam.Howlett@oracle.com> wrote:

> > Does this make sense?
> >   
> 
> Perhaps it's the corruption part that I'm missing here.  If the sframe
> is corrupt, you are iterating over all elements and checking the start
> address passed in against the section start.
> 
> So if the section is corrupted then how can we depend on the
> sec->sframe_start?
> 
> And is the maple tree corrupted?  I mean, the mappings to sframe_start
> -> sec is still reliable, right?  
> 
> Looking at the storing code, you store text_start - text_end to sec,
> presumably the text_start cannot be smaller than the sframe_start?

Sorry, that's not what gets corrupted. I should have expanded on it.

The sframe section is two tables that describe how to get the return
address from text locations, much like how ORC works in the kernel. We get
a start and end address of where the sframe exists (that has the two
tables) and a start and end address of the text it represents.

When I said "corrupted", I meant that the sframe tables are totally created
by user space and can not be trusted. While reading the sframe tables, if
there's any anomaly that is found, it is considered "corrupted". So no, the
start and end of where the sframes are and where the text is should be
validated at the start (I need to check that we do ;-).

But once we start reading the sframe tables, they could hold garbage, or
have something in there that the kernel doesn't support. As soon as that is
detected, it gets removed so that it isn't looked at again.

-- Steve


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-08-28 15:55 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20250827201548.448472904@kernel.org>
2025-08-27 20:15 ` [PATCH v10 02/11] unwind_user/sframe: Store sframe section data in per-mm maple tree Steven Rostedt
2025-08-28  1:46   ` Liam R. Howlett
2025-08-28 14:28     ` Steven Rostedt
2025-08-28 15:27       ` Liam R. Howlett
2025-08-28 15:51         ` Steven Rostedt
2025-08-27 20:15 ` [PATCH v10 05/11] unwind_user/sframe: Detect .sframe sections in executables Steven Rostedt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).