public inbox for linux-arm-kernel@lists.infradead.org
 help / color / mirror / Atom feed
* [PATCH] arm64: Add user stacktrace support
@ 2023-11-18 13:45 qiwuchen55
  2023-11-18 20:40 ` kernel test robot
  2023-11-20 14:29 ` Mark Rutland
  0 siblings, 2 replies; 8+ messages in thread
From: qiwuchen55 @ 2023-11-18 13:45 UTC (permalink / raw)
  To: catalin.marinas, will; +Cc: linux-arm-kernel, linux-kernel, chenqiwu

From: chenqiwu <qiwu.chen@transsion.com>

1. Introduce and export arch_dump_user_stacktrace() API to support
user stacktrace dump for a user task (both current and non-current task).
An example of the user stacktrace log format is shown below:
[test-515] Dump user backtrace:
<0xffffb0c1a750> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
<0xaaaacbf8097c> in /mnt/test[aaaacbf80000-aaaacbf81000]
<0xffffb0b778b8> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
<0xaaaacbf80834> in /mnt/test[aaaacbf80000-aaaacbf81000]

2. Add arch_stack_walk_user() implementation to support userstacktrace trace option.
An example of the ftrace userstacktrace output format is shown below:
    bash-489     [000] .....  2167.660775: sched_process_fork: comm=bash pid=489 child_comm=bash child_pid=596
    bash-489     [000] .....  2167.660787: <user stack trace>
 => /lib/aarch64-linux-gnu/libc-2.32.so[+0xa76d8]
 => /bin/bash[+0x5f354]
 => /bin/bash[+0x4876c]
 => /bin/bash[+0x4aec4]
 => /bin/bash[+0x4da48]
 => /bin/bash[+0x4b710]
 => /bin/bash[+0x4c31c]
 => /bin/bash[+0x339b0]

Tested-by: chenqiwu <qiwu.chen@transsion.com>
Signed-off-by: chenqiwu <qiwu.chen@transsion.com>
---
 arch/arm64/Kconfig             |   1 +
 arch/arm64/kernel/stacktrace.c | 208 +++++++++++++++++++++++++++++++++
 include/linux/stacktrace.h     |  10 ++
 3 files changed, 219 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b071a004..4c5066f88 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -255,6 +255,7 @@ config ARM64
 	select TRACE_IRQFLAGS_SUPPORT
 	select TRACE_IRQFLAGS_NMI_SUPPORT
 	select HAVE_SOFTIRQ_ON_OWN_STACK
+	select USER_STACKTRACE_SUPPORT
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 17f66a74c..4e7bf2922 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -215,6 +215,214 @@ static bool dump_backtrace_entry(void *arg, unsigned long where)
 	return true;
 }
 
+/* The struct defined for AArch64 userspace stack frame */
+struct stack_frame_user {
+	unsigned long fp;
+	unsigned long sp;
+	unsigned long pc;
+};
+
+/*
+ * The function of AArch64 userspace stack frame unwind method.
+ * Note: If the caller is not current task, it's supposed to call
+ * access_process_vm() to access another task's address space.
+ */
+static int arch_unwind_user_frame(struct task_struct *tsk, unsigned long high,
+				struct stack_frame_user *frame)
+{
+	int ret = 0;
+	unsigned long fp = frame->fp;
+	unsigned long low = frame->sp;
+
+	if (fp < low || fp > high || fp & 0xf)
+		return -EFAULT;
+
+	frame->sp = fp + 0x10;
+	/* Disable page fault to make sure get_user going on wheels */
+	pagefault_disable();
+	if (tsk == current) {
+		if (get_user(frame->fp, (unsigned long __user *)fp) ||
+			get_user(frame->pc, (unsigned long __user *)(fp + 8)))
+			ret = -EFAULT;
+	} else {
+		if (access_process_vm(tsk, fp, &frame->fp,
+			sizeof(unsigned long), 0) != sizeof(unsigned long) ||
+			access_process_vm(tsk, fp + 0x08, &frame->pc,
+			sizeof(unsigned long), 0) != sizeof(unsigned long))
+			ret = -EFAULT;
+	}
+	pagefault_enable();
+
+	return ret;
+}
+
+/*
+ * Print the executable address and corresponding VMA info.
+ */
+static void print_vma_addr_info(char *prefix, struct task_struct *task,
+				unsigned long ip, const char *loglvl)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+
+	if (task != current)
+		mm = get_task_mm(task);
+	else
+		mm = task->mm;
+
+	if (!mm)
+		return;
+	/*
+	 * we might be running from an atomic context so we cannot sleep
+	 */
+	if (!mmap_read_trylock(mm)) {
+		mmput(mm);
+		return;
+	}
+
+	vma = find_vma(mm, ip);
+	if (vma && vma->vm_file) {
+		struct file *f = vma->vm_file;
+		char *buf = (char *)__get_free_page(GFP_NOWAIT);
+
+		if (buf) {
+			char *p;
+
+			p = file_path(f, buf, PAGE_SIZE);
+			if (IS_ERR(p))
+				p = "?";
+			printk("%s%s%s[%lx-%lx]\n", loglvl, prefix, p,
+					vma->vm_start,
+					vma->vm_end);
+			free_page((unsigned long)buf);
+		}
+	}
+	mmap_read_unlock(mm);
+	if (task != current)
+		mmput(mm);
+}
+
+static struct vm_area_struct *find_user_stack_vma(struct task_struct *task, unsigned long sp)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+
+	if (task != current)
+		mm = get_task_mm(task);
+	else
+		mm = task->mm;
+
+	if (!mm)
+		return NULL;
+	/*
+	 * we might be running from an atomic context so we cannot sleep
+	 */
+	if (!mmap_read_trylock(mm)) {
+		mmput(mm);
+		return NULL;
+	}
+	vma = find_vma(mm, sp);
+	mmap_read_unlock(mm);
+	if (task != current)
+		mmput(mm);
+
+	return vma;
+}
+
+static void dump_user_backtrace_entry(struct task_struct *tsk,
+				unsigned long where, const char *loglvl)
+{
+	char prefix[64];
+
+	snprintf(prefix, sizeof(prefix), "<0x%lx> in ", where);
+	print_vma_addr_info(prefix, tsk, where, loglvl);
+}
+
+void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
+								const char *loglvl)
+{
+	struct stack_frame_user frame;
+	struct vm_area_struct *vma;
+	unsigned long userstack_start, userstack_end;
+
+	if (!tsk)
+		tsk = current;
+
+	/*
+	 * If @regs is not specified or caller is not current task,
+	 * @regs is supposed to get from @tsk.
+	 */
+	if (!regs || tsk != current)
+		regs = task_pt_regs(tsk);
+
+	/* TODO: support stack unwind for compat user mode */
+	if (compat_user_mode(regs))
+		return;
+
+	userstack_start = regs->user_regs.sp;
+	vma = find_user_stack_vma(tsk, userstack_start);
+	if (!vma)
+		return;
+
+	userstack_end = vma->vm_end;
+	frame.fp = regs->user_regs.regs[29];
+	frame.sp = userstack_start;
+	frame.pc = regs->user_regs.pc;
+
+	printk("%s[%s-%d] Dump user backtrace:\n", loglvl, tsk->comm, tsk->pid);
+	while (1) {
+		unsigned long where = frame.pc;
+
+		if (!where || where & 0x3)
+			break;
+		dump_user_backtrace_entry(tsk, where, loglvl);
+		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
+			break;
+	}
+}
+EXPORT_SYMBOL_GPL(arch_dump_user_stacktrace);
+
+/**
+ * arch_stack_walk_user - Save user space stack traces into a storage array
+ * @consume_entry: Callback for saving a user space stack trace entry
+ * @cookie:	Caller supplied pointer handed back by arch_stack_walk()
+ * @regs: The pt_regs pointer of current task
+ */
+void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
+			  const struct pt_regs *regs)
+{
+	struct stack_frame_user frame;
+	struct vm_area_struct *vma;
+	unsigned long userstack_start, userstack_end;
+	struct task_struct *tsk = current;
+
+	/* TODO: support stack unwind for compat user mode */
+	if (!regs || !user_mode(regs) || compat_user_mode(regs))
+		return;
+
+	userstack_start = regs->user_regs.sp;
+	vma = find_user_stack_vma(tsk, userstack_start);
+	if (!vma)
+		return;
+
+	userstack_end = vma->vm_end;
+	frame.fp = regs->user_regs.regs[29];
+	frame.sp = userstack_start;
+	frame.pc = regs->user_regs.pc;
+
+	while (1) {
+		unsigned long where = frame.pc;
+
+		/* Sanity check: ABI requires pc to be aligned 4 bytes. */
+		if (!where || where & 0x3)
+			break;
+		if (!consume_entry(cookie, where))
+			break;
+		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
+			break;
+	}
+}
+
 void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 		    const char *loglvl)
 {
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 97455880a..bc5a7bf56 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -60,6 +60,16 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie,
 
 void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
 			  const struct pt_regs *regs);
+
+/**
+ * arch_dump_user_stacktrace - Architecture specific function to dump the
+ *			       stack trace for user process
+ * @regs: Pointer to the pt_regs of user process
+ * @tsk: Pointer to the task_struct of user process
+ * @loglvl: Log level
+ */
+void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
+			       const char *loglvl);
 #endif /* CONFIG_ARCH_STACKWALK */
 
 #ifdef CONFIG_STACKTRACE
-- 
2.25.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] arm64: Add user stacktrace support
  2023-11-18 13:45 [PATCH] arm64: Add user stacktrace support qiwuchen55
@ 2023-11-18 20:40 ` kernel test robot
  2023-11-20 14:29 ` Mark Rutland
  1 sibling, 0 replies; 8+ messages in thread
From: kernel test robot @ 2023-11-18 20:40 UTC (permalink / raw)
  To: qiwuchen55, catalin.marinas, will
  Cc: oe-kbuild-all, linux-arm-kernel, linux-kernel, chenqiwu

Hi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on linus/master v6.7-rc1 next-20231117]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/qiwuchen55-gmail-com/arm64-Add-user-stacktrace-support/20231118-214756
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
patch link:    https://lore.kernel.org/r/20231118134504.154842-1-qiwu.chen%40transsion.com
patch subject: [PATCH] arm64: Add user stacktrace support
config: arm64-randconfig-r071-20231119 (https://download.01.org/0day-ci/archive/20231119/202311190405.QUvoF0b3-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231119/202311190405.QUvoF0b3-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311190405.QUvoF0b3-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> arch/arm64/kernel/stacktrace.c:393: warning: expecting prototype for stack_trace_save_user(). Prototype was for arch_stack_walk_user() instead


vim +393 arch/arm64/kernel/stacktrace.c

   384	
   385	/**
   386	 * stack_trace_save_user - Save user space stack traces into a storage array
   387	 * @consume_entry: Callback for save a user space stack trace
   388	 * @cookie:	Caller supplied pointer handed back by arch_stack_walk()
   389	 * @regs: The pt_regs pointer of current task
   390	 */
   391	void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
   392				  const struct pt_regs *regs)
 > 393	{
   394		struct stack_frame_user frame;
   395		struct vm_area_struct *vma;
   396		unsigned long userstack_start, userstack_end;
   397		struct task_struct *tsk = current;
   398	
   399		/* TODO: support stack unwind for compat user mode */
   400		if (!regs || !user_mode(regs) || compat_user_mode(regs))
   401			return;
   402	
   403		userstack_start = regs->user_regs.sp;
   404		vma = find_user_stack_vma(tsk, userstack_start);
   405		if (!vma)
   406			return;
   407	
   408		userstack_end = vma->vm_end;
   409		frame.fp = regs->user_regs.regs[29];
   410		frame.sp = userstack_start;
   411		frame.pc = regs->user_regs.pc;
   412	
   413		while (1) {
   414			unsigned long where = frame.pc;
   415	
   416			/* Sanity check: ABI requires pc to be aligned 4 bytes. */
   417			if (!where || where & 0x3)
   418				break;
   419			if (!consume_entry(cookie, where))
   420				break;
   421			if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
   422				break;
   423		}
   424	}
   425	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] arm64: Add user stacktrace support
  2023-11-18 13:45 [PATCH] arm64: Add user stacktrace support qiwuchen55
  2023-11-18 20:40 ` kernel test robot
@ 2023-11-20 14:29 ` Mark Rutland
  2023-11-20 17:46   ` Mark Rutland
  2023-11-21  9:03   ` chenqiwu
  1 sibling, 2 replies; 8+ messages in thread
From: Mark Rutland @ 2023-11-20 14:29 UTC (permalink / raw)
  To: qiwuchen55
  Cc: catalin.marinas, will, linux-arm-kernel, linux-kernel, chenqiwu

On Sat, Nov 18, 2023 at 09:45:04PM +0800, qiwuchen55@gmail.com wrote:
> From: chenqiwu <qiwu.chen@transsion.com>
> 
> 1. Introduce and export arch_dump_user_stacktrace() API to support
> user stacktrace dump for a user task (both current and non-current task).
> A example test about the log format of user stacktrace as shown below:
> [test-515] Dump user backtrace:
> <0xffffb0c1a750> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> <0xaaaacbf8097c> in /mnt/test[aaaacbf80000-aaaacbf81000]
> <0xffffb0b778b8> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> <0xaaaacbf80834> in /mnt/test[aaaacbf80000-aaaacbf81000]

Where is this used?

We already have user stacktracing code in arch/arm64/kernel/perf_callchain.c
which doesn't depend on this API. What does this API enable that we don't
support today?

> 2. Add arch_stack_walk_user() implementation to support userstacktrace transsionce option.

What is this 'userstacktrace transsionce option' ?

> A example test about the output format of ftrace userstacktrace as shown below:
>     bash-489     [000] .....  2167.660775: sched_process_fork: comm=bash pid=489 child_comm=bash child_pid=596
>     bash-489     [000] .....  2167.660787: <user stack trace>
>  => /lib/aarch64-linux-gnu/libc-2.32.so[+0xa76d8]
>  => /bin/bash[+0x5f354]
>  => /bin/bash[+0x4876c]
>  => /bin/bash[+0x4aec4]
>  => /bin/bash[+0x4da48]
>  => /bin/bash[+0x4b710]
>  => /bin/bash[+0x4c31c]
>  => /bin/bash[+0x339b0]
> 
> Tested-by-by: chenqiwu <qiwu.chen@transsion.com>
> Signed-off-by: chenqiwu <qiwu.chen@transsion.com>
> ---
>  arch/arm64/Kconfig             |   1 +
>  arch/arm64/kernel/stacktrace.c | 208 +++++++++++++++++++++++++++++++++
>  include/linux/stacktrace.h     |  10 ++
>  3 files changed, 219 insertions(+)

As above, we already have user stacktracing code, and we shouldn't add
*distinct* unwinders. Either that code should be factored out and reused, or
this code should replace it.

> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 7b071a004..4c5066f88 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -255,6 +255,7 @@ config ARM64
>  	select TRACE_IRQFLAGS_SUPPORT
>  	select TRACE_IRQFLAGS_NMI_SUPPORT
>  	select HAVE_SOFTIRQ_ON_OWN_STACK
> +	select USER_STACKTRACE_SUPPORT
>  	help
>  	  ARM 64-bit (AArch64) Linux support.
>  
> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> index 17f66a74c..4e7bf2922 100644
> --- a/arch/arm64/kernel/stacktrace.c
> +++ b/arch/arm64/kernel/stacktrace.c
> @@ -215,6 +215,214 @@ static bool dump_backtrace_entry(void *arg, unsigned long where)
>  	return true;
>  }
>  
> +/* The struct defined for AArch64 userspace stack frame */
> +struct stack_frame_user {
> +	unsigned long fp;
> +	unsigned long sp;
> +	unsigned long pc;
> +};
> +
> +/*
> + * The function of AArch64 userspace stack frame unwind method.
> + * Note: If the caller is not current task, it's supposed to call
> + * access_process_vm() to access another task' address space.
> + */
> +static int arch_unwind_user_frame(struct task_struct *tsk, unsigned long high,
> +				struct stack_frame_user *frame)
> +{
> +	int ret = 0;
> +	unsigned long fp = frame->fp;
> +	unsigned long low = frame->sp;
> +
> +	if (fp < low || fp > high || fp & 0xf)
> +		return -EFAULT;
> +
> +	frame->sp = fp + 0x10;

Given you always set frame->sp as fp + 0x10, why does frame->sp need to exist
at all?

Per AAPCS64, the frame record only contains a copy of the FP and LR, and is
*not* directly associated with the SP, so I don't think we should pretend it
is.

> +	/* Disable page fault to make sure get_user going on wheels */

I have no idea what this comment is trying to say.

Why exactly do you think we need to disable page faults? Isn't that going to
make this fail arbitrarily when we *can* fault pages in? I know that the
existing perf unwinder does this, but that's a design problem we'd like to
solve (e.g. by deferring the unwind until return to userspace).

> +	pagefault_disable();
> +	if (tsk == current) {
> +		if (get_user(frame->fp, (unsigned long __user *)fp) ||
> +			get_user(frame->pc, (unsigned long __user *)(fp + 8)))
> +			ret = -EFAULT;
> +	} else {
> +		if (access_process_vm(tsk, fp, &frame->fp,
> +			sizeof(unsigned long), 0) != sizeof(unsigned long) ||
> +			access_process_vm(tsk, fp + 0x08, &frame->pc,
> +			sizeof(unsigned long), 0) != sizeof(unsigned long))
> +			ret = -EFAULT;
> +	}
> +	pagefault_enable();

If task isn't current, userspace could be running and this will be racy and
unreliable.

Where is this used with task != current? Why do we need to support that case at
all?

What does this do for COMPAT tasks?

> +
> +	return ret;
> +}
> +
> +/*
> + * Print the executable address and corresponding VMA info.
> + */
> +static void print_vma_addr_info(char *prefix, struct task_struct *task,
> +				unsigned long ip, const char *loglvl)
> +{
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +
> +	if (task != current)
> +		mm = get_task_mm(task);
> +	else
> +		mm = task->mm;

Why can't we always use get_task_mm(), even for task == current?

> +
> +	if (!mm)
> +		return;
> +	/*
> +	 * we might be running from an atomic context so we cannot sleep
> +	 */
> +	if (!mmap_read_trylock(mm)) {
> +		mmput(mm);
> +		return;
> +	}

When is this called from an atomic context?

> +
> +	vma = find_vma(mm, ip);
> +	if (vma && vma->vm_file) {
> +		struct file *f = vma->vm_file;
> +		char *buf = (char *)__get_free_page(GFP_NOWAIT);
> +
> +		if (buf) {
> +			char *p;
> +
> +			p = file_path(f, buf, PAGE_SIZE);
> +			if (IS_ERR(p))
> +				p = "?";
> +			printk("%s%s%s[%lx-%lx]\n", loglvl, prefix, p,
> +					vma->vm_start,
> +					vma->vm_end);
> +			free_page((unsigned long)buf);
> +		}
> +	}
> +	mmap_read_unlock(mm);
> +	if (task != current)
> +		mmput(mm);
> +}
> +
> +static struct vm_area_struct *find_user_stack_vma(struct task_struct *task, unsigned long sp)
> +{
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +
> +	if (task != current)
> +		mm = get_task_mm(task);
> +	else
> +		mm = task->mm;
> +
> +	if (!mm)
> +		return NULL;
> +	/*
> +	 * we might be running from an atomic context so we cannot sleep
> +	 */
> +	if (!mmap_read_trylock(mm)) {
> +		mmput(mm);
> +		return NULL;
> +	}
> +	vma = find_vma(mm, sp);
> +	mmap_read_unlock(mm);
> +	if (task != current)
> +		mmput(mm);

What guarantees the VMA is safe to use after this? What ensures that it won't
be freed? What ensures that it is still valid and not subject to concurrent
modification?

> +
> +	return vma;
> +}
> +
> +static void dump_user_backtrace_entry(struct task_struct *tsk,
> +				unsigned long where, const char *loglvl)
> +{
> +	char prefix[64];
> +
> +	snprintf(prefix, sizeof(prefix), "<0x%lx> in ", where);
> +	print_vma_addr_info(prefix, tsk, where, loglvl);
> +}
> +
> +void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
> +								const char *loglvl)
> +{
> +	struct stack_frame_user frame;
> +	struct vm_area_struct *vma;
> +	unsigned long userstack_start, userstack_end;
> +
> +	if (!tsk)
> +		tsk = current;
> +
> +	/*
> +	 * If @regs is not specified or caller is not current task,.
> +	 * @regs is supposed to get from @tsk.
> +	 */
> +	if (!regs || tsk != current)
> +		regs = task_pt_regs(tsk);

The user state is *always* in task_pt_regs(tsk), even when tsk == current.

Why does this function take the regs as an argument at all?

> +
> +	/* TODO: support stack unwind for compat user mode */
> +	if (compat_user_mode(regs))
> +		return;
> +
> +	userstack_start = regs->user_regs.sp;
> +	vma = find_user_stack_vma(tsk, userstack_start);
> +	if (!vma)
> +		return;
> +
> +	userstack_end = vma->vm_end;
> +	frame.fp = regs->user_regs.regs[29];
> +	frame.sp = userstack_start;
> +	frame.pc = regs->user_regs.pc;
> +
> +	printk("%s[%s-%d] Dump user backtrace:\n", loglvl, tsk->comm, tsk->pid);
> +	while (1) {
> +		unsigned long where = frame.pc;
> +
> +		if (!where || where & 0x3)
> +			break;
> +		dump_user_backtrace_entry(tsk, where, loglvl);
> +		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
> +			break;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(arch_dump_user_stacktrace);

Where is this used from?

Why should it be exported?

> +
> +/**
> + * stack_trace_save_user - Save user space stack traces into a storage array
> + * @consume_entry: Callback for save a user space stack trace
> + * @cookie:	Caller supplied pointer handed back by arch_stack_walk()
> + * @regs: The pt_regs pointer of current task
> + */
> +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
> +			  const struct pt_regs *regs)
> +{
> +	struct stack_frame_user frame;
> +	struct vm_area_struct *vma;
> +	unsigned long userstack_start, userstack_end;
> +	struct task_struct *tsk = current;
> +
> +	/* TODO: support stack unwind for compat user mode */
> +	if (!regs || !user_mode(regs) || compat_user_mode(regs))
> +		return;
> +
> +	userstack_start = regs->user_regs.sp;
> +	vma = find_user_stack_vma(tsk, userstack_start);
> +	if (!vma)

Yet again this duplicates the code above.

If we really need this, then arch_stack_walk_user() should be the real
unwinder, and the cases above should be built atop arch_stack_walk_user().

> +		return;
> +
> +	userstack_end = vma->vm_end;
> +	frame.fp = regs->user_regs.regs[29];
> +	frame.sp = userstack_start;
> +	frame.pc = regs->user_regs.pc;
> +
> +	while (1) {
> +		unsigned long where = frame.pc;
> +
> +		/* Sanity check: ABI requires pc to be aligned 4 bytes. */
> +		if (!where || where & 0x3)
> +			break;

Why do we care whether the PC is valid?

There are plenty of other things that we could check (e.g. whether this points
to executable memory), but it seems kinda pointless to care beyond whether we
can unwind the frame.

Note that we're missing the LR anyway, so this *isn't* a reliable unwind.

Thanks,
Mark.

> +		if (!consume_entry(cookie, where))
> +			break;
> +		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
> +			break;
> +	}
> +}
> +
>  void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
>  		    const char *loglvl)
>  {
> diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
> index 97455880a..bc5a7bf56 100644
> --- a/include/linux/stacktrace.h
> +++ b/include/linux/stacktrace.h
> @@ -60,6 +60,16 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie,
>  
>  void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
>  			  const struct pt_regs *regs);
> +
> +/**
> + * arch_dump_user_stacktrace - Architecture specific function to dump the
> + *			       stack trace for user process
> + * @regs: Pointer to the pt_regs of user process
> + * @tsk: Pointer to the task_struct of user process
> + * @loglvl: Log level
> + */
> +void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
> +			       const char *loglvl);
>  #endif /* CONFIG_ARCH_STACKWALK */
>  
>  #ifdef CONFIG_STACKTRACE
> -- 
> 2.25.1
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] arm64: Add user stacktrace support
  2023-11-20 14:29 ` Mark Rutland
@ 2023-11-20 17:46   ` Mark Rutland
  2023-11-21  9:03   ` chenqiwu
  1 sibling, 0 replies; 8+ messages in thread
From: Mark Rutland @ 2023-11-20 17:46 UTC (permalink / raw)
  To: qiwuchen55
  Cc: catalin.marinas, will, linux-arm-kernel, linux-kernel, chenqiwu

On Mon, Nov 20, 2023 at 02:29:23PM +0000, Mark Rutland wrote:
> On Sat, Nov 18, 2023 at 09:45:04PM +0800, qiwuchen55@gmail.com wrote:
> > From: chenqiwu <qiwu.chen@transsion.com>

> > 2. Add arch_stack_walk_user() implementation to support userstacktrace transsionce option.
> 
> What is this 'userstacktrace transsionce option' ?

Sorry, I evidently corrupted the mail at my end; "transsionce" was "trace" in
the original mail, which matches the ftrace output below.

Please ignore this specific comment; my other concerns still apply.

> > A example test about the output format of ftrace userstacktrace as shown below:
> >     bash-489     [000] .....  2167.660775: sched_process_fork: comm=bash pid=489 child_comm=bash child_pid=596
> >     bash-489     [000] .....  2167.660787: <user stack trace>
> >  => /lib/aarch64-linux-gnu/libc-2.32.so[+0xa76d8]
> >  => /bin/bash[+0x5f354]
> >  => /bin/bash[+0x4876c]
> >  => /bin/bash[+0x4aec4]
> >  => /bin/bash[+0x4da48]
> >  => /bin/bash[+0x4b710]
> >  => /bin/bash[+0x4c31c]
> >  => /bin/bash[+0x339b0]

Thanks,
Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] arm64: Add user stacktrace support
  2023-11-20 14:29 ` Mark Rutland
  2023-11-20 17:46   ` Mark Rutland
@ 2023-11-21  9:03   ` chenqiwu
  2023-11-21 14:05     ` Mark Rutland
  1 sibling, 1 reply; 8+ messages in thread
From: chenqiwu @ 2023-11-21  9:03 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, kaleshsingh, ardb, mhiramat,
	linux-arm-kernel

On Mon, Nov 20, 2023 at 02:29:17PM +0000, Mark Rutland wrote:
> On Sat, Nov 18, 2023 at 09:45:04PM +0800, qiwuchen55@gmail.com wrote:
> > From: chenqiwu <qiwu.chen@transsion.com>
> > 
> > 1. Introduce and export arch_dump_user_stacktrace() API to support
> > user stacktrace dump for a user task (both current and non-current task).
> > A example test about the log format of user stacktrace as shown below:
> > [test-515] Dump user backtrace:
> > <0xffffb0c1a750> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > <0xaaaacbf8097c> in /mnt/test[aaaacbf80000-aaaacbf81000]
> > <0xffffb0b778b8> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > <0xaaaacbf80834> in /mnt/test[aaaacbf80000-aaaacbf81000]
> 
> Where is this used?
> 
It's used in kernel space for some cases that need a user backtrace for debugging,
such as:  https://lkml.org/lkml/2023/11/9/1365
> We already have user stacktracing code in arch/arm64/kernel/perf_callchain.c
> which doesn't depend on this API. What does this API enable that we don't
> support today?
> 
Sorry, I indeed overlooked this case, but it seems to only work for the perf call chain
when CONFIG_PERF_EVENTS is enabled, so it is not generally usable.
The intent here is to introduce a common API for dumping a user backtrace in kernel space.

> > 2. Add arch_stack_walk_user() implementation to support userstacktrace transsionce option.
> 
> What is this 'userstacktrace transsionce option' ?
> 
> > A example test about the output format of ftrace userstacktrace as shown below:
> >     bash-489     [000] .....  2167.660775: sched_process_fork: comm=bash pid=489 child_comm=bash child_pid=596
> >     bash-489     [000] .....  2167.660787: <user stack trace>
> >  => /lib/aarch64-linux-gnu/libc-2.32.so[+0xa76d8]
> >  => /bin/bash[+0x5f354]
> >  => /bin/bash[+0x4876c]
> >  => /bin/bash[+0x4aec4]
> >  => /bin/bash[+0x4da48]
> >  => /bin/bash[+0x4b710]
> >  => /bin/bash[+0x4c31c]
> >  => /bin/bash[+0x339b0]
> > 
> > Tested-by-by: chenqiwu <qiwu.chen@transsion.com>
> > Signed-off-by: chenqiwu <qiwu.chen@transsion.com>
> > ---
> >  arch/arm64/Kconfig             |   1 +
> >  arch/arm64/kernel/stacktrace.c | 208 +++++++++++++++++++++++++++++++++
> >  include/linux/stacktrace.h     |  10 ++
> >  3 files changed, 219 insertions(+)
> 
> As above, we already have user stacktracing code, and we shouldn't add
> *distinct* unwinders. Either that code should be factored out and reused, or
> this code should replace it.
> 
Currently, the ARM64 platform does not support the ftrace userstacktrace feature:
since CONFIG_USER_STACKTRACE_SUPPORT is not enabled, the following call chain cannot be reached:
ftrace_trace_userstack -> stack_trace_save_user -> arch_stack_walk_user

> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 7b071a004..4c5066f88 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -255,6 +255,7 @@ config ARM64
> >  	select TRACE_IRQFLAGS_SUPPORT
> >  	select TRACE_IRQFLAGS_NMI_SUPPORT
> >  	select HAVE_SOFTIRQ_ON_OWN_STACK
> > +	select USER_STACKTRACE_SUPPORT
> >  	help
> >  	  ARM 64-bit (AArch64) Linux support.
> >  
> > diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> > index 17f66a74c..4e7bf2922 100644
> > --- a/arch/arm64/kernel/stacktrace.c
> > +++ b/arch/arm64/kernel/stacktrace.c
> > @@ -215,6 +215,214 @@ static bool dump_backtrace_entry(void *arg, unsigned long where)
> >  	return true;
> >  }
> >  
> > +/* The struct defined for AArch64 userspace stack frame */
> > +struct stack_frame_user {
> > +	unsigned long fp;
> > +	unsigned long sp;
> > +	unsigned long pc;
> > +};
> > +
> > +/*
> > + * The function of AArch64 userspace stack frame unwind method.
> > + * Note: If the caller is not current task, it's supposed to call
> > + * access_process_vm() to access another task' address space.
> > + */
> > +static int arch_unwind_user_frame(struct task_struct *tsk, unsigned long high,
> > +				struct stack_frame_user *frame)
> > +{
> > +	int ret = 0;
> > +	unsigned long fp = frame->fp;
> > +	unsigned long low = frame->sp;
> > +
> > +	if (fp < low || fp > high || fp & 0xf)
> > +		return -EFAULT;
> > +
> > +	frame->sp = fp + 0x10;
> 
> Given you always set frame->sp as fp + 0x10, why does frame->sp need to exist
> at all?
> 
frame->sp refers to the bottom of the stack VMA, and it is increased by at least 0x10 on every
entry of arch_unwind_user_frame. A valid frame->fp is supposed to be between the upper and lower
limits of the task's stack VMA.

> Per AAPCS64, the frame record only conatins a copy of the FP and LR, and is
> *not* directly associated with the SP, so I don't think we should pretend it
> is.
> 
> > +	/* Disable page fault to make sure get_user going on wheels */
> 
> I have no idea what this comment is trying to say.
> 
> Why exactly you you think we need to disable page faults? Isn't that going to
> make this fail arbitrarily when we *can* fault pages in? I know that the
> existing perf unwinder does this, but that's a design problem we'd like to
> solve (e.g. by deferring the unwind until return to userspace).
> 
Hmm, I followed the design of the existing user unwinder. User access methods will not sleep
when called with page faults disabled via pagefault_disable(), so get_user() can be used in atomic context.

> > +	pagefault_disable();
> > +	if (tsk == current) {
> > +		if (get_user(frame->fp, (unsigned long __user *)fp) ||
> > +			get_user(frame->pc, (unsigned long __user *)(fp + 8)))
> > +			ret = -EFAULT;
> > +	} else {
> > +		if (access_process_vm(tsk, fp, &frame->fp,
> > +			sizeof(unsigned long), 0) != sizeof(unsigned long) ||
> > +			access_process_vm(tsk, fp + 0x08, &frame->pc,
> > +			sizeof(unsigned long), 0) != sizeof(unsigned long))
> > +			ret = -EFAULT;
> > +	}
> > +	pagefault_enable();
> 
> If task isn't current, userspace could be running and this will be racy and
> unreliable.
> 
> Where is this used with task != current? Why do we need to support that case at
> all?
> 
It's my idea to support the case for callers who really want to dump another task's backtrace.
I'm not sure about the raciness and unreliability, since access_process_vm() calls get_task_mm()
and mmap_read_lock() to pin the task's address space.
We can see that access_process_vm() is safely used in the get_cmdline() case.

> What does this do for COMPAT tasks?
>
This patch does not cover the COMPAT task unwinder yet.

> > +
> > +	return ret;
> > +}
> > +
> > +/*
> > + * Print the executable address and corresponding VMA info.
> > + */
> > +static void print_vma_addr_info(char *prefix, struct task_struct *task,
> > +				unsigned long ip, const char *loglvl)
> > +{
> > +	struct mm_struct *mm;
> > +	struct vm_area_struct *vma;
> > +
> > +	if (task != current)
> > +		mm = get_task_mm(task);
> > +	else
> > +		mm = task->mm;
> 
> Why can't we always use get_task_mm(), even for task == current?
> 
get_task_mm() increases the task's mm_users count, which refers to the number of users accessing
the mm, in order to pin the task's mm.
I think it's meaningless to use get_task_mm() for task == current, since the task will not decrease
its mm_users count before calling do_exit() -> exit_mm().

> > +
> > +	if (!mm)
> > +		return;
> > +	/*
> > +	 * we might be running from an atomic context so we cannot sleep
> > +	 */
> > +	if (!mmap_read_trylock(mm)) {
> > +		mmput(mm);
> > +		return;
> > +	}
> 
> When is this called from an atomic context?
> 
Users may call it from an atomic context such as interrupt context.

> > +
> > +	vma = find_vma(mm, ip);
> > +	if (vma && vma->vm_file) {
> > +		struct file *f = vma->vm_file;
> > +		char *buf = (char *)__get_free_page(GFP_NOWAIT);
> > +
> > +		if (buf) {
> > +			char *p;
> > +
> > +			p = file_path(f, buf, PAGE_SIZE);
> > +			if (IS_ERR(p))
> > +				p = "?";
> > +			printk("%s%s%s[%lx-%lx]\n", loglvl, prefix, p,
> > +					vma->vm_start,
> > +					vma->vm_end);
> > +			free_page((unsigned long)buf);
> > +		}
> > +	}
> > +	mmap_read_unlock(mm);
> > +	if (task != current)
> > +		mmput(mm);
> > +}
> > +
> > +static struct vm_area_struct *find_user_stack_vma(struct task_struct *task, unsigned long sp)
> > +{
> > +	struct mm_struct *mm;
> > +	struct vm_area_struct *vma;
> > +
> > +	if (task != current)
> > +		mm = get_task_mm(task);
> > +	else
> > +		mm = task->mm;
> > +
> > +	if (!mm)
> > +		return NULL;
> > +	/*
> > +	 * we might be running from an atomic context so we cannot sleep
> > +	 */
> > +	if (!mmap_read_trylock(mm)) {
> > +		mmput(mm);
> > +		return NULL;
> > +	}
> > +	vma = find_vma(mm, sp);
> > +	mmap_read_unlock(mm);
> > +	if (task != current)
> > +		mmput(mm);
> 
> What guarantees the VMA is safe to use after this? What ensures that it won't
> be freed? What ensures that it is still valid and not subject to concurrent
> modification?
> 
get_task_mm() and mmap_read_trylock() ensure the VMA is safe to use; the implementation of these
two APIs follows print_vma_addr() in mm/memory.c.

> > +
> > +	return vma;
> > +}
> > +
> > +static void dump_user_backtrace_entry(struct task_struct *tsk,
> > +				unsigned long where, const char *loglvl)
> > +{
> > +	char prefix[64];
> > +
> > +	snprintf(prefix, sizeof(prefix), "<0x%lx> in ", where);
> > +	print_vma_addr_info(prefix, tsk, where, loglvl);
> > +}
> > +
> > +void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
> > +								const char *loglvl)
> > +{
> > +	struct stack_frame_user frame;
> > +	struct vm_area_struct *vma;
> > +	unsigned long userstack_start, userstack_end;
> > +
> > +	if (!tsk)
> > +		tsk = current;
> > +
> > +	/*
> > +	 * If @regs is not specified or caller is not current task,.
> > +	 * @regs is supposed to get from @tsk.
> > +	 */
> > +	if (!regs || tsk != current)
> > +		regs = task_pt_regs(tsk);
> 
> The user state is *always* in task_pt_regs(tsk), even when tsk == current.
> 
> Why does this function take the regs as an argument at all?
> 
The API exposes the two arguments (regs and tsk) to the caller, so we must validate them
to avoid the caller passing unreasonable arguments.

> > +
> > +	/* TODO: support stack unwind for compat user mode */
> > +	if (compat_user_mode(regs))
> > +		return;
> > +
> > +	userstack_start = regs->user_regs.sp;
> > +	vma = find_user_stack_vma(tsk, userstack_start);
> > +	if (!vma)
> > +		return;
> > +
> > +	userstack_end = vma->vm_end;
> > +	frame.fp = regs->user_regs.regs[29];
> > +	frame.sp = userstack_start;
> > +	frame.pc = regs->user_regs.pc;
> > +
> > +	printk("%s[%s-%d] Dump user backtrace:\n", loglvl, tsk->comm, tsk->pid);
> > +	while (1) {
> > +		unsigned long where = frame.pc;
> > +
> > +		if (!where || where & 0x3)
> > +			break;
> > +		dump_user_backtrace_entry(tsk, where, loglvl);
> > +		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
> > +			break;
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(arch_dump_user_stacktrace);
> 
> Where is this used from?
> 
> Why should it be exported?
> 
As replied above, the API is supposed to be used from kernel space, e.g. by kernel modules.

> > +
> > +/**
> > + * stack_trace_save_user - Save user space stack traces into a storage array
> > + * @consume_entry: Callback for save a user space stack trace
> > + * @cookie:	Caller supplied pointer handed back by arch_stack_walk()
> > + * @regs: The pt_regs pointer of current task
> > + */
> > +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
> > +			  const struct pt_regs *regs)
> > +{
> > +	struct stack_frame_user frame;
> > +	struct vm_area_struct *vma;
> > +	unsigned long userstack_start, userstack_end;
> > +	struct task_struct *tsk = current;
> > +
> > +	/* TODO: support stack unwind for compat user mode */
> > +	if (!regs || !user_mode(regs) || compat_user_mode(regs))
> > +		return;
> > +
> > +	userstack_start = regs->user_regs.sp;
> > +	vma = find_user_stack_vma(tsk, userstack_start);
> > +	if (!vma)
> 
> Yet again this duplicates the code above.
> 
> If we really need this, then arch_stack_walk_user() should be the real
> unwinder, and the caes above should be built atop arch_stack_walk_user().
> 
> > +		return;
> > +
> > +	userstack_end = vma->vm_end;
> > +	frame.fp = regs->user_regs.regs[29];
> > +	frame.sp = userstack_start;
> > +	frame.pc = regs->user_regs.pc;
> > +
> > +	while (1) {
> > +		unsigned long where = frame.pc;
> > +
> > +		/* Sanity check: ABI requires pc to be aligned 4 bytes. */
> > +		if (!where || where & 0x3)
> > +			break;
> 
> Why do we care whether the PC is valid?
> 
If the pc is invalid, it's meaningless to continue the whole unwind, so just stop unwinding.

> There are plenty of other things that we could check (e.g. whether this points
> to executable memory), but it seems kinda pointless to care beyond whether we
> can unwind the frame.
> 
> Note that we're missing the LR anyway, so this *isn't* a reliable unwind.
> 
The frame.pc is used to record the LR; sanity checks on the lr and fp make the unwind reliable,
since we can then safely use print_vma_addr_info() to translate the LR address to its VMA.

> Thanks,
> Mark.
> 
> > +		if (!consume_entry(cookie, where))
> > +			break;
> > +		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
> > +			break;
> > +	}
> > +}
> > +
> >  void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
> >  		    const char *loglvl)
> >  {
> > diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
> > index 97455880a..bc5a7bf56 100644
> > --- a/include/linux/stacktrace.h
> > +++ b/include/linux/stacktrace.h
> > @@ -60,6 +60,16 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie,
> >  
> >  void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
> >  			  const struct pt_regs *regs);
> > +
> > +/**
> > + * arch_dump_user_stacktrace - Architecture specific function to dump the
> > + *			       stack trace for user process
> > + * @regs: Pointer to the pt_regs of user process
> > + * @tsk: Pointer to the task_struct of user process
> > + * @loglvl: Log level
> > + */
> > +void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
> > +			       const char *loglvl);
> >  #endif /* CONFIG_ARCH_STACKWALK */
> >  
> >  #ifdef CONFIG_STACKTRACE
> > -- 
> > 2.25.1
> > 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] arm64: Add user stacktrace support
  2023-11-21  9:03   ` chenqiwu
@ 2023-11-21 14:05     ` Mark Rutland
  2023-11-24  9:55       ` chenqiwu
  0 siblings, 1 reply; 8+ messages in thread
From: Mark Rutland @ 2023-11-21 14:05 UTC (permalink / raw)
  To: chenqiwu
  Cc: catalin.marinas, will, kaleshsingh, ardb, mhiramat,
	linux-arm-kernel

On Tue, Nov 21, 2023 at 05:03:00PM +0800, chenqiwu wrote:
> On Mon, Nov 20, 2023 at 02:29:17PM +0000, Mark Rutland wrote:
> > On Sat, Nov 18, 2023 at 09:45:04PM +0800, qiwuchen55@gmail.com wrote:
> > > From: chenqiwu <qiwu.chen@transsion.com>
> > > 
> > > 1. Introduce and export arch_dump_user_stacktrace() API to support
> > > user stacktrace dump for a user task (both current and non-current task).
> > > A example test about the log format of user stacktrace as shown below:
> > > [test-515] Dump user backtrace:
> > > <0xffffb0c1a750> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > > <0xaaaacbf8097c> in /mnt/test[aaaacbf80000-aaaacbf81000]
> > > <0xffffb0b778b8> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > > <0xaaaacbf80834> in /mnt/test[aaaacbf80000-aaaacbf81000]
> > 
> > Where is this used?
> > 
> It's used in kernel space for some case need user backtrace for debugging,
> such as:  https://lkml.org/lkml/2023/11/9/1365

That's not in mainline.

> > We already have user stacktracing code in arch/arm64/kernel/perf_callchain.c
> > which doesn't depend on this API. What does this API enable that we don't
> > support today?
> > 
> Sorry, I indeed ignored this case, but it seems only work for perf syscall chain in
> case of CONFIG_PERF_EVENTS enabled without universality.
> It's supposed to introduce a common API for dumping user backtrace in kernel space.

I understand that; I'm saying that you should *modify* this code to handle both
cases.

> > > 2. Add arch_stack_walk_user() implementation to support userstacktrace transsionce option.
> > 
> > What is this 'userstacktrace transsionce option' ?
> > 
> > > A example test about the output format of ftrace userstacktrace as shown below:
> > >     bash-489     [000] .....  2167.660775: sched_process_fork: comm=bash pid=489 child_comm=bash child_pid=596
> > >     bash-489     [000] .....  2167.660787: <user stack trace>
> > >  => /lib/aarch64-linux-gnu/libc-2.32.so[+0xa76d8]
> > >  => /bin/bash[+0x5f354]
> > >  => /bin/bash[+0x4876c]
> > >  => /bin/bash[+0x4aec4]
> > >  => /bin/bash[+0x4da48]
> > >  => /bin/bash[+0x4b710]
> > >  => /bin/bash[+0x4c31c]
> > >  => /bin/bash[+0x339b0]
> > > 
> > > Tested-by-by: chenqiwu <qiwu.chen@transsion.com>
> > > Signed-off-by: chenqiwu <qiwu.chen@transsion.com>
> > > ---
> > >  arch/arm64/Kconfig             |   1 +
> > >  arch/arm64/kernel/stacktrace.c | 208 +++++++++++++++++++++++++++++++++
> > >  include/linux/stacktrace.h     |  10 ++
> > >  3 files changed, 219 insertions(+)
> > 
> > As above, we already have user stacktracing code, and we shouldn't add
> > *distinct* unwinders. Either that code should be factored out and reused, or
> > this code should replace it.
> > 
> Currently, ARM64 platform is not supported for ftrace userstacktrace profile feature,
> since CONFIG_USER_STACKTRACE_SUPPORT is not enabled, the call chain cannot be accessed:
> ftrace_trace_userstack -> stack_trace_save_user -> arch_stack_walk_user

As above, please reuse the existing code, e.g. take the existing logic in
perf_callchain_user() and use it to implement arch_stack_walk_user(), and call
that from perf_callchain_user() and stack_trace_save_user().

> > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > index 7b071a004..4c5066f88 100644
> > > --- a/arch/arm64/Kconfig
> > > +++ b/arch/arm64/Kconfig
> > > @@ -255,6 +255,7 @@ config ARM64
> > >  	select TRACE_IRQFLAGS_SUPPORT
> > >  	select TRACE_IRQFLAGS_NMI_SUPPORT
> > >  	select HAVE_SOFTIRQ_ON_OWN_STACK
> > > +	select USER_STACKTRACE_SUPPORT
> > >  	help
> > >  	  ARM 64-bit (AArch64) Linux support.
> > >  
> > > diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> > > index 17f66a74c..4e7bf2922 100644
> > > --- a/arch/arm64/kernel/stacktrace.c
> > > +++ b/arch/arm64/kernel/stacktrace.c
> > > @@ -215,6 +215,214 @@ static bool dump_backtrace_entry(void *arg, unsigned long where)
> > >  	return true;
> > >  }
> > >  
> > > +/* The struct defined for AArch64 userspace stack frame */
> > > +struct stack_frame_user {
> > > +	unsigned long fp;
> > > +	unsigned long sp;
> > > +	unsigned long pc;
> > > +};
> > > +
> > > +/*
> > > + * The function of AArch64 userspace stack frame unwind method.
> > > + * Note: If the caller is not current task, it's supposed to call
> > > + * access_process_vm() to access another task' address space.
> > > + */
> > > +static int arch_unwind_user_frame(struct task_struct *tsk, unsigned long high,
> > > +				struct stack_frame_user *frame)
> > > +{
> > > +	int ret = 0;
> > > +	unsigned long fp = frame->fp;
> > > +	unsigned long low = frame->sp;
> > > +
> > > +	if (fp < low || fp > high || fp & 0xf)
> > > +		return -EFAULT;
> > > +
> > > +	frame->sp = fp + 0x10;
> > 
> > Given you always set frame->sp as fp + 0x10, why does frame->sp need to exist
> > at all?
> > 
> frame->sp refer to the bottom of stack VMA, which increased at least 0x10 on every entry of
> arch_unwind_user_frame. A vaild frame->fp is suppoed to between upper and lower limit of
> task stack VMA.

What you're calling the 'sp' is the top of the last frame record. It has
nothing to do with the SP, and has nothing to do with any VMA.

You don't need a separate field for this.

> 
> > Per AAPCS64, the frame record only conatins a copy of the FP and LR, and is
> > *not* directly associated with the SP, so I don't think we should pretend it
> > is.
> > 
> > > +	/* Disable page fault to make sure get_user going on wheels */
> > 
> > I have no idea what this comment is trying to say.
> > 
> > Why exactly you you think we need to disable page faults? Isn't that going to
> > make this fail arbitrarily when we *can* fault pages in? I know that the
> > existing perf unwinder does this, but that's a design problem we'd like to
> > solve (e.g. by deferring the unwind until return to userspace).
> > 
> Hhmm, I refer to the design of existing user unwinder. User access methods will not sleep
> when called from a pagefault_disabled(), so the get_user can be acceed in atomic context.

I think the comment should clearly say that, then.

I was confused by "going on wheels", which doesn't mean anything in particular.

> 
> > > +	pagefault_disable();
> > > +	if (tsk == current) {
> > > +		if (get_user(frame->fp, (unsigned long __user *)fp) ||
> > > +			get_user(frame->pc, (unsigned long __user *)(fp + 8)))
> > > +			ret = -EFAULT;
> > > +	} else {
> > > +		if (access_process_vm(tsk, fp, &frame->fp,
> > > +			sizeof(unsigned long), 0) != sizeof(unsigned long) ||
> > > +			access_process_vm(tsk, fp + 0x08, &frame->pc,
> > > +			sizeof(unsigned long), 0) != sizeof(unsigned long))
> > > +			ret = -EFAULT;
> > > +	}
> > > +	pagefault_enable();
> > 
> > If task isn't current, userspace could be running and this will be racy and
> > unreliable.
> > 
> > Where is this used with task != current? Why do we need to support that case at
> > all?
> > 
> It's my idea to support the case for caller who really want to dump another task's backtrace.

Are there *any* existing users in the kernel tree?

If not, please just delete this.

> Not sure the racy and unreliablity since access_process_vm call get_task_mm and mmap_read_lock
> to pin the task's address space.

If task != current, then it can be concurrently executing. In that case, its
regs are stale and the task can concurrently modify its stack. Pinning the
memory doesn't prevent that.

> We can see access_process_vm is safely used in get_cmdline() case.
> 
> > What does this do for COMPAT tasks?
> >
> This patch is not covered COMPAT task unwinder yet.

Supporting COMPAT is necessary for this to be merged.

> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +/*
> > > + * Print the executable address and corresponding VMA info.
> > > + */
> > > +static void print_vma_addr_info(char *prefix, struct task_struct *task,
> > > +				unsigned long ip, const char *loglvl)
> > > +{
> > > +	struct mm_struct *mm;
> > > +	struct vm_area_struct *vma;
> > > +
> > > +	if (task != current)
> > > +		mm = get_task_mm(task);
> > > +	else
> > > +		mm = task->mm;
> > 
> > Why can't we always use get_task_mm(), even for task == current?
> > 
> get_task_mm increase the task's mm_users which refers to the number of users who access the mm
> in order to pin the task's mm.
> I think it's meaningless to use get_task_mm for task == current since task will not decrease
> its mm_users before calling do_exit -> exit_mm.

So? It's *safe* to use get_task_mm() and mmput() regardless, and it's far
simpler than conditional logic.

Please do not treat current as a special case. Always use the same logic to get/put the mm.

> > > +
> > > +	if (!mm)
> > > +		return;
> > > +	/*
> > > +	 * we might be running from an atomic context so we cannot sleep
> > > +	 */
> > > +	if (!mmap_read_trylock(mm)) {
> > > +		mmput(mm);
> > > +		return;
> > > +	}
> > 
> > When is this called from an atomic context?
> > 
> User who call it in an atomic context such as interrupt context.

Yes, but *which* users?

e.g. does ftrace end up calling this in atomic context, because we might trace
functions called in an IRQ handler?

> > > +
> > > +	vma = find_vma(mm, ip);
> > > +	if (vma && vma->vm_file) {
> > > +		struct file *f = vma->vm_file;
> > > +		char *buf = (char *)__get_free_page(GFP_NOWAIT);
> > > +
> > > +		if (buf) {
> > > +			char *p;
> > > +
> > > +			p = file_path(f, buf, PAGE_SIZE);
> > > +			if (IS_ERR(p))
> > > +				p = "?";
> > > +			printk("%s%s%s[%lx-%lx]\n", loglvl, prefix, p,
> > > +					vma->vm_start,
> > > +					vma->vm_end);
> > > +			free_page((unsigned long)buf);
> > > +		}
> > > +	}
> > > +	mmap_read_unlock(mm);
> > > +	if (task != current)
> > > +		mmput(mm);
> > > +}
> > > +
> > > +static struct vm_area_struct *find_user_stack_vma(struct task_struct *task, unsigned long sp)
> > > +{
> > > +	struct mm_struct *mm;
> > > +	struct vm_area_struct *vma;
> > > +
> > > +	if (task != current)
> > > +		mm = get_task_mm(task);
> > > +	else
> > > +		mm = task->mm;
> > > +
> > > +	if (!mm)
> > > +		return NULL;
> > > +	/*
> > > +	 * we might be running from an atomic context so we cannot sleep
> > > +	 */
> > > +	if (!mmap_read_trylock(mm)) {
> > > +		mmput(mm);
> > > +		return NULL;
> > > +	}
> > > +	vma = find_vma(mm, sp);
> > > +	mmap_read_unlock(mm);
> > > +	if (task != current)
> > > +		mmput(mm);
> > 
> > What guarantees the VMA is safe to use after this? What ensures that it won't
> > be freed? What ensures that it is still valid and not subject to concurrent
> > modification?
> > 
> get_task_mm and mmap_read_trylock will ensure the VMA safe to use, the reliazation of two APIs refer
> to print_vma_addr() in mm/memory.c.

We've just called:

	mmap_read_unlock(mm);
	...
	mmput(mm);

What ensures that the dangling reference to the VMA is still safe?

AFAICT, nothing does.

> > > +
> > > +	return vma;
> > > +}
> > > +
> > > +static void dump_user_backtrace_entry(struct task_struct *tsk,
> > > +				unsigned long where, const char *loglvl)
> > > +{
> > > +	char prefix[64];
> > > +
> > > +	snprintf(prefix, sizeof(prefix), "<0x%lx> in ", where);
> > > +	print_vma_addr_info(prefix, tsk, where, loglvl);
> > > +}
> > > +
> > > +void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
> > > +								const char *loglvl)
> > > +{
> > > +	struct stack_frame_user frame;
> > > +	struct vm_area_struct *vma;
> > > +	unsigned long userstack_start, userstack_end;
> > > +
> > > +	if (!tsk)
> > > +		tsk = current;
> > > +
> > > +	/*
> > > +	 * If @regs is not specified or caller is not current task,.
> > > +	 * @regs is supposed to get from @tsk.
> > > +	 */
> > > +	if (!regs || tsk != current)
> > > +		regs = task_pt_regs(tsk);
> > 
> > The user state is *always* in task_pt_regs(tsk), even when tsk == current.
> > 
> > Why does this function take the regs as an argument at all?
> > 
> The API export the two argument(regs and tsk) for caller, we must make legitimacy judgments
> to aviod the caller passed unreasonable arguments.

My point is that you can avoid that *by construction*.

If this was:

	void arch_dump_user_stacktrace(struct task_struct *tsk, const char *loglvl)

... then it's *impossible* for callers to pass incorrect arguments.

That said, there is no in-tree user for this function.

Please delete this function.

> > > +
> > > +	/* TODO: support stack unwind for compat user mode */
> > > +	if (compat_user_mode(regs))
> > > +		return;
> > > +
> > > +	userstack_start = regs->user_regs.sp;
> > > +	vma = find_user_stack_vma(tsk, userstack_start);
> > > +	if (!vma)
> > > +		return;
> > > +
> > > +	userstack_end = vma->vm_end;
> > > +	frame.fp = regs->user_regs.regs[29];
> > > +	frame.sp = userstack_start;
> > > +	frame.pc = regs->user_regs.pc;
> > > +
> > > +	printk("%s[%s-%d] Dump user backtrace:\n", loglvl, tsk->comm, tsk->pid);
> > > +	while (1) {
> > > +		unsigned long where = frame.pc;
> > > +
> > > +		if (!where || where & 0x3)
> > > +			break;
> > > +		dump_user_backtrace_entry(tsk, where, loglvl);
> > > +		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
> > > +			break;
> > > +	}
> > > +}
> > > +EXPORT_SYMBOL_GPL(arch_dump_user_stacktrace);
> > 
> > Where is this used from?
> > 
> > Why should it be exported?
> > 
> As replied at front, the API is supposed to be used by kernel space such as kernel modules.

This should only be added if there is an in-tree user. There are no existing
users and none are added by this series.

Please delete this function.

NAK to adding this.

> > > +
> > > +/**
> > > + * stack_trace_save_user - Save user space stack traces into a storage array
> > > + * @consume_entry: Callback for save a user space stack trace
> > > + * @cookie:	Caller supplied pointer handed back by arch_stack_walk()
> > > + * @regs: The pt_regs pointer of current task
> > > + */
> > > +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
> > > +			  const struct pt_regs *regs)
> > > +{
> > > +	struct stack_frame_user frame;
> > > +	struct vm_area_struct *vma;
> > > +	unsigned long userstack_start, userstack_end;
> > > +	struct task_struct *tsk = current;
> > > +
> > > +	/* TODO: support stack unwind for compat user mode */
> > > +	if (!regs || !user_mode(regs) || compat_user_mode(regs))
> > > +		return;
> > > +
> > > +	userstack_start = regs->user_regs.sp;
> > > +	vma = find_user_stack_vma(tsk, userstack_start);
> > > +	if (!vma)
> > 
> > Yet again this duplicates the code above.
> > 
> > If we really need this, then arch_stack_walk_user() should be the real
> > unwinder, and the caes above should be built atop arch_stack_walk_user().
> > 
> > > +		return;
> > > +
> > > +	userstack_end = vma->vm_end;
> > > +	frame.fp = regs->user_regs.regs[29];
> > > +	frame.sp = userstack_start;
> > > +	frame.pc = regs->user_regs.pc;
> > > +
> > > +	while (1) {
> > > +		unsigned long where = frame.pc;
> > > +
> > > +		/* Sanity check: ABI requires pc to be aligned 4 bytes. */
> > > +		if (!where || where & 0x3)
> > > +			break;
> > 
> > Why do we care whether the PC is valid?
> > 
> If pc is invaild, it's meaningless to unwind whole unwind, just skip unwinding.

Why should we enforce that policy?

If the PC is 0, or is in a non-executable page, it's equally meaningless but we
don't check that.

What if someone is deliberately using a non-canonical PC to force a prefetch
abort, which they'll handle later (e.g. JITs might do that)?

> > There are plenty of other things that we could check (e.g. whether this points
> > to executable memory), but it seems kinda pointless to care beyond whether we
> > can unwind the frame.
> > 
> > Note that we're missing the LR anyway, so this *isn't* a reliable unwind.
> > 
> The frame.pc is used to record LR, sanity check for lr and fp will make a reliable unwind
> since we can safely use print_vma_addr_info() to transfer LR addr to VMA.

I'm saying you're missing the regs->user_regs.lr, and since you cannot know
whether that LR is valid, the unwind is unreliable. In the absence of metadata
from the compiler, you cannot handle that reliably.

The existing perf unwinder has the same issue, and we leave it to userspace to
recover the LR via DWARF if necessary, which isn't an option here.

Thanks,
Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] arm64: Add user stacktrace support
  2023-11-21 14:05     ` Mark Rutland
@ 2023-11-24  9:55       ` chenqiwu
  2023-11-24 10:12         ` Mark Rutland
  0 siblings, 1 reply; 8+ messages in thread
From: chenqiwu @ 2023-11-24  9:55 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, kaleshsingh, ardb, mhiramat,
	linux-arm-kernel

On Tue, Nov 21, 2023 at 02:05:29PM +0000, Mark Rutland wrote:
> On Tue, Nov 21, 2023 at 05:03:00PM +0800, chenqiwu wrote:
> > On Mon, Nov 20, 2023 at 02:29:17PM +0000, Mark Rutland wrote:
> > > On Sat, Nov 18, 2023 at 09:45:04PM +0800, qiwuchen55@gmail.com wrote:
> > > > From: chenqiwu <qiwu.chen@transsion.com>
> > > > 
> > > > 1. Introduce and export arch_dump_user_stacktrace() API to support
> > > > user stacktrace dump for a user task (both current and non-current task).
> > > > A example test about the log format of user stacktrace as shown below:
> > > > [test-515] Dump user backtrace:
> > > > <0xffffb0c1a750> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > > > <0xaaaacbf8097c> in /mnt/test[aaaacbf80000-aaaacbf81000]
> > > > <0xffffb0b778b8> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > > > <0xaaaacbf80834> in /mnt/test[aaaacbf80000-aaaacbf81000]
> > > 
> > > Where is this used?
> > > 
> > It's used in kernel space for some case need user backtrace for debugging,
> > such as:  https://lkml.org/lkml/2023/11/9/1365
> 
> That's not in mainline.
>
I know, you may misslead my intention for introducing the arch_dump_user_stacktrace API.
For this case, we only want to dump any user regs and backtrace info in the kernel panic trace
on global init exit. If the arch_dump_user_stacktrace API can be merged into the kernel tree
first, we can use the API directly to dump the user backtrace on global init exit, which will
be helpful for finding the reason for the global init exit.
BTW, I think there are two significant places for the usage of arch_dump_user_stacktrace:
1) we can dump any important user thread backtrace without log ratelimit in arm64_show_signal(),
e.g: global_init thread
+++ b/arch/arm64/kernel/traps.c
@@ -247,12 +247,14 @@ static void arm64_show_signal(int signo, const char *str)
        unsigned long esr = tsk->thread.fault_code;
        struct pt_regs *regs = task_pt_regs(tsk);

+       if (unlikely(is_global_init(tsk)))
+               goto dump;
        /* Leave if the signal won't be shown */
        if (!show_unhandled_signals ||
            !unhandled_signal(tsk, signo) ||
            !__ratelimit(&rs))
                return;
-
+dump:
        pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk));
        if (esr)
                pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr);
@@ -261,6 +263,7 @@ static void arm64_show_signal(int signo, const char *str)
        print_vma_addr(KERN_CONT " in ", regs->pc);
        pr_cont("\n");
        __show_regs(regs);
+       arch_dump_user_stacktrace(regs);
 }

2) we can dump the backtrace of a user thread which is blocked in the D state in hung_task;
in this way, we can get more clues to find the reason the user thread is blocked.
+++ b/kernel/hung_task.c
@@ -137,6 +137,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
                        init_utsname()->version);
                pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
                        " disables this message.\n");
+               if (user_mode(regs))
+                       arch_dump_user_stacktrace(task_pt_regs(t));
                sched_show_task(t);


> > > We already have user stacktracing code in arch/arm64/kernel/perf_callchain.c
> > > which doesn't depend on this API. What does this API enable that we don't
> > > support today?
> > > 
> > Sorry, I indeed ignored this case, but it seems only work for perf syscall chain in
> > case of CONFIG_PERF_EVENTS enabled without universality.
> > It's supposed to introduce a common API for dumping user backtrace in kernel space.
> 
> I understand that; I'm saying that you should *modify* this code to handle both
> cases.
> 
> > > > 2. Add arch_stack_walk_user() implementation to support userstacktrace transsionce option.
> > > 
> > > What is this 'userstacktrace transsionce option' ?
> > > 
> > > > A example test about the output format of ftrace userstacktrace as shown below:
> > > >     bash-489     [000] .....  2167.660775: sched_process_fork: comm=bash pid=489 child_comm=bash child_pid=596
> > > >     bash-489     [000] .....  2167.660787: <user stack trace>
> > > >  => /lib/aarch64-linux-gnu/libc-2.32.so[+0xa76d8]
> > > >  => /bin/bash[+0x5f354]
> > > >  => /bin/bash[+0x4876c]
> > > >  => /bin/bash[+0x4aec4]
> > > >  => /bin/bash[+0x4da48]
> > > >  => /bin/bash[+0x4b710]
> > > >  => /bin/bash[+0x4c31c]
> > > >  => /bin/bash[+0x339b0]
> > > > 
> > > > Tested-by-by: chenqiwu <qiwu.chen@transsion.com>
> > > > Signed-off-by: chenqiwu <qiwu.chen@transsion.com>
> > > > ---
> > > >  arch/arm64/Kconfig             |   1 +
> > > >  arch/arm64/kernel/stacktrace.c | 208 +++++++++++++++++++++++++++++++++
> > > >  include/linux/stacktrace.h     |  10 ++
> > > >  3 files changed, 219 insertions(+)
> > > 
> > > As above, we already have user stacktracing code, and we shouldn't add
> > > *distinct* unwinders. Either that code should be factored out and reused, or
> > > this code should replace it.
> > > 
> > Currently, ARM64 platform is not supported for ftrace userstacktrace profile feature,
> > since CONFIG_USER_STACKTRACE_SUPPORT is not enabled, the call chain cannot be accessed:
> > ftrace_trace_userstack -> stack_trace_save_user -> arch_stack_walk_user
> 
> As above, please reuse the existing code, e.g. take the existing logic in
> perf_callchain_user() and use it to implement arch_stack_walk_user(), and call
> that from perf_callchain_user() an stack_trace_save_user().
> 
Okay, I will develop stack_trace_save_user() referring to perf_callchain_user() and
resend this patch as V2.

Thanks
Qiwu
> > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > > index 7b071a004..4c5066f88 100644
> > > > --- a/arch/arm64/Kconfig
> > > > +++ b/arch/arm64/Kconfig
> > > > @@ -255,6 +255,7 @@ config ARM64
> > > >  	select TRACE_IRQFLAGS_SUPPORT
> > > >  	select TRACE_IRQFLAGS_NMI_SUPPORT
> > > >  	select HAVE_SOFTIRQ_ON_OWN_STACK
> > > > +	select USER_STACKTRACE_SUPPORT
> > > >  	help
> > > >  	  ARM 64-bit (AArch64) Linux support.
> > > >  
> > > > diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> > > > index 17f66a74c..4e7bf2922 100644
> > > > --- a/arch/arm64/kernel/stacktrace.c
> > > > +++ b/arch/arm64/kernel/stacktrace.c
> > > > @@ -215,6 +215,214 @@ static bool dump_backtrace_entry(void *arg, unsigned long where)
> > > >  	return true;
> > > >  }
> > > >  
> > > > +/* The struct defined for AArch64 userspace stack frame */
> > > > +struct stack_frame_user {
> > > > +	unsigned long fp;
> > > > +	unsigned long sp;
> > > > +	unsigned long pc;
> > > > +};
> > > > +
> > > > +/*
> > > > + * The function of AArch64 userspace stack frame unwind method.
> > > > + * Note: If the caller is not current task, it's supposed to call
> > > > + * access_process_vm() to access another task' address space.
> > > > + */
> > > > +static int arch_unwind_user_frame(struct task_struct *tsk, unsigned long high,
> > > > +				struct stack_frame_user *frame)
> > > > +{
> > > > +	int ret = 0;
> > > > +	unsigned long fp = frame->fp;
> > > > +	unsigned long low = frame->sp;
> > > > +
> > > > +	if (fp < low || fp > high || fp & 0xf)
> > > > +		return -EFAULT;
> > > > +
> > > > +	frame->sp = fp + 0x10;
> > > 
> > > Given you always set frame->sp as fp + 0x10, why does frame->sp need to exist
> > > at all?
> > > 
> > frame->sp refer to the bottom of stack VMA, which increased at least 0x10 on every entry of
> > arch_unwind_user_frame. A vaild frame->fp is suppoed to between upper and lower limit of
> > task stack VMA.
> 
> What you're calling the 'sp' is the top of the last frame record. It has
> nothing to do with the SP, and has nothing to do with any VMA.
> 
> You don't need a separate field for this.
> 
> > 
> > > Per AAPCS64, the frame record only conatins a copy of the FP and LR, and is
> > > *not* directly associated with the SP, so I don't think we should pretend it
> > > is.
> > > 
> > > > +	/* Disable page fault to make sure get_user going on wheels */
> > > 
> > > I have no idea what this comment is trying to say.
> > > 
> > > Why exactly you you think we need to disable page faults? Isn't that going to
> > > make this fail arbitrarily when we *can* fault pages in? I know that the
> > > existing perf unwinder does this, but that's a design problem we'd like to
> > > solve (e.g. by deferring the unwind until return to userspace).
> > > 
> > Hhmm, I refer to the design of existing user unwinder. User access methods will not sleep
> > when called from a pagefault_disabled(), so the get_user can be acceed in atomic context.
> 
> I think the comment should clearly say that, then.
> 
> I was confused by "going on wheels", which doesn't mean anything in particular.
> 
> > 
> > > > +	pagefault_disable();
> > > > +	if (tsk == current) {
> > > > +		if (get_user(frame->fp, (unsigned long __user *)fp) ||
> > > > +			get_user(frame->pc, (unsigned long __user *)(fp + 8)))
> > > > +			ret = -EFAULT;
> > > > +	} else {
> > > > +		if (access_process_vm(tsk, fp, &frame->fp,
> > > > +			sizeof(unsigned long), 0) != sizeof(unsigned long) ||
> > > > +			access_process_vm(tsk, fp + 0x08, &frame->pc,
> > > > +			sizeof(unsigned long), 0) != sizeof(unsigned long))
> > > > +			ret = -EFAULT;
> > > > +	}
> > > > +	pagefault_enable();
> > > 
> > > If task isn't current, userspace could be running and this will be racy and
> > > unreliable.
> > > 
> > > Where is this used with task != current? Why do we need to support that case at
> > > all?
> > > 
> > It's my idea to support the case for caller who really want to dump another task's backtrace.
> 
> Are there *any* eixsting users in the kernel tree?
> 
> If not, please just delete this.
> 
> > Not sure the racy and unreliablity since access_process_vm call get_task_mm and mmap_read_lock
> > to pin the task's address space.
> 
> If task != current, then it can be concurrently executing. In that case, its
> regs are stale and the task can concurrently modify its stack. Pinning the
> memory doesn't prevent that.
> 
> > We can see access_process_vm is safely used in get_cmdline() case.
> > 
> > > What does this do for COMPAT tasks?
> > >
> > This patch is not covered COMPAT task unwinder yet.
> 
> Supporting COMPAT is necessary for this to be merged.
> 
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Print the executable address and corresponding VMA info.
> > > > + */
> > > > +static void print_vma_addr_info(char *prefix, struct task_struct *task,
> > > > +				unsigned long ip, const char *loglvl)
> > > > +{
> > > > +	struct mm_struct *mm;
> > > > +	struct vm_area_struct *vma;
> > > > +
> > > > +	if (task != current)
> > > > +		mm = get_task_mm(task);
> > > > +	else
> > > > +		mm = task->mm;
> > > 
> > > Why can't we always use get_task_mm(), even for task == current?
> > > 
> > get_task_mm increase the task's mm_users which refers to the number of users who access the mm
> > in order to pin the task's mm.
> > I think it's meaningless to use get_task_mm for task == current since task will not decrease
> > its mm_users before calling do_exit -> exit_mm.
> 
> So? It's *safe* to use get_task_mm() and mmput() regardless, and it's far
> simpler than conditional logic.
> 
> Please do not treat current as a special case. Always use the same logic to get/put the mm.
> 
> > > > +
> > > > +	if (!mm)
> > > > +		return;
> > > > +	/*
> > > > +	 * we might be running from an atomic context so we cannot sleep
> > > > +	 */
> > > > +	if (!mmap_read_trylock(mm)) {
> > > > +		mmput(mm);
> > > > +		return;
> > > > +	}
> > > 
> > > When is this called from an atomic context?
> > > 
> > User who call it in an atomic context such as interrupt context.
> 
> Yes, but *which* users?
> 
> e.g. does ftrace end up calling this in atomic context, because we might trace
> functions called in an IRQ handler?
> 
> > > > +
> > > > +	vma = find_vma(mm, ip);
> > > > +	if (vma && vma->vm_file) {
> > > > +		struct file *f = vma->vm_file;
> > > > +		char *buf = (char *)__get_free_page(GFP_NOWAIT);
> > > > +
> > > > +		if (buf) {
> > > > +			char *p;
> > > > +
> > > > +			p = file_path(f, buf, PAGE_SIZE);
> > > > +			if (IS_ERR(p))
> > > > +				p = "?";
> > > > +			printk("%s%s%s[%lx-%lx]\n", loglvl, prefix, p,
> > > > +					vma->vm_start,
> > > > +					vma->vm_end);
> > > > +			free_page((unsigned long)buf);
> > > > +		}
> > > > +	}
> > > > +	mmap_read_unlock(mm);
> > > > +	if (task != current)
> > > > +		mmput(mm);
> > > > +}
> > > > +
> > > > +static struct vm_area_struct *find_user_stack_vma(struct task_struct *task, unsigned long sp)
> > > > +{
> > > > +	struct mm_struct *mm;
> > > > +	struct vm_area_struct *vma;
> > > > +
> > > > +	if (task != current)
> > > > +		mm = get_task_mm(task);
> > > > +	else
> > > > +		mm = task->mm;
> > > > +
> > > > +	if (!mm)
> > > > +		return NULL;
> > > > +	/*
> > > > +	 * we might be running from an atomic context so we cannot sleep
> > > > +	 */
> > > > +	if (!mmap_read_trylock(mm)) {
> > > > +		mmput(mm);
> > > > +		return NULL;
> > > > +	}
> > > > +	vma = find_vma(mm, sp);
> > > > +	mmap_read_unlock(mm);
> > > > +	if (task != current)
> > > > +		mmput(mm);
> > > 
> > > What guarantees the VMA is safe to use after this? What ensures that it won't
> > > be freed? What ensures that it is still valid and not subject to concurrent
> > > modification?
> > > 
> > get_task_mm and mmap_read_trylock will ensure the VMA safe to use, the reliazation of two APIs refer
> > to print_vma_addr() in mm/memory.c.
> 
> We've just called:
> 
> 	mmap_read_unlock(mm);
> 	...
> 	mmput(mm);
> 
> What ensures that the dangling reference to the VMA is still safe?
> 
> AFAICT, nothing does.
> 
> > > > +
> > > > +	return vma;
> > > > +}
> > > > +
> > > > +static void dump_user_backtrace_entry(struct task_struct *tsk,
> > > > +				unsigned long where, const char *loglvl)
> > > > +{
> > > > +	char prefix[64];
> > > > +
> > > > +	snprintf(prefix, sizeof(prefix), "<0x%lx> in ", where);
> > > > +	print_vma_addr_info(prefix, tsk, where, loglvl);
> > > > +}
> > > > +
> > > > +void arch_dump_user_stacktrace(struct pt_regs *regs, struct task_struct *tsk,
> > > > +								const char *loglvl)
> > > > +{
> > > > +	struct stack_frame_user frame;
> > > > +	struct vm_area_struct *vma;
> > > > +	unsigned long userstack_start, userstack_end;
> > > > +
> > > > +	if (!tsk)
> > > > +		tsk = current;
> > > > +
> > > > +	/*
> > > > +	 * If @regs is not specified or caller is not current task,.
> > > > +	 * @regs is supposed to get from @tsk.
> > > > +	 */
> > > > +	if (!regs || tsk != current)
> > > > +		regs = task_pt_regs(tsk);
> > > 
> > > The user state is *always* in task_pt_regs(tsk), even when tsk == current.
> > > 
> > > Why does this function take the regs as an argument at all?
> > > 
> > The API export the two argument(regs and tsk) for caller, we must make legitimacy judgments
> > to aviod the caller passed unreasonable arguments.
> 
> My point is that you can aovid that *by construction*.
> 
> If this was:
> 
> 	void arch_dump_user_stacktrace(struct task_struct *tsk, const char *loglvl)
> 
> ... then it's *impossible* for callers to pass incorrect arguments.
> 
> That said, there is no in-tree user for this function.
> 
> Please delete this function.
> 
> > > > +
> > > > +	/* TODO: support stack unwind for compat user mode */
> > > > +	if (compat_user_mode(regs))
> > > > +		return;
> > > > +
> > > > +	userstack_start = regs->user_regs.sp;
> > > > +	vma = find_user_stack_vma(tsk, userstack_start);
> > > > +	if (!vma)
> > > > +		return;
> > > > +
> > > > +	userstack_end = vma->vm_end;
> > > > +	frame.fp = regs->user_regs.regs[29];
> > > > +	frame.sp = userstack_start;
> > > > +	frame.pc = regs->user_regs.pc;
> > > > +
> > > > +	printk("%s[%s-%d] Dump user backtrace:\n", loglvl, tsk->comm, tsk->pid);
> > > > +	while (1) {
> > > > +		unsigned long where = frame.pc;
> > > > +
> > > > +		if (!where || where & 0x3)
> > > > +			break;
> > > > +		dump_user_backtrace_entry(tsk, where, loglvl);
> > > > +		if (arch_unwind_user_frame(tsk, userstack_end, &frame) < 0)
> > > > +			break;
> > > > +	}
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(arch_dump_user_stacktrace);
> > > 
> > > Where is this used from?
> > > 
> > > Why should it be exported?
> > > 
> > As replied at front, the API is supposed to be used by kernel space such as kernel modules.
> 
> This should only be added if there is an in-tree user. There are no existing
> users and none are added by this series.
> 
> Please delete this function.
> 
> NAK to adding this.
> 
> > > > +
> > > > +/**
> > > > + * stack_trace_save_user - Save user space stack traces into a storage array
> > > > + * @consume_entry: Callback for save a user space stack trace
> > > > + * @cookie:	Caller supplied pointer handed back by arch_stack_walk()
> > > > + * @regs: The pt_regs pointer of current task
> > > > + */
> > > > +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
> > > > +			  const struct pt_regs *regs)
> > > > +{
> > > > +	struct stack_frame_user frame;
> > > > +	struct vm_area_struct *vma;
> > > > +	unsigned long userstack_start, userstack_end;
> > > > +	struct task_struct *tsk = current;
> > > > +
> > > > +	/* TODO: support stack unwind for compat user mode */
> > > > +	if (!regs || !user_mode(regs) || compat_user_mode(regs))
> > > > +		return;
> > > > +
> > > > +	userstack_start = regs->user_regs.sp;
> > > > +	vma = find_user_stack_vma(tsk, userstack_start);
> > > > +	if (!vma)
> > > 
> > > Yet again this duplicates the code above.
> > > 
> > > If we really need this, then arch_stack_walk_user() should be the real
> > > unwinder, and the caes above should be built atop arch_stack_walk_user().
> > > 
> > > > +		return;
> > > > +
> > > > +	userstack_end = vma->vm_end;
> > > > +	frame.fp = regs->user_regs.regs[29];
> > > > +	frame.sp = userstack_start;
> > > > +	frame.pc = regs->user_regs.pc;
> > > > +
> > > > +	while (1) {
> > > > +		unsigned long where = frame.pc;
> > > > +
> > > > +		/* Sanity check: ABI requires pc to be aligned 4 bytes. */
> > > > +		if (!where || where & 0x3)
> > > > +			break;
> > > 
> > > Why do we care whether the PC is valid?
> > > 
> > If pc is invaild, it's meaningless to unwind whole unwind, just skip unwinding.
> 
> Why should we enforce that policy?
> 
> If the PC is 0, or is in a non-executable page, it's equally meaningless but we
> don't check that.
> 
> What if someone is deliberately using a non-canonical PC to force a prefetch
> abort, which they'll handle later (e.g. JITs might do that)?
> 
> > > There are plenty of other things that we could check (e.g. whether this points
> > > to executable memory), but it seems kinda pointless to care beyond whether we
> > > can unwind the frame.
> > > 
> > > Note that we're missing the LR anyway, so this *isn't* a reliable unwind.
> > > 
> > The frame.pc is used to record LR, sanity check for lr and fp will make a reliable unwind
> > since we can safely use print_vma_addr_info() to transfer LR addr to VMA.
> 
> I'm saying you're missing the regs->user_regs.lr, and since you cannot know
> whether that LR is valid, the unwind is unreliable. In the absence of metadata
> from the compiler, you cannot handle that reliably.
> 
> The existing perf unwinder has the same issue, and we leave it to userspace to
> recover the LR via DWARF is necessary, which isn't an option here.
> 
> Thanks,
> Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] arm64: Add user stacktrace support
  2023-11-24  9:55       ` chenqiwu
@ 2023-11-24 10:12         ` Mark Rutland
  0 siblings, 0 replies; 8+ messages in thread
From: Mark Rutland @ 2023-11-24 10:12 UTC (permalink / raw)
  To: chenqiwu
  Cc: catalin.marinas, will, kaleshsingh, ardb, mhiramat,
	linux-arm-kernel

On Fri, Nov 24, 2023 at 05:55:27PM +0800, chenqiwu wrote:
> On Tue, Nov 21, 2023 at 02:05:29PM +0000, Mark Rutland wrote:
> > On Tue, Nov 21, 2023 at 05:03:00PM +0800, chenqiwu wrote:
> > > On Mon, Nov 20, 2023 at 02:29:17PM +0000, Mark Rutland wrote:
> > > > On Sat, Nov 18, 2023 at 09:45:04PM +0800, qiwuchen55@gmail.com wrote:
> > > > > From: chenqiwu <qiwu.chen@transsion.com>
> > > > > 
> > > > > 1. Introduce and export arch_dump_user_stacktrace() API to support
> > > > > user stacktrace dump for a user task (both current and non-current task).
> > > > > A example test about the log format of user stacktrace as shown below:
> > > > > [test-515] Dump user backtrace:
> > > > > <0xffffb0c1a750> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > > > > <0xaaaacbf8097c> in /mnt/test[aaaacbf80000-aaaacbf81000]
> > > > > <0xffffb0b778b8> in /lib/aarch64-linux-gnu/libc-2.32.so[ffffb0b53000-ffffb0cb1000]
> > > > > <0xaaaacbf80834> in /mnt/test[aaaacbf80000-aaaacbf81000]
> > > > 
> > > > Where is this used?
> > > > 
> > > It's used in kernel space for some case need user backtrace for debugging,
> > > such as:  https://lkml.org/lkml/2023/11/9/1365
> > 
> > That's not in mainline.
> >
> I konw, you may misslead my intention for introducing the arch_dump_user_stacktrace API.
> For this case only dump panic kernel trace any user regs and backtrace info on global
> init exit. If arch_dump_user_stacktrace API can be merged into kernel-tree firstly, we
> can use the API directly to dump user backtrace on global init exit, which will be helpful
> to find the global init exit reason.

I don't know what you mean by "you may misslead my intention" here. This patch
didn't add code to call the backtrace on global init exit, nor did the commit
message say anything about that.

Regardless, that doesn't change the fact that there's no caller in mainline nor
any caller added in this patch. NAK for adding that without a user.

> BTW, I think there are two significant places for the usage of arch_dump_user_stacktrace:
> 1) we can dump any important user thread backtrace without log ratelimit in arm64_show_signal(),
> e.g: global_init thread
> +++ b/arch/arm64/kernel/traps.c
> @@ -247,12 +247,14 @@ static void arm64_show_signal(int signo, const char *str)
>         unsigned long esr = tsk->thread.fault_code;
>         struct pt_regs *regs = task_pt_regs(tsk);
> 
> +       if (unlikely(is_global_init(tsk)))
> +               goto dump;
>         /* Leave if the signal won't be shown */
>         if (!show_unhandled_signals ||
>             !unhandled_signal(tsk, signo) ||
>             !__ratelimit(&rs))
>                 return;
> -
> +dump:
>         pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk));
>         if (esr)
>                 pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr);
> @@ -261,6 +263,7 @@ static void arm64_show_signal(int signo, const char *str)
>         print_vma_addr(KERN_CONT " in ", regs->pc);
>         pr_cont("\n");
>         __show_regs(regs);
> +       arch_dump_user_stacktrace(regs);
>  }

Huh? So for any signal directed to the global init task we'll print something
out, even if that signal will be handled?

That is *not* a good idea, and I don't think that's something we want generally
(though I appreciate it may help to debug some issues).

> 2) we can dump the user thread's backtrace which blocked in D state in hung_task,
> in this way, we can get more clues to find the user thread blocked reason.
> +++ b/kernel/hung_task.c
> @@ -137,6 +137,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
>                         init_utsname()->version);
>                 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
>                         " disables this message.\n");
> +               if (user_mode(regs))
> +                       arch_dump_user_stacktrace(task_pt_regs(t));
>                 sched_show_task(t);

Maybe, but I don't think that's actually necessary.

> > > > >  arch/arm64/Kconfig             |   1 +
> > > > >  arch/arm64/kernel/stacktrace.c | 208 +++++++++++++++++++++++++++++++++
> > > > >  include/linux/stacktrace.h     |  10 ++
> > > > >  3 files changed, 219 insertions(+)
> > > > 
> > > > As above, we already have user stacktracing code, and we shouldn't add
> > > > *distinct* unwinders. Either that code should be factored out and reused, or
> > > > this code should replace it.
> > > > 
> > > Currently, ARM64 platform is not supported for ftrace userstacktrace profile feature,
> > > since CONFIG_USER_STACKTRACE_SUPPORT is not enabled, the call chain cannot be accessed:
> > > ftrace_trace_userstack -> stack_trace_save_user -> arch_stack_walk_user
> > 
> > As above, please reuse the existing code, e.g. take the existing logic in
> > perf_callchain_user() and use it to implement arch_stack_walk_user(), and call
> > that from perf_callchain_user() an stack_trace_save_user().
> > 
> Okay, I will develop the stack_trace_save_user() refers to perf_callchain_user() and
> resend this patch as V2.

Thanks!

Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-11-24 10:13 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-11-18 13:45 [PATCH] arm64: Add user stacktrace support qiwuchen55
2023-11-18 20:40 ` kernel test robot
2023-11-20 14:29 ` Mark Rutland
2023-11-20 17:46   ` Mark Rutland
2023-11-21  9:03   ` chenqiwu
2023-11-21 14:05     ` Mark Rutland
2023-11-24  9:55       ` chenqiwu
2023-11-24 10:12         ` Mark Rutland

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox