[PATCH] ptrace RSE bug

public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] ptrace RSE bug
@ 2007-08-29  3:21 Shaohua Li
  2007-08-29  7:10 ` Roland McGrath
                   ` (34 more replies)
  0 siblings, 35 replies; 36+ messages in thread
From: Shaohua Li @ 2007-08-29  3:21 UTC (permalink / raw)
  To: linux-ia64

This is base kernel patch for ptrace RSE bug. It's basically a backport
from the utrace RSE patch I sent out several weeks ago. please review.

when a thread is stopped (ptraced), debugger might change thread's user
stack (change memory directly), and we must avoid the RSE stored in
kernel to override user stack (user space's RSE is newer than kernel's
in the case). To workaround the issue, we copy kernel RSE to user RSE
before the task is stopped, so user RSE has updated data.  we then copy
user RSE to kernel after the task is resummed from traced stop and
kernel will use the newer RSE to return to user. 

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 arch/ia64/kernel/perfmon.c     |   21 +---------
 arch/ia64/kernel/process.c     |   15 +++++++
 arch/ia64/kernel/ptrace.c      |   82 +++++++++++++++++++++++++++++++++++++++++
 include/asm-ia64/ptrace.h      |    3 +
 include/asm-ia64/thread_info.h |   11 ++++-
 include/linux/ptrace.h         |    4 ++
 kernel/signal.c                |    5 ++
 7 files changed, 119 insertions(+), 22 deletions(-)

Index: linux/arch/ia64/kernel/perfmon.c
=================================--- linux.orig/arch/ia64/kernel/perfmon.c	2007-08-28 16:54:18.000000000 +0800
+++ linux/arch/ia64/kernel/perfmon.c	2007-08-28 16:54:41.000000000 +0800
@@ -586,21 +586,6 @@ pfm_put_task(struct task_struct *task)
 }
 
 static inline void
-pfm_set_task_notify(struct task_struct *task)
-{
-	struct thread_info *info;
-
-	info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE);
-	set_bit(TIF_PERFMON_WORK, &info->flags);
-}
-
-static inline void
-pfm_clear_task_notify(void)
-{
-	clear_thread_flag(TIF_PERFMON_WORK);
-}
-
-static inline void
 pfm_reserve_page(unsigned long a)
 {
 	SetPageReserved(vmalloc_to_page((void *)a));
@@ -3730,7 +3715,7 @@ pfm_restart(pfm_context_t *ctx, void *ar
 
 		PFM_SET_WORK_PENDING(task, 1);
 
-		pfm_set_task_notify(task);
+		tsk_set_notify_resume(task);
 
 		/*
 		 * XXX: send reschedule if task runs on another CPU
@@ -5087,7 +5072,7 @@ pfm_handle_work(void)
 
 	PFM_SET_WORK_PENDING(current, 0);
 
-	pfm_clear_task_notify();
+	tsk_clear_notify_resume(current);
 
 	regs = task_pt_regs(current);
 
@@ -5455,7 +5440,7 @@ pfm_overflow_handler(struct task_struct 
 			 * when coming from ctxsw, current still points to the
 			 * previous task, therefore we must work with task and not current.
 			 */
-			pfm_set_task_notify(task);
+			tsk_set_notify_resume(task);
 		}
 		/*
 		 * defer until state is changed (shorten spin window). the context is locked
Index: linux/arch/ia64/kernel/process.c
=================================--- linux.orig/arch/ia64/kernel/process.c	2007-08-28 16:54:18.000000000 +0800
+++ linux/arch/ia64/kernel/process.c	2007-08-28 16:54:41.000000000 +0800
@@ -154,6 +154,17 @@ show_regs (struct pt_regs *regs)
 		show_stack(NULL, NULL);
 }
 
+void tsk_clear_notify_resume(struct task_struct *tsk)
+{
+#ifdef CONFIG_PERFMON
+	if (tsk->thread.pfm_needs_checking)
+		return;
+#endif
+	if (test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_RSE))
+		return;
+	clear_ti_thread_flag(task_thread_info(tsk), TIF_NOTIFY_RESUME);
+}
+
 void
 do_notify_resume_user (sigset_t *unused, struct sigscratch *scr, long in_syscall)
 {
@@ -172,6 +183,10 @@ do_notify_resume_user (sigset_t *unused,
 	/* deal with pending signal delivery */
 	if (test_thread_flag(TIF_SIGPENDING)||test_thread_flag(TIF_RESTORE_SIGMASK))
 		ia64_do_signal(scr, in_syscall);
+
+	/* copy user rbs to kernel rbs */
+	if (unlikely(test_thread_flag(TIF_RESTORE_RSE)))
+		ia64_sync_krbs();
 }
 
 static int pal_halt        = 1;
Index: linux/arch/ia64/kernel/ptrace.c
=================================--- linux.orig/arch/ia64/kernel/ptrace.c	2007-08-28 16:54:18.000000000 +0800
+++ linux/arch/ia64/kernel/ptrace.c	2007-08-29 10:59:02.000000000 +0800
@@ -547,6 +547,74 @@ ia64_sync_user_rbs (struct task_struct *
 	return 0;
 }
 
+static long
+ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
+		unsigned long user_rbs_start, unsigned long user_rbs_end)
+{
+	unsigned long addr, val;
+	long ret;
+
+	/* now copy word for word from user rbs to kernel rbs: */
+	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
+		if (access_process_vm(child, addr, &val, sizeof(val), 0)
+				!= sizeof(val))
+			return -EIO;
+
+		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+#define RBS_SYNC_TO_USER 0
+#define RBS_SYNC_TO_KERNEL 1
+static void do_sync_rbs(struct unw_frame_info *info, void *arg)
+{
+	struct pt_regs *pt;
+	unsigned long urbs_end;
+	int to_user = (unsigned long)arg;
+
+	if (unw_unwind_to_user(info) < 0)
+		return;
+	pt = task_pt_regs(info->task);
+	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
+
+	if (to_user = RBS_SYNC_TO_USER)
+		ia64_sync_user_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
+	else
+		ia64_sync_kernel_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
+}
+
+/*
+ * when a thread is stopped (ptraced), debugger might change thread's user
+ * stack (change memory directly), and we must avoid the RSE stored in kernel
+ * to override user stack (user space's RSE is newer than kernel's in the
+ * case). To workaround the issue, we copy kernel RSE to user RSE before the
+ * task is stopped, so user RSE has updated data.  we then copy user RSE to
+ * kernel after the task is resummed from traced stop and kernel will use the
+ * newer RSE to return to user. TIF_RESTORE_RSE is the flag to indicate we need
+ * synchronize user RSE to kernel.
+ */
+void arch_ptrace_stop(void)
+{
+	if (test_and_set_tsk_thread_flag(current, TIF_RESTORE_RSE))
+		return;
+	tsk_set_notify_resume(current);
+	unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_USER);
+}
+
+/*
+ * This is called to read back the register backing store.
+ */
+void ia64_sync_krbs(void)
+{
+	clear_tsk_thread_flag(current, TIF_RESTORE_RSE);
+	tsk_clear_notify_resume(current);
+
+	unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_KERNEL);
+}
+
 static inline int
 thread_matches (struct task_struct *thread, unsigned long addr)
 {
@@ -1422,6 +1490,7 @@ sys_ptrace (long request, pid_t pid, uns
 	struct task_struct *child;
 	struct switch_stack *sw;
 	long ret;
+	struct unw_frame_info info;
 
 	lock_kernel();
 	ret = -EPERM;
@@ -1481,6 +1550,11 @@ sys_ptrace (long request, pid_t pid, uns
 		/* write the word at location addr */
 		urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
 		ret = ia64_poke(child, sw, urbs_end, addr, data);
+
+		/* Make sure user RBS has the latest data */
+		unw_init_from_blocked_task(&info, child);
+		do_sync_rbs(&info, RBS_SYNC_TO_USER);
+
 		goto out_tsk;
 
 	      case PTRACE_PEEKUSR:
@@ -1635,6 +1709,10 @@ syscall_trace_enter (long arg0, long arg
 	    && (current->ptrace & PT_PTRACED))
 		syscall_trace();
 
+	/* copy user rbs to kernel rbs */
+	if (test_thread_flag(TIF_RESTORE_RSE))
+		ia64_sync_krbs();
+
 	if (unlikely(current->audit_context)) {
 		long syscall;
 		int arch;
@@ -1672,4 +1750,8 @@ syscall_trace_leave (long arg0, long arg
 	    || test_thread_flag(TIF_SINGLESTEP))
 	    && (current->ptrace & PT_PTRACED))
 		syscall_trace();
+
+	/* copy user rbs to kernel rbs */
+	if (test_thread_flag(TIF_RESTORE_RSE))
+		ia64_sync_krbs();
 }
Index: linux/include/asm-ia64/ptrace.h
=================================--- linux.orig/include/asm-ia64/ptrace.h	2007-08-28 16:54:18.000000000 +0800
+++ linux/include/asm-ia64/ptrace.h	2007-08-28 16:54:41.000000000 +0800
@@ -292,6 +292,7 @@ struct switch_stack {
 			 unsigned long, long);
   extern void ia64_flush_fph (struct task_struct *);
   extern void ia64_sync_fph (struct task_struct *);
+  extern void ia64_sync_krbs(void);
   extern long ia64_sync_user_rbs (struct task_struct *, struct switch_stack *,
 				  unsigned long, unsigned long);
 
@@ -303,6 +304,8 @@ struct switch_stack {
   extern void ia64_increment_ip (struct pt_regs *pt);
   extern void ia64_decrement_ip (struct pt_regs *pt);
 
+  extern void arch_ptrace_stop(void);
+  #define HAVE_ARCH_PTRACE_STOP
 #endif /* !__KERNEL__ */
 
 /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */
Index: linux/include/asm-ia64/thread_info.h
=================================--- linux.orig/include/asm-ia64/thread_info.h	2007-08-28 16:54:18.000000000 +0800
+++ linux/include/asm-ia64/thread_info.h	2007-08-28 16:54:41.000000000 +0800
@@ -71,6 +71,9 @@ struct thread_info {
 #define alloc_task_struct()	((struct task_struct *)__get_free_pages(GFP_KERNEL | __GFP_COMP, KERNEL_STACK_SIZE_ORDER))
 #define free_task_struct(tsk)	free_pages((unsigned long) (tsk), KERNEL_STACK_SIZE_ORDER)
 
+#define tsk_set_notify_resume(tsk) \
+	set_ti_thread_flag(task_thread_info(tsk), TIF_NOTIFY_RESUME)
+extern void tsk_clear_notify_resume(struct task_struct *tsk);
 #endif /* !__ASSEMBLY */
 
 /*
@@ -85,28 +88,30 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	3	/* syscall auditing active */
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
-#define TIF_PERFMON_WORK	6	/* work for pfm_handle_work() */
+#define TIF_NOTIFY_RESUME	6	/* resumption notification requested */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17
 #define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
 #define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
 #define TIF_FREEZE		20	/* is freezing for suspend */
+#define TIF_RESTORE_RSE		21	/* task need RSE synchronization */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_SYSCALL_TRACEAUDIT	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
-#define _TIF_PERFMON_WORK	(1 << TIF_PERFMON_WORK)
+#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_MCA_INIT		(1 << TIF_MCA_INIT)
 #define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
+#define _TIF_RESTORE_RSE	(1 << TIF_RESTORE_RSE)
 
 /* "work to do on user-return" bits */
-#define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_PERFMON_WORK|_TIF_SYSCALL_AUDIT|\
+#define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
 				 _TIF_NEED_RESCHED| _TIF_SYSCALL_TRACE|\
 				 _TIF_RESTORE_SIGMASK)
 /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
Index: linux/kernel/signal.c
=================================--- linux.orig/kernel/signal.c	2007-08-28 16:54:18.000000000 +0800
+++ linux/kernel/signal.c	2007-08-28 16:54:41.000000000 +0800
@@ -1600,9 +1600,12 @@ static void ptrace_stop(int exit_code, i
 	current->last_siginfo = info;
 	current->exit_code = exit_code;
 
+	spin_unlock_irq(&current->sighand->siglock);
 	/* Let the debugger run.  */
+	arch_ptrace_stop();
+
 	set_current_state(TASK_TRACED);
-	spin_unlock_irq(&current->sighand->siglock);
+
 	try_to_freeze();
 	read_lock(&tasklist_lock);
 	if (may_ptrace_stop()) {
Index: linux/include/linux/ptrace.h
=================================--- linux.orig/include/linux/ptrace.h	2007-08-28 16:54:18.000000000 +0800
+++ linux/include/linux/ptrace.h	2007-08-28 16:54:41.000000000 +0800
@@ -128,6 +128,10 @@ int generic_ptrace_pokedata(struct task_
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef HAVE_ARCH_PTRACE_STOP
+#define arch_ptrace_stop()	do { } while (0)
+#endif
+
 #endif
 
 #endif

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
@ 2007-08-29  7:10 ` Roland McGrath
  2007-08-29  7:29 ` Matthew Chapman
                   ` (33 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-08-29  7:10 UTC (permalink / raw)
  To: linux-ia64

The arch_ptrace_stop path there has an issue with SIGKILL races.  But if
the ia64 parts are approved to go in upstream, I can help out with the
generic kernel/signal.c part to make it safe.  Please let me know.

Thanks,
Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
  2007-08-29  7:10 ` Roland McGrath
@ 2007-08-29  7:29 ` Matthew Chapman
  2007-08-29  8:01 ` Roland McGrath
                   ` (32 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Matthew Chapman @ 2007-08-29  7:29 UTC (permalink / raw)
  To: linux-ia64

Perhaps I'm misunderstanding what you mean, but isn't this what is dealt
with in ia64_poke: if you try and update a process's user RBS via
ptrace, it writes to the kernel RBS instead?  How do you manage to make
the user RBS newer - are you somehow updating it via shared memory?  (In
which case I would say don't do that ;))

IMHO copying the RBS back and forth seems like an awfully expensive
workaround for something that would better be handled at the time that
you actually need to write to the RBS.

Matt


On Wed, Aug 29, 2007 at 11:21:50AM +0800, Shaohua Li wrote:
> This is base kernel patch for ptrace RSE bug. It's basically a backport
> from the utrace RSE patch I sent out several weeks ago. please review.
> 
> when a thread is stopped (ptraced), debugger might change thread's user
> stack (change memory directly), and we must avoid the RSE stored in
> kernel to override user stack (user space's RSE is newer than kernel's
> in the case). To workaround the issue, we copy kernel RSE to user RSE
> before the task is stopped, so user RSE has updated data.  we then copy
> user RSE to kernel after the task is resummed from traced stop and
> kernel will use the newer RSE to return to user. 
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> ---
>  arch/ia64/kernel/perfmon.c     |   21 +---------
>  arch/ia64/kernel/process.c     |   15 +++++++
>  arch/ia64/kernel/ptrace.c      |   82 +++++++++++++++++++++++++++++++++++++++++
>  include/asm-ia64/ptrace.h      |    3 +
>  include/asm-ia64/thread_info.h |   11 ++++-
>  include/linux/ptrace.h         |    4 ++
>  kernel/signal.c                |    5 ++
>  7 files changed, 119 insertions(+), 22 deletions(-)
> 
> Index: linux/arch/ia64/kernel/perfmon.c
> =================================> --- linux.orig/arch/ia64/kernel/perfmon.c	2007-08-28 16:54:18.000000000 +0800
> +++ linux/arch/ia64/kernel/perfmon.c	2007-08-28 16:54:41.000000000 +0800
> @@ -586,21 +586,6 @@ pfm_put_task(struct task_struct *task)
>  }
>  
>  static inline void
> -pfm_set_task_notify(struct task_struct *task)
> -{
> -	struct thread_info *info;
> -
> -	info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE);
> -	set_bit(TIF_PERFMON_WORK, &info->flags);
> -}
> -
> -static inline void
> -pfm_clear_task_notify(void)
> -{
> -	clear_thread_flag(TIF_PERFMON_WORK);
> -}
> -
> -static inline void
>  pfm_reserve_page(unsigned long a)
>  {
>  	SetPageReserved(vmalloc_to_page((void *)a));
> @@ -3730,7 +3715,7 @@ pfm_restart(pfm_context_t *ctx, void *ar
>  
>  		PFM_SET_WORK_PENDING(task, 1);
>  
> -		pfm_set_task_notify(task);
> +		tsk_set_notify_resume(task);
>  
>  		/*
>  		 * XXX: send reschedule if task runs on another CPU
> @@ -5087,7 +5072,7 @@ pfm_handle_work(void)
>  
>  	PFM_SET_WORK_PENDING(current, 0);
>  
> -	pfm_clear_task_notify();
> +	tsk_clear_notify_resume(current);
>  
>  	regs = task_pt_regs(current);
>  
> @@ -5455,7 +5440,7 @@ pfm_overflow_handler(struct task_struct 
>  			 * when coming from ctxsw, current still points to the
>  			 * previous task, therefore we must work with task and not current.
>  			 */
> -			pfm_set_task_notify(task);
> +			tsk_set_notify_resume(task);
>  		}
>  		/*
>  		 * defer until state is changed (shorten spin window). the context is locked
> Index: linux/arch/ia64/kernel/process.c
> =================================> --- linux.orig/arch/ia64/kernel/process.c	2007-08-28 16:54:18.000000000 +0800
> +++ linux/arch/ia64/kernel/process.c	2007-08-28 16:54:41.000000000 +0800
> @@ -154,6 +154,17 @@ show_regs (struct pt_regs *regs)
>  		show_stack(NULL, NULL);
>  }
>  
> +void tsk_clear_notify_resume(struct task_struct *tsk)
> +{
> +#ifdef CONFIG_PERFMON
> +	if (tsk->thread.pfm_needs_checking)
> +		return;
> +#endif
> +	if (test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_RSE))
> +		return;
> +	clear_ti_thread_flag(task_thread_info(tsk), TIF_NOTIFY_RESUME);
> +}
> +
>  void
>  do_notify_resume_user (sigset_t *unused, struct sigscratch *scr, long in_syscall)
>  {
> @@ -172,6 +183,10 @@ do_notify_resume_user (sigset_t *unused,
>  	/* deal with pending signal delivery */
>  	if (test_thread_flag(TIF_SIGPENDING)||test_thread_flag(TIF_RESTORE_SIGMASK))
>  		ia64_do_signal(scr, in_syscall);
> +
> +	/* copy user rbs to kernel rbs */
> +	if (unlikely(test_thread_flag(TIF_RESTORE_RSE)))
> +		ia64_sync_krbs();
>  }
>  
>  static int pal_halt        = 1;
> Index: linux/arch/ia64/kernel/ptrace.c
> =================================> --- linux.orig/arch/ia64/kernel/ptrace.c	2007-08-28 16:54:18.000000000 +0800
> +++ linux/arch/ia64/kernel/ptrace.c	2007-08-29 10:59:02.000000000 +0800
> @@ -547,6 +547,74 @@ ia64_sync_user_rbs (struct task_struct *
>  	return 0;
>  }
>  
> +static long
> +ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
> +		unsigned long user_rbs_start, unsigned long user_rbs_end)
> +{
> +	unsigned long addr, val;
> +	long ret;
> +
> +	/* now copy word for word from user rbs to kernel rbs: */
> +	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
> +		if (access_process_vm(child, addr, &val, sizeof(val), 0)
> +				!= sizeof(val))
> +			return -EIO;
> +
> +		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +#define RBS_SYNC_TO_USER 0
> +#define RBS_SYNC_TO_KERNEL 1
> +static void do_sync_rbs(struct unw_frame_info *info, void *arg)
> +{
> +	struct pt_regs *pt;
> +	unsigned long urbs_end;
> +	int to_user = (unsigned long)arg;
> +
> +	if (unw_unwind_to_user(info) < 0)
> +		return;
> +	pt = task_pt_regs(info->task);
> +	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
> +
> +	if (to_user = RBS_SYNC_TO_USER)
> +		ia64_sync_user_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
> +	else
> +		ia64_sync_kernel_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
> +}
> +
> +/*
> + * when a thread is stopped (ptraced), debugger might change thread's user
> + * stack (change memory directly), and we must avoid the RSE stored in kernel
> + * to override user stack (user space's RSE is newer than kernel's in the
> + * case). To workaround the issue, we copy kernel RSE to user RSE before the
> + * task is stopped, so user RSE has updated data.  we then copy user RSE to
> + * kernel after the task is resummed from traced stop and kernel will use the
> + * newer RSE to return to user. TIF_RESTORE_RSE is the flag to indicate we need
> + * synchronize user RSE to kernel.
> + */
> +void arch_ptrace_stop(void)
> +{
> +	if (test_and_set_tsk_thread_flag(current, TIF_RESTORE_RSE))
> +		return;
> +	tsk_set_notify_resume(current);
> +	unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_USER);
> +}
> +
> +/*
> + * This is called to read back the register backing store.
> + */
> +void ia64_sync_krbs(void)
> +{
> +	clear_tsk_thread_flag(current, TIF_RESTORE_RSE);
> +	tsk_clear_notify_resume(current);
> +
> +	unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_KERNEL);
> +}
> +
>  static inline int
>  thread_matches (struct task_struct *thread, unsigned long addr)
>  {
> @@ -1422,6 +1490,7 @@ sys_ptrace (long request, pid_t pid, uns
>  	struct task_struct *child;
>  	struct switch_stack *sw;
>  	long ret;
> +	struct unw_frame_info info;
>  
>  	lock_kernel();
>  	ret = -EPERM;
> @@ -1481,6 +1550,11 @@ sys_ptrace (long request, pid_t pid, uns
>  		/* write the word at location addr */
>  		urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
>  		ret = ia64_poke(child, sw, urbs_end, addr, data);
> +
> +		/* Make sure user RBS has the latest data */
> +		unw_init_from_blocked_task(&info, child);
> +		do_sync_rbs(&info, RBS_SYNC_TO_USER);
> +
>  		goto out_tsk;
>  
>  	      case PTRACE_PEEKUSR:
> @@ -1635,6 +1709,10 @@ syscall_trace_enter (long arg0, long arg
>  	    && (current->ptrace & PT_PTRACED))
>  		syscall_trace();
>  
> +	/* copy user rbs to kernel rbs */
> +	if (test_thread_flag(TIF_RESTORE_RSE))
> +		ia64_sync_krbs();
> +
>  	if (unlikely(current->audit_context)) {
>  		long syscall;
>  		int arch;
> @@ -1672,4 +1750,8 @@ syscall_trace_leave (long arg0, long arg
>  	    || test_thread_flag(TIF_SINGLESTEP))
>  	    && (current->ptrace & PT_PTRACED))
>  		syscall_trace();
> +
> +	/* copy user rbs to kernel rbs */
> +	if (test_thread_flag(TIF_RESTORE_RSE))
> +		ia64_sync_krbs();
>  }
> Index: linux/include/asm-ia64/ptrace.h
> =================================> --- linux.orig/include/asm-ia64/ptrace.h	2007-08-28 16:54:18.000000000 +0800
> +++ linux/include/asm-ia64/ptrace.h	2007-08-28 16:54:41.000000000 +0800
> @@ -292,6 +292,7 @@ struct switch_stack {
>  			 unsigned long, long);
>    extern void ia64_flush_fph (struct task_struct *);
>    extern void ia64_sync_fph (struct task_struct *);
> +  extern void ia64_sync_krbs(void);
>    extern long ia64_sync_user_rbs (struct task_struct *, struct switch_stack *,
>  				  unsigned long, unsigned long);
>  
> @@ -303,6 +304,8 @@ struct switch_stack {
>    extern void ia64_increment_ip (struct pt_regs *pt);
>    extern void ia64_decrement_ip (struct pt_regs *pt);
>  
> +  extern void arch_ptrace_stop(void);
> +  #define HAVE_ARCH_PTRACE_STOP
>  #endif /* !__KERNEL__ */
>  
>  /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */
> Index: linux/include/asm-ia64/thread_info.h
> =================================> --- linux.orig/include/asm-ia64/thread_info.h	2007-08-28 16:54:18.000000000 +0800
> +++ linux/include/asm-ia64/thread_info.h	2007-08-28 16:54:41.000000000 +0800
> @@ -71,6 +71,9 @@ struct thread_info {
>  #define alloc_task_struct()	((struct task_struct *)__get_free_pages(GFP_KERNEL | __GFP_COMP, KERNEL_STACK_SIZE_ORDER))
>  #define free_task_struct(tsk)	free_pages((unsigned long) (tsk), KERNEL_STACK_SIZE_ORDER)
>  
> +#define tsk_set_notify_resume(tsk) \
> +	set_ti_thread_flag(task_thread_info(tsk), TIF_NOTIFY_RESUME)
> +extern void tsk_clear_notify_resume(struct task_struct *tsk);
>  #endif /* !__ASSEMBLY */
>  
>  /*
> @@ -85,28 +88,30 @@ struct thread_info {
>  #define TIF_SYSCALL_AUDIT	3	/* syscall auditing active */
>  #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
>  #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
> -#define TIF_PERFMON_WORK	6	/* work for pfm_handle_work() */
> +#define TIF_NOTIFY_RESUME	6	/* resumption notification requested */
>  #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
>  #define TIF_MEMDIE		17
>  #define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
>  #define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
>  #define TIF_FREEZE		20	/* is freezing for suspend */
> +#define TIF_RESTORE_RSE		21	/* task need RSE synchronization */
>  
>  #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
>  #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
>  #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
>  #define _TIF_SYSCALL_TRACEAUDIT	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP)
>  #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
> -#define _TIF_PERFMON_WORK	(1 << TIF_PERFMON_WORK)
> +#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
>  #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>  #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
>  #define _TIF_MCA_INIT		(1 << TIF_MCA_INIT)
>  #define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
>  #define _TIF_FREEZE		(1 << TIF_FREEZE)
> +#define _TIF_RESTORE_RSE	(1 << TIF_RESTORE_RSE)
>  
>  /* "work to do on user-return" bits */
> -#define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_PERFMON_WORK|_TIF_SYSCALL_AUDIT|\
> +#define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
>  				 _TIF_NEED_RESCHED| _TIF_SYSCALL_TRACE|\
>  				 _TIF_RESTORE_SIGMASK)
>  /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
> Index: linux/kernel/signal.c
> =================================> --- linux.orig/kernel/signal.c	2007-08-28 16:54:18.000000000 +0800
> +++ linux/kernel/signal.c	2007-08-28 16:54:41.000000000 +0800
> @@ -1600,9 +1600,12 @@ static void ptrace_stop(int exit_code, i
>  	current->last_siginfo = info;
>  	current->exit_code = exit_code;
>  
> +	spin_unlock_irq(&current->sighand->siglock);
>  	/* Let the debugger run.  */
> +	arch_ptrace_stop();
> +
>  	set_current_state(TASK_TRACED);
> -	spin_unlock_irq(&current->sighand->siglock);
> +
>  	try_to_freeze();
>  	read_lock(&tasklist_lock);
>  	if (may_ptrace_stop()) {
> Index: linux/include/linux/ptrace.h
> =================================> --- linux.orig/include/linux/ptrace.h	2007-08-28 16:54:18.000000000 +0800
> +++ linux/include/linux/ptrace.h	2007-08-28 16:54:41.000000000 +0800
> @@ -128,6 +128,10 @@ int generic_ptrace_pokedata(struct task_
>  #define force_successful_syscall_return() do { } while (0)
>  #endif
>  
> +#ifndef HAVE_ARCH_PTRACE_STOP
> +#define arch_ptrace_stop()	do { } while (0)
> +#endif
> +
>  #endif
>  
>  #endif
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
  2007-08-29  7:10 ` Roland McGrath
  2007-08-29  7:29 ` Matthew Chapman
@ 2007-08-29  8:01 ` Roland McGrath
  2007-09-05 16:25 ` Petr Tesarik
                   ` (31 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-08-29  8:01 UTC (permalink / raw)
  To: linux-ia64

> Perhaps I'm misunderstanding what you mean, but isn't this what is dealt
> with in ia64_poke: if you try and update a process's user RBS via
> ptrace, it writes to the kernel RBS instead?  How do you manage to make
> the user RBS newer - are you somehow updating it via shared memory?  (In
> which case I would say don't do that ;))

This is a step on the path to getting rid of the ia64_poke kludges in
ptrace.  It has been under discussion for a long time (years now).  When
this is in place, ia64's special code for PTRACE_PEEKDATA et al will go
away and enable other desired cleanups in that code.  This also fixes the
longstanding problem that debuggers cannot reliably use /proc/pid/mem to
read all memory on ia64 (as they do on all other machines).  Because they
do not get true data for user RBS regions, debuggers now special-case ia64
and stick to using ptrace for memory access, crippling the performance of
some debugger uses (especially gdb's gcore) compared to Linux on other machines.

> IMHO copying the RBS back and forth seems like an awfully expensive
> workaround for something that would better be handled at the time that
> you actually need to write to the RBS.

Been there, done that, that's how it is now and those who did it in the
first place agreed it was a bad can of worms and this is the new plan.

Thanks,
Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (2 preceding siblings ...)
  2007-08-29  8:01 ` Roland McGrath
@ 2007-09-05 16:25 ` Petr Tesarik
  2007-09-06  3:16 ` Shaohua Li
                   ` (30 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-09-05 16:25 UTC (permalink / raw)
  To: linux-ia64

Shaohua Li wrote:
> This is base kernel patch for ptrace RSE bug. It's basically a backport
> from the utrace RSE patch I sent out several weeks ago. please review.
> 
> when a thread is stopped (ptraced), debugger might change thread's user
> stack (change memory directly), and we must avoid the RSE stored in
> kernel to override user stack (user space's RSE is newer than kernel's
> in the case). To workaround the issue, we copy kernel RSE to user RSE
> before the task is stopped, so user RSE has updated data.  we then copy
> user RSE to kernel after the task is resummed from traced stop and
> kernel will use the newer RSE to return to user. 

Hi Shaohua,

somehow I fail to see the need for a dedicated bit in the task flags.
The user RBS can only be more up-to-date than the kernel RBS if the
values there have been modified by a debugger. So, it should be enough
to copy the relevant part of the user RBS back into the kernel RBS when
the ptraced process is resumed, which happens AFAIK only:

  1. when the debugger lets it continue via ptrace(), or
  2. when the debugger ceases to exist.

Yes, we'd most likely have to add an arch_ptrace_resume() in addition to
arch_ptrace_stop(), but there are some advantages, too:

  1. It would save one bit in the task flags.
  2. The usual path for task switches is not modified
     (OK, the few instructions needed to check for TIF_RESTORE_RSE are
     negligible)

Or am I totally missing what you're trying to do?

Kind regards,
Petr Tesarik

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (3 preceding siblings ...)
  2007-09-05 16:25 ` Petr Tesarik
@ 2007-09-06  3:16 ` Shaohua Li
  2007-09-06 13:59 ` Petr Tesarik
                   ` (29 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-09-06  3:16 UTC (permalink / raw)
  To: linux-ia64

On Wed, 2007-09-05 at 18:25 +0200, Petr Tesarik wrote:
> Shaohua Li wrote:
> > This is base kernel patch for ptrace RSE bug. It's basically a backport
> > from the utrace RSE patch I sent out several weeks ago. please review.
> > 
> > when a thread is stopped (ptraced), debugger might change thread's user
> > stack (change memory directly), and we must avoid the RSE stored in
> > kernel to override user stack (user space's RSE is newer than kernel's
> > in the case). To workaround the issue, we copy kernel RSE to user RSE
> > before the task is stopped, so user RSE has updated data.  we then copy
> > user RSE to kernel after the task is resummed from traced stop and
> > kernel will use the newer RSE to return to user. 
> 
> Hi Shaohua,
> 
> somehow I fail to see the need for a dedicated bit in the task flags.
> The user RBS can only be more up-to-date than the kernel RBS if the
> values there have been modified by a debugger. So, it should be enough
> to copy the relevant part of the user RBS back into the kernel RBS when
> the ptraced process is resumed, which happens AFAIK only:
> 
>   1. when the debugger lets it continue via ptrace(), or
>   2. when the debugger ceases to exist.
> 
> Yes, we'd most likely have to add an arch_ptrace_resume() in addition to
> arch_ptrace_stop(), but there are some advantages, too:
> 
>   1. It would save one bit in the task flags.
>   2. The usual path for task switches is not modified
>      (OK, the few instructions needed to check for TIF_RESTORE_RSE are
>      negligible)
> 
> Or am I totally missing what you're trying to do?
No, you didn't. Ideaily the should do sync user RBS to kernel just after
ptraced process is resumed. In previous discussion, somebody thought
this might be too agressive, as ptraced task might do suspend/resume
several times in a single syscall (syscall trace, fork trace) and we
supposed debugger will only change user RBS just after syscall return.
so the flag is used to try to do less sync. If the assumption is not
true, we should always do sync just after ptraced task is resumed.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (4 preceding siblings ...)
  2007-09-06  3:16 ` Shaohua Li
@ 2007-09-06 13:59 ` Petr Tesarik
  2007-09-07  1:02 ` Shaohua Li
                   ` (28 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-09-06 13:59 UTC (permalink / raw)
  To: linux-ia64

On Thu, 2007-09-06 at 11:16 +0800, Shaohua Li wrote:
> On Wed, 2007-09-05 at 18:25 +0200, Petr Tesarik wrote:
> > Shaohua Li wrote:
> > > This is base kernel patch for ptrace RSE bug. It's basically a backport
> > > from the utrace RSE patch I sent out several weeks ago. please review.
> > > 
> > > when a thread is stopped (ptraced), debugger might change thread's user
> > > stack (change memory directly), and we must avoid the RSE stored in
> > > kernel to override user stack (user space's RSE is newer than kernel's
> > > in the case). To workaround the issue, we copy kernel RSE to user RSE
> > > before the task is stopped, so user RSE has updated data.  we then copy
> > > user RSE to kernel after the task is resummed from traced stop and
> > > kernel will use the newer RSE to return to user. 
> > 
> > Hi Shaohua,
> > 
> > somehow I fail to see the need for a dedicated bit in the task flags.
> > The user RBS can only be more up-to-date than the kernel RBS if the
> > values there have been modified by a debugger. So, it should be enough
> > to copy the relevant part of the user RBS back into the kernel RBS when
> > the ptraced process is resumed, which happens AFAIK only:
> > 
> >   1. when the debugger lets it continue via ptrace(), or
> >   2. when the debugger ceases to exist.
> > 
> > Yes, we'd most likely have to add an arch_ptrace_resume() in addition to
> > arch_ptrace_stop(), but there are some advantages, too:
> > 
> >   1. It would save one bit in the task flags.
> >   2. The usual path for task switches is not modified
> >      (OK, the few instructions needed to check for TIF_RESTORE_RSE are
> >      negligible)
> > 
> > Or am I totally missing what you're trying to do?
> No, you didn't. Ideaily the should do sync user RBS to kernel just after
> ptraced process is resumed. In previous discussion, somebody thought
> this might be too agressive, as ptraced task might do suspend/resume
> several times in a single syscall (syscall trace, fork trace) and we
> supposed debugger will only change user RBS just after syscall return.
> so the flag is used to try to do less sync. If the assumption is not
> true, we should always do sync just after ptraced task is resumed.

Well, you're right, but are you sure it's really what you want? I did
some testing and made sure that syscall arguments are stored just below
ar.bsp as seen by ptrace().

So, what happens if upon syscall entry notification the debugger
modifies the part of the RBS (in user-space) which corresponds to the
arguments of that syscall? Currently, the syscall takes the modified
arguments, but with your change it would still take the stale data from
the kernel RBS.

BTW shouldn't the syscall arguments appear in the current frame instead?
Then, this would be a non-issue...

Regards,
Petr Tesarik


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (5 preceding siblings ...)
  2007-09-06 13:59 ` Petr Tesarik
@ 2007-09-07  1:02 ` Shaohua Li
  2007-09-07  8:26 ` Petr Tesarik
                   ` (27 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-09-07  1:02 UTC (permalink / raw)
  To: linux-ia64

On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
> On Thu, 2007-09-06 at 11:16 +0800, Shaohua Li wrote:
> > On Wed, 2007-09-05 at 18:25 +0200, Petr Tesarik wrote:
> > > Shaohua Li wrote:
> > > > This is base kernel patch for ptrace RSE bug. It's basically a backport
> > > > from the utrace RSE patch I sent out several weeks ago. please review.
> > > > 
> > > > when a thread is stopped (ptraced), debugger might change thread's user
> > > > stack (change memory directly), and we must avoid the RSE stored in
> > > > kernel to override user stack (user space's RSE is newer than kernel's
> > > > in the case). To workaround the issue, we copy kernel RSE to user RSE
> > > > before the task is stopped, so user RSE has updated data.  we then copy
> > > > user RSE to kernel after the task is resummed from traced stop and
> > > > kernel will use the newer RSE to return to user. 
> > > 
> > > Hi Shaohua,
> > > 
> > > somehow I fail to see the need for a dedicated bit in the task flags.
> > > The user RBS can only be more up-to-date than the kernel RBS if the
> > > values there have been modified by a debugger. So, it should be enough
> > > to copy the relevant part of the user RBS back into the kernel RBS when
> > > the ptraced process is resumed, which happens AFAIK only:
> > > 
> > >   1. when the debugger lets it continue via ptrace(), or
> > >   2. when the debugger ceases to exist.
> > > 
> > > Yes, we'd most likely have to add an arch_ptrace_resume() in addition to
> > > arch_ptrace_stop(), but there are some advantages, too:
> > > 
> > >   1. It would save one bit in the task flags.
> > >   2. The usual path for task switches is not modified
> > >      (OK, the few instructions needed to check for TIF_RESTORE_RSE are
> > >      negligible)
> > > 
> > > Or am I totally missing what you're trying to do?
> > No, you didn't. Ideaily the should do sync user RBS to kernel just after
> > ptraced process is resumed. In previous discussion, somebody thought
> > this might be too agressive, as ptraced task might do suspend/resume
> > several times in a single syscall (syscall trace, fork trace) and we
> > supposed debugger will only change user RBS just after syscall return.
> > so the flag is used to try to do less sync. If the assumption is not
> > true, we should always do sync just after ptraced task is resumed.
> 
> Well, you're right, but are you sure it's really what you want? I did
> some testing and made sure that syscall arguments are stored just below
> ar.bsp as seen by ptrace().
> 
> So, what happens if upon syscall entry notification the debugger
> modifies the part of the RBS (in user-space) which corresponds to the
> arguments of that syscall? Currently, the syscall takes the modified
> arguments, but with your change it would still take the stale data
> from
> the kernel RBS.
The patch does sync from user RBS to kernel RBS just after syscall trace
enter. this is an exception I said doing sync just before syscall
return. I thought this covers your case, no?

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (6 preceding siblings ...)
  2007-09-07  1:02 ` Shaohua Li
@ 2007-09-07  8:26 ` Petr Tesarik
  2007-09-07 15:11 ` David Mosberger-Tang
                   ` (26 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-09-07  8:26 UTC (permalink / raw)
  To: linux-ia64

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Shaohua Li wrote:
> On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
>>[...]
>> So, what happens if upon syscall entry notification the debugger
>> modifies the part of the RBS (in user-space) which corresponds to the
>> arguments of that syscall? Currently, the syscall takes the modified
>> arguments, but with your change it would still take the stale data
>> from
>> the kernel RBS.
> The patch does sync from user RBS to kernel RBS just after syscall trace
> enter. this is an exception I said doing sync just before syscall
> return. I thought this covers your case, no?

Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
sync on every syscall_trace_enter() and syscall_trace_leave(), then the
only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
seems to be in the clone/fork and exit paths. In other words, it's
probably not worth the added complexity. But since you have written the
whole complex thing already, I have no objections against it.

Regards,
Petr Tesarik
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.6 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQFG4QtOjpY2ODFi2ogRArB1AJ0bcewvu+VQvpoQ7NMeloJDK9GDLACgnmVW
qw0ovAkl8PztYwpsru96eXc=eZUo
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (7 preceding siblings ...)
  2007-09-07  8:26 ` Petr Tesarik
@ 2007-09-07 15:11 ` David Mosberger-Tang
  2007-09-11  8:39 ` Shaohua Li
                   ` (25 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: David Mosberger-Tang @ 2007-09-07 15:11 UTC (permalink / raw)
  To: linux-ia64

Anything that avoids complicating the kernel exit path is worth doing!
 The exit path is complicated enough as it is.

  --david

On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Shaohua Li wrote:
> > On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
> >>[...]
> >> So, what happens if upon syscall entry notification the debugger
> >> modifies the part of the RBS (in user-space) which corresponds to the
> >> arguments of that syscall? Currently, the syscall takes the modified
> >> arguments, but with your change it would still take the stale data
> >> from
> >> the kernel RBS.
> > The patch does sync from user RBS to kernel RBS just after syscall trace
> > enter. this is an exception I said doing sync just before syscall
> > return. I thought this covers your case, no?
>
> Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
> sync on every syscall_trace_enter() and syscall_trace_leave(), then the
> only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
> seems to be in the clone/fork and exit paths. In other words, it's
> probably not worth the added complexity. But since you have written the
> whole complex thing already, I have no objections against it.
>
> Regards,
> Petr Tesarik
> -----BEGIN PGP SIGNATURE-----
> Version: GnuPG v1.4.6 (GNU/Linux)
> Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org
>
> iD8DBQFG4QtOjpY2ODFi2ogRArB1AJ0bcewvu+VQvpoQ7NMeloJDK9GDLACgnmVW
> qw0ovAkl8PztYwpsru96eXc> =eZUo
> -----END PGP SIGNATURE-----
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>


-- 
Mosberger Consulting LLC, http://www.mosberger-consulting.com/

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (8 preceding siblings ...)
  2007-09-07 15:11 ` David Mosberger-Tang
@ 2007-09-11  8:39 ` Shaohua Li
  2007-10-17 14:56 ` Petr Tesarik
                   ` (24 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-09-11  8:39 UTC (permalink / raw)
  To: linux-ia64

On Fri, 2007-09-07 at 09:11 -0600, David Mosberger-Tang wrote:
> Anything that avoids complicating the kernel exit path is worth doing!
>  The exit path is complicated enough as it is.
> 
>   --david
> 
> On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
> > -----BEGIN PGP SIGNED MESSAGE-----
> > Hash: SHA1
> >
> > Shaohua Li wrote:
> > > On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
> > >>[...]
> > >> So, what happens if upon syscall entry notification the debugger
> > >> modifies the part of the RBS (in user-space) which corresponds to the
> > >> arguments of that syscall? Currently, the syscall takes the modified
> > >> arguments, but with your change it would still take the stale data
> > >> from
> > >> the kernel RBS.
> > > The patch does sync from user RBS to kernel RBS just after syscall trace
> > > enter. this is an exception I said doing sync just before syscall
> > > return. I thought this covers your case, no?
> >
> > Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
> > sync on every syscall_trace_enter() and syscall_trace_leave(), then the
> > only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
> > seems to be in the clone/fork and exit paths. In other words, it's
> > probably not worth the added complexity. But since you have written the
> > whole complex thing already, I have no objections against it.
Ok, this is a simplified patch. please review.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 arch/ia64/kernel/ptrace.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++
 include/asm-ia64/ptrace.h |    2 +
 include/linux/ptrace.h    |    4 ++
 kernel/signal.c           |    5 ++-
 4 files changed, 72 insertions(+), 1 deletion(-)

Index: linux/arch/ia64/kernel/ptrace.c
=================================--- linux.orig/arch/ia64/kernel/ptrace.c	2007-09-06 13:06:49.000000000 +0800
+++ linux/arch/ia64/kernel/ptrace.c	2007-09-11 11:33:35.000000000 +0800
@@ -547,6 +547,62 @@ ia64_sync_user_rbs (struct task_struct *
 	return 0;
 }
 
+static long
+ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
+		unsigned long user_rbs_start, unsigned long user_rbs_end)
+{
+	unsigned long addr, val;
+	long ret;
+
+	/* now copy word for word from user rbs to kernel rbs: */
+	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
+		if (access_process_vm(child, addr, &val, sizeof(val), 0)
+				!= sizeof(val))
+			return -EIO;
+
+		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+#define RBS_SYNC_TO_USER 0
+#define RBS_SYNC_TO_KERNEL 1
+static void do_sync_rbs(struct unw_frame_info *info, void *arg)
+{
+	struct pt_regs *pt;
+	unsigned long urbs_end;
+	int to_user = (unsigned long)arg;
+
+	if (unw_unwind_to_user(info) < 0)
+		return;
+	pt = task_pt_regs(info->task);
+	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
+
+	if (to_user = RBS_SYNC_TO_USER)
+		ia64_sync_user_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
+	else
+		ia64_sync_kernel_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
+}
+
+/*
+ * when a thread is stopped (ptraced), debugger might change thread's user
+ * stack (change memory directly), and we must avoid the RSE stored in kernel
+ * to override user stack (user space's RSE is newer than kernel's in the
+ * case). To workaround the issue, we copy kernel RSE to user RSE before the
+ * task is stopped, so user RSE has updated data.  we then copy user RSE to
+ * kernel after the task is resummed from traced stop and kernel will use the
+ * newer RSE to return to user.
+ */
+void arch_ptrace_stop(int before_suspend)
+{
+	if (before_suspend)
+		unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_USER);
+	else
+		unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_KERNEL);
+}
+
 static inline int
 thread_matches (struct task_struct *thread, unsigned long addr)
 {
@@ -1422,6 +1478,7 @@ sys_ptrace (long request, pid_t pid, uns
 	struct task_struct *child;
 	struct switch_stack *sw;
 	long ret;
+	struct unw_frame_info info;
 
 	lock_kernel();
 	ret = -EPERM;
@@ -1481,6 +1538,11 @@ sys_ptrace (long request, pid_t pid, uns
 		/* write the word at location addr */
 		urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
 		ret = ia64_poke(child, sw, urbs_end, addr, data);
+
+		/* Make sure user RBS has the latest data */
+		unw_init_from_blocked_task(&info, child);
+		do_sync_rbs(&info, RBS_SYNC_TO_USER);
+
 		goto out_tsk;
 
 	      case PTRACE_PEEKUSR:
Index: linux/include/asm-ia64/ptrace.h
=================================--- linux.orig/include/asm-ia64/ptrace.h	2007-08-29 11:26:33.000000000 +0800
+++ linux/include/asm-ia64/ptrace.h	2007-09-11 09:53:04.000000000 +0800
@@ -303,6 +303,8 @@ struct switch_stack {
   extern void ia64_increment_ip (struct pt_regs *pt);
   extern void ia64_decrement_ip (struct pt_regs *pt);
 
+  extern void arch_ptrace_stop(int);
+  #define HAVE_ARCH_PTRACE_STOP
 #endif /* !__KERNEL__ */
 
 /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */
Index: linux/kernel/signal.c
=================================--- linux.orig/kernel/signal.c	2007-09-11 09:33:14.000000000 +0800
+++ linux/kernel/signal.c	2007-09-11 10:55:05.000000000 +0800
@@ -1599,15 +1599,18 @@ static void ptrace_stop(int exit_code, i
 	current->last_siginfo = info;
 	current->exit_code = exit_code;
 
+	spin_unlock_irq(&current->sighand->siglock);
+	arch_ptrace_stop(1);
 	/* Let the debugger run.  */
 	set_current_state(TASK_TRACED);
-	spin_unlock_irq(&current->sighand->siglock);
+
 	try_to_freeze();
 	read_lock(&tasklist_lock);
 	if (may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();
+		arch_ptrace_stop(0);
 	} else {
 		/*
 		 * By the time we got the lock, our tracer went away.
Index: linux/include/linux/ptrace.h
=================================--- linux.orig/include/linux/ptrace.h	2007-08-29 11:26:33.000000000 +0800
+++ linux/include/linux/ptrace.h	2007-09-11 09:52:32.000000000 +0800
@@ -128,6 +128,10 @@ int generic_ptrace_pokedata(struct task_
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef HAVE_ARCH_PTRACE_STOP
+#define arch_ptrace_stop(a)	do { } while (0)
+#endif
+
 #endif
 
 #endif

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (9 preceding siblings ...)
  2007-09-11  8:39 ` Shaohua Li
@ 2007-10-17 14:56 ` Petr Tesarik
  2007-10-17 19:48 ` Petr Tesarik
                   ` (23 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-10-17 14:56 UTC (permalink / raw)
  To: linux-ia64

Shaohua Li wrote:
> On Fri, 2007-09-07 at 09:11 -0600, David Mosberger-Tang wrote:
>> Anything that avoids complicating the kernel exit path is worth doing!
>>  The exit path is complicated enough as it is.
>>
>>   --david
>>
>> On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
>>> -----BEGIN PGP SIGNED MESSAGE-----
>>> Hash: SHA1
>>>
>>> Shaohua Li wrote:
>>>> On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
>>>>> [...]
>>>>> So, what happens if upon syscall entry notification the debugger
>>>>> modifies the part of the RBS (in user-space) which corresponds to the
>>>>> arguments of that syscall? Currently, the syscall takes the modified
>>>>> arguments, but with your change it would still take the stale data
>>>>> from
>>>>> the kernel RBS.
>>>> The patch does sync from user RBS to kernel RBS just after syscall trace
>>>> enter. this is an exception I said doing sync just before syscall
>>>> return. I thought this covers your case, no?
>>> Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
>>> sync on every syscall_trace_enter() and syscall_trace_leave(), then the
>>> only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
>>> seems to be in the clone/fork and exit paths. In other words, it's
>>> probably not worth the added complexity. But since you have written the
>>> whole complex thing already, I have no objections against it.
> Ok, this is a simplified patch. please review.

Well, it's been quite some time, but here we go.

I'm generally fine with this patch, but pleas note that it can't be
included on its own:

  1. There still is the race condition introduced by moving
set_current_state(TASK_TRACED) after the spin_unlock_irq

  2. You must couple it with the (planned) changes to the ptrace,
because otherwise PTRACE_{PEEK,POKE}{TEXT,DATA} still access the kernel
RBS, but it gets later overwritten back from userspace when it is synced.

  3. There are possibly other issues with ia64_peek/ia64_poke, e.g. how
is PTRACE_{PEEK,POKE}USER implemented? Maybe I'm wrong but the way
PT_AR_RNAT is handled seems to access the wrong RBS too.

  4. While talking about RNAT, does the RBS syncing back and forth
handle correctly the case when part of the RNAT stored in the backing
store belongs to the kernel registers? It must not be possible to change
the NAT bits for kernel registers from userspace!

     Maybe it's not an issue, because I tried to actually exploit this
bug, and my attempts failed.

Kind Regards,
Petr Tesarik

> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> ---
>  arch/ia64/kernel/ptrace.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/asm-ia64/ptrace.h |    2 +
>  include/linux/ptrace.h    |    4 ++
>  kernel/signal.c           |    5 ++-
>  4 files changed, 72 insertions(+), 1 deletion(-)
> 
> Index: linux/arch/ia64/kernel/ptrace.c
> =================================> --- linux.orig/arch/ia64/kernel/ptrace.c	2007-09-06 13:06:49.000000000 +0800
> +++ linux/arch/ia64/kernel/ptrace.c	2007-09-11 11:33:35.000000000 +0800
> @@ -547,6 +547,62 @@ ia64_sync_user_rbs (struct task_struct *
>  	return 0;
>  }
>  
> +static long
> +ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
> +		unsigned long user_rbs_start, unsigned long user_rbs_end)
> +{
> +	unsigned long addr, val;
> +	long ret;
> +
> +	/* now copy word for word from user rbs to kernel rbs: */
> +	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
> +		if (access_process_vm(child, addr, &val, sizeof(val), 0)
> +				!= sizeof(val))
> +			return -EIO;
> +
> +		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +#define RBS_SYNC_TO_USER 0
> +#define RBS_SYNC_TO_KERNEL 1
> +static void do_sync_rbs(struct unw_frame_info *info, void *arg)
> +{
> +	struct pt_regs *pt;
> +	unsigned long urbs_end;
> +	int to_user = (unsigned long)arg;
> +
> +	if (unw_unwind_to_user(info) < 0)
> +		return;
> +	pt = task_pt_regs(info->task);
> +	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
> +
> +	if (to_user = RBS_SYNC_TO_USER)
> +		ia64_sync_user_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
> +	else
> +		ia64_sync_kernel_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
> +}
> +
> +/*
> + * when a thread is stopped (ptraced), debugger might change thread's user
> + * stack (change memory directly), and we must avoid the RSE stored in kernel
> + * to override user stack (user space's RSE is newer than kernel's in the
> + * case). To workaround the issue, we copy kernel RSE to user RSE before the
> + * task is stopped, so user RSE has updated data.  we then copy user RSE to
> + * kernel after the task is resummed from traced stop and kernel will use the
> + * newer RSE to return to user.
> + */
> +void arch_ptrace_stop(int before_suspend)
> +{
> +	if (before_suspend)
> +		unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_USER);
> +	else
> +		unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_KERNEL);
> +}
> +
>  static inline int
>  thread_matches (struct task_struct *thread, unsigned long addr)
>  {
> @@ -1422,6 +1478,7 @@ sys_ptrace (long request, pid_t pid, uns
>  	struct task_struct *child;
>  	struct switch_stack *sw;
>  	long ret;
> +	struct unw_frame_info info;
>  
>  	lock_kernel();
>  	ret = -EPERM;
> @@ -1481,6 +1538,11 @@ sys_ptrace (long request, pid_t pid, uns
>  		/* write the word at location addr */
>  		urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
>  		ret = ia64_poke(child, sw, urbs_end, addr, data);
> +
> +		/* Make sure user RBS has the latest data */
> +		unw_init_from_blocked_task(&info, child);
> +		do_sync_rbs(&info, RBS_SYNC_TO_USER);
> +
>  		goto out_tsk;
>  
>  	      case PTRACE_PEEKUSR:
> Index: linux/include/asm-ia64/ptrace.h
> =================================> --- linux.orig/include/asm-ia64/ptrace.h	2007-08-29 11:26:33.000000000 +0800
> +++ linux/include/asm-ia64/ptrace.h	2007-09-11 09:53:04.000000000 +0800
> @@ -303,6 +303,8 @@ struct switch_stack {
>    extern void ia64_increment_ip (struct pt_regs *pt);
>    extern void ia64_decrement_ip (struct pt_regs *pt);
>  
> +  extern void arch_ptrace_stop(int);
> +  #define HAVE_ARCH_PTRACE_STOP
>  #endif /* !__KERNEL__ */
>  
>  /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */
> Index: linux/kernel/signal.c
> =================================> --- linux.orig/kernel/signal.c	2007-09-11 09:33:14.000000000 +0800
> +++ linux/kernel/signal.c	2007-09-11 10:55:05.000000000 +0800
> @@ -1599,15 +1599,18 @@ static void ptrace_stop(int exit_code, i
>  	current->last_siginfo = info;
>  	current->exit_code = exit_code;
>  
> +	spin_unlock_irq(&current->sighand->siglock);
> +	arch_ptrace_stop(1);
>  	/* Let the debugger run.  */
>  	set_current_state(TASK_TRACED);
> -	spin_unlock_irq(&current->sighand->siglock);
> +
>  	try_to_freeze();
>  	read_lock(&tasklist_lock);
>  	if (may_ptrace_stop()) {
>  		do_notify_parent_cldstop(current, CLD_TRAPPED);
>  		read_unlock(&tasklist_lock);
>  		schedule();
> +		arch_ptrace_stop(0);
>  	} else {
>  		/*
>  		 * By the time we got the lock, our tracer went away.
> Index: linux/include/linux/ptrace.h
> =================================> --- linux.orig/include/linux/ptrace.h	2007-08-29 11:26:33.000000000 +0800
> +++ linux/include/linux/ptrace.h	2007-09-11 09:52:32.000000000 +0800
> @@ -128,6 +128,10 @@ int generic_ptrace_pokedata(struct task_
>  #define force_successful_syscall_return() do { } while (0)
>  #endif
>  
> +#ifndef HAVE_ARCH_PTRACE_STOP
> +#define arch_ptrace_stop(a)	do { } while (0)
> +#endif
> +
>  #endif
>  
>  #endif


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (10 preceding siblings ...)
  2007-10-17 14:56 ` Petr Tesarik
@ 2007-10-17 19:48 ` Petr Tesarik
  2007-10-17 19:55 ` Petr Tesarik
                   ` (22 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-10-17 19:48 UTC (permalink / raw)
  To: linux-ia64

Petr Tesarik wrote:
>[...]
>   4. While talking about RNAT, does the RBS syncing back and forth
> handle correctly the case when part of the RNAT stored in the backing
> store belongs to the kernel registers? It must not be possible to change
> the NAT bits for kernel registers from userspace!
> 
>      Maybe it's not an issue, because I tried to actually exploit this
> bug, and my attempts failed.

I've just verified that put_rnat() does the right thing here (even
prevents setting NaT for syscall arguments), so modifying RNAT bits is
no problem. The other things still apply.

Regards,
Petr Tesarik

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (11 preceding siblings ...)
  2007-10-17 19:48 ` Petr Tesarik
@ 2007-10-17 19:55 ` Petr Tesarik
  2007-10-18  1:54 ` Shaohua Li
                   ` (21 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-10-17 19:55 UTC (permalink / raw)
  To: linux-ia64

Petr Tesarik wrote:
>[...]
>   2. You must couple it with the (planned) changes to ptrace,
> because otherwise PTRACE_{PEEK,POKE}{TEXT,DATA} still access the kernel
> RBS, but it gets later overwritten back from userspace when it is synced.

I have verified that failing to do so breaks "strace -f", because strace
relies on intercepting the clone() system call and setting the
CLONE_PTRACE bit in the flags argument. Of course, if the bit is only
set in the kernel RBS, which is overwritten with the (old) value from
the user RBS on a PTRACE_CONT, the new process is not traced.

Regards,
Petr Tesarik

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (12 preceding siblings ...)
  2007-10-17 19:55 ` Petr Tesarik
@ 2007-10-18  1:54 ` Shaohua Li
  2007-10-18 10:59 ` Petr Tesarik
                   ` (20 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-10-18  1:54 UTC (permalink / raw)
  To: linux-ia64

On Wed, 2007-10-17 at 16:56 +0200, Petr Tesarik wrote:
> Shaohua Li wrote:
> > On Fri, 2007-09-07 at 09:11 -0600, David Mosberger-Tang wrote:
> >> Anything that avoids complicating the kernel exit path is worth doing!
> >>  The exit path is complicated enough as it is.
> >>
> >>   --david
> >>
> >> On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
> >>> -----BEGIN PGP SIGNED MESSAGE-----
> >>> Hash: SHA1
> >>>
> >>> Shaohua Li wrote:
> >>>> On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
> >>>>> [...]
> >>>>> So, what happens if upon syscall entry notification the debugger
> >>>>> modifies the part of the RBS (in user-space) which corresponds to the
> >>>>> arguments of that syscall? Currently, the syscall takes the modified
> >>>>> arguments, but with your change it would still take the stale data
> >>>>> from
> >>>>> the kernel RBS.
> >>>> The patch does sync from user RBS to kernel RBS just after syscall trace
> >>>> enter. this is an exception I said doing sync just before syscall
> >>>> return. I thought this covers your case, no?
> >>> Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
> >>> sync on every syscall_trace_enter() and syscall_trace_leave(), then the
> >>> only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
> >>> seems to be in the clone/fork and exit paths. In other words, it's
> >>> probably not worth the added complexity. But since you have written the
> >>> whole complex thing already, I have no objections against it.
> > Ok, this is a simplified patch. please review.
> 
> Well, it's been quite some time, but here we go.
> 
> I'm generally fine with this patch, but pleas note that it can't be
> included on its own:
> 
>   1. There still is the race condition introduced by moving
> set_current_state(TASK_TRACED) after the spin_unlock_irq
I don't know the details, but Roland said if other parts are ok, he can help fix the issue.

>   2. You must couple it with the (planned) changes to the ptrace,
> because otherwise PTRACE_{PEEK,POKE}{TEXT,DATA} still access the kernel
> RBS, but it gets later overwritten back from userspace when it is synced.

> I have verified that failing to do so breaks "strace -f", because
> strace
> relies on intercepting the clone() system call and setting the
> CLONE_PTRACE bit in the flags argument. Of course, if the bit is only
> set in the kernel RBS, which is overwritten with the (old) value from
> the user RBS on a PTRACE_CONT, the new process is not traced.
The patch sync kernel RBS to user just before the task is suspended, so
I think we should be fine here. I did test 'strace -f', and test is ok.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (13 preceding siblings ...)
  2007-10-18  1:54 ` Shaohua Li
@ 2007-10-18 10:59 ` Petr Tesarik
  2007-10-18 16:02 ` Christoph Hellwig
                   ` (19 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-10-18 10:59 UTC (permalink / raw)
  To: linux-ia64

Shaohua Li wrote:
> On Wed, 2007-10-17 at 16:56 +0200, Petr Tesarik wrote:
>> Shaohua Li wrote:
>>> On Fri, 2007-09-07 at 09:11 -0600, David Mosberger-Tang wrote:
>>>> Anything that avoids complicating the kernel exit path is worth doing!
>>>>  The exit path is complicated enough as it is.
>>>>
>>>>   --david
>>>>
>>>> On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
>>>>> -----BEGIN PGP SIGNED MESSAGE-----
>>>>> Hash: SHA1
>>>>>
>>>>> Shaohua Li wrote:
>>>>>> On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
>>>>>>> [...]
>>>>>>> So, what happens if upon syscall entry notification the debugger
>>>>>>> modifies the part of the RBS (in user-space) which corresponds to the
>>>>>>> arguments of that syscall? Currently, the syscall takes the modified
>>>>>>> arguments, but with your change it would still take the stale data
>>>>>>> from
>>>>>>> the kernel RBS.
>>>>>> The patch does sync from user RBS to kernel RBS just after syscall trace
>>>>>> enter. this is an exception I said doing sync just before syscall
>>>>>> return. I thought this covers your case, no?
>>>>> Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
>>>>> sync on every syscall_trace_enter() and syscall_trace_leave(), then the
>>>>> only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
>>>>> seems to be in the clone/fork and exit paths. In other words, it's
>>>>> probably not worth the added complexity. But since you have written the
>>>>> whole complex thing already, I have no objections against it.
>>> Ok, this is a simplified patch. please review.
>> Well, it's been quite some time, but here we go.
>>
>> I'm generally fine with this patch, but pleas note that it can't be
>> included on its own:
>>
>>   1. There still is the race condition introduced by moving
>> set_current_state(TASK_TRACED) after the spin_unlock_irq
> I don't know the details, but Roland said if other parts are ok, he can help fix the issue.
> 
>>   2. You must couple it with the (planned) changes to the ptrace,
>> because otherwise PTRACE_{PEEK,POKE}{TEXT,DATA} still access the kernel
>> RBS, but it gets later overwritten back from userspace when it is synced.
> 
>> I have verified that failing to do so breaks "strace -f", because
>> strace
>> relies on intercepting the clone() system call and setting the
>> CLONE_PTRACE bit in the flags argument. Of course, if the bit is only
>> set in the kernel RBS, which is overwritten with the (old) value from
>> the user RBS on a PTRACE_CONT, the new process is not traced.
> The patch sync kernel RBS to user just before the task is suspended, so
> I think we should be fine here. I did test 'strace -f', and test is ok.

Maybe you're right. I was porting this to 2.6.16 for SUSE Linux
Enterprise Server 10, so my patch was a bit different. I'll retest with
latest git. Nevertheless, I still think that ia64_poke() can't do the
right thing here, because the changes made by PTRACE_PEEKDATA should
also be visible in /proc/<pid>/mem, for example.

Cheers,
Petr Tesarik


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (14 preceding siblings ...)
  2007-10-18 10:59 ` Petr Tesarik
@ 2007-10-18 16:02 ` Christoph Hellwig
  2007-10-19  7:30 ` Shaohua Li
                   ` (18 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Christoph Hellwig @ 2007-10-18 16:02 UTC (permalink / raw)
  To: linux-ia64

Just as a little notice, we now have the arch_ptrace_attach that
allows switching to the generic ptrace once the RSE bits are
settled.  It would be nice if the people working on this area
could do that conversion aswell.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (15 preceding siblings ...)
  2007-10-18 16:02 ` Christoph Hellwig
@ 2007-10-19  7:30 ` Shaohua Li
  2007-10-19 19:42 ` Petr Tesarik
                   ` (17 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-10-19  7:30 UTC (permalink / raw)
  To: linux-ia64

On Thu, 2007-10-18 at 12:59 +0200, Petr Tesarik wrote:
> Shaohua Li wrote:
> > On Wed, 2007-10-17 at 16:56 +0200, Petr Tesarik wrote:
> >> Shaohua Li wrote:
> >>> On Fri, 2007-09-07 at 09:11 -0600, David Mosberger-Tang wrote:
> >>>> Anything that avoids complicating the kernel exit path is worth doing!
> >>>>  The exit path is complicated enough as it is.
> >>>>
> >>>>   --david
> >>>>
> >>>> On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
> >>>>> -----BEGIN PGP SIGNED MESSAGE-----
> >>>>> Hash: SHA1
> >>>>>
> >>>>> Shaohua Li wrote:
> >>>>>> On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
> >>>>>>> [...]
> >>>>>>> So, what happens if upon syscall entry notification the debugger
> >>>>>>> modifies the part of the RBS (in user-space) which corresponds to the
> >>>>>>> arguments of that syscall? Currently, the syscall takes the modified
> >>>>>>> arguments, but with your change it would still take the stale data
> >>>>>>> from
> >>>>>>> the kernel RBS.
> >>>>>> The patch does sync from user RBS to kernel RBS just after syscall trace
> >>>>>> enter. this is an exception I said doing sync just before syscall
> >>>>>> return. I thought this covers your case, no?
> >>>>> Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
> >>>>> sync on every syscall_trace_enter() and syscall_trace_leave(), then the
> >>>>> only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
> >>>>> seems to be in the clone/fork and exit paths. In other words, it's
> >>>>> probably not worth the added complexity. But since you have written the
> >>>>> whole complex thing already, I have no objections against it.
> >>> Ok, this is a simplified patch. please review.
> >> Well, it's been quite some time, but here we go.
> >>
> >> I'm generally fine with this patch, but pleas note that it can't be
> >> included on its own:
> >>
> >>   1. There still is the race condition introduced by moving
> >> set_current_state(TASK_TRACED) after the spin_unlock_irq
> > I don't know the details, but Roland said if other parts are ok, he can help fix the issue.
> > 
> >>   2. You must couple it with the (planned) changes to the ptrace,
> >> because otherwise PTRACE_{PEEK,POKE}{TEXT,DATA} still access the kernel
> >> RBS, but it gets later overwritten back from userspace when it is synced.
> > 
> >> I have verified that failing to do so breaks "strace -f", because
> >> strace
> >> relies on intercepting the clone() system call and setting the
> >> CLONE_PTRACE bit in the flags argument. Of course, if the bit is only
> >> set in the kernel RBS, which is overwritten with the (old) value from
> >> the user RBS on a PTRACE_CONT, the new process is not traced.
> > The patch sync kernel RBS to user just before the task is suspended, so
> > I think we should be fine here. I did test 'strace -f', and test is ok.
> 
> Maybe you're right. I was porting this to 2.6.16 for SUSE Linux
> Enterprise Server 10, so my patch was a bit different. I'll retest with
> latest git. Nevertheless, I still think that ia64_poke() can't do the
> right thing here, because the changes made by PTRACE_PEEKDATA should
> also be visible in /proc/<pid>/mem, for example.
Yes, exactly. I remember somebody said ia64_poke should be removed.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (16 preceding siblings ...)
  2007-10-19  7:30 ` Shaohua Li
@ 2007-10-19 19:42 ` Petr Tesarik
  2007-10-24  3:34 ` Shaohua Li
                   ` (16 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-10-19 19:42 UTC (permalink / raw)
  To: linux-ia64

On Thu, 2007-10-18 at 12:59 +0200, Petr Tesarik wrote:
> Shaohua Li wrote:
> > On Wed, 2007-10-17 at 16:56 +0200, Petr Tesarik wrote:
> >> Shaohua Li wrote:
> >>> On Fri, 2007-09-07 at 09:11 -0600, David Mosberger-Tang wrote:
> >>>> Anything that avoids complicating the kernel exit path is worth doing!
> >>>>  The exit path is complicated enough as it is.
> >>>>
> >>>>   --david
> >>>>
> >>>> On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
> >>>>> -----BEGIN PGP SIGNED MESSAGE-----
> >>>>> Hash: SHA1
> >>>>>
> >>>>> Shaohua Li wrote:
> >>>>>> On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
> >>>>>>> [...]
> >>>>>>> So, what happens if upon syscall entry notification the debugger
> >>>>>>> modifies the part of the RBS (in user-space) which corresponds to the
> >>>>>>> arguments of that syscall? Currently, the syscall takes the modified
> >>>>>>> arguments, but with your change it would still take the stale data
> >>>>>>> from
> >>>>>>> the kernel RBS.
> >>>>>> The patch does sync from user RBS to kernel RBS just after syscall trace
> >>>>>> enter. this is an exception I said doing sync just before syscall
> >>>>>> return. I thought this covers your case, no?
> >>>>> Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
> >>>>> sync on every syscall_trace_enter() and syscall_trace_leave(), then the
> >>>>> only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
> >>>>> seems to be in the clone/fork and exit paths. In other words, it's
> >>>>> probably not worth the added complexity. But since you have written the
> >>>>> whole complex thing already, I have no objections against it.
> >>> Ok, this is a simplified patch. please review.
> >> Well, it's been quite some time, but here we go.
> >>
> >> I'm generally fine with this patch, but pleas note that it can't be
> >> included on its own:
> >>
> >>   1. There still is the race condition introduced by moving
> >> set_current_state(TASK_TRACED) after the spin_unlock_irq
> > I don't know the details, but Roland said if other parts are ok, he can help fix the issue.
> > 
> >>   2. You must couple it with the (planned) changes to the ptrace,
> >> because otherwise PTRACE_{PEEK,POKE}{TEXT,DATA} still access the kernel
> >> RBS, but it gets later overwritten back from userspace when it is synced.
> > 
> >> I have verified that failing to do so breaks "strace -f", because
> >> strace
> >> relies on intercepting the clone() system call and setting the
> >> CLONE_PTRACE bit in the flags argument. Of course, if the bit is only
> >> set in the kernel RBS, which is overwritten with the (old) value from
> >> the user RBS on a PTRACE_CONT, the new process is not traced.
> > The patch sync kernel RBS to user just before the task is suspended, so
> > I think we should be fine here. I did test 'strace -f', and test is ok.
> 
> Maybe you're right. I was porting this to 2.6.16 for SUSE Linux
> Enterprise Server 10, so my patch was a bit different. I'll retest with
> latest git. Nevertheless, I still think that ia64_poke() can't do the
> right thing here, because the changes made by PTRACE_PEEKDATA should
> also be visible in /proc/<pid>/mem, for example.

OK, I retested everything again with 2.6.23 and I can confirm that the
kernel behaves consistently with this patch applied - modifying syscall
arguments works (both for break and for fsyscalls), changes are refleced
in /proc/<pid>/mem and accessing the RNAT bits works too.

I would still like to get rid of ia64_peek() and ia64_poke(), because it
is no longer needed and is inefficient. For example, currently each
PTRACE_POKE first non-trivially finds out the correct location within
the kernel RBS and then immediately synchronizes the RBS to user space.
Not to mention that for peeking/poking a process with more threads the
kernel must first find the correct thread for a given address.

Shaohua's patch allows us to greatly simplify the architecture-specific
bits of ptrace. I'll send a patch soon.

In short, you've got my ack (whatever it's worth).

Cheers,
Petr Tesarik


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (17 preceding siblings ...)
  2007-10-19 19:42 ` Petr Tesarik
@ 2007-10-24  3:34 ` Shaohua Li
  2007-10-24 23:38 ` Luck, Tony
                   ` (15 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-10-24  3:34 UTC (permalink / raw)
  To: linux-ia64

On Fri, 2007-10-19 at 21:42 +0200, Petr Tesarik wrote:
> On Thu, 2007-10-18 at 12:59 +0200, Petr Tesarik wrote:
> > Shaohua Li wrote:
> > > On Wed, 2007-10-17 at 16:56 +0200, Petr Tesarik wrote:
> > >> Shaohua Li wrote:
> > >>> On Fri, 2007-09-07 at 09:11 -0600, David Mosberger-Tang wrote:
> > >>>> Anything that avoids complicating the kernel exit path is worth doing!
> > >>>>  The exit path is complicated enough as it is.
> > >>>>
> > >>>>   --david
> > >>>>
> > >>>> On 9/7/07, Petr Tesarik <ptesarik@suse.cz> wrote:
> > >>>>> -----BEGIN PGP SIGNED MESSAGE-----
> > >>>>> Hash: SHA1
> > >>>>>
> > >>>>> Shaohua Li wrote:
> > >>>>>> On Thu, 2007-09-06 at 15:59 +0200, Petr Tesarik wrote:
> > >>>>>>> [...]
> > >>>>>>> So, what happens if upon syscall entry notification the debugger
> > >>>>>>> modifies the part of the RBS (in user-space) which corresponds to the
> > >>>>>>> arguments of that syscall? Currently, the syscall takes the modified
> > >>>>>>> arguments, but with your change it would still take the stale data
> > >>>>>>> from
> > >>>>>>> the kernel RBS.
> > >>>>>> The patch does sync from user RBS to kernel RBS just after syscall trace
> > >>>>>> enter. this is an exception I said doing sync just before syscall
> > >>>>>> return. I thought this covers your case, no?
> > >>>>> Ah, I'm sorry, I missed that part of the patch. Well, if we have to do a
> > >>>>> sync on every syscall_trace_enter() and syscall_trace_leave(), then the
> > >>>>> only cases where introducing TIF_RESTORE_RSE saves us a duplicate sync
> > >>>>> seems to be in the clone/fork and exit paths. In other words, it's
> > >>>>> probably not worth the added complexity. But since you have written the
> > >>>>> whole complex thing already, I have no objections against it.
> > >>> Ok, this is a simplified patch. please review.
> > >> Well, it's been quite some time, but here we go.
> > >>
> > >> I'm generally fine with this patch, but pleas note that it can't be
> > >> included on its own:
> > >>
> > >>   1. There still is the race condition introduced by moving
> > >> set_current_state(TASK_TRACED) after the spin_unlock_irq
> > > I don't know the details, but Roland said if other parts are ok, he can help fix the issue.
> > > 
> > >>   2. You must couple it with the (planned) changes to the ptrace,
> > >> because otherwise PTRACE_{PEEK,POKE}{TEXT,DATA} still access the kernel
> > >> RBS, but it gets later overwritten back from userspace when it is synced.
> > > 
> > >> I have verified that failing to do so breaks "strace -f", because
> > >> strace
> > >> relies on intercepting the clone() system call and setting the
> > >> CLONE_PTRACE bit in the flags argument. Of course, if the bit is only
> > >> set in the kernel RBS, which is overwritten with the (old) value from
> > >> the user RBS on a PTRACE_CONT, the new process is not traced.
> > > The patch sync kernel RBS to user just before the task is suspended, so
> > > I think we should be fine here. I did test 'strace -f', and test is ok.
> > 
> > Maybe you're right. I was porting this to 2.6.16 for SUSE Linux
> > Enterprise Server 10, so my patch was a bit different. I'll retest with
> > latest git. Nevertheless, I still think that ia64_poke() can't do the
> > right thing here, because the changes made by PTRACE_PEEKDATA should
> > also be visible in /proc/<pid>/mem, for example.
> 
> OK, I retested everything again with 2.6.23 and I can confirm that the
> kernel behaves consistently with this patch applied - modifying syscall
> arguments works (both for break and for fsyscalls), changes are refleced
> in /proc/<pid>/mem and accessing the RNAT bits works too.
> 
> I would still like to get rid of ia64_peek() and ia64_poke(), because it
> is no longer needed and is inefficient. For example, currently each
> PTRACE_POKE first non-trivially finds out the correct location within
> the kernel RBS and then immediately synchronizes the RBS to user space.
> Not to mention that for peeking/poking a process with more threads the
> kernel must first find the correct thread for a given address.
> 
> Shaohua's patch allows us to greatly simplify the architecture-specific
> bits of ptrace. I'll send a patch soon.
> 
> In short, you've got my ack (whatever it's worth).
Thanks. So Tony, how do think about the IA64 part of the patch?

Thanks,
Shaohua 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (18 preceding siblings ...)
  2007-10-24  3:34 ` Shaohua Li
@ 2007-10-24 23:38 ` Luck, Tony
  2007-10-25  0:38 ` Shaohua Li
                   ` (14 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Luck, Tony @ 2007-10-24 23:38 UTC (permalink / raw)
  To: linux-ia64

> Thanks. So Tony, how do think about the IA64 part of the patch?

Having to use ia64_peek() and ia64_poke() to copy the RBS 8 bytes
at a time looks a bit ugly ... but this isn't in the performance
path (unless you are running your whole application under strace(1)!)
so it probably isn't an issue.  If it turns out to be, then an
incremental fix on top of this would be the way to go.

So I'm ok with the ia64 parts ... which leaves the open issue
of the generic part (moving the spin_unlock_irq() after the
set_current_state(TASK_TRACED)) ... which (if I follow this
thread correctly) is still an open issue, right?

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (19 preceding siblings ...)
  2007-10-24 23:38 ` Luck, Tony
@ 2007-10-25  0:38 ` Shaohua Li
  2007-11-12  2:14 ` Roland McGrath
                   ` (13 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-10-25  0:38 UTC (permalink / raw)
  To: linux-ia64

On Thu, 2007-10-25 at 07:38 +0800, Luck, Tony wrote:
> > Thanks. So Tony, how do think about the IA64 part of the patch?
> 
> Having to use ia64_peek() and ia64_poke() to copy the RBS 8 bytes
> at a time looks a bit ugly ... but this isn't in the performance
> path (unless you are running your whole application under strace(1)!)
> so it probably isn't an issue.  If it turns out to be, then an
> incremental fix on top of this would be the way to go.

> So I'm ok with the ia64 parts ... which leaves the open issue
> of the generic part (moving the spin_unlock_irq() after the
> set_current_state(TASK_TRACED)) ... which (if I follow this
> thread correctly) is still an open issue, right?
Right. Roland, can you help on this? I remember you said you know how to
fix it, I'm not familar on this.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (20 preceding siblings ...)
  2007-10-25  0:38 ` Shaohua Li
@ 2007-11-12  2:14 ` Roland McGrath
  2007-11-12 15:41 ` Petr Tesarik
                   ` (12 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-11-12  2:14 UTC (permalink / raw)
  To: linux-ia64

Sorry for the late reply on this.  I've been juggling too many other
things, and also I was out for most of a couple of weeks in October.

I did indeed promise to do the ptrace/signals magic part of the equation
once you ia64 folks ironed out the ia64-specific code for this plan.
I'm very glad to do it and see this wart finally get thoroughly cleaned up!

To integrate this with the ia64 code, I imagine in asm-ia64/ptrace.h adding:

#define arch_ptrace_stop_needed(code, info)	\
	(!test_thread_flag(TIF_RESTORE_RSE))
#define arch_ptrace_stop(code, info)		ia64_rse_writeback()

I have not tested this patch except to verify that it compiles away to no
change when the two new macros are not defined.  It may be difficult to
really test for the case it's particularly intended for, since it's a race.
You need something like repeated runs of strace /bin/true while
asynchronously trying kill -9 on the traced /bin/true process (but might
need to avoid killing the whole pgrp including strace), and see if it ever
leaves off with the traced process stuck in TASK_TRACED (T) with a pending
SIGKILL, rather than being a dead zombie (and cleaned up by strace).  If
you try to produce a test scenario for the problem, you can test it with a
version that never calls sigkill_pending after calling arch_ptrace_stop
with the siglock dropped.  Once you produce the bug with that kernel and
believe you can make it hit with a known test scenario, try the proper
patch as below to verify it can no longer hit.

If you are happy with this patch and the results of integrating it with
your ia64 changes, then either you or I can post this machine-independent
patch upstream for inclusion in the mainline kernel.  I'm eager to see all
the old cruft in ia64 ptrace get cleaned out after we have this in.

Thanks,
Roland

---
[PATCH] arch_ptrace_stop

This adds support to allow asm/ptrace.h to define two new macros,
arch_ptrace_stop_needed and arch_ptrace_stop.  These control special
machine-specific actions to be done before a ptrace stop.  The new
code compiles away to nothing when the new macros are not defined.
This is the case on all machines to begin with.

On ia64, these macros will be defined to solve the long-standing
issue of ptrace vs register backing store.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/signal.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 47 insertions(+), 1 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 909a0cc..0ac614a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1594,6 +1594,32 @@ static inline int may_ptrace_stop(void)
 }

 /*
+ * The arch can have machine-specific work to be done before ptrace stops.
+ * On ia64, register backing store gets written back to user memory here.
+ * Since this can be costly (requires dropping the siglock), we only do it
+ * when the arch requires it for this particular stop.  For example, if
+ * the thread has not been back to user mode since the last stop, the
+ * thread state might indicate that nothing needs to be done.
+ */
+#ifndef arch_ptrace_stop_needed
+#define arch_ptrace_stop_needed(code, info)	0
+#endif
+#ifndef arch_ptrace_stop
+#define arch_ptrace_stop(code, info)		do { } while (0)
+#endif
+
+/*
+ * Return nonzero if there is a SIGKILL that should be waking us up.
+ * Called with the siglock held.
+ */
+static int sigkill_pending(struct task_struct *tsk)
+{
+	return ((sigismember(&tsk->pending.signal, SIGKILL) ||
+		 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
+		!unlikely(sigismember(&tsk->blocked, SIGKILL)));
+}
+
+/*
  * This must be called with current->sighand->siglock held.
  *
  * This should be the path for all ptrace stops.
@@ -1606,6 +1632,26 @@ static inline int may_ptrace_stop(void)
  */
 static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 {
+	int killed = 0;
+
+	if (arch_ptrace_stop_needed(exit_code, info)) {
+		/*
+		 * The arch code has something special to do before a
+		 * ptrace stop.  This is allowed to block, e.g. for faults
+		 * on user stack pages.  We can't keep the siglock while
+		 * calling arch_ptrace_stop, so we must release it now.
+		 * To preserve proper semantics, we must do this before
+		 * any signal bookkeeping like checking group_stop_count.
+		 * Meanwhile, a SIGKILL could come in before we retake the
+		 * siglock.  That must prevent us from sleeping in TASK_TRACED.
+		 * So after regaining the lock, we must check for SIGKILL.
+		 */
+		spin_unlock_irq(&current->sighand->siglock);
+		arch_ptrace_stop(exit_code, info);
+		spin_lock_irq(&current->sighand->siglock);
+		killed = sigkill_pending(current);
+	}
+
 	/*
 	 * If there is a group stop in progress,
 	 * we must participate in the bookkeeping.
@@ -1621,7 +1667,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 	spin_unlock_irq(&current->sighand->siglock);
 	try_to_freeze();
 	read_lock(&tasklist_lock);
-	if (may_ptrace_stop()) {
+	if (!unlikely(killed) && may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (21 preceding siblings ...)
  2007-11-12  2:14 ` Roland McGrath
@ 2007-11-12 15:41 ` Petr Tesarik
  2007-11-12 16:11 ` Petr Tesarik
                   ` (11 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-11-12 15:41 UTC (permalink / raw)
  To: linux-ia64

On Sun, 2007-11-11 at 18:14 -0800, Roland McGrath wrote: 
> Sorry for the late reply on this.  I've been juggling too many other
> things, and also I was out for most of a couple of weeks in October.
> 
> I did indeed promise to do the ptrace/signals magic part of the equation
> once you ia64 folks ironed out the ia64-specific code for this plan.
> I'm very glad to do it and see this wart finally get thoroughly cleaned up!
> 
> To integrate this with the ia64 code, I imagine in asm-ia64/ptrace.h adding:
> 
> #define arch_ptrace_stop_needed(code, info)	\
> 	(!test_thread_flag(TIF_RESTORE_RSE))
> #define arch_ptrace_stop(code, info)		ia64_rse_writeback()
> 
> I have not tested this patch except to verify that it compiles away to no
> change when the two new macros are not defined.  It may be difficult to
> really test for the case it's particularly intended for, since it's a race.
> You need something like repeated runs of strace /bin/true while
> asynchronously trying kill -9 on the traced /bin/true process (but might
> need to avoid killing the whole pgrp including strace), and see if it ever
> leaves off with the traced process stuck in TASK_TRACED (T) with a pending
> SIGKILL, rather than being a dead zombie (and cleaned up by strace).  If
> you try to produce a test scenario for the problem, you can test it with a
> version that never calls sigkill_pending after calling arch_ptrace_stop
> with the siglock dropped.  Once you produce the bug with that kernel and
> believe you can make it hit with a known test scenario, try the proper
> patch as below to verify it can no longer hit.

Hi Roland,

the trouble is I used the current ia64 patch and even inserted an
msleep(10) into ptrace_stop() to make sure it does sleep but I don't see
any problems. I added the following code between arch_ptrace_stop(1) and
set_current_state(TASK_TRACED):

	msleep(10);
	if (unlikely(sigismember(&current->pending.signal, SIGKILL)))
		printk(KERN_INFO "%d (%s): Got SIGKILL in ptrace_stop\n",
			current->pid, current->comm);

I ran strace on a simple program (calling gettimeofday() in an endless
loop) and killed it with SIGKILL. The program exited correctly and I got
the message in syslog. I'm puzzled. :/  Is this not the correct place
where the race condition should happen?

Kind regards,
Petr Tesarik

> If you are happy with this patch and the results of integrating it with
> your ia64 changes, then either you or I can post this machine-independent
> patch upstream for inclusion in the mainline kernel.  I'm eager to see all
> the old cruft in ia64 ptrace get cleaned out after we have this in.
> 
> 
> Thanks,
> Roland
> 
> ---
> [PATCH] arch_ptrace_stop
> 
> This adds support to allow asm/ptrace.h to define two new macros,
> arch_ptrace_stop_needed and arch_ptrace_stop.  These control special
> machine-specific actions to be done before a ptrace stop.  The new
> code compiles away to nothing when the new macros are not defined.
> This is the case on all machines to begin with.
> 
> On ia64, these macros will be defined to solve the long-standing
> issue of ptrace vs register backing store.
> 
> Signed-off-by: Roland McGrath <roland@redhat.com>
> ---
>  kernel/signal.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 47 insertions(+), 1 deletions(-)
> 
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 909a0cc..0ac614a 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1594,6 +1594,32 @@ static inline int may_ptrace_stop(void)
>  }
>  
>  /*
> + * The arch can have machine-specific work to be done before ptrace stops.
> + * On ia64, register backing store gets written back to user memory here.
> + * Since this can be costly (requires dropping the siglock), we only do it
> + * when the arch requires it for this particular stop.  For example, if
> + * the thread has not been back to user mode since the last stop, the
> + * thread state might indicate that nothing needs to be done.
> + */
> +#ifndef arch_ptrace_stop_needed
> +#define arch_ptrace_stop_needed(code, info)	0
> +#endif
> +#ifndef arch_ptrace_stop
> +#define arch_ptrace_stop(code, info)		do { } while (0)
> +#endif
> +
> +/*
> + * Return nonzero if there is a SIGKILL that should be waking us up.
> + * Called with the siglock held.
> + */
> +static int sigkill_pending(struct task_struct *tsk)
> +{
> +	return ((sigismember(&tsk->pending.signal, SIGKILL) ||
> +		 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
> +		!unlikely(sigismember(&tsk->blocked, SIGKILL)));
> +}
> +
> +/*
>   * This must be called with current->sighand->siglock held.
>   *
>   * This should be the path for all ptrace stops.
> @@ -1606,6 +1632,26 @@ static inline int may_ptrace_stop(void)
>   */
>  static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
>  {
> +	int killed = 0;
> +
> +	if (arch_ptrace_stop_needed(exit_code, info)) {
> +		/*
> +		 * The arch code has something special to do before a
> +		 * ptrace stop.  This is allowed to block, e.g. for faults
> +		 * on user stack pages.  We can't keep the siglock while
> +		 * calling arch_ptrace_stop, so we must release it now.
> +		 * To preserve proper semantics, we must do this before
> +		 * any signal bookkeeping like checking group_stop_count.
> +		 * Meanwhile, a SIGKILL could come in before we retake the
> +		 * siglock.  That must prevent us from sleeping in TASK_TRACED.
> +		 * So after regaining the lock, we must check for SIGKILL.
> +		 */
> +		spin_unlock_irq(&current->sighand->siglock);
> +		arch_ptrace_stop(exit_code, info);
> +		spin_lock_irq(&current->sighand->siglock);
> +		killed = sigkill_pending(current);
> +	}
> +
>  	/*
>  	 * If there is a group stop in progress,
>  	 * we must participate in the bookkeeping.
> @@ -1621,7 +1667,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
>  	spin_unlock_irq(&current->sighand->siglock);
>  	try_to_freeze();
>  	read_lock(&tasklist_lock);
> -	if (may_ptrace_stop()) {
> +	if (!unlikely(killed) && may_ptrace_stop()) {
>  		do_notify_parent_cldstop(current, CLD_TRAPPED);
>  		read_unlock(&tasklist_lock);
>  		schedule();

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (22 preceding siblings ...)
  2007-11-12 15:41 ` Petr Tesarik
@ 2007-11-12 16:11 ` Petr Tesarik
  2007-11-13  0:30 ` Roland McGrath
                   ` (10 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-11-12 16:11 UTC (permalink / raw)
  To: linux-ia64

On Mon, 2007-11-12 at 16:41 +0100, Petr Tesarik wrote:
> On Sun, 2007-11-11 at 18:14 -0800, Roland McGrath wrote: 
>[...]
> Hi Roland,
> 
> the trouble is I used the current ia64 patch and even inserted an
> msleep(10) into ptrace_stop() to make sure it does sleep but I don't see
> any problems. I added the following code between arch_ptrace_stop(1) and
> set_current_state(TASK_TRACED):
> 
> 	msleep(10);
> 	if (unlikely(sigismember(&current->pending.signal, SIGKILL)))
> 		printk(KERN_INFO "%d (%s): Got SIGKILL in ptrace_stop\n",
> 			current->pid, current->comm);
> 
> I ran strace on a simple program (calling gettimeofday() in an endless
> loop) and killed it with SIGKILL. The program exited correctly and I got
> the message in syslog. I'm puzzled. :/  Is this not the correct place
> where the race condition should happen?

Ah, Roland, you're right, strace ends with:

+++ killed by SIGKILL +++
Process 2946 detached

I've just realized that it's exactly what SHOULDN'T happen. Sorry for
the fuss.

Regards,
Petr Tesarik


^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (23 preceding siblings ...)
  2007-11-12 16:11 ` Petr Tesarik
@ 2007-11-13  0:30 ` Roland McGrath
  2007-11-13 11:07 ` Petr Tesarik
                   ` (9 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-11-13  0:30 UTC (permalink / raw)
  To: linux-ia64

> the trouble is I used the current ia64 patch and even inserted an
> msleep(10) into ptrace_stop() to make sure it does sleep but I don't see
> any problems. I added the following code between arch_ptrace_stop(1) and
> set_current_state(TASK_TRACED):
> 
> 	msleep(10);
> 	if (unlikely(sigismember(&current->pending.signal, SIGKILL)))
> 		printk(KERN_INFO "%d (%s): Got SIGKILL in ptrace_stop\n",
> 			current->pid, current->comm);
> 
> I ran strace on a simple program (calling gettimeofday() in an endless
> loop) and killed it with SIGKILL. The program exited correctly and I got
> the message in syslog. I'm puzzled. :/  Is this not the correct place
> where the race condition should happen?

I'm not entirely clear on what code you are using.  

If you are using my patch, then the sigkill_pending check fixes this.  

If you are using code that does not drop the siglock before calling the
arch_ptrace_stop code, then you won't see the SIGKILL race either.  In that
case, you are just breaking rules for how long to hold locks and what you
can hold while you block and so forth.  This will have other bad effects
and would never be allowed to go into the kernel, but I don't have a
straightforward test case for such problems.

What I suggested testing was my code without the sigkill_pending check,
i.e. dropping the siglock around arch_ptrace_stop but no other fix-up.
If that is what you are trying and it does not produce a problem, then
I am surprised.

> Ah, Roland, you're right, strace ends with:
> 
> +++ killed by SIGKILL +++
> Process 2946 detached
> 
> I've just realized that it's exactly what SHOULDN'T happen. Sorry for
> the fuss.

No, this is correct behavior.  The bug symptom would be that noone
ever saw the SIGKILL because the traced process didn't wake up and
remains in TASK_TRACED with SIGKILL pending.

The test scenario I gave using strace is the wrong one.  In that case,
strace is always about to continue the process anyway, so you wouldn't
notice the problem even if it happened.  The problem case is when the
tracer doesn't do a PTRACE_CONT soon, so there is nothing other than
SIGKILL that would wake it up right away.  The race is between the
traced process going into ptrace_stop and the SIGKILL being sent.  It
probably does happen in this test, but once it does, strace sees the
process stop and immediately resumes it after printing its syscall
details.  

If you do the artificial test using a long sleep in arch_ptrace_stop,
then you can probably produce this by hand with gdb.  Have the process
doing raise(SIGCHLD) or some other harmless signal.  The traced
process will stop to report the signal to gdb, and then gdb will sit
at the prompt before resuming it (given "handle SIGFOO stop" if not default).  
If your sleep is long enough, it won't be hard to get your SIGKILL in there.
Then when gdb is sitting, the traced process may still be sitting too.
But it should have gone away instantly from SIGKILL.

Thanks,
Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (24 preceding siblings ...)
  2007-11-13  0:30 ` Roland McGrath
@ 2007-11-13 11:07 ` Petr Tesarik
  2007-11-14  5:38 ` Shaohua Li
                   ` (8 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-11-13 11:07 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 1403 bytes --]

On Mon, 2007-11-12 at 16:30 -0800, Roland McGrath wrote:
> [...]
> If you do the artificial test using a long sleep in arch_ptrace_stop,
> then you can probably produce this by hand with gdb.  Have the process
> doing raise(SIGCHLD) or some other harmless signal.  The traced
> process will stop to report the signal to gdb, and then gdb will sit
> at the prompt before resuming it (given "handle SIGFOO stop" if not default).  
> If your sleep is long enough, it won't be hard to get your SIGKILL in there.
> Then when gdb is sitting, the traced process may still be sitting too.
> But it should have gone away instantly from SIGKILL.

I found it extremely difficult to trigger the race condition without the
articifial test - arch_ptrace_stop() only sleeps if the user page is not
present, but in the common case the register stack backing store will
have been quite recently accessed by the process.

It should be possible to create a large file, flush the page cache, put
the RSE into lazy mode, flush it and map the register stack from that
file, so that no memory accesses to the backing store are done before
ptrace_stop(), but for the time being I placed an msleep(100) after
arch_ptrace_stop().

Anyway, I produced a test case which succeeds when the call to
sigkill_pending() is in and fails when it's commented out. I'm attaching
it here (the kernel patch to follow).

Regards,
Petr Tesarik


[-- Attachment #2: kill-race.c --]
[-- Type: text/x-csrc, Size: 1899 bytes --]

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <time.h>

#define TEST_OK		0
#define TEST_FAILED	1
#define TEST_ABORTED	2

#define INTERACTIVE	1

static int run_debugger(pid_t pid)
{
	struct timespec tm;
	int stat;
	int res;

	printf("Running debugger on pid %ld\n", (long) pid);

	if (waitpid(pid, &stat, 0) == -1) {
		perror("waitpid attach");
		goto error_out;
	}

	if (ptrace(PTRACE_CONT, pid, 0, 0) == -1) {
		perror("PTRACE_CONT");
		goto error_out;
	}

	tm.tv_sec = 0;
	tm.tv_nsec = 10000;
	nanosleep(&tm, NULL);

	if (kill(pid, SIGKILL) == -1) {
		perror("kill SIGKILL");
		goto error_out;
	}

#if INTERACTIVE
	puts("Sent SIGKILL. Press Enter to continue.");
	getchar();
#endif

	if (waitpid(pid, &stat, 0) == -1) {
		perror("waitpid exit");
		res = TEST_FAILED;
	} else if (WIFSIGNALED(stat)) {
		if (WTERMSIG(stat) != SIGKILL) {
			fprintf(stderr, "Child terminated by signal %d\n",
				WTERMSIG(stat));
			res = TEST_FAILED;
		}
	} else if (WIFSTOPPED(stat)) {
		fprintf(stderr, "Child notified us about signal %d - FAILED\n",
			WSTOPSIG(stat));
		res = TEST_FAILED;
	} else if (!WIFEXITED(stat)) {
		fprintf(stderr, "Child terminated abnormally, stat=0x%x\n",
			stat);
		res = TEST_FAILED;
	} else {
		fprintf(stderr, "Child exited with code %d\n",
			WEXITSTATUS(stat));
		res = TEST_FAILED;
		if (res < WEXITSTATUS(stat))
			res = WEXITSTATUS(stat);
	}

	return res;

error_out:
	ptrace(PTRACE_KILL, pid, 0, 0);

	return TEST_ABORTED;
}

static int run_child()
{
	/* Attach ourselves */
	if (ptrace(PTRACE_TRACEME, 0, 0, 0) == -1) {
		perror("TRACE_ME");
		return TEST_ABORTED;
	}

	for (;;)
		raise(SIGCHLD);
}

int main()
{
	pid_t pid;

	if ((pid = fork()))
		return run_debugger(pid);
	else
		return run_child();
}

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (25 preceding siblings ...)
  2007-11-13 11:07 ` Petr Tesarik
@ 2007-11-14  5:38 ` Shaohua Li
  2007-11-14  6:47 ` Roland McGrath
                   ` (7 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Shaohua Li @ 2007-11-14  5:38 UTC (permalink / raw)
  To: linux-ia64

On Tue, 2007-11-13 at 12:07 +0100, Petr Tesarik wrote:
> On Mon, 2007-11-12 at 16:30 -0800, Roland McGrath wrote:
> > [...]
> > If you do the artificial test using a long sleep in arch_ptrace_stop,
> > then you can probably produce this by hand with gdb.  Have the process
> > doing raise(SIGCHLD) or some other harmless signal.  The traced
> > process will stop to report the signal to gdb, and then gdb will sit
> > at the prompt before resuming it (given "handle SIGFOO stop" if not default).  
> > If your sleep is long enough, it won't be hard to get your SIGKILL in there.
> > Then when gdb is sitting, the traced process may still be sitting too.
> > But it should have gone away instantly from SIGKILL.
> 
> I found it extremely difficult to trigger the race condition without the
> articifial test - arch_ptrace_stop() only sleeps if the user page is not
> present, but in the common case the register stack backing store will
> have been quite recently accessed by the process.
> 
> It should be possible to create a large file, flush the page cache, put
> the RSE into lazy mode, flush it and map the register stack from that
> file, so that no memory accesses to the backing store are done before
> ptrace_stop(), but for the time being I placed an msleep(100) after
> arch_ptrace_stop().
> 
> Anyway, I produced a test case which succeeds when the call to
> sigkill_pending() is in and fails when it's commented out. I'm attaching
> it here (the kernel patch to follow).
So without the sigkill_pending() check, your test case will print an
error, and otherwise, nothing. This is expected output, right? Looks I
can reproduce it here.

I'll refresh the RSE fix to align with Roland's patch.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (26 preceding siblings ...)
  2007-11-14  5:38 ` Shaohua Li
@ 2007-11-14  6:47 ` Roland McGrath
  2007-11-14  7:37 ` Petr Tesarik
                   ` (6 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-11-14  6:47 UTC (permalink / raw)
  To: linux-ia64

> I found it extremely difficult to trigger the race condition without the
> articifial test - arch_ptrace_stop() only sleeps if the user page is not
> present, but in the common case the register stack backing store will
> have been quite recently accessed by the process.

It is supposed to be a rare race, after all. :-)  We're just being thorough
to cover it, not that it ever actually happened in practice or was expected to.

> It should be possible to create a large file, flush the page cache, put
> the RSE into lazy mode, flush it and map the register stack from that
> file, so that no memory accesses to the backing store are done before
> ptrace_stop(), but for the time being I placed an msleep(100) after
> arch_ptrace_stop().

And then make the file so mapped be from a broken NFS or FUSE or somesuch
mount that actually blocks forever on the fault.  That would be the
probable style of a DoS attack exploiting this to create unkillable processes.

> Anyway, I produced a test case which succeeds when the call to
> sigkill_pending() is in and fails when it's commented out. I'm attaching
> it here (the kernel patch to follow).

Ok!  It sounds like we've verified all the pieces of the fix.

There's one more wrinkle that I've remembered.  A traced process has not
necessarily gone through ptrace_stop when you do ptrace on it, though
normally so.  It can be in job control stop (TASK_STOPPED), such as when
you attach gdb to something stopped by ^Z.  To cover that case, the easiest
thing I can see is to define an arch_ptrace_attach macro that does the
writeback.  This requires making the ia64_rse_writeback function (or
whatever it's to be called) work on either current or another task that's
stopped.  i.e.

#define arch_ptrace_attach(child) \
	do if (!test_tsk_thread_flag(child, TIF_RESTORE_RSE) && \
	       ptrace_check_attach(child, 0) = 0) \
		ia64_writeback_rse(child); \
	while (0)

Thanks,
Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (27 preceding siblings ...)
  2007-11-14  6:47 ` Roland McGrath
@ 2007-11-14  7:37 ` Petr Tesarik
  2007-11-14  7:40 ` Roland McGrath
                   ` (5 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-11-14  7:37 UTC (permalink / raw)
  To: linux-ia64

Shaohua Li wrote:
> On Tue, 2007-11-13 at 12:07 +0100, Petr Tesarik wrote:
>> On Mon, 2007-11-12 at 16:30 -0800, Roland McGrath wrote:
>>> [...]
>>> If you do the artificial test using a long sleep in arch_ptrace_stop,
>>> then you can probably produce this by hand with gdb.  Have the process
>>> doing raise(SIGCHLD) or some other harmless signal.  The traced
>>> process will stop to report the signal to gdb, and then gdb will sit
>>> at the prompt before resuming it (given "handle SIGFOO stop" if not default).  
>>> If your sleep is long enough, it won't be hard to get your SIGKILL in there.
>>> Then when gdb is sitting, the traced process may still be sitting too.
>>> But it should have gone away instantly from SIGKILL.
>> I found it extremely difficult to trigger the race condition without the
>> articifial test - arch_ptrace_stop() only sleeps if the user page is not
>> present, but in the common case the register stack backing store will
>> have been quite recently accessed by the process.
>>
>> It should be possible to create a large file, flush the page cache, put
>> the RSE into lazy mode, flush it and map the register stack from that
>> file, so that no memory accesses to the backing store are done before
>> ptrace_stop(), but for the time being I placed an msleep(100) after
>> arch_ptrace_stop().
>>
>> Anyway, I produced a test case which succeeds when the call to
>> sigkill_pending() is in and fails when it's commented out. I'm attaching
>> it here (the kernel patch to follow).
> So without the sigkill_pending() check, your test case will print an
> error, and otherwise, nothing. This is expected output, right? Looks I
> can reproduce it here.

Yes - you can also check the exit code - it is TEST_OK (=0) if the test
passes, TEST_FAILED (=1) if it fails in a known way and TEST_ABORTED
(=2) if something went wrong unexpectedly.

I was able to manage my test case to run even without the msleep() -
I'll send the instruction in the following mail. Anyway, I was finally
running with the following patch:

diff -ru linux-2.6.orig/arch/ia64/kernel/ptrace.c linux-2.6/arch/ia64/kernel/ptrace.c
--- linux-2.6.orig/arch/ia64/kernel/ptrace.c	2007-10-09 22:31:38.000000000 +0200
+++ linux-2.6/arch/ia64/kernel/ptrace.c	2007-11-13 11:34:03.664000000 +0100
@@ -547,6 +547,64 @@
 	return 0;
 }
 
+static long
+ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
+		unsigned long user_rbs_start, unsigned long user_rbs_end)
+{
+	unsigned long addr, val;
+	long ret;
+
+	/* now copy word for word from user rbs to kernel rbs: */
+	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
+		if (access_process_vm(child, addr, &val, sizeof(val), 0)
+				!= sizeof(val))
+			return -EIO;
+
+		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+#define RBS_SYNC_TO_USER 0
+#define RBS_SYNC_TO_KERNEL 1
+static void do_sync_rbs(struct unw_frame_info *info, void *arg)
+{
+	struct pt_regs *pt;
+	unsigned long urbs_end;
+	int to_user = (unsigned long)arg;
+
+	if (unw_unwind_to_user(info) < 0)
+		return;
+	pt = task_pt_regs(info->task);
+	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
+
+	if (to_user = RBS_SYNC_TO_USER)
+		ia64_sync_user_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
+	else
+		ia64_sync_kernel_rbs(info->task, info->sw, pt->ar_bspstore, urbs_end);
+}
+
+/*
+ * when a thread is stopped (ptraced), debugger might change thread's user
+ * stack (change memory directly), and we must avoid the RSE stored in kernel
+ * to override user stack (user space's RSE is newer than kernel's in the
+ * case). To workaround the issue, we copy kernel RSE to user RSE before the
+ * task is stopped, so user RSE has updated data.  we then copy user RSE to
+ * kernel after the task is resummed from traced stop and kernel will use the
+ * newer RSE to return to user.
+ */
+void ia64_ptrace_stop(void)
+{
+	unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_USER);
+}
+
+void ia64_ptrace_resume(void)
+{
+	unw_init_running(do_sync_rbs, (void *)RBS_SYNC_TO_KERNEL);
+}
+
 static inline int
 thread_matches (struct task_struct *thread, unsigned long addr)
 {
diff -ru linux-2.6.orig/include/asm-ia64/ptrace.h linux-2.6/include/asm-ia64/ptrace.h
--- linux-2.6.orig/include/asm-ia64/ptrace.h	2007-10-09 22:31:38.000000000 +0200
+++ linux-2.6/include/asm-ia64/ptrace.h	2007-11-13 11:35:17.848000000 +0100
@@ -303,6 +303,12 @@
   extern void ia64_increment_ip (struct pt_regs *pt);
   extern void ia64_decrement_ip (struct pt_regs *pt);
 
+  extern void ia64_ptrace_stop(void);
+  extern void ia64_ptrace_resume(void);
+
+  #define arch_ptrace_stop		ia64_ptrace_stop
+  #define arch_ptrace_resume		ia64_ptrace_resume
+  #define arch_ptrace_stop_needed()	1
 #endif /* !__KERNEL__ */
 
 /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */
diff -ru linux-2.6.orig/include/linux/ptrace.h linux-2.6/include/linux/ptrace.h
--- linux-2.6.orig/include/linux/ptrace.h	2007-10-09 22:31:38.000000000 +0200
+++ linux-2.6/include/linux/ptrace.h	2007-11-13 11:32:01.568000000 +0100
@@ -128,6 +128,18 @@
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef arch_ptrace_stop
+#define arch_ptrace_stop()	do { } while (0)
+#endif
+
+#ifndef arch_ptrace_resume
+#define arch_ptrace_resume()	do { } while (0)
+#endif
+
+#ifndef arch_ptrace_stop_needed
+#define arch_ptrace_stop_needed()	0
+#endif
+
 #endif
 
 #endif
diff -ru linux-2.6.orig/kernel/signal.c linux-2.6/kernel/signal.c
--- linux-2.6.orig/kernel/signal.c	2007-11-13 14:19:35.267851100 +0100
+++ linux-2.6/kernel/signal.c	2007-11-13 11:37:49.236000000 +0100
@@ -1569,6 +1570,17 @@
 }
 
 /*
+ * Return non-zero if there is a SIGKILL that should be waking us up.
+ * Called with the siglock held.
+ */
+static int sigkill_pending(struct task_struct *tsk)
+{
+	return ((sigismember(&tsk->pending.signal, SIGKILL) ||
+		 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
+		!unlikely(sigismember(&tsk->blocked, SIGKILL)));
+}
+
+/*
  * This must be called with current->sighand->siglock held.
  *
  * This should be the path for all ptrace stops.
@@ -1581,6 +1593,26 @@
  */
 static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 {
+	int killed = 0;
+
+	if (arch_ptrace_stop_needed()) {
+		/*
+		 * The arch code has something special to do before a
+		 * ptrace stop.  This is allowed to block, e.g. for faults
+		 * on user stack pages.  We can't keep the siglock while
+		 * calling arch_ptrace_stop, so we must release it now.
+		 * To preserve proper semantics, we must do this before
+		 * any signal bookkeeping like checking group_stop_count.
+		 * Meanwhile, a SIGKILL could come in before we retake the
+		 * siglock.  That must prevent us from sleeping in TASK_TRACED.
+		 * So after regaining the lock, we must check for SIGKILL.
+		 */
+		spin_unlock_irq(&current->sighand->siglock);
+		arch_ptrace_stop();
+		spin_lock_irq(&current->sighand->siglock);
+		killed = sigkill_pending(current);
+	}
+
 	/*
 	 * If there is a group stop in progress,
 	 * we must participate in the bookkeeping.
@@ -1596,10 +1630,11 @@
 	spin_unlock_irq(&current->sighand->siglock);
 	try_to_freeze();
 	read_lock(&tasklist_lock);
-	if (may_ptrace_stop()) {
+	if (likely(!killed) && may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();
+		arch_ptrace_resume();
 	} else {
 		/*
 		 * By the time we got the lock, our tracer went away.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (28 preceding siblings ...)
  2007-11-14  7:37 ` Petr Tesarik
@ 2007-11-14  7:40 ` Roland McGrath
  2007-11-14  7:53 ` Petr Tesarik
                   ` (4 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-11-14  7:40 UTC (permalink / raw)
  To: linux-ia64

What's arch_ptrace_resume about?  
I thought we were agreed on the plan using TIF_RESTORE_RSE.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (29 preceding siblings ...)
  2007-11-14  7:40 ` Roland McGrath
@ 2007-11-14  7:53 ` Petr Tesarik
  2007-11-14  7:55 ` Petr Tesarik
                   ` (3 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-11-14  7:53 UTC (permalink / raw)
  To: linux-ia64

Roland McGrath wrote:
> What's arch_ptrace_resume about?  
> I thought we were agreed on the plan using TIF_RESTORE_RSE.

No, after further discussion we came to the conclusion that introducing
the bit actually saves us only a couple of user-to-kernel/kernel-to-user
copies in do_exit(), but it complicates the kernel exit path, so it's
not really worth it. Shaohua didn't call it arch_ptrace_resume(), but he
added an argument to arch_ptrace_stop(). I was already testing a
different variant of that patch and I'm sorry if it caused confusion. :(

Is there any other advantage in introducing TIF_RESTORE_RSE than saving
some unneeded data copying?

Regards,
Petr Tesarik

> Thanks,
> Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (30 preceding siblings ...)
  2007-11-14  7:53 ` Petr Tesarik
@ 2007-11-14  7:55 ` Petr Tesarik
  2007-11-14 11:09 ` Roland McGrath
                   ` (2 subsequent siblings)
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-11-14  7:55 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 1268 bytes --]

Roland McGrath wrote:
>> I found it extremely difficult to trigger the race condition without the
>> articifial test - arch_ptrace_stop() only sleeps if the user page is not
>> present, but in the common case the register stack backing store will
>> have been quite recently accessed by the process.
> 
> It is supposed to be a rare race, after all. :-)  We're just being thorough
> to cover it, not that it ever actually happened in practice or was expected to.
> 
>> It should be possible to create a large file, flush the page cache, put
>> the RSE into lazy mode, flush it and map the register stack from that
>> file, so that no memory accesses to the backing store are done before
>> ptrace_stop(), but for the time being I placed an msleep(100) after
>> arch_ptrace_stop().
> 
> And then make the file so mapped be from a broken NFS or FUSE or somesuch
> mount that actually blocks forever on the fault.  That would be the
> probable style of a DoS attack exploiting this to create unkillable processes.

That's exactly what I did. FUSE doesn't implement mmap (guess why), but
I was able to trigger the race even with a working NFS after tweaking
the timing a bit. I'm attaching the test case I used (the NFS volume was
mounted on /nfs).

Regards,
Petr Tesarik

[-- Attachment #2: sigkill-race.tar.gz --]
[-- Type: application/gzip, Size: 1916 bytes --]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (31 preceding siblings ...)
  2007-11-14  7:55 ` Petr Tesarik
@ 2007-11-14 11:09 ` Roland McGrath
  2007-11-16 20:05 ` Petr Tesarik
  2007-11-18  3:08 ` Roland McGrath
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-11-14 11:09 UTC (permalink / raw)
  To: linux-ia64

> Roland McGrath wrote:
> > What's arch_ptrace_resume about?  
> > I thought we were agreed on the plan using TIF_RESTORE_RSE.
> 
> No, after further discussion we came to the conclusion that introducing
> the bit actually saves us only a couple of user-to-kernel/kernel-to-user
> copies in do_exit(), but it complicates the kernel exit path, so it's
> not really worth it. Shaohua didn't call it arch_ptrace_resume(), but he
> added an argument to arch_ptrace_stop(). I was already testing a
> different variant of that patch and I'm sorry if it caused confusion. :(

I think I missed that part of the discussion, but I may have overlooked it.
I'd thought Shaohua (or his predecessors) were already clear from my end on
why to argue for TIF_RESTORE_RSE.  Not being an actual ia64 user myself,
the plan was to outsource the arguing with ia64 people to the Intel folks.
So much for that.

It's not just do_exit, where I presume you mean for the ptrace EXIT stop,
after which user mode will never run again.  It's also any time ptrace_stop
runs more than once before going back to user mode.  This includes a signal
stop that is followed by more signal stops, a syscall-exit stop followed by
signal stops, a ptrace_notify (clone et al) followed by another (vfork-done
follows clone) or by signal or syscall-exit stops, etc.

But optimizing those cases is not really what motivates me.  What you've
implemented is pretty much exactly what David and I settled on when we
first discussed this a few years ago.  The recent action on the subject
was spurred by a slightly more recent set of interests on my part.

> Is there any other advantage in introducing TIF_RESTORE_RSE than saving
> some unneeded data copying?

No, unless "other" includes "saving a lot of very unneeded data copying".
Given ptrace, I can see the argument for simplicity over optimization.
The TIF_RESTORE_RSE plan looks forward to future debugging facilities,
where this issue could be a large performance impediment that other
machines won't have.

I became aware of this issue a long time ago because of gdb's inability
to use /proc/pid/mem reliably as it can do on all other machines.  But
when I started pressing to resolve it was while working on utrace.
Regardless of the fate of utrace per se, I think something will arise
that has the same requirements on arch code that I've set down and that
motivated the TIF_RESTORE_RSE approach to ia64's register backing store.

Consider a flexible facility for tracing actions at the kinds of event
points that ptrace monitors today.  Where now ptrace_notify or
ptrace_stop is called, a hook into some moderate intelligence can run to
decide whether to stop and what to communicate to the debugger and so forth.

It might monitor all syscalls and usually decide to do nothing at all,
or send a very cheap asynchronous notification somewhere.  The hoped-for
promise of fancy new facilities is that they can do this unobtrusively
across many, many threads in the system.  So in this case, it should be
no more expensive than TIF_SYSCALL_AUDIT.

It might stop and wait with a clunky amount of overhead like ptrace.
In this case, the additional overhead of extra copies probably doesn't rate.

It might not stop at all, but instead do some self-examination before
going on.  This might include reading (or writing) memory via
access_process_vm or get_user, including accessing register values in
the register backing store memory.  In this case, correctness demands
that the writeback to user memory be done before that memory is
examined, and that changed user memory will be reloaded into the RSE
before the user registers are used next (in user mode or at syscall entry).

But it's a flexible facility.  So it's not easy to know ahead of time
which of these scenarios it will be.  (In utrace, there's a function
pointer provided by some kernel module that can do what it likes.)
For the filtered event case (first of the three), you want to skip the
whole overhead.  For the ptrace-like case (second), you want to flush
before you stop and reload afterward.  For the non-stop introspection
case (third), you need flush and reload but here won't be any stop.

So the natural interface for these to all fall out optimal is that there
is no automagic copying at the low level, but an explicit arch
"writeback" function to call.  This gets called before something reads
user memory and expects it to be harmonized with the thread's state.  It
should be cheap to call when it's already been done, so multiple
uncoordinated things can request it when they need to ensure it, but
don't pay lots of extra overhead when several of those come before
actually going to user mode (or syscall entry).  To keep the bookkeeping
and interactions simple even among multiple uncoordinated things, that
one call should make it so that reloading from user memory is automatic
on the next return to user mode (or syscall entry).  ptrace calls this
at every stop (and attach, cf my earlier mail).  Other future facilities
would call it only selectively when they've decided they need register
values from memory at this particular stop.

The bookkeeping to prevent repeated flushes and to ensure reloads before
resuming could be done by higher layers that call ia64_ptrace_stop and
ia64_ptrace_resume.  But, this whole issue really only exists on ia64.
So it makes sense to keep the generic interface semantics that has to
consider it as simple as possible and push this work into the ia64 code,
and TIF_* is a perfect fit for this.  The main "complexity" introduced
is the overloading of TIF_NOTIFY_RESUME because there isn't another free
bit in TIF_ALLWORK_MASK.  The need to reload from memory after
syscall_trace_enter and before using the register values as syscall args
is subtle, too.  But still, it's not so much.  And frankly, ia64
deserves it for deciding to have the godforsaken RSE semantics.

Thanks,
Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (32 preceding siblings ...)
  2007-11-14 11:09 ` Roland McGrath
@ 2007-11-16 20:05 ` Petr Tesarik
  2007-11-18  3:08 ` Roland McGrath
  34 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2007-11-16 20:05 UTC (permalink / raw)
  To: linux-ia64

Roland McGrath wrote:
>> I found it extremely difficult to trigger the race condition without the
>> articifial test - arch_ptrace_stop() only sleeps if the user page is not
>> present, but in the common case the register stack backing store will
>> have been quite recently accessed by the process.
> 
> It is supposed to be a rare race, after all. :-)  We're just being thorough
> to cover it, not that it ever actually happened in practice or was expected to.
> 
>> It should be possible to create a large file, flush the page cache, put
>> the RSE into lazy mode, flush it and map the register stack from that
>> file, so that no memory accesses to the backing store are done before
>> ptrace_stop(), but for the time being I placed an msleep(100) after
>> arch_ptrace_stop().
> 
> And then make the file so mapped be from a broken NFS or FUSE or somesuch
> mount that actually blocks forever on the fault.  That would be the
> probable style of a DoS attack exploiting this to create unkillable processes.
> 
>> Anyway, I produced a test case which succeeds when the call to
>> sigkill_pending() is in and fails when it's commented out. I'm attaching
>> it here (the kernel patch to follow).
> 
> Ok!  It sounds like we've verified all the pieces of the fix.
> 
> There's one more wrinkle that I've remembered.  A traced process has not
> necessarily gone through ptrace_stop when you do ptrace on it, though
> normally so.  It can be in job control stop (TASK_STOPPED), such as when
> you attach gdb to something stopped by ^Z.  To cover that case, the easiest

Interesting. I've just tried attaching to a stopped process and gdb
hangs on wait4(). When I resume the process, gdb complains like this:

/build/buildd/gdb-6.4.90.dfsg/gdb/linux-nat.c:1025: internal-error:
linux_nat_attach: Assertion `pid = GET_PID (inferior_ptid) &&
WIFSTOPPED (status) && WSTOPSIG (status) = SIGSTOP' failed.
A problem internal to GDB has been detected,
further debugging may prove unreliable.
Quit this debugging session? (y or n)

So, it seems to me that attaching to stopped processes is broken anyway,
because debugger expect to get a notification signal when they do
PTRACE_ATTACH. Shouldn't the kernel generate one? (Which would BTW also
make the special handling of ptrace_attach for our purposes unnecessary.)

Regards,
Petr Tesarik


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] ptrace RSE bug
  2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
                   ` (33 preceding siblings ...)
  2007-11-16 20:05 ` Petr Tesarik
@ 2007-11-18  3:08 ` Roland McGrath
  34 siblings, 0 replies; 36+ messages in thread
From: Roland McGrath @ 2007-11-18  3:08 UTC (permalink / raw)
  To: linux-ia64

No, gdb was just buggy.  More recent versions have been fixed.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2007-11-18  3:08 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-08-29  3:21 [PATCH] ptrace RSE bug Shaohua Li
2007-08-29  7:10 ` Roland McGrath
2007-08-29  7:29 ` Matthew Chapman
2007-08-29  8:01 ` Roland McGrath
2007-09-05 16:25 ` Petr Tesarik
2007-09-06  3:16 ` Shaohua Li
2007-09-06 13:59 ` Petr Tesarik
2007-09-07  1:02 ` Shaohua Li
2007-09-07  8:26 ` Petr Tesarik
2007-09-07 15:11 ` David Mosberger-Tang
2007-09-11  8:39 ` Shaohua Li
2007-10-17 14:56 ` Petr Tesarik
2007-10-17 19:48 ` Petr Tesarik
2007-10-17 19:55 ` Petr Tesarik
2007-10-18  1:54 ` Shaohua Li
2007-10-18 10:59 ` Petr Tesarik
2007-10-18 16:02 ` Christoph Hellwig
2007-10-19  7:30 ` Shaohua Li
2007-10-19 19:42 ` Petr Tesarik
2007-10-24  3:34 ` Shaohua Li
2007-10-24 23:38 ` Luck, Tony
2007-10-25  0:38 ` Shaohua Li
2007-11-12  2:14 ` Roland McGrath
2007-11-12 15:41 ` Petr Tesarik
2007-11-12 16:11 ` Petr Tesarik
2007-11-13  0:30 ` Roland McGrath
2007-11-13 11:07 ` Petr Tesarik
2007-11-14  5:38 ` Shaohua Li
2007-11-14  6:47 ` Roland McGrath
2007-11-14  7:37 ` Petr Tesarik
2007-11-14  7:40 ` Roland McGrath
2007-11-14  7:53 ` Petr Tesarik
2007-11-14  7:55 ` Petr Tesarik
2007-11-14 11:09 ` Roland McGrath
2007-11-16 20:05 ` Petr Tesarik
2007-11-18  3:08 ` Roland McGrath

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox