[RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
@ 2006-04-13 17:20 Jeff Dike
  2006-04-18 12:57 ` Pavel Machek
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Jeff Dike @ 2006-04-13 17:20 UTC (permalink / raw)
  To: linux-kernel, user-mode-linux-devel

Add PTRACE_SYSCALL_MASK, which allows system calls to be selectively
traced.  It takes a bitmask and a length.  A system call is traced
if its bit is one.  Otherwise, it executes normally, and is
invisible to the ptracing parent.

This is not just useful for UML - strace -e could make good use of it as well.

Index: linux-2.6.17-mm-vtime/arch/um/kernel/ptrace.c
===================================================================
--- linux-2.6.17-mm-vtime.orig/arch/um/kernel/ptrace.c	2006-04-13 13:48:02.000000000 -0400
+++ linux-2.6.17-mm-vtime/arch/um/kernel/ptrace.c	2006-04-13 13:49:32.000000000 -0400
@@ -83,7 +83,7 @@ long arch_ptrace(struct task_struct *chi
                 break;
 
 	case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
-	case PTRACE_CONT: { /* restart after signal. */
+	case PTRACE_CONT: /* restart after signal. */
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
@@ -99,7 +99,9 @@ long arch_ptrace(struct task_struct *chi
 		wake_up_process(child);
 		ret = 0;
 		break;
-	}
+	case PTRACE_SYSCALL_MASK:
+		ret = set_syscall_mask(child, (char __user *) addr, data);
+		break;
 
 /*
  * make the child exit.  Best I can do is send it a sigkill. 
@@ -295,6 +297,12 @@ void syscall_trace(union uml_pt_regs *re
 	if (!(current->ptrace & PT_PTRACED))
 		return;
 
+	if((current->syscall_mask != NULL) &&
+	   ((long) UPT_SYSCALL_NR(regs) != -1) &&
+	   !(current->syscall_mask[UPT_SYSCALL_NR(regs) / (8 * sizeof(long))] &
+	     (1 << (UPT_SYSCALL_NR(regs) % (8 * sizeof(long))))))
+		return;
+
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
 	tracesysgood = (current->ptrace & PT_TRACESYSGOOD);
Index: linux-2.6.17-mm-vtime/include/linux/ptrace.h
===================================================================
--- linux-2.6.17-mm-vtime.orig/include/linux/ptrace.h	2006-04-13 13:48:02.000000000 -0400
+++ linux-2.6.17-mm-vtime/include/linux/ptrace.h	2006-04-13 13:49:32.000000000 -0400
@@ -93,6 +93,8 @@ extern void __ptrace_link(struct task_st
 extern void __ptrace_unlink(struct task_struct *child);
 extern void ptrace_untrace(struct task_struct *child);
 extern int ptrace_may_attach(struct task_struct *task);
+extern int set_syscall_mask(struct task_struct *child, char __user *mask,
+			    unsigned long len);
 
 static inline void ptrace_link(struct task_struct *child,
 			       struct task_struct *new_parent)
Index: linux-2.6.17-mm-vtime/arch/i386/kernel/ptrace.c
===================================================================
--- linux-2.6.17-mm-vtime.orig/arch/i386/kernel/ptrace.c	2006-04-13 13:48:02.000000000 -0400
+++ linux-2.6.17-mm-vtime/arch/i386/kernel/ptrace.c	2006-04-13 13:49:32.000000000 -0400
@@ -500,6 +500,9 @@ long arch_ptrace(struct task_struct *chi
 		ret = 0;
 		break;
 
+	case PTRACE_SYSCALL_MASK:
+		ret = set_syscall_mask(child, (char __user *) addr, data);
+		break;
 /*
  * make the child exit.  Best I can do is send it a sigkill. 
  * perhaps it should be put in the status that it wants to 
@@ -690,6 +693,11 @@ int do_syscall_trace(struct pt_regs *reg
 	if (!(current->ptrace & PT_PTRACED))
 		goto out;
 
+	if((current->syscall_mask != NULL) && ((long) regs->orig_eax != -1) &&
+	   !(current->syscall_mask[regs->orig_eax / (8 * sizeof(long))] &
+	     (1 << (regs->orig_eax % (8 * sizeof(long))))))
+		goto out;
+
 	/* If a process stops on the 1st tracepoint with SYSCALL_TRACE
 	 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
 	 * here. We have to check this and return */
Index: linux-2.6.17-mm-vtime/include/asm-i386/ptrace.h
===================================================================
--- linux-2.6.17-mm-vtime.orig/include/asm-i386/ptrace.h	2006-04-13 13:48:02.000000000 -0400
+++ linux-2.6.17-mm-vtime/include/asm-i386/ptrace.h	2006-04-13 13:49:32.000000000 -0400
@@ -53,6 +53,7 @@ struct pt_regs {
 
 #define PTRACE_GET_THREAD_AREA    25
 #define PTRACE_SET_THREAD_AREA    26
+#define PTRACE_SYSCALL_MASK	  27
 
 #define PTRACE_SYSEMU		  31
 #define PTRACE_SYSEMU_SINGLESTEP  32
Index: linux-2.6.17-mm-vtime/include/linux/sched.h
===================================================================
--- linux-2.6.17-mm-vtime.orig/include/linux/sched.h	2006-04-13 13:49:11.000000000 -0400
+++ linux-2.6.17-mm-vtime/include/linux/sched.h	2006-04-13 13:49:32.000000000 -0400
@@ -899,6 +899,7 @@ struct task_struct {
 
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
+	unsigned long *syscall_mask;
 /*
  * current io wait handle: wait queue entry to use for io waits
  * If this thread is processing aio, this points at the waitqueue
Index: linux-2.6.17-mm-vtime/kernel/ptrace.c
===================================================================
--- linux-2.6.17-mm-vtime.orig/kernel/ptrace.c	2006-04-13 13:48:02.000000000 -0400
+++ linux-2.6.17-mm-vtime/kernel/ptrace.c	2006-04-13 13:49:32.000000000 -0400
@@ -21,6 +21,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
+#include <asm/unistd.h>
 
 /*
  * ptrace a task: make the debugger its new parent and
@@ -450,6 +451,41 @@ int ptrace_traceme(void)
 	return 0;
 }
 
+int set_syscall_mask(struct task_struct *child, char __user *mask,
+		     unsigned long len)
+{
+	int i, n = (NR_syscalls + 7) / 8;
+	char c;
+
+	if(len > n){
+		for(i = NR_syscalls; i < len * 8; i++){
+			get_user(c, &mask[i / 8]);
+			if(!(c & (1 << (i % 8)))){
+				printk("Out of range syscall at %d\n", i);
+				return -EINVAL;
+			}
+		}
+
+		len = n;
+	}
+
+	if(child->syscall_mask == NULL){
+		child->syscall_mask = kmalloc(n, GFP_KERNEL);
+		if(child->syscall_mask == NULL)
+			return -ENOMEM;
+
+		memset(child->syscall_mask, 0xff, n);
+	}
+
+	/* XXX If this partially fails, we will have a partially updated
+	 * mask.
+	 */
+	if(copy_from_user(child->syscall_mask, mask, len))
+		return -EFAULT;
+
+	return 0;
+}
+
 /**
  * ptrace_get_task_struct  --  grab a task struct reference for ptrace
  * @pid:       process id to grab a task_struct reference of
Index: linux-2.6.17-mm-vtime/include/linux/init_task.h
===================================================================
--- linux-2.6.17-mm-vtime.orig/include/linux/init_task.h	2006-04-13 13:48:32.000000000 -0400
+++ linux-2.6.17-mm-vtime/include/linux/init_task.h	2006-04-13 13:50:21.000000000 -0400
@@ -124,6 +124,7 @@ extern struct group_info init_groups;
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
+	.syscall_mask	= NULL					\
 	INIT_RT_MUTEXES(tsk)						\
 }
 
Index: linux-2.6.17-mm-vtime/kernel/fork.c
===================================================================
--- linux-2.6.17-mm-vtime.orig/kernel/fork.c	2006-04-13 13:49:11.000000000 -0400
+++ linux-2.6.17-mm-vtime/kernel/fork.c	2006-04-13 13:49:32.000000000 -0400
@@ -1110,6 +1110,7 @@ static task_t *copy_process(unsigned lon
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
+	p->syscall_mask = NULL;
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-13 17:20 [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK Jeff Dike
@ 2006-04-18 12:57 ` Pavel Machek
  2006-04-26 18:38   ` Jeff Dike
  2006-04-20  9:05 ` Heiko Carstens
  2006-04-21 18:34 ` [uml-devel] " Blaisorblade
  2 siblings, 1 reply; 26+ messages in thread
From: Pavel Machek @ 2006-04-18 12:57 UTC (permalink / raw)
  To: Jeff Dike; +Cc: linux-kernel, user-mode-linux-devel

Hi!

> Add PTRACE_SYSCALL_MASK, which allows system calls to be selectively
> traced.  It takes a bitmask and a length.  A system call is traced
> if its bit is one.  Otherwise, it executes normally, and is
> invisible to the ptracing parent.
> 
> This is not just useful for UML - strace -e could make good use of it as well.

> +	if((current->syscall_mask != NULL) &&

Please put space between if and (.

> @@ -690,6 +693,11 @@ int do_syscall_trace(struct pt_regs *reg
>  	if (!(current->ptrace & PT_PTRACED))
>  		goto out;
>  
> +	if((current->syscall_mask != NULL) && ((long) regs->orig_eax != -1) &&
> +	   !(current->syscall_mask[regs->orig_eax / (8 * sizeof(long))] &
> +	     (1 << (regs->orig_eax % (8 * sizeof(long))))))
> +		goto out;
> +

Same here... and perhaps you can use __get_bit/__set_bit? (this
applies to few more places).

Are you going to fix non-i386, too?
							Pavel
-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-18 12:57 ` Pavel Machek
@ 2006-04-26 18:38   ` Jeff Dike
  0 siblings, 0 replies; 26+ messages in thread
From: Jeff Dike @ 2006-04-26 18:38 UTC (permalink / raw)
  To: Pavel Machek; +Cc: linux-kernel, user-mode-linux-devel

On Tue, Apr 18, 2006 at 02:57:28PM +0200, Pavel Machek wrote:
> Same here... and perhaps you can use __get_bit/__set_bit? (this
> applies to few more places).

I did the bit-mashing by hand because I couldn't tell from a quick
look at the code whether __get_bit/__set_bit had any limitations that
I might exceed (i.e. nr needs to be < 256).

> Are you going to fix non-i386, too?

Probably, but I have other things to fix first.

				Jeff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-13 17:20 [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK Jeff Dike
  2006-04-18 12:57 ` Pavel Machek
@ 2006-04-20  9:05 ` Heiko Carstens
  2006-04-20 14:17   ` [uml-devel] " Bodo Stroesser
  2006-04-21 18:16   ` Blaisorblade
  2006-04-21 18:34 ` [uml-devel] " Blaisorblade
  2 siblings, 2 replies; 26+ messages in thread
From: Heiko Carstens @ 2006-04-20  9:05 UTC (permalink / raw)
  To: Jeff Dike; +Cc: linux-kernel, user-mode-linux-devel

> Add PTRACE_SYSCALL_MASK, which allows system calls to be selectively
> traced.  It takes a bitmask and a length.  A system call is traced
> if its bit is one.  Otherwise, it executes normally, and is
> invisible to the ptracing parent.
> [...]
> +int set_syscall_mask(struct task_struct *child, char __user *mask,
> +		     unsigned long len)
> +{
> +	int i, n = (NR_syscalls + 7) / 8;
> +	char c;
> +
> +	if(len > n){
> +		for(i = NR_syscalls; i < len * 8; i++){
> +			get_user(c, &mask[i / 8]);
> +			if(!(c & (1 << (i % 8)))){
> +				printk("Out of range syscall at %d\n", i);
> +				return -EINVAL;
> +			}
> +		}
> +
> +		len = n;
> +	}

Since it's quite likely that len > n will be true (e.g. after installing the
latest version of your debug tool) it would be better to silently ignore all
bits not within the range of NR_syscalls.
There is no point in flooding the console. The tracing process won't see any
of the non existant syscalls it requested to see anyway.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-20  9:05 ` Heiko Carstens
@ 2006-04-20 14:17   ` Bodo Stroesser
  2006-04-25 18:32     ` Jeff Dike
  2006-04-26 20:26     ` Charles P. Wright
  2006-04-21 18:16   ` Blaisorblade
  1 sibling, 2 replies; 26+ messages in thread
From: Bodo Stroesser @ 2006-04-20 14:17 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Heiko Carstens, linux-kernel, user-mode-linux-devel

Heiko Carstens wrote:
>>Add PTRACE_SYSCALL_MASK, which allows system calls to be selectively
>>traced.  It takes a bitmask and a length.  A system call is traced
>>if its bit is one.  Otherwise, it executes normally, and is
>>invisible to the ptracing parent.
>>[...]
>>+int set_syscall_mask(struct task_struct *child, char __user *mask,
>>+		     unsigned long len)
>>+{
>>+	int i, n = (NR_syscalls + 7) / 8;
>>+	char c;
>>+
>>+	if(len > n){
>>+		for(i = NR_syscalls; i < len * 8; i++){
>>+			get_user(c, &mask[i / 8]);
>>+			if(!(c & (1 << (i % 8)))){
>>+				printk("Out of range syscall at %d\n", i);
>>+				return -EINVAL;
>>+			}
>>+		}
>>+
>>+		len = n;
>>+	}
> 
> 
> Since it's quite likely that len > n will be true (e.g. after installing the
> latest version of your debug tool) it would be better to silently ignore all
> bits not within the range of NR_syscalls.
> There is no point in flooding the console. The tracing process won't see any
> of the non existant syscalls it requested to see anyway.

Shouldn't 'len' better be the number of bits in the mask than the number of chars?
Assume a syscall newly added to UML would be a candidate for processing on the host,
but the incremented NR_syscalls still would result in the same number of bytes. Also
assume, host doesn't yet have that new syscall. Current implementation doesn't catch
the fact, that host can't execute that syscall.

OTOH, I think UML shouldn't send the entire mask, but relevant part only. The missing
end is filled with 0xff by host anyway. So it would be enough to send the mask up to the
highest bit representing a syscall, that needs to be executed by host. (currently, that
is __NR_gettimeofday). If UML would do so, no more problem results from UML having
a higher NR_syscall than the host (as long as the new syscalls are to be intercepted
and executed by UML)

A greater problem might be a process in UML, that calls an invalid syscall number. AFAICS
syscall number (orig_eax) isn't checked before it is used in do_syscall_trace to address
syscall_mask. This might result in a crash.

Bodo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-20 14:17   ` [uml-devel] " Bodo Stroesser
@ 2006-04-25 18:32     ` Jeff Dike
  2006-04-26 20:26     ` Charles P. Wright
  1 sibling, 0 replies; 26+ messages in thread
From: Jeff Dike @ 2006-04-25 18:32 UTC (permalink / raw)
  To: Bodo Stroesser; +Cc: Heiko Carstens, linux-kernel, user-mode-linux-devel

On Thu, Apr 20, 2006 at 04:17:28PM +0200, Bodo Stroesser wrote:
> Shouldn't 'len' better be the number of bits in the mask than the number of 
> chars?

Yup.

> OTOH, I think UML shouldn't send the entire mask, but relevant part only. 
> The missing end is filled with 0xff by host anyway. So it would be
> enough to send the mask up to the highest bit representing a
> syscall, that needs to be executed by host.  (currently, that is
> __NR_gettimeofday). If UML would do so, no more problem results from
> UML having a higher NR_syscall than the host (as long as the new
> syscalls are to be intercepted and executed by UML)

Yup, that was part of the intent of sending in the mask length.

> A greater problem might be a process in UML, that calls an invalid syscall 
> number. AFAICS syscall number (orig_eax) isn't checked before it is
> used in do_syscall_trace to address syscall_mask. This might result
> in a crash. 

Yeah, this needs fixing.

				Heff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-20 14:17   ` [uml-devel] " Bodo Stroesser
  2006-04-25 18:32     ` Jeff Dike
@ 2006-04-26 20:26     ` Charles P. Wright
  2006-04-26 19:40       ` Jeff Dike
  1 sibling, 1 reply; 26+ messages in thread
From: Charles P. Wright @ 2006-04-26 20:26 UTC (permalink / raw)
  To: Jeff Dike, Bodo Stroesser
  Cc: Heiko Carstens, linux-kernel, user-mode-linux-devel

On Thu, 2006-04-20 at 16:17 +0200, Bodo Stroesser wrote:
> Heiko Carstens wrote:
> >>Add PTRACE_SYSCALL_MASK, which allows system calls to be selectively
> >>traced.  It takes a bitmask and a length.  A system call is traced
> >>if its bit is one.  Otherwise, it executes normally, and is
> >>invisible to the ptracing parent.
> >>[...]
> >>+int set_syscall_mask(struct task_struct *child, char __user *mask,
> >>+		     unsigned long len)
> >>+{
> >>+	int i, n = (NR_syscalls + 7) / 8;
> >>+	char c;
> >>+
> >>+	if(len > n){
> >>+		for(i = NR_syscalls; i < len * 8; i++){
> >>+			get_user(c, &mask[i / 8]);
> >>+			if(!(c & (1 << (i % 8)))){
> >>+				printk("Out of range syscall at %d\n", i);
> >>+				return -EINVAL;
> >>+			}
> >>+		}
> >>+
> >>+		len = n;
> >>+	}
> > 
> > 
> > Since it's quite likely that len > n will be true (e.g. after installing the
> > latest version of your debug tool) it would be better to silently ignore all
> > bits not within the range of NR_syscalls.
> > There is no point in flooding the console. The tracing process won't see any
> > of the non existant syscalls it requested to see anyway.
> 
> Shouldn't 'len' better be the number of bits in the mask than the number of chars?
> Assume a syscall newly added to UML would be a candidate for processing on the host,
> but the incremented NR_syscalls still would result in the same number of bytes. Also
> assume, host doesn't yet have that new syscall. Current implementation doesn't catch
> the fact, that host can't execute that syscall.
> 
> OTOH, I think UML shouldn't send the entire mask, but relevant part only. The missing
> end is filled with 0xff by host anyway. So it would be enough to send the mask up to the
> highest bit representing a syscall, that needs to be executed by host. (currently, that
> is __NR_gettimeofday). If UML would do so, no more problem results from UML having
> a higher NR_syscall than the host (as long as the new syscalls are to be intercepted
> and executed by UML)
> 
> A greater problem might be a process in UML, that calls an invalid syscall number. AFAICS
> syscall number (orig_eax) isn't checked before it is used in do_syscall_trace to address
> syscall_mask. This might result in a crash.
I have a similar local patch that I've been using.  I think it would be
worthwhile to have an extra bit in the bitmap that says what to do with
calls that fall outside the range [0, __NR_syscall].  That way the
ptrace monitor can decide whether it is useful to get informed of these
"bogus" calls.

Charles


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-26 20:26     ` Charles P. Wright
@ 2006-04-26 19:40       ` Jeff Dike
  2006-04-26 21:29         ` Charles P. Wright
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Dike @ 2006-04-26 19:40 UTC (permalink / raw)
  To: Charles P. Wright
  Cc: Bodo Stroesser, Heiko Carstens, linux-kernel,
	user-mode-linux-devel

On Wed, Apr 26, 2006 at 04:26:42PM -0400, Charles P. Wright wrote:
> I have a similar local patch that I've been using.  I think it would be
> worthwhile to have an extra bit in the bitmap that says what to do with
> calls that fall outside the range [0, __NR_syscall].  That way the
> ptrace monitor can decide whether it is useful to get informed of these
> "bogus" calls.

The bit needs to be somewhere, but I think sticking it in the syscall
bitmask is a bad idea.  Mixing apples and oranges, as it were.
Sticking it in the op is better, even though that's a bit of apples
and oranges as well.

Another alternative would be to make it an option and set it with
PTRACE_SETOPTIONS.

				Jeff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-26 19:40       ` Jeff Dike
@ 2006-04-26 21:29         ` Charles P. Wright
  0 siblings, 0 replies; 26+ messages in thread
From: Charles P. Wright @ 2006-04-26 21:29 UTC (permalink / raw)
  To: Jeff Dike
  Cc: Bodo Stroesser, Heiko Carstens, linux-kernel,
	user-mode-linux-devel

On Wed, 2006-04-26 at 15:40 -0400, Jeff Dike wrote:
> On Wed, Apr 26, 2006 at 04:26:42PM -0400, Charles P. Wright wrote:
> > I have a similar local patch that I've been using.  I think it would be
> > worthwhile to have an extra bit in the bitmap that says what to do with
> > calls that fall outside the range [0, __NR_syscall].  That way the
> > ptrace monitor can decide whether it is useful to get informed of these
> > "bogus" calls.
> 
> The bit needs to be somewhere, but I think sticking it in the syscall
> bitmask is a bad idea.  Mixing apples and oranges, as it were.
> Sticking it in the op is better, even though that's a bit of apples
> and oranges as well.
> 
> Another alternative would be to make it an option and set it with
> PTRACE_SETOPTIONS.
That is probably a better solution than sticking it in the request (I
assume you meant request by op).   I think spawning more PTRACE_*
requests that perform some permutation of PTRACE_SYSCALL is likely to
make things confusing.

Charles


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-20  9:05 ` Heiko Carstens
  2006-04-20 14:17   ` [uml-devel] " Bodo Stroesser
@ 2006-04-21 18:16   ` Blaisorblade
  2006-04-21 18:38     ` Blaisorblade
  2006-04-22  7:06     ` Heiko Carstens
  1 sibling, 2 replies; 26+ messages in thread
From: Blaisorblade @ 2006-04-21 18:16 UTC (permalink / raw)
  To: user-mode-linux-devel; +Cc: Heiko Carstens, Jeff Dike, linux-kernel

On Thursday 20 April 2006 11:05, Heiko Carstens wrote:
> > Add PTRACE_SYSCALL_MASK, which allows system calls to be selectively
> > traced.  It takes a bitmask and a length.  A system call is traced
> > if its bit is one.  Otherwise, it executes normally, and is
> > invisible to the ptracing parent.
> > [...]
> > +int set_syscall_mask(struct task_struct *child, char __user *mask,
> > +		     unsigned long len)
> > +{
> > +	int i, n = (NR_syscalls + 7) / 8;
> > +	char c;
> > +
> > +	if(len > n){
> > +		for(i = NR_syscalls; i < len * 8; i++){
> > +			get_user(c, &mask[i / 8]);
> > +			if(!(c & (1 << (i % 8)))){
> > +				printk("Out of range syscall at %d\n", i);
> > +				return -EINVAL;
> > +			}
> > +		}
> > +
> > +		len = n;
> > +	}
>
> Since it's quite likely that len > n will be true (e.g. after installing
> the latest version of your debug tool) it would be better to silently
> ignore all bits not within the range of NR_syscalls.

For strace -e what you say is reasonable, since it will set only a few bits to 
1 (the ones for the requested syscalls) and everything else to 0. Also, 
there's a problem for this case since the host will 1-extend the mask, so an 
old strace would trace some unwanted and unknown syscalls. I.e. we want here 
to 0-extend the mask and only maybe complain for bits set to 1.

For UML, instead, it's important to set that some peculiar syscalls are not 
traced, that the mask is 1-extended and that errors are reported.

So, I suggest a "flags" parameter for this. Sadly, we're using the ptrace() 
syscall and there's no 5th argument normally, we could either use it (IIRC 
some calls use the 5th regs indeed), or pass as "data" a struct with flags 
and the mask.

The flags could be:

MASK_DEFAULT_TRACE (set the default to 1 for remaining bits)
MASK_DEFAULT_IGNORE (set the default to 0 for remaining bits)
MASK_STRICT_VERIFY (return -EINVAL for bits exceeding NR_syscalls and set 
differently than the default).

probably with a reasonable prefix to avoid namespace pollution (something like 
"PT_SC_-").

> There is no point in flooding the console.

This one is at all correct - that printk is only meaningful for debug.

> The tracing process won't see 
> any of the non existant syscalls it requested to see anyway.

No, you misunderstood the code, it does the opposite very different - the loop 
will detect the syscalls that the process wanted to ignore but don't exist. 
For the UML case, it needs it must trace that syscall and execute it on his 
own rather than rely on the host performing it.
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

___________________________________ 
Bolletta salata? Passa a Yahoo! Messenger with Voice 
http://it.messenger.yahoo.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-21 18:16   ` Blaisorblade
@ 2006-04-21 18:38     ` Blaisorblade
  2006-04-22  7:06     ` Heiko Carstens
  1 sibling, 0 replies; 26+ messages in thread
From: Blaisorblade @ 2006-04-21 18:38 UTC (permalink / raw)
  To: user-mode-linux-devel; +Cc: Heiko Carstens, Jeff Dike, linux-kernel

On Friday 21 April 2006 20:16, Blaisorblade wrote:
> On Thursday 20 April 2006 11:05, Heiko Carstens wrote:

> The flags could be:
>
> MASK_DEFAULT_TRACE (set the default to 1 for remaining bits)
> MASK_DEFAULT_IGNORE (set the default to 0 for remaining bits)
> MASK_STRICT_VERIFY (return -EINVAL for bits exceeding NR_syscalls and set
> differently than the default).

Actually, for a more elegant API the default should be to check and there 
should be a flag PT_SC_MASK_IGNORE_UNKNOWN_SYSCALL (reworded in some clearer 
way, it doesn't mean to ignore the syscall but the bits - IGNORE should be 
something like "be comprehensive with me when you check". Maybe 
ACCEPT_UNKNOWN_SYSCALL).

> probably with a reasonable prefix to avoid namespace pollution (something
> like "PT_SC_-").

-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

		
___________________________________ 
Bolletta salata? Passa a Yahoo! Messenger with Voice 
http://it.messenger.yahoo.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-21 18:16   ` Blaisorblade
  2006-04-21 18:38     ` Blaisorblade
@ 2006-04-22  7:06     ` Heiko Carstens
  2006-04-22  8:32       ` Blaisorblade
  2006-04-25 15:59       ` Jeff Dike
  1 sibling, 2 replies; 26+ messages in thread
From: Heiko Carstens @ 2006-04-22  7:06 UTC (permalink / raw)
  To: Blaisorblade; +Cc: user-mode-linux-devel, Jeff Dike, linux-kernel

> For UML, instead, it's important to set that some peculiar syscalls are not 
> traced, that the mask is 1-extended and that errors are reported.
> 
> So, I suggest a "flags" parameter for this. Sadly, we're using the ptrace() 
> syscall and there's no 5th argument normally, we could either use it (IIRC 
> some calls use the 5th regs indeed), or pass as "data" a struct with flags 
> and the mask.
> 
> The flags could be:
> 
> MASK_DEFAULT_TRACE (set the default to 1 for remaining bits)
> MASK_DEFAULT_IGNORE (set the default to 0 for remaining bits)
> MASK_STRICT_VERIFY (return -EINVAL for bits exceeding NR_syscalls and set 
> differently than the default).
> 
> probably with a reasonable prefix to avoid namespace pollution (something like 
> "PT_SC_-").

You might as well introduce yet another ptrace call which returns the number
of system calls and for this ptrace call force user space to pass a complete
bitmap. Sounds easier to me.

> > The tracing process won't see 
> > any of the non existant syscalls it requested to see anyway.
> No, you misunderstood the code, it does the opposite very different - the loop 

Looks I missed a few "!"s :)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-22  7:06     ` Heiko Carstens
@ 2006-04-22  8:32       ` Blaisorblade
  2006-04-25 15:59       ` Jeff Dike
  1 sibling, 0 replies; 26+ messages in thread
From: Blaisorblade @ 2006-04-22  8:32 UTC (permalink / raw)
  To: Heiko Carstens; +Cc: user-mode-linux-devel, Jeff Dike, linux-kernel

On Saturday 22 April 2006 09:06, Heiko Carstens wrote:
> > For UML, instead, it's important to set that some peculiar syscalls are
> > not traced, that the mask is 1-extended and that errors are reported.
> >
> > So, I suggest a "flags" parameter for this. Sadly, we're using the
> > ptrace() syscall and there's no 5th argument normally, we could either
> > use it (IIRC some calls use the 5th regs indeed), or pass as "data" a
> > struct with flags and the mask.
> >
> > The flags could be:
> >
> > MASK_DEFAULT_TRACE (set the default to 1 for remaining bits)
> > MASK_DEFAULT_IGNORE (set the default to 0 for remaining bits)
> > MASK_STRICT_VERIFY (return -EINVAL for bits exceeding NR_syscalls and set
> > differently than the default).
> >
> > probably with a reasonable prefix to avoid namespace pollution (something
> > like "PT_SC_-").
>
> You might as well introduce yet another ptrace call which returns the
> number of system calls and for this ptrace call force user space to pass a
> complete bitmap. Sounds easier to me.

I thought to something such, but it's interesting to have auto-complete and I 
didn't like the idea of querying the syscall number... It's true that my 
suggestion wasn't (maybe) that marvellous either.



As a side note, I'd like to inform you that this patch made it to the LWN 
front page... I'm sorry that the article cannot yet be read (it's subscribers 
only for now and until next Thursday):

http://lwn.net/Articles/179829/

The article is positive about the patch and shows interest, on the wave of all 
the various existing virtualization ideas.

> > > The tracing process won't see
> > > any of the non existant syscalls it requested to see anyway.

> > No, you misunderstood the code, it does the opposite very different - the
> > loop

> Looks I missed a few "!"s :)
Don't worry! Bye
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] Re: [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-22  7:06     ` Heiko Carstens
  2006-04-22  8:32       ` Blaisorblade
@ 2006-04-25 15:59       ` Jeff Dike
  1 sibling, 0 replies; 26+ messages in thread
From: Jeff Dike @ 2006-04-25 15:59 UTC (permalink / raw)
  To: Heiko Carstens; +Cc: Blaisorblade, user-mode-linux-devel, linux-kernel

On Sat, Apr 22, 2006 at 09:06:10AM +0200, Heiko Carstens wrote:
> > The flags could be:
> > 
> > MASK_DEFAULT_TRACE (set the default to 1 for remaining bits)
> > MASK_DEFAULT_IGNORE (set the default to 0 for remaining bits)
> > MASK_STRICT_VERIFY (return -EINVAL for bits exceeding NR_syscalls and set 
> > differently than the default).

I'd prefer (given that there aren't any unused ptrace arguments) using
the operation for this - PTRACE_SYSCALL_MASK_TRACE,
PTRACE_SYSCALL_MASK_IGNORE.  We'd need better names than these
horribly over-long ones, though.

> You might as well introduce yet another ptrace call which returns the number
> of system calls and for this ptrace call force user space to pass a complete
> bitmap. Sounds easier to me.

I think that's just building in fragility whenever userspace doesn't
happen to match the kernel.  Both UML and strace will know what system
calls they are interested in.  Having the kernel 1- or 0-extend the
mask will automatically do the right thing.  If userspace is newer
than the kernel, and asks for special treatment for system calls that
don't exist, then it should get a -EINVAL.

				Jeff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-13 17:20 [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK Jeff Dike
  2006-04-18 12:57 ` Pavel Machek
  2006-04-20  9:05 ` Heiko Carstens
@ 2006-04-21 18:34 ` Blaisorblade
  2006-04-25 16:29   ` Jeff Dike
  2 siblings, 1 reply; 26+ messages in thread
From: Blaisorblade @ 2006-04-21 18:34 UTC (permalink / raw)
  To: user-mode-linux-devel
  Cc: Jeff Dike, linux-kernel, Heiko Carstens, Bodo Stroesser

On Thursday 13 April 2006 19:20, Jeff Dike wrote:
> Add PTRACE_SYSCALL_MASK, which allows system calls to be selectively
> traced.  It takes a bitmask and a length.  A system call is traced
> if its bit is one.  Otherwise, it executes normally, and is
> invisible to the ptracing parent.

> This is not just useful for UML - strace -e could make good use of it as
> well.

> Index: linux-2.6.17-mm-vtime/include/asm-i386/ptrace.h
> ===================================================================
> --- linux-2.6.17-mm-vtime.orig/include/asm-i386/ptrace.h	2006-04-13
> 13:48:02.000000000 -0400 +++
> linux-2.6.17-mm-vtime/include/asm-i386/ptrace.h	2006-04-13
> 13:49:32.000000000 -0400 @@ -53,6 +53,7 @@ struct pt_regs {
>
>  #define PTRACE_GET_THREAD_AREA    25
>  #define PTRACE_SET_THREAD_AREA    26
> +#define PTRACE_SYSCALL_MASK	  27

I think there could be a reason we skipped that for SYSEMU - that's to see. 
Also, if this capability will be implemented in other archs, we should use 
the 0x4200-0x4300 range for it.

>  #define PTRACE_SYSEMU		  31
>  #define PTRACE_SYSEMU_SINGLESTEP  32

> @@ -450,6 +451,41 @@ int ptrace_traceme(void)
>  	return 0;
>  }
>
> +int set_syscall_mask(struct task_struct *child, char __user *mask,
> +		     unsigned long len)
> +{
> +	int i, n = (NR_syscalls + 7) / 8;
> +	char c;
> +
> +	if(len > n){
> +		for(i = NR_syscalls; i < len * 8; i++){
> +			get_user(c, &mask[i / 8]);

This get_user() inside a loop is poor, it could slow down a valid call. It'd 
be simpler to copy the mask from userspace in a local variable (with 400 
syscalls that's 50 bytes, i.e. fully ok), and then perform the checks, if 
wanted (I disagree with Heiko's message, this check is needed sometimes - see 
my response to that).

And only after that set all at once child->syscall_mask. You copy twice that 
little quantity of data but that's not at all time-critical, and you're 
forced to do that to avoid partial updates; btw you've saved getting twice 
the content from userspace (slow when address spaces are distinct, like for 
4G/4G or SKAS implementation of copy_from_user).

Actually we would copy the whole struct in my API proposal (as I've described 
in the other message, we need to pass another param IMHO, so we'd pack them 
in a struct and pass its address).

> +			if(!(c & (1 << (i % 8)))){
> +				printk("Out of range syscall at %d\n", i);
> +				return -EINVAL;
> +			}
> +		}
> +
> +		len = n;
> +	}
> +
> +	if(child->syscall_mask == NULL){
> +		child->syscall_mask = kmalloc(n, GFP_KERNEL);
> +		if(child->syscall_mask == NULL)
> +			return -ENOMEM;
> +
> +		memset(child->syscall_mask, 0xff, n);
> +	}
> +
> +	/* XXX If this partially fails, we will have a partially updated
> +	 * mask.
> +	 */
> +	if(copy_from_user(child->syscall_mask, mask, len))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +

-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-21 18:34 ` [uml-devel] " Blaisorblade
@ 2006-04-25 16:29   ` Jeff Dike
  2006-04-26 15:47     ` Blaisorblade
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Dike @ 2006-04-25 16:29 UTC (permalink / raw)
  To: Blaisorblade
  Cc: user-mode-linux-devel, linux-kernel, Heiko Carstens,
	Bodo Stroesser

On Fri, Apr 21, 2006 at 08:34:52PM +0200, Blaisorblade wrote:
> >  #define PTRACE_GET_THREAD_AREA    25
> >  #define PTRACE_SET_THREAD_AREA    26
> > +#define PTRACE_SYSCALL_MASK	  27
> 
> I think there could be a reason we skipped that for SYSEMU - that's to see. 
> Also, if this capability will be implemented in other archs, we should use 
> the 0x4200-0x4300 range for it.

Yeah, we need to decide somewhat carefully which number to use.

> > +		for(i = NR_syscalls; i < len * 8; i++){
> > +			get_user(c, &mask[i / 8]);
> 
> This get_user() inside a loop is poor, it could slow down a valid call. It'd 
> be simpler to copy the mask from userspace in a local variable (with 400 
> syscalls that's 50 bytes, i.e. fully ok), and then perform the checks, if 
> wanted (I disagree with Heiko's message, this check is needed
> sometimes - see  my response to that).

Agree, except that we need to be careful about when userspace knows
about more system calls than the kernel.  We should copy-user as many
bits as the kernel knows about (or the process passes in, which ever
is less) and if the process knows about more system calls than the
kernel, the extra bits should be checked (maybe in a get_user(c, ...)
loop) to make sure that special treatment isn't being requested for
unknown syscalls.

> And only after that set all at once child->syscall_mask. You copy twice that 
> little quantity of data but that's not at all time-critical, and you're 
> forced to do that to avoid partial updates; btw you've saved getting twice 
> the content from userspace (slow when address spaces are distinct, like for 
> 4G/4G or SKAS implementation of copy_from_user).

Yup.

> Actually we would copy the whole struct in my API proposal (as I've
> described in the other message, we need to pass another param IMHO,
> so we'd pack them in a struct and pass its address).

You mean adding a fifth argument to ptrace?  I don't really like that
idea.  We could either make two new PTRACE_* operations (I don't like
the MASK_STRICT_VERIFY option since that seems unnecessary and
fragile) or make the data argument something like this
	struct {
		int flag;
		void *mask;
	}

which seems to be something like what you're suggesting.  You'll want
to stick the mask length in there as well, and leave the data argument
unused.

Except that passing pointers to pointers into system calls seems like
a bad idea - it makes ptrace look (more) like ioctl.  So, you'd want
something like
	struct {
		int flag;
		char mask[(NR_syscalls + 7)/8];
	}

then you'd want the length back in data so you know how much data the
process is giving you.  But then, you'll read the smaller of the
kernel's and process's version of the structure, and if the process
one is bigger, you need to read the extra bits to sanity-check them.
Given that you'll need this extra treatment, I think it's simpler to
just leave the addr argument as a pointer to the bits and add an extra
ptrace op.

				Jeff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-25 16:29   ` Jeff Dike
@ 2006-04-26 15:47     ` Blaisorblade
  2006-04-26 15:46       ` Jeff Dike
  0 siblings, 1 reply; 26+ messages in thread
From: Blaisorblade @ 2006-04-26 15:47 UTC (permalink / raw)
  To: user-mode-linux-devel
  Cc: Jeff Dike, linux-kernel, Heiko Carstens, Bodo Stroesser

On Tuesday 25 April 2006 18:29, Jeff Dike wrote:
> On Fri, Apr 21, 2006 at 08:34:52PM +0200, Blaisorblade wrote:
> > >  #define PTRACE_GET_THREAD_AREA    25
> > >  #define PTRACE_SET_THREAD_AREA    26
> > > +#define PTRACE_SYSCALL_MASK	  27
> >
> > I think there could be a reason we skipped that for SYSEMU - that's to
> > see. Also, if this capability will be implemented in other archs, we
> > should use the 0x4200-0x4300 range for it.
>
> Yeah, we need to decide somewhat carefully which number to use.
>
> > > +		for(i = NR_syscalls; i < len * 8; i++){
> > > +			get_user(c, &mask[i / 8]);
> >
> > This get_user() inside a loop is poor, it could slow down a valid call.
> > It'd be simpler to copy the mask from userspace in a local variable (with
> > 400 syscalls that's 50 bytes, i.e. fully ok), and then perform the
> > checks, if wanted (I disagree with Heiko's message, this check is needed
> > sometimes - see  my response to that).
>
> Agree, except that we need to be careful about when userspace knows
> about more system calls than the kernel.  We should copy-user as many
> bits as the kernel knows about (or the process passes in, which ever
> is less) and if the process knows about more system calls than the
> kernel, the extra bits should be checked (maybe in a get_user(c, ...)
> loop) to make sure that special treatment isn't being requested for
> unknown syscalls.

Yes, that's exactly what I thought. The get_user() loop isn't that nice but 
that's possibly a minor point.

> > And only after that set all at once child->syscall_mask. You copy twice
> > that little quantity of data but that's not at all time-critical, and
> > you're forced to do that to avoid partial updates; btw you've saved
> > getting twice the content from userspace (slow when address spaces are
> > distinct, like for 4G/4G or SKAS implementation of copy_from_user).

> Yup.

> > Actually we would copy the whole struct in my API proposal (as I've
> > described in the other message, we need to pass another param IMHO,
> > so we'd pack them in a struct and pass its address).

> You mean adding a fifth argument to ptrace?  I don't really like that
> idea.  We could either make two new PTRACE_* operations (I don't like
> the MASK_STRICT_VERIFY option since that seems unnecessary and
> fragile) or make the data argument something like this

> Except that passing pointers to pointers into system calls seems like
> a bad idea - it makes ptrace look (more) like ioctl.  So, you'd want
> something like
> 	struct {
> 		int flag;
> 		char mask[(NR_syscalls + 7)/8];
> 	}
>
> then you'd want the length back in data so you know how much data the
> process is giving you.

Yes, this is what I mean.

> But then, you'll read the smaller of the 
> kernel's and process's version of the structure, and if the process
> one is bigger, you need to read the extra bits to sanity-check them.
> Given that you'll need this extra treatment,

You need this treatment anyway - above we're passing a pointer to a bitstring, 
here we're passing a pointer to a struct containing a bitstring, in both 
cases we must copy in the right amount of bytes.

> I think it's simpler to 
> just leave the addr argument as a pointer to the bits and add an extra
> ptrace op.

If we can do without MASK_STRICT_VERIFY, that works fully, and anyway it's 
simpler - however, say, when running strace -e read,tee (sys_tee will soon be 
added, it seems) this call would fail, while it would be desirable to have it 
work as strace -e read.

MASK_STRICT_VERIFY isn't necessarily the best solution, but if userspace must 
search the maximum allowed syscall by multiple attempts, we've still a bad 
API.

Probably, a better option (_instead_ of MASK_STRICT_VERIFY) would be to return 
somewhere an "extended error code" saying which is the last allowed syscall 
or (better) which is the first syscall which failed. I.e. if there is strace 
-e read,splice,tee and nor splice nor tee are supported, then this value 
would be __NR_splice and strace (or any app) could then decide what to do.

To do that we need again a structure with a field where to store the code 
(which _must_ be at the beginning).

But this is cleaner than saying to the kernel "interpret what I say if I'm 
wrong", and I said above the complexity is the same when copying the 
structure.

And I'd use this together with the "two ptrace codes" idea.
Let's say we'll use PTRACE_TRACE_ONLY or PTRACE_TRACE_EXCEPT.

Another possibility (which however implies implementation for all 
architectures) is to put these two requests between ptrace options (i.e. 
PTRACE_SETOPTIONS), where it logically belongs (and this is the only point 
reason to do it this way); however we have then only one parameter, which 
would become then a pointer to such a structure:

struct {
	int ret_code;
	int mask_len;
	char mask[];
};
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-26 15:47     ` Blaisorblade
@ 2006-04-26 15:46       ` Jeff Dike
  2006-04-28 20:28         ` Blaisorblade
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Dike @ 2006-04-26 15:46 UTC (permalink / raw)
  To: Blaisorblade
  Cc: user-mode-linux-devel, linux-kernel, Heiko Carstens,
	Bodo Stroesser

On Wed, Apr 26, 2006 at 05:47:54PM +0200, Blaisorblade wrote:
> If we can do without MASK_STRICT_VERIFY, that works fully, and
> anyway it's simpler - however, say, when running strace -e read,tee
> (sys_tee will soon be added, it seems) this call would fail, while it
> would be desirable to have it work as strace -e read. 
> 
> MASK_STRICT_VERIFY isn't necessarily the best solution, but if
> userspace must search the maximum allowed syscall by multiple
> attempts, we've still a bad API.
> 
> Probably, a better option (_instead_ of MASK_STRICT_VERIFY) would be
> to return somewhere an "extended error code" saying which is the
> last allowed syscall or (better) which is the first syscall which
> failed. I.e. if there is strace -e read,splice,tee and nor splice nor
> tee are supported, then this value would be __NR_splice and strace (or
> any app) could then decide what to do.

Why not just zero out the bits that the kernel knows about?  Then, if
we return -EINVAL, the process just looks at the remaining bits that
are set to see what system calls the kernel didn't know about.

				Jeff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-26 15:46       ` Jeff Dike
@ 2006-04-28 20:28         ` Blaisorblade
  2006-04-29  1:49           ` Jeff Dike
  2006-04-29  8:49           ` Heiko Carstens
  0 siblings, 2 replies; 26+ messages in thread
From: Blaisorblade @ 2006-04-28 20:28 UTC (permalink / raw)
  To: user-mode-linux-devel
  Cc: Jeff Dike, linux-kernel, Heiko Carstens, Bodo Stroesser

On Wednesday 26 April 2006 17:46, Jeff Dike wrote:
> On Wed, Apr 26, 2006 at 05:47:54PM +0200, Blaisorblade wrote:

> Why not just zero out the bits that the kernel knows about?  Then, if
> we return -EINVAL, the process just looks at the remaining bits that
> are set to see what system calls the kernel didn't know about.

Good idea. When you're leaving the whole mask to 1 _except_ some bits set to 0 
what do you suggest? Setting everything to 1 so the process sees the invalid 
0 bits?

However, I've had a new idea for the API form - sigprocmask() is used to 
either enable or disable some bits in the _signal_ mask. But you pass in both 
cases the bits to toggle. Making the API more similar to this would be good.

Even if the semantics of both settings and clearing bits are unclear. 
Probably, simply making both calls _set_ the mask but one of them (i.e. 
MASK_DEFAULT_TRACE) reverse the mask before setting and after zero-extending 
it to the right.

Ok, this gives us a definite proposal, which I finally like:

* to exclude sys_tee:

bitmask = 0;
set_bit(__NR_tee, bitmask);
ptrace(PTRACE_SET_NOTRACE, bitmask);

* to trace only sys_tee:

bitmask = 0;
set_bit(__NR_tee, bitmask);
ptrace(PTRACE_SET_TRACEONLY, bitmask);

Semantics:

in both cases, the mask is first zero-extended to the right (for syscalls not 
known to userspace), bits for syscall not known to the kernel are checked and 
the call fails if any of them is 1, and in the failure case E2BIG or 
EOVERFLOW is returned (I want to avoid EINVAL and ENOSYS to avoid confusion) 
and the part of the mask known to the kernel is 0-ed.

In case of success, for NOTRACE (which was DEFAULT_TRACE) the mask is reversed 
before copying in the kernel syscall mask, for TRACEONLY it's copied there 
directly.
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade
Chiacchiera con i tuoi amici in tempo reale! 
 http://it.yahoo.com/mail_it/foot/*http://it.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-28 20:28         ` Blaisorblade
@ 2006-04-29  1:49           ` Jeff Dike
  2006-05-01 13:51             ` Daniel Jacobowitz
  2006-04-29  8:49           ` Heiko Carstens
  1 sibling, 1 reply; 26+ messages in thread
From: Jeff Dike @ 2006-04-29  1:49 UTC (permalink / raw)
  To: Blaisorblade
  Cc: user-mode-linux-devel, linux-kernel, Heiko Carstens,
	Bodo Stroesser

On Fri, Apr 28, 2006 at 10:28:46PM +0200, Blaisorblade wrote:
> Ok, this gives us a definite proposal, which I finally like:
> 
> * to exclude sys_tee:
> 
> bitmask = 0;
> set_bit(__NR_tee, bitmask);
> ptrace(PTRACE_SET_NOTRACE, bitmask);
> 
> * to trace only sys_tee:
> 
> bitmask = 0;
> set_bit(__NR_tee, bitmask);
> ptrace(PTRACE_SET_TRACEONLY, bitmask);

Yup, I like this.

> the call fails if any of them is 1, and in the failure case E2BIG or 
> EOVERFLOW is returned

strerror(E2BIG) is "Arg list too long"

strerror(EOVERFLOW) is "Value too large for defined data type"

Neither of those seems right.  I'd just as soon stick with -EINVAL.

				Jeff



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-29  1:49           ` Jeff Dike
@ 2006-05-01 13:51             ` Daniel Jacobowitz
  2006-05-01 13:45               ` Jeff Dike
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Jacobowitz @ 2006-05-01 13:51 UTC (permalink / raw)
  To: Jeff Dike
  Cc: Blaisorblade, user-mode-linux-devel, linux-kernel, Heiko Carstens,
	Bodo Stroesser

On Fri, Apr 28, 2006 at 09:49:56PM -0400, Jeff Dike wrote:
> On Fri, Apr 28, 2006 at 10:28:46PM +0200, Blaisorblade wrote:
> > Ok, this gives us a definite proposal, which I finally like:
> > 
> > * to exclude sys_tee:
> > 
> > bitmask = 0;
> > set_bit(__NR_tee, bitmask);
> > ptrace(PTRACE_SET_NOTRACE, bitmask);
> > 
> > * to trace only sys_tee:
> > 
> > bitmask = 0;
> > set_bit(__NR_tee, bitmask);
> > ptrace(PTRACE_SET_TRACEONLY, bitmask);
> 
> Yup, I like this.

I really recommend you not do this.  One (better) suggestion earlier
was:

struct {
  int bitmask_length;
  int flags;
  char bitmask[0];
};

The difference between this case and the sigprocmask example is that
the size of a sigset_t is very hard to change - it's a userspace ABI
break.  If you want to model it after sigprocmask, don't look at the
man page, which describes the POSIX function.  Look at the more recent
RT version of the syscall instead:

sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize)

Suppose the kernel knows about 32 more syscalls than userspace.  It's
going to read extra bits out of the bitmask that userspace didn't
initialize!

Also, if you store the mask with the child process, it risks surprising
existing tracers: attach, set mask, detach, then the next time someone
attaches an old version of strace some syscalls will be "hidden".

-- 
Daniel Jacobowitz
CodeSourcery

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-05-01 13:51             ` Daniel Jacobowitz
@ 2006-05-01 13:45               ` Jeff Dike
  2006-05-01 15:01                 ` Daniel Jacobowitz
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Dike @ 2006-05-01 13:45 UTC (permalink / raw)
  To: Blaisorblade, user-mode-linux-devel, linux-kernel, Heiko Carstens,
	Bodo Stroesser

On Mon, May 01, 2006 at 09:51:27AM -0400, Daniel Jacobowitz wrote:
> On Fri, Apr 28, 2006 at 09:49:56PM -0400, Jeff Dike wrote:
> > On Fri, Apr 28, 2006 at 10:28:46PM +0200, Blaisorblade wrote:
> > > bitmask = 0;
> > > set_bit(__NR_tee, bitmask);
> > > ptrace(PTRACE_SET_TRACEONLY, bitmask);
> > 
> > Yup, I like this.
> 
> I really recommend you not do this.  

> Suppose the kernel knows about 32 more syscalls than userspace.  It's
> going to read extra bits out of the bitmask that userspace didn't
> initialize!

The example above is a sketch, not a fully formed, compilable user.  Every
proposed interface has had the mask length passed in - in the case
above in the data argument.

> Also, if you store the mask with the child process, it risks surprising
> existing tracers: attach, set mask, detach, then the next time someone
> attaches an old version of strace some syscalls will be "hidden".

Not if the mask only survives for the duration of a PTRACE_ATTACH, and
the mask is released on PTRACE_DETACH.

				Jeff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-05-01 13:45               ` Jeff Dike
@ 2006-05-01 15:01                 ` Daniel Jacobowitz
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Jacobowitz @ 2006-05-01 15:01 UTC (permalink / raw)
  To: Jeff Dike
  Cc: Blaisorblade, user-mode-linux-devel, linux-kernel, Heiko Carstens,
	Bodo Stroesser

On Mon, May 01, 2006 at 09:45:52AM -0400, Jeff Dike wrote:
> The example above is a sketch, not a fully formed, compilable user.  Every
> proposed interface has had the mask length passed in - in the case
> above in the data argument.

Oh.  Well, then, I must have missed a message when I read the thread
this morning - sorry.  I'll watch for the next posting.

-- 
Daniel Jacobowitz
CodeSourcery

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-28 20:28         ` Blaisorblade
  2006-04-29  1:49           ` Jeff Dike
@ 2006-04-29  8:49           ` Heiko Carstens
  2006-05-01 17:02             ` Jeff Dike
  1 sibling, 1 reply; 26+ messages in thread
From: Heiko Carstens @ 2006-04-29  8:49 UTC (permalink / raw)
  To: Blaisorblade
  Cc: user-mode-linux-devel, Jeff Dike, linux-kernel, Bodo Stroesser

> Ok, this gives us a definite proposal, which I finally like:
> 
> * to exclude sys_tee:
> 
> bitmask = 0;
> set_bit(__NR_tee, bitmask);
> ptrace(PTRACE_SET_NOTRACE, bitmask);
> 
> * to trace only sys_tee:
> 
> bitmask = 0;
> set_bit(__NR_tee, bitmask);
> ptrace(PTRACE_SET_TRACEONLY, bitmask);
> 
> Semantics:
> 
> in both cases, the mask is first zero-extended to the right (for syscalls not 
> known to userspace), bits for syscall not known to the kernel are checked and 
> the call fails if any of them is 1, and in the failure case E2BIG or 
> EOVERFLOW is returned (I want to avoid EINVAL and ENOSYS to avoid confusion) 
> and the part of the mask known to the kernel is 0-ed.
> 
> In case of success, for NOTRACE (which was DEFAULT_TRACE) the mask is reversed 
> before copying in the kernel syscall mask, for TRACEONLY it's copied there 
> directly.

IMHO this is way too complicated. Introducing a ptrace call that returns
the number of syscalls and forcing user space to pass a complete bitmask
is much easier. Also the semantics are much easier to understand.

In addition your proposal would already introduce a rather complicated
interface to figure out how many syscalls the kernel has. I'm sure this
will be (mis)used sooner or later.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-04-29  8:49           ` Heiko Carstens
@ 2006-05-01 17:02             ` Jeff Dike
  2006-05-02  6:57               ` Heiko Carstens
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Dike @ 2006-05-01 17:02 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Blaisorblade, user-mode-linux-devel, Jeff Dike, linux-kernel,
	Bodo Stroesser

On Sat, Apr 29, 2006 at 10:49:07AM +0200, Heiko Carstens wrote:
> IMHO this is way too complicated. Introducing a ptrace call that returns
> the number of syscalls and forcing user space to pass a complete bitmask
> is much easier. Also the semantics are much easier to understand.

This sounds more complicated than what we are proposing.  

This would make the process care about the number of system calls
implemented by the kernel, which is something that doesn't even come
up in the normal case with the current interface.  You only care about
it if you get a -EINVAL and want to figure out exactly why.

>From a practical point of view, you would want code that looks like
this:
	n = nsyscalls();
	mask = malloc((n + 7)/8);
	if(mask == NULL)
		return;

	/* Zero mask, set bits, call ptrace */

	free(mask);

rather than code like this:

	int mask[(BIGGEST_SYSCALL_I_CARE_ABOUT + 7) / 8];

	/* Zero mask, set bits, call ptrace */

That doesn't seem like an improvement to me.

The second case would be more complicated if it wanted to figure out
what the problem was if ptrace returned -EINVAL.  However, some users
won't care, so that complexity is optional.  For example, UML will
already know by other means what system calls are implemented on the
host, so it won't bother looking at the mask in the case of a
failure.  I'm not sure what the right thing for strace is.

> In addition your proposal would already introduce a rather complicated
> interface to figure out how many syscalls the kernel has. I'm sure this
> will be (mis)used sooner or later.

How?  And, if so, why is that a problem?

There are already complicated ways to figure out what system calls the
kernel has, and I don't recall them causing problems.

				Jeff

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [uml-devel] [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK
  2006-05-01 17:02             ` Jeff Dike
@ 2006-05-02  6:57               ` Heiko Carstens
  0 siblings, 0 replies; 26+ messages in thread
From: Heiko Carstens @ 2006-05-02  6:57 UTC (permalink / raw)
  To: Jeff Dike
  Cc: Blaisorblade, user-mode-linux-devel, linux-kernel, Bodo Stroesser

> This sounds more complicated than what we are proposing.  
> 
> This would make the process care about the number of system calls
> implemented by the kernel, which is something that doesn't even come
> up in the normal case with the current interface.  You only care about
> it if you get a -EINVAL and want to figure out exactly why.
> 
> From a practical point of view, you would want code that looks like
> this:

Yes.

[1]
> 	n = nsyscalls();
> 	mask = malloc((n + 7)/8);
> 	if(mask == NULL)
> 		return;
> 
> 	/* Zero mask, set bits, call ptrace */
> 
> 	free(mask);
> 

[2]
> rather than code like this:
> 
> 	int mask[(BIGGEST_SYSCALL_I_CARE_ABOUT + 7) / 8];
> 
> 	/* Zero mask, set bits, call ptrace */


> That doesn't seem like an improvement to me.
> 
> The second case would be more complicated if it wanted to figure out
> what the problem was if ptrace returned -EINVAL.  However, some users

That is actually my point. If you're checking for errors you will end up
first doing [2] and later on doing something like [1] anyway...

> > In addition your proposal would already introduce a rather complicated
> > interface to figure out how many syscalls the kernel has. I'm sure this
> > will be (mis)used sooner or later.
> 
> How?  And, if so, why is that a problem?

>From the proposal:

<snip>
> Semantics:
>
> in both cases, the mask is first zero-extended to the right (for syscalls not
> known to userspace), bits for syscall not known to the kernel are checked and
> the call fails if any of them is 1, and in the failure case E2BIG or
> EOVERFLOW is returned (I want to avoid EINVAL and ENOSYS to avoid confusion)
> and the part of the mask known to the kernel is 0-ed.
<snip>

So you just need to pass a large enough bitmask with all ones and the kernel
will put zeroes in the bitmask up to the bit number NR_sycalls - 1.
Counting the zeroes should work...

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2006-05-02  6:57 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-04-13 17:20 [RFC] PATCH 3/4 - Time virtualization : PTRACE_SYSCALL_MASK Jeff Dike
2006-04-18 12:57 ` Pavel Machek
2006-04-26 18:38   ` Jeff Dike
2006-04-20  9:05 ` Heiko Carstens
2006-04-20 14:17   ` [uml-devel] " Bodo Stroesser
2006-04-25 18:32     ` Jeff Dike
2006-04-26 20:26     ` Charles P. Wright
2006-04-26 19:40       ` Jeff Dike
2006-04-26 21:29         ` Charles P. Wright
2006-04-21 18:16   ` Blaisorblade
2006-04-21 18:38     ` Blaisorblade
2006-04-22  7:06     ` Heiko Carstens
2006-04-22  8:32       ` Blaisorblade
2006-04-25 15:59       ` Jeff Dike
2006-04-21 18:34 ` [uml-devel] " Blaisorblade
2006-04-25 16:29   ` Jeff Dike
2006-04-26 15:47     ` Blaisorblade
2006-04-26 15:46       ` Jeff Dike
2006-04-28 20:28         ` Blaisorblade
2006-04-29  1:49           ` Jeff Dike
2006-05-01 13:51             ` Daniel Jacobowitz
2006-05-01 13:45               ` Jeff Dike
2006-05-01 15:01                 ` Daniel Jacobowitz
2006-04-29  8:49           ` Heiko Carstens
2006-05-01 17:02             ` Jeff Dike
2006-05-02  6:57               ` Heiko Carstens

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox