linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
       [not found] <20250113170925.GA392@strace.io>
@ 2025-01-13 17:12 ` Dmitry V. Levin
  2025-01-15 16:38   ` Oleg Nesterov
                     ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-13 17:12 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

PTRACE_SET_SYSCALL_INFO is a generic ptrace API that complements
PTRACE_GET_SYSCALL_INFO by letting the ptracer modify details of
system calls the tracee is blocked in.

This API allows ptracers to obtain and modify system call details
in a straightforward and architecture-agnostic way.

Current implementation supports changing only those bits of system call
information that are used by strace, namely, syscall number, syscall
arguments, and syscall return value.

Support of changing additional details returned by PTRACE_GET_SYSCALL_INFO,
such as instruction pointer and stack pointer, could be added later
if needed, by using struct ptrace_syscall_info.flags to specify
the additional details that should be set.  Currently, flags and reserved
fields of struct ptrace_syscall_info must be initialized with zeroes;
arch, instruction_pointer, and stack_pointer fields are ignored.

PTRACE_SET_SYSCALL_INFO currently supports only PTRACE_SYSCALL_INFO_ENTRY,
PTRACE_SYSCALL_INFO_EXIT, and PTRACE_SYSCALL_INFO_SECCOMP operations.
Other operations could be added later if needed.

Ideally, PTRACE_SET_SYSCALL_INFO should have been introduced along with
PTRACE_GET_SYSCALL_INFO, but it didn't happen.  The last straw that
convinced me to implement PTRACE_SET_SYSCALL_INFO was apparent failure
to provide an API of changing the first system call argument on riscv
architecture.

ptrace(2) man page:

long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data);
...
PTRACE_SET_SYSCALL_INFO
       Modify information about the system call that caused the stop.
       The "data" argument is a pointer to struct ptrace_syscall_info
       that specifies the system call information to be set.
       The "addr" argument should be set to sizeof(struct ptrace_syscall_info)).

Link: https://lore.kernel.org/all/59505464-c84a-403d-972f-d4b2055eeaac@gmail.com/
Signed-off-by: Dmitry V. Levin <ldv@strace.io>
---
 include/linux/ptrace.h      |  3 ++
 include/uapi/linux/ptrace.h |  4 +-
 kernel/ptrace.c             | 95 +++++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 90507d4afcd6..c8dbf1e498bf 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -17,6 +17,9 @@ struct syscall_info {
 	struct seccomp_data	data;
 };
 
+/* sizeof() the first published struct ptrace_syscall_info */
+#define PTRACE_SYSCALL_INFO_SIZE_VER0	84
+
 extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
 			    void *buf, int len, unsigned int gup_flags);
 
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index 72c038fc71d0..ca75b3ab5d22 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -74,6 +74,7 @@ struct seccomp_metadata {
 };
 
 #define PTRACE_GET_SYSCALL_INFO		0x420e
+#define PTRACE_SET_SYSCALL_INFO		0x4212
 #define PTRACE_SYSCALL_INFO_NONE	0
 #define PTRACE_SYSCALL_INFO_ENTRY	1
 #define PTRACE_SYSCALL_INFO_EXIT	2
@@ -81,7 +82,8 @@ struct seccomp_metadata {
 
 struct ptrace_syscall_info {
 	__u8 op;	/* PTRACE_SYSCALL_INFO_* */
-	__u8 pad[3];
+	__u8 reserved;
+	__u16 flags;
 	__u32 arch;
 	__u64 instruction_pointer;
 	__u64 stack_pointer;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 22e7d74cf4cd..41d37cb8f74a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1016,6 +1016,97 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
 	write_size = min(actual_size, user_size);
 	return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
 }
+
+static unsigned long
+ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+			      struct ptrace_syscall_info *info)
+{
+	unsigned long args[ARRAY_SIZE(info->entry.args)];
+	int nr = info->entry.nr;
+	int i;
+
+	if (nr != info->entry.nr)
+		return -ERANGE;
+
+	for (i = 0; i < ARRAY_SIZE(args); i++) {
+		args[i] = info->entry.args[i];
+		if (args[i] != info->entry.args[i])
+			return -ERANGE;
+	}
+
+	syscall_set_nr(child, regs, nr);
+	/*
+	 * If the syscall number is set to -1, setting syscall arguments is not
+	 * just pointless, it would also clobber the syscall return value on
+	 * those architectures that share the same register both for the first
+	 * argument of syscall and its return value.
+	 */
+	if (nr != -1)
+		syscall_set_arguments(child, regs, args);
+
+	return 0;
+}
+
+static unsigned long
+ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+				struct ptrace_syscall_info *info)
+{
+	/*
+	 * info->entry is currently a subset of info->seccomp,
+	 * info->seccomp.ret_data is currently ignored.
+	 */
+	return ptrace_set_syscall_info_entry(child, regs, info);
+}
+
+static unsigned long
+ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+			     struct ptrace_syscall_info *info)
+{
+	if (info->exit.is_error)
+		syscall_set_return_value(child, regs, info->exit.rval, 0);
+	else
+		syscall_set_return_value(child, regs, 0, info->exit.rval);
+
+	return 0;
+}
+
+static int
+ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
+			void __user *datavp)
+{
+	struct pt_regs *regs = task_pt_regs(child);
+	struct ptrace_syscall_info info;
+	int error;
+
+	BUILD_BUG_ON(sizeof(struct ptrace_syscall_info) < PTRACE_SYSCALL_INFO_SIZE_VER0);
+
+	if (user_size < PTRACE_SYSCALL_INFO_SIZE_VER0 || user_size > PAGE_SIZE)
+		return -EINVAL;
+
+	error = copy_struct_from_user(&info, sizeof(info), datavp, user_size);
+	if (error)
+		return error;
+
+	/* Reserved for future use. */
+	if (info.flags || info.reserved)
+		return -EINVAL;
+
+	/* Changing the type of the system call stop is not supported. */
+	if (ptrace_get_syscall_info_op(child) != info.op)
+		return -EINVAL;
+
+	switch (info.op) {
+	case PTRACE_SYSCALL_INFO_ENTRY:
+		return ptrace_set_syscall_info_entry(child, regs, &info);
+	case PTRACE_SYSCALL_INFO_EXIT:
+		return ptrace_set_syscall_info_exit(child, regs, &info);
+	case PTRACE_SYSCALL_INFO_SECCOMP:
+		return ptrace_set_syscall_info_seccomp(child, regs, &info);
+	default:
+		/* Other types of system call stops are not supported. */
+		return -EINVAL;
+	}
+}
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
 int ptrace_request(struct task_struct *child, long request,
@@ -1234,6 +1325,10 @@ int ptrace_request(struct task_struct *child, long request,
 	case PTRACE_GET_SYSCALL_INFO:
 		ret = ptrace_get_syscall_info(child, addr, datavp);
 		break;
+
+	case PTRACE_SET_SYSCALL_INFO:
+		ret = ptrace_set_syscall_info(child, addr, datavp);
+		break;
 #endif
 
 	case PTRACE_SECCOMP_GET_FILTER:
-- 
ldv

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-13 17:12 ` [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request Dmitry V. Levin
@ 2025-01-15 16:38   ` Oleg Nesterov
  2025-01-15 17:36     ` Dmitry V. Levin
  2025-01-16  1:55   ` Charlie Jenkins
  2025-01-16 15:21   ` Oleg Nesterov
  2 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2025-01-15 16:38 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

Dmitry,

I can't review the non-x86 changes in 1/7 - 4/7.

As for this and the previous patch I see nothing bad after a quick glance.

Just I have some concerns about the "future extensions", I'll write another
email tomorrow. In particualar, I personally hate the very idea of
copy_struct_from_user/check_zeroed_user ;)

On 01/13, Dmitry V. Levin wrote:
>
> +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
> +			      struct ptrace_syscall_info *info)
> +{
> +	unsigned long args[ARRAY_SIZE(info->entry.args)];
> +	int nr = info->entry.nr;
> +	int i;
> +
> +	if (nr != info->entry.nr)
> +		return -ERANGE;
> +
> +	for (i = 0; i < ARRAY_SIZE(args); i++) {
> +		args[i] = info->entry.args[i];
> +		if (args[i] != info->entry.args[i])
> +			return -ERANGE;
> +	}
> +
> +	syscall_set_nr(child, regs, nr);
> +	/*
> +	 * If the syscall number is set to -1, setting syscall arguments is not
> +	 * just pointless, it would also clobber the syscall return value on
> +	 * those architectures that share the same register both for the first
> +	 * argument of syscall and its return value.
> +	 */
> +	if (nr != -1)
> +		syscall_set_arguments(child, regs, args);

Thanks, much better than I tried to suggest in my reply to V1.

But may be

	if (syscall_get_nr() != -1)
		syscall_set_arguments(...);

will look a bit more consistent?

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-15 16:38   ` Oleg Nesterov
@ 2025-01-15 17:36     ` Dmitry V. Levin
  2025-01-15 19:10       ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-15 17:36 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Alexey Gladkov, Eugene Syromyatnikov, Mike Frysinger,
	Renzo Davoli, Davide Berardi, strace-devel, linux-kernel,
	linux-api

On Wed, Jan 15, 2025 at 05:38:09PM +0100, Oleg Nesterov wrote:
[...]
> > +	syscall_set_nr(child, regs, nr);
> > +	/*
> > +	 * If the syscall number is set to -1, setting syscall arguments is not
> > +	 * just pointless, it would also clobber the syscall return value on
> > +	 * those architectures that share the same register both for the first
> > +	 * argument of syscall and its return value.
> > +	 */
> > +	if (nr != -1)
> > +		syscall_set_arguments(child, regs, args);
> 
> Thanks, much better than I tried to suggest in my reply to V1.
> 
> But may be
> 
> 	if (syscall_get_nr() != -1)
> 		syscall_set_arguments(...);
> 
> will look a bit more consistent?

I'm sorry, but I didn't follow.  As we've just set the syscall number with
syscall_set_nr(), why would we want to call syscall_get_nr() right after
that to obtain the syscall number?


-- 
ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-15 17:36     ` Dmitry V. Levin
@ 2025-01-15 19:10       ` Oleg Nesterov
  0 siblings, 0 replies; 19+ messages in thread
From: Oleg Nesterov @ 2025-01-15 19:10 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Alexey Gladkov, Eugene Syromyatnikov, Mike Frysinger,
	Renzo Davoli, Davide Berardi, strace-devel, linux-kernel,
	linux-api

On 01/15, Dmitry V. Levin wrote:
>
> On Wed, Jan 15, 2025 at 05:38:09PM +0100, Oleg Nesterov wrote:
> >
> > But may be
> >
> > 	if (syscall_get_nr() != -1)
> > 		syscall_set_arguments(...);
> >
> > will look a bit more consistent?
>
> I'm sorry, but I didn't follow.  As we've just set the syscall number with
> syscall_set_nr(), why would we want to call syscall_get_nr() right after
> that to obtain the syscall number?

Mostly for grep. We have more syscall_get_nr() != -1 checks. Even right after
syscall_set_nr-like code, see putreg32().

I think this needs another helper (which can have more users) and some cleanups.

But this is another issue, so please forget. I agree that syscall_get_nr() in
this code will probably just add the unnecessary confusion.

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-13 17:12 ` [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request Dmitry V. Levin
  2025-01-15 16:38   ` Oleg Nesterov
@ 2025-01-16  1:55   ` Charlie Jenkins
  2025-01-16  8:33     ` Dmitry V. Levin
  2025-01-16 15:21   ` Oleg Nesterov
  2 siblings, 1 reply; 19+ messages in thread
From: Charlie Jenkins @ 2025-01-16  1:55 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Oleg Nesterov, Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Mon, Jan 13, 2025 at 07:12:08PM +0200, Dmitry V. Levin wrote:
> PTRACE_SET_SYSCALL_INFO is a generic ptrace API that complements
> PTRACE_GET_SYSCALL_INFO by letting the ptracer modify details of
> system calls the tracee is blocked in.
> 
> This API allows ptracers to obtain and modify system call details
> in a straightforward and architecture-agnostic way.
> 
> Current implementation supports changing only those bits of system call
> information that are used by strace, namely, syscall number, syscall
> arguments, and syscall return value.
> 
> Support of changing additional details returned by PTRACE_GET_SYSCALL_INFO,
> such as instruction pointer and stack pointer, could be added later
> if needed, by using struct ptrace_syscall_info.flags to specify
> the additional details that should be set.  Currently, flags and reserved
> fields of struct ptrace_syscall_info must be initialized with zeroes;
> arch, instruction_pointer, and stack_pointer fields are ignored.
> 
> PTRACE_SET_SYSCALL_INFO currently supports only PTRACE_SYSCALL_INFO_ENTRY,
> PTRACE_SYSCALL_INFO_EXIT, and PTRACE_SYSCALL_INFO_SECCOMP operations.
> Other operations could be added later if needed.
> 
> Ideally, PTRACE_SET_SYSCALL_INFO should have been introduced along with
> PTRACE_GET_SYSCALL_INFO, but it didn't happen.  The last straw that
> convinced me to implement PTRACE_SET_SYSCALL_INFO was apparent failure
> to provide an API of changing the first system call argument on riscv
> architecture.
> 
> ptrace(2) man page:
> 
> long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data);
> ...
> PTRACE_SET_SYSCALL_INFO
>        Modify information about the system call that caused the stop.
>        The "data" argument is a pointer to struct ptrace_syscall_info
>        that specifies the system call information to be set.
>        The "addr" argument should be set to sizeof(struct ptrace_syscall_info)).
> 
> Link: https://lore.kernel.org/all/59505464-c84a-403d-972f-d4b2055eeaac@gmail.com/
> Signed-off-by: Dmitry V. Levin <ldv@strace.io>
> ---
>  include/linux/ptrace.h      |  3 ++
>  include/uapi/linux/ptrace.h |  4 +-
>  kernel/ptrace.c             | 95 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 101 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
> index 90507d4afcd6..c8dbf1e498bf 100644
> --- a/include/linux/ptrace.h
> +++ b/include/linux/ptrace.h
> @@ -17,6 +17,9 @@ struct syscall_info {
>  	struct seccomp_data	data;
>  };
>  
> +/* sizeof() the first published struct ptrace_syscall_info */
> +#define PTRACE_SYSCALL_INFO_SIZE_VER0	84
> +
>  extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
>  			    void *buf, int len, unsigned int gup_flags);
>  
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index 72c038fc71d0..ca75b3ab5d22 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -74,6 +74,7 @@ struct seccomp_metadata {
>  };
>  
>  #define PTRACE_GET_SYSCALL_INFO		0x420e
> +#define PTRACE_SET_SYSCALL_INFO		0x4212
>  #define PTRACE_SYSCALL_INFO_NONE	0
>  #define PTRACE_SYSCALL_INFO_ENTRY	1
>  #define PTRACE_SYSCALL_INFO_EXIT	2
> @@ -81,7 +82,8 @@ struct seccomp_metadata {
>  
>  struct ptrace_syscall_info {
>  	__u8 op;	/* PTRACE_SYSCALL_INFO_* */
> -	__u8 pad[3];
> +	__u8 reserved;
> +	__u16 flags;
>  	__u32 arch;
>  	__u64 instruction_pointer;
>  	__u64 stack_pointer;
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 22e7d74cf4cd..41d37cb8f74a 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -1016,6 +1016,97 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
>  	write_size = min(actual_size, user_size);
>  	return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
>  }
> +
> +static unsigned long
> +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
> +			      struct ptrace_syscall_info *info)
> +{
> +	unsigned long args[ARRAY_SIZE(info->entry.args)];
> +	int nr = info->entry.nr;
> +	int i;
> +
> +	if (nr != info->entry.nr)
> +		return -ERANGE;
> +
> +	for (i = 0; i < ARRAY_SIZE(args); i++) {
> +		args[i] = info->entry.args[i];
> +		if (args[i] != info->entry.args[i])
> +			return -ERANGE;
> +	}
> +
> +	syscall_set_nr(child, regs, nr);
> +	/*
> +	 * If the syscall number is set to -1, setting syscall arguments is not
> +	 * just pointless, it would also clobber the syscall return value on
> +	 * those architectures that share the same register both for the first
> +	 * argument of syscall and its return value.
> +	 */
> +	if (nr != -1)
> +		syscall_set_arguments(child, regs, args);
> +
> +	return 0;
> +}
> +
> +static unsigned long
> +ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
> +				struct ptrace_syscall_info *info)
> +{
> +	/*
> +	 * info->entry is currently a subset of info->seccomp,
> +	 * info->seccomp.ret_data is currently ignored.
> +	 */
> +	return ptrace_set_syscall_info_entry(child, regs, info);
> +}
> +
> +static unsigned long
> +ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
> +			     struct ptrace_syscall_info *info)
> +{
> +	if (info->exit.is_error)
> +		syscall_set_return_value(child, regs, info->exit.rval, 0);
> +	else
> +		syscall_set_return_value(child, regs, 0, info->exit.rval);
> +
> +	return 0;
> +}
> +
> +static int
> +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
> +			void __user *datavp)
> +{
> +	struct pt_regs *regs = task_pt_regs(child);
> +	struct ptrace_syscall_info info;
> +	int error;
> +
> +	BUILD_BUG_ON(sizeof(struct ptrace_syscall_info) < PTRACE_SYSCALL_INFO_SIZE_VER0);
> +
> +	if (user_size < PTRACE_SYSCALL_INFO_SIZE_VER0 || user_size > PAGE_SIZE)
> +		return -EINVAL;
> +
> +	error = copy_struct_from_user(&info, sizeof(info), datavp, user_size);
> +	if (error)
> +		return error;
> +
> +	/* Reserved for future use. */
> +	if (info.flags || info.reserved)
> +		return -EINVAL;
> +
> +	/* Changing the type of the system call stop is not supported. */
> +	if (ptrace_get_syscall_info_op(child) != info.op)

Since this isn't supported anyway, would it make sense to set the
info.op to ptrace_get_syscall_info_op(child) like is done for
get_syscall_info? The usecase I see for this is simplifying when the
user doesn't call PTRACE_GET_SYSCALL_INFO before calling
PTRACE_SET_SYSCALL_INFO.

- Charlie

> +		return -EINVAL;
> +
> +	switch (info.op) {
> +	case PTRACE_SYSCALL_INFO_ENTRY:
> +		return ptrace_set_syscall_info_entry(child, regs, &info);
> +	case PTRACE_SYSCALL_INFO_EXIT:
> +		return ptrace_set_syscall_info_exit(child, regs, &info);
> +	case PTRACE_SYSCALL_INFO_SECCOMP:
> +		return ptrace_set_syscall_info_seccomp(child, regs, &info);
> +	default:
> +		/* Other types of system call stops are not supported. */
> +		return -EINVAL;
> +	}
> +}
>  #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
>  
>  int ptrace_request(struct task_struct *child, long request,
> @@ -1234,6 +1325,10 @@ int ptrace_request(struct task_struct *child, long request,
>  	case PTRACE_GET_SYSCALL_INFO:
>  		ret = ptrace_get_syscall_info(child, addr, datavp);
>  		break;
> +
> +	case PTRACE_SET_SYSCALL_INFO:
> +		ret = ptrace_set_syscall_info(child, addr, datavp);
> +		break;
>  #endif
>  
>  	case PTRACE_SECCOMP_GET_FILTER:
> -- 
> ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-16  1:55   ` Charlie Jenkins
@ 2025-01-16  8:33     ` Dmitry V. Levin
  2025-01-16 21:07       ` Charlie Jenkins
  0 siblings, 1 reply; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-16  8:33 UTC (permalink / raw)
  To: Charlie Jenkins
  Cc: Oleg Nesterov, Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Wed, Jan 15, 2025 at 05:55:31PM -0800, Charlie Jenkins wrote:
> On Mon, Jan 13, 2025 at 07:12:08PM +0200, Dmitry V. Levin wrote:
[...]
> > +	/* Changing the type of the system call stop is not supported. */
> > +	if (ptrace_get_syscall_info_op(child) != info.op)
> 
> Since this isn't supported anyway, would it make sense to set the
> info.op to ptrace_get_syscall_info_op(child) like is done for
> get_syscall_info? The usecase I see for this is simplifying when the
> user doesn't call PTRACE_GET_SYSCALL_INFO before calling
> PTRACE_SET_SYSCALL_INFO.

struct ptrace_syscall_info.op is a field that specifies how to interpret
the union fields of the structure, so if "op" is ignored, then the
kernel would infer the meaning of the structure specified by the userspace
tracer from the kernel state of the tracee.  This looks a bit too
error-prone to allow.  For example, nothing good is expected to happen
if syscall entry information is applied in a syscall exit stop.

The tracer is not obliged to call PTRACE_GET_SYSCALL_INFO to set
struct ptrace_syscall_info.op.  If the tracer keeps track of ptrace stops
by other means, it can assign the right value by itself.

And, btw, the comment should say "is not currently supported",
I'll update it in the next iteration.

An idea mentioned in prior discussions was that it would make sense to
specify syscall return value along with skipping the syscall in seccomp stop,
and this would require a different value for "op" field, but
I decided not to introduce this extra complexity yet.


-- 
ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-13 17:12 ` [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request Dmitry V. Levin
  2025-01-15 16:38   ` Oleg Nesterov
  2025-01-16  1:55   ` Charlie Jenkins
@ 2025-01-16 15:21   ` Oleg Nesterov
  2025-01-16 16:04     ` Dmitry V. Levin
  2 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2025-01-16 15:21 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On 01/13, Dmitry V. Levin wrote:
>
> +static int
> +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
> +			void __user *datavp)
> +{
> +	struct pt_regs *regs = task_pt_regs(child);
> +	struct ptrace_syscall_info info;
> +	int error;
> +
> +	BUILD_BUG_ON(sizeof(struct ptrace_syscall_info) < PTRACE_SYSCALL_INFO_SIZE_VER0);
> +
> +	if (user_size < PTRACE_SYSCALL_INFO_SIZE_VER0 || user_size > PAGE_SIZE)
> +		return -EINVAL;
> +
> +	error = copy_struct_from_user(&info, sizeof(info), datavp, user_size);

OK, I certainly can't understand why copy_struct_from_user/check_zeroed_user
is useful, at least in this case. In particular, this won't allow to run the
new code (which uses the "extended" ptrace_syscall_info) on the older kernels?

Can't we just use user_size as a version number?

We can also turn info->reserved into info->version filled by
ptrace_get_syscall_info().

ptrace_set_syscall_info() can check that info->version matches user_size.

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-16 15:21   ` Oleg Nesterov
@ 2025-01-16 16:04     ` Dmitry V. Levin
  2025-01-16 16:40       ` Dmitry V. Levin
  2025-01-17 14:45       ` Oleg Nesterov
  0 siblings, 2 replies; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-16 16:04 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Thu, Jan 16, 2025 at 04:21:38PM +0100, Oleg Nesterov wrote:
> On 01/13, Dmitry V. Levin wrote:
> >
> > +static int
> > +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
> > +			void __user *datavp)
> > +{
> > +	struct pt_regs *regs = task_pt_regs(child);
> > +	struct ptrace_syscall_info info;
> > +	int error;
> > +
> > +	BUILD_BUG_ON(sizeof(struct ptrace_syscall_info) < PTRACE_SYSCALL_INFO_SIZE_VER0);
> > +
> > +	if (user_size < PTRACE_SYSCALL_INFO_SIZE_VER0 || user_size > PAGE_SIZE)
> > +		return -EINVAL;
> > +
> > +	error = copy_struct_from_user(&info, sizeof(info), datavp, user_size);
> 
> OK, I certainly can't understand why copy_struct_from_user/check_zeroed_user
> is useful, at least in this case. In particular, this won't allow to run the
> new code (which uses the "extended" ptrace_syscall_info) on the older kernels?
> 
> Can't we just use user_size as a version number?
> 
> We can also turn info->reserved into info->version filled by
> ptrace_get_syscall_info().
> 
> ptrace_set_syscall_info() can check that info->version matches user_size.

The idea is to use "op" to specify the operation, and "flags" to specify
future extensions to the operation.  For example, we could later add
PTRACE_SYSCALL_INFO_SECCOMP_SKIP operation to specify an exit-like
data for seccomp stops, or some flag to set instruction_pointer or
stack_pointer.  I don't think any of these would require a version field,
though.

That is, the zero check implied by copy_struct_from_user() is not really
needed here since the compatibility is tracked by "op" and "flags":
if "op" and "flags" do not instruct the kernel to use these unknown
extra bits, the kernel is not obliged to check them either.
For the same reason I don't think the kernel is obliged to read more
than sizeof(info) from userspace.

What would you recommend using instead of copy_struct_from_user in this
case?


-- 
ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-16 16:04     ` Dmitry V. Levin
@ 2025-01-16 16:40       ` Dmitry V. Levin
  2025-01-17 14:45       ` Oleg Nesterov
  1 sibling, 0 replies; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-16 16:40 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Thu, Jan 16, 2025 at 06:04:03PM +0200, Dmitry V. Levin wrote:
> On Thu, Jan 16, 2025 at 04:21:38PM +0100, Oleg Nesterov wrote:
> > On 01/13, Dmitry V. Levin wrote:
> > >
> > > +static int
> > > +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
> > > +			void __user *datavp)
> > > +{
> > > +	struct pt_regs *regs = task_pt_regs(child);
> > > +	struct ptrace_syscall_info info;
> > > +	int error;
> > > +
> > > +	BUILD_BUG_ON(sizeof(struct ptrace_syscall_info) < PTRACE_SYSCALL_INFO_SIZE_VER0);
> > > +
> > > +	if (user_size < PTRACE_SYSCALL_INFO_SIZE_VER0 || user_size > PAGE_SIZE)
> > > +		return -EINVAL;
> > > +
> > > +	error = copy_struct_from_user(&info, sizeof(info), datavp, user_size);
> > 
> > OK, I certainly can't understand why copy_struct_from_user/check_zeroed_user
> > is useful, at least in this case. In particular, this won't allow to run the
> > new code (which uses the "extended" ptrace_syscall_info) on the older kernels?
> > 
> > Can't we just use user_size as a version number?
> > 
> > We can also turn info->reserved into info->version filled by
> > ptrace_get_syscall_info().
> > 
> > ptrace_set_syscall_info() can check that info->version matches user_size.
> 
> The idea is to use "op" to specify the operation, and "flags" to specify
> future extensions to the operation.  For example, we could later add
> PTRACE_SYSCALL_INFO_SECCOMP_SKIP operation to specify an exit-like
> data for seccomp stops, or some flag to set instruction_pointer or
> stack_pointer.  I don't think any of these would require a version field,
> though.
> 
> That is, the zero check implied by copy_struct_from_user() is not really
> needed here since the compatibility is tracked by "op" and "flags":
> if "op" and "flags" do not instruct the kernel to use these unknown
> extra bits, the kernel is not obliged to check them either.
> For the same reason I don't think the kernel is obliged to read more
> than sizeof(info) from userspace.
> 
> What would you recommend using instead of copy_struct_from_user in this
> case?

Something like this?

        if (user_size < PTRACE_SYSCALL_INFO_SIZE_VER0 || user_size > PAGE_SIZE)
                return -EINVAL;

        if (copy_from_user(&info, datavp, min(sizeof(info), user_size)))
                return -EFAULT;

        if (user_size < sizeof(info))
                memset((void *)&info + user_size, 0, sizeof(info) - user_size);

-- 
ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-16  8:33     ` Dmitry V. Levin
@ 2025-01-16 21:07       ` Charlie Jenkins
  2025-01-16 21:47         ` Charlie Jenkins
  0 siblings, 1 reply; 19+ messages in thread
From: Charlie Jenkins @ 2025-01-16 21:07 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Oleg Nesterov, Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Thu, Jan 16, 2025 at 10:33:28AM +0200, Dmitry V. Levin wrote:
> On Wed, Jan 15, 2025 at 05:55:31PM -0800, Charlie Jenkins wrote:
> > On Mon, Jan 13, 2025 at 07:12:08PM +0200, Dmitry V. Levin wrote:
> [...]
> > > +	/* Changing the type of the system call stop is not supported. */
> > > +	if (ptrace_get_syscall_info_op(child) != info.op)
> > 
> > Since this isn't supported anyway, would it make sense to set the
> > info.op to ptrace_get_syscall_info_op(child) like is done for
> > get_syscall_info? The usecase I see for this is simplifying when the
> > user doesn't call PTRACE_GET_SYSCALL_INFO before calling
> > PTRACE_SET_SYSCALL_INFO.
> 
> struct ptrace_syscall_info.op is a field that specifies how to interpret
> the union fields of the structure, so if "op" is ignored, then the
> kernel would infer the meaning of the structure specified by the userspace
> tracer from the kernel state of the tracee.  This looks a bit too
> error-prone to allow.  For example, nothing good is expected to happen
> if syscall entry information is applied in a syscall exit stop.

Yes that's a good point. 

> 
> The tracer is not obliged to call PTRACE_GET_SYSCALL_INFO to set
> struct ptrace_syscall_info.op.  If the tracer keeps track of ptrace stops
> by other means, it can assign the right value by itself.
>
> And, btw, the comment should say "is not currently supported",
> I'll update it in the next iteration.
> 
> An idea mentioned in prior discussions was that it would make sense to
> specify syscall return value along with skipping the syscall in seccomp stop,
> and this would require a different value for "op" field, but
> I decided not to introduce this extra complexity yet.

Makes sense, thank you!

- Charlie

> 
> 
> -- 
> ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-16 21:07       ` Charlie Jenkins
@ 2025-01-16 21:47         ` Charlie Jenkins
  0 siblings, 0 replies; 19+ messages in thread
From: Charlie Jenkins @ 2025-01-16 21:47 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Oleg Nesterov, Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, Celeste Liu, strace-devel, linux-kernel,
	linux-api

On Thu, Jan 16, 2025 at 01:07:59PM -0800, Charlie Jenkins wrote:
> On Thu, Jan 16, 2025 at 10:33:28AM +0200, Dmitry V. Levin wrote:
> > On Wed, Jan 15, 2025 at 05:55:31PM -0800, Charlie Jenkins wrote:
> > > On Mon, Jan 13, 2025 at 07:12:08PM +0200, Dmitry V. Levin wrote:
> > [...]
> > > > +	/* Changing the type of the system call stop is not supported. */
> > > > +	if (ptrace_get_syscall_info_op(child) != info.op)
> > > 
> > > Since this isn't supported anyway, would it make sense to set the
> > > info.op to ptrace_get_syscall_info_op(child) like is done for
> > > get_syscall_info? The usecase I see for this is simplifying when the
> > > user doesn't call PTRACE_GET_SYSCALL_INFO before calling
> > > PTRACE_SET_SYSCALL_INFO.
> > 
> > struct ptrace_syscall_info.op is a field that specifies how to interpret
> > the union fields of the structure, so if "op" is ignored, then the
> > kernel would infer the meaning of the structure specified by the userspace
> > tracer from the kernel state of the tracee.  This looks a bit too
> > error-prone to allow.  For example, nothing good is expected to happen
> > if syscall entry information is applied in a syscall exit stop.
> 
> Yes that's a good point. 
> 
> > 
> > The tracer is not obliged to call PTRACE_GET_SYSCALL_INFO to set
> > struct ptrace_syscall_info.op.  If the tracer keeps track of ptrace stops
> > by other means, it can assign the right value by itself.
> >
> > And, btw, the comment should say "is not currently supported",
> > I'll update it in the next iteration.
> > 
> > An idea mentioned in prior discussions was that it would make sense to
> > specify syscall return value along with skipping the syscall in seccomp stop,
> > and this would require a different value for "op" field, but
> > I decided not to introduce this extra complexity yet.
> 
> Makes sense, thank you!
> 
> - Charlie

I am no longer convinced that we need Celeste's patch that solves this
problem on riscv [1]. That patch is necessary without this change, but
PTRACE_SET_SYSCALL_INFO seems like a cleaner solution.

Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Tested-by: Charlie Jenkins <charlie@rivosinc.com>

- Charlie

[1] https://lore.kernel.org/lkml/20250115-13cc73c36c7bb3b9f046f614@orel/T/

> 
> > 
> > 
> > -- 
> > ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-16 16:04     ` Dmitry V. Levin
  2025-01-16 16:40       ` Dmitry V. Levin
@ 2025-01-17 14:45       ` Oleg Nesterov
  2025-01-17 15:06         ` Dmitry V. Levin
  1 sibling, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2025-01-17 14:45 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On 01/16, Dmitry V. Levin wrote:
>
> The idea is to use "op" to specify the operation, and "flags" to specify
> future extensions to the operation.

OK,

> That is, the zero check implied by copy_struct_from_user() is not really
> needed here since the compatibility is tracked by "op" and "flags":

OK, but then why this patch uses copy_struct_from_user() ?

Why can't we simply do

	if (user_size != PTRACE_SYSCALL_INFO_SIZE_VER0)
		return -EINVAL;

	if (copy_from_user(..., user_size))
		return EFAULT;

now, until we add the extensions ?

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-17 14:45       ` Oleg Nesterov
@ 2025-01-17 15:06         ` Dmitry V. Levin
  2025-01-17 15:32           ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-17 15:06 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Fri, Jan 17, 2025 at 03:45:57PM +0100, Oleg Nesterov wrote:
> On 01/16, Dmitry V. Levin wrote:
> >
> > The idea is to use "op" to specify the operation, and "flags" to specify
> > future extensions to the operation.
> 
> OK,
> 
> > That is, the zero check implied by copy_struct_from_user() is not really
> > needed here since the compatibility is tracked by "op" and "flags":
> 
> OK, but then why this patch uses copy_struct_from_user() ?
> 
> Why can't we simply do
> 
> 	if (user_size != PTRACE_SYSCALL_INFO_SIZE_VER0)
> 		return -EINVAL;
> 
> 	if (copy_from_user(..., user_size))
> 		return EFAULT;
> 
> now, until we add the extensions ?

We should accept larger user_size from the very beginning, so that in case
the structure grows in the future, the userspace that sicks to the current
set of supported features would be still able to work with older kernels.

I think we can do the following:

        /*
         * ptrace_syscall_info.seccomp is the largest member in the union,
         * and ret_data is the last field there.
         * min_size can be less than sizeof(info) due to alignment.
         */
        size_t min_size = offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
        size_t copy_size = min(sizeof(info), user_size);

        if (copy_size < min_size)
                return -EINVAL;

        if (copy_from_user(&info, datavp, copy_size))
                return -EFAULT;

We cannot just use sizeof(info) because it depends on the alignment of
__u64.  Also, I don't think we need to fill with zeroes the trailing
padding bytes of the structure as we are not going to use them in any way.


-- 
ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-17 15:06         ` Dmitry V. Levin
@ 2025-01-17 15:32           ` Oleg Nesterov
  2025-01-17 16:22             ` Dmitry V. Levin
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2025-01-17 15:32 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

Dmitry,

You certainly understand the user-space needs much better than me.
I am just trying to understand your point.

On 01/17, Dmitry V. Levin wrote:
>
> We should accept larger user_size from the very beginning, so that in case
> the structure grows in the future, the userspace that sicks to the current
> set of supported features would be still able to work with older kernels.

This is what I can't understand, perhaps I have a blind spot here ;)

Could you provide an example (even absolutely artificial) of possible extension
which can help me to understand?

> We cannot just use sizeof(info) because it depends on the alignment of
> __u64.

Hmm why? I thought that the kernel already depends on the "natural" alignment?
And if we can't, then PTRACE_SYSCALL_INFO_SIZE_VER0 added by this patch makes
no sense?

Sorry I guess I must have missed something, I am sick today.

> Also, I don't think we need to fill with zeroes the trailing
> padding bytes of the structure as we are not going to use them in any way.

At least we seem to agree here ;)

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-17 15:32           ` Oleg Nesterov
@ 2025-01-17 16:22             ` Dmitry V. Levin
  2025-01-18 14:13               ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-17 16:22 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Fri, Jan 17, 2025 at 04:32:59PM +0100, Oleg Nesterov wrote:
> Dmitry,
> 
> You certainly understand the user-space needs much better than me.
> I am just trying to understand your point.
> 
> On 01/17, Dmitry V. Levin wrote:
> >
> > We should accept larger user_size from the very beginning, so that in case
> > the structure grows in the future, the userspace that sicks to the current
> > set of supported features would be still able to work with older kernels.
> 
> This is what I can't understand, perhaps I have a blind spot here ;)
> 
> Could you provide an example (even absolutely artificial) of possible extension
> which can help me to understand?

An absolutely artificial example: let's say we're adding an optional 
64-bit field "artificial" to ptrace_syscall_info.seccomp, this means
sizeof(ptrace_syscall_info) grows by 8 bytes.  When userspace wants
to set this optional field, it sets a bit in ptrace_syscall_info.flags,
this tells the kernel to look into this new "artificial" field.
When userspace is not interested in setting new optional fields,
it just keeps ptrace_syscall_info.flags == 0.  Remember, however, that
by adding the new optional field sizeof(ptrace_syscall_info) grew by 8 bytes.

What we need is to make sure that an older kernel that has no idea of this
new field would still accept the bigger size, so that userspace would be
able to continue doing its
	ptrace(PTRACE_SET_SYSCALL_INFO, pid, sizeof(info), &info)
despite of potential growth of sizeof(info) until it actually starts using
new optional fields.

> > We cannot just use sizeof(info) because it depends on the alignment of
> > __u64.
> 
> Hmm why? I thought that the kernel already depends on the "natural" alignment?
> And if we can't, then PTRACE_SYSCALL_INFO_SIZE_VER0 added by this patch makes
> no sense?

struct ptrace_syscall_info has members of type __u64, and it currently
ends with "__u32 ret_data".  So depending on the alignment, the structure
either has extra 4 trailing padding bytes, or it doesn't.

For example, on x86_64 sizeof(struct ptrace_syscall_info) is currently 88,
while on x86 it is 84.


-- 
ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-17 16:22             ` Dmitry V. Levin
@ 2025-01-18 14:13               ` Oleg Nesterov
  2025-01-19 12:44                 ` Dmitry V. Levin
  2025-01-19 14:38                 ` Aleksa Sarai
  0 siblings, 2 replies; 19+ messages in thread
From: Oleg Nesterov @ 2025-01-18 14:13 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On 01/17, Dmitry V. Levin wrote:
>

(reordered)

> struct ptrace_syscall_info has members of type __u64, and it currently
> ends with "__u32 ret_data".  So depending on the alignment, the structure
> either has extra 4 trailing padding bytes, or it doesn't.

Ah, I didn't realize that the last member is __u32, so I completely
misunderstood your "it depends on the alignment of __u64" note.

> For example, on x86_64 sizeof(struct ptrace_syscall_info) is currently 88,
> while on x86 it is 84.

Not good, but too late to complain...

OK, I see your point now and I won't argue with approach you outlined in your
previous email

        size_t min_size = offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
        size_t copy_size = min(sizeof(info), user_size);

        if (copy_size < min_size)
                return -EINVAL;

        if (copy_from_user(&info, datavp, copy_size))
                return -EFAULT;

-------------------------------------------------------------------------------
Thats said... Can't resist,

> An absolutely artificial example: let's say we're adding an optional
> 64-bit field "artificial" to ptrace_syscall_info.seccomp, this means
> sizeof(ptrace_syscall_info) grows by 8 bytes.  When userspace wants
> to set this optional field, it sets a bit in ptrace_syscall_info.flags,
> this tells the kernel to look into this new "artificial" field.
> When userspace is not interested in setting new optional fields,
> it just keeps ptrace_syscall_info.flags == 0.  Remember, however, that
> by adding the new optional field sizeof(ptrace_syscall_info) grew by 8 bytes.
>
> What we need is to make sure that an older kernel that has no idea of this
> new field would still accept the bigger size, so that userspace would be
> able to continue doing its
> 	ptrace(PTRACE_SET_SYSCALL_INFO, pid, sizeof(info), &info)
> despite of potential growth of sizeof(info) until it actually starts using
> new optional fields.

This is clear, but personally I don't really like this pattern... Consider

	void set_syscall_info(int unlikely_condition)
	{
		struct ptrace_syscall_info info;

		fill_info(&info);
		if (unlikely_condition) {
			info.flags = USE_ARTIFICIAL;
			info.artificial = 1;
		}

		assert(ptrace(PTRACE_SET_SYSCALL_INFO, sizeof(info), &info) == 0);
	}

Now this application (running on the older kernel) can fail or not, depending
on "unlikely_condition". To me it would be better to always fail in this case.

That is why I tried to suggest to use "user_size" as a version number.
Currently we have PTRACE_SYSCALL_INFO_SIZE_VER0, when we add the new
"artificial" member we will have PTRACE_SYSCALL_INFO_SIZE_VER1. Granted,
this way set_syscall_info() can't use sizeof(info), it should do

	ptrace(PTRACE_SET_SYSCALL_INFO, PTRACE_SYSCALL_INFO_SIZE_VER1, info);

and the kernel needs more checks, but this is what I had in mind when I said
that the 1st version can just require "user_size == PTRACE_SYSCALL_INFO_SIZE_VER0".

But I won't insist, I do not pretend I understand the user-space needs.

Thanks!

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-18 14:13               ` Oleg Nesterov
@ 2025-01-19 12:44                 ` Dmitry V. Levin
  2025-01-20 19:56                   ` Oleg Nesterov
  2025-01-19 14:38                 ` Aleksa Sarai
  1 sibling, 1 reply; 19+ messages in thread
From: Dmitry V. Levin @ 2025-01-19 12:44 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On Sat, Jan 18, 2025 at 03:13:42PM +0100, Oleg Nesterov wrote:
> On 01/17, Dmitry V. Levin wrote:
[...]
> > For example, on x86_64 sizeof(struct ptrace_syscall_info) is currently 88,
> > while on x86 it is 84.
> 
> Not good, but too late to complain...

Actually, I don't think it's too late to add an extra __u32 padding
there since it wouldn't affect PTRACE_GET_SYSCALL_INFO.

I can add an explicit padding to the structure if you say
you like it better this way.

[...]
> Thats said... Can't resist,
> 
> > An absolutely artificial example: let's say we're adding an optional
> > 64-bit field "artificial" to ptrace_syscall_info.seccomp, this means
> > sizeof(ptrace_syscall_info) grows by 8 bytes.  When userspace wants
> > to set this optional field, it sets a bit in ptrace_syscall_info.flags,
> > this tells the kernel to look into this new "artificial" field.
> > When userspace is not interested in setting new optional fields,
> > it just keeps ptrace_syscall_info.flags == 0.  Remember, however, that
> > by adding the new optional field sizeof(ptrace_syscall_info) grew by 8 bytes.
> >
> > What we need is to make sure that an older kernel that has no idea of this
> > new field would still accept the bigger size, so that userspace would be
> > able to continue doing its
> > 	ptrace(PTRACE_SET_SYSCALL_INFO, pid, sizeof(info), &info)
> > despite of potential growth of sizeof(info) until it actually starts using
> > new optional fields.
> 
> This is clear, but personally I don't really like this pattern... Consider
> 
> 	void set_syscall_info(int unlikely_condition)
> 	{
> 		struct ptrace_syscall_info info;
> 
> 		fill_info(&info);
> 		if (unlikely_condition) {
> 			info.flags = USE_ARTIFICIAL;
> 			info.artificial = 1;
> 		}
> 
> 		assert(ptrace(PTRACE_SET_SYSCALL_INFO, sizeof(info), &info) == 0);
> 	}
> 
> Now this application (running on the older kernel) can fail or not, depending
> on "unlikely_condition". To me it would be better to always fail in this case.

In practice, user-space programs rarely have the luxury to assume that
some new kernel API is available.  For example, strace still performs a
runtime check for PTRACE_GET_SYSCALL_INFO (introduced more than 5 years
ago) and falls back to pre-PTRACE_GET_SYSCALL_INFO interfaces when the
kernel lacks support.  Consequently, user-space programs would have to
keep track of PTRACE_SET_SYSCALL_INFO interfaces supported by the kernel,
so ...

> That is why I tried to suggest to use "user_size" as a version number.
> Currently we have PTRACE_SYSCALL_INFO_SIZE_VER0, when we add the new
> "artificial" member we will have PTRACE_SYSCALL_INFO_SIZE_VER1. Granted,
> this way set_syscall_info() can't use sizeof(info), it should do
> 
> 	ptrace(PTRACE_SET_SYSCALL_INFO, PTRACE_SYSCALL_INFO_SIZE_VER1, info);
> 
> and the kernel needs more checks, but this is what I had in mind when I said
> that the 1st version can just require "user_size == PTRACE_SYSCALL_INFO_SIZE_VER0".

... it wouldn't be a big deal for user-space to specify also an
appropriate "user_size", e.g. PTRACE_SYSCALL_INFO_SIZE_VER1 when it starts
using the interface available since VER1, but it wouldn't help user-space
programs either as they would have to update "op" and/or "flags" anyway,
and "user_size" would become just yet another detail they have to care
about.

At the same time, "flags" is needed anyway because the most likely
extension of PTRACE_SET_SYSCALL_INFO would be support of setting some
fields that are present in the structure already, e.g.
instruction_pointer.


-- 
ldv

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-18 14:13               ` Oleg Nesterov
  2025-01-19 12:44                 ` Dmitry V. Levin
@ 2025-01-19 14:38                 ` Aleksa Sarai
  1 sibling, 0 replies; 19+ messages in thread
From: Aleksa Sarai @ 2025-01-19 14:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Dmitry V. Levin, Eugene Syromyatnikov, Mike Frysinger,
	Renzo Davoli, Davide Berardi, strace-devel, linux-kernel,
	linux-api

[-- Attachment #1: Type: text/plain, Size: 4362 bytes --]

On 2025-01-18, Oleg Nesterov <oleg@redhat.com> wrote:
> On 01/17, Dmitry V. Levin wrote:
> >
> 
> (reordered)
> 
> > struct ptrace_syscall_info has members of type __u64, and it currently
> > ends with "__u32 ret_data".  So depending on the alignment, the structure
> > either has extra 4 trailing padding bytes, or it doesn't.
> 
> Ah, I didn't realize that the last member is __u32, so I completely
> misunderstood your "it depends on the alignment of __u64" note.
> 
> > For example, on x86_64 sizeof(struct ptrace_syscall_info) is currently 88,
> > while on x86 it is 84.
> 
> Not good, but too late to complain...
> 
> OK, I see your point now and I won't argue with approach you outlined in your
> previous email
> 
>         size_t min_size = offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
>         size_t copy_size = min(sizeof(info), user_size);
> 
>         if (copy_size < min_size)
>                 return -EINVAL;
> 
>         if (copy_from_user(&info, datavp, copy_size))
>                 return -EFAULT;
> 
> -------------------------------------------------------------------------------
> Thats said... Can't resist,
> 
> > An absolutely artificial example: let's say we're adding an optional
> > 64-bit field "artificial" to ptrace_syscall_info.seccomp, this means
> > sizeof(ptrace_syscall_info) grows by 8 bytes.  When userspace wants
> > to set this optional field, it sets a bit in ptrace_syscall_info.flags,
> > this tells the kernel to look into this new "artificial" field.
> > When userspace is not interested in setting new optional fields,
> > it just keeps ptrace_syscall_info.flags == 0.  Remember, however, that
> > by adding the new optional field sizeof(ptrace_syscall_info) grew by 8 bytes.
> >
> > What we need is to make sure that an older kernel that has no idea of this
> > new field would still accept the bigger size, so that userspace would be
> > able to continue doing its
> > 	ptrace(PTRACE_SET_SYSCALL_INFO, pid, sizeof(info), &info)
> > despite of potential growth of sizeof(info) until it actually starts using
> > new optional fields.
> 
> This is clear, but personally I don't really like this pattern... Consider
> 
> 	void set_syscall_info(int unlikely_condition)
> 	{
> 		struct ptrace_syscall_info info;
> 
> 		fill_info(&info);
> 		if (unlikely_condition) {
> 			info.flags = USE_ARTIFICIAL;
> 			info.artificial = 1;
> 		}
> 
> 		assert(ptrace(PTRACE_SET_SYSCALL_INFO, sizeof(info), &info) == 0);
> 	}
> 
> Now this application (running on the older kernel) can fail or not, depending
> on "unlikely_condition". To me it would be better to always fail in this case.
> 
> That is why I tried to suggest to use "user_size" as a version number.
> Currently we have PTRACE_SYSCALL_INFO_SIZE_VER0, when we add the new
> "artificial" member we will have PTRACE_SYSCALL_INFO_SIZE_VER1. Granted,
> this way set_syscall_info() can't use sizeof(info), it should do

user_size *is* a version number, it's just that copy_struct_from_user()
allows programs built with newer headers to run on older kernels (if
they don't use the new features). The alternative is that programs that
build with a newer set of kernel headers will implicitly have a larger
ptrace_syscall_info struct, which will cause them to start failing after
the binary is rebuilt.

*Strictly speaking* this wouldn't be a kernel regression (because it's a
new binary, the old binary would still work), but the risk of these
kinds of APIs being incredibly fragile is the reason why I went with the
check_zeroed_user() approach in copy_struct_from_user().

(I haven't looked at the details of this patchset, this is just a
general comment about copy_struct_from_user() and why this feature is
useful to userspace programs. Not all APIs need the extensibility of
copy_struct_from_user().)

> 	ptrace(PTRACE_SET_SYSCALL_INFO, PTRACE_SYSCALL_INFO_SIZE_VER1, info);
> 
> and the kernel needs more checks, but this is what I had in mind when I said
> that the 1st version can just require "user_size == PTRACE_SYSCALL_INFO_SIZE_VER0".
> 
> But I won't insist, I do not pretend I understand the user-space needs.
> 
> Thanks!
> 
> Oleg.
> 
> 

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request
  2025-01-19 12:44                 ` Dmitry V. Levin
@ 2025-01-20 19:56                   ` Oleg Nesterov
  0 siblings, 0 replies; 19+ messages in thread
From: Oleg Nesterov @ 2025-01-20 19:56 UTC (permalink / raw)
  To: Dmitry V. Levin
  Cc: Eugene Syromyatnikov, Mike Frysinger, Renzo Davoli,
	Davide Berardi, strace-devel, linux-kernel, linux-api

On 01/19, Dmitry V. Levin wrote:
>
> On Sat, Jan 18, 2025 at 03:13:42PM +0100, Oleg Nesterov wrote:
> > On 01/17, Dmitry V. Levin wrote:
> [...]
> > > For example, on x86_64 sizeof(struct ptrace_syscall_info) is currently 88,
> > > while on x86 it is 84.
> >
> > Not good, but too late to complain...
>
> Actually, I don't think it's too late to add an extra __u32 padding
> there since it wouldn't affect PTRACE_GET_SYSCALL_INFO.

Hmm, indeed thanks for correcting me. I forgot that ptrace_get_syscall_info()
returns actual_size, not sizeof().

> I can add an explicit padding to the structure if you say
> you like it better this way.

I dunno, up to you...

Well if we add "__u32 padding" at the end, we can probably use sizeof(info)
instead of min_size = offsetofend(struct ptrace_syscall_info, seccomp.ret_data)
in ptrace_set_syscall_info(), but then it probably makes sense to check
info->padding == 0 (just like info.flags || info.reserved) and rename this
member to reserved2.

Again, up to you, I don't know.

> > Currently we have PTRACE_SYSCALL_INFO_SIZE_VER0, when we add the new
> > "artificial" member we will have PTRACE_SYSCALL_INFO_SIZE_VER1. Granted,
> > this way set_syscall_info() can't use sizeof(info), it should do
> >
> > 	ptrace(PTRACE_SET_SYSCALL_INFO, PTRACE_SYSCALL_INFO_SIZE_VER1, info);
> >
> > and the kernel needs more checks, but this is what I had in mind when I said
> > that the 1st version can just require "user_size == PTRACE_SYSCALL_INFO_SIZE_VER0".
>
> ... it wouldn't be a big deal for user-space to specify also an
> appropriate "user_size", e.g. PTRACE_SYSCALL_INFO_SIZE_VER1 when it starts
> using the interface available since VER1, but it wouldn't help user-space
> programs either as they would have to update "op" and/or "flags" anyway,

Sure, and yes, "flags" is needed anyway.

> and "user_size" would become just yet another detail they have to care
> about.

True.

It is not that I ever thought that my suggestion could "help user-space".
Not at all. Just imo it would be better to fail "early" on the older kernel
in the case when user-space expects the "extended" API, even if flags == 0.
And no, it is not that I am 100% sure it would be always better.

So let me repeat: please do what you think is right, I won't argue. I just
tried to understand your points and explain mine to ensure we more or less
understand each other.

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2025-01-20 19:57 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20250113170925.GA392@strace.io>
2025-01-13 17:12 ` [PATCH v2 6/7] ptrace: introduce PTRACE_SET_SYSCALL_INFO request Dmitry V. Levin
2025-01-15 16:38   ` Oleg Nesterov
2025-01-15 17:36     ` Dmitry V. Levin
2025-01-15 19:10       ` Oleg Nesterov
2025-01-16  1:55   ` Charlie Jenkins
2025-01-16  8:33     ` Dmitry V. Levin
2025-01-16 21:07       ` Charlie Jenkins
2025-01-16 21:47         ` Charlie Jenkins
2025-01-16 15:21   ` Oleg Nesterov
2025-01-16 16:04     ` Dmitry V. Levin
2025-01-16 16:40       ` Dmitry V. Levin
2025-01-17 14:45       ` Oleg Nesterov
2025-01-17 15:06         ` Dmitry V. Levin
2025-01-17 15:32           ` Oleg Nesterov
2025-01-17 16:22             ` Dmitry V. Levin
2025-01-18 14:13               ` Oleg Nesterov
2025-01-19 12:44                 ` Dmitry V. Levin
2025-01-20 19:56                   ` Oleg Nesterov
2025-01-19 14:38                 ` Aleksa Sarai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).