Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH for 5.2 04/12] rseq/selftests: Use __rseq_handled symbol to coexist with glibc
From: Mathieu Desnoyers @ 2019-04-29 15:27 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

In order to integrate rseq into user-space applications, expose a
__rseq_handled symbol so that multiple rseq users can be linked into the same
application (e.g. librseq and glibc).

The __rseq_refcount TLS variable is static to the librseq library. It
ensures that rseq syscall registration/unregistration happens only for
the earliest/last caller of rseq_{,un}register_current_thread for each
thread, thus ensuring that rseq is registered across the lifetime of all
rseq users for a given thread.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Shuah Khan <shuah@kernel.org>
CC: Carlos O'Donell <carlos@redhat.com>
CC: Florian Weimer <fweimer@redhat.com>
CC: Joseph Myers <joseph@codesourcery.com>
CC: Szabolcs Nagy <szabolcs.nagy@arm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ben Maurer <bmaurer@fb.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Paul Turner <pjt@google.com>
CC: linux-api@vger.kernel.org
---
 tools/testing/selftests/rseq/rseq.c | 55 +++++++++++++++++++++++++++++++------
 tools/testing/selftests/rseq/rseq.h |  1 +
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 4847e97ed049..7159eb777fd3 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -25,18 +25,27 @@
 #include <syscall.h>
 #include <assert.h>
 #include <signal.h>
+#include <limits.h>
 
 #include "rseq.h"
 
 #define ARRAY_SIZE(arr)	(sizeof(arr) / sizeof((arr)[0]))
 
-__attribute__((tls_model("initial-exec"))) __thread
-volatile struct rseq __rseq_abi = {
+__thread volatile struct rseq __rseq_abi = {
 	.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
 };
 
-static __attribute__((tls_model("initial-exec"))) __thread
-volatile int refcount;
+/*
+ * Shared with other libraries. This library may take rseq ownership if it is
+ * still 0 when executing the library constructor. Set to 1 by library
+ * constructor when handling rseq. Set to 0 in destructor if handling rseq.
+ */
+int __rseq_handled;
+
+/* Whether this library has ownership of rseq registration. */
+static int rseq_ownership;
+
+static __thread volatile uint32_t __rseq_refcount;
 
 static void signal_off_save(sigset_t *oldset)
 {
@@ -69,8 +78,14 @@ int rseq_register_current_thread(void)
 	int rc, ret = 0;
 	sigset_t oldset;
 
+	if (!rseq_ownership)
+		return 0;
 	signal_off_save(&oldset);
-	if (refcount++)
+	if (__rseq_refcount == UINT_MAX) {
+		ret = -1;
+		goto end;
+	}
+	if (__rseq_refcount++)
 		goto end;
 	rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG);
 	if (!rc) {
@@ -78,9 +93,9 @@ int rseq_register_current_thread(void)
 		goto end;
 	}
 	if (errno != EBUSY)
-		__rseq_abi.cpu_id = -2;
+		__rseq_abi.cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED;
 	ret = -1;
-	refcount--;
+	__rseq_refcount--;
 end:
 	signal_restore(oldset);
 	return ret;
@@ -91,13 +106,20 @@ int rseq_unregister_current_thread(void)
 	int rc, ret = 0;
 	sigset_t oldset;
 
+	if (!rseq_ownership)
+		return 0;
 	signal_off_save(&oldset);
-	if (--refcount)
+	if (!__rseq_refcount) {
+		ret = -1;
+		goto end;
+	}
+	if (--__rseq_refcount)
 		goto end;
 	rc = sys_rseq(&__rseq_abi, sizeof(struct rseq),
 		      RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
 	if (!rc)
 		goto end;
+	__rseq_refcount = 1;
 	ret = -1;
 end:
 	signal_restore(oldset);
@@ -115,3 +137,20 @@ int32_t rseq_fallback_current_cpu(void)
 	}
 	return cpu;
 }
+
+void __attribute__((constructor)) rseq_init(void)
+{
+	/* Check whether rseq is handled by another library. */
+	if (__rseq_handled)
+		return;
+	__rseq_handled = 1;
+	rseq_ownership = 1;
+}
+
+void __attribute__((destructor)) rseq_fini(void)
+{
+	if (!rseq_ownership)
+		return;
+	__rseq_handled = 0;
+	rseq_ownership = 0;
+}
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index c72eb70f9b52..26348e2c44f3 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -45,6 +45,7 @@
 #endif
 
 extern __thread volatile struct rseq __rseq_abi;
+extern int __rseq_handled;
 
 #define rseq_likely(x)		__builtin_expect(!!(x), 1)
 #define rseq_unlikely(x)	__builtin_expect(!!(x), 0)
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 05/12] rseq/selftests: s390: use jg instruction for jumps outside of the asm
From: Mathieu Desnoyers @ 2019-04-29 15:27 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

The branch target range of the "j" instruction is 64K, which is not
enough for the general case.

Suggested-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Joel Fernandes <joelaf@google.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Shuah Khan <shuah@kernel.org>
CC: Andi Kleen <andi@firstfloor.org>
CC: linux-kselftest@vger.kernel.org
CC: "H . Peter Anvin" <hpa@zytor.com>
CC: Chris Lameter <cl@linux.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
CC: Paul Turner <pjt@google.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ben Maurer <bmaurer@fb.com>
CC: linux-api@vger.kernel.org
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/rseq/rseq-s390.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index fbb97815d71c..7c4f3a70b6c7 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -117,14 +117,14 @@ do {									\
 		".long " __rseq_str(RSEQ_SIG) "\n\t"			\
 		__rseq_str(label) ":\n\t"				\
 		teardown						\
-		"j %l[" __rseq_str(abort_label) "]\n\t"			\
+		"jg %l[" __rseq_str(abort_label) "]\n\t"		\
 		".popsection\n\t"
 
 #define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label)		\
 		".pushsection __rseq_failure, \"ax\"\n\t"		\
 		__rseq_str(label) ":\n\t"				\
 		teardown						\
-		"j %l[" __rseq_str(cmpfail_label) "]\n\t"		\
+		"jg %l[" __rseq_str(cmpfail_label) "]\n\t"		\
 		".popsection\n\t"
 
 static inline __attribute__((always_inline))
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 06/12] rseq/selftests: x86: use ud1 instruction as RSEQ_SIG opcode
From: Mathieu Desnoyers @ 2019-04-29 15:27 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

Use ud1 as the guard instruction for the restartable sequence abort
handler. Its benefit compared to nopl is to trap execution if the
program ends up trying to execute it by mistake, which makes debugging
easier.

The 4-byte signature per se is unchanged (it is the instruction
operand). Only the opcode is changed from nopl to ud1.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Joel Fernandes <joelaf@google.com>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Shuah Khan <shuah@kernel.org>
CC: Andi Kleen <andi@firstfloor.org>
CC: linux-kselftest@vger.kernel.org
CC: "H . Peter Anvin" <hpa@zytor.com>
CC: Chris Lameter <cl@linux.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
CC: Paul Turner <pjt@google.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ben Maurer <bmaurer@fb.com>
CC: linux-api@vger.kernel.org
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/rseq/rseq-x86.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index 03095236f6fa..b2da6004fe30 100644
--- a/tools/testing/selftests/rseq/rseq-x86.h
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -7,6 +7,13 @@
 
 #include <stdint.h>
 
+/*
+ * RSEQ_SIG is used with the following reserved undefined instructions, which
+ * trap in user-space:
+ *
+ * x86-32:    0f b9 3d 53 30 05 53      ud1    0x53053053,%edi
+ * x86-64:    0f b9 3d 53 30 05 53      ud1    0x53053053(%rip),%edi
+ */
 #define RSEQ_SIG	0x53053053
 
 /*
@@ -88,8 +95,8 @@ do {									\
 
 #define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label)		\
 		".pushsection __rseq_failure, \"ax\"\n\t"		\
-		/* Disassembler-friendly signature: nopl <sig>(%rip). */\
-		".byte 0x0f, 0x1f, 0x05\n\t"				\
+		/* Disassembler-friendly signature: ud1 <sig>(%rip),%edi. */ \
+		".byte 0x0f, 0xb9, 0x3d\n\t"				\
 		".long " __rseq_str(RSEQ_SIG) "\n\t"			\
 		__rseq_str(label) ":\n\t"				\
 		teardown						\
@@ -609,8 +616,8 @@ do {									\
 
 #define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label)		\
 		".pushsection __rseq_failure, \"ax\"\n\t"		\
-		/* Disassembler-friendly signature: nopl <sig>. */	\
-		".byte 0x0f, 0x1f, 0x05\n\t"				\
+		/* Disassembler-friendly signature: ud1 <sig>,%edi. */	\
+		".byte 0x0f, 0xb9, 0x3d\n\t"				\
 		".long " __rseq_str(RSEQ_SIG) "\n\t"			\
 		__rseq_str(label) ":\n\t"				\
 		teardown						\
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 07/12] rseq/selftests: s390: use trap4 for RSEQ_SIG
From: Mathieu Desnoyers @ 2019-04-29 15:27 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

Use trap4 as the guard instruction for the restartable sequence abort
handler.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
 tools/testing/selftests/rseq/rseq-s390.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index 7c4f3a70b6c7..1d05c5187ae6 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -1,6 +1,13 @@
 /* SPDX-License-Identifier: LGPL-2.1 OR MIT */
 
-#define RSEQ_SIG	0x53053053
+/*
+ * RSEQ_SIG uses the trap4 instruction. As Linux does not make use of the
+ * access-register mode nor the linkage stack this instruction will always
+ * cause a special-operation exception (the trap-enabled bit in the DUCT
+ * is and will stay 0). The instruction pattern is
+ *	b2 ff 0f ff	trap4   4095(%r0)
+ */
+#define RSEQ_SIG	0xB2FF0FFF
 
 #define rseq_smp_mb()	__asm__ __volatile__ ("bcr 15,0" ::: "memory")
 #define rseq_smp_rmb()	rseq_smp_mb()
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 08/12] rseq/selftests: arm: use udf instruction for RSEQ_SIG
From: Mathieu Desnoyers @ 2019-04-29 15:27 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

Use udf as the guard instruction for the restartable sequence abort
handler.

Previously, the chosen signature was not a valid instruction, based
on the assumption that it could always sit in a literal pool. However,
there are compilation environments in which literal pools are not
available, for instance execute-only code. Therefore, we need to
choose a signature value that is also a valid instruction.

Handle compiling with -mbig-endian on ARMv6+, which generates binaries
with mixed code vs data endianness (little endian code, big endian
data).

Otherwise, a mismatch between code endianness for the generated signatures
and data endianness for the RSEQ_SIG parameter passed to the rseq
registration will trigger application segmentation faults when the
kernel tries to abort rseq critical sections.

Prior to ARMv6, -mbig-endian generates big-endian code and data, so
endianness should not be reversed in that case.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Joel Fernandes <joelaf@google.com>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Shuah Khan <shuah@kernel.org>
CC: Andi Kleen <andi@firstfloor.org>
CC: linux-kselftest@vger.kernel.org
CC: "H . Peter Anvin" <hpa@zytor.com>
CC: Chris Lameter <cl@linux.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
CC: Paul Turner <pjt@google.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ben Maurer <bmaurer@fb.com>
CC: linux-api@vger.kernel.org
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/rseq/rseq-arm.h | 52 +++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 5f262c54364f..e8ccfc37d685 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -5,7 +5,54 @@
  * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
-#define RSEQ_SIG	0x53053053
+/*
+ * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
+ * value 0x5de3. This traps if user-space reaches this instruction by mistake,
+ * and the uncommon operand ensures the kernel does not move the instruction
+ * pointer to attacker-controlled code on rseq abort.
+ *
+ * The instruction pattern in the A32 instruction set is:
+ *
+ * e7f5def3    udf    #24035    ; 0x5de3
+ *
+ * This translates to the following instruction pattern in the T16 instruction
+ * set:
+ *
+ * little endian:
+ * def3        udf    #243      ; 0xf3
+ * e7f5        b.n    <7f5>
+ *
+ * pre-ARMv6 big endian code:
+ * e7f5        b.n    <7f5>
+ * def3        udf    #243      ; 0xf3
+ *
+ * ARMv6+ -mbig-endian generates mixed endianness code vs data: little-endian
+ * code and big-endian data. Ensure the RSEQ_SIG data signature matches code
+ * endianness. Prior to ARMv6, -mbig-endian generates big-endian code and data
+ * (which match), so there is no need to reverse the endianness of the data
+ * representation of the signature. However, the choice between BE32 and BE8
+ * is done by the linker, so we cannot know whether code and data endianness
+ * will be mixed before the linker is invoked.
+ */
+
+#define RSEQ_SIG_CODE	0xe7f5def3
+
+#ifndef __ASSEMBLER__
+
+#define RSEQ_SIG_DATA							\
+	({								\
+		int sig;						\
+		asm volatile (  "b 2f\n\t"				\
+				"1: .inst " __rseq_str(RSEQ_SIG_CODE) "\n\t" \
+				"2:\n\t"				\
+				"ldr %[sig], 1b\n\t"			\
+				: [sig] "=r" (sig));			\
+		sig;							\
+	})
+
+#define RSEQ_SIG	RSEQ_SIG_DATA
+
+#endif
 
 #define rseq_smp_mb()	__asm__ __volatile__ ("dmb" ::: "memory", "cc")
 #define rseq_smp_rmb()	__asm__ __volatile__ ("dmb" ::: "memory", "cc")
@@ -78,7 +125,8 @@ do {									\
 		__rseq_str(table_label) ":\n\t"				\
 		".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
 		".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
-		".word " __rseq_str(RSEQ_SIG) "\n\t"			\
+		".arm\n\t"						\
+		".inst " __rseq_str(RSEQ_SIG_CODE) "\n\t"		\
 		__rseq_str(label) ":\n\t"				\
 		teardown						\
 		"b %l[" __rseq_str(abort_label) "]\n\t"
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 09/12] rseq/selftests: aarch64 code signature: handle big-endian environment
From: Mathieu Desnoyers @ 2019-04-29 15:28 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

Handle compiling with -mbig-endian on aarch64, which generates binaries
with mixed code vs data endianness (little endian code, big endian
data).

Otherwise, a mismatch between code endianness for the generated signatures
and data endianness for the RSEQ_SIG parameter passed to the rseq
registration will trigger application segmentation faults when the
kernel tries to abort rseq critical sections.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Will Deacon <will.deacon@arm.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Joel Fernandes <joelaf@google.com>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Shuah Khan <shuah@kernel.org>
CC: Andi Kleen <andi@firstfloor.org>
CC: linux-kselftest@vger.kernel.org
CC: "H . Peter Anvin" <hpa@zytor.com>
CC: Chris Lameter <cl@linux.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
CC: Paul Turner <pjt@google.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ben Maurer <bmaurer@fb.com>
CC: linux-api@vger.kernel.org
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/rseq/rseq-arm64.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index b41a2a48e965..200dae9e4208 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -6,7 +6,20 @@
  * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
  */
 
-#define RSEQ_SIG	0xd428bc00	/* BRK #0x45E0 */
+/*
+ * aarch64 -mbig-endian generates mixed endianness code vs data:
+ * little-endian code and big-endian data. Ensure the RSEQ_SIG signature
+ * matches code endianness.
+ */
+#define RSEQ_SIG_CODE	0xd428bc00	/* BRK #0x45E0.  */
+
+#ifdef __AARCH64EB__
+#define RSEQ_SIG_DATA	0x00bc28d4	/* BRK #0x45E0.  */
+#else
+#define RSEQ_SIG_DATA	RSEQ_SIG_CODE
+#endif
+
+#define RSEQ_SIG	RSEQ_SIG_DATA
 
 #define rseq_smp_mb()	__asm__ __volatile__ ("dmb ish" ::: "memory")
 #define rseq_smp_rmb()	__asm__ __volatile__ ("dmb ishld" ::: "memory")
@@ -121,7 +134,7 @@ do {										\
 
 #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
 	"	b	222f\n"							\
-	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
+	"	.inst 	"	__rseq_str(RSEQ_SIG_CODE) "\n"			\
 	__rseq_str(label) ":\n"							\
 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
 	"222:\n"
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 10/12] rseq/selftests: powerpc code signature: generate valid instructions
From: Mathieu Desnoyers @ 2019-04-29 15:28 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

Use "twui" as the guard instruction for the restartable sequence abort
handler.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Alan Modra <amodra@gmail.com>
CC: linuxppc-dev@lists.ozlabs.org
---
 tools/testing/selftests/rseq/rseq-ppc.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rseq/rseq-ppc.h b/tools/testing/selftests/rseq/rseq-ppc.h
index 9df18487fa9f..76be90196fe4 100644
--- a/tools/testing/selftests/rseq/rseq-ppc.h
+++ b/tools/testing/selftests/rseq/rseq-ppc.h
@@ -6,7 +6,15 @@
  * (C) Copyright 2016-2018 - Boqun Feng <boqun.feng@gmail.com>
  */
 
-#define RSEQ_SIG	0x53053053
+/*
+ * RSEQ_SIG is used with the following trap instruction:
+ *
+ * powerpc-be:    0f e5 00 0b           twui   r5,11
+ * powerpc64-le:  0b 00 e5 0f           twui   r5,11
+ * powerpc64-be:  0f e5 00 0b           twui   r5,11
+ */
+
+#define RSEQ_SIG	0x0fe5000b
 
 #define rseq_smp_mb()		__asm__ __volatile__ ("sync"	::: "memory", "cc")
 #define rseq_smp_lwsync()	__asm__ __volatile__ ("lwsync"	::: "memory", "cc")
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 11/12] rseq/selftests: mips: use break instruction for RSEQ_SIG
From: Mathieu Desnoyers @ 2019-04-29 15:28 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

Use break as guard instruction for the restartable sequence abort
handler.

Previously, the chosen signature was simply data, based on the
assumption that it could always sit in a literal pool. However,
some compilation environments favor disabling literal pools. Therefore,
ensure the signature is a valid uncommon trap instruction.

Suggested-by: Paul Burton <paul.burton@mips.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Paul Burton <paul.burton@mips.com>
CC: James Hogan <jhogan@kernel.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
---
 tools/testing/selftests/rseq/rseq-mips.h | 34 +++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h
index fe3eabcdcbe5..e989e7c14b09 100644
--- a/tools/testing/selftests/rseq/rseq-mips.h
+++ b/tools/testing/selftests/rseq/rseq-mips.h
@@ -7,7 +7,39 @@
  * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
-#define RSEQ_SIG	0x53053053
+/*
+ * RSEQ_SIG uses the break instruction. The instruction pattern is:
+ *
+ * On MIPS:
+ *	0350000d        break     0x350
+ *
+ * On nanoMIPS:
+ *      00100350        break     0x350
+ *
+ * On microMIPS:
+ *      0000d407        break     0x350
+ *
+ * For nanoMIPS32 and microMIPS, the instruction stream is encoded as 16-bit
+ * halfwords, so the signature halfwords need to be swapped accordingly for
+ * little-endian.
+ */
+#if defined(__nanomips__)
+# ifdef __MIPSEL__
+#  define RSEQ_SIG	0x03500010
+# else
+#  define RSEQ_SIG	0x00100350
+# endif
+#elif defined(__mips_micromips)
+# ifdef __MIPSEL__
+#  define RSEQ_SIG	0xd4070000
+# else
+#  define RSEQ_SIG	0x0000d407
+# endif
+#elif defined(__mips__)
+# define RSEQ_SIG	0x0350000d
+#else
+/* Unknown MIPS architecture. */
+#endif
 
 #define rseq_smp_mb()	__asm__ __volatile__ ("sync" ::: "memory")
 #define rseq_smp_rmb()	rseq_smp_mb()
-- 
2.11.0

^ permalink raw reply related

* [PATCH for 5.2 12/12] rseq/selftests: add -no-integrated-as for clang
From: Mathieu Desnoyers @ 2019-04-29 15:28 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

Ongoing work for asm goto support from clang requires the
-no-integrated-as compiler flag.

This compiler flag is present in the toplevel kernel Makefile,
but is not replicated for selftests. Add it specifically for
the rseq selftest which requires asm goto.

Link: https://reviews.llvm.org/D56571
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Nick Desaulniers <ndesaulniers@google.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Joel Fernandes <joelaf@google.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Shuah Khan <shuah@kernel.org>
CC: Andi Kleen <andi@firstfloor.org>
CC: linux-kselftest@vger.kernel.org
CC: "H . Peter Anvin" <hpa@zytor.com>
CC: Chris Lameter <cl@linux.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
CC: Paul Turner <pjt@google.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ben Maurer <bmaurer@fb.com>
CC: linux-api@vger.kernel.org
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/rseq/Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index c30c52e1d0d2..d6469535630a 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -1,5 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0+ OR MIT
-CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./
+
+ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
+CLANG_FLAGS += -no-integrated-as
+endif
+
+CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./ \
+	  $(CLANG_FLAGS)
 LDLIBS += -lpthread
 
 # Own dependencies because we only want to build against 1st prerequisite, but
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH v3] moduleparam: Save information about built-in modules in separate file
From: Alexey Gladkov @ 2019-04-29 15:35 UTC (permalink / raw)
  To: Masahiro Yamada
  Cc: Linux Kernel Mailing List, Linux Kbuild mailing list, linux-api,
	linux-modules, Kirill A . Shutemov, Gleb Fotengauer-Malinovskiy,
	Dmitry V. Levin, Michal Marek, Dmitry Torokhov, Rusty Russell,
	Jessica Yu, Lucas De Marchi
In-Reply-To: <CAK7LNARpeSTr=VWudDRQ8sobcPQqtqzcLm7EqyvoKFYT84Hk6Q@mail.gmail.com>

On Tue, Apr 30, 2019 at 12:08:44AM +0900, Masahiro Yamada wrote:
> >  modules.builtin
> > +modules.builtin.modinfo
> >
> >  #
> >  # Top-level generic files
> 
> 
> Let me repeat the same comments as in v2
> (https://patchwork.kernel.org/patch/10888207/#22595563)
> as you ignored them.

I missed it. Sorry about that.

> > diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
> > index c8cf45362bd6..41ef7cb043c1 100755
> > --- a/scripts/link-vmlinux.sh
> > +++ b/scripts/link-vmlinux.sh
> > @@ -226,6 +226,10 @@ modpost_link vmlinux.o
> >  # modpost vmlinux.o to check for section mismatches
> >  ${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o
> >
> > +info MODINFO modules.builtin.modinfo
> > +"${OBJCOPY}" -j .modinfo -O binary vmlinux.o modules.builtin.modinfo
> > +chmod 444 modules.builtin.modinfo
> 
> 
> Why is this 'chmod 444' necessary?

I just wanted to show that this file will never change.
I will remove this line.

-- 
Rgrds, legion

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Serge E. Hallyn @ 2019-04-29 15:49 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult
  Cc: Serge E. Hallyn, Christian Brauner, torvalds, viro, jannh,
	dhowells, linux-api, linux-kernel, luto, arnd, ebiederm, keescook,
	tglx, mtk.manpages, akpm, oleg, cyphar, joel, dancol
In-Reply-To: <000a64d6-1e22-21bf-f232-15f141092e44@metux.net>

On Tue, Apr 16, 2019 at 08:32:50PM +0200, Enrico Weigelt, metux IT consult wrote:

(Sorry for the late reply, I had missed this one)

> On 15.04.19 17:50, Serge E. Hallyn wrote:
> 
> Hi,
> 
> >> I'm working on implementing plan9-like fs namespaces, where unprivileged
> >> processes can change their own namespace at will. For that, certain
> > Is there any place where we can see previous discussion about this?
> Yes, lkml and containers list.
> It's been stalled for a few months, as I'm too busy w/ other things.
> 
> > If you have to disable suid anyway, then is there any reason why the
> > existing ability to do this in a private user namespace, with only
> > your own uid mapped (which you can do without any privilege) does not
> > suffice?  That was actually one of the main design goals of user
> > namespaces, to be able to clone(CLONE_NEWUSER), map your current uid,
> > then clone(CLONE_NEWNS) and bind mount at will.
> Well, it's not that easy ... maybe I should explain a bit more about how
> Plan9 works, and how I intend to map it into Linux:
> 
> * on plan9, anybody can alter his own fs namespace (bind and mount), as
>   well as spawning new ones
> * basically anything is coming from some fileserver - even devices
>   (eg. there is no such thing like device nodes)
> * access control is done by the individual fileservers, based on the
>   initial authentication (on connecting to the server, before mounting)

yes, so far I'm aware of this,

> * all users are equal - no root at all. the only exception is the
>   initial process, which gets the kernel devices mounted into his
>   namespace.

This does not match my understanding, but I'm most likely wrong.  (I thought
there was an actual 'host owner' uid, which mostly is only used for initial
process, but is basically root with a different name, and used far less.  No
uid transitions without factotum so that it *looked* like no root user).

> What I'd like to achieve on Linux:
> 
> * unprivileged users can have their own mount namespace, where they
>   can mount at will (maybe just 9P).

No problem, you can do that now.

> * but they still appear as the same normal users to the rest of the
>   system

No problem, you can do that now.

> * 9p programs (compiled for Linux ABI) can run parallel to traditional
>   linux programs within the same user and sessions (eg. from a terminal,
>   i can call both the same way)
> * namespace modifications affect both equally (eg. I could run ff in
>   an own ns)

affect both of what equally?

> * these namespaces exist as long as there's one process alive in here

That's sort of how it is now, except you can also pin the namespaces
with their fds.

> * creating a new ns can be done by unprivileged user

That's true now.

>  One of the things to make this work (w/o introducing a massive security
> hole) is disable suid for those processes (actually, one day i'd like to
> get rid of it completely, but that's another story).

That's exactly what user namespaces are for.  You can create a new
user namespace, using no privilege at all, with your current uid (i.e.
1000) mapped to whatever uid you like; if you pick 0, then you can unshare all
the namespaces you like.  Once you unshare mnt_ns, you can mount to your
heart's content.  To other processes on the host, your process is
uid 1000.  Host uid 0 is not mapped into your ns, so you cannot exploit
suid to host root.

Regarding factotum, I agree that with the pidfd work going on etc, it's getting
more and more tempting to attempt a switch to that.  Looking back at my folder,
I see you posted a kernel patch for it.  I had done the same long ago.  Happy to
work with you again on that, and put a simple daemon into shadow package, if
util-linux isn't deemed the far better place.

-serge

^ permalink raw reply

* Re: [PATCH] fanotify: Make wait for permission events interruptible
From: Orion Poplawski @ 2019-04-29 16:05 UTC (permalink / raw)
  To: Jan Kara, linux-fsdevel
  Cc: Vivek Trivedi, Amir Goldstein, linux-api, LKML, Eric W. Biederman
In-Reply-To: <20190415095907.GA14466@quack2.suse.cz>

[-- Attachment #1: Type: text/plain, Size: 1920 bytes --]

On 4/15/19 3:59 AM, Jan Kara wrote:
> On Thu 21-03-19 16:11:42, Jan Kara wrote:
>> Switch waiting for response to fanotify permission events interruptible.
>> This allows e.g. the system to be suspended while there are some
>> fanotify permission events pending (which is reportedly pretty common
>> when for example AV solution is in use). However just making the wait
>> interruptible can result in e.g. open(2) returning -EINTR where
>> previously such error code never happened in practice. To avoid
>> confusion of userspace due to this error code, return -ERESTARTNOINTR
>> instead.
>>
>> Signed-off-by: Jan Kara <jack@suse.cz>
>> ---
>>  fs/notify/fanotify/fanotify.c | 11 +++++++++--
>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> Orion, can you give this patch some testing with your usecase? Also if anybody
>> sees any issue with returning -ERESTARTNOINTR I have missed, please speak up.
> 
> Ping Orion? Did you have any chance to give this patch a try? Does it fix
> hibernation issues you observe without causing issues with bash and other
> programs? I'd like to queue this patch for the coming merge window but
> I'd like to see some testing results showing that it actually helps
> anything... Thanks!
> 
> 								Honza


I've been running it for a while with mostly promising results but one
concern.  Notably, when running in conjunction with BitDefender Anti-Virus I
have noticed issues when cloning large git projects (seems to be a good stress
test on open()).  I believe the problems go away when BD is stopped.  At this
point I'm not sure if the issue lies more with BD or with the kernel patch.


-- 
Orion Poplawski
Manager of NWRA Technical Systems          720-772-5637
NWRA, Boulder/CoRA Office             FAX: 303-415-9702
3380 Mitchell Lane                       orion@nwra.com
Boulder, CO 80301                 https://www.nwra.com/


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 3799 bytes --]

^ permalink raw reply

* [PATCH v4] moduleparam: Save information about built-in modules in separate file
From: Alexey Gladkov @ 2019-04-29 16:11 UTC (permalink / raw)
  To: Masahiro Yamada
  Cc: Linux Kernel Mailing List, Linux Kbuild mailing list, linux-api,
	linux-modules, Kirill A . Shutemov, Gleb Fotengauer-Malinovskiy,
	Dmitry V. Levin, Michal Marek, Dmitry Torokhov, Rusty Russell,
	Jessica Yu, Lucas De Marchi

Problem:

When a kernel module is compiled as a separate module, some important
information about the kernel module is available via .modinfo section of
the module.  In contrast, when the kernel module is compiled into the
kernel, that information is not available.

Information about built-in modules is necessary in the following cases:

1. When it is necessary to find out what additional parameters can be
passed to the kernel at boot time.

2. When you need to know which module names and their aliases are in
the kernel. This is very useful for creating an initrd image.

Proposal:

The proposed patch does not remove .modinfo section with module
information from the vmlinux at the build time and saves it into a
separate file after kernel linking. So, the kernel does not increase in
size and no additional information remains in it. Information is stored
in the same format as in the separate modules (null-terminated string
array). Because the .modinfo section is already exported with separate
modules, we are not creating a new API.

It can be easily read in the userspace:

$ tr '\0' '\n' < modules.builtin.modinfo
ext4.softdep=pre: crc32c
ext4.license=GPL
ext4.description=Fourth Extended Filesystem
ext4.author=Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others
ext4.alias=fs-ext4
ext4.alias=ext3
ext4.alias=fs-ext3
ext4.alias=ext2
ext4.alias=fs-ext2
md_mod.alias=block-major-9-*
md_mod.alias=md
md_mod.description=MD RAID framework
md_mod.license=GPL
md_mod.parmtype=create_on_open:bool
md_mod.parmtype=start_dirty_degraded:int
...

v2:
 * Extract modinfo from vmlinux.o as suggested by Masahiro Yamada;
 * Rename output file to kernel.builtin;
 * Add MODULE_VERSION to modinfo that is saved to the kernel.builtin;
 * Fix build warnings on powerpc.

v3:
 * Rename output file to modules.builtin.modinfo as suggested by Masahiro Yamada;
 * Update Documentation/dontdiff, Documentation/kbuild/kbuild.txt.

Co-Developed-by: Gleb Fotengauer-Malinovskiy <glebfm@altlinux.org>
Signed-off-by: Gleb Fotengauer-Malinovskiy <glebfm@altlinux.org>
Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
Acked-by: Jessica Yu <jeyu@kernel.org>
---
 .gitignore                        |  1 +
 Documentation/dontdiff            |  1 +
 Documentation/kbuild/kbuild.txt   |  5 +++++
 Makefile                          |  2 ++
 include/asm-generic/vmlinux.lds.h |  1 +
 include/linux/module.h            |  1 +
 include/linux/moduleparam.h       | 12 +++++-------
 scripts/link-vmlinux.sh           |  3 +++
 8 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index a20ac26aa2f5..91711164cd0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,7 @@ modules.builtin
 /vmlinuz
 /System.map
 /Module.markers
+/modules.builtin.modinfo
 
 #
 # RPM spec file (make rpm-pkg)
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 2228fcc8e29f..3d4d5a402b8b 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -179,6 +179,7 @@ mktables
 mktree
 modpost
 modules.builtin
+modules.builtin.modinfo
 modules.order
 modversions.h*
 nconf
diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt
index c9e3d93e7a89..63e118c8c837 100644
--- a/Documentation/kbuild/kbuild.txt
+++ b/Documentation/kbuild/kbuild.txt
@@ -11,6 +11,11 @@ modules.builtin
 This file lists all modules that are built into the kernel. This is used
 by modprobe to not fail when trying to load something builtin.
 
+modules.builtin.modinfo
+--------------------------------------------------
+This file contains modinfo from all modules that are built into the kernel.
+Unlike modinfo of a separate module, all fields are prefixed with module name.
+
 
 Environment variables
 
diff --git a/Makefile b/Makefile
index d5713e7b1e50..5e6657fcf08c 100644
--- a/Makefile
+++ b/Makefile
@@ -1288,6 +1288,7 @@ _modinst_:
 	fi
 	@cp -f $(objtree)/modules.order $(MODLIB)/
 	@cp -f $(objtree)/modules.builtin $(MODLIB)/
+	@cp -f $(objtree)/modules.builtin.modinfo $(MODLIB)/
 	$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
 
 # This depmod is only for convenience to give the initial
@@ -1328,6 +1329,7 @@ endif # CONFIG_MODULES
 
 # Directories & files removed with 'make clean'
 CLEAN_DIRS  += $(MODVERDIR) include/ksym
+CLEAN_FILES += modules.builtin.modinfo
 
 # Directories & files removed with 'make mrproper'
 MRPROPER_DIRS  += include/config usr/include include/generated          \
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 3d7a6a9c2370..44c724bf7d3a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -844,6 +844,7 @@
 	EXIT_CALL							\
 	*(.discard)							\
 	*(.discard.*)							\
+	*(.modinfo)							\
 	}
 
 /**
diff --git a/include/linux/module.h b/include/linux/module.h
index f5bc4c046461..1cae28b1172a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -237,6 +237,7 @@ extern typeof(name) __mod_##type##__##name##_device_table		\
 #define MODULE_VERSION(_version) MODULE_INFO(version, _version)
 #else
 #define MODULE_VERSION(_version)					\
+	MODULE_INFO(version, _version);					\
 	static struct module_version_attribute ___modver_attr = {	\
 		.mattr	= {						\
 			.attr	= {					\
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index ba36506db4fb..5ba250d9172a 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -10,23 +10,21 @@
    module name. */
 #ifdef MODULE
 #define MODULE_PARAM_PREFIX /* empty */
+#define __MODULE_INFO_PREFIX /* empty */
 #else
 #define MODULE_PARAM_PREFIX KBUILD_MODNAME "."
+/* We cannot use MODULE_PARAM_PREFIX because some modules override it. */
+#define __MODULE_INFO_PREFIX KBUILD_MODNAME "."
 #endif
 
 /* Chosen so that structs with an unsigned long line up. */
 #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long))
 
-#ifdef MODULE
 #define __MODULE_INFO(tag, name, info)					  \
 static const char __UNIQUE_ID(name)[]					  \
   __used __attribute__((section(".modinfo"), unused, aligned(1)))	  \
-  = __stringify(tag) "=" info
-#else  /* !MODULE */
-/* This struct is here for syntactic coherency, it is not used */
-#define __MODULE_INFO(tag, name, info)					  \
-  struct __UNIQUE_ID(name) {}
-#endif
+  = __MODULE_INFO_PREFIX __stringify(tag) "=" info
+
 #define __MODULE_PARM_TYPE(name, _type)					  \
   __MODULE_INFO(parmtype, name##type, #name ":" _type)
 
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index c8cf45362bd6..43abf917f1ec 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -226,6 +226,9 @@ modpost_link vmlinux.o
 # modpost vmlinux.o to check for section mismatches
 ${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o
 
+info MODINFO modules.builtin.modinfo
+"${OBJCOPY}" -j .modinfo -O binary vmlinux.o modules.builtin.modinfo
+
 kallsymso=""
 kallsyms_vmlinux=""
 if [ -n "${CONFIG_KALLSYMS}" ]; then
-- 
2.21.0

^ permalink raw reply related

* Re: [PATCH v1 1/2] Add polling support to pidfd
From: Joel Fernandes @ 2019-04-29 16:32 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Christian Brauner, linux-kernel, luto, rostedt, dancol, sspatil,
	jannh, surenb, timmurray, Jonathan Kowalski, torvalds,
	kernel-team, Andrew Morton, Arnd Bergmann, Eric W. Biederman,
	Greg Kroah-Hartman, Ingo Molnar, Jann Horn, linux-kselftest,
	Michal Hocko, Peter Zijlstra (Intel), Serge Hallyn, Shuah Khan,
	Stephen Rothwell, Thomas Gleixner <tgl>
In-Reply-To: <20190429142030.GA17715@redhat.com>

On Mon, Apr 29, 2019 at 04:20:30PM +0200, Oleg Nesterov wrote:
> On 04/29, Joel Fernandes wrote:
> >
> > However, in your code above, it is avoided because we get:
> >
> > Task A (poller)		Task B (exiting task being polled)
> > ------------            ----------------
> > poll() called
> > add_wait_queue()
> > 			exit_state is set to non-zero
> > read exit_state
> > remove_wait_queue()
> > 			wake_up_all()
> 
> just to clarify... No, sys_poll() path doesn't do remove_wait_queue() until
> it returns to user mode, and that is why we can't race with set-exit_code +
> wake_up().

I didn't follow what you mean, the removal from the waitqueue happens in
free_poll_entry() called from poll_freewait() which happens from
do_sys_poll() which is before the syscall returns to user mode. Could you
explain more?

> pidfd_poll() can race with the exiting task, miss exit_code != 0, and return
> zero. However, do_poll() won't block after that and pidfd_poll() will be called
> again.

Here also I didn't follow what you mean. If exit_code is read as 0 in
pidfd_poll(), then in do_poll() the count will be 0 and it will block in
poll_schedule_timeout(). Right? But above you're saying it won't block.
Also if you could show a timing diagram of this different race you're talking
about, that will make things clear. It is a bit hard for me to picture
otherwise.

Also, I will use task_pid() for getting the pid from the task, as you suggest
in the other thread.

thanks,

- Joel

^ permalink raw reply

* Re: [PATCH for 5.2 12/12] rseq/selftests: add -no-integrated-as for clang
From: Nick Desaulniers @ 2019-04-29 17:03 UTC (permalink / raw)
  To: Mathieu Desnoyers, Masahiro Yamada
  Cc: Shuah Khan, LKML, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas
In-Reply-To: <20190429152803.7719-13-mathieu.desnoyers@efficios.com>

On Mon, Apr 29, 2019 at 8:29 AM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> Ongoing work for asm goto support from clang requires the
> -no-integrated-as compiler flag.
>
> This compiler flag is present in the toplevel kernel Makefile,
> but is not replicated for selftests. Add it specifically for
> the rseq selftest which requires asm goto.
>
> Link: https://reviews.llvm.org/D56571
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> CC: Nick Desaulniers <ndesaulniers@google.com>
> CC: Thomas Gleixner <tglx@linutronix.de>
> CC: Joel Fernandes <joelaf@google.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Catalin Marinas <catalin.marinas@arm.com>
> CC: Dave Watson <davejwatson@fb.com>
> CC: Will Deacon <will.deacon@arm.com>
> CC: Shuah Khan <shuah@kernel.org>
> CC: Andi Kleen <andi@firstfloor.org>
> CC: linux-kselftest@vger.kernel.org
> CC: "H . Peter Anvin" <hpa@zytor.com>
> CC: Chris Lameter <cl@linux.com>
> CC: Russell King <linux@arm.linux.org.uk>
> CC: Michael Kerrisk <mtk.manpages@gmail.com>
> CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
> CC: Paul Turner <pjt@google.com>
> CC: Boqun Feng <boqun.feng@gmail.com>
> CC: Josh Triplett <josh@joshtriplett.org>
> CC: Steven Rostedt <rostedt@goodmis.org>
> CC: Ben Maurer <bmaurer@fb.com>
> CC: linux-api@vger.kernel.org
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: Linus Torvalds <torvalds@linux-foundation.org>
> ---
>  tools/testing/selftests/rseq/Makefile | 8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
> index c30c52e1d0d2..d6469535630a 100644
> --- a/tools/testing/selftests/rseq/Makefile
> +++ b/tools/testing/selftests/rseq/Makefile
> @@ -1,5 +1,11 @@
>  # SPDX-License-Identifier: GPL-2.0+ OR MIT
> -CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./
> +
> +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
> +CLANG_FLAGS += -no-integrated-as
> +endif
> +
> +CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./ \
> +         $(CLANG_FLAGS)

The top level Makefile exports $(CLANG_FLAGS), which should contain
`-no-integrated-as`.  Is that available here?  If so, then you can
just add `$(CLANG_FLAGS)`, no compiler check needed.

If not, maybe the test for CONFIG_CC_IS_CLANG is cleaner?

Thanks for the patch, and helping test asm goto in Clang!
-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Enrico Weigelt, metux IT consult @ 2019-04-29 17:31 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Christian Brauner, torvalds, viro, jannh, dhowells, linux-api,
	linux-kernel, luto, arnd, ebiederm, keescook, tglx, mtk.manpages,
	akpm, oleg, cyphar, joel, dancol
In-Reply-To: <20190429154949.GA23456@mail.hallyn.com>

On 29.04.19 17:49, Serge E. Hallyn wrote:

>> * all users are equal - no root at all. the only exception is the
>>   initial process, which gets the kernel devices mounted into his
>>   namespace.
>
> This does not match my understanding, but I'm most likely wrong.  (I thought
> there was an actual 'host owner' uid, which mostly is only used for initial
> process, but is basically root with a different name, and used far less.  No
> uid transitions without factotum so that it *looked* like no root user).
Not quite (IIRC). The hostowner is just the user who booted the machine,
the initial process runs under this uname and gets the kernel devices
bound into his namespace, so he can start fileservers on them.

Also the caphash device (the one you can create capabilities, eg. for
user change, which then can be used via capuse device) can only be
opened once - usually by the host factotum.

There really is no such thing like root user.

>> What I'd like to achieve on Linux:
>>
>> * unprivileged users can have their own mount namespace, where they
>>   can mount at will (maybe just 9P).
>
> No problem, you can do that now.
But only within separate userns, IMHO. (and, when I last tried, plain
users couldn't directly create their userns).

>> * but they still appear as the same normal users to the rest of the
>>   system
> 
> No problem, you can do that now.

How exactly ? Did I miss something vital ?

>> * 9p programs (compiled for Linux ABI) can run parallel to traditional
>>   linux programs within the same user and sessions (eg. from a terminal,
>>   i can call both the same way)
>> * namespace modifications affect both equally (eg. I could run ff in
>>   an own ns)
> 
> affect both of what equally?

mount / bind.

> That's exactly what user namespaces are for.  You can create a new
> user namespace, using no privilege at all, with your current uid (i.e.
> 1000) mapped to whatever uid you like; if you pick 0, then you can unshare all
> the namespaces you like.  

But I don't like to appear as 'root' in here. I just wanna have my own
filesystem namespace, nothing more.

> Once you unshare mnt_ns, you can mount to your
> heart's content.  To other processes on the host, your process is
> uid 1000.

Is that the uid, I'm appearing to filesystems ?

> Regarding factotum, I agree that with the pidfd work going on etc, it's getting
> more and more tempting to attempt a switch to that.  Looking back at my folder,
> I see you posted a kernel patch for it.  I had done the same long ago.  Happy to
> work with you again on that, and put a simple daemon into shadow package, if
> util-linux isn't deemed the far better place.

Yeah :)


--mtx

-- 
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
info@metux.net -- +49-151-27565287

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Jann Horn @ 2019-04-29 19:30 UTC (permalink / raw)
  To: Kevin Easton, Andy Lutomirski, Christian Brauner
  Cc: Aleksa Sarai, Enrico Weigelt, metux IT consult, Linus Torvalds,
	Al Viro, David Howells, Linux API, LKML, Serge E. Hallyn,
	Arnd Bergmann, Eric W. Biederman, Kees Cook, Thomas Gleixner,
	Michael Kerrisk, Andrew Morton, Oleg Nesterov, Joel Fernandes,
	Daniel Colascione
In-Reply-To: <20190420071406.GA22257@ip-172-31-15-78>

On Sat, Apr 20, 2019 at 3:14 AM Kevin Easton <kevin@guarana.org> wrote:
> On Mon, Apr 15, 2019 at 01:29:23PM -0700, Andy Lutomirski wrote:
> > On Mon, Apr 15, 2019 at 12:59 PM Aleksa Sarai <cyphar@cyphar.com> wrote:
> > >
> > > On 2019-04-15, Enrico Weigelt, metux IT consult <lkml@metux.net> wrote:
> > > > > This patchset makes it possible to retrieve pid file descriptors at
> > > > > process creation time by introducing the new flag CLONE_PIDFD to the
> > > > > clone() system call as previously discussed.
> > > >
> > > > Sorry, for highjacking this thread, but I'm curious on what things to
> > > > consider when introducing new CLONE_* flags.
> > > >
> > > > The reason I'm asking is:
> > > >
> > > > I'm working on implementing plan9-like fs namespaces, where unprivileged
> > > > processes can change their own namespace at will. For that, certain
> > > > traditional unix'ish things have to be disabled, most notably suid.
> > > > As forbidding suid can be helpful in other scenarios, too, I thought
> > > > about making this its own feature. Doing that switch on clone() seems
> > > > a nice place for that, IMHO.
> > >
> > > Just spit-balling -- is no_new_privs not sufficient for this usecase?
> > > Not granting privileges such as setuid during execve(2) is the main
> > > point of that flag.
> > >
> >
> > I would personally *love* it if distros started setting no_new_privs
> > for basically all processes.  And pidfd actually gets us part of the
> > way toward a straightforward way to make sudo and su still work in a
> > no_new_privs world: su could call into a daemon that would spawn the
> > privileged task, and su would get a (read-only!) pidfd back and then
> > wait for the fd and exit.  I suppose that, done naively, this might
> > cause some odd effects with respect to tty handling, but I bet it's
> > solveable.  I suppose it would be nifty if there were a way for a
> > process, by mutual agreement, to reparent itself to an unrelated
> > process.
> >
> > Anyway, clone(2) is an enormous mess.  Surely the right solution here
> > is to have a whole new process creation API that takes a big,
> > extensible struct as an argument, and supports *at least* the full
> > abilities of posix_spawn() and ideally covers all the use cases for
> > fork() + do stuff + exec().  It would be nifty if this API also had a
> > way to say "add no_new_privs and therefore enable extra functionality
> > that doesn't work without no_new_privs".  This functionality would
> > include things like returning a future extra-privileged pidfd that
> > gives ptrace-like access.
> >
> > As basic examples, the improved process creation API should take a
> > list of dup2() operations to perform, fds to remove the O_CLOEXEC flag
> > from, fds to close (or, maybe even better, a list of fds to *not*
> > close), a list of rlimit changes to make, a list of signal changes to
> > make, the ability to set sid, pgrp, uid, gid (as in
> > setresuid/setresgid), the ability to do capset() operations, etc.  The
> > posix_spawn() API, for all that it's rather complicated, covers a
> > bunch of the basics pretty well.
>
> The idea of a system call that takes an infinitely-extendable laundry
> list of operations to perform in kernel space seems quite inelegant, if
> only for the error-reporting reason.
>
> Instead, I suggest that what you'd want is a way to create a new
> embryonic process that has no address space and isn't yet schedulable.
> You then just need other-process-directed variants of all the normal
> setup functions - so pr_openat(pidfd, dirfd, pathname, flags, mode),
> pr_sigaction(pidfd, signum, act, oldact), pr_dup2(pidfd, oldfd, newfd)
> etc.
>
> Then when it's all set up you pr_execve() to kick it off.

Is this really necessary? I agree that fork()+exec() is suboptimal,
but if you just want to avoid the cost of duplicating the address
space, you can AFAICS already do that in userspace with
clone(CLONE_VM|CLONE_CHILD_SETTID|CLONE_CHILD_CLEARTID|SIGCHLD). Then
the parent can block on a futex until the child leaves the mm_struct
through execve() (or by exiting, in the case of an error), and the
child can temporarily have its stack at the bottom of the caller's
stack. You could build an API like this around it in userspace:

int clone_temporary(int (*fn)(void *arg), void *arg, pid_t *child_pid,
<clone flags and arguments, maybe in a struct>)

and then you'd use it like this to fork off a child process:

int spawn_shell_subprocess_(void *arg) {
  char *cmdline = arg;
  execl("/bin/sh", "sh", "-c", cmdline);
  return -1;
}
pid_t spawn_shell_subprocess(char *cmdline) {
  pid_t child_pid;
  int res = clone_temporary(spawn_shell_subprocess_, cmdline,
&child_pid, [...]);
  if (res == 0) return child_pid;
  return res;
}

clone_temporary() could be implemented roughly as follows by the libc
(or other userspace code):

sigset_t sigset, sigset_old;
sigfillset(&sigset);
sigprocmask(SIG_SETMASK, &sigset, &sigset_old);
int child_pid;
int result = 0;
/* starting here, use inline assembly to ensure that no stack
allocations occur */
long child = syscall(__NR_clone,
CLONE_VM|CLONE_CHILD_SETTID|CLONE_CHILD_CLEARTID|SIGCHLD, $RSP -
ABI_STACK_REDZONE_SIZE, NULL, &child_pid, 0);
if (child == -1) { result = -1; goto reset_sigmask; }
if (child == 0) {
  result = fn(arg);
  syscall(__NR_exit, 0);
}
futex(&child_pid, FUTEX_WAIT, child, NULL);
/* end of no-stack-allocations zone */
reset_sigmask:
sigprocmask(SIG_SETMASK, &sigset_old, NULL);
return result;

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Jann Horn @ 2019-04-29 19:55 UTC (permalink / raw)
  To: Kevin Easton, Andy Lutomirski, Christian Brauner
  Cc: Aleksa Sarai, Enrico Weigelt, metux IT consult, Linus Torvalds,
	Al Viro, David Howells, Linux API, LKML, Serge E. Hallyn,
	Arnd Bergmann, Eric W. Biederman, Kees Cook, Thomas Gleixner,
	Michael Kerrisk, Andrew Morton, Oleg Nesterov, Joel Fernandes,
	Daniel Colascione
In-Reply-To: <CAG48ez0gG4bd-t1wdR2p6-N2FjWbCqm_+ZThKfF7yKnD=KLqAQ@mail.gmail.com>

On Mon, Apr 29, 2019 at 3:30 PM Jann Horn <jannh@google.com> wrote:
> On Sat, Apr 20, 2019 at 3:14 AM Kevin Easton <kevin@guarana.org> wrote:
> > On Mon, Apr 15, 2019 at 01:29:23PM -0700, Andy Lutomirski wrote:
> > > On Mon, Apr 15, 2019 at 12:59 PM Aleksa Sarai <cyphar@cyphar.com> wrote:
> > > >
> > > > On 2019-04-15, Enrico Weigelt, metux IT consult <lkml@metux.net> wrote:
> > > > > > This patchset makes it possible to retrieve pid file descriptors at
> > > > > > process creation time by introducing the new flag CLONE_PIDFD to the
> > > > > > clone() system call as previously discussed.
> > > > >
> > > > > > Sorry for hijacking this thread, but I'm curious about what things to
> > > > > consider when introducing new CLONE_* flags.
> > > > >
> > > > > The reason I'm asking is:
> > > > >
> > > > > I'm working on implementing plan9-like fs namespaces, where unprivileged
> > > > > processes can change their own namespace at will. For that, certain
> > > > > traditional unix'ish things have to be disabled, most notably suid.
> > > > > As forbidding suid can be helpful in other scenarios, too, I thought
> > > > > about making this its own feature. Doing that switch on clone() seems
> > > > > a nice place for that, IMHO.
> > > >
> > > > Just spit-balling -- is no_new_privs not sufficient for this usecase?
> > > > Not granting privileges such as setuid during execve(2) is the main
> > > > point of that flag.
> > > >
> > >
> > > I would personally *love* it if distros started setting no_new_privs
> > > for basically all processes.  And pidfd actually gets us part of the
> > > way toward a straightforward way to make sudo and su still work in a
> > > no_new_privs world: su could call into a daemon that would spawn the
> > > privileged task, and su would get a (read-only!) pidfd back and then
> > > wait for the fd and exit.  I suppose that, done naively, this might
> > > cause some odd effects with respect to tty handling, but I bet it's
> > > solveable.  I suppose it would be nifty if there were a way for a
> > > process, by mutual agreement, to reparent itself to an unrelated
> > > process.
> > >
> > > Anyway, clone(2) is an enormous mess.  Surely the right solution here
> > > is to have a whole new process creation API that takes a big,
> > > extensible struct as an argument, and supports *at least* the full
> > > abilities of posix_spawn() and ideally covers all the use cases for
> > > fork() + do stuff + exec().  It would be nifty if this API also had a
> > > way to say "add no_new_privs and therefore enable extra functionality
> > > that doesn't work without no_new_privs".  This functionality would
> > > include things like returning a future extra-privileged pidfd that
> > > gives ptrace-like access.
> > >
> > > As basic examples, the improved process creation API should take a
> > > list of dup2() operations to perform, fds to remove the O_CLOEXEC flag
> > > from, fds to close (or, maybe even better, a list of fds to *not*
> > > close), a list of rlimit changes to make, a list of signal changes to
> > > make, the ability to set sid, pgrp, uid, gid (as in
> > > setresuid/setresgid), the ability to do capset() operations, etc.  The
> > > posix_spawn() API, for all that it's rather complicated, covers a
> > > bunch of the basics pretty well.
> >
> > The idea of a system call that takes an infinitely-extendable laundry
> > list of operations to perform in kernel space seems quite inelegant, if
> > only for the error-reporting reason.
> >
> > Instead, I suggest that what you'd want is a way to create a new
> > embryonic process that has no address space and isn't yet schedulable.
> > You then just need other-process-directed variants of all the normal
> > setup functions - so pr_openat(pidfd, dirfd, pathname, flags, mode),
> > pr_sigaction(pidfd, signum, act, oldact), pr_dup2(pidfd, oldfd, newfd)
> > etc.
> >
> > Then when it's all set up you pr_execve() to kick it off.
>
> Is this really necessary? I agree that fork()+exec() is suboptimal,
> but if you just want to avoid the cost of duplicating the address
> space, you can AFAICS already do that in userspace with
> clone(CLONE_VM|CLONE_CHILD_SETTID|CLONE_CHILD_CLEARTID|SIGCHLD). Then
> the parent can block on a futex until the child leaves the mm_struct
> through execve() (or by exiting, in the case of an error), and the
> child can temporarily have its stack at the bottom of the caller's
> stack. You could build an API like this around it in userspace:
>
> int clone_temporary(int (*fn)(void *arg), void *arg, pid_t *child_pid,
> <clone flags and arguments, maybe in a struct>)
>
> and then you'd use it like this to fork off a child process:
>
> int spawn_shell_subprocess_(void *arg) {
>   char *cmdline = arg;
>   execl("/bin/sh", "sh", "-c", cmdline);
>   return -1;
> }
> pid_t spawn_shell_subprocess(char *cmdline) {
>   pid_t child_pid;
>   int res = clone_temporary(spawn_shell_subprocess_, cmdline,
> &child_pid, [...]);
>   if (res == 0) return child_pid;
>   return res;
> }
>
> clone_temporary() could be implemented roughly as follows by the libc
> (or other userspace code):
>
> sigset_t sigset, sigset_old;
> sigfillset(&sigset);
> sigprocmask(SIG_SETMASK, &sigset, &sigset_old);
> int child_pid;
> int result = 0;
> /* starting here, use inline assembly to ensure that no stack
> allocations occur */
> long child = syscall(__NR_clone,
> CLONE_VM|CLONE_CHILD_SETTID|CLONE_CHILD_CLEARTID|SIGCHLD, $RSP -
> ABI_STACK_REDZONE_SIZE, NULL, &child_pid, 0);
> if (child == -1) { result = -1; goto reset_sigmask; }
> if (child == 0) {
>   result = fn(arg);
>   syscall(__NR_exit, 0);
> }
> futex(&child_pid, FUTEX_WAIT, child, NULL);
> /* end of no-stack-allocations zone */
> reset_sigmask:
> sigprocmask(SIG_SETMASK, &sigset_old, NULL);
> return result;

... I guess that already has a name, and it's called vfork(). (Well,
except that the Linux vfork() isn't a real vfork().)

So I guess my question is: Why not vfork()?

And if vfork() alone isn't flexible enough, alternatively: How about
an API that forks a new child in the same address space, and then
allows the parent to invoke arbitrary syscalls in the context of the
child? You could also build that in userspace if you wanted, I think -
just let the child run an assembly loop that reads registers from a
unix seqpacket socket, invokes the syscall instruction, and writes the
value of the result register back into the seqpacket socket. As long
as you use CLONE_VM, you don't have to worry about moving the pointer
targets of syscalls. The user-visible API could look like this:

// flags added by the implementation: CLONE_VM|CLONE_CHILD_SETTID
puppet_handle = fork_puppet(CLONE_NEWUSER|SIGCHLD);
int uid_map_fd = puppet_syscall(SYS_open, "/proc/self/uid_map");
char uid_map_buf[1000];
puppet_syscall(SYS_write, uid_map_fd, uid_map_buf, strlen(uid_map_buf));
puppet_syscall(SYS_close, uid_map_fd);
// waits for the child to either exit or switch to new mm via
CLONE_CHILD_CLEARTID
puppet_finish_execve(path, argv, envv);

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Linus Torvalds @ 2019-04-29 20:21 UTC (permalink / raw)
  To: Jann Horn
  Cc: Kevin Easton, Andy Lutomirski, Christian Brauner, Aleksa Sarai,
	Enrico Weigelt, metux IT consult, Al Viro, David Howells,
	Linux API, LKML, Serge E. Hallyn, Arnd Bergmann,
	Eric W. Biederman, Kees Cook, Thomas Gleixner, Michael Kerrisk,
	Andrew Morton, Oleg Nesterov, Joel Fernandes, Daniel Colascione
In-Reply-To: <CAG48ez15bf1EJB0XTJsGFpvf8r5pj9+rv1axKVr13H1NW7ARZw@mail.gmail.com>

On Mon, Apr 29, 2019 at 12:55 PM Jann Horn <jannh@google.com> wrote:
>
> ... I guess that already has a name, and it's called vfork(). (Well,
> except that the Linux vfork() isn't a real vfork().)

What?

Linux vfork() is very much a real vfork(). What do you mean?

                 Linus

^ permalink raw reply

* Re: [PATCH for 5.2 12/12] rseq/selftests: add -no-integrated-as for clang
From: Mathieu Desnoyers @ 2019-04-29 20:28 UTC (permalink / raw)
  To: ndesaulniers, shuah
  Cc: Masahiro Yamada, linux-kernel, linux-api, Thomas Gleixner,
	Peter Zijlstra, Paul E . McKenney, Boqun Feng, Andy Lutomirski,
	Dave Watson, Paul Turner, Andrew Morton, Russell King,
	Ingo Molnar, H. Peter Anvin, Andi Kleen, Chris Lameter,
	Ben Maurer, rostedt, Josh Triplett, Linus Torvalds, Cat
In-Reply-To: <CAKwvOdnbH0+ju5Ny-mB-Z4kC+ALyCJOU4Q8OCLHHjFAQzJqsXA@mail.gmail.com>

----- On Apr 29, 2019, at 1:03 PM, ndesaulniers ndesaulniers@google.com wrote:

> On Mon, Apr 29, 2019 at 8:29 AM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
>>
>> Ongoing work for asm goto support from clang requires the
>> -no-integrated-as compiler flag.
>>
>> This compiler flag is present in the toplevel kernel Makefile,
>> but is not replicated for selftests. Add it specifically for
>> the rseq selftest which requires asm goto.
>>
>> Link: https://reviews.llvm.org/D56571
>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>> CC: Nick Desaulniers <ndesaulniers@google.com>
>> CC: Thomas Gleixner <tglx@linutronix.de>
>> CC: Joel Fernandes <joelaf@google.com>
>> CC: Peter Zijlstra <peterz@infradead.org>
>> CC: Catalin Marinas <catalin.marinas@arm.com>
>> CC: Dave Watson <davejwatson@fb.com>
>> CC: Will Deacon <will.deacon@arm.com>
>> CC: Shuah Khan <shuah@kernel.org>
>> CC: Andi Kleen <andi@firstfloor.org>
>> CC: linux-kselftest@vger.kernel.org
>> CC: "H . Peter Anvin" <hpa@zytor.com>
>> CC: Chris Lameter <cl@linux.com>
>> CC: Russell King <linux@arm.linux.org.uk>
>> CC: Michael Kerrisk <mtk.manpages@gmail.com>
>> CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
>> CC: Paul Turner <pjt@google.com>
>> CC: Boqun Feng <boqun.feng@gmail.com>
>> CC: Josh Triplett <josh@joshtriplett.org>
>> CC: Steven Rostedt <rostedt@goodmis.org>
>> CC: Ben Maurer <bmaurer@fb.com>
>> CC: linux-api@vger.kernel.org
>> CC: Andy Lutomirski <luto@amacapital.net>
>> CC: Andrew Morton <akpm@linux-foundation.org>
>> CC: Linus Torvalds <torvalds@linux-foundation.org>
>> ---
>>  tools/testing/selftests/rseq/Makefile | 8 +++++++-
>>  1 file changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/testing/selftests/rseq/Makefile
>> b/tools/testing/selftests/rseq/Makefile
>> index c30c52e1d0d2..d6469535630a 100644
>> --- a/tools/testing/selftests/rseq/Makefile
>> +++ b/tools/testing/selftests/rseq/Makefile
>> @@ -1,5 +1,11 @@
>>  # SPDX-License-Identifier: GPL-2.0+ OR MIT
>> -CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./
>> +
>> +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
>> +CLANG_FLAGS += -no-integrated-as
>> +endif
>> +
>> +CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./ \
>> +         $(CLANG_FLAGS)
> 
> The top level Makefile exports $(CLANG_FLAGS), which should contain
> `-no-integrated-as`.  Is that available here?  If so, then you can
> just add `$(CLANG_FLAGS)`, no compiler check needed.

AFAIU, the makefiles under tools/testing/selftests all end up including
tools/testing/selftests/lib.mk, which states:

"# This mimics the top-level Makefile. We do it explicitly here so that this
 # Makefile can operate with or without the kbuild infrastructure."

So I don't think it's using any of the definitions from the toplevel
Makefile.

> If not, maybe the test for CONFIG_CC_IS_CLANG is cleaner?

A quick test indicates that the toplevel CONFIG_* definitions are unavailable
from the kernel selftests makefiles.

> Thanks for the patch, and helping test asm goto in Clang!

You're very welcome! Considering that I intend to have rseq widely adopted in
user-space, it's only natural to consider that its user-space side needs to be
compiled by clang as well.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [PATCH for 5.2 12/12] rseq/selftests: add -no-integrated-as for clang
From: Nick Desaulniers @ 2019-04-29 20:30 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: shuah, Masahiro Yamada, linux-kernel, linux-api, Thomas Gleixner,
	Peter Zijlstra, Paul E . McKenney, Boqun Feng, Andy Lutomirski,
	Dave Watson, Paul Turner, Andrew Morton, Russell King,
	Ingo Molnar, H. Peter Anvin, Andi Kleen, Chris Lameter,
	Ben Maurer, rostedt, Josh Triplett, Linus Torvalds <torvalds@
In-Reply-To: <712082435.384.1556569697998.JavaMail.zimbra@efficios.com>

On Mon, Apr 29, 2019 at 1:28 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> ----- On Apr 29, 2019, at 1:03 PM, ndesaulniers ndesaulniers@google.com wrote:
>
> > On Mon, Apr 29, 2019 at 8:29 AM Mathieu Desnoyers
> > <mathieu.desnoyers@efficios.com> wrote:
> >> diff --git a/tools/testing/selftests/rseq/Makefile
> >> b/tools/testing/selftests/rseq/Makefile
> >> index c30c52e1d0d2..d6469535630a 100644
> >> --- a/tools/testing/selftests/rseq/Makefile
> >> +++ b/tools/testing/selftests/rseq/Makefile
> >> @@ -1,5 +1,11 @@
> >>  # SPDX-License-Identifier: GPL-2.0+ OR MIT
> >> -CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./
> >> +
> >> +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
> >> +CLANG_FLAGS += -no-integrated-as
> >> +endif
> >> +
> >> +CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./ \
> >> +         $(CLANG_FLAGS)
> >
> > The top level Makefile exports $(CLANG_FLAGS), which should contain
> > `-no-integrated-as`.  Is that available here?  If so, then you can
> > just add `$(CLANG_FLAGS)`, no compiler check needed.
>
> AFAIU, the makefiles under tools/testing/selftests all end up including
> tools/testing/selftests/lib.mk, which states:
>
> "# This mimics the top-level Makefile. We do it explicitly here so that this
>  # Makefile can operate with or without the kbuild infrastructure."
>
> So I don't think it's using any of the definitions from the toplevel
> Makefile.
>
> > If not, maybe the test for CONFIG_CC_IS_CLANG is cleaner?
>
> A quick test indicates that the toplevel CONFIG_* definitions are unavailable
> from the kernel selftests makefiles.

Ok, in that case...
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Thanks again!

-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Florian Weimer @ 2019-04-29 20:38 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jann Horn, Kevin Easton, Andy Lutomirski, Christian Brauner,
	Aleksa Sarai, Enrico Weigelt, metux IT consult, Al Viro,
	David Howells, Linux API, LKML, Serge E. Hallyn, Arnd Bergmann,
	Eric W. Biederman, Kees Cook, Thomas Gleixner, Michael Kerrisk,
	Andrew Morton, Oleg Nesterov, Joel Fernandes, Daniel Colascione
In-Reply-To: <CAHk-=wi_N81mKYFz33ycoWiL7_tGbZBMJOsAs16inYzSza+OEw@mail.gmail.com>

* Linus Torvalds:

> On Mon, Apr 29, 2019 at 12:55 PM Jann Horn <jannh@google.com> wrote:
>>
>> ... I guess that already has a name, and it's called vfork(). (Well,
>> except that the Linux vfork() isn't a real vfork().)
>
> What?
>
> Linux vfork() is very much a real vfork(). What do you mean?

In Linux-as-the-ABI (as opposed to Linux-as-the-implementation), vfork
is sometimes implemented as fork, so applications cannot rely on the
vfork behavior regarding the stopped parent and the shared address
space.

In fact, it would be nice to have a flag we can check in the posix_spawn
implementation, so that we can support vfork-as-fork without any run
time cost to native Linux.

Thanks,
Florian

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Florian Weimer @ 2019-04-29 20:49 UTC (permalink / raw)
  To: Jann Horn
  Cc: Kevin Easton, Andy Lutomirski, Christian Brauner, Aleksa Sarai,
	Enrico Weigelt, metux IT consult, Linus Torvalds, Al Viro,
	David Howells, Linux API, LKML, Serge E. Hallyn, Arnd Bergmann,
	Eric W. Biederman, Kees Cook, Thomas Gleixner, Michael Kerrisk,
	Andrew Morton, Oleg Nesterov, Joel Fernandes, Dan
In-Reply-To: <CAG48ez15bf1EJB0XTJsGFpvf8r5pj9+rv1axKVr13H1NW7ARZw@mail.gmail.com>

* Jann Horn:

>> int clone_temporary(int (*fn)(void *arg), void *arg, pid_t *child_pid,
>> <clone flags and arguments, maybe in a struct>)
>>
>> and then you'd use it like this to fork off a child process:
>>
>> int spawn_shell_subprocess_(void *arg) {
>>   char *cmdline = arg;
>>   execl("/bin/sh", "sh", "-c", cmdline);
>>   return -1;
>> }
>> pid_t spawn_shell_subprocess(char *cmdline) {
>>   pid_t child_pid;
>>   int res = clone_temporary(spawn_shell_subprocess_, cmdline,
>> &child_pid, [...]);
>>   if (res == 0) return child_pid;
>>   return res;
>> }
>>
>> clone_temporary() could be implemented roughly as follows by the libc
>> (or other userspace code):
>>
>> sigset_t sigset, sigset_old;
>> sigfillset(&sigset);
>> sigprocmask(SIG_SETMASK, &sigset, &sigset_old);
>> int child_pid;
>> int result = 0;
>> /* starting here, use inline assembly to ensure that no stack
>> allocations occur */
>> long child = syscall(__NR_clone,
>> CLONE_VM|CLONE_CHILD_SETTID|CLONE_CHILD_CLEARTID|SIGCHLD, $RSP -
>> ABI_STACK_REDZONE_SIZE, NULL, &child_pid, 0);
>> if (child == -1) { result = -1; goto reset_sigmask; }
>> if (child == 0) {
>>   result = fn(arg);
>>   syscall(__NR_exit, 0);
>> }
>> futex(&child_pid, FUTEX_WAIT, child, NULL);
>> /* end of no-stack-allocations zone */
>> reset_sigmask:
>> sigprocmask(SIG_SETMASK, &sigset_old, NULL);
>> return result;
>
> ... I guess that already has a name, and it's called vfork(). (Well,
> except that the Linux vfork() isn't a real vfork().)
>
> So I guess my question is: Why not vfork()?

Mainly because some users want access to the clone flags, and that's not
possible with the current userspace wrappers.  The stack setup for the
undocumented clone wrapper is also cumbersome, and the ia64 peculiarity
annoying.

For the stack sharing, the callback-based interface looks like the
absolutely right thing to do to me.  It enforces the notion that you can
safely return on the child path from a function calling vfork.

> And if vfork() alone isn't flexible enough, alternatively: How about
> an API that forks a new child in the same address space, and then
> allows the parent to invoke arbitrary syscalls in the context of the
> child?

As long as it's not an eBPF script …

> You could also build that in userspace if you wanted, I think - just
> let the child run an assembly loop that reads registers from a unix
> seqpacket socket, invokes the syscall instruction, and writes the
> value of the result register back into the seqpacket socket. As long
> as you use CLONE_VM, you don't have to worry about moving the pointer
> targets of syscalls. The user-visible API could look like this:

People already use a variant of this, execve'ing twice.  See
jspawnhelper.

Thanks,
Florian

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Christian Brauner @ 2019-04-29 20:51 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Linus Torvalds, Jann Horn, Kevin Easton, Andy Lutomirski,
	Aleksa Sarai, Enrico Weigelt, metux IT consult, Al Viro,
	David Howells, Linux API, LKML, Serge E. Hallyn, Arnd Bergmann,
	Eric W. Biederman, Kees Cook, Thomas Gleixner, Michael Kerrisk,
	Andrew Morton, Oleg Nesterov, Joel Fernandes,
	Daniel Colascione <danco>
In-Reply-To: <87zho8bl8x.fsf@oldenburg2.str.redhat.com>

On Mon, Apr 29, 2019 at 10:38 PM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Linus Torvalds:
>
> > On Mon, Apr 29, 2019 at 12:55 PM Jann Horn <jannh@google.com> wrote:
> >>
> >> ... I guess that already has a name, and it's called vfork(). (Well,
> >> except that the Linux vfork() isn't a real vfork().)
> >
> > What?
> >
> > Linux vfork() is very much a real vfork(). What do you mean?
>
> In Linux-as-the-ABI (as opposed to Linux-as-the-implementation), vfork
> is sometimes implemented as fork, so applications cannot rely on the
> vfork behavior regarding the stopped parent and the shared address
> space.
>
> In fact, it would be nice to have a flag we can check in the posix_spawn
> implementation, so that we can support vfork-as-fork without any run
> time cost to native Linux.

After the next merge window we'll be out of flags if things go as planned.
To address this problem, Jann and I are currently in the middle of working
on a clone version that we intend to send out for discussion afterwards.
If the proposal is acceptable it would bump the number of available flags
significantly, putting things like this within reach.

Christian

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Christian Brauner @ 2019-04-29 20:52 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Jann Horn, Kevin Easton, Andy Lutomirski, Aleksa Sarai,
	Enrico Weigelt, metux IT consult, Linus Torvalds, Al Viro,
	David Howells, Linux API, LKML, Serge E. Hallyn, Arnd Bergmann,
	Eric W. Biederman, Kees Cook, Thomas Gleixner, Michael Kerrisk,
	Andrew Morton, Oleg Nesterov, Joel Fernandes,
	Daniel Colascione <danco>
In-Reply-To: <87v9ywbkp8.fsf@oldenburg2.str.redhat.com>

On Mon, Apr 29, 2019 at 10:50 PM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Jann Horn:
>
> >> int clone_temporary(int (*fn)(void *arg), void *arg, pid_t *child_pid,
> >> <clone flags and arguments, maybe in a struct>)
> >>
> >> and then you'd use it like this to fork off a child process:
> >>
> >> int spawn_shell_subprocess_(void *arg) {
> >>   char *cmdline = arg;
> >>   execl("/bin/sh", "sh", "-c", cmdline);
> >>   return -1;
> >> }
> >> pid_t spawn_shell_subprocess(char *cmdline) {
> >>   pid_t child_pid;
> >>   int res = clone_temporary(spawn_shell_subprocess_, cmdline,
> >> &child_pid, [...]);
> >>   if (res == 0) return child_pid;
> >>   return res;
> >> }
> >>
> >> clone_temporary() could be implemented roughly as follows by the libc
> >> (or other userspace code):
> >>
> >> sigset_t sigset, sigset_old;
> >> sigfillset(&sigset);
> >> sigprocmask(SIG_SETMASK, &sigset, &sigset_old);
> >> int child_pid;
> >> int result = 0;
> >> /* starting here, use inline assembly to ensure that no stack
> >> allocations occur */
> >> long child = syscall(__NR_clone,
> >> CLONE_VM|CLONE_CHILD_SETTID|CLONE_CHILD_CLEARTID|SIGCHLD, $RSP -
> >> ABI_STACK_REDZONE_SIZE, NULL, &child_pid, 0);
> >> if (child == -1) { result = -1; goto reset_sigmask; }
> >> if (child == 0) {
> >>   result = fn(arg);
> >>   syscall(__NR_exit, 0);
> >> }
> >> futex(&child_pid, FUTEX_WAIT, child, NULL);
> >> /* end of no-stack-allocations zone */
> >> reset_sigmask:
> >> sigprocmask(SIG_SETMASK, &sigset_old, NULL);
> >> return result;
> >
> > ... I guess that already has a name, and it's called vfork(). (Well,
> > except that the Linux vfork() isn't a real vfork().)
> >
> > So I guess my question is: Why not vfork()?
>
> Mainly because some users want access to the clone flags, and that's not
> possible with the current userspace wrappers.  The stack setup for the
> undocumented clone wrapper is also cumbersome, and the ia64 peculiarity
> annoying.
>
> For the stack sharing, the callback-based interface looks like the
> absolutely right thing to do to me.  It enforces the notion that you can
> safely return on the child path from a function calling vfork.
>
> > And if vfork() alone isn't flexible enough, alternatively: How about
> > an API that forks a new child in the same address space, and then
> > allows the parent to invoke arbitrary syscalls in the context of the
> > child?
>
> As long as it's not an eBPF script …

You shouldn't even joke about this (I'm serious).
I'm very certain there are people who'd think this is a good idea.

>
> > You could also build that in userspace if you wanted, I think - just
> > let the child run an assembly loop that reads registers from a unix
> > seqpacket socket, invokes the syscall instruction, and writes the
> > value of the result register back into the seqpacket socket. As long
> > as you use CLONE_VM, you don't have to worry about moving the pointer
> > targets of syscalls. The user-visible API could look like this:
>
> People already use a variant of this, execve'ing twice.  See
> jspawnhelper.
>
> Thanks,
> Florian

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox