* [RFC PATCH for 4.15 1/6] membarrier: selftest: Test private expedited cmd
From: Mathieu Desnoyers @ 2017-11-08 18:35 UTC (permalink / raw)
To: Paul E. McKenney
Cc: linux-kernel, linux-api, Peter Zijlstra, Andy Lutomirski,
Boqun Feng, Andrew Hunter, Maged Michael, gromer, Avi Kivity,
Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Dave Watson, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
Andrea Parri, x86, Mathieu Desnoyers, Alan Stern, Will Deacon
Test the new MEMBARRIER_CMD_PRIVATE_EXPEDITED and
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED commands.
Add checks expecting specific error values on system calls that are
expected to fail.
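As an illustration of what the updated selftest exercises, here is a minimal
stand-alone sketch (not part of the patch) using the same sys_membarrier()
wrapper style as the selftest; it assumes <linux/membarrier.h> from a kernel
carrying this series, so the two new commands are defined:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

static int sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Before registration, the private expedited command must fail
	 * with EPERM. */
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0) != -1
			|| errno != EPERM)
		return 1;
	/* Registration is expected to always succeed. */
	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		return 1;
	/* After registration, the command must succeed. */
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		return 1;
	printf("private expedited membarrier behaves as expected\n");
	return 0;
}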
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Alan Stern <stern@rowland.harvard.edu>
CC: Will Deacon <will.deacon@arm.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: Alice Ferrazzi <alice.ferrazzi@gmail.com>
CC: Paul Elder <paul.elder@pitt.edu>
CC: linux-kselftest@vger.kernel.org
CC: linux-arch@vger.kernel.org
---
.../testing/selftests/membarrier/membarrier_test.c | 109 ++++++++++++++++++---
1 file changed, 94 insertions(+), 15 deletions(-)
diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
index 9e674d9514d1..d7543a6d9030 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test.c
@@ -16,49 +16,119 @@ static int sys_membarrier(int cmd, int flags)
static int test_membarrier_cmd_fail(void)
{
int cmd = -1, flags = 0;
+ const char *test_name = "sys membarrier invalid command";
if (sys_membarrier(cmd, flags) != -1) {
ksft_exit_fail_msg(
- "sys membarrier invalid command test: command = %d, flags = %d. Should fail, but passed\n",
- cmd, flags);
+ "%s test: command = %d, flags = %d. Should fail, but passed\n",
+ test_name, cmd, flags);
+ }
+ if (errno != EINVAL) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EINVAL, strerror(EINVAL),
+ errno, strerror(errno));
}
ksft_test_result_pass(
- "sys membarrier invalid command test: command = %d, flags = %d. Failed as expected\n",
- cmd, flags);
+ "%s test: command = %d, flags = %d, errno = %d. Failed as expected\n",
+ test_name, cmd, flags, errno);
return 0;
}
static int test_membarrier_flags_fail(void)
{
int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_QUERY invalid flags";
if (sys_membarrier(cmd, flags) != -1) {
ksft_exit_fail_msg(
- "sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Should fail, but passed\n",
- flags);
+ "%s test: flags = %d. Should fail, but passed\n",
+ test_name, flags);
+ }
+ if (errno != EINVAL) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EINVAL, strerror(EINVAL),
+ errno, strerror(errno));
}
ksft_test_result_pass(
- "sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Failed as expected\n",
- flags);
+ "%s test: flags = %d, errno = %d. Failed as expected\n",
+ test_name, flags, errno);
return 0;
}
-static int test_membarrier_success(void)
+static int test_membarrier_shared_success(void)
{
int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
- const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED\n";
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n", test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_private_expedited_fail(void)
+{
+ int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED not registered failure";
+
+ if (sys_membarrier(cmd, flags) != -1) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should fail, but passed\n",
+ test_name, flags);
+ }
+ if (errno != EPERM) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EPERM, strerror(EPERM),
+ errno, strerror(errno));
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ return 0;
+}
+
+static int test_membarrier_register_private_expedited_success(void)
+{
+ int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED";
if (sys_membarrier(cmd, flags) != 0) {
ksft_exit_fail_msg(
- "sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
- flags);
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
}
ksft_test_result_pass(
- "sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
- flags);
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_private_expedited_success(void)
+{
+ int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
return 0;
}
@@ -72,7 +142,16 @@ static int test_membarrier(void)
status = test_membarrier_flags_fail();
if (status)
return status;
- status = test_membarrier_success();
+ status = test_membarrier_shared_success();
+ if (status)
+ return status;
+ status = test_membarrier_private_expedited_fail();
+ if (status)
+ return status;
+ status = test_membarrier_register_private_expedited_success();
+ if (status)
+ return status;
+ status = test_membarrier_private_expedited_success();
if (status)
return status;
return 0;
--
2.11.0
* [RFC PATCH for 4.15 2/6] membarrier: powerpc: Skip memory barrier in switch_mm() (v6)
From: Mathieu Desnoyers @ 2017-11-08 18:35 UTC (permalink / raw)
To: Paul E. McKenney
Cc: linux-kernel, linux-api, Peter Zijlstra, Andy Lutomirski,
Boqun Feng, Andrew Hunter, Maged Michael, gromer, Avi Kivity,
Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Dave Watson, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
Andrea Parri, x86, Mathieu Desnoyers, Alan Stern, Will Deacon
Allow PowerPC to skip the full memory barrier in switch_mm(), and
only issue the barrier when scheduling into a task belonging to a
process that has registered to use private expedited membarrier
commands.
Threads targeting the same VM but belonging to different thread
groups are a tricky case. This has a few consequences:
It turns out that we cannot rely on get_nr_threads(p) to count the
number of threads using a VM. We can use
(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)
instead to skip the synchronize_sched() for cases where the VM only has
a single user, and that user only has a single thread.
It also turns out that we cannot use for_each_thread() to set
thread flags in all threads using a VM, as it only iterates over the
thread group.
Therefore, test the membarrier state variable directly rather than
relying on thread flags. This means
membarrier_register_private_expedited() needs to set the
MEMBARRIER_STATE_SWITCH_MM flag, issue synchronize_sched(), and only
then set MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY which allows private
expedited membarrier commands to succeed. membarrier_arch_switch_mm()
now tests for the MEMBARRIER_STATE_SWITCH_MM flag.
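A condensed sketch of that registration ordering, pieced together from the
powerpc hook and the generic registration code in this patch (not a literal
copy of either; register_private_expedited_sketch() is a made-up name):

static void register_private_expedited_sketch(struct task_struct *p)
{
	struct mm_struct *mm = p->mm;

	/* 1) Make switch_mm() start issuing the full barrier for this mm. */
	atomic_or(MEMBARRIER_STATE_SWITCH_MM, &mm->membarrier_state);
	/*
	 * 2) Wait for every CPU to observe the flag, unless the mm has a
	 *    single user with a single thread.
	 */
	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1))
		synchronize_sched();
	/* 3) Only now allow private expedited commands to succeed. */
	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
		  &mm->membarrier_state);
}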
Changes since v1:
- Use test_ti_thread_flag(next, ...) instead of test_thread_flag() in
powerpc membarrier_arch_sched_in(), given that we want to specifically
check the next thread state.
- Add missing ARCH_HAS_MEMBARRIER_HOOKS in Kconfig.
- Use task_thread_info() to pass thread_info from task to
*_ti_thread_flag().
Changes since v2:
- Move membarrier_arch_sched_in() call to finish_task_switch().
- Check for NULL t->mm in membarrier_arch_fork().
- Use membarrier_sched_in() in generic code, which invokes the
arch-specific membarrier_arch_sched_in(). This fixes allnoconfig
build on PowerPC.
- Move asm/membarrier.h include under CONFIG_MEMBARRIER, fixing
allnoconfig build on PowerPC.
- Build and runtime tested on PowerPC.
Changes since v3:
- Simply rely on copy_mm() to copy the membarrier_private_expedited mm
field on fork.
- powerpc: test thread flag instead of reading
membarrier_private_expedited in membarrier_arch_fork().
- powerpc: skip memory barrier in membarrier_arch_sched_in() if coming
from kernel thread, since mmdrop() implies a full barrier.
- Set membarrier_private_expedited to 1 only after arch registration
code, thus eliminating a race where concurrent commands could succeed
when they should fail if issued concurrently with process
registration.
- Use READ_ONCE() for membarrier_private_expedited field access in
membarrier_private_expedited. Matches WRITE_ONCE() performed in
process registration.
Changes since v4:
- Move powerpc hook from sched_in() to switch_mm(), based on feedback
from Nicholas Piggin.
Changes since v5:
- Rebase on v4.14-rc6.
- Fold "Fix: membarrier: Handle CLONE_VM + !CLONE_THREAD correctly on
powerpc (v2)"
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Alan Stern <stern@rowland.harvard.edu>
CC: Will Deacon <will.deacon@arm.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: Ingo Molnar <mingo@redhat.com>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
CC: Nicholas Piggin <npiggin@gmail.com>
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-arch@vger.kernel.org
---
MAINTAINERS | 2 ++
arch/powerpc/Kconfig | 1 +
arch/powerpc/include/asm/membarrier.h | 26 +++++++++++++++++++++++++
arch/powerpc/kernel/Makefile | 2 ++
arch/powerpc/kernel/membarrier.c | 36 +++++++++++++++++++++++++++++++++++
arch/powerpc/mm/mmu_context.c | 7 +++++++
include/linux/sched/mm.h | 15 +++++++++++++++
init/Kconfig | 3 +++
kernel/sched/core.c | 10 ----------
kernel/sched/membarrier.c | 1 +
10 files changed, 93 insertions(+), 10 deletions(-)
create mode 100644 arch/powerpc/include/asm/membarrier.h
create mode 100644 arch/powerpc/kernel/membarrier.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 2f4e462aa4a2..34687a0ec28c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8829,6 +8829,8 @@ L: linux-kernel@vger.kernel.org
S: Supported
F: kernel/sched/membarrier.c
F: include/uapi/linux/membarrier.h
+F: arch/powerpc/kernel/membarrier.c
+F: arch/powerpc/include/asm/membarrier.h
MEMORY MANAGEMENT
L: linux-mm@kvack.org
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index cb782ac1c35d..86077c4cd0a4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_MEMBARRIER_HOOKS
select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
new file mode 100644
index 000000000000..0951646253d9
--- /dev/null
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_POWERPC_MEMBARRIER_H
+#define _ASM_POWERPC_MEMBARRIER_H
+
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+ struct mm_struct *next, struct task_struct *tsk)
+{
+ /*
+ * Only need the full barrier when switching between processes.
+ * Barrier when switching from kernel to userspace is not
+ * required here, given that it is implied by mmdrop(). Barrier
+ * when switching from userspace to kernel is not needed after
+ * store to rq->curr.
+ */
+ if (likely(!(atomic_read(&next->membarrier_state)
+ & MEMBARRIER_STATE_SWITCH_MM) || !prev))
+ return;
+
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ */
+ smp_mb();
+}
+void membarrier_arch_register_private_expedited(struct task_struct *t);
+
+#endif /* _ASM_POWERPC_MEMBARRIER_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 6c6cce937dd8..2ad3b55fa2b6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -136,6 +136,8 @@ endif
obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
+
# Disable GCOV & sanitizers in odd or sensitive code
GCOV_PROFILE_prom_init.o := n
UBSAN_SANITIZE_prom_init.o := n
diff --git a/arch/powerpc/kernel/membarrier.c b/arch/powerpc/kernel/membarrier.c
new file mode 100644
index 000000000000..4795ad59b833
--- /dev/null
+++ b/arch/powerpc/kernel/membarrier.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call - PowerPC architecture code
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/thread_info.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+
+void membarrier_arch_register_private_expedited(struct task_struct *p)
+{
+ struct mm_struct *mm = p->mm;
+
+ atomic_or(MEMBARRIER_STATE_SWITCH_MM, &mm->membarrier_state);
+ if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)
+ return;
+ /*
+ * Ensure all future scheduler executions will observe the new
+ * thread flag state for this process.
+ */
+ synchronize_sched();
+}
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 0f613bc63c50..22f5c91cdc38 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
@@ -67,6 +68,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*
* On the read side the barrier is in pte_xchg(), which orders
* the store to the PTE vs the load of mm_cpumask.
+ *
+ * This full barrier is needed by membarrier when switching
+ * between processes after store to rq->curr, before user-space
+ * memory accesses.
*/
smp_mb();
@@ -89,6 +94,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (new_on_cpu)
radix_kvm_prefetch_workaround(next);
+ else
+ membarrier_arch_switch_mm(prev, next, tsk);
/*
* The actual HW switching method differs between the various
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 3d49b91b674d..bbc3861d14c8 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -218,11 +218,26 @@ enum {
MEMBARRIER_STATE_SWITCH_MM = (1U << 1),
};
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
+#include <asm/membarrier.h>
+#else
+static inline void membarrier_arch_register_private_expedited(
+ struct task_struct *p)
+{
+}
+#endif
+
static inline void membarrier_execve(struct task_struct *t)
{
atomic_set(&t->mm->membarrier_state, 0);
}
#else
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+ struct mm_struct *next, struct task_struct *tsk)
+{
+}
+#endif
static inline void membarrier_execve(struct task_struct *t)
{
}
diff --git a/init/Kconfig b/init/Kconfig
index 3c1faaa2af4a..ba7f17971554 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1400,6 +1400,9 @@ config MEMBARRIER
If unsure, say Y.
+config ARCH_HAS_MEMBARRIER_HOOKS
+ bool
+
config EMBEDDED
bool "Embedded system"
option allnoconfig_y
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..f29525e453c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2644,16 +2644,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
- /*
- * The membarrier system call requires a full memory barrier
- * after storing to rq->curr, before going back to user-space.
- *
- * TODO: This smp_mb__after_unlock_lock can go away if PPC end
- * up adding a full barrier to switch_mm(), or we should figure
- * out if a smp_mb__after_unlock_lock is really the proper API
- * to use.
- */
- smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..23bd256f8458 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -116,6 +116,7 @@ static void membarrier_register_private_expedited(void)
if (atomic_read(&mm->membarrier_state)
& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
return;
+ membarrier_arch_register_private_expedited(p);
atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
&mm->membarrier_state);
}
--
2.11.0
* [RFC PATCH for 4.15 3/6] membarrier: Document scheduler barrier requirements (v5)
From: Mathieu Desnoyers @ 2017-11-08 18:35 UTC (permalink / raw)
To: Paul E. McKenney
Cc: linux-kernel, linux-api, Peter Zijlstra, Andy Lutomirski,
Boqun Feng, Andrew Hunter, Maged Michael, gromer, Avi Kivity,
Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Dave Watson, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
Andrea Parri, x86, Mathieu Desnoyers
Document the membarrier requirement for a full memory barrier in
__schedule() after coming from user-space, before storing to rq->curr.
It is provided by smp_mb__after_spinlock() in __schedule().
Document that membarrier requires a full barrier on transition from
kernel thread to userspace thread. We currently have an implicit barrier
from atomic_dec_and_test() in mmdrop() that ensures this.
The x86 switch_mm_irqs_off() full barrier is currently provided by many
cpumask update operations as well as write_cr3(). Document that
write_cr3() provides this barrier.
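As a hedged user-space illustration (not part of the patch) of the guarantee
these scheduler-side barriers back up: a private expedited membarrier makes a
plain store on the slow path fully ordered against every running thread of the
process, even threads being context switched at that moment. The names flag,
setup_once() and slow_path_publish() are illustrative only, and setup_once()
must have run before the slow path is used:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int flag;	/* read by fast-path threads with plain loads */

static void setup_once(void)
{
	/* Private expedited commands require prior registration. */
	syscall(__NR_membarrier, MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0);
}

static void slow_path_publish(void)
{
	__atomic_store_n(&flag, 1, __ATOMIC_RELAXED);
	/*
	 * Force the equivalent of a full memory barrier on every running
	 * thread of this process.  The barriers documented here (after
	 * storing to rq->curr, and after coming from user-space) are what
	 * keep this guarantee intact across a concurrent context switch.
	 */
	syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
}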
Changes since v1:
- Update comments to match reality for code paths which are after
storing to rq->curr, before returning to user-space, based on feedback
from Andrea Parri.
Changes since v2:
- Update changelog (smp_mb__before_spinlock -> smp_mb__after_spinlock).
Based on feedback from Andrea Parri.
Changes since v3:
- Clarify comments following feeback from Peter Zijlstra.
Changes since v4:
- Update comment regarding powerpc barrier.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andrea Parri <parri.andrea@gmail.com>
CC: x86@kernel.org
---
arch/x86/mm/tlb.c | 5 +++++
include/linux/sched/mm.h | 5 +++++
kernel/sched/core.c | 37 ++++++++++++++++++++++++++-----------
3 files changed, 36 insertions(+), 11 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3118392cdf75..5abf9bfcca1f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -146,6 +146,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#endif
this_cpu_write(cpu_tlbstate.is_lazy, false);
+ /*
+ * The membarrier system call requires a full memory barrier
+ * before returning to user-space, after storing to rq->curr.
+ * Writing to CR3 provides that full memory barrier.
+ */
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index bbc3861d14c8..ff1d510a9c51 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -39,6 +39,11 @@ static inline void mmgrab(struct mm_struct *mm)
extern void __mmdrop(struct mm_struct *);
static inline void mmdrop(struct mm_struct *mm)
{
+ /*
+ * The implicit full barrier implied by atomic_dec_and_test is
+ * required by the membarrier system call before returning to
+ * user-space, after storing to rq->curr.
+ */
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f29525e453c4..8a176892b4f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2648,6 +2648,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
+ /*
+ * When transitioning from a kernel thread to a userspace
+ * thread, mmdrop()'s implicit full barrier is required by the
+ * membarrier system call, because the current active_mm can
+ * become the current mm without going through switch_mm().
+ */
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
@@ -2753,6 +2759,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
+ /*
+ * If mm is non-NULL, we pass through switch_mm(). If mm is
+ * NULL, we will pass through mmdrop() in finish_task_switch().
+ * Both of these contain the full memory barrier required by
+ * membarrier after storing to rq->curr, before returning to
+ * user-space.
+ */
if (!mm) {
next->active_mm = oldmm;
mmgrab(oldmm);
@@ -3289,6 +3302,9 @@ static void __sched notrace __schedule(bool preempt)
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
+ *
+ * The membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -3336,17 +3352,16 @@ static void __sched notrace __schedule(bool preempt)
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
- * rq->curr, before returning to user-space. For TSO
- * (e.g. x86), the architecture must provide its own
- * barrier in switch_mm(). For weakly ordered machines
- * for which spin_unlock() acts as a full memory
- * barrier, finish_lock_switch() in common code takes
- * care of this barrier. For weakly ordered machines for
- * which spin_unlock() acts as a RELEASE barrier (only
- * arm64 and PowerPC), arm64 has a full barrier in
- * switch_to(), and PowerPC has
- * smp_mb__after_unlock_lock() before
- * finish_lock_switch().
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+ * switch_mm() relies on membarrier_arch_switch_mm() on PowerPC.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
*/
++*switch_count;
--
2.11.0
* [RFC PATCH for 4.15 4/6] membarrier: Provide core serializing command
From: Mathieu Desnoyers @ 2017-11-08 18:35 UTC (permalink / raw)
To: Paul E. McKenney
Cc: linux-kernel, linux-api, Peter Zijlstra, Andy Lutomirski,
Boqun Feng, Andrew Hunter, Maged Michael, gromer, Avi Kivity,
Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Dave Watson, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
Andrea Parri, x86, Mathieu Desnoyers
Provide a core serializing membarrier command to support memory reclaim
by JITs.
Each architecture needs to explicitly opt into that support by
documenting in its architecture code how it provides the core
serializing instructions required when returning from the membarrier
IPI, and after the scheduler has updated the curr->mm pointer (before
going back to user-space). It should then select
ARCH_HAS_MEMBARRIER_SYNC_CORE to enable support for that command on
that architecture.
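A hedged sketch of the intended use case (not part of the patch): a user-space
JIT reclaiming code memory would register once, then issue the sync-core
command before reusing a region. It assumes a kernel and uapi header carrying
this series; jit_init() and jit_reclaim_code() are hypothetical names:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>

static int sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

static int jit_init(void)
{
	/* Opt the process in once, at startup. */
	return sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0);
}

static void jit_reclaim_code(void *region, size_t len)
{
	/* JIT-specific: unpublish every pointer into 'region' first. */

	/*
	 * Ensure every running thread of this process executes a core
	 * serializing instruction before it can run code placed in the
	 * reused region.
	 */
	sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0);

	/* 'region' may now be rewritten with fresh instructions. */
	(void)region;
	(void)len;
}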
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Andy Lutomirski <luto@kernel.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andrea Parri <parri.andrea@gmail.com>
---
include/linux/sched/mm.h | 9 +++++--
include/uapi/linux/membarrier.h | 14 ++++++++---
init/Kconfig | 3 +++
kernel/sched/membarrier.c | 56 ++++++++++++++++++++++++++++++-----------
4 files changed, 61 insertions(+), 21 deletions(-)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index ff1d510a9c51..a888da398517 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -219,8 +219,13 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
#ifdef CONFIG_MEMBARRIER
enum {
- MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
- MEMBARRIER_STATE_SWITCH_MM = (1U << 1),
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
+ MEMBARRIER_STATE_SWITCH_MM = (1U << 1),
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 2),
+};
+
+enum {
+ MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
};
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 4e01ad7ffe98..4dd4715a6c08 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -64,18 +64,24 @@
* Register the process intent to use
* MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
* returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ * TODO
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ * TODO
*
* Command to be passed to the membarrier system call. The commands need to
* be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
* the value 0.
*/
enum membarrier_cmd {
- MEMBARRIER_CMD_QUERY = 0,
- MEMBARRIER_CMD_SHARED = (1 << 0),
+ MEMBARRIER_CMD_QUERY = 0,
+ MEMBARRIER_CMD_SHARED = (1 << 0),
/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
- MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
- MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
};
#endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/init/Kconfig b/init/Kconfig
index ba7f17971554..ea4a7ce10328 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1403,6 +1403,9 @@ config MEMBARRIER
config ARCH_HAS_MEMBARRIER_HOOKS
bool
+config ARCH_HAS_MEMBARRIER_SYNC_CORE
+ bool
+
config EMBEDDED
bool "Embedded system"
option allnoconfig_y
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 23bd256f8458..6b3cbaa03ed7 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -26,24 +26,41 @@
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
- | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
-static int membarrier_private_expedited(void)
+static int membarrier_private_expedited(int flags)
{
int cpu;
bool fallback = false;
cpumask_var_t tmpmask;
- if (!(atomic_read(&current->mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
- return -EPERM;
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ if (!(atomic_read(&current->mm->membarrier_state)
+ & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+ return -EPERM;
+ } else {
+ if (!(atomic_read(&current->mm->membarrier_state)
+ & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+ }
if (num_online_cpus() == 1)
return 0;
@@ -103,22 +120,28 @@ static int membarrier_private_expedited(void)
return 0;
}
-static void membarrier_register_private_expedited(void)
+static int membarrier_register_private_expedited(int flags)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ }
/*
* We need to consider threads belonging to different thread
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
- if (atomic_read(&mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
- return;
- membarrier_arch_register_private_expedited(p);
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
- &mm->membarrier_state);
+ if (atomic_read(&mm->membarrier_state) & state)
+ return 0;
+ membarrier_arch_register_private_expedited(p, flags);
+ atomic_or(state, &mm->membarrier_state);
+ return 0;
}
/**
@@ -169,10 +192,13 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
synchronize_sched();
return 0;
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited();
+ return membarrier_private_expedited(0);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
- membarrier_register_private_expedited();
- return 0;
+ return membarrier_register_private_expedited(0);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
default:
return -EINVAL;
}
--
2.11.0
* [RFC PATCH for 4.15 5/6] membarrier: x86: Provide core serializing command
From: Mathieu Desnoyers @ 2017-11-08 18:35 UTC (permalink / raw)
To: Paul E. McKenney
Cc: linux-kernel, linux-api, Peter Zijlstra, Andy Lutomirski,
Boqun Feng, Andrew Hunter, Maged Michael, gromer, Avi Kivity,
Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Dave Watson, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
Andrea Parri, x86, Mathieu Desnoyers
There are two places where core serialization is needed by membarrier:
1) When returning from the membarrier IPI,
2) After the scheduler updates curr to a thread with a different mm, before
going back to user-space, since curr->mm is used by membarrier to
check whether it needs to send an IPI to that CPU.
x86-32 uses iret both to return from interrupts and to go back to
user-space. The iret instruction is core serializing.
x86-64 uses iret to return from interrupts, which takes care of the IPI.
However, it can return to user-space through either sysretl (compat
code), sysretq, or iret. Given that sysret{l,q} is not core serializing,
we instead rely on the write_cr3() performed by switch_mm() to provide
core serialization after changing the current mm, and deal with the
special case of kthread -> uthread (temporarily keeping the current mm
in active_mm) by adding a sync_core() in that specific case.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Andy Lutomirski <luto@kernel.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andrea Parri <parri.andrea@gmail.com>
CC: x86@kernel.org
---
MAINTAINERS | 2 ++
arch/powerpc/include/asm/membarrier.h | 8 ++++++-
arch/powerpc/kernel/membarrier.c | 3 ++-
arch/x86/Kconfig | 2 ++
arch/x86/entry/entry_32.S | 5 +++++
arch/x86/entry/entry_64.S | 8 +++++++
arch/x86/include/asm/membarrier.h | 36 ++++++++++++++++++++++++++++++++
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/membarrier.c | 39 +++++++++++++++++++++++++++++++++++
arch/x86/mm/tlb.c | 7 ++++---
include/linux/sched/mm.h | 9 +++++++-
kernel/sched/core.c | 6 +++++-
12 files changed, 119 insertions(+), 7 deletions(-)
create mode 100644 arch/x86/include/asm/membarrier.h
create mode 100644 arch/x86/kernel/membarrier.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 34687a0ec28c..ff564e5195fb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8831,6 +8831,8 @@ F: kernel/sched/membarrier.c
F: include/uapi/linux/membarrier.h
F: arch/powerpc/kernel/membarrier.c
F: arch/powerpc/include/asm/membarrier.h
+F: arch/x86/kernel/membarrier.c
+F: arch/x86/include/asm/membarrier.h
MEMORY MANAGEMENT
L: linux-mm@kvack.org
diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
index 0951646253d9..018cf278dc93 100644
--- a/arch/powerpc/include/asm/membarrier.h
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -21,6 +21,12 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
*/
smp_mb();
}
-void membarrier_arch_register_private_expedited(struct task_struct *t);
+
+static inline void membarrier_arch_mm_sync_core(void)
+{
+}
+
+void membarrier_arch_register_private_expedited(struct task_struct *t,
+ int flags);
#endif /* _ASM_POWERPC_MEMBARRIER_H */
diff --git a/arch/powerpc/kernel/membarrier.c b/arch/powerpc/kernel/membarrier.c
index 4795ad59b833..0026d740e5a3 100644
--- a/arch/powerpc/kernel/membarrier.c
+++ b/arch/powerpc/kernel/membarrier.c
@@ -21,7 +21,8 @@
#include <linux/rcupdate.h>
#include <linux/atomic.h>
-void membarrier_arch_register_private_expedited(struct task_struct *p)
+void membarrier_arch_register_private_expedited(struct task_struct *p,
+ int flags)
{
struct mm_struct *mm = p->mm;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fdb23313dd5..6ac32fe768a8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -54,6 +54,8 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_MEMBARRIER_HOOKS
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PMEM_API if X86_64
# Causing hangs/crashes, see the commit that added this change for details.
select ARCH_HAS_REFCOUNT if BROKEN
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4838037f97f6..04e5daba8456 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -553,6 +553,11 @@ restore_all:
.Lrestore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
.Lirq_return:
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on iret core serialization
+ * when returning from IPI handler and when returning from
+ * scheduler to user-space.
+ */
INTERRUPT_RETURN
.section .fixup, "ax"
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bcfc5668dcb2..4859f04e1695 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -642,6 +642,10 @@ GLOBAL(restore_regs_and_iret)
restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on iret core serialization
+ * when returning from IPI handler.
+ */
INTERRUPT_RETURN
ENTRY(native_iret)
@@ -1122,6 +1126,10 @@ paranoid_exit_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on iret core serialization
+ * when returning from IPI handler.
+ */
INTERRUPT_RETURN
END(paranoid_exit)
diff --git a/arch/x86/include/asm/membarrier.h b/arch/x86/include/asm/membarrier.h
new file mode 100644
index 000000000000..d22aac77047c
--- /dev/null
+++ b/arch/x86/include/asm/membarrier.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_MEMBARRIER_H
+#define _ASM_X86_MEMBARRIER_H
+
+#include <asm/processor.h>
+
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+ struct mm_struct *next, struct task_struct *tsk)
+{
+}
+
+#ifdef CONFIG_X86_32
+static inline void membarrier_arch_mm_sync_core(struct mm_struct *mm)
+{
+}
+static inline
+void membarrier_arch_register_private_expedited(struct task_struct *t,
+ int flags);
+#else
+/*
+ * x86-64 implements return to user-space through sysret, which is not a
+ * core-serializing instruction. Therefore, we need an explicit core
+ * serializing instruction after going from kernel thread back to
+ * user-space thread (active_mm moved back to current mm).
+ */
+static inline void membarrier_arch_mm_sync_core(struct mm_struct *mm)
+{
+ if (likely(!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_SYNC_CORE)))
+ return;
+ sync_core();
+}
+void membarrier_arch_register_private_expedited(struct task_struct *t,
+ int flags);
+#endif
+
+#endif /* _ASM_X86_MEMBARRIER_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5f70044340ff..13d6738b26c5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+obj-$(CONFIG_X86_64) += membarrier.o
obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/membarrier.c b/arch/x86/kernel/membarrier.c
new file mode 100644
index 000000000000..978698d7da3d
--- /dev/null
+++ b/arch/x86/kernel/membarrier.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call - x86 architecture code
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/thread_info.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+
+void membarrier_arch_register_private_expedited(struct task_struct *p,
+ int flags)
+{
+ struct mm_struct *mm = p->mm;
+
+ if (!(flags & MEMBARRIER_FLAG_SYNC_CORE))
+ return;
+ atomic_or(MEMBARRIER_STATE_SYNC_CORE, &mm->membarrier_state);
+ if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)
+ return;
+ /*
+ * Ensure all future scheduler executions will observe the new
+ * thread flag state for this process.
+ */
+ synchronize_sched();
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5abf9bfcca1f..3b13d6735fa5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -147,9 +147,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.is_lazy, false);
/*
- * The membarrier system call requires a full memory barrier
- * before returning to user-space, after storing to rq->curr.
- * Writing to CR3 provides that full memory barrier.
+ * The membarrier system call requires a full memory barrier and
+ * core serialization before returning to user-space, after
+ * storing to rq->curr. Writing to CR3 provides that full
+ * memory barrier and core serializing instruction.
*/
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index a888da398517..5561b92b597a 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -222,6 +222,7 @@ enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
MEMBARRIER_STATE_SWITCH_MM = (1U << 1),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 2),
+ MEMBARRIER_STATE_SYNC_CORE = (1U << 3),
};
enum {
@@ -232,7 +233,10 @@ enum {
#include <asm/membarrier.h>
#else
static inline void membarrier_arch_register_private_expedited(
- struct task_struct *p)
+ struct task_struct *p, int flags)
+{
+}
+static inline void membarrier_arch_mm_sync_core(struct mm_struct *mm)
{
}
#endif
@@ -251,6 +255,9 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
static inline void membarrier_execve(struct task_struct *t)
{
}
+static inline void membarrier_arch_mm_sync_core(struct mm_struct *mm)
+{
+}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a176892b4f0..b5194cfc2199 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2653,9 +2653,13 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* thread, mmdrop()'s implicit full barrier is required by the
* membarrier system call, because the current active_mm can
* become the current mm without going through switch_mm().
+ * membarrier also requires a core serializing instruction
+ * before going back to user-space after storing to rq->curr.
*/
- if (mm)
+ if (mm) {
mmdrop(mm);
+ membarrier_arch_mm_sync_core(mm);
+ }
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
--
2.11.0
* Re: [RFC PATCH for 4.15 5/6] membarrier: x86: Provide core serializing command
From: Andy Lutomirski @ 2017-11-09 19:07 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Paul E. McKenney, linux-kernel@vger.kernel.org, Linux API,
Peter Zijlstra, Andy Lutomirski, Boqun Feng, Andrew Hunter,
Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
Paul Mackerras, Michael Ellerman, Dave Watson, Thomas Gleixner,
Ingo Molnar, H . Peter Anvin, Andrea Parri, X86 ML
On Wed, Nov 8, 2017 at 10:35 AM, Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
> +/*
> + * x86-64 implements return to user-space through sysret, which is not a
> + * core-serializing instruction. Therefore, we need an explicit core
> + * serializing instruction after going from kernel thread back to
> + * user-space thread (active_mm moved back to current mm).
> + */
> +static inline void membarrier_arch_mm_sync_core(struct mm_struct *mm)
> +{
> + if (likely(!(atomic_read(&mm->membarrier_state) &
> + MEMBARRIER_STATE_SYNC_CORE)))
> + return;
> + sync_core();
> +}
IMO there should be an extremely clear specification somewhere for
what this function is supposed to do.
If I remember correctly, it's supposed to promise that the icache is
synced before the next time we return to usermode for the current mm
on this CPU. If that's correct, then let's document it very
explicitly and let's also drop the "membarrier" from the name -- it's
a primitive we'll need anyway given the existing migration bug.
* [RFC PATCH for 4.15 6/6] membarrier: selftest: Test private expedited sync core cmd
From: Mathieu Desnoyers @ 2017-11-08 18:35 UTC (permalink / raw)
To: Paul E. McKenney
Cc: linux-kernel, linux-api, Peter Zijlstra, Andy Lutomirski,
Boqun Feng, Andrew Hunter, Maged Michael, gromer, Avi Kivity,
Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Dave Watson, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
Andrea Parri, x86, Mathieu Desnoyers, Shuah Khan,
Greg Kroah-Hartman, Alan
Test the new MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE and
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE commands.
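For completeness, a minimal sketch (not part of the patch) of the
feature-detection step the updated test_membarrier() performs: the sync-core
commands are only advertised by MEMBARRIER_CMD_QUERY on architectures that
select ARCH_HAS_MEMBARRIER_SYNC_CORE, so query before exercising them:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0)
		return 1;	/* membarrier not available at all */
	if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE)
		printf("sync-core commands supported, exercising them\n");
	else
		printf("sync-core commands not supported, skipping\n");
	return 0;
}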
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Shuah Khan <shuahkh@osg.samsung.com>
CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Alan Stern <stern@rowland.harvard.edu>
CC: Will Deacon <will.deacon@arm.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: Alice Ferrazzi <alice.ferrazzi@gmail.com>
CC: Paul Elder <paul.elder@pitt.edu>
CC: linux-kselftest@vger.kernel.org
CC: linux-arch@vger.kernel.org
---
.../testing/selftests/membarrier/membarrier_test.c | 77 +++++++++++++++++++++-
1 file changed, 76 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
index d7543a6d9030..a0eae8d51e72 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test.c
@@ -132,6 +132,63 @@ static int test_membarrier_private_expedited_success(void)
return 0;
}
+static int test_membarrier_private_expedited_sync_core_fail(void)
+{
+ int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE not registered failure";
+
+ if (sys_membarrier(cmd, flags) != -1) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should fail, but passed\n",
+ test_name, flags);
+ }
+ if (errno != EPERM) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EPERM, strerror(EPERM),
+ errno, strerror(errno));
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ return 0;
+}
+
+static int test_membarrier_register_private_expedited_sync_core_success(void)
+{
+ int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_private_expedited_sync_core_success(void)
+{
+ int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
static int test_membarrier(void)
{
int status;
@@ -154,6 +211,22 @@ static int test_membarrier(void)
status = test_membarrier_private_expedited_success();
if (status)
return status;
+ status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);
+ if (status < 0) {
+ ksft_test_result_fail("sys_membarrier() failed\n");
+ return status;
+ }
+ if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+ status = test_membarrier_private_expedited_sync_core_fail();
+ if (status)
+ return status;
+ status = test_membarrier_register_private_expedited_sync_core_success();
+ if (status)
+ return status;
+ status = test_membarrier_private_expedited_sync_core_success();
+ if (status)
+ return status;
+ }
return 0;
}
@@ -173,8 +246,10 @@ static int test_membarrier_query(void)
}
ksft_exit_fail_msg("sys_membarrier() failed\n");
}
- if (!(ret & MEMBARRIER_CMD_SHARED))
+ if (!(ret & MEMBARRIER_CMD_SHARED)) {
+ ksft_test_result_fail("sys_membarrier() CMD_SHARED query failed\n");
ksft_exit_fail_msg("sys_membarrier is not supported.\n");
+ }
ksft_test_result_pass("sys_membarrier available\n");
return 0;
--
2.11.0