* [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Mathieu Desnoyers @ 2015-07-12 18:06 UTC (permalink / raw)
To: Paul Turner
Cc: Mathieu Desnoyers, Andrew Hunter, Peter Zijlstra, Ingo Molnar,
Ben Maurer, Steven Rostedt, Paul E. McKenney, Josh Triplett,
Lai Jiangshan, Linus Torvalds, Andrew Morton,
linux-api-u79uwXL29TY76Z2rM5mHXA
Expose a new system call allowing threads to register a userspace memory
area in which to store the current CPU number. Scheduler migration sets the
TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
a notify-resume handler updates the current CPU value within that
user-space memory area.
This getcpu cache is an alternative to the sched_getcpu() vDSO, with
a few benefits:
- It is faster to do a memory read than to call a vDSO,
- This cached value can be read from within inline assembly, which
makes it a useful building block for restartable sequences.
This approach is inspired by Paul Turner and Andrew Hunter's work
on percpu atomics, which lets the kernel handle restart of critical
sections:
Ref.:
* https://lkml.org/lkml/2015/6/24/665
* https://lwn.net/Articles/650333/
* http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
Benchmarking sched_getcpu() vs tls cache approach. Getting the
current CPU number:
- With Linux vdso: 12.7 ns
- With TLS-cached cpu number: 0.3 ns
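
For illustration, a minimal user-space usage sketch (not part of this
patch; the wrapper names are made up, and 323 is the x86-64 syscall
number assigned below):

#define _GNU_SOURCE
#include <stdint.h>
#include <unistd.h>

static __thread int32_t cpu_cache = -1;         /* one cache per thread */

static int getcpu_cache_register(void)
{
        /* Register this thread's cache with the kernel. */
        return syscall(323, &cpu_cache, 0);
}

static inline int32_t read_cpu(void)
{
        /* Volatile read: the kernel rewrites it after each migration. */
        return *(volatile int32_t *)&cpu_cache;
}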
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
CC: Paul Turner <pjt-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
CC: Andrew Hunter <ahh-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
CC: Peter Zijlstra <peterz-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
CC: Ingo Molnar <mingo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
CC: Ben Maurer <bmaurer-b10kYP2dOMg@public.gmane.org>
CC: Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org>
CC: "Paul E. McKenney" <paulmck-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
CC: Josh Triplett <josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org>
CC: Lai Jiangshan <laijs-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
CC: Linus Torvalds <torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
CC: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
CC: linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
arch/x86/kernel/signal.c | 2 ++
arch/x86/syscalls/syscall_64.tbl | 1 +
fs/exec.c | 1 +
include/linux/sched.h | 27 +++++++++++++++
include/uapi/asm-generic/unistd.h | 4 ++-
init/Kconfig | 9 +++++
kernel/Makefile | 1 +
kernel/fork.c | 2 ++
kernel/getcpu-cache.c | 70 +++++++++++++++++++++++++++++++++++++++
kernel/sched/core.c | 3 ++
kernel/sched/sched.h | 2 ++
kernel/sys_ni.c | 3 ++
12 files changed, 124 insertions(+), 1 deletion(-)
create mode 100644 kernel/getcpu-cache.c
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e504246..157cec0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -750,6 +750,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+ if (getcpu_cache_active(current))
+ getcpu_cache_handle_notify_resume(current);
}
if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fb..cfcf8e7 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -329,6 +329,7 @@
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+323 common getcpu_cache sys_getcpu_cache
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/exec.c b/fs/exec.c
index c7f9b73..20ef2e6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1555,6 +1555,7 @@ static int do_execveat_common(int fd, struct filename *filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
+ getcpu_cache_execve(current);
acct_update_integrals(current);
task_numa_free(current);
free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a419b65..0654cc2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1710,6 +1710,9 @@ struct task_struct {
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
+#ifdef CONFIG_GETCPU_CACHE
+ int32_t __user *getcpu_cache;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -3090,4 +3093,28 @@ static inline unsigned long rlimit_max(unsigned int limit)
return task_rlimit_max(current, limit);
}
+#ifdef CONFIG_GETCPU_CACHE
+void getcpu_cache_fork(struct task_struct *t);
+void getcpu_cache_execve(struct task_struct *t);
+void getcpu_cache_handle_notify_resume(struct task_struct *t);
+static inline bool getcpu_cache_active(struct task_struct *t)
+{
+ return t->getcpu_cache;
+}
+#else
+static inline void getcpu_cache_fork(struct task_struct *t)
+{
+}
+static inline void getcpu_cache_execve(struct task_struct *t)
+{
+}
+static inline void getcpu_cache_handle_notify_resume(struct task_struct *t)
+{
+}
+static inline bool getcpu_cache_active(struct task_struct *t)
+{
+ return false;
+}
+#endif
+
#endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9..f82b70d 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_getcpu_cache 282
+__SYSCALL(__NR_getcpu_cache, sys_getcpu_cache)
#undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
/*
* All syscalls below here should go away really,
diff --git a/init/Kconfig b/init/Kconfig
index f5dbc6d..fac919b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1559,6 +1559,15 @@ config PCI_QUIRKS
bugs/quirks. Disable this only if your target machine is
unaffected by PCI quirks.
+config GETCPU_CACHE
+ bool "Enable getcpu_cache() system call" if EXPERT
+ default y
+ help
+ Enable the getcpu_cache() system call which provides a
+ user-space cache for the current CPU number value.
+
+ If unsure, say Y.
+
config EMBEDDED
bool "Embedded system"
option allnoconfig_y
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b33..3350ba1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_GETCPU_CACHE) += getcpu-cache.o
$(obj)/configs.o: $(obj)/config_data.h
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139..334e62d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1549,6 +1549,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
+ if (!(clone_flags & CLONE_THREAD))
+ getcpu_cache_fork(p);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
diff --git a/kernel/getcpu-cache.c b/kernel/getcpu-cache.c
new file mode 100644
index 0000000..b4e5c77
--- /dev/null
+++ b/kernel/getcpu-cache.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2015 Mathieu Desnoyers <mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
+ *
+ * getcpu_cache system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+/*
+ * This resume handler should always be executed between a migration
+ * triggered by preemption and return to user-space.
+ */
+void getcpu_cache_handle_notify_resume(struct task_struct *t)
+{
+ int32_t __user *gcp = t->getcpu_cache;
+
+ if (gcp == NULL)
+ return;
+ if (unlikely(t->flags & PF_EXITING))
+ return;
+ /*
+ * access_ok() of gcp_user has already been checked by
+ * sys_getcpu_cache().
+ */
+ if (__put_user(raw_smp_processor_id(), gcp))
+ force_sig(SIGSEGV, current);
+}
+
+/*
+ * If parent process has a getcpu_cache, the child inherits. Only
+ * applies when forking a process, not a thread.
+ */
+void getcpu_cache_fork(struct task_struct *t)
+{
+ t->getcpu_cache = current->getcpu_cache;
+}
+
+void getcpu_cache_execve(struct task_struct *t)
+{
+ t->getcpu_cache = NULL;
+}
+
+/*
+ * sys_getcpu_cache - setup getcpu cache for caller thread
+ */
+SYSCALL_DEFINE2(getcpu_cache, int32_t __user *, gcp, int, flags)
+{
+ if (flags)
+ return -EINVAL;
+ if (gcp != NULL && !access_ok(VERIFY_WRITE, gcp, sizeof(int32_t)))
+ return -EFAULT;
+ current->getcpu_cache = gcp;
+ /* Will update *gcp on resume */
+ if (gcp)
+ set_thread_flag(TIF_NOTIFY_RESUME);
+ return 0;
+}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62671f5..a9009d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1823,6 +1823,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_GETCPU_CACHE
+ p->getcpu_cache = NULL;
+#endif
}
#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc0f435..bf3e346 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -921,6 +921,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
set_task_rq(p, cpu);
#ifdef CONFIG_SMP
+ if (getcpu_cache_active(p))
+ set_tsk_thread_flag(p, TIF_NOTIFY_RESUME);
/*
* After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
* successfuly executed on another CPU. We must ensure that updates of
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5adcb0a..3691dc8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -229,3 +229,6 @@ cond_syscall(sys_bpf);
/* execveat */
cond_syscall(sys_execveat);
+
+/* current CPU number cache */
+cond_syscall(sys_getcpu_cache);
--
2.1.4
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Josh Triplett @ 2015-07-12 18:47 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Paul Turner, Andrew Hunter, Peter Zijlstra, Ingo Molnar,
Ben Maurer, Steven Rostedt, Paul E. McKenney, Lai Jiangshan,
Linus Torvalds, Andrew Morton, linux-api-u79uwXL29TY76Z2rM5mHXA
On Sun, Jul 12, 2015 at 02:06:26PM -0400, Mathieu Desnoyers wrote:
> Expose a new system call allowing threads to register a userspace memory
> area in which to store the current CPU number. Scheduler migration sets the
> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
> a notify-resume handler updates the current CPU value within that
> user-space memory area.
>
> This getcpu cache is an alternative to the sched_getcpu() vDSO, with
> a few benefits:
> - It is faster to do a memory read than to call a vDSO,
> - This cached value can be read from within inline assembly, which
> makes it a useful building block for restartable sequences.
>
> This approach is inspired by Paul Turner and Andrew Hunter's work
> on percpu atomics, which lets the kernel handle restart of critical
> sections:
> Ref.:
> * https://lkml.org/lkml/2015/6/24/665
> * https://lwn.net/Articles/650333/
> * http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
>
> Benchmarking sched_getcpu() vs tls cache approach. Getting the
> current CPU number:
>
> - With Linux vdso: 12.7 ns
> - With TLS-cached cpu number: 0.3 ns
Nice. One comment below about an interesting assumption that needs
confirmation.
> --- /dev/null
> +++ b/kernel/getcpu-cache.c
[...]
> +void getcpu_cache_handle_notify_resume(struct task_struct *t)
> +{
> + int32_t __user *gcp = t->getcpu_cache;
> +
> + if (gcp == NULL)
> + return;
> + if (unlikely(t->flags & PF_EXITING))
> + return;
> + /*
> + * access_ok() of gcp_user has already been checked by
> + * sys_getcpu_cache().
> + */
> + if (__put_user(raw_smp_processor_id(), gcp))
> + force_sig(SIGSEGV, current);
> +}
> +
> +/*
> + * If parent process has a getcpu_cache, the child inherits. Only
> + * applies when forking a process, not a thread.
> + */
> +void getcpu_cache_fork(struct task_struct *t)
> +{
> + t->getcpu_cache = current->getcpu_cache;
> +}
> +
> +void getcpu_cache_execve(struct task_struct *t)
> +{
> + t->getcpu_cache = NULL;
> +}
> +
> +/*
> + * sys_getcpu_cache - setup getcpu cache for caller thread
> + */
> +SYSCALL_DEFINE2(getcpu_cache, int32_t __user *, gcp, int, flags)
> +{
> + if (flags)
> + return -EINVAL;
> + if (gcp != NULL && !access_ok(VERIFY_WRITE, gcp, sizeof(int32_t)))
> + return -EFAULT;
> + current->getcpu_cache = gcp;
So, you store a userspace address, and intentionally only validate it
when initially set, not when used. You clear it on exec, though not on
fork. Could any cases other than exec make this problematic? In
particular, what about unusual personality flags, such as
ADDR_LIMIT_32BIT or ADDR_LIMIT_3GB?
> + /* Will update *gcp on resume */
> + if (gcp)
Minor nit: you're using the pointer as a boolean here, but comparing it
to NULL elsewhere; you should be consistent. I'd suggest consistently
using gcp and !gcp, without the comparison to NULL.
- Josh Triplett
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Andy Lutomirski @ 2015-07-13 3:38 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Paul E. McKenney, Ben Maurer, Ingo Molnar, Andrew Morton,
Josh Triplett, Lai Jiangshan, Paul Turner, Steven Rostedt,
Andrew Hunter, Linux API, Linus Torvalds, Peter Zijlstra
On Jul 12, 2015 12:06 PM, "Mathieu Desnoyers"
<mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
>
> Expose a new system call allowing threads to register a userspace memory
> area in which to store the current CPU number. Scheduler migration sets the
> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
> a notify-resume handler updates the current CPU value within that
> user-space memory area.
>
> This getcpu cache is an alternative to the sched_getcpu() vDSO, with
> a few benefits:
> - It is faster to do a memory read than to call a vDSO,
> - This cached value can be read from within inline assembly, which
> makes it a useful building block for restartable sequences.
>
Let's wait and see what the final percpu atomic solution is. If it
involves percpu segments, then this is unnecessary.
Also, this will need to be rebased onto -tip, and that should wait
until the big exit rewrite is farther along.
> This approach is inspired by Paul Turner and Andrew Hunter's work
> on percpu atomics, which lets the kernel handle restart of critical
> sections:
> Ref.:
> * https://lkml.org/lkml/2015/6/24/665
> * https://lwn.net/Articles/650333/
> * http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
>
> Benchmarking sched_getcpu() vs tls cache approach. Getting the
> current CPU number:
>
> - With Linux vdso: 12.7 ns
This is a bit unfair, because the glibc wrapper sucks and the
__vdso_getcpu interface is overcomplicated. We can fix it with a
better API. It won't make it *that* much faster, though.
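
(For reference, the vDSO symbol currently takes three pointer arguments:

        long __vdso_getcpu(unsigned *cpu, unsigned *node,
                           struct getcpu_cache *tcache);

so a caller that only wants the CPU number still pays for arguments it
does not use.)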
>
> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
> index e504246..157cec0 100644
> --- a/arch/x86/kernel/signal.c
> +++ b/arch/x86/kernel/signal.c
> @@ -750,6 +750,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
> if (thread_info_flags & _TIF_NOTIFY_RESUME) {
> clear_thread_flag(TIF_NOTIFY_RESUME);
> tracehook_notify_resume(regs);
> + if (getcpu_cache_active(current))
> + getcpu_cache_handle_notify_resume(current);
We need to disentangle this stuff. This is buried way too deeply here.
Fortunately, do_notify_resume is going away. It's already unused on
64-bit kernels in -next.
> +/*
> + * This resume handler should always be executed between a migration
> + * triggered by preemption and return to user-space.
> + */
> +void getcpu_cache_handle_notify_resume(struct task_struct *t)
> +{
> + int32_t __user *gcp = t->getcpu_cache;
> +
> + if (gcp == NULL)
> + return;
> + if (unlikely(t->flags & PF_EXITING))
> + return;
> + /*
> + * access_ok() of gcp_user has already been checked by
> + * sys_getcpu_cache().
> + */
> + if (__put_user(raw_smp_processor_id(), gcp))
> + force_sig(SIGSEGV, current);
We're preemptible here, although I think it's okay. But I'd at least
clear the getcpu_cache state if __put_user fails, because otherwise
it's not entirely obvious to me that we can't infinite loop.
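
Something like this (untested sketch) would make that explicit:

        if (__put_user(raw_smp_processor_id(), gcp)) {
                t->getcpu_cache = NULL; /* stop re-arming notify-resume */
                force_sig(SIGSEGV, current);
        }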
> +/*
> + * sys_getcpu_cache - setup getcpu cache for caller thread
> + */
> +SYSCALL_DEFINE2(getcpu_cache, int32_t __user *, gcp, int, flags)
> +{
> + if (flags)
> + return -EINVAL;
> + if (gcp != NULL && !access_ok(VERIFY_WRITE, gcp, sizeof(int32_t)))
> + return -EFAULT;
> + current->getcpu_cache = gcp;
> + /* Will update *gcp on resume */
> + if (gcp)
> + set_thread_flag(TIF_NOTIFY_RESUME);
> + return 0;
> +}
IMO this is impolite. If the pointer is bad, we should return -EFAULT
rather than sending SIGSEGV.
--Andy
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Andy Lutomirski @ 2015-07-13 3:40 UTC (permalink / raw)
To: Josh Triplett
Cc: Mathieu Desnoyers, Paul Turner, Andrew Hunter, Peter Zijlstra,
Ingo Molnar, Ben Maurer, Steven Rostedt, Paul E. McKenney,
Lai Jiangshan, Linus Torvalds, Andrew Morton, Linux API
On Sun, Jul 12, 2015 at 11:47 AM, Josh Triplett <josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org> wrote:
> On Sun, Jul 12, 2015 at 02:06:26PM -0400, Mathieu Desnoyers wrote:
>> Expose a new system call allowing threads to register a userspace memory
>> area in which to store the current CPU number. Scheduler migration sets the
>> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
>> a notify-resume handler updates the current CPU value within that
>> user-space memory area.
>>
>> +
>> +/*
>> + * sys_getcpu_cache - setup getcpu cache for caller thread
>> + */
>> +SYSCALL_DEFINE2(getcpu_cache, int32_t __user *, gcp, int, flags)
>> +{
>> + if (flags)
>> + return -EINVAL;
>> + if (gcp != NULL && !access_ok(VERIFY_WRITE, gcp, sizeof(int32_t)))
>> + return -EFAULT;
>> + current->getcpu_cache = gcp;
>
> So, you store a userspace address, and intentionally only validate it
> when initially set, not when used. You clear it on exec, though not on
> fork. Could any cases other than exec make this problematic? In
> particular, what about unusual personality flags, such as
> ADDR_LIMIT_32BIT or ADDR_LIMIT_3GB?
>
On x86, this is safe, although it does violate the rules.
That being said, I've never understood why access_ok cares at all what
kind of task we're in.
* RE: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Ben Maurer @ 2015-07-13 11:17 UTC (permalink / raw)
To: Mathieu Desnoyers, Paul Turner
Cc: Andrew Hunter, Peter Zijlstra, Ingo Molnar, Steven Rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Linus Torvalds,
Andrew Morton, linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
At Facebook we already use getcpu in folly, our base C++ library, to provide
high-performance concurrency algorithms. Folly includes an abstraction called
AccessSpreader which helps engineers write abstractions that shard themselves
across different cores to prevent cache contention
(https://github.com/facebook/folly/blob/master/folly/detail/CacheLocality.cpp).
We have used this primitive to create faster reader-writer locks
(https://github.com/facebook/folly/blob/master/folly/SharedMutex.h), as well as
in an abstraction that powers workqueues
(https://github.com/facebook/folly/blob/master/folly/IndexedMemPool.h). This
would be a great perf improvement for these types of abstractions and would
probably encourage us to use the idea more widely.
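
(Roughly, the pattern is to pick a stripe by current CPU so that threads
running on different cores touch different cache lines. A hypothetical
sketch, with the stripe count and padding made up:)

#define _GNU_SOURCE
#include <sched.h>
#include <stdatomic.h>

#define NSTRIPES 64                             /* hypothetical */

static struct {
        _Atomic long v;
        char pad[56];                           /* avoid false sharing */
} stripes[NSTRIPES];

static void count_event(void)
{
        /* sched_getcpu() today; a plain TLS read with this patch. */
        int s = sched_getcpu() % NSTRIPES;
        atomic_fetch_add_explicit(&stripes[s].v, 1, memory_order_relaxed);
}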
One quick comment on the approach -- it'd be really great if we had a method
that didn't require users to register each thread. This can often lead to
requiring an additional branch in critical code to check whether the
appropriate caches have been initialized. Also, one of the most interesting
potential applications of the restartable sequences concept is in malloc.
Having a brief period at the beginning of the life of a thread where malloc
didn't work would be pretty tricky to program around.
-b
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Mathieu Desnoyers @ 2015-07-13 15:09 UTC (permalink / raw)
To: Josh Triplett
Cc: Paul Turner, Andrew Hunter, Peter Zijlstra, Ingo Molnar,
Ben Maurer, rostedt, Paul E. McKenney, Lai Jiangshan,
Linus Torvalds, Andrew Morton, linux-api
----- On Jul 12, 2015, at 2:47 PM, Josh Triplett josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org wrote:
> On Sun, Jul 12, 2015 at 02:06:26PM -0400, Mathieu Desnoyers wrote:
>> Expose a new system call allowing threads to register a userspace memory
>> area in which to store the current CPU number. Scheduler migration sets the
>> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
>> a notify-resume handler updates the current CPU value within that
>> user-space memory area.
>>
>> This getcpu cache is an alternative to the sched_getcpu() vDSO, with
>> a few benefits:
>> - It is faster to do a memory read than to call a vDSO,
>> - This cached value can be read from within inline assembly, which
>> makes it a useful building block for restartable sequences.
>>
>> This approach is inspired by Paul Turner and Andrew Hunter's work
>> on percpu atomics, which lets the kernel handle restart of critical
>> sections:
>> Ref.:
>> * https://lkml.org/lkml/2015/6/24/665
>> * https://lwn.net/Articles/650333/
>> *
>> http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
>>
>> Benchmarking sched_getcpu() vs tls cache approach. Getting the
>> current CPU number:
>>
>> - With Linux vdso: 12.7 ns
>> - With TLS-cached cpu number: 0.3 ns
>
> Nice. One comment below about an interesting assumption that needs
> confirmation.
>
>> --- /dev/null
>> +++ b/kernel/getcpu-cache.c
> [...]
>> +void getcpu_cache_handle_notify_resume(struct task_struct *t)
>> +{
>> + int32_t __user *gcp = t->getcpu_cache;
>> +
>> + if (gcp == NULL)
>> + return;
>> + if (unlikely(t->flags & PF_EXITING))
>> + return;
>> + /*
>> + * access_ok() of gcp_user has already been checked by
>> + * sys_getcpu_cache().
>> + */
>> + if (__put_user(raw_smp_processor_id(), gcp))
>> + force_sig(SIGSEGV, current);
>> +}
>> +
>> +/*
>> + * If parent process has a getcpu_cache, the child inherits. Only
>> + * applies when forking a process, not a thread.
>> + */
>> +void getcpu_cache_fork(struct task_struct *t)
>> +{
>> + t->getcpu_cache = current->getcpu_cache;
>> +}
>> +
>> +void getcpu_cache_execve(struct task_struct *t)
>> +{
>> + t->getcpu_cache = NULL;
>> +}
>> +
>> +/*
>> + * sys_getcpu_cache - setup getcpu cache for caller thread
>> + */
>> +SYSCALL_DEFINE2(getcpu_cache, int32_t __user *, gcp, int, flags)
>> +{
>> + if (flags)
>> + return -EINVAL;
>> + if (gcp != NULL && !access_ok(VERIFY_WRITE, gcp, sizeof(int32_t)))
>> + return -EFAULT;
>> + current->getcpu_cache = gcp;
>
> So, you store a userspace address, and intentionally only validate it
> when initially set, not when used. You clear it on exec, though not on
> fork. Could any cases other than exec make this problematic? In
> particular, what about unusual personality flags, such as
> ADDR_LIMIT_32BIT or ADDR_LIMIT_3GB?
That's an interesting point. Looking at those personalities, I don't think
it should be an issue, but just the fact that you raise the question makes
me think we should use put_user() rather than __put_user() in the
notify_resume handler, just to be on the safe side. It should not be a
frequent code path anyway.
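
i.e.:

-	if (__put_user(raw_smp_processor_id(), gcp))
+	if (put_user(raw_smp_processor_id(), gcp))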
>
>> + /* Will update *gcp on resume */
>> + if (gcp)
>
> Minor nit: you're using the pointer as a boolean here, but comparing it
> to NULL elsewhere; you should be consistent. I'd suggest consistently
> using gcp and !gcp, without the comparison to NULL.
Good point, fixing,
Thanks for the feedback!
Mathieu
>
> - Josh Triplett
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Mathieu Desnoyers @ 2015-07-13 15:27 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Paul E. McKenney, Ben Maurer, Ingo Molnar, Andrew Morton,
Josh Triplett, Lai Jiangshan, Paul Turner, rostedt, Andrew Hunter,
linux-api, Linus Torvalds, Peter Zijlstra
----- On Jul 12, 2015, at 11:38 PM, Andy Lutomirski luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org wrote:
> On Jul 12, 2015 12:06 PM, "Mathieu Desnoyers"
> <mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
>>
>> Expose a new system call allowing threads to register a userspace memory
>> area where to store the current CPU number. Scheduler migration sets the
>> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
>> a notify-resume handler updates the current CPU value within that
>> user-space memory area.
>>
>> This getcpu cache is an alternative to the sched_getcpu() vDSO, with
>> a few benefits:
>> - It is faster to do a memory read than to call a vDSO,
>> - This cached value can be read from within inline assembly, which
>> makes it a useful building block for restartable sequences.
>>
>
> Let's wait and see what the final percpu atomic solution is. If it
> involves percpu segments, then this is unnecessary.
percpu segments will likely not solve everything. I have a use-case
with dynamically allocated per-cpu ring buffer in user-space (lttng-ust)
which can be a challenge for percpu segments. Having a fast getcpu()
is a win in those cases.
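
(The pattern there is roughly as follows, with read_cpu() standing for the
TLS cache read and the buffer types/helpers hypothetical; the buffers are
heap-allocated, which is what a fixed percpu segment cannot address
directly:)

#include <stddef.h>

/* bufs[i] is a ring buffer for CPU i, allocated at runtime. */
static struct ringbuf **bufs;

static void write_event(const void *ev, size_t len)
{
        struct ringbuf *rb = bufs[read_cpu()];  /* TLS-cached CPU number */

        ringbuf_reserve_commit(rb, ev, len);    /* hypothetical helper */
}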
>
> Also, this will need to be rebased onto -tip, and that should wait
> until the big exit rewrite is farther along.
I don't really care which thread flag it ends up using, and this is
more or less an internal implementation detail. The important part is
the ABI exposed to user-space, and it's good to start the discussion
on this aspect early.
>
>> This approach is inspired by Paul Turner and Andrew Hunter's work
>> on percpu atomics, which lets the kernel handle restart of critical
>> sections:
>> Ref.:
>> * https://lkml.org/lkml/2015/6/24/665
>> * https://lwn.net/Articles/650333/
>> *
>> http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
>>
>> Benchmarking sched_getcpu() vs tls cache approach. Getting the
>> current CPU number:
>>
>> - With Linux vdso: 12.7 ns
>
> This is a bit unfair, because the glibc wrapper sucks and the
> __vdso_getcpu interface is overcomplicated. We can fix it with a
> better API. It won't make it *that* much faster, though.
Even if we improve the vDSO function, we are at a point where just
the function call is not that cheap.
>
>>
>> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
>> index e504246..157cec0 100644
>> --- a/arch/x86/kernel/signal.c
>> +++ b/arch/x86/kernel/signal.c
>> @@ -750,6 +750,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32
>> thread_info_flags)
>> if (thread_info_flags & _TIF_NOTIFY_RESUME) {
>> clear_thread_flag(TIF_NOTIFY_RESUME);
>> tracehook_notify_resume(regs);
>> + if (getcpu_cache_active(current))
>> + getcpu_cache_handle_notify_resume(current);
>
> We need to disentangle this stuff. This is buried way too deeply here.
>
> Fortunately, do_notify_resume is going away. It's already unused on
> 64-bit kernels in -next.
Cool! Of course, I'm willing to rebase this on whichever thread flag
and notification upon resume to userspace makes more sense.
>
>> +/*
>> + * This resume handler should always be executed between a migration
>> + * triggered by preemption and return to user-space.
>> + */
>> +void getcpu_cache_handle_notify_resume(struct task_struct *t)
>> +{
>> + int32_t __user *gcp = t->getcpu_cache;
>> +
>> + if (gcp == NULL)
>> + return;
>> + if (unlikely(t->flags & PF_EXITING))
>> + return;
>> + /*
>> + * access_ok() of gcp_user has already been checked by
>> + * sys_getcpu_cache().
>> + */
>> + if (__put_user(raw_smp_processor_id(), gcp))
>> + force_sig(SIGSEGV, current);
>
> We're preemptible here, although I think it's okay. But I'd at least
> clear the getcpu_cache state if __put_user fails, because otherwise
> it's not entirely obvious to me that we can't infinite loop.
Good point. For safety's sake, I'll set t->getcpu_cache to NULL.
>
>> +/*
>> + * sys_getcpu_cache - setup getcpu cache for caller thread
>> + */
>> +SYSCALL_DEFINE2(getcpu_cache, int32_t __user *, gcp, int, flags)
>> +{
>> + if (flags)
>> + return -EINVAL;
>> + if (gcp != NULL && !access_ok(VERIFY_WRITE, gcp, sizeof(int32_t)))
>> + return -EFAULT;
>> + current->getcpu_cache = gcp;
>> + /* Will update *gcp on resume */
>> + if (gcp)
>> + set_thread_flag(TIF_NOTIFY_RESUME);
>> + return 0;
>> +}
>
> IMO this is impolite. If the pointer is bad, we should return -EFAULT
> rather than sending SIGSEGV.
OK, so I guess you mean we should do the __put_user() in getcpu_cache
too, rather than relying on the one in notify_resume, so we can handle
faults there and return -EFAULT rather than sending SIGSEGV. Yep, it
makes sense, will fix.
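
i.e., something along these lines (untested sketch):

SYSCALL_DEFINE2(getcpu_cache, int32_t __user *, gcp, int, flags)
{
        if (flags)
                return -EINVAL;
        /* Validate the address by writing the current CPU number now. */
        if (gcp && put_user(raw_smp_processor_id(), gcp))
                return -EFAULT;
        current->getcpu_cache = gcp;
        /* Keep *gcp up to date on each resume following a migration. */
        if (gcp)
                set_thread_flag(TIF_NOTIFY_RESUME);
        return 0;
}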
Thanks!
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Andrew Hunter @ 2015-07-13 15:30 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Andy Lutomirski, Paul E. McKenney, Ben Maurer, Ingo Molnar,
Andrew Morton, Josh Triplett, Lai Jiangshan, Paul Turner, rostedt,
linux-api, Linus Torvalds, Peter Zijlstra
On Mon, Jul 13, 2015 at 8:27 AM, Mathieu Desnoyers
<mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
> percpu segments will likely not solve everything. I have a use-case
> with dynamically allocated per-cpu ring buffer in user-space (lttng-ust)
> which can be a challenge for percpu segments. Having a fast getcpu()
> is a win in those cases.
>
Note that percpu segments allow userspace to trivially implement fast getcpu.
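
(e.g., if each CPU had its own gs base, userspace could keep the CPU
number at an agreed offset in the per-cpu area and read it with a single
instruction. Hypothetical x86-64 sketch, with offset 0 assumed:)

static inline int fast_getcpu(void)
{
        int cpu;

        /* Assumes the per-cpu area stores its own CPU id at %gs:0. */
        asm volatile("movl %%gs:0, %0" : "=r" (cpu));
        return cpu;
}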
For the record, Paul and I currently think the best solution is percpu
segments + some variant of a restart-sequence API (we'll have a patch
soon.)
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Mathieu Desnoyers @ 2015-07-13 16:07 UTC (permalink / raw)
To: Andrew Hunter
Cc: Andy Lutomirski, Paul E. McKenney, Ben Maurer, Ingo Molnar,
Andrew Morton, Josh Triplett, Lai Jiangshan, Paul Turner, rostedt,
linux-api, Linus Torvalds, Peter Zijlstra
----- On Jul 13, 2015, at 11:30 AM, Andrew Hunter ahh-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org wrote:
> On Mon, Jul 13, 2015 at 8:27 AM, Mathieu Desnoyers
> <mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
>> percpu segments will likely not solve everything. I have a use-case
>> with dynamically allocated per-cpu ring buffer in user-space (lttng-ust)
>> which can be a challenge for percpu segments. Having a fast getcpu()
>> is a win in those cases.
>>
>
> Note that percpu segments allow userspace to trivially implement fast getcpu.
>
> For the record, Paul and I currently think the best solution is percpu
> segments + some variant of a restart-sequence API (we'll have a patch
> soon.)
Although useful in many situations, percpu segments still have some
limitations AFAIU:
- They are not available on all architectures (very x86-specific),
- Some user-space applications already use those segments.
So as long as we only target user-space code that does not use GS,
and which runs on x86, percpu segments seem to be a good idea.
However, implementing a more general approach for a fast getcpu cache
still appears somewhat useful for the general case.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Mathieu Desnoyers @ 2015-07-13 17:36 UTC (permalink / raw)
To: Ben Maurer
Cc: Paul Turner, Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Linus Torvalds,
Andrew Morton, linux-api, libc-alpha-9JcytcrH/bA+uJoB2kUjGw
----- On Jul 13, 2015, at 7:17 AM, Ben Maurer bmaurer-b10kYP2dOMg@public.gmane.org wrote:
> At Facebook we already use getcpu in folly, our base C++ library, to provide
> high performance concurrency algorithms. Folly includes an abstraction called
> AccessSpreader which helps engineers write abstractions which shard themselves
> across different cores to prevent cache contention
> (https://github.com/facebook/folly/blob/master/folly/detail/CacheLocality.cpp).
> We have used this primative to create faster reader writer locks
> (https://github.com/facebook/folly/blob/master/folly/SharedMutex.h), as well as
> in an abstraction that powers workqueues
> (https://github.com/facebook/folly/blob/master/folly/IndexedMemPool.h). This
> would be a great perf improvement for these types of abstractions and probably
> encourage us to use the idea more widely.
>
> One quick comment on the approach -- it'd be really great if we had a method
> that didn't require users to register each thread. This can often lead to
> requiring an additional branch in critical code to check if the appropriate
> caches have been initialized. Also, one of the most interesting potential
> applications of the restartable sequences concept is in malloc. having a brief
> period at the beginning of the life of a thread where malloc didn't work would
> be pretty tricky to program around.
If we invoke this per-thread registration directly in the glibc NPTL implementation,
in start_thread, do you think it would fit your requirements?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Andy Lutomirski @ 2015-07-13 18:36 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Ben Maurer, Paul E. McKenney, Ingo Molnar, linux-api,
Andrew Morton, Josh Triplett, Lai Jiangshan, rostedt, Paul Turner,
Andrew Hunter, Peter Zijlstra, Linus Torvalds
On Jul 13, 2015 9:27 AM, "Mathieu Desnoyers"
<mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
>
> ----- On Jul 12, 2015, at 11:38 PM, Andy Lutomirski luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org wrote:
>
> > On Jul 12, 2015 12:06 PM, "Mathieu Desnoyers"
> > <mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
> >>
> >> Expose a new system call allowing threads to register a userspace memory
> >> area in which to store the current CPU number. Scheduler migration sets the
> >> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
> >> a notify-resume handler updates the current CPU value within that
> >> user-space memory area.
> >>
> >> This getcpu cache is an alternative to the sched_getcpu() vDSO, with
> >> a few benefits:
> >> - It is faster to do a memory read than to call a vDSO,
> >> - This cached value can be read from within inline assembly, which
> >> makes it a useful building block for restartable sequences.
> >>
> >
> > Let's wait and see what the final percpu atomic solution is. If it
> > involves percpu segments, then this is unnecessary.
>
> percpu segments will likely not solve everything. I have a use-case
> with dynamically allocated per-cpu ring buffer in user-space (lttng-ust)
> which can be a challenge for percpu segments. Having a fast getcpu()
> is a win in those cases.
>
Even so, percpu segments will give you fast getcpu without introducing
a new scheduler hook.
> >
> > Also, this will need to be rebased onto -tip, and that should wait
> > until the big exit rewrite is farther along.
>
> I don't really care which thread flag it ends up using, and this is
> more or less an internal implementation detail. The important part is
> the ABI exposed to user-space, and it's good to start the discussion
> on this aspect early.
>
Agreed.
> >
> >> This approach is inspired by Paul Turner and Andrew Hunter's work
> >> on percpu atomics, which lets the kernel handle restart of critical
> >> sections:
> >> Ref.:
> >> * https://lkml.org/lkml/2015/6/24/665
> >> * https://lwn.net/Articles/650333/
> >> *
> >> http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
> >>
> >> Benchmarking sched_getcpu() vs tls cache approach. Getting the
> >> current CPU number:
> >>
> >> - With Linux vdso: 12.7 ns
> >
> > This is a bit unfair, because the glibc wrapper sucks and the
> > __vdso_getcpu interface is overcomplicated. We can fix it with a
> > better API. It won't make it *that* much faster, though.
>
> Even if we improve the vDSO function, we are at a point where just
> the function call is not that cheap.
>
True, and the LSL isn't likely to go away. The branches can go, though.
--Andy
* RE: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Ben Maurer @ 2015-07-14 9:34 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Paul Turner, Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Linus Torvalds,
Andrew Morton, linux-api, libc-alpha@sourceware.org
Mathieu Desnoyers wrote:
> If we invoke this per-thread registration directly in the glibc NPTL implementation,
> in start_thread, do you think it would fit your requirements?
I guess this would basically be transparent to the user -- we'd just need to make sure that the registration happens very early, before any chance of calling malloc.
That said, having the ability for the kernel to understand that TLS
implementations are laid out using the same offset on each thread seems like
something that could be valuable long term. Doing so makes it possible to
build other TLS-based features without forcing each thread to be registered.
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Mathieu Desnoyers @ 2015-07-16 18:08 UTC (permalink / raw)
To: Ben Maurer
Cc: Paul Turner, Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Linus Torvalds,
Andrew Morton, linux-api, libc-alpha
----- On Jul 14, 2015, at 5:34 AM, Ben Maurer bmaurer-b10kYP2dOMg@public.gmane.org wrote:
> Mathieu Desnoyers wrote:
>> If we invoke this per-thread registration directly in the glibc NPTL
>> implementation,
>> in start_thread, do you think it would fit your requirements?
>
> I guess this would basically be transparent to the user -- we'd just need to
> make sure that the registration happens very early, before any chance of
> calling malloc.
Yes, this is my thinking too.
>
> That said, having the ability for the kernel to understand that TLS
>> implementations are laid out using the same offset on each thread seems like
> something that could be valuable long term. Doing so makes it possible to build
> other TLS-based features without forcing each thread to be registered.
AFAIU, using a fixed hardcoded ABI between kernel and user-space might make
transition from the pre-existing ABI (where this memory area is not
reserved) a bit tricky without registering the area, or getting a "feature"
flag, through a system call.
The related question then becomes: should we issue this system call once
per process, or once per thread at thread creation ? Issuing it once per
thread is marginally more costly for thread creation, but seems to be
easier to deal with internally within the kernel.
We could however ensure that only a single system call is needed per newly
created thread, rather than one system call per feature. One way to do this would be
to register an area that may contain more than just the CPU id. It could
consist of an expandable structure with fixed offsets. When registered, we
could pass the size of that structure as an argument to the system call, so
the kernel knows which features are expected by user-space.
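
For instance (purely hypothetical layout and syscall name):

struct thread_local_abi {               /* sketch only */
        int32_t cpu_id;
        /* future kernels append new fields here */
};

static __thread struct thread_local_abi tla;

/*
 * Once per thread; the size tells the kernel which features this
 * userspace build knows about.
 */
syscall(__NR_thread_local_abi, &tla, sizeof(tla), 0);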
Thoughts?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
From: Andy Lutomirski @ 2015-07-16 19:27 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Ben Maurer, Paul Turner, Andrew Hunter, Peter Zijlstra,
Ingo Molnar, rostedt, Paul E. McKenney, Josh Triplett,
Lai Jiangshan, Linus Torvalds, Andrew Morton, linux-api,
libc-alpha
On Thu, Jul 16, 2015 at 11:08 AM, Mathieu Desnoyers
<mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
> ----- On Jul 14, 2015, at 5:34 AM, Ben Maurer bmaurer-b10kYP2dOMg@public.gmane.org wrote:
>
>> Mathieu Desnoyers wrote:
>>> If we invoke this per-thread registration directly in the glibc NPTL
>>> implementation,
>>> in start_thread, do you think it would fit your requirements?
>>
>> I guess this would basically be transparent to the user -- we'd just need to
>> make sure that the registration happens very early, before any chance of
>> calling malloc.
>
> Yes, this is my thinking too.
>
>>
>> That said, having the ability for the kernel to understand that TLS
>> implementations are laid out using the same offset on each thread seems like
>> something that could be valuable long term. Doing so makes it possible to build
>> other TLS-based features without forcing each thread to be registered.
>
> AFAIU, using a fixed hardcoded ABI between kernel and user-space might make
> the transition from the pre-existing ABI (where this memory area is not
> reserved) a bit tricky without registering the area, or getting a "feature"
> flag, through a system call.
>
> The related question then becomes: should we issue this system call once
> per process, or once per thread at thread creation ? Issuing it once per
> thread is marginally more costly for thread creation, but seems to be
> easier to deal with internally within the kernel.
>
> We could however ensure that only a single system call is needed per newly created
> thread, rather than one system call per feature. One way to do this would be
> to register an area that may contain more than just the CPU id. It could
> consist of an expandable structure with fixed offsets. When registered, we
> could pass the size of that structure as an argument to the system call, so
> the kernel knows which features are expected by user-space.
If we actually bit the bullet and implemented per-cpu mappings, we
could have this be completely flexible because there would be no
format at all. Similarly, if we implemented per-cpu segments,
userspace would need to agree with *itself* how to arbitrate it, but
the kernel wouldn't need to be involved.
With this kind of memory poking, it's definitely messier, which is unfortunate.
--Andy
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-16 19:27 ` Andy Lutomirski
@ 2015-07-17 10:21 ` Ondřej Bílka
2015-07-17 15:53 ` Andy Lutomirski
[not found] ` <CALCETrWEKE=mow3vVh7C4r8CuGy_d5VOEz7KkpijuR5cpBfFtg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
1 sibling, 1 reply; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-17 10:21 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Mathieu Desnoyers, Ben Maurer, Paul Turner, Andrew Hunter,
Peter Zijlstra, Ingo Molnar, rostedt, Paul E. McKenney,
Josh Triplett, Lai Jiangshan, Linus Torvalds, Andrew Morton,
linux-api, libc-alpha
On Thu, Jul 16, 2015 at 12:27:10PM -0700, Andy Lutomirski wrote:
> On Thu, Jul 16, 2015 at 11:08 AM, Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
> > ----- On Jul 14, 2015, at 5:34 AM, Ben Maurer bmaurer@fb.com wrote:
> >>
> >> That said, having the ability for the kernel to understand that TLS
> >> implementations are laid out using the same offset on each thread seems like
> >> something that could be valuable long term. Doing so makes it possible to build
> >> other TLS-based features without forcing each thread to be registered.
> >
> > AFAIU, using a fixed hardcoded ABI between kernel and user-space might make
> > the transition from the pre-existing ABI (where this memory area is not
> > reserved) a bit tricky without registering the area, or getting a "feature"
> > flag, through a system call.
> >
> > The related question then becomes: should we issue this system call once
> > per process, or once per thread at thread creation ? Issuing it once per
> > thread is marginally more costly for thread creation, but seems to be
> > easier to deal with internally within the kernel.
> >
> > We could however ensure that only a single system call is needed per newly created
> > thread, rather than one system call per feature. One way to do this would be
> > to register an area that may contain more than just the CPU id. It could
> > consist of an expandable structure with fixed offsets. When registered, we
> > could pass the size of that structure as an argument to the system call, so
> > the kernel knows which features are expected by user-space.
>
> If we actually bit the bullet and implemented per-cpu mappings, we
> could have this be completely flexible because there would be no
> format at all. Similarly, if we implemented per-cpu segments,
> userspace would need to agree with *itself* how to arbitrate it, but
> the kernel wouldn't need to be involved.
>
> With this kind of memory poking, it's definitely messier, which is unfortunate.
>
Could you recapitulate the thread? On the libc side we didn't read most of
it, so a summary would be appreciated.
Do per-cpu mappings mean that there is a single virtual page that is
mapped to different physical pages on each CPU?
I had improving TLS access on my todo list. This would help TLS
implementations on older ARMs and, in general, on architectures that don't
store the TCB in a register.
My proposal is, modulo a small constant, equivalent to userspace accessing
the tid without syscall overhead: just use an array of TCBs for the first
32768 tids and do a syscall only when the tid exceeds that.
On the userspace side, my proposal would be to map that to a fixed virtual
address and store the TCB in the first eight bytes. On context switch, the
kernel would save and restore these along with the registers. That would
make TLS access cheap, as it would need only one extra load instruction
compared to a static variable.
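A minimal sketch of that access path (the fixed address and the eight-byte
slot are invented purely for illustration; no such mapping exists today):

#include <stdint.h>

/* Hypothetical fixed virtual address that the kernel would back with a
 * per-thread page and save/restore on context switch; the first eight
 * bytes hold the running thread's TCB pointer. */
#define FIXED_TCB_PAGE ((uintptr_t)0x7fff00000000ULL)

static inline void *current_tcb(void)
{
	/* One plain load instead of a syscall or a dedicated register. */
	return *(void **)FIXED_TCB_PAGE;
}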
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <587954201.31.1436808992876.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
@ 2015-07-17 10:58 ` Ondřej Bílka
2015-07-17 16:03 ` Mathieu Desnoyers
0 siblings, 1 reply; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-17 10:58 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Ben Maurer, Paul Turner, Andrew Hunter, Peter Zijlstra,
Ingo Molnar, rostedt, Paul E. McKenney, Josh Triplett,
Lai Jiangshan, Linus Torvalds, Andrew Morton, linux-api,
libc-alpha-9JcytcrH/bA+uJoB2kUjGw
On Mon, Jul 13, 2015 at 05:36:32PM +0000, Mathieu Desnoyers wrote:
> ----- On Jul 13, 2015, at 7:17 AM, Ben Maurer bmaurer-b10kYP2dOMg@public.gmane.org wrote:
>
> > At Facebook we already use getcpu in folly, our base C++ library, to provide
> > high performance concurrency algorithms. Folly includes an abstraction called
> > AccessSpreader which helps engineers write abstractions which shard themselves
> > across different cores to prevent cache contention
> > (https://github.com/facebook/folly/blob/master/folly/detail/CacheLocality.cpp).
Could you contribute your improvements/tips to libc? If these help C++
mutexes, they would also improve C mutexes.
> > We have used this primitive to create faster reader writer locks
> > (https://github.com/facebook/folly/blob/master/folly/SharedMutex.h), as well as
> > in an abstraction that powers workqueues
> > (https://github.com/facebook/folly/blob/master/folly/IndexedMemPool.h). This
> > would be a great perf improvement for these types of abstractions and probably
> > encourage us to use the idea more widely.
> >
As libc rwlocks are currently slow, it gets a speedup from that. The main
problem with this is that lock elision will give you bigger speedups than
that. Also, from the description you have the wrong rwlock use case: the
main application is avoiding blocking; when two readers hold the lock for a
long time, having one of them wait would be terrible.
> > One quick comment on the approach -- it'd be really great if we had a method
> > that didn't require users to register each thread. This can often lead to
> > requiring an additional branch in critical code to check if the appropriate
> > caches have been initialized. Also, one of the most interesting potential
> > applications of the restartable sequences concept is in malloc. Having a brief
> > period at the beginning of the life of a thread where malloc didn't work would
> > be pretty tricky to program around.
>
> If we invoke this per-thread registration directly in the glibc NPTL implementation,
> in start_thread, do you think it would fit your requirements ?
>
A generic solution would be adding eager initialization of thread_local
variables which would fix more performance problems.
A second would be to write a patch for libc adding a function,
pthread_create_add_hook_np, to register a function that would be run after
each thread creation.
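A rough sketch of that proposed hook (pthread_create_add_hook_np does not
exist in any libc today; its name and signature are invented here):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Proposed, hypothetical API: run fn early in every newly created thread,
 * before any application code gets to run. */
extern int pthread_create_add_hook_np(void (*fn)(void));

static __thread int cached_cpu = -1;

static void init_thread_state(void)
{
	/* E.g. register this thread's getcpu cache with the kernel, or
	 * eagerly initialize thread_local state. */
	cached_cpu = sched_getcpu();
}

/* Registered once per process, early:
 *	pthread_create_add_hook_np(init_thread_state);
 */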
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-17 10:21 ` Ondřej Bílka
@ 2015-07-17 15:53 ` Andy Lutomirski
0 siblings, 0 replies; 47+ messages in thread
From: Andy Lutomirski @ 2015-07-17 15:53 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Mathieu Desnoyers, Ben Maurer, Paul Turner, Andrew Hunter,
Peter Zijlstra, Ingo Molnar, rostedt, Paul E. McKenney,
Josh Triplett, Lai Jiangshan, Linus Torvalds, Andrew Morton,
linux-api, libc-alpha
On Fri, Jul 17, 2015 at 3:21 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Thu, Jul 16, 2015 at 12:27:10PM -0700, Andy Lutomirski wrote:
>> On Thu, Jul 16, 2015 at 11:08 AM, Mathieu Desnoyers
>> <mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org> wrote:
>> > ----- On Jul 14, 2015, at 5:34 AM, Ben Maurer bmaurer-b10kYP2dOMg@public.gmane.org wrote:
>> >>
>> >> That said, having the ability for the kernel to understand that TLS
>> >> implementations are laid out using the same offset on each thread seems like
>> >> something that could be valuable long term. Doing so makes it possible to build
>> >> other TLS-based features without forcing each thread to be registered.
>> >
>> > AFAIU, using a fixed hardcoded ABI between kernel and user-space might make
>> > the transition from the pre-existing ABI (where this memory area is not
>> > reserved) a bit tricky without registering the area, or getting a "feature"
>> > flag, through a system call.
>> >
>> > The related question then becomes: should we issue this system call once
>> > per process, or once per thread at thread creation ? Issuing it once per
>> > thread is marginally more costly for thread creation, but seems to be
>> > easier to deal with internally within the kernel.
>> >
>> > We could however ensure that only a single system call is needed per newly created
>> > thread, rather than one system call per feature. One way to do this would be
>> > to register an area that may contain more than just the CPU id. It could
>> > consist of an expandable structure with fixed offsets. When registered, we
>> > could pass the size of that structure as an argument to the system call, so
>> > the kernel knows which features are expected by user-space.
>>
>> If we actually bit the bullet and implemented per-cpu mappings, we
>> could have this be completely flexible because there would be no
>> format at all. Similarly, if we implemented per-cpu segments,
>> userspace would need to agree with *itself* how to arbitrate it, but
>> the kernel wouldn't need to be involved.
>>
>> With this kind of memory poking, it's definitely messier, which is unfortunate.
>>
> Could you recapitulate the thread? On the libc side we didn't read most of
> it, so a summary would be appreciated.
>
> Do per-cpu mappings mean that there is a single virtual page that is
> mapped to different physical pages on each CPU?
Single virtual page that's mapped to different physical pages on
different cpus. I believe that ARM has some hardware support for
this, but I'm not that familiar with ARM. x86 can fake it (at the
cost of some context switch overhead).
>
> I had improving TLS access on my todo list. This would help TLS
> implementations on older ARMs and, in general, on architectures that don't
> store the TCB in a register.
>
> My proposal is, modulo a small constant, equivalent to userspace accessing
> the tid without syscall overhead: just use an array of TCBs for the first
> 32768 tids and do a syscall only when the tid exceeds that.
>
> On the userspace side, my proposal would be to map that to a fixed virtual
> address and store the TCB in the first eight bytes. On context switch, the
> kernel would save and restore these along with the registers. That would
> make TLS access cheap, as it would need only one extra load instruction
> compared to a static variable.
>
The problem is that having the kernel access userspace memory on
context switch, while doable, is a little bit unpleasant. We also
really need to get the ABI right the first time, because we don't
really get a second chance.
--Andy
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-17 10:58 ` Ondřej Bílka
@ 2015-07-17 16:03 ` Mathieu Desnoyers
[not found] ` <626545401.1010.1437149010438.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Mathieu Desnoyers @ 2015-07-17 16:03 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Ben Maurer, Paul Turner, Andrew Hunter, Peter Zijlstra,
Ingo Molnar, rostedt, Paul E. McKenney, Josh Triplett,
Lai Jiangshan, Linus Torvalds, Andrew Morton, linux-api,
libc-alpha
----- On Jul 17, 2015, at 6:58 AM, Ondřej Bílka neleai@seznam.cz wrote:
[...]
>> If we invoke this per-thread registration directly in the glibc NPTL
>> implementation,
>> in start_thread, do you think it would fit your requirements ?
>>
> A generic solution would be adding eager initialization of thread_local
> variables which would fix more performance problems.
Concretely, what code would be in charge of this initialization ?
What changes does "eager initialization" imply from the current
implementation ?
> A second would be to write a patch for libc adding a function,
> pthread_create_add_hook_np, to register a function that would be run after
> each thread creation.
Do you suggest that this callback should be registered once per thread,
or somehow attached to pthread_attr_t ? As maintainer of lttng-ust,
where we need some control over our own threads, but where we want minimal
impact on the overall application, I would really prefer if we can attach
the callback to pthread_attr_t.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CALCETrWEKE=mow3vVh7C4r8CuGy_d5VOEz7KkpijuR5cpBfFtg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-17 18:48 ` Linus Torvalds
[not found] ` <CA+55aFz-VBnEKh0SPKgu8xV5=Zb+=6odybVUDoOYOknshbcFJA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-17 23:28 ` Ondřej Bílka
0 siblings, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2015-07-17 18:48 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Mathieu Desnoyers, Ben Maurer, Paul Turner, Andrew Hunter,
Peter Zijlstra, Ingo Molnar, rostedt, Paul E. McKenney,
Josh Triplett, Lai Jiangshan, Andrew Morton, linux-api,
libc-alpha
On Thu, Jul 16, 2015 at 12:27 PM, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>
> If we actually bit the bullet and implemented per-cpu mappings
That's not ever going to happen.
Per-cpu page tables are a complete disaster. It's a recipe for crazy
race conditions, when you have CPUs that update things like
dirty/accessed bits atomically etc, and you have fundamental races
when multiple CPUs allocate page tables at the same time (remember:
we have concurrent page faults, and the locking is not per-vm, it's at
a finer granularity).
It's also a big memory management problem when you have lots and lots of CPUs.
So don't go there. The only way to do per-cpu virtual mappings is
hardware-specific: if you have hardware that explicitly allows
inserting percpu TLB entries (while still sharing the page tables),
then that would be ok. And we don't have that on x86. MIPS has
explicit support for these kinds of TLB hacks, and obviously on other
architectures you might be able to play games with the SW-fill TLB,
but on x86 there's no hardware support for per-CPU TLB filling.
And this is not just theory. We've seen what happens when people try
to do per-thread page tables. It's happened several times, and it's a
fundamental mistake. Plan-9 had "private mappings" because that's how
they did stacks (ie the stack mappings were thread-local), and it
means that thread switching is fundamentally broken. I think Mach did
too. And per-cpu page tables are less broken from a scheduling
standpoint than per-thread page tables, but still do share a lot of
the synchronization problems, and have some allocation issues all
their own.
The Linux VM model of "one page table per VM" is the right one.
Anything else sucks, and makes threading a disaster.
So you can try to prove me wrong, but seriously, I doubt you'll succeed.
On x86, if you want per-cpu memory areas, you should basically plan on
using segment registers instead (although other odd state has been
used - there have been people who use segment limits etc rather than
the *pointer* itself, preferring to use "lsl" to get percpu data. You
could also imagine hiding things in the vector state somewhere if you
control your environment well enough).
Linus
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CA+55aFz-VBnEKh0SPKgu8xV5=Zb+=6odybVUDoOYOknshbcFJA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-17 18:55 ` Andy Lutomirski
[not found] ` <CALCETrVNcLpZVATHOs-gZR9AMUSW_ScvXW_0oY=OnFHXXHLdaA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Andy Lutomirski @ 2015-07-17 18:55 UTC (permalink / raw)
To: Linus Torvalds
Cc: Mathieu Desnoyers, Ben Maurer, Paul Turner, Andrew Hunter,
Peter Zijlstra, Ingo Molnar, rostedt, Paul E. McKenney,
Josh Triplett, Lai Jiangshan, Andrew Morton, linux-api,
libc-alpha
On Fri, Jul 17, 2015 at 11:48 AM, Linus Torvalds
<torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> wrote:
> On Thu, Jul 16, 2015 at 12:27 PM, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>>
>> If we actually bit the bullet and implemented per-cpu mappings
>
> That's not ever going to happen.
>
> Per-cpu page tables are a complete disaster. It's a recipe for crazy
> race conditions, when you have CPUs that update things like
> dirty/accessed bits atomically etc, and you have fundamental races
> when multiple CPUs allocate page tables at the same time (remember:
> we have concurrent page faults, and the locking is not per-vm, it's at
> a finer granularity).
>
> It's also a big memory management problem when you have lots and lots of CPUs.
>
> So you can try to prove me wrong, but seriously, I doubt you'll succeed.
I doubt I'll succeed, too. But I don't want anything resembling full
per-cpu page tables -- per-cpu pgds would be plenty. Still kinda
nasty to implement. On the other hand, getting rid of swapgs would be
a nice win.
>
> On x86, if you want per-cpu memory areas, you should basically plan on
> using segment registers instead (although other odd state has been
> used - there have been people who use segment limits etc rather than
> the *pointer* itself, preferring to use "lsl" to get percpu data. You
> could also imagine hiding things in the vector state somewhere if you
> control your environment well enough).
I do think we should implement per-cpu descriptor bases or gs bases,
and we should also implement rd/wrfsgsbase. We should do them
together, give them well-defined semantics, and write tests. The
current "segment register state kinda sorta context switches correctly
as long as no one looks carefully" approach is no good. And once we
do that, we don't need a cached cpu number.
Sigh, if we had clean per-cpu memory mappings and got rid of swapgs,
then implementing the fsgsbase stuff would be so much easier.
(Although -- we could plausibly use r15 or something as our percpu
pointer in the kernel without too much loss, which would also get rid
of the fsgsbase mess. Hmm. It would make paranoid entries much
faster, too.)
Anyway, I do intend to ask Intel for real per-cpu mappings of some
sort if they ever ask my opinion again. Maybe they'll give us
something in a few decades.
--Andy
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CALCETrVNcLpZVATHOs-gZR9AMUSW_ScvXW_0oY=OnFHXXHLdaA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-17 19:11 ` Linus Torvalds
0 siblings, 0 replies; 47+ messages in thread
From: Linus Torvalds @ 2015-07-17 19:11 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Mathieu Desnoyers, Ben Maurer, Paul Turner, Andrew Hunter,
Peter Zijlstra, Ingo Molnar, rostedt, Paul E. McKenney,
Josh Triplett, Lai Jiangshan, Andrew Morton, linux-api,
libc-alpha
On Fri, Jul 17, 2015 at 11:55 AM, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>
> I doubt I'll succeed, too. But I don't want anything resembling full
> per-cpu page tables -- per-cpu pgds would be plenty. Still kinda
> nasty to implement.
Per-cpu pgd's would have been trivial in the old 32-bit PAE
environment. There's only four entries at the top level, and they have
to be allocated at process startup anyway - and we wouldn't even have
to do a per-cpu-and-VM allocation, we'd just have done one single
per-cpu entry, and when switching tasks we'd *copy* the VM entries to
the per-cpu one and re-load %cr3 with the same address. I thought
about it.
But I'm really happy we never went down that road. It's non-portable,
even on x86-32 (because it requires PAE). And even there it would be
limited to "the top 1GB of virtual address space ends up being
per-cpu", and then you have to get the vmalloc space right etc, so you
have that one PGD entry for the kernel mapping that you can make be
percpu and play tricks in. So you'd basically allocate one page per
CPU for the magic upper PGD entry that maps the top 1GB, and edit that
on-the-fly as you do task-switching. Very specialized, and the upside
was very dubious.
And that "simple" trick is not really doable with the x86-64 model any
more (you can't copy 4kB efficiently the way you could copy 32 _bytes_
efficiently). And you really don't want to pre-allocate the whole
top-level PGD either. So all the things that made it "easy" for 32-bit
PAE basically went away with x86-64.
No, I think the only thing that would make it possible is if there is
some architecture extension that replaces part of the page table
mappings with a percpu MSR describing a magic mapping or two. It would
be trivial to do such an addition in hardware (it's not even in the
critical path, it would be just a new magic special case for the TLB
fill code), but without hardware support it's just not a good idea.
(And I'm not claiming that the hw extension for per-cpu mappings would
be a good idea either, although I think it would be an _interesting_
toy to play with ;)
Linus
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <626545401.1010.1437149010438.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
@ 2015-07-17 22:43 ` Ondřej Bílka
2015-07-18 2:43 ` Mathieu Desnoyers
0 siblings, 1 reply; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-17 22:43 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Ben Maurer, Paul Turner, Andrew Hunter, Peter Zijlstra,
Ingo Molnar, rostedt, Paul E. McKenney, Josh Triplett,
Lai Jiangshan, Linus Torvalds, Andrew Morton, linux-api,
libc-alpha
On Fri, Jul 17, 2015 at 04:03:30PM +0000, Mathieu Desnoyers wrote:
> ----- On Jul 17, 2015, at 6:58 AM, Ondřej Bílka neleai@seznam.cz wrote:
> [...]
> >> If we invoke this per-thread registration directly in the glibc NPTL
> >> implementation,
> >> in start_thread, do you think it would fit your requirements ?
> >>
> > A generic solution would be adding eager initialization of thread_local
> > variables which would fix more performance problems.
>
> Concretely, what code would be in charge of this initialization ?
>
> What changes does "eager initialization" imply from the current
> implementation ?
>
Now if you write:

#include <cstdio>

class foo {
public:
	foo() {
		printf("init\n");
	}
	foo(const foo &x) {
		printf("init\n");
	}
	int bar() {
		return 32;
	}
};

thread_local class foo x;
Then the constructor isn't called in all threads, only in the main one, as
it is lazily initialized. You need to call x.bar() in each thread to trigger
the initialization.
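Continuing that example, touching the object from each new thread forces
the per-thread construction (illustrative):

#include <thread>

int main()
{
	x.bar();			// odr-use: prints "init" in main
	std::thread t([] { x.bar(); });	// prints "init" again, in t
	t.join();
}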
> > A second would be to write a patch for libc adding a function,
> > pthread_create_add_hook_np, to register a function that would be run after
> > each thread creation.
>
> Do you suggest that this callback should be registered once per thread,
> or somehow attached to pthread_attr_t ? As maintainer of lttng-ust,
> where we need some control over our own threads, but where we want minimal
> impact on the overall application, I would really prefer if we can attach
> the callback to pthread_attr_t.
>
Could you elaborate? I was suggesting a per-thread callback; what do you
want to do with pthread_attr_t?
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-17 18:48 ` Linus Torvalds
[not found] ` <CA+55aFz-VBnEKh0SPKgu8xV5=Zb+=6odybVUDoOYOknshbcFJA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-17 23:28 ` Ondřej Bílka
2015-07-17 23:33 ` Andy Lutomirski
2015-07-18 7:34 ` Rich Felker
1 sibling, 2 replies; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-17 23:28 UTC (permalink / raw)
To: Linus Torvalds
Cc: Andy Lutomirski, Mathieu Desnoyers, Ben Maurer, Paul Turner,
Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Andrew Morton,
linux-api, libc-alpha
On Fri, Jul 17, 2015 at 11:48:14AM -0700, Linus Torvalds wrote:
> On Thu, Jul 16, 2015 at 12:27 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >
> > If we actually bit the bullet and implemented per-cpu mappings
>
> That's not ever going to happen.
>
> The Linux VM model of "one page table per VM" is the right one.
> Anything else sucks, and makes threading a disaster.
>
> So you can try to prove me wrong, but seriously, I doubt you'll succeed.
>
> On x86, if you want per-cpu memory areas, you should basically plan on
> using segment registers instead (although other odd state has been
> used - there have been people who use segment limits etc rather than
> the *pointer* itself, preferring to use "lsl" to get percpu data. You
> could also imagine hiding things in the vector state somewhere if you
> control your environment well enough).
>
That's correct; the problem is that you need some sort of hack like this on
archs that otherwise would need a syscall to get the tid or access a TLS
variable.
On x64 and archs that have a register for TLS, this could be implemented
relatively easily.
Kernel needs to allocate
int running_cpu_for_tid[32768];
On context switch it atomically writes to this table
running_cpu_for_tid[tid] = cpu;
This table is read-only accessible from userspace as mmaped file.
Then userspace just needs to access it with three indirections like:
__thread int tid;
char caches[CPU_MAX];
#define getcpu_cache caches[tid >= 32768 ? get_cpu() : running_cpu_for_tid[tid]]
With a more complicated kernel interface you could eliminate one
indirection, as we would use a void * array instead, and a thread could do a
syscall to register what values it should use for each thread.
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-17 23:28 ` Ondřej Bílka
@ 2015-07-17 23:33 ` Andy Lutomirski
2015-07-18 10:35 ` Ondřej Bílka
[not found] ` <CALCETrVY=kjeA_4pazy3BL+ekfcV6WHKw8e3z-LBxx_uP1bw2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-18 7:34 ` Rich Felker
1 sibling, 2 replies; 47+ messages in thread
From: Andy Lutomirski @ 2015-07-17 23:33 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Linus Torvalds, Mathieu Desnoyers, Ben Maurer, Paul Turner,
Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Andrew Morton,
linux-api, libc-alpha
On Fri, Jul 17, 2015 at 4:28 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Fri, Jul 17, 2015 at 11:48:14AM -0700, Linus Torvalds wrote:
>>
>> On x86, if you want per-cpu memory areas, you should basically plan on
>> using segment registers instead (although other odd state has been
>> used - there have been people who use segment limits etc rather than
>> the *pointer* itself, preferring to use "lsl" to get percpu data. You
>> could also imagine hiding things in the vector state somewhere if you
>> control your environment well enough).
>>
> That's correct; the problem is that you need some sort of hack like this on
> archs that otherwise would need a syscall to get the tid or access a TLS
> variable.
>
> On x64 and archs that have a register for TLS, this could be implemented
> relatively easily.
>
> Kernel needs to allocate
>
> int running_cpu_for_tid[32768];
>
> On context switch it atomically writes to this table
>
> running_cpu_for_tid[tid] = cpu;
>
> This table is read-only accessible from userspace as mmaped file.
>
> Then userspace just needs to access it with three indirections like:
>
> __thread int tid;
>
> char caches[CPU_MAX];
> #define getcpu_cache caches[tid >= 32768 ? get_cpu() : running_cpu_for_tid[tid]]
>
> With a more complicated kernel interface you could eliminate one
> indirection, as we would use a void * array instead, and a thread could do a
> syscall to register what values it should use for each thread.
Or we implement per-cpu segment registers so you can point gs directly
at percpu data. This is conceptually easy and has no weird ABI
issues. All it needs is an implementation and some good tests.
I think the API should be "set gsbase to x + y*(cpu number)". On
x86_64, userspace just allocates a big swath of virtual space and
populates it as needed.
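A sketch of what the access could then look like (the stride, the struct
layout, and the setup call itself are hypothetical; only the %gs-relative
load is real x86):

#include <stddef.h>
#include <stdint.h>

#define PERCPU_STRIDE 4096	/* the "y" above: one page per CPU */

struct percpu_area {
	int32_t cpu_id;
	/* remaining layout arbitrated by userspace itself */
};

static inline int32_t my_cpu_id(void)
{
	int32_t v;

	/* Once gsbase == x + PERCPU_STRIDE * cpu, a %gs-relative load
	 * always hits the current CPU's area. */
	asm volatile("movl %%gs:%c1, %0"
		     : "=r"(v)
		     : "i"(offsetof(struct percpu_area, cpu_id)));
	return v;
}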
--Andy
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-17 22:43 ` Ondřej Bílka
@ 2015-07-18 2:43 ` Mathieu Desnoyers
0 siblings, 0 replies; 47+ messages in thread
From: Mathieu Desnoyers @ 2015-07-18 2:43 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Ben Maurer, Paul Turner, Andrew Hunter, Peter Zijlstra,
Ingo Molnar, rostedt, Paul E. McKenney, Josh Triplett,
Lai Jiangshan, Linus Torvalds, Andrew Morton, linux-api,
libc-alpha
----- On Jul 17, 2015, at 6:43 PM, Ondřej Bílka neleai@seznam.cz wrote:
> On Fri, Jul 17, 2015 at 04:03:30PM +0000, Mathieu Desnoyers wrote:
>> ----- On Jul 17, 2015, at 6:58 AM, Ondřej Bílka neleai@seznam.cz wrote:
>> [...]
>> >> If we invoke this per-thread registration directly in the glibc NPTL
>> >> implementation,
>> >> in start_thread, do you think it would fit your requirements ?
>> >>
>
[...]
>> > A second would be to write a patch for libc adding a function,
>> > pthread_create_add_hook_np, to register a function that would be run after
>> > each thread creation.
>>
>> Do you suggest that this callback should be registered once per thread,
>> or somehow attached to pthread_attr_t ? As maintainer of lttng-ust,
>> where we need some control over our own threads, but where we want minimal
>> impact on the overall application, I would really prefer if we can attach
>> the callback to pthread_attr_t.
>>
> Could you elaborate? I was suggesting a per-thread callback; what do you
> want to do with pthread_attr_t?
Hrm, nevermind, even in the context of lttng-ust, I'd want the callback to
be called for every thread in the application. Therefore, registering this
callback once per process seems to make sense.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-17 23:28 ` Ondřej Bílka
2015-07-17 23:33 ` Andy Lutomirski
@ 2015-07-18 7:34 ` Rich Felker
[not found] ` <20150718073433.GH1173-C3MtFaGISjmo6RMmaWD+6Sb1p8zYI1N1@public.gmane.org>
1 sibling, 1 reply; 47+ messages in thread
From: Rich Felker @ 2015-07-18 7:34 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Linus Torvalds, Andy Lutomirski, Mathieu Desnoyers, Ben Maurer,
Paul Turner, Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Andrew Morton,
linux-api, libc-alpha
On Sat, Jul 18, 2015 at 01:28:36AM +0200, Ondřej Bílka wrote:
> On Fri, Jul 17, 2015 at 11:48:14AM -0700, Linus Torvalds wrote:
> > On Thu, Jul 16, 2015 at 12:27 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > >
> > > If we actually bit the bullet and implemented per-cpu mappings
> >
> > That's not ever going to happen.
> >
> > The Linux VM model of "one page table per VM" is the right one.
> > Anything else sucks, and makes threading a disaster.
> >
> > So you can try to prove me wrong, but seriously, I doubt you'll succeed.
> >
> > On x86, if you want per-cpu memory areas, you should basically plan on
> > using segment registers instead (although other odd state has been
> > used - there have been people who use segment limits etc rather than
> > the *pointer* itself, preferring to use "lsl" to get percpu data. You
> > could also imagine hiding things in the vector state somewhere if you
> > control your environment well enough).
> >
> That's correct; the problem is that you need some sort of hack like this on
> archs that otherwise would need a syscall to get the tid or access a TLS
> variable.
>
> On x64 and archs that have a register for TLS, this could be implemented
> relatively easily.
>
> Kernel needs to allocate
>
> int running_cpu_for_tid[32768];
This does not scale. You're assuming the default task ("pid") number
limit, but this can be raised up to 512k (beyond that is impossible
because of PI/robust futex ABI).
> On context switch it atomically writes to this table
>
> running_cpu_for_tid[tid] = cpu;
>
> This table is read-only accessible from userspace as mmaped file.
There is a much simpler solution: use a per-cpu (rather than per-task)
page that contains the right value for the cpu. I believe vdso already
does something like this, no?
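In sketch form (entirely hypothetical; the fixed address is invented):

#include <stdint.h>

/* One fixed virtual address, backed by a different physical page on each
 * CPU; the kernel writes each CPU's number into its page once. */
#define PERCPU_CPU_PAGE ((const int32_t *)0x7fff00001000ULL)

static inline int32_t cached_cpu(void)
{
	return *PERCPU_CPU_PAGE;	/* one load: no syscall, no registration */
}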
Rich
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-16 18:08 ` Mathieu Desnoyers
[not found] ` <549319255.383.1437070088597.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
@ 2015-07-18 9:47 ` Florian Weimer
1 sibling, 0 replies; 47+ messages in thread
From: Florian Weimer @ 2015-07-18 9:47 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Ben Maurer, Paul Turner, Andrew Hunter, Peter Zijlstra,
Ingo Molnar, rostedt, Paul E. McKenney, Josh Triplett,
Lai Jiangshan, Linus Torvalds, Andrew Morton, linux-api,
libc-alpha
* Mathieu Desnoyers:
> The related question then becomes: should we issue this system call once
> per process, or once per thread at thread creation ? Issuing it once per
> thread is marginally more costly for thread creation, but seems to be
> easier to deal with internally within the kernel.
I might be missing something, but here is my take:
It has to be a per-thread attribute because glibc uses clone
internally and wants to set the flag, but the same process may have
application or library code which uses clone to create threads, too,
but does not know about this new feature. If it's per-process, the
existing code would fail.
(We may argue indefinitely if this is a supported use case from the
glibc point of view, but I'm pretty sure there is code out there which
does exactly this, even though it's fairly complicated to get right.)
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-17 23:33 ` Andy Lutomirski
@ 2015-07-18 10:35 ` Ondřej Bílka
[not found] ` <CALCETrVY=kjeA_4pazy3BL+ekfcV6WHKw8e3z-LBxx_uP1bw2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
1 sibling, 0 replies; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-18 10:35 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Linus Torvalds, Mathieu Desnoyers, Ben Maurer, Paul Turner,
Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Andrew Morton,
linux-api, libc-alpha
On Fri, Jul 17, 2015 at 04:33:42PM -0700, Andy Lutomirski wrote:
> On Fri, Jul 17, 2015 at 4:28 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > On Fri, Jul 17, 2015 at 11:48:14AM -0700, Linus Torvalds wrote:
> >>
> >> On x86, if you want per-cpu memory areas, you should basically plan on
> >> using segment registers instead (although other odd state has been
> >> used - there have been people who use segment limits etc rather than
> >> the *pointer* itself, preferring to use "lsl" to get percpu data. You
> >> could also imagine hiding things in the vector state somewhere if you
> >> control your environment well enough).
> >>
> > That's correct; the problem is that you need some sort of hack like this on
> > archs that otherwise would need a syscall to get the tid or access a TLS
> > variable.
> >
> > On x64 and archs that have a register for TLS, this could be implemented
> > relatively easily.
> >
> > Kernel needs to allocate
> >
> > int running_cpu_for_tid[32768];
> >
> > On context switch it atomically writes to this table
> >
> > running_cpu_for_tid[tid] = cpu;
> >
> > This table is read-only accessible from userspace as mmaped file.
> >
> > Then userspace just needs to access it with three indirections like:
> >
> > __thread int tid;
> >
> > char caches[CPU_MAX];
> > #define getcpu_cache caches[tid >= 32768 ? get_cpu() : running_cpu_for_tid[tid]]
> >
> > With a more complicated kernel interface you could eliminate one
> > indirection, as we would use a void * array instead, and a thread could do a
> > syscall to register what values it should use for each thread.
>
> Or we implement per-cpu segment registers so you can point gs directly
> at percpu data. This is conceptually easy and has no weird ABI
> issues. All it needs is an implementation and some good tests.
>
That only works if you have a free register on your arch. As for gs, there
was an RFC to teach gcc to use it, which could give a bigger speedup. I
haven't seen how much this could help yet, so I am a bit skeptical.
> I think the API should be "set gsbase to x + y*(cpu number)". On
> x86_64, userspace just allocates a big swath of virtual space and
> populates it as needed.
>
That wouldn't work well if two shared libraries want to use it. You
would need to use something like setting it to 4096*cpu_number or so.
Also, we haven't considered the overhead yet, as this slows everything down
a bit due to slower context switches. So this needs to give a widespread
performance improvement to be worthwhile. What are the use cases that make
it pay for itself?
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <20150718073433.GH1173-C3MtFaGISjmo6RMmaWD+6Sb1p8zYI1N1@public.gmane.org>
@ 2015-07-18 10:51 ` Ondřej Bílka
0 siblings, 0 replies; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-18 10:51 UTC (permalink / raw)
To: Rich Felker
Cc: Linus Torvalds, Andy Lutomirski, Mathieu Desnoyers, Ben Maurer,
Paul Turner, Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Andrew Morton,
linux-api, libc-alpha
On Sat, Jul 18, 2015 at 03:34:33AM -0400, Rich Felker wrote:
> On Sat, Jul 18, 2015 at 01:28:36AM +0200, Ondřej Bílka wrote:
> > On Fri, Jul 17, 2015 at 11:48:14AM -0700, Linus Torvalds wrote:
> > > On Thu, Jul 16, 2015 at 12:27 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > > >
> > > > If we actually bit the bullet and implemented per-cpu mappings
> > >
> > > That's not ever going to happen.
> > >
> > > The Linux VM model of "one page table per VM" is the right one.
> > > Anything else sucks, and makes threading a disaster.
> > >
> > > So you can try to prove me wrong, but seriously, I doubt you'll succeed.
> > >
> > > On x86, if you want per-cpu memory areas, you should basically plan on
> > > using segment registers instead (although other odd state has been
> > > used - there have been people who use segment limits etc rather than
> > > the *pointer* itself, preferring to use "lsl" to get percpu data. You
> > > could also imagine hiding things in the vector state somewhere if you
> > > control your environment well enough).
> > >
> > That's correct; the problem is that you need some sort of hack like this on
> > archs that otherwise would need a syscall to get the tid or access a TLS
> > variable.
> >
> > On x64 and archs that have a register for TLS, this could be implemented
> > relatively easily.
> >
> > Kernel needs to allocate
> >
> > int running_cpu_for_tid[32768];
>
> This does not scale. You're assuming the default task ("pid") number
> limit, but this can be raised up to 512k (beyond that is impossible
> because of PI/robust futex ABI).
>
That doesn't matter much; you would allocate 512k entries instead. As for
scaling, if you have more than 32k threads running simultaneously, you have
different worries than how slow it is to get the cpu id.
> > On context switch it atomically writes to this table
> >
> > running_cpu_for_tid[tid] = cpu;
> >
> > This table is read-only accessible from userspace as mmaped file.
>
> There is a much simpler solution: use a per-cpu (rather than per-task)
> page that contains the right value for the cpu. I believe vdso already
> does something like this, no?
>
That's exactly what I suggested before to improve TLS when you don't have
a register for it.
If that's already done, it's just a matter of ABI: the kernel maps a per-cpu
page to a fixed virtual address and saves/restores the TCB in the same way
as it sets the cpu id. With that ABI, a TLS access would cost just one extra
load compared to a static variable.
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CALCETrVY=kjeA_4pazy3BL+ekfcV6WHKw8e3z-LBxx_uP1bw2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-20 8:35 ` Florian Weimer
[not found] ` <55ACB2DC.5010503-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-07-20 13:18 ` Konstantin Khlebnikov
1 sibling, 1 reply; 47+ messages in thread
From: Florian Weimer @ 2015-07-20 8:35 UTC (permalink / raw)
To: Andy Lutomirski, Ondřej Bílka
Cc: Linus Torvalds, Mathieu Desnoyers, Ben Maurer, Paul Turner,
Andrew Hunter, Peter Zijlstra, Ingo Molnar, rostedt,
Paul E. McKenney, Josh Triplett, Lai Jiangshan, Andrew Morton,
linux-api, libc-alpha
On 07/18/2015 01:33 AM, Andy Lutomirski wrote:
> I think the API should be "set gsbase to x + y*(cpu number)". On
> x86_64, userspace just allocates a big swath of virtual space and
> populates it as needed.
This will break WINE and similar applications which use %gs today.
--
Florian Weimer / Red Hat Product Security
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CALCETrVY=kjeA_4pazy3BL+ekfcV6WHKw8e3z-LBxx_uP1bw2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-20 8:35 ` Florian Weimer
@ 2015-07-20 13:18 ` Konstantin Khlebnikov
1 sibling, 0 replies; 47+ messages in thread
From: Konstantin Khlebnikov @ 2015-07-20 13:18 UTC (permalink / raw)
To: linux-api-u79uwXL29TY76Z2rM5mHXA; +Cc: libc-alpha-9JcytcrH/bA+uJoB2kUjGw
On 18.07.2015 02:33, Andy Lutomirski wrote:
> On Fri, Jul 17, 2015 at 4:28 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> On Fri, Jul 17, 2015 at 11:48:14AM -0700, Linus Torvalds wrote:
>>>
>>> On x86, if you want per-cpu memory areas, you should basically plan on
>>> using segment registers instead (although other odd state has been
>>> used - there have been people who use segment limits etc rather than
>>> the *pointer* itself, preferring to use "lsl" to get percpu data. You
>>> could also imagine hiding things in the vector state somewhere if you
>>> control your environment well enough).
>>>
>> That's correct; the problem is that you need some sort of hack like this on
>> archs that otherwise would need a syscall to get the tid or access a TLS
>> variable.
>>
>> On x64 and archs that have a register for TLS, this could be implemented
>> relatively easily.
>>
>> Kernel needs to allocate
>>
>> int running_cpu_for_tid[32768];
>>
>> On context switch it atomically writes to this table
>>
>> running_cpu_for_tid[tid] = cpu;
>>
>> This table is read-only accessible from userspace as mmaped file.
>>
>> Then userspace just needs to access it with three indirections like:
>>
>> __thread int tid;
>>
>> char caches[CPU_MAX];
>> #define getcpu_cache caches[tid >= 32768 ? get_cpu() : running_cpu_for_tid[tid]]
>>
>> With a more complicated kernel interface you could eliminate one
>> indirection, as we would use a void * array instead, and a thread could do a
>> syscall to register what values it should use for each thread.
>
> Or we implement per-cpu segment registers so you can point gs directly
> at percpu data. This is conceptually easy and has no weird ABI
> issues. All it needs is an implementation and some good tests.
>
> I think the API should be "set gsbase to x + y*(cpu number)". On
> x86_64, userspace just allocates a big swath of virtual space and
> populates it as needed.
I've proposed exactly that design last year:
https://lwn.net/Articles/611946/
> --Andy
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <55ACB2DC.5010503-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-07-20 15:31 ` Andy Lutomirski
[not found] ` <CALCETrV9Vp5UUOb3e_R5tphyE-urBgTwQR2pFWUOOFnHqWXHKQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Andy Lutomirski @ 2015-07-20 15:31 UTC (permalink / raw)
To: Florian Weimer
Cc: Ben Maurer, Ingo Molnar, libc-alpha, linux-api, Andrew Morton,
Ondřej Bílka, rostedt, Linus Torvalds,
Mathieu Desnoyers, Paul E. McKenney, Josh Triplett, Lai Jiangshan,
Paul Turner, Andrew Hunter, Peter Zijlstra
On Jul 20, 2015 1:35 AM, "Florian Weimer" <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>
> On 07/18/2015 01:33 AM, Andy Lutomirski wrote:
>
> > I think the API should be "set gsbase to x + y*(cpu number)". On
> > x86_64, userspace just allocates a big swath of virtual space and
> > populates it as needed.
>
> This will break WINE and similar applications which use %gs today.
Presumably WINE could just opt not to use this facility, just like
WINE will have to opt out of whatever the enterprise people who want
WRGSBASE were thinking of doing with it.
--Andy
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CALCETrV9Vp5UUOb3e_R5tphyE-urBgTwQR2pFWUOOFnHqWXHKQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-20 15:32 ` Florian Weimer
[not found] ` <55AD14A4.6030101-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Florian Weimer @ 2015-07-20 15:32 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Ben Maurer, Ingo Molnar, libc-alpha, linux-api, Andrew Morton,
Ondřej Bílka, rostedt, Linus Torvalds,
Mathieu Desnoyers, Paul E. McKenney, Josh Triplett, Lai Jiangshan,
Paul Turner, Andrew Hunter, Peter Zijlstra
On 07/20/2015 05:31 PM, Andy Lutomirski wrote:
> On Jul 20, 2015 1:35 AM, "Florian Weimer" <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>>
>> On 07/18/2015 01:33 AM, Andy Lutomirski wrote:
>>
>>> I think the API should be "set gsbase to x + y*(cpu number)". On
>>> x86_64, userspace just allocates a big swath of virtual space and
>>> populates it as needed.
>>
>> This will break WINE and similar applications which use %gs today.
>
> Presumably WINE could just opt not to use this facility, just like
> WINE will have to opt out of whatever the enterprise people who want
> WRGSBASE were thinking of doing with it.
How is this possible if it's a process-global attribute and glibc or some
library in the process starts using it?
--
Florian Weimer / Red Hat Product Security
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <55AD14A4.6030101-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-07-20 17:41 ` Andy Lutomirski
[not found] ` <CALCETrUx6wFxmz+9TyW5bNgaMN0q180G8y9YOyq_D41sdhFaRQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
[not found] ` <CA+55aFzMJkzydXb7uVv1iSUnp=539d43ghQaonGdzMoF7QLZBA@mail.gmail.com>
0 siblings, 2 replies; 47+ messages in thread
From: Andy Lutomirski @ 2015-07-20 17:41 UTC (permalink / raw)
To: Florian Weimer
Cc: Ben Maurer, Ingo Molnar, libc-alpha, linux-api, Andrew Morton,
Ondřej Bílka, rostedt, Linus Torvalds,
Mathieu Desnoyers, Paul E. McKenney, Josh Triplett, Lai Jiangshan,
Paul Turner, Andrew Hunter, Peter Zijlstra
On Mon, Jul 20, 2015 at 8:32 AM, Florian Weimer <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On 07/20/2015 05:31 PM, Andy Lutomirski wrote:
>> On Jul 20, 2015 1:35 AM, "Florian Weimer" <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>>>
>>> On 07/18/2015 01:33 AM, Andy Lutomirski wrote:
>>>
>>>> I think the API should be "set gsbase to x + y*(cpu number)". On
>>>> x86_64, userspace just allocates a big swath of virtual space and
>>>> populates it as needed.
>>>
>>> This will break WINE and similar applications which use %gs today.
>>
>> Presumably WINE could just opt not to use this facility, just like
>> WINE will have to opt out of whatever the enterprise people who want
>> WRGSBASE were thinking of doing with it.
>
> How is this possible if it's a process-global attribute and glibc or some
> library in the process starts using it?
>
glibc will have to expose a way to turn it off, I guess. (ELF flag?)
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CALCETrUx6wFxmz+9TyW5bNgaMN0q180G8y9YOyq_D41sdhFaRQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-20 20:07 ` josh-iaAMLnmF4UmaiuxdJuQwMA
2015-07-21 7:55 ` Florian Weimer
0 siblings, 1 reply; 47+ messages in thread
From: josh-iaAMLnmF4UmaiuxdJuQwMA @ 2015-07-20 20:07 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Florian Weimer, Ben Maurer, Ingo Molnar, libc-alpha, linux-api,
Andrew Morton, Ondřej Bílka, rostedt, Linus Torvalds,
Mathieu Desnoyers, Paul E. McKenney, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
On Mon, Jul 20, 2015 at 10:41:09AM -0700, Andy Lutomirski wrote:
> On Mon, Jul 20, 2015 at 8:32 AM, Florian Weimer <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > On 07/20/2015 05:31 PM, Andy Lutomirski wrote:
> >> On Jul 20, 2015 1:35 AM, "Florian Weimer" <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> >>>
> >>> On 07/18/2015 01:33 AM, Andy Lutomirski wrote:
> >>>
> >>>> I think the API should be "set gsbase to x + y*(cpu number)". On
> >>>> x86_64, userspace just allocates a big swath of virtual space and
> >>>> populates it as needed.
> >>>
> >>> This will break WINE and similar applications which use %gs today.
> >>
> >> Presumably WINE could just opt not to use this facility, just like
> >> WINE will have to opt out of whatever the enterprise people who want
> >> WRGSBASE were thinking of doing with it.
> >
> > How is this possible if it's a process-global attribute and glibc or some
> > library in the process starts using it?
> >
>
> glibc will have to expose a way to turn it off, I guess. (ELF flag?)
Or a way to turn it on.
- Josh Triplett
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CA+55aFzMJkzydXb7uVv1iSUnp=539d43ghQaonGdzMoF7QLZBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-20 21:09 ` Andy Lutomirski
2015-07-20 22:39 ` Linus Torvalds
0 siblings, 1 reply; 47+ messages in thread
From: Andy Lutomirski @ 2015-07-20 21:09 UTC (permalink / raw)
To: Linus Torvalds
Cc: Ben Maurer, Ingo Molnar, libc-alpha, Andrew Morton, linux-api,
Ondřej Bílka, rostedt, Mathieu Desnoyers,
Paul E. McKenney, Florian Weimer, Josh Triplett, Lai Jiangshan,
Paul Turner, Andrew Hunter, Peter Zijlstra
On Mon, Jul 20, 2015 at 1:50 PM, Linus Torvalds
<torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> wrote:
>
> On Jul 20, 2015 10:41 AM, "Andy Lutomirski" <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>>
>> glibc will have to expose a way to turn it off, I guess. (ELF flag?)
>
> Ugh. That just sounds nasty.
>
> I'm on mobile, so can't check right now, but don't we already have a per-cpu
> gdt? We could just make a very simple rule:
>
> - create a single gdt entry with a segment that is per-cpu and points to one
> single read-only page in kernel space that contains the virtual address of
> that segment in vmalloc space (and maybe we can have the CPU number there
> somewhere, and extend it to something else later)
Annoying problem one: the segment base field is only 32 bits in the GDT.
>
> - make the rule be that if you hold that segment in %fs or %gs in your user
> space state, it gets cleared when the thread is scheduled out.
That sounds a bit evil, but okay.
>
> What does this get you?
>
> It basically means that:
>
> - user space can just load the segment selector in %gs
>
IIRC this is very expensive -- 40 cycles or so. At this point
userspace might as well just use a real lock cmpxchg.
> - user space can load the virtual address of the segment base into a
> register, and use that to calculate a pointer to regular data structures.
>
> - user space can use that "reverse offset" to access any data it wants, and
> access that data with a gs override.
>
> - if the user space thread is rescheduled, that access will fault with a GP
> fault, because %gs became NUL.
Cute.
>
> So basically you can do any memory access you want, and you'll be guaranteed
> that it will be done "atomically" on the same CPU you did the segment load
> on, or it will fault because you got scheduled away.
>
> And it's very cheap for both kernel and user space. One extra gdt entry (not
> per process or anything like that - it's system global, although different
> cpus all end up with different entries), and for each cpu one virtually
> mapped page. And all user space needs to do is to do a segment load.
>
> No system calls, no nothing.
>
> Would that be useful?
>
Does it solve the Wine problem? If Wine uses gs for something and
calls a function that does this, Wine still goes boom, right?
Could Wine just save and restore gs on calls into and out of Windows
code? That would solve all the problems, right?
--Andy
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-20 21:09 ` Andy Lutomirski
@ 2015-07-20 22:39 ` Linus Torvalds
[not found] ` <CA+55aFwLZLeeN7UN82dyt=emQcNBc8qZPJAw5iqtAbBwFA7FPQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2015-07-20 22:39 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Ben Maurer, Ingo Molnar, libc-alpha, Andrew Morton, linux-api,
Ondřej Bílka, rostedt, Mathieu Desnoyers,
Paul E. McKenney, Florian Weimer, Josh Triplett, Lai Jiangshan,
Paul Turner, Andrew Hunter, Peter Zijlstra
On Mon, Jul 20, 2015 at 2:09 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>
> Annoying problem one: the segment base field is only 32 bits in the GDT.
Ok. So if we go this way, we'd make the rule be something like "the
segment base is the CPU number shifted up by the page size", and then
you'd have to add some magic offset that we'd declare as the "per-cpu
page offset".
>> - user space can just load the segment selector in %gs
>
> IIRC this is very expensive -- 40 cycles or so. At this point
> userspace might as well just use a real lock cmpxchg.
So cmpxchg may be as many cycles, but
(a) you can choose to load the segment just once, and do several
operations with it
(b) often - but admittedly not always - the real cost of a
non-cpu-local lock and cmpxchg tends to be the cacheline ping-pong,
not the CPU cycles.
so I agree, loading a segment isn't free. But it's not *that*
expensive, and you could always decide to keep the segment loaded and
just do
- read segment selector
- if NUL segment, reload it.
although that only works if you own the segment entirely and can keep
it as the percpu segment (ie obviously not the Wine case, for
example).
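That check might look like this (sketch; assumes the clear-on-reschedule
rule described above):

static inline int percpu_gs_valid(void)
{
	unsigned short sel;

	/* The kernel would clear %gs when this thread is scheduled out,
	 * so a NUL selector means "reload before the next %gs access". */
	asm volatile("movw %%gs, %0" : "=r"(sel));
	return sel != 0;
}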
> Does it solve the Wine problem? If Wine uses gs for something and
> calls a function that does this, Wine still goes boom, right?
So the advantage of just making a global segment descriptor available
is that it's not *that* expensive to just save/restore segments. So
either wine could do it, or any library users would do it.
But anyway, I'm not sure this is a good idea. The advantage of it is
that the kernel support really is _very_ minimal.
Linus
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <CA+55aFwLZLeeN7UN82dyt=emQcNBc8qZPJAw5iqtAbBwFA7FPQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-07-21 0:25 ` Mathieu Desnoyers
[not found] ` <2010227315.699.1437438300542.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Mathieu Desnoyers @ 2015-07-21 0:25 UTC (permalink / raw)
To: Linus Torvalds
Cc: Andy Lutomirski, Ben Maurer, Ingo Molnar, libc-alpha,
Andrew Morton, linux-api, Ondřej Bílka, rostedt,
Paul E. McKenney, Florian Weimer, Josh Triplett, Lai Jiangshan,
Paul Turner, Andrew Hunter, Peter Zijlstra
----- On Jul 20, 2015, at 6:39 PM, Linus Torvalds torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org wrote:
> On Mon, Jul 20, 2015 at 2:09 PM, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>>
>> Annoying problem one: the segment base field is only 32 bits in the GDT.
>
> Ok. So if we go this way, we'd make the rule be something like "the
> segment base is the CPU number shifted up by the page size", and then
> you'd have to add some magic offset that we'd declare as the "per-cpu
> page offset".
>
>>> - user space can just load the segment selector in %gs
>>
>> IIRC this is very expensive -- 40 cycles or so. At this point
>> userspace might as well just use a real lock cmpxchg.
>
> So cmpxchg may be as many cycles, but
>
> (a) you can choose to load the segment just once, and do several
> operations with it
>
> (b) often - but admittedly not always - the real cost of a
> non-cpu-local lock and cmpxchg tends to be the cacheline ping-pong,
> not the CPU cycles.
>
> so I agree, loading a segment isn't free. But it's not *that*
> expensive, and you could always decide to keep the segment loaded and
> just do
>
> - read segment selector
> - if NUL segment, reload it.
>
> although that only works if you own the segment entirely and can keep
> it as the percpu segment (ie obviously not the Wine case, for
> example).
>
>> Does it solve the Wine problem? If Wine uses gs for something and
>> calls a function that does this, Wine still goes boom, right?
>
> So the advantage of just making a global segment descriptor available
> is that it's not *that* expensive to just save/restore segments. So
> either wine could do it, or any library users would do it.
>
> But anyway, I'm not sure this is a good idea. The advantage of it is
> that the kernel support really is _very_ minimal.
Considering that we'd at least also want this feature on ARM and
PowerPC 32/64, and that the gs segment selector approach clashes with
existing apps (wine), I'm not sure that implementing a gs segment
selector based approach to cpu number caching would lead to an overall
decrease in complexity if it leads to performance similar to that of
portable approaches.
I'm perfectly fine with architecture-specific tweaks that lead to
fast-path speedups, but if we have to bite the bullet and implement
an approach based on TLS and registering a memory area at thread start
through a system call on other architectures anyway, it might end up
being less complex to add a new system call on x86 too, especially if
fast path overhead is similar.
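As a rough sketch, the registration-based approach described here could
look like this from user space (the syscall number and the exact
signature are assumptions for illustration, not the RFC's final
interface):

#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#define __NR_getcpu_cache 325	/* assumed number, for illustration */

static __thread volatile int32_t cpu_cache = -1;

/* invoked once at thread start; the kernel then keeps cpu_cache
 * current on every return to user space */
static int getcpu_cache_register(void)
{
	return syscall(__NR_getcpu_cache, &cpu_cache);
}

static inline int32_t read_cpu(void)
{
	return cpu_cache;	/* fast path: a single TLS load */
}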
But I'm inclined to think that some aspect of the question eludes me,
especially given the amount of interest generated by the gs-segment
selector approach. What am I missing?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <2010227315.699.1437438300542.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
@ 2015-07-21 7:30 ` Ondřej Bílka
2015-07-21 12:58 ` Mathieu Desnoyers
2015-07-21 8:01 ` Florian Weimer
1 sibling, 1 reply; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-21 7:30 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Linus Torvalds, Andy Lutomirski, Ben Maurer, Ingo Molnar,
libc-alpha, Andrew Morton, linux-api, rostedt, Paul E. McKenney,
Florian Weimer, Josh Triplett, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
On Tue, Jul 21, 2015 at 12:25:00AM +0000, Mathieu Desnoyers wrote:
> Considering that we'd at least also want this feature on ARM and
> PowerPC 32/64, and that the gs segment selector approach clashes with
> existing apps (wine), I'm not sure that implementing a gs segment
> selector based approach to cpu number caching would lead to an overall
> decrease in complexity if it leads to performance similar to that of
> portable approaches.
>
> I'm perfectly fine with architecture-specific tweaks that lead to
> fast-path speedups, but if we have to bite the bullet and implement
> an approach based on TLS and registering a memory area at thread start
> through a system call on other architectures anyway, it might end up
> being less complex to add a new system call on x86 too, especially if
> fast path overhead is similar.
>
> But I'm inclined to think that some aspect of the question eludes me,
> especially given the amount of interest generated by the gs-segment
> selector approach. What am I missing?
>
As I wrote before, you don't have to bite that bullet. It suffices to
create a 128k-element array holding the current cpu for each tid, expose
it as an mmapable file, and userspace could get its cpu with nearly the
same performance, without hacks.
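A sketch of the consumer side of that idea follows; the file name and
the table layout are assumptions, since no such kernel interface
exists:

#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MAX_TID (128 * 1024)	/* the 128k-element array */

static volatile int32_t *cpu_tid_table;

static int map_cpu_table(void)
{
	int fd = open("/proc/cpu_tid_table", O_RDONLY);	/* assumed path */

	if (fd < 0)
		return -1;
	cpu_tid_table = mmap(NULL, MAX_TID * sizeof(int32_t),
			     PROT_READ, MAP_SHARED, fd, 0);
	close(fd);
	return cpu_tid_table == MAP_FAILED ? -1 : 0;
}

static inline int32_t my_cpu(void)
{
	return cpu_tid_table[syscall(SYS_gettid)];
}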
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-20 20:07 ` josh-iaAMLnmF4UmaiuxdJuQwMA
@ 2015-07-21 7:55 ` Florian Weimer
0 siblings, 0 replies; 47+ messages in thread
From: Florian Weimer @ 2015-07-21 7:55 UTC (permalink / raw)
To: josh-iaAMLnmF4UmaiuxdJuQwMA, Andy Lutomirski
Cc: Ben Maurer, Ingo Molnar, libc-alpha, linux-api, Andrew Morton,
Ondřej Bílka, rostedt, Linus Torvalds,
Mathieu Desnoyers, Paul E. McKenney, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
On 07/20/2015 10:07 PM, josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org wrote:
> On Mon, Jul 20, 2015 at 10:41:09AM -0700, Andy Lutomirski wrote:
>> On Mon, Jul 20, 2015 at 8:32 AM, Florian Weimer <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>>> On 07/20/2015 05:31 PM, Andy Lutomirski wrote:
>>>> On Jul 20, 2015 1:35 AM, "Florian Weimer" <fweimer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>>>>>
>>>>> On 07/18/2015 01:33 AM, Andy Lutomirski wrote:
>>>>>
>>>>>> I think the API should be "set gsbase to x + y*(cpu number)". On
>>>>>> x86_64, userspace just allocates a big swath of virtual space and
>>>>>> populates it as needed.
>>>>>
>>>>> This will break WINE and similar applications which use %gs today.
>>>>
>>>> Presumably WINE could just opt not to use this facility, just like
>>>> WINE will have to opt out of whatever the enterprise people who want
>>>> WRGSBASE were thinking of doing with it.
>>>
>>> How is this possible if it's process-global attribute and glibc or some
>>> library in the process starts using it?
>>>
>>
>> glibc will have to expose a way to turn it off, I guess. (ELF flag?)
>
> Or a way to turn it on.
How is this supposed to work? Who should turn it on?
It totally breaks encapsulation. We don't need any additional problems
like that.
--
Florian Weimer / Red Hat Product Security
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <2010227315.699.1437438300542.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-21 7:30 ` Ondřej Bílka
@ 2015-07-21 8:01 ` Florian Weimer
1 sibling, 0 replies; 47+ messages in thread
From: Florian Weimer @ 2015-07-21 8:01 UTC (permalink / raw)
To: Mathieu Desnoyers, Linus Torvalds
Cc: Andy Lutomirski, Ben Maurer, Ingo Molnar, libc-alpha,
Andrew Morton, linux-api, Ondřej Bílka, rostedt,
Paul E. McKenney, Josh Triplett, Paul Turner, Andrew Hunter,
Peter Zijlstra
On 07/21/2015 02:25 AM, Mathieu Desnoyers wrote:
> But I'm inclined to think that some aspect of the question eludes me,
> especially given the amount of interest generated by the gs-segment
> selector approach. What am I missing?
%gs is not explicitly mentioned in the x86_64 psABI. This probably led
to the assumption that it's unused. I think that's not the right
conclusion to draw.
--
Florian Weimer / Red Hat Product Security
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-21 7:30 ` Ondřej Bílka
@ 2015-07-21 12:58 ` Mathieu Desnoyers
[not found] ` <894137397.137.1437483493715.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Mathieu Desnoyers @ 2015-07-21 12:58 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Linus Torvalds, Andy Lutomirski, Ben Maurer, Ingo Molnar,
libc-alpha, Andrew Morton, linux-api, rostedt, Paul E. McKenney,
Florian Weimer, Josh Triplett, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
----- On Jul 21, 2015, at 3:30 AM, Ondřej Bílka neleai@seznam.cz wrote:
> As I wrote before, you don't have to bite that bullet. It suffices to
> create a 128k-element array holding the current cpu for each tid, expose
> it as an mmapable file, and userspace could get its cpu with nearly the
> same performance, without hacks.
I don't see how this would be acceptable on memory-constrained embedded
systems. They have multiple cores, and performance requirements, so
having a fast getcpu would be useful there (e.g. telecom industry),
but they clearly cannot afford a 512kB table (128k entries × 4 bytes)
per process just for that.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <894137397.137.1437483493715.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
@ 2015-07-21 15:16 ` Ondřej Bílka
2015-07-21 17:45 ` Mathieu Desnoyers
0 siblings, 1 reply; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-21 15:16 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Linus Torvalds, Andy Lutomirski, Ben Maurer, Ingo Molnar,
libc-alpha, Andrew Morton, linux-api, rostedt, Paul E. McKenney,
Florian Weimer, Josh Triplett, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
On Tue, Jul 21, 2015 at 12:58:13PM +0000, Mathieu Desnoyers wrote:
> I don't see how this would be acceptable on memory-constrained embedded
> systems. They have multiple cores, and performance requirements, so
> having a fast getcpu would be useful there (e.g. telecom industry),
> but they clearly cannot afford a 512kB table (128k entries × 4 bytes)
> per process just for that.
>
Which just means that you need a more complicated api and implementation,
but the idea stays the same. You would need register/deregister_cpuid_idx
syscalls that give you an index to use instead of the tid. The kernel
would need to handle that many ids could be registered for each thread,
and resize the mmaped file in those syscalls.
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-21 15:16 ` Ondřej Bílka
@ 2015-07-21 17:45 ` Mathieu Desnoyers
[not found] ` <1350114812.1035.1437500726799.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Mathieu Desnoyers @ 2015-07-21 17:45 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Linus Torvalds, Andy Lutomirski, Ben Maurer, Ingo Molnar,
libc-alpha, Andrew Morton, linux-api, rostedt, Paul E. McKenney,
Florian Weimer, Josh Triplett, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
----- On Jul 21, 2015, at 11:16 AM, Ondřej Bílka neleai@seznam.cz wrote:
> Which just means that you need a more complicated api and implementation,
> but the idea stays the same. You would need register/deregister_cpuid_idx
> syscalls that give you an index to use instead of the tid. The kernel
> would need to handle that many ids could be registered for each thread,
> and resize the mmaped file in those syscalls.
I feel we're talking past each other here. What I propose is to implement
a system call that registers a TLS area. It can be invoked at thread start.
The kernel can then keep the current CPU number within that registered
area up-to-date. This system call does not care how the TLS is implemented
underneath.
My understanding is that you are suggesting a way to speed up TLS accesses
by creating a table indexed by TID. Although it might lead to interesting
speed-ups when reading the TLS, I don't see how your proposal is
useful in addressing the problem of caching the current CPU number (other
than possibly speeding up TLS accesses).
Or am I missing something fundamental to your proposal?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <1350114812.1035.1437500726799.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
@ 2015-07-21 18:00 ` Ondřej Bílka
2015-07-21 18:18 ` Mathieu Desnoyers
0 siblings, 1 reply; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-21 18:00 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Linus Torvalds, Andy Lutomirski, Ben Maurer, Ingo Molnar,
libc-alpha, Andrew Morton, linux-api, rostedt, Paul E. McKenney,
Florian Weimer, Josh Triplett, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
On Tue, Jul 21, 2015 at 05:45:26PM +0000, Mathieu Desnoyers wrote:
> I feel we're talking past each other here. What I propose is to implement
> a system call that registers a TLS area. It can be invoked at thread start.
> The kernel can then keep the current CPU number within that registered
> area up-to-date. This system call does not care how the TLS is implemented
> underneath.
>
> My understanding is that you are suggesting a way to speed up TLS accesses
> by creating a table indexed by TID. Although it might lead to interesting
> speed-ups when reading the TLS, I don't see how your proposal is
> useful in addressing the problem of caching the current CPU number (other
> than possibly speeding up TLS accesses).
>
> Or am I missing something fundamental to your proposal?
>
No, I am still talking about getting the cpu number. My first proposal
is that the kernel allocates a table of current cpu numbers indexed by
tid. A process could mmap that and get its cpu with cpu_tid_table[tid].
As you said the size is a problem, I replied that you need to be more
careful: instead of the tid you would use a different id that you get
with, say, register_cpucache, store it in a tls variable, and get the
cpu with cpu_cid_table[cid]. That shrinks the space used to only the
threads that actually use this.
The tls speedup was a side remark: if you implemented a per-cpu page,
you could also speed up tls. Tls access speed and getting the tid are
equivalent, as you could easily implement one with the other.
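Put together as a sketch, using the names proposed above
(register_cpucache and cpu_cid_table do not exist; the declarations
here are hypothetical):

#include <stdint.h>

extern int register_cpucache(void);	/* hands out a compact id */
extern volatile int32_t *cpu_cid_table;	/* mmaped, resized by the kernel */

static __thread int my_cid = -1;

static inline int32_t my_cpu(void)
{
	if (my_cid < 0)
		my_cid = register_cpucache();	/* once per thread */
	/* fast path: one TLS load plus one shared-table load */
	return cpu_cid_table[my_cid];
}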
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
2015-07-21 18:00 ` Ondřej Bílka
@ 2015-07-21 18:18 ` Mathieu Desnoyers
[not found] ` <2028561497.1088.1437502683664.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 47+ messages in thread
From: Mathieu Desnoyers @ 2015-07-21 18:18 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Linus Torvalds, Andy Lutomirski, Ben Maurer, Ingo Molnar,
libc-alpha, Andrew Morton, linux-api, rostedt, Paul E. McKenney,
Florian Weimer, Josh Triplett, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
----- On Jul 21, 2015, at 2:00 PM, Ondřej Bílka neleai@seznam.cz wrote:
> No, I am still talking about getting the cpu number. My first proposal
> is that the kernel allocates a table of current cpu numbers indexed by
> tid. A process could mmap that and get its cpu with cpu_tid_table[tid].
> As you said the size is a problem, I replied that you need to be more
> careful: instead of the tid you would use a different id that you get
> with, say, register_cpucache, store it in a tls variable, and get the
> cpu with cpu_cid_table[cid]. That shrinks the space used to only the
> threads that actually use this.
>
> The tls speedup was a side remark: if you implemented a per-cpu page,
> you could also speed up tls. Tls access speed and getting the tid are
> equivalent, as you could easily implement one with the other.
Thanks for the clarification. There is then a fundamental question
I need to ask: what is the upside of going for a dedicated array of
current cpu number values rather than using a TLS variable?
The main downside I see with the array of cpu numbers is false sharing
caused by having many current cpu number variables sitting on the same
cache line. It seems like an overall performance loss there.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* Re: [RFC PATCH] getcpu_cache system call: caching current CPU number (x86)
[not found] ` <2028561497.1088.1437502683664.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
@ 2015-07-22 7:53 ` Ondřej Bílka
0 siblings, 0 replies; 47+ messages in thread
From: Ondřej Bílka @ 2015-07-22 7:53 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Linus Torvalds, Andy Lutomirski, Ben Maurer, Ingo Molnar,
libc-alpha, Andrew Morton, linux-api, rostedt, Paul E. McKenney,
Florian Weimer, Josh Triplett, Lai Jiangshan, Paul Turner,
Andrew Hunter, Peter Zijlstra
On Tue, Jul 21, 2015 at 06:18:03PM +0000, Mathieu Desnoyers wrote:
> Thanks for the clarification. There is then a fundamental question
> I need to ask: what is the upside of going for a dedicated array of
> current cpu number values rather than using a TLS variable?
> The main downside I see with the array of cpu numbers is false sharing
> caused by having many current cpu number variables sitting on the same
> cache line. It seems like an overall performance loss there.
>
It's considerably simpler to implement, as you don't need to mark tls
pages to avoid a page fault in the context switch, and you avoid
security issues where an attacker could try to unmap the tls for a
possible privilege escalation if the kernel would write into a
different process, etc.
And as for sharing, it simply doesn't matter. The table is mostly
read-only (each entry is written only on a context switch), so the
lines will stay resident in cache. Also, when you migrate to another
cpu you get the same cache miss from a tls variable, so it's the same.
Thread overview: 47+ messages
2015-07-12 18:06 [RFC PATCH] getcpu_cache system call: caching current CPU number (x86) Mathieu Desnoyers
[not found] ` <1436724386-30909-1-git-send-email-mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-12 18:47 ` Josh Triplett
2015-07-13 3:40 ` Andy Lutomirski
2015-07-13 15:09 ` Mathieu Desnoyers
2015-07-13 3:38 ` Andy Lutomirski
[not found] ` <CALCETrV1suAbvMgD1jOEFyn3JcDE_hhi6X7+sGs9e3Oqw_6jUw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-13 15:27 ` Mathieu Desnoyers
[not found] ` <1050138282.1065.1436801252018.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-13 15:30 ` Andrew Hunter
[not found] ` <CADroS=7MnUULrjDeQtmscxjkpjCtti9V-HfFXU0sjKhi6PsaAg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-13 16:07 ` Mathieu Desnoyers
2015-07-13 18:36 ` Andy Lutomirski
2015-07-13 11:17 ` Ben Maurer
[not found] ` <5CDDBDF2D36D9F43B9F5E99003F6A0D48D5F39C6-f8hGUhss0nh9TZdEUguypQ2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
2015-07-13 17:36 ` Mathieu Desnoyers
2015-07-14 9:34 ` Ben Maurer
[not found] ` <5CDDBDF2D36D9F43B9F5E99003F6A0D48D5F5DA0-f8hGUhss0nh9TZdEUguypQ2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
2015-07-16 18:08 ` Mathieu Desnoyers
[not found] ` <549319255.383.1437070088597.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-16 19:27 ` Andy Lutomirski
2015-07-17 10:21 ` Ondřej Bílka
2015-07-17 15:53 ` Andy Lutomirski
[not found] ` <CALCETrWEKE=mow3vVh7C4r8CuGy_d5VOEz7KkpijuR5cpBfFtg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-17 18:48 ` Linus Torvalds
[not found] ` <CA+55aFz-VBnEKh0SPKgu8xV5=Zb+=6odybVUDoOYOknshbcFJA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-17 18:55 ` Andy Lutomirski
[not found] ` <CALCETrVNcLpZVATHOs-gZR9AMUSW_ScvXW_0oY=OnFHXXHLdaA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-17 19:11 ` Linus Torvalds
2015-07-17 23:28 ` Ondřej Bílka
2015-07-17 23:33 ` Andy Lutomirski
2015-07-18 10:35 ` Ondřej Bílka
[not found] ` <CALCETrVY=kjeA_4pazy3BL+ekfcV6WHKw8e3z-LBxx_uP1bw2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-20 8:35 ` Florian Weimer
[not found] ` <55ACB2DC.5010503-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-07-20 15:31 ` Andy Lutomirski
[not found] ` <CALCETrV9Vp5UUOb3e_R5tphyE-urBgTwQR2pFWUOOFnHqWXHKQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-20 15:32 ` Florian Weimer
[not found] ` <55AD14A4.6030101-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-07-20 17:41 ` Andy Lutomirski
[not found] ` <CALCETrUx6wFxmz+9TyW5bNgaMN0q180G8y9YOyq_D41sdhFaRQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-20 20:07 ` josh-iaAMLnmF4UmaiuxdJuQwMA
2015-07-21 7:55 ` Florian Weimer
[not found] ` <CA+55aFzMJkzydXb7uVv1iSUnp=539d43ghQaonGdzMoF7QLZBA@mail.gmail.com>
[not found] ` <CA+55aFzMJkzydXb7uVv1iSUnp=539d43ghQaonGdzMoF7QLZBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-20 21:09 ` Andy Lutomirski
2015-07-20 22:39 ` Linus Torvalds
[not found] ` <CA+55aFwLZLeeN7UN82dyt=emQcNBc8qZPJAw5iqtAbBwFA7FPQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-07-21 0:25 ` Mathieu Desnoyers
[not found] ` <2010227315.699.1437438300542.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-21 7:30 ` Ondřej Bílka
2015-07-21 12:58 ` Mathieu Desnoyers
[not found] ` <894137397.137.1437483493715.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-21 15:16 ` Ondřej Bílka
2015-07-21 17:45 ` Mathieu Desnoyers
[not found] ` <1350114812.1035.1437500726799.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-21 18:00 ` Ondřej Bílka
2015-07-21 18:18 ` Mathieu Desnoyers
[not found] ` <2028561497.1088.1437502683664.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-22 7:53 ` Ondřej Bílka
2015-07-21 8:01 ` Florian Weimer
2015-07-20 13:18 ` Konstantin Khlebnikov
2015-07-18 7:34 ` Rich Felker
[not found] ` <20150718073433.GH1173-C3MtFaGISjmo6RMmaWD+6Sb1p8zYI1N1@public.gmane.org>
2015-07-18 10:51 ` Ondřej Bílka
2015-07-18 9:47 ` Florian Weimer
[not found] ` <587954201.31.1436808992876.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-17 10:58 ` Ondřej Bílka
2015-07-17 16:03 ` Mathieu Desnoyers
[not found] ` <626545401.1010.1437149010438.JavaMail.zimbra-vg+e7yoeK/dWk0Htik3J/w@public.gmane.org>
2015-07-17 22:43 ` Ondřej Bílka
2015-07-18 2:43 ` Mathieu Desnoyers