* [PATCH v3 1/6] x86-64: Clean up vdso/kernel shared variables
2011-05-10 14:15 [PATCH v3 0/6] Micro-optimize vclock_gettime Andy Lutomirski
@ 2011-05-10 14:15 ` Andy Lutomirski
2011-05-10 14:48 ` Borislav Petkov
2011-05-10 14:15 ` [PATCH v3 2/6] x86-64: Remove unnecessary barrier in vread_tsc Andy Lutomirski
` (4 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Andy Lutomirski @ 2011-05-10 14:15 UTC (permalink / raw)
To: x86
Cc: linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner, Andy Lutomirski
Variables that are shared between the vdso and the kernel are
currently a bit of a mess. They are each defined with their own
magic, they are accessed differently in the kernel, the vsyscall page,
and the vdso, and one of them (vsyscall_clock) doesn't even really
exist.
This changes them all to use a common mechanism. All of them are
declared in vvar.h with a fixed address (validated by the linker
script). In the kernel (as before), they look like ordinary
read-write variables. In the vsyscall page and the vdso, they are
accessed through a new macro VVAR, which gives read-only access.
The vdso is now loaded verbatim into memory without any fixups. As a
side bonus, access from the vdso is faster because a level of
indirection is removed.
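As a usage sketch (names taken from the patch below; this is a
simplified illustration, not the literal header): the kernel defines
each variable once with DEFINE_VVAR, and vsyscall/vdso code reads it
through its fixed-address alias with VVAR.  The 896 bytes mentioned in
vvar.h are simply what remains of the vsyscall page past the vvar
area's base: 4096 - (3072 + 128).

    /* Kernel side: place the variable in its own magic section. */
    DEFINE_VVAR(int, vgetcpu_mode);  /* emitted as __vvar_vgetcpu_mode */

    /* vsyscall/vdso side: read-only access at a linker-checked address. */
    if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP)
            native_read_tscp(&p);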
---
arch/x86/include/asm/vdso.h | 14 ----------
arch/x86/include/asm/vgtod.h | 2 -
arch/x86/include/asm/vsyscall.h | 12 +-------
arch/x86/include/asm/vvar.h | 52 +++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/time.c | 2 +-
arch/x86/kernel/tsc.c | 4 +-
arch/x86/kernel/vmlinux.lds.S | 34 ++++++++-----------------
arch/x86/kernel/vsyscall_64.c | 46 +++++++++++++++-------------------
arch/x86/vdso/Makefile | 2 +-
arch/x86/vdso/vclock_gettime.c | 3 +-
arch/x86/vdso/vdso.lds.S | 7 -----
arch/x86/vdso/vextern.h | 16 ------------
arch/x86/vdso/vgetcpu.c | 3 +-
arch/x86/vdso/vma.c | 27 --------------------
arch/x86/vdso/vvar.c | 12 ---------
15 files changed, 91 insertions(+), 145 deletions(-)
create mode 100644 arch/x86/include/asm/vvar.h
delete mode 100644 arch/x86/vdso/vextern.h
delete mode 100644 arch/x86/vdso/vvar.c
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052..bb05228 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,20 +1,6 @@
#ifndef _ASM_X86_VDSO_H
#define _ASM_X86_VDSO_H
-#ifdef CONFIG_X86_64
-extern const char VDSO64_PRELINK[];
-
-/*
- * Given a pointer to the vDSO image, find the pointer to VDSO64_name
- * as that symbol is defined in the vDSO sources or linker script.
- */
-#define VDSO64_SYMBOL(base, name) \
-({ \
- extern const char VDSO64_##name[]; \
- (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
-})
-#endif
-
#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
extern const char VDSO32_PRELINK[];
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e20..646b4c1 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -23,8 +23,6 @@ struct vsyscall_gtod_data {
struct timespec wall_to_monotonic;
struct timespec wall_time_coarse;
};
-extern struct vsyscall_gtod_data __vsyscall_gtod_data
-__section_vsyscall_gtod_data;
extern struct vsyscall_gtod_data vsyscall_gtod_data;
#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d2..d555973 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,27 +16,19 @@ enum vsyscall_num {
#ifdef __KERNEL__
#include <linux/seqlock.h>
-#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
-#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
-
/* Definitions for CONFIG_GENERIC_TIME definitions */
-#define __section_vsyscall_gtod_data __attribute__ \
- ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
-#define __section_vsyscall_clock __attribute__ \
- ((unused, __section__ (".vsyscall_clock"),aligned(16)))
#define __vsyscall_fn \
__attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
#define VGETCPU_RDTSCP 1
#define VGETCPU_LSL 2
-extern int __vgetcpu_mode;
-extern volatile unsigned long __jiffies;
-
/* kernel space (writeable) */
extern int vgetcpu_mode;
extern struct timezone sys_tz;
+#include <asm/vvar.h>
+
extern void map_vsyscall(void);
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 0000000..131ce61
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,52 @@
+/*
+ * vvar.h: Shared vDSO/kernel variable declarations
+ * Copyright (c) 2011 Andy Lutomirski
+ * Subject to the GNU General Public License, version 2
+ *
+ * A handful of variables are accessible (read-only) from userspace
+ * code in the vsyscall page and the vdso. They are declared here.
+ * Some other file must define them with DEFINE_VVAR.
+ *
+ * In normal kernel code, they are used like any other variable.
+ * In user code, they are accessed through the VVAR macro.
+ *
+ * Each of these variables lives in the vsyscall page, and each
+ * one needs a unique offset within the little piece of the page
+ * reserved for vvars. Specify that offset in DECLARE_VVAR.
+ * (There are 896 bytes available. If you mess up, the linker will
+ * catch it.)
+ */
+
+/* Offset of vars within vsyscall page */
+#define VSYSCALL_VARS_OFFSET (3072 + 128)
+
+#if defined(__VVAR_KERNEL_LDS)
+
+/* The kernel linker script defines its own magic to put vvars in the
+ * right place.
+ */
+#define DECLARE_VVAR(offset, type, name) \
+ EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
+
+#else
+
+#define DECLARE_VVAR(offset, type, name) \
+ static type const * const vvaraddr_ ## name = \
+ (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
+
+#define DEFINE_VVAR(type, name) \
+ type __vvar_ ## name \
+ __attribute__((section(".vsyscall_var_" #name), aligned(16)))
+
+#define VVAR(name) (*vvaraddr_ ## name)
+
+#endif
+
+/* DECLARE_VVAR(offset, type, name) */
+
+DECLARE_VVAR(0, volatile unsigned long, jiffies)
+DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+DECLARE_VVAR(256, int, vgetcpu_mode)
+
+#undef DECLARE_VVAR
+#undef VSYSCALL_VARS_OFFSET
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25a28a2..00cbb27 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
#include <asm/time.h>
#ifdef CONFIG_X86_64
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
#endif
unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ffe5755..bc46566 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -777,8 +777,8 @@ static cycle_t __vsyscall_fn vread_tsc(void)
ret = (cycle_t)vget_cycles();
rdtsc_barrier();
- return ret >= __vsyscall_gtod_data.clock.cycle_last ?
- ret : __vsyscall_gtod_data.clock.cycle_last;
+ return ret >= VVAR(vsyscall_gtod_data).clock.cycle_last ?
+ ret : VVAR(vsyscall_gtod_data).clock.cycle_last;
}
#endif
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf47007..bbe6cc2 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -160,6 +160,12 @@ SECTIONS
#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
+ ADDR(.vsyscall_0) + offset \
+ : AT(VLOAD(.vsyscall_var_ ## x)) { \
+ *(.vsyscall_var_ ## x) \
+ } \
+ x = VVIRT(.vsyscall_var_ ## x);
. = ALIGN(4096);
__vsyscall_0 = .;
@@ -174,18 +180,6 @@ SECTIONS
*(.vsyscall_fn)
}
- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
- *(.vsyscall_gtod_data)
- }
-
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
- *(.vsyscall_clock)
- }
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
*(.vsyscall_1)
}
@@ -193,21 +187,14 @@ SECTIONS
*(.vsyscall_2)
}
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
- *(.vgetcpu_mode)
- }
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
- . = ALIGN(L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) {
- *(.jiffies)
- }
- jiffies = VVIRT(.jiffies);
-
.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
*(.vsyscall_3)
}
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+
. = __vsyscall_0 + PAGE_SIZE;
#undef VSYSCALL_ADDR
@@ -215,6 +202,7 @@ SECTIONS
#undef VLOAD
#undef VVIRT_OFFSET
#undef VVIRT
+#undef EMIT_VVAR
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c..5f6ad03 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,15 +49,8 @@
__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
+DEFINE_VVAR(int, vgetcpu_mode);
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
.lock = SEQLOCK_UNLOCKED,
.sysctl_enabled = 1,
@@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
*/
static __always_inline void do_get_tz(struct timezone * tz)
{
- *tz = __vsyscall_gtod_data.sys_tz;
+ *tz = VVAR(vsyscall_gtod_data).sys_tz;
}
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
- vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+ vread = VVAR(vsyscall_gtod_data).clock.vread;
+ if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
+ !vread)) {
gettimeofday(tv,NULL);
return;
}
now = vread();
- base = __vsyscall_gtod_data.clock.cycle_last;
- mask = __vsyscall_gtod_data.clock.mask;
- mult = __vsyscall_gtod_data.clock.mult;
- shift = __vsyscall_gtod_data.clock.shift;
+ base = VVAR(vsyscall_gtod_data).clock.cycle_last;
+ mask = VVAR(vsyscall_gtod_data).clock.mask;
+ mult = VVAR(vsyscall_gtod_data).clock.mult;
+ shift = VVAR(vsyscall_gtod_data).clock.shift;
- tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
- nsec = __vsyscall_gtod_data.wall_time_nsec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+ tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
+ nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
+ } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
/* calculate interval: */
cycle_delta = (now - base) & mask;
@@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t)
{
unsigned seq;
time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
+ if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
return time_syscall(t);
do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
- result = __vsyscall_gtod_data.wall_time_sec;
+ result = VVAR(vsyscall_gtod_data).wall_time_sec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+ } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
if (t)
*t = result;
@@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
We do this here because otherwise user space would do it on
its own in a likely inferior way (no access to jiffies).
If you don't like it pass NULL. */
- if (tcache && tcache->blob[0] == (j = __jiffies)) {
+ if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
p = tcache->blob[1];
- } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+ } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
native_read_tscp(&p);
} else {
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b6552b1..a651861 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images)
# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-$(VDSO64-y) += vma.o vdso.o
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754..0b873d4 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,9 +22,8 @@
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>
-#include "vextern.h"
-#define gtod vdso_vsyscall_gtod_data
+#define gtod (&VVAR(vsyscall_gtod_data))
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 4e5dd3b..81f2500 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -28,10 +28,3 @@ VERSION {
}
VDSO64_PRELINK = VDSO_PRELINK;
-
-/*
- * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
- */
-#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
-#include "vextern.h"
-#undef VEXTERN
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
deleted file mode 100644
index 1683ba2..0000000
--- a/arch/x86/vdso/vextern.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef VEXTERN
-#include <asm/vsyscall.h>
-#define VEXTERN(x) \
- extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
-#endif
-
-#define VMAGIC 0xfeedbabeabcdefabUL
-
-/* Any kernel variables used in the vDSO must be exported in the main
- kernel's vmlinux.lds.S/vsyscall.h/proper __section and
- put into vextern.h and be referenced as a pointer with vdso prefix.
- The main kernel later fills in the values. */
-
-VEXTERN(jiffies)
-VEXTERN(vgetcpu_mode)
-VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 9fbc6b2..5463ad5 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -11,14 +11,13 @@
#include <linux/time.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
-#include "vextern.h"
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
- if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
+ if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
native_read_tscp(&p);
} else {
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 4b5d26f..7abd2be 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -15,9 +15,6 @@
#include <asm/proto.h>
#include <asm/vdso.h>
-#include "vextern.h" /* Just for VMAGIC. */
-#undef VEXTERN
-
unsigned int __read_mostly vdso_enabled = 1;
extern char vdso_start[], vdso_end[];
@@ -26,20 +23,10 @@ extern unsigned short vdso_sync_cpuid;
static struct page **vdso_pages;
static unsigned vdso_size;
-static inline void *var_ref(void *p, char *name)
-{
- if (*(void **)p != (void *)VMAGIC) {
- printk("VDSO: variable %s broken\n", name);
- vdso_enabled = 0;
- }
- return p;
-}
-
static int __init init_vdso_vars(void)
{
int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
int i;
- char *vbase;
vdso_size = npages << PAGE_SHIFT;
vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
@@ -54,20 +41,6 @@ static int __init init_vdso_vars(void)
copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
}
- vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
- if (!vbase)
- goto oom;
-
- if (memcmp(vbase, "\177ELF", 4)) {
- printk("VDSO: I'm broken; not ELF\n");
- vdso_enabled = 0;
- }
-
-#define VEXTERN(x) \
- *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
-#include "vextern.h"
-#undef VEXTERN
- vunmap(vbase);
return 0;
oom:
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
deleted file mode 100644
index 1b7e703..0000000
--- a/arch/x86/vdso/vvar.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/* Define pointer to external vDSO variables.
- These are part of the vDSO. The kernel fills in the real addresses
- at boot time. This is done because when the vdso is linked the
- kernel isn't yet and we don't know the final addresses. */
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <asm/vsyscall.h>
-#include <asm/timex.h>
-#include <asm/vgtod.h>
-
-#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC;
-#include "vextern.h"
--
1.7.5.1
* Re: [PATCH v3 1/6] x86-64: Clean up vdso/kernel shared variables
2011-05-10 14:15 ` [PATCH v3 1/6] x86-64: Clean up vdso/kernel shared variables Andy Lutomirski
@ 2011-05-10 14:48 ` Borislav Petkov
2011-05-12 11:16 ` Andrew Lutomirski
0 siblings, 1 reply; 10+ messages in thread
From: Borislav Petkov @ 2011-05-10 14:48 UTC (permalink / raw)
To: Andy Lutomirski
Cc: x86, linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner
On Tue, May 10, 2011 at 10:15:03AM -0400, Andy Lutomirski wrote:
> Variables that are shared between the vdso and the kernel are
> currently a bit of a mess. They are each defined with their own
> magic, they are accessed differently in the kernel, the vsyscall page,
> and the vdso, and one of them (vsyscall_clock) doesn't even really
> exist.
>
> [...]
>
> +/* DECLARE_VVAR(offset, type, name) */
> +
> +DECLARE_VVAR(0, volatile unsigned long, jiffies)
> +DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
> +DECLARE_VVAR(256, int, vgetcpu_mode)
Why so spread out? Maybe put each on a single cacheline instead?
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
* Re: [PATCH v3 1/6] x86-64: Clean up vdso/kernel shared variables
2011-05-10 14:48 ` Borislav Petkov
@ 2011-05-12 11:16 ` Andrew Lutomirski
0 siblings, 0 replies; 10+ messages in thread
From: Andrew Lutomirski @ 2011-05-12 11:16 UTC (permalink / raw)
To: Borislav Petkov
Cc: x86, linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner
On Tue, May 10, 2011 at 10:48 AM, Borislav Petkov <bp@amd64.org> wrote:
> On Tue, May 10, 2011 at 10:15:03AM -0400, Andy Lutomirski wrote:
>> [...]
>>
>> +/* DECLARE_VVAR(offset, type, name) */
>> +
>> +DECLARE_VVAR(0, volatile unsigned long, jiffies)
>> +DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
>> +DECLARE_VVAR(256, int, vgetcpu_mode)
>
> Why so spread out? Maybe put each on a single cacheline instead?
They're currently on their own cachelines. I just spread them out
because there's room and that way modification to their types is less
likely to make them conflict.
I'll add a patch at the end of the series to pack them into two
cachelines. All of the variables are read-mostly so it should be a
win.
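Something along these lines in vvar.h, say (hypothetical offsets for
illustration; the actual follow-up patch may differ):

    DECLARE_VVAR(0, volatile unsigned long, jiffies)
    DECLARE_VVAR(8, int, vgetcpu_mode)
    DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)

That puts jiffies and vgetcpu_mode on one cacheline and starts
vsyscall_gtod_data on the next.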
--Andy
* [PATCH v3 2/6] x86-64: Remove unnecessary barrier in vread_tsc
2011-05-10 14:15 [PATCH v3 0/6] Micro-optimize vclock_gettime Andy Lutomirski
2011-05-10 14:15 ` [PATCH v3 1/6] x86-64: Clean up vdso/kernel shared variables Andy Lutomirski
@ 2011-05-10 14:15 ` Andy Lutomirski
2011-05-10 14:15 ` [PATCH v3 3/6] x86-64: Don't generate cmov " Andy Lutomirski
` (3 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Andy Lutomirski @ 2011-05-10 14:15 UTC (permalink / raw)
To: x86
Cc: linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner, Andy Lutomirski
RDTSC is completely unordered on modern Intel and AMD CPUs. The
Intel manual says that lfence;rdtsc causes all previous instructions
to complete before the tsc is read, and the AMD manual says to use
mfence;rdtsc to do the same thing.
From a decent amount of testing [1] this is enough to make rdtsc
be ordered with respect to subsequent loads across a wide variety
of CPUs.
On Sandy Bridge (i7-2600), this improves a loop of
clock_gettime(CLOCK_MONOTONIC) by more than 5 ns/iter.
[1] https://lkml.org/lkml/2011/4/18/350
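For reference, a minimal user-space approximation of the fenced read
(a sketch only: in the kernel, rdtsc_barrier() picks the right fence
at boot via the alternatives mechanism; LFENCE is hard-coded here for
illustration):

    #include <stdint.h>

    static inline uint64_t fenced_rdtsc(void)
    {
            uint32_t lo, hi;

            /* lfence keeps rdtsc from being hoisted above earlier
             * loads (Intel); AMD wants mfence instead.  The "memory"
             * clobber stops the compiler from reordering surrounding
             * accesses across the read. */
            asm volatile("lfence; rdtsc"
                         : "=a" (lo), "=d" (hi) : : "memory");
            return ((uint64_t)hi << 32) | lo;
    }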
Signed-off-by: Andy Lutomirski <luto@mit.edu>
---
arch/x86/kernel/tsc.c | 9 +++++----
1 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index bc46566..7cabdae 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -769,13 +769,14 @@ static cycle_t __vsyscall_fn vread_tsc(void)
cycle_t ret;
/*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
*/
rdtsc_barrier();
ret = (cycle_t)vget_cycles();
- rdtsc_barrier();
return ret >= VVAR(vsyscall_gtod_data).clock.cycle_last ?
ret : VVAR(vsyscall_gtod_data).clock.cycle_last;
--
1.7.5.1
* [PATCH v3 3/6] x86-64: Don't generate cmov in vread_tsc
2011-05-10 14:15 [PATCH v3 0/6] Micro-optimize vclock_gettime Andy Lutomirski
2011-05-10 14:15 ` [PATCH v3 1/6] x86-64: Clean up vdso/kernel shared variables Andy Lutomirski
2011-05-10 14:15 ` [PATCH v3 2/6] x86-64: Remove unnecessary barrier in vread_tsc Andy Lutomirski
@ 2011-05-10 14:15 ` Andy Lutomirski
2011-05-10 14:15 ` [PATCH v3 4/6] x86-64: vclock_gettime(CLOCK_MONOTONIC) can't ever see nsec < 0 Andy Lutomirski
` (2 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Andy Lutomirski @ 2011-05-10 14:15 UTC (permalink / raw)
To: x86
Cc: linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner, Andy Lutomirski
vread_tsc checks whether rdtsc returns something less than
cycle_last, which is an extremely predictable branch. GCC likes
to generate a cmov anyway, which is several cycles slower than
a predicted branch. This saves a couple of nanoseconds.
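Illustrative only (hand-written AT&T assembly, not actual compiler
output): the two shapes for "ret >= last ? ret : last", with ret in
%rax and last in %rdx:

    # cmov: unconditional data dependence, no branch prediction
    cmp     %rdx, %rax
    cmovb   %rdx, %rax

    # forced branch: almost never taken, so effectively free
    cmp     %rdx, %rax
    jae     1f
    mov     %rdx, %rax
1: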
Signed-off-by: Andy Lutomirski <luto@mit.edu>
---
arch/x86/kernel/tsc.c | 18 ++++++++++++++++--
1 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7cabdae..db4c6e6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -767,6 +767,7 @@ static cycle_t read_tsc(struct clocksource *cs)
static cycle_t __vsyscall_fn vread_tsc(void)
{
cycle_t ret;
+ u64 last;
/*
* Empirically, a fence (of type that depends on the CPU)
@@ -778,8 +779,21 @@ static cycle_t __vsyscall_fn vread_tsc(void)
rdtsc_barrier();
ret = (cycle_t)vget_cycles();
- return ret >= VVAR(vsyscall_gtod_data).clock.cycle_last ?
- ret : VVAR(vsyscall_gtod_data).clock.cycle_last;
+ last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
}
#endif
--
1.7.5.1
* [PATCH v3 4/6] x86-64: vclock_gettime(CLOCK_MONOTONIC) can't ever see nsec < 0
2011-05-10 14:15 [PATCH v3 0/6] Micro-optimize vclock_gettime Andy Lutomirski
` (2 preceding siblings ...)
2011-05-10 14:15 ` [PATCH v3 3/6] x86-64: Don't generate cmov " Andy Lutomirski
@ 2011-05-10 14:15 ` Andy Lutomirski
2011-05-10 14:15 ` [PATCH v3 5/6] x86-64: Move vread_tsc into a new file with sensible options Andy Lutomirski
2011-05-10 14:15 ` [PATCH v3 6/6] x86-64: Turn off -pg and turn on -foptimize-sibling-calls for vDSO Andy Lutomirski
5 siblings, 0 replies; 10+ messages in thread
From: Andy Lutomirski @ 2011-05-10 14:15 UTC (permalink / raw)
To: x86
Cc: linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner, Andy Lutomirski
vclock_gettime's do_monotonic helper can't ever generate a negative
nsec value, so it doesn't need to check whether it's negative. In
the CLOCK_MONOTONIC_COARSE case, ns can't ever exceed 2e9-1, so we
can avoid the loop entirely. This saves a single easily-predicted
branch.
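The bound argument, spelled out: both inputs are normalized timespecs
(an invariant the kernel timekeeping code maintains), so

    0 <= wall_time_coarse.tv_nsec  < NSEC_PER_SEC
    0 <= wall_to_monotonic.tv_nsec < NSEC_PER_SEC
    =>  0 <= ns < 2 * NSEC_PER_SEC

and a single conditional subtraction fully normalizes ns.  do_monotonic
keeps its while loop because vgetns() can return more than a second's
worth of nanoseconds if timekeeping updates are delayed.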
Signed-off-by: Andy Lutomirski <luto@mit.edu>
---
arch/x86/vdso/vclock_gettime.c | 40 ++++++++++++++++++++++------------------
1 files changed, 22 insertions(+), 18 deletions(-)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 0b873d4..28b2c00 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -55,22 +55,6 @@ notrace static noinline int do_realtime(struct timespec *ts)
return 0;
}
-/* Copy of the version in kernel/time.c which we cannot directly access */
-notrace static void
-vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
-{
- while (nsec >= NSEC_PER_SEC) {
- nsec -= NSEC_PER_SEC;
- ++sec;
- }
- while (nsec < 0) {
- nsec += NSEC_PER_SEC;
- --sec;
- }
- ts->tv_sec = sec;
- ts->tv_nsec = nsec;
-}
-
notrace static noinline int do_monotonic(struct timespec *ts)
{
unsigned long seq, ns, secs;
@@ -81,7 +65,17 @@ notrace static noinline int do_monotonic(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
+ * are all guaranteed to be nonnegative.
+ */
+ while (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}
@@ -106,7 +100,17 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec and wall_to_monotonic.tv_nsec are
+ * guaranteed to be between 0 and NSEC_PER_SEC.
+ */
+ if (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}
--
1.7.5.1
* [PATCH v3 5/6] x86-64: Move vread_tsc into a new file with sensible options
2011-05-10 14:15 [PATCH v3 0/6] Micro-optimize vclock_gettime Andy Lutomirski
` (3 preceding siblings ...)
2011-05-10 14:15 ` [PATCH v3 4/6] x86-64: vclock_gettime(CLOCK_MONOTONIC) can't ever see nsec < 0 Andy Lutomirski
@ 2011-05-10 14:15 ` Andy Lutomirski
2011-05-10 14:36 ` Peter Zijlstra
2011-05-10 14:15 ` [PATCH v3 6/6] x86-64: Turn off -pg and turn on -foptimize-sibling-calls for vDSO Andy Lutomirski
5 siblings, 1 reply; 10+ messages in thread
From: Andy Lutomirski @ 2011-05-10 14:15 UTC (permalink / raw)
To: x86
Cc: linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner, Andy Lutomirski
vread_tsc is short and hot, and it's userspace code so the usual
reasons to keep frame pointers around, enable -pg, and turn off
sibling calls don't apply.
(OK, turning off sibling calls has no effect. But it might
someday...)
As an added benefit, tsc.c is profilable now.
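To double-check which flags an object actually gets, kbuild's verbose
mode prints the full compiler command line:

    $ make V=1 arch/x86/kernel/vread_tsc_64.o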
Signed-off-by: Andy Lutomirski <luto@mit.edu>
---
arch/x86/include/asm/tsc.h | 4 ++++
arch/x86/kernel/Makefile | 8 +++++---
arch/x86/kernel/tsc.c | 34 ----------------------------------
arch/x86/kernel/vread_tsc_64.c | 36 ++++++++++++++++++++++++++++++++++++
4 files changed, 45 insertions(+), 37 deletions(-)
create mode 100644 arch/x86/kernel/vread_tsc_64.c
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132f..8f2b3c6 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern unsigned long native_calibrate_tsc(void);
+#ifdef CONFIG_X86_64
+extern cycles_t vread_tsc(void);
+#endif
+
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2..7626fb8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
-CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
CFLAGS_REMOVE_pvclock.o = -pg
@@ -24,13 +23,16 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc.o := $(nostackp)
+CFLAGS_vread_tsc_64.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
GCOV_PROFILE_vsyscall_64.o := n
GCOV_PROFILE_hpet.o := n
GCOV_PROFILE_tsc.o := n
GCOV_PROFILE_paravirt.o := n
+# vread_tsc_64 is hot and should be fully optimized:
+CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-omit-frame-pointer -fno-optimize-sibling-calls
+
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db4c6e6..5346381 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,40 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
ret : clocksource_tsc.cycle_last;
}
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
- cycle_t ret;
- u64 last;
-
- /*
- * Empirically, a fence (of type that depends on the CPU)
- * before rdtsc is enough to ensure that rdtsc is ordered
- * with respect to loads. The various CPU manuals are unclear
- * as to whether rdtsc can be reordered with later loads,
- * but no one has ever seen it happen.
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
-
- last = VVAR(vsyscall_gtod_data).clock.cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- /*
- * GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a function of time and the likely is
- * very likely) and there's a data dependence, so force GCC
- * to generate a branch instead. I don't barrier() because
- * we don't actually need a barrier, and if this function
- * ever gets inlined it will generate worse code.
- */
- asm volatile ("");
- return last;
-}
-#endif
-
static void resume_tsc(struct clocksource *cs)
{
clocksource_tsc.cycle_last = 0;
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 0000000..a81aa9e
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,36 @@
+/* This code runs in userspace. */
+
+#define DISABLE_BRANCH_PROFILING
+#include <asm/vgtod.h>
+
+notrace cycle_t __vsyscall_fn vread_tsc(void)
+{
+ cycle_t ret;
+ u64 last;
+
+ /*
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+
+ last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
--
1.7.5.1
* Re: [PATCH v3 5/6] x86-64: Move vread_tsc into a new file with sensible options
2011-05-10 14:15 ` [PATCH v3 5/6] x86-64: Move vread_tsc into a new file with sensible options Andy Lutomirski
@ 2011-05-10 14:36 ` Peter Zijlstra
0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2011-05-10 14:36 UTC (permalink / raw)
To: Andy Lutomirski
Cc: x86, linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
David S. Miller, Eric Dumazet, Thomas Gleixner
On Tue, 2011-05-10 at 10:15 -0400, Andy Lutomirski wrote:
> vread_tsc is short and hot, and it's userspace code so the usual
> reasons to keep frame pointers around, enable -pg, and turn off
> sibling calls don't apply.
>
I really don't like that: turning off frame pointers here means that if
your profiler interrupt hits while in the vDSO, the user-space backtrace
is dead.
Please leave the frame pointer in. Especially on x86_64, the cost of
keeping the frame pointer isn't much at all, since it isn't nearly as
register-starved as i386.
I'm fine with stripping -pg; that is indeed useless for the vDSO.
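For background on why the backtrace dies: a frame-pointer unwinder just
chases the saved-%rbp chain. A minimal sketch, assuming the classic
frame layout; record_ip() is a hypothetical consumer:

    struct stack_frame {
            struct stack_frame *next_frame; /* saved %rbp of caller */
            unsigned long return_address;   /* pushed by call */
    };

    extern void record_ip(unsigned long ip);  /* hypothetical */

    static void walk_user_stack(struct stack_frame *fp)
    {
            while (fp) {                    /* NULL rbp ends the walk */
                    record_ip(fp->return_address);
                    fp = fp->next_frame;    /* one hop per frame */
            }
    }

A vDSO frame built with -fomit-frame-pointer leaves %rbp pointing at
stale data, so the walk ends (or wanders off) right there.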
* [PATCH v3 6/6] x86-64: Turn off -pg and turn on -foptimize-sibling-calls for vDSO
2011-05-10 14:15 [PATCH v3 0/6] Micro-optimize vclock_gettime Andy Lutomirski
` (4 preceding siblings ...)
2011-05-10 14:15 ` [PATCH v3 5/6] x86-64: Move vread_tsc into a new file with sensible options Andy Lutomirski
@ 2011-05-10 14:15 ` Andy Lutomirski
5 siblings, 0 replies; 10+ messages in thread
From: Andy Lutomirski @ 2011-05-10 14:15 UTC (permalink / raw)
To: x86
Cc: linux-kernel, Ingo Molnar, Andi Kleen, Linus Torvalds,
Nick Piggin, David S. Miller, Eric Dumazet, Peter Zijlstra,
Thomas Gleixner, Andy Lutomirski
The vDSO isn't part of the kernel, so profiling and kernel
backtraces don't really matter.
Signed-off-by: Andy Lutomirski <luto@mit.edu>
---
arch/x86/vdso/Makefile | 15 ++++++++++++++-
1 files changed, 14 insertions(+), 1 deletions(-)
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index a651861..bef0bc9 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -37,11 +37,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector)
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ -fno-omit-frame-pointer -foptimize-sibling-calls
$(vobjs): KBUILD_CFLAGS += $(CFL)
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vdso-note.o = -pg
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vvar.o = -pg
+
targets += vdso-syms.lds
obj-$(VDSO64-y) += vdso-syms.lds
--
1.7.5.1