public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/3] X86: make VDSO data support multiple pages
@ 2014-12-17 23:12 Shaohua Li
  2014-12-17 23:12 ` [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch Shaohua Li
  2014-12-17 23:12 ` [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO Shaohua Li
  0 siblings, 2 replies; 29+ messages in thread
From: Shaohua Li @ 2014-12-17 23:12 UTC (permalink / raw)
  To: linux-kernel, x86
  Cc: Kernel-team, Andy Lutomirski, H. Peter Anvin, Ingo Molnar

Currently vdso data is one page. Next patches will add per-cpu data to
vdso, which requires several pages if CPU number is big. This makes VDSO
data support multiple pages.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 arch/x86/include/asm/vdso.h     | 2 +-
 arch/x86/include/asm/vvar.h     | 8 ++++++--
 arch/x86/kernel/asm-offsets.c   | 5 +++++
 arch/x86/kernel/vmlinux.lds.S   | 6 +++---
 arch/x86/tools/relocs.c         | 2 +-
 arch/x86/vdso/vdso-layout.lds.S | 9 +++++----
 arch/x86/vdso/vdso2c.c          | 6 +++---
 arch/x86/vdso/vma.c             | 9 +++++----
 8 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 8021bd2..35ca749 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -20,7 +20,7 @@ struct vdso_image {
 
 	long sym_vvar_start;  /* Negative offset to the vvar area */
 
-	long sym_vvar_page;
+	long sym_vvar_pages;
 	long sym_hpet_page;
 	long sym_VDSO32_NOTE_MASK;
 	long sym___kernel_sigreturn;
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 3f32dfc..62bc6f8 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -29,7 +29,7 @@
 
 #else
 
-extern char __vvar_page;
+extern char __vvar_pages;
 
 #define DECLARE_VVAR(offset, type, name)				\
 	extern type vvar_ ## name __attribute__((visibility("hidden")));
@@ -45,7 +45,11 @@ extern char __vvar_page;
 /* DECLARE_VVAR(offset, type, name) */
 
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
-
+/*
+ * you must update VVAR_TOTAL_SIZE to reflect all of the variables we're
+ * stuffing into the vvar area.  Don't change any of the above without
+ * also changing this math of VVAR_TOTAL_SIZE
+ */
 #undef DECLARE_VVAR
 
 #endif
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 9f6b934..0ab31a9 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -16,6 +16,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/vgtod.h>
 
 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -71,4 +72,8 @@ void common(void) {
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+	BLANK();
+	DEFINE(VVAR_TOTAL_SIZE,
+		ALIGN(128 + sizeof(struct vsyscall_gtod_data), PAGE_SIZE));
 }
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300..2efeee9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -149,7 +149,7 @@ SECTIONS
 
 
 	. = ALIGN(PAGE_SIZE);
-	__vvar_page = .;
+	__vvar_pages = .;
 
 	.vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
 		/* work around gold bug 13023 */
@@ -168,10 +168,10 @@ SECTIONS
 		 * Pad the rest of the page with zeros.  Otherwise the loader
 		 * can leave garbage here.
 		 */
-		. = __vvar_beginning_hack + PAGE_SIZE;
+		. = __vvar_beginning_hack + VVAR_TOTAL_SIZE;
 	} :data
 
-       . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+	. = ALIGN(__vvar_pages + VVAR_TOTAL_SIZE, PAGE_SIZE);
 
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0c2fae8..ea01c48 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -73,7 +73,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"init_per_cpu__.*|"
 	"__end_rodata_hpage_align|"
 #endif
-	"__vvar_page|"
+	"__vvar_pages|"
 	"_end)$"
 };
 
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index de2c921..413c739 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -1,4 +1,5 @@
 #include <asm/vdso.h>
+#include <asm/asm-offsets.h>
 
 /*
  * Linker script for vDSO.  This is an ELF shared object prelinked to
@@ -25,17 +26,17 @@ SECTIONS
 	 * segment.
 	 */
 
-	vvar_start = . - 2 * PAGE_SIZE;
-	vvar_page = vvar_start;
+	vvar_start = . - (VVAR_TOTAL_SIZE + PAGE_SIZE);
+	vvar_pages = vvar_start;
 
 	/* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_pages + offset;
 #define __VVAR_KERNEL_LDS
 #include <asm/vvar.h>
 #undef __VVAR_KERNEL_LDS
 #undef EMIT_VVAR
 
-	hpet_page = vvar_start + PAGE_SIZE;
+	hpet_page = vvar_start + VVAR_TOTAL_SIZE;
 
 	. = SIZEOF_HEADERS;
 
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c
index 8627db2..95eda74 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/vdso/vdso2c.c
@@ -71,14 +71,14 @@ const char *outfilename;
 /* Symbols that we need in vdso2c. */
 enum {
 	sym_vvar_start,
-	sym_vvar_page,
+	sym_vvar_pages,
 	sym_hpet_page,
 	sym_VDSO_FAKE_SECTION_TABLE_START,
 	sym_VDSO_FAKE_SECTION_TABLE_END,
 };
 
 const int special_pages[] = {
-	sym_vvar_page,
+	sym_vvar_pages,
 	sym_hpet_page,
 };
 
@@ -89,7 +89,7 @@ struct vdso_sym {
 
 struct vdso_sym required_syms[] = {
 	[sym_vvar_start] = {"vvar_start", true},
-	[sym_vvar_page] = {"vvar_page", true},
+	[sym_vvar_pages] = {"vvar_pages", true},
 	[sym_hpet_page] = {"hpet_page", true},
 	[sym_VDSO_FAKE_SECTION_TABLE_START] = {
 		"VDSO_FAKE_SECTION_TABLE_START", false
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 009495b..6496c65 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -19,6 +19,7 @@
 #include <asm/page.h>
 #include <asm/hpet.h>
 #include <asm/desc.h>
+#include <asm/asm-offsets.h>
 
 #if defined(CONFIG_X86_64)
 unsigned int __read_mostly vdso64_enabled = 1;
@@ -133,11 +134,11 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 		goto up_fail;
 	}
 
-	if (image->sym_vvar_page)
+	if (image->sym_vvar_pages)
 		ret = remap_pfn_range(vma,
-				      text_start + image->sym_vvar_page,
-				      __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
-				      PAGE_SIZE,
+				      text_start + image->sym_vvar_pages,
+				      __pa_symbol(&__vvar_pages) >> PAGE_SHIFT,
+				      VVAR_TOTAL_SIZE,
 				      PAGE_READONLY);
 
 	if (ret)
-- 
1.8.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch
  2014-12-17 23:12 [PATCH v2 1/3] X86: make VDSO data support multiple pages Shaohua Li
@ 2014-12-17 23:12 ` Shaohua Li
  2014-12-19  1:05   ` Thomas Gleixner
  2014-12-17 23:12 ` [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO Shaohua Li
  1 sibling, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2014-12-17 23:12 UTC (permalink / raw)
  To: linux-kernel, x86
  Cc: Kernel-team, Andy Lutomirski, H. Peter Anvin, Ingo Molnar

vdso code can't disable preempt, so it can be preempted at any time.
This makes a challenge to implement specific features. This patch adds a
generic API to let vdso code detect context switch.

We can use a context switch count to do the detection. The change of the
count within a given time window can be used to detect if a context
switch occurs. Andy suggested we can use a timestamp instead, so in the
next patch we can save some instructions. But the principle isn't
changed here. This patch uses the timestamp approach.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 arch/x86/Kconfig              |  4 ++++
 arch/x86/include/asm/vdso.h   | 17 +++++++++++++++++
 arch/x86/include/asm/vvar.h   |  6 ++++++
 arch/x86/kernel/asm-offsets.c |  6 ++++++
 arch/x86/vdso/vma.c           |  1 +
 kernel/sched/core.c           |  5 +++++
 6 files changed, 39 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d69f1cd..e384147 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1943,6 +1943,10 @@ config COMPAT_VDSO
 	  If unsure, say N: if you are compiling your own kernel, you
 	  are unlikely to be using a buggy version of glibc.
 
+config VDSO_CS_DETECT
+	def_bool y
+	depends on X86_64
+
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
 	---help---
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 35ca749..d4556a3 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -49,6 +49,23 @@ extern const struct vdso_image *selected_vdso32;
 
 extern void __init init_vdso_image(const struct vdso_image *image);
 
+#ifdef CONFIG_VDSO_CS_DETECT
+struct vdso_percpu_data {
+	u64 last_cs_timestamp;
+} ____cacheline_aligned;
+
+struct vdso_data {
+	int dummy;
+	struct vdso_percpu_data vpercpu[0];
+};
+extern struct vdso_data vdso_data;
+
+static inline void vdso_set_cpu_cs_timestamp(int cpu)
+{
+	rdtscll(vdso_data.vpercpu[cpu].last_cs_timestamp);
+}
+#endif
+
 #endif /* __ASSEMBLER__ */
 
 #endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 62bc6f8..19ac55c 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -45,6 +45,12 @@ extern char __vvar_pages;
 /* DECLARE_VVAR(offset, type, name) */
 
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
+/*
+ * this one needs to be last because it ends with a per-cpu array.
+ */
+DECLARE_VVAR(320, struct vdso_data, vdso_data)
+#endif
 /*
  * you must update VVAR_TOTAL_SIZE to reflect all of the variables we're
  * stuffing into the vvar area.  Don't change any of the above without
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 0ab31a9..7321cdc 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,6 +17,7 @@
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
 #include <asm/vgtod.h>
+#include <asm/vdso.h>
 
 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -74,6 +75,11 @@ void common(void) {
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
 	BLANK();
+#ifdef CONFIG_VDSO_CS_DETECT
+	DEFINE(VVAR_TOTAL_SIZE, ALIGN(320 + sizeof(struct vdso_data)
+		+ sizeof(struct vdso_percpu_data) * CONFIG_NR_CPUS, PAGE_SIZE));
+#else
 	DEFINE(VVAR_TOTAL_SIZE,
 		ALIGN(128 + sizeof(struct vsyscall_gtod_data), PAGE_SIZE));
+#endif
 }
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 6496c65..22b1a69 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -23,6 +23,7 @@
 
 #if defined(CONFIG_X86_64)
 unsigned int __read_mostly vdso64_enabled = 1;
+DEFINE_VVAR(struct vdso_data, vdso_data);
 #endif
 
 void __init init_vdso_image(const struct vdso_image *image)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b7..d8e882d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2232,6 +2232,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	struct rq *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+#ifdef CONFIG_VDSO_CS_DETECT
+	int cpu = smp_processor_id();
+
+	vdso_set_cpu_cs_timestamp(cpu);
+#endif
 
 	rq->prev_mm = NULL;
 
-- 
1.8.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-17 23:12 [PATCH v2 1/3] X86: make VDSO data support multiple pages Shaohua Li
  2014-12-17 23:12 ` [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch Shaohua Li
@ 2014-12-17 23:12 ` Shaohua Li
  2014-12-18 23:30   ` Andy Lutomirski
  1 sibling, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2014-12-17 23:12 UTC (permalink / raw)
  To: linux-kernel, x86
  Cc: Kernel-team, Andy Lutomirski, H. Peter Anvin, Ingo Molnar

This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
use the following method to compute the thread cpu time:

    t0 = process start
    t1 = most recent context switch time
    t2 = time at which the vsyscall is invoked

    thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
    		= current->se.sum_exec_runtime + now - sched_clock()

At context switch time, we stash away

    adj_sched_time = sum_exec_runtime - sched_clock()

in a per-cpu struct in the VVAR page and then compute

    thread_cpu_time = adj_sched_time + now

All computations are done in nanosecs on systems where TSC is stable. If
TSC is unstable, we fall back to a regular syscall.
    Benchmark data:

    for (i = 0; i < 100000000; i++) {
	    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	    sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
    }

    Baseline:
    real    1m3.428s
    user    0m5.190s
    sys     0m58.220s

    patched:
    real    0m4.912s
    user    0m4.910s
    sys     0m0.000s

This should speed up profilers that need to query thread cpu time a lot
to do fine-grained timestamps.

No statistically significant regression was detected on x86_64 context
switch code. Most archs that don't support vsyscalls will have this code
disabled via jump labels.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Kumar Sundararajan <kumar@fb.com>
Signed-off-by: Arun Sharma <asharma@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 arch/x86/include/asm/vdso.h    |  9 +++++-
 arch/x86/kernel/tsc.c          | 14 +++++++++
 arch/x86/vdso/vclock_gettime.c | 69 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/vdso/vma.c            | 10 +++++-
 kernel/sched/core.c            |  3 ++
 5 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index d4556a3..fcdf611 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -52,14 +52,21 @@ extern void __init init_vdso_image(const struct vdso_image *image);
 #ifdef CONFIG_VDSO_CS_DETECT
 struct vdso_percpu_data {
 	u64 last_cs_timestamp;
+
+	u64 adj_sched_time;
+	u64 cyc2ns_offset;
+	u32 cyc2ns_mul;
 } ____cacheline_aligned;
 
 struct vdso_data {
-	int dummy;
+	unsigned int thread_cputime_disabled;
 	struct vdso_percpu_data vpercpu[0];
 };
 extern struct vdso_data vdso_data;
 
+struct static_key;
+extern struct static_key vcpu_thread_cputime_enabled;
+
 static inline void vdso_set_cpu_cs_timestamp(int cpu)
 {
 	rdtscll(vdso_data.vpercpu[cpu].last_cs_timestamp);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b7e50bb..dd3b281 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -21,6 +21,7 @@
 #include <asm/hypervisor.h>
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
+#include <asm/vdso.h>
 
 unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
@@ -263,6 +264,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	data->cyc2ns_offset = ns_now -
 		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
 
+#ifdef CONFIG_VDSO_CS_DETECT
+	vdso_data.vpercpu[cpu].cyc2ns_mul = data->cyc2ns_mul;
+	vdso_data.vpercpu[cpu].cyc2ns_offset = data->cyc2ns_offset;
+#endif
+
 	cyc2ns_write_end(cpu, data);
 
 done:
@@ -989,6 +995,10 @@ void mark_tsc_unstable(char *reason)
 		tsc_unstable = 1;
 		clear_sched_clock_stable();
 		disable_sched_clock_irqtime();
+#ifdef CONFIG_VDSO_CS_DETECT
+		vdso_data.thread_cputime_disabled = 1;
+		static_key_slow_dec(&vcpu_thread_cputime_enabled);
+#endif
 		pr_info("Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
 		if (clocksource_tsc.mult)
@@ -1202,6 +1212,10 @@ void __init tsc_init(void)
 
 	tsc_disabled = 0;
 	static_key_slow_inc(&__use_tsc);
+#ifdef CONFIG_VDSO_CS_DETECT
+	vdso_data.thread_cputime_disabled = !cpu_has(&boot_cpu_data,
+			X86_FEATURE_RDTSCP);
+#endif
 
 	if (!no_sched_irq_time)
 		enable_sched_clock_irqtime();
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 9793322..0aa39b1 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -17,6 +17,7 @@
 #include <asm/vvar.h>
 #include <asm/unistd.h>
 #include <asm/msr.h>
+#include <asm/vdso.h>
 #include <linux/math64.h>
 #include <linux/time.h>
 
@@ -289,6 +290,62 @@ notrace static void do_monotonic_coarse(struct timespec *ts)
 	} while (unlikely(gtod_read_retry(gtod, seq)));
 }
 
+#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+notrace static inline u64 __cycles_2_ns(u64 cyc, u64 scale, u64 offset)
+{
+	u64 ns = offset;
+	ns += mul_u64_u32_shr(cyc, scale, CYC2NS_SCALE_FACTOR);
+	return ns;
+}
+
+notrace static inline u64 vdso_get_cpu_cs_timestamp(int cpu)
+{
+	return VVAR(vdso_data).vpercpu[cpu].last_cs_timestamp;
+}
+
+#define THREAD_CPU_TIME_MAX_LOOP 20
+notrace static u64 do_thread_cpu_time(void)
+{
+	unsigned int p;
+	u64 tscval;
+	u64 adj_sched_time;
+	u64 scale;
+	u64 offset;
+	const struct vdso_data *vp = &VVAR(vdso_data);
+	int cpuo, cpun;
+	int loop = 0;
+
+	do {
+		loop++;
+		if (loop > THREAD_CPU_TIME_MAX_LOOP)
+			return -1LL;
+
+		rdtscpll(tscval, p);
+		cpuo = p & VGETCPU_CPU_MASK;
+		adj_sched_time = vp->vpercpu[cpuo].adj_sched_time;
+		scale = vp->vpercpu[cpuo].cyc2ns_mul;
+		offset = vp->vpercpu[cpuo].cyc2ns_offset;
+
+		cpun = __getcpu() & VGETCPU_CPU_MASK;
+		/*
+		 * The CPU check guarantees this runs on the same CPU, so no
+		 * barrier required
+		 */
+	} while (unlikely(cpuo != cpun ||
+			tscval <= vdso_get_cpu_cs_timestamp(cpun)));
+
+	return __cycles_2_ns(tscval, scale, offset) + adj_sched_time;
+}
+
+notrace static inline void ns_to_ts(u64 ns, struct timespec *ts)
+{
+	u32 rem;
+	ts->tv_sec = div_u64_rem(ns, NSEC_PER_SEC, &rem);
+	ts->tv_nsec = rem;
+}
+#endif
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	switch (clock) {
@@ -306,6 +363,18 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 	case CLOCK_MONOTONIC_COARSE:
 		do_monotonic_coarse(ts);
 		break;
+#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
+	case CLOCK_THREAD_CPUTIME_ID: {
+		u64 time;
+		if (VVAR(vdso_data).thread_cputime_disabled)
+			goto fallback;
+		time = do_thread_cpu_time();
+		if (time == -1LL)
+			goto fallback;
+		ns_to_ts(time, ts);
+		break;
+	}
+#endif
 	default:
 		goto fallback;
 	}
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 22b1a69..237b3af 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -12,6 +12,8 @@
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/cpu.h>
+#include <linux/jump_label.h>
+#include <asm/vsyscall.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
 #include <asm/vdso.h>
@@ -23,7 +25,11 @@
 
 #if defined(CONFIG_X86_64)
 unsigned int __read_mostly vdso64_enabled = 1;
-DEFINE_VVAR(struct vdso_data, vdso_data);
+
+DEFINE_VVAR(struct vdso_data, vdso_data) = {
+	.thread_cputime_disabled = 1,
+};
+struct static_key vcpu_thread_cputime_enabled;
 #endif
 
 void __init init_vdso_image(const struct vdso_image *image)
@@ -275,6 +281,8 @@ static int __init init_vdso(void)
 	init_vdso_image(&vdso_image_x32);
 #endif
 
+	if (!vdso_data.thread_cputime_disabled)
+		static_key_slow_inc(&vcpu_thread_cputime_enabled);
 	cpu_notifier_register_begin();
 
 	on_each_cpu(vgetcpu_cpu_init, NULL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8e882d..03e6175 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2236,6 +2236,9 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	int cpu = smp_processor_id();
 
 	vdso_set_cpu_cs_timestamp(cpu);
+	if (static_key_false(&vcpu_thread_cputime_enabled))
+		vdso_data.vpercpu[cpu].adj_sched_time =
+			current->se.sum_exec_runtime - sched_clock();
 #endif
 
 	rq->prev_mm = NULL;
-- 
1.8.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-17 23:12 ` [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO Shaohua Li
@ 2014-12-18 23:30   ` Andy Lutomirski
  2014-12-19  0:22     ` Andy Lutomirski
  0 siblings, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2014-12-18 23:30 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-kernel@vger.kernel.org, X86 ML, Kernel-team, H. Peter Anvin,
	Ingo Molnar, Peter Zijlstra, John Stultz

On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <shli@fb.com> wrote:
> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
> use the following method to compute the thread cpu time:
>
>     t0 = process start
>     t1 = most recent context switch time
>     t2 = time at which the vsyscall is invoked
>
>     thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
>                 = current->se.sum_exec_runtime + now - sched_clock()
>
> At context switch time, we stash away
>
>     adj_sched_time = sum_exec_runtime - sched_clock()
>
> in a per-cpu struct in the VVAR page and then compute
>
>     thread_cpu_time = adj_sched_time + now
>
> All computations are done in nanosecs on systems where TSC is stable. If
> TSC is unstable, we fall back to a regular syscall.
>     Benchmark data:
>
>     for (i = 0; i < 100000000; i++) {
>             clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>             sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
>     }

A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
is spent taking various locks, and I think it could be worth adding a
fast path for the read-my-own-clock case in which we just disable
preemption and read the thing without any locks.

If we're actually going to go the vdso route, I'd like to make the
scheduler hooks clean.  Peterz and/or John, what's the right way to
get an arch-specific callback with sum_exec_runtime and an up to date
sched_clock value during a context switch?  I'd much rather not add
yet another rdtsc instruction to the scheduler.

--Andy

>
>     Baseline:
>     real    1m3.428s
>     user    0m5.190s
>     sys     0m58.220s
>
>     patched:
>     real    0m4.912s
>     user    0m4.910s
>     sys     0m0.000s
>
> This should speed up profilers that need to query thread cpu time a lot
> to do fine-grained timestamps.
>
> No statistically significant regression was detected on x86_64 context
> switch code. Most archs that don't support vsyscalls will have this code
> disabled via jump labels.
>
> Cc: Andy Lutomirski <luto@amacapital.net>
> Cc: H. Peter Anvin <hpa@zytor.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
> Signed-off-by: Arun Sharma <asharma@fb.com>
> Signed-off-by: Chris Mason <clm@fb.com>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  arch/x86/include/asm/vdso.h    |  9 +++++-
>  arch/x86/kernel/tsc.c          | 14 +++++++++
>  arch/x86/vdso/vclock_gettime.c | 69 ++++++++++++++++++++++++++++++++++++++++++
>  arch/x86/vdso/vma.c            | 10 +++++-
>  kernel/sched/core.c            |  3 ++
>  5 files changed, 103 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
> index d4556a3..fcdf611 100644
> --- a/arch/x86/include/asm/vdso.h
> +++ b/arch/x86/include/asm/vdso.h
> @@ -52,14 +52,21 @@ extern void __init init_vdso_image(const struct vdso_image *image);
>  #ifdef CONFIG_VDSO_CS_DETECT
>  struct vdso_percpu_data {
>         u64 last_cs_timestamp;
> +
> +       u64 adj_sched_time;
> +       u64 cyc2ns_offset;
> +       u32 cyc2ns_mul;
>  } ____cacheline_aligned;
>
>  struct vdso_data {
> -       int dummy;
> +       unsigned int thread_cputime_disabled;
>         struct vdso_percpu_data vpercpu[0];
>  };
>  extern struct vdso_data vdso_data;
>
> +struct static_key;
> +extern struct static_key vcpu_thread_cputime_enabled;
> +
>  static inline void vdso_set_cpu_cs_timestamp(int cpu)
>  {
>         rdtscll(vdso_data.vpercpu[cpu].last_cs_timestamp);
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index b7e50bb..dd3b281 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -21,6 +21,7 @@
>  #include <asm/hypervisor.h>
>  #include <asm/nmi.h>
>  #include <asm/x86_init.h>
> +#include <asm/vdso.h>
>
>  unsigned int __read_mostly cpu_khz;    /* TSC clocks / usec, not used here */
>  EXPORT_SYMBOL(cpu_khz);
> @@ -263,6 +264,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>         data->cyc2ns_offset = ns_now -
>                 mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
>
> +#ifdef CONFIG_VDSO_CS_DETECT
> +       vdso_data.vpercpu[cpu].cyc2ns_mul = data->cyc2ns_mul;
> +       vdso_data.vpercpu[cpu].cyc2ns_offset = data->cyc2ns_offset;
> +#endif
> +
>         cyc2ns_write_end(cpu, data);
>
>  done:
> @@ -989,6 +995,10 @@ void mark_tsc_unstable(char *reason)
>                 tsc_unstable = 1;
>                 clear_sched_clock_stable();
>                 disable_sched_clock_irqtime();
> +#ifdef CONFIG_VDSO_CS_DETECT
> +               vdso_data.thread_cputime_disabled = 1;
> +               static_key_slow_dec(&vcpu_thread_cputime_enabled);
> +#endif
>                 pr_info("Marking TSC unstable due to %s\n", reason);
>                 /* Change only the rating, when not registered */
>                 if (clocksource_tsc.mult)
> @@ -1202,6 +1212,10 @@ void __init tsc_init(void)
>
>         tsc_disabled = 0;
>         static_key_slow_inc(&__use_tsc);
> +#ifdef CONFIG_VDSO_CS_DETECT
> +       vdso_data.thread_cputime_disabled = !cpu_has(&boot_cpu_data,
> +                       X86_FEATURE_RDTSCP);
> +#endif
>
>         if (!no_sched_irq_time)
>                 enable_sched_clock_irqtime();
> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index 9793322..0aa39b1 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -17,6 +17,7 @@
>  #include <asm/vvar.h>
>  #include <asm/unistd.h>
>  #include <asm/msr.h>
> +#include <asm/vdso.h>
>  #include <linux/math64.h>
>  #include <linux/time.h>
>
> @@ -289,6 +290,62 @@ notrace static void do_monotonic_coarse(struct timespec *ts)
>         } while (unlikely(gtod_read_retry(gtod, seq)));
>  }
>
> +#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
> +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> +notrace static inline u64 __cycles_2_ns(u64 cyc, u64 scale, u64 offset)
> +{
> +       u64 ns = offset;
> +       ns += mul_u64_u32_shr(cyc, scale, CYC2NS_SCALE_FACTOR);
> +       return ns;
> +}
> +
> +notrace static inline u64 vdso_get_cpu_cs_timestamp(int cpu)
> +{
> +       return VVAR(vdso_data).vpercpu[cpu].last_cs_timestamp;
> +}
> +
> +#define THREAD_CPU_TIME_MAX_LOOP 20
> +notrace static u64 do_thread_cpu_time(void)
> +{
> +       unsigned int p;
> +       u64 tscval;
> +       u64 adj_sched_time;
> +       u64 scale;
> +       u64 offset;
> +       const struct vdso_data *vp = &VVAR(vdso_data);
> +       int cpuo, cpun;
> +       int loop = 0;
> +
> +       do {
> +               loop++;
> +               if (loop > THREAD_CPU_TIME_MAX_LOOP)
> +                       return -1LL;
> +
> +               rdtscpll(tscval, p);
> +               cpuo = p & VGETCPU_CPU_MASK;
> +               adj_sched_time = vp->vpercpu[cpuo].adj_sched_time;
> +               scale = vp->vpercpu[cpuo].cyc2ns_mul;
> +               offset = vp->vpercpu[cpuo].cyc2ns_offset;
> +
> +               cpun = __getcpu() & VGETCPU_CPU_MASK;
> +               /*
> +                * The CPU check guarantees this runs on the same CPU, so no
> +                * barrier required
> +                */
> +       } while (unlikely(cpuo != cpun ||
> +                       tscval <= vdso_get_cpu_cs_timestamp(cpun)));
> +
> +       return __cycles_2_ns(tscval, scale, offset) + adj_sched_time;
> +}
> +
> +notrace static inline void ns_to_ts(u64 ns, struct timespec *ts)
> +{
> +       u32 rem;
> +       ts->tv_sec = div_u64_rem(ns, NSEC_PER_SEC, &rem);
> +       ts->tv_nsec = rem;
> +}
> +#endif
> +
>  notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>  {
>         switch (clock) {
> @@ -306,6 +363,18 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>         case CLOCK_MONOTONIC_COARSE:
>                 do_monotonic_coarse(ts);
>                 break;
> +#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
> +       case CLOCK_THREAD_CPUTIME_ID: {
> +               u64 time;
> +               if (VVAR(vdso_data).thread_cputime_disabled)
> +                       goto fallback;
> +               time = do_thread_cpu_time();
> +               if (time == -1LL)
> +                       goto fallback;
> +               ns_to_ts(time, ts);
> +               break;
> +       }
> +#endif
>         default:
>                 goto fallback;
>         }
> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index 22b1a69..237b3af 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -12,6 +12,8 @@
>  #include <linux/random.h>
>  #include <linux/elf.h>
>  #include <linux/cpu.h>
> +#include <linux/jump_label.h>
> +#include <asm/vsyscall.h>
>  #include <asm/vgtod.h>
>  #include <asm/proto.h>
>  #include <asm/vdso.h>
> @@ -23,7 +25,11 @@
>
>  #if defined(CONFIG_X86_64)
>  unsigned int __read_mostly vdso64_enabled = 1;
> -DEFINE_VVAR(struct vdso_data, vdso_data);
> +
> +DEFINE_VVAR(struct vdso_data, vdso_data) = {
> +       .thread_cputime_disabled = 1,
> +};
> +struct static_key vcpu_thread_cputime_enabled;
>  #endif
>
>  void __init init_vdso_image(const struct vdso_image *image)
> @@ -275,6 +281,8 @@ static int __init init_vdso(void)
>         init_vdso_image(&vdso_image_x32);
>  #endif
>
> +       if (!vdso_data.thread_cputime_disabled)
> +               static_key_slow_inc(&vcpu_thread_cputime_enabled);
>         cpu_notifier_register_begin();
>
>         on_each_cpu(vgetcpu_cpu_init, NULL, 1);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d8e882d..03e6175 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2236,6 +2236,9 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>         int cpu = smp_processor_id();
>
>         vdso_set_cpu_cs_timestamp(cpu);
> +       if (static_key_false(&vcpu_thread_cputime_enabled))
> +               vdso_data.vpercpu[cpu].adj_sched_time =
> +                       current->se.sum_exec_runtime - sched_clock();
>  #endif
>
>         rq->prev_mm = NULL;
> --
> 1.8.1
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-18 23:30   ` Andy Lutomirski
@ 2014-12-19  0:22     ` Andy Lutomirski
  2014-12-19  0:30       ` Shaohua Li
  2014-12-19 11:23       ` Peter Zijlstra
  0 siblings, 2 replies; 29+ messages in thread
From: Andy Lutomirski @ 2014-12-19  0:22 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-kernel@vger.kernel.org, X86 ML, Kernel-team, H. Peter Anvin,
	Ingo Molnar, Peter Zijlstra, John Stultz

On Thu, Dec 18, 2014 at 3:30 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <shli@fb.com> wrote:
>> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
>> use the following method to compute the thread cpu time:
>>
>>     t0 = process start
>>     t1 = most recent context switch time
>>     t2 = time at which the vsyscall is invoked
>>
>>     thread_cpu_time = sum(time slices between t0 to t1) + (t2 - t1)
>>                 = current->se.sum_exec_runtime + now - sched_clock()
>>
>> At context switch time we stash away
>>
>>     adj_sched_time = sum_exec_runtime - sched_clock()
>>
>> in a per-cpu struct in the VVAR page and then compute
>>
>>     thread_cpu_time = adj_sched_time + now
>>
>> All computations are done in nanosecs on systems where TSC is stable. If
>> TSC is unstable, we fallback to a regular syscall.
>>     Benchmark data:
>>
>>     for (i = 0; i < 100000000; i++) {
>>             clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>>             sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
>>     }
>
> A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
> is spent taking various locks, and I think it could be worth adding a
> fast path for the read-my-own-clock case in which we just disable
> preemption and read the thing without any locks.
>
> If we're actually going to go the vdso route, I'd like to make the
> scheduler hooks clean.  Peterz and/or John, what's the right way to
> get an arch-specific callback with sum_exec_runtime and an up to date
> sched_clock value during a context switch?  I'd much rather not add
> yet another rdtsc instruction to the scheduler.

Bad news: this patch is incorrect, I think.  Take a look at
update_rq_clock -- it does fancy things involving irq time and
paravirt steal time.  So this patch could result in extremely
non-monotonic results.

There's CPUCLOCK_VIRT (which, oddly, I see no well-defined API for)
that *might* be a decent approximation.

Thoughts?

--Andy

>
> --Andy
>
>>
>>     Baseline:
>>     real    1m3.428s
>>     user    0m5.190s
>>     sys     0m58.220s
>>
>>     patched:
>>     real    0m4.912s
>>     user    0m4.910s
>>     sys     0m0.000s
>>
>> This should speed up profilers that need to query thread cpu time a lot
>> to do fine-grained timestamps.
>>
>> No statistically significant regression was detected on x86_64 context
>> switch code. Most archs that don't support vsyscalls will have this code
>> disabled via jump labels.
>>
>> Cc: Andy Lutomirski <luto@amacapital.net>
>> Cc: H. Peter Anvin <hpa@zytor.com>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
>> Signed-off-by: Arun Sharma <asharma@fb.com>
>> Signed-off-by: Chris Mason <clm@fb.com>
>> Signed-off-by: Shaohua Li <shli@fb.com>
>> ---
>>  arch/x86/include/asm/vdso.h    |  9 +++++-
>>  arch/x86/kernel/tsc.c          | 14 +++++++++
>>  arch/x86/vdso/vclock_gettime.c | 69 ++++++++++++++++++++++++++++++++++++++++++
>>  arch/x86/vdso/vma.c            | 10 +++++-
>>  kernel/sched/core.c            |  3 ++
>>  5 files changed, 103 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
>> index d4556a3..fcdf611 100644
>> --- a/arch/x86/include/asm/vdso.h
>> +++ b/arch/x86/include/asm/vdso.h
>> @@ -52,14 +52,21 @@ extern void __init init_vdso_image(const struct vdso_image *image);
>>  #ifdef CONFIG_VDSO_CS_DETECT
>>  struct vdso_percpu_data {
>>         u64 last_cs_timestamp;
>> +
>> +       u64 adj_sched_time;
>> +       u64 cyc2ns_offset;
>> +       u32 cyc2ns_mul;
>>  } ____cacheline_aligned;
>>
>>  struct vdso_data {
>> -       int dummy;
>> +       unsigned int thread_cputime_disabled;
>>         struct vdso_percpu_data vpercpu[0];
>>  };
>>  extern struct vdso_data vdso_data;
>>
>> +struct static_key;
>> +extern struct static_key vcpu_thread_cputime_enabled;
>> +
>>  static inline void vdso_set_cpu_cs_timestamp(int cpu)
>>  {
>>         rdtscll(vdso_data.vpercpu[cpu].last_cs_timestamp);
>> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
>> index b7e50bb..dd3b281 100644
>> --- a/arch/x86/kernel/tsc.c
>> +++ b/arch/x86/kernel/tsc.c
>> @@ -21,6 +21,7 @@
>>  #include <asm/hypervisor.h>
>>  #include <asm/nmi.h>
>>  #include <asm/x86_init.h>
>> +#include <asm/vdso.h>
>>
>>  unsigned int __read_mostly cpu_khz;    /* TSC clocks / usec, not used here */
>>  EXPORT_SYMBOL(cpu_khz);
>> @@ -263,6 +264,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>>         data->cyc2ns_offset = ns_now -
>>                 mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
>>
>> +#ifdef CONFIG_VDSO_CS_DETECT
>> +       vdso_data.vpercpu[cpu].cyc2ns_mul = data->cyc2ns_mul;
>> +       vdso_data.vpercpu[cpu].cyc2ns_offset = data->cyc2ns_offset;
>> +#endif
>> +
>>         cyc2ns_write_end(cpu, data);
>>
>>  done:
>> @@ -989,6 +995,10 @@ void mark_tsc_unstable(char *reason)
>>                 tsc_unstable = 1;
>>                 clear_sched_clock_stable();
>>                 disable_sched_clock_irqtime();
>> +#ifdef CONFIG_VDSO_CS_DETECT
>> +               vdso_data.thread_cputime_disabled = 1;
>> +               static_key_slow_dec(&vcpu_thread_cputime_enabled);
>> +#endif
>>                 pr_info("Marking TSC unstable due to %s\n", reason);
>>                 /* Change only the rating, when not registered */
>>                 if (clocksource_tsc.mult)
>> @@ -1202,6 +1212,10 @@ void __init tsc_init(void)
>>
>>         tsc_disabled = 0;
>>         static_key_slow_inc(&__use_tsc);
>> +#ifdef CONFIG_VDSO_CS_DETECT
>> +       vdso_data.thread_cputime_disabled = !cpu_has(&boot_cpu_data,
>> +                       X86_FEATURE_RDTSCP);
>> +#endif
>>
>>         if (!no_sched_irq_time)
>>                 enable_sched_clock_irqtime();
>> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
>> index 9793322..0aa39b1 100644
>> --- a/arch/x86/vdso/vclock_gettime.c
>> +++ b/arch/x86/vdso/vclock_gettime.c
>> @@ -17,6 +17,7 @@
>>  #include <asm/vvar.h>
>>  #include <asm/unistd.h>
>>  #include <asm/msr.h>
>> +#include <asm/vdso.h>
>>  #include <linux/math64.h>
>>  #include <linux/time.h>
>>
>> @@ -289,6 +290,62 @@ notrace static void do_monotonic_coarse(struct timespec *ts)
>>         } while (unlikely(gtod_read_retry(gtod, seq)));
>>  }
>>
>> +#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
>> +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
>> +notrace static inline u64 __cycles_2_ns(u64 cyc, u64 scale, u64 offset)
>> +{
>> +       u64 ns = offset;
>> +       ns += mul_u64_u32_shr(cyc, scale, CYC2NS_SCALE_FACTOR);
>> +       return ns;
>> +}
>> +
>> +notrace static inline u64 vdso_get_cpu_cs_timestamp(int cpu)
>> +{
>> +       return VVAR(vdso_data).vpercpu[cpu].last_cs_timestamp;
>> +}
>> +
>> +#define THREAD_CPU_TIME_MAX_LOOP 20
>> +notrace static u64 do_thread_cpu_time(void)
>> +{
>> +       unsigned int p;
>> +       u64 tscval;
>> +       u64 adj_sched_time;
>> +       u64 scale;
>> +       u64 offset;
>> +       const struct vdso_data *vp = &VVAR(vdso_data);
>> +       int cpuo, cpun;
>> +       int loop = 0;
>> +
>> +       do {
>> +               loop++;
>> +               if (loop > THREAD_CPU_TIME_MAX_LOOP)
>> +                       return -1LL;
>> +
>> +               rdtscpll(tscval, p);
>> +               cpuo = p & VGETCPU_CPU_MASK;
>> +               adj_sched_time = vp->vpercpu[cpuo].adj_sched_time;
>> +               scale = vp->vpercpu[cpuo].cyc2ns_mul;
>> +               offset = vp->vpercpu[cpuo].cyc2ns_offset;
>> +
>> +               cpun = __getcpu() & VGETCPU_CPU_MASK;
>> +               /*
> >> +                * The CPU check guarantees this runs on the same CPU, so no
>> +                * barrier required
>> +                */
>> +       } while (unlikely(cpuo != cpun ||
>> +                       tscval <= vdso_get_cpu_cs_timestamp(cpun)));
>> +
>> +       return __cycles_2_ns(tscval, scale, offset) + adj_sched_time;
>> +}
>> +
>> +notrace static inline void ns_to_ts(u64 ns, struct timespec *ts)
>> +{
>> +       u32 rem;
>> +       ts->tv_sec = div_u64_rem(ns, NSEC_PER_SEC, &rem);
>> +       ts->tv_nsec = rem;
>> +}
>> +#endif
>> +
>>  notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>>  {
>>         switch (clock) {
>> @@ -306,6 +363,18 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>>         case CLOCK_MONOTONIC_COARSE:
>>                 do_monotonic_coarse(ts);
>>                 break;
>> +#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
>> +       case CLOCK_THREAD_CPUTIME_ID: {
>> +               u64 time;
>> +               if (VVAR(vdso_data).thread_cputime_disabled)
>> +                       goto fallback;
>> +               time = do_thread_cpu_time();
>> +               if (time == -1LL)
>> +                       goto fallback;
>> +               ns_to_ts(time, ts);
>> +               break;
>> +       }
>> +#endif
>>         default:
>>                 goto fallback;
>>         }
>> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
>> index 22b1a69..237b3af 100644
>> --- a/arch/x86/vdso/vma.c
>> +++ b/arch/x86/vdso/vma.c
>> @@ -12,6 +12,8 @@
>>  #include <linux/random.h>
>>  #include <linux/elf.h>
>>  #include <linux/cpu.h>
>> +#include <linux/jump_label.h>
>> +#include <asm/vsyscall.h>
>>  #include <asm/vgtod.h>
>>  #include <asm/proto.h>
>>  #include <asm/vdso.h>
>> @@ -23,7 +25,11 @@
>>
>>  #if defined(CONFIG_X86_64)
>>  unsigned int __read_mostly vdso64_enabled = 1;
>> -DEFINE_VVAR(struct vdso_data, vdso_data);
>> +
>> +DEFINE_VVAR(struct vdso_data, vdso_data) = {
>> +       .thread_cputime_disabled = 1,
>> +};
>> +struct static_key vcpu_thread_cputime_enabled;
>>  #endif
>>
>>  void __init init_vdso_image(const struct vdso_image *image)
>> @@ -275,6 +281,8 @@ static int __init init_vdso(void)
>>         init_vdso_image(&vdso_image_x32);
>>  #endif
>>
>> +       if (!vdso_data.thread_cputime_disabled)
>> +               static_key_slow_inc(&vcpu_thread_cputime_enabled);
>>         cpu_notifier_register_begin();
>>
>>         on_each_cpu(vgetcpu_cpu_init, NULL, 1);
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index d8e882d..03e6175 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -2236,6 +2236,9 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>>         int cpu = smp_processor_id();
>>
>>         vdso_set_cpu_cs_timestamp(cpu);
>> +       if (static_key_false(&vcpu_thread_cputime_enabled))
>> +               vdso_data.vpercpu[cpu].adj_sched_time =
>> +                       current->se.sum_exec_runtime - sched_clock();
>>  #endif
>>
>>         rq->prev_mm = NULL;
>> --
>> 1.8.1
>>
>
>
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19  0:22     ` Andy Lutomirski
@ 2014-12-19  0:30       ` Shaohua Li
  2014-12-19  0:32         ` Andy Lutomirski
  2014-12-19  0:34         ` Thomas Gleixner
  2014-12-19 11:23       ` Peter Zijlstra
  1 sibling, 2 replies; 29+ messages in thread
From: Shaohua Li @ 2014-12-19  0:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-kernel@vger.kernel.org, X86 ML, Kernel-team, H. Peter Anvin,
	Ingo Molnar, Peter Zijlstra, John Stultz

On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> On Thu, Dec 18, 2014 at 3:30 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <shli@fb.com> wrote:
> >> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
> >> use the following method to compute the thread cpu time:
> >>
> >>     t0 = process start
> >>     t1 = most recent context switch time
> >>     t2 = time at which the vsyscall is invoked
> >>
> >>     thread_cpu_time = sum(time slices between t0 to t1) + (t2 - t1)
> >>                 = current->se.sum_exec_runtime + now - sched_clock()
> >>
> >> At context switch time we stash away
> >>
> >>     adj_sched_time = sum_exec_runtime - sched_clock()
> >>
> >> in a per-cpu struct in the VVAR page and then compute
> >>
> >>     thread_cpu_time = adj_sched_time + now
> >>
> >> All computations are done in nanosecs on systems where TSC is stable. If
> >> TSC is unstable, we fallback to a regular syscall.
> >>     Benchmark data:
> >>
> >>     for (i = 0; i < 100000000; i++) {
> >>             clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
> >>             sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
> >>     }
> >
> > A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
> > is spent taking various locks, and I think it could be worth adding a
> > fast path for the read-my-own-clock case in which we just disable
> > preemption and read the thing without any locks.
> >
> > If we're actually going to go the vdso route, I'd like to make the
> > scheduler hooks clean.  Peterz and/or John, what's the right way to
> > get an arch-specific callback with sum_exec_runtime and an up to date
> > sched_clock value during a context switch?  I'd much rather not add
> > yet another rdtsc instruction to the scheduler.
> 
> Bad news: this patch is incorrect, I think.  Take a look at
> update_rq_clock -- it does fancy things involving irq time and
> paravirt steal time.  So this patch could result in extremely
> non-monotonic results.

Yes, it's not precise. But bear in mind, CONFIG_IRQ_TIME_ACCOUNTING is an
optional feature. Actually it was added not long ago. I thought it's
acceptable that the time isn't precise, just like what we had before the
feature was added.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19  0:30       ` Shaohua Li
@ 2014-12-19  0:32         ` Andy Lutomirski
  2014-12-19  0:34         ` Thomas Gleixner
  1 sibling, 0 replies; 29+ messages in thread
From: Andy Lutomirski @ 2014-12-19  0:32 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-kernel@vger.kernel.org, X86 ML, Kernel-team, H. Peter Anvin,
	Ingo Molnar, Peter Zijlstra, John Stultz

On Thu, Dec 18, 2014 at 4:30 PM, Shaohua Li <shli@fb.com> wrote:
> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>> On Thu, Dec 18, 2014 at 3:30 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> > On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <shli@fb.com> wrote:
>> >> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
>> >> use the following method to compute the thread cpu time:
>> >>
>> >>     t0 = process start
>> >>     t1 = most recent context switch time
>> >>     t2 = time at which the vsyscall is invoked
>> >>
>> >>     thread_cpu_time = sum(time slices between t0 to t1) + (t2 - t1)
>> >>                 = current->se.sum_exec_runtime + now - sched_clock()
>> >>
>> >> At context switch time we stash away
>> >>
>> >>     adj_sched_time = sum_exec_runtime - sched_clock()
>> >>
>> >> in a per-cpu struct in the VVAR page and then compute
>> >>
>> >>     thread_cpu_time = adj_sched_time + now
>> >>
>> >> All computations are done in nanosecs on systems where TSC is stable. If
>> >> TSC is unstable, we fallback to a regular syscall.
>> >>     Benchmark data:
>> >>
>> >>     for (i = 0; i < 100000000; i++) {
>> >>             clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>> >>             sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
>> >>     }
>> >
>> > A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
>> > is spent taking various locks, and I think it could be worth adding a
>> > fast path for the read-my-own-clock case in which we just disable
>> > preemption and read the thing without any locks.
>> >
>> > If we're actually going to go the vdso route, I'd like to make the
>> > scheduler hooks clean.  Peterz and/or John, what's the right way to
>> > get an arch-specific callback with sum_exec_runtime and an up to date
>> > sched_clock value during a context switch?  I'd much rather not add
>> > yet another rdtsc instruction to the scheduler.
>>
>> Bad news: this patch is incorrect, I think.  Take a look at
>> update_rq_clock -- it does fancy things involving irq time and
>> paravirt steal time.  So this patch could result in extremely
>> non-monotonic results.
>
> Yes, it's not precise. But bear in mind, CONFIG_IRQ_TIME_ACCOUNTING is an
> optional feature. Actually it was added not long ago. I thought it's
> acceptable that the time isn't precise, just like what we had before the
> feature was added.
>

Nonetheless, I think that the vdso accelerated functions should be
careful to remain interchangeable with the syscall equivalents.  If
that means that some kconfig magic needs to be added to prevent this
code from being enabled when it won't work, then so be it.  But it
might be better to use a different clock id entirely, and I don't
really understand the logic behind all the clock ids.

John?

--Andy

> Thanks,
> Shaohua



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19  0:30       ` Shaohua Li
  2014-12-19  0:32         ` Andy Lutomirski
@ 2014-12-19  0:34         ` Thomas Gleixner
  1 sibling, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2014-12-19  0:34 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, Peter Zijlstra,
	John Stultz

On Thu, 18 Dec 2014, Shaohua Li wrote:
> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> > Bad news: this patch is incorrect, I think.  Take a look at
> > update_rq_clock -- it does fancy things involving irq time and
> > paravirt steal time.  So this patch could result in extremely
> > non-monotonic results.
> 
> Yes, it's not precise. But bear in mind, CONFIG_IRQ_TIME_ACCOUNTING is an
> optional feature. Actually it was added not long ago. I thought it's
> acceptable that the time isn't precise, just like what we had before the
> feature was added.

Wrong thought, really.

If the vdso can give you observable inconsistent representation
vs. real syscalls then it does not matter at all how long ago the
feature was added. It's simply wrong no matter what.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch
  2014-12-17 23:12 ` [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch Shaohua Li
@ 2014-12-19  1:05   ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2014-12-19  1:05 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-kernel, x86, Kernel-team, Andy Lutomirski, H. Peter Anvin,
	Ingo Molnar

On Wed, 17 Dec 2014, Shaohua Li wrote:
> vdso code can't disable preempt, so it can be preempted at any time.
> This makes it challenging to implement certain features. This patch adds a
> generic API to let vdso code detect a context switch.

Please change this to:

 This adds a complete trainwreck into the scheduler hotpath.
 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b5797b7..d8e882d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2232,6 +2232,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>  	struct rq *rq = this_rq();
>  	struct mm_struct *mm = rq->prev_mm;
>  	long prev_state;
> +#ifdef CONFIG_VDSO_CS_DETECT

Ever heard about header files and CONFIG dependent inline functions?

> +	int cpu = smp_processor_id();

If you think hard enough the you might find an even more expensive way
to figure out on which cpu you are running on.

> +	vdso_set_cpu_cs_timestamp(cpu);

So we hand in the cpu we are running on to update percpu data in non
preemptible context instead of handing in the really relevant data,
i.e. the timestamp which is readily available right here for free. I
bet the completely misnomed function will go through loops and hoops
to figure that out.

Thanks,

	tglx




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19  0:22     ` Andy Lutomirski
  2014-12-19  0:30       ` Shaohua Li
@ 2014-12-19 11:23       ` Peter Zijlstra
  2014-12-19 16:48         ` Andy Lutomirski
  1 sibling, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2014-12-19 11:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Shaohua Li, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> Bad news: this patch is incorrect, I think.  Take a look at
> update_rq_clock -- it does fancy things involving irq time and
> paravirt steal time.  So this patch could result in extremely
> non-monotonic results.

Yeah, I'm not sure how (and if) we could make all that work :/

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 11:23       ` Peter Zijlstra
@ 2014-12-19 16:48         ` Andy Lutomirski
  2014-12-19 17:03           ` Peter Zijlstra
  2014-12-19 17:42           ` Chris Mason
  0 siblings, 2 replies; 29+ messages in thread
From: Andy Lutomirski @ 2014-12-19 16:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shaohua Li, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>> Bad news: this patch is incorrect, I think.  Take a look at
>> update_rq_clock -- it does fancy things involving irq time and
>> paravirt steal time.  So this patch could result in extremely
>> non-monotonic results.
>
> Yeah, I'm not sure how (and if) we could make all that work :/

I obviously can't comment on what Facebook needs, but if I were
rigging something up to profile my own code*, I'd want a count of
elapsed time, including user, system, and probably interrupt as well.
I would probably not want to count time during which I'm not
scheduled, and I would also probably not want to count steal time.
The latter makes any implementation kind of nasty.

The API presumably doesn't need to be any particular clock id for
clock_gettime, and it may not even need to be clock_gettime at all.

Is perf self-monitoring good enough for this?  If not, can we make it
good enough?

* I do this today using CLOCK_MONOTONIC

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 16:48         ` Andy Lutomirski
@ 2014-12-19 17:03           ` Peter Zijlstra
  2014-12-19 17:07             ` Andy Lutomirski
  2015-01-02  2:59             ` Shaohua Li
  2014-12-19 17:42           ` Chris Mason
  1 sibling, 2 replies; 29+ messages in thread
From: Peter Zijlstra @ 2014-12-19 17:03 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Shaohua Li, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 08:48:07AM -0800, Andy Lutomirski wrote:
> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> >> Bad news: this patch is incorrect, I think.  Take a look at
> >> update_rq_clock -- it does fancy things involving irq time and
> >> paravirt steal time.  So this patch could result in extremely
> >> non-monotonic results.
> >
> > Yeah, I'm not sure how (and if) we could make all that work :/
> 
> I obviously can't comment on what Facebook needs, but if I were
> rigging something up to profile my own code*, I'd want a count of
> elapsed time, including user, system, and probably interrupt as well.
> I would probably not want to count time during which I'm not
> scheduled, and I would also probably not want to count steal time.
> The latter makes any implementation kind of nasty.
> 
> The API presumably doesn't need to be any particular clock id for
> clock_gettime, and it may not even need to be clock_gettime at all.
> 
> Is perf self-monitoring good enough for this?  If not, can we make it
> good enough?

Yeah, I think you should be able to use that. You could count a NOP
event and simply use its activated time. We have PERF_COUNT_SW_DUMMY for
such purposes iirc.

The advantage of using perf self profiling is that it (obviously)
extends to more than just walltime.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 17:03           ` Peter Zijlstra
@ 2014-12-19 17:07             ` Andy Lutomirski
  2014-12-19 17:27               ` Peter Zijlstra
  2015-01-02  2:59             ` Shaohua Li
  1 sibling, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2014-12-19 17:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shaohua Li, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 9:03 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Dec 19, 2014 at 08:48:07AM -0800, Andy Lutomirski wrote:
>> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> > On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>> >> Bad news: this patch is incorrect, I think.  Take a look at
>> >> update_rq_clock -- it does fancy things involving irq time and
>> >> paravirt steal time.  So this patch could result in extremely
>> >> non-monotonic results.
>> >
>> > Yeah, I'm not sure how (and if) we could make all that work :/
>>
>> I obviously can't comment on what Facebook needs, but if I were
>> rigging something up to profile my own code*, I'd want a count of
>> elapsed time, including user, system, and probably interrupt as well.
>> I would probably not want to count time during which I'm not
>> scheduled, and I would also probably not want to count steal time.
>> The latter makes any implementation kind of nasty.
>>
>> The API presumably doesn't need to be any particular clock id for
>> clock_gettime, and it may not even need to be clock_gettime at all.
>>
>> Is perf self-monitoring good enough for this?  If not, can we make it
>> good enough?
>
> Yeah, I think you should be able to use that. You could count a NOP
> event and simply use its activated time. We have PERF_COUNT_SW_DUMMY for
> such purposes iirc.
>
> The advantage of using perf self profiling is that it (obviously)
> extends to more than just walltime.

Re-asking my old question: would it make sense to add a vdso helper
for the magic self-monitoring interface?  Or, at the very least, we
could try to tidy up the docs a bit.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 17:07             ` Andy Lutomirski
@ 2014-12-19 17:27               ` Peter Zijlstra
  2014-12-19 17:42                 ` Andy Lutomirski
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2014-12-19 17:27 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Shaohua Li, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 09:07:49AM -0800, Andy Lutomirski wrote:
> Re-asking my old question: would it make sense to add a vdso helper
> for the magic self-monitoring interface?  Or, at the very least, we
> could try to tidy up the docs a bit.

I find it really helps (performance wise) to strip down that magic to
the bare minimum required.

A VDSO helper would always have to do everything.  I suppose we could
provide a generic helper, one can always hand code the stuff anyhow.

Then again, what is the benefit of having it in the VDSO as opposed to a
regular DSO?

Updating the docs is always good.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 17:27               ` Peter Zijlstra
@ 2014-12-19 17:42                 ` Andy Lutomirski
  0 siblings, 0 replies; 29+ messages in thread
From: Andy Lutomirski @ 2014-12-19 17:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shaohua Li, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 9:27 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Dec 19, 2014 at 09:07:49AM -0800, Andy Lutomirski wrote:
>> Re-asking my old question: would it make sense to add a vdso helper
>> for the magic self-monitoring interface?  Or, at the very least, we
>> could try to tidy up the docs a bit.
>
> I find it really helps (performance wise) to strip down that magic to
> the bare minimum required.
>
> A VDSO helper would always have to do everything.  I suppose we could
> provide a generic helper, one can always hand code the stuff anyhow.
>
> Then again, what is the benefit of having it in the VDSO as opposed to a
> regular DSO?

The benefit of using the VDSO is that it means that we can change the
data structure whenever we want.  The __vdso_clock_gettime data
structure changes on a somewhat regular basis.

Other than that, a regular DSO is somewhat easier.

>
> Updating the docs is always good.

:)

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 16:48         ` Andy Lutomirski
  2014-12-19 17:03           ` Peter Zijlstra
@ 2014-12-19 17:42           ` Chris Mason
  2014-12-19 17:53             ` Andy Lutomirski
  1 sibling, 1 reply; 29+ messages in thread
From: Chris Mason @ 2014-12-19 17:42 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Shaohua Li, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz



On Fri, Dec 19, 2014 at 11:48 AM, Andy Lutomirski <luto@amacapital.net> 
wrote:
> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra 
> <peterz@infradead.org> wrote:
>>  On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>>>  Bad news: this patch is incorrect, I think.  Take a look at
>>>  update_rq_clock -- it does fancy things involving irq time and
>>>  paravirt steal time.  So this patch could result in extremely
>>>  non-monotonic results.
>> 
>>  Yeah, I'm not sure how (and if) we could make all that work :/
> 
> I obviously can't comment on what Facebook needs, but if I were
> rigging something up to profile my own code*, I'd want a count of
> elapsed time, including user, system, and probably interrupt as well.
> I would probably not want to count time during which I'm not
> scheduled, and I would also probably not want to count steal time.
> The latter makes any implementation kind of nasty.
> 
> The API presumably doesn't need to be any particular clock id for
> clock_gettime, and it may not even need to be clock_gettime at all.
> 
> Is perf self-monitoring good enough for this?  If not, can we make it
> good enough?
> 
> * I do this today using CLOCK_MONOTONIC

The clock_gettime calls are used for a wide variety of things, but 
usually they are trying to instrument how much CPU the application is 
using.  So for example with the HHVM interpreter they have a ratio of 
the number of hhvm instructions they were able to execute in N seconds 
of cputime.  This gets used to optimize the HHVM implementation and can 
be used as a push blocking counter (code can't go in if it makes it 
slower).

Wall time isn't a great representation of this because it includes 
factors that might be outside a given HHVM patch, but it sounds like 
we're saying almost the same thing.

I'm not familiar with the perf self monitoring?

-chris





^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 17:42           ` Chris Mason
@ 2014-12-19 17:53             ` Andy Lutomirski
  2014-12-19 18:16               ` Shaohua Li
  0 siblings, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2014-12-19 17:53 UTC (permalink / raw)
  To: Chris Mason
  Cc: Peter Zijlstra, Shaohua Li, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 9:42 AM, Chris Mason <clm@fb.com> wrote:
>
>
> On Fri, Dec 19, 2014 at 11:48 AM, Andy Lutomirski <luto@amacapital.net>
> wrote:
>>
>> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org>
>> wrote:
>>>
>>>  On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>>>>
>>>>  Bad news: this patch is incorrect, I think.  Take a look at
>>>>  update_rq_clock -- it does fancy things involving irq time and
>>>>  paravirt steal time.  So this patch could result in extremely
>>>>  non-monotonic results.
>>>
>>>
>>>  Yeah, I'm not sure how (and if) we could make all that work :/
>>
>>
>> I obviously can't comment on what Facebook needs, but if I were
>> rigging something up to profile my own code*, I'd want a count of
>> elapsed time, including user, system, and probably interrupt as well.
>> I would probably not want to count time during which I'm not
>> scheduled, and I would also probably not want to count steal time.
>> The latter makes any implementation kind of nasty.
>>
>> The API presumably doesn't need to be any particular clock id for
>> clock_gettime, and it may not even need to be clock_gettime at all.
>>
>> Is perf self-monitoring good enough for this?  If not, can we make it
>> good enough?
>>
>> * I do this today using CLOCK_MONOTONIC
>
>
> The clock_gettime calls are used for a wide variety of things, but usually
> they are trying to instrument how much CPU the application is using.  So for
> example with the HHVM interpreter they have a ratio of the number of hhvm
> instructions they were able to execute in N seconds of cputime.  This gets
> used to optimize the HHVM implementation and can be used as a push blocking
> counter (code can't go in if it makes it slower).
>
> Wall time isn't a great representation of this because it includes factors
> that might be outside a given HHVM patch, but it sounds like we're saying
> almost the same thing.
>
> I'm not familiar with the perf self monitoring?

You can call perf_event_open and mmap the result.  Then you can read
the docs^Wheader file.

On the good side, it's an explicit mmap, so all the nasty preemption
issues are entirely moot.  And you can count cache misses and such if
you want to be fancy.

On the bad side, the docs are a bit weak, and the added context switch
overhead might be higher.

--Andy

>
> -chris
>
>
>
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 17:53             ` Andy Lutomirski
@ 2014-12-19 18:16               ` Shaohua Li
  0 siblings, 0 replies; 29+ messages in thread
From: Shaohua Li @ 2014-12-19 18:16 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Chris Mason, Peter Zijlstra, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 09:53:24AM -0800, Andy Lutomirski wrote:
> On Fri, Dec 19, 2014 at 9:42 AM, Chris Mason <clm@fb.com> wrote:
> >
> >
> > On Fri, Dec 19, 2014 at 11:48 AM, Andy Lutomirski <luto@amacapital.net>
> > wrote:
> >>
> >> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org>
> >> wrote:
> >>>
> >>>  On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> >>>>
> >>>>  Bad news: this patch is incorrect, I think.  Take a look at
> >>>>  update_rq_clock -- it does fancy things involving irq time and
> >>>>  paravirt steal time.  So this patch could result in extremely
> >>>>  non-monotonic results.
> >>>
> >>>
> >>>  Yeah, I'm not sure how (and if) we could make all that work :/
> >>
> >>
> >> I obviously can't comment on what Facebook needs, but if I were
> >> rigging something up to profile my own code*, I'd want a count of
> >> elapsed time, including user, system, and probably interrupt as well.
> >> I would probably not want to count time during which I'm not
> >> scheduled, and I would also probably not want to count steal time.
> >> The latter makes any implementation kind of nasty.
> >>
> >> The API presumably doesn't need to be any particular clock id for
> >> clock_gettime, and it may not even need to be clock_gettime at all.
> >>
> >> Is perf self-monitoring good enough for this?  If not, can we make it
> >> good enough?
> >>
> >> * I do this today using CLOCK_MONOTONIC
> >
> >
> > The clock_gettime calls are used for a wide variety of things, but usually
> > they are trying to instrument how much CPU the application is using.  So for
> > example with the HHVM interpreter they have a ratio of the number of hhvm
> > instructions they were able to execute in N seconds of cputime.  This gets
> > used to optimize the HHVM implementation and can be used as a push blocking
> > counter (code can't go in if it makes it slower).
> >
> > Wall time isn't a great representation of this because it includes factors
> > that might be outside a given HHVM patch, but it sounds like we're saying
> > almost the same thing.
> >
> > I'm not familiar with the perf self monitoring?
> 
> You can call perf_event_open and mmap the result.  Then you can read
> the docs^Wheader file.
> 
> On the god side, it's an explicit mmap, so all the nasty preemption
> issues are entirely moot.  And you can count cache misses and such if
> you want to be fancy.
> 
> On the bad side, the docs are a bit weak, and the added context switch
> overhead might be higher.

I'll measure the overhead for sure. If the overhead isn't high, the perf
approach is very interesting. On the other hand, is it acceptable that
clock_gettime falls back to the slow path if irq time accounting is
enabled (its overhead is high; we don't actually enable it)?

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2014-12-19 17:03           ` Peter Zijlstra
  2014-12-19 17:07             ` Andy Lutomirski
@ 2015-01-02  2:59             ` Shaohua Li
  2015-01-02 15:31               ` David Ahern
  2015-01-02 17:47               ` Andy Lutomirski
  1 sibling, 2 replies; 29+ messages in thread
From: Shaohua Li @ 2015-01-02  2:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Dec 19, 2014 at 06:03:34PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 19, 2014 at 08:48:07AM -0800, Andy Lutomirski wrote:
> > On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> > >> Bad news: this patch is incorrect, I think.  Take a look at
> > >> update_rq_clock -- it does fancy things involving irq time and
> > >> paravirt steal time.  So this patch could result in extremely
> > >> non-monotonic results.
> > >
> > > Yeah, I'm not sure how (and if) we could make all that work :/
> > 
> > I obviously can't comment on what Facebook needs, but if I were
> > rigging something up to profile my own code*, I'd want a count of
> > elapsed time, including user, system, and probably interrupt as well.
> > I would probably not want to count time during which I'm not
> > scheduled, and I would also probably not want to count steal time.
> > The latter makes any implementation kind of nasty.
> > 
> > The API presumably doesn't need to be any particular clock id for
> > clock_gettime, and it may not even need to be clock_gettime at all.
> > 
> > Is perf self-monitoring good enough for this?  If not, can we make it
> > good enough?
> 
> Yeah, I think you should be able to use that. You could count a NOP
> event and simply use its activated time. We have PERF_COUNT_SW_DUMMY for
> such purposes iirc.
> 
> The advantage of using perf self profiling is that it (obviously)
> extends to more than just walltime.

Hi Peter & Andy,
I'm wondering how we could use perf to implement a clock_gettime.
Reading the perf fd or using ioctl is slow, so reading the mmap
ringbuffer is the only option. But as far as I know, the ringbuffer has
data only when an event is generated. Between two events, there is
nothing we can read from the ringbuffer. How can an application get
time info in that interval?

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-02  2:59             ` Shaohua Li
@ 2015-01-02 15:31               ` David Ahern
  2015-01-02 17:02                 ` Shaohua Li
  2015-01-02 17:47               ` Andy Lutomirski
  1 sibling, 1 reply; 29+ messages in thread
From: David Ahern @ 2015-01-02 15:31 UTC (permalink / raw)
  To: Shaohua Li, Peter Zijlstra
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On 1/1/15 7:59 PM, Shaohua Li wrote:
> I'm wondering how we could use the perf to implament a clock_gettime.
> reading the perf fd or using ioctl is slow so reading the mmap
> ringbuffer is the only option. But as far as I know the ringbuffer has
> data only when an event is generated. Between two events, there is
> nothing we can read from the ringbuffer. Then how can application get
> time info in the interval?

Are you wanting to read perf_clock from userspace?

David

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-02 15:31               ` David Ahern
@ 2015-01-02 17:02                 ` Shaohua Li
  2015-01-02 17:09                   ` David Ahern
  0 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2015-01-02 17:02 UTC (permalink / raw)
  To: David Ahern
  Cc: Peter Zijlstra, Andy Lutomirski, linux-kernel@vger.kernel.org,
	X86 ML, Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Jan 02, 2015 at 08:31:33AM -0700, David Ahern wrote:
> On 1/1/15 7:59 PM, Shaohua Li wrote:
> >I'm wondering how we could use the perf to implament a clock_gettime.
> >reading the perf fd or using ioctl is slow so reading the mmap
> >ringbuffer is the only option. But as far as I know the ringbuffer has
> >data only when an event is generated. Between two events, there is
> >nothing we can read from the ringbuffer. Then how can application get
> >time info in the interval?
> 
> Are you wanting to read perf_clock from userspace?

Yep, in some form. Basically I want to read the time a task
runs. Peter suggests we can read the activation time of a perf event.
But I don't want to use any system call, as it's slow, just like
clock_gettime.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-02 17:02                 ` Shaohua Li
@ 2015-01-02 17:09                   ` David Ahern
  2015-01-02 17:17                     ` Shaohua Li
  0 siblings, 1 reply; 29+ messages in thread
From: David Ahern @ 2015-01-02 17:09 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Peter Zijlstra, Andy Lutomirski, linux-kernel@vger.kernel.org,
	X86 ML, Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On 1/2/15 10:02 AM, Shaohua Li wrote:
> On Fri, Jan 02, 2015 at 08:31:33AM -0700, David Ahern wrote:
>> On 1/1/15 7:59 PM, Shaohua Li wrote:
>>> I'm wondering how we could use the perf to implament a clock_gettime.
>>> reading the perf fd or using ioctl is slow so reading the mmap
>>> ringbuffer is the only option. But as far as I know the ringbuffer has
>>> data only when an event is generated. Between two events, there is
>>> nothing we can read from the ringbuffer. Then how can application get
>>> time info in the interval?
>>
>> Are you wanting to read perf_clock from userspace?
>
> Yep, in some sort of form. Basically I want to read the time a task
> runs. Peter suggests we can read the activation time of a perf event.
> But I don't want to use any system call, as it's slow and likes
> clock_gettime.

Since we cannot get the capability committed upstream a number of folks 
are using this method:

     https://github.com/dsahern/linux/blob/perf-full-monty/README.ahern

ie., a KLM exports perf_clock and apps can use:

#define CLOCK_PERF 14
if (clock_gettime(CLOCK_PERF, &ts) != 0) {
}

No vdso acceleration, but works with an unmodified kernel.

David

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-02 17:09                   ` David Ahern
@ 2015-01-02 17:17                     ` Shaohua Li
  2015-01-02 17:26                       ` David Ahern
  0 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2015-01-02 17:17 UTC (permalink / raw)
  To: David Ahern
  Cc: Peter Zijlstra, Andy Lutomirski, linux-kernel@vger.kernel.org,
	X86 ML, Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Jan 02, 2015 at 10:09:10AM -0700, David Ahern wrote:
> On 1/2/15 10:02 AM, Shaohua Li wrote:
> >On Fri, Jan 02, 2015 at 08:31:33AM -0700, David Ahern wrote:
> >>On 1/1/15 7:59 PM, Shaohua Li wrote:
> >>>I'm wondering how we could use the perf to implament a clock_gettime.
> >>>reading the perf fd or using ioctl is slow so reading the mmap
> >>>ringbuffer is the only option. But as far as I know the ringbuffer has
> >>>data only when an event is generated. Between two events, there is
> >>>nothing we can read from the ringbuffer. Then how can application get
> >>>time info in the interval?
> >>
> >>Are you wanting to read perf_clock from userspace?
> >
> >Yep, in some sort of form. Basically I want to read the time a task
> >runs. Peter suggests we can read the activation time of a perf event.
> >But I don't want to use any system call, as it's slow and likes
> >clock_gettime.
> 
> Since we cannot get the capability committed upstream a number of
> folks are using this method:
> 
>     https://github.com/dsahern/linux/blob/perf-full-monty/README.ahern
> 
> ie., a KLM exports perf_clock and apps can use:
> 
> #define CLOCK_PERF 14
> if (clock_gettime(CLOCK_PERF, &ts) != 0) {
> }
> 
> No vdso acceleration, but works with an unmodified kernel.

No, that's not what I want. As I said, we don't want any syscall (unless
it's vdso based).

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-02 17:17                     ` Shaohua Li
@ 2015-01-02 17:26                       ` David Ahern
  0 siblings, 0 replies; 29+ messages in thread
From: David Ahern @ 2015-01-02 17:26 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Peter Zijlstra, Andy Lutomirski, linux-kernel@vger.kernel.org,
	X86 ML, Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On 1/2/15 10:17 AM, Shaohua Li wrote:
> On Fri, Jan 02, 2015 at 10:09:10AM -0700, David Ahern wrote:
>> On 1/2/15 10:02 AM, Shaohua Li wrote:
>>> On Fri, Jan 02, 2015 at 08:31:33AM -0700, David Ahern wrote:
>>>> On 1/1/15 7:59 PM, Shaohua Li wrote:
>>>>> I'm wondering how we could use the perf to implament a clock_gettime.
>>>>> reading the perf fd or using ioctl is slow so reading the mmap
>>>>> ringbuffer is the only option. But as far as I know the ringbuffer has
>>>>> data only when an event is generated. Between two events, there is
>>>>> nothing we can read from the ringbuffer. Then how can application get
>>>>> time info in the interval?
>>>>
>>>> Are you wanting to read perf_clock from userspace?
>>>
>>> Yep, in some sort of form. Basically I want to read the time a task
>>> runs. Peter suggests we can read the activation time of a perf event.
>>> But I don't want to use any system call, as it's slow and likes
>>> clock_gettime.
>>
>> Since we cannot get the capability committed upstream a number of
>> folks are using this method:
>>
>>      https://github.com/dsahern/linux/blob/perf-full-monty/README.ahern
>>
>> ie., a KLM exports perf_clock and apps can use:
>>
>> #define CLOCK_PERF 14
>> if (clock_gettime(CLOCK_PERF, &ts) != 0) {
>> }
>>
>> No vdso acceleration, but works with an unmodified kernel.
>
> no, that's not what I want. as I said, we don't want any syscall (unless
> it's vdso based).

A number of clocks are vdso based (e.g., CLOCK_MONOTONIC). Those are not 
good enough?

David

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-02  2:59             ` Shaohua Li
  2015-01-02 15:31               ` David Ahern
@ 2015-01-02 17:47               ` Andy Lutomirski
  2015-01-05 23:23                 ` Shaohua Li
  1 sibling, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2015-01-02 17:47 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Peter Zijlstra, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Thu, Jan 1, 2015 at 6:59 PM, Shaohua Li <shli@fb.com> wrote:
> On Fri, Dec 19, 2014 at 06:03:34PM +0100, Peter Zijlstra wrote:
>> On Fri, Dec 19, 2014 at 08:48:07AM -0800, Andy Lutomirski wrote:
>> > On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> > > On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>> > >> Bad news: this patch is incorrect, I think.  Take a look at
>> > >> update_rq_clock -- it does fancy things involving irq time and
>> > >> paravirt steal time.  So this patch could result in extremely
>> > >> non-monotonic results.
>> > >
>> > > Yeah, I'm not sure how (and if) we could make all that work :/
>> >
>> > I obviously can't comment on what Facebook needs, but if I were
>> > rigging something up to profile my own code*, I'd want a count of
>> > elapsed time, including user, system, and probably interrupt as well.
>> > I would probably not want to count time during which I'm not
>> > scheduled, and I would also probably not want to count steal time.
>> > The latter makes any implementation kind of nasty.
>> >
>> > The API presumably doesn't need to be any particular clock id for
>> > clock_gettime, and it may not even need to be clock_gettime at all.
>> >
>> > Is perf self-monitoring good enough for this?  If not, can we make it
>> > good enough?
>>
>> Yeah, I think you should be able to use that. You could count a NOP
>> event and simply use its activated time. We have PERF_COUNT_SW_DUMMY for
>> such purposes iirc.
>>
>> The advantage of using perf self profiling is that it (obviously)
>> extends to more than just walltime.
>
> Hi Peter & Andy,
> I'm wondering how we could use the perf to implament a clock_gettime.
> reading the perf fd or using ioctl is slow so reading the mmap
> ringbuffer is the only option. But as far as I know the ringbuffer has
> data only when an event is generated. Between two events, there is
> nothing we can read from the ringbuffer. Then how can application get
> time info in the interval?

Don't use the ringbuffer.  Instead, use a counting event, mmap it, and
look at struct perf_event_mmap_page's comments to see how to read the
time stamps.

There's some code here that does this:

https://github.com/andikleen/pmu-tools

but you won't actually need the rdpmc part, since you just want
overall times instead of hardware event counts.

--Andy

>
> Thanks,
> Shaohua



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-02 17:47               ` Andy Lutomirski
@ 2015-01-05 23:23                 ` Shaohua Li
  2015-01-06 10:18                   ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2015-01-05 23:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, linux-kernel@vger.kernel.org, X86 ML, Kernel-team,
	H. Peter Anvin, Ingo Molnar, John Stultz

On Fri, Jan 02, 2015 at 09:47:29AM -0800, Andy Lutomirski wrote:
> On Thu, Jan 1, 2015 at 6:59 PM, Shaohua Li <shli@fb.com> wrote:
> > On Fri, Dec 19, 2014 at 06:03:34PM +0100, Peter Zijlstra wrote:
> >> On Fri, Dec 19, 2014 at 08:48:07AM -0800, Andy Lutomirski wrote:
> >> > On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >> > > On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> >> > >> Bad news: this patch is incorrect, I think.  Take a look at
> >> > >> update_rq_clock -- it does fancy things involving irq time and
> >> > >> paravirt steal time.  So this patch could result in extremely
> >> > >> non-monotonic results.
> >> > >
> >> > > Yeah, I'm not sure how (and if) we could make all that work :/
> >> >
> >> > I obviously can't comment on what Facebook needs, but if I were
> >> > rigging something up to profile my own code*, I'd want a count of
> >> > elapsed time, including user, system, and probably interrupt as well.
> >> > I would probably not want to count time during which I'm not
> >> > scheduled, and I would also probably not want to count steal time.
> >> > The latter makes any implementation kind of nasty.
> >> >
> >> > The API presumably doesn't need to be any particular clock id for
> >> > clock_gettime, and it may not even need to be clock_gettime at all.
> >> >
> >> > Is perf self-monitoring good enough for this?  If not, can we make it
> >> > good enough?
> >>
> >> Yeah, I think you should be able to use that. You could count a NOP
> >> event and simply use its activated time. We have PERF_COUNT_SW_DUMMY for
> >> such purposes iirc.
> >>
> >> The advantage of using perf self profiling is that it (obviously)
> >> extends to more than just walltime.
> >
> > Hi Peter & Andy,
> > I'm wondering how we could use the perf to implament a clock_gettime.
> > reading the perf fd or using ioctl is slow so reading the mmap
> > ringbuffer is the only option. But as far as I know the ringbuffer has
> > data only when an event is generated. Between two events, there is
> > nothing we can read from the ringbuffer. Then how can application get
> > time info in the interval?
> 
> Don't use the ringbuffer.  Instead, use a counting event, mmap it, and
> look at struct perf_event_mmap_page's comments to see how to read the
> time stamps.
> 
> There's some code here that does this:
> 
> https://github.com/andikleen/pmu-tools
> 
> but you won't actually need the rdpmc part, since you just want
> overall times instead of hardware event counts.

Good, it works. But the timestamp (.time_running and friends) only gets
updated for real hardware events between context switches. For software
events, the timestamp is initialized once, then never updated. If I use
it to get time, I actually get CLOCK_MONOTONIC. Hardware events work
well here, but depending on a hardware event is too tricky, which I'd
like to avoid.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-05 23:23                 ` Shaohua Li
@ 2015-01-06 10:18                   ` Peter Zijlstra
  2015-01-06 16:59                     ` Shaohua Li
  2015-01-12 19:50                     ` Shaohua Li
  0 siblings, 2 replies; 29+ messages in thread
From: Peter Zijlstra @ 2015-01-06 10:18 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Mon, Jan 05, 2015 at 03:23:38PM -0800, Shaohua Li wrote:
> Good, it works. But the timestamp (.time_running and friends) only gets
> updated for real hardware event between context switches. For software
> event, the timestamp is initialized once, then never updated. If I use
> it to get time, I actually get CLOCK_MONOTONIC. Hardware events work
> well here, but depending on hardware event is too tricky, which I'd like
> to avoid.

Hmm, that's an unfortunate difference in behaviour, does something like
the below cure that for you?

---
 kernel/events/core.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c1ee7f2bebc..0feb4e039359 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5965,11 +5965,13 @@ static void perf_swevent_del(struct perf_event *event, int flags)
 static void perf_swevent_start(struct perf_event *event, int flags)
 {
 	event->hw.state = 0;
+	perf_event_update_userpage(event);
 }
 
 static void perf_swevent_stop(struct perf_event *event, int flags)
 {
 	event->hw.state = PERF_HES_STOPPED;
+	perf_event_update_userpage(event);
 }
 
 /* Deref the hlist from the update side */
@@ -6410,12 +6412,14 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
 {
 	local64_set(&event->hw.prev_count, local_clock());
 	perf_swevent_start_hrtimer(event);
+	perf_event_update_userpage(event);
 }
 
 static void cpu_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
 	cpu_clock_event_update(event);
+	perf_event_update_userpage(event);
 }
 
 static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -6484,12 +6488,14 @@ static void task_clock_event_start(struct perf_event *event, int flags)
 {
 	local64_set(&event->hw.prev_count, event->ctx->time);
 	perf_swevent_start_hrtimer(event);
+	perf_event_update_userpage(event);
 }
 
 static void task_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
 	task_clock_event_update(event, event->ctx->time);
+	perf_event_update_userpage(event);
 }
 
 static int task_clock_event_add(struct perf_event *event, int flags)

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-06 10:18                   ` Peter Zijlstra
@ 2015-01-06 16:59                     ` Shaohua Li
  2015-01-12 19:50                     ` Shaohua Li
  1 sibling, 0 replies; 29+ messages in thread
From: Shaohua Li @ 2015-01-06 16:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Tue, Jan 06, 2015 at 11:18:39AM +0100, Peter Zijlstra wrote:
> On Mon, Jan 05, 2015 at 03:23:38PM -0800, Shaohua Li wrote:
> > Good, it works. But the timestamp (.time_running and friends) only gets
> > updated for real hardware event between context switches. For software
> > event, the timestamp is initialized once, then never updated. If I use
> > it to get time, I actually get CLOCK_MONOTONIC. Hardware events work
> > well here, but depending on hardware event is too tricky, which I'd like
> > to avoid.
> 
> Hmm, that's an unfortunate difference in behaviour, does something like
> the below cure that for you?

Yes, I tried something similar here, and it works perfectly. Is this a
bug we will eventually fix? If yes, we will be very happy to use perf
for the clock_gettime things. I ran some initial benchmarks which show
this doesn't have a noticeable performance impact. I'll do more
benchmarks and report back if there are any issues.

Thanks,
Shaohua

> 
> ---
>  kernel/events/core.c | 6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4c1ee7f2bebc..0feb4e039359 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5965,11 +5965,13 @@ static void perf_swevent_del(struct perf_event *event, int flags)
>  static void perf_swevent_start(struct perf_event *event, int flags)
>  {
>  	event->hw.state = 0;
> +	perf_event_update_userpage(event);
>  }
>  
>  static void perf_swevent_stop(struct perf_event *event, int flags)
>  {
>  	event->hw.state = PERF_HES_STOPPED;
> +	perf_event_update_userpage(event);
>  }
>  
>  /* Deref the hlist from the update side */
> @@ -6410,12 +6412,14 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
>  {
>  	local64_set(&event->hw.prev_count, local_clock());
>  	perf_swevent_start_hrtimer(event);
> +	perf_event_update_userpage(event);
>  }
>  
>  static void cpu_clock_event_stop(struct perf_event *event, int flags)
>  {
>  	perf_swevent_cancel_hrtimer(event);
>  	cpu_clock_event_update(event);
> +	perf_event_update_userpage(event);
>  }
>  
>  static int cpu_clock_event_add(struct perf_event *event, int flags)
> @@ -6484,12 +6488,14 @@ static void task_clock_event_start(struct perf_event *event, int flags)
>  {
>  	local64_set(&event->hw.prev_count, event->ctx->time);
>  	perf_swevent_start_hrtimer(event);
> +	perf_event_update_userpage(event);
>  }
>  
>  static void task_clock_event_stop(struct perf_event *event, int flags)
>  {
>  	perf_swevent_cancel_hrtimer(event);
>  	task_clock_event_update(event, event->ctx->time);
> +	perf_event_update_userpage(event);
>  }
>  
>  static int task_clock_event_add(struct perf_event *event, int flags)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO
  2015-01-06 10:18                   ` Peter Zijlstra
  2015-01-06 16:59                     ` Shaohua Li
@ 2015-01-12 19:50                     ` Shaohua Li
  1 sibling, 0 replies; 29+ messages in thread
From: Shaohua Li @ 2015-01-12 19:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org, X86 ML,
	Kernel-team, H. Peter Anvin, Ingo Molnar, John Stultz

On Tue, Jan 06, 2015 at 11:18:39AM +0100, Peter Zijlstra wrote:
> On Mon, Jan 05, 2015 at 03:23:38PM -0800, Shaohua Li wrote:
> > Good, it works. But the timestamp (.time_running and friends) only gets
> > updated for real hardware event between context switches. For software
> > event, the timestamp is initialized once, then never updated. If I use
> > it to get time, I actually get CLOCK_MONOTONIC. Hardware events work
> > well here, but depending on hardware event is too tricky, which I'd like
> > to avoid.
> 
> Hmm, that's an unfortunate difference in behaviour, does something like
> the below cure that for you?

Hi Peter,
can we push these to upstream? It works very well here.

Thanks,
Shaohua
 
> ---
>  kernel/events/core.c | 6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4c1ee7f2bebc..0feb4e039359 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5965,11 +5965,13 @@ static void perf_swevent_del(struct perf_event *event, int flags)
>  static void perf_swevent_start(struct perf_event *event, int flags)
>  {
>  	event->hw.state = 0;
> +	perf_event_update_userpage(event);
>  }
>  
>  static void perf_swevent_stop(struct perf_event *event, int flags)
>  {
>  	event->hw.state = PERF_HES_STOPPED;
> +	perf_event_update_userpage(event);
>  }
>  
>  /* Deref the hlist from the update side */
> @@ -6410,12 +6412,14 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
>  {
>  	local64_set(&event->hw.prev_count, local_clock());
>  	perf_swevent_start_hrtimer(event);
> +	perf_event_update_userpage(event);
>  }
>  
>  static void cpu_clock_event_stop(struct perf_event *event, int flags)
>  {
>  	perf_swevent_cancel_hrtimer(event);
>  	cpu_clock_event_update(event);
> +	perf_event_update_userpage(event);
>  }
>  
>  static int cpu_clock_event_add(struct perf_event *event, int flags)
> @@ -6484,12 +6488,14 @@ static void task_clock_event_start(struct perf_event *event, int flags)
>  {
>  	local64_set(&event->hw.prev_count, event->ctx->time);
>  	perf_swevent_start_hrtimer(event);
> +	perf_event_update_userpage(event);
>  }
>  
>  static void task_clock_event_stop(struct perf_event *event, int flags)
>  {
>  	perf_swevent_cancel_hrtimer(event);
>  	task_clock_event_update(event, event->ctx->time);
> +	perf_event_update_userpage(event);
>  }
>  
>  static int task_clock_event_add(struct perf_event *event, int flags)

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2015-01-12 19:51 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed
-- links below jump to the message on this page --)
2014-12-17 23:12 [PATCH v2 1/3] X86: make VDSO data support multiple pages Shaohua Li
2014-12-17 23:12 ` [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch Shaohua Li
2014-12-19  1:05   ` Thomas Gleixner
2014-12-17 23:12 ` [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO Shaohua Li
2014-12-18 23:30   ` Andy Lutomirski
2014-12-19  0:22     ` Andy Lutomirski
2014-12-19  0:30       ` Shaohua Li
2014-12-19  0:32         ` Andy Lutomirski
2014-12-19  0:34         ` Thomas Gleixner
2014-12-19 11:23       ` Peter Zijlstra
2014-12-19 16:48         ` Andy Lutomirski
2014-12-19 17:03           ` Peter Zijlstra
2014-12-19 17:07             ` Andy Lutomirski
2014-12-19 17:27               ` Peter Zijlstra
2014-12-19 17:42                 ` Andy Lutomirski
2015-01-02  2:59             ` Shaohua Li
2015-01-02 15:31               ` David Ahern
2015-01-02 17:02                 ` Shaohua Li
2015-01-02 17:09                   ` David Ahern
2015-01-02 17:17                     ` Shaohua Li
2015-01-02 17:26                       ` David Ahern
2015-01-02 17:47               ` Andy Lutomirski
2015-01-05 23:23                 ` Shaohua Li
2015-01-06 10:18                   ` Peter Zijlstra
2015-01-06 16:59                     ` Shaohua Li
2015-01-12 19:50                     ` Shaohua Li
2014-12-19 17:42           ` Chris Mason
2014-12-19 17:53             ` Andy Lutomirski
2014-12-19 18:16               ` Shaohua Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox