* [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-12 19:36 UTC (permalink / raw)
To: linux-kernel
Cc: Kumar Sundararajan, Arun Sharma, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
From: Kumar Sundararajan <kumar@fb.com>
This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
via a new vsyscall. We also add a direct vsyscall that returns
time in ns (RFC: the direct vsyscall doesn't have a corresponding
regular syscall, although clock_gettime() is pretty close).
We use the following method to compute the thread cpu time:
t0 = process start
t1 = most recent context switch time
t2 = time at which the vsyscall is invoked
thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
= current->se.sum_exec_runtime + now - sched_clock()
At context-switch time we stash away
adj_sched_time = sum_exec_runtime - sched_clock()
in a per-cpu struct in the VVAR page (which has now been extended
to two pages) and then compute
thread_cpu_time = adj_sched_time + now
All computations are done in nanoseconds on systems where the TSC is stable.
If the TSC is unstable, we fall back to a regular syscall.
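In sketch form (illustrative pseudo-C, not part of the patch; names
match the code below):

	/* writer, at context switch (see finish_task_switch() below) */
	vcpu_data.vpercpu[cpu].adj_sched_time =
		current->se.sum_exec_runtime - sched_clock();

	/* reader, in the vDSO; tsc is the value just read via rdtscp */
	now = ___cycles_2_ns(tsc, vcpu_data.vpercpu[cpu].cyc2ns,
			     vcpu_data.vpercpu[cpu].cyc2ns_offset);
	thread_cpu_time = vcpu_data.vpercpu[cpu].adj_sched_time + now;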
Benchmark data:
Baseline:
for (i = 0; i < 100000000; i++) {
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
}
vclock_gettime:
vclock_gettime = dlsym(vdso, "__vdso_clock_gettime");
for (i = 0; i < 100000000; i++) {
(*vclock_gettime)(CLOCK_THREAD_CPUTIME_ID, &ts);
sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
}
thread_cpu_time:
thread_cpu_time = dlsym(vdso, "__vdso_thread_cpu_time");
for (i = 0; i < 100000000; i++) {
sum += (*thread_cpu_time)();
}
Baseline: 19.34 secs
vclock_gettime: 4.74 secs
thread_cpu_time: 3.62 secs
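For reference, a self-contained version of the baseline loop might look
like the following (the NSECS_PER_SEC definition is an assumption; the
vDSO variants differ only in how the function pointer is resolved, e.g.
by parsing the vDSO ELF image found via the AT_SYSINFO_EHDR
auxiliary-vector entry):

	#define _POSIX_C_SOURCE 199309L
	#include <stdio.h>
	#include <time.h>

	#define NSECS_PER_SEC 1000000000ULL	/* assumed definition */

	int main(void)
	{
		unsigned long long sum = 0;
		struct timespec ts;
		int i;

		for (i = 0; i < 100000000; i++) {
			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
			sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
		}
		/* print sum so the loop can't be optimized away;
		   link with -lrt on older glibc */
		printf("%llu\n", sum);
		return 0;
	}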
This should speed up profilers that need to query thread CPU time
frequently to generate fine-grained timestamps.
No statistically significant regression was detected in the x86_64
context-switch code. Most archs that don't support vsyscalls will
have this code disabled via jump labels.
Signed-off-by: Kumar Sundararajan <kumar@fb.com>
Signed-off-by: Arun Sharma <asharma@fb.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Andy Lutomirski <luto@MIT.EDU>
Cc: linux-kernel@vger.kernel.org
---
arch/x86/include/asm/timer.h | 18 +++++++++---
arch/x86/include/asm/vvar.h | 1 +
arch/x86/kernel/tsc.c | 6 ++++
arch/x86/kernel/vsyscall_64.c | 1 +
arch/x86/vdso/vclock_gettime.c | 58 ++++++++++++++++++++++++++++++++++++++-
arch/x86/vdso/vdso.lds.S | 2 +
arch/x86/vdso/vma.c | 5 +++
include/linux/jiffies.h | 16 +++++++++++
kernel/sched.c | 6 ++++
9 files changed, 106 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 431793e..99a3670 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -55,19 +55,27 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+static inline unsigned long long ___cycles_2_ns(unsigned long long cyc,
+ unsigned long long scale,
+ unsigned long long offset)
{
unsigned long long quot;
unsigned long long rem;
- int cpu = smp_processor_id();
- unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
+ unsigned long long ns = offset;
quot = (cyc >> CYC2NS_SCALE_FACTOR);
rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
- ns += quot * per_cpu(cyc2ns, cpu) +
- ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
+ ns += quot * scale + ((rem * scale) >> CYC2NS_SCALE_FACTOR);
return ns;
}
+static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+{
+ int cpu = smp_processor_id();
+ unsigned long long offset = per_cpu(cyc2ns_offset, cpu);
+ unsigned long long scale = per_cpu(cyc2ns, cpu);
+ return ___cycles_2_ns(cyc, scale, offset);
+}
+
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
unsigned long long ns;
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 0fd7a4a..e36e1c1 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -47,5 +47,6 @@
DECLARE_VVAR(0, volatile unsigned long, jiffies)
DECLARE_VVAR(16, int, vgetcpu_mode)
DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+DECLARE_VVAR(2048, struct vcpu_data, vcpu_data)
#undef DECLARE_VVAR
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db48336..1dc7205 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -570,6 +570,8 @@ int recalibrate_cpu_khz(void)
cpu_data(0).loops_per_jiffy =
cpufreq_scale(cpu_data(0).loops_per_jiffy,
cpu_khz_old, cpu_khz);
+ vcpu_data.tsc_khz = tsc_khz;
+ vcpu_data.tsc_unstable = 0;
return 0;
} else
return -ENODEV;
@@ -623,6 +625,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
if (cpu_khz) {
*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
*offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+ vcpu_data.vpercpu[cpu].cyc2ns = *scale;
+ vcpu_data.vpercpu[cpu].cyc2ns_offset = *offset;
}
sched_clock_idle_wakeup_event(0);
@@ -786,6 +790,8 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
sched_clock_stable = 0;
disable_sched_clock_irqtime();
+ vcpu_data.tsc_unstable = 1;
+ jump_label_dec(&vcpu_data_enabled);
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7960d3a..cdfcedf 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -56,6 +56,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
+DEFINE_VVAR(struct vcpu_data, vcpu_data);
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6bc0e72..45720b3 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -18,6 +18,7 @@
#include <asm/vsyscall.h>
#include <asm/fixmap.h>
#include <asm/vgtod.h>
+#include <asm/timer.h>
#include <asm/timex.h>
#include <asm/hpet.h>
#include <asm/unistd.h>
@@ -65,8 +66,8 @@ static notrace cycle_t vread_hpet(void)
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
- asm("syscall" : "=a" (ret) :
- "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
+ asm volatile("syscall" : "=a" (ret) :
+ "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
return ret;
}
@@ -154,8 +155,51 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
return 0;
}
+notrace static inline unsigned long __do_thread_cpu_time(void)
+{
+ unsigned int p;
+ u_int64_t tscval;
+ unsigned long long adj_sched_time, scale, offset;
+ const struct vcpu_data *vp = &VVAR(vcpu_data);
+ int cpu;
+
+ if (vp->tsc_unstable) {
+ struct timespec ts;
+ vdso_fallback_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+ return timespec_to_ns(&ts);
+ }
+
+ do {
+ native_read_tscp(&p);
+ cpu = p & 0xfff;
+ adj_sched_time = vp->vpercpu[cpu].adj_sched_time;
+ scale = vp->vpercpu[cpu].cyc2ns;
+ offset = vp->vpercpu[cpu].cyc2ns_offset;
+ rdtscpll(tscval, p);
+ cpu = p & 0xfff;
+ } while (unlikely(adj_sched_time != vp->vpercpu[cpu].adj_sched_time));
+
+ return ___cycles_2_ns(tscval, scale, offset) + adj_sched_time;
+}
+
+notrace static noinline unsigned long do_thread_cpu_time(void)
+{
+ return __do_thread_cpu_time();
+}
+
+notrace noinline unsigned long __vdso_thread_cpu_time(void)
+{
+ return __do_thread_cpu_time();
+}
+
+long thread_time(void)
+ __attribute__((weak, alias("__vdso_thread_cpu_time")));
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
+ long ns;
+ const struct vcpu_data *vp = &VVAR(vcpu_data);
+
switch (clock) {
case CLOCK_REALTIME:
if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
@@ -169,6 +213,16 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
return do_realtime_coarse(ts);
case CLOCK_MONOTONIC_COARSE:
return do_monotonic_coarse(ts);
+ case CLOCK_THREAD_CPUTIME_ID:
+ if (vp->tsc_unstable)
+ break;
+ ns = do_thread_cpu_time();
+ if (likely(ns > 0)) {
+ ts->tv_sec = 0;
+ timespec_add_ns(ts, ns);
+ return 0;
+ }
+ break;
}
return vdso_fallback_gettime(clock, ts);
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index b96b267..c0b2d36 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -25,6 +25,8 @@ VERSION {
__vdso_getcpu;
time;
__vdso_time;
+ thread_cpu_time;
+ __vdso_thread_cpu_time;
local: *;
};
}
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c..69f38f8 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
+#include <linux/jump_label.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
@@ -24,6 +25,8 @@ extern unsigned short vdso_sync_cpuid;
extern struct page *vdso_pages[];
static unsigned vdso_size;
+struct jump_label_key vcpu_data_enabled;
+
static void __init patch_vdso(void *vdso, size_t len)
{
Elf64_Ehdr *hdr = vdso;
@@ -66,6 +69,8 @@ static int __init init_vdso(void)
for (i = 0; i < npages; i++)
vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
+ if (sched_clock_stable)
+ jump_label_inc(&vcpu_data_enabled);
return 0;
}
subsys_initcall(init_vdso);
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 265e2c3..a0fe1f5 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -312,4 +312,20 @@ extern unsigned long nsecs_to_jiffies(u64 n);
#define TIMESTAMP_SIZE 30
+struct vpercpu_data {
+ unsigned long long adj_sched_time;
+ unsigned long long cyc2ns_offset;
+ unsigned long cyc2ns;
+} ____cacheline_aligned;
+
+struct vcpu_data {
+ struct vpercpu_data vpercpu[NR_CPUS];
+ unsigned int tsc_khz;
+ unsigned int tsc_unstable;
+};
+extern struct vcpu_data vcpu_data;
+
+struct jump_label_key;
+extern struct jump_label_key vcpu_data_enabled;
+
#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index d6b149c..3f92455 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3216,6 +3216,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
kprobe_flush_task(prev);
put_task_struct(prev);
}
+
+ if (static_branch(&vcpu_data_enabled)) {
+ int cpu = smp_processor_id();
+ vcpu_data.vpercpu[cpu].adj_sched_time =
+ current->se.sum_exec_runtime - sched_clock();
+ }
}
#ifdef CONFIG_SMP
--
1.7.4
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Eric Dumazet @ 2011-12-12 20:13 UTC (permalink / raw)
To: Arun Sharma
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
On Monday 12 December 2011 at 11:36 -0800, Arun Sharma wrote:
> From: Kumar Sundararajan <kumar@fb.com>
>
...
> +
> +struct vcpu_data {
> + struct vpercpu_data vpercpu[NR_CPUS];
> + unsigned int tsc_khz;
> + unsigned int tsc_unstable;
> +};
That's a showstopper.
Try to compile the thing with NR_CPUS=4096?
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Andrew Lutomirski @ 2011-12-12 20:15 UTC (permalink / raw)
To: Arun Sharma
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz
On Mon, Dec 12, 2011 at 11:36 AM, Arun Sharma <asharma@fb.com> wrote:
> From: Kumar Sundararajan <kumar@fb.com>
>
> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
> via a new vsyscall. We also add a direct vsyscall that returns
> time in ns (RFC: the direct vsyscall doesn't have a corresponding
> regular syscall, although clock_gettime() is pretty close).
>
> We use the following method to compute the thread cpu time:
>
> t0 = process start
> t1 = most recent context switch time
> t2 = time at which the vsyscall is invoked
>
> thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
> = current->se.sum_exec_runtime + now - sched_clock()
>
[...]
This seems like a neat idea, but I'm a little worried about the
implementation. Surely someone with 4k+ cpus will complain when the
percpu vvar data exceeds a page and crashes. I have no personal
objection to a dynamically-sized vvar area, but it might need more
careful handling.
I also wonder if there is a problem with information leaking. If there
was an entire per-cpu vvar page (perhaps mapped differently on each
cpu) then nothing interesting would leak. But that could add
overhead.
> diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
> index 0fd7a4a..e36e1c1 100644
> --- a/arch/x86/include/asm/vvar.h
> +++ b/arch/x86/include/asm/vvar.h
> @@ -47,5 +47,6 @@
> DECLARE_VVAR(0, volatile unsigned long, jiffies)
> DECLARE_VVAR(16, int, vgetcpu_mode)
> DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
> +DECLARE_VVAR(2048, struct vcpu_data, vcpu_data)
Any reason this isn't page-aligned? Offset 2048 seems like an odd place.
> notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
> {
> long ret;
> - asm("syscall" : "=a" (ret) :
> - "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> + asm volatile("syscall" : "=a" (ret) :
> + "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> return ret;
> }
Huh? This should probably be a separate patch (and probably not a
-stable candidate, since it would take amazing compiler stupidity to
generate anything other than the obvious code). The memory clobber
may also be enough to make this officially safe.
>
> @@ -154,8 +155,51 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
> return 0;
> }
>
> +notrace static inline unsigned long __do_thread_cpu_time(void)
> +{
> + unsigned int p;
> + u_int64_t tscval;
> + unsigned long long adj_sched_time, scale, offset;
> + const struct vcpu_data *vp = &VVAR(vcpu_data);
> + int cpu;
> +
> + if (vp->tsc_unstable) {
> + struct timespec ts;
> + vdso_fallback_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
> + return timespec_to_ns(&ts);
> + }
Yuck -- another flag indicating whether we're using the tsc.
> +
> + do {
> + native_read_tscp(&p);
Do all 64-bit cpus have rdtscp? ISTR older Intel machines don't have it.
> --- a/arch/x86/vdso/vdso.lds.S
> +++ b/arch/x86/vdso/vdso.lds.S
> @@ -25,6 +25,8 @@ VERSION {
> __vdso_getcpu;
> time;
> __vdso_time;
> + thread_cpu_time;
> + __vdso_thread_cpu_time;
> local: *;
> };
> }
Why do we have the non-__vdso versions? IMO anything that actually
uses them is likely to be confused. They have the same names as the
glibc wrappers, but the glibc wrappers have different return value and
errno semantics. I'd say just use __vdso.
Also, what's wrong with just adding the functionality to __vdso_clock_gettime?
> +struct vcpu_data {
> + struct vpercpu_data vpercpu[NR_CPUS];
> + unsigned int tsc_khz;
I think this also duplicates some of the gtod_data stuff, at least in
the case that TSC works for gtod.
--Andy
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-12 21:19 UTC (permalink / raw)
To: Eric Dumazet
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
On 12/12/11 12:13 PM, Eric Dumazet wrote:
>> +
>> +struct vcpu_data {
>> + struct vpercpu_data vpercpu[NR_CPUS];
>> + unsigned int tsc_khz;
>> + unsigned int tsc_unstable;
>> +};
>
> That's a showstopper.
>
> Try to compile the thing with NR_CPUS=4096?
>
I get a link time error:
ld: section .data..percpu [0000000001ac2000 -> 0000000001ad48ff]
overlaps section .vvar [0000000001ac0000 -> 0000000001b0083f]
which I consider better than runtime memory corruption :)
I could add a BUILD_BUG_ON() that tries to catch this earlier in the
compile process.
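Something along these lines, as a sketch (2048 is the offset from the
DECLARE_VVAR line in the patch; VVAR_NUM_PAGES as in patch 1/2):

	BUILD_BUG_ON(2048 + sizeof(struct vcpu_data) >
		     VVAR_NUM_PAGES * PAGE_SIZE);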
Re: Fixing the build for NR_CPUS > 64
How about something along the lines of the following:
From: Arun Sharma <asharma@fb.com>
Date: Mon, 12 Dec 2011 13:13:43 -0800
Subject: [PATCH] Handle NR_CPUS > 64
---
arch/x86/kernel/tsc.c | 8 ++++++--
arch/x86/vdso/vclock_gettime.c | 4 ++--
include/linux/jiffies.h | 8 ++++++--
kernel/sched.c | 2 ++
4 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1dc7205..45f3438 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -571,7 +571,11 @@ int recalibrate_cpu_khz(void)
cpufreq_scale(cpu_data(0).loops_per_jiffy,
cpu_khz_old, cpu_khz);
vcpu_data.tsc_khz = tsc_khz;
- vcpu_data.tsc_unstable = 0;
+#if CONFIG_NR_CPUS <= 64
+ vcpu_data.thread_cputime_disabled = 0;
+#else
+ vcpu_data.thread_cputime_disabled = 1;
+#endif
return 0;
} else
return -ENODEV;
@@ -790,7 +794,7 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
sched_clock_stable = 0;
disable_sched_clock_irqtime();
- vcpu_data.tsc_unstable = 1;
+ vcpu_data.thread_cputime_disabled = 1;
jump_label_dec(&vcpu_data_enabled);
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 61ab74b..e41a7e9 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -185,7 +185,7 @@ notrace static inline unsigned long __do_thread_cpu_time(void)
const struct vcpu_data *vp = &VVAR(vcpu_data);
int cpu;
- if (vp->tsc_unstable) {
+ if (vp->thread_cputime_disabled) {
struct timespec ts;
vdso_fallback_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return timespec_to_ns(&ts);
@@ -236,7 +236,7 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
case CLOCK_MONOTONIC_COARSE:
return do_monotonic_coarse(ts);
case CLOCK_THREAD_CPUTIME_ID:
- if (vp->tsc_unstable)
+ if (vp->thread_cputime_disabled)
break;
ns = do_thread_cpu_time();
if (likely(ns > 0)) {
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index a0fe1f5..aec389f 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -319,9 +319,13 @@ struct vpercpu_data {
} ____cacheline_aligned;
struct vcpu_data {
- struct vpercpu_data vpercpu[NR_CPUS];
unsigned int tsc_khz;
- unsigned int tsc_unstable;
+ unsigned int thread_cputime_disabled;
+#if CONFIG_NR_CPUS <= 64
+ struct vpercpu_data vpercpu[NR_CPUS];
+#else
+ struct vpercpu_data vpercpu[1];
+#endif
};
extern struct vcpu_data vcpu_data;
diff --git a/kernel/sched.c b/kernel/sched.c
index a72545a..e4f38ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3216,11 +3216,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
put_task_struct(prev);
}
+#if CONFIG_NR_CPUS <= 64
if (static_branch(&vcpu_data_enabled)) {
int cpu = smp_processor_id();
vcpu_data.vpercpu[cpu].adj_sched_time =
current->se.sum_exec_runtime - sched_clock();
}
+#endif
}
#ifdef CONFIG_SMP
--
1.7.4
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Eric Dumazet @ 2011-12-12 21:27 UTC (permalink / raw)
To: Arun Sharma
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
On Monday 12 December 2011 at 13:19 -0800, Arun Sharma wrote:
> On 12/12/11 12:13 PM, Eric Dumazet wrote:
>
> >> +
> >> +struct vcpu_data {
> >> + struct vpercpu_data vpercpu[NR_CPUS];
> >> + unsigned int tsc_khz;
> >> + unsigned int tsc_unstable;
> >> +};
> >
> > That's a showstopper.
> >
> > Try to compile the thing with NR_CPUS=4096?
> >
>
> I get a link time error:
>
> ld: section .data..percpu [0000000001ac2000 -> 0000000001ad48ff]
> overlaps section .vvar [0000000001ac0000 -> 0000000001b0083f]
>
> which I consider better than runtime memory corruption :)
>
> I could add a BUILD_BUG_ON() that tries to catch this earlier in the
> compile process.
>
> Re: Fixing the build for NR_CPUS > 64
>
> How about something along the lines of the following:
>
> From: Arun Sharma <asharma@fb.com>
> Date: Mon, 12 Dec 2011 13:13:43 -0800
> Subject: [PATCH] Handle NR_CPUS > 64
>
> ---
This only works if CONFIG_X86_L1_CACHE_SHIFT=6
Some configurations have 128-byte cache lines
But really most modern distros have NR_CPUS > 64
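Concretely, for the cache-line point: ____cacheline_aligned rounds
struct vpercpu_data up to one full L1 line, so

	64 entries * 64-byte lines  = 4096 bytes (exactly one page)
	64 entries * 128-byte lines = 8192 bytes (two pages)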
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Andrew Lutomirski @ 2011-12-12 21:33 UTC (permalink / raw)
To: Eric Dumazet
Cc: Arun Sharma, linux-kernel, Kumar Sundararajan, Peter Zijlstra,
Ingo Molnar, Thomas Gleixner, john stultz
On Mon, Dec 12, 2011 at 1:27 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Monday 12 December 2011 at 13:19 -0800, Arun Sharma wrote:
>> On 12/12/11 12:13 PM, Eric Dumazet wrote:
>>
>> >> +
>> >> +struct vcpu_data {
>> >> + struct vpercpu_data vpercpu[NR_CPUS];
>> >> + unsigned int tsc_khz;
>> >> + unsigned int tsc_unstable;
>> >> +};
>> >
>> > That's a showstopper.
>> >
>> > Try to compile the thing with NR_CPUS=4096?
>> >
>>
>> I get a link time error:
>>
>> ld: section .data..percpu [0000000001ac2000 -> 0000000001ad48ff]
>> overlaps section .vvar [0000000001ac0000 -> 0000000001b0083f]
>>
>> which I consider better than runtime memory corruption :)
>>
>> I could add a BUILD_BUG_ON() that tries to catch this earlier in the
>> compile process.
>>
>> Re: Fixing the build for NR_CPUS > 64
>>
>> How about something along the lines of the following:
>>
>> From: Arun Sharma <asharma@fb.com>
>> Date: Mon, 12 Dec 2011 13:13:43 -0800
>> Subject: [PATCH] Handle NR_CPUS > 64
>>
>> ---
>
> This only works if CONFIG_X86_L1_CACHE_SHIFT=6
>
> Some configurations have 128 bytes cache lines
>
> But really most modern distros have NR_CPUS > 64
>
>
>
I kind of like the idea of a per-cpu vvar page.
Pros:
- It could be used for very fast and simple userspace TLS.
- Things like gettid() could become vsyscalls.
- Thread time, process time, etc.
- I would use it for my nefarious asynchronous sysret idea.
- The kernel could use the same infrastructure to replace swapgs.
This might not be a win -- swapgs is probably very highly optimized on
modern cpus.
- Other than the core implementation, everything that uses it would be
very easy to understand.
Cons:
- Uses a page, a pte, and everything higher up the page table hierarchy per cpu.
- Takes up a tlb slot.
- Getting it to work with Xen might be interesting.
--Andy
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-12 22:14 UTC (permalink / raw)
To: Eric Dumazet
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
On 12/12/11 1:27 PM, Eric Dumazet wrote:
> This only works if CONFIG_X86_L1_CACHE_SHIFT=6
>
> Some configurations have 128 bytes cache lines
>
I could still handle this case based on compile-time math.
> But really most modern distros have NR_CPUS > 64
Is 256 the new 64? :)
That'll require 3 more VVAR pages, most of which would be unused on the
vast majority of systems (assuming 32 CPUs or fewer).
Having the number of VVAR pages configurable at boot time would solve
this problem nicely. I think that should be a separate patch.
-Arun
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-12 22:49 UTC (permalink / raw)
To: Andrew Lutomirski
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz
On 12/12/11 12:15 PM, Andrew Lutomirski wrote:
> This seems like a neat idea, but I'm a little worried about the
> implementation. Surely someone with 4k+ cpus will complain when the
> percpu vvar data exceeds a page and crashes. I have no personal
> objection to a dynamically-sized vvar area, but it might need more
> careful handling.
I'm for disabling the vsyscall when the data doesn't fit in a couple of
pages.
>
> I also wonder if there is a problem with information leaking. If there
> was an entire per-cpu vvar page (perhaps mapped differently on each
> cpu) then nothing interesting would leak. But that could add
> overhead.
>
The timing based attacks depend on the granularity of timestamps. I feel
what's available here is too coarse grained to be useful. Happy to
disable the code at compile time for those cases. Are there
CONFIG_HIGH_SECURITY type of options I could use for this purpose?
>> diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
>> index 0fd7a4a..e36e1c1 100644
>> --- a/arch/x86/include/asm/vvar.h
>> +++ b/arch/x86/include/asm/vvar.h
>> @@ -47,5 +47,6 @@
>> DECLARE_VVAR(0, volatile unsigned long, jiffies)
>> DECLARE_VVAR(16, int, vgetcpu_mode)
>> DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
>> +DECLARE_VVAR(2048, struct vcpu_data, vcpu_data)
>
> Any reason this isn't page-aligned? Offset 2048 seems like an odd place.
>
I meant it to be 128 + sizeof(struct vsyscall_gtod_data) so we fit
everything in one page if CONFIG_NR_CPUS is small enough.
I'll fix it up in the next rev.
>> notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
>> {
>> long ret;
>> - asm("syscall" : "=a" (ret) :
>> - "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
>> + asm volatile("syscall" : "=a" (ret) :
>> + "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
>> return ret;
>> }
>
> Huh? This should probably be a separate patch (and probably not a
> -stable candidate, since it would take amazing compiler stupidity to
> generate anything other than the obvious code). The memory clobber
> may also be enough to make this officially safe.
Yes - this should be a separate patch. gcc-4.4 likes to get rid of the
instruction in __do_thread_cpu_time without the asm volatile (in spite
of the memory clobber).
>> + if (vp->tsc_unstable) {
>> + struct timespec ts;
>> + vdso_fallback_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>> + return timespec_to_ns(&ts);
>> + }
>
> Yuck -- another flag indicating whether we're using the tsc.
I renamed it to thread_cputime_disabled to deal with NR_CPUS > 64.
>
>> +
>> + do {
>> + native_read_tscp(&p);
>
> Do all 64-bit cpus have rdtscp? ISTR older Intel machines don't have it.
>
Yeah - I ran into this one when testing with KVM on an rdtscp-capable
CPU. Will disable the vsyscall when the CPU doesn't support rdtscp.
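The v2 patch later in this thread does that with a feature check at
tsc_init() time:

	vcpu_data.thread_cputime_disabled = check_vvar_overflow()
		|| !cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP);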
>> --- a/arch/x86/vdso/vdso.lds.S
>> +++ b/arch/x86/vdso/vdso.lds.S
>> @@ -25,6 +25,8 @@ VERSION {
>> __vdso_getcpu;
>> time;
>> __vdso_time;
>> + thread_cpu_time;
>> + __vdso_thread_cpu_time;
>> local: *;
>> };
>> }
>
> Why do we have the non-__vdso versions? IMO anything that actually
> uses them is likely to be confused. They have the same names as the
> glibc wrappers, but the glibc wrappers have different return value and
> errno semantics. I'd say just use __vdso.
I'm not familiar with the history of __vdso_foo vs foo. Happy to get rid
of the non __vdso versions.
>
> Also, what's wrong with just adding the functionality to __vdso_clock_gettime?
>
Performance. Most of this is client dependent (whether it expects time
in nanoseconds or timespec).
Baseline: 19.34 secs
vclock_gettime: 4.74 secs
thread_cpu_time: 3.62 secs
Our use case prefers nanoseconds, so the conversion to timespec and back
adds overhead. Also, having a direct entry point vs going through the
switch in clock_gettime() adds some cycles.
I have some people here asking me if they could get CLOCK_REALTIME and
CLOCK_MONOTONIC also in nanoseconds for the same reason.
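The round trip being avoided is roughly (sketch, using names from the
patch):

	/* __vdso_clock_gettime() packs the ns value into a timespec... */
	ts->tv_sec = 0;
	timespec_add_ns(ts, ns);	/* divides by NSEC_PER_SEC */
	/* ...which an ns-oriented caller immediately undoes: */
	ns = ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;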
>> +struct vcpu_data {
>> + struct vpercpu_data vpercpu[NR_CPUS];
>> + unsigned int tsc_khz;
>
> I think this also duplicates some of the gtod_data stuff, at least in
> the case that TSC works for gtod.
>
vgtod.h seems to be clock source independent. This vsyscall is usable
only on systems with tsc_unstable == 0. Also, there is only one instance
of mult and shift there, whereas we save it on a per-cpu basis in the
VVAR page.
Thanks for the review and comments!
-Arun
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Andrew Lutomirski @ 2011-12-12 23:01 UTC (permalink / raw)
To: Arun Sharma
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz
On Mon, Dec 12, 2011 at 2:49 PM, Arun Sharma <asharma@fb.com> wrote:
>>
>> I also wonder if there is a problem with information leaking. If there
>> was an entire per-cpu vvar page (perhaps mapped differently on each
>> cpu) then nothing interesting would leak. But that could add
>> overhead.
>>
>
> The timing based attacks depend on the granularity of timestamps. I feel
> what's available here is too coarse grained to be useful. Happy to
> disable the code at compile time for those cases. Are there
> CONFIG_HIGH_SECURITY type of options I could use for this purpose?
It allows anyone to detect with very high precision when context
switches happen on another CPU. This sounds a little dangerous. I
don't know if a config option is the right choice.
>
>>> notrace static long vdso_fallback_gettime(long clock, struct timespec
>>> *ts)
>>> {
>>> long ret;
>>> - asm("syscall" : "=a" (ret) :
>>> - "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
>>> + asm volatile("syscall" : "=a" (ret) :
>>> + "0" (__NR_clock_gettime),"D" (clock), "S" (ts) :
>>> "memory");
>>> return ret;
>>> }
>>
>>
>> Huh? This should probably be a separate patch (and probably not a
>> -stable candidate, since it would take amazing compiler stupidity to
>> generate anything other than the obvious code). The memory clobber
>> may also be enough to make this officially safe.
>
>
> Yes - this should be a separate patch. gcc-4.4 likes to get rid of the
> instruction in __do_thread_cpu_time without the asm volatile (in spite of
> the memory clobber).
>
gcc 4.4 caught a genuine bug in your code. You ignored the return
value (which is an output constraint), so you weren't using any
outputs and gcc omitted the apparently useless code. The right fix is
to check the return value. (And to consider adding the missing
volatile.)
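Something like this, as a sketch (the error policy is a guess):

	if (vp->tsc_unstable) {
		struct timespec ts;

		if (vdso_fallback_gettime(CLOCK_THREAD_CPUTIME_ID, &ts))
			return 0;
		return timespec_to_ns(&ts);
	}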
>
>
>>> + if (vp->tsc_unstable) {
>>> + struct timespec ts;
>>> + vdso_fallback_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>>> + return timespec_to_ns(&ts);
>>> + }
>>
>>
>> Yuck -- another flag indicating whether we're using the tsc.
>
>
> I renamed it to thread_cputime_disabled to deal with NR_CPUS > 64.
>
Still yuck. IMO this should always work.
>
>>> --- a/arch/x86/vdso/vdso.lds.S
>>> +++ b/arch/x86/vdso/vdso.lds.S
>>> @@ -25,6 +25,8 @@ VERSION {
>>> __vdso_getcpu;
>>> time;
>>> __vdso_time;
>>> + thread_cpu_time;
>>> + __vdso_thread_cpu_time;
>>> local: *;
>>> };
>>> }
>>
>>
>> Why do we have the non-__vdso versions? IMO anything that actually
>> uses them is likely to be confused. They have the same names as the
>> glibc wrappers, but the glibc wrappers have different return value and
>> errno semantics. I'd say just use __vdso.
>
>
> I'm not familiar with the history of __vdso_foo vs foo. Happy to get rid of
> the non __vdso versions.
>
I'm not either :) I'm just the most recent person to have made major
changes to this code.
>
>>
>> Also, what's wrong with just adding the functionality to
>> __vdso_clock_gettime?
>>
>
> Performance. Most of this is client dependent (whether it expects time in
> nanoseconds or timespec).
>
>
> Baseline: 19.34 secs
> vclock_gettime: 4.74 secs
> thread_cpu_time: 3.62 secs
>
> Our use case prefers nanoseconds, so the conversion to timespec and back
> adds overhead. Also, having a direct entry point vs going through the switch
> in clock_gettime() adds some cycles.
>
> I have some people here asking me if they could get CLOCK_REALTIME and
> CLOCK_MONOTONIC also in nanoseconds for the same reason.
I'd support a clock_gettime_nsec that supported all the clocks with
sensible semantics. Perhaps a pair of functions:
clock_gettime_nsec_base that returns an offset, shifted right by 2^63
(63 is somewhat arbitrary. 32 would work just as well, but 64 would
be more awkward to use) with a "guarantee" that the value will not
change until a reboot unless the system is up for a really long time
such that clock_gettime_nsec actually wraps. clock_gettime_nsec would
then return an offset.
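In prototype form, something like (hypothetical signatures):

	/* stable until reboot, barring multi-century uptimes */
	unsigned long long clock_gettime_nsec_base(clockid_t clock);
	/* nanoseconds, interpreted relative to the base above */
	unsigned long long clock_gettime_nsec(clockid_t clock);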
>
>
>
>>> +struct vcpu_data {
>>> + struct vpercpu_data vpercpu[NR_CPUS];
>>> + unsigned int tsc_khz;
>>
>>
>> I think this also duplicates some of the gtod_data stuff, at least in
>> the case that TSC works for gtod.
>>
>
> vgtod.h seems to be clock source independent. This vsyscall is usable only
> on systems with tsc_unstable == 0. Also, there is only one instance of mult
> and shift there, whereas we save it on a per-cpu basis in the VVAR page.
Sorry, I was unclear. gtod_data contains vclock_mode, which will tell
you whether the tsc is usable. I have a buggy computer (I'm typing
this email on it) that has a TSC that is only useful per-process. I
don't think it's worth supporting this particular case, since it's a
bios bug and needs fixing by the vendor. It's hopefully rare.
--Andy
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: john stultz @ 2011-12-12 23:09 UTC (permalink / raw)
To: Arun Sharma
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, Andy Lutomirski
On Mon, 2011-12-12 at 11:36 -0800, Arun Sharma wrote:
> From: Kumar Sundararajan <kumar@fb.com>
>
> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
> via a new vsyscall. We also add a direct vsyscall that returns
> time in ns (RFC: the direct vsyscall doesn't have a corresponding
> regular syscall, although clock_gettime() is pretty close).
I'm still not super psyched about providing a vdso-only API.
If a nanosecond interface like thread_cpu_time() is actually a big win
over clock_gettime(CLOCK_THREAD_CPUTIME,...) it seems it should have its
own syscall as well, no?
Possibly something like clock_gettime_ns(), which would return the same
values as clock_gettime() but in nanoseconds rather than a timespec?
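For concreteness, a hypothetical signature:

	/* same clockids as clock_gettime(), result in nanoseconds */
	long clock_gettime_ns(clockid_t which_clock, u64 *ns);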
thanks
-john
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-12 23:20 UTC (permalink / raw)
To: john stultz
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, Andy Lutomirski
On 12/12/11 3:09 PM, john stultz wrote:
> On Mon, 2011-12-12 at 11:36 -0800, Arun Sharma wrote:
>> From: Kumar Sundararajan <kumar@fb.com>
>>
>> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
>> via a new vsyscall. We also add a direct vsyscall that returns
>> time in ns (RFC: the direct vsyscall doesn't have a corresponding
>> regular syscall, although clock_gettime() is pretty close).
>
> I'm still not super psyched about providing a vdso-only API.
>
> If a nanosecond interface like thread_cpu_time() is actually a big win
> over clock_gettime(CLOCK_THREAD_CPUTIME,...) it seems it should have its
> own syscall as well, no?
The win is relatively small when we're dealing with syscalls. But with
vsyscalls, it starts showing up in micro benchmarks.
Happy to post patches for regular syscalls (assuming I can get them
allocated :).
>
> Possibly something like clock_gettime_ns(), which would return the same
> values as clock_gettime() but in nanoseconds rather than a timespec?
>
If we're doing non-POSIXy things there, how about allocating one syscall
per clock instead of multiplexing them through a single syscall?
This would be a nice to have (clock_gettime_ns() should get us most of
the perf benefit).
-Arun
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: john stultz @ 2011-12-12 23:32 UTC (permalink / raw)
To: Arun Sharma
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, Andy Lutomirski
On Mon, 2011-12-12 at 15:20 -0800, Arun Sharma wrote:
> On 12/12/11 3:09 PM, john stultz wrote:
> > On Mon, 2011-12-12 at 11:36 -0800, Arun Sharma wrote:
> >> From: Kumar Sundararajan <kumar@fb.com>
> >>
> >> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
> >> via a new vsyscall. We also add a direct vsyscall that returns
> >> time in ns (RFC: the direct vsyscall doesn't have a corresponding
> >> regular syscall, although clock_gettime() is pretty close).
> >
> > I'm still not super psyched about providing a vdso-only API.
> >
> > If a nanosecond interface like thread_cpu_time() is actually a big win
> > over clock_gettime(CLOCK_THREAD_CPUTIME,...) it seems it should have its
> > own syscall as well, no?
>
> The win is relatively small when we're dealing with syscalls. But with
> vsyscalls, it starts showing up in micro benchmarks.
>
> Happy to post patches for regular syscalls (assuming I can get them
> allocated :).
>
> >
> > Possibly something like clock_gettime_ns(), which would return the same
> > values as clock_gettime() but in nanoseconds rather than a timespec?
> >
>
> If we're doing non-POSIXy things there, how about allocating one syscall
> per clock instead of multiplexing them through a single syscall?
>
> This would be a nice to have (clock_gettime_ns() should get us most of
> the perf benefit).
Well, it makes it a little easier to extend if we get a new clockid,
rather than having to add a whole new syscall. Keeps parity between the
timespec and ns interfaces.
Is it just that you're concerned about the clockid switch costs being
too high?
thanks
-john
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Andrew Lutomirski @ 2011-12-12 23:41 UTC (permalink / raw)
To: john stultz
Cc: Arun Sharma, linux-kernel, Kumar Sundararajan, Peter Zijlstra,
Ingo Molnar, Thomas Gleixner
On Mon, Dec 12, 2011 at 3:32 PM, john stultz <johnstul@us.ibm.com> wrote:
> On Mon, 2011-12-12 at 15:20 -0800, Arun Sharma wrote:
>> On 12/12/11 3:09 PM, john stultz wrote:
>> > On Mon, 2011-12-12 at 11:36 -0800, Arun Sharma wrote:
>> >> From: Kumar Sundararajan <kumar@fb.com>
>> >>
>> >> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
>> >> via a new vsyscall. We also add a direct vsyscall that returns
>> >> time in ns (RFC: the direct vsyscall doesn't have a corresponding
>> >> regular syscall, although clock_gettime() is pretty close).
>> >
>> > I'm still not super psyched about providing a vdso-only API.
>> >
>> > If a nanosecond interface like thread_cpu_time() is actually a big win
>> > over clock_gettime(CLOCK_THREAD_CPUTIME,...) it seems it should have its
>> > own syscall as well, no?
>>
>> The win is relatively small when we're dealing with syscalls. But with
>> vsyscalls, it starts showing up in micro benchmarks.
>>
>> Happy to post patches for regular syscalls (assuming I can get them
>> allocated :).
>>
>> >
>> > Possibly something like clock_gettime_ns(), which would return the same
>> > values as clock_gettime() but in nanoseconds rather than a timespec?
>> >
>>
>> If we're doing non-POSIXy things there, how about allocating one syscall
>> per clock instead of multiplexing them through a single syscall?
>>
>> This would be a nice to have (clock_gettime_ns() should get us most of
>> the perf benefit).
>
> Well, it makes it a little easier to extend if we get a new clockid,
> rather than having to add a whole new syscall. Keeps parity between the
> timespec and ns interfaces.
>
> Is it just that you're concerned about the clockid switch costs being
> too high?
From my (old, from memory) measurements, the switch cost is very very
low if it's predicted correctly, and the main case where it's likely
to be mispredicted is when it hasn't been called in awhile, in which
case either probably no one cares or the cache misses will dominate.
How worried are you about introducing a year 2554 bug? Python says:
>>> dateutil.parser.parse('1/1/1970') + datetime.timedelta(seconds = 2**64 // 1000000000)
datetime.datetime(2554, 7, 21, 23, 34, 33)
--Andy
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: john stultz @ 2011-12-12 23:52 UTC (permalink / raw)
To: Andrew Lutomirski
Cc: Arun Sharma, linux-kernel, Kumar Sundararajan, Peter Zijlstra,
Ingo Molnar, Thomas Gleixner
On Mon, 2011-12-12 at 15:41 -0800, Andrew Lutomirski wrote:
> On Mon, Dec 12, 2011 at 3:32 PM, john stultz <johnstul@us.ibm.com> wrote:
> > Is it just that you're concerned about the clockid switch costs being
> > too high?
>
> From my (old, from memory) measurements, the switch cost is very very
> low if it's predicted correctly, and the main case where it's likely
> to be mispredicted is when it hasn't been called in awhile, in which
> case either probably no one cares or the cache misses will dominate.
>
> How worried are you about introducing a year 2554 bug? Python says:
>
> >>> dateutil.parser.parse('1/1/1970') + datetime.timedelta(seconds = 2**64 // 1000000000)
> datetime.datetime(2554, 7, 21, 23, 34, 33)
Not very. :)
Worst case, the syscall can expose a nsec_t or something that can be
bumped to u128 when that becomes common.
thanks
-john
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-13 0:26 UTC (permalink / raw)
To: john stultz
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, Andy Lutomirski
On 12/12/11 3:32 PM, john stultz wrote:
> Is it just that you're concerned about the clockid switch costs being
> too high?
Yeah - but I have no real-world data to back it up (I don't think micro
benchmarks are a good way to analyze this).
clock_gettime_ns() may just be the right answer here.
-Arun
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-13 0:40 UTC (permalink / raw)
To: Andrew Lutomirski
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz
On 12/12/11 3:01 PM, Andrew Lutomirski wrote:
>> The timing based attacks depend on the granularity of timestamps. I feel
>> what's available here is too coarse grained to be useful. Happy to
>> disable the code at compile time for those cases. Are there
>> CONFIG_HIGH_SECURITY type of options I could use for this purpose?
>
> It allows anyone to detect with very high precision when context
> switches happen on another CPU. This sounds a little dangerous. I
> don't know if a config option is the right choice.
Minor nit: attacker gets to see sum_exec_runtime - sched_clock(), not
sched_clock() directly. It might still be equally damaging (assuming the
attacker can work out sum_exec_runtime by observing the VVAR page).
Either way, I think it's important to get to the bottom of this.
Conversely, we might want to consider enabling things like this only
under CONFIG_I_TRUST_STUFF_RUNNING_ON_MY_MACHINE.
>> Yes - this should be a separate patch. gcc-4.4 likes to get rid of the
>> instruction in __do_thread_cpu_time without the asm volatile (in spite of
>> the memory clobber).
>>
>
> gcc 4.4 caught a genuine bug in your code. You ignored the return
> value (which is an output constraint), so you weren't using any
> outputs and gcc omitted the apparently useless code. The right fix is
> to check the return value. (And to consider adding the missing
> volatile.)
Yes - there are two bugs here.
>
>>
>>
>>>> + if (vp->tsc_unstable) {
>>>> + struct timespec ts;
>>>> + vdso_fallback_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>>>> + return timespec_to_ns(&ts);
>>>> + }
>>>
>>>
>>> Yuck -- another flag indicating whether we're using the tsc.
>>
>>
>> I renamed it to thread_cputime_disabled to deal with NR_CPUS > 64.
>>
>
> Still yuck. IMO this should always work.
>
Yes - but boot-time VVAR page allocation is going to take me some time
to implement. In the meantime I was merely trying to ensure that
compilation doesn't break for unsupported configs and the app doesn't
get SIGILL when running on old 64-bit CPUs.
[..]
> Sorry, I was unclear. gtod_data contains vclock_mode, which will tell
> you whether the tsc is usable. I have a buggy computer (I'm typing
> this email on it) that has a TSC that is only useful per-process. I
> don't think it's worth supporting this particular case, since it's a
> bios bug and needs fixing by the vendor. It's hopefully rare.
Ah yes. Will make this change as well.
-Arun
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Peter Zijlstra @ 2011-12-13 8:52 UTC (permalink / raw)
To: Arun Sharma
Cc: Eric Dumazet, linux-kernel, Kumar Sundararajan, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
On Mon, 2011-12-12 at 14:14 -0800, Arun Sharma wrote:
> > But really most modern distros have NR_CPUS > 64
>
> Is 256 the new 64? :)
4096 in fact.
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-13 18:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Eric Dumazet, linux-kernel, Kumar Sundararajan, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
On 12/13/11 12:52 AM, Peter Zijlstra wrote:
> On Mon, 2011-12-12 at 14:14 -0800, Arun Sharma wrote:
>
>>> But really most modern distros have NR_CPUS > 64
>>
>> Is 256 the new 64? :)
>
> 4096 in fact.
Not sure if you're asking for vsyscall support for the NR_CPUS=4096 case,
or whether a compile fix + disabled vsyscall is ok.
-Arun
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Peter Zijlstra @ 2011-12-13 18:53 UTC (permalink / raw)
To: Arun Sharma
Cc: Eric Dumazet, linux-kernel, Kumar Sundararajan, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
On Tue, 2011-12-13 at 10:15 -0800, Arun Sharma wrote:
> On 12/13/11 12:52 AM, Peter Zijlstra wrote:
> > On Mon, 2011-12-12 at 14:14 -0800, Arun Sharma wrote:
> >
> >>> But really most modern distros have NR_CPUS > 64
> >>
> >> Is 256 the new 64? :)
> >
> > 4096 in fact.
>
> Not sure if you're asking for vsyscall support for the NR_CPUS=4096 case,
> or whether a compile fix + disabled vsyscall is ok.
I'm saying distros tend to compile with NR_CPUS=4096, so if you
auto-disable the feature for that, it effectively doesn't exist.
* [PATCH 0/2] Add a thread cpu time implementation to vDSO (v2)
From: Arun Sharma @ 2011-12-19 19:51 UTC (permalink / raw)
To: linux-kernel; +Cc: Arun Sharma
Changes since last rev:
* Drop the direct vsyscall (hope clock_gettime_ns() addresses the gap)
* Add a compile time check for vvar page overflow
* Handle return value from the syscall
* Pack the VVAR page more tightly
* Handle NR_CPUS > 64
Arun Sharma (1):
Extend VVAR support to multiple pages
Kumar Sundararajan (1):
Add a thread cpu time implementation to vDSO
Documentation/cgroups/timer_slack.txt | 81 ----------------
drivers/Makefile | 3 -
drivers/fb/Makefile | 1 -
drivers/fb/tsc/Makefile | 2 -
drivers/fb/tsc/tsc.c | 29 ------
fs/select.c | 7 +-
include/linux/cgroup_subsys.h | 7 --
include/linux/prctl.h | 6 -
include/linux/sched.h | 11 --
include/linux/topology.h | 2 -
init/Kconfig | 8 --
kernel/Makefile | 1 -
kernel/cgroup_timer_slack.c | 169 ---------------------------------
kernel/fork.c | 4 -
kernel/futex.c | 4 +-
kernel/hrtimer.c | 2 +-
kernel/sched_fair.c | 54 -----------
kernel/sys.c | 3 -
18 files changed, 8 insertions(+), 386 deletions(-)
delete mode 100644 Documentation/cgroups/timer_slack.txt
delete mode 100644 drivers/fb/Makefile
delete mode 100644 drivers/fb/tsc/Makefile
delete mode 100644 drivers/fb/tsc/tsc.c
delete mode 100644 kernel/cgroup_timer_slack.c
--
1.7.4
* [PATCH 1/2] Extend VVAR support to multiple pages
From: Arun Sharma @ 2011-12-19 19:51 UTC (permalink / raw)
To: linux-kernel
Cc: Arun Sharma, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski
Signed-off-by: Arun Sharma <asharma@fb.com>
Cc: Kumar Sundararajan <kumar@fb.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Andy Lutomirski <luto@MIT.EDU>
Cc: linux-kernel@vger.kernel.org
---
arch/x86/include/asm/fixmap.h | 1 +
arch/x86/include/asm/vvar.h | 3 ++-
arch/x86/kernel/vmlinux.lds.S | 2 +-
arch/x86/kernel/vsyscall_64.c | 1 +
4 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 460c74e..6441527 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,6 +78,7 @@ enum fixed_addresses {
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VVAR_PAGE2,
VVAR_PAGE,
VSYSCALL_HPET,
#endif
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index de656ac..0fd7a4a 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -17,7 +17,8 @@
*/
/* Base address of vvars. This is not ABI. */
-#define VVAR_ADDRESS (-10*1024*1024 - 4096)
+#define VVAR_NUM_PAGES 2
+#define VVAR_ADDRESS (-10*1024*1024 - VVAR_NUM_PAGES * 4096)
#if defined(__VVAR_KERNEL_LDS)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0f703f1..70c52de 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -171,7 +171,7 @@ SECTIONS
} :data
- . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+ . = ALIGN(__vvar_page + VVAR_NUM_PAGES*PAGE_SIZE, PAGE_SIZE);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e4d4a22..7960d3a 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -282,6 +282,7 @@ void __init map_vsyscall(void)
(unsigned long)VSYSCALL_START);
__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+ __set_fixmap(VVAR_PAGE2, physaddr_vvar_page + PAGE_SIZE, PAGE_KERNEL_VVAR);
BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
(unsigned long)VVAR_ADDRESS);
}
--
1.7.4
* [PATCH 2/2] Add a thread cpu time implementation to vDSO
From: Arun Sharma @ 2011-12-19 19:51 UTC (permalink / raw)
To: linux-kernel
Cc: Kumar Sundararajan, Arun Sharma, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski, Eric Dumazet
From: Kumar Sundararajan <kumar@fb.com>
This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
We use the following method to compute the thread cpu time:
t0 = process start
t1 = most recent context switch time
t2 = time at which the vsyscall is invoked
thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
= current->se.sum_exec_runtime + now - sched_clock()
At context-switch time we stash away
adj_sched_time = sum_exec_runtime - sched_clock()
in a per-cpu struct in the VVAR page (which has now been extended
to two pages) and then compute
thread_cpu_time = adj_sched_time + now
All computations are done in nanoseconds on systems where the TSC is stable.
If the TSC is unstable, we fall back to a regular syscall.
Benchmark data:
Baseline:
for (i = 0; i < 100000000; i++) {
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
}
vclock_gettime:
vclock_gettime = dlsym(vdso, "__vdso_clock_gettime");
for (i = 0; i < 100000000; i++) {
(*vclock_gettime)(CLOCK_THREAD_CPUTIME_ID, &ts);
sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
}
Baseline: 19.34 secs
vclock_gettime: 4.74 secs
This should speed up profilers that need to query thread CPU time
frequently to generate fine-grained timestamps.
No statistically significant regression was detected in the x86_64
context-switch code. Most archs that don't support vsyscalls will
have this code disabled via jump labels.
Signed-off-by: Kumar Sundararajan <kumar@fb.com>
Signed-off-by: Arun Sharma <asharma@fb.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Andy Lutomirski <luto@MIT.EDU>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: linux-kernel@vger.kernel.org
---
arch/x86/include/asm/timer.h | 18 +++++++++++++-----
arch/x86/include/asm/vvar.h | 1 +
arch/x86/kernel/tsc.c | 16 ++++++++++++++++
arch/x86/kernel/vsyscall_64.c | 1 +
arch/x86/vdso/vclock_gettime.c | 36 ++++++++++++++++++++++++++++++++++--
arch/x86/vdso/vma.c | 5 +++++
include/linux/jiffies.h | 16 ++++++++++++++++
kernel/sched.c | 8 ++++++++
8 files changed, 94 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 431793e..99a3670 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -55,19 +55,27 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+static inline unsigned long long ___cycles_2_ns(unsigned long long cyc,
+ unsigned long long scale,
+ unsigned long long offset)
{
unsigned long long quot;
unsigned long long rem;
- int cpu = smp_processor_id();
- unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
+ unsigned long long ns = offset;
quot = (cyc >> CYC2NS_SCALE_FACTOR);
rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
- ns += quot * per_cpu(cyc2ns, cpu) +
- ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
+ ns += quot * scale + ((rem * scale) >> CYC2NS_SCALE_FACTOR);
return ns;
}
+static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+{
+ int cpu = smp_processor_id();
+ unsigned long long offset = per_cpu(cyc2ns_offset, cpu);
+ unsigned long long scale = per_cpu(cyc2ns, cpu);
+ return ___cycles_2_ns(cyc, scale, offset);
+}
+
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
unsigned long long ns;
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 0fd7a4a..6710e2a 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -47,5 +47,6 @@
DECLARE_VVAR(0, volatile unsigned long, jiffies)
DECLARE_VVAR(16, int, vgetcpu_mode)
DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+DECLARE_VVAR(256, struct vcpu_data, vcpu_data)
#undef DECLARE_VVAR
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db48336..8e9f52c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -623,6 +623,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
if (cpu_khz) {
*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
*offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+ vcpu_data.vpercpu[cpu].cyc2ns = *scale;
+ vcpu_data.vpercpu[cpu].cyc2ns_offset = *offset;
}
sched_clock_idle_wakeup_event(0);
@@ -786,6 +788,8 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
sched_clock_stable = 0;
disable_sched_clock_irqtime();
+ vcpu_data.thread_cputime_disabled = 1;
+ jump_label_dec(&vcpu_data_enabled);
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
@@ -943,6 +947,15 @@ static int __init init_tsc_clocksource(void)
*/
device_initcall(init_tsc_clocksource);
+/* Should be optimized away at compile time */
+static noinline int check_vvar_overflow(void)
+{
+ long vcpu_data_offset = (long) vvaraddr_vcpu_data - VVAR_ADDRESS;
+ size_t size = (vcpu_data_offset + sizeof(vcpu_data)
+ + sizeof(struct vpercpu_data) * CONFIG_NR_CPUS);
+ return (size > VVAR_NUM_PAGES * PAGE_SIZE);
+}
+
void __init tsc_init(void)
{
u64 lpj;
@@ -979,6 +992,9 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
+ vcpu_data.tsc_khz = tsc_khz;
+ vcpu_data.thread_cputime_disabled = check_vvar_overflow()
+ || !cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP);
if (!no_sched_irq_time)
enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7960d3a..cdfcedf 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -56,6 +56,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
+DEFINE_VVAR(struct vcpu_data, vcpu_data);
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6bc0e72..14d466a 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -18,6 +18,7 @@
#include <asm/vsyscall.h>
#include <asm/fixmap.h>
#include <asm/vgtod.h>
+#include <asm/timer.h>
#include <asm/timex.h>
#include <asm/hpet.h>
#include <asm/unistd.h>
@@ -154,8 +155,32 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
return 0;
}
+notrace static noinline unsigned long do_thread_cpu_time(void)
+{
+ unsigned int p;
+ u_int64_t tscval;
+ unsigned long long adj_sched_time, scale, offset;
+ const struct vcpu_data *vp = &VVAR(vcpu_data);
+ int cpu;
+
+ do {
+ native_read_tscp(&p);
+ cpu = p & 0xfff;
+ adj_sched_time = vp->vpercpu[cpu].adj_sched_time;
+ scale = vp->vpercpu[cpu].cyc2ns;
+ offset = vp->vpercpu[cpu].cyc2ns_offset;
+ rdtscpll(tscval, p);
+ cpu = p & 0xfff;
+ } while (unlikely(adj_sched_time != vp->vpercpu[cpu].adj_sched_time));
+
+ return ___cycles_2_ns(tscval, scale, offset) + adj_sched_time;
+}
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
+ unsigned long ns;
+ const struct vcpu_data *vp = &VVAR(vcpu_data);
+
switch (clock) {
case CLOCK_REALTIME:
if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
@@ -169,6 +194,13 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
return do_realtime_coarse(ts);
case CLOCK_MONOTONIC_COARSE:
return do_monotonic_coarse(ts);
+ case CLOCK_THREAD_CPUTIME_ID:
+ if (vp->thread_cputime_disabled)
+ break;
+ ns = do_thread_cpu_time();
+ ts->tv_sec = 0;
+ timespec_add_ns(ts, ns);
+ return 0;
}
return vdso_fallback_gettime(clock, ts);
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c..8b7630e 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
+#include <linux/jump_label.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
@@ -24,6 +25,8 @@ extern unsigned short vdso_sync_cpuid;
extern struct page *vdso_pages[];
static unsigned vdso_size;
+struct jump_label_key vcpu_data_enabled;
+
static void __init patch_vdso(void *vdso, size_t len)
{
Elf64_Ehdr *hdr = vdso;
@@ -66,6 +69,8 @@ static int __init init_vdso(void)
for (i = 0; i < npages; i++)
vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
+ if (!vcpu_data.thread_cputime_disabled)
+ jump_label_inc(&vcpu_data_enabled);
return 0;
}
subsys_initcall(init_vdso);
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 265e2c3..d44e6aa 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -312,4 +312,20 @@ extern unsigned long nsecs_to_jiffies(u64 n);
#define TIMESTAMP_SIZE 30
+struct vpercpu_data {
+ unsigned long long adj_sched_time;
+ unsigned long long cyc2ns_offset;
+ unsigned long cyc2ns;
+} ____cacheline_aligned;
+
+struct vcpu_data {
+ unsigned int tsc_khz;
+ unsigned int thread_cputime_disabled;
+ struct vpercpu_data vpercpu[0];
+};
+extern struct vcpu_data vcpu_data;
+
+struct jump_label_key;
+extern struct jump_label_key vcpu_data_enabled;
+
#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index d6b149c..b738dd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3216,6 +3216,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
kprobe_flush_task(prev);
put_task_struct(prev);
}
+
+#if CONFIG_NR_CPUS <= 64
+ if (static_branch(&vcpu_data_enabled)) {
+ int cpu = smp_processor_id();
+ vcpu_data.vpercpu[cpu].adj_sched_time =
+ current->se.sum_exec_runtime - sched_clock();
+ }
+#endif
}
#ifdef CONFIG_SMP
--
1.7.4
* Re: [PATCH 2/2] Add a thread cpu time implementation to vDSO
2011-12-19 19:51 ` [PATCH 2/2] Add a thread cpu time implementation to vDSO Arun Sharma
@ 2011-12-19 19:58 ` Arun Sharma
0 siblings, 0 replies; 23+ messages in thread
From: Arun Sharma @ 2011-12-19 19:58 UTC (permalink / raw)
To: Arun Sharma
Cc: linux-kernel, Kumar Sundararajan, Peter Zijlstra, Ingo Molnar,
Thomas Gleixner, john stultz, Andy Lutomirski, Eric Dumazet
On 12/19/11 11:51 AM, Arun Sharma wrote:
> +/* Should be optimized away at compile time */
> +static noinline int check_vvar_overflow(void)
> +{
> + long vcpu_data_offset = (long) vvaraddr_vcpu_data - VVAR_ADDRESS;
> + size_t size = (vcpu_data_offset + sizeof(vcpu_data)
> + + sizeof(struct vpercpu_data) * CONFIG_NR_CPUS);
> + return (size > VVAR_NUM_PAGES * PAGE_SIZE);
> +}
Forgot to drop the "noinline" part (which I had added for debug
purposes, so I could inspect the generated code). Fixed in my repo.
-Arun
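A build-time variant of the check is also possible; a sketch, with the
vcpu_data offset hardcoded as 256 to mirror DECLARE_VVAR(256, ...) in
vvar.h (the vvaraddr_* pointers are not integer constant expressions,
so they can't feed BUILD_BUG_ON() directly):
static inline void check_vvar_overflow(void)
{
	/* fails the build if vcpu_data spills past the VVAR pages */
	BUILD_BUG_ON(256 + sizeof(struct vcpu_data)
		     + sizeof(struct vpercpu_data) * CONFIG_NR_CPUS
		     > VVAR_NUM_PAGES * PAGE_SIZE);
}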