public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86, TSC: Add a software TSC offset
@ 2014-07-19 13:06 Borislav Petkov
  2014-07-19 13:28 ` Borislav Petkov
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: Borislav Petkov @ 2014-07-19 13:06 UTC (permalink / raw)
  To: Peter Zijlstra, Thomas Gleixner; +Cc: x86-ml, lkml, Steven Rostedt

From: Borislav Petkov <bp@suse.de>

There are machines which do have stable and always-running TSCs, but the
latter get started at different points in time by the platform, causing
the TSCs to have a small constant diff.

Resyncing those TSCs during the sync check has been tried a couple of
times, but the procedure is error-prone and flaky, and not 100%
successful.

So, instead of doing that, let's not touch the TSCs at all but save a
per-CPU TSC offset which we add to the TSC value we've read from the
Time-Stamp Counter. The hope is thus to still salvage the TSC on those
machines.

For that to work, we need to populate the TSC AUX MSR with the core ID
prior to doing the TSC sync check so that RDTSCP can give us the correct
core number and we can add the offset atomically. And yes, we need a
X86_FEATURE_RDTSCP CPU for the whole deal to work. Older ones simply
lose.

See also comment above tsc_sync.c::compute_tsc_offset() for more details.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/cpufeature.h |   1 +
 arch/x86/include/asm/tsc.h        |  19 +++++++
 arch/x86/kernel/cpu/common.c      |   2 +
 arch/x86/kernel/tsc_sync.c        | 117 ++++++++++++++++++++++++++++++++++----
 4 files changed, 129 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb9b258d60e7..8c27e55372fb 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -244,6 +244,7 @@
 #define X86_BUG_11AP		X86_BUG(5) /* Bad local APIC aka 11AP */
 #define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_TSC_OFFSET	X86_BUG(8) /* CPU has skewed but stable TSCs */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0e9cee..a91f439738f9 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -9,6 +9,9 @@
 #define NS_SCALE	10 /* 2^10, carefully chosen */
 #define US_SCALE	32 /* 2^32, arbitralrily chosen */
 
+
+DECLARE_PER_CPU(long long, tsc_offset);
+
 /*
  * Standard way to access the cycle counter.
  */
@@ -32,6 +35,22 @@ static inline cycles_t get_cycles(void)
 	return ret;
 }
 
+static inline cycles_t get_cycles_aux(void)
+{
+	unsigned long long ret = 0;
+	int cpu;
+
+#ifndef CONFIG_X86_TSC
+	if (!cpu_has_tsc)
+		return 0;
+#endif
+	if (static_cpu_has_safe(X86_FEATURE_RDTSCP)) {
+		rdtscpll(ret, cpu);
+		return ret + per_cpu(tsc_offset, cpu);
+	} else
+		return get_cycles();
+}
+
 static __always_inline cycles_t vget_cycles(void)
 {
 	/*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ce31eeada362..c67300281790 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -962,6 +962,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
 	numa_add_cpu(smp_processor_id());
 #endif
+	if (cpu_has(c, X86_FEATURE_RDTSCP))
+		write_rdtscp_aux(smp_processor_id());
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 26488487bc61..5c0d2eeb5e9b 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/nmi.h>
+#include <asm/msr-index.h>
 #include <asm/tsc.h>
 
 /*
@@ -28,6 +29,11 @@ static atomic_t start_count;
 static atomic_t stop_count;
 
 /*
+ * TSC offset helper counters.
+ */
+static atomic_t set_offset_on_target, offset_done;
+
+/*
  * We use a raw spinlock in this exceptional case, because
  * we want to have the fastest, inlined, non-debug version
  * of a critical section, to be able to prove TSC time-warps:
@@ -36,7 +42,9 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 
 static cycles_t last_tsc;
 static cycles_t max_warp;
-static int nr_warps;
+static int nr_warps, max_warp_cpu;
+
+DEFINE_PER_CPU(long long, tsc_offset) = { 0 };
 
 /*
  * TSC-warp measurement loop running on both CPUs:
@@ -47,7 +55,7 @@ static void check_tsc_warp(unsigned int timeout)
 	int i;
 
 	rdtsc_barrier();
-	start = get_cycles();
+	start = get_cycles_aux();
 	rdtsc_barrier();
 	/*
 	 * The measurement runs for 'timeout' msecs:
@@ -64,7 +72,7 @@ static void check_tsc_warp(unsigned int timeout)
 		arch_spin_lock(&sync_lock);
 		prev = last_tsc;
 		rdtsc_barrier();
-		now = get_cycles();
+		now = get_cycles_aux();
 		rdtsc_barrier();
 		last_tsc = now;
 		arch_spin_unlock(&sync_lock);
@@ -89,6 +97,10 @@ static void check_tsc_warp(unsigned int timeout)
 			arch_spin_lock(&sync_lock);
 			max_warp = max(max_warp, prev - now);
 			nr_warps++;
+
+			if (prev - now == max_warp)
+				max_warp_cpu = smp_processor_id();
+
 			arch_spin_unlock(&sync_lock);
 		}
 	}
@@ -116,6 +128,69 @@ static inline unsigned int loop_timeout(int cpu)
 	return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
 }
 
+static inline bool cpu_should_save_offset(int cpu)
+{
+	bool ret = static_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+		   static_cpu_has(X86_FEATURE_NONSTOP_TSC);
+
+	if (ret)
+		set_cpu_bug(&cpu_data(cpu), X86_BUG_TSC_OFFSET);
+
+	return ret;
+}
+
+/*
+ * We're saving a per-core TSC offset only on machines which have a
+ * stable and non-stop TSC but which, for some reason, start their TSCs
+ * on the different nodes at different points in time, thus causing a
+ * small constant diff between them.
+ *
+ * We do this during the TSC sync check which happens between a source
+ * and a target CPU. When we detect the diff, we hold the target CPU by
+ * _not_ incrementing stop_count. What we do instead is we send it into
+ * compute_tsc_offset() below and store the max_warp difference we have
+ * measured above in a per-cpu variable.
+ *
+ * We do pay attention to which CPU saw the max_warp by writing its
+ * number into max_warp_cpu so that we can compute whether the offset
+ * we're going to write into the target's TSC is positive or negative.
+ *
+ * It is positive when the target CPU's TSC has started later than the
+ * source CPU's TSC and thus has a smaller TSC value.
+ *
+ * It is negative when the target CPU's TSC has started earlier than the
+ * source CPU's TSC and thus has a higher TSC value.
+ *
+ * Once we've computed the offset, we let both CPUs do the usual
+ * TSC sync check again, taking the offset into account, see
+ * get_cycles_aux().
+ *
+ * Called on the target.
+ */
+static void compute_tsc_offset(int cpu)
+{
+	long long off;
+
+	/*
+	 * This CPU wrote last the max_warp above, means its TSC is smaller than
+	 * the source CPU which we're doing the sync check with.
+	 */
+	if (cpu == max_warp_cpu)
+		off =  max_warp;
+	else
+		off = -max_warp;
+
+	per_cpu(tsc_offset, cpu) = off;
+	pr_info("CPU%d, saved offset: %lld\n", cpu, off);
+
+	nr_warps = 0;
+	max_warp = 0;
+	last_tsc = 0;
+
+	atomic_inc(&offset_done);
+	atomic_set(&set_offset_on_target, 0);
+}
+
 /*
  * Source CPU calls into this - it waits for the freshly booted
  * target CPU to arrive and then starts the measurement:
@@ -138,6 +213,7 @@ void check_tsc_sync_source(int cpu)
 		return;
 	}
 
+restart_src:
 	/*
 	 * Reset it - in case this is a second bootup:
 	 */
@@ -155,15 +231,27 @@ void check_tsc_sync_source(int cpu)
 
 	check_tsc_warp(loop_timeout(cpu));
 
+	/*
+	 * Wait for target to finish measurement:
+	 */
 	while (atomic_read(&stop_count) != cpus-1)
 		cpu_relax();
 
+	/* Analyze measurement */
 	if (nr_warps) {
-		pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
-			smp_processor_id(), cpu);
-		pr_warning("Measured %Ld cycles TSC warp between CPUs, "
-			   "turning off TSC clock.\n", max_warp);
-		mark_tsc_unstable("check_tsc_sync_source failed");
+		if (cpu_should_save_offset(cpu) && !atomic_read(&offset_done)) {
+			pr_warn("TSCs of [CPU#%d -> CPU#%d] %lld cycles out of sync, saving offset.\n",
+				smp_processor_id(), cpu, max_warp);
+
+			atomic_set(&start_count, 0);
+			atomic_set(&set_offset_on_target, 1);
+
+			goto restart_src;
+		} else {
+			pr_warning("Measured %Ld(%d) cycles TSC warp between CPUs, "
+				   "turning off TSC clock.\n", max_warp, max_warp_cpu);
+			mark_tsc_unstable("check_tsc_sync_source failed");
+		}
 	} else {
 		pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
 			smp_processor_id(), cpu);
@@ -173,6 +261,7 @@ void check_tsc_sync_source(int cpu)
 	 * Reset it - just in case we boot another CPU later:
 	 */
 	atomic_set(&start_count, 0);
+	atomic_set(&offset_done, 0);
 	nr_warps = 0;
 	max_warp = 0;
 	last_tsc = 0;
@@ -188,11 +277,16 @@ void check_tsc_sync_source(int cpu)
  */
 void check_tsc_sync_target(void)
 {
+	int this_cpu = smp_processor_id();
 	int cpus = 2;
 
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;
 
+restart_tgt:
+	if (atomic_read(&set_offset_on_target))
+		compute_tsc_offset(this_cpu);
+
 	/*
 	 * Register this CPU's participation and wait for the
 	 * source CPU to start the measurement:
@@ -201,7 +295,7 @@ void check_tsc_sync_target(void)
 	while (atomic_read(&start_count) != cpus)
 		cpu_relax();
 
-	check_tsc_warp(loop_timeout(smp_processor_id()));
+	check_tsc_warp(loop_timeout(this_cpu));
 
 	/*
 	 * Ok, we are done:
@@ -211,6 +305,9 @@ void check_tsc_sync_target(void)
 	/*
 	 * Wait for the source CPU to print stuff:
 	 */
-	while (atomic_read(&stop_count) != cpus)
+	while (atomic_read(&stop_count) != cpus) {
+		if (atomic_read(&set_offset_on_target))
+			goto restart_tgt;
 		cpu_relax();
+	}
 }

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-19 13:06 [PATCH] x86, TSC: Add a software TSC offset Borislav Petkov
@ 2014-07-19 13:28 ` Borislav Petkov
  2014-07-21 19:34 ` Andy Lutomirski
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 20+ messages in thread
From: Borislav Petkov @ 2014-07-19 13:28 UTC (permalink / raw)
  To: Peter Zijlstra, Thomas Gleixner; +Cc: x86-ml, lkml, Steven Rostedt

On Sat, Jul 19, 2014 at 03:06:02PM +0200, Borislav Petkov wrote:
> From: Borislav Petkov <bp@suse.de>
> 
> There are machines which do have stable and always-running TSCs but the
> last get started at different points in time by the platform, causing
> the TSCs to have a small constant diff.
> 
> It has been tried a couple of times to resync those during that
> sync check but the procedure is error prone and flaky, and not 100%
> successful.
> 
> So, instead of doing that, let's not touch the TSCs at all but save a
> per-CPU TSC offset which we add to the TSC value we've read from the
> Time-Stamp Counter. The hope is thus to still salvage the TSC on those
> machines.
> 
> For that to work, we need to populate the TSC AUX MSR with the core ID
> prior to doing the TSC sync check so that RDTSCP can give us the correct
> core number and we can add the offset atomically. And yes, we need a
> X86_FEATURE_RDTSCP CPU for the whole deal to work. Older ones simply
> lose.
> 
> See also comment above tsc_sync.c::compute_tsc_offset() for more details.

And here's how it looks: I'm injecting a TSC diff locally because I
don't have a machine which has that problem; Peter has a WSM for that.

So here's the case where the target CPU has started its TSC earlier than
the source CPU:

[    0.264966] x86: Booting SMP configuration:
[    0.265151] .... node  #0, CPUs:      #1
[    0.281610] 1, tsc1: 37576107984
[    0.281611] updating with 600000

This is the error injection into the TSC of CPU1 with +600K cycles.

[    0.281990] 1, tsc2: 37576716684

...

[    0.284259] TSCs of [CPU#0 -> CPU#1] 599193 cycles out of sync, saving offset.
[    0.284756] CPU1, saved offset: -599193

We save a negative offset, and we also see the time it took us to do a
RMW on the TSC :-)

Then we run the sync test again, this time we read the TSC and add the
negative offset.

[    0.287156] TSC synchronization [CPU#0 -> CPU#1]: passed
[    0.287385] x86: Booted up 1 node, 2 CPUs



And now the case where the target CPU starts later than the source (I'd
expect this to be the common case):

[    0.264850] x86: Booting SMP configuration:
[    0.265036] .... node  #0, CPUs:      #1
[    0.281476] identify_cpu: Setting TSC_AUX MSR, cpu 1
[    0.281495] 1, tsc1: 56268738505
[    0.281497] updating with -12345678

injection

[    0.273772] 1, tsc2: 56256402112

...

[    0.284183] TSCs of [CPU#0 -> CPU#1] 12345363 cycles out of sync, saving offset.
[    0.276608] CPU1, saved offset: 12345363
[    0.287057] TSC synchronization [CPU#0 -> CPU#1]: passed
[    0.287288] x86: Booted up 1 node, 2 CPUs



We also state that we have this "workaround" enabled in /proc/cpuinfo:

processor       : 1

...

flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc extd_apicid aperfmperf pni monitor ssse3 cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch ibs skinit wdt arat hw_pstate npt lbrv svm_lock nrip_save pausefilter
bugs            : fxsave_leak tsc_offset
			      ^^^^^^^^^^

bogomips        : 3193.18
TLB size        : 1024 4K pages
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management: ts ttp tm stc 100mhzsteps hwpstate



The whole deal needs more testing now.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-19 13:06 [PATCH] x86, TSC: Add a software TSC offset Borislav Petkov
  2014-07-19 13:28 ` Borislav Petkov
@ 2014-07-21 19:34 ` Andy Lutomirski
  2014-07-21 21:35   ` Borislav Petkov
  2014-07-22  2:40 ` Steven Rostedt
  2014-07-22 12:05 ` Borislav Petkov
  3 siblings, 1 reply; 20+ messages in thread
From: Andy Lutomirski @ 2014-07-21 19:34 UTC (permalink / raw)
  To: Borislav Petkov, Peter Zijlstra, Thomas Gleixner
  Cc: x86-ml, lkml, Steven Rostedt

On 07/19/2014 06:06 AM, Borislav Petkov wrote:
> From: Borislav Petkov <bp@suse.de>
> 
> There are machines which do have stable and always-running TSCs but the
> last get started at different points in time by the platform, causing
> the TSCs to have a small constant diff.
> 
> It has been tried a couple of times to resync those during that
> sync check but the procedure is error prone and flaky, and not 100%
> successful.
> 
> So, instead of doing that, let's not touch the TSCs at all but save a
> per-CPU TSC offset which we add to the TSC value we've read from the
> Time-Stamp Counter. The hope is thus to still salvage the TSC on those
> machines.
> 
> For that to work, we need to populate the TSC AUX MSR with the core ID
> prior to doing the TSC sync check so that RDTSCP can give us the correct
> core number and we can add the offset atomically. And yes, we need a
> X86_FEATURE_RDTSCP CPU for the whole deal to work. Older ones simply
> lose.

I'm sure I'm missing something, but where is tsc_offset used outside the
tsc startup code?  Is it somehow getting programmed into the TSC config
registers?

--Andy

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 19:34 ` Andy Lutomirski
@ 2014-07-21 21:35   ` Borislav Petkov
  2014-07-21 21:41     ` Andy Lutomirski
  0 siblings, 1 reply; 20+ messages in thread
From: Borislav Petkov @ 2014-07-21 21:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 12:34:40PM -0700, Andy Lutomirski wrote:
> I'm sure I'm missing something, but where is tsc_offset used outside
> the tsc startup code? Is it somehow getting programmed into the TSC
> config registers?

Nah, we're programming the CPU number into the TSC_AUX MSR. However,
that approach might become obsolete for a simpler, preempt-disable
version which Peter suggested today. It should be comparatively cheap
and work on all CPUs, not only RDTSCP-supporting ones:

get_cycles() {
	preempt_disable();
	rdtscll() + this_cpu_read(tsc_offset);
	preempt_disable()
}

So, to answer your question, the TSC offset will be used in get_cycles(). So
even on machines with a skewed TSC, we will have a stable TSC counter by
adding the per-cpu offset each time we read the TSC.

We'll see how it plays out in testing. :)

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 21:35   ` Borislav Petkov
@ 2014-07-21 21:41     ` Andy Lutomirski
  2014-07-21 21:52       ` Borislav Petkov
  0 siblings, 1 reply; 20+ messages in thread
From: Andy Lutomirski @ 2014-07-21 21:41 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 2:35 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jul 21, 2014 at 12:34:40PM -0700, Andy Lutomirski wrote:
>> I'm sure I'm missing something, but where is tsc_offset used outside
>> the tsc startup code? Is it somehow getting programmed into the TSC
>> config registers?
>
> Nah, we're programming the CPU number into the TSC_AUX MSR. However,
> that approach might become obsolete for a simpler, preempt-disable
> version which Peter suggested today. It should be comparatively cheap
> and work on all CPUs, not only RDTSCP-supporting ones:
>
> get_cycles() {
>         preempt_disable();
>         rdtscll() + this_cpu_read(tsc_offset);
>         preempt_disable()
> }
>
> So, to answer your question, the TSC offset will be used in get_cycles(). So
> even on machines with a skewed TSC, we will have a stable TSC counter by
> adding the per-cpu offset each time we read the TSC.
>
> We'll see how it plays out in testing. :)

How will this be compatible with the vdso?

Also, IIRC, rdtscp does not need rdtsc_barrier(), whereas rdtsc does.
Getting this wrong will be a significant slowdown.

--Andy

>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 21:41     ` Andy Lutomirski
@ 2014-07-21 21:52       ` Borislav Petkov
  2014-07-21 21:56         ` Andy Lutomirski
  0 siblings, 1 reply; 20+ messages in thread
From: Borislav Petkov @ 2014-07-21 21:52 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 02:41:50PM -0700, Andy Lutomirski wrote:
> How will this be compatible with the vdso?

I've never thought about it yet. How compatible would you want it to be
and what do you expect from it?

Remember, this is only attempting to be a hardware workaround for a
smallish number of systems out there. Most of current machines should
have stable and synched TSCs.

> Also, IIRC, rdtscp does not need rdtsc_barrier(), whereas rdtsc does.
> Getting this wrong will be a significant slowdown.

This is too cryptic for me - get_cycles doesn't barrier around the TSC
now either. Again, we will most likely end up not using RDTSCP anyway.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 21:52       ` Borislav Petkov
@ 2014-07-21 21:56         ` Andy Lutomirski
  2014-07-21 22:06           ` Thomas Gleixner
                             ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Andy Lutomirski @ 2014-07-21 21:56 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 2:52 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jul 21, 2014 at 02:41:50PM -0700, Andy Lutomirski wrote:
>> How will this be compatible with the vdso?
>
> I've never thought about it yet. How compatible would you want it to be
> and what do you expect from it?

I expect that users of __vdso_clock_gettime (e.g. glibc) will get the
correct time :)  They use vread_tsc, and they can't use
preempt_disable, because they're in userspace.  They also can't
directly access per-cpu variables.

Turning off vdso tsc support on these machines would be an option.

>
> Remember, this is only attempting to be a hardware workaround for a
> smallish number of systems out there. Most of current machines should
> have stable and synched TSCs.

I actually own one of these systems.  It's a Sandy Bridge Core-i7
Extreme or something like that.

>
>> Also, IIRC, rdtscp does not need rdtsc_barrier(), whereas rdtsc does.
>> Getting this wrong will be a significant slowdown.
>
> This is too cryptic for me - get_cycles doesn't barrier around the TSC
> now either. Again, we will most likely end up not using RDTSCP anyway.

I wonder if that's a bug in get_cycles.

The basic issue is that rdtsc is not ordered with respect to nearby
loads, so it's fairly easy to see it behaving non-monotonically across
CPUs.  rdtscp is ordered, but it's a little slower.

--Andy

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 21:56         ` Andy Lutomirski
@ 2014-07-21 22:06           ` Thomas Gleixner
  2014-07-21 22:11             ` Thomas Gleixner
  2014-07-21 22:08           ` Borislav Petkov
  2014-07-22  7:57           ` Peter Zijlstra
  2 siblings, 1 reply; 20+ messages in thread
From: Thomas Gleixner @ 2014-07-21 22:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Peter Zijlstra, x86-ml, lkml, Steven Rostedt

On Mon, 21 Jul 2014, Andy Lutomirski wrote:
> On Mon, Jul 21, 2014 at 2:52 PM, Borislav Petkov <bp@alien8.de> wrote:
> > On Mon, Jul 21, 2014 at 02:41:50PM -0700, Andy Lutomirski wrote:
> >> How will this be compatible with the vdso?
> >
> > I've never thought about it yet. How compatible would you want it to be
> > and what do you expect from it?
> 
> I expect that users of __vdso_clock_gettime (e.g. glibc) will get the
> correct time :)  They use vread_tsc, and they can't use
> preempt_disable, because they're in userspace.  They also can't
> directly access per-cpu variables.
> 
> Turning off vdso tsc support on these machines would be an option.

Correct. 

So if we make the TSC usable that way we need to mark it unusable for
the VDSO and force vdso_*gettime* into the kernel. That will be way
better than forcing the whole machine onto HPET.

And once we are using the real syscall, the whole thing just works ...

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 21:56         ` Andy Lutomirski
  2014-07-21 22:06           ` Thomas Gleixner
@ 2014-07-21 22:08           ` Borislav Petkov
  2014-07-21 22:13             ` Andy Lutomirski
  2014-07-22  7:57           ` Peter Zijlstra
  2 siblings, 1 reply; 20+ messages in thread
From: Borislav Petkov @ 2014-07-21 22:08 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 02:56:49PM -0700, Andy Lutomirski wrote:
> I expect that users of __vdso_clock_gettime (e.g. glibc) will get the
> correct time :)  They use vread_tsc, and they can't use
> preempt_disable, because they're in userspace.  They also can't
> directly access per-cpu variables.
> 
> Turning off vdso tsc support on these machines would be an option.

Right, this is what I was going to propose to tglx on IRC. Or we can try
to come up with something working for the vdso too, say RDTSCP :-)

But that still won't work as it needs the per-cpu variables. So I guess
vdso loses...

> I actually own one of these systems.  It's a Sandy Bridge Core-i7
> Extreme or something like that.

Ha, cool, so I've got my tester! :-)

> I wonder if that's a bug in get_cycles.
> 
> The basic issue is that rdtsc is not ordered with respect to nearby
> loads, so it's fairly easy to see it behaving non-monotonically across
> CPUs.  rdtscp is ordered, but it's a little slower.

Yah, that I know. But I don't see get_cycles() having the barriers. So
it might be a bug. We can certainly try to "fix" it and see what happens
:-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0e9cee..ad7d5e449c0b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -27,7 +27,9 @@ static inline cycles_t get_cycles(void)
 	if (!cpu_has_tsc)
 		return 0;
 #endif
+	rdtsc_barrier();
 	rdtscll(ret);
+	rdtsc_barrier();
 
 	return ret;
 }

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 22:06           ` Thomas Gleixner
@ 2014-07-21 22:11             ` Thomas Gleixner
  2014-07-21 22:14               ` Andy Lutomirski
  0 siblings, 1 reply; 20+ messages in thread
From: Thomas Gleixner @ 2014-07-21 22:11 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Peter Zijlstra, x86-ml, lkml, Steven Rostedt

On Tue, 22 Jul 2014, Thomas Gleixner wrote:
> On Mon, 21 Jul 2014, Andy Lutomirski wrote:
> > On Mon, Jul 21, 2014 at 2:52 PM, Borislav Petkov <bp@alien8.de> wrote:
> > > On Mon, Jul 21, 2014 at 02:41:50PM -0700, Andy Lutomirski wrote:
> > >> How will this be compatible with the vdso?
> > >
> > > I've never thought about it yet. How compatible would you want it to be
> > > and what do you expect from it?
> > 
> > I expect that users of __vdso_clock_gettime (e.g. glibc) will get the
> > correct time :)  They use vread_tsc, and they can't use
> > preempt_disable, because they're in userspace.  They also can't
> > directly access per-cpu variables.
> > 
> > Turning off vdso tsc support on these machines would be an option.
> 
> Correct. 
> 
> So if we make the TSC usable that way we need to mark it unusable for
> the VDSO and force vdso_*gettime* into the kernel. That will be way
> better than forcing the whole machine onto HPET.
> 
> And once we are using the real syscall, the whole thing just works ...

So all it needs is to do

    clocksource_tsc.archdata.vclock_mode = VCLOCK_NONE;

if we do the magic per cpu offset trick to make the fcked up TSC
usable.

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 22:08           ` Borislav Petkov
@ 2014-07-21 22:13             ` Andy Lutomirski
  2014-07-21 22:30               ` Borislav Petkov
  2014-07-22  8:00               ` Peter Zijlstra
  0 siblings, 2 replies; 20+ messages in thread
From: Andy Lutomirski @ 2014-07-21 22:13 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 3:08 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jul 21, 2014 at 02:56:49PM -0700, Andy Lutomirski wrote:
>> I expect that users of __vdso_clock_gettime (e.g. glibc) will get the
>> correct time :)  They use vread_tsc, and they can't use
>> preempt_disable, because they're in userspace.  They also can't
>> directly access per-cpu variables.
>>
>> Turning off vdso tsc support on these machines would be an option.
>
> Right, this is what I was going to propose to tglx on IRC. Or we can try
> to come up with something working for the vdso too, say RDTSCP :-)
>
> But that still won't work as it needs the per-cpu variables. So I guess
> vdso loses...
>
>> I actually own one of these systems.  It's a Sandy Bridge Core-i7
>> Extreme or something like that.
>
> Ha, cool, so I've got my tester! :-)

Ha.  Ha ha.  Muahaha.  Because IIRC this box is synced until the first
time it suspends.

>
>> I wonder if that's a bug in get_cycles.
>>
>> The basic issue is that rdtsc is not ordered with respect to nearby
>> loads, so it's fairly easy to see it behaving non-monotonically across
>> CPUs.  rdtscp is ordered, but it's a little slower.
>
> Yah, that I know. But I don't see get_cycles() having the barriers. So
> it might be a bug. We can certainly try to "fix" it and see what happens
> :-)
>
> diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
> index 94605c0e9cee..ad7d5e449c0b 100644
> --- a/arch/x86/include/asm/tsc.h
> +++ b/arch/x86/include/asm/tsc.h
> @@ -27,7 +27,9 @@ static inline cycles_t get_cycles(void)
>         if (!cpu_has_tsc)
>                 return 0;
>  #endif
> +       rdtsc_barrier();
>         rdtscll(ret);
> +       rdtsc_barrier();
>

Only the first of these is necessary.  There was a long thread on this
a couple of years ago, and the conclusion was that the code in
vread_tsc in vclock_gettime.c is correct.

--Andy

>         return ret;
>  }
>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 22:11             ` Thomas Gleixner
@ 2014-07-21 22:14               ` Andy Lutomirski
  0 siblings, 0 replies; 20+ messages in thread
From: Andy Lutomirski @ 2014-07-21 22:14 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Borislav Petkov, Peter Zijlstra, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 3:11 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 22 Jul 2014, Thomas Gleixner wrote:
>> On Mon, 21 Jul 2014, Andy Lutomirski wrote:
>> > On Mon, Jul 21, 2014 at 2:52 PM, Borislav Petkov <bp@alien8.de> wrote:
>> > > On Mon, Jul 21, 2014 at 02:41:50PM -0700, Andy Lutomirski wrote:
>> > >> How will this be compatible with the vdso?
>> > >
>> > > I've never thought about it yet. How compatible would you want it to be
>> > > and what do you expect from it?
>> >
>> > I expect that users of __vdso_clock_gettime (e.g. glibc) will get the
>> > correct time :)  They use vread_tsc, and they can't use
>> > preempt_disable, because they're in userspace.  They also can't
>> > directly access per-cpu variables.
>> >
>> > Turning off vdso tsc support on these machines would be an option.
>>
>> Correct.
>>
>> So if we make the TSC usable that way we need to mark it unusable for
>> the VDSO and force vdso_*gettime* into the kernel. That will be way
>> better than forcing the whole machine onto HPET.
>>
>> And once we are using the real syscall, the whole thing just works ...
>
> So all it needs is to do
>
>     clocksource_tsc.archdata.vclock_mode = VCLOCK_NONE;
>
> if we do the magic per cpu offset trick to make the fcked up TSC
> usable.

Just make sure this happens every time the tsc clocksource is selected
-- it's possible to go back and forth.

>
> Thanks,
>
>         tglx
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 22:13             ` Andy Lutomirski
@ 2014-07-21 22:30               ` Borislav Petkov
  2014-07-21 22:43                 ` Andy Lutomirski
  2014-07-22  8:00               ` Peter Zijlstra
  1 sibling, 1 reply; 20+ messages in thread
From: Borislav Petkov @ 2014-07-21 22:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 03:13:33PM -0700, Andy Lutomirski wrote:
> Ha.  Ha ha.  Muahaha.  Because IIRC this box is synced until the first
> time it suspends.

Sweet, TSCs get fumbled in some S-state or maybe SMI... Who TF knows.

Well, I'm thinking upon resume, we run through smpboot which should do
the tsc sync check again. Will have to test to see.

> > diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
> > index 94605c0e9cee..ad7d5e449c0b 100644
> > --- a/arch/x86/include/asm/tsc.h
> > +++ b/arch/x86/include/asm/tsc.h
> > @@ -27,7 +27,9 @@ static inline cycles_t get_cycles(void)
> >         if (!cpu_has_tsc)
> >                 return 0;
> >  #endif
> > +       rdtsc_barrier();
> >         rdtscll(ret);
> > +       rdtsc_barrier();
> >
> 
> Only the first of these is necessary.  There was a long thread on this
> a couple of years ago, and the conclusion was that the code in
> vread_tsc in vclock_gettime.c is correct.

	 * The various CPU manuals are unclear
         * as to whether rdtsc can be reordered with later loads,
         * but no one has ever seen it happen.

until some future uarch proves you wrong. :-)

I guess we can try with one pre-fence only first although if we're doing
one already, I can't imagine the post-one to be all that expensive since
we've retired the whole inflight crap already anyway.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 22:30               ` Borislav Petkov
@ 2014-07-21 22:43                 ` Andy Lutomirski
  2014-07-21 23:01                   ` Borislav Petkov
  0 siblings, 1 reply; 20+ messages in thread
From: Andy Lutomirski @ 2014-07-21 22:43 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 3:30 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jul 21, 2014 at 03:13:33PM -0700, Andy Lutomirski wrote:
>> Ha.  Ha ha.  Muahaha.  Because IIRC this box is synced until the first
>> time it suspends.
>
> Sweet, TSCs get fumbled in some S-state or maybe SMI... Who TF knows.
>
> Well, I'm thinking upon resume, we run through smpboot which should do
> the tsc sync check again. Will have to test to see.

I have some reason to believe that this is almost an intentional bug
on the part of the BIOS vendor.

>
>> > diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
>> > index 94605c0e9cee..ad7d5e449c0b 100644
>> > --- a/arch/x86/include/asm/tsc.h
>> > +++ b/arch/x86/include/asm/tsc.h
>> > @@ -27,7 +27,9 @@ static inline cycles_t get_cycles(void)
>> >         if (!cpu_has_tsc)
>> >                 return 0;
>> >  #endif
>> > +       rdtsc_barrier();
>> >         rdtscll(ret);
>> > +       rdtsc_barrier();
>> >
>>
>> Only the first of these is necessary.  There was a long thread on this
>> a couple of years ago, and the conclusion was that the code in
>> vread_tsc in vclock_gettime.c is correct.
>
>          * The various CPU manuals are unclear
>          * as to whether rdtsc can be reordered with later loads,
>          * but no one has ever seen it happen.
>
> until some future uarch proves you wrong. :-)
>
> I guess we can try with one pre-fence only first although if we're doing
> one already, I can't imagine the post-one to be all that expensive since
> we've retired the whole inflight crap already anyway.
>

IIRC it was actually quite expensive, at least on Sandy Bridge.  Maybe
AMD is different.

Anyway, if some future uarch breaks this, I could resurrect my old
hack: do a TSC-dependent load prior to returning.  Loads are ordered,
and the hackish load can't be reordered wrt RDTSC due to
data-dependency, so we're in business :)

--Andy

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 22:43                 ` Andy Lutomirski
@ 2014-07-21 23:01                   ` Borislav Petkov
  0 siblings, 0 replies; 20+ messages in thread
From: Borislav Petkov @ 2014-07-21 23:01 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 03:43:36PM -0700, Andy Lutomirski wrote:
> I have some reason to believe that this is almost an intentional bug
> on the part of the BIOS vendor.

Hiding SMIs or some other dumb, I-know-how-to-do-stuff-better-than-you
BIOS gunk.

> IIRC it was actually quite expensive, at least on Sandy Bridge.

Hmm, strange. So after the first fence and when you get to retire the
second fence, you will have only the RDTSC eligible for retirement
and everything that's coming behind it in program order can still go
out-of-order. Unless the second fence flushes more stuff. I most likely
am missing something.

> Maybe AMD is different.
> 
> Anyway, if some future uarch breaks this, I could resurrect my old
> hack: do a TSC-dependent load prior to returning.  Loads are ordered,
> and the hackish load can't be reordered wrt RDTSC due to
> data-dependency, so we're in business :)

:-)

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-19 13:06 [PATCH] x86, TSC: Add a software TSC offset Borislav Petkov
  2014-07-19 13:28 ` Borislav Petkov
  2014-07-21 19:34 ` Andy Lutomirski
@ 2014-07-22  2:40 ` Steven Rostedt
  2014-07-22  8:59   ` Borislav Petkov
  2014-07-22 12:05 ` Borislav Petkov
  3 siblings, 1 reply; 20+ messages in thread
From: Steven Rostedt @ 2014-07-22  2:40 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml

Your patch inspired me to write this hack. I was curious to know how
the TSCs of my boxes were with respect to each other, and wanted to get
an idea. Maybe there's a better way, but I decided to waste an hour and
write this hack up.

Here's what it does. It creates the file /sys/kernel/debug/rdtsc_test,
and when you read it, it does some whacky things.

1) A table is set up with the number of possible CPUs. The table
consists of: index, TSC count, CPU.

2) An atomic variable is set to the number of online CPUs.

3) An IPI is sent to each of the other CPUs to run the test.

4) The test decrements the atomic, and then spins until it reaches zero.

5) The caller of smp_call_function() then calls the test itself, being
the last to decrement the counter causing it to go to zero and all CPUs
then fight for a spinlock.

6) When the spin lock is taken, it records which place it was in (order
of spinlock taken), and records its own TSC. Then it releases the lock.

7) It then records in the table where its position is, its TSC counter
and CPU number.


Finally, the read will show the results of the table. Looks something
like this:

# cat /debug/rdtsc_test 
    0)  1305910016816  (cpu:5)
    1)  1305910017550  (cpu:7)
    2)  1305910017712  (cpu:1)
    3)  1305910017910  (cpu:6)
    4)  1305910018042  (cpu:2)
    5)  1305910018226  (cpu:3)
    6)  1305910018416  (cpu:4)
    7)  1305910018540  (cpu:0)

As long as the TSC counts are in order of the index, the TSC is moving
forward nicely. If they are not in order, then the TSCs are not in sync.

Yes, this is a hack, but I think it's a somewhat useful hack.

Not-for-inclusion-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/Makefile     |    1 
 kernel/rdtsc_test.c |  127 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+)

Index: linux-trace.git/kernel/Makefile
===================================================================
--- linux-trace.git.orig/kernel/Makefile	2014-07-16 14:10:47.210980652 -0400
+++ linux-trace.git/kernel/Makefile	2014-07-21 18:23:23.990246141 -0400
@@ -28,6 +28,7 @@
 obj-y += irq/
 obj-y += rcu/
 
+obj-$(CONFIG_DEBUG_FS) += rdtsc_test.o
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
Index: linux-trace.git/kernel/rdtsc_test.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/rdtsc_test.c	2014-07-21 22:32:16.531878062 -0400
@@ -0,0 +1,127 @@
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+
+static unsigned int rdtsc_count __read_mostly;
+static unsigned int rdtsc_index;
+static u64 *rdtsc_counts;
+static unsigned int *rdtsc_cpus;
+
+static DEFINE_RAW_SPINLOCK(rdtsc_lock);
+static DEFINE_MUTEX(rdtsc_mutex);
+
+static atomic_t rdtsc_start;
+
+static void read_rdtsc_test(void *data)
+{
+	unsigned int idx;
+	u64 cnt;
+
+	atomic_dec(&rdtsc_start);
+	while (atomic_read(&rdtsc_start))
+		cpu_relax();
+
+	raw_spin_lock(&rdtsc_lock);
+	idx = rdtsc_index++;
+	rdtscll(cnt);
+	raw_spin_unlock(&rdtsc_lock);
+
+	if (idx >= rdtsc_count)
+		return;
+
+	rdtsc_counts[idx] = cnt;
+	rdtsc_cpus[idx] = smp_processor_id();
+}
+
+static void run_rdtsc_test(void)
+{
+	int i = 0;
+
+	get_online_cpus();
+	mutex_lock(&rdtsc_mutex);
+	atomic_set(&rdtsc_start, num_online_cpus());
+	rdtsc_index = 0;
+	memset(rdtsc_counts, 0, sizeof(*rdtsc_counts) * rdtsc_count);
+	memset(rdtsc_cpus, 0, sizeof(*rdtsc_cpus) * rdtsc_count);
+
+	/* Don't let us get migrated */
+	preempt_disable();
+	smp_call_function(read_rdtsc_test, NULL, 0);
+
+	/* Run the test without being disturbed. */
+	local_irq_disable();
+	read_rdtsc_test(NULL);
+	local_irq_enable();
+
+	preempt_enable();
+
+	/* We didn't wait for smp_call_function() to complete on other CPUS */
+	while (rdtsc_index < num_online_cpus() && i++ < 1000)
+		msleep(1);
+
+	mutex_unlock(&rdtsc_mutex);
+	WARN_ON(rdtsc_index < num_online_cpus());
+	put_online_cpus();
+}
+
+static int rdtsc_test_show(struct seq_file *m, void *v)
+{
+	unsigned int i;
+
+	mutex_lock(&rdtsc_mutex);
+
+	for (i = 0; i < rdtsc_count; i++) {
+		seq_printf(m, "%5u)  %9llu  (cpu:%u)\n",
+			   i, rdtsc_counts[i], rdtsc_cpus[i]);
+	}
+
+	mutex_unlock(&rdtsc_mutex);
+
+	return 0;
+}
+
+static int rdtsc_test_open(struct inode *inode, struct file *filp)
+{
+	run_rdtsc_test();
+
+	return single_open(filp, rdtsc_test_show, inode->i_private);
+}
+
+static const struct file_operations rdtsc_test_fops = {
+	.open = rdtsc_test_open,
+	.read = seq_read,
+	.llseek = generic_file_llseek,
+};
+
+static __init int init_rdtsc_test(void)
+{
+	struct dentry *ret;
+
+	rdtsc_count = num_possible_cpus();
+	rdtsc_counts = kmalloc(sizeof(*rdtsc_counts) * rdtsc_count, GFP_KERNEL);
+	if (!rdtsc_counts) {
+		pr_warn("Could not create rdtsc_test counts\n");
+		return -ENOMEM;
+	}
+
+	rdtsc_cpus = kmalloc(sizeof(*rdtsc_cpus) * rdtsc_count, GFP_KERNEL);
+	if (!rdtsc_cpus) {
+		kfree(rdtsc_counts);
+		pr_warn("Could not create rdtsc_test cpus\n");
+		return -ENOMEM;
+	}
+
+	ret = debugfs_create_file("rdtsc_test", 0x400, NULL, NULL,
+				  &rdtsc_test_fops);
+	if (!ret)
+		pr_warn("Could not create debugfs rdtsc_test entry\n");
+	return ret;
+}
+
+fs_initcall(init_rdtsc_test);
+

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 21:56         ` Andy Lutomirski
  2014-07-21 22:06           ` Thomas Gleixner
  2014-07-21 22:08           ` Borislav Petkov
@ 2014-07-22  7:57           ` Peter Zijlstra
  2 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2014-07-22  7:57 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 02:56:49PM -0700, Andy Lutomirski wrote:
> > Remember, this is only attempting to be a hardware workaround for a
> > smallish number of systems out there. Most of current machines should
> > have stable and synched TSCs.
> 
> I actually own one of these systems.  It's a Sandy Bridge Core-i7
> Extreme or something like that.

If it is (and it sounds like it is) a single socket, then your unsynced
TSC is likely due to SMM fuckery and the TSCs will drift further and
further apart as (run)time increases due to SMM activity.

The problem Borislav is talking about is multi-socket systems, which due
to (failed) board layout get the CPUs powered up 'wrong' and the TSCs
between sockets are offset because of this; it's a fixed offset and stable
forever after (until power cycle etc..).

I have one WSM-EP that does this (occasionally).

His initial idea was to re-write the TSC value to match, but since
writing the TSC is expensive (in the 1000s of cycles range) getting an
offset adjustment of 10s of cycles in just right is nigh on impossible.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-21 22:13             ` Andy Lutomirski
  2014-07-21 22:30               ` Borislav Petkov
@ 2014-07-22  8:00               ` Peter Zijlstra
  1 sibling, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2014-07-22  8:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Thomas Gleixner, x86-ml, lkml, Steven Rostedt

On Mon, Jul 21, 2014 at 03:13:33PM -0700, Andy Lutomirski wrote:
> >> I actually own one of these systems.  It's a Sandy Bridge Core-i7
> >> Extreme or something like that.
> >
> > Ha, cool, so I've got my tester! :-)
> 
> Ha.  Ha ha.  Muahaha.  Because IIRC this box is synced until the first
> time it suspends.

Oh cute, one of those :-) Yes some BIOSes write 'random' crap into the
TSC when doing the S states jig. We have some tsc suspend/resume hooks
to correct the worst of that, but yeah screwy that.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-22  2:40 ` Steven Rostedt
@ 2014-07-22  8:59   ` Borislav Petkov
  0 siblings, 0 replies; 20+ messages in thread
From: Borislav Petkov @ 2014-07-22  8:59 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Peter Zijlstra, Thomas Gleixner, x86-ml, lkml

On Mon, Jul 21, 2014 at 10:40:39PM -0400, Steven Rostedt wrote:
> Your patch inspired me to write this hack. I was curious to know how
> the TSCs of my boxes were with respect to each other, and wanted to get
> an idea. Maybe there's a better way, but I decided to waste an hour and
> write this hack up.
> 
> Here's what it does. It creates the file /sys/kernel/debug/rdtsc_test,
> and when you read it, it does some whacky things.
> 
> 1) A table is set up with the number of possible CPUs. The table
> consists of: index, TSC count, CPU.
> 
> 2) An atomic variable is set to the number of online CPUs.
> 
> 3) An IPI is sent to each of the other CPUs to run the test.
> 
> 4) The test decrements the atomic, and then spins until it reaches zero.
> 
> 5) The caller of smp_call_function() then calls the test itself, being
> the last to decrement the counter causing it to go to zero and all CPUs
> then fight for a spinlock.
> 
> 6) When the spin lock is taken, it records which place it was in (order
> of spinlock taken), and records its own TSC. Then it releases the lock.
> 
> 7) It then records in the table where its position is, its TSC counter
> and CPU number.
> 
> 
> Finally, the read will show the results of the table. Looks something
> like this:
> 
> # cat /debug/rdtsc_test 
>     0)  1305910016816  (cpu:5)
>     1)  1305910017550  (cpu:7)
>     2)  1305910017712  (cpu:1)
>     3)  1305910017910  (cpu:6)
>     4)  1305910018042  (cpu:2)
>     5)  1305910018226  (cpu:3)
>     6)  1305910018416  (cpu:4)
>     7)  1305910018540  (cpu:0)
> 
> As long as the TSC counts are in order of the index, the TSC is moving
> forward nicely. If they are not in order, then the TSCs are not in sync.
> 
> Yes, this is a hack, but I think it's a somewhat useful hack.

I think so too, especially if one would like to take a look at the TSCs
and how they look after, say, a suspend cycle or a long machine
runtime when there is suspicion that some SMM might've been entered or so...

However, I think you can do all that from luserspace, even though you'd
have to do more dancing like pinning threads to cpus and those threads
would have to synchronize back on rdtsc_start after having read the TSC,
i.e.

	while(atomic_read(&rdtsc_start) != num_online_cpus())
		cpu_relax();

so that you can make sure they all have read the TSC at least once and
nothing gets reordered.

It might work if I'm not missing something.

Doing it in the kernel is much easier with all that ring0 functionality
present. :-)

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] x86, TSC: Add a software TSC offset
  2014-07-19 13:06 [PATCH] x86, TSC: Add a software TSC offset Borislav Petkov
                   ` (2 preceding siblings ...)
  2014-07-22  2:40 ` Steven Rostedt
@ 2014-07-22 12:05 ` Borislav Petkov
  3 siblings, 0 replies; 20+ messages in thread
From: Borislav Petkov @ 2014-07-22 12:05 UTC (permalink / raw)
  To: Peter Zijlstra, Thomas Gleixner; +Cc: x86-ml, lkml, Steven Rostedt

Ok, here's the preempt-version we were talking about.

Please don't look at the vdso hunk - I had to make it build. Will fix
properly later once we've established whether this actually makes sense
at all first.

:-)

--
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb9b258d60e7..8c27e55372fb 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -244,6 +244,7 @@
 #define X86_BUG_11AP		X86_BUG(5) /* Bad local APIC aka 11AP */
 #define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_TSC_OFFSET	X86_BUG(8) /* CPU has skewed but stable TSCs */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0e9cee..904bc182a16b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -4,11 +4,11 @@
 #ifndef _ASM_X86_TSC_H
 #define _ASM_X86_TSC_H
 
-#include <asm/processor.h>
-
 #define NS_SCALE	10 /* 2^10, carefully chosen */
 #define US_SCALE	32 /* 2^32, arbitralrily chosen */
 
+DECLARE_PER_CPU(long long, tsc_offset);
+
 /*
  * Standard way to access the cycle counter.
  */
@@ -27,7 +27,10 @@ static inline cycles_t get_cycles(void)
 	if (!cpu_has_tsc)
 		return 0;
 #endif
+	preempt_disable();
 	rdtscll(ret);
+	ret += this_cpu_read_8(tsc_offset);
+	preempt_enable();
 
 	return ret;
 }
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 26488487bc61..97293b66fa65 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -28,6 +28,11 @@ static atomic_t start_count;
 static atomic_t stop_count;
 
 /*
+ * TSC offset helper counters.
+ */
+static atomic_t set_offset_on_target, offset_done;
+
+/*
  * We use a raw spinlock in this exceptional case, because
  * we want to have the fastest, inlined, non-debug version
  * of a critical section, to be able to prove TSC time-warps:
@@ -36,7 +41,10 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 
 static cycles_t last_tsc;
 static cycles_t max_warp;
-static int nr_warps;
+static int nr_warps, max_warp_cpu;
+
+DEFINE_PER_CPU(long long, tsc_offset) = { 0 };
+EXPORT_PER_CPU_SYMBOL_GPL(tsc_offset);
 
 /*
  * TSC-warp measurement loop running on both CPUs:
@@ -89,6 +97,10 @@ static void check_tsc_warp(unsigned int timeout)
 			arch_spin_lock(&sync_lock);
 			max_warp = max(max_warp, prev - now);
 			nr_warps++;
+
+			if (prev - now == max_warp)
+				max_warp_cpu = smp_processor_id();
+
 			arch_spin_unlock(&sync_lock);
 		}
 	}
@@ -116,6 +128,69 @@ static inline unsigned int loop_timeout(int cpu)
 	return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
 }
 
+static inline bool cpu_should_save_offset(int cpu)
+{
+	bool ret = static_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+		   static_cpu_has(X86_FEATURE_NONSTOP_TSC);
+
+	if (ret)
+		set_cpu_bug(&cpu_data(cpu), X86_BUG_TSC_OFFSET);
+
+	return ret;
+}
+
+/*
+ * We're saving a per-core TSC offset only on machines which have a
+ * stable and non-stop TSC but which, for some reason, start their TSCs
+ * on the different nodes at different points in time, thus causing a
+ * small constant diff between them.
+ *
+ * We do this during the TSC sync check which happens between a source
+ * and a target CPU. When we detect the diff, we hold the target CPU by
+ * _not_ incrementing stop_count. What we do instead is we send it into
+ * compute_tsc_offset() below and store the max_warp difference we have
+ * measured above in a per-cpu variable.
+ *
+ * We do pay attention to which CPU saw the max_warp by writing its
+ * number into max_warp_cpu so that we can compute whether the offset
+ * we're going to write into the target's TSC is positive or negative.
+ *
+ * It is positive when the target CPU's TSC has started later than the
+ * source CPU's TSC and thus has a smaller TSC value.
+ *
+ * It is negative when the target CPU's TSC has started earlier than the
+ * source CPU's TSC and thus has a higher TSC value.
+ *
+ * Once we've computed the offset, we let both CPUs do the usual
+ * TSC sync check again, taking the offset into account, see
+ * get_cycles_aux().
+ *
+ * Called on the target.
+ */
+static void compute_tsc_offset(int cpu)
+{
+	long long off;
+
+	/*
+	 * This CPU wrote last the max_warp above, means its TSC is smaller than
+	 * the source CPU which we're doing the sync check with.
+	 */
+	if (cpu == max_warp_cpu)
+		off =  max_warp;
+	else
+		off = -max_warp;
+
+	per_cpu(tsc_offset, cpu) = off;
+	pr_info("CPU%d, saved offset: %lld\n", cpu, off);
+
+	nr_warps = 0;
+	max_warp = 0;
+	last_tsc = 0;
+
+	atomic_inc(&offset_done);
+	atomic_set(&set_offset_on_target, 0);
+}
+
 /*
  * Source CPU calls into this - it waits for the freshly booted
  * target CPU to arrive and then starts the measurement:
@@ -138,6 +213,7 @@ void check_tsc_sync_source(int cpu)
 		return;
 	}
 
+restart_src:
 	/*
 	 * Reset it - in case this is a second bootup:
 	 */
@@ -155,15 +231,27 @@ void check_tsc_sync_source(int cpu)
 
 	check_tsc_warp(loop_timeout(cpu));
 
+	/*
+	 * Wait for target to finish measurement:
+	 */
 	while (atomic_read(&stop_count) != cpus-1)
 		cpu_relax();
 
+	/* Analyze measurement */
 	if (nr_warps) {
-		pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
-			smp_processor_id(), cpu);
-		pr_warning("Measured %Ld cycles TSC warp between CPUs, "
-			   "turning off TSC clock.\n", max_warp);
-		mark_tsc_unstable("check_tsc_sync_source failed");
+		if (cpu_should_save_offset(cpu) && !atomic_read(&offset_done)) {
+			pr_warn("TSCs of [CPU#%d -> CPU#%d] %lld cycles out of sync, saving offset.\n",
+				smp_processor_id(), cpu, max_warp);
+
+			atomic_set(&start_count, 0);
+			atomic_set(&set_offset_on_target, 1);
+
+			goto restart_src;
+		} else {
+			pr_warning("Measured %Ld(%d) cycles TSC warp between CPUs, "
+				   "turning off TSC clock.\n", max_warp, max_warp_cpu);
+			mark_tsc_unstable("check_tsc_sync_source failed");
+		}
 	} else {
 		pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
 			smp_processor_id(), cpu);
@@ -173,6 +261,7 @@ void check_tsc_sync_source(int cpu)
 	 * Reset it - just in case we boot another CPU later:
 	 */
 	atomic_set(&start_count, 0);
+	atomic_set(&offset_done, 0);
 	nr_warps = 0;
 	max_warp = 0;
 	last_tsc = 0;
@@ -188,11 +277,16 @@ void check_tsc_sync_source(int cpu)
  */
 void check_tsc_sync_target(void)
 {
+	int this_cpu = smp_processor_id();
 	int cpus = 2;
 
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;
 
+restart_tgt:
+	if (atomic_read(&set_offset_on_target))
+		compute_tsc_offset(this_cpu);
+
 	/*
 	 * Register this CPU's participation and wait for the
 	 * source CPU to start the measurement:
@@ -201,7 +295,7 @@ void check_tsc_sync_target(void)
 	while (atomic_read(&start_count) != cpus)
 		cpu_relax();
 
-	check_tsc_warp(loop_timeout(smp_processor_id()));
+	check_tsc_warp(loop_timeout(this_cpu));
 
 	/*
 	 * Ok, we are done:
@@ -211,6 +305,9 @@ void check_tsc_sync_target(void)
 	/*
 	 * Wait for the source CPU to print stuff:
 	 */
-	while (atomic_read(&stop_count) != cpus)
+	while (atomic_read(&stop_count) != cpus) {
+		if (atomic_read(&set_offset_on_target))
+			goto restart_tgt;
 		cpu_relax();
+	}
 }
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
index 175cc72c0f68..d5cba62bbf46 100644
--- a/arch/x86/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/vdso/vdso32/vclock_gettime.c
@@ -25,6 +25,9 @@
 
 #define BUILD_VDSO32_64
 
+#undef this_cpu_read_8
+#define this_cpu_read_8(dummy) (0)
+
 #endif
 
 #include "../vclock_gettime.c"

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2014-07-22 12:05 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-07-19 13:06 [PATCH] x86, TSC: Add a software TSC offset Borislav Petkov
2014-07-19 13:28 ` Borislav Petkov
2014-07-21 19:34 ` Andy Lutomirski
2014-07-21 21:35   ` Borislav Petkov
2014-07-21 21:41     ` Andy Lutomirski
2014-07-21 21:52       ` Borislav Petkov
2014-07-21 21:56         ` Andy Lutomirski
2014-07-21 22:06           ` Thomas Gleixner
2014-07-21 22:11             ` Thomas Gleixner
2014-07-21 22:14               ` Andy Lutomirski
2014-07-21 22:08           ` Borislav Petkov
2014-07-21 22:13             ` Andy Lutomirski
2014-07-21 22:30               ` Borislav Petkov
2014-07-21 22:43                 ` Andy Lutomirski
2014-07-21 23:01                   ` Borislav Petkov
2014-07-22  8:00               ` Peter Zijlstra
2014-07-22  7:57           ` Peter Zijlstra
2014-07-22  2:40 ` Steven Rostedt
2014-07-22  8:59   ` Borislav Petkov
2014-07-22 12:05 ` Borislav Petkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox