[patch] x86: unify/rewrite SMP TSC sync code

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [patch] x86: unify/rewrite SMP TSC sync code
@ 2006-11-24 17:02 Ingo Molnar
  2006-11-24 17:13 ` Andi Kleen
  2006-11-29  7:13 ` Ingo Molnar
  0 siblings, 2 replies; 22+ messages in thread
From: Ingo Molnar @ 2006-11-24 17:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton, Arjan van de Ven, Andi Kleen, Thomas Gleixner


Andrew,

could we try this one in -mm? It unifies (and simplifies) the TSC sync
code between i386 and x86_64, and also offers a stronger guarantee that
we'll only activate the TSC clock on CPUs where the TSC is synced
correctly by the hardware.

	Ingo

---------------->
From: Ingo Molnar <mingo@elte.hu>
[patch] x86: unify/rewrite SMP TSC sync code

make the TSC synchronization code more robust, and unify it between 
x86_64 and i386.

The biggest change is the removal of the 'fix up TSCs' code on x86_64
and i386; in some rare cases it was /causing/ time-warps on SMP systems.

The new code only checks for TSC asynchrony - and if it can prove a
time-warp (if it can observe the TSC going backwards when going from one 
CPU to another within a critical section), then the TSC clock-source is 
turned off.

The TSC synchronization-checking code also got moved into a separate 
file.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/i386/kernel/Makefile     |    2 
 arch/i386/kernel/smpboot.c    |  178 ++------------------------------
 arch/i386/kernel/tsc.c        |    4 
 arch/i386/kernel/tsc_sync.c   |    1 
 arch/x86_64/kernel/Makefile   |    2 
 arch/x86_64/kernel/smpboot.c  |  230 ++----------------------------------------
 arch/x86_64/kernel/time.c     |   11 ++
 arch/x86_64/kernel/tsc_sync.c |  187 ++++++++++++++++++++++++++++++++++
 include/asm-i386/tsc.h        |   49 --------
 include/asm-x86_64/proto.h    |    2 
 include/asm-x86_64/timex.h    |   26 ----
 include/asm-x86_64/tsc.h      |   66 ++++++++++++
 12 files changed, 295 insertions(+), 463 deletions(-)

Index: linux/arch/i386/kernel/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/Makefile
+++ linux/arch/i386/kernel/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
-obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
+obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -88,12 +88,6 @@ cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
-/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
- * is no way to resync one AP against BP. TBD: for prescott and above, we
- * should use IA64's algorithm
- */
-static int __devinitdata tsc_sync_disabled;
-
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_data);
@@ -210,151 +204,6 @@ valid_k7:
 	;
 }
 
-/*
- * TSC synchronization.
- *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
- */
-
-static struct {
-	atomic_t start_flag;
-	atomic_t count_start;
-	atomic_t count_stop;
-	unsigned long long values[NR_CPUS];
-} tsc __initdata = {
-	.start_flag = ATOMIC_INIT(0),
-	.count_start = ATOMIC_INIT(0),
-	.count_stop = ATOMIC_INIT(0),
-};
-
-#define NR_LOOPS 5
-
-static void __init synchronize_tsc_bp(void)
-{
-	int i;
-	unsigned long long t0;
-	unsigned long long sum, avg;
-	long long delta;
-	unsigned int one_usec;
-	int buggy = 0;
-
-	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
-
-	/* convert from kcyc/sec to cyc/usec */
-	one_usec = cpu_khz / 1000;
-
-	atomic_set(&tsc.start_flag, 1);
-	wmb();
-
-	/*
-	 * We loop a few times to get a primed instruction cache,
-	 * then the last pass is more or less synchronized and
-	 * the BP and APs set their cycle counters to zero all at
-	 * once. This reduces the chance of having random offsets
-	 * between the processors, and guarantees that the maximum
-	 * delay between the cycle counters is never bigger than
-	 * the latency of information-passing (cachelines) between
-	 * two CPUs.
-	 */
-	for (i = 0; i < NR_LOOPS; i++) {
-		/*
-		 * all APs synchronize but they loop on '== num_cpus'
-		 */
-		while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
-			cpu_relax();
-		atomic_set(&tsc.count_stop, 0);
-		wmb();
-		/*
-		 * this lets the APs save their current TSC:
-		 */
-		atomic_inc(&tsc.count_start);
-
-		rdtscll(tsc.values[smp_processor_id()]);
-		/*
-		 * We clear the TSC in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
-
-		/*
-		 * Wait for all APs to leave the synchronization point:
-		 */
-		while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
-			cpu_relax();
-		atomic_set(&tsc.count_start, 0);
-		wmb();
-		atomic_inc(&tsc.count_stop);
-	}
-
-	sum = 0;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_isset(i, cpu_callout_map)) {
-			t0 = tsc.values[i];
-			sum += t0;
-		}
-	}
-	avg = sum;
-	do_div(avg, num_booting_cpus());
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_isset(i, cpu_callout_map))
-			continue;
-		delta = tsc.values[i] - avg;
-		if (delta < 0)
-			delta = -delta;
-		/*
-		 * We report bigger than 2 microseconds clock differences.
-		 */
-		if (delta > 2*one_usec) {
-			long long realdelta;
-
-			if (!buggy) {
-				buggy = 1;
-				printk("\n");
-			}
-			realdelta = delta;
-			do_div(realdelta, one_usec);
-			if (tsc.values[i] < avg)
-				realdelta = -realdelta;
-
-			if (realdelta)
-				printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
-					"skew, fixed it up.\n", i, realdelta);
-		}
-	}
-	if (!buggy)
-		printk("passed.\n");
-}
-
-static void __init synchronize_tsc_ap(void)
-{
-	int i;
-
-	/*
-	 * Not every cpu is online at the time
-	 * this gets called, so we first wait for the BP to
-	 * finish SMP initialization:
-	 */
-	while (!atomic_read(&tsc.start_flag))
-		cpu_relax();
-
-	for (i = 0; i < NR_LOOPS; i++) {
-		atomic_inc(&tsc.count_start);
-		while (atomic_read(&tsc.count_start) != num_booting_cpus())
-			cpu_relax();
-
-		rdtscll(tsc.values[smp_processor_id()]);
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
-
-		atomic_inc(&tsc.count_stop);
-		while (atomic_read(&tsc.count_stop) != num_booting_cpus())
-			cpu_relax();
-	}
-}
-#undef NR_LOOPS
-
 extern void calibrate_delay(void);
 
 static atomic_t init_deasserted;
@@ -440,12 +289,6 @@ static void __devinit smp_callin(void)
 	 * Allow the master to continue.
 	 */
 	cpu_set(cpuid, cpu_callin_map);
-
-	/*
-	 *      Synchronize the TSC with the BP
-	 */
-	if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
-		synchronize_tsc_ap();
 }
 
 static int cpucount;
@@ -545,6 +388,11 @@ static void __devinit start_secondary(vo
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
 		rep_nop();
+	/*
+	 * Check TSC synchronization with the BP:
+	 */
+	check_tsc_sync_target();
+
 	setup_secondary_APIC_clock();
 	if (nmi_watchdog == NMI_IO_APIC) {
 		disable_8259A_irq(0);
@@ -1091,8 +939,6 @@ static int __cpuinit __smp_prepare_cpu(i
 	info.cpu = cpu;
 	INIT_WORK(&task, do_warm_boot_cpu, &info);
 
-	tsc_sync_disabled = 1;
-
 	/* init low mem mapping */
 	clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
 			KERNEL_PGD_PTRS);
@@ -1100,7 +946,6 @@ static int __cpuinit __smp_prepare_cpu(i
 	schedule_work(&task);
 	wait_for_completion(&done);
 
-	tsc_sync_disabled = 0;
 	zap_low_mappings();
 	ret = 0;
 exit:
@@ -1316,12 +1161,6 @@ static void __init smp_boot_cpus(unsigne
 	smpboot_setup_io_apic();
 
 	setup_boot_APIC_clock();
-
-	/*
-	 * Synchronize the TSC with the AP
-	 */
-	if (cpu_has_tsc && cpucount && cpu_khz)
-		synchronize_tsc_bp();
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
@@ -1456,9 +1295,16 @@ int __devinit __cpu_up(unsigned int cpu)
 	}
 
 	local_irq_enable();
+
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Unleash the CPU! */
 	cpu_set(cpu, smp_commenced_mask);
+
+	/*
+	 * Check TSC synchronization with the AP:
+	 */
+	check_tsc_sync_source(cpu);
+
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
 	return 0;
Index: linux/arch/i386/kernel/tsc.c
===================================================================
--- linux.orig/arch/i386/kernel/tsc.c
+++ linux/arch/i386/kernel/tsc.c
@@ -434,8 +434,10 @@ out:
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
  */
-static __init int unsynchronized_tsc(void)
+__init int unsynchronized_tsc(void)
 {
+	if (!cpu_has_tsc || tsc_unstable)
+		return 1;
 	/*
 	 * Intel systems are normally all synchronized.
 	 * Exceptions must mark TSC as unstable:
Index: linux/arch/i386/kernel/tsc_sync.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/tsc_sync.c
@@ -0,0 +1 @@
+#include "../../x86_64/kernel/tsc_sync.c"
Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile
+++ linux/arch/x86_64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI)		+= acpi/
 obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
-obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o
+obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o tsc_sync.o
 obj-y				+= apic.o  nmi.o
 obj-y				+= io_apic.o mpparse.o \
 		genapic.o genapic_cluster.o genapic_flat.o
Index: linux/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smpboot.c
+++ linux/arch/x86_64/kernel/smpboot.c
@@ -147,217 +147,6 @@ static void __cpuinit smp_store_cpu_info
 	print_cpu_info(c);
 }
 
-/*
- * New Funky TSC sync algorithm borrowed from IA64.
- * Main advantage is that it doesn't reset the TSCs fully and
- * in general looks more robust and it works better than my earlier
- * attempts. I believe it was written by David Mosberger. Some minor
- * adjustments for x86-64 by me -AK
- *
- * Original comment reproduced below.
- *
- * Synchronize TSC of the current (slave) CPU with the TSC of the
- * MASTER CPU (normally the time-keeper CPU).  We use a closed loop to
- * eliminate the possibility of unaccounted-for errors (such as
- * getting a machine check in the middle of a calibration step).  The
- * basic idea is for the slave to ask the master what itc value it has
- * and to read its own itc before and after the master responds.  Each
- * iteration gives us three timestamps:
- *
- *	slave		master
- *
- *	t0 ---\
- *             ---\
- *		   --->
- *			tm
- *		   /---
- *	       /---
- *	t1 <---
- *
- *
- * The goal is to adjust the slave's TSC such that tm falls exactly
- * half-way between t0 and t1.  If we achieve this, the clocks are
- * synchronized provided the interconnect between the slave and the
- * master is symmetric.  Even if the interconnect were asymmetric, we
- * would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us
- * synchronize the TSC to within one or two cycles.  However, we can
- * only *guarantee* that the synchronization is accurate to within a
- * round-trip time, which is typically in the range of several hundred
- * cycles (e.g., ~500 cycles).  In practice, this means that the TSCs
- * are usually almost perfectly synchronized, but we shouldn't assume
- * that the accuracy is much better than half a micro second or so.
- *
- * [there are other errors like the latency of RDTSC and of the
- * WRMSR. These can also account to hundreds of cycles. So it's
- * probably worse. It claims 153 cycles error on a dual Opteron,
- * but I suspect the numbers are actually somewhat worse -AK]
- */
-
-#define MASTER	0
-#define SLAVE	(SMP_CACHE_BYTES/8)
-
-/* Intentionally don't use cpu_relax() while TSC synchronization
-   because we don't want to go into funky power save modi or cause
-   hypervisors to schedule us away.  Going to sleep would likely affect
-   latency and low latency is the primary objective here. -AK */
-#define no_cpu_relax() barrier()
-
-static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static volatile __cpuinitdata unsigned long go[SLAVE + 1];
-static int notscsync __cpuinitdata;
-
-#undef DEBUG_TSC_SYNC
-
-#define NUM_ROUNDS	64	/* magic value */
-#define NUM_ITERS	5	/* likewise */
-
-/* Callback on boot CPU */
-static __cpuinit void sync_master(void *arg)
-{
-	unsigned long flags, i;
-
-	go[MASTER] = 0;
-
-	local_irq_save(flags);
-	{
-		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
-			while (!go[MASTER])
-				no_cpu_relax();
-			go[MASTER] = 0;
-			rdtscll(go[SLAVE]);
-		}
-	}
-	local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our tsc differs from the tsc
- * on the master (time-keeper) CPU.  A positive number indicates our
- * tsc is ahead of the master, negative that it is behind.
- */
-static inline long
-get_delta(long *rt, long *master)
-{
-	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
-	unsigned long tcenter, t0, t1, tm;
-	int i;
-
-	for (i = 0; i < NUM_ITERS; ++i) {
-		rdtscll(t0);
-		go[MASTER] = 1;
-		while (!(tm = go[SLAVE]))
-			no_cpu_relax();
-		go[SLAVE] = 0;
-		rdtscll(t1);
-
-		if (t1 - t0 < best_t1 - best_t0)
-			best_t0 = t0, best_t1 = t1, best_tm = tm;
-	}
-
-	*rt = best_t1 - best_t0;
-	*master = best_tm - best_t0;
-
-	/* average best_t0 and best_t1 without overflow: */
-	tcenter = (best_t0/2 + best_t1/2);
-	if (best_t0 % 2 + best_t1 % 2 == 2)
-		++tcenter;
-	return tcenter - best_tm;
-}
-
-static __cpuinit void sync_tsc(unsigned int master)
-{
-	int i, done = 0;
-	long delta, adj, adjust_latency = 0;
-	unsigned long flags, rt, master_time_stamp, bound;
-#ifdef DEBUG_TSC_SYNC
-	static struct syncdebug {
-		long rt;	/* roundtrip time */
-		long master;	/* master's timestamp */
-		long diff;	/* difference between midpoint and master's timestamp */
-		long lat;	/* estimate of tsc adjustment latency */
-	} t[NUM_ROUNDS] __cpuinitdata;
-#endif
-
-	printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
-		smp_processor_id(), master);
-
-	go[MASTER] = 1;
-
-	/* It is dangerous to broadcast IPI as cpus are coming up,
-	 * as they may not be ready to accept them.  So since
-	 * we only need to send the ipi to the boot cpu direct
-	 * the message, and avoid the race.
-	 */
-	smp_call_function_single(master, sync_master, NULL, 1, 0);
-
-	while (go[MASTER])	/* wait for master to be ready */
-		no_cpu_relax();
-
-	spin_lock_irqsave(&tsc_sync_lock, flags);
-	{
-		for (i = 0; i < NUM_ROUNDS; ++i) {
-			delta = get_delta(&rt, &master_time_stamp);
-			if (delta == 0) {
-				done = 1;	/* let's lock on to this... */
-				bound = rt;
-			}
-
-			if (!done) {
-				unsigned long t;
-				if (i > 0) {
-					adjust_latency += -delta;
-					adj = -delta + adjust_latency/4;
-				} else
-					adj = -delta;
-
-				rdtscll(t);
-				wrmsrl(MSR_IA32_TSC, t + adj);
-			}
-#ifdef DEBUG_TSC_SYNC
-			t[i].rt = rt;
-			t[i].master = master_time_stamp;
-			t[i].diff = delta;
-			t[i].lat = adjust_latency/4;
-#endif
-		}
-	}
-	spin_unlock_irqrestore(&tsc_sync_lock, flags);
-
-#ifdef DEBUG_TSC_SYNC
-	for (i = 0; i < NUM_ROUNDS; ++i)
-		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
-		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
-	printk(KERN_INFO
-	       "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
-	       "maxerr %lu cycles)\n",
-	       smp_processor_id(), master, delta, rt);
-}
-
-static void __cpuinit tsc_sync_wait(void)
-{
-	/*
-	 * When the CPU has synchronized TSCs assume the BIOS
-  	 * or the hardware already synced.  Otherwise we could
-	 * mess up a possible perfect synchronization with a
-	 * not-quite-perfect algorithm.
-	 */
-	if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
-		return;
-	sync_tsc(0);
-}
-
-static __init int notscsync_setup(char *s)
-{
-	notscsync = 1;
-	return 1;
-}
-__setup("notscsync", notscsync_setup);
-
 static atomic_t init_deasserted __cpuinitdata;
 
 /*
@@ -545,6 +334,11 @@ void __cpuinit start_secondary(void)
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
 	barrier();
 
+	/*
+  	 * Check TSC sync first:
+ 	 */
+	check_tsc_sync_target();
+
 	Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); 	
 	setup_secondary_APIC_clock();
 
@@ -564,14 +358,6 @@ void __cpuinit start_secondary(void)
 	 */
 	set_cpu_sibling_map(smp_processor_id());
 
-	/* 
-  	 * Wait for TSC sync to not schedule things before.
-	 * We still process interrupts, which could see an inconsistent
-	 * time in that window unfortunately. 
-	 * Do this here because TSC sync has global unprotected state.
- 	 */
-	tsc_sync_wait();
-
 	/*
 	 * We need to hold call_lock, so there is no inconsistency
 	 * between the time smp_call_function() determines number of
@@ -591,6 +377,7 @@ void __cpuinit start_secondary(void)
 	cpu_set(smp_processor_id(), cpu_online_map);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	spin_unlock(&vector_lock);
+
 	unlock_ipi_call_lock();
 
 	cpu_idle();
@@ -1165,6 +952,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	/* Unleash the CPU! */
 	Dprintk("waiting for cpu %d\n", cpu);
 
+	/*
+  	 * Make sure and check TSC sync:
+ 	 */
+	check_tsc_sync_source(cpu);
+
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
 	err = 0;
Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -922,12 +922,23 @@ void __init time_init(void)
 #endif
 }
 
+static int tsc_unstable = 0;
+
+void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
 /*
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
  */
 __cpuinit int unsynchronized_tsc(void)
 {
+	if (tsc_unstable)
+		return 1;
+
 #ifdef CONFIG_SMP
 	if (apic_is_clustered_box())
 		return 1;
Index: linux/arch/x86_64/kernel/tsc_sync.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
+/*
+ * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ *   protects against more than 2 CPUs entering this code. )
+ */
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/tsc.h>
+
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static __cpuinitdata atomic_t start_count;
+static __cpuinitdata atomic_t stop_count;
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static __cpuinitdata cycles_t last_tsc;
+static __cpuinitdata cycles_t max_warp;
+static __cpuinitdata int nr_warps;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+static __cpuinit void check_tsc_warp(void)
+{
+	cycles_t start, now, prev, end;
+	int i;
+
+	start = get_cycles_sync();
+	/*
+	 * The measurement runs for 20 msecs:
+	 */
+	end = start + cpu_khz * 20ULL;
+	now = start;
+
+	for (i = 0; ; i++) {
+		/*
+		 * We take the global lock, measure TSC, save the
+		 * previous TSC that was measured (possibly on
+		 * another CPU) and update the previous TSC timestamp.
+		 */
+		__raw_spin_lock(&sync_lock);
+		prev = last_tsc;
+		now = get_cycles_sync();
+		last_tsc = now;
+		__raw_spin_unlock(&sync_lock);
+
+		/*
+		 * Be nice every now and then (and also check whether
+		 * measurement is done [we also insert a 100 million
+		 * loops safety exit, so we dont lock up in case the
+		 * TSC readout is totally broken]):
+		 */
+		if (unlikely(!(i & 7))) {
+			if (now > end || i > 100000000)
+				break;
+			cpu_relax();
+			touch_nmi_watchdog();
+		}
+		/*
+		 * Outside the critical section we can now see whether
+		 * we saw a time-warp of the TSC going backwards:
+		 */
+		if (unlikely(prev > now)) {
+			__raw_spin_lock(&sync_lock);
+			max_warp = max(max_warp, prev - now);
+			nr_warps++;
+			__raw_spin_unlock(&sync_lock);
+		}
+
+	}
+}
+
+/*
+ * Source CPU calls into this - it waits for the freshly booted
+ * target CPU to arrive and then starts the measurement:
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+	int cpus = 2;
+
+	/*
+	 * No need to check if we already know that the TSC is not
+	 * synchronized:
+	 */
+	if (unsynchronized_tsc())
+		return;
+
+	printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+			  smp_processor_id(), cpu);
+
+	/*
+	 * Reset it - in case this is a second bootup:
+	 */
+	atomic_set(&stop_count, 0);
+
+	/*
+	 * Wait for the target to arrive:
+	 */
+	while (atomic_read(&start_count) != cpus-1)
+		cpu_relax();
+	/*
+	 * Trigger the target to continue into the measurement too:
+	 */
+	atomic_inc(&start_count);
+
+	check_tsc_warp();
+
+	while (atomic_read(&stop_count) != cpus-1)
+		cpu_relax();
+
+	/*
+	 * Reset it - just in case we boot another CPU later:
+	 */
+	atomic_set(&start_count, 0);
+
+	if (nr_warps) {
+		printk("\n");
+		printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
+				    " turning off TSC clock.\n", max_warp);
+		mark_tsc_unstable();
+		nr_warps = 0;
+		max_warp = 0;
+		last_tsc = 0;
+	} else {
+		printk(" passed.\n");
+	}
+
+	/*
+	 * Let the target continue with the bootup:
+	 */
+	atomic_inc(&stop_count);
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+	int cpus = 2;
+
+	if (unsynchronized_tsc())
+		return;
+
+	/*
+	 * Register this CPU's participation and wait for the
+	 * source CPU to start the measurement:
+	 */
+	atomic_inc(&start_count);
+	while (atomic_read(&start_count) != cpus)
+		cpu_relax();
+
+	check_tsc_warp();
+
+	/*
+	 * Ok, we are done:
+	 */
+	atomic_inc(&stop_count);
+
+	/*
+	 * Wait for the source CPU to print stuff:
+	 */
+	while (atomic_read(&stop_count) != cpus)
+		cpu_relax();
+}
+#undef NR_LOOPS
+
Index: linux/include/asm-i386/tsc.h
===================================================================
--- linux.orig/include/asm-i386/tsc.h
+++ linux/include/asm-i386/tsc.h
@@ -1,48 +1 @@
-/*
- * linux/include/asm-i386/tsc.h
- *
- * i386 TSC related functions
- */
-#ifndef _ASM_i386_TSC_H
-#define _ASM_i386_TSC_H
-
-#include <asm/processor.h>
-
-/*
- * Standard way to access the cycle counter on i586+ CPUs.
- * Currently only used on SMP.
- *
- * If you really have a SMP machine with i486 chips or older,
- * compile for that, and this will just always return zero.
- * That's ok, it just means that the nicer scheduling heuristics
- * won't work for you.
- *
- * We only use the low 32 bits, and we'd simply better make sure
- * that we reschedule before that wraps. Scheduling at least every
- * four billion cycles just basically sounds like a good idea,
- * regardless of how fast the machine is.
- */
-typedef unsigned long long cycles_t;
-
-extern unsigned int cpu_khz;
-extern unsigned int tsc_khz;
-
-static inline cycles_t get_cycles(void)
-{
-	unsigned long long ret = 0;
-
-#ifndef CONFIG_X86_TSC
-	if (!cpu_has_tsc)
-		return 0;
-#endif
-
-#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
-	rdtscll(ret);
-#endif
-	return ret;
-}
-
-extern void tsc_init(void);
-extern void mark_tsc_unstable(void);
-
-#endif
+#include <asm-x86_64/tsc.h>
Index: linux/include/asm-x86_64/proto.h
===================================================================
--- linux.orig/include/asm-x86_64/proto.h
+++ linux/include/asm-x86_64/proto.h
@@ -92,8 +92,6 @@ extern void check_efer(void);
 
 extern int unhandled_signal(struct task_struct *tsk, int sig);
 
-extern int unsynchronized_tsc(void);
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
 extern unsigned long table_start, table_end;
Index: linux/include/asm-x86_64/timex.h
===================================================================
--- linux.orig/include/asm-x86_64/timex.h
+++ linux/include/asm-x86_64/timex.h
@@ -12,35 +12,11 @@
 #include <asm/hpet.h>
 #include <asm/system.h>
 #include <asm/processor.h>
+#include <asm/tsc.h>
 #include <linux/compiler.h>
 
 #define CLOCK_TICK_RATE	PIT_TICK_RATE	/* Underlying HZ */
 
-typedef unsigned long long cycles_t;
-
-static inline cycles_t get_cycles (void)
-{
-	unsigned long long ret;
-
-	rdtscll(ret);
-	return ret;
-}
-
-/* Like get_cycles, but make sure the CPU is synchronized. */
-static __always_inline cycles_t get_cycles_sync(void)
-{
-	unsigned long long ret;
-	unsigned eax;
-	/* Don't do an additional sync on CPUs where we know
-	   RDTSC is already synchronous. */
-	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
-			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
-	rdtscll(ret);
-	return ret;
-}
-
-extern unsigned int cpu_khz;
-
 extern int read_current_timer(unsigned long *timer_value);
 #define ARCH_HAS_READ_CURRENT_TIMER	1
 
Index: linux/include/asm-x86_64/tsc.h
===================================================================
--- /dev/null
+++ linux/include/asm-x86_64/tsc.h
@@ -0,0 +1,66 @@
+/*
+ * linux/include/asm-x86_64/tsc.h
+ *
+ * x86_64 TSC related functions
+ */
+#ifndef _ASM_x86_64_TSC_H
+#define _ASM_x86_64_TSC_H
+
+#include <asm/processor.h>
+
+/*
+ * Standard way to access the cycle counter.
+ */
+typedef unsigned long long cycles_t;
+
+extern unsigned int cpu_khz;
+extern unsigned int tsc_khz;
+
+static inline cycles_t get_cycles(void)
+{
+	unsigned long long ret = 0;
+
+#ifndef CONFIG_X86_TSC
+	if (!cpu_has_tsc)
+		return 0;
+#endif
+
+#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
+	rdtscll(ret);
+#endif
+	return ret;
+}
+
+/* Like get_cycles, but make sure the CPU is synchronized. */
+static __always_inline cycles_t get_cycles_sync(void)
+{
+	unsigned long long ret;
+#ifdef X86_FEATURE_SYNC_RDTSC
+	unsigned eax;
+
+	/*
+	 * Don't do an additional sync on CPUs where we know
+	 * RDTSC is already synchronous:
+	 */
+	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
+			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
+#else
+	sync_core();
+#endif
+	rdtscll(ret);
+
+	return ret;
+}
+
+extern void tsc_init(void);
+extern void mark_tsc_unstable(void);
+extern int unsynchronized_tsc(void);
+
+/*
+ * Boot-time check whether the TSCs are synchronized across
+ * all CPUs/cores:
+ */
+extern void check_tsc_sync_source(int cpu);
+extern void check_tsc_sync_target(void);
+
+#endif

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 17:02 [patch] x86: unify/rewrite SMP TSC sync code Ingo Molnar
@ 2006-11-24 17:13 ` Andi Kleen
  2006-11-24 20:25   ` Ingo Molnar
  2006-11-27 19:00   ` Christoph Lameter
  2006-11-29  7:13 ` Ingo Molnar
  1 sibling, 2 replies; 22+ messages in thread
From: Andi Kleen @ 2006-11-24 17:13 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, Andrew Morton, Arjan van de Ven, Thomas Gleixner


> make the TSC synchronization code more robust, and unify it between 
> x86_64 and i386.
> 
> The biggest change is the removal of the 'fix up TSCs' code on x86_64 
> and i386, in some rare cases it was /causing/ time-warps on SMP systems.

On x86-64 I don't think it can since it doesn't check anymore on sync Intel.

> The new code only checks for TSC asynchronity - and if it can prove a 
> time-warp (if it can observe the TSC going backwards when going from one 
> CPU to another within a critical section), then the TSC clock-source is 
> turned off.

The trouble is that people are using the RDTSC anyways even if the
kernel doesn't. So some synchronization is probably a good idea.

-Andi

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 17:13 ` Andi Kleen
@ 2006-11-24 20:25   ` Ingo Molnar
  2006-11-24 20:37     ` Andi Kleen
                       ` (2 more replies)
  2006-11-27 19:00   ` Christoph Lameter
  1 sibling, 3 replies; 22+ messages in thread
From: Ingo Molnar @ 2006-11-24 20:25 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, Andrew Morton, Arjan van de Ven, Thomas Gleixner

* Andi Kleen <ak@suse.de> wrote:

> > make the TSC synchronization code more robust, and unify it between 
> > x86_64 and i386.
> > 
> > The biggest change is the removal of the 'fix up TSCs' code on 
> > x86_64 and i386, in some rare cases it was /causing/ time-warps on 
> > SMP systems.
> 
> On x86-64 I don't think it can since it doesn't check anymore on sync 
> Intel.

yeah - the main new bit for x86-64 is the unconditional check for time
warps. We shouldn't (and cannot) really trust the CPU and the board/BIOS
getting it right. There were always some motherboards using Intel CPUs
that had the TSCs wrong.

> > The new code only checks for TSC asynchronity - and if it can prove 
> > a time-warp (if it can observe the TSC going backwards when going 
> > from one CPU to another within a critical section), then the TSC 
> > clock-source is turned off.
> 
> The trouble is that people are using the RDTSC anyways even if the 
> kernel doesn't. So some synchronization is probably a good idea.

which apps are using it? It might be safer in terms of app compatibility
if we removed the TSC bit in the case where we know it doesn't work, and
if we turned the feature off in the CPU in this case. We could also try
to 'approximately' sync up the TSC, but that obviously cannot be used
for kernel timekeeping - and by offering an 'almost' good TSC to
userspace we'd lure them towards using something we /cannot/ fix.

nor can the TSC really be synced up properly in the hotplug CPU case,
after the fact - what if the app already read out an older TSC value and
a new CPU is added? If the TSC isn't synced on SMP then it quickly gets
pretty messy, and we should rather take a look at /why/ these apps are
using RDTSC.

	Ingo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 20:25   ` Ingo Molnar
@ 2006-11-24 20:37     ` Andi Kleen
  2006-11-24 20:46       ` Ingo Molnar
  2006-11-24 22:48     ` Ben Greear
  2006-11-25  2:56     ` Wink Saville
  2 siblings, 1 reply; 22+ messages in thread
From: Andi Kleen @ 2006-11-24 20:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, Andrew Morton, Arjan van de Ven, Thomas Gleixner


> yeah - the main new bit for x86-64 is the unconditional check for time 
> warps. We shouldnt (and cannot) really trust the CPU and the board/BIOS 
> getting it right. There were always some motherboards using Intel CPUs 
> that had the TSCs wrong.

In the 64-bit capable generation I don't know of any system run in spec
that had the TSCs wrong (except for multinode systems; there was also one
overclocked system where the cores got out of sync, but overclocking is
an operator error).
 
> > > The new code only checks for TSC asynchronity - and if it can prove 
> > > a time-warp (if it can observe the TSC going backwards when going 
> > > from one CPU to another within a critical section), then the TSC 
> > > clock-source is turned off.
> > 
> > The trouble is that people are using the RDTSC anyways even if the 
> > kernel doesn't. So some synchronization is probably a good idea.
> 
> which apps are using it? It might be safer in terms of app compatibility 
> if we removed the TSC bit in the case where we know it doesnt work, and 
> if we turned the feature off in the CPU in this case. We could also try 
> to 'approximately' sync up the TSC, 

There was a patch from Google for trapping RDTSC in order to sync
on demand. I'm not sure that was the right strategy though.

> but that obviously cannot be used  
> for kernel timekeeping - and by offering an 'almost' good TSC to 
> userspace we'd lure them towards using something we /cannot/ fix.

The trouble is that it's good enough on many systems, probably 
on those that are being developed on.

Anyways I don't feel very strongly about this -- i guess taking
it out would be fine.
 
> nor can the TSC really be synced up properly in the hotplug CPU case, 
> after the fact - what if the app already read out an older TSC value and 
> a new CPU is added. If the TSC isnt sync on SMP then it quickly gets 
> pretty messy, and we should rather take a look at /why/ these apps are 
> using RDTSC.

Because gettimeofday is too slow.
-Andi

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 20:37     ` Andi Kleen
@ 2006-11-24 20:46       ` Ingo Molnar
  2006-11-24 21:06         ` Andi Kleen
  0 siblings, 1 reply; 22+ messages in thread
From: Ingo Molnar @ 2006-11-24 20:46 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, Andrew Morton, Arjan van de Ven, Thomas Gleixner


* Andi Kleen <ak@suse.de> wrote:

> 
> > yeah - the main new bit for x86-64 is the unconditional check for time 
> > warps. We shouldnt (and cannot) really trust the CPU and the board/BIOS 
> > getting it right. There were always some motherboards using Intel CPUs 
> > that had the TSCs wrong.
> 
> In the 64bit capable generation I don't know of any run in spec 
> (except for multinode systems and there was one overclocked system 
> where the cores got unsync, but overclocking is an operator error)

i have one (Intel based), 64-bit, fully in spec, which is off by 
~3000-4000 cycles. So it happens. But it's a no-brainer thing, this area 
is historically so bad that it would be crazy /not/ to spend this 20 
msecs bootup time per CPU to check whether its TSC is in sync.

I was in fact surprised when i noticed that you removed the 
unconditional TSC check that i put there years ago - with this we 
started a ride into the dark with lights off. If the situation gets 
better in say 2 years and no system ever produces the warning message we 
can remove it. (but i doubt it will ever get 100% correct.) [The patch 
will need some cooking in -mm, because it touches code that is fragile 
to begin with, but it's a necessity i'm quite sure.]

> > > The trouble is that people are using the RDTSC anyways even if the 
> > > kernel doesn't. So some synchronization is probably a good idea.
> > 
> > which apps are using it? It might be safer in terms of app 
> > compatibility if we removed the TSC bit in the case where we know it 
> > doesnt work, and if we turned the feature off in the CPU in this 
> > case. We could also try to 'approximately' sync up the TSC,
> 
> There was a patch from google for trap -- trapping RDTSC for syncing 
> on demand. I'm not sure that was the right strategy though.

but which apps are using RDTSC natively? Trapping isnt too good i agree 
- if then we should remove it from the CPU features and hence apps wont 
(or shouldnt) use it.

> > nor can the TSC really be synced up properly in the hotplug CPU 
> > case, after the fact - what if the app already read out an older TSC 
> > value and a new CPU is added. If the TSC isnt sync on SMP then it 
> > quickly gets pretty messy, and we should rather take a look at /why/ 
> > these apps are using RDTSC.
> 
> Because gettimeofday is too slow.

as i indicated it in another discussion, i can fix that. Next patch will 
be that.

	Ingo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 20:46       ` Ingo Molnar
@ 2006-11-24 21:06         ` Andi Kleen
  0 siblings, 0 replies; 22+ messages in thread
From: Andi Kleen @ 2006-11-24 21:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, Andrew Morton, Arjan van de Ven, Thomas Gleixner

On Friday 24 November 2006 21:46, Ingo Molnar wrote:
> 
> * Andi Kleen <ak@suse.de> wrote:
> 
> > 
> > > yeah - the main new bit for x86-64 is the unconditional check for time 
> > > warps. We shouldnt (and cannot) really trust the CPU and the board/BIOS 
> > > getting it right. There were always some motherboards using Intel CPUs 
> > > that had the TSCs wrong.
> > 
> > In the 64bit capable generation I don't know of any run in spec 
> > (except for multinode systems and there was one overclocked system 
> > where the cores got unsync, but overclocking is an operator error)
> 
> i have one (Intel based), 64-bit, fully in spec, which is off by 
> ~3000-4000 cycles. So it happens.

More details?

> I was in fact surprised when i noticed that you removed the 
> unconditional TSC check that i put there years ago 

I removed it because you pointed out that it usually caused
trouble on Intel systems: we would always detect errors due to measurement errors
and then make things worse by trying to fix it.

But you're right it might have been better to keep 
a check with a threshold to catch totally broken cases.

> but which apps are using RDTSC natively? Trapping isnt too good i agree

The only sure way would be to trap+printk -- but from previous
user complaints it's a substantial number.

> - if then we should remove it from the CPU features and hence apps wont 
> (or shouldnt) use it.

I doubt the majority checks any cpu features first ...

> 
> > > nor can the TSC really be synced up properly in the hotplug CPU 
> > > case, after the fact - what if the app already read out an older TSC 
> > > value and a new CPU is added. If the TSC isnt sync on SMP then it 
> > > quickly gets pretty messy, and we should rather take a look at /why/ 
> > > these apps are using RDTSC.
> > 
> > Because gettimeofday is too slow.
> 
> as i indicated it in another discussion, i can fix that. Next patch will 
> be that.

Well, I hope it's not making it HZ resolution. As noted earlier we tried
that already and it didn't work (it violates the "forward monotonicity"
that is commonly expected).

Ok I could imagine it making sense as a new CLOCK_FASTBUTLOUSYRESOLUTION timer in 
clock_gettime() [together with the new vdso fastpath], but not as default.

-Andi

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 20:25   ` Ingo Molnar
  2006-11-24 20:37     ` Andi Kleen
@ 2006-11-24 22:48     ` Ben Greear
  2006-11-25  2:56     ` Wink Saville
  2 siblings, 0 replies; 22+ messages in thread
From: Ben Greear @ 2006-11-24 22:48 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, linux-kernel, Andrew Morton, Arjan van de Ven,
	Thomas Gleixner

Ingo Molnar wrote:
> nor can the TSC really be synced up properly in the hotplug CPU case, 
> after the fact - what if the app already read out an older TSC value and 
> a new CPU is added. If the TSC isnt sync on SMP then it quickly gets 
> pretty messy, and we should rather take a look at /why/ these apps are 
> using RDTSC.
>   

I've been using RDTSC in pktgen.  For my app, I am just trying to find a
quick way to determine if X nanoseconds have elapsed.  If it's
occasionally off by a small amount, that is OK as I have higher-level
control logic that will correct long-term trends.

After poking around, I ended up just exporting sched_clock() and using that
instead of directly using RDTSC, but that was mostly just for convenience.

If RDTSC is not a good choice for the use I describe above, is there a
better method that is still very fast (as compared to do_gettimeofday())?

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com> 
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 20:25   ` Ingo Molnar
  2006-11-24 20:37     ` Andi Kleen
  2006-11-24 22:48     ` Ben Greear
@ 2006-11-25  2:56     ` Wink Saville
  2006-11-25  8:30       ` Arjan van de Ven
  2 siblings, 1 reply; 22+ messages in thread
From: Wink Saville @ 2006-11-25  2:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, linux-kernel, Andrew Morton, Arjan van de Ven,
	Thomas Gleixner

Ingo Molnar wrote:
> * Andi Kleen <ak@suse.de> wrote:
> 
> a new CPU is added. If the TSC isnt sync on SMP then it quickly gets 
> pretty messy, and we should rather take a look at /why/ these apps are 
> using RDTSC.
> 
> 	Ingo
> -

I use RDTSC to get a cheap method of measuring time. What other choices are
there for a low overhead, high frequency time source?

By low overhead I mean that a kernel call is way too expensive; I want to
minimally impact the code and have many of these calls throughout the code.
One of the ways I use it is to instrument multi-threaded applications and
then use the TSC to compare when actions occur between threads, i.e. I use
it as a time stamp counter and neither precision nor accuracy is too
important. On the other hand, the more precise and accurate the better:)

Cheers,

Wink Saville

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-25  2:56     ` Wink Saville
@ 2006-11-25  8:30       ` Arjan van de Ven
  2006-11-25 16:58         ` Wink Saville
  0 siblings, 1 reply; 22+ messages in thread
From: Arjan van de Ven @ 2006-11-25  8:30 UTC (permalink / raw)
  To: Wink Saville
  Cc: Ingo Molnar, Andi Kleen, linux-kernel, Andrew Morton,
	Thomas Gleixner

On Fri, 2006-11-24 at 18:56 -0800, Wink Saville wrote:
> Ingo Molnar wrote:
> > * Andi Kleen <ak@suse.de> wrote:
> > 
> > a new CPU is added. If the TSC isnt sync on SMP then it quickly gets 
> > pretty messy, and we should rather take a look at /why/ these apps are 
> > using RDTSC.
> > 
> > 	Ingo
> > -
> 
> I use RDTSC in get a cheap method of measuring time. What other choices are
> there for a low overhead high frequency time source?
> 
> By low overhead a kernel call is way to expensive, I want to minimally impact
> the code and have many of these calls through out the code. One of the
> ways I use it is to instrument multi-threaded applications and then use
> the TSC to compare when actions occur between threads. i.e. I use it as a
> time stamp counter and neither precision or accuracy is too important.
> On the other hand the more precise and accurate the better:)

so you can live with an occasional jump of seconds/minutes between
threads? Or when a thread moves to another cpu?
(yes on many PCs you won't see minutes, but on others you will)

-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-25  8:30       ` Arjan van de Ven
@ 2006-11-25 16:58         ` Wink Saville
  2006-11-25 17:41           ` Arjan van de Ven
  0 siblings, 1 reply; 22+ messages in thread
From: Wink Saville @ 2006-11-25 16:58 UTC (permalink / raw)
  To: arjan; +Cc: Ingo Molnar, Andi Kleen, linux-kernel, Andrew Morton,
	Thomas Gleixner

Arjan van de Ven wrote:

<snip/>

> 
> so you can live with an occasional jump of seconds/minutes between
> threads? Or when a thread moves to another cpu?
> (yes on many PCs you won't see minutes, but on others you will)
> 

No, that probably wouldn't be acceptable; a few thousand ticks might be OK.
How hard would it be to detect post facto what corrections might be needed,
and when? Maybe an event or FIFO could provide the correction information.

I would also assume we could request that a thread not migrate between CPU's
if it was critical and that could be a solution.

Would another alternative be to allow the thread to indicate when it was
ready to migrate and at these points the correction information could
be supplied/retrieved if desired.

Actually, we need to ask the CPU/System makers to provide a system wide
timer that is independent of the given CPU. I would expect it to be quite
simple for the coming generations of multi-core CPUs to provide a shared TSC.
For the multi-processor embedded work I have done we provided a memory
mapped counter that all processors could read; that worked quite well:)

Wink

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-25 16:58         ` Wink Saville
@ 2006-11-25 17:41           ` Arjan van de Ven
  2006-11-25 20:34             ` Wink Saville
  0 siblings, 1 reply; 22+ messages in thread
From: Arjan van de Ven @ 2006-11-25 17:41 UTC (permalink / raw)
  To: Wink Saville
  Cc: Ingo Molnar, Andi Kleen, linux-kernel, Andrew Morton,
	Thomas Gleixner


> Actually, we need to ask the CPU/System makers to provide a system wide
> timer that is independent of the given CPU. I would expect it quite simple

they exist. They're called pmtimer and hpet.
pmtimer is port io. hpet is memory mapped io.




-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-25 17:41           ` Arjan van de Ven
@ 2006-11-25 20:34             ` Wink Saville
  0 siblings, 0 replies; 22+ messages in thread
From: Wink Saville @ 2006-11-25 20:34 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Andi Kleen, linux-kernel, Andrew Morton,
	Thomas Gleixner

Arjan van de Ven wrote:
>> Actually, we need to ask the CPU/System makers to provide a system wide
>> timer that is independent of the given CPU. I would expect it quite simple
> 
> they exist. They're called pmtimer and hpet.
> pmtimer is port io. hpet is memory mapped io.

Thanks for the info. I took a look at Documentation/hpet.txt and drivers/char/hpet.c
and see that hpet_mmap is implemented in the driver, but nothing in hpet.txt indicates
what is being mapped.

Could you point me to any other documentation? I did find the following:

http://www.intel.com/hardwaredesign/hpetspec_1.pdf

Are you aware of any example user code that uses the mmap capability of hpet?

Thanks,

Wink

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 17:13 ` Andi Kleen
  2006-11-24 20:25   ` Ingo Molnar
@ 2006-11-27 19:00   ` Christoph Lameter
  1 sibling, 0 replies; 22+ messages in thread
From: Christoph Lameter @ 2006-11-27 19:00 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, Arjan van de Ven,
	Thomas Gleixner

On Fri, 24 Nov 2006, Andi Kleen wrote:

> The trouble is that people are using the RDTSC anyways even if the
> kernel doesn't. So some synchronization is probably a good idea.

It is better to simply leave TSC alone if unsynchronized. If TSCs appear 
to be in sync (through some sporadic "synchronization") then people will 
be tempted to use RDTSC because it seems to work. However, RDTSC will 
sporadically yield incoherent values (i.e. time earlier than last TSC 
read) if we have no full synchronization.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-24 17:02 [patch] x86: unify/rewrite SMP TSC sync code Ingo Molnar
  2006-11-24 17:13 ` Andi Kleen
@ 2006-11-29  7:13 ` Ingo Molnar
  1 sibling, 0 replies; 22+ messages in thread
From: Ingo Molnar @ 2006-11-29  7:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton, Arjan van de Ven, Andi Kleen, Thomas Gleixner


* Ingo Molnar <mingo@elte.hu> wrote:

> Andrew,
> 
> could we try this one in -mm? It unifies (and simplifies) the TSC sync 
> code between i386 and x86_64, and also offers a stronger guarantee 
> that we'll only activate the TSC clock on CPU where the TSC is synced 
> correctly by the hardware.

updated patch below. (Mike Galbraith reported that suspend broke on -rt 
kernels, it was due to an __init/__cpuinit mismatch)

	Ingo

------------->
Subject: x86: rewrite SMP TSC sync code
From: Ingo Molnar <mingo@elte.hu>

make the TSC synchronization code more robust, and unify
it between x86_64 and i386.

The biggest change is the removal of the 'fix up TSCs' code
on x86_64 and i386, in some rare cases it was /causing/
time-warps on SMP systems.

The new code only checks for TSC asynchronity - and if it can
prove a time-warp (if it can observe the TSC going backwards
when going from one CPU to another within a critical section),
then the TSC clock-source is turned off.

The TSC synchronization-checking code also got moved into a
separate file.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/i386/kernel/Makefile     |    2 
 arch/i386/kernel/smpboot.c    |  178 ++------------------------------
 arch/i386/kernel/tsc.c        |    4 
 arch/i386/kernel/tsc_sync.c   |    1 
 arch/x86_64/kernel/Makefile   |    2 
 arch/x86_64/kernel/smpboot.c  |  230 ++----------------------------------------
 arch/x86_64/kernel/time.c     |   11 ++
 arch/x86_64/kernel/tsc_sync.c |  187 ++++++++++++++++++++++++++++++++++
 include/asm-i386/tsc.h        |   49 --------
 include/asm-x86_64/proto.h    |    2 
 include/asm-x86_64/timex.h    |   26 ----
 include/asm-x86_64/tsc.h      |   66 ++++++++++++
 12 files changed, 295 insertions(+), 463 deletions(-)

Index: linux/arch/i386/kernel/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/Makefile
+++ linux/arch/i386/kernel/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
-obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
+obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -88,12 +88,6 @@ cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
-/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
- * is no way to resync one AP against BP. TBD: for prescott and above, we
- * should use IA64's algorithm
- */
-static int __devinitdata tsc_sync_disabled;
-
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_data);
@@ -210,151 +204,6 @@ valid_k7:
 	;
 }
 
-/*
- * TSC synchronization.
- *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
- */
-
-static struct {
-	atomic_t start_flag;
-	atomic_t count_start;
-	atomic_t count_stop;
-	unsigned long long values[NR_CPUS];
-} tsc __initdata = {
-	.start_flag = ATOMIC_INIT(0),
-	.count_start = ATOMIC_INIT(0),
-	.count_stop = ATOMIC_INIT(0),
-};
-
-#define NR_LOOPS 5
-
-static void __init synchronize_tsc_bp(void)
-{
-	int i;
-	unsigned long long t0;
-	unsigned long long sum, avg;
-	long long delta;
-	unsigned int one_usec;
-	int buggy = 0;
-
-	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
-
-	/* convert from kcyc/sec to cyc/usec */
-	one_usec = cpu_khz / 1000;
-
-	atomic_set(&tsc.start_flag, 1);
-	wmb();
-
-	/*
-	 * We loop a few times to get a primed instruction cache,
-	 * then the last pass is more or less synchronized and
-	 * the BP and APs set their cycle counters to zero all at
-	 * once. This reduces the chance of having random offsets
-	 * between the processors, and guarantees that the maximum
-	 * delay between the cycle counters is never bigger than
-	 * the latency of information-passing (cachelines) between
-	 * two CPUs.
-	 */
-	for (i = 0; i < NR_LOOPS; i++) {
-		/*
-		 * all APs synchronize but they loop on '== num_cpus'
-		 */
-		while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
-			cpu_relax();
-		atomic_set(&tsc.count_stop, 0);
-		wmb();
-		/*
-		 * this lets the APs save their current TSC:
-		 */
-		atomic_inc(&tsc.count_start);
-
-		rdtscll(tsc.values[smp_processor_id()]);
-		/*
-		 * We clear the TSC in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
-
-		/*
-		 * Wait for all APs to leave the synchronization point:
-		 */
-		while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
-			cpu_relax();
-		atomic_set(&tsc.count_start, 0);
-		wmb();
-		atomic_inc(&tsc.count_stop);
-	}
-
-	sum = 0;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_isset(i, cpu_callout_map)) {
-			t0 = tsc.values[i];
-			sum += t0;
-		}
-	}
-	avg = sum;
-	do_div(avg, num_booting_cpus());
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_isset(i, cpu_callout_map))
-			continue;
-		delta = tsc.values[i] - avg;
-		if (delta < 0)
-			delta = -delta;
-		/*
-		 * We report bigger than 2 microseconds clock differences.
-		 */
-		if (delta > 2*one_usec) {
-			long long realdelta;
-
-			if (!buggy) {
-				buggy = 1;
-				printk("\n");
-			}
-			realdelta = delta;
-			do_div(realdelta, one_usec);
-			if (tsc.values[i] < avg)
-				realdelta = -realdelta;
-
-			if (realdelta)
-				printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
-					"skew, fixed it up.\n", i, realdelta);
-		}
-	}
-	if (!buggy)
-		printk("passed.\n");
-}
-
-static void __init synchronize_tsc_ap(void)
-{
-	int i;
-
-	/*
-	 * Not every cpu is online at the time
-	 * this gets called, so we first wait for the BP to
-	 * finish SMP initialization:
-	 */
-	while (!atomic_read(&tsc.start_flag))
-		cpu_relax();
-
-	for (i = 0; i < NR_LOOPS; i++) {
-		atomic_inc(&tsc.count_start);
-		while (atomic_read(&tsc.count_start) != num_booting_cpus())
-			cpu_relax();
-
-		rdtscll(tsc.values[smp_processor_id()]);
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
-
-		atomic_inc(&tsc.count_stop);
-		while (atomic_read(&tsc.count_stop) != num_booting_cpus())
-			cpu_relax();
-	}
-}
-#undef NR_LOOPS
-
 extern void calibrate_delay(void);
 
 static atomic_t init_deasserted;
@@ -440,12 +289,6 @@ static void __devinit smp_callin(void)
 	 * Allow the master to continue.
 	 */
 	cpu_set(cpuid, cpu_callin_map);
-
-	/*
-	 *      Synchronize the TSC with the BP
-	 */
-	if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
-		synchronize_tsc_ap();
 }
 
 static int cpucount;
@@ -545,6 +388,11 @@ static void __devinit start_secondary(vo
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
 		rep_nop();
+	/*
+	 * Check TSC synchronization with the BP:
+	 */
+	check_tsc_sync_target();
+
 	setup_secondary_APIC_clock();
 	if (nmi_watchdog == NMI_IO_APIC) {
 		disable_8259A_irq(0);
@@ -1091,8 +939,6 @@ static int __cpuinit __smp_prepare_cpu(i
 	info.cpu = cpu;
 	INIT_WORK(&task, do_warm_boot_cpu, &info);
 
-	tsc_sync_disabled = 1;
-
 	/* init low mem mapping */
 	clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
 			KERNEL_PGD_PTRS);
@@ -1100,7 +946,6 @@ static int __cpuinit __smp_prepare_cpu(i
 	schedule_work(&task);
 	wait_for_completion(&done);
 
-	tsc_sync_disabled = 0;
 	zap_low_mappings();
 	ret = 0;
 exit:
@@ -1316,12 +1161,6 @@ static void __init smp_boot_cpus(unsigne
 	smpboot_setup_io_apic();
 
 	setup_boot_APIC_clock();
-
-	/*
-	 * Synchronize the TSC with the AP
-	 */
-	if (cpu_has_tsc && cpucount && cpu_khz)
-		synchronize_tsc_bp();
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
@@ -1456,9 +1295,16 @@ int __devinit __cpu_up(unsigned int cpu)
 	}
 
 	local_irq_enable();
+
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Unleash the CPU! */
 	cpu_set(cpu, smp_commenced_mask);
+
+	/*
+	 * Check TSC synchronization with the AP:
+	 */
+	check_tsc_sync_source(cpu);
+
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
 	return 0;
Index: linux/arch/i386/kernel/tsc.c
===================================================================
--- linux.orig/arch/i386/kernel/tsc.c
+++ linux/arch/i386/kernel/tsc.c
@@ -434,8 +434,10 @@ out:
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
  */
-static __init int unsynchronized_tsc(void)
+__cpuinit int unsynchronized_tsc(void)
 {
+	if (!cpu_has_tsc || tsc_unstable)
+		return 1;
 	/*
 	 * Intel systems are normally all synchronized.
 	 * Exceptions must mark TSC as unstable:
Index: linux/arch/i386/kernel/tsc_sync.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/tsc_sync.c
@@ -0,0 +1 @@
+#include "../../x86_64/kernel/tsc_sync.c"
Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile
+++ linux/arch/x86_64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI)		+= acpi/
 obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
-obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o
+obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o tsc_sync.o
 obj-y				+= apic.o  nmi.o
 obj-y				+= io_apic.o mpparse.o \
 		genapic.o genapic_cluster.o genapic_flat.o
Index: linux/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smpboot.c
+++ linux/arch/x86_64/kernel/smpboot.c
@@ -147,217 +147,6 @@ static void __cpuinit smp_store_cpu_info
 	print_cpu_info(c);
 }
 
-/*
- * New Funky TSC sync algorithm borrowed from IA64.
- * Main advantage is that it doesn't reset the TSCs fully and
- * in general looks more robust and it works better than my earlier
- * attempts. I believe it was written by David Mosberger. Some minor
- * adjustments for x86-64 by me -AK
- *
- * Original comment reproduced below.
- *
- * Synchronize TSC of the current (slave) CPU with the TSC of the
- * MASTER CPU (normally the time-keeper CPU).  We use a closed loop to
- * eliminate the possibility of unaccounted-for errors (such as
- * getting a machine check in the middle of a calibration step).  The
- * basic idea is for the slave to ask the master what itc value it has
- * and to read its own itc before and after the master responds.  Each
- * iteration gives us three timestamps:
- *
- *	slave		master
- *
- *	t0 ---\
- *             ---\
- *		   --->
- *			tm
- *		   /---
- *	       /---
- *	t1 <---
- *
- *
- * The goal is to adjust the slave's TSC such that tm falls exactly
- * half-way between t0 and t1.  If we achieve this, the clocks are
- * synchronized provided the interconnect between the slave and the
- * master is symmetric.  Even if the interconnect were asymmetric, we
- * would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us
- * synchronize the TSC to within one or two cycles.  However, we can
- * only *guarantee* that the synchronization is accurate to within a
- * round-trip time, which is typically in the range of several hundred
- * cycles (e.g., ~500 cycles).  In practice, this means that the TSCs
- * are usually almost perfectly synchronized, but we shouldn't assume
- * that the accuracy is much better than half a micro second or so.
- *
- * [there are other errors like the latency of RDTSC and of the
- * WRMSR. These can also account to hundreds of cycles. So it's
- * probably worse. It claims 153 cycles error on a dual Opteron,
- * but I suspect the numbers are actually somewhat worse -AK]
- */
-
-#define MASTER	0
-#define SLAVE	(SMP_CACHE_BYTES/8)
-
-/* Intentionally don't use cpu_relax() while TSC synchronization
-   because we don't want to go into funky power save modi or cause
-   hypervisors to schedule us away.  Going to sleep would likely affect
-   latency and low latency is the primary objective here. -AK */
-#define no_cpu_relax() barrier()
-
-static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static volatile __cpuinitdata unsigned long go[SLAVE + 1];
-static int notscsync __cpuinitdata;
-
-#undef DEBUG_TSC_SYNC
-
-#define NUM_ROUNDS	64	/* magic value */
-#define NUM_ITERS	5	/* likewise */
-
-/* Callback on boot CPU */
-static __cpuinit void sync_master(void *arg)
-{
-	unsigned long flags, i;
-
-	go[MASTER] = 0;
-
-	local_irq_save(flags);
-	{
-		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
-			while (!go[MASTER])
-				no_cpu_relax();
-			go[MASTER] = 0;
-			rdtscll(go[SLAVE]);
-		}
-	}
-	local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our tsc differs from the tsc
- * on the master (time-keeper) CPU.  A positive number indicates our
- * tsc is ahead of the master, negative that it is behind.
- */
-static inline long
-get_delta(long *rt, long *master)
-{
-	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
-	unsigned long tcenter, t0, t1, tm;
-	int i;
-
-	for (i = 0; i < NUM_ITERS; ++i) {
-		rdtscll(t0);
-		go[MASTER] = 1;
-		while (!(tm = go[SLAVE]))
-			no_cpu_relax();
-		go[SLAVE] = 0;
-		rdtscll(t1);
-
-		if (t1 - t0 < best_t1 - best_t0)
-			best_t0 = t0, best_t1 = t1, best_tm = tm;
-	}
-
-	*rt = best_t1 - best_t0;
-	*master = best_tm - best_t0;
-
-	/* average best_t0 and best_t1 without overflow: */
-	tcenter = (best_t0/2 + best_t1/2);
-	if (best_t0 % 2 + best_t1 % 2 == 2)
-		++tcenter;
-	return tcenter - best_tm;
-}
-
-static __cpuinit void sync_tsc(unsigned int master)
-{
-	int i, done = 0;
-	long delta, adj, adjust_latency = 0;
-	unsigned long flags, rt, master_time_stamp, bound;
-#ifdef DEBUG_TSC_SYNC
-	static struct syncdebug {
-		long rt;	/* roundtrip time */
-		long master;	/* master's timestamp */
-		long diff;	/* difference between midpoint and master's timestamp */
-		long lat;	/* estimate of tsc adjustment latency */
-	} t[NUM_ROUNDS] __cpuinitdata;
-#endif
-
-	printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
-		smp_processor_id(), master);
-
-	go[MASTER] = 1;
-
-	/* It is dangerous to broadcast IPI as cpus are coming up,
-	 * as they may not be ready to accept them.  So since
-	 * we only need to send the ipi to the boot cpu direct
-	 * the message, and avoid the race.
-	 */
-	smp_call_function_single(master, sync_master, NULL, 1, 0);
-
-	while (go[MASTER])	/* wait for master to be ready */
-		no_cpu_relax();
-
-	spin_lock_irqsave(&tsc_sync_lock, flags);
-	{
-		for (i = 0; i < NUM_ROUNDS; ++i) {
-			delta = get_delta(&rt, &master_time_stamp);
-			if (delta == 0) {
-				done = 1;	/* let's lock on to this... */
-				bound = rt;
-			}
-
-			if (!done) {
-				unsigned long t;
-				if (i > 0) {
-					adjust_latency += -delta;
-					adj = -delta + adjust_latency/4;
-				} else
-					adj = -delta;
-
-				rdtscll(t);
-				wrmsrl(MSR_IA32_TSC, t + adj);
-			}
-#ifdef DEBUG_TSC_SYNC
-			t[i].rt = rt;
-			t[i].master = master_time_stamp;
-			t[i].diff = delta;
-			t[i].lat = adjust_latency/4;
-#endif
-		}
-	}
-	spin_unlock_irqrestore(&tsc_sync_lock, flags);
-
-#ifdef DEBUG_TSC_SYNC
-	for (i = 0; i < NUM_ROUNDS; ++i)
-		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
-		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
-	printk(KERN_INFO
-	       "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
-	       "maxerr %lu cycles)\n",
-	       smp_processor_id(), master, delta, rt);
-}
-
-static void __cpuinit tsc_sync_wait(void)
-{
-	/*
-	 * When the CPU has synchronized TSCs assume the BIOS
-  	 * or the hardware already synced.  Otherwise we could
-	 * mess up a possible perfect synchronization with a
-	 * not-quite-perfect algorithm.
-	 */
-	if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
-		return;
-	sync_tsc(0);
-}
-
-static __init int notscsync_setup(char *s)
-{
-	notscsync = 1;
-	return 1;
-}
-__setup("notscsync", notscsync_setup);
-
 static atomic_t init_deasserted __cpuinitdata;
 
 /*
@@ -545,6 +334,11 @@ void __cpuinit start_secondary(void)
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
 	barrier();
 
+	/*
+  	 * Check TSC sync first:
+ 	 */
+	check_tsc_sync_target();
+
 	Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); 	
 	setup_secondary_APIC_clock();
 
@@ -564,14 +358,6 @@ void __cpuinit start_secondary(void)
 	 */
 	set_cpu_sibling_map(smp_processor_id());
 
-	/* 
-  	 * Wait for TSC sync to not schedule things before.
-	 * We still process interrupts, which could see an inconsistent
-	 * time in that window unfortunately. 
-	 * Do this here because TSC sync has global unprotected state.
- 	 */
-	tsc_sync_wait();
-
 	/*
 	 * We need to hold call_lock, so there is no inconsistency
 	 * between the time smp_call_function() determines number of
@@ -591,6 +377,7 @@ void __cpuinit start_secondary(void)
 	cpu_set(smp_processor_id(), cpu_online_map);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	spin_unlock(&vector_lock);
+
 	unlock_ipi_call_lock();
 
 	cpu_idle();
@@ -1165,6 +952,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	/* Unleash the CPU! */
 	Dprintk("waiting for cpu %d\n", cpu);
 
+	/*
+  	 * Make sure and check TSC sync:
+ 	 */
+	check_tsc_sync_source(cpu);
+
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
 	err = 0;
Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -922,12 +922,23 @@ void __init time_init(void)
 #endif
 }
 
+static int tsc_unstable = 0;
+
+void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
 /*
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
  */
 __cpuinit int unsynchronized_tsc(void)
 {
+	if (tsc_unstable)
+		return 1;
+
 #ifdef CONFIG_SMP
 	if (apic_is_clustered_box())
 		return 1;
Index: linux/arch/x86_64/kernel/tsc_sync.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
+/*
+ * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ *   protects against more than 2 CPUs entering this code. )
+ */
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/tsc.h>
+
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static __cpuinitdata atomic_t start_count;
+static __cpuinitdata atomic_t stop_count;
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static __cpuinitdata cycles_t last_tsc;
+static __cpuinitdata cycles_t max_warp;
+static __cpuinitdata int nr_warps;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+static __cpuinit void check_tsc_warp(void)
+{
+	cycles_t start, now, prev, end;
+	int i;
+
+	start = get_cycles_sync();
+	/*
+	 * The measurement runs for 20 msecs:
+	 */
+	end = start + cpu_khz * 20ULL;
+	now = start;
+
+	for (i = 0; ; i++) {
+		/*
+		 * We take the global lock, measure TSC, save the
+		 * previous TSC that was measured (possibly on
+		 * another CPU) and update the previous TSC timestamp.
+		 */
+		__raw_spin_lock(&sync_lock);
+		prev = last_tsc;
+		now = get_cycles_sync();
+		last_tsc = now;
+		__raw_spin_unlock(&sync_lock);
+
+		/*
+		 * Be nice every now and then (and also check whether
+		 * measurement is done [we also insert a 100 million
+		 * loops safety exit, so we don't lock up in case the
+		 * TSC readout is totally broken]):
+		 */
+		if (unlikely(!(i & 7))) {
+			if (now > end || i > 100000000)
+				break;
+			cpu_relax();
+			touch_nmi_watchdog();
+		}
+		/*
+		 * Outside the critical section we can now see whether
+		 * we saw a time-warp of the TSC going backwards:
+		 */
+		if (unlikely(prev > now)) {
+			__raw_spin_lock(&sync_lock);
+			max_warp = max(max_warp, prev - now);
+			nr_warps++;
+			__raw_spin_unlock(&sync_lock);
+		}
+
+	}
+}
+
+/*
+ * Source CPU calls into this - it waits for the freshly booted
+ * target CPU to arrive and then starts the measurement:
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+	int cpus = 2;
+
+	/*
+	 * No need to check if we already know that the TSC is not
+	 * synchronized:
+	 */
+	if (unsynchronized_tsc())
+		return;
+
+	printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+			  smp_processor_id(), cpu);
+
+	/*
+	 * Reset it - in case this is a second bootup:
+	 */
+	atomic_set(&stop_count, 0);
+
+	/*
+	 * Wait for the target to arrive:
+	 */
+	while (atomic_read(&start_count) != cpus-1)
+		cpu_relax();
+	/*
+	 * Trigger the target to continue into the measurement too:
+	 */
+	atomic_inc(&start_count);
+
+	check_tsc_warp();
+
+	while (atomic_read(&stop_count) != cpus-1)
+		cpu_relax();
+
+	/*
+	 * Reset it - just in case we boot another CPU later:
+	 */
+	atomic_set(&start_count, 0);
+
+	if (nr_warps) {
+		printk("\n");
+		printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
+				    " turning off TSC clock.\n", max_warp);
+		mark_tsc_unstable();
+		nr_warps = 0;
+		max_warp = 0;
+		last_tsc = 0;
+	} else {
+		printk(" passed.\n");
+	}
+
+	/*
+	 * Let the target continue with the bootup:
+	 */
+	atomic_inc(&stop_count);
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+	int cpus = 2;
+
+	if (unsynchronized_tsc())
+		return;
+
+	/*
+	 * Register this CPU's participation and wait for the
+	 * source CPU to start the measurement:
+	 */
+	atomic_inc(&start_count);
+	while (atomic_read(&start_count) != cpus)
+		cpu_relax();
+
+	check_tsc_warp();
+
+	/*
+	 * Ok, we are done:
+	 */
+	atomic_inc(&stop_count);
+
+	/*
+	 * Wait for the source CPU to print stuff:
+	 */
+	while (atomic_read(&stop_count) != cpus)
+		cpu_relax();
+}
+#undef NR_LOOPS
+
Index: linux/include/asm-i386/tsc.h
===================================================================
--- linux.orig/include/asm-i386/tsc.h
+++ linux/include/asm-i386/tsc.h
@@ -1,48 +1 @@
-/*
- * linux/include/asm-i386/tsc.h
- *
- * i386 TSC related functions
- */
-#ifndef _ASM_i386_TSC_H
-#define _ASM_i386_TSC_H
-
-#include <asm/processor.h>
-
-/*
- * Standard way to access the cycle counter on i586+ CPUs.
- * Currently only used on SMP.
- *
- * If you really have a SMP machine with i486 chips or older,
- * compile for that, and this will just always return zero.
- * That's ok, it just means that the nicer scheduling heuristics
- * won't work for you.
- *
- * We only use the low 32 bits, and we'd simply better make sure
- * that we reschedule before that wraps. Scheduling at least every
- * four billion cycles just basically sounds like a good idea,
- * regardless of how fast the machine is.
- */
-typedef unsigned long long cycles_t;
-
-extern unsigned int cpu_khz;
-extern unsigned int tsc_khz;
-
-static inline cycles_t get_cycles(void)
-{
-	unsigned long long ret = 0;
-
-#ifndef CONFIG_X86_TSC
-	if (!cpu_has_tsc)
-		return 0;
-#endif
-
-#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
-	rdtscll(ret);
-#endif
-	return ret;
-}
-
-extern void tsc_init(void);
-extern void mark_tsc_unstable(void);
-
-#endif
+#include <asm-x86_64/tsc.h>
Index: linux/include/asm-x86_64/proto.h
===================================================================
--- linux.orig/include/asm-x86_64/proto.h
+++ linux/include/asm-x86_64/proto.h
@@ -92,8 +92,6 @@ extern void check_efer(void);
 
 extern int unhandled_signal(struct task_struct *tsk, int sig);
 
-extern int unsynchronized_tsc(void);
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
 extern unsigned long table_start, table_end;
Index: linux/include/asm-x86_64/timex.h
===================================================================
--- linux.orig/include/asm-x86_64/timex.h
+++ linux/include/asm-x86_64/timex.h
@@ -12,35 +12,11 @@
 #include <asm/hpet.h>
 #include <asm/system.h>
 #include <asm/processor.h>
+#include <asm/tsc.h>
 #include <linux/compiler.h>
 
 #define CLOCK_TICK_RATE	PIT_TICK_RATE	/* Underlying HZ */
 
-typedef unsigned long long cycles_t;
-
-static inline cycles_t get_cycles (void)
-{
-	unsigned long long ret;
-
-	rdtscll(ret);
-	return ret;
-}
-
-/* Like get_cycles, but make sure the CPU is synchronized. */
-static __always_inline cycles_t get_cycles_sync(void)
-{
-	unsigned long long ret;
-	unsigned eax;
-	/* Don't do an additional sync on CPUs where we know
-	   RDTSC is already synchronous. */
-	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
-			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
-	rdtscll(ret);
-	return ret;
-}
-
-extern unsigned int cpu_khz;
-
 extern int read_current_timer(unsigned long *timer_value);
 #define ARCH_HAS_READ_CURRENT_TIMER	1
 
Index: linux/include/asm-x86_64/tsc.h
===================================================================
--- /dev/null
+++ linux/include/asm-x86_64/tsc.h
@@ -0,0 +1,66 @@
+/*
+ * linux/include/asm-x86_64/tsc.h
+ *
+ * x86_64 TSC related functions
+ */
+#ifndef _ASM_x86_64_TSC_H
+#define _ASM_x86_64_TSC_H
+
+#include <asm/processor.h>
+
+/*
+ * Standard way to access the cycle counter.
+ */
+typedef unsigned long long cycles_t;
+
+extern unsigned int cpu_khz;
+extern unsigned int tsc_khz;
+
+static inline cycles_t get_cycles(void)
+{
+	unsigned long long ret = 0;
+
+#ifndef CONFIG_X86_TSC
+	if (!cpu_has_tsc)
+		return 0;
+#endif
+
+#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
+	rdtscll(ret);
+#endif
+	return ret;
+}
+
+/* Like get_cycles, but make sure the CPU is synchronized. */
+static __always_inline cycles_t get_cycles_sync(void)
+{
+	unsigned long long ret;
+#ifdef X86_FEATURE_SYNC_RDTSC
+	unsigned eax;
+
+	/*
+	 * Don't do an additional sync on CPUs where we know
+	 * RDTSC is already synchronous:
+	 */
+	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
+			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
+#else
+	sync_core();
+#endif
+	rdtscll(ret);
+
+	return ret;
+}
+
+extern void tsc_init(void);
+extern void mark_tsc_unstable(void);
+extern int unsynchronized_tsc(void);
+
+/*
+ * Boot-time check whether the TSCs are synchronized across
+ * all CPUs/cores:
+ */
+extern void check_tsc_sync_source(int cpu);
+extern void check_tsc_sync_target(void);
+
+#endif

^ permalink raw reply	[flat|nested] 22+ messages in thread

[parent not found: <fa./NRPJg+JjfSQLUVwnX1GpHGIojQ@ifi.uio.no>]

[parent not found: <fa.Y0RKABHd+7qnbGQYBAGPvlJ0Qic@ifi.uio.no>]

[parent not found: <fa.fD3WSpNqEJ4736vYzEak5Gf3xTw@ifi.uio.no>]

[parent not found: <fa.A+gkQAO1DLThaxJxPLPl3yE1CGo@ifi.uio.no>]

[parent not found: <fa.INurNKWdUKAEULTHyfpSW65a/Ng@ifi.uio.no>]

[parent not found: <fa.n9vySiI9RS2MCl0DZPDzxZEPiFw@ifi.uio.no>]

* Re: [patch] x86: unify/rewrite SMP TSC sync code
       [not found]         ` <fa.n9vySiI9RS2MCl0DZPDzxZEPiFw@ifi.uio.no>
@ 2006-11-26  7:20           ` Robert Hancock
  2006-11-26  8:16             ` Wink Saville
  0 siblings, 1 reply; 22+ messages in thread
From: Robert Hancock @ 2006-11-26  7:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Wink Saville, Arjan van de Ven

Wink Saville wrote:
> Arjan van de Ven wrote:
>>> Actually, we need to ask the CPU/System makers to provide a system wide
>>> timer that is independent of the given CPU. I would expect it quite 
>>> simple
>>
>> they exist. They're called pmtimer and hpet.
>> pmtimer is port io. hpet is memory mapped io.
> 
> Thanks for the info. I took a look at Documentation/hpet.txt and 
> drivers/char/hpet.c
> and see that hpet_mmap is implemented in the driver, but nothing in
> hpet.txt indicates what is being mapped.
> 
> Could you point me to any other documentation? I did find the following:
> 
> http://www.intel.com/hardwaredesign/hpetspec_1.pdf
> 
> Are you aware of any example user code that uses the mmap capability of 
> hpet?

Generally user mode code should just be using gettimeofday. When the TSC 
is usable as a sane time source, the kernel will use it. When it's not, 
it will use something else like the HPET, ACPI PM Timer or (as a last 
resort) the PIT, in increasing degrees of slowness.

-- 
Robert Hancock      Saskatoon, SK, Canada
To email, remove "nospam" from hancockr@nospamshaw.ca
Home Page: http://www.roberthancock.com/


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-26  7:20           ` Robert Hancock
@ 2006-11-26  8:16             ` Wink Saville
  2006-11-26  8:24               ` Arjan van de Ven
  0 siblings, 1 reply; 22+ messages in thread
From: Wink Saville @ 2006-11-26  8:16 UTC (permalink / raw)
  To: Robert Hancock; +Cc: linux-kernel, Arjan van de Ven

Robert Hancock wrote:
>>>> Actually, we need to ask the CPU/System makers to provide a system wide
> Generally user mode code should just be using gettimeofday. When the TSC 
> is usable as a sane time source, the kernel will use it. When it's not, 
> it will use something else like the HPET, ACPI PM Timer or (at last 
> resort) the PIT, in increasing degrees of slowness.
> 

But gettimeofday is much too expensive compared to RDTSC.

Wink


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-26  8:16             ` Wink Saville
@ 2006-11-26  8:24               ` Arjan van de Ven
  2006-11-26 19:48                 ` Wink Saville
  0 siblings, 1 reply; 22+ messages in thread
From: Arjan van de Ven @ 2006-11-26  8:24 UTC (permalink / raw)
  To: Wink Saville; +Cc: Robert Hancock, linux-kernel

On Sun, 2006-11-26 at 00:16 -0800, Wink Saville wrote:
> Robert Hancock wrote:
> >>>> Actually, we need to ask the CPU/System makers to provide a system wide
> > Generally user mode code should just be using gettimeofday. When the TSC 
> > is usable as a sane time source, the kernel will use it. When it's not, 
> > it will use something else like the HPET, ACPI PM Timer or (at last 
> > resort) the PIT, in increasing degrees of slowness.
> > 
> 
> But gettimeofday is much too expensive compared to RDTSC.

it's the cost of a syscall (1000 cycles?) plus what it takes to get a
reasonable time estimate. Assuming your kernel has enough time support
AND your tsc is reasonably ok, it'll be using that. If it's NOT using
that then that's a pretty good sign that you can't also use it in
userspace....

-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-26  8:24               ` Arjan van de Ven
@ 2006-11-26 19:48                 ` Wink Saville
  2006-11-27  7:51                   ` Arjan van de Ven
  0 siblings, 1 reply; 22+ messages in thread
From: Wink Saville @ 2006-11-26 19:48 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Robert Hancock, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1468 bytes --]

Arjan van de Ven wrote:
> it's the cost of a syscall (1000 cycles?) plus what it takes to get a
> reasonable time estimate. Assuming your kernel has enough time support
> AND your tsc is reasonably ok, it'll be using that. If it's NOT using
> that then that's a pretty good sign that you can't also use it in
> userspace....
> 

I wrote a quick and dirty program that I've attached to test the cost
difference between RDTSC and gettimeofday (gtod), the results:

wink@winkc2d1:~/linux/linux-2.6/test/rdtsc-pref$ time ./rdtsc-pref 100000000
rdtsc:   average ticks=  65
gtod:    average ticks= 222
gtod_us: average ticks= 232

real    0m36.002s
user    0m35.997s
sys     0m0.000s

About a 3.5x cost difference; still, for most of my uses gtod was not as
costly as I had supposed.

But there are other uses for which it wouldn't be acceptable. For instance, I
have used a memory mapped time stamp counter in an embedded ARM based
system for instrumenting the interrupt service routine, syscalls
and task switches. For this type of instrumentation a gtod type call wouldn't
have been suitable.

Anyway for x86_64 systems, if I can use a memory mapped HPET counter, I might be
able to have my cake and eat it too. One counter that can be used inside and
outside the kernel that is cheap, precise and accurate, nirvana! We'll have to see.

BTW my system is a 2.4ghz Core 2 duo running 2.6.19-rc6 with HPET enabled,
in the attachment I've included my config file.

Cheers,

Wink

[-- Attachment #2: rdtsc-pref.tgz --]
[-- Type: application/octet-stream, Size: 9522 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-26 19:48                 ` Wink Saville
@ 2006-11-27  7:51                   ` Arjan van de Ven
  2006-11-27  9:15                     ` Wink Saville
  0 siblings, 1 reply; 22+ messages in thread
From: Arjan van de Ven @ 2006-11-27  7:51 UTC (permalink / raw)
  To: Wink Saville; +Cc: Robert Hancock, linux-kernel

On Sun, 2006-11-26 at 11:48 -0800, Wink Saville wrote:
> Arjan van de Ven wrote:
> > it's the cost of a syscall (1000 cycles?) plus what it takes to get a
> > reasonable time estimate. Assuming your kernel has enough time support
> > AND your tsc is reasonably ok, it'll be using that. If it's NOT using
> > that then that's a pretty good sign that you can't also use it in
> > userspace....
> > 
> 
> I wrote a quick and dirty program that I've attached to test the cost
> difference between RDTSC and gettimeofday (gtod), the results:
> 
> wink@winkc2d1:~/linux/linux-2.6/test/rdtsc-pref$ time ./rdtsc-pref 100000000
> rdtsc:   average ticks=  65
> gtod:    average ticks= 222
> gtod_us: average ticks= 232

just to make sure, you do realize that when you write "ticks" that rdtsc
doesn't measure cpu clock ticks or cpu cycles anymore, right? (At least
not on your machine)


> But, there are other uses that it wouldn't be acceptable. For instance, I
> have used a memory mapped time stamp counter in an embedded ARM based

ARM is a different animal; generally on such embedded system you know a
lot better if you have a reliable and userspace-useful tick counter like
this....

-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-27  7:51                   ` Arjan van de Ven
@ 2006-11-27  9:15                     ` Wink Saville
  0 siblings, 0 replies; 22+ messages in thread
From: Wink Saville @ 2006-11-27  9:15 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Robert Hancock, linux-kernel

Arjan van de Ven wrote:

> just to make sure, you do realize that when you write "ticks" that rdtsc
> doesn't measure cpu clock ticks or cpu cycles anymore, right? (At least
> not on your machine)
> 

Yes, that's why I wrote ticks and not cycles. At this point I'm not sure how to convert
ticks to time on my machine, something else to learn:) Hopefully the HPET
will resolve all of the issues.

Wink


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
@ 2006-11-27 17:23 Robert Crocombe
  2006-11-27 18:41 ` Max Krasnyansky
  0 siblings, 1 reply; 22+ messages in thread
From: Robert Crocombe @ 2006-11-27 17:23 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1309 bytes --]

The difference that Wink reports is tiny compared to that measured on
my Opteron machines:

dual (2.6.17):

rcrocomb@netfires-aptest:cyclecounter_test$ ./rdtsc-pref 1000000
rdtsc:   average ticks=  10
gtod:    average ticks=4296
gtod_us: average ticks=4328

quad (2.6.16-rt29):

rcrocomb@spanky:wink_saville_test$ ./rdtsc-pref 1000000
rdtsc:   average ticks=  10
gtod:    average ticks=5688
gtod_us: average ticks=5711

I have my own little test that I'll attach, but it gives a similar
result.  Here are the results from the 2x box:

rcrocomb@netfires-aptest:cyclecounter_test$ ./timing
Using the cycle counter
Calibrated timer as 2593081969.758825 Hz
4194304 iterations in 0.016 seconds is 0.004 useconds per iteration.

rcrocomb@netfires-aptest:cyclecounter_test$ ./timing_gettimeofday
Using gettimeofday
4194304 iterations in 6.793 seconds is 1.620 useconds per iteration.

I have used the pthread affinity and/or cpuset, etc. mechanisms to try
and inject some reliability into the measurement.

Using gtod() can amount to a substantial disturbance of the thing to
be measured.  Using rdtsc, things seem reliable so far, and we have an
FPGA (accessed through the PCI bus) that has been programmed to give
access to an 8MHz clock and we do some checks against that.

-- 
Robert Crocombe
rcrocomb@gmail.com

[-- Attachment #2: timing.cpp --]
[-- Type: text/x-c++src, Size: 3566 bytes --]

#include <stdio.h>                                  // printf()
#include <stdint.h>                                 // uint64_t
#include <stdlib.h>                                 // drand48()
#include <sys/select.h>                             // select()
#include <sys/time.h>                               // gettimeofday
#include <time.h>
#include <asm-x86_64/msr.h>                         // rdtscll()


////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////

enum
{
    ITERATIONS  = 1 << 22
};

static double seconds_per_tick;

////////////////////////////////////////////////////////////////////////////////
// Prototypes
////////////////////////////////////////////////////////////////////////////////

double gimme_timeofday(void);
double get_time(void);

void selectsleep(unsigned us);
void init(void);

////////////////////////////////////////////////////////////////////////////////
// Definitions
////////////////////////////////////////////////////////////////////////////////

/**
    Read the wall clock with gettimeofday() and return it as a single
    floating-point number of seconds: the tv_sec field plus the tv_usec
    field scaled down to a fraction of a second.
*/
double
gimme_timeofday(void)
{
    struct timeval now;

    gettimeofday(&now, 0);

    const double whole_seconds = now.tv_sec;
    const double fraction      = 1e-6 * now.tv_usec;

    return whole_seconds + fraction;
}


/**
    Read the cycle counter with rdtscll() and scale the raw tick count by
    seconds_per_tick.  That factor is only assigned inside init(), so the
    result is meaningful only once init() has run.
*/
double
get_time(void)
{
    uint64_t ticks;

    rdtscll(ticks);

    const double seconds = ticks * seconds_per_tick;
    return seconds;
}


/**
    A good way to simply hang around doing nothing for awhile.

    Calls select() with no file descriptors and a timeout of `us`
    microseconds.  The request is split into whole seconds and the
    remaining microseconds so that tv_usec always stays below one million;
    POSIX allows select() to reject larger tv_usec values with EINVAL.

    The return value of select() is ignored, so the wait is best-effort.
*/

void
selectsleep(unsigned us)
{
	struct timeval tv;

	tv.tv_sec = us / 1000000;
	tv.tv_usec = us % 1000000;
	select(0, 0, 0, 0, &tv);
}

/**
    Figure out how fast rdtscll() ticks.  This should be equal to the
    frequency of the clock on the processor.  Here's the bad news: I don't
    know if rdtscll() always uses the same processor so it may very well be
    necessary to set a processor affinity to get really good results over
    time.

    Method: take 30 samples; for each one, record gettimeofday() and the
    cycle counter, sleep for a pseudo-random 10-210 ms, then record both
    again.  A least-squares fit of elapsed ticks against elapsed seconds
    gives the slope in ticks per second.  The slope is printed and its
    reciprocal is stored in seconds_per_tick.

    This piece of code by Mark Hahn from brain.mcmaster.ca/~hahn/.
*/

void
init(void)
{
	double sumx = 0;
	double sumy = 0;
	double sumxx = 0;
	double sumxy = 0;
	double slope;

	// least squares linear regression of ticks onto real time
	// as returned by gettimeofday.

	const unsigned n = 30;

	for (unsigned int i = 0; i < n; ++i)
	{
		double breal, real, ticks;
		uint64_t aticks, bticks;

		// Wall clock first, then the counter ...
		breal = gimme_timeofday();
		rdtscll(bticks);

		selectsleep((unsigned)(10000 + drand48() * 200000));

		// ... and the counter first, then the wall clock, so the
		// tick interval sits inside the wall-clock interval.
		rdtscll(aticks);
		ticks = aticks - bticks;
		real = gimme_timeofday() - breal;

		sumx += real;
		sumxx += real * real;
		sumxy += real * ticks;
		sumy += ticks;
	}

	slope = ((sumxy - (sumx*sumy) / n) / (sumxx - (sumx*sumx) / n));
	seconds_per_tick = 1.0 / slope;

	printf("Calibrated timer as %.6f Hz\n", slope);
}

/**
    Time ITERATIONS back-to-back clock reads and report the average cost
    of one read in microseconds.  The clock under test is chosen at
    compile time by the #if below; as written, the gettimeofday() branch
    is the one that gets built.  The start and end of the run are always
    taken with gimme_timeofday().
*/
int
main(int argc, char *argv[])
{
    double began;
    double finished;

    printf("Doing stuff\n");

#if 0   // Using rdtscll()
    printf("Using the cycle counter\n");
    init();
    began = gimme_timeofday();
    for (unsigned int remaining = ITERATIONS; remaining > 0; --remaining)
    {
        double sample;
        sample = get_time();
    }
    finished = gimme_timeofday();
#else   // using gettimeofday()
    printf("Using gettimeofday\n");
    began = gimme_timeofday();
    for (unsigned int remaining = ITERATIONS; remaining > 0; --remaining)
    {
        double sample;
        sample = gimme_timeofday();
    }
    finished = gimme_timeofday();
#endif

    const double elapsed = finished - began;
    const double usec_per_iteration = (elapsed / ITERATIONS) * 1e6;

    printf("%u iterations in %.3f seconds is %.3f useconds per iteration.\n",
          ITERATIONS, elapsed, usec_per_iteration);

    printf("Done\n");
    return 0;
}

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] x86: unify/rewrite SMP TSC sync code
  2006-11-27 17:23 Robert Crocombe
@ 2006-11-27 18:41 ` Max Krasnyansky
  0 siblings, 0 replies; 22+ messages in thread
From: Max Krasnyansky @ 2006-11-27 18:41 UTC (permalink / raw)
  To: Robert Crocombe; +Cc: linux-kernel

> Using gtod() can amount to a substantial disturbance of the thing to
> be measured.  Using rdtsc, things seem reliable so far, and we have an
> FPGA (accessed through the PCI bus) that has been programmed to give
> access to an 8MHz clock and we do some checks against that.

Same here. gettimeofday() is way too slow (dual Opteron box) for the
frequency I need to call it at. HPET is not available. But TSC is doing just
fine. Plus in my case I don't care about sync between CPUs (the thread that
uses the TSC is running on an isolated CPU) and I have an external time
source that takes care of the drift.

So please no trapping of the RDTSC. Making it clear (bold kernel message during
boot :-) that TSC(s) are not in sync or unstable (from GTOD point of view) is of
course perfectly fine.

Thanx
Max

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2006-11-29  7:16 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-11-24 17:02 [patch] x86: unify/rewrite SMP TSC sync code Ingo Molnar
2006-11-24 17:13 ` Andi Kleen
2006-11-24 20:25   ` Ingo Molnar
2006-11-24 20:37     ` Andi Kleen
2006-11-24 20:46       ` Ingo Molnar
2006-11-24 21:06         ` Andi Kleen
2006-11-24 22:48     ` Ben Greear
2006-11-25  2:56     ` Wink Saville
2006-11-25  8:30       ` Arjan van de Ven
2006-11-25 16:58         ` Wink Saville
2006-11-25 17:41           ` Arjan van de Ven
2006-11-25 20:34             ` Wink Saville
2006-11-27 19:00   ` Christoph Lameter
2006-11-29  7:13 ` Ingo Molnar
     [not found] <fa./NRPJg+JjfSQLUVwnX1GpHGIojQ@ifi.uio.no>
     [not found] ` <fa.Y0RKABHd+7qnbGQYBAGPvlJ0Qic@ifi.uio.no>
     [not found]   ` <fa.fD3WSpNqEJ4736vYzEak5Gf3xTw@ifi.uio.no>
     [not found]     ` <fa.A+gkQAO1DLThaxJxPLPl3yE1CGo@ifi.uio.no>
     [not found]       ` <fa.INurNKWdUKAEULTHyfpSW65a/Ng@ifi.uio.no>
     [not found]         ` <fa.n9vySiI9RS2MCl0DZPDzxZEPiFw@ifi.uio.no>
2006-11-26  7:20           ` Robert Hancock
2006-11-26  8:16             ` Wink Saville
2006-11-26  8:24               ` Arjan van de Ven
2006-11-26 19:48                 ` Wink Saville
2006-11-27  7:51                   ` Arjan van de Ven
2006-11-27  9:15                     ` Wink Saville
  -- strict thread matches above, loose matches on Subject: below --
2006-11-27 17:23 Robert Crocombe
2006-11-27 18:41 ` Max Krasnyansky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox