[RFC patch 0/4] TSC calibration improvements

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC patch 0/4] TSC calibration improvements
@ 2008-09-04 15:18 Thomas Gleixner
  2008-09-04 15:18 ` [RFC patch 1/4] x86: TSC: define the PIT latch value separate Thomas Gleixner
                   ` (4 more replies)
  0 siblings, 5 replies; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-04 15:18 UTC (permalink / raw)
  To: LKML; +Cc: Linus Torvalds, Ingo Molnar, Alok Kataria, Arjan van de Veen

This patch series addresses following:

     - reduce the calibration time to a useful value
     - make decision smarter, when a reference (HPET/PMTIMER) is around

The first patches are cleanups to prepare for the smarter loop
handling.

The main change is to reduce the PIT delay value to 10ms, which gives
reasonable results on very slow machines as well. To avoid looping
several times when the machine has a working reference counter
(HPET/pmtimer), we compare the results of the PIT and the reference and
break out of the loop when both match inside of a 10% window.

For virtualized environments the PIT calibration fails often and the
reference calibration is not reproducible with 10ms. To address this
we check whether the PIT failed two times in a row and make the PIT
loop longer (50ms) for the last try to get a better result for the
reference.

Most of the machines I tested break out of the loop after the first
try with a stable reproducible result.

Thanks,

	tglx
-- 

^ permalink raw reply	[flat|nested] 52+ messages in thread

* [RFC patch 1/4] x86: TSC: define the PIT latch value separate
  2008-09-04 15:18 [RFC patch 0/4] TSC calibration improvements Thomas Gleixner
@ 2008-09-04 15:18 ` Thomas Gleixner
  2008-09-04 15:18 ` [RFC patch 2/4] x86: TSC: separate hpet/pmtimer calculation out Thomas Gleixner
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-04 15:18 UTC (permalink / raw)
  To: LKML; +Cc: Linus Torvalds, Ingo Molnar, Alok Kataria, Arjan van de Veen

[-- Attachment #1: tsc-define-latch-value.patch --]
[-- Type: text/plain, Size: 1592 bytes --]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/tsc.c |   14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

Index: linux-2.6/arch/x86/kernel/tsc.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/tsc.c
+++ linux-2.6/arch/x86/kernel/tsc.c
@@ -122,6 +122,10 @@ static u64 tsc_read_refs(u64 *pm, u64 *h
 	return ULLONG_MAX;
 }
 
+#define CAL_MS		50
+#define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS	5000
+
 /*
  * Try to calibrate the TSC against the Programmable
  * Interrupt Timer and return the frequency of the TSC
@@ -144,8 +148,8 @@ static unsigned long pit_calibrate_tsc(v
 	 * (LSB then MSB) to begin countdown.
 	 */
 	outb(0xb0, 0x43);
-	outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
-	outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+	outb(CAL_LATCH & 0xff, 0x42);
+	outb(CAL_LATCH >> 8, 0x42);
 
 	tsc = t1 = t2 = get_cycles();
 
@@ -166,18 +170,18 @@ static unsigned long pit_calibrate_tsc(v
 	/*
 	 * Sanity checks:
 	 *
-	 * If we were not able to read the PIT more than 5000
+	 * If we were not able to read the PIT more than PIT_MIN_LOOPS
 	 * times, then we have been hit by a massive SMI
 	 *
 	 * If the maximum is 10 times larger than the minimum,
 	 * then we got hit by an SMI as well.
 	 */
-	if (pitcnt < 5000 || tscmax > 10 * tscmin)
+	if (pitcnt < CAL_PIT_LOOPS || tscmax > 10 * tscmin)
 		return ULONG_MAX;
 
 	/* Calculate the PIT value */
 	delta = t2 - t1;
-	do_div(delta, 50);
+	do_div(delta, CAL_MS);
 	return delta;
 }
 

-- 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [RFC patch 2/4] x86: TSC: separate hpet/pmtimer calculation out
  2008-09-04 15:18 [RFC patch 0/4] TSC calibration improvements Thomas Gleixner
  2008-09-04 15:18 ` [RFC patch 1/4] x86: TSC: define the PIT latch value separate Thomas Gleixner
@ 2008-09-04 15:18 ` Thomas Gleixner
  2008-09-04 15:18 ` [RFC patch 3/4] x86: TSC: use one set of reference variables Thomas Gleixner
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-04 15:18 UTC (permalink / raw)
  To: LKML; +Cc: Linus Torvalds, Ingo Molnar, Alok Kataria, Arjan van de Veen

[-- Attachment #1: tsc-separate-hpet-pmtimer-calculation.patch --]
[-- Type: text/plain, Size: 1951 bytes --]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/tsc.c |   56 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 15 deletions(-)

Index: linux-2.6/arch/x86/kernel/tsc.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/tsc.c
+++ linux-2.6/arch/x86/kernel/tsc.c
@@ -122,6 +122,43 @@ static u64 tsc_read_refs(u64 *pm, u64 *h
 	return ULLONG_MAX;
 }
 
+/*
+ * Calculate the TSC frequency from HPET reference
+ */
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
+{
+	u64 tmp;
+
+	if (hpet2 < hpet1)
+		hpet2 += 0x100000000ULL;
+	hpet2 -= hpet1;
+	tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+	do_div(tmp, 1000000);
+	do_div(deltatsc, tmp);
+
+	return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+	u64 tmp;
+
+	if (!pm1 && !pm2)
+		return ULONG_MAX;
+
+	if (pm2 < pm1)
+		pm2 += (u64)ACPI_PM_OVRRUN;
+	pm2 -= pm1;
+	tmp = pm2 * 1000000000LL;
+	do_div(tmp, PMTMR_TICKS_PER_SEC);
+	do_div(deltatsc, tmp);
+
+	return (unsigned long) deltatsc;
+}
+
 #define CAL_MS		50
 #define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS))
 #define CAL_PIT_LOOPS	5000
@@ -247,22 +284,11 @@ unsigned long native_calibrate_tsc(void)
 			continue;
 
 		tsc2 = (tsc2 - tsc1) * 1000000LL;
+		if (hpet)
+			tsc2 = calc_hpet_ref(tsc2, hpet1, hpet2);
+		else
+			tsc2 = calc_pmtimer_ref(tsc2, pm1, pm2);
 
-		if (hpet) {
-			if (hpet2 < hpet1)
-				hpet2 += 0x100000000ULL;
-			hpet2 -= hpet1;
-			tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
-			do_div(tsc1, 1000000);
-		} else {
-			if (pm2 < pm1)
-				pm2 += (u64)ACPI_PM_OVRRUN;
-			pm2 -= pm1;
-			tsc1 = pm2 * 1000000000LL;
-			do_div(tsc1, PMTMR_TICKS_PER_SEC);
-		}
-
-		do_div(tsc2, tsc1);
 		tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
 	}
 

-- 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [RFC patch 3/4] x86: TSC: use one set of reference variables
  2008-09-04 15:18 [RFC patch 0/4] TSC calibration improvements Thomas Gleixner
  2008-09-04 15:18 ` [RFC patch 1/4] x86: TSC: define the PIT latch value separate Thomas Gleixner
  2008-09-04 15:18 ` [RFC patch 2/4] x86: TSC: separate hpet/pmtimer calculation out Thomas Gleixner
@ 2008-09-04 15:18 ` Thomas Gleixner
  2008-09-04 15:18 ` [RFC patch 4/4] x86: TSC make the calibration loop smarter Thomas Gleixner
  2008-09-04 15:36 ` [RFC patch 0/4] TSC calibration improvements Ingo Molnar
  4 siblings, 0 replies; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-04 15:18 UTC (permalink / raw)
  To: LKML; +Cc: Linus Torvalds, Ingo Molnar, Alok Kataria, Arjan van de Veen

[-- Attachment #1: tsc-use-one-set-of-ref-variables.patch --]
[-- Type: text/plain, Size: 2933 bytes --]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/tsc.c |   22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

Index: linux-2.6/arch/x86/kernel/tsc.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/tsc.c
+++ linux-2.6/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
 /*
  * Read TSC and the reference counters. Take care of SMI disturbance
  */
-static u64 tsc_read_refs(u64 *pm, u64 *hpet)
+static u64 tsc_read_refs(u64 *p, int hpet)
 {
 	u64 t1, t2;
 	int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *h
 	for (i = 0; i < MAX_RETRIES; i++) {
 		t1 = get_cycles();
 		if (hpet)
-			*hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+			*p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
 		else
-			*pm = acpi_pm_read_early();
+			*p = acpi_pm_read_early();
 		t2 = get_cycles();
 		if ((t2 - t1) < SMI_TRESHOLD)
 			return t2;
@@ -228,7 +228,7 @@ static unsigned long pit_calibrate_tsc(v
  */
 unsigned long native_calibrate_tsc(void)
 {
-	u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
+	u64 tsc1, tsc2, delta, ref1, ref2;
 	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
 	unsigned long flags;
 	int hpet = is_hpet_enabled(), i;
@@ -267,16 +267,16 @@ unsigned long native_calibrate_tsc(void)
 		 * read the end value.
 		 */
 		local_irq_save(flags);
-		tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
+		tsc1 = tsc_read_refs(&ref1, hpet);
 		tsc_pit_khz = pit_calibrate_tsc();
-		tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+		tsc2 = tsc_read_refs(&ref2, hpet);
 		local_irq_restore(flags);
 
 		/* Pick the lowest PIT TSC calibration so far */
 		tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
 
 		/* hpet or pmtimer available ? */
-		if (!hpet && !pm1 && !pm2)
+		if (!hpet && !ref1 && !ref2)
 			continue;
 
 		/* Check, whether the sampling was disturbed by an SMI */
@@ -285,9 +285,9 @@ unsigned long native_calibrate_tsc(void)
 
 		tsc2 = (tsc2 - tsc1) * 1000000LL;
 		if (hpet)
-			tsc2 = calc_hpet_ref(tsc2, hpet1, hpet2);
+			tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
 		else
-			tsc2 = calc_pmtimer_ref(tsc2, pm1, pm2);
+			tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
 
 		tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
 	}
@@ -300,7 +300,7 @@ unsigned long native_calibrate_tsc(void)
 		printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
 
 		/* We don't have an alternative source, disable TSC */
-		if (!hpet && !pm1 && !pm2) {
+		if (!hpet && !ref1 && !ref2) {
 			printk("TSC: No reference (HPET/PMTIMER) available\n");
 			return 0;
 		}
@@ -320,7 +320,7 @@ unsigned long native_calibrate_tsc(void)
 	}
 
 	/* We don't have an alternative source, use the PIT calibration value */
-	if (!hpet && !pm1 && !pm2) {
+	if (!hpet && !ref1 && !ref2) {
 		printk(KERN_INFO "TSC: Using PIT calibration value\n");
 		return tsc_pit_min;
 	}

-- 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [RFC patch 4/4] x86: TSC make the calibration loop smarter
  2008-09-04 15:18 [RFC patch 0/4] TSC calibration improvements Thomas Gleixner
                   ` (2 preceding siblings ...)
  2008-09-04 15:18 ` [RFC patch 3/4] x86: TSC: use one set of reference variables Thomas Gleixner
@ 2008-09-04 15:18 ` Thomas Gleixner
  2008-09-04 15:36 ` [RFC patch 0/4] TSC calibration improvements Ingo Molnar
  4 siblings, 0 replies; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-04 15:18 UTC (permalink / raw)
  To: LKML; +Cc: Linus Torvalds, Ingo Molnar, Alok Kataria, Arjan van de Veen

[-- Attachment #1: tsc-loop-smarter.patch --]
[-- Type: text/plain, Size: 6786 bytes --]

The last changes made the calibration loop 250ms long which is far
too much. Try to do that more clever.

Experiments have shown that using a 10ms delay for the PIT based calibration
gives us a good enough value. If we have a reference (HPET/PMTIMER) and the
result of the PIT and the reference is close enough, then we can break out of
the calibration loop on a match right away and use the reference value.

Otherwise we just loop 3 times and decide then, which value to take.

One caveat is that for virtualized environments the PIT calibration often does
not work at all and I found out that 10us is a bit too short as well for the
reference to give a sane result. The solution here is to make the last loop
longer when the first two PIT calibrations failed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/tsc.c |   95 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 58 insertions(+), 37 deletions(-)

Index: linux-2.6/arch/x86/kernel/tsc.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/tsc.c
+++ linux-2.6/arch/x86/kernel/tsc.c
@@ -159,9 +159,14 @@ static unsigned long calc_pmtimer_ref(u6
 	return (unsigned long) deltatsc;
 }
 
-#define CAL_MS		50
+#define CAL_MS		10
 #define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS))
-#define CAL_PIT_LOOPS	5000
+#define CAL_PIT_LOOPS	1000
+
+#define CAL2_MS		50
+#define CAL2_LATCH	(CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS	5000
+
 
 /*
  * Try to calibrate the TSC against the Programmable
@@ -170,7 +175,7 @@ static unsigned long calc_pmtimer_ref(u6
  *
  * Return ULONG_MAX on failure to calibrate.
  */
-static unsigned long pit_calibrate_tsc(void)
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
 {
 	u64 tsc, t1, t2, delta;
 	unsigned long tscmin, tscmax;
@@ -185,8 +190,8 @@ static unsigned long pit_calibrate_tsc(v
 	 * (LSB then MSB) to begin countdown.
 	 */
 	outb(0xb0, 0x43);
-	outb(CAL_LATCH & 0xff, 0x42);
-	outb(CAL_LATCH >> 8, 0x42);
+	outb(latch & 0xff, 0x42);
+	outb(latch >> 8, 0x42);
 
 	tsc = t1 = t2 = get_cycles();
 
@@ -207,18 +212,18 @@ static unsigned long pit_calibrate_tsc(v
 	/*
 	 * Sanity checks:
 	 *
-	 * If we were not able to read the PIT more than PIT_MIN_LOOPS
+	 * If we were not able to read the PIT more than loopmin
 	 * times, then we have been hit by a massive SMI
 	 *
 	 * If the maximum is 10 times larger than the minimum,
 	 * then we got hit by an SMI as well.
 	 */
-	if (pitcnt < CAL_PIT_LOOPS || tscmax > 10 * tscmin)
+	if (pitcnt < loopmin || tscmax > 10 * tscmin)
 		return ULONG_MAX;
 
 	/* Calculate the PIT value */
 	delta = t2 - t1;
-	do_div(delta, CAL_MS);
+	do_div(delta, ms);
 	return delta;
 }
 
@@ -230,8 +235,8 @@ unsigned long native_calibrate_tsc(void)
 {
 	u64 tsc1, tsc2, delta, ref1, ref2;
 	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-	unsigned long flags;
-	int hpet = is_hpet_enabled(), i;
+	unsigned long flags, latch, ms;
+	int hpet = is_hpet_enabled(), i, loopmin;
 
 	/*
 	 * Run 5 calibration loops to get the lowest frequency value
@@ -257,7 +262,13 @@ unsigned long native_calibrate_tsc(void)
 	 * calibration delay loop as we have to wait for a certain
 	 * amount of time anyway.
 	 */
-	for (i = 0; i < 5; i++) {
+
+	/* Preset PIT loop values */
+	latch = CAL_LATCH;
+	ms = CAL_MS;
+	loopmin = CAL_PIT_LOOPS;
+
+	for (i = 0; i < 3; i++) {
 		unsigned long tsc_pit_khz;
 
 		/*
@@ -268,7 +279,7 @@ unsigned long native_calibrate_tsc(void)
 		 */
 		local_irq_save(flags);
 		tsc1 = tsc_read_refs(&ref1, hpet);
-		tsc_pit_khz = pit_calibrate_tsc();
+		tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
 		tsc2 = tsc_read_refs(&ref2, hpet);
 		local_irq_restore(flags);
 
@@ -290,6 +301,35 @@ unsigned long native_calibrate_tsc(void)
 			tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
 
 		tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+		/* Check the reference deviation */
+		delta = ((u64) tsc_pit_min) * 100;
+		do_div(delta, tsc_ref_min);
+
+		/*
+		 * If both calibration results are inside a 10% window
+		 * then we can be sure, that the calibration
+		 * succeeded. We break out of the loop right away. We
+		 * use the reference value, as it is more precise.
+		 */
+		if (delta >= 90 && delta <= 110) {
+			printk(KERN_INFO
+			       "TSC: PIT calibration matches %s. %d loops\n",
+			       hpet ? "HPET" : "PMTIMER", i + 1);
+			return tsc_ref_min;
+		}
+
+		/*
+		 * Check whether PIT failed more than once. This
+		 * happens in virtualized environments. We need to
+		 * give the virtual PC a slightly longer timeframe for
+		 * the HPET/PMTIMER to make the result precise.
+		 */
+		if (i == 1 && tsc_pit_min == ULONG_MAX) {
+			latch = CAL2_LATCH;
+			ms = CAL2_MS;
+			loopmin = CAL2_PIT_LOOPS;
+		}
 	}
 
 	/*
@@ -308,7 +348,7 @@ unsigned long native_calibrate_tsc(void)
 		/* The alternative source failed as well, disable TSC */
 		if (tsc_ref_min == ULONG_MAX) {
 			printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
-			       "failed due to SMI disturbance.\n");
+			       "failed.\n");
 			return 0;
 		}
 
@@ -327,37 +367,18 @@ unsigned long native_calibrate_tsc(void)
 
 	/* The alternative source failed, use the PIT calibration value */
 	if (tsc_ref_min == ULONG_MAX) {
-		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
-		       "to SMI disturbance. Using PIT calibration\n");
+		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
+		       "Using PIT calibration\n");
 		return tsc_pit_min;
 	}
 
-	/* Check the reference deviation */
-	delta = ((u64) tsc_pit_min) * 100;
-	do_div(delta, tsc_ref_min);
-
-	/*
-	 * If both calibration results are inside a 5% window, the we
-	 * use the lower frequency of those as it is probably the
-	 * closest estimate.
-	 */
-	if (delta >= 95 && delta <= 105) {
-		printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
-		       hpet ? "HPET" : "PMTIMER");
-		printk(KERN_INFO "TSC: using %s calibration value\n",
-		       tsc_pit_min <= tsc_ref_min ? "PIT" :
-		       hpet ? "HPET" : "PMTIMER");
-		return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
-	}
-
-	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
-	       hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-
 	/*
 	 * The calibration values differ too much. In doubt, we use
 	 * the PIT value as we know that there are PMTIMERs around
-	 * running at double speed.
+	 * running at double speed. At least we let the user know:
 	 */
+	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
+	       hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
 	printk(KERN_INFO "TSC: Using PIT calibration value\n");
 	return tsc_pit_min;
 }

-- 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 15:18 [RFC patch 0/4] TSC calibration improvements Thomas Gleixner
                   ` (3 preceding siblings ...)
  2008-09-04 15:18 ` [RFC patch 4/4] x86: TSC make the calibration loop smarter Thomas Gleixner
@ 2008-09-04 15:36 ` Ingo Molnar
  2008-09-04 15:45   ` Linus Torvalds
  2008-09-04 21:00   ` Valdis.Kletnieks
  4 siblings, 2 replies; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 15:36 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Linus Torvalds, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin


* Thomas Gleixner <tglx@linutronix.de> wrote:

> This patch series addresses following:
> 
>      - reduce the calibration time to a useful value
>      - make decision smarter, when a reference (HPET/PMTIMER) is around
> 
> The first patches are cleanups to prepare for the smarter loop
> handling.
> 
> The main change is to reduce the PIT delay value to 10ms, which gives
> reasonable results on very slow machines as well. To avoid looping
> several times when the machine has a working reference counter
> (HPET/pmtimer), we compare the results of the PIT and the reference and
> break out of the loop when both match inside of a 10% window.
> 
> For virtualized environments the PIT calibration fails often and the
> reference calibration is not reproducible with 10ms. To address this
> we check whether the PIT failed two times in a row and make the PIT
> loop longer (50ms) for the last try to get a better result for the
> reference.
> 
> Most of the machines I tested break out of the loop after the first
> try with a stable reproducible result.

i've added them to tip/x86/tsc and merged it into tip/master - if 
there's test success we can merge it into x86/urgent as well and push it 
into v2.6.27. Any objections to that merge route?

	Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 15:36 ` [RFC patch 0/4] TSC calibration improvements Ingo Molnar
@ 2008-09-04 15:45   ` Linus Torvalds
  2008-09-04 16:00     ` Ingo Molnar
  2008-09-04 17:39     ` Alok Kataria
  2008-09-04 21:00   ` Valdis.Kletnieks
  1 sibling, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 15:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin

On Thu, 4 Sep 2008, Ingo Molnar wrote:
> 
> i've added them to tip/x86/tsc and merged it into tip/master - if 
> there's test success we can merge it into x86/urgent as well and push it 
> into v2.6.27. Any objections to that merge route?

I don't think it's quite that urgent, and wonder what the downside is of 
just changing the timeout to 10ms. On 32-bit x86, it was 30ms (I think) 
before the merge, so it sounds like 50ms was a bit excessive even before 
the whole "loop five times"/

So _short_ term, I'd really prefer (a) looping just three times and (b) 
looping with a smaller timeout.

Long-term, I actually think even 10ms is actually a total waste. I'll post 
my trial "quick calibration" code that is more likely to fail under 
virtualization or SMM (or, indeed, perhaps even on things like TMTA CPU's 
that can have longer latencies due to translation), but that is really 
fast and knows very intimately when it succeeds. I just need to do 
slightly more testing.

				Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 15:45   ` Linus Torvalds
@ 2008-09-04 16:00     ` Ingo Molnar
  2008-09-04 16:21       ` Linus Torvalds
  2008-09-04 17:39     ` Alok Kataria
  1 sibling, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 16:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, 4 Sep 2008, Ingo Molnar wrote:
> > 
> > i've added them to tip/x86/tsc and merged it into tip/master - if 
> > there's test success we can merge it into x86/urgent as well and push it 
> > into v2.6.27. Any objections to that merge route?
> 
> I don't think it's quite that urgent, and wonder what the downside is 
> of just changing the timeout to 10ms. On 32-bit x86, it was 30ms (I 
> think) before the merge, so it sounds like 50ms was a bit excessive 
> even before the whole "loop five times"/
>
> So _short_ term, I'd really prefer (a) looping just three times and 
> (b) looping with a smaller timeout.

ok - sounds very good to me! It's quite late in the cycle so the more 
commits we can delay to v2.6.28 the better IMHO.

> Long-term, I actually think even 10ms is actually a total waste. I'll 
> post my trial "quick calibration" code that is more likely to fail 
> under virtualization or SMM (or, indeed, perhaps even on things like 
> TMTA CPU's that can have longer latencies due to translation), but 
> that is really fast and knows very intimately when it succeeds. I just 
> need to do slightly more testing.

ah - perhaps a dynamic statistical approach with an estimation of 
worst-case calibration error (~= standard deviation) and a quality 
threshold to reach? That could dramatically increase the number of 
samples while also making it much faster in practice. Nifty!

	Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 16:00     ` Ingo Molnar
@ 2008-09-04 16:21       ` Linus Torvalds
  2008-09-04 16:36         ` Ingo Molnar
  2008-09-04 17:41         ` Linus Torvalds
  0 siblings, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 16:21 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra



On Thu, 4 Sep 2008, Ingo Molnar wrote:
> 
> ah - perhaps a dynamic statistical approach with an estimation of 
> worst-case calibration error (~= standard deviation) and a quality 
> threshold to reach? That could dramatically increase the number of 
> samples while also making it much faster in practice. Nifty!

Oh, no, I'm _much_ more nifty than that!

Instead of being very clever, I have a very _stupid_ algorithm, one that 
has very hardcoded expectations of exactly what it will see. And if it 
doesn't see exactly that, it just fails early. 

I'd post the patch, but I really need to actually _test_ it first, and I 
haven't rebooted yet.

		Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 16:21       ` Linus Torvalds
@ 2008-09-04 16:36         ` Ingo Molnar
  2008-09-04 17:41         ` Linus Torvalds
  1 sibling, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 16:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, 4 Sep 2008, Ingo Molnar wrote:
> > 
> > ah - perhaps a dynamic statistical approach with an estimation of 
> > worst-case calibration error (~= standard deviation) and a quality 
> > threshold to reach? That could dramatically increase the number of 
> > samples while also making it much faster in practice. Nifty!
> 
> Oh, no, I'm _much_ more nifty than that!
> 
> Instead of being very clever, I have a very _stupid_ algorithm, one 
> that has very hardcoded expectations of exactly what it will see. And 
> if it doesn't see exactly that, it just fails early.
>
> I'd post the patch, but I really need to actually _test_ it first, and 
> I haven't rebooted yet.

ok, will wait with patience :)

i've been using adaptive calibration with great success in user-space, 
to run benchmarks on multiple boxes with a variable number of lmbench 
iterations. Once the observable statistical properties of the series of 
measurements looks valid it stops the test iterations and emits a 
result. This both makes things faster (more predictable hw/kernel 
executes certain lmbench tests much faster) and it makes the results 
more reliable (less noise, better cross-hardware, cross-kernel and 
cross-test comparisons). I've got a quality threshold (and a max 
iterations threshold) hardcoded as well.

	Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 15:45   ` Linus Torvalds
  2008-09-04 16:00     ` Ingo Molnar
@ 2008-09-04 17:39     ` Alok Kataria
  2008-09-04 17:53       ` Linus Torvalds
  1 sibling, 1 reply; 52+ messages in thread
From: Alok Kataria @ 2008-09-04 17:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Dan Hecht, Garrett Smith, Rusty Russell,
	Jeremy Fitzhardinge

On Thu, 2008-09-04 at 08:45 -0700, Linus Torvalds wrote:
> 
> On Thu, 4 Sep 2008, Ingo Molnar wrote:
> >
> > i've added them to tip/x86/tsc and merged it into tip/master - if
> > there's test success we can merge it into x86/urgent as well and push it
> > into v2.6.27. Any objections to that merge route?
> 
> I don't think it's quite that urgent, and wonder what the downside is of
> just changing the timeout to 10ms. On 32-bit x86, it was 30ms (I think)
> before the merge, so it sounds like 50ms was a bit excessive even before
> the whole "loop five times"/
> 
> So _short_ term, I'd really prefer (a) looping just three times and (b)
> looping with a smaller timeout.

Looping for a smaller timeout is really going to strain things for
Virtualization.
Even on native hardware if you reduce the  timeout to less than 10ms it
may result in errors in the range of 2500ppm on a 2GHz systems when
calibrating against pmtimer/hpet, this is far worse than what NTP can
correct, afaik NTP can handle errors only upto 500ppm. And IMHO that is
the reason why we had a timeout of 50ms before (since it limits the
maximum theoretical error to 500ppm)

In addition to that, the pmtimer hardware itself has some drift,
typically in the range of 20 to 100ppm.  If this drift happens to be in
the same direction as the error in measuring the tsc against the
pmtimer, we could have a total error of more than 500ppm in the tsc
frequency calibration code with the 50ms timeout. So anything less than
50msec of timeout is risky for virtualized environment.

If people think that new hardware is good enough to handle these errors
with a smaller timeout value thats okay, but for virtualized environment
we need to keep these timeouts as they are or increase them. 

If still there is a pressing need to reduce the timeout, then
alternatively, as Arjan suggested, we can ask the hardware (hypervisor)
for tsc frequency, and do this only if we are running under a hypervisor
( can't do this for h/w with constant TSC too since its not reliable as
mentioned by Linus).
This tsc freq from hypervisor, can be implemented using cpuid's if the
backend hypervisor supports that (VMware does). Let me know what people
think about this and we can work towards a standard interface. 

Thanks,
Alok
> 
> Long-term, I actually think even 10ms is actually a total waste. I'll post
> my trial "quick calibration" code that is more likely to fail under
> virtualization or SMM (or, indeed, perhaps even on things like TMTA CPU's
> that can have longer latencies due to translation), but that is really
> fast and knows very intimately when it succeeds. I just need to do
> slightly more testing.
> 
>                                 Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 16:21       ` Linus Torvalds
  2008-09-04 16:36         ` Ingo Molnar
@ 2008-09-04 17:41         ` Linus Torvalds
  2008-09-04 18:07           ` Alan Cox
  1 sibling, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 17:41 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra



On Thu, 4 Sep 2008, Linus Torvalds wrote:
> 
> I'd post the patch, but I really need to actually _test_ it first, and I 
> haven't rebooted yet.

Just as well. There were various stupid small details, like the fact that 
i8253 timer mode 2 (square wave) decrements by two, which confused me for 
a while until I realized it.

Anyway, here's a suggested diff. The comments are quite extensive, and 
should explain it all. The code should be _very_ robust, in that if 
anything doesn't match expectations, it will fail and fall back on the old 
code. But it should also be very fast, and quite precise.

It only uses 2048 PIT timer ticks to calibrate the TSC, plus 256 ticks on 
each side to make sure the TSC values were very close to the tick, so the 
whole calibration takes less than 2.5ms. Yet, despite only takign 2.5ms, 
we can actually give pretty stringent guarantees of accuracy:

 - the code requires that we hit each 256-counter block at least 35 times, 
   so the TSC error is basically at *MOST* just a few PIT cycles off in 
   any direction. In practice, it's going to be about three microseconds 
   off (which is how long it takes to read the counter)

 - so over 2048 PIT cycles, we can pretty much guarantee that the 
   calibration error is less than one half of a percent.

My testing bears this out: on my machine, the quick-calibration reports 
2934.085kHz, while the slow one reports 2933.415.

Yes, the slower calibration is still more precise. For me, the slow 
calibration is stable to within about one hundreth of a percent, so it's 
(at a guess) roughly an order-and-a-half of magnitude more precise. The 
longer you wait, the more precise you can be.

However, the nice thing about the fast TSC PIT synchronization is that 
it's pretty much _guaranteed_ to give that 0.5% precision, and fail 
gracefully (and very quickly) if it doesn't get it. And it really is 
fairly simple (even if there's a lot of _details_ there, and I didn't get 
all of those right ont he first try or even the second ;)

The patch says "110 insertions", but 63 of those new lines are actually 
comments.

(And yes, I do the latching - it's not reqlly required since I only depend 
on the MSB, and it actually makes for slightly lower precision, but it's 
the "safe" thing. And I figured out that the reason I thought that the 
latch stops the count in my earlier experiments was again due to the 
fact that "mode 2" decrements by two, not by one. So latching is fine, 
and the documented way to do this all).

		Linus

---
 arch/x86/kernel/tsc.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 110 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f98e9d..e14e6c8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -181,6 +181,109 @@ static unsigned long pit_calibrate_tsc(void)
 	return delta;
 }
 
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ *  - the PIT is running at roughly 1.19MHz
+ *
+ *  - each IO is going to take about 1us on real hardware,
+ *    but we allow it to be much faster (by a factor of 10) or
+ *    _slightly_ slower (ie we allow up to a 2us read+counter
+ *    update - anything else implies a unacceptably slow CPU
+ *    or PIT for the fast calibration to work.
+ *
+ *  - with 256 PIT ticks to read the value, we have 214us to
+ *    see the same MSB (and overhead like doing a single TSC
+ *    read per MSB value etc).
+ *
+ *  - We're doing 3 IO's per loop (latch, read, read), and
+ *    we expect them each to take about a microsecond on real
+ *    hardware. So we expect a count value of around 70. But
+ *    we'll be generous, and accept anything over 35.
+ *
+ *  - if the PIT is stuck, and we see *many* more reads, we
+ *    return early (and the next caller of pit_expect_msb()
+ *    then consider it a failure when they don't see the
+ *    next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+	int count = 0;
+
+	for (count = 0; count < 50000; count++) {
+		/* Latch counter 2 - just to be safe */
+		outb(0x80, 0x43);
+		/* Ignore LSB */
+		inb(0x42);
+		if (inb(0x42) != val)
+			break;
+	}
+	return count > 35;
+}
+
+static unsigned long quick_pit_calibrate(void)
+{
+	/* Set the Gate high, disable speaker */
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	/*
+	 * Counter 2, mode 0 (one-shot), binary count
+	 *
+	 * NOTE! Mode 2 decrements by two (and then the
+	 * output is flipped each time, giving the same
+	 * final output frequency as a decrement-by-one),
+	 * so mode 0 is much better when looking at the
+	 * individual counts.
+	 */
+	outb(0xb0, 0x43);
+
+	/* Start at 0xffff */
+	outb(0xff, 0x42);
+	outb(0xff, 0x42);
+
+	if (pit_expect_msb(0xff)) {
+		u64 t1, t2, delta;
+		unsigned char expect;
+
+		t1 = get_cycles();
+		for (expect = 0xfe; expect > 0xf5; expect--) {
+			t2 = get_cycles();
+			if (!pit_expect_msb(expect))
+				goto failed;
+		}
+		/*
+		 * Ok, if we get here, then we've seen the
+		 * MSB of the PIT go from 0xff to 0xf6, and
+		 * each MSB had many hits, so our TSC reading
+		 * was always very close to the transition.
+		 *
+		 * So t1 is at the 0xff -> 0xfe transition,
+		 * and t2 is at 0xf7->0xf6, and so the PIT
+		 * count difference between the two is 8*256,
+		 * ie 2048.
+		 *
+		 * kHz = ticks / time-in-seconds / 1000;
+		 * kHz = (t2 - t1) / (2048 / PIT_TICK_RATE) / 1000
+		 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (2048 * 1000)
+		 */
+		delta = (t2 - t1)*PIT_TICK_RATE;
+		do_div(delta, 2048*1000);
+		printk("Fast TSC calibration using PIT\n");
+		return delta;
+	}
+failed:
+	return 0;
+}
 
 /**
  * native_calibrate_tsc - calibrate the tsc on boot
@@ -189,9 +292,15 @@ unsigned long native_calibrate_tsc(void)
 {
 	u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
 	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-	unsigned long flags;
+	unsigned long flags, fast_calibrate;
 	int hpet = is_hpet_enabled(), i;
 
+	local_irq_save(flags);
+	fast_calibrate = quick_pit_calibrate();
+	local_irq_restore(flags);
+	if (fast_calibrate)
+		return fast_calibrate;
+
 	/*
 	 * Run 5 calibration loops to get the lowest frequency value
 	 * (the best estimate). We use two different calibration modes

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 17:39     ` Alok Kataria
@ 2008-09-04 17:53       ` Linus Torvalds
  2008-09-04 18:31         ` Alok Kataria
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 17:53 UTC (permalink / raw)
  To: Alok Kataria
  Cc: Ingo Molnar, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Dan Hecht, Garrett Smith, Rusty Russell,
	Jeremy Fitzhardinge

On Thu, 4 Sep 2008, Alok Kataria wrote:
> 
> Looping for a smaller timeout is really going to strain things for
> Virtualization.

Can you check the patch I just sent out?

It loops for a _very_ short timeout, but on the other hand it should also 
absolutely immediately notice that it's getting the wrong expected values 
under virtualization, and the fast case will then fail early. 

It then falls back on the slow case, but I don't think you can avoid that 
under virtualization.

> Even on native hardware if you reduce the  timeout to less than 10ms it
> may result in errors in the range of 2500ppm on a 2GHz systems when
> calibrating against pmtimer/hpet, this is far worse than what NTP can
> correct, afaik NTP can handle errors only upto 500ppm. And IMHO that is
> the reason why we had a timeout of 50ms before (since it limits the
> maximum theoretical error to 500ppm)

I would not mind at all having the more precise thing happen _later_, 
especially if we can do it incrementally.

One of the problems with the TSC calibration is that we need it fairly 
early (for things like usleep()), and it needs to be in the right 
ballpark. It definitely does not need to be in the parts-per-million 
range, it needs to be in the "within a few percent" range.

(To make matters worse, the TSC isn't then even used in practice for 
real-time clocks, because of variable frequency and/or halting in idle 
states. So the actual real-time clock will actually be based on HPET or 
PM_TIMER anyway most of the time).

		Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 17:41         ` Linus Torvalds
@ 2008-09-04 18:07           ` Alan Cox
  2008-09-04 18:26             ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: Alan Cox @ 2008-09-04 18:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Thomas Gleixner, LKML, Alok Kataria,
	Arjan van de Veen, H. Peter Anvin, Peter Zijlstra

> (And yes, I do the latching - it's not reqlly required since I only depend 
> on the MSB, and it actually makes for slightly lower precision, but it's 
> the "safe" thing. And I figured out that the reason I thought that the 

Good job you don't. Various Cyrix/Geode chipsets have as errata #2

"Counter latch command is non-operational in the 8254 timer"

Alan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 18:07           ` Alan Cox
@ 2008-09-04 18:26             ` Linus Torvalds
  2008-09-04 18:30               ` H. Peter Anvin
  2008-09-04 20:09               ` Linus Torvalds
  0 siblings, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 18:26 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ingo Molnar, Thomas Gleixner, LKML, Alok Kataria,
	Arjan van de Veen, H. Peter Anvin, Peter Zijlstra

On Thu, 4 Sep 2008, Alan Cox wrote:
>
> > (And yes, I do the latching - it's not reqlly required since I only depend 
> > on the MSB, and it actually makes for slightly lower precision, but it's 
> > the "safe" thing. And I figured out that the reason I thought that the 
> 
> Good job you don't. Various Cyrix/Geode chipsets have as errata #2
> 
> "Counter latch command is non-operational in the 8254 timer"

Yeah, I had some memory of latch issues. I wrote the thing originally 
without the latching, which is why the whole thing is designed to igore 
the low cycle count. I just decided that doing the latching shouldn't 
hurt that much, even if it ends up being just a 1us no-op.

It does mean that on any normal hardware, the expected error is roughly 
3us over 2048 PIT ticks, which if I do the math right (nominal PIT 
frequency: 1193182 Hz) is just under 0.2%. Or put another way, ~1750 ppm.

Not doing the latching should make the expected error go down to 2us.

Of course, the 2048 PIT ticks is just a random choice. It could be any 
multiple of 256 ticks, so that error can be made smaller. Maybe it's worth 
spending 10ms on this, and get it down by a factor of five (at which point 
the error on the PIT frequency is probably in the same order of 
magnitude).

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 18:26             ` Linus Torvalds
@ 2008-09-04 18:30               ` H. Peter Anvin
  2008-09-04 20:09               ` Linus Torvalds
  1 sibling, 0 replies; 52+ messages in thread
From: H. Peter Anvin @ 2008-09-04 18:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Ingo Molnar, Thomas Gleixner, LKML, Alok Kataria,
	Arjan van de Veen, Peter Zijlstra

Linus Torvalds wrote:
> 
> Of course, the 2048 PIT ticks is just a random choice. It could be any 
> multiple of 256 ticks, so that error can be made smaller. Maybe it's worth 
> spending 10ms on this, and get it down by a factor of five (at which point 
> the error on the PIT frequency is probably in the same order of 
> magnitude).
> 

FWIW, typical error on the 14.31818 MHz clock (used as the PIT, PMTMR 
and HPET timebase in most systems) is usually ±50 ppm.  High-quality 
motherboards which use a TCXO for the 14.31818 MHz clock would have
around ±1 ppm.

	-hpa

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 17:53       ` Linus Torvalds
@ 2008-09-04 18:31         ` Alok Kataria
  2008-09-04 18:34           ` H. Peter Anvin
  0 siblings, 1 reply; 52+ messages in thread
From: Alok Kataria @ 2008-09-04 18:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Daniel Hecht, Garrett Smith, Rusty Russell,
	Jeremy Fitzhardinge

On Thu, 2008-09-04 at 10:53 -0700, Linus Torvalds wrote:
> 
> On Thu, 4 Sep 2008, Alok Kataria wrote:
> >
> > Looping for a smaller timeout is really going to strain things for
> > Virtualization.
> 
> Can you check the patch I just sent out?
> 
> It loops for a _very_ short timeout, but on the other hand it should also
> absolutely immediately notice that it's getting the wrong expected values
> under virtualization, and the fast case will then fail early.

Yep tried that, the fast calibration failed in 4-5 reboots that i did
until now. Will do some more runs just to be sure.
> 
> It then falls back on the slow case, but I don't think you can avoid that
> under virtualization.

That's fine i think the slow calibration works well for us and we can
live with the added 200msec delay while booting. 

I am assuming the patch you just sent was for 2.6.27.
And that we won't be changing the timeout's after this patch.
So, can we also apply the 2 patches that I sent yesterday on top of this
one ? the one that calibrated against pmtimer/hpet over the full loop
and the one that fixed the warning message. Both of these will affect
the slower case, which is now not the common case when running on native
hardware.

In the long run though, I think to get rid of all this complexity, we
can think of getting the frequency from the hypervisor. Lets see, i will
try making a patch maybe early next week and send it for the x86 tree.

Thanks,
Alok

> 
> > Even on native hardware if you reduce the  timeout to less than 10ms it
> > may result in errors in the range of 2500ppm on a 2GHz systems when
> > calibrating against pmtimer/hpet, this is far worse than what NTP can
> > correct, afaik NTP can handle errors only upto 500ppm. And IMHO that is
> > the reason why we had a timeout of 50ms before (since it limits the
> > maximum theoretical error to 500ppm)
> 
> I would not mind at all having the more precise thing happen _later_,
> especially if we can do it incrementally.
> 
> One of the problems with the TSC calibration is that we need it fairly
> early (for things like usleep()), and it needs to be in the right
> ballpark. It definitely does not need to be in the parts-per-million
> range, it needs to be in the "within a few percent" range.
> 
> (To make matters worse, the TSC isn't then even used in practice for
> real-time clocks, because of variable frequency and/or halting in idle
> states. So the actual real-time clock will actually be based on HPET or
> PM_TIMER anyway most of the time).
> 
>                 Linus


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 18:31         ` Alok Kataria
@ 2008-09-04 18:34           ` H. Peter Anvin
  0 siblings, 0 replies; 52+ messages in thread
From: H. Peter Anvin @ 2008-09-04 18:34 UTC (permalink / raw)
  To: akataria
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Arjan van de Veen, Daniel Hecht, Garrett Smith, Rusty Russell,
	Jeremy Fitzhardinge

Alok Kataria wrote:
> 
> In the long run though, I think to get rid of all this complexity, we
> can think of getting the frequency from the hypervisor. Lets see, i will
> try making a patch maybe early next week and send it for the x86 tree.
> 

That should go along with a paravirtualized (virtio) timesource.

	-hpa

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 18:26             ` Linus Torvalds
  2008-09-04 18:30               ` H. Peter Anvin
@ 2008-09-04 20:09               ` Linus Torvalds
  2008-09-04 20:43                 ` Ingo Molnar
  2008-09-04 21:38                 ` Alok Kataria
  1 sibling, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 20:09 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ingo Molnar, Thomas Gleixner, LKML, Alok Kataria,
	Arjan van de Veen, H. Peter Anvin, Peter Zijlstra



On Thu, 4 Sep 2008, Linus Torvalds wrote:
> 
> Yeah, I had some memory of latch issues. I wrote the thing originally 
> without the latching, which is why the whole thing is designed to igore 
> the low cycle count. I just decided that doing the latching shouldn't 
> hurt that much, even if it ends up being just a 1us no-op.

Thinking more about it, that was the wrong decision. After all, the whole 
point of the "quick calibration" is to take care of the good case of 
reasonable hardware - and fall back on a much slower version if there is 
something odd going on.

So latching things is the wrong thing to do, since it just slows things 
down. No real hardware should need it, and if some odd hardware does 
exist, all the other sanity checks will catch it anyway.

And yeah, I shouldn't have tried to go from 250ms down to 2.5ms. Aim for 
something like a 15ms calibration instead, which should give plenty of 
precision, while still being much faster than we used to be.

So how about this?

(Stage #2 would then be to simplify the main calibration loop now that we 
know that it's there really as a fall-back when the PIT isn't stable for 
some reason, but that's a separate issue).

Thomas, I assume that this one catches your SMI-laptop and falls back to 
the slow case, the same way Alok already said it catches his VM setup?

		Linus

---
 arch/x86/kernel/tsc.c |  119 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 118 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f98e9d..4589ae4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -181,6 +181,117 @@ static unsigned long pit_calibrate_tsc(void)
 	return delta;
 }
 
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ *  - the PIT is running at roughly 1.19MHz
+ *
+ *  - each IO is going to take about 1us on real hardware,
+ *    but we allow it to be much faster (by a factor of 10) or
+ *    _slightly_ slower (ie we allow up to a 2us read+counter
+ *    update - anything else implies a unacceptably slow CPU
+ *    or PIT for the fast calibration to work.
+ *
+ *  - with 256 PIT ticks to read the value, we have 214us to
+ *    see the same MSB (and overhead like doing a single TSC
+ *    read per MSB value etc).
+ *
+ *  - We're doing 2 reads per loop (LSB, MSB), and we expect
+ *    them each to take about a microsecond on real hardware.
+ *    So we expect a count value of around 100. But we'll be
+ *    generous, and accept anything over 50.
+ *
+ *  - if the PIT is stuck, and we see *many* more reads, we
+ *    return early (and the next caller of pit_expect_msb()
+ *    then consider it a failure when they don't see the
+ *    next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+	int count = 0;
+
+	for (count = 0; count < 50000; count++) {
+		/* Ignore LSB */
+		inb(0x42);
+		if (inb(0x42) != val)
+			break;
+	}
+	return count > 50;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for a
+ * 15ms calibration, which assuming a 2us counter read
+ * error should give us roughly 150 ppm precision for
+ * the calibration.
+ */
+#define QUICK_PIT_MS 15
+#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+	/* Set the Gate high, disable speaker */
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	/*
+	 * Counter 2, mode 0 (one-shot), binary count
+	 *
+	 * NOTE! Mode 2 decrements by two (and then the
+	 * output is flipped each time, giving the same
+	 * final output frequency as a decrement-by-one),
+	 * so mode 0 is much better when looking at the
+	 * individual counts.
+	 */
+	outb(0xb0, 0x43);
+
+	/* Start at 0xffff */
+	outb(0xff, 0x42);
+	outb(0xff, 0x42);
+
+	if (pit_expect_msb(0xff)) {
+		int i;
+		u64 t1, t2, delta;
+		unsigned char expect = 0xfe;
+
+		t1 = get_cycles();
+		for (expect = 0xfe, i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+			if (!pit_expect_msb(expect))
+				goto failed;
+		}
+		t2 = get_cycles();
+
+		/*
+		 * Ok, if we get here, then we've seen the
+		 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
+		 * times, and each MSB had many hits, so we never
+		 * had any sudden jumps.
+		 *
+		 * As a result, we can depend on there not being
+		 * any odd delays anywhere, and the TSC reads are
+		 * reliable.
+		 *
+		 * kHz = ticks / time-in-seconds / 1000;
+		 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
+		 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
+		 */
+		delta = (t2 - t1)*PIT_TICK_RATE;
+		do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
+		printk("Fast TSC calibration using PIT\n");
+		return delta;
+	}
+failed:
+	return 0;
+}
 
 /**
  * native_calibrate_tsc - calibrate the tsc on boot
@@ -189,9 +300,15 @@ unsigned long native_calibrate_tsc(void)
 {
 	u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
 	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-	unsigned long flags;
+	unsigned long flags, fast_calibrate;
 	int hpet = is_hpet_enabled(), i;
 
+	local_irq_save(flags);
+	fast_calibrate = quick_pit_calibrate();
+	local_irq_restore(flags);
+	if (fast_calibrate)
+		return fast_calibrate;
+
 	/*
 	 * Run 5 calibration loops to get the lowest frequency value
 	 * (the best estimate). We use two different calibration modes

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 20:09               ` Linus Torvalds
@ 2008-09-04 20:43                 ` Ingo Molnar
  2008-09-04 20:52                   ` Ingo Molnar
  2008-09-04 20:53                   ` Linus Torvalds
  2008-09-04 21:38                 ` Alok Kataria
  1 sibling, 2 replies; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 20:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> +		unsigned char expect = 0xfe;
> +
> +		t1 = get_cycles();
> +		for (expect = 0xfe, i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
> +			if (!pit_expect_msb(expect))
> +				goto failed;
> +		}
> +		t2 = get_cycles();

i suspect you didnt want the second initialization of 'expect' to 0xfe? 
I fixed that up by removing the second one.

( although i think it would all be even clearer if we initialized 
  'expect' to 0xff before the first call of pit_expect_msb() higher up, 
  and then doing an expect-- before doing this loop - but that is really 
  painting the bikeshed i guess. )

> @@ -189,9 +300,15 @@ unsigned long native_calibrate_tsc(void)
>  {
>  	u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
>  	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
> -	unsigned long flags;
> +	unsigned long flags, fast_calibrate;
>  	int hpet = is_hpet_enabled(), i;
>  
> +	local_irq_save(flags);
> +	fast_calibrate = quick_pit_calibrate();
> +	local_irq_restore(flags);
> +	if (fast_calibrate)
> +		return fast_calibrate;
> +
>  	/*
>  	 * Run 5 calibration loops to get the lowest frequency value
>  	 * (the best estimate). We use two different calibration modes

these bits collided with Thomas's textually but not conceptually: Thomas 
improved the slowpath calibration while you introduced a new fastpath. I 
kept both and applied your patch to tip/x86/tsc.

if you are happy with the commit below, could you please give your 
Signed-off-by for it?

	Ingo

-------------->
>From b6cb513801373e10af6f2df287a8089e3c340e83 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 4 Sep 2008 10:41:22 -0700
Subject: [PATCH] x86: quick TSC calibration

Introduce a fast TSC-calibration method on sane hardware.

It only uses 17920 PIT timer ticks to calibrate the TSC, plus 256 ticks on
each side to make sure the TSC values were very close to the tick, so the
whole calibration takes 15ms. Yet, despite only takign 15ms,
we can actually give pretty stringent guarantees of accuracy:

 - the code requires that we hit each 256-counter block at least 50 times,
   so the TSC error is basically at *MOST* just a few PIT cycles off in
   any direction. In practice, it's going to be about one microseconds
   off (which is how long it takes to read the counter)

 - so over 17920 PIT cycles, we can pretty much guarantee that the
   calibration error is less than one half of a percent.

My testing bears this out: on my machine, the quick-calibration reports
2934.085kHz, while the slow one reports 2933.415.

Yes, the slower calibration is still more precise. For me, the slow
calibration is stable to within about one hundreth of a percent, so it's
(at a guess) roughly an order-and-a-half of magnitude more precise. The
longer you wait, the more precise you can be.

However, the nice thing about the fast TSC PIT synchronization is that
it's pretty much _guaranteed_ to give that 0.5% precision, and fail
gracefully (and very quickly) if it doesn't get it. And it really is
fairly simple (even if there's a lot of _details_ there, and I didn't get
all of those right ont he first try or even the second ;)

The patch says "110 insertions", but 63 of those new lines are actually
comments.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 110 insertions(+), 1 deletions(-)
---
 arch/x86/kernel/tsc.c |  119 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 118 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index da033b5..839070b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -227,6 +227,117 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
 	return delta;
 }
 
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ *  - the PIT is running at roughly 1.19MHz
+ *
+ *  - each IO is going to take about 1us on real hardware,
+ *    but we allow it to be much faster (by a factor of 10) or
+ *    _slightly_ slower (ie we allow up to a 2us read+counter
+ *    update - anything else implies a unacceptably slow CPU
+ *    or PIT for the fast calibration to work.
+ *
+ *  - with 256 PIT ticks to read the value, we have 214us to
+ *    see the same MSB (and overhead like doing a single TSC
+ *    read per MSB value etc).
+ *
+ *  - We're doing 2 reads per loop (LSB, MSB), and we expect
+ *    them each to take about a microsecond on real hardware.
+ *    So we expect a count value of around 100. But we'll be
+ *    generous, and accept anything over 50.
+ *
+ *  - if the PIT is stuck, and we see *many* more reads, we
+ *    return early (and the next caller of pit_expect_msb()
+ *    then consider it a failure when they don't see the
+ *    next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+	int count = 0;
+
+	for (count = 0; count < 50000; count++) {
+		/* Ignore LSB */
+		inb(0x42);
+		if (inb(0x42) != val)
+			break;
+	}
+	return count > 50;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for a
+ * 15ms calibration, which assuming a 2us counter read
+ * error should give us roughly 150 ppm precision for
+ * the calibration.
+ */
+#define QUICK_PIT_MS 15
+#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+	/* Set the Gate high, disable speaker */
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	/*
+	 * Counter 2, mode 0 (one-shot), binary count
+	 *
+	 * NOTE! Mode 2 decrements by two (and then the
+	 * output is flipped each time, giving the same
+	 * final output frequency as a decrement-by-one),
+	 * so mode 0 is much better when looking at the
+	 * individual counts.
+	 */
+	outb(0xb0, 0x43);
+
+	/* Start at 0xffff */
+	outb(0xff, 0x42);
+	outb(0xff, 0x42);
+
+	if (pit_expect_msb(0xff)) {
+		int i;
+		u64 t1, t2, delta;
+		unsigned char expect = 0xfe;
+
+		t1 = get_cycles();
+		for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+			if (!pit_expect_msb(expect))
+				goto failed;
+		}
+		t2 = get_cycles();
+
+		/*
+		 * Ok, if we get here, then we've seen the
+		 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
+		 * times, and each MSB had many hits, so we never
+		 * had any sudden jumps.
+		 *
+		 * As a result, we can depend on there not being
+		 * any odd delays anywhere, and the TSC reads are
+		 * reliable.
+		 *
+		 * kHz = ticks / time-in-seconds / 1000;
+		 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
+		 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
+		 */
+		delta = (t2 - t1)*PIT_TICK_RATE;
+		do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
+		printk("Fast TSC calibration using PIT\n");
+		return delta;
+	}
+failed:
+	return 0;
+}
 
 /**
  * native_calibrate_tsc - calibrate the tsc on boot
@@ -235,9 +346,15 @@ unsigned long native_calibrate_tsc(void)
 {
 	u64 tsc1, tsc2, delta, ref1, ref2;
 	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-	unsigned long flags, latch, ms;
+	unsigned long flags, latch, ms, fast_calibrate;
 	int hpet = is_hpet_enabled(), i, loopmin;
 
+	local_irq_save(flags);
+	fast_calibrate = quick_pit_calibrate();
+	local_irq_restore(flags);
+	if (fast_calibrate)
+		return fast_calibrate;
+
 	/*
 	 * Run 5 calibration loops to get the lowest frequency value
 	 * (the best estimate). We use two different calibration modes

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 20:43                 ` Ingo Molnar
@ 2008-09-04 20:52                   ` Ingo Molnar
  2008-09-04 21:09                     ` Linus Torvalds
  2008-09-04 20:53                   ` Linus Torvalds
  1 sibling, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 20:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra


> +static unsigned long quick_pit_calibrate(void)
> +{
[...]
> +	if (pit_expect_msb(0xff)) {
> +		int i;
> +		u64 t1, t2, delta;
> +		unsigned char expect = 0xfe;
> +
> +		t1 = get_cycles();
> +		for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
> +			if (!pit_expect_msb(expect))
> +				goto failed;
> +		}
> +		t2 = get_cycles();

hm, unless i'm missing something i think here we still have a small 
window for an SMI or some virtualization delay to slip in and cause 
massive inaccuracy: if the delay happens _after_ the last 
pit_expect_msb() and _before_ the external get_cycles() call. Right?

i fixed that by adding one more pit_expect_msb() call.

plus i think QUICK_PIT_ITERATIONS is quite close to overflowing 255 
which is built into the u32 'expect' variable (the MSB will only 
overflow to 10 bits or so) - so i've added a BUILD_BUG_ON() to make sure 
anyone tuning QUICK_PIT_MS above 60msec or so would get a build error 
instead of some hard(er) to track down calibration error.

but it's getting late here so please double-check me ... The commit is 
below.

	Ingo

------------>
>From 40d2650256289d3ba59c4fd146b86b972db6ec40 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Sep 2008 22:47:47 +0200
Subject: [PATCH] x86: quick TSC calibration, improve

- make sure the final TSC timestamp is reliable too

- make sure nobody increases QUICK_PIT_MS so that
  QUICK_PIT_ITERATIONS can get larger than 0xff, breaking the iteration.
  (It would take about 60 msecs to reach that limit.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc.c |   11 +++++++++++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 839070b..4832a40 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -304,6 +304,11 @@ static unsigned long quick_pit_calibrate(void)
 	outb(0xff, 0x42);
 	outb(0xff, 0x42);
 
+	/*
+	 * The iteration assumes that expect never goes below zero:
+	 */
+	BUILD_BUG_ON(QUICK_PIT_ITERATIONS >= 0xff);
+
 	if (pit_expect_msb(0xff)) {
 		int i;
 		u64 t1, t2, delta;
@@ -317,6 +322,12 @@ static unsigned long quick_pit_calibrate(void)
 		t2 = get_cycles();
 
 		/*
+		 * Make sure we can rely on the second TSC timestamp:
+		 */
+		if (!pit_expect_msb(--expect))
+			goto failed;
+
+		/*
 		 * Ok, if we get here, then we've seen the
 		 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
 		 * times, and each MSB had many hits, so we never

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 20:43                 ` Ingo Molnar
  2008-09-04 20:52                   ` Ingo Molnar
@ 2008-09-04 20:53                   ` Linus Torvalds
  1 sibling, 0 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 20:53 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra



On Thu, 4 Sep 2008, Ingo Molnar wrote:
> 
> if you are happy with the commit below, could you please give your 
> Signed-off-by for it?

Sure.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

		Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 15:36 ` [RFC patch 0/4] TSC calibration improvements Ingo Molnar
  2008-09-04 15:45   ` Linus Torvalds
@ 2008-09-04 21:00   ` Valdis.Kletnieks
  1 sibling, 0 replies; 52+ messages in thread
From: Valdis.Kletnieks @ 2008-09-04 21:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, LKML, Linus Torvalds, Alok Kataria,
	Arjan van de Veen, H. Peter Anvin

[-- Attachment #1: Type: text/plain, Size: 624 bytes --]

On Thu, 04 Sep 2008 17:36:20 +0200, Ingo Molnar said:

> i've added them to tip/x86/tsc and merged it into tip/master - if 
> there's test success we can merge it into x86/urgent as well and push it 
> into v2.6.27. Any objections to that merge route?

Timekeeping has always been a snarly issue fraught with peril when testing on
new hardware.  Are we confident enough that this will DTRT on almost every
platform, including all the busticated ones?  Seems dangerous to be merging
it for .27 when we're already past -rc5.

But if Ingo and Thomas want to merge it, it's OK with me as long as they
take the blame for it. :)


[-- Attachment #2: Type: application/pgp-signature, Size: 226 bytes --]

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 20:52                   ` Ingo Molnar
@ 2008-09-04 21:09                     ` Linus Torvalds
  2008-09-04 21:21                       ` Ingo Molnar
  2008-09-04 21:33                       ` Ingo Molnar
  0 siblings, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 21:09 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

On Thu, 4 Sep 2008, Ingo Molnar wrote:
> 
> hm, unless i'm missing something i think here we still have a small 
> window for an SMI or some virtualization delay to slip in and cause 
> massive inaccuracy: if the delay happens _after_ the last 
> pit_expect_msb() and _before_ the external get_cycles() call. Right?

Yes. I had the extra pit_expect_msb() originally, but decided that 
basically a single-instruction race for somethign that ran without 
any MSI for 15ms was a bit pointless.

But adding another pit_expect_msb() is certainly not wrong. 

However, this one is:

> +	/*
> +	 * The iteration assumes that expect never goes below zero:
> +	 */
> +	BUILD_BUG_ON(QUICK_PIT_ITERATIONS >= 0xff);

No it doesn't. "expect" is unsigned char and will happily wrap, as will 
the PIT timer. The fact that it is in "single shot" mode doesn't actually 
mean that the timer stops, it just affects what happens when it goes down 
to zero. 

So that BUILD_BUG_ON() is misleading and incorrect.

Of course, it is true that in _practice_ you would never actually want to 
delay that long, but the code as written should be perfectly happy to 
iterate arbitrarily many times. Of course, the actual final TSC 
multiply/divide calculations would overflow at some point, but that's much 
further down the line.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:09                     ` Linus Torvalds
@ 2008-09-04 21:21                       ` Ingo Molnar
  2008-09-04 21:30                         ` Linus Torvalds
  2008-09-04 21:33                       ` Ingo Molnar
  1 sibling, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 21:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> However, this one is:
> 
> > +	/*
> > +	 * The iteration assumes that expect never goes below zero:
> > +	 */
> > +	BUILD_BUG_ON(QUICK_PIT_ITERATIONS >= 0xff);
> 
> No it doesn't. "expect" is unsigned char and will happily wrap, as 
> will the PIT timer. The fact that it is in "single shot" mode doesn't 
> actually mean that the timer stops, it just affects what happens when 
> it goes down to zero.
> 
> So that BUILD_BUG_ON() is misleading and incorrect.

ah, indeed, that bit of mine is wrong - and the period is programmed to 
0xffff so it should all work out just fine. You code in a way too tricky 
manner ;) I zapped that portion.

In fact ... shouldnt we intentionally include a 'wraparound' event in 
the test? Some of the erratums/instabilities regarding PITs happened 
around wraparounds [of the lsb] - maybe the wraparound of the MSB 
matters too. (Maybe some boards freeze the counter readout until the 
host OS ACKs the PIT irq or something - which we dont do in this 
calibration run so if there's some weirdness there we'd detect it.)

So maybe we should start with an expect value of QUICK_PIT_ITERATIONS/2, 
with a wraparound right in the middle of the measurement?

	Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:21                       ` Ingo Molnar
@ 2008-09-04 21:30                         ` Linus Torvalds
  2008-09-04 21:34                           ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 21:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra



On Thu, 4 Sep 2008, Ingo Molnar wrote:
> 
> In fact ... shouldnt we intentionally include a 'wraparound' event in 
> the test?

No.

Face it, if somebody tries to make QUICK_PIT_MS be so large as to that be 
an issue, then the whole point of the function goes away.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:09                     ` Linus Torvalds
  2008-09-04 21:21                       ` Ingo Molnar
@ 2008-09-04 21:33                       ` Ingo Molnar
  2008-09-05 22:18                         ` Alok Kataria
  1 sibling, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 21:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, 4 Sep 2008, Ingo Molnar wrote:
> > 
> > hm, unless i'm missing something i think here we still have a small 
> > window for an SMI or some virtualization delay to slip in and cause 
> > massive inaccuracy: if the delay happens _after_ the last 
> > pit_expect_msb() and _before_ the external get_cycles() call. Right?
> 
> Yes. I had the extra pit_expect_msb() originally, but decided that 
> basically a single-instruction race for somethign that ran without any 
> MSI for 15ms was a bit pointless.

the race is wider than that i think: all it takes an SMI at the last PIO 
access, so the window should be 1 usec, against a 15000 usecs period. 
That's 1 out of 15,000 boxes coming up with totally incorrect 
calibration.

we also might have a very theoretical race of an SMI taking exactly 65 
msecs so that the whole PIT wraps around and fools the fastpath - the 
chance for that would be around 1:300 - assuming we only have to hit the 
right MSB with a ~200 usecs precision). That assumes equal distribution 
of SMI costs which they certainly dont have - most of them are much less 
than 60 msecs. So i dont think it's an issue in practice - on real hw.

But it's still a possibility unless i'm missing something. We could 
protect against that case by reading the IRQ0-pending bit and making 
sure it's not pending after we have done the closing TSC readout.

> But adding another pit_expect_msb() is certainly not wrong.

ok, i kept that bit.

	Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:30                         ` Linus Torvalds
@ 2008-09-04 21:34                           ` Linus Torvalds
  2008-09-04 21:39                             ` Ingo Molnar
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 21:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

On Thu, 4 Sep 2008, Linus Torvalds wrote:
> 
> Face it, if somebody tries to make QUICK_PIT_MS be so large as to that be 
> an issue, then the whole point of the function goes away.

Btw, the same is true of adding any random "sanity checking". The point of 
that thing was to simply only work when the PIT works as advertized, and 
fail immediately if it doesn't. Even *if* you were to pick a big 
calibration delay *and* if you happened to have a PIT that is broken and 
doesn't wrap correctly, the design of the thing would mean that it would 
then fail the calibration already.

Exactly because it would _see_ that it's not wrapping.

So then it returns zero, and the slow and complicated case can run.

Take a look at the generated assembly language. I literally wrote it so 
that you could imagine that it's an old-time asm hacker that wrote the 
asm. Don't screw it up.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 20:09               ` Linus Torvalds
  2008-09-04 20:43                 ` Ingo Molnar
@ 2008-09-04 21:38                 ` Alok Kataria
  2008-09-04 21:52                   ` Linus Torvalds
  1 sibling, 1 reply; 52+ messages in thread
From: Alok Kataria @ 2008-09-04 21:38 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Ingo Molnar, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

On Thu, 2008-09-04 at 13:09 -0700, Linus Torvalds wrote:
> 
> On Thu, 4 Sep 2008, Linus Torvalds wrote:
> >
> > Yeah, I had some memory of latch issues. I wrote the thing originally
> > without the latching, which is why the whole thing is designed to igore
> > the low cycle count. I just decided that doing the latching shouldn't
> > hurt that much, even if it ends up being just a 1us no-op.
> 
> Thinking more about it, that was the wrong decision. After all, the whole
> point of the "quick calibration" is to take care of the good case of
> reasonable hardware - and fall back on a much slower version if there is
> something odd going on.
> 
> So latching things is the wrong thing to do, since it just slows things
> down. No real hardware should need it, and if some odd hardware does
> exist, all the other sanity checks will catch it anyway.
> 
> And yeah, I shouldn't have tried to go from 250ms down to 2.5ms. Aim for
> something like a 15ms calibration instead, which should give plenty of
> precision, while still being much faster than we used to be.
> 
> So how about this?
> 
> (Stage #2 would then be to simplify the main calibration loop now that we
> know that it's there really as a fall-back when the PIT isn't stable for
> some reason, but that's a separate issue).
> 
> Thomas, I assume that this one catches your SMI-laptop and falls back to
> the slow case, the same way Alok already said it catches his VM setup?

Just for the record, I ran this reboot loop for 50 times, and in 1 of
the run the kernel did use fast pit calibration method.
These runs were on vanilla 2.6.27-rc5 with this patch (take 2 from
Linus) and Ingo's fix.
The frequency that was calibrated by this run was 1866.291 Mhz, 
usually it's about 1866.692 Mhz. Slow method calibrates in the same
range.

In all the other 49 odd runs it did calibrate using the slow method. 
I printed the count values to see how much are we missing the fast
method by.

The maximum count value that I see is 84.
In one single reboot run, on an average in about 70 iterations the val
returned from pit_expect_msb is > 50, and eventually we hit a condition
where the value is < 50 and we bail out of the fast method.

So just to be on safer side can we be a little less generous and
increase the threshold to somewhere around 75 from 50 ? Or is there a
good reason not to ?

Thanks,
Alok

> 
>                 Linus
> 
> ---
>  arch/x86/kernel/tsc.c |  119 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 118 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 8f98e9d..4589ae4 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -181,6 +181,117 @@ static unsigned long pit_calibrate_tsc(void)
>         return delta;
>  }
> 
> +/*
> + * This reads the current MSB of the PIT counter, and
> + * checks if we are running on sufficiently fast and
> + * non-virtualized hardware.
> + *
> + * Our expectations are:
> + *
> + *  - the PIT is running at roughly 1.19MHz
> + *
> + *  - each IO is going to take about 1us on real hardware,
> + *    but we allow it to be much faster (by a factor of 10) or
> + *    _slightly_ slower (ie we allow up to a 2us read+counter
> + *    update - anything else implies a unacceptably slow CPU
> + *    or PIT for the fast calibration to work.
> + *
> + *  - with 256 PIT ticks to read the value, we have 214us to
> + *    see the same MSB (and overhead like doing a single TSC
> + *    read per MSB value etc).
> + *
> + *  - We're doing 2 reads per loop (LSB, MSB), and we expect
> + *    them each to take about a microsecond on real hardware.
> + *    So we expect a count value of around 100. But we'll be
> + *    generous, and accept anything over 50.
> + *
> + *  - if the PIT is stuck, and we see *many* more reads, we
> + *    return early (and the next caller of pit_expect_msb()
> + *    then consider it a failure when they don't see the
> + *    next expected value).
> + *
> + * These expectations mean that we know that we have seen the
> + * transition from one expected value to another with a fairly
> + * high accuracy, and we didn't miss any events. We can thus
> + * use the TSC value at the transitions to calculate a pretty
> + * good value for the TSC frequencty.
> + */
> +static inline int pit_expect_msb(unsigned char val)
> +{
> +       int count = 0;
> +
> +       for (count = 0; count < 50000; count++) {
> +               /* Ignore LSB */
> +               inb(0x42);
> +               if (inb(0x42) != val)
> +                       break;
> +       }
> +       return count > 50;
> +}
> +
> +/*
> + * How many MSB values do we want to see? We aim for a
> + * 15ms calibration, which assuming a 2us counter read
> + * error should give us roughly 150 ppm precision for
> + * the calibration.
> + */
> +#define QUICK_PIT_MS 15
> +#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
> +
> +static unsigned long quick_pit_calibrate(void)
> +{
> +       /* Set the Gate high, disable speaker */
> +       outb((inb(0x61) & ~0x02) | 0x01, 0x61);
> +
> +       /*
> +        * Counter 2, mode 0 (one-shot), binary count
> +        *
> +        * NOTE! Mode 2 decrements by two (and then the
> +        * output is flipped each time, giving the same
> +        * final output frequency as a decrement-by-one),
> +        * so mode 0 is much better when looking at the
> +        * individual counts.
> +        */
> +       outb(0xb0, 0x43);
> +
> +       /* Start at 0xffff */
> +       outb(0xff, 0x42);
> +       outb(0xff, 0x42);
> +
> +       if (pit_expect_msb(0xff)) {
> +               int i;
> +               u64 t1, t2, delta;
> +               unsigned char expect = 0xfe;
> +
> +               t1 = get_cycles();
> +               for (expect = 0xfe, i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
> +                       if (!pit_expect_msb(expect))
> +                               goto failed;
> +               }
> +               t2 = get_cycles();
> +
> +               /*
> +                * Ok, if we get here, then we've seen the
> +                * MSB of the PIT decrement QUICK_PIT_ITERATIONS
> +                * times, and each MSB had many hits, so we never
> +                * had any sudden jumps.
> +                *
> +                * As a result, we can depend on there not being
> +                * any odd delays anywhere, and the TSC reads are
> +                * reliable.
> +                *
> +                * kHz = ticks / time-in-seconds / 1000;
> +                * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
> +                * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
> +                */
> +               delta = (t2 - t1)*PIT_TICK_RATE;
> +               do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
> +               printk("Fast TSC calibration using PIT\n");
> +               return delta;
> +       }
> +failed:
> +       return 0;
> +}
> 
>  /**
>   * native_calibrate_tsc - calibrate the tsc on boot
> @@ -189,9 +300,15 @@ unsigned long native_calibrate_tsc(void)
>  {
>         u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
>         unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
> -       unsigned long flags;
> +       unsigned long flags, fast_calibrate;
>         int hpet = is_hpet_enabled(), i;
> 
> +       local_irq_save(flags);
> +       fast_calibrate = quick_pit_calibrate();
> +       local_irq_restore(flags);
> +       if (fast_calibrate)
> +               return fast_calibrate;
> +
>         /*
>          * Run 5 calibration loops to get the lowest frequency value
>          * (the best estimate). We use two different calibration modes


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:34                           ` Linus Torvalds
@ 2008-09-04 21:39                             ` Ingo Molnar
  0 siblings, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2008-09-04 21:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Thomas Gleixner, LKML, Alok Kataria, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, 4 Sep 2008, Linus Torvalds wrote:
> > 
> > Face it, if somebody tries to make QUICK_PIT_MS be so large as to 
> > that be an issue, then the whole point of the function goes away.
> 
> Btw, the same is true of adding any random "sanity checking". The 
> point of that thing was to simply only work when the PIT works as 
> advertized, and fail immediately if it doesn't. Even *if* you were to 
> pick a big calibration delay *and* if you happened to have a PIT that 
> is broken and doesn't wrap correctly, the design of the thing would 
> mean that it would then fail the calibration already.
> 
> Exactly because it would _see_ that it's not wrapping.

yeah - pit_expect_msb() would return after 50,000 iterations with a 
failure. So i was wondering whether for such PITs we _want_ the slow and 
complicated case to run.

Probably not, because the PIT could quite likely still be counting very 
precisely as long as no wraparound was involved.

	Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:38                 ` Alok Kataria
@ 2008-09-04 21:52                   ` Linus Torvalds
  2008-09-04 22:09                     ` Alok Kataria
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-04 21:52 UTC (permalink / raw)
  To: Alok Kataria
  Cc: Alan Cox, Ingo Molnar, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

On Thu, 4 Sep 2008, Alok Kataria wrote:
> 
> The maximum count value that I see is 84.
> In one single reboot run, on an average in about 70 iterations the val
> returned from pit_expect_msb is > 50, and eventually we hit a condition
> where the value is < 50 and we bail out of the fast method.
> 
> So just to be on safer side can we be a little less generous and
> increase the threshold to somewhere around 75 from 50 ? Or is there a
> good reason not to ?

Why would you? 

The reason the single run completed successfully was apparently that no 
actual virtualization event triggered, so it actually accessed the 
hardware successfully and without any real slowdown.  As shown also by the 
fact that the actual frequency was correct at the end.

The ones that failed presumably all had interrupts that happened in the 
VM, which then immediately triggered the "uhhuh, there was a bump" thing.

IOW, the code worked correctly as designed. It's not a 
"anti-virtualization" feature per se, it's a "detect when virtualization 
screws up timing". When virtualization (or SMI etc) does _not_ screw up 
timing, it all works fine.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:52                   ` Linus Torvalds
@ 2008-09-04 22:09                     ` Alok Kataria
  0 siblings, 0 replies; 52+ messages in thread
From: Alok Kataria @ 2008-09-04 22:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Ingo Molnar, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra

On Thu, 2008-09-04 at 14:52 -0700, Linus Torvalds wrote:
> 
> On Thu, 4 Sep 2008, Alok Kataria wrote:
> >
> > The maximum count value that I see is 84.
> > In one single reboot run, on an average in about 70 iterations the val
> > returned from pit_expect_msb is > 50, and eventually we hit a condition
> > where the value is < 50 and we bail out of the fast method.
> >
> > So just to be on safer side can we be a little less generous and
> > increase the threshold to somewhere around 75 from 50 ? Or is there a
> > good reason not to ?
> 
> Why would you?
> 
> The reason the single run completed successfully was apparently that no
> actual virtualization event triggered, so it actually accessed the
> hardware successfully and without any real slowdown.  As shown also by the
> fact that the actual frequency was correct at the end.
> 
> The ones that failed presumably all had interrupts that happened in the
> VM, which then immediately triggered the "uhhuh, there was a bump" thing.
> 
> IOW, the code worked correctly as designed. It's not a
> "anti-virtualization" feature per se, it's a "detect when virtualization
> screws up timing". When virtualization (or SMI etc) does _not_ screw up
> timing, it all works fine.

Maybe i was just being too paranoid, the slow calibration method has
been known to work correctly even under extreme stress conditions for us
and wanted to make sure that we always take that path.

Anyways I understand your point so lets stick with this value, if there
are any false positives that I see with this i will get back.

Thanks,
Alok




> 
>                         Linus


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-04 21:33                       ` Ingo Molnar
@ 2008-09-05 22:18                         ` Alok Kataria
  2008-09-05 22:34                           ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: Alok Kataria @ 2008-09-05 22:18 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra, Dan Hecht, Garrett Smith

On Thu, 2008-09-04 at 14:33 -0700, Ingo Molnar wrote:
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> > On Thu, 4 Sep 2008, Ingo Molnar wrote:
> > >
> > > hm, unless i'm missing something i think here we still have a small
> > > window for an SMI or some virtualization delay to slip in and cause
> > > massive inaccuracy: if the delay happens _after_ the last
> > > pit_expect_msb() and _before_ the external get_cycles() call. Right?
> >
> > Yes. I had the extra pit_expect_msb() originally, but decided that
> > basically a single-instruction race for somethign that ran without any
> > MSI for 15ms was a bit pointless.
> 
> the race is wider than that i think: all it takes an SMI at the last PIO
> access, so the window should be 1 usec, against a 15000 usecs period.
> That's 1 out of 15,000 boxes coming up with totally incorrect
> calibration.
> 
> we also might have a very theoretical race of an SMI taking exactly 65
> msecs so that the whole PIT wraps around and fools the fastpath - the
> chance for that would be around 1:300 - assuming we only have to hit the
> right MSB with a ~200 usecs precision). That assumes equal distribution
> of SMI costs which they certainly dont have - most of them are much less
> than 60 msecs. So i dont think it's an issue in practice - on real hw.
> 
> But it's still a possibility unless i'm missing something. We could
> protect against that case by reading the IRQ0-pending bit and making
> sure it's not pending after we have done the closing TSC readout.
> 
> > But adding another pit_expect_msb() is certainly not wrong.
> 

Hi, 
I ran the current tree with these patches on my VM setup for both 32 &
64bit around 200 reboots each. 
The system entered the FAST calibration mode more often this time,
around 25% of time.
And i had an interesting case where in the frequency that was calibrated
was 1875Mhz compared to actual ~1866Mhz, leaving an error of 0.5%.

Now, looking at the code.
Even with this last pit_expect_msb check, i think there can be a case
when a error spanning 114usec can slip in the TSC calculation. 

This can happen if, 
in the pit_expect_msb (the one just before the second read_tsc),
we hit an SMI/virtualization event *after* doing the 50 iterations of
PIT read loop, this allows the pit_expect_msb to succeed when the SMI
returns.

If this SMI/Virtualization event spans across the next PIT MSB increment
interval leaving sufficient time (100us) for the last pit_expect_msb to
succeed.
We can have a error of 1MSB tick increment - time taken for the last
pit_expect_msb to succeed, in the read TSC value.

i.e. a error of (214us - 100us) in the 15msec period, i.e. error of
7600PPM ??

And, in order for the TSC clocksource to keep correct time (on systems
where the TSC clocksource is usable), the TSC frequency estimate must be
within 500 ppm of its true frequency, otherwise NTP will not be able to
correct it.

So, IMHO we should not use this algorithm. 

I don't know if increasing the count threshold will help too, since that
threshold value may fail for some system which perform better than our
assumption of "we take 2us to do the 2 PIT reads". Atleast in
virtualized environment I can make no such guarantees. 

Thanks,
Alok

> ok, i kept that bit.
> 
>         Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-05 22:18                         ` Alok Kataria
@ 2008-09-05 22:34                           ` Linus Torvalds
  2008-09-06 20:03                             ` Thomas Gleixner
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-05 22:34 UTC (permalink / raw)
  To: Alok Kataria
  Cc: Alan Cox, Thomas Gleixner, LKML, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra, Dan Hecht, Garrett Smith

On Fri, 5 Sep 2008, Alok Kataria wrote:
> 
> This can happen if, in the pit_expect_msb (the one just before the 
> second read_tsc), we hit an SMI/virtualization event *after* doing the 
> 50 iterations of PIT read loop, this allows the pit_expect_msb to 
> succeed when the SMI returns.

So theoretically, on real hardware, the minimum of 50 reads will take 
100us. The 256 PTI cycles will take 214us, so in the absolute worst case, 
you can have two consecutive successful cycles despite having a 228us SMI 
(or other event) if it happens just in the middle.

Of course, then the actual _error_ on the TSC read will be just half that, 
but since there are two TSC reads - one at the beginning and one at the 
end - and if the errors of the two reads go in opposite directions, they 
can add up to 228us.

So I agree - in theory you can have a fairly big error if you hit 
everything just right. In practice, of course, even that *maximal* error 
is actually perfectly fine for TSC calibration.

So I just don't care one whit.  The fact is, fast bootup is more important 
than some crazy and totally unrealistic VM situation. The 50ms thing was 
already too long, the 250ms one is unbearable.

The thing is, you _can_ calibrate the thing more carefully _later_. Use a 
tiemr to do two events one second apart (without slowing down the boot) if 
you want to get a really good value, along with the HPET/PMTIMER 
fine-tuning. That way you should actually be able to get a _really_ 
precise thing, because you do need a long time to get precision. But that 
long time should not be in a critical path on the bootup.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-05 22:34                           ` Linus Torvalds
@ 2008-09-06 20:03                             ` Thomas Gleixner
  2008-09-06 20:29                               ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 20:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Fri, 5 Sep 2008, Linus Torvalds wrote:
> On Fri, 5 Sep 2008, Alok Kataria wrote:
> > 
> > This can happen if, in the pit_expect_msb (the one just before the 
> > second read_tsc), we hit an SMI/virtualization event *after* doing the 
> > 50 iterations of PIT read loop, this allows the pit_expect_msb to 
> > succeed when the SMI returns.
> 
> So theoretically, on real hardware, the minimum of 50 reads will take 
> 100us. The 256 PTI cycles will take 214us, so in the absolute worst case, 
> you can have two consecutive successful cycles despite having a 228us SMI 
> (or other event) if it happens just in the middle.

I just ran your patch in a loop on the SMI mephitic laptop with random
delays between the loops.

20 out of 10000 results are between 2200 and 8400Mhz while the CPU
still runs @2GHz.

And it also happened in an automated boot/reboot cycle (no extra loop)
once out of 50.

So the SMI hit exactly:

+      	t1 = get_cycles();
+      	for (expect = 0xfe, i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+	      	if (!pit_expect_msb(expect))
+			goto failed;
+	}

-----> HERE

+	t2 = get_cycles();

The SMIs on that machine take between 500us and 100+ms according to my
instrumentation experiments: http://lkml.org/lkml/2008/9/2/202

Adding another check after the second get_cycles() makes it reliable:

+      	t1 = get_cycles();
+      	for (expect = 0xfe, i = 0; i < QUICK_PIT_ITERATIONS; i++) {
+	      	if (!pit_expect_msb(expect--))
+			goto failed;
+	}
+   	t2 = get_cycles();
+
+	if (!pit_expect_msb(expect))
+		goto failed;

That solves the problem on that box and will detect any virtualization
delay as well. I just had to move out the "expect--" from the for() as
gcc is overly clever.

> Of course, then the actual _error_ on the TSC read will be just half that, 
> but since there are two TSC reads - one at the beginning and one at the 
> end - and if the errors of the two reads go in opposite directions, they 
> can add up to 228us.
> 
> So I agree - in theory you can have a fairly big error if you hit 
> everything just right. In practice, of course, even that *maximal* error 
> is actually perfectly fine for TSC calibration.

The above _maximal_ error of 400% is not perfectly fine at all.

> So I just don't care one whit.  The fact is, fast bootup is more important 
> than some crazy and totally unrealistic VM situation. The 50ms thing was 
> already too long, the 250ms one is unbearable.

True. You applied a first draft of my patch right away from mail. It
was the accumulated findings of my detective work on various wreckaged
hardware.

The follow up patches I sent (http://lkml.org/lkml/2008/9/4/254),
bring it down to 10ms in the good and 30ms in the worst case with the
option for 50ms in the last round to make the virtualized/emulated
stuff happy. With that I have not seen any wrong result on any of my
jinxed boxen so far and it works very reliable on virtualized
environments as well.

Combining them with your fast calibration should be a solid and
reasonable fast solution in all corner cases.

Ingo put them into the -tip tree:

  git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86/tsc

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:03                             ` Thomas Gleixner
@ 2008-09-06 20:29                               ` Linus Torvalds
  2008-09-06 20:37                                 ` Thomas Gleixner
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 20:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith



On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> 
> Adding another check after the second get_cycles() makes it reliable:

My original patch actually had that, and Ingo added it to the -tip tree 
too (I think), so that should be the one we're discussing anyway. I just 
didn't think it would ever trigger in practice, which is why I removed it. 
Mea culpa.

		Linus


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:29                               ` Linus Torvalds
@ 2008-09-06 20:37                                 ` Thomas Gleixner
  2008-09-06 20:50                                   ` Linus Torvalds
  2008-09-06 20:52                                   ` Thomas Gleixner
  0 siblings, 2 replies; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 20:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Linus Torvalds wrote:
> 
> On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > 
> > Adding another check after the second get_cycles() makes it reliable:
> 
> My original patch actually had that, and Ingo added it to the -tip tree 
> too (I think), so that should be the one we're discussing anyway. I just 
> didn't think it would ever trigger in practice, which is why I removed it. 
> Mea culpa.

My bad. Did not test that against -tip, just applied it from mail :)

Just checked. The -tip version still has the expect-- in the for()
which might lead to stupid results depending on the gcc madness level.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:37                                 ` Thomas Gleixner
@ 2008-09-06 20:50                                   ` Linus Torvalds
  2008-09-06 20:55                                     ` Linus Torvalds
  2008-09-06 20:58                                     ` Thomas Gleixner
  2008-09-06 20:52                                   ` Thomas Gleixner
  1 sibling, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 20:50 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith



On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> 
> Just checked. The -tip version still has the expect-- in the for()
> which might lead to stupid results depending on the gcc madness level.

Umm. What? You're on some odd drugs.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:37                                 ` Thomas Gleixner
  2008-09-06 20:50                                   ` Linus Torvalds
@ 2008-09-06 20:52                                   ` Thomas Gleixner
  2008-09-06 20:59                                     ` Linus Torvalds
  1 sibling, 1 reply; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 20:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> On Sat, 6 Sep 2008, Linus Torvalds wrote:
> > 
> > On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > > 
> > > Adding another check after the second get_cycles() makes it reliable:
> > 
> > My original patch actually had that, and Ingo added it to the -tip tree 
> > too (I think), so that should be the one we're discussing anyway. I just 
> > didn't think it would ever trigger in practice, which is why I removed it. 
> > Mea culpa.
> 
> My bad. Did not test that against -tip, just applied it from mail :)
> 
> Just checked. The -tip version still has the expect-- in the for()
> which might lead to stupid results depending on the gcc madness level.

If Alok has the second check in place and is actually worried about
that 288us impact, then we can add the following (untested), which
does not impact the speed of the check.

Against -tip x86/tsc

Thanks,

	tglx
---

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6dab90f..3bfe083 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -310,8 +310,8 @@ static unsigned long quick_pit_calibrate(void)
 		unsigned char expect = 0xfe;
 
 		t1 = get_cycles();
-		for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
-			if (!pit_expect_msb(expect))
+		for (i = 0; i < QUICK_PIT_ITERATIONS; i++) {
+			if (!pit_expect_msb(expect--))
 				goto failed;
 		}
 		t2 = get_cycles();
@@ -319,7 +319,7 @@ static unsigned long quick_pit_calibrate(void)
 		/*
 		 * Make sure we can rely on the second TSC timestamp:
 		 */
-		if (!pit_expect_msb(--expect))
+		if (!pit_expect_msb(expect))
 			goto failed;
 
 		/*
@@ -338,7 +338,6 @@ static unsigned long quick_pit_calibrate(void)
 		 */
 		delta = (t2 - t1)*PIT_TICK_RATE;
 		do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
-		printk("Fast TSC calibration using PIT\n");
 		return delta;
 	}
 failed:
@@ -356,10 +355,42 @@ unsigned long native_calibrate_tsc(void)
 	int hpet = is_hpet_enabled(), i, loopmin;
 
 	local_irq_save(flags);
+	tsc1 = tsc_read_refs(&ref1, hpet);
 	fast_calibrate = quick_pit_calibrate();
+	tsc2 = tsc_read_refs(&ref1, hpet);
 	local_irq_restore(flags);
-	if (fast_calibrate)
-		return fast_calibrate;
+
+	if (fast_calibrate) {
+		/*
+		 * Return the fast_calibrate value when neither hpet
+		 * nor pmtimer are available.
+		 */
+		if (!hpet && !ref1 && !ref2) {
+			printk("Fast TSC calibration using PIT\n");
+			return fast_calibrate;
+		}
+
+		/* Check, whether the sampling was disturbed by an SMI */
+		if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
+			goto slowpath;
+
+		tsc2 = (tsc2 - tsc1) * 1000000LL;
+		if (hpet)
+			tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+		else
+			tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
+
+		/* Check the reference deviation */
+		delta = ((u64) fast_calibrate) * 100;
+		do_div(delta, tsc2);
+
+		if (delta >= 90 && delta <= 110) {
+			printk("Fast TSC calibration using PIT\n");
+			return fast_calibrate;
+		}
+	}
+
+slowpath:
 
 	/*
 	 * Run 5 calibration loops to get the lowest frequency value


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:50                                   ` Linus Torvalds
@ 2008-09-06 20:55                                     ` Linus Torvalds
  2008-09-06 21:15                                       ` Thomas Gleixner
  2008-09-06 22:40                                       ` Ingo Molnar
  2008-09-06 20:58                                     ` Thomas Gleixner
  1 sibling, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 20:55 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Linus Torvalds wrote:

> 
> 
> On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > 
> > Just checked. The -tip version still has the expect-- in the for()
> > which might lead to stupid results depending on the gcc madness level.
> 
> Umm. What? You're on some odd drugs.

Oh, you mean te "--expect" in the last pit_expect_msb(). Yeah, that one 
looks bogus, but I don't understand what it has to do with gcc at all.

"expect" is an unsigned char. There are absolutely _zero_ issues with 
overflow, underflow, random phases of the moon, madness levels or anything 
else. But yes, it does look like Ingo screwed up when adding that final 
check, since expect was already decremented at the end of the loop.

Ingo? Did you actually test it?

		Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:50                                   ` Linus Torvalds
  2008-09-06 20:55                                     ` Linus Torvalds
@ 2008-09-06 20:58                                     ` Thomas Gleixner
  2008-09-06 21:10                                       ` Linus Torvalds
  1 sibling, 1 reply; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 20:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Linus Torvalds wrote:

> 
> 
> On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > 
> > Just checked. The -tip version still has the expect-- in the for()
> > which might lead to stupid results depending on the gcc madness level.
> 
> Umm. What? You're on some odd drugs.

Just straight forward german beer :)
 
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6dab90f..3bfe083 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -310,8 +310,8 @@ static unsigned long quick_pit_calibrate(void)
 		unsigned char expect = 0xfe;
 
 		t1 = get_cycles();
		for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
			if (!pit_expect_msb(expect))
 				goto failed;
 		}
 		t2 = get_cycles();

 		/*
 		 * Make sure we can rely on the second TSC timestamp:
 		 */
		if (!pit_expect_msb(--expect))
 			goto failed;
 
Where is a guarantee, that excpect is not decremented before we break
out of the loop ? 

the "expect--" can be done _BEFORE_ the i < QUICK_PIT_ITERATIONS
evaluation. Not likely, but ...

This version works always

 		t1 = get_cycles();
		for (i = 0; i < QUICK_PIT_ITERATIONS; i++) {
			if (!pit_expect_msb(expect--))
 				goto failed;
 		}
 		t2 = get_cycles();

 		/*
 		 * Make sure we can rely on the second TSC timestamp:
 		 */

		if (!pit_expect_msb(expect))
 			goto failed;

Thanks,

	tglx

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:52                                   ` Thomas Gleixner
@ 2008-09-06 20:59                                     ` Linus Torvalds
  2008-09-06 21:07                                       ` Thomas Gleixner
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 20:59 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> 
> If Alok has the second check in place and is actually worried about
> that 288us impact, then we can add the following (untested), which
> does not impact the speed of the check.

Guys, please. 

Show some _taste_.

Dammit, stop adding random crap to "native_calibrate_tsc()" and make it 
look like total and utter SHIT.

If you want to do that

	tsc1 = tsc_read_refs(&ref1, hpet);
	..
	tsc2 = tsc_read_refs(&ref1, hpet);

around calibration and comparing it, then do it *once*. Do it over the 
whole thing. Do it in a function of its own, instead of making this 
horrible and unreadable mess.

This patch may be fine as a "let's check if it works" thing, but please 
don't send out total SH*T to public lists.

Some _tasted_ in programming, please!

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:59                                     ` Linus Torvalds
@ 2008-09-06 21:07                                       ` Thomas Gleixner
  2008-09-06 21:15                                         ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 21:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Linus Torvalds wrote:
> On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > 
> > If Alok has the second check in place and is actually worried about
> > that 288us impact, then we can add the following (untested), which
> > does not impact the speed of the check.
> 
> Guys, please. 
> 
> Show some _taste_.

Tell the hardware dudes who made that crap so difficult
 
> Dammit, stop adding random crap to "native_calibrate_tsc()" and make it 
> look like total and utter SHIT.
> 
> If you want to do that
> 
> 	tsc1 = tsc_read_refs(&ref1, hpet);
> 	..
> 	tsc2 = tsc_read_refs(&ref1, hpet);
> 
> around calibration and comparing it, then do it *once*. Do it over the 
> whole thing. Do it in a function of its own, instead of making this 
> horrible and unreadable mess.

Over which _whole_ thing ? You want to have the very very fast thing,
which is not reliable under all circumstances as Alok pointed out and
I merily added a sanity check around that for testing.

> This patch may be fine as a "let's check if it works" thing, but please 
> don't send out total SH*T to public lists.

Why not ? We want to figure out if it solves the problem and sending
it to public lists is the fastest way to get it tested.

> Some _tasted_ in programming, please!

What we apply finally is a totally different thing.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:58                                     ` Thomas Gleixner
@ 2008-09-06 21:10                                       ` Linus Torvalds
  2008-09-07  6:01                                         ` Willy Tarreau
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 21:10 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Thomas Gleixner wrote:
>  
> Where is a guarantee, that excpect is not decremented before we break
> out of the loop ? 

Quite the reverse. We have a guarantee that it _is_ decremented.

Adn that guarantee is very much about the C language.

		for (a ; b ; c) {
			..
		}

translates as

		a;
		while (b) {
			..
	continue:
			c;
		}

And this has absolutely _nothing_ to do with any gcc oddity or anything 
else. 

The fact is, the code that Ingo added was totally bogus. The real bug was 
that he did a totally bogus "--expect" in the argument to that last call. 

Because 'c' *will* have been done after the last iteration of the loop.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 21:07                                       ` Thomas Gleixner
@ 2008-09-06 21:15                                         ` Linus Torvalds
  2008-09-06 21:26                                           ` Thomas Gleixner
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 21:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith



On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > 
> > Show some _taste_.
> 
> Tell the hardware dudes who made that crap so difficult

No. I'm telling you, because that patch IS CRAP.

> Over which _whole_ thing ? You want to have the very very fast thing,
> which is not reliable under all circumstances as Alok pointed out and
> I merily added a sanity check around that for testing.

You can move that thing _out_ into a function of its own.

Look at this piece fo CRAP, and tell me, HOW MANY TIMES do you want to 
repeat it?

+               /*
+                * Return the fast_calibrate value when neither hpet
+                * nor pmtimer are available.
+                */
+               if (!hpet && !ref1 && !ref2) {
+                       printk("Fast TSC calibration using PIT\n");
+                       return fast_calibrate;
+               }
+
+               /* Check, whether the sampling was disturbed by an SMI */
+               if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
+                       goto slowpath;
+
+               tsc2 = (tsc2 - tsc1) * 1000000LL;
+               if (hpet)
+                       tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+               else
+                       tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
+
+               /* Check the reference deviation */
+               delta = ((u64) fast_calibrate) * 100;
+               do_div(delta, tsc2);
+
+               if (delta >= 90 && delta <= 110) {
+                       printk("Fast TSC calibration using PIT\n");
+                       return fast_calibrate;
+               }
+       }

Here's a hint: we don't do cut-and-paste programming. And we don't get 
extra points for bloating a single function with the same unreadable code 
over and over and over again. 

How many copies do you want? And here's a hint: the answer is _one_. If 
you get any other answer, your patch is SHIT.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:55                                     ` Linus Torvalds
@ 2008-09-06 21:15                                       ` Thomas Gleixner
  2008-09-06 21:22                                         ` Linus Torvalds
  2008-09-06 22:40                                       ` Ingo Molnar
  1 sibling, 1 reply; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 21:15 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Linus Torvalds wrote:
> > > Just checked. The -tip version still has the expect-- in the for()
> > > which might lead to stupid results depending on the gcc madness level.
> > 
> > Umm. What? You're on some odd drugs.
> 
> Oh, you mean te "--expect" in the last pit_expect_msb(). Yeah, that one 
> looks bogus, but I don't understand what it has to do with gcc at all.

One gcc does:

    i++;
    if (i >= QUICK_PIT_ITERATIONS)
       goto out;
    expect--;

The other one does:

    i++;
    expect--;
    if (i >= QUICK_PIT_ITERATIONS)
       goto out;

Don't ask me which one is correct. It's just reality :(

/me goes back to consume legal german drugs

    tglx

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 21:15                                       ` Thomas Gleixner
@ 2008-09-06 21:22                                         ` Linus Torvalds
  2008-09-06 21:30                                           ` Thomas Gleixner
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 21:22 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> 
> One gcc does:
> 
>     i++;
>     if (i >= QUICK_PIT_ITERATIONS)
>        goto out;
>     expect--;
> 
> The other one does:
> 
>     i++;
>     expect--;
>     if (i >= QUICK_PIT_ITERATIONS)
>        goto out;
> 
> Don't ask me which one is correct. It's just reality :(

Show me. Because I simply don't believe you.

The first code is simply _wrong_ - except if "expect" isn't even _used_ 
afterwards (in which case gcc can optimize away the last unused write).

And I strongly suspect that that is what you've seen. 

Because quite frankly, if what you describe is real, then your gcc is 
incredibly buggy. So buggy that it sounds unlikely to be able to compile 
the kernel in many other places. This is very simple and very fundamental 
C, not something subtle or even half-way undefined.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 21:15                                         ` Linus Torvalds
@ 2008-09-06 21:26                                           ` Thomas Gleixner
  2008-09-06 21:32                                             ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 21:26 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Linus Torvalds wrote:
> On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > > 
> > > Show some _taste_.
> > 
> > Tell the hardware dudes who made that crap so difficult
> 
> No. I'm telling you, because that patch IS CRAP.
> 
> > Over which _whole_ thing ? You want to have the very very fast thing,
> > which is not reliable under all circumstances as Alok pointed out and
> > I merily added a sanity check around that for testing.
> 
> You can move that thing _out_ into a function of its own.

I know, but I'm not going to do that at 11PM. 
 
> Here's a hint: we don't do cut-and-paste programming. And we don't get 
> extra points for bloating a single function with the same unreadable code 
> over and over and over again. 
> 
> How many copies do you want? And here's a hint: the answer is _one_. If 
> you get any other answer, your patch is SHIT.

I didn't know that sending a test patch which is admittetly not pretty
is a capital crime nowadays.

In future I'll restrict myself to look at such stuff only on Monday to
Friday between 9AM and 5PM and send test/RFC patches only when they
got approved by the nonshitapproval committee, which holds a meeting
once a month.

Yours grumpy,

      tglx

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 21:22                                         ` Linus Torvalds
@ 2008-09-06 21:30                                           ` Thomas Gleixner
  0 siblings, 0 replies; 52+ messages in thread
From: Thomas Gleixner @ 2008-09-06 21:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Linus Torvalds wrote:
> On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> > 
> > One gcc does:
> > 
> >     i++;
> >     if (i >= QUICK_PIT_ITERATIONS)
> >        goto out;
> >     expect--;
> > 
> > The other one does:
> > 
> >     i++;
> >     expect--;
> >     if (i >= QUICK_PIT_ITERATIONS)
> >        goto out;
> > 
> > Don't ask me which one is correct. It's just reality :(
> 
> Show me. Because I simply don't believe you.
> 
> The first code is simply _wrong_ - except if "expect" isn't even _used_ 
> afterwards (in which case gcc can optimize away the last unused write).
> 
> And I strongly suspect that that is what you've seen. 
> 
> Because quite frankly, if what you describe is real, then your gcc is 
> incredibly buggy. So buggy that it sounds unlikely to be able to compile 
> the kernel in many other places. This is very simple and very fundamental 
> C, not something subtle or even half-way undefined.

Just checked, yes you are right. I messed that up in my tiredness to see
the obvious bug in Ingo's fixup patch.

/me resorts to bed

    tglx

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 21:26                                           ` Thomas Gleixner
@ 2008-09-06 21:32                                             ` Linus Torvalds
  0 siblings, 0 replies; 52+ messages in thread
From: Linus Torvalds @ 2008-09-06 21:32 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alok Kataria, Alan Cox, LKML, Arjan van de Veen, H. Peter Anvin,
	Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, 6 Sep 2008, Thomas Gleixner wrote:
> 
> I didn't know that sending a test patch which is admittetly not pretty
> is a capital crime nowadays.

The thing is, there's a pattern to this. And it has nothing to do with 
"test patch".

See commit fbb16e243887332dd5754e48ffe5b963378f3cd2, and then see what I 
had to do in ec0c15afb41fd9ad45b53468b60db50170e22346.

That wasn't a test patch, was it?

I don't want to continually see these patches that are simply adding more 
and more crap. I want to not have to clean up the end result afterwards.

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 20:55                                     ` Linus Torvalds
  2008-09-06 21:15                                       ` Thomas Gleixner
@ 2008-09-06 22:40                                       ` Ingo Molnar
  1 sibling, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2008-09-06 22:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, Alok Kataria, Alan Cox, LKML, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra, Dan Hecht, Garrett Smith


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> "expect" is an unsigned char. There are absolutely _zero_ issues with 
> overflow, underflow, random phases of the moon, madness levels or 
> anything else. But yes, it does look like Ingo screwed up when adding 
> that final check, since expect was already decremented at the end of 
> the loop.
> 
> Ingo? Did you actually test it?

hm, yes, that's my brown paper bag fault, sorry.

I did that addition in tip/x86/tsc and posted it to you and i did test 
it immediately - and i noticed that i never saw the fast-calibration 
message i expected to see. I even pasted the boot log over irc yesterday 
and i still have it:

 *> [  0.000] TSC: PIT calibration deviates from PMTIMER: 738839 846296.
 *> [  0.000] TSC: Using PIT calibration value
 *> [  0.000] Detected 738.839 MHz processor.
 *> does not seem to trigger anywhere

i wanted to debug that problem straight after i worked down my 800+ 
mails post-vacation mbox :-/ Which state i reached about 2 hours ago so 
i'm now free - the fix is below.

i _think_ that the quality of calibration should now be pretty OK with 
latest -git. The clever fast calibration stuff could be .28 material. 

And/or we could change the 5x 50msec calibration to 3x 30msec right now, 
the precision is still plenty and the 90 msec is then replaced with your 
fast-calibrate method anyway on proper boxes. Hm?

	Ingo

------------------->
>From 5df45515512436a808d3476a90e83f2efb022422 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Sep 2008 23:55:40 +0200
Subject: [PATCH] x86, tsc calibration: fix

my brown paperbag day ...

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6dab90f..4847a92 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -319,7 +319,7 @@ static unsigned long quick_pit_calibrate(void)
 		/*
 		 * Make sure we can rely on the second TSC timestamp:
 		 */
-		if (!pit_expect_msb(--expect))
+		if (!pit_expect_msb(expect))
 			goto failed;
 
 		/*

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [RFC patch 0/4] TSC calibration improvements
  2008-09-06 21:10                                       ` Linus Torvalds
@ 2008-09-07  6:01                                         ` Willy Tarreau
  0 siblings, 0 replies; 52+ messages in thread
From: Willy Tarreau @ 2008-09-07  6:01 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, Alok Kataria, Alan Cox, LKML, Arjan van de Veen,
	H. Peter Anvin, Peter Zijlstra, Dan Hecht, Garrett Smith

On Sat, Sep 06, 2008 at 02:10:32PM -0700, Linus Torvalds wrote:

> The fact is, the code that Ingo added was totally bogus. The real bug was 
> that he did a totally bogus "--expect" in the argument to that last call. 

BTW, I hate to see state-changing instructions inside an if condition.
I've been bitten several times while debugging. You try to temporarily
comment out the if statement for a test and you end up with different
code. Same for printf. Examples of dangerous usages :

      i = 0;
      for (x = 0; x < 100; x++) {
         update_var(&i);
         if (debug && i--)
             printf("Hey I'm here\n");
      }
      return i;

You can bet that the if will go away before production. Variant with
similar effects :

      i = 0;
      for (x = 0; x < 100; x++) {
         update_var(&i);
         printf("Hey I'm here : %d\n", --i);
      }
      return i;

Since it costs nothing (except one tab and one LF) to put the instruction
out of the condition, I prefer to see them extracted :

      i = 0;
      for (x = 0; x < 100; x++) {
         update_var(&i);
         i--;
         if (debug)
             printf("Hey I'm here\n");
      }
      return i;
> 

Willy


^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2008-09-07  6:02 UTC | newest]

Thread overview: 52+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-04 15:18 [RFC patch 0/4] TSC calibration improvements Thomas Gleixner
2008-09-04 15:18 ` [RFC patch 1/4] x86: TSC: define the PIT latch value separate Thomas Gleixner
2008-09-04 15:18 ` [RFC patch 2/4] x86: TSC: separate hpet/pmtimer calculation out Thomas Gleixner
2008-09-04 15:18 ` [RFC patch 3/4] x86: TSC: use one set of reference variables Thomas Gleixner
2008-09-04 15:18 ` [RFC patch 4/4] x86: TSC make the calibration loop smarter Thomas Gleixner
2008-09-04 15:36 ` [RFC patch 0/4] TSC calibration improvements Ingo Molnar
2008-09-04 15:45   ` Linus Torvalds
2008-09-04 16:00     ` Ingo Molnar
2008-09-04 16:21       ` Linus Torvalds
2008-09-04 16:36         ` Ingo Molnar
2008-09-04 17:41         ` Linus Torvalds
2008-09-04 18:07           ` Alan Cox
2008-09-04 18:26             ` Linus Torvalds
2008-09-04 18:30               ` H. Peter Anvin
2008-09-04 20:09               ` Linus Torvalds
2008-09-04 20:43                 ` Ingo Molnar
2008-09-04 20:52                   ` Ingo Molnar
2008-09-04 21:09                     ` Linus Torvalds
2008-09-04 21:21                       ` Ingo Molnar
2008-09-04 21:30                         ` Linus Torvalds
2008-09-04 21:34                           ` Linus Torvalds
2008-09-04 21:39                             ` Ingo Molnar
2008-09-04 21:33                       ` Ingo Molnar
2008-09-05 22:18                         ` Alok Kataria
2008-09-05 22:34                           ` Linus Torvalds
2008-09-06 20:03                             ` Thomas Gleixner
2008-09-06 20:29                               ` Linus Torvalds
2008-09-06 20:37                                 ` Thomas Gleixner
2008-09-06 20:50                                   ` Linus Torvalds
2008-09-06 20:55                                     ` Linus Torvalds
2008-09-06 21:15                                       ` Thomas Gleixner
2008-09-06 21:22                                         ` Linus Torvalds
2008-09-06 21:30                                           ` Thomas Gleixner
2008-09-06 22:40                                       ` Ingo Molnar
2008-09-06 20:58                                     ` Thomas Gleixner
2008-09-06 21:10                                       ` Linus Torvalds
2008-09-07  6:01                                         ` Willy Tarreau
2008-09-06 20:52                                   ` Thomas Gleixner
2008-09-06 20:59                                     ` Linus Torvalds
2008-09-06 21:07                                       ` Thomas Gleixner
2008-09-06 21:15                                         ` Linus Torvalds
2008-09-06 21:26                                           ` Thomas Gleixner
2008-09-06 21:32                                             ` Linus Torvalds
2008-09-04 20:53                   ` Linus Torvalds
2008-09-04 21:38                 ` Alok Kataria
2008-09-04 21:52                   ` Linus Torvalds
2008-09-04 22:09                     ` Alok Kataria
2008-09-04 17:39     ` Alok Kataria
2008-09-04 17:53       ` Linus Torvalds
2008-09-04 18:31         ` Alok Kataria
2008-09-04 18:34           ` H. Peter Anvin
2008-09-04 21:00   ` Valdis.Kletnieks

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox