public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH v3] Fix: clocksource watchdog marks TSC unstable on guest VM
@ 2015-09-08 14:19 Mathieu Desnoyers
  2015-09-08 15:08 ` Thomas Gleixner
  0 siblings, 1 reply; 5+ messages in thread
From: Mathieu Desnoyers @ 2015-09-08 14:19 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: linux-kernel, Mathieu Desnoyers, Daniel Lezcano

I have been getting those warnings across a range of guest
kernels in my development virtual machines. The host is a
3.13 Ubuntu kernel. The latest guest on which I reproduced
this is a 4.2 kernel (akpm's tree).

[  126.902240] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
[  126.902240] clocksource:                       'hpet' wd_now: f948c267 wd_last: f5edb97c mask: ffffffff
[  126.967563] clocksource:                       'tsc' cs_now: 48f04ee060 cs_last: 48a8c73ed0 mask: ffffffffffffffff

Relevant info from the guest kernel dmesg:

[    2.197070] HPET: 3 timers in total, 0 timers will be used for per-cpu timer
[    2.198021] hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0
[    2.199557] hpet0: 3 comparators, 64-bit 100.000000 MHz counter

[    3.393185] tsc: Refined TSC clocksource calibration: 2400.037 MHz
[    3.396552] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x22985a87e67, max_idle_ns: 440795315743 ns

It appears that the clocksource watchdog does:

        local_irq_disable();
        csnow = cs->read(cs);
        wdnow = watchdog->read(watchdog);
        local_irq_enable();

If the host kernel preempts the guest between reading the
TSC and the watchdog, the resulting delta may go well over
the watchdog threshold, which is currently 62ms.

Correct this issue by reading "wdnow" before and after reading
TSC, and retry if the delta between the two reads exceeds
WATCHDOG_THRESHOLD.

Introduce WATCHDOG_RETRY to bound the number of retry (in the
unlikely event of a bogus clock source for wdnow). If the
number of retry has been reached, disable the watchdog timer.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Daniel Lezcano <daniel.lezcano@linaro.org>
CC: Thomas Gleixner <tglx@linutronix.de>

--

Changes since v1:
- Add missing \n at the end of pr_warn().
Changes since v2:
- Disable watchdog timer if WATCHDOG_RETRY is reached.
---
 kernel/time/clocksource.c | 192 ++++++++++++++++++++++++++++------------------
 1 file changed, 117 insertions(+), 75 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 174c594..a9d4428 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -126,6 +126,12 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating);
  */
 #define WATCHDOG_INTERVAL (HZ >> 1)
 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+/*
+ * When two back-to-back reads of the watchdog clock are too far
+ * apart, limit the number of retry allowed before we disable the
+ * watchdog timer.
+ */
+#define WATCHDOG_RETRY 20
 
 static void clocksource_watchdog_work(struct work_struct *work)
 {
@@ -166,95 +172,131 @@ void clocksource_mark_unstable(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static void clocksource_watchdog(unsigned long data)
+static int check_cs_wd(struct clocksource *cs)
 {
-	struct clocksource *cs;
-	cycle_t csnow, wdnow, cslast, wdlast, delta;
-	int64_t wd_nsec, cs_nsec;
-	int next_cpu, reset_pending;
-
-	spin_lock(&watchdog_lock);
-	if (!watchdog_running)
-		goto out;
-
-	reset_pending = atomic_read(&watchdog_reset_pending);
+	cycle_t csnow, wdnow[2], cslast, wdlast, delta;
+	int64_t wd_nsec[2], cs_nsec;
+	int retry_count = WATCHDOG_RETRY;
+
+restart:
+	/* Clocksource already marked unstable? */
+	if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+		if (finished_booting)
+			schedule_work(&watchdog_work);
+		return 0;
+	}
 
-	list_for_each_entry(cs, &watchdog_list, wd_list) {
+	local_irq_disable();
+	wdnow[0] = watchdog->read(watchdog);
+	csnow = cs->read(cs);
+	wdnow[1] = watchdog->read(watchdog);
+	local_irq_enable();
+
+	/* Clocksource initialized ? */
+	if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
+	    atomic_read(&watchdog_reset_pending)) {
+		cs->flags |= CLOCK_SOURCE_WATCHDOG;
+		cs->wd_last = wdnow[1];
+		cs->cs_last = csnow;
+		return 0;
+	}
 
-		/* Clocksource already marked unstable? */
-		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
-			if (finished_booting)
-				schedule_work(&watchdog_work);
-			continue;
-		}
+	delta = clocksource_delta(wdnow[0], cs->wd_last, watchdog->mask);
+	wd_nsec[0] = clocksource_cyc2ns(delta, watchdog->mult,
+				     watchdog->shift);
+	delta = clocksource_delta(wdnow[1], cs->wd_last, watchdog->mask);
+	wd_nsec[1] = clocksource_cyc2ns(delta, watchdog->mult,
+				     watchdog->shift);
 
-		local_irq_disable();
-		csnow = cs->read(cs);
-		wdnow = watchdog->read(watchdog);
-		local_irq_enable();
-
-		/* Clocksource initialized ? */
-		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
-		    atomic_read(&watchdog_reset_pending)) {
-			cs->flags |= CLOCK_SOURCE_WATCHDOG;
-			cs->wd_last = wdnow;
-			cs->cs_last = csnow;
-			continue;
+	if ((abs(wd_nsec[1] - wd_nsec[0]) > WATCHDOG_THRESHOLD)) {
+		/*
+		 * Got bogus reads from the watchdog clocksource.
+		 * This can be caused by SMI, MCE, NMI, and preemption of guest
+		 * kernel by host.
+		 */
+		pr_warn("Bogus reads from watchdog clocksource: wdnow[0]: %llx wdnow[1]: %llx\n",
+			wdnow[0], wdnow[1]);
+		pr_warn("              delta: %llu nsec\n",
+			(unsigned long long) abs(wd_nsec[1] - wd_nsec[0]));
+		if (retry_count-- > 0) {
+			goto restart;
+		} else {
+			pr_warn("Disabling watchdog due to bogus clocksource\n");
+			return -1;
 		}
+	}
+	delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
+	cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+	wdlast = cs->wd_last; /* save these in case we print them */
+	cslast = cs->cs_last;
+	cs->cs_last = csnow;
+	cs->wd_last = wdnow[1];
+
+	if (atomic_read(&watchdog_reset_pending))
+		return 0;
+
+	/* Check the deviation from the watchdog clocksource. */
+	if ((abs(cs_nsec - wd_nsec[1]) > WATCHDOG_THRESHOLD)) {
+		pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
+			cs->name);
+		pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+			watchdog->name, wdnow[1], wdlast, watchdog->mask);
+		pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+			cs->name, csnow, cslast, cs->mask);
+		__clocksource_unstable(cs);
+		return 0;
+	}
 
-		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
-		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
-					     watchdog->shift);
-
-		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
-		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
-		wdlast = cs->wd_last; /* save these in case we print them */
-		cslast = cs->cs_last;
-		cs->cs_last = csnow;
-		cs->wd_last = wdnow;
+	if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+	    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+	    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+		/* Mark it valid for high-res. */
+		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 
-		if (atomic_read(&watchdog_reset_pending))
-			continue;
+		/*
+		 * clocksource_done_booting() will sort it if
+		 * finished_booting is not set yet.
+		 */
+		if (!finished_booting)
+			return 0;
 
-		/* Check the deviation from the watchdog clocksource. */
-		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
-			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
-				cs->name);
-			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
-				watchdog->name, wdnow, wdlast, watchdog->mask);
-			pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
-				cs->name, csnow, cslast, cs->mask);
-			__clocksource_unstable(cs);
-			continue;
+		/*
+		 * If this is not the current clocksource let
+		 * the watchdog thread reselect it. Due to the
+		 * change to high res this clocksource might
+		 * be preferred now. If it is the current
+		 * clocksource let the tick code know about
+		 * that change.
+		 */
+		if (cs != curr_clocksource) {
+			cs->flags |= CLOCK_SOURCE_RESELECT;
+			schedule_work(&watchdog_work);
+		} else {
+			tick_clock_notify();
 		}
+	}
+	return 0;
+}
 
-		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
-		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
-		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
-			/* Mark it valid for high-res. */
-			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+static void clocksource_watchdog(unsigned long data)
+{
+	struct clocksource *cs;
+	int next_cpu, reset_pending;
 
-			/*
-			 * clocksource_done_booting() will sort it if
-			 * finished_booting is not set yet.
-			 */
-			if (!finished_booting)
-				continue;
+	spin_lock(&watchdog_lock);
+	if (!watchdog_running)
+		goto out;
 
+	reset_pending = atomic_read(&watchdog_reset_pending);
+
+	list_for_each_entry(cs, &watchdog_list, wd_list) {
+		if (check_cs_wd(cs) < 0) {
 			/*
-			 * If this is not the current clocksource let
-			 * the watchdog thread reselect it. Due to the
-			 * change to high res this clocksource might
-			 * be preferred now. If it is the current
-			 * clocksource let the tick code know about
-			 * that change.
+			 * Disable watchdog because its reference
+			 * clocksource is bogus.
 			 */
-			if (cs != curr_clocksource) {
-				cs->flags |= CLOCK_SOURCE_RESELECT;
-				schedule_work(&watchdog_work);
-			} else {
-				tick_clock_notify();
-			}
+			watchdog_running = 0;
+			goto out;
 		}
 	}
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-09-09 15:43 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-09-08 14:19 [RFC PATCH v3] Fix: clocksource watchdog marks TSC unstable on guest VM Mathieu Desnoyers
2015-09-08 15:08 ` Thomas Gleixner
2015-09-09  1:03   ` Shaohua Li
2015-09-09  9:51     ` Thomas Gleixner
2015-09-09 15:43       ` Shaohua Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox