public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] Track hard and soft "short lockups" or "stalls."
@ 2011-07-15 20:11 Alex Neronskiy
  2011-07-18 12:28 ` Don Zickus
  0 siblings, 1 reply; 7+ messages in thread
From: Alex Neronskiy @ 2011-07-15 20:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Don Zickus, Mandeep Singh Baines, Alex Neronskiy

Enable the lockup watchdog to not only respond to lockups beyond a
certain threshold, but also keep track of the worst sub-threshold
lockup (called a "stall") seen so far. The stalls are communicated
to userspace via /proc/sys/kernel/worst_softstall
and worst_hardstall, as well as through the kernel log, where a
stack trace is also included. Hard lockups are checked for more
frequently to collect additional data. Soft stalls are described
by the length of time between successive instances of scheduling
of the watchdog thread, and hard stalls are described by the
number of times a hard lockup check found that no interrupts had
been recorded. Some thresholds on these are set, to attempt to
prevent floods of useless data.

Signed-off-by: Alex Neronskiy <zakmagnus@google.com>
---
The motivation here is to be able to gather more data about just how
"extreme" latency conditions get inside the kernel, without
necessarily triggering a threshold and potentially setting off a
panic.
The way the hard lockup watchdog works has been changed to make it
more compatible with this purpose. Previously it was hard to
collect any data from it other than "lockup" or "not lockup."
Now it has to fail 5 consecutive checks to be considered a lockup,
somewhat similarly to how soft lockups work.

 include/linux/nmi.h |    6 ++-
 kernel/sysctl.c     |   18 +++++++-
 kernel/watchdog.c   |  122 +++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 127 insertions(+), 19 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 2d304ef..57cda21 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -48,9 +48,13 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_enabled;
 extern int watchdog_thresh;
+extern unsigned long worst_softstall;
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+extern unsigned long worst_hardstall;
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
 struct ctl_table;
 extern int proc_dowatchdog(struct ctl_table *, int ,
 			   void __user *, size_t *, loff_t *);
-#endif
+#endif /* CONFIG_LOCKUP_DETECTOR */
 
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f175d98..b52a4d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -767,7 +767,23 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+	{
+		.procname       = "worst_softstall",
+		.data           = &worst_softstall,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0444,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+	{
+		.procname       = "worst_hardstall",
+		.data           = &worst_hardstall,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0444,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_LOCKUP_DETECTOR */
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.procname       = "unknown_nmi_panic",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56a..7d37cc2 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,6 +29,8 @@
 
 int watchdog_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+unsigned long worst_softstall;
+static DEFINE_SPINLOCK(softstall_lock);
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -40,8 +42,29 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
+
+unsigned long worst_hardstall;
+static DEFINE_SPINLOCK(hardstall_lock);
+
+/* The number of consecutive hard check failures before it's a lockup */
+#define hardlockup_thresh (5)
+
+/*
+ * The minimum expected number of missed interrupts in a normal hard
+ * lockup check.
+ */
+#define hardstall_thresh (2)
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+/*
+ * The number of milliseconds by which a stall must be worse than the
+ * current worst in order to be recorded. This prevents bursts of near-
+ * identical stalls.
+ */
+#define softstall_diff_thresh (100)
 
 /* boot commands */
 /*
@@ -98,19 +121,28 @@ __setup("nosoftlockup", nosoftlockup_setup);
  * the thresholds with a factor: we make the soft threshold twice the amount of
  * time the hard threshold is.
  */
-static int get_softlockup_thresh(void)
+static inline int get_softlockup_thresh(void)
 {
 	return watchdog_thresh * 2;
 }
 
 /*
- * Returns seconds, approximately.  We don't need nanosecond
+ * This is just the period of the timer function. A "stall" shorter than
+ * this is the expected behavior and is therefore not noteworthy.
+ */
+static inline int get_softstall_thresh(void)
+{
+	return 1000 / 5 * get_softlockup_thresh();
+}
+
+/*
+ * Returns milliseconds, approximately.  We don't need nanosecond
  * resolution, and we don't need to waste time with a big divide when
- * 2^30ns == 1.074s.
+ * 2^20ns == 1.049ms.
  */
 static unsigned long get_timestamp(int this_cpu)
 {
-	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+	return cpu_clock(this_cpu) >> 20LL;  /* 2^20 ~= 10^6 */
 }
 
 static unsigned long get_sample_period(void)
@@ -176,25 +208,77 @@ void touch_softlockup_watchdog_sync(void)
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /* watchdog detector functions */
-static int is_hardlockup(void)
+static void update_hardstall(unsigned long stall, int this_cpu)
+{
+	int update_stall = 0;
+
+	if (stall > hardstall_thresh && stall > worst_hardstall) {
+		unsigned long flags;
+		spin_lock_irqsave(&hardstall_lock, flags);
+		update_stall = stall > worst_hardstall;
+		if (update_stall)
+			worst_hardstall = stall;
+		spin_unlock_irqrestore(&hardstall_lock, flags);
+	}
+
+	if (update_stall) {
+		printk(KERN_WARNING "new worst hard stall seen on CPU#%d: %lu interrupts missed\n",
+			this_cpu, stall);
+		dump_stack();
+	}
+}
+
+static int is_hardlockup(int this_cpu)
 {
 	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
+	unsigned long hrint_saved = __this_cpu_read(hrtimer_interrupts_saved);
+	unsigned long ints_missed = 0;
+
+	__this_cpu_write(hrtimer_interrupts_saved, hrint);
 
-	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+	if (hrint_saved == hrint)
+		ints_missed = per_cpu(hrtimer_interrupts_missed, this_cpu)++;
+	else
+		__this_cpu_write(hrtimer_interrupts_missed, 0);
+
+	if (ints_missed >= hardlockup_thresh)
 		return 1;
 
-	__this_cpu_write(hrtimer_interrupts_saved, hrint);
+	update_hardstall(ints_missed, this_cpu);
 	return 0;
 }
 #endif
 
-static int is_softlockup(unsigned long touch_ts)
+static void update_softstall(unsigned long stall, int this_cpu)
 {
-	unsigned long now = get_timestamp(smp_processor_id());
+	int update_stall = 0;
+	if (stall > get_softstall_thresh() &&
+			stall > worst_softstall + softstall_diff_thresh) {
+		unsigned long flags;
+		spin_lock_irqsave(&softstall_lock, flags);
+		update_stall = stall > worst_softstall + softstall_diff_thresh;
+		if (update_stall)
+			worst_softstall = stall;
+		spin_unlock_irqrestore(&softstall_lock, flags);
+	}
+
+	if (update_stall) {
+		printk(KERN_WARNING "new worst soft stall seen on CPU#%d: %lums\n",
+				this_cpu, stall);
+		dump_stack();
+	}
+}
+
+static int is_softlockup(unsigned long touch_ts, int this_cpu)
+{
+	unsigned long now = get_timestamp(this_cpu);
+	unsigned long stall = now - touch_ts;
 
 	/* Warn about unreasonable delays: */
-	if (time_after(now, touch_ts + get_softlockup_thresh()))
-		return now - touch_ts;
+	if (time_after(now, touch_ts + 1000 * get_softlockup_thresh()))
+		return stall;
+
+	update_softstall(stall, this_cpu);
 
 	return 0;
 }
@@ -213,6 +297,9 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
 		 struct perf_sample_data *data,
 		 struct pt_regs *regs)
 {
+	int lockup;
+	int this_cpu = smp_processor_id();
+
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
@@ -227,9 +314,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
 	 * fired multiple times before we overflow'd.  If it hasn't
 	 * then this is a good indication the cpu is stuck
 	 */
-	if (is_hardlockup()) {
-		int this_cpu = smp_processor_id();
-
+	lockup = is_hardlockup(this_cpu);
+	if (lockup) {
 		/* only print hardlockups once */
 		if (__this_cpu_read(hard_watchdog_warn) == true)
 			return;
@@ -260,6 +346,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
 	struct pt_regs *regs = get_irq_regs();
 	int duration;
+	int this_cpu = smp_processor_id();
 
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
@@ -289,14 +376,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	 * indicate it is getting cpu time.  If it hasn't then
 	 * this is a good indication some task is hogging the cpu
 	 */
-	duration = is_softlockup(touch_ts);
+	duration = is_softlockup(touch_ts, this_cpu);
 	if (unlikely(duration)) {
 		/* only warn once */
 		if (__this_cpu_read(soft_watchdog_warn) == true)
 			return HRTIMER_RESTART;
 
 		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
-			smp_processor_id(), duration,
+			this_cpu, duration >> 10L,
 			current->comm, task_pid_nr(current));
 		print_modules();
 		print_irqtrace_events(current);
@@ -371,6 +458,7 @@ static int watchdog_nmi_enable(int cpu)
 	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	do_div(wd_attr->sample_period, hardlockup_thresh);
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
-- 
1.7.3.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] Track hard and soft "short lockups" or "stalls."
  2011-07-15 20:11 [PATCH v2] Track hard and soft "short lockups" or "stalls." Alex Neronskiy
@ 2011-07-18 12:28 ` Don Zickus
       [not found]   ` <CAAuSN9106qmYF27oRrfUBtqwOmSQgDJWwv3iz_NmTTuYNEymHA@mail.gmail.com>
  0 siblings, 1 reply; 7+ messages in thread
From: Don Zickus @ 2011-07-18 12:28 UTC (permalink / raw)
  To: Alex Neronskiy; +Cc: linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Fri, Jul 15, 2011 at 01:11:10PM -0700, Alex Neronskiy wrote:
> Enable the lockup watchdog to not only respond to lockups beyond a
> certain threshold, but also keep track of the worst sub-threshold
> lockup (called a "stall") seen so far. The stalls are communicated
> to userspace via /proc/sys/kernel/worst_softstall
> and worst_hardstall, as well as through the kernel log, where a
> stack trace is also included. Hard lockups are checked for more
> frequently to collect additional data. Soft stalls are described
> by the length of time between successive instances of scheduling
> of the watchdog thread, and hard stalls are described by the
> number of times a hard lockup check found that no interrupts had
> been recorded. Some thresholds on these are set, to attempt to
> prevent floods of useless data.

I think this matches what I had in mind.  Though I keep wondering if there
is a simpler way to achieve this because this is only going to print in
the ints_missed = 3 and 4 cases.  But I can't, so I will do some testing
on my end to make sure it doesn't break some of the cases I deal with.

I think it will be fine though.

Thanks!

Cheers,
Don

> 
> Signed-off-by: Alex Neronskiy <zakmagnus@google.com>
> ---
> The motivation here is to be able to gather more data about just how
> "extreme" latency conditions get inside the kernel, without
> necessarily triggering a threshold and potentially setting off a
> panic.
> The way the hard lockup watchdog works has been changed to make it
> more compatible with this purpose. Previously it was hard to
> collect any data from it other than "lockup" or "not lockup."
> Now it has to fail 5 consecutive checks to be considered a lockup,
> somewhat similarly to how soft lockups work.
> 
>  include/linux/nmi.h |    6 ++-
>  kernel/sysctl.c     |   18 +++++++-
>  kernel/watchdog.c   |  122 +++++++++++++++++++++++++++++++++++++++++++-------
>  3 files changed, 127 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 2d304ef..57cda21 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -48,9 +48,13 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *);
>  u64 hw_nmi_get_sample_period(int watchdog_thresh);
>  extern int watchdog_enabled;
>  extern int watchdog_thresh;
> +extern unsigned long worst_softstall;
> +#ifdef CONFIG_HARDLOCKUP_DETECTOR
> +extern unsigned long worst_hardstall;
> +#endif /* CONFIG_HARDLOCKUP_DETECTOR */
>  struct ctl_table;
>  extern int proc_dowatchdog(struct ctl_table *, int ,
>  			   void __user *, size_t *, loff_t *);
> -#endif
> +#endif /* CONFIG_LOCKUP_DETECTOR */
>  
>  #endif
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index f175d98..b52a4d5 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -767,7 +767,23 @@ static struct ctl_table kern_table[] = {
>  		.extra1		= &zero,
>  		.extra2		= &one,
>  	},
> -#endif
> +	{
> +		.procname       = "worst_softstall",
> +		.data           = &worst_softstall,
> +		.maxlen         = sizeof(unsigned long),
> +		.mode           = 0444,
> +		.proc_handler   = proc_doulongvec_minmax,
> +	},
> +#ifdef CONFIG_HARDLOCKUP_DETECTOR
> +	{
> +		.procname       = "worst_hardstall",
> +		.data           = &worst_hardstall,
> +		.maxlen         = sizeof(unsigned long),
> +		.mode           = 0444,
> +		.proc_handler   = proc_doulongvec_minmax,
> +	},
> +#endif /* CONFIG_HARDLOCKUP_DETECTOR */
> +#endif /* CONFIG_LOCKUP_DETECTOR */
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
>  	{
>  		.procname       = "unknown_nmi_panic",
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 3d0c56a..7d37cc2 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -29,6 +29,8 @@
>  
>  int watchdog_enabled = 1;
>  int __read_mostly watchdog_thresh = 10;
> +unsigned long worst_softstall;
> +static DEFINE_SPINLOCK(softstall_lock);
>  
>  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
>  static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
> @@ -40,8 +42,29 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
>  static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
>  static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
>  static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
> +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
>  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
> -#endif
> +
> +unsigned long worst_hardstall;
> +static DEFINE_SPINLOCK(hardstall_lock);
> +
> +/* The number of consecutive hard check failures before it's a lockup */
> +#define hardlockup_thresh (5)
> +
> +/*
> + * The minimum expected number of missed interrupts in a normal hard
> + * lockup check.
> + */
> +#define hardstall_thresh (2)
> +
> +#endif /* CONFIG_HARDLOCKUP_DETECTOR */
> +
> +/*
> + * The number of milliseconds by which a stall must be worse than the
> + * current worst in order to be recorded. This prevents bursts of near-
> + * identical stalls.
> + */
> +#define softstall_diff_thresh (100)
>  
>  /* boot commands */
>  /*
> @@ -98,19 +121,28 @@ __setup("nosoftlockup", nosoftlockup_setup);
>   * the thresholds with a factor: we make the soft threshold twice the amount of
>   * time the hard threshold is.
>   */
> -static int get_softlockup_thresh(void)
> +static inline int get_softlockup_thresh(void)
>  {
>  	return watchdog_thresh * 2;
>  }
>  
>  /*
> - * Returns seconds, approximately.  We don't need nanosecond
> + * This is just the period of the timer function. A "stall" shorter than
> + * this is the expected behavior and is therefore not noteworthy.
> + */
> +static inline int get_softstall_thresh(void)
> +{
> +	return 1000 / 5 * get_softlockup_thresh();
> +}
> +
> +/*
> + * Returns milliseconds, approximately.  We don't need nanosecond
>   * resolution, and we don't need to waste time with a big divide when
> - * 2^30ns == 1.074s.
> + * 2^20ns == 1.049ms.
>   */
>  static unsigned long get_timestamp(int this_cpu)
>  {
> -	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
> +	return cpu_clock(this_cpu) >> 20LL;  /* 2^20 ~= 10^6 */
>  }
>  
>  static unsigned long get_sample_period(void)
> @@ -176,25 +208,77 @@ void touch_softlockup_watchdog_sync(void)
>  
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  /* watchdog detector functions */
> -static int is_hardlockup(void)
> +static void update_hardstall(unsigned long stall, int this_cpu)
> +{
> +	int update_stall = 0;
> +
> +	if (stall > hardstall_thresh && stall > worst_hardstall) {
> +		unsigned long flags;
> +		spin_lock_irqsave(&hardstall_lock, flags);
> +		update_stall = stall > worst_hardstall;
> +		if (update_stall)
> +			worst_hardstall = stall;
> +		spin_unlock_irqrestore(&hardstall_lock, flags);
> +	}
> +
> +	if (update_stall) {
> +		printk(KERN_WARNING "new worst hard stall seen on CPU#%d: %lu interrupts missed\n",
> +			this_cpu, stall);
> +		dump_stack();
> +	}
> +}
> +
> +static int is_hardlockup(int this_cpu)
>  {
>  	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
> +	unsigned long hrint_saved = __this_cpu_read(hrtimer_interrupts_saved);
> +	unsigned long ints_missed = 0;
> +
> +	__this_cpu_write(hrtimer_interrupts_saved, hrint);
>  
> -	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
> +	if (hrint_saved == hrint)
> +		ints_missed = per_cpu(hrtimer_interrupts_missed, this_cpu)++;
> +	else
> +		__this_cpu_write(hrtimer_interrupts_missed, 0);
> +
> +	if (ints_missed >= hardlockup_thresh)
>  		return 1;
>  
> -	__this_cpu_write(hrtimer_interrupts_saved, hrint);
> +	update_hardstall(ints_missed, this_cpu);
>  	return 0;
>  }
>  #endif
>  
> -static int is_softlockup(unsigned long touch_ts)
> +static void update_softstall(unsigned long stall, int this_cpu)
>  {
> -	unsigned long now = get_timestamp(smp_processor_id());
> +	int update_stall = 0;
> +	if (stall > get_softstall_thresh() &&
> +			stall > worst_softstall + softstall_diff_thresh) {
> +		unsigned long flags;
> +		spin_lock_irqsave(&softstall_lock, flags);
> +		update_stall = stall > worst_softstall + softstall_diff_thresh;
> +		if (update_stall)
> +			worst_softstall = stall;
> +		spin_unlock_irqrestore(&softstall_lock, flags);
> +	}
> +
> +	if (update_stall) {
> +		printk(KERN_WARNING "new worst soft stall seen on CPU#%d: %lums\n",
> +				this_cpu, stall);
> +		dump_stack();
> +	}
> +}
> +
> +static int is_softlockup(unsigned long touch_ts, int this_cpu)
> +{
> +	unsigned long now = get_timestamp(this_cpu);
> +	unsigned long stall = now - touch_ts;
>  
>  	/* Warn about unreasonable delays: */
> -	if (time_after(now, touch_ts + get_softlockup_thresh()))
> -		return now - touch_ts;
> +	if (time_after(now, touch_ts + 1000 * get_softlockup_thresh()))
> +		return stall;
> +
> +	update_softstall(stall, this_cpu);
>  
>  	return 0;
>  }
> @@ -213,6 +297,9 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
>  		 struct perf_sample_data *data,
>  		 struct pt_regs *regs)
>  {
> +	int lockup;
> +	int this_cpu = smp_processor_id();
> +
>  	/* Ensure the watchdog never gets throttled */
>  	event->hw.interrupts = 0;
>  
> @@ -227,9 +314,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
>  	 * fired multiple times before we overflow'd.  If it hasn't
>  	 * then this is a good indication the cpu is stuck
>  	 */
> -	if (is_hardlockup()) {
> -		int this_cpu = smp_processor_id();
> -
> +	lockup = is_hardlockup(this_cpu);
> +	if (lockup) {
>  		/* only print hardlockups once */
>  		if (__this_cpu_read(hard_watchdog_warn) == true)
>  			return;
> @@ -260,6 +346,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
>  	struct pt_regs *regs = get_irq_regs();
>  	int duration;
> +	int this_cpu = smp_processor_id();
>  
>  	/* kick the hardlockup detector */
>  	watchdog_interrupt_count();
> @@ -289,14 +376,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  	 * indicate it is getting cpu time.  If it hasn't then
>  	 * this is a good indication some task is hogging the cpu
>  	 */
> -	duration = is_softlockup(touch_ts);
> +	duration = is_softlockup(touch_ts, this_cpu);
>  	if (unlikely(duration)) {
>  		/* only warn once */
>  		if (__this_cpu_read(soft_watchdog_warn) == true)
>  			return HRTIMER_RESTART;
>  
>  		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
> -			smp_processor_id(), duration,
> +			this_cpu, duration >> 10L,
>  			current->comm, task_pid_nr(current));
>  		print_modules();
>  		print_irqtrace_events(current);
> @@ -371,6 +458,7 @@ static int watchdog_nmi_enable(int cpu)
>  	/* Try to register using hardware perf events */
>  	wd_attr = &wd_hw_attr;
>  	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
> +	do_div(wd_attr->sample_period, hardlockup_thresh);
>  	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
>  	if (!IS_ERR(event)) {
>  		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
> -- 
> 1.7.3.1
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] Track hard and soft "short lockups" or "stalls."
       [not found]   ` <CAAuSN9106qmYF27oRrfUBtqwOmSQgDJWwv3iz_NmTTuYNEymHA@mail.gmail.com>
@ 2011-07-20 15:41     ` Don Zickus
  2011-07-20 19:41       ` ZAK Magnus
  0 siblings, 1 reply; 7+ messages in thread
From: Don Zickus @ 2011-07-20 15:41 UTC (permalink / raw)
  To: ZAK Magnus; +Cc: linux-kernel

On Mon, Jul 18, 2011 at 02:45:55PM -0700, ZAK Magnus wrote:
> Okay, great. I'm eager to hear anything you may discover, good or bad. By
> the way, would you mind sharing a bit about how you do your testing for
> this?

Sorry for getting back to you late, busy week.

Most of the testing I do is from the lkdtm module

modprobe lkdtm
mount -t debugfs none /sys/kernel/debug
cd /sys/kernel/debug/provoke-crashing/
service cpuspeed stop
echo HARDLOCKUP > DIRECT #or SOFTLOCKUP or HUNG_TASK

I then count to 10 seconds to make sure the timer is within reason.

So I did the above test and noticed the panic looked funny because it spit
out the 

new worst hard stall seen on CPU#0: 3 interrupts missed

and then 

new worst hard stall seen on CPU#0: 4 interrupts missed

and then finally the HARDLOCKUP message

I am not sure that is what we want as it confuses people as to where the
panic really is.

What if you moved the 'update_hardstall()' to just underneath the zero'ing
out of the hrtimer_interrupts_missed?  This only then prints out the
interrupts missed line when you know the end point.  And avoids printing
it all together in the case of a true HARDLOCKUP.  Like the patch below

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7d37cc2..ba41a74 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -238,13 +238,14 @@ static int is_hardlockup(int this_cpu)
 
 	if (hrint_saved == hrint)
 		ints_missed = per_cpu(hrtimer_interrupts_missed, this_cpu)++;
-	else
+	else {
 		__this_cpu_write(hrtimer_interrupts_missed, 0);
+		update_hardstall(ints_missed, this_cpu);
+	}
 
 	if (ints_missed >= hardlockup_thresh)
 		return 1;
 
-	update_hardstall(ints_missed, this_cpu);
 	return 0;
 }
 #endif

The softlockup case probably needs the same.

Thoughts?

Cheers,
Don

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] Track hard and soft "short lockups" or "stalls."
  2011-07-20 15:41     ` Don Zickus
@ 2011-07-20 19:41       ` ZAK Magnus
  2011-07-20 21:07         ` Don Zickus
  0 siblings, 1 reply; 7+ messages in thread
From: ZAK Magnus @ 2011-07-20 19:41 UTC (permalink / raw)
  To: Don Zickus; +Cc: linux-kernel

On Wed, Jul 20, 2011 at 8:41 AM, Don Zickus <dzickus@redhat.com> wrote:
> On Mon, Jul 18, 2011 at 02:45:55PM -0700, ZAK Magnus wrote:
>> Okay, great. I'm eager to hear anything you may discover, good or bad. By
>> the way, would you mind sharing a bit about how you do your testing for
>> this?
>
> Sorry for getting back to you late, busy week.
>
> Most of the testing I do is from the lkdtm module
>
> modprobe lkdtm
> mount -t debugfs none /sys/kernel/debug
> cd /sys/kernel/debug/provoke-crashing/
> service cpuspeed stop
> echo HARDLOCKUP > DIRECT #or SOFTLOCKUP or HUNG_TASK
>
> I then count to 10 seconds to make sure the timer is within reason.
>
> So I did the above test and noticed the panic looked funny because it spit
> out the
>
> new worst hard stall seen on CPU#0: 3 interrupts missed
>
> and then
>
> new worst hard stall seen on CPU#0: 4 interrupts missed
>
> and then finally the HARDLOCKUP message
>
> I am not sure that is what we want as it confuses people as to where the
> panic really is.
Are the stack traces very different? I don't understand in what sense
it's confusing.

> What if you moved the 'update_hardstall()' to just underneath the zero'ing
> out of the hrtimer_interrupts_missed?  This only then prints out the
> interrupts missed line when you know the end point.  And avoids printing
> it all together in the case of a true HARDLOCKUP.  Like the patch below
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 7d37cc2..ba41a74 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -238,13 +238,14 @@ static int is_hardlockup(int this_cpu)
>
>        if (hrint_saved == hrint)
>                ints_missed = per_cpu(hrtimer_interrupts_missed, this_cpu)++;
> -       else
> +       else {
>                __this_cpu_write(hrtimer_interrupts_missed, 0);
> +               update_hardstall(ints_missed, this_cpu);
> +       }
>
>        if (ints_missed >= hardlockup_thresh)
>                return 1;
>
> -       update_hardstall(ints_missed, this_cpu);
>        return 0;
>  }
>  #endif
>
> The softlockup case probably needs the same.
>
> Thoughts?
I don't think that exact patch would work (wouldn't it cause
update_hardstall to only ever be called with 0 as its first argument?)
but I hope I still understand what you're saying. You're saying stalls
should only be recorded once they're finished, right? I don't know if
this is the best approach. If we wait until interrupts stop being
missed, it means the code could have exited whatever section caused
the stall to begin with. Maybe your data indicates otherwise, but I
would think this means the stack trace would not really be
informative. It's one thing to know a stall occurs, but its occurrence
is generally reflective of a bug or a suboptimal section, so it would
be good to know where that is in order to try and fix it.

For soft stalls, I think the same is true. Also, since the soft lockup
system just relies on checking a timestamp compared to now, it can't
know how long a stall was after it has already finished. The hard
system only knows because it keeps a running count of the number of
failed checks. An additional timestamp could be introduced and the
difference between the two retroactively checked in order to reproduce
this, but the stack trace issue would still apply. Also, while not
hugely complex, the change would be more significant than the sort
your patch presents.

The bottom line is that I think catching a stall in progress is the
most informative thing to do, and I don't understand the downsides of
doing so. Could you please explain them?

On another note, I'm working on a patch on top of this one which would
change the hard lockup system to be more like the soft lockup system.
It would use a timestamp as well, so it can have a more exact read on
how long the timer has been delayed. This adds resolution and gets rid
of that problem where it can only report missed = 3 or 4. Any
preliminary comments? Or should I just put the patch up before
discussing it?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] Track hard and soft "short lockups" or "stalls."
  2011-07-20 19:41       ` ZAK Magnus
@ 2011-07-20 21:07         ` Don Zickus
  2011-07-20 21:15           ` ZAK Magnus
  0 siblings, 1 reply; 7+ messages in thread
From: Don Zickus @ 2011-07-20 21:07 UTC (permalink / raw)
  To: ZAK Magnus; +Cc: linux-kernel

On Wed, Jul 20, 2011 at 12:41:39PM -0700, ZAK Magnus wrote:
> > I then count to 10 seconds to make sure the timer is within reason.
> >
> > So I did the above test and noticed the panic looked funny because it spit
> > out the
> >
> > new worst hard stall seen on CPU#0: 3 interrupts missed
> >
> > and then
> >
> > new worst hard stall seen on CPU#0: 4 interrupts missed
> >
> > and then finally the HARDLOCKUP message
> >
> > I am not sure that is what we want as it confuses people as to where the
> > panic really is.
> Are the stack traces very different? I don't understand in what sense
> it's confusing.

The fact that there are 3 of them telling me the same thing.  Most people
look at the first stack trace to figure out what is going on and will just
notice the warning.  They might completely miss the HARDLOCKUP message on
the third stack trace down.

It just looked odd when I ran it the first time.  I feel like I would
constantly be trying to educate people on why we do it like that.

> 
> > What if you moved the 'update_hardstall()' to just underneath the zero'ing
> > out of the hrtimer_interrupts_missed?  This only then prints out the
> > interrupts missed line when you know the end point.  And avoids printing
> > it all together in the case of a true HARDLOCKUP.  Like the patch below
> >
> > diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> > index 7d37cc2..ba41a74 100644
> > --- a/kernel/watchdog.c
> > +++ b/kernel/watchdog.c
> > @@ -238,13 +238,14 @@ static int is_hardlockup(int this_cpu)
> >
> >        if (hrint_saved == hrint)
> >                ints_missed = per_cpu(hrtimer_interrupts_missed, this_cpu)++;
> > -       else
> > +       else {
> >                __this_cpu_write(hrtimer_interrupts_missed, 0);
> > +               update_hardstall(ints_missed, this_cpu);
> > +       }
> >
> >        if (ints_missed >= hardlockup_thresh)
> >                return 1;
> >
> > -       update_hardstall(ints_missed, this_cpu);
> >        return 0;
> >  }
> >  #endif
> >
> > The softlockup case probably needs the same.
> >
> > Thoughts?
> I don't think that exact patch would work (wouldn't it cause
> update_hardstall to only ever be called with 0 as its first argument?)
> but I hope I still understand what you're saying. You're saying stalls
> should only be recorded once they're finished, right? I don't know if
> this is the best approach. If we wait until interrupts stop being
> missed, it means the code could have exited whatever section caused
> the stall to begin with. Maybe your data indicates otherwise, but I
> would think this means the stack trace would not really be

Crap.  good point.

> informative. It's one thing to know a stall occurs, but its occurrence
> is generally reflective of a bug or a suboptimal section, so it would
> be good to know where that is in order to try and fix it.
> 
> For soft stalls, I think the same is true. Also, since the soft lockup
> system just relies on checking a timestamp compared to now, it can't
> know how long a stall was after it has already finished. The hard
> system only knows because it keeps a running count of the number of
> failed checks. An additional timestamp could be introduced and the
> difference between the two retroactively checked in order to reproduce
> this, but the stack trace issue would still apply. Also, while not
> hugely complex, the change would be more significant than the sort
> your patch presents.
> 
> The bottom line is that I think catching a stall in progress is the
> most informative thing to do, and I don't understand the downsides of
> doing so. Could you please explain them?
> 
> On another note, I'm working on a patch on top of this one which would
> change the hard lockup system to be more like the soft lockup system.
> It would use a timestamp as well, so it can have a more exact read on
> how long the timer has been delayed. This adds resolution and gets rid
> of that problem where it can only report missed = 3 or 4. Any
> preliminary comments? Or should I just put the patch up before
> discussing it?

That might work.  I would have to see the patch.  What clock would you use
to read the time?  I don't think you can use 'now' if interrupts are
disabled.

Cheers,
Don


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] Track hard and soft "short lockups" or "stalls."
  2011-07-20 21:07         ` Don Zickus
@ 2011-07-20 21:15           ` ZAK Magnus
  2011-07-21 14:51             ` Don Zickus
  0 siblings, 1 reply; 7+ messages in thread
From: ZAK Magnus @ 2011-07-20 21:15 UTC (permalink / raw)
  To: Don Zickus; +Cc: linux-kernel

On Wed, Jul 20, 2011 at 2:07 PM, Don Zickus <dzickus@redhat.com> wrote:
> On Wed, Jul 20, 2011 at 12:41:39PM -0700, ZAK Magnus wrote:
>> Are the stack traces very different? I don't understand in what sense
>> it's confusing.
>
> The fact that there are 3 of them telling me the same thing.  Most people
> look at the first stack trace to figure out what is going on and will just
> notice the warning.  They might completely miss the HARDLOCKUP message on
> the third stack trace down.
>
> It just looked odd when I ran it the first time.  I feel like I would
> constantly be trying to educate people on why we do it like that.
Oh, okay. So, maybe the stall warnings should say something like, "a
lockup might happen soon"? Would that help? I don't know.

>> I don't think that exact patch would work (wouldn't it cause
>> update_hardstall to only ever be called with 0 as its first argument?)
>> but I hope I still understand what you're saying. You're saying stalls
>> should only be recorded once they're finished, right? I don't know if
>> this is the best approach. If we wait until interrupts stop being
>> missed, it means the code could have exited whatever section caused
>> the stall to begin with. Maybe your data indicates otherwise, but I
>> would think this means the stack trace would not really be
>
> Crap. Good point.
Which part, exactly?

>> informative. It's one thing to know a stall occurs, but its occurrence
>> is generally reflective of a bug or a suboptimal section, so it would
>> be good to know where that is in order to try and fix it.
>>
>> For soft stalls, I think the same is true. Also, since the soft lockup
>> system just relies on checking a timestamp compared to now, it can't
>> know how long a stall was after it has already finished. The hard
>> system only knows because it keeps a running count of the number of
>> failed checks. An additional timestamp could be introduced and the
>> difference between the two retroactively checked in order to reproduce
>> this, but the stack trace issue would still apply. Also, while not
>> hugely complex, the change would be more significant than the sort
>> your patch presents.
>>
>> The bottom line is that I think catching a stall in progress is the
>> most informative thing to do, and I don't understand the downsides of
>> doing so. Could you please explain them?
>>
>> On another note, I'm working on a patch on top of this one which would
>> change the hard lockup system to be more like the soft lockup system.
>> It would use a timestamp as well, so it can have a more exact read on
>> how long the timer has been delayed. This adds resolution and gets rid
>> of that problem where it can only report missed = 3 or 4. Any
>> preliminary comments? Or should I just put the patch up before
>> discussing it?
>
> That might work.  I would have to see the patch.  What clock would you use
> to read the time?  I don't think you can use 'now' if interrupts are
> disabled.
Okay, I will send it when it seems ready. For the timestamp, I was
just using the get_timestamp function that's defined in the file,
which calls cpu_clock(). Is there a better way?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] Track hard and soft "short lockups" or "stalls."
  2011-07-20 21:15           ` ZAK Magnus
@ 2011-07-21 14:51             ` Don Zickus
  0 siblings, 0 replies; 7+ messages in thread
From: Don Zickus @ 2011-07-21 14:51 UTC (permalink / raw)
  To: ZAK Magnus; +Cc: linux-kernel

On Wed, Jul 20, 2011 at 02:15:02PM -0700, ZAK Magnus wrote:
> On Wed, Jul 20, 2011 at 2:07 PM, Don Zickus <dzickus@redhat.com> wrote:
> > On Wed, Jul 20, 2011 at 12:41:39PM -0700, ZAK Magnus wrote:
> >> Are the stack traces very different? I don't understand in what sense
> >> it's confusing.
> >
> > The fact that there are 3 of them telling me the same thing.  Most people
> > look at the first stack trace to figure out what is going on and will just
> > notice the warning.  They might completely miss the HARDLOCKUP message on
> > the third stack trace down.
> >
> > It just looked odd when I ran it the first time.  I feel like I would
> > constantly be trying to educate people on why we do it like that.
> Oh, okay. So, maybe the stall warnings should say something like, "a
> lockup might happen soon"? Would that help? I don't know.

Perhaps just changing the wording might help with the confusion.  After
reading below it might be hard to restrict the dump_stack to only the
highest stall (as opposed to printing each new worst hardstall as it
increments to a hardlockup).

> 
> >> I don't think that exact patch would work (wouldn't it cause
> >> update_hardstall to only ever be called with 0 as its first argument?)
> >> but I hope I still understand what you're saying. You're saying stalls
> >> should only be recorded once they're finished, right? I don't know if
> >> this is the best approach. If we wait until interrupts stop being
> >> missed, it means the code could have exited whatever section caused
> >> the stall to begin with. Maybe your data indicates otherwise, but I
> >> would think this means the stack trace would not really be
> >
> > Crap. Good point.
> Which part, exactly?

The part about losing the stalled section when the code resets the missed
interrupts back to 0.

> 
> >> informative. It's one thing to know a stall occurs, but its occurrence
> >> is generally reflective of a bug or a suboptimal section, so it would
> >> be good to know where that is in order to try and fix it.
> >>
> >> For soft stalls, I think the same is true. Also, since the soft lockup
> >> system just relies on checking a timestamp compared to now, it can't
> >> know how long a stall was after it has already finished. The hard
> >> system only knows because it keeps a running count of the number of
> >> failed checks. An additional timestamp could be introduced and the
> >> difference between the two retroactively checked in order to reproduce
> >> this, but the stack trace issue would still apply. Also, while not
> >> hugely complex, the change would be more significant than the sort
> >> your patch presents.
> >>
> >> The bottom line is that I think catching a stall in progress is the
> >> most informative thing to do, and I don't understand the downsides of
> >> doing so. Could you please explain them?
> >>
> >> On another note, I'm working on a patch on top of this one which would
> >> change the hard lockup system to be more like the soft lockup system.
> >> It would use a timestamp as well, so it can have a more exact read on
> >> how long the timer has been delayed. This adds resolution and gets rid
> >> of that problem where it can only report missed = 3 or 4. Any
> >> preliminary comments? Or should I just put the patch up before
> >> discussing it?
> >
> > That might work.  I would have to see the patch.  What clock would you use
> > to read the time?  I don't think you can use 'now' if interrupts are
> > disabled.
> Okay, I will send it when it seems ready. For the timestamp, I was
> just using the get_timestamp function that's defined in the file,
> which calls cpu_clock(). Is there a better way?

Oh right.  I forgot Peter created some abstraction called cpu_clock().
That should work I suppose, it's NMI safe according to various code
comments.  I just wonder if the clock gets updated if interrupts are
accidentally disabled for a long time.  Though TSC doesn't care on x86
about interrupts.

Cheers,
Don

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2011-07-21 14:51 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-15 20:11 [PATCH v2] Track hard and soft "short lockups" or "stalls." Alex Neronskiy
2011-07-18 12:28 ` Don Zickus
     [not found]   ` <CAAuSN9106qmYF27oRrfUBtqwOmSQgDJWwv3iz_NmTTuYNEymHA@mail.gmail.com>
2011-07-20 15:41     ` Don Zickus
2011-07-20 19:41       ` ZAK Magnus
2011-07-20 21:07         ` Don Zickus
2011-07-20 21:15           ` ZAK Magnus
2011-07-21 14:51             ` Don Zickus

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox