public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Bitao Hu <yaoma@linux.alibaba.com>
To: dianders@chromium.org, akpm@linux-foundation.org,
	pmladek@suse.com, tglx@linutronix.de, maz@kernel.org,
	liusong@linux.alibaba.com
Cc: linux-kernel@vger.kernel.org, Bitao Hu <yaoma@linux.alibaba.com>
Subject: [PATCH 2/3] watchdog/softlockup: report the most time-consuming hardirq
Date: Tue, 23 Jan 2024 20:12:22 +0800	[thread overview]
Message-ID: <20240123121223.22318-3-yaoma@linux.alibaba.com> (raw)
In-Reply-To: <20240123121223.22318-1-yaoma@linux.alibaba.com>

When the watchdog determines that the current soft lockup is due
to an interrupt storm based on CPU utilization, reporting the
top three most time-consuming hardirqs can be quite useful for
further troubleshooting.
Below is an example of an interrupt storm. The call tree does not
provide useful information, but we can analyze which interrupt
caused the soft lockup by using the per-hardirq time statistics.

[   67.714044] watchdog: BUG: soft lockup - CPU#9 stuck for 28s! [swapper/9:0]
[   67.714548] CPU#9 Utilization every 4s during lockup:
[   67.714549]  #1: 0.0% system,        0.0% softirq,   95.0% hardirq,  0.0% idle
[   67.714551]  #2: 0.0% system,        0.0% softirq,   90.0% hardirq,  0.0% idle
[   67.714553]  #3: 0.0% system,        0.0% softirq,   90.0% hardirq,  0.0% idle
[   67.714555]  #4: 0.0% system,        0.0% softirq,   95.0% hardirq,  0.0% idle
[   67.714556]  #5: 0.0% system,        0.0% softirq,   90.0% hardirq,  0.0% idle
[   67.714570] CPU#9 Detect HardIRQ Time exceeds 50% since 45s. Most time-consuming HardIRQs:
[   67.714571]  #1: 99.9% irq#7(IPI)
[   67.714575]  #2: 0.0% irq#10(arch_timer)
[   67.714578]  #3: 0.0% irq#2(IPI)
...
[   67.714654] Call trace:
[   67.714656]  __do_softirq+0xa8/0x364

Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
---
 include/linux/irq.h     |   9 ++
 include/linux/irqdesc.h |   2 +
 kernel/irq/irqdesc.c    |   9 +-
 kernel/watchdog.c       | 186 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 205 insertions(+), 1 deletion(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 90081afa10ce..3a3e8b1e227e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -251,6 +251,15 @@ enum {
 	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(30),
 };
 
+
+/*
+ * struct per_irqtime - per irq stats for interrupt storm detection
+ * @start_time:	local_clock() timestamp (ns) taken at hardirq handler entry
+ * @delta:	accumulated hardirq handler time (ns) on this CPU while the
+ *		softlockup watchdog has flagged a potential interrupt storm
+ */
+struct per_irqtime {
+	u64	start_time;
+	u64	delta;
+};
+
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index d9451d456a73..c5407cdcf2d6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -29,6 +29,7 @@ struct pt_regs;
  * @wake_depth:		enable depth, for multiple irq_set_irq_wake() callers
  * @tot_count:		stats field for non-percpu irqs
  * @irq_count:		stats field to detect stalled irqs
+ * @irq_time:		stats field to detect interrupt storm
  * @last_unhandled:	aging timer for unhandled count
  * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @threads_handled:	stats field for deferred spurious detection of threaded handlers
@@ -64,6 +65,7 @@ struct irq_desc {
 	unsigned int		wake_depth;	/* nested wake enables */
 	unsigned int		tot_count;
 	unsigned int		irq_count;	/* For detecting broken IRQs */
+	struct per_irqtime __percpu *irq_time;
 	unsigned long		last_unhandled;	/* Aging timer for unhandled count */
 	unsigned int		irqs_unhandled;
 	atomic_t		threads_handled;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 27ca1c866f29..a4d7c324cba2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -422,9 +422,12 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 	desc->kstat_irqs = alloc_percpu(unsigned int);
 	if (!desc->kstat_irqs)
 		goto err_desc;
+	desc->irq_time = alloc_percpu(struct per_irqtime);
+	if (!desc->irq_time)
+		goto err_kstat;
 
 	if (alloc_masks(desc, node))
-		goto err_kstat;
+		goto err_irqtime;
 
 	raw_spin_lock_init(&desc->lock);
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
@@ -439,6 +442,8 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 
 	return desc;
 
+err_irqtime:
+	free_percpu(desc->irq_time);
 err_kstat:
 	free_percpu(desc->kstat_irqs);
 err_desc:
@@ -451,6 +456,7 @@ static void irq_kobj_release(struct kobject *kobj)
 	struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
 
 	free_masks(desc);
+	free_percpu(desc->irq_time);
 	free_percpu(desc->kstat_irqs);
 	kfree(desc);
 }
@@ -594,6 +600,7 @@ int __init early_irq_init(void)
 
 	for (i = 0; i < count; i++) {
 		desc[i].kstat_irqs = alloc_percpu(unsigned int);
+		desc[i].irq_time = alloc_percpu(struct per_irqtime);
 		alloc_masks(&desc[i], node);
 		raw_spin_lock_init(&desc[i].lock);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9fad10e0a147..f347c5d8c5c1 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,12 @@
 #include <linux/stop_machine.h>
 #include <linux/kernel_stat.h>
 #include <linux/math64.h>
+#include <linux/tracepoint.h>
+#include <linux/irq.h>
+#include <linux/bitops.h>
+#include <trace/events/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/timekeeping.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
@@ -431,11 +437,15 @@ void touch_softlockup_watchdog_sync(void)
 	__this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
 }
 
+static void set_potential_softlockup(unsigned long now, unsigned long touch_ts);
+
 static int is_softlockup(unsigned long touch_ts,
 			 unsigned long period_ts,
 			 unsigned long now)
 {
 	if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
+		/* Softlockup may occur in the current period */
+		set_potential_softlockup(now, period_ts);
 		/* Warn about unreasonable delays. */
 		if (time_after(now, period_ts + get_softlockup_thresh()))
 			return now - touch_ts;
@@ -448,6 +458,8 @@ static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
 static DEFINE_PER_CPU(u64, cpustat_diff[5][NR_STATS]);
 static DEFINE_PER_CPU(int, cpustat_tail);
 
+static void print_hardirq_time(void);
+
 static void update_cpustat(void)
 {
 	u64 *old = this_cpu_ptr(cpustat_old);
@@ -489,10 +501,179 @@ static void print_cpustat(void)
 			a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
 			a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
 	}
+	print_hardirq_time();
+}
+
+static DEFINE_PER_CPU(u64, hardirq_start_time);
+
+#define SOFTLOCKUP_HARDIRQ	0
+static DEFINE_PER_CPU(long, softlockup_flags);
+
+/*
+ * Maintain a descending top-3 list of per-IRQ handler times.
+ * @irqtime/@irq are parallel arrays of length 3 (time, irq number);
+ * @perirq_time/@perirq_id is the candidate entry.  Ties on time are
+ * broken in favour of the lower IRQ number.
+ */
+static void find_irqtime_top3(u64 *irqtime, int *irq, u64 perirq_time, int perirq_id)
+{
+	if (perirq_time > irqtime[0] || (perirq_time == irqtime[0] && perirq_id < irq[0])) {
+		irqtime[2] = irqtime[1];
+		irq[2] = irq[1];
+		irqtime[1] = irqtime[0];
+		irq[1] = irq[0];
+		irqtime[0] = perirq_time;
+		irq[0] = perirq_id;
+	} else if (perirq_time > irqtime[1] || (perirq_time == irqtime[1] && perirq_id < irq[1])) {
+		irqtime[2] = irqtime[1];
+		irq[2] = irq[1];
+		irqtime[1] = perirq_time;
+		irq[1] = perirq_id;
+	} else if (perirq_time > irqtime[2] || (perirq_time == irqtime[2] && perirq_id < irq[2])) {
+		/*
+		 * Fix: the original compared against irqtime[1] here (copy-paste
+		 * error), so the #3 slot could never be filled by a strictly
+		 * greater time; it must be compared against irqtime[2].
+		 */
+		irqtime[2] = perirq_time;
+		irq[2] = perirq_id;
+	}
+}
+
+/*
+ * If the proportion of time spent handling irq exceeds 50% during a sampling period,
+ * then it is necessary to tally the handling time of each irq.
+ */
+static inline bool need_trace_irqtime(int type)
+{
+	int tail = this_cpu_read(cpustat_tail);
+
+	/*
+	 * cpustat_tail points at the next slot to be filled; step back to the
+	 * most recently completed sample, wrapping around the 5-entry ring
+	 * (cpustat_diff is declared as [5][NR_STATS]).
+	 */
+	if (--tail == -1)
+		tail = 4;
+	return this_cpu_read(cpustat_diff[tail][type]) > sample_period/2;
+}
+
+/*
+ * Tracepoint callback: timestamp hardirq handler entry.  The matching exit
+ * callback accumulates the delta only while SOFTLOCKUP_HARDIRQ is set.
+ */
+static void irq_handler_entry_callback(void *data, int irq, struct irqaction *action)
+{
+	this_cpu_ptr(irq_to_desc(irq)->irq_time)->start_time = local_clock();
+}
+
+/*
+ * Tracepoint callback: accumulate per-IRQ handler time, but only while this
+ * CPU is flagged as a potential hardirq-induced softlockup.
+ */
+static void irq_handler_exit_callback(void *data, int irq, struct irqaction *action, int ret)
+{
+	u64 delta;
+	struct per_irqtime *irq_time;
+
+	if (test_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags))) {
+		irq_time = this_cpu_ptr(irq_to_desc(irq)->irq_time);
+		delta = local_clock() - irq_time->start_time;
+		irq_time->delta += delta;
+	}
+}
+/*
+ * Mark softlockup as potentially caused by hardirq
+ */
+static void set_potential_softlockup_hardirq(void)
+{
+	/* Only start accounting when hardirq time dominated the last sample. */
+	if (!need_trace_irqtime(CPUTIME_IRQ))
+		return;
+
+	/*
+	 * test_and_set_bit() guarantees hardirq_start_time is recorded exactly
+	 * once, on the 0 -> 1 transition of the flag.
+	 */
+	if (test_and_set_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags)))
+		return;
+
+	__this_cpu_write(hardirq_start_time, local_clock());
+}
+
+/*
+ * Stop hardirq time accounting on this CPU and reset its per-IRQ counters.
+ */
+static void clear_potential_softlockup_hardirq(void)
+{
+	unsigned int i;
+	unsigned long flags;
+	struct irq_desc *desc;
+
+	/*
+	 * Disable local interrupts so the entry/exit tracepoint callbacks
+	 * cannot run while this CPU's per-IRQ counters are being reset.
+	 */
+	local_irq_save(flags);
+	clear_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags));
+	for_each_irq_desc(i, desc) {
+		if (!desc)
+			continue;
+		memset(this_cpu_ptr(desc->irq_time), 0, sizeof(struct per_irqtime));
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Attach the irq_handler_entry/exit tracepoints.  Best effort: on failure
+ * any partial registration is undone and per-IRQ time accounting simply
+ * stays disabled (no error is reported to the caller).
+ */
+static void hook_hardirq_events(void)
+{
+	int ret;
+
+	ret = register_trace_irq_handler_entry(irq_handler_entry_callback, NULL);
+	if (ret)
+		goto out_err;
+	ret = register_trace_irq_handler_exit(irq_handler_exit_callback, NULL);
+	if (ret)
+		goto out_unreg_entry;
+	return;
+
+out_unreg_entry:
+	unregister_trace_irq_handler_entry(irq_handler_entry_callback, NULL);
+out_err:
+	return;
+}
+
+/* Detach both tracepoint callbacks registered by hook_hardirq_events(). */
+static void unhook_hardirq_events(void)
+{
+	unregister_trace_irq_handler_entry(irq_handler_entry_callback, NULL);
+	unregister_trace_irq_handler_exit(irq_handler_exit_callback, NULL);
+}
+
+/*
+ * Mark that softlockup may occur
+ */
+static void set_potential_softlockup(unsigned long now, unsigned long period_ts)
+{
+	/*
+	 * Treat the lockup as "potential" once the report is at least one
+	 * fifth of the softlockup threshold overdue.
+	 */
+	if (time_after_eq(now, period_ts + get_softlockup_thresh()/5))
+		set_potential_softlockup_hardirq();
+}
+
+static void clear_potential_softlockup(void)
+{
+	/* Hardirq storms are currently the only tracked lockup cause. */
+	clear_potential_softlockup_hardirq();
+}
+
+/*
+ * Report the top three most time-consuming hardirqs on this CPU since
+ * hardirq time accounting was enabled, as a percentage (one decimal place)
+ * of the accounting period.
+ */
+static void print_hardirq_time(void)
+{
+	struct per_irqtime *irq_time;
+	struct irq_desc *desc;
+	u64 hardirqtime_top3[3] = {0, 0, 0};
+	int hardirq_top3[3] = {-1, -1, -1};
+	u64 start_time, now, a;
+	u32 period_us, rem, i, b;
+
+	if (test_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags))) {
+		start_time = __this_cpu_read(hardirq_start_time);
+		now = local_clock();
+		period_us = (now - start_time)/1000;
+		/* Guard the divisions below against a zero-length period. */
+		if (!period_us)
+			period_us = 1;
+		/* Find the top three most time-consuming interrupts */
+		for_each_irq_desc(i, desc) {
+			if (!desc)
+				continue;
+			irq_time = this_cpu_ptr(desc->irq_time);
+			find_irqtime_top3(hardirqtime_top3, hardirq_top3, irq_time->delta, i);
+		}
+		printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds 50%% since %llds. Most time-consuming HardIRQs:\n",
+			smp_processor_id(), ns_to_timespec64(start_time - sample_period).tv_sec);
+		for (i = 0; i < 3; i++) {
+			if (hardirq_top3[i] == -1)
+				break;
+			/* Percentage = delta(us) * 100 / period_us, integer part
+			 * in 'a', first decimal digit in 'b'. */
+			a = 100 * (hardirqtime_top3[i] / 1000);
+			rem = do_div(a, period_us);
+			/*
+			 * Fix: do_div() requires a u64 dividend; the original
+			 * called do_div() on the u32 'b'.  Use div_u64() on the
+			 * remainder instead (math64.h is already included).
+			 */
+			b = div_u64(10ULL * rem, period_us);
+			desc = irq_to_desc(hardirq_top3[i]);
+			if (desc && desc->action)
+				printk(KERN_CRIT "\t#%u: %llu.%u%% irq#%d(%s)\n",
+					i+1, a, b, hardirq_top3[i], desc->action->name);
+			else
+				printk(KERN_CRIT "\t#%u: %llu.%u%% irq#%d\n",
+					i+1, a, b, hardirq_top3[i]);
+		}
+		/* Keep accounting running if hardirq load is still above 50%. */
+		if (!need_trace_irqtime(CPUTIME_IRQ))
+			clear_potential_softlockup_hardirq();
+	}
+}
+
 #else
 static inline void update_cpustat(void) { }
 static inline void print_cpustat(void) { }
+/* No-op stubs when the CPU utilization statistics are compiled out. */
+static inline void set_potential_softlockup(unsigned long now, unsigned long period_ts) { }
+static inline void clear_potential_softlockup(void) { }
+static inline void hook_hardirq_events(void) { }
+static inline void unhook_hardirq_events(void) { }
 #endif
 
 /* watchdog detector functions */
@@ -510,6 +691,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
 static int softlockup_fn(void *data)
 {
 	update_touch_ts();
+	clear_potential_softlockup();
 	complete(this_cpu_ptr(&softlockup_completion));
 
 	return 0;
@@ -674,6 +856,8 @@ static void softlockup_stop_all(void)
 	if (!softlockup_initialized)
 		return;
 
+	unhook_hardirq_events();
+
 	for_each_cpu(cpu, &watchdog_allowed_mask)
 		smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
 
@@ -690,6 +874,8 @@ static void softlockup_start_all(void)
 {
 	int cpu;
 
+	hook_hardirq_events();
+
 	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
 	for_each_cpu(cpu, &watchdog_allowed_mask)
 		smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
-- 
2.37.1 (Apple Git-137.1)


  parent reply	other threads:[~2024-01-23 12:12 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-23 12:12 [PATCH 0/3] *** Detect interrupt storm in softlockup *** Bitao Hu
2024-01-23 12:12 ` [PATCH 1/3] watchdog/softlockup: low-overhead detection of interrupt storm Bitao Hu
2024-01-24  1:43   ` Liu Song
2024-01-24  2:48     ` yaoma
2024-01-25  0:19   ` Doug Anderson
2024-01-25  8:31     ` Bitao Hu
2024-01-25 15:08       ` Doug Anderson
2024-01-26  5:25   ` kernel test robot
2024-01-23 12:12 ` Bitao Hu [this message]
2024-01-25  0:19   ` [PATCH 2/3] watchdog/softlockup: report the most time-consuming hardirq Doug Anderson
2024-01-25  7:50     ` Bitao Hu
2024-01-25  9:19     ` Bitao Hu
2024-02-14 23:36   ` Thomas Gleixner
2024-02-14 23:39     ` Doug Anderson
2024-02-15  1:02       ` Thomas Gleixner
2024-01-23 12:12 ` [PATCH 3/3] watchdog/softlockup: add parameter to control the reporting of " Bitao Hu
2024-01-26  5:25   ` kernel test robot
2024-01-26  6:07   ` kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240123121223.22318-3-yaoma@linux.alibaba.com \
    --to=yaoma@linux.alibaba.com \
    --cc=akpm@linux-foundation.org \
    --cc=dianders@chromium.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=liusong@linux.alibaba.com \
    --cc=maz@kernel.org \
    --cc=pmladek@suse.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox