* + watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch added to mm-nonmm-unstable branch
@ 2026-02-20 21:20 Andrew Morton
0 siblings, 0 replies; 2+ messages in thread
From: Andrew Morton @ 2026-02-20 21:20 UTC (permalink / raw)
To: mm-commits, wangjinchao600, pmladek, max.kellermann, lihuafei1,
irogers, eranian, dianders, cuiyunhui, corbet, mrungta, akpm
The patch titled
Subject: watchdog/hardlockup: improve buddy system detection timeliness
has been added to the -mm mm-nonmm-unstable branch. Its filename is
watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch
This patch will later appear in the mm-nonmm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via various
branches at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there most days
------------------------------------------------------
From: Mayank Rungta <mrungta@google.com>
Subject: watchdog/hardlockup: improve buddy system detection timeliness
Date: Thu, 12 Feb 2026 14:12:12 -0700
Currently, the buddy system only performs checks every 3rd sample. With a
4-second interval, if a check window is missed, the next check occurs 12
seconds later, potentially delaying hard lockup detection for up to 24
seconds.
Modify the buddy system to perform checks at every interval (4s).
Introduce a missed-interrupt threshold to maintain the existing grace
period while reducing the detection window to 8-12 seconds.
Best and worst case detection scenarios:
Before (12s check window):
- Best case: Lockup occurs after first check but just before heartbeat
interval. Detected in ~8s (8s till next check).
- Worst case: Lockup occurs just after a check.
Detected in ~24s (missed check + 12s till next check + 12s logic).
After (4s check window with threshold of 3):
- Best case: Lockup occurs just before a check.
Detected in ~8s (0s till 1st check + 4s till 2nd + 4s till 3rd).
- Worst case: Lockup occurs just after a check.
Detected in ~12s (4s till 1st check + 4s till 2nd + 4s till 3rd).
Link: https://lkml.kernel.org/r/20260212-hardlockup-watchdog-fixes-v1-3-745f1dce04c3@google.com
Signed-off-by: Mayank Rungta <mrungta@google.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Li Huafei <lihuafei1@huawei.com>
Cc: Max Kellermann <max.kellermann@ionos.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Wang Jinchao <wangjinchao600@gmail.com>
Cc: Yunhui Cui <cuiyunhui@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/nmi.h | 1 +
kernel/watchdog.c | 18 ++++++++++++++++--
kernel/watchdog_buddy.c | 9 +--------
3 files changed, 18 insertions(+), 10 deletions(-)
--- a/include/linux/nmi.h~watchdog-hardlockup-improve-buddy-system-detection-timeliness
+++ a/include/linux/nmi.h
@@ -21,6 +21,7 @@ void lockup_detector_soft_poweroff(void)
extern int watchdog_user_enabled;
extern int watchdog_thresh;
extern unsigned long watchdog_enabled;
+extern int watchdog_hardlockup_miss_thresh;
extern struct cpumask watchdog_cpumask;
extern unsigned long *watchdog_cpumask_bits;
--- a/kernel/watchdog_buddy.c~watchdog-hardlockup-improve-buddy-system-detection-timeliness
+++ a/kernel/watchdog_buddy.c
@@ -21,6 +21,7 @@ static unsigned int watchdog_next_cpu(un
int __init watchdog_hardlockup_probe(void)
{
+ watchdog_hardlockup_miss_thresh = 3;
return 0;
}
@@ -86,14 +87,6 @@ void watchdog_buddy_check_hardlockup(int
{
unsigned int next_cpu;
- /*
- * Test for hardlockups every 3 samples. The sample period is
- * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
- * watchdog_thresh (over by 20%).
- */
- if (hrtimer_interrupts % 3 != 0)
- return;
-
/* check for a hardlockup on the next CPU */
next_cpu = watchdog_next_cpu(smp_processor_id());
if (next_cpu >= nr_cpu_ids)
--- a/kernel/watchdog.c~watchdog-hardlockup-improve-buddy-system-detection-timeliness
+++ a/kernel/watchdog.c
@@ -61,6 +61,13 @@ int __read_mostly sysctl_hardlockup_all_
# endif /* CONFIG_SMP */
/*
+ * Number of consecutive missed interrupts before declaring a lockup.
+ * Default to 1 (immediate) for NMI/Perf. Buddy will overwrite this to 3.
+ */
+int __read_mostly watchdog_hardlockup_miss_thresh = 1;
+EXPORT_SYMBOL_GPL(watchdog_hardlockup_miss_thresh);
+
+/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
unsigned int __read_mostly hardlockup_panic =
@@ -137,6 +144,7 @@ __setup("nmi_watchdog=", hardlockup_pani
static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(int, hrtimer_interrupts_missed);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
static unsigned long hard_lockup_nmi_warn;
@@ -163,8 +171,13 @@ static bool is_hardlockup(unsigned int c
{
int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
- if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
- return true;
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) {
+ per_cpu(hrtimer_interrupts_missed, cpu)++;
+ if (per_cpu(hrtimer_interrupts_missed, cpu) >= watchdog_hardlockup_miss_thresh)
+ return true;
+
+ return false;
+ }
/*
* NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
@@ -172,6 +185,7 @@ static bool is_hardlockup(unsigned int c
* written/read by a single CPU.
*/
per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ per_cpu(hrtimer_interrupts_missed, cpu) = 0;
return false;
}
_
Patches currently in -mm which might be from mrungta@google.com are
watchdog-hardlockup-always-update-saved-interrupts-during-check.patch
doc-watchdog-clarify-hardlockup-detection-timing.patch
watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch
doc-watchdog-document-buddy-detector.patch
^ permalink raw reply [flat|nested] 2+ messages in thread
* + watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch added to mm-nonmm-unstable branch
@ 2026-03-14 23:33 Andrew Morton
0 siblings, 0 replies; 2+ messages in thread
From: Andrew Morton @ 2026-03-14 23:33 UTC (permalink / raw)
To: mm-commits, wangjinchao600, skhan, pmladek, max.kellermann,
lihuafei1, irogers, eranian, dianders, cuiyunhui, corbet, mrungta,
akpm
The patch titled
Subject: watchdog/hardlockup: improve buddy system detection timeliness
has been added to the -mm mm-nonmm-unstable branch. Its filename is
watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch
This patch will later appear in the mm-nonmm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via various
branches at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there most days
------------------------------------------------------
From: Mayank Rungta <mrungta@google.com>
Subject: watchdog/hardlockup: improve buddy system detection timeliness
Date: Thu, 12 Mar 2026 16:22:05 -0700
Currently, the buddy system only performs checks every 3rd sample. With a
4-second interval, if a check window is missed, the next check occurs 12
seconds later, potentially delaying hard lockup detection for up to 24
seconds.
Modify the buddy system to perform checks at every interval (4s).
Introduce a missed-interrupt threshold to maintain the existing grace
period while reducing the detection window to 8-12 seconds.
Best and worst case detection scenarios:
Before (12s check window):
- Best case: Lockup occurs after first check but just before heartbeat
interval. Detected in ~8s (8s till next check).
- Worst case: Lockup occurs just after a check.
Detected in ~24s (missed check + 12s till next check + 12s logic).
After (4s check window with threshold of 3):
- Best case: Lockup occurs just before a check.
Detected in ~8s (0s till 1st check + 4s till 2nd + 4s till 3rd).
- Worst case: Lockup occurs just after a check.
Detected in ~12s (4s till 1st check + 4s till 2nd + 4s till 3rd).
Link: https://lkml.kernel.org/r/20260312-hardlockup-watchdog-fixes-v2-4-45bd8a0cc7ed@google.com
Signed-off-by: Mayank Rungta <mrungta@google.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Li Huafei <lihuafei1@huawei.com>
Cc: Max Kellermann <max.kellermann@ionos.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Wang Jinchao <wangjinchao600@gmail.com>
Cc: Yunhui Cui <cuiyunhui@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/nmi.h | 1 +
kernel/watchdog.c | 19 ++++++++++++++++---
kernel/watchdog_buddy.c | 9 +--------
3 files changed, 18 insertions(+), 11 deletions(-)
--- a/include/linux/nmi.h~watchdog-hardlockup-improve-buddy-system-detection-timeliness
+++ a/include/linux/nmi.h
@@ -21,6 +21,7 @@ void lockup_detector_soft_poweroff(void)
extern int watchdog_user_enabled;
extern int watchdog_thresh;
extern unsigned long watchdog_enabled;
+extern int watchdog_hardlockup_miss_thresh;
extern struct cpumask watchdog_cpumask;
extern unsigned long *watchdog_cpumask_bits;
--- a/kernel/watchdog_buddy.c~watchdog-hardlockup-improve-buddy-system-detection-timeliness
+++ a/kernel/watchdog_buddy.c
@@ -21,6 +21,7 @@ static unsigned int watchdog_next_cpu(un
int __init watchdog_hardlockup_probe(void)
{
+ watchdog_hardlockup_miss_thresh = 3;
return 0;
}
@@ -86,14 +87,6 @@ void watchdog_buddy_check_hardlockup(int
{
unsigned int next_cpu;
- /*
- * Test for hardlockups every 3 samples. The sample period is
- * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
- * watchdog_thresh (over by 20%).
- */
- if (hrtimer_interrupts % 3 != 0)
- return;
-
/* check for a hardlockup on the next CPU */
next_cpu = watchdog_next_cpu(smp_processor_id());
if (next_cpu >= nr_cpu_ids)
--- a/kernel/watchdog.c~watchdog-hardlockup-improve-buddy-system-detection-timeliness
+++ a/kernel/watchdog.c
@@ -61,6 +61,13 @@ int __read_mostly sysctl_hardlockup_all_
# endif /* CONFIG_SMP */
/*
+ * Number of consecutive missed interrupts before declaring a lockup.
+ * Default to 1 (immediate) for NMI/Perf. Buddy will overwrite this to 3.
+ */
+int __read_mostly watchdog_hardlockup_miss_thresh = 1;
+EXPORT_SYMBOL_GPL(watchdog_hardlockup_miss_thresh);
+
+/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
unsigned int __read_mostly hardlockup_panic =
@@ -137,6 +144,7 @@ __setup("nmi_watchdog=", hardlockup_pani
static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(int, hrtimer_interrupts_missed);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
static unsigned long hard_lockup_nmi_warn;
@@ -159,7 +167,7 @@ void watchdog_hardlockup_touch_cpu(unsig
per_cpu(watchdog_hardlockup_touched, cpu) = true;
}
-static void watchdog_hardlockup_update(unsigned int cpu)
+static void watchdog_hardlockup_update_reset(unsigned int cpu)
{
int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
@@ -169,6 +177,7 @@ static void watchdog_hardlockup_update(u
* written/read by a single CPU.
*/
per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ per_cpu(hrtimer_interrupts_missed, cpu) = 0;
}
static bool is_hardlockup(unsigned int cpu)
@@ -176,10 +185,14 @@ static bool is_hardlockup(unsigned int c
int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
if (per_cpu(hrtimer_interrupts_saved, cpu) != hrint) {
- watchdog_hardlockup_update(cpu);
+ watchdog_hardlockup_update_reset(cpu);
return false;
}
+ per_cpu(hrtimer_interrupts_missed, cpu)++;
+ if (per_cpu(hrtimer_interrupts_missed, cpu) % watchdog_hardlockup_miss_thresh)
+ return false;
+
return true;
}
@@ -198,7 +211,7 @@ void watchdog_hardlockup_check(unsigned
unsigned long flags;
if (per_cpu(watchdog_hardlockup_touched, cpu)) {
- watchdog_hardlockup_update(cpu);
+ watchdog_hardlockup_update_reset(cpu);
per_cpu(watchdog_hardlockup_touched, cpu) = false;
return;
}
_
Patches currently in -mm which might be from mrungta@google.com are
watchdog-return-early-in-watchdog_hardlockup_check.patch
watchdog-update-saved-interrupts-during-check.patch
doc-watchdog-clarify-hardlockup-detection-timing.patch
watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch
doc-watchdog-document-buddy-detector.patch
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-03-14 23:33 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-20 21:20 + watchdog-hardlockup-improve-buddy-system-detection-timeliness.patch added to mm-nonmm-unstable branch Andrew Morton
-- strict thread matches above, loose matches on Subject: below --
2026-03-14 23:33 Andrew Morton
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.