From: Laurent Dufour <ldufour@linux.ibm.com>
To: Nicholas Piggin <npiggin@gmail.com>, linuxppc-dev@lists.ozlabs.org
Subject: Re: [PATCH v2 4/5] powerpc/watchdog: Read TB close to where it is used
Date: Fri, 5 Nov 2021 14:39:32 +0100 [thread overview]
Message-ID: <b0880e6e-71c7-0ea2-1e6d-6dee86265abe@linux.ibm.com> (raw)
In-Reply-To: <20211104161057.1255659-5-npiggin@gmail.com>
Le 04/11/2021 à 17:10, Nicholas Piggin a écrit :
> When taking watchdog actions, printing messages, comparing and
> re-setting wd_smp_last_reset_tb, etc., read TB close to the point of use
> and under wd_smp_lock or printing lock (if applicable).
>
> This should keep timebase mostly monotonic with kernel log messages, and
> could prevent (in theory) a laggy CPU updating wd_smp_last_reset_tb to
> something a long way in the past, and causing other CPUs to appear to be
> stuck.
>
> These additional TB reads are all slowpath (lockup has been detected),
> so performance does not matter.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> arch/powerpc/kernel/watchdog.c | 30 ++++++++++++++++++------------
> 1 file changed, 18 insertions(+), 12 deletions(-)
>
> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index 0265d27340f1..2444cd10b61a 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -94,6 +94,10 @@ static u64 wd_smp_last_reset_tb;
> * Try to take the exclusive watchdog action / NMI IPI / printing lock.
> * wd_smp_lock must be held. If this fails, we should return and wait
> * for the watchdog to kick in again (or another CPU to trigger it).
> + *
> + * Importantly, if hardlockup_panic is set, wd_try_report failure should
> + * not delay the panic, because whichever other CPU is reporting will
> + * call panic.
> */
I guess this comment should be part of the previous commit in this series.
Apart from that, please consider
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
> static bool wd_try_report(void)
> {
> @@ -153,7 +157,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
> /* Do not panic from here because that can recurse into NMI IPI layer */
> }
>
> -static bool set_cpu_stuck(int cpu, u64 tb)
> +static bool set_cpu_stuck(int cpu)
> {
> cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
> cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
> @@ -162,7 +166,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
> */
> smp_mb();
> if (cpumask_empty(&wd_smp_cpus_pending)) {
> - wd_smp_last_reset_tb = tb;
> + wd_smp_last_reset_tb = get_tb();
> cpumask_andnot(&wd_smp_cpus_pending,
> &wd_cpus_enabled,
> &wd_smp_cpus_stuck);
> @@ -171,14 +175,16 @@ static bool set_cpu_stuck(int cpu, u64 tb)
> return false;
> }
>
> -static void watchdog_smp_panic(int cpu, u64 tb)
> +static void watchdog_smp_panic(int cpu)
> {
> static cpumask_t wd_smp_cpus_ipi; // protected by reporting
> unsigned long flags;
> + u64 tb;
> int c;
>
> wd_smp_lock(&flags);
> /* Double check some things under lock */
> + tb = get_tb();
> if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
> goto out;
> if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
> @@ -192,7 +198,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
> continue; // should not happen
>
> __cpumask_set_cpu(c, &wd_smp_cpus_ipi);
> - if (set_cpu_stuck(c, tb))
> + if (set_cpu_stuck(c))
> break;
> }
> if (cpumask_empty(&wd_smp_cpus_ipi)) {
> @@ -232,7 +238,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
> wd_smp_unlock(&flags);
> }
>
> -static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
> +static void wd_smp_clear_cpu_pending(int cpu)
> {
> if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
> if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
> @@ -240,7 +246,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
> unsigned long flags;
>
> pr_emerg("CPU %d became unstuck TB:%lld\n",
> - cpu, tb);
> + cpu, get_tb());
> print_irqtrace_events(current);
> if (regs)
> show_regs(regs);
> @@ -301,7 +307,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
> */
> wd_smp_lock(&flags);
> if (cpumask_empty(&wd_smp_cpus_pending)) {
> - wd_smp_last_reset_tb = tb;
> + wd_smp_last_reset_tb = get_tb();
> cpumask_andnot(&wd_smp_cpus_pending,
> &wd_cpus_enabled,
> &wd_smp_cpus_stuck);
> @@ -316,10 +322,10 @@ static void watchdog_timer_interrupt(int cpu)
>
> per_cpu(wd_timer_tb, cpu) = tb;
>
> - wd_smp_clear_cpu_pending(cpu, tb);
> + wd_smp_clear_cpu_pending(cpu);
>
> if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
> - watchdog_smp_panic(cpu, tb);
> + watchdog_smp_panic(cpu);
> }
>
> DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
> @@ -356,7 +362,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
> return 0;
> }
>
> - set_cpu_stuck(cpu, tb);
> + set_cpu_stuck(cpu);
>
> wd_smp_unlock(&flags);
>
> @@ -417,7 +423,7 @@ void arch_touch_nmi_watchdog(void)
> tb = get_tb();
> if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
> per_cpu(wd_timer_tb, cpu) = tb;
> - wd_smp_clear_cpu_pending(cpu, tb);
> + wd_smp_clear_cpu_pending(cpu);
> }
> }
> EXPORT_SYMBOL(arch_touch_nmi_watchdog);
> @@ -475,7 +481,7 @@ static void stop_watchdog(void *arg)
> cpumask_clear_cpu(cpu, &wd_cpus_enabled);
> wd_smp_unlock(&flags);
>
> - wd_smp_clear_cpu_pending(cpu, get_tb());
> + wd_smp_clear_cpu_pending(cpu);
> }
>
> static int stop_watchdog_on_cpu(unsigned int cpu)
>
next prev parent reply other threads:[~2021-11-05 13:40 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-11-04 16:10 [PATCH v2 0/5] powerpc: watchdog fixes Nicholas Piggin
2021-11-04 16:10 ` [PATCH v2 1/5] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race Nicholas Piggin
2021-11-05 9:20 ` Laurent Dufour
2021-11-05 11:46 ` Nicholas Piggin
2021-11-05 12:15 ` Laurent Dufour
2021-11-04 16:10 ` [PATCH v2 2/5] powerpc/watchdog: Tighten non-atomic read-modify-write access Nicholas Piggin
2021-11-05 16:17 ` Laurent Dufour
2021-11-04 16:10 ` [PATCH v2 3/5] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi Nicholas Piggin
2021-11-04 16:10 ` [PATCH v2 4/5] powerpc/watchdog: Read TB close to where it is used Nicholas Piggin
2021-11-05 13:39 ` Laurent Dufour [this message]
2021-11-04 16:10 ` [PATCH v2 5/5] powerpc/watchdog: Remove backtrace print from unstuck message Nicholas Piggin
2021-11-04 16:48 ` Laurent Dufour
2021-11-05 1:28 ` Nicholas Piggin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=b0880e6e-71c7-0ea2-1e6d-6dee86265abe@linux.ibm.com \
--to=ldufour@linux.ibm.com \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=npiggin@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).