linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: Laurent Dufour <ldufour@linux.ibm.com>
To: Nicholas Piggin <npiggin@gmail.com>, linuxppc-dev@lists.ozlabs.org
Subject: Re: [PATCH v2 4/5] powerpc/watchdog: Read TB close to where it is used
Date: Fri, 5 Nov 2021 14:39:32 +0100	[thread overview]
Message-ID: <b0880e6e-71c7-0ea2-1e6d-6dee86265abe@linux.ibm.com> (raw)
In-Reply-To: <20211104161057.1255659-5-npiggin@gmail.com>

Le 04/11/2021 à 17:10, Nicholas Piggin a écrit :
> When taking watchdog actions, printing messages, comparing and
> re-setting wd_smp_last_reset_tb, etc., read TB close to the point of use
> and under wd_smp_lock or printing lock (if applicable).
> 
> This should keep timebase mostly monotonic with kernel log messages, and
> could prevent (in theory) a laggy CPU updating wd_smp_last_reset_tb to
> something a long way in the past, and causing other CPUs to appear to be
> stuck.
> 
> These additional TB reads are all slowpath (lockup has been detected),
> so performance does not matter.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   arch/powerpc/kernel/watchdog.c | 30 ++++++++++++++++++------------
>   1 file changed, 18 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index 0265d27340f1..2444cd10b61a 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -94,6 +94,10 @@ static u64 wd_smp_last_reset_tb;
>    * Try to take the exclusive watchdog action / NMI IPI / printing lock.
>    * wd_smp_lock must be held. If this fails, we should return and wait
>    * for the watchdog to kick in again (or another CPU to trigger it).
> + *
> + * Importantly, if hardlockup_panic is set, wd_try_report failure should
> + * not delay the panic, because whichever other CPU is reporting will
> + * call panic.
>    */

I guess this comment should be part of the previous commit in this series.

Apart from that, please consider

Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>

>   static bool wd_try_report(void)
>   {
> @@ -153,7 +157,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
>   	/* Do not panic from here because that can recurse into NMI IPI layer */
>   }
>   
> -static bool set_cpu_stuck(int cpu, u64 tb)
> +static bool set_cpu_stuck(int cpu)
>   {
>   	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
>   	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
> @@ -162,7 +166,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
>   	 */
>   	smp_mb();
>   	if (cpumask_empty(&wd_smp_cpus_pending)) {
> -		wd_smp_last_reset_tb = tb;
> +		wd_smp_last_reset_tb = get_tb();
>   		cpumask_andnot(&wd_smp_cpus_pending,
>   				&wd_cpus_enabled,
>   				&wd_smp_cpus_stuck);
> @@ -171,14 +175,16 @@ static bool set_cpu_stuck(int cpu, u64 tb)
>   	return false;
>   }
>   
> -static void watchdog_smp_panic(int cpu, u64 tb)
> +static void watchdog_smp_panic(int cpu)
>   {
>   	static cpumask_t wd_smp_cpus_ipi; // protected by reporting
>   	unsigned long flags;
> +	u64 tb;
>   	int c;
>   
>   	wd_smp_lock(&flags);
>   	/* Double check some things under lock */
> +	tb = get_tb();
>   	if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
>   		goto out;
>   	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
> @@ -192,7 +198,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   			continue; // should not happen
>   
>   		__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
> -		if (set_cpu_stuck(c, tb))
> +		if (set_cpu_stuck(c))
>   			break;
>   	}
>   	if (cpumask_empty(&wd_smp_cpus_ipi)) {
> @@ -232,7 +238,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   	wd_smp_unlock(&flags);
>   }
>   
> -static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
> +static void wd_smp_clear_cpu_pending(int cpu)
>   {
>   	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
>   		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
> @@ -240,7 +246,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   			unsigned long flags;
>   
>   			pr_emerg("CPU %d became unstuck TB:%lld\n",
> -				 cpu, tb);
> +				 cpu, get_tb());
>   			print_irqtrace_events(current);
>   			if (regs)
>   				show_regs(regs);
> @@ -301,7 +307,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   		 */
>   		wd_smp_lock(&flags);
>   		if (cpumask_empty(&wd_smp_cpus_pending)) {
> -			wd_smp_last_reset_tb = tb;
> +			wd_smp_last_reset_tb = get_tb();
>   			cpumask_andnot(&wd_smp_cpus_pending,
>   					&wd_cpus_enabled,
>   					&wd_smp_cpus_stuck);
> @@ -316,10 +322,10 @@ static void watchdog_timer_interrupt(int cpu)
>   
>   	per_cpu(wd_timer_tb, cpu) = tb;
>   
> -	wd_smp_clear_cpu_pending(cpu, tb);
> +	wd_smp_clear_cpu_pending(cpu);
>   
>   	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
> -		watchdog_smp_panic(cpu, tb);
> +		watchdog_smp_panic(cpu);
>   }
>   
>   DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
> @@ -356,7 +362,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>   			return 0;
>   		}
>   
> -		set_cpu_stuck(cpu, tb);
> +		set_cpu_stuck(cpu);
>   
>   		wd_smp_unlock(&flags);
>   
> @@ -417,7 +423,7 @@ void arch_touch_nmi_watchdog(void)
>   	tb = get_tb();
>   	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
>   		per_cpu(wd_timer_tb, cpu) = tb;
> -		wd_smp_clear_cpu_pending(cpu, tb);
> +		wd_smp_clear_cpu_pending(cpu);
>   	}
>   }
>   EXPORT_SYMBOL(arch_touch_nmi_watchdog);
> @@ -475,7 +481,7 @@ static void stop_watchdog(void *arg)
>   	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
>   	wd_smp_unlock(&flags);
>   
> -	wd_smp_clear_cpu_pending(cpu, get_tb());
> +	wd_smp_clear_cpu_pending(cpu);
>   }
>   
>   static int stop_watchdog_on_cpu(unsigned int cpu)
> 


  reply	other threads:[~2021-11-05 13:40 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-04 16:10 [PATCH v2 0/5] powerpc: watchdog fixes Nicholas Piggin
2021-11-04 16:10 ` [PATCH v2 1/5] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race Nicholas Piggin
2021-11-05  9:20   ` Laurent Dufour
2021-11-05 11:46     ` Nicholas Piggin
2021-11-05 12:15       ` Laurent Dufour
2021-11-04 16:10 ` [PATCH v2 2/5] powerpc/watchdog: Tighten non-atomic read-modify-write access Nicholas Piggin
2021-11-05 16:17   ` Laurent Dufour
2021-11-04 16:10 ` [PATCH v2 3/5] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi Nicholas Piggin
2021-11-04 16:10 ` [PATCH v2 4/5] powerpc/watchdog: Read TB close to where it is used Nicholas Piggin
2021-11-05 13:39   ` Laurent Dufour [this message]
2021-11-04 16:10 ` [PATCH v2 5/5] powerpc/watchdog: Remove backtrace print from unstuck message Nicholas Piggin
2021-11-04 16:48   ` Laurent Dufour
2021-11-05  1:28     ` Nicholas Piggin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b0880e6e-71c7-0ea2-1e6d-6dee86265abe@linux.ibm.com \
    --to=ldufour@linux.ibm.com \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=npiggin@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).