From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755553AbYJBXGy (ORCPT ); Thu, 2 Oct 2008 19:06:54 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753231AbYJBXGq (ORCPT ); Thu, 2 Oct 2008 19:06:46 -0400 Received: from e4.ny.us.ibm.com ([32.97.182.144]:37143 "EHLO e4.ny.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753061AbYJBXGp (ORCPT ); Thu, 2 Oct 2008 19:06:45 -0400 Date: Thu, 2 Oct 2008 16:06:39 -0700 From: "Paul E. McKenney" To: linux-kernel@vger.kernel.org Cc: Ingo Molnar , mathieu.desnoyers@polymtl.ca, akpm@linux-foundation.org, laijs@cn.fujitsu.com, manfred@colorfullife.com, dipankar@in.ibm.com, niv@us.ibm.com, dvhltc@us.ibm.com, josht@linux.vnet.ibm.com, ltt-dev@lists.casi.polymtl.ca Subject: [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Message-ID: <20081002230639.GA7522@linux.vnet.ibm.com> Reply-To: paulmck@linux.vnet.ibm.com References: <20081002003628.GA7877@linux.vnet.ibm.com> <20081002080726.GB17695@elte.hu> <20081002140507.GB6729@linux.vnet.ibm.com> <20081002151046.GB2473@Krystal> <20081002174318.GB7324@linux.vnet.ibm.com> <20081002185115.GA11047@Krystal> <20081002225537.GA6706@linux.vnet.ibm.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20081002225537.GA6706@linux.vnet.ibm.com> User-Agent: Mutt/1.5.15+20070412 (2007-04-11) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hello! This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults to disabled. This is a debugging feature to detect infinite loops in kernel code, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. 
This is essentially a port of this functionality from the treercu patch, replacing the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). The changes from the patch in tip/core/rcu include making the config variable name match that in treercu, changing from seconds to jiffies to avoid spurious warnings, and printing a boot message when this feature is enabled. Signed-off-by: Paul E. McKenney --- include/linux/rcuclassic.h | 12 ++- kernel/rcuclassic.c | 166 +++++++++++++++++++++++---------------------- lib/Kconfig.debug | 2 3 files changed, 96 insertions(+), 84 deletions(-) diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 29bf528..2d72d20 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -40,15 +40,21 @@ #include #include +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK (3 * HZ) /* for rcp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ long pending; /* Number of the last pending batch */ -#ifdef CONFIG_DEBUG_RCU_STALL - unsigned long gp_check; /* Time grace period should end, in seconds. */ -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started in jiffies. */ + unsigned long jiffies_stall; + /* Time at which to check for CPU stalls. 
*/ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ int signaled; diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ed15128..eae2fb6 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, } } +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur == rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. 
*/ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two jiffies to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) * period (if necessary). */ -#ifdef CONFIG_DEBUG_RCU_STALL - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_check = get_seconds() + 3; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = get_seconds() - rcp->gp_check; - if (delta < 2L || cpus_empty(rcp->cpumask)) { - spin_unlock(&rcp->lock); - return; - } - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... 
*/ - - printk(KERN_ERR "RCU detected CPU stalls:"); - for_each_cpu_mask(cpu, rcp->cpumask) - printk(" %d", cpu); - printk(" (detected by %d, t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(get_seconds() - rcp->gp_check) >= 0L) - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - long delta; - - delta = get_seconds() - rcp->gp_check; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { - - /* We haven't checked in, so go dump stack. */ - - print_cpu_stall(rcp); - - } else { - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } - } -} - -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ - /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; - record_gp_check_time(rcp); + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* Check for CPU stalls, if enabled. 
*/ - check_cpu_stall(rcp, rdp); + check_cpu_stall(rcp); if (rdp->nxtlist) { long completed_snap = ACCESS_ONCE(rcp->completed); @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4e921a8..e0e0582 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -616,7 +616,7 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. -config RCU_CPU_STALL +config RCU_CPU_STALL_DETECTOR bool "Check for stalled CPUs delaying RCU grace periods" depends on CLASSIC_RCU default n