From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: mingo@elte.hu, akpm@linux-foundation.org,
mathieu.desnoyers@polymtl.ca, laijs@cn.fujitsu.com,
manfred@colorfullife.com, dipankar@in.ibm.com, niv@us.ibm.com,
dvhltc@us.ibm.com, josht@linux.vnet.ibm.com
Subject: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU
Date: Wed, 1 Oct 2008 17:36:28 -0700 [thread overview]
Message-ID: <20081002003628.GA7877@linux.vnet.ibm.com> (raw)
Hello!
This patch adds stalled-CPU detection to Classic RCU. This capability
is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR,
which defaults to disabled. This is a debugging feature, not something
that non-kernel-hackers would be expected to care about. This feature
can detect looping CPUs in !PREEMPT builds and looping CPUs with
preemption disabled in PREEMPT builds. This is essentially a port of
this functionality from the treercu patch.
One current shortcoming: on some systems, stalls are detected during
early boot, when we normally would not care about them. My thought is
to add a call from late initialization to suppress stall detection until
the system is well along its way to being booted, but thought I should
check to see if there might already be something for this purpose.
(Currently against 2.6.27-rc8, FYI.)
Thoughts?
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
include/linux/rcuclassic.h | 9 ++++
kernel/rcuclassic.c | 88 +++++++++++++++++++++++++++++++++++++++++++++
lib/Kconfig.debug | 13 ++++++
3 files changed, 110 insertions(+)
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 4ab8436..9b62e9a 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -40,6 +40,10 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+/*
+ * Stall-check delays, in seconds.  Each is added to get_seconds() to
+ * compute the rcp->seconds_stall deadline: _CHECK when a grace period
+ * starts, _RECHECK after a stall has been reported.
+ */
+#define RCU_SECONDS_TILL_STALL_CHECK 3 /* for rcp->seconds_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK 30 /* for rcp->seconds_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/* Global control variables for rcupdate callback mechanism. */
struct rcu_ctrlblk {
@@ -52,6 +56,11 @@ struct rcu_ctrlblk {
spinlock_t lock ____cacheline_internodealigned_in_smp;
cpumask_t cpumask; /* CPUs that need to switch in order */
/* for current batch to proceed. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+ unsigned long gp_start; /* Time at which GP started in jiffies. */
+ unsigned long seconds_stall;
+ /* Time at which to check for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
} ____cacheline_internodealigned_in_smp;
/* Is batch a before batch b ? */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index aad93cd..c092ba9 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -118,6 +118,87 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
}
#endif
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+/*
+ * Note the start of a grace period: stamp the current jiffies into
+ * rcp->gp_start and arm rcp->seconds_stall so that the first stall
+ * check falls RCU_SECONDS_TILL_STALL_CHECK seconds from now.
+ */
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+	unsigned long first_check;
+
+	rcp->gp_start = jiffies;
+
+	first_check = get_seconds();
+	first_check += RCU_SECONDS_TILL_STALL_CHECK;
+	rcp->seconds_stall = first_check;
+}
+
+/*
+ * Report the CPUs that are holding up the current grace period, on
+ * behalf of a CPU that is not itself being waited on.
+ *
+ * The deadline is rechecked under rcp->lock and rcp->seconds_stall is
+ * pushed out by RCU_SECONDS_TILL_STALL_RECHECK before anything is
+ * printed, so that only one CPU complains per interval.  The message
+ * lists every possible CPU still set in rcp->cpumask, then the
+ * reporting CPU and the number of jiffies since rcp->gp_start.
+ */
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rcp->lock, flags);
+	delta = get_seconds() - rcp->seconds_stall;
+
+	/*
+	 * Bail if some other CPU already reported (the deadline has been
+	 * moved forward) or if the grace period ended while we waited for
+	 * the lock.  The caller only gets here with
+	 * rcp->cur != rcp->completed, so testing "!=" here would suppress
+	 * practically every report.
+	 */
+	if (delta < 2 || rcp->cur == rcp->completed) {
+		spin_unlock_irqrestore(&rcp->lock, flags);
+		return;
+	}
+	rcp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for_each_possible_cpu(cpu) {
+		if (cpu_isset(cpu, rcp->cpumask))
+			printk(" %d", cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+/*
+ * Report that the current CPU is itself stalling the grace period:
+ * print a KERN_ERR message, dump this CPU's stack, push the next check
+ * out by RCU_SECONDS_TILL_STALL_RECHECK if the deadline has passed, and
+ * request a reschedule.
+ */
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long flags;
+
+ /*
+  * NOTE(review): the first "t=" value is get_seconds(), i.e. seconds,
+  * although the message labels both values as jiffies; the second is
+  * the number of jiffies since rcp->gp_start.
+  */
+ printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+ smp_processor_id(), get_seconds(),
+ jiffies - rcp->gp_start);
+ dump_stack();
+ spin_lock_irqsave(&rcp->lock, flags);
+ /* Only move the deadline if it has not already been pushed out. */
+ if ((long)(get_seconds() - rcp->seconds_stall) >= 0)
+ rcp->seconds_stall =
+ get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+ spin_unlock_irqrestore(&rcp->lock, flags);
+ set_need_resched(); /* kick ourselves to get things going. */
+}
+
+/*
+ * Stall-detection entry point, called from __rcu_pending().
+ *
+ * Compares the current time in seconds against rcp->seconds_stall
+ * without taking rcp->lock.  If this CPU is still set in rcp->cpumask
+ * once the deadline is reached, it reports itself.  Otherwise, if a
+ * grace period is still outstanding (rcp->cur != rcp->completed) two or
+ * more seconds past the deadline, it reports the other CPUs.
+ */
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ long delta;
+
+ /* Signed difference: negative until the deadline is reached. */
+ delta = get_seconds() - rcp->seconds_stall;
+ if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(rcp);
+
+ } else if (rcp->cur != rcp->completed && delta >= 2) {
+
+ /* They had two seconds to dump stack, so complain. */
+ print_other_cpu_stall(rcp);
+ }
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/* Stall detection is compiled out: there is no deadline to record. */
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+
+/*
+ * Stall detection is compiled out: nothing to check.  Takes the same
+ * single argument as the CONFIG_RCU_CPU_STALL_DETECTOR version so that
+ * the check_cpu_stall(rcp) call in __rcu_pending() builds either way.
+ */
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
@@ -285,6 +366,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
*/
smp_wmb();
rcp->cur++;
+ record_gp_stall_check_time(rcp);
/*
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -468,6 +550,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
+ /* Check for CPU stalls, if enabled. */
+ check_cpu_stall(rcp);
+
/* This cpu has pending rcu entries and the grace period
* for them has completed.
*/
@@ -558,6 +643,9 @@ void rcu_check_callbacks(int cpu, int user)
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+ /* Announce the feature; keyed on the Kconfig symbol this patch adds. */
+ printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
memset(rdp, 0, sizeof(*rdp));
rdp->curtail = &rdp->curlist;
rdp->nxttail = &rdp->nxtlist;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0b50481..9fee969 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
Say N here if you want the RCU torture tests to start only
after being manually enabled via /proc.
+# Debugging aid for Classic RCU: gates the stall-detection code in
+# kernel/rcuclassic.c and the stall fields in struct rcu_ctrlblk.
+config RCU_CPU_STALL_DETECTOR
+ bool "Check for stalled CPUs delaying RCU grace periods"
+ depends on CLASSIC_RCU
+ default n
+ help
+ This option causes RCU to printk information on which
+ CPUs are delaying the current grace period, but only when
+ the grace period extends for excessive time periods.
+
+ Say Y if you want RCU to perform such checks.
+
+ Say N if you are unsure.
+
config KPROBES_SANITY_TEST
bool "Kprobes sanity tests"
depends on DEBUG_KERNEL
next reply other threads:[~2008-10-02 0:38 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-10-02 0:36 Paul E. McKenney [this message]
2008-10-02 8:07 ` [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU Ingo Molnar
2008-10-02 14:05 ` Paul E. McKenney
2008-10-02 15:10 ` Mathieu Desnoyers
2008-10-02 17:43 ` Paul E. McKenney
2008-10-02 18:51 ` [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU (LTTng support) Mathieu Desnoyers
2008-10-02 22:55 ` Paul E. McKenney
2008-10-02 23:06 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney
2008-10-03 4:41 ` Mathieu Desnoyers
2008-10-03 5:02 ` Paul E. McKenney
2008-10-03 8:39 ` Ingo Molnar
2008-10-03 8:42 ` [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix Ingo Molnar
2008-10-03 15:14 ` Paul E. McKenney
2008-10-03 15:13 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney
2008-10-02 17:39 ` [PATCH,RFC] " Paul E. McKenney
2008-10-03 10:12 ` Lai Jiangshan
2008-10-03 15:12 ` Paul E. McKenney
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20081002003628.GA7877@linux.vnet.ibm.com \
--to=paulmck@linux.vnet.ibm.com \
--cc=akpm@linux-foundation.org \
--cc=dipankar@in.ibm.com \
--cc=dvhltc@us.ibm.com \
--cc=josht@linux.vnet.ibm.com \
--cc=laijs@cn.fujitsu.com \
--cc=linux-kernel@vger.kernel.org \
--cc=manfred@colorfullife.com \
--cc=mathieu.desnoyers@polymtl.ca \
--cc=mingo@elte.hu \
--cc=niv@us.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox