* [PATCH RFC tip/core/rcu 2/5] rcu: Provide cond_resched_rcu_qs() to force quiescent states in long loops
From: Paul E. McKenney @ 2014-06-20 18:33 UTC
To: linux-kernel
Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
peterz, rostedt, dhowells, edumazet, dvhart, fweisbec, oleg, sbw,
Paul E. McKenney, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Back in the Good Olde Days, the kernel entered the scheduler on every
CPU periodically, whether anyone needed it or not. This was of course
wasteful and inefficient, so recent kernels have discontinued
this practice. However, this means that a given CPU might execute
for a very long time in the kernel without entering the scheduler,
and thus for a very long time without RCU quiescent states. This has
in fact happened on systems with unusual numbers of CPUs, open files,
and disk drives, and also on systems with large quantities of main memory.
Fortunately, each of the kernel's more than 500 calls to cond_resched()
can serve as an RCU quiescent state. Unfortunately, there appears to
be some desire to make cond_resched() a no-op for PREEMPT=y builds.
Therefore, this commit creates cond_resched_rcu_qs(), which acts as
cond_resched() except that it also supplies RCU with a quiescent state
when RCU needs one. In the very common case where RCU does not need
a quiescent state, cond_resched_rcu_qs() adds only a test of a single
per-CPU variable. Note that cond_resched_rcu_qs() is implemented as a
macro rather than an inline function to avoid include-file issues.
This commit also applies cond_resched_rcu_qs() in a few places known
to need it, should cond_resched() not provide RCU grace periods.
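For illustration, the intended usage pattern is a long-running in-kernel
loop along the following lines (the loop body here is hypothetical):

	for (i = 0; i < nr_items; i++) {
		process_item(i);	/* Sleepable context, outside of any
					 * RCU read-side critical section. */
		cond_resched_rcu_qs();	/* Maybe reschedule, and also report
					 * a quiescent state if RCU has been
					 * waiting on this CPU for a while. */
	}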
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
fs/file.c | 2 +-
include/linux/rcutiny.h | 4 ++++
include/linux/rcutree.h | 13 +++++++++++++
kernel/rcu/rcutorture.c | 4 ++--
kernel/rcu/tree.c | 12 ++++++------
kernel/rcu/tree_plugin.h | 2 +-
mm/mlock.c | 2 +-
7 files changed, 28 insertions(+), 11 deletions(-)
diff --git a/fs/file.c b/fs/file.c
index 66923fe3176e..1cafc4c9275b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -367,7 +367,7 @@ static struct fdtable *close_files(struct files_struct * files)
struct file * file = xchg(&fdt->fd[i], NULL);
if (file) {
filp_close(file, files);
- cond_resched();
+ cond_resched_rcu_qs();
}
}
i++;
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ff2ede319890..968977da1803 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -83,6 +83,10 @@ static inline void rcu_note_context_switch(int cpu)
rcu_sched_qs(cpu);
}
+static inline void cond_resched_rcu_qs(void)
+{
+}
+
static inline bool rcu_should_resched(void)
{
return false;
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 16780fed7155..ca7d34027935 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -62,6 +62,19 @@ static inline void rcu_cond_resched(void)
rcu_resched();
}
+/**
+ * cond_resched_rcu_qs - Report potential quiescent states to RCU
+ *
+ * This macro resembles cond_resched(), except that it is defined to
+ * report potential quiescent states to RCU even if the cond_resched()
+ * machinery were to be shut off, as some advocate for PREEMPT kernels.
+ */
+#define cond_resched_rcu_qs() \
+do { \
+ rcu_cond_resched(); \
+ cond_resched(); \
+} while (0)
+
void synchronize_rcu_bh(void);
void synchronize_sched_expedited(void);
void synchronize_rcu_expedited(void);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7fa34f86e5ba..febe07062ac5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -667,7 +667,7 @@ static int rcu_torture_boost(void *arg)
}
call_rcu_time = jiffies;
}
- cond_resched();
+ cond_resched_rcu_qs();
stutter_wait("rcu_torture_boost");
if (torture_must_stop())
goto checkwait;
@@ -1019,7 +1019,7 @@ rcu_torture_reader(void *arg)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
- cond_resched();
+ cond_resched_rcu_qs();
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2cc72ce19ff6..8d1f45b41433 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1629,7 +1629,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
system_state == SYSTEM_RUNNING)
udelay(200);
#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
- cond_resched();
+ cond_resched_rcu_qs();
}
mutex_unlock(&rsp->onoff_mutex);
@@ -1718,7 +1718,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
- cond_resched();
+ cond_resched_rcu_qs();
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1767,7 +1767,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
- cond_resched();
+ cond_resched_rcu_qs();
flush_signals(current);
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
@@ -1810,10 +1810,10 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqsend"));
- cond_resched();
+ cond_resched_rcu_qs();
} else {
/* Deal with stray signal. */
- cond_resched();
+ cond_resched_rcu_qs();
flush_signals(current);
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
@@ -2414,7 +2414,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- cond_resched();
+ cond_resched_rcu_qs();
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 02ac0fb186b8..a86a363ea453 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1842,7 +1842,7 @@ static int rcu_oom_notify(struct notifier_block *self,
get_online_cpus();
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
- cond_resched();
+ cond_resched_rcu_qs();
}
put_online_cpus();
diff --git a/mm/mlock.c b/mm/mlock.c
index b1eb53634005..bc386a22d647 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -782,7 +782,7 @@ static int do_mlockall(int flags)
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
- cond_resched();
+ cond_resched_rcu_qs();
}
out:
return 0;
--
1.8.1.5
* [PATCH RFC tip/core/rcu 3/5] rcu: Add RCU_COND_RESCHED_QS for large systems
From: Paul E. McKenney @ 2014-06-20 18:33 UTC
To: linux-kernel
Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
peterz, rostedt, dhowells, edumazet, dvhart, fweisbec, oleg, sbw,
Paul E. McKenney, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
People with low-latency kernel-intensive workloads object to the
extra test of a per-CPU variable added to cond_resched(). This commit
therefore adds a Kconfig variable, RCU_COND_RESCHED_QS, that makes this
extra test optional. People who would like large systems to operate
reliably can enable this Kconfig variable, while people with low-latency
kernel-intensive workloads can leave it disabled. People setting defaults
for Linux distributions should choose wisely. ;-)
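For example, a configuration for a large system would simply contain:

	CONFIG_RCU_COND_RESCHED_QS=y

as is done for the TREE01 rcutorture scenario later in this series.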
Suggested-by: Christoph Lameter <cl@gentwo.org>
Suggested-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
include/linux/rcutree.h | 15 +++++++++++++--
init/Kconfig | 26 ++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index ca7d34027935..69ccd4163de2 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -56,12 +56,23 @@ static inline bool rcu_should_resched(void)
}
/* Report quiescent states to RCU if it is time to do so. */
-static inline void rcu_cond_resched(void)
+static inline void __rcu_cond_resched(void)
{
if (unlikely(rcu_should_resched()))
rcu_resched();
}
+/*
+ * Report quiescent states to RCU in kernels that are not configured
+ * for low-latency kernel-intensive workloads.
+ */
+static inline void rcu_cond_resched(void)
+{
+#ifdef CONFIG_RCU_COND_RESCHED_QS
+ __rcu_cond_resched();
+#endif /* #ifdef CONFIG_RCU_COND_RESCHED_QS */
+}
+
/**
* cond_resched_rcu_qs - Report potential quiescent states to RCU
*
@@ -71,7 +82,7 @@ static inline void rcu_cond_resched(void)
*/
#define cond_resched_rcu_qs() \
do { \
- rcu_cond_resched(); \
+ __rcu_cond_resched(); \
cond_resched(); \
} while (0)
diff --git a/init/Kconfig b/init/Kconfig
index 9d76b99af1b9..6457a7f1f0be 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -781,6 +781,32 @@ config RCU_NOCB_CPU_ALL
endchoice
+config RCU_COND_RESCHED_QS
+ bool "Use cond_resched() calls as RCU quiescent states"
+ depends on TREE_RCU || TREE_PREEMPT_RCU
+ default n
+ help
+ Use this option to allow RCU grace periods to make progress
+ on large systems that can execute for extended time periods
+ in the kernel. Workloads that contain processes with large
+ numbers of open files, systems with large quantities of
+ memory, workloads that do mm-related system calls on large
+ regions, and systems with large numbers of mass-storage
+ devices are particularly prone to this behavior. Without
+ this option set, RCU CPU stall warnings or even OOMs can
+ result.
+
+ This option causes cond_resched() to check to see if the
+ current RCU grace period has been waiting for too long for
+ this CPU, and to emulate a zero-duration dyntick-idle
+ period if so. RCU's grace-period kthreads will then note
+ this dyntick-idle period and report a quiescent state to
+ the RCU core on this CPU's behalf, thus avoiding both RCU
+ CPU stall warnings and OOMs.
+
+ Say Y here for reliable operation on large systems.
+ Say N here for kernel-intensive low-latency workloads.
+
endmenu # "RCU Subsystem"
config IKCONFIG
--
1.8.1.5
* [PATCH RFC tip/core/rcu 4/5] rcutorture: Suppress spurious RCU CPU stall warnings
From: Paul E. McKenney @ 2014-06-20 18:33 UTC
To: linux-kernel
Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
peterz, rostedt, dhowells, edumazet, dvhart, fweisbec, oleg, sbw,
Paul E. McKenney, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Because rcutorture runs CPU-bound in the kernel, it can prevent grace
periods from being reported. In RCU_COND_RESCHED_QS=y builds, the
cond_resched() calls handle this reporting. This commit therefore
forces RCU_COND_RESCHED_QS=y for the TREE01 rcutorture configuration,
which is otherwise prone to spurious RCU CPU stall warnings.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
tools/testing/selftests/rcutorture/configs/rcu/TREE01 | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
index 9c827ec59a97..82e70bc38125 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -4,6 +4,7 @@ CONFIG_PREEMPT_NONE=n
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=y
#CHECK#CONFIG_TREE_PREEMPT_RCU=y
+CONFIG_RCU_COND_RESCHED_QS=y
CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
--
1.8.1.5
* [PATCH RFC tip/core/rcu 5/5] rcu: Add boot/sysfs control for RCU cond_resched() help solicitation
From: Paul E. McKenney @ 2014-06-20 18:33 UTC
To: linux-kernel
Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
peterz, rostedt, dhowells, edumazet, dvhart, fweisbec, oleg, sbw,
Paul E. McKenney, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
This commit replaces the hard-coded seven-jiffy cond_resched()
quiescent-state solicitation delay with a delay that can be set via
the rcutree.jiffies_till_cond_resched_qs boot parameter, or of course
via sysfs. This change was inspired in part by Dave Hansen's time-based
approach: https://lkml.org/lkml/2014/6/17/746.
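For example, assuming the usual module_param() sysfs layout for
kernel/rcu/tree.c:

	rcutree.jiffies_till_cond_resched_qs=20	# kernel boot parameter

	echo 20 > /sys/module/rcutree/parameters/jiffies_till_cond_resched_qs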
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
Documentation/kernel-parameters.txt | 7 +++++++
kernel/rcu/tree.c | 24 ++++++++++++++++++++++--
2 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6eaa9cdb7094..1b553596e933 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2785,6 +2785,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
leaf rcu_node structure. Useful for very large
systems.
+ rcutree.jiffies_till_cond_resched_qs= [KNL]
+ Set required age in jiffies for a given
+ grace period before RCU starts soliciting
+ quiescent-state help from cond_resched_rcu_qs(),
+ and, if CONFIG_RCU_COND_RESCHED_QS=y, also from
+ cond_resched() itself.
+
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8d1f45b41433..d88f6a10d971 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -295,6 +295,14 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from cond_resched_rcu_qs() and, if
+ * CONFIG_RCU_COND_RESCHED_QS=y, also from cond_resched() itself.
+ */
+static ulong jiffies_till_cond_resched_qs = 10;
+module_param(jiffies_till_cond_resched_qs, ulong, 0644);
+
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp,
@@ -951,9 +959,21 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* even context-switching back and forth between a pair of
* in-kernel CPU-bound tasks cannot advance grace periods.
* So if the grace period is old enough, make the CPU pay attention.
+ * Note that the unsynchronized assignments to the per-CPU
+ * rcu_cond_resched_mask variable are safe. Yes, setting of
+ * bits can be lost, but they will be set again on the next
+ * force-quiescent-state pass. So lost bit sets do not result
+ * in incorrect behavior, merely in a grace period lasting
+ * a few jiffies longer than it might otherwise. Because
+ * there are at most four threads involved, and because the
+ * updates are only once every few jiffies, the probability of
+ * lossage (and thus of slight grace-period extension) is
+ * quite low.
*/
- if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
- rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
+ rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
+ if (ULONG_CMP_GE(jiffies,
+ rdp->rsp->gp_start + jiffies_till_cond_resched_qs) &&
+ !(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
ACCESS_ONCE(rdp->cond_resched_completed) =
ACCESS_ONCE(rdp->mynode->completed);
smp_mb(); /* ->cond_resched_completed before *rcrmp. */
--
1.8.1.5
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
From: Oleg Nesterov @ 2014-06-23 16:43 UTC
To: Paul E. McKenney
Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
josh, niv, tglx, peterz, rostedt, dhowells, edumazet, dvhart,
fweisbec, sbw, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
On 06/20, Paul E. McKenney wrote:
>
> This commit takes a different approach to fixing this bug, mainly by
> avoiding having cond_resched() do an RCU-visible quiescent state unless
> there is a grace period that has been in flight for a significant period
> of time. This commit also reduces the common-case cond_resched() overhead
> to a check of a single per-CPU variable.
I can't say I fully understand this change, but I think it is fine.
Just a really stupid question below.
> +void rcu_resched(void)
> +{
> + unsigned long flags;
> + struct rcu_data *rdp;
> + struct rcu_dynticks *rdtp;
> + int resched_mask;
> + struct rcu_state *rsp;
> +
> + local_irq_save(flags);
> +
> + /*
> + * Yes, we can lose flag-setting operations. This is OK, because
> + * the flag will be set again after some delay.
> + */
> + resched_mask = raw_cpu_read(rcu_cond_resched_mask);
> + raw_cpu_write(rcu_cond_resched_mask, 0);
> +
> + /* Find the flavor that needs a quiescent state. */
> + for_each_rcu_flavor(rsp) {
> + rdp = raw_cpu_ptr(rsp->rda);
> + if (!(resched_mask & rsp->flavor_mask))
> + continue;
> + smp_mb(); /* ->flavor_mask before ->cond_resched_completed. */
> + if (ACCESS_ONCE(rdp->mynode->completed) !=
> + ACCESS_ONCE(rdp->cond_resched_completed))
> + continue;
Probably the comment above mb() meant "rcu_cond_resched_mask before
->cond_resched_completed"? Otherwise I can't see why we need any
barrier.
> @@ -893,13 +946,20 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> }
>
> /*
> - * There is a possibility that a CPU in adaptive-ticks state
> - * might run in the kernel with the scheduling-clock tick disabled
> - * for an extended time period. Invoke rcu_kick_nohz_cpu() to
> - * force the CPU to restart the scheduling-clock tick in this
> - * CPU is in this state.
> + * A CPU running for an extended time within the kernel can
> + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
> + * even context-switching back and forth between a pair of
> + * in-kernel CPU-bound tasks cannot advance grace periods.
> + * So if the grace period is old enough, make the CPU pay attention.
> */
> - rcu_kick_nohz_cpu(rdp->cpu);
> + if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
> + rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
> + ACCESS_ONCE(rdp->cond_resched_completed) =
> + ACCESS_ONCE(rdp->mynode->completed);
> + smp_mb(); /* ->cond_resched_completed before *rcrmp. */
> + ACCESS_ONCE(*rcrmp) =
> + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
> + }
OK, in this case I guess we need a full barrier because we need to read
rcu_cond_resched_mask before updating it...
But, I am just curious, is there any reason to use ACCESS_ONCE() twice?
ACCESS_ONCE(*rcrmp) |= rdp->rsp->flavor_mask;
or even
ACCESS_ONCE(per_cpu(rcu_cond_resched_mask, rdp->cpu)) |=
rdp->rsp->flavor_mask;
should equally work, or ACCESS_ONCE() can't be used to RMW ?
(and in fact at least the 2nd ACCESS_ONCE() (load) looks unnecessary anyway
because of smp_mb() above).
Once again, of course I am not arguing if there is no "real" reason and you
just prefer it this way. But the kernel has more and more ACCESS_ONCE() users
and sometimes I simply do not understand why it is needed. For example,
cyc2ns_write_end().
Or even INIT_LIST_HEAD_RCU(). The comment in list_splice_init_rcu() says:
/*
* "first" and "last" tracking list, so initialize it. RCU readers
* have access to this list, so we must use INIT_LIST_HEAD_RCU()
* instead of INIT_LIST_HEAD().
*/
INIT_LIST_HEAD_RCU(list);
but we are going to call synchronize_rcu() or something similar, this should
act as compiler barrier too?
Oleg.
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
From: Paul E. McKenney @ 2014-06-23 17:36 UTC
To: Oleg Nesterov
Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
josh, niv, tglx, peterz, rostedt, dhowells, edumazet, dvhart,
fweisbec, sbw, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
On Mon, Jun 23, 2014 at 06:43:21PM +0200, Oleg Nesterov wrote:
> On 06/20, Paul E. McKenney wrote:
> >
> > This commit takes a different approach to fixing this bug, mainly by
> > avoiding having cond_resched() do an RCU-visible quiescent state unless
> > there is a grace period that has been in flight for a significant period
> > of time. This commit also reduces the common-case cond_resched() overhead
> > to a check of a single per-CPU variable.
>
> I can't say I fully understand this change, but I think it is fine.
> Just a really stupid question below.
>
> > +void rcu_resched(void)
> > +{
> > + unsigned long flags;
> > + struct rcu_data *rdp;
> > + struct rcu_dynticks *rdtp;
> > + int resched_mask;
> > + struct rcu_state *rsp;
> > +
> > + local_irq_save(flags);
> > +
> > + /*
> > + * Yes, we can lose flag-setting operations. This is OK, because
> > + * the flag will be set again after some delay.
> > + */
> > + resched_mask = raw_cpu_read(rcu_cond_resched_mask);
> > + raw_cpu_write(rcu_cond_resched_mask, 0);
> > +
> > + /* Find the flavor that needs a quiescent state. */
> > + for_each_rcu_flavor(rsp) {
> > + rdp = raw_cpu_ptr(rsp->rda);
> > + if (!(resched_mask & rsp->flavor_mask))
> > + continue;
> > + smp_mb(); /* ->flavor_mask before ->cond_resched_completed. */
> > + if (ACCESS_ONCE(rdp->mynode->completed) !=
> > + ACCESS_ONCE(rdp->cond_resched_completed))
> > + continue;
>
> Probably the comment above mb() meant "rcu_cond_resched_mask before
> ->cond_resched_completed"? Otherwise I can't see why we need any
> barrier.
You are absolutely right, changed as suggested.
> > @@ -893,13 +946,20 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> > }
> >
> > /*
> > - * There is a possibility that a CPU in adaptive-ticks state
> > - * might run in the kernel with the scheduling-clock tick disabled
> > - * for an extended time period. Invoke rcu_kick_nohz_cpu() to
> > - * force the CPU to restart the scheduling-clock tick in this
> > - * CPU is in this state.
> > + * A CPU running for an extended time within the kernel can
> > + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
> > + * even context-switching back and forth between a pair of
> > + * in-kernel CPU-bound tasks cannot advance grace periods.
> > + * So if the grace period is old enough, make the CPU pay attention.
> > */
> > - rcu_kick_nohz_cpu(rdp->cpu);
> > + if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
> > + rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
> > + ACCESS_ONCE(rdp->cond_resched_completed) =
> > + ACCESS_ONCE(rdp->mynode->completed);
> > + smp_mb(); /* ->cond_resched_completed before *rcrmp. */
> > + ACCESS_ONCE(*rcrmp) =
> > + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
> > + }
>
> OK, in this case I guess we need a full barrier because we need to read
> rcu_cond_resched_mask before updating it...
>
> But, I am just curious, is there any reason to use ACCESS_ONCE() twice?
>
> ACCESS_ONCE(*rcrmp) |= rdp->rsp->flavor_mask;
>
> or even
>
> ACCESS_ONCE(per_cpu(rcu_cond_resched_mask, rdp->cpu)) |=
> rdp->rsp->flavor_mask;
>
> should equally work, or ACCESS_ONCE() can't be used to RMW ?
It can be, but Linus doesn't like it to be. I recently changed all of
the RMW ACCESS_ONCE() calls as a result. One of the reasons for avoiding
RMW ACCESS_ONCE() is that language features that might one day replace
ACCESS_ONCE() do not support RMW use.
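For concreteness, the two idioms in question look like this, using the
variables from the patch (sketch only):

	/* RMW form, now avoided: */
	ACCESS_ONCE(*rcrmp) |= rdp->rsp->flavor_mask;

	/* Separate volatile load and volatile store, as in the patch: */
	ACCESS_ONCE(*rcrmp) = ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;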
> (and in fact at least the 2nd ACCESS_ONCE() (load) looks unnecessary anyway
> because of smp_mb() above).
It is unlikely, but without ACCESS_ONCE() some misbegotten compiler could
split the load and still claim to be conforming to the standard. :-(
(This is called "load tearing" by the standards guys.)
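As a hypothetical illustration of tearing, a compiler may in principle
split a plain store of a 32-bit constant into a pair of 16-bit immediate
stores on some architectures:

	shared = 0x12345678;			/* Source code. */

	/* Possible conforming compiler output: */
	*(u16 *)&shared = 0x5678;
	*((u16 *)&shared + 1) = 0x1234;		/* Reader can see a half-
						 * written value in between. */

In practice, compilers do not tear volatile accesses of machine-word
size, which is what the cast in ACCESS_ONCE() relies on.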
> Once again, of course I am not arguing if there is no "real" reason and you
> just prefer it this way. But the kernel has more and more ACCESS_ONCE() users
> and sometimes I simply do not understand why it is needed. For example,
> cyc2ns_write_end().
Could be concern about store tearing.
> Or even INIT_LIST_HEAD_RCU(). The comment in list_splice_init_rcu() says:
>
> /*
> * "first" and "last" tracking list, so initialize it. RCU readers
> * have access to this list, so we must use INIT_LIST_HEAD_RCU()
> * instead of INIT_LIST_HEAD().
> */
>
> INIT_LIST_HEAD_RCU(list);
>
> but we are going to call synchronize_rcu() or something similar, this should
> act as compiler barrier too?
Indeed, synchronize_rcu() enforces a barrier on each CPU between
any prior and subsequent accesses to RCU-protected data by that CPU.
(Which means that CPUs that would otherwise sleep through the entire
grace period can continue sleeping, given that they are not accessing
any RCU-protected data while sleeping.) I would guess load-tearing
or store-tearing concerns.
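(For reference, INIT_LIST_HEAD_RCU() differs from INIT_LIST_HEAD() only
in making the two pointer stores volatile, something like:

	static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
	{
		ACCESS_ONCE(list->next) = list;
		ACCESS_ONCE(list->prev) = list;
	}

so that a concurrent reader sees each pointer written by a single store
rather than a possibly torn one.)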
Thanx, Paul
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
From: Oleg Nesterov @ 2014-06-23 18:35 UTC
To: Paul E. McKenney
Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
josh, niv, tglx, peterz, rostedt, dhowells, edumazet, dvhart,
fweisbec, sbw, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
On 06/23, Paul E. McKenney wrote:
>
> On Mon, Jun 23, 2014 at 06:43:21PM +0200, Oleg Nesterov wrote:
> >
> > should equally work, or ACCESS_ONCE() can't be used to RMW ?
>
> It can be, but Linus doesn't like it to be. I recently changed all of
> the RMW ACCESS_ONCE() calls as a result. One of the reasons for avoiding
> RMW ACCESS_ONCE() is that language features that might one day replace
> ACCESS_ONCE() do not support RMW use.
OK, thanks.
> > Or even INIT_LIST_HEAD_RCU(). The comment in list_splice_init_rcu() says:
> >
> > /*
> > * "first" and "last" tracking list, so initialize it. RCU readers
> > * have access to this list, so we must use INIT_LIST_HEAD_RCU()
> > * instead of INIT_LIST_HEAD().
> > */
> >
> > INIT_LIST_HEAD_RCU(list);
> >
> > but we are going to call synchronize_rcu() or something similar, this should
> > act as compiler barrier too?
>
> Indeed, synchronize_rcu() enforces a barrier on each CPU between
> any prior and subsequent accesses to RCU-protected data by that CPU.
> (Which means that CPUs that would otherwise sleep through the entire
> grace period can continue sleeping, given that it is not accessing
> any RCU-protected data while sleeping.) I would guess load-tearing
> or store-tearing concerns.
But the kernel depends on the fact that "long" should be updated atomically,
and the concurrent reader should see the old-or-new value without any tricks.
Perhaps we should add ACCESS_ONCE_PARANOID_FOR_COMPILER(). Otherwise when
you read the code it is not always clear why it uses ACCESS_ONCE(), and
sometimes this looks as if you simply do not understand it. Or at least a
/* not really needed but gcc can have bugs */ could help in these cases.
Oleg.
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
From: Paul E. McKenney @ 2014-06-24 0:18 UTC
To: Oleg Nesterov
Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
josh, niv, tglx, peterz, rostedt, dhowells, edumazet, dvhart,
fweisbec, sbw, Andi Kleen, Christoph Lameter, Mike Galbraith,
Eric Dumazet
On Mon, Jun 23, 2014 at 08:35:27PM +0200, Oleg Nesterov wrote:
> On 06/23, Paul E. McKenney wrote:
> >
> > On Mon, Jun 23, 2014 at 06:43:21PM +0200, Oleg Nesterov wrote:
> > >
> > > should equally work, or ACCESS_ONCE() can't be used to RMW ?
> >
> > It can be, but Linus doesn't like it to be. I recently changed all of
> > the RMW ACCESS_ONCE() calls as a result. One of the reasons for avoiding
> > RMW ACCESS_ONCE() is that language features that might one day replace
> > ACCESS_ONCE() do not support RMW use.
>
> OK, thanks.
>
> > > Or even INIT_LIST_HEAD_RCU(). The comment in list_splice_init_rcu() says:
> > >
> > > /*
> > > * "first" and "last" tracking list, so initialize it. RCU readers
> > > * have access to this list, so we must use INIT_LIST_HEAD_RCU()
> > > * instead of INIT_LIST_HEAD().
> > > */
> > >
> > > INIT_LIST_HEAD_RCU(list);
> > >
> > > but we are going to call synchronize_rcu() or something similar, this should
> > > act as compiler barrier too?
> >
> > Indeed, synchronize_rcu() enforces a barrier on each CPU between
> > any prior and subsequent accesses to RCU-protected data by that CPU.
> > (Which means that CPUs that would otherwise sleep through the entire
> grace period can continue sleeping, given that they are not accessing
> > any RCU-protected data while sleeping.) I would guess load-tearing
> > or store-tearing concerns.
>
> But the kernel depends on the fact that "long" should be updated atomically,
> and the concurrent reader should see the old-or-new value without any tricks.
>
> Perhaps we should add ACCESS_ONCE_PARANOID_FOR_COMPILER(). Otherwise when
> you read the code it is not always clear why it uses ACCESS_ONCE(), and
> sometimes this looks as if you simply do not understand it. Or at least a
> /* not really needed but gcc can have bugs */ could help in these cases.
I am a bit reluctant to add variants of ACCESS_ONCE(), but perhaps
comments about exactly what the ACCESS_ONCE() is preventing in the more
paranoid cases would be a good thing. My fear is that the comments will
just be copy/pasted with the ACCESS_ONCE wrappers. ;-)
Thanx, Paul
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
From: Pranith Kumar @ 2014-07-22 4:35 UTC
To: Paul E. McKenney, Josh Triplett, LKML
Hi Paul,
I was going through this code and found a few inconsistencies. I git blamed it
and found that it was this recent commit and thought I could ask a few
questions. I am dropping the CC's, as I am not sure they are needed since it is pretty late.
Please find a few questions below:
On 06/20/2014 02:33 PM, Paul E. McKenney wrote:
> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
>
> Commit ac1bea85781e (Make cond_resched() report RCU quiescent states)
> fixed a problem where a CPU looping in the kernel with but one runnable
> task would give RCU CPU stall warnings, even if the in-kernel loop
> contained cond_resched() calls. Unfortunately, in so doing, it introduced
> performance regressions in Anton Blanchard's will-it-scale "open1" test.
> The problem appears to be not so much the increased cond_resched() path
> length as an increase in the rate at which grace periods complete, which
> increased per-update grace-period overhead.
>
> This commit takes a different approach to fixing this bug, mainly by
> avoiding having cond_resched() do an RCU-visible quiescent state unless
> there is a grace period that has been in flight for a significant period
> of time. This commit also reduces the common-case cond_resched() overhead
> to a check of a single per-CPU variable.
>
<snip>
> index f1ba77363fbb..2cc72ce19ff6 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -229,6 +229,58 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
> #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> };
>
> +/*
> + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
> + */
> +
> +DEFINE_PER_CPU(int, rcu_cond_resched_mask);
> +
> +/*
> + * Let the RCU core know that this CPU has gone through a cond_resched(),
> + * which is a quiescent state.
> + */
> +void rcu_resched(void)
> +{
> + unsigned long flags;
> + struct rcu_data *rdp;
> + struct rcu_dynticks *rdtp;
> + int resched_mask;
> + struct rcu_state *rsp;
> +
> + local_irq_save(flags);
> +
> + /*
> + * Yes, we can lose flag-setting operations. This is OK, because
> + * the flag will be set again after some delay.
> + */
> + resched_mask = raw_cpu_read(rcu_cond_resched_mask);
> + raw_cpu_write(rcu_cond_resched_mask, 0);
> +
> + /* Find the flavor that needs a quiescent state. */
> + for_each_rcu_flavor(rsp) {
> + rdp = raw_cpu_ptr(rsp->rda);
> + if (!(resched_mask & rsp->flavor_mask))
> + continue;
Here both resched_mask and flavor_mask are not being updated within the loop.
Are they supposed to be? It is really not clear what flavor_mask is doing in the
code.
> + smp_mb(); /* ->flavor_mask before ->cond_resched_completed. */
> + if (ACCESS_ONCE(rdp->mynode->completed) !=
> + ACCESS_ONCE(rdp->cond_resched_completed))
> + continue;
> +
> + /*
> + * Pretend to be momentarily idle for the quiescent state.
> + * This allows the grace-period kthread to record the
> + * quiescent state, with no need for this CPU to do anything
> + * further.
> + */
> + rdtp = this_cpu_ptr(&rcu_dynticks);
> + smp_mb__before_atomic(); /* Earlier stuff before QS. */
> + atomic_add(2, &rdtp->dynticks); /* QS. */
> + smp_mb__after_atomic(); /* Later stuff after QS. */
> + break;
> + }
> + local_irq_restore(flags);
> +}
> +
> static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
> static long qhimark = 10000; /* If this many pending, ignore blimit. */
> static long qlowmark = 100; /* Once only this many pending, use blimit. */
> @@ -853,6 +905,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> bool *isidle, unsigned long *maxj)
> {
> unsigned int curr;
> + int *rcrmp;
> unsigned int snap;
>
> curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
> @@ -893,13 +946,20 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> }
>
> /*
> - * There is a possibility that a CPU in adaptive-ticks state
> - * might run in the kernel with the scheduling-clock tick disabled
> - * for an extended time period. Invoke rcu_kick_nohz_cpu() to
> - * force the CPU to restart the scheduling-clock tick in this
> - * CPU is in this state.
> + * A CPU running for an extended time within the kernel can
> + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
> + * even context-switching back and forth between a pair of
> + * in-kernel CPU-bound tasks cannot advance grace periods.
> + * So if the grace period is old enough, make the CPU pay attention.
> */
> - rcu_kick_nohz_cpu(rdp->cpu);
> + if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
> + rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
> + ACCESS_ONCE(rdp->cond_resched_completed) =
> + ACCESS_ONCE(rdp->mynode->completed);
> + smp_mb(); /* ->cond_resched_completed before *rcrmp. */
> + ACCESS_ONCE(*rcrmp) =
> + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
> + }
>
> /*
> * Alternatively, the CPU might be running in the kernel
> @@ -3491,6 +3551,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
> "rcu_node_fqs_1",
> "rcu_node_fqs_2",
> "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
> + static u8 fl_mask = 0x1;
What does 0x1 mean here? Is it for a particular flavor? This could use a
comment.
> int cpustride = 1;
> int i;
> int j;
> @@ -3509,6 +3570,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
> for (i = 1; i < rcu_num_lvls; i++)
> rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
> rcu_init_levelspread(rsp);
> + rsp->flavor_mask = fl_mask;
> + fl_mask <<= 1;
Something looks off here. fl_mask is not being used after this. Was it supposed
to be used or is it just a stray statement?
The flavor_mask operations could really use some comments as it is not really
clear what is being achieved by that.
--
Pranith
>
> /* Initialize the elements themselves, starting from the leaves. */
>
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index bf2c1e669691..0f69a79c5b7d 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -307,6 +307,9 @@ struct rcu_data {
> /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
> unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
> unsigned long offline_fqs; /* Kicked due to being offline. */
> + unsigned long cond_resched_completed;
> + /* Grace period that needs help */
> + /* from cond_resched(). */
>
> /* 5) __rcu_pending() statistics. */
> unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
> @@ -392,6 +395,7 @@ struct rcu_state {
> struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
> u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
> u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
> + u8 flavor_mask; /* bit in flavor mask. */
> struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
> void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
> void (*func)(struct rcu_head *head));
> @@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
> static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
> static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
> static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
> -static void rcu_kick_nohz_cpu(int cpu);
> +static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
> static bool init_nocb_callback_list(struct rcu_data *rdp);
> static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
> static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index cbc2c45265e2..02ac0fb186b8 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
> * if an adaptive-ticks CPU is failing to respond to the current grace
> * period and has not be idle from an RCU perspective, kick it.
> */
> -static void rcu_kick_nohz_cpu(int cpu)
> +static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
> {
> #ifdef CONFIG_NO_HZ_FULL
> if (tick_nohz_full_cpu(cpu))
> diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
> index a2aeb4df0f60..d22309cae9f5 100644
> --- a/kernel/rcu/update.c
> +++ b/kernel/rcu/update.c
> @@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
> early_initcall(check_cpu_stall_init);
>
> #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
> -
> -/*
> - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
> - */
> -
> -DEFINE_PER_CPU(int, rcu_cond_resched_count);
> -
> -/*
> - * Report a set of RCU quiescent states, for use by cond_resched()
> - * and friends. Out of line due to being called infrequently.
> - */
> -void rcu_resched(void)
> -{
> - preempt_disable();
> - __this_cpu_write(rcu_cond_resched_count, 0);
> - rcu_note_context_switch(smp_processor_id());
> - preempt_enable();
> -}
>
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
From: Pranith Kumar @ 2014-07-22 4:52 UTC
To: Paul E. McKenney, Josh Triplett, LKML
Doh! I figured it out *after* I sent out the mail. Sorry for the noise!
On Tue, Jul 22, 2014 at 12:35 AM, Pranith Kumar <bobby.prani@gmail.com> wrote:
> Hi Paul,
>
> I was going through this code and found a few inconsistencies. I git blamed it
> and found that it was this recent commit and thought I could ask a few
> questions. I am dropping the CC's, as I am not sure they are needed since it is pretty late.
>
> Please find a few questions below:
>
> On 06/20/2014 02:33 PM, Paul E. McKenney wrote:
>> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
>>
>> Commit ac1bea85781e (Make cond_resched() report RCU quiescent states)
>> fixed a problem where a CPU looping in the kernel with but one runnable
>> task would give RCU CPU stall warnings, even if the in-kernel loop
>> contained cond_resched() calls. Unfortunately, in so doing, it introduced
>> performance regressions in Anton Blanchard's will-it-scale "open1" test.
>> The problem appears to be not so much the increased cond_resched() path
>> length as an increase in the rate at which grace periods complete, which
>> increased per-update grace-period overhead.
>>
>> This commit takes a different approach to fixing this bug, mainly by
>> avoiding having cond_resched() do an RCU-visible quiescent state unless
>> there is a grace period that has been in flight for a significant period
>> of time. This commit also reduces the common-case cond_resched() overhead
>> to a check of a single per-CPU variable.
>>
> <snip>
>> index f1ba77363fbb..2cc72ce19ff6 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -229,6 +229,58 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
>> #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
>> };
>>
>> +/*
>> + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
>> + */
>> +
>> +DEFINE_PER_CPU(int, rcu_cond_resched_mask);
>> +
>> +/*
>> + * Let the RCU core know that this CPU has gone through a cond_resched(),
>> + * which is a quiescent state.
>> + */
>> +void rcu_resched(void)
>> +{
>> + unsigned long flags;
>> + struct rcu_data *rdp;
>> + struct rcu_dynticks *rdtp;
>> + int resched_mask;
>> + struct rcu_state *rsp;
>> +
>> + local_irq_save(flags);
>> +
>> + /*
>> + * Yes, we can lose flag-setting operations. This is OK, because
>> + * the flag will be set again after some delay.
>> + */
>> + resched_mask = raw_cpu_read(rcu_cond_resched_mask);
>> + raw_cpu_write(rcu_cond_resched_mask, 0);
>> +
>> + /* Find the flavor that needs a quiescent state. */
>> + for_each_rcu_flavor(rsp) {
>> + rdp = raw_cpu_ptr(rsp->rda);
>> + if (!(resched_mask & rsp->flavor_mask))
>> + continue;
>
> Here both resched_mask and flavor_mask are not being updated within the loop.
> Are they supposed to be? It is really not clear what flavor_mask is doing in the
> code.
>
>
>> + smp_mb(); /* ->flavor_mask before ->cond_resched_completed. */
>> + if (ACCESS_ONCE(rdp->mynode->completed) !=
>> + ACCESS_ONCE(rdp->cond_resched_completed))
>> + continue;
>> +
>> + /*
>> + * Pretend to be momentarily idle for the quiescent state.
>> + * This allows the grace-period kthread to record the
>> + * quiescent state, with no need for this CPU to do anything
>> + * further.
>> + */
>> + rdtp = this_cpu_ptr(&rcu_dynticks);
>> + smp_mb__before_atomic(); /* Earlier stuff before QS. */
>> + atomic_add(2, &rdtp->dynticks); /* QS. */
>> + smp_mb__after_atomic(); /* Later stuff after QS. */
>> + break;
>> + }
>> + local_irq_restore(flags);
>> +}
>> +
>> static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
>> static long qhimark = 10000; /* If this many pending, ignore blimit. */
>> static long qlowmark = 100; /* Once only this many pending, use blimit. */
>> @@ -853,6 +905,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
>> bool *isidle, unsigned long *maxj)
>> {
>> unsigned int curr;
>> + int *rcrmp;
>> unsigned int snap;
>>
>> curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
>> @@ -893,13 +946,20 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
>> }
>>
>> /*
>> - * There is a possibility that a CPU in adaptive-ticks state
>> - * might run in the kernel with the scheduling-clock tick disabled
>> - * for an extended time period. Invoke rcu_kick_nohz_cpu() to
>> - * force the CPU to restart the scheduling-clock tick in this
>> - * CPU is in this state.
>> + * A CPU running for an extended time within the kernel can
>> + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
>> + * even context-switching back and forth between a pair of
>> + * in-kernel CPU-bound tasks cannot advance grace periods.
>> + * So if the grace period is old enough, make the CPU pay attention.
>> */
>> - rcu_kick_nohz_cpu(rdp->cpu);
>> + if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
>> + rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
>> + ACCESS_ONCE(rdp->cond_resched_completed) =
>> + ACCESS_ONCE(rdp->mynode->completed);
>> + smp_mb(); /* ->cond_resched_completed before *rcrmp. */
>> + ACCESS_ONCE(*rcrmp) =
>> + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
>> + }
>>
>> /*
>> * Alternatively, the CPU might be running in the kernel
>> @@ -3491,6 +3551,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
>> "rcu_node_fqs_1",
>> "rcu_node_fqs_2",
>> "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
>> + static u8 fl_mask = 0x1;
>
> What does 0x1 mean here? Is it for a particular flavor? This could use a
> comment.
>
>> int cpustride = 1;
>> int i;
>> int j;
>> @@ -3509,6 +3570,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
>> for (i = 1; i < rcu_num_lvls; i++)
>> rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
>> rcu_init_levelspread(rsp);
>> + rsp->flavor_mask = fl_mask;
>> + fl_mask <<= 1;
>
> Something looks off here. fl_mask is not being used after this. Was it supposed
> to be used or is it just a stray statement?
>
> The flavor_mask operations could really use some comments as it is not really
> clear what is being achieved by that.
>
> --
> Pranith
>
>>
>> /* Initialize the elements themselves, starting from the leaves. */
>>
>> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
>> index bf2c1e669691..0f69a79c5b7d 100644
>> --- a/kernel/rcu/tree.h
>> +++ b/kernel/rcu/tree.h
>> @@ -307,6 +307,9 @@ struct rcu_data {
>> /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
>> unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
>> unsigned long offline_fqs; /* Kicked due to being offline. */
>> + unsigned long cond_resched_completed;
>> + /* Grace period that needs help */
>> + /* from cond_resched(). */
>>
>> /* 5) __rcu_pending() statistics. */
>> unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
>> @@ -392,6 +395,7 @@ struct rcu_state {
>> struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
>> u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
>> u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
>> + u8 flavor_mask; /* bit in flavor mask. */
>> struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
>> void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
>> void (*func)(struct rcu_head *head));
>> @@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
>> static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
>> static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
>> static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
>> -static void rcu_kick_nohz_cpu(int cpu);
>> +static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
>> static bool init_nocb_callback_list(struct rcu_data *rdp);
>> static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
>> static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
>> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
>> index cbc2c45265e2..02ac0fb186b8 100644
>> --- a/kernel/rcu/tree_plugin.h
>> +++ b/kernel/rcu/tree_plugin.h
>> @@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
>> * if an adaptive-ticks CPU is failing to respond to the current grace
>> * period and has not be idle from an RCU perspective, kick it.
>> */
>> -static void rcu_kick_nohz_cpu(int cpu)
>> +static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
>> {
>> #ifdef CONFIG_NO_HZ_FULL
>> if (tick_nohz_full_cpu(cpu))
>> diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
>> index a2aeb4df0f60..d22309cae9f5 100644
>> --- a/kernel/rcu/update.c
>> +++ b/kernel/rcu/update.c
>> @@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
>> early_initcall(check_cpu_stall_init);
>>
>> #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
>> -
>> -/*
>> - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
>> - */
>> -
>> -DEFINE_PER_CPU(int, rcu_cond_resched_count);
>> -
>> -/*
>> - * Report a set of RCU quiescent states, for use by cond_resched()
>> - * and friends. Out of line due to being called infrequently.
>> - */
>> -void rcu_resched(void)
>> -{
>> - preempt_disable();
>> - __this_cpu_write(rcu_cond_resched_count, 0);
>> - rcu_note_context_switch(smp_processor_id());
>> - preempt_enable();
>> -}
>>
>
--
Pranith
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
From: Paul E. McKenney @ 2014-07-22 11:07 UTC
To: Pranith Kumar; +Cc: Josh Triplett, LKML
On Tue, Jul 22, 2014 at 12:52:40AM -0400, Pranith Kumar wrote:
> Doh! I figured it out *after* I sent out the mail. Sorry for the noise!
I know that feeling! ;-)
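For anyone else who trips over the same spot: rcu_init_one() is invoked
once per RCU flavor (rcu_sched, rcu_bh, and, in TREE_PREEMPT_RCU builds,
rcu_preempt), and the static local fl_mask persists across those calls,
so each flavor's rcu_state gets a distinct bit, roughly:

	/* 1st call: rsp->flavor_mask = 0x1; fl_mask <<= 1 yields 0x2. */
	/* 2nd call: rsp->flavor_mask = 0x2; fl_mask <<= 1 yields 0x4. */
	/* 3rd call: rsp->flavor_mask = 0x4; fl_mask <<= 1 yields 0x8. */

rcu_resched() then tests that bit against the per-CPU
rcu_cond_resched_mask to find the flavor(s) needing a quiescent state.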
Thanx, Paul
> On Tue, Jul 22, 2014 at 12:35 AM, Pranith Kumar <bobby.prani@gmail.com> wrote:
> > Hi Paul,
> >
> > I was going through this code and found a few inconsistencies. I git blamed it
> > and found that it was this recent commit and thought I could ask a few
> > questions. I am dropping the CC's, as I am not sure they are needed since it is pretty late.
> >
> > Please find a few questions below:
> >
> > On 06/20/2014 02:33 PM, Paul E. McKenney wrote:
> >> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> >>
> >> Commit ac1bea85781e (Make cond_resched() report RCU quiescent states)
> >> fixed a problem where a CPU looping in the kernel with but one runnable
> >> task would give RCU CPU stall warnings, even if the in-kernel loop
> >> contained cond_resched() calls. Unfortunately, in so doing, it introduced
> >> performance regressions in Anton Blanchard's will-it-scale "open1" test.
> >> The problem appears to be not so much the increased cond_resched() path
> >> length as an increase in the rate at which grace periods complete, which
> >> increased per-update grace-period overhead.
> >>
> >> This commit takes a different approach to fixing this bug, mainly by
> >> avoiding having cond_resched() do an RCU-visible quiescent state unless
> >> there is a grace period that has been in flight for a significant period
> >> of time. This commit also reduces the common-case cond_resched() overhead
> >> to a check of a single per-CPU variable.
> >>
> > <snip>
> >> index f1ba77363fbb..2cc72ce19ff6 100644
> >> --- a/kernel/rcu/tree.c
> >> +++ b/kernel/rcu/tree.c
> >> @@ -229,6 +229,58 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
> >> #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> >> };
> >>
> >> +/*
> >> + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
> >> + */
> >> +
> >> +DEFINE_PER_CPU(int, rcu_cond_resched_mask);
> >> +
> >> +/*
> >> + * Let the RCU core know that this CPU has gone through a cond_resched(),
> >> + * which is a quiescent state.
> >> + */
> >> +void rcu_resched(void)
> >> +{
> >> + unsigned long flags;
> >> + struct rcu_data *rdp;
> >> + struct rcu_dynticks *rdtp;
> >> + int resched_mask;
> >> + struct rcu_state *rsp;
> >> +
> >> + local_irq_save(flags);
> >> +
> >> + /*
> >> + * Yes, we can lose flag-setting operations. This is OK, because
> >> + * the flag will be set again after some delay.
> >> + */
> >> + resched_mask = raw_cpu_read(rcu_cond_resched_mask);
> >> + raw_cpu_write(rcu_cond_resched_mask, 0);
> >> +
> >> + /* Find the flavor that needs a quiescent state. */
> >> + for_each_rcu_flavor(rsp) {
> >> + rdp = raw_cpu_ptr(rsp->rda);
> >> + if (!(resched_mask & rsp->flavor_mask))
> >> + continue;
> >
> > Here both resched_mask and flavor_mask are not being updated within the loop.
> > Are they supposed to be? It is really not clear what flavor_mask is doing in the
> > code.
> >
> >
> >> + smp_mb(); /* ->flavor_mask before ->cond_resched_completed. */
> >> + if (ACCESS_ONCE(rdp->mynode->completed) !=
> >> + ACCESS_ONCE(rdp->cond_resched_completed))
> >> + continue;
> >> +
> >> + /*
> >> + * Pretend to be momentarily idle for the quiescent state.
> >> + * This allows the grace-period kthread to record the
> >> + * quiescent state, with no need for this CPU to do anything
> >> + * further.
> >> + */
> >> + rdtp = this_cpu_ptr(&rcu_dynticks);
> >> + smp_mb__before_atomic(); /* Earlier stuff before QS. */
> >> + atomic_add(2, &rdtp->dynticks); /* QS. */
> >> + smp_mb__after_atomic(); /* Later stuff after QS. */
> >> + break;
> >> + }
> >> + local_irq_restore(flags);
> >> +}
> >> +
> >> static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
> >> static long qhimark = 10000; /* If this many pending, ignore blimit. */
> >> static long qlowmark = 100; /* Once only this many pending, use blimit. */
> >> @@ -853,6 +905,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> >> bool *isidle, unsigned long *maxj)
> >> {
> >> unsigned int curr;
> >> + int *rcrmp;
> >> unsigned int snap;
> >>
> >> curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
> >> @@ -893,13 +946,20 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> >> }
> >>
> >> /*
> >> - * There is a possibility that a CPU in adaptive-ticks state
> >> - * might run in the kernel with the scheduling-clock tick disabled
> >> - * for an extended time period. Invoke rcu_kick_nohz_cpu() to
> >> - * force the CPU to restart the scheduling-clock tick in this
> >> - * CPU is in this state.
> >> + * A CPU running for an extended time within the kernel can
> >> + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
> >> + * even context-switching back and forth between a pair of
> >> + * in-kernel CPU-bound tasks cannot advance grace periods.
> >> + * So if the grace period is old enough, make the CPU pay attention.
> >> */
> >> - rcu_kick_nohz_cpu(rdp->cpu);
> >> + if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
> >> + rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
> >> + ACCESS_ONCE(rdp->cond_resched_completed) =
> >> + ACCESS_ONCE(rdp->mynode->completed);
> >> + smp_mb(); /* ->cond_resched_completed before *rcrmp. */
> >> + ACCESS_ONCE(*rcrmp) =
> >> + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
> >> + }
> >>
> >> /*
> >> * Alternatively, the CPU might be running in the kernel
> >> @@ -3491,6 +3551,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
> >> "rcu_node_fqs_1",
> >> "rcu_node_fqs_2",
> >> "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
> >> + static u8 fl_mask = 0x1;
> >
> > What does 0x1 mean here? Is it for a particular flavor? This could use a
> > comment.
> >
> >> int cpustride = 1;
> >> int i;
> >> int j;
> >> @@ -3509,6 +3570,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
> >> for (i = 1; i < rcu_num_lvls; i++)
> >> rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
> >> rcu_init_levelspread(rsp);
> >> + rsp->flavor_mask = fl_mask;
> >> + fl_mask <<= 1;
> >
> > Something looks off here. fl_mask is not being used after this. Was it supposed
> > to be used or is it just a stray statement?
> >
> > The flavor_mask operations could really use some comments, as it is not
> > really clear what they are meant to achieve.
> >
> > --
> > Pranith
> >
> >>
> >> /* Initialize the elements themselves, starting from the leaves. */
> >>
> >> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> >> index bf2c1e669691..0f69a79c5b7d 100644
> >> --- a/kernel/rcu/tree.h
> >> +++ b/kernel/rcu/tree.h
> >> @@ -307,6 +307,9 @@ struct rcu_data {
> >> /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
> >> unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
> >> unsigned long offline_fqs; /* Kicked due to being offline. */
> >> + unsigned long cond_resched_completed;
> >> + /* Grace period that needs help */
> >> + /* from cond_resched(). */
> >>
> >> /* 5) __rcu_pending() statistics. */
> >> unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
> >> @@ -392,6 +395,7 @@ struct rcu_state {
> >> struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
> >> u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
> >> u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
> >> + u8 flavor_mask; /* bit in flavor mask. */
> >> struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
> >> void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
> >> void (*func)(struct rcu_head *head));
> >> @@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
> >> static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
> >> static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
> >> static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
> >> -static void rcu_kick_nohz_cpu(int cpu);
> >> +static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
> >> static bool init_nocb_callback_list(struct rcu_data *rdp);
> >> static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
> >> static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> >> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> >> index cbc2c45265e2..02ac0fb186b8 100644
> >> --- a/kernel/rcu/tree_plugin.h
> >> +++ b/kernel/rcu/tree_plugin.h
> >> @@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
> >> * if an adaptive-ticks CPU is failing to respond to the current grace
> >> * period and has not be idle from an RCU perspective, kick it.
> >> */
> >> -static void rcu_kick_nohz_cpu(int cpu)
> >> +static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
> >> {
> >> #ifdef CONFIG_NO_HZ_FULL
> >> if (tick_nohz_full_cpu(cpu))
> >> diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
> >> index a2aeb4df0f60..d22309cae9f5 100644
> >> --- a/kernel/rcu/update.c
> >> +++ b/kernel/rcu/update.c
> >> @@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
> >> early_initcall(check_cpu_stall_init);
> >>
> >> #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
> >> -
> >> -/*
> >> - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
> >> - */
> >> -
> >> -DEFINE_PER_CPU(int, rcu_cond_resched_count);
> >> -
> >> -/*
> >> - * Report a set of RCU quiescent states, for use by cond_resched()
> >> - * and friends. Out of line due to being called infrequently.
> >> - */
> >> -void rcu_resched(void)
> >> -{
> >> - preempt_disable();
> >> - __this_cpu_write(rcu_cond_resched_count, 0);
> >> - rcu_note_context_switch(smp_processor_id());
> >> - preempt_enable();
> >> -}
> >>
> >
>
>
>
> --
> Pranith
>
* Re: [PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU
2014-07-22 4:35 ` Pranith Kumar
2014-07-22 4:52 ` Pranith Kumar
@ 2014-07-22 11:06 ` Paul E. McKenney
1 sibling, 0 replies; 24+ messages in thread
From: Paul E. McKenney @ 2014-07-22 11:06 UTC (permalink / raw)
To: Pranith Kumar; +Cc: Josh Triplett, LKML
On Tue, Jul 22, 2014 at 12:35:57AM -0400, Pranith Kumar wrote:
> Hi Paul,
>
> I was going through this code and found a few inconsistencies. I ran git
> blame, traced them to this recent commit, and thought I could ask a few
> questions. I am dropping the CCs, as I am not sure they are needed and it
> is pretty late.
>
> Please find a few questions below:
>
> On 06/20/2014 02:33 PM, Paul E. McKenney wrote:
> > From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> >
> > Commit ac1bea85781e (Make cond_resched() report RCU quiescent states)
> > fixed a problem where a CPU looping in the kernel with but one runnable
> > task would give RCU CPU stall warnings, even if the in-kernel loop
> > contained cond_resched() calls. Unfortunately, in so doing, it introduced
> > performance regressions in Anton Blanchard's will-it-scale "open1" test.
> > The problem appears to be not so much the increased cond_resched() path
> > length as an increase in the rate at which grace periods complete, which
> > increased per-update grace-period overhead.
> >
> > This commit takes a different approach to fixing this bug, mainly by
> > avoiding having cond_resched() do an RCU-visible quiescent state unless
> > there is a grace period that has been in flight for a significant period
> > of time. This commit also reduces the common-case cond_resched() overhead
> > to a check of a single per-CPU variable.
> >
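The common-case check is intended to compile down to a single per-CPU
load.  As a sketch (a simplified rendition of how the pieces fit together,
not the exact wiring in the patch):

	DECLARE_PER_CPU(int, rcu_cond_resched_mask);

	/* Common case: one per-CPU load, branch not taken. */
	static inline bool rcu_should_resched(void)
	{
		return raw_cpu_read(rcu_cond_resched_mask);
	}

	/* Hook invoked from cond_resched() and friends. */
	static inline void rcu_cond_resched(void)
	{
		if (unlikely(rcu_should_resched()))
			rcu_resched();	/* Infrequent, out of line. */
	}
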
> <snip>
> > index f1ba77363fbb..2cc72ce19ff6 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -229,6 +229,58 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
> > #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> > };
> >
> > +/*
> > + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
> > + */
> > +
> > +DEFINE_PER_CPU(int, rcu_cond_resched_mask);
> > +
> > +/*
> > + * Let the RCU core know that this CPU has gone through a cond_resched(),
> > + * which is a quiescent state.
> > + */
> > +void rcu_resched(void)
> > +{
> > + unsigned long flags;
> > + struct rcu_data *rdp;
> > + struct rcu_dynticks *rdtp;
> > + int resched_mask;
> > + struct rcu_state *rsp;
> > +
> > + local_irq_save(flags);
> > +
> > + /*
> > + * Yes, we can lose flag-setting operations. This is OK, because
> > + * the flag will be set again after some delay.
> > + */
> > + resched_mask = raw_cpu_read(rcu_cond_resched_mask);
> > + raw_cpu_write(rcu_cond_resched_mask, 0);
> > +
> > + /* Find the flavor that needs a quiescent state. */
> > + for_each_rcu_flavor(rsp) {
> > + rdp = raw_cpu_ptr(rsp->rda);
> > + if (!(resched_mask & rsp->flavor_mask))
> > + continue;
>
> Here, neither resched_mask nor flavor_mask is updated within the loop.
> Are they supposed to be? It is really not clear what flavor_mask is
> doing in the code.
Because rsp is being updated on each pass through the loop, rsp->flavor_mask
has a different value on each pass.
The idea is that each rcu_state (referenced by rsp) has one bit in its
->flavor_mask. Then resched_mask has bits set corresponding to each
rcu_state that needs help.
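As a sketch: each call to rcu_init_one() hands out the next bit, so each
flavor's mask is unique:

	first flavor initialized:  ->flavor_mask == 0x1
	second flavor initialized: ->flavor_mask == 0x2
	third flavor initialized:  ->flavor_mask == 0x4  /* PREEMPT=y only */

If rcu_implicit_dynticks_qs() has flagged the first two flavors on this
CPU, resched_mask reads as 0x3, and the loop helps exactly those flavors:

	for_each_rcu_flavor(rsp)
		if (resched_mask & rsp->flavor_mask)
			/* report a quiescent state for this rsp */;
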
There were several variants of this commit, due to some performance
concerns.
> > + smp_mb(); /* ->flavor_mask before ->cond_resched_completed. */
> > + if (ACCESS_ONCE(rdp->mynode->completed) !=
> > + ACCESS_ONCE(rdp->cond_resched_completed))
> > + continue;
> > +
> > + /*
> > + * Pretend to be momentarily idle for the quiescent state.
> > + * This allows the grace-period kthread to record the
> > + * quiescent state, with no need for this CPU to do anything
> > + * further.
> > + */
> > + rdtp = this_cpu_ptr(&rcu_dynticks);
> > + smp_mb__before_atomic(); /* Earlier stuff before QS. */
> > + atomic_add(2, &rdtp->dynticks); /* QS. */
> > + smp_mb__after_atomic(); /* Later stuff after QS. */
> > + break;
> > + }
> > + local_irq_restore(flags);
> > +}
> > +
> > static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
> > static long qhimark = 10000; /* If this many pending, ignore blimit. */
> > static long qlowmark = 100; /* Once only this many pending, use blimit. */
> > @@ -853,6 +905,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> > bool *isidle, unsigned long *maxj)
> > {
> > unsigned int curr;
> > + int *rcrmp;
> > unsigned int snap;
> >
> > curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
> > @@ -893,13 +946,20 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
> > }
> >
> > /*
> > - * There is a possibility that a CPU in adaptive-ticks state
> > - * might run in the kernel with the scheduling-clock tick disabled
> > - * for an extended time period. Invoke rcu_kick_nohz_cpu() to
> > - * force the CPU to restart the scheduling-clock tick in this
> > - * CPU is in this state.
> > + * A CPU running for an extended time within the kernel can
> > + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
> > + * even context-switching back and forth between a pair of
> > + * in-kernel CPU-bound tasks cannot advance grace periods.
> > + * So if the grace period is old enough, make the CPU pay attention.
> > */
> > - rcu_kick_nohz_cpu(rdp->cpu);
> > + if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
> > + rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
> > + ACCESS_ONCE(rdp->cond_resched_completed) =
> > + ACCESS_ONCE(rdp->mynode->completed);
> > + smp_mb(); /* ->cond_resched_completed before *rcrmp. */
> > + ACCESS_ONCE(*rcrmp) =
> > + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
> > + }
> >
> > /*
> > * Alternatively, the CPU might be running in the kernel
> > @@ -3491,6 +3551,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
> > "rcu_node_fqs_1",
> > "rcu_node_fqs_2",
> > "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
> > + static u8 fl_mask = 0x1;
>
> What does 0x1 mean here? Is it for a particular flavor? This could use a
> comment.
Each flavor gets its own bit. The first flavor gets 0x1, which is
numerically the same as 1, but the hex form signals its use as a bit mask.
> > int cpustride = 1;
> > int i;
> > int j;
> > @@ -3509,6 +3570,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
> > for (i = 1; i < rcu_num_lvls; i++)
> > rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
> > rcu_init_levelspread(rsp);
> > + rsp->flavor_mask = fl_mask;
> > + fl_mask <<= 1;
>
> Something looks off here. fl_mask is not being used after this. Was it supposed
> to be used or is it just a stray statement?
Note the "static" on the declaration. The next call will see the updated
value, not the initial value.
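A stand-alone illustration of that property of C (next_flavor_mask() is a
hypothetical helper, not something from the patch):

	static u8 next_flavor_mask(void)
	{
		static u8 fl_mask = 0x1; /* initialized once, at compile time */
		u8 ret = fl_mask;

		fl_mask <<= 1;	/* the next caller sees the shifted value */
		return ret;
	}

Successive calls return 0x1, 0x2, 0x4, ..., just as successive calls to
rcu_init_one() assign successive bits to the flavors' ->flavor_mask.
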
> The flavor_mask operations could really use some comments, as it is not
> really clear what they are meant to achieve.
Seems pretty straightforward to me. Each flavor gets a bit in ->flavor_mask,
starting with 0x1, then 0x2, then maybe 0x4 for preemptible kernels.
I believe that you were forgetting that "static" on a local variable
means that the variable is off-stack, and that later invocations see
updates from previous invocations. Initialization of course happens
at compile time.
Thanx, Paul
> --
> Pranith
>
> >
> > /* Initialize the elements themselves, starting from the leaves. */
> >
> > diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> > index bf2c1e669691..0f69a79c5b7d 100644
> > --- a/kernel/rcu/tree.h
> > +++ b/kernel/rcu/tree.h
> > @@ -307,6 +307,9 @@ struct rcu_data {
> > /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
> > unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
> > unsigned long offline_fqs; /* Kicked due to being offline. */
> > + unsigned long cond_resched_completed;
> > + /* Grace period that needs help */
> > + /* from cond_resched(). */
> >
> > /* 5) __rcu_pending() statistics. */
> > unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
> > @@ -392,6 +395,7 @@ struct rcu_state {
> > struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
> > u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
> > u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
> > + u8 flavor_mask; /* bit in flavor mask. */
> > struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
> > void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
> > void (*func)(struct rcu_head *head));
> > @@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
> > static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
> > static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
> > static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
> > -static void rcu_kick_nohz_cpu(int cpu);
> > +static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
> > static bool init_nocb_callback_list(struct rcu_data *rdp);
> > static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
> > static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> > diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> > index cbc2c45265e2..02ac0fb186b8 100644
> > --- a/kernel/rcu/tree_plugin.h
> > +++ b/kernel/rcu/tree_plugin.h
> > @@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
> > * if an adaptive-ticks CPU is failing to respond to the current grace
> > * period and has not be idle from an RCU perspective, kick it.
> > */
> > -static void rcu_kick_nohz_cpu(int cpu)
> > +static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
> > {
> > #ifdef CONFIG_NO_HZ_FULL
> > if (tick_nohz_full_cpu(cpu))
> > diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
> > index a2aeb4df0f60..d22309cae9f5 100644
> > --- a/kernel/rcu/update.c
> > +++ b/kernel/rcu/update.c
> > @@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
> > early_initcall(check_cpu_stall_init);
> >
> > #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
> > -
> > -/*
> > - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
> > - */
> > -
> > -DEFINE_PER_CPU(int, rcu_cond_resched_count);
> > -
> > -/*
> > - * Report a set of RCU quiescent states, for use by cond_resched()
> > - * and friends. Out of line due to being called infrequently.
> > - */
> > -void rcu_resched(void)
> > -{
> > - preempt_disable();
> > - __this_cpu_write(rcu_cond_resched_count, 0);
> > - rcu_note_context_switch(smp_processor_id());
> > - preempt_enable();
> > -}
> >
>