Linux Documentation
 help / color / mirror / Atom feed
From: Jing Wu <realwujing@gmail.com>
To: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	 Juri Lelli <juri.lelli@redhat.com>,
	 Vincent Guittot <vincent.guittot@linaro.org>,
	 Dietmar Eggemann <dietmar.eggemann@arm.com>,
	 Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>,  Mel Gorman <mgorman@suse.de>,
	Valentin Schneider <vschneid@redhat.com>,
	 "Paul E. McKenney" <paulmck@kernel.org>,
	 Frederic Weisbecker <frederic@kernel.org>,
	 Neeraj Upadhyay <neeraj.upadhyay@kernel.org>,
	 Joel Fernandes <joelagnelf@nvidia.com>,
	 Josh Triplett <josh@joshtriplett.org>,
	Boqun Feng <boqun@kernel.org>,
	 Uladzislau Rezki <urezki@gmail.com>,
	 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	 Lai Jiangshan <jiangshanlai@gmail.com>,
	Zqiang <qiang.zhang@linux.dev>,
	 Anna-Maria Behnsen <anna-maria@linutronix.de>,
	Tejun Heo <tj@kernel.org>,  Jonathan Corbet <corbet@lwn.net>,
	Shuah Khan <skhan@linuxfoundation.org>,
	 Shuah Khan <shuah@kernel.org>, Thomas Gleixner <tglx@kernel.org>
Cc: linux-kernel@vger.kernel.org, rcu@vger.kernel.org,
	 cgroups@vger.kernel.org, linux-doc@vger.kernel.org,
	 linux-kselftest@vger.kernel.org, Jing Wu <realwujing@gmail.com>,
	 Qiliang Yuan <yuanql9@chinatelecom.cn>
Subject: [PATCH v3 07/13] rcu/nocb: Add explicit housekeeping callback for runtime NOCB toggling
Date: Thu, 18 Jun 2026 11:11:18 +0800	[thread overview]
Message-ID: <20260618-wujing-dhm-v3-7-28f1a4d83b68@gmail.com> (raw)
In-Reply-To: <20260618-wujing-dhm-v3-0-28f1a4d83b68@gmail.com>

Register a housekeeping callback for HK_TYPE_KERNEL_NOISE.  When the
mask changes, schedule asynchronous work to iterate all possible CPUs
and toggle NOCB mode for CPUs whose state disagrees with the new mask.
CPUs in the housekeeping set are de-offloaded; isolated CPUs are
offloaded.

Use CPU hotplug (remove_cpu() / add_cpu()) because
rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload() require the target
CPU to be offline.  The hotplug cycle takes the CPU fully offline to
quiesce its RCU state before toggling the NOCB flag, then brings it
back.  Skip CPUs whose state already matches to avoid unnecessary
hotplug churn.  Only bring a CPU back online if it was online before
the state change (was_online guard avoids add_cpu() on a CPU that was
already offline).

This differs from Frederic Weisbecker's suggestion to "assume the CPU
is offline" within the RCU subsystem and toggle NOCB without a full
hotplug cycle.  The full hotplug approach was chosen for v3 because
rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload() are the existing
stable interfaces and the "assume offline" path would require adding
new internal RCU APIs.  This is a known limitation that may be
addressed by RCU maintainers in follow-up work.

Snapshot the current HK_TYPE_KERNEL_NOISE cpumask inside the work
function under an RCU read lock rather than caching the pointer at
apply() time.  Caching at apply() time would create a use-after-free
hazard: a subsequent housekeeping_update_types() call frees the old
cpumask after synchronize_rcu() but before the work function runs.

Do not wrap the hotplug loop in cpus_read_lock() / cpus_read_unlock().
remove_cpu() and add_cpu() acquire the cpu_hotplug_lock write side;
holding the read side via cpus_read_lock() while calling them would
deadlock.

Signed-off-by: Jing Wu <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
---
 kernel/rcu/tree.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 55df6d37145e8..214ce940f501b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4929,3 +4929,107 @@ void __init rcu_init(void)
 #include "tree_exp.h"
 #include "tree_nocb.h"
 #include "tree_plugin.h"
+
+#ifdef CONFIG_RCU_NOCB_CPU
+/*
+ * RCU NOCB runtime toggle via housekeeping callback.
+ * Schedule the CPU-hotplug work asynchronously because
+ * remove_cpu() and add_cpu() must not be called while holding
+ * cpuset_top_mutex (the hk callback context).
+ *
+ * Snapshot the current HK_TYPE_KERNEL_NOISE cpumask inside the work
+ * function under an RCU read lock to avoid caching a pointer at
+ * apply() time that could be freed before the work runs.
+ */
+struct rcu_hk_work {
+	struct work_struct work;	/* queued by rcu_hk_apply(), run by rcu_hk_workfn() */
+};
+
+/* Work item: reconcile every CPU's NOCB state with the hk mask; frees @w. */
+static void rcu_hk_workfn(struct work_struct *w)
+{
+	struct rcu_hk_work *hw = container_of(w, struct rcu_hk_work, work);
+	cpumask_var_t hk_mask;
+	int cpu, ret;
+
+	if (!alloc_cpumask_var(&hk_mask, GFP_KERNEL))
+		goto out;
+
+	/* Snapshot under RCU so the loop works on a stable private copy. */
+	rcu_read_lock();
+	cpumask_copy(hk_mask, housekeeping_cpumask_rcu(HK_TYPE_KERNEL_NOISE));
+	rcu_read_unlock();
+
+	for_each_possible_cpu(cpu) {
+		bool should_offload = !cpumask_test_cpu(cpu, hk_mask);
+		bool is_offloaded = cpumask_available(rcu_nocb_mask) &&
+				    cpumask_test_cpu(cpu, rcu_nocb_mask);
+		bool was_online;
+
+		if (should_offload == is_offloaded)
+			continue;
+
+		/* (De-)offloading needs the CPU offline; skip it if that fails. */
+		was_online = cpu_online(cpu);
+		if (was_online && remove_cpu(cpu))
+			continue;
+
+		ret = should_offload ? rcu_nocb_cpu_offload(cpu) :
+				       rcu_nocb_cpu_deoffload(cpu);
+		if (ret)
+			pr_warn("rcu/nocb: CPU %d NOCB toggle failed (%d)\n", cpu, ret);
+
+		/* Bring the CPU back even when the toggle itself failed. */
+		if (was_online) {
+			ret = add_cpu(cpu);
+			if (ret)
+				pr_warn("rcu/nocb: CPU %d left offline (%d)\n", cpu, ret);
+		}
+	}
+
+	free_cpumask_var(hk_mask);
+out:
+	kfree(hw);
+}
+
+/* hk ->apply() hook: hand the NOCB toggle over to asynchronous work. */
+static void rcu_hk_apply(enum hk_type type)
+{
+	struct rcu_hk_work *item;
+
+	if (!cpumask_available(rcu_nocb_mask))
+		return;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (item) {
+		INIT_WORK(&item->work, rcu_hk_workfn);
+		schedule_work(&item->work);
+	}
+}
+
+/* hk ->pre_validate() hook: NOCB toggling accepts any new mask. */
+static int rcu_hk_validate(enum hk_type type,
+			   const struct cpumask *cur_mask,
+			   const struct cpumask *new_mask)
+{
+	/* Compiled only under CONFIG_RCU_NOCB_CPU, so no config check here. */
+	return 0;
+}
+
+static struct housekeeping_cbs rcu_hk_cbs = {
+	.name		= "rcu/nocb",
+	.pre_validate	= rcu_hk_validate,	/* always returns 0 in this build */
+	.apply		= rcu_hk_apply,		/* schedules rcu_hk_workfn() */
+};
+
+/* Register the NOCB callbacks; a failure is logged, never propagated. */
+static int __init rcu_hk_init(void)
+{
+	int err = housekeeping_register_cbs(HK_TYPE_KERNEL_NOISE, &rcu_hk_cbs);
+
+	if (err)
+		pr_info("rcu/nocb: runtime NOCB toggle disabled (%d)\n", err);
+	return 0;
+}
+late_initcall(rcu_hk_init);
+#endif /* CONFIG_RCU_NOCB_CPU */

-- 
2.43.0


  parent reply	other threads:[~2026-06-18  3:12 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-18  3:11 [PATCH v3 00/13] Dynamic Housekeeping Management (DHM) via CPUSets Jing Wu
2026-06-18  3:11 ` [PATCH v3 01/13] sched/isolation: Replace notifier chain with explicit callback interface Jing Wu
2026-06-18  3:11 ` [PATCH v3 02/13] sched/isolation: Add housekeeping_update_types() for kernel-noise masks Jing Wu
2026-06-18  3:11 ` [PATCH v3 03/13] sched/isolation: RCU-protect all housekeeping cpumask readers Jing Wu
2026-06-18  3:11 ` [PATCH v3 04/13] sched/isolation: Fix RCU protection for runtime-mutable cpumask callers Jing Wu
2026-06-18  3:11 ` [PATCH v3 05/13] cpu/hotplug: Reserve CPUHP states for nohz_full and managed IRQ down-paths Jing Wu
2026-06-18 16:06   ` Thomas Gleixner
2026-06-18 21:01     ` Thomas Gleixner
2026-06-18  3:11 ` [PATCH v3 06/13] tick/nohz, context_tracking: Prepare for runtime nohz_full updates Jing Wu
2026-06-18 17:27   ` Thomas Gleixner
2026-06-18 19:49     ` Thomas Gleixner
2026-06-18  3:11 ` Jing Wu [this message]
2026-06-18  3:11 ` [PATCH v3 08/13] genirq: Add explicit housekeeping callback for managed IRQ migration Jing Wu
2026-06-18 20:27   ` Thomas Gleixner
2026-06-18 21:11     ` Thomas Gleixner
2026-06-18  3:11 ` [PATCH v3 09/13] watchdog/lockup_detector: Register housekeeping callback for kernel-noise Jing Wu
2026-06-18  3:11 ` [PATCH v3 10/13] sched: Guard sched_tick_start/stop against uninitialized tick_work_cpu Jing Wu
2026-06-18 20:50   ` Thomas Gleixner
2026-06-18  3:11 ` [PATCH v3 11/13] cgroup/cpuset: Extend isolated partition to trigger kernel-noise isolation Jing Wu
2026-06-18 20:55   ` Thomas Gleixner
2026-06-18  3:11 ` [PATCH v3 12/13] docs: cgroup-v2: Document kernel-noise isolation via isolated partitions Jing Wu
2026-06-18  3:11 ` [PATCH v3 13/13] selftests/cgroup: Add kernel-noise isolation test to cpuset selftest Jing Wu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260618-wujing-dhm-v3-7-28f1a4d83b68@gmail.com \
    --to=realwujing@gmail.com \
    --cc=anna-maria@linutronix.de \
    --cc=boqun@kernel.org \
    --cc=bsegall@google.com \
    --cc=cgroups@vger.kernel.org \
    --cc=corbet@lwn.net \
    --cc=dietmar.eggemann@arm.com \
    --cc=frederic@kernel.org \
    --cc=jiangshanlai@gmail.com \
    --cc=joelagnelf@nvidia.com \
    --cc=josh@joshtriplett.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=neeraj.upadhyay@kernel.org \
    --cc=paulmck@kernel.org \
    --cc=peterz@infradead.org \
    --cc=qiang.zhang@linux.dev \
    --cc=rcu@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=shuah@kernel.org \
    --cc=skhan@linuxfoundation.org \
    --cc=tglx@kernel.org \
    --cc=tj@kernel.org \
    --cc=urezki@gmail.com \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    --cc=yuanql9@chinatelecom.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox