public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed
From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: David Woodhouse <dwmw@amazon.co.uk>,
	"Paul E . McKenney" <paulmck@kernel.org>,
	Sasha Levin <sashal@kernel.org>,
	frederic@kernel.org, quic_neeraju@quicinc.com,
	josh@joshtriplett.org, rcu@vger.kernel.org
Subject: [PATCH AUTOSEL 5.16 05/35] rcu: Kill rnp->ofl_seq and use only rcu_state.ofl_lock for exclusion
Date: Mon, 28 Mar 2022 07:19:41 -0400	[thread overview]
Message-ID: <20220328112011.1555169-5-sashal@kernel.org> (raw)
In-Reply-To: <20220328112011.1555169-1-sashal@kernel.org>

From: David Woodhouse <dwmw@amazon.co.uk>

[ Upstream commit 82980b1622d97017053c6792382469d7dc26a486 ]

If we allow architectures to bring APs online in parallel, then we end
up requiring rcu_cpu_starting() to be reentrant. But currently, the
manipulation of rnp->ofl_seq is not thread-safe.

However, rnp->ofl_seq is also fairly much pointless anyway since both
rcu_cpu_starting() and rcu_report_dead() hold rcu_state.ofl_lock for
fairly much the whole time that rnp->ofl_seq is set to an odd number
to indicate that an operation is in progress.

So drop rnp->ofl_seq completely, and use only rcu_state.ofl_lock.

This has a couple of minor complexities: lockdep will complain when we
take rcu_state.ofl_lock, and currently accepts the 'excuse' of having
an odd value in rnp->ofl_seq. So switch it to an arch_spinlock_t to
avoid that false positive complaint. Since we're killing rnp->ofl_seq
of course that 'excuse' has to be changed too, so make it check for
arch_spin_is_locked(rcu_state.ofl_lock).

There's no arch_spin_lock_irqsave() so we have to manually save and
restore local interrupts around the locking.

At Paul's request based on Neeraj's analysis, make rcu_gp_init not just
wait but *exclude* any CPU online/offline activity, which was fairly
much true already by virtue of it holding rcu_state.ofl_lock.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/rcu/tree.c | 71 ++++++++++++++++++++++++-----------------------
 kernel/rcu/tree.h |  4 +--
 2 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 28fd0cef9b1f..0e70fe0ab690 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,7 +91,7 @@ static struct rcu_state rcu_state = {
 	.abbr = RCU_ABBR,
 	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
 	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
-	.ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
 };
 
 /* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -1168,7 +1168,15 @@ bool rcu_lockdep_current_cpu_online(void)
 	preempt_disable_notrace();
 	rdp = this_cpu_ptr(&rcu_data);
 	rnp = rdp->mynode;
-	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
+	/*
+	 * Strictly, we care here about the case where the current CPU is
+	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+	 * not being up to date. So arch_spin_is_locked() might have a
+	 * false positive if it's held by some *other* CPU, but that's
+	 * OK because that just means a false *negative* on the warning.
+	 */
+	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) ||
+	    arch_spin_is_locked(&rcu_state.ofl_lock))
 		ret = true;
 	preempt_enable_notrace();
 	return ret;
@@ -1732,7 +1740,6 @@ static void rcu_strict_gp_boundary(void *unused)
  */
 static noinline_for_stack bool rcu_gp_init(void)
 {
-	unsigned long firstseq;
 	unsigned long flags;
 	unsigned long oldmask;
 	unsigned long mask;
@@ -1775,22 +1782,17 @@ static noinline_for_stack bool rcu_gp_init(void)
 	 * of RCU's Requirements documentation.
 	 */
 	WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+	/* Exclude CPU hotplug operations. */
 	rcu_for_each_leaf_node(rnp) {
-		// Wait for CPU-hotplug operations that might have
-		// started before this grace period did.
-		smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
-		firstseq = READ_ONCE(rnp->ofl_seq);
-		if (firstseq & 0x1)
-			while (firstseq == READ_ONCE(rnp->ofl_seq))
-				schedule_timeout_idle(1);  // Can't wake unless RCU is watching.
-		smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
-		raw_spin_lock(&rcu_state.ofl_lock);
-		raw_spin_lock_irq_rcu_node(rnp);
+		local_irq_save(flags);
+		arch_spin_lock(&rcu_state.ofl_lock);
+		raw_spin_lock_rcu_node(rnp);
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
 		    !rnp->wait_blkd_tasks) {
 			/* Nothing to do on this leaf rcu_node structure. */
-			raw_spin_unlock_irq_rcu_node(rnp);
-			raw_spin_unlock(&rcu_state.ofl_lock);
+			raw_spin_unlock_rcu_node(rnp);
+			arch_spin_unlock(&rcu_state.ofl_lock);
+			local_irq_restore(flags);
 			continue;
 		}
 
@@ -1825,8 +1827,9 @@ static noinline_for_stack bool rcu_gp_init(void)
 				rcu_cleanup_dead_rnp(rnp);
 		}
 
-		raw_spin_unlock_irq_rcu_node(rnp);
-		raw_spin_unlock(&rcu_state.ofl_lock);
+		raw_spin_unlock_rcu_node(rnp);
+		arch_spin_unlock(&rcu_state.ofl_lock);
+		local_irq_restore(flags);
 	}
 	rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
 
@@ -4247,11 +4250,10 @@ void rcu_cpu_starting(unsigned int cpu)
 
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+	local_irq_save(flags);
+	arch_spin_lock(&rcu_state.ofl_lock);
 	rcu_dynticks_eqs_online();
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	raw_spin_lock_rcu_node(rnp);
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
 	newcpu = !(rnp->expmaskinitnext & mask);
 	rnp->expmaskinitnext |= mask;
@@ -4264,15 +4266,18 @@ void rcu_cpu_starting(unsigned int cpu)
 
 	/* An incoming CPU should never be blocking a grace period. */
 	if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+		/* rcu_report_qs_rnp() *really* wants some flags to restore */
+		unsigned long flags2;
+
+		local_irq_save(flags2);
 		rcu_disable_urgency_upon_qs(rdp);
 		/* Report QS -after- changing ->qsmaskinitnext! */
-		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
 	} else {
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+		raw_spin_unlock_rcu_node(rnp);
 	}
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+	arch_spin_unlock(&rcu_state.ofl_lock);
+	local_irq_restore(flags);
 	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
 }
 
@@ -4286,7 +4291,7 @@ void rcu_cpu_starting(unsigned int cpu)
  */
 void rcu_report_dead(unsigned int cpu)
 {
-	unsigned long flags;
+	unsigned long flags, seq_flags;
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -4300,10 +4305,8 @@ void rcu_report_dead(unsigned int cpu)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	raw_spin_lock(&rcu_state.ofl_lock);
+	local_irq_save(seq_flags);
+	arch_spin_lock(&rcu_state.ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
 	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
@@ -4314,10 +4317,8 @@ void rcu_report_dead(unsigned int cpu)
 	}
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-	raw_spin_unlock(&rcu_state.ofl_lock);
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+	arch_spin_unlock(&rcu_state.ofl_lock);
+	local_irq_restore(seq_flags);
 
 	rdp->cpu_started = false;
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..aff4cc9303fb 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,8 +56,6 @@ struct rcu_node {
 				/*  Initialized from ->qsmaskinitnext at the */
 				/*  beginning of each grace period. */
 	unsigned long qsmaskinitnext;
-	unsigned long ofl_seq;	/* CPU-hotplug operation sequence count. */
-				/* Online CPUs for next grace period. */
 	unsigned long expmask;	/* CPUs or groups that need to check in */
 				/*  to allow the current expedited GP */
 				/*  to complete. */
@@ -358,7 +356,7 @@ struct rcu_state {
 	const char *name;			/* Name of structure. */
 	char abbr;				/* Abbreviated name. */
 
-	raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+	arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
 						/* Synchronize offline with */
 						/*  GP pre-initialization. */
 };
-- 
2.34.1


  parent reply	other threads:[~2022-03-28 11:24 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-28 11:19 [PATCH AUTOSEL 5.16 01/35] LSM: general protection fault in legacy_parse_param Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 02/35] regulator: rpi-panel: Handle I2C errors/timing to the Atmel Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 03/35] crypto: hisilicon/qm - cleanup warning in qm_vf_read_qos Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 04/35] gcc-plugins/stackleak: Exactly match strings instead of prefixes Sasha Levin
2022-03-28 11:19 ` Sasha Levin [this message]
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 06/35] pinctrl: npcm: Fix broken references to chip->parent_device Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 07/35] rcu: Mark writes to the rcu_segcblist structure's ->flags field Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 08/35] block: throttle split bio in case of iops limit Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 09/35] memstick/mspro_block: fix handling of read-only devices Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 10/35] block/bfq_wf2q: correct weight to ioprio Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 11/35] crypto: xts - Add softdep on ecb Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 12/35] crypto: hisilicon/sec - not need to enable sm4 extra mode at HW V3 Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 13/35] block, bfq: don't move oom_bfqq Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 14/35] selinux: use correct type for context length Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 15/35] powercap/dtpm_cpu: Reset per_cpu variable in the release function Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 16/35] arm64: module: remove (NOLOAD) from linker script Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 17/35] selinux: allow FIOCLEX and FIONCLEX with policy capability Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 18/35] loop: use sysfs_emit() in the sysfs xxx show() Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 19/35] Fix incorrect type in assignment of ipv6 port for audit Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 20/35] irqchip/qcom-pdc: Fix broken locking Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 21/35] irqchip/nvic: Release nvic_base upon failure Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 22/35] fs/binfmt_elf: Fix AT_PHDR for unusual ELF files Sasha Levin
2022-03-28 11:19 ` [PATCH AUTOSEL 5.16 23/35] signal, x86: Delay calling signals in atomic on RT enabled kernels Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 24/35] bfq: fix use-after-free in bfq_dispatch_request Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 25/35] ACPICA: Avoid walking the ACPI Namespace if it is not there Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 26/35] lib/raid6/test/Makefile: Use $(pound) instead of \# for Make 4.3 Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 27/35] Revert "Revert "block, bfq: honor already-setup queue merges"" Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 28/35] ACPI/APEI: Limit printable size of BERT table data Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 29/35] PM: core: keep irq flags in device_pm_check_callbacks() Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 30/35] parisc: Fix non-access data TLB cache flush faults Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 31/35] parisc: Fix handling off probe non-access faults Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 32/35] nvme-tcp: lockdep: annotate in-kernel sockets Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 33/35] spi: tegra20: Use of_device_get_match_data() Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 34/35] Revert "ACPI: Pass the same capabilities to the _OSC regardless of the query flag" Sasha Levin
2022-03-28 11:20 ` [PATCH AUTOSEL 5.16 35/35] spi: fsi: Implement a timeout for polling status Sasha Levin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220328112011.1555169-5-sashal@kernel.org \
    --to=sashal@kernel.org \
    --cc=dwmw@amazon.co.uk \
    --cc=frederic@kernel.org \
    --cc=josh@joshtriplett.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paulmck@kernel.org \
    --cc=quic_neeraju@quicinc.com \
    --cc=rcu@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox