All of lore.kernel.org
 help / color / mirror / Atom feed
From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Ming Lei <ming.lei@redhat.com>,
	Reinette Chatre <reinette.chatre@intel.com>,
	Fenghua Yu <fenghua.yu@intel.com>,
	Peter Newman <peternewman@google.com>,
	Babu Moger <babu.moger@amd.com>, Luck Tony <tony.luck@intel.com>,
	Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>,
	linux-block@vger.kernel.org
Subject: [PATCH AUTOSEL 6.12 36/36] blk-mq: move cpuhp callback registering out of q->sysfs_lock
Date: Wed, 11 Dec 2024 13:49:52 -0500	[thread overview]
Message-ID: <20241211185028.3841047-36-sashal@kernel.org> (raw)
In-Reply-To: <20241211185028.3841047-1-sashal@kernel.org>

From: Ming Lei <ming.lei@redhat.com>

[ Upstream commit 22465bbac53c821319089016f268a2437de9b00a ]

Registering and unregistering cpuhp callback requires global cpu hotplug lock,
which is used everywhere. Meantime q->sysfs_lock is used in block layer
almost everywhere.

It is easy to trigger lockdep warning[1] by connecting the two locks.

Fix the warning by moving blk-mq's cpuhp callback registering out of
q->sysfs_lock. Add one dedicated global lock for covering registering &
unregistering hctx's cpuhp, and it is safe to do so because hctx is
guaranteed to be live if our request_queue is live.

[1] https://lore.kernel.org/lkml/Z04pz3AlvI4o0Mr8@agluck-desk3/

Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Peter Newman <peternewman@google.com>
Cc: Babu Moger <babu.moger@amd.com>
Reported-by: Luck Tony <tony.luck@intel.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20241206111611.978870-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 block/blk-mq.c | 103 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 92 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c4012cb3adbf1..6dd849d607c03 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -43,6 +43,7 @@
 
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
+static DEFINE_MUTEX(blk_mq_cpuhp_lock);
 
 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
 static void blk_mq_request_bypass_insert(struct request *rq,
@@ -3740,13 +3741,91 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
-static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
-	if (!(hctx->flags & BLK_MQ_F_STACKING))
+	lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+	if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+	    !hlist_unhashed(&hctx->cpuhp_online)) {
 		cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
 						    &hctx->cpuhp_online);
-	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
-					    &hctx->cpuhp_dead);
+		INIT_HLIST_NODE(&hctx->cpuhp_online);
+	}
+
+	if (!hlist_unhashed(&hctx->cpuhp_dead)) {
+		cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+						    &hctx->cpuhp_dead);
+		INIT_HLIST_NODE(&hctx->cpuhp_dead);
+	}
+}
+
+static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+	mutex_lock(&blk_mq_cpuhp_lock);
+	__blk_mq_remove_cpuhp(hctx);
+	mutex_unlock(&blk_mq_cpuhp_lock);
+}
+
+static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+	lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+	if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+	    hlist_unhashed(&hctx->cpuhp_online))
+		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+				&hctx->cpuhp_online);
+
+	if (hlist_unhashed(&hctx->cpuhp_dead))
+		cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+				&hctx->cpuhp_dead);
+}
+
+static void __blk_mq_remove_cpuhp_list(struct list_head *head)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+	list_for_each_entry(hctx, head, hctx_list)
+		__blk_mq_remove_cpuhp(hctx);
+}
+
+/*
+ * Unregister cpuhp callbacks from exited hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q)
+{
+	LIST_HEAD(hctx_list);
+
+	spin_lock(&q->unused_hctx_lock);
+	list_splice_init(&q->unused_hctx_list, &hctx_list);
+	spin_unlock(&q->unused_hctx_lock);
+
+	mutex_lock(&blk_mq_cpuhp_lock);
+	__blk_mq_remove_cpuhp_list(&hctx_list);
+	mutex_unlock(&blk_mq_cpuhp_lock);
+
+	spin_lock(&q->unused_hctx_lock);
+	list_splice(&hctx_list, &q->unused_hctx_list);
+	spin_unlock(&q->unused_hctx_lock);
+}
+
+/*
+ * Register cpuhp callbacks from all hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+
+	mutex_lock(&blk_mq_cpuhp_lock);
+	queue_for_each_hw_ctx(q, hctx, i)
+		__blk_mq_add_cpuhp(hctx);
+	mutex_unlock(&blk_mq_cpuhp_lock);
 }
 
 /*
@@ -3797,8 +3876,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
 
-	blk_mq_remove_cpuhp(hctx);
-
 	xa_erase(&q->hctx_table, hctx_idx);
 
 	spin_lock(&q->unused_hctx_lock);
@@ -3815,6 +3892,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (i == nr_queue)
 			break;
+		blk_mq_remove_cpuhp(hctx);
 		blk_mq_exit_hctx(q, set, hctx, i);
 	}
 }
@@ -3838,11 +3916,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
 		goto exit_flush_rq;
 
-	if (!(hctx->flags & BLK_MQ_F_STACKING))
-		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
-				&hctx->cpuhp_online);
-	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
-
 	return 0;
 
  exit_flush_rq:
@@ -3877,6 +3950,8 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
+	INIT_HLIST_NODE(&hctx->cpuhp_dead);
+	INIT_HLIST_NODE(&hctx->cpuhp_online);
 	hctx->queue = q;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
 
@@ -4415,6 +4490,12 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	xa_for_each_start(&q->hctx_table, j, hctx, j)
 		blk_mq_exit_hctx(q, set, hctx, j);
 	mutex_unlock(&q->sysfs_lock);
+
+	/* unregister cpuhp callbacks for exited hctxs */
+	blk_mq_remove_hw_queues_cpuhp(q);
+
+	/* register cpuhp for new initialized hctxs */
+	blk_mq_add_hw_queues_cpuhp(q);
 }
 
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
-- 
2.43.0


      parent reply	other threads:[~2024-12-11 18:51 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-11 18:49 [PATCH AUTOSEL 6.12 01/36] watchdog: it87_wdt: add PWRGD enable quirk for Qotom QCML04 Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 02/36] watchdog: rzg2l_wdt: Power on the watchdog domain in the restart handler Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 03/36] Revert "watchdog: s3c2410_wdt: use exynos_get_pmu_regmap_by_phandle() for PMU regs" Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 04/36] watchdog: mediatek: Add support for MT6735 TOPRGU/WDT Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 05/36] watchdog: s3c2410_wdt: add support for exynosautov920 SoC Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 06/36] scsi: qla1280: Fix hw revision numbering for ISP1020/1040 Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 07/36] scsi: megaraid_sas: Fix for a potential deadlock Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 08/36] udf: Skip parent dir link count update if corrupted Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 09/36] udf: Verify inode link counts before performing rename Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 10/36] ALSA: ump: Don't open legacy substream for an inactive group Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 11/36] ALSA: ump: Indicate the inactive group in legacy substream names Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 12/36] ALSA: ump: Update legacy substream names upon FB info update Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 13/36] ALSA: hda/conexant: fix Z60MR100 startup pop issue Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 14/36] ALSA: sh: Use standard helper for buffer accesses Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 15/36] smb: server: Fix building with GCC 15 Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 16/36] regmap: Use correct format specifier for logging range errors Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 17/36] LoongArch: Fix reserving screen info memory for above-4G firmware Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 18/36] LoongArch/irq: Use seq_put_decimal_ull_width() for decimal values Sasha Levin
2024-12-11 19:14   ` Thomas Gleixner
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 19/36] LoongArch: BPF: Adjust the parameter of emit_jirl() Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 20/36] platform/x86: asus-nb-wmi: Ignore unknown event 0xCF Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 21/36] bpf: Zero index arg error string for dynptr and iter Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 22/36] net: sched: fix ordering of qlen adjustment Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 23/36] spi: intel: Add Panther Lake SPI controller support Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 24/36] scsi: mpt3sas: Diag-Reset when Doorbell-In-Use bit is set during driver load time Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 25/36] scsi: mpi3mr: Synchronize access to ioctl data buffer Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 26/36] scsi: mpi3mr: Fix corrupt config pages PHY state is switched in sysfs Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 27/36] scsi: mpi3mr: Start controller indexing from 0 Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 28/36] scsi: mpi3mr: Handling of fault code for insufficient power Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 29/36] scsi: storvsc: Do not flag MAINTENANCE_IN return of SRB_STATUS_DATA_OVERRUN as an error Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 30/36] ACPI/IORT: Add PMCG platform information for HiSilicon HIP09A Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 31/36] spi: omap2-mcspi: Fix the IS_ERR() bug for devm_clk_get_optional_enabled() Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 32/36] drm/dp_mst: Ensure mst_primary pointer is valid in drm_dp_mst_handle_up_req() Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 33/36] drm/dp_mst: Reset message rx state after OOM " Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 34/36] virtio-blk: don't keep queue frozen during system suspend Sasha Levin
2024-12-11 18:49 ` [PATCH AUTOSEL 6.12 35/36] blk-mq: register cpuhp callback after hctx is added to xarray table Sasha Levin
2024-12-11 18:49 ` Sasha Levin [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241211185028.3841047-36-sashal@kernel.org \
    --to=sashal@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=babu.moger@amd.com \
    --cc=fenghua.yu@intel.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ming.lei@redhat.com \
    --cc=peternewman@google.com \
    --cc=reinette.chatre@intel.com \
    --cc=stable@vger.kernel.org \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.