linux-block.vger.kernel.org archive mirror
* [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray
@ 2025-11-28  8:53 Fengnan Chang
  2025-11-28  8:53 ` [PATCH v3 1/2] " Fengnan Chang
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Fengnan Chang @ 2025-11-28  8:53 UTC (permalink / raw)
  To: axboe, linux-block, ming.lei, hare, hch, yukuai; +Cc: Fengnan Chang

After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
an xarray instead of an array to store hctx. But in poll mode, every call
to blk_mq_poll needs xa_load to find the corresponding hctx, which
introduces some cost. In my test, xa_load may cost 3.8% of CPU.

Reverting the previous change eliminates the overhead of xa_load and can
result in a 3% performance improvement.

The potential use-after-free on q->queue_hw_ctx can be avoided by using
rcu, the same way Yu Kuai did in [1].

[1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/

v3:
fix build error and some sparse warnings (not all of them: since the
queue is frozen in __blk_mq_update_nr_hw_queues, 'queue_hw_ctx' only
needs rcu protection where it can be accessed without grabbing
'q_usage_counter').

v2:
1. switch synchronize_rcu() to synchronize_rcu_expedited()
2. use rcu_dereference(q->queue_hw_ctx)[id] in queue_hctx for better
   readability.

Fengnan Chang (2):
  blk-mq: use array manage hctx map instead of xarray
  blk-mq: fix potential uaf for 'queue_hw_ctx'

 block/blk-mq-tag.c     |  2 +-
 block/blk-mq.c         | 63 ++++++++++++++++++++++++++++--------------
 block/blk-mq.h         |  2 +-
 include/linux/blk-mq.h | 14 +++++++++-
 include/linux/blkdev.h |  2 +-
 5 files changed, 58 insertions(+), 25 deletions(-)


base-commit: 4941a17751c99e17422be743c02c923ad706f888
-- 
2.39.5 (Apple Git-154)



* [PATCH v3 1/2] blk-mq: use array manage hctx map instead of xarray
  2025-11-28  8:53 [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray Fengnan Chang
@ 2025-11-28  8:53 ` Fengnan Chang
  2025-11-28  8:53 ` [PATCH v3 2/2] blk-mq: fix potential uaf for 'queue_hw_ctx' Fengnan Chang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Fengnan Chang @ 2025-11-28  8:53 UTC (permalink / raw)
  To: axboe, linux-block, ming.lei, hare, hch, yukuai; +Cc: Fengnan Chang

After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
an xarray instead of an array to store hctx. But in poll mode, every call
to blk_mq_poll needs xa_load to find the corresponding hctx, which
introduces some cost. In my test, xa_load may cost 3.8% of CPU.

This patch reverts the previous change, eliminating the overhead of
xa_load, and can result in a 3% performance improvement.
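
As a minimal sketch of the hot-path difference (this is the
blk_mq_poll() change from the diff below; 'cookie' is the hctx index):

	/* before: per-poll xarray lookup, ~3.8% of CPU in my test */
	hctx = xa_load(&q->hctx_table, cookie);

	/* after: direct array index */
	hctx = q->queue_hw_ctx[cookie];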

Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
---
 block/blk-mq-tag.c     |  2 +-
 block/blk-mq.c         | 58 +++++++++++++++++++++++++++---------------
 block/blk-mq.h         |  2 +-
 include/linux/blk-mq.h |  3 ++-
 include/linux/blkdev.h |  2 +-
 5 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 5b664dbdf655..33946cdb5716 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -499,7 +499,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
 	int srcu_idx;
 
 	/*
-	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
+	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
 	 * while the queue is frozen. So we can use q_usage_counter to avoid
 	 * racing with it.
 	 */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d626d32f6e57..eed12fab3484 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -723,7 +723,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	 * If not tell the caller that it should skip this queue.
 	 */
 	ret = -EXDEV;
-	data.hctx = xa_load(&q->hctx_table, hctx_idx);
+	data.hctx = q->queue_hw_ctx[hctx_idx];
 	if (!blk_mq_hw_queue_mapped(data.hctx))
 		goto out_queue_exit;
 	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
@@ -3935,8 +3935,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 			blk_free_flush_queue_callback);
 	hctx->fq = NULL;
 
-	xa_erase(&q->hctx_table, hctx_idx);
-
 	spin_lock(&q->unused_hctx_lock);
 	list_add(&hctx->hctx_list, &q->unused_hctx_list);
 	spin_unlock(&q->unused_hctx_lock);
@@ -3978,14 +3976,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
 				hctx->numa_node))
 		goto exit_hctx;
 
-	if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
-		goto exit_flush_rq;
-
 	return 0;
 
- exit_flush_rq:
-	if (set->ops->exit_request)
-		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
  exit_hctx:
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
@@ -4374,7 +4366,7 @@ void blk_mq_release(struct request_queue *q)
 		kobject_put(&hctx->kobj);
 	}
 
-	xa_destroy(&q->hctx_table);
+	kfree(q->queue_hw_ctx);
 
 	/*
 	 * release .mq_kobj and sw queue's kobject now because
@@ -4518,26 +4510,44 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
 static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 				     struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i, j;
+	int i, j, end;
+	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
+
+	if (q->nr_hw_queues < set->nr_hw_queues) {
+		struct blk_mq_hw_ctx **new_hctxs;
+
+		new_hctxs = kcalloc_node(set->nr_hw_queues,
+				       sizeof(*new_hctxs), GFP_KERNEL,
+				       set->numa_node);
+		if (!new_hctxs)
+			return;
+		if (hctxs)
+			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
+			       sizeof(*hctxs));
+		q->queue_hw_ctx = new_hctxs;
+		kfree(hctxs);
+		hctxs = new_hctxs;
+	}
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		int old_node;
 		int node = blk_mq_get_hctx_node(set, i);
-		struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
+		struct blk_mq_hw_ctx *old_hctx = hctxs[i];
 
 		if (old_hctx) {
 			old_node = old_hctx->numa_node;
 			blk_mq_exit_hctx(q, set, old_hctx, i);
 		}
 
-		if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+		hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node);
+		if (!hctxs[i]) {
 			if (!old_hctx)
 				break;
 			pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
 					node, old_node);
-			hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
-			WARN_ON_ONCE(!hctx);
+			hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i,
+					old_node);
+			WARN_ON_ONCE(!hctxs[i]);
 		}
 	}
 	/*
@@ -4546,13 +4556,21 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	 */
 	if (i != set->nr_hw_queues) {
 		j = q->nr_hw_queues;
+		end = i;
 	} else {
 		j = i;
+		end = q->nr_hw_queues;
 		q->nr_hw_queues = set->nr_hw_queues;
 	}
 
-	xa_for_each_start(&q->hctx_table, j, hctx, j)
-		blk_mq_exit_hctx(q, set, hctx, j);
+	for (; j < end; j++) {
+		struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+		if (hctx) {
+			blk_mq_exit_hctx(q, set, hctx, j);
+			hctxs[j] = NULL;
+		}
+	}
 }
 
 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
@@ -4588,8 +4606,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	INIT_LIST_HEAD(&q->unused_hctx_list);
 	spin_lock_init(&q->unused_hctx_lock);
 
-	xa_init(&q->hctx_table);
-
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
 		goto err_hctxs;
@@ -5168,7 +5184,7 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
 {
 	if (!blk_mq_can_poll(q))
 		return 0;
-	return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
+	return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags);
 }
 
 int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c4fccdeb5441..80a3f0c2bce7 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -84,7 +84,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
 							  enum hctx_type type,
 							  unsigned int cpu)
 {
-	return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
+	return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
 }
 
 static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b25d12545f46..0795f29dd65d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1000,7 +1000,8 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 }
 
 #define queue_for_each_hw_ctx(q, hctx, i)				\
-	xa_for_each(&(q)->hctx_table, (i), (hctx))
+	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
+	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
 
 #define hctx_for_each_ctx(hctx, ctx, i)					\
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 70b671a9a7f7..56328080ca09 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -493,7 +493,7 @@ struct request_queue {
 
 	/* hw dispatch queues */
 	unsigned int		nr_hw_queues;
-	struct xarray		hctx_table;
+	struct blk_mq_hw_ctx	**queue_hw_ctx;
 
 	struct percpu_ref	q_usage_counter;
 	struct lock_class_key	io_lock_cls_key;
-- 
2.39.5 (Apple Git-154)



* [PATCH v3 2/2] blk-mq: fix potential uaf for 'queue_hw_ctx'
  2025-11-28  8:53 [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray Fengnan Chang
  2025-11-28  8:53 ` [PATCH v3 1/2] " Fengnan Chang
@ 2025-11-28  8:53 ` Fengnan Chang
  2025-11-28 16:20   ` Jens Axboe
  2025-11-28  9:54 ` [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray Yu Kuai
  2025-11-28 16:22 ` Jens Axboe
  3 siblings, 1 reply; 9+ messages in thread
From: Fengnan Chang @ 2025-11-28  8:53 UTC (permalink / raw)
  To: axboe, linux-block, ming.lei, hare, hch, yukuai; +Cc: Fengnan Chang, Yu Kuai

This just applies Kuai's patch from [1] with minor changes.

blk_mq_realloc_hw_ctxs() can free 'queue_hw_ctx' (e.g. when updating
submit_queues through configfs for null_blk) while it might still be
used from another context (e.g. when switching the elevator to none):

t1					t2
elevator_switch
 blk_mq_unquiesce_queue
  blk_mq_run_hw_queues
   queue_for_each_hw_ctx
    // assembly code for hctx = (q)->queue_hw_ctx[i]
    mov    0x48(%rbp),%rdx -> read old queue_hw_ctx

					__blk_mq_update_nr_hw_queues
					 blk_mq_realloc_hw_ctxs
					  hctxs = q->queue_hw_ctx
					  q->queue_hw_ctx = new_hctxs
					  kfree(hctxs)
    movslq %ebx,%rax
    mov    (%rdx,%rax,8),%rdi ->uaf

This problem was found by code review, and I confirmed that the
concurrent scenario does exist (specifically, 'q->queue_hw_ctx' can be
changed during blk_mq_run_hw_queues()); however, the uaf hasn't been
reproduced yet without hacking the kernel.

Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the
problem by protecting 'queue_hw_ctx' through rcu where it can be
accessed without grabbing 'q_usage_counter'.

[1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
---
 block/blk-mq.c         |  7 ++++++-
 include/linux/blk-mq.h | 13 ++++++++++++-
 include/linux/blkdev.h |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index eed12fab3484..0b8b72194003 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4524,7 +4524,12 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		if (hctxs)
 			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
 			       sizeof(*hctxs));
-		q->queue_hw_ctx = new_hctxs;
+		rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
+		/*
+		 * Make sure reading the old queue_hw_ctx from other
+		 * context concurrently won't trigger uaf.
+		 */
+		synchronize_rcu_expedited();
 		kfree(hctxs);
 		hctxs = new_hctxs;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0795f29dd65d..c16875b35521 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -999,9 +999,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	return rq + 1;
 }
 
+static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	rcu_read_lock();
+	hctx = rcu_dereference(q->queue_hw_ctx)[id];
+	rcu_read_unlock();
+
+	return hctx;
+}
+
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
-	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
+	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)
 
 #define hctx_for_each_ctx(hctx, ctx, i)					\
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56328080ca09..e25d9802e08b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -493,7 +493,7 @@ struct request_queue {
 
 	/* hw dispatch queues */
 	unsigned int		nr_hw_queues;
-	struct blk_mq_hw_ctx	**queue_hw_ctx;
+	struct blk_mq_hw_ctx * __rcu *queue_hw_ctx;
 
 	struct percpu_ref	q_usage_counter;
 	struct lock_class_key	io_lock_cls_key;
-- 
2.39.5 (Apple Git-154)



* Re: [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray
  2025-11-28  8:53 [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray Fengnan Chang
  2025-11-28  8:53 ` [PATCH v3 1/2] " Fengnan Chang
  2025-11-28  8:53 ` [PATCH v3 2/2] blk-mq: fix potential uaf for 'queue_hw_ctx' Fengnan Chang
@ 2025-11-28  9:54 ` Yu Kuai
  2025-11-28 10:00   ` fengnan chang
  2025-11-28 16:22 ` Jens Axboe
  3 siblings, 1 reply; 9+ messages in thread
From: Yu Kuai @ 2025-11-28  9:54 UTC (permalink / raw)
  To: Fengnan Chang, axboe, linux-block, ming.lei, hare, hch, Yu Kuai
  Cc: Fengnan Chang

Hi,

On 2025/11/28 16:53, Fengnan Chang wrote:
> After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
> an xarray instead of an array to store hctx. But in poll mode, every call
> to blk_mq_poll needs xa_load to find the corresponding hctx, which
> introduces some cost. In my test, xa_load may cost 3.8% of CPU.
>
> Reverting the previous change eliminates the overhead of xa_load and can
> result in a 3% performance improvement.
>
> The potential use-after-free on q->queue_hw_ctx can be avoided by using
> rcu, the same way Yu Kuai did in [1].

Hope I'm not too late to the party. I'm not against this set, I just
wonder: have we considered storing the hctx directly in the bio for
blk-mq devices, or are we strongly against increasing the bio size?

>
> [1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/
>
> v3:
> fix build error and some sparse warnings (not all of them: since the
> queue is frozen in __blk_mq_update_nr_hw_queues, 'queue_hw_ctx' only
> needs rcu protection where it can be accessed without grabbing
> 'q_usage_counter').
>
> v2:
> 1. switch synchronize_rcu() to synchronize_rcu_expedited()
> 2. use rcu_dereference(q->queue_hw_ctx)[id] in queue_hctx for better
>    readability.
>
> Fengnan Chang (2):
>    blk-mq: use array manage hctx map instead of xarray
>    blk-mq: fix potential uaf for 'queue_hw_ctx'
>
>   block/blk-mq-tag.c     |  2 +-
>   block/blk-mq.c         | 63 ++++++++++++++++++++++++++++--------------
>   block/blk-mq.h         |  2 +-
>   include/linux/blk-mq.h | 14 +++++++++-
>   include/linux/blkdev.h |  2 +-
>   5 files changed, 58 insertions(+), 25 deletions(-)
>
>
> base-commit: 4941a17751c99e17422be743c02c923ad706f888

-- 
Thanks,
Kuai


* Re: [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray
  2025-11-28  9:54 ` [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray Yu Kuai
@ 2025-11-28 10:00   ` fengnan chang
  2025-11-28 16:09     ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: fengnan chang @ 2025-11-28 10:00 UTC (permalink / raw)
  To: yukuai; +Cc: axboe, linux-block, ming.lei, hare, hch, Fengnan Chang

On Fri, Nov 28, 2025 at 5:55 PM Yu Kuai <yukuai@fnnas.com> wrote:
>
> Hi,
>
> On 2025/11/28 16:53, Fengnan Chang wrote:
> > After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
> > an xarray instead of an array to store hctx. But in poll mode, every call
> > to blk_mq_poll needs xa_load to find the corresponding hctx, which
> > introduces some cost. In my test, xa_load may cost 3.8% of CPU.
> >
> > Reverting the previous change eliminates the overhead of xa_load and can
> > result in a 3% performance improvement.
> >
> > The potential use-after-free on q->queue_hw_ctx can be avoided by using
> > rcu, the same way Yu Kuai did in [1].
>
> Hope I'm not too late to the party. I'm not against this set, I just
> wonder: have we considered storing the hctx directly in the bio for
> blk-mq devices, or are we strongly against increasing the bio size?

I've thought about putting the hctx in the bio; it's a better way, but
it would increase the size of the bio, which is now exactly 128 bytes.
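(With 64-byte cache lines, 128 bytes is exactly two lines; adding a
pointer makes it 136 bytes and spills every bio into a third line, so
the cost isn't just 8 bytes.)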

>
> >
> > [1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/
> >
> > v3:
> > fix build error and part sparse warnings, not all sparse warnings, because
> > the queue is freezed in __blk_mq_update_nr_hw_queues, only need protect
> > 'queue_hw_ctx' through rcu where it can be accessed without grabbing
> > 'q_usage_counter'.
> >
> > v2:
> > 1. modify synchronize_rcu() to synchronize_rcu_expedited()
> > 2. use rcu_dereference(q->queue_hw_ctx)[id] in queue_hctx to better read.
> >
> > Fengnan Chang (2):
> >    blk-mq: use array manage hctx map instead of xarray
> >    blk-mq: fix potential uaf for 'queue_hw_ctx'
> >
> >   block/blk-mq-tag.c     |  2 +-
> >   block/blk-mq.c         | 63 ++++++++++++++++++++++++++++--------------
> >   block/blk-mq.h         |  2 +-
> >   include/linux/blk-mq.h | 14 +++++++++-
> >   include/linux/blkdev.h |  2 +-
> >   5 files changed, 58 insertions(+), 25 deletions(-)
> >
> >
> > base-commit: 4941a17751c99e17422be743c02c923ad706f888
>
> --
> Thanks,
> Kuai


* Re: [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray
  2025-11-28 10:00   ` fengnan chang
@ 2025-11-28 16:09     ` Jens Axboe
  0 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2025-11-28 16:09 UTC (permalink / raw)
  To: fengnan chang, yukuai; +Cc: linux-block, ming.lei, hare, hch, Fengnan Chang

On 11/28/25 3:00 AM, fengnan chang wrote:
> On Fri, Nov 28, 2025 at 5:55 PM Yu Kuai <yukuai@fnnas.com> wrote:
>>
>> Hi,
>>
>> On 2025/11/28 16:53, Fengnan Chang wrote:
>>> After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
>>> an xarray instead of an array to store hctx. But in poll mode, every call
>>> to blk_mq_poll needs xa_load to find the corresponding hctx, which
>>> introduces some cost. In my test, xa_load may cost 3.8% of CPU.
>>>
>>> Reverting the previous change eliminates the overhead of xa_load and can
>>> result in a 3% performance improvement.
>>>
>>> The potential use-after-free on q->queue_hw_ctx can be avoided by using
>>> rcu, the same way Yu Kuai did in [1].
>>
>> Hope I'm not too late to the party. I'm not against this set, I just
>> wonder: have we considered storing the hctx directly in the bio for
>> blk-mq devices, or are we strongly against increasing the bio size?
> 
> I've thought about putting the hctx in the bio; it's a better way, but
> it would increase the size of the bio, which is now exactly 128 bytes.

Don't think it's worth it for polled IO.

-- 
Jens Axboe



* Re: [PATCH v3 2/2] blk-mq: fix potential uaf for 'queue_hw_ctx'
  2025-11-28  8:53 ` [PATCH v3 2/2] blk-mq: fix potential uaf for 'queue_hw_ctx' Fengnan Chang
@ 2025-11-28 16:20   ` Jens Axboe
  2025-12-01 12:26     ` fengnan chang
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2025-11-28 16:20 UTC (permalink / raw)
  To: Fengnan Chang, linux-block, ming.lei, hare, hch, yukuai
  Cc: Fengnan Chang, Yu Kuai

On 11/28/25 1:53 AM, Fengnan Chang wrote:
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 0795f29dd65d..c16875b35521 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -999,9 +999,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
>  	return rq + 1;
>  }
>  
> +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +
> +	rcu_read_lock();
> +	hctx = rcu_dereference(q->queue_hw_ctx)[id];
> +	rcu_read_unlock();
> +
> +	return hctx;
> +}

Should e.g. blk_mq_map_queue_type() use this helper now too?

Note: I've applied this, so anything beyond this v3 should be an
incremental against the current tree.

-- 
Jens Axboe


* Re: [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray
  2025-11-28  8:53 [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray Fengnan Chang
                   ` (2 preceding siblings ...)
  2025-11-28  9:54 ` [PATCH v3 0/2] blk-mq: use array manage hctx map instead of xarray Yu Kuai
@ 2025-11-28 16:22 ` Jens Axboe
  3 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2025-11-28 16:22 UTC (permalink / raw)
  To: linux-block, ming.lei, hare, hch, yukuai, Fengnan Chang; +Cc: Fengnan Chang


On Fri, 28 Nov 2025 16:53:12 +0800, Fengnan Chang wrote:
> After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
> an xarray instead of an array to store hctx. But in poll mode, every call
> to blk_mq_poll needs xa_load to find the corresponding hctx, which
> introduces some cost. In my test, xa_load may cost 3.8% of CPU.
>
> Reverting the previous change eliminates the overhead of xa_load and can
> result in a 3% performance improvement.
> 
> [...]

Applied, thanks!

[1/2] blk-mq: use array manage hctx map instead of xarray
      commit: d0c98769ee7d5db8d699a270690639cde1766cd4
[2/2] blk-mq: fix potential uaf for 'queue_hw_ctx'
      commit: 89e1fb7ceffd898505ad7fa57acec0585bfaa2cc

Best regards,
-- 
Jens Axboe





* Re: [PATCH v3 2/2] blk-mq: fix potential uaf for 'queue_hw_ctx'
  2025-11-28 16:20   ` Jens Axboe
@ 2025-12-01 12:26     ` fengnan chang
  0 siblings, 0 replies; 9+ messages in thread
From: fengnan chang @ 2025-12-01 12:26 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, ming.lei, hare, hch, yukuai, Fengnan Chang, Yu Kuai

On Sat, Nov 29, 2025 at 12:20 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 11/28/25 1:53 AM, Fengnan Chang wrote:
> > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> > index 0795f29dd65d..c16875b35521 100644
> > --- a/include/linux/blk-mq.h
> > +++ b/include/linux/blk-mq.h
> > @@ -999,9 +999,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
> >       return rq + 1;
> >  }
> >
> > +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
> > +{
> > +     struct blk_mq_hw_ctx *hctx;
> > +
> > +     rcu_read_lock();
> > +     hctx = rcu_dereference(q->queue_hw_ctx)[id];
> > +     rcu_read_unlock();
> > +
> > +     return hctx;
> > +}
>
> Should eg blk_mq_map_queue_type() use this helper now too?

You are right, some callers of blk_mq_map_queue_type() don't grab
'q_usage_counter' now, such as blk_mq_cpu_mapped_to_hctx(). I also
checked all the other functions; there are no more missed cases.

New patch:
https://lore.kernel.org/linux-block/20251201122504.64439-1-changfengnan@bytedance.com/T/#u
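
Presumably the incremental is roughly this (sketch only, see the link
above for the actual patch):

-	return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
+	return queue_hctx(q, q->tag_set->map[type].mq_map[cpu]);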


Thanks.

>
> Note: I've applied this, so anything beyond this v3 should be an
> incremental against the current tree.
>
> --
> Jens Axboe

