From: Ming Lei <ming.lei@redhat.com>
To: Hannes Reinecke <hare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>,
linux-block@vger.kernel.org, Yu Kuai <yukuai3@huawei.com>,
Christoph Hellwig <hch@lst.de>
Subject: Re: [PATCH V3 6/6] blk-mq: manage hctx map via xarray
Date: Mon, 7 Mar 2022 15:49:22 +0800 [thread overview]
Message-ID: <YiW5Apu5V85bgtM3@T590> (raw)
In-Reply-To: <065432ee-7e1b-8c21-4536-2c4a7bb6734b@suse.de>
On Mon, Mar 07, 2022 at 08:44:24AM +0100, Hannes Reinecke wrote:
> On 3/7/22 07:44, Ming Lei wrote:
> > First, the code becomes cleaner by switching from a plain array to an xarray.
> >
> > Second, this fixes a use-after-free on q->queue_hw_ctx, which can occur
> > because queue_for_each_hw_ctx() may run while an update of nr_hw_queues
> > is in progress. With this patch, q->hctx_table is defined as an xarray,
> > and this structure shares the same lifetime as the request queue, so
> > queue_for_each_hw_ctx() can use q->hctx_table to look up hctxs reliably.
> >
> > Reported-by: Yu Kuai <yukuai3@huawei.com>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > block/blk-mq-tag.c | 2 +-
> > block/blk-mq.c | 55 ++++++++++++++++++------------------------
> > block/blk-mq.h | 2 +-
> > include/linux/blk-mq.h | 3 +--
> > include/linux/blkdev.h | 2 +-
> > 5 files changed, 28 insertions(+), 36 deletions(-)
> >
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 1850a4225e12..68ac23d0b640 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
> > void *priv)
> > {
> > /*
> > - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
> > + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
> > * while the queue is frozen. So we can use q_usage_counter to avoid
> > * racing with it.
> > */
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index bffdd71c670d..a15d12fb227c 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
> > static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
> > blk_qc_t qc)
> > {
> > - return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
> > + return xa_load(&q->hctx_table,
> > + (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
> > }
> > static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
> > @@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
> > * If not tell the caller that it should skip this queue.
> > */
> > ret = -EXDEV;
> > - data.hctx = q->queue_hw_ctx[hctx_idx];
> > + data.hctx = xa_load(&q->hctx_table, hctx_idx);
> > if (!blk_mq_hw_queue_mapped(data.hctx))
> > goto out_queue_exit;
> > cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
> > @@ -3437,6 +3438,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
> > blk_mq_remove_cpuhp(hctx);
> > + xa_erase(&q->hctx_table, hctx_idx);
> > +
> > spin_lock(&q->unused_hctx_lock);
> > list_add(&hctx->hctx_list, &q->unused_hctx_list);
> > spin_unlock(&q->unused_hctx_lock);
> > @@ -3476,8 +3479,15 @@ static int blk_mq_init_hctx(struct request_queue *q,
> > if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
> > hctx->numa_node))
> > goto exit_hctx;
> > +
> > + if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
> > + goto exit_flush_rq;
> > +
> > return 0;
> > + exit_flush_rq:
> > + if (set->ops->exit_request)
> > + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
>
> Why is this here? It's not directly related to the xarray conversion, so it
> should rather go into a separate patch.
This new error handling is only needed if xa_insert() fails.
>
> > exit_hctx:
> > if (set->ops->exit_hctx)
> > set->ops->exit_hctx(hctx, hctx_idx);
> > @@ -3856,7 +3866,7 @@ void blk_mq_release(struct request_queue *q)
> > kobject_put(&hctx->kobj);
> > }
> > - kfree(q->queue_hw_ctx);
> > + xa_destroy(&q->hctx_table);
> > /*
> > * release .mq_kobj and sw queue's kobject now because
> > @@ -3946,45 +3956,28 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
> > struct request_queue *q)
> > {
> > int i, j, end;
> > - struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
> > -
> > - if (q->nr_hw_queues < set->nr_hw_queues) {
> > - struct blk_mq_hw_ctx **new_hctxs;
> > -
> > - new_hctxs = kcalloc_node(set->nr_hw_queues,
> > - sizeof(*new_hctxs), GFP_KERNEL,
> > - set->numa_node);
> > - if (!new_hctxs)
> > - return;
> > - if (hctxs)
> > - memcpy(new_hctxs, hctxs, q->nr_hw_queues *
> > - sizeof(*hctxs));
> > - q->queue_hw_ctx = new_hctxs;
> > - kfree(hctxs);
> > - hctxs = new_hctxs;
> > - }
> > /* protect against switching io scheduler */
> > mutex_lock(&q->sysfs_lock);
> > for (i = 0; i < set->nr_hw_queues; i++) {
> > int old_node;
> > int node = blk_mq_get_hctx_node(set, i);
> > - struct blk_mq_hw_ctx *old_hctx = hctxs[i];
> > + struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
> > if (old_hctx) {
> > old_node = old_hctx->numa_node;
> > blk_mq_exit_hctx(q, set, old_hctx, i);
> > }
> > - hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node);
> > - if (!hctxs[i]) {
> > + if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
> > + struct blk_mq_hw_ctx *hctx;
> > +
> > if (!old_hctx)
> > break;
> > pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
> > node, old_node);
> > - hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i,
> > - old_node);
> > - WARN_ON_ONCE(!hctxs[i]);
> > + hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
> > + WARN_ON_ONCE(!hctx);
> > }
> > }
> > /*
> > @@ -4001,12 +3994,10 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
> > }
> > for (; j < end; j++) {
> > - struct blk_mq_hw_ctx *hctx = hctxs[j];
> > + struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, j);
> > - if (hctx) {
> > + if (hctx)
> > blk_mq_exit_hctx(q, set, hctx, j);
> > - hctxs[j] = NULL;
> > - }
>
> Do you need to call 'xa_load' here? Isn't it sufficient to call
> blk_mq_exit_hctx() and have it skip any non-present entries?
As Christoph suggested, xa_for_each_range() can be used here for
exiting any present entry.
Thanks,
Ming
prev parent reply other threads:[~2022-03-07 7:49 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-03-07 6:43 [PATCH V3 0/6] blk-mq: update_nr_hw_queues related improvement & bugfix Ming Lei
2022-03-07 6:43 ` [PATCH V3 1/6] blk-mq: figure out correct numa node for hw queue Ming Lei
2022-03-07 7:34 ` Hannes Reinecke
2022-03-07 6:43 ` [PATCH V3 2/6] blk-mq: simplify reallocation of hw ctxs a bit Ming Lei
2022-03-07 7:35 ` Hannes Reinecke
2022-03-07 6:43 ` [PATCH V3 3/6] blk-mq: reconfigure poll after queue map is changed Ming Lei
2022-03-07 7:10 ` Christoph Hellwig
2022-03-07 7:36 ` Hannes Reinecke
2022-03-07 6:43 ` [PATCH V3 4/6] block: mtip32xx: don't touch q->queue_hw_ctx Ming Lei
2022-03-07 7:36 ` Hannes Reinecke
2022-03-07 6:44 ` [PATCH V3 5/6] blk-mq: prepare for implementing hctx table via xarray Ming Lei
2022-03-07 7:10 ` Christoph Hellwig
2022-03-07 7:38 ` Hannes Reinecke
2022-03-07 6:44 ` [PATCH V3 6/6] blk-mq: manage hctx map " Ming Lei
2022-03-07 7:13 ` Christoph Hellwig
2022-03-07 7:44 ` Ming Lei
2022-03-07 7:44 ` Hannes Reinecke
2022-03-07 7:49 ` Ming Lei [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=YiW5Apu5V85bgtM3@T590 \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=hare@suse.de \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=yukuai3@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.