Re: [PATCH 6/6] blk-mq: allocate tags in batches

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Subject: Re: [PATCH 6/6] blk-mq: allocate tags in batches
Date: Wed, 15 Jan 2020 20:07:57 +0800	[thread overview]
Message-ID: <20200115120757.GA30398@ming.t460p> (raw)
In-Reply-To: <20200107163037.31745-7-axboe@kernel.dk>

On Tue, Jan 07, 2020 at 09:30:37AM -0700, Jens Axboe wrote:
> Instead of grabbing tags one by one, grab a batch and store the local
> cache in the software queue. Then subsequent tag allocations can just
> grab free tags from there, without having to hit the shared tag map.
> 
> We flush these batches out if we run out of tags on the hardware queue.
> The intent here is this should rarely happen.
> 
> This works very well in practice, with anywhere from 40-60 batch counts
> seen regularly in testing.

Could you describe your test a bit? I am just wondering whether IO
submitted from multiple tasks can still perform as well as before.

> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq-debugfs.c |  18 +++++++
>  block/blk-mq-tag.c     | 104 ++++++++++++++++++++++++++++++++++++++++-
>  block/blk-mq-tag.h     |   3 ++
>  block/blk-mq.c         |  16 +++++--
>  block/blk-mq.h         |   5 ++
>  include/linux/blk-mq.h |   2 +
>  6 files changed, 144 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index e789f830ff59..914be72d080e 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -659,6 +659,23 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
>  CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
>  CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
>  
> +static ssize_t ctx_tag_hit_write(void *data, const char __user *buf,
> +				    size_t count, loff_t *ppos)
> +{
> +	struct blk_mq_ctx *ctx = data;
> +
> +	ctx->tag_hit = ctx->tag_refill = 0;
> +	return count;
> +}
> +
> +static int ctx_tag_hit_show(void *data, struct seq_file *m)
> +{
> +	struct blk_mq_ctx *ctx = data;
> +
> +	seq_printf(m, "hit=%lu refills=%lu\n", ctx->tag_hit, ctx->tag_refill);
> +	return 0;
> +}
> +
>  static int ctx_dispatched_show(void *data, struct seq_file *m)
>  {
>  	struct blk_mq_ctx *ctx = data;
> @@ -800,6 +817,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
>  	{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
>  	{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
>  	{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
> +	{"tag_hit", 0600, ctx_tag_hit_show, ctx_tag_hit_write},
>  	{"merged", 0600, ctx_merged_show, ctx_merged_write},
>  	{"completed", 0600, ctx_completed_show, ctx_completed_write},
>  	{},
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index fbacde454718..94c1f16c6c71 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -99,6 +99,100 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
>  		return __sbitmap_queue_get(bt);
>  }
>  
> +static void ctx_flush_ipi(void *data)
> +{
> +	struct blk_mq_hw_ctx *hctx = data;
> +	struct sbitmap_queue *bt = &hctx->tags->bitmap_tags;
> +	struct blk_mq_ctx *ctx;
> +	unsigned int i;
> +
> +	ctx = __blk_mq_get_ctx(hctx->queue, smp_processor_id());
> +
> +	for (i = 0; i < hctx->queue->tag_set->nr_maps; i++) {
> +		struct blk_mq_ctx_type *type = &ctx->type[i];
> +
> +		if (!type->tags)
> +			continue;
> +
> +		__sbitmap_queue_clear_batch(bt, type->tag_offset, type->tags);
> +		type->tags = 0;
> +	}
> +	atomic_dec(&hctx->flush_pending);
> +}
> +
> +void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
> +				struct blk_mq_ctx *ctx)
> +{
> +	atomic_inc(&hctx->flush_pending);
> +	smp_call_function_single(ctx->cpu, ctx_flush_ipi, hctx, false);
> +}
> +
> +static void blk_mq_tag_flush_batches(struct blk_mq_hw_ctx *hctx)
> +{
> +	if (atomic_cmpxchg(&hctx->flush_pending, 0, hctx->nr_ctx))
> +		return;
> +	preempt_disable();
> +	if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
> +		ctx_flush_ipi(hctx);
> +	smp_call_function_many(hctx->cpumask, ctx_flush_ipi, hctx, false);
> +	preempt_enable();
> +}
> +
> +void blk_mq_tag_queue_flush_batches(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int i;
> +
> +	queue_for_each_hw_ctx(q, hctx, i)
> +		blk_mq_tag_flush_batches(hctx);
> +}
> +
> +static int blk_mq_get_tag_batch(struct blk_mq_alloc_data *data)
> +{
> +	struct blk_mq_hw_ctx *hctx = data->hctx;
> +	struct blk_mq_ctx_type *type;
> +	struct blk_mq_ctx *ctx = data->ctx;
> +	struct blk_mq_tags *tags;
> +	struct sbitmap_queue *bt;
> +	int tag = -1;
> +
> +	if (!ctx || (data->flags & BLK_MQ_REQ_INTERNAL))
> +		return -1;
> +
> +	tags = hctx->tags;
> +	bt = &tags->bitmap_tags;
> +	/* don't do batches for round-robin or (very) sparse maps */
> +	if (bt->round_robin || bt->sb.shift < ilog2(BITS_PER_LONG) - 1)
> +		return -1;
> +
> +	/* we could make do with preempt disable, but we need to block flush */
> +	local_irq_disable();
> +	if (unlikely(ctx->cpu != smp_processor_id()))
> +		goto out;
> +
> +	type = &ctx->type[hctx->type];
> +
> +	if (type->tags) {
> +get_tag:
> +		ctx->tag_hit++;
> +
> +		tag = __ffs(type->tags);
> +		type->tags &= ~(1UL << tag);
> +		tag += type->tag_offset;
> +out:
> +		local_irq_enable();
> +		return tag;
> +	}
> +
> +	/* no current tag cache, attempt to refill a batch */
> +	if (!__sbitmap_queue_get_batch(bt, &type->tag_offset, &type->tags)) {
> +		ctx->tag_refill++;
> +		goto get_tag;
> +	}
> +
> +	goto out;
> +}
> +
>  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  {
>  	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
> @@ -116,8 +210,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  		bt = &tags->breserved_tags;
>  		tag_offset = 0;
>  	} else {
> -		bt = &tags->bitmap_tags;
>  		tag_offset = tags->nr_reserved_tags;
> +
> +		tag = blk_mq_get_tag_batch(data);
> +		if (tag != -1)
> +			goto found_tag;
> +
> +		bt = &tags->bitmap_tags;
>  	}
>  
>  	tag = __blk_mq_get_tag(data, bt);
> @@ -152,6 +251,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  		if (tag != -1)
>  			break;
>  
> +		if (!(data->flags & BLK_MQ_REQ_RESERVED))
> +			blk_mq_tag_flush_batches(data->hctx);
> +
>  		bt_prev = bt;
>  		io_schedule();
>  
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 15bc74acb57e..b5964fff1630 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -34,6 +34,9 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>  extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
>  void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
>  		void *priv);
> +void blk_mq_tag_queue_flush_batches(struct request_queue *q);
> +void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
> +				struct blk_mq_ctx *ctx);
>  
>  static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
>  						 struct blk_mq_hw_ctx *hctx)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index cc48a0ffa5ec..81140f61a7c9 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2255,6 +2255,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>  	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
>  	type = hctx->type;
>  
> +	blk_mq_tag_ctx_flush_batch(hctx, ctx);

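When blk_mq_hctx_notify_dead() is called, 'cpu' is already offline, so
the flush via IPI will not work as expected: smp_call_function_single()
fails with -ENXIO for an offline CPU, so ctx_flush_ipi() never runs for
that ctx. Its cached tags are then never returned to the tag map, and
hctx->flush_pending, which was incremented before the call, is never
decremented, so later blk_mq_tag_flush_batches() calls on this hctx
return early.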


Thanks,
Ming

     prev parent reply	other threads:[~2020-01-15 12:08 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-07 16:30 [PATCHSET v2 0/6] blk-mq: per-ctx tag caching Jens Axboe
2020-01-07 16:30 ` [PATCH 1/6] sbitmap: remove cleared bitmask Jens Axboe
2020-01-07 16:30 ` [PATCH 2/6] sbitmap: add batch tag retrieval Jens Axboe
2020-01-07 16:30 ` [PATCH 3/6] blk-mq: remove 'clear_ctx_on_error' Jens Axboe
2020-01-07 16:30 ` [PATCH 4/6] blk-mq: remove ctx->queue Jens Axboe
2020-01-07 16:30 ` [PATCH 5/6] blk-mq: add struct blk_mq_ctx_type Jens Axboe
2020-01-07 16:30 ` [PATCH 6/6] blk-mq: allocate tags in batches Jens Axboe
2020-01-15 12:07   ` Ming Lei [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200115120757.GA30398@ming.t460p \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.