[patch 2/2]blk-mq: Don't reserve a tag for flush request

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Shaohua Li <shli@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: axboe@kernel.dk, hch@infradead.org, kmo@daterainc.com
Subject: [patch 2/2]blk-mq: Don't reserve a tag for flush request
Date: Tue, 31 Dec 2013 11:38:50 +0800	[thread overview]
Message-ID: <20131231033850.GB31994@kernel.org> (raw)


Reserving a tag (request) for flush to avoid deadlock is overkill; tags are a
valuable resource. Instead, we can track the number of flush requests and refuse
to allocate too many pending flush requests. With this patch,
blk_mq_alloc_request_pinned() may busy-loop (but not loop forever) if too many
pending flush requests are already allocated when a new flush request is being
allocated. This should not be a problem, since having too many pending flush
requests is a very rare case.

I verified this can fix the deadlock caused by too many pending flush requests.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 block/blk-flush.c      |    8 +++++---
 block/blk-mq.c         |   46 ++++++++++++++++++++++++++++++----------------
 include/linux/blk-mq.h |    3 +++
 3 files changed, 38 insertions(+), 19 deletions(-)

Index: linux/block/blk-flush.c
===================================================================
--- linux.orig/block/blk-flush.c	2013-12-31 11:28:24.117417629 +0800
+++ linux/block/blk-flush.c	2013-12-31 11:28:24.109417628 +0800
@@ -284,9 +284,8 @@ static void mq_flush_work(struct work_st
 
 	q = container_of(work, struct request_queue, mq_flush_work);
 
-	/* We don't need set REQ_FLUSH_SEQ, it's for consistency */
 	rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
-		__GFP_WAIT|GFP_ATOMIC, true);
+		__GFP_WAIT|GFP_ATOMIC, false);
 	rq->cmd_type = REQ_TYPE_FS;
 	rq->end_io = flush_end_io;
 
@@ -408,8 +407,11 @@ void blk_insert_flush(struct request *rq
 	/*
 	 * @policy now records what operations need to be done.  Adjust
 	 * REQ_FLUSH and FUA for the driver.
+	 * We keep REQ_FLUSH for mq to track flush requests. For !FUA,
+	 * we never dispatch the request directly.
 	 */
-	rq->cmd_flags &= ~REQ_FLUSH;
+	if (rq->cmd_flags & REQ_FUA)
+		rq->cmd_flags &= ~REQ_FLUSH;
 	if (!(fflags & REQ_FUA))
 		rq->cmd_flags &= ~REQ_FUA;
 
Index: linux/block/blk-mq.c
===================================================================
--- linux.orig/block/blk-mq.c	2013-12-31 11:28:24.117417629 +0800
+++ linux/block/blk-mq.c	2013-12-31 11:28:24.109417628 +0800
@@ -183,9 +183,27 @@ static void blk_mq_rq_ctx_init(struct re
 }
 
 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
-					      gfp_t gfp, bool reserved)
+					      gfp_t gfp, bool reserved,
+					      int rw)
 {
-	return blk_mq_alloc_rq(hctx, gfp, reserved);
+	struct request *req;
+	bool is_flush = false;
+	/*
+	 * A flush needs to allocate a request; leave at least one request
+	 * for non-flush IO to avoid deadlock.
+	 */
+	if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) {
+		if (atomic_inc_return(&hctx->pending_flush) >=
+		    hctx->queue_depth - hctx->reserved_tags - 1) {
+			atomic_dec(&hctx->pending_flush);
+			return NULL;
+		}
+		is_flush = true;
+	}
+	req = blk_mq_alloc_rq(hctx, gfp, reserved);
+	if (!req && is_flush)
+		atomic_dec(&hctx->pending_flush);
+	return req;
 }
 
 static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
@@ -198,7 +216,7 @@ static struct request *blk_mq_alloc_requ
 		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
+		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw);
 		if (rq) {
 			blk_mq_rq_ctx_init(q, ctx, rq, rw);
 			break;
@@ -261,6 +279,9 @@ static void __blk_mq_free_request(struct
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
+	if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+		atomic_dec(&hctx->pending_flush);
+
 	blk_mq_rq_init(hctx, rq);
 	blk_mq_put_tag(hctx->tags, tag);
 
@@ -928,14 +949,14 @@ static void blk_mq_make_request(struct r
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
 	trace_block_getrq(q, bio, rw);
-	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
+	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw);
 	if (likely(rq))
-		blk_mq_rq_ctx_init(q, ctx, rq, rw);
+		blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw);
 	else {
 		blk_mq_put_ctx(ctx);
 		trace_block_sleeprq(q, bio, rw);
-		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
-							false);
+		rq = blk_mq_alloc_request_pinned(q, bio->bi_rw,
+				__GFP_WAIT|GFP_ATOMIC, false);
 		ctx = rq->mq_ctx;
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
 	}
@@ -1212,7 +1233,9 @@ static int blk_mq_init_hw_queues(struct
 		hctx->queue_num = i;
 		hctx->flags = reg->flags;
 		hctx->queue_depth = reg->queue_depth;
+		hctx->reserved_tags = reg->reserved_tags;
 		hctx->cmd_size = reg->cmd_size;
+		atomic_set(&hctx->pending_flush, 0);
 
 		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
 						blk_mq_hctx_notify, hctx);
@@ -1337,15 +1360,6 @@ struct request_queue *blk_mq_init_queue(
 		reg->queue_depth = BLK_MQ_MAX_DEPTH;
 	}
 
-	/*
-	 * Set aside a tag for flush requests.  It will only be used while
-	 * another flush request is in progress but outside the driver.
-	 *
-	 * TODO: only allocate if flushes are supported
-	 */
-	reg->queue_depth++;
-	reg->reserved_tags++;
-
 	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
 		return ERR_PTR(-EINVAL);
 
Index: linux/include/linux/blk-mq.h
===================================================================
--- linux.orig/include/linux/blk-mq.h	2013-12-31 11:28:24.117417629 +0800
+++ linux/include/linux/blk-mq.h	2013-12-31 11:28:24.109417628 +0800
@@ -36,12 +36,15 @@ struct blk_mq_hw_ctx {
 	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
+	atomic_t		pending_flush;
+
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		queue_depth;
+	unsigned int		reserved_tags;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */

next             reply	other threads:[~2013-12-31  3:39 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-12-31  3:38 Shaohua Li [this message]
2013-12-31 16:12 ` [patch 2/2]blk-mq: Don't reserve a tag for flush request Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20131231033850.GB31994@kernel.org \
    --to=shli@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=hch@infradead.org \
    --cc=kmo@daterainc.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.