From: Shaohua Li <shli@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: axboe@kernel.dk, hch@infradead.org, kmo@daterainc.com
Subject: [patch 2/2]blk-mq: Don't reserve a tag for flush request
Date: Tue, 31 Dec 2013 11:38:50 +0800 [thread overview]
Message-ID: <20131231033850.GB31994@kernel.org> (raw)
Reserving a tag (request) for flush to avoid dead lock is a overkill. tag is
valuable resource. We can track flush request number and disallow too many
pending flush requests allocated. With this patch,
blk_mq_alloc_request_pinned() could do a busy nop (but not a dead loop) if too
pending requests are allocated and new flush request is allocating. But this
should not be a problem, too many pending flush requests are very rare case.
I verified this can fix the deadlock caused by too many pending flush requests.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
block/blk-flush.c | 8 +++++---
block/blk-mq.c | 46 ++++++++++++++++++++++++++++++----------------
include/linux/blk-mq.h | 3 +++
3 files changed, 38 insertions(+), 19 deletions(-)
Index: linux/block/blk-flush.c
===================================================================
--- linux.orig/block/blk-flush.c 2013-12-31 11:28:24.117417629 +0800
+++ linux/block/blk-flush.c 2013-12-31 11:28:24.109417628 +0800
@@ -284,9 +284,8 @@ static void mq_flush_work(struct work_st
q = container_of(work, struct request_queue, mq_flush_work);
- /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
- __GFP_WAIT|GFP_ATOMIC, true);
+ __GFP_WAIT|GFP_ATOMIC, false);
rq->cmd_type = REQ_TYPE_FS;
rq->end_io = flush_end_io;
@@ -408,8 +407,11 @@ void blk_insert_flush(struct request *rq
/*
* @policy now records what operations need to be done. Adjust
* REQ_FLUSH and FUA for the driver.
+ * We keep REQ_FLUSH for mq to track flush requests. For !FUA,
+ * we never dispatch the request directly.
*/
- rq->cmd_flags &= ~REQ_FLUSH;
+ if (rq->cmd_flags & REQ_FUA)
+ rq->cmd_flags &= ~REQ_FLUSH;
if (!(fflags & REQ_FUA))
rq->cmd_flags &= ~REQ_FUA;
Index: linux/block/blk-mq.c
===================================================================
--- linux.orig/block/blk-mq.c 2013-12-31 11:28:24.117417629 +0800
+++ linux/block/blk-mq.c 2013-12-31 11:28:24.109417628 +0800
@@ -183,9 +183,27 @@ static void blk_mq_rq_ctx_init(struct re
}
static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
- gfp_t gfp, bool reserved)
+ gfp_t gfp, bool reserved,
+ int rw)
{
- return blk_mq_alloc_rq(hctx, gfp, reserved);
+ struct request *req;
+ bool is_flush = false;
+ /*
+ * flush need allocate a request, leave at least one request for
+ * non-flush IO to avoid deadlock
+ */
+ if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) {
+ if (atomic_inc_return(&hctx->pending_flush) >=
+ hctx->queue_depth - hctx->reserved_tags - 1) {
+ atomic_dec(&hctx->pending_flush);
+ return NULL;
+ }
+ is_flush = true;
+ }
+ req = blk_mq_alloc_rq(hctx, gfp, reserved);
+ if (!req && is_flush)
+ atomic_dec(&hctx->pending_flush);
+ return req;
}
static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
@@ -198,7 +216,7 @@ static struct request *blk_mq_alloc_requ
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
- rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
+ rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw);
if (rq) {
blk_mq_rq_ctx_init(q, ctx, rq, rw);
break;
@@ -261,6 +279,9 @@ static void __blk_mq_free_request(struct
const int tag = rq->tag;
struct request_queue *q = rq->q;
+ if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+ atomic_dec(&hctx->pending_flush);
+
blk_mq_rq_init(hctx, rq);
blk_mq_put_tag(hctx->tags, tag);
@@ -928,14 +949,14 @@ static void blk_mq_make_request(struct r
hctx = q->mq_ops->map_queue(q, ctx->cpu);
trace_block_getrq(q, bio, rw);
- rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
+ rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw);
if (likely(rq))
- blk_mq_rq_ctx_init(q, ctx, rq, rw);
+ blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw);
else {
blk_mq_put_ctx(ctx);
trace_block_sleeprq(q, bio, rw);
- rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
- false);
+ rq = blk_mq_alloc_request_pinned(q, bio->bi_rw,
+ __GFP_WAIT|GFP_ATOMIC, false);
ctx = rq->mq_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
}
@@ -1212,7 +1233,9 @@ static int blk_mq_init_hw_queues(struct
hctx->queue_num = i;
hctx->flags = reg->flags;
hctx->queue_depth = reg->queue_depth;
+ hctx->reserved_tags = reg->reserved_tags;
hctx->cmd_size = reg->cmd_size;
+ atomic_set(&hctx->pending_flush, 0);
blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
blk_mq_hctx_notify, hctx);
@@ -1337,15 +1360,6 @@ struct request_queue *blk_mq_init_queue(
reg->queue_depth = BLK_MQ_MAX_DEPTH;
}
- /*
- * Set aside a tag for flush requests. It will only be used while
- * another flush request is in progress but outside the driver.
- *
- * TODO: only allocate if flushes are supported
- */
- reg->queue_depth++;
- reg->reserved_tags++;
-
if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
return ERR_PTR(-EINVAL);
Index: linux/include/linux/blk-mq.h
===================================================================
--- linux.orig/include/linux/blk-mq.h 2013-12-31 11:28:24.117417629 +0800
+++ linux/include/linux/blk-mq.h 2013-12-31 11:28:24.109417628 +0800
@@ -36,12 +36,15 @@ struct blk_mq_hw_ctx {
struct list_head page_list;
struct blk_mq_tags *tags;
+ atomic_t pending_flush;
+
unsigned long queued;
unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 10
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
unsigned int queue_depth;
+ unsigned int reserved_tags;
unsigned int numa_node;
unsigned int cmd_size; /* per-request extra data */
next reply other threads:[~2013-12-31 3:39 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-12-31 3:38 Shaohua Li [this message]
2013-12-31 16:12 ` [patch 2/2]blk-mq: Don't reserve a tag for flush request Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20131231033850.GB31994@kernel.org \
--to=shli@kernel.org \
--cc=axboe@kernel.dk \
--cc=hch@infradead.org \
--cc=kmo@daterainc.com \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.