From: Jens Axboe <axboe@fb.com>
To: <axboe@kernel.dk>, <linux-block@vger.kernel.org>,
<linux-fsdevel@vger.kernel.org>
Cc: <hch@infradead.org>, Jens Axboe <axboe@fb.com>
Subject: [PATCH 3/3] blk-mq: make the polling code adaptive
Date: Fri, 11 Nov 2016 22:11:27 -0700
Message-ID: <1478927487-12998-4-git-send-email-axboe@fb.com>
In-Reply-To: <1478927487-12998-1-git-send-email-axboe@fb.com>

The previous commit introduced the hybrid sleep/poll mode. Take
that one step further, and use the completion latencies to
automatically sleep for half the mean completion time. This is
a good approximation.

This changes the 'io_poll_delay' sysfs file a bit to expose the
various options. Depending on the value, the polling code will
behave differently:

-1	Never enter hybrid sleep mode
 0	Use half of the completion mean for the sleep delay
>0	Use this specific value as the sleep delay
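
For example, with the adaptive setting, if the mean read completion
time in the current stats window is 8 usec, blk_mq_poll_nsecs() picks
roughly half of that (about 4 usec), so the task sleeps for about half
the expected completion time before falling back to busy polling.

As a quick illustration (not part of the patch), a minimal userspace
sketch for selecting the adaptive mode could look like the following;
the device name nvme0n1 and the error handling are assumptions for the
example. The value written is interpreted in microseconds, matching
queue_poll_delay_store() below:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/block/nvme0n1/queue/io_poll_delay";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/*
		 * -1: never hybrid sleep, classic busy polling only
		 *  0: adaptive, sleep for half the mean completion time
		 * >0: fixed pre-poll sleep time, in usec
		 */
		if (write(fd, "0", 1) != 1)
			perror("write");

		close(fd);
		return 0;
	}

The same effect can be had from the shell with
'echo 0 > /sys/block/<dev>/queue/io_poll_delay'.
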
Signed-off-by: Jens Axboe <axboe@fb.com>
---
block/blk-mq.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++----
block/blk-sysfs.c | 26 ++++++++++++------
include/linux/blkdev.h | 2 +-
3 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2c77a2da123a..70b1b59ed0d3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2125,6 +2125,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
*/
q->nr_requests = set->queue_depth;
+ /*
+ * Default to classic polling
+ */
+ q->poll_nsec = -1;
+
if (set->ops->complete)
blk_queue_softirq_done(q, set->ops->complete);
@@ -2462,13 +2467,70 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ struct blk_rq_stat stat[2];
+ unsigned long ret = 0;
+
+ /*
+ * If stats collection isn't on, don't sleep but turn it on for
+ * future users
+ */
+ if (!blk_stat_enable(q))
+ return 0;
+
+ /*
+ * We don't have to do this once per IO, should optimize this
+ * to just use the current window of stats until it changes
+ */
+ memset(&stat, 0, sizeof(stat));
+ blk_hctx_stat_get(hctx, stat);
+
+ /*
+ * As an optimistic guess, use half of the mean service time
+ * for this type of request. We can (and should) make this smarter.
+ * For instance, if the completion latencies are tight, we can
+ * get closer than just half the mean. This is especially
+ * important on devices where the completion latencies are longer
+ * than ~10 usec.
+ */
+ if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
+ ret = (stat[BLK_STAT_READ].mean + 1) / 2;
+ else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
+ ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+
+ return ret;
+}
+
static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
struct hrtimer_sleeper hs;
+ enum hrtimer_mode mode;
+ unsigned int nsecs;
ktime_t kt;
- if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+ if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+ return false;
+
+ /*
+ * poll_nsec can be:
+ *
+ * -1: don't ever hybrid sleep
+ * 0: use half of prev avg
+ * >0: use this specific value
+ */
+ if (q->poll_nsec == -1)
+ return false;
+ else if (q->poll_nsec > 0)
+ nsecs = q->poll_nsec;
+ else
+ nsecs = blk_mq_poll_nsecs(q, hctx, rq);
+
+ if (!nsecs)
return false;
set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
@@ -2477,9 +2539,10 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
* This will be replaced with the stats tracking code, using
* 'avg_completion_time / 2' as the pre-sleep target.
*/
- kt = ktime_set(0, q->poll_nsec);
+ kt = ktime_set(0, nsecs);
- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ mode = HRTIMER_MODE_REL;
+ hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
hrtimer_set_expires(&hs.timer, kt);
hrtimer_init_sleeper(&hs, current);
@@ -2487,10 +2550,11 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
break;
set_current_state(TASK_UNINTERRUPTIBLE);
- hrtimer_start_expires(&hs.timer, HRTIMER_MODE_REL);
+ hrtimer_start_expires(&hs.timer, mode);
if (hs.task)
io_schedule();
hrtimer_cancel(&hs.timer);
+ mode = HRTIMER_MODE_ABS;
} while (hs.task && !signal_pending(current));
__set_current_state(TASK_RUNNING);
@@ -2510,7 +2574,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
* the IO isn't complete, we'll get called again and will go
* straight to the busy poll loop.
*/
- if (blk_mq_poll_hybrid_sleep(q, rq))
+ if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
return true;
hctx->poll_considered++;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b87f992fdbd7..652a36eef00c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -352,24 +352,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
{
- return queue_var_show(q->poll_nsec / 1000, page);
+ int val;
+
+ if (q->poll_nsec == -1)
+ val = -1;
+ else
+ val = q->poll_nsec / 1000;
+
+ return sprintf(page, "%d\n", val);
}
static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
size_t count)
{
- unsigned long poll_usec;
- ssize_t ret;
+ int err, val;
if (!q->mq_ops || !q->mq_ops->poll)
return -EINVAL;
- ret = queue_var_store(&poll_usec, page, count);
- if (ret < 0)
- return ret;
+ err = kstrtoint(page, 10, &val);
+ if (err < 0)
+ return err;
- q->poll_nsec = poll_usec * 1000;
- return ret;
+ if (val == -1)
+ q->poll_nsec = -1;
+ else
+ q->poll_nsec = val * 1000;
+
+ return count;
}
static ssize_t queue_poll_show(struct request_queue *q, char *page)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 37ed4ea705c8..85699bc90a51 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -509,7 +509,7 @@ struct request_queue {
unsigned int request_fn_active;
unsigned int rq_timeout;
- unsigned int poll_nsec;
+ int poll_nsec;
struct timer_list timeout;
struct work_struct timeout_work;
struct list_head timeout_list;
--
2.7.4