[PATCH RFC 1/2] cfq: request-deadline policy

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH RFC 1/2] cfq: request-deadline policy
@ 2011-07-04 13:08 Konstantin Khlebnikov
  2011-07-04 13:08 ` [PATCH RFC 2/2] blkio-cgroup: add max wait time statistics Konstantin Khlebnikov
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Konstantin Khlebnikov @ 2011-07-04 13:08 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, Vivek Goyal

CFQ is designed for sharing disk bandwidth proportionally between queues and groups
and for reordering requests to reduce disks seek time. Currently it cannot
gurantee or estimate latency for individual requests, even if latencies are low
for almost all requests, some of them can stuck inside scheduler for a long time.
The fair policy is good as long as someone luckless begins to die due to a timeout.

This patch implements fifo requests dispatching with deadline policy: now cfq
obliged to dispatch request if it stuck in the queue for more than deadline.

This way now cfq can try to ensure the expected latency of requests execution.
It is like a safety valve, it should not work all time, but it should keep latency
in sane range when the scheduler is unable to effectively handle flow of requests,
especially in cases when the "noop" or "deadline" shows better performance.

deadline can be tuned via /sys/block/<device>/queue/iosched/deadline_{sync,async}
it by default 2000ms for sync and 4000ms for async requests, use 0 to disable it.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
---
 block/cfq-iosched.c |   93 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 93 insertions(+), 0 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f379943..8e68079 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -22,6 +22,7 @@
 /* max queue in one round of service */
 static const int cfq_quantum = 8;
 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+static const int cfq_deadline[2] = { HZ * 4, HZ * 2 };
 /* maximum backwards seek, in KiB */
 static const int cfq_back_max = 16 * 1024;
 /* penalty of a backwards seek */
@@ -120,6 +121,11 @@ struct cfq_queue {
 	/* fifo list of requests in sort_list */
 	struct list_head fifo;
 
+	/* cached rq_fifo_time() for first rq in fifo */
+	unsigned long fifo_key;
+	/* cfqd->fifo_tree member */
+	struct rb_node fifo_node;
+
 	/* time when queue got scheduled in to dispatch first request. */
 	unsigned long dispatch_start;
 	unsigned int allocated_slice;
@@ -238,6 +244,11 @@ struct cfq_data {
 	 */
 	struct rb_root prio_trees[CFQ_PRIO_LISTS];
 
+	/* queues sorted by cfqq->fifo_key */
+	struct rb_root fifo_tree[2];
+	/* leftmost queue in fifo_tree */
+	struct cfq_queue *fifo_first[2];
+
 	unsigned int busy_queues;
 	unsigned int busy_sync_queues;
 
@@ -280,6 +291,7 @@ struct cfq_data {
 	 */
 	unsigned int cfq_quantum;
 	unsigned int cfq_fifo_expire[2];
+	unsigned int cfq_deadline[2];
 	unsigned int cfq_back_penalty;
 	unsigned int cfq_back_max;
 	unsigned int cfq_slice[2];
@@ -1416,6 +1428,46 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfqq->p_root = NULL;
 }
 
+static void cfq_resort_fifo_tree(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	struct rb_node **p, *parent;
+	struct cfq_queue *__cfqq;
+	int sync = cfq_cfqq_sync(cfqq);
+	int new_first = 1;
+
+	if (!RB_EMPTY_NODE(&cfqq->fifo_node)) {
+		if (cfqq == cfqd->fifo_first[sync]) {
+			parent = rb_next(&cfqq->fifo_node);
+			__cfqq = rb_entry(parent, struct cfq_queue, fifo_node);
+			cfqd->fifo_first[sync] = parent ? __cfqq : NULL;
+		}
+		rb_erase_init(&cfqq->fifo_node, &cfqd->fifo_tree[sync]);
+	}
+
+	if (list_empty(&cfqq->fifo) || !cfqd->cfq_deadline[sync] ||
+			cfq_class_idle(cfqq))
+		return;
+
+	cfqq->fifo_key = rq_fifo_time(rq_entry_fifo(cfqq->fifo.next));
+
+	parent = NULL;
+	p = &cfqd->fifo_tree[sync].rb_node;
+	while (*p) {
+		parent = *p;
+		__cfqq = rb_entry(parent, struct cfq_queue, fifo_node);
+		if (time_before_eq(__cfqq->fifo_key, cfqq->fifo_key)) {
+			p = &(*p)->rb_right;
+			new_first = 0;
+		} else
+			p = &(*p)->rb_left;
+	}
+	rb_link_node(&cfqq->fifo_node, parent, p);
+	rb_insert_color(&cfqq->fifo_node, &cfqd->fifo_tree[sync]);
+
+	if (new_first)
+		cfqd->fifo_first[sync] = cfqq;
+}
+
 /*
  * Update cfqq's position in the service tree.
  */
@@ -1444,6 +1496,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfqd->busy_sync_queues++;
 
 	cfq_resort_rr_list(cfqd, cfqq);
+	cfq_resort_fifo_tree(cfqd, cfqq);
 }
 
 /*
@@ -1588,11 +1641,16 @@ static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 static void cfq_remove_request(struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
+	int fifo_first = rq->queuelist.prev == &cfqq->fifo;
 
 	if (cfqq->next_rq == rq)
 		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
 
 	list_del_init(&rq->queuelist);
+
+	if (fifo_first && !RB_EMPTY_NODE(&cfqq->fifo_node))
+		cfq_resort_fifo_tree(cfqq->cfqd, cfqq);
+
 	cfq_del_rq_rb(rq);
 
 	cfqq->cfqd->rq_queued--;
@@ -2573,6 +2631,27 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return true;
 }
 
+static bool cfq_dispatch_deadline(struct cfq_data *cfqd, int sync)
+{
+	struct cfq_queue *cfqq = cfqd->fifo_first[sync];
+	unsigned int deadline = cfqd->cfq_deadline[sync];
+
+	if (!cfqq || !deadline)
+		return false;
+
+	if (time_before(jiffies, cfqq->fifo_key + deadline))
+		return false;
+
+	cfq_log_cfqq(cfqd, cfqq, "dispatch deadline");
+	cfq_dispatch_insert(cfqd->queue, rq_entry_fifo(cfqq->fifo.next));
+
+	/* remove empty queue from service tree and expire its slices */
+	if (RB_EMPTY_ROOT(&cfqq->sort_list))
+		__cfq_slice_expired(cfqd, cfqq, 0);
+
+	return true;
+}
+
 /*
  * Find the cfqq that we need to service and move a request from that to the
  * dispatch list
@@ -2588,6 +2667,9 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 	if (unlikely(force))
 		return cfq_forced_dispatch(cfqd);
 
+	if (cfq_dispatch_deadline(cfqd, 1) || cfq_dispatch_deadline(cfqd, 0))
+		return 1;
+
 	cfqq = cfq_select_queue(cfqd);
 	if (!cfqq)
 		return 0;
@@ -2924,6 +3006,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 {
 	RB_CLEAR_NODE(&cfqq->rb_node);
 	RB_CLEAR_NODE(&cfqq->p_node);
+	RB_CLEAR_NODE(&cfqq->fifo_node);
 	INIT_LIST_HEAD(&cfqq->fifo);
 
 	cfqq->ref = 0;
@@ -4033,6 +4116,8 @@ static void *cfq_init_queue(struct request_queue *q)
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		cfqd->prio_trees[i] = RB_ROOT;
 
+	cfqd->fifo_tree[0] = cfqd->fifo_tree[1] = RB_ROOT;
+
 	/*
 	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
 	 * Grab a permanent reference to it, so that the normal code flow
@@ -4055,6 +4140,8 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
 	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
+	cfqd->cfq_deadline[0] = cfq_deadline[0];
+	cfqd->cfq_deadline[1] = cfq_deadline[1];
 	cfqd->cfq_back_max = cfq_back_max;
 	cfqd->cfq_back_penalty = cfq_back_penalty;
 	cfqd->cfq_slice[0] = cfq_slice_async;
@@ -4130,6 +4217,8 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
+SHOW_FUNCTION(cfq_deadline_sync_show, cfqd->cfq_deadline[1], 1);
+SHOW_FUNCTION(cfq_deadline_async_show, cfqd->cfq_deadline[0], 1);
 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
@@ -4161,6 +4250,8 @@ STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
 		UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
 		UINT_MAX, 1);
+STORE_FUNCTION(cfq_deadline_sync_store, &cfqd->cfq_deadline[1], 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_deadline_async_store, &cfqd->cfq_deadline[0], 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
 		UINT_MAX, 0);
@@ -4180,6 +4271,8 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(quantum),
 	CFQ_ATTR(fifo_expire_sync),
 	CFQ_ATTR(fifo_expire_async),
+	CFQ_ATTR(deadline_sync),
+	CFQ_ATTR(deadline_async),
 	CFQ_ATTR(back_seek_max),
 	CFQ_ATTR(back_seek_penalty),
 	CFQ_ATTR(slice_sync),


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH RFC 2/2] blkio-cgroup: add max wait time statistics
  2011-07-04 13:08 [PATCH RFC 1/2] cfq: request-deadline policy Konstantin Khlebnikov
@ 2011-07-04 13:08 ` Konstantin Khlebnikov
  2011-07-05  0:38 ` [PATCH RFC 1/2] cfq: request-deadline policy Shaohua Li
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Konstantin Khlebnikov @ 2011-07-04 13:08 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, Vivek Goyal

This patch adds blk-cgroup attribute blkio.io_wait_max to show
maximum time spent waiting in scheduler queue.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
---
 block/blk-cgroup.c |   27 ++++++++++++++++++++++++++-
 block/blk-cgroup.h |    3 +++
 2 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bcaf16e..cadb4dc 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -197,6 +197,19 @@ static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
 		stat[BLKIO_STAT_ASYNC] += add;
 }
 
+static void blkio_max_stat(uint64_t *stat, uint64_t cur, bool direction,
+				bool sync)
+{
+	if (direction)
+		stat[BLKIO_STAT_WRITE] = max(stat[BLKIO_STAT_WRITE], cur);
+	else
+		stat[BLKIO_STAT_READ] = max(stat[BLKIO_STAT_READ], cur);
+	if (sync)
+		stat[BLKIO_STAT_SYNC] = max(stat[BLKIO_STAT_SYNC], cur);
+	else
+		stat[BLKIO_STAT_ASYNC] = max(stat[BLKIO_STAT_ASYNC], cur);
+}
+
 /*
  * Decrements the appropriate stat variable if non-zero depending on the
  * request type. Panics on value being zero.
@@ -432,9 +445,12 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
 	if (time_after64(now, io_start_time))
 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
 				now - io_start_time, direction, sync);
-	if (time_after64(io_start_time, start_time))
+	if (time_after64(io_start_time, start_time)) {
 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
 				io_start_time - start_time, direction, sync);
+		blkio_max_stat(stats->stat_arr[BLKIO_STAT_WAIT_MAX],
+				io_start_time - start_time, direction, sync);
+	}
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
@@ -1252,6 +1268,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 		case BLKIO_PROP_io_wait_time:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_WAIT_TIME, 1, 0);
+		case BLKIO_PROP_io_wait_max:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_WAIT_MAX, 0, 0);
 		case BLKIO_PROP_io_merged:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_CPU_MERGED, 1, 1);
@@ -1423,6 +1442,12 @@ struct cftype blkio_files[] = {
 		.read_map = blkiocg_file_read_map,
 	},
 	{
+		.name = "io_wait_max",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_wait_max),
+		.read_map = blkiocg_file_read_map,
+	},
+	{
 		.name = "io_merged",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 				BLKIO_PROP_io_merged),
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index a71d290..8538699 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -39,6 +39,8 @@ enum stat_type {
 	BLKIO_STAT_SERVICE_TIME = 0,
 	/* Total time spent waiting in scheduler queue in ns */
 	BLKIO_STAT_WAIT_TIME,
+	/* Maximum time spent waiting in scheduler queue in ns */
+	BLKIO_STAT_WAIT_MAX,
 	/* Number of IOs queued up */
 	BLKIO_STAT_QUEUED,
 	/* All the single valued stats go below this */
@@ -92,6 +94,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_unaccounted_time,
 	BLKIO_PROP_io_service_time,
 	BLKIO_PROP_io_wait_time,
+	BLKIO_PROP_io_wait_max,
 	BLKIO_PROP_io_merged,
 	BLKIO_PROP_io_queued,
 	BLKIO_PROP_avg_queue_size,


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC 1/2] cfq: request-deadline policy
  2011-07-04 13:08 [PATCH RFC 1/2] cfq: request-deadline policy Konstantin Khlebnikov
  2011-07-04 13:08 ` [PATCH RFC 2/2] blkio-cgroup: add max wait time statistics Konstantin Khlebnikov
@ 2011-07-05  0:38 ` Shaohua Li
  2011-07-05 15:04 ` Vivek Goyal
  2011-07-06 13:41 ` Peter Zijlstra
  3 siblings, 0 replies; 7+ messages in thread
From: Shaohua Li @ 2011-07-05  0:38 UTC (permalink / raw)
  To: Konstantin Khlebnikov; +Cc: Jens Axboe, linux-kernel, Vivek Goyal

2011/7/4 Konstantin Khlebnikov <khlebnikov@openvz.org>:
> CFQ is designed for sharing disk bandwidth proportionally between queues and groups
> and for reordering requests to reduce disks seek time. Currently it cannot
> gurantee or estimate latency for individual requests, even if latencies are low
> for almost all requests, some of them can stuck inside scheduler for a long time.
> The fair policy is good as long as someone luckless begins to die due to a timeout.
>
> This patch implements fifo requests dispatching with deadline policy: now cfq
> obliged to dispatch request if it stuck in the queue for more than deadline.
>
> This way now cfq can try to ensure the expected latency of requests execution.
> It is like a safety valve, it should not work all time, but it should keep latency
> in sane range when the scheduler is unable to effectively handle flow of requests,
> especially in cases when the "noop" or "deadline" shows better performance.
>
> deadline can be tuned via /sys/block/<device>/queue/iosched/deadline_{sync,async}
> it by default 2000ms for sync and 4000ms for async requests, use 0 to disable it.
I'm afraid this takes too far away. It completed breaks fairness of
CFQ. The sync/async and priority of queues lose meaning.
Even deadline is meaningful of some kind, maybe we should add a policy
in preempt. If a queue has very old requests, don't preempt such
queue. But we should be careful to tune how long slice the queue can
take so fairness isn't broken.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC 1/2] cfq: request-deadline policy
  2011-07-04 13:08 [PATCH RFC 1/2] cfq: request-deadline policy Konstantin Khlebnikov
  2011-07-04 13:08 ` [PATCH RFC 2/2] blkio-cgroup: add max wait time statistics Konstantin Khlebnikov
  2011-07-05  0:38 ` [PATCH RFC 1/2] cfq: request-deadline policy Shaohua Li
@ 2011-07-05 15:04 ` Vivek Goyal
  2011-07-06  6:58   ` Konstantin Khlebnikov
  2011-07-06 13:41 ` Peter Zijlstra
  3 siblings, 1 reply; 7+ messages in thread
From: Vivek Goyal @ 2011-07-05 15:04 UTC (permalink / raw)
  To: Konstantin Khlebnikov; +Cc: Jens Axboe, linux-kernel

On Mon, Jul 04, 2011 at 05:08:38PM +0400, Konstantin Khlebnikov wrote:
> CFQ is designed for sharing disk bandwidth proportionally between queues and groups
> and for reordering requests to reduce disks seek time. Currently it cannot
> gurantee or estimate latency for individual requests, even if latencies are low
> for almost all requests, some of them can stuck inside scheduler for a long time.
> The fair policy is good as long as someone luckless begins to die due to a timeout.
> 
> This patch implements fifo requests dispatching with deadline policy: now cfq
> obliged to dispatch request if it stuck in the queue for more than deadline.
> 
> This way now cfq can try to ensure the expected latency of requests execution.
> It is like a safety valve, it should not work all time, but it should keep latency
> in sane range when the scheduler is unable to effectively handle flow of requests,
> especially in cases when the "noop" or "deadline" shows better performance.
> 
> deadline can be tuned via /sys/block/<device>/queue/iosched/deadline_{sync,async}
> it by default 2000ms for sync and 4000ms for async requests, use 0 to disable it.

What's the workload where you are running into issues with existing
policy?

We have low_latency=1 by default and which tries to schedule every 
queue once in 300ms atleast. And with-in queue we already have the
notion of looking at fifo and dispatch the expired request first.

So to me sync queue scheduling shold be pretty good. Async queues
can get starved though. With-in sync queue, if some requests have
expired, it is probably because of the fact that disk is slow and
we are throwing too much IO at it. So if we start always dispatching
expired requests first, then the notion of fairness is out of the
window.

Why not use deadline scheduler for your case?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC 1/2] cfq: request-deadline policy
  2011-07-05 15:04 ` Vivek Goyal
@ 2011-07-06  6:58   ` Konstantin Khlebnikov
  2011-07-06 14:23     ` Vivek Goyal
  0 siblings, 1 reply; 7+ messages in thread
From: Konstantin Khlebnikov @ 2011-07-06  6:58 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, linux-kernel@vger.kernel.org

Vivek Goyal wrote:
> On Mon, Jul 04, 2011 at 05:08:38PM +0400, Konstantin Khlebnikov wrote:
>> CFQ is designed for sharing disk bandwidth proportionally between queues and groups
>> and for reordering requests to reduce disks seek time. Currently it cannot
>> gurantee or estimate latency for individual requests, even if latencies are low
>> for almost all requests, some of them can stuck inside scheduler for a long time.
>> The fair policy is good as long as someone luckless begins to die due to a timeout.
>>
>> This patch implements fifo requests dispatching with deadline policy: now cfq
>> obliged to dispatch request if it stuck in the queue for more than deadline.
>>
>> This way now cfq can try to ensure the expected latency of requests execution.
>> It is like a safety valve, it should not work all time, but it should keep latency
>> in sane range when the scheduler is unable to effectively handle flow of requests,
>> especially in cases when the "noop" or "deadline" shows better performance.
>>
>> deadline can be tuned via /sys/block/<device>/queue/iosched/deadline_{sync,async}
>> it by default 2000ms for sync and 4000ms for async requests, use 0 to disable it.
>
> What's the workload where you are running into issues with existing
> policy?

This is huge internal test workload,
there >100 containers with mail/http/ftp and something more.

>
> We have low_latency=1 by default and which tries to schedule every
> queue once in 300ms atleast. And with-in queue we already have the
> notion of looking at fifo and dispatch the expired request first.

Without this patch some requests stuck in the scheduler for more than 30 seconds,
and it looks like it is no limit.

With this patch max-wait-time (from the second patch) shows 7 seconds for this workload,
so of course queue is over-congested, but it continues to work predictably.

>
> So to me sync queue scheduling shold be pretty good. Async queues
> can get starved though. With-in sync queue, if some requests have
> expired, it is probably because of the fact that disk is slow and
> we are throwing too much IO at it. So if we start always dispatching
> expired requests first, then the notion of fairness is out of the
> window.
>
> Why not use deadline scheduler for your case?

Because the scheduler must be universal, load can be arbitrary and constantly changing,
we also can not modify each machine separately.

>
> Thanks
> Vivek


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC 1/2] cfq: request-deadline policy
  2011-07-04 13:08 [PATCH RFC 1/2] cfq: request-deadline policy Konstantin Khlebnikov
                   ` (2 preceding siblings ...)
  2011-07-05 15:04 ` Vivek Goyal
@ 2011-07-06 13:41 ` Peter Zijlstra
  3 siblings, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2011-07-06 13:41 UTC (permalink / raw)
  To: Konstantin Khlebnikov; +Cc: Jens Axboe, linux-kernel, Vivek Goyal

On Mon, 2011-07-04 at 17:08 +0400, Konstantin Khlebnikov wrote:
> This patch implements fifo requests dispatching with deadline policy: now cfq
> obliged to dispatch request if it stuck in the queue for more than deadline.

Have you considered BFQ (http://lkml.org/lkml/2008/11/11/148) which is
WF2Q+ variant?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC 1/2] cfq: request-deadline policy
  2011-07-06  6:58   ` Konstantin Khlebnikov
@ 2011-07-06 14:23     ` Vivek Goyal
  0 siblings, 0 replies; 7+ messages in thread
From: Vivek Goyal @ 2011-07-06 14:23 UTC (permalink / raw)
  To: Konstantin Khlebnikov; +Cc: Jens Axboe, linux-kernel@vger.kernel.org

On Wed, Jul 06, 2011 at 10:58:41AM +0400, Konstantin Khlebnikov wrote:
> Vivek Goyal wrote:
> >On Mon, Jul 04, 2011 at 05:08:38PM +0400, Konstantin Khlebnikov wrote:
> >>CFQ is designed for sharing disk bandwidth proportionally between queues and groups
> >>and for reordering requests to reduce disks seek time. Currently it cannot
> >>gurantee or estimate latency for individual requests, even if latencies are low
> >>for almost all requests, some of them can stuck inside scheduler for a long time.
> >>The fair policy is good as long as someone luckless begins to die due to a timeout.
> >>
> >>This patch implements fifo requests dispatching with deadline policy: now cfq
> >>obliged to dispatch request if it stuck in the queue for more than deadline.
> >>
> >>This way now cfq can try to ensure the expected latency of requests execution.
> >>It is like a safety valve, it should not work all time, but it should keep latency
> >>in sane range when the scheduler is unable to effectively handle flow of requests,
> >>especially in cases when the "noop" or "deadline" shows better performance.
> >>
> >>deadline can be tuned via /sys/block/<device>/queue/iosched/deadline_{sync,async}
> >>it by default 2000ms for sync and 4000ms for async requests, use 0 to disable it.
> >
> >What's the workload where you are running into issues with existing
> >policy?
> 
> This is huge internal test workload,
> there >100 containers with mail/http/ftp and something more.
> 
> >
> >We have low_latency=1 by default and which tries to schedule every
> >queue once in 300ms atleast. And with-in queue we already have the
> >notion of looking at fifo and dispatch the expired request first.
> 
> Without this patch some requests stuck in the scheduler for more than 30 seconds,
> and it looks like it is no limit.

Have you done any analysis why requets are stuck for more than 30 seconds.
May be running blktrace will help.

Is it async requests which are stuck or sync requests?

We have seen that async requests can be stuck for a long time in CFQ
in an attempt to give sync requests priority. I will be surprised
if it is sync request stuck for 30 seconds.

I think first we need to run traces and analyze why request is stuck
for long time. With 300ms soft latency target, we dynamically reduce
the slice of every queue with 16ms minimum. So without cgroups, one shall
have to have close to 2 million procesess to create 32second delay in
servicing a cfq queue. I don't think you have that many processes doing
IO.

So running some traces and diving little deeper to figure out what's
happening will help. If you are quoting these numbers for async request
or for a sync request which is dependent on some async request, then
I can belive those.

> 
> With this patch max-wait-time (from the second patch) shows 7 seconds for this workload,
> so of course queue is over-congested, but it continues to work predictably.
> 
> >
> >So to me sync queue scheduling shold be pretty good. Async queues
> >can get starved though. With-in sync queue, if some requests have
> >expired, it is probably because of the fact that disk is slow and
> >we are throwing too much IO at it. So if we start always dispatching
> >expired requests first, then the notion of fairness is out of the
> >window.
> >
> >Why not use deadline scheduler for your case?
> 
> Because the scheduler must be universal, load can be arbitrary and constantly changing,
> we also can not modify each machine separately.

That's fine. But every scheduler has some property and one needs to use
a scheduler which suits their workoload/storage.

Anyway, before we figure out how to fix the problem, we need to figure
out what's the problem and what is being delayed and where does the
delay come from.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2011-07-06 14:23 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-07-04 13:08 [PATCH RFC 1/2] cfq: request-deadline policy Konstantin Khlebnikov
2011-07-04 13:08 ` [PATCH RFC 2/2] blkio-cgroup: add max wait time statistics Konstantin Khlebnikov
2011-07-05  0:38 ` [PATCH RFC 1/2] cfq: request-deadline policy Shaohua Li
2011-07-05 15:04 ` Vivek Goyal
2011-07-06  6:58   ` Konstantin Khlebnikov
2011-07-06 14:23     ` Vivek Goyal
2011-07-06 13:41 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox