Re: [RFC] cfq: adapt slice to number of processes doing I/O

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Corrado Zoccolo <czoccolo@gmail.com>
To: Jens Axboe <jens.axboe@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>,
	"Linux-Kernel" <linux-kernel@vger.kernel.org>
Subject: Re: [RFC] cfq: adapt slice to number of processes doing I/O
Date: Mon, 7 Sep 2009 16:57:50 +0200	[thread overview]
Message-ID: <200909071657.50954.czoccolo@gmail.com> (raw)
In-Reply-To: <20090905161635.GI18599@kernel.dk>

Hi,
On Sat, Sep 05 2009 18:16:35, Jens Axboe wrote:
: > On Thu, Sep 03 2009, Corrado Zoccolo wrote:
> > Hi Jens,
> >
> > [snip]
> >
> > This is the reason that I have a minimum slice. It is already reached
> > for 32 processes as in my example, so the throughput drop is at most
> > 20%.
> > Currently it is computed as 2*slice_idle for sync, and 1*slice_idle
> > for async queues.
> > I think this causes the leveling of data transferred regardless of
> > priorities. I'll cook up a formula to better scale also the minimum
> > slice according to priority, to fix this issue.
>
> For your case, it may be different for other hardware. But I think the
> approach is sane to some degree, it'll require more work though. One
> problem is that the count of busy queues will fluctuate a lot for sync
> IO, so you'll have fairness issues. The number of potentially interested
> processes needs to be a rolling average of some sort, not just looking
> at ->busy_queues.
here is the new patch, with the improved formula for minimum time slice,
and with rolling average for number of processes doing I/O.
The average is computed in such a way that it can quickly follow
sudden increases (to keep latency low),
and decrease slowly (to have fairness in spite of rapid decreases of this value).

I added 2 tunables, for testing:
* the preferred latency, used to compute the threshold on number of processes (that was hardcoded in previous version).
* the divisor in the averaging formula, to alter the weights (weights are computed as 1/d and (d-1)/d).
they can be removed if/when we find the optimal values.

Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
---
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd7080e..b200b13 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -27,6 +27,8 @@ static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
+static int cfq_preferred_latency = HZ * 3/10; /* 300 ms */
+static int cfq_queue_hist_divisor = 4;
 
 /*
  * offset from end of service tree
@@ -134,11 +136,13 @@ struct cfq_data {
 	struct rb_root prio_trees[CFQ_PRIO_LISTS];
 
 	unsigned int busy_queues;
+	unsigned int busy_queues_avg;
 	/*
 	 * Used to track any pending rt requests so we can pre-empt current
 	 * non-RT cfqq in service when this value is non-zero.
 	 */
 	unsigned int busy_rt_queues;
+	unsigned int busy_rt_queues_avg;
 
 	int rq_in_driver;
 	int sync_flight;
@@ -178,6 +182,8 @@ struct cfq_data {
 	unsigned int cfq_slice[2];
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
+	unsigned int cfq_preferred_latency;
+	unsigned int cfq_queue_hist_divisor;
 
 	struct list_head cic_list;
 
@@ -303,10 +309,37 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
+static inline unsigned
+cfq_get_interested_queues(struct cfq_data *cfqd, bool rt) {
+	unsigned min_q, max_q;
+	unsigned mult = cfqd->cfq_queue_hist_divisor -1;
+	unsigned round = cfqd->cfq_queue_hist_divisor / 2;
+	if (rt) {
+		min_q = min(cfqd->busy_rt_queues_avg, cfqd->busy_rt_queues);
+		max_q = max(cfqd->busy_rt_queues_avg, cfqd->busy_rt_queues);
+		cfqd->busy_rt_queues_avg = (mult * max_q + min_q + round) / cfqd->cfq_queue_hist_divisor;
+		return cfqd->busy_rt_queues_avg;
+	} else {
+		min_q = min(cfqd->busy_queues_avg, cfqd->busy_queues);
+		max_q = max(cfqd->busy_queues_avg, cfqd->busy_queues);
+		cfqd->busy_queues_avg = (mult * max_q + min_q + round) / cfqd->cfq_queue_hist_divisor;
+		return cfqd->busy_queues_avg;
+	}
+}
+
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+	unsigned process_threshold = cfqd->cfq_preferred_latency / cfqd->cfq_slice[1];
+	unsigned interested_queues = cfq_get_interested_queues(cfqd, cfq_class_rt(cfqq));
+	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
+
+	if (interested_queues > process_threshold) {
+		unsigned low_slice = min(slice, 2 * slice * cfqd->cfq_slice_idle / cfqd->cfq_slice[1]);
+		slice = max(slice * process_threshold / interested_queues, low_slice);
+	}
+
+	cfqq->slice_end = jiffies + slice;
 	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 }
 
@@ -2494,6 +2527,8 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
+	cfqd->cfq_preferred_latency = cfq_preferred_latency;
+	cfqd->cfq_queue_hist_divisor = cfq_queue_hist_divisor;
 	cfqd->hw_tag = 1;
 
 	return cfqd;
@@ -2563,6 +2598,8 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
+SHOW_FUNCTION(cfq_preferred_latency_show, cfqd->cfq_preferred_latency, 1);
+SHOW_FUNCTION(cfq_queue_hist_divisor_show, cfqd->cfq_queue_hist_divisor, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -2594,6 +2631,10 @@ STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
 		UINT_MAX, 0);
+
+STORE_FUNCTION(cfq_preferred_latency_store, &cfqd->cfq_preferred_latency, 1, 1000, 1);
+STORE_FUNCTION(cfq_queue_hist_divisor_store, &cfqd->cfq_queue_hist_divisor, 1, 100, 0);
+
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -2609,6 +2650,8 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_async),
 	CFQ_ATTR(slice_async_rq),
 	CFQ_ATTR(slice_idle),
+	CFQ_ATTR(preferred_latency),
+	CFQ_ATTR(queue_hist_divisor),
 	__ATTR_NULL
 };

next prev parent reply	other threads:[~2009-09-07 14:57 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-09-03 11:07 [RFC] cfq: adapt slice to number of processes doing I/O Corrado Zoccolo
2009-09-03 13:01 ` Jeff Moyer
2009-09-03 13:07   ` Jens Axboe
2009-09-03 16:36     ` Corrado Zoccolo
2009-09-05 16:16       ` Jens Axboe
2009-09-07 14:57         ` Corrado Zoccolo [this message]
2009-09-03 15:38   ` Jeff Moyer
2009-09-03 16:47     ` Corrado Zoccolo
2009-09-03 17:16       ` Jeff Moyer
2009-09-04  7:22         ` Corrado Zoccolo
2009-09-03 16:26   ` Corrado Zoccolo
2009-09-03 18:29     ` Jeff Moyer

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:fd7080e dfblob:b200b13 )
 OR (
bs:"Re: [RFC] cfq: adapt slice to number of processes doing I/O" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200909071657.50954.czoccolo@gmail.com \
    --to=czoccolo@gmail.com \
    --cc=jens.axboe@oracle.com \
    --cc=jmoyer@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.