All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC,RFT,PATCH] cfq: autotuning support
@ 2009-11-17 14:51 Corrado Zoccolo
  2009-11-17 16:52 ` Vivek Goyal
  0 siblings, 1 reply; 3+ messages in thread
From: Corrado Zoccolo @ 2009-11-17 14:51 UTC (permalink / raw)
  To: Linux-Kernel, Jens Axboe, Jeff Moyer, Vivek Goyal

Hi, this is my first attempt at autotuning cfq parameters; it should apply on top of the for-2.6.33 branch.
Jeff and Vivek, if you can test this on your NCQ SSDs, your feedback would help me (please include
the output of 'grep -r . /sys/block/sdX/queue/iosched' after you run your tests).

The patch introduces code to sample the request service time distribution, and analyze it,
in order to compute the following cfq parameters:
* slice_idle, computed as the expected service time of a random request
* cfq_slice[1] (i.e. the slice for sync queues)
* cfq_slice[0] (i.e. the slice for async queues)

The sync and async slices are scaled from their default values proportionally to the newly computed slice_idle.

Autotuning will be enabled by default only on kernels compiled with HZ >= 500.
With a smaller HZ, I don't think jiffies is reliable enough to estimate those parameters.

Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
---
 block/cfq-iosched.c |  166 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 163 insertions(+), 3 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6925ab9..d786a0b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -32,6 +32,12 @@ static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
 static const int cfq_hist_divisor = 4;
 
 /*
+ * Number of samples to collect before updating autotune
+ * higher # makes the measurements more stable
+ */
+#define CFQ_AUTOTUNE_SAMPLES	(10)
+
+/*
  * offset from end of service tree
  */
 #define CFQ_IDLE_DELAY		(HZ / 5)
@@ -201,6 +207,21 @@ struct cfq_data {
 	unsigned int hw_tag_samples;
 
 	/*
+	 * disk performance measurements
+	 */
+	unsigned long observation_start;
+	/*
+	 * measures are split (READ vs WRITE)
+	 */
+	unsigned long processed_rq[2];
+	/*
+	 * We store a histogram of samples for the service time
+	 * in log scale [0..5]; [6] is a count, that is reset every
+	 * time autotuning is done (i.e. every CFQ_AUTOTUNE_SAMPLES)
+	 */
+	unsigned int serv_time_samples[2][7];
+
+	/*
 	 * idle window management
 	 */
 	struct timer_list idle_slice_timer;
@@ -228,6 +249,7 @@ struct cfq_data {
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
 	unsigned int cfq_latency;
+	unsigned int cfq_autotune;
 
 	struct list_head cic_list;
 
@@ -891,6 +913,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
+	if (!rq_in_driver(cfqd)) {
+		cfqd->observation_start = jiffies;
+		cfqd->processed_rq[0] = 0;
+		cfqd->processed_rq[1] = 0;
+	}
+
 	cfqd->rq_in_driver[rq_is_sync(rq)]++;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 						rq_in_driver(cfqd));
@@ -2562,14 +2590,120 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
 		cfqd->hw_tag = 0;
 }
 
+
+/*
+ * Update the histogram to compute service time
+ * Returns true if we collected enough samples to re-run autotune
+ */
+static bool cfq_update_stime(unsigned samples[6], unsigned long stime)
+{
+	unsigned idx = (stime > 15) + (stime > 7) + (stime > 3)
+		       + (stime > 1) + (stime > 0);
+	samples[idx]++;
+	if (samples[idx] > (1U<<31))
+		for (idx = 0; idx < 6; ++idx) {
+			samples[idx]++;
+			samples[idx] >>= 1;
+		}
+
+	if (samples[6]++ < CFQ_AUTOTUNE_SAMPLES)
+		return false;
+	samples[6] = 0;
+	return true;
+}
+
+/*
+ * Currently, we measure only service time for pure READ or WRITE requests
+ * and we update autotune when we have collected enough READ requests
+ */
+static bool cfq_update_perf_measures(struct cfq_data *cfqd, unsigned long now)
+{
+	unsigned long tot_proc = cfqd->processed_rq[0] + cfqd->processed_rq[1];
+	unsigned long obstime = now - cfqd->observation_start;
+	unsigned long stime = obstime / tot_proc;
+
+	cfqd->observation_start = now;
+
+	if (!cfqd->processed_rq[READ])
+		cfq_update_stime(cfqd->serv_time_samples[WRITE], stime);
+	if (!cfqd->processed_rq[WRITE])
+		return cfq_update_stime(cfqd->serv_time_samples[READ], stime);
+	return false;
+}
+
+/*
+ * Compute service time from the sampled distribution in the histogram
+ * The real service time distribution is a super-position of two distinct
+ * distributions:
+ * the one for sequential requests (usually this has a small mean)
+ * the one for random requests (usually with a larger mean)
+ * and we want to identify the random request one
+ */
+static unsigned cfq_service_time(unsigned samples[6])
+{
+	unsigned last_max = 0, i;
+	/* Random request service time corresponds to the
+	 * rightmost local maximum (last peak) in the histogram */
+	for (i = 1; i < 6; ++i)
+		if (samples[i] > samples[i-1])
+			last_max = i;
+	/*
+	 * Unfortunately, if sequential requests overwhelm
+	 * random ones, and the two peaks are too near,
+	 * the second peak could be masked by the tail of the first.
+	 * To catch this, we check if the tail has enough weight,
+	 * and in this case we take the next bin as maximum
+	 */
+	if (last_max < 5) {
+		unsigned total = 0;
+		for (i = last_max + 1; i < 6; ++i)
+			total += samples[i];
+		if (total > samples[last_max])
+			++last_max;
+	}
+	if (!last_max)
+		return 0;
+	return 1U << last_max;
+}
+
+static void cfq_update_autotune(struct cfq_data *cfqd)
+{
+	unsigned long base, baseW;
+	if (!cfqd->cfq_autotune)
+		return;
+	base = cfq_service_time(cfqd->serv_time_samples[READ]);
+	baseW = cfq_service_time(cfqd->serv_time_samples[WRITE]);
+
+	/* Compute slice_idle */
+	if (!base)
+		base = baseW;
+	if (base > cfq_slice_idle)
+		base = min_t(unsigned long,
+			     (base + cfq_slice_idle) / 2, 2 * cfq_slice_idle);
+
+	cfqd->cfq_slice_idle = base;
+
+	/* Compute derived cfq_slice[*]
+	 * Note: those cannot be 0
+	 */
+	if (!base)
+		base = 1;
+
+	if (baseW)
+		baseW = min(base, baseW * cfq_slice_sync / cfq_slice_async);
+	else
+		baseW = base;
+
+	cfqd->cfq_slice[1] = base * cfq_slice_sync / cfq_slice_idle;
+	cfqd->cfq_slice[0] = baseW * cfq_slice_async / cfq_slice_idle;
+}
+
 static void cfq_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = cfqq->cfqd;
 	const int sync = rq_is_sync(rq);
-	unsigned long now;
-
-	now = jiffies;
+	unsigned long now = jiffies;
 	cfq_log_cfqq(cfqd, cfqq, "complete");
 
 	cfq_update_hw_tag(cfqd);
@@ -2578,6 +2712,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver[sync]--;
 	cfqq->dispatched--;
+	cfqd->processed_rq[rq_data_dir(rq)]++;
+
+	if (!rq_in_driver(cfqd) && cfq_update_perf_measures(cfqd, now))
+		cfq_update_autotune(cfqd);
 
 	if (cfq_cfqq_sync(cfqq))
 		cfqd->sync_flight--;
@@ -2966,6 +3104,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_latency = 1;
+	cfqd->cfq_autotune = (HZ >= 500);
 	cfqd->hw_tag = -1;
 	cfqd->last_end_sync_rq = jiffies;
 	return cfqd;
@@ -3002,6 +3141,23 @@ fail:
 /*
  * sysfs parts below -->
  */
+static ssize_t autotune_stats_show(struct elevator_queue *e, char *page)
+{
+	struct cfq_data *cfqd = e->elevator_data;
+	int pos = 0, i, j;
+#if HZ < 500
+	pos += sprintf(page, "Autotune may work incorrectly with HZ < 500\n");
+#endif
+	for (j = 0; j < 2; ++j) {
+		for (i = 0; i < 6; ++i)
+			pos += sprintf(page+pos, "[%2u]",
+				cfqd->serv_time_samples[j][i]);
+		page[pos++] = '\n';
+	}
+	page[pos] = '\0';
+	return pos;
+}
+
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {
@@ -3036,6 +3192,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
+SHOW_FUNCTION(cfq_autotune_show, cfqd->cfq_autotune, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -3068,6 +3225,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
 		UINT_MAX, 0);
 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
+STORE_FUNCTION(cfq_autotune_store, &cfqd->cfq_autotune, 0, 1, 0);
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -3084,6 +3242,8 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_async_rq),
 	CFQ_ATTR(slice_idle),
 	CFQ_ATTR(low_latency),
+	CFQ_ATTR(autotune),
+	__ATTR_RO(autotune_stats),
 	__ATTR_NULL
 };
 
-- 
1.6.2.5



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [RFC,RFT,PATCH] cfq: autotuning support
  2009-11-17 14:51 [RFC,RFT,PATCH] cfq: autotuning support Corrado Zoccolo
@ 2009-11-17 16:52 ` Vivek Goyal
  2009-11-17 18:21   ` Corrado Zoccolo
  0 siblings, 1 reply; 3+ messages in thread
From: Vivek Goyal @ 2009-11-17 16:52 UTC (permalink / raw)
  To: Corrado Zoccolo; +Cc: Linux-Kernel, Jens Axboe, Jeff Moyer

On Tue, Nov 17, 2009 at 03:51:14PM +0100, Corrado Zoccolo wrote:
> Hi, this is my first attempt at autotuning cfq parameters, and should apply on top of for-2.6.33 branch.
> Jeff and Vivek, if you can test this on your NCQ SSDs, it will help me to have your feedback (please include 
> the output of: 'grep -r . /sys/block/sdX/queue/iosched' after you run your tests).

Sure, I will do some tests on this patch, maybe later today.

I had a quick look at the patch and have a couple of questions.

- Service time seems to be a measure of how much time it took, on average,
  to service a read/write request (be it a sequential or a random one).

  In autotuning, you seem to be updating slice_idle dynamically based on
  service time. So the intent seems to be that if service times are higher,
  then it is slow media and we should have a higher slice_idle, and
  otherwise a low slice_idle?

Thanks
Vivek


> 
> The patch introduces code to sample the request service time distribution, and analyze it,
> in order to compute the following cfq parameters:
> * slice_idle, is computed as the expected service time of random request
> * cfq_slice[1] (i.e. the slice for sync queues)
> * cfq_slice[0] (i.e. the slice for async queues)
> 
> The sync and async slices are scaled from default values proportionally to the new computed slice_idle.
> 
> Autotuning will be enabled by default only on kernels compiled with HZ >= 500.
> With smaller HZ, I don't think jiffies is reliable to estimate those parameters.
> 
> Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
> ---
>  block/cfq-iosched.c |  166 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 163 insertions(+), 3 deletions(-)
> 
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 6925ab9..d786a0b 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -32,6 +32,12 @@ static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
>  static const int cfq_hist_divisor = 4;
>  
>  /*
> + * Number of samples to collect before updating autotune
> + * higher # makes the measurements more stable
> + */
> +#define CFQ_AUTOTUNE_SAMPLES	(10)
> +
> +/*
>   * offset from end of service tree
>   */
>  #define CFQ_IDLE_DELAY		(HZ / 5)
> @@ -201,6 +207,21 @@ struct cfq_data {
>  	unsigned int hw_tag_samples;
>  
>  	/*
> +	 * disk performance measurements
> +	 */
> +	unsigned long observation_start;
> +	/*
> +	 * measures are split (READ vs WRITE)
> +	 */
> +	unsigned long processed_rq[2];
> +	/*
> +	 * We store an histogram of samples for the service time
> +	 * in log scale [0..5]; [6] is a count, that is reset every
> +	 * time autotuning is done (i.e. every CFQ_AUTOTUNE_SAMPLES)
> +	 */
> +	unsigned int serv_time_samples[2][7];
> +
> +	/*
>  	 * idle window management
>  	 */
>  	struct timer_list idle_slice_timer;
> @@ -228,6 +249,7 @@ struct cfq_data {
>  	unsigned int cfq_slice_async_rq;
>  	unsigned int cfq_slice_idle;
>  	unsigned int cfq_latency;
> +	unsigned int cfq_autotune;
>  
>  	struct list_head cic_list;
>  
> @@ -891,6 +913,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
>  {
>  	struct cfq_data *cfqd = q->elevator->elevator_data;
>  
> +	if (!rq_in_driver(cfqd)) {
> +		cfqd->observation_start = jiffies;
> +		cfqd->processed_rq[0] = 0;
> +		cfqd->processed_rq[1] = 0;
> +	}
> +
>  	cfqd->rq_in_driver[rq_is_sync(rq)]++;
>  	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
>  						rq_in_driver(cfqd));
> @@ -2562,14 +2590,120 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
>  		cfqd->hw_tag = 0;
>  }
>  
> +
> +/*
> + * Update the histogram to compute service time
> + * Returns true if we collected enough samples to re-run autotune
> + */
> +static bool cfq_update_stime(unsigned samples[6], unsigned long stime)
> +{
> +	unsigned idx = (stime > 15) + (stime > 7) + (stime > 3)
> +		       + (stime > 1) + (stime > 0);
> +	samples[idx]++;
> +	if (samples[idx] > (1U<<31))
> +		for (idx = 0; idx < 6; ++idx) {
> +			samples[idx]++;
> +			samples[idx] >>= 1;
> +		}
> +
> +	if (samples[6]++ < CFQ_AUTOTUNE_SAMPLES)
> +		return false;
> +	samples[6] = 0;
> +	return true;
> +}
> +
> +/*
> + * Currently, we measure only service time for pure READ or WRITE requests
> + * and we update autotune when we have collected enough READ requests
> + */
> +static bool cfq_update_perf_measures(struct cfq_data *cfqd, unsigned long now)
> +{
> +	unsigned long tot_proc = cfqd->processed_rq[0] + cfqd->processed_rq[1];
> +	unsigned long obstime = now - cfqd->observation_start;
> +	unsigned long stime = obstime / tot_proc;
> +
> +	cfqd->observation_start = now;
> +
> +	if (!cfqd->processed_rq[READ])
> +		cfq_update_stime(cfqd->serv_time_samples[WRITE], stime);
> +	if (!cfqd->processed_rq[WRITE])
> +		return cfq_update_stime(cfqd->serv_time_samples[READ], stime);
> +	return false;
> +}
> +
> +/*
> + * Compute service time from the sampled distribution in the histogram
> + * The real service time distribution is a super-position of two distinct
> + * distributions:
> + * the one for sequential requests (usually this has a small mean)
> + * the one for random requests (usually with a larger mean)
> + * and we want to identify the random request one
> + */
> +static unsigned cfq_service_time(unsigned samples[6])
> +{
> +	unsigned last_max = 0, i;
> +	/* Random request service time corresponds to the
> +	 * largest maximum in the histogram */
> +	for (i = 1; i < 6; ++i)
> +		if (samples[i] > samples[i-1])
> +			last_max = i;
> +	/*
> +	 * Unfortunately, if sequential requests overwhelm
> +	 * random ones, and the two peaks are too near,
> +	 * the second peak could be masked by the tail of the first.
> +	 * To catch this, we check if the tail has enough weight,
> +	 * and in this case we take the next bin as maximum
> +	 */
> +	if (last_max < 5) {
> +		unsigned total = 0;
> +		for (i = last_max + 1; i < 6; ++i)
> +			total += samples[i];
> +		if (total > samples[last_max])
> +			++last_max;
> +	}
> +	if (!last_max)
> +		return 0;
> +	return 1U << last_max;
> +}
> +
> +static void cfq_update_autotune(struct cfq_data *cfqd)
> +{
> +	unsigned long base, baseW;
> +	if (!cfqd->cfq_autotune)
> +		return;
> +	base = cfq_service_time(cfqd->serv_time_samples[READ]);
> +	baseW = cfq_service_time(cfqd->serv_time_samples[WRITE]);
> +
> +	/* Compute slice_idle */
> +	if (!base)
> +		base = baseW;
> +	if (base > cfq_slice_idle)
> +		base = min_t(unsigned long,
> +			     (base + cfq_slice_idle) / 2, 2 * cfq_slice_idle);
> +
> +	cfqd->cfq_slice_idle = base;
> +
> +	/* Compute derived cfq_slice[*]
> +	 * Note: those cannot be 0
> +	 */
> +	if (!base)
> +		base = 1;
> +
> +	if (baseW)
> +		baseW = min(base, baseW * cfq_slice_sync / cfq_slice_async);
> +	else
> +		baseW = base;
> +
> +	cfqd->cfq_slice[1] = base * cfq_slice_sync / cfq_slice_idle;
> +	cfqd->cfq_slice[0] = baseW * cfq_slice_async / cfq_slice_idle;
> +}
> +
>  static void cfq_completed_request(struct request_queue *q, struct request *rq)
>  {
>  	struct cfq_queue *cfqq = RQ_CFQQ(rq);
>  	struct cfq_data *cfqd = cfqq->cfqd;
>  	const int sync = rq_is_sync(rq);
> -	unsigned long now;
> -
> -	now = jiffies;
> +	unsigned long now = jiffies;
>  	cfq_log_cfqq(cfqd, cfqq, "complete");
>  
>  	cfq_update_hw_tag(cfqd);
> @@ -2578,6 +2712,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
>  	WARN_ON(!cfqq->dispatched);
>  	cfqd->rq_in_driver[sync]--;
>  	cfqq->dispatched--;
> +	cfqd->processed_rq[rq_data_dir(rq)]++;
> +
> +	if (!rq_in_driver(cfqd) && cfq_update_perf_measures(cfqd, now))
> +		cfq_update_autotune(cfqd);
>  
>  	if (cfq_cfqq_sync(cfqq))
>  		cfqd->sync_flight--;
> @@ -2966,6 +3104,7 @@ static void *cfq_init_queue(struct request_queue *q)
>  	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
>  	cfqd->cfq_slice_idle = cfq_slice_idle;
>  	cfqd->cfq_latency = 1;
> +	cfqd->cfq_autotune = (HZ >= 500);
>  	cfqd->hw_tag = -1;
>  	cfqd->last_end_sync_rq = jiffies;
>  	return cfqd;
> @@ -3002,6 +3141,23 @@ fail:
>  /*
>   * sysfs parts below -->
>   */
> +static ssize_t autotune_stats_show(struct elevator_queue *e, char *page)
> +{
> +	struct cfq_data *cfqd = e->elevator_data;
> +	int pos = 0, i, j;
> +#if HZ < 500
> +	pos += sprintf(page, "Autotune may work incorrectly with HZ < 500\n");
> +#endif
> +	for (j = 0; j < 2; ++j) {
> +		for (i = 0; i < 6; ++i)
> +			pos += sprintf(page+pos, "[%2u]",
> +				cfqd->serv_time_samples[j][i]);
> +		page[pos++] = '\n';
> +	}
> +	page[pos] = '\0';
> +	return pos;
> +}
> +
>  static ssize_t
>  cfq_var_show(unsigned int var, char *page)
>  {
> @@ -3036,6 +3192,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
>  SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
>  SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
>  SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
> +SHOW_FUNCTION(cfq_autotune_show, cfqd->cfq_autotune, 0);
>  #undef SHOW_FUNCTION
>  
>  #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
> @@ -3068,6 +3225,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
>  STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
>  		UINT_MAX, 0);
>  STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
> +STORE_FUNCTION(cfq_autotune_store, &cfqd->cfq_autotune, 0, 1, 0);
>  #undef STORE_FUNCTION
>  
>  #define CFQ_ATTR(name) \
> @@ -3084,6 +3242,8 @@ static struct elv_fs_entry cfq_attrs[] = {
>  	CFQ_ATTR(slice_async_rq),
>  	CFQ_ATTR(slice_idle),
>  	CFQ_ATTR(low_latency),
> +	CFQ_ATTR(autotune),
> +	__ATTR_RO(autotune_stats),
>  	__ATTR_NULL
>  };
>  
> -- 
> 1.6.2.5
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC,RFT,PATCH] cfq: autotuning support
  2009-11-17 16:52 ` Vivek Goyal
@ 2009-11-17 18:21   ` Corrado Zoccolo
  0 siblings, 0 replies; 3+ messages in thread
From: Corrado Zoccolo @ 2009-11-17 18:21 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Linux-Kernel, Jens Axboe, Jeff Moyer

On Tue, Nov 17, 2009 at 5:52 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Tue, Nov 17, 2009 at 03:51:14PM +0100, Corrado Zoccolo wrote:
>> Hi, this is my first attempt at autotuning cfq parameters, and should apply on top of for-2.6.33 branch.
>> Jeff and Vivek, if you can test this on your NCQ SSDs, it will help me to have your feedback (please include
>> the output of: 'grep -r . /sys/block/sdX/queue/iosched' after you run your tests).
>
> Sure, I will do some tests on this patch, may be later today.
>
> I had a quick look at the patch. Had couple of questions.
>
> - Service time seems to be the measure of on an average how much time it
>  took to service a read/write request (be it sequential or random read).
We sample both in the histogram, but then we try to extract the random
read service time.

>  In auto tuning, you seem to be updating slice_idle dynamically based on
>  service time. So the intent seems to be that is service times are higher
>  then it is a slow media and we should have higher slice_idle otherwise
>  a low slice_idle?

Yes. The target is to have slice_idle = the average seek cost.
So we will wait up to the average seek cost for a sequential request
before switching to another queue and paying a possibly long seek.
For media that have really fast seeks (less than 0.5ms), both when
reading and when writing, we set slice_idle = 0. Both of your
SSDs should be in this category.

Thanks
Corrado
> Thanks
> Vivek
>
>
>>
>> The patch introduces code to sample the request service time distribution, and analyze it,
>> in order to compute the following cfq parameters:
>> * slice_idle, is computed as the expected service time of random request
>> * cfq_slice[1] (i.e. the slice for sync queues)
>> * cfq_slice[0] (i.e. the slice for async queues)
>>
>> The sync and async slices are scaled from default values proportionally to the new computed slice_idle.
>>
>> Autotuning will be enabled by default only on kernels compiled with HZ >= 500.
>> With smaller HZ, I don't think jiffies is reliable to estimate those parameters.
>>
>> Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
>> ---
>>  block/cfq-iosched.c |  166 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 files changed, 163 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 6925ab9..d786a0b 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -32,6 +32,12 @@ static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
>>  static const int cfq_hist_divisor = 4;
>>
>>  /*
>> + * Number of samples to collect before updating autotune
>> + * higher # makes the measurements more stable
>> + */
>> +#define CFQ_AUTOTUNE_SAMPLES (10)
>> +
>> +/*
>>   * offset from end of service tree
>>   */
>>  #define CFQ_IDLE_DELAY               (HZ / 5)
>> @@ -201,6 +207,21 @@ struct cfq_data {
>>       unsigned int hw_tag_samples;
>>
>>       /*
>> +      * disk performance measurements
>> +      */
>> +     unsigned long observation_start;
>> +     /*
>> +      * measures are split (READ vs WRITE)
>> +      */
>> +     unsigned long processed_rq[2];
>> +     /*
>> +      * We store an histogram of samples for the service time
>> +      * in log scale [0..5]; [6] is a count, that is reset every
>> +      * time autotuning is done (i.e. every CFQ_AUTOTUNE_SAMPLES)
>> +      */
>> +     unsigned int serv_time_samples[2][7];
>> +
>> +     /*
>>        * idle window management
>>        */
>>       struct timer_list idle_slice_timer;
>> @@ -228,6 +249,7 @@ struct cfq_data {
>>       unsigned int cfq_slice_async_rq;
>>       unsigned int cfq_slice_idle;
>>       unsigned int cfq_latency;
>> +     unsigned int cfq_autotune;
>>
>>       struct list_head cic_list;
>>
>> @@ -891,6 +913,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
>>  {
>>       struct cfq_data *cfqd = q->elevator->elevator_data;
>>
>> +     if (!rq_in_driver(cfqd)) {
>> +             cfqd->observation_start = jiffies;
>> +             cfqd->processed_rq[0] = 0;
>> +             cfqd->processed_rq[1] = 0;
>> +     }
>> +
>>       cfqd->rq_in_driver[rq_is_sync(rq)]++;
>>       cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
>>                                               rq_in_driver(cfqd));
>> @@ -2562,14 +2590,120 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
>>               cfqd->hw_tag = 0;
>>  }
>>
>> +
>> +/*
>> + * Update the histogram to compute service time
>> + * Returns true if we collected enough samples to re-run autotune
>> + */
>> +static bool cfq_update_stime(unsigned samples[6], unsigned long stime)
>> +{
>> +     unsigned idx = (stime > 15) + (stime > 7) + (stime > 3)
>> +                    + (stime > 1) + (stime > 0);
>> +     samples[idx]++;
>> +     if (samples[idx] > (1U<<31))
>> +             for (idx = 0; idx < 6; ++idx) {
>> +                     samples[idx]++;
>> +                     samples[idx] >>= 1;
>> +             }
>> +
>> +     if (samples[6]++ < CFQ_AUTOTUNE_SAMPLES)
>> +             return false;
>> +     samples[6] = 0;
>> +     return true;
>> +}
>> +
>> +/*
>> + * Currently, we measure only service time for pure READ or WRITE requests
>> + * and we update autotune when we have collected enough READ requests
>> + */
>> +static bool cfq_update_perf_measures(struct cfq_data *cfqd, unsigned long now)
>> +{
>> +     unsigned long tot_proc = cfqd->processed_rq[0] + cfqd->processed_rq[1];
>> +     unsigned long obstime = now - cfqd->observation_start;
>> +     unsigned long stime = obstime / tot_proc;
>> +
>> +     cfqd->observation_start = now;
>> +
>> +     if (!cfqd->processed_rq[READ])
>> +             cfq_update_stime(cfqd->serv_time_samples[WRITE], stime);
>> +     if (!cfqd->processed_rq[WRITE])
>> +             return cfq_update_stime(cfqd->serv_time_samples[READ], stime);
>> +     return false;
>> +}
>> +
>> +/*
>> + * Compute service time from the sampled distribution in the histogram
>> + * The real service time distribution is a super-position of two distinct
>> + * distributions:
>> + * the one for sequential requests (usually this has a small mean)
>> + * the one for random requests (usually with a larger mean)
>> + * and we want to identify the random request one
>> + */
>> +static unsigned cfq_service_time(unsigned samples[6])
>> +{
>> +     unsigned last_max = 0, i;
>> +     /* Random request service time corresponds to the
>> +      * largest maximum in the histogram */
>> +     for (i = 1; i < 6; ++i)
>> +             if (samples[i] > samples[i-1])
>> +                     last_max = i;
>> +     /*
>> +      * Unfortunately, if sequential requests overwhelm
>> +      * random ones, and the two peaks are too near,
>> +      * the second peak could be masked by the tail of the first.
>> +      * To catch this, we check if the tail has enough weight,
>> +      * and in this case we take the next bin as maximum
>> +      */
>> +     if (last_max < 5) {
>> +             unsigned total = 0;
>> +             for (i = last_max + 1; i < 6; ++i)
>> +                     total += samples[i];
>> +             if (total > samples[last_max])
>> +                     ++last_max;
>> +     }
>> +     if (!last_max)
>> +             return 0;
>> +     return 1U << last_max;
>> +}
>> +
>> +static void cfq_update_autotune(struct cfq_data *cfqd)
>> +{
>> +     unsigned long base, baseW;
>> +     if (!cfqd->cfq_autotune)
>> +             return;
>> +     base = cfq_service_time(cfqd->serv_time_samples[READ]);
>> +     baseW = cfq_service_time(cfqd->serv_time_samples[WRITE]);
>> +
>> +     /* Compute slice_idle */
>> +     if (!base)
>> +             base = baseW;
>> +     if (base > cfq_slice_idle)
>> +             base = min_t(unsigned long,
>> +                          (base + cfq_slice_idle) / 2, 2 * cfq_slice_idle);
>> +
>> +     cfqd->cfq_slice_idle = base;
>> +
>> +     /* Compute derived cfq_slice[*]
>> +      * Note: those cannot be 0
>> +      */
>> +     if (!base)
>> +             base = 1;
>> +
>> +     if (baseW)
>> +             baseW = min(base, baseW * cfq_slice_sync / cfq_slice_async);
>> +     else
>> +             baseW = base;
>> +
>> +     cfqd->cfq_slice[1] = base * cfq_slice_sync / cfq_slice_idle;
>> +     cfqd->cfq_slice[0] = baseW * cfq_slice_async / cfq_slice_idle;
>> +}
>> +
>>  static void cfq_completed_request(struct request_queue *q, struct request *rq)
>>  {
>>       struct cfq_queue *cfqq = RQ_CFQQ(rq);
>>       struct cfq_data *cfqd = cfqq->cfqd;
>>       const int sync = rq_is_sync(rq);
>> -     unsigned long now;
>> -
>> -     now = jiffies;
>> +     unsigned long now = jiffies;
>>       cfq_log_cfqq(cfqd, cfqq, "complete");
>>
>>       cfq_update_hw_tag(cfqd);
>> @@ -2578,6 +2712,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
>>       WARN_ON(!cfqq->dispatched);
>>       cfqd->rq_in_driver[sync]--;
>>       cfqq->dispatched--;
>> +     cfqd->processed_rq[rq_data_dir(rq)]++;
>> +
>> +     if (!rq_in_driver(cfqd) && cfq_update_perf_measures(cfqd, now))
>> +             cfq_update_autotune(cfqd);
>>
>>       if (cfq_cfqq_sync(cfqq))
>>               cfqd->sync_flight--;
>> @@ -2966,6 +3104,7 @@ static void *cfq_init_queue(struct request_queue *q)
>>       cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
>>       cfqd->cfq_slice_idle = cfq_slice_idle;
>>       cfqd->cfq_latency = 1;
>> +     cfqd->cfq_autotune = (HZ >= 500);
>>       cfqd->hw_tag = -1;
>>       cfqd->last_end_sync_rq = jiffies;
>>       return cfqd;
>> @@ -3002,6 +3141,23 @@ fail:
>>  /*
>>   * sysfs parts below -->
>>   */
>> +static ssize_t autotune_stats_show(struct elevator_queue *e, char *page)
>> +{
>> +     struct cfq_data *cfqd = e->elevator_data;
>> +     int pos = 0, i, j;
>> +#if HZ < 500
>> +     pos += sprintf(page, "Autotune may work incorrectly with HZ < 500\n");
>> +#endif
>> +     for (j = 0; j < 2; ++j) {
>> +             for (i = 0; i < 6; ++i)
>> +                     pos += sprintf(page+pos, "[%2u]",
>> +                             cfqd->serv_time_samples[j][i]);
>> +             page[pos++] = '\n';
>> +     }
>> +     page[pos] = '\0';
>> +     return pos;
>> +}
>> +
>>  static ssize_t
>>  cfq_var_show(unsigned int var, char *page)
>>  {
>> @@ -3036,6 +3192,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
>>  SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
>>  SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
>>  SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
>> +SHOW_FUNCTION(cfq_autotune_show, cfqd->cfq_autotune, 0);
>>  #undef SHOW_FUNCTION
>>
>>  #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                      \
>> @@ -3068,6 +3225,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
>>  STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
>>               UINT_MAX, 0);
>>  STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
>> +STORE_FUNCTION(cfq_autotune_store, &cfqd->cfq_autotune, 0, 1, 0);
>>  #undef STORE_FUNCTION
>>
>>  #define CFQ_ATTR(name) \
>> @@ -3084,6 +3242,8 @@ static struct elv_fs_entry cfq_attrs[] = {
>>       CFQ_ATTR(slice_async_rq),
>>       CFQ_ATTR(slice_idle),
>>       CFQ_ATTR(low_latency),
>> +     CFQ_ATTR(autotune),
>> +     __ATTR_RO(autotune_stats),
>>       __ATTR_NULL
>>  };
>>
>> --
>> 1.6.2.5
>>
>



-- 
__________________________________________________________________________

dott. Corrado Zoccolo                          mailto:czoccolo@gmail.com
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
The self-confidence of a warrior is not the self-confidence of the average
man. The average man seeks certainty in the eyes of the onlooker and calls
that self-confidence. The warrior seeks impeccability in his own eyes and
calls that humbleness.
                               Tales of Power - C. Castaneda

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2009-11-17 18:21 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-11-17 14:51 [RFC,RFT,PATCH] cfq: autotuning support Corrado Zoccolo
2009-11-17 16:52 ` Vivek Goyal
2009-11-17 18:21   ` Corrado Zoccolo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.