linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
  2010-04-14 21:17 [PATCH 0/4 v3] " Jeff Moyer
@ 2010-04-14 21:17 ` Jeff Moyer
  2010-04-14 21:37   ` Vivek Goyal
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Moyer @ 2010-04-14 21:17 UTC (permalink / raw)
  To: jens.axboe; +Cc: linux-kernel, linux-ext4, vgoyal, Jeff Moyer

This patch uses an average think time for the entirety of the sync-noidle
workload to determine whether or not to idle on said workload.  This brings
it more in line with the policy for the sync queues in the sync workload.

Testing shows that this provided an overall increase in throughput for
a mixed workload on my hardware RAID array.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 block/cfq-iosched.c |   45 ++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 838834b..ef59ab3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -83,9 +83,14 @@ struct cfq_rb_root {
 	unsigned total_weight;
 	u64 min_vdisktime;
 	struct rb_node *active;
+	unsigned long last_end_request;
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
 };
 #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
-			.count = 0, .min_vdisktime = 0, }
+			.count = 0, .min_vdisktime = 0, .last_end_request = 0, \
+			.ttime_total = 0, .ttime_samples = 0, .ttime_mean = 0 }
 
 /*
  * Per process-grouping structure
@@ -962,8 +967,10 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 		goto done;
 
 	cfqg->weight = blkcg->weight;
-	for_each_cfqg_st(cfqg, i, j, st)
+	for_each_cfqg_st(cfqg, i, j, st) {
 		*st = CFQ_RB_ROOT;
+		st->last_end_request = jiffies;
+	}
 	RB_CLEAR_NODE(&cfqg->rb_node);
 
 	/*
@@ -1795,9 +1802,12 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	/*
 	 * Otherwise, we do only if they are the last ones
-	 * in their service tree.
+	 * in their service tree and the average think time is
+	 * less than the slice length.
 	 */
-	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
+	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
+	    (!sample_valid(service_tree->ttime_samples || 
+	     cfqq->slice_end - jiffies < service_tree->ttime_mean)))
 		return 1;
 	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
 			service_tree->count);
@@ -2988,6 +2998,18 @@ err:
 }
 
 static void
+cfq_update_st_thinktime(struct cfq_data *cfqd, struct cfq_rb_root *service_tree)
+{
+	unsigned long elapsed = jiffies - service_tree->last_end_request;
+	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
+
+	service_tree->ttime_samples = (7*service_tree->ttime_samples + 256) / 8;
+	service_tree->ttime_total = (7*service_tree->ttime_total + 256*ttime)/8;
+	service_tree->ttime_mean = (service_tree->ttime_total + 128) /
+						service_tree->ttime_samples;
+}
+
+static void
 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 {
 	unsigned long elapsed = jiffies - cic->last_end_request;
@@ -3166,6 +3188,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cic);
+	cfq_update_st_thinktime(cfqd, cfqq->service_tree);
 	cfq_update_io_seektime(cfqd, cfqq, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
@@ -3304,7 +3327,16 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 	if (sync) {
+		struct cfq_rb_root *st;
+
 		RQ_CIC(rq)->last_end_request = now;
+		/*
+		 * cfqq->service_tree is only filled in while on the rb tree,
+		 * so we need to lookup the service tree here.
+		 */
+		st = service_tree_for(cfqq->cfqg,
+				      cfqq_prio(cfqq), cfqq_type(cfqq));
+		st->last_end_request = now;
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
 	}
@@ -3678,11 +3710,14 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
+	cfqd->grp_service_tree.last_end_request = jiffies;
 
 	/* Init root group */
 	cfqg = &cfqd->root_group;
-	for_each_cfqg_st(cfqg, i, j, st)
+	for_each_cfqg_st(cfqg, i, j, st) {
 		*st = CFQ_RB_ROOT;
+		st->last_end_request = jiffies;
+	}
 	RB_CLEAR_NODE(&cfqg->rb_node);
 
 	/* Give preference to root group over other groups */
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
  2010-04-14 21:17 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
@ 2010-04-14 21:37   ` Vivek Goyal
  2010-04-14 23:06     ` Jeff Moyer
  0 siblings, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2010-04-14 21:37 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: jens.axboe, linux-kernel, linux-ext4

On Wed, Apr 14, 2010 at 05:17:03PM -0400, Jeff Moyer wrote:
> This patch uses an average think time for the entirety of the sync-noidle
> workload to determine whether or not to idle on said workload.  This brings
> it more in line with the policy for the sync queues in the sync workload.
> 
> Testing shows that this provided an overall increase in throughput for
> a mixed workload on my hardware RAID array.
> 
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  block/cfq-iosched.c |   45 ++++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 40 insertions(+), 5 deletions(-)
> 
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 838834b..ef59ab3 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -83,9 +83,14 @@ struct cfq_rb_root {
>  	unsigned total_weight;
>  	u64 min_vdisktime;
>  	struct rb_node *active;
> +	unsigned long last_end_request;
> +	unsigned long ttime_total;
> +	unsigned long ttime_samples;
> +	unsigned long ttime_mean;
>  };
>  #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
> -			.count = 0, .min_vdisktime = 0, }
> +			.count = 0, .min_vdisktime = 0, .last_end_request = 0, \
> +			.ttime_total = 0, .ttime_samples = 0, .ttime_mean = 0 }
>  
>  /*
>   * Per process-grouping structure
> @@ -962,8 +967,10 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>  		goto done;
>  
>  	cfqg->weight = blkcg->weight;
> -	for_each_cfqg_st(cfqg, i, j, st)
> +	for_each_cfqg_st(cfqg, i, j, st) {
>  		*st = CFQ_RB_ROOT;
> +		st->last_end_request = jiffies;
> +	}
>  	RB_CLEAR_NODE(&cfqg->rb_node);
>  
>  	/*
> @@ -1795,9 +1802,12 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  
>  	/*
>  	 * Otherwise, we do only if they are the last ones
> -	 * in their service tree.
> +	 * in their service tree and the average think time is
> +	 * less than the slice length.
>  	 */
> -	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
> +	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
> +	    (!sample_valid(service_tree->ttime_samples || 
> +	     cfqq->slice_end - jiffies < service_tree->ttime_mean)))
>  		return 1;
>  	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
>  			service_tree->count);
> @@ -2988,6 +2998,18 @@ err:
>  }
>  
>  static void
> +cfq_update_st_thinktime(struct cfq_data *cfqd, struct cfq_rb_root *service_tree)
> +{
> +	unsigned long elapsed = jiffies - service_tree->last_end_request;
> +	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
> +
> +	service_tree->ttime_samples = (7*service_tree->ttime_samples + 256) / 8;
> +	service_tree->ttime_total = (7*service_tree->ttime_total + 256*ttime)/8;
> +	service_tree->ttime_mean = (service_tree->ttime_total + 128) /
> +						service_tree->ttime_samples;
> +}
> +
> +static void
>  cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
>  {
>  	unsigned long elapsed = jiffies - cic->last_end_request;
> @@ -3166,6 +3188,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>  		cfqq->meta_pending++;
>  
>  	cfq_update_io_thinktime(cfqd, cic);
> +	cfq_update_st_thinktime(cfqd, cfqq->service_tree);
>  	cfq_update_io_seektime(cfqd, cfqq, rq);
>  	cfq_update_idle_window(cfqd, cfqq, cic);
>  
> @@ -3304,7 +3327,16 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
>  	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
>  
>  	if (sync) {
> +		struct cfq_rb_root *st;
> +
>  		RQ_CIC(rq)->last_end_request = now;
> +		/*
> +		 * cfqq->service_tree is only filled in while on the rb tree,
> +		 * so we need to lookup the service tree here.
> +		 */
> +		st = service_tree_for(cfqq->cfqg,
> +				      cfqq_prio(cfqq), cfqq_type(cfqq));
> +		st->last_end_request = now;
>  		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
>  			cfqd->last_delayed_sync = now;
>  	}
> @@ -3678,11 +3710,14 @@ static void *cfq_init_queue(struct request_queue *q)
>  
>  	/* Init root service tree */
>  	cfqd->grp_service_tree = CFQ_RB_ROOT;
> +	cfqd->grp_service_tree.last_end_request = jiffies;
>  

This assignment is not required as we never update think time analsys of
service tree where all the groups are hanging.  So for grp_service_tree,
last_end_request can be zero and there will be no harm.

Otherwise patch looks good to me. Can you please run some simple blkio
cgroup tests to make sure that functionality is not broken.

Acked-by: Vivek Goyal <vgoyal@redhat.com>

Thanks
Vivek

>  	/* Init root group */
>  	cfqg = &cfqd->root_group;
> -	for_each_cfqg_st(cfqg, i, j, st)
> +	for_each_cfqg_st(cfqg, i, j, st) {
>  		*st = CFQ_RB_ROOT;
> +		st->last_end_request = jiffies;
> +	}
>  	RB_CLEAR_NODE(&cfqg->rb_node);
>  
>  	/* Give preference to root group over other groups */
> -- 
> 1.6.2.5

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
  2010-04-14 21:37   ` Vivek Goyal
@ 2010-04-14 23:06     ` Jeff Moyer
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-04-14 23:06 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: jens.axboe, linux-kernel, linux-ext4

Vivek Goyal <vgoyal@redhat.com> writes:

> On Wed, Apr 14, 2010 at 05:17:03PM -0400, Jeff Moyer wrote:

>> @@ -3678,11 +3710,14 @@ static void *cfq_init_queue(struct request_queue *q)
>>  
>>  	/* Init root service tree */
>>  	cfqd->grp_service_tree = CFQ_RB_ROOT;
>> +	cfqd->grp_service_tree.last_end_request = jiffies;
>>  
>
> This assignment is not required as we never update think time analsys of
> service tree where all the groups are hanging.  So for grp_service_tree,
> last_end_request can be zero and there will be no harm.

OK, thanks.

> Otherwise patch looks good to me. Can you please run some simple blkio
> cgroup tests to make sure that functionality is not broken.

Yes, I'll add that to my list and get back to you.

Thanks for the review, Vivek.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ
@ 2010-05-18 18:20 Jeff Moyer
  2010-05-18 18:20 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-05-18 18:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-ext4, jens.axboe, vgoyal

Hi,

In this, the fourth posting of this patch series, I've addressed the following
issues:
- cfq queue yielding is now done in select_queue instead of the dispatch routine
- minor patch review comments were addressed
- the queue is now yielded to a specific task

For those not familiar with this patch set already, previous discussions
appeared here:
  http://lkml.org/lkml/2010/4/1/344
  http://lkml.org/lkml/2010/4/7/325
  http://lkml.org/lkml/2010/4/14/394

This patch series addresses a performance problem experienced when running
io_zone with small file sizes (from 4KB up to 8MB) and including fsync in
the timings.  A good example of this would be the following command line:
  iozone -s 64 -e -f /mnt/test/iozone.0 -i 0
As the file sizes get larger, the performance improves.  By the time the
file size is 16MB, there is no difference in performance between runs
using CFQ and runs using deadline.  The storage in my testing was a NetApp
array connected via a single fibre channel link.  When testing against a
single SATA disk, the performance difference is not apparent.

fs_mark can also be used to show the performance problem using the following
example command line:
  fs_mark  -S  1  -D  100  -N  1000  -d  /mnt/test/fs_mark  -s  65536  -t  1  -w  4096

Following are some performance numbers from my testing.  The below numbers
represent an average of 5 runs for each configuration when running: 
        iozone -s 64 -e -f /mnt/test/iozone.0 -i 0
Numbers are in KB/s.

            |    SATA      |     %diff      ||      SAN      |     %diff
            |write |rewrite| write |rewrite || write |rewrite| write |rewrite
------------+--------------+----------------++-------------------------------
deadline    | 1452 |  1788 | 1.0   | 1.0    || 35611 | 46260 | 1.0   | 1.0
vanilla cfq | 1323 |  1330 | 0.91  | 0.74   ||  6725 |  7163 | 0.19  | 0.15
patched cfq | 1591 |  1485 | 1.10  | 0.83   || 35555 | 46358 | 1.0   | 1.0


Here are some fs_mark numbers from the same storage configurations:

            SATA | SAN
           file/s|file/s
----------+------+------
deadline  | 33.7 | 538.9
unpatched | 33.5 | 110.2
  patched | 35.6 | 558.9

It's worth noting that this patch series only helps a single stream of I/O in
my testing.  What I mean by that is, if you were to add a single sequential
reader into the mix, the performance of CFQ again drops for the fsync-ing
process.  I fought with that for a while, but I think it is likely the subject
for another patch series.

I'd like to get some comments and performance testing feedback from others
as I'm not yet 100% convinced of the merits of this approach.

Cheers,
Jeff

[PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
[PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler.
[PATCH 3/4] jbd: yield the device queue when waiting for commits
[PATCH 4/4] jbd2: yield the device queue when waiting for journal commits

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
  2010-05-18 18:20 [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ Jeff Moyer
@ 2010-05-18 18:20 ` Jeff Moyer
  2010-05-18 20:52   ` Vivek Goyal
  2010-05-18 21:02   ` Vivek Goyal
  2010-05-18 18:20 ` [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler Jeff Moyer
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-05-18 18:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-ext4, jens.axboe, vgoyal, Jeff Moyer

This patch uses an average think time for the entirety of the sync-noidle
workload to determine whether or not to idle on said workload.  This brings
it more in line with the policy for the sync queues in the sync workload.

Testing shows that this provided an overall increase in throughput for
a mixed workload on my hardware RAID array.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 block/cfq-iosched.c |   44 +++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 838834b..46a7fe5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -83,9 +83,14 @@ struct cfq_rb_root {
 	unsigned total_weight;
 	u64 min_vdisktime;
 	struct rb_node *active;
+	unsigned long last_end_request;
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
 };
 #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
-			.count = 0, .min_vdisktime = 0, }
+			.count = 0, .min_vdisktime = 0, .last_end_request = 0, \
+			.ttime_total = 0, .ttime_samples = 0, .ttime_mean = 0 }
 
 /*
  * Per process-grouping structure
@@ -962,8 +967,10 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 		goto done;
 
 	cfqg->weight = blkcg->weight;
-	for_each_cfqg_st(cfqg, i, j, st)
+	for_each_cfqg_st(cfqg, i, j, st) {
 		*st = CFQ_RB_ROOT;
+		st->last_end_request = jiffies;
+	}
 	RB_CLEAR_NODE(&cfqg->rb_node);
 
 	/*
@@ -1795,9 +1802,12 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	/*
 	 * Otherwise, we do only if they are the last ones
-	 * in their service tree.
+	 * in their service tree and the average think time is
+	 * less than the slice length.
 	 */
-	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
+	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
+	    (!sample_valid(service_tree->ttime_samples || 
+	     cfqq->slice_end - jiffies < service_tree->ttime_mean)))
 		return 1;
 	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
 			service_tree->count);
@@ -2988,6 +2998,18 @@ err:
 }
 
 static void
+cfq_update_st_thinktime(struct cfq_data *cfqd, struct cfq_rb_root *service_tree)
+{
+	unsigned long elapsed = jiffies - service_tree->last_end_request;
+	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
+
+	service_tree->ttime_samples = (7*service_tree->ttime_samples + 256) / 8;
+	service_tree->ttime_total = (7*service_tree->ttime_total + 256*ttime)/8;
+	service_tree->ttime_mean = (service_tree->ttime_total + 128) /
+						service_tree->ttime_samples;
+}
+
+static void
 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 {
 	unsigned long elapsed = jiffies - cic->last_end_request;
@@ -3166,6 +3188,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cic);
+	cfq_update_st_thinktime(cfqd, cfqq->service_tree);
 	cfq_update_io_seektime(cfqd, cfqq, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
@@ -3304,7 +3327,16 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 	if (sync) {
+		struct cfq_rb_root *st;
+
 		RQ_CIC(rq)->last_end_request = now;
+		/*
+		 * cfqq->service_tree is only filled in while on the rb tree,
+		 * so we need to lookup the service tree here.
+		 */
+		st = service_tree_for(cfqq->cfqg,
+				      cfqq_prio(cfqq), cfqq_type(cfqq));
+		st->last_end_request = now;
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
 	}
@@ -3681,8 +3713,10 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	/* Init root group */
 	cfqg = &cfqd->root_group;
-	for_each_cfqg_st(cfqg, i, j, st)
+	for_each_cfqg_st(cfqg, i, j, st) {
 		*st = CFQ_RB_ROOT;
+		st->last_end_request = jiffies;
+	}
 	RB_CLEAR_NODE(&cfqg->rb_node);
 
 	/* Give preference to root group over other groups */
-- 
1.6.5.2

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler.
  2010-05-18 18:20 [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ Jeff Moyer
  2010-05-18 18:20 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
@ 2010-05-18 18:20 ` Jeff Moyer
  2010-05-18 21:07   ` Vivek Goyal
  2010-05-18 21:44   ` Vivek Goyal
  2010-05-18 18:20 ` [PATCH 3/4] jbd: yield the device queue when waiting for commits Jeff Moyer
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-05-18 18:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-ext4, jens.axboe, vgoyal, Jeff Moyer

This patch implements a blk_yield to allow a process to voluntarily
give up its I/O scheduler time slice.  This is desirable for those processes
which know that they will be blocked on I/O from another process, such as
the file system journal thread.  Following patches will put calls to blk_yield
into jbd and jbd2.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 block/blk-core.c         |   13 +++++
 block/blk-settings.c     |    6 +++
 block/cfq-iosched.c      |  112 +++++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c         |    8 +++
 include/linux/blkdev.h   |    4 ++
 include/linux/elevator.h |    3 +
 6 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 9fe174d..b8be6c8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -323,6 +323,18 @@ void blk_unplug(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_unplug);
 
+void generic_yield_iosched(struct request_queue *q, struct task_struct *tsk)
+{
+	elv_yield(q, tsk);
+}
+
+void blk_yield(struct request_queue *q, struct task_struct *tsk)
+{
+	if (q->yield_fn)
+		q->yield_fn(q, tsk);
+}
+EXPORT_SYMBOL(blk_yield);
+
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q:    The &struct request_queue in question
@@ -580,6 +592,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
 	q->unplug_fn		= generic_unplug_device;
+	q->yield_fn		= generic_yield_iosched;
 	q->queue_flags		= QUEUE_FLAG_DEFAULT;
 	q->queue_lock		= lock;
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index f5ed5a1..fe548c9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -171,6 +171,12 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 }
 EXPORT_SYMBOL(blk_queue_make_request);
 
+void blk_queue_yield(struct request_queue *q, yield_fn *yield)
+{
+	q->yield_fn = yield;
+}
+EXPORT_SYMBOL_GPL(blk_queue_yield);
+
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q: the request queue for the device
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 46a7fe5..9aab701 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -148,6 +148,7 @@ struct cfq_queue {
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
 	struct cfq_group *orig_cfqg;
+	struct cfq_io_context *yield_to;
 	/* Sectors dispatched in current dispatch round */
 	unsigned long nr_sectors;
 };
@@ -320,6 +321,7 @@ enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
 	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
 	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
+	CFQ_CFQQ_FLAG_yield,		/* Allow another cfqq to run */
 };
 
 #define CFQ_CFQQ_FNS(name)						\
@@ -349,6 +351,7 @@ CFQ_CFQQ_FNS(coop);
 CFQ_CFQQ_FNS(split_coop);
 CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
+CFQ_CFQQ_FNS(yield);
 #undef CFQ_CFQQ_FNS
 
 #ifdef CONFIG_DEBUG_CFQ_IOSCHED
@@ -1566,6 +1569,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	cfq_clear_cfqq_wait_request(cfqq);
 	cfq_clear_cfqq_wait_busy(cfqq);
+	cfq_clear_cfqq_yield(cfqq);
 
 	/*
 	 * If this cfqq is shared between multiple processes, check to
@@ -2068,7 +2072,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		slice = max(slice, 2 * cfqd->cfq_slice_idle);
 
 	slice = max_t(unsigned, slice, CFQ_MIN_TT);
-	cfq_log(cfqd, "workload slice:%d", slice);
+	cfq_log(cfqd, "workload:%d slice:%d", cfqd->serving_type, slice);
 	cfqd->workload_expires = jiffies + slice;
 	cfqd->noidle_tree_requires_idle = false;
 }
@@ -2138,7 +2142,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		 * ok to wait for this request to complete.
 		 */
 		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
-		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
+		    && cfqq->dispatched && !cfq_cfqq_yield(cfqq) &&
+		    cfq_should_idle(cfqd, cfqq)) {
 			cfqq = NULL;
 			goto keep_queue;
 		} else
@@ -2165,6 +2170,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		goto expire;
 	}
 
+	if (cfq_cfqq_yield(cfqq))
+		goto expire;
+
 	/*
 	 * No requests pending. If the active queue still has requests in
 	 * flight or is idling for a new request, allow either of these
@@ -2191,6 +2199,98 @@ keep_queue:
 	return cfqq;
 }
 
+static void cfq_yield(struct request_queue *q, struct task_struct *tsk)
+{
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+	struct cfq_io_context *cic, *new_cic;
+	struct cfq_queue *cfqq, *sync_cfqq, *async_cfqq, *new_cfqq;
+
+	cic = cfq_cic_lookup(cfqd, current->io_context);
+	if (!cic)
+		return;
+
+	task_lock(tsk);
+	new_cic = cfq_cic_lookup(cfqd, tsk->io_context);
+	atomic_long_inc(&tsk->io_context->refcount);
+	task_unlock(tsk);
+	if (!new_cic)
+		goto out_dec;
+
+	spin_lock_irq(q->queue_lock);
+
+	/*
+	 * This is primarily called to ensure that the long synchronous
+	 * time slice does not prevent other I/O happenning (like journal
+	 * commits) while we idle waiting for it.  Thus, check to see if the
+	 * current cfqq is the sync cfqq for this process.
+	 */
+	cfqq = cic_to_cfqq(cic, 1);
+	if (!cfqq)
+		goto out_unlock;
+
+	if (cfqd->active_queue != cfqq)
+		goto out_unlock;
+
+	sync_cfqq = cic_to_cfqq(new_cic, 1);
+	async_cfqq = cic_to_cfqq(new_cic, 0);
+	if (!sync_cfqq && !async_cfqq)
+		goto out_unlock;
+	if (!RB_EMPTY_ROOT(&sync_cfqq->sort_list))
+		new_cfqq = sync_cfqq;
+	else
+		new_cfqq = async_cfqq;
+
+	/*
+	 * If we are currently servicing the SYNC_NOIDLE_WORKLOAD, and we
+	 * are idling on the last queue in that workload, *and* the average
+	 * think time is larger thank the remaining slice time, go ahead
+	 * and yield the queue.  Otherwise, don't yield so that fsync-heavy
+	 * workloads don't starve out the sync-noidle workload.
+	 */
+	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
+	    (!sample_valid(cfqq->service_tree->ttime_samples) ||
+	     cfqq->slice_end - jiffies > cfqq->service_tree->ttime_mean)) {
+		/*
+		 * If there's only a single sync-noidle queue on average,
+		 * there's no point in idling in order to not starve the
+		 * non-existent other queues.
+		 */
+		if (cfq_group_busy_queues_wl(SYNC_NOIDLE_WORKLOAD, cfqd,
+					     cfqq->cfqg) > 1)
+			goto out_unlock;
+	}
+
+	cfq_log_cfqq(cfqd, cfqq, "yielding queue");
+
+	/*
+	 * If there are other requests pending, just mark the queue as
+	 * yielding and give up our slice after the last request is
+	 * dispatched.  Or, if there are no requests currently pending
+	 * on the selected queue, keep our time slice until such a time
+	 * as the new queue has something to do.  It will then preempt
+	 * this queue (see cfq_should_preempt).
+	 */
+	if (!RB_EMPTY_ROOT(&cfqq->sort_list) ||
+	    RB_EMPTY_ROOT(&new_cfqq->sort_list)) {
+		cfqq->yield_to = new_cic;
+		cfq_mark_cfqq_yield(cfqq);
+		goto out_unlock;
+	}
+
+	/*
+	 * At this point, we know that the target queue has requests pending.
+	 * Yield the io scheduler.
+	 */
+	__cfq_slice_expired(cfqd, cfqq, 1);
+	__cfq_set_active_queue(cfqd, new_cfqq);
+	__blk_run_queue(cfqd->queue);
+
+out_unlock:
+	spin_unlock_irq(q->queue_lock);
+out_dec:
+	put_io_context(tsk->io_context);
+}
+
 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
 {
 	int dispatched = 0;
@@ -3094,6 +3194,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if (!cfqq)
 		return false;
 
+	/*
+	 * If the active queue yielded its timeslice to this queue, let
+	 * it preempt.
+	 */
+	if (cfq_cfqq_yield(cfqq) && RQ_CIC(rq) == cfqq->yield_to)
+		return true;
+
 	if (cfq_class_idle(new_cfqq))
 		return false;
 
@@ -3910,6 +4017,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_deactivate_req_fn =	cfq_deactivate_request,
 		.elevator_queue_empty_fn =	cfq_queue_empty,
 		.elevator_completed_req_fn =	cfq_completed_request,
+		.elevator_yield_fn =		cfq_yield,
 		.elevator_former_req_fn =	elv_rb_former_request,
 		.elevator_latter_req_fn =	elv_rb_latter_request,
 		.elevator_set_req_fn =		cfq_set_request,
diff --git a/block/elevator.c b/block/elevator.c
index 76e3702..3af7796 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -855,6 +855,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	}
 }
 
+void elv_yield(struct request_queue *q, struct task_struct *tsk)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->ops->elevator_yield_fn)
+		e->ops->elevator_yield_fn(q, tsk);
+}
+
 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
 
 static ssize_t
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6690e8b..1099d29 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -259,6 +259,7 @@ struct request_pm_state
 
 typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
+typedef void (yield_fn) (struct request_queue *q, struct task_struct *tsk);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
 
@@ -341,6 +342,7 @@ struct request_queue
 
 	request_fn_proc		*request_fn;
 	make_request_fn		*make_request_fn;
+	yield_fn		*yield_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
@@ -833,6 +835,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 extern void blk_unplug(struct request_queue *q);
+extern void blk_yield(struct request_queue *q, struct task_struct *tsk);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
@@ -920,6 +923,7 @@ extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
+extern void blk_queue_yield(struct request_queue *, yield_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_segments(struct request_queue *, unsigned short);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1cb3372..a5aaee9 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -20,6 +20,7 @@ typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
 typedef int (elevator_queue_empty_fn) (struct request_queue *);
 typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
+typedef void (elevator_yield_fn) (struct request_queue *, struct task_struct *tsk);
 typedef int (elevator_may_queue_fn) (struct request_queue *, int);
 
 typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
@@ -44,6 +45,7 @@ struct elevator_ops
 
 	elevator_queue_empty_fn *elevator_queue_empty_fn;
 	elevator_completed_req_fn *elevator_completed_req_fn;
+	elevator_yield_fn *elevator_yield_fn;
 
 	elevator_request_list_fn *elevator_former_req_fn;
 	elevator_request_list_fn *elevator_latter_req_fn;
@@ -105,6 +107,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
 extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern int elv_queue_empty(struct request_queue *);
+extern void elv_yield(struct request_queue *, struct task_struct *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
 extern int elv_register_queue(struct request_queue *q);
-- 
1.6.5.2

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 3/4] jbd: yield the device queue when waiting for commits
  2010-05-18 18:20 [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ Jeff Moyer
  2010-05-18 18:20 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
  2010-05-18 18:20 ` [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler Jeff Moyer
@ 2010-05-18 18:20 ` Jeff Moyer
  2010-05-18 18:20 ` [PATCH 4/4] jbd2: yield the device queue when waiting for journal commits Jeff Moyer
  2010-05-19  2:05 ` [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ KOSAKI Motohiro
  4 siblings, 0 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-05-18 18:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-ext4, jens.axboe, vgoyal, Jeff Moyer

This patch gets CFQ back in line with deadline for iozone runs, especially
those testing small files + fsync timings.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/jbd/journal.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224ee..267108f 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
+#include <linux/blkdev.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -549,6 +550,11 @@ int log_wait_commit(journal_t *journal, tid_t tid)
 	while (tid_gt(tid, journal->j_commit_sequence)) {
 		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
 				  tid, journal->j_commit_sequence);
+		/*
+		 * Give up our I/O scheduler time slice to allow the journal
+		 * thread to issue I/O.
+		 */
+		blk_yield(journal->j_dev->bd_disk->queue, journal->j_task);
 		wake_up(&journal->j_wait_commit);
 		spin_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_done_commit,
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 4/4] jbd2: yield the device queue when waiting for journal commits
  2010-05-18 18:20 [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ Jeff Moyer
                   ` (2 preceding siblings ...)
  2010-05-18 18:20 ` [PATCH 3/4] jbd: yield the device queue when waiting for commits Jeff Moyer
@ 2010-05-18 18:20 ` Jeff Moyer
  2010-05-19  2:05 ` [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ KOSAKI Motohiro
  4 siblings, 0 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-05-18 18:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-ext4, jens.axboe, vgoyal, Jeff Moyer

This patch gets CFQ back in line with deadline for iozone runs, especially
those testing small files + fsync timings.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/jbd2/journal.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dc..28848a6 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,6 +41,7 @@
 #include <linux/hash.h>
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
+#include <linux/blkdev.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -580,6 +581,11 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	while (tid_gt(tid, journal->j_commit_sequence)) {
 		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
 				  tid, journal->j_commit_sequence);
+		/*
+		 * Give up our I/O scheduler time slice to allow the journal
+		 * thread to issue I/O.
+		 */
+		blk_yield(journal->j_dev->bd_disk->queue, journal->j_task);
 		wake_up(&journal->j_wait_commit);
 		spin_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_done_commit,
-- 
1.6.5.2

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
  2010-05-18 18:20 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
@ 2010-05-18 20:52   ` Vivek Goyal
  2010-05-18 21:02   ` Vivek Goyal
  1 sibling, 0 replies; 16+ messages in thread
From: Vivek Goyal @ 2010-05-18 20:52 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, linux-ext4, jens.axboe

On Tue, May 18, 2010 at 02:20:17PM -0400, Jeff Moyer wrote:
> This patch uses an average think time for the entirety of the sync-noidle
> workload to determine whether or not to idle on said workload.  This brings
> it more in line with the policy for the sync queues in the sync workload.
> 
> Testing shows that this provided an overall increase in throughput for
> a mixed workload on my hardware RAID array.
> 
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  block/cfq-iosched.c |   44 +++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 39 insertions(+), 5 deletions(-)
> 
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 838834b..46a7fe5 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -83,9 +83,14 @@ struct cfq_rb_root {
>  	unsigned total_weight;
>  	u64 min_vdisktime;
>  	struct rb_node *active;
> +	unsigned long last_end_request;
> +	unsigned long ttime_total;
> +	unsigned long ttime_samples;
> +	unsigned long ttime_mean;
>  };
>  #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
> -			.count = 0, .min_vdisktime = 0, }
> +			.count = 0, .min_vdisktime = 0, .last_end_request = 0, \
> +			.ttime_total = 0, .ttime_samples = 0, .ttime_mean = 0 }
>  
>  /*
>   * Per process-grouping structure
> @@ -962,8 +967,10 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>  		goto done;
>  
>  	cfqg->weight = blkcg->weight;
> -	for_each_cfqg_st(cfqg, i, j, st)
> +	for_each_cfqg_st(cfqg, i, j, st) {
>  		*st = CFQ_RB_ROOT;
> +		st->last_end_request = jiffies;
> +	}
>  	RB_CLEAR_NODE(&cfqg->rb_node);
>  
>  	/*
> @@ -1795,9 +1802,12 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  
>  	/*
>  	 * Otherwise, we do only if they are the last ones
> -	 * in their service tree.
> +	 * in their service tree and the average think time is
> +	 * less than the slice length.
>  	 */
> -	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
> +	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
> +	    (!sample_valid(service_tree->ttime_samples || 
						     ^^^
Jeff, 

Are we closing sample_valid() bracket at right place here?

I am wondering where it is helping you. If it is to bring in line with
with sync tree (old implementation), then we should have also compared
the think time with slice_idle?

But comparing that here might not be the best thing as cfq_should_idle()
is used in many contexts.
 
Vivek

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
  2010-05-18 18:20 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
  2010-05-18 20:52   ` Vivek Goyal
@ 2010-05-18 21:02   ` Vivek Goyal
  2010-06-01 19:31     ` Jeff Moyer
  1 sibling, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2010-05-18 21:02 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, linux-ext4, jens.axboe

On Tue, May 18, 2010 at 02:20:17PM -0400, Jeff Moyer wrote:
> This patch uses an average think time for the entirety of the sync-noidle
> workload to determine whether or not to idle on said workload.  This brings
> it more in line with the policy for the sync queues in the sync workload.
> 
> Testing shows that this provided an overall increase in throughput for
> a mixed workload on my hardware RAID array.
> 
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  block/cfq-iosched.c |   44 +++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 39 insertions(+), 5 deletions(-)
> 
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 838834b..46a7fe5 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -83,9 +83,14 @@ struct cfq_rb_root {
>  	unsigned total_weight;
>  	u64 min_vdisktime;
>  	struct rb_node *active;
> +	unsigned long last_end_request;
> +	unsigned long ttime_total;
> +	unsigned long ttime_samples;
> +	unsigned long ttime_mean;
>  };
>  #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
> -			.count = 0, .min_vdisktime = 0, }
> +			.count = 0, .min_vdisktime = 0, .last_end_request = 0, \
> +			.ttime_total = 0, .ttime_samples = 0, .ttime_mean = 0 }
>  
>  /*
>   * Per process-grouping structure
> @@ -962,8 +967,10 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>  		goto done;
>  
>  	cfqg->weight = blkcg->weight;
> -	for_each_cfqg_st(cfqg, i, j, st)
> +	for_each_cfqg_st(cfqg, i, j, st) {
>  		*st = CFQ_RB_ROOT;
> +		st->last_end_request = jiffies;
> +	}
>  	RB_CLEAR_NODE(&cfqg->rb_node);
>  
>  	/*
> @@ -1795,9 +1802,12 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  
>  	/*
>  	 * Otherwise, we do only if they are the last ones
> -	 * in their service tree.
> +	 * in their service tree and the average think time is
> +	 * less than the slice length.
>  	 */
> -	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
> +	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
> +	    (!sample_valid(service_tree->ttime_samples || 
> +	     cfqq->slice_end - jiffies < service_tree->ttime_mean)))
>  		return 1;

This comparision will also might break some logic in select_queue() where
we wait for a queue/group to get busy even if queue's time slice has
expired.

********************************************************************
if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
                /*
                 * If slice had not expired at the completion of last
                 * request
                 * we might not have turned on wait_busy flag. Don't
                 * expire
                 * the queue yet. Allow the group to get backlogged.
                 *
                 * The very fact that we have used the slice, that means
                 * we
                 * have been idling all along on this queue and it should
                 * be
                 * ok to wait for this request to complete.
                 */
                if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
                    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
                        cfqq = NULL;
                        goto keep_queue;
		}

*************************************************************************

With this change, now above condition will never be true as
cfq_should_idle() will always return false as slice has already expired.
And that will affect group loosing its fair share.

So I guess we can define new functions to check more conditions instead of
putting it in cfq_should_idle()

Vivek

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler.
  2010-05-18 18:20 ` [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler Jeff Moyer
@ 2010-05-18 21:07   ` Vivek Goyal
  2010-05-18 21:44   ` Vivek Goyal
  1 sibling, 0 replies; 16+ messages in thread
From: Vivek Goyal @ 2010-05-18 21:07 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, linux-ext4, jens.axboe

On Tue, May 18, 2010 at 02:20:18PM -0400, Jeff Moyer wrote:
> This patch implements a blk_yield to allow a process to voluntarily
> give up its I/O scheduler time slice.  This is desirable for those processes
> which know that they will be blocked on I/O from another process, such as
> the file system journal thread.  Following patches will put calls to blk_yield
> into jbd and jbd2.
> 
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  block/blk-core.c         |   13 +++++
>  block/blk-settings.c     |    6 +++
>  block/cfq-iosched.c      |  112 +++++++++++++++++++++++++++++++++++++++++++++-
>  block/elevator.c         |    8 +++
>  include/linux/blkdev.h   |    4 ++
>  include/linux/elevator.h |    3 +
>  6 files changed, 144 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 9fe174d..b8be6c8 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -323,6 +323,18 @@ void blk_unplug(struct request_queue *q)
>  }
>  EXPORT_SYMBOL(blk_unplug);
>  
> +void generic_yield_iosched(struct request_queue *q, struct task_struct *tsk)
> +{
> +	elv_yield(q, tsk);
> +}
> +
> +void blk_yield(struct request_queue *q, struct task_struct *tsk)
> +{
> +	if (q->yield_fn)
> +		q->yield_fn(q, tsk);
> +}
> +EXPORT_SYMBOL(blk_yield);
> +
>  /**
>   * blk_start_queue - restart a previously stopped queue
>   * @q:    The &struct request_queue in question
> @@ -580,6 +592,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
>  	q->request_fn		= rfn;
>  	q->prep_rq_fn		= NULL;
>  	q->unplug_fn		= generic_unplug_device;
> +	q->yield_fn		= generic_yield_iosched;
>  	q->queue_flags		= QUEUE_FLAG_DEFAULT;
>  	q->queue_lock		= lock;
>  
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index f5ed5a1..fe548c9 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -171,6 +171,12 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
>  }
>  EXPORT_SYMBOL(blk_queue_make_request);
>  
> +void blk_queue_yield(struct request_queue *q, yield_fn *yield)
> +{
> +	q->yield_fn = yield;
> +}
> +EXPORT_SYMBOL_GPL(blk_queue_yield);
> +
>  /**
>   * blk_queue_bounce_limit - set bounce buffer limit for queue
>   * @q: the request queue for the device
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 46a7fe5..9aab701 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -148,6 +148,7 @@ struct cfq_queue {
>  	struct cfq_queue *new_cfqq;
>  	struct cfq_group *cfqg;
>  	struct cfq_group *orig_cfqg;
> +	struct cfq_io_context *yield_to;
>  	/* Sectors dispatched in current dispatch round */
>  	unsigned long nr_sectors;
>  };
> @@ -320,6 +321,7 @@ enum cfqq_state_flags {
>  	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
>  	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
>  	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
> +	CFQ_CFQQ_FLAG_yield,		/* Allow another cfqq to run */
>  };
>  
>  #define CFQ_CFQQ_FNS(name)						\
> @@ -349,6 +351,7 @@ CFQ_CFQQ_FNS(coop);
>  CFQ_CFQQ_FNS(split_coop);
>  CFQ_CFQQ_FNS(deep);
>  CFQ_CFQQ_FNS(wait_busy);
> +CFQ_CFQQ_FNS(yield);
>  #undef CFQ_CFQQ_FNS
>  
>  #ifdef CONFIG_DEBUG_CFQ_IOSCHED
> @@ -1566,6 +1569,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>  
>  	cfq_clear_cfqq_wait_request(cfqq);
>  	cfq_clear_cfqq_wait_busy(cfqq);
> +	cfq_clear_cfqq_yield(cfqq);
>  
>  	/*
>  	 * If this cfqq is shared between multiple processes, check to
> @@ -2068,7 +2072,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  		slice = max(slice, 2 * cfqd->cfq_slice_idle);
>  
>  	slice = max_t(unsigned, slice, CFQ_MIN_TT);
> -	cfq_log(cfqd, "workload slice:%d", slice);
> +	cfq_log(cfqd, "workload:%d slice:%d", cfqd->serving_type, slice);
>  	cfqd->workload_expires = jiffies + slice;
>  	cfqd->noidle_tree_requires_idle = false;
>  }
> @@ -2138,7 +2142,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>  		 * ok to wait for this request to complete.
>  		 */
>  		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
> -		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
> +		    && cfqq->dispatched && !cfq_cfqq_yield(cfqq) &&
> +		    cfq_should_idle(cfqd, cfqq)) {
>  			cfqq = NULL;

I think if we just place cfq_cfqq_yield(cfqq), check above it, we don't
need above code modification?

This might result in some group losing fairness in certain scenarios, but
I guess we will tackle it once we face it. For the time being if fsync
thread is giving up cpu, journald commits will come in root group and
there might not be a point in wasting time idling on this group.

Vivek

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler.
  2010-05-18 18:20 ` [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler Jeff Moyer
  2010-05-18 21:07   ` Vivek Goyal
@ 2010-05-18 21:44   ` Vivek Goyal
  2010-06-01 20:01     ` Jeff Moyer
  1 sibling, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2010-05-18 21:44 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, linux-ext4, jens.axboe

On Tue, May 18, 2010 at 02:20:18PM -0400, Jeff Moyer wrote:
> This patch implements a blk_yield to allow a process to voluntarily
> give up its I/O scheduler time slice.  This is desirable for those processes
> which know that they will be blocked on I/O from another process, such as
> the file system journal thread.  Following patches will put calls to blk_yield
> into jbd and jbd2.
> 
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  block/blk-core.c         |   13 +++++
>  block/blk-settings.c     |    6 +++
>  block/cfq-iosched.c      |  112 +++++++++++++++++++++++++++++++++++++++++++++-
>  block/elevator.c         |    8 +++
>  include/linux/blkdev.h   |    4 ++
>  include/linux/elevator.h |    3 +
>  6 files changed, 144 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 9fe174d..b8be6c8 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -323,6 +323,18 @@ void blk_unplug(struct request_queue *q)
>  }
>  EXPORT_SYMBOL(blk_unplug);
>  
> +void generic_yield_iosched(struct request_queue *q, struct task_struct *tsk)
> +{
> +	elv_yield(q, tsk);
> +}
> +
> +void blk_yield(struct request_queue *q, struct task_struct *tsk)
> +{
> +	if (q->yield_fn)
> +		q->yield_fn(q, tsk);
> +}
> +EXPORT_SYMBOL(blk_yield);
> +
>  /**
>   * blk_start_queue - restart a previously stopped queue
>   * @q:    The &struct request_queue in question
> @@ -580,6 +592,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
>  	q->request_fn		= rfn;
>  	q->prep_rq_fn		= NULL;
>  	q->unplug_fn		= generic_unplug_device;
> +	q->yield_fn		= generic_yield_iosched;
>  	q->queue_flags		= QUEUE_FLAG_DEFAULT;
>  	q->queue_lock		= lock;
>  
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index f5ed5a1..fe548c9 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -171,6 +171,12 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
>  }
>  EXPORT_SYMBOL(blk_queue_make_request);
>  
> +void blk_queue_yield(struct request_queue *q, yield_fn *yield)
> +{
> +	q->yield_fn = yield;
> +}
> +EXPORT_SYMBOL_GPL(blk_queue_yield);
> +
>  /**
>   * blk_queue_bounce_limit - set bounce buffer limit for queue
>   * @q: the request queue for the device
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 46a7fe5..9aab701 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -148,6 +148,7 @@ struct cfq_queue {
>  	struct cfq_queue *new_cfqq;
>  	struct cfq_group *cfqg;
>  	struct cfq_group *orig_cfqg;
> +	struct cfq_io_context *yield_to;
>  	/* Sectors dispatched in current dispatch round */
>  	unsigned long nr_sectors;
>  };
> @@ -320,6 +321,7 @@ enum cfqq_state_flags {
>  	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
>  	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
>  	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
> +	CFQ_CFQQ_FLAG_yield,		/* Allow another cfqq to run */
>  };
>  
>  #define CFQ_CFQQ_FNS(name)						\
> @@ -349,6 +351,7 @@ CFQ_CFQQ_FNS(coop);
>  CFQ_CFQQ_FNS(split_coop);
>  CFQ_CFQQ_FNS(deep);
>  CFQ_CFQQ_FNS(wait_busy);
> +CFQ_CFQQ_FNS(yield);
>  #undef CFQ_CFQQ_FNS
>  
>  #ifdef CONFIG_DEBUG_CFQ_IOSCHED
> @@ -1566,6 +1569,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>  
>  	cfq_clear_cfqq_wait_request(cfqq);
>  	cfq_clear_cfqq_wait_busy(cfqq);
> +	cfq_clear_cfqq_yield(cfqq);
>  
>  	/*
>  	 * If this cfqq is shared between multiple processes, check to
> @@ -2068,7 +2072,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  		slice = max(slice, 2 * cfqd->cfq_slice_idle);
>  
>  	slice = max_t(unsigned, slice, CFQ_MIN_TT);
> -	cfq_log(cfqd, "workload slice:%d", slice);
> +	cfq_log(cfqd, "workload:%d slice:%d", cfqd->serving_type, slice);
>  	cfqd->workload_expires = jiffies + slice;
>  	cfqd->noidle_tree_requires_idle = false;
>  }
> @@ -2138,7 +2142,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>  		 * ok to wait for this request to complete.
>  		 */
>  		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
> -		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
> +		    && cfqq->dispatched && !cfq_cfqq_yield(cfqq) &&
> +		    cfq_should_idle(cfqd, cfqq)) {
>  			cfqq = NULL;
>  			goto keep_queue;
>  		} else
> @@ -2165,6 +2170,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>  		goto expire;
>  	}
>  
> +	if (cfq_cfqq_yield(cfqq))
> +		goto expire;
> +
>  	/*
>  	 * No requests pending. If the active queue still has requests in
>  	 * flight or is idling for a new request, allow either of these
> @@ -2191,6 +2199,98 @@ keep_queue:
>  	return cfqq;
>  }
>  
> +static void cfq_yield(struct request_queue *q, struct task_struct *tsk)
> +{
> +	struct cfq_data *cfqd = q->elevator->elevator_data;
> +	struct cfq_io_context *cic, *new_cic;
> +	struct cfq_queue *cfqq, *sync_cfqq, *async_cfqq, *new_cfqq;
> +
> +	cic = cfq_cic_lookup(cfqd, current->io_context);
> +	if (!cic)
> +		return;
> +
> +	task_lock(tsk);
> +	new_cic = cfq_cic_lookup(cfqd, tsk->io_context);
> +	atomic_long_inc(&tsk->io_context->refcount);
> +	task_unlock(tsk);
> +	if (!new_cic)
> +		goto out_dec;
> +
> +	spin_lock_irq(q->queue_lock);
> +
> +	/*
> +	 * This is primarily called to ensure that the long synchronous
> +	 * time slice does not prevent other I/O happenning (like journal
> +	 * commits) while we idle waiting for it.  Thus, check to see if the
> +	 * current cfqq is the sync cfqq for this process.
> +	 */
> +	cfqq = cic_to_cfqq(cic, 1);
> +	if (!cfqq)
> +		goto out_unlock;
> +
> +	if (cfqd->active_queue != cfqq)
> +		goto out_unlock;

Why does yielding queue has to be active queue? Could it be possible that
a queue submitted a bunch of IO and did yield. It is possible that
yielding queue is not active queue. Even in that case we will like to mark
cfqq yielding and expire queue after dispatch of pending requests?

> +
> +	sync_cfqq = cic_to_cfqq(new_cic, 1);
> +	async_cfqq = cic_to_cfqq(new_cic, 0);
> +	if (!sync_cfqq && !async_cfqq)
> +		goto out_unlock;
> +	if (!RB_EMPTY_ROOT(&sync_cfqq->sort_list))
> +		new_cfqq = sync_cfqq;
> +	else
> +		new_cfqq = async_cfqq;
> +

Why is it mandatory that target context has already the queue setup. If
there is no queue setup, can't we just yield the slice and let somebody
else run?

Oh, I guess you want to keep the slice and idle till target queue
dispatches some requests and sets up context and a queue? But even that
will not help as you have anyway missed the yield event?

> +	/*
> +	 * If we are currently servicing the SYNC_NOIDLE_WORKLOAD, and we
> +	 * are idling on the last queue in that workload, *and* the average
> +	 * think time is larger thank the remaining slice time, go ahead
> +	 * and yield the queue.  Otherwise, don't yield so that fsync-heavy
> +	 * workloads don't starve out the sync-noidle workload.
> +	 */
> +	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
> +	    (!sample_valid(cfqq->service_tree->ttime_samples) ||
> +	     cfqq->slice_end - jiffies > cfqq->service_tree->ttime_mean)) {
> +		/*

This is confusing. The yielding queue's stats are already part of service
tree's think time. So if yielding queue has done bunch of IO, then service
tree's mean think time will be low and we will not yield? I guess that's
the reason you are trying to check average number of queues in following
code.  

> +		 * If there's only a single sync-noidle queue on average,
> +		 * there's no point in idling in order to not starve the
> +		 * non-existent other queues.
> +		 */
> +		if (cfq_group_busy_queues_wl(SYNC_NOIDLE_WORKLOAD, cfqd,
> +					     cfqq->cfqg) > 1)

Does cfq_group_busy_queues_wl() give average number of queues. Were you
looking for cfqg->busy_queues_avg?

> +			goto out_unlock;
> +	}
> +
> +	cfq_log_cfqq(cfqd, cfqq, "yielding queue");
> +
> +	/*
> +	 * If there are other requests pending, just mark the queue as
> +	 * yielding and give up our slice after the last request is
> +	 * dispatched.  Or, if there are no requests currently pending
> +	 * on the selected queue, keep our time slice until such a time
> +	 * as the new queue has something to do.  It will then preempt
> +	 * this queue (see cfq_should_preempt).
> +	 */
> +	if (!RB_EMPTY_ROOT(&cfqq->sort_list) ||
> +	    RB_EMPTY_ROOT(&new_cfqq->sort_list)) {
> +		cfqq->yield_to = new_cic;

Where are you clearing yield_to flag? Can't seem to find it.


> +		cfq_mark_cfqq_yield(cfqq);
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * At this point, we know that the target queue has requests pending.
> +	 * Yield the io scheduler.
> +	 */
> +	__cfq_slice_expired(cfqd, cfqq, 1);
> +	__cfq_set_active_queue(cfqd, new_cfqq);

What happens to cfq_group considerations? So we can yield slices across
groups. That does not seem right. If existing group has requests on other
service trees, these should be processed first.

Can we handle all this logic in select_queue(). I think it might turn out
to be simpler. I mean in this function if we just mark the existing queue
as yielding and also setup who to yield the queue to and let
select_queue() take the decision whether to idle or choose new queue etc.
I guess it should be possible and code might turn out to be simpler to
read.

Vivek

> +	__blk_run_queue(cfqd->queue);
> +
> +out_unlock:
> +	spin_unlock_irq(q->queue_lock);
> +out_dec:
> +	put_io_context(tsk->io_context);
> +}
> +
>  static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
>  {
>  	int dispatched = 0;
> @@ -3094,6 +3194,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
>  	if (!cfqq)
>  		return false;
>  
> +	/*
> +	 * If the active queue yielded its timeslice to this queue, let
> +	 * it preempt.
> +	 */
> +	if (cfq_cfqq_yield(cfqq) && RQ_CIC(rq) == cfqq->yield_to)
> +		return true;
> +
>  	if (cfq_class_idle(new_cfqq))
>  		return false;
>  
> @@ -3910,6 +4017,7 @@ static struct elevator_type iosched_cfq = {
>  		.elevator_deactivate_req_fn =	cfq_deactivate_request,
>  		.elevator_queue_empty_fn =	cfq_queue_empty,
>  		.elevator_completed_req_fn =	cfq_completed_request,
> +		.elevator_yield_fn =		cfq_yield,
>  		.elevator_former_req_fn =	elv_rb_former_request,
>  		.elevator_latter_req_fn =	elv_rb_latter_request,
>  		.elevator_set_req_fn =		cfq_set_request,
> diff --git a/block/elevator.c b/block/elevator.c
> index 76e3702..3af7796 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -855,6 +855,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
>  	}
>  }
>  
> +void elv_yield(struct request_queue *q, struct task_struct *tsk)
> +{
> +	struct elevator_queue *e = q->elevator;
> +
> +	if (e && e->ops->elevator_yield_fn)
> +		e->ops->elevator_yield_fn(q, tsk);
> +}
> +
>  #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
>  
>  static ssize_t
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 6690e8b..1099d29 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -259,6 +259,7 @@ struct request_pm_state
>  
>  typedef void (request_fn_proc) (struct request_queue *q);
>  typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
> +typedef void (yield_fn) (struct request_queue *q, struct task_struct *tsk);
>  typedef int (prep_rq_fn) (struct request_queue *, struct request *);
>  typedef void (unplug_fn) (struct request_queue *);
>  
> @@ -341,6 +342,7 @@ struct request_queue
>  
>  	request_fn_proc		*request_fn;
>  	make_request_fn		*make_request_fn;
> +	yield_fn		*yield_fn;
>  	prep_rq_fn		*prep_rq_fn;
>  	unplug_fn		*unplug_fn;
>  	merge_bvec_fn		*merge_bvec_fn;
> @@ -833,6 +835,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *,
>  extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
>  				  struct request *, int, rq_end_io_fn *);
>  extern void blk_unplug(struct request_queue *q);
> +extern void blk_yield(struct request_queue *q, struct task_struct *tsk);
>  
>  static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
>  {
> @@ -920,6 +923,7 @@ extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
>  extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
>  extern void blk_cleanup_queue(struct request_queue *);
>  extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
> +extern void blk_queue_yield(struct request_queue *, yield_fn *);
>  extern void blk_queue_bounce_limit(struct request_queue *, u64);
>  extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
>  extern void blk_queue_max_segments(struct request_queue *, unsigned short);
> diff --git a/include/linux/elevator.h b/include/linux/elevator.h
> index 1cb3372..a5aaee9 100644
> --- a/include/linux/elevator.h
> +++ b/include/linux/elevator.h
> @@ -20,6 +20,7 @@ typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
>  typedef int (elevator_queue_empty_fn) (struct request_queue *);
>  typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
>  typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
> +typedef void (elevator_yield_fn) (struct request_queue *, struct task_struct *tsk);
>  typedef int (elevator_may_queue_fn) (struct request_queue *, int);
>  
>  typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
> @@ -44,6 +45,7 @@ struct elevator_ops
>  
>  	elevator_queue_empty_fn *elevator_queue_empty_fn;
>  	elevator_completed_req_fn *elevator_completed_req_fn;
> +	elevator_yield_fn *elevator_yield_fn;
>  
>  	elevator_request_list_fn *elevator_former_req_fn;
>  	elevator_request_list_fn *elevator_latter_req_fn;
> @@ -105,6 +107,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
>  extern void elv_merged_request(struct request_queue *, struct request *, int);
>  extern void elv_requeue_request(struct request_queue *, struct request *);
>  extern int elv_queue_empty(struct request_queue *);
> +extern void elv_yield(struct request_queue *, struct task_struct *);
>  extern struct request *elv_former_request(struct request_queue *, struct request *);
>  extern struct request *elv_latter_request(struct request_queue *, struct request *);
>  extern int elv_register_queue(struct request_queue *q);
> -- 
> 1.6.5.2

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ
  2010-05-18 18:20 [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ Jeff Moyer
                   ` (3 preceding siblings ...)
  2010-05-18 18:20 ` [PATCH 4/4] jbd2: yield the device queue when waiting for journal commits Jeff Moyer
@ 2010-05-19  2:05 ` KOSAKI Motohiro
  2010-05-26 15:33   ` Jeff Moyer
  4 siblings, 1 reply; 16+ messages in thread
From: KOSAKI Motohiro @ 2010-05-19  2:05 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: kosaki.motohiro, linux-kernel, linux-ext4, jens.axboe, vgoyal

Hi Jeff,

> This patch series addresses a performance problem experienced when running
> io_zone with small file sizes (from 4KB up to 8MB) and including fsync in
> the timings.  A good example of this would be the following command line:
>   iozone -s 64 -e -f /mnt/test/iozone.0 -i 0
> As the file sizes get larger, the performance improves.  By the time the
> file size is 16MB, there is no difference in performance between runs
> using CFQ and runs using deadline.  The storage in my testing was a NetApp
> array connected via a single fibre channel link.  When testing against a
> single SATA disk, the performance difference is not apparent.

offtopic:

Can this patch help to reduce a pain of following much small files issue?

http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=578635

Now, userland folks think sync() is faster than fsync() on ext4. I don't 
hope spread this unrecommended habit widely.




^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ
  2010-05-19  2:05 ` [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ KOSAKI Motohiro
@ 2010-05-26 15:33   ` Jeff Moyer
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-05-26 15:33 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: linux-kernel, linux-ext4, jens.axboe, vgoyal

KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:

> Hi Jeff,
>
>> This patch series addresses a performance problem experienced when running
>> io_zone with small file sizes (from 4KB up to 8MB) and including fsync in
>> the timings.  A good example of this would be the following command line:
>>   iozone -s 64 -e -f /mnt/test/iozone.0 -i 0
>> As the file sizes get larger, the performance improves.  By the time the
>> file size is 16MB, there is no difference in performance between runs
>> using CFQ and runs using deadline.  The storage in my testing was a NetApp
>> array connected via a single fibre channel link.  When testing against a
>> single SATA disk, the performance difference is not apparent.
>
> offtopic:
>
> Can this patch help to reduce a pain of following much small files issue?
>
> http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=578635

Perhaps.  I don't have a debian system handy to test that, though.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload.
  2010-05-18 21:02   ` Vivek Goyal
@ 2010-06-01 19:31     ` Jeff Moyer
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-06-01 19:31 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: linux-kernel, linux-ext4, axboe

Vivek Goyal <vgoyal@redhat.com> writes:

> On Tue, May 18, 2010 at 02:20:17PM -0400, Jeff Moyer wrote:
>> This patch uses an average think time for the entirety of the sync-noidle
>> workload to determine whether or not to idle on said workload.  This brings
>> it more in line with the policy for the sync queues in the sync workload.
>> 
>> Testing shows that this provided an overall increase in throughput for
>> a mixed workload on my hardware RAID array.
>> 
>> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
>> ---
>>  block/cfq-iosched.c |   44 +++++++++++++++++++++++++++++++++++++++-----
>>  1 files changed, 39 insertions(+), 5 deletions(-)
>> 
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 838834b..46a7fe5 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -83,9 +83,14 @@ struct cfq_rb_root {
>>  	unsigned total_weight;
>>  	u64 min_vdisktime;
>>  	struct rb_node *active;
>> +	unsigned long last_end_request;
>> +	unsigned long ttime_total;
>> +	unsigned long ttime_samples;
>> +	unsigned long ttime_mean;
>>  };
>>  #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
>> -			.count = 0, .min_vdisktime = 0, }
>> +			.count = 0, .min_vdisktime = 0, .last_end_request = 0, \
>> +			.ttime_total = 0, .ttime_samples = 0, .ttime_mean = 0 }
>>  
>>  /*
>>   * Per process-grouping structure
>> @@ -962,8 +967,10 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>  		goto done;
>>  
>>  	cfqg->weight = blkcg->weight;
>> -	for_each_cfqg_st(cfqg, i, j, st)
>> +	for_each_cfqg_st(cfqg, i, j, st) {
>>  		*st = CFQ_RB_ROOT;
>> +		st->last_end_request = jiffies;
>> +	}
>>  	RB_CLEAR_NODE(&cfqg->rb_node);
>>  
>>  	/*
>> @@ -1795,9 +1802,12 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>  
>>  	/*
>>  	 * Otherwise, we do only if they are the last ones
>> -	 * in their service tree.
>> +	 * in their service tree and the average think time is
>> +	 * less than the slice length.
>>  	 */
>> -	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
>> +	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
>> +	    (!sample_valid(service_tree->ttime_samples || 

> Jeff, 
> 
> Are we closing sample_valid() bracket at right place here?

I think you know the answer to that.  ;-)  Thanks for catching this.

> I am wondering where it is helping you.

The idea behind the patch is to optimize the idling such that we don't
wait needlessly for more sync-noidle I/O when it likely isn't coming.
As mentioned in the patch description, it aims to bring the sync-noidle
workload, which is treated like a single cfqq, more in-line with the
handling of a single cfqq.

> If it is to bring in line with with sync tree (old implementation),
> then we should have also compared the think time with slice_idle?

I'm not sure I follow 100%.  Are you saying we should disable idling for
the sync-noidle workload if the think time is too long?  That sounds
reasonable.

> But comparing that here might not be the best thing as cfq_should_idle()
> is used in many contexts.

Again, looking for clarification on this point.

>> +	     cfqq->slice_end - jiffies < service_tree->ttime_mean)))
>>  		return 1;
>
> This comparision will also might break some logic in select_queue() where
> we wait for a queue/group to get busy even if queue's time slice has
> expired.
>
> ********************************************************************
> if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
>                 /*
>                  * If slice had not expired at the completion of last
>                  * request
>                  * we might not have turned on wait_busy flag. Don't
>                  * expire
>                  * the queue yet. Allow the group to get backlogged.
>                  *
>                  * The very fact that we have used the slice, that means
>                  * we
>                  * have been idling all along on this queue and it should
>                  * be
>                  * ok to wait for this request to complete.
>                  */
>                 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
>                     && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
>                         cfqq = NULL;
>                         goto keep_queue;
> 		}
>
> *************************************************************************
>
> With this change, now above condition will never be true as
> cfq_should_idle() will always return false as slice has already expired.
> And that will affect group loosing its fair share.
>
> So I guess we can define new functions to check more conditions instead of
> putting it in cfq_should_idle()

Right, thanks for pointing this out.  Do we have a test case that
exposes this issue?  We really need to start a regression test suite for
CFQ.  Also, I had promised to run some numbers for you with cgroups
enabled and I didn't.  I'll get that data before the next posting.

Thanks for the review, Vivek!

-Jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler.
  2010-05-18 21:44   ` Vivek Goyal
@ 2010-06-01 20:01     ` Jeff Moyer
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Moyer @ 2010-06-01 20:01 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: linux-kernel, linux-ext4, axboe

Vivek Goyal <vgoyal@redhat.com> writes:

> On Tue, May 18, 2010 at 02:20:18PM -0400, Jeff Moyer wrote:
>> This patch implements a blk_yield to allow a process to voluntarily
>> give up its I/O scheduler time slice.  This is desirable for those processes
>> which know that they will be blocked on I/O from another process, such as
>> the file system journal thread.  Following patches will put calls to blk_yield
>> into jbd and jbd2.
>> 
>> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
>> ---
>>  block/blk-core.c         |   13 +++++
>>  block/blk-settings.c     |    6 +++
>>  block/cfq-iosched.c      |  112 +++++++++++++++++++++++++++++++++++++++++++++-
>>  block/elevator.c         |    8 +++
>>  include/linux/blkdev.h   |    4 ++
>>  include/linux/elevator.h |    3 +
>>  6 files changed, 144 insertions(+), 2 deletions(-)
>> 

[...]

> @@ -2138,7 +2142,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>  		 * ok to wait for this request to complete.
>  		 */
>  		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
> -		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
> +		    && cfqq->dispatched && !cfq_cfqq_yield(cfqq) &&
> +		    cfq_should_idle(cfqd, cfqq)) {
>  			cfqq = NULL;

> I think if we just place cfq_cfqq_yield(cfqq), check above it, we don't
> need above code modification?

Yep.

> This might result in some group losing fairness in certain scenarios, but
> I guess we will tackle it once we face it. For the time being if fsync
> thread is giving up cpu, journald commits will come in root group and
> there might not be a point in wasting time idling on this group.

ok.

[...]

>> @@ -2191,6 +2199,98 @@ keep_queue:
>>  	return cfqq;
>>  }
>>  
>> +static void cfq_yield(struct request_queue *q, struct task_struct *tsk)
>> +{
>> +	struct cfq_data *cfqd = q->elevator->elevator_data;
>> +	struct cfq_io_context *cic, *new_cic;
>> +	struct cfq_queue *cfqq, *sync_cfqq, *async_cfqq, *new_cfqq;
>> +
>> +	cic = cfq_cic_lookup(cfqd, current->io_context);
>> +	if (!cic)
>> +		return;
>> +
>> +	task_lock(tsk);
>> +	new_cic = cfq_cic_lookup(cfqd, tsk->io_context);
>> +	atomic_long_inc(&tsk->io_context->refcount);
>> +	task_unlock(tsk);
>> +	if (!new_cic)
>> +		goto out_dec;
>> +
>> +	spin_lock_irq(q->queue_lock);
>> +
>> +	/*
>> +	 * This is primarily called to ensure that the long synchronous
>> +	 * time slice does not prevent other I/O happenning (like journal
>> +	 * commits) while we idle waiting for it.  Thus, check to see if the
>> +	 * current cfqq is the sync cfqq for this process.
>> +	 */
>> +	cfqq = cic_to_cfqq(cic, 1);
>> +	if (!cfqq)
>> +		goto out_unlock;
>> +
>> +	if (cfqd->active_queue != cfqq)
>> +		goto out_unlock;
>
> Why does yielding queue has to be active queue? Could it be possible that
> a queue submitted a bunch of IO and did yield. It is possible that
> yielding queue is not active queue. Even in that case we will like to mark
> cfqq yielding and expire queue after dispatch of pending requests?

You're absolutely right, this should not be a limitation in the code.

>> +
>> +	sync_cfqq = cic_to_cfqq(new_cic, 1);
>> +	async_cfqq = cic_to_cfqq(new_cic, 0);
>> +	if (!sync_cfqq && !async_cfqq)
>> +		goto out_unlock;
>> +	if (!RB_EMPTY_ROOT(&sync_cfqq->sort_list))
>> +		new_cfqq = sync_cfqq;
>> +	else
>> +		new_cfqq = async_cfqq;
>> +
>
> Why is it mandatory that target context has already the queue setup. If
> there is no queue setup, can't we just yield the slice and let somebody
> else run?

The only callers of the yield method are yielding to a kernel thread
that is always going to exist.  I didn't see a need to add any
complexity here for a case that doesn't yet exist.

> Oh, I guess you want to keep the slice and idle till target queue
> dispatches some requests and sets up context and a queue? But even that
> will not help as you have anyway missed the yield event?

No, see above.

>> +	/*
>> +	 * If we are currently servicing the SYNC_NOIDLE_WORKLOAD, and we
>> +	 * are idling on the last queue in that workload, *and* the average
>> +	 * think time is larger thank the remaining slice time, go ahead
>> +	 * and yield the queue.  Otherwise, don't yield so that fsync-heavy
>> +	 * workloads don't starve out the sync-noidle workload.
>> +	 */
>> +	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
>> +	    (!sample_valid(cfqq->service_tree->ttime_samples) ||
>> +	     cfqq->slice_end - jiffies > cfqq->service_tree->ttime_mean)) {
>> +		/*
>
> This is confusing. The yielding queue's stats are already part of service
> tree's think time. So if yielding queue has done bunch of IO, then service
> tree's mean think time will be low and we will not yield? I guess that's
> the reason you are trying to check average number of queues in following
> code.  

Sorry for confusing you.  Your deduction is correct.  Is there a way I
can write this that would be less confusing to you?

>> +		 * If there's only a single sync-noidle queue on average,
>> +		 * there's no point in idling in order to not starve the
>> +		 * non-existent other queues.
>> +		 */
>> +		if (cfq_group_busy_queues_wl(SYNC_NOIDLE_WORKLOAD, cfqd,
>> +					     cfqq->cfqg) > 1)
>
> Does cfq_group_busy_queues_wl() give average number of queues. Were you
> looking for cfqg->busy_queues_avg?

Yes, thanks again.

>
>> +			goto out_unlock;
>> +	}
>> +
>> +	cfq_log_cfqq(cfqd, cfqq, "yielding queue");
>> +
>> +	/*
>> +	 * If there are other requests pending, just mark the queue as
>> +	 * yielding and give up our slice after the last request is
>> +	 * dispatched.  Or, if there are no requests currently pending
>> +	 * on the selected queue, keep our time slice until such a time
>> +	 * as the new queue has something to do.  It will then preempt
>> +	 * this queue (see cfq_should_preempt).
>> +	 */
>> +	if (!RB_EMPTY_ROOT(&cfqq->sort_list) ||
>> +	    RB_EMPTY_ROOT(&new_cfqq->sort_list)) {
>> +		cfqq->yield_to = new_cic;
>
> Where are you clearing yield_to flag? Can't seem to find it.

yield_to is not a flag.  ;-)  It is not cleared as it is only valid if
the cfq_cfqq_yield flag is set.  If you would find it more sensible, I
can certainly add code to clear it.

>> +		cfq_mark_cfqq_yield(cfqq);
>> +		goto out_unlock;
>> +	}
>> +
>> +	/*
>> +	 * At this point, we know that the target queue has requests pending.
>> +	 * Yield the io scheduler.
>> +	 */
>> +	__cfq_slice_expired(cfqd, cfqq, 1);
>> +	__cfq_set_active_queue(cfqd, new_cfqq);
>
> What happens to cfq_group considerations? So we can yield slices across
> groups. That does not seem right. If existing group has requests on other
> service trees, these should be processed first.
>
> Can we handle all this logic in select_queue(). I think it might turn out
> to be simpler. I mean in this function if we just mark the existing queue
> as yielding and also setup who to yield the queue to and let
> select_queue() take the decision whether to idle or choose new queue etc.
> I guess it should be possible and code might turn out to be simpler to
> read.

I'll give that a shot, thanks for the suggestion.  My concern is that,
when employing cgroups, you may never actually yield the queue depending
on where the file system's journal thread ends up.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2010-06-01 20:01 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-05-18 18:20 [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ Jeff Moyer
2010-05-18 18:20 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
2010-05-18 20:52   ` Vivek Goyal
2010-05-18 21:02   ` Vivek Goyal
2010-06-01 19:31     ` Jeff Moyer
2010-05-18 18:20 ` [PATCH 2/4] block: Implement a blk_yield function to voluntarily give up the I/O scheduler Jeff Moyer
2010-05-18 21:07   ` Vivek Goyal
2010-05-18 21:44   ` Vivek Goyal
2010-06-01 20:01     ` Jeff Moyer
2010-05-18 18:20 ` [PATCH 3/4] jbd: yield the device queue when waiting for commits Jeff Moyer
2010-05-18 18:20 ` [PATCH 4/4] jbd2: yield the device queue when waiting for journal commits Jeff Moyer
2010-05-19  2:05 ` [PATCH 0/4 v4] ext3/4: enhance fsync performance when using CFQ KOSAKI Motohiro
2010-05-26 15:33   ` Jeff Moyer
  -- strict thread matches above, loose matches on Subject: below --
2010-04-14 21:17 [PATCH 0/4 v3] " Jeff Moyer
2010-04-14 21:17 ` [PATCH 1/4] cfq-iosched: Keep track of average think time for the sync-noidle workload Jeff Moyer
2010-04-14 21:37   ` Vivek Goyal
2010-04-14 23:06     ` Jeff Moyer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).