Cache queue_congestion_on/off

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* Cache queue_congestion_on/off_threshold
@ 2004-05-05 22:12 Chen, Kenneth W
  2004-05-06  6:20 ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Chen, Kenneth W @ 2004-05-05 22:12 UTC (permalink / raw)
  To: linux-kernel

It's kind of redundant that queue_congestion_on/off_threshold gets
calculated on every I/O and they produce the same number over and
over again unless q->nr_requests gets changed (which is probably a
very rare event).  Can we cache those values in the request_queue
structure?

- Ken


diff -Nurp linux-2.6.6-rc3/drivers/block/ll_rw_blk.c linux-2.6.6-rc3.blk/drivers/block/ll_rw_blk.c
--- linux-2.6.6-rc3/drivers/block/ll_rw_blk.c	2004-05-05 14:32:31.000000000 -0700
+++ linux-2.6.6-rc3.blk/drivers/block/ll_rw_blk.c	2004-05-05 15:04:59.000000000 -0700
@@ -70,14 +70,7 @@ EXPORT_SYMBOL(blk_max_pfn);
  */
 static inline int queue_congestion_on_threshold(struct request_queue *q)
 {
-	int ret;
-
-	ret = q->nr_requests - (q->nr_requests / 8) + 1;
-
-	if (ret > q->nr_requests)
-		ret = q->nr_requests;
-
-	return ret;
+	return q->nr_congestion_on;
 }

 /*
@@ -85,14 +78,22 @@ static inline int queue_congestion_on_th
  */
 static inline int queue_congestion_off_threshold(struct request_queue *q)
 {
-	int ret;
+	return q->nr_congestion_off;
+}

-	ret = q->nr_requests - (q->nr_requests / 8) - 1;
+static inline void blk_queue_congestion_threshold(struct request_queue *q)
+{
+	int nr;

-	if (ret < 1)
-		ret = 1;
+	nr = q->nr_requests - (q->nr_requests / 8) + 1;
+	if (nr > q->nr_requests)
+		nr = q->nr_requests;
+	q->nr_congestion_on = nr;

-	return ret;
+	nr = q->nr_requests - (q->nr_requests / 8) - 1;
+	if (nr < 1)
+		nr = 1;
+	q->nr_congestion_off = nr;
 }

 void clear_backing_dev_congested(struct backing_dev_info *bdi, int rw)
@@ -235,6 +236,7 @@ void blk_queue_make_request(request_queu
 	blk_queue_max_sectors(q, MAX_SECTORS);
 	blk_queue_hardsect_size(q, 512);
 	blk_queue_dma_alignment(q, 511);
+	blk_queue_congestion_threshold(q);

 	q->unplug_thresh = 4;		/* hmm */
 	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
@@ -2953,6 +2955,7 @@ queue_requests_store(struct request_queu
 	int ret = queue_var_store(&q->nr_requests, page, count);
 	if (q->nr_requests < BLKDEV_MIN_RQ)
 		q->nr_requests = BLKDEV_MIN_RQ;
+	blk_queue_congestion_threshold(q);

 	if (rl->count[READ] >= queue_congestion_on_threshold(q))
 		set_queue_congested(q, READ);
diff -Nurp linux-2.6.6-rc3/include/linux/blkdev.h linux-2.6.6-rc3.blk/include/linux/blkdev.h
--- linux-2.6.6-rc3/include/linux/blkdev.h	2004-04-27 18:35:21.000000000 -0700
+++ linux-2.6.6-rc3.blk/include/linux/blkdev.h	2004-05-05 15:04:59.000000000 -0700
@@ -334,6 +334,8 @@ struct request_queue
 	 * queue settings
 	 */
 	unsigned long		nr_requests;	/* Max # of requests */
+	unsigned int		nr_congestion_on;
+	unsigned int		nr_congestion_off;

 	unsigned short		max_sectors;
 	unsigned short		max_phys_segments;



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-05 22:12 Cache queue_congestion_on/off_threshold Chen, Kenneth W
@ 2004-05-06  6:20 ` Jens Axboe
  2004-05-06  6:34   ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2004-05-06  6:20 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: linux-kernel

On Wed, May 05 2004, Chen, Kenneth W wrote:
> It's kind of redundant that queue_congestion_on/off_threshold gets
> calculated on every I/O and they produce the same number over and
> over again unless q->nr_requests gets changed (which is probably a
> very rare event).  Can we cache those values in the request_queue
> structure?
> 
> - Ken
> 
> 
> diff -Nurp linux-2.6.6-rc3/drivers/block/ll_rw_blk.c linux-2.6.6-rc3.blk/drivers/block/ll_rw_blk.c
> --- linux-2.6.6-rc3/drivers/block/ll_rw_blk.c	2004-05-05 14:32:31.000000000 -0700
> +++ linux-2.6.6-rc3.blk/drivers/block/ll_rw_blk.c	2004-05-05 15:04:59.000000000 -0700
> @@ -70,14 +70,7 @@ EXPORT_SYMBOL(blk_max_pfn);
>   */
>  static inline int queue_congestion_on_threshold(struct request_queue *q)
>  {
> -	int ret;
> -
> -	ret = q->nr_requests - (q->nr_requests / 8) + 1;
> -
> -	if (ret > q->nr_requests)
> -		ret = q->nr_requests;
> -
> -	return ret;
> +	return q->nr_congestion_on;
>  }
> 
>  /*
> @@ -85,14 +78,22 @@ static inline int queue_congestion_on_th
>   */
>  static inline int queue_congestion_off_threshold(struct request_queue *q)
>  {
> -	int ret;
> +	return q->nr_congestion_off;
> +}
> 
> -	ret = q->nr_requests - (q->nr_requests / 8) - 1;
> +static inline void blk_queue_congestion_threshold(struct request_queue *q)
> +{
> +	int nr;
> 
> -	if (ret < 1)
> -		ret = 1;
> +	nr = q->nr_requests - (q->nr_requests / 8) + 1;
> +	if (nr > q->nr_requests)
> +		nr = q->nr_requests;
> +	q->nr_congestion_on = nr;
> 
> -	return ret;
> +	nr = q->nr_requests - (q->nr_requests / 8) - 1;
> +	if (nr < 1)
> +		nr = 1;
> +	q->nr_congestion_off = nr;
>  }
> 
>  void clear_backing_dev_congested(struct backing_dev_info *bdi, int rw)
> @@ -235,6 +236,7 @@ void blk_queue_make_request(request_queu
>  	blk_queue_max_sectors(q, MAX_SECTORS);
>  	blk_queue_hardsect_size(q, 512);
>  	blk_queue_dma_alignment(q, 511);
> +	blk_queue_congestion_threshold(q);
> 
>  	q->unplug_thresh = 4;		/* hmm */
>  	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
> @@ -2953,6 +2955,7 @@ queue_requests_store(struct request_queu
>  	int ret = queue_var_store(&q->nr_requests, page, count);
>  	if (q->nr_requests < BLKDEV_MIN_RQ)
>  		q->nr_requests = BLKDEV_MIN_RQ;
> +	blk_queue_congestion_threshold(q);
> 
>  	if (rl->count[READ] >= queue_congestion_on_threshold(q))
>  		set_queue_congested(q, READ);
> diff -Nurp linux-2.6.6-rc3/include/linux/blkdev.h linux-2.6.6-rc3.blk/include/linux/blkdev.h
> --- linux-2.6.6-rc3/include/linux/blkdev.h	2004-04-27 18:35:21.000000000 -0700
> +++ linux-2.6.6-rc3.blk/include/linux/blkdev.h	2004-05-05 15:04:59.000000000 -0700
> @@ -334,6 +334,8 @@ struct request_queue
>  	 * queue settings
>  	 */
>  	unsigned long		nr_requests;	/* Max # of requests */
> +	unsigned int		nr_congestion_on;
> +	unsigned int		nr_congestion_off;
> 
>  	unsigned short		max_sectors;
>  	unsigned short		max_phys_segments;

Do you have any numbers at all for this? I'd say these calculations are
severly into the noise area when submitting io.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-06  6:20 ` Jens Axboe
@ 2004-05-06  6:34   ` Andrew Morton
  2004-05-06  6:43     ` Jens Axboe
  2004-05-06 20:29     ` Chen, Kenneth W
  0 siblings, 2 replies; 18+ messages in thread
From: Andrew Morton @ 2004-05-06  6:34 UTC (permalink / raw)
  To: Jens Axboe; +Cc: kenneth.w.chen, linux-kernel

Jens Axboe <axboe@suse.de> wrote:
>
> Do you have any numbers at all for this? I'd say these calculations are
>  severly into the noise area when submitting io.

The difference will not be measurable, but I think the patch makes sense
regardless of what the numbers say.

I uninlined the new function though.  Two callsites, both slowpath...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-06  6:34   ` Andrew Morton
@ 2004-05-06  6:43     ` Jens Axboe
  2004-05-06 20:30       ` Chen, Kenneth W
  2004-05-06 20:29     ` Chen, Kenneth W
  1 sibling, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2004-05-06  6:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: kenneth.w.chen, linux-kernel

On Wed, May 05 2004, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> > Do you have any numbers at all for this? I'd say these calculations are
> >  severly into the noise area when submitting io.
> 
> The difference will not be measurable, but I think the patch makes sense
> regardless of what the numbers say.

Humm dunno, I'd rather save the sizeof(int) * 2.

> I uninlined the new function though.  Two callsites, both slowpath...

The more reason not to make it static :). I don't feel too strongly
about it though.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Cache queue_congestion_on/off_threshold
  2004-05-06  6:34   ` Andrew Morton
  2004-05-06  6:43     ` Jens Axboe
@ 2004-05-06 20:29     ` Chen, Kenneth W
  2004-05-07  9:39       ` Jens Axboe
  1 sibling, 1 reply; 18+ messages in thread
From: Chen, Kenneth W @ 2004-05-06 20:29 UTC (permalink / raw)
  To: 'Andrew Morton', Jens Axboe; +Cc: linux-kernel

>>>> Andrew Morton wrote on Wednesday, May 05, 2004 11:34 PM
> Jens Axboe <axboe@suse.de> wrote:
> >
> > Do you have any numbers at all for this? I'd say these calculations are
> >  severly into the noise area when submitting io.
>
> The difference will not be measurable, but I think the patch makes sense
> regardless of what the numbers say.

Even though it is in the noise range that can't be easily measured, they are
indeed in the positive territory.  If I stack 5 of these little things, we
actually measured positive gain on a large db workload.

There isn't anything absurd in 2.6 kernel, however, I hate to say that we
consistently see performance regression with latest 2.6 kernel compare to
best 2.4 based kernel under heavy db workload on 4-way SMP platform. (2.6
rocks on numa platform that 2.4 doesn't even have a chance to compete).

Some of the examples are:

(1) it's cheaper to find out whether a queue is empty or not by calling
    elv_queue_empty() instead of using heavier elv_next_request().
(2) it's better to check queue empty before calling into q->request_fn()


diff -Nurp linux-2.6.6-rc3/drivers/block/ll_rw_blk.c linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c
--- linux-2.6.6-rc3/drivers/block/ll_rw_blk.c	2004-05-06 13:03:14.000000000 -0700
+++ linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c	2004-05-06 13:04:04.000000000 -0700
@@ -1128,7 +1128,7 @@ static inline void __generic_unplug_devi
 	/*
 	 * was plugged, fire request_fn if queue has stuff to do
 	 */
-	if (elv_next_request(q))
+	if (!elv_queue_empty(q))
 		q->request_fn(q);
 }

@@ -1237,7 +1237,8 @@ void blk_run_queue(struct request_queue

 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_remove_plug(q);
-	q->request_fn(q);
+	if (!elv_queue_empty(q))
+		q->request_fn(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }



(3) can we allocate request structure up front in __make_request?
    For I/O that cannot be merged, the elevator code executes twice
    in __make_request.


diff -Nurp linux-2.6.6-rc3/drivers/block/ll_rw_blk.c linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c
--- linux-2.6.6-rc3/drivers/block/ll_rw_blk.c	2004-05-06 13:03:14.000000000 -0700
+++ linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c	2004-05-06 13:11:39.000000000 -0700
@@ -2154,15 +2154,14 @@ static int __make_request(request_queue_

 	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);

+	/* Grab a free request from the freelist */
+	freereq = get_request(q, rw, GFP_ATOMIC);
+
 again:
 	spin_lock_irq(q->queue_lock);

-	if (elv_queue_empty(q)) {
+	if (elv_queue_empty(q))
 		blk_plug_device(q);
-		goto get_rq;
-	}
-	if (barrier)
-		goto get_rq;

 	el_ret = elv_merge(q, &req, bio);
 	switch (el_ret) {


Some more, I will post in another thread.

- Ken



^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Cache queue_congestion_on/off_threshold
  2004-05-06  6:43     ` Jens Axboe
@ 2004-05-06 20:30       ` Chen, Kenneth W
  2004-05-07  3:02         ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Chen, Kenneth W @ 2004-05-06 20:30 UTC (permalink / raw)
  To: 'Jens Axboe', Andrew Morton; +Cc: linux-kernel

>>>> Jens Axboe wrote on Wed, May 05, 2004 11:43 PM
> On Wed, May 05 2004, Andrew Morton wrote:
> > Jens Axboe <axboe@suse.de> wrote:
> > >
> > > Do you have any numbers at all for this? I'd say these calculations are
> > >  severly into the noise area when submitting io.
> >
> > The difference will not be measurable, but I think the patch makes sense
> > regardless of what the numbers say.
>
> Humm dunno, I'd rather save the sizeof(int) * 2.

Strictly speaking from memory consumption point of view, it probably comes
for free since sizeof(struct request_queue) currently is 456 bytes on x86
and 816 on 64bit arch.  The structure is being rounded to 512 or 1024 with
kmalloc.  If it is on the borderline to next kmalloc size, it probably should
be revisited.  But so far it has been noise on the positive side.

- Ken

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-06 20:30       ` Chen, Kenneth W
@ 2004-05-07  3:02         ` Andrew Morton
  2004-05-07  9:35           ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2004-05-07  3:02 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: axboe, linux-kernel

"Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
>
> >>>> Jens Axboe wrote on Wed, May 05, 2004 11:43 PM
> > On Wed, May 05 2004, Andrew Morton wrote:
> > > Jens Axboe <axboe@suse.de> wrote:
> > > >
> > > > Do you have any numbers at all for this? I'd say these calculations are
> > > >  severly into the noise area when submitting io.
> > >
> > > The difference will not be measurable, but I think the patch makes sense
> > > regardless of what the numbers say.
> >
> > Humm dunno, I'd rather save the sizeof(int) * 2.
> 
> Strictly speaking from memory consumption point of view, it probably comes
> for free since sizeof(struct request_queue) currently is 456 bytes on x86
> and 816 on 64bit arch.  The structure is being rounded to 512 or 1024 with
> kmalloc.

That's a good argument for creating a standalone slab cache for request
queue structures ;)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-07  3:02         ` Andrew Morton
@ 2004-05-07  9:35           ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2004-05-07  9:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chen, Kenneth W, linux-kernel

On Thu, May 06 2004, Andrew Morton wrote:
> "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
> >
> > >>>> Jens Axboe wrote on Wed, May 05, 2004 11:43 PM
> > > On Wed, May 05 2004, Andrew Morton wrote:
> > > > Jens Axboe <axboe@suse.de> wrote:
> > > > >
> > > > > Do you have any numbers at all for this? I'd say these calculations are
> > > > >  severly into the noise area when submitting io.
> > > >
> > > > The difference will not be measurable, but I think the patch makes sense
> > > > regardless of what the numbers say.
> > >
> > > Humm dunno, I'd rather save the sizeof(int) * 2.
> > 
> > Strictly speaking from memory consumption point of view, it probably comes
> > for free since sizeof(struct request_queue) currently is 456 bytes on x86
> > and 816 on 64bit arch.  The structure is being rounded to 512 or 1024 with
> > kmalloc.
> 
> That's a good argument for creating a standalone slab cache for request
> queue structures ;)

Precisely, it's definitely not a good argument to keep loading it. I'll
do that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-06 20:29     ` Chen, Kenneth W
@ 2004-05-07  9:39       ` Jens Axboe
  2004-05-07 22:00         ` Chen, Kenneth W
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2004-05-07  9:39 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: 'Andrew Morton', linux-kernel

On Thu, May 06 2004, Chen, Kenneth W wrote:
> >>>> Andrew Morton wrote on Wednesday, May 05, 2004 11:34 PM
> > Jens Axboe <axboe@suse.de> wrote:
> > >
> > > Do you have any numbers at all for this? I'd say these calculations are
> > >  severly into the noise area when submitting io.
> >
> > The difference will not be measurable, but I think the patch makes sense
> > regardless of what the numbers say.
> 
> Even though it is in the noise range that can't be easily measured, they are
> indeed in the positive territory.  If I stack 5 of these little things, we
> actually measured positive gain on a large db workload.

I somehow still find that very hard to believe, it's a branch and a
couple of cycles.

> There isn't anything absurd in 2.6 kernel, however, I hate to say that we
> consistently see performance regression with latest 2.6 kernel compare to
> best 2.4 based kernel under heavy db workload on 4-way SMP platform. (2.6
> rocks on numa platform that 2.4 doesn't even have a chance to compete).
> 
> Some of the examples are:
> 
> (1) it's cheaper to find out whether a queue is empty or not by calling
>     elv_queue_empty() instead of using heavier elv_next_request().
> (2) it's better to check queue empty before calling into q->request_fn()
> 
> 
> diff -Nurp linux-2.6.6-rc3/drivers/block/ll_rw_blk.c linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c
> --- linux-2.6.6-rc3/drivers/block/ll_rw_blk.c	2004-05-06 13:03:14.000000000 -0700
> +++ linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c	2004-05-06 13:04:04.000000000 -0700
> @@ -1128,7 +1128,7 @@ static inline void __generic_unplug_devi
>  	/*
>  	 * was plugged, fire request_fn if queue has stuff to do
>  	 */
> -	if (elv_next_request(q))
> +	if (!elv_queue_empty(q))
>  		q->request_fn(q);
>  }
> 
> @@ -1237,7 +1237,8 @@ void blk_run_queue(struct request_queue
> 
>  	spin_lock_irqsave(q->queue_lock, flags);
>  	blk_remove_plug(q);
> -	q->request_fn(q);
> +	if (!elv_queue_empty(q))
> +		q->request_fn(q);
>  	spin_unlock_irqrestore(q->queue_lock, flags);
>  }

This looks great, should be merged right away.

> (3) can we allocate request structure up front in __make_request?
>     For I/O that cannot be merged, the elevator code executes twice
>     in __make_request.
> 
> 
> diff -Nurp linux-2.6.6-rc3/drivers/block/ll_rw_blk.c linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c
> --- linux-2.6.6-rc3/drivers/block/ll_rw_blk.c	2004-05-06 13:03:14.000000000 -0700
> +++ linux-2.6.6-rc3.ken/drivers/block/ll_rw_blk.c	2004-05-06 13:11:39.000000000 -0700
> @@ -2154,15 +2154,14 @@ static int __make_request(request_queue_
> 
>  	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
> 
> +	/* Grab a free request from the freelist */
> +	freereq = get_request(q, rw, GFP_ATOMIC);
> +
>  again:
>  	spin_lock_irq(q->queue_lock);
> 
> -	if (elv_queue_empty(q)) {
> +	if (elv_queue_empty(q))
>  		blk_plug_device(q);
> -		goto get_rq;
> -	}
> -	if (barrier)
> -		goto get_rq;
> 
>  	el_ret = elv_merge(q, &req, bio);
>  	switch (el_ret) {

Actually, with the good working batching we might get away with killing
freereq completely. Have you tested that (if not, could you?)

> Some more, I will post in another thread.

Can you please remember to cc in initial posts as well, I don't want to
always hunt for your findings. Thanks.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Cache queue_congestion_on/off_threshold
  2004-05-07  9:39       ` Jens Axboe
@ 2004-05-07 22:00         ` Chen, Kenneth W
  2004-05-10 14:30           ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Chen, Kenneth W @ 2004-05-07 22:00 UTC (permalink / raw)
  To: 'Jens Axboe'; +Cc: 'Andrew Morton', linux-kernel

>>>> Jens Axboe wrote on Friday, May 07, 2004 2:39 AM
> On Thu, May 06 2004, Chen, Kenneth W wrote:
> > (3) can we allocate request structure up front in __make_request?
> >     For I/O that cannot be merged, the elevator code executes twice
> >     in __make_request.
> >
>
> Actually, with the good working batching we might get away with killing
> freereq completely. Have you tested that (if not, could you?)

Sorry, I'm clueless on "good working batching".  If you could please give
me some pointers, I will definitely test it.

- Ken



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-07 22:00         ` Chen, Kenneth W
@ 2004-05-10 14:30           ` Jens Axboe
  2004-05-10 14:43             ` Nick Piggin
  2004-05-12  5:32             ` Chen, Kenneth W
  0 siblings, 2 replies; 18+ messages in thread
From: Jens Axboe @ 2004-05-10 14:30 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: 'Andrew Morton', linux-kernel

On Fri, May 07 2004, Chen, Kenneth W wrote:
> >>>> Jens Axboe wrote on Friday, May 07, 2004 2:39 AM
> > On Thu, May 06 2004, Chen, Kenneth W wrote:
> > > (3) can we allocate request structure up front in __make_request?
> > >     For I/O that cannot be merged, the elevator code executes twice
> > >     in __make_request.
> > >
> >
> > Actually, with the good working batching we might get away with killing
> > freereq completely. Have you tested that (if not, could you?)
> 
> Sorry, I'm clueless on "good working batching".  If you could please give
> me some pointers, I will definitely test it.

Something like this.

--- linux-2.6.6/drivers/block/ll_rw_blk.c~	2004-05-10 16:23:45.684726955 +0200
+++ linux-2.6.6/drivers/block/ll_rw_blk.c	2004-05-10 16:29:04.333792268 +0200
@@ -2138,8 +2138,8 @@
 
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
-	struct request *req, *freereq = NULL;
 	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+	struct request *req;
 	sector_t sector;
 
 	sector = bio->bi_sector;
@@ -2161,15 +2161,15 @@
 
 	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
 
-again:
+	if (barrier)
+		goto get_rq_nolock;
+
 	spin_lock_irq(q->queue_lock);
 
 	if (elv_queue_empty(q)) {
 		blk_plug_device(q);
 		goto get_rq;
 	}
-	if (barrier)
-		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
 	switch (el_ret) {
@@ -2230,21 +2230,17 @@
 	 * a free slot.
 	 */
 get_rq:
-	if (freereq) {
-		req = freereq;
-		freereq = NULL;
-	} else {
-		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
-			/*
-			 * READA bit set
-			 */
-			if (ra)
-				goto end_io;
+	spin_unlock_irq(q->queue_lock);
+get_rq_nolock:
+	req = get_request(q, rw, GFP_ATOMIC);
+	if (!req) {
+		/*
+		 * READA bit set
+		 */
+		if (ra)
+			goto end_io;
 	
-			freereq = get_request_wait(q, rw);
-		}
-		goto again;
+		req = get_request_wait(q, rw);
 	}
 
 	req->flags |= REQ_CMD;
@@ -2276,10 +2272,9 @@
 	req->rq_disk = bio->bi_bdev->bd_disk;
 	req->start_time = jiffies;
 
+	spin_lock_irq(q->queue_lock);
 	add_request(q, req);
 out:
-	if (freereq)
-		__blk_put_request(q, freereq);
 
 	if (blk_queue_plugged(q)) {
 		int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight;

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-10 14:30           ` Jens Axboe
@ 2004-05-10 14:43             ` Nick Piggin
  2004-05-10 14:44               ` Jens Axboe
  2004-05-12  5:32             ` Chen, Kenneth W
  1 sibling, 1 reply; 18+ messages in thread
From: Nick Piggin @ 2004-05-10 14:43 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Chen, Kenneth W, 'Andrew Morton', linux-kernel

Jens Axboe wrote:
> On Fri, May 07 2004, Chen, Kenneth W wrote:
> 
>>>>>>Jens Axboe wrote on Friday, May 07, 2004 2:39 AM
>>>
>>>On Thu, May 06 2004, Chen, Kenneth W wrote:
>>>
>>>>(3) can we allocate request structure up front in __make_request?
>>>>    For I/O that cannot be merged, the elevator code executes twice
>>>>    in __make_request.
>>>>
>>>
>>>Actually, with the good working batching we might get away with killing
>>>freereq completely. Have you tested that (if not, could you?)
>>
>>Sorry, I'm clueless on "good working batching".  If you could please give
>>me some pointers, I will definitely test it.
> 
> 
> Something like this.
> 

While we're doing that can we drop the GFP_ATOMIC allocation
completely?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-10 14:43             ` Nick Piggin
@ 2004-05-10 14:44               ` Jens Axboe
  2004-05-11  3:22                 ` Nick Piggin
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2004-05-10 14:44 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Chen, Kenneth W, 'Andrew Morton', linux-kernel

On Tue, May 11 2004, Nick Piggin wrote:
> Jens Axboe wrote:
> >On Fri, May 07 2004, Chen, Kenneth W wrote:
> >
> >>>>>>Jens Axboe wrote on Friday, May 07, 2004 2:39 AM
> >>>
> >>>On Thu, May 06 2004, Chen, Kenneth W wrote:
> >>>
> >>>>(3) can we allocate request structure up front in __make_request?
> >>>>   For I/O that cannot be merged, the elevator code executes twice
> >>>>   in __make_request.
> >>>>
> >>>
> >>>Actually, with the good working batching we might get away with killing
> >>>freereq completely. Have you tested that (if not, could you?)
> >>
> >>Sorry, I'm clueless on "good working batching".  If you could please give
> >>me some pointers, I will definitely test it.
> >
> >
> >Something like this.
> >
> 
> While we're doing that can we drop the GFP_ATOMIC allocation
> completely?

Thought the same thing. But lets stick to single item tests first, then
we can kill that double allocation after.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-10 14:44               ` Jens Axboe
@ 2004-05-11  3:22                 ` Nick Piggin
  0 siblings, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2004-05-11  3:22 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Chen, Kenneth W, 'Andrew Morton', linux-kernel

Jens Axboe wrote:
> On Tue, May 11 2004, Nick Piggin wrote:
> 

>>
>>While we're doing that can we drop the GFP_ATOMIC allocation
>>completely?
> 
> 
> Thought the same thing. But lets stick to single item tests first, then
> we can kill that double allocation after.
> 

That sounds like the best idea :)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Cache queue_congestion_on/off_threshold
  2004-05-10 14:30           ` Jens Axboe
  2004-05-10 14:43             ` Nick Piggin
@ 2004-05-12  5:32             ` Chen, Kenneth W
  2004-05-12  7:05               ` Jens Axboe
  2004-05-12 14:22               ` Jens Axboe
  1 sibling, 2 replies; 18+ messages in thread
From: Chen, Kenneth W @ 2004-05-12  5:32 UTC (permalink / raw)
  To: 'Jens Axboe'; +Cc: 'Andrew Morton', linux-kernel

>>>> Jens Axboe wrote on Monday, May 10, 2004 7:30 AM
> > >
> > > Actually, with the good working batching we might get away with killing
> > > freereq completely. Have you tested that (if not, could you?)
> >
> > Sorry, I'm clueless on "good working batching".  If you could please give
> > me some pointers, I will definitely test it.
>
> Something like this.
>
> --- linux-2.6.6/drivers/block/ll_rw_blk.c~	2004-05-10 16:23:45.684726955 +0200
> +++ linux-2.6.6/drivers/block/ll_rw_blk.c	2004-05-10 16:29:04.333792268 +0200
> @@ -2138,8 +2138,8 @@
>
>  static int __make_request(request_queue_t *q, struct bio *bio)
>  {
> -	struct request *req, *freereq = NULL;
>  	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
> +	struct request *req;
>  	sector_t sector;
>
>
> [snip] ...

I'm still working on this.  With this patch, several processes stuck
in "D" state and never finish.  Suspect it's the barrier thing, it
jumps through blk_plug_device() and might goof up the queue afterwards.

- Ken



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-12  5:32             ` Chen, Kenneth W
@ 2004-05-12  7:05               ` Jens Axboe
  2004-05-12 13:48                 ` Jens Axboe
  2004-05-12 14:22               ` Jens Axboe
  1 sibling, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2004-05-12  7:05 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: 'Andrew Morton', linux-kernel

On Tue, May 11 2004, Chen, Kenneth W wrote:
> >>>> Jens Axboe wrote on Monday, May 10, 2004 7:30 AM
> > > >
> > > > Actually, with the good working batching we might get away with killing
> > > > freereq completely. Have you tested that (if not, could you?)
> > >
> > > Sorry, I'm clueless on "good working batching".  If you could please give
> > > me some pointers, I will definitely test it.
> >
> > Something like this.
> >
> > --- linux-2.6.6/drivers/block/ll_rw_blk.c~	2004-05-10 16:23:45.684726955 +0200
> > +++ linux-2.6.6/drivers/block/ll_rw_blk.c	2004-05-10 16:29:04.333792268 +0200
> > @@ -2138,8 +2138,8 @@
> >
> >  static int __make_request(request_queue_t *q, struct bio *bio)
> >  {
> > -	struct request *req, *freereq = NULL;
> >  	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
> > +	struct request *req;
> >  	sector_t sector;
> >
> >
> > [snip] ...
> 
> I'm still working on this.  With this patch, several processes stuck
> in "D" state and never finish.  Suspect it's the barrier thing, it
> jumps through blk_plug_device() and might goof up the queue afterwards.

I'll do a quick test run (and review) of the patch, it wasn't even
compiled here. So the chance of a slip-up is non-zero.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-12  7:05               ` Jens Axboe
@ 2004-05-12 13:48                 ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2004-05-12 13:48 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: 'Andrew Morton', linux-kernel

On Wed, May 12 2004, Jens Axboe wrote:
> On Tue, May 11 2004, Chen, Kenneth W wrote:
> > >>>> Jens Axboe wrote on Monday, May 10, 2004 7:30 AM
> > > > >
> > > > > Actually, with the good working batching we might get away with killing
> > > > > freereq completely. Have you tested that (if not, could you?)
> > > >
> > > > Sorry, I'm clueless on "good working batching".  If you could please give
> > > > me some pointers, I will definitely test it.
> > >
> > > Something like this.
> > >
> > > --- linux-2.6.6/drivers/block/ll_rw_blk.c~	2004-05-10 16:23:45.684726955 +0200
> > > +++ linux-2.6.6/drivers/block/ll_rw_blk.c	2004-05-10 16:29:04.333792268 +0200
> > > @@ -2138,8 +2138,8 @@
> > >
> > >  static int __make_request(request_queue_t *q, struct bio *bio)
> > >  {
> > > -	struct request *req, *freereq = NULL;
> > >  	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
> > > +	struct request *req;
> > >  	sector_t sector;
> > >
> > >
> > > [snip] ...
> > 
> > I'm still working on this.  With this patch, several processes stuck
> > in "D" state and never finish.  Suspect it's the barrier thing, it
> > jumps through blk_plug_device() and might goof up the queue afterwards.
> 
> I'll do a quick test run (and review) of the patch, it wasn't even
> compiled here. So the chance of a slip-up is non-zero.

This at least boots here, does it work correctly for you?

--- include/linux/bio.h~	2004-05-12 09:57:55.000000000 +0200
+++ include/linux/bio.h	2004-05-12 13:15:16.672532790 +0200
@@ -126,6 +126,7 @@
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
+#define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
 
 /*
  * various member access, note that bio_data should of course not be used
--- drivers/block/ll_rw_blk.c~	2004-05-12 09:40:37.000000000 +0200
+++ drivers/block/ll_rw_blk.c	2004-05-12 15:42:06.924309518 +0200
@@ -1614,7 +1615,6 @@
 	DEFINE_WAIT(wait);
 	struct request *rq;
 
-	generic_unplug_device(q);
 	do {
 		struct request_list *rl = &q->rq;
 
@@ -1626,6 +1626,7 @@
 		if (!rq) {
 			struct io_context *ioc;
 
+			generic_unplug_device(q);
 			io_schedule();
 
 			/*
@@ -2133,8 +2134,8 @@
 
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
-	struct request *req, *freereq = NULL;
 	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+	struct request *req = NULL;
 	sector_t sector;
 
 	sector = bio->bi_sector;
@@ -2152,17 +2153,17 @@
 
 	spin_lock_prefetch(q->queue_lock);
 
-	barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
-
-	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
+	mb();
+	barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
+	ra = bio_rw_flagged(bio, BIO_RW_AHEAD);
 
-again:
 	spin_lock_irq(q->queue_lock);
 
 	if (elv_queue_empty(q)) {
 		blk_plug_device(q);
 		goto get_rq;
 	}
+
 	if (barrier)
 		goto get_rq;
 
@@ -2225,21 +2226,16 @@
 	 * a free slot.
 	 */
 get_rq:
-	if (freereq) {
-		req = freereq;
-		freereq = NULL;
-	} else {
-		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
-			/*
-			 * READA bit set
-			 */
-			if (ra)
-				goto end_io;
+	spin_unlock_irq(q->queue_lock);
+	req = get_request(q, rw, GFP_ATOMIC);
+	if (!req) {
+		/*
+		 * READA bit set
+		 */
+		if (ra)
+			goto end_io;
 	
-			freereq = get_request_wait(q, rw);
-		}
-		goto again;
+		req = get_request_wait(q, rw);
 	}
 
 	req->flags |= REQ_CMD;
@@ -2248,7 +2244,7 @@
 	 * inherit FAILFAST from bio and don't stack up
 	 * retries for read ahead
 	 */
-	if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw))	
+	if (ra || bio_rw_flagged(bio, BIO_RW_FAILFAST))
 		req->flags |= REQ_FAILFAST;
 
 	/*
@@ -2271,10 +2267,9 @@
 	req->rq_disk = bio->bi_bdev->bd_disk;
 	req->start_time = jiffies;
 
+	spin_lock_irq(q->queue_lock);
 	add_request(q, req);
 out:
-	if (freereq)
-		__blk_put_request(q, freereq);
 
 	if (blk_queue_plugged(q)) {
 		int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight;

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Cache queue_congestion_on/off_threshold
  2004-05-12  5:32             ` Chen, Kenneth W
  2004-05-12  7:05               ` Jens Axboe
@ 2004-05-12 14:22               ` Jens Axboe
  1 sibling, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2004-05-12 14:22 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: 'Andrew Morton', linux-kernel

On Tue, May 11 2004, Chen, Kenneth W wrote:
> >>>> Jens Axboe wrote on Monday, May 10, 2004 7:30 AM
> > > >
> > > > Actually, with the good working batching we might get away with killing
> > > > freereq completely. Have you tested that (if not, could you?)
> > >
> > > Sorry, I'm clueless on "good working batching".  If you could please give
> > > me some pointers, I will definitely test it.
> >
> > Something like this.
> >
> > --- linux-2.6.6/drivers/block/ll_rw_blk.c~	2004-05-10 16:23:45.684726955 +0200
> > +++ linux-2.6.6/drivers/block/ll_rw_blk.c	2004-05-10 16:29:04.333792268 +0200
> > @@ -2138,8 +2138,8 @@
> >
> >  static int __make_request(request_queue_t *q, struct bio *bio)
> >  {
> > -	struct request *req, *freereq = NULL;
> >  	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
> > +	struct request *req;
> >  	sector_t sector;
> >
> >
> > [snip] ...
> 
> I'm still working on this.  With this patch, several processes stuck
> in "D" state and never finish.  Suspect it's the barrier thing, it
> jumps through blk_plug_device() and might goof up the queue afterwards.

BTW, the barrier bit was buggy, but it could not make a difference since
the stock kernels + -mm doesn't use barriers. So you are right it's
wrong, but no impact on this test.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2004-05-12 14:23 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-05-05 22:12 Cache queue_congestion_on/off_threshold Chen, Kenneth W
2004-05-06  6:20 ` Jens Axboe
2004-05-06  6:34   ` Andrew Morton
2004-05-06  6:43     ` Jens Axboe
2004-05-06 20:30       ` Chen, Kenneth W
2004-05-07  3:02         ` Andrew Morton
2004-05-07  9:35           ` Jens Axboe
2004-05-06 20:29     ` Chen, Kenneth W
2004-05-07  9:39       ` Jens Axboe
2004-05-07 22:00         ` Chen, Kenneth W
2004-05-10 14:30           ` Jens Axboe
2004-05-10 14:43             ` Nick Piggin
2004-05-10 14:44               ` Jens Axboe
2004-05-11  3:22                 ` Nick Piggin
2004-05-12  5:32             ` Chen, Kenneth W
2004-05-12  7:05               ` Jens Axboe
2004-05-12 13:48                 ` Jens Axboe
2004-05-12 14:22               ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox