public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] cciss per disk queue for 2.6
@ 2005-06-30 21:03 mike.miller
  2005-07-01  7:02 ` Jens Axboe
  0 siblings, 1 reply; 6+ messages in thread
From: mike.miller @ 2005-06-30 21:03 UTC (permalink / raw)
  To: akpm, axboe; +Cc: linux-kernel, linux-scsi

This patch adds per disk queue functionality to cciss. Sometime back I submitted a patch but it looks like it was only part of what I needed. In the 2.6 kernel if we have more than one logical volume the driver will Oops during rmmod. It seems all of the queues actually point back to the same queue. So after deleting the first volume you hit a null pointer on the second one.

This has been tested in our labs. There is no difference in performance, it just fixes the Oops. Please consider this patch for inclusion.

Signed-off-by: Mike Miller <mike.miller@hp.com>

 cciss.c |   49 ++++++++++++++++++++++++++-----------------------
 cciss.h |    4 ++--
 2 files changed, 28 insertions(+), 25 deletions(-)
--------------------------------------------------------------------------------
diff -burNp lx2612-rc6.orig/drivers/block/cciss.c lx2612-rc6-tmp/drivers/block/cciss.c
--- lx2612-rc6.orig/drivers/block/cciss.c	2005-06-13 14:51:02.000000000 -0500
+++ lx2612-rc6-tmp/drivers/block/cciss.c	2005-06-30 16:54:56.000000000 -0500
@@ -1140,7 +1140,7 @@ static int revalidate_allvol(ctlr_info_t
 		/* this is for the online array utilities */
 		if (!drv->heads && i)
 			continue;
-		blk_queue_hardsect_size(host->queue, drv->block_size);
+		blk_queue_hardsect_size(drv->queue, drv->block_size);
 		set_capacity(disk, drv->nr_blocks);
 		add_disk(disk);
 	}
@@ -1696,7 +1696,7 @@ static int cciss_revalidate(struct gendi
 	cciss_read_capacity(h->ctlr, logvol, size_buff, 1, &total_size, &block_size);
 	cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, inq_buff, drv);
 
-	blk_queue_hardsect_size(h->queue, drv->block_size);
+	blk_queue_hardsect_size(drv->queue, drv->block_size);
 	set_capacity(disk, drv->nr_blocks);
 
 	kfree(size_buff);
@@ -2253,12 +2253,12 @@ static irqreturn_t do_cciss_intr(int irq
  	 * them up.  We will also keep track of the next queue to run so
  	 * that every queue gets a chance to be started first.
  	*/
- 	for (j=0; j < NWD; j++){
- 		int curr_queue = (start_queue + j) % NWD;
+	for (j=0; j < h->highest_lun + 1; j++){
+		int curr_queue = (start_queue + j) % (h->highest_lun + 1);
  		/* make sure the disk has been added and the drive is real
  		 * because this can be called from the middle of init_one.
  		*/
- 		if(!(h->gendisk[curr_queue]->queue) ||
+		if(!(h->drv[curr_queue].queue) || 
 		 		   !(h->drv[curr_queue].heads))
  			continue;
  		blk_start_queue(h->gendisk[curr_queue]->queue);
@@ -2269,14 +2269,14 @@ static irqreturn_t do_cciss_intr(int irq
  		if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS)
  		{
  			if (curr_queue == start_queue){
- 				h->next_to_run = (start_queue + 1) % NWD;
+				h->next_to_run = (start_queue + 1) % (h->highest_lun + 1);
  				goto cleanup;
  			} else {
  				h->next_to_run = curr_queue;
  				goto cleanup;
  	}
  		} else {
- 			curr_queue = (curr_queue + 1) % NWD;
+			curr_queue = (curr_queue + 1) % (h->highest_lun + 1);
  		}
  	}
 
@@ -2284,7 +2284,6 @@ cleanup:
 	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
 	return IRQ_HANDLED;
 }
-
 /* 
  *  We cannot read the structure directly, for portablity we must use 
  *   the io functions.
@@ -2799,13 +2798,6 @@ static int __devinit cciss_init_one(stru
 	}
 
 	spin_lock_init(&hba[i]->lock);
-	q = blk_init_queue(do_cciss_request, &hba[i]->lock);
-	if (!q)
-		goto clean4;
-
-	q->backing_dev_info.ra_pages = READ_AHEAD;
-	hba[i]->queue = q;
-	q->queuedata = hba[i];
 
 	/* Initialize the pdev driver private data. 
 		have it point to hba[i].  */
@@ -2827,6 +2819,20 @@ static int __devinit cciss_init_one(stru
 
 	cciss_procinit(i);
 
+	for(j=0; j < NWD; j++) { /* mfm */
+		drive_info_struct *drv = &(hba[i]->drv[j]);
+		struct gendisk *disk = hba[i]->gendisk[j];
+
+		q = blk_init_queue(do_cciss_request, &hba[i]->lock);
+		if (!q) {
+			printk(KERN_ERR
+			   "cciss:  unable to allocate queue for disk %d\n",
+			   j);
+			break;
+		}
+		drv->queue = q;
+
+		q->backing_dev_info.ra_pages = READ_AHEAD;
 	blk_queue_bounce_limit(q, hba[i]->pdev->dma_mask);
 
 	/* This is a hardware imposed limit. */
@@ -2837,26 +2843,23 @@ static int __devinit cciss_init_one(stru
 
 	blk_queue_max_sectors(q, 512);
 
-
-	for(j=0; j<NWD; j++) {
-		drive_info_struct *drv = &(hba[i]->drv[j]);
-		struct gendisk *disk = hba[i]->gendisk[j];
-
+		q->queuedata = hba[i];
 		sprintf(disk->disk_name, "cciss/c%dd%d", i, j);
 		sprintf(disk->devfs_name, "cciss/host%d/target%d", i, j);
 		disk->major = hba[i]->major;
 		disk->first_minor = j << NWD_SHIFT;
 		disk->fops = &cciss_fops;
-		disk->queue = hba[i]->queue;
+		disk->queue = q;
 		disk->private_data = drv;
 		/* we must register the controller even if no disks exist */
 		/* this is for the online array utilities */
 		if(!drv->heads && j)
 			continue;
-		blk_queue_hardsect_size(hba[i]->queue, drv->block_size);
+		blk_queue_hardsect_size(q, drv->block_size);
 		set_capacity(disk, drv->nr_blocks);
 		add_disk(disk);
 	}
+
 	return(1);
 
 clean4:
@@ -2922,10 +2925,10 @@ static void __devexit cciss_remove_one (
 	for (j = 0; j < NWD; j++) {
 		struct gendisk *disk = hba[i]->gendisk[j];
 		if (disk->flags & GENHD_FL_UP)
+			blk_cleanup_queue(disk->queue);
 			del_gendisk(disk);
 	}
 
-	blk_cleanup_queue(hba[i]->queue);
 	pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof(CommandList_struct),
 			    hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle);
 	pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof( ErrorInfo_struct),
diff -burNp lx2612-rc6.orig/drivers/block/cciss.h lx2612-rc6-tmp/drivers/block/cciss.h
--- lx2612-rc6.orig/drivers/block/cciss.h	2005-06-13 14:51:02.000000000 -0500
+++ lx2612-rc6-tmp/drivers/block/cciss.h	2005-06-30 16:46:10.000000000 -0500
@@ -29,6 +29,7 @@ typedef struct _drive_info_struct
 {
  	__u32   LunID;	
 	int 	usage_count;
+	struct request_queue *queue;
 	sector_t nr_blocks;
 	int	block_size;
 	int 	heads;
@@ -72,7 +73,6 @@ struct ctlr_info 
 	unsigned int maxQsinceinit;
 	unsigned int maxSG;
 	spinlock_t lock;
-	struct request_queue *queue;
 
 	//* pointers to command and error info pool */ 
 	CommandList_struct 	*cmd_pool;
@@ -260,7 +260,7 @@ struct board_type {
 	struct access_method *access;
 };
 
-#define CCISS_LOCK(i)	(hba[i]->queue->queue_lock)
+#define CCISS_LOCK(i)	(&hba[i]->lock)
 
 #endif /* CCISS_H */
 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] cciss per disk queue for 2.6
  2005-06-30 21:03 mike.miller
@ 2005-07-01  7:02 ` Jens Axboe
  0 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2005-07-01  7:02 UTC (permalink / raw)
  To: mike.miller; +Cc: akpm, linux-kernel, linux-scsi

On Thu, Jun 30 2005, mike.miller@hp.com wrote:
> This patch adds per disk queue functionality to cciss. Sometime back I
> submitted a patch but it looks like only part of what I needed. In the
> 2.6 kernel if we have more than one logical volume the driver will
> Oops during rmmod. It seems all of the queues actually point back to
> the same queue. So after deleting the first volume you hit a null
> pointer on the second one.
> 
> This has been tested in our labs. There is no difference in
> performance, it just fixes the Oops. Please consider this patch for
> inclusion.

Going to per-disk queues is undoubtedly a good idea, performance will be
much better this way. So far, so good.

But you need to do something about the queueing for this to be
acceptable, imho. You have a per-controller queueing limit in place, you
need to enforce some fairness to ensure an equal distribution of
commands between the disks.

Perhaps just limit the per-queue depth to something sane, instead of the
huuuge 384 commands you have now. I've had several people complain to
me, that ciss is doing some nasty starvation with that many commands in
flight and we've effectively had to limit the queueing depth to
something really low to get adequate read performance in presence of
writes.


-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH 1/1] cciss per disk queue for 2.6
@ 2005-07-01 16:41 Miller, Mike (OS Dev)
  2005-07-01 16:51 ` Jens Axboe
  0 siblings, 1 reply; 6+ messages in thread
From: Miller, Mike (OS Dev) @ 2005-07-01 16:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: akpm, linux-kernel, linux-scsi

 

> -----Original Message-----
> From: Jens Axboe [mailto:axboe@suse.de] 
> Sent: Friday, July 01, 2005 2:02 AM
> To: Miller, Mike (OS Dev)
> Cc: akpm@osdl.org; linux-kernel@vger.kernel.org; 
> linux-scsi@vger.kernel.org
> Subject: Re: [PATCH 1/1] cciss per disk queue for 2.6
> 
> On Thu, Jun 30 2005, mike.miller@hp.com wrote:
> > This patch adds per disk queue functionality to cciss. 
> Sometime back I 
> > submitted a patch but it looks like only part of what I 
> needed. In the
> > 2.6 kernel if we have more than one logical volume the driver will 
> > Oops during rmmod. It seems all of the queues actually 
> point back to 
> > the same queue. So after deleting the first volume you hit a null 
> > pointer on the second one.
> > 
> > This has been tested in our labs. There is no difference in 
> > performance, it just fixes the Oops. Please consider this patch for 
> > inclusion.
> 
> Going to per-disk queues is undoubtedly a good idea, 
> performance will be much better this way. So far, so good.
> 
> But you need to do something about the queueing for this to 
> be acceptable, imho. You have a per-controller queueing limit 
> in place, you need to enforce some fairness to ensure an 
> equal distribution of commands between the disks.

Sometime back I submitted what we called the "fair enough" algorithm. I
don't know how I managed to separate the two. :( But it was accepted and
is in the mainstream kernel. Here's a snippet:
------------------------------------------------------------------------
-
-	/*
-	 * See if we can queue up some more IO
-	 * check every disk that exists on this controller
-	 * and start it's IO
-	 */
-	for(j=0; j < NWD; j++){
-		/* make sure the disk has been added and the drive is
real */
-		/* because this can be called from the middle of
init_one */
-		if(!(h->gendisk[j]->queue) || !(h->drv[j].heads))
+	/* check to see if we have maxed out the number of commands that
can
+	 * be placed on the queue.  If so then exit.  We do this check
here
+	 * in case the interrupt we serviced was from an ioctl and did
not
+	 * free any new commands.
+	*/
+	if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS)
+		goto cleanup;
+
+	/* We have room on the queue for more commands.  Now we need to
queue
+	 * them up.  We will also keep track of the next queue to run so
+	 * that every queue gets a chance to be started first.
+	*/
+	for (j=0; j < NWD; j++){
+		int curr_queue = (start_queue + j) % NWD;
+		/* make sure the disk has been added and the drive is
real
+		 * because this can be called from the middle of
init_one.
+		*/
+		if(!(h->gendisk[curr_queue]->queue) || 
+		   !(h->drv[curr_queue].heads))
------------------------------------------------------------------------
-
> 
> Perhaps just limit the per-queue depth to something sane, 
> instead of the huuuge 384 commands you have now. I've had 
> several people complain to me, that ciss is doing some nasty 
> starvation with that many commands in flight and we've 
> effectively had to limit the queueing depth to something 
> really low to get adequate read performance in presence of writes.

The combination of per disk queues and this "fairness" prevents
starvation on different LV's. Ex: if volumes 0 & 1 are being written and
volume 3 is being read all will get _almost_ equal time. We believe the
elevator algoritm(s) may be causing writes to starve reads on the same
logical volume. We continue to investigate our other performance issues.

Thanks,
mikem


> 
> 
> --
> Jens Axboe
> 
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] cciss per disk queue for 2.6
  2005-07-01 16:41 Miller, Mike (OS Dev)
@ 2005-07-01 16:51 ` Jens Axboe
  0 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2005-07-01 16:51 UTC (permalink / raw)
  To: Miller, Mike (OS Dev); +Cc: akpm, linux-kernel, linux-scsi

On Fri, Jul 01 2005, Miller, Mike (OS Dev) wrote:
> > > This has been tested in our labs. There is no difference in 
> > > performance, it just fixes the Oops. Please consider this patch for 
> > > inclusion.
> > 
> > Going to per-disk queues is undoubtedly a good idea, 
> > performance will be much better this way. So far, so good.
> > 
> > But you need to do something about the queueing for this to 
> > be acceptable, imho. You have a per-controller queueing limit 
> > in place, you need to enforce some fairness to ensure an 
> > equal distribution of commands between the disks.
> 
> Sometime back I submitted what we called the "fair enough" algorithm. I
> don't know how I managed to separate the two. :( But it was accepted and
> is in the mainstream kernel. Here's a snippet:
> ------------------------------------------------------------------------
> -
> -	/*
> -	 * See if we can queue up some more IO
> -	 * check every disk that exists on this controller
> -	 * and start it's IO
> -	 */
> -	for(j=0; j < NWD; j++){
> -		/* make sure the disk has been added and the drive is
> real */
> -		/* because this can be called from the middle of
> init_one */
> -		if(!(h->gendisk[j]->queue) || !(h->drv[j].heads))
> +	/* check to see if we have maxed out the number of commands that
> can
> +	 * be placed on the queue.  If so then exit.  We do this check
> here
> +	 * in case the interrupt we serviced was from an ioctl and did
> not
> +	 * free any new commands.
> +	*/
> +	if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS)
> +		goto cleanup;
> +
> +	/* We have room on the queue for more commands.  Now we need to
> queue
> +	 * them up.  We will also keep track of the next queue to run so
> +	 * that every queue gets a chance to be started first.
> +	*/
> +	for (j=0; j < NWD; j++){
> +		int curr_queue = (start_queue + j) % NWD;
> +		/* make sure the disk has been added and the drive is
> real
> +		 * because this can be called from the middle of
> init_one.
> +		*/
> +		if(!(h->gendisk[curr_queue]->queue) || 
> +		   !(h->drv[curr_queue].heads))

I suppose that will be fair-enough, at least as long as NR_CMDS >>
q->nr_requests. With eg 4 volumes, I don't think this will be the case.
Just making sure you round-robin start the queues is not enough to
ensure fair queueing between them.

> > Perhaps just limit the per-queue depth to something sane, 
> > instead of the huuuge 384 commands you have now. I've had 
> > several people complain to me, that ciss is doing some nasty 
> > starvation with that many commands in flight and we've 
> > effectively had to limit the queueing depth to something 
> > really low to get adequate read performance in presence of writes.
> 
> The combination of per disk queues and this "fairness" prevents
> starvation on different LV's. Ex: if volumes 0 & 1 are being written and
> volume 3 is being read all will get _almost_ equal time. We believe the

Different thing, I'm talking about single volume starvation, not
volume-to-volume.

> elevator algoritm(s) may be causing writes to starve reads on the same
> logical volume. We continue to investigate our other performance issues.

I completely disagree. Even with an intelligent io scheduler, starvation
is seen on ciss that does not happen on other queueing hardware such as
'normal' scsi controllers/drives. So something else is going on, and the
only 'fix' so far is to limit the ciss queue depth heavily.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH 1/1] cciss per disk queue for 2.6
@ 2005-07-01 16:53 Miller, Mike (OS Dev)
  2005-07-01 18:21 ` Jens Axboe
  0 siblings, 1 reply; 6+ messages in thread
From: Miller, Mike (OS Dev) @ 2005-07-01 16:53 UTC (permalink / raw)
  To: Jens Axboe; +Cc: akpm, linux-kernel, linux-scsi

Jens Axboe wrote:
> Different thing, I'm talking about single volume starvation, 
> not volume-to-volume.
> 
> > elevator algoritm(s) may be causing writes to starve reads 
> on the same 
> > logical volume. We continue to investigate our other 
> performance issues.
> 
> I completely disagree. Even with an intelligent io scheduler, 
> starvation is seen on ciss that does not happen on other 
> queueing hardware such as 'normal' scsi controllers/drives. 
> So something else is going on, and the only 'fix' so far is 
> to limit the ciss queue depth heavily.

We will investigate this further and come up with a solution. Could be
the firmware, I suppose.

mikem

> 
> --
> Jens Axboe
> 
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] cciss per disk queue for 2.6
  2005-07-01 16:53 [PATCH 1/1] cciss per disk queue for 2.6 Miller, Mike (OS Dev)
@ 2005-07-01 18:21 ` Jens Axboe
  0 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2005-07-01 18:21 UTC (permalink / raw)
  To: Miller, Mike (OS Dev); +Cc: akpm, linux-kernel, linux-scsi

On Fri, Jul 01 2005, Miller, Mike (OS Dev) wrote:
> Jens Axboe wrote:
> > Different thing, I'm talking about single volume starvation, 
> > not volume-to-volume.
> > 
> > > elevator algoritm(s) may be causing writes to starve reads 
> > on the same 
> > > logical volume. We continue to investigate our other 
> > performance issues.
> > 
> > I completely disagree. Even with an intelligent io scheduler, 
> > starvation is seen on ciss that does not happen on other 
> > queueing hardware such as 'normal' scsi controllers/drives. 
> > So something else is going on, and the only 'fix' so far is 
> > to limit the ciss queue depth heavily.
> 
> We will investigate this further and come up with a solution. Could be
> the firmware, I suppose.

Unfortunately I don't have any hardware at hand for testing, so I cannot
do it myself. Would be appreciated!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2005-07-01 18:19 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-07-01 16:53 [PATCH 1/1] cciss per disk queue for 2.6 Miller, Mike (OS Dev)
2005-07-01 18:21 ` Jens Axboe
  -- strict thread matches above, loose matches on Subject: below --
2005-07-01 16:41 Miller, Mike (OS Dev)
2005-07-01 16:51 ` Jens Axboe
2005-06-30 21:03 mike.miller
2005-07-01  7:02 ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox