[PATCH] Fix aic7xxx del_timer

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] Fix aic7xxx del_timer_sync() deadlock
@ 2004-02-27 18:26 James Bottomley
  2004-02-27 19:23 ` Justin T. Gibbs
  0 siblings, 1 reply; 29+ messages in thread
From: James Bottomley @ 2004-02-27 18:26 UTC (permalink / raw)
  To: gibbs; +Cc: SCSI Mailing List, Andrew Morton

The deadlock is quite simple: if the device timer ever fires, it can
call a linux_free_device() from the timer which will do a del_timer_sync
on the timer and hang.

The best fix, I think is to eliminate this timer from the driver.  All
it was doing was providing a time limited queue freeze for the BUSY and
QUEUE_FULL on no outstanding commands cases.  These are handled
correctly by the mid-layer, so there's no need for the driver do try to
handle them on it's own.

It looks like the only other consumer of the queue freezing api is an
untimed tag/untagged switch, so I pulled out the per device timer as
well.

James

===== drivers/scsi/aic7xxx/aic79xx_osm.c 1.52 vs edited =====
--- 1.52/drivers/scsi/aic7xxx/aic79xx_osm.c	Wed Feb 18 21:42:35 2004
+++ edited/drivers/scsi/aic7xxx/aic79xx_osm.c	Fri Feb 27 12:20:52 2004
@@ -472,7 +472,6 @@
 					 Scsi_Cmnd *cmd);
 static void ahd_linux_filter_inquiry(struct ahd_softc *ahd,
 				     struct ahd_devinfo *devinfo);
-static void ahd_linux_dev_timed_unfreeze(u_long arg);
 static void ahd_linux_sem_timeout(u_long arg);
 static void ahd_linux_initialize_scsi_bus(struct ahd_softc *ahd);
 static void ahd_linux_size_nseg(void);
@@ -4262,7 +4261,6 @@
 	if (dev == NULL)
 		return (NULL);
 	memset(dev, 0, sizeof(*dev));
-	init_timer(&dev->timer);
 	TAILQ_INIT(&dev->busyq);
 	dev->flags = AHD_DEV_UNCONFIGURED;
 	dev->lun = lun;
@@ -4291,7 +4289,6 @@
 {
 	struct ahd_linux_target *targ;
 
-	del_timer(&dev->timer);
 	targ = dev->target;
 	targ->devices[dev->lun] = NULL;
 	free(dev, M_DEVBUF);
@@ -4683,28 +4680,9 @@
 			     (dev->flags & AHD_DEV_Q_BASIC)
 			   ? AHD_QUEUE_BASIC : AHD_QUEUE_TAGGED);
 		ahd_set_scsi_status(scb, SCSI_STATUS_BUSY);
-		/* FALLTHROUGH */
-	}
-	case SCSI_STATUS_BUSY:
-		/*
-		 * Set a short timer to defer sending commands for
-		 * a bit since Linux will not delay in this case.
-		 */
-		if ((dev->flags & AHD_DEV_TIMER_ACTIVE) != 0) {
-			printf("%s:%c:%d: Device Timer still active during "
-			       "busy processing\n", ahd_name(ahd),
-				dev->target->channel, dev->target->target);
-			break;
-		}
-		dev->flags |= AHD_DEV_TIMER_ACTIVE;
-		dev->qfrozen++;
-		init_timer(&dev->timer);
-		dev->timer.data = (u_long)dev;
-		dev->timer.expires = jiffies + (HZ/2);
-		dev->timer.function = ahd_linux_dev_timed_unfreeze;
-		add_timer(&dev->timer);
 		break;
 	}
+	}
 }
 
 static void
@@ -5017,28 +4995,6 @@
 		scb->platform_data->flags &= ~AHD_SCB_UP_EH_SEM;
 		up(&ahd->platform_data->eh_sem);
 	}
-	ahd_unlock(ahd, &s);
-}
-
-static void
-ahd_linux_dev_timed_unfreeze(u_long arg)
-{
-	struct ahd_linux_device *dev;
-	struct ahd_softc *ahd;
-	u_long s;
-
-	dev = (struct ahd_linux_device *)arg;
-	ahd = dev->target->ahd;
-	ahd_lock(ahd, &s);
-	dev->flags &= ~AHD_DEV_TIMER_ACTIVE;
-	if (dev->qfrozen > 0)
-		dev->qfrozen--;
-	if (dev->qfrozen == 0
-	 && (dev->flags & AHD_DEV_ON_RUN_LIST) == 0)
-		ahd_linux_run_device_queue(ahd, dev);
-	if ((dev->flags & AHD_DEV_UNCONFIGURED) != 0
-	 && dev->active == 0)
-		ahd_linux_free_device(ahd, dev);
 	ahd_unlock(ahd, &s);
 }
 
===== drivers/scsi/aic7xxx/aic79xx_osm.h 1.35 vs edited =====
--- 1.35/drivers/scsi/aic7xxx/aic79xx_osm.h	Tue Dec 30 12:12:35 2003
+++ edited/drivers/scsi/aic7xxx/aic79xx_osm.h	Fri Feb 27 12:16:56 2004
@@ -387,11 +387,6 @@
 	ahd_linux_dev_flags	flags;
 
 	/*
-	 * Per device timer.
-	 */
-	struct timer_list	timer;
-
-	/*
 	 * The high limit for the tags variable.
 	 */
 	u_int			maxtags;
===== drivers/scsi/aic7xxx/aic7xxx_osm.c 1.49 vs edited =====
--- 1.49/drivers/scsi/aic7xxx/aic7xxx_osm.c	Wed Feb 18 21:42:35 2004
+++ edited/drivers/scsi/aic7xxx/aic7xxx_osm.c	Fri Feb 27 12:12:51 2004
@@ -490,7 +490,6 @@
 static void ahc_linux_sem_timeout(u_long arg);
 static void ahc_linux_freeze_simq(struct ahc_softc *ahc);
 static void ahc_linux_release_simq(u_long arg);
-static void ahc_linux_dev_timed_unfreeze(u_long arg);
 static int  ahc_linux_queue_recovery_cmd(Scsi_Cmnd *cmd, scb_flag flag);
 static void ahc_linux_initialize_scsi_bus(struct ahc_softc *ahc);
 static void ahc_linux_size_nseg(void);
@@ -3944,7 +3943,6 @@
 	if (dev == NULL)
 		return (NULL);
 	memset(dev, 0, sizeof(*dev));
-	init_timer(&dev->timer);
 	TAILQ_INIT(&dev->busyq);
 	dev->flags = AHC_DEV_UNCONFIGURED;
 	dev->lun = lun;
@@ -3973,7 +3971,6 @@
 {
 	struct ahc_linux_target *targ;
 
-	del_timer_sync(&dev->timer);
 	targ = dev->target;
 	targ->devices[dev->lun] = NULL;
 	free(dev, M_DEVBUF);
@@ -4363,27 +4360,6 @@
 		ahc_platform_set_tags(ahc, &devinfo,
 			     (dev->flags & AHC_DEV_Q_BASIC)
 			   ? AHC_QUEUE_BASIC : AHC_QUEUE_TAGGED);
-		/* FALLTHROUGH */
-	}
-	case SCSI_STATUS_BUSY:
-	{
-		/*
-		 * Set a short timer to defer sending commands for
-		 * a bit since Linux will not delay in this case.
-		 */
-		if ((dev->flags & AHC_DEV_TIMER_ACTIVE) != 0) {
-			printf("%s:%c:%d: Device Timer still active during "
-			       "busy processing\n", ahc_name(ahc),
-				dev->target->channel, dev->target->target);
-			break;
-		}
-		dev->flags |= AHC_DEV_TIMER_ACTIVE;
-		dev->qfrozen++;
-		init_timer(&dev->timer);
-		dev->timer.data = (u_long)dev;
-		dev->timer.expires = jiffies + (HZ/2);
-		dev->timer.function = ahc_linux_dev_timed_unfreeze;
-		add_timer(&dev->timer);
 		break;
 	}
 	}
@@ -4673,28 +4649,6 @@
 	 */
 	if (unblock_reqs)
 		scsi_unblock_requests(ahc->platform_data->host);
-}
-
-static void
-ahc_linux_dev_timed_unfreeze(u_long arg)
-{
-	struct ahc_linux_device *dev;
-	struct ahc_softc *ahc;
-	u_long s;
-
-	dev = (struct ahc_linux_device *)arg;
-	ahc = dev->target->ahc;
-	ahc_lock(ahc, &s);
-	dev->flags &= ~AHC_DEV_TIMER_ACTIVE;
-	if (dev->qfrozen > 0)
-		dev->qfrozen--;
-	if (dev->qfrozen == 0
-	 && (dev->flags & AHC_DEV_ON_RUN_LIST) == 0)
-		ahc_linux_run_device_queue(ahc, dev);
-	if (TAILQ_EMPTY(&dev->busyq)
-	 && dev->active == 0)
-		ahc_linux_free_device(ahc, dev);
-	ahc_unlock(ahc, &s);
 }
 
 static int
===== drivers/scsi/aic7xxx/aic7xxx_osm.h 1.51 vs edited =====
--- 1.51/drivers/scsi/aic7xxx/aic7xxx_osm.h	Tue Dec 30 12:12:37 2003
+++ edited/drivers/scsi/aic7xxx/aic7xxx_osm.h	Fri Feb 27 12:13:34 2004
@@ -399,11 +399,6 @@
 	ahc_linux_dev_flags	flags;
 
 	/*
-	 * Per device timer.
-	 */
-	struct timer_list	timer;
-
-	/*
 	 * The high limit for the tags variable.
 	 */
 	u_int			maxtags;


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-27 18:26 [PATCH] Fix aic7xxx del_timer_sync() deadlock James Bottomley
@ 2004-02-27 19:23 ` Justin T. Gibbs
  2004-02-27 19:34   ` James Bottomley
  2004-02-28  2:40   ` Jeff Garzik
  0 siblings, 2 replies; 29+ messages in thread
From: Justin T. Gibbs @ 2004-02-27 19:23 UTC (permalink / raw)
  To: James Bottomley; +Cc: SCSI Mailing List, Andrew Morton

> The best fix, I think is to eliminate this timer from the driver.  All
> it was doing was providing a time limited queue freeze for the BUSY and
> QUEUE_FULL on no outstanding commands cases.  These are handled
> correctly by the mid-layer, so there's no need for the driver do try to
> handle them on it's own.

There are two flaws with your analysis:

1) The mid-layer doesn't correctly handle this situation.  The mid-layer
   uses blk_plug_device() to implement this behavior rather than
   blk_stop_queue().  The leaves the implementation vulnerable to
   any code that does a manual unplug (e.g. the SCSI scan code) or a
   blk_run_queues() (e.g. MD) which can alter the duration of the delay.
   While the host may be able to tune the delay to HZ, the default values
   are not.  I'm also not sure that leaving the default blk_unplug delay
   as 0 gives you a deterministic jiffy long delay before the request
   function will be hit again.

2) These drivers both include Domain Validation support.  These commands
   do not go through the mid-layer at all yet require proper queue full
   and busy handling.

--
Justin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-27 19:23 ` Justin T. Gibbs
@ 2004-02-27 19:34   ` James Bottomley
  2004-02-27 20:50     ` Justin T. Gibbs
  2004-02-28  2:40   ` Jeff Garzik
  1 sibling, 1 reply; 29+ messages in thread
From: James Bottomley @ 2004-02-27 19:34 UTC (permalink / raw)
  To: Justin T. Gibbs; +Cc: SCSI Mailing List, Andrew Morton

On Fri, 2004-02-27 at 13:23, Justin T. Gibbs wrote:
> 1) The mid-layer doesn't correctly handle this situation.  The mid-layer
>    uses blk_plug_device() to implement this behavior rather than
>    blk_stop_queue().  The leaves the implementation vulnerable to
>    any code that does a manual unplug (e.g. the SCSI scan code) or a
>    blk_run_queues() (e.g. MD) which can alter the duration of the delay.
>    While the host may be able to tune the delay to HZ, the default values
>    are not.  I'm also not sure that leaving the default blk_unplug delay
>    as 0 gives you a deterministic jiffy long delay before the request
>    function will be hit again.

There's no guaranteed minimum delay time required by the specs.  We
pause for a predetermined number of unplugs if you look at the code,
which is actually under the control of the driver.  The trade off is
that we want to try faster under heavy I/O pressure, so the delay isn't
fixed, it's bounded by the I/O pressure in the system, which is as it
should be.

> 2) These drivers both include Domain Validation support.  These commands
>    do not go through the mid-layer at all yet require proper queue full
>    and busy handling.

Well, if the driver used the correct command handling infrastructure for
that, they would go through the mid-layer and there would be no problem.

James



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-27 19:34   ` James Bottomley
@ 2004-02-27 20:50     ` Justin T. Gibbs
  2004-02-28 15:39       ` James Bottomley
  0 siblings, 1 reply; 29+ messages in thread
From: Justin T. Gibbs @ 2004-02-27 20:50 UTC (permalink / raw)
  To: James Bottomley; +Cc: SCSI Mailing List, Andrew Morton

> There's no guaranteed minimum delay time required by the specs.  We
> pause for a predetermined number of unplugs if you look at the code,
> which is actually under the control of the driver.  The trade off is
> that we want to try faster under heavy I/O pressure, so the delay isn't
> fixed, it's bounded by the I/O pressure in the system, which is as it
> should be.

Well, experience shows that if you implement a SCSI system based solely
on the spec, it won't work.

There are lots of devices out there that require a delay of at least
250ms in order to not deadlock their internal SCSI processor.  The
I/O load of the system has no bearing on when a device will become
"unbusy" (we can't even say why it is "busy"), so I fail to see why
it should have any effect on how long we wait in response to this
condition.

>> 2) These drivers both include Domain Validation support.  These commands
>>    do not go through the mid-layer at all yet require proper queue full
>>    and busy handling.
> 
> Well, if the driver used the correct command handling infrastructure for
> that, they would go through the mid-layer and there would be no problem.

In order to issue a DV command to the end device via the mid-layer, the
host queue and the device queue must not be blocked.  But, for DV to be
effective, it must be the only activity occurring on that device.  How do
you reconcile the two while using the mid-layer to do your I/O?  The
mid-layer has no concept of allowing a client to freeze the queue,
wait for the active count to go to zero, effectively pre-empt
the command stream with a series of special commands, and then unblock
everyone else only at the end.  The closest the mid-layer comes to this
is in some of its error recovery handling but those are internal
interfaces.

--
Justin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-27 19:23 ` Justin T. Gibbs
  2004-02-27 19:34   ` James Bottomley
@ 2004-02-28  2:40   ` Jeff Garzik
  2004-02-28  9:25     ` Jens Axboe
  1 sibling, 1 reply; 29+ messages in thread
From: Jeff Garzik @ 2004-02-28  2:40 UTC (permalink / raw)
  To: Justin T. Gibbs; +Cc: James Bottomley, SCSI Mailing List, Andrew Morton

Justin T. Gibbs wrote:
> 1) The mid-layer doesn't correctly handle this situation.  The mid-layer
>    uses blk_plug_device() to implement this behavior rather than
>    blk_stop_queue().  The leaves the implementation vulnerable to
>    any code that does a manual unplug (e.g. the SCSI scan code) or a
>    blk_run_queues() (e.g. MD) which can alter the duration of the delay.


hum.

Long term we definitely want to use blk_{start,stop}_queue().  I had to 
work on Jens for a little while before he was kind enough to add it :) 
so let's put it to use.

	Jeff




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-28  2:40   ` Jeff Garzik
@ 2004-02-28  9:25     ` Jens Axboe
  2004-02-28 23:50       ` Jeff Garzik
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2004-02-28  9:25 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

On Fri, Feb 27 2004, Jeff Garzik wrote:
> Justin T. Gibbs wrote:
> >1) The mid-layer doesn't correctly handle this situation.  The mid-layer
> >   uses blk_plug_device() to implement this behavior rather than
> >   blk_stop_queue().  The leaves the implementation vulnerable to
> >   any code that does a manual unplug (e.g. the SCSI scan code) or a
> >   blk_run_queues() (e.g. MD) which can alter the duration of the delay.
> 
> 
> hum.
> 
> Long term we definitely want to use blk_{start,stop}_queue().  I had to 
> work on Jens for a little while before he was kind enough to add it :) 
> so let's put it to use.

The only nasty right now is that you cannot unconditionally call
blk_stop_queue(), it requires a manual blk_start_queue() on io
completion which will only happen if io is pending of course. I just got
an idea on how to do that properly, I'll make sure it works in any case.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-27 20:50     ` Justin T. Gibbs
@ 2004-02-28 15:39       ` James Bottomley
  2004-02-29 19:26         ` Justin T. Gibbs
  0 siblings, 1 reply; 29+ messages in thread
From: James Bottomley @ 2004-02-28 15:39 UTC (permalink / raw)
  To: Justin T. Gibbs; +Cc: SCSI Mailing List, Andrew Morton

On Fri, 2004-02-27 at 14:50, Justin T. Gibbs wrote:
> Well, experience shows that if you implement a SCSI system based solely

Heh, well, I won't disagree with that.

> There are lots of devices out there that require a delay of at least
> 250ms in order to not deadlock their internal SCSI processor.  The
> I/O load of the system has no bearing on when a device will become
> "unbusy" (we can't even say why it is "busy"), so I fail to see why
> it should have any effect on how long we wait in response to this
> condition.

Could you give the most common example ... I'll see if I can persuade
the OSDL test people to try it out with the current stack?

What we currently do is by design ... on busy or queue full at zero
depth we pause for three unplugs.  The first will be the returning queue
unplug, but the other two depend on the I/O pressure or the unplug
timer.  If you tell me what the inquiry strings of these devices are, I
can blacklist them to have a much larger max_device_blocked count, so if
there is a problem with them, *all* drivers will work rather than just
the Adaptec ones.

> In order to issue a DV command to the end device via the mid-layer, the
> host queue and the device queue must not be blocked.  But, for DV to be
> effective, it must be the only activity occurring on that device.  How do
> you reconcile the two while using the mid-layer to do your I/O?  The
> mid-layer has no concept of allowing a client to freeze the queue,
> wait for the active count to go to zero, effectively pre-empt
> the command stream with a series of special commands, and then unblock
> everyone else only at the end.  The closest the mid-layer comes to this
> is in some of its error recovery handling but those are internal
> interfaces.

But domain validation is a pretty intrusive thing.  It's only really
supposed to be run in two places:

1. At start of day, which you should do from slave_configure, where you
are guaranteed that nothing else is using the device

2. On indication of transport problems.  This you would run for a single
target from the bus or device reset handler after issuing the command
and pausing for the settle time (OK, that's bad because the settle time
is also built into the error handler, but that will improve when error
handling becomes more transport specific and I can build domain
validation directly into the SPI transport error handling).

In both of these cases, you are guaranteed a quiescent device queue, so
I don't see what the problem is.

James

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-28  9:25     ` Jens Axboe
@ 2004-02-28 23:50       ` Jeff Garzik
  2004-02-29  9:13         ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Jeff Garzik @ 2004-02-28 23:50 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

Jens Axboe wrote:
> The only nasty right now is that you cannot unconditionally call
> blk_stop_queue(), it requires a manual blk_start_queue() on io
> completion which will only happen if io is pending of course. I just got
> an idea on how to do that properly, I'll make sure it works in any case.

hmmm...  I thought the model was to call blk_stop_queue() when you can 
accept no more requests, and call blk_start_queue() once you can start 
accepting more requests.

That would imply that blk_start_queue() is OK when there is no more I/O 
pending...

	Jeff




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-28 23:50       ` Jeff Garzik
@ 2004-02-29  9:13         ` Jens Axboe
  2004-02-29 16:21           ` Jeff Garzik
  2004-02-29 18:57           ` Justin T. Gibbs
  0 siblings, 2 replies; 29+ messages in thread
From: Jens Axboe @ 2004-02-29  9:13 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

On Sat, Feb 28 2004, Jeff Garzik wrote:
> Jens Axboe wrote:
> >The only nasty right now is that you cannot unconditionally call
> >blk_stop_queue(), it requires a manual blk_start_queue() on io
> >completion which will only happen if io is pending of course. I just got
> >an idea on how to do that properly, I'll make sure it works in any case.
> 
> hmmm...  I thought the model was to call blk_stop_queue() when you can 
> accept no more requests, and call blk_start_queue() once you can start 
> accepting more requests.
> 
> That would imply that blk_start_queue() is OK when there is no more I/O 
> pending...

That is the model. The problem is that you don't have an opportunitty to
call blk_start_queue() if there's no more IO pending (you typicall do it
from io completion). There's no problem with doing that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29  9:13         ` Jens Axboe
@ 2004-02-29 16:21           ` Jeff Garzik
  2004-02-29 16:39             ` Jens Axboe
  2004-02-29 18:57           ` Justin T. Gibbs
  1 sibling, 1 reply; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 16:21 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

Jens Axboe wrote:
> On Sat, Feb 28 2004, Jeff Garzik wrote:
> 
>>Jens Axboe wrote:
>>
>>>The only nasty right now is that you cannot unconditionally call
>>>blk_stop_queue(), it requires a manual blk_start_queue() on io
>>>completion which will only happen if io is pending of course. I just got
>>>an idea on how to do that properly, I'll make sure it works in any case.
>>
>>hmmm...  I thought the model was to call blk_stop_queue() when you can 
>>accept no more requests, and call blk_start_queue() once you can start 
>>accepting more requests.
>>
>>That would imply that blk_start_queue() is OK when there is no more I/O 
>>pending...
> 
> 
> That is the model. The problem is that you don't have an opportunitty to
> call blk_start_queue() if there's no more IO pending (you typicall do it
> from io completion). There's no problem with doing that.

Help me out here, I must be dumb :)

If you call blk_start_queue() from io completion -- as you should -- 
then it gets called unconditionally, regardless of whether there is IO 
pending or not.  The LLD should only care if it can accept more 
requests.  The opportunity seems to be there, to me...

	Jeff



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 16:21           ` Jeff Garzik
@ 2004-02-29 16:39             ` Jens Axboe
  2004-02-29 17:28               ` Jeff Garzik
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2004-02-29 16:39 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

On Sun, Feb 29 2004, Jeff Garzik wrote:
> Jens Axboe wrote:
> >On Sat, Feb 28 2004, Jeff Garzik wrote:
> >
> >>Jens Axboe wrote:
> >>
> >>>The only nasty right now is that you cannot unconditionally call
> >>>blk_stop_queue(), it requires a manual blk_start_queue() on io
> >>>completion which will only happen if io is pending of course. I just got
> >>>an idea on how to do that properly, I'll make sure it works in any case.
> >>
> >>hmmm...  I thought the model was to call blk_stop_queue() when you can 
> >>accept no more requests, and call blk_start_queue() once you can start 
> >>accepting more requests.
> >>
> >>That would imply that blk_start_queue() is OK when there is no more I/O 
> >>pending...
> >
> >
> >That is the model. The problem is that you don't have an opportunitty to
> >call blk_start_queue() if there's no more IO pending (you typicall do it
> >from io completion). There's no problem with doing that.
> 
> Help me out here, I must be dumb :)

It's probably my fault, I didn't list all the conditions carefully
enough...

> If you call blk_start_queue() from io completion -- as you should -- 
> then it gets called unconditionally, regardless of whether there is IO 
> pending or not.  The LLD should only care if it can accept more 
> requests.  The opportunity seems to be there, to me...

You are still assuming you have io pending when you call
blk_stop_queue(). If some condition prevents you from queueing the first
request, you call blk_stop_queue() without having anything to start the
queue again.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 16:39             ` Jens Axboe
@ 2004-02-29 17:28               ` Jeff Garzik
  2004-02-29 17:55                 ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 17:28 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

Jens Axboe wrote:
> On Sun, Feb 29 2004, Jeff Garzik wrote:
>>If you call blk_start_queue() from io completion -- as you should -- 
>>then it gets called unconditionally, regardless of whether there is IO 
>>pending or not.  The LLD should only care if it can accept more 
>>requests.  The opportunity seems to be there, to me...
> 
> 
> You are still assuming you have io pending when you call
> blk_stop_queue(). If some condition prevents you from queueing the first
> request, you call blk_stop_queue() without having anything to start the
> queue again.

ok, I got it.  I thought your "io pending" was referring to requests in 
the request_queue, not requests active in the LLD.

Still, this is the responsibility of the LLD, right?  For example, 
before any requests are ever sent to the request_queue, the LLD may 
choose to call blk_stop_queue() to prevent queueing while it probes or 
resets the bus.  That would be a case where there is no io pending, for 
both definitions of the phrase :)  And it would be up to the LLD to call 
blk_start_queue() again.  Right?

	Jeff

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 17:28               ` Jeff Garzik
@ 2004-02-29 17:55                 ` Jens Axboe
  0 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2004-02-29 17:55 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

On Sun, Feb 29 2004, Jeff Garzik wrote:
> Jens Axboe wrote:
> >On Sun, Feb 29 2004, Jeff Garzik wrote:
> >>If you call blk_start_queue() from io completion -- as you should -- 
> >>then it gets called unconditionally, regardless of whether there is IO 
> >>pending or not.  The LLD should only care if it can accept more 
> >>requests.  The opportunity seems to be there, to me...
> >
> >
> >You are still assuming you have io pending when you call
> >blk_stop_queue(). If some condition prevents you from queueing the first
> >request, you call blk_stop_queue() without having anything to start the
> >queue again.
> 
> ok, I got it.  I thought your "io pending" was referring to requests in 
> the request_queue, not requests active in the LLD.
> 
> Still, this is the responsibility of the LLD, right?  For example, 
> before any requests are ever sent to the request_queue, the LLD may 
> choose to call blk_stop_queue() to prevent queueing while it probes or 
> resets the bus.  That would be a case where there is no io pending, for 
> both definitions of the phrase :)  And it would be up to the LLD to call 
> blk_start_queue() again.  Right?

Yeah that's precisely right. So right now it means that the LLD has to
arm some sort of timer to trigger a blk_start_queue(), _or_ call
blk_plug_device() instead of blk_stop_queue() for this particular case.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29  9:13         ` Jens Axboe
  2004-02-29 16:21           ` Jeff Garzik
@ 2004-02-29 18:57           ` Justin T. Gibbs
  2004-02-29 19:00             ` Jeff Garzik
  2004-02-29 20:04             ` Jens Axboe
  1 sibling, 2 replies; 29+ messages in thread
From: Justin T. Gibbs @ 2004-02-29 18:57 UTC (permalink / raw)
  To: Jens Axboe, Jeff Garzik; +Cc: James Bottomley, SCSI Mailing List, Andrew Morton

>> That would imply that blk_start_queue() is OK when there is no more I/O 
>> pending...
> 
> That is the model. The problem is that you don't have an opportunitty to
> call blk_start_queue() if there's no more IO pending (you typicall do it
> from io completion). There's no problem with doing that.

You just need to schedule a timer to do the blk_start_queue() call.
The point of doing this is so that the delay is deterministic.

--
Justin


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 18:57           ` Justin T. Gibbs
@ 2004-02-29 19:00             ` Jeff Garzik
  2004-02-29 19:28               ` Justin T. Gibbs
  2004-02-29 20:04             ` Jens Axboe
  1 sibling, 1 reply; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 19:00 UTC (permalink / raw)
  To: Justin T. Gibbs
  Cc: Jens Axboe, James Bottomley, SCSI Mailing List, Andrew Morton

Justin T. Gibbs wrote:
>>>That would imply that blk_start_queue() is OK when there is no more I/O 
>>>pending...
>>
>>That is the model. The problem is that you don't have an opportunitty to
>>call blk_start_queue() if there's no more IO pending (you typicall do it
>>from io completion). There's no problem with doing that.
> 
> 
> You just need to schedule a timer to do the blk_start_queue() call.
> The point of doing this is so that the delay is deterministic.


Now that I understand what Jens is referring to...  this depends on the 
LLD.  In network drivers (from which the model I speak of originated) 
and in the upcoming block-level SATA drivers, a delay not needed.  The 
LLD calls blk_start_queue() when it is ready to receive requests -- 
which is not a time-based constraint.

	Jeff




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-28 15:39       ` James Bottomley
@ 2004-02-29 19:26         ` Justin T. Gibbs
  2004-02-29 21:10           ` James Bottomley
  2004-02-29 21:25           ` James Bottomley
  0 siblings, 2 replies; 29+ messages in thread
From: Justin T. Gibbs @ 2004-02-29 19:26 UTC (permalink / raw)
  To: James Bottomley; +Cc: SCSI Mailing List, Andrew Morton

>> There are lots of devices out there that require a delay of at least
>> 250ms in order to not deadlock their internal SCSI processor.  The
>> I/O load of the system has no bearing on when a device will become
>> "unbusy" (we can't even say why it is "busy"), so I fail to see why
>> it should have any effect on how long we wait in response to this
>> condition.
> 
> Could you give the most common example ... I'll see if I can persuade
> the OSDL test people to try it out with the current stack?

I don't recall exact devices, model numbers etc, but the behavior
I has been observed this behavior on scanners, CDROM drives, and
several external RAID controllers from different vendors.

> If you tell me what the inquiry strings of these devices are, I
> can blacklist them to have a much larger max_device_blocked count, so if
> there is a problem with them, *all* drivers will work rather than just
> the Adaptec ones.

This is not something worth black-listing.  It is not a special case.
Busy and/or queue full with no I/O pending is a rare event.  The user
will never notice this in practice other than their devices that need
this delay will work correctly in this situation.  To put it another
way, the aic7xxx and aic79xx drivers have enforced this delay for almost
four years in Linux and I have yet to have someone complain that they
had poor device performance due to this delay.  It is just not worth
the code complexity or potential of missing a broken device to "optimize"
this delay.

>> In order to issue a DV command to the end device via the mid-layer, the
>> host queue and the device queue must not be blocked.  But, for DV to be
>> effective, it must be the only activity occurring on that device.  How do
>> you reconcile the two while using the mid-layer to do your I/O?  The
>> mid-layer has no concept of allowing [this] ...
>
> But domain validation is a pretty intrusive thing.  It's only really
> supposed to be run in two places:
> 
> 1. At start of day, which you should do from slave_configure, where you
> are guaranteed that nothing else is using the device
> 
> 2. On indication of transport problems.  This you would run for a single
> target from the bus or device reset handler after issuing the command
> and pausing for the settle time (OK, that's bad because the settle time
> is also built into the error handler, but that will improve when error
> handling becomes more transport specific and I can build domain
> validation directly into the SPI transport error handling).
> 
> In both of these cases, you are guaranteed a quiescent device queue, so
> I don't see what the problem is.

First of all, domain validation may occur without the mid-layer ever
seeing a timeout.  The driver records transmission errors regardless of
whether they are successfully recovered (target performs a manual restore
data pointers), are properly reported via sense information, or result
in a timeout.  After a predetermined threshold, the driver will "fallback"
to a slower speed and re-perform domain validation.

Domain validation also occurs any time the driver believes the end-target
may have changed.  Typically this occurs due to a selection timeout or the
target reporting a power-on or inquiry change event.  The Linux mid-layer
is not very careful about determining if a device has changed.  This is
especially true during error recovery where unit attention conditions
are routinely discarded without any type of processing.  If the end
device has changed, the optimal negotiated transfer rate has likely
also changed, which is why domain validation is required.

Even without these issues, I don't see how you expect a driver to do
domain validation through the mid-layer.  If domain validation occurs
before the mid-layer scans for devices, then by definition, you can't
go through the mid-layer.  If you wait until after the device is found,
you run into the same problem of making sure the queue is frozen yet allows
your domain validation code to issue commands through the mid-layer.  
In the case of doing this in an error-handler, you again are not able
to issue these commands through the mid-layer - the queue is blocked.
This means that any correction to the queue full or busy behavior in the
mid-layer does not address the need for a functional delay in domain
validation case.

While it is certainly possible to move domain validation into the mid-layer
and remove it from LLDs, I doubt that is the type of change you want to
include in 2.6.  It would require at minimum:

1) A generic method for fetching and changing the transport parameters for
   devices attached to LLDs.

2) A generic method for freezing the execution queue and "single-stepping"
   things like domain validation and error recovery commands.

3) A way to wait for the active count on a device or target (including all
   luns) to drop to 0.

Don't get me wrong.  One of my main complaints of the Linux SCSI layer is
its intrinsic lack of consistency.  In my opinion, the code path for sending
commands to and processing the results from a LLD should be identical
regardless of whether the commands are from error recovery, domain validation,
a peripheral driver, or the probe code.  Instead we only get certain behavior
(like a bus settle delay) for some clients and not others.  Fixing this
would allow the behavior to be defined in one place and these "workarounds"
could be removed from the LLDs.  Unfortunately, the LLDs are the only "code
path" common to all of the clients of the mid-layer, so you wind up with the
LLDs enforcing correct behavior.

--
Justin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:00             ` Jeff Garzik
@ 2004-02-29 19:28               ` Justin T. Gibbs
  2004-02-29 19:37                 ` Jeff Garzik
  0 siblings, 1 reply; 29+ messages in thread
From: Justin T. Gibbs @ 2004-02-29 19:28 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Jens Axboe, James Bottomley, SCSI Mailing List, Andrew Morton

>> You just need to schedule a timer to do the blk_start_queue() call.
>> The point of doing this is so that the delay is deterministic.
> 
> 
> Now that I understand what Jens is referring to...  this depends on the LLD.

Exactly.  The block layer shouldn't be involved in this at all.  In the
particular case I was talking about, busy or queue full with no I/O pending,
you need to schedule a timer to unfreeze the queue since there is no
other event that can be used to give you a deterministic delay.

--
Justin


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:28               ` Justin T. Gibbs
@ 2004-02-29 19:37                 ` Jeff Garzik
  2004-02-29 19:42                   ` Justin T. Gibbs
  2004-02-29 19:43                   ` Jeff Garzik
  0 siblings, 2 replies; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 19:37 UTC (permalink / raw)
  To: Justin T. Gibbs
  Cc: Jens Axboe, James Bottomley, SCSI Mailing List, Andrew Morton

Justin T. Gibbs wrote:
>>>You just need to schedule a timer to do the blk_start_queue() call.
>>>The point of doing this is so that the delay is deterministic.
>>
>>
>>Now that I understand what Jens is referring to...  this depends on the LLD.
> 
> 
> Exactly.  The block layer shouldn't be involved in this at all.  In the
> particular case I was talking about, busy or queue full with no I/O pending,
> you need to schedule a timer to unfreeze the queue since there is no
> other event that can be used to give you a deterministic delay.

Agreed for busy.

But queue full?  I hope you clear out the queue first and do some error 
handling, before calling blk_start_queue()?

Normally, calling blk_start_queue() when the queue is full would be the 
wrong thing to do :)  You call blk_stop_queue() when the queue becomes 
full, then blk_start_queue() only when the LLD's hardware signals it has 
completed a transaction...

	Jeff




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:37                 ` Jeff Garzik
@ 2004-02-29 19:42                   ` Justin T. Gibbs
  2004-02-29 19:44                     ` Jeff Garzik
  2004-02-29 19:43                   ` Jeff Garzik
  1 sibling, 1 reply; 29+ messages in thread
From: Justin T. Gibbs @ 2004-02-29 19:42 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Jens Axboe, James Bottomley, SCSI Mailing List, Andrew Morton

>> Exactly.  The block layer shouldn't be involved in this at all.  In the
>> particular case I was talking about, busy or queue full with no I/O pending,
                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> Agreed for busy.
> 
> But queue full?

Some SCSI devices send Queue Full instead of Busy in the case of
a "busy" condition with no I/O pending.

--
Justin


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:37                 ` Jeff Garzik
  2004-02-29 19:42                   ` Justin T. Gibbs
@ 2004-02-29 19:43                   ` Jeff Garzik
  1 sibling, 0 replies; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 19:43 UTC (permalink / raw)
  To: Justin T. Gibbs
  Cc: Jens Axboe, James Bottomley, SCSI Mailing List, Andrew Morton

Jeff Garzik wrote:
> But queue full?  I hope you clear out the queue first and do some error 
> handling, before calling blk_start_queue()?
> 
> Normally, calling blk_start_queue() when the queue is full would be the 
> wrong thing to do :)  You call blk_stop_queue() when the queue becomes 
> full, then blk_start_queue() only when the LLD's hardware signals it has 
> completed a transaction...

Actually, thinking more about it, I guess this depends (again) on the LLD...

when does queue-full occur, when the LLD itself cannot clear the 
condition?  During an error/failure condition?

	Jeff



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:42                   ` Justin T. Gibbs
@ 2004-02-29 19:44                     ` Jeff Garzik
  2004-02-29 20:06                       ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 19:44 UTC (permalink / raw)
  To: Justin T. Gibbs
  Cc: Jens Axboe, James Bottomley, SCSI Mailing List, Andrew Morton

Justin T. Gibbs wrote:
>>>Exactly.  The block layer shouldn't be involved in this at all.  In the
>>>particular case I was talking about, busy or queue full with no I/O pending,
> 
>                                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> 
>>Agreed for busy.
>>
>>But queue full?
> 
> 
> Some SCSI devices send Queue Full instead of Busy in the case of
> a "busy" condition with no I/O pending.


Gotcha, thanks.

	Jeff




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 18:57           ` Justin T. Gibbs
  2004-02-29 19:00             ` Jeff Garzik
@ 2004-02-29 20:04             ` Jens Axboe
  1 sibling, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2004-02-29 20:04 UTC (permalink / raw)
  To: Justin T. Gibbs
  Cc: Jeff Garzik, James Bottomley, SCSI Mailing List, Andrew Morton

On Sun, Feb 29 2004, Justin T. Gibbs wrote:
> >> That would imply that blk_start_queue() is OK when there is no more I/O 
> >> pending...
> > 
> > That is the model. The problem is that you don't have an opportunitty to
> > call blk_start_queue() if there's no more IO pending (you typicall do it
> > from io completion). There's no problem with doing that.
> 
> You just need to schedule a timer to do the blk_start_queue() call.
> The point of doing this is so that the delay is deterministic.

There is no good pre-determined delay... If you plug the device when you
cannot simply use blk_stop_queue() (or prefer not to), then you get the
usual q->unplug_delay delay (3ms).

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:44                     ` Jeff Garzik
@ 2004-02-29 20:06                       ` Jens Axboe
  2004-02-29 20:20                         ` Jeff Garzik
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2004-02-29 20:06 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

On Sun, Feb 29 2004, Jeff Garzik wrote:
> Justin T. Gibbs wrote:
> >>>Exactly.  The block layer shouldn't be involved in this at all.  In the
> >>>particular case I was talking about, busy or queue full with no I/O 
> >>>pending,
> >
> >                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >
> >
> >>Agreed for busy.
> >>
> >>But queue full?
> >
> >
> >Some SCSI devices send Queue Full instead of Busy in the case of
> >a "busy" condition with no I/O pending.
> 
> 
> Gotcha, thanks.

Don't forget that this isn't the only condition. What about pci dma
mapping failures?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 20:06                       ` Jens Axboe
@ 2004-02-29 20:20                         ` Jeff Garzik
  2004-02-29 20:27                           ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 20:20 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

Jens Axboe wrote:
> Don't forget that this isn't the only condition. What about pci dma
> mapping failures?

hehe, granted there might be other conditions, but for PCI DMA mapping 
specifically, it is (unfortunately) currently defined to not fail.

That wants changing for ppc64 and x86-64 iommu, which are a -lot- more 
likely to fail an iommu transaction than parisc or sparc64... but that's 
the way it is for now.  Way too many drivers assume PCI DMA mapping 
always succeeds.

	Jeff

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 20:20                         ` Jeff Garzik
@ 2004-02-29 20:27                           ` Jens Axboe
  2004-02-29 20:28                             ` Jeff Garzik
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2004-02-29 20:27 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

On Sun, Feb 29 2004, Jeff Garzik wrote:
> Jens Axboe wrote:
> >Don't forget that this isn't the only condition. What about pci dma
> >mapping failures?
> 
> 
> hehe, granted there might be other conditions, but for PCI DMA mapping 
> specifically, it is (unfortunately) currently defined to not fail.

Which is bogus and must be changed anyways... Besides, that was just one
example. There could be other resource shortages that can happen which
prevent queueing.

> That wants changing for ppc64 and x86-64 iommu, which are a -lot- more 
> likely to fail an iommu transaction than parisc or sparc64... but that's 
> the way it is for now.  Way too many drivers assume PCI DMA mapping 
> always succeeds.

I just don't like _at all_ that it's assumed it cannot fail, when in
fact it _can_ fail. Likeliness just means it's less of a problem for
those archs, it's still an eye sore.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 20:27                           ` Jens Axboe
@ 2004-02-29 20:28                             ` Jeff Garzik
  0 siblings, 0 replies; 29+ messages in thread
From: Jeff Garzik @ 2004-02-29 20:28 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Justin T. Gibbs, James Bottomley, SCSI Mailing List,
	Andrew Morton

Jens Axboe wrote:
> On Sun, Feb 29 2004, Jeff Garzik wrote:
> 
>>Jens Axboe wrote:
>>
>>>Don't forget that this isn't the only condition. What about pci dma
>>>mapping failures?
>>
>>
>>hehe, granted there might be other conditions, but for PCI DMA mapping 
>>specifically, it is (unfortunately) currently defined to not fail.
> 
> 
> Which is bogus and must be changed anyways... Besides, that was just one
> example. There could be other resource shortages that can happen which
> prevent queueing.

Agreed.


>>That wants changing for ppc64 and x86-64 iommu, which are a -lot- more 
>>likely to fail an iommu transaction than parisc or sparc64... but that's 
>>the way it is for now.  Way too many drivers assume PCI DMA mapping 
>>always succeeds.
> 
> 
> I just don't like _at all_ that it's assumed it cannot fail, when in
> fact it _can_ fail. Likeliness just means it's less of a problem for
> those archs, it's still an eye sore.

I agree with you completely...  but the current PCI DMA API is reality. 
  We just need to change reality.  :)

	Jeff




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:26         ` Justin T. Gibbs
@ 2004-02-29 21:10           ` James Bottomley
  2004-02-29 22:23             ` Justin T. Gibbs
  2004-02-29 21:25           ` James Bottomley
  1 sibling, 1 reply; 29+ messages in thread
From: James Bottomley @ 2004-02-29 21:10 UTC (permalink / raw)
  To: Justin T. Gibbs; +Cc: SCSI Mailing List, Andrew Morton

On Sun, 2004-02-29 at 13:26, Justin T. Gibbs wrote:
> This is not something worth black-listing.  It is not a special case.
> Busy and/or queue full with no I/O pending is a rare event.  The user
> will never notice this in practice other than their devices that need
> this delay will work correctly in this situation.  To put it another
> way, the aic7xxx and aic79xx drivers have enforced this delay for almost
> four years in Linux and I have yet to have someone complain that they
> had poor device performance due to this delay.  It is just not worth
> the code complexity or potential of missing a broken device to "optimize"
> this delay.

Well, actually, it is: there are certain array vendors (who should
justifiably remain nameless) who implemented the array queue resources
as global controller pools.  Thus, under heavy I/O to multiple LUNs,
they become highly likely to throw BUSY or QUEUE FULL at zero depth and
do it quite often.  Pausing for fractions of a second here will cause
nasty performance glitches in the benchmarks.

What about putting a rate limited printk in when the stutter is
triggered?  That way if someone still has one of the problem devices we
should have a very good trace when they report the hang.

James

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 19:26         ` Justin T. Gibbs
  2004-02-29 21:10           ` James Bottomley
@ 2004-02-29 21:25           ` James Bottomley
  1 sibling, 0 replies; 29+ messages in thread
From: James Bottomley @ 2004-02-29 21:25 UTC (permalink / raw)
  To: Justin T. Gibbs; +Cc: SCSI Mailing List, Andrew Morton

On Sun, 2004-02-29 at 13:26, Justin T. Gibbs wrote:
> While it is certainly possible to move domain validation into the mid-layer
> and remove it from LLDs, I doubt that is the type of change you want to
> include in 2.6.  It would require at minimum:

Well, possibly not, but it's certainly something to begin thinking about
now.

> 1) A generic method for fetching and changing the transport parameters for
>    devices attached to LLDs.

That's what the embryonic transport classes will become for SPI and when
they finally have this capability, then I'll try to put domain
validation into them.

> 2) A generic method for freezing the execution queue and "single-stepping"
>    things like domain validation and error recovery commands.

We have that all ready in the mid-layer as part of error recovery.  This
would simply be an extension for domain validation.

> 3) A way to wait for the active count on a device or target (including all
>    luns) to drop to 0.

That too is done in error recovery (admittedly at the host level rather
than the device level).

> Don't get me wrong.  One of my main complaints of the Linux SCSI layer is
> its intrinsic lack of consistency.  In my opinion, the code path for sending
> commands to and processing the results from a LLD should be identical
> regardless of whether the commands are from error recovery, domain validation,
> a peripheral driver, or the probe code.  Instead we only get certain behavior
> (like a bus settle delay) for some clients and not others.  Fixing this
> would allow the behavior to be defined in one place and these "workarounds"
> could be removed from the LLDs.  Unfortunately, the LLDs are the only "code
> path" common to all of the clients of the mid-layer, so you wind up with the
> LLDs enforcing correct behavior.

Well, I haven't claimed SCSI to be anything more than a work in
progress.  I do, however, think it is in better shape than it was a
couple of years ago.  What I really need is for people actually to try
out the mid-layer code where they would have worked around it in 2.4.

James



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Fix aic7xxx del_timer_sync() deadlock
  2004-02-29 21:10           ` James Bottomley
@ 2004-02-29 22:23             ` Justin T. Gibbs
  0 siblings, 0 replies; 29+ messages in thread
From: Justin T. Gibbs @ 2004-02-29 22:23 UTC (permalink / raw)
  To: James Bottomley; +Cc: SCSI Mailing List, Andrew Morton

> On Sun, 2004-02-29 at 13:26, Justin T. Gibbs wrote:
>> This is not something worth black-listing.  It is not a special case.
>> Busy and/or queue full with no I/O pending is a rare event.  The user
>> will never notice this in practice other than their devices that need
>> this delay will work correctly in this situation.  To put it another
>> way, the aic7xxx and aic79xx drivers have enforced this delay for almost
>> four years in Linux and I have yet to have someone complain that they
>> had poor device performance due to this delay.  It is just not worth
>> the code complexity or potential of missing a broken device to "optimize"
>> this delay.
> 
> Well, actually, it is: there are certain array vendors (who should
> justifiably remain nameless) who implemented the array queue resources
> as global controller pools.  Thus, under heavy I/O to multiple LUNs,
> they become highly likely to throw BUSY or QUEUE FULL at zero depth and
> do it quite often.  Pausing for fractions of a second here will cause
> nasty performance glitches in the benchmarks.

In the case of heavy I/O from one machine, the OS should be guaranteeing
fare access to resources.  FreeBSD has done this using a round-robin
scheduler since '97.  In my testing with that system, you get a few
queue full or busy events until the scheduler can rebalance and then
no stalls at all.  This is exactly how it should be since busy and
queue full events rob precious bus bandwidth.

If you are using a multi-initiator setup with multiple hosts connecting
to the same controller, you typically are buying from someone who knows
how to build a properly functioning target (Well, either that or you
are going to get exactly what you paid for - bad performance in not
only this situation but most others).  In this case, even if a global
transaction pool is being used, a resource fairness algorithm is employed
so that very quickly the resources are rebalanced based on load.  From
the test matrices I've seen from customers of these types of boxes,
high transactional loads on multiple "completely independent" channels
is one of the first things they do.  There is no tolerance for starving
out a channel except for the first command after a long period of no
activity.

> What about putting a rate limited printk in when the stutter is
> triggered?  That way if someone still has one of the problem devices we
> should have a very good trace when they report the hang.

Feel free to log repeated busy events as you see fit, but so long
as the status of the last command sent that caused a device to go
offline is printed, I would have all I need to see that the device
was "perpetually busy".

--
Justin

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2004-02-29 22:23 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-02-27 18:26 [PATCH] Fix aic7xxx del_timer_sync() deadlock James Bottomley
2004-02-27 19:23 ` Justin T. Gibbs
2004-02-27 19:34   ` James Bottomley
2004-02-27 20:50     ` Justin T. Gibbs
2004-02-28 15:39       ` James Bottomley
2004-02-29 19:26         ` Justin T. Gibbs
2004-02-29 21:10           ` James Bottomley
2004-02-29 22:23             ` Justin T. Gibbs
2004-02-29 21:25           ` James Bottomley
2004-02-28  2:40   ` Jeff Garzik
2004-02-28  9:25     ` Jens Axboe
2004-02-28 23:50       ` Jeff Garzik
2004-02-29  9:13         ` Jens Axboe
2004-02-29 16:21           ` Jeff Garzik
2004-02-29 16:39             ` Jens Axboe
2004-02-29 17:28               ` Jeff Garzik
2004-02-29 17:55                 ` Jens Axboe
2004-02-29 18:57           ` Justin T. Gibbs
2004-02-29 19:00             ` Jeff Garzik
2004-02-29 19:28               ` Justin T. Gibbs
2004-02-29 19:37                 ` Jeff Garzik
2004-02-29 19:42                   ` Justin T. Gibbs
2004-02-29 19:44                     ` Jeff Garzik
2004-02-29 20:06                       ` Jens Axboe
2004-02-29 20:20                         ` Jeff Garzik
2004-02-29 20:27                           ` Jens Axboe
2004-02-29 20:28                             ` Jeff Garzik
2004-02-29 19:43                   ` Jeff Garzik
2004-02-29 20:04             ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox