From mboxrd@z Thu Jan  1 00:00:00 1970
From: Mike Anderson <andmike@us.ibm.com>
Subject: [PATCH] scsi_error update take 2
Date: Wed, 19 Feb 2003 01:09:19 -0800
Sender: linux-scsi-owner@vger.kernel.org
Message-ID: <20030219090919.GA2185@beaverton.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Return-path: <linux-scsi-owner@vger.kernel.org>
Received: from westrelay01.boulder.ibm.com (westrelay01.boulder.ibm.com [9.17.194.22])
	by e32.co.us.ibm.com (8.12.7/8.12.2) with ESMTP id h1J97XtU024704
	for <linux-scsi@vger.kernel.org>; Wed, 19 Feb 2003 04:07:33 -0500
Received: from hmsbounty (sig-9-65-40-25.mts.ibm.com [9.65.40.25])
	by westrelay01.boulder.ibm.com (8.12.3/NCO/VER6.5) with ESMTP id h1J97U7E139692
	for <linux-scsi@vger.kernel.org>; Wed, 19 Feb 2003 02:07:31 -0700
Content-Disposition: inline
List-Id: linux-scsi@vger.kernel.org
To: linux-scsi@vger.kernel.org

This is an update of a previous patch I posted.
http://marc.theaimsgroup.com/?l=linux-scsi&m=104495114103628&w=2

This patch is against scsi-misc-2.5

The updates from the last patch include:
	- Name changes:
		eh_cmd_list => eh_cmd_q
		eh_list => eh_entry

	- Move shost->in_recovery = 0

	- Switch from scsi_retry_command to scsi_queue_insert for retry
	  to solve fast completions / serial start of retries.

	- Use list_splice_init in scsi_unjam_host.

Sorry for the one large patch chunk :-(.

-andmike
--
Michael Anderson
andmike@us.ibm.com

 hosts.c      |   20 --
 hosts.h      |    2 
 scsi.c       |   46 ++++
 scsi.h       |   11 -
 scsi_error.c |  550 +++++++++++++++++++++++++++++++----------------------------
 scsi_lib.c   |    2 
 scsi_syms.c  |    1 
 7 files changed, 351 insertions(+), 281 deletions(-)
-------

diff -Nru a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
--- a/drivers/scsi/hosts.c	Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/hosts.c	Tue Feb 18 09:13:11 2003
@@ -397,6 +397,7 @@
 	spin_lock_init(&shost->default_lock);
 	scsi_assign_lock(shost, &shost->default_lock);
 	INIT_LIST_HEAD(&shost->my_devices);
+	INIT_LIST_HEAD(&shost->eh_cmd_q);
 
 	init_waitqueue_head(&shost->host_wait);
 	shost->dma_channel = 0xff;
@@ -634,22 +635,9 @@
 	spin_lock_irqsave(shost->host_lock, flags);
 	shost->host_busy--;
 	sdev->device_busy--;
-	if (shost->in_recovery && (shost->host_busy == shost->host_failed)) {
-		up(shost->eh_wait);
-		SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
-					  " thread\n"));
-	}
-	spin_unlock_irqrestore(shost->host_lock, flags);
-}
-
-void scsi_host_failed_inc_and_test(struct Scsi_Host *shost)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(shost->host_lock, flags);
-	shost->in_recovery = 1;
-	shost->host_failed++;
-	if (shost->host_busy == shost->host_failed) {
+	if (shost->in_recovery && shost->host_failed &&
+	    (shost->host_busy == shost->host_failed))
+	{
 		up(shost->eh_wait);
 		SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
 					  " thread\n"));
diff -Nru a/drivers/scsi/hosts.h b/drivers/scsi/hosts.h
--- a/drivers/scsi/hosts.h	Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/hosts.h	Tue Feb 18 09:13:11 2003
@@ -384,6 +384,7 @@
     spinlock_t		  default_lock;
     spinlock_t		  *host_lock;
 
+    struct list_head	eh_cmd_q;
     struct task_struct    * ehandler;  /* Error recovery thread. */
     struct semaphore      * eh_wait;   /* The error recovery thread waits on
                                           this. */
@@ -587,7 +588,6 @@
  */
 extern void scsi_host_busy_inc(struct Scsi_Host *, Scsi_Device *);
 extern void scsi_host_busy_dec_and_test(struct Scsi_Host *, Scsi_Device *);
-extern void scsi_host_failed_inc_and_test(struct Scsi_Host *);
 
 /**
  * scsi_find_device - find a device given the host
diff -Nru a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
--- a/drivers/scsi/scsi.c	Tue Feb 18 09:13:12 2003
+++ b/drivers/scsi/scsi.c	Tue Feb 18 09:13:12 2003
@@ -790,13 +790,9 @@
 				if ((status_byte(SCpnt->result) & CHECK_CONDITION) != 0) {
 					SCSI_LOG_MLCOMPLETE(3, print_sense("bh", SCpnt));
 				}
-				if (SCpnt->device->host->eh_wait != NULL) {
-					scsi_eh_eflags_set(SCpnt, SCSI_EH_CMD_FAILED | SCSI_EH_CMD_ERR);
-					SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
-					SCpnt->state = SCSI_STATE_FAILED;
 
-					scsi_host_failed_inc_and_test(SCpnt->device->host);
-				} else {
+				if (!scsi_eh_scmd_add(SCpnt, 0))
+				{
 					/*
 					 * We only get here if the error
 					 * recovery thread has died.
@@ -1296,6 +1292,44 @@
 {
 	sdev->access_count--;
 	module_put(sdev->host->hostt->module);
+}
+
+/**
+ * scsi_set_device_offline - set scsi_device offline
+ * @sdev:	pointer to struct scsi_device to offline. 
+ *
+ * Locks:	host_lock must not be held; scsi_eh_scmd_add() acquires it.
+ **/
+void scsi_set_device_offline(struct scsi_device *sdev)
+{
+	struct scsi_cmnd *scmd;
+	int	cmds_active = 0;
+	unsigned long flags;
+
+	sdev->online = FALSE;
+
+	spin_lock_irqsave(&sdev->list_lock, flags);
+	list_for_each_entry(scmd, &sdev->cmd_list, list) {
+		if (scmd->request && scmd->request->rq_status != RQ_INACTIVE) {
+			/*
+			 * If we are unable to remove the timer, it means
+			 * that the command has already timed out or
+			 * finished.
+			 */
+			if (!scsi_delete_timer(scmd)) {
+				continue;
+			}
+
+			++cmds_active;
+
+			scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD);
+		}
+	}
+	spin_unlock_irqrestore(&sdev->list_lock, flags);
+
+	if (!cmds_active) {
+		/* FIXME: Send online state change hotplug event */
+	}
 }
 
 /*
diff -Nru a/drivers/scsi/scsi.h b/drivers/scsi/scsi.h
--- a/drivers/scsi/scsi.h	Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/scsi.h	Tue Feb 18 09:13:11 2003
@@ -455,6 +455,7 @@
 extern void scsi_slave_detach(struct scsi_device *);
 extern int scsi_device_get(struct scsi_device *);
 extern void scsi_device_put(struct scsi_device *);
+extern void scsi_set_device_offline(struct scsi_device *);
 extern void scsi_done(Scsi_Cmnd * SCpnt);
 extern void scsi_finish_command(Scsi_Cmnd *);
 extern int scsi_retry_command(Scsi_Cmnd *);
@@ -726,6 +727,7 @@
 
 	struct list_head list;  /* scsi_cmnd participates in queue lists */
 
+	struct list_head eh_entry; /* entry for the host eh_cmd_q */
 	int eh_state;		/* Used for state tracking in error handlr */
 	int eh_eflags;		/* Used by error handlr */
 	void (*done) (struct scsi_cmnd *);	/* Mid-level done function */
@@ -850,6 +852,7 @@
  */
 #define SCSI_MLQUEUE_HOST_BUSY   0x1055
 #define SCSI_MLQUEUE_DEVICE_BUSY 0x1056
+#define SCSI_MLQUEUE_EH_RETRY    0x1057
 
 /*
  * old style reset request from external source
@@ -960,12 +963,12 @@
 /*
  * Scsi Error Handler Flags
  */
-#define SCSI_EH_CMD_ERR	0x0001	/* Orig cmd error'd */
-#define SCSI_EH_CMD_FAILED	0x0002	/* Orig cmd error type failed */
-#define SCSI_EH_CMD_TIMEOUT	0x0004	/* Orig cmd error type timeout */
-#define SCSI_EH_REC_TIMEOUT	0x0008	/* Recovery cmd timeout */
+#define SCSI_EH_CANCEL_CMD	0x0001	/* Cancel this cmd */
+#define SCSI_EH_REC_TIMEOUT	0x0002	/* EH retry timed out */
 
 #define SCSI_SENSE_VALID(scmd) ((scmd->sense_buffer[0] & 0x70) == 0x70)
+
+extern int scsi_eh_scmd_add(struct scsi_cmnd *, int);
 
 int scsi_set_medium_removal(Scsi_Device *dev, char state);
 
diff -Nru a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
--- a/drivers/scsi/scsi_error.c	Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/scsi_error.c	Tue Feb 18 09:13:11 2003
@@ -56,6 +56,49 @@
 #define HOST_RESET_SETTLE_TIME  10*HZ
 
 /**
+ * scsi_eh_scmd_add - add scsi cmd to error handling.
+ * @scmd:	scmd to run eh on.
+ * @eh_flag:	optional SCSI_EH flag.
+ *
+ * Return value:
+ *	0 on failure.
+ **/
+int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
+{
+	struct Scsi_Host *shost = scmd->device->host;
+	unsigned long flags;
+
+	if (shost->eh_wait == NULL)
+		return 0;
+
+	spin_lock_irqsave(shost->host_lock, flags);
+
+	scsi_eh_eflags_set(scmd, eh_flag);
+	/*
+	 * FIXME: Can we stop setting owner and state.
+	 */
+	scmd->owner = SCSI_OWNER_ERROR_HANDLER;
+	scmd->state = SCSI_STATE_FAILED;
+	/*
+	 * Set the serial_number_at_timeout to the current
+	 * serial_number
+	 */
+	scmd->serial_number_at_timeout = scmd->serial_number;
+	list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
+	shost->in_recovery = 1;
+	shost->host_failed++;
+	if (shost->host_busy == shost->host_failed) {
+		up(shost->eh_wait);
+		SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
+					  " thread\n"));
+	}
+
+	spin_unlock_irqrestore(shost->host_lock, flags);
+
+	return 1;
+}
+
+/**
  * scsi_add_timer - Start timeout timer for a single scsi command.
  * @scmd:	scsi command that is about to start running.
  * @timeout:	amount of time to allow this command to run.
@@ -131,22 +174,14 @@
  **/
 void scsi_times_out(Scsi_Cmnd *scmd)
 {
-	struct Scsi_Host *shost = scmd->device->host;
-
-	/* Set the serial_number_at_timeout to the current serial_number */
-	scmd->serial_number_at_timeout = scmd->serial_number;
-
-	scsi_eh_eflags_set(scmd, SCSI_EH_CMD_TIMEOUT | SCSI_EH_CMD_ERR);
-
-	if (unlikely(shost->eh_wait == NULL)) {
+	if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) {
 		panic("Error handler thread not present at %p %p %s %d",
-		      scmd, shost, __FILE__, __LINE__);
+		      scmd, scmd->device->host, __FILE__, __LINE__);
 	}
 
-	scsi_host_failed_inc_and_test(shost);
-
 	SCSI_LOG_TIMEOUT(3, printk("Command timed out busy=%d failed=%d\n",
-				   shost->host_busy, shost->host_failed));
+				   scmd->device->host->host_busy,
+				   scmd->device->host->host_failed));
 }
 
 /**
@@ -173,39 +208,40 @@
 #if CONFIG_SCSI_LOGGING
 /**
  * scsi_eh_prt_fail_stats - Log info on failures.
- * @sc_list:	List for failed cmds.
  * @shost:	scsi host being recovered.
+ * @work_q:	Queue of scsi cmds to process.
  **/
-static void scsi_eh_prt_fail_stats(Scsi_Cmnd *sc_list, struct Scsi_Host *shost)
+static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost,
+					  struct list_head *work_q)
 {
-	Scsi_Cmnd *scmd;
-	Scsi_Device *sdev;
+	struct scsi_cmnd *scmd;
+	struct scsi_device *sdev;
 	int total_failures = 0;
 	int cmd_failed = 0;
-	int cmd_timed_out = 0;
+	int cmd_cancel = 0;
 	int devices_failed = 0;
 
 
 	list_for_each_entry(sdev, &shost->my_devices, siblings) {
-		for (scmd = sc_list; scmd; scmd = scmd->bh_next) {
+		list_for_each_entry(scmd, work_q, eh_entry) {
 			if (scmd->device == sdev) {
 				++total_failures;
 				if (scsi_eh_eflags_chk(scmd,
-						       SCSI_EH_CMD_TIMEOUT))
-					++cmd_timed_out;
-				else
+						       SCSI_EH_CANCEL_CMD))
+					++cmd_cancel;
+				else 
 					++cmd_failed;
 			}
 		}
 
-		if (cmd_timed_out || cmd_failed) {
+		if (cmd_cancel || cmd_failed) {
 			SCSI_LOG_ERROR_RECOVERY(3,
 				printk("%s: %d:%d:%d:%d cmds failed: %d,"
-				       " timedout: %d\n",
+				       " cancel: %d\n",
 				       __FUNCTION__, shost->host_no,
 				       sdev->channel, sdev->id, sdev->lun,
-				       cmd_failed, cmd_timed_out));
-			cmd_timed_out = 0;
+				       cmd_failed, cmd_cancel));
+			cmd_cancel = 0;
 			cmd_failed = 0;
 			++devices_failed;
 		}
@@ -218,68 +254,6 @@
 #endif
 
 /**
- * scsi_eh_get_failed - Gather failed cmds.
- * @sc_list:	A pointer to a list for failed cmds.
- * @shost:	Scsi host being recovered.
- *
- * XXX Add opaque interator for device / shost. Investigate direct
- * addition to per eh list on error allowing skipping of this step.
- **/
-static void scsi_eh_get_failed(Scsi_Cmnd **sc_list, struct Scsi_Host *shost)
-{
-	int found;
-	Scsi_Device *sdev;
-	Scsi_Cmnd *scmd;
-
-	found = 0;
-	list_for_each_entry(sdev, &shost->my_devices, siblings) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&sdev->list_lock, flags);
-		list_for_each_entry(scmd, &sdev->cmd_list, list) {
-			if (scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) {
-				scmd->bh_next = *sc_list;
-				*sc_list = scmd;
-				found++;
-			} else {
-				/*
-				 * FIXME Verify how this can happen and if
-				 * this is still needed??
-				 */
-			    if (scmd->state != SCSI_STATE_INITIALIZING
-			    && scmd->state != SCSI_STATE_UNUSED) {
-				/*
-				 * Rats.  Something is still floating
-				 * around out there This could be the
-				 * result of the fact that the upper level
-				 * drivers are still frobbing commands
-				 * that might have succeeded.  There are
-				 * two outcomes. One is that the command
-				 * block will eventually be freed, and the
-				 * other one is that the command will be
-				 * queued and will be finished along the
-				 * way.
-				 */
-				SCSI_LOG_ERROR_RECOVERY(1, printk("Error hdlr"
-							  " prematurely woken"
-							  " cmds still active"
-							  " (%p %x %d)\n",
-					       scmd, scmd->state,
-					       scmd->device->id));
-				}
-			}
-		}
-		spin_unlock_irqrestore(&sdev->list_lock, flags);
-	}
-
-	SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(*sc_list, shost));
-
-	if (shost->host_failed != found)
-		printk(KERN_ERR "%s: host_failed: %d != found: %d\n", 
-		       __FUNCTION__, shost->host_failed, found);
-}
-
-/**
  * scsi_check_sense - Examine scsi cmd sense
  * @scmd:	Cmd to have sense checked.
  *
@@ -535,7 +509,8 @@
 			spin_lock_irqsave(scmd->device->host->host_lock, flags);
 			if (scmd->device->host->hostt->eh_abort_handler)
 				scmd->device->host->hostt->eh_abort_handler(scmd);
-			spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
+			spin_unlock_irqrestore(scmd->device->host->host_lock,
+					       flags);
 			
 			scmd->request->rq_status = RQ_SCSI_DONE;
 			scmd->owner = SCSI_OWNER_ERROR_HANDLER;
@@ -676,7 +651,7 @@
 /**
  * scsi_eh_finish_cmd - Handle a cmd that eh is finished with.
  * @scmd:	Original SCSI cmd that eh has finished.
- * @shost:	SCSI host that cmd originally failed on.
+ * @done_q:	Queue for processed commands.
  *
  * Notes:
  *    We don't want to use the normal command completion while we are are
@@ -685,9 +660,10 @@
  *    keep a list of pending commands for final completion, and once we
  *    are ready to leave error handling we handle completion for real.
  **/
-static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost)
+static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd,
+			       struct list_head *done_q )
 {
-	shost->host_failed--;
+	scmd->device->host->host_failed--;
 	scmd->state = SCSI_STATE_BHQUEUE;
 	scsi_eh_eflags_clr_all(scmd);
 
@@ -696,12 +672,14 @@
 	 * things.
 	 */
 	scsi_setup_cmd_retry(scmd);
+
+	list_move_tail(&scmd->eh_entry, done_q);
 }
 
 /**
  * scsi_eh_get_sense - Get device sense data.
- * @sc_todo:	list of cmds that have failed.
- * @shost:	scsi host being recovered.
+ * @work_q:	Queue of commands to process.
+ * @done_q:	Queue of processed commands.
  *
  * Description:
  *    See if we need to request sense information.  if so, then get it
@@ -719,23 +697,23 @@
  *
  *    In 2.5 this capability will be going away.
  **/
-static int scsi_eh_get_sense(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_get_sense(struct list_head *work_q,
+			     struct list_head *done_q)
 {
 	int rtn;
+	struct list_head *lh, *lh_sf;
 	Scsi_Cmnd *scmd;
 
-	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: checking to see if we need"
-					  " to request sense\n",
-					  __FUNCTION__));
-
-	for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
-		if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_FAILED) ||
+	list_for_each_safe(lh, lh_sf, work_q) {
+		scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+		if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD) ||
 		    SCSI_SENSE_VALID(scmd))
 			continue;
 
 		SCSI_LOG_ERROR_RECOVERY(2, printk("%s: requesting sense"
-						  " for tgt: %d\n",
-						  __FUNCTION__, scmd->device->id));
+						  " for id: %d\n",
+						  current->comm,
+						  scmd->device->id));
 		rtn = scsi_request_sense(scmd);
 		if (rtn != SUCCESS)
 			continue;
@@ -752,7 +730,7 @@
 		 * upper level.
 		 */
 		if (rtn == SUCCESS)
-			scsi_eh_finish_cmd(scmd, shost);
+			scsi_eh_finish_cmd(scmd, done_q);
 		if (rtn != NEEDS_RETRY)
 			continue;
 
@@ -771,10 +749,10 @@
 		/*
 		 * we eventually hand this one back to the top level.
 		 */
-		scsi_eh_finish_cmd(scmd, shost);
+		scsi_eh_finish_cmd(scmd, done_q);
 	}
 
-	return shost->host_failed;
+	return list_empty(work_q);
 }
 
 /**
@@ -864,9 +842,9 @@
 }
 
 /**
- * scsi_eh_abort_cmd - abort a timed-out cmd.
- * @sc_todo:	A list of cmds that have failed.
+ * scsi_eh_abort_cmds - abort canceled commands.
  * @shost:	scsi host being recovered.
+ * @eh_done_q:	list_head for processed commands.
  *
  * Decription:
  *    Try and see whether or not it makes sense to try and abort the
@@ -875,29 +853,36 @@
  *    no sense to try and abort the command, since as far as the shost
  *    adapter is concerned, it isn't running.
  **/
-static int scsi_eh_abort_cmd(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_abort_cmds(struct list_head *work_q,
+			      struct list_head *done_q)
 {
-
 	int rtn;
-	Scsi_Cmnd *scmd;
+	struct list_head *lh, *lh_sf;
+	struct scsi_cmnd *scmd;
 
-	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: checking to see if we need"
-					  " to abort cmd\n", __FUNCTION__));
-
-	for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
-		if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_TIMEOUT))
+	list_for_each_safe(lh, lh_sf, work_q) {
+		scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+		if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD))
 			continue;
-
+		SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting cmd:"
+						  "0x%p\n", current->comm,
+						  scmd));
 		rtn = scsi_try_to_abort_cmd(scmd);
 		if (rtn == SUCCESS) {
-			if (!scsi_eh_tur(scmd)) {
-				rtn = scsi_eh_retry_cmd(scmd);
-				if (rtn == SUCCESS)
-					scsi_eh_finish_cmd(scmd, shost);
+			scsi_eh_eflags_clr(scmd,  SCSI_EH_CANCEL_CMD);
+			if (!scmd->device->online || !scsi_eh_tur(scmd)) {
+				scsi_eh_finish_cmd(scmd, done_q);
 			}
-		}
+				
+		} else
+			SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting"
+							  " cmd failed:"
+							  "0x%p\n",
+							  current->comm,
+							  scmd));
 	}
-	return shost->host_failed;
+
+	return list_empty(work_q);
 }
 
 /**
@@ -933,9 +918,9 @@
 }
 
 /**
- * scsi_eh_bus_device_reset - send bdr is needed
- * @sc_todo:	a list of cmds that have failed.
+ * scsi_eh_bus_device_reset - send bdr if needed
  * @shost:	scsi host being recovered.
+ * @eh_done_q:	list_head for processed commands.
  *
  * Notes:
  *    Try a bus device reset.  still, look to see whether we have multiple
@@ -943,39 +928,52 @@
  *    makes no sense to try bus_device_reset - we really would need to try
  *    a bus_reset instead. 
  **/
-static int scsi_eh_bus_device_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
+				    struct list_head *work_q,
+				    struct list_head *done_q)
 {
 	int rtn;
-	Scsi_Cmnd *scmd;
-	Scsi_Device *sdev;
-
-	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Trying BDR\n", __FUNCTION__));
+	struct list_head *lh, *lh_sf;
+	struct scsi_cmnd *scmd, *bdr_scmd;
+	struct scsi_device *sdev;
 
 	list_for_each_entry(sdev, &shost->my_devices, siblings) {
-		for (scmd = sc_todo; scmd; scmd = scmd->bh_next)
-			if ((scmd->device == sdev) &&
-			    scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR))
+		bdr_scmd = NULL;
+		list_for_each_entry(scmd, work_q, eh_entry)
+			if (scmd->device == sdev) {
+				bdr_scmd = scmd;
 				break;
+			}
 
-		if (!scmd)
+		if (!bdr_scmd)
 			continue;
 
-		/*
-		 * ok, we have a device that is having problems.  try and send
-		 * a bus device reset to it.
-		 */
-		rtn = scsi_try_bus_device_reset(scmd);
-		if ((rtn == SUCCESS) && (!scsi_eh_tur(scmd)))
-				for (scmd = sc_todo; scmd; scmd = scmd->bh_next)
-					if ((scmd->device == sdev) &&
-					    scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) {
-						rtn = scsi_eh_retry_cmd(scmd);
-						if (rtn == SUCCESS)
-							scsi_eh_finish_cmd(scmd, shost);
-					}
+		SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BDR sdev:"
+						  " 0x%p\n", current->comm,
+						  sdev));
+		rtn = scsi_try_bus_device_reset(bdr_scmd);
+		if (rtn == SUCCESS) {
+			if (!sdev->online || !scsi_eh_tur(bdr_scmd)) {
+				list_for_each_safe(lh, lh_sf,
+						   work_q) {
+					scmd = list_entry(lh, struct
+							  scsi_cmnd,
+							  eh_entry);
+					if (scmd->device == sdev)
+						scsi_eh_finish_cmd(scmd,
+								   done_q);
+				}
+			}
+		} else {
+			SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BDR"
+							  " failed sdev:"
+							  "0x%p\n",
+							  current->comm,
+							   sdev));
+		}
 	}
 
-	return shost->host_failed;
+	return list_empty(work_q);
 }
 
 /**
@@ -1005,7 +1003,8 @@
 		/*
 		 * Mark all affected devices to expect a unit attention.
 		 */
-		list_for_each_entry(sdev, &scmd->device->host->my_devices, siblings)
+		list_for_each_entry(sdev, &scmd->device->host->my_devices,
+				    siblings)
 			if (scmd->device->channel == sdev->channel) {
 				sdev->was_reset = 1;
 				sdev->expecting_cc_ua = 1;
@@ -1041,7 +1040,8 @@
 		/*
 		 * Mark all affected devices to expect a unit attention.
 		 */
-		list_for_each_entry(sdev, &scmd->device->host->my_devices, siblings)
+		list_for_each_entry(sdev, &scmd->device->host->my_devices,
+				    siblings)
 			if (scmd->device->channel == sdev->channel) {
 				sdev->was_reset = 1;
 				sdev->expecting_cc_ua = 1;
@@ -1051,26 +1051,21 @@
 }
 
 /**
- * scsi_eh_bus_host_reset - send a bus reset and on failure try host reset
- * @sc_todo:	a list of cmds that have failed.
+ * scsi_eh_bus_reset - send a bus reset 
  * @shost:	scsi host being recovered.
+ * @eh_done_q:	list_head for processed commands.
  **/
-static int scsi_eh_bus_host_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_bus_reset(struct Scsi_Host *shost,
+			     struct list_head *work_q,
+			     struct list_head *done_q)
 {
 	int rtn;
+	struct list_head *lh, *lh_sf;
 	Scsi_Cmnd *scmd;
 	Scsi_Cmnd *chan_scmd;
 	unsigned int channel;
 
 	/*
-	 * if we ended up here, we have serious problems.  the only thing left
-	 * to try is a full bus reset.  if someone has grabbed the bus and isn't
-	 * letting go, then perhaps this will help.
-	 */
-	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Try Bus/Host RST\n",
-					  __FUNCTION__));
-
-	/* 
 	 * we really want to loop over the various channels, and do this on
 	 * a channel by channel basis.  we should also check to see if any
 	 * of the failed commands are on soft_reset devices, and if so, skip
@@ -1078,9 +1073,8 @@
 	 */
 
 	for (channel = 0; channel <= shost->max_channel; channel++) {
-		for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
-			if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR))
-				continue;
+		chan_scmd = NULL;
+		list_for_each_entry(scmd, work_q, eh_entry) {
 			if (channel == scmd->device->channel) {
 				chan_scmd = scmd;
 				break;
@@ -1091,63 +1085,95 @@
 			}
 		}
 
-		if (!scmd)
+		if (!chan_scmd)
 			continue;
+		SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BRST chan:"
+						  " %d\n", current->comm,
+						  channel));
+		rtn = scsi_try_bus_reset(chan_scmd);
+		if (rtn == SUCCESS) {
+			list_for_each_safe(lh, lh_sf, work_q) {
+				scmd = list_entry(lh, struct scsi_cmnd,
+						  eh_entry);
+				if (channel == scmd->device->channel)
+					if (!scmd->device->online ||
+					    !scsi_eh_tur(scmd))
+						scsi_eh_finish_cmd(scmd,
+								   done_q);
+			}
+		} else {
+			SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST"
+							  " failed chan: %d\n",
+							  current->comm,
+							  channel));
+		}
+	}
+	return list_empty(work_q);
+}
 
-		/*
-		 * we now know that we are able to perform a reset for the
-		 * channel that scmd points to.
-		 */
-		rtn = scsi_try_bus_reset(scmd);
-		if (rtn != SUCCESS)
-			rtn = scsi_try_host_reset(scmd);
+/**
+ * scsi_eh_host_reset - send a host reset 
+ * @work_q:	list_head of commands to process.
+ * @done_q:	list_head for processed commands.
+ **/
+static int scsi_eh_host_reset(struct list_head *work_q,
+			      struct list_head *done_q)
+{
+	int rtn;
+	struct list_head *lh, *lh_sf;
+	Scsi_Cmnd *scmd;
 
-		if (rtn == SUCCESS) {
-			for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
-				if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)
-				    || channel != scmd->device->channel)
-					continue;
-				if (!scsi_eh_tur(scmd)) {
-					rtn = scsi_eh_retry_cmd(scmd);
+	if (!list_empty(work_q)) {
+		scmd = list_entry(work_q->next,
+				  struct scsi_cmnd, eh_entry);
 
-					if (rtn == SUCCESS)
-						scsi_eh_finish_cmd(scmd, shost);
-				}
+		SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending HRST\n"
+						  , current->comm));
+
+		rtn = scsi_try_host_reset(scmd);
+		if (rtn == SUCCESS) {
+			list_for_each_safe(lh, lh_sf, work_q) {
+				scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+				if (!scmd->device->online || !scsi_eh_tur(scmd)) 
+					scsi_eh_finish_cmd(scmd, done_q);
 			}
+		} else {
+			SCSI_LOG_ERROR_RECOVERY(3, printk("%s: HRST"
+							  " failed\n",
+							  current->comm));
 		}
-
 	}
-	return shost->host_failed;
+	return list_empty(work_q);
 }
 
 /**
  * scsi_eh_offline_sdevs - offline scsi devices that fail to recover
- * @sc_todo:	a list of cmds that have failed.
- * @shost:	scsi host being recovered.
+ * @work_q:	list_head of commands to process.
+ * @done_q:	list_head for processed commands.
  *
  **/
-static void scsi_eh_offline_sdevs(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static void scsi_eh_offline_sdevs(struct list_head *work_q,
+				  struct list_head *done_q)
 {
+	struct list_head *lh, *lh_sf;
 	Scsi_Cmnd *scmd;
 
-	for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
-		if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR))
-			continue;
-
+	list_for_each_safe(lh, lh_sf, work_q) {
+		scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
 		printk(KERN_INFO "scsi: Device offlined - not"
-				" ready or command retry failed"
-				" after error recovery: host"
+		       		" ready after error recovery: host"
 				" %d channel %d id %d lun %d\n",
-				shost->host_no,
+				scmd->device->host->host_no,
 				scmd->device->channel,
 				scmd->device->id,
 				scmd->device->lun);
-
-		if (scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_TIMEOUT))
-			scmd->result |= (DRIVER_TIMEOUT << 24);
-
-		scmd->device->online = 0;
-		scsi_eh_finish_cmd(scmd, shost);
+		scmd->device->online = FALSE;
+		if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD)) {
+			/*
+			 * FIXME: Handle lost cmds.
+			 */
+		}
+		scsi_eh_finish_cmd(scmd, done_q);
 	}
 	return;
 }
@@ -1459,6 +1485,8 @@
 	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: waking up host to restart\n",
 					  __FUNCTION__));
 
+	shost->in_recovery = 0;
+
 	wake_up(&shost->host_wait);
 
 	/*
@@ -1482,6 +1510,55 @@
 }
 
 /**
+ * scsi_eh_ready_devs - check device ready state and recover if not.
+ * @shost: 	host to be recovered.
+ * @work_q:	list_head of cmds still failing; escalate while non-empty.
+ * @done_q:	list_head for processed commands.
+ **/
+static void scsi_eh_ready_devs(struct Scsi_Host *shost,
+			       struct list_head *work_q,
+			       struct list_head *done_q)
+{
+	if (!scsi_eh_bus_device_reset(shost, work_q, done_q))
+		if (!scsi_eh_bus_reset(shost, work_q, done_q))
+			if (!scsi_eh_host_reset(work_q, done_q))
+				scsi_eh_offline_sdevs(work_q, done_q);
+}
+
+/**
+ * scsi_eh_flush_done_q - finish processed commands or retry them.
+ * @done_q:	list_head of processed commands.
+ *
+ **/
+static void scsi_eh_flush_done_q(struct list_head *done_q)
+{
+	struct list_head *lh, *lh_sf;
+	Scsi_Cmnd *scmd;
+
+	list_for_each_safe(lh, lh_sf, done_q) {
+		scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+		list_del_init(lh);
+		if (!scmd->device->online) {
+			 scmd->result |= (DRIVER_TIMEOUT << 24);
+		} else {
+			if (++scmd->retries < scmd->allowed) {
+				SCSI_LOG_ERROR_RECOVERY(3,
+					printk("%s: flush retry"
+					       " cmd: %p\n",
+						  current->comm,
+						  scmd));
+				scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
+				continue;
+			}
+		}
+		SCSI_LOG_ERROR_RECOVERY(3, printk("%s: flush finish"
+				       " cmd: %p\n",
+					  current->comm, scmd));
+		scsi_finish_command(scmd);
+	}
+}
+
+/**
  * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
  * @shost:	Host to unjam.
  *
@@ -1506,60 +1583,21 @@
  **/
 static void scsi_unjam_host(struct Scsi_Host *shost)
 {
-	Scsi_Cmnd *sc_todo = NULL;
-	Scsi_Cmnd *scmd;
-
-	/*
-	 * Is this assert really ok anymore (andmike). Should we at least
-	 * be using spin_lock_unlocked.
-	 */
-	ASSERT_LOCK(shost->host_lock, 0);
-
-	scsi_eh_get_failed(&sc_todo, shost);
-
-	if (scsi_eh_get_sense(sc_todo, shost))
-		if (scsi_eh_abort_cmd(sc_todo, shost))
-			if (scsi_eh_bus_device_reset(sc_todo, shost))
-				if (scsi_eh_bus_host_reset(sc_todo, shost))
-					scsi_eh_offline_sdevs(sc_todo, shost);
-
-	BUG_ON(shost->host_failed);
+	unsigned long flags;
+	LIST_HEAD(eh_work_q);
+	LIST_HEAD(eh_done_q);
 
+	spin_lock_irqsave(shost->host_lock, flags);
+	list_splice_init(&shost->eh_cmd_q, &eh_work_q);
+	spin_unlock_irqrestore(shost->host_lock, flags);
 
-	/*
-	 * We are currently holding these things in a linked list - we
-	 * didn't put them in the bottom half queue because we wanted to
-	 * keep things quiet while we were working on recovery, and
-	 * passing them up to the top level could easily cause the top
-	 * level to try and queue something else again.
-	 *
-	 * start by marking that the host is no longer in error recovery.
-	 */
-	shost->in_recovery = 0;
+	SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
 
-	/*
-	 * take the list of commands, and stick them in the bottom half queue.
-	 * the current implementation of scsi_done will do this for us - if need
-	 * be we can create a special version of this function to do the
-	 * same job for us.
-	 */
-	for (scmd = sc_todo; scmd; scmd = sc_todo) {
-		sc_todo = scmd->bh_next;
-		scmd->bh_next = NULL;
-		/*
-		 * Oh, this is a vile hack.  scsi_done() expects a timer
-		 * to be running on the command.  If there isn't, it assumes
-		 * that the command has actually timed out, and a timer
-		 * handler is running.  That may well be how we got into
-		 * this fix, but right now things are stable.  We add
-		 * a timer back again so that we can report completion.
-		 * scsi_done() will immediately remove said timer from
-		 * the command, and then process it.
-		 */
-		scsi_add_timer(scmd, 100, scsi_eh_times_out);
-		scsi_done(scmd);
-	}
+	if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
+		if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
+			scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
 
+	scsi_eh_flush_done_q(&eh_done_q);
 }
 
 /**
@@ -1597,7 +1635,8 @@
 	/*
 	 * Wake up the thread that created us.
 	 */
-	SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of scsi_eh_%d\n",shost->host_no));
+	SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of"
+					  " scsi_eh_%d\n",shost->host_no));
 
 	complete(shost->eh_notify);
 
@@ -1607,7 +1646,9 @@
 		 * away and die.  This typically happens if the user is
 		 * trying to unload a module.
 		 */
-		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d sleeping\n",shost->host_no));
+		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
+						  " scsi_eh_%d"
+						  " sleeping\n",shost->host_no));
 
 		/*
 		 * Note - we always use down_interruptible with the semaphore
@@ -1622,7 +1663,9 @@
 		if (shost->eh_kill)
 			break;
 
-		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d waking up\n",shost->host_no));
+		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
+						  " scsi_eh_%d waking"
+						  " up\n",shost->host_no));
 
 		shost->eh_active = 1;
 
@@ -1650,7 +1693,8 @@
 
 	}
 
-	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d exiting\n",shost->host_no));
+	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
+					  " exiting\n",shost->host_no));
 
 	/*
 	 * Make sure that nobody tries to wake us up again.
diff -Nru a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
--- a/drivers/scsi/scsi_lib.c	Tue Feb 18 09:13:12 2003
+++ b/drivers/scsi/scsi_lib.c	Tue Feb 18 09:13:12 2003
@@ -117,7 +117,7 @@
 	 */
 	if (reason == SCSI_MLQUEUE_HOST_BUSY)
 		host->host_blocked = host->max_host_blocked;
-	else
+	else if (reason == SCSI_MLQUEUE_DEVICE_BUSY)
 		device->device_blocked = device->max_device_blocked;
 
 	/*
diff -Nru a/drivers/scsi/scsi_syms.c b/drivers/scsi/scsi_syms.c
--- a/drivers/scsi/scsi_syms.c	Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/scsi_syms.c	Tue Feb 18 09:13:11 2003
@@ -80,6 +80,7 @@
 EXPORT_SYMBOL(scsi_slave_detach);
 EXPORT_SYMBOL(scsi_device_get);
 EXPORT_SYMBOL(scsi_device_put);
+EXPORT_SYMBOL(scsi_set_device_offline);
 
 /*
  * This symbol is for the highlevel drivers (e.g. sg) only.