From: Mike Anderson <andmike@us.ibm.com>
To: linux-scsi@vger.kernel.org
Subject: [PATCH] scsi_error update take 2
Date: Wed, 19 Feb 2003 01:09:19 -0800 [thread overview]
Message-ID: <20030219090919.GA2185@beaverton.ibm.com> (raw)
This is an update of a previous patch I posted.
http://marc.theaimsgroup.com/?l=linux-scsi&m=104495114103628&w=2
This patch is against scsi-misc-2.5
The updates from the last patch include:
- Name changes:
eh_cmd_list => eh_cmd_q
eh_list => eh_entry
- Move shost->in_recovery = 0
- Switch from scsi_retry_command to scsi_queue_insert for retry
to solve fast completions / serial start of retries.
- Use list_splice_init in scsi_unjam_host.
Sorry for the one large patch chunk :-(.
-andmike
--
Michael Anderson
andmike@us.ibm.com
hosts.c | 20 --
hosts.h | 2
scsi.c | 46 ++++
scsi.h | 11 -
scsi_error.c | 550 +++++++++++++++++++++++++++++++----------------------------
scsi_lib.c | 2
scsi_syms.c | 1
7 files changed, 351 insertions(+), 281 deletions(-)
-------
diff -Nru a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
--- a/drivers/scsi/hosts.c Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/hosts.c Tue Feb 18 09:13:11 2003
@@ -397,6 +397,7 @@
spin_lock_init(&shost->default_lock);
scsi_assign_lock(shost, &shost->default_lock);
INIT_LIST_HEAD(&shost->my_devices);
+ INIT_LIST_HEAD(&shost->eh_cmd_q);
init_waitqueue_head(&shost->host_wait);
shost->dma_channel = 0xff;
@@ -634,22 +635,9 @@
spin_lock_irqsave(shost->host_lock, flags);
shost->host_busy--;
sdev->device_busy--;
- if (shost->in_recovery && (shost->host_busy == shost->host_failed)) {
- up(shost->eh_wait);
- SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
- " thread\n"));
- }
- spin_unlock_irqrestore(shost->host_lock, flags);
-}
-
-void scsi_host_failed_inc_and_test(struct Scsi_Host *shost)
-{
- unsigned long flags;
-
- spin_lock_irqsave(shost->host_lock, flags);
- shost->in_recovery = 1;
- shost->host_failed++;
- if (shost->host_busy == shost->host_failed) {
+ if (shost->in_recovery && shost->host_failed &&
+ (shost->host_busy == shost->host_failed))
+ {
up(shost->eh_wait);
SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
" thread\n"));
diff -Nru a/drivers/scsi/hosts.h b/drivers/scsi/hosts.h
--- a/drivers/scsi/hosts.h Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/hosts.h Tue Feb 18 09:13:11 2003
@@ -384,6 +384,7 @@
spinlock_t default_lock;
spinlock_t *host_lock;
+ struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct semaphore * eh_wait; /* The error recovery thread waits on
this. */
@@ -587,7 +588,6 @@
*/
extern void scsi_host_busy_inc(struct Scsi_Host *, Scsi_Device *);
extern void scsi_host_busy_dec_and_test(struct Scsi_Host *, Scsi_Device *);
-extern void scsi_host_failed_inc_and_test(struct Scsi_Host *);
/**
* scsi_find_device - find a device given the host
diff -Nru a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
--- a/drivers/scsi/scsi.c Tue Feb 18 09:13:12 2003
+++ b/drivers/scsi/scsi.c Tue Feb 18 09:13:12 2003
@@ -790,13 +790,9 @@
if ((status_byte(SCpnt->result) & CHECK_CONDITION) != 0) {
SCSI_LOG_MLCOMPLETE(3, print_sense("bh", SCpnt));
}
- if (SCpnt->device->host->eh_wait != NULL) {
- scsi_eh_eflags_set(SCpnt, SCSI_EH_CMD_FAILED | SCSI_EH_CMD_ERR);
- SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
- SCpnt->state = SCSI_STATE_FAILED;
- scsi_host_failed_inc_and_test(SCpnt->device->host);
- } else {
+ if (!scsi_eh_scmd_add(SCpnt, 0))
+ {
/*
* We only get here if the error
* recovery thread has died.
@@ -1296,6 +1292,44 @@
{
sdev->access_count--;
module_put(sdev->host->hostt->module);
+}
+
+/**
+ * scsi_set_device_offline - mark a scsi_device offline and send its active cmds to eh
+ * @sdev: pointer to struct scsi_device to offline.
+ *
+ * Locks: host_lock held on entry. NOTE(review): scsi_eh_scmd_add() below takes host_lock itself - confirm.
+ **/
+void scsi_set_device_offline(struct scsi_device *sdev)
+{
+ struct scsi_cmnd *scmd;
+ int cmds_active = 0;
+ unsigned long flags;
+
+ sdev->online = FALSE;
+
+ spin_lock_irqsave(&sdev->list_lock, flags);
+ list_for_each_entry(scmd, &sdev->cmd_list, list) {
+ if (scmd->request && scmd->request->rq_status != RQ_INACTIVE) {
+ /*
+ * If we are unable to remove the timer, it means
+ * that the command has already timed out or
+ * finished, so it is not handed to the error handler here.
+ */
+ if (!scsi_delete_timer(scmd)) {
+ continue;
+ }
+
+ ++cmds_active;
+
+ scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD);
+ }
+ }
+ spin_unlock_irqrestore(&sdev->list_lock, flags);
+
+ if (!cmds_active) {
+ /* FIXME: Send online state change hotplug event */
+ }
}
/*
diff -Nru a/drivers/scsi/scsi.h b/drivers/scsi/scsi.h
--- a/drivers/scsi/scsi.h Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/scsi.h Tue Feb 18 09:13:11 2003
@@ -455,6 +455,7 @@
extern void scsi_slave_detach(struct scsi_device *);
extern int scsi_device_get(struct scsi_device *);
extern void scsi_device_put(struct scsi_device *);
+extern void scsi_set_device_offline(struct scsi_device *);
extern void scsi_done(Scsi_Cmnd * SCpnt);
extern void scsi_finish_command(Scsi_Cmnd *);
extern int scsi_retry_command(Scsi_Cmnd *);
@@ -726,6 +727,7 @@
struct list_head list; /* scsi_cmnd participates in queue lists */
+ struct list_head eh_entry; /* entry for the host eh_cmd_q */
int eh_state; /* Used for state tracking in error handlr */
int eh_eflags; /* Used by error handlr */
void (*done) (struct scsi_cmnd *); /* Mid-level done function */
@@ -850,6 +852,7 @@
*/
#define SCSI_MLQUEUE_HOST_BUSY 0x1055
#define SCSI_MLQUEUE_DEVICE_BUSY 0x1056
+#define SCSI_MLQUEUE_EH_RETRY 0x1057
/*
* old style reset request from external source
@@ -960,12 +963,12 @@
/*
* Scsi Error Handler Flags
*/
-#define SCSI_EH_CMD_ERR 0x0001 /* Orig cmd error'd */
-#define SCSI_EH_CMD_FAILED 0x0002 /* Orig cmd error type failed */
-#define SCSI_EH_CMD_TIMEOUT 0x0004 /* Orig cmd error type timeout */
-#define SCSI_EH_REC_TIMEOUT 0x0008 /* Recovery cmd timeout */
+#define SCSI_EH_CANCEL_CMD 0x0001 /* Cancel this cmd */
+#define SCSI_EH_REC_TIMEOUT 0x0002 /* EH retry timed out */
#define SCSI_SENSE_VALID(scmd) ((scmd->sense_buffer[0] & 0x70) == 0x70)
+
+extern int scsi_eh_scmd_add(struct scsi_cmnd *, int);
int scsi_set_medium_removal(Scsi_Device *dev, char state);
diff -Nru a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
--- a/drivers/scsi/scsi_error.c Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/scsi_error.c Tue Feb 18 09:13:11 2003
@@ -56,6 +56,49 @@
#define HOST_RESET_SETTLE_TIME 10*HZ
/**
+ * scsi_eh_scmd_add - add scsi cmd to error handling.
+ * @scmd: scmd to run eh on.
+ * @eh_flag: optional SCSI_EH flag (e.g. SCSI_EH_CANCEL_CMD), or 0 for none.
+ *
+ * Return value:
+ * 0 if the host has no eh_wait (cmd not queued), 1 once queued on eh_cmd_q.
+ **/
+int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
+{
+ struct Scsi_Host *shost = scmd->device->host;
+ unsigned long flags;
+
+ if (shost->eh_wait == NULL)
+ return 0;
+
+ spin_lock_irqsave(shost->host_lock, flags);
+
+ scsi_eh_eflags_set(scmd, eh_flag);
+ /*
+ * FIXME: Can we stop setting owner and state.
+ */
+ scmd->owner = SCSI_OWNER_ERROR_HANDLER;
+ scmd->state = SCSI_STATE_FAILED;
+ /*
+ * Set the serial_number_at_timeout to the current
+ * serial_number
+ */
+ scmd->serial_number_at_timeout = scmd->serial_number;
+ list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
+ shost->in_recovery = 1;
+ shost->host_failed++;
+ if (shost->host_busy == shost->host_failed) {
+ up(shost->eh_wait);
+ SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
+ " thread\n"));
+ }
+
+ spin_unlock_irqrestore(shost->host_lock, flags);
+
+ return 1;
+}
+
+/**
* scsi_add_timer - Start timeout timer for a single scsi command.
* @scmd: scsi command that is about to start running.
* @timeout: amount of time to allow this command to run.
@@ -131,22 +174,14 @@
**/
void scsi_times_out(Scsi_Cmnd *scmd)
{
- struct Scsi_Host *shost = scmd->device->host;
-
- /* Set the serial_number_at_timeout to the current serial_number */
- scmd->serial_number_at_timeout = scmd->serial_number;
-
- scsi_eh_eflags_set(scmd, SCSI_EH_CMD_TIMEOUT | SCSI_EH_CMD_ERR);
-
- if (unlikely(shost->eh_wait == NULL)) {
+ if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) {
panic("Error handler thread not present at %p %p %s %d",
- scmd, shost, __FILE__, __LINE__);
+ scmd, scmd->device->host, __FILE__, __LINE__);
}
- scsi_host_failed_inc_and_test(shost);
-
SCSI_LOG_TIMEOUT(3, printk("Command timed out busy=%d failed=%d\n",
- shost->host_busy, shost->host_failed));
+ scmd->device->host->host_busy,
+ scmd->device->host->host_failed));
}
/**
@@ -173,39 +208,40 @@
#if CONFIG_SCSI_LOGGING
/**
* scsi_eh_prt_fail_stats - Log info on failures.
- * @sc_list: List for failed cmds.
* @shost: scsi host being recovered.
+ * @work_q: Queue of scsi cmds to process.
**/
-static void scsi_eh_prt_fail_stats(Scsi_Cmnd *sc_list, struct Scsi_Host *shost)
+static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost,
+ struct list_head *work_q)
{
- Scsi_Cmnd *scmd;
- Scsi_Device *sdev;
+ struct scsi_cmnd *scmd;
+ struct scsi_device *sdev;
int total_failures = 0;
int cmd_failed = 0;
- int cmd_timed_out = 0;
+ int cmd_cancel = 0;
int devices_failed = 0;
list_for_each_entry(sdev, &shost->my_devices, siblings) {
- for (scmd = sc_list; scmd; scmd = scmd->bh_next) {
+ list_for_each_entry(scmd, work_q, eh_entry) {
if (scmd->device == sdev) {
++total_failures;
if (scsi_eh_eflags_chk(scmd,
- SCSI_EH_CMD_TIMEOUT))
- ++cmd_timed_out;
- else
+ SCSI_EH_CANCEL_CMD))
+ ++cmd_cancel;
+ else
++cmd_failed;
}
}
- if (cmd_timed_out || cmd_failed) {
+ if (cmd_cancel || cmd_failed) {
SCSI_LOG_ERROR_RECOVERY(3,
printk("%s: %d:%d:%d:%d cmds failed: %d,"
- " timedout: %d\n",
+ " cancel: %d\n",
__FUNCTION__, shost->host_no,
sdev->channel, sdev->id, sdev->lun,
- cmd_failed, cmd_timed_out));
- cmd_timed_out = 0;
+ cmd_failed, cmd_cancel));
+ cmd_cancel = 0;
cmd_failed = 0;
++devices_failed;
}
@@ -218,68 +254,6 @@
#endif
/**
- * scsi_eh_get_failed - Gather failed cmds.
- * @sc_list: A pointer to a list for failed cmds.
- * @shost: Scsi host being recovered.
- *
- * XXX Add opaque interator for device / shost. Investigate direct
- * addition to per eh list on error allowing skipping of this step.
- **/
-static void scsi_eh_get_failed(Scsi_Cmnd **sc_list, struct Scsi_Host *shost)
-{
- int found;
- Scsi_Device *sdev;
- Scsi_Cmnd *scmd;
-
- found = 0;
- list_for_each_entry(sdev, &shost->my_devices, siblings) {
- unsigned long flags;
-
- spin_lock_irqsave(&sdev->list_lock, flags);
- list_for_each_entry(scmd, &sdev->cmd_list, list) {
- if (scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) {
- scmd->bh_next = *sc_list;
- *sc_list = scmd;
- found++;
- } else {
- /*
- * FIXME Verify how this can happen and if
- * this is still needed??
- */
- if (scmd->state != SCSI_STATE_INITIALIZING
- && scmd->state != SCSI_STATE_UNUSED) {
- /*
- * Rats. Something is still floating
- * around out there This could be the
- * result of the fact that the upper level
- * drivers are still frobbing commands
- * that might have succeeded. There are
- * two outcomes. One is that the command
- * block will eventually be freed, and the
- * other one is that the command will be
- * queued and will be finished along the
- * way.
- */
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error hdlr"
- " prematurely woken"
- " cmds still active"
- " (%p %x %d)\n",
- scmd, scmd->state,
- scmd->device->id));
- }
- }
- }
- spin_unlock_irqrestore(&sdev->list_lock, flags);
- }
-
- SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(*sc_list, shost));
-
- if (shost->host_failed != found)
- printk(KERN_ERR "%s: host_failed: %d != found: %d\n",
- __FUNCTION__, shost->host_failed, found);
-}
-
-/**
* scsi_check_sense - Examine scsi cmd sense
* @scmd: Cmd to have sense checked.
*
@@ -535,7 +509,8 @@
spin_lock_irqsave(scmd->device->host->host_lock, flags);
if (scmd->device->host->hostt->eh_abort_handler)
scmd->device->host->hostt->eh_abort_handler(scmd);
- spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
+ spin_unlock_irqrestore(scmd->device->host->host_lock,
+ flags);
scmd->request->rq_status = RQ_SCSI_DONE;
scmd->owner = SCSI_OWNER_ERROR_HANDLER;
@@ -676,7 +651,7 @@
/**
* scsi_eh_finish_cmd - Handle a cmd that eh is finished with.
* @scmd: Original SCSI cmd that eh has finished.
- * @shost: SCSI host that cmd originally failed on.
+ * @done_q: Queue for processed commands.
*
* Notes:
* We don't want to use the normal command completion while we are are
@@ -685,9 +660,10 @@
* keep a list of pending commands for final completion, and once we
* are ready to leave error handling we handle completion for real.
**/
-static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost)
+static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd,
+ struct list_head *done_q )
{
- shost->host_failed--;
+ scmd->device->host->host_failed--;
scmd->state = SCSI_STATE_BHQUEUE;
scsi_eh_eflags_clr_all(scmd);
@@ -696,12 +672,14 @@
* things.
*/
scsi_setup_cmd_retry(scmd);
+
+ list_move_tail(&scmd->eh_entry, done_q);
}
/**
* scsi_eh_get_sense - Get device sense data.
- * @sc_todo: list of cmds that have failed.
- * @shost: scsi host being recovered.
+ * @work_q: Queue of commands to process.
+ * @done_q: Queue of processed commands.
*
* Description:
* See if we need to request sense information. if so, then get it
@@ -719,23 +697,23 @@
*
* In 2.5 this capability will be going away.
**/
-static int scsi_eh_get_sense(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_get_sense(struct list_head *work_q,
+ struct list_head *done_q)
{
int rtn;
+ struct list_head *lh, *lh_sf;
Scsi_Cmnd *scmd;
- SCSI_LOG_ERROR_RECOVERY(3, printk("%s: checking to see if we need"
- " to request sense\n",
- __FUNCTION__));
-
- for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
- if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_FAILED) ||
+ list_for_each_safe(lh, lh_sf, work_q) {
+ scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+ if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD) ||
SCSI_SENSE_VALID(scmd))
continue;
SCSI_LOG_ERROR_RECOVERY(2, printk("%s: requesting sense"
- " for tgt: %d\n",
- __FUNCTION__, scmd->device->id));
+ " for id: %d\n",
+ current->comm,
+ scmd->device->id));
rtn = scsi_request_sense(scmd);
if (rtn != SUCCESS)
continue;
@@ -752,7 +730,7 @@
* upper level.
*/
if (rtn == SUCCESS)
- scsi_eh_finish_cmd(scmd, shost);
+ scsi_eh_finish_cmd(scmd, done_q);
if (rtn != NEEDS_RETRY)
continue;
@@ -771,10 +749,10 @@
/*
* we eventually hand this one back to the top level.
*/
- scsi_eh_finish_cmd(scmd, shost);
+ scsi_eh_finish_cmd(scmd, done_q);
}
- return shost->host_failed;
+ return list_empty(work_q);
}
/**
@@ -864,9 +842,9 @@
}
/**
- * scsi_eh_abort_cmd - abort a timed-out cmd.
- * @sc_todo: A list of cmds that have failed.
+ * scsi_eh_abort_cmds - abort canceled commands.
* @shost: scsi host being recovered.
+ * @done_q: list_head for processed commands.
*
* Decription:
* Try and see whether or not it makes sense to try and abort the
@@ -875,29 +853,36 @@
* no sense to try and abort the command, since as far as the shost
* adapter is concerned, it isn't running.
**/
-static int scsi_eh_abort_cmd(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_abort_cmds(struct list_head *work_q,
+ struct list_head *done_q)
{
-
int rtn;
- Scsi_Cmnd *scmd;
+ struct list_head *lh, *lh_sf;
+ struct scsi_cmnd *scmd;
- SCSI_LOG_ERROR_RECOVERY(3, printk("%s: checking to see if we need"
- " to abort cmd\n", __FUNCTION__));
-
- for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
- if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_TIMEOUT))
+ list_for_each_safe(lh, lh_sf, work_q) {
+ scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+ if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD))
continue;
-
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting cmd:"
+ "0x%p\n", current->comm,
+ scmd));
rtn = scsi_try_to_abort_cmd(scmd);
if (rtn == SUCCESS) {
- if (!scsi_eh_tur(scmd)) {
- rtn = scsi_eh_retry_cmd(scmd);
- if (rtn == SUCCESS)
- scsi_eh_finish_cmd(scmd, shost);
+ scsi_eh_eflags_clr(scmd, SCSI_EH_CANCEL_CMD);
+ if (!scmd->device->online || !scsi_eh_tur(scmd)) {
+ scsi_eh_finish_cmd(scmd, done_q);
}
- }
+
+ } else
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting"
+ " cmd failed:"
+ "0x%p\n",
+ current->comm,
+ scmd));
}
- return shost->host_failed;
+
+ return list_empty(work_q);
}
/**
@@ -933,9 +918,9 @@
}
/**
- * scsi_eh_bus_device_reset - send bdr is needed
- * @sc_todo: a list of cmds that have failed.
+ * scsi_eh_bus_device_reset - send bdr if needed
* @shost: scsi host being recovered.
+ * @done_q: list_head for processed commands.
*
* Notes:
* Try a bus device reset. still, look to see whether we have multiple
@@ -943,39 +928,52 @@
* makes no sense to try bus_device_reset - we really would need to try
* a bus_reset instead.
**/
-static int scsi_eh_bus_device_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
+ struct list_head *work_q,
+ struct list_head *done_q)
{
int rtn;
- Scsi_Cmnd *scmd;
- Scsi_Device *sdev;
-
- SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Trying BDR\n", __FUNCTION__));
+ struct list_head *lh, *lh_sf;
+ struct scsi_cmnd *scmd, *bdr_scmd;
+ struct scsi_device *sdev;
list_for_each_entry(sdev, &shost->my_devices, siblings) {
- for (scmd = sc_todo; scmd; scmd = scmd->bh_next)
- if ((scmd->device == sdev) &&
- scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR))
+ bdr_scmd = NULL;
+ list_for_each_entry(scmd, work_q, eh_entry)
+ if (scmd->device == sdev) {
+ bdr_scmd = scmd;
break;
+ }
- if (!scmd)
+ if (!bdr_scmd)
continue;
- /*
- * ok, we have a device that is having problems. try and send
- * a bus device reset to it.
- */
- rtn = scsi_try_bus_device_reset(scmd);
- if ((rtn == SUCCESS) && (!scsi_eh_tur(scmd)))
- for (scmd = sc_todo; scmd; scmd = scmd->bh_next)
- if ((scmd->device == sdev) &&
- scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) {
- rtn = scsi_eh_retry_cmd(scmd);
- if (rtn == SUCCESS)
- scsi_eh_finish_cmd(scmd, shost);
- }
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BDR sdev:"
+ " 0x%p\n", current->comm,
+ sdev));
+ rtn = scsi_try_bus_device_reset(bdr_scmd);
+ if (rtn == SUCCESS) {
+ if (!sdev->online || !scsi_eh_tur(bdr_scmd)) {
+ list_for_each_safe(lh, lh_sf,
+ work_q) {
+ scmd = list_entry(lh, struct
+ scsi_cmnd,
+ eh_entry);
+ if (scmd->device == sdev)
+ scsi_eh_finish_cmd(scmd,
+ done_q);
+ }
+ }
+ } else {
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BDR"
+ " failed sdev:"
+ "0x%p\n",
+ current->comm,
+ sdev));
+ }
}
- return shost->host_failed;
+ return list_empty(work_q);
}
/**
@@ -1005,7 +1003,8 @@
/*
* Mark all affected devices to expect a unit attention.
*/
- list_for_each_entry(sdev, &scmd->device->host->my_devices, siblings)
+ list_for_each_entry(sdev, &scmd->device->host->my_devices,
+ siblings)
if (scmd->device->channel == sdev->channel) {
sdev->was_reset = 1;
sdev->expecting_cc_ua = 1;
@@ -1041,7 +1040,8 @@
/*
* Mark all affected devices to expect a unit attention.
*/
- list_for_each_entry(sdev, &scmd->device->host->my_devices, siblings)
+ list_for_each_entry(sdev, &scmd->device->host->my_devices,
+ siblings)
if (scmd->device->channel == sdev->channel) {
sdev->was_reset = 1;
sdev->expecting_cc_ua = 1;
@@ -1051,26 +1051,21 @@
}
/**
- * scsi_eh_bus_host_reset - send a bus reset and on failure try host reset
- * @sc_todo: a list of cmds that have failed.
+ * scsi_eh_bus_reset - send a bus reset
* @shost: scsi host being recovered.
+ * @done_q: list_head for processed commands.
**/
-static int scsi_eh_bus_host_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static int scsi_eh_bus_reset(struct Scsi_Host *shost,
+ struct list_head *work_q,
+ struct list_head *done_q)
{
int rtn;
+ struct list_head *lh, *lh_sf;
Scsi_Cmnd *scmd;
Scsi_Cmnd *chan_scmd;
unsigned int channel;
/*
- * if we ended up here, we have serious problems. the only thing left
- * to try is a full bus reset. if someone has grabbed the bus and isn't
- * letting go, then perhaps this will help.
- */
- SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Try Bus/Host RST\n",
- __FUNCTION__));
-
- /*
* we really want to loop over the various channels, and do this on
* a channel by channel basis. we should also check to see if any
* of the failed commands are on soft_reset devices, and if so, skip
@@ -1078,9 +1073,8 @@
*/
for (channel = 0; channel <= shost->max_channel; channel++) {
- for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
- if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR))
- continue;
+ chan_scmd = NULL;
+ list_for_each_entry(scmd, work_q, eh_entry) {
if (channel == scmd->device->channel) {
chan_scmd = scmd;
break;
@@ -1091,63 +1085,95 @@
}
}
- if (!scmd)
+ if (!chan_scmd)
continue;
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BRST chan:"
+ " %d\n", current->comm,
+ channel));
+ rtn = scsi_try_bus_reset(chan_scmd);
+ if (rtn == SUCCESS) {
+ list_for_each_safe(lh, lh_sf, work_q) {
+ scmd = list_entry(lh, struct scsi_cmnd,
+ eh_entry);
+ if (channel == scmd->device->channel)
+ if (!scmd->device->online ||
+ !scsi_eh_tur(scmd))
+ scsi_eh_finish_cmd(scmd,
+ done_q);
+ }
+ } else {
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST"
+ " failed chan: %d\n",
+ current->comm,
+ channel));
+ }
+ }
+ return list_empty(work_q);
+}
- /*
- * we now know that we are able to perform a reset for the
- * channel that scmd points to.
- */
- rtn = scsi_try_bus_reset(scmd);
- if (rtn != SUCCESS)
- rtn = scsi_try_host_reset(scmd);
+/**
+ * scsi_eh_host_reset - send a host reset
+ * @work_q: list_head of commands still to be processed.
+ * @done_q: list_head for processed commands.
+ **/
+static int scsi_eh_host_reset(struct list_head *work_q,
+ struct list_head *done_q)
+{
+ int rtn;
+ struct list_head *lh, *lh_sf;
+ Scsi_Cmnd *scmd;
- if (rtn == SUCCESS) {
- for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
- if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)
- || channel != scmd->device->channel)
- continue;
- if (!scsi_eh_tur(scmd)) {
- rtn = scsi_eh_retry_cmd(scmd);
+ if (!list_empty(work_q)) {
+ scmd = list_entry(work_q->next,
+ struct scsi_cmnd, eh_entry);
- if (rtn == SUCCESS)
- scsi_eh_finish_cmd(scmd, shost);
- }
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending HRST\n"
+ , current->comm));
+
+ rtn = scsi_try_host_reset(scmd);
+ if (rtn == SUCCESS) {
+ list_for_each_safe(lh, lh_sf, work_q) {
+ scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+ if (!scmd->device->online || !scsi_eh_tur(scmd))
+ scsi_eh_finish_cmd(scmd, done_q);
}
+ } else {
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: HRST"
+ " failed\n",
+ current->comm));
}
-
}
- return shost->host_failed;
+ return list_empty(work_q);
}
/**
* scsi_eh_offline_sdevs - offline scsi devices that fail to recover
- * @sc_todo: a list of cmds that have failed.
- * @shost: scsi host being recovered.
+ * @work_q: list_head of commands still to be processed.
+ * @done_q: list_head for processed commands.
*
**/
-static void scsi_eh_offline_sdevs(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
+static void scsi_eh_offline_sdevs(struct list_head *work_q,
+ struct list_head *done_q)
{
+ struct list_head *lh, *lh_sf;
Scsi_Cmnd *scmd;
- for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
- if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR))
- continue;
-
+ list_for_each_safe(lh, lh_sf, work_q) {
+ scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
printk(KERN_INFO "scsi: Device offlined - not"
- " ready or command retry failed"
- " after error recovery: host"
+ " ready after error recovery: host"
" %d channel %d id %d lun %d\n",
- shost->host_no,
+ scmd->device->host->host_no,
scmd->device->channel,
scmd->device->id,
scmd->device->lun);
-
- if (scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_TIMEOUT))
- scmd->result |= (DRIVER_TIMEOUT << 24);
-
- scmd->device->online = 0;
- scsi_eh_finish_cmd(scmd, shost);
+ scmd->device->online = FALSE;
+ if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD)) {
+ /*
+ * FIXME: Handle lost cmds.
+ */
+ }
+ scsi_eh_finish_cmd(scmd, done_q);
}
return;
}
@@ -1459,6 +1485,8 @@
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: waking up host to restart\n",
__FUNCTION__));
+ shost->in_recovery = 0;
+
wake_up(&shost->host_wait);
/*
@@ -1482,6 +1510,55 @@
}
/**
+ * scsi_eh_ready_devs - check device ready state and recover if not.
+ * @shost: host to be recovered.
+ * @work_q: cmds still to recover; each step returns list_empty(work_q).
+ * @done_q: list_head for processed commands.
+ **/
+static void scsi_eh_ready_devs(struct Scsi_Host *shost,
+ struct list_head *work_q,
+ struct list_head *done_q)
+{
+ if (!scsi_eh_bus_device_reset(shost, work_q, done_q))
+ if (!scsi_eh_bus_reset(shost, work_q, done_q))
+ if (!scsi_eh_host_reset(work_q, done_q))
+ scsi_eh_offline_sdevs(work_q, done_q);
+}
+
+/**
+ * scsi_eh_flush_done_q - finish processed commands or retry them.
+ * @done_q: list_head of processed commands.
+ * Offline-device cmds are finished with DRIVER_TIMEOUT; others retry if allowed.
+ **/
+static void scsi_eh_flush_done_q(struct list_head *done_q)
+{
+ struct list_head *lh, *lh_sf;
+ Scsi_Cmnd *scmd;
+
+ list_for_each_safe(lh, lh_sf, done_q) {
+ scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
+ list_del_init(lh);
+ if (!scmd->device->online) {
+ scmd->result |= (DRIVER_TIMEOUT << 24);
+ } else {
+ if (++scmd->retries < scmd->allowed) {
+ SCSI_LOG_ERROR_RECOVERY(3,
+ printk("%s: flush retry"
+ " cmd: %p\n",
+ current->comm,
+ scmd));
+ scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
+ continue;
+ }
+ }
+ SCSI_LOG_ERROR_RECOVERY(3, printk("%s: flush finish"
+ " cmd: %p\n",
+ current->comm, scmd));
+ scsi_finish_command(scmd);
+ }
+}
+
+/**
* scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
* @shost: Host to unjam.
*
@@ -1506,60 +1583,21 @@
**/
static void scsi_unjam_host(struct Scsi_Host *shost)
{
- Scsi_Cmnd *sc_todo = NULL;
- Scsi_Cmnd *scmd;
-
- /*
- * Is this assert really ok anymore (andmike). Should we at least
- * be using spin_lock_unlocked.
- */
- ASSERT_LOCK(shost->host_lock, 0);
-
- scsi_eh_get_failed(&sc_todo, shost);
-
- if (scsi_eh_get_sense(sc_todo, shost))
- if (scsi_eh_abort_cmd(sc_todo, shost))
- if (scsi_eh_bus_device_reset(sc_todo, shost))
- if (scsi_eh_bus_host_reset(sc_todo, shost))
- scsi_eh_offline_sdevs(sc_todo, shost);
-
- BUG_ON(shost->host_failed);
+ unsigned long flags;
+ LIST_HEAD(eh_work_q);
+ LIST_HEAD(eh_done_q);
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_splice_init(&shost->eh_cmd_q, &eh_work_q);
+ spin_unlock_irqrestore(shost->host_lock, flags);
- /*
- * We are currently holding these things in a linked list - we
- * didn't put them in the bottom half queue because we wanted to
- * keep things quiet while we were working on recovery, and
- * passing them up to the top level could easily cause the top
- * level to try and queue something else again.
- *
- * start by marking that the host is no longer in error recovery.
- */
- shost->in_recovery = 0;
+ SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
- /*
- * take the list of commands, and stick them in the bottom half queue.
- * the current implementation of scsi_done will do this for us - if need
- * be we can create a special version of this function to do the
- * same job for us.
- */
- for (scmd = sc_todo; scmd; scmd = sc_todo) {
- sc_todo = scmd->bh_next;
- scmd->bh_next = NULL;
- /*
- * Oh, this is a vile hack. scsi_done() expects a timer
- * to be running on the command. If there isn't, it assumes
- * that the command has actually timed out, and a timer
- * handler is running. That may well be how we got into
- * this fix, but right now things are stable. We add
- * a timer back again so that we can report completion.
- * scsi_done() will immediately remove said timer from
- * the command, and then process it.
- */
- scsi_add_timer(scmd, 100, scsi_eh_times_out);
- scsi_done(scmd);
- }
+ if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
+ if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
+ scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
+ scsi_eh_flush_done_q(&eh_done_q);
}
/**
@@ -1597,7 +1635,8 @@
/*
* Wake up the thread that created us.
*/
- SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of scsi_eh_%d\n",shost->host_no));
+ SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of"
+ " scsi_eh_%d\n",shost->host_no));
complete(shost->eh_notify);
@@ -1607,7 +1646,9 @@
* away and die. This typically happens if the user is
* trying to unload a module.
*/
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d sleeping\n",shost->host_no));
+ SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
+ " scsi_eh_%d"
+ " sleeping\n",shost->host_no));
/*
* Note - we always use down_interruptible with the semaphore
@@ -1622,7 +1663,9 @@
if (shost->eh_kill)
break;
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d waking up\n",shost->host_no));
+ SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
+ " scsi_eh_%d waking"
+ " up\n",shost->host_no));
shost->eh_active = 1;
@@ -1650,7 +1693,8 @@
}
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d exiting\n",shost->host_no));
+ SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
+ " exiting\n",shost->host_no));
/*
* Make sure that nobody tries to wake us up again.
diff -Nru a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
--- a/drivers/scsi/scsi_lib.c Tue Feb 18 09:13:12 2003
+++ b/drivers/scsi/scsi_lib.c Tue Feb 18 09:13:12 2003
@@ -117,7 +117,7 @@
*/
if (reason == SCSI_MLQUEUE_HOST_BUSY)
host->host_blocked = host->max_host_blocked;
- else
+ else if (reason == SCSI_MLQUEUE_DEVICE_BUSY)
device->device_blocked = device->max_device_blocked;
/*
diff -Nru a/drivers/scsi/scsi_syms.c b/drivers/scsi/scsi_syms.c
--- a/drivers/scsi/scsi_syms.c Tue Feb 18 09:13:11 2003
+++ b/drivers/scsi/scsi_syms.c Tue Feb 18 09:13:11 2003
@@ -80,6 +80,7 @@
EXPORT_SYMBOL(scsi_slave_detach);
EXPORT_SYMBOL(scsi_device_get);
EXPORT_SYMBOL(scsi_device_put);
+EXPORT_SYMBOL(scsi_set_device_offline);
/*
* This symbol is for the highlevel drivers (e.g. sg) only.
reply other threads:[~2003-02-19 9:07 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20030219090919.GA2185@beaverton.ibm.com \
--to=andmike@us.ibm.com \
--cc=linux-scsi@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox