From: Michael Reed <mdr@sgi.com>
To: linux-scsi <linux-scsi@vger.kernel.org>
Cc: Jeremy Higdon <jeremy@sgi.com>
Subject: [RFC] Prevent infinite retries due to DID_RESET return status
Date: Mon, 11 Dec 2006 15:42:34 -0600 [thread overview]
Message-ID: <457DD0CA.9010509@sgi.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 1147 bytes --]
Due to a firmware mismatch between a host and target (names withheld to
protect the innocent?), the LLDD was returning DID_RESET for every
i/o command. This patch modifies the scsi layer to take into account
when a command that received DID_RESET was first issued, so that it
eventually gives up on the command instead of unconditionally reissuing
it forever. With this patch, on my test system,
the command receiving the constant DID_RESET times out after about
360 seconds.
The premise for this patch is that no command should have an infinite
lifetime. The impetus for this patch was a system which would not
reach a command prompt without disconnecting the storage from the
host.
The significant change in this patch is to call scsi_retry_command()
instead of scsi_requeue_command() if the command which receives a
DID_RESET did not complete any i/o (good_bytes==0). Unlike
scsi_requeue_command(), scsi_retry_command() does not release the command
and regenerate it, hence jiffies_at_alloc still reflects when the command
was first issued.
This patch is based upon 2.6.19. Thanks for taking the time to
look at this.
Mike
[-- Attachment #2: did_reset.patch --]
[-- Type: text/x-patch, Size: 6879 bytes --]
--- kdbu/drivers/scsi/scsi_priv.h 2006-10-09 01:58:19.000000000 -0500
+++ kdb/drivers/scsi/scsi_priv.h 2006-12-07 14:15:19.925332776 -0600
@@ -28,7 +28,7 @@ extern int scsi_dispatch_cmd(struct scsi
extern int scsi_setup_command_freelist(struct Scsi_Host *shost);
extern void scsi_destroy_command_freelist(struct Scsi_Host *shost);
extern void __scsi_done(struct scsi_cmnd *cmd);
-extern int scsi_retry_command(struct scsi_cmnd *cmd);
+extern int scsi_retry_command(struct scsi_cmnd *cmd, int reason);
#ifdef CONFIG_SCSI_LOGGING
void scsi_log_send(struct scsi_cmnd *cmd);
void scsi_log_completion(struct scsi_cmnd *cmd, int disposition);
--- kdbu/include/scsi/scsi.h 2006-10-31 21:08:47.000000000 -0600
+++ kdb/include/scsi/scsi.h 2006-12-07 14:13:09.188052974 -0600
@@ -353,6 +353,7 @@ struct scsi_lun {
#define SCSI_MLQUEUE_HOST_BUSY 0x1055
#define SCSI_MLQUEUE_DEVICE_BUSY 0x1056
#define SCSI_MLQUEUE_EH_RETRY 0x1057
+#define SCSI_MLQUEUE_DID_RESET 0x1058
/*
* Use these to separate status msg and our bytes
--- kdbu/drivers/scsi/scsi.c 2006-10-09 01:58:19.000000000 -0500
+++ kdb/drivers/scsi/scsi.c 2006-12-07 14:15:49.835794930 -0600
@@ -673,7 +673,7 @@ void __scsi_done(struct scsi_cmnd *cmd)
* level drivers should not become re-entrant as a result of
* this.
*/
-int scsi_retry_command(struct scsi_cmnd *cmd)
+int scsi_retry_command(struct scsi_cmnd *cmd, int reason)
{
/*
* Zero the sense information from the last time we tried
@@ -681,7 +681,7 @@ int scsi_retry_command(struct scsi_cmnd
*/
memset(cmd->sense_buffer, 0, sizeof(cmd->sense_buffer));
- return scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
+ return scsi_queue_insert(cmd, reason);
}
/*
--- kdbu/drivers/scsi/scsi_lib.c 2006-11-29 21:09:07.000000000 -0600
+++ kdb/drivers/scsi/scsi_lib.c 2006-12-11 14:22:52.756579311 -0600
@@ -65,6 +65,7 @@ static struct scsi_host_sg_pool scsi_sg_
#undef SP
static void scsi_run_queue(struct request_queue *q);
+static void scsi_release_buffers(struct scsi_cmnd *cmd);
/*
* Function: scsi_unprep_request()
@@ -100,10 +101,10 @@ static void scsi_unprep_request(struct r
*
* Returns: Nothing.
*
- * Notes: We do this for one of two cases. Either the host is busy
+ * Notes: We do this for one of three cases. 1) the host is busy
* and it cannot accept any more commands for the time being,
- * or the device returned QUEUE_FULL and can accept no more
- * commands.
+ * 2) the device returned QUEUE_FULL and can accept no more
+ * commands, or 3) the LLDD returned DID_RESET.
* Notes: This could be called either from an interrupt context or a
* normal process context.
*/
@@ -137,9 +138,11 @@ int scsi_queue_insert(struct scsi_cmnd *
/*
* Decrement the counters, since these commands are no longer
- * active on the host/device.
+ * active on the host/device. If the reason is SCSI_MLQUEUE_DID_RESET
+ * then scsi_device_unbusy() was previously called.
*/
- scsi_device_unbusy(device);
+ if (reason != SCSI_MLQUEUE_DID_RESET)
+ scsi_device_unbusy(device);
/*
* Requeue this command. It will go before all other commands
@@ -601,6 +604,7 @@ static void scsi_requeue_command(struct
struct request *req = cmd->request;
unsigned long flags;
+ scsi_release_buffers(cmd);
scsi_unprep_request(req);
spin_lock_irqsave(q->queue_lock, flags);
blk_requeue_request(q, req);
@@ -646,6 +650,7 @@ void scsi_run_host_queues(struct Scsi_Ho
* Lock status: Assumed that lock is not held upon entry.
*
* Returns: cmd if requeue required, NULL otherwise.
+ * If cmd is returned then its buffers have not been released.
*
* Notes: This is called for block device requests in order to
* mark some number of sectors as complete.
@@ -688,6 +693,7 @@ static struct scsi_cmnd *scsi_end_reques
}
}
+ scsi_release_buffers(cmd);
add_disk_randomness(req->rq_disk);
spin_lock_irqsave(q->queue_lock, flags);
@@ -786,6 +792,33 @@ static void scsi_release_buffers(struct
}
/*
+ * Function: scsi_command_expired()
+ *
+ * Purpose: Check a scsi command's age before retrying it.
+ *
+ * Arguments: cmd - command that we are checking for timeout.
+ *
+ * Returns: non-zero if command has exceeded its lifetime
+ * zero otherwise
+ *
+ * Notes: A command's lifetime is considered to be the number
+ * of (retries permitted plus one) * command timeout.
+ *
+ */
+static int scsi_command_expired(struct scsi_cmnd *cmd)
+{
+ int ret = 0;
+ unsigned long wait_for = (cmd->allowed + 1) * cmd->timeout_per_command;
+ if (time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
+ sdev_printk(KERN_ERR, cmd->device,
+ "timing out command, waited %lus\n",
+ wait_for/HZ);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
* Function: scsi_io_completion()
*
* Purpose: Completion processing for block device I/O requests.
@@ -824,8 +857,6 @@ void scsi_io_completion(struct scsi_cmnd
int sense_valid = 0;
int sense_deferred = 0;
- scsi_release_buffers(cmd);
-
if (result) {
sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
if (sense_valid)
@@ -961,9 +992,20 @@ void scsi_io_completion(struct scsi_cmnd
/* Third party bus reset or reset for error recovery
* reasons. Just retry the request and see what
* happens.
+ * If no data was transferred, just reissue this
+ * command. If data was transferred, regenerate
+ * the command to transfer only untransferred data.
*/
- scsi_requeue_command(q, cmd);
- return;
+ if (!good_bytes) {
+ if (!(scsi_command_expired(cmd))) {
+ scsi_retry_command(cmd, SCSI_MLQUEUE_DID_RESET);
+ return;
+ }
+ }
+ else {
+ scsi_requeue_command(q, cmd);
+ return;
+ }
}
if (result) {
if (!(req->cmd_flags & REQ_QUIET)) {
@@ -1358,17 +1400,12 @@ static void scsi_kill_request(struct req
static void scsi_softirq_done(struct request *rq)
{
struct scsi_cmnd *cmd = rq->completion_data;
- unsigned long wait_for = (cmd->allowed + 1) * cmd->timeout_per_command;
int disposition;
INIT_LIST_HEAD(&cmd->eh_entry);
disposition = scsi_decide_disposition(cmd);
- if (disposition != SUCCESS &&
- time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
- sdev_printk(KERN_ERR, cmd->device,
- "timing out command, waited %lus\n",
- wait_for/HZ);
+ if (disposition != SUCCESS && scsi_command_expired(cmd)) {
disposition = SUCCESS;
}
@@ -1379,7 +1416,7 @@ static void scsi_softirq_done(struct req
scsi_finish_command(cmd);
break;
case NEEDS_RETRY:
- scsi_retry_command(cmd);
+ scsi_retry_command(cmd, SCSI_MLQUEUE_EH_RETRY);
break;
case ADD_TO_MLQUEUE:
scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
next reply other threads:[~2006-12-11 21:43 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-12-11 21:42 Michael Reed [this message]
2007-01-02 12:15 ` [RFC] Prevent infinite retries due to DID_RESET return status Christoph Hellwig
2007-01-31 18:54 ` [PATCH 0/2] " Michael Reed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=457DD0CA.9010509@sgi.com \
--to=mdr@sgi.com \
--cc=jeremy@sgi.com \
--cc=linux-scsi@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox