* [PATCH 1/2] scsi: core: avoid leaving shost->last_reset with stale value if EH does not run
2021-10-25 14:39 [PATCH 0/2] Fix SCSI async abort handling when eh_deadline is active Ewan D. Milne
@ 2021-10-25 14:39 ` Ewan D. Milne
2021-10-28 3:29 ` Martin K. Petersen
2021-10-25 14:39 ` [PATCH 2/2] scsi: core: simplify control flow in scmd_eh_abort_handler Ewan D. Milne
1 sibling, 1 reply; 7+ messages in thread
From: Ewan D. Milne @ 2021-10-25 14:39 UTC (permalink / raw)
To: linux-scsi
The changes to issue the abort from the scmd->abort_work instead of the EH
thread introduced a problem if eh_deadline is used. If aborting the command(s)
is successful, and there are never any scmds added to the shost->eh_cmd_q,
there is no code path which will reset the ->last_reset value back to zero.
The effect of this is that after a successful abort with no EH thread activity,
a subsequent timeout, perhaps a long time later, might immediately be considered
past a user-set eh_deadline time, and the host will be reset with no attempt at
recovery.
Fix this by resetting ->last_reset back to zero in scmd_eh_abort_handler()
if it is determined that the EH thread will not run to do this.
Thanks to Gopinath Marappan for investigating this problem.
Reported-by: Gopinath Marappan <marappan_gopinath@emc.com>
Fixes: e494f6a72839 ("[SCSI] improved eh timeout handler")
Cc: stable@vger.kernel.org
Signed-off-by: Ewan D. Milne <emilne@redhat.com>
---
drivers/scsi/hosts.c | 1 +
drivers/scsi/scsi_error.c | 44 +++++++++++++++++++++++++++++++++++++++++---
drivers/scsi/scsi_lib.c | 1 +
include/scsi/scsi_cmnd.h | 2 +-
include/scsi/scsi_host.h | 1 +
5 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 3f6f14f..ff82f25 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -387,6 +387,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->shost_state = SHOST_CREATED;
INIT_LIST_HEAD(&shost->__devices);
INIT_LIST_HEAD(&shost->__targets);
+ INIT_LIST_HEAD(&shost->eh_abort_list);
INIT_LIST_HEAD(&shost->eh_cmd_q);
INIT_LIST_HEAD(&shost->starved_list);
init_waitqueue_head(&shost->host_wait);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index b6c86cc..9f4001e 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -151,9 +151,11 @@ scmd_eh_abort_handler(struct work_struct *work)
struct scsi_cmnd *scmd =
container_of(work, struct scsi_cmnd, abort_work.work);
struct scsi_device *sdev = scmd->device;
+ struct Scsi_Host *shost = sdev->host;
enum scsi_disposition rtn;
+ unsigned long flags;
- if (scsi_host_eh_past_deadline(sdev->host)) {
+ if (scsi_host_eh_past_deadline(shost)) {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_INFO, scmd,
"eh timeout, not aborting\n"));
@@ -161,10 +163,10 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_INFO, scmd,
"aborting command\n"));
- rtn = scsi_try_to_abort_cmd(sdev->host->hostt, scmd);
+ rtn = scsi_try_to_abort_cmd(shost->hostt, scmd);
if (rtn == SUCCESS) {
set_host_byte(scmd, DID_TIME_OUT);
- if (scsi_host_eh_past_deadline(sdev->host)) {
+ if (scsi_host_eh_past_deadline(shost)) {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_INFO, scmd,
"eh timeout, not retrying "
@@ -175,12 +177,42 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"retry aborted command\n"));
+
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
+
+ spin_unlock_irqrestore(shost->host_lock, flags);
+
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
return;
} else {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"finish aborted command\n"));
+
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
+
+ spin_unlock_irqrestore(shost->host_lock, flags);
+
scsi_finish_command(scmd);
return;
}
@@ -193,6 +225,10 @@ scmd_eh_abort_handler(struct work_struct *work)
}
}
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ spin_unlock_irqrestore(shost->host_lock, flags);
+
scsi_eh_scmd_add(scmd);
}
@@ -223,6 +259,8 @@ scsi_abort_command(struct scsi_cmnd *scmd)
spin_lock_irqsave(shost->host_lock, flags);
if (shost->eh_deadline != -1 && !shost->last_reset)
shost->last_reset = jiffies;
+ BUG_ON(!list_empty(&scmd->eh_entry));
+ list_add_tail(&scmd->eh_entry, &shost->eh_abort_list);
spin_unlock_irqrestore(shost->host_lock, flags);
scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5726738..05f8f6e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1143,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
cmd->sense_buffer = buf;
cmd->prot_sdb = prot;
cmd->flags = flags;
+ INIT_LIST_HEAD(&cmd->eh_entry);
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies_at_alloc;
cmd->retries = retries;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index eaf04c9..59afe87 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -68,7 +68,7 @@ struct scsi_pointer {
struct scsi_cmnd {
struct scsi_request req;
struct scsi_device *device;
- struct list_head eh_entry; /* entry for the host eh_cmd_q */
+ struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
struct delayed_work abort_work;
struct rcu_head rcu;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 7536370..1a02e58 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -556,6 +556,7 @@ struct Scsi_Host {
struct mutex scan_mutex;/* serialize scanning activity */
+ struct list_head eh_abort_list;
struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct completion * eh_action; /* Wait for specific actions on the
--
2.1.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH 2/2] scsi: core: simplify control flow in scmd_eh_abort_handler
2021-10-25 14:39 [PATCH 0/2] Fix SCSI async abort handling when eh_deadline is active Ewan D. Milne
2021-10-25 14:39 ` [PATCH 1/2] scsi: core: avoid leaving shost->last_reset with stale value if EH does not run Ewan D. Milne
@ 2021-10-25 14:39 ` Ewan D. Milne
2021-10-28 3:31 ` Martin K. Petersen
1 sibling, 1 reply; 7+ messages in thread
From: Ewan D. Milne @ 2021-10-25 14:39 UTC (permalink / raw)
To: linux-scsi
Simplify the nested conditionals in the function by using
a label for the error path, and remove duplicate code.
Signed-off-by: Ewan D. Milne <emilne@redhat.com>
---
drivers/scsi/scsi_error.c | 104 ++++++++++++++++++++--------------------------
1 file changed, 46 insertions(+), 58 deletions(-)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 9f4001e..e8248bb 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -159,72 +159,60 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_INFO, scmd,
"eh timeout, not aborting\n"));
- } else {
- SCSI_LOG_ERROR_RECOVERY(3,
+ goto out;
+ }
+
+ SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_INFO, scmd,
"aborting command\n"));
- rtn = scsi_try_to_abort_cmd(shost->hostt, scmd);
- if (rtn == SUCCESS) {
- set_host_byte(scmd, DID_TIME_OUT);
- if (scsi_host_eh_past_deadline(shost)) {
- SCSI_LOG_ERROR_RECOVERY(3,
- scmd_printk(KERN_INFO, scmd,
- "eh timeout, not retrying "
- "aborted command\n"));
- } else if (!scsi_noretry_cmd(scmd) &&
- scsi_cmd_retry_allowed(scmd) &&
- scsi_eh_should_retry_cmd(scmd)) {
- SCSI_LOG_ERROR_RECOVERY(3,
- scmd_printk(KERN_WARNING, scmd,
- "retry aborted command\n"));
-
- spin_lock_irqsave(shost->host_lock, flags);
- list_del_init(&scmd->eh_entry);
-
- /*
- * If the abort succeeds, and there is no further
- * EH action, clear the ->last_reset time.
- */
- if (list_empty(&shost->eh_abort_list) &&
- list_empty(&shost->eh_cmd_q))
- if (shost->eh_deadline != -1)
- shost->last_reset = 0;
-
- spin_unlock_irqrestore(shost->host_lock, flags);
-
- scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
- return;
- } else {
- SCSI_LOG_ERROR_RECOVERY(3,
- scmd_printk(KERN_WARNING, scmd,
- "finish aborted command\n"));
+ rtn = scsi_try_to_abort_cmd(shost->hostt, scmd);
+ if (rtn != SUCCESS) {
+ SCSI_LOG_ERROR_RECOVERY(3,
+ scmd_printk(KERN_INFO, scmd,
+ "cmd abort %s\n",
+ (rtn == FAST_IO_FAIL) ?
+ "not send" : "failed"));
+ goto out;
+ }
+ set_host_byte(scmd, DID_TIME_OUT);
+ if (scsi_host_eh_past_deadline(shost)) {
+ SCSI_LOG_ERROR_RECOVERY(3,
+ scmd_printk(KERN_INFO, scmd,
+ "eh timeout, not retrying "
+ "aborted command\n"));
+ goto out;
+ }
- spin_lock_irqsave(shost->host_lock, flags);
- list_del_init(&scmd->eh_entry);
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
- /*
- * If the abort succeeds, and there is no further
- * EH action, clear the ->last_reset time.
- */
- if (list_empty(&shost->eh_abort_list) &&
- list_empty(&shost->eh_cmd_q))
- if (shost->eh_deadline != -1)
- shost->last_reset = 0;
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
- spin_unlock_irqrestore(shost->host_lock, flags);
+ spin_unlock_irqrestore(shost->host_lock, flags);
- scsi_finish_command(scmd);
- return;
- }
- } else {
- SCSI_LOG_ERROR_RECOVERY(3,
- scmd_printk(KERN_INFO, scmd,
- "cmd abort %s\n",
- (rtn == FAST_IO_FAIL) ?
- "not send" : "failed"));
- }
+ if (!scsi_noretry_cmd(scmd) &&
+ scsi_cmd_retry_allowed(scmd) &&
+ scsi_eh_should_retry_cmd(scmd)) {
+ SCSI_LOG_ERROR_RECOVERY(3,
+ scmd_printk(KERN_WARNING, scmd,
+ "retry aborted command\n"));
+ scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
+ } else {
+ SCSI_LOG_ERROR_RECOVERY(3,
+ scmd_printk(KERN_WARNING, scmd,
+ "finish aborted command\n"));
+ scsi_finish_command(scmd);
}
+ return;
+out:
spin_lock_irqsave(shost->host_lock, flags);
list_del_init(&scmd->eh_entry);
spin_unlock_irqrestore(shost->host_lock, flags);
--
2.1.0
^ permalink raw reply related [flat|nested] 7+ messages in thread