* [PATCH 01/16] zfcp: do not wait for rports to become unblocked after host reset
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:14 ` [PATCH 02/16] bfa: Do not use scsi command to signal TMF status Hannes Reinecke
` (14 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke,
Benjamin Block
zfcp_scsi_eh_host_reset_handler() would call fc_block_rport() to
wait for all rports to become unblocked after host reset.
But after host reset it might happen that the port is gone, hence
fc_block_rport() might fail due to a missing port.
But that's a perfectly legal operation; on FC remote ports might
come and go.
So this patch removes the call to fc_block_rport() after host
reset. But with that rports may still be in blocked state after
host reset, so we need to return FAST_IO_FAIL from host reset
to avoid SCSI EH to fail commands prematurely if the rports
are still blocked.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
drivers/s390/scsi/zfcp_scsi.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index b2a8cd792266..b1df853e6f66 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -375,7 +375,7 @@ static int zfcp_scsi_eh_host_reset_handler(struct scsi_cmnd *scpnt)
{
struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(scpnt->device);
struct zfcp_adapter *adapter = zfcp_sdev->port->adapter;
- int ret = SUCCESS, fc_ret;
+ int ret = FAST_IO_FAIL;
if (!(adapter->connection_features & FSF_FEATURE_NPIV_MODE)) {
zfcp_erp_port_forced_reopen_all(adapter, 0, "schrh_p");
@@ -383,10 +383,6 @@ static int zfcp_scsi_eh_host_reset_handler(struct scsi_cmnd *scpnt)
}
zfcp_erp_adapter_reopen(adapter, 0, "schrh_1");
zfcp_erp_wait(adapter);
- fc_ret = fc_block_scsi_eh(scpnt);
- if (fc_ret)
- ret = fc_ret;
-
zfcp_dbf_scsi_eh("schrh_r", adapter, ~0, ret);
return ret;
}
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 02/16] bfa: Do not use scsi command to signal TMF status
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
2023-10-23 9:14 ` [PATCH 01/16] zfcp: do not wait for rports to become unblocked after host reset Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:14 ` [PATCH 03/16] aha152x: look for stuck command when resetting device Hannes Reinecke
` (13 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
The bfa driver hijacks the scsi command to signal the TMF status,
which will no longer work if the TMF handler will be converted.
So rework TMF handling to not use a scsi command but rather add
new TMF fields to the per-device structure.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/bfa/bfad_im.c | 112 ++++++++++++++++++++-----------------
drivers/scsi/bfa/bfad_im.h | 2 +
2 files changed, 63 insertions(+), 51 deletions(-)
diff --git a/drivers/scsi/bfa/bfad_im.c b/drivers/scsi/bfa/bfad_im.c
index a9d3d8562d3c..ef352fc59458 100644
--- a/drivers/scsi/bfa/bfad_im.c
+++ b/drivers/scsi/bfa/bfad_im.c
@@ -147,13 +147,13 @@ void
bfa_cb_tskim_done(void *bfad, struct bfad_tskim_s *dtsk,
enum bfi_tskim_status tsk_status)
{
- struct scsi_cmnd *cmnd = (struct scsi_cmnd *)dtsk;
+ struct bfad_itnim_data_s *itnim_data =
+ (struct bfad_itnim_data_s *)dtsk;
wait_queue_head_t *wq;
- bfad_priv(cmnd)->status |= tsk_status << 1;
- set_bit(IO_DONE_BIT, &bfad_priv(cmnd)->status);
- wq = bfad_priv(cmnd)->wq;
- bfad_priv(cmnd)->wq = NULL;
+ itnim_data->tmf_status |= tsk_status << 1;
+ set_bit(IO_DONE_BIT, &itnim_data->tmf_status);
+ wq = itnim_data->tmf_wq;
if (wq)
wake_up(wq);
@@ -238,15 +238,16 @@ bfad_im_abort_handler(struct scsi_cmnd *cmnd)
}
static bfa_status_t
-bfad_im_target_reset_send(struct bfad_s *bfad, struct scsi_cmnd *cmnd,
- struct bfad_itnim_s *itnim)
+bfad_im_target_reset_send(struct bfad_s *bfad,
+ struct bfad_itnim_data_s *itnim_data)
{
+ struct bfad_itnim_s *itnim = itnim_data->itnim;
struct bfa_tskim_s *tskim;
struct bfa_itnim_s *bfa_itnim;
bfa_status_t rc = BFA_STATUS_OK;
struct scsi_lun scsilun;
- tskim = bfa_tskim_alloc(&bfad->bfa, (struct bfad_tskim_s *) cmnd);
+ tskim = bfa_tskim_alloc(&bfad->bfa, (struct bfad_tskim_s *) itnim_data);
if (!tskim) {
BFA_LOG(KERN_ERR, bfad, bfa_log_level,
"target reset, fail to allocate tskim\n");
@@ -254,12 +255,6 @@ bfad_im_target_reset_send(struct bfad_s *bfad, struct scsi_cmnd *cmnd,
goto out;
}
- /*
- * Set host_scribble to NULL to avoid aborting a task command if
- * happens.
- */
- cmnd->host_scribble = NULL;
- bfad_priv(cmnd)->status = 0;
bfa_itnim = bfa_fcs_itnim_get_halitn(&itnim->fcs_itnim);
/*
* bfa_itnim can be NULL if the port gets disconnected and the bfa
@@ -290,10 +285,11 @@ bfad_im_target_reset_send(struct bfad_s *bfad, struct scsi_cmnd *cmnd,
static int
bfad_im_reset_lun_handler(struct scsi_cmnd *cmnd)
{
- struct Scsi_Host *shost = cmnd->device->host;
+ struct scsi_device *sdev = cmnd->device;
+ struct Scsi_Host *shost = sdev->host;
struct bfad_im_port_s *im_port =
(struct bfad_im_port_s *) shost->hostdata[0];
- struct bfad_itnim_data_s *itnim_data = cmnd->device->hostdata;
+ struct bfad_itnim_data_s *itnim_data = sdev->hostdata;
struct bfad_s *bfad = im_port->bfad;
struct bfa_tskim_s *tskim;
struct bfad_itnim_s *itnim;
@@ -305,14 +301,20 @@ bfad_im_reset_lun_handler(struct scsi_cmnd *cmnd)
struct scsi_lun scsilun;
spin_lock_irqsave(&bfad->bfad_lock, flags);
+ if (itnim_data->tmf_wq) {
+ BFA_LOG(KERN_ERR, bfad, bfa_log_level,
+ "LUN reset, TMF already active");
+ spin_unlock_irqrestore(&bfad->bfad_lock, flags);
+ rc = FAILED;
+ goto out;
+ }
itnim = itnim_data->itnim;
if (!itnim) {
spin_unlock_irqrestore(&bfad->bfad_lock, flags);
rc = FAILED;
goto out;
}
-
- tskim = bfa_tskim_alloc(&bfad->bfa, (struct bfad_tskim_s *) cmnd);
+ tskim = bfa_tskim_alloc(&bfad->bfa, (struct bfad_tskim_s *) itnim_data);
if (!tskim) {
BFA_LOG(KERN_ERR, bfad, bfa_log_level,
"LUN reset, fail to allocate tskim");
@@ -321,13 +323,8 @@ bfad_im_reset_lun_handler(struct scsi_cmnd *cmnd)
goto out;
}
- /*
- * Set host_scribble to NULL to avoid aborting a task command
- * if happens.
- */
- cmnd->host_scribble = NULL;
- bfad_priv(cmnd)->wq = &wq;
- bfad_priv(cmnd)->status = 0;
+ itnim_data->tmf_wq = &wq;
+ itnim_data->tmf_status = 0;
bfa_itnim = bfa_fcs_itnim_get_halitn(&itnim->fcs_itnim);
/*
* bfa_itnim can be NULL if the port gets disconnected and the bfa
@@ -342,14 +339,15 @@ bfad_im_reset_lun_handler(struct scsi_cmnd *cmnd)
rc = FAILED;
goto out;
}
- int_to_scsilun(cmnd->device->lun, &scsilun);
+ int_to_scsilun(sdev->lun, &scsilun);
bfa_tskim_start(tskim, bfa_itnim, scsilun,
FCP_TM_LUN_RESET, BFAD_LUN_RESET_TMO);
spin_unlock_irqrestore(&bfad->bfad_lock, flags);
- wait_event(wq, test_bit(IO_DONE_BIT, &bfad_priv(cmnd)->status));
+ wait_event(wq, test_bit(IO_DONE_BIT, &itnim_data->tmf_status));
- task_status = bfad_priv(cmnd)->status >> 1;
+ itnim_data->tmf_wq = NULL;
+ task_status = itnim_data->tmf_status >> 1;
if (task_status != BFI_TSKIM_STS_OK) {
BFA_LOG(KERN_ERR, bfad, bfa_log_level,
"LUN reset failure, status: %d\n", task_status);
@@ -368,35 +366,47 @@ bfad_im_reset_target_handler(struct scsi_cmnd *cmnd)
{
struct Scsi_Host *shost = cmnd->device->host;
struct scsi_target *starget = scsi_target(cmnd->device);
+ struct fc_rport *rport = starget_to_rport(starget);
struct bfad_im_port_s *im_port =
(struct bfad_im_port_s *) shost->hostdata[0];
- struct bfad_s *bfad = im_port->bfad;
- struct bfad_itnim_s *itnim;
- unsigned long flags;
- u32 rc, rtn = FAILED;
+ struct bfad_s *bfad = im_port->bfad;
+ struct bfad_itnim_data_s *itnim_data;
+ struct bfad_itnim_s *itnim;
+ unsigned long flags;
+ u32 rc, rtn = FAILED;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
enum bfi_tskim_status task_status;
spin_lock_irqsave(&bfad->bfad_lock, flags);
- itnim = bfad_get_itnim(im_port, starget->id);
- if (itnim) {
- bfad_priv(cmnd)->wq = &wq;
- rc = bfad_im_target_reset_send(bfad, cmnd, itnim);
- if (rc == BFA_STATUS_OK) {
- /* wait target reset to complete */
- spin_unlock_irqrestore(&bfad->bfad_lock, flags);
- wait_event(wq, test_bit(IO_DONE_BIT,
- &bfad_priv(cmnd)->status));
- spin_lock_irqsave(&bfad->bfad_lock, flags);
-
- task_status = bfad_priv(cmnd)->status >> 1;
- if (task_status != BFI_TSKIM_STS_OK)
- BFA_LOG(KERN_ERR, bfad, bfa_log_level,
- "target reset failure,"
- " status: %d\n", task_status);
- else
- rtn = SUCCESS;
- }
+ if (!rport->dd_data) {
+ spin_unlock_irqrestore(&bfad->bfad_lock, flags);
+ return rtn;
+ }
+ itnim_data = rport->dd_data;
+ if (itnim_data->tmf_wq) {
+ BFA_LOG(KERN_ERR, bfad, bfa_log_level,
+ "target reset failed, TMF already active");
+ spin_unlock_irqrestore(&bfad->bfad_lock, flags);
+ return rtn;
+ }
+ itnim = itnim_data->itnim;
+ itnim_data->tmf_wq = &wq;
+ itnim_data->tmf_status = 0;
+ rc = bfad_im_target_reset_send(bfad, itnim_data);
+ if (rc == BFA_STATUS_OK) {
+ /* wait target reset to complete */
+ spin_unlock_irqrestore(&bfad->bfad_lock, flags);
+ wait_event(wq, test_bit(IO_DONE_BIT,
+ &itnim_data->tmf_status));
+ spin_lock_irqsave(&bfad->bfad_lock, flags);
+
+ task_status = itnim_data->tmf_status >> 1;
+ if (task_status != BFI_TSKIM_STS_OK)
+ BFA_LOG(KERN_ERR, bfad, bfa_log_level,
+ "target reset failure,"
+ " status: %d\n", task_status);
+ else
+ rtn = SUCCESS;
}
spin_unlock_irqrestore(&bfad->bfad_lock, flags);
diff --git a/drivers/scsi/bfa/bfad_im.h b/drivers/scsi/bfa/bfad_im.h
index 4353feedf76a..48e8c12969c9 100644
--- a/drivers/scsi/bfa/bfad_im.h
+++ b/drivers/scsi/bfa/bfad_im.h
@@ -61,6 +61,8 @@ static inline struct bfad_cmd_priv *bfad_priv(struct scsi_cmnd *cmd)
struct bfad_itnim_data_s {
struct bfad_itnim_s *itnim;
+ wait_queue_head_t *tmf_wq;
+ unsigned long tmf_status;
};
struct bfad_im_port_s {
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 03/16] aha152x: look for stuck command when resetting device
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
2023-10-23 9:14 ` [PATCH 01/16] zfcp: do not wait for rports to become unblocked after host reset Hannes Reinecke
2023-10-23 9:14 ` [PATCH 02/16] bfa: Do not use scsi command to signal TMF status Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:14 ` [PATCH 04/16] a1000u2w: do not rely on the command for inia100_device_reset() Hannes Reinecke
` (12 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
The LLDD needs a command to send the reset with, so look at the
list of outstanding commands to get one.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/aha152x.c | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index 055adb349b0e..936c9f5c6f23 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -1070,24 +1070,28 @@ static int aha152x_abort(struct scsi_cmnd *SCpnt)
*/
static int aha152x_device_reset(struct scsi_cmnd * SCpnt)
{
- struct Scsi_Host *shpnt = SCpnt->device->host;
+ struct scsi_device *sdev = SCpnt->device;
+ struct Scsi_Host *shpnt = sdev->host;
DECLARE_COMPLETION(done);
int ret, issued, disconnected;
- unsigned char old_cmd_len = SCpnt->cmd_len;
+ unsigned char old_cmd_len;
unsigned long flags;
unsigned long timeleft;
- if(CURRENT_SC==SCpnt) {
- scmd_printk(KERN_ERR, SCpnt, "cannot reset current device\n");
- return FAILED;
- }
-
DO_LOCK(flags);
- issued = remove_SC(&ISSUE_SC, SCpnt) == NULL;
- disconnected = issued && remove_SC(&DISCONNECTED_SC, SCpnt);
+ /* Look for the stuck command */
+ SCpnt = remove_lun_SC(&ISSUE_SC, sdev->id, sdev->lun);
+ if (SCpnt)
+ issued = 1;
+ else
+ SCpnt = remove_lun_SC(&DISCONNECTED_SC, sdev->id, sdev->lun);
+ if (!issued && SCpnt)
+ disconnected = 1;
DO_UNLOCK(flags);
-
- SCpnt->cmd_len = 0;
+ if (!SCpnt)
+ return FAILED;
+ old_cmd_len = SCpnt->cmd_len;
+ SCpnt->cmd_len = 0;
aha152x_internal_queue(SCpnt, &done, resetting);
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 04/16] a1000u2w: do not rely on the command for inia100_device_reset()
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (2 preceding siblings ...)
2023-10-23 9:14 ` [PATCH 03/16] aha152x: look for stuck command when resetting device Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:14 ` [PATCH 05/16] fas216: Rework device reset to not rely on SCSI command pointer Hannes Reinecke
` (11 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
Use the scsi device as argument to orc_device_reset() instead
of relying on the passed in scsi command.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/a100u2w.c | 46 +++++++++++-------------------------------
1 file changed, 12 insertions(+), 34 deletions(-)
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index b95147fb18b0..ab57bb8f6850 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -585,46 +585,26 @@ static int orc_reset_scsi_bus(struct orc_host * host)
/**
* orc_device_reset - device reset handler
* @host: host to reset
- * @cmd: command causing the reset
- * @target: target device
+ * @sdev: target device
*
* Reset registers, reset a hanging bus and kill active and disconnected
* commands for target w/o soft reset
*/
-static int orc_device_reset(struct orc_host * host, struct scsi_cmnd *cmd, unsigned int target)
+static int orc_device_reset(struct orc_host * host, struct scsi_device *sdev)
{ /* I need Host Control Block Information */
struct orc_scb *scb;
struct orc_extended_scb *escb;
- struct orc_scb *host_scb;
- u8 i;
unsigned long flags;
spin_lock_irqsave(&(host->allocation_lock), flags);
scb = (struct orc_scb *) NULL;
escb = (struct orc_extended_scb *) NULL;
- /* setup scatter list address with one buffer */
- host_scb = host->scb_virt;
-
/* FIXME: is this safe if we then fail to issue the reset or race
a completion ? */
init_alloc_map(host);
- /* Find the scb corresponding to the command */
- for (i = 0; i < ORC_MAXQUEUE; i++) {
- escb = host_scb->escb;
- if (host_scb->status && escb->srb == cmd)
- break;
- host_scb++;
- }
-
- if (i == ORC_MAXQUEUE) {
- printk(KERN_ERR "Unable to Reset - No SCB Found\n");
- spin_unlock_irqrestore(&(host->allocation_lock), flags);
- return FAILED;
- }
-
/* Allocate a new SCB for the reset command to the firmware */
if ((scb = __orc_alloc_scb(host)) == NULL) {
/* Can't happen.. */
@@ -635,7 +615,7 @@ static int orc_device_reset(struct orc_host * host, struct scsi_cmnd *cmd, unsig
/* Reset device is handled by the firmware, we fill in an SCB and
fire it at the controller, it does the rest */
scb->opcode = ORC_BUSDEVRST;
- scb->target = target;
+ scb->target = sdev->id;
scb->hastat = 0;
scb->tastat = 0;
scb->status = 0x0;
@@ -645,8 +625,8 @@ static int orc_device_reset(struct orc_host * host, struct scsi_cmnd *cmd, unsig
scb->xferlen = cpu_to_le32(0);
scb->sg_len = cpu_to_le32(0);
+ escb = scb->escb;
escb->srb = NULL;
- escb->srb = cmd;
orc_exec_scb(host, scb); /* Start execute SCB */
spin_unlock_irqrestore(&host->allocation_lock, flags);
return SUCCESS;
@@ -971,7 +951,7 @@ static int inia100_device_reset(struct scsi_cmnd * cmd)
{ /* I need Host Control Block Information */
struct orc_host *host;
host = (struct orc_host *) cmd->device->host->hostdata;
- return orc_device_reset(host, cmd, scmd_id(cmd));
+ return orc_device_reset(host, cmd->device);
}
@@ -991,11 +971,7 @@ static void inia100_scb_handler(struct orc_host *host, struct orc_scb *scb)
struct orc_extended_scb *escb;
escb = scb->escb;
- if ((cmd = (struct scsi_cmnd *) escb->srb) == NULL) {
- printk(KERN_ERR "inia100_scb_handler: SRB pointer is empty\n");
- orc_release_scb(host, scb); /* Release SCB for current channel */
- return;
- }
+ cmd = (struct scsi_cmnd *)escb->srb;
escb->srb = NULL;
switch (scb->hastat) {
@@ -1033,13 +1009,15 @@ static void inia100_scb_handler(struct orc_host *host, struct orc_scb *scb)
break;
}
- if (scb->tastat == 2) { /* Check condition */
+ if (cmd && scb->tastat == 2) { /* Check condition */
memcpy((unsigned char *) &cmd->sense_buffer[0],
(unsigned char *) &escb->sglist[0], SENSE_SIZE);
}
- cmd->result = scb->tastat | (scb->hastat << 16);
- scsi_dma_unmap(cmd);
- scsi_done(cmd); /* Notify system DONE */
+ if (cmd) {
+ cmd->result = scb->tastat | (scb->hastat << 16);
+ scsi_dma_unmap(cmd);
+ scsi_done(cmd); /* Notify system DONE */
+ }
orc_release_scb(host, scb); /* Release SCB for current channel */
}
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 05/16] fas216: Rework device reset to not rely on SCSI command pointer
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (3 preceding siblings ...)
2023-10-23 9:14 ` [PATCH 04/16] a1000u2w: do not rely on the command for inia100_device_reset() Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:14 ` [PATCH 06/16] xen-scsifront: add scsi device as argument to scsifront_do_request() Hannes Reinecke
` (10 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
The device reset code should not rely on the SCSI command pointer;
it will be going away with the device reset handler rework.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/arm/fas216.c | 39 +++++++++++++++++++--------------------
1 file changed, 19 insertions(+), 20 deletions(-)
diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c
index 4ce0b2d73614..e6289c6af5ef 100644
--- a/drivers/scsi/arm/fas216.c
+++ b/drivers/scsi/arm/fas216.c
@@ -1985,7 +1985,6 @@ static void fas216_devicereset_done(FAS216_Info *info, struct scsi_cmnd *SCpnt,
{
fas216_log(info, LOG_ERROR, "fas216 device reset complete");
- info->rstSCpnt = NULL;
info->rst_dev_status = 1;
wake_up(&info->eh_wait);
}
@@ -2143,12 +2142,12 @@ static void fas216_done(FAS216_Info *info, unsigned int result)
fas216_checkmagic(info);
- if (!info->SCpnt)
+ if (!info->SCpnt && info->rst_dev_status)
goto no_command;
SCpnt = info->SCpnt;
info->SCpnt = NULL;
- info->scsi.phase = PHASE_IDLE;
+ info->scsi.phase = PHASE_IDLE;
if (info->scsi.aborting) {
fas216_log(info, 0, "uncaught abort - returning DID_ABORT");
@@ -2160,7 +2159,7 @@ static void fas216_done(FAS216_Info *info, unsigned int result)
* Sanity check the completion - if we have zero bytes left
* to transfer, we should not have a valid pointer.
*/
- if (info->scsi.SCp.ptr && info->scsi.SCp.this_residual == 0) {
+ if (SCpnt && info->scsi.SCp.ptr && info->scsi.SCp.this_residual == 0) {
scmd_printk(KERN_INFO, SCpnt,
"zero bytes left to transfer, but buffer pointer still valid: ptr=%p len=%08x\n",
info->scsi.SCp.ptr, info->scsi.SCp.this_residual);
@@ -2173,12 +2172,18 @@ static void fas216_done(FAS216_Info *info, unsigned int result)
* the sense information, fas216_kick will re-assert the busy
* status.
*/
- info->device[SCpnt->device->id].parity_check = 0;
- clear_bit(SCpnt->device->id * 8 +
- (u8)(SCpnt->device->lun & 0x7), info->busyluns);
-
- fn = (void (*)(FAS216_Info *, struct scsi_cmnd *, unsigned int))SCpnt->host_scribble;
- fn(info, SCpnt, result);
+ if (SCpnt) {
+ info->device[SCpnt->device->id].parity_check = 0;
+ clear_bit(SCpnt->device->id * 8 +
+ (u8)(SCpnt->device->lun & 0x7), info->busyluns);
+ }
+ if (!info->rst_dev_status) {
+ info->rst_dev_status = 1;
+ wake_up(&info->eh_wait);
+ } else {
+ fn = (void (*)(FAS216_Info *, struct scsi_cmnd *, unsigned int))SCpnt->host_scribble;
+ fn(info, SCpnt, result);
+ }
if (info->scsi.irq) {
spin_lock_irqsave(&info->host_lock, flags);
@@ -2478,9 +2483,10 @@ int fas216_eh_abort(struct scsi_cmnd *SCpnt)
*/
int fas216_eh_device_reset(struct scsi_cmnd *SCpnt)
{
- FAS216_Info *info = (FAS216_Info *)SCpnt->device->host->hostdata;
+ struct scsi_device *sdev = SCpnt->device;
+ FAS216_Info *info = (FAS216_Info *)sdev->host->hostdata;
unsigned long flags;
- int i, res = FAILED, target = SCpnt->device->id;
+ int i, res = FAILED, target = sdev->id;
fas216_log(info, LOG_ERROR, "device reset for target %d", target);
@@ -2494,7 +2500,7 @@ int fas216_eh_device_reset(struct scsi_cmnd *SCpnt)
* and we need a bus reset.
*/
if (info->SCpnt && !info->scsi.disconnectable &&
- info->SCpnt->device->id == SCpnt->device->id)
+ info->SCpnt->device->id == sdev->id)
break;
/*
@@ -2512,14 +2518,7 @@ int fas216_eh_device_reset(struct scsi_cmnd *SCpnt)
for (i = 0; i < 8; i++)
clear_bit(target * 8 + i, info->busyluns);
- /*
- * Hijack this SCSI command structure to send
- * a bus device reset message to this device.
- */
- SCpnt->host_scribble = (void *)fas216_devicereset_done;
-
info->rst_dev_status = 0;
- info->rstSCpnt = SCpnt;
if (info->scsi.phase == PHASE_IDLE)
fas216_kick(info);
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 06/16] xen-scsifront: add scsi device as argument to scsifront_do_request()
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (4 preceding siblings ...)
2023-10-23 9:14 ` [PATCH 05/16] fas216: Rework device reset to not rely on SCSI command pointer Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:14 ` [PATCH 07/16] xen-scsifront: rework scsifront_action_handler() Hannes Reinecke
` (9 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
Add scsi device as argument to scsifront_do_request() so that it
will be possible to call it with a NULL command pointer.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/xen-scsifront.c | 31 ++++++++++++++++++++-----------
1 file changed, 20 insertions(+), 11 deletions(-)
diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c
index 9ec55ddc1204..a0c13200d53a 100644
--- a/drivers/scsi/xen-scsifront.c
+++ b/drivers/scsi/xen-scsifront.c
@@ -178,7 +178,8 @@ static void scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id)
scsifront_wake_up(info);
}
-static int scsifront_do_request(struct vscsifrnt_info *info,
+static int scsifront_do_request(struct scsi_device *sdev,
+ struct vscsifrnt_info *info,
struct vscsifrnt_shadow *shadow)
{
struct vscsiif_front_ring *ring = &(info->ring);
@@ -205,17 +206,25 @@ static int scsifront_do_request(struct vscsifrnt_info *info,
ring_req->ref_rqid = shadow->ref_rqid;
ring_req->nr_segments = shadow->nr_segments;
- ring_req->id = sc->device->id;
- ring_req->lun = sc->device->lun;
- ring_req->channel = sc->device->channel;
- ring_req->cmd_len = sc->cmd_len;
+ ring_req->id = sdev->id;
+ ring_req->lun = sdev->lun;
+ ring_req->channel = sdev->channel;
- BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE);
+ if (sc) {
+ ring_req->cmd_len = sc->cmd_len;
- memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
+ BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE);
- ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction;
- ring_req->timeout_per_command = scsi_cmd_to_rq(sc)->timeout / HZ;
+ memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
+
+ ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction;
+ ring_req->timeout_per_command =
+ scsi_cmd_to_rq(sc)->timeout / HZ;
+ } else {
+ ring_req->cmd_len = VSCSIIF_MAX_COMMAND_SIZE;
+ memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE);
+ ring_req->sc_data_direction = DMA_NONE;
+ }
for (i = 0; i < (shadow->nr_segments & ~VSCSIIF_SG_GRANT); i++)
ring_req->seg[i] = shadow->seg[i];
@@ -637,7 +646,7 @@ static int scsifront_queuecommand(struct Scsi_Host *shost,
return 0;
}
- if (scsifront_do_request(info, shadow)) {
+ if (scsifront_do_request(sc->device, info, shadow)) {
scsifront_gnttab_done(info, shadow);
goto busy;
}
@@ -685,7 +694,7 @@ static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act)
if (scsifront_enter(info))
goto fail;
- if (!scsifront_do_request(info, shadow))
+ if (!scsifront_do_request(sc->device, info, shadow))
break;
scsifront_return(info);
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 07/16] xen-scsifront: rework scsifront_action_handler()
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (5 preceding siblings ...)
2023-10-23 9:14 ` [PATCH 06/16] xen-scsifront: add scsi device as argument to scsifront_do_request() Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:14 ` [PATCH 08/16] libiscsi: use cls_session as argument for target and session reset Hannes Reinecke
` (8 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
Rework scsifront_action_handler() to add the SCSI device as the
first argument, and select between abort and device reset by
checking whether the scsi_cmnd argument is NULL.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/xen-scsifront.c | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c
index a0c13200d53a..5a6cb582e5eb 100644
--- a/drivers/scsi/xen-scsifront.c
+++ b/drivers/scsi/xen-scsifront.c
@@ -668,11 +668,12 @@ static int scsifront_queuecommand(struct Scsi_Host *shost,
* We have to wait until an answer is returned. This answer contains the
* result to be returned to the requestor.
*/
-static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act)
+static int scsifront_action_handler(struct scsi_device *sdev,
+ struct scsi_cmnd *sc)
{
- struct Scsi_Host *host = sc->device->host;
+ struct Scsi_Host *host = sdev->host;
struct vscsifrnt_info *info = shost_priv(host);
- struct vscsifrnt_shadow *shadow, *s = scsi_cmd_priv(sc);
+ struct vscsifrnt_shadow *shadow;
int err = 0;
if (info->host_active == STATE_ERROR)
@@ -682,10 +683,14 @@ static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act)
if (!shadow)
return FAILED;
- shadow->act = act;
+ shadow->act = sc ? VSCSIIF_ACT_SCSI_ABORT : VSCSIIF_ACT_SCSI_RESET;
shadow->rslt_reset = RSLT_RESET_WAITING;
shadow->sc = sc;
- shadow->ref_rqid = s->rqid;
+ if (sc) {
+ struct vscsifrnt_shadow *s = scsi_cmd_priv(sc);
+
+ shadow->ref_rqid = s->rqid;
+ }
init_waitqueue_head(&shadow->wq_reset);
spin_lock_irq(host->host_lock);
@@ -735,13 +740,13 @@ static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act)
static int scsifront_eh_abort_handler(struct scsi_cmnd *sc)
{
pr_debug("%s\n", __func__);
- return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_ABORT);
+ return scsifront_action_handler(sc->device, sc);
}
static int scsifront_dev_reset_handler(struct scsi_cmnd *sc)
{
pr_debug("%s\n", __func__);
- return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_RESET);
+ return scsifront_action_handler(sc->device, NULL);
}
static int scsifront_sdev_configure(struct scsi_device *sdev)
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 08/16] libiscsi: use cls_session as argument for target and session reset
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (6 preceding siblings ...)
2023-10-23 9:14 ` [PATCH 07/16] xen-scsifront: rework scsifront_action_handler() Hannes Reinecke
@ 2023-10-23 9:14 ` Hannes Reinecke
2023-10-23 9:15 ` [PATCH 09/16] scsi_transport_iscsi: use session as argument for iscsi_block_scsi_eh() Hannes Reinecke
` (7 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:14 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke,
Mike Christie
iscsi_eh_target_reset() and iscsi_eh_session_reset() only depend
on the cls_session, so use that as an argument.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mike Christie <michael.christie@oracle.com>
---
drivers/scsi/be2iscsi/be_main.c | 10 +++++++++-
drivers/scsi/libiscsi.c | 25 +++++++++++--------------
include/scsi/libiscsi.h | 2 +-
3 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index e48f14ad6dfd..441ad2ebc5d5 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -385,6 +385,14 @@ static int beiscsi_eh_device_reset(struct scsi_cmnd *sc)
return rc;
}
+static int beiscsi_eh_session_reset(struct scsi_cmnd *sc)
+{
+ struct iscsi_cls_session *cls_session;
+
+ cls_session = starget_to_session(scsi_target(sc->device));
+ return iscsi_eh_session_reset(cls_session);
+}
+
/*------------------- PCI Driver operations and data ----------------- */
static const struct pci_device_id beiscsi_pci_id_table[] = {
{ PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID1) },
@@ -408,7 +416,7 @@ static const struct scsi_host_template beiscsi_sht = {
.eh_timed_out = iscsi_eh_cmd_timed_out,
.eh_abort_handler = beiscsi_eh_abort,
.eh_device_reset_handler = beiscsi_eh_device_reset,
- .eh_target_reset_handler = iscsi_eh_session_reset,
+ .eh_target_reset_handler = beiscsi_eh_session_reset,
.shost_groups = beiscsi_groups,
.sg_tablesize = BEISCSI_SGLIST_ELEMENTS,
.can_queue = BE2_IO_DEPTH,
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 0fda8905eabd..3b9e1c35936e 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -2595,18 +2595,16 @@ EXPORT_SYMBOL_GPL(iscsi_session_recovery_timedout);
/**
* iscsi_eh_session_reset - drop session and attempt relogin
- * @sc: scsi command
+ * @cls_session: class session to reset
*
* This function will wait for a relogin, session termination from
* userspace, or a recovery/replacement timeout.
*/
-int iscsi_eh_session_reset(struct scsi_cmnd *sc)
+int iscsi_eh_session_reset(struct iscsi_cls_session *cls_session)
{
- struct iscsi_cls_session *cls_session;
struct iscsi_session *session;
struct iscsi_conn *conn;
- cls_session = starget_to_session(scsi_target(sc->device));
session = cls_session->dd_data;
mutex_lock(&session->eh_mutex);
@@ -2653,7 +2651,7 @@ int iscsi_eh_session_reset(struct scsi_cmnd *sc)
}
EXPORT_SYMBOL_GPL(iscsi_eh_session_reset);
-static void iscsi_prep_tgt_reset_pdu(struct scsi_cmnd *sc, struct iscsi_tm *hdr)
+static void iscsi_prep_tgt_reset_pdu(struct iscsi_tm *hdr)
{
memset(hdr, 0, sizeof(*hdr));
hdr->opcode = ISCSI_OP_SCSI_TMFUNC | ISCSI_OP_IMMEDIATE;
@@ -2664,23 +2662,20 @@ static void iscsi_prep_tgt_reset_pdu(struct scsi_cmnd *sc, struct iscsi_tm *hdr)
/**
* iscsi_eh_target_reset - reset target
- * @sc: scsi command
+ * @cls_session: class session to reset
*
* This will attempt to send a warm target reset.
*/
-static int iscsi_eh_target_reset(struct scsi_cmnd *sc)
+static int iscsi_eh_target_reset(struct iscsi_cls_session *cls_session)
{
- struct iscsi_cls_session *cls_session;
struct iscsi_session *session;
struct iscsi_conn *conn;
struct iscsi_tm *hdr;
int rc = FAILED;
- cls_session = starget_to_session(scsi_target(sc->device));
session = cls_session->dd_data;
- ISCSI_DBG_EH(session, "tgt Reset [sc %p tgt %s]\n", sc,
- session->targetname);
+ ISCSI_DBG_EH(session, "tgt Reset [tgt %s]\n", session->targetname);
mutex_lock(&session->eh_mutex);
spin_lock_bh(&session->frwd_lock);
@@ -2698,7 +2693,7 @@ static int iscsi_eh_target_reset(struct scsi_cmnd *sc)
session->tmf_state = TMF_QUEUED;
hdr = &session->tmhdr;
- iscsi_prep_tgt_reset_pdu(sc, hdr);
+ iscsi_prep_tgt_reset_pdu(hdr);
if (iscsi_exec_task_mgmt_fn(conn, hdr, session->age,
session->tgt_reset_timeout)) {
@@ -2750,11 +2745,13 @@ static int iscsi_eh_target_reset(struct scsi_cmnd *sc)
*/
int iscsi_eh_recover_target(struct scsi_cmnd *sc)
{
+ struct iscsi_cls_session *cls_session;
int rc;
- rc = iscsi_eh_target_reset(sc);
+ cls_session = starget_to_session(scsi_target(sc->device));
+ rc = iscsi_eh_target_reset(cls_session);
if (rc == FAILED)
- rc = iscsi_eh_session_reset(sc);
+ rc = iscsi_eh_session_reset(cls_session);
return rc;
}
EXPORT_SYMBOL_GPL(iscsi_eh_recover_target);
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 7282555adfd5..7dddf785edd0 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -390,7 +390,7 @@ struct iscsi_host {
*/
extern int iscsi_eh_abort(struct scsi_cmnd *sc);
extern int iscsi_eh_recover_target(struct scsi_cmnd *sc);
-extern int iscsi_eh_session_reset(struct scsi_cmnd *sc);
+extern int iscsi_eh_session_reset(struct iscsi_cls_session *cls_session);
extern int iscsi_eh_device_reset(struct scsi_cmnd *sc);
extern int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc);
extern enum scsi_timeout_action iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc);
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 09/16] scsi_transport_iscsi: use session as argument for iscsi_block_scsi_eh()
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (7 preceding siblings ...)
2023-10-23 9:14 ` [PATCH 08/16] libiscsi: use cls_session as argument for target and session reset Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
2023-10-23 9:15 ` [PATCH 10/16] snic: reserve tag for TMF Hannes Reinecke
` (6 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
We should be passing in the session directly instead of deriving it
from the scsi command.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/qla4xxx/ql4_os.c | 34 ++++++++++++++++++-----------
drivers/scsi/scsi_transport_iscsi.c | 6 ++---
include/scsi/scsi_transport_iscsi.h | 2 +-
3 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 675332e49a7b..961fe65bbe65 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -9272,15 +9272,18 @@ static int qla4xxx_eh_abort(struct scsi_cmnd *cmd)
**/
static int qla4xxx_eh_device_reset(struct scsi_cmnd *cmd)
{
- struct scsi_qla_host *ha = to_qla_host(cmd->device->host);
- struct ddb_entry *ddb_entry = cmd->device->hostdata;
+ struct scsi_device *sdev = cmd->device;
+ struct scsi_qla_host *ha = to_qla_host(sdev->host);
+ struct iscsi_cls_session *session;
+ struct ddb_entry *ddb_entry = sdev->hostdata;
int ret = FAILED, stat;
int rval;
if (!ddb_entry)
return ret;
- ret = iscsi_block_scsi_eh(cmd);
+ session = starget_to_session(scsi_target(sdev));
+ ret = iscsi_block_scsi_eh(session);
if (ret)
return ret;
ret = FAILED;
@@ -9341,19 +9344,25 @@ static int qla4xxx_eh_device_reset(struct scsi_cmnd *cmd)
**/
static int qla4xxx_eh_target_reset(struct scsi_cmnd *cmd)
{
- struct scsi_qla_host *ha = to_qla_host(cmd->device->host);
- struct ddb_entry *ddb_entry = cmd->device->hostdata;
+ struct scsi_target *starget = scsi_target(cmd->device);
+ struct iscsi_cls_session *cls_session = starget_to_session(starget);
+ struct iscsi_session *sess;
+ struct scsi_qla_host *ha;
+ struct ddb_entry *ddb_entry;
int stat, ret;
int rval;
+ sess = cls_session->dd_data;
+ ddb_entry = sess->dd_data;
if (!ddb_entry)
return FAILED;
+ ha = ddb_entry->ha;
- ret = iscsi_block_scsi_eh(cmd);
+ ret = iscsi_block_scsi_eh(cls_session);
if (ret)
return ret;
- starget_printk(KERN_INFO, scsi_target(cmd->device),
+ starget_printk(KERN_INFO, starget,
"WARM TARGET RESET ISSUED.\n");
DEBUG2(printk(KERN_INFO
@@ -9370,14 +9379,13 @@ static int qla4xxx_eh_target_reset(struct scsi_cmnd *cmd)
stat = qla4xxx_reset_target(ha, ddb_entry);
if (stat != QLA_SUCCESS) {
- starget_printk(KERN_INFO, scsi_target(cmd->device),
+ starget_printk(KERN_INFO, starget,
"WARM TARGET RESET FAILED.\n");
return FAILED;
}
- if (qla4xxx_eh_wait_for_commands(ha, scsi_target(cmd->device),
- NULL)) {
- starget_printk(KERN_INFO, scsi_target(cmd->device),
+ if (qla4xxx_eh_wait_for_commands(ha, starget, NULL)) {
+ starget_printk(KERN_INFO, starget,
"WARM TARGET DEVICE RESET FAILED - "
"waiting for commands.\n");
return FAILED;
@@ -9386,13 +9394,13 @@ static int qla4xxx_eh_target_reset(struct scsi_cmnd *cmd)
/* Send marker. */
if (qla4xxx_send_marker_iocb(ha, ddb_entry, cmd->device->lun,
MM_TGT_WARM_RESET) != QLA_SUCCESS) {
- starget_printk(KERN_INFO, scsi_target(cmd->device),
+ starget_printk(KERN_INFO, starget,
"WARM TARGET DEVICE RESET FAILED - "
"marker iocb failed.\n");
return FAILED;
}
- starget_printk(KERN_INFO, scsi_target(cmd->device),
+ starget_printk(KERN_INFO, starget,
"WARM TARGET RESET SUCCEEDED.\n");
return SUCCESS;
}
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 3075b2ddf7a6..4e81ef882a49 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1838,17 +1838,15 @@ static void iscsi_scan_session(struct work_struct *work)
/**
* iscsi_block_scsi_eh - block scsi eh until session state has transistioned
- * @cmd: scsi cmd passed to scsi eh handler
+ * @session: iscsi session derived from scsi eh handler argument
*
* If the session is down this function will wait for the recovery
* timer to fire or for the session to be logged back in. If the
* recovery timer fires then FAST_IO_FAIL is returned. The caller
* should pass this error value to the scsi eh.
*/
-int iscsi_block_scsi_eh(struct scsi_cmnd *cmd)
+int iscsi_block_scsi_eh(struct iscsi_cls_session *session)
{
- struct iscsi_cls_session *session =
- starget_to_session(scsi_target(cmd->device));
unsigned long flags;
int ret = 0;
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index fb3399e4cd29..8ec36de5d236 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -466,7 +466,7 @@ extern struct iscsi_endpoint *iscsi_create_endpoint(int dd_size);
extern void iscsi_destroy_endpoint(struct iscsi_endpoint *ep);
extern struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle);
extern void iscsi_put_endpoint(struct iscsi_endpoint *ep);
-extern int iscsi_block_scsi_eh(struct scsi_cmnd *cmd);
+extern int iscsi_block_scsi_eh(struct iscsi_cls_session *session);
extern struct iscsi_iface *iscsi_create_iface(struct Scsi_Host *shost,
struct iscsi_transport *t,
uint32_t iface_type,
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 10/16] snic: reserve tag for TMF
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (8 preceding siblings ...)
2023-10-23 9:15 ` [PATCH 09/16] scsi_transport_iscsi: use session as argument for iscsi_block_scsi_eh() Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
2023-10-23 9:15 ` [PATCH 11/16] snic: allocate device reset command Hannes Reinecke
` (5 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
Rework the hba reset function to not rely on scsi commands and
reserve one command for HBA reset.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/snic/snic.h | 2 +-
drivers/scsi/snic/snic_main.c | 5 ++-
drivers/scsi/snic/snic_scsi.c | 82 ++++++++---------------------------
3 files changed, 21 insertions(+), 68 deletions(-)
diff --git a/drivers/scsi/snic/snic.h b/drivers/scsi/snic/snic.h
index 32f5a34b6987..0b7411624bcf 100644
--- a/drivers/scsi/snic/snic.h
+++ b/drivers/scsi/snic/snic.h
@@ -366,7 +366,7 @@ int snic_queuecommand(struct Scsi_Host *, struct scsi_cmnd *);
int snic_abort_cmd(struct scsi_cmnd *);
int snic_device_reset(struct scsi_cmnd *);
int snic_host_reset(struct scsi_cmnd *);
-int snic_reset(struct Scsi_Host *, struct scsi_cmnd *);
+int snic_reset(struct Scsi_Host *);
void snic_shutdown_scsi_cleanup(struct snic *);
diff --git a/drivers/scsi/snic/snic_main.c b/drivers/scsi/snic/snic_main.c
index cc824dcfe7da..9bee91eedc10 100644
--- a/drivers/scsi/snic/snic_main.c
+++ b/drivers/scsi/snic/snic_main.c
@@ -494,9 +494,10 @@ snic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
/* Configure Maximum Outstanding IO reqs */
max_ios = snic->config.io_throttle_count;
if (max_ios != SNIC_UCSM_DFLT_THROTTLE_CNT_BLD)
- shost->can_queue = min_t(u32, SNIC_MAX_IO_REQ,
+ max_ios = min_t(u32, SNIC_MAX_IO_REQ,
max_t(u32, SNIC_MIN_IO_REQ, max_ios));
-
+ /* Reserve one tag for HBA reset */
+ shost->can_queue = max_ios - 1;
snic->max_tag_id = shost->can_queue;
shost->max_lun = snic->config.luns_per_tgt;
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
index c50ede326cc4..f1ef781df837 100644
--- a/drivers/scsi/snic/snic_scsi.c
+++ b/drivers/scsi/snic/snic_scsi.c
@@ -952,13 +952,13 @@ snic_itmf_cmpl_handler(struct snic *snic, struct snic_fw_req *fwreq)
static void
-snic_hba_reset_scsi_cleanup(struct snic *snic, struct scsi_cmnd *sc)
+snic_hba_reset_scsi_cleanup(struct snic *snic)
{
struct snic_stats *st = &snic->s_stats;
long act_ios = 0, act_fwreqs = 0;
SNIC_SCSI_DBG(snic->shost, "HBA Reset scsi cleanup.\n");
- snic_scsi_cleanup(snic, snic_cmd_tag(sc));
+ snic_scsi_cleanup(snic, snic->max_tag_id);
/* Update stats on pending IOs */
act_ios = atomic64_read(&st->io.active);
@@ -984,7 +984,6 @@ snic_hba_reset_cmpl_handler(struct snic *snic, struct snic_fw_req *fwreq)
u32 hid;
u8 typ;
u8 hdr_stat;
- struct scsi_cmnd *sc = NULL;
struct snic_req_info *rqi = NULL;
spinlock_t *io_lock = NULL;
unsigned long flags, gflags;
@@ -999,18 +998,9 @@ snic_hba_reset_cmpl_handler(struct snic *snic, struct snic_fw_req *fwreq)
"reset_cmpl: type = %x, hdr_stat = %x, cmnd_id = %x, hid = %x, ctx = %lx\n",
typ, hdr_stat, cmnd_id, hid, ctx);
- /* spl case, host reset issued through ioctl */
- if (cmnd_id == SCSI_NO_TAG) {
- rqi = (struct snic_req_info *) ctx;
- SNIC_HOST_INFO(snic->shost,
- "reset_cmpl:Tag %d ctx %lx cmpl stat %s\n",
- cmnd_id, ctx, snic_io_status_to_str(hdr_stat));
- sc = rqi->sc;
-
- goto ioctl_hba_rst;
- }
+ rqi = (struct snic_req_info *) ctx;
- if (cmnd_id >= snic->max_tag_id) {
+ if (cmnd_id != snic->max_tag_id) {
SNIC_HOST_ERR(snic->shost,
"reset_cmpl: Tag 0x%x out of Range,HdrStat %s\n",
cmnd_id, snic_io_status_to_str(hdr_stat));
@@ -1019,55 +1009,35 @@ snic_hba_reset_cmpl_handler(struct snic *snic, struct snic_fw_req *fwreq)
return 1;
}
- sc = scsi_host_find_tag(snic->shost, cmnd_id);
-ioctl_hba_rst:
- if (!sc) {
- atomic64_inc(&snic->s_stats.io.sc_null);
- SNIC_HOST_ERR(snic->shost,
- "reset_cmpl: sc is NULL - Hdr Stat %s Tag 0x%x\n",
- snic_io_status_to_str(hdr_stat), cmnd_id);
- ret = 1;
-
- return ret;
- }
-
SNIC_HOST_INFO(snic->shost,
- "reset_cmpl: sc %p rqi %p Tag %d flags 0x%llx\n",
- sc, rqi, cmnd_id, CMD_FLAGS(sc));
+ "reset_cmpl: rqi %p Tag %d\n", rqi, cmnd_id);
- io_lock = snic_io_lock_hash(snic, sc);
+ io_lock = snic_io_lock_tag(snic, cmnd_id);
spin_lock_irqsave(io_lock, flags);
if (!snic->remove_wait) {
spin_unlock_irqrestore(io_lock, flags);
SNIC_HOST_ERR(snic->shost,
"reset_cmpl:host reset completed after timeout\n");
- ret = 1;
-
- return ret;
+ return 1;
}
- rqi = (struct snic_req_info *) CMD_SP(sc);
WARN_ON_ONCE(!rqi);
if (!rqi) {
atomic64_inc(&snic->s_stats.io.req_null);
spin_unlock_irqrestore(io_lock, flags);
- CMD_FLAGS(sc) |= SNIC_IO_ABTS_TERM_REQ_NULL;
SNIC_HOST_ERR(snic->shost,
- "reset_cmpl: rqi is null,Hdr stat %s Tag 0x%x sc 0x%p flags 0x%llx\n",
- snic_io_status_to_str(hdr_stat), cmnd_id, sc,
- CMD_FLAGS(sc));
-
- ret = 1;
+ "reset_cmpl: rqi is null,Hdr stat %s Tag 0x%x\n",
+ snic_io_status_to_str(hdr_stat), cmnd_id);
- return ret;
+ return 1;
}
/* stats */
spin_unlock_irqrestore(io_lock, flags);
/* scsi cleanup */
- snic_hba_reset_scsi_cleanup(snic, sc);
+ snic_hba_reset_scsi_cleanup(snic);
SNIC_BUG_ON(snic_get_state(snic) != SNIC_OFFLINE &&
snic_get_state(snic) != SNIC_FWRESET);
@@ -2203,7 +2173,7 @@ snic_device_reset(struct scsi_cmnd *sc)
* snic_issue_hba_reset : Queues FW Reset Request.
*/
static int
-snic_issue_hba_reset(struct snic *snic, struct scsi_cmnd *sc)
+snic_issue_hba_reset(struct snic *snic)
{
struct snic_req_info *rqi = NULL;
struct snic_host_req *req = NULL;
@@ -2219,26 +2189,15 @@ snic_issue_hba_reset(struct snic *snic, struct scsi_cmnd *sc)
goto hba_rst_end;
}
- if (snic_cmd_tag(sc) == SCSI_NO_TAG) {
- memset(scsi_cmd_priv(sc), 0,
- sizeof(struct snic_internal_io_state));
- SNIC_HOST_INFO(snic->shost, "issu_hr:Host reset thru ioctl.\n");
- rqi->sc = sc;
- }
-
req = rqi_to_req(rqi);
- io_lock = snic_io_lock_hash(snic, sc);
+ io_lock = snic_io_lock_tag(snic, snic->max_tag_id);
spin_lock_irqsave(io_lock, flags);
- SNIC_BUG_ON(CMD_SP(sc) != NULL);
- CMD_STATE(sc) = SNIC_IOREQ_PENDING;
- CMD_SP(sc) = (char *) rqi;
- CMD_FLAGS(sc) |= SNIC_IO_INITIALIZED;
snic->remove_wait = &wait;
spin_unlock_irqrestore(io_lock, flags);
/* Initialize Request */
- snic_io_hdr_enc(&req->hdr, SNIC_REQ_HBA_RESET, 0, snic_cmd_tag(sc),
+ snic_io_hdr_enc(&req->hdr, SNIC_REQ_HBA_RESET, 0, snic->max_tag_id,
snic->config.hid, 0, (ulong) rqi);
req->u.reset.flags = 0;
@@ -2252,9 +2211,6 @@ snic_issue_hba_reset(struct snic *snic, struct scsi_cmnd *sc)
goto hba_rst_err;
}
- spin_lock_irqsave(io_lock, flags);
- CMD_FLAGS(sc) |= SNIC_HOST_RESET_ISSUED;
- spin_unlock_irqrestore(io_lock, flags);
atomic64_inc(&snic->s_stats.reset.hba_resets);
SNIC_HOST_INFO(snic->shost, "Queued HBA Reset Successfully.\n");
@@ -2270,8 +2226,6 @@ snic_issue_hba_reset(struct snic *snic, struct scsi_cmnd *sc)
spin_lock_irqsave(io_lock, flags);
snic->remove_wait = NULL;
- rqi = (struct snic_req_info *) CMD_SP(sc);
- CMD_SP(sc) = NULL;
spin_unlock_irqrestore(io_lock, flags);
if (rqi)
@@ -2284,8 +2238,6 @@ snic_issue_hba_reset(struct snic *snic, struct scsi_cmnd *sc)
hba_rst_err:
spin_lock_irqsave(io_lock, flags);
snic->remove_wait = NULL;
- rqi = (struct snic_req_info *) CMD_SP(sc);
- CMD_SP(sc) = NULL;
spin_unlock_irqrestore(io_lock, flags);
if (rqi)
@@ -2300,7 +2252,7 @@ snic_issue_hba_reset(struct snic *snic, struct scsi_cmnd *sc)
} /* end of snic_issue_hba_reset */
int
-snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc)
+snic_reset(struct Scsi_Host *shost)
{
struct snic *snic = shost_priv(shost);
enum snic_state sv_state;
@@ -2329,7 +2281,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc)
while (atomic_read(&snic->ios_inflight))
schedule_timeout(msecs_to_jiffies(1));
- ret = snic_issue_hba_reset(snic, sc);
+ ret = snic_issue_hba_reset(snic);
if (ret) {
SNIC_HOST_ERR(shost,
"reset:Host Reset Failed w/ err %d.\n",
@@ -2368,7 +2320,7 @@ snic_host_reset(struct scsi_cmnd *sc)
sc, sc->cmnd[0], scsi_cmd_to_rq(sc),
snic_cmd_tag(sc), CMD_FLAGS(sc));
- ret = snic_reset(shost, sc);
+ ret = snic_reset(shost);
SNIC_TRC(shost->host_no, snic_cmd_tag(sc), (ulong) sc,
jiffies_to_msecs(jiffies - start_time),
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 11/16] snic: allocate device reset command
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (9 preceding siblings ...)
2023-10-23 9:15 ` [PATCH 10/16] snic: reserve tag for TMF Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
2023-10-23 9:15 ` [PATCH 12/16] snic: Use scsi_host_busy_iter() to traverse commands Hannes Reinecke
` (4 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
Allocate a command to send a device reset instead of relying
on using the command which triggered the device failure.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/snic/snic_scsi.c | 72 +++++++++++++++++++----------------
1 file changed, 40 insertions(+), 32 deletions(-)
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
index f1ef781df837..1807f713504f 100644
--- a/drivers/scsi/snic/snic_scsi.c
+++ b/drivers/scsi/snic/snic_scsi.c
@@ -2091,70 +2091,78 @@ snic_unlink_and_release_req(struct snic *snic, struct scsi_cmnd *sc, int flag)
int
snic_device_reset(struct scsi_cmnd *sc)
{
- struct Scsi_Host *shost = sc->device->host;
+ struct scsi_device *sdev = sc->device;
+ struct Scsi_Host *shost = sdev->host;
struct snic *snic = shost_priv(shost);
+ struct request *req;
struct snic_req_info *rqi = NULL;
- int tag = snic_cmd_tag(sc);
int start_time = jiffies;
int ret = FAILED;
int dr_supp = 0;
- SNIC_SCSI_DBG(shost, "dev_reset:sc %p :0x%x :req = %p :tag = %d\n",
- sc, sc->cmnd[0], scsi_cmd_to_rq(sc),
- snic_cmd_tag(sc));
- dr_supp = snic_dev_reset_supported(sc->device);
+ SNIC_SCSI_DBG(shost, "dev_reset\n");
+ dr_supp = snic_dev_reset_supported(sdev);
if (!dr_supp) {
/* device reset op is not supported */
SNIC_HOST_INFO(shost, "LUN Reset Op not supported.\n");
- snic_unlink_and_release_req(snic, sc, SNIC_DEV_RST_NOTSUP);
-
- goto dev_rst_end;
+ return ret;
}
if (unlikely(snic_get_state(snic) != SNIC_ONLINE)) {
- snic_unlink_and_release_req(snic, sc, 0);
SNIC_HOST_ERR(shost, "Devrst: Parent Devs are not online.\n");
- goto dev_rst_end;
+ return ret;
}
- /* There is no tag when lun reset is issue through ioctl. */
- if (unlikely(tag <= SNIC_NO_TAG)) {
- SNIC_HOST_INFO(snic->shost,
- "Devrst: LUN Reset Recvd thru IOCTL.\n");
-
- rqi = snic_req_init(snic, 0);
- if (!rqi)
- goto dev_rst_end;
-
- memset(scsi_cmd_priv(sc), 0,
- sizeof(struct snic_internal_io_state));
- CMD_SP(sc) = (char *)rqi;
- CMD_FLAGS(sc) = SNIC_NO_FLAGS;
+ rqi = snic_req_init(snic, 0);
+ if (!rqi)
+ return ret;
- /* Add special tag for dr coming from user spc */
- rqi->tm_tag = SNIC_TAG_IOCTL_DEV_RST;
- rqi->sc = sc;
+ req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
+ BLK_MQ_REQ_NOWAIT);
+ if (!req) {
+ /*
+ * Request allocation might fail, indicating that
+ * all tags are busy.
+ * But device reset will be called only from within
+ * SCSI EH, at which time all I/O is stopped. So the
+ * only active tags would be for failed I/O, but
+ * when all I/O is failed it'll be better to escalate
+ * to host reset anyway.
+ */
+ SNIC_HOST_ERR(snic->shost,
+ "Devrst: TMF busy, escalate to host reset\n");
+ goto dev_rst_end;
}
+ sc = blk_mq_rq_to_pdu(req);
+ memset(scsi_cmd_priv(sc), 0,
+ sizeof(struct snic_internal_io_state));
+ CMD_SP(sc) = (char *)rqi;
+ CMD_FLAGS(sc) = SNIC_NO_FLAGS;
+ /* Add special tag for dr coming from user spc */
+ rqi->tm_tag = SNIC_TAG_IOCTL_DEV_RST;
+ rqi->sc = sc;
+ WRITE_ONCE(req->state, MQ_RQ_IN_FLIGHT);
ret = snic_send_dr_and_wait(snic, sc);
if (ret) {
SNIC_HOST_ERR(snic->shost,
"Devrst: IO w/ Tag %x Failed w/ err = %d\n",
- tag, ret);
-
+ snic_cmd_tag(sc), ret);
+ blk_mq_set_request_complete(req);
snic_unlink_and_release_req(snic, sc, 0);
goto dev_rst_end;
}
-
+ blk_mq_set_request_complete(req);
ret = snic_dr_finish(snic, sc);
dev_rst_end:
- SNIC_TRC(snic->shost->host_no, tag, (ulong) sc,
+ SNIC_TRC(snic->shost->host_no, snic_cmd_tag(sc), (ulong) sc,
jiffies_to_msecs(jiffies - start_time),
0, SNIC_TRC_CMD(sc), SNIC_TRC_CMD_STATE_FLAGS(sc));
-
+ if (req)
+ blk_mq_free_request(req);
SNIC_SCSI_DBG(snic->shost,
"Devrst: Returning from Device Reset : %s\n",
(ret == SUCCESS) ? "SUCCESS" : "FAILED");
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 12/16] snic: Use scsi_host_busy_iter() to traverse commands
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (10 preceding siblings ...)
2023-10-23 9:15 ` [PATCH 11/16] snic: allocate device reset command Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
2023-10-23 9:15 ` [PATCH 13/16] fnic: allocate device reset command on the fly Hannes Reinecke
` (3 subsequent siblings)
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke,
Johannes Thumshirn
Use scsi_host_busy_iter() to traverse commands instead of hand-crafted
routines walking the command list.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
drivers/scsi/snic/snic_scsi.c | 177 ++++++++++++++++------------------
1 file changed, 84 insertions(+), 93 deletions(-)
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
index 1807f713504f..f123531e7dd1 100644
--- a/drivers/scsi/snic/snic_scsi.c
+++ b/drivers/scsi/snic/snic_scsi.c
@@ -63,7 +63,7 @@ static const char * const snic_io_status_str[] = {
[SNIC_STAT_FATAL_ERROR] = "SNIC_STAT_FATAL_ERROR",
};
-static void snic_scsi_cleanup(struct snic *, int);
+static void snic_scsi_cleanup(struct snic *);
const char *
snic_state_to_str(unsigned int state)
@@ -958,7 +958,7 @@ snic_hba_reset_scsi_cleanup(struct snic *snic)
long act_ios = 0, act_fwreqs = 0;
SNIC_SCSI_DBG(snic->shost, "HBA Reset scsi cleanup.\n");
- snic_scsi_cleanup(snic, snic->max_tag_id);
+ snic_scsi_cleanup(snic);
/* Update stats on pending IOs */
act_ios = atomic64_read(&st->io.active);
@@ -2366,53 +2366,36 @@ snic_cmpl_pending_tmreq(struct snic *snic, struct scsi_cmnd *sc)
complete(rqi->abts_done);
}
-/*
- * snic_scsi_cleanup: Walks through tag map and releases the reqs
- */
-static void
-snic_scsi_cleanup(struct snic *snic, int ex_tag)
+static bool
+snic_scsi_cleanup_iter(struct scsi_cmnd *sc, void *data, bool reserved)
{
+ struct snic *snic = data;
struct snic_req_info *rqi = NULL;
- struct scsi_cmnd *sc = NULL;
spinlock_t *io_lock = NULL;
unsigned long flags;
- int tag;
+ int tag = scsi_cmd_to_rq(sc)->tag;
u64 st_time = 0;
SNIC_SCSI_DBG(snic->shost, "sc_clean: scsi cleanup.\n");
- for (tag = 0; tag < snic->max_tag_id; tag++) {
- /* Skip ex_tag */
- if (tag == ex_tag)
- continue;
-
- io_lock = snic_io_lock_tag(snic, tag);
- spin_lock_irqsave(io_lock, flags);
- sc = scsi_host_find_tag(snic->shost, tag);
- if (!sc) {
- spin_unlock_irqrestore(io_lock, flags);
-
- continue;
- }
-
- if (unlikely(snic_tmreq_pending(sc))) {
- /*
- * When FW Completes reset w/o sending completions
- * for outstanding ios.
- */
- snic_cmpl_pending_tmreq(snic, sc);
- spin_unlock_irqrestore(io_lock, flags);
-
- continue;
- }
+ io_lock = snic_io_lock_tag(snic, tag);
+ spin_lock_irqsave(io_lock, flags);
- rqi = (struct snic_req_info *) CMD_SP(sc);
- if (!rqi) {
- spin_unlock_irqrestore(io_lock, flags);
+ if (unlikely(snic_tmreq_pending(sc))) {
+ /*
+ * When FW Completes reset w/o sending completions
+ * for outstanding ios.
+ */
+ snic_cmpl_pending_tmreq(snic, sc);
+ spin_unlock_irqrestore(io_lock, flags);
- goto cleanup;
- }
+ return true;
+ }
+ rqi = (struct snic_req_info *) CMD_SP(sc);
+ if (!rqi)
+ spin_unlock_irqrestore(io_lock, flags);
+ else {
SNIC_SCSI_DBG(snic->shost,
"sc_clean: sc %p, rqi %p, tag %d flags 0x%llx\n",
sc, rqi, tag, CMD_FLAGS(sc));
@@ -2427,24 +2410,34 @@ snic_scsi_cleanup(struct snic *snic, int ex_tag)
rqi, CMD_FLAGS(sc));
snic_release_req_buf(snic, rqi, sc);
+ }
+ sc->result = DID_TRANSPORT_DISRUPTED << 16;
+ SNIC_HOST_INFO(snic->shost,
+ "sc_clean: DID_TRANSPORT_DISRUPTED for sc %p, Tag %d flags 0x%llx rqi %p duration %u msecs\n",
+ sc, tag, CMD_FLAGS(sc), rqi,
+ jiffies_to_msecs(jiffies - st_time));
-cleanup:
- sc->result = DID_TRANSPORT_DISRUPTED << 16;
- SNIC_HOST_INFO(snic->shost,
- "sc_clean: DID_TRANSPORT_DISRUPTED for sc %p, Tag %d flags 0x%llx rqi %p duration %u msecs\n",
- sc, scsi_cmd_to_rq(sc)->tag, CMD_FLAGS(sc), rqi,
- jiffies_to_msecs(jiffies - st_time));
+ /* Update IO stats */
+ snic_stats_update_io_cmpl(&snic->s_stats);
- /* Update IO stats */
- snic_stats_update_io_cmpl(&snic->s_stats);
+ SNIC_TRC(snic->shost->host_no, tag, (ulong) sc,
+ jiffies_to_msecs(jiffies - st_time), 0,
+ SNIC_TRC_CMD(sc),
+ SNIC_TRC_CMD_STATE_FLAGS(sc));
- SNIC_TRC(snic->shost->host_no, tag, (ulong) sc,
- jiffies_to_msecs(jiffies - st_time), 0,
- SNIC_TRC_CMD(sc),
- SNIC_TRC_CMD_STATE_FLAGS(sc));
+ scsi_done(sc);
+ return true;
+}
- scsi_done(sc);
- }
+/*
+ * snic_scsi_cleanup: Walks through tag map and releases the reqs
+ */
+static void
+snic_scsi_cleanup(struct snic *snic)
+{
+ SNIC_SCSI_DBG(snic->shost, "sc_clean: scsi cleanup\n");
+
+ scsi_host_busy_iter(snic->shost, snic_scsi_cleanup_iter, snic);
} /* end of snic_scsi_cleanup */
void
@@ -2452,7 +2445,7 @@ snic_shutdown_scsi_cleanup(struct snic *snic)
{
SNIC_HOST_INFO(snic->shost, "Shutdown time SCSI Cleanup.\n");
- snic_scsi_cleanup(snic, SCSI_NO_TAG);
+ snic_scsi_cleanup(snic);
} /* end of snic_shutdown_scsi_cleanup */
/*
@@ -2466,7 +2459,7 @@ snic_internal_abort_io(struct snic *snic, struct scsi_cmnd *sc, int tmf)
spinlock_t *io_lock = NULL;
unsigned long flags;
u32 sv_state = 0;
- int ret = 0;
+ int ret = FAILED;
io_lock = snic_io_lock_hash(snic, sc);
spin_lock_irqsave(io_lock, flags);
@@ -2541,6 +2534,35 @@ snic_internal_abort_io(struct snic *snic, struct scsi_cmnd *sc, int tmf)
return ret;
} /* end of snic_internal_abort_io */
+struct snic_tgt_scsi_abort_io_data {
+ struct snic *snic;
+ struct snic_tgt *tgt;
+ int tmf;
+ int abt_cnt;
+};
+
+static bool snic_tgt_scsi_abort_io_iter(struct scsi_cmnd *sc, void *data,
+ bool reserved)
+{
+ struct snic_tgt_scsi_abort_io_data *iter_data = data;
+ struct snic *snic = iter_data->snic;
+ struct snic_tgt *sc_tgt;
+ int ret;
+
+ sc_tgt = starget_to_tgt(scsi_target(sc->device));
+ if (sc_tgt != iter_data->tgt)
+ return true;
+
+ ret = snic_internal_abort_io(snic, sc, iter_data->tmf);
+ if (ret == SUCCESS)
+ iter_data->abt_cnt++;
+ else
+ SNIC_HOST_ERR(snic->shost,
+ "tgt_abt_io: Tag %x, Failed w err = %d\n",
+ scsi_cmd_to_rq(sc)->tag, ret);
+ return true;
+}
+
/*
* snic_tgt_scsi_abort_io : called by snic_tgt_del
*/
@@ -2548,11 +2570,9 @@ int
snic_tgt_scsi_abort_io(struct snic_tgt *tgt)
{
struct snic *snic = NULL;
- struct scsi_cmnd *sc = NULL;
- struct snic_tgt *sc_tgt = NULL;
- spinlock_t *io_lock = NULL;
- unsigned long flags;
- int ret = 0, tag, abt_cnt = 0, tmf = 0;
+ struct snic_tgt_scsi_abort_io_data data = {
+ .abt_cnt = 0,
+ };
if (!tgt)
return -1;
@@ -2560,44 +2580,15 @@ snic_tgt_scsi_abort_io(struct snic_tgt *tgt)
snic = shost_priv(snic_tgt_to_shost(tgt));
SNIC_SCSI_DBG(snic->shost, "tgt_abt_io: Cleaning Pending IOs.\n");
+ data.snic = snic;
if (tgt->tdata.typ == SNIC_TGT_DAS)
- tmf = SNIC_ITMF_ABTS_TASK;
+ data.tmf = SNIC_ITMF_ABTS_TASK;
else
- tmf = SNIC_ITMF_ABTS_TASK_TERM;
-
- for (tag = 0; tag < snic->max_tag_id; tag++) {
- io_lock = snic_io_lock_tag(snic, tag);
-
- spin_lock_irqsave(io_lock, flags);
- sc = scsi_host_find_tag(snic->shost, tag);
- if (!sc) {
- spin_unlock_irqrestore(io_lock, flags);
-
- continue;
- }
+ data.tmf = SNIC_ITMF_ABTS_TASK_TERM;
- sc_tgt = starget_to_tgt(scsi_target(sc->device));
- if (sc_tgt != tgt) {
- spin_unlock_irqrestore(io_lock, flags);
-
- continue;
- }
- spin_unlock_irqrestore(io_lock, flags);
-
- ret = snic_internal_abort_io(snic, sc, tmf);
- if (ret < 0) {
- SNIC_HOST_ERR(snic->shost,
- "tgt_abt_io: Tag %x, Failed w err = %d\n",
- tag, ret);
-
- continue;
- }
-
- if (ret == SUCCESS)
- abt_cnt++;
- }
+ scsi_host_busy_iter(snic->shost, snic_tgt_scsi_abort_io_iter, &data);
- SNIC_SCSI_DBG(snic->shost, "tgt_abt_io: abt_cnt = %d\n", abt_cnt);
+ SNIC_SCSI_DBG(snic->shost, "tgt_abt_io: abt_cnt = %d\n", data.abt_cnt);
return 0;
} /* end of snic_tgt_scsi_abort_io */
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 13/16] fnic: allocate device reset command on the fly
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (11 preceding siblings ...)
2023-10-23 9:15 ` [PATCH 12/16] snic: Use scsi_host_busy_iter() to traverse commands Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
2023-10-24 6:54 ` Christoph Hellwig
2023-10-23 9:15 ` [PATCH 14/16] fnic: use fc_block_rport() correctly Hannes Reinecke
` (2 subsequent siblings)
15 siblings, 1 reply; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
Allocate a reset command on the fly instead of relying
on using the command which triggered the device failure.
This might fail if all available tags are busy, but in
that case it'll be safer to fall back to host reset anyway.
Signed-off-by: Hannes Reinecke <hare@suse.de>
---
drivers/scsi/fnic/fnic.h | 1 -
drivers/scsi/fnic/fnic_scsi.c | 113 ++++++++++++++++------------------
drivers/scsi/snic/snic_scsi.c | 5 +-
3 files changed, 55 insertions(+), 64 deletions(-)
diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h
index 93c68931a593..8ffcafb4687f 100644
--- a/drivers/scsi/fnic/fnic.h
+++ b/drivers/scsi/fnic/fnic.h
@@ -236,7 +236,6 @@ struct fnic {
unsigned int wq_count;
unsigned int cq_count;
- struct mutex sgreset_mutex;
struct dentry *fnic_stats_debugfs_host;
struct dentry *fnic_stats_debugfs_file;
struct dentry *fnic_reset_debugfs_file;
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 9761b2c9db48..7a8b6285a096 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -1984,7 +1984,6 @@ static inline int fnic_queue_dr_io_req(struct fnic *fnic,
struct fnic_pending_aborts_iter_data {
struct fnic *fnic;
- struct scsi_cmnd *lr_sc;
struct scsi_device *lun_dev;
int ret;
};
@@ -2002,7 +2001,7 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, void *data)
DECLARE_COMPLETION_ONSTACK(tm_done);
enum fnic_ioreq_state old_ioreq_state;
- if (sc == iter_data->lr_sc || sc->device != lun_dev)
+ if (sc->device != lun_dev)
return true;
io_lock = fnic_io_lock_tag(fnic, abt_tag);
@@ -2105,17 +2104,11 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, void *data)
return false;
}
fnic_priv(sc)->state = FNIC_IOREQ_ABTS_COMPLETE;
-
- /* original sc used for lr is handled by dev reset code */
- if (sc != iter_data->lr_sc)
- fnic_priv(sc)->io_req = NULL;
+ fnic_priv(sc)->io_req = NULL;
spin_unlock_irqrestore(io_lock, flags);
- /* original sc used for lr is handled by dev reset code */
- if (sc != iter_data->lr_sc) {
- fnic_release_ioreq_buf(fnic, io_req, sc);
- mempool_free(io_req, fnic->io_req_pool);
- }
+ fnic_release_ioreq_buf(fnic, io_req, sc);
+ mempool_free(io_req, fnic->io_req_pool);
/*
* Any IO is returned during reset, it needs to call scsi_done
@@ -2135,8 +2128,7 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, void *data)
* successfully aborted, 1 otherwise
*/
static int fnic_clean_pending_aborts(struct fnic *fnic,
- struct scsi_cmnd *lr_sc,
- bool new_sc)
+ struct scsi_cmnd *lr_sc)
{
int ret = 0;
@@ -2146,9 +2138,6 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
.ret = SUCCESS,
};
- if (new_sc)
- iter_data.lr_sc = lr_sc;
-
scsi_host_busy_iter(fnic->lport->host,
fnic_pending_aborts_iter, &iter_data);
if (iter_data.ret == FAILED) {
@@ -2174,7 +2163,8 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
*/
int fnic_device_reset(struct scsi_cmnd *sc)
{
- struct request *rq = scsi_cmd_to_rq(sc);
+ struct scsi_device *sdev = sc->device;
+ struct request *req;
struct fc_lport *lp;
struct fnic *fnic;
struct fnic_io_req *io_req = NULL;
@@ -2187,15 +2177,17 @@ int fnic_device_reset(struct scsi_cmnd *sc)
struct scsi_lun fc_lun;
struct fnic_stats *fnic_stats;
struct reset_stats *reset_stats;
- int tag = rq->tag;
+ int tag;
DECLARE_COMPLETION_ONSTACK(tm_done);
- bool new_sc = 0;
/* Wait for rport to unblock */
- fc_block_scsi_eh(sc);
+ rport = starget_to_rport(scsi_target(sdev));
+ ret = fc_block_rport(rport);
+ if (ret)
+ return ret;
/* Get local-port, check ready and link up */
- lp = shost_priv(sc->device->host);
+ lp = shost_priv(sdev->host);
fnic = lport_priv(lp);
fnic_stats = &fnic->fnic_stats;
@@ -2203,53 +2195,55 @@ int fnic_device_reset(struct scsi_cmnd *sc)
atomic64_inc(&reset_stats->device_resets);
- rport = starget_to_rport(scsi_target(sc->device));
FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
- "Device reset called FCID 0x%x, LUN 0x%llx sc 0x%p\n",
- rport->port_id, sc->device->lun, sc);
+ "Device reset called FCID 0x%x, LUN 0x%llx\n",
+ rport->port_id, sdev->lun);
if (lp->state != LPORT_ST_READY || !(lp->link_up))
- goto fnic_device_reset_end;
+ return ret;
/* Check if remote port up */
if (fc_remote_port_chkready(rport)) {
atomic64_inc(&fnic_stats->misc_stats.rport_not_ready);
- goto fnic_device_reset_end;
+ return ret;
}
- fnic_priv(sc)->flags = FNIC_DEVICE_RESET;
-
- if (unlikely(tag < 0)) {
+ req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
+ BLK_MQ_REQ_NOWAIT);
+ if (!req) {
/*
- * For device reset issued through sg3utils, we let
- * only one LUN_RESET to go through and use a special
- * tag equal to max_tag_id so that we don't have to allocate
- * or free it. It won't interact with tags
- * allocated by mid layer.
+ * Request allocation might fail, indicating that
+ * all tags are busy.
+ * But device reset will be called only from within
+ * SCSI EH, at which time all I/O is stopped. So the
+ * only active tags would be for failed I/O, but
+ * when all I/O is failed it'll be better to escalate
+ * to host reset anyway.
*/
- mutex_lock(&fnic->sgreset_mutex);
- tag = fnic->fnic_max_tag_id;
- new_sc = 1;
+ FNIC_SCSI_DBG(KERN_ERR, fnic->lport->host,
+ "Device reset allocation failed, all tags busy\n");
+ return ret;
}
+ sc = blk_mq_rq_to_pdu(req);
+
+ tag = req->tag;
io_lock = fnic_io_lock_hash(fnic, sc);
spin_lock_irqsave(io_lock, flags);
io_req = fnic_priv(sc)->io_req;
+ if (io_req)
+ goto fnic_device_reset_end;
- /*
- * If there is a io_req attached to this command, then use it,
- * else allocate a new one.
- */
+ io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC);
if (!io_req) {
- io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC);
- if (!io_req) {
- spin_unlock_irqrestore(io_lock, flags);
- goto fnic_device_reset_end;
- }
- memset(io_req, 0, sizeof(*io_req));
- io_req->port_id = rport->port_id;
- fnic_priv(sc)->io_req = io_req;
+ spin_unlock_irqrestore(io_lock, flags);
+ goto fnic_device_reset_end;
}
+ memset(io_req, 0, sizeof(*io_req));
+ io_req->port_id = rport->port_id;
+ fnic_priv(sc)->io_req = io_req;
+
io_req->dr_done = &tm_done;
+ fnic_priv(sc)->flags = FNIC_DEVICE_RESET;
fnic_priv(sc)->state = FNIC_IOREQ_CMD_PENDING;
fnic_priv(sc)->lr_status = FCPIO_INVALID_CODE;
spin_unlock_irqrestore(io_lock, flags);
@@ -2260,11 +2254,13 @@ int fnic_device_reset(struct scsi_cmnd *sc)
* issue the device reset, if enqueue failed, clean up the ioreq
* and break assoc with scsi cmd
*/
+ WRITE_ONCE(req->state, MQ_RQ_IN_FLIGHT);
if (fnic_queue_dr_io_req(fnic, sc, io_req)) {
spin_lock_irqsave(io_lock, flags);
io_req = fnic_priv(sc)->io_req;
if (io_req)
io_req->dr_done = NULL;
+ WRITE_ONCE(req->state, MQ_RQ_IDLE);
goto fnic_device_reset_clean;
}
spin_lock_irqsave(io_lock, flags);
@@ -2279,11 +2275,12 @@ int fnic_device_reset(struct scsi_cmnd *sc)
msecs_to_jiffies(FNIC_LUN_RESET_TIMEOUT));
spin_lock_irqsave(io_lock, flags);
+ WRITE_ONCE(req->state, MQ_RQ_IDLE);
io_req = fnic_priv(sc)->io_req;
if (!io_req) {
spin_unlock_irqrestore(io_lock, flags);
FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
- "io_req is null tag 0x%x sc 0x%p\n", tag, sc);
+ "io_req is null tag 0x%x\n", tag);
goto fnic_device_reset_end;
}
io_req->dr_done = NULL;
@@ -2326,7 +2323,7 @@ int fnic_device_reset(struct scsi_cmnd *sc)
spin_unlock_irqrestore(io_lock, flags);
FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
"Abort and terminate issued on Device reset "
- "tag 0x%x sc 0x%p\n", tag, sc);
+ "tag 0x%x\n", tag);
break;
}
}
@@ -2364,7 +2361,7 @@ int fnic_device_reset(struct scsi_cmnd *sc)
* the lun reset cmd. If all cmds get cleaned, the lun reset
* succeeds
*/
- if (fnic_clean_pending_aborts(fnic, sc, new_sc)) {
+ if (fnic_clean_pending_aborts(fnic, sc)) {
spin_lock_irqsave(io_lock, flags);
io_req = fnic_priv(sc)->io_req;
FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
@@ -2393,15 +2390,15 @@ int fnic_device_reset(struct scsi_cmnd *sc)
}
fnic_device_reset_end:
- FNIC_TRACE(fnic_device_reset, sc->device->host->host_no, rq->tag, sc,
+ FNIC_TRACE(fnic_device_reset, sc->device->host->host_no,
+ scsi_cmd_to_rq(sc)->tag, sc,
jiffies_to_msecs(jiffies - start_time),
0, ((u64)sc->cmnd[0] << 32 |
(u64)sc->cmnd[2] << 24 | (u64)sc->cmnd[3] << 16 |
(u64)sc->cmnd[4] << 8 | sc->cmnd[5]),
fnic_flags_and_state(sc));
- if (new_sc)
- mutex_unlock(&fnic->sgreset_mutex);
+ blk_mq_free_request(req);
FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
"Returning from device reset %s\n",
@@ -2633,8 +2630,6 @@ static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data)
* ignore this lun reset cmd or cmds that do not belong to
* this lun
*/
- if (iter_data->lr_sc && sc == iter_data->lr_sc)
- return true;
if (iter_data->lun_dev && sc->device != iter_data->lun_dev)
return true;
@@ -2677,10 +2672,8 @@ int fnic_is_abts_pending(struct fnic *fnic, struct scsi_cmnd *lr_sc)
.ret = 0,
};
- if (lr_sc) {
+ if (lr_sc)
iter_data.lun_dev = lr_sc->device;
- iter_data.lr_sc = lr_sc;
- }
/* walk again to check, if IOs are still pending in fw */
scsi_host_busy_iter(fnic->lport->host,
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
index f123531e7dd1..c38f648da3d7 100644
--- a/drivers/scsi/snic/snic_scsi.c
+++ b/drivers/scsi/snic/snic_scsi.c
@@ -2367,7 +2367,7 @@ snic_cmpl_pending_tmreq(struct snic *snic, struct scsi_cmnd *sc)
}
static bool
-snic_scsi_cleanup_iter(struct scsi_cmnd *sc, void *data, bool reserved)
+snic_scsi_cleanup_iter(struct scsi_cmnd *sc, void *data)
{
struct snic *snic = data;
struct snic_req_info *rqi = NULL;
@@ -2541,8 +2541,7 @@ struct snic_tgt_scsi_abort_io_data {
int abt_cnt;
};
-static bool snic_tgt_scsi_abort_io_iter(struct scsi_cmnd *sc, void *data,
- bool reserved)
+static bool snic_tgt_scsi_abort_io_iter(struct scsi_cmnd *sc, void *data)
{
struct snic_tgt_scsi_abort_io_data *iter_data = data;
struct snic *snic = iter_data->snic;
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-10-23 9:15 ` [PATCH 13/16] fnic: allocate device reset command on the fly Hannes Reinecke
@ 2023-10-24 6:54 ` Christoph Hellwig
2023-11-06 19:42 ` Karan Tilak Kumar (kartilak)
0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2023-10-24 6:54 UTC (permalink / raw)
To: Hannes Reinecke
Cc: Martin K. Petersen, Christoph Hellwig, James Bottomley,
linux-scsi, Karan Tilak Kumar, djhawar, gcboffa, mkai2, satishkh
Adding the fnic maintainers as they are probably most qualified to
review and test this.
On Mon, Oct 23, 2023 at 11:15:04AM +0200, Hannes Reinecke wrote:
> Allocate a reset command on the fly instead of relying
> on using the command which triggered the device failure.
> This might fail if all available tags are busy, but in
> that case it'll be safer to fall back to host reset anyway.
>
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> ---
> drivers/scsi/fnic/fnic.h | 1 -
> drivers/scsi/fnic/fnic_scsi.c | 113 ++++++++++++++++------------------
> drivers/scsi/snic/snic_scsi.c | 5 +-
> 3 files changed, 55 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h
> index 93c68931a593..8ffcafb4687f 100644
> --- a/drivers/scsi/fnic/fnic.h
> +++ b/drivers/scsi/fnic/fnic.h
> @@ -236,7 +236,6 @@ struct fnic {
> unsigned int wq_count;
> unsigned int cq_count;
>
> - struct mutex sgreset_mutex;
> struct dentry *fnic_stats_debugfs_host;
> struct dentry *fnic_stats_debugfs_file;
> struct dentry *fnic_reset_debugfs_file;
> diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
> index 9761b2c9db48..7a8b6285a096 100644
> --- a/drivers/scsi/fnic/fnic_scsi.c
> +++ b/drivers/scsi/fnic/fnic_scsi.c
> @@ -1984,7 +1984,6 @@ static inline int fnic_queue_dr_io_req(struct fnic *fnic,
>
> struct fnic_pending_aborts_iter_data {
> struct fnic *fnic;
> - struct scsi_cmnd *lr_sc;
> struct scsi_device *lun_dev;
> int ret;
> };
> @@ -2002,7 +2001,7 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, void *data)
> DECLARE_COMPLETION_ONSTACK(tm_done);
> enum fnic_ioreq_state old_ioreq_state;
>
> - if (sc == iter_data->lr_sc || sc->device != lun_dev)
> + if (sc->device != lun_dev)
> return true;
>
> io_lock = fnic_io_lock_tag(fnic, abt_tag);
> @@ -2105,17 +2104,11 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, void *data)
> return false;
> }
> fnic_priv(sc)->state = FNIC_IOREQ_ABTS_COMPLETE;
> -
> - /* original sc used for lr is handled by dev reset code */
> - if (sc != iter_data->lr_sc)
> - fnic_priv(sc)->io_req = NULL;
> + fnic_priv(sc)->io_req = NULL;
> spin_unlock_irqrestore(io_lock, flags);
>
> - /* original sc used for lr is handled by dev reset code */
> - if (sc != iter_data->lr_sc) {
> - fnic_release_ioreq_buf(fnic, io_req, sc);
> - mempool_free(io_req, fnic->io_req_pool);
> - }
> + fnic_release_ioreq_buf(fnic, io_req, sc);
> + mempool_free(io_req, fnic->io_req_pool);
>
> /*
> * Any IO is returned during reset, it needs to call scsi_done
> @@ -2135,8 +2128,7 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, void *data)
> * successfully aborted, 1 otherwise
> */
> static int fnic_clean_pending_aborts(struct fnic *fnic,
> - struct scsi_cmnd *lr_sc,
> - bool new_sc)
> + struct scsi_cmnd *lr_sc)
>
> {
> int ret = 0;
> @@ -2146,9 +2138,6 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
> .ret = SUCCESS,
> };
>
> - if (new_sc)
> - iter_data.lr_sc = lr_sc;
> -
> scsi_host_busy_iter(fnic->lport->host,
> fnic_pending_aborts_iter, &iter_data);
> if (iter_data.ret == FAILED) {
> @@ -2174,7 +2163,8 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
> */
> int fnic_device_reset(struct scsi_cmnd *sc)
> {
> - struct request *rq = scsi_cmd_to_rq(sc);
> + struct scsi_device *sdev = sc->device;
> + struct request *req;
> struct fc_lport *lp;
> struct fnic *fnic;
> struct fnic_io_req *io_req = NULL;
> @@ -2187,15 +2177,17 @@ int fnic_device_reset(struct scsi_cmnd *sc)
> struct scsi_lun fc_lun;
> struct fnic_stats *fnic_stats;
> struct reset_stats *reset_stats;
> - int tag = rq->tag;
> + int tag;
> DECLARE_COMPLETION_ONSTACK(tm_done);
> - bool new_sc = 0;
>
> /* Wait for rport to unblock */
> - fc_block_scsi_eh(sc);
> + rport = starget_to_rport(scsi_target(sdev));
> + ret = fc_block_rport(rport);
> + if (ret)
> + return ret;
>
> /* Get local-port, check ready and link up */
> - lp = shost_priv(sc->device->host);
> + lp = shost_priv(sdev->host);
>
> fnic = lport_priv(lp);
> fnic_stats = &fnic->fnic_stats;
> @@ -2203,53 +2195,55 @@ int fnic_device_reset(struct scsi_cmnd *sc)
>
> atomic64_inc(&reset_stats->device_resets);
>
> - rport = starget_to_rport(scsi_target(sc->device));
> FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
> - "Device reset called FCID 0x%x, LUN 0x%llx sc 0x%p\n",
> - rport->port_id, sc->device->lun, sc);
> + "Device reset called FCID 0x%x, LUN 0x%llx\n",
> + rport->port_id, sdev->lun);
>
> if (lp->state != LPORT_ST_READY || !(lp->link_up))
> - goto fnic_device_reset_end;
> + return ret;
>
> /* Check if remote port up */
> if (fc_remote_port_chkready(rport)) {
> atomic64_inc(&fnic_stats->misc_stats.rport_not_ready);
> - goto fnic_device_reset_end;
> + return ret;
> }
>
> - fnic_priv(sc)->flags = FNIC_DEVICE_RESET;
> -
> - if (unlikely(tag < 0)) {
> + req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
> + BLK_MQ_REQ_NOWAIT);
> + if (!req) {
> /*
> - * For device reset issued through sg3utils, we let
> - * only one LUN_RESET to go through and use a special
> - * tag equal to max_tag_id so that we don't have to allocate
> - * or free it. It won't interact with tags
> - * allocated by mid layer.
> + * Request allocation might fail, indicating that
> + * all tags are busy.
> + * But device reset will be called only from within
> + * SCSI EH, at which time all I/O is stopped. So the
> + * only active tags would be for failed I/O, but
> + * when all I/O is failed it'll be better to escalate
> + * to host reset anyway.
> */
> - mutex_lock(&fnic->sgreset_mutex);
> - tag = fnic->fnic_max_tag_id;
> - new_sc = 1;
> + FNIC_SCSI_DBG(KERN_ERR, fnic->lport->host,
> + "Device reset allocation failed, all tags busy\n");
> + return ret;
> }
> + sc = blk_mq_rq_to_pdu(req);
> +
> + tag = req->tag;
> io_lock = fnic_io_lock_hash(fnic, sc);
> spin_lock_irqsave(io_lock, flags);
> io_req = fnic_priv(sc)->io_req;
> + if (io_req)
> + goto fnic_device_reset_end;
>
> - /*
> - * If there is a io_req attached to this command, then use it,
> - * else allocate a new one.
> - */
> + io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC);
> if (!io_req) {
> - io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC);
> - if (!io_req) {
> - spin_unlock_irqrestore(io_lock, flags);
> - goto fnic_device_reset_end;
> - }
> - memset(io_req, 0, sizeof(*io_req));
> - io_req->port_id = rport->port_id;
> - fnic_priv(sc)->io_req = io_req;
> + spin_unlock_irqrestore(io_lock, flags);
> + goto fnic_device_reset_end;
> }
> + memset(io_req, 0, sizeof(*io_req));
> + io_req->port_id = rport->port_id;
> + fnic_priv(sc)->io_req = io_req;
> +
> io_req->dr_done = &tm_done;
> + fnic_priv(sc)->flags = FNIC_DEVICE_RESET;
> fnic_priv(sc)->state = FNIC_IOREQ_CMD_PENDING;
> fnic_priv(sc)->lr_status = FCPIO_INVALID_CODE;
> spin_unlock_irqrestore(io_lock, flags);
> @@ -2260,11 +2254,13 @@ int fnic_device_reset(struct scsi_cmnd *sc)
> * issue the device reset, if enqueue failed, clean up the ioreq
> * and break assoc with scsi cmd
> */
> + WRITE_ONCE(req->state, MQ_RQ_IN_FLIGHT);
> if (fnic_queue_dr_io_req(fnic, sc, io_req)) {
> spin_lock_irqsave(io_lock, flags);
> io_req = fnic_priv(sc)->io_req;
> if (io_req)
> io_req->dr_done = NULL;
> + WRITE_ONCE(req->state, MQ_RQ_IDLE);
> goto fnic_device_reset_clean;
> }
> spin_lock_irqsave(io_lock, flags);
> @@ -2279,11 +2275,12 @@ int fnic_device_reset(struct scsi_cmnd *sc)
> msecs_to_jiffies(FNIC_LUN_RESET_TIMEOUT));
>
> spin_lock_irqsave(io_lock, flags);
> + WRITE_ONCE(req->state, MQ_RQ_IDLE);
> io_req = fnic_priv(sc)->io_req;
> if (!io_req) {
> spin_unlock_irqrestore(io_lock, flags);
> FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
> - "io_req is null tag 0x%x sc 0x%p\n", tag, sc);
> + "io_req is null tag 0x%x\n", tag);
> goto fnic_device_reset_end;
> }
> io_req->dr_done = NULL;
> @@ -2326,7 +2323,7 @@ int fnic_device_reset(struct scsi_cmnd *sc)
> spin_unlock_irqrestore(io_lock, flags);
> FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
> "Abort and terminate issued on Device reset "
> - "tag 0x%x sc 0x%p\n", tag, sc);
> + "tag 0x%x\n", tag);
> break;
> }
> }
> @@ -2364,7 +2361,7 @@ int fnic_device_reset(struct scsi_cmnd *sc)
> * the lun reset cmd. If all cmds get cleaned, the lun reset
> * succeeds
> */
> - if (fnic_clean_pending_aborts(fnic, sc, new_sc)) {
> + if (fnic_clean_pending_aborts(fnic, sc)) {
> spin_lock_irqsave(io_lock, flags);
> io_req = fnic_priv(sc)->io_req;
> FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
> @@ -2393,15 +2390,15 @@ int fnic_device_reset(struct scsi_cmnd *sc)
> }
>
> fnic_device_reset_end:
> - FNIC_TRACE(fnic_device_reset, sc->device->host->host_no, rq->tag, sc,
> + FNIC_TRACE(fnic_device_reset, sc->device->host->host_no,
> + scsi_cmd_to_rq(sc)->tag, sc,
> jiffies_to_msecs(jiffies - start_time),
> 0, ((u64)sc->cmnd[0] << 32 |
> (u64)sc->cmnd[2] << 24 | (u64)sc->cmnd[3] << 16 |
> (u64)sc->cmnd[4] << 8 | sc->cmnd[5]),
> fnic_flags_and_state(sc));
>
> - if (new_sc)
> - mutex_unlock(&fnic->sgreset_mutex);
> + blk_mq_free_request(req);
>
> FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
> "Returning from device reset %s\n",
> @@ -2633,8 +2630,6 @@ static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data)
> * ignore this lun reset cmd or cmds that do not belong to
> * this lun
> */
> - if (iter_data->lr_sc && sc == iter_data->lr_sc)
> - return true;
> if (iter_data->lun_dev && sc->device != iter_data->lun_dev)
> return true;
>
> @@ -2677,10 +2672,8 @@ int fnic_is_abts_pending(struct fnic *fnic, struct scsi_cmnd *lr_sc)
> .ret = 0,
> };
>
> - if (lr_sc) {
> + if (lr_sc)
> iter_data.lun_dev = lr_sc->device;
> - iter_data.lr_sc = lr_sc;
> - }
>
> /* walk again to check, if IOs are still pending in fw */
> scsi_host_busy_iter(fnic->lport->host,
> diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
> index f123531e7dd1..c38f648da3d7 100644
> --- a/drivers/scsi/snic/snic_scsi.c
> +++ b/drivers/scsi/snic/snic_scsi.c
> @@ -2367,7 +2367,7 @@ snic_cmpl_pending_tmreq(struct snic *snic, struct scsi_cmnd *sc)
> }
>
> static bool
> -snic_scsi_cleanup_iter(struct scsi_cmnd *sc, void *data, bool reserved)
> +snic_scsi_cleanup_iter(struct scsi_cmnd *sc, void *data)
> {
> struct snic *snic = data;
> struct snic_req_info *rqi = NULL;
> @@ -2541,8 +2541,7 @@ struct snic_tgt_scsi_abort_io_data {
> int abt_cnt;
> };
>
> -static bool snic_tgt_scsi_abort_io_iter(struct scsi_cmnd *sc, void *data,
> - bool reserved)
> +static bool snic_tgt_scsi_abort_io_iter(struct scsi_cmnd *sc, void *data)
> {
> struct snic_tgt_scsi_abort_io_data *iter_data = data;
> struct snic *snic = iter_data->snic;
> --
> 2.35.3
---end quoted text---
^ permalink raw reply [flat|nested] 28+ messages in thread* RE: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-10-24 6:54 ` Christoph Hellwig
@ 2023-11-06 19:42 ` Karan Tilak Kumar (kartilak)
2023-11-14 2:29 ` Karan Tilak Kumar (kartilak)
0 siblings, 1 reply; 28+ messages in thread
From: Karan Tilak Kumar (kartilak) @ 2023-11-06 19:42 UTC (permalink / raw)
To: Christoph Hellwig, Hannes Reinecke
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On Monday, October 23, 2023 11:54 PM, Christoph Hellwig <hch@lst.de> wrote:
>
> Adding the fnic maintainers as they are probably most qualified to review and test this.
>
> On Mon, Oct 23, 2023 at 11:15:04AM +0200, Hannes Reinecke wrote:
> > Allocate a reset command on the fly instead of relying on using the
> > command which triggered the device failure.
> > This might fail if all available tags are busy, but in that case it'll
> > be safer to fall back to host reset anyway.
> >
Thanks for this fix, Hannes.
I'm working on integrating these changes and testing them.
I'll get back to you about this.
Regards,
Karan
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-06 19:42 ` Karan Tilak Kumar (kartilak)
@ 2023-11-14 2:29 ` Karan Tilak Kumar (kartilak)
2023-11-14 8:01 ` Hannes Reinecke
0 siblings, 1 reply; 28+ messages in thread
From: Karan Tilak Kumar (kartilak) @ 2023-11-14 2:29 UTC (permalink / raw)
To: Christoph Hellwig, Hannes Reinecke
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On Monday, November 6, 2023 11:42 AM, Karan Tilak Kumar (kartilak) wrote:
>
> On Monday, October 23, 2023 11:54 PM, Christoph Hellwig <hch@lst.de> wrote:
> >
> > Adding the fnic maintainers as they are probably most qualified to review and test this.
> >
> > On Mon, Oct 23, 2023 at 11:15:04AM +0200, Hannes Reinecke wrote:
> > > Allocate a reset command on the fly instead of relying on using the
> > > command which triggered the device failure.
> > > This might fail if all available tags are busy, but in that case
> > > it'll be safer to fall back to host reset anyway.
> > >
>
> Thanks for this fix, Hannes.
> I'm working on integrating these changes and testing them.
> I'll get back to you about this.
>
I integrated your patch set using "b4 shazam" and built all the changes to do some dev testing.
I instrumented the code to do the following:
- After one million IOs, drop one IO response.
- This will trigger an abort. Drop that abort response.
- This will trigger a device reset. I'm seeing that the tag here is 0xFFFFFFFF (SCSI_NO_TAG).
This tag fails the out-of-range tag check and escalates to host reset.
I've been able to repro this three times with the same result.
The expectation with this test case is that the device reset should go through successfully without escalating to host reset.
Regards,
Karan
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-14 2:29 ` Karan Tilak Kumar (kartilak)
@ 2023-11-14 8:01 ` Hannes Reinecke
2023-11-16 0:22 ` Karan Tilak Kumar (kartilak)
0 siblings, 1 reply; 28+ messages in thread
From: Hannes Reinecke @ 2023-11-14 8:01 UTC (permalink / raw)
To: Karan Tilak Kumar (kartilak), Christoph Hellwig
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On 11/14/23 03:29, Karan Tilak Kumar (kartilak) wrote:
> On Monday, November 6, 2023 11:42 AM, Karan Tilak Kumar (kartilak) wrote:
>>
>> On Monday, October 23, 2023 11:54 PM, Christoph Hellwig <hch@lst.de> wrote:
>>>
>>> Adding the fnic maintainers as they are probably most qualified to review and test this.
>>>
>>> On Mon, Oct 23, 2023 at 11:15:04AM +0200, Hannes Reinecke wrote:
>>>> Allocate a reset command on the fly instead of relying on using the
>>>> command which triggered the device failure.
>>>> This might fail if all available tags are busy, but in that case
>>>> it'll be safer to fall back to host reset anyway.
>>>>
>>
>> Thanks for this fix, Hannes.
>> I'm working on integrating these changes and testing them.
>> I'll get back to you about this.
>>
>
> I integrated your patch set using "b4 shazam" and built all the changes to do some dev testing.
> I instrumented the code to do the following:
>
> - After one million IOs, drop one IO response.
> - This will trigger an abort. Drop that abort response.
> - This will trigger a device reset. I'm seeing that the tag here is 0xFFFFFFFF (SCSI_NO_TAG).
>
> This tag fails the out-of-range tag check and escalates to host reset.
> I've been able to repro this three times with the same result.
>
> The expectation with this test case is that the device reset should go through successfully without escalating to host reset.
>
Thanks for the report.
Certainly not what was intended with the patch. Can you try with this
patch on top:
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 8909cf09cf9e..50eea6d2344a 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -2210,7 +2210,7 @@ int fnic_device_reset(struct scsi_device *sdev)
req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
BLK_MQ_REQ_NOWAIT);
- if (!req) {
+ if (IS_ERR(req)) {
/*
* Request allocation might fail, indicating that
* all tags are busy.
@@ -2221,7 +2221,8 @@ int fnic_device_reset(struct scsi_device *sdev)
* to host reset anyway.
*/
FNIC_SCSI_DBG(KERN_ERR, fnic->lport->host,
- "Device reset allocation failed, all tags
busy\n");
+ "Device reset allocation failed with %d\n",
+ PTR_ERR(req));
return ret;
}
(The first hunk is actually a bugfix, and will be included with
the next submission anyway.)
Please enable fnic debug during testing and send me the logs.
It _might_ be that this is actually by design, as we will be
failing if all tags are busy (ie if the error is EWOULDBLOCK).
But if we get EAGAIN it means that the queue is blocked and
we'll need to investigate.
Thanks for your help here.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply related [flat|nested] 28+ messages in thread* RE: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-14 8:01 ` Hannes Reinecke
@ 2023-11-16 0:22 ` Karan Tilak Kumar (kartilak)
2023-11-17 23:31 ` Karan Tilak Kumar (kartilak)
0 siblings, 1 reply; 28+ messages in thread
From: Karan Tilak Kumar (kartilak) @ 2023-11-16 0:22 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On Tuesday, November 14, 2023 12:02 AM, Hannes Reinecke <hare@suse.de> wrote:
> > I integrated your patch set using "b4 shazam" and built all the changes to do some dev testing.
> > I instrumented the code to do the following:
> >
> > - After one million IOs, drop one IO response.
> > - This will trigger an abort. Drop that abort response.
> > - This will trigger a device reset. I'm seeing that the tag here is 0xFFFFFFFF (SCSI_NO_TAG).
> >
> > This tag fails the out-of-range tag check and escalates to host reset.
> > I've been able to repro this three times with the same result.
> >
> > The expectation with this test case is that the device reset should go through successfully without escalating to host reset.
> >
> Thanks for the report.
> Certainly not what was intended with the patch. Can you try with this patch on top:
I applied this patch on top of the previous one and performed the unit tests that I had mentioned before.
>
> (The first hunk is actually a bugfix, and will be included with the next submission anyway.) Please enable fnic debug during testing and send me the logs.
> It _might_ be that this is actually by design, as we will be failing if all tags are busy (ie if the error is EWOULDBLOCK).
> But if we get EAGAIN it means that the queue is blocked and we'll need to investigate.
There were two attempts at reproduction. One of them resulted in a crash. I have enabled fnic debug and captured the log.
I've pasted the excerpt here for reference.
Repro 1:
Nov 15 16:08:14 localhost kernel: fnic<7>: UT: fnic_fcpio_icmnd_cmpl_handler: 847: tag: 0x91 sc: 00000000bbe9fd78 CDB Opcode: 0x28 Dropping icmnd completion
Nov 15 16:08:35 localhost multipathd[1746]: burst continued 30025 ms, too long time, stopped
Nov 15 16:08:36 localhost kernel: sched: RT throttling activated
Nov 15 16:08:44 localhost kernel: scsi host7: Abort Cmd called FCID 0x5205fa, LUN 0x1 TAG 91 flags 3
Nov 15 16:08:44 localhost kernel: scsi host7: CBD Opcode: 28 Abort issued time: 30036 msec
Nov 15 16:08:44 localhost kernel: scsi host7: fnic<7>: UT: fnic_fcpio_itmf_cmpl_handler: 1113: tag: 0x91 sc: 00 status: FCPIO_IO_NOT_FOUND Dropping abort completion
Nov 15 16:09:05 localhost multipathd[1746]: burst continued 30041 ms, too long time, stopped
Nov 15 16:09:06 localhost kernel: scsi host7: Returning from abort cmd type 2 FAILED
Nov 15 16:09:06 localhost kernel: scsi host7: Device reset called FCID 0x5205fa, LUN 0x1 sc: 00000000bbe9fd78
Nov 15 16:09:06 localhost kernel: scsi host7: TAG ffffffff sc: 000000005ee1ff69
Nov 15 16:09:06 localhost kernel: scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_SUCCESS
Nov 15 16:09:17 localhost kernel: scsi host7: Device reset timed out
Nov 15 16:09:17 localhost kernel: scsi host7: Abort and terminate issued on Device reset tag 0xffffffff
Nov 15 16:09:17 localhost kernel: scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_IO_NOT_FOUND
Nov 15 16:09:27 localhost kernel: scsi host7: Device reset completed - failed
Nov 15 16:09:27 localhost kernel: scsi host7: Returning from device reset FAILED
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_reset called
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x52060c
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fb
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fa
Nov 15 16:09:27 localhost kernel: scsi host7: update_mac 00:25:b5:bb:bb:a0
Nov 15 16:09:27 localhost kernel: scsi host7: Issued fw reset
Nov 15 16:09:27 localhost kernel: scsi host7: set port_id 0 fp 0000000000000000
Nov 15 16:09:27 localhost kernel: scsi host7: Returning from fnic reset SUCCESS
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x52060c
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fb
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_cleanup_io: tag:0x91 : sc:0x00000000bbe9fd78 duration = 72835 DID_TRANSPORT_DISRUPTED
Nov 15 16:09:27 localhost kernel: scsi host7: Calling done for IO not issued to fw: tag:0x91 sc:0x00000000bbe9fd78
Nov 15 16:09:27 localhost kernel: scsi host7: reset cmpl success
Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fa
Nov 15 16:09:29 localhost kernel: host7: Assigned Port ID 0d0840
Nov 15 16:09:29 localhost kernel: scsi host7: set port_id d0840 fp 0000000046a33a9d
Nov 15 16:09:29 localhost kernel: scsi host7: update_mac 0e:fc:00:0d:08:40
Nov 15 16:09:29 localhost kernel: scsi host7: FLOGI reg issued fcid d0840 map 0 dest 8c:60:4f:95:ea:a4
Nov 15 16:09:29 localhost kernel: scsi host7: flog reg succeeded
Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: sdb - tur checker timed out
Nov 15 16:09:38 localhost multipathd[1746]: checker failed path 8:16 in map 3600a098038303873633f4d415156374d
Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: remaining active paths: 1
Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: sde - tur checker timed out
Nov 15 16:09:38 localhost multipathd[1746]: checker failed path 8:64 in map 3600a098038303873633f4d415156374e
Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: remaining active paths: 1
Nov 15 16:09:38 localhost multipathd[1746]: sdb: mark as failed
Nov 15 16:09:38 localhost multipathd[1746]: sde: mark as failed
Nov 15 16:09:38 localhost kernel: device-mapper: multipath: 253:3: Failing path 8:16.
Nov 15 16:09:38 localhost kernel: device-mapper: multipath: 253:4: Failing path 8:64.
Nov 15 16:09:40 localhost kernel: sd 7:0:2:1: Power-on or device reset occurred
Nov 15 16:09:40 localhost kernel: sd 7:0:1:1: Power-on or device reset occurred
Nov 15 16:09:40 localhost kernel: sd 7:0:1:0: Power-on or device reset occurred
Nov 15 16:09:40 localhost kernel: sd 7:0:2:0: Power-on or device reset occurred
Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: sdb - tur checker reports path is up
Nov 15 16:09:43 localhost multipathd[1746]: 8:16: reinstated
Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: remaining active paths: 2
Nov 15 16:09:43 localhost kernel: device-mapper: multipath: 253:3: Reinstating path 8:16.
Nov 15 16:09:43 localhost kernel: device-mapper: multipath: 253:4: Reinstating path 8:64.
Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: sde - tur checker reports path is up
Nov 15 16:09:43 localhost multipathd[1746]: 8:64: reinstated
Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: remaining active paths: 2
=============
Repro 2:
[ 184.517949] fnic<7>: UT: fnic_fcpio_icmnd_cmpl_handler: 847: tag: 0xeb sc: 000000000d852f23 CDB Opcode: 0x28 Dropping icmnd completion
[ 214.523460] scsi host7: Abort Cmd called FCID 0x5205fb, LUN 0x1 TAG eb flags 3
[ 214.523474] scsi host7: CBD Opcode: 28 Abort issued time: 30026 msec
[ 214.523548] scsi host7: fnic<7>: UT: fnic_fcpio_itmf_cmpl_handler: 1113: tag: 0xeb sc: 00 status: FCPIO_IO_NOT_FOUND Dropping abort completion
[ 236.835184] scsi host7: Returning from abort cmd type 2 FAILED
[ 236.943669] scsi host7: Device reset called FCID 0x5205fb, LUN 0x1 sc: 000000000d852f23
[ 236.943688] scsi host7: TAG ffffffff sc: 00000000ea33dc31
[ 236.956270] scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_SUCCESS
[ 247.074054] scsi host7: Device reset timed out
[ 247.074063] scsi host7: Abort and terminate issued on Device reset tag 0xffffffff
[ 247.074070] scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_IO_NOT_FOUND
[ 257.312999] scsi host7: Device reset completed - failed
[ 257.313008] BUG: kernel NULL pointer dereference, address: 000000000000001c
[ 257.313016] #PF: supervisor read access in kernel mode
[ 257.313021] #PF: error_code(0x0000) - not-present page
[ 257.313028] PGD 80000010c746c067 P4D 80000010c746c067 PUD 10c746d067 PMD 0
[ 257.313041] Oops: 0000 [#1] PREEMPT SMP PTI
[ 257.313049] CPU: 0 PID: 1114 Comm: scsi_eh_7 Kdump: loaded Not tainted 6.4.7 #5
[ 257.313058] Hardware name: Cisco Systems Inc UCSB-B200-M5/UCSB-B200-M5, BIOS B200M5.4.0.2a.0.1102181538 11/02/2018
[ 257.313063] RIP: 0010:dma_direct_unmap_sg+0x58/0x1d0
[ 257.313080] Code: 89 f5 89 d3 4d 89 c5 45 31 ff eb 1e 83 e0 fe 89 45 1c 48 89 ef 41 83 c7 01 e8 b4 b9 44 00 48 89 c5 44 39 fb 0f 84 39 01 00 00 <8b> 45 1c a8 01 75 db 4c 8b 05 f2 1b 68 01 49 8b bc 24 58 02 00 00
[ 257.313087] RSP: 0018:ffff9cfa49157cc0 EFLAGS: 00010246
[ 257.313094] RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000002
[ 257.313099] RDX: 0000000000000003 RSI: 0000000000000000 RDI: ffff8fab45f470d0
[ 257.313103] RBP: 0000000000000000 R08: 0000000000000000 R09: 00000000ffff7fff
[ 257.313107] R10: ffff9cfa49157b18 R11: ffffffff969e5d28 R12: ffff8fab45f470d0
[ 257.313112] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 257.313116] FS: 0000000000000000(0000) GS:ffff8faaff800000(0000) knlGS:0000000000000000
[ 257.313122] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 257.313128] CR2: 000000000000001c CR3: 00000010c874c002 CR4: 00000000007706f0
[ 257.313133] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 257.313137] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 257.313141] PKRU: 55555554
[ 257.313145] Call Trace:
[ 257.313150] <TASK>
[ 257.313158] ? __die+0x24/0x70
[ 257.313171] ? page_fault_oops+0x82/0x150
[ 257.313181] ? irq_work_queue+0x32/0x60
[ 257.313190] ? vprintk_emit+0x100/0x230
[ 257.313200] ? exc_page_fault+0x69/0x150
[ 257.313211] ? asm_exc_page_fault+0x26/0x30
[ 257.313228] ? dma_direct_unmap_sg+0x58/0x1d0
[ 257.313244] fnic_release_ioreq_buf+0x23/0xc0 [fnic]
[ 257.313274] fnic_device_reset+0x57c/0x860 [fnic]
[ 257.313297] ? sched_clock_cpu+0xf/0x190
[ 257.313308] ? raw_spin_rq_lock_nested+0x1d/0x80
[ 257.313318] ? newidle_balance+0x26e/0x400
[ 257.313329] scsi_eh_bus_device_reset+0xee/0x2e0
[ 257.313343] scsi_eh_ready_devs+0x6e/0x2f0
[ 257.313366] ? rpm_resume+0x411/0x750
[ 257.313372] scsi_unjam_host+0x10a/0x200
[ 257.313377] scsi_error_handler+0x144/0x1d0
[ 257.313380] ? __pfx_scsi_error_handler+0x10/0x10
[ 257.313383] kthread+0xe3/0x120
[ 257.313386] ? __pfx_kthread+0x10/0x10
[ 257.313388] ret_from_fork+0x29/0x50
[ 257.313394] </TASK>
[ 257.313395] Modules linked in: dm_round_robin snd_seq_dummy snd_hrtimer snd_seq snd_timer snd_seq_device snd soundcore nft_compat nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables rfkill nfnetlink qrtr dm_multipath intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common isst_if_common skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass rapl intel_cstate joydev mei_me intel_uncore ioatdma mei intel_pch_thermal ipmi_ssif pcspkr dca lpc_ich acpi_ipmi ipmi_si acpi_pad acpi_power_meter xfs libcrc32c mgag200 sd_mod t10_pi drm_kms_helper crc64_rocksoft_generic crc64_rocksoft crc64 sg fnic syscopyarea sysfillrect sysimgblt i2c_algo_bit libfcoe drm_shmem_helper crct10dif_pclmul ahci crc32_pclmul crc32c_intel libahci libfc ghash_clmulni_intel drm libata enic scsi_transport_fc sha512_ssse3 megaraid_sas wmi dm_mirror
[ 257.313470] dm_region_hash dm_log dm_mod ipmi_devintf ipmi_msghandler fuse
[ 257.313478] CR2: 000000000000001c
Regards,
Karan
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-16 0:22 ` Karan Tilak Kumar (kartilak)
@ 2023-11-17 23:31 ` Karan Tilak Kumar (kartilak)
2023-11-20 7:50 ` Hannes Reinecke
0 siblings, 1 reply; 28+ messages in thread
From: Karan Tilak Kumar (kartilak) @ 2023-11-17 23:31 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On Wednesday, November 15, 2023 4:22 PM, Karan Tilak Kumar (kartilak) wrote:
>
> On Tuesday, November 14, 2023 12:02 AM, Hannes Reinecke <hare@suse.de> wrote:
> > > I integrated your patch set using "b4 shazam" and built all the changes to do some dev testing.
> > > I instrumented the code to do the following:
> > >
> > > - After one million IOs, drop one IO response.
> > > - This will trigger an abort. Drop that abort response.
> > > - This will trigger a device reset. I'm seeing that the tag here is 0xFFFFFFFF (SCSI_NO_TAG).
> > >
> > > This tag fails the out-of-range tag check and escalates to host reset.
> > > I've been able to repro this three times with the same result.
> > >
> > > The expectation with this test case is that the device reset should go through successfully without escalating to host reset.
> > >
> > Thanks for the report.
> > Certainly not what was intended with the patch. Can you try with this patch on top:
>
> I applied this patch on top of the previous one and performed the unit tests that I had mentioned before.
>
> >
> > (The first hunk is actually a bugfix, and will be included with the next submission anyway.) Please enable fnic debug during testing and send me the logs.
> > It _might_ be that this is actually by design, as we will be failing if all tags are busy (ie if the error is EWOULDBLOCK).
> > But if we get EAGAIN it means that the queue is blocked and we'll need to investigate.
>
> There were two attempts at reproduction. One of them resulted in a crash. I have enabled fnic debug and captured the log.
> I've pasted the excerpt here for reference.
>
> Repro 1:
>
> Nov 15 16:08:14 localhost kernel: fnic<7>: UT: fnic_fcpio_icmnd_cmpl_handler: 847: tag: 0x91 sc: 00000000bbe9fd78 CDB Opcode: 0x28 Dropping icmnd completion
> Nov 15 16:08:35 localhost multipathd[1746]: burst continued 30025 ms, too long time, stopped
> Nov 15 16:08:36 localhost kernel: sched: RT throttling activated
> Nov 15 16:08:44 localhost kernel: scsi host7: Abort Cmd called FCID 0x5205fa, LUN 0x1 TAG 91 flags 3
> Nov 15 16:08:44 localhost kernel: scsi host7: CBD Opcode: 28 Abort issued time: 30036 msec
> Nov 15 16:08:44 localhost kernel: scsi host7: fnic<7>: UT: fnic_fcpio_itmf_cmpl_handler: 1113: tag: 0x91 sc: 00 status: FCPIO_IO_NOT_FOUND Dropping abort completion
> Nov 15 16:09:05 localhost multipathd[1746]: burst continued 30041 ms, too long time, stopped
> Nov 15 16:09:06 localhost kernel: scsi host7: Returning from abort cmd type 2 FAILED
> Nov 15 16:09:06 localhost kernel: scsi host7: Device reset called FCID 0x5205fa, LUN 0x1 sc: 00000000bbe9fd78
> Nov 15 16:09:06 localhost kernel: scsi host7: TAG ffffffff sc: 000000005ee1ff69
> Nov 15 16:09:06 localhost kernel: scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_SUCCESS
> Nov 15 16:09:17 localhost kernel: scsi host7: Device reset timed out
> Nov 15 16:09:17 localhost kernel: scsi host7: Abort and terminate issued on Device reset tag 0xffffffff
> Nov 15 16:09:17 localhost kernel: scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_IO_NOT_FOUND
> Nov 15 16:09:27 localhost kernel: scsi host7: Device reset completed - failed
> Nov 15 16:09:27 localhost kernel: scsi host7: Returning from device reset FAILED
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_reset called
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x52060c
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fb
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fa
> Nov 15 16:09:27 localhost kernel: scsi host7: update_mac 00:25:b5:bb:bb:a0
> Nov 15 16:09:27 localhost kernel: scsi host7: Issued fw reset
> Nov 15 16:09:27 localhost kernel: scsi host7: set port_id 0 fp 0000000000000000
> Nov 15 16:09:27 localhost kernel: scsi host7: Returning from fnic reset SUCCESS
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x52060c
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fb
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_cleanup_io: tag:0x91 : sc:0x00000000bbe9fd78 duration = 72835 DID_TRANSPORT_DISRUPTED
> Nov 15 16:09:27 localhost kernel: scsi host7: Calling done for IO not issued to fw: tag:0x91 sc:0x00000000bbe9fd78
> Nov 15 16:09:27 localhost kernel: scsi host7: reset cmpl success
> Nov 15 16:09:27 localhost kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205fa
> Nov 15 16:09:29 localhost kernel: host7: Assigned Port ID 0d0840
> Nov 15 16:09:29 localhost kernel: scsi host7: set port_id d0840 fp 0000000046a33a9d
> Nov 15 16:09:29 localhost kernel: scsi host7: update_mac 0e:fc:00:0d:08:40
> Nov 15 16:09:29 localhost kernel: scsi host7: FLOGI reg issued fcid d0840 map 0 dest 8c:60:4f:95:ea:a4
> Nov 15 16:09:29 localhost kernel: scsi host7: flog reg succeeded
> Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: sdb - tur checker timed out
> Nov 15 16:09:38 localhost multipathd[1746]: checker failed path 8:16 in map 3600a098038303873633f4d415156374d
> Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: remaining active paths: 1
> Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: sde - tur checker timed out
> Nov 15 16:09:38 localhost multipathd[1746]: checker failed path 8:64 in map 3600a098038303873633f4d415156374e
> Nov 15 16:09:38 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: remaining active paths: 1
> Nov 15 16:09:38 localhost multipathd[1746]: sdb: mark as failed
> Nov 15 16:09:38 localhost multipathd[1746]: sde: mark as failed
> Nov 15 16:09:38 localhost kernel: device-mapper: multipath: 253:3: Failing path 8:16.
> Nov 15 16:09:38 localhost kernel: device-mapper: multipath: 253:4: Failing path 8:64.
> Nov 15 16:09:40 localhost kernel: sd 7:0:2:1: Power-on or device reset occurred
> Nov 15 16:09:40 localhost kernel: sd 7:0:1:1: Power-on or device reset occurred
> Nov 15 16:09:40 localhost kernel: sd 7:0:1:0: Power-on or device reset occurred
> Nov 15 16:09:40 localhost kernel: sd 7:0:2:0: Power-on or device reset occurred
> Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: sdb - tur checker reports path is up
> Nov 15 16:09:43 localhost multipathd[1746]: 8:16: reinstated
> Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374d: remaining active paths: 2
> Nov 15 16:09:43 localhost kernel: device-mapper: multipath: 253:3: Reinstating path 8:16.
> Nov 15 16:09:43 localhost kernel: device-mapper: multipath: 253:4: Reinstating path 8:64.
> Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: sde - tur checker reports path is up
> Nov 15 16:09:43 localhost multipathd[1746]: 8:64: reinstated
> Nov 15 16:09:43 localhost multipathd[1746]: 3600a098038303873633f4d415156374e: remaining active paths: 2
>
> =============
> Repro 2:
>
> [ 184.517949] fnic<7>: UT: fnic_fcpio_icmnd_cmpl_handler: 847: tag: 0xeb sc: 000000000d852f23 CDB Opcode: 0x28 Dropping icmnd completion
> [ 214.523460] scsi host7: Abort Cmd called FCID 0x5205fb, LUN 0x1 TAG eb flags 3
> [ 214.523474] scsi host7: CBD Opcode: 28 Abort issued time: 30026 msec
> [ 214.523548] scsi host7: fnic<7>: UT: fnic_fcpio_itmf_cmpl_handler: 1113: tag: 0xeb sc: 00 status: FCPIO_IO_NOT_FOUND Dropping abort completion
> [ 236.835184] scsi host7: Returning from abort cmd type 2 FAILED
> [ 236.943669] scsi host7: Device reset called FCID 0x5205fb, LUN 0x1 sc: 000000000d852f23
> [ 236.943688] scsi host7: TAG ffffffff sc: 00000000ea33dc31
> [ 236.956270] scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_SUCCESS
> [ 247.074054] scsi host7: Device reset timed out
> [ 247.074063] scsi host7: Abort and terminate issued on Device reset tag 0xffffffff
> [ 247.074070] scsi host7: Tag out of range tag ffffffff hdr status = FCPIO_IO_NOT_FOUND
> [ 257.312999] scsi host7: Device reset completed - failed
> [ 257.313008] BUG: kernel NULL pointer dereference, address: 000000000000001c
> [ 257.313016] #PF: supervisor read access in kernel mode
> [ 257.313021] #PF: error_code(0x0000) - not-present page
> [ 257.313028] PGD 80000010c746c067 P4D 80000010c746c067 PUD 10c746d067 PMD 0
> [ 257.313041] Oops: 0000 [#1] PREEMPT SMP PTI
> [ 257.313049] CPU: 0 PID: 1114 Comm: scsi_eh_7 Kdump: loaded Not tainted 6.4.7 #5
> [ 257.313058] Hardware name: Cisco Systems Inc UCSB-B200-M5/UCSB-B200-M5, BIOS B200M5.4.0.2a.0.1102181538 11/02/2018
> [ 257.313063] RIP: 0010:dma_direct_unmap_sg+0x58/0x1d0
> [ 257.313080] Code: 89 f5 89 d3 4d 89 c5 45 31 ff eb 1e 83 e0 fe 89 45 1c 48 89 ef 41 83 c7 01 e8 b4 b9 44 00 48 89 c5 44 39 fb 0f 84 39 01 00 00 <8b> 45 1c a8 01 75 db 4c 8b 05 f2 1b 68 01 49 8b bc 24 58 02 00 00
> [ 257.313087] RSP: 0018:ffff9cfa49157cc0 EFLAGS: 00010246
> [ 257.313094] RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000002
> [ 257.313099] RDX: 0000000000000003 RSI: 0000000000000000 RDI: ffff8fab45f470d0
> [ 257.313103] RBP: 0000000000000000 R08: 0000000000000000 R09: 00000000ffff7fff
> [ 257.313107] R10: ffff9cfa49157b18 R11: ffffffff969e5d28 R12: ffff8fab45f470d0
> [ 257.313112] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> [ 257.313116] FS: 0000000000000000(0000) GS:ffff8faaff800000(0000) knlGS:0000000000000000
> [ 257.313122] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 257.313128] CR2: 000000000000001c CR3: 00000010c874c002 CR4: 00000000007706f0
> [ 257.313133] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 257.313137] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 257.313141] PKRU: 55555554
> [ 257.313145] Call Trace:
> [ 257.313150] <TASK>
> [ 257.313158] ? __die+0x24/0x70
> [ 257.313171] ? page_fault_oops+0x82/0x150
> [ 257.313181] ? irq_work_queue+0x32/0x60
> [ 257.313190] ? vprintk_emit+0x100/0x230
> [ 257.313200] ? exc_page_fault+0x69/0x150
> [ 257.313211] ? asm_exc_page_fault+0x26/0x30
> [ 257.313228] ? dma_direct_unmap_sg+0x58/0x1d0
> [ 257.313244] fnic_release_ioreq_buf+0x23/0xc0 [fnic]
> [ 257.313274] fnic_device_reset+0x57c/0x860 [fnic]
> [ 257.313297] ? sched_clock_cpu+0xf/0x190
> [ 257.313308] ? raw_spin_rq_lock_nested+0x1d/0x80
> [ 257.313318] ? newidle_balance+0x26e/0x400
> [ 257.313329] scsi_eh_bus_device_reset+0xee/0x2e0
> [ 257.313343] scsi_eh_ready_devs+0x6e/0x2f0
> [ 257.313366] ? rpm_resume+0x411/0x750
> [ 257.313372] scsi_unjam_host+0x10a/0x200
> [ 257.313377] scsi_error_handler+0x144/0x1d0
> [ 257.313380] ? __pfx_scsi_error_handler+0x10/0x10
> [ 257.313383] kthread+0xe3/0x120
> [ 257.313386] ? __pfx_kthread+0x10/0x10
> [ 257.313388] ret_from_fork+0x29/0x50
> [ 257.313394] </TASK>
> [ 257.313395] Modules linked in: dm_round_robin snd_seq_dummy snd_hrtimer snd_seq snd_timer snd_seq_device snd soundcore nft_compat nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables rfkill nfnetlink qrtr dm_multipath intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common isst_if_common skx_edac nfit libnvdimm
> x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass rapl intel_cstate joydev mei_me intel_uncore ioatdma mei intel_pch_thermal ipmi_ssif pcspkr dca lpc_ich acpi_ipmi ipmi_si acpi_pad acpi_power_meter xfs libcrc32c mgag200 sd_mod t10_pi drm_kms_helper crc64_rocksoft_generic crc64_rocksoft crc64 sg fnic syscopyarea sysfillrect sysimgblt i2c_algo_bit libfcoe drm_shmem_helper crct10dif_pclmul ahci crc32_pclmul crc32c_intel libahci libfc ghash_clmulni_intel drm libata
> enic scsi_transport_fc sha512_ssse3 megaraid_sas wmi dm_mirror
> [ 257.313470] dm_region_hash dm_log dm_mod ipmi_devintf ipmi_msghandler fuse
> [ 257.313478] CR2: 000000000000001c
>
>
> Regards,
> Karan
>
Repro 3:
Nov 17 15:27:00 rhel-c4s5 kernel: fnic<7>: UT: fnic_fcpio_icmnd_cmpl_handler: 847: tag: 0x52 sc: 00000000fab117f4 CDB Opcode: 0x28 Dropping icmnd completion
Nov 17 15:27:30 rhel-c4s5 kernel: scsi host7: Abort Cmd called FCID 0x5205f2, LUN 0x2 TAG 52 flags 3
Nov 17 15:27:30 rhel-c4s5 kernel: scsi host7: CBD Opcode: 28 Abort issued time: 30067 msec
Nov 17 15:27:30 rhel-c4s5 kernel: scsi host7: fnic<7>: UT: fnic_fcpio_itmf_cmpl_handler: 1113: tag: 0x52 sc: 00 status: FCPIO_IO_NOT_FOUND Dropping abort completion
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Returning from abort cmd type 2 FAILED
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Device reset called FCID 0x5205f2, LUN 0x2 sc: 00000000fab117f4
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Device reset allocation failed with -11
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_reset called
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x52061b
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205f2
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205cb
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205ca
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: update_mac 00:25:b5:cc:aa:00
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Issued fw reset
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: set port_id 0 fp 0000000000000000
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Returning from fnic reset SUCCESS
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_cleanup_io: tag:0x52 : sc:0x00000000fab117f4 duration = 52450 DID_TRANSPORT_DISRUPTED
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Calling done for IO not issued to fw: tag:0x52 sc:0x00000000fab117f4
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: reset cmpl success
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x52061b
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205f2
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205cb
Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205ca
Nov 17 15:27:54 rhel-c4s5 kernel: host7: Assigned Port ID 0d0880
Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: set port_id d0880 fp 0000000067a66565
Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: update_mac 0e:fc:00:0d:08:80
Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: FLOGI reg issued fcid d0880 map 0 dest 8c:60:4f:95:ea:a4
Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: flog reg succeeded
Nov 17 15:28:06 rhel-c4s5 kernel: sd 7:0:3:2: Power-on or device reset occurred
Regards,
Karan
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-17 23:31 ` Karan Tilak Kumar (kartilak)
@ 2023-11-20 7:50 ` Hannes Reinecke
2023-11-20 22:27 ` Karan Tilak Kumar (kartilak)
0 siblings, 1 reply; 28+ messages in thread
From: Hannes Reinecke @ 2023-11-20 7:50 UTC (permalink / raw)
To: Karan Tilak Kumar (kartilak), Christoph Hellwig
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
[-- Attachment #1: Type: text/plain, Size: 5918 bytes --]
On 11/18/23 00:31, Karan Tilak Kumar (kartilak) wrote:
> On Wednesday, November 15, 2023 4:22 PM, Karan Tilak Kumar (kartilak) wrote:
>>
>> On Tuesday, November 14, 2023 12:02 AM, Hannes Reinecke <hare@suse.de> wrote:
>>>> I integrated your patch set using "b4 shazam" and built all the changes to do some dev testing.
>>>> I instrumented the code to do the following:
>>>>
>>>> - After one million IOs, drop one IO response.
>>>> - This will trigger an abort. Drop that abort response.
>>>> - This will trigger a device reset. I'm seeing that the tag here is 0xFFFFFFFF (SCSI_NO_TAG).
>>>>
>>>> This tag fails the out-of-range tag check and escalates to host reset.
>>>> I've been able to repro this three times with the same result.
>>>>
>>>> The expectation with this test case is that the device reset should go through successfully without escalating to host reset.
>>>>
>>> Thanks for the report.
>>> Certainly not what was intended with the patch. Can you try with this patch on top:
>>
>> I applied this patch on top of the previous one and performed the unit tests that I had mentioned before.
>>
>>>
>>> (The first hunk is actually a bugfix, and will be included with the next submission anyway.) Please enable fnic debug during testing and send me the logs.
>>> It _might_ be that this is actually by design, as we will be failing if all tags are busy (ie if the error is EWOULDBLOCK).
>>> But if we get EAGAIN it means that the queue is blocked and we'll need to investigate.
>>
>> There were two attempts at reproduction. One of them resulted in a crash. I have enabled fnic debug and captured the log.
>> I've pasted the excerpt here for reference.
>>
>> Repro 1:
>>
[ .. ]
>>
>> =============
>> Repro 2:
>>
[ .. ]
>>
>>
>> Regards,
>> Karan
>>
>
> Repro 3:
>
> Nov 17 15:27:00 rhel-c4s5 kernel: fnic<7>: UT: fnic_fcpio_icmnd_cmpl_handler: 847: tag: 0x52 sc: 00000000fab117f4 CDB Opcode: 0x28 Dropping icmnd completion
> Nov 17 15:27:30 rhel-c4s5 kernel: scsi host7: Abort Cmd called FCID 0x5205f2, LUN 0x2 TAG 52 flags 3
> Nov 17 15:27:30 rhel-c4s5 kernel: scsi host7: CBD Opcode: 28 Abort issued time: 30067 msec
> Nov 17 15:27:30 rhel-c4s5 kernel: scsi host7: fnic<7>: UT: fnic_fcpio_itmf_cmpl_handler: 1113: tag: 0x52 sc: 00 status: FCPIO_IO_NOT_FOUND Dropping abort completion
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Returning from abort cmd type 2 FAILED
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Device reset called FCID 0x5205f2, LUN 0x2 sc: 00000000fab117f4
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Device reset allocation failed with -11
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_reset called
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x52061b
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205f2
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205cb
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205ca
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: update_mac 00:25:b5:cc:aa:00
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Issued fw reset
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: set port_id 0 fp 0000000000000000
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Returning from fnic reset SUCCESS
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_cleanup_io: tag:0x52 : sc:0x00000000fab117f4 duration = 52450 DID_TRANSPORT_DISRUPTED
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: Calling done for IO not issued to fw: tag:0x52 sc:0x00000000fab117f4
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: reset cmpl success
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x52061b
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205f2
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205cb
> Nov 17 15:27:52 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205ca
> Nov 17 15:27:54 rhel-c4s5 kernel: host7: Assigned Port ID 0d0880
> Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: set port_id d0880 fp 0000000067a66565
> Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: update_mac 0e:fc:00:0d:08:80
> Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: FLOGI reg issued fcid d0880 map 0 dest 8c:60:4f:95:ea:a4
> Nov 17 15:27:54 rhel-c4s5 kernel: scsi host7: flog reg succeeded
> Nov 17 15:28:06 rhel-c4s5 kernel: sd 7:0:3:2: Power-on or device reset occurred
>
Seems that 'blk_queue_enter()' called from scsi_alloc_request() is
failing, presumably as the queue is frozen/quiesced.
Can you try with the attached patch instead of the previous debug patch?
On, and incidentally: there's an unlock missing:
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 0278c4a207f3..47bcc6bd7376 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -2233,8 +2233,10 @@ int fnic_device_reset(struct scsi_device *sdev)
io_lock = fnic_io_lock_hash(fnic, sc);
spin_lock_irqsave(io_lock, flags);
io_req = fnic_priv(sc)->io_req;
- if (io_req)
+ if (io_req) {
+ spin_unlock_irqrestore(io_lock, flags);
goto fnic_device_reset_end;
+ }
io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC);
if (!io_req) {
Maybe fold it in with your patchset (if it's not already merged).
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Ivo Totev, Andrew McDonald,
Werner Knoblich
[-- Attachment #2: 0001-fnic-debug-device-reset-command-allocation-failures.patch --]
[-- Type: text/x-patch, Size: 1412 bytes --]
From f7eba73f0654ebd2060b92ab08c9458243a9c914 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 20 Nov 2023 08:41:16 +0100
Subject: [PATCH] fnic: debug device reset command allocation failures
Add code to debug why device reset command allocation fails.
Signed-off-by: Hannes Reinecke <hare@suse.de>
---
drivers/scsi/fnic/fnic_scsi.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 8909cf09cf9e..0278c4a207f3 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -2209,8 +2209,8 @@ int fnic_device_reset(struct scsi_device *sdev)
}
req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
- BLK_MQ_REQ_NOWAIT);
- if (!req) {
+ BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM);
+ if (IS_ERR(req)) {
/*
* Request allocation might fail, indicating that
* all tags are busy.
@@ -2221,7 +2221,10 @@ int fnic_device_reset(struct scsi_device *sdev)
* to host reset anyway.
*/
FNIC_SCSI_DBG(KERN_ERR, fnic->lport->host,
- "Device reset allocation failed, all tags busy\n");
+ "Device reset allocation failed (error %ld%s%s)\n",
+ PTR_ERR(req),
+ sdev->request_queue->mq_freeze_depth ? ",frozen" : "",
+ sdev->quiesced_by ? ",quiesced" : "");
return ret;
}
sc = blk_mq_rq_to_pdu(req);
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* RE: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-20 7:50 ` Hannes Reinecke
@ 2023-11-20 22:27 ` Karan Tilak Kumar (kartilak)
2023-11-21 2:24 ` Karan Tilak Kumar (kartilak)
0 siblings, 1 reply; 28+ messages in thread
From: Karan Tilak Kumar (kartilak) @ 2023-11-20 22:27 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On Sunday, November 19, 2023 11:50 PM, Hannes Reinecke <hare@suse.de> wrote:
> Seems that 'blk_queue_enter()' called from scsi_alloc_request() is
> failing, presumably as the queue is frozen/quiesced.
> Can you try with the attached patch instead of the previous debug patch?
>
> On, and incidentally: there's an unlock missing:
>
> diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
> index 0278c4a207f3..47bcc6bd7376 100644
> --- a/drivers/scsi/fnic/fnic_scsi.c
> +++ b/drivers/scsi/fnic/fnic_scsi.c
> @@ -2233,8 +2233,10 @@ int fnic_device_reset(struct scsi_device *sdev)
> io_lock = fnic_io_lock_hash(fnic, sc);
> spin_lock_irqsave(io_lock, flags);
> io_req = fnic_priv(sc)->io_req;
> - if (io_req)
> + if (io_req) {
> + spin_unlock_irqrestore(io_lock, flags);
> goto fnic_device_reset_end;
> + }
>
> io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC);
> if (!io_req) {
>
> Maybe fold it in with your patchset (if it's not already merged).
>
Thanks Hannes. I've modified the code based on your patch. Here's the repro log:
Nov 20 13:59:01 rhel-c4s5 kernel: fnic<7>: UT: fnic_fcpio_icmnd_cmpl_handler: 847: tag: 0xf sc: 00000000d0bc6014 CDB Opcode: 0x28 Dropping icmnd completion
Nov 20 13:59:31 rhel-c4s5 kernel: scsi host7: Abort Cmd called FCID 0x52061b, LUN 0x2 TAG f flags 3
Nov 20 13:59:31 rhel-c4s5 kernel: scsi host7: CBD Opcode: 28 Abort issued time: 30029 msec
Nov 20 13:59:31 rhel-c4s5 kernel: scsi host7: fnic<7>: UT: fnic_fcpio_itmf_cmpl_handler: 1113: tag: 0xf sc: 00 status: FCPIO_IO_NOT_FOUND Dropping abort completion
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: Returning from abort cmd type 2 FAILED
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: Device reset called FCID 0x52061b, LUN 0x2 sc: 00000000d0bc6014
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: Device reset allocation failed (error -11)
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_reset called
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x52061b
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205f2
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205cb
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205ca
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: update_mac 00:25:b5:cc:aa:00
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: Issued fw reset
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: set port_id 0 fp 0000000000000000
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: Returning from fnic reset SUCCESS
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0xfffffc
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_cleanup_io: tag:0xf : sc:0x00000000d0bc6014 duration = 52089 DID_TRANSPORT_DISRUPTED
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: Calling done for IO not issued to fw: tag:0xf sc:0x00000000d0bc6014
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: reset cmpl success
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x52061b
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205f2
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205cb
Nov 20 13:59:53 rhel-c4s5 kernel: scsi host7: fnic_rport_exch_reset called portid 0x5205ca
Nov 20 13:59:55 rhel-c4s5 kernel: host7: Assigned Port ID 0d0880
Nov 20 13:59:55 rhel-c4s5 kernel: scsi host7: set port_id d0880 fp 00000000a1c453b9
Nov 20 13:59:55 rhel-c4s5 kernel: scsi host7: update_mac 0e:fc:00:0d:08:80
Nov 20 13:59:55 rhel-c4s5 kernel: scsi host7: FLOGI reg issued fcid d0880 map 0 dest 8c:60:4f:95:ea:a4
Nov 20 13:59:55 rhel-c4s5 kernel: scsi host7: flog reg succeeded
Nov 20 14:00:06 rhel-c4s5 kernel: sd 7:0:3:2: Power-on or device reset occurred
Nov 20 14:00:06 rhel-c4s5 kernel: sd 7:0:3:2: alua: transition timeout set to 120 seconds
Nov 20 14:00:06 rhel-c4s5 kernel: sd 7:0:3:2: alua: port group 3e9 state A non-preferred supports TolUsNA
This is what the code looks like for the above test:
req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM);
if (IS_ERR(req)) {
/*
* Request allocation might fail, indicating that
* all tags are busy.
* But device reset will be called only from within
* SCSI EH, at which time all I/O is stopped. So the
* only active tags would be for failed I/O, but
* when all I/O is failed it'll be better to escalate
* to host reset anyway.
*/
FNIC_SCSI_DBG(KERN_ERR, fnic->lport->host,
"Device reset allocation failed (error %ld%s%s)\n",
PTR_ERR(req),
sdev->request_queue->mq_freeze_depth ? ",frozen" : "",
sdev->quiesced_by ? ",quiesced" : "");
return ret;
}
sc = blk_mq_rq_to_pdu(req);
tag = req->tag;
io_lock = fnic_io_lock_hash(fnic, sc);
spin_lock_irqsave(io_lock, flags);
io_req = fnic_priv(sc)->io_req;
if (io_req) {
spin_unlock_irqrestore(io_lock, flags);
goto fnic_device_reset_end;
}
Regards,
Karan
^ permalink raw reply [flat|nested] 28+ messages in thread* RE: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-20 22:27 ` Karan Tilak Kumar (kartilak)
@ 2023-11-21 2:24 ` Karan Tilak Kumar (kartilak)
2023-11-21 21:06 ` Karan Tilak Kumar (kartilak)
0 siblings, 1 reply; 28+ messages in thread
From: Karan Tilak Kumar (kartilak) @ 2023-11-21 2:24 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On Monday, November 20, 2023 2:27 PM, Karan Tilak Kumar (kartilak) wrote:
> > Maybe fold it in with your patchset (if it's not already merged).
Sure Hannes. The patchset is not merged yet.
I'll make the required change and send out V4.
Regards,
Karan
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 13/16] fnic: allocate device reset command on the fly
2023-11-21 2:24 ` Karan Tilak Kumar (kartilak)
@ 2023-11-21 21:06 ` Karan Tilak Kumar (kartilak)
0 siblings, 0 replies; 28+ messages in thread
From: Karan Tilak Kumar (kartilak) @ 2023-11-21 21:06 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: Martin K. Petersen, James Bottomley, linux-scsi@vger.kernel.org,
Dhanraj Jhawar (djhawar), Gian Carlo Boffa (gcboffa),
Masa Kai (mkai2), Satish Kharat (satishkh),
Arulprabhu Ponnusamy (arulponn), Sesidhar Baddela (sebaddel)
On Monday, November 20, 2023 6:24 PM, Karan Tilak Kumar (kartilak) wrote:
>
> On Monday, November 20, 2023 2:27 PM, Karan Tilak Kumar (kartilak) wrote:
> > > Maybe fold it in with your patchset (if it's not already merged).
>
> Sure Hannes. The patchset is not merged yet.
> I'll make the required change and send out V4.
>
> Regards,
> Karan
>
Thanks for pointing this out, Hannes.
The MQ code has corrected some missing unlocks. Therefore, this problem does not appear to be in the V3 patch set.
Regards,
Karan
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 14/16] fnic: use fc_block_rport() correctly
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (12 preceding siblings ...)
2023-10-23 9:15 ` [PATCH 13/16] fnic: allocate device reset command on the fly Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
2023-10-23 9:15 ` [PATCH 15/16] csiostor: use separate TMF command Hannes Reinecke
2023-10-23 9:15 ` [PATCH 16/16] dc395x: Remove 'scmd' parameter from doing_srb_done() Hannes Reinecke
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke,
Johannes Thumshirn
Use fc_block_rport() instead of fc_block_scsi_eh() and evaluate
the return value to avoid issuing TMF commands when the port
is still blocked.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
drivers/scsi/fnic/fnic_scsi.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 7a8b6285a096..90f30944eab3 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -1719,9 +1719,6 @@ int fnic_abort_cmd(struct scsi_cmnd *sc)
unsigned long abt_issued_time;
DECLARE_COMPLETION_ONSTACK(tm_done);
- /* Wait for rport to unblock */
- fc_block_scsi_eh(sc);
-
/* Get local-port, check ready and link up */
lp = shost_priv(sc->device->host);
@@ -1731,6 +1728,10 @@ int fnic_abort_cmd(struct scsi_cmnd *sc)
term_stats = &fnic->fnic_stats.term_stats;
rport = starget_to_rport(scsi_target(sc->device));
+ ret = fc_block_rport(rport);
+ if (ret)
+ return ret;
+
FNIC_SCSI_DBG(KERN_DEBUG,
fnic->lport->host,
"Abort Cmd called FCID 0x%x, LUN 0x%llx TAG %x flags %x\n",
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 15/16] csiostor: use separate TMF command
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (13 preceding siblings ...)
2023-10-23 9:15 ` [PATCH 14/16] fnic: use fc_block_rport() correctly Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
2023-10-23 9:15 ` [PATCH 16/16] dc395x: Remove 'scmd' parameter from doing_srb_done() Hannes Reinecke
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
Set one command aside as a TMF command, and use this command to
send the TMF. This avoids having to rely on the passed-in scsi
command when resetting the device.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/csiostor/csio_scsi.c | 72 ++++++++++++++++++++-----------
1 file changed, 48 insertions(+), 24 deletions(-)
diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c
index 05e1a63e00c3..08597de49b7f 100644
--- a/drivers/scsi/csiostor/csio_scsi.c
+++ b/drivers/scsi/csiostor/csio_scsi.c
@@ -1724,7 +1724,10 @@ csio_scsi_err_handler(struct csio_hw *hw, struct csio_ioreq *req)
}
cmnd->result = (((host_status) << 16) | scsi_status);
- scsi_done(cmnd);
+ if (csio_priv(cmnd)->fc_tm_flags == FCP_TMF_LUN_RESET)
+ blk_mq_set_request_complete(scsi_cmd_to_rq(cmnd));
+ else
+ scsi_done(cmnd);
/* Wake up waiting threads */
csio_scsi_cmnd(req) = NULL;
@@ -1752,7 +1755,10 @@ csio_scsi_cbfn(struct csio_hw *hw, struct csio_ioreq *req)
}
cmnd->result = (((host_status) << 16) | scsi_status);
- scsi_done(cmnd);
+ if (csio_priv(cmnd)->fc_tm_flags == FCP_TMF_LUN_RESET)
+ blk_mq_set_request_complete(scsi_cmd_to_rq(cmnd));
+ else
+ scsi_done(cmnd);
csio_scsi_cmnd(req) = NULL;
CSIO_INC_STATS(csio_hw_to_scsim(hw), n_tot_success);
} else {
@@ -2056,17 +2062,21 @@ csio_tm_cbfn(struct csio_hw *hw, struct csio_ioreq *req)
/* Wake up the TM handler thread */
csio_scsi_cmnd(req) = NULL;
+ cmnd->host_scribble = NULL;
}
static int
csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
{
- struct csio_lnode *ln = shost_priv(cmnd->device->host);
+ struct scsi_device *sdev = cmnd->device;
+ struct csio_lnode *ln = shost_priv(sdev->host);
struct csio_hw *hw = csio_lnode_to_hw(ln);
struct csio_scsim *scsim = csio_hw_to_scsim(hw);
- struct csio_rnode *rn = (struct csio_rnode *)(cmnd->device->hostdata);
+ struct csio_rnode *rn = (struct csio_rnode *)(sdev->hostdata);
struct csio_ioreq *ioreq = NULL;
struct csio_scsi_qset *sqset;
+ struct scsi_cmnd *tmf_cmnd;
+ struct request *req;
unsigned long flags;
int retval;
int count, ret;
@@ -2074,17 +2084,17 @@ csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
struct csio_scsi_level_data sld;
if (!rn)
- goto fail;
+ return FAILED;
csio_dbg(hw, "Request to reset LUN:%llu (ssni:0x%x tgtid:%d)\n",
- cmnd->device->lun, rn->flowid, rn->scsi_id);
+ sdev->lun, rn->flowid, rn->scsi_id);
if (!csio_is_lnode_ready(ln)) {
csio_err(hw,
"LUN reset cannot be issued on non-ready"
" local node vnpi:0x%x (LUN:%llu)\n",
- ln->vnp_flowid, cmnd->device->lun);
- goto fail;
+ ln->vnp_flowid, sdev->lun);
+ return FAILED;
}
/* Lnode is ready, now wait on rport node readiness */
@@ -2103,10 +2113,19 @@ csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
csio_err(hw,
"LUN reset cannot be issued on non-ready"
" remote node ssni:0x%x (LUN:%llu)\n",
- rn->flowid, cmnd->device->lun);
- goto fail;
+ rn->flowid, sdev->lun);
+ return FAILED;
}
+ req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN,
+ BLK_MQ_REQ_NOWAIT);
+ if (!req) {
+ csio_err(hw,
+ "LUN reset TMF already busy (LUN:%llu)\n",
+ sdev->lun);
+ return FAILED;
+ }
+ tmf_cmnd = blk_mq_rq_to_pdu(req);
/* Get a free ioreq structure - SM is already set to uninit */
ioreq = csio_get_scsi_ioreq_lock(hw, scsim);
@@ -2123,11 +2142,11 @@ csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
ioreq->iq_idx = sqset->iq_idx;
ioreq->eq_idx = sqset->eq_idx;
- csio_scsi_cmnd(ioreq) = cmnd;
- cmnd->host_scribble = (unsigned char *)ioreq;
- csio_priv(cmnd)->wr_status = 0;
+ csio_scsi_cmnd(ioreq) = tmf_cmnd;
+ tmf_cmnd->host_scribble = (unsigned char *)ioreq;
+ csio_priv(tmf_cmnd)->wr_status = 0;
- csio_priv(cmnd)->fc_tm_flags = FCP_TMF_LUN_RESET;
+ csio_priv(tmf_cmnd)->fc_tm_flags = FCP_TMF_LUN_RESET;
ioreq->tmo = CSIO_SCSI_LUNRST_TMO_MS / 1000;
/*
@@ -2144,30 +2163,33 @@ csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
sld.level = CSIO_LEV_LUN;
sld.lnode = ioreq->lnode;
sld.rnode = ioreq->rnode;
- sld.oslun = cmnd->device->lun;
+ sld.oslun = sdev->lun;
+ WRITE_ONCE(req->state, MQ_RQ_IN_FLIGHT);
spin_lock_irqsave(&hw->lock, flags);
/* Kick off TM SM on the ioreq */
retval = csio_scsi_start_tm(ioreq);
spin_unlock_irqrestore(&hw->lock, flags);
if (retval != 0) {
+ blk_mq_set_request_complete(req);
csio_err(hw, "Failed to issue LUN reset, req:%p, status:%d\n",
ioreq, retval);
+ tmf_cmnd->host_scribble = NULL;
goto fail_ret_ioreq;
}
csio_dbg(hw, "Waiting max %d secs for LUN reset completion\n",
count * (CSIO_SCSI_TM_POLL_MS / 1000));
/* Wait for completion */
- while ((((struct scsi_cmnd *)csio_scsi_cmnd(ioreq)) == cmnd)
+ while ((((struct scsi_cmnd *)csio_scsi_cmnd(ioreq)) == tmf_cmnd)
&& count--)
msleep(CSIO_SCSI_TM_POLL_MS);
/* LUN reset timed-out */
- if (((struct scsi_cmnd *)csio_scsi_cmnd(ioreq)) == cmnd) {
+ if (((struct scsi_cmnd *)csio_scsi_cmnd(ioreq)) == tmf_cmnd) {
csio_err(hw, "LUN reset (%d:%llu) timed out\n",
- cmnd->device->id, cmnd->device->lun);
+ sdev->id, sdev->lun);
spin_lock_irq(&hw->lock);
csio_scsi_drvcleanup(ioreq);
@@ -2178,10 +2200,10 @@ csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
}
/* LUN reset returned, check cached status */
- if (csio_priv(cmnd)->wr_status != FW_SUCCESS) {
+ if (csio_priv(tmf_cmnd)->wr_status != FW_SUCCESS) {
csio_err(hw, "LUN reset failed (%d:%llu), status: %d\n",
- cmnd->device->id, cmnd->device->lun,
- csio_priv(cmnd)->wr_status);
+ sdev->id, sdev->lun,
+ csio_priv(tmf_cmnd)->wr_status);
goto fail;
}
@@ -2201,7 +2223,7 @@ csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
if (retval != 0) {
csio_err(hw,
"Attempt to abort I/Os during LUN reset of %llu"
- " returned %d\n", cmnd->device->lun, retval);
+ " returned %d\n", sdev->lun, retval);
/* Return I/Os back to active_q */
spin_lock_irq(&hw->lock);
list_splice_tail_init(&local_q, &scsim->active_q);
@@ -2212,14 +2234,16 @@ csio_eh_lun_reset_handler(struct scsi_cmnd *cmnd)
CSIO_INC_STATS(rn, n_lun_rst);
csio_info(hw, "LUN reset occurred (%d:%llu)\n",
- cmnd->device->id, cmnd->device->lun);
-
+ sdev->id, sdev->lun);
+ blk_mq_free_request(req);
return SUCCESS;
fail_ret_ioreq:
csio_put_scsi_ioreq_lock(hw, scsim, ioreq);
fail:
CSIO_INC_STATS(rn, n_lun_rst_fail);
+ if (req)
+ blk_mq_free_request(req);
return FAILED;
}
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* [PATCH 16/16] dc395x: Remove 'scmd' parameter from doing_srb_done()
2023-10-23 9:14 [PATCHv8 00/16] scsi: EH rework prep patches, part 2 Hannes Reinecke
` (14 preceding siblings ...)
2023-10-23 9:15 ` [PATCH 15/16] csiostor: use separate TMF command Hannes Reinecke
@ 2023-10-23 9:15 ` Hannes Reinecke
15 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2023-10-23 9:15 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Christoph Hellwig, James Bottomley, linux-scsi, Hannes Reinecke
doing_srb_done() has two passes, one for the 'going' list
and one for the 'waiting' list. When aborting commands on these
lists we should be calling scsi_done() for each command, and
not only for the command which triggered the error.
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/dc395x.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index c8e86f8a631e..822d21e7da14 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -367,8 +367,7 @@ static inline void enable_msgout_abort(struct AdapterCtlBlk *acb,
struct ScsiReqBlk *srb);
static void build_srb(struct scsi_cmnd *cmd, struct DeviceCtlBlk *dcb,
struct ScsiReqBlk *srb);
-static void doing_srb_done(struct AdapterCtlBlk *acb, u8 did_code,
- struct scsi_cmnd *cmd, u8 force);
+static void doing_srb_done(struct AdapterCtlBlk *acb, u8 did_code, u8 force);
static void scsi_reset_detect(struct AdapterCtlBlk *acb);
static void pci_unmap_srb(struct AdapterCtlBlk *acb, struct ScsiReqBlk *srb);
static void pci_unmap_srb_sense(struct AdapterCtlBlk *acb,
@@ -1182,7 +1181,7 @@ static int __dc395x_eh_bus_reset(struct scsi_cmnd *cmd)
set_basic_config(acb);
reset_dev_param(acb);
- doing_srb_done(acb, DID_RESET, cmd, 0);
+ doing_srb_done(acb, DID_RESET, 0);
acb->active_dcb = NULL;
acb->acb_flag = 0; /* RESET_DETECT, RESET_DONE ,RESET_DEV */
waiting_process_next(acb);
@@ -2896,7 +2895,7 @@ static void disconnect(struct AdapterCtlBlk *acb)
dcb->flag &= ~ABORT_DEV_;
acb->last_reset = jiffies + HZ / 2 + 1;
dprintkl(KERN_ERR, "disconnect: SRB_ABORT_SENT\n");
- doing_srb_done(acb, DID_ABORT, srb->cmd, 1);
+ doing_srb_done(acb, DID_ABORT, 1);
waiting_process_next(acb);
} else {
if ((srb->state & (SRB_START_ + SRB_MSGOUT))
@@ -3337,8 +3336,7 @@ static void srb_done(struct AdapterCtlBlk *acb, struct DeviceCtlBlk *dcb,
/* abort all cmds in our queues */
-static void doing_srb_done(struct AdapterCtlBlk *acb, u8 did_flag,
- struct scsi_cmnd *cmd, u8 force)
+static void doing_srb_done(struct AdapterCtlBlk *acb, u8 did_flag, u8 force)
{
struct DeviceCtlBlk *dcb;
dprintkl(KERN_INFO, "doing_srb_done: pids ");
@@ -3389,7 +3387,7 @@ static void doing_srb_done(struct AdapterCtlBlk *acb, u8 did_flag,
if (force) {
/* For new EH, we normally don't need to give commands back,
* as they all complete or all time out */
- scsi_done(cmd);
+ scsi_done(p);
}
}
if (!list_empty(&dcb->srb_waiting_list))
@@ -3475,7 +3473,7 @@ static void scsi_reset_detect(struct AdapterCtlBlk *acb)
} else {
acb->acb_flag |= RESET_DETECT;
reset_dev_param(acb);
- doing_srb_done(acb, DID_RESET, NULL, 1);
+ doing_srb_done(acb, DID_RESET, 1);
/*DC395x_RecoverSRB( acb ); */
acb->active_dcb = NULL;
acb->acb_flag = 0;
--
2.35.3
^ permalink raw reply related [flat|nested] 28+ messages in thread