From: "Ewan D. Milne" <emilne@redhat.com>
To: Himanshu Madhani <hmadhani@marvell.com>,
James.Bottomley@HansenPartnership.com,
martin.petersen@oracle.com
Cc: linux-scsi@vger.kernel.org
Subject: Re: [PATCH 5/8] qla2xxx: Fix double scsi_done for abort path
Date: Tue, 05 Nov 2019 10:20:03 -0500 [thread overview]
Message-ID: <c3731b3424c15946a61cedfdef905aecb5ce44c1.camel@redhat.com> (raw)
In-Reply-To: <20191105150657.8092-6-hmadhani@marvell.com>
On Tue, 2019-11-05 at 07:06 -0800, Himanshu Madhani wrote:
> From: Quinn Tran <qutran@marvell.com>
>
> Current code assume abort will remove the original command from the
> active list where scsi_done will not be call. Instead, the eh_abort
> thread will do the scsi_done. That is not the case. Instead, we
> have a double scsi_done calls triggering use after free.
>
> Abort will tell FW to release the command from FW possesion. The
> original command will return to ULP with error in its normal fashion via
> scsi_done. eh_abort path would wait for the original command
> completion before returning. eh_abort path will not perform the
> scsi_done call.
>
> Fixes: 219d27d7147e0 ("scsi: qla2xxx: Fix race conditions in the code for aborting SCSI commands")
> Cc: stable@vger.kernel.org # 5.2
> Signed-off-by: Quinn Tran <qutran@marvell.com>
> Signed-off-by: Arun Easi <aeasi@marvell.com>
> Signed-off-by: Himanshu Madhani <hmadhani@marvell.com>
> ---
> drivers/scsi/qla2xxx/qla_def.h | 5 +-
> drivers/scsi/qla2xxx/qla_isr.c | 5 ++
> drivers/scsi/qla2xxx/qla_nvme.c | 4 +-
> drivers/scsi/qla2xxx/qla_os.c | 117 +++++++++++++++++++++-------------------
> 4 files changed, 72 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
> index ef9bb3c7ad6f..2a9e6a9a8c9d 100644
> --- a/drivers/scsi/qla2xxx/qla_def.h
> +++ b/drivers/scsi/qla2xxx/qla_def.h
> @@ -591,13 +591,16 @@ typedef struct srb {
> */
> uint8_t cmd_type;
> uint8_t pad[3];
> - atomic_t ref_count;
> struct kref cmd_kref; /* need to migrate ref_count over to this */
> void *priv;
> wait_queue_head_t nvme_ls_waitq;
> struct fc_port *fcport;
> struct scsi_qla_host *vha;
> unsigned int start_timer:1;
> + unsigned int abort:1;
> + unsigned int aborted:1;
> + unsigned int completed:1;
> +
> uint32_t handle;
> uint16_t flags;
> uint16_t type;
> diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
> index acef3d73983c..1b8f297449cf 100644
> --- a/drivers/scsi/qla2xxx/qla_isr.c
> +++ b/drivers/scsi/qla2xxx/qla_isr.c
> @@ -2487,6 +2487,11 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt)
> return;
> }
>
> + if (sp->abort)
> + sp->aborted = 1;
> + else
> + sp->completed = 1;
> +
> if (sp->cmd_type != TYPE_SRB) {
> req->outstanding_cmds[handle] = NULL;
> ql_dbg(ql_dbg_io, vha, 0x3015,
> diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
> index 6cc19e060afc..941aa53363f5 100644
> --- a/drivers/scsi/qla2xxx/qla_nvme.c
> +++ b/drivers/scsi/qla2xxx/qla_nvme.c
> @@ -224,8 +224,8 @@ static void qla_nvme_abort_work(struct work_struct *work)
>
> if (ha->flags.host_shutting_down) {
> ql_log(ql_log_info, sp->fcport->vha, 0xffff,
> - "%s Calling done on sp: %p, type: 0x%x, sp->ref_count: 0x%x\n",
> - __func__, sp, sp->type, atomic_read(&sp->ref_count));
> + "%s Calling done on sp: %p, type: 0x%x\n",
> + __func__, sp, sp->type);
> sp->done(sp, 0);
> goto out;
> }
> diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
> index 2e7a4a2d6c5a..b59d579834ea 100644
> --- a/drivers/scsi/qla2xxx/qla_os.c
> +++ b/drivers/scsi/qla2xxx/qla_os.c
> @@ -698,11 +698,6 @@ void qla2x00_sp_compl(srb_t *sp, int res)
> struct scsi_cmnd *cmd = GET_CMD_SP(sp);
> struct completion *comp = sp->comp;
>
> - if (WARN_ON_ONCE(atomic_read(&sp->ref_count) == 0))
> - return;
> -
> - atomic_dec(&sp->ref_count);
> -
> sp->free(sp);
> cmd->result = res;
> CMD_SP(cmd) = NULL;
> @@ -794,11 +789,6 @@ void qla2xxx_qpair_sp_compl(srb_t *sp, int res)
> struct scsi_cmnd *cmd = GET_CMD_SP(sp);
> struct completion *comp = sp->comp;
>
> - if (WARN_ON_ONCE(atomic_read(&sp->ref_count) == 0))
> - return;
> -
> - atomic_dec(&sp->ref_count);
> -
> sp->free(sp);
> cmd->result = res;
> CMD_SP(cmd) = NULL;
> @@ -903,7 +893,7 @@ qla2xxx_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>
> sp->u.scmd.cmd = cmd;
> sp->type = SRB_SCSI_CMD;
> - atomic_set(&sp->ref_count, 1);
> +
> CMD_SP(cmd) = (void *)sp;
> sp->free = qla2x00_sp_free_dma;
> sp->done = qla2x00_sp_compl;
> @@ -985,11 +975,9 @@ qla2xxx_mqueuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd,
>
> sp->u.scmd.cmd = cmd;
> sp->type = SRB_SCSI_CMD;
> - atomic_set(&sp->ref_count, 1);
> CMD_SP(cmd) = (void *)sp;
> sp->free = qla2xxx_qpair_sp_free_dma;
> sp->done = qla2xxx_qpair_sp_compl;
> - sp->qpair = qpair;
>
> rval = ha->isp_ops->start_scsi_mq(sp);
> if (rval != QLA_SUCCESS) {
> @@ -1187,16 +1175,6 @@ qla2x00_wait_for_chip_reset(scsi_qla_host_t *vha)
> return return_status;
> }
>
> -static int
> -sp_get(struct srb *sp)
> -{
> - if (!refcount_inc_not_zero((refcount_t *)&sp->ref_count))
> - /* kref get fail */
> - return ENXIO;
> - else
> - return 0;
> -}
> -
> #define ISP_REG_DISCONNECT 0xffffffffU
> /**************************************************************************
> * qla2x00_isp_reg_stat
> @@ -1252,6 +1230,9 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
> uint64_t lun;
> int rval;
> struct qla_hw_data *ha = vha->hw;
> + uint32_t ratov_j;
> + struct qla_qpair *qpair;
> + unsigned long flags;
>
> if (qla2x00_isp_reg_stat(ha)) {
> ql_log(ql_log_info, vha, 0x8042,
> @@ -1264,13 +1245,26 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
> return ret;
>
> sp = scsi_cmd_priv(cmd);
> + qpair = sp->qpair;
>
> - if (sp->fcport && sp->fcport->deleted)
> + if ((sp->fcport && sp->fcport->deleted) || !qpair)
> return SUCCESS;
>
> - /* Return if the command has already finished. */
> - if (sp_get(sp))
> + spin_lock_irqsave(qpair->qp_lock_ptr, flags);
> + if (sp->completed) {
> + spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
> return SUCCESS;
> + }
> +
> + if (sp->abort || sp->aborted) {
> + spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
> + return FAILED;
> + }
> +
> + sp->abort = 1;
> + sp->comp = ∁
> + spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
> +
>
> id = cmd->device->id;
> lun = cmd->device->lun;
> @@ -1279,47 +1273,37 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
> "Aborting from RISC nexus=%ld:%d:%llu sp=%p cmd=%p handle=%x\n",
> vha->host_no, id, lun, sp, cmd, sp->handle);
>
> + /*
> + * Abort will release the original Command/sp from FW. Let the
> + * original command call scsi_done. In return, he will wakeup
> + * this sleeping thread.
> + */
> rval = ha->isp_ops->abort_command(sp);
> +
> ql_dbg(ql_dbg_taskm, vha, 0x8003,
> "Abort command mbx cmd=%p, rval=%x.\n", cmd, rval);
>
> + /* Wait for the command completion. */
> + ratov_j = ha->r_a_tov/10 * 4 * 1000;
> + ratov_j = msecs_to_jiffies(ratov_j);
> switch (rval) {
> case QLA_SUCCESS:
> - /*
> - * The command has been aborted. That means that the firmware
> - * won't report a completion.
> - */
> - sp->done(sp, DID_ABORT << 16);
> - ret = SUCCESS;
> - break;
> - case QLA_FUNCTION_PARAMETER_ERROR: {
> - /* Wait for the command completion. */
> - uint32_t ratov = ha->r_a_tov/10;
> - uint32_t ratov_j = msecs_to_jiffies(4 * ratov * 1000);
> -
> - WARN_ON_ONCE(sp->comp);
> - sp->comp = ∁
> if (!wait_for_completion_timeout(&comp, ratov_j)) {
> ql_dbg(ql_dbg_taskm, vha, 0xffff,
> "%s: Abort wait timer (4 * R_A_TOV[%d]) expired\n",
> - __func__, ha->r_a_tov);
> + __func__, ha->r_a_tov/10);
> ret = FAILED;
> } else {
> ret = SUCCESS;
> }
> break;
> - }
> default:
> - /*
> - * Either abort failed or abort and completion raced. Let
> - * the SCSI core retry the abort in the former case.
> - */
> ret = FAILED;
> break;
> }
>
> sp->comp = NULL;
> - atomic_dec(&sp->ref_count);
> +
> ql_log(ql_log_info, vha, 0x801c,
> "Abort command issued nexus=%ld:%d:%llu -- %x.\n",
> vha->host_no, id, lun, ret);
> @@ -1711,32 +1695,53 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res,
> scsi_qla_host_t *vha = qp->vha;
> struct qla_hw_data *ha = vha->hw;
> int rval;
> + bool ret_cmd;
> + uint32_t ratov_j;
>
> - if (sp_get(sp))
> + if (qla2x00_chip_is_down(vha)) {
> + sp->done(sp, res);
> return;
> + }
>
> if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS ||
> (sp->type == SRB_SCSI_CMD && !ha->flags.eeh_busy &&
> !test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) &&
> !qla2x00_isp_reg_stat(ha))) {
> + if (sp->comp) {
> + sp->done(sp, res);
> + return;
> + }
> +
> sp->comp = ∁
> + sp->abort = 1;
> spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
> - rval = ha->isp_ops->abort_command(sp);
>
> + rval = ha->isp_ops->abort_command(sp);
> + /* Wait for command completion. */
> + ret_cmd = false;
> + ratov_j = ha->r_a_tov/10 * 4 * 1000;
> + ratov_j = msecs_to_jiffies(ratov_j);
> switch (rval) {
> case QLA_SUCCESS:
> - sp->done(sp, res);
> + if (wait_for_completion_timeout(&comp, ratov_j)) {
> + ql_dbg(ql_dbg_taskm, vha, 0xffff,
> + "%s: Abort wait timer (4 * R_A_TOV[%d]) expired\n",
> + __func__, ha->r_a_tov/10);
> + ret_cmd = true;
> + }
> + /* else FW return SP to driver */
> break;
> - case QLA_FUNCTION_PARAMETER_ERROR:
> - wait_for_completion(&comp);
> + default:
> + ret_cmd = true;
> break;
> }
>
> spin_lock_irqsave(qp->qp_lock_ptr, *flags);
> - sp->comp = NULL;
> + if (ret_cmd && (!sp->completed || !sp->aborted))
> + sp->done(sp, res);
> + } else {
> + sp->done(sp, res);
> }
> -
> - atomic_dec(&sp->ref_count);
> }
>
> static void
> @@ -1758,7 +1763,6 @@ __qla2x00_abort_all_cmds(struct qla_qpair *qp, int res)
> for (cnt = 1; cnt < req->num_outstanding_cmds; cnt++) {
> sp = req->outstanding_cmds[cnt];
> if (sp) {
> - req->outstanding_cmds[cnt] = NULL;
> switch (sp->cmd_type) {
> case TYPE_SRB:
> qla2x00_abort_srb(qp, sp, res, &flags);
> @@ -1780,6 +1784,7 @@ __qla2x00_abort_all_cmds(struct qla_qpair *qp, int res)
> default:
> break;
> }
> + req->outstanding_cmds[cnt] = NULL;
> }
> }
> spin_unlock_irqrestore(qp->qp_lock_ptr, flags);
Reviewed-by: Ewan D. Milne <emilne@redhat.com>
next prev parent reply other threads:[~2019-11-05 15:20 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-11-05 15:06 [PATCH 0/8] qla2xxx: Bug Fixes for the driver Himanshu Madhani
2019-11-05 15:06 ` [PATCH 1/8] qla2xxx: Retry PLOGI on FC-NVMe PRLI failure Himanshu Madhani
2019-11-05 15:18 ` Ewan D. Milne
2019-11-05 15:06 ` [PATCH 2/8] qla2xxx: Do command completion on abort timeout Himanshu Madhani
2019-11-05 15:18 ` Ewan D. Milne
2019-11-05 16:57 ` Bart Van Assche
2019-11-07 16:46 ` [EXT] " Himanshu Madhani
2019-11-05 15:06 ` [PATCH 3/8] qla2xxx: Fix SRB leak on switch command timeout Himanshu Madhani
2019-11-05 15:19 ` Ewan D. Milne
2019-11-05 15:06 ` [PATCH 4/8] qla2xxx: Fix driver unload hang Himanshu Madhani
2019-11-05 15:19 ` Ewan D. Milne
2019-11-07 16:54 ` Bart Van Assche
[not found] ` <83CC0DDF-4907-41A2-91EC-1569A07A6BA9@marvell.com>
2019-11-07 17:58 ` Bart Van Assche
2019-11-07 18:30 ` Bart Van Assche
2019-11-08 23:38 ` [EXT] " Himanshu Madhani
2019-11-08 23:58 ` Bart Van Assche
2019-11-05 15:06 ` [PATCH 5/8] qla2xxx: Fix double scsi_done for abort path Himanshu Madhani
2019-11-05 15:20 ` Ewan D. Milne [this message]
2019-11-07 18:01 ` Bart Van Assche
2019-11-05 15:06 ` [PATCH 6/8] qla2xxx: Fix memory leak when sending I/O fails Himanshu Madhani
2019-11-05 15:20 ` Ewan D. Milne
2019-11-18 20:25 ` Himanshu Madhani
2019-11-19 5:02 ` Martin K. Petersen
2019-11-05 15:06 ` [PATCH 7/8] qla2xxx: Fix device connect issues in P2P configuration Himanshu Madhani
2019-11-05 15:21 ` Ewan D. Milne
2019-11-05 15:06 ` [PATCH 8/8] qla2xxx: Update driver version to 10.01.00.21-k Himanshu Madhani
2019-11-05 15:21 ` Ewan D. Milne
2019-11-09 2:16 ` [PATCH 0/8] qla2xxx: Bug Fixes for the driver Martin K. Petersen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c3731b3424c15946a61cedfdef905aecb5ce44c1.camel@redhat.com \
--to=emilne@redhat.com \
--cc=James.Bottomley@HansenPartnership.com \
--cc=hmadhani@marvell.com \
--cc=linux-scsi@vger.kernel.org \
--cc=martin.petersen@oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox