Re: [PATCH 5/8] qla2xxx: Fix double scsi_done for abort path

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

From: "Ewan D. Milne" <emilne@redhat.com>
To: Himanshu Madhani <hmadhani@marvell.com>,
	James.Bottomley@HansenPartnership.com,
	martin.petersen@oracle.com
Cc: linux-scsi@vger.kernel.org
Subject: Re: [PATCH 5/8] qla2xxx: Fix double scsi_done for abort path
Date: Tue, 05 Nov 2019 10:20:03 -0500	[thread overview]
Message-ID: <c3731b3424c15946a61cedfdef905aecb5ce44c1.camel@redhat.com> (raw)
In-Reply-To: <20191105150657.8092-6-hmadhani@marvell.com>

On Tue, 2019-11-05 at 07:06 -0800, Himanshu Madhani wrote:
> From: Quinn Tran <qutran@marvell.com>
> 
> Current code assume abort will remove the original command from the
> active list where scsi_done will not be call. Instead, the eh_abort
> thread will do the scsi_done. That is not the case.  Instead, we
> have a double scsi_done calls triggering use after free.
> 
> Abort will tell FW to release the command from FW possesion. The
> original command will return to ULP with error in its normal fashion via
> scsi_done.  eh_abort path would wait for the original command
> completion before returning.  eh_abort path will not perform the
> scsi_done call.
> 
> Fixes: 219d27d7147e0 ("scsi: qla2xxx: Fix race conditions in the code for aborting SCSI commands")
> Cc: stable@vger.kernel.org # 5.2
> Signed-off-by: Quinn Tran <qutran@marvell.com>
> Signed-off-by: Arun Easi <aeasi@marvell.com>
> Signed-off-by: Himanshu Madhani <hmadhani@marvell.com>
> ---
>  drivers/scsi/qla2xxx/qla_def.h  |   5 +-
>  drivers/scsi/qla2xxx/qla_isr.c  |   5 ++
>  drivers/scsi/qla2xxx/qla_nvme.c |   4 +-
>  drivers/scsi/qla2xxx/qla_os.c   | 117 +++++++++++++++++++++-------------------
>  4 files changed, 72 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
> index ef9bb3c7ad6f..2a9e6a9a8c9d 100644
> --- a/drivers/scsi/qla2xxx/qla_def.h
> +++ b/drivers/scsi/qla2xxx/qla_def.h
> @@ -591,13 +591,16 @@ typedef struct srb {
>  	 */
>  	uint8_t cmd_type;
>  	uint8_t pad[3];
> -	atomic_t ref_count;
>  	struct kref cmd_kref;	/* need to migrate ref_count over to this */
>  	void *priv;
>  	wait_queue_head_t nvme_ls_waitq;
>  	struct fc_port *fcport;
>  	struct scsi_qla_host *vha;
>  	unsigned int start_timer:1;
> +	unsigned int abort:1;
> +	unsigned int aborted:1;
> +	unsigned int completed:1;
> +
>  	uint32_t handle;
>  	uint16_t flags;
>  	uint16_t type;
> diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
> index acef3d73983c..1b8f297449cf 100644
> --- a/drivers/scsi/qla2xxx/qla_isr.c
> +++ b/drivers/scsi/qla2xxx/qla_isr.c
> @@ -2487,6 +2487,11 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt)
>  		return;
>  	}
>  
> +	if (sp->abort)
> +		sp->aborted = 1;
> +	else
> +		sp->completed = 1;
> +
>  	if (sp->cmd_type != TYPE_SRB) {
>  		req->outstanding_cmds[handle] = NULL;
>  		ql_dbg(ql_dbg_io, vha, 0x3015,
> diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
> index 6cc19e060afc..941aa53363f5 100644
> --- a/drivers/scsi/qla2xxx/qla_nvme.c
> +++ b/drivers/scsi/qla2xxx/qla_nvme.c
> @@ -224,8 +224,8 @@ static void qla_nvme_abort_work(struct work_struct *work)
>  
>  	if (ha->flags.host_shutting_down) {
>  		ql_log(ql_log_info, sp->fcport->vha, 0xffff,
> -		    "%s Calling done on sp: %p, type: 0x%x, sp->ref_count: 0x%x\n",
> -		    __func__, sp, sp->type, atomic_read(&sp->ref_count));
> +		    "%s Calling done on sp: %p, type: 0x%x\n",
> +		    __func__, sp, sp->type);
>  		sp->done(sp, 0);
>  		goto out;
>  	}
> diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
> index 2e7a4a2d6c5a..b59d579834ea 100644
> --- a/drivers/scsi/qla2xxx/qla_os.c
> +++ b/drivers/scsi/qla2xxx/qla_os.c
> @@ -698,11 +698,6 @@ void qla2x00_sp_compl(srb_t *sp, int res)
>  	struct scsi_cmnd *cmd = GET_CMD_SP(sp);
>  	struct completion *comp = sp->comp;
>  
> -	if (WARN_ON_ONCE(atomic_read(&sp->ref_count) == 0))
> -		return;
> -
> -	atomic_dec(&sp->ref_count);
> -
>  	sp->free(sp);
>  	cmd->result = res;
>  	CMD_SP(cmd) = NULL;
> @@ -794,11 +789,6 @@ void qla2xxx_qpair_sp_compl(srb_t *sp, int res)
>  	struct scsi_cmnd *cmd = GET_CMD_SP(sp);
>  	struct completion *comp = sp->comp;
>  
> -	if (WARN_ON_ONCE(atomic_read(&sp->ref_count) == 0))
> -		return;
> -
> -	atomic_dec(&sp->ref_count);
> -
>  	sp->free(sp);
>  	cmd->result = res;
>  	CMD_SP(cmd) = NULL;
> @@ -903,7 +893,7 @@ qla2xxx_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>  
>  	sp->u.scmd.cmd = cmd;
>  	sp->type = SRB_SCSI_CMD;
> -	atomic_set(&sp->ref_count, 1);
> +
>  	CMD_SP(cmd) = (void *)sp;
>  	sp->free = qla2x00_sp_free_dma;
>  	sp->done = qla2x00_sp_compl;
> @@ -985,11 +975,9 @@ qla2xxx_mqueuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd,
>  
>  	sp->u.scmd.cmd = cmd;
>  	sp->type = SRB_SCSI_CMD;
> -	atomic_set(&sp->ref_count, 1);
>  	CMD_SP(cmd) = (void *)sp;
>  	sp->free = qla2xxx_qpair_sp_free_dma;
>  	sp->done = qla2xxx_qpair_sp_compl;
> -	sp->qpair = qpair;
>  
>  	rval = ha->isp_ops->start_scsi_mq(sp);
>  	if (rval != QLA_SUCCESS) {
> @@ -1187,16 +1175,6 @@ qla2x00_wait_for_chip_reset(scsi_qla_host_t *vha)
>  	return return_status;
>  }
>  
> -static int
> -sp_get(struct srb *sp)
> -{
> -	if (!refcount_inc_not_zero((refcount_t *)&sp->ref_count))
> -		/* kref get fail */
> -		return ENXIO;
> -	else
> -		return 0;
> -}
> -
>  #define ISP_REG_DISCONNECT 0xffffffffU
>  /**************************************************************************
>  * qla2x00_isp_reg_stat
> @@ -1252,6 +1230,9 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
>  	uint64_t lun;
>  	int rval;
>  	struct qla_hw_data *ha = vha->hw;
> +	uint32_t ratov_j;
> +	struct qla_qpair *qpair;
> +	unsigned long flags;
>  
>  	if (qla2x00_isp_reg_stat(ha)) {
>  		ql_log(ql_log_info, vha, 0x8042,
> @@ -1264,13 +1245,26 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
>  		return ret;
>  
>  	sp = scsi_cmd_priv(cmd);
> +	qpair = sp->qpair;
>  
> -	if (sp->fcport && sp->fcport->deleted)
> +	if ((sp->fcport && sp->fcport->deleted) || !qpair)
>  		return SUCCESS;
>  
> -	/* Return if the command has already finished. */
> -	if (sp_get(sp))
> +	spin_lock_irqsave(qpair->qp_lock_ptr, flags);
> +	if (sp->completed) {
> +		spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
>  		return SUCCESS;
> +	}
> +
> +	if (sp->abort || sp->aborted) {
> +		spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
> +		return FAILED;
> +	}
> +
> +	sp->abort = 1;
> +	sp->comp = &comp;
> +	spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
> +
>  
>  	id = cmd->device->id;
>  	lun = cmd->device->lun;
> @@ -1279,47 +1273,37 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
>  	    "Aborting from RISC nexus=%ld:%d:%llu sp=%p cmd=%p handle=%x\n",
>  	    vha->host_no, id, lun, sp, cmd, sp->handle);
>  
> +	/*
> +	 * Abort will release the original Command/sp from FW. Let the
> +	 * original command call scsi_done. In return, he will wakeup
> +	 * this sleeping thread.
> +	 */
>  	rval = ha->isp_ops->abort_command(sp);
> +
>  	ql_dbg(ql_dbg_taskm, vha, 0x8003,
>  	       "Abort command mbx cmd=%p, rval=%x.\n", cmd, rval);
>  
> +	/* Wait for the command completion. */
> +	ratov_j = ha->r_a_tov/10 * 4 * 1000;
> +	ratov_j = msecs_to_jiffies(ratov_j);
>  	switch (rval) {
>  	case QLA_SUCCESS:
> -		/*
> -		 * The command has been aborted. That means that the firmware
> -		 * won't report a completion.
> -		 */
> -		sp->done(sp, DID_ABORT << 16);
> -		ret = SUCCESS;
> -		break;
> -	case QLA_FUNCTION_PARAMETER_ERROR: {
> -		/* Wait for the command completion. */
> -		uint32_t ratov = ha->r_a_tov/10;
> -		uint32_t ratov_j = msecs_to_jiffies(4 * ratov * 1000);
> -
> -		WARN_ON_ONCE(sp->comp);
> -		sp->comp = &comp;
>  		if (!wait_for_completion_timeout(&comp, ratov_j)) {
>  			ql_dbg(ql_dbg_taskm, vha, 0xffff,
>  			    "%s: Abort wait timer (4 * R_A_TOV[%d]) expired\n",
> -			    __func__, ha->r_a_tov);
> +			    __func__, ha->r_a_tov/10);
>  			ret = FAILED;
>  		} else {
>  			ret = SUCCESS;
>  		}
>  		break;
> -	}
>  	default:
> -		/*
> -		 * Either abort failed or abort and completion raced. Let
> -		 * the SCSI core retry the abort in the former case.
> -		 */
>  		ret = FAILED;
>  		break;
>  	}
>  
>  	sp->comp = NULL;
> -	atomic_dec(&sp->ref_count);
> +
>  	ql_log(ql_log_info, vha, 0x801c,
>  	    "Abort command issued nexus=%ld:%d:%llu -- %x.\n",
>  	    vha->host_no, id, lun, ret);
> @@ -1711,32 +1695,53 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res,
>  	scsi_qla_host_t *vha = qp->vha;
>  	struct qla_hw_data *ha = vha->hw;
>  	int rval;
> +	bool ret_cmd;
> +	uint32_t ratov_j;
>  
> -	if (sp_get(sp))
> +	if (qla2x00_chip_is_down(vha)) {
> +		sp->done(sp, res);
>  		return;
> +	}
>  
>  	if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS ||
>  	    (sp->type == SRB_SCSI_CMD && !ha->flags.eeh_busy &&
>  	     !test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) &&
>  	     !qla2x00_isp_reg_stat(ha))) {
> +		if (sp->comp) {
> +			sp->done(sp, res);
> +			return;
> +		}
> +
>  		sp->comp = &comp;
> +		sp->abort =  1;
>  		spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
> -		rval = ha->isp_ops->abort_command(sp);
>  
> +		rval = ha->isp_ops->abort_command(sp);
> +		/* Wait for command completion. */
> +		ret_cmd = false;
> +		ratov_j = ha->r_a_tov/10 * 4 * 1000;
> +		ratov_j = msecs_to_jiffies(ratov_j);
>  		switch (rval) {
>  		case QLA_SUCCESS:
> -			sp->done(sp, res);
> +			if (wait_for_completion_timeout(&comp, ratov_j)) {
> +				ql_dbg(ql_dbg_taskm, vha, 0xffff,
> +				    "%s: Abort wait timer (4 * R_A_TOV[%d]) expired\n",
> +				    __func__, ha->r_a_tov/10);
> +				ret_cmd = true;
> +			}
> +			/* else FW return SP to driver */
>  			break;
> -		case QLA_FUNCTION_PARAMETER_ERROR:
> -			wait_for_completion(&comp);
> +		default:
> +			ret_cmd = true;
>  			break;
>  		}
>  
>  		spin_lock_irqsave(qp->qp_lock_ptr, *flags);
> -		sp->comp = NULL;
> +		if (ret_cmd && (!sp->completed || !sp->aborted))
> +			sp->done(sp, res);
> +	} else {
> +		sp->done(sp, res);
>  	}
> -
> -	atomic_dec(&sp->ref_count);
>  }
>  
>  static void
> @@ -1758,7 +1763,6 @@ __qla2x00_abort_all_cmds(struct qla_qpair *qp, int res)
>  	for (cnt = 1; cnt < req->num_outstanding_cmds; cnt++) {
>  		sp = req->outstanding_cmds[cnt];
>  		if (sp) {
> -			req->outstanding_cmds[cnt] = NULL;
>  			switch (sp->cmd_type) {
>  			case TYPE_SRB:
>  				qla2x00_abort_srb(qp, sp, res, &flags);
> @@ -1780,6 +1784,7 @@ __qla2x00_abort_all_cmds(struct qla_qpair *qp, int res)
>  			default:
>  				break;
>  			}
> +			req->outstanding_cmds[cnt] = NULL;
>  		}
>  	}
>  	spin_unlock_irqrestore(qp->qp_lock_ptr, flags);

Reviewed-by: Ewan D. Milne <emilne@redhat.com>

next prev parent reply	other threads:[~2019-11-05 15:20 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-11-05 15:06 [PATCH 0/8] qla2xxx: Bug Fixes for the driver Himanshu Madhani
2019-11-05 15:06 ` [PATCH 1/8] qla2xxx: Retry PLOGI on FC-NVMe PRLI failure Himanshu Madhani
2019-11-05 15:18   ` Ewan D. Milne
2019-11-05 15:06 ` [PATCH 2/8] qla2xxx: Do command completion on abort timeout Himanshu Madhani
2019-11-05 15:18   ` Ewan D. Milne
2019-11-05 16:57   ` Bart Van Assche
2019-11-07 16:46     ` [EXT] " Himanshu Madhani
2019-11-05 15:06 ` [PATCH 3/8] qla2xxx: Fix SRB leak on switch command timeout Himanshu Madhani
2019-11-05 15:19   ` Ewan D. Milne
2019-11-05 15:06 ` [PATCH 4/8] qla2xxx: Fix driver unload hang Himanshu Madhani
2019-11-05 15:19   ` Ewan D. Milne
2019-11-07 16:54   ` Bart Van Assche
     [not found]     ` <83CC0DDF-4907-41A2-91EC-1569A07A6BA9@marvell.com>
2019-11-07 17:58       ` Bart Van Assche
2019-11-07 18:30         ` Bart Van Assche
2019-11-08 23:38           ` [EXT] " Himanshu Madhani
2019-11-08 23:58             ` Bart Van Assche
2019-11-05 15:06 ` [PATCH 5/8] qla2xxx: Fix double scsi_done for abort path Himanshu Madhani
2019-11-05 15:20   ` Ewan D. Milne [this message]
2019-11-07 18:01   ` Bart Van Assche
2019-11-05 15:06 ` [PATCH 6/8] qla2xxx: Fix memory leak when sending I/O fails Himanshu Madhani
2019-11-05 15:20   ` Ewan D. Milne
2019-11-18 20:25   ` Himanshu Madhani
2019-11-19  5:02     ` Martin K. Petersen
2019-11-05 15:06 ` [PATCH 7/8] qla2xxx: Fix device connect issues in P2P configuration Himanshu Madhani
2019-11-05 15:21   ` Ewan D. Milne
2019-11-05 15:06 ` [PATCH 8/8] qla2xxx: Update driver version to 10.01.00.21-k Himanshu Madhani
2019-11-05 15:21   ` Ewan D. Milne
2019-11-09  2:16 ` [PATCH 0/8] qla2xxx: Bug Fixes for the driver Martin K. Petersen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c3731b3424c15946a61cedfdef905aecb5ce44c1.camel@redhat.com \
    --to=emilne@redhat.com \
    --cc=James.Bottomley@HansenPartnership.com \
    --cc=hmadhani@marvell.com \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox