[PATCH v3 04/17] lpfc: Fix crash after bad bar setup on driver attachment

stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v3 04/17] lpfc: Fix crash after bad bar setup on driver attachment
       [not found] <20171121000044.27702-1-jsmart2021@gmail.com>
@ 2017-11-21  0:00 ` James Smart
  2017-11-21  0:00 ` [PATCH v3 15/17] lpfc: Fix driver handling of nvme resources during unload James Smart
  1 sibling, 0 replies; 2+ messages in thread
From: James Smart @ 2017-11-21  0:00 UTC (permalink / raw)
  To: linux-scsi; +Cc: James Smart, stable, Dick Kennedy, James Smart

In test cases where an instance of the driver is detached and
reattached, the driver will crash on reattachment. There is a
compound if statement that will skip over the bar setup if
the pci_resource_start call is not successful. The driver
erroneously returns success to its bar setup in this scenario
even though the bars aren't properly configured.

Rework the offending code segment for proper initialization steps.
If the pci_resource_start call fails, -ENOMEM is now returned.

Sample stack:

rport-5:0-10: blocked FC remote port time out: removing rport
BUG: unable to handle kernel NULL pointer dereference at           (null)
... lpfc_sli4_wait_bmbx_ready+0x32/0x70 [lpfc]
...
...  RIP: 0010:...  ... lpfc_sli4_wait_bmbx_ready+0x32/0x70 [lpfc]
 Call Trace:
  ... lpfc_sli4_post_sync_mbox+0x106/0x4d0 [lpfc]
  ... ? __alloc_pages_nodemask+0x176/0x420
  ... ? __kmalloc+0x2e/0x230
  ... lpfc_sli_issue_mbox_s4+0x533/0x720 [lpfc]
  ... ? mempool_alloc+0x69/0x170
  ... ? dma_generic_alloc_coherent+0x8f/0x140
  ... lpfc_sli_issue_mbox+0xf/0x20 [lpfc]
  ... lpfc_sli4_driver_resource_setup+0xa6f/0x1130 [lpfc]
  ... ? lpfc_pci_probe_one+0x23e/0x16f0 [lpfc]
  ... lpfc_pci_probe_one+0x445/0x16f0 [lpfc]
  ... local_pci_probe+0x45/0xa0
  ... work_for_cpu_fn+0x14/0x20
  ... process_one_work+0x17a/0x440

Cc: <stable@vger.kernel.org> # 4.12+
Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/scsi/lpfc/lpfc_init.c | 84 ++++++++++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 745aff753396..fc9f91327724 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -9437,44 +9437,62 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba)
 		lpfc_sli4_bar0_register_memmap(phba, if_type);
 	}
 
-	if ((if_type == LPFC_SLI_INTF_IF_TYPE_0) &&
-	    (pci_resource_start(pdev, PCI_64BIT_BAR2))) {
-		/*
-		 * Map SLI4 if type 0 HBA Control Register base to a kernel
-		 * virtual address and setup the registers.
-		 */
-		phba->pci_bar1_map = pci_resource_start(pdev, PCI_64BIT_BAR2);
-		bar1map_len = pci_resource_len(pdev, PCI_64BIT_BAR2);
-		phba->sli4_hba.ctrl_regs_memmap_p =
-				ioremap(phba->pci_bar1_map, bar1map_len);
-		if (!phba->sli4_hba.ctrl_regs_memmap_p) {
-			dev_printk(KERN_ERR, &pdev->dev,
-			   "ioremap failed for SLI4 HBA control registers.\n");
+	if (if_type == LPFC_SLI_INTF_IF_TYPE_0) {
+		if (pci_resource_start(pdev, PCI_64BIT_BAR2)) {
+			/*
+			 * Map SLI4 if type 0 HBA Control Register base to a
+			 * kernel virtual address and setup the registers.
+			 */
+			phba->pci_bar1_map = pci_resource_start(pdev,
+								PCI_64BIT_BAR2);
+			bar1map_len = pci_resource_len(pdev, PCI_64BIT_BAR2);
+			phba->sli4_hba.ctrl_regs_memmap_p =
+					ioremap(phba->pci_bar1_map,
+						bar1map_len);
+			if (!phba->sli4_hba.ctrl_regs_memmap_p) {
+				dev_err(&pdev->dev,
+					   "ioremap failed for SLI4 HBA "
+					    "control registers.\n");
+				error = -ENOMEM;
+				goto out_iounmap_conf;
+			}
+			phba->pci_bar2_memmap_p =
+					 phba->sli4_hba.ctrl_regs_memmap_p;
+			lpfc_sli4_bar1_register_memmap(phba);
+		} else {
+			error = -ENOMEM;
 			goto out_iounmap_conf;
 		}
-		phba->pci_bar2_memmap_p = phba->sli4_hba.ctrl_regs_memmap_p;
-		lpfc_sli4_bar1_register_memmap(phba);
 	}
 
-	if ((if_type == LPFC_SLI_INTF_IF_TYPE_0) &&
-	    (pci_resource_start(pdev, PCI_64BIT_BAR4))) {
-		/*
-		 * Map SLI4 if type 0 HBA Doorbell Register base to a kernel
-		 * virtual address and setup the registers.
-		 */
-		phba->pci_bar2_map = pci_resource_start(pdev, PCI_64BIT_BAR4);
-		bar2map_len = pci_resource_len(pdev, PCI_64BIT_BAR4);
-		phba->sli4_hba.drbl_regs_memmap_p =
-				ioremap(phba->pci_bar2_map, bar2map_len);
-		if (!phba->sli4_hba.drbl_regs_memmap_p) {
-			dev_printk(KERN_ERR, &pdev->dev,
-			   "ioremap failed for SLI4 HBA doorbell registers.\n");
-			goto out_iounmap_ctrl;
-		}
-		phba->pci_bar4_memmap_p = phba->sli4_hba.drbl_regs_memmap_p;
-		error = lpfc_sli4_bar2_register_memmap(phba, LPFC_VF0);
-		if (error)
+	if (if_type == LPFC_SLI_INTF_IF_TYPE_0) {
+		if (pci_resource_start(pdev, PCI_64BIT_BAR4)) {
+			/*
+			 * Map SLI4 if type 0 HBA Doorbell Register base to
+			 * a kernel virtual address and setup the registers.
+			 */
+			phba->pci_bar2_map = pci_resource_start(pdev,
+								PCI_64BIT_BAR4);
+			bar2map_len = pci_resource_len(pdev, PCI_64BIT_BAR4);
+			phba->sli4_hba.drbl_regs_memmap_p =
+					ioremap(phba->pci_bar2_map,
+						bar2map_len);
+			if (!phba->sli4_hba.drbl_regs_memmap_p) {
+				dev_err(&pdev->dev,
+					   "ioremap failed for SLI4 HBA"
+					   " doorbell registers.\n");
+				error = -ENOMEM;
+				goto out_iounmap_ctrl;
+			}
+			phba->pci_bar4_memmap_p =
+					phba->sli4_hba.drbl_regs_memmap_p;
+			error = lpfc_sli4_bar2_register_memmap(phba, LPFC_VF0);
+			if (error)
+				goto out_iounmap_all;
+		} else {
+			error = -ENOMEM;
 			goto out_iounmap_all;
+		}
 	}
 
 	return 0;
-- 
2.13.1

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH v3 15/17] lpfc: Fix driver handling of nvme resources during unload
       [not found] <20171121000044.27702-1-jsmart2021@gmail.com>
  2017-11-21  0:00 ` [PATCH v3 04/17] lpfc: Fix crash after bad bar setup on driver attachment James Smart
@ 2017-11-21  0:00 ` James Smart
  1 sibling, 0 replies; 2+ messages in thread
From: James Smart @ 2017-11-21  0:00 UTC (permalink / raw)
  To: linux-scsi; +Cc: James Smart, stable, Dick Kennedy, James Smart

During driver unload, the driver may crash due to NULL pointers.
The NULL pointers were due to the driver not protecting itself
sufficiently during some of the teardown paths.
Additionally, the driver was not waiting for and cleanup up nvme
io resources. As such, the driver wasn't making the callbacks
to the transport, stalling the transports association teardown.

This patch waits for io clean up before tearding down and adds
checks for possible NULL pointers.

Cc: <stable@vger.kernel.org> # 4.12+
Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/scsi/lpfc/lpfc_crtn.h |  2 +
 drivers/scsi/lpfc/lpfc_init.c | 18 ++++++++
 drivers/scsi/lpfc/lpfc_nvme.c | 96 ++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 105 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h
index 7e300734b345..dac33900bf17 100644
--- a/drivers/scsi/lpfc/lpfc_crtn.h
+++ b/drivers/scsi/lpfc/lpfc_crtn.h
@@ -254,6 +254,8 @@ void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba,
 			    struct lpfc_nvmet_ctxbuf *ctxp);
 int lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport,
 			       struct fc_frame_header *fc_hdr);
+void lpfc_sli_flush_nvme_rings(struct lpfc_hba *phba);
+void lpfc_nvme_wait_for_io_drain(struct lpfc_hba *phba);
 void lpfc_sli4_build_dflt_fcf_record(struct lpfc_hba *, struct fcf_record *,
 			uint16_t);
 int lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq,
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 7a06f23a3baf..c466ceb43bc9 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -10136,6 +10136,16 @@ lpfc_sli4_xri_exchange_busy_wait(struct lpfc_hba *phba)
 	int fcp_xri_cmpl = 1;
 	int els_xri_cmpl = list_empty(&phba->sli4_hba.lpfc_abts_els_sgl_list);
 
+	/* Driver just aborted IOs during the hba_unset process.  Pause
+	 * here to give the HBA time to complete the IO and get entries
+	 * into the abts lists.
+	 */
+	msleep(LPFC_XRI_EXCH_BUSY_WAIT_T1 * 5);
+
+	/* Wait for NVME pending IO to flush back to transport. */
+	if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)
+		lpfc_nvme_wait_for_io_drain(phba);
+
 	if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP)
 		fcp_xri_cmpl =
 			list_empty(&phba->sli4_hba.lpfc_abts_scsi_buf_list);
@@ -11659,6 +11669,10 @@ lpfc_sli4_prep_dev_for_reset(struct lpfc_hba *phba)
 	/* Flush all driver's outstanding SCSI I/Os as we are to reset */
 	lpfc_sli_flush_fcp_rings(phba);
 
+	/* Flush the outstanding NVME IOs if fc4 type enabled. */
+	if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)
+		lpfc_sli_flush_nvme_rings(phba);
+
 	/* stop all timers */
 	lpfc_stop_hba_timers(phba);
 
@@ -11690,6 +11704,10 @@ lpfc_sli4_prep_dev_for_perm_failure(struct lpfc_hba *phba)
 
 	/* Clean up all driver's outstanding SCSI I/Os */
 	lpfc_sli_flush_fcp_rings(phba);
+
+	/* Flush the outstanding NVME IOs if fc4 type enabled. */
+	if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)
+		lpfc_sli_flush_nvme_rings(phba);
 }
 
 /**
diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c
index 9b231c88ca8b..50bbc61bfe5d 100644
--- a/drivers/scsi/lpfc/lpfc_nvme.c
+++ b/drivers/scsi/lpfc/lpfc_nvme.c
@@ -88,6 +88,9 @@ lpfc_nvme_create_queue(struct nvme_fc_local_port *pnvme_lport,
 	struct lpfc_nvme_qhandle *qhandle;
 	char *str;
 
+	if (!pnvme_lport->private)
+		return -ENOMEM;
+
 	lport = (struct lpfc_nvme_lport *)pnvme_lport->private;
 	vport = lport->vport;
 	qhandle = kzalloc(sizeof(struct lpfc_nvme_qhandle), GFP_KERNEL);
@@ -140,6 +143,9 @@ lpfc_nvme_delete_queue(struct nvme_fc_local_port *pnvme_lport,
 	struct lpfc_nvme_lport *lport;
 	struct lpfc_vport *vport;
 
+	if (!pnvme_lport->private)
+		return;
+
 	lport = (struct lpfc_nvme_lport *)pnvme_lport->private;
 	vport = lport->vport;
 
@@ -1265,13 +1271,29 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport,
 	struct lpfc_nvme_buf *lpfc_ncmd;
 	struct lpfc_nvme_rport *rport;
 	struct lpfc_nvme_qhandle *lpfc_queue_info;
-	struct lpfc_nvme_fcpreq_priv *freqpriv = pnvme_fcreq->private;
+	struct lpfc_nvme_fcpreq_priv *freqpriv;
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
 	uint64_t start = 0;
 #endif
 
+	/* Validate pointers. LLDD fault handling with transport does
+	 * have timing races.
+	 */
 	lport = (struct lpfc_nvme_lport *)pnvme_lport->private;
+	if (unlikely(!lport)) {
+		ret = -EINVAL;
+		goto out_fail;
+	}
+
 	vport = lport->vport;
+
+	if (unlikely(!hw_queue_handle)) {
+		lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_ABTS,
+				 "6129 Fail Abort, NULL hw_queue_handle\n");
+		ret = -EINVAL;
+		goto out_fail;
+	}
+
 	phba = vport->phba;
 
 	if (vport->load_flag & FC_UNLOADING) {
@@ -1284,13 +1306,9 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport,
 		goto out_fail;
 	}
 
-	/* Validate pointers. */
-	if (!pnvme_lport || !pnvme_rport || !freqpriv) {
-		lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_IOERR | LOG_NODE,
-				 "6117 No Send:IO submit ptrs NULL, lport %p, "
-				 "rport %p fcreq_priv %p\n",
-				 pnvme_lport, pnvme_rport, freqpriv);
-		ret = -ENODEV;
+	freqpriv = pnvme_fcreq->private;
+	if (unlikely(!freqpriv)) {
+		ret = -EINVAL;
 		goto out_fail;
 	}
 
@@ -1497,20 +1515,34 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
 	struct lpfc_nvme_lport *lport;
 	struct lpfc_vport *vport;
 	struct lpfc_hba *phba;
-	struct lpfc_nvme_rport *rport;
 	struct lpfc_nvme_buf *lpfc_nbuf;
 	struct lpfc_iocbq *abts_buf;
 	struct lpfc_iocbq *nvmereq_wqe;
-	struct lpfc_nvme_fcpreq_priv *freqpriv = pnvme_fcreq->private;
+	struct lpfc_nvme_fcpreq_priv *freqpriv;
 	union lpfc_wqe *abts_wqe;
 	unsigned long flags;
 	int ret_val;
 
+	/* Validate pointers. LLDD fault handling with transport does
+	 * have timing races.
+	 */
 	lport = (struct lpfc_nvme_lport *)pnvme_lport->private;
-	rport = (struct lpfc_nvme_rport *)pnvme_rport->private;
+	if (unlikely(!lport))
+		return;
+
 	vport = lport->vport;
+
+	if (unlikely(!hw_queue_handle)) {
+		lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_ABTS,
+				 "6129 Fail Abort, HW Queue Handle NULL.\n");
+		return;
+	}
+
 	phba = vport->phba;
+	freqpriv = pnvme_fcreq->private;
 
+	if (unlikely(!freqpriv))
+		return;
 	if (vport->load_flag & FC_UNLOADING)
 		return;
 
@@ -2677,3 +2709,45 @@ lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba,
 			"6312 XRI Aborted xri x%x not found\n", xri);
 
 }
+
+/**
+ * lpfc_nvme_wait_for_io_drain - Wait for all NVME wqes to complete
+ * @phba: Pointer to HBA context object.
+ *
+ * This function flushes all wqes in the nvme rings and frees all resources
+ * in the txcmplq. This function does not issue abort wqes for the IO
+ * commands in txcmplq, they will just be returned with
+ * IOERR_SLI_DOWN. This function is invoked with EEH when device's PCI
+ * slot has been permanently disabled.
+ **/
+void
+lpfc_nvme_wait_for_io_drain(struct lpfc_hba *phba)
+{
+	struct lpfc_sli_ring  *pring;
+	u32 i, wait_cnt = 0;
+
+	if (phba->sli_rev < LPFC_SLI_REV4)
+		return;
+
+	/* Cycle through all NVME rings and make sure all outstanding
+	 * WQEs have been removed from the txcmplqs.
+	 */
+	for (i = 0; i < phba->cfg_nvme_io_channel; i++) {
+		pring = phba->sli4_hba.nvme_wq[i]->pring;
+
+		/* Retrieve everything on the txcmplq */
+		while (!list_empty(&pring->txcmplq)) {
+			msleep(LPFC_XRI_EXCH_BUSY_WAIT_T1);
+			wait_cnt++;
+
+			/* The sleep is 10mS.  Every ten seconds,
+			 * dump a message.  Something is wrong.
+			 */
+			if ((wait_cnt % 1000) == 0) {
+				lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
+						"6178 NVME IO not empty, "
+						"cnt %d\n", wait_cnt);
+			}
+		}
+	}
+}
-- 
2.13.1

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2017-11-21  0:01 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20171121000044.27702-1-jsmart2021@gmail.com>
2017-11-21  0:00 ` [PATCH v3 04/17] lpfc: Fix crash after bad bar setup on driver attachment James Smart
2017-11-21  0:00 ` [PATCH v3 15/17] lpfc: Fix driver handling of nvme resources during unload James Smart

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).