From mboxrd@z Thu Jan 1 00:00:00 1970 From: Brian King Subject: Re: [PATCH v6 1/3] cxlflash: Base error recovery support Date: Mon, 17 Aug 2015 09:38:06 -0500 Message-ID: <55D1F1CE.1030900@linux.vnet.ibm.com> References: <1439520454-56759-1-git-send-email-mrochs@linux.vnet.ibm.com> Mime-Version: 1.0 Content-Type: text/plain; charset=windows-1252 Content-Transfer-Encoding: 7bit Return-path: Received: from e18.ny.us.ibm.com ([129.33.205.208]:41497 "EHLO e18.ny.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751317AbbHQOiS (ORCPT ); Mon, 17 Aug 2015 10:38:18 -0400 Received: from /spool/local by e18.ny.us.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! Violators will be prosecuted for from ; Mon, 17 Aug 2015 10:38:17 -0400 Received: from b01cxnp23032.gho.pok.ibm.com (b01cxnp23032.gho.pok.ibm.com [9.57.198.27]) by d01dlp01.pok.ibm.com (Postfix) with ESMTP id 2048338C806A for ; Mon, 17 Aug 2015 10:38:14 -0400 (EDT) Received: from d01av04.pok.ibm.com (d01av04.pok.ibm.com [9.56.224.64]) by b01cxnp23032.gho.pok.ibm.com (8.14.9/8.14.9/NCO v10.0) with ESMTP id t7HEcEq845678788 for ; Mon, 17 Aug 2015 14:38:14 GMT Received: from d01av04.pok.ibm.com (localhost [127.0.0.1]) by d01av04.pok.ibm.com (8.14.4/8.14.4/NCO v10.0 AVout) with ESMTP id t7HEcBW7000890 for ; Mon, 17 Aug 2015 10:38:13 -0400 In-Reply-To: <1439520454-56759-1-git-send-email-mrochs@linux.vnet.ibm.com> Sender: linux-scsi-owner@vger.kernel.org List-Id: linux-scsi@vger.kernel.org To: "Matthew R. Ochs" , linux-scsi@vger.kernel.org, James.Bottomley@HansenPartnership.com, nab@linux-iscsi.org, wenxiong@linux.vnet.ibm.com, mikey@neuling.org, dja@ozlabs.au.ibm.com, benh@kernel.crashing.org Cc: hch@infradead.org, imunsie@au1.ibm.com, "Manoj N. Kumar" On 08/13/2015 09:47 PM, Matthew R. 
Ochs wrote: > --- a/drivers/scsi/cxlflash/common.h > +++ b/drivers/scsi/cxlflash/common.h > @@ -76,6 +76,12 @@ enum cxlflash_init_state { > INIT_STATE_SCSI > }; > > +enum cxlflash_state { > + STATE_NORMAL, /* Normal running state, everything good */ > + STATE_LIMBO, /* Limbo running state, trying to reset/recover */ It might be clearer to call this STATE_RESETTING or STATE_RECOVERY. > + STATE_FAILTERM /* Failed/terminating state, error out users/threads */ > +}; > + > /* > * Each context has its own set of resource handles that is visible > * only from that context. > @@ -105,7 +109,8 @@ struct cxlflash_cfg { > > wait_queue_head_t tmf_waitq; > bool tmf_active; > - u8 err_recovery_active:1; > + wait_queue_head_t limbo_waitq; How about reset_waitq instead? > + enum cxlflash_state state; > }; > > struct afu_cmd { > @@ -455,9 +471,21 @@ static int cxlflash_eh_device_reset_handler(struct scsi_cmnd *scp) > get_unaligned_be32(&((u32 *)scp->cmnd)[2]), > get_unaligned_be32(&((u32 *)scp->cmnd)[3])); > > - rcr = send_tmf(afu, scp, TMF_LUN_RESET); > - if (unlikely(rcr)) > + switch (cfg->state) { > + case STATE_NORMAL: > + rcr = send_tmf(afu, scp, TMF_LUN_RESET); > + if (unlikely(rcr)) > + rc = FAILED; > + break; > + case STATE_LIMBO: > + wait_event(cfg->limbo_waitq, cfg->state != STATE_LIMBO); > + if (cfg->state == STATE_NORMAL) > + break; In this case you've been asked to do a LUN reset but didn't actually reset the LUN. I'd suggest restructuring this switch statement to send the LUN reset TMF in the case where you had to wait for an AFU reset to complete. 
> + /* fall through */ > + default: > rc = FAILED; > + break; > + } > > pr_debug("%s: returning rc=%d\n", __func__, rc); > return rc; > @@ -487,11 +515,29 @@ static int cxlflash_eh_host_reset_handler(struct scsi_cmnd *scp) > get_unaligned_be32(&((u32 *)scp->cmnd)[2]), > get_unaligned_be32(&((u32 *)scp->cmnd)[3])); > > - rcr = cxlflash_afu_reset(cfg); > - if (rcr == 0) > - rc = SUCCESS; > - else > + switch (cfg->state) { > + case STATE_NORMAL: > + cfg->state = STATE_LIMBO; > + scsi_block_requests(cfg->host); > + > + rcr = cxlflash_afu_reset(cfg); > + if (rcr) { > + rc = FAILED; > + cfg->state = STATE_FAILTERM; > + } else > + cfg->state = STATE_NORMAL; > + wake_up_all(&cfg->limbo_waitq); > + scsi_unblock_requests(cfg->host); The scsi_block_requests / scsi_unblock_requests pair is not necessary in this path, since SCSI EH will already be preventing any new commands from being issued via queuecommand. > + break; > + case STATE_LIMBO: > + wait_event(cfg->limbo_waitq, cfg->state != STATE_LIMBO); > + if (cfg->state == STATE_NORMAL) > + break; > + /* fall through */ > + default: > rc = FAILED; > + break; > + } > > pr_debug("%s: returning rc=%d\n", __func__, rc); > return rc; > @@ -642,7 +688,7 @@ static void cxlflash_wait_for_pci_err_recovery(struct cxlflash_cfg *cfg) > struct pci_dev *pdev = cfg->dev; > > if (pci_channel_offline(pdev)) > - wait_event_timeout(cfg->eeh_waitq, > + wait_event_timeout(cfg->limbo_waitq, > !pci_channel_offline(pdev), > CXLFLASH_PCI_ERROR_RECOVERY_TIMEOUT); > } > @@ -825,6 +871,8 @@ static void cxlflash_remove(struct pci_dev *pdev) > !cfg->tmf_active); > spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags); > > + cfg->state = STATE_FAILTERM; I don't see any locking around the setting or reading of this flag. What are the implications if the processor reorders the store to change this state, either here or elsewhere? The same goes for the load associated with checking the state. 
> + > switch (cfg->init_state) { > case INIT_STATE_SCSI: > scsi_remove_host(cfg->host); > @@ -1879,6 +1927,8 @@ static int init_afu(struct cxlflash_cfg *cfg) > struct afu *afu = cfg->afu; > struct device *dev = &cfg->dev->dev; > > + cxl_perst_reloads_same_image(cfg->cxl_afu, true); > + > rc = init_mc(cfg); > if (rc) { > dev_err(dev, "%s: call to init_mc failed, rc=%d!\n", Reviewed-by: Brian King -- Brian King Power Linux I/O IBM Linux Technology Center