From: Sasha Levin <sashal@kernel.org>
To: stable@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: James Smart <jsmart2021@gmail.com>,
Christoph Hellwig <hch@lst.de>, Sasha Levin <sashal@kernel.org>,
linux-nvme@lists.infradead.org
Subject: [PATCH AUTOSEL 4.19 25/68] nvme-fc: resolve io failures during connect
Date: Thu, 29 Nov 2018 00:55:16 -0500 [thread overview]
Message-ID: <20181129055559.159228-25-sashal@kernel.org> (raw)
In-Reply-To: <20181129055559.159228-1-sashal@kernel.org>
From: James Smart <jsmart2021@gmail.com>
[ Upstream commit 4cff280a5fccf6513ed9e895bb3a4e7ad8b0cedc ]
If an io error occurs on an io issued while connecting, recovery
of the io falls flat as the state checking ends up nooping the error
handler.
Create an err_work work item that is scheduled upon an io error while
connecting. The work thread terminates all io on all queues and marks
the queues as not connected. The termination of the io will return
back to the callee, which will then back out of the connection attempt
and will reschedule, if possible, the connection attempt.
The changes:
- in case there are several commands hitting the error handler, a
state flag is kept so that the error work is only scheduled once,
on the first error. The subsequent errors can be ignored.
- The calling sequence to stop keep alive and terminate the queues
and their io is lifted from the reset routine. Made a small
service routine used by both reset and err_work.
- During debugging, found that the teardown path can reference
an uninitialized pointer, resulting in a NULL pointer oops.
The aen_ops weren't initialized yet. Add validation on their
initialization before calling the teardown routine.
Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
drivers/nvme/host/fc.c | 73 ++++++++++++++++++++++++++++++++++++------
1 file changed, 63 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 611e70cae754..9375fa705d82 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -144,6 +144,7 @@ struct nvme_fc_ctrl {
bool ioq_live;
bool assoc_active;
+ atomic_t err_work_active;
u64 association_id;
struct list_head ctrl_list; /* rport->ctrl_list */
@@ -152,6 +153,7 @@ struct nvme_fc_ctrl {
struct blk_mq_tag_set tag_set;
struct delayed_work connect_work;
+ struct work_struct err_work;
struct kref ref;
u32 flags;
@@ -1523,6 +1525,10 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
int i;
+ /* ensure we've initialized the ops once */
+ if (!(aen_op->flags & FCOP_FLAGS_AEN))
+ return;
+
for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++)
__nvme_fc_abort_op(ctrl, aen_op);
}
@@ -2036,7 +2042,25 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
static void
nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
{
- /* only proceed if in LIVE state - e.g. on first error */
+ int active;
+
+ /*
+ * if an error (io timeout, etc) while (re)connecting,
+ * it's an error on creating the new association.
+ * Start the error recovery thread if it hasn't already
+ * been started. It is expected there could be multiple
+ * ios hitting this path before things are cleaned up.
+ */
+ if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
+ active = atomic_xchg(&ctrl->err_work_active, 1);
+ if (!active && !schedule_work(&ctrl->err_work)) {
+ atomic_set(&ctrl->err_work_active, 0);
+ WARN_ON(1);
+ }
+ return;
+ }
+
+ /* Otherwise, only proceed if in LIVE state - e.g. on first error */
if (ctrl->ctrl.state != NVME_CTRL_LIVE)
return;
@@ -2802,6 +2826,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+ cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->connect_work);
/*
* kill the association on the link side. this will block
@@ -2854,23 +2879,30 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
}
static void
-nvme_fc_reset_ctrl_work(struct work_struct *work)
+__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
{
- struct nvme_fc_ctrl *ctrl =
- container_of(work, struct nvme_fc_ctrl, ctrl.reset_work);
- int ret;
-
- nvme_stop_ctrl(&ctrl->ctrl);
+ nvme_stop_keep_alive(&ctrl->ctrl);
/* will block will waiting for io to terminate */
nvme_fc_delete_association(ctrl);
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+ if (ctrl->ctrl.state != NVME_CTRL_CONNECTING &&
+ !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
dev_err(ctrl->ctrl.device,
"NVME-FC{%d}: error_recovery: Couldn't change state "
"to CONNECTING\n", ctrl->cnum);
- return;
- }
+}
+
+static void
+nvme_fc_reset_ctrl_work(struct work_struct *work)
+{
+ struct nvme_fc_ctrl *ctrl =
+ container_of(work, struct nvme_fc_ctrl, ctrl.reset_work);
+ int ret;
+
+ __nvme_fc_terminate_io(ctrl);
+
+ nvme_stop_ctrl(&ctrl->ctrl);
if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE)
ret = nvme_fc_create_association(ctrl);
@@ -2885,6 +2917,24 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
ctrl->cnum);
}
+static void
+nvme_fc_connect_err_work(struct work_struct *work)
+{
+ struct nvme_fc_ctrl *ctrl =
+ container_of(work, struct nvme_fc_ctrl, err_work);
+
+ __nvme_fc_terminate_io(ctrl);
+
+ atomic_set(&ctrl->err_work_active, 0);
+
+ /*
+ * Rescheduling the connection after recovering
+ * from the io error is left to the reconnect work
+ * item, which is what should have stalled waiting on
+ * the io that had the error that scheduled this work.
+ */
+}
+
static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.name = "fc",
.module = THIS_MODULE,
@@ -2995,6 +3045,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->cnum = idx;
ctrl->ioq_live = false;
ctrl->assoc_active = false;
+ atomic_set(&ctrl->err_work_active, 0);
init_waitqueue_head(&ctrl->ioabort_wait);
get_device(ctrl->dev);
@@ -3002,6 +3053,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
+ INIT_WORK(&ctrl->err_work, nvme_fc_connect_err_work);
spin_lock_init(&ctrl->lock);
/* io queue count */
@@ -3092,6 +3144,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
fail_ctrl:
nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
cancel_work_sync(&ctrl->ctrl.reset_work);
+ cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->connect_work);
ctrl->ctrl.opts = NULL;
--
2.17.1
next prev parent reply other threads:[~2018-11-29 5:58 UTC|newest]
Thread overview: 77+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-11-29 5:54 [PATCH AUTOSEL 4.19 01/68] media: vicodec: lower minimum height to 360 Sasha Levin
2018-11-29 5:54 ` [PATCH AUTOSEL 4.19 02/68] media: cec: check for non-OK/NACK conditions while claiming a LA Sasha Levin
2018-11-29 5:54 ` [PATCH AUTOSEL 4.19 03/68] media: omap3isp: Unregister media device as first Sasha Levin
2018-11-29 5:54 ` [PATCH AUTOSEL 4.19 04/68] media: ipu3-cio2: Unregister device nodes first, then release resources Sasha Levin
2018-11-29 5:54 ` [PATCH AUTOSEL 4.19 05/68] iommu/vt-d: Fix NULL pointer dereference in prq_event_thread() Sasha Levin
2018-11-29 5:54 ` [PATCH AUTOSEL 4.19 06/68] brcmutil: really fix decoding channel info for 160 MHz bandwidth Sasha Levin
2018-11-29 11:49 ` Kalle Valo
2018-11-29 16:54 ` Sasha Levin
2018-11-29 5:54 ` [PATCH AUTOSEL 4.19 07/68] mt76: fix building without CONFIG_LEDS_CLASS Sasha Levin
2018-11-29 5:54 ` [PATCH AUTOSEL 4.19 08/68] iommu/ipmmu-vmsa: Fix crash on early domain free Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 09/68] scsi: ufs: Fix hynix ufs bug with quirk on hi36xx SoC Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 10/68] can: ucan: remove set but not used variable 'udev' Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 11/68] can: rcar_can: Fix erroneous registration Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 12/68] test_firmware: fix error return getting clobbered Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 13/68] HID: input: Ignore battery reported by Symbol DS4308 Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 14/68] batman-adv: Use explicit tvlv padding for ELP packets Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 15/68] batman-adv: Expand merged fragment buffer for full packet Sasha Levin
2018-11-29 10:00 ` Sergei Shtylyov
2018-11-29 10:04 ` Sergei Shtylyov
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 16/68] amd/iommu: Fix Guest Virtual APIC Log Tail Address Register Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 17/68] bnx2x: Assign unique DMAE channel number for FW DMAE transactions Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 18/68] qed: Fix PTT leak in qed_drain() Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 19/68] qed: Fix overriding offload_tc by protocols without APP TLV Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 20/68] qed: Fix rdma_info structure allocation Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 21/68] qed: Fix reading wrong value in loop condition Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 22/68] usb: dwc2: pci: Fix an error code in probe Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 23/68] Revert "usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers" Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 24/68] s390/ism: clear dmbe_mask bit before SMC IRQ handling Sasha Levin
2018-11-29 5:55 ` Sasha Levin [this message]
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 26/68] bnxt_en: Fix filling time in bnxt_fill_coredump_record() Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 27/68] drm/amdgpu: Add amdgpu "max bpc" connector property (v2) Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 28/68] drm/amd/display: Support " Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 29/68] net/mlx4_core: Zero out lkey field in SW2HW_MPT fw command Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 30/68] net/mlx4_core: Fix uninitialized variable compilation warning Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 31/68] net/mlx4: Fix UBSAN warning of signed integer overflow Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 32/68] drivers/net/ethernet/qlogic/qed/qed_rdma.h: fix typo Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 33/68] gpio: pxa: fix legacy non pinctrl aware builds again Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 34/68] gpio: mockup: fix indicated direction Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 35/68] tc-testing: tdc.py: ignore errors when decoding stdout/stderr Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 36/68] tc-testing: tdc.py: Guard against lack of returncode in executed command Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 37/68] mtd: rawnand: qcom: Namespace prefix some commands Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 38/68] cpufreq: ti-cpufreq: Only register platform_device when supported Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 39/68] exec: make de_thread() freezable Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 40/68] ALSA: hda/ca0132 - Add new ZxR quirk Sasha Levin
2018-11-29 14:51 ` Connor McAdams
2018-12-05 16:00 ` Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 41/68] Revert "HID: uhid: use strlcpy() instead of strncpy()" Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 42/68] HID: steam: remove input device when a hid client is running Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 43/68] HID: multitouch: Add pointstick support for Cirque Touchpad Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 44/68] mtd: spi-nor: Fix Cadence QSPI page fault kernel panic Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 45/68] net: ena: fix crash during failed resume from hibernation Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 46/68] NFSv4: Fix a NFSv4 state manager deadlock Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 47/68] qed: Fix bitmap_weight() check Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 48/68] qed: Fix QM getters to always return a valid pq Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 49/68] net/ibmnvic: Fix deadlock problem in reset Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 50/68] riscv: fix warning in arch/riscv/include/asm/module.h Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 51/68] iomap: FUA is wrong for DIO O_DSYNC writes into unwritten extents Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 52/68] iomap: sub-block dio needs to zeroout beyond EOF Sasha Levin
2018-11-29 12:19 ` Dave Chinner
2018-11-29 12:36 ` Amir Goldstein
2018-11-29 22:43 ` Dave Chinner
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 53/68] iomap: dio data corruption and spurious errors when pipes fill Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 54/68] iomap: readpages doesn't zero page tail beyond EOF Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 55/68] net: faraday: ftmac100: remove netif_running(netdev) check before disabling interrupts Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 56/68] iommu/vt-d: Use memunmap to free memremap Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 57/68] NFSv4.2 copy do not allocate memory under the lock Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 58/68] flexfiles: use per-mirror specified stateid for IO Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 59/68] net/dim: Update DIM start sample after each DIM iteration Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 60/68] net: thunderx: set xdp_prog to NULL if bpf_prog_add fails Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 61/68] ibmvnic: Fix RX queue buffer cleanup Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 62/68] ibmvnic: Update driver queues after change in ring size support Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 63/68] virtio-net: disable guest csum during XDP set Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 64/68] virtio-net: fail XDP set if guest csum is negotiated Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 65/68] team: no need to do team_notify_peers or team_mcast_rejoin when disabling port Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 66/68] net: amd: add missing of_node_put() Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 67/68] net: thunderx: set tso_hdrs pointer to NULL in nicvf_free_snd_queue Sasha Levin
2018-11-29 5:55 ` [PATCH AUTOSEL 4.19 68/68] net: gemini: Fix copy/paste error Sasha Levin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20181129055559.159228-25-sashal@kernel.org \
--to=sashal@kernel.org \
--cc=hch@lst.de \
--cc=jsmart2021@gmail.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox