public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Aharon Landau <aharonl@nvidia.com>,
	Michael Guralnik <michaelgur@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	Sasha Levin <sashal@kernel.org>
Subject: [PATCH 5.19 05/38] RDMA/mlx5: Add a umr recovery flow
Date: Fri, 16 Sep 2022 12:08:39 +0200	[thread overview]
Message-ID: <20220916100448.664816105@linuxfoundation.org> (raw)
In-Reply-To: <20220916100448.431016349@linuxfoundation.org>

From: Aharon Landau <aharonl@nvidia.com>

[ Upstream commit 158e71bb69e368b8b33e8b7c4ac8c111da0c1ae2 ]

When a UMR fails, the UMR QP state changes to an error state. Therefore,
all the further UMR operations will fail too.

Add a recovery flow to the UMR QP, and repost the flushed WQEs.

Link: https://lore.kernel.org/r/6cc24816cca049bd8541317f5e41d3ac659445d3.1652588303.git.leonro@nvidia.com
Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Stable-dep-of: 9b7d4be967f1 ("RDMA/mlx5: Fix UMR cleanup on error flow of driver init")
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/infiniband/hw/mlx5/cq.c      |  4 ++
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 12 ++++-
 drivers/infiniband/hw/mlx5/umr.c     | 78 ++++++++++++++++++++++++----
 3 files changed, 83 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 08371a80fdc26..be189e0525de6 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -523,6 +523,10 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
 			    "Requestor" : "Responder", cq->mcq.cqn);
 		mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
 			    err_cqe->syndrome, err_cqe->vendor_err_synd);
+		if (wc->status != IB_WC_WR_FLUSH_ERR &&
+		    (*cur_qp)->type == MLX5_IB_QPT_REG_UMR)
+			dev->umrc.state = MLX5_UMR_STATE_RECOVER;
+
 		if (opcode == MLX5_CQE_REQ_ERR) {
 			wq = &(*cur_qp)->sq;
 			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 998b67509a533..7460e0dfe6db4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -717,13 +717,23 @@ struct mlx5_ib_umr_context {
 	struct completion	done;
 };
 
+enum {
+	MLX5_UMR_STATE_ACTIVE,
+	MLX5_UMR_STATE_RECOVER,
+	MLX5_UMR_STATE_ERR,
+};
+
 struct umr_common {
 	struct ib_pd	*pd;
 	struct ib_cq	*cq;
 	struct ib_qp	*qp;
-	/* control access to UMR QP
+	/* Protects from UMR QP overflow
 	 */
 	struct semaphore	sem;
+	/* Protects from using UMR while the UMR is not active
+	 */
+	struct mutex lock;
+	unsigned int state;
 };
 
 struct mlx5_cache_ent {
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 3a48364c09181..e00b94d1b1ea1 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
 	dev->umrc.pd = pd;
 
 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
+	mutex_init(&dev->umrc.lock);
 
 	return 0;
 
@@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
 	ib_dealloc_pd(dev->umrc.pd);
 }
 
+static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
+{
+	struct umr_common *umrc = &dev->umrc;
+	struct ib_qp_attr attr;
+	int err;
+
+	attr.qp_state = IB_QPS_RESET;
+	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
+	if (err) {
+		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+		goto err;
+	}
+
+	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
+	if (err)
+		goto err;
+
+	umrc->state = MLX5_UMR_STATE_ACTIVE;
+	return 0;
+
+err:
+	umrc->state = MLX5_UMR_STATE_ERR;
+	return err;
+}
+
 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
 			       struct mlx5r_umr_wqe *wqe, bool with_data)
 {
@@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
 
 	id.ib_cqe = cqe;
 	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
-			 MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR);
+			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
 
 	mlx5r_ring_db(qp, 1, ctrl);
 
@@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
 	mlx5r_umr_init_context(&umr_context);
 
 	down(&umrc->sem);
-	err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
-				  with_data);
-	if (err)
-		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
-	else {
-		wait_for_completion(&umr_context.done);
-		if (umr_context.status != IB_WC_SUCCESS) {
-			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
-				     umr_context.status);
+	while (true) {
+		mutex_lock(&umrc->lock);
+		if (umrc->state == MLX5_UMR_STATE_ERR) {
+			mutex_unlock(&umrc->lock);
 			err = -EFAULT;
+			break;
+		}
+
+		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
+			mutex_unlock(&umrc->lock);
+			usleep_range(3000, 5000);
+			continue;
+		}
+
+		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
+					  with_data);
+		mutex_unlock(&umrc->lock);
+		if (err) {
+			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
+				     err);
+			break;
 		}
+
+		wait_for_completion(&umr_context.done);
+
+		if (umr_context.status == IB_WC_SUCCESS)
+			break;
+
+		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
+			continue;
+
+		WARN_ON_ONCE(1);
+		mlx5_ib_warn(dev,
+			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
+			umr_context.status);
+		mutex_lock(&umrc->lock);
+		err = mlx5r_umr_recover(dev);
+		mutex_unlock(&umrc->lock);
+		if (err)
+			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
+				     err);
+		err = -EFAULT;
+		break;
 	}
 	up(&umrc->sem);
 	return err;
-- 
2.35.1




  parent reply	other threads:[~2022-09-16 10:25 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-16 10:08 [PATCH 5.19 00/38] 5.19.10-rc1 review Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 01/38] iommu/vt-d: Fix kdump kernels boot failure with scalable mode Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 02/38] net/mlx5: Introduce ifc bits for using software vhca id Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 03/38] net/mlx5: Use software VHCA id when its supported Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 04/38] RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile Greg Kroah-Hartman
2022-09-16 10:08 ` Greg Kroah-Hartman [this message]
2022-09-16 10:08 ` [PATCH 5.19 06/38] RDMA/mlx5: Fix UMR cleanup on error flow of driver init Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 07/38] ACPI: resource: skip IRQ override on AMD Zen platforms Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 08/38] Input: goodix - add support for GT1158 Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 09/38] platform/surface: aggregator_registry: Add support for Surface Laptop Go 2 Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 10/38] drm/msm/rd: Fix FIFO-full deadlock Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 11/38] peci: cpu: Fix use-after-free in adev_release() Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 12/38] hwmon: (pmbus) Use dev_err_probe() to filter -EPROBE_DEFER error messages Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 13/38] kvm: x86: mmu: Always flush TLBs when enabling dirty logging Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 14/38] dt-bindings: iio: gyroscope: bosch,bmg160: correct number of pins Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 15/38] HID: ishtp-hid-clientHID: ishtp-hid-client: Fix comment typo Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 16/38] hid: intel-ish-hid: ishtp: Fix ishtp client sending disordered message Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 17/38] Bluetooth: MGMT: Fix Get Device Flags Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 18/38] tg3: Disable tg3 device on system reboot to avoid triggering AER Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 19/38] r8152: add PID for the Lenovo OneLink+ Dock Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 20/38] gpio: mockup: remove gpio debugfs when remove device Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 21/38] ieee802154: cc2520: add rc code in cc2520_tx() Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 22/38] Input: iforce - add support for Boeder Force Feedback Wheel Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 23/38] drm/amdgpu: disable FRU access on special SIENNA CICHLID card Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 24/38] drm/amd/pm: use vbios carried pptable for all SMU13.0.7 SKUs Greg Kroah-Hartman
2022-09-16 10:08 ` [PATCH 5.19 25/38] nvme-pci: add NVME_QUIRK_BOGUS_NID for Lexar NM610 Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 26/38] nvmet-tcp: fix unhandled tcp states in nvmet_tcp_state_change() Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 27/38] drm/amd/amdgpu: skip ucode loading if ucode_size == 0 Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 28/38] net: dsa: hellcreek: Print warning only once Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 29/38] perf/arm_pmu_platform: fix tests for platform_get_irq() failure Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 30/38] platform/x86: acer-wmi: Acer Aspire One AOD270/Packard Bell Dot keymap fixes Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 31/38] usb: storage: Add ASUS <0x0b05:0x1932> to IGNORE_UAS Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 32/38] platform/x86: asus-wmi: Increase FAN_CURVE_BUF_LEN to 32 Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 33/38] LoongArch: Fix section mismatch due to acpi_os_ioremap() Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 34/38] LoongArch: Fix arch_remove_memory() undefined build error Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 35/38] gpio: 104-dio-48e: Make irq_chip immutable Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 36/38] gpio: 104-idio-16: " Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 37/38] RDMA/irdma: Use s/g array in post send only when its valid Greg Kroah-Hartman
2022-09-16 10:09 ` [PATCH 5.19 38/38] Input: goodix - add compatible string for GT1158 Greg Kroah-Hartman
2022-09-16 21:49 ` [PATCH 5.19 00/38] 5.19.10-rc1 review Guenter Roeck
2022-09-16 22:35 ` Ron Economos
2022-09-17  7:26 ` Fenil Jain
2022-09-17  8:16 ` Bagas Sanjaya
2022-09-17 14:29 ` Sudip Mukherjee (Codethink)
2022-09-17 15:03 ` Naresh Kamboju
2022-09-19  0:12 ` Justin Forbes
2022-09-19  1:06 ` Florian Fainelli

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220916100448.664816105@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=aharonl@nvidia.com \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=michaelgur@nvidia.com \
    --cc=sashal@kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox