All of lore.kernel.org
 help / color / mirror / Atom feed
From: Long Li <longli@microsoft.com>
To: Long Li <longli@microsoft.com>,
	Konstantin Taranov <kotaranov@microsoft.com>,
	Jakub Kicinski <kuba@kernel.org>,
	"David S . Miller" <davem@davemloft.net>,
	Paolo Abeni <pabeni@redhat.com>,
	Eric Dumazet <edumazet@google.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	Jason Gunthorpe <jgg@ziepe.ca>, Leon Romanovsky <leon@kernel.org>,
	Haiyang Zhang <haiyangz@microsoft.com>,
	"K . Y . Srinivasan" <kys@microsoft.com>,
	Wei Liu <wei.liu@kernel.org>, Dexuan Cui <decui@microsoft.com>
Cc: Simon Horman <horms@kernel.org>,
	netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH rdma-next 7/8] RDMA/mana_ib: Notify service reset events to RDMA devices
Date: Fri,  6 Mar 2026 17:47:21 -0800	[thread overview]
Message-ID: <20260307014723.556523-8-longli@microsoft.com> (raw)
In-Reply-To: <20260307014723.556523-1-longli@microsoft.com>

Register reset_notify and resume_notify callbacks so the RDMA driver
is informed when the MANA service undergoes a reset cycle.

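On reset notification:
  - Acquire reset_rwsem write lock to serialize with resource creation
  - Walk every tracked ucontext and invalidate firmware handles for
    all PD, CQ, WQ, QP, and MR resources (set to INVALID_MANA_HANDLE);
    also remove the CQ callbacks and mark each ucontext doorbell as
    INVALID_DOORBELL
  - Release reset_rwsem write lock, unblocking new resource creation
  - Revoke user doorbell mappings with rdma_user_mmap_disassociate()
  - Dispatch IB_EVENT_PORT_ERR on each port so userspace
    (e.g. DPDK) learns about the reset

On resume notification:
  - Dispatch IB_EVENT_PORT_ACTIVE on each port so userspace learns
    that the device is usable again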

Resource creation paths (alloc_ucontext, alloc_pd, create_cq, create_wq,
create_qp for RAW_PACKET, reg_user_mr) acquire reset_rwsem read lock to
ensure handles are not invalidated while being set up.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/cq.c               |  15 ++-
 drivers/infiniband/hw/mana/device.c           | 103 ++++++++++++++++++
 drivers/infiniband/hw/mana/main.c             |   9 ++
 drivers/infiniband/hw/mana/mana_ib.h          |   2 +
 drivers/infiniband/hw/mana/mr.c               |   4 +
 drivers/infiniband/hw/mana/qp.c               |   5 +
 drivers/infiniband/hw/mana/wq.c               |   4 +
 drivers/net/ethernet/microsoft/mana/mana_en.c |  14 ++-
 include/net/mana/gdma.h                       |   6 +
 9 files changed, 155 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index 89cf60987ff5..b054684b8de7 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -41,13 +41,17 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 			ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
 			return -EINVAL;
 		}
+	}
+
+	down_read(&mdev->reset_rwsem);
 
+	if (udata) {
 		cq->cqe = attr->cqe;
 		err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE,
 					   &cq->queue);
 		if (err) {
 			ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err);
-			return err;
+			goto err_unlock;
 		}
 
 		mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext,
@@ -56,14 +60,15 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	} else {
 		if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) {
 			ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
-			return -EINVAL;
+			err = -EINVAL;
+			goto err_unlock;
 		}
 		buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE));
 		cq->cqe = buf_size / COMP_ENTRY_SIZE;
 		err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue);
 		if (err) {
 			ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err);
-			return err;
+			goto err_unlock;
 		}
 		doorbell = mdev->gdma_dev->doorbell;
 	}
@@ -105,6 +110,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&mdev->reset_rwsem);
 	return 0;
 
 err_remove_cq_cb:
@@ -113,7 +119,8 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	mana_ib_gd_destroy_cq(mdev, cq);
 err_destroy_queue:
 	mana_ib_destroy_queue(mdev, &cq->queue);
-
+err_unlock:
+	up_read(&mdev->reset_rwsem);
 	return err;
 }
 
diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index 149e8d4d5b8e..081be31563ca 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -103,6 +103,7 @@ static int mana_ib_netdev_event(struct notifier_block *this,
 					netdev_put(ndev, &dev->dev_tracker);
 
 				return NOTIFY_OK;
+
 			default:
 				return NOTIFY_DONE;
 			}
@@ -110,6 +111,93 @@ static int mana_ib_netdev_event(struct notifier_block *this,
 	return NOTIFY_DONE;
 }
 
+/*
+ * Reset cleanup: invalidate firmware handles for all tracked user objects.
+ *
+ * Called during service reset BEFORE dispatching IB_EVENT_PORT_ERR to
+ * user-mode.
+ *
+ * Only invalidates FW handles — does NOT free kernel resources (umem, queues)
+ * or remove objects from lists. The IB core's destroy callbacks handle full
+ * resource teardown when user-space closes the uverbs FD or ib_unregister_device
+ * is called. The destroy callbacks skip FW commands when the handle is already
+ * INVALID_MANA_HANDLE.
+ *
+ * For CQs, also removes the CQ callback to prevent stale completions.
+ */
+static void mana_ib_reset_notify(void *ctx)
+{
+	struct mana_ib_dev *mdev = ctx;
+	struct mana_ib_ucontext *uctx;
+	struct mana_ib_qp *qp;
+	struct mana_ib_wq *wq;
+	struct mana_ib_cq *cq;
+	struct mana_ib_mr *mr;
+	struct mana_ib_pd *pd;
+	struct ib_event ibev;
+	int i;
+
+	down_write(&mdev->reset_rwsem);
+
+	ibdev_dbg(&mdev->ib_dev, "reset cleanup starting\n");
+
+	mutex_lock(&mdev->ucontext_lock);
+	list_for_each_entry(uctx, &mdev->ucontext_list, dev_list) {
+		mutex_lock(&uctx->lock);
+
+		list_for_each_entry(qp, &uctx->qp_list, ucontext_list)
+			qp->qp_handle = INVALID_MANA_HANDLE;
+
+		list_for_each_entry(wq, &uctx->wq_list, ucontext_list)
+			wq->rx_object = INVALID_MANA_HANDLE;
+
+		list_for_each_entry(cq, &uctx->cq_list, ucontext_list) {
+			mana_ib_remove_cq_cb(mdev, cq);
+			cq->cq_handle = INVALID_MANA_HANDLE;
+		}
+
+		list_for_each_entry(mr, &uctx->mr_list, ucontext_list)
+			mr->mr_handle = INVALID_MANA_HANDLE;
+
+		list_for_each_entry(pd, &uctx->pd_list, ucontext_list)
+			pd->pd_handle = INVALID_MANA_HANDLE;
+
+		uctx->doorbell = INVALID_DOORBELL;
+
+		mutex_unlock(&uctx->lock);
+	}
+	mutex_unlock(&mdev->ucontext_lock);
+
+	up_write(&mdev->reset_rwsem);
+
+	/* Revoke user doorbell mappings so userspace cannot ring
+	 * stale doorbells after firmware handles are invalidated.
+	 */
+	rdma_user_mmap_disassociate(&mdev->ib_dev);
+
+	/* Notify userspace (e.g. DPDK) that the port is down */
+	for (i = 0; i < mdev->ib_dev.phys_port_cnt; i++) {
+		ibev.device = &mdev->ib_dev;
+		ibev.element.port_num = i + 1;
+		ibev.event = IB_EVENT_PORT_ERR;
+		ib_dispatch_event(&ibev);
+	}
+}
+
+static void mana_ib_resume_notify(void *ctx)
+{
+	struct mana_ib_dev *dev = ctx;
+	struct ib_event ibev;
+	int i;
+
+	for (i = 0; i < dev->ib_dev.phys_port_cnt; i++) {
+		ibev.device = &dev->ib_dev;
+		ibev.element.port_num = i + 1;
+		ibev.event = IB_EVENT_PORT_ACTIVE;
+		ib_dispatch_event(&ibev);
+	}
+}
+
 static int mana_ib_probe(struct auxiliary_device *adev,
 			 const struct auxiliary_device_id *id)
 {
@@ -134,6 +222,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 	xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ);
 	mutex_init(&dev->ucontext_lock);
 	INIT_LIST_HEAD(&dev->ucontext_list);
+	init_rwsem(&dev->reset_rwsem);
 
 	if (mana_ib_is_rnic(dev)) {
 		dev->ib_dev.phys_port_cnt = 1;
@@ -216,6 +305,15 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 
 	dev_set_drvdata(&adev->dev, dev);
 
+	/* ETH device persists across reset — use callback for cleanup.
+	 * RNIC device is removed/re-added, so its cleanup happens in remove.
+	 */
+	if (!mana_ib_is_rnic(dev)) {
+		mdev->reset_notify = mana_ib_reset_notify;
+		mdev->resume_notify = mana_ib_resume_notify;
+		mdev->reset_notify_ctx = dev;
+	}
+
 	return 0;
 
 deallocate_pool:
@@ -242,6 +340,11 @@ static void mana_ib_remove(struct auxiliary_device *adev)
 	if (mana_ib_is_rnic(dev))
 		mana_drain_gsi_sqs(dev);
 
+	if (!mana_ib_is_rnic(dev)) {
+		dev->gdma_dev->reset_notify = NULL;
+		dev->gdma_dev->resume_notify = NULL;
+		dev->gdma_dev->reset_notify_ctx = NULL;
+	}
 	ib_unregister_device(&dev->ib_dev);
 	dma_pool_destroy(dev->av_pool);
 	if (mana_ib_is_rnic(dev)) {
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index f739e6da5435..61ce30aa9cb2 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -81,6 +81,8 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 	gc = mdev_to_gc(dev);
 
+	down_read(&dev->reset_rwsem);
+
 	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
 			     sizeof(resp));
 
@@ -98,6 +100,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 		if (!err)
 			err = -EPROTO;
 
+		up_read(&dev->reset_rwsem);
 		return err;
 	}
 
@@ -118,6 +121,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&dev->reset_rwsem);
 	return 0;
 }
 
@@ -230,10 +234,13 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
 	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 	gc = mdev_to_gc(mdev);
 
+	down_read(&mdev->reset_rwsem);
+
 	/* Allocate a doorbell page index */
 	ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
 	if (ret) {
 		ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
+		up_read(&mdev->reset_rwsem);
 		return ret;
 	}
 
@@ -252,6 +259,8 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
 	list_add_tail(&ucontext->dev_list, &mdev->ucontext_list);
 	mutex_unlock(&mdev->ucontext_lock);
 
+	up_read(&mdev->reset_rwsem);
+
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index ce5c6c030fb2..29201cf3274c 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -86,6 +86,8 @@ struct mana_ib_dev {
 	/* Protects ucontext_list */
 	struct mutex ucontext_lock;
 	struct list_head ucontext_list;
+	/* Serializes resource create callbacks vs reset cleanup */
+	struct rw_semaphore reset_rwsem;
 };
 
 struct mana_ib_wq {
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index 559bb4f7c31d..7189ccd41576 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -141,6 +141,8 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
+	down_read(&dev->reset_rwsem);
+
 	mr->umem = ib_umem_get(ibdev, start, length, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
@@ -195,6 +197,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&dev->reset_rwsem);
 	return &mr->ibmr;
 
 err_dma_region:
@@ -204,6 +207,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	ib_umem_release(mr->umem);
 
 err_free:
+	up_read(&dev->reset_rwsem);
 	kfree(mr);
 	return ERR_PTR(err);
 }
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 315bc54d8ae6..d590aca9b93a 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -701,12 +701,16 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
 		      struct ib_udata *udata)
 {
 	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+	struct mana_ib_dev *mdev =
+		container_of(ibqp->device, struct mana_ib_dev, ib_dev);
 	int err;
 
 	INIT_LIST_HEAD(&qp->ucontext_list);
 
 	switch (attr->qp_type) {
 	case IB_QPT_RAW_PACKET:
+		down_read(&mdev->reset_rwsem);
+
 		/* When rwq_ind_tbl is used, it's for creating WQs for RSS */
 		if (attr->rwq_ind_tbl)
 			err = mana_ib_create_qp_rss(ibqp, ibqp->pd, attr,
@@ -724,6 +728,7 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
 			mutex_unlock(&mana_ucontext->lock);
 		}
 
+		up_read(&mdev->reset_rwsem);
 		return err;
 	case IB_QPT_RC:
 		return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata);
diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c
index 1af9869933aa..67b757cf30f9 100644
--- a/drivers/infiniband/hw/mana/wq.c
+++ b/drivers/infiniband/hw/mana/wq.c
@@ -31,6 +31,8 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 
 	ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr);
 
+	down_read(&mdev->reset_rwsem);
+
 	err = mana_ib_create_queue(mdev, ucmd.wq_buf_addr, ucmd.wq_buf_size, &wq->queue);
 	if (err) {
 		ibdev_dbg(&mdev->ib_dev,
@@ -52,9 +54,11 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&mdev->reset_rwsem);
 	return &wq->ibwq;
 
 err_free_wq:
+	up_read(&mdev->reset_rwsem);
 	kfree(wq);
 
 	return ERR_PTR(err);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ea71de39f996..3493b36426f7 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3659,15 +3659,19 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 		}
 	}
 
-	err = add_adev(gd, "eth");
+	if (!resuming)
+		err = add_adev(gd, "eth");
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 	schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
-
 out:
 	if (err) {
 		mana_remove(gd, false);
 	} else {
+		/* Notify IB layer that ports are back up after reset */
+		if (resuming && gd->resume_notify)
+			gd->resume_notify(gd->reset_notify_ctx);
+
 		dev_dbg(dev, "gd=%p, id=%u, num_ports=%d, type=%u, instance=%u\n",
 			gd, gd->dev_id.as_uint32, ac->num_ports,
 			gd->dev_id.type, gd->dev_id.instance);
@@ -3691,9 +3695,13 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 	cancel_delayed_work_sync(&ac->gf_stats_work);
 
 	/* adev currently doesn't support suspending, always remove it */
-	if (gd->adev)
+	if (gd->adev && !suspending)
 		remove_adev(gd);
 
+	/* Notify IB layer before tearing down net devices during reset */
+	if (suspending && gd->reset_notify)
+		gd->reset_notify(gd->reset_notify_ctx);
+
 	for (i = 0; i < ac->num_ports; i++) {
 		ndev = ac->ports[i];
 		if (!ndev) {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index ec17004b10c0..9187c5b4d0d1 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -249,6 +249,12 @@ struct gdma_dev {
 	struct auxiliary_device *adev;
 	bool is_suspended;
 	bool rdma_teardown;
+
+	/* Called by mana_remove() during reset to notify IB layer */
+	void (*reset_notify)(void *ctx);
+	/* Called by mana_probe() during resume to notify IB layer */
+	void (*resume_notify)(void *ctx);
+	void *reset_notify_ctx;
 };
 
 /* MANA_PAGE_SIZE is the DMA unit */
-- 
2.43.0


  parent reply	other threads:[~2026-03-07  1:47 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-07  1:47 [PATCH rdma-next 0/8] RDMA/mana_ib: Handle service reset for RDMA resources Long Li
2026-03-07  1:47 ` [PATCH rdma-next 1/8] RDMA/mana_ib: Track ucontext per device Long Li
2026-03-07  1:47 ` [PATCH rdma-next 2/8] RDMA/mana_ib: Track PD per ucontext Long Li
2026-03-07  1:47 ` [PATCH rdma-next 3/8] RDMA/mana_ib: Track CQ " Long Li
2026-03-07  1:47 ` [PATCH rdma-next 4/8] RDMA/mana_ib: Track WQ " Long Li
2026-03-07  1:47 ` [PATCH rdma-next 5/8] RDMA/mana_ib: Track QP " Long Li
2026-03-07  1:47 ` [PATCH rdma-next 6/8] RDMA/mana_ib: Track MR " Long Li
2026-03-07  1:47 ` Long Li [this message]
2026-03-07  1:47 ` [PATCH rdma-next 8/8] RDMA/mana_ib: Skip firmware commands for invalidated handles Long Li
2026-03-07 17:38 ` [PATCH rdma-next 0/8] RDMA/mana_ib: Handle service reset for RDMA resources Leon Romanovsky
2026-03-13 16:59   ` Jason Gunthorpe
2026-03-16 20:08     ` Leon Romanovsky
2026-03-17 23:43       ` [EXTERNAL] " Long Li
2026-03-18 14:49         ` Leon Romanovsky
2026-03-21  0:49           ` Long Li
2026-04-10 15:49         ` Jason Gunthorpe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260307014723.556523-8-longli@microsoft.com \
    --to=longli@microsoft.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=decui@microsoft.com \
    --cc=edumazet@google.com \
    --cc=haiyangz@microsoft.com \
    --cc=horms@kernel.org \
    --cc=jgg@ziepe.ca \
    --cc=kotaranov@microsoft.com \
    --cc=kuba@kernel.org \
    --cc=kys@microsoft.com \
    --cc=leon@kernel.org \
    --cc=linux-hyperv@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=wei.liu@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.