public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Long Li <longli@microsoft.com>
To: Long Li <longli@microsoft.com>,
	Konstantin Taranov <kotaranov@microsoft.com>,
	Jakub Kicinski <kuba@kernel.org>,
	"David S . Miller" <davem@davemloft.net>,
	Paolo Abeni <pabeni@redhat.com>,
	Eric Dumazet <edumazet@google.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	Jason Gunthorpe <jgg@ziepe.ca>, Leon Romanovsky <leon@kernel.org>,
	Haiyang Zhang <haiyangz@microsoft.com>,
	"K . Y . Srinivasan" <kys@microsoft.com>,
	Wei Liu <wei.liu@kernel.org>, Dexuan Cui <decui@microsoft.com>
Cc: Simon Horman <horms@kernel.org>,
	netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH rdma-next 7/8] RDMA/mana_ib: Notify service reset events to RDMA devices
Date: Fri,  6 Mar 2026 17:47:21 -0800	[thread overview]
Message-ID: <20260307014723.556523-8-longli@microsoft.com> (raw)
In-Reply-To: <20260307014723.556523-1-longli@microsoft.com>

Register reset_notify and resume_notify callbacks so the RDMA driver
is informed when the MANA service undergoes a reset cycle.

On reset notification:
  - Acquire reset_rwsem write lock to serialize with resource creation
  - Walk every tracked ucontext and invalidate firmware handles for
    all PD, CQ, WQ, QP, and MR resources (set to INVALID_MANA_HANDLE),
    then release the write lock
  - Revoke user doorbell mmap mappings via rdma_user_mmap_disassociate()
    so userspace cannot ring stale doorbells
  - Dispatch IB_EVENT_PORT_ERR on each port so userspace
    (e.g. DPDK) learns about the reset

On resume notification:
  - Dispatch IB_EVENT_PORT_ACTIVE on each port so userspace learns the
    ports are back up

Resource creation paths (alloc_ucontext, alloc_pd, create_cq, create_wq,
create_qp for RAW_PACKET, reg_user_mr) acquire reset_rwsem read lock to
ensure handles are not invalidated while being set up.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/cq.c               |  15 ++-
 drivers/infiniband/hw/mana/device.c           | 103 ++++++++++++++++++
 drivers/infiniband/hw/mana/main.c             |   9 ++
 drivers/infiniband/hw/mana/mana_ib.h          |   2 +
 drivers/infiniband/hw/mana/mr.c               |   4 +
 drivers/infiniband/hw/mana/qp.c               |   5 +
 drivers/infiniband/hw/mana/wq.c               |   4 +
 drivers/net/ethernet/microsoft/mana/mana_en.c |  14 ++-
 include/net/mana/gdma.h                       |   6 +
 9 files changed, 155 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index 89cf60987ff5..b054684b8de7 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -41,13 +41,17 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 			ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
 			return -EINVAL;
 		}
+	}
+
+	down_read(&mdev->reset_rwsem);
 
+	if (udata) {
 		cq->cqe = attr->cqe;
 		err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE,
 					   &cq->queue);
 		if (err) {
 			ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err);
-			return err;
+			goto err_unlock;
 		}
 
 		mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext,
@@ -56,14 +60,15 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	} else {
 		if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) {
 			ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
-			return -EINVAL;
+			err = -EINVAL;
+			goto err_unlock;
 		}
 		buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE));
 		cq->cqe = buf_size / COMP_ENTRY_SIZE;
 		err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue);
 		if (err) {
 			ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err);
-			return err;
+			goto err_unlock;
 		}
 		doorbell = mdev->gdma_dev->doorbell;
 	}
@@ -105,6 +110,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&mdev->reset_rwsem);
 	return 0;
 
 err_remove_cq_cb:
@@ -113,7 +119,8 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	mana_ib_gd_destroy_cq(mdev, cq);
 err_destroy_queue:
 	mana_ib_destroy_queue(mdev, &cq->queue);
-
+err_unlock:
+	up_read(&mdev->reset_rwsem);
 	return err;
 }
 
diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index 149e8d4d5b8e..081be31563ca 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -103,6 +103,7 @@ static int mana_ib_netdev_event(struct notifier_block *this,
 					netdev_put(ndev, &dev->dev_tracker);
 
 				return NOTIFY_OK;
+
 			default:
 				return NOTIFY_DONE;
 			}
@@ -110,6 +111,93 @@ static int mana_ib_netdev_event(struct notifier_block *this,
 	return NOTIFY_DONE;
 }
 
+/*
+ * Reset cleanup: invalidate firmware handles for all tracked user objects.
+ *
+ * Called during service reset BEFORE dispatching IB_EVENT_PORT_ERR to
+ * user-mode.
+ *
+ * Only invalidates FW handles — does NOT free kernel resources (umem, queues)
+ * or remove objects from lists. The IB core's destroy callbacks handle full
+ * resource teardown when user-space closes the uverbs FD or ib_unregister_device
+ * is called. The destroy callbacks skip FW commands when the handle is already
+ * INVALID_MANA_HANDLE.
+ *
+ * For CQs, also removes the CQ callback to prevent stale completions.
+ */
+static void mana_ib_reset_notify(void *ctx)
+{
+	struct mana_ib_dev *mdev = ctx;
+	struct mana_ib_ucontext *uctx;
+	struct mana_ib_qp *qp;
+	struct mana_ib_wq *wq;
+	struct mana_ib_cq *cq;
+	struct mana_ib_mr *mr;
+	struct mana_ib_pd *pd;
+	struct ib_event ibev;
+	int i;
+
+	down_write(&mdev->reset_rwsem);
+
+	ibdev_dbg(&mdev->ib_dev, "reset cleanup starting\n");
+
+	mutex_lock(&mdev->ucontext_lock);
+	list_for_each_entry(uctx, &mdev->ucontext_list, dev_list) {
+		mutex_lock(&uctx->lock);
+
+		list_for_each_entry(qp, &uctx->qp_list, ucontext_list)
+			qp->qp_handle = INVALID_MANA_HANDLE;
+
+		list_for_each_entry(wq, &uctx->wq_list, ucontext_list)
+			wq->rx_object = INVALID_MANA_HANDLE;
+
+		list_for_each_entry(cq, &uctx->cq_list, ucontext_list) {
+			mana_ib_remove_cq_cb(mdev, cq);
+			cq->cq_handle = INVALID_MANA_HANDLE;
+		}
+
+		list_for_each_entry(mr, &uctx->mr_list, ucontext_list)
+			mr->mr_handle = INVALID_MANA_HANDLE;
+
+		list_for_each_entry(pd, &uctx->pd_list, ucontext_list)
+			pd->pd_handle = INVALID_MANA_HANDLE;
+
+		uctx->doorbell = INVALID_DOORBELL;
+
+		mutex_unlock(&uctx->lock);
+	}
+	mutex_unlock(&mdev->ucontext_lock);
+
+	up_write(&mdev->reset_rwsem);
+
+	/* Revoke user doorbell mappings so userspace cannot ring
+	 * stale doorbells after firmware handles are invalidated.
+	 */
+	rdma_user_mmap_disassociate(&mdev->ib_dev);
+
+	/* Notify userspace (e.g. DPDK) that the port is down */
+	for (i = 0; i < mdev->ib_dev.phys_port_cnt; i++) {
+		ibev.device = &mdev->ib_dev;
+		ibev.element.port_num = i + 1;
+		ibev.event = IB_EVENT_PORT_ERR;
+		ib_dispatch_event(&ibev);
+	}
+}
+
+static void mana_ib_resume_notify(void *ctx)
+{
+	struct mana_ib_dev *dev = ctx;
+	struct ib_event ibev;
+	int i;
+
+	for (i = 0; i < dev->ib_dev.phys_port_cnt; i++) {
+		ibev.device = &dev->ib_dev;
+		ibev.element.port_num = i + 1;
+		ibev.event = IB_EVENT_PORT_ACTIVE;
+		ib_dispatch_event(&ibev);
+	}
+}
+
 static int mana_ib_probe(struct auxiliary_device *adev,
 			 const struct auxiliary_device_id *id)
 {
@@ -134,6 +222,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 	xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ);
 	mutex_init(&dev->ucontext_lock);
 	INIT_LIST_HEAD(&dev->ucontext_list);
+	init_rwsem(&dev->reset_rwsem);
 
 	if (mana_ib_is_rnic(dev)) {
 		dev->ib_dev.phys_port_cnt = 1;
@@ -216,6 +305,15 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 
 	dev_set_drvdata(&adev->dev, dev);
 
+	/* ETH device persists across reset — use callback for cleanup.
+	 * RNIC device is removed/re-added, so its cleanup happens in remove.
+	 */
+	if (!mana_ib_is_rnic(dev)) {
+		mdev->reset_notify = mana_ib_reset_notify;
+		mdev->resume_notify = mana_ib_resume_notify;
+		mdev->reset_notify_ctx = dev;
+	}
+
 	return 0;
 
 deallocate_pool:
@@ -242,6 +340,11 @@ static void mana_ib_remove(struct auxiliary_device *adev)
 	if (mana_ib_is_rnic(dev))
 		mana_drain_gsi_sqs(dev);
 
+	if (!mana_ib_is_rnic(dev)) {
+		dev->gdma_dev->reset_notify = NULL;
+		dev->gdma_dev->resume_notify = NULL;
+		dev->gdma_dev->reset_notify_ctx = NULL;
+	}
 	ib_unregister_device(&dev->ib_dev);
 	dma_pool_destroy(dev->av_pool);
 	if (mana_ib_is_rnic(dev)) {
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index f739e6da5435..61ce30aa9cb2 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -81,6 +81,8 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 	gc = mdev_to_gc(dev);
 
+	down_read(&dev->reset_rwsem);
+
 	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
 			     sizeof(resp));
 
@@ -98,6 +100,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 		if (!err)
 			err = -EPROTO;
 
+		up_read(&dev->reset_rwsem);
 		return err;
 	}
 
@@ -118,6 +121,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&dev->reset_rwsem);
 	return 0;
 }
 
@@ -230,10 +234,13 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
 	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 	gc = mdev_to_gc(mdev);
 
+	down_read(&mdev->reset_rwsem);
+
 	/* Allocate a doorbell page index */
 	ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
 	if (ret) {
 		ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
+		up_read(&mdev->reset_rwsem);
 		return ret;
 	}
 
@@ -252,6 +259,8 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
 	list_add_tail(&ucontext->dev_list, &mdev->ucontext_list);
 	mutex_unlock(&mdev->ucontext_lock);
 
+	up_read(&mdev->reset_rwsem);
+
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index ce5c6c030fb2..29201cf3274c 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -86,6 +86,8 @@ struct mana_ib_dev {
 	/* Protects ucontext_list */
 	struct mutex ucontext_lock;
 	struct list_head ucontext_list;
+	/* Serializes resource create callbacks vs reset cleanup */
+	struct rw_semaphore reset_rwsem;
 };
 
 struct mana_ib_wq {
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index 559bb4f7c31d..7189ccd41576 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -141,6 +141,8 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
+	down_read(&dev->reset_rwsem);
+
 	mr->umem = ib_umem_get(ibdev, start, length, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
@@ -195,6 +197,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&dev->reset_rwsem);
 	return &mr->ibmr;
 
 err_dma_region:
@@ -204,6 +207,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	ib_umem_release(mr->umem);
 
 err_free:
+	up_read(&dev->reset_rwsem);
 	kfree(mr);
 	return ERR_PTR(err);
 }
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 315bc54d8ae6..d590aca9b93a 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -701,12 +701,16 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
 		      struct ib_udata *udata)
 {
 	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+	struct mana_ib_dev *mdev =
+		container_of(ibqp->device, struct mana_ib_dev, ib_dev);
 	int err;
 
 	INIT_LIST_HEAD(&qp->ucontext_list);
 
 	switch (attr->qp_type) {
 	case IB_QPT_RAW_PACKET:
+		down_read(&mdev->reset_rwsem);
+
 		/* When rwq_ind_tbl is used, it's for creating WQs for RSS */
 		if (attr->rwq_ind_tbl)
 			err = mana_ib_create_qp_rss(ibqp, ibqp->pd, attr,
@@ -724,6 +728,7 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
 			mutex_unlock(&mana_ucontext->lock);
 		}
 
+		up_read(&mdev->reset_rwsem);
 		return err;
 	case IB_QPT_RC:
 		return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata);
diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c
index 1af9869933aa..67b757cf30f9 100644
--- a/drivers/infiniband/hw/mana/wq.c
+++ b/drivers/infiniband/hw/mana/wq.c
@@ -31,6 +31,8 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 
 	ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr);
 
+	down_read(&mdev->reset_rwsem);
+
 	err = mana_ib_create_queue(mdev, ucmd.wq_buf_addr, ucmd.wq_buf_size, &wq->queue);
 	if (err) {
 		ibdev_dbg(&mdev->ib_dev,
@@ -52,9 +54,11 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 		mutex_unlock(&mana_ucontext->lock);
 	}
 
+	up_read(&mdev->reset_rwsem);
 	return &wq->ibwq;
 
 err_free_wq:
+	up_read(&mdev->reset_rwsem);
 	kfree(wq);
 
 	return ERR_PTR(err);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ea71de39f996..3493b36426f7 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3659,15 +3659,19 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 		}
 	}
 
-	err = add_adev(gd, "eth");
+	if (!resuming)
+		err = add_adev(gd, "eth");
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 	schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
-
 out:
 	if (err) {
 		mana_remove(gd, false);
 	} else {
+		/* Notify IB layer that ports are back up after reset */
+		if (resuming && gd->resume_notify)
+			gd->resume_notify(gd->reset_notify_ctx);
+
 		dev_dbg(dev, "gd=%p, id=%u, num_ports=%d, type=%u, instance=%u\n",
 			gd, gd->dev_id.as_uint32, ac->num_ports,
 			gd->dev_id.type, gd->dev_id.instance);
@@ -3691,9 +3695,13 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 	cancel_delayed_work_sync(&ac->gf_stats_work);
 
 	/* adev currently doesn't support suspending, always remove it */
-	if (gd->adev)
+	if (gd->adev && !suspending)
 		remove_adev(gd);
 
+	/* Notify IB layer before tearing down net devices during reset */
+	if (suspending && gd->reset_notify)
+		gd->reset_notify(gd->reset_notify_ctx);
+
 	for (i = 0; i < ac->num_ports; i++) {
 		ndev = ac->ports[i];
 		if (!ndev) {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index ec17004b10c0..9187c5b4d0d1 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -249,6 +249,12 @@ struct gdma_dev {
 	struct auxiliary_device *adev;
 	bool is_suspended;
 	bool rdma_teardown;
+
+	/* Called by mana_remove() during reset to notify IB layer */
+	void (*reset_notify)(void *ctx);
+	/* Called by mana_probe() during resume to notify IB layer */
+	void (*resume_notify)(void *ctx);
+	void *reset_notify_ctx;
 };
 
 /* MANA_PAGE_SIZE is the DMA unit */
-- 
2.43.0


  parent reply	other threads:[~2026-03-07  1:47 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-07  1:47 [PATCH rdma-next 0/8] RDMA/mana_ib: Handle service reset for RDMA resources Long Li
2026-03-07  1:47 ` [PATCH rdma-next 1/8] RDMA/mana_ib: Track ucontext per device Long Li
2026-03-07  1:47 ` [PATCH rdma-next 2/8] RDMA/mana_ib: Track PD per ucontext Long Li
2026-03-07  1:47 ` [PATCH rdma-next 3/8] RDMA/mana_ib: Track CQ " Long Li
2026-03-07  1:47 ` [PATCH rdma-next 4/8] RDMA/mana_ib: Track WQ " Long Li
2026-03-07  1:47 ` [PATCH rdma-next 5/8] RDMA/mana_ib: Track QP " Long Li
2026-03-07  1:47 ` [PATCH rdma-next 6/8] RDMA/mana_ib: Track MR " Long Li
2026-03-07  1:47 ` Long Li [this message]
2026-03-07  1:47 ` [PATCH rdma-next 8/8] RDMA/mana_ib: Skip firmware commands for invalidated handles Long Li
2026-03-07 17:38 ` [PATCH rdma-next 0/8] RDMA/mana_ib: Handle service reset for RDMA resources Leon Romanovsky
2026-03-13 16:59   ` Jason Gunthorpe
2026-03-16 20:08     ` Leon Romanovsky
2026-03-17 23:43       ` [EXTERNAL] " Long Li
2026-03-18 14:49         ` Leon Romanovsky
2026-03-21  0:49           ` Long Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260307014723.556523-8-longli@microsoft.com \
    --to=longli@microsoft.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=decui@microsoft.com \
    --cc=edumazet@google.com \
    --cc=haiyangz@microsoft.com \
    --cc=horms@kernel.org \
    --cc=jgg@ziepe.ca \
    --cc=kotaranov@microsoft.com \
    --cc=kuba@kernel.org \
    --cc=kys@microsoft.com \
    --cc=leon@kernel.org \
    --cc=linux-hyperv@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=wei.liu@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox