From: Long Li <longli@microsoft.com>
To: Long Li <longli@microsoft.com>,
Konstantin Taranov <kotaranov@microsoft.com>,
Jakub Kicinski <kuba@kernel.org>,
"David S . Miller" <davem@davemloft.net>,
Paolo Abeni <pabeni@redhat.com>,
Eric Dumazet <edumazet@google.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
Jason Gunthorpe <jgg@ziepe.ca>, Leon Romanovsky <leon@kernel.org>,
Haiyang Zhang <haiyangz@microsoft.com>,
"K . Y . Srinivasan" <kys@microsoft.com>,
Wei Liu <wei.liu@kernel.org>, Dexuan Cui <decui@microsoft.com>
Cc: Simon Horman <horms@kernel.org>,
netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH rdma-next 7/8] RDMA/mana_ib: Notify service reset events to RDMA devices
Date: Fri, 6 Mar 2026 17:47:21 -0800 [thread overview]
Message-ID: <20260307014723.556523-8-longli@microsoft.com> (raw)
In-Reply-To: <20260307014723.556523-1-longli@microsoft.com>
Register reset_notify and resume_notify callbacks so the RDMA driver
is informed when the MANA service undergoes a reset cycle.
On reset notification:
- Acquire reset_rwsem write lock to serialize with resource creation
- Walk every tracked ucontext and invalidate firmware handles for
all PD, CQ, WQ, QP, and MR resources (set to INVALID_MANA_HANDLE)
- Dispatch IB_EVENT_PORT_ERR for each port (after revoking user doorbell
  mmap mappings) so userspace (e.g. DPDK) learns about the reset
The reset_rwsem write lock is released at the end of reset cleanup, once
all firmware handles have been invalidated, unblocking new resource
creation.
On resume notification:
- Dispatch IB_EVENT_PORT_ACTIVE for each port so userspace learns the
  device is available again
Resource creation paths (alloc_pd, create_cq, create_wq, create_qp for
RAW_PACKET, reg_user_mr) acquire reset_rwsem read lock to ensure handles
are not invalidated while being set up.
Signed-off-by: Long Li <longli@microsoft.com>
---
drivers/infiniband/hw/mana/cq.c | 15 ++-
drivers/infiniband/hw/mana/device.c | 103 ++++++++++++++++++
drivers/infiniband/hw/mana/main.c | 9 ++
drivers/infiniband/hw/mana/mana_ib.h | 2 +
drivers/infiniband/hw/mana/mr.c | 4 +
drivers/infiniband/hw/mana/qp.c | 5 +
drivers/infiniband/hw/mana/wq.c | 4 +
drivers/net/ethernet/microsoft/mana/mana_en.c | 14 ++-
include/net/mana/gdma.h | 6 +
9 files changed, 155 insertions(+), 7 deletions(-)
diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index 89cf60987ff5..b054684b8de7 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -41,13 +41,17 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
return -EINVAL;
}
+ }
+
+ down_read(&mdev->reset_rwsem);
+ if (udata) {
cq->cqe = attr->cqe;
err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE,
&cq->queue);
if (err) {
ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err);
- return err;
+ goto err_unlock;
}
mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext,
@@ -56,14 +60,15 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
} else {
if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) {
ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
- return -EINVAL;
+ err = -EINVAL;
+ goto err_unlock;
}
buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE));
cq->cqe = buf_size / COMP_ENTRY_SIZE;
err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue);
if (err) {
ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err);
- return err;
+ goto err_unlock;
}
doorbell = mdev->gdma_dev->doorbell;
}
@@ -105,6 +110,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&mdev->reset_rwsem);
return 0;
err_remove_cq_cb:
@@ -113,7 +119,8 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
mana_ib_gd_destroy_cq(mdev, cq);
err_destroy_queue:
mana_ib_destroy_queue(mdev, &cq->queue);
-
+err_unlock:
+ up_read(&mdev->reset_rwsem);
return err;
}
diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index 149e8d4d5b8e..081be31563ca 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -103,6 +103,7 @@ static int mana_ib_netdev_event(struct notifier_block *this,
netdev_put(ndev, &dev->dev_tracker);
return NOTIFY_OK;
+
default:
return NOTIFY_DONE;
}
@@ -110,6 +111,93 @@ static int mana_ib_netdev_event(struct notifier_block *this,
return NOTIFY_DONE;
}
+/*
+ * Reset cleanup: invalidate firmware handles for all tracked user objects.
+ *
+ * Called during service reset BEFORE dispatching IB_EVENT_PORT_ERR to
+ * user-mode.
+ *
+ * Only invalidates FW handles — does NOT free kernel resources (umem, queues)
+ * or remove objects from lists. The IB core's destroy callbacks handle full
+ * resource teardown when user-space closes the uverbs FD or ib_unregister_device
+ * is called. The destroy callbacks skip FW commands when the handle is already
+ * INVALID_MANA_HANDLE.
+ *
+ * For CQs, also removes the CQ callback to prevent stale completions.
+ */
+static void mana_ib_reset_notify(void *ctx)
+{
+ struct mana_ib_dev *mdev = ctx;
+ struct mana_ib_ucontext *uctx;
+ struct mana_ib_qp *qp;
+ struct mana_ib_wq *wq;
+ struct mana_ib_cq *cq;
+ struct mana_ib_mr *mr;
+ struct mana_ib_pd *pd;
+ struct ib_event ibev;
+ int i;
+
+ down_write(&mdev->reset_rwsem);
+
+ ibdev_dbg(&mdev->ib_dev, "reset cleanup starting\n");
+
+ mutex_lock(&mdev->ucontext_lock);
+ list_for_each_entry(uctx, &mdev->ucontext_list, dev_list) {
+ mutex_lock(&uctx->lock);
+
+ list_for_each_entry(qp, &uctx->qp_list, ucontext_list)
+ qp->qp_handle = INVALID_MANA_HANDLE;
+
+ list_for_each_entry(wq, &uctx->wq_list, ucontext_list)
+ wq->rx_object = INVALID_MANA_HANDLE;
+
+ list_for_each_entry(cq, &uctx->cq_list, ucontext_list) {
+ mana_ib_remove_cq_cb(mdev, cq);
+ cq->cq_handle = INVALID_MANA_HANDLE;
+ }
+
+ list_for_each_entry(mr, &uctx->mr_list, ucontext_list)
+ mr->mr_handle = INVALID_MANA_HANDLE;
+
+ list_for_each_entry(pd, &uctx->pd_list, ucontext_list)
+ pd->pd_handle = INVALID_MANA_HANDLE;
+
+ uctx->doorbell = INVALID_DOORBELL;
+
+ mutex_unlock(&uctx->lock);
+ }
+ mutex_unlock(&mdev->ucontext_lock);
+
+ up_write(&mdev->reset_rwsem);
+
+ /* Revoke user doorbell mappings so userspace cannot ring
+ * stale doorbells after firmware handles are invalidated.
+ */
+ rdma_user_mmap_disassociate(&mdev->ib_dev);
+
+ /* Notify userspace (e.g. DPDK) that the port is down */
+ for (i = 0; i < mdev->ib_dev.phys_port_cnt; i++) {
+ ibev.device = &mdev->ib_dev;
+ ibev.element.port_num = i + 1;
+ ibev.event = IB_EVENT_PORT_ERR;
+ ib_dispatch_event(&ibev);
+ }
+}
+
+static void mana_ib_resume_notify(void *ctx)
+{
+ struct mana_ib_dev *dev = ctx;
+ struct ib_event ibev;
+ int i;
+
+ for (i = 0; i < dev->ib_dev.phys_port_cnt; i++) {
+ ibev.device = &dev->ib_dev;
+ ibev.element.port_num = i + 1;
+ ibev.event = IB_EVENT_PORT_ACTIVE;
+ ib_dispatch_event(&ibev);
+ }
+}
+
static int mana_ib_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{
@@ -134,6 +222,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ);
mutex_init(&dev->ucontext_lock);
INIT_LIST_HEAD(&dev->ucontext_list);
+ init_rwsem(&dev->reset_rwsem);
if (mana_ib_is_rnic(dev)) {
dev->ib_dev.phys_port_cnt = 1;
@@ -216,6 +305,15 @@ static int mana_ib_probe(struct auxiliary_device *adev,
dev_set_drvdata(&adev->dev, dev);
+ /* ETH device persists across reset — use callback for cleanup.
+ * RNIC device is removed/re-added, so its cleanup happens in remove.
+ */
+ if (!mana_ib_is_rnic(dev)) {
+ mdev->reset_notify = mana_ib_reset_notify;
+ mdev->resume_notify = mana_ib_resume_notify;
+ mdev->reset_notify_ctx = dev;
+ }
+
return 0;
deallocate_pool:
@@ -242,6 +340,11 @@ static void mana_ib_remove(struct auxiliary_device *adev)
if (mana_ib_is_rnic(dev))
mana_drain_gsi_sqs(dev);
+ if (!mana_ib_is_rnic(dev)) {
+ dev->gdma_dev->reset_notify = NULL;
+ dev->gdma_dev->resume_notify = NULL;
+ dev->gdma_dev->reset_notify_ctx = NULL;
+ }
ib_unregister_device(&dev->ib_dev);
dma_pool_destroy(dev->av_pool);
if (mana_ib_is_rnic(dev)) {
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index f739e6da5435..61ce30aa9cb2 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -81,6 +81,8 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
gc = mdev_to_gc(dev);
+ down_read(&dev->reset_rwsem);
+
mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
sizeof(resp));
@@ -98,6 +100,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
if (!err)
err = -EPROTO;
+ up_read(&dev->reset_rwsem);
return err;
}
@@ -118,6 +121,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&dev->reset_rwsem);
return 0;
}
@@ -230,10 +234,13 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
gc = mdev_to_gc(mdev);
+ down_read(&mdev->reset_rwsem);
+
/* Allocate a doorbell page index */
ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
if (ret) {
ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
+ up_read(&mdev->reset_rwsem);
return ret;
}
@@ -252,6 +259,8 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
list_add_tail(&ucontext->dev_list, &mdev->ucontext_list);
mutex_unlock(&mdev->ucontext_lock);
+ up_read(&mdev->reset_rwsem);
+
return 0;
}
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index ce5c6c030fb2..29201cf3274c 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -86,6 +86,8 @@ struct mana_ib_dev {
/* Protects ucontext_list */
struct mutex ucontext_lock;
struct list_head ucontext_list;
+ /* Serializes resource create callbacks vs reset cleanup */
+ struct rw_semaphore reset_rwsem;
};
struct mana_ib_wq {
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index 559bb4f7c31d..7189ccd41576 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -141,6 +141,8 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
+ down_read(&dev->reset_rwsem);
+
mr->umem = ib_umem_get(ibdev, start, length, access_flags);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
@@ -195,6 +197,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&dev->reset_rwsem);
return &mr->ibmr;
err_dma_region:
@@ -204,6 +207,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
ib_umem_release(mr->umem);
err_free:
+ up_read(&dev->reset_rwsem);
kfree(mr);
return ERR_PTR(err);
}
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 315bc54d8ae6..d590aca9b93a 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -701,12 +701,16 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
struct ib_udata *udata)
{
struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+ struct mana_ib_dev *mdev =
+ container_of(ibqp->device, struct mana_ib_dev, ib_dev);
int err;
INIT_LIST_HEAD(&qp->ucontext_list);
switch (attr->qp_type) {
case IB_QPT_RAW_PACKET:
+ down_read(&mdev->reset_rwsem);
+
/* When rwq_ind_tbl is used, it's for creating WQs for RSS */
if (attr->rwq_ind_tbl)
err = mana_ib_create_qp_rss(ibqp, ibqp->pd, attr,
@@ -724,6 +728,7 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&mdev->reset_rwsem);
return err;
case IB_QPT_RC:
return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata);
diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c
index 1af9869933aa..67b757cf30f9 100644
--- a/drivers/infiniband/hw/mana/wq.c
+++ b/drivers/infiniband/hw/mana/wq.c
@@ -31,6 +31,8 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr);
+ down_read(&mdev->reset_rwsem);
+
err = mana_ib_create_queue(mdev, ucmd.wq_buf_addr, ucmd.wq_buf_size, &wq->queue);
if (err) {
ibdev_dbg(&mdev->ib_dev,
@@ -52,9 +54,11 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&mdev->reset_rwsem);
return &wq->ibwq;
err_free_wq:
+ up_read(&mdev->reset_rwsem);
kfree(wq);
return ERR_PTR(err);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ea71de39f996..3493b36426f7 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3659,15 +3659,19 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
}
}
- err = add_adev(gd, "eth");
+ if (!resuming)
+ err = add_adev(gd, "eth");
INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
-
out:
if (err) {
mana_remove(gd, false);
} else {
+ /* Notify IB layer that ports are back up after reset */
+ if (resuming && gd->resume_notify)
+ gd->resume_notify(gd->reset_notify_ctx);
+
dev_dbg(dev, "gd=%p, id=%u, num_ports=%d, type=%u, instance=%u\n",
gd, gd->dev_id.as_uint32, ac->num_ports,
gd->dev_id.type, gd->dev_id.instance);
@@ -3691,9 +3695,13 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
cancel_delayed_work_sync(&ac->gf_stats_work);
/* adev currently doesn't support suspending, always remove it */
- if (gd->adev)
+ if (gd->adev && !suspending)
remove_adev(gd);
+ /* Notify IB layer before tearing down net devices during reset */
+ if (suspending && gd->reset_notify)
+ gd->reset_notify(gd->reset_notify_ctx);
+
for (i = 0; i < ac->num_ports; i++) {
ndev = ac->ports[i];
if (!ndev) {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index ec17004b10c0..9187c5b4d0d1 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -249,6 +249,12 @@ struct gdma_dev {
struct auxiliary_device *adev;
bool is_suspended;
bool rdma_teardown;
+
+ /* Called by mana_remove() during reset to notify IB layer */
+ void (*reset_notify)(void *ctx);
+ /* Called by mana_probe() during resume to notify IB layer */
+ void (*resume_notify)(void *ctx);
+ void *reset_notify_ctx;
};
/* MANA_PAGE_SIZE is the DMA unit */
--
2.43.0
next prev parent reply other threads:[~2026-03-07 1:47 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-07 1:47 [PATCH rdma-next 0/8] RDMA/mana_ib: Handle service reset for RDMA resources Long Li
2026-03-07 1:47 ` [PATCH rdma-next 1/8] RDMA/mana_ib: Track ucontext per device Long Li
2026-03-07 1:47 ` [PATCH rdma-next 2/8] RDMA/mana_ib: Track PD per ucontext Long Li
2026-03-07 1:47 ` [PATCH rdma-next 3/8] RDMA/mana_ib: Track CQ " Long Li
2026-03-07 1:47 ` [PATCH rdma-next 4/8] RDMA/mana_ib: Track WQ " Long Li
2026-03-07 1:47 ` [PATCH rdma-next 5/8] RDMA/mana_ib: Track QP " Long Li
2026-03-07 1:47 ` [PATCH rdma-next 6/8] RDMA/mana_ib: Track MR " Long Li
2026-03-07 1:47 ` Long Li [this message]
2026-03-07 1:47 ` [PATCH rdma-next 8/8] RDMA/mana_ib: Skip firmware commands for invalidated handles Long Li
2026-03-07 17:38 ` [PATCH rdma-next 0/8] RDMA/mana_ib: Handle service reset for RDMA resources Leon Romanovsky
2026-03-13 16:59 ` Jason Gunthorpe
2026-03-16 20:08 ` Leon Romanovsky
2026-03-17 23:43 ` [EXTERNAL] " Long Li
2026-03-18 14:49 ` Leon Romanovsky
2026-03-21 0:49 ` Long Li
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260307014723.556523-8-longli@microsoft.com \
--to=longli@microsoft.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=decui@microsoft.com \
--cc=edumazet@google.com \
--cc=haiyangz@microsoft.com \
--cc=horms@kernel.org \
--cc=jgg@ziepe.ca \
--cc=kotaranov@microsoft.com \
--cc=kuba@kernel.org \
--cc=kys@microsoft.com \
--cc=leon@kernel.org \
--cc=linux-hyperv@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=wei.liu@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox