* [PATCH rdma-next 12/50] RDMA/mlx4: Inline mlx4_ib_get_cq_umem into callers
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
Inline the mlx4_ib_get_cq_umem() helper function into its two call sites
(mlx4_ib_create_cq() and mlx4_alloc_resize_umem()) to prepare for the
transition to the modern CQ creation interface.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx4/cq.c | 108 ++++++++++++++++++++++------------------
1 file changed, 60 insertions(+), 48 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index c592374f4a58..94e9ff45725a 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -135,45 +135,6 @@ static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *
mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
}
-static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev,
- struct mlx4_ib_cq_buf *buf,
- struct ib_umem **umem, u64 buf_addr, int cqe)
-{
- int err;
- int cqe_size = dev->dev->caps.cqe_size;
- int shift;
- int n;
-
- *umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size,
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(*umem))
- return PTR_ERR(*umem);
-
- shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n);
- if (shift < 0) {
- err = shift;
- goto err_buf;
- }
-
- err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt);
- if (err)
- goto err_buf;
-
- err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem);
- if (err)
- goto err_mtt;
-
- return 0;
-
-err_mtt:
- mlx4_mtt_cleanup(dev->dev, &buf->mtt);
-
-err_buf:
- ib_umem_release(*umem);
-
- return err;
-}
-
#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct uverbs_attr_bundle *attrs)
@@ -208,6 +169,9 @@ int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
if (udata) {
struct mlx4_ib_create_cq ucmd;
+ int cqe_size = dev->dev->caps.cqe_size;
+ int shift;
+ int n;
if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
err = -EFAULT;
@@ -215,10 +179,28 @@ int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
}
buf_addr = (void *)(unsigned long)ucmd.buf_addr;
- err = mlx4_ib_get_cq_umem(dev, &cq->buf, &cq->umem,
- ucmd.buf_addr, entries);
- if (err)
+
+ cq->umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
+ entries * cqe_size,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(cq->umem)) {
+ err = PTR_ERR(cq->umem);
goto err_cq;
+ }
+
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(cq->umem, 0, &n);
+ if (shift < 0) {
+ err = shift;
+ goto err_umem;
+ }
+
+ err = mlx4_mtt_init(dev->dev, n, shift, &cq->buf.mtt);
+ if (err)
+ goto err_umem;
+
+ err = mlx4_ib_umem_write_mtt(dev, &cq->buf.mtt, cq->umem);
+ if (err)
+ goto err_mtt;
err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &cq->db);
if (err)
@@ -281,6 +263,7 @@ int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
err_mtt:
mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+err_umem:
ib_umem_release(cq->umem);
if (!udata)
mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
@@ -320,6 +303,9 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq
int entries, struct ib_udata *udata)
{
struct mlx4_ib_resize_cq ucmd;
+ int cqe_size = dev->dev->caps.cqe_size;
+ int shift;
+ int n;
int err;
if (cq->resize_umem)
@@ -332,17 +318,43 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq
if (!cq->resize_buf)
return -ENOMEM;
- err = mlx4_ib_get_cq_umem(dev, &cq->resize_buf->buf, &cq->resize_umem,
- ucmd.buf_addr, entries);
- if (err) {
- kfree(cq->resize_buf);
- cq->resize_buf = NULL;
- return err;
+ cq->resize_umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
+ entries * cqe_size,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(cq->resize_umem)) {
+ err = PTR_ERR(cq->resize_umem);
+ goto err_buf;
+ }
+
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(cq->resize_umem, 0, &n);
+ if (shift < 0) {
+ err = shift;
+ goto err_umem;
}
+ err = mlx4_mtt_init(dev->dev, n, shift, &cq->resize_buf->buf.mtt);
+ if (err)
+ goto err_umem;
+
+ err = mlx4_ib_umem_write_mtt(dev, &cq->resize_buf->buf.mtt,
+ cq->resize_umem);
+ if (err)
+ goto err_mtt;
+
cq->resize_buf->cqe = entries - 1;
return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt);
+
+err_umem:
+ ib_umem_release(cq->resize_umem);
+
+err_buf:
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ return err;
}
static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 14/50] RDMA/mlx4: Remove unused create_flags field from CQ structure
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
The CQ creation flags do not need to be cached, as they are consumed
immediately at the point where they are stored. Use attr->flags directly,
remove the redundant field and reclaim 4 bytes.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx4/cq.c | 4 +---
drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 -
2 files changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 4bee08317620..83169060d120 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -165,7 +165,6 @@ int mlx4_ib_create_user_cq(struct ib_cq *ibcq,
cq->ibcq.cqe = entries - 1;
mutex_init(&cq->resize_mutex);
spin_lock_init(&cq->lock);
- cq->create_flags = attr->flags;
INIT_LIST_HEAD(&cq->send_qp_list);
INIT_LIST_HEAD(&cq->recv_qp_list);
@@ -208,8 +207,7 @@ int mlx4_ib_create_user_cq(struct ib_cq *ibcq,
err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, &context->uar,
cq->db.dma, &cq->mcq, vector, 0,
- !!(cq->create_flags &
- IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION),
+ attr->flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION,
buf_addr, true);
if (err)
goto err_dbmap;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 96563c0836ce..6a7ed5225c7d 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -122,7 +122,6 @@ struct mlx4_ib_cq {
spinlock_t lock;
struct mutex resize_mutex;
struct ib_umem *resize_umem;
- int create_flags;
/* List of qps that it serves.*/
struct list_head send_qp_list;
struct list_head recv_qp_list;
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 13/50] RDMA/mlx4: Introduce a modern CQ creation interface
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
The uverbs CQ creation UAPI allows users to supply their own umem when
creating a CQ. Update mlx4 to support this model while preserving compatibility
with the legacy interface that allocates umem internally.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx4/cq.c | 191 ++++++++++++++++++++---------------
drivers/infiniband/hw/mlx4/main.c | 1 +
drivers/infiniband/hw/mlx4/mlx4_ib.h | 4 +-
3 files changed, 111 insertions(+), 85 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 94e9ff45725a..4bee08317620 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -136,8 +136,9 @@ static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *
}
#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
-int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct uverbs_attr_bundle *attrs)
+int mlx4_ib_create_user_cq(struct ib_cq *ibcq,
+ const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_udata *udata = &attrs->driver_udata;
struct ib_device *ibdev = ibcq->device;
@@ -145,13 +146,16 @@ int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
int vector = attr->comp_vector;
struct mlx4_ib_dev *dev = to_mdev(ibdev);
struct mlx4_ib_cq *cq = to_mcq(ibcq);
- struct mlx4_uar *uar;
+ struct mlx4_ib_create_cq ucmd;
+ int cqe_size = dev->dev->caps.cqe_size;
void *buf_addr;
+ int shift;
+ int n;
int err;
struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
udata, struct mlx4_ib_ucontext, ibucontext);
- if (entries < 1 || entries > dev->dev->caps.max_cqes)
+ if (attr->cqe > dev->dev->caps.max_cqes)
return -EINVAL;
if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
@@ -161,95 +165,63 @@ int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
cq->ibcq.cqe = entries - 1;
mutex_init(&cq->resize_mutex);
spin_lock_init(&cq->lock);
- cq->resize_buf = NULL;
- cq->resize_umem = NULL;
cq->create_flags = attr->flags;
INIT_LIST_HEAD(&cq->send_qp_list);
INIT_LIST_HEAD(&cq->recv_qp_list);
- if (udata) {
- struct mlx4_ib_create_cq ucmd;
- int cqe_size = dev->dev->caps.cqe_size;
- int shift;
- int n;
-
- if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
- err = -EFAULT;
- goto err_cq;
- }
-
- buf_addr = (void *)(unsigned long)ucmd.buf_addr;
-
- cq->umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
- entries * cqe_size,
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(cq->umem)) {
- err = PTR_ERR(cq->umem);
- goto err_cq;
- }
-
- shift = mlx4_ib_umem_calc_optimal_mtt_size(cq->umem, 0, &n);
- if (shift < 0) {
- err = shift;
- goto err_umem;
- }
-
- err = mlx4_mtt_init(dev->dev, n, shift, &cq->buf.mtt);
- if (err)
- goto err_umem;
-
- err = mlx4_ib_umem_write_mtt(dev, &cq->buf.mtt, cq->umem);
- if (err)
- goto err_mtt;
+ if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+ err = -EFAULT;
+ goto err_cq;
+ }
- err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &cq->db);
- if (err)
- goto err_mtt;
+ buf_addr = (void *)(unsigned long)ucmd.buf_addr;
- uar = &context->uar;
- cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS;
- } else {
- err = mlx4_db_alloc(dev->dev, &cq->db, 1);
- if (err)
- goto err_cq;
+ if (!ibcq->umem)
+ ibcq->umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
+ entries * cqe_size,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(ibcq->umem)) {
+ err = PTR_ERR(ibcq->umem);
+ goto err_cq;
+ }
- cq->mcq.set_ci_db = cq->db.db;
- cq->mcq.arm_db = cq->db.db + 1;
- *cq->mcq.set_ci_db = 0;
- *cq->mcq.arm_db = 0;
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(cq->ibcq.umem, 0, &n);
+ if (shift < 0) {
+ err = shift;
+ goto err_cq;
+ }
- err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries);
- if (err)
- goto err_db;
+ err = mlx4_mtt_init(dev->dev, n, shift, &cq->buf.mtt);
+ if (err)
+ goto err_cq;
- buf_addr = &cq->buf.buf;
+ err = mlx4_ib_umem_write_mtt(dev, &cq->buf.mtt, cq->ibcq.umem);
+ if (err)
+ goto err_mtt;
- uar = &dev->priv_uar;
- cq->mcq.usage = MLX4_RES_USAGE_DRIVER;
- }
+ err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &cq->db);
+ if (err)
+ goto err_mtt;
if (dev->eq_table)
vector = dev->eq_table[vector % ibdev->num_comp_vectors];
- err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, cq->db.dma,
- &cq->mcq, vector, 0,
+ err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, &context->uar,
+ cq->db.dma, &cq->mcq, vector, 0,
!!(cq->create_flags &
IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION),
- buf_addr, !!udata);
+ buf_addr, true);
if (err)
goto err_dbmap;
- if (udata)
- cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
- else
- cq->mcq.comp = mlx4_ib_cq_comp;
+ cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
cq->mcq.event = mlx4_ib_cq_event;
+ cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS;
- if (udata)
- if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
- err = -EFAULT;
- goto err_cq_free;
- }
+ if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
+ err = -EFAULT;
+ goto err_cq_free;
+ }
return 0;
@@ -257,21 +229,72 @@ int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
mlx4_cq_free(dev->dev, &cq->mcq);
err_dbmap:
- if (udata)
- mlx4_ib_db_unmap_user(context, &cq->db);
+ mlx4_ib_db_unmap_user(context, &cq->db);
err_mtt:
mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+ /* UMEM is released by ib_core */
-err_umem:
- ib_umem_release(cq->umem);
- if (!udata)
- mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+err_cq:
+ return err;
+}
+
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_device *ibdev = ibcq->device;
+ int entries = attr->cqe;
+ int vector = attr->comp_vector;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ void *buf_addr;
+ int err;
+
+ if (attr->cqe > dev->dev->caps.max_cqes)
+ return -EINVAL;
+
+ entries = roundup_pow_of_two(entries + 1);
+ cq->ibcq.cqe = entries - 1;
+ mutex_init(&cq->resize_mutex);
+ spin_lock_init(&cq->lock);
+ INIT_LIST_HEAD(&cq->send_qp_list);
+ INIT_LIST_HEAD(&cq->recv_qp_list);
+
+ err = mlx4_db_alloc(dev->dev, &cq->db, 1);
+ if (err)
+ return err;
+
+ cq->mcq.set_ci_db = cq->db.db;
+ cq->mcq.arm_db = cq->db.db + 1;
+ *cq->mcq.set_ci_db = 0;
+ *cq->mcq.arm_db = 0;
+
+ err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries);
+ if (err)
+ goto err_db;
+
+ buf_addr = &cq->buf.buf;
+
+ if (dev->eq_table)
+ vector = dev->eq_table[vector % ibdev->num_comp_vectors];
+
+ err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, &dev->priv_uar,
+ cq->db.dma, &cq->mcq, vector, 0, 0,
+ buf_addr, false);
+ if (err)
+ goto err_buf;
+
+ cq->mcq.comp = mlx4_ib_cq_comp;
+ cq->mcq.event = mlx4_ib_cq_event;
+ cq->mcq.usage = MLX4_RES_USAGE_DRIVER;
+
+ return 0;
+
+err_buf:
+ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
err_db:
- if (!udata)
- mlx4_db_free(dev->dev, &cq->db);
-err_cq:
+ mlx4_db_free(dev->dev, &cq->db);
return err;
}
@@ -445,8 +468,8 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
if (ibcq->uobject) {
cq->buf = cq->resize_buf->buf;
cq->ibcq.cqe = cq->resize_buf->cqe;
- ib_umem_release(cq->umem);
- cq->umem = cq->resize_umem;
+ ib_umem_release(cq->ibcq.umem);
+ cq->ibcq.umem = cq->resize_umem;
kfree(cq->resize_buf);
cq->resize_buf = NULL;
@@ -506,11 +529,11 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
struct mlx4_ib_ucontext,
ibucontext),
&mcq->db);
+ /* UMEM is released by ib_core */
} else {
mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
mlx4_db_free(dev->dev, &mcq->db);
}
- ib_umem_release(mcq->umem);
return 0;
}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index dd35e03402ab..fc05e7a1a870 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2527,6 +2527,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = {
.attach_mcast = mlx4_ib_mcg_attach,
.create_ah = mlx4_ib_create_ah,
.create_cq = mlx4_ib_create_cq,
+ .create_user_cq = mlx4_ib_create_user_cq,
.create_qp = mlx4_ib_create_qp,
.create_srq = mlx4_ib_create_srq,
.dealloc_pd = mlx4_ib_dealloc_pd,
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 5df5b955114e..96563c0836ce 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -121,7 +121,6 @@ struct mlx4_ib_cq {
struct mlx4_db db;
spinlock_t lock;
struct mutex resize_mutex;
- struct ib_umem *umem;
struct ib_umem *resize_umem;
int create_flags;
/* List of qps that it serves.*/
@@ -772,6 +771,9 @@ int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct uverbs_attr_bundle *attrs);
+int mlx4_ib_create_user_cq(struct ib_cq *ibcq,
+ const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs);
int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 10/50] RDMA/mlx5: Save 4 bytes in CQ structure
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
There is no need to maintain separate, nearly empty create_flags and
private_flags fields. Unifying them reduces memory usage.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx5/cq.c | 5 +++--
drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +-
drivers/infiniband/hw/mlx5/qp.c | 2 +-
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 651d76bca114..1b4290166e87 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -983,7 +983,8 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
spin_lock_init(&cq->lock);
cq->resize_buf = NULL;
cq->resize_umem = NULL;
- cq->create_flags = attr->flags;
+ if (attr->flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)
+ cq->private_flags |= MLX5_IB_CQ_PR_TIMESTAMP_COMPLETION;
INIT_LIST_HEAD(&cq->list_send_qp);
INIT_LIST_HEAD(&cq->list_recv_qp);
@@ -1017,7 +1018,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
MLX5_SET(cqc, cqc, uar_page, index);
MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
- if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
+ if (attr->flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
MLX5_SET(cqc, cqc, oi, 1);
if (udata) {
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 4f4114d95130..ce3372aea48b 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -561,6 +561,7 @@ struct mlx5_ib_cq_buf {
enum mlx5_ib_cq_pr_flags {
MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD = 1 << 0,
MLX5_IB_CQ_PR_FLAGS_REAL_TIME_TS = 1 << 1,
+ MLX5_IB_CQ_PR_TIMESTAMP_COMPLETION = 1 << 2,
};
struct mlx5_ib_cq {
@@ -581,7 +582,6 @@ struct mlx5_ib_cq {
int cqe_size;
struct list_head list_send_qp;
struct list_head list_recv_qp;
- u32 create_flags;
struct list_head wc_list;
enum ib_cq_notify_flags notify_flags;
struct work_struct notify_work;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 0324909e3151..7af09e668c4c 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1274,7 +1274,7 @@ static int get_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
}
return MLX5_TIMESTAMP_FORMAT_REAL_TIME;
}
- if (cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) {
+ if (cq->private_flags & MLX5_IB_CQ_PR_TIMESTAMP_COMPLETION) {
if (!fr_sup) {
mlx5_ib_dbg(dev,
"Free running TS format is not supported\n");
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 11/50] RDMA/mlx5: Provide a modern CQ creation interface
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
The uverbs CQ creation UAPI allows users to supply their own umem for a CQ.
Update mlx5 to support this workflow while preserving support for creating
umem through the legacy interface.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx5/cq.c | 154 +++++++++++++++++++++++------------
drivers/infiniband/hw/mlx5/main.c | 1 +
drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +
3 files changed, 107 insertions(+), 51 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 1b4290166e87..52a435efd0de 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -749,16 +749,15 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
*cqe_size = ucmd.cqe_size;
- cq->buf.umem =
- ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
- entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(cq->buf.umem)) {
- err = PTR_ERR(cq->buf.umem);
- return err;
- }
+ if (!cq->ibcq.umem)
+ cq->ibcq.umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
+ entries * ucmd.cqe_size,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(cq->ibcq.umem))
+ return PTR_ERR(cq->ibcq.umem);
page_size = mlx5_umem_find_best_cq_quantized_pgoff(
- cq->buf.umem, cqc, log_page_size, MLX5_ADAPTER_PAGE_SHIFT,
+ cq->ibcq.umem, cqc, log_page_size, MLX5_ADAPTER_PAGE_SHIFT,
page_offset, 64, &page_offset_quantized);
if (!page_size) {
err = -EINVAL;
@@ -769,12 +768,12 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
if (err)
goto err_umem;
- ncont = ib_umem_num_dma_blocks(cq->buf.umem, page_size);
+ ncont = ib_umem_num_dma_blocks(cq->ibcq.umem, page_size);
mlx5_ib_dbg(
dev,
"addr 0x%llx, size %u, npages %zu, page_size %lu, ncont %d\n",
ucmd.buf_addr, entries * ucmd.cqe_size,
- ib_umem_num_pages(cq->buf.umem), page_size, ncont);
+ ib_umem_num_pages(cq->ibcq.umem), page_size, ncont);
*inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont;
@@ -785,7 +784,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
}
pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
- mlx5_ib_populate_pas(cq->buf.umem, page_size, pas, 0);
+ mlx5_ib_populate_pas(cq->ibcq.umem, page_size, pas, 0);
cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
MLX5_SET(cqc, cqc, log_page_size,
@@ -858,7 +857,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
mlx5_ib_db_unmap_user(context, &cq->db);
err_umem:
- ib_umem_release(cq->buf.umem);
+ /* UMEM is released by ib_core */
return err;
}
@@ -868,7 +867,6 @@ static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_udata *udata)
udata, struct mlx5_ib_ucontext, ibucontext);
mlx5_ib_db_unmap_user(context, &cq->db);
- ib_umem_release(cq->buf.umem);
}
static void init_cq_frag_buf(struct mlx5_ib_cq_buf *buf)
@@ -949,8 +947,9 @@ static void notify_soft_wc_handler(struct work_struct *work)
cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
}
-int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct uverbs_attr_bundle *attrs)
+int mlx5_ib_create_user_cq(struct ib_cq *ibcq,
+ const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_udata *udata = &attrs->driver_udata;
struct ib_device *ibdev = ibcq->device;
@@ -967,8 +966,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
int eqn;
int err;
- if (entries < 0 ||
- (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))))
+ if (attr->cqe > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
return -EINVAL;
if (check_cq_create_flags(attr->flags))
@@ -981,27 +979,15 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
cq->ibcq.cqe = entries - 1;
mutex_init(&cq->resize_mutex);
spin_lock_init(&cq->lock);
- cq->resize_buf = NULL;
- cq->resize_umem = NULL;
if (attr->flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)
cq->private_flags |= MLX5_IB_CQ_PR_TIMESTAMP_COMPLETION;
INIT_LIST_HEAD(&cq->list_send_qp);
INIT_LIST_HEAD(&cq->list_recv_qp);
- if (udata) {
- err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size,
- &index, &inlen, attrs);
- if (err)
- return err;
- } else {
- cqe_size = cache_line_size() == 128 ? 128 : 64;
- err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
- &index, &inlen);
- if (err)
- return err;
-
- INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
- }
+ err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, &index,
+ &inlen, attrs);
+ if (err)
+ return err;
err = mlx5_comp_eqn_get(dev->mdev, vector, &eqn);
if (err)
@@ -1021,12 +1007,8 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
if (attr->flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
MLX5_SET(cqc, cqc, oi, 1);
- if (udata) {
- cq->mcq.comp = mlx5_add_cq_to_tasklet;
- cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
- } else {
- cq->mcq.comp = mlx5_ib_cq_comp;
- }
+ cq->mcq.comp = mlx5_add_cq_to_tasklet;
+ cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out));
if (err)
@@ -1037,12 +1019,10 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
INIT_LIST_HEAD(&cq->wc_list);
- if (udata)
- if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
- err = -EFAULT;
- goto err_cmd;
- }
-
+ if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
+ err = -EFAULT;
+ goto err_cmd;
+ }
kvfree(cqb);
return 0;
@@ -1052,10 +1032,82 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
err_cqb:
kvfree(cqb);
- if (udata)
- destroy_cq_user(cq, udata);
- else
- destroy_cq_kernel(dev, cq);
+ destroy_cq_user(cq, udata);
+ return err;
+}
+
+
+int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_device *ibdev = ibcq->device;
+ int entries = attr->cqe;
+ int vector = attr->comp_vector;
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct mlx5_ib_cq *cq = to_mcq(ibcq);
+ u32 out[MLX5_ST_SZ_DW(create_cq_out)];
+ int index;
+ int inlen;
+ u32 *cqb = NULL;
+ void *cqc;
+ int cqe_size;
+ int eqn;
+ int err;
+
+ if (attr->cqe > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
+ return -EINVAL;
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
+ return -EINVAL;
+
+ cq->ibcq.cqe = entries - 1;
+ mutex_init(&cq->resize_mutex);
+ spin_lock_init(&cq->lock);
+ INIT_LIST_HEAD(&cq->list_send_qp);
+ INIT_LIST_HEAD(&cq->list_recv_qp);
+
+ cqe_size = cache_line_size() == 128 ? 128 : 64;
+ err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, &index,
+ &inlen);
+ if (err)
+ return err;
+
+ INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
+
+ err = mlx5_comp_eqn_get(dev->mdev, vector, &eqn);
+ if (err)
+ goto err_cqb;
+
+ cq->cqe_size = cqe_size;
+
+ cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context);
+ MLX5_SET(cqc, cqc, cqe_sz,
+ cqe_sz_to_mlx_sz(cqe_size,
+ cq->private_flags &
+ MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD));
+ MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries));
+ MLX5_SET(cqc, cqc, uar_page, index);
+ MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
+ MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
+
+ cq->mcq.comp = mlx5_ib_cq_comp;
+
+ err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out,
+ sizeof(out));
+ if (err)
+ goto err_cqb;
+
+ mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
+ cq->mcq.event = mlx5_ib_cq_event;
+
+ INIT_LIST_HEAD(&cq->wc_list);
+ kvfree(cqb);
+ return 0;
+
+err_cqb:
+ kvfree(cqb);
+ destroy_cq_kernel(dev, cq);
return err;
}
@@ -1390,8 +1442,8 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
if (udata) {
cq->ibcq.cqe = entries - 1;
- ib_umem_release(cq->buf.umem);
- cq->buf.umem = cq->resize_umem;
+ ib_umem_release(cq->ibcq.umem);
+ cq->ibcq.umem = cq->resize_umem;
cq->resize_umem = NULL;
} else {
struct mlx5_ib_cq_buf tbuf;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index eba023b7af0f..4f49f65e2c16 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4447,6 +4447,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.check_mr_status = mlx5_ib_check_mr_status,
.create_ah = mlx5_ib_create_ah,
.create_cq = mlx5_ib_create_cq,
+ .create_user_cq = mlx5_ib_create_user_cq,
.create_qp = mlx5_ib_create_qp,
.create_srq = mlx5_ib_create_srq,
.create_user_ah = mlx5_ib_create_ah,
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ce3372aea48b..2556e326afde 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1371,6 +1371,9 @@ int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
size_t buflen, size_t *bc);
int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct uverbs_attr_bundle *attrs);
+int mlx5_ib_create_user_cq(struct ib_cq *ibcq,
+ const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs);
int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
int mlx5_ib_pre_destroy_cq(struct ib_cq *cq);
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 06/50] RDMA/efa: Rely on CPU address in create-QP
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
Align this code with other locations where efa_free_mapped() depends on the
presence of a valid CPU address, which is guaranteed when qp->rq_size != 0.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/efa/efa_verbs.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index ae9b98b4b528..bc69aef3e436 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -579,7 +579,7 @@ static int qp_mmap_entries_setup(struct efa_qp *qp,
resp->llq_desc_offset &= ~PAGE_MASK;
- if (qp->rq_size) {
+ if (qp->rq_cpu_addr) {
address = dev->db_bar_addr + resp->rq_db_offset;
qp->rq_db_mmap_entry =
@@ -828,7 +828,7 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
err_destroy_qp:
efa_destroy_qp_handle(dev, create_qp_resp.qp_handle);
err_free_mapped:
- if (qp->rq_size)
+ if (qp->rq_cpu_addr)
efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
qp->rq_size, DMA_TO_DEVICE);
err_out:
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 09/50] RDMA/efa: Remove check for zero CQE count
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
Since ib_core now handles validation, the device driver no longer needs
to verify that the CQE count is non‑zero.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/efa/efa_verbs.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index d465e6acfe3c..e8fb99b61be8 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1152,9 +1152,9 @@ int efa_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
if (attr->flags)
return -EOPNOTSUPP;
- if (entries < 1 || entries > dev->dev_attr.max_cq_depth) {
+ if (entries > dev->dev_attr.max_cq_depth) {
ibdev_dbg(ibdev,
- "cq: requested entries[%u] non-positive or greater than max[%u]\n",
+ "cq: requested entries[%u] greater than max[%u]\n",
entries, dev->dev_attr.max_cq_depth);
err = -EINVAL;
goto err_out;
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 08/50] RDMA/core: Reject zero CQE count
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
All drivers already ensure that the number of CQEs is at least 1.
Add this validation to the core so drivers no longer need to repeat it.
Future patches converting to the .create_user_cq() interface will remove
the per‑driver checks.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/cq.c | 3 +++
drivers/infiniband/core/uverbs_cmd.c | 3 +++
drivers/infiniband/core/uverbs_std_types_cq.c | 15 +++++++++------
drivers/infiniband/core/verbs.c | 3 +++
4 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 584537c71545..7e0b54ec4141 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -220,6 +220,9 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
struct ib_cq *cq;
int ret = -ENOMEM;
+ if (WARN_ON_ONCE(!nr_cqe))
+ return ERR_PTR(-EINVAL);
+
cq = rdma_zalloc_drv_obj(dev, ib_cq);
if (!cq)
return ERR_PTR(ret);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index c7be592f60e8..041bed7a43b4 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1032,6 +1032,9 @@ static int create_cq(struct uverbs_attr_bundle *attrs,
if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors)
return -EINVAL;
+ if (!cmd->cqe)
+ return -EINVAL;
+
obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs,
&ib_dev);
if (IS_ERR(obj))
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index b999d8d62694..d2c8f71f934c 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -84,12 +84,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
ret = uverbs_copy_from(&attr.comp_vector, attrs,
UVERBS_ATTR_CREATE_CQ_COMP_VECTOR);
- if (!ret)
- ret = uverbs_copy_from(&attr.cqe, attrs,
- UVERBS_ATTR_CREATE_CQ_CQE);
- if (!ret)
- ret = uverbs_copy_from(&user_handle, attrs,
- UVERBS_ATTR_CREATE_CQ_USER_HANDLE);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&attr.cqe, attrs, UVERBS_ATTR_CREATE_CQ_CQE);
+ if (ret || !attr.cqe)
+ return ret ? : -EINVAL;
+
+ ret = uverbs_copy_from(&user_handle, attrs,
+ UVERBS_ATTR_CREATE_CQ_USER_HANDLE);
if (ret)
return ret;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index d0880346ebe2..9d075eeda463 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2203,6 +2203,9 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
if (!cq)
return ERR_PTR(-ENOMEM);
+ if (WARN_ON_ONCE(!cq_attr->cqe))
+ return ERR_PTR(-EINVAL);
+
cq->device = device;
cq->comp_handler = comp_handler;
cq->event_handler = event_handler;
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 07/50] RDMA/core: Prepare create CQ path for API unification
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
Ensure that .create_cq_umem() and .create_cq() follow the same API
contract, allowing drivers to be gradually migrated to the umem-aware
CQ management flow.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/device.c | 2 +-
drivers/infiniband/core/uverbs_cmd.c | 5 ++++-
drivers/infiniband/core/uverbs_std_types_cq.c | 16 +++++++++++-----
drivers/infiniband/core/verbs.c | 6 +++++-
drivers/infiniband/hw/efa/efa.h | 6 ++----
drivers/infiniband/hw/efa/efa_main.c | 3 +--
drivers/infiniband/hw/efa/efa_verbs.c | 10 ++--------
include/rdma/ib_verbs.h | 3 +--
8 files changed, 27 insertions(+), 24 deletions(-)
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 4e09f6e0995e..9209b8c664ef 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2701,7 +2701,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, create_ah);
SET_DEVICE_OP(dev_ops, create_counters);
SET_DEVICE_OP(dev_ops, create_cq);
- SET_DEVICE_OP(dev_ops, create_cq_umem);
+ SET_DEVICE_OP(dev_ops, create_user_cq);
SET_DEVICE_OP(dev_ops, create_flow);
SET_DEVICE_OP(dev_ops, create_qp);
SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index fb19395b9f2a..c7be592f60e8 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1068,7 +1068,10 @@ static int create_cq(struct uverbs_attr_bundle *attrs,
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = ib_dev->ops.create_cq(cq, &attr, attrs);
+ if (ib_dev->ops.create_user_cq)
+ ret = ib_dev->ops.create_user_cq(cq, &attr, attrs);
+ else
+ ret = ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
rdma_restrack_add(&cq->res);
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 05809f9ff0f6..b999d8d62694 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -78,7 +78,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
int buffer_fd;
int ret;
- if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq)
+ if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_user_cq) ||
+ !ib_dev->ops.destroy_cq)
return -EOPNOTSUPP;
ret = uverbs_copy_from(&attr.comp_vector, attrs,
@@ -130,7 +131,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) ||
uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
- !ib_dev->ops.create_cq_umem) {
+ !ib_dev->ops.create_user_cq) {
ret = -EINVAL;
goto err_event_file;
}
@@ -155,7 +156,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
goto err_event_file;
if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) ||
- !ib_dev->ops.create_cq_umem) {
+ !ib_dev->ops.create_user_cq) {
ret = -EINVAL;
goto err_event_file;
}
@@ -196,11 +197,16 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) :
- ib_dev->ops.create_cq(cq, &attr, attrs);
+ if (ib_dev->ops.create_user_cq)
+ ret = ib_dev->ops.create_user_cq(cq, &attr, attrs);
+ else
+ ret = ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
+ /* Check that driver didn't overwrite existing umem */
+ WARN_ON(umem && cq->umem != umem);
+
obj->uevent.uobject.object = cq;
obj->uevent.uobject.user_handle = user_handle;
rdma_restrack_add(&cq->res);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index ad48d2458a3f..d0880346ebe2 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2204,7 +2204,6 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
return ERR_PTR(-ENOMEM);
cq->device = device;
- cq->uobject = NULL;
cq->comp_handler = comp_handler;
cq->event_handler = event_handler;
cq->cq_context = cq_context;
@@ -2219,6 +2218,11 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
kfree(cq);
return ERR_PTR(ret);
}
+ /*
+ * We are in the kernel verbs flow, where drivers are not allowed
+ * to set the umem pointer; it needs to stay NULL.
+ */
+ WARN_ON_ONCE(cq->umem);
rdma_restrack_add(&cq->res);
return cq;
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 96f9c3bc98b2..00b19f2ba3da 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -161,10 +161,8 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
struct ib_udata *udata);
int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
-int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct uverbs_attr_bundle *attrs);
-int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct ib_umem *umem, struct uverbs_attr_bundle *attrs);
+int efa_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs);
struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_dmah *dmah,
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 6c415b9adb5f..a1d68dc49e45 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -371,8 +371,7 @@ static const struct ib_device_ops efa_dev_ops = {
.alloc_hw_device_stats = efa_alloc_hw_device_stats,
.alloc_pd = efa_alloc_pd,
.alloc_ucontext = efa_alloc_ucontext,
- .create_cq = efa_create_cq,
- .create_cq_umem = efa_create_cq_umem,
+ .create_user_cq = efa_create_user_cq,
.create_qp = efa_create_qp,
.create_user_ah = efa_create_ah,
.dealloc_pd = efa_dealloc_pd,
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index bc69aef3e436..d465e6acfe3c 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1130,8 +1130,8 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
return 0;
}
-int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct ib_umem *umem, struct uverbs_attr_bundle *attrs)
+int efa_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_udata *udata = &attrs->driver_udata;
struct efa_ucontext *ucontext = rdma_udata_to_drv_context(
@@ -1306,12 +1306,6 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
return err;
}
-int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct uverbs_attr_bundle *attrs)
-{
- return efa_create_cq_umem(ibcq, attr, NULL, attrs);
-}
-
static int umem_to_page_list(struct efa_dev *dev,
struct ib_umem *umem,
u64 *page_list,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b1e34fd2ed5f..67aa5fc2c0b7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2529,9 +2529,8 @@ struct ib_device_ops {
int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata);
int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr,
struct uverbs_attr_bundle *attrs);
- int (*create_cq_umem)(struct ib_cq *cq,
+ int (*create_user_cq)(struct ib_cq *cq,
const struct ib_cq_init_attr *attr,
- struct ib_umem *umem,
struct uverbs_attr_bundle *attrs);
int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 02/50] RDMA/umem: Allow including ib_umem header from any location
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
Including ib_umem.h currently triggers circular dependency errors.
These issues can be resolved by removing the include of ib_verbs.h,
which was only needed to resolve the struct ib_device pointer.
>> depmod: ERROR: Cycle detected: ib_core -> ib_uverbs -> ib_core
>> depmod: ERROR: Found 2 modules in dependency cycles!
make[3]: *** [scripts/Makefile.modinst:132: depmod] Error 1
make[3]: Target '__modinst' not remade because of errors.
make[2]: *** [Makefile:1960: modules_install] Error 2
make[1]: *** [Makefile:248: __sub-make] Error 2
make[1]: Target 'modules_install' not remade because of errors.
make: *** [Makefile:248: __sub-make] Error 2
make: Target 'modules_install' not remade because of errors.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
include/rdma/ib_umem.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index ce47688dd003..084a1d9a66f3 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -10,8 +10,8 @@
#include <linux/list.h>
#include <linux/scatterlist.h>
#include <linux/workqueue.h>
-#include <rdma/ib_verbs.h>
+struct ib_device;
struct ib_ucontext;
struct ib_umem_odp;
struct dma_buf_attach_ops;
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 05/50] RDMA/core: Manage CQ umem in core code
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
In the current implementation, CQ umem is handled both by ib_core and
the driver. ib_core sometimes creates and destroys it, while the driver
also destroys it.
Store the umem in struct ib_cq and ensure that only ib_core manages
its lifetime, relying solely on its internal reference counter.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/umem.c | 2 +-
drivers/infiniband/core/uverbs_cmd.c | 1 +
drivers/infiniband/core/uverbs_std_types_cq.c | 7 ++++++-
drivers/infiniband/core/verbs.c | 2 ++
drivers/infiniband/hw/efa/efa_verbs.c | 24 +++++++++++-------------
include/rdma/ib_verbs.h | 1 +
6 files changed, 22 insertions(+), 15 deletions(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 8137031c2a65..fc70b918f3f0 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -283,7 +283,7 @@ EXPORT_SYMBOL(ib_umem_get);
*/
void ib_umem_release(struct ib_umem *umem)
{
- if (!umem)
+ if (IS_ERR_OR_NULL(umem))
return;
if (umem->is_dmabuf)
return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index f4616deeca54..fb19395b9f2a 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1085,6 +1085,7 @@ static int create_cq(struct uverbs_attr_bundle *attrs,
return uverbs_response(attrs, &resp, sizeof(resp));
err_free:
+ ib_umem_release(cq->umem);
rdma_restrack_put(&cq->res);
kfree(cq);
err_file:
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index fab5d914029d..05809f9ff0f6 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -186,6 +186,11 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
cq->comp_handler = ib_uverbs_comp_handler;
cq->event_handler = ib_uverbs_cq_event_handler;
cq->cq_context = ev_file ? &ev_file->ev_queue : NULL;
+ /*
+ * If UMEM is not provided here, legacy drivers will set it during
+ * CQ creation based on their internal udata.
+ */
+ cq->umem = umem;
atomic_set(&cq->usecnt, 0);
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
@@ -206,7 +211,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
return ret;
err_free:
- ib_umem_release(umem);
+ ib_umem_release(cq->umem);
rdma_restrack_put(&cq->res);
kfree(cq);
err_event_file:
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 47a97797d7be..ad48d2458a3f 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -49,6 +49,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_addr.h>
+#include <rdma/ib_umem.h>
#include <rdma/rw.h>
#include <rdma/lag.h>
@@ -2249,6 +2250,7 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
if (ret)
return ret;
+ ib_umem_release(cq->umem);
rdma_restrack_del(&cq->res);
kfree(cq);
return ret;
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 19e3033d4ff7..ae9b98b4b528 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1083,15 +1083,14 @@ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
efa_destroy_cq_idx(dev, cq->cq_idx);
- efa_cq_user_mmap_entries_remove(cq);
+ if (cq->cpu_addr)
+ efa_cq_user_mmap_entries_remove(cq);
if (cq->eq) {
xa_erase(&dev->cqs_xa, cq->cq_idx);
synchronize_irq(cq->eq->irq.irqn);
}
- if (cq->umem)
- ib_umem_release(cq->umem);
- else
+ if (cq->cpu_addr)
efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE);
return 0;
}
@@ -1212,22 +1211,20 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
cq->ucontext = ucontext;
cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
- if (umem) {
- if (umem->length < cq->size) {
+ if (ibcq->umem) {
+ if (ibcq->umem->length < cq->size) {
ibdev_dbg(&dev->ibdev, "External memory too small\n");
err = -EINVAL;
goto err_out;
}
- if (!ib_umem_is_contiguous(umem)) {
+ if (!ib_umem_is_contiguous(ibcq->umem)) {
ibdev_dbg(&dev->ibdev, "Non contiguous CQ unsupported\n");
err = -EINVAL;
goto err_out;
}
- cq->cpu_addr = NULL;
- cq->dma_addr = ib_umem_start_dma_addr(umem);
- cq->umem = umem;
+ cq->dma_addr = ib_umem_start_dma_addr(ibcq->umem);
} else {
cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
DMA_FROM_DEVICE);
@@ -1259,7 +1256,7 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
cq->ibcq.cqe = result.actual_depth;
WARN_ON_ONCE(entries != result.actual_depth);
- if (!umem)
+ if (cq->cpu_addr)
err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid);
if (err) {
@@ -1296,11 +1293,12 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
if (cq->eq)
xa_erase(&dev->cqs_xa, cq->cq_idx);
err_remove_mmap:
- efa_cq_user_mmap_entries_remove(cq);
+ if (cq->cpu_addr)
+ efa_cq_user_mmap_entries_remove(cq);
err_destroy_cq:
efa_destroy_cq_idx(dev, cq->cq_idx);
err_free_mapped:
- if (!umem)
+ if (cq->cpu_addr)
efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
DMA_FROM_DEVICE);
err_out:
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e1ec5a6c74e6..b1e34fd2ed5f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1649,6 +1649,7 @@ struct ib_cq {
u8 interrupt:1;
u8 shared:1;
unsigned int comp_vector;
+ struct ib_umem *umem;
/*
* Implementation details of the RDMA core, don't use in drivers:
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 04/50] RDMA/core: Promote UMEM to a core component
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
To manage UMEM objects at the core level and reuse the existing
ib_destroy_cq*() flow, move the UMEM files to be built together with
ib_core. Attempting to call ib_umem_release() from verbs.c currently
results in the following error:
depmod: ERROR: Cycle detected: ib_core -> ib_uverbs -> ib_core
depmod: ERROR: Found 2 modules in dependency cycles!
verbs.c:(.text+0x250c): undefined reference to `ib_umem_release'
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 48922e0ede56..ada9877d02df 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -16,6 +16,8 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
+ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
ib_cm-y := cm.o cm_trace.o
@@ -42,5 +44,3 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
uverbs_std_types_wq.o \
uverbs_std_types_qp.o \
ucaps.o
-ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
-ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 03/50] RDMA/umem: Remove unnecessary includes and defines from ib_umem header
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
The ib_umem header no longer needs the <linux/list.h> and
<linux/workqueue.h> includes or the struct ib_ucontext and
struct ib_umem_odp forward declarations, so drop them to reduce clutter.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
include/rdma/ib_umem.h | 4 ----
1 file changed, 4 deletions(-)
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 084a1d9a66f3..c3ab11e6879f 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -7,13 +7,9 @@
#ifndef IB_UMEM_H
#define IB_UMEM_H
-#include <linux/list.h>
#include <linux/scatterlist.h>
-#include <linux/workqueue.h>
struct ib_device;
-struct ib_ucontext;
-struct ib_umem_odp;
struct dma_buf_attach_ops;
struct ib_umem {
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 01/50] RDMA: Move DMA block iterator logic into dedicated files
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
In-Reply-To: <20260213-refactor-umem-v1-0-f3be85847922@nvidia.com>
From: Leon Romanovsky <leonro@nvidia.com>
The DMA iterator logic was mixed into verbs and umem-specific code,
forcing all users to include rdma/ib_umem.h. Move the block iterator
logic into iter.c and rdma/iter.h so that rdma/ib_umem.h and
rdma/ib_verbs.h can be separated in a follow-up patch.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/Makefile | 2 +-
drivers/infiniband/core/iter.c | 43 ++++++++++++++
drivers/infiniband/core/verbs.c | 38 ------------
drivers/infiniband/hw/bnxt_re/qplib_res.c | 2 +-
drivers/infiniband/hw/cxgb4/mem.c | 2 +-
drivers/infiniband/hw/efa/efa_verbs.c | 2 +-
drivers/infiniband/hw/erdma/erdma_verbs.c | 2 +-
drivers/infiniband/hw/hns/hns_roce_alloc.c | 2 +-
drivers/infiniband/hw/ionic/ionic_ibdev.h | 2 +-
drivers/infiniband/hw/irdma/main.h | 2 +-
drivers/infiniband/hw/mana/mana_ib.h | 2 +-
drivers/infiniband/hw/mlx4/mr.c | 1 +
drivers/infiniband/hw/mlx5/mem.c | 1 +
drivers/infiniband/hw/mlx5/umr.c | 1 +
drivers/infiniband/hw/mthca/mthca_provider.c | 2 +-
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +-
drivers/infiniband/hw/qedr/verbs.c | 2 +-
drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 2 +-
include/rdma/ib_umem.h | 30 ----------
include/rdma/ib_verbs.h | 48 ---------------
include/rdma/iter.h | 88 ++++++++++++++++++++++++++++
21 files changed, 147 insertions(+), 129 deletions(-)
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index f483e0c12444..48922e0ede56 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
multicast.o mad.o smi.o agent.o mad_rmpp.o \
nldev.o restrack.o counters.o ib_core_uverbs.o \
- trace.o lag.o
+ trace.o lag.o iter.o
ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/iter.c b/drivers/infiniband/core/iter.c
new file mode 100644
index 000000000000..8e543d100657
--- /dev/null
+++ b/drivers/infiniband/core/iter.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. */
+
+#include <linux/export.h>
+#include <rdma/iter.h>
+
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+ struct scatterlist *sglist, unsigned int nents,
+ unsigned long pgsz)
+{
+ memset(biter, 0, sizeof(struct ib_block_iter));
+ biter->__sg = sglist;
+ biter->__sg_nents = nents;
+
+ /* Driver provides best block size to use */
+ biter->__pg_bit = __fls(pgsz);
+}
+EXPORT_SYMBOL(__rdma_block_iter_start);
+
+bool __rdma_block_iter_next(struct ib_block_iter *biter)
+{
+ unsigned int block_offset;
+ unsigned int delta;
+
+ if (!biter->__sg_nents || !biter->__sg)
+ return false;
+
+ biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
+ block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
+ delta = BIT_ULL(biter->__pg_bit) - block_offset;
+
+ while (biter->__sg_nents && biter->__sg &&
+ sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) {
+ delta -= sg_dma_len(biter->__sg) - biter->__sg_advance;
+ biter->__sg_advance = 0;
+ biter->__sg = sg_next(biter->__sg);
+ biter->__sg_nents--;
+ }
+ biter->__sg_advance += delta;
+
+ return true;
+}
+EXPORT_SYMBOL(__rdma_block_iter_next);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 02ebc3e52196..47a97797d7be 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -3154,44 +3154,6 @@ int rdma_init_netdev(struct ib_device *device, u32 port_num,
}
EXPORT_SYMBOL(rdma_init_netdev);
-void __rdma_block_iter_start(struct ib_block_iter *biter,
- struct scatterlist *sglist, unsigned int nents,
- unsigned long pgsz)
-{
- memset(biter, 0, sizeof(struct ib_block_iter));
- biter->__sg = sglist;
- biter->__sg_nents = nents;
-
- /* Driver provides best block size to use */
- biter->__pg_bit = __fls(pgsz);
-}
-EXPORT_SYMBOL(__rdma_block_iter_start);
-
-bool __rdma_block_iter_next(struct ib_block_iter *biter)
-{
- unsigned int block_offset;
- unsigned int delta;
-
- if (!biter->__sg_nents || !biter->__sg)
- return false;
-
- biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
- block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
- delta = BIT_ULL(biter->__pg_bit) - block_offset;
-
- while (biter->__sg_nents && biter->__sg &&
- sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) {
- delta -= sg_dma_len(biter->__sg) - biter->__sg_advance;
- biter->__sg_advance = 0;
- biter->__sg = sg_next(biter->__sg);
- biter->__sg_nents--;
- }
- biter->__sg_advance += delta;
-
- return true;
-}
-EXPORT_SYMBOL(__rdma_block_iter_next);
-
/**
* rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct
* for the drivers.
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 875d7b52c06a..64b02ea98cac 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -46,7 +46,7 @@
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <rdma/ib_verbs.h>
-#include <rdma/ib_umem.h>
+#include <rdma/iter.h>
#include "roce_hsi.h"
#include "qplib_res.h"
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index adeed7447e7b..e0ec2c4158a0 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -32,9 +32,9 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <rdma/ib_umem.h>
#include <linux/atomic.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/iter.h>
#include "iw_cxgb4.h"
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 22d3e25c3b9d..19e3033d4ff7 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -9,9 +9,9 @@
#include <linux/log2.h>
#include <rdma/ib_addr.h>
-#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_verbs.h>
+#include <rdma/iter.h>
#include <rdma/uverbs_ioctl.h>
#define UVERBS_MODULE_NAME efa_ib
#include <rdma/uverbs_named_ioctl.h>
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c
index 109a3f3de911..058edc42de58 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.c
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.c
@@ -12,7 +12,7 @@
#include <linux/vmalloc.h>
#include <net/addrconf.h>
#include <rdma/erdma-abi.h>
-#include <rdma/ib_umem.h>
+#include <rdma/iter.h>
#include <rdma/uverbs_ioctl.h>
#include "erdma.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c
index 6ee911f6885b..c21004814c3c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_alloc.c
+++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
@@ -32,7 +32,7 @@
*/
#include <linux/vmalloc.h>
-#include <rdma/ib_umem.h>
+#include <rdma/iter.h>
#include "hns_roce_device.h"
void hns_roce_buf_free(struct hns_roce_dev *hr_dev, struct hns_roce_buf *buf)
diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.h b/drivers/infiniband/hw/ionic/ionic_ibdev.h
index 82fda1e3cdb6..63828240d659 100644
--- a/drivers/infiniband/hw/ionic/ionic_ibdev.h
+++ b/drivers/infiniband/hw/ionic/ionic_ibdev.h
@@ -4,9 +4,9 @@
#ifndef _IONIC_IBDEV_H_
#define _IONIC_IBDEV_H_
-#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_pack.h>
+#include <rdma/iter.h>
#include <rdma/uverbs_ioctl.h>
#include <rdma/ionic-abi.h>
diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h
index d320d1a228b3..3d49bd57bae7 100644
--- a/drivers/infiniband/hw/irdma/main.h
+++ b/drivers/infiniband/hw/irdma/main.h
@@ -37,8 +37,8 @@
#include <rdma/rdma_cm.h>
#include <rdma/iw_cm.h>
#include <rdma/ib_user_verbs.h>
-#include <rdma/ib_umem.h>
#include <rdma/ib_cache.h>
+#include <rdma/iter.h>
#include <rdma/uverbs_ioctl.h>
#include "osdep.h"
#include "defs.h"
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index e447acfd2071..a7c8c0fd7019 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -8,7 +8,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_mad.h>
-#include <rdma/ib_umem.h>
+#include <rdma/iter.h>
#include <rdma/mana-abi.h>
#include <rdma/uverbs_ioctl.h>
#include <linux/dmapool.h>
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 94464f1694d9..9b647a300eb9 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/iter.h>
#include "mlx4_ib.h"
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index af321f6ef7f5..75d5b5672b5c 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -31,6 +31,7 @@
*/
#include <rdma/ib_umem_odp.h>
+#include <rdma/iter.h>
#include "mlx5_ib.h"
/*
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 4e562e0dd9e1..29488fba21a0 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
#include <rdma/ib_umem_odp.h>
+#include <rdma/iter.h>
#include "mlx5_ib.h"
#include "umr.h"
#include "wr.h"
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index dd572d76866c..aa5ca5c4ff77 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -35,8 +35,8 @@
*/
#include <rdma/ib_smi.h>
-#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/iter.h>
#include <rdma/uverbs_ioctl.h>
#include <linux/sched.h>
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 46d911fd38de..bf9211d8d130 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -45,9 +45,9 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/iw_cm.h>
-#include <rdma/ib_umem.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
+#include <rdma/iter.h>
#include <rdma/uverbs_ioctl.h>
#include "ocrdma.h"
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index ab9bf0922979..cb06c5d894b8 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -39,9 +39,9 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/iw_cm.h>
-#include <rdma/ib_umem.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
+#include <rdma/iter.h>
#include <rdma/uverbs_ioctl.h>
#include <linux/qed/common_hsi.h>
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index 763ddc6f25d1..23e547d4b3a7 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -53,8 +53,8 @@
#include <linux/pci.h>
#include <linux/semaphore.h>
#include <linux/workqueue.h>
-#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
+#include <rdma/iter.h>
#include <rdma/vmw_pvrdma-abi.h>
#include "pvrdma_ring.h"
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 0a8e092c0ea8..ce47688dd003 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -76,36 +76,6 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem)
return ib_umem_num_dma_blocks(umem, PAGE_SIZE);
}
-static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter,
- struct ib_umem *umem,
- unsigned long pgsz)
-{
- __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl,
- umem->sgt_append.sgt.nents, pgsz);
- biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1);
- biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz);
-}
-
-static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter)
-{
- return __rdma_block_iter_next(biter) && biter->__sg_numblocks--;
-}
-
-/**
- * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem
- * @umem: umem to iterate over
- * @pgsz: Page size to split the list into
- *
- * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The
- * returned DMA blocks will be aligned to pgsz and span the range:
- * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz)
- *
- * Performs exactly ib_umem_num_dma_blocks() iterations.
- */
-#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \
- for (__rdma_umem_block_iter_start(biter, umem, pgsz); \
- __rdma_umem_block_iter_next(biter);)
-
#ifdef CONFIG_INFINIBAND_USER_MEM
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 8bd020da7745..e1ec5a6c74e6 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2950,22 +2950,6 @@ struct ib_client {
u8 no_kverbs_req:1;
};
-/*
- * IB block DMA iterator
- *
- * Iterates the DMA-mapped SGL in contiguous memory blocks aligned
- * to a HW supported page size.
- */
-struct ib_block_iter {
- /* internal states */
- struct scatterlist *__sg; /* sg holding the current aligned block */
- dma_addr_t __dma_addr; /* unaligned DMA address of this block */
- size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */
- unsigned int __sg_nents; /* number of SG entries */
- unsigned int __sg_advance; /* number of bytes to advance in sg in next step */
- unsigned int __pg_bit; /* alignment of current block */
-};
-
struct ib_device *_ib_alloc_device(size_t size, struct net *net);
#define ib_alloc_device(drv_struct, member) \
container_of(_ib_alloc_device(sizeof(struct drv_struct) + \
@@ -2994,38 +2978,6 @@ void ib_unregister_device_queued(struct ib_device *ib_dev);
int ib_register_client (struct ib_client *client);
void ib_unregister_client(struct ib_client *client);
-void __rdma_block_iter_start(struct ib_block_iter *biter,
- struct scatterlist *sglist,
- unsigned int nents,
- unsigned long pgsz);
-bool __rdma_block_iter_next(struct ib_block_iter *biter);
-
-/**
- * rdma_block_iter_dma_address - get the aligned dma address of the current
- * block held by the block iterator.
- * @biter: block iterator holding the memory block
- */
-static inline dma_addr_t
-rdma_block_iter_dma_address(struct ib_block_iter *biter)
-{
- return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1);
-}
-
-/**
- * rdma_for_each_block - iterate over contiguous memory blocks of the sg list
- * @sglist: sglist to iterate over
- * @biter: block iterator holding the memory block
- * @nents: maximum number of sg entries to iterate over
- * @pgsz: best HW supported page size to use
- *
- * Callers may use rdma_block_iter_dma_address() to get each
- * blocks aligned DMA address.
- */
-#define rdma_for_each_block(sglist, biter, nents, pgsz) \
- for (__rdma_block_iter_start(biter, sglist, nents, \
- pgsz); \
- __rdma_block_iter_next(biter);)
-
/**
* ib_get_client_data - Get IB client context
* @device:Device to get context for
diff --git a/include/rdma/iter.h b/include/rdma/iter.h
new file mode 100644
index 000000000000..19d64ef04ba9
--- /dev/null
+++ b/include/rdma/iter.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. */
+
+#ifndef _RDMA_ITER_H_
+#define _RDMA_ITER_H_
+
+#include <linux/scatterlist.h>
+#include <rdma/ib_umem.h>
+
+/*
+ * IB block DMA iterator
+ *
+ * Iterates the DMA-mapped SGL in contiguous memory blocks aligned
+ * to a HW supported page size.
+ */
+struct ib_block_iter {
+ /* internal states */
+ struct scatterlist *__sg; /* sg holding the current aligned block */
+ dma_addr_t __dma_addr; /* unaligned DMA address of this block */
+ size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */
+ unsigned int __sg_nents; /* number of SG entries */
+ unsigned int __sg_advance; /* number of bytes to advance in sg in next step */
+ unsigned int __pg_bit; /* alignment of current block */
+};
+
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+ struct scatterlist *sglist,
+ unsigned int nents,
+ unsigned long pgsz);
+bool __rdma_block_iter_next(struct ib_block_iter *biter);
+
+/**
+ * rdma_block_iter_dma_address - get the aligned dma address of the current
+ * block held by the block iterator.
+ * @biter: block iterator holding the memory block
+ */
+static inline dma_addr_t
+rdma_block_iter_dma_address(struct ib_block_iter *biter)
+{
+ return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1);
+}
+
+/**
+ * rdma_for_each_block - iterate over contiguous memory blocks of the sg list
+ * @sglist: sglist to iterate over
+ * @biter: block iterator holding the memory block
+ * @nents: maximum number of sg entries to iterate over
+ * @pgsz: best HW supported page size to use
+ *
+ * Callers may use rdma_block_iter_dma_address() to get each
+ * blocks aligned DMA address.
+ */
+#define rdma_for_each_block(sglist, biter, nents, pgsz) \
+ for (__rdma_block_iter_start(biter, sglist, nents, \
+ pgsz); \
+ __rdma_block_iter_next(biter);)
+
+static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter,
+ struct ib_umem *umem,
+ unsigned long pgsz)
+{
+ __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl,
+ umem->sgt_append.sgt.nents, pgsz);
+ biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1);
+ biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz);
+}
+
+static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter)
+{
+ return __rdma_block_iter_next(biter) && biter->__sg_numblocks--;
+}
+
+/**
+ * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem
+ * @umem: umem to iterate over
+ * @pgsz: Page size to split the list into
+ *
+ * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The
+ * returned DMA blocks will be aligned to pgsz and span the range:
+ * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz)
+ *
+ * Performs exactly ib_umem_num_dma_blocks() iterations.
+ */
+#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \
+ for (__rdma_umem_block_iter_start(biter, umem, pgsz); \
+ __rdma_umem_block_iter_next(biter);)
+
+#endif /* _RDMA_ITER_H_ */
--
2.52.0
^ permalink raw reply related
* [PATCH rdma-next 00/50] RDMA: Ensure CQ UMEMs are managed by ib_core
From: Leon Romanovsky @ 2026-02-13 10:57 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Selvin Xavier, Kalesh AP,
Potnuri Bharat Teja, Michael Margolin, Gal Pressman,
Yossi Leybovich, Cheng Xu, Kai Shen, Chengchang Tang,
Junxian Huang, Abhijit Gangurde, Allen Hubbe, Krzysztof Czurylo,
Tatyana Nikolova, Long Li, Konstantin Taranov, Yishai Hadas,
Michal Kalderon, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, Christian Benvenuti,
Nelson Escobar, Dennis Dalessandro, Bernard Metzler, Zhu Yanjun
Cc: linux-kernel, linux-rdma, linux-hyperv
Unify CQ UMEM creation, resize and release in ib_core to avoid the need
for complex driver-side handling. This lets us rely on the internal
reference counters of the relevant ib_XXX objects to manage UMEM
lifetime safely and consistently.
The resize cleanup made it clear that most drivers never handled this
path correctly, and there's a good chance the functionality was never
actually used. The most common issue was relying on the cq->resize_umem
pointer to detect races with other CQ commands, without clearing it on
errors and while ignoring proper locking for other CQ operations.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
Leon Romanovsky (50):
RDMA: Move DMA block iterator logic into dedicated files
RDMA/umem: Allow including ib_umem header from any location
RDMA/umem: Remove unnecessary includes and defines from ib_umem header
RDMA/core: Promote UMEM to a core component
RDMA/core: Manage CQ umem in core code
RDMA/efa: Rely on CPU address in create‑QP
RDMA/core: Prepare create CQ path for API unification
RDMA/core: Reject zero CQE count
RDMA/efa: Remove check for zero CQE count
RDMA/mlx5: Save 4 bytes in CQ structure
RDMA/mlx5: Provide a modern CQ creation interface
RDMA/mlx4: Inline mlx4_ib_get_cq_umem into callers
RDMA/mlx4: Introduce a modern CQ creation interface
RDMA/mlx4: Remove unused create_flags field from CQ structure
RDMA/bnxt_re: Convert to modern CQ interface
RDMA/cxgb4: Separate kernel and user CQ creation paths
RDMA/mthca: Split user and kernel CQ creation paths
RDMA/erdma: Separate user and kernel CQ creation paths
RDMA/ionic: Split user and kernel CQ creation paths
RDMA/qedr: Convert to modern CQ interface
RDMA/vmw_pvrdma: Provide a modern CQ creation interface
RDMA/ocrdma: Split user and kernel CQ creation paths
RDMA/irdma: Split user and kernel CQ creation paths
RDMA/usnic: Provide a modern CQ creation interface
RDMA/mana: Provide a modern CQ creation interface
RDMA/erdma: Separate user and kernel CQ creation paths
RDMA/rdmavt: Split user and kernel CQ creation paths
RDMA/siw: Split user and kernel CQ creation paths
RDMA/rxe: Split user and kernel CQ creation paths
RDMA/core: Remove legacy CQ creation fallback path
RDMA/core: Remove unused ib_resize_cq() implementation
RDMA: Clarify that CQ resize is a user‑space verb
RDMA/bnxt_re: Drop support for resizing kernel CQs
RDMA/irdma: Remove resize support for kernel CQs
RDMA/mlx4: Remove support for kernel CQ resize
RDMA/mlx5: Remove support for resizing kernel CQs
RDMA/mthca: Remove resize support for kernel CQs
RDMA/rdmavt: Remove resize support for kernel CQs
RDMA/rxe: Remove unused kernel‑side CQ resize support
RDMA: Properly propagate the number of CQEs as unsigned int
RDMA/core: Generalize CQ resize locking
RDMA/bnxt_re: Complete CQ resize in a single step
RDMA/bnxt_re: Rely on common resize‑CQ locking
RDMA/bnxt_re: Reduce CQ memory footprint
RDMA/mlx4: Use generic resize-CQ lock
RDMA/mlx4: Use on‑stack variables instead of storing them in the CQ object
RDMA/mlx5: Use generic resize-CQ lock
RDMA/mlx5: Select resize‑CQ callback based on device capabilities
RDMA/mlx5: Reduce CQ memory footprint
RDMA/mthca: Use generic resize-CQ lock
drivers/infiniband/core/Makefile | 6 +-
drivers/infiniband/core/cq.c | 3 +
drivers/infiniband/core/device.c | 4 +-
drivers/infiniband/core/iter.c | 43 +++
drivers/infiniband/core/umem.c | 2 +-
drivers/infiniband/core/uverbs_cmd.c | 18 +-
drivers/infiniband/core/uverbs_std_types_cq.c | 35 ++-
drivers/infiniband/core/verbs.c | 61 +---
drivers/infiniband/hw/bnxt_re/ib_verbs.c | 246 ++++++++-------
drivers/infiniband/hw/bnxt_re/ib_verbs.h | 9 +-
drivers/infiniband/hw/bnxt_re/main.c | 3 +-
drivers/infiniband/hw/bnxt_re/qplib_res.c | 2 +-
drivers/infiniband/hw/cxgb4/cq.c | 218 +++++++++----
drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 +
drivers/infiniband/hw/cxgb4/mem.c | 2 +-
drivers/infiniband/hw/cxgb4/provider.c | 1 +
drivers/infiniband/hw/efa/efa.h | 6 +-
drivers/infiniband/hw/efa/efa_main.c | 3 +-
drivers/infiniband/hw/efa/efa_verbs.c | 44 ++-
drivers/infiniband/hw/erdma/erdma_main.c | 1 +
drivers/infiniband/hw/erdma/erdma_verbs.c | 99 ++++--
drivers/infiniband/hw/erdma/erdma_verbs.h | 2 +
drivers/infiniband/hw/hns/hns_roce_alloc.c | 2 +-
drivers/infiniband/hw/hns/hns_roce_cq.c | 103 ++++--
drivers/infiniband/hw/hns/hns_roce_debugfs.c | 1 -
drivers/infiniband/hw/hns/hns_roce_device.h | 3 +-
drivers/infiniband/hw/hns/hns_roce_main.c | 1 +
drivers/infiniband/hw/ionic/ionic_controlpath.c | 88 ++++--
drivers/infiniband/hw/ionic/ionic_ibdev.c | 1 +
drivers/infiniband/hw/ionic/ionic_ibdev.h | 4 +-
drivers/infiniband/hw/irdma/main.h | 2 +-
drivers/infiniband/hw/irdma/verbs.c | 402 +++++++++++++-----------
drivers/infiniband/hw/mana/cq.c | 128 +++++---
drivers/infiniband/hw/mana/device.c | 1 +
drivers/infiniband/hw/mana/main.c | 25 +-
drivers/infiniband/hw/mana/mana_ib.h | 6 +-
drivers/infiniband/hw/mana/qp.c | 42 ++-
drivers/infiniband/hw/mana/wq.c | 14 +-
drivers/infiniband/hw/mlx4/cq.c | 401 ++++++++---------------
drivers/infiniband/hw/mlx4/main.c | 3 +-
drivers/infiniband/hw/mlx4/mlx4_ib.h | 10 +-
drivers/infiniband/hw/mlx4/mr.c | 1 +
drivers/infiniband/hw/mlx5/cq.c | 383 ++++++++--------------
drivers/infiniband/hw/mlx5/main.c | 9 +-
drivers/infiniband/hw/mlx5/mem.c | 1 +
drivers/infiniband/hw/mlx5/mlx5_ib.h | 12 +-
drivers/infiniband/hw/mlx5/qp.c | 2 +-
drivers/infiniband/hw/mlx5/umr.c | 1 +
drivers/infiniband/hw/mthca/mthca_cq.c | 1 -
drivers/infiniband/hw/mthca/mthca_provider.c | 193 ++++--------
drivers/infiniband/hw/mthca/mthca_provider.h | 1 -
drivers/infiniband/hw/ocrdma/ocrdma_main.c | 3 +-
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 70 +++--
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 6 +-
drivers/infiniband/hw/qedr/main.c | 1 +
drivers/infiniband/hw/qedr/verbs.c | 325 +++++++++++--------
drivers/infiniband/hw/qedr/verbs.h | 2 +
drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +-
drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 6 +-
drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 4 +-
drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 2 +-
drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 171 ++++++----
drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 +
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 3 +
drivers/infiniband/sw/rdmavt/cq.c | 224 +++++++------
drivers/infiniband/sw/rdmavt/cq.h | 4 +-
drivers/infiniband/sw/rdmavt/vt.c | 3 +-
drivers/infiniband/sw/rxe/rxe_cq.c | 31 --
drivers/infiniband/sw/rxe/rxe_loc.h | 3 -
drivers/infiniband/sw/rxe/rxe_verbs.c | 115 +++----
drivers/infiniband/sw/siw/siw_main.c | 1 +
drivers/infiniband/sw/siw/siw_verbs.c | 111 +++++--
drivers/infiniband/sw/siw/siw_verbs.h | 2 +
include/rdma/ib_umem.h | 36 +--
include/rdma/ib_verbs.h | 67 +---
include/rdma/iter.h | 88 ++++++
76 files changed, 2085 insertions(+), 1847 deletions(-)
---
base-commit: 42e3aac65c1c9eb36cdee0d8312a326196e0822f
change-id: 20260203-refactor-umem-e5b4277e41b4
Best regards,
--
Leon Romanovsky <leonro@nvidia.com>
^ permalink raw reply
* Re: [PATCH 3/3] x86/virt: rename x2apic_available to x2apic_without_ir_available
From: Shashank Balaji @ 2026-02-13 7:39 UTC (permalink / raw)
To: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Suresh Siddha, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
Broadcom internal kernel review list, Jan Kiszka, Paolo Bonzini,
Vitaly Kuznetsov, Juergen Gross, Boris Ostrovsky
Cc: Ingo Molnar, linux-kernel, linux-hyperv, virtualization,
jailhouse-dev, kvm, xen-devel, Rahul Bukte, Daniel Palmer,
Tim Bird, Sohil Mehta
In-Reply-To: <20260202-x2apic-fix-v1-3-71c8f488a88b@sony.com>
Hi x86 and virt folks,
I'd like some feedback on this patch. I realise that just updating the
name to x2apic_without_ir_available() with no indication in the code
suggesting that the hypervisor implementations may not be answering the
question "is x2apic available without IR?" is bad.
I suppose the options are:
1. Check the seven hypervisors' x2apic_available() implementations to see if
the "x2apic_without_ir_available" semantic matches, and then do the
renaming
Problem is, I don't know enough about the hypervisors to check
the implementations. Some help from the virt folks would be
great!
2. Add TODOs on the hypervisor implementations, hoping they'll be
audited in the future
There's a chance the TODOs will just sit there rotting. It's
ugly, even I don't like it
So how do we proceed?
On Mon, Feb 02, 2026 at 06:51:04PM +0900, Shashank Balaji wrote:
> No functional change.
>
> x86_init.hyper.x2apic_available is used only in try_to_enable_x2apic to check if
> x2apic needs to be disabled if interrupt remapping support isn't present. But
> the name x2apic_available doesn't reflect that usage.
>
> This is what x2apic_available is set to for various hypervisors:
>
> acrn boot_cpu_has(X86_FEATURE_X2APIC)
> mshyperv boot_cpu_has(X86_FEATURE_X2APIC)
> xen boot_cpu_has(X86_FEATURE_X2APIC) or false
> vmware vmware_legacy_x2apic_available
> kvm kvm_cpuid_base() != 0
> jailhouse x2apic_enabled()
> bhyve true
> default false
>
> Bare metal and vmware correctly check if x2apic is available without interrupt
> remapping. The rest of them check if x2apic is enabled/supported, and kvm just
> checks if the kernel is running on kvm. The other hypervisors may have to have
> their checks audited.
>
> Also fix the backwards pr_info message printed on disabling x2apic because of
> lack of irq remapping support.
>
> Compile tested with all the hypervisor guest support enabled.
>
> Co-developed-by: Rahul Bukte <rahul.bukte@sony.com>
> Signed-off-by: Rahul Bukte <rahul.bukte@sony.com>
> Signed-off-by: Shashank Balaji <shashank.mahadasyam@sony.com>
> ---
> arch/x86/include/asm/x86_init.h | 4 ++--
> arch/x86/kernel/apic/apic.c | 4 ++--
> arch/x86/kernel/cpu/acrn.c | 2 +-
> arch/x86/kernel/cpu/bhyve.c | 2 +-
> arch/x86/kernel/cpu/mshyperv.c | 2 +-
> arch/x86/kernel/cpu/vmware.c | 2 +-
> arch/x86/kernel/jailhouse.c | 2 +-
> arch/x86/kernel/kvm.c | 2 +-
> arch/x86/kernel/x86_init.c | 12 ++++++------
> arch/x86/xen/enlighten_hvm.c | 4 ++--
> 10 files changed, 18 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 6c8a6ead84f6..b270d9eed755 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -116,7 +116,7 @@ struct x86_init_pci {
> * struct x86_hyper_init - x86 hypervisor init functions
> * @init_platform: platform setup
> * @guest_late_init: guest late init
> - * @x2apic_available: X2APIC detection
> + * @x2apic_without_ir_available: is x2apic available without irq remap?
> * @msi_ext_dest_id: MSI supports 15-bit APIC IDs
> * @init_mem_mapping: setup early mappings during init_mem_mapping()
> * @init_after_bootmem: guest init after boot allocator is finished
> @@ -124,7 +124,7 @@ struct x86_init_pci {
> struct x86_hyper_init {
> void (*init_platform)(void);
> void (*guest_late_init)(void);
> - bool (*x2apic_available)(void);
> + bool (*x2apic_without_ir_available)(void);
> bool (*msi_ext_dest_id)(void);
> void (*init_mem_mapping)(void);
> void (*init_after_bootmem)(void);
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index cc64d61f82cf..8820b631f8a2 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -1836,8 +1836,8 @@ static __init void try_to_enable_x2apic(int remap_mode)
> * Using X2APIC without IR is not architecturally supported
> * on bare metal but may be supported in guests.
> */
> - if (!x86_init.hyper.x2apic_available()) {
> - pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
> + if (!x86_init.hyper.x2apic_without_ir_available()) {
> + pr_info("x2apic: Not supported without IRQ remapping\n");
> x2apic_disable();
> return;
> }
> diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
> index 2c5b51aad91a..9204b98d4786 100644
> --- a/arch/x86/kernel/cpu/acrn.c
> +++ b/arch/x86/kernel/cpu/acrn.c
> @@ -77,5 +77,5 @@ const __initconst struct hypervisor_x86 x86_hyper_acrn = {
> .detect = acrn_detect,
> .type = X86_HYPER_ACRN,
> .init.init_platform = acrn_init_platform,
> - .init.x2apic_available = acrn_x2apic_available,
> + .init.x2apic_without_ir_available = acrn_x2apic_available,
> };
> diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c
> index f1a8ca3dd1ed..91a90a7459ce 100644
> --- a/arch/x86/kernel/cpu/bhyve.c
> +++ b/arch/x86/kernel/cpu/bhyve.c
> @@ -61,6 +61,6 @@ const struct hypervisor_x86 x86_hyper_bhyve __refconst = {
> .name = "Bhyve",
> .detect = bhyve_detect,
> .init.init_platform = x86_init_noop,
> - .init.x2apic_available = bhyve_x2apic_available,
> + .init.x2apic_without_ir_available = bhyve_x2apic_available,
> .init.msi_ext_dest_id = bhyve_ext_dest_id,
> };
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 579fb2c64cfd..61458855094a 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -760,7 +760,7 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
> .name = "Microsoft Hyper-V",
> .detect = ms_hyperv_platform,
> .type = X86_HYPER_MS_HYPERV,
> - .init.x2apic_available = ms_hyperv_x2apic_available,
> + .init.x2apic_without_ir_available = ms_hyperv_x2apic_available,
> .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
> .init.init_platform = ms_hyperv_init_platform,
> .init.guest_late_init = ms_hyperv_late_init,
> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
> index cb3f900c46fc..46d325818797 100644
> --- a/arch/x86/kernel/cpu/vmware.c
> +++ b/arch/x86/kernel/cpu/vmware.c
> @@ -585,7 +585,7 @@ const __initconst struct hypervisor_x86 x86_hyper_vmware = {
> .detect = vmware_platform,
> .type = X86_HYPER_VMWARE,
> .init.init_platform = vmware_platform_setup,
> - .init.x2apic_available = vmware_legacy_x2apic_available,
> + .init.x2apic_without_ir_available = vmware_legacy_x2apic_available,
> #ifdef CONFIG_AMD_MEM_ENCRYPT
> .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare,
> .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish,
> diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
> index 9e9a591a5fec..84a0bbe15989 100644
> --- a/arch/x86/kernel/jailhouse.c
> +++ b/arch/x86/kernel/jailhouse.c
> @@ -291,6 +291,6 @@ const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
> .name = "Jailhouse",
> .detect = jailhouse_detect,
> .init.init_platform = jailhouse_init_platform,
> - .init.x2apic_available = jailhouse_x2apic_available,
> + .init.x2apic_without_ir_available = jailhouse_x2apic_available,
> .ignore_nopv = true,
> };
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 37dc8465e0f5..709eba87d58e 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -1042,7 +1042,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = {
> .detect = kvm_detect,
> .type = X86_HYPER_KVM,
> .init.guest_late_init = kvm_guest_init,
> - .init.x2apic_available = kvm_para_available,
> + .init.x2apic_without_ir_available = kvm_para_available,
> .init.msi_ext_dest_id = kvm_msi_ext_dest_id,
> .init.init_platform = kvm_init_platform,
> #if defined(CONFIG_AMD_MEM_ENCRYPT)
> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
> index ebefb77c37bb..9ddf8c901ac6 100644
> --- a/arch/x86/kernel/x86_init.c
> +++ b/arch/x86/kernel/x86_init.c
> @@ -112,12 +112,12 @@ struct x86_init_ops x86_init __initdata = {
> },
>
> .hyper = {
> - .init_platform = x86_init_noop,
> - .guest_late_init = x86_init_noop,
> - .x2apic_available = bool_x86_init_noop,
> - .msi_ext_dest_id = bool_x86_init_noop,
> - .init_mem_mapping = x86_init_noop,
> - .init_after_bootmem = x86_init_noop,
> + .init_platform = x86_init_noop,
> + .guest_late_init = x86_init_noop,
> + .x2apic_without_ir_available = bool_x86_init_noop,
> + .msi_ext_dest_id = bool_x86_init_noop,
> + .init_mem_mapping = x86_init_noop,
> + .init_after_bootmem = x86_init_noop,
> },
>
> .acpi = {
> diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
> index fe57ff85d004..42f3d21f313d 100644
> --- a/arch/x86/xen/enlighten_hvm.c
> +++ b/arch/x86/xen/enlighten_hvm.c
> @@ -311,7 +311,7 @@ static uint32_t __init xen_platform_hvm(void)
> * detect PVH and panic there.
> */
> h->init_platform = x86_init_noop;
> - h->x2apic_available = bool_x86_init_noop;
> + h->x2apic_without_ir_available = bool_x86_init_noop;
> h->init_mem_mapping = x86_init_noop;
> h->init_after_bootmem = x86_init_noop;
> h->guest_late_init = xen_hvm_guest_late_init;
> @@ -325,7 +325,7 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = {
> .detect = xen_platform_hvm,
> .type = X86_HYPER_XEN_HVM,
> .init.init_platform = xen_hvm_guest_init,
> - .init.x2apic_available = xen_x2apic_available,
> + .init.x2apic_without_ir_available = xen_x2apic_available,
> .init.init_mem_mapping = xen_hvm_init_mem_mapping,
> .init.guest_late_init = xen_hvm_guest_late_init,
> .init.msi_ext_dest_id = msi_ext_dest_id,
>
> --
> 2.43.0
>
^ permalink raw reply
* Re: [PATCH 1/2] kexec: Add permission notifier chain for kexec operations
From: Mukesh R @ 2026-02-12 22:12 UTC (permalink / raw)
To: Stanislav Kinsburskii, rppt, akpm, bhe, kys, haiyangz, wei.liu,
decui, longli
Cc: kexec, linux-hyperv, linux-kernel
In-Reply-To: <176962212169.85424.4683391728440118017.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On 1/28/26 09:42, Stanislav Kinsburskii wrote:
> Add a blocking notifier chain to allow subsystems to be notified
> before kexec execution. This enables modules to perform necessary
> cleanup or validation before the system transitions to a new kernel or
> block kexec if not possible under current conditions.
>
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> include/linux/kexec.h | 6 ++++++
> kernel/kexec_core.c | 24 ++++++++++++++++++++++++
> 2 files changed, 30 insertions(+)
>
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index ff7e231b0485..311037d30f9e 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -35,6 +35,7 @@ extern note_buf_t __percpu *crash_notes;
> #include <linux/ioport.h>
> #include <linux/module.h>
> #include <linux/highmem.h>
> +#include <linux/notifier.h>
> #include <asm/kexec.h>
> #include <linux/crash_core.h>
>
> @@ -532,10 +533,13 @@ extern bool kexec_file_dbg_print;
>
> extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size);
> extern void kimage_unmap_segment(void *buffer);
> +extern int kexec_block_notifier_register(struct notifier_block *nb);
> +extern int kexec_block_notifier_unregister(struct notifier_block *nb);
> #else /* !CONFIG_KEXEC_CORE */
> struct pt_regs;
> struct task_struct;
> struct kimage;
> +struct notifier_block;
> static inline void __crash_kexec(struct pt_regs *regs) { }
> static inline void crash_kexec(struct pt_regs *regs) { }
> static inline int kexec_should_crash(struct task_struct *p) { return 0; }
> @@ -543,6 +547,8 @@ static inline int kexec_crash_loaded(void) { return 0; }
> static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size)
> { return NULL; }
> static inline void kimage_unmap_segment(void *buffer) { }
> +static inline int kexec_block_notifier_register(struct notifier_block *nb) { }
> +static inline int kexec_block_notifier_unregister(struct notifier_block *nb) { }
> #define kexec_in_progress false
> #endif /* CONFIG_KEXEC_CORE */
>
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index 0f92acdd354d..1e86a6f175f0 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -57,6 +57,20 @@ bool kexec_in_progress = false;
>
> bool kexec_file_dbg_print;
>
> +static BLOCKING_NOTIFIER_HEAD(kexec_block_list);
> +
> +int kexec_block_notifier_register(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_register(&kexec_block_list, nb);
> +}
> +EXPORT_SYMBOL_GPL(kexec_block_notifier_register);
> +
> +int kexec_block_notifier_unregister(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_unregister(&kexec_block_list, nb);
> +}
> +EXPORT_SYMBOL_GPL(kexec_block_notifier_unregister);
> +
> /*
> * When kexec transitions to the new kernel there is a one-to-one
> * mapping between physical and virtual addresses. On processors
> @@ -1124,6 +1138,12 @@ bool kexec_load_permitted(int kexec_image_type)
> return true;
> }
>
> +static int kexec_check_blockers(void)
> +{
> + /* Notify subsystems of impending kexec */
> + return blocking_notifier_call_chain(&kexec_block_list, 0, NULL);
> +}
> +
> /*
> * Move into place and start executing a preloaded standalone
> * executable. If nothing was preloaded return an error.
> @@ -1139,6 +1159,10 @@ int kernel_kexec(void)
> goto Unlock;
> }
>
> + error = kexec_check_blockers();
This could take a long time, and I am not sure if it's a good idea
to stall kexec with such dependencies.
Thanks,
-Mukesh
> + if (error)
> + goto Unlock;
> +
> error = liveupdate_reboot();
> if (error)
> goto Unlock;
>
>
^ permalink raw reply
* Re: [PATCH 2/2] mshv: Add kexec blocking support
From: Mukesh R @ 2026-02-12 22:11 UTC (permalink / raw)
To: Stanislav Kinsburskii, rppt, akpm, bhe, kys, haiyangz, wei.liu,
decui, longli
Cc: kexec, linux-hyperv, linux-kernel
In-Reply-To: <176962212724.85424.5690118672585914211.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On 1/28/26 09:42, Stanislav Kinsburskii wrote:
> Add kexec notifier to prevent kexec when VMs are active or memory
> is deposited. The notifier blocks kexec operations if:
> - Active VMs exist in the partition table
> - Pages are still deposited to the hypervisor
>
> The kernel cannot access hypervisor deposited pages: any access
> triggers a GPF. Until the deposited page state can be handed over
> to the next kernel, kexec must be blocked if there is any shared
> state between kernel and hypervisor.
>
> For L1 host virtualization, attempt to withdraw all deposited memory before
> allowing kexec to proceed. If withdrawal fails or pages remain deposited
> block the kexec operation.
>
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> drivers/hv/Makefile | 1 +
> drivers/hv/hv_proc.c | 4 ++
> drivers/hv/mshv_kexec.c | 66 ++++++++++++++++++++++++++++++++++++++++
> drivers/hv/mshv_root.h | 14 ++++++++
> drivers/hv/mshv_root_hv_call.c | 2 +
> drivers/hv/mshv_root_main.c | 7 ++++
> 6 files changed, 94 insertions(+)
> create mode 100644 drivers/hv/mshv_kexec.c
>
> diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
> index a49f93c2d245..bb72be5cc525 100644
> --- a/drivers/hv/Makefile
> +++ b/drivers/hv/Makefile
> @@ -15,6 +15,7 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
> hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
> mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
> mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
> +mshv_root-$(CONFIG_KEXEC) += mshv_kexec.o
> mshv_vtl-y := mshv_vtl_main.o
>
> # Code that must be built-in
> diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> index 89870c1b0087..39bbbedb0340 100644
> --- a/drivers/hv/hv_proc.c
> +++ b/drivers/hv/hv_proc.c
> @@ -15,6 +15,8 @@
> */
> #define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
>
> +atomic_t hv_pages_deposited;
> +
> /* Deposits exact number of pages. Must be called with interrupts enabled. */
> int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> {
> @@ -93,6 +95,8 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> goto err_free_allocations;
> }
>
> + atomic_add(page_count, &hv_pages_deposited);
> +
> ret = 0;
> goto free_buf;
>
> diff --git a/drivers/hv/mshv_kexec.c b/drivers/hv/mshv_kexec.c
> new file mode 100644
> index 000000000000..5222b2e4ff97
> --- /dev/null
> +++ b/drivers/hv/mshv_kexec.c
> @@ -0,0 +1,66 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2026, Microsoft Corporation.
> + *
> + * Live update orchestration management for mshv_root module.
> + *
> + * Author: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> + */
> +
> +#include <linux/kexec.h>
> +#include <linux/notifier.h>
> +#include <asm/mshyperv.h>
> +#include "mshv_root.h"
> +
> +static BLOCKING_NOTIFIER_HEAD(overlay_notify_chain);
> +
> +static int mshv_block_kexec_notify(struct notifier_block *nb,
> + unsigned long action, void *arg)
> +{
> + if (!hash_empty(mshv_root.pt_htable)) {
> + pr_warn("mshv: Cannot perform kexec while VMs are active\n");
> + return -EBUSY;
> + }
> +
> + if (hv_l1vh_partition()) {
> + int err;
> +
> + /* Attempt to withdraw all the deposited pages */
> + err = hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE,
> + hv_current_partition_id);
> + if (err) {
> + pr_err("mshv: Failed to withdraw memory from L1 virtualization: %d\n",
> + err);
> + return err;
> + }
> + }
> +
> + if (atomic_read(&hv_pages_deposited)) {
> + pr_warn("mshv: Cannot perform kexec while pages are deposited\n");
> + return -EBUSY;
> + }
> + return 0;
> +}
> +
What guarantees that another deposit won't happen after this? Are all CPUs
"locked" in the kexec path and not doing anything at this point?
Thanks,
-Mukesh
> +static struct notifier_block mshv_kexec_notifier = {
> + .notifier_call = mshv_block_kexec_notify,
> +};
> +
> +int __init mshv_kexec_init(void)
> +{
> + int err;
> +
> + err = kexec_block_notifier_register(&mshv_kexec_notifier);
> + if (err) {
> + pr_err("mshv: Could not register kexec notifier: %pe\n",
> + ERR_PTR(err));
> + return err;
> + }
> +
> + return 0;
> +}
> +
> +void __exit mshv_kexec_exit(void)
> +{
> + (void)kexec_block_notifier_unregister(&mshv_kexec_notifier);
> +}
> diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
> index 3c1d88b36741..311f76262d10 100644
> --- a/drivers/hv/mshv_root.h
> +++ b/drivers/hv/mshv_root.h
> @@ -17,6 +17,7 @@
> #include <linux/build_bug.h>
> #include <linux/mmu_notifier.h>
> #include <uapi/linux/mshv.h>
> +#include <hyperv/hvhdk.h>
>
> /*
> * Hypervisor must be between these version numbers (inclusive)
> @@ -319,6 +320,7 @@ int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 a
> extern struct mshv_root mshv_root;
> extern enum hv_scheduler_type hv_scheduler_type;
> extern u8 * __percpu *hv_synic_eventring_tail;
> +extern atomic_t hv_pages_deposited;
>
> struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
> u64 uaddr, u32 flags);
> @@ -333,4 +335,16 @@ bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
> void mshv_region_movable_fini(struct mshv_mem_region *region);
> bool mshv_region_movable_init(struct mshv_mem_region *region);
>
> +#if IS_ENABLED(CONFIG_KEXEC)
> +int mshv_kexec_init(void);
> +void mshv_kexec_exit(void);
> +#else
> +static inline int mshv_kexec_init(void)
> +{
> + return 0;
> +}
> +
> +static inline void mshv_kexec_exit(void) { }
> +#endif
> +
> #endif /* _MSHV_ROOT_H_ */
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index 06f2bac8039d..4203af5190ee 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -73,6 +73,8 @@ int hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
> for (i = 0; i < completed; i++)
> __free_page(pfn_to_page(output_page->gpa_page_list[i]));
>
> + atomic_sub(completed, &hv_pages_deposited);
> +
> if (!hv_result_success(status)) {
> if (hv_result(status) == HV_STATUS_NO_RESOURCES)
> status = HV_STATUS_SUCCESS;
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index 5fc572e31cd7..d55aa69d130c 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -2330,6 +2330,10 @@ static int __init mshv_parent_partition_init(void)
> if (ret)
> goto deinit_root_scheduler;
>
> + ret = mshv_kexec_init();
> + if (ret)
> + goto deinit_irqfd_wq;
> +
> spin_lock_init(&mshv_root.pt_ht_lock);
> hash_init(mshv_root.pt_htable);
>
> @@ -2337,6 +2341,8 @@ static int __init mshv_parent_partition_init(void)
>
> return 0;
>
> +deinit_irqfd_wq:
> + mshv_irqfd_wq_cleanup();
> deinit_root_scheduler:
> root_scheduler_deinit();
> exit_partition:
> @@ -2356,6 +2362,7 @@ static void __exit mshv_parent_partition_exit(void)
> hv_setup_mshv_handler(NULL);
> mshv_port_table_fini();
> misc_deregister(&mshv_dev);
> + mshv_kexec_exit();
> mshv_irqfd_wq_cleanup();
> root_scheduler_deinit();
> if (hv_root_partition())
>
>
^ permalink raw reply
* RE: [PATCH 2/2] drm/hyperv: During panic do VMBus unload after frame buffer is flushed
From: Michael Kelley @ 2026-02-12 16:30 UTC (permalink / raw)
To: Jocelyn Falempe, drawat.floss@gmail.com,
maarten.lankhorst@linux.intel.com, mripard@kernel.org,
tzimmermann@suse.de, airlied@gmail.com, simona@ffwll.ch,
kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
decui@microsoft.com, longli@microsoft.com, ryasuoka@redhat.com
Cc: dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org,
linux-hyperv@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <7c6933fc-663d-4bf6-8594-c14c4be83c98@redhat.com>
From: Jocelyn Falempe <jfalempe@redhat.com> Sent: Thursday, February 12, 2026 2:10 AM
>
> On 12/02/2026 10:49, Jocelyn Falempe wrote:
> > On 12/02/2026 00:01, mhklkml@zohomail.com wrote:
> >> From: Jocelyn Falempe <jfalempe@redhat.com> Sent: Wednesday, February
> >> 11, 2026 1:54 PM
> >>
> >> But for this patch, the issue is that drm_panic() never gets called if CONFIG_PRINTK
> >> isn't set. In that case, kmsg_dump_register() is a stub that returns an error. So
> >> drm_panic_register() never registers the callback to drm_panic(). And if
> >> drm_panic() isn't going to run, responsibility for doing the VMBus unload
> >> must remain with the VMBus code. It's hard to actually test this case because
> >> of depending on printk() for debugging output, so double-check my thinking.
> >
> > Ok you're right. I changed from
> > atomic_notifier_chain_register(&panic_notifier_list, ...) to
> > kmsg_dump_register() in the v10 of drm_panic.
> >
> > So I should either make DRM_PANIC depends on PRINTK, or call
> > atomic_notifier_chain_register() if PRINTK is not defined.
> >
> > As I think kernel without PRINTK are uncommon, I'll probably do the
> > first solution.
> >
>
> FYI, I just sent the corresponding change:
>
> https://patchwork.freedesktop.org/series/161544/
>
Works for me. That means I can drop the CONFIG_PRINTK condition
from my patch, which would be good. The current version is rather
strange in that regard. I'm pretty tied up the rest of this week,
so it may be next week before I resubmit my patches.
Michael
^ permalink raw reply
* RE: [PATCH] x86: mshyperv: Use kthread for vmbus interrupts on PREEMPT_RT
From: Michael Kelley @ 2026-02-12 16:22 UTC (permalink / raw)
To: Jan Kiszka, Michael Kelley, Florian Bezdeka, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86@kernel.org
Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org, RT,
Mitchell Levy, skinsburskii@linux.microsoft.com,
mrathor@linux.microsoft.com, anirudh@anirudhrb.com,
schakrabarti@linux.microsoft.com, ssengar@linux.microsoft.com
In-Reply-To: <b084a7b6-c394-4337-82cd-8b9cb911d8d5@siemens.com>
From: Jan Kiszka <jan.kiszka@siemens.com> Sent: Thursday, February 12, 2026 8:06 AM
>
> On 09.02.26 19:25, Michael Kelley wrote:
> > From: Florian Bezdeka <florian.bezdeka@siemens.com> Sent: Monday, February 9, 2026 2:35 AM
> >>
> >> On Sat, 2026-02-07 at 01:30 +0000, Michael Kelley wrote:
> >>
> >> [snip]
> >>>
> >>> I've run your suggested experiment on an arm64 VM in the Azure cloud. My
> >>> kernel was linux-next 20260128. I set CONFIG_PREEMPT_RT=y and
> >>> CONFIG_PROVE_LOCKING=y, but did not add either of your two patches
> >>> (neither the storvsc driver patch nor the x86 VMBus interrupt handling patch).
> >>> The VM comes up and runs, but with this warning during boot:
> >>>
> >>> [ 3.075604] hv_utils: Registering HyperV Utility Driver
> >>> [ 3.075636] hv_vmbus: registering driver hv_utils
> >>> [ 3.085920] =============================
> >>> [ 3.088128] hv_vmbus: registering driver hv_netvsc
> >>> [ 3.091180] [ BUG: Invalid wait context ]
> >>> [ 3.093544] 6.19.0-rc7-next-20260128+ #3 Tainted: G E
> >>> [ 3.097582] -----------------------------
> >>> [ 3.099899] systemd-udevd/284 is trying to lock:
> >>> [ 3.102568] ffff000100e24490 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0x128/0x3b8 [hv_vmbus]
> >>> [ 3.108208] other info that might help us debug this:
> >>> [ 3.111454] context-{2:2}
> >>> [ 3.112987] 1 lock held by systemd-udevd/284:
> >>> [ 3.115626] #0: ffffd5cfc20bcc80 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0xcc/0x3b8 [hv_vmbus]
> >>> [ 3.121224] stack backtrace:
> >>> [ 3.122897] CPU: 0 UID: 0 PID: 284 Comm: systemd-udevd Tainted: G E 6.19.0-rc7-next-20260128+ #3 PREEMPT_RT
> >>> [ 3.129631] Tainted: [E]=UNSIGNED_MODULE
> >>> [ 3.131946] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 06/10/2025
> >>> [ 3.138553] Call trace:
> >>> [ 3.140015] show_stack+0x20/0x38 (C)
> >>> [ 3.142137] dump_stack_lvl+0x9c/0x158
> >>> [ 3.144340] dump_stack+0x18/0x28
> >>> [ 3.146290] __lock_acquire+0x488/0x1e20
> >>> [ 3.148569] lock_acquire+0x11c/0x388
> >>> [ 3.150703] rt_spin_lock+0x54/0x230
> >>> [ 3.152785] vmbus_chan_sched+0x128/0x3b8 [hv_vmbus]
> >>> [ 3.155611] vmbus_isr+0x34/0x80 [hv_vmbus]
> >>> [ 3.158093] vmbus_percpu_isr+0x18/0x30 [hv_vmbus]
> >>> [ 3.160848] handle_percpu_devid_irq+0xdc/0x348
> >>> [ 3.163495] handle_irq_desc+0x48/0x68
> >>> [ 3.165851] generic_handle_domain_irq+0x20/0x38
> >>> [ 3.168664] gic_handle_irq+0x1dc/0x430
> >>> [ 3.170868] call_on_irq_stack+0x30/0x70
> >>> [ 3.173161] do_interrupt_handler+0x88/0xa0
> >>> [ 3.175724] el1_interrupt+0x4c/0xb0
> >>> [ 3.177855] el1h_64_irq_handler+0x18/0x28
> >>> [ 3.180332] el1h_64_irq+0x84/0x88
> >>> [ 3.182378] _raw_spin_unlock_irqrestore+0x4c/0xb0 (P)
> >>> [ 3.185493] rt_mutex_slowunlock+0x404/0x440
> >>> [ 3.187951] rt_spin_unlock+0xb8/0x178
> >>> [ 3.190394] kmem_cache_alloc_noprof+0xf0/0x4f8
> >>> [ 3.193100] alloc_empty_file+0x64/0x148
> >>> [ 3.195461] path_openat+0x58/0xaa0
> >>> [ 3.197658] do_file_open+0xa0/0x140
> >>> [ 3.199752] do_sys_openat2+0x190/0x278
> >>> [ 3.202124] do_sys_open+0x60/0xb8
> >>> [ 3.204047] __arm64_sys_openat+0x2c/0x48
> >>> [ 3.206433] invoke_syscall+0x6c/0xf8
> >>> [ 3.208519] el0_svc_common.constprop.0+0x48/0xf0
> >>> [ 3.211050] do_el0_svc+0x24/0x38
> >>> [ 3.212990] el0_svc+0x164/0x3c8
> >>> [ 3.214842] el0t_64_sync_handler+0xd0/0xe8
> >>> [ 3.217251] el0t_64_sync+0x1b0/0x1b8
> >>> [ 3.219450] hv_utils: Heartbeat IC version 3.0
> >>> [ 3.219471] hv_utils: Shutdown IC version 3.2
> >>> [ 3.219844] hv_utils: TimeSync IC version 4.0
> >>
> >> That matches with my expectation that the same problem exists on arm64.
> >> The patch from Jan addresses that issue for x86 (only, so far) as we do
> >> not have a working test environment for arm64 yet.
> >
> > OK. I had understood Jan's earlier comments to mean that the VMBus
> > interrupt problem was implicitly solved on arm64 because of VMBus using
> > a standard Linux IRQ on arm64. But evidently that's not the case. So my
> > earlier comment stands: The code changes should go into the architecture
> > independent portion of the VMBus driver, and not under arch/x86. I
> > can probably work with you to test on arm64 if need be.
> >
>
> I can move the code, sure, but I still haven't understood what
> invalidates my assumptions (beside what you observed). vmbus_drv calls
> request_percpu_irq, and that is - as far as I can see - not injecting
> IRQF_NO_THREAD. Any explanations welcome.
>
> Reproduction is still not possible for me. I was playing a bit with qemu
> in the hope to make it provide its minimal vmbus support (for
> ballooning), but that was not yet successful on arm64.
>
Let me try to debug my experiment on arm64 and see why it isn't
handing off the VMBus interrupt to a thread. Maybe there's something
missing in my .config. But it will be sometime next week before
I can do it.
Michael
^ permalink raw reply
* Re: [PATCH] x86: mshyperv: Use kthread for vmbus interrupts on PREEMPT_RT
From: Jan Kiszka @ 2026-02-12 16:06 UTC (permalink / raw)
To: Michael Kelley, Florian Bezdeka, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86@kernel.org
Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org, RT,
Mitchell Levy, skinsburskii@linux.microsoft.com,
mrathor@linux.microsoft.com, anirudh@anirudhrb.com,
schakrabarti@linux.microsoft.com, ssengar@linux.microsoft.com
In-Reply-To: <SN6PR02MB4157DB59F0F7BFBF56612651D465A@SN6PR02MB4157.namprd02.prod.outlook.com>
On 09.02.26 19:25, Michael Kelley wrote:
> From: Florian Bezdeka <florian.bezdeka@siemens.com> Sent: Monday, February 9, 2026 2:35 AM
>>
>> On Sat, 2026-02-07 at 01:30 +0000, Michael Kelley wrote:
>>
>> [snip]
>>>
>>> I've run your suggested experiment on an arm64 VM in the Azure cloud. My
>>> kernel was linux-next 20260128. I set CONFIG_PREEMPT_RT=y and
>>> CONFIG_PROVE_LOCKING=y, but did not add either of your two patches
>>> (neither the storvsc driver patch nor the x86 VMBus interrupt handling patch).
>>> The VM comes up and runs, but with this warning during boot:
>>>
>>> [ 3.075604] hv_utils: Registering HyperV Utility Driver
>>> [ 3.075636] hv_vmbus: registering driver hv_utils
>>> [ 3.085920] =============================
>>> [ 3.088128] hv_vmbus: registering driver hv_netvsc
>>> [ 3.091180] [ BUG: Invalid wait context ]
>>> [ 3.093544] 6.19.0-rc7-next-20260128+ #3 Tainted: G E
>>> [ 3.097582] -----------------------------
>>> [ 3.099899] systemd-udevd/284 is trying to lock:
>>> [ 3.102568] ffff000100e24490 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0x128/0x3b8 [hv_vmbus]
>>> [ 3.108208] other info that might help us debug this:
>>> [ 3.111454] context-{2:2}
>>> [ 3.112987] 1 lock held by systemd-udevd/284:
>>> [ 3.115626] #0: ffffd5cfc20bcc80 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0xcc/0x3b8 [hv_vmbus]
>>> [ 3.121224] stack backtrace:
>>> [ 3.122897] CPU: 0 UID: 0 PID: 284 Comm: systemd-udevd Tainted: G E 6.19.0-rc7-next-20260128+ #3 PREEMPT_RT
>>> [ 3.129631] Tainted: [E]=UNSIGNED_MODULE
>>> [ 3.131946] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 06/10/2025
>>> [ 3.138553] Call trace:
>>> [ 3.140015] show_stack+0x20/0x38 (C)
>>> [ 3.142137] dump_stack_lvl+0x9c/0x158
>>> [ 3.144340] dump_stack+0x18/0x28
>>> [ 3.146290] __lock_acquire+0x488/0x1e20
>>> [ 3.148569] lock_acquire+0x11c/0x388
>>> [ 3.150703] rt_spin_lock+0x54/0x230
>>> [ 3.152785] vmbus_chan_sched+0x128/0x3b8 [hv_vmbus]
>>> [ 3.155611] vmbus_isr+0x34/0x80 [hv_vmbus]
>>> [ 3.158093] vmbus_percpu_isr+0x18/0x30 [hv_vmbus]
>>> [ 3.160848] handle_percpu_devid_irq+0xdc/0x348
>>> [ 3.163495] handle_irq_desc+0x48/0x68
>>> [ 3.165851] generic_handle_domain_irq+0x20/0x38
>>> [ 3.168664] gic_handle_irq+0x1dc/0x430
>>> [ 3.170868] call_on_irq_stack+0x30/0x70
>>> [ 3.173161] do_interrupt_handler+0x88/0xa0
>>> [ 3.175724] el1_interrupt+0x4c/0xb0
>>> [ 3.177855] el1h_64_irq_handler+0x18/0x28
>>> [ 3.180332] el1h_64_irq+0x84/0x88
>>> [ 3.182378] _raw_spin_unlock_irqrestore+0x4c/0xb0 (P)
>>> [ 3.185493] rt_mutex_slowunlock+0x404/0x440
>>> [ 3.187951] rt_spin_unlock+0xb8/0x178
>>> [ 3.190394] kmem_cache_alloc_noprof+0xf0/0x4f8
>>> [ 3.193100] alloc_empty_file+0x64/0x148
>>> [ 3.195461] path_openat+0x58/0xaa0
>>> [ 3.197658] do_file_open+0xa0/0x140
>>> [ 3.199752] do_sys_openat2+0x190/0x278
>>> [ 3.202124] do_sys_open+0x60/0xb8
>>> [ 3.204047] __arm64_sys_openat+0x2c/0x48
>>> [ 3.206433] invoke_syscall+0x6c/0xf8
>>> [ 3.208519] el0_svc_common.constprop.0+0x48/0xf0
>>> [ 3.211050] do_el0_svc+0x24/0x38
>>> [ 3.212990] el0_svc+0x164/0x3c8
>>> [ 3.214842] el0t_64_sync_handler+0xd0/0xe8
>>> [ 3.217251] el0t_64_sync+0x1b0/0x1b8
>>> [ 3.219450] hv_utils: Heartbeat IC version 3.0
>>> [ 3.219471] hv_utils: Shutdown IC version 3.2
>>> [ 3.219844] hv_utils: TimeSync IC version 4.0
>>
>> That matches with my expectation that the same problem exists on arm64.
>> The patch from Jan addresses that issue for x86 (only, so far) as we do
>> not have a working test environment for arm64 yet.
>
> OK. I had understood Jan's earlier comments to mean that the VMBus
> interrupt problem was implicitly solved on arm64 because of VMBus using
> a standard Linux IRQ on arm64. But evidently that's not the case. So my
> earlier comment stands: The code changes should go into the architecture
> independent portion of the VMBus driver, and not under arch/x86. I
> can probably work with you to test on arm64 if need be.
>
I can move the code, sure, but I still haven't understood what
invalidates my assumptions (beside what you observed). vmbus_drv calls
request_percpu_irq, and that is - as far as I can see - not injecting
IRQF_NO_THREAD. Any explanations welcome.
Reproduction is still not possible for me. I was playing a bit with qemu
in the hope to make it provide its minimal vmbus support (for
ballooning), but that was not yet successful on arm64.
Jan
--
Siemens AG, Foundational Technologies
Linux Expert Center
^ permalink raw reply
* Re: [PATCH 2/2] drm/hyperv: During panic do VMBus unload after frame buffer is flushed
From: Jocelyn Falempe @ 2026-02-12 10:10 UTC (permalink / raw)
To: mhklkml, mhklinux, drawat.floss, maarten.lankhorst, mripard,
tzimmermann, airlied, simona, kys, haiyangz, wei.liu, decui,
longli, ryasuoka
Cc: dri-devel, linux-kernel, linux-hyperv, stable
In-Reply-To: <e9d35c78-1c4b-4a9c-8cf0-9531e972279f@redhat.com>
On 12/02/2026 10:49, Jocelyn Falempe wrote:
> On 12/02/2026 00:01, mhklkml@zohomail.com wrote:
>> From: Jocelyn Falempe <jfalempe@redhat.com> Sent: Wednesday, February
>> 11, 2026 1:54 PM
>>
>> But for this patch, the issue is that drm_panic() never gets called if
>> CONFIG_PRINTK
>> isn't set. In that case, kmsg_dump_register() is a stub that returns
>> an error. So
>> drm_panic_register() never registers the callback to drm_panic(). And if
>> drm_panic() isn't going to run, responsibility for doing the VMBus unload
>> must remain with the VMBus code. It's hard to actually test this case
>> because
>> of depending on printk() for debugging output, so double-check my
>> thinking.
>
> Ok you're right. I changed from
> atomic_notifier_chain_register(&panic_notifier_list, ...) to
> kmsg_dump_register() in the v10 of drm_panic.
>
> So I should either make DRM_PANIC depends on PRINTK, or call
> atomic_notifier_chain_register() if PRINTK is not defined.
>
> As I think kernel without PRINTK are uncommon, I'll probably do the
> first solution.
>
FYI, I just sent the corresponding change:
https://patchwork.freedesktop.org/series/161544/
Best regards,
--
Jocelyn
^ permalink raw reply
* Re: [PATCH 2/2] drm/hyperv: During panic do VMBus unload after frame buffer is flushed
From: Jocelyn Falempe @ 2026-02-12 9:49 UTC (permalink / raw)
To: mhklkml, mhklinux, drawat.floss, maarten.lankhorst, mripard,
tzimmermann, airlied, simona, kys, haiyangz, wei.liu, decui,
longli, ryasuoka
Cc: dri-devel, linux-kernel, linux-hyperv, stable
In-Reply-To: <002601dc9baa$517d8b40$f478a1c0$@zohomail.com>
On 12/02/2026 00:01, mhklkml@zohomail.com wrote:
> From: Jocelyn Falempe <jfalempe@redhat.com> Sent: Wednesday, February 11, 2026 1:54 PM
>>
>> On 09/02/2026 08:02, mhkelley58@gmail.com wrote:
>>> From: Michael Kelley <mhklinux@outlook.com>
>>>
>>> In a VM, Linux panic information (reason for the panic, stack trace,
>>> etc.) may be written to a serial console and/or a virtual frame buffer
>>> for a graphics console. The latter may need to be flushed back to the
>>> host hypervisor for display.
>>>
>>> The current Hyper-V DRM driver for the frame buffer does the flushing
>>> *after* the VMBus connection has been unloaded, such that panic messages
>>> are not displayed on the graphics console. A user with a Hyper-V graphics
>>> console is left with just a hung empty screen after a panic. The enhanced
>>> control that DRM provides over the panic display in the graphics console
>>> is similarly non-functional.
>>>
>>> Commit 3671f3777758 ("drm/hyperv: Add support for drm_panic") added
>>> the Hyper-V DRM driver support to flush the virtual frame buffer. It
>>> provided necessary functionality but did not handle the sequencing
>>> problem with VMBus unload.
>>>
>>> Fix the full problem by using VMBus functions to suppress the VMBus
>>> unload that is normally done by the VMBus driver in the panic path. Then
>>> after the frame buffer has been flushed, do the VMBus unload so that a
>>> kdump kernel can start cleanly. As expected, CONFIG_DRM_PANIC must be
>>> selected for these changes to have effect. As a side benefit, the
>>> enhanced features of the DRM panic path are also functional.
>>
>> Thanks for properly fixing this issue with DRM Panic on hyperv.
>>
>> I've a small comment below.
>>
>> With that fixed:
>> Reviewed-by: Jocelyn Falempe <jfalempe@redhat.com>
>>
>> The first patch looks good too, I can review it if no other step up, as
>> I'm not familiar with hyperv.
>>
>>>
>>> Fixes: 3671f3777758 ("drm/hyperv: Add support for drm_panic")
>>> Signed-off-by: Michael Kelley <mhklinux@outlook.com>
>>> ---
>>> drivers/gpu/drm/hyperv/hyperv_drm_drv.c | 4 ++++
>>> drivers/gpu/drm/hyperv/hyperv_drm_modeset.c | 15 ++++++++-------
>>> 2 files changed, 12 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_drv.c
>> b/drivers/gpu/drm/hyperv/hyperv_drm_drv.c
>>> index 06b5d96e6eaf..79e51643be67 100644
>>> --- a/drivers/gpu/drm/hyperv/hyperv_drm_drv.c
>>> +++ b/drivers/gpu/drm/hyperv/hyperv_drm_drv.c
>>> @@ -150,6 +150,9 @@ static int hyperv_vmbus_probe(struct hv_device *hdev,
>>> goto err_free_mmio;
>>> }
>>>
>>> + /* If DRM panic path is stubbed out VMBus code must do the unload */
>>> + if (IS_ENABLED(CONFIG_DRM_PANIC) && IS_ENABLED(CONFIG_PRINTK))
>>
>> I think drm_panic should still work without printk.
>> The "user" panic screen would be unaffected, but the "kmsg" screen might
>> be blank, and the "qr_code" would generate an empty qr code.
>> (Actually I never tried to build a kernel without printk).
>
> Yeah, I had never built such a kernel either until recently when the kernel
> test robot flagged an error in Hyper-V code when CONFIG_PRINTK is not set. :-)
>
> But for this patch, the issue is that drm_panic() never gets called if CONFIG_PRINTK
> isn't set. In that case, kmsg_dump_register() is a stub that returns an error. So
> drm_panic_register() never registers the callback to drm_panic(). And if
> drm_panic() isn't going to run, responsibility for doing the VMBus unload
> must remain with the VMBus code. It's hard to actually test this case because
> of depending on printk() for debugging output, so double-check my
> thinking.
Ok you're right. I changed from
atomic_notifier_chain_register(&panic_notifier_list, ...) to
kmsg_dump_register() in the v10 of drm_panic.
So I should either make DRM_PANIC depend on PRINTK, or call
atomic_notifier_chain_register() if PRINTK is not defined.
As I think kernels without PRINTK are uncommon, I'll probably go with the
first solution.
--
Jocelyn
>
> Michael
>
>>
>>> + vmbus_set_skip_unload(true);
>>> drm_client_setup(dev, NULL);
>>>
>>> return 0;
>
^ permalink raw reply
* Re: [RFC PATCH V2] x86/VMBus: Confidential VMBus for dynamic DMA buffer transition
From: Tianyu Lan @ 2026-02-12 8:26 UTC (permalink / raw)
To: Michael Kelley
Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
decui@microsoft.com, longli@microsoft.com, Tianyu Lan,
linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
hch@infradead.org, robin.murphy@arm.com, vdso@hexbites.dev
In-Reply-To: <SN6PR02MB41577FB84EC73E48ABAC7D18D463A@SN6PR02MB4157.namprd02.prod.outlook.com>
On Thu, Feb 12, 2026 at 2:00 AM Michael Kelley <mhklinux@outlook.com> wrote:
>
> From: Tianyu Lan <ltykernel@gmail.com> Sent: Tuesday, February 10, 2026 8:21 AM
> >
> > Hyper-V provides Confidential VMBus to communicate between
> > device model and device guest driver via encrypted/private
> > memory in Confidential VM. The device model is in OpenHCL
> > (https://openvmm.dev/guide/user_guide/openhcl.html) that
> > plays the paravisor rule.
> >
> > For a VMBUS device, there are two communication methods to
>
> s/VMBUS/VMBus/
>
> > talk with Host/Hypervisor. 1) VMBus Ring buffer 2) dynamic
> > DMA transition.
>
> I'm not sure what "dynamic DMA transition" is. Maybe just
> "DMA transfers"? Also, do the same substitution further
> down in this commit message.
>
> > The Confidential VMBus Ring buffer has been
> > upstreamed by Roman Kisel(commit 6802d8af).
>
> It's customary to use 12 character commit IDs, which would be
> 6802d8af47d1 in this case.
>
> >
> > The dynamic DMA transition of VMBus device normally goes
> > through DMA core and it uses SWIOTLB as bounce buffer in
> > CVM
>
> "CVM" is Microsoft-speak. The Linux terminology is "a CoCo VM".
>
> > to communicate with Host/Hypervisor. The Confidential
> > VMBus device may use private/encrypted memory to do DMA
> > and so the device swiotlb(bounce buffer) isn't necessary.
>
> The phrase "isn't necessary" does not capture the real issue
> here. Saying "isn't necessary" makes it sound like this patch is
> just avoids unnecessary work, so that it is a performance
> improvement. But that's not the case.
>
> The real issue is that swiotlb memory is decrypted. So bouncing
> through the swiotlb exposes to the host what is supposed to be
> confidential data passed on the Confidential VMBus. Disabling
> the swiotlb bouncing in this case is a hard requirement to preserve
> confidentially.
>
> So I would reword the sentences as something like this:
>
> The Confidential VMBus device can do DMA directly to
> private/encrypted memory. Because the swiotlb is decrypted
> memory, the DMA transfer must not be bounced through the
> swiotlb, so as to preserve confidentiality. This is different from
> the default for Linux CoCo VMs, so disable the VMBus device's
> use of swiotlb.
>
> > To disable device's swiotlb, set device->dma_io_tlb_mem
> > to NULL in VMBus driver and is_swiotlb_force_bounce()
> > always returns false.
> >
> > Suggested-by: Michael Kelley <mhklinux@outlook.com>
> > Signed-off-by: Tianyu Lan <tiala@microsoft.com>
> > ---
> > Change since v1:
> > Use device.dma_io_tlb_mem to disable device bounce buffer
> >
> > drivers/hv/vmbus_drv.c | 6 +++++-
> > 1 file changed, 5 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> > index a53af6fe81a6..58dab8cc3fcb 100644
> > --- a/drivers/hv/vmbus_drv.c
> > +++ b/drivers/hv/vmbus_drv.c
> > @@ -2133,11 +2133,15 @@ int vmbus_device_register(struct hv_device *child_device_obj)
> > child_device_obj->device.dma_mask = &child_device_obj->dma_mask;
> > dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64));
> >
> > + device_initialize(&child_device_obj->device);
> > + if (child_device_obj->channel->co_external_memory)
> > + child_device_obj->device.dma_io_tlb_mem = NULL;
> > +
>
> Doing this as part of the VMBus bus driver makes sense. While directly
> setting device.dma_io_tlb_mem to NULL should work, it would be better
> to add a function to the swiotlb code to do this, and then call that function
> here, passing the device as an argument. The need to disable swiotlb on a
> device will likely arise in similar contexts (such as TDISP), and it would be
> better to have a swiotlb function for that purpose. This use case may be
> a bit ahead of the TDISP work, and having a swiotlb function in place will
> help ensure that duplicate mechanisms aren't created as everything
> comes together.
>
> See my earlier comments in [1] about the key point in the commit message,
> and about adding a swiotlb_dev_disable() function to the swiotlb code.
>
> Michael
Hi Michael:
Thanks for your review. Will add swiotlb_dev_disable() in the next version.
>
> [1] https://lore.kernel.org/linux-hyperv/SN6PR02MB4157DAE6D8CC6BA11CA87298D4DCA@SN6PR02MB4157.namprd02.prod.outlook.com/
>
> > /*
> > * Register with the LDM. This will kick off the driver/device
> > * binding...which will eventually call vmbus_match() and vmbus_probe()
> > */
> > - ret = device_register(&child_device_obj->device);
> > + ret = device_add(&child_device_obj->device);
> > if (ret) {
> > pr_err("Unable to register child device\n");
> > put_device(&child_device_obj->device);
> > --
> > 2.50.1
--
Thanks
Tianyu Lan
^ permalink raw reply
* Re: [PATCH 0/2] kexec: Refuse kernel-unsafe Microsoft Hypervisor transitions
From: Stanislav Kinsburskii @ 2026-02-11 23:30 UTC (permalink / raw)
To: rppt, akpm, bhe, kys, haiyangz, wei.liu, decui, longli
Cc: kexec, linux-hyperv, linux-kernel
In-Reply-To: <176962149772.85424.9395505307198316093.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On Wed, Jan 28, 2026 at 05:41:56PM +0000, Stanislav Kinsburskii wrote:
> When Microsoft Hypervisor is active, the kernel may have memory “deposited”
> to the hypervisor. Those pages are no longer safe for the kernel to touch,
> and attempting to access them can trigger a GPF. The problem becomes acute
> with kexec: the “deposited pages” state does not survive the transition,
> and the next kernel has no reliable way to know which pages are still
> owned/managed by the hypervisor.
>
> Until there is a proper handoff mechanism to preserve that state across
> kexec, the only safe behavior is to refuse kexec whenever there is shared
> hypervisor state that cannot survive the transition—most notably deposited
> pages, and also cases where VMs are still running.
>
> This series adds the missing kexec integration point needed by MSHV: a
> callback at the kexec “freeze” stage so the driver can make the transition
> safe (or block it). With this hook, MSHV can refuse kexec while VMs are
> running, attempt to withdraw deposited pages when possible (e.g. L1VH
> host), and fail the transition if any pages remain deposited.
>
> ---
>
> Stanislav Kinsburskii (2):
> kexec: Add permission notifier chain for kexec operations
> mshv: Add kexec blocking support
>
Hi,
I’m sending a gentle follow‑up on the patch series below, which I posted
about two weeks ago. I wanted to check whether anyone has had a chance
to look at it, or if there are concerns I should address.
Any feedback would be appreciated.
Thanks for your time.
Best regards,
Stanislav
>
> drivers/hv/Makefile | 1 +
> drivers/hv/hv_proc.c | 4 ++
> drivers/hv/mshv_kexec.c | 66 ++++++++++++++++++++++++++++++++++++++++
> drivers/hv/mshv_root.h | 14 ++++++++
> drivers/hv/mshv_root_hv_call.c | 2 +
> drivers/hv/mshv_root_main.c | 7 ++++
> include/linux/kexec.h | 6 ++++
> kernel/kexec_core.c | 24 +++++++++++++++
> 8 files changed, 124 insertions(+)
> create mode 100644 drivers/hv/mshv_kexec.c
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox