linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP
@ 2025-05-03 13:42 Daisuke Matsuda
  2025-05-03 13:42 ` [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs Daisuke Matsuda
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-03 13:42 UTC (permalink / raw)
  To: linux-kernel, linux-rdma, leon, jgg, zyjzyj2000; +Cc: Daisuke Matsuda

There is ibv_advise_mr(3) that can be used by applications to optimize
memory access. This series enables the feature on rxe driver, which has
already been available in mlx5.

There is a tiny change on the rdma-core util.
cf. https://github.com/linux-rdma/rdma-core/pull/1605

Daisuke Matsuda (2):
  RDMA/rxe: Implement synchronous prefetch for ODP MRs
  RDMA/rxe: Enable asynchronous prefetch for ODP MRs

 drivers/infiniband/sw/rxe/rxe.c     |   7 ++
 drivers/infiniband/sw/rxe/rxe_loc.h |  10 ++
 drivers/infiniband/sw/rxe/rxe_odp.c | 165 ++++++++++++++++++++++++++++
 3 files changed, 182 insertions(+)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-03 13:42 [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Daisuke Matsuda
@ 2025-05-03 13:42 ` Daisuke Matsuda
  2025-05-05  7:57   ` Zhu Yanjun
  2025-05-09 15:19   ` Zhu Yanjun
  2025-05-03 13:42 ` [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous " Daisuke Matsuda
  2025-05-03 17:08 ` [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Zhu Yanjun
  2 siblings, 2 replies; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-03 13:42 UTC (permalink / raw)
  To: linux-kernel, linux-rdma, leon, jgg, zyjzyj2000; +Cc: Daisuke Matsuda

Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
which are best-effort, will be added subsequently.

Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe.c     |  7 +++
 drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
 drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+)

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 3a77d6db1720..e891199cbdef 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
 	mutex_destroy(&rxe->usdev_lock);
 }
 
+static const struct ib_device_ops rxe_ib_dev_odp_ops = {
+	.advise_mr = rxe_ib_advise_mr,
+};
+
 /* initialize rxe device parameters */
 static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
 {
@@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
 		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
 		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
 		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
+
+		/* set handler for ODP prefetching API - ibv_advise_mr(3) */
+		ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
 	}
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index f7dbb9cddd12..21b070f3dbb8 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
 int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
 			    unsigned int length);
 enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
+int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+		     u32 flags, struct ib_sge *sg_list, u32 num_sge,
+		     struct uverbs_attr_bundle *attrs);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int
 rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
@@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
 {
 	return RESPST_ERR_UNSUPPORTED_OPCODE;
 }
+static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+				   u32 flags, struct ib_sge *sg_list, u32 num_sge,
+				   struct uverbs_attr_bundle *attrs)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 #endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index 6149d9ffe7f7..e5c60b061d7e 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
 
 	return RESPST_NONE;
 }
+
+static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
+				   enum ib_uverbs_advise_mr_advice advice,
+				   u32 pf_flags, struct ib_sge *sg_list,
+				   u32 num_sge)
+{
+	struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < num_sge; ++i) {
+		struct rxe_mr *mr;
+		struct ib_umem_odp *umem_odp;
+
+		mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
+			       sg_list[i].lkey, RXE_LOOKUP_LOCAL);
+
+		if (IS_ERR(mr)) {
+			rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
+			return PTR_ERR(mr);
+		}
+
+		if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+		    !mr->umem->writable) {
+			rxe_dbg_mr(mr, "missing write permission\n");
+			rxe_put(mr);
+			return -EPERM;
+		}
+
+		ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
+						    sg_list[i].length, pf_flags);
+		if (ret < 0) {
+			if (sg_list[i].length == 0)
+				continue;
+
+			rxe_dbg_mr(mr, "failed to prefetch the mr\n");
+			rxe_put(mr);
+			return ret;
+		}
+
+		umem_odp = to_ib_umem_odp(mr->umem);
+		mutex_unlock(&umem_odp->umem_mutex);
+
+		rxe_put(mr);
+	}
+
+	return 0;
+}
+
+static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
+				     enum ib_uverbs_advise_mr_advice advice,
+				     u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+	u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
+
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+		pf_flags |= RXE_PAGEFAULT_RDONLY;
+
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
+		pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
+
+	/* Synchronous call */
+	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+		return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
+					       num_sge);
+
+	/* Asynchronous call is "best-effort" */
+
+	return 0;
+}
+
+int rxe_ib_advise_mr(struct ib_pd *ibpd,
+		     enum ib_uverbs_advise_mr_advice advice,
+		     u32 flags,
+		     struct ib_sge *sg_list,
+		     u32 num_sge,
+		     struct uverbs_attr_bundle *attrs)
+{
+	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
+	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
+		return -EOPNOTSUPP;
+
+	return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
+					 sg_list, num_sge);
+}
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous prefetch for ODP MRs
  2025-05-03 13:42 [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Daisuke Matsuda
  2025-05-03 13:42 ` [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs Daisuke Matsuda
@ 2025-05-03 13:42 ` Daisuke Matsuda
  2025-05-05 15:25   ` Zhu Yanjun
  2025-05-03 17:08 ` [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Zhu Yanjun
  2 siblings, 1 reply; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-03 13:42 UTC (permalink / raw)
  To: linux-kernel, linux-rdma, leon, jgg, zyjzyj2000; +Cc: Daisuke Matsuda

Calling ibv_advise_mr(3) with flags other than IBV_ADVISE_MR_FLAG_FLUSH
invokes asynchronous requests. It is best-effort, and thus can safely be
deferred to the system-wide workqueue.

Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_odp.c | 81 ++++++++++++++++++++++++++++-
 1 file changed, 80 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index e5c60b061d7e..d98b385a18ce 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -425,6 +425,73 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
 	return RESPST_NONE;
 }
 
+struct prefetch_mr_work {
+	struct work_struct work;
+	u32 pf_flags;
+	u32 num_sge;
+	struct {
+		u64 io_virt;
+		struct rxe_mr *mr;
+		size_t length;
+	} frags[];
+};
+
+static void rxe_ib_prefetch_mr_work(struct work_struct *w)
+{
+	struct prefetch_mr_work *work =
+		container_of(w, struct prefetch_mr_work, work);
+	int ret;
+	u32 i;
+
+	/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
+	WARN_ON(!work->num_sge);
+	for (i = 0; i < work->num_sge; ++i) {
+		struct ib_umem_odp *umem_odp;
+
+		ret = rxe_odp_do_pagefault_and_lock(work->frags[i].mr, work->frags[i].io_virt,
+						    work->frags[i].length, work->pf_flags);
+		if (ret < 0) {
+			rxe_dbg_mr(work->frags[i].mr, "failed to prefetch the mr\n");
+			continue;
+		}
+
+		umem_odp = to_ib_umem_odp(work->frags[i].mr->umem);
+		mutex_unlock(&umem_odp->umem_mutex);
+	}
+
+	kvfree(work);
+}
+
+static int rxe_init_prefetch_work(struct ib_pd *ibpd,
+				  enum ib_uverbs_advise_mr_advice advice,
+				  u32 pf_flags, struct prefetch_mr_work *work,
+				  struct ib_sge *sg_list, u32 num_sge)
+{
+	struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
+	u32 i;
+
+	INIT_WORK(&work->work, rxe_ib_prefetch_mr_work);
+	work->pf_flags = pf_flags;
+
+	for (i = 0; i < num_sge; ++i) {
+		struct rxe_mr *mr;
+
+		mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
+			       sg_list[i].lkey, RXE_LOOKUP_LOCAL);
+		if (IS_ERR(mr)) {
+			work->num_sge = i;
+			return PTR_ERR(mr);
+		}
+		work->frags[i].io_virt = sg_list[i].addr;
+		work->frags[i].length = sg_list[i].length;
+		work->frags[i].mr = mr;
+
+		rxe_put(mr);
+	}
+	work->num_sge = num_sge;
+	return 0;
+}
+
 static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
 				   enum ib_uverbs_advise_mr_advice advice,
 				   u32 pf_flags, struct ib_sge *sg_list,
@@ -478,6 +545,8 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
 				     u32 flags, struct ib_sge *sg_list, u32 num_sge)
 {
 	u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
+	struct prefetch_mr_work *work;
+	int rc;
 
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
 		pf_flags |= RXE_PAGEFAULT_RDONLY;
@@ -490,7 +559,17 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
 		return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
 					       num_sge);
 
-	/* Asynchronous call is "best-effort" */
+	/* Asynchronous call is "best-effort" and allowed to fail */
+	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	rc = rxe_init_prefetch_work(ibpd, advice, pf_flags, work, sg_list, num_sge);
+	if (rc) {
+		kvfree(work);
+		return rc;
+	}
+	queue_work(system_unbound_wq, &work->work);
 
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP
  2025-05-03 13:42 [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Daisuke Matsuda
  2025-05-03 13:42 ` [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs Daisuke Matsuda
  2025-05-03 13:42 ` [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous " Daisuke Matsuda
@ 2025-05-03 17:08 ` Zhu Yanjun
  2025-05-04  9:23   ` Daisuke Matsuda
  2 siblings, 1 reply; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-03 17:08 UTC (permalink / raw)
  To: Daisuke Matsuda, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

在 2025/5/3 15:42, Daisuke Matsuda 写道:
> There is ibv_advise_mr(3) that can be used by applications to optimize
> memory access. This series enables the feature on rxe driver, which has
> already been available in mlx5.
> 
> There is a tiny change on the rdma-core util.
> cf. https://github.com/linux-rdma/rdma-core/pull/1605

Hi, Daisuke

Thanks a lot for your efforts to this patch series. It is very nice. 
With this patch series, we can make prefetch for ODP MRs of RXE.

I read through this patch series. And it seems fine with me.^_^

IIRC, you have added ODP testcases in rdma-core. To verify this prefetch 
works well for ODP MRs, can you add this synchronous/asynchronous 
prefetch to the ODP testcases in rdma-core?

Thus, we can verify this patch series. And in the future, we can use 
these testcases to confirm this prefetch feature works well.

Until now, it seems that no tool can verify this prefetch feature.

Thanks a lot.
Zhu Yanjun

> 
> Daisuke Matsuda (2):
>    RDMA/rxe: Implement synchronous prefetch for ODP MRs
>    RDMA/rxe: Enable asynchronous prefetch for ODP MRs
> 
>   drivers/infiniband/sw/rxe/rxe.c     |   7 ++
>   drivers/infiniband/sw/rxe/rxe_loc.h |  10 ++
>   drivers/infiniband/sw/rxe/rxe_odp.c | 165 ++++++++++++++++++++++++++++
>   3 files changed, 182 insertions(+)
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP
  2025-05-03 17:08 ` [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Zhu Yanjun
@ 2025-05-04  9:23   ` Daisuke Matsuda
  0 siblings, 0 replies; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-04  9:23 UTC (permalink / raw)
  To: Zhu Yanjun, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 2025/05/04 2:08, Zhu Yanjun wrote:
> 在 2025/5/3 15:42, Daisuke Matsuda 写道:
>> There is ibv_advise_mr(3) that can be used by applications to optimize
>> memory access. This series enables the feature on rxe driver, which has
>> already been available in mlx5.
>>
>> There is a tiny change on the rdma-core util.
>> cf. https://github.com/linux-rdma/rdma-core/pull/1605
> 
> Hi, Daisuke
> 
> Thanks a lot for your efforts to this patch series. It is very nice. With this patch series, we can make prefetch for ODP MRs of RXE.
> 
> I read through this patch series. And it seems fine with me.^_^
> 
> IIRC, you have added ODP testcases in rdma-core. To verify this prefetch works well for ODP MRs, can you add this synchronous/asynchronous prefetch to the ODP testcases in rdma-core?
> 
> Thus, we can verify this patch series. And in the future, we can use these testcases to confirm this prefetch feature works well.
> 
> Until now, it seems that no tool can verify this prefetch feature.

Hi, Thank you for taking a look!

There are already relevant testcases implemented in ./tests/test_odp.py:
  - test_odp_sync_prefetch_rc_traffic
  - test_odp_async_prefetch_rc_traffic
  - test_odp_prefetch_sync_no_page_fault_rc_traffic
  - test_odp_prefetch_async_no_page_fault_rc_traffic
On my node (x86-64/linux-6.15.0-rc1+), this series can pass all these tests.

Other than that, this feature is required by librpma as far as I know.
Though it is not maintained anymore, perhaps this could also be used for testing.
cf. https://github.com/pmem/rpma

Thanks,
Daisuke

> 
> Thanks a lot.
> Zhu Yanjun
> 
>>
>> Daisuke Matsuda (2):
>>    RDMA/rxe: Implement synchronous prefetch for ODP MRs
>>    RDMA/rxe: Enable asynchronous prefetch for ODP MRs
>>
>>   drivers/infiniband/sw/rxe/rxe.c     |   7 ++
>>   drivers/infiniband/sw/rxe/rxe_loc.h |  10 ++
>>   drivers/infiniband/sw/rxe/rxe_odp.c | 165 ++++++++++++++++++++++++++++
>>   3 files changed, 182 insertions(+)
>>
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-03 13:42 ` [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs Daisuke Matsuda
@ 2025-05-05  7:57   ` Zhu Yanjun
  2025-05-09 11:51     ` Daisuke Matsuda
  2025-05-09 15:19   ` Zhu Yanjun
  1 sibling, 1 reply; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-05  7:57 UTC (permalink / raw)
  To: Daisuke Matsuda, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 03.05.25 15:42, Daisuke Matsuda wrote:
> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
> which are best-effort, will be added subsequently.
> 
> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
> ---
>   drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>   drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>   drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
>   3 files changed, 103 insertions(+)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 3a77d6db1720..e891199cbdef 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>   	mutex_destroy(&rxe->usdev_lock);
>   }
>   
> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
> +	.advise_mr = rxe_ib_advise_mr,
> +};
> +
>   /* initialize rxe device parameters */
>   static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>   {
> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>   		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>   		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
>   		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
> +
> +		/* set handler for ODP prefetching API - ibv_advise_mr(3) */
> +		ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>   	}
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index f7dbb9cddd12..21b070f3dbb8 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
>   int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>   			    unsigned int length);
>   enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> +		     u32 flags, struct ib_sge *sg_list, u32 num_sge,
> +		     struct uverbs_attr_bundle *attrs);
>   #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>   static inline int
>   rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
>   {
>   	return RESPST_ERR_UNSUPPORTED_OPCODE;
>   }
> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> +				   u32 flags, struct ib_sge *sg_list, u32 num_sge,
> +				   struct uverbs_attr_bundle *attrs)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>   
>   #endif /* RXE_LOC_H */
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index 6149d9ffe7f7..e5c60b061d7e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>   
>   	return RESPST_NONE;
>   }
> +
> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
> +				   enum ib_uverbs_advise_mr_advice advice,
> +				   u32 pf_flags, struct ib_sge *sg_list,
> +				   u32 num_sge)
> +{
> +	struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
> +	unsigned int i;
> +	int ret = 0;
> +
> +	for (i = 0; i < num_sge; ++i) {

i is unsigned int, num_sge is u32. Perhaps they all use u32 type?
It is a minor problem.
Other than that, I am fine with this commit.

I have made tests with rdma-core. Both the synchronous and asynchronous 
modes can work well.

Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>

Zhu Yanjun

> +		struct rxe_mr *mr;
> +		struct ib_umem_odp *umem_odp;
> +
> +		mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
> +			       sg_list[i].lkey, RXE_LOOKUP_LOCAL);
> +
> +		if (IS_ERR(mr)) {
> +			rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
> +			return PTR_ERR(mr);
> +		}
> +
> +		if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> +		    !mr->umem->writable) {
> +			rxe_dbg_mr(mr, "missing write permission\n");
> +			rxe_put(mr);
> +			return -EPERM;
> +		}
> +
> +		ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
> +						    sg_list[i].length, pf_flags);
> +		if (ret < 0) {
> +			if (sg_list[i].length == 0)
> +				continue;
> +
> +			rxe_dbg_mr(mr, "failed to prefetch the mr\n");
> +			rxe_put(mr);
> +			return ret;
> +		}
> +
> +		umem_odp = to_ib_umem_odp(mr->umem);
> +		mutex_unlock(&umem_odp->umem_mutex);
> +
> +		rxe_put(mr);
> +	}
> +
> +	return 0;
> +}
> +
> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
> +				     enum ib_uverbs_advise_mr_advice advice,
> +				     u32 flags, struct ib_sge *sg_list, u32 num_sge)
> +{
> +	u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
> +
> +	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
> +		pf_flags |= RXE_PAGEFAULT_RDONLY;
> +
> +	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> +		pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
> +
> +	/* Synchronous call */
> +	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
> +		return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
> +					       num_sge);
> +
> +	/* Asynchronous call is "best-effort" */
> +
> +	return 0;
> +}
> +
> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
> +		     enum ib_uverbs_advise_mr_advice advice,
> +		     u32 flags,
> +		     struct ib_sge *sg_list,
> +		     u32 num_sge,
> +		     struct uverbs_attr_bundle *attrs)
> +{
> +	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
> +	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> +	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> +		return -EOPNOTSUPP;
> +
> +	return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
> +					 sg_list, num_sge);
> +}


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous prefetch for ODP MRs
  2025-05-03 13:42 ` [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous " Daisuke Matsuda
@ 2025-05-05 15:25   ` Zhu Yanjun
  2025-05-09 12:19     ` Daisuke Matsuda
  0 siblings, 1 reply; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-05 15:25 UTC (permalink / raw)
  To: Daisuke Matsuda, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 03.05.25 15:42, Daisuke Matsuda wrote:
> Calling ibv_advise_mr(3) with flags other than IBV_ADVISE_MR_FLAG_FLUSH
> invokes asynchronous requests. It is best-effort, and thus can safely be
> deferred to the system-wide workqueue.
> 
> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>

I have made tests with rdma-core after applying this patch series. It 
seems that it can work well.
I read through this commit. Other than the following minor problems, I 
am fine with this commit.

Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>

> ---
>   drivers/infiniband/sw/rxe/rxe_odp.c | 81 ++++++++++++++++++++++++++++-
>   1 file changed, 80 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index e5c60b061d7e..d98b385a18ce 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -425,6 +425,73 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>   	return RESPST_NONE;
>   }
>   
> +struct prefetch_mr_work {
> +	struct work_struct work;
> +	u32 pf_flags;
> +	u32 num_sge;
> +	struct {
> +		u64 io_virt;
> +		struct rxe_mr *mr;
> +		size_t length;
> +	} frags[];
> +};

The struct prefetch_mr_work should be moved into header file? IMO, it is 
better to move this struct to rxe_loc.h?

> +
> +static void rxe_ib_prefetch_mr_work(struct work_struct *w)
> +{
> +	struct prefetch_mr_work *work =
> +		container_of(w, struct prefetch_mr_work, work);
> +	int ret;
> +	u32 i;
> +
> +	/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
> +	WARN_ON(!work->num_sge);
> +	for (i = 0; i < work->num_sge; ++i) {
> +		struct ib_umem_odp *umem_odp;
> +
> +		ret = rxe_odp_do_pagefault_and_lock(work->frags[i].mr, work->frags[i].io_virt,
> +						    work->frags[i].length, work->pf_flags);
> +		if (ret < 0) {
> +			rxe_dbg_mr(work->frags[i].mr, "failed to prefetch the mr\n");
> +			continue;
> +		}
> +
> +		umem_odp = to_ib_umem_odp(work->frags[i].mr->umem);
> +		mutex_unlock(&umem_odp->umem_mutex);

Obviously this function is dependent on the mutex lock umem_mutex. So in 
the beginning of this function, it is better to add 
lockdep_assert_held(&umem_odp->umem_mutex)?

Zhu Yanjun

> +	}
> +
> +	kvfree(work);
> +}
> +
> +static int rxe_init_prefetch_work(struct ib_pd *ibpd,
> +				  enum ib_uverbs_advise_mr_advice advice,
> +				  u32 pf_flags, struct prefetch_mr_work *work,
> +				  struct ib_sge *sg_list, u32 num_sge)
> +{
> +	struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
> +	u32 i;
> +
> +	INIT_WORK(&work->work, rxe_ib_prefetch_mr_work);
> +	work->pf_flags = pf_flags;
> +
> +	for (i = 0; i < num_sge; ++i) {
> +		struct rxe_mr *mr;
> +
> +		mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
> +			       sg_list[i].lkey, RXE_LOOKUP_LOCAL);
> +		if (IS_ERR(mr)) {
> +			work->num_sge = i;
> +			return PTR_ERR(mr);
> +		}
> +		work->frags[i].io_virt = sg_list[i].addr;
> +		work->frags[i].length = sg_list[i].length;
> +		work->frags[i].mr = mr;
> +
> +		rxe_put(mr);
> +	}
> +	work->num_sge = num_sge;
> +	return 0;
> +}
> +
>   static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>   				   enum ib_uverbs_advise_mr_advice advice,
>   				   u32 pf_flags, struct ib_sge *sg_list,
> @@ -478,6 +545,8 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>   				     u32 flags, struct ib_sge *sg_list, u32 num_sge)
>   {
>   	u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
> +	struct prefetch_mr_work *work;
> +	int rc;
>   
>   	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>   		pf_flags |= RXE_PAGEFAULT_RDONLY;
> @@ -490,7 +559,17 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>   		return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
>   					       num_sge);
>   
> -	/* Asynchronous call is "best-effort" */
> +	/* Asynchronous call is "best-effort" and allowed to fail */
> +	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
> +	if (!work)
> +		return -ENOMEM;
> +
> +	rc = rxe_init_prefetch_work(ibpd, advice, pf_flags, work, sg_list, num_sge);
> +	if (rc) {
> +		kvfree(work);
> +		return rc;
> +	}
> +	queue_work(system_unbound_wq, &work->work);
>   
>   	return 0;
>   }


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-05  7:57   ` Zhu Yanjun
@ 2025-05-09 11:51     ` Daisuke Matsuda
  0 siblings, 0 replies; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-09 11:51 UTC (permalink / raw)
  To: Zhu Yanjun, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000


On 2025/05/05 16:57, Zhu Yanjun wrote:
> On 03.05.25 15:42, Daisuke Matsuda wrote:
>> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
>> which are best-effort, will be added subsequently.
>>
>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>> ---
>>   drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>>   drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>>   drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
>>   3 files changed, 103 insertions(+)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index 3a77d6db1720..e891199cbdef 100644
>> --- a/drivers/infiniband/sw/rxe/rxe.c
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>       mutex_destroy(&rxe->usdev_lock);
>>   }
>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
>> +    .advise_mr = rxe_ib_advise_mr,
>> +};
>> +
>>   /* initialize rxe device parameters */
>>   static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>   {
>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
>> +
>> +        /* set handler for ODP prefetching API - ibv_advise_mr(3) */
>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>>       }
>>   }
>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
>> index f7dbb9cddd12..21b070f3dbb8 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
>>   int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>>                   unsigned int length);
>>   enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
>> +             struct uverbs_attr_bundle *attrs);
>>   #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>   static inline int
>>   rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
>>   {
>>       return RESPST_ERR_UNSUPPORTED_OPCODE;
>>   }
>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
>> +                   struct uverbs_attr_bundle *attrs)
>> +{
>> +    return -EOPNOTSUPP;
>> +}
>> +
>>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>   #endif /* RXE_LOC_H */
>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
>> index 6149d9ffe7f7..e5c60b061d7e 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>>       return RESPST_NONE;
>>   }
>> +
>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>> +                   enum ib_uverbs_advise_mr_advice advice,
>> +                   u32 pf_flags, struct ib_sge *sg_list,
>> +                   u32 num_sge)
>> +{
>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>> +    unsigned int i;
>> +    int ret = 0;
>> +
>> +    for (i = 0; i < num_sge; ++i) {
> 
> i is unsigned int, num_sge is u32. Perhaps they all use u32 type?
> It is a minor problem.

Since we have dropped support for 32-bit architectures,
these types should practically be identical.

> Other than that, I am fine with this commit.
> 
> I have made tests with rdma-core. Both the synchronous and asynchronous modes can work well.

Thank you for the review and testing.

Daisuke

> 
> Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
> 
> Zhu Yanjun
> 
>> +        struct rxe_mr *mr;
>> +        struct ib_umem_odp *umem_odp;
>> +
>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>> +
>> +        if (IS_ERR(mr)) {
>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
>> +            return PTR_ERR(mr);
>> +        }
>> +
>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>> +            !mr->umem->writable) {
>> +            rxe_dbg_mr(mr, "missing write permission\n");
>> +            rxe_put(mr);
>> +            return -EPERM;
>> +        }
>> +
>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
>> +                            sg_list[i].length, pf_flags);
>> +        if (ret < 0) {
>> +            if (sg_list[i].length == 0)
>> +                continue;
>> +
>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
>> +            rxe_put(mr);
>> +            return ret;
>> +        }
>> +
>> +        umem_odp = to_ib_umem_odp(mr->umem);
>> +        mutex_unlock(&umem_odp->umem_mutex);
>> +
>> +        rxe_put(mr);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>> +                     enum ib_uverbs_advise_mr_advice advice,
>> +                     u32 flags, struct ib_sge *sg_list, u32 num_sge)
>> +{
>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>> +
>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
>> +
>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
>> +
>> +    /* Synchronous call */
>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
>> +                           num_sge);
>> +
>> +    /* Asynchronous call is "best-effort" */
>> +
>> +    return 0;
>> +}
>> +
>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
>> +             enum ib_uverbs_advise_mr_advice advice,
>> +             u32 flags,
>> +             struct ib_sge *sg_list,
>> +             u32 num_sge,
>> +             struct uverbs_attr_bundle *attrs)
>> +{
>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>> +        return -EOPNOTSUPP;
>> +
>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
>> +                     sg_list, num_sge);
>> +}
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous prefetch for ODP MRs
  2025-05-05 15:25   ` Zhu Yanjun
@ 2025-05-09 12:19     ` Daisuke Matsuda
  2025-05-09 12:52       ` Zhu Yanjun
  2025-05-09 14:48       ` Zhu Yanjun
  0 siblings, 2 replies; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-09 12:19 UTC (permalink / raw)
  To: Zhu Yanjun, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000



On 2025/05/06 0:25, Zhu Yanjun wrote:
> On 03.05.25 15:42, Daisuke Matsuda wrote:
>> Calling ibv_advise_mr(3) with flags other than IBV_ADVISE_MR_FLAG_FLUSH
>> invokes asynchronous requests. It is best-effort, and thus can safely be
>> deferred to the system-wide workqueue.
>>
>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
> 
> I have made tests with rdma-core after applying this patch series. It seems that it can work well.
> I read through this commit. Other than the following minor problems, I am fine with this commit.
> 
> Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
> 
>> ---
>>   drivers/infiniband/sw/rxe/rxe_odp.c | 81 ++++++++++++++++++++++++++++-
>>   1 file changed, 80 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
>> index e5c60b061d7e..d98b385a18ce 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>> @@ -425,6 +425,73 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>>       return RESPST_NONE;
>>   }
>> +struct prefetch_mr_work {
>> +    struct work_struct work;
>> +    u32 pf_flags;
>> +    u32 num_sge;
>> +    struct {
>> +        u64 io_virt;
>> +        struct rxe_mr *mr;
>> +        size_t length;
>> +    } frags[];
>> +};
> 
> The struct prefetch_mr_work should be moved into a header file? IMO, it is better to move this struct to rxe_loc.h?

This struct is not likely to be used in other files.
I think leaving it here would be easier for other developers to understand because relevant codes are gathered.
If there is any specific reason to move, I will do so.

> 
>> +
>> +static void rxe_ib_prefetch_mr_work(struct work_struct *w)
>> +{
>> +    struct prefetch_mr_work *work =
>> +        container_of(w, struct prefetch_mr_work, work);
>> +    int ret;
>> +    u32 i;
>> +
>> +    /* We rely on IB/core that work is executed if we have num_sge != 0 only. */
>> +    WARN_ON(!work->num_sge);
>> +    for (i = 0; i < work->num_sge; ++i) {
>> +        struct ib_umem_odp *umem_odp;
>> +
>> +        ret = rxe_odp_do_pagefault_and_lock(work->frags[i].mr, work->frags[i].io_virt,
>> +                            work->frags[i].length, work->pf_flags);
>> +        if (ret < 0) {
>> +            rxe_dbg_mr(work->frags[i].mr, "failed to prefetch the mr\n");
>> +            continue;
>> +        }
>> +
>> +        umem_odp = to_ib_umem_odp(work->frags[i].mr->umem);
>> +        mutex_unlock(&umem_odp->umem_mutex);
> 
> Obviously this function is dependent on the mutex lock umem_mutex. So in the beginning of this function, it is better to add lockdep_assert_held(&umem_odp->umem_mutex)?

The mutex **must not** be locked at the beginning, so
perhaps we can add lockdep_assert_not_held() instead at the beginning,
but this one is not used frequently, and we can do without that.

umem_mutex is locked in rxe_odp_do_pagefault_and_lock() in the for loop.
The function calls ib_umem_odp_map_dma_and_lock(), which locks the mutex only when pagefault is successful.
If ib_umem_odp_map_dma_and_lock() fails, an error is returned and then the mutex is not locked.

Daisuke

> 
> Zhu Yanjun
> 
>> +    }
>> +
>> +    kvfree(work);
>> +}
>> +
>> +static int rxe_init_prefetch_work(struct ib_pd *ibpd,
>> +                  enum ib_uverbs_advise_mr_advice advice,
>> +                  u32 pf_flags, struct prefetch_mr_work *work,
>> +                  struct ib_sge *sg_list, u32 num_sge)
>> +{
>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>> +    u32 i;
>> +
>> +    INIT_WORK(&work->work, rxe_ib_prefetch_mr_work);
>> +    work->pf_flags = pf_flags;
>> +
>> +    for (i = 0; i < num_sge; ++i) {
>> +        struct rxe_mr *mr;
>> +
>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>> +        if (IS_ERR(mr)) {
>> +            work->num_sge = i;
>> +            return PTR_ERR(mr);
>> +        }
>> +        work->frags[i].io_virt = sg_list[i].addr;
>> +        work->frags[i].length = sg_list[i].length;
>> +        work->frags[i].mr = mr;
>> +
>> +        rxe_put(mr);
>> +    }
>> +    work->num_sge = num_sge;
>> +    return 0;
>> +}
>> +
>>   static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>>                      enum ib_uverbs_advise_mr_advice advice,
>>                      u32 pf_flags, struct ib_sge *sg_list,
>> @@ -478,6 +545,8 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>>                        u32 flags, struct ib_sge *sg_list, u32 num_sge)
>>   {
>>       u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>> +    struct prefetch_mr_work *work;
>> +    int rc;
>>       if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>>           pf_flags |= RXE_PAGEFAULT_RDONLY;
>> @@ -490,7 +559,17 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>>           return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
>>                              num_sge);
>> -    /* Asynchronous call is "best-effort" */
>> +    /* Asynchronous call is "best-effort" and allowed to fail */
>> +    work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
>> +    if (!work)
>> +        return -ENOMEM;
>> +
>> +    rc = rxe_init_prefetch_work(ibpd, advice, pf_flags, work, sg_list, num_sge);
>> +    if (rc) {
>> +        kvfree(work);
>> +        return rc;
>> +    }
>> +    queue_work(system_unbound_wq, &work->work);
>>       return 0;
>>   }
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous prefetch for ODP MRs
  2025-05-09 12:19     ` Daisuke Matsuda
@ 2025-05-09 12:52       ` Zhu Yanjun
  2025-05-09 14:48       ` Zhu Yanjun
  1 sibling, 0 replies; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-09 12:52 UTC (permalink / raw)
  To: Daisuke Matsuda, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 09.05.25 14:19, Daisuke Matsuda wrote:
> 
> 
> On 2025/05/06 0:25, Zhu Yanjun wrote:
>> On 03.05.25 15:42, Daisuke Matsuda wrote:
>>> Calling ibv_advise_mr(3) with flags other than IBV_ADVISE_MR_FLAG_FLUSH
>>> invokes asynchronous requests. It is best-effort, and thus can safely be
>>> deferred to the system-wide workqueue.
>>>
>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>>
>> I have made tests with rdma-core after applying this patch series. It 
>> seems that it can work well.
>> I read through this commit. Other than the following minor problems, I 
>> am fine with this commit.
>>
>> Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>
>>> ---
>>>   drivers/infiniband/sw/rxe/rxe_odp.c | 81 ++++++++++++++++++++++++++++-
>>>   1 file changed, 80 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c 
>>> b/drivers/infiniband/sw/rxe/rxe_odp.c
>>> index e5c60b061d7e..d98b385a18ce 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>>> @@ -425,6 +425,73 @@ enum resp_states rxe_odp_do_atomic_write(struct 
>>> rxe_mr *mr, u64 iova, u64 value)
>>>       return RESPST_NONE;
>>>   }
>>> +struct prefetch_mr_work {
>>> +    struct work_struct work;
>>> +    u32 pf_flags;
>>> +    u32 num_sge;
>>> +    struct {
>>> +        u64 io_virt;
>>> +        struct rxe_mr *mr;
>>> +        size_t length;
>>> +    } frags[];
>>> +};
>>
>> The struct prefetch_mr_work should be moved into header file? IMO, it 
>> is better to move this struct to rxe_loc.h?
> 
> This struct is not likely to be used in other files.
> I think leaving it here would be easier for other developers to 
> understand because relevant codes are gathered.
> If there is any specific reason to move, I will do so.
> 
>>
>>> +
>>> +static void rxe_ib_prefetch_mr_work(struct work_struct *w)
>>> +{
>>> +    struct prefetch_mr_work *work =
>>> +        container_of(w, struct prefetch_mr_work, work);
>>> +    int ret;
>>> +    u32 i;
>>> +
>>> +    /* We rely on IB/core that work is executed if we have num_sge 
>>> != 0 only. */
>>> +    WARN_ON(!work->num_sge);
>>> +    for (i = 0; i < work->num_sge; ++i) {
>>> +        struct ib_umem_odp *umem_odp;
>>> +
>>> +        ret = rxe_odp_do_pagefault_and_lock(work->frags[i].mr, 
>>> work->frags[i].io_virt,
>>> +                            work->frags[i].length, work->pf_flags);
>>> +        if (ret < 0) {
>>> +            rxe_dbg_mr(work->frags[i].mr, "failed to prefetch the 
>>> mr\n");
>>> +            continue;
>>> +        }
>>> +
>>> +        umem_odp = to_ib_umem_odp(work->frags[i].mr->umem);
>>> +        mutex_unlock(&umem_odp->umem_mutex);
>>
>> Obviously this function is dependent on the mutex lock umem_mutex. So 
>> in the beginning of this function, it is better to  add 
>> lockdep_assert_held(&umem_odp->umem_mutex)?
> 
> The mutex **must not** be locked at the beginning, so
> perhaps we can add lockdep_assert_not_held() instead at the beginning,
> but this one is not used frequently, and we can do without that.
> 
> umem_mutex is locked in rxe_odp_do_pagefault_and_lock() in the for loop.
> The function calls ib_umem_odp_map_dma_and_lock(), which locks the mutex 
> only when pagefault is successful.

Sure. umem_mutex is locked in other functions. It is not necessary to 
use lockdep_assert_held at the beginning of this function.

Zhu Yanjun

> If ib_umem_odp_map_dma_and_lock() fails, an error is returned and then 
> the mutex is not locked.
> 
> Daisuke
> 
>>
>> Zhu Yanjun
>>
>>> +    }
>>> +
>>> +    kvfree(work);
>>> +}
>>> +
>>> +static int rxe_init_prefetch_work(struct ib_pd *ibpd,
>>> +                  enum ib_uverbs_advise_mr_advice advice,
>>> +                  u32 pf_flags, struct prefetch_mr_work *work,
>>> +                  struct ib_sge *sg_list, u32 num_sge)
>>> +{
>>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>>> +    u32 i;
>>> +
>>> +    INIT_WORK(&work->work, rxe_ib_prefetch_mr_work);
>>> +    work->pf_flags = pf_flags;
>>> +
>>> +    for (i = 0; i < num_sge; ++i) {
>>> +        struct rxe_mr *mr;
>>> +
>>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>>> +        if (IS_ERR(mr)) {
>>> +            work->num_sge = i;
>>> +            return PTR_ERR(mr);
>>> +        }
>>> +        work->frags[i].io_virt = sg_list[i].addr;
>>> +        work->frags[i].length = sg_list[i].length;
>>> +        work->frags[i].mr = mr;
>>> +
>>> +        rxe_put(mr);
>>> +    }
>>> +    work->num_sge = num_sge;
>>> +    return 0;
>>> +}
>>> +
>>>   static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>>>                      enum ib_uverbs_advise_mr_advice advice,
>>>                      u32 pf_flags, struct ib_sge *sg_list,
>>> @@ -478,6 +545,8 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd 
>>> *ibpd,
>>>                        u32 flags, struct ib_sge *sg_list, u32 num_sge)
>>>   {
>>>       u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>>> +    struct prefetch_mr_work *work;
>>> +    int rc;
>>>       if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>>>           pf_flags |= RXE_PAGEFAULT_RDONLY;
>>> @@ -490,7 +559,17 @@ static int rxe_ib_advise_mr_prefetch(struct 
>>> ib_pd *ibpd,
>>>           return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, 
>>> sg_list,
>>>                              num_sge);
>>> -    /* Asynchronous call is "best-effort" */
>>> +    /* Asynchronous call is "best-effort" and allowed to fail */
>>> +    work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
>>> +    if (!work)
>>> +        return -ENOMEM;
>>> +
>>> +    rc = rxe_init_prefetch_work(ibpd, advice, pf_flags, work, 
>>> sg_list, num_sge);
>>> +    if (rc) {
>>> +        kvfree(work);
>>> +        return rc;
>>> +    }
>>> +    queue_work(system_unbound_wq, &work->work);
>>>       return 0;
>>>   }
>>
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous prefetch for ODP MRs
  2025-05-09 12:19     ` Daisuke Matsuda
  2025-05-09 12:52       ` Zhu Yanjun
@ 2025-05-09 14:48       ` Zhu Yanjun
  1 sibling, 0 replies; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-09 14:48 UTC (permalink / raw)
  To: Daisuke Matsuda, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 09.05.25 14:19, Daisuke Matsuda wrote:
> 
> 
> On 2025/05/06 0:25, Zhu Yanjun wrote:
>> On 03.05.25 15:42, Daisuke Matsuda wrote:
>>> Calling ibv_advise_mr(3) with flags other than IBV_ADVISE_MR_FLAG_FLUSH
>>> invokes asynchronous requests. It is best-effort, and thus can safely be
>>> deferred to the system-wide workqueue.
>>>
>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>>
>> I have made tests with rdma-core after applying this patch series. It 
>> seems that it can work well.
>> I read through this commit. Other than the following minor problems, I 
>> am fine with this commit.
>>
>> Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>
>>> ---
>>>   drivers/infiniband/sw/rxe/rxe_odp.c | 81 ++++++++++++++++++++++++++++-
>>>   1 file changed, 80 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c 
>>> b/drivers/infiniband/sw/rxe/rxe_odp.c
>>> index e5c60b061d7e..d98b385a18ce 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>>> @@ -425,6 +425,73 @@ enum resp_states rxe_odp_do_atomic_write(struct 
>>> rxe_mr *mr, u64 iova, u64 value)
>>>       return RESPST_NONE;
>>>   }
>>> +struct prefetch_mr_work {
>>> +    struct work_struct work;
>>> +    u32 pf_flags;
>>> +    u32 num_sge;
>>> +    struct {
>>> +        u64 io_virt;
>>> +        struct rxe_mr *mr;
>>> +        size_t length;
>>> +    } frags[];
>>> +};
>>
>> The struct prefetch_mr_work should be moved into header file? IMO, it 
>> is better to move this struct to rxe_loc.h?
> 
> This struct is not likely to be used in other files.

Normally a struct should be in a header file. Not mandatory.

Zhu Yanjun

> I think leaving it here would be easier for other developers to 
> understand because relevant codes are gathered.
> If there is any specific reason to move, I will do so.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-03 13:42 ` [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs Daisuke Matsuda
  2025-05-05  7:57   ` Zhu Yanjun
@ 2025-05-09 15:19   ` Zhu Yanjun
  2025-05-10  2:46     ` Daisuke Matsuda
  1 sibling, 1 reply; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-09 15:19 UTC (permalink / raw)
  To: Daisuke Matsuda, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 03.05.25 15:42, Daisuke Matsuda wrote:
> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
> which are best-effort, will be added subsequently.
> 
> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
> ---
>   drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>   drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>   drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
>   3 files changed, 103 insertions(+)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 3a77d6db1720..e891199cbdef 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>   	mutex_destroy(&rxe->usdev_lock);
>   }
>   
> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
> +	.advise_mr = rxe_ib_advise_mr,
> +};
> +
>   /* initialize rxe device parameters */
>   static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>   {
> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>   		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>   		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
>   		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
> +
> +		/* set handler for ODP prefetching API - ibv_advise_mr(3) */
> +		ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>   	}
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index f7dbb9cddd12..21b070f3dbb8 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
>   int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>   			    unsigned int length);
>   enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> +		     u32 flags, struct ib_sge *sg_list, u32 num_sge,
> +		     struct uverbs_attr_bundle *attrs);
>   #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>   static inline int
>   rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
>   {
>   	return RESPST_ERR_UNSUPPORTED_OPCODE;
>   }
> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> +				   u32 flags, struct ib_sge *sg_list, u32 num_sge,
> +				   struct uverbs_attr_bundle *attrs)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>   
>   #endif /* RXE_LOC_H */
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index 6149d9ffe7f7..e5c60b061d7e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>   
>   	return RESPST_NONE;
>   }
> +
> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
> +				   enum ib_uverbs_advise_mr_advice advice,
> +				   u32 pf_flags, struct ib_sge *sg_list,
> +				   u32 num_sge)
> +{
> +	struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
> +	unsigned int i;
> +	int ret = 0;
> +
> +	for (i = 0; i < num_sge; ++i) {
> +		struct rxe_mr *mr;
> +		struct ib_umem_odp *umem_odp;
> +
> +		mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
> +			       sg_list[i].lkey, RXE_LOOKUP_LOCAL);
> +
> +		if (IS_ERR(mr)) {
> +			rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
> +			return PTR_ERR(mr);
> +		}
> +
> +		if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> +		    !mr->umem->writable) {
> +			rxe_dbg_mr(mr, "missing write permission\n");
> +			rxe_put(mr);
> +			return -EPERM;
> +		}
> +
> +		ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
> +						    sg_list[i].length, pf_flags);
> +		if (ret < 0) {
> +			if (sg_list[i].length == 0)
> +				continue;
> +
> +			rxe_dbg_mr(mr, "failed to prefetch the mr\n");
> +			rxe_put(mr);
> +			return ret;
> +		}
> +
> +		umem_odp = to_ib_umem_odp(mr->umem);
> +		mutex_unlock(&umem_odp->umem_mutex);
> +
> +		rxe_put(mr);
> +	}
> +
> +	return 0;
> +}
> +
> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
> +				     enum ib_uverbs_advise_mr_advice advice,
> +				     u32 flags, struct ib_sge *sg_list, u32 num_sge)
> +{
> +	u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
> +
> +	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
> +		pf_flags |= RXE_PAGEFAULT_RDONLY;
> +
> +	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> +		pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
> +
> +	/* Synchronous call */
> +	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
> +		return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
> +					       num_sge);
> +
> +	/* Asynchronous call is "best-effort" */

Asynchronous call is not implemented now, why does this comment appear?

Zhu Yanjun

> +
> +	return 0;
> +}
> +
> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
> +		     enum ib_uverbs_advise_mr_advice advice,
> +		     u32 flags,
> +		     struct ib_sge *sg_list,
> +		     u32 num_sge,
> +		     struct uverbs_attr_bundle *attrs)
> +{
> +	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
> +	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> +	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> +		return -EOPNOTSUPP;
> +
> +	return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
> +					 sg_list, num_sge);
> +}


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-09 15:19   ` Zhu Yanjun
@ 2025-05-10  2:46     ` Daisuke Matsuda
  2025-05-10  4:43       ` Zhu Yanjun
  0 siblings, 1 reply; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-10  2:46 UTC (permalink / raw)
  To: Zhu Yanjun, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 2025/05/10 0:19, Zhu Yanjun wrote:
> On 03.05.25 15:42, Daisuke Matsuda wrote:
>> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
>> which are best-effort, will be added subsequently.
>>
>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>> ---
>>   drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>>   drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>>   drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
>>   3 files changed, 103 insertions(+)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index 3a77d6db1720..e891199cbdef 100644
>> --- a/drivers/infiniband/sw/rxe/rxe.c
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>       mutex_destroy(&rxe->usdev_lock);
>>   }
>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
>> +    .advise_mr = rxe_ib_advise_mr,
>> +};
>> +
>>   /* initialize rxe device parameters */
>>   static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>   {
>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
>> +
>> +        /* set handler for ODP prefetching API - ibv_advise_mr(3) */
>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>>       }
>>   }
>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
>> index f7dbb9cddd12..21b070f3dbb8 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
>>   int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>>                   unsigned int length);
>>   enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
>> +             struct uverbs_attr_bundle *attrs);
>>   #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>   static inline int
>>   rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
>>   {
>>       return RESPST_ERR_UNSUPPORTED_OPCODE;
>>   }
>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
>> +                   struct uverbs_attr_bundle *attrs)
>> +{
>> +    return -EOPNOTSUPP;
>> +}
>> +
>>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>   #endif /* RXE_LOC_H */
>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
>> index 6149d9ffe7f7..e5c60b061d7e 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>>       return RESPST_NONE;
>>   }
>> +
>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>> +                   enum ib_uverbs_advise_mr_advice advice,
>> +                   u32 pf_flags, struct ib_sge *sg_list,
>> +                   u32 num_sge)
>> +{
>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>> +    unsigned int i;
>> +    int ret = 0;
>> +
>> +    for (i = 0; i < num_sge; ++i) {
>> +        struct rxe_mr *mr;
>> +        struct ib_umem_odp *umem_odp;
>> +
>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>> +
>> +        if (IS_ERR(mr)) {
>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
>> +            return PTR_ERR(mr);
>> +        }
>> +
>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>> +            !mr->umem->writable) {
>> +            rxe_dbg_mr(mr, "missing write permission\n");
>> +            rxe_put(mr);
>> +            return -EPERM;
>> +        }
>> +
>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
>> +                            sg_list[i].length, pf_flags);
>> +        if (ret < 0) {
>> +            if (sg_list[i].length == 0)
>> +                continue;
>> +
>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
>> +            rxe_put(mr);
>> +            return ret;
>> +        }
>> +
>> +        umem_odp = to_ib_umem_odp(mr->umem);
>> +        mutex_unlock(&umem_odp->umem_mutex);
>> +
>> +        rxe_put(mr);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>> +                     enum ib_uverbs_advise_mr_advice advice,
>> +                     u32 flags, struct ib_sge *sg_list, u32 num_sge)
>> +{
>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>> +
>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
>> +
>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
>> +
>> +    /* Synchronous call */
>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
>> +                           num_sge);
>> +
>> +    /* Asynchronous call is "best-effort" */
> 
> Asynchronous call is not implemented now, why does this comment appear?

Even without the 2nd patch, async calls are reported as successful.
The comment is inserted to show the reason, which is based on the
description from 'man 3 ibv_advise_mr' as follows:
===
An application may pre-fetch any address range within an ODP MR when using the IBV_ADVISE_MR_ADVICE_PREFETCH or IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE advice. Semantically, this operation is best-effort. That means the kernel does not guarantee that underlying pages are updated in the HCA or the pre-fetched pages would remain resident.
===

Thanks,
Daisuke

> 
> Zhu Yanjun
> 
>> +
>> +    return 0;
>> +}
>> +
>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
>> +             enum ib_uverbs_advise_mr_advice advice,
>> +             u32 flags,
>> +             struct ib_sge *sg_list,
>> +             u32 num_sge,
>> +             struct uverbs_attr_bundle *attrs)
>> +{
>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>> +        return -EOPNOTSUPP;
>> +
>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
>> +                     sg_list, num_sge);
>> +}
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-10  2:46     ` Daisuke Matsuda
@ 2025-05-10  4:43       ` Zhu Yanjun
  2025-05-10  7:18         ` Daisuke Matsuda
  0 siblings, 1 reply; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-10  4:43 UTC (permalink / raw)
  To: Daisuke Matsuda, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000


在 2025/5/10 4:46, Daisuke Matsuda 写道:
> On 2025/05/10 0:19, Zhu Yanjun wrote:
>> On 03.05.25 15:42, Daisuke Matsuda wrote:
>>> Minimal implementation of ibv_advise_mr(3) requires synchronous 
>>> calls being
>>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous 
>>> requests,
>>> which are best-effort, will be added subsequently.
>>>
>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>>> ---
>>>   drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>>>   drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>>>   drivers/infiniband/sw/rxe/rxe_odp.c | 86 
>>> +++++++++++++++++++++++++++++
>>>   3 files changed, 103 insertions(+)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c 
>>> b/drivers/infiniband/sw/rxe/rxe.c
>>> index 3a77d6db1720..e891199cbdef 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>>       mutex_destroy(&rxe->usdev_lock);
>>>   }
>>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
>>> +    .advise_mr = rxe_ib_advise_mr,
>>> +};
>>> +
>>>   /* initialize rxe device parameters */
>>>   static void rxe_init_device_param(struct rxe_dev *rxe, struct 
>>> net_device *ndev)
>>>   {
>>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev 
>>> *rxe, struct net_device *ndev)
>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= 
>>> IB_ODP_SUPPORT_SRQ_RECV;
>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= 
>>> IB_ODP_SUPPORT_FLUSH;
>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= 
>>> IB_ODP_SUPPORT_ATOMIC_WRITE;
>>> +
>>> +        /* set handler for ODP prefetching API - ibv_advise_mr(3) */
>>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>>>       }
>>>   }
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h 
>>> b/drivers/infiniband/sw/rxe/rxe_loc.h
>>> index f7dbb9cddd12..21b070f3dbb8 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
>>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
>>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr 
>>> *mr, u64 iova, int opcode,
>>>   int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>>>                   unsigned int length);
>>>   enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 
>>> iova, u64 value);
>>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum 
>>> ib_uverbs_advise_mr_advice advice,
>>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>> +             struct uverbs_attr_bundle *attrs);
>>>   #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>   static inline int
>>>   rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, 
>>> u64 iova,
>>> @@ -225,6 +228,13 @@ static inline enum resp_states 
>>> rxe_odp_do_atomic_write(struct rxe_mr *mr,
>>>   {
>>>       return RESPST_ERR_UNSUPPORTED_OPCODE;
>>>   }
>>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum 
>>> ib_uverbs_advise_mr_advice advice,
>>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>> +                   struct uverbs_attr_bundle *attrs)
>>> +{
>>> +    return -EOPNOTSUPP;
>>> +}
>>> +
>>>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>   #endif /* RXE_LOC_H */
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c 
>>> b/drivers/infiniband/sw/rxe/rxe_odp.c
>>> index 6149d9ffe7f7..e5c60b061d7e 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>>> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct 
>>> rxe_mr *mr, u64 iova, u64 value)
>>>       return RESPST_NONE;
>>>   }
>>> +
>>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>>> +                   enum ib_uverbs_advise_mr_advice advice,
>>> +                   u32 pf_flags, struct ib_sge *sg_list,
>>> +                   u32 num_sge)
>>> +{
>>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>>> +    unsigned int i;
>>> +    int ret = 0;
>>> +
>>> +    for (i = 0; i < num_sge; ++i) {
>>> +        struct rxe_mr *mr;
>>> +        struct ib_umem_odp *umem_odp;
>>> +
>>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>>> +
>>> +        if (IS_ERR(mr)) {
>>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", 
>>> sg_list[i].lkey);
>>> +            return PTR_ERR(mr);
>>> +        }
>>> +
>>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>> +            !mr->umem->writable) {
>>> +            rxe_dbg_mr(mr, "missing write permission\n");
>>> +            rxe_put(mr);
>>> +            return -EPERM;
>>> +        }
>>> +
>>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
>>> +                            sg_list[i].length, pf_flags);
>>> +        if (ret < 0) {
>>> +            if (sg_list[i].length == 0)
>>> +                continue;
>>> +
>>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
>>> +            rxe_put(mr);
>>> +            return ret;
>>> +        }
>>> +
>>> +        umem_odp = to_ib_umem_odp(mr->umem);
>>> +        mutex_unlock(&umem_odp->umem_mutex);
>>> +
>>> +        rxe_put(mr);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>>> +                     enum ib_uverbs_advise_mr_advice advice,
>>> +                     u32 flags, struct ib_sge *sg_list, u32 num_sge)
>>> +{
>>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>>> +
>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
>>> +
>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
>>> +
>>> +    /* Synchronous call */
>>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
>>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, 
>>> sg_list,
>>> +                           num_sge);
>>> +
>>> +    /* Asynchronous call is "best-effort" */
>>
>> Asynchronous call is not implemented now, why does this comment appear?
>
> Even without the 2nd patch, async calls are reported as successful.


Async call is not implemented. How to call "async calls are reported as 
successful"?


Zhu Yanjun

> The comment is inserted to show the reason, which is based on the
> description from 'man 3 ibv_advise_mr' as follows:
> ===
> An application may pre-fetch any address range within an ODP MR when 
> using the IBV_ADVISE_MR_ADVICE_PREFETCH or 
> IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE advice. Semantically, this 
> operation is best-effort. That means the kernel does not guarantee 
> that underlying pages are updated in the HCA or the pre-fetched pages 
> would remain resident.
> ===
>
> Thanks,
> Daisuke
>
>>
>> Zhu Yanjun
>>
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
>>> +             enum ib_uverbs_advise_mr_advice advice,
>>> +             u32 flags,
>>> +             struct ib_sge *sg_list,
>>> +             u32 num_sge,
>>> +             struct uverbs_attr_bundle *attrs)
>>> +{
>>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
>>> +                     sg_list, num_sge);
>>> +}
>>
>
-- 
Best Regards,
Yanjun.Zhu


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-10  4:43       ` Zhu Yanjun
@ 2025-05-10  7:18         ` Daisuke Matsuda
  2025-05-10  8:04           ` Greg Sword
  0 siblings, 1 reply; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-10  7:18 UTC (permalink / raw)
  To: Zhu Yanjun, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On 2025/05/10 13:43, Zhu Yanjun wrote:
> 
> 在 2025/5/10 4:46, Daisuke Matsuda 写道:
>> On 2025/05/10 0:19, Zhu Yanjun wrote:
>>> On 03.05.25 15:42, Daisuke Matsuda wrote:
>>>> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
>>>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
>>>> which are best-effort, will be added subsequently.
>>>>
>>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>>>> ---
>>>>   drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>>>>   drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>>>>   drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
>>>>   3 files changed, 103 insertions(+)
>>>>
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>> index 3a77d6db1720..e891199cbdef 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>>>       mutex_destroy(&rxe->usdev_lock);
>>>>   }
>>>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
>>>> +    .advise_mr = rxe_ib_advise_mr,
>>>> +};
>>>> +
>>>>   /* initialize rxe device parameters */
>>>>   static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>>>   {
>>>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
>>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
>>>> +
>>>> +        /* set handler for ODP prefetching API - ibv_advise_mr(3) */
>>>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>>>>       }
>>>>   }
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
>>>> index f7dbb9cddd12..21b070f3dbb8 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
>>>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
>>>>   int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>>>>                   unsigned int length);
>>>>   enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
>>>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>>>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>> +             struct uverbs_attr_bundle *attrs);
>>>>   #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>   static inline int
>>>>   rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>>>> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
>>>>   {
>>>>       return RESPST_ERR_UNSUPPORTED_OPCODE;
>>>>   }
>>>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>>>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>> +                   struct uverbs_attr_bundle *attrs)
>>>> +{
>>>> +    return -EOPNOTSUPP;
>>>> +}
>>>> +
>>>>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>   #endif /* RXE_LOC_H */
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
>>>> index 6149d9ffe7f7..e5c60b061d7e 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>>>> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>>>>       return RESPST_NONE;
>>>>   }
>>>> +
>>>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>>>> +                   enum ib_uverbs_advise_mr_advice advice,
>>>> +                   u32 pf_flags, struct ib_sge *sg_list,
>>>> +                   u32 num_sge)
>>>> +{
>>>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>>>> +    unsigned int i;
>>>> +    int ret = 0;
>>>> +
>>>> +    for (i = 0; i < num_sge; ++i) {
>>>> +        struct rxe_mr *mr;
>>>> +        struct ib_umem_odp *umem_odp;
>>>> +
>>>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>>>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>>>> +
>>>> +        if (IS_ERR(mr)) {
>>>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
>>>> +            return PTR_ERR(mr);
>>>> +        }
>>>> +
>>>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>> +            !mr->umem->writable) {
>>>> +            rxe_dbg_mr(mr, "missing write permission\n");
>>>> +            rxe_put(mr);
>>>> +            return -EPERM;
>>>> +        }
>>>> +
>>>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
>>>> +                            sg_list[i].length, pf_flags);
>>>> +        if (ret < 0) {
>>>> +            if (sg_list[i].length == 0)
>>>> +                continue;
>>>> +
>>>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
>>>> +            rxe_put(mr);
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        umem_odp = to_ib_umem_odp(mr->umem);
>>>> +        mutex_unlock(&umem_odp->umem_mutex);
>>>> +
>>>> +        rxe_put(mr);
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>>>> +                     enum ib_uverbs_advise_mr_advice advice,
>>>> +                     u32 flags, struct ib_sge *sg_list, u32 num_sge)
>>>> +{
>>>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>>>> +
>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>>>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
>>>> +
>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
>>>> +
>>>> +    /* Synchronous call */
>>>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
>>>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
>>>> +                           num_sge);
>>>> +
>>>> +    /* Asynchronous call is "best-effort" */
>>>
>>> Asynchronous call is not implemented now, why does this comment appear?
>>
>> Even without the 2nd patch, async calls are reported as successful.
> 
> 
> Async call is not implemented. How to call "async calls are reported as successful"?

Please see the manual.
cf. https://manpages.debian.org/testing/libibverbs-dev/ibv_advise_mr.3.en.html

If IBV_ADVISE_MR_FLAG_FLUSH is not given to 'flags' parameter,
then this function 'rxe_ib_advise_mr_prefetch()' simply returns 0.
Consequently, ibv_advise_mr(3) and underlying ioctl(2) get no error.
This behaviour is allowed in the spec as I quoted in the last reply.

It might be nice to return -EOPNOTSUPP just below the comment instead,
but not doing so is acceptable according to the spec. Additionally,
such a change will be overwritten in the next patch after all.

Thanks,
Daisuke

> 
> 
> Zhu Yanjun
> 
>> The comment is inserted to show the reason, which is based on the
>> description from 'man 3 ibv_advise_mr' as follows:
>> ===
>> An application may pre-fetch any address range within an ODP MR when using the IBV_ADVISE_MR_ADVICE_PREFETCH or IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE advice. Semantically, this operation is best-effort. That means the kernel does not guarantee that underlying pages are updated in the HCA or the pre-fetched pages would remain resident.
>> ===
>>
>> Thanks,
>> Daisuke
>>
>>>
>>> Zhu Yanjun
>>>
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
>>>> +             enum ib_uverbs_advise_mr_advice advice,
>>>> +             u32 flags,
>>>> +             struct ib_sge *sg_list,
>>>> +             u32 num_sge,
>>>> +             struct uverbs_attr_bundle *attrs)
>>>> +{
>>>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>> +        return -EOPNOTSUPP;
>>>> +
>>>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
>>>> +                     sg_list, num_sge);
>>>> +}
>>>
>>


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-10  7:18         ` Daisuke Matsuda
@ 2025-05-10  8:04           ` Greg Sword
  2025-05-11  2:06             ` Daisuke Matsuda
  0 siblings, 1 reply; 19+ messages in thread
From: Greg Sword @ 2025-05-10  8:04 UTC (permalink / raw)
  To: Daisuke Matsuda
  Cc: Zhu Yanjun, linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

On Sat, May 10, 2025 at 3:19 PM Daisuke Matsuda <dskmtsd@gmail.com> wrote:
>
> On 2025/05/10 13:43, Zhu Yanjun wrote:
> >
> > 在 2025/5/10 4:46, Daisuke Matsuda 写道:
> >> On 2025/05/10 0:19, Zhu Yanjun wrote:
> >>> On 03.05.25 15:42, Daisuke Matsuda wrote:
> >>>> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
> >>>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
> >>>> which are best-effort, will be added subsequently.
> >>>>
> >>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
> >>>> ---
> >>>>   drivers/infiniband/sw/rxe/rxe.c     |  7 +++
> >>>>   drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
> >>>>   drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
> >>>>   3 files changed, 103 insertions(+)
> >>>>
> >>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> >>>> index 3a77d6db1720..e891199cbdef 100644
> >>>> --- a/drivers/infiniband/sw/rxe/rxe.c
> >>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
> >>>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
> >>>>       mutex_destroy(&rxe->usdev_lock);
> >>>>   }
> >>>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
> >>>> +    .advise_mr = rxe_ib_advise_mr,
> >>>> +};
> >>>> +
> >>>>   /* initialize rxe device parameters */
> >>>>   static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
> >>>>   {
> >>>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
> >>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
> >>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
> >>>>           rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
> >>>> +
> >>>> +        /* set handler for ODP prefetching API - ibv_advise_mr(3) */
> >>>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
> >>>>       }
> >>>>   }
> >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> >>>> index f7dbb9cddd12..21b070f3dbb8 100644
> >>>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> >>>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> >>>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
> >>>>   int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
> >>>>                   unsigned int length);
> >>>>   enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
> >>>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> >>>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
> >>>> +             struct uverbs_attr_bundle *attrs);
> >>>>   #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
> >>>>   static inline int
> >>>>   rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> >>>> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
> >>>>   {
> >>>>       return RESPST_ERR_UNSUPPORTED_OPCODE;
> >>>>   }
> >>>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> >>>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
> >>>> +                   struct uverbs_attr_bundle *attrs)
> >>>> +{
> >>>> +    return -EOPNOTSUPP;
> >>>> +}
> >>>> +
> >>>>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
> >>>>   #endif /* RXE_LOC_H */
> >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> >>>> index 6149d9ffe7f7..e5c60b061d7e 100644
> >>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> >>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> >>>> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
> >>>>       return RESPST_NONE;
> >>>>   }
> >>>> +
> >>>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
> >>>> +                   enum ib_uverbs_advise_mr_advice advice,
> >>>> +                   u32 pf_flags, struct ib_sge *sg_list,
> >>>> +                   u32 num_sge)
> >>>> +{
> >>>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
> >>>> +    unsigned int i;
> >>>> +    int ret = 0;
> >>>> +
> >>>> +    for (i = 0; i < num_sge; ++i) {
> >>>> +        struct rxe_mr *mr;
> >>>> +        struct ib_umem_odp *umem_odp;
> >>>> +
> >>>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
> >>>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
> >>>> +
> >>>> +        if (IS_ERR(mr)) {
> >>>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
> >>>> +            return PTR_ERR(mr);
> >>>> +        }
> >>>> +
> >>>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> >>>> +            !mr->umem->writable) {
> >>>> +            rxe_dbg_mr(mr, "missing write permission\n");
> >>>> +            rxe_put(mr);
> >>>> +            return -EPERM;
> >>>> +        }
> >>>> +
> >>>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
> >>>> +                            sg_list[i].length, pf_flags);
> >>>> +        if (ret < 0) {
> >>>> +            if (sg_list[i].length == 0)
> >>>> +                continue;
> >>>> +
> >>>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
> >>>> +            rxe_put(mr);
> >>>> +            return ret;
> >>>> +        }
> >>>> +
> >>>> +        umem_odp = to_ib_umem_odp(mr->umem);
> >>>> +        mutex_unlock(&umem_odp->umem_mutex);
> >>>> +
> >>>> +        rxe_put(mr);
> >>>> +    }
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
> >>>> +                     enum ib_uverbs_advise_mr_advice advice,
> >>>> +                     u32 flags, struct ib_sge *sg_list, u32 num_sge)
> >>>> +{
> >>>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
> >>>> +
> >>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
> >>>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
> >>>> +
> >>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> >>>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
> >>>> +
> >>>> +    /* Synchronous call */
> >>>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
> >>>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
> >>>> +                           num_sge);
> >>>> +
> >>>> +    /* Asynchronous call is "best-effort" */
> >>>
> >>> Asynchronous call is not implemented now, why does this comment appear?
> >>
> >> Even without the 2nd patch, async calls are reported as successful.
> >
> >
> > Async call is not implemented. How to call "async calls are reported as successful"?
>
> Please see the manual.
> cf. https://manpages.debian.org/testing/libibverbs-dev/ibv_advise_mr.3.en.html
>
> If IBV_ADVISE_MR_FLAG_FLUSH is not given to 'flags' parameter,
> then this function 'rxe_ib_advise_mr_prefetch()' simply returns 0.
> Consequently, ibv_advise_mr(3) and underlying ioctl(2) get no error.
> This behaviour is allowed in the spec as I quoted in the last reply.

The functionality wasn't implemented; you added the comments first.
You're still weaseling when people point this out.

>
> It might be nice to return -EOPNOTSUPP just below the comment instead,
> but not doing so is acceptable according to the spec. Additionally,
> such a change will be overwritten in the next patch after all.

Move comments to the next patch. In this patch, return -EOPNOTSUPP.

--G--

>
> Thanks,
> Daisuke
>
> >
> >
> > Zhu Yanjun
> >
> >> The comment is inserted to show the reason, which is based on the
> >> description from 'man 3 ibv_advise_mr' as follows:
> >> ===
> >> An application may pre-fetch any address range within an ODP MR when using the IBV_ADVISE_MR_ADVICE_PREFETCH or IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE advice. Semantically, this operation is best-effort. That means the kernel does not guarantee that underlying pages are updated in the HCA or the pre-fetched pages would remain resident.
> >> ===
> >>
> >> Thanks,
> >> Daisuke
> >>
> >>>
> >>> Zhu Yanjun
> >>>
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
> >>>> +             enum ib_uverbs_advise_mr_advice advice,
> >>>> +             u32 flags,
> >>>> +             struct ib_sge *sg_list,
> >>>> +             u32 num_sge,
> >>>> +             struct uverbs_attr_bundle *attrs)
> >>>> +{
> >>>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
> >>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> >>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> >>>> +        return -EOPNOTSUPP;
> >>>> +
> >>>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
> >>>> +                     sg_list, num_sge);
> >>>> +}
> >>>
> >>
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-10  8:04           ` Greg Sword
@ 2025-05-11  2:06             ` Daisuke Matsuda
  2025-05-11  4:52               ` Zhu Yanjun
  0 siblings, 1 reply; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-11  2:06 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: linux-kernel, linux-rdma, leon, jgg, zyjzyj2000


On 2025/05/10 17:04, Greg Sword wrote:
> On Sat, May 10, 2025 at 3:19 PM Daisuke Matsuda <dskmtsd@gmail.com> wrote:
>>
>> On 2025/05/10 13:43, Zhu Yanjun wrote:
>>>
>>> 在 2025/5/10 4:46, Daisuke Matsuda 写道:
>>>> On 2025/05/10 0:19, Zhu Yanjun wrote:
>>>>> On 03.05.25 15:42, Daisuke Matsuda wrote:
>>>>>> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
>>>>>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
>>>>>> which are best-effort, will be added subsequently.
>>>>>>
>>>>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>>>>>> ---
>>>>>>    drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>>>>>>    drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>>>>>>    drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
>>>>>>    3 files changed, 103 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>>>> index 3a77d6db1720..e891199cbdef 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>>>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>>>>>        mutex_destroy(&rxe->usdev_lock);
>>>>>>    }
>>>>>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
>>>>>> +    .advise_mr = rxe_ib_advise_mr,
>>>>>> +};
>>>>>> +
>>>>>>    /* initialize rxe device parameters */
>>>>>>    static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>>>>>    {
>>>>>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
>>>>>> +
>>>>>> +        /* set handler for ODP prefetching API - ibv_advise_mr(3) */
>>>>>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>>>>>>        }
>>>>>>    }
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
>>>>>> index f7dbb9cddd12..21b070f3dbb8 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
>>>>>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
>>>>>>    int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>>>>>>                    unsigned int length);
>>>>>>    enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
>>>>>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>>>>>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>>>> +             struct uverbs_attr_bundle *attrs);
>>>>>>    #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>>>    static inline int
>>>>>>    rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>>>>>> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
>>>>>>    {
>>>>>>        return RESPST_ERR_UNSUPPORTED_OPCODE;
>>>>>>    }
>>>>>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>>>>>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>>>> +                   struct uverbs_attr_bundle *attrs)
>>>>>> +{
>>>>>> +    return -EOPNOTSUPP;
>>>>>> +}
>>>>>> +
>>>>>>    #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>>>    #endif /* RXE_LOC_H */
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
>>>>>> index 6149d9ffe7f7..e5c60b061d7e 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>>>>>> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>>>>>>        return RESPST_NONE;
>>>>>>    }
>>>>>> +
>>>>>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>>>>>> +                   enum ib_uverbs_advise_mr_advice advice,
>>>>>> +                   u32 pf_flags, struct ib_sge *sg_list,
>>>>>> +                   u32 num_sge)
>>>>>> +{
>>>>>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>>>>>> +    unsigned int i;
>>>>>> +    int ret = 0;
>>>>>> +
>>>>>> +    for (i = 0; i < num_sge; ++i) {
>>>>>> +        struct rxe_mr *mr;
>>>>>> +        struct ib_umem_odp *umem_odp;
>>>>>> +
>>>>>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>>>>>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>>>>>> +
>>>>>> +        if (IS_ERR(mr)) {
>>>>>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
>>>>>> +            return PTR_ERR(mr);
>>>>>> +        }
>>>>>> +
>>>>>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>>>> +            !mr->umem->writable) {
>>>>>> +            rxe_dbg_mr(mr, "missing write permission\n");
>>>>>> +            rxe_put(mr);
>>>>>> +            return -EPERM;
>>>>>> +        }
>>>>>> +
>>>>>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
>>>>>> +                            sg_list[i].length, pf_flags);
>>>>>> +        if (ret < 0) {
>>>>>> +            if (sg_list[i].length == 0)
>>>>>> +                continue;
>>>>>> +
>>>>>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
>>>>>> +            rxe_put(mr);
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +
>>>>>> +        umem_odp = to_ib_umem_odp(mr->umem);
>>>>>> +        mutex_unlock(&umem_odp->umem_mutex);
>>>>>> +
>>>>>> +        rxe_put(mr);
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>>>>>> +                     enum ib_uverbs_advise_mr_advice advice,
>>>>>> +                     u32 flags, struct ib_sge *sg_list, u32 num_sge)
>>>>>> +{
>>>>>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>>>>>> +
>>>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>>>>>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
>>>>>> +
>>>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>>>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
>>>>>> +
>>>>>> +    /* Synchronous call */
>>>>>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
>>>>>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
>>>>>> +                           num_sge);
>>>>>> +
>>>>>> +    /* Asynchronous call is "best-effort" */
>>>>>
>>>>> Asynchronous call is not implemented now, why does this comment appear?
>>>>
>>>> Even without the 2nd patch, async calls are reported as successful.
>>>
>>>
>>> Async call is not implemented. How to call "async calls are reported as successful"?
>>
>> Please see the manual.
>> cf. https://manpages.debian.org/testing/libibverbs-dev/ibv_advise_mr.3.en.html
>>
>> If IBV_ADVISE_MR_FLAG_FLUSH is not given to 'flags' parameter,
>> then this function 'rxe_ib_advise_mr_prefetch()' simply returns 0.
>> Consequently, ibv_advise_mr(3) and underlying ioctl(2) get no error.
>> This behaviour is allowed in the spec as I quoted in the last reply.
> 
> The functionality wasn't implemented; you added the comments first.
> You're still weaseling when people point this out.
> 
>>
>> It might be nice to return -EOPNOTSUPP just below the comment instead,
>> but not doing so is acceptable according to the spec. Additionally,
>> such a change will be overwritten in the next patch after all.
> 
> Move comments to the next patch. In this patch, return -EOPNOTSUPP.

Any opinion from Zhu?
I may post a new revision to change the intermediate code,
but the final result after applying the patchset will be the same.
You are the maintainer of rxe, so I will follow that.

Thanks,
Daisuke


> 
> --G--
> 
>>
>> Thanks,
>> Daisuke
>>
>>>
>>>
>>> Zhu Yanjun
>>>
>>>> The comment is inserted to show the reason, which is based on the
>>>> description from 'man 3 ibv_advise_mr' as follows:
>>>> ===
>>>> An application may pre-fetch any address range within an ODP MR when using the IBV_ADVISE_MR_ADVICE_PREFETCH or IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE advice. Semantically, this operation is best-effort. That means the kernel does not guarantee that underlying pages are updated in the HCA or the pre-fetched pages would remain resident.
>>>> ===
>>>>
>>>> Thanks,
>>>> Daisuke
>>>>
>>>>>
>>>>> Zhu Yanjun
>>>>>
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
>>>>>> +             enum ib_uverbs_advise_mr_advice advice,
>>>>>> +             u32 flags,
>>>>>> +             struct ib_sge *sg_list,
>>>>>> +             u32 num_sge,
>>>>>> +             struct uverbs_attr_bundle *attrs)
>>>>>> +{
>>>>>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
>>>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>>>> +        return -EOPNOTSUPP;
>>>>>> +
>>>>>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
>>>>>> +                     sg_list, num_sge);
>>>>>> +}
>>>>>
>>>>
>>
>>


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-11  2:06             ` Daisuke Matsuda
@ 2025-05-11  4:52               ` Zhu Yanjun
  2025-05-13  5:23                 ` Daisuke Matsuda
  0 siblings, 1 reply; 19+ messages in thread
From: Zhu Yanjun @ 2025-05-11  4:52 UTC (permalink / raw)
  To: Daisuke Matsuda; +Cc: linux-kernel, linux-rdma, leon, jgg, zyjzyj2000

在 2025/5/11 4:06, Daisuke Matsuda 写道:
> 
> On 2025/05/10 17:04, Greg Sword wrote:
>> On Sat, May 10, 2025 at 3:19 PM Daisuke Matsuda <dskmtsd@gmail.com> 
>> wrote:
>>>
>>> On 2025/05/10 13:43, Zhu Yanjun wrote:
>>>>
>>>> 在 2025/5/10 4:46, Daisuke Matsuda 写道:
>>>>> On 2025/05/10 0:19, Zhu Yanjun wrote:
>>>>>> On 03.05.25 15:42, Daisuke Matsuda wrote:
>>>>>>> Minimal implementation of ibv_advise_mr(3) requires synchronous 
>>>>>>> calls being
>>>>>>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous 
>>>>>>> requests,
>>>>>>> which are best-effort, will be added subsequently.
>>>>>>>
>>>>>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>>>>>>> ---
>>>>>>>    drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>>>>>>>    drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>>>>>>>    drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++ 
>>>>>>> ++++++++
>>>>>>>    3 files changed, 103 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/ 
>>>>>>> infiniband/sw/rxe/rxe.c
>>>>>>> index 3a77d6db1720..e891199cbdef 100644
>>>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>>>>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>>>>>>        mutex_destroy(&rxe->usdev_lock);
>>>>>>>    }
>>>>>>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
>>>>>>> +    .advise_mr = rxe_ib_advise_mr,
>>>>>>> +};
>>>>>>> +
>>>>>>>    /* initialize rxe device parameters */
>>>>>>>    static void rxe_init_device_param(struct rxe_dev *rxe, struct 
>>>>>>> net_device *ndev)
>>>>>>>    {
>>>>>>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct 
>>>>>>> rxe_dev *rxe, struct net_device *ndev)
>>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= 
>>>>>>> IB_ODP_SUPPORT_SRQ_RECV;
>>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= 
>>>>>>> IB_ODP_SUPPORT_FLUSH;
>>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= 
>>>>>>> IB_ODP_SUPPORT_ATOMIC_WRITE;
>>>>>>> +
>>>>>>> +        /* set handler for ODP prefetching API - 
>>>>>>> ibv_advise_mr(3) */
>>>>>>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>>>>>>>        }
>>>>>>>    }
>>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/ 
>>>>>>> infiniband/sw/rxe/rxe_loc.h
>>>>>>> index f7dbb9cddd12..21b070f3dbb8 100644
>>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
>>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
>>>>>>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct 
>>>>>>> rxe_mr *mr, u64 iova, int opcode,
>>>>>>>    int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>>>>>>>                    unsigned int length);
>>>>>>>    enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, 
>>>>>>> u64 iova, u64 value);
>>>>>>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum 
>>>>>>> ib_uverbs_advise_mr_advice advice,
>>>>>>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>>>>> +             struct uverbs_attr_bundle *attrs);
>>>>>>>    #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>>>>    static inline int
>>>>>>>    rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 
>>>>>>> length, u64 iova,
>>>>>>> @@ -225,6 +228,13 @@ static inline enum resp_states 
>>>>>>> rxe_odp_do_atomic_write(struct rxe_mr *mr,
>>>>>>>    {
>>>>>>>        return RESPST_ERR_UNSUPPORTED_OPCODE;
>>>>>>>    }
>>>>>>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum 
>>>>>>> ib_uverbs_advise_mr_advice advice,
>>>>>>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>>>>> +                   struct uverbs_attr_bundle *attrs)
>>>>>>> +{
>>>>>>> +    return -EOPNOTSUPP;
>>>>>>> +}
>>>>>>> +
>>>>>>>    #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>>>>    #endif /* RXE_LOC_H */
>>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/ 
>>>>>>> infiniband/sw/rxe/rxe_odp.c
>>>>>>> index 6149d9ffe7f7..e5c60b061d7e 100644
>>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>>>>>>> @@ -424,3 +424,89 @@ enum resp_states 
>>>>>>> rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>>>>>>>        return RESPST_NONE;
>>>>>>>    }
>>>>>>> +
>>>>>>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>>>>>>> +                   enum ib_uverbs_advise_mr_advice advice,
>>>>>>> +                   u32 pf_flags, struct ib_sge *sg_list,
>>>>>>> +                   u32 num_sge)
>>>>>>> +{
>>>>>>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>>>>>>> +    unsigned int i;
>>>>>>> +    int ret = 0;
>>>>>>> +
>>>>>>> +    for (i = 0; i < num_sge; ++i) {
>>>>>>> +        struct rxe_mr *mr;
>>>>>>> +        struct ib_umem_odp *umem_odp;
>>>>>>> +
>>>>>>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>>>>>>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>>>>>>> +
>>>>>>> +        if (IS_ERR(mr)) {
>>>>>>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", 
>>>>>>> sg_list[i].lkey);
>>>>>>> +            return PTR_ERR(mr);
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>>>>> +            !mr->umem->writable) {
>>>>>>> +            rxe_dbg_mr(mr, "missing write permission\n");
>>>>>>> +            rxe_put(mr);
>>>>>>> +            return -EPERM;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
>>>>>>> +                            sg_list[i].length, pf_flags);
>>>>>>> +        if (ret < 0) {
>>>>>>> +            if (sg_list[i].length == 0)
>>>>>>> +                continue;
>>>>>>> +
>>>>>>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
>>>>>>> +            rxe_put(mr);
>>>>>>> +            return ret;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        umem_odp = to_ib_umem_odp(mr->umem);
>>>>>>> +        mutex_unlock(&umem_odp->umem_mutex);
>>>>>>> +
>>>>>>> +        rxe_put(mr);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>>>>>>> +                     enum ib_uverbs_advise_mr_advice advice,
>>>>>>> +                     u32 flags, struct ib_sge *sg_list, u32 
>>>>>>> num_sge)
>>>>>>> +{
>>>>>>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>>>>>>> +
>>>>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>>>>>>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
>>>>>>> +
>>>>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>>>>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
>>>>>>> +
>>>>>>> +    /* Synchronous call */
>>>>>>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
>>>>>>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, 
>>>>>>> sg_list,
>>>>>>> +                           num_sge);
>>>>>>> +
>>>>>>> +    /* Asynchronous call is "best-effort" */
>>>>>>
>>>>>> Asynchronous call is not implemented now, why does this comment 
>>>>>> appear?
>>>>>
>>>>> Even without the 2nd patch, async calls are reported as successful.
>>>>
>>>>
>>>> Async call is not implemented. How to call "async calls are reported 
>>>> as successful"?
>>>
>>> Please see the manual.
>>> cf. https://manpages.debian.org/testing/libibverbs-dev/ 
>>> ibv_advise_mr.3.en.html
>>>
>>> If IBV_ADVISE_MR_FLAG_FLUSH is not given to 'flags' parameter,
>>> then this function 'rxe_ib_advise_mr_prefetch()' simply returns 0.
>>> Consequently, ibv_advise_mr(3) and underlying ioctl(2) get no error.
>>> This behaviour is allowed in the spec as I quoted in the last reply.
>>
>> The functionality wasn't implemented, you added the comments first.
>> You're still weaseling when people point this out.
>>
>>>
>>> It might be nice to return -EOPNOTSUPP just below the comment instead,
>>> but not doing so is acceptable according to the spec. Additionally,
>>> such change will be overwritten in the next patch after all.
>>
>> Move comments to the next patch. In this patch, return -EOPNOTSUPP.
> 
> Any opinion from Zhu?
> I may post a new revision to change the intermediate code,
> but the final result after applying the patchset will be the same.
> You are the maintainer of rxe, so I will follow that.

Thanks a lot. I am fine with your commit.
Please "Move comments to the next patch. In this patch, return -EOPNOTSUPP".
I think that it is a good idea. The final result should be the same. 
Please send out the latest commit following the suggestions.

Thanks a lot for your contributions and efforts.

Best Regards,
Zhu Yanjun

> 
> Thanks,
> Daisuke
> 
> 
>>
>> --G--
>>
>>>
>>> Thanks,
>>> Daisuke
>>>
>>>>
>>>>
>>>> Zhu Yanjun
>>>>
>>>>> The comment is inserted to show the reason, which is based on the
>>>>> description from 'man 3 ibv_advise_mr' as follows:
>>>>> ===
>>>>> An application may pre-fetch any address range within an ODP MR 
>>>>> when using the IBV_ADVISE_MR_ADVICE_PREFETCH or 
>>>>> IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE advice. Semantically, this 
>>>>> operation is best-effort. That means the kernel does not guarantee 
>>>>> that underlying pages are updated in the HCA or the pre-fetched 
>>>>> pages would remain resident.
>>>>> ===
>>>>>
>>>>> Thanks,
>>>>> Daisuke
>>>>>
>>>>>>
>>>>>> Zhu Yanjun
>>>>>>
>>>>>>> +
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
>>>>>>> +             enum ib_uverbs_advise_mr_advice advice,
>>>>>>> +             u32 flags,
>>>>>>> +             struct ib_sge *sg_list,
>>>>>>> +             u32 num_sge,
>>>>>>> +             struct uverbs_attr_bundle *attrs)
>>>>>>> +{
>>>>>>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
>>>>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>>>>> +        return -EOPNOTSUPP;
>>>>>>> +
>>>>>>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
>>>>>>> +                     sg_list, num_sge);
>>>>>>> +}
>>>>>>
>>>>>
>>>
>>>
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
  2025-05-11  4:52               ` Zhu Yanjun
@ 2025-05-13  5:23                 ` Daisuke Matsuda
  0 siblings, 0 replies; 19+ messages in thread
From: Daisuke Matsuda @ 2025-05-13  5:23 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: linux-kernel, linux-rdma, leon, jgg, zyjzyj2000



On 2025/05/11 13:52, Zhu Yanjun wrote:
> 在 2025/5/11 4:06, Daisuke Matsuda 写道:
>>
>> On 2025/05/10 17:04, Greg Sword wrote:
>>> On Sat, May 10, 2025 at 3:19 PM Daisuke Matsuda <dskmtsd@gmail.com> wrote:
>>>>
>>>> On 2025/05/10 13:43, Zhu Yanjun wrote:
>>>>>
>>>>> 在 2025/5/10 4:46, Daisuke Matsuda 写道:
>>>>>> On 2025/05/10 0:19, Zhu Yanjun wrote:
>>>>>>> On 03.05.25 15:42, Daisuke Matsuda wrote:
>>>>>>>> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
>>>>>>>> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
>>>>>>>> which are best-effort, will be added subsequently.
>>>>>>>>
>>>>>>>> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
>>>>>>>> ---
>>>>>>>>    drivers/infiniband/sw/rxe/rxe.c     |  7 +++
>>>>>>>>    drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
>>>>>>>>    drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++ ++++++++
>>>>>>>>    3 files changed, 103 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/ infiniband/sw/rxe/rxe.c
>>>>>>>> index 3a77d6db1720..e891199cbdef 100644
>>>>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>>>>>> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>>>>>>>        mutex_destroy(&rxe->usdev_lock);
>>>>>>>>    }
>>>>>>>> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
>>>>>>>> +    .advise_mr = rxe_ib_advise_mr,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>    /* initialize rxe device parameters */
>>>>>>>>    static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>>>>>>>    {
>>>>>>>> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>>>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
>>>>>>>>            rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
>>>>>>>> +
>>>>>>>> +        /* set handler for ODP prefetching API - ibv_advise_mr(3) */
>>>>>>>> +        ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
>>>>>>>>        }
>>>>>>>>    }
>>>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/ infiniband/sw/rxe/rxe_loc.h
>>>>>>>> index f7dbb9cddd12..21b070f3dbb8 100644
>>>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
>>>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
>>>>>>>> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
>>>>>>>>    int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
>>>>>>>>                    unsigned int length);
>>>>>>>>    enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
>>>>>>>> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>>>>>>>> +             u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>>>>>> +             struct uverbs_attr_bundle *attrs);
>>>>>>>>    #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>>>>>    static inline int
>>>>>>>>    rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>>>>>>>> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
>>>>>>>>    {
>>>>>>>>        return RESPST_ERR_UNSUPPORTED_OPCODE;
>>>>>>>>    }
>>>>>>>> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
>>>>>>>> +                   u32 flags, struct ib_sge *sg_list, u32 num_sge,
>>>>>>>> +                   struct uverbs_attr_bundle *attrs)
>>>>>>>> +{
>>>>>>>> +    return -EOPNOTSUPP;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>    #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>>>>>>>>    #endif /* RXE_LOC_H */
>>>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/ infiniband/sw/rxe/rxe_odp.c
>>>>>>>> index 6149d9ffe7f7..e5c60b061d7e 100644
>>>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
>>>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
>>>>>>>> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>>>>>>>>        return RESPST_NONE;
>>>>>>>>    }
>>>>>>>> +
>>>>>>>> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
>>>>>>>> +                   enum ib_uverbs_advise_mr_advice advice,
>>>>>>>> +                   u32 pf_flags, struct ib_sge *sg_list,
>>>>>>>> +                   u32 num_sge)
>>>>>>>> +{
>>>>>>>> +    struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
>>>>>>>> +    unsigned int i;
>>>>>>>> +    int ret = 0;
>>>>>>>> +
>>>>>>>> +    for (i = 0; i < num_sge; ++i) {
>>>>>>>> +        struct rxe_mr *mr;
>>>>>>>> +        struct ib_umem_odp *umem_odp;
>>>>>>>> +
>>>>>>>> +        mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
>>>>>>>> +                   sg_list[i].lkey, RXE_LOOKUP_LOCAL);
>>>>>>>> +
>>>>>>>> +        if (IS_ERR(mr)) {
>>>>>>>> +            rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
>>>>>>>> +            return PTR_ERR(mr);
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>>>>>> +            !mr->umem->writable) {
>>>>>>>> +            rxe_dbg_mr(mr, "missing write permission\n");
>>>>>>>> +            rxe_put(mr);
>>>>>>>> +            return -EPERM;
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
>>>>>>>> +                            sg_list[i].length, pf_flags);
>>>>>>>> +        if (ret < 0) {
>>>>>>>> +            if (sg_list[i].length == 0)
>>>>>>>> +                continue;
>>>>>>>> +
>>>>>>>> +            rxe_dbg_mr(mr, "failed to prefetch the mr\n");
>>>>>>>> +            rxe_put(mr);
>>>>>>>> +            return ret;
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        umem_odp = to_ib_umem_odp(mr->umem);
>>>>>>>> +        mutex_unlock(&umem_odp->umem_mutex);
>>>>>>>> +
>>>>>>>> +        rxe_put(mr);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
>>>>>>>> +                     enum ib_uverbs_advise_mr_advice advice,
>>>>>>>> +                     u32 flags, struct ib_sge *sg_list, u32 num_sge)
>>>>>>>> +{
>>>>>>>> +    u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
>>>>>>>> +
>>>>>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>>>>>>>> +        pf_flags |= RXE_PAGEFAULT_RDONLY;
>>>>>>>> +
>>>>>>>> +    if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>>>>>> +        pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
>>>>>>>> +
>>>>>>>> +    /* Synchronous call */
>>>>>>>> +    if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
>>>>>>>> +        return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
>>>>>>>> +                           num_sge);
>>>>>>>> +
>>>>>>>> +    /* Asynchronous call is "best-effort" */
>>>>>>>
>>>>>>> Asynchronous call is not implemented now, why does this comment appear?
>>>>>>
>>>>>> Even without the 2nd patch, async calls are reported as successful.
>>>>>
>>>>>
>>>>> Async call is not implemented. How to call "async calls are reported as successful"?
>>>>
>>>> Please see the manual.
>>>> cf. https://manpages.debian.org/testing/libibverbs-dev/ ibv_advise_mr.3.en.html
>>>>
>>>> If IBV_ADVISE_MR_FLAG_FLUSH is not given to 'flags' parameter,
>>>> then this function 'rxe_ib_advise_mr_prefetch()' simply returns 0.
>>>> Consequently, ibv_advise_mr(3) and underlying ioctl(2) get no error.
>>>> This behaviour is allowed in the spec as I quoted in the last reply.
>>>
>>> The functionality wasn't implemented, you added the comments first.
>>> You're still weaseling when people point this out.
>>>
>>>>
>>>> It might be nice to return -EOPNOTSUPP just below the comment instead,
>>>> but not doing so is acceptable according to the spec. Additionally,
>>>> such change will be overwritten in the next patch after all.
>>>
>>> Move comments to the next patch. In this patch, return -EOPNOTSUPP.
>>
>> Any opinion from Zhu?
>> I may post a new revision to change the intermediate code,
>> but the final result after applying the patchset will be the same.
>> You are the maintainer of rxe, so I will follow that.
> 
> Thanks a lot. I am fine with your commit.
> Please "Move comments to the next patch. In this patch, return -EOPNOTSUPP".
> I think that it is a good idea. The final result should be the same. Please send out the latest commit following the suggestions.
> 
> Thanks a lot for your contributions and efforts.

Thank you for the reply and support.

I've posted a new version with the following changes:
  - Added return -EOPNOTSUPP; (1st patch)
  - Changed type definition from 'unsigned int' to 'u32' as you suggested (1st patch)
  - Deleted redundant if-continue statements in rxe_ib_prefetch_sg_list() (1st patch)

Thanks,
Daisuke

> 
> Best Regards,
> Zhu Yanjun
> 
>>
>> Thanks,
>> Daisuke
>>
>>
>>>
>>> --G--
>>>
>>>>
>>>> Thanks,
>>>> Daisuke
>>>>
>>>>>
>>>>>
>>>>> Zhu Yanjun
>>>>>
>>>>>> The comment is inserted to show the reason, which is based on the
>>>>>> description from 'man 3 ibv_advise_mr' as follows:
>>>>>> ===
>>>>>> An application may pre-fetch any address range within an ODP MR when using the IBV_ADVISE_MR_ADVICE_PREFETCH or IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE advice. Semantically, this operation is best-effort. That means the kernel does not guarantee that underlying pages are updated in the HCA or the pre-fetched pages would remain resident.
>>>>>> ===
>>>>>>
>>>>>> Thanks,
>>>>>> Daisuke
>>>>>>
>>>>>>>
>>>>>>> Zhu Yanjun
>>>>>>>
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
>>>>>>>> +             enum ib_uverbs_advise_mr_advice advice,
>>>>>>>> +             u32 flags,
>>>>>>>> +             struct ib_sge *sg_list,
>>>>>>>> +             u32 num_sge,
>>>>>>>> +             struct uverbs_attr_bundle *attrs)
>>>>>>>> +{
>>>>>>>> +    if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
>>>>>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
>>>>>>>> +        advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
>>>>>>>> +        return -EOPNOTSUPP;
>>>>>>>> +
>>>>>>>> +    return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
>>>>>>>> +                     sg_list, num_sge);
>>>>>>>> +}
>>>>>>>
>>>>>>
>>>>
>>>>
>>
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2025-05-13  5:23 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-05-03 13:42 [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Daisuke Matsuda
2025-05-03 13:42 ` [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs Daisuke Matsuda
2025-05-05  7:57   ` Zhu Yanjun
2025-05-09 11:51     ` Daisuke Matsuda
2025-05-09 15:19   ` Zhu Yanjun
2025-05-10  2:46     ` Daisuke Matsuda
2025-05-10  4:43       ` Zhu Yanjun
2025-05-10  7:18         ` Daisuke Matsuda
2025-05-10  8:04           ` Greg Sword
2025-05-11  2:06             ` Daisuke Matsuda
2025-05-11  4:52               ` Zhu Yanjun
2025-05-13  5:23                 ` Daisuke Matsuda
2025-05-03 13:42 ` [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous " Daisuke Matsuda
2025-05-05 15:25   ` Zhu Yanjun
2025-05-09 12:19     ` Daisuke Matsuda
2025-05-09 12:52       ` Zhu Yanjun
2025-05-09 14:48       ` Zhu Yanjun
2025-05-03 17:08 ` [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Zhu Yanjun
2025-05-04  9:23   ` Daisuke Matsuda

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).