From: Zhu Yanjun <yanjun.zhu@linux.dev>
To: Daisuke Matsuda <dskmtsd@gmail.com>,
linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
leon@kernel.org, jgg@ziepe.ca, zyjzyj2000@gmail.com
Subject: Re: [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs
Date: Mon, 5 May 2025 09:57:56 +0200 [thread overview]
Message-ID: <2e3676d3-ce82-4a87-be33-9ce6d7007c3b@linux.dev> (raw)
In-Reply-To: <20250503134224.4867-2-dskmtsd@gmail.com>
On 03.05.25 15:42, Daisuke Matsuda wrote:
> Minimal implementation of ibv_advise_mr(3) requires synchronous calls being
> successful with the IBV_ADVISE_MR_FLAG_FLUSH flag. Asynchronous requests,
> which are best-effort, will be added subsequently.
>
> Signed-off-by: Daisuke Matsuda <dskmtsd@gmail.com>
> ---
> drivers/infiniband/sw/rxe/rxe.c | 7 +++
> drivers/infiniband/sw/rxe/rxe_loc.h | 10 ++++
> drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++
> 3 files changed, 103 insertions(+)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 3a77d6db1720..e891199cbdef 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
> mutex_destroy(&rxe->usdev_lock);
> }
>
> +static const struct ib_device_ops rxe_ib_dev_odp_ops = {
> + .advise_mr = rxe_ib_advise_mr,
> +};
> +
> /* initialize rxe device parameters */
> static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
> {
> @@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
> rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
> rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
> rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
> +
> + /* set handler for ODP prefetching API - ibv_advise_mr(3) */
> + ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
> }
> }
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index f7dbb9cddd12..21b070f3dbb8 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -197,6 +197,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
> int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
> unsigned int length);
> enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
> +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> + u32 flags, struct ib_sge *sg_list, u32 num_sge,
> + struct uverbs_attr_bundle *attrs);
> #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
> static inline int
> rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> @@ -225,6 +228,13 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
> {
> return RESPST_ERR_UNSUPPORTED_OPCODE;
> }
> +static inline int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
> + u32 flags, struct ib_sge *sg_list, u32 num_sge,
> + struct uverbs_attr_bundle *attrs)
> +{
> + return -EOPNOTSUPP;
> +}
> +
> #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>
> #endif /* RXE_LOC_H */
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index 6149d9ffe7f7..e5c60b061d7e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -424,3 +424,89 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>
> return RESPST_NONE;
> }
> +
> +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
> + enum ib_uverbs_advise_mr_advice advice,
> + u32 pf_flags, struct ib_sge *sg_list,
> + u32 num_sge)
> +{
> + struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
> + unsigned int i;
> + int ret = 0;
> +
> + for (i = 0; i < num_sge; ++i) {
i is unsigned int, while num_sge is u32. Perhaps both should use the u32 type?
It is a minor problem.
Other than that, I am fine with this commit.
I have tested this with rdma-core. Both the synchronous and asynchronous
modes work well.
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Zhu Yanjun
> + struct rxe_mr *mr;
> + struct ib_umem_odp *umem_odp;
> +
> + mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
> + sg_list[i].lkey, RXE_LOOKUP_LOCAL);
> +
> + if (IS_ERR(mr)) {
> + rxe_dbg_pd(pd, "mr with lkey %x not found\n", sg_list[i].lkey);
> + return PTR_ERR(mr);
> + }
> +
> + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> + !mr->umem->writable) {
> + rxe_dbg_mr(mr, "missing write permission\n");
> + rxe_put(mr);
> + return -EPERM;
> + }
> +
> + ret = rxe_odp_do_pagefault_and_lock(mr, sg_list[i].addr,
> + sg_list[i].length, pf_flags);
> + if (ret < 0) {
> + if (sg_list[i].length == 0)
> + continue;
> +
> + rxe_dbg_mr(mr, "failed to prefetch the mr\n");
> + rxe_put(mr);
> + return ret;
> + }
> +
> + umem_odp = to_ib_umem_odp(mr->umem);
> + mutex_unlock(&umem_odp->umem_mutex);
> +
> + rxe_put(mr);
> + }
> +
> + return 0;
> +}
> +
> +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
> + enum ib_uverbs_advise_mr_advice advice,
> + u32 flags, struct ib_sge *sg_list, u32 num_sge)
> +{
> + u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
> +
> + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
> + pf_flags |= RXE_PAGEFAULT_RDONLY;
> +
> + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> + pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
> +
> + /* Synchronous call */
> + if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
> + return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
> + num_sge);
> +
> + /* Asynchronous call is "best-effort" */
> +
> + return 0;
> +}
> +
> +int rxe_ib_advise_mr(struct ib_pd *ibpd,
> + enum ib_uverbs_advise_mr_advice advice,
> + u32 flags,
> + struct ib_sge *sg_list,
> + u32 num_sge,
> + struct uverbs_attr_bundle *attrs)
> +{
> + if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
> + advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> + advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
> + return -EOPNOTSUPP;
> +
> + return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
> + sg_list, num_sge);
> +}
next prev parent reply other threads:[~2025-05-05 7:58 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-03 13:42 [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Daisuke Matsuda
2025-05-03 13:42 ` [PATCH for-next v2 1/2] RDMA/rxe: Implement synchronous prefetch for ODP MRs Daisuke Matsuda
2025-05-05 7:57 ` Zhu Yanjun [this message]
2025-05-09 11:51 ` Daisuke Matsuda
2025-05-09 15:19 ` Zhu Yanjun
2025-05-10 2:46 ` Daisuke Matsuda
2025-05-10 4:43 ` Zhu Yanjun
2025-05-10 7:18 ` Daisuke Matsuda
2025-05-10 8:04 ` Greg Sword
2025-05-11 2:06 ` Daisuke Matsuda
2025-05-11 4:52 ` Zhu Yanjun
2025-05-13 5:23 ` Daisuke Matsuda
2025-05-03 13:42 ` [PATCH for-next v2 2/2] RDMA/rxe: Enable asynchronous " Daisuke Matsuda
2025-05-05 15:25 ` Zhu Yanjun
2025-05-09 12:19 ` Daisuke Matsuda
2025-05-09 12:52 ` Zhu Yanjun
2025-05-09 14:48 ` Zhu Yanjun
2025-05-03 17:08 ` [PATCH for-next v2 0/2] RDMA/rxe: Prefetching pages with explicit ODP Zhu Yanjun
2025-05-04 9:23 ` Daisuke Matsuda
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=2e3676d3-ce82-4a87-be33-9ce6d7007c3b@linux.dev \
--to=yanjun.zhu@linux.dev \
--cc=dskmtsd@gmail.com \
--cc=jgg@ziepe.ca \
--cc=leon@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=zyjzyj2000@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.