* [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
2020-12-07 22:15 [PATCH v13 0/4] RDMA: Add dma-buf support Jianxin Xiong
@ 2020-12-07 22:15 ` Jianxin Xiong
2020-12-08 7:05 ` Leon Romanovsky
2020-12-07 22:15 ` [PATCH v13 2/4] RDMA/core: Add device method for registering dma-buf based " Jianxin Xiong
` (2 subsequent siblings)
3 siblings, 1 reply; 13+ messages in thread
From: Jianxin Xiong @ 2020-12-07 22:15 UTC (permalink / raw)
To: linux-rdma, dri-devel
Cc: Jianxin Xiong, Doug Ledford, Jason Gunthorpe, Leon Romanovsky,
Sumit Semwal, Christian Koenig, Daniel Vetter
Dma-buf is a standard cross-driver buffer sharing mechanism that can be
used to support peer-to-peer access from RDMA devices.
Device memory exported via dma-buf is associated with a file descriptor.
This is passed to the user space as a property associated with the
buffer allocation. When the buffer is registered as a memory region,
the file descriptor is passed to the RDMA driver along with other
parameters.
Implement the common code for importing dma-buf object and mapping
dma-buf pages.
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Conflicts:
include/rdma/ib_umem.h
---
drivers/infiniband/core/Makefile | 2 +-
drivers/infiniband/core/umem.c | 3 +
drivers/infiniband/core/umem_dmabuf.c | 173 ++++++++++++++++++++++++++++++++++
include/rdma/ib_umem.h | 43 ++++++++-
4 files changed, 219 insertions(+), 2 deletions(-)
create mode 100644 drivers/infiniband/core/umem_dmabuf.c
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index ccf2670..8ab4eea 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -40,5 +40,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
uverbs_std_types_srq.o \
uverbs_std_types_wq.o \
uverbs_std_types_qp.o
-ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 7ca4112..cc131f8 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -2,6 +2,7 @@
* Copyright (c) 2005 Topspin Communications. All rights reserved.
* Copyright (c) 2005 Cisco Systems. All rights reserved.
* Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -278,6 +279,8 @@ void ib_umem_release(struct ib_umem *umem)
{
if (!umem)
return;
+ if (umem->is_dmabuf)
+ return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
if (umem->is_odp)
return ib_umem_odp_release(to_ib_umem_odp(umem));
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
new file mode 100644
index 0000000..e50b955
--- /dev/null
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
+#include <linux/dma-mapping.h>
+
+#include "uverbs.h"
+
+/**
+ * ib_umem_dmabuf_map_pages - map the dma-buf pages backing a umem
+ * @umem_dmabuf: the dma-buf based umem to map
+ *
+ * Maps the attachment for bidirectional DMA and trims the returned sg list
+ * in place so that it covers exactly the page-aligned range described by
+ * umem.address and umem.length. The amount trimmed from the first and last
+ * entries is recorded in @umem_dmabuf so that ib_umem_dmabuf_unmap_pages()
+ * can restore the list before handing it back.
+ *
+ * The reservation lock of the dma-buf must be held by the caller.
+ *
+ * Return: 0 on success, or a negative errno if mapping the attachment or
+ * waiting for the exclusive fence fails.
+ */
+int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	struct sg_table *sgt;
+	struct scatterlist *sg;
+	struct dma_fence *fence;
+	unsigned long start, end, cur = 0;
+	unsigned int nmap = 0;
+	int i;
+
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	/*
+	 * Already mapped: nothing to map again, but still wait for the
+	 * fence so that a retry after a failed wait does not skip it.
+	 */
+	if (umem_dmabuf->sgt)
+		goto wait_fence;
+
+	sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL);
+	if (IS_ERR(sgt))
+		return PTR_ERR(sgt);
+
+	/* modify the sg list in-place to match umem address and length */
+
+	start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE);
+	end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length,
+		    PAGE_SIZE);
+	for_each_sgtable_dma_sg(sgt, sg, i) {
+		/* count every entry that overlaps [start, end) */
+		if (start < cur + sg_dma_len(sg) && cur < end)
+			nmap++;
+		/* entry containing 'start': skip the leading part */
+		if (cur <= start && start < cur + sg_dma_len(sg)) {
+			unsigned long offset = start - cur;
+
+			umem_dmabuf->first_sg = sg;
+			umem_dmabuf->first_sg_offset = offset;
+			sg_dma_address(sg) += offset;
+			sg_dma_len(sg) -= offset;
+			cur += offset;
+		}
+		/* entry containing 'end': cut the trailing part and stop */
+		if (cur < end && end <= cur + sg_dma_len(sg)) {
+			unsigned long trim = cur + sg_dma_len(sg) - end;
+
+			umem_dmabuf->last_sg = sg;
+			umem_dmabuf->last_sg_trim = trim;
+			sg_dma_len(sg) -= trim;
+			break;
+		}
+		cur += sg_dma_len(sg);
+	}
+
+	umem_dmabuf->umem.sg_head.sgl = umem_dmabuf->first_sg;
+	umem_dmabuf->umem.sg_head.nents = nmap;
+	umem_dmabuf->umem.nmap = nmap;
+	umem_dmabuf->sgt = sgt;
+
+wait_fence:
+	/*
+	 * Although the sg list is valid now, the content of the pages
+	 * may be not up-to-date. Wait for the exporter to finish
+	 * the migration, and report a failed wait to the caller instead
+	 * of pretending the pages are ready.
+	 */
+	fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
+	if (fence)
+		return dma_fence_wait(fence, false);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
+
+/**
+ * ib_umem_dmabuf_unmap_pages - undo ib_umem_dmabuf_map_pages()
+ * @umem_dmabuf: the dma-buf based umem to unmap
+ *
+ * Puts back the head offset and tail trim that were applied to the sg list
+ * at map time, then unmaps the attachment. Does nothing if the umem is not
+ * currently mapped. The reservation lock of the dma-buf must be held.
+ */
+void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	struct scatterlist *first, *last;
+
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!umem_dmabuf->sgt)
+		return;
+
+	/* restore the original sg list */
+	first = umem_dmabuf->first_sg;
+	if (first) {
+		sg_dma_address(first) -= umem_dmabuf->first_sg_offset;
+		sg_dma_len(first) += umem_dmabuf->first_sg_offset;
+		umem_dmabuf->first_sg = NULL;
+		umem_dmabuf->first_sg_offset = 0;
+	}
+
+	last = umem_dmabuf->last_sg;
+	if (last) {
+		sg_dma_len(last) += umem_dmabuf->last_sg_trim;
+		umem_dmabuf->last_sg = NULL;
+		umem_dmabuf->last_sg_trim = 0;
+	}
+
+	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
+				 DMA_BIDIRECTIONAL);
+
+	/* a NULL sgt marks the umem as unmapped */
+	umem_dmabuf->sgt = NULL;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
+
+/**
+ * ib_umem_dmabuf_get - import a dma-buf as a user memory region
+ * @device: IB device whose dma_device is attached to the dma-buf
+ * @offset: start of the region, as a byte offset into the dma-buf
+ * @size: length of the region in bytes
+ * @fd: file descriptor referring to the dma-buf
+ * @access: IB access flags; only used to derive umem->writable
+ * @ops: dynamic attach ops; must be non-NULL and provide move_notify
+ *
+ * Takes a reference on the dma-buf and attaches @device to it. Both are
+ * dropped again by ib_umem_dmabuf_release(). The pages are not mapped here;
+ * see ib_umem_dmabuf_map_pages().
+ *
+ * Return: the umem embedded in the new ib_umem_dmabuf on success, or an
+ * ERR_PTR(): -EINVAL for a bad range or missing ops, -ENOMEM on allocation
+ * failure, or the error reported by dma_buf_get() or
+ * dma_buf_dynamic_attach().
+ */
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+				   unsigned long offset, size_t size,
+				   int fd, int access,
+				   const struct dma_buf_attach_ops *ops)
+{
+	struct dma_buf *dmabuf;
+	struct ib_umem_dmabuf *umem_dmabuf;
+	struct ib_umem *umem;
+	unsigned long end;
+	struct ib_umem *ret = ERR_PTR(-EINVAL);
+
+	if (check_add_overflow(offset, (unsigned long)size, &end))
+		return ret;
+
+	if (!ops || !ops->move_notify)
+		return ret;
+
+	dmabuf = dma_buf_get(fd);
+	if (IS_ERR(dmabuf))
+		return ERR_CAST(dmabuf);
+
+	/* the requested range must lie entirely inside the dma-buf */
+	if (dmabuf->size < end)
+		goto out_release_dmabuf;
+
+	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
+	if (!umem_dmabuf) {
+		/* must not return directly: the dma-buf reference is held */
+		ret = ERR_PTR(-ENOMEM);
+		goto out_release_dmabuf;
+	}
+
+	umem = &umem_dmabuf->umem;
+	umem->ibdev = device;
+	umem->length = size;
+	umem->address = offset;
+	umem->writable = ib_access_writable(access);
+	umem->is_dmabuf = 1;
+
+	if (!ib_umem_num_pages(umem))
+		goto out_free_umem;
+
+	umem_dmabuf->attach = dma_buf_dynamic_attach(dmabuf,
+						     device->dma_device,
+						     ops,
+						     umem_dmabuf);
+	if (IS_ERR(umem_dmabuf->attach)) {
+		ret = ERR_CAST(umem_dmabuf->attach);
+		goto out_free_umem;
+	}
+	return umem;
+
+out_free_umem:
+	kfree(umem_dmabuf);
+
+out_release_dmabuf:
+	dma_buf_put(dmabuf);
+	return ret;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get);
+
+/*
+ * Tear down a dma-buf based umem: detach from the dma-buf, drop the
+ * reference on it, and free the umem itself.
+ */
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	struct dma_buf_attachment *attach = umem_dmabuf->attach;
+	struct dma_buf *dmabuf = attach->dmabuf;
+
+	dma_buf_detach(dmabuf, attach);
+	dma_buf_put(dmabuf);
+	kfree(umem_dmabuf);
+}
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 7752211..b49a96d 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/*
* Copyright (c) 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
*/
#ifndef IB_UMEM_H
@@ -13,6 +14,7 @@
struct ib_ucontext;
struct ib_umem_odp;
+struct dma_buf_attach_ops;
struct ib_umem {
struct ib_device *ibdev;
@@ -22,12 +24,29 @@ struct ib_umem {
unsigned long address;
u32 writable : 1;
u32 is_odp : 1;
+ u32 is_dmabuf : 1;
struct work_struct work;
struct sg_table sg_head;
int nmap;
unsigned int sg_nents;
};
+/*
+ * A umem backed by a dma-buf. The generic umem is embedded so that
+ * to_ib_umem_dmabuf() can recover this structure from it.
+ */
+struct ib_umem_dmabuf {
+ struct ib_umem umem;
+ /* attachment of the IB device to the dma-buf */
+ struct dma_buf_attachment *attach;
+ /* current mapping of the attachment; NULL while not mapped */
+ struct sg_table *sgt;
+ /* sg entries that were trimmed at map time to fit the umem range */
+ struct scatterlist *first_sg;
+ struct scatterlist *last_sg;
+ /* bytes skipped at the head of first_sg / cut from the tail of last_sg */
+ unsigned long first_sg_offset;
+ unsigned long last_sg_trim;
+ /* NOTE(review): not used by the umem code in this patch; presumably for the importing driver - confirm */
+ void *private;
+};
+
+/* Recover the enclosing ib_umem_dmabuf from its embedded umem member. */
+static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
+{
+	struct ib_umem_dmabuf *umem_dmabuf;
+
+	umem_dmabuf = container_of(umem, struct ib_umem_dmabuf, umem);
+	return umem_dmabuf;
+}
+
/* Returns the offset of the umem start relative to the first page. */
static inline int ib_umem_offset(struct ib_umem *umem)
{
@@ -86,6 +105,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
unsigned long pgsz_bitmap,
unsigned long virt);
+
/**
* ib_umem_find_best_pgoff - Find best HW page size
*
@@ -116,6 +136,14 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
dma_addr & pgoff_bitmask);
}
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+ unsigned long offset, size_t size,
+ int fd, int access,
+ const struct dma_buf_attach_ops *ops);
+int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf);
+void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
+
#else /* CONFIG_INFINIBAND_USER_MEM */
#include <linux/err.h>
@@ -143,7 +171,20 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
{
return 0;
}
+/*
+ * Stub for builds without user memory support: dma-buf import is not
+ * available, so report "not supported" rather than "invalid argument".
+ * The signature matches the real prototype, including the const ops.
+ */
+static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+						 unsigned long offset,
+						 size_t size, int fd,
+						 int access,
+						 const struct dma_buf_attach_ops *ops)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+/*
+ * Stub for builds without user memory support: mapping is not available,
+ * so report "not supported" rather than "invalid argument".
+ */
+static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	return -EOPNOTSUPP;
+}
+/* Stub: nothing can have been mapped, so there is nothing to unmap. */
+static inline void
+ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+}
+/* Stub: no dma-buf umem can exist, so there is nothing to release. */
+static inline void
+ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+}
#endif /* CONFIG_INFINIBAND_USER_MEM */
-
#endif /* IB_UMEM_H */
--
1.8.3.1
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
2020-12-07 22:15 ` [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region Jianxin Xiong
@ 2020-12-08 7:05 ` Leon Romanovsky
2020-12-08 7:10 ` Xiong, Jianxin
2020-12-08 18:13 ` Xiong, Jianxin
0 siblings, 2 replies; 13+ messages in thread
From: Leon Romanovsky @ 2020-12-08 7:05 UTC (permalink / raw)
To: Jianxin Xiong
Cc: linux-rdma, dri-devel, Doug Ledford, Jason Gunthorpe,
Sumit Semwal, Christian Koenig, Daniel Vetter
On Mon, Dec 07, 2020 at 02:15:50PM -0800, Jianxin Xiong wrote:
> Dma-buf is a standard cross-driver buffer sharing mechanism that can be
> used to support peer-to-peer access from RDMA devices.
>
> Device memory exported via dma-buf is associated with a file descriptor.
> This is passed to the user space as a property associated with the
> buffer allocation. When the buffer is registered as a memory region,
> the file descriptor is passed to the RDMA driver along with other
> parameters.
>
> Implement the common code for importing dma-buf object and mapping
> dma-buf pages.
>
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Acked-by: Christian Koenig <christian.koenig@amd.com>
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
>
> Conflicts:
> include/rdma/ib_umem.h
This is probably a leftover from the rebase, am I right?
> ---
> drivers/infiniband/core/Makefile | 2 +-
> drivers/infiniband/core/umem.c | 3 +
> drivers/infiniband/core/umem_dmabuf.c | 173 ++++++++++++++++++++++++++++++++++
> include/rdma/ib_umem.h | 43 ++++++++-
> 4 files changed, 219 insertions(+), 2 deletions(-)
> create mode 100644 drivers/infiniband/core/umem_dmabuf.c
>
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index ccf2670..8ab4eea 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -40,5 +40,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
> uverbs_std_types_srq.o \
> uverbs_std_types_wq.o \
> uverbs_std_types_qp.o
> -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
> ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 7ca4112..cc131f8 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -2,6 +2,7 @@
> * Copyright (c) 2005 Topspin Communications. All rights reserved.
> * Copyright (c) 2005 Cisco Systems. All rights reserved.
> * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> *
> * This software is available to you under a choice of one of two
> * licenses. You may choose to be licensed under the terms of the GNU
> @@ -278,6 +279,8 @@ void ib_umem_release(struct ib_umem *umem)
> {
> if (!umem)
> return;
> + if (umem->is_dmabuf)
> + return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
> if (umem->is_odp)
> return ib_umem_odp_release(to_ib_umem_odp(umem));
>
> diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
> new file mode 100644
> index 0000000..e50b955
> --- /dev/null
> +++ b/drivers/infiniband/core/umem_dmabuf.c
> @@ -0,0 +1,173 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/dma-resv.h>
> +#include <linux/dma-mapping.h>
> +
> +#include "uverbs.h"
> +
> +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> + struct sg_table *sgt;
> + struct scatterlist *sg;
> + struct dma_fence *fence;
> + unsigned long start, end, cur;
> + unsigned int nmap;
> + int i;
> +
> + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> +
> + if (umem_dmabuf->sgt)
> + return 0;
> +
> + sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL);
> + if (IS_ERR(sgt))
> + return PTR_ERR(sgt);
> +
> + /* modify the sg list in-place to match umem address and length */
> +
> + start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE);
> + end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length,
> + PAGE_SIZE);
> + cur = 0;
> + nmap = 0;
Better to put as part of variable initialization.
> + for_each_sgtable_dma_sg(sgt, sg, i) {
> + if (start < cur + sg_dma_len(sg) && cur < end)
> + nmap++;
> + if (cur <= start && start < cur + sg_dma_len(sg)) {
> + unsigned long offset = start - cur;
> +
> + umem_dmabuf->first_sg = sg;
> + umem_dmabuf->first_sg_offset = offset;
> + sg_dma_address(sg) += offset;
> + sg_dma_len(sg) -= offset;
> + cur += offset;
> + }
> + if (cur < end && end <= cur + sg_dma_len(sg)) {
> + unsigned long trim = cur + sg_dma_len(sg) - end;
> +
> + umem_dmabuf->last_sg = sg;
> + umem_dmabuf->last_sg_trim = trim;
> + sg_dma_len(sg) -= trim;
> + break;
> + }
> + cur += sg_dma_len(sg);
> + }
> +
> + umem_dmabuf->umem.sg_head.sgl = umem_dmabuf->first_sg;
> + umem_dmabuf->umem.sg_head.nents = nmap;
> + umem_dmabuf->umem.nmap = nmap;
> + umem_dmabuf->sgt = sgt;
> +
> + /*
> + * Although the sg list is valid now, the content of the pages
> + * may be not up-to-date. Wait for the exporter to finish
> + * the migration.
> + */
> + fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
> + if (fence)
> + dma_fence_wait(fence, false);
Is there any reason not to check the return value of dma_fence_wait()?
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
> +
> +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> +
> + if (!umem_dmabuf->sgt)
> + return;
> +
> + /* retore the original sg list */
> + if (umem_dmabuf->first_sg) {
> + sg_dma_address(umem_dmabuf->first_sg) -=
> + umem_dmabuf->first_sg_offset;
> + sg_dma_len(umem_dmabuf->first_sg) +=
> + umem_dmabuf->first_sg_offset;
> + umem_dmabuf->first_sg = NULL;
> + umem_dmabuf->first_sg_offset = 0;
> + }
> + if (umem_dmabuf->last_sg) {
> + sg_dma_len(umem_dmabuf->last_sg) +=
> + umem_dmabuf->last_sg_trim;
> + umem_dmabuf->last_sg = NULL;
> + umem_dmabuf->last_sg_trim = 0;
> + }
> +
> + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
> + DMA_BIDIRECTIONAL);
> +
> + umem_dmabuf->sgt = NULL;
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
> +
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> + unsigned long offset, size_t size,
> + int fd, int access,
> + const struct dma_buf_attach_ops *ops)
> +{
> + struct dma_buf *dmabuf;
> + struct ib_umem_dmabuf *umem_dmabuf;
> + struct ib_umem *umem;
> + unsigned long end;
> + long ret = -EINVAL;
This is the wrong type for the returned value. One possible option
is to declare "struct ib_umem *ret;" and set ret = ERR_PTR(-EINVAL) or
ret = ERR_CAST(dmabuf);
> +
> + if (check_add_overflow(offset, (unsigned long)size, &end))
> + return ERR_PTR(-EINVAL);
> +
> + if (unlikely(!ops || !ops->move_notify))
Let's not put likely/unlikely in control paths.
> + return ERR_PTR(-EINVAL);
> +
> + dmabuf = dma_buf_get(fd);
> + if (IS_ERR(dmabuf))
> + return (void *)dmabuf;
return ERR_CAST(dmabuf);
> +
> + if (dmabuf->size < end)
> + goto out_release_dmabuf;
> +
> + umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> + if (!umem_dmabuf)
> + return ERR_PTR(-ENOMEM);
You are leaking dmabuf here; you forgot to call dma_buf_put().
> +
> + umem = &umem_dmabuf->umem;
> + umem->ibdev = device;
> + umem->length = size;
> + umem->address = offset;
> + umem->writable = ib_access_writable(access);
> + umem->is_dmabuf = 1;
> +
> + if (unlikely(!ib_umem_num_pages(umem)))
There is no advantage in "unlikely" here.
> + goto out_free_umem;
> +
> + umem_dmabuf->attach = dma_buf_dynamic_attach(
> + dmabuf,
> + device->dma_device,
> + ops,
> + umem_dmabuf);
> + if (IS_ERR(umem_dmabuf->attach)) {
> + ret = PTR_ERR(umem_dmabuf->attach);
> + goto out_free_umem;
> + }
> + return umem;
> +
> +out_free_umem:
> + kfree(umem_dmabuf);
> +
> +out_release_dmabuf:
> + dma_buf_put(dmabuf);
> + return ERR_PTR(ret);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> +
> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
> +
> + dma_buf_detach(dmabuf, umem_dmabuf->attach);
> + dma_buf_put(dmabuf);
> + kfree(umem_dmabuf);
> +}
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index 7752211..b49a96d 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -1,6 +1,7 @@
> /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> /*
> * Copyright (c) 2007 Cisco Systems. All rights reserved.
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> */
>
> #ifndef IB_UMEM_H
> @@ -13,6 +14,7 @@
>
> struct ib_ucontext;
> struct ib_umem_odp;
> +struct dma_buf_attach_ops;
>
> struct ib_umem {
> struct ib_device *ibdev;
> @@ -22,12 +24,29 @@ struct ib_umem {
> unsigned long address;
> u32 writable : 1;
> u32 is_odp : 1;
> + u32 is_dmabuf : 1;
> struct work_struct work;
> struct sg_table sg_head;
> int nmap;
> unsigned int sg_nents;
> };
>
> +struct ib_umem_dmabuf {
> + struct ib_umem umem;
> + struct dma_buf_attachment *attach;
> + struct sg_table *sgt;
> + struct scatterlist *first_sg;
> + struct scatterlist *last_sg;
> + unsigned long first_sg_offset;
> + unsigned long last_sg_trim;
> + void *private;
> +};
> +
> +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
> +{
> + return container_of(umem, struct ib_umem_dmabuf, umem);
> +}
> +
> /* Returns the offset of the umem start relative to the first page. */
> static inline int ib_umem_offset(struct ib_umem *umem)
> {
> @@ -86,6 +105,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
> unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
> unsigned long pgsz_bitmap,
> unsigned long virt);
> +
> /**
> * ib_umem_find_best_pgoff - Find best HW page size
> *
> @@ -116,6 +136,14 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
> dma_addr & pgoff_bitmask);
> }
>
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> + unsigned long offset, size_t size,
> + int fd, int access,
> + const struct dma_buf_attach_ops *ops);
> +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf);
> +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> +
> #else /* CONFIG_INFINIBAND_USER_MEM */
>
> #include <linux/err.h>
> @@ -143,7 +171,20 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
> {
> return 0;
> }
> +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> + unsigned long offset,
> + size_t size, int fd,
> + int access,
> + struct dma_buf_attach_ops *ops)
> +{
> + return ERR_PTR(-EINVAL);
This should probably be EOPNOTSUPP rather than EINVAL.
> +}
> +static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> + return -EINVAL;
> +}
> +static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { }
> +static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { }
>
> #endif /* CONFIG_INFINIBAND_USER_MEM */
> -
> #endif /* IB_UMEM_H */
> --
> 1.8.3.1
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
2020-12-08 7:05 ` Leon Romanovsky
@ 2020-12-08 7:10 ` Xiong, Jianxin
2020-12-08 18:13 ` Xiong, Jianxin
1 sibling, 0 replies; 13+ messages in thread
From: Xiong, Jianxin @ 2020-12-08 7:10 UTC (permalink / raw)
To: Leon Romanovsky
Cc: linux-rdma@vger.kernel.org, dri-devel@lists.freedesktop.org,
Doug Ledford, Jason Gunthorpe, Sumit Semwal, Christian Koenig,
Vetter, Daniel
> -----Original Message-----
> From: Leon Romanovsky <leon@kernel.org>
> Sent: Monday, December 07, 2020 11:06 PM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel <daniel.vetter@intel.com>
> Subject: Re: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
>
> On Mon, Dec 07, 2020 at 02:15:50PM -0800, Jianxin Xiong wrote:
> > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > be used to support peer-to-peer access from RDMA devices.
> >
> > Device memory exported via dma-buf is associated with a file descriptor.
> > This is passed to the user space as a property associated with the
> > buffer allocation. When the buffer is registered as a memory region,
> > the file descriptor is passed to the RDMA driver along with other
> > parameters.
> >
> > Implement the common code for importing dma-buf object and mapping
> > dma-buf pages.
> >
> > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Acked-by: Christian Koenig <christian.koenig@amd.com>
> > Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> >
> > Conflicts:
> > include/rdma/ib_umem.h
>
> This probably leftover from rebase, am I right?
Right. Should have removed it.
>
> > ---
> > drivers/infiniband/core/Makefile | 2 +-
> > drivers/infiniband/core/umem.c | 3 +
> > drivers/infiniband/core/umem_dmabuf.c | 173 ++++++++++++++++++++++++++++++++++
> > include/rdma/ib_umem.h | 43 ++++++++-
> > 4 files changed, 219 insertions(+), 2 deletions(-) create mode
> > 100644 drivers/infiniband/core/umem_dmabuf.c
> >
> > diff --git a/drivers/infiniband/core/Makefile
> > b/drivers/infiniband/core/Makefile
> > index ccf2670..8ab4eea 100644
> > --- a/drivers/infiniband/core/Makefile
> > +++ b/drivers/infiniband/core/Makefile
> > @@ -40,5 +40,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
> > uverbs_std_types_srq.o \
> > uverbs_std_types_wq.o \
> > uverbs_std_types_qp.o
> > -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> > +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
> > ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff
> > --git a/drivers/infiniband/core/umem.c
> > b/drivers/infiniband/core/umem.c index 7ca4112..cc131f8 100644
> > --- a/drivers/infiniband/core/umem.c
> > +++ b/drivers/infiniband/core/umem.c
> > @@ -2,6 +2,7 @@
> > * Copyright (c) 2005 Topspin Communications. All rights reserved.
> > * Copyright (c) 2005 Cisco Systems. All rights reserved.
> > * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > *
> > * This software is available to you under a choice of one of two
> > * licenses. You may choose to be licensed under the terms of the
> > GNU @@ -278,6 +279,8 @@ void ib_umem_release(struct ib_umem *umem) {
> > if (!umem)
> > return;
> > + if (umem->is_dmabuf)
> > + return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
> > if (umem->is_odp)
> > return ib_umem_odp_release(to_ib_umem_odp(umem));
> >
> > diff --git a/drivers/infiniband/core/umem_dmabuf.c
> > b/drivers/infiniband/core/umem_dmabuf.c
> > new file mode 100644
> > index 0000000..e50b955
> > --- /dev/null
> > +++ b/drivers/infiniband/core/umem_dmabuf.c
> > @@ -0,0 +1,173 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > +/*
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > + */
> > +
> > +#include <linux/dma-buf.h>
> > +#include <linux/dma-resv.h>
> > +#include <linux/dma-mapping.h>
> > +
> > +#include "uverbs.h"
> > +
> > +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > + struct sg_table *sgt;
> > + struct scatterlist *sg;
> > + struct dma_fence *fence;
> > + unsigned long start, end, cur;
> > + unsigned int nmap;
> > + int i;
> > +
> > + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > +
> > + if (umem_dmabuf->sgt)
> > + return 0;
> > +
> > + sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL);
> > + if (IS_ERR(sgt))
> > + return PTR_ERR(sgt);
> > +
> > + /* modify the sg list in-place to match umem address and length */
> > +
> > + start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE);
> > + end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length,
> > + PAGE_SIZE);
> > + cur = 0;
> > + nmap = 0;
>
> Better to put as part of variable initialization.
>
> > + for_each_sgtable_dma_sg(sgt, sg, i) {
> > + if (start < cur + sg_dma_len(sg) && cur < end)
> > + nmap++;
> > + if (cur <= start && start < cur + sg_dma_len(sg)) {
> > + unsigned long offset = start - cur;
> > +
> > + umem_dmabuf->first_sg = sg;
> > + umem_dmabuf->first_sg_offset = offset;
> > + sg_dma_address(sg) += offset;
> > + sg_dma_len(sg) -= offset;
> > + cur += offset;
> > + }
> > + if (cur < end && end <= cur + sg_dma_len(sg)) {
> > + unsigned long trim = cur + sg_dma_len(sg) - end;
> > +
> > + umem_dmabuf->last_sg = sg;
> > + umem_dmabuf->last_sg_trim = trim;
> > + sg_dma_len(sg) -= trim;
> > + break;
> > + }
> > + cur += sg_dma_len(sg);
> > + }
> > +
> > + umem_dmabuf->umem.sg_head.sgl = umem_dmabuf->first_sg;
> > + umem_dmabuf->umem.sg_head.nents = nmap;
> > + umem_dmabuf->umem.nmap = nmap;
> > + umem_dmabuf->sgt = sgt;
> > +
> > + /*
> > + * Although the sg list is valid now, the content of the pages
> > + * may be not up-to-date. Wait for the exporter to finish
> > + * the migration.
> > + */
> > + fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
> > + if (fence)
> > + dma_fence_wait(fence, false);
>
> Any reason do not check return result from dma_fence_wait()?
>
> > +
> > + return 0;
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
> > +
> > +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > +
> > + if (!umem_dmabuf->sgt)
> > + return;
> > +
> > + /* retore the original sg list */
> > + if (umem_dmabuf->first_sg) {
> > + sg_dma_address(umem_dmabuf->first_sg) -=
> > + umem_dmabuf->first_sg_offset;
> > + sg_dma_len(umem_dmabuf->first_sg) +=
> > + umem_dmabuf->first_sg_offset;
> > + umem_dmabuf->first_sg = NULL;
> > + umem_dmabuf->first_sg_offset = 0;
> > + }
> > + if (umem_dmabuf->last_sg) {
> > + sg_dma_len(umem_dmabuf->last_sg) +=
> > + umem_dmabuf->last_sg_trim;
> > + umem_dmabuf->last_sg = NULL;
> > + umem_dmabuf->last_sg_trim = 0;
> > + }
> > +
> > + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
> > + DMA_BIDIRECTIONAL);
> > +
> > + umem_dmabuf->sgt = NULL;
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
> > +
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > + unsigned long offset, size_t size,
> > + int fd, int access,
> > + const struct dma_buf_attach_ops *ops) {
> > + struct dma_buf *dmabuf;
> > + struct ib_umem_dmabuf *umem_dmabuf;
> > + struct ib_umem *umem;
> > + unsigned long end;
> > + long ret = -EINVAL;
>
> It is wrong type for the returned value. One of the possible options is to declare "struct ib_umem *ret;" and set ret = ERR_PTR(-EINVAL) or
> ret = ERR_CAST(dmabuf);
>
> > +
> > + if (check_add_overflow(offset, (unsigned long)size, &end))
> > + return ERR_PTR(-EINVAL);
> > +
> > + if (unlikely(!ops || !ops->move_notify))
>
> Let's not put likely/unlikely in control paths.
>
> > + return ERR_PTR(-EINVAL);
> > +
> > + dmabuf = dma_buf_get(fd);
> > + if (IS_ERR(dmabuf))
> > + return (void *)dmabuf;
>
> return ERR_CAST(dmabuf);
>
> > +
> > + if (dmabuf->size < end)
> > + goto out_release_dmabuf;
> > +
> > + umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> > + if (!umem_dmabuf)
> > + return ERR_PTR(-ENOMEM);
>
> You are leaking dmabuf here, forgot to call to dma_buf_put();
>
> > +
> > + umem = &umem_dmabuf->umem;
> > + umem->ibdev = device;
> > + umem->length = size;
> > + umem->address = offset;
> > + umem->writable = ib_access_writable(access);
> > + umem->is_dmabuf = 1;
> > +
> > + if (unlikely(!ib_umem_num_pages(umem)))
>
> There is no advantage in "unlikely" here.
>
> > + goto out_free_umem;
> > +
> > + umem_dmabuf->attach = dma_buf_dynamic_attach(
> > + dmabuf,
> > + device->dma_device,
> > + ops,
> > + umem_dmabuf);
> > + if (IS_ERR(umem_dmabuf->attach)) {
> > + ret = PTR_ERR(umem_dmabuf->attach);
> > + goto out_free_umem;
> > + }
> > + return umem;
> > +
> > +out_free_umem:
> > + kfree(umem_dmabuf);
> > +
> > +out_release_dmabuf:
> > + dma_buf_put(dmabuf);
> > + return ERR_PTR(ret);
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> > +
> > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) {
> > + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
> > +
> > + dma_buf_detach(dmabuf, umem_dmabuf->attach);
> > + dma_buf_put(dmabuf);
> > + kfree(umem_dmabuf);
> > +}
> > diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index
> > 7752211..b49a96d 100644
> > --- a/include/rdma/ib_umem.h
> > +++ b/include/rdma/ib_umem.h
> > @@ -1,6 +1,7 @@
> > /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> > /*
> > * Copyright (c) 2007 Cisco Systems. All rights reserved.
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > */
> >
> > #ifndef IB_UMEM_H
> > @@ -13,6 +14,7 @@
> >
> > struct ib_ucontext;
> > struct ib_umem_odp;
> > +struct dma_buf_attach_ops;
> >
> > struct ib_umem {
> > struct ib_device *ibdev;
> > @@ -22,12 +24,29 @@ struct ib_umem {
> > unsigned long address;
> > u32 writable : 1;
> > u32 is_odp : 1;
> > + u32 is_dmabuf : 1;
> > struct work_struct work;
> > struct sg_table sg_head;
> > int nmap;
> > unsigned int sg_nents;
> > };
> >
> > +struct ib_umem_dmabuf {
> > + struct ib_umem umem;
> > + struct dma_buf_attachment *attach;
> > + struct sg_table *sgt;
> > + struct scatterlist *first_sg;
> > + struct scatterlist *last_sg;
> > + unsigned long first_sg_offset;
> > + unsigned long last_sg_trim;
> > + void *private;
> > +};
> > +
> > +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem
> > +*umem) {
> > + return container_of(umem, struct ib_umem_dmabuf, umem); }
> > +
> > /* Returns the offset of the umem start relative to the first page.
> > */ static inline int ib_umem_offset(struct ib_umem *umem) { @@ -86,6
> > +105,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem,
> > size_t offset, unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
> > unsigned long pgsz_bitmap,
> > unsigned long virt);
> > +
> > /**
> > * ib_umem_find_best_pgoff - Find best HW page size
> > *
> > @@ -116,6 +136,14 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
> > dma_addr & pgoff_bitmask);
> > }
> >
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > + unsigned long offset, size_t size,
> > + int fd, int access,
> > + const struct dma_buf_attach_ops *ops); int
> > +ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void
> > +ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void
> > +ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> > +
> > #else /* CONFIG_INFINIBAND_USER_MEM */
> >
> > #include <linux/err.h>
> > @@ -143,7 +171,20 @@ static inline unsigned long
> > ib_umem_find_best_pgoff(struct ib_umem *umem, {
> > return 0;
> > }
> > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > + unsigned long offset,
> > + size_t size, int fd,
> > + int access,
> > + struct dma_buf_attach_ops *ops) {
> > + return ERR_PTR(-EINVAL);
>
> Probably, It should be EOPNOTSUPP and not EINVAL.
>
> > +}
> > +static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf
> > +*umem_dmabuf) {
> > + return -EINVAL;
> > +}
> > +static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf
> > +*umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct
> > +ib_umem_dmabuf *umem_dmabuf) { }
> >
> > #endif /* CONFIG_INFINIBAND_USER_MEM */
> > -
> > #endif /* IB_UMEM_H */
> > --
> > 1.8.3.1
> >
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
2020-12-08 7:05 ` Leon Romanovsky
2020-12-08 7:10 ` Xiong, Jianxin
@ 2020-12-08 18:13 ` Xiong, Jianxin
2020-12-08 18:59 ` Jason Gunthorpe
2020-12-08 19:02 ` Leon Romanovsky
1 sibling, 2 replies; 13+ messages in thread
From: Xiong, Jianxin @ 2020-12-08 18:13 UTC (permalink / raw)
To: Leon Romanovsky
Cc: linux-rdma@vger.kernel.org, dri-devel@lists.freedesktop.org,
Doug Ledford, Jason Gunthorpe, Sumit Semwal, Christian Koenig,
Vetter, Daniel
> -----Original Message-----
> From: Leon Romanovsky <leon@kernel.org>
> Sent: Monday, December 07, 2020 11:06 PM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel <daniel.vetter@intel.com>
> Subject: Re: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
>
> On Mon, Dec 07, 2020 at 02:15:50PM -0800, Jianxin Xiong wrote:
> > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > be used to support peer-to-peer access from RDMA devices.
> >
> > Device memory exported via dma-buf is associated with a file descriptor.
> > This is passed to the user space as a property associated with the
> > buffer allocation. When the buffer is registered as a memory region,
> > the file descriptor is passed to the RDMA driver along with other
> > parameters.
> >
> > Implement the common code for importing dma-buf object and mapping
> > dma-buf pages.
> >
> > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Acked-by: Christian Koenig <christian.koenig@amd.com>
> > Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> >
> > Conflicts:
> > include/rdma/ib_umem.h
>
> This is probably a leftover from a rebase, am I right?
>
> > ---
> > drivers/infiniband/core/Makefile | 2 +-
> > drivers/infiniband/core/umem.c | 3 +
> > drivers/infiniband/core/umem_dmabuf.c | 173 ++++++++++++++++++++++++++++++++++
> > include/rdma/ib_umem.h | 43 ++++++++-
> > 4 files changed, 219 insertions(+), 2 deletions(-) create mode
> > 100644 drivers/infiniband/core/umem_dmabuf.c
> >
> > diff --git a/drivers/infiniband/core/Makefile
> > b/drivers/infiniband/core/Makefile
> > index ccf2670..8ab4eea 100644
> > --- a/drivers/infiniband/core/Makefile
> > +++ b/drivers/infiniband/core/Makefile
> > @@ -40,5 +40,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
> > uverbs_std_types_srq.o \
> > uverbs_std_types_wq.o \
> > uverbs_std_types_qp.o
> > -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> > +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
> > ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff
> > --git a/drivers/infiniband/core/umem.c
> > b/drivers/infiniband/core/umem.c index 7ca4112..cc131f8 100644
> > --- a/drivers/infiniband/core/umem.c
> > +++ b/drivers/infiniband/core/umem.c
> > @@ -2,6 +2,7 @@
> > * Copyright (c) 2005 Topspin Communications. All rights reserved.
> > * Copyright (c) 2005 Cisco Systems. All rights reserved.
> > * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > *
> > * This software is available to you under a choice of one of two
> > * licenses. You may choose to be licensed under the terms of the
> > GNU @@ -278,6 +279,8 @@ void ib_umem_release(struct ib_umem *umem) {
> > if (!umem)
> > return;
> > + if (umem->is_dmabuf)
> > + return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
> > if (umem->is_odp)
> > return ib_umem_odp_release(to_ib_umem_odp(umem));
> >
> > diff --git a/drivers/infiniband/core/umem_dmabuf.c
> > b/drivers/infiniband/core/umem_dmabuf.c
> > new file mode 100644
> > index 0000000..e50b955
> > --- /dev/null
> > +++ b/drivers/infiniband/core/umem_dmabuf.c
> > @@ -0,0 +1,173 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > +/*
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > + */
> > +
> > +#include <linux/dma-buf.h>
> > +#include <linux/dma-resv.h>
> > +#include <linux/dma-mapping.h>
> > +
> > +#include "uverbs.h"
> > +
> > +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > + struct sg_table *sgt;
> > + struct scatterlist *sg;
> > + struct dma_fence *fence;
> > + unsigned long start, end, cur;
> > + unsigned int nmap;
> > + int i;
> > +
> > + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > +
> > + if (umem_dmabuf->sgt)
> > + return 0;
> > +
> > + sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL);
> > + if (IS_ERR(sgt))
> > + return PTR_ERR(sgt);
> > +
> > + /* modify the sg list in-place to match umem address and length */
> > +
> > + start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE);
> > + end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length,
> > + PAGE_SIZE);
> > + cur = 0;
> > + nmap = 0;
>
> Better to do this as part of the variable initialization.
Sure, can change that way.
>
> > + for_each_sgtable_dma_sg(sgt, sg, i) {
> > + if (start < cur + sg_dma_len(sg) && cur < end)
> > + nmap++;
> > + if (cur <= start && start < cur + sg_dma_len(sg)) {
> > + unsigned long offset = start - cur;
> > +
> > + umem_dmabuf->first_sg = sg;
> > + umem_dmabuf->first_sg_offset = offset;
> > + sg_dma_address(sg) += offset;
> > + sg_dma_len(sg) -= offset;
> > + cur += offset;
> > + }
> > + if (cur < end && end <= cur + sg_dma_len(sg)) {
> > + unsigned long trim = cur + sg_dma_len(sg) - end;
> > +
> > + umem_dmabuf->last_sg = sg;
> > + umem_dmabuf->last_sg_trim = trim;
> > + sg_dma_len(sg) -= trim;
> > + break;
> > + }
> > + cur += sg_dma_len(sg);
> > + }
> > +
> > + umem_dmabuf->umem.sg_head.sgl = umem_dmabuf->first_sg;
> > + umem_dmabuf->umem.sg_head.nents = nmap;
> > + umem_dmabuf->umem.nmap = nmap;
> > + umem_dmabuf->sgt = sgt;
> > +
> > + /*
> > + * Although the sg list is valid now, the content of the pages
> > + * may be not up-to-date. Wait for the exporter to finish
> > + * the migration.
> > + */
> > + fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
> > + if (fence)
> > + dma_fence_wait(fence, false);
>
> Any reason not to check the return value of dma_fence_wait()?
This is called with the interruptible flag set to false, so it should normally only return 0.
I do see similar call sites, some of which check the result and some of which don't. Maybe
we can add a WARN_ON here?
>
> > +
> > + return 0;
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
> > +
> > +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > +
> > + if (!umem_dmabuf->sgt)
> > + return;
> > +
> > + /* restore the original sg list */
> > + if (umem_dmabuf->first_sg) {
> > + sg_dma_address(umem_dmabuf->first_sg) -=
> > + umem_dmabuf->first_sg_offset;
> > + sg_dma_len(umem_dmabuf->first_sg) +=
> > + umem_dmabuf->first_sg_offset;
> > + umem_dmabuf->first_sg = NULL;
> > + umem_dmabuf->first_sg_offset = 0;
> > + }
> > + if (umem_dmabuf->last_sg) {
> > + sg_dma_len(umem_dmabuf->last_sg) +=
> > + umem_dmabuf->last_sg_trim;
> > + umem_dmabuf->last_sg = NULL;
> > + umem_dmabuf->last_sg_trim = 0;
> > + }
> > +
> > + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
> > + DMA_BIDIRECTIONAL);
> > +
> > + umem_dmabuf->sgt = NULL;
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
> > +
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > + unsigned long offset, size_t size,
> > + int fd, int access,
> > + const struct dma_buf_attach_ops *ops) {
> > + struct dma_buf *dmabuf;
> > + struct ib_umem_dmabuf *umem_dmabuf;
> > + struct ib_umem *umem;
> > + unsigned long end;
> > + long ret = -EINVAL;
>
> It is the wrong type for the returned value. One possible option is to declare "struct ib_umem *ret;" and set ret = ERR_PTR(-EINVAL) or
> ret = ERR_CAST(dmabuf);
At the actual point the value is returned, ERR_PTR(ret) is used. I think we can change the
variable name to "err" instead to avoid confusion.
>
> > +
> > + if (check_add_overflow(offset, (unsigned long)size, &end))
> > + return ERR_PTR(-EINVAL);
> > +
> > + if (unlikely(!ops || !ops->move_notify))
>
> Let's not put likely/unlikely in control paths.
>
> > + return ERR_PTR(-EINVAL);
> > +
> > + dmabuf = dma_buf_get(fd);
> > + if (IS_ERR(dmabuf))
> > + return (void *)dmabuf;
>
> return ERR_CAST(dmabuf);
Will fix.
>
> > +
> > + if (dmabuf->size < end)
> > + goto out_release_dmabuf;
> > +
> > + umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> > + if (!umem_dmabuf)
> > + return ERR_PTR(-ENOMEM);
>
> You are leaking dmabuf here; you forgot to call dma_buf_put().
Will fix.
>
> > +
> > + umem = &umem_dmabuf->umem;
> > + umem->ibdev = device;
> > + umem->length = size;
> > + umem->address = offset;
> > + umem->writable = ib_access_writable(access);
> > + umem->is_dmabuf = 1;
> > +
> > + if (unlikely(!ib_umem_num_pages(umem)))
>
> There is no advantage in "unlikely" here.
Ok.
>
> > + goto out_free_umem;
> > +
> > + umem_dmabuf->attach = dma_buf_dynamic_attach(
> > + dmabuf,
> > + device->dma_device,
> > + ops,
> > + umem_dmabuf);
> > + if (IS_ERR(umem_dmabuf->attach)) {
> > + ret = PTR_ERR(umem_dmabuf->attach);
> > + goto out_free_umem;
> > + }
> > + return umem;
> > +
> > +out_free_umem:
> > + kfree(umem_dmabuf);
> > +
> > +out_release_dmabuf:
> > + dma_buf_put(dmabuf);
> > + return ERR_PTR(ret);
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> > +
> > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) {
> > + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
> > +
> > + dma_buf_detach(dmabuf, umem_dmabuf->attach);
> > + dma_buf_put(dmabuf);
> > + kfree(umem_dmabuf);
> > +}
> > diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index
> > 7752211..b49a96d 100644
> > --- a/include/rdma/ib_umem.h
> > +++ b/include/rdma/ib_umem.h
> > @@ -1,6 +1,7 @@
> > /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> > /*
> > * Copyright (c) 2007 Cisco Systems. All rights reserved.
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > */
> >
> > #ifndef IB_UMEM_H
> > @@ -13,6 +14,7 @@
> >
> > struct ib_ucontext;
> > struct ib_umem_odp;
> > +struct dma_buf_attach_ops;
> >
> > struct ib_umem {
> > struct ib_device *ibdev;
> > @@ -22,12 +24,29 @@ struct ib_umem {
> > unsigned long address;
> > u32 writable : 1;
> > u32 is_odp : 1;
> > + u32 is_dmabuf : 1;
> > struct work_struct work;
> > struct sg_table sg_head;
> > int nmap;
> > unsigned int sg_nents;
> > };
> >
> > +struct ib_umem_dmabuf {
> > + struct ib_umem umem;
> > + struct dma_buf_attachment *attach;
> > + struct sg_table *sgt;
> > + struct scatterlist *first_sg;
> > + struct scatterlist *last_sg;
> > + unsigned long first_sg_offset;
> > + unsigned long last_sg_trim;
> > + void *private;
> > +};
> > +
> > +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem
> > +*umem) {
> > + return container_of(umem, struct ib_umem_dmabuf, umem); }
> > +
> > /* Returns the offset of the umem start relative to the first page.
> > */ static inline int ib_umem_offset(struct ib_umem *umem) { @@ -86,6
> > +105,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem,
> > size_t offset, unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
> > unsigned long pgsz_bitmap,
> > unsigned long virt);
> > +
> > /**
> > * ib_umem_find_best_pgoff - Find best HW page size
> > *
> > @@ -116,6 +136,14 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
> > dma_addr & pgoff_bitmask);
> > }
> >
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > + unsigned long offset, size_t size,
> > + int fd, int access,
> > + const struct dma_buf_attach_ops *ops); int
> > +ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void
> > +ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void
> > +ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> > +
> > #else /* CONFIG_INFINIBAND_USER_MEM */
> >
> > #include <linux/err.h>
> > @@ -143,7 +171,20 @@ static inline unsigned long
> > ib_umem_find_best_pgoff(struct ib_umem *umem, {
> > return 0;
> > }
> > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > + unsigned long offset,
> > + size_t size, int fd,
> > + int access,
> > + struct dma_buf_attach_ops *ops) {
> > + return ERR_PTR(-EINVAL);
>
> It should probably be EOPNOTSUPP and not EINVAL.
EINVAL is used here to be consistent with existing definitions in the same file.
>
> > +}
> > +static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf
> > +*umem_dmabuf) {
> > + return -EINVAL;
> > +}
> > +static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf
> > +*umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct
> > +ib_umem_dmabuf *umem_dmabuf) { }
> >
> > #endif /* CONFIG_INFINIBAND_USER_MEM */
> > -
> > #endif /* IB_UMEM_H */
> > --
> > 1.8.3.1
> >
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
2020-12-08 18:13 ` Xiong, Jianxin
@ 2020-12-08 18:59 ` Jason Gunthorpe
2020-12-08 19:33 ` Xiong, Jianxin
2020-12-08 19:02 ` Leon Romanovsky
1 sibling, 1 reply; 13+ messages in thread
From: Jason Gunthorpe @ 2020-12-08 18:59 UTC (permalink / raw)
To: Xiong, Jianxin
Cc: Leon Romanovsky, linux-rdma@vger.kernel.org,
dri-devel@lists.freedesktop.org, Doug Ledford, Sumit Semwal,
Christian Koenig, Vetter, Daniel
On Tue, Dec 08, 2020 at 06:13:20PM +0000, Xiong, Jianxin wrote:
> > > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > + unsigned long offset,
> > > + size_t size, int fd,
> > > + int access,
> > > + struct dma_buf_attach_ops *ops) {
> > > + return ERR_PTR(-EINVAL);
> >
> > Probably, It should be EOPNOTSUPP and not EINVAL.
>
> EINVAL is used here to be consistent with existing definitions in the same file.
They may be wrong; EOPNOTSUPP is right for this situation.
Jason
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
2020-12-08 18:59 ` Jason Gunthorpe
@ 2020-12-08 19:33 ` Xiong, Jianxin
0 siblings, 0 replies; 13+ messages in thread
From: Xiong, Jianxin @ 2020-12-08 19:33 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Leon Romanovsky, linux-rdma@vger.kernel.org,
dri-devel@lists.freedesktop.org, Doug Ledford, Sumit Semwal,
Christian Koenig, Vetter, Daniel
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Tuesday, December 08, 2020 10:59 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: Leon Romanovsky <leon@kernel.org>; linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford
> <dledford@redhat.com>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
>
> On Tue, Dec 08, 2020 at 06:13:20PM +0000, Xiong, Jianxin wrote:
>
> > > > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > > + unsigned long offset,
> > > > + size_t size, int fd,
> > > > + int access,
> > > > + struct dma_buf_attach_ops *ops) {
> > > > + return ERR_PTR(-EINVAL);
> > >
> > > Probably, It should be EOPNOTSUPP and not EINVAL.
> >
> > EINVAL is used here to be consistent with existing definitions in the same file.
>
> They may be wrong, EOPNOTSUPP is right for this situation
Ok, let me change all of them to EOPNOTSUPP.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
2020-12-08 18:13 ` Xiong, Jianxin
2020-12-08 18:59 ` Jason Gunthorpe
@ 2020-12-08 19:02 ` Leon Romanovsky
1 sibling, 0 replies; 13+ messages in thread
From: Leon Romanovsky @ 2020-12-08 19:02 UTC (permalink / raw)
To: Xiong, Jianxin
Cc: linux-rdma@vger.kernel.org, dri-devel@lists.freedesktop.org,
Doug Ledford, Jason Gunthorpe, Sumit Semwal, Christian Koenig,
Vetter, Daniel
On Tue, Dec 08, 2020 at 06:13:20PM +0000, Xiong, Jianxin wrote:
> > -----Original Message-----
> > From: Leon Romanovsky <leon@kernel.org>
> > Sent: Monday, December 07, 2020 11:06 PM
> > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> > Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel <daniel.vetter@intel.com>
> > Subject: Re: [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region
> >
> > On Mon, Dec 07, 2020 at 02:15:50PM -0800, Jianxin Xiong wrote:
> > > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > > be used to support peer-to-peer access from RDMA devices.
> > >
> > > Device memory exported via dma-buf is associated with a file descriptor.
> > > This is passed to the user space as a property associated with the
> > > buffer allocation. When the buffer is registered as a memory region,
> > > the file descriptor is passed to the RDMA driver along with other
> > > parameters.
> > >
> > > Implement the common code for importing dma-buf object and mapping
> > > dma-buf pages.
> > >
> > > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > Acked-by: Christian Koenig <christian.koenig@amd.com>
> > > Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> > >
> > > Conflicts:
> > > include/rdma/ib_umem.h
> >
<...>
> > > + /*
> > > + * Although the sg list is valid now, the content of the pages
> > > + * may be not up-to-date. Wait for the exporter to finish
> > > + * the migration.
> > > + */
> > > + fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
> > > + if (fence)
> > > + dma_fence_wait(fence, false);
> >
> > Any reason do not check return result from dma_fence_wait()?
>
> This is called with interruptible flag set to false and normally should only return 0.
> I do see similar usage cases that check the result and don't check the result. Maybe
> we can add a WARN_ON here?
I have no idea :), I just saw that other places check the returned value.
<...>
> > > +
> > > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > + unsigned long offset, size_t size,
> > > + int fd, int access,
> > > + const struct dma_buf_attach_ops *ops) {
> > > + struct dma_buf *dmabuf;
> > > + struct ib_umem_dmabuf *umem_dmabuf;
> > > + struct ib_umem *umem;
> > > + unsigned long end;
> > > + long ret = -EINVAL;
> >
> > It is wrong type for the returned value. One of the possible options is to declare "struct ib_umem *ret;" and set ret = ERR_PTR(-EINVAL) or
> > ret = ERR_CAST(dmabuf);
>
> At the actual point the value is returned, ERR_PTR(ret) is used. I think we can change the
> variable name to "err" instead to avoid confusion.
The point is that "ret" should be declared as "struct ib_umem *" and not
as "long" and ERR_CAST() should be used instead of (void *).
<...>
> > > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > + unsigned long offset,
> > > + size_t size, int fd,
> > > + int access,
> > > + struct dma_buf_attach_ops *ops) {
> > > + return ERR_PTR(-EINVAL);
> >
> > Probably, It should be EOPNOTSUPP and not EINVAL.
>
> EINVAL is used here to be consistent with existing definitions in the same file.
ok
Thanks
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v13 2/4] RDMA/core: Add device method for registering dma-buf based memory region
2020-12-07 22:15 [PATCH v13 0/4] RDMA: Add dma-buf support Jianxin Xiong
2020-12-07 22:15 ` [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region Jianxin Xiong
@ 2020-12-07 22:15 ` Jianxin Xiong
2020-12-08 7:12 ` Leon Romanovsky
2020-12-07 22:15 ` [PATCH v13 3/4] RDMA/uverbs: Add uverbs command for dma-buf based MR registration Jianxin Xiong
2020-12-07 22:15 ` [PATCH v13 4/4] RDMA/mlx5: Support dma-buf based userspace memory region Jianxin Xiong
3 siblings, 1 reply; 13+ messages in thread
From: Jianxin Xiong @ 2020-12-07 22:15 UTC (permalink / raw)
To: linux-rdma, dri-devel
Cc: Jianxin Xiong, Doug Ledford, Jason Gunthorpe, Leon Romanovsky,
Sumit Semwal, Christian Koenig, Daniel Vetter
Dma-buf based memory region requires one extra parameter and is processed
quite differently. Adding a separate method allows clean separation from
regular memory regions.
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
drivers/infiniband/core/device.c | 1 +
include/rdma/ib_verbs.h | 6 +++++-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 3ab1ede..23f7440 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2677,6 +2677,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, read_counters);
SET_DEVICE_OP(dev_ops, reg_dm_mr);
SET_DEVICE_OP(dev_ops, reg_user_mr);
+ SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf);
SET_DEVICE_OP(dev_ops, req_ncomp_notif);
SET_DEVICE_OP(dev_ops, req_notify_cq);
SET_DEVICE_OP(dev_ops, rereg_user_mr);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7bee8ab..fa3882b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2,7 +2,7 @@
/*
* Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
* Copyright (c) 2004 Infinicon Corporation. All rights reserved.
- * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2020 Intel Corporation. All rights reserved.
* Copyright (c) 2004 Topspin Corporation. All rights reserved.
* Copyright (c) 2004 Voltaire Corporation. All rights reserved.
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
@@ -2433,6 +2433,10 @@ struct ib_device_ops {
struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
+ struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr, int fd,
+ int mr_access_flags,
+ struct ib_udata *udata);
int (*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_pd *pd, struct ib_udata *udata);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v13 2/4] RDMA/core: Add device method for registering dma-buf based memory region
2020-12-07 22:15 ` [PATCH v13 2/4] RDMA/core: Add device method for registering dma-buf based " Jianxin Xiong
@ 2020-12-08 7:12 ` Leon Romanovsky
0 siblings, 0 replies; 13+ messages in thread
From: Leon Romanovsky @ 2020-12-08 7:12 UTC (permalink / raw)
To: Jianxin Xiong
Cc: linux-rdma, dri-devel, Doug Ledford, Jason Gunthorpe,
Sumit Semwal, Christian Koenig, Daniel Vetter
On Mon, Dec 07, 2020 at 02:15:51PM -0800, Jianxin Xiong wrote:
> Dma-buf based memory region requires one extra parameter and is processed
> quite differently. Adding a separate method allows clean separation from
> regular memory regions.
>
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Acked-by: Christian Koenig <christian.koenig@amd.com>
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
> drivers/infiniband/core/device.c | 1 +
> include/rdma/ib_verbs.h | 6 +++++-
> 2 files changed, 6 insertions(+), 1 deletion(-)
>
Thanks,
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v13 3/4] RDMA/uverbs: Add uverbs command for dma-buf based MR registration
2020-12-07 22:15 [PATCH v13 0/4] RDMA: Add dma-buf support Jianxin Xiong
2020-12-07 22:15 ` [PATCH v13 1/4] RDMA/umem: Support importing dma-buf as user memory region Jianxin Xiong
2020-12-07 22:15 ` [PATCH v13 2/4] RDMA/core: Add device method for registering dma-buf based " Jianxin Xiong
@ 2020-12-07 22:15 ` Jianxin Xiong
2020-12-08 7:15 ` Leon Romanovsky
2020-12-07 22:15 ` [PATCH v13 4/4] RDMA/mlx5: Support dma-buf based userspace memory region Jianxin Xiong
3 siblings, 1 reply; 13+ messages in thread
From: Jianxin Xiong @ 2020-12-07 22:15 UTC (permalink / raw)
To: linux-rdma, dri-devel
Cc: Jianxin Xiong, Doug Ledford, Jason Gunthorpe, Leon Romanovsky,
Sumit Semwal, Christian Koenig, Daniel Vetter
Implement a new uverbs ioctl method for memory registration with file
descriptor as an extra parameter.
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
drivers/infiniband/core/uverbs_std_types_mr.c | 117 +++++++++++++++++++++++++-
include/uapi/rdma/ib_user_ioctl_cmds.h | 14 +++
2 files changed, 129 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index dc58564..4660c76 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -182,6 +183,86 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)(
return IS_UVERBS_COPY_ERR(ret) ? ret : 0;
}
+static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE);
+ struct ib_device *ib_dev = pd->device;
+
+ u64 offset, length, virt_addr;
+ u32 fd, access_flags;
+ struct ib_mr *mr;
+ int ret;
+
+ if (!ib_dev->ops.reg_user_mr_dmabuf)
+ return -EOPNOTSUPP;
+
+ if (!(pd->device->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&offset, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_OFFSET);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&length, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_LENGTH);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&virt_addr, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_IOVA);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&fd, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_FD);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&access_flags, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC |
+ IB_ACCESS_RELAXED_ORDERING);
+ if (ret)
+ return ret;
+
+ ret = ib_check_mr_access(access_flags);
+ if (ret)
+ return ret;
+
+ mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, virt_addr,
+ fd, access_flags,
+ &attrs->driver_udata);
+ if (IS_ERR(mr))
+ return PTR_ERR(mr);
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_USER;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+
+ uobj->object = mr;
+
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY,
+ &mr->lkey, sizeof(mr->lkey));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+ return ret;
+}
+
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_ADVISE_MR,
UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
@@ -247,6 +328,37 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)(
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_REG_DMABUF_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_IOVA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_FD,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ enum ib_access_flags),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
UVERBS_METHOD_MR_DESTROY,
UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
@@ -257,10 +369,11 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)(
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_MR,
UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
+ &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR),
&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
- &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR),
- &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR));
+ &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR),
+ &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR));
const struct uapi_definition uverbs_def_obj_mr[] = {
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index 7968a18..dafc7eb 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -251,6 +252,7 @@ enum uverbs_methods_mr {
UVERBS_METHOD_MR_DESTROY,
UVERBS_METHOD_ADVISE_MR,
UVERBS_METHOD_QUERY_MR,
+ UVERBS_METHOD_REG_DMABUF_MR,
};
enum uverbs_attrs_mr_destroy_ids {
@@ -272,6 +274,18 @@ enum uverbs_attrs_query_mr_cmd_attr_ids {
UVERBS_ATTR_QUERY_MR_RESP_IOVA,
};
+enum uverbs_attrs_reg_dmabuf_mr_cmd_attr_ids {
+ UVERBS_ATTR_REG_DMABUF_MR_HANDLE,
+ UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE,
+ UVERBS_ATTR_REG_DMABUF_MR_OFFSET,
+ UVERBS_ATTR_REG_DMABUF_MR_LENGTH,
+ UVERBS_ATTR_REG_DMABUF_MR_IOVA,
+ UVERBS_ATTR_REG_DMABUF_MR_FD,
+ UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY,
+ UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY,
+};
+
enum uverbs_attrs_create_counters_cmd_attr_ids {
UVERBS_ATTR_CREATE_COUNTERS_HANDLE,
};
--
1.8.3.1
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v13 3/4] RDMA/uverbs: Add uverbs command for dma-buf based MR registration
2020-12-07 22:15 ` [PATCH v13 3/4] RDMA/uverbs: Add uverbs command for dma-buf based MR registration Jianxin Xiong
@ 2020-12-08 7:15 ` Leon Romanovsky
0 siblings, 0 replies; 13+ messages in thread
From: Leon Romanovsky @ 2020-12-08 7:15 UTC (permalink / raw)
To: Jianxin Xiong
Cc: linux-rdma, dri-devel, Doug Ledford, Jason Gunthorpe,
Sumit Semwal, Christian Koenig, Daniel Vetter
On Mon, Dec 07, 2020 at 02:15:52PM -0800, Jianxin Xiong wrote:
> Implement a new uverbs ioctl method for memory registration with file
> descriptor as an extra parameter.
>
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Acked-by: Christian Koenig <christian.koenig@amd.com>
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
> drivers/infiniband/core/uverbs_std_types_mr.c | 117 +++++++++++++++++++++++++-
> include/uapi/rdma/ib_user_ioctl_cmds.h | 14 +++
> 2 files changed, 129 insertions(+), 2 deletions(-)
>
Thanks,
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v13 4/4] RDMA/mlx5: Support dma-buf based userspace memory region
2020-12-07 22:15 [PATCH v13 0/4] RDMA: Add dma-buf support Jianxin Xiong
` (2 preceding siblings ...)
2020-12-07 22:15 ` [PATCH v13 3/4] RDMA/uverbs: Add uverbs command for dma-buf based MR registration Jianxin Xiong
@ 2020-12-07 22:15 ` Jianxin Xiong
3 siblings, 0 replies; 13+ messages in thread
From: Jianxin Xiong @ 2020-12-07 22:15 UTC (permalink / raw)
To: linux-rdma, dri-devel
Cc: Jianxin Xiong, Doug Ledford, Jason Gunthorpe, Leon Romanovsky,
Sumit Semwal, Christian Koenig, Daniel Vetter
Implement the new driver method 'reg_user_mr_dmabuf'. Utilize the core
functions to import dma-buf based memory region and update the mappings.
Add code to handle dma-buf related page fault.
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
drivers/infiniband/hw/mlx5/main.c | 2 +
drivers/infiniband/hw/mlx5/mlx5_ib.h | 18 +++++
drivers/infiniband/hw/mlx5/mr.c | 128 +++++++++++++++++++++++++++++++++--
drivers/infiniband/hw/mlx5/odp.c | 86 +++++++++++++++++++++--
4 files changed, 225 insertions(+), 9 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 4a054eb..c025746 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
*/
#include <linux/debugfs.h>
@@ -4069,6 +4070,7 @@ static int mlx5_ib_enable_driver(struct ib_device *dev)
.query_srq = mlx5_ib_query_srq,
.query_ucontext = mlx5_ib_query_ucontext,
.reg_user_mr = mlx5_ib_reg_user_mr,
+ .reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf,
.req_notify_cq = mlx5_ib_arm_cq,
.rereg_user_mr = mlx5_ib_rereg_user_mr,
.resize_cq = mlx5_ib_resize_cq,
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 718e59f..6f4d1b4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/*
* Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
*/
#ifndef MLX5_IB_H
@@ -704,6 +705,12 @@ static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
mr->umem->is_odp;
}
+/*
+ * Tell whether @mr is backed by a dma-buf umem.  Always false when
+ * CONFIG_INFINIBAND_ON_DEMAND_PAGING is disabled or the MR has no umem.
+ */
+static inline bool is_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || !mr->umem)
+		return false;
+
+	return mr->umem->is_dmabuf;
+}
+
struct mlx5_ib_mw {
struct ib_mw ibmw;
struct mlx5_core_mkey mmkey;
@@ -1239,6 +1246,10 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata);
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags,
+ struct ib_udata *udata);
int mlx5_ib_advise_mr(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
u32 flags,
@@ -1249,11 +1260,13 @@ int mlx5_ib_advise_mr(struct ib_pd *pd,
int mlx5_ib_dealloc_mw(struct ib_mw *mw);
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags);
+int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct ib_udata *udata,
int access_flags);
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr);
int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
u64 length, u64 virt_addr, int access_flags,
struct ib_pd *pd, struct ib_udata *udata);
@@ -1341,6 +1354,7 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
u32 flags, struct ib_sge *sg_list, u32 num_sge);
int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable);
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
@@ -1366,6 +1380,10 @@ static inline int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
{
return -EOPNOTSUPP;
}
+/* Stub used when CONFIG_INFINIBAND_ON_DEMAND_PAGING is off: always fails. */
+static inline int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b6116f6..e3be1f5 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -36,6 +37,8 @@
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
@@ -957,6 +960,16 @@ static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
return &cache->ent[order];
}
+/*
+ * Page size used for a dma-buf umem.  The result is always PAGE_SIZE,
+ * provided @iova and the umem address share the same offset within a page;
+ * otherwise 0 is returned and the umem is left untouched.  On success the
+ * umem's iova is set to @iova.
+ */
+static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
+						  u64 iova)
+{
+	const u64 in_page_mask = PAGE_SIZE - 1;
+
+	if ((iova & in_page_mask) != (umem->address & in_page_mask))
+		return 0;
+
+	umem->iova = iova;
+	return PAGE_SIZE;
+}
+
static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
struct ib_umem *umem, u64 iova,
int access_flags)
@@ -966,7 +979,11 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
struct mlx5_ib_mr *mr;
unsigned int page_size;
- page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+ if (umem->is_dmabuf)
+ page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
+ else
+ page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
+ 0, iova);
if (WARN_ON(!page_size))
return ERR_PTR(-EINVAL);
ent = mr_cache_ent_from_order(
@@ -1212,8 +1229,10 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
/*
* Send the DMA list to the HW for a normal MR using UMR.
+ * A dmabuf MR is handled in a similar way, except that MLX5_IB_UPD_XLT_ZAP
+ * may be passed to write zeroed (not-present) entries instead.
*/
-static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
{
struct mlx5_ib_dev *dev = mr->dev;
struct device *ddev = &dev->mdev->pdev->dev;
@@ -1255,6 +1274,10 @@ static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
cur_mtt->ptag =
cpu_to_be64(rdma_block_iter_dma_address(&biter) |
MLX5_IB_MTT_PRESENT);
+
+ if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
+ cur_mtt->ptag = 0;
+
cur_mtt++;
}
@@ -1291,8 +1314,11 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
int err;
bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
- page_size =
- mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+ if (umem->is_dmabuf)
+ page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
+ else
+ page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
+ 0, iova);
if (WARN_ON(!page_size))
return ERR_PTR(-EINVAL);
@@ -1572,6 +1598,96 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return ERR_PTR(err);
}
+/*
+ * move_notify callback installed through mlx5_ib_dmabuf_attach_ops.
+ *
+ * Asserts that the dma-buf reservation lock is held.  When the umem has an
+ * sg table attached, the MR's translation entries are zapped first and the
+ * pages are unmapped afterwards; with no sg table there is nothing to do.
+ */
+static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
+
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	if (umem_dmabuf->sgt) {
+		struct mlx5_ib_mr *mr = umem_dmabuf->private;
+
+		mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
+		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+	}
+}
+
+/* Importer callbacks passed to ib_umem_dmabuf_get() for mlx5 dma-buf MRs. */
+static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
+	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
+	.allow_peer2peer = true,
+};
+
+/**
+ * mlx5_ib_reg_user_mr_dmabuf - Register a dma-buf based user memory region
+ * @pd: protection domain the MR is created on
+ * @offset: offset handed to ib_umem_dmabuf_get() together with @length
+ * @length: length of the region in bytes
+ * @virt_addr: I/O virtual address used for the MR
+ * @fd: dma-buf file descriptor
+ * @access_flags: access flags for the MR
+ * @udata: not used by this function
+ *
+ * Return: the new MR on success, an ERR_PTR() encoded errno otherwise.
+ */
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
+					 u64 length, u64 virt_addr,
+					 int fd, int access_flags,
+					 struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr = NULL;
+	struct ib_umem *umem;
+	int err;
+
+	/*
+	 * dma-buf MRs are built on the ODP infrastructure.  Without it
+	 * mlx5_ib_init_dmabuf_mr() always fails and is_dmabuf_mr() is always
+	 * false, so dereg_mr() would not fence the MR and the entry stored in
+	 * odp_mkeys below would never be erased.  Refuse up front instead.
+	 */
+	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
+	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	mlx5_ib_dbg(dev,
+		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
+		    offset, virt_addr, length, fd, access_flags);
+
+	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
+		return ERR_PTR(-EINVAL);
+
+	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
+				  &mlx5_ib_dmabuf_attach_ops);
+	if (IS_ERR(umem)) {
+		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
+		return ERR_CAST(umem);
+	}
+
+	/* Prefer a cached mkey; fall back to the slow path if none is usable. */
+	mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags);
+	if (IS_ERR(mr))
+		mr = NULL;
+
+	if (!mr) {
+		mutex_lock(&dev->slow_path_mutex);
+		mr = reg_create(NULL, pd, umem, virt_addr, access_flags,
+				false);
+		mutex_unlock(&dev->slow_path_mutex);
+	}
+
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto err_release_umem;
+	}
+
+	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
+
+	mr->umem = umem;
+	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
+	set_mr_fields(dev, mr, length, access_flags);
+
+	/* The move_notify callback finds the MR through the umem. */
+	to_ib_umem_dmabuf(umem)->private = mr;
+	init_waitqueue_head(&mr->q_deferred_work);
+	atomic_set(&mr->num_deferred_work, 0);
+	err = xa_err(xa_store(&dev->odp_mkeys,
+			      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
+			      GFP_KERNEL));
+	if (err)
+		goto err_dereg_mr;
+
+	err = mlx5_ib_init_dmabuf_mr(mr);
+	if (err)
+		goto err_dereg_mr;
+
+	return &mr->ibmr;
+
+err_dereg_mr:
+	/* From here on the MR owns the umem; tear both down together. */
+	dereg_mr(dev, mr);
+	return ERR_PTR(err);
+err_release_umem:
+	ib_umem_release(umem);
+	return ERR_PTR(err);
+}
+
/**
* mlx5_mr_cache_invalidate - Fence all DMA on the MR
* @mr: The MR to fence
@@ -1640,7 +1756,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
if (!mr->umem)
return -EINVAL;
- if (is_odp_mr(mr))
+ if (is_odp_mr(mr) || is_dmabuf_mr(mr))
return -EOPNOTSUPP;
if (flags & IB_MR_REREG_TRANS) {
@@ -1804,6 +1920,8 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
/* Stop all DMA */
if (is_odp_mr(mr))
mlx5_ib_fence_odp_mr(mr);
+ else if (is_dmabuf_mr(mr))
+ mlx5_ib_fence_dmabuf_mr(mr);
else
clean_mr(dev, mr);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 5c853ec..35d6770 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -33,6 +33,8 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include "mlx5_ib.h"
#include "cmd.h"
@@ -664,6 +666,37 @@ void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
dma_fence_odp_mr(mr);
}
+/**
+ * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR
+ * @mr: the MR to fence
+ *
+ * When this function returns, no parallel thread touches the MR any more
+ * and no DMA is active on it.
+ */
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+	struct dma_resv *resv = umem_dmabuf->attach->dmabuf->resv;
+	struct mlx5_ib_dev *dev = mr->dev;
+
+	/* New page faults and prefetch requests can no longer look up the MR. */
+	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+	/* Page-fault handlers that are already running must finish first. */
+	synchronize_srcu(&dev->odp_srcu);
+
+	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+
+	/* Invalidate the mkey and drop the mapping under the reservation lock. */
+	dma_resv_lock(resv, NULL);
+	mlx5_mr_cache_invalidate(mr);
+	umem_dmabuf->private = NULL;
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+	dma_resv_unlock(resv);
+
+	/* Only an MR without a cache entry has its mkey destroyed here. */
+	if (mr->cache_ent)
+		return;
+
+	mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+	WARN_ON(mr->descs);
+}
+
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
#define MLX5_PF_FLAGS_ENABLE BIT(3)
@@ -797,6 +830,41 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
return ret;
}
+/*
+ * Map the pages of a dma-buf MR and write them to the MR's translation
+ * table, all under the dma-buf reservation lock.
+ *
+ * MLX5_PF_FLAGS_ENABLE in @flags is translated to MLX5_IB_UPD_XLT_ENABLE.
+ * If the best page size found for the mapping is smaller than PAGE_SIZE the
+ * pages are unmapped again and -EINVAL is returned.
+ *
+ * Returns the number of pages of the umem on success (adding @bcnt to
+ * *@bytes_mapped when @bytes_mapped is not NULL), or a negative errno.
+ */
+static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
+			       u32 *bytes_mapped, u32 flags)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+	struct dma_resv *resv = umem_dmabuf->attach->dmabuf->resv;
+	u32 xlt_flags = (flags & MLX5_PF_FLAGS_ENABLE) ?
+				MLX5_IB_UPD_XLT_ENABLE : 0;
+	unsigned int page_size;
+	int err;
+
+	dma_resv_lock(resv, NULL);
+
+	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+	if (err)
+		goto out_unlock;
+
+	page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
+					     log_page_size, 0,
+					     umem_dmabuf->umem.iova);
+	if (unlikely(page_size < PAGE_SIZE)) {
+		/* Unusable layout: undo the mapping before bailing out. */
+		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	err = mlx5_ib_update_mr_pas(mr, xlt_flags);
+
+out_unlock:
+	dma_resv_unlock(resv);
+	if (err)
+		return err;
+
+	if (bytes_mapped)
+		*bytes_mapped += bcnt;
+
+	return ib_umem_num_pages(mr->umem);
+}
+
/*
* Returns:
* -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
@@ -815,6 +883,9 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
if (unlikely(io_virt < mr->mmkey.iova))
return -EFAULT;
+ if (mr->umem->is_dmabuf)
+ return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
+
if (!odp->is_implicit_odp) {
u64 user_va;
@@ -845,6 +916,16 @@ int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
return ret >= 0 ? 0 : ret;
}
+/*
+ * Populate a freshly created dma-buf MR by faulting in the whole umem with
+ * MLX5_PF_FLAGS_ENABLE set.  Returns 0 on success or a negative errno.
+ */
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	int npages = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
+					 MLX5_PF_FLAGS_ENABLE);
+
+	if (npages < 0)
+		return npages;
+
+	return 0;
+}
+
struct pf_frame {
struct pf_frame *next;
u32 key;
@@ -1747,7 +1828,6 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_core_mkey *mmkey;
- struct ib_umem_odp *odp;
struct mlx5_ib_mr *mr;
lockdep_assert_held(&dev->odp_srcu);
@@ -1761,11 +1841,9 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
if (mr->ibmr.pd != pd)
return NULL;
- odp = to_ib_umem_odp(mr->umem);
-
/* prefetch with write-access must be supported by the MR */
if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
- !odp->umem.writable)
+ !mr->umem->writable)
return NULL;
return mr;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 13+ messages in thread