* [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
@ 2025-05-13 9:28 wangtao
2025-05-13 11:32 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-13 9:28 UTC (permalink / raw)
To: sumit.semwal, christian.koenig, benjamin.gaignard, Brian.Starkey,
jstultz, tjmercier
Cc: linux-media, dri-devel, linaro-mm-sig, linux-kernel, bintian.wang,
yipengxiang, liulu.liu, feng.han, wangtao
Support direct file I/O operations for system_heap dma-buf objects.
Implementation includes:
1. Convert sg_table to bio_vec
2. Set IOCB_DIRECT when O_DIRECT is supported
3. Invoke vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
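For illustration, a minimal user-space sketch of driving the new ioctl (the
struct and flag names match the kernel code below; their uapi definitions land
in patch 1/2, so the header location and exact field types are assumed here):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/dma-buf.h>

  /* Hypothetical sketch: reads 'len' bytes of 'path' into the dma-buf. */
  static int read_file_into_dmabuf(int dmabuf_fd, const char *path, size_t len)
  {
          struct dma_buf_rw_file args = { 0 };
          int file_fd = open(path, O_RDONLY | O_DIRECT);
          int ret;

          if (file_fd < 0)
                  return -1;

          args.fd = file_fd;
          args.file_offset = 0;
          args.buf_offset = 0;
          args.buf_len = len;     /* must be page aligned when DIRECT is used */
          args.flags = DMA_BUF_RW_FLAGS_READ | DMA_BUF_RW_FLAGS_DIRECT;

          ret = ioctl(dmabuf_fd, DMA_BUF_IOCTL_RW_FILE, &args);
          close(file_fd);
          return ret;
  }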
Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):
| Metric | 1MB | 8MB | 64MB | 1024MB | 3072MB |
|--------------------|-------:|-------:|--------:|---------:|---------:|
| Buffer Read (us) | 1658 | 9028 | 69295 | 1019783 | 2978179 |
| Direct Read (us) | 707 | 2647 | 18689 | 299627 | 937758 |
| Buffer Rate (MB/s) | 603 | 886 | 924 | 1004 | 1032 |
| Direct Rate (MB/s) | 1414 | 3022 | 3425 | 3418 | 3276 |
Signed-off-by: wangtao <tao.wangtao@honor.com>
---
drivers/dma-buf/heaps/system_heap.c | 118 ++++++++++++++++++++++++++++
1 file changed, 118 insertions(+)
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index 26d5dc89ea16..f7b71b9843aa 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -20,6 +20,8 @@
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/bvec.h>
+#include <linux/uio.h>
static struct dma_heap *sys_heap;
@@ -281,6 +283,121 @@ static void system_heap_vunmap(struct dma_buf *dmabuf, struct iosys_map *map)
iosys_map_clear(map);
}
+static struct bio_vec *system_heap_init_bvec(struct system_heap_buffer *buffer,
+ size_t offset, size_t len, int *nr_segs)
+{
+ struct sg_table *sgt = &buffer->sg_table;
+ struct scatterlist *sg;
+ size_t length = 0;
+ unsigned int i, k = 0;
+ struct bio_vec *bvec;
+ size_t sg_left;
+ size_t sg_offset;
+ size_t sg_len;
+
+ bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ return NULL;
+
+ for_each_sg(sgt->sgl, sg, sgt->nents, i) {
+ length += sg->length;
+ if (length <= offset)
+ continue;
+
+ sg_left = length - offset;
+ sg_offset = sg->offset + sg->length - sg_left;
+ sg_len = min(sg_left, len);
+
+ bvec[k].bv_page = sg_page(sg);
+ bvec[k].bv_len = sg_len;
+ bvec[k].bv_offset = sg_offset;
+ k++;
+
+ offset += sg_len;
+ len -= sg_len;
+ if (len <= 0)
+ break;
+ }
+
+ *nr_segs = k;
+ return bvec;
+}
+
+static int system_heap_rw_file(struct system_heap_buffer *buffer, bool is_read,
+ bool direct_io, struct file *filp, loff_t file_offset,
+ size_t buf_offset, size_t len)
+{
+ struct bio_vec *bvec;
+ int nr_segs = 0;
+ struct iov_iter iter;
+ struct kiocb kiocb;
+ ssize_t ret = 0;
+
+ if (direct_io) {
+ if (!(filp->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+ }
+
+ bvec = system_heap_init_bvec(buffer, buf_offset, len, &nr_segs);
+ if (!bvec)
+ return -ENOMEM;
+
+ iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE, bvec, nr_segs, len);
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = file_offset;
+ if (direct_io)
+ kiocb.ki_flags |= IOCB_DIRECT;
+
+ while (kiocb.ki_pos < file_offset + len) {
+ if (is_read)
+ ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
+ else
+ ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
+ if (ret <= 0)
+ break;
+ }
+
+ kvfree(bvec);
+ return ret < 0 ? ret : 0;
+}
+
+static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
+ struct dma_buf_rw_file *back)
+{
+ struct system_heap_buffer *buffer = dmabuf->priv;
+ int ret = 0;
+ __u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
+ bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
+ struct file *filp;
+
+ if (op != DMA_BUF_RW_FLAGS_READ && op != DMA_BUF_RW_FLAGS_WRITE)
+ return -EINVAL;
+ if (direct_io) {
+ if (!PAGE_ALIGNED(back->file_offset) ||
+ !PAGE_ALIGNED(back->buf_offset) ||
+ !PAGE_ALIGNED(back->buf_len))
+ return -EINVAL;
+ }
+ if (!back->buf_len || back->buf_len > dmabuf->size ||
+ back->buf_offset >= dmabuf->size ||
+ back->buf_offset + back->buf_len > dmabuf->size)
+ return -EINVAL;
+ if (back->file_offset + back->buf_len < back->file_offset)
+ return -EINVAL;
+
+ filp = fget(back->fd);
+ if (!filp)
+ return -EBADF;
+
+ mutex_lock(&buffer->lock);
+ ret = system_heap_rw_file(buffer, op == DMA_BUF_RW_FLAGS_READ, direct_io,
+ filp, back->file_offset, back->buf_offset, back->buf_len);
+ mutex_unlock(&buffer->lock);
+
+ fput(filp);
+ return ret;
+}
+
static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
{
struct system_heap_buffer *buffer = dmabuf->priv;
@@ -308,6 +425,7 @@ static const struct dma_buf_ops system_heap_buf_ops = {
.mmap = system_heap_mmap,
.vmap = system_heap_vmap,
.vunmap = system_heap_vunmap,
+ .rw_file = system_heap_dma_buf_rw_file,
.release = system_heap_dma_buf_release,
};
--
2.17.1
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-13 9:28 [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap wangtao
@ 2025-05-13 11:32 ` Christian König
2025-05-13 12:30 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-05-13 11:32 UTC (permalink / raw)
To: wangtao, sumit.semwal, benjamin.gaignard, Brian.Starkey, jstultz,
tjmercier
Cc: linux-media, dri-devel, linaro-mm-sig, linux-kernel, bintian.wang,
yipengxiang, liulu.liu, feng.han
On 5/13/25 11:28, wangtao wrote:
> Support direct file I/O operations for system_heap dma-buf objects.
> Implementation includes:
> 1. Convert sg_table to bio_vec
That is usually illegal for DMA-bufs.
Regards,
Christian.
> 2. Set IOCB_DIRECT when O_DIRECT is supported
> 3. Invoke vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
>
> Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):
>
> | Metric | 1MB | 8MB | 64MB | 1024MB | 3072MB |
> |--------------------|-------:|-------:|--------:|---------:|---------:|
> | Buffer Read (us) | 1658 | 9028 | 69295 | 1019783 | 2978179 |
> | Direct Read (us) | 707 | 2647 | 18689 | 299627 | 937758 |
> | Buffer Rate (MB/s) | 603 | 886 | 924 | 1004 | 1032 |
> | Direct Rate (MB/s) | 1414 | 3022 | 3425 | 3418 | 3276 |
>
> Signed-off-by: wangtao <tao.wangtao@honor.com>
> ---
> drivers/dma-buf/heaps/system_heap.c | 118 ++++++++++++++++++++++++++++
> 1 file changed, 118 insertions(+)
>
> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index 26d5dc89ea16..f7b71b9843aa 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -20,6 +20,8 @@
> #include <linux/scatterlist.h>
> #include <linux/slab.h>
> #include <linux/vmalloc.h>
> +#include <linux/bvec.h>
> +#include <linux/uio.h>
>
> static struct dma_heap *sys_heap;
>
> @@ -281,6 +283,121 @@ static void system_heap_vunmap(struct dma_buf *dmabuf, struct iosys_map *map)
> iosys_map_clear(map);
> }
>
> +static struct bio_vec *system_heap_init_bvec(struct system_heap_buffer *buffer,
> + size_t offset, size_t len, int *nr_segs)
> +{
> + struct sg_table *sgt = &buffer->sg_table;
> + struct scatterlist *sg;
> + size_t length = 0;
> + unsigned int i, k = 0;
> + struct bio_vec *bvec;
> + size_t sg_left;
> + size_t sg_offset;
> + size_t sg_len;
> +
> + bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
> + if (!bvec)
> + return NULL;
> +
> + for_each_sg(sgt->sgl, sg, sgt->nents, i) {
> + length += sg->length;
> + if (length <= offset)
> + continue;
> +
> + sg_left = length - offset;
> + sg_offset = sg->offset + sg->length - sg_left;
> + sg_len = min(sg_left, len);
> +
> + bvec[k].bv_page = sg_page(sg);
> + bvec[k].bv_len = sg_len;
> + bvec[k].bv_offset = sg_offset;
> + k++;
> +
> + offset += sg_len;
> + len -= sg_len;
> + if (len <= 0)
> + break;
> + }
> +
> + *nr_segs = k;
> + return bvec;
> +}
> +
> +static int system_heap_rw_file(struct system_heap_buffer *buffer, bool is_read,
> + bool direct_io, struct file *filp, loff_t file_offset,
> + size_t buf_offset, size_t len)
> +{
> + struct bio_vec *bvec;
> + int nr_segs = 0;
> + struct iov_iter iter;
> + struct kiocb kiocb;
> + ssize_t ret = 0;
> +
> + if (direct_io) {
> + if (!(filp->f_mode & FMODE_CAN_ODIRECT))
> + return -EINVAL;
> + }
> +
> + bvec = system_heap_init_bvec(buffer, buf_offset, len, &nr_segs);
> + if (!bvec)
> + return -ENOMEM;
> +
> + iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE, bvec, nr_segs, len);
> + init_sync_kiocb(&kiocb, filp);
> + kiocb.ki_pos = file_offset;
> + if (direct_io)
> + kiocb.ki_flags |= IOCB_DIRECT;
> +
> + while (kiocb.ki_pos < file_offset + len) {
> + if (is_read)
> + ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
> + else
> + ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
> + if (ret <= 0)
> + break;
> + }
> +
> + kvfree(bvec);
> + return ret < 0 ? ret : 0;
> +}
> +
> +static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
> + struct dma_buf_rw_file *back)
> +{
> + struct system_heap_buffer *buffer = dmabuf->priv;
> + int ret = 0;
> + __u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
> + bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
> + struct file *filp;
> +
> + if (op != DMA_BUF_RW_FLAGS_READ && op != DMA_BUF_RW_FLAGS_WRITE)
> + return -EINVAL;
> + if (direct_io) {
> + if (!PAGE_ALIGNED(back->file_offset) ||
> + !PAGE_ALIGNED(back->buf_offset) ||
> + !PAGE_ALIGNED(back->buf_len))
> + return -EINVAL;
> + }
> + if (!back->buf_len || back->buf_len > dmabuf->size ||
> + back->buf_offset >= dmabuf->size ||
> + back->buf_offset + back->buf_len > dmabuf->size)
> + return -EINVAL;
> + if (back->file_offset + back->buf_len < back->file_offset)
> + return -EINVAL;
> +
> + filp = fget(back->fd);
> + if (!filp)
> + return -EBADF;
> +
> + mutex_lock(&buffer->lock);
> + ret = system_heap_rw_file(buffer, op == DMA_BUF_RW_FLAGS_READ, direct_io,
> + filp, back->file_offset, back->buf_offset, back->buf_len);
> + mutex_unlock(&buffer->lock);
> +
> + fput(filp);
> + return ret;
> +}
> +
> static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
> {
> struct system_heap_buffer *buffer = dmabuf->priv;
> @@ -308,6 +425,7 @@ static const struct dma_buf_ops system_heap_buf_ops = {
> .mmap = system_heap_mmap,
> .vmap = system_heap_vmap,
> .vunmap = system_heap_vunmap,
> + .rw_file = system_heap_dma_buf_rw_file,
> .release = system_heap_dma_buf_release,
> };
>
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-13 11:32 ` Christian König
@ 2025-05-13 12:30 ` wangtao
2025-05-13 13:17 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-13 12:30 UTC (permalink / raw)
To: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Tuesday, May 13, 2025 7:32 PM
> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; tjmercier@google.com
> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>; hanfeng
> 00012985 <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/13/25 11:28, wangtao wrote:
> > Support direct file I/O operations for system_heap dma-buf objects.
> > Implementation includes:
> > 1. Convert sg_table to bio_vec
>
> That is usually illegal for DMA-bufs.
[wangtao] The term 'convert' is misleading in this context. The appropriate phrasing should be: Construct bio_vec from sg_table.
Appreciate your feedback.
>
> Regards,
> Christian.
>
> > 2. Set IOCB_DIRECT when O_DIRECT is supported 3. Invoke
> > vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
> >
> > Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):
> >
> > | Metric | 1MB | 8MB | 64MB | 1024MB | 3072MB |
> > |--------------------|-------:|-------:|--------:|---------:|---------:|
> > | Buffer Read (us) | 1658 | 9028 | 69295 | 1019783 | 2978179 |
> > | Direct Read (us) | 707 | 2647 | 18689 | 299627 | 937758 |
> > | Buffer Rate (MB/s) | 603 | 886 | 924 | 1004 | 1032 |
> > | Direct Rate (MB/s) | 1414 | 3022 | 3425 | 3418 | 3276 |
> >
> > Signed-off-by: wangtao <tao.wangtao@honor.com>
> > ---
> > drivers/dma-buf/heaps/system_heap.c | 118
> > ++++++++++++++++++++++++++++
> > 1 file changed, 118 insertions(+)
> >
> > diff --git a/drivers/dma-buf/heaps/system_heap.c
> > b/drivers/dma-buf/heaps/system_heap.c
> > index 26d5dc89ea16..f7b71b9843aa 100644
> > --- a/drivers/dma-buf/heaps/system_heap.c
> > +++ b/drivers/dma-buf/heaps/system_heap.c
> > @@ -20,6 +20,8 @@
> > #include <linux/scatterlist.h>
> > #include <linux/slab.h>
> > #include <linux/vmalloc.h>
> > +#include <linux/bvec.h>
> > +#include <linux/uio.h>
> >
> > static struct dma_heap *sys_heap;
> >
> > @@ -281,6 +283,121 @@ static void system_heap_vunmap(struct dma_buf
> *dmabuf, struct iosys_map *map)
> > iosys_map_clear(map);
> > }
> >
> > +static struct bio_vec *system_heap_init_bvec(struct
> system_heap_buffer *buffer,
> > + size_t offset, size_t len, int *nr_segs) {
> > + struct sg_table *sgt = &buffer->sg_table;
> > + struct scatterlist *sg;
> > + size_t length = 0;
> > + unsigned int i, k = 0;
> > + struct bio_vec *bvec;
> > + size_t sg_left;
> > + size_t sg_offset;
> > + size_t sg_len;
> > +
> > + bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
> > + if (!bvec)
> > + return NULL;
> > +
> > + for_each_sg(sgt->sgl, sg, sgt->nents, i) {
> > + length += sg->length;
> > + if (length <= offset)
> > + continue;
> > +
> > + sg_left = length - offset;
> > + sg_offset = sg->offset + sg->length - sg_left;
> > + sg_len = min(sg_left, len);
> > +
> > + bvec[k].bv_page = sg_page(sg);
> > + bvec[k].bv_len = sg_len;
> > + bvec[k].bv_offset = sg_offset;
> > + k++;
> > +
> > + offset += sg_len;
> > + len -= sg_len;
> > + if (len <= 0)
> > + break;
> > + }
> > +
> > + *nr_segs = k;
> > + return bvec;
> > +}
> > +
> > +static int system_heap_rw_file(struct system_heap_buffer *buffer, bool
> is_read,
> > + bool direct_io, struct file *filp, loff_t file_offset,
> > + size_t buf_offset, size_t len)
> > +{
> > + struct bio_vec *bvec;
> > + int nr_segs = 0;
> > + struct iov_iter iter;
> > + struct kiocb kiocb;
> > + ssize_t ret = 0;
> > +
> > + if (direct_io) {
> > + if (!(filp->f_mode & FMODE_CAN_ODIRECT))
> > + return -EINVAL;
> > + }
> > +
> > + bvec = system_heap_init_bvec(buffer, buf_offset, len, &nr_segs);
> > + if (!bvec)
> > + return -ENOMEM;
> > +
> > + iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE, bvec,
> nr_segs, len);
> > + init_sync_kiocb(&kiocb, filp);
> > + kiocb.ki_pos = file_offset;
> > + if (direct_io)
> > + kiocb.ki_flags |= IOCB_DIRECT;
> > +
> > + while (kiocb.ki_pos < file_offset + len) {
> > + if (is_read)
> > + ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
> > + else
> > + ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
> > + if (ret <= 0)
> > + break;
> > + }
> > +
> > + kvfree(bvec);
> > + return ret < 0 ? ret : 0;
> > +}
> > +
> > +static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
> > + struct dma_buf_rw_file *back)
> > +{
> > + struct system_heap_buffer *buffer = dmabuf->priv;
> > + int ret = 0;
> > + __u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
> > + bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
> > + struct file *filp;
> > +
> > + if (op != DMA_BUF_RW_FLAGS_READ && op !=
> DMA_BUF_RW_FLAGS_WRITE)
> > + return -EINVAL;
> > + if (direct_io) {
> > + if (!PAGE_ALIGNED(back->file_offset) ||
> > + !PAGE_ALIGNED(back->buf_offset) ||
> > + !PAGE_ALIGNED(back->buf_len))
> > + return -EINVAL;
> > + }
> > + if (!back->buf_len || back->buf_len > dmabuf->size ||
> > + back->buf_offset >= dmabuf->size ||
> > + back->buf_offset + back->buf_len > dmabuf->size)
> > + return -EINVAL;
> > + if (back->file_offset + back->buf_len < back->file_offset)
> > + return -EINVAL;
> > +
> > + filp = fget(back->fd);
> > + if (!filp)
> > + return -EBADF;
> > +
> > + mutex_lock(&buffer->lock);
> > + ret = system_heap_rw_file(buffer, op ==
> DMA_BUF_RW_FLAGS_READ, direct_io,
> > + filp, back->file_offset, back->buf_offset, back-
> >buf_len);
> > + mutex_unlock(&buffer->lock);
> > +
> > + fput(filp);
> > + return ret;
> > +}
> > +
> > static void system_heap_dma_buf_release(struct dma_buf *dmabuf) {
> > struct system_heap_buffer *buffer = dmabuf->priv; @@ -308,6
> +425,7
> > @@ static const struct dma_buf_ops system_heap_buf_ops = {
> > .mmap = system_heap_mmap,
> > .vmap = system_heap_vmap,
> > .vunmap = system_heap_vunmap,
> > + .rw_file = system_heap_dma_buf_rw_file,
> > .release = system_heap_dma_buf_release, };
> >
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-13 12:30 ` wangtao
@ 2025-05-13 13:17 ` Christian König
2025-05-14 11:02 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-05-13 13:17 UTC (permalink / raw)
To: wangtao, sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On 5/13/25 14:30, wangtao wrote:
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Tuesday, May 13, 2025 7:32 PM
>> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
>> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
>> jstultz@google.com; tjmercier@google.com
>> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
>> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
>> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
>> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>; hanfeng
>> 00012985 <feng.han@honor.com>
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> On 5/13/25 11:28, wangtao wrote:
>>> Support direct file I/O operations for system_heap dma-buf objects.
>>> Implementation includes:
>>> 1. Convert sg_table to bio_vec
>>
>> That is usually illegal for DMA-bufs.
> [wangtao] The term 'convert' is misleading in this context. The appropriate phrasing should be: Construct bio_vec from sg_table.
Well, it doesn't matter what you call it. Touching the pages inside an sg table of a DMA-buf is illegal; we even have code to actively prevent that.
Once more: This approach was already rejected multiple times! Please use udmabuf instead!
The hack you came up with here is simply not necessary.
Regards,
Christian.
> Appreciate your feedback.
>>
>> Regards,
>> Christian.
>>
>>> 2. Set IOCB_DIRECT when O_DIRECT is supported 3. Invoke
>>> vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
>>>
>>> Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):
>>>
>>> | Metric | 1MB | 8MB | 64MB | 1024MB | 3072MB |
>>> |--------------------|-------:|-------:|--------:|---------:|---------:|
>>> | Buffer Read (us) | 1658 | 9028 | 69295 | 1019783 | 2978179 |
>>> | Direct Read (us) | 707 | 2647 | 18689 | 299627 | 937758 |
>>> | Buffer Rate (MB/s) | 603 | 886 | 924 | 1004 | 1032 |
>>> | Direct Rate (MB/s) | 1414 | 3022 | 3425 | 3418 | 3276 |
>>>
>>> Signed-off-by: wangtao <tao.wangtao@honor.com>
>>> ---
>>> drivers/dma-buf/heaps/system_heap.c | 118
>>> ++++++++++++++++++++++++++++
>>> 1 file changed, 118 insertions(+)
>>>
>>> diff --git a/drivers/dma-buf/heaps/system_heap.c
>>> b/drivers/dma-buf/heaps/system_heap.c
>>> index 26d5dc89ea16..f7b71b9843aa 100644
>>> --- a/drivers/dma-buf/heaps/system_heap.c
>>> +++ b/drivers/dma-buf/heaps/system_heap.c
>>> @@ -20,6 +20,8 @@
>>> #include <linux/scatterlist.h>
>>> #include <linux/slab.h>
>>> #include <linux/vmalloc.h>
>>> +#include <linux/bvec.h>
>>> +#include <linux/uio.h>
>>>
>>> static struct dma_heap *sys_heap;
>>>
>>> @@ -281,6 +283,121 @@ static void system_heap_vunmap(struct dma_buf
>> *dmabuf, struct iosys_map *map)
>>> iosys_map_clear(map);
>>> }
>>>
>>> +static struct bio_vec *system_heap_init_bvec(struct
>> system_heap_buffer *buffer,
>>> + size_t offset, size_t len, int *nr_segs) {
>>> + struct sg_table *sgt = &buffer->sg_table;
>>> + struct scatterlist *sg;
>>> + size_t length = 0;
>>> + unsigned int i, k = 0;
>>> + struct bio_vec *bvec;
>>> + size_t sg_left;
>>> + size_t sg_offset;
>>> + size_t sg_len;
>>> +
>>> + bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
>>> + if (!bvec)
>>> + return NULL;
>>> +
>>> + for_each_sg(sgt->sgl, sg, sgt->nents, i) {
>>> + length += sg->length;
>>> + if (length <= offset)
>>> + continue;
>>> +
>>> + sg_left = length - offset;
>>> + sg_offset = sg->offset + sg->length - sg_left;
>>> + sg_len = min(sg_left, len);
>>> +
>>> + bvec[k].bv_page = sg_page(sg);
>>> + bvec[k].bv_len = sg_len;
>>> + bvec[k].bv_offset = sg_offset;
>>> + k++;
>>> +
>>> + offset += sg_len;
>>> + len -= sg_len;
>>> + if (len <= 0)
>>> + break;
>>> + }
>>> +
>>> + *nr_segs = k;
>>> + return bvec;
>>> +}
>>> +
>>> +static int system_heap_rw_file(struct system_heap_buffer *buffer, bool
>> is_read,
>>> + bool direct_io, struct file *filp, loff_t file_offset,
>>> + size_t buf_offset, size_t len)
>>> +{
>>> + struct bio_vec *bvec;
>>> + int nr_segs = 0;
>>> + struct iov_iter iter;
>>> + struct kiocb kiocb;
>>> + ssize_t ret = 0;
>>> +
>>> + if (direct_io) {
>>> + if (!(filp->f_mode & FMODE_CAN_ODIRECT))
>>> + return -EINVAL;
>>> + }
>>> +
>>> + bvec = system_heap_init_bvec(buffer, buf_offset, len, &nr_segs);
>>> + if (!bvec)
>>> + return -ENOMEM;
>>> +
>>> + iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE, bvec,
>> nr_segs, len);
>>> + init_sync_kiocb(&kiocb, filp);
>>> + kiocb.ki_pos = file_offset;
>>> + if (direct_io)
>>> + kiocb.ki_flags |= IOCB_DIRECT;
>>> +
>>> + while (kiocb.ki_pos < file_offset + len) {
>>> + if (is_read)
>>> + ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
>>> + else
>>> + ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
>>> + if (ret <= 0)
>>> + break;
>>> + }
>>> +
>>> + kvfree(bvec);
>>> + return ret < 0 ? ret : 0;
>>> +}
>>> +
>>> +static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
>>> + struct dma_buf_rw_file *back)
>>> +{
>>> + struct system_heap_buffer *buffer = dmabuf->priv;
>>> + int ret = 0;
>>> + __u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
>>> + bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
>>> + struct file *filp;
>>> +
>>> + if (op != DMA_BUF_RW_FLAGS_READ && op !=
>> DMA_BUF_RW_FLAGS_WRITE)
>>> + return -EINVAL;
>>> + if (direct_io) {
>>> + if (!PAGE_ALIGNED(back->file_offset) ||
>>> + !PAGE_ALIGNED(back->buf_offset) ||
>>> + !PAGE_ALIGNED(back->buf_len))
>>> + return -EINVAL;
>>> + }
>>> + if (!back->buf_len || back->buf_len > dmabuf->size ||
>>> + back->buf_offset >= dmabuf->size ||
>>> + back->buf_offset + back->buf_len > dmabuf->size)
>>> + return -EINVAL;
>>> + if (back->file_offset + back->buf_len < back->file_offset)
>>> + return -EINVAL;
>>> +
>>> + filp = fget(back->fd);
>>> + if (!filp)
>>> + return -EBADF;
>>> +
>>> + mutex_lock(&buffer->lock);
>>> + ret = system_heap_rw_file(buffer, op ==
>> DMA_BUF_RW_FLAGS_READ, direct_io,
>>> + filp, back->file_offset, back->buf_offset, back-
>>> buf_len);
>>> + mutex_unlock(&buffer->lock);
>>> +
>>> + fput(filp);
>>> + return ret;
>>> +}
>>> +
>>> static void system_heap_dma_buf_release(struct dma_buf *dmabuf) {
>>> struct system_heap_buffer *buffer = dmabuf->priv; @@ -308,6
>> +425,7
>>> @@ static const struct dma_buf_ops system_heap_buf_ops = {
>>> .mmap = system_heap_mmap,
>>> .vmap = system_heap_vmap,
>>> .vunmap = system_heap_vunmap,
>>> + .rw_file = system_heap_dma_buf_rw_file,
>>> .release = system_heap_dma_buf_release, };
>>>
>
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-13 13:17 ` Christian König
@ 2025-05-14 11:02 ` wangtao
2025-05-14 12:00 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-14 11:02 UTC (permalink / raw)
To: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Tuesday, May 13, 2025 9:18 PM
> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; tjmercier@google.com
> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> <yipengxiang@honor.com>; <liulu.liu@honor.com>; <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/13/25 14:30, wangtao wrote:
> >> -----Original Message-----
> >> From: Christian König <christian.koenig@amd.com>
> >> Sent: Tuesday, May 13, 2025 7:32 PM
> >> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> >> jstultz@google.com; tjmercier@google.com
> >> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> >> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> >> <yipengxiang@honor.com>; <liulu.liu@honor.com>;
> >> <feng.han@honor.com>
> >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> >> DMA_BUF_IOCTL_RW_FILE for system_heap
> >>
> >> On 5/13/25 11:28, wangtao wrote:
> >>> Support direct file I/O operations for system_heap dma-buf objects.
> >>> Implementation includes:
> >>> 1. Convert sg_table to bio_vec
> >>
> >> That is usually illegal for DMA-bufs.
> > [wangtao] The term 'convert' is misleading in this context. The appropriate
> phrasing should be: Construct bio_vec from sg_table.
>
> Well it doesn't matter what you call it. Touching the page inside an sg table of
> a DMA-buf is illegal, we even have code to actively prevent that.
[wangtao] For a driver using DMA-buf: Don't touch pages in the sg_table. But the system heap exporter (sg_table owner) should be allowed to use them.
If a driver takes ownership via dma_buf_map_attachment or similar calls, the exporter must stop using the sg_table.
User-space programs should call DMA_BUF_IOCTL_RW_FILE only when the DMA-buf is not attached.
The exporter must check ownership (e.g., ensure no map_dma_buf/vmap is active) and block new calls during operations.
I'll add these checks in patch v2.
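A minimal sketch of the kind of gate planned for v2 (the mapped_count and
io_in_progress fields are hypothetical and not part of this patch; map_dma_buf
and vmap would bump mapped_count and refuse to proceed while io_in_progress is
set, all under buffer->lock):

  static int system_heap_claim_for_io(struct system_heap_buffer *buffer)
  {
          int ret = 0;

          mutex_lock(&buffer->lock);
          if (buffer->mapped_count)               /* a driver owns the pages */
                  ret = -EBUSY;
          else
                  buffer->io_in_progress = true;  /* block new map/vmap calls */
          mutex_unlock(&buffer->lock);

          return ret;
  }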
>
> Once more: This approach was already rejected multiple times! Please use
> udmabuf instead!
>
> The hack you came up here is simply not necessary.
[wangtao] Many people need DMA-buf direct I/O. I tried it 2 years ago. My method is simpler, uses less CPU/power, and performs better:
- Speed: 3418 MB/s vs. 2073 MB/s (udmabuf) at 1GHz CPU.
- udmabuf wastes half its CPU time on __get_user_pages.
- Creating 32x32MB DMA-bufs + reading 1GB file takes 346 ms vs. 1145 ms for udmabuf (10x slower) vs. 1503 ms for DMA-buf normal.
udmabuf is slightly faster but not enough. Switching to udmabuf is easy for small apps but hard in complex systems without major benefits.
>
> Regards,
> Christian.
>
>
> > Appreciate your feedback.
> >>
> >> Regards,
> >> Christian.
> >>
> >>> 2. Set IOCB_DIRECT when O_DIRECT is supported 3. Invoke
> >>> vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
> >>>
> >>> Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):
> >>>
> >>> | Metric | 1MB | 8MB | 64MB | 1024MB | 3072MB |
> >>> |--------------------|-------:|-------:|--------:|---------:|---------:|
> >>> | Buffer Read (us) | 1658 | 9028 | 69295 | 1019783 | 2978179 |
> >>> | Direct Read (us) | 707 | 2647 | 18689 | 299627 | 937758 |
> >>> | Buffer Rate (MB/s) | 603 | 886 | 924 | 1004 | 1032 |
> >>> | Direct Rate (MB/s) | 1414 | 3022 | 3425 | 3418 | 3276 |
> >>>
> >>> Signed-off-by: wangtao <tao.wangtao@honor.com>
> >>> ---
> >>> drivers/dma-buf/heaps/system_heap.c | 118
> >>> ++++++++++++++++++++++++++++
> >>> 1 file changed, 118 insertions(+)
> >>>
> >>> diff --git a/drivers/dma-buf/heaps/system_heap.c
> >>> b/drivers/dma-buf/heaps/system_heap.c
> >>> index 26d5dc89ea16..f7b71b9843aa 100644
> >>> --- a/drivers/dma-buf/heaps/system_heap.c
> >>> +++ b/drivers/dma-buf/heaps/system_heap.c
> >>> @@ -20,6 +20,8 @@
> >>> #include <linux/scatterlist.h>
> >>> #include <linux/slab.h>
> >>> #include <linux/vmalloc.h>
> >>> +#include <linux/bvec.h>
> >>> +#include <linux/uio.h>
> >>>
> >>> static struct dma_heap *sys_heap;
> >>>
> >>> @@ -281,6 +283,121 @@ static void system_heap_vunmap(struct
> dma_buf
> >> *dmabuf, struct iosys_map *map)
> >>> iosys_map_clear(map);
> >>> }
> >>>
> >>> +static struct bio_vec *system_heap_init_bvec(struct
> >> system_heap_buffer *buffer,
> >>> + size_t offset, size_t len, int *nr_segs) {
> >>> + struct sg_table *sgt = &buffer->sg_table;
> >>> + struct scatterlist *sg;
> >>> + size_t length = 0;
> >>> + unsigned int i, k = 0;
> >>> + struct bio_vec *bvec;
> >>> + size_t sg_left;
> >>> + size_t sg_offset;
> >>> + size_t sg_len;
> >>> +
> >>> + bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
> >>> + if (!bvec)
> >>> + return NULL;
> >>> +
> >>> + for_each_sg(sgt->sgl, sg, sgt->nents, i) {
> >>> + length += sg->length;
> >>> + if (length <= offset)
> >>> + continue;
> >>> +
> >>> + sg_left = length - offset;
> >>> + sg_offset = sg->offset + sg->length - sg_left;
> >>> + sg_len = min(sg_left, len);
> >>> +
> >>> + bvec[k].bv_page = sg_page(sg);
> >>> + bvec[k].bv_len = sg_len;
> >>> + bvec[k].bv_offset = sg_offset;
> >>> + k++;
> >>> +
> >>> + offset += sg_len;
> >>> + len -= sg_len;
> >>> + if (len <= 0)
> >>> + break;
> >>> + }
> >>> +
> >>> + *nr_segs = k;
> >>> + return bvec;
> >>> +}
> >>> +
> >>> +static int system_heap_rw_file(struct system_heap_buffer *buffer,
> >>> +bool
> >> is_read,
> >>> + bool direct_io, struct file *filp, loff_t file_offset,
> >>> + size_t buf_offset, size_t len)
> >>> +{
> >>> + struct bio_vec *bvec;
> >>> + int nr_segs = 0;
> >>> + struct iov_iter iter;
> >>> + struct kiocb kiocb;
> >>> + ssize_t ret = 0;
> >>> +
> >>> + if (direct_io) {
> >>> + if (!(filp->f_mode & FMODE_CAN_ODIRECT))
> >>> + return -EINVAL;
> >>> + }
> >>> +
> >>> + bvec = system_heap_init_bvec(buffer, buf_offset, len, &nr_segs);
> >>> + if (!bvec)
> >>> + return -ENOMEM;
> >>> +
> >>> + iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE, bvec,
> >> nr_segs, len);
> >>> + init_sync_kiocb(&kiocb, filp);
> >>> + kiocb.ki_pos = file_offset;
> >>> + if (direct_io)
> >>> + kiocb.ki_flags |= IOCB_DIRECT;
> >>> +
> >>> + while (kiocb.ki_pos < file_offset + len) {
> >>> + if (is_read)
> >>> + ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
> >>> + else
> >>> + ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
> >>> + if (ret <= 0)
> >>> + break;
> >>> + }
> >>> +
> >>> + kvfree(bvec);
> >>> + return ret < 0 ? ret : 0;
> >>> +}
> >>> +
> >>> +static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
> >>> + struct dma_buf_rw_file *back)
> >>> +{
> >>> + struct system_heap_buffer *buffer = dmabuf->priv;
> >>> + int ret = 0;
> >>> + __u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
> >>> + bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
> >>> + struct file *filp;
> >>> +
> >>> + if (op != DMA_BUF_RW_FLAGS_READ && op !=
> >> DMA_BUF_RW_FLAGS_WRITE)
> >>> + return -EINVAL;
> >>> + if (direct_io) {
> >>> + if (!PAGE_ALIGNED(back->file_offset) ||
> >>> + !PAGE_ALIGNED(back->buf_offset) ||
> >>> + !PAGE_ALIGNED(back->buf_len))
> >>> + return -EINVAL;
> >>> + }
> >>> + if (!back->buf_len || back->buf_len > dmabuf->size ||
> >>> + back->buf_offset >= dmabuf->size ||
> >>> + back->buf_offset + back->buf_len > dmabuf->size)
> >>> + return -EINVAL;
> >>> + if (back->file_offset + back->buf_len < back->file_offset)
> >>> + return -EINVAL;
> >>> +
> >>> + filp = fget(back->fd);
> >>> + if (!filp)
> >>> + return -EBADF;
> >>> +
> >>> + mutex_lock(&buffer->lock);
> >>> + ret = system_heap_rw_file(buffer, op ==
> >> DMA_BUF_RW_FLAGS_READ, direct_io,
> >>> + filp, back->file_offset, back->buf_offset, back-
> >>> buf_len);
> >>> + mutex_unlock(&buffer->lock);
> >>> +
> >>> + fput(filp);
> >>> + return ret;
> >>> +}
> >>> +
> >>> static void system_heap_dma_buf_release(struct dma_buf *dmabuf) {
> >>> struct system_heap_buffer *buffer = dmabuf->priv; @@ -308,6
> >> +425,7
> >>> @@ static const struct dma_buf_ops system_heap_buf_ops = {
> >>> .mmap = system_heap_mmap,
> >>> .vmap = system_heap_vmap,
> >>> .vunmap = system_heap_vunmap,
> >>> + .rw_file = system_heap_dma_buf_rw_file,
> >>> .release = system_heap_dma_buf_release, };
> >>>
> >
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-14 11:02 ` wangtao
@ 2025-05-14 12:00 ` Christian König
2025-05-15 14:03 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-05-14 12:00 UTC (permalink / raw)
To: wangtao, sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On 5/14/25 13:02, wangtao wrote:
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Tuesday, May 13, 2025 9:18 PM
>> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
>> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
>> jstultz@google.com; tjmercier@google.com
>> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
>> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
>> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
>> <yipengxiang@honor.com>; <liulu.liu@honor.com>; <feng.han@honor.com>
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> On 5/13/25 14:30, wangtao wrote:
>>>> -----Original Message-----
>>>> From: Christian König <christian.koenig@amd.com>
>>>> Sent: Tuesday, May 13, 2025 7:32 PM
>>>> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
>>>> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
>>>> jstultz@google.com; tjmercier@google.com
>>>> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
>>>> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
>>>> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
>>>> <yipengxiang@honor.com>; <liulu.liu@honor.com>;
>>>> <feng.han@honor.com>
>>>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>>>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>>>
>>>> On 5/13/25 11:28, wangtao wrote:
>>>>> Support direct file I/O operations for system_heap dma-buf objects.
>>>>> Implementation includes:
>>>>> 1. Convert sg_table to bio_vec
>>>>
>>>> That is usually illegal for DMA-bufs.
>>> [wangtao] The term 'convert' is misleading in this context. The appropriate
>> phrasing should be: Construct bio_vec from sg_table.
>>
>> Well it doesn't matter what you call it. Touching the page inside an sg table of
>> a DMA-buf is illegal, we even have code to actively prevent that.
> [wangtao] For a driver using DMA-buf: Don't touch pages in the sg_table. But the system heap exporter (sg_table owner) should be allowed to use them.
Good point that might be possible.
> If a driver takes ownership via dma_buf_map_attachment or similar calls, the exporter must stop using the sg_table.
> User-space programs should call DMA_BUF_IOCTL_RW_FILE only when the DMA-buf is not attached.
> The exporter must check ownership (e.g., ensure no map_dma_buf/vmap is active) and block new calls during operations.
> I'll add these checks in patch v2.
>
>>
>> Once more: This approach was already rejected multiple times! Please use
>> udmabuf instead!
>>
>> The hack you came up here is simply not necessary.
> [wangtao] Many people need DMA-buf direct I/O. I tried it 2 years ago. My method is simpler, uses less CPU/power, and performs better:
I don't think that this is a valid argument.
> - Speed: 3418 MB/s vs. 2073 MB/s (udmabuf) at 1GHz CPU.
> - udmabuf wastes half its CPU time on __get_user_pages.
> - Creating 32x32MB DMA-bufs + reading 1GB file takes 346 ms vs. 1145 ms for udmabuf (10x slower) vs. 1503 ms for DMA-buf normal.
Why would using udmabuf be slower here?
> udmabuf is slightly faster but not enough. Switching to udmabuf is easy for small apps but hard in complex systems without major benefits.
Yeah, but your approach here is a rather clear hack. Using udmabuf is much cleaner and generally accepted by everybody now.
As far as I can see I have to reject your approach here.
Regards,
Christian.
>>
>> Regards,
>> Christian.
>>
>>
>>> Appreciate your feedback.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> 2. Set IOCB_DIRECT when O_DIRECT is supported 3. Invoke
>>>>> vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
>>>>>
>>>>> Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):
>>>>>
>>>>> | Metric | 1MB | 8MB | 64MB | 1024MB | 3072MB |
>>>>> |--------------------|-------:|-------:|--------:|---------:|---------:|
>>>>> | Buffer Read (us) | 1658 | 9028 | 69295 | 1019783 | 2978179 |
>>>>> | Direct Read (us) | 707 | 2647 | 18689 | 299627 | 937758 |
>>>>> | Buffer Rate (MB/s) | 603 | 886 | 924 | 1004 | 1032 |
>>>>> | Direct Rate (MB/s) | 1414 | 3022 | 3425 | 3418 | 3276 |
>>>>>
>>>>> Signed-off-by: wangtao <tao.wangtao@honor.com>
>>>>> ---
>>>>> drivers/dma-buf/heaps/system_heap.c | 118
>>>>> ++++++++++++++++++++++++++++
>>>>> 1 file changed, 118 insertions(+)
>>>>>
>>>>> diff --git a/drivers/dma-buf/heaps/system_heap.c
>>>>> b/drivers/dma-buf/heaps/system_heap.c
>>>>> index 26d5dc89ea16..f7b71b9843aa 100644
>>>>> --- a/drivers/dma-buf/heaps/system_heap.c
>>>>> +++ b/drivers/dma-buf/heaps/system_heap.c
>>>>> @@ -20,6 +20,8 @@
>>>>> #include <linux/scatterlist.h>
>>>>> #include <linux/slab.h>
>>>>> #include <linux/vmalloc.h>
>>>>> +#include <linux/bvec.h>
>>>>> +#include <linux/uio.h>
>>>>>
>>>>> static struct dma_heap *sys_heap;
>>>>>
>>>>> @@ -281,6 +283,121 @@ static void system_heap_vunmap(struct
>> dma_buf
>>>> *dmabuf, struct iosys_map *map)
>>>>> iosys_map_clear(map);
>>>>> }
>>>>>
>>>>> +static struct bio_vec *system_heap_init_bvec(struct
>>>> system_heap_buffer *buffer,
>>>>> + size_t offset, size_t len, int *nr_segs) {
>>>>> + struct sg_table *sgt = &buffer->sg_table;
>>>>> + struct scatterlist *sg;
>>>>> + size_t length = 0;
>>>>> + unsigned int i, k = 0;
>>>>> + struct bio_vec *bvec;
>>>>> + size_t sg_left;
>>>>> + size_t sg_offset;
>>>>> + size_t sg_len;
>>>>> +
>>>>> + bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
>>>>> + if (!bvec)
>>>>> + return NULL;
>>>>> +
>>>>> + for_each_sg(sgt->sgl, sg, sgt->nents, i) {
>>>>> + length += sg->length;
>>>>> + if (length <= offset)
>>>>> + continue;
>>>>> +
>>>>> + sg_left = length - offset;
>>>>> + sg_offset = sg->offset + sg->length - sg_left;
>>>>> + sg_len = min(sg_left, len);
>>>>> +
>>>>> + bvec[k].bv_page = sg_page(sg);
>>>>> + bvec[k].bv_len = sg_len;
>>>>> + bvec[k].bv_offset = sg_offset;
>>>>> + k++;
>>>>> +
>>>>> + offset += sg_len;
>>>>> + len -= sg_len;
>>>>> + if (len <= 0)
>>>>> + break;
>>>>> + }
>>>>> +
>>>>> + *nr_segs = k;
>>>>> + return bvec;
>>>>> +}
>>>>> +
>>>>> +static int system_heap_rw_file(struct system_heap_buffer *buffer,
>>>>> +bool
>>>> is_read,
>>>>> + bool direct_io, struct file *filp, loff_t file_offset,
>>>>> + size_t buf_offset, size_t len)
>>>>> +{
>>>>> + struct bio_vec *bvec;
>>>>> + int nr_segs = 0;
>>>>> + struct iov_iter iter;
>>>>> + struct kiocb kiocb;
>>>>> + ssize_t ret = 0;
>>>>> +
>>>>> + if (direct_io) {
>>>>> + if (!(filp->f_mode & FMODE_CAN_ODIRECT))
>>>>> + return -EINVAL;
>>>>> + }
>>>>> +
>>>>> + bvec = system_heap_init_bvec(buffer, buf_offset, len, &nr_segs);
>>>>> + if (!bvec)
>>>>> + return -ENOMEM;
>>>>> +
>>>>> + iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE, bvec,
>>>> nr_segs, len);
>>>>> + init_sync_kiocb(&kiocb, filp);
>>>>> + kiocb.ki_pos = file_offset;
>>>>> + if (direct_io)
>>>>> + kiocb.ki_flags |= IOCB_DIRECT;
>>>>> +
>>>>> + while (kiocb.ki_pos < file_offset + len) {
>>>>> + if (is_read)
>>>>> + ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
>>>>> + else
>>>>> + ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
>>>>> + if (ret <= 0)
>>>>> + break;
>>>>> + }
>>>>> +
>>>>> + kvfree(bvec);
>>>>> + return ret < 0 ? ret : 0;
>>>>> +}
>>>>> +
>>>>> +static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
>>>>> + struct dma_buf_rw_file *back)
>>>>> +{
>>>>> + struct system_heap_buffer *buffer = dmabuf->priv;
>>>>> + int ret = 0;
>>>>> + __u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
>>>>> + bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
>>>>> + struct file *filp;
>>>>> +
>>>>> + if (op != DMA_BUF_RW_FLAGS_READ && op !=
>>>> DMA_BUF_RW_FLAGS_WRITE)
>>>>> + return -EINVAL;
>>>>> + if (direct_io) {
>>>>> + if (!PAGE_ALIGNED(back->file_offset) ||
>>>>> + !PAGE_ALIGNED(back->buf_offset) ||
>>>>> + !PAGE_ALIGNED(back->buf_len))
>>>>> + return -EINVAL;
>>>>> + }
>>>>> + if (!back->buf_len || back->buf_len > dmabuf->size ||
>>>>> + back->buf_offset >= dmabuf->size ||
>>>>> + back->buf_offset + back->buf_len > dmabuf->size)
>>>>> + return -EINVAL;
>>>>> + if (back->file_offset + back->buf_len < back->file_offset)
>>>>> + return -EINVAL;
>>>>> +
>>>>> + filp = fget(back->fd);
>>>>> + if (!filp)
>>>>> + return -EBADF;
>>>>> +
>>>>> + mutex_lock(&buffer->lock);
>>>>> + ret = system_heap_rw_file(buffer, op ==
>>>> DMA_BUF_RW_FLAGS_READ, direct_io,
>>>>> + filp, back->file_offset, back->buf_offset, back-
>>>>> buf_len);
>>>>> + mutex_unlock(&buffer->lock);
>>>>> +
>>>>> + fput(filp);
>>>>> + return ret;
>>>>> +}
>>>>> +
>>>>> static void system_heap_dma_buf_release(struct dma_buf *dmabuf) {
>>>>> struct system_heap_buffer *buffer = dmabuf->priv; @@ -308,6
>>>> +425,7
>>>>> @@ static const struct dma_buf_ops system_heap_buf_ops = {
>>>>> .mmap = system_heap_mmap,
>>>>> .vmap = system_heap_vmap,
>>>>> .vunmap = system_heap_vunmap,
>>>>> + .rw_file = system_heap_dma_buf_rw_file,
>>>>> .release = system_heap_dma_buf_release, };
>>>>>
>>>
>
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-14 12:00 ` Christian König
@ 2025-05-15 14:03 ` wangtao
2025-05-15 14:26 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-15 14:03 UTC (permalink / raw)
To: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Wednesday, May 14, 2025 8:00 PM
> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; tjmercier@google.com
> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> <yipengxiang@honor.com>; <liulu.liu@honor.com>; hanfeng
> <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/14/25 13:02, wangtao wrote:
> >> -----Original Message-----
> >> From: Christian König <christian.koenig@amd.com>
> >> Sent: Tuesday, May 13, 2025 9:18 PM
> >> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> >> jstultz@google.com; tjmercier@google.com
> >> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> >> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> >> <yipengxiang@honor.com>; <liulu.liu@honor.com>;
> <feng.han@honor.com>
> >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> >> DMA_BUF_IOCTL_RW_FILE for system_heap
> >>
> >> On 5/13/25 14:30, wangtao wrote:
> >>>> -----Original Message-----
> >>>> From: Christian König <christian.koenig@amd.com>
> >>>> Sent: Tuesday, May 13, 2025 7:32 PM
> >>>> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> >>>> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> >>>> jstultz@google.com; tjmercier@google.com
> >>>> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> >>>> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> >>>> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> >>>> <yipengxiang@honor.com>; <liulu.liu@honor.com>;
> >>>> <feng.han@honor.com>
> >>>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> >>>> DMA_BUF_IOCTL_RW_FILE for system_heap
> >>>>
> >>>> On 5/13/25 11:28, wangtao wrote:
> >>>>> Support direct file I/O operations for system_heap dma-buf objects.
> >>>>> Implementation includes:
> >>>>> 1. Convert sg_table to bio_vec
> >>>>
> >>>> That is usually illegal for DMA-bufs.
> >>> [wangtao] The term 'convert' is misleading in this context. The
> >>> appropriate
> >> phrasing should be: Construct bio_vec from sg_table.
> >>
> >> Well it doesn't matter what you call it. Touching the page inside an
> >> sg table of a DMA-buf is illegal, we even have code to actively prevent
> that.
> > [wangtao] For a driver using DMA-buf: Don't touch pages in the sg_table.
> But the system heap exporter (sg_table owner) should be allowed to use
> them.
>
> Good point that might be possible.
>
> > If a driver takes ownership via dma_buf_map_attachment or similar calls,
> the exporter must stop using the sg_table.
> > User-space programs should call DMA_BUF_IOCTL_RW_FILE only when the
> DMA-buf is not attached.
> > The exporter must check ownership (e.g., ensure no map_dma_buf/vmap
> is active) and block new calls during operations.
> > I'll add these checks in patch v2.
> >
> >>
> >> Once more: This approach was already rejected multiple times! Please
> >> use udmabuf instead!
> >>
> >> The hack you came up here is simply not necessary.
> > [wangtao] Many people need DMA-buf direct I/O. I tried it 2 years ago. My
> method is simpler, uses less CPU/power, and performs better:
>
> I don't think that this is a valid argument.
>
> > - Speed: 3418 MB/s vs. 2073 MB/s (udmabuf) at 1GHz CPU.
> > - udmabuf wastes half its CPU time on __get_user_pages.
> > - Creating 32x32MB DMA-bufs + reading 1GB file takes 346 ms vs. 1145 ms
> for udmabuf (10x slower) vs. 1503 ms for DMA-buf normal.
>
> Why would using udmabuf be slower here?
>
> > udmabuf is slightly faster but not enough. Switching to udmabuf is easy for
> small apps but hard in complex systems without major benefits.
>
> Yeah, but your approach here is a rather clear hack. Using udmabuf is much
> more cleaner and generally accepted by everybody now.
>
> As far as I can see I have to reject your approach here.
>
[wangtao] My Test Configuration (CPU 1GHz, 5-test average):
Allocation: 32x32MB buffer creation
- dmabuf 53ms vs. udmabuf 694ms (10X slower)
- Note: shmem shows excessive allocation time
Read 1024MB File:
- dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
- Note: pin_user_pages_fast consumes the majority of CPU cycles
Key function call timing: See details below.
Dmabuf direct io:
|- 12.39% DmaBufTest_PerfDmabufDirectIO_Test::TestBody()
|-|- 5.95% perf_dmabuf_alloc_and_io
|-|-|- 4.38% dmabuf_io_back
|-|-|-|- 3.47% ioctl
|-|-|-|-|- 3.47% __ioctl
|-|-|-|-|-|-|-|-|-|-|-|- 3.47% dma_buf_ioctl
|-|-|-|-|-|-|-|-|-|-|-|-|- 3.47% system_heap_dma_buf_rw_file
|-|-|-|-|-|-|-|-|-|-|-|-|-|- 3.46% system_heap_rw_file
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 3.46% f2fs_file_read_iter
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 3.46% __iomap_dio_rw
|-|-|- 1.33% ioctl
|-|-|-|- 1.33% __ioctl
|-|-|-|-|-|-|-|-|-|-|- 1.33% dma_heap_ioctl
|-|-|-|-|-|-|-|-|-|-|-|- 1.33% dma_heap_buffer_alloc
|-|-|-|-|-|-|-|-|-|-|-|-|- 1.33% system_heap_allocate
|-|-|-|-|-|-|-|-|-|-|-|-|-|- 1.33% system_heap_do_allocate
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 1.28% __alloc_pages
Udmabuf direct io:
|- 39.35% DmaBufTest_PerfDmabufUDirectIO_Test::TestBody()
|-|- 32.76% perf_dmabuf_alloc_and_io
|-|-|- 17.43% alloc_udmabuf
|-|-|-|- 13.34% ioctl
|-|-|-|-|-|-|-|-|-|-|-|- 13.34% udmabuf_ioctl
|-|-|-|-|-|-|-|-|-|-|-|-|- 13.32% udmabuf_create
|-|-|-|-|-|-|-|-|-|-|-|-|-|- 13.26% shmem_read_mapping_page_gfp
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 11.94% shmem_read_folio_gfp
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 11.28% shmem_get_folio_gfp
|-|-|- 10.81% dmabuf_io_back
|-|-|-|- 8.85% read
|-|-|-|-|-|-|-|-|-|- 8.85% __arm64_sys_read
|-|-|-|-|-|-|-|-|-|-|- 8.85% f2fs_file_read_iter
|-|-|-|-|-|-|-|-|-|-|-|- 8.84% __iomap_dio_rw
|-|-|-|-|-|-|-|-|-|-|-|-|- 7.85% iomap_dio_bio_iter
|-|-|-|-|-|-|-|-|-|-|-|-|-|- 5.61% bio_iov_iter_get_pages
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 5.46% iov_iter_extract_pages
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 5.46% pin_user_pages_fast
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 5.46% internal_get_user_pages_fast
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 5.46% __gup_longterm_locked
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|- 5.36% __get_user_pages
> Regards,
> Christian.
>
>
> >>
> >> Regards,
> >> Christian.
> >>
> >>
> >>> Appreciate your feedback.
> >>>>
> >>>> Regards,
> >>>> Christian.
> >>>>
> >>>>> 2. Set IOCB_DIRECT when O_DIRECT is supported 3. Invoke
> >>>>> vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
> >>>>>
> >>>>> Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):
> >>>>>
> >>>>> | Metric | 1MB | 8MB | 64MB | 1024MB | 3072MB |
> >>>>> |--------------------|-------:|-------:|--------:|---------:|---------:|
> >>>>> | Buffer Read (us) | 1658 | 9028 | 69295 | 1019783 | 2978179 |
> >>>>> | Direct Read (us) | 707 | 2647 | 18689 | 299627 | 937758 |
> >>>>> | Buffer Rate (MB/s) | 603 | 886 | 924 | 1004 | 1032 |
> >>>>> | Direct Rate (MB/s) | 1414 | 3022 | 3425 | 3418 | 3276 |
> >>>>>
> >>>>> Signed-off-by: wangtao <tao.wangtao@honor.com>
> >>>>> ---
> >>>>> drivers/dma-buf/heaps/system_heap.c | 118
> >>>>> ++++++++++++++++++++++++++++
> >>>>> 1 file changed, 118 insertions(+)
> >>>>>
> >>>>> diff --git a/drivers/dma-buf/heaps/system_heap.c
> >>>>> b/drivers/dma-buf/heaps/system_heap.c
> >>>>> index 26d5dc89ea16..f7b71b9843aa 100644
> >>>>> --- a/drivers/dma-buf/heaps/system_heap.c
> >>>>> +++ b/drivers/dma-buf/heaps/system_heap.c
> >>>>> @@ -20,6 +20,8 @@
> >>>>> #include <linux/scatterlist.h>
> >>>>> #include <linux/slab.h>
> >>>>> #include <linux/vmalloc.h>
> >>>>> +#include <linux/bvec.h>
> >>>>> +#include <linux/uio.h>
> >>>>>
> >>>>> static struct dma_heap *sys_heap;
> >>>>>
> >>>>> @@ -281,6 +283,121 @@ static void system_heap_vunmap(struct
> >> dma_buf
> >>>> *dmabuf, struct iosys_map *map)
> >>>>> iosys_map_clear(map);
> >>>>> }
> >>>>>
> >>>>> +static struct bio_vec *system_heap_init_bvec(struct
> >>>> system_heap_buffer *buffer,
> >>>>> + size_t offset, size_t len, int *nr_segs) {
> >>>>> + struct sg_table *sgt = &buffer->sg_table;
> >>>>> + struct scatterlist *sg;
> >>>>> + size_t length = 0;
> >>>>> + unsigned int i, k = 0;
> >>>>> + struct bio_vec *bvec;
> >>>>> + size_t sg_left;
> >>>>> + size_t sg_offset;
> >>>>> + size_t sg_len;
> >>>>> +
> >>>>> + bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
> >>>>> + if (!bvec)
> >>>>> + return NULL;
> >>>>> +
> >>>>> + for_each_sg(sgt->sgl, sg, sgt->nents, i) {
> >>>>> + length += sg->length;
> >>>>> + if (length <= offset)
> >>>>> + continue;
> >>>>> +
> >>>>> + sg_left = length - offset;
> >>>>> + sg_offset = sg->offset + sg->length - sg_left;
> >>>>> + sg_len = min(sg_left, len);
> >>>>> +
> >>>>> + bvec[k].bv_page = sg_page(sg);
> >>>>> + bvec[k].bv_len = sg_len;
> >>>>> + bvec[k].bv_offset = sg_offset;
> >>>>> + k++;
> >>>>> +
> >>>>> + offset += sg_len;
> >>>>> + len -= sg_len;
> >>>>> + if (len <= 0)
> >>>>> + break;
> >>>>> + }
> >>>>> +
> >>>>> + *nr_segs = k;
> >>>>> + return bvec;
> >>>>> +}
> >>>>> +
> >>>>> +static int system_heap_rw_file(struct system_heap_buffer *buffer,
> >>>>> +bool
> >>>> is_read,
> >>>>> + bool direct_io, struct file *filp, loff_t file_offset,
> >>>>> + size_t buf_offset, size_t len)
> >>>>> +{
> >>>>> + struct bio_vec *bvec;
> >>>>> + int nr_segs = 0;
> >>>>> + struct iov_iter iter;
> >>>>> + struct kiocb kiocb;
> >>>>> + ssize_t ret = 0;
> >>>>> +
> >>>>> + if (direct_io) {
> >>>>> + if (!(filp->f_mode & FMODE_CAN_ODIRECT))
> >>>>> + return -EINVAL;
> >>>>> + }
> >>>>> +
> >>>>> + bvec = system_heap_init_bvec(buffer, buf_offset, len,
> &nr_segs);
> >>>>> + if (!bvec)
> >>>>> + return -ENOMEM;
> >>>>> +
> >>>>> + iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE,
> bvec,
> >>>> nr_segs, len);
> >>>>> + init_sync_kiocb(&kiocb, filp);
> >>>>> + kiocb.ki_pos = file_offset;
> >>>>> + if (direct_io)
> >>>>> + kiocb.ki_flags |= IOCB_DIRECT;
> >>>>> +
> >>>>> + while (kiocb.ki_pos < file_offset + len) {
> >>>>> + if (is_read)
> >>>>> + ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
> >>>>> + else
> >>>>> + ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
> >>>>> + if (ret <= 0)
> >>>>> + break;
> >>>>> + }
> >>>>> +
> >>>>> + kvfree(bvec);
> >>>>> + return ret < 0 ? ret : 0;
> >>>>> +}
> >>>>> +
> >>>>> +static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
> >>>>> + struct dma_buf_rw_file *back)
> >>>>> +{
> >>>>> + struct system_heap_buffer *buffer = dmabuf->priv;
> >>>>> + int ret = 0;
> >>>>> + __u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
> >>>>> + bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
> >>>>> + struct file *filp;
> >>>>> +
> >>>>> + if (op != DMA_BUF_RW_FLAGS_READ && op !=
> >>>> DMA_BUF_RW_FLAGS_WRITE)
> >>>>> + return -EINVAL;
> >>>>> + if (direct_io) {
> >>>>> + if (!PAGE_ALIGNED(back->file_offset) ||
> >>>>> + !PAGE_ALIGNED(back->buf_offset) ||
> >>>>> + !PAGE_ALIGNED(back->buf_len))
> >>>>> + return -EINVAL;
> >>>>> + }
> >>>>> + if (!back->buf_len || back->buf_len > dmabuf->size ||
> >>>>> + back->buf_offset >= dmabuf->size ||
> >>>>> + back->buf_offset + back->buf_len > dmabuf->size)
> >>>>> + return -EINVAL;
> >>>>> + if (back->file_offset + back->buf_len < back->file_offset)
> >>>>> + return -EINVAL;
> >>>>> +
> >>>>> + filp = fget(back->fd);
> >>>>> + if (!filp)
> >>>>> + return -EBADF;
> >>>>> +
> >>>>> + mutex_lock(&buffer->lock);
> >>>>> + ret = system_heap_rw_file(buffer, op ==
> >>>> DMA_BUF_RW_FLAGS_READ, direct_io,
> >>>>> + filp, back->file_offset, back->buf_offset,
> back-
> >>>>> buf_len);
> >>>>> + mutex_unlock(&buffer->lock);
> >>>>> +
> >>>>> + fput(filp);
> >>>>> + return ret;
> >>>>> +}
> >>>>> +
> >>>>> static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
> {
> >>>>> struct system_heap_buffer *buffer = dmabuf->priv; @@ -308,6
> >>>> +425,7
> >>>>> @@ static const struct dma_buf_ops system_heap_buf_ops = {
> >>>>> .mmap = system_heap_mmap,
> >>>>> .vmap = system_heap_vmap,
> >>>>> .vunmap = system_heap_vunmap,
> >>>>> + .rw_file = system_heap_dma_buf_rw_file,
> >>>>> .release = system_heap_dma_buf_release, };
> >>>>>
> >>>
> >
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-15 14:03 ` wangtao
@ 2025-05-15 14:26 ` Christian König
2025-05-16 7:40 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-05-15 14:26 UTC (permalink / raw)
To: wangtao, sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On 5/15/25 16:03, wangtao wrote:
> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> Allocation: 32x32MB buffer creation
> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> - Note: shmem shows excessive allocation time
Yeah, that is something already noted by others as well. But that is orthogonal.
>
> Read 1024MB File:
> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> - Note: pin_user_pages_fast consumes majority CPU cycles
>
> Key function call timing: See details below.
Those aren't valid, you are comparing different functionalities here.
Please try using udmabuf with sendfile() as confirmed to be working by T.J.
Regards,
Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-15 14:26 ` Christian König
@ 2025-05-16 7:40 ` wangtao
2025-05-16 8:36 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-16 7:40 UTC (permalink / raw)
To: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Thursday, May 15, 2025 10:26 PM
> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; tjmercier@google.com
> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>; hanfeng
> 00012985 <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/15/25 16:03, wangtao wrote:
> > [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> > Allocation: 32x32MB buffer creation
> > - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> > - Note: shmem shows excessive allocation time
>
> Yeah, that is something already noted by others as well. But that is
> orthogonal.
>
> >
> > Read 1024MB File:
> > - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> > - Note: pin_user_pages_fast consumes majority CPU cycles
> >
> > Key function call timing: See details below.
>
> Those aren't valid, you are comparing different functionalities here.
>
> Please try using udmabuf with sendfile() as confirmed to be working by T.J.
[wangtao] Using buffer IO with dmabuf file read/write requires one memory copy.
Direct IO removes this copy to enable zero-copy. The sendfile system call
reduces memory copies from two (read/write) to one. However, with udmabuf,
sendfile still keeps at least one copy, failing zero-copy.
If udmabuf sendfile uses buffer IO (file page cache), read latency matches
dmabuf buffer read, but allocation time is much longer.
With Direct IO, the default 16-page pipe size makes it slower than buffer IO.
Test data shows:
udmabuf direct read is much faster than udmabuf sendfile.
dmabuf direct read outperforms udmabuf direct read by a large margin.
Issue: After udmabuf is mapped via map_dma_buf, apps using memfd or
udmabuf for Direct IO might cause errors, but there are no safeguards to
prevent this.
Allocate 32x32MB buffer and read 1024 MB file Test:
Metric | alloc (ms) | read (ms) | total (ms)
-----------------------|------------|-----------|-----------
udmabuf buffer read | 539 | 2017 | 2555
udmabuf direct read | 522 | 658 | 1179
udmabuf buffer sendfile| 505 | 1040 | 1546
udmabuf direct sendfile| 510 | 2269 | 2780
dmabuf buffer read | 51 | 1068 | 1118
dmabuf direct read | 52 | 297 | 349
udmabuf sendfile test steps:
1. Open data file(1024MB), get back_fd
2. Create memfd(32MB) # Loop steps 2-6
3. Allocate udmabuf with memfd
4. Call sendfile(memfd, back_fd)
5. Close memfd after sendfile
6. Close udmabuf
7. Close back_fd
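A minimal C sketch of the steps above (hypothetical test code, error handling
omitted; it assumes the standard udmabuf uAPI on /dev/udmabuf and that the
memfd must be sealed with F_SEAL_SHRINK before UDMABUF_CREATE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/sendfile.h>
#include <linux/udmabuf.h>

#define CHUNK (32UL << 20)  /* 32MB memfd per iteration, as in the test */

/* one loop iteration (steps 2-6); returns the udmabuf fd for later use */
static int read_chunk_via_sendfile(int udmabuf_dev, int back_fd, off_t file_off)
{
        struct udmabuf_create create = { 0 };
        int memfd, ubuf_fd;

        memfd = memfd_create("chunk", MFD_ALLOW_SEALING);        /* step 2 */
        ftruncate(memfd, CHUNK);
        fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);   /* required by udmabuf */

        create.memfd  = memfd;                                   /* step 3 */
        create.offset = 0;
        create.size   = CHUNK;
        ubuf_fd = ioctl(udmabuf_dev, UDMABUF_CREATE, &create);

        sendfile(memfd, back_fd, &file_off, CHUNK);              /* step 4 */

        close(memfd);                                            /* step 5 */
        return ubuf_fd;               /* step 6: caller closes the udmabuf */
}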
>
> Regards,
> Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-16 7:40 ` wangtao
@ 2025-05-16 8:36 ` Christian König
2025-05-16 9:49 ` wangtao
2025-05-16 18:37 ` T.J. Mercier
0 siblings, 2 replies; 28+ messages in thread
From: Christian König @ 2025-05-16 8:36 UTC (permalink / raw)
To: wangtao, sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On 5/16/25 09:40, wangtao wrote:
>
>
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Thursday, May 15, 2025 10:26 PM
>> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
>> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
>> jstultz@google.com; tjmercier@google.com
>> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
>> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
>> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
>> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>; hanfeng
>> 00012985 <feng.han@honor.com>
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> On 5/15/25 16:03, wangtao wrote:
>>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
>>> Allocation: 32x32MB buffer creation
>>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
>>> - Note: shmem shows excessive allocation time
>>
>> Yeah, that is something already noted by others as well. But that is
>> orthogonal.
>>
>>>
>>> Read 1024MB File:
>>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
>>> - Note: pin_user_pages_fast consumes majority CPU cycles
>>>
>>> Key function call timing: See details below.
>>
>> Those aren't valid, you are comparing different functionalities here.
>>
>> Please try using udmabuf with sendfile() as confirmed to be working by T.J.
> [wangtao] Using buffer IO with dmabuf file read/write requires one memory copy.
> Direct IO removes this copy to enable zero-copy. The sendfile system call
> reduces memory copies from two (read/write) to one. However, with udmabuf,
> sendfile still keeps at least one copy, failing zero-copy.
Then please work on fixing this.
Regards,
Christian.
>
> If udmabuf sendfile uses buffer IO (file page cache), read latency matches
> dmabuf buffer read, but allocation time is much longer.
> With Direct IO, the default 16-page pipe size makes it slower than buffer IO.
>
> Test data shows:
> udmabuf direct read is much faster than udmabuf sendfile.
> dmabuf direct read outperforms udmabuf direct read by a large margin.
>
> Issue: After udmabuf is mapped via map_dma_buf, apps using memfd or
> udmabuf for Direct IO might cause errors, but there are no safeguards to
> prevent this.
>
> Allocate 32x32MB buffer and read 1024 MB file Test:
> Metric | alloc (ms) | read (ms) | total (ms)
> -----------------------|------------|-----------|-----------
> udmabuf buffer read | 539 | 2017 | 2555
> udmabuf direct read | 522 | 658 | 1179
> udmabuf buffer sendfile| 505 | 1040 | 1546
> udmabuf direct sendfile| 510 | 2269 | 2780
> dmabuf buffer read | 51 | 1068 | 1118
> dmabuf direct read | 52 | 297 | 349
>
> udmabuf sendfile test steps:
> 1. Open data file(1024MB), get back_fd
> 2. Create memfd(32MB) # Loop steps 2-6
> 3. Allocate udmabuf with memfd
> 4. Call sendfile(memfd, back_fd)
> 5. Close memfd after sendfile
> 6. Close udmabuf
> 7. Close back_fd
>
>>
>> Regards,
>> Christian.
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-16 8:36 ` Christian König
@ 2025-05-16 9:49 ` wangtao
2025-05-16 10:29 ` Christian König
2025-05-16 18:37 ` T.J. Mercier
1 sibling, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-16 9:49 UTC (permalink / raw)
To: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Friday, May 16, 2025 4:36 PM
> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; tjmercier@google.com
> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>; hanfeng
> 00012985 <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/16/25 09:40, wangtao wrote:
> >
> >
> >> -----Original Message-----
> >> From: Christian König <christian.koenig@amd.com>
> >> Sent: Thursday, May 15, 2025 10:26 PM
> >> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> >> jstultz@google.com; tjmercier@google.com
> >> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> >> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> >> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>;
> >> hanfeng
> >> 00012985 <feng.han@honor.com>
> >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> >> DMA_BUF_IOCTL_RW_FILE for system_heap
> >>
> >> On 5/15/25 16:03, wangtao wrote:
> >>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> >>> Allocation: 32x32MB buffer creation
> >>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> >>> - Note: shmem shows excessive allocation time
> >>
> >> Yeah, that is something already noted by others as well. But that is
> >> orthogonal.
> >>
> >>>
> >>> Read 1024MB File:
> >>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> >>> - Note: pin_user_pages_fast consumes majority CPU cycles
> >>>
> >>> Key function call timing: See details below.
> >>
> >> Those aren't valid, you are comparing different functionalities here.
> >>
> >> Please try using udmabuf with sendfile() as confirmed to be working by
> T.J.
> > [wangtao] Using buffer IO with dmabuf file read/write requires one
> memory copy.
> > Direct IO removes this copy to enable zero-copy. The sendfile system
> > call reduces memory copies from two (read/write) to one. However, with
> > udmabuf, sendfile still keeps at least one copy, failing zero-copy.
>
>
> Then please work on fixing this.
[wangtao] What needs fixing? Does sendfile achieve zero-copy?
sendfile reduces memory copies (from 2 to 1) for network sockets,
but still requires one copy and cannot achieve zero copies.
>
> Regards,
> Christian.
>
>
> >
> > If udmabuf sendfile uses buffer IO (file page cache), read latency
> > matches dmabuf buffer read, but allocation time is much longer.
> > With Direct IO, the default 16-page pipe size makes it slower than buffer IO.
> >
> > Test data shows:
> > udmabuf direct read is much faster than udmabuf sendfile.
> > dmabuf direct read outperforms udmabuf direct read by a large margin.
> >
> > Issue: After udmabuf is mapped via map_dma_buf, apps using memfd or
> > udmabuf for Direct IO might cause errors, but there are no safeguards
> > to prevent this.
> >
> > Allocate 32x32MB buffer and read 1024 MB file Test:
> > Metric | alloc (ms) | read (ms) | total (ms)
> > -----------------------|------------|-----------|-----------
> > udmabuf buffer read | 539 | 2017 | 2555
> > udmabuf direct read | 522 | 658 | 1179
> > udmabuf buffer sendfile| 505 | 1040 | 1546
> > udmabuf direct sendfile| 510 | 2269 | 2780
> > dmabuf buffer read | 51 | 1068 | 1118
> > dmabuf direct read | 52 | 297 | 349
> >
> > udmabuf sendfile test steps:
> > 1. Open data file(1024MB), get back_fd 2. Create memfd(32MB) # Loop
> > steps 2-6 3. Allocate udmabuf with memfd 4. Call sendfile(memfd,
> > back_fd) 5. Close memfd after sendfile 6. Close udmabuf 7. Close
> > back_fd
> >
> >>
> >> Regards,
> >> Christian.
> >
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-16 9:49 ` wangtao
@ 2025-05-16 10:29 ` Christian König
2025-05-19 4:08 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-05-16 10:29 UTC (permalink / raw)
To: wangtao, sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On 5/16/25 11:49, wangtao wrote:
>>>> Please try using udmabuf with sendfile() as confirmed to be working by
>> T.J.
>>> [wangtao] Using buffer IO with dmabuf file read/write requires one
>> memory copy.
>>> Direct IO removes this copy to enable zero-copy. The sendfile system
>>> call reduces memory copies from two (read/write) to one. However, with
>>> udmabuf, sendfile still keeps at least one copy, failing zero-copy.
>>
>>
>> Then please work on fixing this.
> [wangtao] What needs fixing? Does sendfile achieve zero-copy?
> sendfile reduces memory copies (from 2 to 1) for network sockets,
> but still requires one copy and cannot achieve zero copies.
Well why not? See sendfile() is the designated Linux uAPI for moving data between two files, maybe splice() is also appropriate.
The memory file descriptor and your destination file are both files. So those uAPIs apply.
Now what you suggest is to add a new IOCTL to do this in a very specific manner just for the system DMA-buf heap. And as far as I can see that is in general a complete no-go.
I mean I understand why you do this. Instead of improving the existing functionality you're just hacking something together because it is simple for you.
It might be possible to implement that generic for DMA-buf heaps if udmabuf allocation overhead can't be reduced, but that is then just the second step.
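For reference, a rough user-space sketch of a file-to-memfd copy over those
uAPIs using splice(2) through a pipe (hypothetical test code, not from this
thread; each splice() call needs a pipe on one side):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* copy len bytes from src_fd to dst_fd (e.g. a memfd) via an anonymous pipe */
static ssize_t copy_via_splice(int dst_fd, int src_fd, size_t len)
{
        int pipefd[2];
        size_t done = 0;

        if (pipe(pipefd))
                return -1;

        while (done < len) {
                /* file -> pipe */
                ssize_t in = splice(src_fd, NULL, pipefd[1], NULL,
                                    len - done, SPLICE_F_MOVE);
                if (in <= 0)
                        break;
                /* pipe -> file, drain everything that was just queued */
                while (in > 0) {
                        ssize_t out = splice(pipefd[0], NULL, dst_fd, NULL,
                                             in, SPLICE_F_MOVE);
                        if (out <= 0)
                                goto out_close;
                        in -= out;
                        done += out;
                }
        }
out_close:
        close(pipefd[0]);
        close(pipefd[1]);
        return done;
}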
Regards,
Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-16 8:36 ` Christian König
2025-05-16 9:49 ` wangtao
@ 2025-05-16 18:37 ` T.J. Mercier
2025-05-19 4:37 ` wangtao
2025-05-19 12:03 ` wangtao
1 sibling, 2 replies; 28+ messages in thread
From: T.J. Mercier @ 2025-05-16 18:37 UTC (permalink / raw)
To: Christian König
Cc: wangtao, sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On Fri, May 16, 2025 at 1:36 AM Christian König
<christian.koenig@amd.com> wrote:
>
> On 5/16/25 09:40, wangtao wrote:
> >
> >
> >> -----Original Message-----
> >> From: Christian König <christian.koenig@amd.com>
> >> Sent: Thursday, May 15, 2025 10:26 PM
> >> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> >> jstultz@google.com; tjmercier@google.com
> >> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
> >> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> >> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>; hanfeng
> >> 00012985 <feng.han@honor.com>
> >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> >> DMA_BUF_IOCTL_RW_FILE for system_heap
> >>
> >> On 5/15/25 16:03, wangtao wrote:
> >>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> >>> Allocation: 32x32MB buffer creation
> >>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> >>> - Note: shmem shows excessive allocation time
> >>
> >> Yeah, that is something already noted by others as well. But that is
> >> orthogonal.
> >>
> >>>
> >>> Read 1024MB File:
> >>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> >>> - Note: pin_user_pages_fast consumes majority CPU cycles
> >>>
> >>> Key function call timing: See details below.
> >>
> >> Those aren't valid, you are comparing different functionalities here.
> >>
> >> Please try using udmabuf with sendfile() as confirmed to be working by T.J.
> > [wangtao] Using buffer IO with dmabuf file read/write requires one memory copy.
> > Direct IO removes this copy to enable zero-copy. The sendfile system call
> > reduces memory copies from two (read/write) to one. However, with udmabuf,
> > sendfile still keeps at least one copy, failing zero-copy.
>
>
> Then please work on fixing this.
>
> Regards,
> Christian.
>
>
> >
> > If udmabuf sendfile uses buffer IO (file page cache), read latency matches
> > dmabuf buffer read, but allocation time is much longer.
> > With Direct IO, the default 16-page pipe size makes it slower than buffer IO.
> >
> > Test data shows:
> > udmabuf direct read is much faster than udmabuf sendfile.
> > dmabuf direct read outperforms udmabuf direct read by a large margin.
> >
> > Issue: After udmabuf is mapped via map_dma_buf, apps using memfd or
> > udmabuf for Direct IO might cause errors, but there are no safeguards to
> > prevent this.
> >
> > Allocate 32x32MB buffer and read 1024 MB file Test:
> > Metric | alloc (ms) | read (ms) | total (ms)
> > -----------------------|------------|-----------|-----------
> > udmabuf buffer read | 539 | 2017 | 2555
> > udmabuf direct read | 522 | 658 | 1179
I can't reproduce the part where udmabuf direct reads are faster than
buffered reads. That's the opposite of what I'd expect. Something
seems wrong with those buffered reads.
> > udmabuf buffer sendfile| 505 | 1040 | 1546
> > udmabuf direct sendfile| 510 | 2269 | 2780
I can reproduce the 3.5x slower udmabuf direct sendfile compared to
udmabuf direct read. It's a pretty disappointing result, so it seems
like something could be improved there.
1G from ext4 on 6.12.17 | read/sendfile (ms)
------------------------|-------------------
udmabuf buffer read | 351
udmabuf direct read | 540
udmabuf buffer sendfile | 255
udmabuf direct sendfile | 1990
> > dmabuf buffer read | 51 | 1068 | 1118
> > dmabuf direct read | 52 | 297 | 349
> >
> > udmabuf sendfile test steps:
> > 1. Open data file(1024MB), get back_fd
> > 2. Create memfd(32MB) # Loop steps 2-6
> > 3. Allocate udmabuf with memfd
> > 4. Call sendfile(memfd, back_fd)
> > 5. Close memfd after sendfile
> > 6. Close udmabuf
> > 7. Close back_fd
> >
> >>
> >> Regards,
> >> Christian.
> >
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-16 10:29 ` Christian König
@ 2025-05-19 4:08 ` wangtao
2025-05-19 7:47 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-19 4:08 UTC (permalink / raw)
To: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Friday, May 16, 2025 6:29 PM
> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; tjmercier@google.com
> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> <yipengxiang@honor.com>; liulu <liulu.liu@honor.com>; hanfeng
> <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/16/25 11:49, wangtao wrote:
> >>>> Please try using udmabuf with sendfile() as confirmed to be working
> >>>> by
> >> T.J.
> >>> [wangtao] Using buffer IO with dmabuf file read/write requires one
> >> memory copy.
> >>> Direct IO removes this copy to enable zero-copy. The sendfile system
> >>> call reduces memory copies from two (read/write) to one. However,
> >>> with udmabuf, sendfile still keeps at least one copy, failing zero-copy.
> >>
> >>
> >> Then please work on fixing this.
> > [wangtao] What needs fixing? Does sendfile achieve zero-copy?
> > sendfile reduces memory copies (from 2 to 1) for network sockets, but
> > still requires one copy and cannot achieve zero copies.
>
> Well why not? See sendfile() is the designated Linux uAPI for moving data
> between two files, maybe splice() is also appropriate.
>
> The memory file descriptor and your destination file are both files. So those
> uAPIs apply.
[wangtao] I realize our disagreement lies here:
You believe sendfile enables zero-copy for regular file → socket/file:
sendfile(dst_socket, src_disk)
[disk] --DMA--> [page buffer] --DMA--> [NIC]
sendfile(dst_disk, src_disk)
[disk] --DMA--> [page buffer] --DMA--> [DISK]
But for regular file → memory file (e.g., tmpfs/shmem), a CPU copy is unavoidable:
sendfile(dst_memfile, src_disk)
[disk] --DMA--> [page buffer] --CPU copy--> [memory file]
Without memory-to-memory DMA, this wastes CPU/power — critical for embedded devices.
>
> Now what you suggest is to add a new IOCTL to do this in a very specific
> manner just for the system DMA-buf heap. And as far as I can see that is in
> general a complete no-go.
>
> I mean I understand why you do this. Instead of improving the existing
> functionality you're just hacking something together because it is simple for
> you.
>
> It might be possible to implement that generic for DMA-buf heaps if
> udmabuf allocation overhead can't be reduced, but that is then just the
> second step.
[wangtao] On dmabuf:
- DMABUF lacks Direct I/O support, hence our proposal.
- memfd supports Direct I/O but doesn’t fit our use case.
- udmabuf via memfd works but needs systemic changes (low ROI) and has slow allocation.
Your objections:
1. Adding an IOCTL? This targets dmabuf specifically, and our fix is simple.
sendfile doesn’t resolve it.
2. Accessing sgtable pages in the exporter? As the dmabuf creator, the exporter
fully controls sgtable/page data. We can restrict access to cases with no
external users.
Could you clarify which point you oppose?
>
> Regards,
> Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-16 18:37 ` T.J. Mercier
@ 2025-05-19 4:37 ` wangtao
2025-05-19 12:03 ` wangtao
1 sibling, 0 replies; 28+ messages in thread
From: wangtao @ 2025-05-19 4:37 UTC (permalink / raw)
To: T.J. Mercier, Christian König
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: T.J. Mercier <tjmercier@google.com>
> Sent: Saturday, May 17, 2025 2:37 AM
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On Fri, May 16, 2025 at 1:36 AM Christian König <christian.koenig@amd.com>
> wrote:
> >
> > On 5/16/25 09:40, wangtao wrote:
> > >
> > >
> > >> -----Original Message-----
> > >> From: Christian König <christian.koenig@amd.com>
> > >> Sent: Thursday, May 15, 2025 10:26 PM
> > >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> > >> DMA_BUF_IOCTL_RW_FILE for system_heap
> > >>
> > >> On 5/15/25 16:03, wangtao wrote:
> > >>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> > >>> Allocation: 32x32MB buffer creation
> > >>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> > >>> - Note: shmem shows excessive allocation time
> > >>
> > >> Yeah, that is something already noted by others as well. But that
> > >> is orthogonal.
> > >>
> > >>>
> > >>> Read 1024MB File:
> > >>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> > >>> - Note: pin_user_pages_fast consumes majority CPU cycles
> > >>>
> > >>> Key function call timing: See details below.
> > >>
> > >> Those aren't valid, you are comparing different functionalities here.
> > >>
> > >> Please try using udmabuf with sendfile() as confirmed to be working by
> T.J.
> > > [wangtao] Using buffer IO with dmabuf file read/write requires one
> memory copy.
> > > Direct IO removes this copy to enable zero-copy. The sendfile system
> > > call reduces memory copies from two (read/write) to one. However,
> > > with udmabuf, sendfile still keeps at least one copy, failing zero-copy.
> >
> >
> > Then please work on fixing this.
> >
> > Regards,
> > Christian.
> >
> >
> > >
> > > If udmabuf sendfile uses buffer IO (file page cache), read latency
> > > matches dmabuf buffer read, but allocation time is much longer.
> > > With Direct IO, the default 16-page pipe size makes it slower than buffer
> IO.
> > >
> > > Test data shows:
> > > udmabuf direct read is much faster than udmabuf sendfile.
> > > dmabuf direct read outperforms udmabuf direct read by a large margin.
> > >
> > > Issue: After udmabuf is mapped via map_dma_buf, apps using memfd or
> > > udmabuf for Direct IO might cause errors, but there are no
> > > safeguards to prevent this.
> > >
> > > Allocate 32x32MB buffer and read 1024 MB file Test:
> > > Metric | alloc (ms) | read (ms) | total (ms)
> > > -----------------------|------------|-----------|-----------
> > > udmabuf buffer read | 539 | 2017 | 2555
> > > udmabuf direct read | 522 | 658 | 1179
>
> I can't reproduce the part where udmabuf direct reads are faster than
> buffered reads. That's the opposite of what I'd expect. Something seems
> wrong with those buffered reads.
>
[wangtao] Buffer read requires an extra CPU memory copy. Our device's low CPU
performance leads to longer latency. On high-performance 3.5GHz CPUs, buffer
read shows better ratios but still lags behind direct I/O.
Tests used single-thread programs with 32MB readahead to minimize latency (embedded mobile devices usually use <= 2MB).
Test results (time in ms):
| Method            | alloc @1GHz | read @1GHz | alloc @3.5GHz | read @3.5GHz |
|-------------------|-------------|------------|---------------|--------------|
| udmabuf buffer RD | 543 | 2078 | 135 | 549 |
| udmabuf direct RD | 543 | 640 | 163 | 291 |
| udmabuf buffer SF | 494 | 1058 | 137 | 315 |
| udmabuf direct SF | 529 | 2335 | 143 | 909 |
| dmabuf buffer RD | 39 | 1077 | 23 | 349 |
| patch direct RD | 51 | 306 | 30 | 267 |
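For reference, a minimal sketch of what the "udmabuf direct read" case above
is assumed to look like (hypothetical user-space test code; buf is the
page-aligned mmap of the memfd backing the udmabuf, and offsets/lengths must
satisfy the filesystem's O_DIRECT alignment rules):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t direct_read_into_buf(const char *path, void *buf, size_t len)
{
        int fd = open(path, O_RDONLY | O_DIRECT);
        size_t done = 0;

        if (fd < 0)
                return -1;
        while (done < len) {
                ssize_t n = pread(fd, (char *)buf + done, len - done, done);
                if (n <= 0)
                        break;
                done += n;
        }
        close(fd);
        return done;
}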
> > > udmabuf buffer sendfile| 505 | 1040 | 1546
> > > udmabuf direct sendfile| 510 | 2269 | 2780
>
> I can reproduce the 3.5x slower udmabuf direct sendfile compared to
> udmabuf direct read. It's a pretty disappointing result, so it seems like
> something could be improved there.
>
> 1G from ext4 on 6.12.17 | read/sendfile (ms)
> ------------------------|-------------------
> udmabuf buffer read | 351
> udmabuf direct read | 540
> udmabuf buffer sendfile | 255
> udmabuf direct sendfile | 1990
>
[wangtao] Key observations:
1. Direct sendfile underperforms because of the small default pipe buffer (16
pages) and small memory-file pages, which require more DMA operations.
2. ext4 vs f2fs: ext4 supports huge pages / larger folios (unlike f2fs). Mobile
devices mostly use f2fs, which affects performance.
I/O path comparison:
- Buffer read: [DISK] → DMA → [page cache] → CPU copy → [memory file]
- Direct read: [DISK] → DMA → [memory file]
- Buffer sendfile: [DISK] → DMA → [page cache] → CPU copy → [memory file]
- Direct sendfile: [DISK] → DMA → [pipe buffer] → CPU copy → [memory file]
The extra CPU copy and pipe limitations explain the performance gap.
>
> > > dmabuf buffer read | 51 | 1068 | 1118
> > > dmabuf direct read | 52 | 297 | 349
> > >
> > > udmabuf sendfile test steps:
> > > 1. Open data file(1024MB), get back_fd 2. Create memfd(32MB) # Loop
> > > steps 2-6 3. Allocate udmabuf with memfd 4. Call sendfile(memfd,
> > > back_fd) 5. Close memfd after sendfile 6. Close udmabuf 7. Close
> > > back_fd
> > >
> > >>
> > >> Regards,
> > >> Christian.
> > >
> >
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-19 4:08 ` wangtao
@ 2025-05-19 7:47 ` Christian König
0 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2025-05-19 7:47 UTC (permalink / raw)
To: wangtao, sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com, tjmercier@google.com
Cc: linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On 5/19/25 06:08, wangtao wrote:
>
>
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Friday, May 16, 2025 6:29 PM
>> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
>> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
>> jstultz@google.com; tjmercier@google.com
>> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org; linaro-
>> mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
>> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
>> <yipengxiang@honor.com>; liulu <liulu.liu@honor.com>; hanfeng
>> <feng.han@honor.com>
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> On 5/16/25 11:49, wangtao wrote:
>>>>>> Please try using udmabuf with sendfile() as confirmed to be working
>>>>>> by
>>>> T.J.
>>>>> [wangtao] Using buffer IO with dmabuf file read/write requires one
>>>> memory copy.
>>>>> Direct IO removes this copy to enable zero-copy. The sendfile system
>>>>> call reduces memory copies from two (read/write) to one. However,
>>>>> with udmabuf, sendfile still keeps at least one copy, failing zero-copy.
>>>>
>>>>
>>>> Then please work on fixing this.
>>> [wangtao] What needs fixing? Does sendfile achieve zero-copy?
>>> sendfile reduces memory copies (from 2 to 1) for network sockets, but
>>> still requires one copy and cannot achieve zero copies.
>>
>> Well why not? See sendfile() is the designated Linux uAPI for moving data
>> between two files, maybe splice() is also appropriate.
>>
>> The memory file descriptor and your destination file are both files. So those
>> uAPIs apply.
> [wangtao] I realize our disagreement lies here:
> You believe sendfile enables zero-copy for regular file → socket/file:
No, what I mean is that it should be possible to solve this using sendfile() or splice() and not come up with a hacky IOCTL to bypass well-tested and agreed-upon system calls.
> sendfile(dst_socket, src_disk)
> [disk] --DMA--> [page buffer] --DMA--> [NIC]
> sendfile(dst_disk, src_disk)
> [disk] --DMA--> [page buffer] --DMA--> [DISK]
>
> But for regular file → memory file (e.g., tmpfs/shmem), a CPU copy is unavoidable:
> sendfile(dst_memfile, src_disk)
> [disk] --DMA--> [page buffer] --CPU copy--> [memory file]
> Without memory-to-memory DMA, this wastes CPU/power — critical for embedded devices.
>
>>
>> Now what you suggest is to add a new IOCTL to do this in a very specific
>> manner just for the system DMA-buf heap. And as far as I can see that is in
>> general a complete no-go.
>>
>> I mean I understand why you do this. Instead of improving the existing
>> functionality you're just hacking something together because it is simple for
>> you.
>>
>> It might be possible to implement that generic for DMA-buf heaps if
>> udmabuf allocation overhead can't be reduced, but that is then just the
>> second step.
> [wangtao] On dmabuf:
> - DMABUF lacks Direct I/O support, hence our proposal.
> - memfd supports Direct I/O but doesn’t fit our use case.
> - udmabuf via memfd works but needs systemic changes (low ROI) and has slow allocation.
>
> Your objections:
> 1. Adding an IOCTL? This targets dmabuf specifically, and our fix is simple.
> sendfile doesn’t resolve it.
> 2. Accessing sgtable pages in the exporter? As the dmabuf creator, the exporter
> fully controls sgtable/page data. We can restrict access to cases with no
> external users.
>
> Could you clarify which point you oppose?
Both. I might be repeating myself, but I think what you do here is a no-go and reimplements core system call functionality in a way which we certainly shouldn't allow.
T.J.'s testing shows that sendfile() seems to work at least in one direction. The other use case can certainly be optimized. So if you want to improve this, work on that instead.
Regards,
Christian
>
>>
>> Regards,
>> Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-16 18:37 ` T.J. Mercier
2025-05-19 4:37 ` wangtao
@ 2025-05-19 12:03 ` wangtao
2025-05-20 4:06 ` wangtao
1 sibling, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-19 12:03 UTC (permalink / raw)
To: T.J. Mercier, Christian König
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: T.J. Mercier <tjmercier@google.com>
> Sent: Saturday, May 17, 2025 2:37 AM
> To: Christian König <christian.koenig@amd.com>
> Cc: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; linux-media@vger.kernel.org; dri-
> devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> kernel@vger.kernel.org; wangbintian(BintianWang)
> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On Fri, May 16, 2025 at 1:36 AM Christian König <christian.koenig@amd.com>
> wrote:
> >
> > On 5/16/25 09:40, wangtao wrote:
> > >
> > >
> > >> -----Original Message-----
> > >> From: Christian König <christian.koenig@amd.com>
> > >> Sent: Thursday, May 15, 2025 10:26 PM
> > >> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> > >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> > >> jstultz@google.com; tjmercier@google.com
> > >> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> > >> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> > >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> > >> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>;
> > >> hanfeng
> > >> 00012985 <feng.han@honor.com>
> > >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> > >> DMA_BUF_IOCTL_RW_FILE for system_heap
> > >>
> > >> On 5/15/25 16:03, wangtao wrote:
> > >>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> > >>> Allocation: 32x32MB buffer creation
> > >>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> > >>> - Note: shmem shows excessive allocation time
> > >>
> > >> Yeah, that is something already noted by others as well. But that
> > >> is orthogonal.
> > >>
> > >>>
> > >>> Read 1024MB File:
> > >>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> > >>> - Note: pin_user_pages_fast consumes majority CPU cycles
> > >>>
> > >>> Key function call timing: See details below.
> > >>
> > >> Those aren't valid, you are comparing different functionalities here.
> > >>
> > >> Please try using udmabuf with sendfile() as confirmed to be working by
> T.J.
> > > [wangtao] Using buffer IO with dmabuf file read/write requires one
> memory copy.
> > > Direct IO removes this copy to enable zero-copy. The sendfile system
> > > call reduces memory copies from two (read/write) to one. However,
> > > with udmabuf, sendfile still keeps at least one copy, failing zero-copy.
> >
> >
> > Then please work on fixing this.
> >
> > Regards,
> > Christian.
> >
> >
> > >
> > > If udmabuf sendfile uses buffer IO (file page cache), read latency
> > > matches dmabuf buffer read, but allocation time is much longer.
> > > With Direct IO, the default 16-page pipe size makes it slower than buffer
> IO.
> > >
> > > Test data shows:
> > > udmabuf direct read is much faster than udmabuf sendfile.
> > > dmabuf direct read outperforms udmabuf direct read by a large margin.
> > >
> > > Issue: After udmabuf is mapped via map_dma_buf, apps using memfd or
> > > udmabuf for Direct IO might cause errors, but there are no
> > > safeguards to prevent this.
> > >
> > > Allocate 32x32MB buffer and read 1024 MB file Test:
> > > Metric | alloc (ms) | read (ms) | total (ms)
> > > -----------------------|------------|-----------|-----------
> > > udmabuf buffer read | 539 | 2017 | 2555
> > > udmabuf direct read | 522 | 658 | 1179
>
> I can't reproduce the part where udmabuf direct reads are faster than
> buffered reads. That's the opposite of what I'd expect. Something seems
> wrong with those buffered reads.
>
> > > udmabuf buffer sendfile| 505 | 1040 | 1546
> > > udmabuf direct sendfile| 510 | 2269 | 2780
>
> I can reproduce the 3.5x slower udmabuf direct sendfile compared to
> udmabuf direct read. It's a pretty disappointing result, so it seems like
> something could be improved there.
>
> 1G from ext4 on 6.12.17 | read/sendfile (ms)
> ------------------------|-------------------
> udmabuf buffer read | 351
> udmabuf direct read | 540
> udmabuf buffer sendfile | 255
> udmabuf direct sendfile | 1990
>
[wangtao] By the way, did you clear the file cache during testing?
Looking at your data again, read and sendfile buffers are
faster than Direct I/O, which suggests the file cache wasn’t
cleared. If you didn’t clear the file cache, the test results
are unfair and unreliable for reference. On embedded devices,
it’s nearly impossible to maintain stable caching for multi-GB
files. If such files could be cached, we might as well cache
dmabufs directly to save time on creating dmabufs and reading
file data.
You can call posix_fadvise(file_fd, 0, len, POSIX_FADV_DONTNEED)
after opening the file or before closing it to clear the file cache,
ensuring actual file I/O operations are tested.
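A minimal sketch of that cache-clearing step (a len of 0 means "to the end of
the file"; the fsync() only matters if the file has dirty pages and is harmless
for a read-only test file):

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <unistd.h>

static void drop_file_cache(int fd)
{
        fsync(fd);                                    /* flush any dirty pages */
        posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); /* evict cached pages */
}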
>
> > > dmabuf buffer read | 51 | 1068 | 1118
> > > dmabuf direct read | 52 | 297 | 349
> > >
> > > udmabuf sendfile test steps:
> > > 1. Open data file(1024MB), get back_fd 2. Create memfd(32MB) # Loop
> > > steps 2-6 3. Allocate udmabuf with memfd 4. Call sendfile(memfd,
> > > back_fd) 5. Close memfd after sendfile 6. Close udmabuf 7. Close
> > > back_fd
> > >
> > >>
> > >> Regards,
> > >> Christian.
> > >
> >
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-19 12:03 ` wangtao
@ 2025-05-20 4:06 ` wangtao
2025-05-21 2:00 ` T.J. Mercier
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-20 4:06 UTC (permalink / raw)
To: T.J. Mercier, Christian König
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
> -----Original Message-----
> From: wangtao
> Sent: Monday, May 19, 2025 8:04 PM
> To: 'T.J. Mercier' <tjmercier@google.com>; Christian König
> <christian.koenig@amd.com>
> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
> dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> kernel@vger.kernel.org; wangbintian(BintianWang)
> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>
> Subject: RE: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
>
>
> > -----Original Message-----
> > From: T.J. Mercier <tjmercier@google.com>
> > Sent: Saturday, May 17, 2025 2:37 AM
> > To: Christian König <christian.koenig@amd.com>
> > Cc: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> > benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> > jstultz@google.com; linux-media@vger.kernel.org; dri-
> > devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> > kernel@vger.kernel.org; wangbintian(BintianWang)
> > <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> > 00013167 <liulu.liu@honor.com>; hanfeng 00012985
> <feng.han@honor.com>
> > Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE
> > for system_heap
> >
> > On Fri, May 16, 2025 at 1:36 AM Christian König
> > <christian.koenig@amd.com>
> > wrote:
> > >
> > > On 5/16/25 09:40, wangtao wrote:
> > > >
> > > >
> > > >> -----Original Message-----
> > > >> From: Christian König <christian.koenig@amd.com>
> > > >> Sent: Thursday, May 15, 2025 10:26 PM
> > > >> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> > > >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> > > >> jstultz@google.com; tjmercier@google.com
> > > >> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> > > >> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> > > >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> > > >> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>;
> > > >> hanfeng
> > > >> 00012985 <feng.han@honor.com>
> > > >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> > > >> DMA_BUF_IOCTL_RW_FILE for system_heap
> > > >>
> > > >> On 5/15/25 16:03, wangtao wrote:
> > > >>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> > > >>> Allocation: 32x32MB buffer creation
> > > >>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> > > >>> - Note: shmem shows excessive allocation time
> > > >>
> > > >> Yeah, that is something already noted by others as well. But that
> > > >> is orthogonal.
> > > >>
> > > >>>
> > > >>> Read 1024MB File:
> > > >>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> > > >>> - Note: pin_user_pages_fast consumes majority CPU cycles
> > > >>>
> > > >>> Key function call timing: See details below.
> > > >>
> > > >> Those aren't valid, you are comparing different functionalities here.
> > > >>
> > > >> Please try using udmabuf with sendfile() as confirmed to be
> > > >> working by
> > T.J.
> > > > [wangtao] Using buffer IO with dmabuf file read/write requires one
> > memory copy.
> > > > Direct IO removes this copy to enable zero-copy. The sendfile
> > > > system call reduces memory copies from two (read/write) to one.
> > > > However, with udmabuf, sendfile still keeps at least one copy, failing
> zero-copy.
> > >
> > >
> > > Then please work on fixing this.
> > >
> > > Regards,
> > > Christian.
> > >
> > >
> > > >
> > > > If udmabuf sendfile uses buffer IO (file page cache), read latency
> > > > matches dmabuf buffer read, but allocation time is much longer.
> > > > With Direct IO, the default 16-page pipe size makes it slower than
> > > > buffer
> > IO.
> > > >
> > > > Test data shows:
> > > > udmabuf direct read is much faster than udmabuf sendfile.
> > > > dmabuf direct read outperforms udmabuf direct read by a large margin.
> > > >
> > > > Issue: After udmabuf is mapped via map_dma_buf, apps using memfd
> > > > or udmabuf for Direct IO might cause errors, but there are no
> > > > safeguards to prevent this.
> > > >
> > > > Allocate 32x32MB buffer and read 1024 MB file Test:
> > > > Metric | alloc (ms) | read (ms) | total (ms)
> > > > -----------------------|------------|-----------|-----------
> > > > udmabuf buffer read | 539 | 2017 | 2555
> > > > udmabuf direct read | 522 | 658 | 1179
> >
> > I can't reproduce the part where udmabuf direct reads are faster than
> > buffered reads. That's the opposite of what I'd expect. Something
> > seems wrong with those buffered reads.
> >
> > > > udmabuf buffer sendfile| 505 | 1040 | 1546
> > > > udmabuf direct sendfile| 510 | 2269 | 2780
> >
> > I can reproduce the 3.5x slower udmabuf direct sendfile compared to
> > udmabuf direct read. It's a pretty disappointing result, so it seems
> > like something could be improved there.
> >
> > 1G from ext4 on 6.12.17 | read/sendfile (ms)
> > ------------------------|-------------------
> > udmabuf buffer read | 351
> > udmabuf direct read | 540
> > udmabuf buffer sendfile | 255
> > udmabuf direct sendfile | 1990
> >
> [wangtao] By the way, did you clear the file cache during testing?
> Looking at your data again, read and sendfile buffers are faster than Direct
> I/O, which suggests the file cache wasn’t cleared. If you didn’t clear the file
> cache, the test results are unfair and unreliable for reference. On embedded
> devices, it’s nearly impossible to maintain stable caching for multi-GB files. If
> such files could be cached, we might as well cache dmabufs directly to save
> time on creating dmabufs and reading file data.
> You can call posix_fadvise(file_fd, 0, len, POSIX_FADV_DONTNEED) after
> opening the file or before closing it to clear the file cache, ensuring actual file
> I/O operations are tested.
>
[wangtao] Please confirm if cache clearing was performed during testing.
I reduced the test scope from 3GB to 1GB. While results without
cache clearing show general alignment, udmabuf buffer read remains
slower than direct read. Comparative data:
Your test reading 1GB (ext4 on 6.12.17):
Method | read/sendfile (ms) | read vs. (%)
----------------------------------------------------------
udmabuf buffer read | 351 | 138%
udmabuf direct read | 540 | 212%
udmabuf buffer sendfile | 255 | 100%
udmabuf direct sendfile | 1990 | 780%
My 3.5GHz tests (f2fs):
Without cache clearing:
Method | alloc | read | vs. (%)
-----------------------------------------------
udmabuf buffer read | 140 | 386 | 310%
udmabuf direct read | 151 | 326 | 262%
udmabuf buffer sendfile | 136 | 124 | 100%
udmabuf direct sendfile | 132 | 892 | 717%
dmabuf buffer read | 23 | 154 | 124%
patch direct read | 29 | 271 | 218%
With cache clearing:
Method | alloc | read | vs. (%)
-----------------------------------------------
udmabuf buffer read | 135 | 546 | 180%
udmabuf direct read | 159 | 300 | 99%
udmabuf buffer sendfile | 134 | 303 | 100%
udmabuf direct sendfile | 141 | 912 | 301%
dmabuf buffer read | 22 | 362 | 119%
patch direct read | 29 | 265 | 87%
Results without cache clearing aren't representative for embedded
mobile devices. Notably, on low-power CPUs @1GHz, sendfile latency
without cache clearing exceeds dmabuf direct I/O read time.
Without cache clearing:
Method | alloc | read | vs. (%)
-----------------------------------------------
udmabuf buffer read | 546 | 1745 | 442%
udmabuf direct read | 511 | 704 | 178%
udmabuf buffer sendfile | 496 | 395 | 100%
udmabuf direct sendfile | 498 | 2332 | 591%
dmabuf buffer read | 43 | 453 | 115%
my patch direct read | 49 | 310 | 79%
With cache clearing:
Method | alloc | read | vs. (%)
-----------------------------------------------
udmabuf buffer read | 552 | 2067 | 198%
udmabuf direct read | 540 | 627 | 60%
udmabuf buffer sendfile | 497 | 1045 | 100%
udmabuf direct sendfile | 527 | 2330 | 223%
dmabuf buffer read | 40 | 1111 | 106%
my patch direct read | 44 | 310 | 30%
Reducing CPU overhead/power consumption is critical for mobile devices.
We need simpler and more efficient dmabuf direct I/O support.
As Christian evaluated sendfile performance based on your data, could
you confirm whether the cache was cleared? If not, please share the
post-cache-clearing test data. Thank you for your support.
> >
> > > > dmabuf buffer read | 51 | 1068 | 1118
> > > > dmabuf direct read | 52 | 297 | 349
> > > >
> > > > udmabuf sendfile test steps:
> > > > 1. Open data file(1024MB), get back_fd 2. Create memfd(32MB) #
> > > > Loop steps 2-6 3. Allocate udmabuf with memfd 4. Call
> > > > sendfile(memfd,
> > > > back_fd) 5. Close memfd after sendfile 6. Close udmabuf 7. Close
> > > > back_fd
> > > >
> > > >>
> > > >> Regards,
> > > >> Christian.
> > > >
> > >
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-20 4:06 ` wangtao
@ 2025-05-21 2:00 ` T.J. Mercier
2025-05-21 4:17 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: T.J. Mercier @ 2025-05-21 2:00 UTC (permalink / raw)
To: wangtao
Cc: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, linux-media@vger.kernel.org,
dri-devel@lists.freedesktop.org, linaro-mm-sig@lists.linaro.org,
linux-kernel@vger.kernel.org, wangbintian(BintianWang),
yipengxiang, liulu 00013167, hanfeng 00012985
On Mon, May 19, 2025 at 9:06 PM wangtao <tao.wangtao@honor.com> wrote:
>
>
>
> > -----Original Message-----
> > From: wangtao
> > Sent: Monday, May 19, 2025 8:04 PM
> > To: 'T.J. Mercier' <tjmercier@google.com>; Christian König
> > <christian.koenig@amd.com>
> > Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> > Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
> > dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> > kernel@vger.kernel.org; wangbintian(BintianWang)
> > <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> > 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>
> > Subject: RE: [PATCH 2/2] dmabuf/heaps: implement
> > DMA_BUF_IOCTL_RW_FILE for system_heap
> >
> >
> >
> > > -----Original Message-----
> > > From: T.J. Mercier <tjmercier@google.com>
> > > Sent: Saturday, May 17, 2025 2:37 AM
> > > To: Christian König <christian.koenig@amd.com>
> > > Cc: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> > > benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> > > jstultz@google.com; linux-media@vger.kernel.org; dri-
> > > devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> > > kernel@vger.kernel.org; wangbintian(BintianWang)
> > > <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> > > 00013167 <liulu.liu@honor.com>; hanfeng 00012985
> > <feng.han@honor.com>
> > > Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> > DMA_BUF_IOCTL_RW_FILE
> > > for system_heap
> > >
> > > On Fri, May 16, 2025 at 1:36 AM Christian König
> > > <christian.koenig@amd.com>
> > > wrote:
> > > >
> > > > On 5/16/25 09:40, wangtao wrote:
> > > > >
> > > > >
> > > > >> -----Original Message-----
> > > > >> From: Christian König <christian.koenig@amd.com>
> > > > >> Sent: Thursday, May 15, 2025 10:26 PM
> > > > >> To: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> > > > >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> > > > >> jstultz@google.com; tjmercier@google.com
> > > > >> Cc: linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> > > > >> linaro- mm-sig@lists.linaro.org; linux-kernel@vger.kernel.org;
> > > > >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> > > > >> <yipengxiang@honor.com>; liulu 00013167 <liulu.liu@honor.com>;
> > > > >> hanfeng
> > > > >> 00012985 <feng.han@honor.com>
> > > > >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> > > > >> DMA_BUF_IOCTL_RW_FILE for system_heap
> > > > >>
> > > > >> On 5/15/25 16:03, wangtao wrote:
> > > > >>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> > > > >>> Allocation: 32x32MB buffer creation
> > > > >>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> > > > >>> - Note: shmem shows excessive allocation time
> > > > >>
> > > > >> Yeah, that is something already noted by others as well. But that
> > > > >> is orthogonal.
> > > > >>
> > > > >>>
> > > > >>> Read 1024MB File:
> > > > >>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> > > > >>> - Note: pin_user_pages_fast consumes majority CPU cycles
> > > > >>>
> > > > >>> Key function call timing: See details below.
> > > > >>
> > > > >> Those aren't valid, you are comparing different functionalities here.
> > > > >>
> > > > >> Please try using udmabuf with sendfile() as confirmed to be
> > > > >> working by
> > > T.J.
> > > > > [wangtao] Using buffer IO with dmabuf file read/write requires one
> > > memory copy.
> > > > > Direct IO removes this copy to enable zero-copy. The sendfile
> > > > > system call reduces memory copies from two (read/write) to one.
> > > > > However, with udmabuf, sendfile still keeps at least one copy, failing
> > zero-copy.
> > > >
> > > >
> > > > Then please work on fixing this.
> > > >
> > > > Regards,
> > > > Christian.
> > > >
> > > >
> > > > >
> > > > > If udmabuf sendfile uses buffer IO (file page cache), read latency
> > > > > matches dmabuf buffer read, but allocation time is much longer.
> > > > > With Direct IO, the default 16-page pipe size makes it slower than
> > > > > buffer
> > > IO.
> > > > >
> > > > > Test data shows:
> > > > > udmabuf direct read is much faster than udmabuf sendfile.
> > > > > dmabuf direct read outperforms udmabuf direct read by a large margin.
> > > > >
> > > > > Issue: After udmabuf is mapped via map_dma_buf, apps using memfd
> > > > > or udmabuf for Direct IO might cause errors, but there are no
> > > > > safeguards to prevent this.
> > > > >
> > > > > Allocate 32x32MB buffer and read 1024 MB file Test:
> > > > > Metric | alloc (ms) | read (ms) | total (ms)
> > > > > -----------------------|------------|-----------|-----------
> > > > > udmabuf buffer read | 539 | 2017 | 2555
> > > > > udmabuf direct read | 522 | 658 | 1179
> > >
> > > I can't reproduce the part where udmabuf direct reads are faster than
> > > buffered reads. That's the opposite of what I'd expect. Something
> > > seems wrong with those buffered reads.
> > >
> > > > > udmabuf buffer sendfile| 505 | 1040 | 1546
> > > > > udmabuf direct sendfile| 510 | 2269 | 2780
> > >
> > > I can reproduce the 3.5x slower udmabuf direct sendfile compared to
> > > udmabuf direct read. It's a pretty disappointing result, so it seems
> > > like something could be improved there.
> > >
> > > 1G from ext4 on 6.12.17 | read/sendfile (ms)
> > > ------------------------|-------------------
> > > udmabuf buffer read | 351
> > > udmabuf direct read | 540
> > > udmabuf buffer sendfile | 255
> > > udmabuf direct sendfile | 1990
> > >
> > [wangtao] By the way, did you clear the file cache during testing?
> > Looking at your data again, read and sendfile buffers are faster than Direct
> > I/O, which suggests the file cache wasn’t cleared. If you didn’t clear the file
> > cache, the test results are unfair and unreliable for reference. On embedded
> > devices, it’s nearly impossible to maintain stable caching for multi-GB files. If
> > such files could be cached, we might as well cache dmabufs directly to save
> > time on creating dmabufs and reading file data.
> > You can call posix_fadvise(file_fd, 0, len, POSIX_FADV_DONTNEED) after
> > opening the file or before closing it to clear the file cache, ensuring actual file
> > I/O operations are tested.
> >
> [wangtao] Please confirm if cache clearing was performed during testing.
> I reduced the test scope from 3GB to 1GB. While results without
> cache clearing show general alignment, udmabuf buffer read remains
> slower than direct read. Comparative data:
>
> Your test reading 1GB (ext4 on 6.12.17):
> Method | read/sendfile (ms) | read vs. (%)
> ----------------------------------------------------------
> udmabuf buffer read | 351 | 138%
> udmabuf direct read | 540 | 212%
> udmabuf buffer sendfile | 255 | 100%
> udmabuf direct sendfile | 1990 | 780%
>
> My 3.5GHz tests (f2fs):
> Without cache clearing:
> Method | alloc | read | vs. (%)
> -----------------------------------------------
> udmabuf buffer read | 140 | 386 | 310%
> udmabuf direct read | 151 | 326 | 262%
> udmabuf buffer sendfile | 136 | 124 | 100%
> udmabuf direct sendfile | 132 | 892 | 717%
> dmabuf buffer read | 23 | 154 | 124%
> patch direct read | 29 | 271 | 218%
>
> With cache clearing:
> Method | alloc | read | vs. (%)
> -----------------------------------------------
> udmabuf buffer read | 135 | 546 | 180%
> udmabuf direct read | 159 | 300 | 99%
> udmabuf buffer sendfile | 134 | 303 | 100%
> udmabuf direct sendfile | 141 | 912 | 301%
> dmabuf buffer read | 22 | 362 | 119%
> patch direct read | 29 | 265 | 87%
>
> Results without cache clearing aren't representative of embedded
> mobile devices. Notably, on low-power CPUs @1GHz, sendfile latency
> without cache clearing exceeds dmabuf direct I/O read time.
>
> Without cache clearing:
> Method | alloc | read | vs. (%)
> -----------------------------------------------
> udmabuf buffer read | 546 | 1745 | 442%
> udmabuf direct read | 511 | 704 | 178%
> udmabuf buffer sendfile | 496 | 395 | 100%
> udmabuf direct sendfile | 498 | 2332 | 591%
> dmabuf buffer read | 43 | 453 | 115%
> my patch direct read | 49 | 310 | 79%
>
> With cache clearing:
> Method | alloc | read | vs. (%)
> -----------------------------------------------
> udmabuf buffer read | 552 | 2067 | 198%
> udmabuf direct read | 540 | 627 | 60%
> udmabuf buffer sendfile | 497 | 1045 | 100%
> udmabuf direct sendfile | 527 | 2330 | 223%
> dmabuf buffer read | 40 | 1111 | 106%
> my patch direct read | 44 | 310 | 30%
>
> Reducing CPU overhead/power consumption is critical for mobile devices.
> We need simpler and more efficient dmabuf direct I/O support.
>
> As Christian evaluated sendfile performance based on your data, could
> you confirm whether the cache was cleared? If not, please share the
> post-cache-clearing test data. Thank you for your support.
Yes sorry, I was out yesterday riding motorcycles. I did not clear the
cache for the buffered reads, I didn't realize you had. The IO plus
the copy certainly explains the difference.
Your point about the unlikelihood of any of that data being in the
cache also makes sense.
I'm not sure it changes anything about the ioctl approach though.
Another way to do this would be to move the (optional) support for
direct IO into the exporter via dma_buf_fops and dma_buf_ops. Then
normal read() syscalls would just work for buffers that support them.
I know that's more complicated, but at least it doesn't require
inventing new uapi to do it.
1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
------------------------|-------------------
udmabuf buffer read | 1210
udmabuf direct read | 671
udmabuf buffer sendfile | 1096
udmabuf direct sendfile | 2340
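
For illustration, a minimal sketch of that exporter-hook idea, assuming a
new optional read_iter member on dma_buf_ops; neither the member nor this
wiring exists in mainline, the names are made up here:

/*
 * Hypothetical sketch only: dma-buf core would forward read() on the
 * dmabuf fd to an optional exporter callback.
 */
static ssize_t dma_buf_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct dma_buf *dmabuf = iocb->ki_filp->private_data;

	if (!dmabuf->ops->read_iter)	/* hypothetical optional hook */
		return -EOPNOTSUPP;

	/* the exporter decides how to fill the iterator, e.g. via direct I/O */
	return dmabuf->ops->read_iter(dmabuf, iocb, to);
}

/* added alongside the existing dma_buf_fops callbacks */
static const struct file_operations dma_buf_fops = {
	/* ...existing .release, .mmap, .unlocked_ioctl, ... */
	.read_iter	= dma_buf_file_read_iter,
};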
>
> > >
> > > > > dmabuf buffer read | 51 | 1068 | 1118
> > > > > dmabuf direct read | 52 | 297 | 349
> > > > >
> > > > > udmabuf sendfile test steps:
> > > > > 1. Open data file (1024MB), get back_fd
> > > > > 2. Create memfd (32MB)   # Loop steps 2-6
> > > > > 3. Allocate udmabuf with memfd
> > > > > 4. Call sendfile(memfd, back_fd)
> > > > > 5. Close memfd after sendfile
> > > > > 6. Close udmabuf
> > > > > 7. Close back_fd
> > > > >
> > > > >>
> > > > >> Regards,
> > > > >> Christian.
> > > > >
> > > >
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-21 2:00 ` T.J. Mercier
@ 2025-05-21 4:17 ` wangtao
2025-05-21 7:35 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-21 4:17 UTC (permalink / raw)
To: T.J. Mercier
Cc: Christian König, sumit.semwal@linaro.org,
benjamin.gaignard@collabora.com, Brian.Starkey@arm.com,
jstultz@google.com, linux-media@vger.kernel.org,
dri-devel@lists.freedesktop.org, linaro-mm-sig@lists.linaro.org,
linux-kernel@vger.kernel.org, wangbintian(BintianWang),
yipengxiang, liulu 00013167, hanfeng 00012985
> -----Original Message-----
> From: T.J. Mercier <tjmercier@google.com>
> Sent: Wednesday, May 21, 2025 10:01 AM
> To: wangtao <tao.wangtao@honor.com>
> Cc: Christian König <christian.koenig@amd.com>; sumit.semwal@linaro.org;
> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> jstultz@google.com; linux-media@vger.kernel.org; dri-
> devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> kernel@vger.kernel.org; wangbintian(BintianWang)
> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On Mon, May 19, 2025 at 9:06 PM wangtao <tao.wangtao@honor.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: wangtao
> > > Sent: Monday, May 19, 2025 8:04 PM
> > > To: 'T.J. Mercier' <tjmercier@google.com>; Christian König
> > > <christian.koenig@amd.com>
> > > Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> > > Brian.Starkey@arm.com; jstultz@google.com;
> > > linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> > > linaro-mm-sig@lists.linaro.org; linux- kernel@vger.kernel.org;
> > > wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> > > <yipengxiang@honor.com>; liulu
> > > 00013167 <liulu.liu@honor.com>; hanfeng 00012985
> > > <feng.han@honor.com>
> > > Subject: RE: [PATCH 2/2] dmabuf/heaps: implement
> > > DMA_BUF_IOCTL_RW_FILE for system_heap
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: T.J. Mercier <tjmercier@google.com>
> > > > Sent: Saturday, May 17, 2025 2:37 AM
> > > > To: Christian König <christian.koenig@amd.com>
> > > > Cc: wangtao <tao.wangtao@honor.com>; sumit.semwal@linaro.org;
> > > > benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> > > > jstultz@google.com; linux-media@vger.kernel.org; dri-
> > > > devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org;
> > > > linux- kernel@vger.kernel.org; wangbintian(BintianWang)
> > > > <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>;
> > > > liulu
> > > > 00013167 <liulu.liu@honor.com>; hanfeng 00012985
> > > <feng.han@honor.com>
> > > > Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> > > DMA_BUF_IOCTL_RW_FILE
> > > > for system_heap
> > > >
> > > > On Fri, May 16, 2025 at 1:36 AM Christian König
> > > > <christian.koenig@amd.com>
> > > > wrote:
> > > > >
> > > > > On 5/16/25 09:40, wangtao wrote:
> > > > > >
> > > > > >
> > > > > >> -----Original Message-----
> > > > > >> From: Christian König <christian.koenig@amd.com>
> > > > > >> Sent: Thursday, May 15, 2025 10:26 PM
> > > > > >> To: wangtao <tao.wangtao@honor.com>;
> sumit.semwal@linaro.org;
> > > > > >> benjamin.gaignard@collabora.com; Brian.Starkey@arm.com;
> > > > > >> jstultz@google.com; tjmercier@google.com
> > > > > >> Cc: linux-media@vger.kernel.org;
> > > > > >> dri-devel@lists.freedesktop.org;
> > > > > >> linaro- mm-sig@lists.linaro.org;
> > > > > >> linux-kernel@vger.kernel.org;
> > > > > >> wangbintian(BintianWang) <bintian.wang@honor.com>;
> > > > > >> yipengxiang <yipengxiang@honor.com>; liulu 00013167
> > > > > >> <liulu.liu@honor.com>; hanfeng
> > > > > >> 00012985 <feng.han@honor.com>
> > > > > >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> > > > > >> DMA_BUF_IOCTL_RW_FILE for system_heap
> > > > > >>
> > > > > >> On 5/15/25 16:03, wangtao wrote:
> > > > > >>> [wangtao] My Test Configuration (CPU 1GHz, 5-test average):
> > > > > >>> Allocation: 32x32MB buffer creation
> > > > > >>> - dmabuf 53ms vs. udmabuf 694ms (10X slower)
> > > > > >>> - Note: shmem shows excessive allocation time
> > > > > >>
> > > > > >> Yeah, that is something already noted by others as well. But
> > > > > >> that is orthogonal.
> > > > > >>
> > > > > >>>
> > > > > >>> Read 1024MB File:
> > > > > >>> - dmabuf direct 326ms vs. udmabuf direct 461ms (40% slower)
> > > > > >>> - Note: pin_user_pages_fast consumes majority CPU cycles
> > > > > >>>
> > > > > >>> Key function call timing: See details below.
> > > > > >>
> > > > > >> Those aren't valid, you are comparing different functionalities here.
> > > > > >>
> > > > > >> Please try using udmabuf with sendfile() as confirmed to be
> > > > > >> working by
> > > > T.J.
> > > > > > [wangtao] Using buffer IO with dmabuf file read/write requires
> > > > > > one memory copy.
> > > > > > Direct IO removes this copy to enable zero-copy. The sendfile
> > > > > > system call reduces memory copies from two (read/write) to one.
> > > > > > However, with udmabuf, sendfile still keeps at least one copy,
> > > > > > failing zero-copy.
> > > > >
> > > > >
> > > > > Then please work on fixing this.
> > > > >
> > > > > Regards,
> > > > > Christian.
> > > > >
> > > > >
> > > > > >
> > > > > > If udmabuf sendfile uses buffer IO (file page cache), read
> > > > > > latency matches dmabuf buffer read, but allocation time is much
> > > > > > longer. With Direct IO, the default 16-page pipe size makes it
> > > > > > slower than buffer IO.
> > > > > >
> > > > > > Test data shows:
> > > > > > udmabuf direct read is much faster than udmabuf sendfile.
> > > > > > dmabuf direct read outperforms udmabuf direct read by a large
> > > > > > margin.
> > > > > >
> > > > > > Issue: After udmabuf is mapped via map_dma_buf, apps using
> > > > > > memfd or udmabuf for Direct IO might cause errors, but there
> > > > > > are no safeguards to prevent this.
> > > > > >
> > > > > > Allocate 32x32MB buffer and read 1024 MB file Test:
> > > > > > Metric | alloc (ms) | read (ms) | total (ms)
> > > > > > -----------------------|------------|-----------|-----------
> > > > > > udmabuf buffer read | 539 | 2017 | 2555
> > > > > > udmabuf direct read | 522 | 658 | 1179
> > > >
> > > > I can't reproduce the part where udmabuf direct reads are faster
> > > > than buffered reads. That's the opposite of what I'd expect.
> > > > Something seems wrong with those buffered reads.
> > > >
> > > > > > udmabuf buffer sendfile| 505 | 1040 | 1546
> > > > > > udmabuf direct sendfile| 510 | 2269 | 2780
> > > >
> > > > I can reproduce the 3.5x slower udmabuf direct sendfile compared
> > > > to udmabuf direct read. It's a pretty disappointing result, so it
> > > > seems like something could be improved there.
> > > >
> > > > 1G from ext4 on 6.12.17 | read/sendfile (ms)
> > > > ------------------------|-------------------
> > > > udmabuf buffer read | 351
> > > > udmabuf direct read | 540
> > > > udmabuf buffer sendfile | 255
> > > > udmabuf direct sendfile | 1990
> > > >
> > > [wangtao] By the way, did you clear the file cache during testing?
> > > Looking at your data again, buffered read and sendfile are faster
> > > than Direct I/O, which suggests the file cache wasn’t cleared. If
> > > you didn’t clear the file cache, the test results are unfair and
> > > unreliable for reference. On embedded devices, it’s nearly
> > > impossible to maintain stable caching for multi-GB files. If such
> > > files could be cached, we might as well cache dmabufs directly to save
> > > time on creating dmabufs and reading file data.
> > > You can call posix_fadvise(file_fd, 0, len, POSIX_FADV_DONTNEED)
> > > after opening the file or before closing it to clear the file cache,
> > > ensuring actual file I/O operations are tested.
> > >
> > [wangtao] Please confirm if cache clearing was performed during testing.
> > I reduced the test scope from 3GB to 1GB. While the results without cache
> > clearing broadly match yours, udmabuf buffer read remains slower
> > than direct read. Comparative data:
> >
> > Your test reading 1GB (ext4 on 6.12.17):
> > Method | read/sendfile (ms) | read vs. (%)
> > ----------------------------------------------------------
> > udmabuf buffer read | 351 | 138%
> > udmabuf direct read | 540 | 212%
> > udmabuf buffer sendfile | 255 | 100%
> > udmabuf direct sendfile | 1990 | 780%
> >
> > My 3.5GHz tests (f2fs):
> > Without cache clearing:
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 140 | 386 | 310%
> > udmabuf direct read | 151 | 326 | 262%
> > udmabuf buffer sendfile | 136 | 124 | 100%
> > udmabuf direct sendfile | 132 | 892 | 717%
> > dmabuf buffer read | 23 | 154 | 124%
> > patch direct read | 29 | 271 | 218%
> >
> > With cache clearing:
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 135 | 546 | 180%
> > udmabuf direct read | 159 | 300 | 99%
> > udmabuf buffer sendfile | 134 | 303 | 100%
> > udmabuf direct sendfile | 141 | 912 | 301%
> > dmabuf buffer read | 22 | 362 | 119%
> > patch direct read | 29 | 265 | 87%
> >
> > Results without cache clearing aren't representative of embedded
> > mobile devices. Notably, on low-power CPUs @1GHz, sendfile latency
> > without cache clearing exceeds dmabuf direct I/O read time.
> >
> > Without cache clearing:
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 546 | 1745 | 442%
> > udmabuf direct read | 511 | 704 | 178%
> > udmabuf buffer sendfile | 496 | 395 | 100%
> > udmabuf direct sendfile | 498 | 2332 | 591%
> > dmabuf buffer read | 43 | 453 | 115%
> > my patch direct read | 49 | 310 | 79%
> >
> > With cache clearing:
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 552 | 2067 | 198%
> > udmabuf direct read | 540 | 627 | 60%
> > udmabuf buffer sendfile | 497 | 1045 | 100%
> > udmabuf direct sendfile | 527 | 2330 | 223%
> > dmabuf buffer read | 40 | 1111 | 106%
> > my patch direct read | 44 | 310 | 30%
> >
> > Reducing CPU overhead/power consumption is critical for mobile devices.
> > We need simpler and more efficient dmabuf direct I/O support.
> >
> > As Christian evaluated sendfile performance based on your data, could
> > you confirm whether the cache was cleared? If not, please share the
> > post-cache-clearing test data. Thank you for your support.
>
> Yes sorry, I was out yesterday riding motorcycles. I did not clear the cache for
> the buffered reads, I didn't realize you had. The IO plus the copy certainly
> explains the difference.
>
> Your point about the unlikelihood of any of that data being in the cache also
> makes sense.
[wangtao] Thank you for testing and clarifying.
>
> I'm not sure it changes anything about the ioctl approach though.
> Another way to do this would be to move the (optional) support for direct IO
> into the exporter via dma_buf_fops and dma_buf_ops. Then normal read()
> syscalls would just work for buffers that support them.
> I know that's more complicated, but at least it doesn't require inventing new
> uapi to do it.
>
[wangtao] Thank you for the discussion. I fully support any method that enables
dmabuf direct I/O.
I understand using sendfile/splice with regular files for dmabuf
adds an extra CPU copy, preventing zero-copy. For example:
sendfile path: [DISK] → DMA → [page cache] → CPU copy → [memory file].
The read() syscall has no parameter that could carry the regular file fd,
so I added an ioctl command.
While copy_file_range() supports two fds (fd_in/fd_out), it blocks cross-fs use.
Even without this restriction, file_out->f_op->copy_file_range
only enables dmabuf direct reads from regular files, not writes.
Since dmabuf's direct I/O limitation comes from its unique
attachment/map/fence model, and no existing syscall fits that model,
adding an ioctl seems necessary.
When system exporters return a duplicated sg_table via map_dma_buf
(used exclusively like a pages array), they should retain control
over it.
I welcome all solutions to achieve dmabuf direct I/O! Your feedback
is greatly appreciated.
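
For reference, a rough sketch of how userspace would drive the proposed
ioctl. The argument struct below is illustrative only; the real layout is
whatever patch 1/2 defines and may differ:

struct dma_buf_rw_file_args {	/* hypothetical layout, not the real uapi */
	__u32 fd;		/* regular file fd, opened with O_DIRECT */
	__u32 op;		/* 0 = read into dmabuf, 1 = write from dmabuf */
	__u64 file_offset;	/* offset within the regular file */
	__u64 buf_offset;	/* offset within the dmabuf */
	__u64 len;		/* bytes to transfer */
};

	/* dmabuf_fd comes from DMA_HEAP_IOCTL_ALLOC on the system heap */
	int file_fd = open("/data/blob.bin", O_RDONLY | O_DIRECT);
	struct dma_buf_rw_file_args args = {
		.fd = file_fd, .op = 0,
		.file_offset = 0, .buf_offset = 0, .len = buf_len,
	};

	if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_RW_FILE, &args))
		perror("DMA_BUF_IOCTL_RW_FILE");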
> 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
> ------------------------|-------------------
> udmabuf buffer read | 1210
> udmabuf direct read | 671
> udmabuf buffer sendfile | 1096
> udmabuf direct sendfile | 2340
>
>
>
> >
> > > >
> > > > > > dmabuf buffer read | 51 | 1068 | 1118
> > > > > > dmabuf direct read | 52 | 297 | 349
> > > > > >
> > > > > > udmabuf sendfile test steps:
> > > > > > 1. Open data file (1024MB), get back_fd
> > > > > > 2. Create memfd (32MB)   # Loop steps 2-6
> > > > > > 3. Allocate udmabuf with memfd
> > > > > > 4. Call sendfile(memfd, back_fd)
> > > > > > 5. Close memfd after sendfile
> > > > > > 6. Close udmabuf
> > > > > > 7. Close back_fd
> > > > > >
> > > > > >>
> > > > > >> Regards,
> > > > > >> Christian.
> > > > > >
> > > > >
> >
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-21 4:17 ` wangtao
@ 2025-05-21 7:35 ` Christian König
2025-05-21 10:25 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-05-21 7:35 UTC (permalink / raw)
To: wangtao, T.J. Mercier
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
On 5/21/25 06:17, wangtao wrote:
>>> Reducing CPU overhead/power consumption is critical for mobile devices.
>>> We need simpler and more efficient dmabuf direct I/O support.
>>>
>>> As Christian evaluated sendfile performance based on your data, could
>>> you confirm whether the cache was cleared? If not, please share the
>>> post-cache-clearing test data. Thank you for your support.
>>
>> Yes sorry, I was out yesterday riding motorcycles. I did not clear the cache for
>> the buffered reads, I didn't realize you had. The IO plus the copy certainly
>> explains the difference.
>>
>> Your point about the unlikelihood of any of that data being in the cache also
>> makes sense.
> [wangtao] Thank you for testing and clarifying.
>
>>
>> I'm not sure it changes anything about the ioctl approach though.
>> Another way to do this would be to move the (optional) support for direct IO
>> into the exporter via dma_buf_fops and dma_buf_ops. Then normal read()
>> syscalls would just work for buffers that support them.
>> I know that's more complicated, but at least it doesn't require inventing new
>> uapi to do it.
>>
> [wangtao] Thank you for the discussion. I fully support any method that enables
> dmabuf direct I/O.
>
> I understand using sendfile/splice with regular files for dmabuf
> adds an extra CPU copy, preventing zero-copy. For example:
> sendfile path: [DISK] → DMA → [page cache] → CPU copy → [memory file].
Yeah, but why can't you work on improving that?
> The read() syscall can't pass regular file fd parameters, so I added
> an ioctl command.
> While copy_file_range() supports two fds (fd_in/fd_out), it blocks cross-fs use.
> Even without this restriction, file_out->f_op->copy_file_range
> only enables dmabuf direct reads from regular files, not writes.
>
> Since dmabuf's direct I/O limitation comes from its unique
> attachment/map/fence model and lacks suitable syscalls, adding
> an ioctl seems necessary.
I absolutely don't see that. Both splice and sendfile can take two regular file descriptors.
That the underlying fops currently can't do that is not a valid argument for adding new uAPI. It just means that you need to work on improving those fops.
As long as nobody proves to me that the existing uAPI isn't sufficient for this use case I will systematically reject any approach to adding a new one.
Regards,
Christian.
> When system exporters return a duplicated sg_table via map_dma_buf
> (used exclusively like a pages array), they should retain control
> over it.
>
> I welcome all solutions to achieve dmabuf direct I/O! Your feedback
> is greatly appreciated.
>
>> 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
>> ------------------------|-------------------
>> udmabuf buffer read | 1210
>> udmabuf direct read | 671
>> udmabuf buffer sendfile | 1096
>> udmabuf direct sendfile | 2340
>>
>>
>>
>>>
>>>>>
>>>>>>> dmabuf buffer read | 51 | 1068 | 1118
>>>>>>> dmabuf direct read | 52 | 297 | 349
>>>>>>>
>>>>>>> udmabuf sendfile test steps:
>>>>>>> 1. Open data file (1024MB), get back_fd
>>>>>>> 2. Create memfd (32MB)   # Loop steps 2-6
>>>>>>> 3. Allocate udmabuf with memfd
>>>>>>> 4. Call sendfile(memfd, back_fd)
>>>>>>> 5. Close memfd after sendfile
>>>>>>> 6. Close udmabuf
>>>>>>> 7. Close back_fd
>>>>>>>
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>
>>>>>>
>>>
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-21 7:35 ` Christian König
@ 2025-05-21 10:25 ` wangtao
2025-05-21 11:56 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-21 10:25 UTC (permalink / raw)
To: Christian König, T.J. Mercier
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985, amir73il@gmail.com
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Wednesday, May 21, 2025 3:36 PM
> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
> <tjmercier@google.com>
> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
> dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> kernel@vger.kernel.org; wangbintian(BintianWang)
> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/21/25 06:17, wangtao wrote:
> >>> Reducing CPU overhead/power consumption is critical for mobile
> devices.
> >>> We need simpler and more efficient dmabuf direct I/O support.
> >>>
> >>> As Christian evaluated sendfile performance based on your data,
> >>> could you confirm whether the cache was cleared? If not, please
> >>> share the post-cache-clearing test data. Thank you for your support.
> >>
> >> Yes sorry, I was out yesterday riding motorcycles. I did not clear
> >> the cache for the buffered reads, I didn't realize you had. The IO
> >> plus the copy certainly explains the difference.
> >>
> >> Your point about the unlikelihood of any of that data being in the
> >> cache also makes sense.
> > [wangtao] Thank you for testing and clarifying.
> >
> >>
> >> I'm not sure it changes anything about the ioctl approach though.
> >> Another way to do this would be to move the (optional) support for
> >> direct IO into the exporter via dma_buf_fops and dma_buf_ops. Then
> >> normal read() syscalls would just work for buffers that support them.
> >> I know that's more complicated, but at least it doesn't require
> >> inventing new uapi to do it.
> >>
> > [wangtao] Thank you for the discussion. I fully support any method
> > that enables dmabuf direct I/O.
> >
> > I understand using sendfile/splice with regular files for dmabuf adds
> > an extra CPU copy, preventing zero-copy. For example:
> > sendfile path: [DISK] → DMA → [page cache] → CPU copy → [memory
> > file].
>
> Yeah, but why can't you work on improving that?
>
> > The read() syscall can't pass regular file fd parameters, so I added
> > an ioctl command.
> > While copy_file_range() supports two fds (fd_in/fd_out), it blocks
> > cross-fs use.
> > Even without this restriction, file_out->f_op->copy_file_range only
> > enables dmabuf direct reads from regular files, not writes.
> >
> > Since dmabuf's direct I/O limitation comes from its unique
> > attachment/map/fence model and lacks suitable syscalls, adding an
> > ioctl seems necessary.
>
> I absolutely don't see that. Both splice and sendfile can take two regular file
> descriptors.
>
> That the underlying fops currently can't do that is not a valid argument for
> adding new uAPI. It just means that you need to work on improving those
> fops.
>
> As long as nobody proves to me that the existing uAPI isn't sufficient for this
> use case I will systematically reject any approach to adding a new one.
>
[wangtao] I previously explained that read/sendfile/splice/copy_file_range
syscalls can't achieve dmabuf direct IO zero-copy.
1. read() can't pass regular file fd to dmabuf.
2. sendfile() supports regular file <-> regular file/socket zero-copy,
but not regular file <-> memory file.
Example:
sendfile(dst_net, src_disk):
[DISK] --DMA--> [page buffer] --DMA--> [NIC]
sendfile(dst_disk, src_disk):
[DISK] --DMA--> [page buffer] --DMA--> [DISK]
sendfile(dst_memfile, src_disk):
[DISK] --DMA--> [page buffer] --CPU copy--> [MEMORY file]
3. splice() requires one end to be a pipe, making it unsuitable.
4. copy_file_range() is blocked by cross-FS restrictions (Amir's commit
868f9f2f8e004bfe0d3935b1976f625b2924893b). Even without this,
file_out->f_op->copy_file_range only enables dmabuf read from regular files,
not write.
My focus is enabling dmabuf direct I/O for [regular file] <--DMA--> [dmabuf]
zero-copy. Any API achieving this would work. Are there other uAPIs you think
could help? Could you recommend experts who might offer suggestions?
Thank you.
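
For context, the cross-FS restriction mentioned in point 4 sits in
do_copy_file_range() in fs/read_write.c; roughly (paraphrased from memory of
the code after commit 868f9f2f8e00, check the tree you target for the
authoritative version):

	if (file_out->f_op->copy_file_range) {
		/* cross-sb copy is allowed only when both files share the
		 * same ->copy_file_range() implementation */
		if (file_in->f_op->copy_file_range !=
		    file_out->f_op->copy_file_range)
			return -EXDEV;
	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
		/* generic (splice-based) fallback stays same-sb only */
		return -EXDEV;
	}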
> Regards,
> Christian.
>
> > When system exporters return a duplicated sg_table via map_dma_buf
> > (used exclusively like a pages array), they should retain control over
> > it.
> >
> > I welcome all solutions to achieve dmabuf direct I/O! Your feedback is
> > greatly appreciated.
> >
> >> 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
> >> ------------------------|-------------------
> >> udmabuf buffer read | 1210
> >> udmabuf direct read | 671
> >> udmabuf buffer sendfile | 1096
> >> udmabuf direct sendfile | 2340
> >>
> >>
> >>
> >>>
> >>>>>
> >>>>>>> dmabuf buffer read | 51 | 1068 | 1118
> >>>>>>> dmabuf direct read | 52 | 297 | 349
> >>>>>>>
> >>>>>>> udmabuf sendfile test steps:
> >>>>>>> 1. Open data file (1024MB), get back_fd
> >>>>>>> 2. Create memfd (32MB)   # Loop steps 2-6
> >>>>>>> 3. Allocate udmabuf with memfd
> >>>>>>> 4. Call sendfile(memfd, back_fd)
> >>>>>>> 5. Close memfd after sendfile
> >>>>>>> 6. Close udmabuf
> >>>>>>> 7. Close back_fd
> >>>>>>>
> >>>>>>>>
> >>>>>>>> Regards,
> >>>>>>>> Christian.
> >>>>>>>
> >>>>>>
> >>>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-21 10:25 ` wangtao
@ 2025-05-21 11:56 ` Christian König
2025-05-22 8:02 ` wangtao
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-05-21 11:56 UTC (permalink / raw)
To: wangtao, T.J. Mercier
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985, amir73il@gmail.com
On 5/21/25 12:25, wangtao wrote:
> [wangtao] I previously explained that read/sendfile/splice/copy_file_range
> syscalls can't achieve dmabuf direct IO zero-copy.
And why can't you work on improving those syscalls instead of creating a new IOCTL?
> My focus is enabling dmabuf direct I/O for [regular file] <--DMA--> [dmabuf]
> zero-copy.
Yeah and that focus is wrong. You need to work on a general solution to the issue and not one specific to your problem.
> Any API achieving this would work. Are there other uAPIs you think
> could help? Could you recommend experts who might offer suggestions?
Well once more: either work on sendfile or copy_file_range or eventually splice to make them do what you want.
When that is done we can discuss with the VFS people if that approach is feasible.
But just bypassing the VFS review by implementing a DMA-buf specific IOCTL is a NO-GO. That is clearly not something you can do in any way.
Regards,
Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-21 11:56 ` Christian König
@ 2025-05-22 8:02 ` wangtao
2025-05-22 11:57 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-22 8:02 UTC (permalink / raw)
To: Christian König, T.J. Mercier
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985, amir73il@gmail.com
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Wednesday, May 21, 2025 7:57 PM
> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
> <tjmercier@google.com>
> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
> dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> kernel@vger.kernel.org; wangbintian(BintianWang)
> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>;
> amir73il@gmail.com
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/21/25 12:25, wangtao wrote:
> > [wangtao] I previously explained that
> > read/sendfile/splice/copy_file_range
> > syscalls can't achieve dmabuf direct IO zero-copy.
>
> And why can't you work on improving those syscalls instead of creating a new
> IOCTL?
>
[wangtao] As I mentioned in previous emails, these syscalls cannot
achieve dmabuf zero-copy due to technical constraints. Could you
specify the technical points, code, or principles that need
optimization?
Let me explain again why these syscalls can't work:
1. read() syscall
- dmabuf fops lacks read callback implementation. Even if implemented,
file_fd info cannot be transferred
- read(file_fd, dmabuf_ptr, len) with remap_pfn_range-based mmap
cannot access dmabuf_buf pages, forcing buffer-mode reads
2. sendfile() syscall
- Requires CPU copy from page cache to memory file(tmpfs/shmem):
[DISK] --DMA--> [page cache] --CPU copy--> [MEMORY file]
- CPU overhead (both buffer/direct modes involve copies):
55.08% do_sendfile
|- 55.08% do_splice_direct
|-|- 55.08% splice_direct_to_actor
|-|-|- 22.51% copy_splice_read
|-|-|-|- 16.57% f2fs_file_read_iter
|-|-|-|-|- 15.12% __iomap_dio_rw
|-|-|- 32.33% direct_splice_actor
|-|-|-|- 32.11% iter_file_splice_write
|-|-|-|-|- 28.42% vfs_iter_write
|-|-|-|-|-|- 28.42% do_iter_write
|-|-|-|-|-|-|- 28.39% shmem_file_write_iter
|-|-|-|-|-|-|-|- 24.62% generic_perform_write
|-|-|-|-|-|-|-|-|- 18.75% __pi_memmove
3. splice() requires one end to be a pipe, incompatible with regular files or dmabuf.
4. copy_file_range()
- Blocked by cross-FS restrictions (Amir's commit 868f9f2f8e00)
- Even without this restriction, implementing
the copy_file_range callback in dmabuf fops would only allow dmabuf read
from regular files. This is because copy_file_range relies on
file_out->f_op->copy_file_range, which cannot support dmabuf write
operations to regular files.
Test results confirm these limitations:
T.J. Mercier's 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
------------------------|-------------------
udmabuf buffer read | 1210
udmabuf direct read | 671
udmabuf buffer sendfile | 1096
udmabuf direct sendfile | 2340
My 3GHz CPU tests (cache cleared):
Method | alloc | read | vs. (%)
-----------------------------------------------
udmabuf buffer read | 135 | 546 | 180%
udmabuf direct read | 159 | 300 | 99%
udmabuf buffer sendfile | 134 | 303 | 100%
udmabuf direct sendfile | 141 | 912 | 301%
dmabuf buffer read | 22 | 362 | 119%
my patch direct read | 29 | 265 | 87%
My 1GHz CPU tests (cache cleared):
Method | alloc | read | vs. (%)
-----------------------------------------------
udmabuf buffer read | 552 | 2067 | 198%
udmabuf direct read | 540 | 627 | 60%
udmabuf buffer sendfile | 497 | 1045 | 100%
udmabuf direct sendfile | 527 | 2330 | 223%
dmabuf buffer read | 40 | 1111 | 106%
patch direct read | 44 | 310 | 30%
Test observations align with expectations:
1. dmabuf buffer read requires slow CPU copies
2. udmabuf direct read achieves zero-copy but has page retrieval
latency from vaddr
3. udmabuf buffer sendfile suffers CPU copy overhead
4. udmabuf direct sendfile combines CPU copies with frequent DMA
operations due to small pipe buffers
5. dmabuf buffer read also requires CPU copies
6. My direct read patch enables zero-copy with better performance
on low-power CPUs
7. udmabuf creation time remains problematic (as you’ve noted).
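
To make the comparison concrete, a minimal userspace sketch of what the
"udmabuf direct read" rows above measure, as I understand the test
(path/len/udmabuf_fd are placeholders, error handling omitted):

	/* O_DIRECT read() straight into the mmap'd udmabuf region, so the
	 * block layer DMAs into the buffer's pages; the pages are pinned
	 * with pin_user_pages_fast, which is the CPU cost noted above.
	 * The buffered variants drop the cache first with
	 * posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED). */
	int fd = open(path, O_RDONLY | O_DIRECT);
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			 udmabuf_fd, 0);
	ssize_t n = read(fd, buf, len);		/* len must be block-aligned */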
> > My focus is enabling dmabuf direct I/O for [regular file] <--DMA-->
> > [dmabuf] zero-copy.
>
> Yeah and that focus is wrong. You need to work on a general solution to the
> issue and not specific to your problem.
>
> > Any API achieving this would work. Are there other uAPIs you think
> > could help? Could you recommend experts who might offer suggestions?
>
> Well once more: Either work on sendfile or copy_file_range or eventually
> splice to make it what you want to do.
>
> When that is done we can discuss with the VFS people if that approach is
> feasible.
>
> But just bypassing the VFS review by implementing a DMA-buf specific IOCTL
> is a NO-GO. That is clearly not something you can do in any way.
[wangtao] The issue is that only dmabuf lacks Direct I/O zero-copy support. Tmpfs/shmem
already work with Direct I/O zero-copy. As explained, existing syscalls or
generic methods can't enable dmabuf direct I/O zero-copy, which is why I
propose adding an IOCTL command.
I respect your perspective. Could you clarify specific technical aspects,
code requirements, or implementation principles for modifying sendfile()
or copy_file_range()? This would help advance our discussion.
Thank you for engaging in this dialogue.
>
> Regards,
> Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-22 8:02 ` wangtao
@ 2025-05-22 11:57 ` Christian König
2025-05-22 12:29 ` wangtao
2025-05-27 14:35 ` wangtao
0 siblings, 2 replies; 28+ messages in thread
From: Christian König @ 2025-05-22 11:57 UTC (permalink / raw)
To: wangtao, T.J. Mercier
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985, amir73il@gmail.com
On 5/22/25 10:02, wangtao wrote:
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Wednesday, May 21, 2025 7:57 PM
>> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
>> <tjmercier@google.com>
>> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
>> Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
>> dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
>> kernel@vger.kernel.org; wangbintian(BintianWang)
>> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
>> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>;
>> amir73il@gmail.com
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> On 5/21/25 12:25, wangtao wrote:
>>> [wangtao] I previously explained that
>>> read/sendfile/splice/copy_file_range
>>> syscalls can't achieve dmabuf direct IO zero-copy.
>>
>> And why can't you work on improving those syscalls instead of creating a new
>> IOCTL?
>>
> [wangtao] As I mentioned in previous emails, these syscalls cannot
> achieve dmabuf zero-copy due to technical constraints.
Yeah, and why can't you work on removing those technical constraints?
What is blocking you from improving the sendfile system call or proposing a patch to remove the copy_file_range restrictions?
Regards,
Christian.
Could you
> specify the technical points, code, or principles that need
> optimization?
>
> Let me explain again why these syscalls can't work:
> 1. read() syscall
> - dmabuf fops lacks read callback implementation. Even if implemented,
> file_fd info cannot be transferred
> - read(file_fd, dmabuf_ptr, len) with remap_pfn_range-based mmap
> cannot access dmabuf_buf pages, forcing buffer-mode reads
>
> 2. sendfile() syscall
> - Requires CPU copy from page cache to memory file(tmpfs/shmem):
> [DISK] --DMA--> [page cache] --CPU copy--> [MEMORY file]
> - CPU overhead (both buffer/direct modes involve copies):
> 55.08% do_sendfile
> |- 55.08% do_splice_direct
> |-|- 55.08% splice_direct_to_actor
> |-|-|- 22.51% copy_splice_read
> |-|-|-|- 16.57% f2fs_file_read_iter
> |-|-|-|-|- 15.12% __iomap_dio_rw
> |-|-|- 32.33% direct_splice_actor
> |-|-|-|- 32.11% iter_file_splice_write
> |-|-|-|-|- 28.42% vfs_iter_write
> |-|-|-|-|-|- 28.42% do_iter_write
> |-|-|-|-|-|-|- 28.39% shmem_file_write_iter
> |-|-|-|-|-|-|-|- 24.62% generic_perform_write
> |-|-|-|-|-|-|-|-|- 18.75% __pi_memmove
>
> 3. splice() requires one end to be a pipe, incompatible with regular files or dmabuf.
>
> 4. copy_file_range()
> - Blocked by cross-FS restrictions (Amir's commit 868f9f2f8e00)
> - Even without this restriction, implementing
> the copy_file_range callback in dmabuf fops would only allow dmabuf read
> from regular files. This is because copy_file_range relies on
> file_out->f_op->copy_file_range, which cannot support dmabuf write
> operations to regular files.
>
> Test results confirm these limitations:
> T.J. Mercier's 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
> ------------------------|-------------------
> udmabuf buffer read | 1210
> udmabuf direct read | 671
> udmabuf buffer sendfile | 1096
> udmabuf direct sendfile | 2340
>
> My 3GHz CPU tests (cache cleared):
> Method | alloc | read | vs. (%)
> -----------------------------------------------
> udmabuf buffer read | 135 | 546 | 180%
> udmabuf direct read | 159 | 300 | 99%
> udmabuf buffer sendfile | 134 | 303 | 100%
> udmabuf direct sendfile | 141 | 912 | 301%
> dmabuf buffer read | 22 | 362 | 119%
> my patch direct read | 29 | 265 | 87%
>
> My 1GHz CPU tests (cache cleared):
> Method | alloc | read | vs. (%)
> -----------------------------------------------
> udmabuf buffer read | 552 | 2067 | 198%
> udmabuf direct read | 540 | 627 | 60%
> udmabuf buffer sendfile | 497 | 1045 | 100%
> udmabuf direct sendfile | 527 | 2330 | 223%
> dmabuf buffer read | 40 | 1111 | 106%
> patch direct read | 44 | 310 | 30%
>
> Test observations align with expectations:
> 1. dmabuf buffer read requires slow CPU copies
> 2. udmabuf direct read achieves zero-copy but has page retrieval
> latency from vaddr
> 3. udmabuf buffer sendfile suffers CPU copy overhead
> 4. udmabuf direct sendfile combines CPU copies with frequent DMA
> operations due to small pipe buffers
> 5. dmabuf buffer read also requires CPU copies
> 6. My direct read patch enables zero-copy with better performance
> on low-power CPUs
> 7. udmabuf creation time remains problematic (as you’ve noted).
>
>>> My focus is enabling dmabuf direct I/O for [regular file] <--DMA-->
>>> [dmabuf] zero-copy.
>>
>> Yeah and that focus is wrong. You need to work on a general solution to the
>> issue and not specific to your problem.
>>
>>> Any API achieving this would work. Are there other uAPIs you think
>>> could help? Could you recommend experts who might offer suggestions?
>>
>> Well once more: Either work on sendfile or copy_file_range or eventually
>> splice to make it what you want to do.
>>
>> When that is done we can discuss with the VFS people if that approach is
>> feasible.
>>
>> But just bypassing the VFS review by implementing a DMA-buf specific IOCTL
>> is a NO-GO. That is clearly not something you can do in any way.
> [wangtao] The issue is that only dmabuf lacks Direct I/O zero-copy support. Tmpfs/shmem
> already work with Direct I/O zero-copy. As explained, existing syscalls or
> generic methods can't enable dmabuf direct I/O zero-copy, which is why I
> propose adding an IOCTL command.
>
> I respect your perspective. Could you clarify specific technical aspects,
> code requirements, or implementation principles for modifying sendfile()
> or copy_file_range()? This would help advance our discussion.
>
> Thank you for engaging in this dialogue.
>
>>
>> Regards,
>> Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-22 11:57 ` Christian König
@ 2025-05-22 12:29 ` wangtao
2025-05-27 14:35 ` wangtao
1 sibling, 0 replies; 28+ messages in thread
From: wangtao @ 2025-05-22 12:29 UTC (permalink / raw)
To: Christian König, T.J. Mercier, amir73il@gmail.com,
viro@zeniv.linux.org.uk, brauner@kernel.org,
akpm@linux-foundation.org, hughd@google.com
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985
Apologies for interrupting the filesystem/memory experts. Because of dmabuf's
attachment/map/fence model, its mmap callback uses remap_pfn_range, so
read(file_fd, dmabuf_ptr, len) only works as buffered I/O, not Direct I/O
zero-copy. Embedded/mobile devices urgently need dmabuf Direct I/O for
large-file operations, and prior patches have attempted to add it.
While tmpfs/shmem support Direct I/O zero-copy, dmabuf does not. My patch
adds an ioctl command for dmabuf Direct I/O zero-copy, reaching >80% of the
device bandwidth even on low-power CPUs.
Christian argues udmabuf + sendfile/splice/copy_file_range could enable
zero-copy, but analysis and testing (detailed prior email) show these
syscalls fail for high-performance dmabuf Direct I/O:
1. sendfile(dst_memfile, src_disk): Requires page cache copies
[DISK] --DMA--> [page cache] --CPU copy--> [MEMORY file]
2. splice: Requires pipe endpoint (incompatible with files/dmabuf)
3. copy_file_range: Cross-FS prohibited
Technical question: Under fs/mm layer constraints, can/how should we modify
sendfile/splice/copy_file_range (or other syscalls) to achieve efficient
dmabuf Direct I/O zero-copy? Your insights on required syscall modifications
would be invaluable. Thank you for guidance.
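
For reference, a minimal userspace sketch of the sendfile path from point 1
(udmabuf creation from the memfd is omitted; path and chunk_len are
placeholders):

	/* sendfile(memfd <- disk): the disk read can be DMA, but filling
	 * the shmem-backed memfd still goes through
	 * shmem_file_write_iter()'s CPU copy, so it is not zero-copy. */
	int back_fd = open(path, O_RDONLY);		/* data file on disk */
	int mem_fd = memfd_create("dst", 0);		/* later wrapped by udmabuf */
	ftruncate(mem_fd, chunk_len);

	off_t off = 0;
	sendfile(mem_fd, back_fd, &off, chunk_len);	/* DISK -> page cache -> CPU copy -> memfd */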
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Thursday, May 22, 2025 7:58 PM
> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
> <tjmercier@google.com>
> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
> dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> kernel@vger.kernel.org; wangbintian(BintianWang)
> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>;
> amir73il@gmail.com
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/22/25 10:02, wangtao wrote:
> >> -----Original Message-----
> >> From: Christian König <christian.koenig@amd.com>
> >> Sent: Wednesday, May 21, 2025 7:57 PM
> >> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
> >> <tjmercier@google.com>
> >> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> >> Brian.Starkey@arm.com; jstultz@google.com;
> >> linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> >> linaro-mm-sig@lists.linaro.org; linux- kernel@vger.kernel.org;
> >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> >> <yipengxiang@honor.com>; liulu
> >> 00013167 <liulu.liu@honor.com>; hanfeng 00012985
> >> <feng.han@honor.com>; amir73il@gmail.com
> >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> >> DMA_BUF_IOCTL_RW_FILE for system_heap
> >>
> >> On 5/21/25 12:25, wangtao wrote:
> >>> [wangtao] I previously explained that
> >>> read/sendfile/splice/copy_file_range
> >>> syscalls can't achieve dmabuf direct IO zero-copy.
> >>
> >> And why can't you work on improving those syscalls instead of
> >> creating a new IOCTL?
> >>
> > [wangtao] As I mentioned in previous emails, these syscalls cannot
> > achieve dmabuf zero-copy due to technical constraints.
>
> Yeah, and why can't you work on removing those technical constraints?
>
> What is blocking you from improving the sendfile system call or proposing a
> patch to remove the copy_file_range restrictions?
>
> Regards,
> Christian.
>
> Could you
> > specify the technical points, code, or principles that need
> > optimization?
> >
> > Let me explain again why these syscalls can't work:
> > 1. read() syscall
> > - dmabuf fops lacks read callback implementation. Even if implemented,
> > file_fd info cannot be transferred
> > - read(file_fd, dmabuf_ptr, len) with remap_pfn_range-based mmap
> > cannot access dmabuf_buf pages, forcing buffer-mode reads
> >
> > 2. sendfile() syscall
> > - Requires CPU copy from page cache to memory file(tmpfs/shmem):
> > [DISK] --DMA--> [page cache] --CPU copy--> [MEMORY file]
> > - CPU overhead (both buffer/direct modes involve copies):
> > 55.08% do_sendfile
> > |- 55.08% do_splice_direct
> > |-|- 55.08% splice_direct_to_actor
> > |-|-|- 22.51% copy_splice_read
> > |-|-|-|- 16.57% f2fs_file_read_iter
> > |-|-|-|-|- 15.12% __iomap_dio_rw
> > |-|-|- 32.33% direct_splice_actor
> > |-|-|-|- 32.11% iter_file_splice_write
> > |-|-|-|-|- 28.42% vfs_iter_write
> > |-|-|-|-|-|- 28.42% do_iter_write
> > |-|-|-|-|-|-|- 28.39% shmem_file_write_iter
> > |-|-|-|-|-|-|-|- 24.62% generic_perform_write
> > |-|-|-|-|-|-|-|-|- 18.75% __pi_memmove
> >
> > 3. splice() requires one end to be a pipe, incompatible with regular files or
> dmabuf.
> >
> > 4. copy_file_range()
> > - Blocked by cross-FS restrictions (Amir's commit 868f9f2f8e00)
> > - Even without this restriction, implementing the copy_file_range
> > callback in dmabuf fops would only allow dmabuf read from regular
> > files. This is because copy_file_range relies on
> > file_out->f_op->copy_file_range, which cannot support dmabuf write
> > operations to regular files.
> >
> > Test results confirm these limitations:
> > T.J. Mercier's 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
> > ------------------------|-------------------
> > udmabuf buffer read | 1210
> > udmabuf direct read | 671
> > udmabuf buffer sendfile | 1096
> > udmabuf direct sendfile | 2340
> >
> > My 3GHz CPU tests (cache cleared):
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 135 | 546 | 180%
> > udmabuf direct read | 159 | 300 | 99%
> > udmabuf buffer sendfile | 134 | 303 | 100%
> > udmabuf direct sendfile | 141 | 912 | 301%
> > dmabuf buffer read | 22 | 362 | 119%
> > my patch direct read | 29 | 265 | 87%
> >
> > My 1GHz CPU tests (cache cleared):
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 552 | 2067 | 198%
> > udmabuf direct read | 540 | 627 | 60%
> > udmabuf buffer sendfile | 497 | 1045 | 100%
> > udmabuf direct sendfile | 527 | 2330 | 223%
> > dmabuf buffer read | 40 | 1111 | 106%
> > patch direct read | 44 | 310 | 30%
> >
> > Test observations align with expectations:
> > 1. dmabuf buffer read requires slow CPU copies
> > 2. udmabuf direct read achieves zero-copy but has page retrieval
> > latency from vaddr
> > 3. udmabuf buffer sendfile suffers CPU copy overhead
> > 4. udmabuf direct sendfile combines CPU copies with frequent DMA
> > operations due to small pipe buffers
> > 5. dmabuf buffer read also requires CPU copies
> > 6. My direct read patch enables zero-copy with better performance
> > on low-power CPUs
> > 7. udmabuf creation time remains problematic (as you’ve noted).
> >
> >>> My focus is enabling dmabuf direct I/O for [regular file] <--DMA-->
> >>> [dmabuf] zero-copy.
> >>
> >> Yeah and that focus is wrong. You need to work on a general solution
> >> to the issue and not specific to your problem.
> >>
> >>> Any API achieving this would work. Are there other uAPIs you think
> >>> could help? Could you recommend experts who might offer suggestions?
> >>
> >> Well once more: Either work on sendfile or copy_file_range or
> >> eventually splice to make it what you want to do.
> >>
> >> When that is done we can discuss with the VFS people if that approach
> >> is feasible.
> >>
> >> But just bypassing the VFS review by implementing a DMA-buf specific
> >> IOCTL is a NO-GO. That is clearly not something you can do in any way.
> > [wangtao] The issue is that only dmabuf lacks Direct I/O zero-copy
> > support. Tmpfs/shmem already work with Direct I/O zero-copy. As
> > explained, existing syscalls or generic methods can't enable dmabuf
> > direct I/O zero-copy, which is why I propose adding an IOCTL command.
> >
> > I respect your perspective. Could you clarify specific technical
> > aspects, code requirements, or implementation principles for modifying
> > sendfile() or copy_file_range()? This would help advance our discussion.
> >
> > Thank you for engaging in this dialogue.
> >
> >>
> >> Regards,
> >> Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-22 11:57 ` Christian König
2025-05-22 12:29 ` wangtao
@ 2025-05-27 14:35 ` wangtao
2025-05-27 15:10 ` Christian König
1 sibling, 1 reply; 28+ messages in thread
From: wangtao @ 2025-05-27 14:35 UTC (permalink / raw)
To: Christian König, T.J. Mercier
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985, amir73il@gmail.com, akpm@linux-foundation.org,
viro@zeniv.linux.org.uk, brauner@kernel.org, hughd@google.com
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Thursday, May 22, 2025 7:58 PM
> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
> <tjmercier@google.com>
> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
> dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
> kernel@vger.kernel.org; wangbintian(BintianWang)
> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>;
> amir73il@gmail.com
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/22/25 10:02, wangtao wrote:
> >> -----Original Message-----
> >> From: Christian König <christian.koenig@amd.com>
> >> Sent: Wednesday, May 21, 2025 7:57 PM
> >> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
> >> <tjmercier@google.com>
> >> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
> >> Brian.Starkey@arm.com; jstultz@google.com;
> >> linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
> >> linaro-mm-sig@lists.linaro.org; linux- kernel@vger.kernel.org;
> >> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
> >> <yipengxiang@honor.com>; liulu
> >> 00013167 <liulu.liu@honor.com>; hanfeng 00012985
> >> <feng.han@honor.com>; amir73il@gmail.com
> >> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
> >> DMA_BUF_IOCTL_RW_FILE for system_heap
> >>
> >> On 5/21/25 12:25, wangtao wrote:
> >>> [wangtao] I previously explained that
> >>> read/sendfile/splice/copy_file_range
> >>> syscalls can't achieve dmabuf direct IO zero-copy.
> >>
> >> And why can't you work on improving those syscalls instead of
> >> creating a new IOCTL?
> >>
> > [wangtao] As I mentioned in previous emails, these syscalls cannot
> > achieve dmabuf zero-copy due to technical constraints.
>
> Yeah, and why can't you work on removing those technical constraints?
>
> What is blocking you from improving the sendfile system call or proposing a
> patch to remove the copy_file_range restrictions?
[wangtao] Since sendfile/splice can't eliminate CPU copies, I skipped the
cross-FS checks in copy_file_range when copying between memory files and
disk files. I will send new patches after completing the shmem/udmabuf
callbacks; a rough sketch of that direction follows the table below.
Thank you for your attention to this issue.
UFS 4.0 device @4GB/s, Arm64 CPU @1GHz:
| Metrics |Creat(us)|Close(us)| I/O(us) |I/O(MB/s)| Vs.%
|--------------------------|---------|---------|---------|---------|-------
| 0) dmabuf buffer read | 46898 | 4804 | 1173661 | 914 | 100%
| 1) udmabuf buffer read | 593844 | 337111 | 2144681 | 500 | 54%
| 2) memfd buffer read | 1029 | 305322 | 2215859 | 484 | 52%
| 3) memfd direct read | 562 | 295239 | 1019913 | 1052 | 115%
| 4) memfd buffer sendfile | 785 | 299026 | 1431304 | 750 | 82%
| 5) memfd direct sendfile | 718 | 296307 | 2622270 | 409 | 44%
| 6) memfd buffer splice | 981 | 299694 | 1573710 | 682 | 74%
| 7) memfd direct splice | 890 | 302509 | 1269757 | 845 | 92%
| 8) memfd buffer c_f_r | 33 | 4432 | N/A | N/A | N/A
| 9) memfd direct c_f_r | 27 | 4421 | N/A | N/A | N/A
|10) memfd buffer sendfile | 595797 | 423105 | 1242494 | 864 | 94%
|11) memfd direct sendfile | 593758 | 357921 | 2344001 | 458 | 50%
|12) memfd buffer splice | 623221 | 356212 | 1117507 | 960 | 105%
|13) memfd direct splice | 587059 | 345484 | 857103 | 1252 | 136%
|14) udmabuf buffer c_f_r | 22725 | 10248 | N/A | N/A | N/A
|15) udmabuf direct c_f_r | 20120 | 9952 | N/A | N/A | N/A
|16) dmabuf buffer c_f_r | 46517 | 4708 | 857587 | 1252 | 136%
|17) dmabuf direct c_f_r | 47339 | 4661 | 284023 | 3780 | 413%
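
For illustration, the copy_file_range direction could look roughly like the
following on the dmabuf side; the hook name and wiring are hypothetical and
stand in for whatever the forthcoming patches actually add:

static ssize_t dma_buf_copy_file_range(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       size_t len, unsigned int flags)
{
	struct dma_buf *dmabuf = file_out->private_data;

	if (!dmabuf->ops->rw_file)	/* hypothetical exporter hook */
		return -EOPNOTSUPP;

	/* the exporter issues direct I/O from file_in into its pages */
	return dmabuf->ops->rw_file(dmabuf, file_in, pos_in, pos_out,
				    len, /*is_read=*/true);
}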
>
> Regards,
> Christian.
>
> Could you
> > specify the technical points, code, or principles that need
> > optimization?
> >
> > Let me explain again why these syscalls can't work:
> > 1. read() syscall
> > - dmabuf fops lacks read callback implementation. Even if implemented,
> > file_fd info cannot be transferred
> > - read(file_fd, dmabuf_ptr, len) with remap_pfn_range-based mmap
> > cannot access dmabuf_buf pages, forcing buffer-mode reads
> >
> > 2. sendfile() syscall
> > - Requires CPU copy from page cache to memory file(tmpfs/shmem):
> > [DISK] --DMA--> [page cache] --CPU copy--> [MEMORY file]
> > - CPU overhead (both buffer/direct modes involve copies):
> > 55.08% do_sendfile
> > |- 55.08% do_splice_direct
> > |-|- 55.08% splice_direct_to_actor
> > |-|-|- 22.51% copy_splice_read
> > |-|-|-|- 16.57% f2fs_file_read_iter
> > |-|-|-|-|- 15.12% __iomap_dio_rw
> > |-|-|- 32.33% direct_splice_actor
> > |-|-|-|- 32.11% iter_file_splice_write
> > |-|-|-|-|- 28.42% vfs_iter_write
> > |-|-|-|-|-|- 28.42% do_iter_write
> > |-|-|-|-|-|-|- 28.39% shmem_file_write_iter
> > |-|-|-|-|-|-|-|- 24.62% generic_perform_write
> > |-|-|-|-|-|-|-|-|- 18.75% __pi_memmove
> >
> > 3. splice() requires one end to be a pipe, incompatible with regular files or
> dmabuf.
> >
> > 4. copy_file_range()
> > - Blocked by cross-FS restrictions (Amir's commit 868f9f2f8e00)
> > - Even without this restriction, implementing the copy_file_range
> > callback in dmabuf fops would only allow dmabuf read from regular
> > files. This is because copy_file_range relies on
> > file_out->f_op->copy_file_range, which cannot support dmabuf write
> > operations to regular files.
> >
> > Test results confirm these limitations:
> > T.J. Mercier's 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
> > ------------------------|-------------------
> > udmabuf buffer read | 1210
> > udmabuf direct read | 671
> > udmabuf buffer sendfile | 1096
> > udmabuf direct sendfile | 2340
> >
> > My 3GHz CPU tests (cache cleared):
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 135 | 546 | 180%
> > udmabuf direct read | 159 | 300 | 99%
> > udmabuf buffer sendfile | 134 | 303 | 100%
> > udmabuf direct sendfile | 141 | 912 | 301%
> > dmabuf buffer read | 22 | 362 | 119%
> > my patch direct read | 29 | 265 | 87%
> >
> > My 1GHz CPU tests (cache cleared):
> > Method | alloc | read | vs. (%)
> > -----------------------------------------------
> > udmabuf buffer read | 552 | 2067 | 198%
> > udmabuf direct read | 540 | 627 | 60%
> > udmabuf buffer sendfile | 497 | 1045 | 100%
> > udmabuf direct sendfile | 527 | 2330 | 223%
> > dmabuf buffer read | 40 | 1111 | 106%
> > patch direct read | 44 | 310 | 30%
> >
> > Test observations align with expectations:
> > 1. dmabuf buffer read requires slow CPU copies
> > 2. udmabuf direct read achieves zero-copy but has page retrieval
> > latency from vaddr
> > 3. udmabuf buffer sendfile suffers CPU copy overhead
> > 4. udmabuf direct sendfile combines CPU copies with frequent DMA
> > operations due to small pipe buffers
> > 5. dmabuf buffer read also requires CPU copies
> > 6. My direct read patch enables zero-copy with better performance
> > on low-power CPUs
> > 7. udmabuf creation time remains problematic (as you’ve noted).
> >
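The "udmabuf direct read" case behind observations 2 and 7 above can be
sketched roughly like this, using the upstream memfd and udmabuf uAPIs. It is
a sketch only: the path and length are placeholders, len is assumed to satisfy
O_DIRECT alignment, and error handling is omitted for brevity:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/udmabuf.h>

/* Back a dma-buf with shmem pages, then O_DIRECT read into their mapping. */
static int udmabuf_direct_read(const char *path, size_t len)
{
	struct udmabuf_create create = { 0 };
	int memfd, devfd, buf_fd, file_fd;
	void *map;
	ssize_t n;

	memfd = memfd_create("udmabuf-backing", MFD_ALLOW_SEALING);
	ftruncate(memfd, len);
	/* udmabuf requires the backing memfd to be sealed against shrinking. */
	fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);

	devfd = open("/dev/udmabuf", O_RDWR);
	create.memfd  = memfd;
	create.offset = 0;
	create.size   = len;
	/* The slow step from observation 7: pages are set up before any I/O. */
	buf_fd = ioctl(devfd, UDMABUF_CREATE, &create);

	map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, buf_fd, 0);

	file_fd = open(path, O_RDONLY | O_DIRECT);
	n = read(file_fd, map, len);	/* direct I/O lands in the shmem pages */

	close(file_fd);
	munmap(map, len);
	close(buf_fd);
	close(devfd);
	close(memfd);
	return n == (ssize_t)len ? 0 : -1;
}

The read itself avoids the CPU copy, but the buffer first has to be built
through udmabuf, which is where the creation-time cost in the tables comes from.
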
> >>> My focus is enabling dmabuf direct I/O for [regular file] <--DMA-->
> >>> [dmabuf] zero-copy.
> >>
> >> Yeah and that focus is wrong. You need to work on a general solution
> >> to the issue and not specific to your problem.
> >>
> >>> Any API achieving this would work. Are there other uAPIs you think
> >>> could help? Could you recommend experts who might offer suggestions?
> >>
> >> Well once more: Either work on sendfile or copy_file_range or
> >> eventually splice to make it what you want to do.
> >>
> >> When that is done we can discuss with the VFS people if that approach
> >> is feasible.
> >>
> >> But just bypassing the VFS review by implementing a DMA-buf specific
> >> IOCTL is a NO-GO. That is clearly not something you can do in any way.
> > [wangtao] The issue is that only dmabuf lacks Direct I/O zero-copy
> > support. Tmpfs/shmem already work with Direct I/O zero-copy. As
> > explained, existing syscalls or generic methods can't enable dmabuf
> > direct I/O zero-copy, which is why I propose adding an IOCTL command.
> >
> > I respect your perspective. Could you clarify specific technical
> > aspects, code requirements, or implementation principles for modifying
> > sendfile() or copy_file_range()? This would help advance our discussion.
> >
> > Thank you for engaging in this dialogue.
> >
> >>
> >> Regards,
> >> Christian.
* Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
2025-05-27 14:35 ` wangtao
@ 2025-05-27 15:10 ` Christian König
0 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2025-05-27 15:10 UTC (permalink / raw)
To: wangtao, T.J. Mercier
Cc: sumit.semwal@linaro.org, benjamin.gaignard@collabora.com,
Brian.Starkey@arm.com, jstultz@google.com,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org,
linaro-mm-sig@lists.linaro.org, linux-kernel@vger.kernel.org,
wangbintian(BintianWang), yipengxiang, liulu 00013167,
hanfeng 00012985, amir73il@gmail.com, akpm@linux-foundation.org,
viro@zeniv.linux.org.uk, brauner@kernel.org, hughd@google.com
On 5/27/25 16:35, wangtao wrote:
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Thursday, May 22, 2025 7:58 PM
>> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
>> <tjmercier@google.com>
>> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
>> Brian.Starkey@arm.com; jstultz@google.com; linux-media@vger.kernel.org;
>> dri-devel@lists.freedesktop.org; linaro-mm-sig@lists.linaro.org; linux-
>> kernel@vger.kernel.org; wangbintian(BintianWang)
>> <bintian.wang@honor.com>; yipengxiang <yipengxiang@honor.com>; liulu
>> 00013167 <liulu.liu@honor.com>; hanfeng 00012985 <feng.han@honor.com>;
>> amir73il@gmail.com
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> On 5/22/25 10:02, wangtao wrote:
>>>> -----Original Message-----
>>>> From: Christian König <christian.koenig@amd.com>
>>>> Sent: Wednesday, May 21, 2025 7:57 PM
>>>> To: wangtao <tao.wangtao@honor.com>; T.J. Mercier
>>>> <tjmercier@google.com>
>>>> Cc: sumit.semwal@linaro.org; benjamin.gaignard@collabora.com;
>>>> Brian.Starkey@arm.com; jstultz@google.com;
>>>> linux-media@vger.kernel.org; dri-devel@lists.freedesktop.org;
>>>> linaro-mm-sig@lists.linaro.org; linux- kernel@vger.kernel.org;
>>>> wangbintian(BintianWang) <bintian.wang@honor.com>; yipengxiang
>>>> <yipengxiang@honor.com>; liulu
>>>> 00013167 <liulu.liu@honor.com>; hanfeng 00012985
>>>> <feng.han@honor.com>; amir73il@gmail.com
>>>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement
>>>> DMA_BUF_IOCTL_RW_FILE for system_heap
>>>>
>>>> On 5/21/25 12:25, wangtao wrote:
>>>>> [wangtao] I previously explained that
>>>>> read/sendfile/splice/copy_file_range
>>>>> syscalls can't achieve dmabuf direct IO zero-copy.
>>>>
>>>> And why can't you work on improving those syscalls instead of
>>>> creating a new IOCTL?
>>>>
>>> [wangtao] As I mentioned in previous emails, these syscalls cannot
>>> achieve dmabuf zero-copy due to technical constraints.
>>
>> Yeah, and why can't you work on removing those technical constrains?
>>
>> What is blocking you from improving the sendfile system call or proposing a
>> patch to remove the copy_file_range restrictions?
> [wangtao] Since sendfile/splice can't eliminate CPU copies, I skipped the cross-FS
> checks in copy_file_range when copying between memory files and disk files.
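To make the blocked case concrete, the call in question is an ordinary
copy_file_range() from a disk file into a memfd (tmpfs/shmem) file, roughly as
in the sketch below. Under the cross-FS restriction from commit 868f9f2f8e00
it is expected to fail with EXDEV instead of copying; the path and size here
are placeholders:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Cross-FS copy_file_range(): disk file -> memfd-backed (shmem) file. */
int main(void)
{
	int in_fd = open("/data/big.bin", O_RDONLY);	/* placeholder path */
	int out_fd = memfd_create("dst", 0);
	size_t len = 64 << 20;				/* placeholder size */
	ssize_t n;

	if (in_fd < 0 || out_fd < 0)
		return 1;

	n = copy_file_range(in_fd, NULL, out_fd, NULL, len, 0);
	if (n < 0)
		perror("copy_file_range");	/* EXDEV with the cross-FS check in place */

	close(in_fd);
	close(out_fd);
	return n < 0;
}

This is the check that the quoted experiment skips for copies between memory
files and disk files; whether that relaxation is acceptable is exactly the VFS
question raised below.
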
It will probably be a longer discussion, but I think that having the FS people take a look as well is clearly mandatory.
If Linus or any of the other maintainers then says that this isn't going to fly, we can still look into alternatives.
Thanks,
Christian.
> Will send new patches after completing shmem/udmabuf callback.
> Thank you for your attention to this issue.
>
> UFS 4.0 device @4GB/s, Arm64 CPU @1GHz:
> | Metrics |Creat(us)|Close(us)| I/O(us) |I/O(MB/s)| Vs.%
> |--------------------------|---------|---------|---------|---------|-------
> | 0) dmabuf buffer read | 46898 | 4804 | 1173661 | 914 | 100%
> | 1) udmabuf buffer read | 593844 | 337111 | 2144681 | 500 | 54%
> | 2) memfd buffer read | 1029 | 305322 | 2215859 | 484 | 52%
> | 3) memfd direct read | 562 | 295239 | 1019913 | 1052 | 115%
> | 4) memfd buffer sendfile | 785 | 299026 | 1431304 | 750 | 82%
> | 5) memfd direct sendfile | 718 | 296307 | 2622270 | 409 | 44%
> | 6) memfd buffer splice | 981 | 299694 | 1573710 | 682 | 74%
> | 7) memfd direct splice | 890 | 302509 | 1269757 | 845 | 92%
> | 8) memfd buffer c_f_r | 33 | 4432 | N/A | N/A | N/A
> | 9) memfd direct c_f_r | 27 | 4421 | N/A | N/A | N/A
> |10) memfd buffer sendfile | 595797 | 423105 | 1242494 | 864 | 94%
> |11) memfd direct sendfile | 593758 | 357921 | 2344001 | 458 | 50%
> |12) memfd buffer splice | 623221 | 356212 | 1117507 | 960 | 105%
> |13) memfd direct splice | 587059 | 345484 | 857103 | 1252 | 136%
> |14) udmabuf buffer c_f_r | 22725 | 10248 | N/A | N/A | N/A
> |15) udmabuf direct c_f_r | 20120 | 9952 | N/A | N/A | N/A
> |16) dmabuf buffer c_f_r | 46517 | 4708 | 857587 | 1252 | 136%
> |17) dmabuf direct c_f_r | 47339 | 4661 | 284023 | 3780 | 413%
>
>>
>> Regards,
>> Christian.
>>
>>> Could you specify the technical points, code, or principles that need optimization?
>>>
>>> Let me explain again why these syscalls can't work:
>>> 1. read() syscall
>>> - dmabuf fops lacks read callback implementation. Even if implemented,
>>> file_fd info cannot be transferred
>>> - read(file_fd, dmabuf_ptr, len) with remap_pfn_range-based mmap
>>> cannot access dmabuf_buf pages, forcing buffer-mode reads
>>>
>>> 2. sendfile() syscall
>>> - Requires CPU copy from page cache to memory file (tmpfs/shmem):
>>> [DISK] --DMA--> [page cache] --CPU copy--> [MEMORY file]
>>> - CPU overhead (both buffer/direct modes involve copies):
>>> 55.08% do_sendfile
>>> |- 55.08% do_splice_direct
>>> |-|- 55.08% splice_direct_to_actor
>>> |-|-|- 22.51% copy_splice_read
>>> |-|-|-|- 16.57% f2fs_file_read_iter
>>> |-|-|-|-|- 15.12% __iomap_dio_rw
>>> |-|-|- 32.33% direct_splice_actor
>>> |-|-|-|- 32.11% iter_file_splice_write
>>> |-|-|-|-|- 28.42% vfs_iter_write
>>> |-|-|-|-|-|- 28.42% do_iter_write
>>> |-|-|-|-|-|-|- 28.39% shmem_file_write_iter
>>> |-|-|-|-|-|-|-|- 24.62% generic_perform_write
>>> |-|-|-|-|-|-|-|-|- 18.75% __pi_memmove
>>>
>>> 3. splice() requires one end to be a pipe, incompatible with regular files or dmabuf.
>>>
>>> 4. copy_file_range()
>>> - Blocked by cross-FS restrictions (Amir's commit 868f9f2f8e00)
>>> - Even without this restriction, implementing the copy_file_range
>>> callback in dmabuf fops would only allow dmabuf read from regular
>>> files. This is because copy_file_range relies on
>>> file_out->f_op->copy_file_range, which cannot support dmabuf write
>>> operations to regular files.
>>>
>>> Test results confirm these limitations:
>>> T.J. Mercier's 1G from ext4 on 6.12.20 | read/sendfile (ms) w/ 3 > drop_caches
>>> ------------------------|-------------------
>>> udmabuf buffer read | 1210
>>> udmabuf direct read | 671
>>> udmabuf buffer sendfile | 1096
>>> udmabuf direct sendfile | 2340
>>>
>>> My 3GHz CPU tests (cache cleared):
>>> Method | alloc | read | vs. (%)
>>> -----------------------------------------------
>>> udmabuf buffer read | 135 | 546 | 180%
>>> udmabuf direct read | 159 | 300 | 99%
>>> udmabuf buffer sendfile | 134 | 303 | 100%
>>> udmabuf direct sendfile | 141 | 912 | 301%
>>> dmabuf buffer read | 22 | 362 | 119%
>>> my patch direct read | 29 | 265 | 87%
>>>
>>> My 1GHz CPU tests (cache cleared):
>>> Method | alloc | read | vs. (%)
>>> -----------------------------------------------
>>> udmabuf buffer read | 552 | 2067 | 198%
>>> udmabuf direct read | 540 | 627 | 60%
>>> udmabuf buffer sendfile | 497 | 1045 | 100%
>>> udmabuf direct sendfile | 527 | 2330 | 223%
>>> dmabuf buffer read | 40 | 1111 | 106%
>>> patch direct read | 44 | 310 | 30%
>>>
>>> Test observations align with expectations:
>>> 1. dmabuf buffer read requires slow CPU copies
>>> 2. udmabuf direct read achieves zero-copy but has page retrieval
>>>    latency from vaddr
>>> 3. udmabuf buffer sendfile suffers CPU copy overhead
>>> 4. udmabuf direct sendfile combines CPU copies with frequent DMA
>>>    operations due to small pipe buffers
>>> 5. dmabuf buffer read also requires CPU copies
>>> 6. My direct read patch enables zero-copy with better performance
>>>    on low-power CPUs
>>> 7. udmabuf creation time remains problematic (as you’ve noted).
>>>
>>>>> My focus is enabling dmabuf direct I/O for [regular file] <--DMA-->
>>>>> [dmabuf] zero-copy.
>>>>
>>>> Yeah and that focus is wrong. You need to work on a general solution
>>>> to the issue and not specific to your problem.
>>>>
>>>>> Any API achieving this would work. Are there other uAPIs you think
>>>>> could help? Could you recommend experts who might offer suggestions?
>>>>
>>>> Well once more: Either work on sendfile or copy_file_range or
>>>> eventually splice to make it what you want to do.
>>>>
>>>> When that is done we can discuss with the VFS people if that approach
>>>> is feasible.
>>>>
>>>> But just bypassing the VFS review by implementing a DMA-buf specific
>>>> IOCTL is a NO-GO. That is clearly not something you can do in any way.
>>> [wangtao] The issue is that only dmabuf lacks Direct I/O zero-copy
>>> support. Tmpfs/shmem already work with Direct I/O zero-copy. As
>>> explained, existing syscalls or generic methods can't enable dmabuf
>>> direct I/O zero-copy, which is why I propose adding an IOCTL command.
>>>
>>> I respect your perspective. Could you clarify specific technical
>>> aspects, code requirements, or implementation principles for modifying
>>> sendfile() or copy_file_range()? This would help advance our discussion.
>>>
>>> Thank you for engaging in this dialogue.
>>>
>>>>
>>>> Regards,
>>>> Christian.
>
end of thread, other threads:[~2025-05-27 15:10 UTC | newest]
Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-13 9:28 [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap wangtao
2025-05-13 11:32 ` Christian König
2025-05-13 12:30 ` wangtao
2025-05-13 13:17 ` Christian König
2025-05-14 11:02 ` wangtao
2025-05-14 12:00 ` Christian König
2025-05-15 14:03 ` wangtao
2025-05-15 14:26 ` Christian König
2025-05-16 7:40 ` wangtao
2025-05-16 8:36 ` Christian König
2025-05-16 9:49 ` wangtao
2025-05-16 10:29 ` Christian König
2025-05-19 4:08 ` wangtao
2025-05-19 7:47 ` Christian König
2025-05-16 18:37 ` T.J. Mercier
2025-05-19 4:37 ` wangtao
2025-05-19 12:03 ` wangtao
2025-05-20 4:06 ` wangtao
2025-05-21 2:00 ` T.J. Mercier
2025-05-21 4:17 ` wangtao
2025-05-21 7:35 ` Christian König
2025-05-21 10:25 ` wangtao
2025-05-21 11:56 ` Christian König
2025-05-22 8:02 ` wangtao
2025-05-22 11:57 ` Christian König
2025-05-22 12:29 ` wangtao
2025-05-27 14:35 ` wangtao
2025-05-27 15:10 ` Christian König