Re: [PATCH v2 2/2] udmabuf: Add back support for mapping hugetlb pages (v2)

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Mike Kravetz <mike.kravetz@oracle.com>
To: Vivek Kasireddy <vivek.kasireddy@intel.com>
Cc: Dongwon Kim <dongwon.kim@intel.com>,
	David Hildenbrand <david@redhat.com>,
	Junxiao Chang <junxiao.chang@intel.com>,
	Hugh Dickins <hughd@google.com>, Peter Xu <peterx@redhat.com>,
	linux-mm@kvack.org, dri-devel@lists.freedesktop.org,
	Jason Gunthorpe <jgg@nvidia.com>,
	Gerd Hoffmann <kraxel@redhat.com>
Subject: Re: [PATCH v2 2/2] udmabuf: Add back support for mapping hugetlb pages (v2)
Date: Tue, 18 Jul 2023 16:44:36 -0700	[thread overview]
Message-ID: <20230718234436.GB10413@monkey> (raw)
In-Reply-To: <20230718082605.1570740-3-vivek.kasireddy@intel.com>

On 07/18/23 01:26, Vivek Kasireddy wrote:
> A user or admin can configure a VMM (Qemu) Guest's memory to be
> backed by hugetlb pages for various reasons. However, a Guest OS
> would still allocate (and pin) buffers that are backed by regular
> 4k sized pages. In order to map these buffers and create dma-bufs
> for them on the Host, we first need to find the hugetlb pages where
> the buffer allocations are located and then determine the offsets
> of individual chunks (within those pages) and use this information
> to eventually populate a scatterlist.
> 
> Testcase: default_hugepagesz=2M hugepagesz=2M hugepages=2500 options
> were passed to the Host kernel and Qemu was launched with these
> relevant options: qemu-system-x86_64 -m 4096m....
> -device virtio-gpu-pci,max_outputs=1,blob=true,xres=1920,yres=1080
> -display gtk,gl=on
> -object memory-backend-memfd,hugetlb=on,id=mem1,size=4096M
> -machine memory-backend=mem1
> 
> Replacing -display gtk,gl=on with -display gtk,gl=off above would
> exercise the mmap handler.
> 
> v2: Updated get_sg_table() to manually populate the scatterlist for
>     both huge page and non-huge-page cases.
> 
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Jason Gunthorpe <jgg@nvidia.com>
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Junxiao Chang <junxiao.chang@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
>  drivers/dma-buf/udmabuf.c | 84 +++++++++++++++++++++++++++++++++------
>  1 file changed, 71 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> index 820c993c8659..10c47bf77fb5 100644
> --- a/drivers/dma-buf/udmabuf.c
> +++ b/drivers/dma-buf/udmabuf.c
> @@ -10,6 +10,7 @@
>  #include <linux/miscdevice.h>
>  #include <linux/module.h>
>  #include <linux/shmem_fs.h>
> +#include <linux/hugetlb.h>
>  #include <linux/slab.h>
>  #include <linux/udmabuf.h>
>  #include <linux/vmalloc.h>
> @@ -28,6 +29,7 @@ struct udmabuf {
>  	struct page **pages;
>  	struct sg_table *sg;
>  	struct miscdevice *device;
> +	pgoff_t *offsets;
>  };
>  
>  static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> @@ -41,6 +43,10 @@ static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>  
>  	pfn = page_to_pfn(ubuf->pages[pgoff]);
> +	if (ubuf->offsets) {
> +		pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
> +	}
> +
>  	return vmf_insert_pfn(vma, vmf->address, pfn);
>  }
>  
> @@ -90,23 +96,31 @@ static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf,
>  {
>  	struct udmabuf *ubuf = buf->priv;
>  	struct sg_table *sg;
> +	struct scatterlist *sgl;
> +	pgoff_t offset;
> +	unsigned long i = 0;
>  	int ret;
>  
>  	sg = kzalloc(sizeof(*sg), GFP_KERNEL);
>  	if (!sg)
>  		return ERR_PTR(-ENOMEM);
> -	ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
> -					0, ubuf->pagecount << PAGE_SHIFT,
> -					GFP_KERNEL);
> +
> +	ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
>  	if (ret < 0)
> -		goto err;
> +		goto err_alloc;
> +
> +	for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) {
> +		offset = ubuf->offsets ? ubuf->offsets[i] : 0;
> +		sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, offset);
> +	}
>  	ret = dma_map_sgtable(dev, sg, direction, 0);
>  	if (ret < 0)
> -		goto err;
> +		goto err_map;
>  	return sg;
>  
> -err:
> +err_map:
>  	sg_free_table(sg);
> +err_alloc:
>  	kfree(sg);
>  	return ERR_PTR(ret);
>  }
> @@ -143,6 +157,7 @@ static void release_udmabuf(struct dma_buf *buf)
>  
>  	for (pg = 0; pg < ubuf->pagecount; pg++)
>  		put_page(ubuf->pages[pg]);
> +	kfree(ubuf->offsets);
>  	kfree(ubuf->pages);
>  	kfree(ubuf);
>  }
> @@ -206,7 +221,9 @@ static long udmabuf_create(struct miscdevice *device,
>  	struct udmabuf *ubuf;
>  	struct dma_buf *buf;
>  	pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
> -	struct page *page;
> +	struct page *page, *hpage = NULL;
> +	pgoff_t hpoff, chunkoff, maxchunks;
> +	struct hstate *hpstate;
>  	int seals, ret = -EINVAL;
>  	u32 i, flags;
>  
> @@ -242,7 +259,7 @@ static long udmabuf_create(struct miscdevice *device,
>  		if (!memfd)
>  			goto err;
>  		mapping = memfd->f_mapping;
> -		if (!shmem_mapping(mapping))
> +		if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
>  			goto err;
>  		seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
>  		if (seals == -EINVAL)
> @@ -253,16 +270,56 @@ static long udmabuf_create(struct miscdevice *device,
>  			goto err;
>  		pgoff = list[i].offset >> PAGE_SHIFT;
>  		pgcnt = list[i].size   >> PAGE_SHIFT;
> +		if (is_file_hugepages(memfd)) {
> +			if (!ubuf->offsets) {
> +				ubuf->offsets = kmalloc_array(ubuf->pagecount,
> +							      sizeof(*ubuf->offsets),
> +							      GFP_KERNEL);
> +				if (!ubuf->offsets) {
> +					ret = -ENOMEM;
> +					goto err;
> +				}
> +			}
> +			hpstate = hstate_file(memfd);
> +			hpoff = list[i].offset >> huge_page_shift(hpstate);
> +			chunkoff = (list[i].offset &
> +				    ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
> +			maxchunks = huge_page_size(hpstate) >> PAGE_SHIFT;
> +		}
>  		for (pgidx = 0; pgidx < pgcnt; pgidx++) {
> -			page = shmem_read_mapping_page(mapping, pgoff + pgidx);
> -			if (IS_ERR(page)) {
> -				ret = PTR_ERR(page);
> -				goto err;
> +			if (is_file_hugepages(memfd)) {
> +				if (!hpage) {
> +					hpage = find_get_page_flags(mapping, hpoff,
> +								    FGP_ACCESSED);
> +					if (!hpage) {
> +						ret = -EINVAL;
> +						goto err;
> +					}
> +				}
> +				get_page(hpage);

Is the intention to increase the ref count of the hugetlb page once for
each 'sub-page' added?  Or, am I reading that incorrectly?

> +				ubuf->pages[pgbuf] = hpage;

Ah!, answering my own question.  Since the 'head page' is added to the
array the ref count of the head page will decremented in
release_udmabuf.

> +				ubuf->offsets[pgbuf++] = chunkoff << PAGE_SHIFT;
> +				if (++chunkoff == maxchunks) {
> +					put_page(hpage);
> +					hpage = NULL;
> +					chunkoff = 0;
> +					hpoff++;
> +				}
> +			} else {
> +				page = shmem_read_mapping_page(mapping, pgoff + pgidx);

It may not matter to your users, but the semantics for hugetlb and shmem
pages is different.  hugetlb requires the pages exist in the page cache
while shmem will create/add pages to the cache if necessary.

> +				if (IS_ERR(page)) {
> +					ret = PTR_ERR(page);
> +					goto err;
> +				}
> +				ubuf->pages[pgbuf++] = page;
>  			}
> -			ubuf->pages[pgbuf++] = page;
>  		}
>  		fput(memfd);
>  		memfd = NULL;
> +		if (hpage) {
> +			put_page(hpage);
> +			hpage = NULL;
> +		}
>  	}
>  
>  	exp_info.ops  = &udmabuf_ops;
> @@ -287,6 +344,7 @@ static long udmabuf_create(struct miscdevice *device,
>  		put_page(ubuf->pages[--pgbuf]);
>  	if (memfd)
>  		fput(memfd);
> +	kfree(ubuf->offsets);
>  	kfree(ubuf->pages);
>  	kfree(ubuf);
>  	return ret;
> -- 
> 2.39.2
> 

Nothing else stands out,

Acked-by: Mike Kravetz <mike.kravetz@oracle.com>

I see there is a RFC for the coherency issue with hole punch.
-- 
Mike Kravetz

WARNING: multiple messages have this Message-ID (diff)

From: Mike Kravetz <mike.kravetz@oracle.com>
To: Vivek Kasireddy <vivek.kasireddy@intel.com>
Cc: dri-devel@lists.freedesktop.org, linux-mm@kvack.org,
	David Hildenbrand <david@redhat.com>,
	Hugh Dickins <hughd@google.com>, Peter Xu <peterx@redhat.com>,
	Jason Gunthorpe <jgg@nvidia.com>,
	Gerd Hoffmann <kraxel@redhat.com>,
	Dongwon Kim <dongwon.kim@intel.com>,
	Junxiao Chang <junxiao.chang@intel.com>
Subject: Re: [PATCH v2 2/2] udmabuf: Add back support for mapping hugetlb pages (v2)
Date: Tue, 18 Jul 2023 16:44:36 -0700	[thread overview]
Message-ID: <20230718234436.GB10413@monkey> (raw)
In-Reply-To: <20230718082605.1570740-3-vivek.kasireddy@intel.com>

On 07/18/23 01:26, Vivek Kasireddy wrote:
> A user or admin can configure a VMM (Qemu) Guest's memory to be
> backed by hugetlb pages for various reasons. However, a Guest OS
> would still allocate (and pin) buffers that are backed by regular
> 4k sized pages. In order to map these buffers and create dma-bufs
> for them on the Host, we first need to find the hugetlb pages where
> the buffer allocations are located and then determine the offsets
> of individual chunks (within those pages) and use this information
> to eventually populate a scatterlist.
> 
> Testcase: default_hugepagesz=2M hugepagesz=2M hugepages=2500 options
> were passed to the Host kernel and Qemu was launched with these
> relevant options: qemu-system-x86_64 -m 4096m....
> -device virtio-gpu-pci,max_outputs=1,blob=true,xres=1920,yres=1080
> -display gtk,gl=on
> -object memory-backend-memfd,hugetlb=on,id=mem1,size=4096M
> -machine memory-backend=mem1
> 
> Replacing -display gtk,gl=on with -display gtk,gl=off above would
> exercise the mmap handler.
> 
> v2: Updated get_sg_table() to manually populate the scatterlist for
>     both huge page and non-huge-page cases.
> 
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Jason Gunthorpe <jgg@nvidia.com>
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Junxiao Chang <junxiao.chang@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
>  drivers/dma-buf/udmabuf.c | 84 +++++++++++++++++++++++++++++++++------
>  1 file changed, 71 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> index 820c993c8659..10c47bf77fb5 100644
> --- a/drivers/dma-buf/udmabuf.c
> +++ b/drivers/dma-buf/udmabuf.c
> @@ -10,6 +10,7 @@
>  #include <linux/miscdevice.h>
>  #include <linux/module.h>
>  #include <linux/shmem_fs.h>
> +#include <linux/hugetlb.h>
>  #include <linux/slab.h>
>  #include <linux/udmabuf.h>
>  #include <linux/vmalloc.h>
> @@ -28,6 +29,7 @@ struct udmabuf {
>  	struct page **pages;
>  	struct sg_table *sg;
>  	struct miscdevice *device;
> +	pgoff_t *offsets;
>  };
>  
>  static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> @@ -41,6 +43,10 @@ static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>  
>  	pfn = page_to_pfn(ubuf->pages[pgoff]);
> +	if (ubuf->offsets) {
> +		pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
> +	}
> +
>  	return vmf_insert_pfn(vma, vmf->address, pfn);
>  }
>  
> @@ -90,23 +96,31 @@ static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf,
>  {
>  	struct udmabuf *ubuf = buf->priv;
>  	struct sg_table *sg;
> +	struct scatterlist *sgl;
> +	pgoff_t offset;
> +	unsigned long i = 0;
>  	int ret;
>  
>  	sg = kzalloc(sizeof(*sg), GFP_KERNEL);
>  	if (!sg)
>  		return ERR_PTR(-ENOMEM);
> -	ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
> -					0, ubuf->pagecount << PAGE_SHIFT,
> -					GFP_KERNEL);
> +
> +	ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
>  	if (ret < 0)
> -		goto err;
> +		goto err_alloc;
> +
> +	for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) {
> +		offset = ubuf->offsets ? ubuf->offsets[i] : 0;
> +		sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, offset);
> +	}
>  	ret = dma_map_sgtable(dev, sg, direction, 0);
>  	if (ret < 0)
> -		goto err;
> +		goto err_map;
>  	return sg;
>  
> -err:
> +err_map:
>  	sg_free_table(sg);
> +err_alloc:
>  	kfree(sg);
>  	return ERR_PTR(ret);
>  }
> @@ -143,6 +157,7 @@ static void release_udmabuf(struct dma_buf *buf)
>  
>  	for (pg = 0; pg < ubuf->pagecount; pg++)
>  		put_page(ubuf->pages[pg]);
> +	kfree(ubuf->offsets);
>  	kfree(ubuf->pages);
>  	kfree(ubuf);
>  }
> @@ -206,7 +221,9 @@ static long udmabuf_create(struct miscdevice *device,
>  	struct udmabuf *ubuf;
>  	struct dma_buf *buf;
>  	pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
> -	struct page *page;
> +	struct page *page, *hpage = NULL;
> +	pgoff_t hpoff, chunkoff, maxchunks;
> +	struct hstate *hpstate;
>  	int seals, ret = -EINVAL;
>  	u32 i, flags;
>  
> @@ -242,7 +259,7 @@ static long udmabuf_create(struct miscdevice *device,
>  		if (!memfd)
>  			goto err;
>  		mapping = memfd->f_mapping;
> -		if (!shmem_mapping(mapping))
> +		if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
>  			goto err;
>  		seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
>  		if (seals == -EINVAL)
> @@ -253,16 +270,56 @@ static long udmabuf_create(struct miscdevice *device,
>  			goto err;
>  		pgoff = list[i].offset >> PAGE_SHIFT;
>  		pgcnt = list[i].size   >> PAGE_SHIFT;
> +		if (is_file_hugepages(memfd)) {
> +			if (!ubuf->offsets) {
> +				ubuf->offsets = kmalloc_array(ubuf->pagecount,
> +							      sizeof(*ubuf->offsets),
> +							      GFP_KERNEL);
> +				if (!ubuf->offsets) {
> +					ret = -ENOMEM;
> +					goto err;
> +				}
> +			}
> +			hpstate = hstate_file(memfd);
> +			hpoff = list[i].offset >> huge_page_shift(hpstate);
> +			chunkoff = (list[i].offset &
> +				    ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
> +			maxchunks = huge_page_size(hpstate) >> PAGE_SHIFT;
> +		}
>  		for (pgidx = 0; pgidx < pgcnt; pgidx++) {
> -			page = shmem_read_mapping_page(mapping, pgoff + pgidx);
> -			if (IS_ERR(page)) {
> -				ret = PTR_ERR(page);
> -				goto err;
> +			if (is_file_hugepages(memfd)) {
> +				if (!hpage) {
> +					hpage = find_get_page_flags(mapping, hpoff,
> +								    FGP_ACCESSED);
> +					if (!hpage) {
> +						ret = -EINVAL;
> +						goto err;
> +					}
> +				}
> +				get_page(hpage);

Is the intention to increase the ref count of the hugetlb page once for
each 'sub-page' added?  Or, am I reading that incorrectly?

> +				ubuf->pages[pgbuf] = hpage;

Ah!, answering my own question.  Since the 'head page' is added to the
array the ref count of the head page will decremented in
release_udmabuf.

> +				ubuf->offsets[pgbuf++] = chunkoff << PAGE_SHIFT;
> +				if (++chunkoff == maxchunks) {
> +					put_page(hpage);
> +					hpage = NULL;
> +					chunkoff = 0;
> +					hpoff++;
> +				}
> +			} else {
> +				page = shmem_read_mapping_page(mapping, pgoff + pgidx);

It may not matter to your users, but the semantics for hugetlb and shmem
pages is different.  hugetlb requires the pages exist in the page cache
while shmem will create/add pages to the cache if necessary.

> +				if (IS_ERR(page)) {
> +					ret = PTR_ERR(page);
> +					goto err;
> +				}
> +				ubuf->pages[pgbuf++] = page;
>  			}
> -			ubuf->pages[pgbuf++] = page;
>  		}
>  		fput(memfd);
>  		memfd = NULL;
> +		if (hpage) {
> +			put_page(hpage);
> +			hpage = NULL;
> +		}
>  	}
>  
>  	exp_info.ops  = &udmabuf_ops;
> @@ -287,6 +344,7 @@ static long udmabuf_create(struct miscdevice *device,
>  		put_page(ubuf->pages[--pgbuf]);
>  	if (memfd)
>  		fput(memfd);
> +	kfree(ubuf->offsets);
>  	kfree(ubuf->pages);
>  	kfree(ubuf);
>  	return ret;
> -- 
> 2.39.2
> 

Nothing else stands out,

Acked-by: Mike Kravetz <mike.kravetz@oracle.com>

I see there is a RFC for the coherency issue with hole punch.
-- 
Mike Kravetz

next prev parent reply	other threads:[~2023-07-19  7:01 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-18  8:26 [PATCH v2 0/2] udmabuf: Add back support for mapping hugetlb pages (v2) Vivek Kasireddy
2023-07-18  8:26 ` Vivek Kasireddy
2023-07-18  8:26 ` [PATCH v2 1/2] udmabuf: Use vmf_insert_pfn and VM_PFNMAP for handling mmap Vivek Kasireddy
2023-07-18  8:26   ` Vivek Kasireddy
2023-07-18 18:03   ` David Hildenbrand
2023-07-18 18:03     ` David Hildenbrand
2023-07-18  8:26 ` [PATCH v2 2/2] udmabuf: Add back support for mapping hugetlb pages (v2) Vivek Kasireddy
2023-07-18  8:26   ` Vivek Kasireddy
2023-07-18 18:05   ` David Hildenbrand
2023-07-18 18:05     ` David Hildenbrand
2023-07-18 23:44   ` Mike Kravetz [this message]
2023-07-18 23:44     ` Mike Kravetz
2023-07-19  5:42     ` Kasireddy, Vivek
2023-07-19  5:42       ` Kasireddy, Vivek
2023-07-27 20:54       ` Peter Xu
2023-07-27 20:54         ` Peter Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230718234436.GB10413@monkey \
    --to=mike.kravetz@oracle.com \
    --cc=david@redhat.com \
    --cc=dongwon.kim@intel.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=hughd@google.com \
    --cc=jgg@nvidia.com \
    --cc=junxiao.chang@intel.com \
    --cc=kraxel@redhat.com \
    --cc=linux-mm@kvack.org \
    --cc=peterx@redhat.com \
    --cc=vivek.kasireddy@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.