From: Alexey Kardashevskiy <aik@amd.com>
To: <linux-kernel@vger.kernel.org>
Cc: <kvm@vger.kernel.org>, Jason Gunthorpe <jgg@ziepe.ca>,
Kevin Tian <kevin.tian@intel.com>, Joerg Roedel <joro@8bytes.org>,
Will Deacon <will@kernel.org>,
Robin Murphy <robin.murphy@arm.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Steve Sistare <steven.sistare@oracle.com>,
"Nicolin Chen" <nicolinc@nvidia.com>, <iommu@lists.linux.dev>,
Alexey Kardashevskiy <aik@amd.com>, <linux-coco@lists.linux.dev>,
Dan Williams <dan.j.williams@intel.com>,
Santosh Shukla <santosh.shukla@amd.com>,
"Pratik R . Sampat" <prsampat@amd.com>,
Ackerley Tng <ackerleytng@google.com>,
"Sean Christopherson" <seanjc@google.com>,
Fuad Tabba <tabba@google.com>,
Xu Yilun <yilun.xu@linux.intel.com>,
"Aneesh Kumar K . V" <aneesh.kumar@kernel.org>
Subject: [RFC PATCH kernel] iommufd: Allow mapping from KVM's guest_memfd
Date: Wed, 25 Feb 2026 18:52:11 +1100 [thread overview]
Message-ID: <20260225075211.3353194-1-aik@amd.com> (raw)
CoCo VMs get their private memory allocated from guest_memfd
("gmemfd") which is a KVM facility similar to memfd.
A gmemfd does not allow mapping private memory to userspace,
so the IOMMU_IOAS_MAP ioctl does not work.
Use the existing IOMMU_IOAS_MAP_FILE ioctl to allow mapping from
fd + offset. Detect the gmemfd case in pfn_reader_user_pin().
For the new guest_memfd type, no additional reference is taken as
pinning is guaranteed by the KVM guest_memfd library.
There is no KVM-GMEMFD->IOMMUFD direct notification mechanism as
the assumption is that:
1) page state change events will be handled by the VMM, which is going
to call IOMMUFD to remap pages;
2) shrinking a GMEMFD is equivalent to VM memory unplug, and the VMM is
going to handle it.
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
---
This is for Trusted IO == TEE-IO == PCIe TDISP, etc.
Previously posted here:
https://lore.kernel.org/r/20250218111017.491719-13-aik@amd.com
The main comment was "what is the lifetime of those folios()" and
GMEMFD + QEMU should take care of it.
And horrendous stuff like this is not really useful:
https://github.com/AMDESE/linux-kvm/commit/7d73fd2cccb8489b1
---
include/linux/kvm_host.h | 4 +
drivers/iommu/iommufd/pages.c | 80 ++++++++++++++++++--
virt/kvm/guest_memfd.c | 36 +++++++++
3 files changed, 113 insertions(+), 7 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 995db7a7ba57..9369cf22b24e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2673,4 +2673,8 @@ unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn);
int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
struct kvm_memory_attributes2 *attrs);
+bool kvm_is_gmemfd(struct file *file);
+struct folio *kvm_gmemfd_get_pfn(struct file *file, unsigned long index,
+ unsigned long *pfn, int *max_order);
+
#endif
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index dbe51ecb9a20..4c07e39e17d0 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -56,6 +56,9 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/vfio_pci_core.h>
+#include <linux/pagemap.h>
+#include <linux/memcontrol.h>
+#include <linux/kvm_host.h>
#include "double_span.h"
#include "io_pagetable.h"
@@ -660,7 +663,8 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
}
static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
- unsigned long *offset_p, unsigned long npages)
+ unsigned long *offset_p, unsigned long npages,
+ bool do_pin)
{
int rc = 0;
struct folio **folios = *folios_p;
@@ -676,7 +680,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY))
break;
- if (nr > 1) {
+ if (nr > 1 && do_pin) {
rc = folio_add_pins(folio, nr - 1);
if (rc) {
batch_remove_pfn_num(batch, nr);
@@ -697,6 +701,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
unsigned int first_page_off, size_t npages)
{
+ bool do_unpin = !kvm_is_gmemfd(pages->file);
unsigned int cur = 0;
while (first_page_off) {
@@ -710,9 +715,12 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
size_t to_unpin = min_t(size_t, npages,
batch->npfns[cur] - first_page_off);
- unpin_user_page_range_dirty_lock(
- pfn_to_page(batch->pfns[cur] + first_page_off),
- to_unpin, pages->writable);
+ /* Do nothing for guest_memfd */
+ if (do_unpin)
+ unpin_user_page_range_dirty_lock(
+ pfn_to_page(batch->pfns[cur] + first_page_off),
+ to_unpin, pages->writable);
+
iopt_pages_sub_npinned(pages, to_unpin);
cur++;
first_page_off = 0;
@@ -872,6 +880,57 @@ static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start,
return npages_out;
}
+static long pin_guest_memfd_pages(struct pfn_reader_user *user, loff_t start, unsigned long npages)
+{
+ struct page **upages = user->upages;
+ unsigned long offset = 0;
+ loff_t uptr = start;
+ long rc = 0;
+
+ for (unsigned long i = 0; (uptr - start) < (npages << PAGE_SHIFT); ++i) {
+ unsigned long gfn = 0, pfn = 0;
+ int max_order = 0;
+ struct folio *folio;
+
+ folio = kvm_gmemfd_get_pfn(user->file, uptr >> PAGE_SHIFT, &pfn, &max_order);
+ if (IS_ERR(folio))
+ rc = PTR_ERR(folio);
+
+ if (rc == -EINVAL && i == 0) {
+ pr_err_once("Must be vfio mmio at gfn=%lx pfn=%lx, skipping\n", gfn, pfn);
+ return rc;
+ }
+
+ if (rc) {
+ pr_err("%s: %ld %ld %lx -> %lx\n", __func__,
+ rc, i, (unsigned long) uptr, (unsigned long) pfn);
+ break;
+ }
+
+ if (i == 0)
+ offset = offset_in_folio(folio, start) >> PAGE_SHIFT;
+
+ user->ufolios[i] = folio;
+
+ if (upages) {
+ unsigned long np = (1UL << (max_order + PAGE_SHIFT)) - offset_in_folio(folio, uptr);
+
+ for (unsigned long j = 0; j < np; ++j)
+ *upages++ = folio_page(folio, offset + j);
+ }
+
+ uptr += (1UL << (max_order + PAGE_SHIFT)) - offset_in_folio(folio, uptr);
+ }
+
+ if (!rc) {
+ rc = npages;
+ user->ufolios_next = user->ufolios;
+ user->ufolios_offset = offset;
+ }
+
+ return rc;
+}
+
static int pfn_reader_user_pin(struct pfn_reader_user *user,
struct iopt_pages *pages,
unsigned long start_index,
@@ -925,7 +984,13 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
if (user->file) {
start = pages->start + (start_index * PAGE_SIZE);
- rc = pin_memfd_pages(user, start, npages);
+ if (kvm_is_gmemfd(pages->file)) {
+ rc = pin_guest_memfd_pages(user, start, npages);
+ } else {
+ pr_err("UNEXP PINFD start=%lx sz=%lx file=%lx",
+ start, npages << PAGE_SHIFT, (ulong) pages->file);
+ rc = pin_memfd_pages(user, start, npages);
+ }
} else if (!remote_mm) {
uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
@@ -1221,7 +1286,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
npages);
else
rc = batch_from_folios(&pfns->batch, &user->ufolios_next,
- &user->ufolios_offset, npages);
+ &user->ufolios_offset, npages,
+ !kvm_is_gmemfd(pfns->pages->file));
return rc;
}
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e4e21068cf2a..2a313888c21b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1794,3 +1794,39 @@ void kvm_gmem_exit(void)
rcu_barrier();
kmem_cache_destroy(kvm_gmem_inode_cachep);
}
+
+bool kvm_is_gmemfd(struct file *file)
+{
+ if (!file)
+ return false;
+
+ if (file->f_op != &kvm_gmem_fops)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(kvm_is_gmemfd);
+
+struct folio *kvm_gmemfd_get_pfn(struct file *file, unsigned long index,
+ unsigned long *pfn, int *max_order)
+{
+ struct inode *inode = file_inode(file);
+ struct folio *folio;
+
+ if (!inode || !kvm_is_gmemfd(file))
+ return NULL;
+
+ folio = kvm_gmem_get_folio(inode, index);
+ if (!folio)
+ return NULL;
+
+
+ *pfn = folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
+ *max_order = folio_order(folio);
+
+ folio_put(folio);
+ folio_unlock(folio);
+
+ return folio;
+}
+EXPORT_SYMBOL_GPL(kvm_gmemfd_get_pfn);
--
2.52.0
next reply other threads:[~2026-02-25 7:52 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-25 7:52 Alexey Kardashevskiy [this message]
2026-02-25 13:55 ` [RFC PATCH kernel] iommufd: Allow mapping from KVM's guest_memfd Sean Christopherson
2026-02-26 6:47 ` Alexey Kardashevskiy
2026-02-26 19:27 ` Jason Gunthorpe
2026-02-27 11:03 ` Xu Yilun
2026-02-26 8:19 ` Ackerley Tng
2026-02-26 19:07 ` Jason Gunthorpe
2026-02-26 22:40 ` Sean Christopherson
2026-02-27 0:21 ` Jason Gunthorpe
2026-02-27 0:28 ` Sean Christopherson
2026-02-27 1:09 ` Jason Gunthorpe
2026-02-27 10:35 ` Xu Yilun
2026-02-27 13:18 ` Jason Gunthorpe
2026-02-28 4:14 ` Xu Yilun
2026-02-28 18:29 ` Jason Gunthorpe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260225075211.3353194-1-aik@amd.com \
--to=aik@amd.com \
--cc=ackerleytng@google.com \
--cc=aneesh.kumar@kernel.org \
--cc=dan.j.williams@intel.com \
--cc=iommu@lists.linux.dev \
--cc=jgg@ziepe.ca \
--cc=joro@8bytes.org \
--cc=kevin.tian@intel.com \
--cc=kvm@vger.kernel.org \
--cc=linux-coco@lists.linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=nicolinc@nvidia.com \
--cc=pbonzini@redhat.com \
--cc=prsampat@amd.com \
--cc=robin.murphy@arm.com \
--cc=santosh.shukla@amd.com \
--cc=seanjc@google.com \
--cc=steven.sistare@oracle.com \
--cc=tabba@google.com \
--cc=will@kernel.org \
--cc=yilun.xu@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox