From: <mhonap@nvidia.com>
To: <aniketa@nvidia.com>, <ankita@nvidia.com>,
<alwilliamson@nvidia.com>, <vsethi@nvidia.com>, <jgg@nvidia.com>,
<mochs@nvidia.com>, <skolothumtho@nvidia.com>,
<alejandro.lucero-palau@amd.com>, <dave@stgolabs.net>,
<jonathan.cameron@huawei.com>, <dave.jiang@intel.com>,
<alison.schofield@intel.com>, <vishal.l.verma@intel.com>,
<ira.weiny@intel.com>, <dan.j.williams@intel.com>, <jgg@ziepe.ca>,
<yishaih@nvidia.com>, <kevin.tian@intel.com>
Cc: <cjia@nvidia.com>, <targupta@nvidia.com>, <zhiw@nvidia.com>,
<kjaju@nvidia.com>, <linux-kernel@vger.kernel.org>,
<linux-cxl@vger.kernel.org>, <kvm@vger.kernel.org>,
<mhonap@nvidia.com>
Subject: [PATCH 11/20] vfio/cxl: Expose DPA memory region to userspace with fault+zap mmap
Date: Thu, 12 Mar 2026 02:04:31 +0530
Message-ID: <20260311203440.752648-12-mhonap@nvidia.com>
In-Reply-To: <20260311203440.752648-1-mhonap@nvidia.com>
From: Manish Honap <mhonap@nvidia.com>
To directly access the device memory, a CXL region is required. For
userspace (e.g. QEMU) to access the CXL region, it must be exposed
through the VFIO interfaces.

Introduce a new VFIO device region and region ops to expose the created
CXL region, and a new sub-region type so that userspace can identify a
CXL region.
CXL region lifecycle:

- The CXL memory region is registered with the VFIO layer during
  vfio_pci_open_device().
- mmap() establishes the VMA with vm_ops but inserts no PTEs.
- Each page fault calls vfio_cxl_region_page_fault(), which inserts a
  single PFN while holding memory_lock for reading.
- On device reset, vfio_cxl_zap_region_locked() sets region_active=false
  and calls unmap_mapping_range() to invalidate all DPA PTEs atomically
  while holding memory_lock for writing.
- Faults racing with reset observe region_active==false and return
  VM_FAULT_SIGBUS.
- vfio_cxl_reactivate_region() restores region_active after a successful
  hardware reset.
Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so
that an FLR correctly invalidates DPA mappings and restores them on
success. A sketch of the expected userspace usage follows below.
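
For reference, a minimal userspace sketch of locating and mapping the
DPA region. This is illustrative only: it assumes the UAPI additions
from patch 06/20 (VFIO_REGION_SUBTYPE_CXL), a local PCI_VENDOR_ID_CXL
define mirroring linux/pci_ids.h, and omits most error handling:

  #include <linux/vfio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>

  #define PCI_VENDOR_ID_CXL 0x1e98 /* assumed; mirrors linux/pci_ids.h */

  /* Walk the device regions, match the CXL vendor type/subtype, mmap it. */
  static void *map_cxl_dpa_region(int device_fd, __u32 num_regions,
  				size_t *len)
  {
  	for (__u32 i = VFIO_PCI_NUM_REGIONS; i < num_regions; i++) {
  		struct vfio_region_info hdr = {
  			.argsz = sizeof(hdr), .index = i,
  		};
  		struct vfio_region_info *info;
  		__u32 off;

  		/* First call sizes the variable-length capability chain. */
  		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &hdr))
  			continue;

  		info = calloc(1, hdr.argsz);
  		info->argsz = hdr.argsz;
  		info->index = i;
  		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info) ||
  		    !(info->flags & VFIO_REGION_INFO_FLAG_CAPS))
  			goto next;

  		for (off = info->cap_offset; off; ) {
  			struct vfio_info_cap_header *cap =
  				(void *)((char *)info + off);
  			struct vfio_region_info_cap_type *type =
  				(void *)cap;

  			if (cap->id == VFIO_REGION_INFO_CAP_TYPE &&
  			    type->type == (VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
  					   PCI_VENDOR_ID_CXL) &&
  			    type->subtype == VFIO_REGION_SUBTYPE_CXL) {
  				void *p = mmap(NULL, info->size,
  					       PROT_READ | PROT_WRITE,
  					       MAP_SHARED, device_fd,
  					       info->offset);

  				*len = info->size;
  				free(info);
  				return p == MAP_FAILED ? NULL : p;
  			}
  			off = cap->next;
  		}
  next:
  		free(info);
  	}
  	return NULL;
  }

After a reset the mapping stays valid: once vfio_cxl_reactivate_region()
has run, the first touch of each page faults and re-populates the PTE,
so userspace needs no remap.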
Co-developed-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
drivers/vfio/pci/cxl/vfio_cxl_core.c | 224 +++++++++++++++++++++++++++
drivers/vfio/pci/cxl/vfio_cxl_priv.h | 2 +
drivers/vfio/pci/vfio_pci.c | 9 ++
drivers/vfio/pci/vfio_pci_core.c | 11 ++
drivers/vfio/pci/vfio_pci_priv.h | 13 ++
5 files changed, 259 insertions(+)
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index 9c71f592e74e..03846bd11c8a 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -44,6 +44,7 @@ static int vfio_cxl_create_device_state(struct vfio_pci_core_device *vdev,
cxl = vdev->cxl;
cxl->dvsec = dvsec;
+ cxl->dpa_region_idx = -1;
pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET,
&cap_word);
@@ -300,3 +301,226 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
vfio_cxl_destroy_cxl_region(vdev);
}
+
+/*
+ * Fault handler for the DPA region VMA. Called under mm->mmap_lock read
+ * side by the fault path. We take memory_lock read side here to exclude
+ * the write-side held by vfio_cxl_zap_region_locked() during reset.
+ */
+static vm_fault_t vfio_cxl_region_page_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+ unsigned long pfn;
+
+ guard(rwsem_read)(&vdev->memory_lock);
+
+ if (!READ_ONCE(cxl->region_active))
+ return VM_FAULT_SIGBUS;
+
+ pfn = PHYS_PFN(cxl->region_hpa) +
+ ((vmf->address - vma->vm_start) >> PAGE_SHIFT);
+
+ /*
+ * Scrub the page via the kernel ioremap_cache() mapping before
+ * inserting the user PFN, so that stale device data cannot leak
+ * across VFIO device open/close or reset boundaries.
+ */
+ memset_io((u8 __iomem *)cxl->region_vaddr +
+ ((pfn - PHYS_PFN(cxl->region_hpa)) << PAGE_SHIFT),
+ 0, PAGE_SIZE);
+
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+}
+
+static const struct vm_operations_struct vfio_cxl_region_vm_ops = {
+ .fault = vfio_cxl_region_page_fault,
+};
+
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_region *region,
+ struct vm_area_struct *vma)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+ unsigned long req_len;
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+ return -EINVAL;
+
+ if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len))
+ return -EOVERFLOW;
+
+ if (req_len > cxl->region_size)
+ return -EINVAL;
+
+ /*
+ * Do not insert PTEs here (no remap_pfn_range). PTEs are inserted
+ * lazily on first fault via vfio_cxl_region_page_fault(). This
+ * allows vfio_cxl_zap_region_locked() to safely invalidate them
+ * during device reset without any userspace cooperation.
+ * Leave vm_page_prot at its default.
+ */
+
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+ vma->vm_private_data = vdev;
+ vma->vm_ops = &vfio_cxl_region_vm_ops;
+
+ return 0;
+}
+
+/*
+ * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs.
+ *
+ * Must be called with vdev->memory_lock held for writing. Sets
+ * region_active=false before zapping so any fault racing with zap sees
+ * the inactive state and returns VM_FAULT_SIGBUS rather than inserting
+ * a stale PFN.
+ */
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_device *core_vdev = &vdev->vdev;
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ if (!cxl || cxl->dpa_region_idx < 0)
+ return;
+
+ WRITE_ONCE(cxl->region_active, false);
+ unmap_mapping_range(core_vdev->inode->i_mapping,
+ VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS +
+ cxl->dpa_region_idx),
+ cxl->region_size, true);
+}
+
+/*
+ * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset.
+ *
+ * Must be called with vdev->memory_lock held for writing. Sets
+ * region_active again so that subsequent faults can re-insert PFNs
+ * without requiring a new mmap.
+ */
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ if (!cxl || cxl->dpa_region_idx < 0)
+ return;
+
+ WRITE_ONCE(cxl->region_active, true);
+}
+
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+ char __user *buf, size_t count, loff_t *ppos,
+ bool iswrite)
+{
+ unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+ struct vfio_pci_cxl_state *cxl = core_dev->region[i].data;
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ guard(rwsem_read)(&core_dev->memory_lock);
+
+ if (!READ_ONCE(cxl->region_active))
+ return -EIO;
+
+ return vfio_pci_core_do_io_rw(core_dev, false,
+ cxl->region_vaddr,
+ buf, pos, count,
+ 0, 0, iswrite, VFIO_PCI_IO_WIDTH_8);
+}
+
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_region *region)
+{
+ struct vfio_pci_cxl_state *cxl = region->data;
+
+ if (cxl->region_vaddr) {
+ iounmap(cxl->region_vaddr);
+ cxl->region_vaddr = NULL;
+ }
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+ .rw = vfio_cxl_region_rw,
+ .mmap = vfio_cxl_region_mmap,
+ .release = vfio_cxl_region_release,
+};
+
+int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+ u32 flags;
+ int ret;
+
+ if (!cxl)
+ return -ENODEV;
+
+ if (!cxl->region || cxl->region_vaddr)
+ return -ENODEV;
+
+ cxl->region_vaddr = ioremap_cache(cxl->region_hpa, cxl->region_size);
+ if (!cxl->region_vaddr)
+ return -ENOMEM;
+
+ flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+
+ ret = vfio_pci_core_register_dev_region(vdev,
+ PCI_VENDOR_ID_CXL |
+ VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+ VFIO_REGION_SUBTYPE_CXL,
+ &vfio_cxl_regops,
+ cxl->region_size, flags,
+ cxl);
+ if (ret) {
+ iounmap(cxl->region_vaddr);
+ cxl->region_vaddr = NULL;
+ return ret;
+ }
+
+ /*
+ * Cache the vdev->region[] index before activating the region.
+ * vfio_pci_core_register_dev_region() placed the new entry at
+ * vdev->region[num_regions - 1] and incremented num_regions.
+ * vfio_cxl_zap_region_locked() uses this to avoid scanning
+ * vdev->region[] on every FLR.
+ */
+ cxl->dpa_region_idx = vdev->num_regions - 1;
+ WRITE_ONCE(cxl->region_active, true);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_cxl_register_cxl_region);
+
+/**
+ * vfio_cxl_unregister_cxl_region - Undo vfio_cxl_register_cxl_region()
+ * @vdev: VFIO PCI device
+ *
+ * Marks the DPA region inactive so any racing fault returns VM_FAULT_SIGBUS
+ * and resets dpa_region_idx. Does NOT call release() or touch num_regions;
+ * vfio_pci_core_disable() will call the idempotent release() callback as
+ * normal during device close.
+ *
+ * Does NOT touch CXL subsystem state (cxl->region, cxl->cxled, cxl->cxlrd).
+ * The caller must call vfio_cxl_destroy_cxl_region() separately to release
+ * those objects.
+ */
+void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+ if (!cxl || cxl->dpa_region_idx < 0)
+ return;
+
+ WRITE_ONCE(cxl->region_active, false);
+
+ cxl->dpa_region_idx = -1;
+}
+EXPORT_SYMBOL_GPL(vfio_cxl_unregister_cxl_region);
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 985680842a13..b870926bfb19 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -26,9 +26,11 @@ struct vfio_pci_cxl_state {
resource_size_t comp_reg_offset;
size_t comp_reg_size;
u32 hdm_count;
+ int dpa_region_idx;
u16 dvsec;
u8 comp_reg_bar;
bool precommitted;
+ bool region_active;
};
/*
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 0c771064c0b8..d3138badeaa6 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -120,6 +120,15 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev)
}
}
+ if (vdev->cxl) {
+ ret = vfio_cxl_register_cxl_region(vdev);
+ if (ret) {
+ pci_warn(pdev, "Failed to set up CXL region\n");
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+ }
+
vfio_pci_core_finish_enable(vdev);
return 0;
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b7364178e23d..48e0274c19aa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1223,6 +1223,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ /* Zap CXL DPA region PTEs before hardware reset clears HDM state */
+ vfio_cxl_zap_region_locked(vdev);
+
/*
* This function can be invoked while the power state is non-D0. If
* pci_try_reset_function() has been called while the power state is
@@ -1236,6 +1239,14 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
vfio_pci_dma_buf_move(vdev, true);
ret = pci_try_reset_function(vdev->pdev);
+
+ /*
+ * Re-enable DPA region if reset succeeded; fault handler will
+ * re-insert PFNs on next access without requiring a new mmap.
+ */
+ if (!ret)
+ vfio_cxl_reactivate_region(vdev);
+
if (__vfio_pci_memory_enabled(vdev))
vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 818d99f098bf..441b4a47637a 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -140,6 +140,10 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev);
int vfio_cxl_create_cxl_region(struct vfio_pci_core_device *vdev,
resource_size_t size);
void vfio_cxl_destroy_cxl_region(struct vfio_pci_core_device *vdev);
+int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev);
+void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev);
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
#else
@@ -152,6 +156,15 @@ static inline int vfio_cxl_create_cxl_region(struct vfio_pci_core_device *vdev,
{ return 0; }
static inline void
vfio_cxl_destroy_cxl_region(struct vfio_pci_core_device *vdev) { }
+static inline int
+vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev)
+{ return 0; }
+static inline void
+vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
#endif /* CONFIG_VFIO_CXL_CORE */
--
2.25.1