From: <mhonap@nvidia.com>
To: <alwilliamson@nvidia.com>, <skolothumtho@nvidia.com>,
<ankita@nvidia.com>, <mst@redhat.com>, <imammedo@redhat.com>,
<anisinha@redhat.com>, <eric.auger@redhat.com>,
<peter.maydell@linaro.org>, <shannon.zhaosl@gmail.com>,
<jonathan.cameron@huawei.com>, <fan.ni@samsung.com>,
<pbonzini@redhat.com>, <richard.henderson@linaro.org>,
<marcel.apfelbaum@gmail.com>, <clg@redhat.com>,
<cohuck@redhat.com>, <dan.j.williams@intel.com>,
<dave.jiang@intel.com>, <alejandro.lucero-palau@amd.com>
Cc: <vsethi@nvidia.com>, <cjia@nvidia.com>, <targupta@nvidia.com>,
<zhiw@nvidia.com>, <kjaju@nvidia.com>,
<linux-cxl@vger.kernel.org>, <kvm@vger.kernel.org>,
<qemu-devel@nongnu.org>, <qemu-arm@nongnu.org>,
"Manish Honap" <mhonap@nvidia.com>
Subject: [RFC 5/9] hw/vfio/pci: Add CXL Type-2 device detection and region setup
Date: Mon, 27 Apr 2026 23:42:31 +0530 [thread overview]
Message-ID: <20260427181235.3003865-6-mhonap@nvidia.com> (raw)
In-Reply-To: <20260427181235.3003865-1-mhonap@nvidia.com>
From: Manish Honap <mhonap@nvidia.com>
When VFIO_DEVICE_FLAGS_CXL is set, the kernel has identified a CXL
Type-2 device and populated the capability chain with a
vfio_device_info_cap_cxl entry. Read that entry to locate the DPA
and CXL Component Register shadow regions, then call vfio_region_setup()
for each.
DPA covers the device's host-managed memory and is faulted in lazily
by the VMM. The CXL Component Register shadow gives the VMM access to
the HDM Decoder Capability block so it can intercept decoder commits
without touching the hardware register page directly.
vfio_cxl_derive_hdm_info() walks the CXL Capability Array inside the
Component Register shadow to find the HDM Decoder capability (ID 0x5)
and extracts hdm_decoder_offset and hdm_count. All reads use
le32_to_cpu() since the capability array is little-endian per the CXL
spec. Dword 0 is the array header; capability entries start at dword 1,
which is why the loop begins at i = 1.
CXL register constants are defined here using names that mirror
<linux/cxl.h> to make cross-referencing straightforward.
Add the VFIOCXL struct embedded in VFIOPCIDevice.
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
hw/vfio/pci.c | 214 +++++++++++++++++++++++++++++++++++++++++++
hw/vfio/pci.h | 14 +++
hw/vfio/trace-events | 4 +
3 files changed, 232 insertions(+)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b2a07f6bb4..49ac661eb3 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -24,6 +24,7 @@
#include "hw/core/hw-error.h"
#include "hw/core/iommu.h"
+#include "hw/cxl/cxl_component.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_bridge.h"
@@ -2957,6 +2958,38 @@ static VFIODeviceOps vfio_pci_ops = {
.vfio_load_config = vfio_pci_load_config,
};
+/*
+ * CXL Component Register Space constants (CXL 4.0 8.2.3).
+ *
+ * These describe the layout that vfio_cxl_derive_hdm_info() parses when it
+ * reads the COMP_REGS region: a capability array header in dword 0,
+ * followed by one dword per capability entry.
+ */
+
+/* CXL Capability Array Header (dword 0 of COMP_REGS) */
+#define CXL_CM_CAP_HDR_ARRAY_ID 0x0001U /* expected ID value */
+#define CXL_CM_CAP_HDR_NUM_CAPS_SHIFT 24 /* bits [31:24] = num entries */
+#define CXL_CM_CAP_HDR_NUM_CAPS_MASK 0xffU
+#define CXL_CM_CAP_ENTRY_ID_MASK 0xffffU /* bits [15:0] = cap ID */
+#define CXL_CM_CAP_ENTRY_PTR_SHIFT 20 /* bits [31:20] = byte offset */
+#define CXL_CM_CAP_ENTRY_PTR_MASK 0xfffU /* applied after the shift */
+#define CXL_CM_CAP_ID_HDM 0x0005U /* HDM Decoder cap ID */
+
+/*
+ * HDM Decoder Capability (HDMC) register at hdm_decoder_offset+0x00.
+ *
+ * The Decoder Count field is an encoding, not a plain count:
+ *   0x0      -> 1 decoder
+ *   0x1..0x8 -> field * 2 decoders (2..16)
+ *   0x9..0xC -> (field - 4) * 4 decoders (20..32)
+ *   0xD..0xF -> reserved
+ */
+#define CXL_HDMC_DECODER_COUNT_MASK 0xfU /* bits [3:0], encoded count */
+
+/*
+ * Per-decoder register offsets from hdm_decoder_offset (CXL 4.0 Table 8-119).
+ * Decoder records begin at +0x10 and are 0x20 bytes each; (i) is the
+ * zero-based decoder index.
+ */
+#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10)
+#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14)
+#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20)
+
+/* HDM Decoder n Control register bits (CXL 4.0 Table 8-123) */
+#define CXL_HDM_CTRL_COMMIT_LOCK (1U << 8) /* decoder locked */
+#define CXL_HDM_CTRL_COMMIT (1U << 9) /* software trigger */
+#define CXL_HDM_CTRL_COMMITTED (1U << 10) /* hardware status */
+
+/* HDM Decoder BASE_LO: bits [31:28] hold address bits [31:28] */
+#define CXL_HDM_BASE_LO_ADDR_MASK 0xF0000000U
+
+
bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
{
VFIODevice *vbasedev = &vdev->vbasedev;
@@ -3102,6 +3135,25 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
vfio_display_finalize(vdev);
vfio_bars_finalize(vdev);
+
+ /*
+ * The DPA region is not in bars[] and must be cleaned up here.
+ * Remove it from the system address space before releasing.
+ */
+ if (vdev->cxl.dpa_in_system_mem) {
+ memory_region_del_subregion(get_system_memory(), vdev->cxl.region.mem);
+ vdev->cxl.dpa_in_system_mem = false;
+ trace_vfio_cxl_put_device(vdev->vbasedev.name);
+ }
+ if (vdev->cxl.region.mem) {
+ vfio_region_exit(&vdev->cxl.region);
+ vfio_region_finalize(&vdev->cxl.region);
+ }
+ if (vdev->cxl.comp_regs_region.mem) {
+ vfio_region_exit(&vdev->cxl.comp_regs_region);
+ vfio_region_finalize(&vdev->cxl.comp_regs_region);
+ }
+
vfio_cpr_pci_unregister_device(vdev);
g_free(vdev->emulated_config_bits);
g_free(vdev->rom);
@@ -3254,6 +3306,164 @@ void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev)
}
}
+/*
+ * vfio_cxl_read_le32 - read one little-endian dword from the device fd.
+ *
+ * @fd:     VFIO device file descriptor
+ * @offset: absolute file offset to read from
+ * @val:    receives the value converted to host byte order
+ *
+ * Returns true on a complete 4-byte read, false on error or short read.
+ */
+static bool vfio_cxl_read_le32(int fd, off_t offset, uint32_t *val)
+{
+    uint32_t raw;
+
+    if (pread(fd, &raw, sizeof(raw), offset) != sizeof(raw)) {
+        return false;
+    }
+    *val = le32_to_cpu(raw);
+    return true;
+}
+
+/*
+ * vfio_cxl_hdm_decoder_count - decode the HDMC Decoder Count field.
+ *
+ * The field is an encoding rather than a plain count:
+ *   0x0      -> 1 decoder
+ *   0x1..0x8 -> field * 2 decoders (2..16)
+ *   0x9..0xC -> (field - 4) * 4 decoders (20..32)
+ *   0xD..0xF -> reserved
+ *
+ * Returns true and stores the decoder count in @count, or false when the
+ * field holds a reserved encoding.
+ */
+static bool vfio_cxl_hdm_decoder_count(uint32_t field, uint8_t *count)
+{
+    if (field == 0) {
+        *count = 1;
+    } else if (field <= 0x8) {
+        *count = field * 2;
+    } else if (field <= 0xc) {
+        *count = (field - 4) * 4;
+    } else {
+        return false;
+    }
+    return true;
+}
+
+/*
+ * vfio_cxl_derive_hdm_info - read hdm_decoder_offset and hdm_count from the
+ * COMP_REGS region by traversing the CXL Capability Array.
+ *
+ * Dword 0: CXL Capability Array Header
+ *     bits[31:24] = num_caps,
+ *     bits[15:0]  = 1.
+ * Dwords 1..N:
+ *     bits[15:0]  = cap ID;
+ *     bits[31:20] = byte offset from region start.
+ * HDM Decoder cap ID = 0x5; its offset is hdm_decoder_offset.
+ * HDMC register at hdm_decoder_offset+0:
+ *     bits[3:0] hold the encoded decoder count, see
+ *     vfio_cxl_hdm_decoder_count().
+ *
+ * @vbasedev: device whose fd is read
+ * @cxl:      comp_regs_region.fd_offset is the read base; on success
+ *            hdm_decoder_offset and hdm_count are filled in.  Neither field
+ *            is modified on failure.
+ * @errp:     set on failure
+ *
+ * Returns true on success, false (with @errp set) if a read fails, the
+ * array header is not recognised, the HDM capability is absent, or its
+ * pointer or decoder count is invalid.
+ */
+static bool vfio_cxl_derive_hdm_info(VFIODevice *vbasedev, VFIOCXL *cxl,
+                                     Error **errp)
+{
+    off_t base = cxl->comp_regs_region.fd_offset;
+    uint32_t hdr, num_caps, i;
+
+    if (!vfio_cxl_read_le32(vbasedev->fd, base, &hdr)) {
+        error_setg(errp, "vfio-cxl: failed to read CXL Capability Header");
+        return false;
+    }
+
+    if ((hdr & CXL_CM_CAP_ENTRY_ID_MASK) != CXL_CM_CAP_HDR_ARRAY_ID) {
+        error_setg(errp, "vfio-cxl: unexpected CXL Capability Array ID 0x%x",
+                   hdr & CXL_CM_CAP_ENTRY_ID_MASK);
+        return false;
+    }
+
+    num_caps = (hdr >> CXL_CM_CAP_HDR_NUM_CAPS_SHIFT) &
+               CXL_CM_CAP_HDR_NUM_CAPS_MASK;
+
+    /*
+     * Dword 0 is the CXL Capability Array Header;
+     * capability entries start at dword 1.
+     */
+    for (i = 1; i <= num_caps; i++) {
+        uint32_t entry, hdmc, hdm_offset;
+        uint8_t count;
+
+        if (!vfio_cxl_read_le32(vbasedev->fd, base + i * sizeof(entry),
+                                &entry)) {
+            error_setg(errp, "vfio-cxl: failed to read cap entry %u", i);
+            return false;
+        }
+
+        if ((entry & CXL_CM_CAP_ENTRY_ID_MASK) != CXL_CM_CAP_ID_HDM) {
+            continue;
+        }
+
+        hdm_offset = (entry >> CXL_CM_CAP_ENTRY_PTR_SHIFT) &
+                     CXL_CM_CAP_ENTRY_PTR_MASK;
+
+        /*
+         * The capability body cannot live inside the array that points to
+         * it; a zero or too-small pointer would make the HDMC read below
+         * return array contents instead.
+         */
+        if (hdm_offset < (num_caps + 1) * sizeof(uint32_t)) {
+            error_setg(errp, "vfio-cxl: HDM Decoder capability offset 0x%x "
+                       "overlaps the CXL Capability Array", hdm_offset);
+            return false;
+        }
+
+        if (!vfio_cxl_read_le32(vbasedev->fd, base + hdm_offset, &hdmc)) {
+            error_setg(errp, "vfio-cxl: failed to read HDMC register");
+            return false;
+        }
+
+        if (!vfio_cxl_hdm_decoder_count(hdmc & CXL_HDMC_DECODER_COUNT_MASK,
+                                        &count)) {
+            error_setg(errp, "vfio-cxl: reserved HDM decoder count "
+                       "encoding 0x%x",
+                       hdmc & CXL_HDMC_DECODER_COUNT_MASK);
+            return false;
+        }
+
+        /* Publish results only once everything has been validated. */
+        cxl->hdm_decoder_offset = hdm_offset;
+        cxl->hdm_count = count;
+        return true;
+    }
+
+    error_setg(errp, "vfio-cxl: HDM Decoder capability not found in COMP_REGS");
+    return false;
+}
+
+/*
+ * vfio_cxl_setup - detect a CXL Type-2 device and set up its VFIO regions.
+ *
+ * Does nothing (and succeeds) unless VFIO_DEVICE_FLAGS_CXL is set in the
+ * device flags.  Otherwise it reads the vfio_device_info_cap_cxl entry from
+ * the device-info capability chain, sets up and mmaps the DPA region, sets
+ * up the COMP_REGS region, and derives the HDM decoder offset and count
+ * from COMP_REGS.
+ *
+ * Regions set up before a later step fails are not torn down here;
+ * vfio_pci_put_device() releases any region whose ->mem is set.
+ *
+ * @vdev: device being realized; vdev->cxl is filled in
+ * @errp: set on failure
+ *
+ * Returns true on success or when the device is not CXL, false on failure.
+ */
+static bool vfio_cxl_setup(VFIOPCIDevice *vdev, Error **errp)
+{
+    ERRP_GUARD();
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    VFIOCXL *cxl = &vdev->cxl;
+    g_autofree struct vfio_device_info *info = NULL;
+    struct vfio_info_cap_header *hdr;
+    struct vfio_device_info_cap_cxl *cap;
+    /*
+     * NOTE(review): not g_autofree.  In current QEMU,
+     * vfio_device_get_region_info() hands back the entry cached in the
+     * VFIODevice, which the device frees itself; freeing it here would be
+     * a double free.  Confirm against hw/vfio/device.c in the target tree.
+     */
+    struct vfio_region_info *region_info = NULL;
+    struct vfio_region_info *comp_info = NULL;
+    int ret;
+
+    if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_CXL)) {
+        return true;
+    }
+
+    info = vfio_get_device_info(vbasedev->fd);
+    if (!info) {
+        error_setg(errp, "vfio-cxl: failed to get device info");
+        return false;
+    }
+
+    hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_CXL);
+    if (!hdr) {
+        error_setg(errp, "vfio-cxl: CXL capability not found in device info");
+        return false;
+    }
+    cap = (void *)hdr;
+
+    /* An all-ones index means the kernel did not expose that region. */
+    if (cap->dpa_region_index == (uint32_t)-1 ||
+        cap->comp_regs_region_index == (uint32_t)-1) {
+        error_setg(errp, "vfio-cxl: kernel did not provide region indices "
+                   "(dpa=%u comp=%u)",
+                   cap->dpa_region_index, cap->comp_regs_region_index);
+        return false;
+    }
+
+    cxl->hdm_regs_bar_index = cap->hdm_regs_bar_index;
+    cxl->hdm_regs_offset = cap->hdm_regs_offset;
+
+    /* DPA region */
+    ret = vfio_device_get_region_info(vbasedev, cap->dpa_region_index,
+                                      &region_info);
+    if (ret || !region_info) {
+        error_setg(errp, "vfio-cxl: failed to get DPA region info");
+        return false;
+    }
+    ret = vfio_region_setup(OBJECT(vdev), vbasedev, &cxl->region,
+                            region_info->index, "cxl-dpa", errp);
+    if (ret) {
+        /*
+         * vfio_region_setup() was given errp and may already have filled
+         * it in; error_setg() on an Error that is already set aborts, so
+         * add context to the existing error instead.
+         */
+        if (*errp) {
+            error_prepend(errp, "vfio-cxl: failed to set up DPA region: ");
+        } else {
+            error_setg(errp, "vfio-cxl: failed to set up DPA region");
+        }
+        return false;
+    }
+    cxl->dpa_size = region_info->size;
+
+    if (vfio_region_mmap(&cxl->region)) {
+        error_setg(errp, "vfio-cxl: failed to mmap DPA region for %s",
+                   vbasedev->name);
+        return false;
+    }
+
+    /* COMP_REGS region (HDM decoder shadow) */
+    ret = vfio_device_get_region_info(vbasedev, cap->comp_regs_region_index,
+                                      &comp_info);
+    if (ret || !comp_info) {
+        error_setg(errp, "vfio-cxl: failed to get COMP_REGS region info");
+        return false;
+    }
+    ret = vfio_region_setup(OBJECT(vdev), vbasedev, &cxl->comp_regs_region,
+                            comp_info->index, "cxl-comp-regs", errp);
+    if (ret) {
+        /* Same rule as for the DPA region: never overwrite a set error. */
+        if (*errp) {
+            error_prepend(errp,
+                          "vfio-cxl: failed to set up COMP_REGS region: ");
+        } else {
+            error_setg(errp, "vfio-cxl: failed to set up COMP_REGS region");
+        }
+        return false;
+    }
+    cxl->hdm_regs_size = comp_info->size;
+
+    if (!vfio_cxl_derive_hdm_info(vbasedev, cxl, errp)) {
+        return false;
+    }
+
+    trace_vfio_cxl_setup_params(vbasedev->name, cxl->hdm_regs_bar_index,
+                                cxl->hdm_regs_offset, cxl->hdm_regs_size,
+                                cxl->dpa_size);
+    return true;
+}
+
+
static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
{
Error *err = NULL;
@@ -3508,6 +3718,10 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
goto error;
}
+ if (!vfio_cxl_setup(vdev, errp)) {
+ goto error;
+ }
+
if (!vfio_pci_config_setup(vdev, errp)) {
goto error;
}
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index c3a1f53d35..f3906f0c53 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -122,6 +122,19 @@ typedef struct VFIOMSIXInfo {
OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_DEVICE)
+typedef struct VFIOCXL { /* per-device CXL Type-2 passthrough state */
+ uint8_t hdm_regs_bar_index; /* copied from the kernel's CXL info cap */
+ uint64_t hdm_regs_offset; /* copied from the kernel's CXL info cap */
+ uint64_t hdm_regs_size; /* size of the COMP_REGS region */
+ uint64_t hdm_decoder_offset; /* HDM cap byte offset within COMP_REGS */
+ uint8_t hdm_count; /* number of HDM decoders, decoded from HDMC */
+ uint64_t dpa_size; /* size of the DPA region */
+ hwaddr fmws_base; /* GPA base programmed into HDM decoder 0 */
+ bool dpa_in_system_mem; /* DPA region is a subregion of system memory */
+ VFIORegion region; /* DPA region ("cxl-dpa") */
+ VFIORegion comp_regs_region; /* COMP_REGS region ("cxl-comp-regs") */
+} VFIOCXL;
+
struct VFIOPCIDevice {
PCIDevice parent_obj;
@@ -191,6 +204,7 @@ struct VFIOPCIDevice {
VFIODisplay *dpy;
Notifier irqchip_change_notifier;
VFIOPCICPR cpr;
+ VFIOCXL cxl;
};
/* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 846e3625c5..3678481a8e 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -197,3 +197,7 @@ vfio_device_get_region_info_type(const char *name, int index, uint32_t type, uin
vfio_device_reset_handler(void) ""
vfio_device_attach(const char *name, int group_id) " (%s) group %d"
vfio_device_detach(const char *name, int group_id) " (%s) group %d"
+
+# pci.c CXL Type-2 passthrough
+vfio_cxl_setup_params(const char *name, uint8_t bar, uint64_t hdm_off, uint64_t hdm_sz, uint64_t dpa_sz) " (%s) hdm_bar=%u hdm_regs_offset=0x%"PRIx64" hdm_regs_size=0x%"PRIx64" dpa_size=0x%"PRIx64
+vfio_cxl_put_device(const char *name) " (%s) removing DPA region from system memory"
--
2.25.1
next prev parent reply other threads:[~2026-04-27 18:14 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-27 18:12 [RFC 0/9] QEMU: CXL Type-2 device passthrough via vfio-pci mhonap
2026-04-27 18:12 ` [RFC 1/9] hw/arm/virt: Add CXL FMWS PA window for device memory mhonap
2026-04-27 18:12 ` [RFC 2/9] cxl: Add preserve_config to pxb-cxl OSC method mhonap
2026-04-27 18:12 ` [RFC 3/9] linux-headers: Update vfio.h for CXL Type-2 device passthrough mhonap
2026-04-27 18:12 ` [RFC 4/9] hw/vfio/region: Add vfio_region_setup_with_ops() for custom region ops mhonap
2026-04-27 18:12 ` mhonap [this message]
2026-04-27 18:12 ` [RFC 6/9] hw/vfio/pci: Wire CXL component-register BAR with COMP_REGS overlay mhonap
2026-04-27 18:12 ` [RFC 7/9] hw/vfio+cxl: Program HDM decoder 0 at machine_done for firmware-committed devices mhonap
2026-04-27 18:12 ` [RFC 8/9] hw/arm/smmu-common: Allow pxb-cxl as SMMUv3 primary bus mhonap
2026-04-27 18:12 ` [RFC 9/9] vfio/listener: Skip DMA mapping for VFIO-owned RAM-device regions mhonap
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260427181235.3003865-6-mhonap@nvidia.com \
--to=mhonap@nvidia.com \
--cc=alejandro.lucero-palau@amd.com \
--cc=alwilliamson@nvidia.com \
--cc=anisinha@redhat.com \
--cc=ankita@nvidia.com \
--cc=cjia@nvidia.com \
--cc=clg@redhat.com \
--cc=cohuck@redhat.com \
--cc=dan.j.williams@intel.com \
--cc=dave.jiang@intel.com \
--cc=eric.auger@redhat.com \
--cc=fan.ni@samsung.com \
--cc=imammedo@redhat.com \
--cc=jonathan.cameron@huawei.com \
--cc=kjaju@nvidia.com \
--cc=kvm@vger.kernel.org \
--cc=linux-cxl@vger.kernel.org \
--cc=marcel.apfelbaum@gmail.com \
--cc=mst@redhat.com \
--cc=pbonzini@redhat.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-arm@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=richard.henderson@linaro.org \
--cc=shannon.zhaosl@gmail.com \
--cc=skolothumtho@nvidia.com \
--cc=targupta@nvidia.com \
--cc=vsethi@nvidia.com \
--cc=zhiw@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox