From: <mhonap@nvidia.com>
To: <alwilliamson@nvidia.com>, <skolothumtho@nvidia.com>,
<ankita@nvidia.com>, <mst@redhat.com>, <imammedo@redhat.com>,
<anisinha@redhat.com>, <eric.auger@redhat.com>,
<peter.maydell@linaro.org>, <shannon.zhaosl@gmail.com>,
<jonathan.cameron@huawei.com>, <fan.ni@samsung.com>,
<pbonzini@redhat.com>, <richard.henderson@linaro.org>,
<marcel.apfelbaum@gmail.com>, <clg@redhat.com>,
<cohuck@redhat.com>, <dan.j.williams@intel.com>,
<dave.jiang@intel.com>, <alejandro.lucero-palau@amd.com>
Cc: <vsethi@nvidia.com>, <cjia@nvidia.com>, <targupta@nvidia.com>,
<zhiw@nvidia.com>, <kjaju@nvidia.com>,
<linux-cxl@vger.kernel.org>, <kvm@vger.kernel.org>,
<qemu-devel@nongnu.org>, <qemu-arm@nongnu.org>,
"Manish Honap" <mhonap@nvidia.com>
Subject: [RFC 7/9] hw/vfio+cxl: Program HDM decoder 0 at machine_done for firmware-committed devices
Date: Mon, 27 Apr 2026 23:42:33 +0530 [thread overview]
Message-ID: <20260427181235.3003865-8-mhonap@nvidia.com> (raw)
In-Reply-To: <20260427181235.3003865-1-mhonap@nvidia.com>
From: Manish Honap <mhonap@nvidia.com>
setup_locked_hdm() runs as a machine_done notifier after all devices
have been realized. It programs HDM decoder 0 with the CFMWS base
address so the guest can fault into device memory from the first
instruction.
The notifier is only registered when the kernel reports the device as
firmware-committed (VFIO_CXL_CAP_FIRMWARE_COMMITTED). The host is
responsible for HDM decoder programming; the guest has no mechanism to
remap host physical address mappings.
The function uses cxl->fmws_base (set by the optional cxl-fmws-base
device property) if non-zero; otherwise it falls back to the
cxl_fmws_base global captured by cxl_fmws_set_memmap() during machine
memory-map init. If neither is set, it warns and returns without
programming anything.
If COMMIT_LOCK is set in decoder 0 CTRL at machine_done time (possibly
left over from a prior FLR), it is cleared before writing BASE so the
subsequent write is not blocked. COMMIT_LOCK is set again after programming
so the hardware enforces the committed base.
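The read_region() return value is checked; a failed read aborts programming
rather than continuing with an uninitialized ctrl. Every write_region()
failure is reported and stops the sequence at that point, before the DPA
region is mapped into system memory. Note that a failure after the first
BASE write can still leave the decoder registers partially updated.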
Add cxl_fmws_base as a hwaddr global in cxl-host.c (and a stub in
cxl-host-stubs.c). It is set once by cxl_fmws_set_memmap() and read
later at machine_done time.
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
hw/cxl/cxl-host-stubs.c | 2 +
hw/cxl/cxl-host.c | 8 ++
hw/vfio/pci.c | 176 +++++++++++++++++++++++++++++++++++++-
hw/vfio/pci.h | 1 +
hw/vfio/trace-events | 1 +
include/hw/cxl/cxl_host.h | 10 +++
6 files changed, 196 insertions(+), 2 deletions(-)
diff --git a/hw/cxl/cxl-host-stubs.c b/hw/cxl/cxl-host-stubs.c
index c015baac81..0294d484c0 100644
--- a/hw/cxl/cxl-host-stubs.c
+++ b/hw/cxl/cxl-host-stubs.c
@@ -17,4 +17,6 @@ hwaddr cxl_fmws_set_memmap(hwaddr base, hwaddr max_addr)
};
void cxl_fmws_update_mmio(void) {};
+hwaddr cxl_fmws_base;
+
const MemoryRegionOps cfmws_ops;
diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index a94b893e99..f7e933f452 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -429,11 +429,19 @@ void cxl_fmws_update_mmio(void)
object_child_foreach_recursive(object_get_root(), cxl_fmws_mmio_map, NULL);
}
+/*
+ * GPA base of the first CXL Fixed Memory Window region placed in the memory
+ * map by cxl_fmws_set_memmap(). Set once at machine memory-map init time.
+ */
+hwaddr cxl_fmws_base;
+
hwaddr cxl_fmws_set_memmap(hwaddr base, hwaddr max_addr)
{
GSList *cfmws_list, *iter;
CXLFixedWindow *fw;
+ cxl_fmws_base = base;
+
cfmws_list = cxl_fmws_get_all_sorted();
for (iter = cfmws_list; iter; iter = iter->next) {
fw = CXL_FMW(iter->data);
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0270de61d2..2595229ea5 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -25,6 +25,7 @@
#include "hw/core/hw-error.h"
#include "hw/core/iommu.h"
#include "hw/cxl/cxl_component.h"
+#include "hw/cxl/cxl_host.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_bridge.h"
@@ -3016,6 +3017,90 @@ static VFIODeviceOps vfio_pci_ops = {
/* HDM Decoder BASE_LO: bits [31:28] hold address bits [31:28] */
#define CXL_HDM_BASE_LO_ADDR_MASK 0xF0000000U
+/*
+ * Read one 32-bit register from a VFIO region through the device fd.
+ *
+ * @region: region whose vbasedev fd and fd_offset locate the register
+ * @val:    receives the value in host byte order; written only on success
+ * @offset: byte offset of the register inside the region
+ *
+ * Returns true on success.  If pread() does not transfer exactly four bytes
+ * an error is reported and false is returned.
+ */
+static bool read_region(VFIORegion *region, uint32_t *val, uint64_t offset)
+{
+    VFIODevice *vbasedev = region->vbasedev;
+    uint32_t raw;
+    ssize_t nread;
+
+    nread = pread(vbasedev->fd, &raw, sizeof(raw),
+                  region->fd_offset + offset);
+    if (nread == sizeof(raw)) {
+        /* CXL registers are little-endian; hand back host byte order. */
+        *val = le32_to_cpu(raw);
+        return true;
+    }
+
+    error_report("vfio-cxl: pread %s offset 0x%"PRIx64" failed: %m",
+                 vbasedev->name, offset);
+    return false;
+}
+
+/*
+ * Write one 32-bit register to a VFIO region through the device fd.
+ *
+ * @region: region whose vbasedev fd and fd_offset locate the register
+ * @val:    points to the value to write, in host byte order; not modified
+ * @offset: byte offset of the register inside the region
+ *
+ * Returns true on success.  If pwrite() does not transfer exactly four bytes
+ * an error is reported and false is returned.
+ */
+static bool write_region(VFIORegion *region, uint32_t *val, uint64_t offset)
+{
+    VFIODevice *vbasedev = region->vbasedev;
+    /* CXL registers are little-endian; convert before handing to pwrite. */
+    uint32_t raw = cpu_to_le32(*val);
+    ssize_t nwritten;
+
+    nwritten = pwrite(vbasedev->fd, &raw, sizeof(raw),
+                      region->fd_offset + offset);
+    if (nwritten == sizeof(raw)) {
+        return true;
+    }
+
+    error_report("vfio-cxl: pwrite %s offset 0x%"PRIx64" failed: %m",
+                 vbasedev->name, offset);
+    return false;
+}
+
+/*
+ * Direct pread/pwrite MemoryRegionOps for the CXL Component Register shadow.
+ *
+ * The generic vfio_region_ops routes guest MMIO through
+ * vfio_device_io_region_read() which returns EINVAL for vendor region
+ * index 10 at runtime. The same pread() issued directly via
+ * region->fd_offset works fine, as vfio_cxl_derive_hdm_info() already does.
+ *
+ * The kernel enforces 4-byte aligned, 4-byte accesses on this region;
+ * valid and impl min/max_access_size are both set to 4 to match.
+ */
+/*
+ * MemoryRegionOps read callback: fetch @size bytes at @addr from the region
+ * with a direct pread() on the device fd.
+ *
+ * A failed or short pread() is reported but not otherwise handled: the
+ * buffer, pre-set to all-ones, is still converted, traced and returned.
+ */
+static uint64_t vfio_cxl_comp_regs_mr_read(void *opaque, hwaddr addr,
+                                           unsigned size)
+{
+    VFIORegion *region = opaque;
+    VFIODevice *vbasedev = region->vbasedev;
+    uint32_t raw = 0xFFFFFFFFU;
+    uint32_t host_val;
+    ssize_t nread;
+
+    nread = pread(vbasedev->fd, &raw, size, region->fd_offset + addr);
+    if (nread != size) {
+        error_report("vfio-cxl: %s COMP_REGS read at 0x%"HWADDR_PRIx
+                     " failed: %m", vbasedev->name, addr);
+    }
+
+    /* Little-endian register image to host byte order. */
+    host_val = le32_to_cpu(raw);
+    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, host_val);
+    return host_val;
+}
+
+/*
+ * MemoryRegionOps write callback: store the low 32 bits of @data at @addr in
+ * the region with a direct pwrite() on the device fd.
+ *
+ * A failed or short pwrite() is reported; the access is traced either way.
+ */
+static void vfio_cxl_comp_regs_mr_write(void *opaque, hwaddr addr,
+                                        uint64_t data, unsigned size)
+{
+    VFIORegion *region = opaque;
+    VFIODevice *vbasedev = region->vbasedev;
+    /* Host byte order to the little-endian register image. */
+    uint32_t raw = cpu_to_le32((uint32_t)data);
+    ssize_t nwritten;
+
+    nwritten = pwrite(vbasedev->fd, &raw, size, region->fd_offset + addr);
+    if (nwritten != size) {
+        error_report("vfio-cxl: %s COMP_REGS write at 0x%"HWADDR_PRIx
+                     " failed: %m", vbasedev->name, addr);
+    }
+
+    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
+}
+
+/*
+ * MemoryRegionOps for the CXL component register region: a little-endian
+ * device whose accepted (.valid) and implemented (.impl) access sizes are
+ * both fixed at exactly four bytes.
+ */
+static const MemoryRegionOps vfio_cxl_comp_regs_mr_ops = {
+    .read = vfio_cxl_comp_regs_mr_read,
+    .write = vfio_cxl_comp_regs_mr_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
{
VFIODevice *vbasedev = &vdev->vbasedev;
@@ -3404,6 +3489,78 @@ static bool vfio_cxl_derive_hdm_info(VFIODevice *vbasedev, VFIOCXL *cxl,
return false;
}
+/*
+ * setup_locked_hdm - machine_done notifier that programs HDM decoder 0 with
+ * the FMWS base address and then maps the DPA region at that GPA, so the
+ * guest can access DPA through a stable GPA.
+ *
+ * @notifier: the machine_done member embedded in a VFIOCXL
+ * @data:     unused
+ *
+ * Base selection: cxl->fmws_base (set by the optional cxl-fmws-base device
+ * property) is used if non-zero; otherwise it is filled in from the
+ * cxl_fmws_base global captured by cxl_fmws_set_memmap().  If both are zero
+ * the notifier warns and returns without programming anything.
+ *
+ * Alignment: BASE_LOW carries only address bits [31:28]
+ * (CXL_HDM_BASE_LO_ADDR_MASK), so a base with any of bits [27:0] set cannot
+ * be represented.  Such a base is rejected before the device is touched;
+ * otherwise the decoder would be programmed with a truncated base while the
+ * DPA region was mapped at the untruncated GPA.
+ *
+ * Register sequence: read CTRL; if COMMIT_LOCK is set, clear it; write
+ * BASE_LOW, BASE_HIGH, then CTRL with COMMIT | COMMIT_LOCK.  The sequence
+ * stops at the first failed access, so a failure after an earlier write has
+ * succeeded can leave the decoder registers partially updated.  The DPA
+ * region is added to system memory only after every write has succeeded.
+ */
+static void setup_locked_hdm(Notifier *notifier, void *data)
+{
+    VFIOCXL *cxl = container_of(notifier, VFIOCXL, machine_done);
+    VFIORegion *region = &cxl->comp_regs_region;
+    MemoryRegion *sys_mem = get_system_memory();
+    uint64_t hdm_base = cxl->hdm_decoder_offset;
+    uint32_t base_lo, base_hi, ctrl;
+
+    if (!cxl->fmws_base) {
+        cxl->fmws_base = cxl_fmws_base;
+        if (!cxl->fmws_base) {
+            warn_report("vfio-cxl %s: CXL FMWS base not available",
+                        region->vbasedev->name);
+            return;
+        }
+    }
+
+    /*
+     * Bits [27:0] have no home in the decoder BASE registers.  Refuse an
+     * unaligned base instead of silently dropping them below.
+     */
+    if ((uint32_t)cxl->fmws_base & ~CXL_HDM_BASE_LO_ADDR_MASK) {
+        error_report("vfio-cxl: %s FMWS base 0x%"PRIx64" is not 256 MiB "
+                     "aligned; not programming HDM decoder 0",
+                     region->vbasedev->name, (uint64_t)cxl->fmws_base);
+        return;
+    }
+
+    if (!read_region(region, &ctrl,
+                     hdm_base + CXL_HDM_DECODER0_CTRL_OFFSET(0))) {
+        error_report("vfio-cxl: %s failed to read HDM decoder 0 CTRL",
+                     region->vbasedev->name);
+        return;
+    }
+
+    /*
+     * The kernel should have cleared COMMIT_LOCK (bit 8) in the virtual
+     * snapshot during open.  If it is still set, warn and clear it here so
+     * the subsequent BASE write is not blocked.
+     */
+    if (ctrl & CXL_HDM_CTRL_COMMIT_LOCK) {
+        warn_report("vfio-cxl: %s COMMIT_LOCK set in HDM decoder 0 CTRL at "
+                    "machine_done; clearing before programming guest GPA",
+                    region->vbasedev->name);
+        ctrl &= ~CXL_HDM_CTRL_COMMIT_LOCK;
+        if (!write_region(region, &ctrl,
+                          hdm_base + CXL_HDM_DECODER0_CTRL_OFFSET(0))) {
+            /* write_region() has already reported the failure. */
+            return;
+        }
+    }
+
+    base_lo = (uint32_t)(cxl->fmws_base & CXL_HDM_BASE_LO_ADDR_MASK);
+    base_hi = (uint32_t)(cxl->fmws_base >> 32);
+    /* Commit the new base and re-arm the lock in the same CTRL write. */
+    ctrl |= CXL_HDM_CTRL_COMMIT | CXL_HDM_CTRL_COMMIT_LOCK;
+
+    /* Short-circuit evaluation stops at the first failed register write. */
+    if (!write_region(region, &base_lo, hdm_base +
+                      CXL_HDM_DECODER0_BASE_LOW_OFFSET(0)) ||
+        !write_region(region, &base_hi, hdm_base +
+                      CXL_HDM_DECODER0_BASE_HIGH_OFFSET(0)) ||
+        !write_region(region, &ctrl, hdm_base +
+                      CXL_HDM_DECODER0_CTRL_OFFSET(0))) {
+        error_report("vfio-cxl: %s failed to program HDM decoder 0",
+                     region->vbasedev->name);
+        return;
+    }
+
+    trace_vfio_cxl_locked_hdm(region->vbasedev->name,
+                              cxl->fmws_base, base_lo, base_hi, ctrl);
+
+    /* Expose the DPA region to the guest at the GPA just programmed. */
+    memory_region_transaction_begin();
+    memory_region_add_subregion(sys_mem, cxl->fmws_base, cxl->region.mem);
+    memory_region_transaction_commit();
+    cxl->dpa_in_system_mem = true;
+}
+
static bool vfio_cxl_setup(VFIOPCIDevice *vdev, Error **errp)
{
VFIODevice *vbasedev = &vdev->vbasedev;
@@ -3471,8 +3628,11 @@ static bool vfio_cxl_setup(VFIOPCIDevice *vdev, Error **errp)
error_setg(errp, "vfio-cxl: failed to get COMP_REGS region info");
return false;
}
- ret = vfio_region_setup(OBJECT(vdev), vbasedev, &cxl->comp_regs_region,
- comp_info->index, "cxl-comp-regs", errp);
+
+ ret = vfio_region_setup_with_ops(OBJECT(vdev), vbasedev,
+ &cxl->comp_regs_region,
+ comp_info->index, "cxl-comp-regs",
+ errp, &vfio_cxl_comp_regs_mr_ops);
if (ret) {
error_setg(errp, "vfio-cxl: failed to set up COMP_REGS region");
return false;
@@ -3486,6 +3646,18 @@ static bool vfio_cxl_setup(VFIOPCIDevice *vdev, Error **errp)
trace_vfio_cxl_setup_params(vbasedev->name, cxl->hdm_regs_bar_index,
cxl->hdm_regs_offset, cxl->hdm_regs_size,
cxl->dpa_size);
+
+ /*
+ * Only pre-program the HDM decoder if the kernel reported the device as
+ * firmware-committed. Non-committed devices need guest driver involvement
+ * to commit the decoder; registering the notifier for them would write an
+ * uncommitted BASE value that the hardware ignores.
+ */
+ if (cap->flags & VFIO_CXL_CAP_FIRMWARE_COMMITTED) {
+ cxl->machine_done.notify = setup_locked_hdm;
+ qemu_add_machine_init_done_notifier(&cxl->machine_done);
+ }
+
return true;
}
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index f3906f0c53..5667c6ec17 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -133,6 +133,7 @@ typedef struct VFIOCXL {
bool dpa_in_system_mem;
VFIORegion region;
VFIORegion comp_regs_region;
+ Notifier machine_done;
} VFIOCXL;
struct VFIOPCIDevice {
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 3bced3cebb..174e577837 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -202,3 +202,4 @@ vfio_device_detach(const char *name, int group_id) " (%s) group %d"
vfio_cxl_setup_params(const char *name, uint8_t bar, uint64_t hdm_off, uint64_t hdm_sz, uint64_t dpa_sz) " (%s) hdm_bar=%u hdm_regs_offset=0x%"PRIx64" hdm_regs_size=0x%"PRIx64" dpa_size=0x%"PRIx64
vfio_cxl_put_device(const char *name) " (%s) removing DPA region from system memory"
vfio_cxl_bar_subregion(const char *name, int nr, uint64_t off) " (%s) BAR%d comp_regs overlay at BAR offset 0x%"PRIx64
+vfio_cxl_locked_hdm(const char *name, uint64_t fmws, uint32_t blo, uint32_t bhi, uint32_t ctrl) " (%s) fmws_base=0x%"PRIx64" wrote decoder0 base_lo=0x%08x base_hi=0x%08x ctrl=0x%08x"
diff --git a/include/hw/cxl/cxl_host.h b/include/hw/cxl/cxl_host.h
index 21619bb748..f890a5c0b9 100644
--- a/include/hw/cxl/cxl_host.h
+++ b/include/hw/cxl/cxl_host.h
@@ -20,6 +20,16 @@ hwaddr cxl_fmws_set_memmap(hwaddr base, hwaddr max_addr);
void cxl_fmws_update_mmio(void);
GSList *cxl_fmws_get_all_sorted(void);
+/**
+ * cxl_fmws_base - GPA base of the first CXL Fixed Memory Window region.
+ *
+ * Set by cxl_fmws_set_memmap() to the base address it receives (typically
+ * ROUND_UP(highest_gpa + 1, 256 MiB) on ARM virt). Valid after the
+ * machine memory-map init callback returns, i.e. at machine_done time.
+ * Zero when no machine has called cxl_fmws_set_memmap() (stub builds).
+ */
+extern hwaddr cxl_fmws_base;
+
extern const MemoryRegionOps cfmws_ops;
#endif
--
2.25.1
next prev parent reply other threads:[~2026-04-27 18:14 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-27 18:12 [RFC 0/9] QEMU: CXL Type-2 device passthrough via vfio-pci mhonap
2026-04-27 18:12 ` [RFC 1/9] hw/arm/virt: Add CXL FMWS PA window for device memory mhonap
2026-04-27 18:12 ` [RFC 2/9] cxl: Add preserve_config to pxb-cxl OSC method mhonap
2026-04-27 18:12 ` [RFC 3/9] linux-headers: Update vfio.h for CXL Type-2 device passthrough mhonap
2026-04-27 18:12 ` [RFC 4/9] hw/vfio/region: Add vfio_region_setup_with_ops() for custom region ops mhonap
2026-04-27 18:12 ` [RFC 5/9] hw/vfio/pci: Add CXL Type-2 device detection and region setup mhonap
2026-04-27 18:12 ` [RFC 6/9] hw/vfio/pci: Wire CXL component-register BAR with COMP_REGS overlay mhonap
2026-04-27 18:12 ` mhonap [this message]
2026-04-27 18:12 ` [RFC 8/9] hw/arm/smmu-common: Allow pxb-cxl as SMMUv3 primary bus mhonap
2026-04-27 18:12 ` [RFC 9/9] vfio/listener: Skip DMA mapping for VFIO-owned RAM-device regions mhonap
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260427181235.3003865-8-mhonap@nvidia.com \
--to=mhonap@nvidia.com \
--cc=alejandro.lucero-palau@amd.com \
--cc=alwilliamson@nvidia.com \
--cc=anisinha@redhat.com \
--cc=ankita@nvidia.com \
--cc=cjia@nvidia.com \
--cc=clg@redhat.com \
--cc=cohuck@redhat.com \
--cc=dan.j.williams@intel.com \
--cc=dave.jiang@intel.com \
--cc=eric.auger@redhat.com \
--cc=fan.ni@samsung.com \
--cc=imammedo@redhat.com \
--cc=jonathan.cameron@huawei.com \
--cc=kjaju@nvidia.com \
--cc=kvm@vger.kernel.org \
--cc=linux-cxl@vger.kernel.org \
--cc=marcel.apfelbaum@gmail.com \
--cc=mst@redhat.com \
--cc=pbonzini@redhat.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-arm@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=richard.henderson@linaro.org \
--cc=shannon.zhaosl@gmail.com \
--cc=skolothumtho@nvidia.com \
--cc=targupta@nvidia.com \
--cc=vsethi@nvidia.com \
--cc=zhiw@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox