public inbox for linux-kernel@vger.kernel.org
From: <mhonap@nvidia.com>
To: <aniketa@nvidia.com>, <ankita@nvidia.com>,
	<alwilliamson@nvidia.com>, <vsethi@nvidia.com>, <jgg@nvidia.com>,
	<mochs@nvidia.com>, <skolothumtho@nvidia.com>,
	<alejandro.lucero-palau@amd.com>, <dave@stgolabs.net>,
	<jonathan.cameron@huawei.com>, <dave.jiang@intel.com>,
	<alison.schofield@intel.com>, <vishal.l.verma@intel.com>,
	<ira.weiny@intel.com>, <dan.j.williams@intel.com>, <jgg@ziepe.ca>,
	<yishaih@nvidia.com>, <kevin.tian@intel.com>
Cc: <cjia@nvidia.com>, <kwankhede@nvidia.com>, <targupta@nvidia.com>,
	<zhiw@nvidia.com>, <kjaju@nvidia.com>,
	<linux-kernel@vger.kernel.org>, <linux-cxl@vger.kernel.org>,
	<kvm@vger.kernel.org>, <mhonap@nvidia.com>
Subject: [RFC v2 12/15] vfio/cxl: introduce the emulation of CXL configuration space
Date: Tue, 9 Dec 2025 22:20:16 +0530	[thread overview]
Message-ID: <20251209165019.2643142-13-mhonap@nvidia.com> (raw)
In-Reply-To: <20251209165019.2643142-1-mhonap@nvidia.com>

From: Zhi Wang <zhiw@nvidia.com>

CXL devices expose CXL DVSEC registers in their PCI configuration space.
Many of these registers affect device behavior, e.g. enabling
CXL.io/CXL.mem/CXL.cache.

However, these registers are owned by the host, so a virtualization
policy must be applied when handling accesses from the guest.

Introduce emulation of the CXL configuration space to handle guest
accesses to the virtual CXL configuration space.
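
As a simplified illustration of the policy (a sketch only; the names below
are illustrative and not part of this patch): a guest write to the DVSEC
CXL Control register has its reserved bits masked off, CXL.io is kept
enabled, and the result lands only in the per-device shadow copy of the
configuration space rather than in the HW register:

	#include <stdint.h>
	#include <string.h>

	#define CXL_CTRL_IO_EN	(1u << 1)	/* CXL.io enable bit */
	#define CXL_CTRL_RSVD	((1u << 13) | (1u << 15))	/* reserved bits */

	/* apply the virtualization policy to a 16-bit guest config write */
	static void shadow_cxl_control_write(uint8_t *shadow, uint16_t off,
					     uint16_t guest_val)
	{
		guest_val &= ~CXL_CTRL_RSVD;	/* guest cannot set reserved bits */
		guest_val |= CXL_CTRL_IO_EN;	/* CXL.io stays enabled */
		memcpy(shadow + off, &guest_val, sizeof(guest_val));
	}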

Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
 drivers/vfio/pci/vfio_cxl_core_emu.c | 340 ++++++++++++++++++++++++++-
 drivers/vfio/pci/vfio_pci_config.c   |  10 +-
 include/linux/vfio_pci_core.h        |   4 +
 3 files changed, 346 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/pci/vfio_cxl_core_emu.c b/drivers/vfio/pci/vfio_cxl_core_emu.c
index 6711ff8975ef..8037737838ba 100644
--- a/drivers/vfio/pci/vfio_cxl_core_emu.c
+++ b/drivers/vfio/pci/vfio_cxl_core_emu.c
@@ -28,6 +28,334 @@ new_reg_block(struct vfio_cxl_core_device *cxl, u64 offset, u64 size,
 	return block;
 }
 
+static int new_config_block(struct vfio_cxl_core_device *cxl, u64 offset,
+			    u64 size, reg_handler_t *read, reg_handler_t *write)
+{
+	struct vfio_emulated_regblock *block;
+
+	block = new_reg_block(cxl, offset, size, read, write);
+	if (IS_ERR(block))
+		return PTR_ERR(block);
+
+	list_add_tail(&block->list, &cxl->config_regblocks_head);
+	return 0;
+}
+
+static ssize_t virt_config_reg_read(struct vfio_cxl_core_device *cxl, void *buf,
+				    u64 offset, u64 size)
+{
+	memcpy(buf, cxl->config_virt + offset, size);
+	return size;
+}
+
+static ssize_t virt_config_reg_write(struct vfio_cxl_core_device *cxl, void *buf,
+				     u64 offset, u64 size)
+{
+	memcpy(cxl->config_virt + offset, buf, size);
+	return size;
+}
+
+static ssize_t hw_config_reg_read(struct vfio_cxl_core_device *cxl, void *buf,
+				  u64 offset, u64 size)
+{
+	return vfio_user_config_read(cxl->pci_core.pdev, offset, buf, size);
+}
+
+static ssize_t hw_config_reg_write(struct vfio_cxl_core_device *cxl, void *buf,
+				   u64 offset, u64 size)
+{
+	__le32 write_val = *(__le32 *)buf;
+
+	return vfio_user_config_write(cxl->pci_core.pdev, offset, write_val, size);
+}
+
+static ssize_t cxl_control_write(struct vfio_cxl_core_device *cxl, void *buf,
+				 u64 offset, u64 size)
+{
+	u16 lock = le16_to_cpu(*(u16 *)(cxl->config_virt + cxl->dvsec + 0x14));
+	u16 cap3 = le16_to_cpu(*(u16 *)(cxl->config_virt + cxl->dvsec + 0x38));
+	u16 new_val = le16_to_cpu(*(u16 *)buf);
+	u16 rev_mask;
+
+	if (WARN_ON_ONCE(size != 2))
+		return -EINVAL;
+
+	/* register is locked */
+	if (lock & BIT(0))
+		return size;
+
+	/* handle reserved bits in the spec */
+	rev_mask = BIT(13) | BIT(15);
+
+	/* no direct p2p cap */
+	if (!(cap3 & BIT(4)))
+		rev_mask |= BIT(12);
+
+	new_val &= ~rev_mask;
+
+	/* CXL.io is always enabled. */
+	new_val |= BIT(1);
+
+	new_val = cpu_to_le16(new_val);
+	memcpy(cxl->config_virt + offset, &new_val, size);
+	return size;
+}
+
+static ssize_t cxl_status_write(struct vfio_cxl_core_device *cxl, void *buf,
+				u64 offset, u64 size)
+{
+	u16 cur_val = le16_to_cpu(*(u16 *)(cxl->config_virt + offset));
+	u16 new_val = le16_to_cpu(*(u16 *)buf);
+	u16 rev_mask = GENMASK(13, 0) | BIT(15);
+
+	if (WARN_ON_ONCE(size != 2))
+		return -EINVAL;
+
+	/* handle reserved bits in the spec */
+	new_val &= ~rev_mask;
+
+	/* emulate RW1C: writing 1 clears the bit, writing 0 preserves it */
+	if (new_val & BIT(14))
+		new_val &= ~BIT(14);
+	else
+		new_val |= cur_val & BIT(14);
+
+	new_val = cpu_to_le16(new_val);
+	memcpy(cxl->config_virt + offset, &new_val, size);
+	return size;
+}
+
+static ssize_t cxl_control_2_write(struct vfio_cxl_core_device *cxl, void *buf,
+				   u64 offset, u64 size)
+{
+	struct pci_dev *pdev = cxl->pci_core.pdev;
+	u16 cap2 = le16_to_cpu(*(u16 *)(cxl->config_virt + cxl->dvsec + 0x16));
+	u16 cap3 = le16_to_cpu(*(u16 *)(cxl->config_virt + cxl->dvsec + 0x38));
+	u16 new_val = le16_to_cpu(*(u16 *)buf);
+	u16 rev_mask = GENMASK(15, 6) | BIT(1) | BIT(2);
+	u16 hw_bits = BIT(0) | BIT(1) | BIT(3);
+	bool initiate_cxl_reset = new_val & BIT(2);
+
+	if (WARN_ON_ONCE(size != 2))
+		return -EINVAL;
+
+	/* no desired volatile HDM state after host reset */
+	if (!(cap3 & BIT(2)))
+		rev_mask |= BIT(4);
+
+	/* no modified completion enable */
+	if (!(cap2 & BIT(6)))
+		rev_mask |= BIT(5);
+
+	/* handle reserved bits in the spec */
+	new_val &= ~rev_mask;
+
+	/* bits go to the HW */
+	hw_bits &= new_val;
+
+	/* update the virt regs */
+	new_val = cpu_to_le16(new_val);
+	memcpy(cxl->config_virt + offset, &new_val, size);
+
+	if (hw_bits)
+		/* pci_write_config_word() takes a CPU-endian value */
+		pci_write_config_word(pdev, offset, hw_bits);
+
+	if (initiate_cxl_reset) {
+		/* TODO: call linux CXL reset */
+	}
+	return size;
+}
+
+static ssize_t cxl_status_2_write(struct vfio_cxl_core_device *cxl, void *buf,
+				  u64 offset, u64 size)
+{
+	struct pci_dev *pdev = cxl->pci_core.pdev;
+	u16 cap3 = le16_to_cpu(*(u16 *)(cxl->config_virt + cxl->dvsec + 0x38));
+	u16 new_val = le16_to_cpu(*(u16 *)buf);
+
+	if (WARN_ON_ONCE(size != 2))
+		return -EINVAL;
+
+	/* write the RW1CS bit through to the HW if supported */
+	if ((cap3 & BIT(2)) && (new_val & BIT(3)))
+		pci_write_config_word(pdev, offset, BIT(3));
+
+	/* no need to update the virt regs; CXL STATUS 2 is read from the HW */
+	return size;
+}
+
+static ssize_t cxl_lock_write(struct vfio_cxl_core_device *cxl, void *buf,
+			      u64 offset, u64 size)
+{
+	u16 cur_val = le16_to_cpu(*(u16 *)(cxl->config_virt + offset));
+	u16 new_val = le16_to_cpu(*(u16 *)buf);
+	u16 rev_mask = GENMASK(15, 1);
+
+	if (WARN_ON_ONCE(size != 2))
+		return -EINVAL;
+
+	/* LOCK can only be cleared by a conventional reset. */
+	if (cur_val & BIT(0))
+		return size;
+
+	/* handle reserved bits in the spec */
+	new_val &= ~rev_mask;
+
+	new_val = cpu_to_le16(new_val);
+	memcpy(cxl->config_virt + offset, &new_val, size);
+	return size;
+}
+
+static ssize_t cxl_base_lo_write(struct vfio_cxl_core_device *cxl, void *buf,
+				 u64 offset, u64 size)
+{
+	u32 new_val = le32_to_cpu(*(u32 *)buf);
+	u32 rev_mask = GENMASK(27, 0);
+
+	if (WARN_ON_ONCE(size != 4))
+		return -EINVAL;
+
+	/* handle reserved bits in the spec */
+	new_val &= ~rev_mask;
+
+	new_val = cpu_to_le32(new_val);
+	memcpy(cxl->config_virt + offset, &new_val, size);
+	return size;
+}
+
+static ssize_t virt_config_reg_ro_write(struct vfio_cxl_core_device *cxl, void *buf,
+					u64 offset, u64 size)
+{
+	return size;
+}
+
+static int setup_config_emulation(struct vfio_cxl_core_device *cxl)
+{
+	u16 offset = 0;
+	int ret;
+
+#define ALLOC_BLOCK(offset, size, read, write) do {		\
+	ret = new_config_block(cxl, offset, size, read, write); \
+	if (ret)						\
+		return ret;					\
+	} while (0)
+
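+	/* PCIe extended capability header */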
+	ALLOC_BLOCK(cxl->dvsec, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
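+	/* DVSEC header 1 */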
+	ALLOC_BLOCK(cxl->dvsec + 0x4, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
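+	/* DVSEC header 2 */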
+	ALLOC_BLOCK(cxl->dvsec + 0x8, 2,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
+	/* CXL CAPABILITY */
+	ALLOC_BLOCK(cxl->dvsec + 0xa, 2,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
+	/* CXL CONTROL */
+	ALLOC_BLOCK(cxl->dvsec + 0xc, 2,
+		    virt_config_reg_read,
+		    cxl_control_write);
+
+	/* CXL STATUS */
+	ALLOC_BLOCK(cxl->dvsec + 0xe, 2,
+		    virt_config_reg_read,
+		    cxl_status_write);
+
+	/* CXL CONTROL 2 */
+	ALLOC_BLOCK(cxl->dvsec + 0x10, 2,
+		    virt_config_reg_read,
+		    cxl_control_2_write);
+
+	/* CXL STATUS 2 */
+	ALLOC_BLOCK(cxl->dvsec + 0x12, 2,
+		    hw_config_reg_read,
+		    cxl_status_2_write);
+
+	/* CXL LOCK */
+	ALLOC_BLOCK(cxl->dvsec + 0x14, 2,
+		    virt_config_reg_read,
+		    cxl_lock_write);
+
+	/* CXL CAPABILITY 2 */
+	ALLOC_BLOCK(cxl->dvsec + 0x16, 2,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
+	/* CXL RANGE 1 SIZE HIGH & LOW */
+	ALLOC_BLOCK(cxl->dvsec + 0x18, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
+	ALLOC_BLOCK(cxl->dvsec + 0x1c, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
+	/* CXL RANGE 1 BASE HIGH */
+	ALLOC_BLOCK(cxl->dvsec + 0x20, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_write);
+
+	/* CXL RANGE 1 BASE LOW */
+	ALLOC_BLOCK(cxl->dvsec + 0x24, 4,
+		    virt_config_reg_read,
+		    cxl_base_lo_write);
+
+	/* CXL RANGE 2 SIZE HIGH & LOW */
+	ALLOC_BLOCK(cxl->dvsec + 0x28, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
+	ALLOC_BLOCK(cxl->dvsec + 0x2c, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
+	/* CXL RANGE 2 BASE HIGH */
+	ALLOC_BLOCK(cxl->dvsec + 0x30, 4,
+		    virt_config_reg_read,
+		    virt_config_reg_write);
+
+	/* CXL RANGE 2 BASE LOW */
+	ALLOC_BLOCK(cxl->dvsec + 0x34, 4,
+		    virt_config_reg_read,
+		    cxl_base_lo_write);
+
+	/* CXL CAPABILITY 3 */
+	ALLOC_BLOCK(cxl->dvsec + 0x38, 2,
+		    virt_config_reg_read,
+		    virt_config_reg_ro_write);
+
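+	/* DOE mailbox registers are passed straight through to the HW */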
+	while ((offset = pci_find_next_ext_capability(cxl->pci_core.pdev,
+						      offset,
+						      PCI_EXT_CAP_ID_DOE))) {
+		ALLOC_BLOCK(offset + PCI_DOE_CTRL, 4,
+			    hw_config_reg_read,
+			    hw_config_reg_write);
+
+		ALLOC_BLOCK(offset + PCI_DOE_STATUS, 4,
+			    hw_config_reg_read,
+			    hw_config_reg_write);
+
+		ALLOC_BLOCK(offset + PCI_DOE_WRITE, 4,
+			    hw_config_reg_read,
+			    hw_config_reg_write);
+
+		ALLOC_BLOCK(offset + PCI_DOE_READ, 4,
+			    hw_config_reg_read,
+			    hw_config_reg_write);
+	}
+
+#undef ALLOC_BLOCK
+
+	return 0;
+}
+
 static int new_mmio_block(struct vfio_cxl_core_device *cxl, u64 offset, u64 size,
 			  reg_handler_t *read, reg_handler_t *write)
 {
@@ -179,10 +507,10 @@ static int setup_mmio_emulation(struct vfio_cxl_core_device *cxl)
 
 	base = hdm_reg_base(cxl);
 
-#define ALLOC_BLOCK(offset, size, read, write) do {			\
-		ret = new_mmio_block(cxl, offset, size, read, write);	\
-		if (ret)						\
-			return ret;					\
+#define ALLOC_BLOCK(offset, size, read, write) do { \
+	ret = new_mmio_block(cxl, offset, size, read, write); \
+	if (ret) \
+		return ret; \
 	} while (0)
 
 	ALLOC_BLOCK(base + 0x4, 4,
@@ -255,6 +583,10 @@ int vfio_cxl_core_setup_register_emulation(struct vfio_cxl_core_device *cxl)
 	INIT_LIST_HEAD(&cxl->config_regblocks_head);
 	INIT_LIST_HEAD(&cxl->mmio_regblocks_head);
 
+	ret = setup_config_emulation(cxl);
+	if (ret)
+		goto err;
+
 	ret = setup_mmio_emulation(cxl);
 	if (ret)
 		goto err;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 8f02f236b5b4..4847d09e58b4 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -120,8 +120,8 @@ struct perm_bits {
 #define	NO_WRITE	0
 #define	ALL_WRITE	0xFFFFFFFFU
 
-static int vfio_user_config_read(struct pci_dev *pdev, int offset,
-				 __le32 *val, int count)
+int vfio_user_config_read(struct pci_dev *pdev, int offset,
+			  __le32 *val, int count)
 {
 	int ret = -EINVAL;
 	u32 tmp_val = 0;
@@ -150,9 +150,10 @@ static int vfio_user_config_read(struct pci_dev *pdev, int offset,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(vfio_user_config_read);
 
-static int vfio_user_config_write(struct pci_dev *pdev, int offset,
-				  __le32 val, int count)
+int vfio_user_config_write(struct pci_dev *pdev, int offset,
+			   __le32 val, int count)
 {
 	int ret = -EINVAL;
 	u32 tmp_val = le32_to_cpu(val);
@@ -171,6 +172,7 @@ static int vfio_user_config_write(struct pci_dev *pdev, int offset,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(vfio_user_config_write);
 
 static int vfio_default_config_read(struct vfio_pci_core_device *vdev, int pos,
 				    int count, struct perm_bits *perm,
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 31fd28626846..8293910e0a96 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -201,6 +201,10 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 			       void __iomem *io, char __user *buf,
 			       loff_t off, size_t count, size_t x_start,
 			       size_t x_end, bool iswrite);
+int vfio_user_config_read(struct pci_dev *pdev, int offset,
+			  __le32 *val, int count);
+int vfio_user_config_write(struct pci_dev *pdev, int offset,
+			   __le32 val, int count);
 bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
 					 loff_t reg_start, size_t reg_cnt,
 					 loff_t *buf_offset,
-- 
2.25.1


Thread overview: 25+ messages
2025-12-09 16:50 [RFC v2 00/15] vfio: introduce vfio-cxl to support CXL type-2 accelerator passthrough mhonap
2025-12-09 16:50 ` [RFC v2 01/15] cxl: factor out cxl_await_range_active() and cxl_media_ready() mhonap
2025-12-22 12:21   ` Jonathan Cameron
2025-12-09 16:50 ` [RFC v2 02/15] cxl: introduce cxl_get_hdm_reg_info() mhonap
2025-12-09 16:50 ` [RFC v2 03/15] cxl: introduce cxl_find_comp_reglock_offset() mhonap
2025-12-09 16:50 ` [RFC v2 04/15] cxl: introduce devm_cxl_del_memdev() mhonap
2025-12-09 16:50 ` [RFC v2 05/15] cxl: introduce cxl_get_committed_regions() mhonap
2025-12-22 12:31   ` Jonathan Cameron
2025-12-09 16:50 ` [RFC v2 06/15] vfio/cxl: introduce vfio-cxl core preludes mhonap
2025-12-22 13:54   ` Jonathan Cameron
2025-12-09 16:50 ` [RFC v2 07/15] vfio/cxl: expose CXL region to the userspace via a new VFIO device region mhonap
2025-12-11 16:06   ` Dave Jiang
2025-12-11 17:31     ` Manish Honap
2025-12-11 18:01       ` Dave Jiang
2025-12-22 14:00   ` Jonathan Cameron
2025-12-09 16:50 ` [RFC v2 08/15] vfio/cxl: discover precommitted CXL region mhonap
2025-12-22 14:09   ` Jonathan Cameron
2025-12-09 16:50 ` [RFC v2 09/15] vfio/cxl: introduce vfio_cxl_core_{read, write}() mhonap
2025-12-09 16:50 ` [RFC v2 10/15] vfio/cxl: introduce the register emulation framework mhonap
2025-12-09 16:50 ` [RFC v2 11/15] vfio/cxl: introduce the emulation of HDM registers mhonap
2025-12-11 18:13   ` Dave Jiang
2025-12-09 16:50 ` mhonap [this message]
2025-12-09 16:50 ` [RFC v2 13/15] vfio/pci: introduce CXL device awareness mhonap
2025-12-09 16:50 ` [RFC v2 14/15] vfio/cxl: VFIO variant driver for QEMU CXL accel device mhonap
2025-12-09 16:50 ` [RFC v2 15/15] cxl/mem: Fix NULL pointer deference in memory device paths mhonap
