public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
From: Zhi Wang <zhiw@nvidia.com>
To: <kvm@vger.kernel.org>, <linux-cxl@vger.kernel.org>
Cc: <alex.williamson@redhat.com>, <kevin.tian@intel.com>,
	<jgg@nvidia.com>, <alison.schofield@intel.com>,
	<dan.j.williams@intel.com>, <dave.jiang@intel.com>,
	<dave@stgolabs.net>, <jonathan.cameron@huawei.com>,
	<ira.weiny@intel.com>, <vishal.l.verma@intel.com>,
	<alucerop@amd.com>, <acurrid@nvidia.com>, <cjia@nvidia.com>,
	<smitra@nvidia.com>, <ankita@nvidia.com>, <aniketa@nvidia.com>,
	<kwankhede@nvidia.com>, <targupta@nvidia.com>, <zhiw@nvidia.com>,
	<zhiwang@kernel.org>
Subject: [RFC 05/13] vfio/cxl: expose CXL region to the usersapce via a new VFIO device region
Date: Fri, 20 Sep 2024 15:34:38 -0700	[thread overview]
Message-ID: <20240920223446.1908673-6-zhiw@nvidia.com> (raw)
In-Reply-To: <20240920223446.1908673-1-zhiw@nvidia.com>

To directly access the device memory, a CXL region is required. Creating
a CXL region requires to configure HDM decoders on the path to map the
access of HPA level by level and evetually hit the DPA in the CXL
topology.

For the usersapce, e.g. QEMU, to access the CXL region, the region is
required to be exposed via VFIO interfaces.

Introduce a new VFIO device region and region ops to expose the created
CXL region when initailize the device in the vfio-cxl-core. Introduce a
new sub region type for the userspace to identify a CXL region.

Signed-off-by: Zhi Wang <zhiw@nvidia.com>
---
 drivers/vfio/pci/vfio_cxl_core.c   | 140 ++++++++++++++++++++++++++++-
 drivers/vfio/pci/vfio_pci_config.c |   1 +
 include/linux/vfio_pci_core.h      |   1 +
 include/uapi/linux/vfio.h          |   3 +
 4 files changed, 144 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_cxl_core.c b/drivers/vfio/pci/vfio_cxl_core.c
index 6a7859333f67..ffc15fd94b22 100644
--- a/drivers/vfio/pci/vfio_cxl_core.c
+++ b/drivers/vfio/pci/vfio_cxl_core.c
@@ -102,6 +102,13 @@ static int create_cxl_region(struct vfio_pci_core_device *core_dev)
 	cxl_accel_get_region_params(cxl->region.region, &start, &end);
 
 	cxl->region.addr = start;
+	cxl->region.vaddr = ioremap(start, end - start);
+	if (!cxl->region.addr) {
+		pci_err(pdev, "Fail to map CXL region\n");
+		cxl_region_detach(cxl->cxled);
+		cxl_dpa_free(cxl->cxled);
+		goto out;
+	}
 out:
 	cxl_release_endpoint(cxl->cxlmd, cxl->endpoint);
 	return ret;
@@ -152,17 +159,135 @@ static void disable_cxl(struct vfio_pci_core_device *core_dev)
 {
 	struct vfio_cxl *cxl = &core_dev->cxl;
 
-	if (cxl->region.region)
+	if (cxl->region.region) {
+		iounmap(cxl->region.vaddr);
 		cxl_region_detach(cxl->cxled);
+	}
 
 	if (cxl->cxled)
 		cxl_dpa_free(cxl->cxled);
 }
 
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
+{
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	struct vfio_cxl *cxl = &vdev->cxl;
+	u64 pgoff;
+
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	return (cxl->region.addr >> PAGE_SHIFT) + pgoff;
+}
+
+static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+	unsigned long addr = vma->vm_start;
+	vm_fault_t ret = VM_FAULT_SIGBUS;
+
+	pfn = vma_to_pfn(vma);
+
+	down_read(&vdev->memory_lock);
+
+	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+		goto out_unlock;
+
+	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
+	if (ret & VM_FAULT_ERROR)
+		goto out_unlock;
+
+	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
+		if (addr == vmf->address)
+			continue;
+
+		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
+			break;
+	}
+
+out_unlock:
+	up_read(&vdev->memory_lock);
+
+	return ret;
+}
+
+static const struct vm_operations_struct vfio_cxl_mmap_ops = {
+	.fault = vfio_cxl_mmap_fault,
+};
+
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *core_dev,
+				struct vfio_pci_region *region,
+				struct vm_area_struct *vma)
+{
+	struct vfio_cxl *cxl = &core_dev->cxl;
+	u64 phys_len, req_len, pgoff, req_start;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+	    (vma->vm_flags & VM_READ))
+		return -EPERM;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+	    (vma->vm_flags & VM_WRITE))
+		return -EPERM;
+
+	phys_len = cxl->region.size;
+	req_len = vma->vm_end - vma->vm_start;
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	req_start = pgoff << PAGE_SHIFT;
+
+	if (req_start + req_len > phys_len)
+		return -EINVAL;
+
+	vma->vm_private_data = core_dev;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+			VM_DONTEXPAND | VM_DONTDUMP);
+	vma->vm_ops = &vfio_cxl_mmap_ops;
+
+	return 0;
+}
+
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+				  char __user *buf, size_t count, loff_t *ppos,
+				  bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_cxl_region *cxl_region = core_dev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (!count)
+		return 0;
+
+	return vfio_pci_core_do_io_rw(core_dev, false,
+				      cxl_region->vaddr,
+				      (char __user *)buf, pos, count,
+				      0, 0, iswrite);
+}
+
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_region *region)
+{
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+	.rw		= vfio_cxl_region_rw,
+	.mmap		= vfio_cxl_region_mmap,
+	.release	= vfio_cxl_region_release,
+};
+
 int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
 {
 	struct vfio_cxl *cxl = &core_dev->cxl;
 	struct pci_dev *pdev = core_dev->pdev;
+	u32 flags;
 	u16 dvsec;
 	int ret;
 
@@ -182,8 +307,21 @@ int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
 	if (ret)
 		goto err_enable_cxl_device;
 
+	flags = VFIO_REGION_INFO_FLAG_READ |
+		VFIO_REGION_INFO_FLAG_WRITE |
+		VFIO_REGION_INFO_FLAG_MMAP;
+
+	ret = vfio_pci_core_register_dev_region(core_dev,
+		PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_CXL, &vfio_cxl_regops,
+		cxl->region.size, flags, &cxl->region);
+	if (ret)
+		goto err_register_cxl_region;
+
 	return 0;
 
+err_register_cxl_region:
+	disable_cxl(core_dev);
 err_enable_cxl_device:
 	vfio_pci_core_disable(core_dev);
 	return ret;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 97422aafaa7b..98f3ac2d305c 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -412,6 +412,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
 	return pdev->current_state < PCI_D3hot &&
 	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
 }
+EXPORT_SYMBOL(__vfio_pci_memory_enabled);
 
 /*
  * Restore the *real* BARs after we detect a FLR or backdoor reset.
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 7762d4a3e825..6523d9d1bffe 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -54,6 +54,7 @@ struct vfio_pci_region {
 struct vfio_cxl_region {
 	u64 size;
 	u64 addr;
+	void *vaddr;
 	struct cxl_region *region;
 };
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf190..71f766c29060 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -372,6 +372,9 @@ struct vfio_region_info_cap_type {
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
 
+/* sub-types for VFIO CXL region */
+#define VFIO_REGION_SUBTYPE_CXL                 (1)
+
 /**
  * struct vfio_region_gfx_edid - EDID region layout.
  *
-- 
2.34.1


  parent reply	other threads:[~2024-09-20 22:35 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-20 22:34 [RFC 00/13] vfio: introduce vfio-cxl to support CXL type-2 accelerator passthrough Zhi Wang
2024-09-20 22:34 ` [RFC 01/13] cxl: allow a type-2 device not to have memory device registers Zhi Wang
2024-09-23  8:01   ` Tian, Kevin
2024-09-23 15:38   ` Dave Jiang
2024-09-24  8:03     ` Zhi Wang
2024-09-20 22:34 ` [RFC 02/13] cxl: introduce cxl_get_hdm_info() Zhi Wang
2024-10-17 15:44   ` Jonathan Cameron
2024-10-19  5:38     ` Zhi Wang
2024-09-20 22:34 ` [RFC 03/13] cxl: introduce cxl_find_comp_reglock_offset() Zhi Wang
2024-09-20 22:34 ` [RFC 04/13] vfio: introduce vfio-cxl core preludes Zhi Wang
2024-10-11 18:33   ` Alex Williamson
2024-09-20 22:34 ` Zhi Wang [this message]
2024-10-11 19:12   ` [RFC 05/13] vfio/cxl: expose CXL region to the usersapce via a new VFIO device region Alex Williamson
2024-09-20 22:34 ` [RFC 06/13] vfio/pci: expose vfio_pci_rw() Zhi Wang
2024-09-20 22:34 ` [RFC 07/13] vfio/cxl: introduce vfio_cxl_core_{read, write}() Zhi Wang
2024-09-20 22:34 ` [RFC 08/13] vfio/cxl: emulate HDM decoder registers Zhi Wang
2024-09-20 22:34 ` [RFC 09/13] vfio/pci: introduce CXL device awareness Zhi Wang
2024-10-11 20:37   ` Alex Williamson
2024-09-20 22:34 ` [RFC 10/13] vfio/pci: emulate CXL DVSEC registers in the configuration space Zhi Wang
2024-10-11 21:02   ` Alex Williamson
2024-09-20 22:34 ` [RFC 11/13] vfio/cxl: introduce VFIO CXL device cap Zhi Wang
2024-10-11 21:14   ` Alex Williamson
2024-09-20 22:34 ` [RFC 12/13] vfio/cxl: VFIO variant driver for QEMU CXL accel device Zhi Wang
2024-09-20 22:34 ` [RFC 13/13] vfio/cxl: workaround: don't take resource region when cxl is enabled Zhi Wang
2024-09-23  8:00 ` [RFC 00/13] vfio: introduce vfio-cxl to support CXL type-2 accelerator passthrough Tian, Kevin
2024-09-24  8:30   ` Zhi Wang
2024-09-25 13:05     ` Jonathan Cameron
2024-09-27  7:18       ` Zhi Wang
2024-10-04 11:40         ` Jonathan Cameron
2024-10-19  5:30           ` Zhi Wang
2024-10-21 11:07             ` Alejandro Lucero Palau
2024-09-26  6:55     ` Tian, Kevin
2024-09-25 10:11 ` Alejandro Lucero Palau
2024-09-27  7:38   ` Zhi Wang
2024-09-27  7:38   ` Zhi Wang
2024-10-21 10:49 ` Zhi Wang
2024-10-21 13:10   ` Alejandro Lucero Palau
2024-10-30 11:56 ` Zhi Wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240920223446.1908673-6-zhiw@nvidia.com \
    --to=zhiw@nvidia.com \
    --cc=acurrid@nvidia.com \
    --cc=alex.williamson@redhat.com \
    --cc=alison.schofield@intel.com \
    --cc=alucerop@amd.com \
    --cc=aniketa@nvidia.com \
    --cc=ankita@nvidia.com \
    --cc=cjia@nvidia.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.jiang@intel.com \
    --cc=dave@stgolabs.net \
    --cc=ira.weiny@intel.com \
    --cc=jgg@nvidia.com \
    --cc=jonathan.cameron@huawei.com \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=kwankhede@nvidia.com \
    --cc=linux-cxl@vger.kernel.org \
    --cc=smitra@nvidia.com \
    --cc=targupta@nvidia.com \
    --cc=vishal.l.verma@intel.com \
    --cc=zhiwang@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox