From: Alex Williamson <alex.williamson@redhat.com>
To: Zhi Wang <zhiw@nvidia.com>
Cc: <kvm@vger.kernel.org>, <linux-cxl@vger.kernel.org>,
<kevin.tian@intel.com>, <jgg@nvidia.com>,
<alison.schofield@intel.com>, <dan.j.williams@intel.com>,
<dave.jiang@intel.com>, <dave@stgolabs.net>,
<jonathan.cameron@huawei.com>, <ira.weiny@intel.com>,
<vishal.l.verma@intel.com>, <alucerop@amd.com>,
<acurrid@nvidia.com>, <cjia@nvidia.com>, <smitra@nvidia.com>,
<ankita@nvidia.com>, <aniketa@nvidia.com>, <kwankhede@nvidia.com>,
<targupta@nvidia.com>, <zhiwang@kernel.org>
Subject: Re: [RFC 05/13] vfio/cxl: expose CXL region to the usersapce via a new VFIO device region
Date: Fri, 11 Oct 2024 13:12:02 -0600 [thread overview]
Message-ID: <20241011131202.1b6edc8d.alex.williamson@redhat.com> (raw)
In-Reply-To: <20240920223446.1908673-6-zhiw@nvidia.com>
On Fri, 20 Sep 2024 15:34:38 -0700
Zhi Wang <zhiw@nvidia.com> wrote:
> To directly access the device memory, a CXL region is required. Creating
> a CXL region requires to configure HDM decoders on the path to map the
> access of HPA level by level and eventually hit the DPA in the CXL
> topology.
>
> For the userspace, e.g. QEMU, to access the CXL region, the region is
> required to be exposed via VFIO interfaces.
>
> Introduce a new VFIO device region and region ops to expose the created
> CXL region when initializing the device in the vfio-cxl-core. Introduce a
> new sub region type for the userspace to identify a CXL region.
>
> Signed-off-by: Zhi Wang <zhiw@nvidia.com>
> ---
> drivers/vfio/pci/vfio_cxl_core.c | 140 ++++++++++++++++++++++++++++-
> drivers/vfio/pci/vfio_pci_config.c | 1 +
> include/linux/vfio_pci_core.h | 1 +
> include/uapi/linux/vfio.h | 3 +
> 4 files changed, 144 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/vfio_cxl_core.c b/drivers/vfio/pci/vfio_cxl_core.c
> index 6a7859333f67..ffc15fd94b22 100644
> --- a/drivers/vfio/pci/vfio_cxl_core.c
> +++ b/drivers/vfio/pci/vfio_cxl_core.c
> @@ -102,6 +102,13 @@ static int create_cxl_region(struct vfio_pci_core_device *core_dev)
> cxl_accel_get_region_params(cxl->region.region, &start, &end);
>
> cxl->region.addr = start;
> + cxl->region.vaddr = ioremap(start, end - start);
> + if (!cxl->region.addr) {
This is testing addr, not the newly added vaddr.
> + pci_err(pdev, "Fail to map CXL region\n");
> + cxl_region_detach(cxl->cxled);
> + cxl_dpa_free(cxl->cxled);
> + goto out;
> + }
> out:
> cxl_release_endpoint(cxl->cxlmd, cxl->endpoint);
> return ret;
> @@ -152,17 +159,135 @@ static void disable_cxl(struct vfio_pci_core_device *core_dev)
> {
> struct vfio_cxl *cxl = &core_dev->cxl;
>
> - if (cxl->region.region)
> + if (cxl->region.region) {
> + iounmap(cxl->region.vaddr);
> cxl_region_detach(cxl->cxled);
> + }
>
> if (cxl->cxled)
> cxl_dpa_free(cxl->cxled);
> }
>
> +static unsigned long vma_to_pfn(struct vm_area_struct *vma)
> +{
> + struct vfio_pci_core_device *vdev = vma->vm_private_data;
> + struct vfio_cxl *cxl = &vdev->cxl;
> + u64 pgoff;
> +
> + pgoff = vma->vm_pgoff &
> + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +
> + return (cxl->region.addr >> PAGE_SHIFT) + pgoff;
> +}
> +
> +static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
> +{
> + struct vm_area_struct *vma = vmf->vma;
> + struct vfio_pci_core_device *vdev = vma->vm_private_data;
> + unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
> + unsigned long addr = vma->vm_start;
> + vm_fault_t ret = VM_FAULT_SIGBUS;
> +
> + pfn = vma_to_pfn(vma);
> +
> + down_read(&vdev->memory_lock);
> +
> + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
> + goto out_unlock;
> +
> + ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> + if (ret & VM_FAULT_ERROR)
> + goto out_unlock;
> +
> + for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
> + if (addr == vmf->address)
> + continue;
> +
> + if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
> + break;
> + }
> +
> +out_unlock:
> + up_read(&vdev->memory_lock);
> +
> + return ret;
> +}
This is identical to vfio_pci_mmap_fault(), we should figure out how to
reuse it rather than duplicate it.
> +
> +static const struct vm_operations_struct vfio_cxl_mmap_ops = {
> + .fault = vfio_cxl_mmap_fault,
> +};
This should make use of the huge_fault support similar to what we added
in vfio-pci too, right?
> +
> +static int vfio_cxl_region_mmap(struct vfio_pci_core_device *core_dev,
> + struct vfio_pci_region *region,
> + struct vm_area_struct *vma)
> +{
> + struct vfio_cxl *cxl = &core_dev->cxl;
> + u64 phys_len, req_len, pgoff, req_start;
> +
> + if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
> + return -EINVAL;
> +
> + if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
> + (vma->vm_flags & VM_READ))
> + return -EPERM;
> +
> + if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
> + (vma->vm_flags & VM_WRITE))
> + return -EPERM;
> +
> + phys_len = cxl->region.size;
> + req_len = vma->vm_end - vma->vm_start;
> + pgoff = vma->vm_pgoff &
> + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> + req_start = pgoff << PAGE_SHIFT;
> +
> + if (req_start + req_len > phys_len)
> + return -EINVAL;
> +
> + vma->vm_private_data = core_dev;
> + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
I thought a large part of CXL was that the memory is coherent, maybe I
don't understand what we're providing access to here.
> + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
> +
> + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
> + VM_DONTEXPAND | VM_DONTDUMP);
> + vma->vm_ops = &vfio_cxl_mmap_ops;
> +
> + return 0;
> +}
> +
> +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
> + char __user *buf, size_t count, loff_t *ppos,
> + bool iswrite)
> +{
> + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
> + struct vfio_cxl_region *cxl_region = core_dev->region[i].data;
> + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +
> + if (!count)
> + return 0;
> +
> + return vfio_pci_core_do_io_rw(core_dev, false,
> + cxl_region->vaddr,
> + (char __user *)buf, pos, count,
> + 0, 0, iswrite);
> +}
> +
> +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
> + struct vfio_pci_region *region)
> +{
> +}
> +
> +static const struct vfio_pci_regops vfio_cxl_regops = {
> + .rw = vfio_cxl_region_rw,
> + .mmap = vfio_cxl_region_mmap,
> + .release = vfio_cxl_region_release,
> +};
> +
> int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
> {
> struct vfio_cxl *cxl = &core_dev->cxl;
> struct pci_dev *pdev = core_dev->pdev;
> + u32 flags;
> u16 dvsec;
> int ret;
>
> @@ -182,8 +307,21 @@ int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
> if (ret)
> goto err_enable_cxl_device;
>
> + flags = VFIO_REGION_INFO_FLAG_READ |
> + VFIO_REGION_INFO_FLAG_WRITE |
> + VFIO_REGION_INFO_FLAG_MMAP;
> +
> + ret = vfio_pci_core_register_dev_region(core_dev,
> + PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
> + VFIO_REGION_SUBTYPE_CXL, &vfio_cxl_regops,
> + cxl->region.size, flags, &cxl->region);
> + if (ret)
> + goto err_register_cxl_region;
> +
> return 0;
>
> +err_register_cxl_region:
> + disable_cxl(core_dev);
> err_enable_cxl_device:
> vfio_pci_core_disable(core_dev);
> return ret;
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 97422aafaa7b..98f3ac2d305c 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -412,6 +412,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
> return pdev->current_state < PCI_D3hot &&
> (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
> }
> +EXPORT_SYMBOL(__vfio_pci_memory_enabled);
_GPL
There should also be a lockdep assert added if this is exported. With
that it could also drop the underscore prefix.
>
> /*
> * Restore the *real* BARs after we detect a FLR or backdoor reset.
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index 7762d4a3e825..6523d9d1bffe 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -54,6 +54,7 @@ struct vfio_pci_region {
> struct vfio_cxl_region {
> u64 size;
> u64 addr;
> + void *vaddr;
> struct cxl_region *region;
> };
>
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2b68e6cdf190..71f766c29060 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -372,6 +372,9 @@ struct vfio_region_info_cap_type {
> /* sub-types for VFIO_REGION_TYPE_GFX */
> #define VFIO_REGION_SUBTYPE_GFX_EDID (1)
>
> +/* sub-types for VFIO CXL region */
> +#define VFIO_REGION_SUBTYPE_CXL (1)
This is a PCI vendor sub-type with vendor ID 0x1e98. The comment
should reflect that. Thanks,
Alex
> +
> /**
> * struct vfio_region_gfx_edid - EDID region layout.
> *
next prev parent reply other threads:[~2024-10-11 19:12 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-20 22:34 [RFC 00/13] vfio: introduce vfio-cxl to support CXL type-2 accelerator passthrough Zhi Wang
2024-09-20 22:34 ` [RFC 01/13] cxl: allow a type-2 device not to have memory device registers Zhi Wang
2024-09-23 8:01 ` Tian, Kevin
2024-09-23 15:38 ` Dave Jiang
2024-09-24 8:03 ` Zhi Wang
2024-09-20 22:34 ` [RFC 02/13] cxl: introduce cxl_get_hdm_info() Zhi Wang
2024-10-17 15:44 ` Jonathan Cameron
2024-10-19 5:38 ` Zhi Wang
2024-09-20 22:34 ` [RFC 03/13] cxl: introduce cxl_find_comp_reglock_offset() Zhi Wang
2024-09-20 22:34 ` [RFC 04/13] vfio: introduce vfio-cxl core preludes Zhi Wang
2024-10-11 18:33 ` Alex Williamson
2024-09-20 22:34 ` [RFC 05/13] vfio/cxl: expose CXL region to the usersapce via a new VFIO device region Zhi Wang
2024-10-11 19:12 ` Alex Williamson [this message]
2024-09-20 22:34 ` [RFC 06/13] vfio/pci: expose vfio_pci_rw() Zhi Wang
2024-09-20 22:34 ` [RFC 07/13] vfio/cxl: introduce vfio_cxl_core_{read, write}() Zhi Wang
2024-09-20 22:34 ` [RFC 08/13] vfio/cxl: emulate HDM decoder registers Zhi Wang
2024-09-20 22:34 ` [RFC 09/13] vfio/pci: introduce CXL device awareness Zhi Wang
2024-10-11 20:37 ` Alex Williamson
2024-09-20 22:34 ` [RFC 10/13] vfio/pci: emulate CXL DVSEC registers in the configuration space Zhi Wang
2024-10-11 21:02 ` Alex Williamson
2024-09-20 22:34 ` [RFC 11/13] vfio/cxl: introduce VFIO CXL device cap Zhi Wang
2024-10-11 21:14 ` Alex Williamson
2024-09-20 22:34 ` [RFC 12/13] vfio/cxl: VFIO variant driver for QEMU CXL accel device Zhi Wang
2024-09-20 22:34 ` [RFC 13/13] vfio/cxl: workaround: don't take resource region when cxl is enabled Zhi Wang
2024-09-23 8:00 ` [RFC 00/13] vfio: introduce vfio-cxl to support CXL type-2 accelerator passthrough Tian, Kevin
2024-09-24 8:30 ` Zhi Wang
2024-09-25 13:05 ` Jonathan Cameron
2024-09-27 7:18 ` Zhi Wang
2024-10-04 11:40 ` Jonathan Cameron
2024-10-19 5:30 ` Zhi Wang
2024-10-21 11:07 ` Alejandro Lucero Palau
2024-09-26 6:55 ` Tian, Kevin
2024-09-25 10:11 ` Alejandro Lucero Palau
2024-09-27 7:38 ` Zhi Wang
2024-09-27 7:38 ` Zhi Wang
2024-10-21 10:49 ` Zhi Wang
2024-10-21 13:10 ` Alejandro Lucero Palau
2024-10-30 11:56 ` Zhi Wang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241011131202.1b6edc8d.alex.williamson@redhat.com \
--to=alex.williamson@redhat.com \
--cc=acurrid@nvidia.com \
--cc=alison.schofield@intel.com \
--cc=alucerop@amd.com \
--cc=aniketa@nvidia.com \
--cc=ankita@nvidia.com \
--cc=cjia@nvidia.com \
--cc=dan.j.williams@intel.com \
--cc=dave.jiang@intel.com \
--cc=dave@stgolabs.net \
--cc=ira.weiny@intel.com \
--cc=jgg@nvidia.com \
--cc=jonathan.cameron@huawei.com \
--cc=kevin.tian@intel.com \
--cc=kvm@vger.kernel.org \
--cc=kwankhede@nvidia.com \
--cc=linux-cxl@vger.kernel.org \
--cc=smitra@nvidia.com \
--cc=targupta@nvidia.com \
--cc=vishal.l.verma@intel.com \
--cc=zhiw@nvidia.com \
--cc=zhiwang@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox