From: Eric Auger <eric.auger@redhat.com>
To: "Cédric Le Goater" <clg@redhat.com>,
"Zhenzhong Duan" <zhenzhong.duan@intel.com>,
qemu-devel@nongnu.org
Cc: alex.williamson@redhat.com, mst@redhat.com, jasowang@redhat.com,
peterx@redhat.com, ddutile@redhat.com, jgg@nvidia.com,
nicolinc@nvidia.com, skolothumtho@nvidia.com,
joao.m.martins@oracle.com, clement.mathieu--drif@eviden.com,
kevin.tian@intel.com, yi.l.liu@intel.com, chao.p.peng@intel.com,
Yi Sun <yi.y.sun@linux.intel.com>
Subject: Re: [PATCH v7 13/23] intel_iommu: Bind/unbind guest page table to host
Date: Mon, 3 Nov 2025 10:25:49 +0100 [thread overview]
Message-ID: <56714679-d455-4a5f-a46b-b0dbc40f7674@redhat.com> (raw)
In-Reply-To: <90e0d491-e59e-4093-812e-57627baea452@redhat.com>
Hi Cédric, Zhenzhong,
On 10/24/25 7:01 PM, Cédric Le Goater wrote:
> On 10/24/25 10:43, Zhenzhong Duan wrote:
>> This captures the guest PASID table entry modifications and propagates
>> the changes to host to attach a hwpt with type determined per guest
>> IOMMU
>> PGTT configuration.
>>
>> When PGTT=PT, attach PASID_0 to a second stage HWPT(GPA->HPA).
>> When PGTT=FST, attach PASID_0 to nested HWPT with nesting parent HWPT
>> coming from VFIO.
>>
>> Co-Authored-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>> include/hw/i386/intel_iommu.h | 1 +
>> hw/i386/intel_iommu.c | 150 +++++++++++++++++++++++++++++++++-
>> hw/i386/trace-events | 3 +
>> 3 files changed, 151 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/hw/i386/intel_iommu.h
>> b/include/hw/i386/intel_iommu.h
>> index 3758ac239c..b5f8a9fc29 100644
>> --- a/include/hw/i386/intel_iommu.h
>> +++ b/include/hw/i386/intel_iommu.h
>> @@ -104,6 +104,7 @@ struct VTDAddressSpace {
>> PCIBus *bus;
>> uint8_t devfn;
>> uint32_t pasid;
>> + uint32_t fs_hwpt;
>> AddressSpace as;
>> IOMMUMemoryRegion iommu;
>> MemoryRegion root; /* The root container of the device */
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index 871e6aad19..3789a36147 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -20,6 +20,7 @@
>> */
>> #include "qemu/osdep.h"
>> +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
>> #include "qemu/error-report.h"
>> #include "qemu/main-loop.h"
>> #include "qapi/error.h"
>> @@ -42,6 +43,9 @@
>> #include "migration/vmstate.h"
>> #include "trace.h"
>> #include "system/iommufd.h"
>> +#ifdef CONFIG_IOMMUFD
>> +#include <linux/iommufd.h>
>> +#endif
>
>
> Exposing IOMMUFD in the Intel vIOMMU is unexpected. Initially, we
> introduced HostIOMMUDeviceClass to avoid exposing the IOMMU backends.
> Are we OK to bypass this abstract layer now ?
HostIOMMUDeviceClass was rather introduced to hide the differences
between VFIO and IOMMUFD backend implementations.
This feature is implemented only on top of iommufd. Besides, it is
specialized for VTD and ARM, so to me it looks like a different class
derivation.
Maybe you should put that code in a separate file (-accel.c) as
Shameer/Nicolin are doing for SMMU?
Thanks
Eric
>
>
> Thanks,
>
> C.
>
>
>
>> /* context entry operations */
>> #define PASID_0 0
>> @@ -87,6 +91,7 @@ struct vtd_iotlb_key {
>> static void vtd_address_space_refresh_all(IntelIOMMUState *s);
>> static void vtd_address_space_unmap(VTDAddressSpace *as,
>> IOMMUNotifier *n);
>> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp);
>> static void vtd_pasid_cache_reset_locked(IntelIOMMUState *s)
>> {
>> @@ -98,7 +103,11 @@ static void
>> vtd_pasid_cache_reset_locked(IntelIOMMUState *s)
>> g_hash_table_iter_init(&as_it, s->vtd_address_spaces);
>> while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) {
>> VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
>> - pc_entry->valid = false;
>> + if (pc_entry->valid) {
>> + pc_entry->valid = false;
>> + /* It's fatal to get failure during reset */
>> + vtd_bind_guest_pasid(vtd_as, &error_fatal);
>> + }
>> }
>> }
>> @@ -2380,6 +2389,128 @@ static void
>> vtd_context_global_invalidate(IntelIOMMUState *s)
>> vtd_iommu_replay_all(s);
>> }
>> +#ifdef CONFIG_IOMMUFD
>> +static int vtd_create_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
>> + VTDPASIDEntry *pe, uint32_t *fs_hwpt,
>> + Error **errp)
>> +{
>> + struct iommu_hwpt_vtd_s1 vtd = {};
>> +
>> + vtd.flags = (VTD_SM_PASID_ENTRY_SRE_BIT(pe) ? IOMMU_VTD_S1_SRE :
>> 0) |
>> + (VTD_SM_PASID_ENTRY_WPE_BIT(pe) ? IOMMU_VTD_S1_WPE :
>> 0) |
>> + (VTD_SM_PASID_ENTRY_EAFE_BIT(pe) ? IOMMU_VTD_S1_EAFE
>> : 0);
>> + vtd.addr_width = vtd_pe_get_fs_aw(pe);
>> + vtd.pgtbl_addr = (uint64_t)vtd_pe_get_fspt_base(pe);
>> +
>> + return !iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
>> + idev->hwpt_id, 0,
>> IOMMU_HWPT_DATA_VTD_S1,
>> + sizeof(vtd), &vtd, fs_hwpt,
>> errp);
>> +}
>> +
>> +static void vtd_destroy_old_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
>> + VTDAddressSpace *vtd_as)
>> +{
>> + if (!vtd_as->fs_hwpt) {
>> + return;
>> + }
>> + iommufd_backend_free_id(idev->iommufd, vtd_as->fs_hwpt);
>> + vtd_as->fs_hwpt = 0;
>> +}
>> +
>> +static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
>> + VTDAddressSpace *vtd_as, Error
>> **errp)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>> HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + VTDPASIDEntry *pe = &vtd_as->pasid_cache_entry.pasid_entry;
>> + uint32_t hwpt_id;
>> + bool ret;
>> +
>> + /*
>> + * We can get here only if flts=on, the supported PGTT is FST
>> and PT.
>> + * Catch invalid PGTT when processing invalidation request to avoid
>> + * attaching to wrong hwpt.
>> + */
>> + if (!vtd_pe_pgtt_is_fst(pe) && !vtd_pe_pgtt_is_pt(pe)) {
>> + error_setg(errp, "Invalid PGTT type");
>> + return -EINVAL;
>> + }
>> +
>> + if (vtd_pe_pgtt_is_pt(pe)) {
>> + hwpt_id = idev->hwpt_id;
>> + } else if (vtd_create_fs_hwpt(idev, pe, &hwpt_id, errp)) {
>> + return -EINVAL;
>> + }
>> +
>> + ret = host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);
>> + trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid,
>> hwpt_id, !ret);
>> + if (ret) {
>> + /* Destroy old fs_hwpt if it's a replacement */
>> + vtd_destroy_old_fs_hwpt(idev, vtd_as);
>> + if (vtd_pe_pgtt_is_fst(pe)) {
>> + vtd_as->fs_hwpt = hwpt_id;
>> + }
>> + } else if (vtd_pe_pgtt_is_fst(pe)) {
>> + iommufd_backend_free_id(idev->iommufd, hwpt_id);
>> + }
>> +
>> + return !ret;
>> +}
>> +
>> +static int vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
>> + VTDAddressSpace *vtd_as, Error
>> **errp)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>> HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + IntelIOMMUState *s = vtd_as->iommu_state;
>> + uint32_t pasid = vtd_as->pasid;
>> + bool ret;
>> +
>> + if (s->dmar_enabled && s->root_scalable) {
>> + ret = host_iommu_device_iommufd_detach_hwpt(idev, errp);
>> + trace_vtd_device_detach_hwpt(idev->devid, pasid, !ret);
>> + } else {
>> + /*
>> + * If DMAR remapping is disabled or guest switches to legacy
>> mode,
>> + * we fallback to the default HWPT which contains shadow
>> page table.
>> + * So guest DMA could still work.
>> + */
>> + ret = host_iommu_device_iommufd_attach_hwpt(idev,
>> idev->hwpt_id, errp);
>> + trace_vtd_device_reattach_def_hwpt(idev->devid, pasid,
>> idev->hwpt_id,
>> + !ret);
>> + }
>> +
>> + if (ret) {
>> + vtd_destroy_old_fs_hwpt(idev, vtd_as);
>> + }
>> +
>> + return !ret;
>> +}
>> +
>> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp)
>> +{
>> + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
>> + VTDHostIOMMUDevice *vtd_hiod = vtd_find_hiod_iommufd(vtd_as);
>> + int ret;
>> +
>> + /* Ignore emulated device or legacy VFIO backed device */
>> + if (!vtd_hiod) {
>> + return 0;
>> + }
>> +
>> + if (pc_entry->valid) {
>> + ret = vtd_device_attach_iommufd(vtd_hiod, vtd_as, errp);
>> + } else {
>> + ret = vtd_device_detach_iommufd(vtd_hiod, vtd_as, errp);
>> + }
>> +
>> + return ret;
>> +}
>> +#else
>> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp)
>> +{
>> + return 0;
>> +}
>> +#endif
>> +
>> /* Do a context-cache device-selective invalidation.
>> * @func_mask: FM field after shifting
>> */
>> @@ -3134,6 +3265,8 @@ static void
>> vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
>> VTDPASIDEntry pe;
>> IOMMUNotifier *n;
>> uint16_t did;
>> + const char *err_prefix;
>> + Error *local_err = NULL;
>> if (vtd_dev_get_pe_from_pasid(vtd_as, &pe)) {
>> if (!pc_entry->valid) {
>> @@ -3154,7 +3287,9 @@ static void
>> vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
>> vtd_address_space_unmap(vtd_as, n);
>> }
>> vtd_switch_address_space(vtd_as);
>> - return;
>> +
>> + err_prefix = "Detaching from HWPT failed: ";
>> + goto do_bind_unbind;
>> }
>> /*
>> @@ -3182,12 +3317,21 @@ static void
>> vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
>> if (!pc_entry->valid) {
>> pc_entry->pasid_entry = pe;
>> pc_entry->valid = true;
>> - } else if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
>> + err_prefix = "Attaching to HWPT failed: ";
>> + } else if (vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
>> + err_prefix = "Replacing HWPT attachment failed: ";
>> + } else {
>> return;
>> }
>> vtd_switch_address_space(vtd_as);
>> vtd_address_space_sync(vtd_as);
>> +
>> +do_bind_unbind:
>> + /* TODO: Fault event injection into guest, report error to QEMU
>> for now */
>> + if (vtd_bind_guest_pasid(vtd_as, &local_err)) {
>> + error_reportf_err(local_err, "%s", err_prefix);
>> + }
>> }
>> static void vtd_pasid_cache_sync(IntelIOMMUState *s,
>> VTDPASIDCacheInfo *pc_info)
>> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
>> index b704f4f90c..5a3ee1cf64 100644
>> --- a/hw/i386/trace-events
>> +++ b/hw/i386/trace-events
>> @@ -73,6 +73,9 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail
>> 0x%"PRIx16
>> vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target)
>> "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
>> vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target)
>> "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
>> vtd_reset_exit(void) ""
>> +vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t
>> hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
>> +vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret)
>> "dev_id %d pasid %d ret: %d"
>> +vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid,
>> uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
>> # amd_iommu.c
>> amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to
>> write at addr 0x%"PRIx64" + offset 0x%"PRIx32
>
next prev parent reply other threads:[~2025-11-03 9:26 UTC|newest]
Thread overview: 63+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-24 8:43 [PATCH v7 00/23] intel_iommu: Enable first stage translation for passthrough device Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 01/23] intel_iommu: Rename vtd_ce_get_rid2pasid_entry to vtd_ce_get_pasid_entry Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 02/23] intel_iommu: Delete RPS capability related supporting code Zhenzhong Duan
2025-10-31 7:50 ` Eric Auger
2025-10-31 9:49 ` Duan, Zhenzhong
2025-10-24 8:43 ` [PATCH v7 03/23] intel_iommu: Update terminology to match VTD spec Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 04/23] hw/pci: Export pci_device_get_iommu_bus_devfn() and return bool Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 05/23] hw/pci: Introduce pci_device_get_viommu_flags() Zhenzhong Duan
2025-10-24 17:18 ` Cédric Le Goater
2025-10-28 6:57 ` Duan, Zhenzhong
2025-10-28 15:19 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 06/23] intel_iommu: Implement get_viommu_flags() callback Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 07/23] intel_iommu: Introduce a new structure VTDHostIOMMUDevice Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 08/23] vfio/iommufd: Force creating nesting parent HWPT Zhenzhong Duan
2025-10-24 16:23 ` Cédric Le Goater
2025-10-28 6:00 ` Duan, Zhenzhong
2025-10-24 8:43 ` [PATCH v7 09/23] intel_iommu: Stick to system MR for IOMMUFD backed host device when x-flts=on Zhenzhong Duan
2025-10-31 8:09 ` Eric Auger
2025-10-31 9:52 ` Duan, Zhenzhong
2025-11-05 2:45 ` Nicolin Chen
2025-10-24 8:43 ` [PATCH v7 10/23] intel_iommu: Check for compatibility with IOMMUFD backed " Zhenzhong Duan
2025-10-24 17:29 ` Cédric Le Goater
2025-10-29 7:37 ` Duan, Zhenzhong
2025-10-24 8:43 ` [PATCH v7 11/23] intel_iommu: Fail passthrough device under PCI bridge if x-flts=on Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 12/23] intel_iommu: Add some macros and inline functions Zhenzhong Duan
2025-10-24 16:39 ` Cédric Le Goater
2025-10-28 6:01 ` Duan, Zhenzhong
2025-11-02 11:15 ` Eric Auger
2025-11-03 3:44 ` Duan, Zhenzhong
2025-11-03 7:23 ` Eric Auger
2025-11-06 4:25 ` Duan, Zhenzhong
2025-10-24 8:43 ` [PATCH v7 13/23] intel_iommu: Bind/unbind guest page table to host Zhenzhong Duan
2025-10-24 17:01 ` Cédric Le Goater
2025-11-03 9:25 ` Eric Auger [this message]
2025-10-24 17:33 ` Cédric Le Goater
2025-10-29 9:56 ` Duan, Zhenzhong
2025-11-03 9:37 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 14/23] intel_iommu: Propagate PASID-based iotlb invalidation " Zhenzhong Duan
2025-11-03 10:04 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 15/23] intel_iommu: Replay all pasid bindings when either SRTP or TE bit is changed Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 16/23] intel_iommu: Replay pasid bindings after context cache invalidation Zhenzhong Duan
2025-11-03 10:45 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 17/23] iommufd: Introduce a helper function to extract vendor capabilities Zhenzhong Duan
2025-10-24 16:44 ` Cédric Le Goater
2025-10-28 9:43 ` Duan, Zhenzhong
2025-10-24 17:34 ` Cédric Le Goater
2025-10-28 9:28 ` Duan, Zhenzhong
2025-11-03 12:57 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 18/23] vfio: Add a new element bypass_ro in VFIOContainer Zhenzhong Duan
2025-11-03 13:01 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 19/23] Workaround for ERRATA_772415_SPR17 Zhenzhong Duan
2025-10-24 17:36 ` Cédric Le Goater
2025-10-24 17:38 ` Cédric Le Goater
2025-11-03 13:14 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 20/23] vfio: Bypass readonly region for dirty tracking Zhenzhong Duan
2025-10-24 16:32 ` Cédric Le Goater
2025-10-28 9:47 ` Duan, Zhenzhong
2025-11-03 13:07 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 21/23] intel_iommu: Add migration support with x-flts=on Zhenzhong Duan
2025-11-03 13:16 ` Eric Auger
2025-10-24 8:43 ` [PATCH v7 22/23] intel_iommu: Enable host device when x-flts=on in scalable mode Zhenzhong Duan
2025-10-24 8:43 ` [PATCH v7 23/23] docs/devel: Add IOMMUFD nesting documentation Zhenzhong Duan
2025-11-03 13:23 ` Eric Auger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=56714679-d455-4a5f-a46b-b0dbc40f7674@redhat.com \
--to=eric.auger@redhat.com \
--cc=alex.williamson@redhat.com \
--cc=chao.p.peng@intel.com \
--cc=clement.mathieu--drif@eviden.com \
--cc=clg@redhat.com \
--cc=ddutile@redhat.com \
--cc=jasowang@redhat.com \
--cc=jgg@nvidia.com \
--cc=joao.m.martins@oracle.com \
--cc=kevin.tian@intel.com \
--cc=mst@redhat.com \
--cc=nicolinc@nvidia.com \
--cc=peterx@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=skolothumtho@nvidia.com \
--cc=yi.l.liu@intel.com \
--cc=yi.y.sun@linux.intel.com \
--cc=zhenzhong.duan@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).